diff --git a/.ci/metrics/metrics.py b/.ci/metrics/metrics.py index 8edc00bc6bd37..48d2aa2f330ec 100644 --- a/.ci/metrics/metrics.py +++ b/.ci/metrics/metrics.py @@ -26,7 +26,67 @@ class JobMetrics: workflow_id: int -def get_metrics(github_repo: github.Repository, workflows_to_track: dict[str, int]): +@dataclass +class GaugeMetric: + name: str + value: int + time_ns: int + + +def get_sampled_workflow_metrics(github_repo: github.Repository): + """Gets global statistics about the Github workflow queue + + Args: + github_repo: A github repo object to use to query the relevant information. + + Returns: + Returns a list of GaugeMetric objects, containing the relevant metrics about + the workflow + """ + + # Other states are available (pending, waiting, etc), but the meaning + # is not documented (See #70540). + # "queued" seems to be the info we want. + queued_workflow_count = len( + [ + x + for x in github_repo.get_workflow_runs(status="queued") + if x.name in WORKFLOWS_TO_TRACK + ] + ) + running_workflow_count = len( + [ + x + for x in github_repo.get_workflow_runs(status="in_progress") + if x.name in WORKFLOWS_TO_TRACK + ] + ) + + workflow_metrics = [] + workflow_metrics.append( + GaugeMetric( + "workflow_queue_size", + queued_workflow_count, + time.time_ns(), + ) + ) + workflow_metrics.append( + GaugeMetric( + "running_workflow_count", + running_workflow_count, + time.time_ns(), + ) + ) + # Always send a heartbeat metric so we can monitor if this container is still able to log to Grafana. + workflow_metrics.append( + GaugeMetric("metrics_container_heartbeat", 1, time.time_ns()) + ) + return workflow_metrics + + +def get_per_workflow_metrics( + github_repo: github.Repository, workflows_to_track: dict[str, int] +): """Gets the metrics for specified Github workflows. 
This function takes in a list of workflows to track, and optionally the @@ -43,14 +103,14 @@ def get_metrics(github_repo: github.Repository, workflows_to_track: dict[str, in Returns a list of JobMetrics objects, containing the relevant metrics about the workflow. """ - workflow_runs = iter(github_repo.get_workflow_runs()) - workflow_metrics = [] workflows_to_include = set(workflows_to_track.keys()) - while len(workflows_to_include) > 0: - workflow_run = next(workflow_runs) + for workflow_run in iter(github_repo.get_workflow_runs()): + if len(workflows_to_include) == 0: + break + if workflow_run.status != "completed": continue @@ -139,12 +199,27 @@ def upload_metrics(workflow_metrics, metrics_userid, api_key): metrics_userid: The userid to use for the upload. api_key: The API key to use for the upload. """ + + if len(workflow_metrics) == 0: + print("No metrics found to upload.", file=sys.stderr) + return + metrics_batch = [] for workflow_metric in workflow_metrics: - workflow_formatted_name = workflow_metric.job_name.lower().replace(" ", "_") - metrics_batch.append( - f"{workflow_formatted_name} queue_time={workflow_metric.queue_time},run_time={workflow_metric.run_time},status={workflow_metric.status} {workflow_metric.created_at_ns}" - ) + if isinstance(workflow_metric, GaugeMetric): + name = workflow_metric.name.lower().replace(" ", "_") + metrics_batch.append( + f"{name} value={workflow_metric.value} {workflow_metric.time_ns}" + ) + elif isinstance(workflow_metric, JobMetrics): + name = workflow_metric.job_name.lower().replace(" ", "_") + metrics_batch.append( + f"{name} queue_time={workflow_metric.queue_time},run_time={workflow_metric.run_time},status={workflow_metric.status} {workflow_metric.created_at_ns}" + ) + else: + raise ValueError( + f"Unsupported object type {type(workflow_metric)}: {str(workflow_metric)}" + ) request_data = "\n".join(metrics_batch) response = requests.post( @@ -176,16 +251,21 @@ def main(): # Enter the main loop. 
Every five minutes we wake up and dump metrics for # the relevant jobs. while True: - current_metrics = get_metrics(github_repo, workflows_to_track) - if len(current_metrics) == 0: - print("No metrics found to upload.", file=sys.stderr) - continue + current_metrics = get_per_workflow_metrics(github_repo, workflows_to_track) + current_metrics += get_sampled_workflow_metrics(github_repo) + # Always send a heartbeat metric so we can monitor if this container is still able to log to Grafana. + current_metrics.append( + GaugeMetric("metrics_container_heartbeat", 1, time.time_ns()) + ) upload_metrics(current_metrics, grafana_metrics_userid, grafana_api_key) print(f"Uploaded {len(current_metrics)} metrics", file=sys.stderr) for workflow_metric in reversed(current_metrics): - workflows_to_track[workflow_metric.job_name] = workflow_metric.workflow_id + if isinstance(workflow_metric, JobMetrics): + workflows_to_track[ + workflow_metric.job_name + ] = workflow_metric.workflow_id time.sleep(SCRAPE_INTERVAL_SECONDS) diff --git a/bolt/CMakeLists.txt b/bolt/CMakeLists.txt index 9ac196ad0e821..04db160b64b05 100644 --- a/bolt/CMakeLists.txt +++ b/bolt/CMakeLists.txt @@ -163,8 +163,8 @@ if (BOLT_ENABLE_RUNTIME) add_llvm_install_targets(install-bolt_rt DEPENDS bolt_rt bolt COMPONENT bolt) - set(LIBBOLT_RT_INSTR "${CMAKE_CURRENT_BINARY_DIR}/bolt_rt-bins/lib/libbolt_rt_instr.a") - set(LIBBOLT_RT_HUGIFY "${CMAKE_CURRENT_BINARY_DIR}/bolt_rt-bins/lib/libbolt_rt_hugify.a") + set(LIBBOLT_RT_INSTR "${CMAKE_CURRENT_BINARY_DIR}/bolt_rt-bins/lib${LLVM_LIBDIR_SUFFIX}/libbolt_rt_instr.a") + set(LIBBOLT_RT_HUGIFY "${CMAKE_CURRENT_BINARY_DIR}/bolt_rt-bins/lib${LLVM_LIBDIR_SUFFIX}/libbolt_rt_hugify.a") endif() find_program(GNU_LD_EXECUTABLE NAMES ${LLVM_DEFAULT_TARGET_TRIPLE}-ld.bfd ld.bfd DOC "GNU ld") diff --git a/bolt/include/bolt/Core/BinaryContext.h b/bolt/include/bolt/Core/BinaryContext.h index 115e59ca0697e..94fe4aa8aa0e5 100644 --- a/bolt/include/bolt/Core/BinaryContext.h +++ 
b/bolt/include/bolt/Core/BinaryContext.h @@ -1363,6 +1363,12 @@ class BinaryContext { if (std::optional Size = MIB->getSize(Inst)) return *Size; + if (MIB->isPseudo(Inst)) + return 0; + + if (std::optional Size = MIB->getInstructionSize(Inst)) + return *Size; + if (!Emitter) Emitter = this->MCE.get(); SmallString<256> Code; diff --git a/bolt/include/bolt/Core/MCPlusBuilder.h b/bolt/include/bolt/Core/MCPlusBuilder.h index 3634fed9757ce..5d77e6faff2fc 100644 --- a/bolt/include/bolt/Core/MCPlusBuilder.h +++ b/bolt/include/bolt/Core/MCPlusBuilder.h @@ -1204,6 +1204,11 @@ class MCPlusBuilder { /// Get instruction size specified via annotation. std::optional getSize(const MCInst &Inst) const; + /// Get target-specific instruction size. + virtual std::optional getInstructionSize(const MCInst &Inst) const { + return std::nullopt; + } + /// Set instruction size. void setSize(MCInst &Inst, uint32_t Size) const; diff --git a/bolt/lib/Passes/Inliner.cpp b/bolt/lib/Passes/Inliner.cpp index f004a8eeea185..1793f4ff1f148 100644 --- a/bolt/lib/Passes/Inliner.cpp +++ b/bolt/lib/Passes/Inliner.cpp @@ -310,13 +310,13 @@ Inliner::inlineCall(BinaryBasicBlock &CallerBB, if (MIB.isPseudo(Inst)) continue; - MIB.stripAnnotations(Inst, /*KeepTC=*/BC.isX86()); + MIB.stripAnnotations(Inst, /*KeepTC=*/BC.isX86() || BC.isAArch64()); // Fix branch target. Strictly speaking, we don't have to do this as // targets of direct branches will be fixed later and don't matter // in the CFG state. However, disassembly may look misleading, and // hence we do the fixing. 
- if (MIB.isBranch(Inst)) { + if (MIB.isBranch(Inst) && !MIB.isTailCall(Inst)) { assert(!MIB.isIndirectBranch(Inst) && "unexpected indirect branch in callee"); const BinaryBasicBlock *TargetBB = diff --git a/bolt/lib/Target/AArch64/AArch64MCPlusBuilder.cpp b/bolt/lib/Target/AArch64/AArch64MCPlusBuilder.cpp index 679c9774c767f..d84da10b5bbe6 100644 --- a/bolt/lib/Target/AArch64/AArch64MCPlusBuilder.cpp +++ b/bolt/lib/Target/AArch64/AArch64MCPlusBuilder.cpp @@ -133,6 +133,36 @@ class AArch64MCPlusBuilder : public MCPlusBuilder { public: using MCPlusBuilder::MCPlusBuilder; + MCPhysReg getStackPointer() const override { return AArch64::SP; } + + bool isPush(const MCInst &Inst) const override { return false; } + + bool isPop(const MCInst &Inst) const override { return false; } + + void createCall(MCInst &Inst, const MCSymbol *Target, + MCContext *Ctx) override { + createDirectCall(Inst, Target, Ctx, false); + } + + bool convertTailCallToCall(MCInst &Inst) override { + int NewOpcode; + switch (Inst.getOpcode()) { + default: + return false; + case AArch64::B: + NewOpcode = AArch64::BL; + break; + case AArch64::BR: + NewOpcode = AArch64::BLR; + break; + } + + Inst.setOpcode(NewOpcode); + removeAnnotation(Inst, MCPlus::MCAnnotation::kTailCall); + clearOffset(Inst); + return true; + } + bool equals(const MCTargetExpr &A, const MCTargetExpr &B, CompFuncTy Comp) const override { const auto &AArch64ExprA = cast(A); @@ -1792,6 +1822,11 @@ class AArch64MCPlusBuilder : public MCPlusBuilder { } uint16_t getMinFunctionAlignment() const override { return 4; } + + std::optional + getInstructionSize(const MCInst &Inst) const override { + return 4; + } }; } // end anonymous namespace diff --git a/bolt/runtime/CMakeLists.txt b/bolt/runtime/CMakeLists.txt index 40f4fbc9f30d5..0deb69a27d435 100644 --- a/bolt/runtime/CMakeLists.txt +++ b/bolt/runtime/CMakeLists.txt @@ -16,18 +16,18 @@ add_library(bolt_rt_instr STATIC instr.cpp ${CMAKE_CURRENT_BINARY_DIR}/config.h ) 
-set_target_properties(bolt_rt_instr PROPERTIES ARCHIVE_OUTPUT_DIRECTORY "${CMAKE_INSTALL_LIBDIR}${LLVM_LIBDIR_SUFFIX}") +set_target_properties(bolt_rt_instr PROPERTIES ARCHIVE_OUTPUT_DIRECTORY "lib${LLVM_LIBDIR_SUFFIX}") add_library(bolt_rt_hugify STATIC hugify.cpp ${CMAKE_CURRENT_BINARY_DIR}/config.h ) -set_target_properties(bolt_rt_hugify PROPERTIES ARCHIVE_OUTPUT_DIRECTORY "${CMAKE_INSTALL_LIBDIR}${LLVM_LIBDIR_SUFFIX}") +set_target_properties(bolt_rt_hugify PROPERTIES ARCHIVE_OUTPUT_DIRECTORY "lib${LLVM_LIBDIR_SUFFIX}") if(NOT BOLT_BUILT_STANDALONE) add_custom_command(TARGET bolt_rt_instr POST_BUILD - COMMAND ${CMAKE_COMMAND} -E copy "${CMAKE_CURRENT_BINARY_DIR}/lib/libbolt_rt_instr.a" "${LLVM_LIBRARY_DIR}") + COMMAND ${CMAKE_COMMAND} -E copy "${CMAKE_CURRENT_BINARY_DIR}/lib${LLVM_LIBDIR_SUFFIX}/libbolt_rt_instr.a" "${LLVM_LIBRARY_DIR}") add_custom_command(TARGET bolt_rt_hugify POST_BUILD - COMMAND ${CMAKE_COMMAND} -E copy "${CMAKE_CURRENT_BINARY_DIR}/lib/libbolt_rt_hugify.a" "${LLVM_LIBRARY_DIR}") + COMMAND ${CMAKE_COMMAND} -E copy "${CMAKE_CURRENT_BINARY_DIR}/lib${LLVM_LIBDIR_SUFFIX}/libbolt_rt_hugify.a" "${LLVM_LIBRARY_DIR}") endif() set(BOLT_RT_FLAGS @@ -53,23 +53,23 @@ target_include_directories(bolt_rt_instr PRIVATE ${CMAKE_CURRENT_BINARY_DIR}) target_compile_options(bolt_rt_hugify PRIVATE ${BOLT_RT_FLAGS}) target_include_directories(bolt_rt_hugify PRIVATE ${CMAKE_CURRENT_BINARY_DIR}) -install(TARGETS bolt_rt_instr DESTINATION "${CMAKE_INSTALL_LIBDIR}${LLVM_LIBDIR_SUFFIX}") -install(TARGETS bolt_rt_hugify DESTINATION "${CMAKE_INSTALL_LIBDIR}${LLVM_LIBDIR_SUFFIX}") +install(TARGETS bolt_rt_instr DESTINATION "lib${LLVM_LIBDIR_SUFFIX}") +install(TARGETS bolt_rt_hugify DESTINATION "lib${LLVM_LIBDIR_SUFFIX}") if (CMAKE_CXX_COMPILER_ID MATCHES ".*Clang.*" AND CMAKE_SYSTEM_NAME STREQUAL "Darwin") add_library(bolt_rt_instr_osx STATIC instr.cpp ${CMAKE_CURRENT_BINARY_DIR}/config.h ) - set_target_properties(bolt_rt_instr_osx PROPERTIES ARCHIVE_OUTPUT_DIRECTORY 
"${CMAKE_INSTALL_LIBDIR}${LLVM_LIBDIR_SUFFIX}") + set_target_properties(bolt_rt_instr_osx PROPERTIES ARCHIVE_OUTPUT_DIRECTORY "lib${LLVM_LIBDIR_SUFFIX}") target_include_directories(bolt_rt_instr_osx PRIVATE ${CMAKE_CURRENT_BINARY_DIR}) target_compile_options(bolt_rt_instr_osx PRIVATE -target x86_64-apple-darwin19.6.0 ${BOLT_RT_FLAGS}) - install(TARGETS bolt_rt_instr_osx DESTINATION "${CMAKE_INSTALL_LIBDIR}${LLVM_LIBDIR_SUFFIX}") + install(TARGETS bolt_rt_instr_osx DESTINATION "lib${LLVM_LIBDIR_SUFFIX}") if(NOT BOLT_BUILT_STANDALONE) add_custom_command(TARGET bolt_rt_instr_osx POST_BUILD - COMMAND ${CMAKE_COMMAND} -E copy "${CMAKE_CURRENT_BINARY_DIR}/lib/libbolt_rt_instr_osx.a" "${LLVM_LIBRARY_DIR}") + COMMAND ${CMAKE_COMMAND} -E copy "${CMAKE_CURRENT_BINARY_DIR}/lib${LLVM_LIBDIR_SUFFIX}/libbolt_rt_instr_osx.a" "${LLVM_LIBRARY_DIR}") endif() endif() diff --git a/bolt/test/AArch64/inline-small-function-1.s b/bolt/test/AArch64/inline-small-function-1.s new file mode 100644 index 0000000000000..3ea22a9915fb4 --- /dev/null +++ b/bolt/test/AArch64/inline-small-function-1.s @@ -0,0 +1,42 @@ +## This test checks that inline is properly handled by BOLT on aarch64. + +# REQUIRES: system-linux + +# RUN: llvm-mc -filetype=obj -triple aarch64-unknown-unknown %s -o %t.o +# RUN: %clang %cflags -O0 %t.o -o %t.exe -Wl,-q +# RUN: llvm-bolt --inline-small-functions --print-inline --print-only=_Z3barP1A \ +# RUN: %t.exe -o %t.bolt | FileCheck %s + +# CHECK: BOLT-INFO: inlined 0 calls at 1 call sites in 2 iteration(s). Change in binary size: 4 bytes. +# CHECK: Binary Function "_Z3barP1A" after inlining { +# CHECK-NOT: bl _Z3fooP1A +# CHECK: ldr x8, [x0] +# CHECK-NEXT: ldr w0, [x8] + + .text + .globl _Z3fooP1A + .type _Z3fooP1A,@function +_Z3fooP1A: + ldr x8, [x0] + ldr w0, [x8] + ret + .size _Z3fooP1A, .-_Z3fooP1A + + .globl _Z3barP1A + .type _Z3barP1A,@function +_Z3barP1A: + stp x29, x30, [sp, #-16]! 
+ mov x29, sp + bl _Z3fooP1A + mul w0, w0, w0 + ldp x29, x30, [sp], #16 + ret + .size _Z3barP1A, .-_Z3barP1A + + .globl main + .p2align 2 + .type main,@function +main: + mov w0, wzr + ret + .size main, .-main diff --git a/bolt/test/AArch64/inline-small-function-2.s b/bolt/test/AArch64/inline-small-function-2.s new file mode 100644 index 0000000000000..5eb7d391fd157 --- /dev/null +++ b/bolt/test/AArch64/inline-small-function-2.s @@ -0,0 +1,48 @@ +## This test checks that inline is properly handled by BOLT on aarch64. + +# REQUIRES: system-linux + +# RUN: llvm-mc -filetype=obj -triple aarch64-unknown-unknown %s -o %t.o +# RUN: %clang %cflags -O0 %t.o -o %t.exe -Wl,-q +# RUN: llvm-bolt --inline-small-functions --print-inline --print-only=test \ +# RUN: %t.exe -o %t.bolt | FileCheck %s + +#CHECK: BOLT-INFO: inlined 0 calls at 1 call sites in 2 iteration(s). Change in binary size: 4 bytes. +#CHECK: Binary Function "test" after inlining { +#CHECK-NOT: bl indirect +#CHECK: add w0, w1, w0 +#CHECK-NEXT: blr x2 + + .text + .globl indirect + .type indirect,@function +indirect: + add w0, w1, w0 + br x2 + .size indirect, .-indirect + + .globl test + .type test,@function +test: + stp x29, x30, [sp, #-32]! 
+ stp x20, x19, [sp, #16] + mov x29, sp + mov w19, w1 + mov w20, w0 + bl indirect + add w8, w19, w20 + cmp w0, #0 + csinc w0, w8, wzr, eq + ldp x20, x19, [sp, #16] + ldp x29, x30, [sp], #32 + ret + .size test, .-test + + .globl main + .type main,@function +main: + mov w0, wzr + ret + .size main, .-main + + \ No newline at end of file diff --git a/bolt/test/lit.local.cfg b/bolt/test/lit.local.cfg index e2fa0a4a2210f..d5a6849b27a77 100644 --- a/bolt/test/lit.local.cfg +++ b/bolt/test/lit.local.cfg @@ -1,5 +1,5 @@ host_linux_triple = config.target_triple.split("-")[0] + "-unknown-linux-gnu" -common_linker_flags = "-fuse-ld=lld -Wl,--unresolved-symbols=ignore-all -pie" +common_linker_flags = "-fuse-ld=lld -Wl,--unresolved-symbols=ignore-all -Wl,--build-id=none -pie" flags = f"--target={host_linux_triple} -fPIE {common_linker_flags}" config.substitutions.insert(0, ("%cflags", f"%cflags {flags}")) diff --git a/clang-tools-extra/clang-reorder-fields/ReorderFieldsAction.cpp b/clang-tools-extra/clang-reorder-fields/ReorderFieldsAction.cpp index 80ee31368fe9a..30bc8be1719d5 100644 --- a/clang-tools-extra/clang-reorder-fields/ReorderFieldsAction.cpp +++ b/clang-tools-extra/clang-reorder-fields/ReorderFieldsAction.cpp @@ -118,35 +118,6 @@ findMembersUsedInInitExpr(const CXXCtorInitializer *Initializer, return Results; } -/// Returns the next token after `Loc` (including comment tokens). -static std::optional getTokenAfter(SourceLocation Loc, - const SourceManager &SM, - const LangOptions &LangOpts) { - if (Loc.isMacroID()) { - return std::nullopt; - } - Loc = Lexer::getLocForEndOfToken(Loc, 0, SM, LangOpts); - - // Break down the source location. - std::pair LocInfo = SM.getDecomposedLoc(Loc); - - // Try to load the file buffer. 
- bool InvalidTemp = false; - StringRef File = SM.getBufferData(LocInfo.first, &InvalidTemp); - if (InvalidTemp) - return std::nullopt; - - const char *TokenBegin = File.data() + LocInfo.second; - - Lexer lexer(SM.getLocForStartOfFile(LocInfo.first), LangOpts, File.begin(), - TokenBegin, File.end()); - lexer.SetCommentRetentionState(true); - // Find the token. - Token Tok; - lexer.LexFromRawLexer(Tok); - return Tok; -} - /// Returns the end of the trailing comments after `Loc`. static SourceLocation getEndOfTrailingComment(SourceLocation Loc, const SourceManager &SM, @@ -154,11 +125,12 @@ static SourceLocation getEndOfTrailingComment(SourceLocation Loc, // We consider any following comment token that is indented more than the // first comment to be part of the trailing comment. const unsigned Column = SM.getPresumedColumnNumber(Loc); - std::optional Tok = getTokenAfter(Loc, SM, LangOpts); + std::optional Tok = + Lexer::findNextToken(Loc, SM, LangOpts, /*IncludeComments=*/true); while (Tok && Tok->is(tok::comment) && SM.getPresumedColumnNumber(Tok->getLocation()) > Column) { Loc = Tok->getEndLoc(); - Tok = getTokenAfter(Loc, SM, LangOpts); + Tok = Lexer::findNextToken(Loc, SM, LangOpts, /*IncludeComments=*/true); } return Loc; } diff --git a/clang-tools-extra/clang-tidy/modernize/RawStringLiteralCheck.cpp b/clang-tools-extra/clang-tidy/modernize/RawStringLiteralCheck.cpp index 7ec62f41aec01..24674a407cb36 100644 --- a/clang-tools-extra/clang-tidy/modernize/RawStringLiteralCheck.cpp +++ b/clang-tools-extra/clang-tidy/modernize/RawStringLiteralCheck.cpp @@ -9,7 +9,11 @@ #include "RawStringLiteralCheck.h" #include "clang/AST/ASTContext.h" #include "clang/ASTMatchers/ASTMatchFinder.h" +#include "clang/Basic/LangOptions.h" +#include "clang/Basic/SourceManager.h" #include "clang/Lex/Lexer.h" +#include "llvm/ADT/StringRef.h" +#include using namespace clang::ast_matchers; @@ -66,20 +70,6 @@ bool containsDelimiter(StringRef Bytes, const std::string &Delimiter) { : (")" + 
Delimiter + R"(")")) != StringRef::npos; } -std::string asRawStringLiteral(const StringLiteral *Literal, - const std::string &DelimiterStem) { - const StringRef Bytes = Literal->getBytes(); - std::string Delimiter; - for (int I = 0; containsDelimiter(Bytes, Delimiter); ++I) { - Delimiter = (I == 0) ? DelimiterStem : DelimiterStem + std::to_string(I); - } - - if (Delimiter.empty()) - return (R"(R"()" + Bytes + R"lit()")lit").str(); - - return (R"(R")" + Delimiter + "(" + Bytes + ")" + Delimiter + R"(")").str(); -} - } // namespace RawStringLiteralCheck::RawStringLiteralCheck(StringRef Name, @@ -119,30 +109,73 @@ void RawStringLiteralCheck::registerMatchers(MatchFinder *Finder) { stringLiteral(unless(hasParent(predefinedExpr()))).bind("lit"), this); } +static std::optional +createUserDefinedSuffix(const StringLiteral *Literal, const SourceManager &SM, + const LangOptions &LangOpts) { + const CharSourceRange TokenRange = + CharSourceRange::getTokenRange(Literal->getSourceRange()); + Token T; + if (Lexer::getRawToken(Literal->getBeginLoc(), T, SM, LangOpts)) + return std::nullopt; + const CharSourceRange CharRange = + Lexer::makeFileCharRange(TokenRange, SM, LangOpts); + if (T.hasUDSuffix()) { + StringRef Text = Lexer::getSourceText(CharRange, SM, LangOpts); + const size_t UDSuffixPos = Text.find_last_of('"'); + if (UDSuffixPos == StringRef::npos) + return std::nullopt; + return Text.slice(UDSuffixPos + 1, Text.size()); + } + return std::nullopt; +} + +static std::string createRawStringLiteral(const StringLiteral *Literal, + const std::string &DelimiterStem, + const SourceManager &SM, + const LangOptions &LangOpts) { + const StringRef Bytes = Literal->getBytes(); + std::string Delimiter; + for (int I = 0; containsDelimiter(Bytes, Delimiter); ++I) { + Delimiter = (I == 0) ? 
DelimiterStem : DelimiterStem + std::to_string(I); + } + + std::optional UserDefinedSuffix = + createUserDefinedSuffix(Literal, SM, LangOpts); + + if (Delimiter.empty()) + return (R"(R"()" + Bytes + R"lit()")lit" + UserDefinedSuffix.value_or("")) + .str(); + + return (R"(R")" + Delimiter + "(" + Bytes + ")" + Delimiter + R"(")" + + UserDefinedSuffix.value_or("")) + .str(); +} + +static bool compareStringLength(StringRef Replacement, + const StringLiteral *Literal, + const SourceManager &SM, + const LangOptions &LangOpts) { + return Replacement.size() <= + Lexer::MeasureTokenLength(Literal->getBeginLoc(), SM, LangOpts); +} + void RawStringLiteralCheck::check(const MatchFinder::MatchResult &Result) { const auto *Literal = Result.Nodes.getNodeAs("lit"); if (Literal->getBeginLoc().isMacroID()) return; - + const SourceManager &SM = *Result.SourceManager; + const LangOptions &LangOpts = getLangOpts(); if (containsEscapedCharacters(Result, Literal, DisallowedChars)) { - std::string Replacement = asRawStringLiteral(Literal, DelimiterStem); + const std::string Replacement = + createRawStringLiteral(Literal, DelimiterStem, SM, LangOpts); if (ReplaceShorterLiterals || - Replacement.length() <= - Lexer::MeasureTokenLength(Literal->getBeginLoc(), - *Result.SourceManager, getLangOpts())) - replaceWithRawStringLiteral(Result, Literal, Replacement); + compareStringLength(Replacement, Literal, SM, LangOpts)) { + diag(Literal->getBeginLoc(), + "escaped string literal can be written as a raw string literal") + << FixItHint::CreateReplacement(Literal->getSourceRange(), + Replacement); + } } } -void RawStringLiteralCheck::replaceWithRawStringLiteral( - const MatchFinder::MatchResult &Result, const StringLiteral *Literal, - StringRef Replacement) { - CharSourceRange CharRange = Lexer::makeFileCharRange( - CharSourceRange::getTokenRange(Literal->getSourceRange()), - *Result.SourceManager, getLangOpts()); - diag(Literal->getBeginLoc(), - "escaped string literal can be written as a raw 
string literal") - << FixItHint::CreateReplacement(CharRange, Replacement); -} - } // namespace clang::tidy::modernize diff --git a/clang-tools-extra/clang-tidy/modernize/RawStringLiteralCheck.h b/clang-tools-extra/clang-tidy/modernize/RawStringLiteralCheck.h index aae58ca0e98d9..879255550dd5b 100644 --- a/clang-tools-extra/clang-tidy/modernize/RawStringLiteralCheck.h +++ b/clang-tools-extra/clang-tidy/modernize/RawStringLiteralCheck.h @@ -33,10 +33,6 @@ class RawStringLiteralCheck : public ClangTidyCheck { void check(const ast_matchers::MatchFinder::MatchResult &Result) override; private: - void replaceWithRawStringLiteral( - const ast_matchers::MatchFinder::MatchResult &Result, - const StringLiteral *Literal, StringRef Replacement); - std::string DelimiterStem; CharsBitSet DisallowedChars; const bool ReplaceShorterLiterals; diff --git a/clang-tools-extra/clang-tidy/utils/LexerUtils.cpp b/clang-tools-extra/clang-tidy/utils/LexerUtils.cpp index 92c3e0ed7894e..50da196315d3b 100644 --- a/clang-tools-extra/clang-tidy/utils/LexerUtils.cpp +++ b/clang-tools-extra/clang-tidy/utils/LexerUtils.cpp @@ -86,29 +86,6 @@ SourceLocation findNextTerminator(SourceLocation Start, const SourceManager &SM, return findNextAnyTokenKind(Start, SM, LangOpts, tok::comma, tok::semi); } -std::optional -findNextTokenIncludingComments(SourceLocation Start, const SourceManager &SM, - const LangOptions &LangOpts) { - // `Lexer::findNextToken` will ignore comment - if (Start.isMacroID()) - return std::nullopt; - Start = Lexer::getLocForEndOfToken(Start, 0, SM, LangOpts); - // Break down the source location. - std::pair LocInfo = SM.getDecomposedLoc(Start); - bool InvalidTemp = false; - StringRef File = SM.getBufferData(LocInfo.first, &InvalidTemp); - if (InvalidTemp) - return std::nullopt; - // Lex from the start of the given location. 
- Lexer L(SM.getLocForStartOfFile(LocInfo.first), LangOpts, File.begin(), - File.data() + LocInfo.second, File.end()); - L.SetCommentRetentionState(true); - // Find the token. - Token Tok; - L.LexFromRawLexer(Tok); - return Tok; -} - std::optional findNextTokenSkippingComments(SourceLocation Start, const SourceManager &SM, const LangOptions &LangOpts) { diff --git a/clang-tools-extra/clang-tidy/utils/LexerUtils.h b/clang-tools-extra/clang-tidy/utils/LexerUtils.h index ea9bd512b68b8..afd63885e388c 100644 --- a/clang-tools-extra/clang-tidy/utils/LexerUtils.h +++ b/clang-tools-extra/clang-tidy/utils/LexerUtils.h @@ -89,9 +89,11 @@ SourceLocation findNextAnyTokenKind(SourceLocation Start, } } -std::optional +inline std::optional findNextTokenIncludingComments(SourceLocation Start, const SourceManager &SM, - const LangOptions &LangOpts); + const LangOptions &LangOpts) { + return Lexer::findNextToken(Start, SM, LangOpts, true); +} // Finds next token that's not a comment. std::optional findNextTokenSkippingComments(SourceLocation Start, diff --git a/clang-tools-extra/clangd/CMakeLists.txt b/clang-tools-extra/clangd/CMakeLists.txt index d797ddce8c44d..6f10afe4a5625 100644 --- a/clang-tools-extra/clangd/CMakeLists.txt +++ b/clang-tools-extra/clangd/CMakeLists.txt @@ -91,7 +91,6 @@ add_clang_library(clangDaemon STATIC GlobalCompilationDatabase.cpp Headers.cpp HeaderSourceSwitch.cpp - HeuristicResolver.cpp Hover.cpp IncludeCleaner.cpp IncludeFixer.cpp diff --git a/clang-tools-extra/clangd/FindTarget.cpp b/clang-tools-extra/clangd/FindTarget.cpp index e702c6b3537a0..04fd6d437b7bd 100644 --- a/clang-tools-extra/clangd/FindTarget.cpp +++ b/clang-tools-extra/clangd/FindTarget.cpp @@ -8,7 +8,6 @@ #include "FindTarget.h" #include "AST.h" -#include "HeuristicResolver.h" #include "support/Logger.h" #include "clang/AST/ASTConcept.h" #include "clang/AST/ASTTypeTraits.h" @@ -35,6 +34,7 @@ #include "clang/Basic/SourceLocation.h" #include "clang/Basic/SourceManager.h" #include 
"clang/Basic/Specifiers.h" +#include "clang/Sema/HeuristicResolver.h" #include "llvm/ADT/STLExtras.h" #include "llvm/ADT/SmallVector.h" #include "llvm/ADT/StringExtras.h" diff --git a/clang-tools-extra/clangd/FindTarget.h b/clang-tools-extra/clangd/FindTarget.h index b41c547095100..a7706804ce7ec 100644 --- a/clang-tools-extra/clangd/FindTarget.h +++ b/clang-tools-extra/clangd/FindTarget.h @@ -33,9 +33,11 @@ #include namespace clang { -namespace clangd { + class HeuristicResolver; +namespace clangd { + /// Describes the link between an AST node and a Decl it refers to. enum class DeclRelation : unsigned; /// A bitfield of DeclRelations. diff --git a/clang-tools-extra/clangd/InlayHints.cpp b/clang-tools-extra/clangd/InlayHints.cpp index fefffeb4efc1a..1b1bcf78c9855 100644 --- a/clang-tools-extra/clangd/InlayHints.cpp +++ b/clang-tools-extra/clangd/InlayHints.cpp @@ -9,7 +9,6 @@ #include "../clang-tidy/utils/DesignatedInitializers.h" #include "AST.h" #include "Config.h" -#include "HeuristicResolver.h" #include "ParsedAST.h" #include "Protocol.h" #include "SourceCode.h" @@ -27,6 +26,7 @@ #include "clang/Basic/OperatorKinds.h" #include "clang/Basic/SourceLocation.h" #include "clang/Basic/SourceManager.h" +#include "clang/Sema/HeuristicResolver.h" #include "llvm/ADT/DenseSet.h" #include "llvm/ADT/STLExtras.h" #include "llvm/ADT/SmallVector.h" diff --git a/clang-tools-extra/clangd/ParsedAST.cpp b/clang-tools-extra/clangd/ParsedAST.cpp index 725cbeb154cb8..89d6f26d0f150 100644 --- a/clang-tools-extra/clangd/ParsedAST.cpp +++ b/clang-tools-extra/clangd/ParsedAST.cpp @@ -20,7 +20,6 @@ #include "Feature.h" #include "FeatureModule.h" #include "Headers.h" -#include "HeuristicResolver.h" #include "IncludeCleaner.h" #include "IncludeFixer.h" #include "Preamble.h" @@ -53,6 +52,7 @@ #include "clang/Lex/Lexer.h" #include "clang/Lex/PPCallbacks.h" #include "clang/Lex/Preprocessor.h" +#include "clang/Sema/HeuristicResolver.h" #include "clang/Serialization/ASTWriter.h" #include 
"clang/Tooling/CompilationDatabase.h" #include "clang/Tooling/Core/Diagnostic.h" diff --git a/clang-tools-extra/clangd/ParsedAST.h b/clang-tools-extra/clangd/ParsedAST.h index 8d9d1e6456926..82fac96360488 100644 --- a/clang-tools-extra/clangd/ParsedAST.h +++ b/clang-tools-extra/clangd/ParsedAST.h @@ -38,9 +38,9 @@ #include namespace clang { +class HeuristicResolver; class Sema; namespace clangd { -class HeuristicResolver; /// Stores and provides access to parsed AST. class ParsedAST { diff --git a/clang-tools-extra/clangd/SemanticHighlighting.cpp b/clang-tools-extra/clangd/SemanticHighlighting.cpp index e6d16af2495fe..86ca05644c703 100644 --- a/clang-tools-extra/clangd/SemanticHighlighting.cpp +++ b/clang-tools-extra/clangd/SemanticHighlighting.cpp @@ -9,7 +9,6 @@ #include "SemanticHighlighting.h" #include "Config.h" #include "FindTarget.h" -#include "HeuristicResolver.h" #include "ParsedAST.h" #include "Protocol.h" #include "SourceCode.h" @@ -27,6 +26,7 @@ #include "clang/Basic/LangOptions.h" #include "clang/Basic/SourceLocation.h" #include "clang/Basic/SourceManager.h" +#include "clang/Sema/HeuristicResolver.h" #include "clang/Tooling/Syntax/Tokens.h" #include "llvm/ADT/STLExtras.h" #include "llvm/ADT/StringRef.h" diff --git a/clang-tools-extra/clangd/XRefs.cpp b/clang-tools-extra/clangd/XRefs.cpp index 8d5ab2e491a40..0a093108b752c 100644 --- a/clang-tools-extra/clangd/XRefs.cpp +++ b/clang-tools-extra/clangd/XRefs.cpp @@ -10,7 +10,6 @@ #include "FindSymbols.h" #include "FindTarget.h" #include "Headers.h" -#include "HeuristicResolver.h" #include "IncludeCleaner.h" #include "ParsedAST.h" #include "Protocol.h" @@ -53,6 +52,7 @@ #include "clang/Index/IndexingOptions.h" #include "clang/Index/USRGeneration.h" #include "clang/Lex/Lexer.h" +#include "clang/Sema/HeuristicResolver.h" #include "clang/Tooling/Syntax/Tokens.h" #include "llvm/ADT/ArrayRef.h" #include "llvm/ADT/DenseMap.h" diff --git a/clang-tools-extra/clangd/unittests/CMakeLists.txt 
b/clang-tools-extra/clangd/unittests/CMakeLists.txt index 8dba8088908d5..dffdcd5d014ca 100644 --- a/clang-tools-extra/clangd/unittests/CMakeLists.txt +++ b/clang-tools-extra/clangd/unittests/CMakeLists.txt @@ -64,7 +64,6 @@ add_unittest(ClangdUnitTests ClangdTests GlobalCompilationDatabaseTests.cpp HeadersTests.cpp HeaderSourceSwitchTests.cpp - HeuristicResolverTests.cpp HoverTests.cpp IncludeCleanerTests.cpp IndexActionTests.cpp diff --git a/clang-tools-extra/docs/ReleaseNotes.rst b/clang-tools-extra/docs/ReleaseNotes.rst index 8ba47dfc84f26..33a452f525f76 100644 --- a/clang-tools-extra/docs/ReleaseNotes.rst +++ b/clang-tools-extra/docs/ReleaseNotes.rst @@ -321,6 +321,10 @@ Changes in existing checks a false positive when only an implicit conversion happened inside an initializer list. +- Improved :doc:`modernize-raw-string-literal + ` check to fix incorrect + fix-it when the string contains a user-defined suffix. + - Improved :doc:`modernize-use-designated-initializers ` check to fix a crash when a class is declared but not defined. 
diff --git a/clang-tools-extra/test/clang-tidy/checkers/modernize/raw-string-literal.cpp b/clang-tools-extra/test/clang-tidy/checkers/modernize/raw-string-literal.cpp index ad5d450036f2f..5856b8882574a 100644 --- a/clang-tools-extra/test/clang-tidy/checkers/modernize/raw-string-literal.cpp +++ b/clang-tools-extra/test/clang-tidy/checkers/modernize/raw-string-literal.cpp @@ -129,3 +129,16 @@ void callFn() { // CHECK-MESSAGES: :[[@LINE-1]]:14: warning: {{.*}} can be written as a raw string literal // CHECK-FIXES: {{^}} fn(R"(foo\bar)");{{$}} } + +namespace std { +using size_t = decltype(sizeof(0)); +namespace ud { +int operator""_abc(const char *str, std::size_t len); +} // namespace ud +} // namespace std +namespace gh97243 { +using namespace std::ud; +auto UserDefinedLiteral = "foo\\bar"_abc; +// CHECK-MESSAGES: :[[@LINE-1]]:27: warning: {{.*}} can be written as a raw string literal +// CHECK-FIXES: {{^}}auto UserDefinedLiteral = R"(foo\bar)"_abc; +} // namespace gh97243 diff --git a/clang/cmake/caches/Fuchsia-stage2.cmake b/clang/cmake/caches/Fuchsia-stage2.cmake index e799900094df3..1cbf691f29d58 100644 --- a/clang/cmake/caches/Fuchsia-stage2.cmake +++ b/clang/cmake/caches/Fuchsia-stage2.cmake @@ -300,7 +300,7 @@ if(FUCHSIA_SDK) set(LLVM_RUNTIME_MULTILIB_hwasan+noexcept_TARGETS "aarch64-unknown-fuchsia;riscv64-unknown-fuchsia" CACHE STRING "") endif() -foreach(target armv6m-none-eabi;armv7m-none-eabi;armv8m.main-none-eabi;armv8.1m.main-none-eabi;aarch64-none-elf) +foreach(target armv6m-none-eabi;armv7m-none-eabi;armv7em-none-eabi;armv8m.main-none-eabi;armv8.1m.main-none-eabi;aarch64-none-elf) list(APPEND BUILTIN_TARGETS "${target}") set(BUILTINS_${target}_CMAKE_SYSTEM_NAME Generic CACHE STRING "") set(BUILTINS_${target}_CMAKE_SYSTEM_PROCESSOR arm CACHE STRING "") diff --git a/clang/cmake/caches/hexagon-unknown-linux-musl-clang-cross.cmake b/clang/cmake/caches/hexagon-unknown-linux-musl-clang-cross.cmake index 91bbe26b62105..fd2aeec819fc0 100644 --- 
a/clang/cmake/caches/hexagon-unknown-linux-musl-clang-cross.cmake +++ b/clang/cmake/caches/hexagon-unknown-linux-musl-clang-cross.cmake @@ -10,6 +10,9 @@ set(CLANG_LINKS_TO_CREATE hexagon-none-elf-clang hexagon-unknown-none-elf-clang++ hexagon-unknown-none-elf-clang + clang++ + clang-cl + clang-cpp CACHE STRING "") set(LLVM_INSTALL_TOOLCHAIN_ONLY ON CACHE BOOL "") diff --git a/clang/docs/InternalsManual.rst b/clang/docs/InternalsManual.rst index 39d389b816f12..a2b551b6f333e 100644 --- a/clang/docs/InternalsManual.rst +++ b/clang/docs/InternalsManual.rst @@ -276,6 +276,21 @@ Description: diagnostic instead of having to do things textually. The selected string does undergo formatting. +**"enum_select format** + +Example: + ``unknown frobbling of a %enum_select{%VarDecl{variable declaration}|%FuncDecl{function declaration}}0 when blarging`` +Class: + Integers +Description: + This format specifier is used exactly like a ``select`` specifier, except it + additionally generates a namespace, enumeration, and enumerator list based on + the format string given. In the above case, a namespace is generated named + ``FrobbleKind`` that has an unscoped enumeration with the enumerators + ``VarDecl`` and ``FuncDecl`` which correspond to the values 0 and 1. This + permits a clearer use of the ``Diag`` in source code, as the above could be + called as: ``Diag(Loc, diag::frobble) << diag::FrobbleKind::VarDecl``. + **"plural" format** Example: diff --git a/clang/docs/Multilib.rst b/clang/docs/Multilib.rst index 7637d0db9565b..d36b73dce68cd 100644 --- a/clang/docs/Multilib.rst +++ b/clang/docs/Multilib.rst @@ -122,6 +122,73 @@ subclass and a suitable base multilib variant is present then the It is the responsibility of layered multilib authors to ensure that headers and libraries in each layer are complete enough to mask any incompatibilities. 
+Multilib custom flags +===================== + +Introduction +------------ + +The multilib mechanism supports library variants that correspond to target, +code generation or language command-line flags. Examples include ``--target``, +``-mcpu``, ``-mfpu``, ``-mbranch-protection``, ``-fno-rtti``. However, some library +variants are particular to features that do not correspond to any command-line +option. Multithreading and semihosting, for instance, have no associated +compiler option. + +In order to support the selection of variants for which no compiler option +exists, the multilib specification includes the concept of *custom flags*. +These flags have no impact on code generation and are only used in the multilib +processing. + +Multilib custom flags follow this format in the driver invocation: + +:: + + -fmultilib-flag= + +They are fed into the multilib system alongside the remaining flags. + +Custom flag declarations +------------------------ + +Custom flags can be declared in the YAML file under the *Flags* section. + +.. code-block:: yaml + + Flags: + - Name: multithreaded + Values: + - Name: no-multithreaded + MacroDefines: [__SINGLE_THREAD__] + - Name: multithreaded + Default: no-multithreaded + +* Name: the name to categorize a flag. +* Values: a list of flag Values (defined below). +* Default: it specifies the name of the value this flag should take if not + specified in the command-line invocation. It must be one value from the Values + field. + +Each flag *Value* is defined as: + +* Name: name of the value. This is the string to be used in + ``-fmultilib-flag=``. +* MacroDefines: a list of strings to be used as macro definitions. Each string + is fed into the driver as ``-D``. + +The namespace of flag values is common across all flags. This means that flag +value names must be unique. 
+ +Usage of custom flags in the *Variants* specifications +------------------------------------------------------ + +Library variants should list their requirement on one or more custom flags like +they do for any other flag. Each requirement must be listed as +``-fmultilib-flag=``. + +A variant that does not specify a requirement on one particular flag can be +matched against any value of that flag. + Stability ========= @@ -222,6 +289,23 @@ For a more comprehensive example see # Flags is a list of one or more strings. Flags: [--target=thumbv7m-none-eabi] + # Custom flag declarations. Each item is a different declaration. + Flags: + # Name of the flag + - Name: multithreaded + # List of custom flag values + Values: + # Name of the custom flag value. To be used in -fmultilib-flag=. + - Name: no-multithreaded + # Macro definitions. Useful for defining extra macros for building the + # associated library variant(s). + MacroDefines: [__SINGLE_THREAD__] + - Name: multithreaded + # Default flag value. If no value for this flag declaration is used in the + # command-line, the multilib system will use this one. Must be equal to one + # of the flag value names from this flag declaration. + Default: no-multithreaded + Design principles ================= diff --git a/clang/docs/ReleaseNotes.rst b/clang/docs/ReleaseNotes.rst index c6bc95594f613..aa1c02d04f7ca 100644 --- a/clang/docs/ReleaseNotes.rst +++ b/clang/docs/ReleaseNotes.rst @@ -789,6 +789,7 @@ Improvements to Clang's diagnostics scope.Unlock(); require(scope); // Warning! Requires mu1. } +- Diagnose invalid declarators in the declaration of constructors and destructors (#GH121706). Improvements to Clang's time-trace ---------------------------------- @@ -952,6 +953,10 @@ Bug Fixes to C++ Support - Clang now identifies unexpanded parameter packs within the type constraint on a non-type template parameter. 
(#GH88866) - Fixed an issue while resolving type of expression indexing into a pack of values of non-dependent type (#GH121242) - Fixed a crash when __PRETTY_FUNCTION__ or __FUNCSIG__ (clang-cl) appears in the trailing return type of the lambda (#GH121274) +- Fixed a crash caused by the incorrect construction of template arguments for CTAD alias guides when type + constraints are applied. (#GH122134) +- Fixed canonicalization of pack indexing types - Clang did not always recognize identical pack indexing. (#GH123033) + Bug Fixes to AST Handling ^^^^^^^^^^^^^^^^^^^^^^^^^ diff --git a/clang/include/clang/AST/ASTContext.h b/clang/include/clang/AST/ASTContext.h index 0e07c5d6ce8fb..4e9b961688d55 100644 --- a/clang/include/clang/AST/ASTContext.h +++ b/clang/include/clang/AST/ASTContext.h @@ -1726,6 +1726,13 @@ class ASTContext : public RefCountedBase { QualType getEnumType(const EnumDecl *Decl) const; + /// Compute BestType and BestPromotionType for an enum based on the highest + /// number of negative and positive bits of its elements. + /// Returns true if enum width is too large.
+ bool computeBestEnumTypes(bool IsPacked, unsigned NumNegativeBits, + unsigned NumPositiveBits, QualType &BestType, + QualType &BestPromotionType); + QualType getUnresolvedUsingType(const UnresolvedUsingTypenameDecl *Decl) const; diff --git a/clang/include/clang/AST/Attr.h b/clang/include/clang/AST/Attr.h index 3365ebe4d9012..bed532a84a1bd 100644 --- a/clang/include/clang/AST/Attr.h +++ b/clang/include/clang/AST/Attr.h @@ -60,6 +60,8 @@ class Attr : public AttributeCommonInfo { unsigned IsLateParsed : 1; LLVM_PREFERRED_TYPE(bool) unsigned InheritEvenIfAlreadyPresent : 1; + LLVM_PREFERRED_TYPE(bool) + unsigned DeferDeserialization : 1; void *operator new(size_t bytes) noexcept { llvm_unreachable("Attrs cannot be allocated with regular 'new'."); @@ -80,10 +82,11 @@ class Attr : public AttributeCommonInfo { protected: Attr(ASTContext &Context, const AttributeCommonInfo &CommonInfo, - attr::Kind AK, bool IsLateParsed) + attr::Kind AK, bool IsLateParsed, bool DeferDeserialization = false) : AttributeCommonInfo(CommonInfo), AttrKind(AK), Inherited(false), IsPackExpansion(false), Implicit(false), IsLateParsed(IsLateParsed), - InheritEvenIfAlreadyPresent(false) {} + InheritEvenIfAlreadyPresent(false), + DeferDeserialization(DeferDeserialization) {} public: attr::Kind getKind() const { return static_cast(AttrKind); } @@ -105,6 +108,8 @@ class Attr : public AttributeCommonInfo { void setPackExpansion(bool PE) { IsPackExpansion = PE; } bool isPackExpansion() const { return IsPackExpansion; } + bool shouldDeferDeserialization() const { return DeferDeserialization; } + // Clone this attribute. 
Attr *clone(ASTContext &C) const; @@ -146,8 +151,9 @@ class InheritableAttr : public Attr { protected: InheritableAttr(ASTContext &Context, const AttributeCommonInfo &CommonInfo, attr::Kind AK, bool IsLateParsed, - bool InheritEvenIfAlreadyPresent) - : Attr(Context, CommonInfo, AK, IsLateParsed) { + bool InheritEvenIfAlreadyPresent, + bool DeferDeserialization = false) + : Attr(Context, CommonInfo, AK, IsLateParsed, DeferDeserialization) { this->InheritEvenIfAlreadyPresent = InheritEvenIfAlreadyPresent; } diff --git a/clang/include/clang/AST/Decl.h b/clang/include/clang/AST/Decl.h index 27a91a2d07210..186a3e7fca59d 100644 --- a/clang/include/clang/AST/Decl.h +++ b/clang/include/clang/AST/Decl.h @@ -4021,7 +4021,7 @@ class EnumDecl : public TagDecl { QualType getIntegerType() const { if (!IntegerType) return QualType(); - if (const Type *T = IntegerType.dyn_cast()) + if (const Type *T = dyn_cast(IntegerType)) return QualType(T, 0); return cast(IntegerType)->getType().getUnqualifiedType(); } diff --git a/clang/include/clang/AST/DeclBase.h b/clang/include/clang/AST/DeclBase.h index 91177c9a4b51f..573b46a2321c5 100644 --- a/clang/include/clang/AST/DeclBase.h +++ b/clang/include/clang/AST/DeclBase.h @@ -2726,11 +2726,8 @@ class DeclContext { bool Deserialize = false) const; private: - /// Lookup all external visible declarations and the external declarations - /// within the same module specified by \c NamedModule. We can't - /// get it from \c this since the same declaration may be declared in - /// multiple modules. e.g., namespace. - lookup_result lookupImpl(DeclarationName Name, Module *NamedModule) const; + lookup_result lookupImpl(DeclarationName Name, + const DeclContext *OriginalLookupDC) const; /// Whether this declaration context has had externally visible /// storage added since the last lookup. 
In this case, \c LookupPtr's diff --git a/clang/include/clang/AST/ExternalASTMerger.h b/clang/include/clang/AST/ExternalASTMerger.h index 46f187c5e0694..2c6f2a941311b 100644 --- a/clang/include/clang/AST/ExternalASTMerger.h +++ b/clang/include/clang/AST/ExternalASTMerger.h @@ -142,7 +142,7 @@ class ExternalASTMerger : public ExternalASTSource { /// Implementation of the ExternalASTSource API. bool FindExternalVisibleDeclsByName(const DeclContext *DC, DeclarationName Name, - Module *NamedModule) override; + const DeclContext *OriginalDC) override; /// Implementation of the ExternalASTSource API. void diff --git a/clang/include/clang/AST/ExternalASTSource.h b/clang/include/clang/AST/ExternalASTSource.h index ee4ad634977dc..42aed56d42e07 100644 --- a/clang/include/clang/AST/ExternalASTSource.h +++ b/clang/include/clang/AST/ExternalASTSource.h @@ -51,7 +51,6 @@ class RecordDecl; class Selector; class Stmt; class TagDecl; -class Module; /// Abstract interface for external sources of AST nodes. /// @@ -146,20 +145,20 @@ class ExternalASTSource : public RefCountedBase { /// Find all declarations with the given name in the given context, /// and add them to the context by calling SetExternalVisibleDeclsForName /// or SetNoExternalVisibleDeclsForName. - /// \param DC the context for lookup. - /// \param Name the name of the declarations to find. - /// \param NamedModule find declarations visible to the given module - /// \c NamedModule . This may be different from owning module of \c DC since - /// there are declarations (e.g., namespace declaration) can appear in - /// multiple modules. + /// \param DC The context for lookup in. \c DC should be a primary context. + /// \param Name The name to look for. + /// \param OriginalDC The original context for lookup. \c OriginalDC can + /// provide more information than \c DC. e.g., The same namespace can appear + /// in multiple module units. So we need the \c OriginalDC to tell us which + /// module the lookup comes from.
/// -/// \return \c true if any declarations might have been found, and \c false -/// if we definitely have no declarations with this name. +/// \return \c true if any declarations might have been found, \c false if +/// we definitely have no declarations with this name. /// /// The default implementation of this method is a no-op returning \c false. virtual bool FindExternalVisibleDeclsByName(const DeclContext *DC, DeclarationName Name, - Module *NamedModule); + const DeclContext *OriginalDC); /// Load all the external specializations for the Decl \param D if \param /// OnlyPartial is false. Otherwise, load all the external **partial** diff --git a/clang/include/clang/AST/Type.h b/clang/include/clang/AST/Type.h index f0fbacccc97bb..3457d524c63aa 100644 --- a/clang/include/clang/AST/Type.h +++ b/clang/include/clang/AST/Type.h @@ -8841,13 +8841,16 @@ void FixedPointValueToString(SmallVectorImpl &Str, llvm::APSInt Val, unsigned Scale); inline FunctionEffectsRef FunctionEffectsRef::get(QualType QT) { + const Type *TypePtr = QT.getTypePtr(); while (true) { - QualType Pointee = QT->getPointeeType(); - if (Pointee.isNull()) + if (QualType Pointee = TypePtr->getPointeeType(); !Pointee.isNull()) + TypePtr = Pointee.getTypePtr(); + else if (TypePtr->isArrayType()) + TypePtr = TypePtr->getBaseElementTypeUnsafe(); + else break; - QT = Pointee; } - if (const auto *FPT = QT->getAs()) + if (const auto *FPT = TypePtr->getAs()) return FPT->getFunctionEffects(); return {}; } diff --git a/clang/include/clang/AST/TypeLoc.h b/clang/include/clang/AST/TypeLoc.h index 62ca52e508ba2..a55a38335ef6a 100644 --- a/clang/include/clang/AST/TypeLoc.h +++ b/clang/include/clang/AST/TypeLoc.h @@ -397,6 +397,7 @@ class ConcreteTypeLoc : public Base { unsigned extraAlign = asDerived()->getExtraLocalDataAlignment(); size = llvm::alignTo(size, extraAlign); size += asDerived()->getExtraLocalDataSize(); + size = llvm::alignTo(size, asDerived()->getLocalDataAlignment()); return size; } diff --git
a/clang/include/clang/Basic/AllDiagnosticKinds.inc b/clang/include/clang/Basic/AllDiagnosticKinds.inc new file mode 100644 index 0000000000000..a946b4a640ac6 --- /dev/null +++ b/clang/include/clang/Basic/AllDiagnosticKinds.inc @@ -0,0 +1,33 @@ +//===--- AllDiagnosticKinds.inc----------------------------------*- C++ -*-===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +/// +/// \file +/// Defines the Diagnostic IDs in ID sorted order. The order is dictated by +/// the enum in DiagnosticIDs.h#L49-L65. +/// +//===----------------------------------------------------------------------===// + +// Turn off clang-format, as the order of the includes are important to make +// sure tables based on Diagnostic IDs are partitioned/sorted based on +// DiagID. 
+ +// clang-format off +#include "clang/Basic/DiagnosticCommonKinds.inc" +#include "clang/Basic/DiagnosticDriverKinds.inc" +#include "clang/Basic/DiagnosticFrontendKinds.inc" +#include "clang/Basic/DiagnosticSerializationKinds.inc" +#include "clang/Basic/DiagnosticLexKinds.inc" +#include "clang/Basic/DiagnosticParseKinds.inc" +#include "clang/Basic/DiagnosticASTKinds.inc" +#include "clang/Basic/DiagnosticCommentKinds.inc" +#include "clang/Basic/DiagnosticCrossTUKinds.inc" +#include "clang/Basic/DiagnosticSemaKinds.inc" +#include "clang/Basic/DiagnosticAnalysisKinds.inc" +#include "clang/Basic/DiagnosticRefactoringKinds.inc" +#include "clang/Basic/DiagnosticInstallAPIKinds.inc" +// clang-format on diff --git a/clang/include/clang/Basic/Attr.td b/clang/include/clang/Basic/Attr.td index 408d3adf370c8..3969dd8af5dfa 100644 --- a/clang/include/clang/Basic/Attr.td +++ b/clang/include/clang/Basic/Attr.td @@ -713,6 +713,12 @@ class Attr { // attribute may be documented under multiple categories, more than one // Documentation entry may be listed. list Documentation; + // Set to true if deserialization of this attribute must be deferred until + // the parent Decl is fully deserialized (during header module file + // deserialization). E.g., this is the case for the preferred_name attribute, + // since its type deserialization depends on its target Decl type. + // (See https://github.com/llvm/llvm-project/issues/56490 for details). + bit DeferDeserialization = 0; } /// Used to define a set of mutually exclusive attributes. @@ -3254,6 +3260,11 @@ def PreferredName : InheritableAttr { let InheritEvenIfAlreadyPresent = 1; let MeaningfulToClassTemplateDefinition = 1; let TemplateDependent = 1; + // Type of this attribute depends on the target Decl type. + // Therefore, its deserialization must be deferred until + // deserialization of the target Decl is complete + // (for header modules). 
+ let DeferDeserialization = 1; } def PreserveMost : DeclOrTypeAttr { diff --git a/clang/include/clang/Basic/Builtins.td b/clang/include/clang/Basic/Builtins.td index ea22690ce4f5c..bbf4886b5cf05 100644 --- a/clang/include/clang/Basic/Builtins.td +++ b/clang/include/clang/Basic/Builtins.td @@ -4795,6 +4795,12 @@ def HLSLWaveActiveCountBits : LangBuiltin<"HLSL_LANG"> { let Prototype = "unsigned int(bool)"; } +def HLSLWaveActiveSum : LangBuiltin<"HLSL_LANG"> { + let Spellings = ["__builtin_hlsl_wave_active_sum"]; + let Attributes = [NoThrow, Const]; + let Prototype = "void (...)"; +} + def HLSLWaveGetLaneIndex : LangBuiltin<"HLSL_LANG"> { let Spellings = ["__builtin_hlsl_wave_get_lane_index"]; let Attributes = [NoThrow, Const]; @@ -4855,6 +4861,12 @@ def HLSLFirstBitHigh : LangBuiltin<"HLSL_LANG"> { let Prototype = "void(...)"; } +def HLSLFirstBitLow : LangBuiltin<"HLSL_LANG"> { + let Spellings = ["__builtin_hlsl_elementwise_firstbitlow"]; + let Attributes = [NoThrow, Const]; + let Prototype = "void(...)"; +} + def HLSLFrac : LangBuiltin<"HLSL_LANG"> { let Spellings = ["__builtin_hlsl_elementwise_frac"]; let Attributes = [NoThrow, Const]; diff --git a/clang/include/clang/Basic/BuiltinsSPIRV.td b/clang/include/clang/Basic/BuiltinsSPIRV.td index 1e66939b822ef..f72c555921dfe 100644 --- a/clang/include/clang/Basic/BuiltinsSPIRV.td +++ b/clang/include/clang/Basic/BuiltinsSPIRV.td @@ -13,3 +13,9 @@ def SPIRVDistance : Builtin { let Attributes = [NoThrow, Const]; let Prototype = "void(...)"; } + +def SPIRVLength : Builtin { + let Spellings = ["__builtin_spirv_length"]; + let Attributes = [NoThrow, Const]; + let Prototype = "void(...)"; +} diff --git a/clang/include/clang/Basic/BuiltinsX86_64.td b/clang/include/clang/Basic/BuiltinsX86_64.td index a6c6ef80eac21..4958265298d1b 100644 --- a/clang/include/clang/Basic/BuiltinsX86_64.td +++ b/clang/include/clang/Basic/BuiltinsX86_64.td @@ -295,8 +295,8 @@ let Features = "amx-complex,amx-transpose", Attributes = [NoThrow] in { let 
Features = "amx-avx512,avx10.2-512", Attributes = [NoThrow] in { def tcvtrowd2ps_internal : X86Builtin<"_Vector<16, float>(unsigned short, unsigned short, _Vector<256, int>, unsigned int)">; - def tcvtrowps2pbf16h_internal : X86Builtin<"_Vector<32, __bf16>(unsigned short, unsigned short, _Vector<256, int>, unsigned int)">; - def tcvtrowps2pbf16l_internal : X86Builtin<"_Vector<32, __bf16>(unsigned short, unsigned short, _Vector<256, int>, unsigned int)">; + def tcvtrowps2bf16h_internal : X86Builtin<"_Vector<32, __bf16>(unsigned short, unsigned short, _Vector<256, int>, unsigned int)">; + def tcvtrowps2bf16l_internal : X86Builtin<"_Vector<32, __bf16>(unsigned short, unsigned short, _Vector<256, int>, unsigned int)">; def tcvtrowps2phh_internal : X86Builtin<"_Vector<32, _Float16>(unsigned short, unsigned short, _Vector<256, int>, unsigned int)">; def tcvtrowps2phl_internal : X86Builtin<"_Vector<32, _Float16>(unsigned short, unsigned short, _Vector<256, int>, unsigned int)">; def tilemovrow_internal : X86Builtin<"_Vector<16, int>(unsigned short, unsigned short, _Vector<256, int>, unsigned int)">; @@ -387,8 +387,8 @@ let Features = "amx-complex,amx-transpose", Attributes = [NoThrow] in { let Features = "amx-avx512,avx10.2-512", Attributes = [NoThrow] in { def tcvtrowd2ps : X86Builtin<"_Vector<16, float>(_Constant unsigned char, unsigned int)">; - def tcvtrowps2pbf16h : X86Builtin<"_Vector<32, __bf16>(_Constant unsigned char, unsigned int)">; - def tcvtrowps2pbf16l : X86Builtin<"_Vector<32, __bf16>(_Constant unsigned char, unsigned int)">; + def tcvtrowps2bf16h : X86Builtin<"_Vector<32, __bf16>(_Constant unsigned char, unsigned int)">; + def tcvtrowps2bf16l : X86Builtin<"_Vector<32, __bf16>(_Constant unsigned char, unsigned int)">; def tcvtrowps2phh : X86Builtin<"_Vector<32, _Float16>(_Constant unsigned char, unsigned int)">; def tcvtrowps2phl : X86Builtin<"_Vector<32, _Float16>(_Constant unsigned char, unsigned int)">; def tilemovrow : X86Builtin<"_Vector<16, 
int>(_Constant unsigned char, unsigned int)">; diff --git a/clang/include/clang/Basic/CMakeLists.txt b/clang/include/clang/Basic/CMakeLists.txt index 897a610b7f908..56c27bacdb20b 100644 --- a/clang/include/clang/Basic/CMakeLists.txt +++ b/clang/include/clang/Basic/CMakeLists.txt @@ -3,6 +3,11 @@ macro(clang_diag_gen component) -gen-clang-diags-defs -clang-component=${component} SOURCE Diagnostic.td TARGET ClangDiagnostic${component}) + + clang_tablegen(Diagnostic${component}Enums.inc + -gen-clang-diags-enums -clang-component=${component} + SOURCE Diagnostic.td + TARGET ClangDiagnostic${component}Enums) endmacro(clang_diag_gen) clang_diag_gen(Analysis) diff --git a/clang/include/clang/Basic/DiagnosticAST.h b/clang/include/clang/Basic/DiagnosticAST.h index 24ef2689eac01..4f82114b7406b 100644 --- a/clang/include/clang/Basic/DiagnosticAST.h +++ b/clang/include/clang/Basic/DiagnosticAST.h @@ -22,6 +22,19 @@ enum { #undef DIAG NUM_BUILTIN_AST_DIAGNOSTICS }; + +#define DIAG_ENUM(ENUM_NAME) \ + namespace ENUM_NAME { \ + enum { +#define DIAG_ENUM_ITEM(IDX, NAME) NAME = IDX, +#define DIAG_ENUM_END() \ + } \ + ; \ + } +#include "clang/Basic/DiagnosticASTEnums.inc" +#undef DIAG_ENUM_END +#undef DIAG_ENUM_ITEM +#undef DIAG_ENUM } // end namespace diag } // end namespace clang diff --git a/clang/include/clang/Basic/DiagnosticAnalysis.h b/clang/include/clang/Basic/DiagnosticAnalysis.h index 676b58f7d6ef2..1a49461bcd173 100644 --- a/clang/include/clang/Basic/DiagnosticAnalysis.h +++ b/clang/include/clang/Basic/DiagnosticAnalysis.h @@ -22,6 +22,18 @@ enum { #undef DIAG NUM_BUILTIN_ANALYSIS_DIAGNOSTICS }; +#define DIAG_ENUM(ENUM_NAME) \ + namespace ENUM_NAME { \ + enum { +#define DIAG_ENUM_ITEM(IDX, NAME) NAME = IDX, +#define DIAG_ENUM_END() \ + } \ + ; \ + } +#include "clang/Basic/DiagnosticAnalysisEnums.inc" +#undef DIAG_ENUM_END +#undef DIAG_ENUM_ITEM +#undef DIAG_ENUM } // end namespace diag } // end namespace clang diff --git a/clang/include/clang/Basic/DiagnosticComment.h 
b/clang/include/clang/Basic/DiagnosticComment.h index 17c0053e9a33d..53143ef132e4b 100644 --- a/clang/include/clang/Basic/DiagnosticComment.h +++ b/clang/include/clang/Basic/DiagnosticComment.h @@ -22,6 +22,19 @@ enum { #undef DIAG NUM_BUILTIN_COMMENT_DIAGNOSTICS }; + +#define DIAG_ENUM(ENUM_NAME) \ + namespace ENUM_NAME { \ + enum { +#define DIAG_ENUM_ITEM(IDX, NAME) NAME = IDX, +#define DIAG_ENUM_END() \ + } \ + ; \ + } +#include "clang/Basic/DiagnosticCommentEnums.inc" +#undef DIAG_ENUM_END +#undef DIAG_ENUM_ITEM +#undef DIAG_ENUM } // end namespace diag } // end namespace clang diff --git a/clang/include/clang/Basic/DiagnosticCrossTU.h b/clang/include/clang/Basic/DiagnosticCrossTU.h index 4341bf327b69c..428da95011027 100644 --- a/clang/include/clang/Basic/DiagnosticCrossTU.h +++ b/clang/include/clang/Basic/DiagnosticCrossTU.h @@ -22,6 +22,19 @@ enum { #undef DIAG NUM_BUILTIN_CROSSTU_DIAGNOSTICS }; + +#define DIAG_ENUM(ENUM_NAME) \ + namespace ENUM_NAME { \ + enum { +#define DIAG_ENUM_ITEM(IDX, NAME) NAME = IDX, +#define DIAG_ENUM_END() \ + } \ + ; \ + } +#include "clang/Basic/DiagnosticCrossTUEnums.inc" +#undef DIAG_ENUM_END +#undef DIAG_ENUM_ITEM +#undef DIAG_ENUM } // end namespace diag } // end namespace clang diff --git a/clang/include/clang/Basic/DiagnosticDriver.h b/clang/include/clang/Basic/DiagnosticDriver.h index 6931bd46542e8..c472afa3f6e96 100644 --- a/clang/include/clang/Basic/DiagnosticDriver.h +++ b/clang/include/clang/Basic/DiagnosticDriver.h @@ -22,6 +22,19 @@ enum { #undef DIAG NUM_BUILTIN_DRIVER_DIAGNOSTICS }; + +#define DIAG_ENUM(ENUM_NAME) \ + namespace ENUM_NAME { \ + enum { +#define DIAG_ENUM_ITEM(IDX, NAME) NAME = IDX, +#define DIAG_ENUM_END() \ + } \ + ; \ + } +#include "clang/Basic/DiagnosticDriverEnums.inc" +#undef DIAG_ENUM_END +#undef DIAG_ENUM_ITEM +#undef DIAG_ENUM } // end namespace diag } // end namespace clang diff --git a/clang/include/clang/Basic/DiagnosticFrontend.h b/clang/include/clang/Basic/DiagnosticFrontend.h index 
ab4e855f2de02..766cac3d655b3 100644 --- a/clang/include/clang/Basic/DiagnosticFrontend.h +++ b/clang/include/clang/Basic/DiagnosticFrontend.h @@ -22,6 +22,19 @@ enum { #undef DIAG NUM_BUILTIN_FRONTEND_DIAGNOSTICS }; + +#define DIAG_ENUM(ENUM_NAME) \ + namespace ENUM_NAME { \ + enum { +#define DIAG_ENUM_ITEM(IDX, NAME) NAME = IDX, +#define DIAG_ENUM_END() \ + } \ + ; \ + } +#include "clang/Basic/DiagnosticFrontendEnums.inc" +#undef DIAG_ENUM_END +#undef DIAG_ENUM_ITEM +#undef DIAG_ENUM } // end namespace diag } // end namespace clang diff --git a/clang/include/clang/Basic/DiagnosticInstallAPI.h b/clang/include/clang/Basic/DiagnosticInstallAPI.h index a76f6e087a2b0..cbdb00362624b 100644 --- a/clang/include/clang/Basic/DiagnosticInstallAPI.h +++ b/clang/include/clang/Basic/DiagnosticInstallAPI.h @@ -21,6 +21,19 @@ enum { #undef DIAG NUM_BUILTIN_INSTALLAPI_DIAGNOSTICS }; + +#define DIAG_ENUM(ENUM_NAME) \ + namespace ENUM_NAME { \ + enum { +#define DIAG_ENUM_ITEM(IDX, NAME) NAME = IDX, +#define DIAG_ENUM_END() \ + } \ + ; \ + } +#include "clang/Basic/DiagnosticInstallAPIEnums.inc" +#undef DIAG_ENUM_END +#undef DIAG_ENUM_ITEM +#undef DIAG_ENUM } // namespace diag } // namespace clang #endif // LLVM_CLANG_BASIC_DIAGNOSTICINSTALLAPI_H diff --git a/clang/include/clang/Basic/DiagnosticLex.h b/clang/include/clang/Basic/DiagnosticLex.h index 5f237085ae03a..d14bf97e8642e 100644 --- a/clang/include/clang/Basic/DiagnosticLex.h +++ b/clang/include/clang/Basic/DiagnosticLex.h @@ -22,6 +22,18 @@ enum { #undef DIAG NUM_BUILTIN_LEX_DIAGNOSTICS }; +#define DIAG_ENUM(ENUM_NAME) \ + namespace ENUM_NAME { \ + enum { +#define DIAG_ENUM_ITEM(IDX, NAME) NAME = IDX, +#define DIAG_ENUM_END() \ + } \ + ; \ + } +#include "clang/Basic/DiagnosticLexEnums.inc" +#undef DIAG_ENUM_END +#undef DIAG_ENUM_ITEM +#undef DIAG_ENUM } // end namespace diag } // end namespace clang diff --git a/clang/include/clang/Basic/DiagnosticParse.h b/clang/include/clang/Basic/DiagnosticParse.h index 
81a8185d25fb7..275e1a4c39b3f 100644 --- a/clang/include/clang/Basic/DiagnosticParse.h +++ b/clang/include/clang/Basic/DiagnosticParse.h @@ -22,6 +22,19 @@ enum { #undef DIAG NUM_BUILTIN_PARSE_DIAGNOSTICS }; + +#define DIAG_ENUM(ENUM_NAME) \ + namespace ENUM_NAME { \ + enum { +#define DIAG_ENUM_ITEM(IDX, NAME) NAME = IDX, +#define DIAG_ENUM_END() \ + } \ + ; \ + } +#include "clang/Basic/DiagnosticParseEnums.inc" +#undef DIAG_ENUM_END +#undef DIAG_ENUM_ITEM +#undef DIAG_ENUM } // end namespace diag } // end namespace clang diff --git a/clang/include/clang/Basic/DiagnosticRefactoring.h b/clang/include/clang/Basic/DiagnosticRefactoring.h index 9b628dbeb7c26..59d4bc912733a 100644 --- a/clang/include/clang/Basic/DiagnosticRefactoring.h +++ b/clang/include/clang/Basic/DiagnosticRefactoring.h @@ -22,6 +22,19 @@ enum { #undef DIAG NUM_BUILTIN_REFACTORING_DIAGNOSTICS }; + +#define DIAG_ENUM(ENUM_NAME) \ + namespace ENUM_NAME { \ + enum { +#define DIAG_ENUM_ITEM(IDX, NAME) NAME = IDX, +#define DIAG_ENUM_END() \ + } \ + ; \ + } +#include "clang/Basic/DiagnosticRefactoringEnums.inc" +#undef DIAG_ENUM_END +#undef DIAG_ENUM_ITEM +#undef DIAG_ENUM } // end namespace diag } // end namespace clang diff --git a/clang/include/clang/Basic/DiagnosticSema.h b/clang/include/clang/Basic/DiagnosticSema.h index 45014fe21271d..84986c7bccf71 100644 --- a/clang/include/clang/Basic/DiagnosticSema.h +++ b/clang/include/clang/Basic/DiagnosticSema.h @@ -22,6 +22,19 @@ enum { #undef DIAG NUM_BUILTIN_SEMA_DIAGNOSTICS }; + +#define DIAG_ENUM(ENUM_NAME) \ + namespace ENUM_NAME { \ + enum { +#define DIAG_ENUM_ITEM(IDX, NAME) NAME = IDX, +#define DIAG_ENUM_END() \ + } \ + ; \ + } +#include "clang/Basic/DiagnosticSemaEnums.inc" +#undef DIAG_ENUM_END +#undef DIAG_ENUM_ITEM +#undef DIAG_ENUM } // end namespace diag } // end namespace clang diff --git a/clang/include/clang/Basic/DiagnosticSemaKinds.td b/clang/include/clang/Basic/DiagnosticSemaKinds.td index 67c15e7c47594..db54312ad965e 100644 --- 
a/clang/include/clang/Basic/DiagnosticSemaKinds.td +++ b/clang/include/clang/Basic/DiagnosticSemaKinds.td @@ -617,9 +617,11 @@ def err_ambiguous_inherited_constructor : Error< "constructor of %0 inherited from multiple base class subobjects">; def note_ambiguous_inherited_constructor_using : Note< "inherited from base class %0 here">; -def note_using_decl_class_member_workaround : Note< - "use %select{an alias declaration|a typedef declaration|a reference|" - "a const variable|a constexpr variable}0 instead">; +def note_using_decl_class_member_workaround + : Note<"use %enum_select{%AliasDecl{an alias " + "declaration}|%TypedefDecl{a typedef declaration}|%ReferenceDecl{a " + "reference}|%ConstVar{a const variable}|%ConstexprVar{a constexpr " + "variable}}0 instead">; def err_using_decl_can_not_refer_to_namespace : Error< "using declaration cannot refer to a namespace">; def note_namespace_using_decl : Note< @@ -2202,6 +2204,8 @@ def err_invalid_qualified_constructor : Error< "'%0' qualifier is not allowed on a constructor">; def err_ref_qualifier_constructor : Error< "ref-qualifier '%select{&&|&}0' is not allowed on a constructor">; +def err_invalid_ctor_dtor_decl : Error< + "invalid %select{constructor|destructor}0 declaration">; def err_constructor_return_type : Error< "constructor cannot have a return type">; @@ -9301,7 +9305,7 @@ def err_typecheck_expect_scalar_or_vector : Error< "invalid operand of type %0 where %1 or " "a vector of such type is required">; def err_typecheck_expect_any_scalar_or_vector : Error< - "invalid operand of type %0 where a scalar or vector is required">; + "invalid operand of type %0%select{| where a scalar or vector is required}1">; def err_typecheck_expect_flt_or_vector : Error< "invalid operand of type %0 where floating, complex or " "a vector of such types is required">; diff --git a/clang/include/clang/Basic/DiagnosticSerialization.h b/clang/include/clang/Basic/DiagnosticSerialization.h index 0c622a5657737..6fb836dca1b04 100644 
--- a/clang/include/clang/Basic/DiagnosticSerialization.h +++ b/clang/include/clang/Basic/DiagnosticSerialization.h @@ -22,6 +22,19 @@ enum { #undef DIAG NUM_BUILTIN_SERIALIZATION_DIAGNOSTICS }; + +#define DIAG_ENUM(ENUM_NAME) \ + namespace ENUM_NAME { \ + enum { +#define DIAG_ENUM_ITEM(IDX, NAME) NAME = IDX, +#define DIAG_ENUM_END() \ + } \ + ; \ + } +#include "clang/Basic/DiagnosticSerializationEnums.inc" +#undef DIAG_ENUM_END +#undef DIAG_ENUM_ITEM +#undef DIAG_ENUM } // end namespace diag } // end namespace clang diff --git a/clang/include/clang/Basic/arm_sve.td b/clang/include/clang/Basic/arm_sve.td index 47f1754aeb629..ac1c139b20943 100644 --- a/clang/include/clang/Basic/arm_sve.td +++ b/clang/include/clang/Basic/arm_sve.td @@ -1954,13 +1954,13 @@ let SVETargetGuard = "sve2,lut", SMETargetGuard = "sme2,lut" in { def SVLUTI4_B : SInst<"svluti4_lane[_{d}]", "dd[i", "cUc", MergeNone, "aarch64_sve_luti4_lane", [VerifyRuntimeMode], [ImmCheck<2, ImmCheck0_1>]>; def SVLUTI4_H : SInst<"svluti4_lane[_{d}]", "dd[i", "sUsh", MergeNone, "aarch64_sve_luti4_lane", [VerifyRuntimeMode], [ImmCheck<2, ImmCheck0_3>]>; - def SVLUTI4_x2 : SInst<"svluti4_lane[_{d}]_x2", "d2.d[i", "sUsh", MergeNone, "aarch64_sve_luti4_lane_x2", [VerifyRuntimeMode], [ImmCheck<2, ImmCheck0_3>]>; + def SVLUTI4_x2 : SInst<"svluti4_lane[_{d}_x2]", "d2.d[i", "sUsh", MergeNone, "aarch64_sve_luti4_lane_x2", [VerifyRuntimeMode], [ImmCheck<2, ImmCheck0_3>]>; } let SVETargetGuard = "sve2,lut,bf16", SMETargetGuard = "sme2,lut,bf16" in { def SVLUTI2_BF16 : SInst<"svluti2_lane[_{d}]", "dd[i", "b", MergeNone, "aarch64_sve_luti2_lane", [ VerifyRuntimeMode], [ImmCheck<2, ImmCheck0_7>]>; def SVLUTI4_BF16 : SInst<"svluti4_lane[_{d}]", "dd[i", "b", MergeNone, "aarch64_sve_luti4_lane", [ VerifyRuntimeMode], [ImmCheck<2, ImmCheck0_3>]>; - def SVLUTI4_BF16_x2 : SInst<"svluti4_lane[_{d}]_x2", "d2.d[i", "b", MergeNone, "aarch64_sve_luti4_lane_x2", [ VerifyRuntimeMode], [ImmCheck<2, ImmCheck0_3>]>; + def SVLUTI4_BF16_x2 : 
SInst<"svluti4_lane[_{d}_x2]", "d2.d[i", "b", MergeNone, "aarch64_sve_luti4_lane_x2", [ VerifyRuntimeMode], [ImmCheck<2, ImmCheck0_3>]>; } //////////////////////////////////////////////////////////////////////////////// diff --git a/clang/include/clang/Driver/Driver.h b/clang/include/clang/Driver/Driver.h index e6d1e1f888f25..f4a52cc529b79 100644 --- a/clang/include/clang/Driver/Driver.h +++ b/clang/include/clang/Driver/Driver.h @@ -79,7 +79,6 @@ class CUIDOptions { enum class Kind { Hash, Random, Fixed, None, Invalid }; CUIDOptions() = default; - CUIDOptions(const CUIDOptions &) = default; CUIDOptions(llvm::opt::DerivedArgList &Args, const Driver &D); // Get the CUID for an input string @@ -492,7 +491,7 @@ class Driver { /// ArgList. llvm::opt::InputArgList ParseArgStrings(ArrayRef Args, bool UseDriverMode, - bool &ContainsError); + bool &ContainsError) const; /// BuildInputs - Construct the list of inputs and their types from /// the given arguments. diff --git a/clang/include/clang/Driver/Multilib.h b/clang/include/clang/Driver/Multilib.h index 0a533ed2804e2..fc071ef48ca0f 100644 --- a/clang/include/clang/Driver/Multilib.h +++ b/clang/include/clang/Driver/Multilib.h @@ -168,9 +168,18 @@ class MultilibSet { const_iterator begin() const { return Multilibs.begin(); } const_iterator end() const { return Multilibs.end(); } + /// Process custom flags from \p Flags and returns an expanded flags list and + /// a list of macro defines. + /// Returns a pair where: + /// - first: the new flags list including custom flags after processing. + /// - second: the extra macro defines to be fed to the driver. 
+ std::pair> + processCustomFlags(const Driver &D, const Multilib::flags_list &Flags) const; + /// Select compatible variants, \returns false if none are compatible bool select(const Driver &D, const Multilib::flags_list &Flags, - llvm::SmallVectorImpl &) const; + llvm::SmallVectorImpl &, + llvm::SmallVector * = nullptr) const; unsigned size() const { return Multilibs.size(); } diff --git a/clang/include/clang/Driver/Options.td b/clang/include/clang/Driver/Options.td index 2721c1b5d8dc5..d38dd2b4e3cf0 100644 --- a/clang/include/clang/Driver/Options.td +++ b/clang/include/clang/Driver/Options.td @@ -4157,9 +4157,9 @@ def ftrap_function_EQ : Joined<["-"], "ftrap-function=">, Group, HelpText<"Issue call to specified function rather than a trap instruction">, MarshallingInfoString>; def funroll_loops : Flag<["-"], "funroll-loops">, Group, - HelpText<"Turn on loop unroller">, Visibility<[ClangOption, CC1Option]>; + HelpText<"Turn on loop unroller">, Visibility<[ClangOption, CC1Option, FlangOption, FC1Option]>; def fno_unroll_loops : Flag<["-"], "fno-unroll-loops">, Group, - HelpText<"Turn off loop unroller">, Visibility<[ClangOption, CC1Option]>; + HelpText<"Turn off loop unroller">, Visibility<[ClangOption, CC1Option, FlangOption, FC1Option]>; def ffinite_loops: Flag<["-"], "ffinite-loops">, Group, HelpText<"Assume all non-trivial loops are finite.">, Visibility<[ClangOption, CC1Option]>; def fno_finite_loops: Flag<["-"], "fno-finite-loops">, Group, @@ -6670,6 +6670,7 @@ def fbinutils_version_EQ : Joined<["-"], "fbinutils-version=">, def fuse_ld_EQ : Joined<["-"], "fuse-ld=">, Group, Flags<[LinkOption]>, Visibility<[ClangOption, FlangOption, CLOption]>; def ld_path_EQ : Joined<["--"], "ld-path=">, Group; +def fuse_lipo_EQ : Joined<["-"], "fuse-lipo=">, Group, Flags<[LinkOption]>; defm align_labels : BooleanFFlag<"align-labels">, Group; def falign_labels_EQ : Joined<["-"], "falign-labels=">, Group; diff --git a/clang/include/clang/Driver/ToolChain.h 
b/clang/include/clang/Driver/ToolChain.h index 701a1d25ca4c8..7d1d8feebf35e 100644 --- a/clang/include/clang/Driver/ToolChain.h +++ b/clang/include/clang/Driver/ToolChain.h @@ -686,6 +686,13 @@ class ToolChain { /// Add warning options that need to be passed to cc1 for this target. virtual void addClangWarningOptions(llvm::opt::ArgStringList &CC1Args) const; + // Get the list of extra macro defines requested by the multilib + // configuration. + virtual SmallVector + getMultilibMacroDefinesStr(llvm::opt::ArgList &Args) const { + return {}; + }; + // GetRuntimeLibType - Determine the runtime library type to use with the // given compilation arguments. virtual RuntimeLibType diff --git a/clang/include/clang/Lex/Lexer.h b/clang/include/clang/Lex/Lexer.h index b6ecc7e5ded9e..82a041ea3f848 100644 --- a/clang/include/clang/Lex/Lexer.h +++ b/clang/include/clang/Lex/Lexer.h @@ -554,7 +554,8 @@ class Lexer : public PreprocessorLexer { /// Returns the next token, or std::nullopt if the location is inside a macro. static std::optional findNextToken(SourceLocation Loc, const SourceManager &SM, - const LangOptions &LangOpts); + const LangOptions &LangOpts, + bool IncludeComments = false); /// Checks that the given token is the first token that occurs after /// the given location (this excludes comments and whitespace). 
Returns the diff --git a/clang-tools-extra/clangd/HeuristicResolver.h b/clang/include/clang/Sema/HeuristicResolver.h similarity index 95% rename from clang-tools-extra/clangd/HeuristicResolver.h rename to clang/include/clang/Sema/HeuristicResolver.h index c130e0677e86d..947de7a4e83ce 100644 --- a/clang-tools-extra/clangd/HeuristicResolver.h +++ b/clang/include/clang/Sema/HeuristicResolver.h @@ -6,8 +6,8 @@ // //===----------------------------------------------------------------------===// -#ifndef LLVM_CLANG_TOOLS_EXTRA_CLANGD_HEURISTICRESOLVER_H -#define LLVM_CLANG_TOOLS_EXTRA_CLANGD_HEURISTICRESOLVER_H +#ifndef LLVM_CLANG_SEMA_HEURISTICRESOLVER_H +#define LLVM_CLANG_SEMA_HEURISTICRESOLVER_H #include "clang/AST/Decl.h" #include @@ -24,8 +24,6 @@ class NamedDecl; class Type; class UnresolvedUsingValueDecl; -namespace clangd { - // This class handles heuristic resolution of declarations and types in template // code. // @@ -80,7 +78,6 @@ class HeuristicResolver { ASTContext &Ctx; }; -} // namespace clangd } // namespace clang #endif diff --git a/clang/include/clang/Sema/MultiplexExternalSemaSource.h b/clang/include/clang/Sema/MultiplexExternalSemaSource.h index 08d6143f7caaf..921bebe3a44af 100644 --- a/clang/include/clang/Sema/MultiplexExternalSemaSource.h +++ b/clang/include/clang/Sema/MultiplexExternalSemaSource.h @@ -96,7 +96,7 @@ class MultiplexExternalSemaSource : public ExternalSemaSource { /// given context. 
bool FindExternalVisibleDeclsByName(const DeclContext *DC, DeclarationName Name, - Module *NamedModule) override; + const DeclContext *OriginalDC) override; bool LoadExternalSpecializations(const Decl *D, bool OnlyPartial) override; diff --git a/clang/include/clang/Serialization/ASTBitCodes.h b/clang/include/clang/Serialization/ASTBitCodes.h index 40dae25f7b54b..d568d2fd7aa30 100644 --- a/clang/include/clang/Serialization/ASTBitCodes.h +++ b/clang/include/clang/Serialization/ASTBitCodes.h @@ -740,6 +740,8 @@ enum ASTRecordTypes { CXX_ADDED_TEMPLATE_PARTIAL_SPECIALIZATION = 75, UPDATE_MODULE_LOCAL_VISIBLE = 76, + + UPDATE_TU_LOCAL_VISIBLE = 77, }; /// Record types used within a source manager block. @@ -1340,6 +1342,10 @@ enum DeclCode { /// only visible from DeclContext in the same module. DECL_CONTEXT_MODULE_LOCAL_VISIBLE, + /// A record that stores the set of declarations that are only visible + /// to the TU. + DECL_CONTEXT_TU_LOCAL_VISIBLE, + /// A LabelDecl record. DECL_LABEL, diff --git a/clang/include/clang/Serialization/ASTReader.h b/clang/include/clang/Serialization/ASTReader.h index ea12adaec3ee8..82564fe664acb 100644 --- a/clang/include/clang/Serialization/ASTReader.h +++ b/clang/include/clang/Serialization/ASTReader.h @@ -528,6 +528,7 @@ class ASTReader uint64_t LexicalOffset; uint64_t VisibleOffset; uint64_t ModuleLocalOffset; + uint64_t TULocalOffset; }; using DelayedNamespaceOffsetMapTy = @@ -640,6 +641,9 @@ class ASTReader llvm::DenseMap ModuleLocalLookups; + llvm::DenseMap + TULocalLookups; using SpecLookupTableTy = llvm::DenseMap PendingVisibleUpdates; llvm::DenseMap PendingModuleLocalVisibleUpdates; + llvm::DenseMap TULocalUpdates; using SpecializationsUpdate = SmallVector; using SpecializationsUpdateMap = @@ -704,11 +709,17 @@ class ASTReader llvm::BitstreamCursor &Cursor, uint64_t Offset, DeclContext *DC); + enum class VisibleDeclContextStorageKind { + GenerallyVisible, + ModuleLocalVisible, + TULocalVisible, + }; + /// Read the record that 
describes the visible contents of a DC. bool ReadVisibleDeclContextStorage(ModuleFile &M, llvm::BitstreamCursor &Cursor, uint64_t Offset, GlobalDeclID ID, - bool IsModuleLocal); + VisibleDeclContextStorageKind VisibleKind); bool ReadSpecializations(ModuleFile &M, llvm::BitstreamCursor &Cursor, uint64_t Offset, Decl *D, bool IsPartial); @@ -1148,6 +1159,10 @@ class ASTReader unsigned NumModuleLocalVisibleDeclContexts = 0, TotalModuleLocalVisibleDeclContexts = 0; + /// Number of TU Local decl contexts read/total + unsigned NumTULocalVisibleDeclContexts = 0, + TotalTULocalVisibleDeclContexts = 0; + /// Total size of modules, in bits, currently loaded uint64_t TotalModulesSizeInBits = 0; @@ -1221,6 +1236,24 @@ class ASTReader /// been completed. std::deque PendingDeclContextInfos; + /// Deserialization of some attributes must be deferred since they refer + /// to themselves in their type (e.g., preferred_name attribute refers to the + /// typedef that refers back to the template specialization of the template + /// that the attribute is attached to). + /// More attributes that store TypeSourceInfo might be potentially affected, + /// see https://github.com/llvm/llvm-project/issues/56490 for details. + struct DeferredAttribute { + // Index of the deferred attribute in the Record of the TargetedDecl. + uint64_t RecordIdx; + // Decl to attach a deferred attribute to. + Decl *TargetedDecl; + }; + + /// The collection of Decls that have been loaded but some of their attributes + /// have been deferred, paired with the index inside the record pointing + /// at the skipped attribute. 
+ SmallVector PendingDeferredAttributes; + template using DuplicateObjCDecls = std::pair; @@ -1463,6 +1496,9 @@ class ASTReader const serialization::reader::ModuleLocalLookupTable * getModuleLocalLookupTables(DeclContext *Primary) const; + const serialization::reader::DeclContextLookupTable * + getTULocalLookupTables(DeclContext *Primary) const; + /// Get the loaded specializations lookup tables for \p D, /// if any. serialization::reader::LazySpecializationInfoLookupTable * @@ -1570,6 +1606,7 @@ class ASTReader void loadPendingDeclChain(Decl *D, uint64_t LocalOffset); void loadObjCCategories(GlobalDeclID ID, ObjCInterfaceDecl *D, unsigned PreviousGeneration = 0); + void loadDeferredAttribute(const DeferredAttribute &DA); RecordLocation getLocalBitOffset(uint64_t GlobalOffset); uint64_t getGlobalBitOffset(ModuleFile &M, uint64_t LocalOffset); @@ -2139,7 +2176,7 @@ class ASTReader /// lookup table as unmaterialized references. bool FindExternalVisibleDeclsByName(const DeclContext *DC, DeclarationName Name, - Module *NamedModule) override; + const DeclContext *OriginalDC) override; /// Read all of the declarations lexically stored in a /// declaration context. diff --git a/clang/include/clang/Serialization/ASTRecordReader.h b/clang/include/clang/Serialization/ASTRecordReader.h index 2561418b78ca7..a29972fcf73a8 100644 --- a/clang/include/clang/Serialization/ASTRecordReader.h +++ b/clang/include/clang/Serialization/ASTRecordReader.h @@ -83,6 +83,12 @@ class ASTRecordReader /// Returns the current value in this record, without advancing. uint64_t peekInt() { return Record[Idx]; } + /// Returns the next N values in this record, without advancing. + uint64_t peekInts(unsigned N) { return Record[Idx + N]; } + + /// Skips the current value. + void skipInt() { Idx += 1; } + /// Skips the specified number of values. 
void skipInts(unsigned N) { Idx += N; } @@ -335,7 +341,12 @@ class ASTRecordReader Attr *readAttr(); /// Reads attributes from the current stream position, advancing Idx. - void readAttributes(AttrVec &Attrs); + /// For some attributes (where type depends on itself recursively), defer + /// reading the attribute until the type has been read. + void readAttributes(AttrVec &Attrs, Decl *D = nullptr); + + /// Reads one attribute from the current stream position, advancing Idx. + Attr *readOrDeferAttrFor(Decl *D); /// Read an BTFTypeTagAttr object. BTFTypeTagAttr *readBTFTypeTagAttr() { diff --git a/clang/include/clang/Serialization/ASTWriter.h b/clang/include/clang/Serialization/ASTWriter.h index 53b09cc914392..079e39a9fb678 100644 --- a/clang/include/clang/Serialization/ASTWriter.h +++ b/clang/include/clang/Serialization/ASTWriter.h @@ -496,6 +496,9 @@ class ASTWriter : public ASTDeserializationListener, /// file. unsigned NumModuleLocalDeclContexts = 0; + /// The number of TULocal declcontexts written to the AST file. + unsigned NumTULocalDeclContexts = 0; + /// A mapping from each known submodule to its ID number, which will /// be a positive integer. 
llvm::DenseMap SubmoduleIDs; @@ -594,12 +597,14 @@ class ASTWriter : public ASTDeserializationListener, void GenerateNameLookupTable(ASTContext &Context, const DeclContext *DC, llvm::SmallVectorImpl &LookupTable, - llvm::SmallVectorImpl &ModuleLocalLookupTable); + llvm::SmallVectorImpl &ModuleLocalLookupTable, + llvm::SmallVectorImpl &TULocalLookupTable); uint64_t WriteDeclContextLexicalBlock(ASTContext &Context, const DeclContext *DC); void WriteDeclContextVisibleBlock(ASTContext &Context, DeclContext *DC, uint64_t &VisibleBlockOffset, - uint64_t &ModuleLocalBlockOffset); + uint64_t &ModuleLocalBlockOffset, + uint64_t &TULocalBlockOffset); void WriteTypeDeclOffsets(); void WriteFileDeclIDsMap(); void WriteComments(ASTContext &Context); @@ -633,8 +638,10 @@ class ASTWriter : public ASTDeserializationListener, unsigned DeclContextLexicalAbbrev = 0; unsigned DeclContextVisibleLookupAbbrev = 0; unsigned DeclModuleLocalVisibleLookupAbbrev = 0; + unsigned DeclTULocalLookupAbbrev = 0; unsigned UpdateVisibleAbbrev = 0; unsigned ModuleLocalUpdateVisibleAbbrev = 0; + unsigned TULocalUpdateVisibleAbbrev = 0; unsigned DeclRecordAbbrev = 0; unsigned DeclTypedefAbbrev = 0; unsigned DeclVarAbbrev = 0; diff --git a/clang/lib/AST/ASTContext.cpp b/clang/lib/AST/ASTContext.cpp index be1dd29d46278..155dbcfcaeed3 100644 --- a/clang/lib/AST/ASTContext.cpp +++ b/clang/lib/AST/ASTContext.cpp @@ -5209,6 +5209,85 @@ QualType ASTContext::getEnumType(const EnumDecl *Decl) const { return QualType(newType, 0); } +bool ASTContext::computeBestEnumTypes(bool IsPacked, unsigned NumNegativeBits, + unsigned NumPositiveBits, + QualType &BestType, + QualType &BestPromotionType) { + unsigned IntWidth = Target->getIntWidth(); + unsigned CharWidth = Target->getCharWidth(); + unsigned ShortWidth = Target->getShortWidth(); + bool EnumTooLarge = false; + unsigned BestWidth; + if (NumNegativeBits) { + // If there is a negative value, figure out the smallest integer type (of + // int/long/longlong) that fits. 
+ // If it's packed, check also if it fits a char or a short. + if (IsPacked && NumNegativeBits <= CharWidth && + NumPositiveBits < CharWidth) { + BestType = SignedCharTy; + BestWidth = CharWidth; + } else if (IsPacked && NumNegativeBits <= ShortWidth && + NumPositiveBits < ShortWidth) { + BestType = ShortTy; + BestWidth = ShortWidth; + } else if (NumNegativeBits <= IntWidth && NumPositiveBits < IntWidth) { + BestType = IntTy; + BestWidth = IntWidth; + } else { + BestWidth = Target->getLongWidth(); + + if (NumNegativeBits <= BestWidth && NumPositiveBits < BestWidth) { + BestType = LongTy; + } else { + BestWidth = Target->getLongLongWidth(); + + if (NumNegativeBits > BestWidth || NumPositiveBits >= BestWidth) + EnumTooLarge = true; + BestType = LongLongTy; + } + } + BestPromotionType = (BestWidth <= IntWidth ? IntTy : BestType); + } else { + // If there is no negative value, figure out the smallest type that fits + // all of the enumerator values. + // If it's packed, check also if it fits a char or a short. + if (IsPacked && NumPositiveBits <= CharWidth) { + BestType = UnsignedCharTy; + BestPromotionType = IntTy; + BestWidth = CharWidth; + } else if (IsPacked && NumPositiveBits <= ShortWidth) { + BestType = UnsignedShortTy; + BestPromotionType = IntTy; + BestWidth = ShortWidth; + } else if (NumPositiveBits <= IntWidth) { + BestType = UnsignedIntTy; + BestWidth = IntWidth; + BestPromotionType = (NumPositiveBits == BestWidth || !LangOpts.CPlusPlus) + ? UnsignedIntTy + : IntTy; + } else if (NumPositiveBits <= (BestWidth = Target->getLongWidth())) { + BestType = UnsignedLongTy; + BestPromotionType = (NumPositiveBits == BestWidth || !LangOpts.CPlusPlus) + ? UnsignedLongTy + : LongTy; + } else { + BestWidth = Target->getLongLongWidth(); + if (NumPositiveBits > BestWidth) { + // This can happen with bit-precise integer types, but those are not + // allowed as the type for an enumerator per C23 6.7.2.2p4 and p12. 
+ // FIXME: GCC uses __int128_t and __uint128_t for cases that fit within + // a 128-bit integer, we should consider doing the same. + EnumTooLarge = true; + } + BestType = UnsignedLongLongTy; + BestPromotionType = (NumPositiveBits == BestWidth || !LangOpts.CPlusPlus) + ? UnsignedLongLongTy + : LongLongTy; + } + } + return EnumTooLarge; +} + QualType ASTContext::getUnresolvedUsingType( const UnresolvedUsingTypenameDecl *Decl) const { if (Decl->TypeForDecl) @@ -6248,7 +6327,8 @@ QualType ASTContext::getPackIndexingType(QualType Pattern, Expr *IndexExpr, Canonical = getCanonicalType(Expansions[Index]); } else { llvm::FoldingSetNodeID ID; - PackIndexingType::Profile(ID, *this, Pattern, IndexExpr, FullySubstituted); + PackIndexingType::Profile(ID, *this, Pattern.getCanonicalType(), IndexExpr, + FullySubstituted); void *InsertPos = nullptr; PackIndexingType *Canon = DependentPackIndexingTypes.FindNodeOrInsertPos(ID, InsertPos); @@ -6256,8 +6336,9 @@ QualType ASTContext::getPackIndexingType(QualType Pattern, Expr *IndexExpr, void *Mem = Allocate( PackIndexingType::totalSizeToAlloc(Expansions.size()), TypeAlignment); - Canon = new (Mem) PackIndexingType(*this, QualType(), Pattern, IndexExpr, - FullySubstituted, Expansions); + Canon = new (Mem) + PackIndexingType(*this, QualType(), Pattern.getCanonicalType(), + IndexExpr, FullySubstituted, Expansions); DependentPackIndexingTypes.InsertNode(Canon, InsertPos); } Canonical = QualType(Canon, 0); diff --git a/clang/lib/AST/ByteCode/Compiler.cpp b/clang/lib/AST/ByteCode/Compiler.cpp index 4bfb80589620c..3ef2b0858e667 100644 --- a/clang/lib/AST/ByteCode/Compiler.cpp +++ b/clang/lib/AST/ByteCode/Compiler.cpp @@ -2066,7 +2066,7 @@ bool Compiler::VisitUnaryExprOrTypeTraitExpr( Size = CharUnits::One(); else { if (ArgType->isDependentType() || !ArgType->isConstantSizeType()) - return false; + return this->emitInvalid(E); if (Kind == UETT_SizeOf) Size = ASTCtx.getTypeSizeInChars(ArgType); @@ -4247,6 +4247,7 @@ bool 
Compiler::visitExpr(const Expr *E, bool DestroyToplevelScope) { // For us, that means everything we don't // have a PrimType for. if (std::optional LocalOffset = this->allocateLocal(E)) { + InitLinkScope ILS(this, InitLink::Temp(*LocalOffset)); if (!this->emitGetPtrLocal(*LocalOffset, E)) return false; @@ -6209,8 +6210,20 @@ bool Compiler::visitDeclRef(const ValueDecl *D, const Expr *E) { return revisit(VD); if ((VD->hasGlobalStorage() || VD->isStaticDataMember()) && - typeShouldBeVisited(VD->getType())) + typeShouldBeVisited(VD->getType())) { + if (const Expr *Init = VD->getAnyInitializer(); + Init && !Init->isValueDependent()) { + // Whether or not the evaluation is successul doesn't really matter + // here -- we will create a global variable in any case, and that + // will have the state of initializer evaluation attached. + APValue V; + SmallVector Notes; + (void)Init->EvaluateAsInitializer(V, Ctx.getASTContext(), VD, Notes, + true); + return this->visitDeclRef(D, E); + } return revisit(VD); + } // FIXME: The evaluateValue() check here is a little ridiculous, since // it will ultimately call into Context::evaluateAsInitializer(). 
In diff --git a/clang/lib/AST/ByteCode/EvaluationResult.cpp b/clang/lib/AST/ByteCode/EvaluationResult.cpp index 0ce8f15ea9127..d603e08c7bb4d 100644 --- a/clang/lib/AST/ByteCode/EvaluationResult.cpp +++ b/clang/lib/AST/ByteCode/EvaluationResult.cpp @@ -160,9 +160,9 @@ bool EvaluationResult::checkFullyInitialized(InterpState &S, return true; SourceLocation InitLoc; - if (const auto *D = Source.dyn_cast()) + if (const auto *D = dyn_cast(Source)) InitLoc = cast(D)->getAnyInitializer()->getExprLoc(); - else if (const auto *E = Source.dyn_cast()) + else if (const auto *E = dyn_cast(Source)) InitLoc = E->getExprLoc(); if (const Record *R = Ptr.getRecord()) diff --git a/clang/lib/AST/DeclBase.cpp b/clang/lib/AST/DeclBase.cpp index 42daaa4f3dcc3..2886aebdf52e9 100644 --- a/clang/lib/AST/DeclBase.cpp +++ b/clang/lib/AST/DeclBase.cpp @@ -130,6 +130,14 @@ void Decl::setOwningModuleID(unsigned ID) { *IDAddress |= (uint64_t)ID << 48; } +Module *Decl::getTopLevelOwningNamedModule() const { + if (getOwningModule() && + getOwningModule()->getTopLevelModule()->isNamedModule()) + return getOwningModule()->getTopLevelModule(); + + return nullptr; +} + Module *Decl::getOwningModuleSlow() const { assert(isFromASTFile() && "Not from AST file?"); return getASTContext().getExternalSource()->getModule(getOwningModuleID()); @@ -1850,28 +1858,22 @@ void DeclContext::buildLookupImpl(DeclContext *DCtx, bool Internal) { } } -Module *Decl::getTopLevelOwningNamedModule() const { - if (getOwningModule() && - getOwningModule()->getTopLevelModule()->isNamedModule()) - return getOwningModule()->getTopLevelModule(); - - return nullptr; -} - DeclContext::lookup_result DeclContext::lookup(DeclarationName Name) const { - return lookupImpl(Name, cast(this)->getTopLevelOwningNamedModule()); -} - -DeclContext::lookup_result DeclContext::lookupImpl(DeclarationName Name, - Module *NamedModule) const { // For transparent DeclContext, we should lookup in their enclosing context. 
if (getDeclKind() == Decl::LinkageSpec || getDeclKind() == Decl::Export) - return getParent()->lookupImpl(Name, NamedModule); + return getParent()->lookup(Name); - const DeclContext *PrimaryContext = getPrimaryContext(); - if (PrimaryContext != this) - return PrimaryContext->lookupImpl(Name, NamedModule); + return getPrimaryContext()->lookupImpl(Name, this); +} + +DeclContext::lookup_result +DeclContext::lookupImpl(DeclarationName Name, + const DeclContext *OriginalLookupDC) const { + assert(this == getPrimaryContext() && + "lookupImpl should only be called with primary DC!"); + assert(getDeclKind() != Decl::LinkageSpec && getDeclKind() != Decl::Export && + "We shouldn't lookup in transparent DC."); // If we have an external source, ensure that any later redeclarations of this // context have been loaded, since they may add names to the result of this @@ -1902,7 +1904,7 @@ DeclContext::lookup_result DeclContext::lookupImpl(DeclarationName Name, if (!R.second && !R.first->second.hasExternalDecls()) return R.first->second.getLookupResult(); - if (Source->FindExternalVisibleDeclsByName(this, Name, NamedModule) || + if (Source->FindExternalVisibleDeclsByName(this, Name, OriginalLookupDC) || !R.second) { if (StoredDeclsMap *Map = LookupPtr) { StoredDeclsMap::iterator I = Map->find(Name); @@ -2129,8 +2131,8 @@ void DeclContext::makeDeclVisibleInContextImpl(NamedDecl *D, bool Internal) { if (ExternalASTSource *Source = getParentASTContext().getExternalSource()) if (hasExternalVisibleStorage() && Map->find(D->getDeclName()) == Map->end()) - Source->FindExternalVisibleDeclsByName( - this, D->getDeclName(), D->getTopLevelOwningNamedModule()); + Source->FindExternalVisibleDeclsByName(this, D->getDeclName(), + D->getDeclContext()); // Insert this declaration into the map. 
StoredDeclsList &DeclNameEntries = (*Map)[D->getDeclName()]; diff --git a/clang/lib/AST/ExternalASTMerger.cpp b/clang/lib/AST/ExternalASTMerger.cpp index a33f6e3447679..257e8338dedef 100644 --- a/clang/lib/AST/ExternalASTMerger.cpp +++ b/clang/lib/AST/ExternalASTMerger.cpp @@ -471,9 +471,9 @@ static bool importSpecializationsIfNeeded(Decl *D, ASTImporter *Importer) { return false; } -bool ExternalASTMerger::FindExternalVisibleDeclsByName(const DeclContext *DC, - DeclarationName Name, - Module *NamedModule) { +bool ExternalASTMerger::FindExternalVisibleDeclsByName( + const DeclContext *DC, DeclarationName Name, + const DeclContext *OriginalDC) { llvm::SmallVector Decls; llvm::SmallVector Candidates; diff --git a/clang/lib/AST/ExternalASTSource.cpp b/clang/lib/AST/ExternalASTSource.cpp index 4a29f4944f73c..e2451f294741d 100644 --- a/clang/lib/AST/ExternalASTSource.cpp +++ b/clang/lib/AST/ExternalASTSource.cpp @@ -90,9 +90,9 @@ ExternalASTSource::GetExternalCXXBaseSpecifiers(uint64_t Offset) { return nullptr; } -bool ExternalASTSource::FindExternalVisibleDeclsByName(const DeclContext *DC, - DeclarationName Name, - Module *NamedModule) { +bool ExternalASTSource::FindExternalVisibleDeclsByName( + const DeclContext *DC, DeclarationName Name, + const DeclContext *OriginalDC) { return false; } diff --git a/clang/lib/AST/VTableBuilder.cpp b/clang/lib/AST/VTableBuilder.cpp index e941c3bedb0a7..fa3055dd1206f 100644 --- a/clang/lib/AST/VTableBuilder.cpp +++ b/clang/lib/AST/VTableBuilder.cpp @@ -3831,8 +3831,8 @@ const VirtualBaseInfo &MicrosoftVTableContext::computeVBTableRelatedInformation( unsigned VBTableIndex = 1 + VBI->VBTableIndices.size(); for (const auto &VB : RD->vbases()) { const CXXRecordDecl *CurVBase = VB.getType()->getAsCXXRecordDecl(); - if (!VBI->VBTableIndices.count(CurVBase)) - VBI->VBTableIndices[CurVBase] = VBTableIndex++; + if (VBI->VBTableIndices.try_emplace(CurVBase, VBTableIndex).second) + ++VBTableIndex; } return *VBI; diff --git 
a/clang/lib/Analysis/ExprMutationAnalyzer.cpp b/clang/lib/Analysis/ExprMutationAnalyzer.cpp index 53b838e9ede4d..cefe64409c977 100644 --- a/clang/lib/Analysis/ExprMutationAnalyzer.cpp +++ b/clang/lib/Analysis/ExprMutationAnalyzer.cpp @@ -104,6 +104,8 @@ AST_MATCHER_P(Stmt, canResolveToExpr, const Stmt *, Inner) { AST_MATCHER_P(InitListExpr, hasAnyInit, ast_matchers::internal::Matcher, InnerMatcher) { for (const Expr *Arg : Node.inits()) { + if (Arg == nullptr) + continue; ast_matchers::internal::BoundNodesTreeBuilder Result(*Builder); if (InnerMatcher.matches(*Arg, Finder, &Result)) { *Builder = std::move(Result); diff --git a/clang/lib/Analysis/UnsafeBufferUsage.cpp b/clang/lib/Analysis/UnsafeBufferUsage.cpp index bef5fa8624ce4..a9aff39df6474 100644 --- a/clang/lib/Analysis/UnsafeBufferUsage.cpp +++ b/clang/lib/Analysis/UnsafeBufferUsage.cpp @@ -453,11 +453,8 @@ AST_MATCHER(ArraySubscriptExpr, isSafeArraySubscript) { return false; } - Expr::EvalResult EVResult; - if (Node.getIdx()->EvaluateAsInt(EVResult, Finder->getASTContext())) { - llvm::APSInt ArrIdx = EVResult.Val.getInt(); - // FIXME: ArrIdx.isNegative() we could immediately emit an error as that's a - // bug + if (const auto *IdxLit = dyn_cast(Node.getIdx())) { + const APInt ArrIdx = IdxLit->getValue(); if (ArrIdx.isNonNegative() && ArrIdx.getLimitedValue() < limit) return true; } diff --git a/clang/lib/Basic/DiagnosticIDs.cpp b/clang/lib/Basic/DiagnosticIDs.cpp index d77f28c80b2eb..81194bbf2538e 100644 --- a/clang/lib/Basic/DiagnosticIDs.cpp +++ b/clang/lib/Basic/DiagnosticIDs.cpp @@ -37,21 +37,7 @@ struct StaticDiagInfoDescriptionStringTable { #define DIAG(ENUM, CLASS, DEFAULT_SEVERITY, DESC, GROUP, SFINAE, NOWERROR, \ SHOWINSYSHEADER, SHOWINSYSMACRO, DEFERRABLE, CATEGORY) \ char ENUM##_desc[sizeof(DESC)]; - // clang-format off -#include "clang/Basic/DiagnosticCommonKinds.inc" -#include "clang/Basic/DiagnosticDriverKinds.inc" -#include "clang/Basic/DiagnosticFrontendKinds.inc" -#include 
"clang/Basic/DiagnosticSerializationKinds.inc" -#include "clang/Basic/DiagnosticLexKinds.inc" -#include "clang/Basic/DiagnosticParseKinds.inc" -#include "clang/Basic/DiagnosticASTKinds.inc" -#include "clang/Basic/DiagnosticCommentKinds.inc" -#include "clang/Basic/DiagnosticCrossTUKinds.inc" -#include "clang/Basic/DiagnosticSemaKinds.inc" -#include "clang/Basic/DiagnosticAnalysisKinds.inc" -#include "clang/Basic/DiagnosticRefactoringKinds.inc" -#include "clang/Basic/DiagnosticInstallAPIKinds.inc" - // clang-format on +#include "clang/Basic/AllDiagnosticKinds.inc" #undef DIAG }; @@ -59,21 +45,7 @@ const StaticDiagInfoDescriptionStringTable StaticDiagInfoDescriptions = { #define DIAG(ENUM, CLASS, DEFAULT_SEVERITY, DESC, GROUP, SFINAE, NOWERROR, \ SHOWINSYSHEADER, SHOWINSYSMACRO, DEFERRABLE, CATEGORY) \ DESC, -// clang-format off -#include "clang/Basic/DiagnosticCommonKinds.inc" -#include "clang/Basic/DiagnosticDriverKinds.inc" -#include "clang/Basic/DiagnosticFrontendKinds.inc" -#include "clang/Basic/DiagnosticSerializationKinds.inc" -#include "clang/Basic/DiagnosticLexKinds.inc" -#include "clang/Basic/DiagnosticParseKinds.inc" -#include "clang/Basic/DiagnosticASTKinds.inc" -#include "clang/Basic/DiagnosticCommentKinds.inc" -#include "clang/Basic/DiagnosticCrossTUKinds.inc" -#include "clang/Basic/DiagnosticSemaKinds.inc" -#include "clang/Basic/DiagnosticAnalysisKinds.inc" -#include "clang/Basic/DiagnosticRefactoringKinds.inc" -#include "clang/Basic/DiagnosticInstallAPIKinds.inc" -// clang-format on +#include "clang/Basic/AllDiagnosticKinds.inc" #undef DIAG }; @@ -85,21 +57,7 @@ const uint32_t StaticDiagInfoDescriptionOffsets[] = { #define DIAG(ENUM, CLASS, DEFAULT_SEVERITY, DESC, GROUP, SFINAE, NOWERROR, \ SHOWINSYSHEADER, SHOWINSYSMACRO, DEFERRABLE, CATEGORY) \ offsetof(StaticDiagInfoDescriptionStringTable, ENUM##_desc), -// clang-format off -#include "clang/Basic/DiagnosticCommonKinds.inc" -#include "clang/Basic/DiagnosticDriverKinds.inc" -#include 
"clang/Basic/DiagnosticFrontendKinds.inc" -#include "clang/Basic/DiagnosticSerializationKinds.inc" -#include "clang/Basic/DiagnosticLexKinds.inc" -#include "clang/Basic/DiagnosticParseKinds.inc" -#include "clang/Basic/DiagnosticASTKinds.inc" -#include "clang/Basic/DiagnosticCommentKinds.inc" -#include "clang/Basic/DiagnosticCrossTUKinds.inc" -#include "clang/Basic/DiagnosticSemaKinds.inc" -#include "clang/Basic/DiagnosticAnalysisKinds.inc" -#include "clang/Basic/DiagnosticRefactoringKinds.inc" -#include "clang/Basic/DiagnosticInstallAPIKinds.inc" -// clang-format on +#include "clang/Basic/AllDiagnosticKinds.inc" #undef DIAG }; diff --git a/clang/lib/Basic/TargetInfo.cpp b/clang/lib/Basic/TargetInfo.cpp index 86befb1cbc74f..c0bf4e686cf03 100644 --- a/clang/lib/Basic/TargetInfo.cpp +++ b/clang/lib/Basic/TargetInfo.cpp @@ -425,6 +425,7 @@ void TargetInfo::adjust(DiagnosticsEngine &Diags, LangOptions &Opts) { // HLSL explicitly defines the sizes and formats of some data types, and we // need to conform to those regardless of what architecture you are targeting. 
if (Opts.HLSL) { + BoolWidth = BoolAlign = 32; LongWidth = LongAlign = 64; if (!Opts.NativeHalfType) { HalfFormat = &llvm::APFloat::IEEEsingle(); diff --git a/clang/lib/Basic/Targets/AArch64.cpp b/clang/lib/Basic/Targets/AArch64.cpp index 4e211deb9faba..0b899137bbb5c 100644 --- a/clang/lib/Basic/Targets/AArch64.cpp +++ b/clang/lib/Basic/Targets/AArch64.cpp @@ -421,7 +421,7 @@ void AArch64TargetInfo::getTargetDefines(const LangOptions &Opts, #define ARM_ACLE_VERSION(Y, Q, P) (100 * (Y) + 10 * (Q) + (P)) Builder.defineMacro("__ARM_ACLE", Twine(ARM_ACLE_VERSION(2024, 2, 0))); Builder.defineMacro("__FUNCTION_MULTI_VERSIONING_SUPPORT_LEVEL", - Twine(ARM_ACLE_VERSION(2024, 2, 0))); + Twine(ARM_ACLE_VERSION(2024, 3, 0))); #undef ARM_ACLE_VERSION Builder.defineMacro("__ARM_ARCH", std::to_string(ArchInfo->Version.getMajor())); diff --git a/clang/lib/CodeGen/CGBuiltin.cpp b/clang/lib/CodeGen/CGBuiltin.cpp index 1b25d365932c3..b80833fd91884 100644 --- a/clang/lib/CodeGen/CGBuiltin.cpp +++ b/clang/lib/CodeGen/CGBuiltin.cpp @@ -19186,6 +19186,23 @@ static Intrinsic::ID getFirstBitHighIntrinsic(CGHLSLRuntime &RT, QualType QT) { return RT.getFirstBitUHighIntrinsic(); } +// Return wave active sum that corresponds to the QT scalar type +static Intrinsic::ID getWaveActiveSumIntrinsic(llvm::Triple::ArchType Arch, + CGHLSLRuntime &RT, QualType QT) { + switch (Arch) { + case llvm::Triple::spirv: + return llvm::Intrinsic::spv_wave_reduce_sum; + case llvm::Triple::dxil: { + if (QT->isUnsignedIntegerType()) + return llvm::Intrinsic::dx_wave_reduce_usum; + return llvm::Intrinsic::dx_wave_reduce_sum; + } + default: + llvm_unreachable("Intrinsic WaveActiveSum" + " not supported by target architecture"); + } +} + Value *CodeGenFunction::EmitHLSLBuiltinExpr(unsigned BuiltinID, const CallExpr *E, ReturnValueSlot ReturnValue) { @@ -19316,7 +19333,6 @@ Value *CodeGenFunction::EmitHLSLBuiltinExpr(unsigned BuiltinID, "hlsl.dot4add.u8packed"); } case 
Builtin::BI__builtin_hlsl_elementwise_firstbithigh: { - Value *X = EmitScalarExpr(E->getArg(0)); return Builder.CreateIntrinsic( @@ -19324,6 +19340,14 @@ Value *CodeGenFunction::EmitHLSLBuiltinExpr(unsigned BuiltinID, getFirstBitHighIntrinsic(CGM.getHLSLRuntime(), E->getArg(0)->getType()), ArrayRef{X}, nullptr, "hlsl.firstbithigh"); } + case Builtin::BI__builtin_hlsl_elementwise_firstbitlow: { + Value *X = EmitScalarExpr(E->getArg(0)); + + return Builder.CreateIntrinsic( + /*ReturnType=*/ConvertType(E->getType()), + CGM.getHLSLRuntime().getFirstBitLowIntrinsic(), ArrayRef{X}, + nullptr, "hlsl.firstbitlow"); + } case Builtin::BI__builtin_hlsl_lerp: { Value *X = EmitScalarExpr(E->getArg(0)); Value *Y = EmitScalarExpr(E->getArg(1)); @@ -19491,6 +19515,23 @@ case Builtin::BI__builtin_hlsl_elementwise_isinf: { Intrinsic::getOrInsertDeclaration(&CGM.getModule(), ID), ArrayRef{OpExpr}); } + case Builtin::BI__builtin_hlsl_wave_active_sum: { + // Due to the use of variadic arguments, explicitly retreive argument + Value *OpExpr = EmitScalarExpr(E->getArg(0)); + llvm::FunctionType *FT = llvm::FunctionType::get( + OpExpr->getType(), ArrayRef{OpExpr->getType()}, false); + Intrinsic::ID IID = getWaveActiveSumIntrinsic( + getTarget().getTriple().getArch(), CGM.getHLSLRuntime(), + E->getArg(0)->getType()); + + // Get overloaded name + std::string Name = + Intrinsic::getName(IID, ArrayRef{OpExpr->getType()}, &CGM.getModule()); + return EmitRuntimeCall(CGM.CreateRuntimeFunction(FT, Name, {}, + /*Local=*/false, + /*AssumeConvergent=*/true), + ArrayRef{OpExpr}, "hlsl.wave.active.sum"); + } case Builtin::BI__builtin_hlsl_wave_get_lane_index: { // We don't define a SPIR-V intrinsic, instead it is a SPIR-V built-in // defined in SPIRVBuiltins.td. 
So instead we manually get the matching name @@ -20487,6 +20528,16 @@ Value *CodeGenFunction::EmitSPIRVBuiltinExpr(unsigned BuiltinID, /*ReturnType=*/X->getType()->getScalarType(), Intrinsic::spv_distance, ArrayRef{X, Y}, nullptr, "spv.distance"); } + case SPIRV::BI__builtin_spirv_length: { + Value *X = EmitScalarExpr(E->getArg(0)); + assert(E->getArg(0)->getType()->hasFloatingRepresentation() && + "length operand must have a float representation"); + assert(E->getArg(0)->getType()->isVectorType() && + "length operand must be a vector"); + return Builder.CreateIntrinsic( + /*ReturnType=*/X->getType()->getScalarType(), Intrinsic::spv_length, + ArrayRef{X}, nullptr, "spv.length"); + } } return nullptr; } diff --git a/clang/lib/CodeGen/CGDebugInfo.cpp b/clang/lib/CodeGen/CGDebugInfo.cpp index f9cba414dcfe2..6cbcaf0384410 100644 --- a/clang/lib/CodeGen/CGDebugInfo.cpp +++ b/clang/lib/CodeGen/CGDebugInfo.cpp @@ -1907,8 +1907,8 @@ void CGDebugInfo::CollectRecordNestedType( if (isa(Ty)) return; SourceLocation Loc = TD->getLocation(); - llvm::DIType *nestedType = getOrCreateType(Ty, getOrCreateFile(Loc)); - elements.push_back(nestedType); + if (llvm::DIType *nestedType = getOrCreateType(Ty, getOrCreateFile(Loc))) + elements.push_back(nestedType); } void CGDebugInfo::CollectRecordFields( @@ -2016,13 +2016,15 @@ llvm::DISubroutineType *CGDebugInfo::getOrCreateInstanceMethodType( // First element is always return type. For 'void' functions it is NULL. Elts.push_back(Args[0]); - // "this" pointer is always first argument. - // ThisPtr may be null if the member function has an explicit 'this' - // parameter. - if (!ThisPtr.isNull()) { + const bool HasExplicitObjectParameter = ThisPtr.isNull(); + + // "this" pointer is always first argument. For explicit "this" + // parameters, it will already be in Args[1]. 
+ if (!HasExplicitObjectParameter) { llvm::DIType *ThisPtrType = getOrCreateType(ThisPtr, Unit); TypeCache[ThisPtr.getAsOpaquePtr()].reset(ThisPtrType); - ThisPtrType = DBuilder.createObjectPointerType(ThisPtrType); + ThisPtrType = + DBuilder.createObjectPointerType(ThisPtrType, /*Implicit=*/true); Elts.push_back(ThisPtrType); } @@ -2030,6 +2032,13 @@ llvm::DISubroutineType *CGDebugInfo::getOrCreateInstanceMethodType( for (unsigned i = 1, e = Args.size(); i != e; ++i) Elts.push_back(Args[i]); + // Attach FlagObjectPointer to the explicit "this" parameter. + if (HasExplicitObjectParameter) { + assert(Elts.size() >= 2 && Args.size() >= 2 && + "Expected at least return type and object parameter."); + Elts[1] = DBuilder.createObjectPointerType(Args[1], /*Implicit=*/false); + } + llvm::DITypeRefArray EltTypeArray = DBuilder.getOrCreateTypeArray(Elts); return DBuilder.createSubroutineType(EltTypeArray, OriginalFunc->getFlags(), @@ -5118,7 +5127,7 @@ llvm::DIType *CGDebugInfo::CreateSelfType(const QualType &QualTy, llvm::DIType *CachedTy = getTypeOrNull(QualTy); if (CachedTy) Ty = CachedTy; - return DBuilder.createObjectPointerType(Ty); + return DBuilder.createObjectPointerType(Ty, /*Implicit=*/true); } void CGDebugInfo::EmitDeclareOfBlockDeclRefVariable( diff --git a/clang/lib/CodeGen/CGExpr.cpp b/clang/lib/CodeGen/CGExpr.cpp index 6e5a21c8f01e7..9a9a8c7f6eae0 100644 --- a/clang/lib/CodeGen/CGExpr.cpp +++ b/clang/lib/CodeGen/CGExpr.cpp @@ -3607,18 +3607,15 @@ void CodeGenFunction::EmitCheck( llvm::Value *RecoverableCond = nullptr; llvm::Value *TrapCond = nullptr; bool NoMerge = false; - for (int i = 0, n = Checked.size(); i < n; ++i) { - llvm::Value *Check = Checked[i].first; + for (auto &[Check, Ord] : Checked) { // -fsanitize-trap= overrides -fsanitize-recover=. - llvm::Value *&Cond = - CGM.getCodeGenOpts().SanitizeTrap.has(Checked[i].second) - ? TrapCond - : CGM.getCodeGenOpts().SanitizeRecover.has(Checked[i].second) - ? 
RecoverableCond - : FatalCond; + llvm::Value *&Cond = CGM.getCodeGenOpts().SanitizeTrap.has(Ord) ? TrapCond + : CGM.getCodeGenOpts().SanitizeRecover.has(Ord) + ? RecoverableCond + : FatalCond; Cond = Cond ? Builder.CreateAnd(Cond, Check) : Check; - if (!CGM.getCodeGenOpts().SanitizeMergeHandlers.has(Checked[i].second)) + if (!CGM.getCodeGenOpts().SanitizeMergeHandlers.has(Ord)) NoMerge = true; } diff --git a/clang/lib/CodeGen/CGHLSLRuntime.h b/clang/lib/CodeGen/CGHLSLRuntime.h index 00e110e8e6fa2..f9dc7b87af0e3 100644 --- a/clang/lib/CodeGen/CGHLSLRuntime.h +++ b/clang/lib/CodeGen/CGHLSLRuntime.h @@ -99,6 +99,7 @@ class CGHLSLRuntime { GENERATE_HLSL_INTRINSIC_FUNCTION(WaveReadLaneAt, wave_readlane) GENERATE_HLSL_INTRINSIC_FUNCTION(FirstBitUHigh, firstbituhigh) GENERATE_HLSL_INTRINSIC_FUNCTION(FirstBitSHigh, firstbitshigh) + GENERATE_HLSL_INTRINSIC_FUNCTION(FirstBitLow, firstbitlow) GENERATE_HLSL_INTRINSIC_FUNCTION(NClamp, nclamp) GENERATE_HLSL_INTRINSIC_FUNCTION(SClamp, sclamp) GENERATE_HLSL_INTRINSIC_FUNCTION(UClamp, uclamp) diff --git a/clang/lib/CodeGen/ConstantInitBuilder.cpp b/clang/lib/CodeGen/ConstantInitBuilder.cpp index ddbf3ef743370..ce1fe137c1919 100644 --- a/clang/lib/CodeGen/ConstantInitBuilder.cpp +++ b/clang/lib/CodeGen/ConstantInitBuilder.cpp @@ -29,7 +29,7 @@ llvm::Type *ConstantInitFuture::getType() const { void ConstantInitFuture::abandon() { assert(Data && "abandoning null future"); - if (auto builder = Data.dyn_cast()) { + if (auto *builder = dyn_cast(Data)) { builder->abandon(0); } Data = nullptr; diff --git a/clang/lib/CodeGen/ObjectFilePCHContainerWriter.cpp b/clang/lib/CodeGen/ObjectFilePCHContainerWriter.cpp index 5447b98d7105e..02635ce235a12 100644 --- a/clang/lib/CodeGen/ObjectFilePCHContainerWriter.cpp +++ b/clang/lib/CodeGen/ObjectFilePCHContainerWriter.cpp @@ -81,6 +81,9 @@ class PCHContainerGenerator : public ASTConsumer { if (!TD->isCompleteDefinition()) return true; + if (D->hasAttr()) + return true; + QualType QualTy = 
Ctx.getTypeDeclType(D); if (!QualTy.isNull() && CanRepresent(QualTy.getTypePtr())) DI.getOrCreateStandaloneType(QualTy, D->getLocation()); diff --git a/clang/lib/Driver/Driver.cpp b/clang/lib/Driver/Driver.cpp index 7767c81d654dc..87855fdb79971 100644 --- a/clang/lib/Driver/Driver.cpp +++ b/clang/lib/Driver/Driver.cpp @@ -308,7 +308,8 @@ void Driver::setDriverMode(StringRef Value) { } InputArgList Driver::ParseArgStrings(ArrayRef ArgStrings, - bool UseDriverMode, bool &ContainsError) { + bool UseDriverMode, + bool &ContainsError) const { llvm::PrettyStackTraceString CrashInfo("Command line argument parsing"); ContainsError = false; @@ -1674,13 +1675,31 @@ Compilation *Driver::BuildCompilation(ArrayRef ArgList) { std::unique_ptr UArgs = std::make_unique(std::move(Args)); + // Owned by the host. + const ToolChain &TC = + getToolChain(*UArgs, computeTargetTriple(*this, TargetTriple, *UArgs)); + + { + SmallVector MultilibMacroDefinesStr = + TC.getMultilibMacroDefinesStr(*UArgs); + SmallVector MLMacroDefinesChar( + llvm::map_range(MultilibMacroDefinesStr, [&UArgs](const auto &S) { + return UArgs->MakeArgString(Twine("-D") + Twine(S)); + })); + bool MLContainsError; + auto MultilibMacroDefineList = + std::make_unique(ParseArgStrings( + MLMacroDefinesChar, /*UseDriverMode=*/false, MLContainsError)); + if (!MLContainsError) { + for (auto *Opt : *MultilibMacroDefineList) { + appendOneArg(*UArgs, Opt); + } + } + } + // Perform the default argument translations. DerivedArgList *TranslatedArgs = TranslateInputArgs(*UArgs); - // Owned by the host. - const ToolChain &TC = getToolChain( - *UArgs, computeTargetTriple(*this, TargetTriple, *UArgs)); - // Check if the environment version is valid except wasm case. 
llvm::Triple Triple = TC.getTriple(); if (!Triple.isWasm()) { diff --git a/clang/lib/Driver/Multilib.cpp b/clang/lib/Driver/Multilib.cpp index ccf747e90cb2c..efb99d3ffc752 100644 --- a/clang/lib/Driver/Multilib.cpp +++ b/clang/lib/Driver/Multilib.cpp @@ -92,12 +92,145 @@ MultilibSet &MultilibSet::FilterOut(FilterCallback F) { void MultilibSet::push_back(const Multilib &M) { Multilibs.push_back(M); } -bool MultilibSet::select(const Driver &D, const Multilib::flags_list &Flags, - llvm::SmallVectorImpl &Selected) const { - llvm::StringSet<> FlagSet(expandFlags(Flags)); +static void DiagnoseUnclaimedMultilibCustomFlags( + const Driver &D, const SmallVector &UnclaimedCustomFlagValues, + const SmallVector &CustomFlagDecls) { + struct EditDistanceInfo { + StringRef FlagValue; + unsigned EditDistance; + }; + const unsigned MaxEditDistance = 5; + + for (StringRef Unclaimed : UnclaimedCustomFlagValues) { + std::optional BestCandidate; + for (const auto &Decl : CustomFlagDecls) { + for (const auto &Value : Decl.ValueList) { + const std::string &FlagValueName = Value.Name; + unsigned EditDistance = + Unclaimed.edit_distance(FlagValueName, /*AllowReplacements=*/true, + /*MaxEditDistance=*/MaxEditDistance); + if (!BestCandidate || (EditDistance <= MaxEditDistance && + EditDistance < BestCandidate->EditDistance)) { + BestCandidate = {FlagValueName, EditDistance}; + } + } + } + if (!BestCandidate) + D.Diag(clang::diag::err_drv_unsupported_opt) + << (custom_flag::Prefix + Unclaimed).str(); + else + D.Diag(clang::diag::err_drv_unsupported_opt_with_suggestion) + << (custom_flag::Prefix + Unclaimed).str() + << (custom_flag::Prefix + BestCandidate->FlagValue).str(); + } +} + +namespace clang::driver::custom_flag { +// Map implemented using linear searches as the expected size is too small for +// the overhead of a search tree or a hash table. 
+class ValueNameToDetailMap { + SmallVector> Mapping; + +public: + template + ValueNameToDetailMap(It FlagDeclsBegin, It FlagDeclsEnd) { + for (auto DeclIt = FlagDeclsBegin; DeclIt != FlagDeclsEnd; ++DeclIt) { + const Declaration &Decl = *DeclIt; + for (const auto &Value : Decl.ValueList) + Mapping.emplace_back(Value.Name, &Value); + } + } + + const ValueDetail *get(StringRef Key) const { + auto Iter = llvm::find_if( + Mapping, [&](const auto &Pair) { return Pair.first == Key; }); + return Iter != Mapping.end() ? Iter->second : nullptr; + } +}; +} // namespace clang::driver::custom_flag + +std::pair> +MultilibSet::processCustomFlags(const Driver &D, + const Multilib::flags_list &Flags) const { + Multilib::flags_list Result; + SmallVector MacroDefines; + + // Custom flag values detected in the flags list + SmallVector ClaimedCustomFlagValues; + + // Arguments to -fmultilib-flag= that don't correspond to any valid + // custom flag value. An error will be printed out for each of these. + SmallVector UnclaimedCustomFlagValueStrs; + + const auto ValueNameToValueDetail = custom_flag::ValueNameToDetailMap( + CustomFlagDecls.begin(), CustomFlagDecls.end()); + + for (StringRef Flag : Flags) { + if (!Flag.starts_with(custom_flag::Prefix)) { + Result.push_back(Flag.str()); + continue; + } + + StringRef CustomFlagValueStr = Flag.substr(custom_flag::Prefix.size()); + const custom_flag::ValueDetail *Detail = + ValueNameToValueDetail.get(CustomFlagValueStr); + if (Detail) + ClaimedCustomFlagValues.push_back(Detail); + else + UnclaimedCustomFlagValueStrs.push_back(CustomFlagValueStr); + } + + // Set of custom flag declarations for which a value was passed in the flags + // list. This is used to, firstly, detect multiple values for the same flag + // declaration (in this case, the last one wins), and secondly, to detect + // which declarations had no value passed in (in this case, the default value + // is selected). 
+ llvm::SmallPtrSet TriggeredCustomFlagDecls; + + // Detect multiple values for the same flag declaration. Last one wins. + for (auto *CustomFlagValue : llvm::reverse(ClaimedCustomFlagValues)) { + if (!TriggeredCustomFlagDecls.insert(CustomFlagValue->Decl).second) + continue; + Result.push_back(std::string(custom_flag::Prefix) + CustomFlagValue->Name); + if (CustomFlagValue->MacroDefines) + MacroDefines.append(CustomFlagValue->MacroDefines->begin(), + CustomFlagValue->MacroDefines->end()); + } + + // Detect flag declarations with no value passed in. Select default value. + for (const auto &Decl : CustomFlagDecls) { + if (TriggeredCustomFlagDecls.contains(&Decl)) + continue; + const custom_flag::ValueDetail &CustomFlagValue = + Decl.ValueList[*Decl.DefaultValueIdx]; + Result.push_back(std::string(custom_flag::Prefix) + CustomFlagValue.Name); + if (CustomFlagValue.MacroDefines) + MacroDefines.append(CustomFlagValue.MacroDefines->begin(), + CustomFlagValue.MacroDefines->end()); + } + + DiagnoseUnclaimedMultilibCustomFlags(D, UnclaimedCustomFlagValueStrs, + CustomFlagDecls); + + return {Result, MacroDefines}; +} + +bool MultilibSet::select( + const Driver &D, const Multilib::flags_list &Flags, + llvm::SmallVectorImpl &Selected, + llvm::SmallVector *CustomFlagMacroDefines) const { + auto [FlagsWithCustom, CFMacroDefines] = processCustomFlags(D, Flags); + llvm::StringSet<> FlagSet(expandFlags(FlagsWithCustom)); Selected.clear(); bool AnyErrors = false; + // Determining the list of macro defines depends only on the custom flags + // passed in. The library variants actually selected are not relevant in + // this. Therefore this assignment can take place before the selection + // happens. + if (CustomFlagMacroDefines) + *CustomFlagMacroDefines = std::move(CFMacroDefines); + // Decide which multilibs we're going to select at all. 
llvm::DenseSet ExclusiveGroupsSelected; for (const Multilib &M : llvm::reverse(Multilibs)) { diff --git a/clang/lib/Driver/ToolChains/Arch/ARM.cpp b/clang/lib/Driver/ToolChains/Arch/ARM.cpp index b8181ce6dc012..c648fb66085c7 100644 --- a/clang/lib/Driver/ToolChains/Arch/ARM.cpp +++ b/clang/lib/Driver/ToolChains/Arch/ARM.cpp @@ -647,7 +647,7 @@ llvm::ARM::FPUKind arm::getARMTargetFeatures(const Driver &D, (void)getARMFPUFeatures(D, WaFPU->first, Args, WaFPU->second, Features); } else if (FPUArg) { FPUKind = getARMFPUFeatures(D, FPUArg, Args, FPUArg->getValue(), Features); - } else if (Triple.isAndroid() && getARMSubArchVersionNumber(Triple) >= 7) { + } else if (Triple.isAndroid() && getARMSubArchVersionNumber(Triple) == 7) { const char *AndroidFPU = "neon"; FPUKind = llvm::ARM::parseFPU(AndroidFPU); if (!llvm::ARM::getFPUFeatures(FPUKind, Features)) @@ -659,13 +659,21 @@ llvm::ARM::FPUKind arm::getARMTargetFeatures(const Driver &D, CPUArgFPUKind != llvm::ARM::FK_INVALID ? CPUArgFPUKind : ArchArgFPUKind; (void)llvm::ARM::getFPUFeatures(FPUKind, Features); } else { + bool Generic = true; if (!ForAS) { std::string CPU = arm::getARMTargetCPU(CPUName, ArchName, Triple); + if (CPU != "generic") + Generic = false; llvm::ARM::ArchKind ArchKind = arm::getLLVMArchKindForARM(CPU, ArchName, Triple); FPUKind = llvm::ARM::getDefaultFPU(CPU, ArchKind); (void)llvm::ARM::getFPUFeatures(FPUKind, Features); } + if (Generic && (Triple.isOSWindows() || Triple.isOSDarwin()) && + getARMSubArchVersionNumber(Triple) >= 7) { + FPUKind = llvm::ARM::parseFPU("neon"); + (void)llvm::ARM::getFPUFeatures(FPUKind, Features); + } } // Now we've finished accumulating features from arch, cpu and fpu, diff --git a/clang/lib/Driver/ToolChains/BareMetal.cpp b/clang/lib/Driver/ToolChains/BareMetal.cpp index eecaaa9a42930..ffb1c6e34d603 100644 --- a/clang/lib/Driver/ToolChains/BareMetal.cpp +++ b/clang/lib/Driver/ToolChains/BareMetal.cpp @@ -162,9 +162,11 @@ static bool isPPCBareMetal(const llvm::Triple 
&Triple) { Triple.getEnvironment() == llvm::Triple::EABI; } -static void findMultilibsFromYAML(const ToolChain &TC, const Driver &D, - StringRef MultilibPath, const ArgList &Args, - DetectedMultilibs &Result) { +static void +findMultilibsFromYAML(const ToolChain &TC, const Driver &D, + StringRef MultilibPath, const ArgList &Args, + DetectedMultilibs &Result, + SmallVector &CustomFlagsMacroDefines) { llvm::ErrorOr> MB = D.getVFS().getBufferForFile(MultilibPath); if (!MB) @@ -175,7 +177,8 @@ static void findMultilibsFromYAML(const ToolChain &TC, const Driver &D, if (ErrorOrMultilibSet.getError()) return; Result.Multilibs = ErrorOrMultilibSet.get(); - if (Result.Multilibs.select(D, Flags, Result.SelectedMultilibs)) + if (Result.Multilibs.select(D, Flags, Result.SelectedMultilibs, + &CustomFlagsMacroDefines)) return; D.Diag(clang::diag::warn_drv_missing_multilib) << llvm::join(Flags, " "); std::stringstream ss; @@ -234,9 +237,13 @@ void BareMetal::findMultilibs(const Driver &D, const llvm::Triple &Triple, // If multilib.yaml is found, update sysroot so it doesn't use a target // specific suffix SysRoot = computeBaseSysRoot(D, /*IncludeTriple=*/false); - findMultilibsFromYAML(*this, D, *MultilibPath, Args, Result); + SmallVector CustomFlagMacroDefines; + findMultilibsFromYAML(*this, D, *MultilibPath, Args, Result, + CustomFlagMacroDefines); SelectedMultilibs = Result.SelectedMultilibs; Multilibs = Result.Multilibs; + MultilibMacroDefines.append(CustomFlagMacroDefines.begin(), + CustomFlagMacroDefines.end()); } else if (isRISCVBareMetal(Triple)) { if (findRISCVMultilibs(D, Triple, Args, Result)) { SelectedMultilibs = Result.SelectedMultilibs; @@ -551,3 +558,8 @@ SanitizerMask BareMetal::getSupportedSanitizers() const { } return Res; } + +SmallVector +BareMetal::getMultilibMacroDefinesStr(llvm::opt::ArgList &Args) const { + return MultilibMacroDefines; +} diff --git a/clang/lib/Driver/ToolChains/BareMetal.h b/clang/lib/Driver/ToolChains/BareMetal.h index 
483b5efab5e6e..f6295bda0a6a2 100644 --- a/clang/lib/Driver/ToolChains/BareMetal.h +++ b/clang/lib/Driver/ToolChains/BareMetal.h @@ -70,12 +70,17 @@ class LLVM_LIBRARY_VISIBILITY BareMetal : public ToolChain { std::string computeSysRoot() const override; SanitizerMask getSupportedSanitizers() const override; + SmallVector + getMultilibMacroDefinesStr(llvm::opt::ArgList &Args) const override; + private: using OrderedMultilibs = llvm::iterator_range::const_reverse_iterator>; OrderedMultilibs getOrderedMultilibs() const; std::string SysRoot; + + SmallVector MultilibMacroDefines; }; } // namespace toolchains diff --git a/clang/lib/Driver/ToolChains/Darwin.cpp b/clang/lib/Driver/ToolChains/Darwin.cpp index e5dffb11d1a5e..84ef8199de049 100644 --- a/clang/lib/Driver/ToolChains/Darwin.cpp +++ b/clang/lib/Driver/ToolChains/Darwin.cpp @@ -910,7 +910,9 @@ void darwin::Lipo::ConstructJob(Compilation &C, const JobAction &JA, CmdArgs.push_back(II.getFilename()); } - const char *Exec = Args.MakeArgString(getToolChain().GetProgramPath("lipo")); + StringRef LipoName = Args.getLastArgValue(options::OPT_fuse_lipo_EQ, "lipo"); + const char *Exec = + Args.MakeArgString(getToolChain().GetProgramPath(LipoName.data())); C.addCommand(std::make_unique(JA, *this, ResponseFileSupport::None(), Exec, CmdArgs, Inputs, Output)); } diff --git a/clang/lib/Driver/ToolChains/Flang.cpp b/clang/lib/Driver/ToolChains/Flang.cpp index a7d0cc99f27d2..86ed25badfa2b 100644 --- a/clang/lib/Driver/ToolChains/Flang.cpp +++ b/clang/lib/Driver/ToolChains/Flang.cpp @@ -155,7 +155,8 @@ void Flang::addCodegenOptions(const ArgList &Args, options::OPT_flang_deprecated_no_hlfir, options::OPT_fno_ppc_native_vec_elem_order, options::OPT_fppc_native_vec_elem_order, - options::OPT_ftime_report, options::OPT_ftime_report_EQ}); + options::OPT_ftime_report, options::OPT_ftime_report_EQ, + options::OPT_funroll_loops, options::OPT_fno_unroll_loops}); } void Flang::addPicOptions(const ArgList &Args, ArgStringList &CmdArgs) const 
{ diff --git a/clang/lib/Driver/ToolChains/HIPAMD.cpp b/clang/lib/Driver/ToolChains/HIPAMD.cpp index 42c48f6c9b774..ccee065b59064 100644 --- a/clang/lib/Driver/ToolChains/HIPAMD.cpp +++ b/clang/lib/Driver/ToolChains/HIPAMD.cpp @@ -176,7 +176,6 @@ void AMDGCN::Linker::constructLinkAndEmitSpirvCommand( llvm::opt::ArgStringList TrArgs{ "--spirv-max-version=1.6", "--spirv-ext=+all", - "--spirv-allow-extra-diexpressions", "--spirv-allow-unknown-intrinsics", "--spirv-lower-const-expr", "--spirv-preserve-auxdata", diff --git a/clang/lib/Format/ContinuationIndenter.cpp b/clang/lib/Format/ContinuationIndenter.cpp index 554b55fa75c92..c311deaa17bb0 100644 --- a/clang/lib/Format/ContinuationIndenter.cpp +++ b/clang/lib/Format/ContinuationIndenter.cpp @@ -148,6 +148,7 @@ static bool startsNextOperand(const FormatToken &Current) { static bool mustBreakBinaryOperation(const FormatToken &Current, const FormatStyle &Style) { return Style.BreakBinaryOperations != FormatStyle::BBO_Never && + Current.CanBreakBefore && (Style.BreakBeforeBinaryOperators == FormatStyle::BOS_None ? startsNextOperand : isAlignableBinaryOperator)(Current); diff --git a/clang/lib/Format/UnwrappedLineParser.cpp b/clang/lib/Format/UnwrappedLineParser.cpp index 317717241c17c..198c05fd9dcd8 100644 --- a/clang/lib/Format/UnwrappedLineParser.cpp +++ b/clang/lib/Format/UnwrappedLineParser.cpp @@ -503,14 +503,14 @@ void UnwrappedLineParser::calculateBraceTypes(bool ExpectClassBody) { auto *NextTok = Tokens->getNextNonComment(); if (!Line->InMacroBody && !Style.isTableGen()) { - // Skip PPDirective lines and comments. + // Skip PPDirective lines (except macro definitions) and comments. 
while (NextTok->is(tok::hash)) { NextTok = Tokens->getNextToken(); - if (NextTok->is(tok::pp_not_keyword)) + if (NextTok->isOneOf(tok::pp_not_keyword, tok::pp_define)) break; do { NextTok = Tokens->getNextToken(); - } while (!NextTok->HasUnescapedNewline && NextTok->isNot(tok::eof)); + } while (NextTok->NewlinesBefore == 0 && NextTok->isNot(tok::eof)); while (NextTok->is(tok::comment)) NextTok = Tokens->getNextToken(); diff --git a/clang/lib/Headers/amxavx512intrin.h b/clang/lib/Headers/amxavx512intrin.h index e4d77e503015a..a158983482d5b 100644 --- a/clang/lib/Headers/amxavx512intrin.h +++ b/clang/lib/Headers/amxavx512intrin.h @@ -60,7 +60,7 @@ /// \headerfile /// /// \code -/// __m512i _tile_cvtrowps2pbf16h(__tile tsrc, unsigned int row); +/// __m512i _tile_cvtrowps2bf16h(__tile tsrc, unsigned int row); /// \endcode /// /// \code{.operation} @@ -80,14 +80,14 @@ /// zero_tileconfig_start() /// \endcode /// -/// This intrinsic corresponds to the \c TCVTROWPS2PBF16H instruction. +/// This intrinsic corresponds to the \c TCVTROWPS2BF16H instruction. /// /// \param tsrc /// The source tile. Max size is 1024 Bytes. /// \param row /// The the row of the source tile. -#define _tile_cvtrowps2pbf16h(tsrc, row) \ - __builtin_ia32_tcvtrowps2pbf16h(tsrc, row) +#define _tile_cvtrowps2bf16h(tsrc, row) \ + __builtin_ia32_tcvtrowps2bf16h(tsrc, row) /// Moves a row from a tile register to a zmm destination register, converting /// the fp32 source elements to bf16. It places the resulting bf16 elements @@ -97,7 +97,7 @@ /// \headerfile /// /// \code -/// __m512i _tile_cvtrowps2pbf16l(__tile tsrc, unsigned int row); +/// __m512i _tile_cvtrowps2bf16l(__tile tsrc, unsigned int row); /// \endcode /// /// \code{.operation} @@ -117,14 +117,14 @@ /// zero_tileconfig_start() /// \endcode /// -/// This intrinsic corresponds to the \c TCVTROWPS2PBF16L instruction. +/// This intrinsic corresponds to the \c TCVTROWPS2BF16L instruction. /// /// \param tsrc /// The source tile. 
Max size is 1024 Bytes. /// \param row /// The the row of the source tile. -#define _tile_cvtrowps2pbf16l(tsrc, row) \ - __builtin_ia32_tcvtrowps2pbf16l(tsrc, row) +#define _tile_cvtrowps2bf16l(tsrc, row) \ + __builtin_ia32_tcvtrowps2bf16l(tsrc, row) /// Moves a row from a tile register to a zmm destination register, converting /// the fp32 source elements to fp16. It places the resulting fp16 elements @@ -238,15 +238,15 @@ static __inline__ __m512 __DEFAULT_FN_ATTRS_AVX512 _tile_cvtrowd2ps_internal( } static __inline__ __m512bh __DEFAULT_FN_ATTRS_AVX512 -_tile_cvtrowps2pbf16h_internal(unsigned short m, unsigned short n, - _tile1024i src, unsigned u) { - return __builtin_ia32_tcvtrowps2pbf16h_internal(m, n, src, u); +_tile_cvtrowps2bf16h_internal(unsigned short m, unsigned short n, + _tile1024i src, unsigned u) { + return __builtin_ia32_tcvtrowps2bf16h_internal(m, n, src, u); } static __inline__ __m512bh __DEFAULT_FN_ATTRS_AVX512 -_tile_cvtrowps2pbf16l_internal(unsigned short m, unsigned short n, - _tile1024i src, unsigned u) { - return __builtin_ia32_tcvtrowps2pbf16l_internal(m, n, src, u); +_tile_cvtrowps2bf16l_internal(unsigned short m, unsigned short n, + _tile1024i src, unsigned u) { + return __builtin_ia32_tcvtrowps2bf16l_internal(m, n, src, u); } static __inline__ __m512h __DEFAULT_FN_ATTRS_AVX512 _tile_cvtrowps2phh_internal( @@ -290,7 +290,7 @@ static __m512 __tile_cvtrowd2ps(__tile1024i src0, unsigned src1) { /// /// \headerfile /// -/// This intrinsic corresponds to the TCVTROWPS2PBF16H instruction. +/// This intrinsic corresponds to the TCVTROWPS2BF16H instruction. /// /// \param src0 /// The 1st source tile. Max size is 1024 Bytes. @@ -299,8 +299,8 @@ static __m512 __tile_cvtrowd2ps(__tile1024i src0, unsigned src1) { /// \returns /// The destination v32bf16 data. Size is 64 Bytes. 
__DEFAULT_FN_ATTRS_AVX512 -static __m512bh __tile_cvtrowps2pbf16h(__tile1024i src0, unsigned src1) { - return _tile_cvtrowps2pbf16h_internal(src0.row, src0.col, src0.tile, src1); +static __m512bh __tile_cvtrowps2bf16h(__tile1024i src0, unsigned src1) { + return _tile_cvtrowps2bf16h_internal(src0.row, src0.col, src0.tile, src1); } /// Move a row from a tile (src0) to a v32bf16 dst, converting the fp32 source @@ -309,7 +309,7 @@ static __m512bh __tile_cvtrowps2pbf16h(__tile1024i src0, unsigned src1) { /// /// \headerfile /// -/// This intrinsic corresponds to the TCVTROWPS2PBF16L instruction. +/// This intrinsic corresponds to the TCVTROWPS2BF16L instruction. /// /// \param src0 /// The 1st source tile. Max size is 1024 Bytes. @@ -318,8 +318,8 @@ static __m512bh __tile_cvtrowps2pbf16h(__tile1024i src0, unsigned src1) { /// \returns /// The destination v32bf16 data. Size is 64 Bytes. __DEFAULT_FN_ATTRS_AVX512 -static __m512bh __tile_cvtrowps2pbf16l(__tile1024i src0, unsigned src1) { - return _tile_cvtrowps2pbf16l_internal(src0.row, src0.col, src0.tile, src1); +static __m512bh __tile_cvtrowps2bf16l(__tile1024i src0, unsigned src1) { + return _tile_cvtrowps2bf16l_internal(src0.row, src0.col, src0.tile, src1); } /// Move a row from a tile (src0) to a v32fp16 dst, converting the fp32 source diff --git a/clang/lib/Headers/hlsl/hlsl_detail.h b/clang/lib/Headers/hlsl/hlsl_detail.h index 3eb4a3dc861e3..b2c8cc6c5c3db 100644 --- a/clang/lib/Headers/hlsl/hlsl_detail.h +++ b/clang/lib/Headers/hlsl/hlsl_detail.h @@ -61,7 +61,11 @@ length_impl(T X) { template constexpr enable_if_t::value || is_same::value, T> length_vec_impl(vector X) { +#if (__has_builtin(__builtin_spirv_length)) + return __builtin_spirv_length(X); +#else return __builtin_elementwise_sqrt(__builtin_hlsl_dot(X, X)); +#endif } template @@ -73,11 +77,7 @@ distance_impl(T X, T Y) { template constexpr enable_if_t::value || is_same::value, T> distance_vec_impl(vector X, vector Y) { -#if 
(__has_builtin(__builtin_spirv_distance)) - return __builtin_spirv_distance(X, Y); -#else return length_vec_impl(X - Y); -#endif } } // namespace __detail } // namespace hlsl diff --git a/clang/lib/Headers/hlsl/hlsl_intrinsics.h b/clang/lib/Headers/hlsl/hlsl_intrinsics.h index 378b18e18b71b..d1e4eb08aa764 100644 --- a/clang/lib/Headers/hlsl/hlsl_intrinsics.h +++ b/clang/lib/Headers/hlsl/hlsl_intrinsics.h @@ -1150,6 +1150,78 @@ uint3 firstbithigh(uint64_t3); _HLSL_BUILTIN_ALIAS(__builtin_hlsl_elementwise_firstbithigh) uint4 firstbithigh(uint64_t4); +//===----------------------------------------------------------------------===// +// firstbitlow builtins +//===----------------------------------------------------------------------===// + +/// \fn T firstbitlow(T Val) +/// \brief Returns the location of the first set bit starting from the lowest +/// order bit and working upward, per component. +/// \param Val the input value. + +#ifdef __HLSL_ENABLE_16_BIT +_HLSL_AVAILABILITY(shadermodel, 6.2) +_HLSL_BUILTIN_ALIAS(__builtin_hlsl_elementwise_firstbitlow) +uint firstbitlow(int16_t); +_HLSL_AVAILABILITY(shadermodel, 6.2) +_HLSL_BUILTIN_ALIAS(__builtin_hlsl_elementwise_firstbitlow) +uint2 firstbitlow(int16_t2); +_HLSL_AVAILABILITY(shadermodel, 6.2) +_HLSL_BUILTIN_ALIAS(__builtin_hlsl_elementwise_firstbitlow) +uint3 firstbitlow(int16_t3); +_HLSL_AVAILABILITY(shadermodel, 6.2) +_HLSL_BUILTIN_ALIAS(__builtin_hlsl_elementwise_firstbitlow) +uint4 firstbitlow(int16_t4); +_HLSL_AVAILABILITY(shadermodel, 6.2) +_HLSL_BUILTIN_ALIAS(__builtin_hlsl_elementwise_firstbitlow) +uint firstbitlow(uint16_t); +_HLSL_AVAILABILITY(shadermodel, 6.2) +_HLSL_BUILTIN_ALIAS(__builtin_hlsl_elementwise_firstbitlow) +uint2 firstbitlow(uint16_t2); +_HLSL_AVAILABILITY(shadermodel, 6.2) +_HLSL_BUILTIN_ALIAS(__builtin_hlsl_elementwise_firstbitlow) +uint3 firstbitlow(uint16_t3); +_HLSL_AVAILABILITY(shadermodel, 6.2) +_HLSL_BUILTIN_ALIAS(__builtin_hlsl_elementwise_firstbitlow) +uint4 firstbitlow(uint16_t4); 
+#endif + +_HLSL_BUILTIN_ALIAS(__builtin_hlsl_elementwise_firstbitlow) +uint firstbitlow(int); +_HLSL_BUILTIN_ALIAS(__builtin_hlsl_elementwise_firstbitlow) +uint2 firstbitlow(int2); +_HLSL_BUILTIN_ALIAS(__builtin_hlsl_elementwise_firstbitlow) +uint3 firstbitlow(int3); +_HLSL_BUILTIN_ALIAS(__builtin_hlsl_elementwise_firstbitlow) +uint4 firstbitlow(int4); + +_HLSL_BUILTIN_ALIAS(__builtin_hlsl_elementwise_firstbitlow) +uint firstbitlow(uint); +_HLSL_BUILTIN_ALIAS(__builtin_hlsl_elementwise_firstbitlow) +uint2 firstbitlow(uint2); +_HLSL_BUILTIN_ALIAS(__builtin_hlsl_elementwise_firstbitlow) +uint3 firstbitlow(uint3); +_HLSL_BUILTIN_ALIAS(__builtin_hlsl_elementwise_firstbitlow) +uint4 firstbitlow(uint4); + +_HLSL_BUILTIN_ALIAS(__builtin_hlsl_elementwise_firstbitlow) +uint firstbitlow(int64_t); +_HLSL_BUILTIN_ALIAS(__builtin_hlsl_elementwise_firstbitlow) +uint2 firstbitlow(int64_t2); +_HLSL_BUILTIN_ALIAS(__builtin_hlsl_elementwise_firstbitlow) +uint3 firstbitlow(int64_t3); +_HLSL_BUILTIN_ALIAS(__builtin_hlsl_elementwise_firstbitlow) +uint4 firstbitlow(int64_t4); + +_HLSL_BUILTIN_ALIAS(__builtin_hlsl_elementwise_firstbitlow) +uint firstbitlow(uint64_t); +_HLSL_BUILTIN_ALIAS(__builtin_hlsl_elementwise_firstbitlow) +uint2 firstbitlow(uint64_t2); +_HLSL_BUILTIN_ALIAS(__builtin_hlsl_elementwise_firstbitlow) +uint3 firstbitlow(uint64_t3); +_HLSL_BUILTIN_ALIAS(__builtin_hlsl_elementwise_firstbitlow) +uint4 firstbitlow(uint64_t4); + //===----------------------------------------------------------------------===// // floor builtins //===----------------------------------------------------------------------===// @@ -2396,6 +2468,105 @@ __attribute__((convergent)) double3 WaveReadLaneAt(double3, int32_t); _HLSL_BUILTIN_ALIAS(__builtin_hlsl_wave_read_lane_at) __attribute__((convergent)) double4 WaveReadLaneAt(double4, int32_t); +//===----------------------------------------------------------------------===// +// WaveActiveSum builtins 
+//===----------------------------------------------------------------------===// + +_HLSL_16BIT_AVAILABILITY(shadermodel, 6.0) +_HLSL_BUILTIN_ALIAS(__builtin_hlsl_wave_active_sum) +__attribute__((convergent)) half WaveActiveSum(half); +_HLSL_16BIT_AVAILABILITY(shadermodel, 6.0) +_HLSL_BUILTIN_ALIAS(__builtin_hlsl_wave_active_sum) +__attribute__((convergent)) half2 WaveActiveSum(half2); +_HLSL_16BIT_AVAILABILITY(shadermodel, 6.0) +_HLSL_BUILTIN_ALIAS(__builtin_hlsl_wave_active_sum) +__attribute__((convergent)) half3 WaveActiveSum(half3); +_HLSL_16BIT_AVAILABILITY(shadermodel, 6.0) +_HLSL_BUILTIN_ALIAS(__builtin_hlsl_wave_active_sum) +__attribute__((convergent)) half4 WaveActiveSum(half4); + +#ifdef __HLSL_ENABLE_16_BIT +_HLSL_AVAILABILITY(shadermodel, 6.0) +_HLSL_BUILTIN_ALIAS(__builtin_hlsl_wave_active_sum) +__attribute__((convergent)) int16_t WaveActiveSum(int16_t); +_HLSL_AVAILABILITY(shadermodel, 6.0) +_HLSL_BUILTIN_ALIAS(__builtin_hlsl_wave_active_sum) +__attribute__((convergent)) int16_t2 WaveActiveSum(int16_t2); +_HLSL_AVAILABILITY(shadermodel, 6.0) +_HLSL_BUILTIN_ALIAS(__builtin_hlsl_wave_active_sum) +__attribute__((convergent)) int16_t3 WaveActiveSum(int16_t3); +_HLSL_AVAILABILITY(shadermodel, 6.0) +_HLSL_BUILTIN_ALIAS(__builtin_hlsl_wave_active_sum) +__attribute__((convergent)) int16_t4 WaveActiveSum(int16_t4); + +_HLSL_AVAILABILITY(shadermodel, 6.0) +_HLSL_BUILTIN_ALIAS(__builtin_hlsl_wave_active_sum) +__attribute__((convergent)) uint16_t WaveActiveSum(uint16_t); +_HLSL_AVAILABILITY(shadermodel, 6.0) +_HLSL_BUILTIN_ALIAS(__builtin_hlsl_wave_active_sum) +__attribute__((convergent)) uint16_t2 WaveActiveSum(uint16_t2); +_HLSL_AVAILABILITY(shadermodel, 6.0) +_HLSL_BUILTIN_ALIAS(__builtin_hlsl_wave_active_sum) +__attribute__((convergent)) uint16_t3 WaveActiveSum(uint16_t3); +_HLSL_AVAILABILITY(shadermodel, 6.0) +_HLSL_BUILTIN_ALIAS(__builtin_hlsl_wave_active_sum) +__attribute__((convergent)) uint16_t4 WaveActiveSum(uint16_t4); +#endif + 
+_HLSL_BUILTIN_ALIAS(__builtin_hlsl_wave_active_sum) +__attribute__((convergent)) int WaveActiveSum(int); +_HLSL_BUILTIN_ALIAS(__builtin_hlsl_wave_active_sum) +__attribute__((convergent)) int2 WaveActiveSum(int2); +_HLSL_BUILTIN_ALIAS(__builtin_hlsl_wave_active_sum) +__attribute__((convergent)) int3 WaveActiveSum(int3); +_HLSL_BUILTIN_ALIAS(__builtin_hlsl_wave_active_sum) +__attribute__((convergent)) int4 WaveActiveSum(int4); + +_HLSL_BUILTIN_ALIAS(__builtin_hlsl_wave_active_sum) +__attribute__((convergent)) uint WaveActiveSum(uint); +_HLSL_BUILTIN_ALIAS(__builtin_hlsl_wave_active_sum) +__attribute__((convergent)) uint2 WaveActiveSum(uint2); +_HLSL_BUILTIN_ALIAS(__builtin_hlsl_wave_active_sum) +__attribute__((convergent)) uint3 WaveActiveSum(uint3); +_HLSL_BUILTIN_ALIAS(__builtin_hlsl_wave_active_sum) +__attribute__((convergent)) uint4 WaveActiveSum(uint4); + +_HLSL_BUILTIN_ALIAS(__builtin_hlsl_wave_active_sum) +__attribute__((convergent)) int64_t WaveActiveSum(int64_t); +_HLSL_BUILTIN_ALIAS(__builtin_hlsl_wave_active_sum) +__attribute__((convergent)) int64_t2 WaveActiveSum(int64_t2); +_HLSL_BUILTIN_ALIAS(__builtin_hlsl_wave_active_sum) +__attribute__((convergent)) int64_t3 WaveActiveSum(int64_t3); +_HLSL_BUILTIN_ALIAS(__builtin_hlsl_wave_active_sum) +__attribute__((convergent)) int64_t4 WaveActiveSum(int64_t4); + +_HLSL_BUILTIN_ALIAS(__builtin_hlsl_wave_active_sum) +__attribute__((convergent)) uint64_t WaveActiveSum(uint64_t); +_HLSL_BUILTIN_ALIAS(__builtin_hlsl_wave_active_sum) +__attribute__((convergent)) uint64_t2 WaveActiveSum(uint64_t2); +_HLSL_BUILTIN_ALIAS(__builtin_hlsl_wave_active_sum) +__attribute__((convergent)) uint64_t3 WaveActiveSum(uint64_t3); +_HLSL_BUILTIN_ALIAS(__builtin_hlsl_wave_active_sum) +__attribute__((convergent)) uint64_t4 WaveActiveSum(uint64_t4); + +_HLSL_BUILTIN_ALIAS(__builtin_hlsl_wave_active_sum) +__attribute__((convergent)) float WaveActiveSum(float); +_HLSL_BUILTIN_ALIAS(__builtin_hlsl_wave_active_sum) +__attribute__((convergent)) 
float2 WaveActiveSum(float2); +_HLSL_BUILTIN_ALIAS(__builtin_hlsl_wave_active_sum) +__attribute__((convergent)) float3 WaveActiveSum(float3); +_HLSL_BUILTIN_ALIAS(__builtin_hlsl_wave_active_sum) +__attribute__((convergent)) float4 WaveActiveSum(float4); + +_HLSL_BUILTIN_ALIAS(__builtin_hlsl_wave_active_sum) +__attribute__((convergent)) double WaveActiveSum(double); +_HLSL_BUILTIN_ALIAS(__builtin_hlsl_wave_active_sum) +__attribute__((convergent)) double2 WaveActiveSum(double2); +_HLSL_BUILTIN_ALIAS(__builtin_hlsl_wave_active_sum) +__attribute__((convergent)) double3 WaveActiveSum(double3); +_HLSL_BUILTIN_ALIAS(__builtin_hlsl_wave_active_sum) +__attribute__((convergent)) double4 WaveActiveSum(double4); + //===----------------------------------------------------------------------===// // sign builtins //===----------------------------------------------------------------------===// diff --git a/clang/lib/Interpreter/CodeCompletion.cpp b/clang/lib/Interpreter/CodeCompletion.cpp index 9092d4705ca58..aa90663538128 100644 --- a/clang/lib/Interpreter/CodeCompletion.cpp +++ b/clang/lib/Interpreter/CodeCompletion.cpp @@ -229,7 +229,7 @@ class ExternalSource : public clang::ExternalASTSource { ASTContext &ParentASTCtxt, FileManager &ParentFM); bool FindExternalVisibleDeclsByName(const DeclContext *DC, DeclarationName Name, - Module *NamedModule) override; + const DeclContext *OriginalDC) override; void completeVisibleDeclsMap(const clang::DeclContext *childDeclContext) override; }; @@ -271,9 +271,9 @@ ExternalSource::ExternalSource(ASTContext &ChildASTCtxt, FileManager &ChildFM, Importer.reset(importer); } -bool ExternalSource::FindExternalVisibleDeclsByName(const DeclContext *DC, - DeclarationName Name, - Module *NamedModule) { +bool ExternalSource::FindExternalVisibleDeclsByName( + const DeclContext *DC, DeclarationName Name, + const DeclContext *OriginalDC) { IdentifierTable &ParentIdTable = ParentASTCtxt.Idents; diff --git a/clang/lib/Lex/Lexer.cpp 
b/clang/lib/Lex/Lexer.cpp index 72364500a48f9..115b6c1606a02 100644 --- a/clang/lib/Lex/Lexer.cpp +++ b/clang/lib/Lex/Lexer.cpp @@ -1323,7 +1323,8 @@ const char *Lexer::SkipEscapedNewLines(const char *P) { std::optional Lexer::findNextToken(SourceLocation Loc, const SourceManager &SM, - const LangOptions &LangOpts) { + const LangOptions &LangOpts, + bool IncludeComments) { if (Loc.isMacroID()) { if (!Lexer::isAtEndOfMacroExpansion(Loc, SM, LangOpts, &Loc)) return std::nullopt; @@ -1344,6 +1345,7 @@ std::optional Lexer::findNextToken(SourceLocation Loc, // Lex from the start of the given location. Lexer lexer(SM.getLocForStartOfFile(LocInfo.first), LangOpts, File.begin(), TokenBegin, File.end()); + lexer.SetCommentRetentionState(IncludeComments); // Find the token. Token Tok; lexer.LexFromRawLexer(Tok); diff --git a/clang/lib/Sema/CMakeLists.txt b/clang/lib/Sema/CMakeLists.txt index 3241cb53f004c..19cf3a2db00fd 100644 --- a/clang/lib/Sema/CMakeLists.txt +++ b/clang/lib/Sema/CMakeLists.txt @@ -19,6 +19,7 @@ add_clang_library(clangSema CodeCompleteConsumer.cpp DeclSpec.cpp DelayedDiagnostic.cpp + HeuristicResolver.cpp HLSLExternalSemaSource.cpp IdentifierResolver.cpp JumpDiagnostics.cpp @@ -70,6 +71,7 @@ add_clang_library(clangSema SemaObjC.cpp SemaObjCProperty.cpp SemaOpenACC.cpp + SemaOpenACCClause.cpp SemaOpenCL.cpp SemaOpenMP.cpp SemaOverload.cpp diff --git a/clang-tools-extra/clangd/HeuristicResolver.cpp b/clang/lib/Sema/HeuristicResolver.cpp similarity index 99% rename from clang-tools-extra/clangd/HeuristicResolver.cpp rename to clang/lib/Sema/HeuristicResolver.cpp index 9eb892e8e4a8e..7c1b8450b9633 100644 --- a/clang-tools-extra/clangd/HeuristicResolver.cpp +++ b/clang/lib/Sema/HeuristicResolver.cpp @@ -6,7 +6,7 @@ // //===----------------------------------------------------------------------===// -#include "HeuristicResolver.h" +#include "clang/Sema/HeuristicResolver.h" #include "clang/AST/ASTContext.h" #include "clang/AST/CXXInheritance.h" #include 
"clang/AST/DeclTemplate.h" @@ -14,7 +14,6 @@ #include "clang/AST/Type.h" namespace clang { -namespace clangd { namespace { @@ -466,5 +465,4 @@ const Type *HeuristicResolver::getPointeeType(const Type *T) const { return HeuristicResolverImpl(Ctx).getPointeeType(T); } -} // namespace clangd } // namespace clang diff --git a/clang/lib/Sema/MultiplexExternalSemaSource.cpp b/clang/lib/Sema/MultiplexExternalSemaSource.cpp index c19a0f980c1e9..6d945300c386c 100644 --- a/clang/lib/Sema/MultiplexExternalSemaSource.cpp +++ b/clang/lib/Sema/MultiplexExternalSemaSource.cpp @@ -108,11 +108,12 @@ MultiplexExternalSemaSource::hasExternalDefinitions(const Decl *D) { } bool MultiplexExternalSemaSource::FindExternalVisibleDeclsByName( - const DeclContext *DC, DeclarationName Name, Module *NamedModule) { + const DeclContext *DC, DeclarationName Name, + const DeclContext *OriginalDC) { bool AnyDeclsFound = false; for (size_t i = 0; i < Sources.size(); ++i) AnyDeclsFound |= - Sources[i]->FindExternalVisibleDeclsByName(DC, Name, NamedModule); + Sources[i]->FindExternalVisibleDeclsByName(DC, Name, OriginalDC); return AnyDeclsFound; } diff --git a/clang/lib/Sema/SemaDecl.cpp b/clang/lib/Sema/SemaDecl.cpp index e0dd6039810cb..4b56a4dea05e5 100644 --- a/clang/lib/Sema/SemaDecl.cpp +++ b/clang/lib/Sema/SemaDecl.cpp @@ -20043,10 +20043,6 @@ void Sema::ActOnEnumBody(SourceLocation EnumLoc, SourceRange BraceRange, return; } - unsigned IntWidth = Context.getTargetInfo().getIntWidth(); - unsigned CharWidth = Context.getTargetInfo().getCharWidth(); - unsigned ShortWidth = Context.getTargetInfo().getShortWidth(); - // Verify that all the values are okay, compute the size of the values, and // reverse the list. 
unsigned NumNegativeBits = 0; @@ -20112,73 +20108,12 @@ void Sema::ActOnEnumBody(SourceLocation EnumLoc, SourceRange BraceRange, BestPromotionType = BestType; BestWidth = Context.getIntWidth(BestType); - } - else if (NumNegativeBits) { - // If there is a negative value, figure out the smallest integer type (of - // int/long/longlong) that fits. - // If it's packed, check also if it fits a char or a short. - if (Packed && NumNegativeBits <= CharWidth && NumPositiveBits < CharWidth) { - BestType = Context.SignedCharTy; - BestWidth = CharWidth; - } else if (Packed && NumNegativeBits <= ShortWidth && - NumPositiveBits < ShortWidth) { - BestType = Context.ShortTy; - BestWidth = ShortWidth; - } else if (NumNegativeBits <= IntWidth && NumPositiveBits < IntWidth) { - BestType = Context.IntTy; - BestWidth = IntWidth; - } else { - BestWidth = Context.getTargetInfo().getLongWidth(); - - if (NumNegativeBits <= BestWidth && NumPositiveBits < BestWidth) { - BestType = Context.LongTy; - } else { - BestWidth = Context.getTargetInfo().getLongLongWidth(); - - if (NumNegativeBits > BestWidth || NumPositiveBits >= BestWidth) - Diag(Enum->getLocation(), diag::ext_enum_too_large); - BestType = Context.LongLongTy; - } - } - BestPromotionType = (BestWidth <= IntWidth ? Context.IntTy : BestType); } else { - // If there is no negative value, figure out the smallest type that fits - // all of the enumerator values. - // If it's packed, check also if it fits a char or a short. - if (Packed && NumPositiveBits <= CharWidth) { - BestType = Context.UnsignedCharTy; - BestPromotionType = Context.IntTy; - BestWidth = CharWidth; - } else if (Packed && NumPositiveBits <= ShortWidth) { - BestType = Context.UnsignedShortTy; - BestPromotionType = Context.IntTy; - BestWidth = ShortWidth; - } else if (NumPositiveBits <= IntWidth) { - BestType = Context.UnsignedIntTy; - BestWidth = IntWidth; - BestPromotionType - = (NumPositiveBits == BestWidth || !getLangOpts().CPlusPlus) - ? 
Context.UnsignedIntTy : Context.IntTy; - } else if (NumPositiveBits <= - (BestWidth = Context.getTargetInfo().getLongWidth())) { - BestType = Context.UnsignedLongTy; - BestPromotionType - = (NumPositiveBits == BestWidth || !getLangOpts().CPlusPlus) - ? Context.UnsignedLongTy : Context.LongTy; - } else { - BestWidth = Context.getTargetInfo().getLongLongWidth(); - if (NumPositiveBits > BestWidth) { - // This can happen with bit-precise integer types, but those are not - // allowed as the type for an enumerator per C23 6.7.2.2p4 and p12. - // FIXME: GCC uses __int128_t and __uint128_t for cases that fit within - // a 128-bit integer, we should consider doing the same. - Diag(Enum->getLocation(), diag::ext_enum_too_large); - } - BestType = Context.UnsignedLongLongTy; - BestPromotionType - = (NumPositiveBits == BestWidth || !getLangOpts().CPlusPlus) - ? Context.UnsignedLongLongTy : Context.LongLongTy; - } + bool EnumTooLarge = Context.computeBestEnumTypes( + Packed, NumNegativeBits, NumPositiveBits, BestType, BestPromotionType); + BestWidth = Context.getIntWidth(BestType); + if (EnumTooLarge) + Diag(Enum->getLocation(), diag::ext_enum_too_large); } // Loop over all of the enumerator constants, changing their types to match @@ -20210,7 +20145,7 @@ void Sema::ActOnEnumBody(SourceLocation EnumLoc, SourceRange BraceRange, // int; or, // - the enumerated type NewTy = Context.IntTy; - NewWidth = IntWidth; + NewWidth = Context.getTargetInfo().getIntWidth(); NewSign = true; } else if (ECD->getType() == BestType) { // Already the right type! 
diff --git a/clang/lib/Sema/SemaDeclCXX.cpp b/clang/lib/Sema/SemaDeclCXX.cpp index c4bee44f5ec04..a867ed73bd403 100644 --- a/clang/lib/Sema/SemaDeclCXX.cpp +++ b/clang/lib/Sema/SemaDeclCXX.cpp @@ -10757,6 +10757,22 @@ static void checkMethodTypeQualifiers(Sema &S, Declarator &D, unsigned DiagID) { } } +static void diagnoseInvalidDeclaratorChunks(Sema &S, Declarator &D, + unsigned Kind) { + if (D.isInvalidType() || D.getNumTypeObjects() <= 1) + return; + + DeclaratorChunk &Chunk = D.getTypeObject(D.getNumTypeObjects() - 1); + if (Chunk.Kind == DeclaratorChunk::Paren || + Chunk.Kind == DeclaratorChunk::Function) + return; + + SourceLocation PointerLoc = Chunk.getSourceRange().getBegin(); + S.Diag(PointerLoc, diag::err_invalid_ctor_dtor_decl) + << Kind << Chunk.getSourceRange(); + D.setInvalidType(); +} + QualType Sema::CheckConstructorDeclarator(Declarator &D, QualType R, StorageClass &SC) { bool isVirtual = D.getDeclSpec().isVirtualSpecified(); @@ -10792,6 +10808,7 @@ QualType Sema::CheckConstructorDeclarator(Declarator &D, QualType R, } checkMethodTypeQualifiers(*this, D, diag::err_invalid_qualified_constructor); + diagnoseInvalidDeclaratorChunks(*this, D, /*constructor*/ 0); // C++0x [class.ctor]p4: // A constructor shall not be declared with a ref-qualifier. @@ -10958,6 +10975,7 @@ QualType Sema::CheckDestructorDeclarator(Declarator &D, QualType R, } checkMethodTypeQualifiers(*this, D, diag::err_invalid_qualified_destructor); + diagnoseInvalidDeclaratorChunks(*this, D, /*destructor*/ 1); // C++0x [class.dtor]p2: // A destructor shall not be declared with a ref-qualifier. @@ -13217,18 +13235,18 @@ bool Sema::CheckUsingDeclQualifier(SourceLocation UsingLoc, bool HasTypename, if (getLangOpts().CPlusPlus11) { // Convert 'using X::Y;' to 'using Y = X::Y;'. 
Diag(SS.getBeginLoc(), diag::note_using_decl_class_member_workaround) - << 0 // alias declaration - << FixItHint::CreateInsertion(SS.getBeginLoc(), - NameInfo.getName().getAsString() + - " = "); + << diag::MemClassWorkaround::AliasDecl + << FixItHint::CreateInsertion(SS.getBeginLoc(), + NameInfo.getName().getAsString() + + " = "); } else { // Convert 'using X::Y;' to 'typedef X::Y Y;'. SourceLocation InsertLoc = getLocForEndOfToken(NameInfo.getEndLoc()); Diag(InsertLoc, diag::note_using_decl_class_member_workaround) - << 1 // typedef declaration - << FixItHint::CreateReplacement(UsingLoc, "typedef") - << FixItHint::CreateInsertion( - InsertLoc, " " + NameInfo.getName().getAsString()); + << diag::MemClassWorkaround::TypedefDecl + << FixItHint::CreateReplacement(UsingLoc, "typedef") + << FixItHint::CreateInsertion( + InsertLoc, " " + NameInfo.getName().getAsString()); } } else if (R->getAsSingle()) { // Don't provide a fixit outside C++11 mode; we don't want to suggest @@ -13241,8 +13259,7 @@ bool Sema::CheckUsingDeclQualifier(SourceLocation UsingLoc, bool HasTypename, } Diag(UsingLoc, diag::note_using_decl_class_member_workaround) - << 2 // reference declaration - << FixIt; + << diag::MemClassWorkaround::ReferenceDecl << FixIt; } else if (R->getAsSingle()) { // Don't provide a fixit outside C++11 mode; we don't want to suggest // repeating the type of the enumeration here, and we can't do so if @@ -13256,8 +13273,10 @@ bool Sema::CheckUsingDeclQualifier(SourceLocation UsingLoc, bool HasTypename, } Diag(UsingLoc, diag::note_using_decl_class_member_workaround) - << (getLangOpts().CPlusPlus11 ? 4 : 3) // const[expr] variable - << FixIt; + << (getLangOpts().CPlusPlus11 + ? 
diag::MemClassWorkaround::ConstexprVar + : diag::MemClassWorkaround::ConstVar) + << FixIt; } } diff --git a/clang/lib/Sema/SemaFunctionEffects.cpp b/clang/lib/Sema/SemaFunctionEffects.cpp index 0d56a74b066e8..31980abd23fd1 100644 --- a/clang/lib/Sema/SemaFunctionEffects.cpp +++ b/clang/lib/Sema/SemaFunctionEffects.cpp @@ -567,7 +567,7 @@ class Analyzer { while (!VerificationQueue.empty()) { const Decl *D = VerificationQueue.back(); if (FuncAnalysisPtr AP = DeclAnalysis.lookup(D)) { - if (auto *Pending = AP.dyn_cast()) { + if (auto *Pending = dyn_cast(AP)) { // All children have been traversed; finish analysis. finishPendingAnalysis(D, Pending); } diff --git a/clang/lib/Sema/SemaHLSL.cpp b/clang/lib/Sema/SemaHLSL.cpp index 65ddee05a2151..5001883003ee2 100644 --- a/clang/lib/Sema/SemaHLSL.cpp +++ b/clang/lib/Sema/SemaHLSL.cpp @@ -1688,13 +1688,21 @@ static bool CheckVectorElementCallArgs(Sema *S, CallExpr *TheCall) { auto *VecTyA = ArgTyA->getAs(); SourceLocation BuiltinLoc = TheCall->getBeginLoc(); + bool AllBArgAreVectors = true; for (unsigned i = 1; i < TheCall->getNumArgs(); ++i) { ExprResult B = TheCall->getArg(i); QualType ArgTyB = B.get()->getType(); auto *VecTyB = ArgTyB->getAs(); - if (VecTyA == nullptr && VecTyB == nullptr) - return false; - + if (VecTyB == nullptr) + AllBArgAreVectors &= false; + if (VecTyA && VecTyB == nullptr) { + // Note: if we get here 'B' is scalar which + // requires a VectorSplat on ArgN + S->Diag(BuiltinLoc, diag::err_vec_builtin_non_vector) + << TheCall->getDirectCallee() << /*useAllTerminology*/ true + << SourceRange(A.get()->getBeginLoc(), B.get()->getEndLoc()); + return true; + } if (VecTyA && VecTyB) { bool retValue = false; if (VecTyA->getElementType() != VecTyB->getElementType()) { @@ -1712,21 +1720,23 @@ static bool CheckVectorElementCallArgs(Sema *S, CallExpr *TheCall) { // HLSLVectorTruncation. 
S->Diag(BuiltinLoc, diag::err_vec_builtin_incompatible_vector) << TheCall->getDirectCallee() << /*useAllTerminology*/ true - << SourceRange(TheCall->getArg(0)->getBeginLoc(), - TheCall->getArg(1)->getEndLoc()); + << SourceRange(A.get()->getBeginLoc(), B.get()->getEndLoc()); retValue = true; } - return retValue; + if (retValue) + return retValue; } } - // Note: if we get here one of the args is a scalar which - // requires a VectorSplat on Arg0 or Arg1 - S->Diag(BuiltinLoc, diag::err_vec_builtin_non_vector) - << TheCall->getDirectCallee() << /*useAllTerminology*/ true - << SourceRange(TheCall->getArg(0)->getBeginLoc(), - TheCall->getArg(1)->getEndLoc()); - return true; + if (VecTyA == nullptr && AllBArgAreVectors) { + // Note: if we get here 'A' is a scalar which + // requires a VectorSplat on Arg0 + S->Diag(BuiltinLoc, diag::err_vec_builtin_non_vector) + << TheCall->getDirectCallee() << /*useAllTerminology*/ true + << SourceRange(A.get()->getBeginLoc(), A.get()->getEndLoc()); + return true; + } + return false; } static bool CheckArgTypeMatches(Sema *S, Expr *Arg, QualType ExpectedType) { @@ -1859,7 +1869,24 @@ static bool CheckAnyScalarOrVector(Sema *S, CallExpr *TheCall, (VTy && VTy->getElementType()->isScalarType()))) { S->Diag(TheCall->getArg(0)->getBeginLoc(), diag::err_typecheck_expect_any_scalar_or_vector) - << ArgType; + << ArgType << 1; + return true; + } + return false; +} + +static bool CheckWaveActive(Sema *S, CallExpr *TheCall) { + QualType BoolType = S->getASTContext().BoolTy; + assert(TheCall->getNumArgs() >= 1); + QualType ArgType = TheCall->getArg(0)->getType(); + auto *VTy = ArgType->getAs(); + // is the bool or vector + if (S->Context.hasSameUnqualifiedType(ArgType, BoolType) || + (VTy && + S->Context.hasSameUnqualifiedType(VTy->getElementType(), BoolType))) { + S->Diag(TheCall->getArg(0)->getBeginLoc(), + diag::err_typecheck_expect_any_scalar_or_vector) + << ArgType << 0; return true; } return false; @@ -2036,7 +2063,8 @@ bool 
SemaHLSL::CheckBuiltinFunctionCall(unsigned BuiltinID, CallExpr *TheCall) { return true; break; } - case Builtin::BI__builtin_hlsl_elementwise_firstbithigh: { + case Builtin::BI__builtin_hlsl_elementwise_firstbithigh: + case Builtin::BI__builtin_hlsl_elementwise_firstbitlow: { if (SemaRef.PrepareBuiltinElementwiseMathOneArgCall(TheCall)) return true; @@ -2155,6 +2183,20 @@ bool SemaHLSL::CheckBuiltinFunctionCall(unsigned BuiltinID, CallExpr *TheCall) { TheCall->setType(ArgTyA); break; } + case Builtin::BI__builtin_hlsl_wave_active_sum: { + if (SemaRef.checkArgCount(TheCall, 1)) + return true; + + // Ensure input expr type is a scalar/vector and the same as the return type + if (CheckAnyScalarOrVector(&SemaRef, TheCall, 0)) + return true; + if (CheckWaveActive(&SemaRef, TheCall)) + return true; + ExprResult Expr = TheCall->getArg(0); + QualType ArgTyExpr = Expr.get()->getType(); + TheCall->setType(ArgTyExpr); + break; + } // Note these are llvm builtins that we want to catch invalid intrinsic // generation. Normal handling of these builitns will occur elsewhere. case Builtin::BI__builtin_elementwise_bitreverse: { diff --git a/clang/lib/Sema/SemaLookup.cpp b/clang/lib/Sema/SemaLookup.cpp index e1171d4284c76..9d8cdc9c08525 100644 --- a/clang/lib/Sema/SemaLookup.cpp +++ b/clang/lib/Sema/SemaLookup.cpp @@ -1624,6 +1624,11 @@ bool Sema::isUsableModule(const Module *M) { if (!Current) return false; + // For implicit global module, the decls in the same modules with the parent + // module should be visible to the decls in the implicit global module. + if (Current->isImplicitGlobalModule()) + Current = Current->getTopLevelModule(); + // If M is the module we're parsing or M and the current module unit lives in // the same module, M should be usable. 
// diff --git a/clang/lib/Sema/SemaOpenACC.cpp b/clang/lib/Sema/SemaOpenACC.cpp index 9ded913638fb3..f5edc0ed36a9a 100644 --- a/clang/lib/Sema/SemaOpenACC.cpp +++ b/clang/lib/Sema/SemaOpenACC.cpp @@ -6,1920 +6,47 @@ // //===----------------------------------------------------------------------===// /// \file -/// This file implements semantic analysis for OpenACC constructs and -/// clauses. +/// This file implements semantic analysis for OpenACC constructs, and things +/// that are not clause specific. /// //===----------------------------------------------------------------------===// -#include "clang/Sema/SemaOpenACC.h" -#include "clang/AST/StmtOpenACC.h" -#include "clang/Basic/DiagnosticSema.h" -#include "clang/Basic/OpenACCKinds.h" -#include "clang/Sema/Sema.h" -#include "llvm/ADT/StringExtras.h" -#include "llvm/Support/Casting.h" - -using namespace clang; - -namespace { -bool diagnoseConstructAppertainment(SemaOpenACC &S, OpenACCDirectiveKind K, - SourceLocation StartLoc, bool IsStmt) { - switch (K) { - default: - case OpenACCDirectiveKind::Invalid: - // Nothing to do here, both invalid and unimplemented don't really need to - // do anything. - break; - case OpenACCDirectiveKind::ParallelLoop: - case OpenACCDirectiveKind::SerialLoop: - case OpenACCDirectiveKind::KernelsLoop: - case OpenACCDirectiveKind::Parallel: - case OpenACCDirectiveKind::Serial: - case OpenACCDirectiveKind::Kernels: - case OpenACCDirectiveKind::Loop: - case OpenACCDirectiveKind::Data: - case OpenACCDirectiveKind::EnterData: - case OpenACCDirectiveKind::ExitData: - case OpenACCDirectiveKind::HostData: - case OpenACCDirectiveKind::Wait: - if (!IsStmt) - return S.Diag(StartLoc, diag::err_acc_construct_appertainment) << K; - break; - } - return false; -} - -bool doesClauseApplyToDirective(OpenACCDirectiveKind DirectiveKind, - OpenACCClauseKind ClauseKind) { - switch (ClauseKind) { - // FIXME: For each clause as we implement them, we can add the - // 'legalization' list here. 
- case OpenACCClauseKind::Default: - switch (DirectiveKind) { - case OpenACCDirectiveKind::Parallel: - case OpenACCDirectiveKind::Serial: - case OpenACCDirectiveKind::Kernels: - case OpenACCDirectiveKind::ParallelLoop: - case OpenACCDirectiveKind::SerialLoop: - case OpenACCDirectiveKind::KernelsLoop: - case OpenACCDirectiveKind::Data: - return true; - default: - return false; - } - case OpenACCClauseKind::If: - switch (DirectiveKind) { - case OpenACCDirectiveKind::Parallel: - case OpenACCDirectiveKind::Serial: - case OpenACCDirectiveKind::Kernels: - case OpenACCDirectiveKind::Data: - case OpenACCDirectiveKind::EnterData: - case OpenACCDirectiveKind::ExitData: - case OpenACCDirectiveKind::HostData: - case OpenACCDirectiveKind::Init: - case OpenACCDirectiveKind::Shutdown: - case OpenACCDirectiveKind::Set: - case OpenACCDirectiveKind::Update: - case OpenACCDirectiveKind::Wait: - case OpenACCDirectiveKind::ParallelLoop: - case OpenACCDirectiveKind::SerialLoop: - case OpenACCDirectiveKind::KernelsLoop: - return true; - default: - return false; - } - case OpenACCClauseKind::Self: - switch (DirectiveKind) { - case OpenACCDirectiveKind::Parallel: - case OpenACCDirectiveKind::Serial: - case OpenACCDirectiveKind::Kernels: - case OpenACCDirectiveKind::Update: - case OpenACCDirectiveKind::ParallelLoop: - case OpenACCDirectiveKind::SerialLoop: - case OpenACCDirectiveKind::KernelsLoop: - return true; - default: - return false; - } - case OpenACCClauseKind::NumGangs: - case OpenACCClauseKind::NumWorkers: - case OpenACCClauseKind::VectorLength: - switch (DirectiveKind) { - case OpenACCDirectiveKind::Parallel: - case OpenACCDirectiveKind::Kernels: - case OpenACCDirectiveKind::ParallelLoop: - case OpenACCDirectiveKind::KernelsLoop: - return true; - default: - return false; - } - case OpenACCClauseKind::FirstPrivate: - switch (DirectiveKind) { - case OpenACCDirectiveKind::Parallel: - case OpenACCDirectiveKind::Serial: - case OpenACCDirectiveKind::ParallelLoop: - case 
OpenACCDirectiveKind::SerialLoop: - return true; - default: - return false; - } - case OpenACCClauseKind::Private: - switch (DirectiveKind) { - case OpenACCDirectiveKind::Parallel: - case OpenACCDirectiveKind::Serial: - case OpenACCDirectiveKind::Loop: - case OpenACCDirectiveKind::ParallelLoop: - case OpenACCDirectiveKind::SerialLoop: - case OpenACCDirectiveKind::KernelsLoop: - return true; - default: - return false; - } - case OpenACCClauseKind::NoCreate: - switch (DirectiveKind) { - case OpenACCDirectiveKind::Parallel: - case OpenACCDirectiveKind::Serial: - case OpenACCDirectiveKind::Kernels: - case OpenACCDirectiveKind::Data: - case OpenACCDirectiveKind::ParallelLoop: - case OpenACCDirectiveKind::SerialLoop: - case OpenACCDirectiveKind::KernelsLoop: - return true; - default: - return false; - } - case OpenACCClauseKind::Present: - switch (DirectiveKind) { - case OpenACCDirectiveKind::Parallel: - case OpenACCDirectiveKind::Serial: - case OpenACCDirectiveKind::Kernels: - case OpenACCDirectiveKind::Data: - case OpenACCDirectiveKind::Declare: - case OpenACCDirectiveKind::ParallelLoop: - case OpenACCDirectiveKind::SerialLoop: - case OpenACCDirectiveKind::KernelsLoop: - return true; - default: - return false; - } - - case OpenACCClauseKind::Copy: - case OpenACCClauseKind::PCopy: - case OpenACCClauseKind::PresentOrCopy: - switch (DirectiveKind) { - case OpenACCDirectiveKind::Parallel: - case OpenACCDirectiveKind::Serial: - case OpenACCDirectiveKind::Kernels: - case OpenACCDirectiveKind::Data: - case OpenACCDirectiveKind::Declare: - case OpenACCDirectiveKind::ParallelLoop: - case OpenACCDirectiveKind::SerialLoop: - case OpenACCDirectiveKind::KernelsLoop: - return true; - default: - return false; - } - case OpenACCClauseKind::CopyIn: - case OpenACCClauseKind::PCopyIn: - case OpenACCClauseKind::PresentOrCopyIn: - switch (DirectiveKind) { - case OpenACCDirectiveKind::Parallel: - case OpenACCDirectiveKind::Serial: - case OpenACCDirectiveKind::Kernels: - case 
OpenACCDirectiveKind::Data: - case OpenACCDirectiveKind::EnterData: - case OpenACCDirectiveKind::Declare: - case OpenACCDirectiveKind::ParallelLoop: - case OpenACCDirectiveKind::SerialLoop: - case OpenACCDirectiveKind::KernelsLoop: - return true; - default: - return false; - } - case OpenACCClauseKind::CopyOut: - case OpenACCClauseKind::PCopyOut: - case OpenACCClauseKind::PresentOrCopyOut: - switch (DirectiveKind) { - case OpenACCDirectiveKind::Parallel: - case OpenACCDirectiveKind::Serial: - case OpenACCDirectiveKind::Kernels: - case OpenACCDirectiveKind::Data: - case OpenACCDirectiveKind::ExitData: - case OpenACCDirectiveKind::Declare: - case OpenACCDirectiveKind::ParallelLoop: - case OpenACCDirectiveKind::SerialLoop: - case OpenACCDirectiveKind::KernelsLoop: - return true; - default: - return false; - } - case OpenACCClauseKind::Create: - case OpenACCClauseKind::PCreate: - case OpenACCClauseKind::PresentOrCreate: - switch (DirectiveKind) { - case OpenACCDirectiveKind::Parallel: - case OpenACCDirectiveKind::Serial: - case OpenACCDirectiveKind::Kernels: - case OpenACCDirectiveKind::Data: - case OpenACCDirectiveKind::EnterData: - case OpenACCDirectiveKind::ParallelLoop: - case OpenACCDirectiveKind::SerialLoop: - case OpenACCDirectiveKind::KernelsLoop: - return true; - default: - return false; - } - - case OpenACCClauseKind::Attach: - switch (DirectiveKind) { - case OpenACCDirectiveKind::Parallel: - case OpenACCDirectiveKind::Serial: - case OpenACCDirectiveKind::Kernels: - case OpenACCDirectiveKind::Data: - case OpenACCDirectiveKind::EnterData: - case OpenACCDirectiveKind::ParallelLoop: - case OpenACCDirectiveKind::SerialLoop: - case OpenACCDirectiveKind::KernelsLoop: - return true; - default: - return false; - } - case OpenACCClauseKind::DevicePtr: - switch (DirectiveKind) { - case OpenACCDirectiveKind::Parallel: - case OpenACCDirectiveKind::Serial: - case OpenACCDirectiveKind::Kernels: - case OpenACCDirectiveKind::Data: - case OpenACCDirectiveKind::Declare: - case 
OpenACCDirectiveKind::ParallelLoop: - case OpenACCDirectiveKind::SerialLoop: - case OpenACCDirectiveKind::KernelsLoop: - return true; - default: - return false; - } - case OpenACCClauseKind::Async: - switch (DirectiveKind) { - case OpenACCDirectiveKind::Parallel: - case OpenACCDirectiveKind::Serial: - case OpenACCDirectiveKind::Kernels: - case OpenACCDirectiveKind::Data: - case OpenACCDirectiveKind::EnterData: - case OpenACCDirectiveKind::ExitData: - case OpenACCDirectiveKind::Set: - case OpenACCDirectiveKind::Update: - case OpenACCDirectiveKind::Wait: - case OpenACCDirectiveKind::ParallelLoop: - case OpenACCDirectiveKind::SerialLoop: - case OpenACCDirectiveKind::KernelsLoop: - return true; - default: - return false; - } - case OpenACCClauseKind::Wait: - switch (DirectiveKind) { - case OpenACCDirectiveKind::Parallel: - case OpenACCDirectiveKind::Serial: - case OpenACCDirectiveKind::Kernels: - case OpenACCDirectiveKind::Data: - case OpenACCDirectiveKind::EnterData: - case OpenACCDirectiveKind::ExitData: - case OpenACCDirectiveKind::Update: - case OpenACCDirectiveKind::ParallelLoop: - case OpenACCDirectiveKind::SerialLoop: - case OpenACCDirectiveKind::KernelsLoop: - return true; - default: - return false; - } - - case OpenACCClauseKind::Seq: - switch (DirectiveKind) { - case OpenACCDirectiveKind::Loop: - case OpenACCDirectiveKind::Routine: - case OpenACCDirectiveKind::ParallelLoop: - case OpenACCDirectiveKind::SerialLoop: - case OpenACCDirectiveKind::KernelsLoop: - return true; - default: - return false; - } - - case OpenACCClauseKind::Independent: - case OpenACCClauseKind::Auto: - switch (DirectiveKind) { - case OpenACCDirectiveKind::Loop: - case OpenACCDirectiveKind::ParallelLoop: - case OpenACCDirectiveKind::SerialLoop: - case OpenACCDirectiveKind::KernelsLoop: - return true; - default: - return false; - } - - case OpenACCClauseKind::Reduction: - switch (DirectiveKind) { - case OpenACCDirectiveKind::Parallel: - case OpenACCDirectiveKind::Serial: - case 
OpenACCDirectiveKind::Loop: - case OpenACCDirectiveKind::ParallelLoop: - case OpenACCDirectiveKind::SerialLoop: - case OpenACCDirectiveKind::KernelsLoop: - return true; - default: - return false; - } - - case OpenACCClauseKind::DeviceType: - case OpenACCClauseKind::DType: - switch (DirectiveKind) { - case OpenACCDirectiveKind::Parallel: - case OpenACCDirectiveKind::Serial: - case OpenACCDirectiveKind::Kernels: - case OpenACCDirectiveKind::Data: - case OpenACCDirectiveKind::Init: - case OpenACCDirectiveKind::Shutdown: - case OpenACCDirectiveKind::Set: - case OpenACCDirectiveKind::Update: - case OpenACCDirectiveKind::Loop: - case OpenACCDirectiveKind::Routine: - case OpenACCDirectiveKind::ParallelLoop: - case OpenACCDirectiveKind::SerialLoop: - case OpenACCDirectiveKind::KernelsLoop: - return true; - default: - return false; - } - - case OpenACCClauseKind::Collapse: { - switch (DirectiveKind) { - case OpenACCDirectiveKind::Loop: - case OpenACCDirectiveKind::ParallelLoop: - case OpenACCDirectiveKind::SerialLoop: - case OpenACCDirectiveKind::KernelsLoop: - return true; - default: - return false; - } - } - case OpenACCClauseKind::Tile: { - switch (DirectiveKind) { - case OpenACCDirectiveKind::Loop: - case OpenACCDirectiveKind::ParallelLoop: - case OpenACCDirectiveKind::SerialLoop: - case OpenACCDirectiveKind::KernelsLoop: - return true; - default: - return false; - } - } - - case OpenACCClauseKind::Gang: { - switch (DirectiveKind) { - case OpenACCDirectiveKind::Loop: - case OpenACCDirectiveKind::ParallelLoop: - case OpenACCDirectiveKind::SerialLoop: - case OpenACCDirectiveKind::KernelsLoop: - case OpenACCDirectiveKind::Routine: - return true; - default: - return false; - } - case OpenACCClauseKind::Worker: { - switch (DirectiveKind) { - case OpenACCDirectiveKind::Loop: - case OpenACCDirectiveKind::ParallelLoop: - case OpenACCDirectiveKind::SerialLoop: - case OpenACCDirectiveKind::KernelsLoop: - case OpenACCDirectiveKind::Routine: - return true; - default: - return 
false; - } - } - case OpenACCClauseKind::Vector: { - switch (DirectiveKind) { - case OpenACCDirectiveKind::Loop: - case OpenACCDirectiveKind::ParallelLoop: - case OpenACCDirectiveKind::SerialLoop: - case OpenACCDirectiveKind::KernelsLoop: - case OpenACCDirectiveKind::Routine: - return true; - default: - return false; - } - } - case OpenACCClauseKind::Finalize: { - switch (DirectiveKind) { - case OpenACCDirectiveKind::ExitData: - return true; - default: - return false; - } - } - case OpenACCClauseKind::IfPresent: { - switch (DirectiveKind) { - case OpenACCDirectiveKind::HostData: - case OpenACCDirectiveKind::Update: - return true; - default: - return false; - } - } - case OpenACCClauseKind::Delete: { - switch (DirectiveKind) { - case OpenACCDirectiveKind::ExitData: - return true; - default: - return false; - } - } - - case OpenACCClauseKind::Detach: { - switch (DirectiveKind) { - case OpenACCDirectiveKind::ExitData: - return true; - default: - return false; - } - } - - case OpenACCClauseKind::DeviceNum: { - switch (DirectiveKind) { - case OpenACCDirectiveKind::Init: - case OpenACCDirectiveKind::Shutdown: - case OpenACCDirectiveKind::Set: - return true; - default: - return false; - } - } - - case OpenACCClauseKind::UseDevice: { - switch (DirectiveKind) { - case OpenACCDirectiveKind::HostData: - return true; - default: - return false; - } - } - case OpenACCClauseKind::DefaultAsync: { - switch (DirectiveKind) { - case OpenACCDirectiveKind::Set: - return true; - default: - return false; - } - } - case OpenACCClauseKind::Device: { - switch (DirectiveKind) { - case OpenACCDirectiveKind::Update: - return true; - default: - return false; - } - } - case OpenACCClauseKind::Host: { - switch (DirectiveKind) { - case OpenACCDirectiveKind::Update: - return true; - default: - return false; - } - } - } - - default: - // Do nothing so we can go to the 'unimplemented' diagnostic instead. 
- return true; - } - llvm_unreachable("Invalid clause kind"); -} - -bool checkAlreadyHasClauseOfKind( - SemaOpenACC &S, ArrayRef ExistingClauses, - SemaOpenACC::OpenACCParsedClause &Clause) { - const auto *Itr = llvm::find_if(ExistingClauses, [&](const OpenACCClause *C) { - return C->getClauseKind() == Clause.getClauseKind(); - }); - if (Itr != ExistingClauses.end()) { - S.Diag(Clause.getBeginLoc(), diag::err_acc_duplicate_clause_disallowed) - << Clause.getDirectiveKind() << Clause.getClauseKind(); - S.Diag((*Itr)->getBeginLoc(), diag::note_acc_previous_clause_here); - return true; - } - return false; -} - -bool checkValidAfterDeviceType( - SemaOpenACC &S, const OpenACCDeviceTypeClause &DeviceTypeClause, - const SemaOpenACC::OpenACCParsedClause &NewClause) { - // This is implemented for everything but 'routine', so treat as 'fine' for - // that. - if (NewClause.getDirectiveKind() == OpenACCDirectiveKind::Routine) - return false; - - // OpenACC3.3: Section 2.4: Clauses that precede any device_type clause are - // default clauses. Clauses that follow a device_type clause up to the end of - // the directive or up to the next device_type clause are device-specific - // clauses for the device types specified in the device_type argument. - // - // The above implies that despite what the individual text says, these are - // valid. - if (NewClause.getClauseKind() == OpenACCClauseKind::DType || - NewClause.getClauseKind() == OpenACCClauseKind::DeviceType) - return false; - - // Implement check from OpenACC3.3: section 2.5.4: - // Only the async, wait, num_gangs, num_workers, and vector_length clauses may - // follow a device_type clause. 
- if (isOpenACCComputeDirectiveKind(NewClause.getDirectiveKind())) { - switch (NewClause.getClauseKind()) { - case OpenACCClauseKind::Async: - case OpenACCClauseKind::Wait: - case OpenACCClauseKind::NumGangs: - case OpenACCClauseKind::NumWorkers: - case OpenACCClauseKind::VectorLength: - return false; - default: - break; - } - } else if (NewClause.getDirectiveKind() == OpenACCDirectiveKind::Loop) { - // Implement check from OpenACC3.3: section 2.9: - // Only the collapse, gang, worker, vector, seq, independent, auto, and tile - // clauses may follow a device_type clause. - switch (NewClause.getClauseKind()) { - case OpenACCClauseKind::Collapse: - case OpenACCClauseKind::Gang: - case OpenACCClauseKind::Worker: - case OpenACCClauseKind::Vector: - case OpenACCClauseKind::Seq: - case OpenACCClauseKind::Independent: - case OpenACCClauseKind::Auto: - case OpenACCClauseKind::Tile: - return false; - default: - break; - } - } else if (isOpenACCCombinedDirectiveKind(NewClause.getDirectiveKind())) { - // This seems like it should be the union of 2.9 and 2.5.4 from above. - switch (NewClause.getClauseKind()) { - case OpenACCClauseKind::Async: - case OpenACCClauseKind::Wait: - case OpenACCClauseKind::NumGangs: - case OpenACCClauseKind::NumWorkers: - case OpenACCClauseKind::VectorLength: - case OpenACCClauseKind::Collapse: - case OpenACCClauseKind::Gang: - case OpenACCClauseKind::Worker: - case OpenACCClauseKind::Vector: - case OpenACCClauseKind::Seq: - case OpenACCClauseKind::Independent: - case OpenACCClauseKind::Auto: - case OpenACCClauseKind::Tile: - return false; - default: - break; - } - } else if (NewClause.getDirectiveKind() == OpenACCDirectiveKind::Data) { - // OpenACC3.3 section 2.6.5: Only the async and wait clauses may follow a - // device_type clause. 
- switch (NewClause.getClauseKind()) { - case OpenACCClauseKind::Async: - case OpenACCClauseKind::Wait: - return false; - default: - break; - } - } else if (NewClause.getDirectiveKind() == OpenACCDirectiveKind::Set || - NewClause.getDirectiveKind() == OpenACCDirectiveKind::Init || - NewClause.getDirectiveKind() == OpenACCDirectiveKind::Shutdown) { - // There are no restrictions on 'set', 'init', or 'shutdown'. - return false; - } else if (NewClause.getDirectiveKind() == OpenACCDirectiveKind::Update) { - // OpenACC3.3 section 2.14.4: Only the async and wait clauses may follow a - // device_type clause. - switch (NewClause.getClauseKind()) { - case OpenACCClauseKind::Async: - case OpenACCClauseKind::Wait: - return false; - default: - break; - } - } - S.Diag(NewClause.getBeginLoc(), diag::err_acc_clause_after_device_type) - << NewClause.getClauseKind() << DeviceTypeClause.getClauseKind() - << NewClause.getDirectiveKind(); - S.Diag(DeviceTypeClause.getBeginLoc(), diag::note_acc_previous_clause_here); - return true; -} - -// A temporary function that helps implement the 'not implemented' check at the -// top of each clause checking function. This should only be used in conjunction -// with the one being currently implemented/only updated after the entire -// construct has been implemented. 
-bool isDirectiveKindImplemented(OpenACCDirectiveKind DK) { - return isOpenACCComputeDirectiveKind(DK) || - isOpenACCCombinedDirectiveKind(DK) || isOpenACCDataDirectiveKind(DK) || - DK == OpenACCDirectiveKind::Loop || DK == OpenACCDirectiveKind::Wait || - DK == OpenACCDirectiveKind::Init || - DK == OpenACCDirectiveKind::Shutdown || - DK == OpenACCDirectiveKind::Set; -} - -class SemaOpenACCClauseVisitor { - SemaOpenACC &SemaRef; - ASTContext &Ctx; - ArrayRef ExistingClauses; - bool NotImplemented = false; - - OpenACCClause *isNotImplemented() { - NotImplemented = true; - return nullptr; - } - - // OpenACC 3.3 2.9: - // A 'gang', 'worker', or 'vector' clause may not appear if a 'seq' clause - // appears. - bool DiagIfSeqClause(SemaOpenACC::OpenACCParsedClause &Clause) { - const auto *Itr = - llvm::find_if(ExistingClauses, llvm::IsaPred); - - if (Itr != ExistingClauses.end()) { - SemaRef.Diag(Clause.getBeginLoc(), diag::err_acc_clause_cannot_combine) - << Clause.getClauseKind() << (*Itr)->getClauseKind() - << Clause.getDirectiveKind(); - SemaRef.Diag((*Itr)->getBeginLoc(), diag::note_acc_previous_clause_here); - - return true; - } - return false; - } - -public: - SemaOpenACCClauseVisitor(SemaOpenACC &S, - ArrayRef ExistingClauses) - : SemaRef(S), Ctx(S.getASTContext()), ExistingClauses(ExistingClauses) {} - // Once we've implemented everything, we shouldn't need this infrastructure. - // But in the meantime, we use this to help decide whether the clause was - // handled for this directive. 
- bool diagNotImplemented() { return NotImplemented; } - - OpenACCClause *Visit(SemaOpenACC::OpenACCParsedClause &Clause) { - switch (Clause.getClauseKind()) { -#define VISIT_CLAUSE(CLAUSE_NAME) \ - case OpenACCClauseKind::CLAUSE_NAME: \ - return Visit##CLAUSE_NAME##Clause(Clause); -#define CLAUSE_ALIAS(ALIAS, CLAUSE_NAME, DEPRECATED) \ - case OpenACCClauseKind::ALIAS: \ - if (DEPRECATED) \ - SemaRef.Diag(Clause.getBeginLoc(), diag::warn_acc_deprecated_alias_name) \ - << Clause.getClauseKind() << OpenACCClauseKind::CLAUSE_NAME; \ - return Visit##CLAUSE_NAME##Clause(Clause); -#include "clang/Basic/OpenACCClauses.def" - default: - return isNotImplemented(); - } - llvm_unreachable("Invalid clause kind"); - } - -#define VISIT_CLAUSE(CLAUSE_NAME) \ - OpenACCClause *Visit##CLAUSE_NAME##Clause( \ - SemaOpenACC::OpenACCParsedClause &Clause); -#include "clang/Basic/OpenACCClauses.def" -}; - -OpenACCClause *SemaOpenACCClauseVisitor::VisitDefaultClause( - SemaOpenACC::OpenACCParsedClause &Clause) { - // Don't add an invalid clause to the AST. - if (Clause.getDefaultClauseKind() == OpenACCDefaultClauseKind::Invalid) - return nullptr; - - // OpenACC 3.3, Section 2.5.4: - // At most one 'default' clause may appear, and it must have a value of - // either 'none' or 'present'. - // Second half of the sentence is diagnosed during parsing. - if (checkAlreadyHasClauseOfKind(SemaRef, ExistingClauses, Clause)) - return nullptr; - - return OpenACCDefaultClause::Create( - Ctx, Clause.getDefaultClauseKind(), Clause.getBeginLoc(), - Clause.getLParenLoc(), Clause.getEndLoc()); -} - -OpenACCClause *SemaOpenACCClauseVisitor::VisitTileClause( - SemaOpenACC::OpenACCParsedClause &Clause) { - - // Duplicates here are not really sensible. We could possible permit - // multiples if they all had the same value, but there isn't really a good - // reason to do so. 
Also, this simplifies the suppression of duplicates, in - // that we know if we 'find' one after instantiation, that it is the same - // clause, which simplifies instantiation/checking/etc. - if (checkAlreadyHasClauseOfKind(SemaRef, ExistingClauses, Clause)) - return nullptr; - - llvm::SmallVector NewSizeExprs; - - // Make sure these are all positive constant expressions or *. - for (Expr *E : Clause.getIntExprs()) { - ExprResult Res = SemaRef.CheckTileSizeExpr(E); - - if (!Res.isUsable()) - return nullptr; - - NewSizeExprs.push_back(Res.get()); - } - - return OpenACCTileClause::Create(Ctx, Clause.getBeginLoc(), - Clause.getLParenLoc(), NewSizeExprs, - Clause.getEndLoc()); -} - -OpenACCClause *SemaOpenACCClauseVisitor::VisitIfClause( - SemaOpenACC::OpenACCParsedClause &Clause) { - // There is no prose in the standard that says duplicates aren't allowed, - // but this diagnostic is present in other compilers, as well as makes - // sense. Prose DOES exist for 'data' and 'host_data', 'set', 'enter data' and - // 'exit data' both don't, but other implmementations do this. OpenACC issue - // 519 filed for the latter two. Prose also exists for 'update'. - // GCC allows this on init/shutdown, presumably for good reason, so we do too. - if (Clause.getDirectiveKind() != OpenACCDirectiveKind::Init && - Clause.getDirectiveKind() != OpenACCDirectiveKind::Shutdown && - checkAlreadyHasClauseOfKind(SemaRef, ExistingClauses, Clause)) - return nullptr; - - // The parser has ensured that we have a proper condition expr, so there - // isn't really much to do here. - - // If the 'if' clause is true, it makes the 'self' clause have no effect, - // diagnose that here. This only applies on compute/combined constructs. 
- if (Clause.getDirectiveKind() != OpenACCDirectiveKind::Update) { - const auto *Itr = - llvm::find_if(ExistingClauses, llvm::IsaPred); - if (Itr != ExistingClauses.end()) { - SemaRef.Diag(Clause.getBeginLoc(), diag::warn_acc_if_self_conflict); - SemaRef.Diag((*Itr)->getBeginLoc(), diag::note_acc_previous_clause_here); - } - } - - return OpenACCIfClause::Create(Ctx, Clause.getBeginLoc(), - Clause.getLParenLoc(), - Clause.getConditionExpr(), Clause.getEndLoc()); -} - -OpenACCClause *SemaOpenACCClauseVisitor::VisitSelfClause( - SemaOpenACC::OpenACCParsedClause &Clause) { - // There is no prose in the standard that says duplicates aren't allowed, - // but this diagnostic is present in other compilers, as well as makes - // sense. - if (checkAlreadyHasClauseOfKind(SemaRef, ExistingClauses, Clause)) - return nullptr; - - // If the 'if' clause is true, it makes the 'self' clause have no effect, - // diagnose that here. This only applies on compute/combined constructs. - if (Clause.getDirectiveKind() == OpenACCDirectiveKind::Update) - return OpenACCSelfClause::Create(Ctx, Clause.getBeginLoc(), - Clause.getLParenLoc(), Clause.getVarList(), - Clause.getEndLoc()); - - const auto *Itr = - llvm::find_if(ExistingClauses, llvm::IsaPred); - if (Itr != ExistingClauses.end()) { - SemaRef.Diag(Clause.getBeginLoc(), diag::warn_acc_if_self_conflict); - SemaRef.Diag((*Itr)->getBeginLoc(), diag::note_acc_previous_clause_here); - } - return OpenACCSelfClause::Create( - Ctx, Clause.getBeginLoc(), Clause.getLParenLoc(), - Clause.getConditionExpr(), Clause.getEndLoc()); -} - -OpenACCClause *SemaOpenACCClauseVisitor::VisitNumGangsClause( - SemaOpenACC::OpenACCParsedClause &Clause) { - // There is no prose in the standard that says duplicates aren't allowed, - // but this diagnostic is present in other compilers, as well as makes - // sense. - if (checkAlreadyHasClauseOfKind(SemaRef, ExistingClauses, Clause)) - return nullptr; - - // num_gangs requires at least 1 int expr in all forms. 
Diagnose here, but - // allow us to continue, an empty clause might be useful for future - // diagnostics. - if (Clause.getIntExprs().empty()) - SemaRef.Diag(Clause.getBeginLoc(), diag::err_acc_num_gangs_num_args) - << /*NoArgs=*/0; - - unsigned MaxArgs = - (Clause.getDirectiveKind() == OpenACCDirectiveKind::Parallel || - Clause.getDirectiveKind() == OpenACCDirectiveKind::ParallelLoop) - ? 3 - : 1; - // The max number of args differs between parallel and other constructs. - // Again, allow us to continue for the purposes of future diagnostics. - if (Clause.getIntExprs().size() > MaxArgs) - SemaRef.Diag(Clause.getBeginLoc(), diag::err_acc_num_gangs_num_args) - << /*NoArgs=*/1 << Clause.getDirectiveKind() << MaxArgs - << Clause.getIntExprs().size(); - - // OpenACC 3.3 Section 2.9.11: A reduction clause may not appear on a loop - // directive that has a gang clause and is within a compute construct that has - // a num_gangs clause with more than one explicit argument. - if (Clause.getIntExprs().size() > 1 && - isOpenACCCombinedDirectiveKind(Clause.getDirectiveKind())) { - auto *GangClauseItr = - llvm::find_if(ExistingClauses, llvm::IsaPred); - auto *ReductionClauseItr = - llvm::find_if(ExistingClauses, llvm::IsaPred); - - if (GangClauseItr != ExistingClauses.end() && - ReductionClauseItr != ExistingClauses.end()) { - SemaRef.Diag(Clause.getBeginLoc(), - diag::err_acc_gang_reduction_numgangs_conflict) - << OpenACCClauseKind::Reduction << OpenACCClauseKind::Gang - << Clause.getDirectiveKind() << /*is on combined directive=*/1; - SemaRef.Diag((*ReductionClauseItr)->getBeginLoc(), - diag::note_acc_previous_clause_here); - SemaRef.Diag((*GangClauseItr)->getBeginLoc(), - diag::note_acc_previous_clause_here); - return nullptr; - } - } - - // OpenACC 3.3 Section 2.5.4: - // A reduction clause may not appear on a parallel construct with a - // num_gangs clause that has more than one argument. 
- if ((Clause.getDirectiveKind() == OpenACCDirectiveKind::Parallel || - Clause.getDirectiveKind() == OpenACCDirectiveKind::ParallelLoop) && - Clause.getIntExprs().size() > 1) { - auto *Parallel = - llvm::find_if(ExistingClauses, llvm::IsaPred); - - if (Parallel != ExistingClauses.end()) { - SemaRef.Diag(Clause.getBeginLoc(), - diag::err_acc_reduction_num_gangs_conflict) - << /*>1 arg in first loc=*/1 << Clause.getClauseKind() - << Clause.getDirectiveKind() << OpenACCClauseKind::Reduction; - SemaRef.Diag((*Parallel)->getBeginLoc(), - diag::note_acc_previous_clause_here); - return nullptr; - } - } - - // OpenACC 3.3 Section 2.9.2: - // An argument with no keyword or with the 'num' keyword is allowed only when - // the 'num_gangs' does not appear on the 'kernel' construct. - if (Clause.getDirectiveKind() == OpenACCDirectiveKind::KernelsLoop) { - auto GangClauses = llvm::make_filter_range( - ExistingClauses, llvm::IsaPred); - - for (auto *GC : GangClauses) { - if (cast(GC)->hasExprOfKind(OpenACCGangKind::Num)) { - SemaRef.Diag(Clause.getBeginLoc(), - diag::err_acc_num_arg_conflict_reverse) - << OpenACCClauseKind::NumGangs << OpenACCClauseKind::Gang - << /*Num argument*/ 1; - SemaRef.Diag(GC->getBeginLoc(), diag::note_acc_previous_clause_here); - return nullptr; - } - } - } - - return OpenACCNumGangsClause::Create( - Ctx, Clause.getBeginLoc(), Clause.getLParenLoc(), Clause.getIntExprs(), - Clause.getEndLoc()); -} - -OpenACCClause *SemaOpenACCClauseVisitor::VisitNumWorkersClause( - SemaOpenACC::OpenACCParsedClause &Clause) { - // There is no prose in the standard that says duplicates aren't allowed, - // but this diagnostic is present in other compilers, as well as makes - // sense. - if (checkAlreadyHasClauseOfKind(SemaRef, ExistingClauses, Clause)) - return nullptr; - - // OpenACC 3.3 Section 2.9.2: - // An argument is allowed only when the 'num_workers' does not appear on the - // kernels construct. 
- if (Clause.getDirectiveKind() == OpenACCDirectiveKind::KernelsLoop) { - auto WorkerClauses = llvm::make_filter_range( - ExistingClauses, llvm::IsaPred); - - for (auto *WC : WorkerClauses) { - if (cast(WC)->hasIntExpr()) { - SemaRef.Diag(Clause.getBeginLoc(), - diag::err_acc_num_arg_conflict_reverse) - << OpenACCClauseKind::NumWorkers << OpenACCClauseKind::Worker - << /*num argument*/ 0; - SemaRef.Diag(WC->getBeginLoc(), diag::note_acc_previous_clause_here); - return nullptr; - } - } - } - - assert(Clause.getIntExprs().size() == 1 && - "Invalid number of expressions for NumWorkers"); - return OpenACCNumWorkersClause::Create( - Ctx, Clause.getBeginLoc(), Clause.getLParenLoc(), Clause.getIntExprs()[0], - Clause.getEndLoc()); -} - -OpenACCClause *SemaOpenACCClauseVisitor::VisitVectorLengthClause( - SemaOpenACC::OpenACCParsedClause &Clause) { - // There is no prose in the standard that says duplicates aren't allowed, - // but this diagnostic is present in other compilers, as well as makes - // sense. - if (checkAlreadyHasClauseOfKind(SemaRef, ExistingClauses, Clause)) - return nullptr; - - // OpenACC 3.3 Section 2.9.4: - // An argument is allowed only when the 'vector_length' does not appear on the - // 'kernels' construct. 
- if (Clause.getDirectiveKind() == OpenACCDirectiveKind::KernelsLoop) { - auto VectorClauses = llvm::make_filter_range( - ExistingClauses, llvm::IsaPred); - - for (auto *VC : VectorClauses) { - if (cast(VC)->hasIntExpr()) { - SemaRef.Diag(Clause.getBeginLoc(), - diag::err_acc_num_arg_conflict_reverse) - << OpenACCClauseKind::VectorLength << OpenACCClauseKind::Vector - << /*num argument*/ 0; - SemaRef.Diag(VC->getBeginLoc(), diag::note_acc_previous_clause_here); - return nullptr; - } - } - } - - assert(Clause.getIntExprs().size() == 1 && - "Invalid number of expressions for NumWorkers"); - return OpenACCVectorLengthClause::Create( - Ctx, Clause.getBeginLoc(), Clause.getLParenLoc(), Clause.getIntExprs()[0], - Clause.getEndLoc()); -} - -OpenACCClause *SemaOpenACCClauseVisitor::VisitAsyncClause( - SemaOpenACC::OpenACCParsedClause &Clause) { - // There is no prose in the standard that says duplicates aren't allowed, - // but this diagnostic is present in other compilers, as well as makes - // sense. - if (checkAlreadyHasClauseOfKind(SemaRef, ExistingClauses, Clause)) - return nullptr; - - assert(Clause.getNumIntExprs() < 2 && - "Invalid number of expressions for Async"); - return OpenACCAsyncClause::Create( - Ctx, Clause.getBeginLoc(), Clause.getLParenLoc(), - Clause.getNumIntExprs() != 0 ? Clause.getIntExprs()[0] : nullptr, - Clause.getEndLoc()); -} - -OpenACCClause *SemaOpenACCClauseVisitor::VisitDeviceNumClause( - SemaOpenACC::OpenACCParsedClause &Clause) { - // Restrictions only properly implemented on certain constructs, so skip/treat - // as unimplemented in those cases. - if (!isDirectiveKindImplemented(Clause.getDirectiveKind())) - return isNotImplemented(); - - // OpenACC 3.3 2.14.3: Two instances of the same clause may not appear on the - // same directive. 
- if (Clause.getDirectiveKind() == OpenACCDirectiveKind::Set && - checkAlreadyHasClauseOfKind(SemaRef, ExistingClauses, Clause)) - return nullptr; - - assert(Clause.getNumIntExprs() == 1 && - "Invalid number of expressions for device_num"); - return OpenACCDeviceNumClause::Create( - Ctx, Clause.getBeginLoc(), Clause.getLParenLoc(), Clause.getIntExprs()[0], - Clause.getEndLoc()); -} - -OpenACCClause *SemaOpenACCClauseVisitor::VisitDefaultAsyncClause( - SemaOpenACC::OpenACCParsedClause &Clause) { - // OpenACC 3.3 2.14.3: Two instances of the same clause may not appear on the - // same directive. - if (checkAlreadyHasClauseOfKind(SemaRef, ExistingClauses, Clause)) - return nullptr; - - assert(Clause.getNumIntExprs() == 1 && - "Invalid number of expressions for default_async"); - return OpenACCDefaultAsyncClause::Create( - Ctx, Clause.getBeginLoc(), Clause.getLParenLoc(), Clause.getIntExprs()[0], - Clause.getEndLoc()); -} - -OpenACCClause *SemaOpenACCClauseVisitor::VisitPrivateClause( - SemaOpenACC::OpenACCParsedClause &Clause) { - // ActOnVar ensured that everything is a valid variable reference, so there - // really isn't anything to do here. GCC does some duplicate-finding, though - // it isn't apparent in the standard where this is justified. - - return OpenACCPrivateClause::Create(Ctx, Clause.getBeginLoc(), - Clause.getLParenLoc(), - Clause.getVarList(), Clause.getEndLoc()); -} - -OpenACCClause *SemaOpenACCClauseVisitor::VisitFirstPrivateClause( - SemaOpenACC::OpenACCParsedClause &Clause) { - // ActOnVar ensured that everything is a valid variable reference, so there - // really isn't anything to do here. GCC does some duplicate-finding, though - // it isn't apparent in the standard where this is justified. 
- - return OpenACCFirstPrivateClause::Create( - Ctx, Clause.getBeginLoc(), Clause.getLParenLoc(), Clause.getVarList(), - Clause.getEndLoc()); -} - -OpenACCClause *SemaOpenACCClauseVisitor::VisitNoCreateClause( - SemaOpenACC::OpenACCParsedClause &Clause) { - // ActOnVar ensured that everything is a valid variable reference, so there - // really isn't anything to do here. GCC does some duplicate-finding, though - // it isn't apparent in the standard where this is justified. - - return OpenACCNoCreateClause::Create(Ctx, Clause.getBeginLoc(), - Clause.getLParenLoc(), - Clause.getVarList(), Clause.getEndLoc()); -} - -OpenACCClause *SemaOpenACCClauseVisitor::VisitPresentClause( - SemaOpenACC::OpenACCParsedClause &Clause) { - // Restrictions only properly implemented on 'compute'/'combined'/'data' - // constructs, and 'compute'/'combined'/'data' constructs are the only - // construct that can do anything with this yet, so skip/treat as - // unimplemented in this case. - if (!isDirectiveKindImplemented(Clause.getDirectiveKind())) - return isNotImplemented(); - // ActOnVar ensured that everything is a valid variable reference, so there - // really isn't anything to do here. GCC does some duplicate-finding, though - // it isn't apparent in the standard where this is justified. - - return OpenACCPresentClause::Create(Ctx, Clause.getBeginLoc(), - Clause.getLParenLoc(), - Clause.getVarList(), Clause.getEndLoc()); -} - -OpenACCClause *SemaOpenACCClauseVisitor::VisitHostClause( - SemaOpenACC::OpenACCParsedClause &Clause) { - // ActOnVar ensured that everything is a valid variable reference, so there - // really isn't anything to do here. GCC does some duplicate-finding, though - // it isn't apparent in the standard where this is justified. 
- - return OpenACCHostClause::Create(Ctx, Clause.getBeginLoc(), - Clause.getLParenLoc(), Clause.getVarList(), - Clause.getEndLoc()); -} - -OpenACCClause *SemaOpenACCClauseVisitor::VisitDeviceClause( - SemaOpenACC::OpenACCParsedClause &Clause) { - // ActOnVar ensured that everything is a valid variable reference, so there - // really isn't anything to do here. GCC does some duplicate-finding, though - // it isn't apparent in the standard where this is justified. - - return OpenACCDeviceClause::Create(Ctx, Clause.getBeginLoc(), - Clause.getLParenLoc(), Clause.getVarList(), - Clause.getEndLoc()); -} - -OpenACCClause *SemaOpenACCClauseVisitor::VisitCopyClause( - SemaOpenACC::OpenACCParsedClause &Clause) { - // Restrictions only properly implemented on 'compute'/'combined'/'data' - // constructs, and 'compute'/'combined'/'data' constructs are the only - // construct that can do anything with this yet, so skip/treat as - // unimplemented in this case. - if (!isDirectiveKindImplemented(Clause.getDirectiveKind())) - return isNotImplemented(); - // ActOnVar ensured that everything is a valid variable reference, so there - // really isn't anything to do here. GCC does some duplicate-finding, though - // it isn't apparent in the standard where this is justified. - - return OpenACCCopyClause::Create( - Ctx, Clause.getClauseKind(), Clause.getBeginLoc(), Clause.getLParenLoc(), - Clause.getVarList(), Clause.getEndLoc()); -} - -OpenACCClause *SemaOpenACCClauseVisitor::VisitCopyInClause( - SemaOpenACC::OpenACCParsedClause &Clause) { - // Restrictions only properly implemented on 'compute'/'combined'/'data' - // constructs, and 'compute'/'combined'/'data' constructs are the only - // construct that can do anything with this yet, so skip/treat as - // unimplemented in this case. - if (!isDirectiveKindImplemented(Clause.getDirectiveKind())) - return isNotImplemented(); - // ActOnVar ensured that everything is a valid variable reference, so there - // really isn't anything to do here. 
GCC does some duplicate-finding, though - // it isn't apparent in the standard where this is justified. - - return OpenACCCopyInClause::Create( - Ctx, Clause.getClauseKind(), Clause.getBeginLoc(), Clause.getLParenLoc(), - Clause.isReadOnly(), Clause.getVarList(), Clause.getEndLoc()); -} - -OpenACCClause *SemaOpenACCClauseVisitor::VisitCopyOutClause( - SemaOpenACC::OpenACCParsedClause &Clause) { - // Restrictions only properly implemented on 'compute'/'combined'/'data' - // constructs, and 'compute'/'combined'/'data' constructs are the only - // construct that can do anything with this yet, so skip/treat as - // unimplemented in this case. - if (!isDirectiveKindImplemented(Clause.getDirectiveKind())) - return isNotImplemented(); - // ActOnVar ensured that everything is a valid variable reference, so there - // really isn't anything to do here. GCC does some duplicate-finding, though - // it isn't apparent in the standard where this is justified. - - return OpenACCCopyOutClause::Create( - Ctx, Clause.getClauseKind(), Clause.getBeginLoc(), Clause.getLParenLoc(), - Clause.isZero(), Clause.getVarList(), Clause.getEndLoc()); -} - -OpenACCClause *SemaOpenACCClauseVisitor::VisitCreateClause( - SemaOpenACC::OpenACCParsedClause &Clause) { - // ActOnVar ensured that everything is a valid variable reference, so there - // really isn't anything to do here. GCC does some duplicate-finding, though - // it isn't apparent in the standard where this is justified. - - return OpenACCCreateClause::Create( - Ctx, Clause.getClauseKind(), Clause.getBeginLoc(), Clause.getLParenLoc(), - Clause.isZero(), Clause.getVarList(), Clause.getEndLoc()); -} - -OpenACCClause *SemaOpenACCClauseVisitor::VisitAttachClause( - SemaOpenACC::OpenACCParsedClause &Clause) { - // ActOnVar ensured that everything is a valid variable reference, but we - // still have to make sure it is a pointer type. 
- llvm::SmallVector VarList{Clause.getVarList()}; - llvm::erase_if(VarList, [&](Expr *E) { - return SemaRef.CheckVarIsPointerType(OpenACCClauseKind::Attach, E); - }); - Clause.setVarListDetails(VarList, - /*IsReadOnly=*/false, /*IsZero=*/false); - return OpenACCAttachClause::Create(Ctx, Clause.getBeginLoc(), - Clause.getLParenLoc(), Clause.getVarList(), - Clause.getEndLoc()); -} - -OpenACCClause *SemaOpenACCClauseVisitor::VisitDetachClause( - SemaOpenACC::OpenACCParsedClause &Clause) { - // ActOnVar ensured that everything is a valid variable reference, but we - // still have to make sure it is a pointer type. - llvm::SmallVector VarList{Clause.getVarList()}; - llvm::erase_if(VarList, [&](Expr *E) { - return SemaRef.CheckVarIsPointerType(OpenACCClauseKind::Detach, E); - }); - Clause.setVarListDetails(VarList, - /*IsReadOnly=*/false, /*IsZero=*/false); - return OpenACCDetachClause::Create(Ctx, Clause.getBeginLoc(), - Clause.getLParenLoc(), Clause.getVarList(), - Clause.getEndLoc()); -} - -OpenACCClause *SemaOpenACCClauseVisitor::VisitDeleteClause( - SemaOpenACC::OpenACCParsedClause &Clause) { - // ActOnVar ensured that everything is a valid variable reference, so there - // really isn't anything to do here. GCC does some duplicate-finding, though - // it isn't apparent in the standard where this is justified. - return OpenACCDeleteClause::Create(Ctx, Clause.getBeginLoc(), - Clause.getLParenLoc(), Clause.getVarList(), - Clause.getEndLoc()); -} - -OpenACCClause *SemaOpenACCClauseVisitor::VisitUseDeviceClause( - SemaOpenACC::OpenACCParsedClause &Clause) { - // ActOnVar ensured that everything is a valid variable or array, so nothing - // left to do here. 
- return OpenACCUseDeviceClause::Create( - Ctx, Clause.getBeginLoc(), Clause.getLParenLoc(), Clause.getVarList(), - Clause.getEndLoc()); -} - -OpenACCClause *SemaOpenACCClauseVisitor::VisitDevicePtrClause( - SemaOpenACC::OpenACCParsedClause &Clause) { - // Restrictions only properly implemented on 'compute'/'combined'/'data' - // constructs, and 'compute'/'combined'/'data' constructs are the only - // construct that can do anything with this yet, so skip/treat as - // unimplemented in this case. - if (!isDirectiveKindImplemented(Clause.getDirectiveKind())) - return isNotImplemented(); - - // ActOnVar ensured that everything is a valid variable reference, but we - // still have to make sure it is a pointer type. - llvm::SmallVector VarList{Clause.getVarList()}; - llvm::erase_if(VarList, [&](Expr *E) { - return SemaRef.CheckVarIsPointerType(OpenACCClauseKind::DevicePtr, E); - }); - Clause.setVarListDetails(VarList, - /*IsReadOnly=*/false, /*IsZero=*/false); - - return OpenACCDevicePtrClause::Create( - Ctx, Clause.getBeginLoc(), Clause.getLParenLoc(), Clause.getVarList(), - Clause.getEndLoc()); -} - -OpenACCClause *SemaOpenACCClauseVisitor::VisitWaitClause( - SemaOpenACC::OpenACCParsedClause &Clause) { - return OpenACCWaitClause::Create( - Ctx, Clause.getBeginLoc(), Clause.getLParenLoc(), Clause.getDevNumExpr(), - Clause.getQueuesLoc(), Clause.getQueueIdExprs(), Clause.getEndLoc()); -} - -OpenACCClause *SemaOpenACCClauseVisitor::VisitDeviceTypeClause( - SemaOpenACC::OpenACCParsedClause &Clause) { - // Restrictions implemented properly on everything except 'routine'. - if (Clause.getDirectiveKind() == OpenACCDirectiveKind::Routine) - return isNotImplemented(); - - // OpenACC 3.3 2.14.3: Two instances of the same clause may not appear on the - // same directive. 
- if (Clause.getDirectiveKind() == OpenACCDirectiveKind::Set && - checkAlreadyHasClauseOfKind(SemaRef, ExistingClauses, Clause)) - return nullptr; - - // TODO OpenACC: Once we get enough of the CodeGen implemented that we have - // a source for the list of valid architectures, we need to warn on unknown - // identifiers here. - - return OpenACCDeviceTypeClause::Create( - Ctx, Clause.getClauseKind(), Clause.getBeginLoc(), Clause.getLParenLoc(), - Clause.getDeviceTypeArchitectures(), Clause.getEndLoc()); -} - -OpenACCClause *SemaOpenACCClauseVisitor::VisitAutoClause( - SemaOpenACC::OpenACCParsedClause &Clause) { - // OpenACC 3.3 2.9: - // Only one of the seq, independent, and auto clauses may appear. - const auto *Itr = - llvm::find_if(ExistingClauses, - llvm::IsaPred); - if (Itr != ExistingClauses.end()) { - SemaRef.Diag(Clause.getBeginLoc(), diag::err_acc_loop_spec_conflict) - << Clause.getClauseKind() << Clause.getDirectiveKind(); - SemaRef.Diag((*Itr)->getBeginLoc(), diag::note_acc_previous_clause_here); - return nullptr; - } - - return OpenACCAutoClause::Create(Ctx, Clause.getBeginLoc(), - Clause.getEndLoc()); -} - -OpenACCClause *SemaOpenACCClauseVisitor::VisitIndependentClause( - SemaOpenACC::OpenACCParsedClause &Clause) { - // OpenACC 3.3 2.9: - // Only one of the seq, independent, and auto clauses may appear. 
- const auto *Itr = llvm::find_if( - ExistingClauses, llvm::IsaPred); - if (Itr != ExistingClauses.end()) { - SemaRef.Diag(Clause.getBeginLoc(), diag::err_acc_loop_spec_conflict) - << Clause.getClauseKind() << Clause.getDirectiveKind(); - SemaRef.Diag((*Itr)->getBeginLoc(), diag::note_acc_previous_clause_here); - return nullptr; - } - - return OpenACCIndependentClause::Create(Ctx, Clause.getBeginLoc(), - Clause.getEndLoc()); -} - -ExprResult CheckGangStaticExpr(SemaOpenACC &S, Expr *E) { - if (isa(E)) - return E; - return S.ActOnIntExpr(OpenACCDirectiveKind::Invalid, OpenACCClauseKind::Gang, - E->getBeginLoc(), E); -} - -bool IsOrphanLoop(OpenACCDirectiveKind DK, OpenACCDirectiveKind AssocKind) { - return DK == OpenACCDirectiveKind::Loop && - AssocKind == OpenACCDirectiveKind::Invalid; -} - -bool HasAssocKind(OpenACCDirectiveKind DK, OpenACCDirectiveKind AssocKind) { - return DK == OpenACCDirectiveKind::Loop && - AssocKind != OpenACCDirectiveKind::Invalid; -} - -ExprResult DiagIntArgInvalid(SemaOpenACC &S, Expr *E, OpenACCGangKind GK, - OpenACCClauseKind CK, OpenACCDirectiveKind DK, - OpenACCDirectiveKind AssocKind) { - S.Diag(E->getBeginLoc(), diag::err_acc_int_arg_invalid) - << GK << CK << IsOrphanLoop(DK, AssocKind) << DK - << HasAssocKind(DK, AssocKind) << AssocKind; - return ExprError(); -} -ExprResult DiagIntArgInvalid(SemaOpenACC &S, Expr *E, StringRef TagKind, - OpenACCClauseKind CK, OpenACCDirectiveKind DK, - OpenACCDirectiveKind AssocKind) { - S.Diag(E->getBeginLoc(), diag::err_acc_int_arg_invalid) - << TagKind << CK << IsOrphanLoop(DK, AssocKind) << DK - << HasAssocKind(DK, AssocKind) << AssocKind; - return ExprError(); -} - -ExprResult CheckGangParallelExpr(SemaOpenACC &S, OpenACCDirectiveKind DK, - OpenACCDirectiveKind AssocKind, - OpenACCGangKind GK, Expr *E) { - switch (GK) { - case OpenACCGangKind::Static: - return CheckGangStaticExpr(S, E); - case OpenACCGangKind::Num: - // OpenACC 3.3 2.9.2: When the parent compute construct is a parallel - // 
construct, or an orphaned loop construct, the gang clause behaves as - // follows. ... The num argument is not allowed. - return DiagIntArgInvalid(S, E, GK, OpenACCClauseKind::Gang, DK, AssocKind); - case OpenACCGangKind::Dim: { - // OpenACC 3.3 2.9.2: When the parent compute construct is a parallel - // construct, or an orphaned loop construct, the gang clause behaves as - // follows. ... The dim argument must be a constant positive integer value - // 1, 2, or 3. - if (!E) - return ExprError(); - ExprResult Res = - S.ActOnIntExpr(OpenACCDirectiveKind::Invalid, OpenACCClauseKind::Gang, - E->getBeginLoc(), E); - - if (!Res.isUsable()) - return Res; - - if (Res.get()->isInstantiationDependent()) - return Res; - - std::optional ICE = - Res.get()->getIntegerConstantExpr(S.getASTContext()); - - if (!ICE || *ICE <= 0 || ICE > 3) { - S.Diag(Res.get()->getBeginLoc(), diag::err_acc_gang_dim_value) - << ICE.has_value() << ICE.value_or(llvm::APSInt{}).getExtValue(); - return ExprError(); - } - - return ExprResult{ - ConstantExpr::Create(S.getASTContext(), Res.get(), APValue{*ICE})}; - } - } - llvm_unreachable("Unknown gang kind in gang parallel check"); -} - -ExprResult CheckGangKernelsExpr(SemaOpenACC &S, - ArrayRef ExistingClauses, - OpenACCDirectiveKind DK, - OpenACCDirectiveKind AssocKind, - OpenACCGangKind GK, Expr *E) { - switch (GK) { - // OpenACC 3.3 2.9.2: When the parent compute construct is a kernels - // construct, the gang clause behaves as follows. ... The dim argument is - // not allowed. - case OpenACCGangKind::Dim: - return DiagIntArgInvalid(S, E, GK, OpenACCClauseKind::Gang, DK, AssocKind); - case OpenACCGangKind::Num: { - // OpenACC 3.3 2.9.2: When the parent compute construct is a kernels - // construct, the gang clause behaves as follows. ... An argument with no - // keyword or with num keyword is only allowed when num_gangs does not - // appear on the kernels construct. ... 
The region of a loop with the gang - // clause may not contain another loop with a gang clause unless within a - // nested compute region. - - // If this is a 'combined' construct, search the list of existing clauses. - // Else we need to search the containing 'kernel'. - auto Collection = isOpenACCCombinedDirectiveKind(DK) - ? ExistingClauses - : S.getActiveComputeConstructInfo().Clauses; - - const auto *Itr = - llvm::find_if(Collection, llvm::IsaPred); - - if (Itr != Collection.end()) { - S.Diag(E->getBeginLoc(), diag::err_acc_num_arg_conflict) - << "num" << OpenACCClauseKind::Gang << DK - << HasAssocKind(DK, AssocKind) << AssocKind - << OpenACCClauseKind::NumGangs; - - S.Diag((*Itr)->getBeginLoc(), diag::note_acc_previous_clause_here); - return ExprError(); - } - return ExprResult{E}; - } - case OpenACCGangKind::Static: - return CheckGangStaticExpr(S, E); - return ExprError(); - } - llvm_unreachable("Unknown gang kind in gang kernels check"); -} - -ExprResult CheckGangSerialExpr(SemaOpenACC &S, OpenACCDirectiveKind DK, - OpenACCDirectiveKind AssocKind, - OpenACCGangKind GK, Expr *E) { - switch (GK) { - // 'dim' and 'num' don't really make sense on serial, and GCC rejects them - // too, so we disallow them too. - case OpenACCGangKind::Dim: - case OpenACCGangKind::Num: - return DiagIntArgInvalid(S, E, GK, OpenACCClauseKind::Gang, DK, AssocKind); - case OpenACCGangKind::Static: - return CheckGangStaticExpr(S, E); - } - llvm_unreachable("Unknown gang kind in gang serial check"); -} - -OpenACCClause *SemaOpenACCClauseVisitor::VisitVectorClause( - SemaOpenACC::OpenACCParsedClause &Clause) { - if (DiagIfSeqClause(Clause)) - return nullptr; - - // Restrictions only properly implemented on 'loop'/'combined' constructs, and - // it is the only construct that can do anything with this, so skip/treat as - // unimplemented for the routine constructs. 
- if (!isDirectiveKindImplemented(Clause.getDirectiveKind())) - return isNotImplemented(); - - Expr *IntExpr = - Clause.getNumIntExprs() != 0 ? Clause.getIntExprs()[0] : nullptr; - if (IntExpr) { - if (!isOpenACCCombinedDirectiveKind(Clause.getDirectiveKind())) { - switch (SemaRef.getActiveComputeConstructInfo().Kind) { - case OpenACCDirectiveKind::Invalid: - case OpenACCDirectiveKind::Parallel: - // No restriction on when 'parallel' can contain an argument. - break; - case OpenACCDirectiveKind::Serial: - // GCC disallows this, and there is no real good reason for us to permit - // it, so disallow until we come up with a use case that makes sense. - DiagIntArgInvalid(SemaRef, IntExpr, "length", OpenACCClauseKind::Vector, - Clause.getDirectiveKind(), - SemaRef.getActiveComputeConstructInfo().Kind); - IntExpr = nullptr; - break; - case OpenACCDirectiveKind::Kernels: { - const auto *Itr = - llvm::find_if(SemaRef.getActiveComputeConstructInfo().Clauses, - llvm::IsaPred); - if (Itr != SemaRef.getActiveComputeConstructInfo().Clauses.end()) { - SemaRef.Diag(IntExpr->getBeginLoc(), diag::err_acc_num_arg_conflict) - << "length" << OpenACCClauseKind::Vector - << Clause.getDirectiveKind() - << HasAssocKind(Clause.getDirectiveKind(), - SemaRef.getActiveComputeConstructInfo().Kind) - << SemaRef.getActiveComputeConstructInfo().Kind - << OpenACCClauseKind::VectorLength; - SemaRef.Diag((*Itr)->getBeginLoc(), - diag::note_acc_previous_clause_here); - - IntExpr = nullptr; - } - break; - } - default: - llvm_unreachable("Non compute construct in active compute construct"); - } - } else { - if (Clause.getDirectiveKind() == OpenACCDirectiveKind::SerialLoop) { - DiagIntArgInvalid(SemaRef, IntExpr, "length", OpenACCClauseKind::Vector, - Clause.getDirectiveKind(), - SemaRef.getActiveComputeConstructInfo().Kind); - IntExpr = nullptr; - } else if (Clause.getDirectiveKind() == - OpenACCDirectiveKind::KernelsLoop) { - const auto *Itr = llvm::find_if( - ExistingClauses, llvm::IsaPred); - if 
(Itr != ExistingClauses.end()) { - SemaRef.Diag(IntExpr->getBeginLoc(), diag::err_acc_num_arg_conflict) - << "length" << OpenACCClauseKind::Vector - << Clause.getDirectiveKind() - << HasAssocKind(Clause.getDirectiveKind(), - SemaRef.getActiveComputeConstructInfo().Kind) - << SemaRef.getActiveComputeConstructInfo().Kind - << OpenACCClauseKind::VectorLength; - SemaRef.Diag((*Itr)->getBeginLoc(), - diag::note_acc_previous_clause_here); - - IntExpr = nullptr; - } - } - } - } - - if (!isOpenACCCombinedDirectiveKind(Clause.getDirectiveKind())) { - // OpenACC 3.3 2.9.4: The region of a loop with a 'vector' clause may not - // contain a loop with a gang, worker, or vector clause unless within a - // nested compute region. - if (SemaRef.LoopVectorClauseLoc.isValid()) { - // This handles the 'inner loop' diagnostic, but we cannot set that we're - // on one of these until we get to the end of the construct. - SemaRef.Diag(Clause.getBeginLoc(), diag::err_acc_clause_in_clause_region) - << OpenACCClauseKind::Vector << OpenACCClauseKind::Vector - << /*skip kernels construct info*/ 0; - SemaRef.Diag(SemaRef.LoopVectorClauseLoc, - diag::note_acc_previous_clause_here); - return nullptr; - } - } - - return OpenACCVectorClause::Create(Ctx, Clause.getBeginLoc(), - Clause.getLParenLoc(), IntExpr, - Clause.getEndLoc()); -} - -OpenACCClause *SemaOpenACCClauseVisitor::VisitWorkerClause( - SemaOpenACC::OpenACCParsedClause &Clause) { - if (DiagIfSeqClause(Clause)) - return nullptr; - - // Restrictions only properly implemented on 'loop'/'combined' constructs, and - // it is the only construct that can do anything with this, so skip/treat as - // unimplemented for the routine constructs. - if (!isDirectiveKindImplemented(Clause.getDirectiveKind())) - return isNotImplemented(); - - Expr *IntExpr = - Clause.getNumIntExprs() != 0 ? 
Clause.getIntExprs()[0] : nullptr; - - if (IntExpr) { - if (!isOpenACCCombinedDirectiveKind(Clause.getDirectiveKind())) { - switch (SemaRef.getActiveComputeConstructInfo().Kind) { - case OpenACCDirectiveKind::Invalid: - case OpenACCDirectiveKind::ParallelLoop: - case OpenACCDirectiveKind::SerialLoop: - case OpenACCDirectiveKind::Parallel: - case OpenACCDirectiveKind::Serial: - DiagIntArgInvalid(SemaRef, IntExpr, OpenACCGangKind::Num, - OpenACCClauseKind::Worker, Clause.getDirectiveKind(), - SemaRef.getActiveComputeConstructInfo().Kind); - IntExpr = nullptr; - break; - case OpenACCDirectiveKind::KernelsLoop: - case OpenACCDirectiveKind::Kernels: { - const auto *Itr = - llvm::find_if(SemaRef.getActiveComputeConstructInfo().Clauses, - llvm::IsaPred); - if (Itr != SemaRef.getActiveComputeConstructInfo().Clauses.end()) { - SemaRef.Diag(IntExpr->getBeginLoc(), diag::err_acc_num_arg_conflict) - << "num" << OpenACCClauseKind::Worker << Clause.getDirectiveKind() - << HasAssocKind(Clause.getDirectiveKind(), - SemaRef.getActiveComputeConstructInfo().Kind) - << SemaRef.getActiveComputeConstructInfo().Kind - << OpenACCClauseKind::NumWorkers; - SemaRef.Diag((*Itr)->getBeginLoc(), - diag::note_acc_previous_clause_here); - - IntExpr = nullptr; - } - break; - } - default: - llvm_unreachable("Non compute construct in active compute construct"); - } - } else { - if (Clause.getDirectiveKind() == OpenACCDirectiveKind::ParallelLoop || - Clause.getDirectiveKind() == OpenACCDirectiveKind::SerialLoop) { - DiagIntArgInvalid(SemaRef, IntExpr, OpenACCGangKind::Num, - OpenACCClauseKind::Worker, Clause.getDirectiveKind(), - SemaRef.getActiveComputeConstructInfo().Kind); - IntExpr = nullptr; - } else { - assert(Clause.getDirectiveKind() == OpenACCDirectiveKind::KernelsLoop && - "Unknown combined directive kind?"); - const auto *Itr = llvm::find_if(ExistingClauses, - llvm::IsaPred); - if (Itr != ExistingClauses.end()) { - SemaRef.Diag(IntExpr->getBeginLoc(), diag::err_acc_num_arg_conflict) - << 
"num" << OpenACCClauseKind::Worker << Clause.getDirectiveKind() - << HasAssocKind(Clause.getDirectiveKind(), - SemaRef.getActiveComputeConstructInfo().Kind) - << SemaRef.getActiveComputeConstructInfo().Kind - << OpenACCClauseKind::NumWorkers; - SemaRef.Diag((*Itr)->getBeginLoc(), - diag::note_acc_previous_clause_here); - - IntExpr = nullptr; - } - } - } - } - - if (!isOpenACCCombinedDirectiveKind(Clause.getDirectiveKind())) { - // OpenACC 3.3 2.9.3: The region of a loop with a 'worker' clause may not - // contain a loop with a gang or worker clause unless within a nested - // compute region. - if (SemaRef.LoopWorkerClauseLoc.isValid()) { - // This handles the 'inner loop' diagnostic, but we cannot set that we're - // on one of these until we get to the end of the construct. - SemaRef.Diag(Clause.getBeginLoc(), diag::err_acc_clause_in_clause_region) - << OpenACCClauseKind::Worker << OpenACCClauseKind::Worker - << /*skip kernels construct info*/ 0; - SemaRef.Diag(SemaRef.LoopWorkerClauseLoc, - diag::note_acc_previous_clause_here); - return nullptr; - } - - // OpenACC 3.3 2.9.4: The region of a loop with a 'vector' clause may not - // contain a loop with a gang, worker, or vector clause unless within a - // nested compute region. - if (SemaRef.LoopVectorClauseLoc.isValid()) { - // This handles the 'inner loop' diagnostic, but we cannot set that we're - // on one of these until we get to the end of the construct. 
- SemaRef.Diag(Clause.getBeginLoc(), diag::err_acc_clause_in_clause_region) - << OpenACCClauseKind::Worker << OpenACCClauseKind::Vector - << /*skip kernels construct info*/ 0; - SemaRef.Diag(SemaRef.LoopVectorClauseLoc, - diag::note_acc_previous_clause_here); - return nullptr; - } - } - - return OpenACCWorkerClause::Create(Ctx, Clause.getBeginLoc(), - Clause.getLParenLoc(), IntExpr, - Clause.getEndLoc()); -} - -OpenACCClause *SemaOpenACCClauseVisitor::VisitGangClause( - SemaOpenACC::OpenACCParsedClause &Clause) { - if (DiagIfSeqClause(Clause)) - return nullptr; - - // Restrictions only properly implemented on 'loop' constructs, and it is - // the only construct that can do anything with this, so skip/treat as - // unimplemented for the combined constructs. - if (!isDirectiveKindImplemented(Clause.getDirectiveKind())) - return isNotImplemented(); - - // OpenACC 3.3 Section 2.9.11: A reduction clause may not appear on a loop - // directive that has a gang clause and is within a compute construct that has - // a num_gangs clause with more than one explicit argument. - if ((Clause.getDirectiveKind() == OpenACCDirectiveKind::Loop && - SemaRef.getActiveComputeConstructInfo().Kind != - OpenACCDirectiveKind::Invalid) || - isOpenACCCombinedDirectiveKind(Clause.getDirectiveKind())) { - // num_gangs clause on the active compute construct. - auto ActiveComputeConstructContainer = - isOpenACCCombinedDirectiveKind(Clause.getDirectiveKind()) - ? 
ExistingClauses - : SemaRef.getActiveComputeConstructInfo().Clauses; - auto *NumGangsClauseItr = llvm::find_if( - ActiveComputeConstructContainer, llvm::IsaPred); - - if (NumGangsClauseItr != ActiveComputeConstructContainer.end() && - cast(*NumGangsClauseItr)->getIntExprs().size() > - 1) { - auto *ReductionClauseItr = - llvm::find_if(ExistingClauses, llvm::IsaPred); - - if (ReductionClauseItr != ExistingClauses.end()) { - SemaRef.Diag(Clause.getBeginLoc(), - diag::err_acc_gang_reduction_numgangs_conflict) - << OpenACCClauseKind::Gang << OpenACCClauseKind::Reduction - << Clause.getDirectiveKind() - << isOpenACCCombinedDirectiveKind(Clause.getDirectiveKind()); - SemaRef.Diag((*ReductionClauseItr)->getBeginLoc(), - diag::note_acc_previous_clause_here); - SemaRef.Diag((*NumGangsClauseItr)->getBeginLoc(), - diag::note_acc_previous_clause_here); - return nullptr; - } - } - } - - llvm::SmallVector GangKinds; - llvm::SmallVector IntExprs; - - // Store the existing locations, so we can do duplicate checking. Index is - // the int-value of the OpenACCGangKind enum. - SourceLocation ExistingElemLoc[3]; - - for (unsigned I = 0; I < Clause.getIntExprs().size(); ++I) { - OpenACCGangKind GK = Clause.getGangKinds()[I]; - ExprResult ER = - SemaRef.CheckGangExpr(ExistingClauses, Clause.getDirectiveKind(), GK, - Clause.getIntExprs()[I]); - - if (!ER.isUsable()) - continue; - - // OpenACC 3.3 2.9: 'gang-arg-list' may have at most one num, one dim, and - // one static argument. 
- if (ExistingElemLoc[static_cast(GK)].isValid()) { - SemaRef.Diag(ER.get()->getBeginLoc(), diag::err_acc_gang_multiple_elt) - << static_cast(GK); - SemaRef.Diag(ExistingElemLoc[static_cast(GK)], - diag::note_acc_previous_expr_here); - continue; - } - - ExistingElemLoc[static_cast(GK)] = ER.get()->getBeginLoc(); - GangKinds.push_back(GK); - IntExprs.push_back(ER.get()); - } - - if (!isOpenACCCombinedDirectiveKind(Clause.getDirectiveKind())) { - // OpenACC 3.3 2.9.2: When the parent compute construct is a kernels - // construct, the gang clause behaves as follows. ... The region of a loop - // with a gang clause may not contain another loop with a gang clause unless - // within a nested compute region. - if (SemaRef.LoopGangClauseOnKernel.Loc.isValid()) { - // This handles the 'inner loop' diagnostic, but we cannot set that we're - // on one of these until we get to the end of the construct. - SemaRef.Diag(Clause.getBeginLoc(), diag::err_acc_clause_in_clause_region) - << OpenACCClauseKind::Gang << OpenACCClauseKind::Gang - << /*kernels construct info*/ 1 - << SemaRef.LoopGangClauseOnKernel.DirKind; - SemaRef.Diag(SemaRef.LoopGangClauseOnKernel.Loc, - diag::note_acc_previous_clause_here); - return nullptr; - } - - // OpenACC 3.3 2.9.3: The region of a loop with a 'worker' clause may not - // contain a loop with a gang or worker clause unless within a nested - // compute region. - if (SemaRef.LoopWorkerClauseLoc.isValid()) { - // This handles the 'inner loop' diagnostic, but we cannot set that we're - // on one of these until we get to the end of the construct. 
- SemaRef.Diag(Clause.getBeginLoc(), diag::err_acc_clause_in_clause_region) - << OpenACCClauseKind::Gang << OpenACCClauseKind::Worker - << /*!kernels construct info*/ 0; - SemaRef.Diag(SemaRef.LoopWorkerClauseLoc, - diag::note_acc_previous_clause_here); - return nullptr; - } - - // OpenACC 3.3 2.9.4: The region of a loop with a 'vector' clause may not - // contain a loop with a gang, worker, or vector clause unless within a - // nested compute region. - if (SemaRef.LoopVectorClauseLoc.isValid()) { - // This handles the 'inner loop' diagnostic, but we cannot set that we're - // on one of these until we get to the end of the construct. - SemaRef.Diag(Clause.getBeginLoc(), diag::err_acc_clause_in_clause_region) - << OpenACCClauseKind::Gang << OpenACCClauseKind::Vector - << /*!kernels construct info*/ 0; - SemaRef.Diag(SemaRef.LoopVectorClauseLoc, - diag::note_acc_previous_clause_here); - return nullptr; - } - } - - return SemaRef.CheckGangClause(Clause.getDirectiveKind(), ExistingClauses, - Clause.getBeginLoc(), Clause.getLParenLoc(), - GangKinds, IntExprs, Clause.getEndLoc()); -} - -OpenACCClause *SemaOpenACCClauseVisitor::VisitFinalizeClause( - SemaOpenACC::OpenACCParsedClause &Clause) { - // There isn't anything to do here, this is only valid on one construct, and - // has no associated rules. - return OpenACCFinalizeClause::Create(Ctx, Clause.getBeginLoc(), - Clause.getEndLoc()); -} - -OpenACCClause *SemaOpenACCClauseVisitor::VisitIfPresentClause( - SemaOpenACC::OpenACCParsedClause &Clause) { - // There isn't anything to do here, this is only valid on one construct, and - // has no associated rules. 
- return OpenACCIfPresentClause::Create(Ctx, Clause.getBeginLoc(), - Clause.getEndLoc()); -} - -OpenACCClause *SemaOpenACCClauseVisitor::VisitSeqClause( - SemaOpenACC::OpenACCParsedClause &Clause) { - // Restrictions only properly implemented on 'loop' constructs and combined , - // and it is the only construct that can do anything with this, so skip/treat - // as unimplemented for the routine constructs. - if (!isDirectiveKindImplemented(Clause.getDirectiveKind())) - return isNotImplemented(); - - // OpenACC 3.3 2.9: - // Only one of the seq, independent, and auto clauses may appear. - const auto *Itr = - llvm::find_if(ExistingClauses, - llvm::IsaPred); - if (Itr != ExistingClauses.end()) { - SemaRef.Diag(Clause.getBeginLoc(), diag::err_acc_loop_spec_conflict) - << Clause.getClauseKind() << Clause.getDirectiveKind(); - SemaRef.Diag((*Itr)->getBeginLoc(), diag::note_acc_previous_clause_here); - return nullptr; - } - - // OpenACC 3.3 2.9: - // A 'gang', 'worker', or 'vector' clause may not appear if a 'seq' clause - // appears. - Itr = llvm::find_if(ExistingClauses, - llvm::IsaPred); - - if (Itr != ExistingClauses.end()) { - SemaRef.Diag(Clause.getBeginLoc(), diag::err_acc_clause_cannot_combine) - << Clause.getClauseKind() << (*Itr)->getClauseKind() - << Clause.getDirectiveKind(); - SemaRef.Diag((*Itr)->getBeginLoc(), diag::note_acc_previous_clause_here); - return nullptr; - } - - return OpenACCSeqClause::Create(Ctx, Clause.getBeginLoc(), - Clause.getEndLoc()); -} - -OpenACCClause *SemaOpenACCClauseVisitor::VisitReductionClause( - SemaOpenACC::OpenACCParsedClause &Clause) { - // OpenACC 3.3 Section 2.9.11: A reduction clause may not appear on a loop - // directive that has a gang clause and is within a compute construct that has - // a num_gangs clause with more than one explicit argument. 
- if ((Clause.getDirectiveKind() == OpenACCDirectiveKind::Loop && - SemaRef.getActiveComputeConstructInfo().Kind != - OpenACCDirectiveKind::Invalid) || - isOpenACCCombinedDirectiveKind(Clause.getDirectiveKind())) { - // num_gangs clause on the active compute construct. - auto ActiveComputeConstructContainer = - isOpenACCCombinedDirectiveKind(Clause.getDirectiveKind()) - ? ExistingClauses - : SemaRef.getActiveComputeConstructInfo().Clauses; - auto *NumGangsClauseItr = llvm::find_if( - ActiveComputeConstructContainer, llvm::IsaPred); - - if (NumGangsClauseItr != ActiveComputeConstructContainer.end() && - cast(*NumGangsClauseItr)->getIntExprs().size() > - 1) { - auto *GangClauseItr = - llvm::find_if(ExistingClauses, llvm::IsaPred); - - if (GangClauseItr != ExistingClauses.end()) { - SemaRef.Diag(Clause.getBeginLoc(), - diag::err_acc_gang_reduction_numgangs_conflict) - << OpenACCClauseKind::Reduction << OpenACCClauseKind::Gang - << Clause.getDirectiveKind() - << isOpenACCCombinedDirectiveKind(Clause.getDirectiveKind()); - SemaRef.Diag((*GangClauseItr)->getBeginLoc(), - diag::note_acc_previous_clause_here); - SemaRef.Diag((*NumGangsClauseItr)->getBeginLoc(), - diag::note_acc_previous_clause_here); - return nullptr; - } - } - } - - // OpenACC3.3 Section 2.9.11: If a variable is involved in a reduction that - // spans multiple nested loops where two or more of those loops have - // associated loop directives, a reduction clause containing that variable - // must appear on each of those loop directives. - // - // This can't really be implemented in the CFE, as this requires a level of - // rechability/useage analysis that we're not really wanting to get into. - // Additionally, I'm alerted that this restriction is one that the middle-end - // can just 'figure out' as an extension and isn't really necessary. - // - // OpenACC3.3 Section 2.9.11: Every 'var' in a reduction clause appearing on - // an orphaned loop construct must be private. 
- // - // This again is something we cannot really diagnose, as it requires we see - // all the uses/scopes of all variables referenced. The middle end/MLIR might - // be able to diagnose this. - - // OpenACC 3.3 Section 2.5.4: - // A reduction clause may not appear on a parallel construct with a - // num_gangs clause that has more than one argument. - if (Clause.getDirectiveKind() == OpenACCDirectiveKind::Parallel || - Clause.getDirectiveKind() == OpenACCDirectiveKind::ParallelLoop) { - auto NumGangsClauses = llvm::make_filter_range( - ExistingClauses, llvm::IsaPred); - - for (auto *NGC : NumGangsClauses) { - unsigned NumExprs = - cast(NGC)->getIntExprs().size(); - - if (NumExprs > 1) { - SemaRef.Diag(Clause.getBeginLoc(), - diag::err_acc_reduction_num_gangs_conflict) - << /*>1 arg in first loc=*/0 << Clause.getClauseKind() - << Clause.getDirectiveKind() << OpenACCClauseKind::NumGangs; - SemaRef.Diag(NGC->getBeginLoc(), diag::note_acc_previous_clause_here); - return nullptr; - } - } - } - - SmallVector ValidVars; +#include "clang/AST/StmtOpenACC.h" +#include "clang/Basic/DiagnosticSema.h" +#include "clang/Basic/OpenACCKinds.h" +#include "clang/Sema/Sema.h" +#include "clang/Sema/SemaOpenACC.h" +#include "llvm/ADT/StringExtras.h" +#include "llvm/Support/Casting.h" - for (Expr *Var : Clause.getVarList()) { - ExprResult Res = SemaRef.CheckReductionVar(Clause.getDirectiveKind(), - Clause.getReductionOp(), Var); +using namespace clang; - if (Res.isUsable()) - ValidVars.push_back(Res.get()); +namespace { +bool diagnoseConstructAppertainment(SemaOpenACC &S, OpenACCDirectiveKind K, + SourceLocation StartLoc, bool IsStmt) { + switch (K) { + default: + case OpenACCDirectiveKind::Invalid: + // Nothing to do here, both invalid and unimplemented don't really need to + // do anything. 
+ break; + case OpenACCDirectiveKind::ParallelLoop: + case OpenACCDirectiveKind::SerialLoop: + case OpenACCDirectiveKind::KernelsLoop: + case OpenACCDirectiveKind::Parallel: + case OpenACCDirectiveKind::Serial: + case OpenACCDirectiveKind::Kernels: + case OpenACCDirectiveKind::Loop: + case OpenACCDirectiveKind::Data: + case OpenACCDirectiveKind::EnterData: + case OpenACCDirectiveKind::ExitData: + case OpenACCDirectiveKind::HostData: + case OpenACCDirectiveKind::Wait: + if (!IsStmt) + return S.Diag(StartLoc, diag::err_acc_construct_appertainment) << K; + break; } - - return SemaRef.CheckReductionClause( - ExistingClauses, Clause.getDirectiveKind(), Clause.getBeginLoc(), - Clause.getLParenLoc(), Clause.getReductionOp(), ValidVars, - Clause.getEndLoc()); -} - -OpenACCClause *SemaOpenACCClauseVisitor::VisitCollapseClause( - SemaOpenACC::OpenACCParsedClause &Clause) { - // Duplicates here are not really sensible. We could possible permit - // multiples if they all had the same value, but there isn't really a good - // reason to do so. Also, this simplifies the suppression of duplicates, in - // that we know if we 'find' one after instantiation, that it is the same - // clause, which simplifies instantiation/checking/etc. - if (checkAlreadyHasClauseOfKind(SemaRef, ExistingClauses, Clause)) - return nullptr; - - ExprResult LoopCount = SemaRef.CheckCollapseLoopCount(Clause.getLoopCount()); - - if (!LoopCount.isUsable()) - return nullptr; - - return OpenACCCollapseClause::Create(Ctx, Clause.getBeginLoc(), - Clause.getLParenLoc(), Clause.isForce(), - LoopCount.get(), Clause.getEndLoc()); + return false; } void CollectActiveReductionClauses( @@ -2172,190 +299,6 @@ SemaOpenACC::AssociatedStmtRAII::~AssociatedStmtRAII() { } } -OpenACCClause * -SemaOpenACC::ActOnClause(ArrayRef ExistingClauses, - OpenACCParsedClause &Clause) { - if (Clause.getClauseKind() == OpenACCClauseKind::Invalid) - return nullptr; - - // Diagnose that we don't support this clause on this directive. 
- if (!doesClauseApplyToDirective(Clause.getDirectiveKind(), - Clause.getClauseKind())) { - Diag(Clause.getBeginLoc(), diag::err_acc_clause_appertainment) - << Clause.getDirectiveKind() << Clause.getClauseKind(); - return nullptr; - } - - if (const auto *DevTypeClause = - llvm::find_if(ExistingClauses, - [&](const OpenACCClause *C) { - return isa(C); - }); - DevTypeClause != ExistingClauses.end()) { - if (checkValidAfterDeviceType( - *this, *cast(*DevTypeClause), Clause)) - return nullptr; - } - - SemaOpenACCClauseVisitor Visitor{*this, ExistingClauses}; - OpenACCClause *Result = Visitor.Visit(Clause); - assert((!Result || Result->getClauseKind() == Clause.getClauseKind()) && - "Created wrong clause?"); - - if (Visitor.diagNotImplemented()) - Diag(Clause.getBeginLoc(), diag::warn_acc_clause_unimplemented) - << Clause.getClauseKind(); - - return Result; - -} - -namespace { -// Return true if the two vars refer to the same variable, for the purposes of -// equality checking. -bool areVarsEqual(Expr *VarExpr1, Expr *VarExpr2) { - if (VarExpr1->isInstantiationDependent() || - VarExpr2->isInstantiationDependent()) - return false; - - VarExpr1 = VarExpr1->IgnoreParenCasts(); - VarExpr2 = VarExpr2->IgnoreParenCasts(); - - // Legal expressions can be: Scalar variable reference, sub-array, array - // element, or composite variable member. - - // Sub-array. - if (isa(VarExpr1)) { - auto *Expr2AS = dyn_cast(VarExpr2); - if (!Expr2AS) - return false; - - auto *Expr1AS = cast(VarExpr1); - - if (!areVarsEqual(Expr1AS->getBase(), Expr2AS->getBase())) - return false; - // We could possibly check to see if the ranges aren't overlapping, but it - // isn't clear that the rules allow this. - return true; - } - - // Array-element. 
- if (isa(VarExpr1)) { - auto *Expr2AS = dyn_cast(VarExpr2); - if (!Expr2AS) - return false; - - auto *Expr1AS = cast(VarExpr1); - - if (!areVarsEqual(Expr1AS->getBase(), Expr2AS->getBase())) - return false; - - // We could possibly check to see if the elements referenced aren't the - // same, but it isn't clear by reading of the standard that this is allowed - // (and that the 'var' refered to isn't the array). - return true; - } - - // Scalar variable reference, or composite variable. - if (isa(VarExpr1)) { - auto *Expr2DRE = dyn_cast(VarExpr2); - if (!Expr2DRE) - return false; - - auto *Expr1DRE = cast(VarExpr1); - - return Expr1DRE->getDecl()->getMostRecentDecl() == - Expr2DRE->getDecl()->getMostRecentDecl(); - } - - llvm_unreachable("Unknown variable type encountered"); -} -} // namespace - -/// OpenACC 3.3 section 2.5.15: -/// At a mininmum, the supported data types include ... the numerical data types -/// in C, C++, and Fortran. -/// -/// If the reduction var is a composite variable, each -/// member of the composite variable must be a supported datatype for the -/// reduction operation. 
-ExprResult SemaOpenACC::CheckReductionVar(OpenACCDirectiveKind DirectiveKind, - OpenACCReductionOperator ReductionOp, - Expr *VarExpr) { - VarExpr = VarExpr->IgnoreParenCasts(); - - auto TypeIsValid = [](QualType Ty) { - return Ty->isDependentType() || Ty->isScalarType(); - }; - - if (isa(VarExpr)) { - Expr *ASExpr = VarExpr; - QualType BaseTy = ArraySectionExpr::getBaseOriginalType(ASExpr); - QualType EltTy = getASTContext().getBaseElementType(BaseTy); - - if (!TypeIsValid(EltTy)) { - Diag(VarExpr->getExprLoc(), diag::err_acc_reduction_type) - << EltTy << /*Sub array base type*/ 1; - return ExprError(); - } - } else if (auto *RD = VarExpr->getType()->getAsRecordDecl()) { - if (!RD->isStruct() && !RD->isClass()) { - Diag(VarExpr->getExprLoc(), diag::err_acc_reduction_composite_type) - << /*not class or struct*/ 0 << VarExpr->getType(); - return ExprError(); - } - - if (!RD->isCompleteDefinition()) { - Diag(VarExpr->getExprLoc(), diag::err_acc_reduction_composite_type) - << /*incomplete*/ 1 << VarExpr->getType(); - return ExprError(); - } - if (const auto *CXXRD = dyn_cast(RD); - CXXRD && !CXXRD->isAggregate()) { - Diag(VarExpr->getExprLoc(), diag::err_acc_reduction_composite_type) - << /*aggregate*/ 2 << VarExpr->getType(); - return ExprError(); - } - - for (FieldDecl *FD : RD->fields()) { - if (!TypeIsValid(FD->getType())) { - Diag(VarExpr->getExprLoc(), - diag::err_acc_reduction_composite_member_type); - Diag(FD->getLocation(), diag::note_acc_reduction_composite_member_loc); - return ExprError(); - } - } - } else if (!TypeIsValid(VarExpr->getType())) { - Diag(VarExpr->getExprLoc(), diag::err_acc_reduction_type) - << VarExpr->getType() << /*Sub array base type*/ 0; - return ExprError(); - } - - // OpenACC3.3: 2.9.11: Reduction clauses on nested constructs for the same - // reduction 'var' must have the same reduction operator. 
- if (!VarExpr->isInstantiationDependent()) { - - for (const OpenACCReductionClause *RClause : ActiveReductionClauses) { - if (RClause->getReductionOp() == ReductionOp) - break; - - for (Expr *OldVarExpr : RClause->getVarList()) { - if (OldVarExpr->isInstantiationDependent()) - continue; - - if (areVarsEqual(VarExpr, OldVarExpr)) { - Diag(VarExpr->getExprLoc(), diag::err_reduction_op_mismatch) - << ReductionOp << RClause->getReductionOp(); - Diag(OldVarExpr->getExprLoc(), diag::note_acc_previous_clause_here); - return ExprError(); - } - } - } - } - - return VarExpr; -} - void SemaOpenACC::ActOnConstruct(OpenACCDirectiveKind K, SourceLocation DirLoc) { // Start an evaluation context to parse the clause arguments on. @@ -2809,181 +752,6 @@ ExprResult SemaOpenACC::ActOnArraySectionExpr(Expr *Base, SourceLocation LBLoc, OK_Ordinary, ColonLoc, RBLoc); } -ExprResult SemaOpenACC::CheckCollapseLoopCount(Expr *LoopCount) { - if (!LoopCount) - return ExprError(); - - assert((LoopCount->isInstantiationDependent() || - LoopCount->getType()->isIntegerType()) && - "Loop argument non integer?"); - - // If this is dependent, there really isn't anything we can check. - if (LoopCount->isInstantiationDependent()) - return ExprResult{LoopCount}; - - std::optional ICE = - LoopCount->getIntegerConstantExpr(getASTContext()); - - // OpenACC 3.3: 2.9.1 - // The argument to the collapse clause must be a constant positive integer - // expression. 
- if (!ICE || *ICE <= 0) { - Diag(LoopCount->getBeginLoc(), diag::err_acc_collapse_loop_count) - << ICE.has_value() << ICE.value_or(llvm::APSInt{}).getExtValue(); - return ExprError(); - } - - return ExprResult{ - ConstantExpr::Create(getASTContext(), LoopCount, APValue{*ICE})}; -} - -ExprResult -SemaOpenACC::CheckGangExpr(ArrayRef ExistingClauses, - OpenACCDirectiveKind DK, OpenACCGangKind GK, - Expr *E) { - // There are two cases for the enforcement here: the 'current' directive is a - // 'loop', where we need to check the active compute construct kind, or the - // current directive is a 'combined' construct, where we have to check the - // current one. - switch (DK) { - case OpenACCDirectiveKind::ParallelLoop: - return CheckGangParallelExpr(*this, DK, ActiveComputeConstructInfo.Kind, GK, - E); - case OpenACCDirectiveKind::SerialLoop: - return CheckGangSerialExpr(*this, DK, ActiveComputeConstructInfo.Kind, GK, - E); - case OpenACCDirectiveKind::KernelsLoop: - return CheckGangKernelsExpr(*this, ExistingClauses, DK, - ActiveComputeConstructInfo.Kind, GK, E); - case OpenACCDirectiveKind::Loop: - switch (ActiveComputeConstructInfo.Kind) { - case OpenACCDirectiveKind::Invalid: - case OpenACCDirectiveKind::Parallel: - case OpenACCDirectiveKind::ParallelLoop: - return CheckGangParallelExpr(*this, DK, ActiveComputeConstructInfo.Kind, - GK, E); - case OpenACCDirectiveKind::SerialLoop: - case OpenACCDirectiveKind::Serial: - return CheckGangSerialExpr(*this, DK, ActiveComputeConstructInfo.Kind, GK, - E); - case OpenACCDirectiveKind::KernelsLoop: - case OpenACCDirectiveKind::Kernels: - return CheckGangKernelsExpr(*this, ExistingClauses, DK, - ActiveComputeConstructInfo.Kind, GK, E); - default: - llvm_unreachable("Non compute construct in active compute construct?"); - } - default: - // TODO: OpenACC: when we implement this on 'routine', we'll have to - // implement its checking here. 
- llvm_unreachable("Invalid directive kind for a Gang clause"); - } - llvm_unreachable("Compute construct directive not handled?"); -} - -OpenACCClause * -SemaOpenACC::CheckGangClause(OpenACCDirectiveKind DirKind, - ArrayRef ExistingClauses, - SourceLocation BeginLoc, SourceLocation LParenLoc, - ArrayRef GangKinds, - ArrayRef IntExprs, SourceLocation EndLoc) { - // OpenACC 3.3 2.9.11: A reduction clause may not appear on a loop directive - // that has a gang clause with a dim: argument whose value is greater than 1. - - const auto *ReductionItr = - llvm::find_if(ExistingClauses, llvm::IsaPred); - - if (ReductionItr != ExistingClauses.end()) { - const auto GangZip = llvm::zip_equal(GangKinds, IntExprs); - const auto GangItr = llvm::find_if(GangZip, [](const auto &Tuple) { - return std::get<0>(Tuple) == OpenACCGangKind::Dim; - }); - - if (GangItr != GangZip.end()) { - const Expr *DimExpr = std::get<1>(*GangItr); - - assert( - (DimExpr->isInstantiationDependent() || isa(DimExpr)) && - "Improperly formed gang argument"); - if (const auto *DimVal = dyn_cast(DimExpr); - DimVal && DimVal->getResultAsAPSInt() > 1) { - Diag(DimVal->getBeginLoc(), diag::err_acc_gang_reduction_conflict) - << /*gang/reduction=*/0 << DirKind; - Diag((*ReductionItr)->getBeginLoc(), - diag::note_acc_previous_clause_here); - return nullptr; - } - } - } - - return OpenACCGangClause::Create(getASTContext(), BeginLoc, LParenLoc, - GangKinds, IntExprs, EndLoc); -} - -OpenACCClause *SemaOpenACC::CheckReductionClause( - ArrayRef ExistingClauses, - OpenACCDirectiveKind DirectiveKind, SourceLocation BeginLoc, - SourceLocation LParenLoc, OpenACCReductionOperator ReductionOp, - ArrayRef Vars, SourceLocation EndLoc) { - if (DirectiveKind == OpenACCDirectiveKind::Loop || - isOpenACCCombinedDirectiveKind(DirectiveKind)) { - // OpenACC 3.3 2.9.11: A reduction clause may not appear on a loop directive - // that has a gang clause with a dim: argument whose value is greater - // than 1. 
- const auto GangClauses = llvm::make_filter_range( - ExistingClauses, llvm::IsaPred); - - for (auto *GC : GangClauses) { - const auto *GangClause = cast(GC); - for (unsigned I = 0; I < GangClause->getNumExprs(); ++I) { - std::pair EPair = GangClause->getExpr(I); - if (EPair.first != OpenACCGangKind::Dim) - continue; - - if (const auto *DimVal = dyn_cast(EPair.second); - DimVal && DimVal->getResultAsAPSInt() > 1) { - Diag(BeginLoc, diag::err_acc_gang_reduction_conflict) - << /*reduction/gang=*/1 << DirectiveKind; - Diag(GangClause->getBeginLoc(), diag::note_acc_previous_clause_here); - return nullptr; - } - } - } - } - - auto *Ret = OpenACCReductionClause::Create( - getASTContext(), BeginLoc, LParenLoc, ReductionOp, Vars, EndLoc); - return Ret; -} - -ExprResult SemaOpenACC::CheckTileSizeExpr(Expr *SizeExpr) { - if (!SizeExpr) - return ExprError(); - - assert((SizeExpr->isInstantiationDependent() || - SizeExpr->getType()->isIntegerType()) && - "size argument non integer?"); - - // If dependent, or an asterisk, the expression is fine. - if (SizeExpr->isInstantiationDependent() || - isa(SizeExpr)) - return ExprResult{SizeExpr}; - - std::optional ICE = - SizeExpr->getIntegerConstantExpr(getASTContext()); - - // OpenACC 3.3 2.9.8 - // where each tile size is a constant positive integer expression or asterisk. 
- if (!ICE || *ICE <= 0) { - Diag(SizeExpr->getBeginLoc(), diag::err_acc_size_expr_value) - << ICE.has_value() << ICE.value_or(llvm::APSInt{}).getExtValue(); - return ExprError(); - } - - return ExprResult{ - ConstantExpr::Create(getASTContext(), SizeExpr, APValue{*ICE})}; -} - void SemaOpenACC::ActOnWhileStmt(SourceLocation WhileLoc) { if (!getLangOpts().OpenACC) return; diff --git a/clang/lib/Sema/SemaOpenACCClause.cpp b/clang/lib/Sema/SemaOpenACCClause.cpp new file mode 100644 index 0000000000000..27da14de4c04f --- /dev/null +++ b/clang/lib/Sema/SemaOpenACCClause.cpp @@ -0,0 +1,2247 @@ +//===--- SemaOpenACCClause.cpp - Semantic Analysis for OpenACC clause -----===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +/// \file +/// This file implements semantic analysis for OpenACC clauses. +/// +//===----------------------------------------------------------------------===// + +#include "clang/AST/OpenACCClause.h" +#include "clang/AST/DeclCXX.h" +#include "clang/Basic/DiagnosticSema.h" +#include "clang/Basic/OpenACCKinds.h" +#include "clang/Sema/SemaOpenACC.h" + +using namespace clang; + +namespace { +bool doesClauseApplyToDirective(OpenACCDirectiveKind DirectiveKind, + OpenACCClauseKind ClauseKind) { + switch (ClauseKind) { + // FIXME: For each clause as we implement them, we can add the + // 'legalization' list here. 
+ case OpenACCClauseKind::Default: + switch (DirectiveKind) { + case OpenACCDirectiveKind::Parallel: + case OpenACCDirectiveKind::Serial: + case OpenACCDirectiveKind::Kernels: + case OpenACCDirectiveKind::ParallelLoop: + case OpenACCDirectiveKind::SerialLoop: + case OpenACCDirectiveKind::KernelsLoop: + case OpenACCDirectiveKind::Data: + return true; + default: + return false; + } + case OpenACCClauseKind::If: + switch (DirectiveKind) { + case OpenACCDirectiveKind::Parallel: + case OpenACCDirectiveKind::Serial: + case OpenACCDirectiveKind::Kernels: + case OpenACCDirectiveKind::Data: + case OpenACCDirectiveKind::EnterData: + case OpenACCDirectiveKind::ExitData: + case OpenACCDirectiveKind::HostData: + case OpenACCDirectiveKind::Init: + case OpenACCDirectiveKind::Shutdown: + case OpenACCDirectiveKind::Set: + case OpenACCDirectiveKind::Update: + case OpenACCDirectiveKind::Wait: + case OpenACCDirectiveKind::ParallelLoop: + case OpenACCDirectiveKind::SerialLoop: + case OpenACCDirectiveKind::KernelsLoop: + return true; + default: + return false; + } + case OpenACCClauseKind::Self: + switch (DirectiveKind) { + case OpenACCDirectiveKind::Parallel: + case OpenACCDirectiveKind::Serial: + case OpenACCDirectiveKind::Kernels: + case OpenACCDirectiveKind::Update: + case OpenACCDirectiveKind::ParallelLoop: + case OpenACCDirectiveKind::SerialLoop: + case OpenACCDirectiveKind::KernelsLoop: + return true; + default: + return false; + } + case OpenACCClauseKind::NumGangs: + case OpenACCClauseKind::NumWorkers: + case OpenACCClauseKind::VectorLength: + switch (DirectiveKind) { + case OpenACCDirectiveKind::Parallel: + case OpenACCDirectiveKind::Kernels: + case OpenACCDirectiveKind::ParallelLoop: + case OpenACCDirectiveKind::KernelsLoop: + return true; + default: + return false; + } + case OpenACCClauseKind::FirstPrivate: + switch (DirectiveKind) { + case OpenACCDirectiveKind::Parallel: + case OpenACCDirectiveKind::Serial: + case OpenACCDirectiveKind::ParallelLoop: + case 
OpenACCDirectiveKind::SerialLoop: + return true; + default: + return false; + } + case OpenACCClauseKind::Private: + switch (DirectiveKind) { + case OpenACCDirectiveKind::Parallel: + case OpenACCDirectiveKind::Serial: + case OpenACCDirectiveKind::Loop: + case OpenACCDirectiveKind::ParallelLoop: + case OpenACCDirectiveKind::SerialLoop: + case OpenACCDirectiveKind::KernelsLoop: + return true; + default: + return false; + } + case OpenACCClauseKind::NoCreate: + switch (DirectiveKind) { + case OpenACCDirectiveKind::Parallel: + case OpenACCDirectiveKind::Serial: + case OpenACCDirectiveKind::Kernels: + case OpenACCDirectiveKind::Data: + case OpenACCDirectiveKind::ParallelLoop: + case OpenACCDirectiveKind::SerialLoop: + case OpenACCDirectiveKind::KernelsLoop: + return true; + default: + return false; + } + case OpenACCClauseKind::Present: + switch (DirectiveKind) { + case OpenACCDirectiveKind::Parallel: + case OpenACCDirectiveKind::Serial: + case OpenACCDirectiveKind::Kernels: + case OpenACCDirectiveKind::Data: + case OpenACCDirectiveKind::Declare: + case OpenACCDirectiveKind::ParallelLoop: + case OpenACCDirectiveKind::SerialLoop: + case OpenACCDirectiveKind::KernelsLoop: + return true; + default: + return false; + } + + case OpenACCClauseKind::Copy: + case OpenACCClauseKind::PCopy: + case OpenACCClauseKind::PresentOrCopy: + switch (DirectiveKind) { + case OpenACCDirectiveKind::Parallel: + case OpenACCDirectiveKind::Serial: + case OpenACCDirectiveKind::Kernels: + case OpenACCDirectiveKind::Data: + case OpenACCDirectiveKind::Declare: + case OpenACCDirectiveKind::ParallelLoop: + case OpenACCDirectiveKind::SerialLoop: + case OpenACCDirectiveKind::KernelsLoop: + return true; + default: + return false; + } + case OpenACCClauseKind::CopyIn: + case OpenACCClauseKind::PCopyIn: + case OpenACCClauseKind::PresentOrCopyIn: + switch (DirectiveKind) { + case OpenACCDirectiveKind::Parallel: + case OpenACCDirectiveKind::Serial: + case OpenACCDirectiveKind::Kernels: + case 
OpenACCDirectiveKind::Data: + case OpenACCDirectiveKind::EnterData: + case OpenACCDirectiveKind::Declare: + case OpenACCDirectiveKind::ParallelLoop: + case OpenACCDirectiveKind::SerialLoop: + case OpenACCDirectiveKind::KernelsLoop: + return true; + default: + return false; + } + case OpenACCClauseKind::CopyOut: + case OpenACCClauseKind::PCopyOut: + case OpenACCClauseKind::PresentOrCopyOut: + switch (DirectiveKind) { + case OpenACCDirectiveKind::Parallel: + case OpenACCDirectiveKind::Serial: + case OpenACCDirectiveKind::Kernels: + case OpenACCDirectiveKind::Data: + case OpenACCDirectiveKind::ExitData: + case OpenACCDirectiveKind::Declare: + case OpenACCDirectiveKind::ParallelLoop: + case OpenACCDirectiveKind::SerialLoop: + case OpenACCDirectiveKind::KernelsLoop: + return true; + default: + return false; + } + case OpenACCClauseKind::Create: + case OpenACCClauseKind::PCreate: + case OpenACCClauseKind::PresentOrCreate: + switch (DirectiveKind) { + case OpenACCDirectiveKind::Parallel: + case OpenACCDirectiveKind::Serial: + case OpenACCDirectiveKind::Kernels: + case OpenACCDirectiveKind::Data: + case OpenACCDirectiveKind::EnterData: + case OpenACCDirectiveKind::ParallelLoop: + case OpenACCDirectiveKind::SerialLoop: + case OpenACCDirectiveKind::KernelsLoop: + return true; + default: + return false; + } + + case OpenACCClauseKind::Attach: + switch (DirectiveKind) { + case OpenACCDirectiveKind::Parallel: + case OpenACCDirectiveKind::Serial: + case OpenACCDirectiveKind::Kernels: + case OpenACCDirectiveKind::Data: + case OpenACCDirectiveKind::EnterData: + case OpenACCDirectiveKind::ParallelLoop: + case OpenACCDirectiveKind::SerialLoop: + case OpenACCDirectiveKind::KernelsLoop: + return true; + default: + return false; + } + case OpenACCClauseKind::DevicePtr: + switch (DirectiveKind) { + case OpenACCDirectiveKind::Parallel: + case OpenACCDirectiveKind::Serial: + case OpenACCDirectiveKind::Kernels: + case OpenACCDirectiveKind::Data: + case OpenACCDirectiveKind::Declare: + case 
OpenACCDirectiveKind::ParallelLoop: + case OpenACCDirectiveKind::SerialLoop: + case OpenACCDirectiveKind::KernelsLoop: + return true; + default: + return false; + } + case OpenACCClauseKind::Async: + switch (DirectiveKind) { + case OpenACCDirectiveKind::Parallel: + case OpenACCDirectiveKind::Serial: + case OpenACCDirectiveKind::Kernels: + case OpenACCDirectiveKind::Data: + case OpenACCDirectiveKind::EnterData: + case OpenACCDirectiveKind::ExitData: + case OpenACCDirectiveKind::Set: + case OpenACCDirectiveKind::Update: + case OpenACCDirectiveKind::Wait: + case OpenACCDirectiveKind::ParallelLoop: + case OpenACCDirectiveKind::SerialLoop: + case OpenACCDirectiveKind::KernelsLoop: + return true; + default: + return false; + } + case OpenACCClauseKind::Wait: + switch (DirectiveKind) { + case OpenACCDirectiveKind::Parallel: + case OpenACCDirectiveKind::Serial: + case OpenACCDirectiveKind::Kernels: + case OpenACCDirectiveKind::Data: + case OpenACCDirectiveKind::EnterData: + case OpenACCDirectiveKind::ExitData: + case OpenACCDirectiveKind::Update: + case OpenACCDirectiveKind::ParallelLoop: + case OpenACCDirectiveKind::SerialLoop: + case OpenACCDirectiveKind::KernelsLoop: + return true; + default: + return false; + } + + case OpenACCClauseKind::Seq: + switch (DirectiveKind) { + case OpenACCDirectiveKind::Loop: + case OpenACCDirectiveKind::Routine: + case OpenACCDirectiveKind::ParallelLoop: + case OpenACCDirectiveKind::SerialLoop: + case OpenACCDirectiveKind::KernelsLoop: + return true; + default: + return false; + } + + case OpenACCClauseKind::Independent: + case OpenACCClauseKind::Auto: + switch (DirectiveKind) { + case OpenACCDirectiveKind::Loop: + case OpenACCDirectiveKind::ParallelLoop: + case OpenACCDirectiveKind::SerialLoop: + case OpenACCDirectiveKind::KernelsLoop: + return true; + default: + return false; + } + + case OpenACCClauseKind::Reduction: + switch (DirectiveKind) { + case OpenACCDirectiveKind::Parallel: + case OpenACCDirectiveKind::Serial: + case 
OpenACCDirectiveKind::Loop: + case OpenACCDirectiveKind::ParallelLoop: + case OpenACCDirectiveKind::SerialLoop: + case OpenACCDirectiveKind::KernelsLoop: + return true; + default: + return false; + } + + case OpenACCClauseKind::DeviceType: + case OpenACCClauseKind::DType: + switch (DirectiveKind) { + case OpenACCDirectiveKind::Parallel: + case OpenACCDirectiveKind::Serial: + case OpenACCDirectiveKind::Kernels: + case OpenACCDirectiveKind::Data: + case OpenACCDirectiveKind::Init: + case OpenACCDirectiveKind::Shutdown: + case OpenACCDirectiveKind::Set: + case OpenACCDirectiveKind::Update: + case OpenACCDirectiveKind::Loop: + case OpenACCDirectiveKind::Routine: + case OpenACCDirectiveKind::ParallelLoop: + case OpenACCDirectiveKind::SerialLoop: + case OpenACCDirectiveKind::KernelsLoop: + return true; + default: + return false; + } + + case OpenACCClauseKind::Collapse: { + switch (DirectiveKind) { + case OpenACCDirectiveKind::Loop: + case OpenACCDirectiveKind::ParallelLoop: + case OpenACCDirectiveKind::SerialLoop: + case OpenACCDirectiveKind::KernelsLoop: + return true; + default: + return false; + } + } + case OpenACCClauseKind::Tile: { + switch (DirectiveKind) { + case OpenACCDirectiveKind::Loop: + case OpenACCDirectiveKind::ParallelLoop: + case OpenACCDirectiveKind::SerialLoop: + case OpenACCDirectiveKind::KernelsLoop: + return true; + default: + return false; + } + } + + case OpenACCClauseKind::Gang: { + switch (DirectiveKind) { + case OpenACCDirectiveKind::Loop: + case OpenACCDirectiveKind::ParallelLoop: + case OpenACCDirectiveKind::SerialLoop: + case OpenACCDirectiveKind::KernelsLoop: + case OpenACCDirectiveKind::Routine: + return true; + default: + return false; + } + case OpenACCClauseKind::Worker: { + switch (DirectiveKind) { + case OpenACCDirectiveKind::Loop: + case OpenACCDirectiveKind::ParallelLoop: + case OpenACCDirectiveKind::SerialLoop: + case OpenACCDirectiveKind::KernelsLoop: + case OpenACCDirectiveKind::Routine: + return true; + default: + return 
false; + } + } + case OpenACCClauseKind::Vector: { + switch (DirectiveKind) { + case OpenACCDirectiveKind::Loop: + case OpenACCDirectiveKind::ParallelLoop: + case OpenACCDirectiveKind::SerialLoop: + case OpenACCDirectiveKind::KernelsLoop: + case OpenACCDirectiveKind::Routine: + return true; + default: + return false; + } + } + case OpenACCClauseKind::Finalize: { + switch (DirectiveKind) { + case OpenACCDirectiveKind::ExitData: + return true; + default: + return false; + } + } + case OpenACCClauseKind::IfPresent: { + switch (DirectiveKind) { + case OpenACCDirectiveKind::HostData: + case OpenACCDirectiveKind::Update: + return true; + default: + return false; + } + } + case OpenACCClauseKind::Delete: { + switch (DirectiveKind) { + case OpenACCDirectiveKind::ExitData: + return true; + default: + return false; + } + } + + case OpenACCClauseKind::Detach: { + switch (DirectiveKind) { + case OpenACCDirectiveKind::ExitData: + return true; + default: + return false; + } + } + + case OpenACCClauseKind::DeviceNum: { + switch (DirectiveKind) { + case OpenACCDirectiveKind::Init: + case OpenACCDirectiveKind::Shutdown: + case OpenACCDirectiveKind::Set: + return true; + default: + return false; + } + } + + case OpenACCClauseKind::UseDevice: { + switch (DirectiveKind) { + case OpenACCDirectiveKind::HostData: + return true; + default: + return false; + } + } + case OpenACCClauseKind::DefaultAsync: { + switch (DirectiveKind) { + case OpenACCDirectiveKind::Set: + return true; + default: + return false; + } + } + case OpenACCClauseKind::Device: { + switch (DirectiveKind) { + case OpenACCDirectiveKind::Update: + return true; + default: + return false; + } + } + case OpenACCClauseKind::Host: { + switch (DirectiveKind) { + case OpenACCDirectiveKind::Update: + return true; + default: + return false; + } + } + } + + default: + // Do nothing so we can go to the 'unimplemented' diagnostic instead. 
+ return true; + } + llvm_unreachable("Invalid clause kind"); +} + +bool checkAlreadyHasClauseOfKind( + SemaOpenACC &S, ArrayRef ExistingClauses, + SemaOpenACC::OpenACCParsedClause &Clause) { + const auto *Itr = llvm::find_if(ExistingClauses, [&](const OpenACCClause *C) { + return C->getClauseKind() == Clause.getClauseKind(); + }); + if (Itr != ExistingClauses.end()) { + S.Diag(Clause.getBeginLoc(), diag::err_acc_duplicate_clause_disallowed) + << Clause.getDirectiveKind() << Clause.getClauseKind(); + S.Diag((*Itr)->getBeginLoc(), diag::note_acc_previous_clause_here); + return true; + } + return false; +} +bool checkValidAfterDeviceType( + SemaOpenACC &S, const OpenACCDeviceTypeClause &DeviceTypeClause, + const SemaOpenACC::OpenACCParsedClause &NewClause) { + // This is implemented for everything but 'routine', so treat as 'fine' for + // that. + if (NewClause.getDirectiveKind() == OpenACCDirectiveKind::Routine) + return false; + + // OpenACC3.3: Section 2.4: Clauses that precede any device_type clause are + // default clauses. Clauses that follow a device_type clause up to the end of + // the directive or up to the next device_type clause are device-specific + // clauses for the device types specified in the device_type argument. + // + // The above implies that despite what the individual text says, these are + // valid. + if (NewClause.getClauseKind() == OpenACCClauseKind::DType || + NewClause.getClauseKind() == OpenACCClauseKind::DeviceType) + return false; + + // Implement check from OpenACC3.3: section 2.5.4: + // Only the async, wait, num_gangs, num_workers, and vector_length clauses may + // follow a device_type clause. 
+ if (isOpenACCComputeDirectiveKind(NewClause.getDirectiveKind())) { + switch (NewClause.getClauseKind()) { + case OpenACCClauseKind::Async: + case OpenACCClauseKind::Wait: + case OpenACCClauseKind::NumGangs: + case OpenACCClauseKind::NumWorkers: + case OpenACCClauseKind::VectorLength: + return false; + default: + break; + } + } else if (NewClause.getDirectiveKind() == OpenACCDirectiveKind::Loop) { + // Implement check from OpenACC3.3: section 2.9: + // Only the collapse, gang, worker, vector, seq, independent, auto, and tile + // clauses may follow a device_type clause. + switch (NewClause.getClauseKind()) { + case OpenACCClauseKind::Collapse: + case OpenACCClauseKind::Gang: + case OpenACCClauseKind::Worker: + case OpenACCClauseKind::Vector: + case OpenACCClauseKind::Seq: + case OpenACCClauseKind::Independent: + case OpenACCClauseKind::Auto: + case OpenACCClauseKind::Tile: + return false; + default: + break; + } + } else if (isOpenACCCombinedDirectiveKind(NewClause.getDirectiveKind())) { + // This seems like it should be the union of 2.9 and 2.5.4 from above. + switch (NewClause.getClauseKind()) { + case OpenACCClauseKind::Async: + case OpenACCClauseKind::Wait: + case OpenACCClauseKind::NumGangs: + case OpenACCClauseKind::NumWorkers: + case OpenACCClauseKind::VectorLength: + case OpenACCClauseKind::Collapse: + case OpenACCClauseKind::Gang: + case OpenACCClauseKind::Worker: + case OpenACCClauseKind::Vector: + case OpenACCClauseKind::Seq: + case OpenACCClauseKind::Independent: + case OpenACCClauseKind::Auto: + case OpenACCClauseKind::Tile: + return false; + default: + break; + } + } else if (NewClause.getDirectiveKind() == OpenACCDirectiveKind::Data) { + // OpenACC3.3 section 2.6.5: Only the async and wait clauses may follow a + // device_type clause. 
+ switch (NewClause.getClauseKind()) { + case OpenACCClauseKind::Async: + case OpenACCClauseKind::Wait: + return false; + default: + break; + } + } else if (NewClause.getDirectiveKind() == OpenACCDirectiveKind::Set || + NewClause.getDirectiveKind() == OpenACCDirectiveKind::Init || + NewClause.getDirectiveKind() == OpenACCDirectiveKind::Shutdown) { + // There are no restrictions on 'set', 'init', or 'shutdown'. + return false; + } else if (NewClause.getDirectiveKind() == OpenACCDirectiveKind::Update) { + // OpenACC3.3 section 2.14.4: Only the async and wait clauses may follow a + // device_type clause. + switch (NewClause.getClauseKind()) { + case OpenACCClauseKind::Async: + case OpenACCClauseKind::Wait: + return false; + default: + break; + } + } + S.Diag(NewClause.getBeginLoc(), diag::err_acc_clause_after_device_type) + << NewClause.getClauseKind() << DeviceTypeClause.getClauseKind() + << NewClause.getDirectiveKind(); + S.Diag(DeviceTypeClause.getBeginLoc(), diag::note_acc_previous_clause_here); + return true; +} + +// A temporary function that helps implement the 'not implemented' check at the +// top of each clause checking function. This should only be used in conjunction +// with the one being currently implemented/only updated after the entire +// construct has been implemented. +bool isDirectiveKindImplemented(OpenACCDirectiveKind DK) { + return DK != OpenACCDirectiveKind::Declare && + DK != OpenACCDirectiveKind::Atomic && + DK != OpenACCDirectiveKind::Routine; +} + +class SemaOpenACCClauseVisitor { + SemaOpenACC &SemaRef; + ASTContext &Ctx; + ArrayRef ExistingClauses; + bool NotImplemented = false; + + OpenACCClause *isNotImplemented() { + NotImplemented = true; + return nullptr; + } + + // OpenACC 3.3 2.9: + // A 'gang', 'worker', or 'vector' clause may not appear if a 'seq' clause + // appears. 
+ bool DiagIfSeqClause(SemaOpenACC::OpenACCParsedClause &Clause) { + const auto *Itr = + llvm::find_if(ExistingClauses, llvm::IsaPred); + + if (Itr != ExistingClauses.end()) { + SemaRef.Diag(Clause.getBeginLoc(), diag::err_acc_clause_cannot_combine) + << Clause.getClauseKind() << (*Itr)->getClauseKind() + << Clause.getDirectiveKind(); + SemaRef.Diag((*Itr)->getBeginLoc(), diag::note_acc_previous_clause_here); + + return true; + } + return false; + } + +public: + SemaOpenACCClauseVisitor(SemaOpenACC &S, + ArrayRef ExistingClauses) + : SemaRef(S), Ctx(S.getASTContext()), ExistingClauses(ExistingClauses) {} + // Once we've implemented everything, we shouldn't need this infrastructure. + // But in the meantime, we use this to help decide whether the clause was + // handled for this directive. + bool diagNotImplemented() { return NotImplemented; } + + OpenACCClause *Visit(SemaOpenACC::OpenACCParsedClause &Clause) { + switch (Clause.getClauseKind()) { +#define VISIT_CLAUSE(CLAUSE_NAME) \ + case OpenACCClauseKind::CLAUSE_NAME: \ + return Visit##CLAUSE_NAME##Clause(Clause); +#define CLAUSE_ALIAS(ALIAS, CLAUSE_NAME, DEPRECATED) \ + case OpenACCClauseKind::ALIAS: \ + if (DEPRECATED) \ + SemaRef.Diag(Clause.getBeginLoc(), diag::warn_acc_deprecated_alias_name) \ + << Clause.getClauseKind() << OpenACCClauseKind::CLAUSE_NAME; \ + return Visit##CLAUSE_NAME##Clause(Clause); +#include "clang/Basic/OpenACCClauses.def" + default: + return isNotImplemented(); + } + llvm_unreachable("Invalid clause kind"); + } + +#define VISIT_CLAUSE(CLAUSE_NAME) \ + OpenACCClause *Visit##CLAUSE_NAME##Clause( \ + SemaOpenACC::OpenACCParsedClause &Clause); +#include "clang/Basic/OpenACCClauses.def" +}; + +OpenACCClause *SemaOpenACCClauseVisitor::VisitDefaultClause( + SemaOpenACC::OpenACCParsedClause &Clause) { + // Don't add an invalid clause to the AST. 
+ if (Clause.getDefaultClauseKind() == OpenACCDefaultClauseKind::Invalid) + return nullptr; + + // OpenACC 3.3, Section 2.5.4: + // At most one 'default' clause may appear, and it must have a value of + // either 'none' or 'present'. + // Second half of the sentence is diagnosed during parsing. + if (checkAlreadyHasClauseOfKind(SemaRef, ExistingClauses, Clause)) + return nullptr; + + return OpenACCDefaultClause::Create( + Ctx, Clause.getDefaultClauseKind(), Clause.getBeginLoc(), + Clause.getLParenLoc(), Clause.getEndLoc()); +} + +OpenACCClause *SemaOpenACCClauseVisitor::VisitTileClause( + SemaOpenACC::OpenACCParsedClause &Clause) { + + // Duplicates here are not really sensible. We could possible permit + // multiples if they all had the same value, but there isn't really a good + // reason to do so. Also, this simplifies the suppression of duplicates, in + // that we know if we 'find' one after instantiation, that it is the same + // clause, which simplifies instantiation/checking/etc. + if (checkAlreadyHasClauseOfKind(SemaRef, ExistingClauses, Clause)) + return nullptr; + + llvm::SmallVector NewSizeExprs; + + // Make sure these are all positive constant expressions or *. + for (Expr *E : Clause.getIntExprs()) { + ExprResult Res = SemaRef.CheckTileSizeExpr(E); + + if (!Res.isUsable()) + return nullptr; + + NewSizeExprs.push_back(Res.get()); + } + + return OpenACCTileClause::Create(Ctx, Clause.getBeginLoc(), + Clause.getLParenLoc(), NewSizeExprs, + Clause.getEndLoc()); +} + +OpenACCClause *SemaOpenACCClauseVisitor::VisitIfClause( + SemaOpenACC::OpenACCParsedClause &Clause) { + // There is no prose in the standard that says duplicates aren't allowed, + // but this diagnostic is present in other compilers, as well as makes + // sense. Prose DOES exist for 'data' and 'host_data', 'set', 'enter data' and + // 'exit data' both don't, but other implmementations do this. OpenACC issue + // 519 filed for the latter two. Prose also exists for 'update'. 
+ // GCC allows this on init/shutdown, presumably for good reason, so we do too. + if (Clause.getDirectiveKind() != OpenACCDirectiveKind::Init && + Clause.getDirectiveKind() != OpenACCDirectiveKind::Shutdown && + checkAlreadyHasClauseOfKind(SemaRef, ExistingClauses, Clause)) + return nullptr; + + // The parser has ensured that we have a proper condition expr, so there + // isn't really much to do here. + + // If the 'if' clause is true, it makes the 'self' clause have no effect, + // diagnose that here. This only applies on compute/combined constructs. + if (Clause.getDirectiveKind() != OpenACCDirectiveKind::Update) { + const auto *Itr = + llvm::find_if(ExistingClauses, llvm::IsaPred); + if (Itr != ExistingClauses.end()) { + SemaRef.Diag(Clause.getBeginLoc(), diag::warn_acc_if_self_conflict); + SemaRef.Diag((*Itr)->getBeginLoc(), diag::note_acc_previous_clause_here); + } + } + + return OpenACCIfClause::Create(Ctx, Clause.getBeginLoc(), + Clause.getLParenLoc(), + Clause.getConditionExpr(), Clause.getEndLoc()); +} + +OpenACCClause *SemaOpenACCClauseVisitor::VisitSelfClause( + SemaOpenACC::OpenACCParsedClause &Clause) { + // There is no prose in the standard that says duplicates aren't allowed, + // but this diagnostic is present in other compilers, as well as makes + // sense. + if (checkAlreadyHasClauseOfKind(SemaRef, ExistingClauses, Clause)) + return nullptr; + + // If the 'if' clause is true, it makes the 'self' clause have no effect, + // diagnose that here. This only applies on compute/combined constructs. 
+ if (Clause.getDirectiveKind() == OpenACCDirectiveKind::Update) + return OpenACCSelfClause::Create(Ctx, Clause.getBeginLoc(), + Clause.getLParenLoc(), Clause.getVarList(), + Clause.getEndLoc()); + + const auto *Itr = + llvm::find_if(ExistingClauses, llvm::IsaPred); + if (Itr != ExistingClauses.end()) { + SemaRef.Diag(Clause.getBeginLoc(), diag::warn_acc_if_self_conflict); + SemaRef.Diag((*Itr)->getBeginLoc(), diag::note_acc_previous_clause_here); + } + return OpenACCSelfClause::Create( + Ctx, Clause.getBeginLoc(), Clause.getLParenLoc(), + Clause.getConditionExpr(), Clause.getEndLoc()); +} + +OpenACCClause *SemaOpenACCClauseVisitor::VisitNumGangsClause( + SemaOpenACC::OpenACCParsedClause &Clause) { + // There is no prose in the standard that says duplicates aren't allowed, + // but this diagnostic is present in other compilers, as well as makes + // sense. + if (checkAlreadyHasClauseOfKind(SemaRef, ExistingClauses, Clause)) + return nullptr; + + // num_gangs requires at least 1 int expr in all forms. Diagnose here, but + // allow us to continue, an empty clause might be useful for future + // diagnostics. + if (Clause.getIntExprs().empty()) + SemaRef.Diag(Clause.getBeginLoc(), diag::err_acc_num_gangs_num_args) + << /*NoArgs=*/0; + + unsigned MaxArgs = + (Clause.getDirectiveKind() == OpenACCDirectiveKind::Parallel || + Clause.getDirectiveKind() == OpenACCDirectiveKind::ParallelLoop) + ? 3 + : 1; + // The max number of args differs between parallel and other constructs. + // Again, allow us to continue for the purposes of future diagnostics. + if (Clause.getIntExprs().size() > MaxArgs) + SemaRef.Diag(Clause.getBeginLoc(), diag::err_acc_num_gangs_num_args) + << /*NoArgs=*/1 << Clause.getDirectiveKind() << MaxArgs + << Clause.getIntExprs().size(); + + // OpenACC 3.3 Section 2.9.11: A reduction clause may not appear on a loop + // directive that has a gang clause and is within a compute construct that has + // a num_gangs clause with more than one explicit argument. 
+ if (Clause.getIntExprs().size() > 1 && + isOpenACCCombinedDirectiveKind(Clause.getDirectiveKind())) { + auto *GangClauseItr = + llvm::find_if(ExistingClauses, llvm::IsaPred); + auto *ReductionClauseItr = + llvm::find_if(ExistingClauses, llvm::IsaPred); + + if (GangClauseItr != ExistingClauses.end() && + ReductionClauseItr != ExistingClauses.end()) { + SemaRef.Diag(Clause.getBeginLoc(), + diag::err_acc_gang_reduction_numgangs_conflict) + << OpenACCClauseKind::Reduction << OpenACCClauseKind::Gang + << Clause.getDirectiveKind() << /*is on combined directive=*/1; + SemaRef.Diag((*ReductionClauseItr)->getBeginLoc(), + diag::note_acc_previous_clause_here); + SemaRef.Diag((*GangClauseItr)->getBeginLoc(), + diag::note_acc_previous_clause_here); + return nullptr; + } + } + + // OpenACC 3.3 Section 2.5.4: + // A reduction clause may not appear on a parallel construct with a + // num_gangs clause that has more than one argument. + if ((Clause.getDirectiveKind() == OpenACCDirectiveKind::Parallel || + Clause.getDirectiveKind() == OpenACCDirectiveKind::ParallelLoop) && + Clause.getIntExprs().size() > 1) { + auto *Parallel = + llvm::find_if(ExistingClauses, llvm::IsaPred); + + if (Parallel != ExistingClauses.end()) { + SemaRef.Diag(Clause.getBeginLoc(), + diag::err_acc_reduction_num_gangs_conflict) + << /*>1 arg in first loc=*/1 << Clause.getClauseKind() + << Clause.getDirectiveKind() << OpenACCClauseKind::Reduction; + SemaRef.Diag((*Parallel)->getBeginLoc(), + diag::note_acc_previous_clause_here); + return nullptr; + } + } + + // OpenACC 3.3 Section 2.9.2: + // An argument with no keyword or with the 'num' keyword is allowed only when + // the 'num_gangs' does not appear on the 'kernel' construct. 
+ if (Clause.getDirectiveKind() == OpenACCDirectiveKind::KernelsLoop) { + auto GangClauses = llvm::make_filter_range( + ExistingClauses, llvm::IsaPred); + + for (auto *GC : GangClauses) { + if (cast(GC)->hasExprOfKind(OpenACCGangKind::Num)) { + SemaRef.Diag(Clause.getBeginLoc(), + diag::err_acc_num_arg_conflict_reverse) + << OpenACCClauseKind::NumGangs << OpenACCClauseKind::Gang + << /*Num argument*/ 1; + SemaRef.Diag(GC->getBeginLoc(), diag::note_acc_previous_clause_here); + return nullptr; + } + } + } + + return OpenACCNumGangsClause::Create( + Ctx, Clause.getBeginLoc(), Clause.getLParenLoc(), Clause.getIntExprs(), + Clause.getEndLoc()); +} + +OpenACCClause *SemaOpenACCClauseVisitor::VisitNumWorkersClause( + SemaOpenACC::OpenACCParsedClause &Clause) { + // There is no prose in the standard that says duplicates aren't allowed, + // but this diagnostic is present in other compilers, as well as makes + // sense. + if (checkAlreadyHasClauseOfKind(SemaRef, ExistingClauses, Clause)) + return nullptr; + + // OpenACC 3.3 Section 2.9.2: + // An argument is allowed only when the 'num_workers' does not appear on the + // kernels construct. 
+ if (Clause.getDirectiveKind() == OpenACCDirectiveKind::KernelsLoop) { + auto WorkerClauses = llvm::make_filter_range( + ExistingClauses, llvm::IsaPred); + + for (auto *WC : WorkerClauses) { + if (cast(WC)->hasIntExpr()) { + SemaRef.Diag(Clause.getBeginLoc(), + diag::err_acc_num_arg_conflict_reverse) + << OpenACCClauseKind::NumWorkers << OpenACCClauseKind::Worker + << /*num argument*/ 0; + SemaRef.Diag(WC->getBeginLoc(), diag::note_acc_previous_clause_here); + return nullptr; + } + } + } + + assert(Clause.getIntExprs().size() == 1 && + "Invalid number of expressions for NumWorkers"); + return OpenACCNumWorkersClause::Create( + Ctx, Clause.getBeginLoc(), Clause.getLParenLoc(), Clause.getIntExprs()[0], + Clause.getEndLoc()); +} + +OpenACCClause *SemaOpenACCClauseVisitor::VisitVectorLengthClause( + SemaOpenACC::OpenACCParsedClause &Clause) { + // There is no prose in the standard that says duplicates aren't allowed, + // but this diagnostic is present in other compilers, as well as makes + // sense. + if (checkAlreadyHasClauseOfKind(SemaRef, ExistingClauses, Clause)) + return nullptr; + + // OpenACC 3.3 Section 2.9.4: + // An argument is allowed only when the 'vector_length' does not appear on the + // 'kernels' construct. 
+ if (Clause.getDirectiveKind() == OpenACCDirectiveKind::KernelsLoop) { + auto VectorClauses = llvm::make_filter_range( + ExistingClauses, llvm::IsaPred); + + for (auto *VC : VectorClauses) { + if (cast(VC)->hasIntExpr()) { + SemaRef.Diag(Clause.getBeginLoc(), + diag::err_acc_num_arg_conflict_reverse) + << OpenACCClauseKind::VectorLength << OpenACCClauseKind::Vector + << /*num argument*/ 0; + SemaRef.Diag(VC->getBeginLoc(), diag::note_acc_previous_clause_here); + return nullptr; + } + } + } + + assert(Clause.getIntExprs().size() == 1 && + "Invalid number of expressions for NumWorkers"); + return OpenACCVectorLengthClause::Create( + Ctx, Clause.getBeginLoc(), Clause.getLParenLoc(), Clause.getIntExprs()[0], + Clause.getEndLoc()); +} + +OpenACCClause *SemaOpenACCClauseVisitor::VisitAsyncClause( + SemaOpenACC::OpenACCParsedClause &Clause) { + // There is no prose in the standard that says duplicates aren't allowed, + // but this diagnostic is present in other compilers, as well as makes + // sense. + if (checkAlreadyHasClauseOfKind(SemaRef, ExistingClauses, Clause)) + return nullptr; + + assert(Clause.getNumIntExprs() < 2 && + "Invalid number of expressions for Async"); + return OpenACCAsyncClause::Create( + Ctx, Clause.getBeginLoc(), Clause.getLParenLoc(), + Clause.getNumIntExprs() != 0 ? Clause.getIntExprs()[0] : nullptr, + Clause.getEndLoc()); +} + +OpenACCClause *SemaOpenACCClauseVisitor::VisitDeviceNumClause( + SemaOpenACC::OpenACCParsedClause &Clause) { + // Restrictions only properly implemented on certain constructs, so skip/treat + // as unimplemented in those cases. + if (!isDirectiveKindImplemented(Clause.getDirectiveKind())) + return isNotImplemented(); + + // OpenACC 3.3 2.14.3: Two instances of the same clause may not appear on the + // same directive. 
+ if (Clause.getDirectiveKind() == OpenACCDirectiveKind::Set && + checkAlreadyHasClauseOfKind(SemaRef, ExistingClauses, Clause)) + return nullptr; + + assert(Clause.getNumIntExprs() == 1 && + "Invalid number of expressions for device_num"); + return OpenACCDeviceNumClause::Create( + Ctx, Clause.getBeginLoc(), Clause.getLParenLoc(), Clause.getIntExprs()[0], + Clause.getEndLoc()); +} + +OpenACCClause *SemaOpenACCClauseVisitor::VisitDefaultAsyncClause( + SemaOpenACC::OpenACCParsedClause &Clause) { + // OpenACC 3.3 2.14.3: Two instances of the same clause may not appear on the + // same directive. + if (checkAlreadyHasClauseOfKind(SemaRef, ExistingClauses, Clause)) + return nullptr; + + assert(Clause.getNumIntExprs() == 1 && + "Invalid number of expressions for default_async"); + return OpenACCDefaultAsyncClause::Create( + Ctx, Clause.getBeginLoc(), Clause.getLParenLoc(), Clause.getIntExprs()[0], + Clause.getEndLoc()); +} + +OpenACCClause *SemaOpenACCClauseVisitor::VisitPrivateClause( + SemaOpenACC::OpenACCParsedClause &Clause) { + // ActOnVar ensured that everything is a valid variable reference, so there + // really isn't anything to do here. GCC does some duplicate-finding, though + // it isn't apparent in the standard where this is justified. + + return OpenACCPrivateClause::Create(Ctx, Clause.getBeginLoc(), + Clause.getLParenLoc(), + Clause.getVarList(), Clause.getEndLoc()); +} + +OpenACCClause *SemaOpenACCClauseVisitor::VisitFirstPrivateClause( + SemaOpenACC::OpenACCParsedClause &Clause) { + // ActOnVar ensured that everything is a valid variable reference, so there + // really isn't anything to do here. GCC does some duplicate-finding, though + // it isn't apparent in the standard where this is justified. 
+ + return OpenACCFirstPrivateClause::Create( + Ctx, Clause.getBeginLoc(), Clause.getLParenLoc(), Clause.getVarList(), + Clause.getEndLoc()); +} + +OpenACCClause *SemaOpenACCClauseVisitor::VisitNoCreateClause( + SemaOpenACC::OpenACCParsedClause &Clause) { + // ActOnVar ensured that everything is a valid variable reference, so there + // really isn't anything to do here. GCC does some duplicate-finding, though + // it isn't apparent in the standard where this is justified. + + return OpenACCNoCreateClause::Create(Ctx, Clause.getBeginLoc(), + Clause.getLParenLoc(), + Clause.getVarList(), Clause.getEndLoc()); +} + +OpenACCClause *SemaOpenACCClauseVisitor::VisitPresentClause( + SemaOpenACC::OpenACCParsedClause &Clause) { + // Restrictions only properly implemented on 'compute'/'combined'/'data' + // constructs, and 'compute'/'combined'/'data' constructs are the only + // construct that can do anything with this yet, so skip/treat as + // unimplemented in this case. + if (!isDirectiveKindImplemented(Clause.getDirectiveKind())) + return isNotImplemented(); + // ActOnVar ensured that everything is a valid variable reference, so there + // really isn't anything to do here. GCC does some duplicate-finding, though + // it isn't apparent in the standard where this is justified. + + return OpenACCPresentClause::Create(Ctx, Clause.getBeginLoc(), + Clause.getLParenLoc(), + Clause.getVarList(), Clause.getEndLoc()); +} + +OpenACCClause *SemaOpenACCClauseVisitor::VisitHostClause( + SemaOpenACC::OpenACCParsedClause &Clause) { + // ActOnVar ensured that everything is a valid variable reference, so there + // really isn't anything to do here. GCC does some duplicate-finding, though + // it isn't apparent in the standard where this is justified. 
+ + return OpenACCHostClause::Create(Ctx, Clause.getBeginLoc(), + Clause.getLParenLoc(), Clause.getVarList(), + Clause.getEndLoc()); +} + +OpenACCClause *SemaOpenACCClauseVisitor::VisitDeviceClause( + SemaOpenACC::OpenACCParsedClause &Clause) { + // ActOnVar ensured that everything is a valid variable reference, so there + // really isn't anything to do here. GCC does some duplicate-finding, though + // it isn't apparent in the standard where this is justified. + + return OpenACCDeviceClause::Create(Ctx, Clause.getBeginLoc(), + Clause.getLParenLoc(), Clause.getVarList(), + Clause.getEndLoc()); +} + +OpenACCClause *SemaOpenACCClauseVisitor::VisitCopyClause( + SemaOpenACC::OpenACCParsedClause &Clause) { + // Restrictions only properly implemented on 'compute'/'combined'/'data' + // constructs, and 'compute'/'combined'/'data' constructs are the only + // construct that can do anything with this yet, so skip/treat as + // unimplemented in this case. + if (!isDirectiveKindImplemented(Clause.getDirectiveKind())) + return isNotImplemented(); + // ActOnVar ensured that everything is a valid variable reference, so there + // really isn't anything to do here. GCC does some duplicate-finding, though + // it isn't apparent in the standard where this is justified. + + return OpenACCCopyClause::Create( + Ctx, Clause.getClauseKind(), Clause.getBeginLoc(), Clause.getLParenLoc(), + Clause.getVarList(), Clause.getEndLoc()); +} + +OpenACCClause *SemaOpenACCClauseVisitor::VisitCopyInClause( + SemaOpenACC::OpenACCParsedClause &Clause) { + // Restrictions only properly implemented on 'compute'/'combined'/'data' + // constructs, and 'compute'/'combined'/'data' constructs are the only + // construct that can do anything with this yet, so skip/treat as + // unimplemented in this case. + if (!isDirectiveKindImplemented(Clause.getDirectiveKind())) + return isNotImplemented(); + // ActOnVar ensured that everything is a valid variable reference, so there + // really isn't anything to do here. 
GCC does some duplicate-finding, though + // it isn't apparent in the standard where this is justified. + + return OpenACCCopyInClause::Create( + Ctx, Clause.getClauseKind(), Clause.getBeginLoc(), Clause.getLParenLoc(), + Clause.isReadOnly(), Clause.getVarList(), Clause.getEndLoc()); +} + +OpenACCClause *SemaOpenACCClauseVisitor::VisitCopyOutClause( + SemaOpenACC::OpenACCParsedClause &Clause) { + // Restrictions only properly implemented on 'compute'/'combined'/'data' + // constructs, and 'compute'/'combined'/'data' constructs are the only + // construct that can do anything with this yet, so skip/treat as + // unimplemented in this case. + if (!isDirectiveKindImplemented(Clause.getDirectiveKind())) + return isNotImplemented(); + // ActOnVar ensured that everything is a valid variable reference, so there + // really isn't anything to do here. GCC does some duplicate-finding, though + // it isn't apparent in the standard where this is justified. + + return OpenACCCopyOutClause::Create( + Ctx, Clause.getClauseKind(), Clause.getBeginLoc(), Clause.getLParenLoc(), + Clause.isZero(), Clause.getVarList(), Clause.getEndLoc()); +} + +OpenACCClause *SemaOpenACCClauseVisitor::VisitCreateClause( + SemaOpenACC::OpenACCParsedClause &Clause) { + // ActOnVar ensured that everything is a valid variable reference, so there + // really isn't anything to do here. GCC does some duplicate-finding, though + // it isn't apparent in the standard where this is justified. + + return OpenACCCreateClause::Create( + Ctx, Clause.getClauseKind(), Clause.getBeginLoc(), Clause.getLParenLoc(), + Clause.isZero(), Clause.getVarList(), Clause.getEndLoc()); +} + +OpenACCClause *SemaOpenACCClauseVisitor::VisitAttachClause( + SemaOpenACC::OpenACCParsedClause &Clause) { + // ActOnVar ensured that everything is a valid variable reference, but we + // still have to make sure it is a pointer type. 
+ llvm::SmallVector VarList{Clause.getVarList()}; + llvm::erase_if(VarList, [&](Expr *E) { + return SemaRef.CheckVarIsPointerType(OpenACCClauseKind::Attach, E); + }); + Clause.setVarListDetails(VarList, + /*IsReadOnly=*/false, /*IsZero=*/false); + return OpenACCAttachClause::Create(Ctx, Clause.getBeginLoc(), + Clause.getLParenLoc(), Clause.getVarList(), + Clause.getEndLoc()); +} + +OpenACCClause *SemaOpenACCClauseVisitor::VisitDetachClause( + SemaOpenACC::OpenACCParsedClause &Clause) { + // ActOnVar ensured that everything is a valid variable reference, but we + // still have to make sure it is a pointer type. + llvm::SmallVector VarList{Clause.getVarList()}; + llvm::erase_if(VarList, [&](Expr *E) { + return SemaRef.CheckVarIsPointerType(OpenACCClauseKind::Detach, E); + }); + Clause.setVarListDetails(VarList, + /*IsReadOnly=*/false, /*IsZero=*/false); + return OpenACCDetachClause::Create(Ctx, Clause.getBeginLoc(), + Clause.getLParenLoc(), Clause.getVarList(), + Clause.getEndLoc()); +} + +OpenACCClause *SemaOpenACCClauseVisitor::VisitDeleteClause( + SemaOpenACC::OpenACCParsedClause &Clause) { + // ActOnVar ensured that everything is a valid variable reference, so there + // really isn't anything to do here. GCC does some duplicate-finding, though + // it isn't apparent in the standard where this is justified. + return OpenACCDeleteClause::Create(Ctx, Clause.getBeginLoc(), + Clause.getLParenLoc(), Clause.getVarList(), + Clause.getEndLoc()); +} + +OpenACCClause *SemaOpenACCClauseVisitor::VisitUseDeviceClause( + SemaOpenACC::OpenACCParsedClause &Clause) { + // ActOnVar ensured that everything is a valid variable or array, so nothing + // left to do here. 
+ return OpenACCUseDeviceClause::Create( + Ctx, Clause.getBeginLoc(), Clause.getLParenLoc(), Clause.getVarList(), + Clause.getEndLoc()); +} + +OpenACCClause *SemaOpenACCClauseVisitor::VisitDevicePtrClause( + SemaOpenACC::OpenACCParsedClause &Clause) { + // Restrictions only properly implemented on 'compute'/'combined'/'data' + // constructs, and 'compute'/'combined'/'data' constructs are the only + // construct that can do anything with this yet, so skip/treat as + // unimplemented in this case. + if (!isDirectiveKindImplemented(Clause.getDirectiveKind())) + return isNotImplemented(); + + // ActOnVar ensured that everything is a valid variable reference, but we + // still have to make sure it is a pointer type. + llvm::SmallVector VarList{Clause.getVarList()}; + llvm::erase_if(VarList, [&](Expr *E) { + return SemaRef.CheckVarIsPointerType(OpenACCClauseKind::DevicePtr, E); + }); + Clause.setVarListDetails(VarList, + /*IsReadOnly=*/false, /*IsZero=*/false); + + return OpenACCDevicePtrClause::Create( + Ctx, Clause.getBeginLoc(), Clause.getLParenLoc(), Clause.getVarList(), + Clause.getEndLoc()); +} + +OpenACCClause *SemaOpenACCClauseVisitor::VisitWaitClause( + SemaOpenACC::OpenACCParsedClause &Clause) { + return OpenACCWaitClause::Create( + Ctx, Clause.getBeginLoc(), Clause.getLParenLoc(), Clause.getDevNumExpr(), + Clause.getQueuesLoc(), Clause.getQueueIdExprs(), Clause.getEndLoc()); +} + +OpenACCClause *SemaOpenACCClauseVisitor::VisitDeviceTypeClause( + SemaOpenACC::OpenACCParsedClause &Clause) { + // Restrictions implemented properly on everything except 'routine'. + if (Clause.getDirectiveKind() == OpenACCDirectiveKind::Routine) + return isNotImplemented(); + + // OpenACC 3.3 2.14.3: Two instances of the same clause may not appear on the + // same directive. 
+ if (Clause.getDirectiveKind() == OpenACCDirectiveKind::Set && + checkAlreadyHasClauseOfKind(SemaRef, ExistingClauses, Clause)) + return nullptr; + + // TODO OpenACC: Once we get enough of the CodeGen implemented that we have + // a source for the list of valid architectures, we need to warn on unknown + // identifiers here. + + return OpenACCDeviceTypeClause::Create( + Ctx, Clause.getClauseKind(), Clause.getBeginLoc(), Clause.getLParenLoc(), + Clause.getDeviceTypeArchitectures(), Clause.getEndLoc()); +} + +OpenACCClause *SemaOpenACCClauseVisitor::VisitAutoClause( + SemaOpenACC::OpenACCParsedClause &Clause) { + // OpenACC 3.3 2.9: + // Only one of the seq, independent, and auto clauses may appear. + const auto *Itr = + llvm::find_if(ExistingClauses, + llvm::IsaPred); + if (Itr != ExistingClauses.end()) { + SemaRef.Diag(Clause.getBeginLoc(), diag::err_acc_loop_spec_conflict) + << Clause.getClauseKind() << Clause.getDirectiveKind(); + SemaRef.Diag((*Itr)->getBeginLoc(), diag::note_acc_previous_clause_here); + return nullptr; + } + + return OpenACCAutoClause::Create(Ctx, Clause.getBeginLoc(), + Clause.getEndLoc()); +} + +OpenACCClause *SemaOpenACCClauseVisitor::VisitIndependentClause( + SemaOpenACC::OpenACCParsedClause &Clause) { + // OpenACC 3.3 2.9: + // Only one of the seq, independent, and auto clauses may appear. 
+ const auto *Itr = llvm::find_if( + ExistingClauses, llvm::IsaPred); + if (Itr != ExistingClauses.end()) { + SemaRef.Diag(Clause.getBeginLoc(), diag::err_acc_loop_spec_conflict) + << Clause.getClauseKind() << Clause.getDirectiveKind(); + SemaRef.Diag((*Itr)->getBeginLoc(), diag::note_acc_previous_clause_here); + return nullptr; + } + + return OpenACCIndependentClause::Create(Ctx, Clause.getBeginLoc(), + Clause.getEndLoc()); +} + +ExprResult CheckGangStaticExpr(SemaOpenACC &S, Expr *E) { + if (isa(E)) + return E; + return S.ActOnIntExpr(OpenACCDirectiveKind::Invalid, OpenACCClauseKind::Gang, + E->getBeginLoc(), E); +} + +bool IsOrphanLoop(OpenACCDirectiveKind DK, OpenACCDirectiveKind AssocKind) { + return DK == OpenACCDirectiveKind::Loop && + AssocKind == OpenACCDirectiveKind::Invalid; +} + +bool HasAssocKind(OpenACCDirectiveKind DK, OpenACCDirectiveKind AssocKind) { + return DK == OpenACCDirectiveKind::Loop && + AssocKind != OpenACCDirectiveKind::Invalid; +} + +ExprResult DiagIntArgInvalid(SemaOpenACC &S, Expr *E, OpenACCGangKind GK, + OpenACCClauseKind CK, OpenACCDirectiveKind DK, + OpenACCDirectiveKind AssocKind) { + S.Diag(E->getBeginLoc(), diag::err_acc_int_arg_invalid) + << GK << CK << IsOrphanLoop(DK, AssocKind) << DK + << HasAssocKind(DK, AssocKind) << AssocKind; + return ExprError(); +} +ExprResult DiagIntArgInvalid(SemaOpenACC &S, Expr *E, StringRef TagKind, + OpenACCClauseKind CK, OpenACCDirectiveKind DK, + OpenACCDirectiveKind AssocKind) { + S.Diag(E->getBeginLoc(), diag::err_acc_int_arg_invalid) + << TagKind << CK << IsOrphanLoop(DK, AssocKind) << DK + << HasAssocKind(DK, AssocKind) << AssocKind; + return ExprError(); +} + +ExprResult CheckGangParallelExpr(SemaOpenACC &S, OpenACCDirectiveKind DK, + OpenACCDirectiveKind AssocKind, + OpenACCGangKind GK, Expr *E) { + switch (GK) { + case OpenACCGangKind::Static: + return CheckGangStaticExpr(S, E); + case OpenACCGangKind::Num: + // OpenACC 3.3 2.9.2: When the parent compute construct is a parallel + // 
construct, or an orphaned loop construct, the gang clause behaves as + // follows. ... The num argument is not allowed. + return DiagIntArgInvalid(S, E, GK, OpenACCClauseKind::Gang, DK, AssocKind); + case OpenACCGangKind::Dim: { + // OpenACC 3.3 2.9.2: When the parent compute construct is a parallel + // construct, or an orphaned loop construct, the gang clause behaves as + // follows. ... The dim argument must be a constant positive integer value + // 1, 2, or 3. + if (!E) + return ExprError(); + ExprResult Res = + S.ActOnIntExpr(OpenACCDirectiveKind::Invalid, OpenACCClauseKind::Gang, + E->getBeginLoc(), E); + + if (!Res.isUsable()) + return Res; + + if (Res.get()->isInstantiationDependent()) + return Res; + + std::optional ICE = + Res.get()->getIntegerConstantExpr(S.getASTContext()); + + if (!ICE || *ICE <= 0 || ICE > 3) { + S.Diag(Res.get()->getBeginLoc(), diag::err_acc_gang_dim_value) + << ICE.has_value() << ICE.value_or(llvm::APSInt{}).getExtValue(); + return ExprError(); + } + + return ExprResult{ + ConstantExpr::Create(S.getASTContext(), Res.get(), APValue{*ICE})}; + } + } + llvm_unreachable("Unknown gang kind in gang parallel check"); +} + +ExprResult CheckGangKernelsExpr(SemaOpenACC &S, + ArrayRef ExistingClauses, + OpenACCDirectiveKind DK, + OpenACCDirectiveKind AssocKind, + OpenACCGangKind GK, Expr *E) { + switch (GK) { + // OpenACC 3.3 2.9.2: When the parent compute construct is a kernels + // construct, the gang clause behaves as follows. ... The dim argument is + // not allowed. + case OpenACCGangKind::Dim: + return DiagIntArgInvalid(S, E, GK, OpenACCClauseKind::Gang, DK, AssocKind); + case OpenACCGangKind::Num: { + // OpenACC 3.3 2.9.2: When the parent compute construct is a kernels + // construct, the gang clause behaves as follows. ... An argument with no + // keyword or with num keyword is only allowed when num_gangs does not + // appear on the kernels construct. ... 
The region of a loop with the gang + // clause may not contain another loop with a gang clause unless within a + // nested compute region. + + // If this is a 'combined' construct, search the list of existing clauses. + // Else we need to search the containing 'kernel'. + auto Collection = isOpenACCCombinedDirectiveKind(DK) + ? ExistingClauses + : S.getActiveComputeConstructInfo().Clauses; + + const auto *Itr = + llvm::find_if(Collection, llvm::IsaPred); + + if (Itr != Collection.end()) { + S.Diag(E->getBeginLoc(), diag::err_acc_num_arg_conflict) + << "num" << OpenACCClauseKind::Gang << DK + << HasAssocKind(DK, AssocKind) << AssocKind + << OpenACCClauseKind::NumGangs; + + S.Diag((*Itr)->getBeginLoc(), diag::note_acc_previous_clause_here); + return ExprError(); + } + return ExprResult{E}; + } + case OpenACCGangKind::Static: + return CheckGangStaticExpr(S, E); + return ExprError(); + } + llvm_unreachable("Unknown gang kind in gang kernels check"); +} + +ExprResult CheckGangSerialExpr(SemaOpenACC &S, OpenACCDirectiveKind DK, + OpenACCDirectiveKind AssocKind, + OpenACCGangKind GK, Expr *E) { + switch (GK) { + // 'dim' and 'num' don't really make sense on serial, and GCC rejects them + // too, so we disallow them too. + case OpenACCGangKind::Dim: + case OpenACCGangKind::Num: + return DiagIntArgInvalid(S, E, GK, OpenACCClauseKind::Gang, DK, AssocKind); + case OpenACCGangKind::Static: + return CheckGangStaticExpr(S, E); + } + llvm_unreachable("Unknown gang kind in gang serial check"); +} + +OpenACCClause *SemaOpenACCClauseVisitor::VisitVectorClause( + SemaOpenACC::OpenACCParsedClause &Clause) { + if (DiagIfSeqClause(Clause)) + return nullptr; + + // Restrictions only properly implemented on 'loop'/'combined' constructs, and + // it is the only construct that can do anything with this, so skip/treat as + // unimplemented for the routine constructs. 
+ if (!isDirectiveKindImplemented(Clause.getDirectiveKind())) + return isNotImplemented(); + + Expr *IntExpr = + Clause.getNumIntExprs() != 0 ? Clause.getIntExprs()[0] : nullptr; + if (IntExpr) { + if (!isOpenACCCombinedDirectiveKind(Clause.getDirectiveKind())) { + switch (SemaRef.getActiveComputeConstructInfo().Kind) { + case OpenACCDirectiveKind::Invalid: + case OpenACCDirectiveKind::Parallel: + // No restriction on when 'parallel' can contain an argument. + break; + case OpenACCDirectiveKind::Serial: + // GCC disallows this, and there is no real good reason for us to permit + // it, so disallow until we come up with a use case that makes sense. + DiagIntArgInvalid(SemaRef, IntExpr, "length", OpenACCClauseKind::Vector, + Clause.getDirectiveKind(), + SemaRef.getActiveComputeConstructInfo().Kind); + IntExpr = nullptr; + break; + case OpenACCDirectiveKind::Kernels: { + const auto *Itr = + llvm::find_if(SemaRef.getActiveComputeConstructInfo().Clauses, + llvm::IsaPred); + if (Itr != SemaRef.getActiveComputeConstructInfo().Clauses.end()) { + SemaRef.Diag(IntExpr->getBeginLoc(), diag::err_acc_num_arg_conflict) + << "length" << OpenACCClauseKind::Vector + << Clause.getDirectiveKind() + << HasAssocKind(Clause.getDirectiveKind(), + SemaRef.getActiveComputeConstructInfo().Kind) + << SemaRef.getActiveComputeConstructInfo().Kind + << OpenACCClauseKind::VectorLength; + SemaRef.Diag((*Itr)->getBeginLoc(), + diag::note_acc_previous_clause_here); + + IntExpr = nullptr; + } + break; + } + default: + llvm_unreachable("Non compute construct in active compute construct"); + } + } else { + if (Clause.getDirectiveKind() == OpenACCDirectiveKind::SerialLoop) { + DiagIntArgInvalid(SemaRef, IntExpr, "length", OpenACCClauseKind::Vector, + Clause.getDirectiveKind(), + SemaRef.getActiveComputeConstructInfo().Kind); + IntExpr = nullptr; + } else if (Clause.getDirectiveKind() == + OpenACCDirectiveKind::KernelsLoop) { + const auto *Itr = llvm::find_if( + ExistingClauses, llvm::IsaPred); + if 
(Itr != ExistingClauses.end()) { + SemaRef.Diag(IntExpr->getBeginLoc(), diag::err_acc_num_arg_conflict) + << "length" << OpenACCClauseKind::Vector + << Clause.getDirectiveKind() + << HasAssocKind(Clause.getDirectiveKind(), + SemaRef.getActiveComputeConstructInfo().Kind) + << SemaRef.getActiveComputeConstructInfo().Kind + << OpenACCClauseKind::VectorLength; + SemaRef.Diag((*Itr)->getBeginLoc(), + diag::note_acc_previous_clause_here); + + IntExpr = nullptr; + } + } + } + } + + if (!isOpenACCCombinedDirectiveKind(Clause.getDirectiveKind())) { + // OpenACC 3.3 2.9.4: The region of a loop with a 'vector' clause may not + // contain a loop with a gang, worker, or vector clause unless within a + // nested compute region. + if (SemaRef.LoopVectorClauseLoc.isValid()) { + // This handles the 'inner loop' diagnostic, but we cannot set that we're + // on one of these until we get to the end of the construct. + SemaRef.Diag(Clause.getBeginLoc(), diag::err_acc_clause_in_clause_region) + << OpenACCClauseKind::Vector << OpenACCClauseKind::Vector + << /*skip kernels construct info*/ 0; + SemaRef.Diag(SemaRef.LoopVectorClauseLoc, + diag::note_acc_previous_clause_here); + return nullptr; + } + } + + return OpenACCVectorClause::Create(Ctx, Clause.getBeginLoc(), + Clause.getLParenLoc(), IntExpr, + Clause.getEndLoc()); +} + +OpenACCClause *SemaOpenACCClauseVisitor::VisitWorkerClause( + SemaOpenACC::OpenACCParsedClause &Clause) { + if (DiagIfSeqClause(Clause)) + return nullptr; + + // Restrictions only properly implemented on 'loop'/'combined' constructs, and + // it is the only construct that can do anything with this, so skip/treat as + // unimplemented for the routine constructs. + if (!isDirectiveKindImplemented(Clause.getDirectiveKind())) + return isNotImplemented(); + + Expr *IntExpr = + Clause.getNumIntExprs() != 0 ? 
Clause.getIntExprs()[0] : nullptr; + + if (IntExpr) { + if (!isOpenACCCombinedDirectiveKind(Clause.getDirectiveKind())) { + switch (SemaRef.getActiveComputeConstructInfo().Kind) { + case OpenACCDirectiveKind::Invalid: + case OpenACCDirectiveKind::ParallelLoop: + case OpenACCDirectiveKind::SerialLoop: + case OpenACCDirectiveKind::Parallel: + case OpenACCDirectiveKind::Serial: + DiagIntArgInvalid(SemaRef, IntExpr, OpenACCGangKind::Num, + OpenACCClauseKind::Worker, Clause.getDirectiveKind(), + SemaRef.getActiveComputeConstructInfo().Kind); + IntExpr = nullptr; + break; + case OpenACCDirectiveKind::KernelsLoop: + case OpenACCDirectiveKind::Kernels: { + const auto *Itr = + llvm::find_if(SemaRef.getActiveComputeConstructInfo().Clauses, + llvm::IsaPred); + if (Itr != SemaRef.getActiveComputeConstructInfo().Clauses.end()) { + SemaRef.Diag(IntExpr->getBeginLoc(), diag::err_acc_num_arg_conflict) + << "num" << OpenACCClauseKind::Worker << Clause.getDirectiveKind() + << HasAssocKind(Clause.getDirectiveKind(), + SemaRef.getActiveComputeConstructInfo().Kind) + << SemaRef.getActiveComputeConstructInfo().Kind + << OpenACCClauseKind::NumWorkers; + SemaRef.Diag((*Itr)->getBeginLoc(), + diag::note_acc_previous_clause_here); + + IntExpr = nullptr; + } + break; + } + default: + llvm_unreachable("Non compute construct in active compute construct"); + } + } else { + if (Clause.getDirectiveKind() == OpenACCDirectiveKind::ParallelLoop || + Clause.getDirectiveKind() == OpenACCDirectiveKind::SerialLoop) { + DiagIntArgInvalid(SemaRef, IntExpr, OpenACCGangKind::Num, + OpenACCClauseKind::Worker, Clause.getDirectiveKind(), + SemaRef.getActiveComputeConstructInfo().Kind); + IntExpr = nullptr; + } else { + assert(Clause.getDirectiveKind() == OpenACCDirectiveKind::KernelsLoop && + "Unknown combined directive kind?"); + const auto *Itr = llvm::find_if(ExistingClauses, + llvm::IsaPred); + if (Itr != ExistingClauses.end()) { + SemaRef.Diag(IntExpr->getBeginLoc(), diag::err_acc_num_arg_conflict) + << 
"num" << OpenACCClauseKind::Worker << Clause.getDirectiveKind() + << HasAssocKind(Clause.getDirectiveKind(), + SemaRef.getActiveComputeConstructInfo().Kind) + << SemaRef.getActiveComputeConstructInfo().Kind + << OpenACCClauseKind::NumWorkers; + SemaRef.Diag((*Itr)->getBeginLoc(), + diag::note_acc_previous_clause_here); + + IntExpr = nullptr; + } + } + } + } + + if (!isOpenACCCombinedDirectiveKind(Clause.getDirectiveKind())) { + // OpenACC 3.3 2.9.3: The region of a loop with a 'worker' clause may not + // contain a loop with a gang or worker clause unless within a nested + // compute region. + if (SemaRef.LoopWorkerClauseLoc.isValid()) { + // This handles the 'inner loop' diagnostic, but we cannot set that we're + // on one of these until we get to the end of the construct. + SemaRef.Diag(Clause.getBeginLoc(), diag::err_acc_clause_in_clause_region) + << OpenACCClauseKind::Worker << OpenACCClauseKind::Worker + << /*skip kernels construct info*/ 0; + SemaRef.Diag(SemaRef.LoopWorkerClauseLoc, + diag::note_acc_previous_clause_here); + return nullptr; + } + + // OpenACC 3.3 2.9.4: The region of a loop with a 'vector' clause may not + // contain a loop with a gang, worker, or vector clause unless within a + // nested compute region. + if (SemaRef.LoopVectorClauseLoc.isValid()) { + // This handles the 'inner loop' diagnostic, but we cannot set that we're + // on one of these until we get to the end of the construct. 
+ SemaRef.Diag(Clause.getBeginLoc(), diag::err_acc_clause_in_clause_region) + << OpenACCClauseKind::Worker << OpenACCClauseKind::Vector + << /*skip kernels construct info*/ 0; + SemaRef.Diag(SemaRef.LoopVectorClauseLoc, + diag::note_acc_previous_clause_here); + return nullptr; + } + } + + return OpenACCWorkerClause::Create(Ctx, Clause.getBeginLoc(), + Clause.getLParenLoc(), IntExpr, + Clause.getEndLoc()); +} + +OpenACCClause *SemaOpenACCClauseVisitor::VisitGangClause( + SemaOpenACC::OpenACCParsedClause &Clause) { + if (DiagIfSeqClause(Clause)) + return nullptr; + + // Restrictions only properly implemented on 'loop' constructs, and it is + // the only construct that can do anything with this, so skip/treat as + // unimplemented for the combined constructs. + if (!isDirectiveKindImplemented(Clause.getDirectiveKind())) + return isNotImplemented(); + + // OpenACC 3.3 Section 2.9.11: A reduction clause may not appear on a loop + // directive that has a gang clause and is within a compute construct that has + // a num_gangs clause with more than one explicit argument. + if ((Clause.getDirectiveKind() == OpenACCDirectiveKind::Loop && + SemaRef.getActiveComputeConstructInfo().Kind != + OpenACCDirectiveKind::Invalid) || + isOpenACCCombinedDirectiveKind(Clause.getDirectiveKind())) { + // num_gangs clause on the active compute construct. + auto ActiveComputeConstructContainer = + isOpenACCCombinedDirectiveKind(Clause.getDirectiveKind()) + ? 
ExistingClauses + : SemaRef.getActiveComputeConstructInfo().Clauses; + auto *NumGangsClauseItr = llvm::find_if( + ActiveComputeConstructContainer, llvm::IsaPred); + + if (NumGangsClauseItr != ActiveComputeConstructContainer.end() && + cast(*NumGangsClauseItr)->getIntExprs().size() > + 1) { + auto *ReductionClauseItr = + llvm::find_if(ExistingClauses, llvm::IsaPred); + + if (ReductionClauseItr != ExistingClauses.end()) { + SemaRef.Diag(Clause.getBeginLoc(), + diag::err_acc_gang_reduction_numgangs_conflict) + << OpenACCClauseKind::Gang << OpenACCClauseKind::Reduction + << Clause.getDirectiveKind() + << isOpenACCCombinedDirectiveKind(Clause.getDirectiveKind()); + SemaRef.Diag((*ReductionClauseItr)->getBeginLoc(), + diag::note_acc_previous_clause_here); + SemaRef.Diag((*NumGangsClauseItr)->getBeginLoc(), + diag::note_acc_previous_clause_here); + return nullptr; + } + } + } + + llvm::SmallVector GangKinds; + llvm::SmallVector IntExprs; + + // Store the existing locations, so we can do duplicate checking. Index is + // the int-value of the OpenACCGangKind enum. + SourceLocation ExistingElemLoc[3]; + + for (unsigned I = 0; I < Clause.getIntExprs().size(); ++I) { + OpenACCGangKind GK = Clause.getGangKinds()[I]; + ExprResult ER = + SemaRef.CheckGangExpr(ExistingClauses, Clause.getDirectiveKind(), GK, + Clause.getIntExprs()[I]); + + if (!ER.isUsable()) + continue; + + // OpenACC 3.3 2.9: 'gang-arg-list' may have at most one num, one dim, and + // one static argument. 
+ if (ExistingElemLoc[static_cast(GK)].isValid()) { + SemaRef.Diag(ER.get()->getBeginLoc(), diag::err_acc_gang_multiple_elt) + << static_cast(GK); + SemaRef.Diag(ExistingElemLoc[static_cast(GK)], + diag::note_acc_previous_expr_here); + continue; + } + + ExistingElemLoc[static_cast(GK)] = ER.get()->getBeginLoc(); + GangKinds.push_back(GK); + IntExprs.push_back(ER.get()); + } + + if (!isOpenACCCombinedDirectiveKind(Clause.getDirectiveKind())) { + // OpenACC 3.3 2.9.2: When the parent compute construct is a kernels + // construct, the gang clause behaves as follows. ... The region of a loop + // with a gang clause may not contain another loop with a gang clause unless + // within a nested compute region. + if (SemaRef.LoopGangClauseOnKernel.Loc.isValid()) { + // This handles the 'inner loop' diagnostic, but we cannot set that we're + // on one of these until we get to the end of the construct. + SemaRef.Diag(Clause.getBeginLoc(), diag::err_acc_clause_in_clause_region) + << OpenACCClauseKind::Gang << OpenACCClauseKind::Gang + << /*kernels construct info*/ 1 + << SemaRef.LoopGangClauseOnKernel.DirKind; + SemaRef.Diag(SemaRef.LoopGangClauseOnKernel.Loc, + diag::note_acc_previous_clause_here); + return nullptr; + } + + // OpenACC 3.3 2.9.3: The region of a loop with a 'worker' clause may not + // contain a loop with a gang or worker clause unless within a nested + // compute region. + if (SemaRef.LoopWorkerClauseLoc.isValid()) { + // This handles the 'inner loop' diagnostic, but we cannot set that we're + // on one of these until we get to the end of the construct. 
+ SemaRef.Diag(Clause.getBeginLoc(), diag::err_acc_clause_in_clause_region) + << OpenACCClauseKind::Gang << OpenACCClauseKind::Worker + << /*!kernels construct info*/ 0; + SemaRef.Diag(SemaRef.LoopWorkerClauseLoc, + diag::note_acc_previous_clause_here); + return nullptr; + } + + // OpenACC 3.3 2.9.4: The region of a loop with a 'vector' clause may not + // contain a loop with a gang, worker, or vector clause unless within a + // nested compute region. + if (SemaRef.LoopVectorClauseLoc.isValid()) { + // This handles the 'inner loop' diagnostic, but we cannot set that we're + // on one of these until we get to the end of the construct. + SemaRef.Diag(Clause.getBeginLoc(), diag::err_acc_clause_in_clause_region) + << OpenACCClauseKind::Gang << OpenACCClauseKind::Vector + << /*!kernels construct info*/ 0; + SemaRef.Diag(SemaRef.LoopVectorClauseLoc, + diag::note_acc_previous_clause_here); + return nullptr; + } + } + + return SemaRef.CheckGangClause(Clause.getDirectiveKind(), ExistingClauses, + Clause.getBeginLoc(), Clause.getLParenLoc(), + GangKinds, IntExprs, Clause.getEndLoc()); +} + +OpenACCClause *SemaOpenACCClauseVisitor::VisitFinalizeClause( + SemaOpenACC::OpenACCParsedClause &Clause) { + // There isn't anything to do here, this is only valid on one construct, and + // has no associated rules. + return OpenACCFinalizeClause::Create(Ctx, Clause.getBeginLoc(), + Clause.getEndLoc()); +} + +OpenACCClause *SemaOpenACCClauseVisitor::VisitIfPresentClause( + SemaOpenACC::OpenACCParsedClause &Clause) { + // There isn't anything to do here, this is only valid on one construct, and + // has no associated rules. 
+  return OpenACCIfPresentClause::Create(Ctx, Clause.getBeginLoc(),
+                                        Clause.getEndLoc());
+}
+
+OpenACCClause *SemaOpenACCClauseVisitor::VisitSeqClause(
+    SemaOpenACC::OpenACCParsedClause &Clause) {
+  // Restrictions only properly implemented on 'loop' and combined constructs,
+  // and it is the only construct that can do anything with this, so skip/treat
+  // as unimplemented for the routine constructs.
+  if (!isDirectiveKindImplemented(Clause.getDirectiveKind()))
+    return isNotImplemented();
+
+  // OpenACC 3.3 2.9:
+  // Only one of the seq, independent, and auto clauses may appear.
+  const auto *Itr =
+      llvm::find_if(ExistingClauses,
+                    llvm::IsaPred<OpenACCAutoClause, OpenACCIndependentClause>);
+  if (Itr != ExistingClauses.end()) {
+    SemaRef.Diag(Clause.getBeginLoc(), diag::err_acc_loop_spec_conflict)
+        << Clause.getClauseKind() << Clause.getDirectiveKind();
+    SemaRef.Diag((*Itr)->getBeginLoc(), diag::note_acc_previous_clause_here);
+    return nullptr;
+  }
+
+  // OpenACC 3.3 2.9:
+  // A 'gang', 'worker', or 'vector' clause may not appear if a 'seq' clause
+  // appears.
+  Itr = llvm::find_if(ExistingClauses,
+                      llvm::IsaPred<OpenACCGangClause, OpenACCWorkerClause,
+                                    OpenACCVectorClause>);
+
+  if (Itr != ExistingClauses.end()) {
+    SemaRef.Diag(Clause.getBeginLoc(), diag::err_acc_clause_cannot_combine)
+        << Clause.getClauseKind() << (*Itr)->getClauseKind()
+        << Clause.getDirectiveKind();
+    SemaRef.Diag((*Itr)->getBeginLoc(), diag::note_acc_previous_clause_here);
+    return nullptr;
+  }
+
+  return OpenACCSeqClause::Create(Ctx, Clause.getBeginLoc(),
+                                  Clause.getEndLoc());
+}
+
+OpenACCClause *SemaOpenACCClauseVisitor::VisitReductionClause(
+    SemaOpenACC::OpenACCParsedClause &Clause) {
+  // OpenACC 3.3 Section 2.9.11: A reduction clause may not appear on a loop
+  // directive that has a gang clause and is within a compute construct that has
+  // a num_gangs clause with more than one explicit argument.
+  if ((Clause.getDirectiveKind() == OpenACCDirectiveKind::Loop &&
+       SemaRef.getActiveComputeConstructInfo().Kind !=
+           OpenACCDirectiveKind::Invalid) ||
+      isOpenACCCombinedDirectiveKind(Clause.getDirectiveKind())) {
+    // num_gangs clause on the active compute construct.
+    auto ActiveComputeConstructContainer =
+        isOpenACCCombinedDirectiveKind(Clause.getDirectiveKind())
+            ? ExistingClauses
+            : SemaRef.getActiveComputeConstructInfo().Clauses;
+    auto *NumGangsClauseItr = llvm::find_if(
+        ActiveComputeConstructContainer, llvm::IsaPred<OpenACCNumGangsClause>);
+
+    if (NumGangsClauseItr != ActiveComputeConstructContainer.end() &&
+        cast<OpenACCNumGangsClause>(*NumGangsClauseItr)->getIntExprs().size() >
+            1) {
+      auto *GangClauseItr =
+          llvm::find_if(ExistingClauses, llvm::IsaPred<OpenACCGangClause>);
+
+      if (GangClauseItr != ExistingClauses.end()) {
+        SemaRef.Diag(Clause.getBeginLoc(),
+                     diag::err_acc_gang_reduction_numgangs_conflict)
+            << OpenACCClauseKind::Reduction << OpenACCClauseKind::Gang
+            << Clause.getDirectiveKind()
+            << isOpenACCCombinedDirectiveKind(Clause.getDirectiveKind());
+        SemaRef.Diag((*GangClauseItr)->getBeginLoc(),
+                     diag::note_acc_previous_clause_here);
+        SemaRef.Diag((*NumGangsClauseItr)->getBeginLoc(),
+                     diag::note_acc_previous_clause_here);
+        return nullptr;
+      }
+    }
+  }
+
+  // OpenACC3.3 Section 2.9.11: If a variable is involved in a reduction that
+  // spans multiple nested loops where two or more of those loops have
+  // associated loop directives, a reduction clause containing that variable
+  // must appear on each of those loop directives.
+  //
+  // This can't really be implemented in the CFE, as this requires a level of
+  // reachability/usage analysis that we're not really wanting to get into.
+  // Additionally, I'm alerted that this restriction is one that the middle-end
+  // can just 'figure out' as an extension and isn't really necessary.
+  //
+  // OpenACC3.3 Section 2.9.11: Every 'var' in a reduction clause appearing on
+  // an orphaned loop construct must be private.
+ // + // This again is something we cannot really diagnose, as it requires we see + // all the uses/scopes of all variables referenced. The middle end/MLIR might + // be able to diagnose this. + + // OpenACC 3.3 Section 2.5.4: + // A reduction clause may not appear on a parallel construct with a + // num_gangs clause that has more than one argument. + if (Clause.getDirectiveKind() == OpenACCDirectiveKind::Parallel || + Clause.getDirectiveKind() == OpenACCDirectiveKind::ParallelLoop) { + auto NumGangsClauses = llvm::make_filter_range( + ExistingClauses, llvm::IsaPred); + + for (auto *NGC : NumGangsClauses) { + unsigned NumExprs = + cast(NGC)->getIntExprs().size(); + + if (NumExprs > 1) { + SemaRef.Diag(Clause.getBeginLoc(), + diag::err_acc_reduction_num_gangs_conflict) + << /*>1 arg in first loc=*/0 << Clause.getClauseKind() + << Clause.getDirectiveKind() << OpenACCClauseKind::NumGangs; + SemaRef.Diag(NGC->getBeginLoc(), diag::note_acc_previous_clause_here); + return nullptr; + } + } + } + + SmallVector ValidVars; + + for (Expr *Var : Clause.getVarList()) { + ExprResult Res = SemaRef.CheckReductionVar(Clause.getDirectiveKind(), + Clause.getReductionOp(), Var); + + if (Res.isUsable()) + ValidVars.push_back(Res.get()); + } + + return SemaRef.CheckReductionClause( + ExistingClauses, Clause.getDirectiveKind(), Clause.getBeginLoc(), + Clause.getLParenLoc(), Clause.getReductionOp(), ValidVars, + Clause.getEndLoc()); +} + +OpenACCClause *SemaOpenACCClauseVisitor::VisitCollapseClause( + SemaOpenACC::OpenACCParsedClause &Clause) { + // Duplicates here are not really sensible. We could possible permit + // multiples if they all had the same value, but there isn't really a good + // reason to do so. Also, this simplifies the suppression of duplicates, in + // that we know if we 'find' one after instantiation, that it is the same + // clause, which simplifies instantiation/checking/etc. 
+ if (checkAlreadyHasClauseOfKind(SemaRef, ExistingClauses, Clause)) + return nullptr; + + ExprResult LoopCount = SemaRef.CheckCollapseLoopCount(Clause.getLoopCount()); + + if (!LoopCount.isUsable()) + return nullptr; + + return OpenACCCollapseClause::Create(Ctx, Clause.getBeginLoc(), + Clause.getLParenLoc(), Clause.isForce(), + LoopCount.get(), Clause.getEndLoc()); +} + +// Return true if the two vars refer to the same variable, for the purposes of +// equality checking. +bool areVarsEqual(Expr *VarExpr1, Expr *VarExpr2) { + if (VarExpr1->isInstantiationDependent() || + VarExpr2->isInstantiationDependent()) + return false; + + VarExpr1 = VarExpr1->IgnoreParenCasts(); + VarExpr2 = VarExpr2->IgnoreParenCasts(); + + // Legal expressions can be: Scalar variable reference, sub-array, array + // element, or composite variable member. + + // Sub-array. + if (isa(VarExpr1)) { + auto *Expr2AS = dyn_cast(VarExpr2); + if (!Expr2AS) + return false; + + auto *Expr1AS = cast(VarExpr1); + + if (!areVarsEqual(Expr1AS->getBase(), Expr2AS->getBase())) + return false; + // We could possibly check to see if the ranges aren't overlapping, but it + // isn't clear that the rules allow this. + return true; + } + + // Array-element. + if (isa(VarExpr1)) { + auto *Expr2AS = dyn_cast(VarExpr2); + if (!Expr2AS) + return false; + + auto *Expr1AS = cast(VarExpr1); + + if (!areVarsEqual(Expr1AS->getBase(), Expr2AS->getBase())) + return false; + + // We could possibly check to see if the elements referenced aren't the + // same, but it isn't clear by reading of the standard that this is allowed + // (and that the 'var' refered to isn't the array). + return true; + } + + // Scalar variable reference, or composite variable. 
+  if (isa<DeclRefExpr>(VarExpr1)) {
+    auto *Expr2DRE = dyn_cast<DeclRefExpr>(VarExpr2);
+    if (!Expr2DRE)
+      return false;
+
+    auto *Expr1DRE = cast<DeclRefExpr>(VarExpr1);
+
+    return Expr1DRE->getDecl()->getMostRecentDecl() ==
+           Expr2DRE->getDecl()->getMostRecentDecl();
+  }
+
+  llvm_unreachable("Unknown variable type encountered");
+}
+} // namespace
+
+OpenACCClause *
+SemaOpenACC::ActOnClause(ArrayRef<const OpenACCClause *> ExistingClauses,
+                         OpenACCParsedClause &Clause) {
+  if (Clause.getClauseKind() == OpenACCClauseKind::Invalid)
+    return nullptr;
+
+  // Diagnose that we don't support this clause on this directive.
+  if (!doesClauseApplyToDirective(Clause.getDirectiveKind(),
+                                  Clause.getClauseKind())) {
+    Diag(Clause.getBeginLoc(), diag::err_acc_clause_appertainment)
+        << Clause.getDirectiveKind() << Clause.getClauseKind();
+    return nullptr;
+  }
+
+  if (const auto *DevTypeClause =
+          llvm::find_if(ExistingClauses,
+                        [&](const OpenACCClause *C) {
+                          return isa<OpenACCDeviceTypeClause>(C);
+                        });
+      DevTypeClause != ExistingClauses.end()) {
+    if (checkValidAfterDeviceType(
+            *this, *cast<OpenACCDeviceTypeClause>(*DevTypeClause), Clause))
+      return nullptr;
+  }
+
+  SemaOpenACCClauseVisitor Visitor{*this, ExistingClauses};
+  OpenACCClause *Result = Visitor.Visit(Clause);
+  assert((!Result || Result->getClauseKind() == Clause.getClauseKind()) &&
+         "Created wrong clause?");
+
+  if (Visitor.diagNotImplemented())
+    Diag(Clause.getBeginLoc(), diag::warn_acc_clause_unimplemented)
+        << Clause.getClauseKind();
+
+  return Result;
+
+}
+
+/// OpenACC 3.3 section 2.5.15:
+/// At a minimum, the supported data types include ... the numerical data types
+/// in C, C++, and Fortran.
+///
+/// If the reduction var is a composite variable, each
+/// member of the composite variable must be a supported datatype for the
+/// reduction operation.
+ExprResult SemaOpenACC::CheckReductionVar(OpenACCDirectiveKind DirectiveKind, + OpenACCReductionOperator ReductionOp, + Expr *VarExpr) { + VarExpr = VarExpr->IgnoreParenCasts(); + + auto TypeIsValid = [](QualType Ty) { + return Ty->isDependentType() || Ty->isScalarType(); + }; + + if (isa(VarExpr)) { + Expr *ASExpr = VarExpr; + QualType BaseTy = ArraySectionExpr::getBaseOriginalType(ASExpr); + QualType EltTy = getASTContext().getBaseElementType(BaseTy); + + if (!TypeIsValid(EltTy)) { + Diag(VarExpr->getExprLoc(), diag::err_acc_reduction_type) + << EltTy << /*Sub array base type*/ 1; + return ExprError(); + } + } else if (auto *RD = VarExpr->getType()->getAsRecordDecl()) { + if (!RD->isStruct() && !RD->isClass()) { + Diag(VarExpr->getExprLoc(), diag::err_acc_reduction_composite_type) + << /*not class or struct*/ 0 << VarExpr->getType(); + return ExprError(); + } + + if (!RD->isCompleteDefinition()) { + Diag(VarExpr->getExprLoc(), diag::err_acc_reduction_composite_type) + << /*incomplete*/ 1 << VarExpr->getType(); + return ExprError(); + } + if (const auto *CXXRD = dyn_cast(RD); + CXXRD && !CXXRD->isAggregate()) { + Diag(VarExpr->getExprLoc(), diag::err_acc_reduction_composite_type) + << /*aggregate*/ 2 << VarExpr->getType(); + return ExprError(); + } + + for (FieldDecl *FD : RD->fields()) { + if (!TypeIsValid(FD->getType())) { + Diag(VarExpr->getExprLoc(), + diag::err_acc_reduction_composite_member_type); + Diag(FD->getLocation(), diag::note_acc_reduction_composite_member_loc); + return ExprError(); + } + } + } else if (!TypeIsValid(VarExpr->getType())) { + Diag(VarExpr->getExprLoc(), diag::err_acc_reduction_type) + << VarExpr->getType() << /*Sub array base type*/ 0; + return ExprError(); + } + + // OpenACC3.3: 2.9.11: Reduction clauses on nested constructs for the same + // reduction 'var' must have the same reduction operator. 
+  if (!VarExpr->isInstantiationDependent()) {
+
+    for (const OpenACCReductionClause *RClause : ActiveReductionClauses) {
+      if (RClause->getReductionOp() == ReductionOp)
+        break;
+
+      for (Expr *OldVarExpr : RClause->getVarList()) {
+        if (OldVarExpr->isInstantiationDependent())
+          continue;
+
+        if (areVarsEqual(VarExpr, OldVarExpr)) {
+          Diag(VarExpr->getExprLoc(), diag::err_reduction_op_mismatch)
+              << ReductionOp << RClause->getReductionOp();
+          Diag(OldVarExpr->getExprLoc(), diag::note_acc_previous_clause_here);
+          return ExprError();
+        }
+      }
+    }
+  }
+
+  return VarExpr;
+}
+
+ExprResult SemaOpenACC::CheckTileSizeExpr(Expr *SizeExpr) {
+  if (!SizeExpr)
+    return ExprError();
+
+  assert((SizeExpr->isInstantiationDependent() ||
+          SizeExpr->getType()->isIntegerType()) &&
+         "size argument non integer?");
+
+  // If dependent, or an asterisk, the expression is fine.
+  if (SizeExpr->isInstantiationDependent() ||
+      isa<OpenACCAsteriskSizeExpr>(SizeExpr))
+    return ExprResult{SizeExpr};
+
+  std::optional<llvm::APSInt> ICE =
+      SizeExpr->getIntegerConstantExpr(getASTContext());
+
+  // OpenACC 3.3 2.9.8
+  // where each tile size is a constant positive integer expression or asterisk.
+  if (!ICE || *ICE <= 0) {
+    Diag(SizeExpr->getBeginLoc(), diag::err_acc_size_expr_value)
+        << ICE.has_value() << ICE.value_or(llvm::APSInt{}).getExtValue();
+    return ExprError();
+  }
+
+  return ExprResult{
+      ConstantExpr::Create(getASTContext(), SizeExpr, APValue{*ICE})};
+}
+
+ExprResult SemaOpenACC::CheckCollapseLoopCount(Expr *LoopCount) {
+  if (!LoopCount)
+    return ExprError();
+
+  assert((LoopCount->isInstantiationDependent() ||
+          LoopCount->getType()->isIntegerType()) &&
+         "Loop argument non integer?");
+
+  // If this is dependent, there really isn't anything we can check.
+ if (LoopCount->isInstantiationDependent()) + return ExprResult{LoopCount}; + + std::optional ICE = + LoopCount->getIntegerConstantExpr(getASTContext()); + + // OpenACC 3.3: 2.9.1 + // The argument to the collapse clause must be a constant positive integer + // expression. + if (!ICE || *ICE <= 0) { + Diag(LoopCount->getBeginLoc(), diag::err_acc_collapse_loop_count) + << ICE.has_value() << ICE.value_or(llvm::APSInt{}).getExtValue(); + return ExprError(); + } + + return ExprResult{ + ConstantExpr::Create(getASTContext(), LoopCount, APValue{*ICE})}; +} + +ExprResult +SemaOpenACC::CheckGangExpr(ArrayRef ExistingClauses, + OpenACCDirectiveKind DK, OpenACCGangKind GK, + Expr *E) { + // There are two cases for the enforcement here: the 'current' directive is a + // 'loop', where we need to check the active compute construct kind, or the + // current directive is a 'combined' construct, where we have to check the + // current one. + switch (DK) { + case OpenACCDirectiveKind::ParallelLoop: + return CheckGangParallelExpr(*this, DK, ActiveComputeConstructInfo.Kind, GK, + E); + case OpenACCDirectiveKind::SerialLoop: + return CheckGangSerialExpr(*this, DK, ActiveComputeConstructInfo.Kind, GK, + E); + case OpenACCDirectiveKind::KernelsLoop: + return CheckGangKernelsExpr(*this, ExistingClauses, DK, + ActiveComputeConstructInfo.Kind, GK, E); + case OpenACCDirectiveKind::Loop: + switch (ActiveComputeConstructInfo.Kind) { + case OpenACCDirectiveKind::Invalid: + case OpenACCDirectiveKind::Parallel: + case OpenACCDirectiveKind::ParallelLoop: + return CheckGangParallelExpr(*this, DK, ActiveComputeConstructInfo.Kind, + GK, E); + case OpenACCDirectiveKind::SerialLoop: + case OpenACCDirectiveKind::Serial: + return CheckGangSerialExpr(*this, DK, ActiveComputeConstructInfo.Kind, GK, + E); + case OpenACCDirectiveKind::KernelsLoop: + case OpenACCDirectiveKind::Kernels: + return CheckGangKernelsExpr(*this, ExistingClauses, DK, + ActiveComputeConstructInfo.Kind, GK, E); + default: + 
llvm_unreachable("Non compute construct in active compute construct?"); + } + default: + // TODO: OpenACC: when we implement this on 'routine', we'll have to + // implement its checking here. + llvm_unreachable("Invalid directive kind for a Gang clause"); + } + llvm_unreachable("Compute construct directive not handled?"); +} + +OpenACCClause * +SemaOpenACC::CheckGangClause(OpenACCDirectiveKind DirKind, + ArrayRef ExistingClauses, + SourceLocation BeginLoc, SourceLocation LParenLoc, + ArrayRef GangKinds, + ArrayRef IntExprs, SourceLocation EndLoc) { + // OpenACC 3.3 2.9.11: A reduction clause may not appear on a loop directive + // that has a gang clause with a dim: argument whose value is greater than 1. + + const auto *ReductionItr = + llvm::find_if(ExistingClauses, llvm::IsaPred); + + if (ReductionItr != ExistingClauses.end()) { + const auto GangZip = llvm::zip_equal(GangKinds, IntExprs); + const auto GangItr = llvm::find_if(GangZip, [](const auto &Tuple) { + return std::get<0>(Tuple) == OpenACCGangKind::Dim; + }); + + if (GangItr != GangZip.end()) { + const Expr *DimExpr = std::get<1>(*GangItr); + + assert( + (DimExpr->isInstantiationDependent() || isa(DimExpr)) && + "Improperly formed gang argument"); + if (const auto *DimVal = dyn_cast(DimExpr); + DimVal && DimVal->getResultAsAPSInt() > 1) { + Diag(DimVal->getBeginLoc(), diag::err_acc_gang_reduction_conflict) + << /*gang/reduction=*/0 << DirKind; + Diag((*ReductionItr)->getBeginLoc(), + diag::note_acc_previous_clause_here); + return nullptr; + } + } + } + + return OpenACCGangClause::Create(getASTContext(), BeginLoc, LParenLoc, + GangKinds, IntExprs, EndLoc); +} + +OpenACCClause *SemaOpenACC::CheckReductionClause( + ArrayRef ExistingClauses, + OpenACCDirectiveKind DirectiveKind, SourceLocation BeginLoc, + SourceLocation LParenLoc, OpenACCReductionOperator ReductionOp, + ArrayRef Vars, SourceLocation EndLoc) { + if (DirectiveKind == OpenACCDirectiveKind::Loop || + isOpenACCCombinedDirectiveKind(DirectiveKind)) { 
+ // OpenACC 3.3 2.9.11: A reduction clause may not appear on a loop directive + // that has a gang clause with a dim: argument whose value is greater + // than 1. + const auto GangClauses = llvm::make_filter_range( + ExistingClauses, llvm::IsaPred); + + for (auto *GC : GangClauses) { + const auto *GangClause = cast(GC); + for (unsigned I = 0; I < GangClause->getNumExprs(); ++I) { + std::pair EPair = GangClause->getExpr(I); + if (EPair.first != OpenACCGangKind::Dim) + continue; + + if (const auto *DimVal = dyn_cast(EPair.second); + DimVal && DimVal->getResultAsAPSInt() > 1) { + Diag(BeginLoc, diag::err_acc_gang_reduction_conflict) + << /*reduction/gang=*/1 << DirectiveKind; + Diag(GangClause->getBeginLoc(), diag::note_acc_previous_clause_here); + return nullptr; + } + } + } + } + + auto *Ret = OpenACCReductionClause::Create( + getASTContext(), BeginLoc, LParenLoc, ReductionOp, Vars, EndLoc); + return Ret; +} diff --git a/clang/lib/Sema/SemaSPIRV.cpp b/clang/lib/Sema/SemaSPIRV.cpp index d2de64826c6eb..dc49fc7907357 100644 --- a/clang/lib/Sema/SemaSPIRV.cpp +++ b/clang/lib/Sema/SemaSPIRV.cpp @@ -51,6 +51,24 @@ bool SemaSPIRV::CheckSPIRVBuiltinFunctionCall(unsigned BuiltinID, TheCall->setType(RetTy); break; } + case SPIRV::BI__builtin_spirv_length: { + if (SemaRef.checkArgCount(TheCall, 1)) + return true; + ExprResult A = TheCall->getArg(0); + QualType ArgTyA = A.get()->getType(); + auto *VTy = ArgTyA->getAs(); + if (VTy == nullptr) { + SemaRef.Diag(A.get()->getBeginLoc(), + diag::err_typecheck_convert_incompatible) + << ArgTyA + << SemaRef.Context.getVectorType(ArgTyA, 2, VectorKind::Generic) << 1 + << 0 << 0; + return true; + } + QualType RetTy = VTy->getElementType(); + TheCall->setType(RetTy); + break; + } } return false; } diff --git a/clang/lib/Sema/SemaTemplate.cpp b/clang/lib/Sema/SemaTemplate.cpp index ce672b00893b0..50b479052a25f 100644 --- a/clang/lib/Sema/SemaTemplate.cpp +++ b/clang/lib/Sema/SemaTemplate.cpp @@ -2145,7 +2145,6 @@ DeclResult 
Sema::CheckClassTemplate( NewClass->startDefinition(); ProcessDeclAttributeList(S, NewClass, Attr); - ProcessAPINotes(NewClass); if (PrevClassTemplate) mergeDeclAttributes(NewClass, PrevClassTemplate->getTemplatedDecl()); diff --git a/clang/lib/Sema/SemaTemplateDeductionGuide.cpp b/clang/lib/Sema/SemaTemplateDeductionGuide.cpp index d42c3765aa534..5f813ba3a597a 100644 --- a/clang/lib/Sema/SemaTemplateDeductionGuide.cpp +++ b/clang/lib/Sema/SemaTemplateDeductionGuide.cpp @@ -996,7 +996,7 @@ BuildDeductionGuideForTypeAlias(Sema &SemaRef, F->getTemplateParameters()->size()); // FIXME: DeduceTemplateArguments stops immediately at the first - // non-deducible template argument. However, this doesn't seem to casue + // non-deducible template argument. However, this doesn't seem to cause // issues for practice cases, we probably need to extend it to continue // performing deduction for rest of arguments to align with the C++ // standard. @@ -1053,25 +1053,6 @@ BuildDeductionGuideForTypeAlias(Sema &SemaRef, TransformedDeducedAliasArgs[AliasTemplateParamIdx] = NewTemplateArgument; } unsigned FirstUndeducedParamIdx = FPrimeTemplateParams.size(); - // ...followed by the template parameters of f that were not deduced - // (including their default template arguments) - for (unsigned FTemplateParamIdx : NonDeducedTemplateParamsInFIndex) { - auto *TP = F->getTemplateParameters()->getParam(FTemplateParamIdx); - MultiLevelTemplateArgumentList Args; - Args.setKind(TemplateSubstitutionKind::Rewrite); - // We take a shortcut here, it is ok to reuse the - // TemplateArgsForBuildingFPrime. 
- Args.addOuterTemplateArguments(TemplateArgsForBuildingFPrime); - NamedDecl *NewParam = transformTemplateParameter( - SemaRef, F->getDeclContext(), TP, Args, FPrimeTemplateParams.size(), - getDepthAndIndex(TP).first); - FPrimeTemplateParams.push_back(NewParam); - - assert(TemplateArgsForBuildingFPrime[FTemplateParamIdx].isNull() && - "The argument must be null before setting"); - TemplateArgsForBuildingFPrime[FTemplateParamIdx] = - Context.getInjectedTemplateArg(NewParam); - } // To form a deduction guide f' from f, we leverage clang's instantiation // mechanism, we construct a template argument list where the template @@ -1080,24 +1061,21 @@ BuildDeductionGuideForTypeAlias(Sema &SemaRef, // f, this ensures all template parameter occurrences are updated // correctly. // - // The template argument list is formed from the `DeducedArgs`, two parts: - // 1) appeared template parameters of alias: transfrom the deduced - // template argument; - // 2) non-deduced template parameters of f: rebuild a - // template argument; + // The template argument list is formed, in order, from + // 1) For the template parameters of the alias, the corresponding deduced + // template arguments + // 2) For the non-deduced template parameters of f. the + // (rebuilt) template arguments corresponding. // - // 2) has been built already (when rebuilding the new template - // parameters), we now perform 1). + // Note: the non-deduced template arguments of `f` might refer to arguments + // deduced in 1), as in a type constraint. MultiLevelTemplateArgumentList Args; Args.setKind(TemplateSubstitutionKind::Rewrite); Args.addOuterTemplateArguments(TransformedDeducedAliasArgs); for (unsigned Index = 0; Index < DeduceResults.size(); ++Index) { const auto &D = DeduceResults[Index]; if (D.isNull()) { - // 2): Non-deduced template parameter has been built already. 
- assert(!TemplateArgsForBuildingFPrime[Index].isNull() && - "template arguments for non-deduced template parameters should " - "be been set!"); + // 2): Non-deduced template parameters would be substituted later. continue; } TemplateArgumentLoc Input = @@ -1110,6 +1088,27 @@ BuildDeductionGuideForTypeAlias(Sema &SemaRef, } } + // Case 2) + // ...followed by the template parameters of f that were not deduced + // (including their default template arguments) + for (unsigned FTemplateParamIdx : NonDeducedTemplateParamsInFIndex) { + auto *TP = F->getTemplateParameters()->getParam(FTemplateParamIdx); + MultiLevelTemplateArgumentList Args; + Args.setKind(TemplateSubstitutionKind::Rewrite); + // We take a shortcut here, it is ok to reuse the + // TemplateArgsForBuildingFPrime. + Args.addOuterTemplateArguments(TemplateArgsForBuildingFPrime); + NamedDecl *NewParam = transformTemplateParameter( + SemaRef, F->getDeclContext(), TP, Args, FPrimeTemplateParams.size(), + getDepthAndIndex(TP).first); + FPrimeTemplateParams.push_back(NewParam); + + assert(TemplateArgsForBuildingFPrime[FTemplateParamIdx].isNull() && + "The argument must be null before setting"); + TemplateArgsForBuildingFPrime[FTemplateParamIdx] = + Context.getInjectedTemplateArg(NewParam); + } + auto *TemplateArgListForBuildingFPrime = TemplateArgumentList::CreateCopy(Context, TemplateArgsForBuildingFPrime); // Form the f' by substituting the template arguments into f. 
diff --git a/clang/lib/Sema/SemaTemplateInstantiate.cpp b/clang/lib/Sema/SemaTemplateInstantiate.cpp index fb0f38df62a74..839c4e8a28220 100644 --- a/clang/lib/Sema/SemaTemplateInstantiate.cpp +++ b/clang/lib/Sema/SemaTemplateInstantiate.cpp @@ -4661,7 +4661,7 @@ void LocalInstantiationScope::InstantiatedLocal(const Decl *D, Decl *Inst) { } #endif Stored = Inst; - } else if (DeclArgumentPack *Pack = Stored.dyn_cast()) { + } else if (DeclArgumentPack *Pack = dyn_cast(Stored)) { Pack->push_back(cast(Inst)); } else { assert(cast(Stored) == Inst && "Already instantiated this local"); diff --git a/clang/lib/Sema/SemaX86.cpp b/clang/lib/Sema/SemaX86.cpp index 0b4b78c5b15dc..fd1a6017712d2 100644 --- a/clang/lib/Sema/SemaX86.cpp +++ b/clang/lib/Sema/SemaX86.cpp @@ -641,8 +641,8 @@ bool SemaX86::CheckBuiltinTileArguments(unsigned BuiltinID, CallExpr *TheCall) { case X86::BI__builtin_ia32_t2rpntlvwz1rs: case X86::BI__builtin_ia32_t2rpntlvwz1rst1: case X86::BI__builtin_ia32_t2rpntlvwz0rs: - case X86::BI__builtin_ia32_tcvtrowps2pbf16h: - case X86::BI__builtin_ia32_tcvtrowps2pbf16l: + case X86::BI__builtin_ia32_tcvtrowps2bf16h: + case X86::BI__builtin_ia32_tcvtrowps2bf16l: case X86::BI__builtin_ia32_tcvtrowps2phh: case X86::BI__builtin_ia32_tcvtrowps2phl: case X86::BI__builtin_ia32_tcvtrowd2ps: diff --git a/clang/lib/Serialization/ASTReader.cpp b/clang/lib/Serialization/ASTReader.cpp index 06853a227215e..a72ff766685bb 100644 --- a/clang/lib/Serialization/ASTReader.cpp +++ b/clang/lib/Serialization/ASTReader.cpp @@ -1425,10 +1425,9 @@ bool ASTReader::ReadLexicalDeclContextStorage(ModuleFile &M, return false; } -bool ASTReader::ReadVisibleDeclContextStorage(ModuleFile &M, - BitstreamCursor &Cursor, - uint64_t Offset, GlobalDeclID ID, - bool IsModuleLocal) { +bool ASTReader::ReadVisibleDeclContextStorage( + ModuleFile &M, BitstreamCursor &Cursor, uint64_t Offset, GlobalDeclID ID, + ASTReader::VisibleDeclContextStorageKind VisibleKind) { assert(Offset != 0); SavedStreamPosition 
SavedPosition(Cursor); @@ -1452,22 +1451,42 @@ bool ASTReader::ReadVisibleDeclContextStorage(ModuleFile &M, return true; } unsigned RecCode = MaybeRecCode.get(); - if (!IsModuleLocal && RecCode != DECL_CONTEXT_VISIBLE) { - Error("Expected visible lookup table block"); - return true; - } - if (IsModuleLocal && RecCode != DECL_CONTEXT_MODULE_LOCAL_VISIBLE) { - Error("Expected module local visible lookup table block"); - return true; + switch (VisibleKind) { + case VisibleDeclContextStorageKind::GenerallyVisible: + if (RecCode != DECL_CONTEXT_VISIBLE) { + Error("Expected visible lookup table block"); + return true; + } + break; + case VisibleDeclContextStorageKind::ModuleLocalVisible: + if (RecCode != DECL_CONTEXT_MODULE_LOCAL_VISIBLE) { + Error("Expected module local visible lookup table block"); + return true; + } + break; + case VisibleDeclContextStorageKind::TULocalVisible: + if (RecCode != DECL_CONTEXT_TU_LOCAL_VISIBLE) { + Error("Expected TU local lookup table block"); + return true; + } + break; } // We can't safely determine the primary context yet, so delay attaching the // lookup table until we're done with recursive deserialization. 
auto *Data = (const unsigned char*)Blob.data(); - if (!IsModuleLocal) + switch (VisibleKind) { + case VisibleDeclContextStorageKind::GenerallyVisible: PendingVisibleUpdates[ID].push_back(UpdateData{&M, Data}); - else + break; + case VisibleDeclContextStorageKind::ModuleLocalVisible: PendingModuleLocalVisibleUpdates[ID].push_back(UpdateData{&M, Data}); + break; + case VisibleDeclContextStorageKind::TULocalVisible: + if (M.Kind == MK_MainFile) + TULocalUpdates[ID].push_back(UpdateData{&M, Data}); + break; + } return false; } @@ -3613,6 +3632,21 @@ llvm::Error ASTReader::ReadASTBlock(ModuleFile &F, break; } + case UPDATE_TU_LOCAL_VISIBLE: { + if (F.Kind != MK_MainFile) + break; + unsigned Idx = 0; + GlobalDeclID ID = ReadDeclID(F, Record, Idx); + auto *Data = (const unsigned char *)Blob.data(); + TULocalUpdates[ID].push_back(UpdateData{&F, Data}); + // If we've already loaded the decl, perform the updates when we finish + // loading this block. + if (Decl *D = GetExistingDecl(ID)) + PendingUpdateRecords.push_back( + PendingUpdateRecord(ID, D, /*JustLoaded=*/false)); + break; + } + case CXX_ADDED_TEMPLATE_SPECIALIZATION: { unsigned Idx = 0; GlobalDeclID ID = ReadDeclID(F, Record, Idx); @@ -3717,6 +3751,7 @@ llvm::Error ASTReader::ReadASTBlock(ModuleFile &F, TotalLexicalDeclContexts += Record[2]; TotalVisibleDeclContexts += Record[3]; TotalModuleLocalVisibleDeclContexts += Record[4]; + TotalTULocalVisibleDeclContexts += Record[5]; break; case UNUSED_FILESCOPED_DECLS: @@ -4002,7 +4037,7 @@ llvm::Error ASTReader::ReadASTBlock(ModuleFile &F, break; case DELAYED_NAMESPACE_LEXICAL_VISIBLE_RECORD: { - if (Record.size() % 4 != 0) + if (Record.size() % 5 != 0) return llvm::createStringError( std::errc::illegal_byte_sequence, "invalid DELAYED_NAMESPACE_LEXICAL_VISIBLE_RECORD block in AST " @@ -4021,9 +4056,12 @@ llvm::Error ASTReader::ReadASTBlock(ModuleFile &F, uint64_t LocalModuleLocalOffset = Record[I++]; uint64_t ModuleLocalOffset = LocalModuleLocalOffset ? 
BaseOffset + LocalModuleLocalOffset : 0; + uint64_t TULocalLocalOffset = Record[I++]; + uint64_t TULocalOffset = + TULocalLocalOffset ? BaseOffset + TULocalLocalOffset : 0; DelayedNamespaceOffsetMap[ID] = {LexicalOffset, VisibleOffset, - ModuleLocalOffset}; + ModuleLocalOffset, TULocalOffset}; assert(!GetExistingDecl(ID) && "We shouldn't load the namespace in the front of delayed " @@ -8437,7 +8475,7 @@ void ASTReader::FindFileRegionDecls(FileID File, bool ASTReader::FindExternalVisibleDeclsByName(const DeclContext *DC, DeclarationName Name, - Module *NamedModule) { + const DeclContext *OriginalDC) { assert(DC->hasExternalVisibleStorage() && DC == DC->getPrimaryContext() && "DeclContext has no visible decls in storage"); if (!Name) @@ -8460,7 +8498,9 @@ bool ASTReader::FindExternalVisibleDeclsByName(const DeclContext *DC, } } - if (NamedModule) { + if (auto *NamedModule = + OriginalDC ? cast(OriginalDC)->getTopLevelOwningNamedModule() + : nullptr) { if (auto It = ModuleLocalLookups.find(DC); It != ModuleLocalLookups.end()) { ++NumModuleLocalVisibleDeclContexts; for (GlobalDeclID ID : It->second.Table.find({Name, NamedModule})) { @@ -8471,6 +8511,15 @@ bool ASTReader::FindExternalVisibleDeclsByName(const DeclContext *DC, } } + if (auto It = TULocalLookups.find(DC); It != TULocalLookups.end()) { + ++NumTULocalVisibleDeclContexts; + for (GlobalDeclID ID : It->second.Table.find(Name)) { + NamedDecl *ND = cast(GetDecl(ID)); + if (ND->getDeclName() == Name && Found.insert(ND).second) + Decls.push_back(ND); + } + } + SetExternalVisibleDeclsForName(DC, Name, Decls); return !Decls.empty(); } @@ -8498,6 +8547,7 @@ void ASTReader::completeVisibleDeclsMap(const DeclContext *DC) { findAll(Lookups, NumVisibleDeclContextsRead); findAll(ModuleLocalLookups, NumModuleLocalVisibleDeclContexts); + findAll(TULocalLookups, NumTULocalVisibleDeclContexts); for (DeclsMap::iterator I = Decls.begin(), E = Decls.end(); I != E; ++I) { SetExternalVisibleDeclsForName(DC, I->first, I->second); @@ 
-8517,6 +8567,12 @@ ASTReader::getModuleLocalLookupTables(DeclContext *Primary) const { return I == ModuleLocalLookups.end() ? nullptr : &I->second; } +const serialization::reader::DeclContextLookupTable * +ASTReader::getTULocalLookupTables(DeclContext *Primary) const { + auto I = TULocalLookups.find(Primary); + return I == TULocalLookups.end() ? nullptr : &I->second; +} + serialization::reader::LazySpecializationInfoLookupTable * ASTReader::getLoadedSpecializationsLookupTables(const Decl *D, bool IsPartial) { assert(D->isCanonicalDecl()); @@ -8632,6 +8688,11 @@ void ASTReader::PrintStats() { NumModuleLocalVisibleDeclContexts, TotalModuleLocalVisibleDeclContexts, ((float)NumModuleLocalVisibleDeclContexts / TotalModuleLocalVisibleDeclContexts * 100)); + if (TotalTULocalVisibleDeclContexts) + std::fprintf(stderr, " %u/%u visible declcontexts in GMF read (%f%%)\n", + NumTULocalVisibleDeclContexts, TotalTULocalVisibleDeclContexts, + ((float)NumTULocalVisibleDeclContexts / + TotalTULocalVisibleDeclContexts * 100)); if (TotalNumMethodPoolEntries) std::fprintf(stderr, " %u/%u method pool entries read (%f%%)\n", NumMethodPoolEntriesRead, TotalNumMethodPoolEntries, @@ -10178,6 +10239,11 @@ void ASTReader::finishPendingActions() { } PendingDeducedVarTypes.clear(); + // Load the delayed preferred name attributes. + for (unsigned I = 0; I != PendingDeferredAttributes.size(); ++I) + loadDeferredAttribute(PendingDeferredAttributes[I]); + PendingDeferredAttributes.clear(); + // For each decl chain that we wanted to complete while deserializing, mark // it as "still needs to be completed". 
for (unsigned I = 0; I != PendingIncompleteDeclChains.size(); ++I) { diff --git a/clang/lib/Serialization/ASTReaderDecl.cpp b/clang/lib/Serialization/ASTReaderDecl.cpp index 1c51a7b5e460f..de834285fa76b 100644 --- a/clang/lib/Serialization/ASTReaderDecl.cpp +++ b/clang/lib/Serialization/ASTReaderDecl.cpp @@ -414,7 +414,8 @@ class ASTDeclReader : public DeclVisitor { void VisitLifetimeExtendedTemporaryDecl(LifetimeExtendedTemporaryDecl *D); void VisitDeclContext(DeclContext *DC, uint64_t &LexicalOffset, - uint64_t &VisibleOffset, uint64_t &ModuleLocalOffset); + uint64_t &VisibleOffset, uint64_t &ModuleLocalOffset, + uint64_t &TULocalOffset); template RedeclarableResult VisitRedeclarable(Redeclarable *D); @@ -612,7 +613,7 @@ void ASTDeclReader::VisitDecl(Decl *D) { if (HasAttrs) { AttrVec Attrs; - Record.readAttributes(Attrs); + Record.readAttributes(Attrs, D); // Avoid calling setAttrs() directly because it uses Decl::getASTContext() // internally which is unsafe during derialization. D->setAttrsImpl(Attrs, Reader.getContext()); @@ -1859,7 +1860,9 @@ void ASTDeclReader::VisitHLSLBufferDecl(HLSLBufferDecl *D) { uint64_t LexicalOffset = 0; uint64_t VisibleOffset = 0; uint64_t ModuleLocalOffset = 0; - VisitDeclContext(D, LexicalOffset, VisibleOffset, ModuleLocalOffset); + uint64_t TULocalOffset = 0; + VisitDeclContext(D, LexicalOffset, VisibleOffset, ModuleLocalOffset, + TULocalOffset); D->IsCBuffer = Record.readBool(); D->KwLoc = readSourceLocation(); D->LBraceLoc = readSourceLocation(); @@ -2770,10 +2773,12 @@ void ASTDeclReader::VisitLifetimeExtendedTemporaryDecl( void ASTDeclReader::VisitDeclContext(DeclContext *DC, uint64_t &LexicalOffset, uint64_t &VisibleOffset, - uint64_t &ModuleLocalOffset) { + uint64_t &ModuleLocalOffset, + uint64_t &TULocalOffset) { LexicalOffset = ReadLocalOffset(); VisibleOffset = ReadLocalOffset(); ModuleLocalOffset = ReadLocalOffset(); + TULocalOffset = ReadLocalOffset(); } template @@ -3093,6 +3098,8 @@ class AttrReader { return 
Reader.readInt(); } + uint64_t peekInts(unsigned N) { return Reader.peekInts(N); } + bool readBool() { return Reader.readBool(); } SourceRange readSourceRange() { @@ -3123,18 +3130,29 @@ class AttrReader { return Reader.readVersionTuple(); } + void skipInt() { Reader.skipInts(1); } + + void skipInts(unsigned N) { Reader.skipInts(N); } + + unsigned getCurrentIdx() { return Reader.getIdx(); } + OMPTraitInfo *readOMPTraitInfo() { return Reader.readOMPTraitInfo(); } template T *readDeclAs() { return Reader.readDeclAs(); } }; } +/// Reads one attribute from the current stream position, advancing Idx. Attr *ASTRecordReader::readAttr() { AttrReader Record(*this); auto V = Record.readInt(); if (!V) return nullptr; + // Read and ignore the skip count, since attribute deserialization is not + // deferred on this pass. + Record.skipInt(); + Attr *New = nullptr; // Kind is stored as a 1-based integer because 0 is used to indicate a null // Attr pointer. @@ -3164,13 +3182,28 @@ Attr *ASTRecordReader::readAttr() { return New; } -/// Reads attributes from the current stream position. -void ASTRecordReader::readAttributes(AttrVec &Attrs) { +/// Reads attributes from the current stream position, advancing Idx. +/// For some attributes (where type depends on itself recursively), defer +/// reading the attribute until the type has been read. +void ASTRecordReader::readAttributes(AttrVec &Attrs, Decl *D) { for (unsigned I = 0, E = readInt(); I != E; ++I) - if (auto *A = readAttr()) + if (auto *A = readOrDeferAttrFor(D)) Attrs.push_back(A); } +/// Reads one attribute from the current stream position, advancing Idx. +/// For some attributes (where type depends on itself recursively), defer +/// reading the attribute until the type has been read. 
+Attr *ASTRecordReader::readOrDeferAttrFor(Decl *D) { + AttrReader Record(*this); + unsigned SkipCount = Record.peekInts(1); + if (!SkipCount) + return readAttr(); + Reader->PendingDeferredAttributes.push_back({Record.getCurrentIdx(), D}); + Record.skipInts(SkipCount); + return nullptr; +} + //===----------------------------------------------------------------------===// // ASTReader Implementation //===----------------------------------------------------------------------===// @@ -3875,6 +3908,7 @@ Decl *ASTReader::ReadDeclRecord(GlobalDeclID ID) { case DECL_CONTEXT_LEXICAL: case DECL_CONTEXT_VISIBLE: case DECL_CONTEXT_MODULE_LOCAL_VISIBLE: + case DECL_CONTEXT_TU_LOCAL_VISIBLE: case DECL_SPECIALIZATIONS: case DECL_PARTIAL_SPECIALIZATIONS: llvm_unreachable("Record cannot be de-serialized with readDeclRecord"); @@ -4185,9 +4219,10 @@ Decl *ASTReader::ReadDeclRecord(GlobalDeclID ID) { uint64_t LexicalOffset = 0; uint64_t VisibleOffset = 0; uint64_t ModuleLocalOffset = 0; + uint64_t TULocalOffset = 0; - Reader.VisitDeclContext(DC, LexicalOffset, VisibleOffset, - ModuleLocalOffset); + Reader.VisitDeclContext(DC, LexicalOffset, VisibleOffset, ModuleLocalOffset, + TULocalOffset); // Get the lexical and visible block for the delayed namespace. // It is sufficient to judge if ID is in DelayedNamespaceOffsetMap. 
@@ -4199,18 +4234,24 @@ Decl *ASTReader::ReadDeclRecord(GlobalDeclID ID) { LexicalOffset = Iter->second.LexicalOffset; VisibleOffset = Iter->second.VisibleOffset; ModuleLocalOffset = Iter->second.ModuleLocalOffset; + TULocalOffset = Iter->second.TULocalOffset; } if (LexicalOffset && ReadLexicalDeclContextStorage(*Loc.F, DeclsCursor, LexicalOffset, DC)) return nullptr; - if (VisibleOffset && - ReadVisibleDeclContextStorage(*Loc.F, DeclsCursor, VisibleOffset, ID, - /*IsModuleLocal=*/false)) + if (VisibleOffset && ReadVisibleDeclContextStorage( + *Loc.F, DeclsCursor, VisibleOffset, ID, + VisibleDeclContextStorageKind::GenerallyVisible)) return nullptr; if (ModuleLocalOffset && - ReadVisibleDeclContextStorage(*Loc.F, DeclsCursor, ModuleLocalOffset, - ID, /*IsModuleLocal=*/true)) + ReadVisibleDeclContextStorage( + *Loc.F, DeclsCursor, ModuleLocalOffset, ID, + VisibleDeclContextStorageKind::ModuleLocalVisible)) + return nullptr; + if (TULocalOffset && ReadVisibleDeclContextStorage( + *Loc.F, DeclsCursor, TULocalOffset, ID, + VisibleDeclContextStorageKind::TULocalVisible)) return nullptr; } assert(Record.getIdx() == Record.size()); @@ -4376,6 +4417,18 @@ void ASTReader::loadDeclUpdateRecords(PendingUpdateRecord &Record) { DC->setHasExternalVisibleStorage(true); } + if (auto I = TULocalUpdates.find(ID); I != TULocalUpdates.end()) { + auto Updates = std::move(I->second); + TULocalUpdates.erase(I); + + auto *DC = cast(D)->getPrimaryContext(); + for (const auto &Update : Updates) + TULocalLookups[DC].Table.add( + Update.Mod, Update.Data, + reader::ASTDeclContextNameLookupTrait(*this, *Update.Mod)); + DC->setHasExternalVisibleStorage(true); + } + // Load any pending related decls. 
if (D->isCanonicalDecl()) { if (auto IT = RelatedDeclsMap.find(ID); IT != RelatedDeclsMap.end()) { @@ -4459,6 +4512,49 @@ void ASTReader::loadPendingDeclChain(Decl *FirstLocal, uint64_t LocalOffset) { ASTDeclReader::attachLatestDecl(CanonDecl, MostRecent); } +void ASTReader::loadDeferredAttribute(const DeferredAttribute &DA) { + Decl *D = DA.TargetedDecl; + ModuleFile *M = getOwningModuleFile(D); + + unsigned LocalDeclIndex = D->getGlobalID().getLocalDeclIndex(); + const DeclOffset &DOffs = M->DeclOffsets[LocalDeclIndex]; + RecordLocation Loc(M, DOffs.getBitOffset(M->DeclsBlockStartOffset)); + + llvm::BitstreamCursor &Cursor = Loc.F->DeclsCursor; + SavedStreamPosition SavedPosition(Cursor); + if (llvm::Error Err = Cursor.JumpToBit(Loc.Offset)) { + Error(std::move(Err)); + } + + Expected MaybeCode = Cursor.ReadCode(); + if (!MaybeCode) { + llvm::report_fatal_error( + Twine("ASTReader::loadPreferredNameAttribute failed reading code: ") + + toString(MaybeCode.takeError())); + } + unsigned Code = MaybeCode.get(); + + ASTRecordReader Record(*this, *Loc.F); + Expected MaybeRecCode = Record.readRecord(Cursor, Code); + if (!MaybeRecCode) { + llvm::report_fatal_error( + Twine( + "ASTReader::loadPreferredNameAttribute failed reading rec code: ") + + toString(MaybeCode.takeError())); + } + unsigned RecCode = MaybeRecCode.get(); + if (RecCode < DECL_TYPEDEF || RecCode > DECL_LAST) { + llvm::report_fatal_error( + Twine("ASTReader::loadPreferredNameAttribute failed reading rec code: " + "expected valid DeclCode") + + toString(MaybeCode.takeError())); + } + + Record.skipInts(DA.RecordIdx); + Attr *A = Record.readAttr(); + getContext().getDeclAttrs(D).push_back(A); +} + namespace { /// Given an ObjC interface, goes through the modules and links to the diff --git a/clang/lib/Serialization/ASTWriter.cpp b/clang/lib/Serialization/ASTWriter.cpp index a6f8c6009f1ff..c7c17e09a30e0 100644 --- a/clang/lib/Serialization/ASTWriter.cpp +++ b/clang/lib/Serialization/ASTWriter.cpp @@ -37,6 
+37,7 @@ #include "clang/AST/Type.h" #include "clang/AST/TypeLoc.h" #include "clang/AST/TypeLocVisitor.h" +#include "clang/Basic/AttrKinds.h" #include "clang/Basic/Diagnostic.h" #include "clang/Basic/DiagnosticOptions.h" #include "clang/Basic/FileEntry.h" @@ -1154,6 +1155,8 @@ void ASTWriter::WriteBlockInfoBlock() { RECORD(DIAGNOSTIC_OPTIONS); RECORD(HEADER_SEARCH_PATHS); RECORD(DIAG_PRAGMA_MAPPINGS); + RECORD(HEADER_SEARCH_ENTRY_USAGE); + RECORD(VFS_USAGE); #undef RECORD #undef BLOCK @@ -4044,6 +4047,13 @@ class ASTDeclContextNameLookupTraitBase { : Writer(Writer) {} public: + data_type getData(const DeclIDsTy &LocalIDs) { + unsigned Start = DeclIDs.size(); + for (auto ID : LocalIDs) + DeclIDs.push_back(ID); + return std::make_pair(Start, DeclIDs.size()); + } + data_type ImportData(const reader::ASTDeclContextNameLookupTrait::data_type &FromReader) { unsigned Start = DeclIDs.size(); DeclIDs.insert( @@ -4136,23 +4146,16 @@ class ASTDeclContextNameLookupTraitBase { } }; -class ModuleLocalNameLookupTrait : public ASTDeclContextNameLookupTraitBase { +class ModuleLevelNameLookupTrait : public ASTDeclContextNameLookupTraitBase { public: using primary_module_hash_type = unsigned; using key_type = std::pair; using key_type_ref = key_type; - explicit ModuleLocalNameLookupTrait(ASTWriter &Writer) + explicit ModuleLevelNameLookupTrait(ASTWriter &Writer) : ASTDeclContextNameLookupTraitBase(Writer) {} - data_type getData(const DeclIDsTy &LocalIDs) { - unsigned Start = DeclIDs.size(); - for (auto ID : LocalIDs) - DeclIDs.push_back(ID); - return std::make_pair(Start, DeclIDs.size()); - } - static bool EqualKey(key_type_ref a, key_type_ref b) { return a == b; } hash_value_type ComputeHash(key_type Key) { @@ -4181,19 +4184,65 @@ class ModuleLocalNameLookupTrait : public ASTDeclContextNameLookupTraitBase { } }; +static bool isModuleLocalDecl(NamedDecl *D) { + // For decls not in a file context, they should have the same visibility + // with their parent. 
+ if (auto *Parent = dyn_cast(D->getNonTransparentDeclContext()); + Parent && !D->getNonTransparentDeclContext()->isFileContext()) + return isModuleLocalDecl(Parent); + + // Deduction Guide are special here. Since their logical parent context are + // not their actual parent. + if (auto *FTD = dyn_cast(D)) + if (auto *CDGD = dyn_cast(FTD->getTemplatedDecl())) + return isModuleLocalDecl(CDGD->getDeducedTemplate()); + + if (D->getFormalLinkage() == Linkage::Module) + return true; + + return false; +} + +static bool isTULocalInNamedModules(NamedDecl *D) { + Module *NamedModule = D->getTopLevelOwningNamedModule(); + if (!NamedModule) + return false; + + // For none-top level decls, we choose to move it to the general visible + // lookup table. Since the consumer may get its parent somehow and performs + // a lookup in it (considering looking up the operator function in lambda). + // The difference between module local lookup table and TU local lookup table + // is, the consumers still have a chance to lookup in the module local lookup + // table but **now** the consumers won't read the TU local lookup table if + // the consumer is not the original TU. + // + // FIXME: It seems to be an optimization chance (and also a more correct + // semantics) to remain the TULocal lookup table and performing similar lookup + // with the module local lookup table except that we only allow the lookups + // with the same module unit. + if (!D->getNonTransparentDeclContext()->isFileContext()) + return false; + + return D->getLinkageInternal() == Linkage::Internal; +} + // Trait used for the on-disk hash table used in the method pool. 
+template class ASTDeclContextNameLookupTrait : public ASTDeclContextNameLookupTraitBase { public: - using ModuleLocalDeclsMapTy = - llvm::DenseMap; - -private: - ModuleLocalDeclsMapTy ModuleLocalDeclsMap; + using ModuleLevelDeclsMapTy = + llvm::DenseMap; -public: using key_type = DeclarationNameKey; using key_type_ref = key_type; + using TULocalDeclsMapTy = llvm::DenseMap; + +private: + ModuleLevelDeclsMapTy ModuleLocalDeclsMap; + TULocalDeclsMapTy TULocalDeclsMap; + +public: explicit ASTDeclContextNameLookupTrait(ASTWriter &Writer) : ASTDeclContextNameLookupTraitBase(Writer) {} @@ -4216,7 +4265,7 @@ class ASTDeclContextNameLookupTrait : public ASTDeclContextNameLookupTraitBase { auto ID = Writer.GetDeclRef(DeclForLocalLookup); - if (D->getFormalLinkage() == Linkage::Module) { + if (isModuleLocalDecl(D)) { if (std::optional PrimaryModuleHash = getPrimaryModuleHash(D->getOwningModule())) { auto Key = std::make_pair(D->getDeclName(), *PrimaryModuleHash); @@ -4229,15 +4278,30 @@ class ASTDeclContextNameLookupTrait : public ASTDeclContextNameLookupTraitBase { } } + if constexpr (CollectingTULocalDecls) { + if (isTULocalInNamedModules(D)) { + auto Iter = TULocalDeclsMap.find(D->getDeclName()); + if (Iter == TULocalDeclsMap.end()) + TULocalDeclsMap.insert({D->getDeclName(), DeclIDsTy{ID}}); + else + Iter->second.push_back(ID); + continue; + } + } + DeclIDs.push_back(ID); } return std::make_pair(Start, DeclIDs.size()); } - const ModuleLocalDeclsMapTy &getModuleLocalDecls() { + using ASTDeclContextNameLookupTraitBase::getData; + + const ModuleLevelDeclsMapTy &getModuleLocalDecls() { return ModuleLocalDeclsMap; } + const TULocalDeclsMapTy &getTULocalDecls() { return TULocalDeclsMap; } + static bool EqualKey(key_type_ref a, key_type_ref b) { return a == b; } hash_value_type ComputeHash(key_type Name) { return Name.getHash(); } @@ -4465,7 +4529,8 @@ static bool isLookupResultNotInteresting(ASTWriter &Writer, void ASTWriter::GenerateNameLookupTable( ASTContext &Context, const 
DeclContext *ConstDC, llvm::SmallVectorImpl &LookupTable, - llvm::SmallVectorImpl &ModuleLocalLookupTable) { + llvm::SmallVectorImpl &ModuleLocalLookupTable, + llvm::SmallVectorImpl &TULookupTable) { assert(!ConstDC->hasLazyLocalLexicalLookups() && !ConstDC->hasLazyExternalLexicalLookups() && "must call buildLookups first"); @@ -4475,9 +4540,11 @@ void ASTWriter::GenerateNameLookupTable( assert(DC == DC->getPrimaryContext() && "only primary DC has lookup table"); // Create the on-disk hash table representation. - MultiOnDiskHashTableGenerator Generator; - ASTDeclContextNameLookupTrait Trait(*this); + MultiOnDiskHashTableGenerator< + reader::ASTDeclContextNameLookupTrait, + ASTDeclContextNameLookupTrait> + Generator; + ASTDeclContextNameLookupTrait Trait(*this); // The first step is to collect the declaration names which we need to // serialize into the name lookup table, and to collect them in a stable @@ -4649,26 +4716,45 @@ void ASTWriter::GenerateNameLookupTable( Generator.emit(LookupTable, Trait, Lookups ? &Lookups->Table : nullptr); const auto &ModuleLocalDecls = Trait.getModuleLocalDecls(); - if (ModuleLocalDecls.empty()) - return; + if (!ModuleLocalDecls.empty()) { + MultiOnDiskHashTableGenerator + ModuleLocalLookupGenerator; + ModuleLevelNameLookupTrait ModuleLocalTrait(*this); + + for (const auto &ModuleLocalIter : ModuleLocalDecls) { + const auto &Key = ModuleLocalIter.first; + const auto &IDs = ModuleLocalIter.second; + ModuleLocalLookupGenerator.insert(Key, ModuleLocalTrait.getData(IDs), + ModuleLocalTrait); + } - MultiOnDiskHashTableGenerator - ModuleLocalLookupGenerator; - ModuleLocalNameLookupTrait ModuleLocalTrait(*this); + auto *ModuleLocalLookups = + Chain ? Chain->getModuleLocalLookupTables(DC) : nullptr; + ModuleLocalLookupGenerator.emit( + ModuleLocalLookupTable, ModuleLocalTrait, + ModuleLocalLookups ? 
&ModuleLocalLookups->Table : nullptr); + } + + const auto &TULocalDecls = Trait.getTULocalDecls(); + if (!TULocalDecls.empty() && !isGeneratingReducedBMI()) { + MultiOnDiskHashTableGenerator< + reader::ASTDeclContextNameLookupTrait, + ASTDeclContextNameLookupTrait> + TULookupGenerator; + ASTDeclContextNameLookupTrait TULocalTrait( + *this); + + for (const auto &TULocalIter : TULocalDecls) { + const auto &Key = TULocalIter.first; + const auto &IDs = TULocalIter.second; + TULookupGenerator.insert(Key, TULocalTrait.getData(IDs), TULocalTrait); + } - for (const auto &ModuleLocalIter : ModuleLocalDecls) { - const auto &Key = ModuleLocalIter.first; - const auto &IDs = ModuleLocalIter.second; - ModuleLocalLookupGenerator.insert(Key, ModuleLocalTrait.getData(IDs), - ModuleLocalTrait); + auto *TULocalLookups = Chain ? Chain->getTULocalLookupTables(DC) : nullptr; + TULookupGenerator.emit(TULookupTable, TULocalTrait, + TULocalLookups ? &TULocalLookups->Table : nullptr); } - - auto *ModuleLocalLookups = - Chain ? Chain->getModuleLocalLookupTables(DC) : nullptr; - ModuleLocalLookupGenerator.emit( - ModuleLocalLookupTable, ModuleLocalTrait, - ModuleLocalLookups ? &ModuleLocalLookups->Table : nullptr); } /// Write the block containing all of the declaration IDs @@ -4679,7 +4765,12 @@ void ASTWriter::GenerateNameLookupTable( void ASTWriter::WriteDeclContextVisibleBlock(ASTContext &Context, DeclContext *DC, uint64_t &VisibleBlockOffset, - uint64_t &ModuleLocalBlockOffset) { + uint64_t &ModuleLocalBlockOffset, + uint64_t &TULocalBlockOffset) { + assert(VisibleBlockOffset == 0); + assert(ModuleLocalBlockOffset == 0); + assert(TULocalBlockOffset == 0); + // If we imported a key declaration of this namespace, write the visible // lookup results as an update record for it rather than including them // on this declaration. We will only look at key declarations on reload. 
@@ -4766,7 +4857,9 @@ void ASTWriter::WriteDeclContextVisibleBlock(ASTContext &Context, // Create the on-disk hash table in a buffer. SmallString<4096> LookupTable; SmallString<4096> ModuleLocalLookupTable; - GenerateNameLookupTable(Context, DC, LookupTable, ModuleLocalLookupTable); + SmallString<4096> TULookupTable; + GenerateNameLookupTable(Context, DC, LookupTable, ModuleLocalLookupTable, + TULookupTable); // Write the lookup table RecordData::value_type Record[] = {DECL_CONTEXT_VISIBLE}; @@ -4774,17 +4867,26 @@ void ASTWriter::WriteDeclContextVisibleBlock(ASTContext &Context, LookupTable); ++NumVisibleDeclContexts; - if (ModuleLocalLookupTable.empty()) - return; + if (!ModuleLocalLookupTable.empty()) { + ModuleLocalBlockOffset = Stream.GetCurrentBitNo(); + assert(ModuleLocalBlockOffset > VisibleBlockOffset); + // Write the lookup table + RecordData::value_type ModuleLocalRecord[] = { + DECL_CONTEXT_MODULE_LOCAL_VISIBLE}; + Stream.EmitRecordWithBlob(DeclModuleLocalVisibleLookupAbbrev, + ModuleLocalRecord, ModuleLocalLookupTable); + ++NumModuleLocalDeclContexts; + } - ModuleLocalBlockOffset = Stream.GetCurrentBitNo(); - assert(ModuleLocalBlockOffset > VisibleBlockOffset); - // Write the lookup table - RecordData::value_type ModuleLocalRecord[] = { - DECL_CONTEXT_MODULE_LOCAL_VISIBLE}; - Stream.EmitRecordWithBlob(DeclModuleLocalVisibleLookupAbbrev, - ModuleLocalRecord, ModuleLocalLookupTable); - ++NumModuleLocalDeclContexts; + if (!TULookupTable.empty()) { + TULocalBlockOffset = Stream.GetCurrentBitNo(); + // Write the lookup table + RecordData::value_type TULocalDeclsRecord[] = { + DECL_CONTEXT_TU_LOCAL_VISIBLE}; + Stream.EmitRecordWithBlob(DeclTULocalLookupAbbrev, TULocalDeclsRecord, + TULookupTable); + ++NumTULocalDeclContexts; + } } /// Write an UPDATE_VISIBLE block for the given context. @@ -4802,7 +4904,9 @@ void ASTWriter::WriteDeclContextVisibleUpdate(ASTContext &Context, // Create the on-disk hash table in a buffer. 
SmallString<4096> LookupTable; SmallString<4096> ModuleLocalLookupTable; - GenerateNameLookupTable(Context, DC, LookupTable, ModuleLocalLookupTable); + SmallString<4096> TULookupTable; + GenerateNameLookupTable(Context, DC, LookupTable, ModuleLocalLookupTable, + TULookupTable); // If we're updating a namespace, select a key declaration as the key for the // update record; those are the only ones that will be checked on reload. @@ -4814,14 +4918,20 @@ void ASTWriter::WriteDeclContextVisibleUpdate(ASTContext &Context, getDeclID(cast(DC)).getRawValue()}; Stream.EmitRecordWithBlob(UpdateVisibleAbbrev, Record, LookupTable); - if (ModuleLocalLookupTable.empty()) - return; + if (!ModuleLocalLookupTable.empty()) { + // Write the module local lookup table + RecordData::value_type ModuleLocalRecord[] = { + UPDATE_MODULE_LOCAL_VISIBLE, getDeclID(cast(DC)).getRawValue()}; + Stream.EmitRecordWithBlob(ModuleLocalUpdateVisibleAbbrev, ModuleLocalRecord, + ModuleLocalLookupTable); + } - // Write the module local lookup table - RecordData::value_type ModuleLocalRecord[] = { - UPDATE_MODULE_LOCAL_VISIBLE, getDeclID(cast(DC)).getRawValue()}; - Stream.EmitRecordWithBlob(ModuleLocalUpdateVisibleAbbrev, ModuleLocalRecord, - ModuleLocalLookupTable); + if (!TULookupTable.empty()) { + RecordData::value_type GMFRecord[] = { + UPDATE_TU_LOCAL_VISIBLE, getDeclID(cast(DC)).getRawValue()}; + Stream.EmitRecordWithBlob(TULocalUpdateVisibleAbbrev, GMFRecord, + TULookupTable); + } } /// Write an FP_PRAGMA_OPTIONS block for the given FPOptions. @@ -5046,15 +5156,14 @@ void ASTWriter::WriteModuleFileExtension(Sema &SemaRef, void ASTRecordWriter::AddAttr(const Attr *A) { auto &Record = *this; - // FIXME: Clang can't handle the serialization/deserialization of - // preferred_name properly now. See - // https://github.com/llvm/llvm-project/issues/56490 for example. 
- if (!A || (isa(A) && - Writer->isWritingStdCXXNamedModules())) + if (!A) return Record.push_back(0); Record.push_back(A->getKind() + 1); // FIXME: stable encoding, target attrs + auto SkipIdx = Record.size(); + // Add placeholder for the size of deferred attribute. + Record.push_back(0); Record.AddIdentifierRef(A->getAttrName()); Record.AddIdentifierRef(A->getScopeName()); Record.AddSourceRange(A->getRange()); @@ -5065,6 +5174,12 @@ void ASTRecordWriter::AddAttr(const Attr *A) { Record.push_back(A->isRegularKeywordAttribute()); #include "clang/Serialization/AttrPCHWrite.inc" + + if (A->shouldDeferDeserialization()) { + // Record the actual size of deferred attribute (+ 1 to count the attribute + // kind). + Record[SkipIdx] = Record.size() - SkipIdx + 1; + } } /// Emit the list of attributes to the specified record. @@ -6004,9 +6119,12 @@ ASTFileSignature ASTWriter::WriteASTCore(Sema *SemaPtr, StringRef isysroot, } // Some simple statistics - RecordData::value_type Record[] = { - NumStatements, NumMacros, NumLexicalDeclContexts, NumVisibleDeclContexts, - NumModuleLocalDeclContexts}; + RecordData::value_type Record[] = {NumStatements, + NumMacros, + NumLexicalDeclContexts, + NumVisibleDeclContexts, + NumModuleLocalDeclContexts, + NumTULocalDeclContexts}; Stream.EmitRecord(STATISTICS, Record); Stream.ExitBlock(); Stream.FlushToWord(); @@ -6085,7 +6203,9 @@ void ASTWriter::WriteDeclAndTypes(ASTContext &Context) { uint64_t LexicalOffset = WriteDeclContextLexicalBlock(Context, NS); uint64_t VisibleOffset = 0; uint64_t ModuleLocalOffset = 0; - WriteDeclContextVisibleBlock(Context, NS, VisibleOffset, ModuleLocalOffset); + uint64_t TULocalOffset = 0; + WriteDeclContextVisibleBlock(Context, NS, VisibleOffset, ModuleLocalOffset, + TULocalOffset); // Write the offset relative to current block. 
if (LexicalOffset) @@ -6097,10 +6217,14 @@ void ASTWriter::WriteDeclAndTypes(ASTContext &Context) { if (ModuleLocalOffset) ModuleLocalOffset -= DeclTypesBlockStartOffset; + if (TULocalOffset) + TULocalOffset -= DeclTypesBlockStartOffset; + AddDeclRef(NS, DelayedNamespaceRecord); DelayedNamespaceRecord.push_back(LexicalOffset); DelayedNamespaceRecord.push_back(VisibleOffset); DelayedNamespaceRecord.push_back(ModuleLocalOffset); + DelayedNamespaceRecord.push_back(TULocalOffset); } // The process of writing lexical and visible block for delayed namespace @@ -6186,6 +6310,12 @@ void ASTWriter::WriteDeclAndTypes(ASTContext &Context) { Abv->Add(llvm::BitCodeAbbrevOp(llvm::BitCodeAbbrevOp::Blob)); ModuleLocalUpdateVisibleAbbrev = Stream.EmitAbbrev(std::move(Abv)); + Abv = std::make_shared(); + Abv->Add(llvm::BitCodeAbbrevOp(UPDATE_TU_LOCAL_VISIBLE)); + Abv->Add(llvm::BitCodeAbbrevOp(llvm::BitCodeAbbrevOp::VBR, 6)); + Abv->Add(llvm::BitCodeAbbrevOp(llvm::BitCodeAbbrevOp::Blob)); + TULocalUpdateVisibleAbbrev = Stream.EmitAbbrev(std::move(Abv)); + // And a visible updates block for the translation unit. 
WriteDeclContextVisibleUpdate(Context, TU); diff --git a/clang/lib/Serialization/ASTWriterDecl.cpp b/clang/lib/Serialization/ASTWriterDecl.cpp index 7a494cfe1ac64..30b28057f4c10 100644 --- a/clang/lib/Serialization/ASTWriterDecl.cpp +++ b/clang/lib/Serialization/ASTWriterDecl.cpp @@ -2069,6 +2069,7 @@ void ASTDeclWriter::VisitDeclContext(DeclContext *DC) { uint64_t LexicalOffset = 0; uint64_t VisibleOffset = 0; uint64_t ModuleLocalOffset = 0; + uint64_t TULocalOffset = 0; if (Writer.isGeneratingReducedBMI() && isa(DC) && cast(DC)->isFromExplicitGlobalModule()) { @@ -2080,12 +2081,14 @@ void ASTDeclWriter::VisitDeclContext(DeclContext *DC) { LexicalOffset = Writer.WriteDeclContextLexicalBlock(Record.getASTContext(), DC); Writer.WriteDeclContextVisibleBlock(Record.getASTContext(), DC, - VisibleOffset, ModuleLocalOffset); + VisibleOffset, ModuleLocalOffset, + TULocalOffset); } Record.AddOffset(LexicalOffset); Record.AddOffset(VisibleOffset); Record.AddOffset(ModuleLocalOffset); + Record.AddOffset(TULocalOffset); } const Decl *ASTWriter::getFirstLocalDecl(const Decl *D) { @@ -2441,6 +2444,7 @@ void ASTWriter::WriteDeclAbbrevs() { Abv->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::VBR, 6)); // LexicalOffset Abv->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::VBR, 6)); // VisibleOffset Abv->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::VBR, 6)); // ModuleLocalOffset + Abv->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::VBR, 6)); // TULocalOffset DeclEnumAbbrev = Stream.EmitAbbrev(std::move(Abv)); // Abbreviation for DECL_RECORD @@ -2494,6 +2498,7 @@ void ASTWriter::WriteDeclAbbrevs() { Abv->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::VBR, 6)); // LexicalOffset Abv->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::VBR, 6)); // VisibleOffset Abv->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::VBR, 6)); // ModuleLocalOffset + Abv->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::VBR, 6)); // TULocalOffset DeclRecordAbbrev = Stream.EmitAbbrev(std::move(Abv)); // Abbreviation for DECL_PARM_VAR @@ -2836,6 +2841,11 @@ void ASTWriter::WriteDeclAbbrevs() { 
Abv->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::Blob)); DeclModuleLocalVisibleLookupAbbrev = Stream.EmitAbbrev(std::move(Abv)); + Abv = std::make_shared(); + Abv->Add(BitCodeAbbrevOp(serialization::DECL_CONTEXT_TU_LOCAL_VISIBLE)); + Abv->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::Blob)); + DeclTULocalLookupAbbrev = Stream.EmitAbbrev(std::move(Abv)); + Abv = std::make_shared(); Abv->Add(BitCodeAbbrevOp(serialization::DECL_SPECIALIZATIONS)); Abv->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::Blob)); diff --git a/clang/lib/Serialization/GeneratePCH.cpp b/clang/lib/Serialization/GeneratePCH.cpp index a3189bb40b191..12751beb8d715 100644 --- a/clang/lib/Serialization/GeneratePCH.cpp +++ b/clang/lib/Serialization/GeneratePCH.cpp @@ -12,7 +12,7 @@ //===----------------------------------------------------------------------===// #include "clang/AST/ASTContext.h" -#include "clang/Frontend/FrontendDiagnostic.h" +#include "clang/Basic/DiagnosticFrontend.h" #include "clang/Lex/HeaderSearch.h" #include "clang/Lex/HeaderSearchOptions.h" #include "clang/Lex/Preprocessor.h" diff --git a/clang/test/APINotes/Inputs/Headers/Templates.h b/clang/test/APINotes/Inputs/Headers/Templates.h index 862035fee363f..2a86a46d4af27 100644 --- a/clang/test/APINotes/Inputs/Headers/Templates.h +++ b/clang/test/APINotes/Inputs/Headers/Templates.h @@ -6,4 +6,5 @@ struct Box { const T* get_ptr() const { return &value; } }; +using FloatBox = Box; using IntBox = Box; diff --git a/clang/test/APINotes/templates.cpp b/clang/test/APINotes/templates.cpp index 0556eba925a51..48109011e73a9 100644 --- a/clang/test/APINotes/templates.cpp +++ b/clang/test/APINotes/templates.cpp @@ -7,3 +7,6 @@ // CHECK-BOX: Dumping Box: // CHECK-BOX-NEXT: ClassTemplateDecl {{.+}} imported in Templates Box // CHECK-BOX: SwiftAttrAttr {{.+}} <> "import_owned" + +// Make sure the attributes aren't duplicated. 
+// CHECK-BOX-NOT: SwiftAttrAttr {{.+}} <> "import_owned" diff --git a/clang/test/AST/ByteCode/cxx20.cpp b/clang/test/AST/ByteCode/cxx20.cpp index 268362ceff635..268226a7c143e 100644 --- a/clang/test/AST/ByteCode/cxx20.cpp +++ b/clang/test/AST/ByteCode/cxx20.cpp @@ -893,3 +893,18 @@ namespace VirtDtor { static_assert(test('C', 'B')); } + +namespace TemporaryInNTTP { + template struct B { /* ... */ }; + struct J1 { + J1 *self=this; + }; + /// FIXME: The bytecode interpreter emits a different diagnostic here. + /// The current interpreter creates a fake MaterializeTemporaryExpr (see EvaluateAsConstantExpr) + /// which is later used as the LValueBase of the created APValue. + B j1; // ref-error {{pointer to temporary object is not allowed in a template argument}} \ + // expected-error {{non-type template argument is not a constant expression}} \ + // expected-note {{pointer to temporary is not a constant expression}} \ + // expected-note {{created here}} + B<2> j2; /// Ok. +} diff --git a/clang/test/AST/ByteCode/cxx98.cpp b/clang/test/AST/ByteCode/cxx98.cpp index 20f98d33c31c4..c17049b01c1da 100644 --- a/clang/test/AST/ByteCode/cxx98.cpp +++ b/clang/test/AST/ByteCode/cxx98.cpp @@ -59,3 +59,8 @@ struct PR65784s{ int *ptr; } const PR65784[] = {(int *)""}; PR65784s PR65784f() { return *PR65784; } + +const int b = 1 / 0; // both-warning {{division by zero is undefined}} \ + // both-note {{declared here}} +_Static_assert(b, ""); // both-error {{not an integral constant expression}} \ + // both-note {{initializer of 'b' is not a constant expression}} diff --git a/clang/test/AST/ByteCode/literals.cpp b/clang/test/AST/ByteCode/literals.cpp index 3d415a93a392a..fdf1a6820e446 100644 --- a/clang/test/AST/ByteCode/literals.cpp +++ b/clang/test/AST/ByteCode/literals.cpp @@ -261,31 +261,26 @@ namespace SizeOf { } #if __cplusplus >= 201402L - constexpr int IgnoredRejected() { // ref-error {{never produces a constant expression}} + constexpr int IgnoredRejected() { // both-error 
{{never produces a constant expression}} int n = 0; sizeof(int[n++]); // both-warning {{expression result unused}} \ - // ref-note 2{{subexpression not valid in a constant expression}} + // both-note 2{{subexpression not valid in a constant expression}} return n; } - /// FIXME: This is rejected because the parameter so sizeof() is not constant. - /// produce a proper diagnostic. static_assert(IgnoredRejected() == 0, ""); // both-error {{not an integral constant expression}} \ - // ref-note {{in call to 'IgnoredRejected()'}} + // both-note {{in call to 'IgnoredRejected()'}} #endif #if __cplusplus >= 202002L /// FIXME: The following code should be accepted. - consteval int foo(int n) { // ref-error {{consteval function never produces a constant expression}} - return sizeof(int[n]); // ref-note 3{{not valid in a constant expression}} - } - constinit int var = foo(5); // ref-error {{not a constant expression}} \ - // ref-note 2{{in call to}} \ - // ref-error {{does not have a constant initializer}} \ - // ref-note {{required by 'constinit' specifier}} \ - // expected-error {{is not a constant expression}} \ - // expected-error {{does not have a constant initializer}} \ - // expected-note {{required by 'constinit' specifier}} \ + consteval int foo(int n) { // both-error {{consteval function never produces a constant expression}} + return sizeof(int[n]); // both-note 3{{not valid in a constant expression}} + } + constinit int var = foo(5); // both-error {{not a constant expression}} \ + // both-note 2{{in call to}} \ + // both-error {{does not have a constant initializer}} \ + // both-note {{required by 'constinit' specifier}} #endif }; diff --git a/clang/test/CXX/basic/basic.lookup/basic.lookup.argdep/p5-ex2.cpp b/clang/test/CXX/basic/basic.lookup/basic.lookup.argdep/p5-ex2.cpp index a27946bd90a46..c200abafc0af8 100644 --- a/clang/test/CXX/basic/basic.lookup/basic.lookup.argdep/p5-ex2.cpp +++ b/clang/test/CXX/basic/basic.lookup/basic.lookup.argdep/p5-ex2.cpp @@ -61,6 
+61,6 @@ void test() { // error: S::f is visible in instantiation context, but R::g has internal // linkage and cannot be used outside N.cpp - apply(x, S::Z()); // expected-error@N.cpp:10 {{no matching function for call to 'g'}} - // expected-note@-1 {{in instantiation of function template specialization 'apply' requested here}} + apply(x, S::Z()); // expected-error@N.cpp:10 {{use of undeclared identifier 'g'}} + // expected-note@-1 {{requested here}} } diff --git a/clang/test/CXX/basic/basic.scope/basic.scope.namespace/p2.cpp b/clang/test/CXX/basic/basic.scope/basic.scope.namespace/p2.cpp index 54ec6aa61ec37..d70eb7de22c6a 100644 --- a/clang/test/CXX/basic/basic.scope/basic.scope.namespace/p2.cpp +++ b/clang/test/CXX/basic/basic.scope/basic.scope.namespace/p2.cpp @@ -66,11 +66,7 @@ void test_late() { // expected-note@p2.cpp:18 {{'exported' declared here}} #endif - internal = 1; -#ifndef IMPLEMENTATION - // expected-error@-2 {{declaration of 'internal' must be imported from module 'A' before it is required}} - // expected-note@p2.cpp:20 {{declaration here is not visible}} -#endif + internal = 1; // expected-error {{use of undeclared identifier 'internal'}} not_exported_private = 1; #ifndef IMPLEMENTATION @@ -78,11 +74,7 @@ void test_late() { // expected-error@-3 {{undeclared identifier}} #endif - internal_private = 1; -#ifndef IMPLEMENTATION - // FIXME: should not be visible here - // expected-error@-3 {{undeclared identifier}} -#endif + internal_private = 1; // expected-error {{use of undeclared identifier 'internal_private'}} } #endif diff --git a/clang/test/CXX/module/basic/basic.def.odr/p4.cppm b/clang/test/CXX/module/basic/basic.def.odr/p4.cppm index 487dbdef283ee..7e88cbe78b4e3 100644 --- a/clang/test/CXX/module/basic/basic.def.odr/p4.cppm +++ b/clang/test/CXX/module/basic/basic.def.odr/p4.cppm @@ -128,7 +128,6 @@ void f(a::b, a::c) {} // // CHECK-DAG: @_ZW6Module25extern_var_module_linkage = external {{(dso_local )?}}global // CHECK-DAG: 
@_ZW6Module25inline_var_module_linkage = linkonce_odr {{(dso_local )?}}global -// CHECK-DAG: @_ZL25static_var_module_linkage = internal {{(dso_local )?}}global i32 0, // CHECK-DAG: @_ZW6Module24const_var_module_linkage = available_externally {{(dso_local )?}}constant i32 3, module Module; @@ -152,10 +151,6 @@ void use() { (void)&extern_var_module_linkage; (void)&inline_var_module_linkage; - // FIXME: Issue #61427 Internal-linkage declarations in the interface TU - // should not be not visible here. - (void)&static_var_module_linkage; // FIXME: Should not be visible here. - (void)&const_var_module_linkage; // FIXME: will be visible after P2788R0 } diff --git a/clang/test/CXX/module/basic/basic.link/p2.cppm b/clang/test/CXX/module/basic/basic.link/p2.cppm index 5a497304201dc..d7d2b5992a235 100644 --- a/clang/test/CXX/module/basic/basic.link/p2.cppm +++ b/clang/test/CXX/module/basic/basic.link/p2.cppm @@ -45,16 +45,14 @@ module M; void use_from_module_impl() { external_linkage_fn(); module_linkage_fn(); - internal_linkage_fn(); // expected-error {{no matching function for call to 'internal_linkage_fn'}} + internal_linkage_fn(); // expected-error {{use of undeclared identifier 'internal_linkage_fn'}} // expected-note@* {{}} (void)external_linkage_class{}; (void)module_linkage_class{}; (void)external_linkage_var; (void)module_linkage_var; - // FIXME: Issue #61427 Internal-linkage declarations in the interface TU - // should not be not visible here. 
- (void)internal_linkage_class{}; - (void)internal_linkage_var; + (void)internal_linkage_class{}; // expected-error {{use of undeclared identifier 'internal_linkage_class'}} //expected-error{{}} + (void)internal_linkage_var; // expected-error {{use of undeclared identifier 'internal_linkage_var'}} } //--- user.cpp @@ -63,11 +61,10 @@ import M; void use_from_module_impl() { external_linkage_fn(); module_linkage_fn(); // expected-error {{use of undeclared identifier 'module_linkage_fn'}} - internal_linkage_fn(); // expected-error {{declaration of 'internal_linkage_fn' must be imported}} + internal_linkage_fn(); // expected-error {{use of undeclared identifier 'internal_linkage_fn'}} (void)external_linkage_class{}; - (void)module_linkage_class{}; // expected-error {{undeclared identifier}} expected-error 0+{{}} + (void)module_linkage_class{}; // expected-error {{undeclared identifier}} expected-error 0+{{}} // expected-note@* {{}} (void)internal_linkage_class{}; // expected-error {{undeclared identifier}} expected-error 0+{{}} - // expected-note@M.cppm:10 {{declaration here is not visible}} (void)external_linkage_var; (void)module_linkage_var; // expected-error {{undeclared identifier}} (void)internal_linkage_var; // expected-error {{undeclared identifier}} diff --git a/clang/test/CodeGen/AArch64/sve2-intrinsics/acle_sve2_luti.c b/clang/test/CodeGen/AArch64/sve2-intrinsics/acle_sve2_luti.c index 4b3f97d13c7eb..82e318a7460c2 100644 --- a/clang/test/CodeGen/AArch64/sve2-intrinsics/acle_sve2_luti.c +++ b/clang/test/CodeGen/AArch64/sve2-intrinsics/acle_sve2_luti.c @@ -19,9 +19,9 @@ #ifdef SVE_OVERLOADED_FORMS // A simple used,unused... macro, long enough to represent any SVE builtin. 
-#define SVE_ACLE_FUNC(A1,A2_UNUSED,A3) A1##A3 +#define SVE_ACLE_FUNC(A1,A2_UNUSED) A1 #else -#define SVE_ACLE_FUNC(A1,A2,A3) A1##A2##A3 +#define SVE_ACLE_FUNC(A1,A2) A1##A2 #endif // SME-CHECK-LABEL: @test_svluti2_lane_s8( @@ -39,7 +39,7 @@ // CPP-CHECK-NEXT: ret [[TMP0]] // svint8_t test_svluti2_lane_s8(svint8_t table, svuint8_t indices) MODE_ATTR{ - return SVE_ACLE_FUNC(svluti2_lane,_s8,)(table, indices, 0); + return SVE_ACLE_FUNC(svluti2_lane,_s8)(table, indices, 0); } // SME-CHECK-LABEL: @test_svluti2_lane_u8( @@ -57,7 +57,7 @@ svint8_t test_svluti2_lane_s8(svint8_t table, svuint8_t indices) MODE_ATTR{ // CPP-CHECK-NEXT: ret [[TMP0]] // svuint8_t test_svluti2_lane_u8(svuint8_t table, svuint8_t indices) MODE_ATTR{ - return SVE_ACLE_FUNC(svluti2_lane,_u8,)(table, indices, 3); + return SVE_ACLE_FUNC(svluti2_lane,_u8)(table, indices, 3); } // SME-CHECK-LABEL: @test_svluti2_lane_s16( @@ -75,7 +75,7 @@ svuint8_t test_svluti2_lane_u8(svuint8_t table, svuint8_t indices) MODE_ATTR{ // CPP-CHECK-NEXT: ret [[TMP0]] // svint16_t test_svluti2_lane_s16(svint16_t table, svuint8_t indices) MODE_ATTR{ - return SVE_ACLE_FUNC(svluti2_lane,_s16,)(table, indices, 0); + return SVE_ACLE_FUNC(svluti2_lane,_s16)(table, indices, 0); } // SME-CHECK-LABEL: @test_svluti2_lane_u16( @@ -93,7 +93,7 @@ svint16_t test_svluti2_lane_s16(svint16_t table, svuint8_t indices) MODE_ATTR{ // CPP-CHECK-NEXT: ret [[TMP0]] // svuint16_t test_svluti2_lane_u16(svuint16_t table, svuint8_t indices) MODE_ATTR{ - return SVE_ACLE_FUNC(svluti2_lane,_u16,)(table, indices, 7); + return SVE_ACLE_FUNC(svluti2_lane,_u16)(table, indices, 7); } // SME-CHECK-LABEL: @test_svluti2_lane_f16( @@ -111,7 +111,7 @@ svuint16_t test_svluti2_lane_u16(svuint16_t table, svuint8_t indices) MODE_ATTR{ // CPP-CHECK-NEXT: ret [[TMP0]] // svfloat16_t test_svluti2_lane_f16(svfloat16_t table, svuint8_t indices) MODE_ATTR{ - return SVE_ACLE_FUNC(svluti2_lane,_f16,)(table, indices, 5); + return SVE_ACLE_FUNC(svluti2_lane,_f16)(table, 
indices, 5); } // SME-CHECK-LABEL: @test_svluti2_lane_bf16( @@ -129,7 +129,7 @@ svfloat16_t test_svluti2_lane_f16(svfloat16_t table, svuint8_t indices) MODE_ATT // CPP-CHECK-NEXT: ret [[TMP0]] // svbfloat16_t test_svluti2_lane_bf16(svbfloat16_t table, svuint8_t indices) MODE_ATTR{ - return SVE_ACLE_FUNC(svluti2_lane,_bf16,)(table, indices, 2); + return SVE_ACLE_FUNC(svluti2_lane,_bf16)(table, indices, 2); } // SME-CHECK-LABEL: @test_svluti4_lane_s8( @@ -147,7 +147,7 @@ svbfloat16_t test_svluti2_lane_bf16(svbfloat16_t table, svuint8_t indices) MODE_ // CPP-CHECK-NEXT: ret [[TMP0]] // svint8_t test_svluti4_lane_s8(svint8_t table, svuint8_t indices) MODE_ATTR{ - return SVE_ACLE_FUNC(svluti4_lane,_s8,)(table, indices, 0); + return SVE_ACLE_FUNC(svluti4_lane,_s8)(table, indices, 0); } // SME-CHECK-LABEL: @test_svluti4_lane_u8( @@ -165,7 +165,7 @@ svint8_t test_svluti4_lane_s8(svint8_t table, svuint8_t indices) MODE_ATTR{ // CPP-CHECK-NEXT: ret [[TMP0]] // svuint8_t test_svluti4_lane_u8(svuint8_t table, svuint8_t indices) MODE_ATTR{ - return SVE_ACLE_FUNC(svluti4_lane,_u8,)(table, indices, 1); + return SVE_ACLE_FUNC(svluti4_lane,_u8)(table, indices, 1); } // SME-CHECK-LABEL: @test_svluti4_lane_s16( @@ -183,7 +183,7 @@ svuint8_t test_svluti4_lane_u8(svuint8_t table, svuint8_t indices) MODE_ATTR{ // CPP-CHECK-NEXT: ret [[TMP0]] // svint16_t test_svluti4_lane_s16(svint16_t table, svuint8_t indices) MODE_ATTR{ - return SVE_ACLE_FUNC(svluti4_lane,_s16,)(table, indices, 0); + return SVE_ACLE_FUNC(svluti4_lane,_s16)(table, indices, 0); } // SME-CHECK-LABEL: @test_svluti4_lane_u16( @@ -201,7 +201,7 @@ svint16_t test_svluti4_lane_s16(svint16_t table, svuint8_t indices) MODE_ATTR{ // CPP-CHECK-NEXT: ret [[TMP0]] // svuint16_t test_svluti4_lane_u16(svuint16_t table, svuint8_t indices) MODE_ATTR{ - return SVE_ACLE_FUNC(svluti4_lane,_u16,)(table, indices, 3); + return SVE_ACLE_FUNC(svluti4_lane,_u16)(table, indices, 3); } // SME-CHECK-LABEL: @test_svluti4_lane_f16( @@ -219,7 +219,7 
@@ svuint16_t test_svluti4_lane_u16(svuint16_t table, svuint8_t indices) MODE_ATTR{ // CPP-CHECK-NEXT: ret [[TMP0]] // svfloat16_t test_svluti4_lane_f16(svfloat16_t table, svuint8_t indices) MODE_ATTR{ - return SVE_ACLE_FUNC(svluti4_lane,_f16,)(table, indices, 2); + return SVE_ACLE_FUNC(svluti4_lane,_f16)(table, indices, 2); } // SME-CHECK-LABEL: @test_svluti4_lane_bf16( @@ -237,7 +237,7 @@ svfloat16_t test_svluti4_lane_f16(svfloat16_t table, svuint8_t indices) MODE_ATT // CPP-CHECK-NEXT: ret [[TMP0]] // svbfloat16_t test_svluti4_lane_bf16(svbfloat16_t table, svuint8_t indices) MODE_ATTR{ - return SVE_ACLE_FUNC(svluti4_lane,_bf16,)(table, indices, 1); + return SVE_ACLE_FUNC(svluti4_lane,_bf16)(table, indices, 1); } // SME-CHECK-LABEL: @test_svluti4_lane_s16_x2( @@ -257,7 +257,7 @@ svbfloat16_t test_svluti4_lane_bf16(svbfloat16_t table, svuint8_t indices) MODE_ // CPP-CHECK-NEXT: ret [[TMP0]] // svint16_t test_svluti4_lane_s16_x2(svint16x2_t table, svuint8_t indices) MODE_ATTR{ - return SVE_ACLE_FUNC(svluti4_lane,_s16,_x2)(table, indices, 0); + return SVE_ACLE_FUNC(svluti4_lane,_s16_x2)(table, indices, 0); } // SME-CHECK-LABEL: @test_svluti4_lane_u16_x2( @@ -277,7 +277,7 @@ svint16_t test_svluti4_lane_s16_x2(svint16x2_t table, svuint8_t indices) MODE_AT // CPP-CHECK-NEXT: ret [[TMP0]] // svuint16_t test_svluti4_lane_u16_x2(svuint16x2_t table, svuint8_t indices) MODE_ATTR{ - return SVE_ACLE_FUNC(svluti4_lane,_u16,_x2)(table, indices, 3); + return SVE_ACLE_FUNC(svluti4_lane,_u16_x2)(table, indices, 3); } // SME-CHECK-LABEL: @test_svluti4_lane_f16_x2( @@ -297,7 +297,7 @@ svuint16_t test_svluti4_lane_u16_x2(svuint16x2_t table, svuint8_t indices) MODE_ // CPP-CHECK-NEXT: ret [[TMP0]] // svfloat16_t test_svluti4_lane_f16_x2(svfloat16x2_t table, svuint8_t indices) MODE_ATTR{ - return SVE_ACLE_FUNC(svluti4_lane,_f16,_x2)(table, indices, 2); + return SVE_ACLE_FUNC(svluti4_lane,_f16_x2)(table, indices, 2); } // SME-CHECK-LABEL: @test_svluti4_lane_bf16_x2( @@ -317,5 +317,5 @@ 
svfloat16_t test_svluti4_lane_f16_x2(svfloat16x2_t table, svuint8_t indices) MOD // CPP-CHECK-NEXT: ret [[TMP0]] // svbfloat16_t test_svluti4_lane_bf16_x2(svbfloat16x2_t table, svuint8_t indices) MODE_ATTR{ - return SVE_ACLE_FUNC(svluti4_lane,_bf16,_x2)(table, indices, 1); + return SVE_ACLE_FUNC(svluti4_lane,_bf16_x2)(table, indices, 1); } diff --git a/clang/test/CodeGen/X86/amx_avx512_api.c b/clang/test/CodeGen/X86/amx_avx512_api.c index aea790d61268d..fac41ea6c214f 100644 --- a/clang/test/CodeGen/X86/amx_avx512_api.c +++ b/clang/test/CodeGen/X86/amx_avx512_api.c @@ -16,18 +16,18 @@ __m512 test_tile_cvtrowd2ps(__tile1024i a, unsigned b) { return __tile_cvtrowd2ps(a, b); } -__m512bh test_tile_cvtrowps2pbf16h(__tile1024i a, unsigned b) { - //CHECK-LABEL: @test_tile_cvtrowps2pbf16h +__m512bh test_tile_cvtrowps2bf16h(__tile1024i a, unsigned b) { + //CHECK-LABEL: @test_tile_cvtrowps2bf16h //CHECK-DAG: call x86_amx @llvm.x86.cast.vector.to.tile.v256i32(<256 x i32> {{%.*}}) - //CHECK-DAG: call <32 x bfloat> @llvm.x86.tcvtrowps2pbf16h.internal - return __tile_cvtrowps2pbf16h(a, b); + //CHECK-DAG: call <32 x bfloat> @llvm.x86.tcvtrowps2bf16h.internal + return __tile_cvtrowps2bf16h(a, b); } -__m512bh test_tile_cvtrowps2pbf16l(__tile1024i a, unsigned b) { - //CHECK-LABEL: @test_tile_cvtrowps2pbf16l +__m512bh test_tile_cvtrowps2bf16l(__tile1024i a, unsigned b) { + //CHECK-LABEL: @test_tile_cvtrowps2bf16l //CHECK-DAG: call x86_amx @llvm.x86.cast.vector.to.tile.v256i32(<256 x i32> {{%.*}}) - //CHECK-DAG: call <32 x bfloat> @llvm.x86.tcvtrowps2pbf16l.internal - return __tile_cvtrowps2pbf16l(a, b); + //CHECK-DAG: call <32 x bfloat> @llvm.x86.tcvtrowps2bf16l.internal + return __tile_cvtrowps2bf16l(a, b); } __m512h test_tile_cvtrowps2phh(__tile1024i a, unsigned b) { diff --git a/clang/test/CodeGen/X86/amxavx512-builtins.c b/clang/test/CodeGen/X86/amxavx512-builtins.c index 172b5ae8f5308..0f203349b1d1e 100644 --- a/clang/test/CodeGen/X86/amxavx512-builtins.c +++ 
b/clang/test/CodeGen/X86/amxavx512-builtins.c @@ -10,16 +10,16 @@ __m512 test_tile_cvtrowd2ps(unsigned int A) { return _tile_cvtrowd2ps(1, A); } -__m512bh test_tile_cvtrowps2pbf16h(unsigned int A) { - // CHECK-LABEL: @test_tile_cvtrowps2pbf16h( - // CHECK: call <32 x bfloat> @llvm.x86.tcvtrowps2pbf16h(i8 1, i32 %{{.*}}) - return _tile_cvtrowps2pbf16h(1, A); +__m512bh test_tile_cvtrowps2bf16h(unsigned int A) { + // CHECK-LABEL: @test_tile_cvtrowps2bf16h( + // CHECK: call <32 x bfloat> @llvm.x86.tcvtrowps2bf16h(i8 1, i32 %{{.*}}) + return _tile_cvtrowps2bf16h(1, A); } -__m512bh test_tile_cvtrowps2pbf16l(unsigned int A) { - // CHECK-LABEL: @test_tile_cvtrowps2pbf16l( - // CHECK: call <32 x bfloat> @llvm.x86.tcvtrowps2pbf16l(i8 1, i32 %{{.*}}) - return _tile_cvtrowps2pbf16l(1, A); +__m512bh test_tile_cvtrowps2bf16l(unsigned int A) { + // CHECK-LABEL: @test_tile_cvtrowps2bf16l( + // CHECK: call <32 x bfloat> @llvm.x86.tcvtrowps2bf16l(i8 1, i32 %{{.*}}) + return _tile_cvtrowps2bf16l(1, A); } __m512h test_tile_cvtrowps2phh(unsigned int A) { diff --git a/clang/test/CodeGenCXX/debug-info-codeview-nodebug.cpp b/clang/test/CodeGenCXX/debug-info-codeview-nodebug.cpp new file mode 100644 index 0000000000000..c57133d8ac721 --- /dev/null +++ b/clang/test/CodeGenCXX/debug-info-codeview-nodebug.cpp @@ -0,0 +1,20 @@ +// RUN: %clang_cc1 -DSETNODEBUG=0 -gcodeview -emit-llvm -std=c++14 -debug-info-kind=limited %s -o - | FileCheck %s --check-prefix=YESINFO +// RUN: %clang_cc1 -DSETNODEBUG=1 -gcodeview -emit-llvm -std=c++14 -debug-info-kind=limited %s -o - | FileCheck %s --check-prefix=NOINFO + +#if SETNODEBUG +#define NODEBUG __attribute__((nodebug)) +#else +#define NODEBUG +#endif + +struct t1 { + using t2 NODEBUG = void; +}; +void func6() { + t1 v1; +} +// YESINFO-DAG: !DIDerivedType(tag: DW_TAG_typedef, name: "t2" +// NOINFO: [[ELEMENTS:!.*]] = !{} +// NOINFO: !DICompositeType(tag: DW_TAG_structure_type, name: "t1", {{.*}}, elements: [[ELEMENTS]], +// NOINFO-NOT: !DIDerivedType(tag: 
DW_TAG_typedef, name: "t2" + diff --git a/clang/test/CodeGenCXX/debug-info-object-pointer.cpp b/clang/test/CodeGenCXX/debug-info-object-pointer.cpp index 594d4da791ee8..49079f5990996 100644 --- a/clang/test/CodeGenCXX/debug-info-object-pointer.cpp +++ b/clang/test/CodeGenCXX/debug-info-object-pointer.cpp @@ -5,12 +5,11 @@ // CHECK: !DIDerivedType(tag: DW_TAG_pointer_type // CHECK-SAME: flags: DIFlagArtificial | DIFlagObjectPointer // -// // FIXME: DIFlagObjectPointer not attached to the explicit object -// // argument in the subprogram declaration. // CHECK: !DISubprogram(name: "explicit_this", // flags: DIFlagPrototyped -// CHECK-NOT: DIFlagObjectPointer -// CHECK-NOT: DIFlagArtificial +// +// CHECK: !DIDerivedType(tag: DW_TAG_rvalue_reference_type +// CHECK-SAME: flags: DIFlagObjectPointer) // // CHECK: !DILocalVariable(name: "this", arg: 1 // CHECK-SAME: flags: DIFlagArtificial | DIFlagObjectPointer diff --git a/clang/test/CodeGenHLSL/Bool.hlsl b/clang/test/CodeGenHLSL/Bool.hlsl new file mode 100644 index 0000000000000..fb0f32b11241d --- /dev/null +++ b/clang/test/CodeGenHLSL/Bool.hlsl @@ -0,0 +1,12 @@ +// RUN: %clang_cc1 -finclude-default-header -triple dxil-pc-shadermodel6.3-library -emit-llvm -disable-llvm-passes -o - %s | FileCheck %s + +// CHECK-LABEL: define noundef i1 {{.*}}fn{{.*}}(i1 noundef %x) +// CHECK: [[X:%.*]] = alloca i32, align 4 +// CHECK-NEXT: [[Y:%.*]] = zext i1 {{%.*}} to i32 +// CHECK-NEXT: store i32 [[Y]], ptr [[X]], align 4 +// CHECK-NEXT: [[Z:%.*]] = load i32, ptr [[X]], align 4 +// CHECK-NEXT: [[L:%.*]] = trunc i32 [[Z]] to i1 +// CHECK-NEXT: ret i1 [[L]] +bool fn(bool x) { + return x; +} diff --git a/clang/test/CodeGenHLSL/builtins/WaveActiveSum.hlsl b/clang/test/CodeGenHLSL/builtins/WaveActiveSum.hlsl new file mode 100644 index 0000000000000..4bf423ccc1b82 --- /dev/null +++ b/clang/test/CodeGenHLSL/builtins/WaveActiveSum.hlsl @@ -0,0 +1,45 @@ +// RUN: %clang_cc1 -std=hlsl2021 -finclude-default-header -triple \ +// RUN: 
dxil-pc-shadermodel6.3-compute %s -emit-llvm -disable-llvm-passes -o - | \ +// RUN: FileCheck %s --check-prefixes=CHECK,CHECK-DXIL +// RUN: %clang_cc1 -std=hlsl2021 -finclude-default-header -triple \ +// RUN: spirv-pc-vulkan-compute %s -emit-llvm -disable-llvm-passes -o - | \ +// RUN: FileCheck %s --check-prefixes=CHECK,CHECK-SPIRV + +// Test basic lowering to runtime function call. + +// CHECK-LABEL: test_int +int test_int(int expr) { + // CHECK-SPIRV: %[[RET:.*]] = call spir_func [[TY:.*]] @llvm.spv.wave.reduce.sum.i32([[TY]] %[[#]]) + // CHECK-DXIL: %[[RET:.*]] = call [[TY:.*]] @llvm.dx.wave.reduce.sum.i32([[TY]] %[[#]]) + // CHECK: ret [[TY]] %[[RET]] + return WaveActiveSum(expr); +} + +// CHECK-DXIL: declare [[TY]] @llvm.dx.wave.reduce.sum.i32([[TY]]) #[[#attr:]] +// CHECK-SPIRV: declare spir_func [[TY]] @llvm.spv.wave.reduce.sum.i32([[TY]]) #[[#attr:]] + +// CHECK-LABEL: test_uint64_t +uint64_t test_uint64_t(uint64_t expr) { + // CHECK-SPIRV: %[[RET:.*]] = call spir_func [[TY:.*]] @llvm.spv.wave.reduce.sum.i64([[TY]] %[[#]]) + // CHECK-DXIL: %[[RET:.*]] = call [[TY:.*]] @llvm.dx.wave.reduce.usum.i64([[TY]] %[[#]]) + // CHECK: ret [[TY]] %[[RET]] + return WaveActiveSum(expr); +} + +// CHECK-DXIL: declare [[TY]] @llvm.dx.wave.reduce.usum.i64([[TY]]) #[[#attr:]] +// CHECK-SPIRV: declare spir_func [[TY]] @llvm.spv.wave.reduce.sum.i64([[TY]]) #[[#attr:]] + +// Test basic lowering to runtime function call with array and float value. 
+ +// CHECK-LABEL: test_floatv4 +float4 test_floatv4(float4 expr) { + // CHECK-SPIRV: %[[RET1:.*]] = call reassoc nnan ninf nsz arcp afn spir_func [[TY1:.*]] @llvm.spv.wave.reduce.sum.v4f32([[TY1]] %[[#]] + // CHECK-DXIL: %[[RET1:.*]] = call reassoc nnan ninf nsz arcp afn [[TY1:.*]] @llvm.dx.wave.reduce.sum.v4f32([[TY1]] %[[#]]) + // CHECK: ret [[TY1]] %[[RET1]] + return WaveActiveSum(expr); +} + +// CHECK-DXIL: declare [[TY1]] @llvm.dx.wave.reduce.sum.v4f32([[TY1]]) #[[#attr]] +// CHECK-SPIRV: declare spir_func [[TY1]] @llvm.spv.wave.reduce.sum.v4f32([[TY1]]) #[[#attr]] + +// CHECK: attributes #[[#attr]] = {{{.*}} convergent {{.*}}} diff --git a/clang/test/CodeGenHLSL/builtins/distance.hlsl b/clang/test/CodeGenHLSL/builtins/distance.hlsl index 6952700a87f1d..e830903261c8c 100644 --- a/clang/test/CodeGenHLSL/builtins/distance.hlsl +++ b/clang/test/CodeGenHLSL/builtins/distance.hlsl @@ -33,8 +33,9 @@ half test_distance_half(half X, half Y) { return distance(X, Y); } // SPVCHECK-LABEL: define spir_func noundef nofpclass(nan inf) half @_Z19test_distance_half2Dv2_DhS_( // SPVCHECK-SAME: <2 x half> noundef nofpclass(nan inf) [[X:%.*]], <2 x half> noundef nofpclass(nan inf) [[Y:%.*]]) local_unnamed_addr #[[ATTR0]] { // SPVCHECK-NEXT: [[ENTRY:.*:]] -// SPVCHECK-NEXT: [[SPV_DISTANCE_I:%.*]] = tail call reassoc nnan ninf nsz arcp afn noundef half @llvm.spv.distance.v2f16(<2 x half> [[X]], <2 x half> [[Y]]) -// SPVCHECK-NEXT: ret half [[SPV_DISTANCE_I]] +// SPVCHECK-NEXT: [[SUB_I:%.*]] = fsub reassoc nnan ninf nsz arcp afn <2 x half> [[X]], [[Y]] +// SPVCHECK-NEXT: [[SPV_LENGTH_I:%.*]] = tail call reassoc nnan ninf nsz arcp afn noundef half @llvm.spv.length.v2f16(<2 x half> [[SUB_I]]) +// SPVCHECK-NEXT: ret half [[SPV_LENGTH_I]] // half test_distance_half2(half2 X, half2 Y) { return distance(X, Y); } @@ -49,8 +50,9 @@ half test_distance_half2(half2 X, half2 Y) { return distance(X, Y); } // SPVCHECK-LABEL: define spir_func noundef nofpclass(nan inf) half 
@_Z19test_distance_half3Dv3_DhS_( // SPVCHECK-SAME: <3 x half> noundef nofpclass(nan inf) [[X:%.*]], <3 x half> noundef nofpclass(nan inf) [[Y:%.*]]) local_unnamed_addr #[[ATTR0]] { // SPVCHECK-NEXT: [[ENTRY:.*:]] -// SPVCHECK-NEXT: [[SPV_DISTANCE_I:%.*]] = tail call reassoc nnan ninf nsz arcp afn noundef half @llvm.spv.distance.v3f16(<3 x half> [[X]], <3 x half> [[Y]]) -// SPVCHECK-NEXT: ret half [[SPV_DISTANCE_I]] +// SPVCHECK-NEXT: [[SUB_I:%.*]] = fsub reassoc nnan ninf nsz arcp afn <3 x half> [[X]], [[Y]] +// SPVCHECK-NEXT: [[SPV_LENGTH_I:%.*]] = tail call reassoc nnan ninf nsz arcp afn noundef half @llvm.spv.length.v3f16(<3 x half> [[SUB_I]]) +// SPVCHECK-NEXT: ret half [[SPV_LENGTH_I]] // half test_distance_half3(half3 X, half3 Y) { return distance(X, Y); } @@ -65,8 +67,9 @@ half test_distance_half3(half3 X, half3 Y) { return distance(X, Y); } // SPVCHECK-LABEL: define spir_func noundef nofpclass(nan inf) half @_Z19test_distance_half4Dv4_DhS_( // SPVCHECK-SAME: <4 x half> noundef nofpclass(nan inf) [[X:%.*]], <4 x half> noundef nofpclass(nan inf) [[Y:%.*]]) local_unnamed_addr #[[ATTR0]] { // SPVCHECK-NEXT: [[ENTRY:.*:]] -// SPVCHECK-NEXT: [[SPV_DISTANCE_I:%.*]] = tail call reassoc nnan ninf nsz arcp afn noundef half @llvm.spv.distance.v4f16(<4 x half> [[X]], <4 x half> [[Y]]) -// SPVCHECK-NEXT: ret half [[SPV_DISTANCE_I]] +// SPVCHECK-NEXT: [[SUB_I:%.*]] = fsub reassoc nnan ninf nsz arcp afn <4 x half> [[X]], [[Y]] +// SPVCHECK-NEXT: [[SPV_LENGTH_I:%.*]] = tail call reassoc nnan ninf nsz arcp afn noundef half @llvm.spv.length.v4f16(<4 x half> [[SUB_I]]) +// SPVCHECK-NEXT: ret half [[SPV_LENGTH_I]] // half test_distance_half4(half4 X, half4 Y) { return distance(X, Y); } @@ -97,8 +100,9 @@ float test_distance_float(float X, float Y) { return distance(X, Y); } // SPVCHECK-LABEL: define spir_func noundef nofpclass(nan inf) float @_Z20test_distance_float2Dv2_fS_( // SPVCHECK-SAME: <2 x float> noundef nofpclass(nan inf) [[X:%.*]], <2 x float> noundef nofpclass(nan 
inf) [[Y:%.*]]) local_unnamed_addr #[[ATTR0]] { // SPVCHECK-NEXT: [[ENTRY:.*:]] -// SPVCHECK-NEXT: [[SPV_DISTANCE_I:%.*]] = tail call reassoc nnan ninf nsz arcp afn noundef float @llvm.spv.distance.v2f32(<2 x float> [[X]], <2 x float> [[Y]]) -// SPVCHECK-NEXT: ret float [[SPV_DISTANCE_I]] +// SPVCHECK-NEXT: [[SUB_I:%.*]] = fsub reassoc nnan ninf nsz arcp afn <2 x float> [[X]], [[Y]] +// SPVCHECK-NEXT: [[SPV_LENGTH_I:%.*]] = tail call reassoc nnan ninf nsz arcp afn noundef float @llvm.spv.length.v2f32(<2 x float> [[SUB_I]]) +// SPVCHECK-NEXT: ret float [[SPV_LENGTH_I]] // float test_distance_float2(float2 X, float2 Y) { return distance(X, Y); } @@ -113,8 +117,9 @@ float test_distance_float2(float2 X, float2 Y) { return distance(X, Y); } // SPVCHECK-LABEL: define spir_func noundef nofpclass(nan inf) float @_Z20test_distance_float3Dv3_fS_( // SPVCHECK-SAME: <3 x float> noundef nofpclass(nan inf) [[X:%.*]], <3 x float> noundef nofpclass(nan inf) [[Y:%.*]]) local_unnamed_addr #[[ATTR0]] { // SPVCHECK-NEXT: [[ENTRY:.*:]] -// SPVCHECK-NEXT: [[SPV_DISTANCE_I:%.*]] = tail call reassoc nnan ninf nsz arcp afn noundef float @llvm.spv.distance.v3f32(<3 x float> [[X]], <3 x float> [[Y]]) -// SPVCHECK-NEXT: ret float [[SPV_DISTANCE_I]] +// SPVCHECK-NEXT: [[SUB_I:%.*]] = fsub reassoc nnan ninf nsz arcp afn <3 x float> [[X]], [[Y]] +// SPVCHECK-NEXT: [[SPV_LENGTH_I:%.*]] = tail call reassoc nnan ninf nsz arcp afn noundef float @llvm.spv.length.v3f32(<3 x float> [[SUB_I]]) +// SPVCHECK-NEXT: ret float [[SPV_LENGTH_I]] // float test_distance_float3(float3 X, float3 Y) { return distance(X, Y); } @@ -129,7 +134,8 @@ float test_distance_float3(float3 X, float3 Y) { return distance(X, Y); } // SPVCHECK-LABEL: define spir_func noundef nofpclass(nan inf) float @_Z20test_distance_float4Dv4_fS_( // SPVCHECK-SAME: <4 x float> noundef nofpclass(nan inf) [[X:%.*]], <4 x float> noundef nofpclass(nan inf) [[Y:%.*]]) local_unnamed_addr #[[ATTR0]] { // SPVCHECK-NEXT: [[ENTRY:.*:]] -// 
SPVCHECK-NEXT: [[SPV_DISTANCE_I:%.*]] = tail call reassoc nnan ninf nsz arcp afn noundef float @llvm.spv.distance.v4f32(<4 x float> [[X]], <4 x float> [[Y]]) -// SPVCHECK-NEXT: ret float [[SPV_DISTANCE_I]] +// SPVCHECK-NEXT: [[SUB_I:%.*]] = fsub reassoc nnan ninf nsz arcp afn <4 x float> [[X]], [[Y]] +// SPVCHECK-NEXT: [[SPV_LENGTH_I:%.*]] = tail call reassoc nnan ninf nsz arcp afn noundef float @llvm.spv.length.v4f32(<4 x float> [[SUB_I]]) +// SPVCHECK-NEXT: ret float [[SPV_LENGTH_I]] // float test_distance_float4(float4 X, float4 Y) { return distance(X, Y); } diff --git a/clang/test/CodeGenHLSL/builtins/firstbitlow.hlsl b/clang/test/CodeGenHLSL/builtins/firstbitlow.hlsl new file mode 100644 index 0000000000000..5d490fabc5bc8 --- /dev/null +++ b/clang/test/CodeGenHLSL/builtins/firstbitlow.hlsl @@ -0,0 +1,153 @@ +// RUN: %clang_cc1 -finclude-default-header -x hlsl -triple \ +// RUN: dxil-pc-shadermodel6.3-library %s -fnative-half-type \ +// RUN: -emit-llvm -disable-llvm-passes -o - | FileCheck %s -DTARGET=dx +// RUN: %clang_cc1 -finclude-default-header -x hlsl -triple \ +// RUN: spirv-unknown-vulkan-compute %s -fnative-half-type \ +// RUN: -emit-llvm -disable-llvm-passes \ +// RUN: -o - | FileCheck %s -DTARGET=spv + +#ifdef __HLSL_ENABLE_16_BIT +// CHECK-LABEL: test_firstbitlow_ushort +// CHECK: call i32 @llvm.[[TARGET]].firstbitlow.i16 +uint test_firstbitlow_ushort(uint16_t p0) { + return firstbitlow(p0); +} + +// CHECK-LABEL: test_firstbitlow_ushort2 +// CHECK: call <2 x i32> @llvm.[[TARGET]].firstbitlow.v2i16 +uint2 test_firstbitlow_ushort2(uint16_t2 p0) { + return firstbitlow(p0); +} + +// CHECK-LABEL: test_firstbitlow_ushort3 +// CHECK: call <3 x i32> @llvm.[[TARGET]].firstbitlow.v3i16 +uint3 test_firstbitlow_ushort3(uint16_t3 p0) { + return firstbitlow(p0); +} + +// CHECK-LABEL: test_firstbitlow_ushort4 +// CHECK: call <4 x i32> @llvm.[[TARGET]].firstbitlow.v4i16 +uint4 test_firstbitlow_ushort4(uint16_t4 p0) { + return firstbitlow(p0); +} + +// CHECK-LABEL: 
test_firstbitlow_short +// CHECK: call i32 @llvm.[[TARGET]].firstbitlow.i16 +uint test_firstbitlow_short(int16_t p0) { + return firstbitlow(p0); +} + +// CHECK-LABEL: test_firstbitlow_short2 +// CHECK: call <2 x i32> @llvm.[[TARGET]].firstbitlow.v2i16 +uint2 test_firstbitlow_short2(int16_t2 p0) { + return firstbitlow(p0); +} + +// CHECK-LABEL: test_firstbitlow_short3 +// CHECK: call <3 x i32> @llvm.[[TARGET]].firstbitlow.v3i16 +uint3 test_firstbitlow_short3(int16_t3 p0) { + return firstbitlow(p0); +} + +// CHECK-LABEL: test_firstbitlow_short4 +// CHECK: call <4 x i32> @llvm.[[TARGET]].firstbitlow.v4i16 +uint4 test_firstbitlow_short4(int16_t4 p0) { + return firstbitlow(p0); +} +#endif // __HLSL_ENABLE_16_BIT + +// CHECK-LABEL: test_firstbitlow_uint +// CHECK: call i32 @llvm.[[TARGET]].firstbitlow.i32 +uint test_firstbitlow_uint(uint p0) { + return firstbitlow(p0); +} + +// CHECK-LABEL: test_firstbitlow_uint2 +// CHECK: call <2 x i32> @llvm.[[TARGET]].firstbitlow.v2i32 +uint2 test_firstbitlow_uint2(uint2 p0) { + return firstbitlow(p0); +} + +// CHECK-LABEL: test_firstbitlow_uint3 +// CHECK: call <3 x i32> @llvm.[[TARGET]].firstbitlow.v3i32 +uint3 test_firstbitlow_uint3(uint3 p0) { + return firstbitlow(p0); +} + +// CHECK-LABEL: test_firstbitlow_uint4 +// CHECK: call <4 x i32> @llvm.[[TARGET]].firstbitlow.v4i32 +uint4 test_firstbitlow_uint4(uint4 p0) { + return firstbitlow(p0); +} + +// CHECK-LABEL: test_firstbitlow_ulong +// CHECK: call i32 @llvm.[[TARGET]].firstbitlow.i64 +uint test_firstbitlow_ulong(uint64_t p0) { + return firstbitlow(p0); +} + +// CHECK-LABEL: test_firstbitlow_ulong2 +// CHECK: call <2 x i32> @llvm.[[TARGET]].firstbitlow.v2i64 +uint2 test_firstbitlow_ulong2(uint64_t2 p0) { + return firstbitlow(p0); +} + +// CHECK-LABEL: test_firstbitlow_ulong3 +// CHECK: call <3 x i32> @llvm.[[TARGET]].firstbitlow.v3i64 +uint3 test_firstbitlow_ulong3(uint64_t3 p0) { + return firstbitlow(p0); +} + +// CHECK-LABEL: test_firstbitlow_ulong4 +// CHECK: call <4 x i32> 
@llvm.[[TARGET]].firstbitlow.v4i64 +uint4 test_firstbitlow_ulong4(uint64_t4 p0) { + return firstbitlow(p0); +} + +// CHECK-LABEL: test_firstbitlow_int +// CHECK: call i32 @llvm.[[TARGET]].firstbitlow.i32 +uint test_firstbitlow_int(int p0) { + return firstbitlow(p0); +} + +// CHECK-LABEL: test_firstbitlow_int2 +// CHECK: call <2 x i32> @llvm.[[TARGET]].firstbitlow.v2i32 +uint2 test_firstbitlow_int2(int2 p0) { + return firstbitlow(p0); +} + +// CHECK-LABEL: test_firstbitlow_int3 +// CHECK: call <3 x i32> @llvm.[[TARGET]].firstbitlow.v3i32 +uint3 test_firstbitlow_int3(int3 p0) { + return firstbitlow(p0); +} + +// CHECK-LABEL: test_firstbitlow_int4 +// CHECK: call <4 x i32> @llvm.[[TARGET]].firstbitlow.v4i32 +uint4 test_firstbitlow_int4(int4 p0) { + return firstbitlow(p0); +} + +// CHECK-LABEL: test_firstbitlow_long +// CHECK: call i32 @llvm.[[TARGET]].firstbitlow.i64 +uint test_firstbitlow_long(int64_t p0) { + return firstbitlow(p0); +} + +// CHECK-LABEL: test_firstbitlow_long2 +// CHECK: call <2 x i32> @llvm.[[TARGET]].firstbitlow.v2i64 +uint2 test_firstbitlow_long2(int64_t2 p0) { + return firstbitlow(p0); +} + +// CHECK-LABEL: test_firstbitlow_long3 +// CHECK: call <3 x i32> @llvm.[[TARGET]].firstbitlow.v3i64 +uint3 test_firstbitlow_long3(int64_t3 p0) { + return firstbitlow(p0); +} + +// CHECK-LABEL: test_firstbitlow_long4 +// CHECK: call <4 x i32> @llvm.[[TARGET]].firstbitlow.v4i64 +uint4 test_firstbitlow_long4(int64_t4 p0) { + return firstbitlow(p0); +} diff --git a/clang/test/CodeGenHLSL/builtins/length.hlsl b/clang/test/CodeGenHLSL/builtins/length.hlsl index fcf3ee76ba5bb..2d4bbd995298f 100644 --- a/clang/test/CodeGenHLSL/builtins/length.hlsl +++ b/clang/test/CodeGenHLSL/builtins/length.hlsl @@ -1,114 +1,163 @@ -// RUN: %clang_cc1 -finclude-default-header -triple \ -// RUN: dxil-pc-shadermodel6.3-library %s -fnative-half-type \ -// RUN: -emit-llvm -O1 -o - | FileCheck %s --check-prefixes=CHECK,DXCHECK \ -// RUN: -DTARGET=dx +// NOTE: Assertions have been 
autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --version 5 +// RUN: %clang_cc1 -finclude-default-header -x hlsl -triple \ +// RUN: dxil-pc-shadermodel6.3-library %s -fnative-half-type \ +// RUN: -emit-llvm -O1 -o - | FileCheck %s // RUN: %clang_cc1 -finclude-default-header -triple \ // RUN: spirv-unknown-vulkan-compute %s -fnative-half-type \ -// RUN: -emit-llvm -O1 -o - | FileCheck %s --check-prefixes=CHECK,SPVCHECK \ -// RUN: -DTARGET=spv +// RUN: -emit-llvm -O1 -o - | FileCheck %s --check-prefix=SPVCHECK -// SPVCHECK-LABEL: define spir_func noundef nofpclass(nan inf) half @_Z16test_length_halfDh( // DXCHECK-LABEL: define noundef nofpclass(nan inf) half @_Z16test_length_halfDh( +// + +// CHECK-LABEL: define noundef nofpclass(nan inf) half @_Z16test_length_halfDh( // CHECK-SAME: half noundef nofpclass(nan inf) [[P0:%.*]]) local_unnamed_addr #[[ATTR0:[0-9]+]] { // CHECK-NEXT: [[ENTRY:.*:]] // CHECK-NEXT: [[ELT_ABS_I:%.*]] = tail call reassoc nnan ninf nsz arcp afn noundef half @llvm.fabs.f16(half [[P0]]) // CHECK-NEXT: ret half [[ELT_ABS_I]] // - +// SPVCHECK-LABEL: define spir_func noundef nofpclass(nan inf) half @_Z16test_length_halfDh( +// SPVCHECK-SAME: half noundef nofpclass(nan inf) [[P0:%.*]]) local_unnamed_addr #[[ATTR0:[0-9]+]] { +// SPVCHECK-NEXT: [[ENTRY:.*:]] +// SPVCHECK-NEXT: [[ELT_ABS_I:%.*]] = tail call reassoc nnan ninf nsz arcp afn noundef half @llvm.fabs.f16(half [[P0]]) +// SPVCHECK-NEXT: ret half [[ELT_ABS_I]] +// half test_length_half(half p0) { return length(p0); } -// SPVCHECK-LABEL: define spir_func noundef nofpclass(nan inf) half @_Z17test_length_half2Dv2_Dh( // DXCHECK-LABEL: define noundef nofpclass(nan inf) half @_Z17test_length_half2Dv2_Dh( +// + + +// CHECK-LABEL: define noundef nofpclass(nan inf) half @_Z17test_length_half2Dv2_Dh( // CHECK-SAME: <2 x half> noundef nofpclass(nan inf) [[P0:%.*]]) local_unnamed_addr #[[ATTR0]] { // CHECK-NEXT: [[ENTRY:.*:]] -// CHECK-NEXT: [[HLSL_DOT_I:%.*]] = tail call reassoc nnan ninf nsz 
arcp afn half @llvm.[[TARGET]].fdot.v2f16(<2 x half> [[P0]], <2 x half> [[P0]]) +// CHECK-NEXT: [[HLSL_DOT_I:%.*]] = tail call reassoc nnan ninf nsz arcp afn half @llvm.dx.fdot.v2f16(<2 x half> [[P0]], <2 x half> [[P0]]) // CHECK-NEXT: [[TMP0:%.*]] = tail call reassoc nnan ninf nsz arcp afn noundef half @llvm.sqrt.f16(half [[HLSL_DOT_I]]) // CHECK-NEXT: ret half [[TMP0]] // - - +// SPVCHECK-LABEL: define spir_func noundef nofpclass(nan inf) half @_Z17test_length_half2Dv2_Dh( +// SPVCHECK-SAME: <2 x half> noundef nofpclass(nan inf) [[P0:%.*]]) local_unnamed_addr #[[ATTR0]] { +// SPVCHECK-NEXT: [[ENTRY:.*:]] +// SPVCHECK-NEXT: [[SPV_LENGTH_I:%.*]] = tail call reassoc nnan ninf nsz arcp afn noundef half @llvm.spv.length.v2f16(<2 x half> [[P0]]) +// SPVCHECK-NEXT: ret half [[SPV_LENGTH_I]] +// half test_length_half2(half2 p0) { return length(p0); } -// SPVCHECK-LABEL: define spir_func noundef nofpclass(nan inf) half @_Z17test_length_half3Dv3_Dh( // DXCHECK-LABEL: define noundef nofpclass(nan inf) half @_Z17test_length_half3Dv3_Dh( +// CHECK-LABEL: define noundef nofpclass(nan inf) half @_Z17test_length_half3Dv3_Dh( // CHECK-SAME: <3 x half> noundef nofpclass(nan inf) [[P0:%.*]]) local_unnamed_addr #[[ATTR0]] { // CHECK-NEXT: [[ENTRY:.*:]] -// CHECK-NEXT: [[HLSL_DOT_I:%.*]] = tail call reassoc nnan ninf nsz arcp afn half @llvm.[[TARGET]].fdot.v3f16(<3 x half> [[P0]], <3 x half> [[P0]]) +// CHECK-NEXT: [[HLSL_DOT_I:%.*]] = tail call reassoc nnan ninf nsz arcp afn half @llvm.dx.fdot.v3f16(<3 x half> [[P0]], <3 x half> [[P0]]) // CHECK-NEXT: [[TMP0:%.*]] = tail call reassoc nnan ninf nsz arcp afn noundef half @llvm.sqrt.f16(half [[HLSL_DOT_I]]) // CHECK-NEXT: ret half [[TMP0]] // +// SPVCHECK-LABEL: define spir_func noundef nofpclass(nan inf) half @_Z17test_length_half3Dv3_Dh( +// SPVCHECK-SAME: <3 x half> noundef nofpclass(nan inf) [[P0:%.*]]) local_unnamed_addr #[[ATTR0]] { +// SPVCHECK-NEXT: [[ENTRY:.*:]] +// SPVCHECK-NEXT: [[SPV_LENGTH_I:%.*]] = tail call reassoc nnan 
ninf nsz arcp afn noundef half @llvm.spv.length.v3f16(<3 x half> [[P0]]) +// SPVCHECK-NEXT: ret half [[SPV_LENGTH_I]] +// half test_length_half3(half3 p0) { return length(p0); } -// SPVCHECK-LABEL: define spir_func noundef nofpclass(nan inf) half @_Z17test_length_half4Dv4_Dh( // DXCHECK-LABEL: define noundef nofpclass(nan inf) half @_Z17test_length_half4Dv4_Dh( +// CHECK-LABEL: define noundef nofpclass(nan inf) half @_Z17test_length_half4Dv4_Dh( // CHECK-SAME: <4 x half> noundef nofpclass(nan inf) [[P0:%.*]]) local_unnamed_addr #[[ATTR0]] { // CHECK-NEXT: [[ENTRY:.*:]] -// CHECK-NEXT: [[HLSL_DOT_I:%.*]] = tail call reassoc nnan ninf nsz arcp afn half @llvm.[[TARGET]].fdot.v4f16(<4 x half> [[P0]], <4 x half> [[P0]]) +// CHECK-NEXT: [[HLSL_DOT_I:%.*]] = tail call reassoc nnan ninf nsz arcp afn half @llvm.dx.fdot.v4f16(<4 x half> [[P0]], <4 x half> [[P0]]) // CHECK-NEXT: [[TMP0:%.*]] = tail call reassoc nnan ninf nsz arcp afn noundef half @llvm.sqrt.f16(half [[HLSL_DOT_I]]) // CHECK-NEXT: ret half [[TMP0]] // +// SPVCHECK-LABEL: define spir_func noundef nofpclass(nan inf) half @_Z17test_length_half4Dv4_Dh( +// SPVCHECK-SAME: <4 x half> noundef nofpclass(nan inf) [[P0:%.*]]) local_unnamed_addr #[[ATTR0]] { +// SPVCHECK-NEXT: [[ENTRY:.*:]] +// SPVCHECK-NEXT: [[SPV_LENGTH_I:%.*]] = tail call reassoc nnan ninf nsz arcp afn noundef half @llvm.spv.length.v4f16(<4 x half> [[P0]]) +// SPVCHECK-NEXT: ret half [[SPV_LENGTH_I]] +// half test_length_half4(half4 p0) { return length(p0); } -// SPVCHECK-LABEL: define spir_func noundef nofpclass(nan inf) float @_Z17test_length_floatf( // DXCHECK-LABEL: define noundef nofpclass(nan inf) float @_Z17test_length_floatf( +// CHECK-LABEL: define noundef nofpclass(nan inf) float @_Z17test_length_floatf( // CHECK-SAME: float noundef nofpclass(nan inf) [[P0:%.*]]) local_unnamed_addr #[[ATTR0]] { // CHECK-NEXT: [[ENTRY:.*:]] // CHECK-NEXT: [[ELT_ABS_I:%.*]] = tail call reassoc nnan ninf nsz arcp afn noundef float @llvm.fabs.f32(float [[P0]]) 
// CHECK-NEXT: ret float [[ELT_ABS_I]] // +// SPVCHECK-LABEL: define spir_func noundef nofpclass(nan inf) float @_Z17test_length_floatf( +// SPVCHECK-SAME: float noundef nofpclass(nan inf) [[P0:%.*]]) local_unnamed_addr #[[ATTR0]] { +// SPVCHECK-NEXT: [[ENTRY:.*:]] +// SPVCHECK-NEXT: [[ELT_ABS_I:%.*]] = tail call reassoc nnan ninf nsz arcp afn noundef float @llvm.fabs.f32(float [[P0]]) +// SPVCHECK-NEXT: ret float [[ELT_ABS_I]] +// float test_length_float(float p0) { return length(p0); } -// SPVCHECK-LABEL: define spir_func noundef nofpclass(nan inf) float @_Z18test_length_float2Dv2_f( // DXCHECK-LABEL: define noundef nofpclass(nan inf) float @_Z18test_length_float2Dv2_f( +// CHECK-LABEL: define noundef nofpclass(nan inf) float @_Z18test_length_float2Dv2_f( // CHECK-SAME: <2 x float> noundef nofpclass(nan inf) [[P0:%.*]]) local_unnamed_addr #[[ATTR0]] { // CHECK-NEXT: [[ENTRY:.*:]] -// CHECK-NEXT: [[HLSL_DOT_I:%.*]] = tail call reassoc nnan ninf nsz arcp afn float @llvm.[[TARGET]].fdot.v2f32(<2 x float> [[P0]], <2 x float> [[P0]]) +// CHECK-NEXT: [[HLSL_DOT_I:%.*]] = tail call reassoc nnan ninf nsz arcp afn float @llvm.dx.fdot.v2f32(<2 x float> [[P0]], <2 x float> [[P0]]) // CHECK-NEXT: [[TMP0:%.*]] = tail call reassoc nnan ninf nsz arcp afn noundef float @llvm.sqrt.f32(float [[HLSL_DOT_I]]) // CHECK-NEXT: ret float [[TMP0]] // +// SPVCHECK-LABEL: define spir_func noundef nofpclass(nan inf) float @_Z18test_length_float2Dv2_f( +// SPVCHECK-SAME: <2 x float> noundef nofpclass(nan inf) [[P0:%.*]]) local_unnamed_addr #[[ATTR0]] { +// SPVCHECK-NEXT: [[ENTRY:.*:]] +// SPVCHECK-NEXT: [[SPV_LENGTH_I:%.*]] = tail call reassoc nnan ninf nsz arcp afn noundef float @llvm.spv.length.v2f32(<2 x float> [[P0]]) +// SPVCHECK-NEXT: ret float [[SPV_LENGTH_I]] +// float test_length_float2(float2 p0) { return length(p0); } -// SPVCHECK-LABEL: define spir_func noundef nofpclass(nan inf) float @_Z18test_length_float3Dv3_f( // DXCHECK-LABEL: define noundef nofpclass(nan inf) float 
@_Z18test_length_float3Dv3_f( +// CHECK-LABEL: define noundef nofpclass(nan inf) float @_Z18test_length_float3Dv3_f( // CHECK-SAME: <3 x float> noundef nofpclass(nan inf) [[P0:%.*]]) local_unnamed_addr #[[ATTR0]] { // CHECK-NEXT: [[ENTRY:.*:]] -// CHECK-NEXT: [[HLSL_DOT_I:%.*]] = tail call reassoc nnan ninf nsz arcp afn float @llvm.[[TARGET]].fdot.v3f32(<3 x float> [[P0]], <3 x float> [[P0]]) +// CHECK-NEXT: [[HLSL_DOT_I:%.*]] = tail call reassoc nnan ninf nsz arcp afn float @llvm.dx.fdot.v3f32(<3 x float> [[P0]], <3 x float> [[P0]]) // CHECK-NEXT: [[TMP0:%.*]] = tail call reassoc nnan ninf nsz arcp afn noundef float @llvm.sqrt.f32(float [[HLSL_DOT_I]]) // CHECK-NEXT: ret float [[TMP0]] // +// SPVCHECK-LABEL: define spir_func noundef nofpclass(nan inf) float @_Z18test_length_float3Dv3_f( +// SPVCHECK-SAME: <3 x float> noundef nofpclass(nan inf) [[P0:%.*]]) local_unnamed_addr #[[ATTR0]] { +// SPVCHECK-NEXT: [[ENTRY:.*:]] +// SPVCHECK-NEXT: [[SPV_LENGTH_I:%.*]] = tail call reassoc nnan ninf nsz arcp afn noundef float @llvm.spv.length.v3f32(<3 x float> [[P0]]) +// SPVCHECK-NEXT: ret float [[SPV_LENGTH_I]] +// float test_length_float3(float3 p0) { return length(p0); } -// SPVCHECK-LABEL: define spir_func noundef nofpclass(nan inf) float @_Z18test_length_float4Dv4_f( // DXCHECK-LABEL: define noundef nofpclass(nan inf) float @_Z18test_length_float4Dv4_f( +// CHECK-LABEL: define noundef nofpclass(nan inf) float @_Z18test_length_float4Dv4_f( // CHECK-SAME: <4 x float> noundef nofpclass(nan inf) [[P0:%.*]]) local_unnamed_addr #[[ATTR0]] { // CHECK-NEXT: [[ENTRY:.*:]] -// CHECK-NEXT: [[HLSL_DOT_I:%.*]] = tail call reassoc nnan ninf nsz arcp afn float @llvm.[[TARGET]].fdot.v4f32(<4 x float> [[P0]], <4 x float> [[P0]]) +// CHECK-NEXT: [[HLSL_DOT_I:%.*]] = tail call reassoc nnan ninf nsz arcp afn float @llvm.dx.fdot.v4f32(<4 x float> [[P0]], <4 x float> [[P0]]) // CHECK-NEXT: [[TMP0:%.*]] = tail call reassoc nnan ninf nsz arcp afn noundef float @llvm.sqrt.f32(float 
[[HLSL_DOT_I]]) // CHECK-NEXT: ret float [[TMP0]] // +// SPVCHECK-LABEL: define spir_func noundef nofpclass(nan inf) float @_Z18test_length_float4Dv4_f( +// SPVCHECK-SAME: <4 x float> noundef nofpclass(nan inf) [[P0:%.*]]) local_unnamed_addr #[[ATTR0]] { +// SPVCHECK-NEXT: [[ENTRY:.*:]] +// SPVCHECK-NEXT: [[SPV_LENGTH_I:%.*]] = tail call reassoc nnan ninf nsz arcp afn noundef float @llvm.spv.length.v4f32(<4 x float> [[P0]]) +// SPVCHECK-NEXT: ret float [[SPV_LENGTH_I]] +// float test_length_float4(float4 p0) { return length(p0); diff --git a/clang/test/CodeGenSPIRV/Builtins/length.c b/clang/test/CodeGenSPIRV/Builtins/length.c new file mode 100644 index 0000000000000..59e7c298dd816 --- /dev/null +++ b/clang/test/CodeGenSPIRV/Builtins/length.c @@ -0,0 +1,31 @@ +// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --version 5 + +// RUN: %clang_cc1 -O1 -triple spirv-pc-vulkan-compute %s -emit-llvm -o - | FileCheck %s + +typedef float float2 __attribute__((ext_vector_type(2))); +typedef float float3 __attribute__((ext_vector_type(3))); +typedef float float4 __attribute__((ext_vector_type(4))); + +// CHECK-LABEL: define spir_func float @test_length_float2( +// CHECK-SAME: <2 x float> noundef [[X:%.*]]) local_unnamed_addr #[[ATTR0:[0-9]+]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[SPV_LENGTH:%.*]] = tail call float @llvm.spv.length.v2f32(<2 x float> [[X]]) +// CHECK-NEXT: ret float [[SPV_LENGTH]] +// +float test_length_float2(float2 X) { return __builtin_spirv_length(X); } + +// CHECK-LABEL: define spir_func float @test_length_float3( +// CHECK-SAME: <3 x float> noundef [[X:%.*]]) local_unnamed_addr #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[SPV_LENGTH:%.*]] = tail call float @llvm.spv.length.v3f32(<3 x float> [[X]]) +// CHECK-NEXT: ret float [[SPV_LENGTH]] +// +float test_length_float3(float3 X) { return __builtin_spirv_length(X); } + +// CHECK-LABEL: define spir_func float @test_length_float4( +// CHECK-SAME: 
<4 x float> noundef [[X:%.*]]) local_unnamed_addr #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[SPV_LENGTH:%.*]] = tail call float @llvm.spv.length.v4f32(<4 x float> [[X]]) +// CHECK-NEXT: ret float [[SPV_LENGTH]] +// +float test_length_float4(float4 X) { return __builtin_spirv_length(X); } diff --git a/clang/test/Driver/arm-mfpu.c b/clang/test/Driver/arm-mfpu.c index 1b174be388124..5ea2230044dfb 100644 --- a/clang/test/Driver/arm-mfpu.c +++ b/clang/test/Driver/arm-mfpu.c @@ -356,8 +356,10 @@ // CHECK-HF-DAG: "-target-cpu" "arm1176jzf-s" // RUN: %clang -target armv7-apple-darwin -x assembler %s -### -c 2>&1 \ -// RUN: | FileCheck --check-prefix=ASM %s -// ASM-NOT: -target-feature +// RUN: | FileCheck --check-prefix=ASM-NEON %s +// RUN: %clang -target armv7-windows -x assembler %s -### -c 2>&1 \ +// RUN: | FileCheck --check-prefix=ASM-NEON %s +// ASM-NEON: "-target-feature" "+neon" // RUN: %clang -target armv8-linux-gnueabi -mfloat-abi=soft -mfpu=none %s -### -c 2>&1 \ // RUN: | FileCheck --check-prefix=CHECK-SOFT-ABI-FP %s @@ -409,6 +411,25 @@ // CHECK-ARM7-ANDROID-FP-DEFAULT-NOT: "-target-feature" "+sha2" // CHECK-ARM7-ANDROID-FP-DEFAULT-NOT: "-target-feature" "+aes" +// RUN: %clang -target armv8-linux-androideabi21 %s -### -c 2>&1 \ +// RUN: | FileCheck --check-prefix=CHECK-ARM8-ANDROID-FP-DEFAULT %s +// CHECK-ARM8-ANDROID-FP-DEFAULT-DAG: "-target-feature" "+soft-float-abi" +// CHECK-ARM8-ANDROID-FP-DEFAULT-DAG: "-target-feature" "+vfp3" +// CHECK-ARM8-ANDROID-FP-DEFAULT-DAG: "-target-feature" "+vfp4" +// CHECK-ARM8-ANDROID-FP-DEFAULT-DAG: "-target-feature" "+fp-armv8" +// CHECK-ARM8-ANDROID-FP-DEFAULT-DAG: "-target-feature" "+aes" +// CHECK-ARM8-ANDROID-FP-DEFAULT-DAG: "-target-feature" "+sha2" +// CHECK-ARM8-ANDROID-FP-DEFAULT-NOT: "-target-feature" "+neon" + +// RUN: %clang -target armv8-linux-android %s -### -c 2>&1 \ +// RUN: | FileCheck --check-prefix=CHECK-ARM8-ANDROID-DEFAULT %s +// CHECK-ARM8-ANDROID-DEFAULT-DAG: "-target-feature" "+vfp3" 
+// CHECK-ARM8-ANDROID-DEFAULT-DAG: "-target-feature" "+vfp4" +// CHECK-ARM8-ANDROID-DEFAULT-DAG: "-target-feature" "+fp-armv8" +// CHECK-ARM8-ANDROID-DEFAULT-DAG: "-target-feature" "+aes" +// CHECK-ARM8-ANDROID-DEFAULT-DAG: "-target-feature" "+sha2" +// CHECK-ARM8-ANDROID-DEFAULT-NOT: "-target-feature" "+neon" + // RUN: %clang -target armv7-linux-androideabi21 %s -mfpu=vfp3-d16 -### -c 2>&1 \ // RUN: | FileCheck --check-prefix=CHECK-ARM7-ANDROID-FP-D16 %s // CHECK-ARM7-ANDROID-FP-D16-NOT: "-target-feature" "+soft-float" diff --git a/clang/test/Driver/baremetal-multilib-custom-flags.yaml b/clang/test/Driver/baremetal-multilib-custom-flags.yaml new file mode 100644 index 0000000000000..9c0320ea16117 --- /dev/null +++ b/clang/test/Driver/baremetal-multilib-custom-flags.yaml @@ -0,0 +1,81 @@ +# UNSUPPORTED: system-windows + +# RUN: %clang --multi-lib-config=%s -no-canonical-prefixes -x c %s -### -o /dev/null 2>&1 \ +# RUN: --target=thumbv8m.main-none-eabi -mfpu=none --sysroot= \ +# RUN: | FileCheck --check-prefix=CHECK-DEFAULT %s + +# CHECK-DEFAULT: "-cc1" "-triple" "thumbv8m.main-unknown-none-eabi" +# CHECK-DEFAULT-SAME: "-internal-isystem" "[[SYSROOT:[^"]*]]/bin/../lib/clang-runtimes/arm-none-eabi/thumb/v8-m.main/nofp/include" +# CHECK-DEFAULT-NEXT: "-L[[SYSROOT]]/bin/../lib/clang-runtimes/arm-none-eabi/thumb/v8-m.main/nofp/lib" + +# RUN: %clang --multi-lib-config=%s -no-canonical-prefixes -x c %s -### -o /dev/null 2>&1 \ +# RUN: --target=thumbv8m.main-none-eabi -mfpu=none -fmultilib-flag=no-multithreaded --sysroot= \ +# RUN: | FileCheck --check-prefix=CHECK-NOMULTI %s + +# CHECK-NOMULTI: "-cc1" "-triple" "thumbv8m.main-unknown-none-eabi" +# CHECK-NOMULTI-SAME: "-internal-isystem" "[[SYSROOT:[^"]*]]/bin/../lib/clang-runtimes/arm-none-eabi/thumb/v8-m.main/nofp/include" +# CHECK-NOMULTI-NEXT: "-L[[SYSROOT]]/bin/../lib/clang-runtimes/arm-none-eabi/thumb/v8-m.main/nofp/lib" + +# RUN: %clang --multi-lib-config=%s -no-canonical-prefixes -x c %s -### -o /dev/null 2>&1 \ +# 
RUN: --target=thumbv8m.main-none-eabi -mfpu=none -fmultilib-flag=multithreaded --sysroot= \ +# RUN: | FileCheck --check-prefix=CHECK-MULTI %s + +# CHECK-MULTI: "-cc1" "-triple" "thumbv8m.main-unknown-none-eabi" +# CHECK-MULTI-SAME: "-internal-isystem" "[[SYSROOT:[^"]*]]/bin/../lib/clang-runtimes/arm-none-eabi/multithreaded/thumb/v8-m.main/nofp/include" +# CHECK-MULTI-NEXT: "-L[[SYSROOT]]/bin/../lib/clang-runtimes/arm-none-eabi/multithreaded/thumb/v8-m.main/nofp/lib" + +# RUN: not %clang --multi-lib-config=%s -no-canonical-prefixes -x c %s -### -o /dev/null 2>&1 \ +# RUN: --target=thumbv8m.main-none-eabi -mfpu=none -fmultilib-flag=singlethreaded -fmultilib-flag=no-io --sysroot= \ +# RUN: | FileCheck --check-prefix=CHECK-ERROR %s +# CHECK-ERROR-DAG: error: unsupported option '-fmultilib-flag=singlethreaded' +# CHECK-ERROR-DAG: error: unsupported option '-fmultilib-flag=no-io'; did you mean '-fmultilib-flag=io-none'? + +# RUN: %clang --multi-lib-config=%s -no-canonical-prefixes -x c %s -### -o /dev/null 2>&1 \ +# RUN: --target=thumbv8m.main-none-eabi -mfpu=none -print-multi-lib --sysroot= \ +# RUN: | FileCheck --check-prefix=CHECK-PRINT-MULTI-LIB %s +# CHECK-PRINT-MULTI-LIB: arm-none-eabi/thumb/v8-m.main/nofp;@-target=thumbv8m.main-unknown-none-eabi@mfpu=none@fmultilib-flag=no-multithreaded +# CHECK-PRINT-MULTI-LIB: arm-none-eabi/multithreaded/thumb/v8-m.main/nofp;@-target=thumbv8m.main-unknown-none-eabi@mfpu=none@fmultilib-flag=multithreaded + +# RUN: %clang --target=arm-none-eabi --multi-lib-config=%s -x c %s -fmultilib-flag=no-multithreaded -### -o /dev/null 2>&1 \ +# RUN: | FileCheck --check-prefix=CHECK-MACRODEFINES-NOMULTI %s +# CHECK-MACRODEFINES-NOMULTI: "-D" "__SINGLE_THREAD__" + +# RUN: %clang --target=arm-none-eabi --multi-lib-config=%s -x c %s -fmultilib-flag=io-semihosting -### -o /dev/null 2>&1 \ +# RUN: | FileCheck --check-prefix=CHECK-MACRODEFINES-IO-SEMIHOSTING %s +# CHECK-MACRODEFINES-IO-SEMIHOSTING: "-D" "SEMIHOSTING" + +# RUN: %clang 
--target=arm-none-eabi --multi-lib-config=%s -x c %s -fmultilib-flag=io-linux-syscalls -### -o /dev/null 2>&1 \ +# RUN: | FileCheck --check-prefix=CHECK-MACRODEFINES-IO-LINUX %s +# CHECK-MACRODEFINES-IO-LINUX: "-D" "LINUX_SYSCALLS" +# CHECK-MACRODEFINES-IO-LINUX-SAME: "-D" "HOSTED" + +--- +MultilibVersion: 1.0 + +Groups: +- Name: stdlib + Type: Exclusive + +Variants: +- Dir: arm-none-eabi/thumb/v8-m.main/nofp + Flags: [--target=thumbv8m.main-unknown-none-eabi, -mfpu=none, -fmultilib-flag=no-multithreaded] + Group: stdlib +- Dir: arm-none-eabi/multithreaded/thumb/v8-m.main/nofp + Flags: [--target=thumbv8m.main-unknown-none-eabi, -mfpu=none, -fmultilib-flag=multithreaded] + Group: stdlib + +Flags: + - Name: multithreading + Values: + - Name: no-multithreaded + MacroDefines: [__SINGLE_THREAD__] + - Name: multithreaded + Default: no-multithreaded + - Name: io + Values: + - Name: io-none + - Name: io-semihosting + MacroDefines: [SEMIHOSTING] + - Name: io-linux-syscalls + MacroDefines: [LINUX_SYSCALLS, HOSTED] + Default: io-none \ No newline at end of file diff --git a/clang/test/Driver/fuse-lipo.c b/clang/test/Driver/fuse-lipo.c new file mode 100644 index 0000000000000..2dedb86ddc527 --- /dev/null +++ b/clang/test/Driver/fuse-lipo.c @@ -0,0 +1,15 @@ +// RUN: %clang %s -### --target=arm64-apple-darwin -arch x86_64 -arch arm64 -fuse-lipo=llvm-lipo 2>&1 | FileCheck -check-prefix=TEST1 %s +// TEST1: llvm-lipo + +// RUN: %clang %s -### --target=arm64-apple-darwin -arch x86_64 -arch arm64 -fuse-lipo=nonexistant-lipo 2>&1 | FileCheck -check-prefix=TEST2 %s +// TEST2: nonexistant-lipo + +// RUN: %clang %s -### --target=arm64-apple-darwin -fuse-lipo=llvm-lipo 2>&1 | FileCheck -check-prefix=TEST3 %s +// TEST3: clang: warning: argument unused during compilation: '-fuse-lipo=llvm-lipo' + +// RUN: %clang %s -### --target=arm64-apple-darwin -Wno-unused-command-line-argument -fuse-lipo=llvm-lipo 2>&1 | FileCheck -check-prefix=TEST4 %s +// TEST4-NOT: llvm-lipo + +// RUN: %clang %s -### 
--target=arm64-apple-darwin -arch x86_64 -arch arm64 2>&1 | FileCheck -check-prefix=TEST5 %s +// TEST5: lipo +// TEST5-NOT: llvm-lipo diff --git a/clang/test/Modules/gmodules-nodebug.cpp b/clang/test/Modules/gmodules-nodebug.cpp new file mode 100644 index 0000000000000..d83103768e838 --- /dev/null +++ b/clang/test/Modules/gmodules-nodebug.cpp @@ -0,0 +1,14 @@ +// REQUIRES: asserts + +// RUN: %clang_cc1 -std=c++23 -x c++-header -emit-pch -fmodule-format=obj \ +// RUN: -o %t.pch %s \ +// RUN: -mllvm -debug-only=pchcontainer &>%t-pch.ll +// RUN: cat %t-pch.ll | FileCheck %s + +template +using __void_t [[gnu::nodebug]] = void; + +__void_t<> func() {} + +// CHECK: !DICompileUnit +// CHECK-NOT: __void_t diff --git a/clang/test/Modules/module-local-visibility-in-language-linkage.cppm b/clang/test/Modules/module-local-visibility-in-language-linkage.cppm new file mode 100644 index 0000000000000..c046aef4e7486 --- /dev/null +++ b/clang/test/Modules/module-local-visibility-in-language-linkage.cppm @@ -0,0 +1,16 @@ +// RUN: rm -rf %t +// RUN: split-file %s %t +// RUN: cd %t +// +// RUN: %clang_cc1 -std=c++20 %t/m.a.cppm -emit-module-interface -o %t/a.pcm +// RUN: %clang_cc1 -std=c++20 %t/m.b.cppm -fmodule-file=m:a=%t/a.pcm -fsyntax-only -verify + +//--- m.a.cppm +export module m:a; +int a; + +//--- m.b.cppm +// expected-no-diagnostics +module m:b; +import :a; +extern "C++" int get_a() { return a; } diff --git a/clang/test/Modules/preferred_name.cppm b/clang/test/Modules/preferred_name.cppm index 806781a81c5ca..86ba6ae96db99 100644 --- a/clang/test/Modules/preferred_name.cppm +++ b/clang/test/Modules/preferred_name.cppm @@ -53,10 +53,16 @@ import A; export using ::foo_templ; //--- Use1.cpp -import A; // expected-warning@foo.h:8 {{attribute declaration must precede definition}} -#include "foo.h" // expected-note@foo.h:9 {{previous definition is here}} - +// expected-no-diagnostics +import A; +#include "foo.h" //--- Use2.cpp // expected-no-diagnostics #include "foo.h" import A; + 
+//--- Use3.cpp +#include "foo.h" +import A; +foo test; +int size = test.size(); // expected-error {{no member named 'size' in 'foo'}} diff --git a/clang/test/Preprocessor/arm-target-features.c b/clang/test/Preprocessor/arm-target-features.c index 2999ee0d9e4d8..95ca7d0cbc3c2 100644 --- a/clang/test/Preprocessor/arm-target-features.c +++ b/clang/test/Preprocessor/arm-target-features.c @@ -132,6 +132,30 @@ // CHECK-V7VE-DEFAULT-ABI-SOFT: #define __ARM_ARCH_EXT_IDIV__ 1 // CHECK-V7VE-DEFAULT-ABI-SOFT: #define __ARM_FP 0xc +// RUN: %clang -target x86_64-apple-macosx10.10 -arch armv7 -x c -E -dM %s -o - | FileCheck -match-full-lines --check-prefix=CHECK-DARWIN-V7 %s +// CHECK-DARWIN-V7: #define __ARMEL__ 1 +// CHECK-DARWIN-V7: #define __ARM_ARCH 7 +// CHECK-DARWIN-V7: #define __ARM_ARCH_7A__ 1 +// CHECK-DARWIN-V7-NOT: __ARM_FEATURE_CRC32 +// CHECK-DARWIN-V7-NOT: __ARM_FEATURE_NUMERIC_MAXMIN +// CHECK-DARWIN-V7-NOT: __ARM_FEATURE_DIRECTED_ROUNDING +// CHECK-DARWIN-V7: #define __ARM_FP 0xc +// CHECK-DARWIN-V7: #define __ARM_NEON 1 +// CHECK-DARWIN-V7: #define __ARM_NEON_FP 0x4 +// CHECK-DARWIN-V7: #define __ARM_NEON__ 1 + +// RUN: %clang -target armv7-windows -x c -E -dM %s -o - | FileCheck -match-full-lines --check-prefix=CHECK-WINDOWS-V7 %s +// CHECK-WINDOWS-V7: #define __ARMEL__ 1 +// CHECK-WINDOWS-V7: #define __ARM_ARCH 7 +// CHECK-WINDOWS-V7: #define __ARM_ARCH_7A__ 1 +// CHECK-WINDOWS-V7-NOT: __ARM_FEATURE_CRC32 +// CHECK-WINDOWS-V7-NOT: __ARM_FEATURE_NUMERIC_MAXMIN +// CHECK-WINDOWS-V7-NOT: __ARM_FEATURE_DIRECTED_ROUNDING +// CHECK-WINDOWS-V7: #define __ARM_FP 0xe +// CHECK-WINDOWS-V7: #define __ARM_NEON 1 +// CHECK-WINDOWS-V7: #define __ARM_NEON_FP 0x6 +// CHECK-WINDOWS-V7: #define __ARM_NEON__ 1 + // RUN: %clang -target x86_64-apple-macosx10.10 -arch armv7s -x c -E -dM %s -o - | FileCheck -match-full-lines --check-prefix=CHECK-V7S %s // CHECK-V7S: #define __ARMEL__ 1 // CHECK-V7S: #define __ARM_ARCH 7 @@ -140,6 +164,9 @@ // CHECK-V7S-NOT: 
__ARM_FEATURE_NUMERIC_MAXMIN // CHECK-V7S-NOT: __ARM_FEATURE_DIRECTED_ROUNDING // CHECK-V7S: #define __ARM_FP 0xe +// CHECK-V7S: #define __ARM_NEON 1 +// CHECK-V7S: #define __ARM_NEON_FP 0x6 +// CHECK-V7S: #define __ARM_NEON__ 1 // RUN: %clang -target arm-arm-none-eabi -march=armv7-m -mfloat-abi=soft -x c -E -dM %s | FileCheck -match-full-lines --check-prefix=CHECK-VFP-FP %s // RUN: %clang -target arm-arm-none-eabi -march=armv7-m -mfloat-abi=softfp -x c -E -dM %s | FileCheck -match-full-lines --check-prefix=CHECK-VFP-FP %s diff --git a/clang/test/Preprocessor/init-aarch64.c b/clang/test/Preprocessor/init-aarch64.c index 3d2f4b83abcb8..8578993dbfaeb 100644 --- a/clang/test/Preprocessor/init-aarch64.c +++ b/clang/test/Preprocessor/init-aarch64.c @@ -123,7 +123,7 @@ // AARCH64-NEXT: #define __FPCLASS_SNAN 0x0001 // AARCH64-NEXT: #define __FP_FAST_FMA 1 // AARCH64-NEXT: #define __FP_FAST_FMAF 1 -// AARCH64-NEXT: #define __FUNCTION_MULTI_VERSIONING_SUPPORT_LEVEL 202420 +// AARCH64-NEXT: #define __FUNCTION_MULTI_VERSIONING_SUPPORT_LEVEL 202430 // AARCH64-NEXT: #define __GCC_ASM_FLAG_OUTPUTS__ 1 // AARCH64-NEXT: #define __GCC_CONSTRUCTIVE_SIZE {{.+}} // AARCH64-NEXT: #define __GCC_DESTRUCTIVE_SIZE {{.+}} @@ -434,7 +434,7 @@ // AARCH64-DARWIN: #define __FLT_MIN_EXP__ (-125) // AARCH64-DARWIN: #define __FLT_MIN__ 1.17549435e-38F // AARCH64-DARWIN: #define __FLT_RADIX__ 2 -// AARCH64-DARWIN: #define __FUNCTION_MULTI_VERSIONING_SUPPORT_LEVEL 202420 +// AARCH64-DARWIN: #define __FUNCTION_MULTI_VERSIONING_SUPPORT_LEVEL 202430 // AARCH64-DARWIN: #define __INT16_C_SUFFIX__ // AARCH64-DARWIN: #define __INT16_FMTd__ "hd" // AARCH64-DARWIN: #define __INT16_FMTi__ "hi" @@ -651,7 +651,7 @@ // AARCH64-MSVC: #define __FLT_MIN_EXP__ (-125) // AARCH64-MSVC: #define __FLT_MIN__ 1.17549435e-38F // AARCH64-MSVC: #define __FLT_RADIX__ 2 -// AARCH64-MSVC: #define __FUNCTION_MULTI_VERSIONING_SUPPORT_LEVEL 202420 +// AARCH64-MSVC: #define __FUNCTION_MULTI_VERSIONING_SUPPORT_LEVEL 202430 // 
AARCH64-MSVC: #define __INT_MAX__ 2147483647 // AARCH64-MSVC: #define __LDBL_DECIMAL_DIG__ 17 // AARCH64-MSVC: #define __LDBL_DENORM_MIN__ 4.9406564584124654e-324L @@ -859,7 +859,7 @@ // ARM64EC-MSVC: #define __FPCLASS_SNAN 0x0001 // ARM64EC-MSVC: #define __FP_FAST_FMA 1 // ARM64EC-MSVC: #define __FP_FAST_FMAF 1 -// ARM64EC-MSVC: #define __FUNCTION_MULTI_VERSIONING_SUPPORT_LEVEL 202420 +// ARM64EC-MSVC: #define __FUNCTION_MULTI_VERSIONING_SUPPORT_LEVEL 202430 // ARM64EC-MSVC: #define __GCC_ASM_FLAG_OUTPUTS__ 1 // ARM64EC-MSVC: #define __GCC_HAVE_SYNC_COMPARE_AND_SWAP_1 1 // ARM64EC-MSVC: #define __GCC_HAVE_SYNC_COMPARE_AND_SWAP_16 1 diff --git a/clang/test/Sema/aarch64-sve2-intrinsics/acle_sve2_imm_lane.cpp b/clang/test/Sema/aarch64-sve2-intrinsics/acle_sve2_imm_lane.cpp index e405077b3de93..9be83f442de5d 100644 --- a/clang/test/Sema/aarch64-sve2-intrinsics/acle_sve2_imm_lane.cpp +++ b/clang/test/Sema/aarch64-sve2-intrinsics/acle_sve2_imm_lane.cpp @@ -5,9 +5,9 @@ #ifdef SVE_OVERLOADED_FORMS // A simple used,unused... macro, long enough to represent any SVE builtin. 
-#define SVE_ACLE_FUNC(A1,A2_UNUSED,A3,A4_UNUSED) A1##A3 +#define SVE_ACLE_FUNC(A1, A2_UNUSED) A1 #else -#define SVE_ACLE_FUNC(A1,A2,A3,A4) A1##A2##A3##A4 +#define SVE_ACLE_FUNC(A1,A2) A1##A2 #endif #include @@ -15,201 +15,201 @@ void test_range_0_7() { // expected-error-re@+1 {{argument value {{[0-9]+}} is outside the valid range [0, 7]}} - SVE_ACLE_FUNC(svmla_lane,_s16,,)(svundef_s16(), svundef_s16(), svundef_s16(), 8); + SVE_ACLE_FUNC(svmla_lane,_s16)(svundef_s16(), svundef_s16(), svundef_s16(), 8); // expected-error-re@+1 {{argument value {{[0-9]+}} is outside the valid range [0, 7]}} - SVE_ACLE_FUNC(svmla_lane,_u16,,)(svundef_u16(), svundef_u16(), svundef_u16(), -1); + SVE_ACLE_FUNC(svmla_lane,_u16)(svundef_u16(), svundef_u16(), svundef_u16(), -1); // expected-error-re@+1 {{argument value {{[0-9]+}} is outside the valid range [0, 7]}} - SVE_ACLE_FUNC(svmlalb_lane,_s32,,)(svundef_s32(), svundef_s16(), svundef_s16(), -1); + SVE_ACLE_FUNC(svmlalb_lane,_s32)(svundef_s32(), svundef_s16(), svundef_s16(), -1); // expected-error-re@+1 {{argument value {{[0-9]+}} is outside the valid range [0, 7]}} - SVE_ACLE_FUNC(svmlalb_lane,_u32,,)(svundef_u32(), svundef_u16(), svundef_u16(), 8); + SVE_ACLE_FUNC(svmlalb_lane,_u32)(svundef_u32(), svundef_u16(), svundef_u16(), 8); // expected-error-re@+1 {{argument value {{[0-9]+}} is outside the valid range [0, 7]}} - SVE_ACLE_FUNC(svmlalb_lane,_f32,,)(svundef_f32(), svundef_f16(), svundef_f16(), -1); + SVE_ACLE_FUNC(svmlalb_lane,_f32)(svundef_f32(), svundef_f16(), svundef_f16(), -1); // expected-error-re@+1 {{argument value {{[0-9]+}} is outside the valid range [0, 7]}} - SVE_ACLE_FUNC(svmlalt_lane,_s32,,)(svundef_s32(), svundef_s16(), svundef_s16(), -1); + SVE_ACLE_FUNC(svmlalt_lane,_s32)(svundef_s32(), svundef_s16(), svundef_s16(), -1); // expected-error-re@+1 {{argument value {{[0-9]+}} is outside the valid range [0, 7]}} - SVE_ACLE_FUNC(svmlalt_lane,_u32,,)(svundef_u32(), svundef_u16(), svundef_u16(), 8); + 
SVE_ACLE_FUNC(svmlalt_lane,_u32)(svundef_u32(), svundef_u16(), svundef_u16(), 8); // expected-error-re@+1 {{argument value {{[0-9]+}} is outside the valid range [0, 7]}} - SVE_ACLE_FUNC(svmlalt_lane,_f32,,)(svundef_f32(), svundef_f16(), svundef_f16(), -1); + SVE_ACLE_FUNC(svmlalt_lane,_f32)(svundef_f32(), svundef_f16(), svundef_f16(), -1); // expected-error-re@+1 {{argument value {{[0-9]+}} is outside the valid range [0, 7]}} - SVE_ACLE_FUNC(svmls_lane,_s16,,)(svundef_s16(), svundef_s16(), svundef_s16(), -1); + SVE_ACLE_FUNC(svmls_lane,_s16)(svundef_s16(), svundef_s16(), svundef_s16(), -1); // expected-error-re@+1 {{argument value {{[0-9]+}} is outside the valid range [0, 7]}} - SVE_ACLE_FUNC(svmls_lane,_u16,,)(svundef_u16(), svundef_u16(), svundef_u16(), 8); + SVE_ACLE_FUNC(svmls_lane,_u16)(svundef_u16(), svundef_u16(), svundef_u16(), 8); // expected-error-re@+1 {{argument value {{[0-9]+}} is outside the valid range [0, 7]}} - SVE_ACLE_FUNC(svmlslb_lane,_s32,,)(svundef_s32(), svundef_s16(), svundef_s16(), -1); + SVE_ACLE_FUNC(svmlslb_lane,_s32)(svundef_s32(), svundef_s16(), svundef_s16(), -1); // expected-error-re@+1 {{argument value {{[0-9]+}} is outside the valid range [0, 7]}} - SVE_ACLE_FUNC(svmlslb_lane,_u32,,)(svundef_u32(), svundef_u16(), svundef_u16(), 8); + SVE_ACLE_FUNC(svmlslb_lane,_u32)(svundef_u32(), svundef_u16(), svundef_u16(), 8); // expected-error-re@+1 {{argument value {{[0-9]+}} is outside the valid range [0, 7]}} - SVE_ACLE_FUNC(svmlslb_lane,_f32,,)(svundef_f32(), svundef_f16(), svundef_f16(), -1); + SVE_ACLE_FUNC(svmlslb_lane,_f32)(svundef_f32(), svundef_f16(), svundef_f16(), -1); // expected-error-re@+1 {{argument value {{[0-9]+}} is outside the valid range [0, 7]}} - SVE_ACLE_FUNC(svmlslt_lane,_s32,,)(svundef_s32(), svundef_s16(), svundef_s16(), -1); + SVE_ACLE_FUNC(svmlslt_lane,_s32)(svundef_s32(), svundef_s16(), svundef_s16(), -1); // expected-error-re@+1 {{argument value {{[0-9]+}} is outside the valid range [0, 7]}} - 
SVE_ACLE_FUNC(svmlslt_lane,_u32,,)(svundef_u32(), svundef_u16(), svundef_u16(), 8); + SVE_ACLE_FUNC(svmlslt_lane,_u32)(svundef_u32(), svundef_u16(), svundef_u16(), 8); // expected-error-re@+1 {{argument value {{[0-9]+}} is outside the valid range [0, 7]}} - SVE_ACLE_FUNC(svmlslt_lane,_f32,,)(svundef_f32(), svundef_f16(), svundef_f16(), -1); + SVE_ACLE_FUNC(svmlslt_lane,_f32)(svundef_f32(), svundef_f16(), svundef_f16(), -1); // expected-error-re@+1 {{argument value {{[0-9]+}} is outside the valid range [0, 7]}} - SVE_ACLE_FUNC(svmul_lane,_s16,,)(svundef_s16(), svundef_s16(), 8); + SVE_ACLE_FUNC(svmul_lane,_s16)(svundef_s16(), svundef_s16(), 8); // expected-error-re@+1 {{argument value {{[0-9]+}} is outside the valid range [0, 7]}} - SVE_ACLE_FUNC(svmul_lane,_u16,,)(svundef_u16(), svundef_u16(), -1); + SVE_ACLE_FUNC(svmul_lane,_u16)(svundef_u16(), svundef_u16(), -1); // expected-error-re@+1 {{argument value {{[0-9]+}} is outside the valid range [0, 7]}} - SVE_ACLE_FUNC(svmullb_lane,_s32,,)(svundef_s16(), svundef_s16(), -1); + SVE_ACLE_FUNC(svmullb_lane,_s32)(svundef_s16(), svundef_s16(), -1); // expected-error-re@+1 {{argument value {{[0-9]+}} is outside the valid range [0, 7]}} - SVE_ACLE_FUNC(svmullb_lane,_u32,,)(svundef_u16(), svundef_u16(), 8); + SVE_ACLE_FUNC(svmullb_lane,_u32)(svundef_u16(), svundef_u16(), 8); // expected-error-re@+1 {{argument value {{[0-9]+}} is outside the valid range [0, 7]}} - SVE_ACLE_FUNC(svmullt_lane,_s32,,)(svundef_s16(), svundef_s16(), 8); + SVE_ACLE_FUNC(svmullt_lane,_s32)(svundef_s16(), svundef_s16(), 8); // expected-error-re@+1 {{argument value {{[0-9]+}} is outside the valid range [0, 7]}} - SVE_ACLE_FUNC(svmullt_lane,_u32,,)(svundef_u16(), svundef_u16(), -1); + SVE_ACLE_FUNC(svmullt_lane,_u32)(svundef_u16(), svundef_u16(), -1); // expected-error-re@+1 {{argument value {{[0-9]+}} is outside the valid range [0, 7]}} - SVE_ACLE_FUNC(svqdmlalb_lane,_s32,,)(svundef_s32(), svundef_s16(), svundef_s16(), -1); + 
SVE_ACLE_FUNC(svqdmlalb_lane,_s32)(svundef_s32(), svundef_s16(), svundef_s16(), -1); // expected-error-re@+1 {{argument value {{[0-9]+}} is outside the valid range [0, 7]}} - SVE_ACLE_FUNC(svqdmlalt_lane,_s32,,)(svundef_s32(), svundef_s16(), svundef_s16(), -1); + SVE_ACLE_FUNC(svqdmlalt_lane,_s32)(svundef_s32(), svundef_s16(), svundef_s16(), -1); // expected-error-re@+1 {{argument value {{[0-9]+}} is outside the valid range [0, 7]}} - SVE_ACLE_FUNC(svqdmulh_lane,_s16,,)(svundef_s16(), svundef_s16(), -1); + SVE_ACLE_FUNC(svqdmulh_lane,_s16)(svundef_s16(), svundef_s16(), -1); // expected-error-re@+1 {{argument value {{[0-9]+}} is outside the valid range [0, 7]}} - SVE_ACLE_FUNC(svqdmlslb_lane,_s32,,)(svundef_s32(), svundef_s16(), svundef_s16(), -1); + SVE_ACLE_FUNC(svqdmlslb_lane,_s32)(svundef_s32(), svundef_s16(), svundef_s16(), -1); // expected-error-re@+1 {{argument value {{[0-9]+}} is outside the valid range [0, 7]}} - SVE_ACLE_FUNC(svqdmlslt_lane,_s32,,)(svundef_s32(), svundef_s16(), svundef_s16(), -1); + SVE_ACLE_FUNC(svqdmlslt_lane,_s32)(svundef_s32(), svundef_s16(), svundef_s16(), -1); // expected-error-re@+1 {{argument value {{[0-9]+}} is outside the valid range [0, 7]}} - SVE_ACLE_FUNC(svqdmullb_lane,_s32,,)(svundef_s16(), svundef_s16(), -1); + SVE_ACLE_FUNC(svqdmullb_lane,_s32)(svundef_s16(), svundef_s16(), -1); // expected-error-re@+1 {{argument value {{[0-9]+}} is outside the valid range [0, 7]}} - SVE_ACLE_FUNC(svqdmullt_lane,_s32,,)(svundef_s16(), svundef_s16(), -1); + SVE_ACLE_FUNC(svqdmullt_lane,_s32)(svundef_s16(), svundef_s16(), -1); // expected-error-re@+1 {{argument value {{[0-9]+}} is outside the valid range [0, 7]}} - SVE_ACLE_FUNC(svqrdmlah_lane,_s16,,)(svundef_s16(), svundef_s16(), svundef_s16(), -1); + SVE_ACLE_FUNC(svqrdmlah_lane,_s16)(svundef_s16(), svundef_s16(), svundef_s16(), -1); // expected-error-re@+1 {{argument value {{[0-9]+}} is outside the valid range [0, 7]}} - SVE_ACLE_FUNC(svqrdmlsh_lane,_s16,,)(svundef_s16(), svundef_s16(), 
svundef_s16(), -1); + SVE_ACLE_FUNC(svqrdmlsh_lane,_s16)(svundef_s16(), svundef_s16(), svundef_s16(), -1); // expected-error-re@+1 {{argument value {{[0-9]+}} is outside the valid range [0, 7]}} - SVE_ACLE_FUNC(svqrdmulh_lane,_s16,,)(svundef_s16(), svundef_s16(), -1); + SVE_ACLE_FUNC(svqrdmulh_lane,_s16)(svundef_s16(), svundef_s16(), -1); // expected-error-re@+1 {{argument value {{[0-9]+}} is outside the valid range [0, 7]}} - SVE_ACLE_FUNC(svluti2_lane,_s16,,)(svundef_s16(), svundef_u8(), -1); + SVE_ACLE_FUNC(svluti2_lane,_s16)(svundef_s16(), svundef_u8(), -1); // expected-error-re@+1 {{argument value {{[0-9]+}} is outside the valid range [0, 7]}} - SVE_ACLE_FUNC(svluti2_lane,_u16,,)(svundef_u16(), svundef_u8(), -1); + SVE_ACLE_FUNC(svluti2_lane,_u16)(svundef_u16(), svundef_u8(), -1); // expected-error-re@+1 {{argument value {{[0-9]+}} is outside the valid range [0, 7]}} - SVE_ACLE_FUNC(svluti2_lane,_f16,,)(svundef_f16(), svundef_u8(), -1); + SVE_ACLE_FUNC(svluti2_lane,_f16)(svundef_f16(), svundef_u8(), -1); // expected-error-re@+1 {{argument value {{[0-9]+}} is outside the valid range [0, 7]}} - SVE_ACLE_FUNC(svluti2_lane,_bf16,,)(svundef_bf16(), svundef_u8(), -1); + SVE_ACLE_FUNC(svluti2_lane,_bf16)(svundef_bf16(), svundef_u8(), -1); } void test_range_0_3() { // expected-error-re@+1 {{argument value {{[0-9]+}} is outside the valid range [0, 3]}} - SVE_ACLE_FUNC(svcdot_lane,_s32,,)(svundef_s32(), svundef_s8(), svundef_s8(), -1, 0); + SVE_ACLE_FUNC(svcdot_lane,_s32)(svundef_s32(), svundef_s8(), svundef_s8(), -1, 0); // expected-error-re@+1 {{argument value {{[0-9]+}} is outside the valid range [0, 3]}} - SVE_ACLE_FUNC(svcmla_lane,_s16,,)(svundef_s16(), svundef_s16(), svundef_s16(), -1, 0); + SVE_ACLE_FUNC(svcmla_lane,_s16)(svundef_s16(), svundef_s16(), svundef_s16(), -1, 0); // expected-error-re@+1 {{argument value {{[0-9]+}} is outside the valid range [0, 3]}} - SVE_ACLE_FUNC(svcmla_lane,_u16,,)(svundef_u16(), svundef_u16(), svundef_u16(), -1, 0); + 
SVE_ACLE_FUNC(svcmla_lane,_u16)(svundef_u16(), svundef_u16(), svundef_u16(), -1, 0); // expected-error-re@+1 {{argument value {{[0-9]+}} is outside the valid range [0, 3]}} - SVE_ACLE_FUNC(svmla_lane,_s32,,)(svundef_s32(), svundef_s32(), svundef_s32(), -1); + SVE_ACLE_FUNC(svmla_lane,_s32)(svundef_s32(), svundef_s32(), svundef_s32(), -1); // expected-error-re@+1 {{argument value {{[0-9]+}} is outside the valid range [0, 3]}} - SVE_ACLE_FUNC(svmla_lane,_u32,,)(svundef_u32(), svundef_u32(), svundef_u32(), 4); + SVE_ACLE_FUNC(svmla_lane,_u32)(svundef_u32(), svundef_u32(), svundef_u32(), 4); // expected-error-re@+1 {{argument value {{[0-9]+}} is outside the valid range [0, 3]}} - SVE_ACLE_FUNC(svmlalb_lane,_s64,,)(svundef_s64(), svundef_s32(), svundef_s32(), -1); + SVE_ACLE_FUNC(svmlalb_lane,_s64)(svundef_s64(), svundef_s32(), svundef_s32(), -1); // expected-error-re@+1 {{argument value {{[0-9]+}} is outside the valid range [0, 3]}} - SVE_ACLE_FUNC(svmlalb_lane,_u64,,)(svundef_u64(), svundef_u32(), svundef_u32(), 4); + SVE_ACLE_FUNC(svmlalb_lane,_u64)(svundef_u64(), svundef_u32(), svundef_u32(), 4); // expected-error-re@+1 {{argument value {{[0-9]+}} is outside the valid range [0, 3]}} - SVE_ACLE_FUNC(svmlalt_lane,_s64,,)(svundef_s64(), svundef_s32(), svundef_s32(), -1); + SVE_ACLE_FUNC(svmlalt_lane,_s64)(svundef_s64(), svundef_s32(), svundef_s32(), -1); // expected-error-re@+1 {{argument value {{[0-9]+}} is outside the valid range [0, 3]}} - SVE_ACLE_FUNC(svmlalt_lane,_u64,,)(svundef_u64(), svundef_u32(), svundef_u32(), 4); + SVE_ACLE_FUNC(svmlalt_lane,_u64)(svundef_u64(), svundef_u32(), svundef_u32(), 4); // expected-error-re@+1 {{argument value {{[0-9]+}} is outside the valid range [0, 3]}} - SVE_ACLE_FUNC(svmls_lane,_s32,,)(svundef_s32(), svundef_s32(), svundef_s32(), 4); + SVE_ACLE_FUNC(svmls_lane,_s32)(svundef_s32(), svundef_s32(), svundef_s32(), 4); // expected-error-re@+1 {{argument value {{[0-9]+}} is outside the valid range [0, 3]}} - 
SVE_ACLE_FUNC(svmls_lane,_u32,,)(svundef_u32(), svundef_u32(), svundef_u32(), -1); + SVE_ACLE_FUNC(svmls_lane,_u32)(svundef_u32(), svundef_u32(), svundef_u32(), -1); // expected-error-re@+1 {{argument value {{[0-9]+}} is outside the valid range [0, 3]}} - SVE_ACLE_FUNC(svmlslb_lane,_s64,,)(svundef_s64(), svundef_s32(), svundef_s32(), -1); + SVE_ACLE_FUNC(svmlslb_lane,_s64)(svundef_s64(), svundef_s32(), svundef_s32(), -1); // expected-error-re@+1 {{argument value {{[0-9]+}} is outside the valid range [0, 3]}} - SVE_ACLE_FUNC(svmlslb_lane,_u64,,)(svundef_u64(), svundef_u32(), svundef_u32(), 4); + SVE_ACLE_FUNC(svmlslb_lane,_u64)(svundef_u64(), svundef_u32(), svundef_u32(), 4); // expected-error-re@+1 {{argument value {{[0-9]+}} is outside the valid range [0, 3]}} - SVE_ACLE_FUNC(svmlslt_lane,_s64,,)(svundef_s64(), svundef_s32(), svundef_s32(), -1); + SVE_ACLE_FUNC(svmlslt_lane,_s64)(svundef_s64(), svundef_s32(), svundef_s32(), -1); // expected-error-re@+1 {{argument value {{[0-9]+}} is outside the valid range [0, 3]}} - SVE_ACLE_FUNC(svmlslt_lane,_u64,,)(svundef_u64(), svundef_u32(), svundef_u32(), 4); + SVE_ACLE_FUNC(svmlslt_lane,_u64)(svundef_u64(), svundef_u32(), svundef_u32(), 4); // expected-error-re@+1 {{argument value {{[0-9]+}} is outside the valid range [0, 3]}} - SVE_ACLE_FUNC(svmul_lane,_s32,,)(svundef_s32(), svundef_s32(), -1); + SVE_ACLE_FUNC(svmul_lane,_s32)(svundef_s32(), svundef_s32(), -1); // expected-error-re@+1 {{argument value {{[0-9]+}} is outside the valid range [0, 3]}} - SVE_ACLE_FUNC(svmul_lane,_u32,,)(svundef_u32(), svundef_u32(), 4); + SVE_ACLE_FUNC(svmul_lane,_u32)(svundef_u32(), svundef_u32(), 4); // expected-error-re@+1 {{argument value {{[0-9]+}} is outside the valid range [0, 3]}} - SVE_ACLE_FUNC(svmullb_lane,_s64,,)(svundef_s32(), svundef_s32(), 4); + SVE_ACLE_FUNC(svmullb_lane,_s64)(svundef_s32(), svundef_s32(), 4); // expected-error-re@+1 {{argument value {{[0-9]+}} is outside the valid range [0, 3]}} - 
SVE_ACLE_FUNC(svmullb_lane,_u64,,)(svundef_u32(), svundef_u32(), -1); + SVE_ACLE_FUNC(svmullb_lane,_u64)(svundef_u32(), svundef_u32(), -1); // expected-error-re@+1 {{argument value {{[0-9]+}} is outside the valid range [0, 3]}} - SVE_ACLE_FUNC(svmullt_lane,_s64,,)(svundef_s32(), svundef_s32(), -1); + SVE_ACLE_FUNC(svmullt_lane,_s64)(svundef_s32(), svundef_s32(), -1); // expected-error-re@+1 {{argument value {{[0-9]+}} is outside the valid range [0, 3]}} - SVE_ACLE_FUNC(svmullt_lane,_u64,,)(svundef_u32(), svundef_u32(), 4); + SVE_ACLE_FUNC(svmullt_lane,_u64)(svundef_u32(), svundef_u32(), 4); // expected-error-re@+1 {{argument value {{[0-9]+}} is outside the valid range [0, 3]}} - SVE_ACLE_FUNC(svqdmulh_lane,_s32,,)(svundef_s32(), svundef_s32(), 4); + SVE_ACLE_FUNC(svqdmulh_lane,_s32)(svundef_s32(), svundef_s32(), 4); // expected-error-re@+1 {{argument value {{[0-9]+}} is outside the valid range [0, 3]}} - SVE_ACLE_FUNC(svqrdcmlah_lane,_s16,,)(svundef_s16(), svundef_s16(), svundef_s16(), -1, 0); + SVE_ACLE_FUNC(svqrdcmlah_lane,_s16)(svundef_s16(), svundef_s16(), svundef_s16(), -1, 0); // expected-error-re@+1 {{argument value {{[0-9]+}} is outside the valid range [0, 3]}} - SVE_ACLE_FUNC(svqrdmlah_lane,_s32,,)(svundef_s32(), svundef_s32(), svundef_s32(), 4); + SVE_ACLE_FUNC(svqrdmlah_lane,_s32)(svundef_s32(), svundef_s32(), svundef_s32(), 4); // expected-error-re@+1 {{argument value {{[0-9]+}} is outside the valid range [0, 3]}} - SVE_ACLE_FUNC(svqdmlalb_lane,_s64,,)(svundef_s64(), svundef_s32(), svundef_s32(), 4); + SVE_ACLE_FUNC(svqdmlalb_lane,_s64)(svundef_s64(), svundef_s32(), svundef_s32(), 4); // expected-error-re@+1 {{argument value {{[0-9]+}} is outside the valid range [0, 3]}} - SVE_ACLE_FUNC(svqdmlalt_lane,_s64,,)(svundef_s64(), svundef_s32(), svundef_s32(), 4); + SVE_ACLE_FUNC(svqdmlalt_lane,_s64)(svundef_s64(), svundef_s32(), svundef_s32(), 4); // expected-error-re@+1 {{argument value {{[0-9]+}} is outside the valid range [0, 3]}} - 
SVE_ACLE_FUNC(svqrdmlsh_lane,_s32,,)(svundef_s32(), svundef_s32(), svundef_s32(), 4); + SVE_ACLE_FUNC(svqrdmlsh_lane,_s32)(svundef_s32(), svundef_s32(), svundef_s32(), 4); // expected-error-re@+1 {{argument value {{[0-9]+}} is outside the valid range [0, 3]}} - SVE_ACLE_FUNC(svqdmlslb_lane,_s64,,)(svundef_s64(), svundef_s32(), svundef_s32(), 4); + SVE_ACLE_FUNC(svqdmlslb_lane,_s64)(svundef_s64(), svundef_s32(), svundef_s32(), 4); // expected-error-re@+1 {{argument value {{[0-9]+}} is outside the valid range [0, 3]}} - SVE_ACLE_FUNC(svqdmlslt_lane,_s64,,)(svundef_s64(), svundef_s32(), svundef_s32(), 4); + SVE_ACLE_FUNC(svqdmlslt_lane,_s64)(svundef_s64(), svundef_s32(), svundef_s32(), 4); // expected-error-re@+1 {{argument value {{[0-9]+}} is outside the valid range [0, 3]}} - SVE_ACLE_FUNC(svqrdmulh_lane,_s32,,)(svundef_s32(), svundef_s32(), 4); + SVE_ACLE_FUNC(svqrdmulh_lane,_s32)(svundef_s32(), svundef_s32(), 4); // expected-error-re@+1 {{argument value {{[0-9]+}} is outside the valid range [0, 3]}} - SVE_ACLE_FUNC(svqdmullb_lane,_s64,,)(svundef_s32(), svundef_s32(), 4); + SVE_ACLE_FUNC(svqdmullb_lane,_s64)(svundef_s32(), svundef_s32(), 4); // expected-error-re@+1 {{argument value {{[0-9]+}} is outside the valid range [0, 3]}} - SVE_ACLE_FUNC(svqdmullt_lane,_s64,,)(svundef_s32(), svundef_s32(), -1); + SVE_ACLE_FUNC(svqdmullt_lane,_s64)(svundef_s32(), svundef_s32(), -1); // expected-error-re@+1 {{argument value {{[0-9]+}} is outside the valid range [0, 3]}} - SVE_ACLE_FUNC(svluti2_lane,_s8,,)(svundef_s8(), svundef_u8(), -1); + SVE_ACLE_FUNC(svluti2_lane,_s8)(svundef_s8(), svundef_u8(), -1); // expected-error-re@+1 {{argument value {{[0-9]+}} is outside the valid range [0, 3]}} - SVE_ACLE_FUNC(svluti2_lane,_u8,,)(svundef_u8(), svundef_u8(), -1); + SVE_ACLE_FUNC(svluti2_lane,_u8)(svundef_u8(), svundef_u8(), -1); // expected-error-re@+1 {{argument value {{[0-9]+}} is outside the valid range [0, 3]}} - SVE_ACLE_FUNC(svluti4_lane,_s16,,)(svundef_s16(), svundef_u8(), 
-1); + SVE_ACLE_FUNC(svluti4_lane,_s16)(svundef_s16(), svundef_u8(), -1); // expected-error-re@+1 {{argument value {{[0-9]+}} is outside the valid range [0, 3]}} - SVE_ACLE_FUNC(svluti4_lane,_u16,,)(svundef_u16(), svundef_u8(), -1); + SVE_ACLE_FUNC(svluti4_lane,_u16)(svundef_u16(), svundef_u8(), -1); // expected-error-re@+1 {{argument value {{[0-9]+}} is outside the valid range [0, 3]}} - SVE_ACLE_FUNC(svluti4_lane,_f16,,)(svundef_f16(), svundef_u8(), -1); + SVE_ACLE_FUNC(svluti4_lane,_f16)(svundef_f16(), svundef_u8(), -1); // expected-error-re@+1 {{argument value {{[0-9]+}} is outside the valid range [0, 3]}} - SVE_ACLE_FUNC(svluti4_lane,_bf16,,)(svundef_bf16(), svundef_u8(), -1); + SVE_ACLE_FUNC(svluti4_lane,_bf16)(svundef_bf16(), svundef_u8(), -1); // expected-error-re@+1 {{argument value {{[0-9]+}} is outside the valid range [0, 3]}} - SVE_ACLE_FUNC(svluti4_lane,_s16,_x2,)(svcreate2_s16(svundef_s16(),svundef_s16()), svundef_u8(), -1); + SVE_ACLE_FUNC(svluti4_lane,_s16_x2)(svcreate2_s16(svundef_s16(),svundef_s16()), svundef_u8(), -1); // expected-error-re@+1 {{argument value {{[0-9]+}} is outside the valid range [0, 3]}} - SVE_ACLE_FUNC(svluti4_lane,_u16,_x2,)(svcreate2_u16(svundef_u16(),svundef_u16()), svundef_u8(), -1); + SVE_ACLE_FUNC(svluti4_lane,_u16_x2)(svcreate2_u16(svundef_u16(),svundef_u16()), svundef_u8(), -1); // expected-error-re@+1 {{argument value {{[0-9]+}} is outside the valid range [0, 3]}} - SVE_ACLE_FUNC(svluti4_lane,_f16,_x2,)(svcreate2_f16(svundef_f16(),svundef_f16()), svundef_u8(), -1); + SVE_ACLE_FUNC(svluti4_lane,_f16_x2)(svcreate2_f16(svundef_f16(),svundef_f16()), svundef_u8(), -1); // expected-error-re@+1 {{argument value {{[0-9]+}} is outside the valid range [0, 3]}} - SVE_ACLE_FUNC(svluti4_lane,_bf16,_x2,)(svcreate2_bf16(svundef_bf16(),svundef_bf16()), svundef_u8(), -1); + SVE_ACLE_FUNC(svluti4_lane,_bf16_x2)(svcreate2_bf16(svundef_bf16(),svundef_bf16()), svundef_u8(), -1); } void test_range_0_1() { // expected-error-re@+1 {{argument 
value {{[0-9]+}} is outside the valid range [0, 1]}} - SVE_ACLE_FUNC(svcdot_lane,_s64,,)(svundef_s64(), svundef_s16(), svundef_s16(), -1, 0); + SVE_ACLE_FUNC(svcdot_lane,_s64)(svundef_s64(), svundef_s16(), svundef_s16(), -1, 0); // expected-error-re@+1 {{argument value {{[0-9]+}} is outside the valid range [0, 1]}} - SVE_ACLE_FUNC(svcmla_lane,_s32,,)(svundef_s32(), svundef_s32(), svundef_s32(), -1, 0); + SVE_ACLE_FUNC(svcmla_lane,_s32)(svundef_s32(), svundef_s32(), svundef_s32(), -1, 0); // expected-error-re@+1 {{argument value {{[0-9]+}} is outside the valid range [0, 1]}} - SVE_ACLE_FUNC(svcmla_lane,_s32,,)(svundef_s32(), svundef_s32(), svundef_s32(), 2, 0); + SVE_ACLE_FUNC(svcmla_lane,_s32)(svundef_s32(), svundef_s32(), svundef_s32(), 2, 0); // expected-error-re@+1 {{argument value {{[0-9]+}} is outside the valid range [0, 1]}} - SVE_ACLE_FUNC(svcmla_lane,_u32,,)(svundef_u32(), svundef_u32(), svundef_u32(), -1, 0); + SVE_ACLE_FUNC(svcmla_lane,_u32)(svundef_u32(), svundef_u32(), svundef_u32(), -1, 0); // expected-error-re@+1 {{argument value {{[0-9]+}} is outside the valid range [0, 1]}} - SVE_ACLE_FUNC(svmla_lane,_s64,,)(svundef_s64(), svundef_s64(), svundef_s64(), 2); + SVE_ACLE_FUNC(svmla_lane,_s64)(svundef_s64(), svundef_s64(), svundef_s64(), 2); // expected-error-re@+1 {{argument value {{[0-9]+}} is outside the valid range [0, 1]}} - SVE_ACLE_FUNC(svmla_lane,_u64,,)(svundef_u64(), svundef_u64(), svundef_u64(), -1); + SVE_ACLE_FUNC(svmla_lane,_u64)(svundef_u64(), svundef_u64(), svundef_u64(), -1); // expected-error-re@+1 {{argument value {{[0-9]+}} is outside the valid range [0, 1]}} - SVE_ACLE_FUNC(svmls_lane,_s64,,)(svundef_s64(), svundef_s64(), svundef_s64(), -1); + SVE_ACLE_FUNC(svmls_lane,_s64)(svundef_s64(), svundef_s64(), svundef_s64(), -1); // expected-error-re@+1 {{argument value {{[0-9]+}} is outside the valid range [0, 1]}} - SVE_ACLE_FUNC(svmls_lane,_u64,,)(svundef_u64(), svundef_u64(), svundef_u64(), 2); + 
SVE_ACLE_FUNC(svmls_lane,_u64)(svundef_u64(), svundef_u64(), svundef_u64(), 2); // expected-error-re@+1 {{argument value {{[0-9]+}} is outside the valid range [0, 1]}} - SVE_ACLE_FUNC(svmul_lane,_s64,,)(svundef_s64(), svundef_s64(), 2); + SVE_ACLE_FUNC(svmul_lane,_s64)(svundef_s64(), svundef_s64(), 2); // expected-error-re@+1 {{argument value {{[0-9]+}} is outside the valid range [0, 1]}} - SVE_ACLE_FUNC(svmul_lane,_u64,,)(svundef_u64(), svundef_u64(), -1); + SVE_ACLE_FUNC(svmul_lane,_u64)(svundef_u64(), svundef_u64(), -1); // expected-error-re@+1 {{argument value {{[0-9]+}} is outside the valid range [0, 1]}} - SVE_ACLE_FUNC(svqdmulh_lane,_s64,,)(svundef_s64(), svundef_s64(), 2); + SVE_ACLE_FUNC(svqdmulh_lane,_s64)(svundef_s64(), svundef_s64(), 2); // expected-error-re@+1 {{argument value {{[0-9]+}} is outside the valid range [0, 1]}} - SVE_ACLE_FUNC(svqrdcmlah_lane,_s32,,)(svundef_s32(), svundef_s32(), svundef_s32(), 2, 0); + SVE_ACLE_FUNC(svqrdcmlah_lane,_s32)(svundef_s32(), svundef_s32(), svundef_s32(), 2, 0); // expected-error-re@+1 {{argument value {{[0-9]+}} is outside the valid range [0, 1]}} - SVE_ACLE_FUNC(svqrdmlah_lane,_s64,,)(svundef_s64(), svundef_s64(), svundef_s64(), 2); + SVE_ACLE_FUNC(svqrdmlah_lane,_s64)(svundef_s64(), svundef_s64(), svundef_s64(), 2); // expected-error-re@+1 {{argument value {{[0-9]+}} is outside the valid range [0, 1]}} - SVE_ACLE_FUNC(svqrdmlsh_lane,_s64,,)(svundef_s64(), svundef_s64(), svundef_s64(), 2); + SVE_ACLE_FUNC(svqrdmlsh_lane,_s64)(svundef_s64(), svundef_s64(), svundef_s64(), 2); // expected-error-re@+1 {{argument value {{[0-9]+}} is outside the valid range [0, 1]}} - SVE_ACLE_FUNC(svqrdmulh_lane,_s64,,)(svundef_s64(), svundef_s64(), 2); + SVE_ACLE_FUNC(svqrdmulh_lane,_s64)(svundef_s64(), svundef_s64(), 2); // expected-error-re@+1 {{argument value {{[0-9]+}} is outside the valid range [0, 1]}} - SVE_ACLE_FUNC(svluti4_lane,_s8,,)(svundef_s8(), svundef_u8(), -1); + SVE_ACLE_FUNC(svluti4_lane,_s8)(svundef_s8(), 
svundef_u8(), -1); // expected-error-re@+1 {{argument value {{[0-9]+}} is outside the valid range [0, 1]}} - SVE_ACLE_FUNC(svluti4_lane,_u8,,)(svundef_u8(), svundef_u8(), -1); + SVE_ACLE_FUNC(svluti4_lane,_u8)(svundef_u8(), svundef_u8(), -1); } diff --git a/clang/test/Sema/attr-nonblocking-constraints.cpp b/clang/test/Sema/attr-nonblocking-constraints.cpp index bbc909f627f4c..b26a945843696 100644 --- a/clang/test/Sema/attr-nonblocking-constraints.cpp +++ b/clang/test/Sema/attr-nonblocking-constraints.cpp @@ -1,4 +1,4 @@ -// RUN: %clang_cc1 -fsyntax-only -fblocks -fcxx-exceptions -std=c++20 -verify -Wfunction-effects %s +// RUN: %clang_cc1 -fsyntax-only -fblocks -fcxx-exceptions -std=c++20 -verify -Wfunction-effects -Wno-vla-extension %s // These are in a separate file because errors (e.g. incompatible attributes) currently prevent // the FXAnalysis pass from running at all. @@ -246,6 +246,23 @@ void PTMFTester::convert() [[clang::nonblocking]] (this->*mConvertFunc)(); } +// Allow implicit conversion from array to pointer. +void nb14(unsigned idx) [[clang::nonblocking]] +{ + using FP = void (*)() [[clang::nonblocking]]; + using FPArray = FP[2]; + auto nb = +[]() [[clang::nonblocking]] {}; + + FPArray src{ nb, nullptr }; + FP f = src[idx]; // This should not generate a warning. 
+ + FP twoDim[2][2] = {}; + FP g = twoDim[1][1]; + + FP vla[idx]; + FP h = vla[0]; +} + // Block variables void nb17(void (^blk)() [[clang::nonblocking]]) [[clang::nonblocking]] { blk(); diff --git a/clang/test/SemaCXX/constructor.cpp b/clang/test/SemaCXX/constructor.cpp index abd7dbe18a0e6..b069d55118f56 100644 --- a/clang/test/SemaCXX/constructor.cpp +++ b/clang/test/SemaCXX/constructor.cpp @@ -96,3 +96,80 @@ namespace PR38286 { template struct C; // expected-note {{non-type declaration found}} template C::~C() {} // expected-error {{identifier 'C' after '~' in destructor name does not name a type}} } + +namespace GH121706 { + +struct A { + *&A(); // expected-error {{invalid constructor declaration}} +}; + +struct B { + *&&B(); // expected-error {{invalid constructor declaration}} +}; + +struct C { + *const C(); // expected-error {{invalid constructor declaration}} +}; + +struct D { + *const *D(); // expected-error {{invalid constructor declaration}} +}; + +struct E { + *E::*E(); // expected-error {{invalid constructor declaration}} +}; + +struct F { + *F::*const F(); // expected-error {{invalid constructor declaration}} +}; + +struct G { + ****G(); // expected-error {{invalid constructor declaration}} +}; + +struct H { + **H(const H &); // expected-error {{invalid constructor declaration}} +}; + +struct I { + *I(I &&); // expected-error {{invalid constructor declaration}} +}; + +struct J { + *&(J)(); // expected-error {{invalid constructor declaration}} +}; + +struct K { + **&&(K)(); // expected-error {{invalid constructor declaration}} +}; + +struct L { + *L(L&& other); // expected-error {{invalid constructor declaration}} +}; + +struct M { + *M(M& other); // expected-error {{invalid constructor declaration}} +}; + +struct N { + int N(); // expected-error {{constructor cannot have a return type}} +}; + +struct O { + static O(); // expected-error {{constructor cannot be declared 'static'}} +}; + +struct P { + explicit P(); +}; + +struct Q { + constexpr Q(); +}; 
+ +struct R { + R(); + friend R::R(); +}; + +} diff --git a/clang/test/SemaCXX/conversion-function.cpp b/clang/test/SemaCXX/conversion-function.cpp index 749e2fc1b452b..b653a3bf1a1d2 100644 --- a/clang/test/SemaCXX/conversion-function.cpp +++ b/clang/test/SemaCXX/conversion-function.cpp @@ -494,3 +494,10 @@ using Result = B::Lookup; using Result = int (A2::*)(); } #endif + +namespace GH121706 { +struct S { + *operator int(); // expected-error {{cannot specify any part of a return type in the declaration of a conversion function; put the complete type after 'operator'}} + **operator char(); // expected-error {{cannot specify any part of a return type in the declaration of a conversion function; put the complete type after 'operator'}} +}; +} diff --git a/clang/test/SemaCXX/cxx2c-pack-indexing.cpp b/clang/test/SemaCXX/cxx2c-pack-indexing.cpp index 58b642d2735b6..202a819655217 100644 --- a/clang/test/SemaCXX/cxx2c-pack-indexing.cpp +++ b/clang/test/SemaCXX/cxx2c-pack-indexing.cpp @@ -321,3 +321,26 @@ namespace GH121242 { (void)z; } } // namespace GH121242 + +namespace GH123033 { + template + requires __is_same_as(Types...[0], int) + void print(double d); + + template + requires __is_same_as(Types...[0], int) + void print(double d); + + template + Types...[0] convert(double d); + + template + Types...[0] convert(double d) { + return static_cast(d); + } + + void f() { + print(12.34); + convert(12.34); + } +} diff --git a/clang/test/SemaCXX/destructor.cpp b/clang/test/SemaCXX/destructor.cpp index dfcd1b033af5a..589616ef8e437 100644 --- a/clang/test/SemaCXX/destructor.cpp +++ b/clang/test/SemaCXX/destructor.cpp @@ -586,4 +586,50 @@ struct Y : X {} y1{ }; // expected-error {{call to implicitly-deleted default co // expected-note {{default constructor of 'Y' is implicitly deleted because base class 'X' has no destructor}} } +namespace GH121706 { +struct A { + *&~A(); // expected-error {{invalid destructor declaration}} +}; + +struct B { + *&&~B(); // expected-error 
{{invalid destructor declaration}} +}; + +struct C { + *const ~C(); // expected-error {{invalid destructor declaration}} +}; + +struct D { + *const * ~D(); // expected-error {{invalid destructor declaration}} +}; + +struct E { + *E::*~E(); // expected-error {{invalid destructor declaration}} +}; + +struct F { + *F::*const ~F(); // expected-error {{invalid destructor declaration}} +}; + +struct G { + ****~G(); // expected-error {{invalid destructor declaration}} +}; + +struct H { + **~H(); // expected-error {{invalid destructor declaration}} +}; + +struct I { + *~I(); // expected-error {{invalid destructor declaration}} +}; + +struct J { + *&~J(); // expected-error {{invalid destructor declaration}} +}; + +struct K { + **&&~K(); // expected-error {{invalid destructor declaration}} +}; +} + #endif // BE_THE_HEADER diff --git a/clang/test/SemaCXX/warn-unsafe-buffer-usage-array.cpp b/clang/test/SemaCXX/warn-unsafe-buffer-usage-array.cpp index e80b54b7c6967..7dd6c83dbba2a 100644 --- a/clang/test/SemaCXX/warn-unsafe-buffer-usage-array.cpp +++ b/clang/test/SemaCXX/warn-unsafe-buffer-usage-array.cpp @@ -92,35 +92,3 @@ char access_strings() { c = array_string[5]; return c; } - -struct T { - int array[10]; -}; - -const int index = 1; - -constexpr int get_const(int x) { - if(x < 3) - return ++x; - else - return x + 5; -}; - -void array_indexed_const_expr(unsigned idx) { - // expected-note@+2 {{change type of 'arr' to 'std::array' to label it for hardening}} - // expected-warning@+1{{'arr' is an unsafe buffer that does not perform bounds checks}} - int arr[10]; - arr[sizeof(int)] = 5; - - int array[sizeof(T)]; - array[sizeof(int)] = 5; - array[sizeof(T) -1 ] = 3; - - int k = arr[6 & 5]; - k = arr[2 << index]; - k = arr[8 << index]; // expected-note {{used in buffer access here}} - k = arr[16 >> 1]; - k = arr[get_const(index)]; - k = arr[get_const(5)]; // expected-note {{used in buffer access here}} - k = arr[get_const(4)]; -} diff --git 
a/clang/test/SemaHLSL/BuiltIns/WaveActiveSum-errors.hlsl b/clang/test/SemaHLSL/BuiltIns/WaveActiveSum-errors.hlsl new file mode 100644 index 0000000000000..406e8fc57ca95 --- /dev/null +++ b/clang/test/SemaHLSL/BuiltIns/WaveActiveSum-errors.hlsl @@ -0,0 +1,28 @@ +// RUN: %clang_cc1 -finclude-default-header -triple dxil-pc-shadermodel6.6-library %s -emit-llvm-only -disable-llvm-passes -verify + +int test_too_few_arg() { + return __builtin_hlsl_wave_active_sum(); + // expected-error@-1 {{too few arguments to function call, expected 1, have 0}} +} + +float2 test_too_many_arg(float2 p0) { + return __builtin_hlsl_wave_active_sum(p0, p0); + // expected-error@-1 {{too many arguments to function call, expected 1, have 2}} +} + +bool test_expr_bool_type_check(bool p0) { + return __builtin_hlsl_wave_active_sum(p0); + // expected-error@-1 {{invalid operand of type 'bool'}} +} + +bool2 test_expr_bool_vec_type_check(bool2 p0) { + return __builtin_hlsl_wave_active_sum(p0); + // expected-error@-1 {{invalid operand of type 'bool2' (aka 'vector')}} +} + +struct S { float f; }; + +S test_expr_struct_type_check(S p0) { + return __builtin_hlsl_wave_active_sum(p0); + // expected-error@-1 {{invalid operand of type 'S' where a scalar or vector is required}} +} diff --git a/clang/test/SemaHLSL/BuiltIns/asfloat-errors.hlsl b/clang/test/SemaHLSL/BuiltIns/asfloat-errors.hlsl index c2dd9e272e093..f5f223943b4cd 100644 --- a/clang/test/SemaHLSL/BuiltIns/asfloat-errors.hlsl +++ b/clang/test/SemaHLSL/BuiltIns/asfloat-errors.hlsl @@ -24,12 +24,3 @@ float test_float_half(half p1) { // expected-note@hlsl/hlsl_detail.h:* {{candidate template ignored: could not match 'vector' against 'half'}} // expected-note@hlsl/hlsl_detail.h:* {{candidate template ignored: substitution failure [with U = float, T = half]: no type named 'Type'}} } - - -float test_float_half(bool p1) { - return asfloat(p1); - // expected-error@hlsl/hlsl_intrinsics.h:* {{no matching function for call to 'bit_cast'}} - // 
expected-note@-2 {{in instantiation of function template specialization 'hlsl::asfloat'}} - // expected-note@hlsl/hlsl_detail.h:* {{candidate template ignored: could not match 'vector' against 'bool'}} - // expected-note@hlsl/hlsl_detail.h:* {{candidate template ignored: substitution failure [with U = float, T = bool]: no type named 'Type'}} -} diff --git a/clang/test/SemaHLSL/BuiltIns/firstbithigh-errors.hlsl b/clang/test/SemaHLSL/BuiltIns/firstbithigh-errors.hlsl index 1912ab3ae806b..b4024418dbba4 100644 --- a/clang/test/SemaHLSL/BuiltIns/firstbithigh-errors.hlsl +++ b/clang/test/SemaHLSL/BuiltIns/firstbithigh-errors.hlsl @@ -17,12 +17,10 @@ double test_int_builtin(double p0) { double2 test_int_builtin_2(double2 p0) { return __builtin_hlsl_elementwise_firstbithigh(p0); - // expected-error@-1 {{1st argument must be a vector of integers - // (was 'double2' (aka 'vector'))}} + // expected-error@-1 {{1st argument must be a vector of integers (was 'double2' (aka 'vector'))}} } float test_int_builtin_3(float p0) { return __builtin_hlsl_elementwise_firstbithigh(p0); - // expected-error@-1 {{1st argument must be a vector of integers - // (was 'float')}} + // expected-error@-1 {{1st argument must be a vector of integers (was 'double')}} } diff --git a/clang/test/SemaHLSL/BuiltIns/firstbitlow-errors.hlsl b/clang/test/SemaHLSL/BuiltIns/firstbitlow-errors.hlsl new file mode 100644 index 0000000000000..95c25e9e2fb60 --- /dev/null +++ b/clang/test/SemaHLSL/BuiltIns/firstbitlow-errors.hlsl @@ -0,0 +1,26 @@ +// RUN: %clang_cc1 -finclude-default-header -triple dxil-pc-shadermodel6.6-library %s -fnative-half-type -emit-llvm-only -disable-llvm-passes -verify -verify-ignore-unexpected + +int test_too_few_arg() { + return firstbitlow(); + // expected-error@-1 {{no matching function for call to 'firstbitlow'}} +} + +int test_too_many_arg(int p0) { + return firstbitlow(p0, p0); + // expected-error@-1 {{no matching function for call to 'firstbitlow'}} +} + +double 
test_int_builtin(double p0) { + return firstbitlow(p0); + // expected-error@-1 {{call to 'firstbitlow' is ambiguous}} +} + +double2 test_int_builtin_2(double2 p0) { + return __builtin_hlsl_elementwise_firstbitlow(p0); + // expected-error@-1 {{1st argument must be a vector of integers (was 'double2' (aka 'vector'))}} +} + +float test_int_builtin_3(float p0) { + return __builtin_hlsl_elementwise_firstbitlow(p0); + // expected-error@-1 {{1st argument must be a vector of integers (was 'double')}} +} diff --git a/clang/test/SemaHLSL/BuiltIns/lerp-errors.hlsl b/clang/test/SemaHLSL/BuiltIns/lerp-errors.hlsl index 56c8b32cc14e0..c77a07602b390 100644 --- a/clang/test/SemaHLSL/BuiltIns/lerp-errors.hlsl +++ b/clang/test/SemaHLSL/BuiltIns/lerp-errors.hlsl @@ -20,16 +20,38 @@ float2 test_lerp_no_second_arg(float2 p0) { // expected-error@-1 {{no matching function for call to 'lerp'}} } -float2 test_lerp_vector_size_mismatch(float3 p0, float2 p1) { +float2 test_lerp_vector_trunc_warn1(float3 p0) { + return lerp(p0, p0, p0); + // expected-warning@-1 {{implicit conversion truncates vector: 'float3' (aka 'vector') to 'vector' (vector of 2 'float' values)}} +} + +float2 test_lerp_vector_trunc_warn2(float3 p0, float2 p1) { return lerp(p0, p0, p1); // expected-warning@-1 {{implicit conversion truncates vector: 'float3' (aka 'vector') to 'vector' (vector of 2 'float' values)}} + // expected-warning@-2 {{implicit conversion truncates vector: 'float3' (aka 'vector') to 'vector' (vector of 2 'float' values)}} +} + +float2 test_lerp_vector_trunc_warn3(float3 p0, float2 p1) { + return lerp(p0, p1, p0); + // expected-warning@-1 {{implicit conversion truncates vector: 'float3' (aka 'vector') to 'vector' (vector of 2 'float' values)}} + // expected-warning@-2 {{implicit conversion truncates vector: 'float3' (aka 'vector') to 'vector' (vector of 2 'float' values)}} } -float2 test_lerp_builtin_vector_size_mismatch(float3 p0, float2 p1) { +float2 test_lerp_builtin_vector_size_mismatch_Arg1(float3 
p0, float2 p1) { return __builtin_hlsl_lerp(p0, p1, p1); // expected-error@-1 {{all arguments to '__builtin_hlsl_lerp' must have the same type}} } +float2 test_lerp_builtin_vector_size_mismatch_Arg2(float3 p0, float2 p1) { + return __builtin_hlsl_lerp(p1, p0, p1); + // expected-error@-1 {{all arguments to '__builtin_hlsl_lerp' must have the same type}} +} + +float2 test_lerp_builtin_vector_size_mismatch_Arg3(float3 p0, float2 p1) { + return __builtin_hlsl_lerp(p1, p1, p0); + // expected-error@-1 {{all arguments to '__builtin_hlsl_lerp' must have the same type}} +} + float test_lerp_scalar_mismatch(float p0, half p1) { return lerp(p1, p0, p1); // expected-error@-1 {{call to 'lerp' is ambiguous}} @@ -45,6 +67,16 @@ float2 test_builtin_lerp_float2_splat(float p0, float2 p1) { // expected-error@-1 {{all arguments to '__builtin_hlsl_lerp' must be vectors}} } +float2 test_builtin_lerp_float2_splat2(double p0, double2 p1) { + return __builtin_hlsl_lerp(p1, p0, p1); + // expected-error@-1 {{all arguments to '__builtin_hlsl_lerp' must be vectors}} +} + +float2 test_builtin_lerp_float2_splat3(double p0, double2 p1) { + return __builtin_hlsl_lerp(p1, p1, p0); + // expected-error@-1 {{all arguments to '__builtin_hlsl_lerp' must be vectors}} +} + float3 test_builtin_lerp_float3_splat(float p0, float3 p1) { return __builtin_hlsl_lerp(p0, p1, p1); // expected-error@-1 {{all arguments to '__builtin_hlsl_lerp' must be vectors}} diff --git a/clang/test/SemaSPIRV/BuiltIns/length-errors.c b/clang/test/SemaSPIRV/BuiltIns/length-errors.c new file mode 100644 index 0000000000000..3244bd6737f11 --- /dev/null +++ b/clang/test/SemaSPIRV/BuiltIns/length-errors.c @@ -0,0 +1,25 @@ +// RUN: %clang_cc1 %s -triple spirv-pc-vulkan-compute -verify + +typedef float float2 __attribute__((ext_vector_type(2))); + +void test_too_few_arg() +{ + return __builtin_spirv_length(); + // expected-error@-1 {{too few arguments to function call, expected 1, have 0}} +} + +void test_too_many_arg(float2 p0) +{ + 
return __builtin_spirv_length(p0, p0); + // expected-error@-1 {{too many arguments to function call, expected 1, have 2}} +} + +float test_double_scalar_inputs(double p0) { + return __builtin_spirv_length(p0); + // expected-error@-1 {{passing 'double' to parameter of incompatible type '__attribute__((__vector_size__(2 * sizeof(double)))) double' (vector of 2 'double' values)}} +} + +float test_int_scalar_inputs(int p0) { + return __builtin_spirv_length(p0); + // expected-error@-1 {{passing 'int' to parameter of incompatible type '__attribute__((__vector_size__(2 * sizeof(int)))) int' (vector of 2 'int' values)}} +} diff --git a/clang/test/SemaTemplate/deduction-guide.cpp b/clang/test/SemaTemplate/deduction-guide.cpp index d03c783313dd7..39250f0617f4b 100644 --- a/clang/test/SemaTemplate/deduction-guide.cpp +++ b/clang/test/SemaTemplate/deduction-guide.cpp @@ -478,3 +478,53 @@ A a{.f1 = {1}}; // CHECK-NEXT: `-DeclRefExpr {{.+}} 'int' NonTypeTemplateParm {{.+}} 'N' 'int' } // namespace GH83368 + +namespace GH122134 { + +template +concept Constraint = true; + +template struct Struct { + Struct(Constraint auto) {} +}; + +template using Test = Struct; + +Test test(42); + +// CHECK-LABEL: Dumping GH122134::: +// CHECK-NEXT: FunctionTemplateDecl {{.*}} implicit +// CHECK-NEXT: |-NonTypeTemplateParmDecl {{.*}} 'int' depth 0 index 0 N +// CHECK-NEXT: | `-TemplateArgument {{.*}} expr '0' +// CHECK-NEXT: | `-IntegerLiteral {{.*}} 'int' 0 +// CHECK-NEXT: |-TemplateTypeParmDecl {{.*}} Concept {{.*}} 'Constraint' depth 0 index 1 auto:1 +// CHECK-NEXT: | `-ConceptSpecializationExpr {{.*}} 'bool' Concept {{.*}} 'Constraint' +// CHECK-NEXT: | |-ImplicitConceptSpecializationDecl {{.*}} +// CHECK-NEXT: | | |-TemplateArgument type 'type-parameter-0-1' +// CHECK-NEXT: | | | `-TemplateTypeParmType {{.*}} 'type-parameter-0-1' dependent depth 0 index 1 +// CHECK-NEXT: | | `-TemplateArgument type 'int' +// CHECK-NEXT: | | `-BuiltinType {{.*}} 'int' +// CHECK-NEXT: | |-TemplateArgument 
{{.*}} type 'auto:1':'type-parameter-0-1' +// CHECK-NEXT: | | `-TemplateTypeParmType {{.*}} 'auto:1' dependent depth 0 index 1 +// CHECK-NEXT: | | `-TemplateTypeParm {{.*}} 'auto:1' +// CHECK-NEXT: | `-TemplateArgument {{.*}} type 'int' +// CHECK-NEXT: | `-BuiltinType {{.*}} 'int' +// CHECK-NEXT: |-TypeTraitExpr {{.*}} 'bool' __is_deducible +// CHECK-NEXT: | |-DeducedTemplateSpecializationType {{.*}} 'GH122134::Test' dependent +// CHECK-NEXT: | | `-name: 'GH122134::Test' +// CHECK-NEXT: | | `-TypeAliasTemplateDecl {{.*}} Test +// CHECK-NEXT: | `-TemplateSpecializationType {{.*}} 'Struct' dependent +// CHECK-NEXT: | |-name: 'Struct':'GH122134::Struct' qualified +// CHECK-NEXT: | | `-ClassTemplateDecl {{.*}} Struct +// CHECK-NEXT: | |-TemplateArgument type 'int' +// CHECK-NEXT: | | `-SubstTemplateTypeParmType {{.*}} 'int' sugar class depth 0 index 0 T +// CHECK-NEXT: | | |-FunctionTemplate {{.*}} '' +// CHECK-NEXT: | | `-BuiltinType {{.*}} 'int' +// CHECK-NEXT: | `-TemplateArgument expr 'N' +// CHECK-NEXT: | `-SubstNonTypeTemplateParmExpr {{.*}} 'int' +// CHECK-NEXT: | |-NonTypeTemplateParmDecl {{.*}} 'int' depth 0 index 1 +// CHECK-NEXT: | `-DeclRefExpr {{.*}} 'int' NonTypeTemplateParm {{.*}} 'N' 'int' +// CHECK-NEXT: |-CXXDeductionGuideDecl {{.*}} implicit 'auto (auto:1) -> Struct' +// CHECK-NEXT: | `-ParmVarDecl {{.*}} 'auto:1' + +} // namespace GH122134 diff --git a/clang/test/TableGen/select-enum-errors.td b/clang/test/TableGen/select-enum-errors.td new file mode 100644 index 0000000000000..c39c9a27c461f --- /dev/null +++ b/clang/test/TableGen/select-enum-errors.td @@ -0,0 +1,45 @@ +// RUN: not clang-tblgen --gen-clang-diags-enums -DERROR1 -I%S %s 2>&1 | FileCheck %s --check-prefixes=CHECK,C1 +// RUN: not clang-tblgen --gen-clang-diags-enums -DERROR2 -I%S %s 2>&1 | FileCheck %s --check-prefixes=CHECK,C2 +// RUN: not clang-tblgen --gen-clang-diags-enums -DERROR3 -I%S %s 2>&1 | FileCheck %s --check-prefixes=CHECK,C3 +// RUN: not clang-tblgen 
--gen-clang-diags-enums -DERROR4 -I%S %s 2>&1 | FileCheck %s --check-prefixes=CHECK,C4 +include "DiagnosticBase.inc" + +// No real reason to diagnose these, the namespace generated as the +// 'enumeration' name will never conflict with the enumerator. +def EnumerationEnumeratorDupe : Error<"%enum_select{%Matchy{haha}}0">; + +// Enumerator values aren't required, though this does seem kind of silly/not +// particularly useful? +def NoEnumerators : Error<"%enum_select{foo|bar|baz}0">; + +def DupeNames1 : Error<"%enum_select{}0">; +def DupeNames2 : Error<"%enum_select{}0">; +// CHECK: error: Duplicate enumeration name 'DupeName' +// CHECK-NEXT: def DupeNames2 +// CHECK: note: Previous diagnostic is here +// CHECK-NEXT: def DupeNames1 + +def DupeValue : Error<"%enum_select{%DName{V1}|%DName{V2}}0">; +// CHECK: error: Duplicate enumerator name 'DName' + +#ifdef ERROR1 +def EnumValNotExpected : Error<"%enum_select{V1|%Val2{V2}}0">; +// C1: expected '<' after enum_select +#endif + +#ifdef ERROR2 +def SelectWithArrow : Error<"%select{V1|%Val2{V2}}0">; +// C2: modifier '<' syntax not valid with %select +#endif + +#ifdef ERROR3 +// Missing closing > after the name of the enumeration +def MissingClosing : Error<"%enum_select; +// C3: expected '>' while parsing %enum_select +#endif + +#ifdef ERROR4 +// Missing { after the name of an enumerator +def MissingTextAfterEnumerator: Error<"%enum_select{%OtherName|foo}0">; +// C4: expected '{' while parsing %enum_select +#endif diff --git a/clang/test/TableGen/select-enum.td b/clang/test/TableGen/select-enum.td new file mode 100644 index 0000000000000..8a92acec62cfb --- /dev/null +++ b/clang/test/TableGen/select-enum.td @@ -0,0 +1,26 @@ +// RUN: clang-tblgen --gen-clang-diags-enums -I%S %s 2>&1 | FileCheck %s +include "DiagnosticBase.inc" + +def Diag : Error<"%enum_select{%Val1{V1}|%Val2{V2}|%Val3{V3}}0">; +// CHECK: DIAG_ENUM(EnumName) +// CHECK-NEXT: DIAG_ENUM_ITEM(0, Val1) +// CHECK-NEXT: DIAG_ENUM_ITEM(1, Val2) +// CHECK-NEXT: 
DIAG_ENUM_ITEM(2, Val3) +// CHECK-NEXT: DIAG_ENUM_END() + +// These are OK, we permit missing values since they might not be useful. +def Missing1 : Error<"%enum_select{V1|%Val2{V2}|%Val3{V3}}0">; +// CHECK: DIAG_ENUM(DupeEnumName1) +// CHECK-NEXT: DIAG_ENUM_ITEM(1, Val2) +// CHECK-NEXT: DIAG_ENUM_ITEM(2, Val3) +// CHECK-NEXT: DIAG_ENUM_END() +def Missing2 : Error<"%enum_select{%Val1{V1}|V2|%Val3{V3}}0">; +// CHECK: DIAG_ENUM(DupeEnumName2) +// CHECK-NEXT: DIAG_ENUM_ITEM(0, Val1) +// CHECK-NEXT: DIAG_ENUM_ITEM(2, Val3) +// CHECK-NEXT: DIAG_ENUM_END() +def Missing3 : Error<"%enum_select{%Val1{V1}|%Val2{V2}|V3}0">; +// CHECK: DIAG_ENUM(DupeEnumName3) +// CHECK-NEXT: DIAG_ENUM_ITEM(0, Val1) +// CHECK-NEXT: DIAG_ENUM_ITEM(1, Val2) +// CHECK-NEXT: DIAG_ENUM_END() diff --git a/clang/tools/diagtool/DiagnosticNames.cpp b/clang/tools/diagtool/DiagnosticNames.cpp index eb90f082437b3..c3a3002889c73 100644 --- a/clang/tools/diagtool/DiagnosticNames.cpp +++ b/clang/tools/diagtool/DiagnosticNames.cpp @@ -23,26 +23,13 @@ llvm::ArrayRef diagtool::getBuiltinDiagnosticsByName() { return llvm::ArrayRef(BuiltinDiagnosticsByName); } - // FIXME: Is it worth having two tables, especially when this one can get // out of sync easily? 
static const DiagnosticRecord BuiltinDiagnosticsByID[] = { #define DIAG(ENUM, CLASS, DEFAULT_MAPPING, DESC, GROUP, SFINAE, NOWERROR, \ SHOWINSYSHEADER, SHOWINSYSMACRO, DEFER, CATEGORY) \ {#ENUM, diag::ENUM, STR_SIZE(#ENUM, uint8_t)}, -#include "clang/Basic/DiagnosticCommonKinds.inc" -#include "clang/Basic/DiagnosticCrossTUKinds.inc" -#include "clang/Basic/DiagnosticDriverKinds.inc" -#include "clang/Basic/DiagnosticFrontendKinds.inc" -#include "clang/Basic/DiagnosticSerializationKinds.inc" -#include "clang/Basic/DiagnosticLexKinds.inc" -#include "clang/Basic/DiagnosticParseKinds.inc" -#include "clang/Basic/DiagnosticASTKinds.inc" -#include "clang/Basic/DiagnosticCommentKinds.inc" -#include "clang/Basic/DiagnosticSemaKinds.inc" -#include "clang/Basic/DiagnosticAnalysisKinds.inc" -#include "clang/Basic/DiagnosticRefactoringKinds.inc" -#include "clang/Basic/DiagnosticInstallAPIKinds.inc" +#include "clang/Basic/AllDiagnosticKinds.inc" #undef DIAG }; @@ -54,6 +41,13 @@ static bool orderByID(const DiagnosticRecord &Left, const DiagnosticRecord &diagtool::getDiagnosticForID(short DiagID) { DiagnosticRecord Key = {nullptr, DiagID, 0}; + // The requirement for lower_bound to produce a valid result it is + // enough if the BuiltinDiagnosticsByID is partitioned (by DiagID), + // but as we want this function to work for all possible values of + // DiagID sent in as argument it is better to right away check if + // BuiltinDiagnosticsByID is sorted. 
+ assert(llvm::is_sorted(BuiltinDiagnosticsByID, orderByID) && + "IDs in BuiltinDiagnosticsByID must be sorted."); const DiagnosticRecord *Result = llvm::lower_bound(BuiltinDiagnosticsByID, Key, orderByID); assert(Result && "diagnostic not found; table may be out of date"); diff --git a/clang/tools/scan-build/bin/scan-build b/clang/tools/scan-build/bin/scan-build index 37241c6d85c5b..b90e635d31757 100755 --- a/clang/tools/scan-build/bin/scan-build +++ b/clang/tools/scan-build/bin/scan-build @@ -820,7 +820,8 @@ ENDTEXT } # Emit the "View" link. - print OUT "View Report"; + my $EncodedReport = URLEscape($ReportFile); + print OUT "View Report"; # Emit REPORTBUG markers. print OUT "\n\n"; @@ -1465,6 +1466,16 @@ sub HtmlEscape { return $tmp; } +##----------------------------------------------------------------------------## +# URLEscape - encode characters that are special in URLs +##----------------------------------------------------------------------------## + +sub URLEscape { + my $arg = shift || ''; + $arg =~ s/\+/%2B/g; + return $arg; +} + ##----------------------------------------------------------------------------## # ShellEscape - backslash escape characters that are special to the shell ##----------------------------------------------------------------------------## diff --git a/clang/unittests/AST/ExternalASTSourceTest.cpp b/clang/unittests/AST/ExternalASTSourceTest.cpp index ad209604971f4..512f21e8efff4 100644 --- a/clang/unittests/AST/ExternalASTSourceTest.cpp +++ b/clang/unittests/AST/ExternalASTSourceTest.cpp @@ -67,9 +67,9 @@ TEST(ExternalASTSourceTest, FailedLookupOccursOnce) { struct TestSource : ExternalASTSource { TestSource(unsigned &Calls) : Calls(Calls) {} - bool FindExternalVisibleDeclsByName(const DeclContext *, - DeclarationName Name, - Module *NamedModule) override { + bool + FindExternalVisibleDeclsByName(const DeclContext *, DeclarationName Name, + const DeclContext *OriginalDC) override { if (Name.getAsString() == "j") ++Calls; return 
false; diff --git a/clang/unittests/Format/FormatTest.cpp b/clang/unittests/Format/FormatTest.cpp index 4d48bcacddead..f8d13cd0ce250 100644 --- a/clang/unittests/Format/FormatTest.cpp +++ b/clang/unittests/Format/FormatTest.cpp @@ -5732,23 +5732,12 @@ TEST_F(FormatTest, HashInMacroDefinition) { verifyFormat("#define A void # ## #", getLLVMStyleWithColumns(22)); -#if 0 - // FIXME: The correct format is: verifyFormat("{\n" " {\n" "#define GEN_ID(_x) char *_x{#_x}\n" " GEN_ID(one);\n" " }\n" "}"); -#endif - verifyFormat("{\n" - " {\n" - "#define GEN_ID(_x) \\\n" - " char *_x { #_x }\n" - " GEN_ID(one);\n" - " }\n" - "}", - getGoogleStyle()); } TEST_F(FormatTest, RespectWhitespaceInMacroDefinitions) { @@ -27987,6 +27976,11 @@ TEST_F(FormatTest, BreakBinaryOperations) { " operand1 + operand2 - (operand3 + operand4);", Style); + // Check operator>> special case. + verifyFormat("std::cin >> longOperand_1 >> longOperand_2 >>\n" + " longOperand_3_;", + Style); + Style.BreakBinaryOperations = FormatStyle::BBO_OnePerLine; // Logical operations @@ -28065,6 +28059,13 @@ TEST_F(FormatTest, BreakBinaryOperations) { " operand6->member;", Style); + // Check operator>> special case. + verifyFormat("std::cin >>\n" + " longOperand_1 >>\n" + " longOperand_2 >>\n" + " longOperand_3_;", + Style); + Style.BreakBinaryOperations = FormatStyle::BBO_RespectPrecedence; verifyFormat("result = op1 + op2 * op3 - op4;", Style); @@ -28090,6 +28091,13 @@ TEST_F(FormatTest, BreakBinaryOperations) { " byte_buffer[3] << 24;", Style); + // Check operator>> special case. + verifyFormat("std::cin >>\n" + " longOperand_1 >>\n" + " longOperand_2 >>\n" + " longOperand_3_;", + Style); + Style.BreakBinaryOperations = FormatStyle::BBO_OnePerLine; Style.BreakBeforeBinaryOperators = FormatStyle::BOS_NonAssignment; @@ -28164,6 +28172,13 @@ TEST_F(FormatTest, BreakBinaryOperations) { " << 24;", Style); + // Check operator>> special case. 
+ verifyFormat("std::cin\n" + " >> longOperand_1\n" + " >> longOperand_2\n" + " >> longOperand_3_;", + Style); + Style.BreakBinaryOperations = FormatStyle::BBO_RespectPrecedence; verifyFormat("result = op1 + op2 * op3 - op4;", Style); @@ -28188,6 +28203,13 @@ TEST_F(FormatTest, BreakBinaryOperations) { " | byte_buffer[2] << 16\n" " | byte_buffer[3] << 24;", Style); + + // Check operator>> special case. + verifyFormat("std::cin\n" + " >> longOperand_1\n" + " >> longOperand_2\n" + " >> longOperand_3_;", + Style); } TEST_F(FormatTest, RemoveEmptyLinesInUnwrappedLines) { diff --git a/clang/unittests/Format/TokenAnnotatorTest.cpp b/clang/unittests/Format/TokenAnnotatorTest.cpp index 399502db52cbf..9ac60ce73750b 100644 --- a/clang/unittests/Format/TokenAnnotatorTest.cpp +++ b/clang/unittests/Format/TokenAnnotatorTest.cpp @@ -3413,14 +3413,27 @@ TEST_F(TokenAnnotatorTest, BraceKind) { EXPECT_BRACE_KIND(Tokens[0], BK_Block); EXPECT_TOKEN(Tokens[1], tok::l_brace, TT_BlockLBrace); EXPECT_BRACE_KIND(Tokens[1], BK_Block); -#if 0 - // FIXME: EXPECT_BRACE_KIND(Tokens[11], BK_BracedInit); EXPECT_BRACE_KIND(Tokens[14], BK_BracedInit); -#endif EXPECT_BRACE_KIND(Tokens[20], BK_Block); EXPECT_BRACE_KIND(Tokens[21], BK_Block); + Tokens = annotate("{\n" + "#define FOO \\\n" + " { \\\n" + " case bar: { \\\n" + " break; \\\n" + " } \\\n" + " }\n" + "}"); + ASSERT_EQ(Tokens.size(), 15u) << Tokens; + EXPECT_TOKEN(Tokens[4], tok::l_brace, TT_BlockLBrace); + EXPECT_BRACE_KIND(Tokens[4], BK_Block); + EXPECT_TOKEN(Tokens[7], tok::colon, TT_CaseLabelColon); + EXPECT_BRACE_KIND(Tokens[8], BK_Block); + EXPECT_BRACE_KIND(Tokens[11], BK_Block); + EXPECT_BRACE_KIND(Tokens[12], BK_Block); + Tokens = annotate("a = class extends goog.a {};", getGoogleStyle(FormatStyle::LK_JavaScript)); ASSERT_EQ(Tokens.size(), 11u) << Tokens; diff --git a/clang/unittests/Lex/LexerTest.cpp b/clang/unittests/Lex/LexerTest.cpp index aead7fb899d0a..c897998cabe66 100644 --- a/clang/unittests/Lex/LexerTest.cpp +++ 
b/clang/unittests/Lex/LexerTest.cpp @@ -603,6 +603,7 @@ TEST_F(LexerTest, CharRangeOffByOne) { TEST_F(LexerTest, FindNextToken) { Lex("int abcd = 0;\n" + "// A comment.\n" "int xyz = abcd;\n"); std::vector GeneratedByNextToken; SourceLocation Loc = @@ -619,6 +620,26 @@ TEST_F(LexerTest, FindNextToken) { "xyz", "=", "abcd", ";")); } +TEST_F(LexerTest, FindNextTokenIncludingComments) { + Lex("int abcd = 0;\n" + "// A comment.\n" + "int xyz = abcd;\n"); + std::vector GeneratedByNextToken; + SourceLocation Loc = + SourceMgr.getLocForStartOfFile(SourceMgr.getMainFileID()); + while (true) { + auto T = Lexer::findNextToken(Loc, SourceMgr, LangOpts, true); + ASSERT_TRUE(T); + if (T->is(tok::eof)) + break; + GeneratedByNextToken.push_back(getSourceText(*T, *T)); + Loc = T->getLocation(); + } + EXPECT_THAT(GeneratedByNextToken, + ElementsAre("abcd", "=", "0", ";", "// A comment.", "int", "xyz", + "=", "abcd", ";")); +} + TEST_F(LexerTest, CreatedFIDCountForPredefinedBuffer) { TrivialModuleLoader ModLoader; auto PP = CreatePP("", ModLoader); diff --git a/clang/unittests/Sema/CMakeLists.txt b/clang/unittests/Sema/CMakeLists.txt index 7ded562e8edfa..17d39408000a4 100644 --- a/clang/unittests/Sema/CMakeLists.txt +++ b/clang/unittests/Sema/CMakeLists.txt @@ -6,6 +6,7 @@ set(LLVM_LINK_COMPONENTS add_clang_unittest(SemaTests ExternalSemaSourceTest.cpp CodeCompleteTest.cpp + HeuristicResolverTest.cpp GslOwnerPointerInference.cpp SemaLookupTest.cpp SemaNoloadLookupTest.cpp diff --git a/clang-tools-extra/clangd/unittests/HeuristicResolverTests.cpp b/clang/unittests/Sema/HeuristicResolverTest.cpp similarity index 99% rename from clang-tools-extra/clangd/unittests/HeuristicResolverTests.cpp rename to clang/unittests/Sema/HeuristicResolverTest.cpp index e4b3822fc7eb7..c4f054683ccdc 100644 --- a/clang-tools-extra/clangd/unittests/HeuristicResolverTests.cpp +++ b/clang/unittests/Sema/HeuristicResolverTest.cpp @@ -5,7 +5,7 @@ // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // 
//===----------------------------------------------------------------------===// -#include "HeuristicResolver.h" +#include "clang/Sema/HeuristicResolver.h" #include "clang/ASTMatchers/ASTMatchFinder.h" #include "clang/ASTMatchers/ASTMatchers.h" #include "clang/Tooling/Tooling.h" @@ -13,7 +13,6 @@ #include "gtest/gtest.h" using namespace clang::ast_matchers; -using clang::clangd::HeuristicResolver; using testing::ElementsAre; namespace clang { diff --git a/clang/utils/TableGen/ClangAttrEmitter.cpp b/clang/utils/TableGen/ClangAttrEmitter.cpp index cc6a8eaebd44e..41730eba32ce2 100644 --- a/clang/utils/TableGen/ClangAttrEmitter.cpp +++ b/clang/utils/TableGen/ClangAttrEmitter.cpp @@ -3043,6 +3043,10 @@ static void emitAttributes(const RecordKeeper &Records, raw_ostream &OS, << (R.getValueAsBit("InheritEvenIfAlreadyPresent") ? "true" : "false"); } + if (R.getValueAsBit("DeferDeserialization")) { + OS << ", " + << "/*DeferDeserialization=*/true"; + } OS << ")\n"; for (auto const &ai : Args) { diff --git a/clang/utils/TableGen/ClangDiagnosticsEmitter.cpp b/clang/utils/TableGen/ClangDiagnosticsEmitter.cpp index 72b3468dac486..fb00c640d6b14 100644 --- a/clang/utils/TableGen/ClangDiagnosticsEmitter.cpp +++ b/clang/utils/TableGen/ClangDiagnosticsEmitter.cpp @@ -399,6 +399,7 @@ enum PieceKind { TextPieceClass, PlaceholderPieceClass, SelectPieceClass, + EnumSelectPieceClass, PluralPieceClass, DiffPieceClass, SubstitutionPieceClass, @@ -408,6 +409,7 @@ enum ModifierType { MT_Unknown, MT_Placeholder, MT_Select, + MT_EnumSelect, MT_Sub, MT_Plural, MT_Diff, @@ -421,6 +423,7 @@ enum ModifierType { static StringRef getModifierName(ModifierType MT) { switch (MT) { + case MT_EnumSelect: case MT_Select: return "select"; case MT_Sub: @@ -512,10 +515,26 @@ struct SelectPiece : Piece { static bool classof(const Piece *P) { return P->getPieceClass() == SelectPieceClass || + P->getPieceClass() == EnumSelectPieceClass || P->getPieceClass() == PluralPieceClass; } }; +struct EnumSelectPiece : 
SelectPiece { + EnumSelectPiece() : SelectPiece(EnumSelectPieceClass, MT_EnumSelect) {} + + StringRef EnumName; + std::vector OptionEnumNames; + + static bool classof(const Piece *P) { + return P->getPieceClass() == EnumSelectPieceClass; + } +}; + +struct EnumValuePiece : Piece { + ModifierType Kind; +}; + struct PluralPiece : SelectPiece { PluralPiece() : SelectPiece(PluralPieceClass, MT_Plural) {} @@ -579,6 +598,9 @@ struct DiagnosticTextBuilder { std::vector buildForDocumentation(StringRef Role, const Record *R); std::string buildForDefinition(const Record *R); + llvm::SmallVector>>> + buildForEnum(const Record *R); Piece *getSubstitution(SubstitutionPiece *S) const { auto It = Substitutions.find(S->Name); @@ -707,6 +729,7 @@ template struct DiagTextVisitor { CASE(Text); CASE(Placeholder); CASE(Select); + CASE(EnumSelect); CASE(Plural); CASE(Diff); CASE(Substitution); @@ -886,6 +909,13 @@ struct DiagTextDocPrinter : DiagTextVisitor { makeRowSeparator(RST[I]); } + void VisitEnumSelect(EnumSelectPiece *P) { + // Document this as if it were a 'select', which properly prints all of the + // options correctly in a readable/reasonable manner. There isn't really + // anything valuable we could add to readers here. 
+ VisitSelect(P); + } + void VisitPlural(PluralPiece *P) { VisitSelect(P); } void VisitDiff(DiffPiece *P) { @@ -910,6 +940,47 @@ struct DiagTextDocPrinter : DiagTextVisitor { std::vector &RST; }; +struct DiagEnumPrinter : DiagTextVisitor { +public: + using BaseTy = DiagTextVisitor; + using EnumeratorItem = std::pair; + using EnumeratorList = llvm::SmallVector; + using ResultTy = llvm::SmallVector>; + + DiagEnumPrinter(DiagnosticTextBuilder &Builder, ResultTy &Result) + : BaseTy(Builder), Result(Result) {} + + ResultTy &Result; + + void VisitMulti(MultiPiece *P) { + for (auto *Child : P->Pieces) + Visit(Child); + } + void VisitText(TextPiece *P) {} + void VisitPlaceholder(PlaceholderPiece *P) {} + void VisitDiff(DiffPiece *P) {} + void VisitSelect(SelectPiece *P) { + for (auto *D : P->Options) + Visit(D); + } + void VisitPlural(PluralPiece *P) { VisitSelect(P); } + void VisitEnumSelect(EnumSelectPiece *P) { + assert(P->Options.size() == P->OptionEnumNames.size()); + + if (!P->EnumName.empty()) { + EnumeratorList List; + + for (const auto &Tup : llvm::enumerate(P->OptionEnumNames)) + if (!Tup.value().empty()) + List.emplace_back(Tup.index(), Tup.value()); + + Result.emplace_back(P->EnumName, List); + } + + VisitSelect(P); + } +}; + struct DiagTextPrinter : DiagTextVisitor { public: using BaseTy = DiagTextVisitor; @@ -929,7 +1000,7 @@ struct DiagTextPrinter : DiagTextVisitor { void VisitSelect(SelectPiece *P) { Result += "%"; Result += getModifierName(P->ModKind); - if (P->ModKind == MT_Select) { + if (P->ModKind == MT_Select || P->ModKind == MT_EnumSelect) { Result += "{"; for (auto *D : P->Options) { Visit(D); @@ -958,6 +1029,13 @@ struct DiagTextPrinter : DiagTextVisitor { addInt(mapIndex(P->Index)); } + void VisitEnumSelect(EnumSelectPiece *P) { + // Print as if we are a 'select', which will result in the compiler just + // treating this like a normal select. This way we don't have to do any + // special work for the compiler to consume these. 
+ VisitSelect(P); + } + void VisitDiff(DiffPiece *P) { Result += "%diff{"; Visit(P->Parts[0]); @@ -1019,11 +1097,12 @@ Piece *DiagnosticTextBuilder::DiagText::parseDiagText(StringRef &Text, Text = Text.drop_front(); // Extract the (optional) modifier. - size_t ModLength = Text.find_first_of("0123456789{"); + size_t ModLength = Text.find_first_of("0123456789<{"); StringRef Modifier = Text.slice(0, ModLength); Text = Text.slice(ModLength, StringRef::npos); ModifierType ModType = StringSwitch{Modifier} .Case("select", MT_Select) + .Case("enum_select", MT_EnumSelect) .Case("sub", MT_Sub) .Case("diff", MT_Diff) .Case("plural", MT_Plural) @@ -1042,6 +1121,10 @@ Piece *DiagnosticTextBuilder::DiagText::parseDiagText(StringRef &Text, Modifier); }; + if (ModType != MT_EnumSelect && Text[0] == '<') + Builder.PrintFatalError("modifier '<' syntax not valid with %" + + Modifier); + switch (ModType) { case MT_Unknown: Builder.PrintFatalError("Unknown modifier type: " + Modifier); @@ -1058,6 +1141,55 @@ Piece *DiagnosticTextBuilder::DiagText::parseDiagText(StringRef &Text, Parsed.push_back(Select); continue; } + case MT_EnumSelect: { + EnumSelectPiece *EnumSelect = New(); + if (Text[0] != '<') + Builder.PrintFatalError("expected '<' after " + Modifier); + + Text = Text.drop_front(); // Drop '<' + size_t EnumNameLen = Text.find_first_of('>'); + EnumSelect->EnumName = Text.slice(0, EnumNameLen); + Text = Text.slice(EnumNameLen, StringRef::npos); + ExpectAndConsume(">"); + + if (Text[0] != '{') + Builder.PrintFatalError("expected '{' after " + Modifier); + + do { + Text = Text.drop_front(); // '{' or '|' + + bool BracketsRequired = false; + if (Text[0] == '%') { + BracketsRequired = true; + Text = Text.drop_front(); // '%' + size_t OptionNameLen = Text.find_first_of("{"); + EnumSelect->OptionEnumNames.push_back(Text.slice(0, OptionNameLen)); + Text = Text.slice(OptionNameLen, StringRef::npos); + } else { + EnumSelect->OptionEnumNames.push_back({}); + } + + if (BracketsRequired) + 
ExpectAndConsume("{"); + else if (Text.front() == '{') { + Text = Text.drop_front(); + BracketsRequired = true; + } + + EnumSelect->Options.push_back( + parseDiagText(Text, StopAt::PipeOrCloseBrace)); + + if (BracketsRequired) + ExpectAndConsume("}"); + + assert(!Text.empty() && "malformed %select"); + } while (Text.front() == '|'); + + ExpectAndConsume("}"); + EnumSelect->Index = parseModifier(Text); + Parsed.push_back(EnumSelect); + continue; + } case MT_Plural: { PluralPiece *Plural = New(); do { @@ -1163,6 +1295,15 @@ DiagnosticTextBuilder::buildForDocumentation(StringRef Severity, return Result; } +DiagEnumPrinter::ResultTy DiagnosticTextBuilder::buildForEnum(const Record *R) { + EvaluatingRecordGuard Guard(&EvaluatingRecord, R); + StringRef Text = R->getValueAsString("Summary"); + DiagText D(*this, Text); + DiagEnumPrinter::ResultTy Result; + DiagEnumPrinter{*this, Result}.Visit(D.Root); + return Result; +} + std::string DiagnosticTextBuilder::buildForDefinition(const Record *R) { EvaluatingRecordGuard Guard(&EvaluatingRecord, R); StringRef Text = R->getValueAsString("Summary"); @@ -1377,6 +1518,56 @@ static void verifyDiagnosticWording(const Record &Diag) { // runs into odd situations like [[clang::warn_unused_result]], // #pragma clang, or --unwindlib=libgcc. } +/// ClangDiagsEnumsEmitter - The top-level class emits .def files containing +/// declarations of Clang diagnostic enums for selects. 
+void clang::EmitClangDiagsEnums(const RecordKeeper &Records, raw_ostream &OS, + const std::string &Component) { + DiagnosticTextBuilder DiagTextBuilder(Records); + ArrayRef Diags = + Records.getAllDerivedDefinitions("Diagnostic"); + + llvm::SmallVector> EnumerationNames; + + for (const Record &R : make_pointee_range(Diags)) { + DiagEnumPrinter::ResultTy Enums = DiagTextBuilder.buildForEnum(&R); + + for (auto &Enumeration : Enums) { + bool ShouldPrint = + Component.empty() || Component == R.getValueAsString("Component"); + + auto PreviousByName = llvm::find_if(EnumerationNames, [&](auto &Prev) { + return Prev.second == Enumeration.first; + }); + + if (PreviousByName != EnumerationNames.end()) { + PrintError(&R, + "Duplicate enumeration name '" + Enumeration.first + "'"); + PrintNote(PreviousByName->first->getLoc(), + "Previous diagnostic is here"); + } + + EnumerationNames.emplace_back(&R, Enumeration.first); + + if (ShouldPrint) + OS << "DIAG_ENUM(" << Enumeration.first << ")\n"; + + llvm::SmallVector EnumeratorNames; + for (auto &Enumerator : Enumeration.second) { + if (llvm::find(EnumeratorNames, Enumerator.second) != + EnumeratorNames.end()) + PrintError(&R, + "Duplicate enumerator name '" + Enumerator.second + "'"); + EnumeratorNames.push_back(Enumerator.second); + + if (ShouldPrint) + OS << "DIAG_ENUM_ITEM(" << Enumerator.first << ", " + << Enumerator.second << ")\n"; + } + if (ShouldPrint) + OS << "DIAG_ENUM_END()\n"; + } + } +} /// ClangDiagsDefsEmitter - The top-level class emits .def files containing /// declarations of Clang diagnostics. 
diff --git a/clang/utils/TableGen/TableGen.cpp b/clang/utils/TableGen/TableGen.cpp index 6e2bd0c9f819b..8b8eadbe7f7e5 100644 --- a/clang/utils/TableGen/TableGen.cpp +++ b/clang/utils/TableGen/TableGen.cpp @@ -48,6 +48,7 @@ enum ActionType { GenClangBasicWriter, GenClangBuiltins, GenClangDiagsDefs, + GenClangDiagsEnums, GenClangDiagGroups, GenClangDiagsIndexName, GenClangCommentNodes, @@ -173,6 +174,8 @@ cl::opt Action( "Generate clang builtins list"), clEnumValN(GenClangDiagsDefs, "gen-clang-diags-defs", "Generate Clang diagnostics definitions"), + clEnumValN(GenClangDiagsEnums, "gen-clang-diags-enums", + "Generate Clang diagnostic enums for selects"), clEnumValN(GenClangDiagGroups, "gen-clang-diag-groups", "Generate Clang diagnostic groups"), clEnumValN(GenClangDiagsIndexName, "gen-clang-diags-index-name", @@ -387,6 +390,9 @@ bool ClangTableGenMain(raw_ostream &OS, const RecordKeeper &Records) { case GenClangDiagsDefs: EmitClangDiagsDefs(Records, OS, ClangComponent); break; + case GenClangDiagsEnums: + EmitClangDiagsEnums(Records, OS, ClangComponent); + break; case GenClangDiagGroups: EmitClangDiagGroups(Records, OS); break; diff --git a/clang/utils/TableGen/TableGenBackends.h b/clang/utils/TableGen/TableGenBackends.h index f7527ac535a87..0448c94de08e3 100644 --- a/clang/utils/TableGen/TableGenBackends.h +++ b/clang/utils/TableGen/TableGenBackends.h @@ -89,6 +89,8 @@ void EmitClangBuiltins(const llvm::RecordKeeper &Records, void EmitClangDiagsDefs(const llvm::RecordKeeper &Records, llvm::raw_ostream &OS, const std::string &Component); +void EmitClangDiagsEnums(const llvm::RecordKeeper &Records, + llvm::raw_ostream &OS, const std::string &Component); void EmitClangDiagGroups(const llvm::RecordKeeper &Records, llvm::raw_ostream &OS); void EmitClangDiagsIndexName(const llvm::RecordKeeper &Records, diff --git a/compiler-rt/lib/rtsan/rtsan_interceptors_posix.cpp b/compiler-rt/lib/rtsan/rtsan_interceptors_posix.cpp index 6a5f4b91d11d7..e3f3d12d7e521 100644 --- 
a/compiler-rt/lib/rtsan/rtsan_interceptors_posix.cpp +++ b/compiler-rt/lib/rtsan/rtsan_interceptors_posix.cpp @@ -376,6 +376,95 @@ INTERCEPTOR(void, setbuffer, FILE *stream, char *buf, int size) { #define RTSAN_MAYBE_INTERCEPT_SETBUFFER #endif +#if SANITIZER_INTERCEPT_FSEEK +INTERCEPTOR(int, fgetpos, FILE *stream, fpos_t *pos) { + __rtsan_notify_intercepted_call("fgetpos"); + return REAL(fgetpos)(stream, pos); +} + +INTERCEPTOR(int, fseek, FILE *stream, long offset, int whence) { + __rtsan_notify_intercepted_call("fseek"); + return REAL(fseek)(stream, offset, whence); +} + +INTERCEPTOR(int, fseeko, FILE *stream, off_t offset, int whence) { + __rtsan_notify_intercepted_call("fseeko"); + return REAL(fseeko)(stream, offset, whence); +} + +INTERCEPTOR(int, fsetpos, FILE *stream, const fpos_t *pos) { + __rtsan_notify_intercepted_call("fsetpos"); + return REAL(fsetpos)(stream, pos); +} + +INTERCEPTOR(long, ftell, FILE *stream) { + __rtsan_notify_intercepted_call("ftell"); + return REAL(ftell)(stream); +} + +INTERCEPTOR(off_t, ftello, FILE *stream) { + __rtsan_notify_intercepted_call("ftello"); + return REAL(ftello)(stream); +} + +#if SANITIZER_LINUX && !SANITIZER_MUSL +INTERCEPTOR(int, fgetpos64, FILE *stream, fpos64_t *pos) { + __rtsan_notify_intercepted_call("fgetpos64"); + return REAL(fgetpos64)(stream, pos); +} + +INTERCEPTOR(int, fseeko64, FILE *stream, off64_t offset, int whence) { + __rtsan_notify_intercepted_call("fseeko64"); + return REAL(fseeko64)(stream, offset, whence); +} + +INTERCEPTOR(int, fsetpos64, FILE *stream, const fpos64_t *pos) { + __rtsan_notify_intercepted_call("fsetpos64"); + return REAL(fsetpos64)(stream, pos); +} + +INTERCEPTOR(off64_t, ftello64, FILE *stream) { + __rtsan_notify_intercepted_call("ftello64"); + return REAL(ftello64)(stream); +} +#endif + +INTERCEPTOR(void, rewind, FILE *stream) { + __rtsan_notify_intercepted_call("rewind"); + return REAL(rewind)(stream); +} +#define RTSAN_MAYBE_INTERCEPT_FGETPOS INTERCEPT_FUNCTION(fgetpos) 
+#define RTSAN_MAYBE_INTERCEPT_FSEEK INTERCEPT_FUNCTION(fseek) +#define RTSAN_MAYBE_INTERCEPT_FSEEKO INTERCEPT_FUNCTION(fseeko) +#define RTSAN_MAYBE_INTERCEPT_FSETPOS INTERCEPT_FUNCTION(fsetpos) +#define RTSAN_MAYBE_INTERCEPT_FTELL INTERCEPT_FUNCTION(ftell) +#define RTSAN_MAYBE_INTERCEPT_FTELLO INTERCEPT_FUNCTION(ftello) +#define RTSAN_MAYBE_INTERCEPT_REWIND INTERCEPT_FUNCTION(rewind) +#if SANITIZER_LINUX && !SANITIZER_MUSL +#define RTSAN_MAYBE_INTERCEPT_FGETPOS64 INTERCEPT_FUNCTION(fgetpos64) +#define RTSAN_MAYBE_INTERCEPT_FSEEKO64 INTERCEPT_FUNCTION(fseeko64) +#define RTSAN_MAYBE_INTERCEPT_FSETPOS64 INTERCEPT_FUNCTION(fsetpos64) +#define RTSAN_MAYBE_INTERCEPT_FTELLO64 INTERCEPT_FUNCTION(ftello64) +#else +#define RTSAN_MAYBE_INTERCEPT_FGETPOS64 +#define RTSAN_MAYBE_INTERCEPT_FSEEKO64 +#define RTSAN_MAYBE_INTERCEPT_FSETPOS64 +#define RTSAN_MAYBE_INTERCEPT_FTELLO64 +#endif +#else +#define RTSAN_MAYBE_INTERCEPT_FGETPOS +#define RTSAN_MAYBE_INTERCEPT_FSEEK +#define RTSAN_MAYBE_INTERCEPT_FSEEKO +#define RTSAN_MAYBE_INTERCEPT_FSETPOS +#define RTSAN_MAYBE_INTERCEPT_FTELL +#define RTSAN_MAYBE_INTERCEPT_FTELLO +#define RTSAN_MAYBE_INTERCEPT_REWIND +#define RTSAN_MAYBE_INTERCEPT_FGETPOS64 +#define RTSAN_MAYBE_INTERCEPT_FSEEKO64 +#define RTSAN_MAYBE_INTERCEPT_FSETPOS64 +#define RTSAN_MAYBE_INTERCEPT_FTELLO64 +#endif + INTERCEPTOR(int, puts, const char *s) { __rtsan_notify_intercepted_call("puts"); return REAL(puts)(s); @@ -1042,6 +1131,17 @@ void __rtsan::InitializeInterceptors() { RTSAN_MAYBE_INTERCEPT_SETVBUF; RTSAN_MAYBE_INTERCEPT_SETLINEBUF; RTSAN_MAYBE_INTERCEPT_SETBUFFER; + RTSAN_MAYBE_INTERCEPT_FGETPOS; + RTSAN_MAYBE_INTERCEPT_FSEEK; + RTSAN_MAYBE_INTERCEPT_FSEEKO; + RTSAN_MAYBE_INTERCEPT_FSETPOS; + RTSAN_MAYBE_INTERCEPT_FTELL; + RTSAN_MAYBE_INTERCEPT_FTELLO; + RTSAN_MAYBE_INTERCEPT_REWIND; + RTSAN_MAYBE_INTERCEPT_FGETPOS64; + RTSAN_MAYBE_INTERCEPT_FSEEKO64; + RTSAN_MAYBE_INTERCEPT_FSETPOS64; + RTSAN_MAYBE_INTERCEPT_FTELLO64; INTERCEPT_FUNCTION(lseek); 
RTSAN_MAYBE_INTERCEPT_LSEEK64; INTERCEPT_FUNCTION(dup); diff --git a/compiler-rt/lib/rtsan/tests/rtsan_test_interceptors_posix.cpp b/compiler-rt/lib/rtsan/tests/rtsan_test_interceptors_posix.cpp index 5488d3c7e2056..c26643c6a2d63 100644 --- a/compiler-rt/lib/rtsan/tests/rtsan_test_interceptors_posix.cpp +++ b/compiler-rt/lib/rtsan/tests/rtsan_test_interceptors_posix.cpp @@ -478,6 +478,81 @@ class RtsanOpenedFileTest : public RtsanFileTest { int fd = -1; }; +#if SANITIZER_INTERCEPT_FSEEK +TEST_F(RtsanOpenedFileTest, FgetposDieWhenRealtime) { + auto Func = [this]() { + fpos_t pos; + int ret = fgetpos(GetOpenFile(), &pos); + ASSERT_THAT(ret, Eq(0)); + }; + + ExpectRealtimeDeath(Func, MAYBE_APPEND_64("fgetpos")); + ExpectNonRealtimeSurvival(Func); +} + +TEST_F(RtsanOpenedFileTest, FsetposDieWhenRealtime) { + fpos_t pos; + int ret = fgetpos(GetOpenFile(), &pos); + ASSERT_THAT(ret, Eq(0)); + auto Func = [this, pos]() { + int ret = fsetpos(GetOpenFile(), &pos); + ASSERT_THAT(ret, Eq(0)); + }; + + ExpectRealtimeDeath(Func, MAYBE_APPEND_64("fsetpos")); + ExpectNonRealtimeSurvival(Func); +} + +TEST_F(RtsanOpenedFileTest, FseekDieWhenRealtime) { + auto Func = [this]() { + int ret = fseek(GetOpenFile(), 0, SEEK_CUR); + ASSERT_THAT(ret, Eq(0)); + }; + + ExpectRealtimeDeath(Func, "fseek"); + ExpectNonRealtimeSurvival(Func); +} + +TEST_F(RtsanOpenedFileTest, FseekoDieWhenRealtime) { + auto Func = [this]() { + int ret = fseeko(GetOpenFile(), 0, SEEK_CUR); + ASSERT_THAT(ret, Eq(0)); + }; + + ExpectRealtimeDeath(Func, MAYBE_APPEND_64("fseeko")); + ExpectNonRealtimeSurvival(Func); +} + +TEST_F(RtsanOpenedFileTest, FtellDieWhenRealtime) { + auto Func = [this]() { + long ret = ftell(GetOpenFile()); + ASSERT_THAT(ret, Eq(0)); + }; + + ExpectRealtimeDeath(Func, "ftell"); + ExpectNonRealtimeSurvival(Func); +} + +TEST_F(RtsanOpenedFileTest, FtelloDieWhenRealtime) { + auto Func = [this]() { + off_t ret = ftello(GetOpenFile()); + ASSERT_THAT(ret, Eq(0)); + }; + + ExpectRealtimeDeath(Func, 
MAYBE_APPEND_64("ftello")); + ExpectNonRealtimeSurvival(Func); +} + +TEST_F(RtsanOpenedFileTest, RewindDieWhenRealtime) { + int end = fseek(GetOpenFile(), 0, SEEK_END); + EXPECT_THAT(end, Eq(0)); + auto Func = [this]() { rewind(GetOpenFile()); }; + + ExpectRealtimeDeath(Func, "rewind"); + ExpectNonRealtimeSurvival(Func); +} +#endif + TEST(TestRtsanInterceptors, IoctlDiesWhenRealtime) { auto Func = []() { ioctl(0, FIONREAD); }; ExpectRealtimeDeath(Func, "ioctl"); diff --git a/compiler-rt/lib/sanitizer_common/symbolizer/scripts/build_symbolizer.sh b/compiler-rt/lib/sanitizer_common/symbolizer/scripts/build_symbolizer.sh index b4702339db59c..a7b78f885eea4 100755 --- a/compiler-rt/lib/sanitizer_common/symbolizer/scripts/build_symbolizer.sh +++ b/compiler-rt/lib/sanitizer_common/symbolizer/scripts/build_symbolizer.sh @@ -55,6 +55,7 @@ cd $BUILD_DIR ZLIB_BUILD=${BUILD_DIR}/zlib LIBCXX_BUILD=${BUILD_DIR}/libcxx +LIBCXX_INSTALL=${BUILD_DIR}/libcxx-install LLVM_BUILD=${BUILD_DIR}/llvm SYMBOLIZER_BUILD=${BUILD_DIR}/symbolizer @@ -87,11 +88,12 @@ make -j libz.a # Build and install libcxxabi and libcxx. if [[ ! -f ${LLVM_BUILD}/build.ninja ]]; then - rm -rf ${LIBCXX_BUILD} - mkdir -p ${LIBCXX_BUILD} + rm -rf "${LIBCXX_BUILD}" "${LIBCXX_INSTALL}" + mkdir -p ${LIBCXX_BUILD} ${LIBCXX_INSTALL} cd ${LIBCXX_BUILD} LIBCXX_FLAGS="${FLAGS} -Wno-macro-redefined" cmake -GNinja \ + -DCMAKE_INSTALL_PREFIX="${LIBCXX_INSTALL}" \ -DLLVM_ENABLE_RUNTIMES="libcxx;libcxxabi" \ -DCMAKE_BUILD_TYPE=Release \ -DCMAKE_C_COMPILER_WORKS=ON \ @@ -114,11 +116,11 @@ if [[ ! 
-f ${LLVM_BUILD}/build.ninja ]]; then $LLVM_SRC/../runtimes fi cd ${LIBCXX_BUILD} -ninja cxx cxxabi +ninja cxx cxxabi && ninja install-cxx install-cxxabi FLAGS="${FLAGS} -fno-rtti -fno-exceptions" LLVM_CFLAGS="${FLAGS} -Wno-global-constructors" -LLVM_CXXFLAGS="${LLVM_CFLAGS} -nostdinc++ -I${ZLIB_BUILD} -isystem ${LIBCXX_BUILD}/include -isystem ${LIBCXX_BUILD}/include/c++/v1" +LLVM_CXXFLAGS="${LLVM_CFLAGS} -nostdinc++ -I${ZLIB_BUILD} -isystem ${LIBCXX_INSTALL}/include -isystem ${LIBCXX_INSTALL}/include/c++/v1" # Build LLVM. if [[ ! -f ${LLVM_BUILD}/build.ninja ]]; then @@ -134,7 +136,7 @@ if [[ ! -f ${LLVM_BUILD}/build.ninja ]]; then -DLLVM_ENABLE_LIBCXX=ON \ -DCMAKE_C_FLAGS_RELEASE="${LLVM_CFLAGS}" \ -DCMAKE_CXX_FLAGS_RELEASE="${LLVM_CXXFLAGS}" \ - -DCMAKE_EXE_LINKER_FLAGS="$LINKFLAGS -stdlib=libc++ -L${LIBCXX_BUILD}/lib" \ + -DCMAKE_EXE_LINKER_FLAGS="$LINKFLAGS -stdlib=libc++ -L${LIBCXX_INSTALL}/lib" \ -DLLVM_TABLEGEN=$TBLGEN \ -DLLVM_INCLUDE_TESTS=OFF \ -DLLVM_ENABLE_ZLIB=ON \ @@ -163,7 +165,7 @@ SYMBOLIZER_API_LIST+=,__sanitizer_symbolize_demangle SYMBOLIZER_API_LIST+=,__sanitizer_symbolize_set_demangle SYMBOLIZER_API_LIST+=,__sanitizer_symbolize_set_inline_frames -LIBCXX_ARCHIVE_DIR=$(dirname $(find $LIBCXX_BUILD -name libc++.a | head -n1)) +LIBCXX_ARCHIVE_DIR=$(dirname $(find $LIBCXX_INSTALL -name libc++.a | head -n1)) # Merge all the object files together and copy the resulting library back. $LINK $LIBCXX_ARCHIVE_DIR/libc++.a \ diff --git a/compiler-rt/test/profile/ContinuousSyncMode/online-merging.c b/compiler-rt/test/profile/ContinuousSyncMode/online-merging.c index c1931410f8c76..b11a098b4d2b7 100644 --- a/compiler-rt/test/profile/ContinuousSyncMode/online-merging.c +++ b/compiler-rt/test/profile/ContinuousSyncMode/online-merging.c @@ -1,4 +1,5 @@ // REQUIRES: continuous-mode +// UNSUPPORTED: powerpc-{{.*}} // Test the online merging mode (%m) along with continuous mode (%c). 
// diff --git a/flang/cmake/modules/AddFlang.cmake b/flang/cmake/modules/AddFlang.cmake index 9ed1a3050b7e8..1f178772067ed 100644 --- a/flang/cmake/modules/AddFlang.cmake +++ b/flang/cmake/modules/AddFlang.cmake @@ -18,7 +18,7 @@ endmacro() function(add_flang_library name) set(options SHARED STATIC INSTALL_WITH_TOOLCHAIN) - set(multiValueArgs ADDITIONAL_HEADERS CLANG_LIBS) + set(multiValueArgs ADDITIONAL_HEADERS CLANG_LIBS MLIR_LIBS) cmake_parse_arguments(ARG "${options}" "" @@ -66,6 +66,9 @@ function(add_flang_library name) llvm_add_library(${name} ${LIBTYPE} ${ARG_UNPARSED_ARGUMENTS} ${srcs}) clang_target_link_libraries(${name} PRIVATE ${ARG_CLANG_LIBS}) + if (ARG_MLIR_LIBS) + mlir_target_link_libraries(${name} PRIVATE ${ARG_MLIR_LIBS}) + endif() if (TARGET ${name}) diff --git a/flang/docs/OpenACC-descriptor-management.md b/flang/docs/OpenACC-descriptor-management.md index 0b5103000d8e7..52d00ae4daef8 100644 --- a/flang/docs/OpenACC-descriptor-management.md +++ b/flang/docs/OpenACC-descriptor-management.md @@ -348,7 +348,7 @@ acc.attach.recipe @attach_ref : %offset : index, %size : index): fir.call _FortranAOpenACCAttachDescriptor(%aug_ptr, %base_addr_val, %offset, %size) : - (!fir.ref, !fir.ref>, index, index) -> none + (!fir.ref, !fir.ref>, index, index) -> () acc.yield } diff --git a/flang/docs/ParameterizedDerivedTypes.md b/flang/docs/ParameterizedDerivedTypes.md index 851775b123b43..0ed9f8b494e78 100644 --- a/flang/docs/ParameterizedDerivedTypes.md +++ b/flang/docs/ParameterizedDerivedTypes.md @@ -435,16 +435,16 @@ allocate(t1(2)::p) **FIR** ``` // For allocatable -%5 = fir.call @_FortranAAllocatableInitDerived(%desc, %type) : (!fir.box, ) -> () +fir.call @_FortranAAllocatableInitDerived(%desc, %type) : (!fir.box, ) -> () // The AllocatableSetDerivedLength functions is called for each length type parameters. 
-%6 = fir.call @_FortranAAllocatableSetDerivedLength(%desc, %pos, %value) : (!fir.box, i32, i64) -> () -%7 = fir.call @_FortranAAllocatableAllocate(%3) : (!fir.box) -> () +fir.call @_FortranAAllocatableSetDerivedLength(%desc, %pos, %value) : (!fir.box, i32, i64) -> () +fir.call @_FortranAAllocatableAllocate(%3) : (!fir.box) -> () // For pointer -%5 = fir.call @_FortranAPointerNullifyDerived(%desc, %type) : (!fir.box, ) -> () +fir.call @_FortranAPointerNullifyDerived(%desc, %type) : (!fir.box, ) -> () // The PointerSetDerivedLength functions is called for each length type parameters. -%6 = fir.call @_FortranAPointerSetDerivedLength(%desc, %pos, %value) : (!fir.box, i32, i64) -> () -%7 = fir.call @_FortranAPointerAllocate(%3) : (!fir.box) -> () +fir.call @_FortranAPointerSetDerivedLength(%desc, %pos, %value) : (!fir.box, i32, i64) -> () +fir.call @_FortranAPointerAllocate(%3) : (!fir.box) -> () ``` `DEALLOCATE` @@ -478,7 +478,7 @@ NULLIFY(p) **FIR** ``` -%0 = fir.call @_FortranAPointerNullifyDerived(%desc, %type) : (!fir.box, !fir.tdesc) -> () +fir.call @_FortranAPointerNullifyDerived(%desc, %type) : (!fir.box, !fir.tdesc) -> () ``` #### Formatted I/O @@ -518,7 +518,7 @@ func.func @_QMpdtPprint_pdt() { %c8_i32 = arith.constant 8 : i32 %3 = fir.convert %1 : (!fir.box}>>) -> !fir.box %4 = fir.convert %2 : (!fir.ref>) -> !fir.ref - %5 = fir.call @_FortranAInitialize(%3, %4, %c8_i32) : (!fir.box, !fir.ref, i32) -> none + fir.call @_FortranAInitialize(%3, %4, %c8_i32) : (!fir.box, !fir.ref, i32) -> () %c-1_i32 = arith.constant -1 : i32 %6 = fir.address_of(@_QQcl.2E2F6669725F7064745F6578616D706C652E66393000) : !fir.ref> %7 = fir.convert %6 : (!fir.ref>) -> !fir.ref @@ -882,7 +882,7 @@ func.func @_QMpdt_initPlocal() { %c8_i32 = arith.constant 8 : i32 %3 = fir.convert %1 : (!fir.box}>>) -> !fir.box %4 = fir.convert %2 : (!fir.ref>) -> !fir.ref - %5 = fir.call @_FortranAInitialize(%3, %4, %c8_i32) : (!fir.box, !fir.ref, i32) -> none + fir.call @_FortranAInitialize(%3, %4, 
%c8_i32) : (!fir.box, !fir.ref, i32) -> () return } ``` diff --git a/flang/docs/PolymorphicEntities.md b/flang/docs/PolymorphicEntities.md index befcc53127a4a..6583068508584 100644 --- a/flang/docs/PolymorphicEntities.md +++ b/flang/docs/PolymorphicEntities.md @@ -609,7 +609,7 @@ finalization with a call the the `@_FortranADestroy` function **FIR** ``` -%5 = fir.call @_FortranADestroy(%desc) : (!fir.box) -> none +fir.call @_FortranADestroy(%desc) : (!fir.box) -> () ``` The `@_FortranADestroy` function will take care to call the final subroutines diff --git a/flang/include/flang/Frontend/CodeGenOptions.def b/flang/include/flang/Frontend/CodeGenOptions.def index 9d03ec88a56b8..deb8d1aede518 100644 --- a/flang/include/flang/Frontend/CodeGenOptions.def +++ b/flang/include/flang/Frontend/CodeGenOptions.def @@ -32,6 +32,7 @@ CODEGENOPT(PrepareForThinLTO , 1, 0) ///< Set when -flto=thin is enabled on the ///< compile step. CODEGENOPT(StackArrays, 1, 0) ///< -fstack-arrays (enable the stack-arrays pass) CODEGENOPT(LoopVersioning, 1, 0) ///< Enable loop versioning. 
+CODEGENOPT(UnrollLoops, 1, 0) ///< Enable loop unrolling CODEGENOPT(AliasAnalysis, 1, 0) ///< Enable alias analysis pass CODEGENOPT(Underscoring, 1, 1) diff --git a/flang/include/flang/Lower/AbstractConverter.h b/flang/include/flang/Lower/AbstractConverter.h index 607aff41f6459..c24f43737df50 100644 --- a/flang/include/flang/Lower/AbstractConverter.h +++ b/flang/include/flang/Lower/AbstractConverter.h @@ -130,9 +130,18 @@ class AbstractConverter { virtual void createHostAssociateVarCloneDealloc(const Fortran::semantics::Symbol &sym) = 0; - virtual void copyHostAssociateVar( - const Fortran::semantics::Symbol &sym, - mlir::OpBuilder::InsertPoint *copyAssignIP = nullptr) = 0; + /// For a host-associated symbol (a symbol associated with another symbol from + /// an enclosing scope), either: + /// + /// * if \p hostIsSource == true: copy \p sym's value *from* its corresponding + /// host symbol, + /// + /// * if \p hostIsSource == false: copy \p sym's value *to* its corresponding + /// host symbol. 
+ virtual void + copyHostAssociateVar(const Fortran::semantics::Symbol &sym, + mlir::OpBuilder::InsertPoint *copyAssignIP = nullptr, + bool hostIsSource = true) = 0; virtual void copyVar(mlir::Location loc, mlir::Value dst, mlir::Value src, fir::FortranVariableFlagsEnum attrs) = 0; diff --git a/flang/include/flang/Optimizer/Builder/FIRBuilder.h b/flang/include/flang/Optimizer/Builder/FIRBuilder.h index c5d86e713f253..ea658fb16a36c 100644 --- a/flang/include/flang/Optimizer/Builder/FIRBuilder.h +++ b/flang/include/flang/Optimizer/Builder/FIRBuilder.h @@ -804,6 +804,15 @@ elideLengthsAlreadyInType(mlir::Type type, mlir::ValueRange lenParams); /// Get the address space which should be used for allocas uint64_t getAllocaAddressSpace(mlir::DataLayout *dataLayout); +/// The two vectors of MLIR values have the following property: +/// \p extents1[i] must have the same value as \p extents2[i] +/// The function returns a new vector of MLIR values that preserves +/// the same property vs \p extents1 and \p extents2, but allows +/// more optimizations. For example, if extents1[j] is a known constant, +/// and extents2[j] is not, then result[j] is the MLIR value extents1[j]. +llvm::SmallVector deduceOptimalExtents(mlir::ValueRange extents1, + mlir::ValueRange extents2); + } // namespace fir::factory #endif // FORTRAN_OPTIMIZER_BUILDER_FIRBUILDER_H diff --git a/flang/include/flang/Optimizer/Builder/HLFIRTools.h b/flang/include/flang/Optimizer/Builder/HLFIRTools.h index c8aad644bc784..0684ad0f926ec 100644 --- a/flang/include/flang/Optimizer/Builder/HLFIRTools.h +++ b/flang/include/flang/Optimizer/Builder/HLFIRTools.h @@ -508,6 +508,17 @@ genTypeAndKindConvert(mlir::Location loc, fir::FirOpBuilder &builder, hlfir::Entity source, mlir::Type toType, bool preserveLowerBounds); +/// A shortcut for loadTrivialScalar(getElementAt()), +/// which designates and loads an element of an array. 
+Entity loadElementAt(mlir::Location loc, fir::FirOpBuilder &builder, + Entity entity, mlir::ValueRange oneBasedIndices); + +/// Return a vector of extents for the given entity. +/// The function creates new operations, but tries to clean-up +/// after itself. +llvm::SmallVector +genExtentsVector(mlir::Location loc, fir::FirOpBuilder &builder, Entity entity); + } // namespace hlfir #endif // FORTRAN_OPTIMIZER_BUILDER_HLFIRTOOLS_H diff --git a/flang/include/flang/Optimizer/Builder/Runtime/RTBuilder.h b/flang/include/flang/Optimizer/Builder/Runtime/RTBuilder.h index 09b49b95fefe5..eaa1de76154d9 100644 --- a/flang/include/flang/Optimizer/Builder/Runtime/RTBuilder.h +++ b/flang/include/flang/Optimizer/Builder/Runtime/RTBuilder.h @@ -320,7 +320,7 @@ constexpr TypeBuilderFunc getModel() { template <> constexpr TypeBuilderFunc getModel() { return [](mlir::MLIRContext *context) -> mlir::Type { - return mlir::FloatType::getF64(context); + return mlir::Float64Type::get(context); }; } template <> @@ -347,11 +347,11 @@ constexpr TypeBuilderFunc getModel() { static_assert(size == 16 || size == 10 || size == 8, "unsupported long double size"); if constexpr (size == 16) - return mlir::FloatType::getF128(context); + return mlir::Float128Type::get(context); if constexpr (size == 10) - return mlir::FloatType::getF80(context); + return mlir::Float80Type::get(context); if constexpr (size == 8) - return mlir::FloatType::getF64(context); + return mlir::Float64Type::get(context); llvm_unreachable("failed static assert"); }; } @@ -369,7 +369,7 @@ constexpr TypeBuilderFunc getModel() { template <> constexpr TypeBuilderFunc getModel() { return [](mlir::MLIRContext *context) -> mlir::Type { - return mlir::FloatType::getF32(context); + return mlir::Float32Type::get(context); }; } template <> @@ -674,6 +674,8 @@ struct RuntimeTableKey { llvm::SmallVector argTys; for (auto f : args) argTys.push_back(f(ctxt)); + if (mlir::isa(retTy)) + return mlir::FunctionType::get(ctxt, argTys, {}); return 
mlir::FunctionType::get(ctxt, argTys, {retTy}); }; } diff --git a/flang/include/flang/Optimizer/Dialect/FIRType.h b/flang/include/flang/Optimizer/Dialect/FIRType.h index ac638d98980d1..e19fcde8d0e64 100644 --- a/flang/include/flang/Optimizer/Dialect/FIRType.h +++ b/flang/include/flang/Optimizer/Dialect/FIRType.h @@ -139,6 +139,13 @@ inline bool isa_builtin_cptr_type(mlir::Type t) { return false; } +// Is `t` type(c_devptr)? +inline bool isa_builtin_c_devptr_type(mlir::Type t) { + if (auto recTy = mlir::dyn_cast_or_null(t)) + return recTy.getName().ends_with("T__builtin_c_devptr"); + return false; +} + /// Is `t` type(c_devptr)? inline bool isa_builtin_cdevptr_type(mlir::Type t) { if (auto recTy = mlir::dyn_cast_or_null(t)) @@ -439,7 +446,7 @@ inline mlir::Type wrapInClassOrBoxType(mlir::Type eleTy, /// Return the elementType where intrinsic types are replaced with none for /// unlimited polymorphic entities. /// -/// i32 -> none +/// i32 -> () /// !fir.array<2xf32> -> !fir.array<2xnone> /// !fir.heap> -> !fir.heap> inline mlir::Type updateTypeForUnlimitedPolymorphic(mlir::Type ty) { diff --git a/flang/include/flang/Optimizer/HLFIR/Passes.td b/flang/include/flang/Optimizer/HLFIR/Passes.td index 644f1e3c3af2b..90cf6e74241bd 100644 --- a/flang/include/flang/Optimizer/HLFIR/Passes.td +++ b/flang/include/flang/Optimizer/HLFIR/Passes.td @@ -43,6 +43,17 @@ def LowerHLFIROrderedAssignments : Pass<"lower-hlfir-ordered-assignments", "::ml def SimplifyHLFIRIntrinsics : Pass<"simplify-hlfir-intrinsics"> { let summary = "Simplify HLFIR intrinsic operations that don't need to result in runtime calls"; + let options = [Option<"allowNewSideEffects", "allow-new-side-effects", "bool", + /*default=*/"false", + "If enabled, then the HLFIR operations simplification " + "may introduce operations with side effects. " + "For example, hlfir.matmul may be inlined as " + "and hlfir.eval_in_mem with hlfir.assign inside it." 
+ "The hlfir.assign has a write effect on the memory " + "argument of hlfir.eval_in_mem, which may block " + "some existing MLIR transformations (e.g. CSE) " + "that otherwise would have been possible across " + "the hlfir.matmul.">]; } def InlineElementals : Pass<"inline-elementals"> { diff --git a/flang/lib/Common/CMakeLists.txt b/flang/lib/Common/CMakeLists.txt index de6bea396f3cb..4b5df0a49f403 100644 --- a/flang/lib/Common/CMakeLists.txt +++ b/flang/lib/Common/CMakeLists.txt @@ -47,6 +47,6 @@ add_flang_library(FortranCommon LINK_COMPONENTS Support - LINK_LIBS + MLIR_LIBS MLIRIR ) diff --git a/flang/lib/Frontend/CMakeLists.txt b/flang/lib/Frontend/CMakeLists.txt index 0a0482505b747..d063ed36d00b4 100644 --- a/flang/lib/Frontend/CMakeLists.txt +++ b/flang/lib/Frontend/CMakeLists.txt @@ -41,13 +41,6 @@ add_flang_library(flangFrontend flangPasses FIROpenACCSupport FlangOpenMPTransforms - MLIRTransforms - MLIRBuiltinToLLVMIRTranslation - MLIRLLVMToLLVMIRTranslation - MLIRSCFToControlFlow - MLIRTargetLLVMIRImport - ${dialect_libs} - ${extension_libs} LINK_COMPONENTS Passes @@ -63,6 +56,15 @@ add_flang_library(flangFrontend FrontendOpenACC FrontendOpenMP + MLIR_LIBS + MLIRTransforms + MLIRBuiltinToLLVMIRTranslation + MLIRLLVMToLLVMIRTranslation + MLIRSCFToControlFlow + MLIRTargetLLVMIRImport + ${dialect_libs} + ${extension_libs} + CLANG_LIBS clangBasic clangDriver diff --git a/flang/lib/Frontend/CompilerInvocation.cpp b/flang/lib/Frontend/CompilerInvocation.cpp index 5e7127313c133..15b1e1e0a2488 100644 --- a/flang/lib/Frontend/CompilerInvocation.cpp +++ b/flang/lib/Frontend/CompilerInvocation.cpp @@ -246,6 +246,10 @@ static void parseCodeGenArgs(Fortran::frontend::CodeGenOptions &opts, clang::driver::options::OPT_fno_loop_versioning, false)) opts.LoopVersioning = 1; + opts.UnrollLoops = args.hasFlag(clang::driver::options::OPT_funroll_loops, + clang::driver::options::OPT_fno_unroll_loops, + (opts.OptimizationLevel > 1)); + opts.AliasAnalysis = opts.OptimizationLevel 
> 0; // -mframe-pointer=none/non-leaf/all option. diff --git a/flang/lib/Frontend/FrontendActions.cpp b/flang/lib/Frontend/FrontendActions.cpp index 52a18d59c7cda..b0545a7ac2f99 100644 --- a/flang/lib/Frontend/FrontendActions.cpp +++ b/flang/lib/Frontend/FrontendActions.cpp @@ -1028,6 +1028,8 @@ void CodeGenAction::runOptimizationPipeline(llvm::raw_pwrite_stream &os) { si.registerCallbacks(pic, &mam); if (ci.isTimingEnabled()) si.getTimePasses().setOutStream(ci.getTimingStreamLLVM()); + pto.LoopUnrolling = opts.UnrollLoops; + pto.LoopInterleaving = opts.UnrollLoops; llvm::PassBuilder pb(targetMachine, pto, pgoOpt, &pic); // Attempt to load pass plugins and register their callbacks with PB. diff --git a/flang/lib/FrontendTool/CMakeLists.txt b/flang/lib/FrontendTool/CMakeLists.txt index 2acaffbe50380..faf56e9d955a1 100644 --- a/flang/lib/FrontendTool/CMakeLists.txt +++ b/flang/lib/FrontendTool/CMakeLists.txt @@ -8,12 +8,14 @@ add_flang_library(flangFrontendTool LINK_LIBS flangFrontend - MLIRPass LINK_COMPONENTS Option Support + MLIR_LIBS + MLIRPass + CLANG_LIBS clangBasic clangDriver diff --git a/flang/lib/Lower/Bridge.cpp b/flang/lib/Lower/Bridge.cpp index 37f51d74d23f8..700ca56141a32 100644 --- a/flang/lib/Lower/Bridge.cpp +++ b/flang/lib/Lower/Bridge.cpp @@ -891,9 +891,10 @@ class FirConverter : public Fortran::lower::AbstractConverter { isPointer, Fortran::semantics::Symbol::Flags()); } - void copyHostAssociateVar( - const Fortran::semantics::Symbol &sym, - mlir::OpBuilder::InsertPoint *copyAssignIP = nullptr) override final { + void + copyHostAssociateVar(const Fortran::semantics::Symbol &sym, + mlir::OpBuilder::InsertPoint *copyAssignIP = nullptr, + bool hostIsSource = true) override final { // 1) Fetch the original copy of the variable. assert(sym.has() && "No host-association found"); @@ -908,16 +909,14 @@ class FirConverter : public Fortran::lower::AbstractConverter { "Host and associated symbol boxes are the same"); // 3) Perform the assignment. 
- mlir::OpBuilder::InsertPoint insPt = builder->saveInsertionPoint(); + mlir::OpBuilder::InsertionGuard guard(*builder); if (copyAssignIP && copyAssignIP->isSet()) builder->restoreInsertionPoint(*copyAssignIP); else builder->setInsertionPointAfter(sb.getAddr().getDefiningOp()); Fortran::lower::SymbolBox *lhs_sb, *rhs_sb; - if (copyAssignIP && copyAssignIP->isSet() && - sym.test(Fortran::semantics::Symbol::Flag::OmpLastPrivate)) { - // lastprivate case + if (!hostIsSource) { lhs_sb = &hsb; rhs_sb = &sb; } else { @@ -926,11 +925,6 @@ class FirConverter : public Fortran::lower::AbstractConverter { } copyVar(sym, *lhs_sb, *rhs_sb, sym.flags()); - - if (copyAssignIP && copyAssignIP->isSet() && - sym.test(Fortran::semantics::Symbol::Flag::OmpLastPrivate)) { - builder->restoreInsertionPoint(insPt); - } } void genEval(Fortran::lower::pft::Evaluation &eval, diff --git a/flang/lib/Lower/CMakeLists.txt b/flang/lib/Lower/CMakeLists.txt index f57f0e7a77a01..f611010765cb5 100644 --- a/flang/lib/Lower/CMakeLists.txt +++ b/flang/lib/Lower/CMakeLists.txt @@ -55,17 +55,19 @@ add_flang_library(FortranLower FIRSupport FIRTransforms HLFIRDialect - ${dialect_libs} - ${extension_libs} FortranCommon FortranParser FortranEvaluate FortranSemantics + + LINK_COMPONENTS + Support + + MLIR_LIBS + ${dialect_libs} + ${extension_libs} MLIRAffineToStandard MLIRFuncDialect MLIRLLVMDialect MLIRSCFToControlFlow - - LINK_COMPONENTS - Support ) diff --git a/flang/lib/Lower/ConvertType.cpp b/flang/lib/Lower/ConvertType.cpp index 037d4335fedf1..2fab520e6c475 100644 --- a/flang/lib/Lower/ConvertType.cpp +++ b/flang/lib/Lower/ConvertType.cpp @@ -36,17 +36,17 @@ static mlir::Type genRealType(mlir::MLIRContext *context, int kind) { Fortran::common::TypeCategory::Real, kind)) { switch (kind) { case 2: - return mlir::FloatType::getF16(context); + return mlir::Float16Type::get(context); case 3: - return mlir::FloatType::getBF16(context); + return mlir::BFloat16Type::get(context); case 4: - return 
mlir::FloatType::getF32(context); + return mlir::Float32Type::get(context); case 8: - return mlir::FloatType::getF64(context); + return mlir::Float64Type::get(context); case 10: - return mlir::FloatType::getF80(context); + return mlir::Float80Type::get(context); case 16: - return mlir::FloatType::getF128(context); + return mlir::Float128Type::get(context); } } llvm_unreachable("REAL type translation not implemented"); diff --git a/flang/lib/Lower/OpenMP/DataSharingProcessor.cpp b/flang/lib/Lower/OpenMP/DataSharingProcessor.cpp index 9dfdbd8337ae9..5b89816850bed 100644 --- a/flang/lib/Lower/OpenMP/DataSharingProcessor.cpp +++ b/flang/lib/Lower/OpenMP/DataSharingProcessor.cpp @@ -145,7 +145,7 @@ void DataSharingProcessor::copyFirstPrivateSymbol( void DataSharingProcessor::copyLastPrivateSymbol( const semantics::Symbol *sym, mlir::OpBuilder::InsertPoint *lastPrivIP) { if (sym->test(semantics::Symbol::Flag::OmpLastPrivate)) - converter.copyHostAssociateVar(*sym, lastPrivIP); + converter.copyHostAssociateVar(*sym, lastPrivIP, /*hostIsSource=*/false); } void DataSharingProcessor::collectOmpObjectListSymbol( diff --git a/flang/lib/Lower/OpenMP/OpenMP.cpp b/flang/lib/Lower/OpenMP/OpenMP.cpp index 158f76250572e..52541bb91481d 100644 --- a/flang/lib/Lower/OpenMP/OpenMP.cpp +++ b/flang/lib/Lower/OpenMP/OpenMP.cpp @@ -2082,7 +2082,7 @@ genSectionsOp(lower::AbstractConverter &converter, lower::SymMap &symTable, const auto &objList = std::get(lastp->t); for (const Object &object : objList) { semantics::Symbol *sym = object.sym(); - converter.copyHostAssociateVar(*sym, &insp); + converter.copyHostAssociateVar(*sym, &insp, /*hostIsSource=*/false); } } } @@ -2511,7 +2511,7 @@ static void genStandaloneDo(lower::AbstractConverter &converter, DataSharingProcessor dsp(converter, semaCtx, item->clauses, eval, /*shouldCollectPreDeterminedSymbols=*/true, - enableDelayedPrivatizationStaging, symTable); + enableDelayedPrivatization, symTable); dsp.processStep1(&wsloopClauseOps); 
mlir::omp::LoopNestOperands loopNestClauseOps; diff --git a/flang/lib/Lower/Runtime.cpp b/flang/lib/Lower/Runtime.cpp index 3474832bdb225..2be5ef76e46b8 100644 --- a/flang/lib/Lower/Runtime.cpp +++ b/flang/lib/Lower/Runtime.cpp @@ -210,7 +210,7 @@ void Fortran::lower::genPointerAssociate(fir::FirOpBuilder &builder, fir::runtime::getRuntimeFunc(loc, builder); llvm::SmallVector args = fir::runtime::createArguments( builder, loc, func.getFunctionType(), pointer, target); - builder.create(loc, func, args).getResult(0); + builder.create(loc, func, args); } void Fortran::lower::genPointerAssociateRemapping(fir::FirOpBuilder &builder, @@ -228,7 +228,7 @@ void Fortran::lower::genPointerAssociateRemapping(fir::FirOpBuilder &builder, llvm::SmallVector args = fir::runtime::createArguments( builder, loc, func.getFunctionType(), pointer, target, bounds, sourceFile, sourceLine); - builder.create(loc, func, args).getResult(0); + builder.create(loc, func, args); } void Fortran::lower::genPointerAssociateLowerBounds(fir::FirOpBuilder &builder, @@ -241,5 +241,5 @@ void Fortran::lower::genPointerAssociateLowerBounds(fir::FirOpBuilder &builder, loc, builder); llvm::SmallVector args = fir::runtime::createArguments( builder, loc, func.getFunctionType(), pointer, target, lbounds); - builder.create(loc, func, args).getResult(0); + builder.create(loc, func, args); } diff --git a/flang/lib/Optimizer/Analysis/CMakeLists.txt b/flang/lib/Optimizer/Analysis/CMakeLists.txt index 1358219fd98d5..6fe9c70f83765 100644 --- a/flang/lib/Optimizer/Analysis/CMakeLists.txt +++ b/flang/lib/Optimizer/Analysis/CMakeLists.txt @@ -13,6 +13,8 @@ add_flang_library(FIRAnalysis FIRBuilder FIRDialect HLFIRDialect + + MLIR_LIBS MLIRFuncDialect MLIRLLVMDialect MLIRMathTransforms diff --git a/flang/lib/Optimizer/Builder/CMakeLists.txt b/flang/lib/Optimizer/Builder/CMakeLists.txt index 0960e858c4111..f8faeaa81c90c 100644 --- a/flang/lib/Optimizer/Builder/CMakeLists.txt +++ b/flang/lib/Optimizer/Builder/CMakeLists.txt 
@@ -51,6 +51,8 @@ add_flang_library(FIRBuilder FIRSupport FortranEvaluate HLFIRDialect + + MLIR_LIBS ${dialect_libs} ${extension_libs} ) diff --git a/flang/lib/Optimizer/Builder/FIRBuilder.cpp b/flang/lib/Optimizer/Builder/FIRBuilder.cpp index d01becfe80093..64c540cfb95ae 100644 --- a/flang/lib/Optimizer/Builder/FIRBuilder.cpp +++ b/flang/lib/Optimizer/Builder/FIRBuilder.cpp @@ -105,17 +105,17 @@ mlir::Type fir::FirOpBuilder::getVarLenSeqTy(mlir::Type eleTy, unsigned rank) { mlir::Type fir::FirOpBuilder::getRealType(int kind) { switch (kindMap.getRealTypeID(kind)) { case llvm::Type::TypeID::HalfTyID: - return mlir::FloatType::getF16(getContext()); + return mlir::Float16Type::get(getContext()); case llvm::Type::TypeID::BFloatTyID: - return mlir::FloatType::getBF16(getContext()); + return mlir::BFloat16Type::get(getContext()); case llvm::Type::TypeID::FloatTyID: - return mlir::FloatType::getF32(getContext()); + return mlir::Float32Type::get(getContext()); case llvm::Type::TypeID::DoubleTyID: - return mlir::FloatType::getF64(getContext()); + return mlir::Float64Type::get(getContext()); case llvm::Type::TypeID::X86_FP80TyID: - return mlir::FloatType::getF80(getContext()); + return mlir::Float80Type::get(getContext()); case llvm::Type::TypeID::FP128TyID: - return mlir::FloatType::getF128(getContext()); + return mlir::Float128Type::get(getContext()); default: fir::emitFatalError(mlir::UnknownLoc::get(getContext()), "unsupported type !fir.real"); @@ -1401,6 +1401,10 @@ static void genComponentByComponentAssignment(fir::FirOpBuilder &builder, /// Can the assignment of this record type be implement with a simple memory /// copy (it requires no deep copy or user defined assignment of components )? static bool recordTypeCanBeMemCopied(fir::RecordType recordType) { + // c_devptr type is a special case. It has a nested c_ptr field but we know it + // can be copied directly. 
+ if (fir::isa_builtin_c_devptr_type(recordType)) + return true; if (fir::hasDynamicSize(recordType)) return false; for (auto [_, fieldType] : recordType.getTypeList()) { @@ -1740,3 +1744,17 @@ uint64_t fir::factory::getAllocaAddressSpace(mlir::DataLayout *dataLayout) { return mlir::cast(addrSpace).getUInt(); return 0; } + +llvm::SmallVector +fir::factory::deduceOptimalExtents(mlir::ValueRange extents1, + mlir::ValueRange extents2) { + llvm::SmallVector extents; + extents.reserve(extents1.size()); + for (auto [extent1, extent2] : llvm::zip(extents1, extents2)) { + if (!fir::getIntIfConstant(extent1) && fir::getIntIfConstant(extent2)) + extents.push_back(extent2); + else + extents.push_back(extent1); + } + return extents; +} diff --git a/flang/lib/Optimizer/Builder/HLFIRTools.cpp b/flang/lib/Optimizer/Builder/HLFIRTools.cpp index 94238bc24e453..f71adf123511d 100644 --- a/flang/lib/Optimizer/Builder/HLFIRTools.cpp +++ b/flang/lib/Optimizer/Builder/HLFIRTools.cpp @@ -939,8 +939,10 @@ llvm::SmallVector hlfir::genLoopNestWithReductions( doLoop = builder.create(loc, one, ub, one, isUnordered, /*finalCountValue=*/false, parentLoop.getRegionIterArgs()); - // Return the results of the child loop from its parent loop. - builder.create(loc, doLoop.getResults()); + if (!reductionInits.empty()) { + // Return the results of the child loop from its parent loop. 
+ builder.create(loc, doLoop.getResults()); + } } builder.setInsertionPointToStart(doLoop.getBody()); @@ -955,7 +957,8 @@ llvm::SmallVector hlfir::genLoopNestWithReductions( reductionValues = genBody(loc, builder, oneBasedIndices, parentLoop.getRegionIterArgs()); builder.setInsertionPointToEnd(parentLoop.getBody()); - builder.create(loc, reductionValues); + if (!reductionValues.empty()) + builder.create(loc, reductionValues); builder.setInsertionPointAfter(outerLoop); return outerLoop->getResults(); } @@ -1410,3 +1413,23 @@ void hlfir::computeEvaluateOpIn(mlir::Location loc, fir::FirOpBuilder &builder, builder.clone(op, mapper); return; } + +hlfir::Entity hlfir::loadElementAt(mlir::Location loc, + fir::FirOpBuilder &builder, + hlfir::Entity entity, + mlir::ValueRange oneBasedIndices) { + return loadTrivialScalar(loc, builder, + getElementAt(loc, builder, entity, oneBasedIndices)); +} + +llvm::SmallVector +hlfir::genExtentsVector(mlir::Location loc, fir::FirOpBuilder &builder, + hlfir::Entity entity) { + entity = hlfir::derefPointersAndAllocatables(loc, builder, entity); + mlir::Value shape = hlfir::genShape(loc, builder, entity); + llvm::SmallVector extents = + hlfir::getExplicitExtentsFromShape(shape, builder); + if (shape.getUses().empty()) + shape.getDefiningOp()->erase(); + return extents; +} diff --git a/flang/lib/Optimizer/Builder/IntrinsicCall.cpp b/flang/lib/Optimizer/Builder/IntrinsicCall.cpp index f6f2e15e469e6..6a343645ab878 100644 --- a/flang/lib/Optimizer/Builder/IntrinsicCall.cpp +++ b/flang/lib/Optimizer/Builder/IntrinsicCall.cpp @@ -2367,7 +2367,7 @@ mlir::Value IntrinsicLibrary::genAcosd(mlir::Type resultType, mlir::FunctionType::get(context, {resultType}, {args[0].getType()}); llvm::APFloat pi = llvm::APFloat(llvm::numbers::pi); mlir::Value dfactor = builder.createRealConstant( - loc, mlir::FloatType::getF64(context), pi / llvm::APFloat(180.0)); + loc, mlir::Float64Type::get(context), pi / llvm::APFloat(180.0)); mlir::Value factor = 
builder.createConvert(loc, args[0].getType(), dfactor); mlir::Value arg = builder.create(loc, args[0], factor); return getRuntimeCallGenerator("acos", ftype)(builder, loc, {arg}); @@ -2518,7 +2518,7 @@ mlir::Value IntrinsicLibrary::genAsind(mlir::Type resultType, mlir::FunctionType::get(context, {resultType}, {args[0].getType()}); llvm::APFloat pi = llvm::APFloat(llvm::numbers::pi); mlir::Value dfactor = builder.createRealConstant( - loc, mlir::FloatType::getF64(context), pi / llvm::APFloat(180.0)); + loc, mlir::Float64Type::get(context), pi / llvm::APFloat(180.0)); mlir::Value factor = builder.createConvert(loc, args[0].getType(), dfactor); mlir::Value arg = builder.create(loc, args[0], factor); return getRuntimeCallGenerator("asin", ftype)(builder, loc, {arg}); @@ -2544,7 +2544,7 @@ mlir::Value IntrinsicLibrary::genAtand(mlir::Type resultType, } llvm::APFloat pi = llvm::APFloat(llvm::numbers::pi); mlir::Value dfactor = builder.createRealConstant( - loc, mlir::FloatType::getF64(context), llvm::APFloat(180.0) / pi); + loc, mlir::Float64Type::get(context), llvm::APFloat(180.0) / pi); mlir::Value factor = builder.createConvert(loc, resultType, dfactor); return builder.create(loc, atan, factor); } @@ -2569,7 +2569,7 @@ mlir::Value IntrinsicLibrary::genAtanpi(mlir::Type resultType, } llvm::APFloat inv_pi = llvm::APFloat(llvm::numbers::inv_pi); mlir::Value dfactor = - builder.createRealConstant(loc, mlir::FloatType::getF64(context), inv_pi); + builder.createRealConstant(loc, mlir::Float64Type::get(context), inv_pi); mlir::Value factor = builder.createConvert(loc, resultType, dfactor); return builder.create(loc, atan, factor); } @@ -3124,7 +3124,7 @@ mlir::Value IntrinsicLibrary::genCosd(mlir::Type resultType, mlir::FunctionType::get(context, {resultType}, {args[0].getType()}); llvm::APFloat pi = llvm::APFloat(llvm::numbers::pi); mlir::Value dfactor = builder.createRealConstant( - loc, mlir::FloatType::getF64(context), pi / llvm::APFloat(180.0)); + loc, 
mlir::Float64Type::get(context), pi / llvm::APFloat(180.0)); mlir::Value factor = builder.createConvert(loc, args[0].getType(), dfactor); mlir::Value arg = builder.create(loc, args[0], factor); return getRuntimeCallGenerator("cos", ftype)(builder, loc, {arg}); @@ -4418,12 +4418,12 @@ IntrinsicLibrary::genIeeeCopySign(mlir::Type resultType, mlir::FloatType yRealType = mlir::dyn_cast(yRealVal.getType()); - if (yRealType == mlir::FloatType::getBF16(builder.getContext())) { + if (yRealType == mlir::BFloat16Type::get(builder.getContext())) { // Workaround: CopySignOp and BitcastOp don't work for kind 3 arg Y. // This conversion should always preserve the sign bit. yRealVal = builder.createConvert( - loc, mlir::FloatType::getF32(builder.getContext()), yRealVal); - yRealType = mlir::FloatType::getF32(builder.getContext()); + loc, mlir::Float32Type::get(builder.getContext()), yRealVal); + yRealType = mlir::Float32Type::get(builder.getContext()); } // Args have the same type. @@ -4979,7 +4979,7 @@ mlir::Value IntrinsicLibrary::genIeeeReal(mlir::Type resultType, assert(args.size() == 2); mlir::Type i1Ty = builder.getI1Type(); - mlir::Type f32Ty = mlir::FloatType::getF32(builder.getContext()); + mlir::Type f32Ty = mlir::Float32Type::get(builder.getContext()); mlir::Value a = args[0]; mlir::Type aType = a.getType(); @@ -5179,7 +5179,7 @@ mlir::Value IntrinsicLibrary::genIeeeRem(mlir::Type resultType, mlir::Value x = args[0]; mlir::Value y = args[1]; if (mlir::dyn_cast(resultType).getWidth() < 32) { - mlir::Type f32Ty = mlir::FloatType::getF32(builder.getContext()); + mlir::Type f32Ty = mlir::Float32Type::get(builder.getContext()); x = builder.create(loc, f32Ty, x); y = builder.create(loc, f32Ty, y); } else { @@ -5213,7 +5213,7 @@ mlir::Value IntrinsicLibrary::genIeeeRint(mlir::Type resultType, } if (mlir::cast(resultType).getWidth() == 16) a = builder.create( - loc, mlir::FloatType::getF32(builder.getContext()), a); + loc, mlir::Float32Type::get(builder.getContext()), a); 
mlir::Value result = builder.create( loc, resultType, genRuntimeCall("nearbyint", a.getType(), a)); if (isStaticallyPresent(args[1])) { @@ -5298,10 +5298,10 @@ mlir::Value IntrinsicLibrary::genIeeeSignbit(mlir::Type resultType, mlir::Value realVal = args[0]; mlir::FloatType realType = mlir::dyn_cast(realVal.getType()); int bitWidth = realType.getWidth(); - if (realType == mlir::FloatType::getBF16(builder.getContext())) { + if (realType == mlir::BFloat16Type::get(builder.getContext())) { // Workaround: can't bitcast or convert real(3) to integer(2) or real(2). realVal = builder.createConvert( - loc, mlir::FloatType::getF32(builder.getContext()), realVal); + loc, mlir::Float32Type::get(builder.getContext()), realVal); bitWidth = 32; } mlir::Type intType = builder.getIntegerType(bitWidth); @@ -6065,7 +6065,7 @@ mlir::Value IntrinsicLibrary::genModulo(mlir::Type resultType, auto fastMathFlags = builder.getFastMathFlags(); // F128 arith::RemFOp may be lowered to a runtime call that may be unsupported // on the target, so generate a call to Fortran Runtime's ModuloReal16. 
- if (resultType == mlir::FloatType::getF128(builder.getContext()) || + if (resultType == mlir::Float128Type::get(builder.getContext()) || (fastMathFlags & mlir::arith::FastMathFlags::ninf) == mlir::arith::FastMathFlags::none) return builder.createConvert( @@ -6254,7 +6254,7 @@ mlir::Value IntrinsicLibrary::genNearest(mlir::Type resultType, mlir::FloatType yType = mlir::dyn_cast(args[1].getType()); const unsigned yBitWidth = yType.getWidth(); if (xType != yType) { - mlir::Type f32Ty = mlir::FloatType::getF32(builder.getContext()); + mlir::Type f32Ty = mlir::Float32Type::get(builder.getContext()); if (xBitWidth < 32) x1 = builder.createConvert(loc, f32Ty, x1); if (yBitWidth > 32 && yBitWidth > xBitWidth) @@ -7205,7 +7205,7 @@ mlir::Value IntrinsicLibrary::genSind(mlir::Type resultType, mlir::FunctionType::get(context, {resultType}, {args[0].getType()}); llvm::APFloat pi = llvm::APFloat(llvm::numbers::pi); mlir::Value dfactor = builder.createRealConstant( - loc, mlir::FloatType::getF64(context), pi / llvm::APFloat(180.0)); + loc, mlir::Float64Type::get(context), pi / llvm::APFloat(180.0)); mlir::Value factor = builder.createConvert(loc, args[0].getType(), dfactor); mlir::Value arg = builder.create(loc, args[0], factor); return getRuntimeCallGenerator("sin", ftype)(builder, loc, {arg}); @@ -7286,7 +7286,7 @@ mlir::Value IntrinsicLibrary::genTand(mlir::Type resultType, mlir::FunctionType::get(context, {resultType}, {args[0].getType()}); llvm::APFloat pi = llvm::APFloat(llvm::numbers::pi); mlir::Value dfactor = builder.createRealConstant( - loc, mlir::FloatType::getF64(context), pi / llvm::APFloat(180.0)); + loc, mlir::Float64Type::get(context), pi / llvm::APFloat(180.0)); mlir::Value factor = builder.createConvert(loc, args[0].getType(), dfactor); mlir::Value arg = builder.create(loc, args[0], factor); return getRuntimeCallGenerator("tan", ftype)(builder, loc, {arg}); diff --git a/flang/lib/Optimizer/Builder/PPCIntrinsicCall.cpp 
b/flang/lib/Optimizer/Builder/PPCIntrinsicCall.cpp index b3b07d18a956b..fcc91752552c3 100644 --- a/flang/lib/Optimizer/Builder/PPCIntrinsicCall.cpp +++ b/flang/lib/Optimizer/Builder/PPCIntrinsicCall.cpp @@ -1579,7 +1579,7 @@ PPCIntrinsicLibrary::genVecConvert(mlir::Type resultType, return callOp.getResult(0); } else if (width == 64) { - auto fTy{mlir::FloatType::getF64(context)}; + auto fTy{mlir::Float64Type::get(context)}; auto ty{mlir::VectorType::get(2, fTy)}; // vec_vtf(arg1, arg2) = fmul(1.0 / (1 << arg2), llvm.sitofp(arg1)) @@ -1639,7 +1639,7 @@ PPCIntrinsicLibrary::genVecConvert(mlir::Type resultType, newArgs[0] = builder.create(loc, funcOp, newArgs).getResult(0); auto fvf32Ty{newArgs[0].getType()}; - auto f32type{mlir::FloatType::getF32(context)}; + auto f32type{mlir::Float32Type::get(context)}; auto mvf32Ty{mlir::VectorType::get(4, f32type)}; newArgs[0] = builder.createConvert(loc, mvf32Ty, newArgs[0]); @@ -1949,7 +1949,7 @@ PPCIntrinsicLibrary::genVecLdCallGrp(mlir::Type resultType, fname = isBEVecElemOrderOnLE() ? "llvm.ppc.vsx.lxvd2x.be" : "llvm.ppc.vsx.lxvd2x"; // llvm.ppc.altivec.lxvd2x* returns <2 x double> - intrinResTy = mlir::VectorType::get(2, mlir::FloatType::getF64(context)); + intrinResTy = mlir::VectorType::get(2, mlir::Float64Type::get(context)); } break; case VecOp::Xlw4: fname = isBEVecElemOrderOnLE() ? 
"llvm.ppc.vsx.lxvw4x.be" @@ -2092,7 +2092,7 @@ PPCIntrinsicLibrary::genVecPerm(mlir::Type resultType, auto mlirTy{vecTyInfo.toMlirVectorType(context)}; auto vi32Ty{mlir::VectorType::get(4, mlir::IntegerType::get(context, 32))}; - auto vf64Ty{mlir::VectorType::get(2, mlir::FloatType::getF64(context))}; + auto vf64Ty{mlir::VectorType::get(2, mlir::Float64Type::get(context))}; auto mArg0{builder.createConvert(loc, mlirTy, argBases[0])}; auto mArg1{builder.createConvert(loc, mlirTy, argBases[1])}; diff --git a/flang/lib/Optimizer/Builder/Runtime/Inquiry.cpp b/flang/lib/Optimizer/Builder/Runtime/Inquiry.cpp index e01a6f05b5fdd..f4d03c95ae518 100644 --- a/flang/lib/Optimizer/Builder/Runtime/Inquiry.cpp +++ b/flang/lib/Optimizer/Builder/Runtime/Inquiry.cpp @@ -40,7 +40,7 @@ void fir::runtime::genLbound(fir::FirOpBuilder &builder, mlir::Location loc, fir::factory::locationToLineNo(builder, loc, fTy.getInput(4)); auto args = fir::runtime::createArguments( builder, loc, fTy, resultAddr, array, kind, sourceFile, sourceLine); - builder.create(loc, func, args).getResult(0); + builder.create(loc, func, args); } /// Generate call to `Ubound` runtime routine. Calls to UBOUND with a DIM @@ -57,7 +57,7 @@ void fir::runtime::genUbound(fir::FirOpBuilder &builder, mlir::Location loc, fir::factory::locationToLineNo(builder, loc, fTy.getInput(2)); auto args = fir::runtime::createArguments(builder, loc, fTy, resultBox, array, kind, sourceFile, sourceLine); - builder.create(loc, uboundFunc, args).getResult(0); + builder.create(loc, uboundFunc, args); } /// Generate call to `Size` runtime routine. 
This routine is a version when @@ -113,5 +113,5 @@ void fir::runtime::genShape(fir::FirOpBuilder &builder, mlir::Location loc, fir::factory::locationToLineNo(builder, loc, fTy.getInput(4)); auto args = fir::runtime::createArguments( builder, loc, fTy, resultAddr, array, kind, sourceFile, sourceLine); - builder.create(loc, func, args).getResult(0); + builder.create(loc, func, args); } diff --git a/flang/lib/Optimizer/Builder/Runtime/Intrinsics.cpp b/flang/lib/Optimizer/Builder/Runtime/Intrinsics.cpp index ded9579f2c1df..963051ccdc379 100644 --- a/flang/lib/Optimizer/Builder/Runtime/Intrinsics.cpp +++ b/flang/lib/Optimizer/Builder/Runtime/Intrinsics.cpp @@ -38,8 +38,7 @@ struct ForcedRandomNumberReal16 { auto strTy = fir::runtime::getModel()(ctx); auto intTy = fir::runtime::getModel()(ctx); ; - return mlir::FunctionType::get(ctx, {boxTy, strTy, intTy}, - mlir::NoneType::get(ctx)); + return mlir::FunctionType::get(ctx, {boxTy, strTy, intTy}, {}); }; } }; diff --git a/flang/lib/Optimizer/Builder/Runtime/Numeric.cpp b/flang/lib/Optimizer/Builder/Runtime/Numeric.cpp index d0092add0118f..4ff7c86bb0a24 100644 --- a/flang/lib/Optimizer/Builder/Runtime/Numeric.cpp +++ b/flang/lib/Optimizer/Builder/Runtime/Numeric.cpp @@ -27,7 +27,7 @@ struct ForcedErfcScaled10 { static constexpr const char *name = ExpandAndQuoteKey(RTNAME(ErfcScaled10)); static constexpr fir::runtime::FuncTypeBuilderFunc getTypeModel() { return [](mlir::MLIRContext *ctx) { - auto ty = mlir::FloatType::getF80(ctx); + auto ty = mlir::Float80Type::get(ctx); return mlir::FunctionType::get(ctx, {ty}, {ty}); }; } @@ -38,7 +38,7 @@ struct ForcedErfcScaled16 { static constexpr const char *name = ExpandAndQuoteKey(RTNAME(ErfcScaled16)); static constexpr fir::runtime::FuncTypeBuilderFunc getTypeModel() { return [](mlir::MLIRContext *ctx) { - auto ty = mlir::FloatType::getF128(ctx); + auto ty = mlir::Float128Type::get(ctx); return mlir::FunctionType::get(ctx, {ty}, {ty}); }; } @@ -49,7 +49,7 @@ struct 
ForcedExponent10_4 { static constexpr const char *name = ExpandAndQuoteKey(RTNAME(Exponent10_4)); static constexpr fir::runtime::FuncTypeBuilderFunc getTypeModel() { return [](mlir::MLIRContext *ctx) { - auto fltTy = mlir::FloatType::getF80(ctx); + auto fltTy = mlir::Float80Type::get(ctx); auto intTy = mlir::IntegerType::get(ctx, 32); return mlir::FunctionType::get(ctx, fltTy, intTy); }; @@ -60,7 +60,7 @@ struct ForcedExponent10_8 { static constexpr const char *name = ExpandAndQuoteKey(RTNAME(Exponent10_8)); static constexpr fir::runtime::FuncTypeBuilderFunc getTypeModel() { return [](mlir::MLIRContext *ctx) { - auto fltTy = mlir::FloatType::getF80(ctx); + auto fltTy = mlir::Float80Type::get(ctx); auto intTy = mlir::IntegerType::get(ctx, 64); return mlir::FunctionType::get(ctx, fltTy, intTy); }; @@ -72,7 +72,7 @@ struct ForcedExponent16_4 { static constexpr const char *name = ExpandAndQuoteKey(RTNAME(Exponent16_4)); static constexpr fir::runtime::FuncTypeBuilderFunc getTypeModel() { return [](mlir::MLIRContext *ctx) { - auto fltTy = mlir::FloatType::getF128(ctx); + auto fltTy = mlir::Float128Type::get(ctx); auto intTy = mlir::IntegerType::get(ctx, 32); return mlir::FunctionType::get(ctx, fltTy, intTy); }; @@ -83,7 +83,7 @@ struct ForcedExponent16_8 { static constexpr const char *name = ExpandAndQuoteKey(RTNAME(Exponent16_8)); static constexpr fir::runtime::FuncTypeBuilderFunc getTypeModel() { return [](mlir::MLIRContext *ctx) { - auto fltTy = mlir::FloatType::getF128(ctx); + auto fltTy = mlir::Float128Type::get(ctx); auto intTy = mlir::IntegerType::get(ctx, 64); return mlir::FunctionType::get(ctx, fltTy, intTy); }; @@ -95,7 +95,7 @@ struct ForcedFraction10 { static constexpr const char *name = ExpandAndQuoteKey(RTNAME(Fraction10)); static constexpr fir::runtime::FuncTypeBuilderFunc getTypeModel() { return [](mlir::MLIRContext *ctx) { - auto ty = mlir::FloatType::getF80(ctx); + auto ty = mlir::Float80Type::get(ctx); return mlir::FunctionType::get(ctx, {ty}, {ty}); 
}; } @@ -106,7 +106,7 @@ struct ForcedFraction16 { static constexpr const char *name = ExpandAndQuoteKey(RTNAME(Fraction16)); static constexpr fir::runtime::FuncTypeBuilderFunc getTypeModel() { return [](mlir::MLIRContext *ctx) { - auto ty = mlir::FloatType::getF128(ctx); + auto ty = mlir::Float128Type::get(ctx); return mlir::FunctionType::get(ctx, {ty}, {ty}); }; } @@ -117,7 +117,7 @@ struct ForcedMod10 { static constexpr const char *name = ExpandAndQuoteKey(RTNAME(ModReal10)); static constexpr fir::runtime::FuncTypeBuilderFunc getTypeModel() { return [](mlir::MLIRContext *ctx) { - auto fltTy = mlir::FloatType::getF80(ctx); + auto fltTy = mlir::Float80Type::get(ctx); auto strTy = fir::ReferenceType::get(mlir::IntegerType::get(ctx, 8)); auto intTy = mlir::IntegerType::get(ctx, 8 * sizeof(int)); return mlir::FunctionType::get(ctx, {fltTy, fltTy, strTy, intTy}, @@ -131,7 +131,7 @@ struct ForcedMod16 { static constexpr const char *name = ExpandAndQuoteKey(RTNAME(ModReal16)); static constexpr fir::runtime::FuncTypeBuilderFunc getTypeModel() { return [](mlir::MLIRContext *ctx) { - auto fltTy = mlir::FloatType::getF128(ctx); + auto fltTy = mlir::Float128Type::get(ctx); auto strTy = fir::ReferenceType::get(mlir::IntegerType::get(ctx, 8)); auto intTy = mlir::IntegerType::get(ctx, 8 * sizeof(int)); return mlir::FunctionType::get(ctx, {fltTy, fltTy, strTy, intTy}, @@ -145,7 +145,7 @@ struct ForcedModulo10 { static constexpr const char *name = ExpandAndQuoteKey(RTNAME(ModuloReal10)); static constexpr fir::runtime::FuncTypeBuilderFunc getTypeModel() { return [](mlir::MLIRContext *ctx) { - auto fltTy = mlir::FloatType::getF80(ctx); + auto fltTy = mlir::Float80Type::get(ctx); auto strTy = fir::ReferenceType::get(mlir::IntegerType::get(ctx, 8)); auto intTy = mlir::IntegerType::get(ctx, 8 * sizeof(int)); return mlir::FunctionType::get(ctx, {fltTy, fltTy, strTy, intTy}, @@ -159,7 +159,7 @@ struct ForcedModulo16 { static constexpr const char *name = 
ExpandAndQuoteKey(RTNAME(ModuloReal16)); static constexpr fir::runtime::FuncTypeBuilderFunc getTypeModel() { return [](mlir::MLIRContext *ctx) { - auto fltTy = mlir::FloatType::getF128(ctx); + auto fltTy = mlir::Float128Type::get(ctx); auto strTy = fir::ReferenceType::get(mlir::IntegerType::get(ctx, 8)); auto intTy = mlir::IntegerType::get(ctx, 8 * sizeof(int)); return mlir::FunctionType::get(ctx, {fltTy, fltTy, strTy, intTy}, @@ -173,7 +173,7 @@ struct ForcedNearest10 { static constexpr const char *name = ExpandAndQuoteKey(RTNAME(Nearest10)); static constexpr fir::runtime::FuncTypeBuilderFunc getTypeModel() { return [](mlir::MLIRContext *ctx) { - auto fltTy = mlir::FloatType::getF80(ctx); + auto fltTy = mlir::Float80Type::get(ctx); auto boolTy = mlir::IntegerType::get(ctx, 1); return mlir::FunctionType::get(ctx, {fltTy, boolTy}, {fltTy}); }; @@ -185,7 +185,7 @@ struct ForcedNearest16 { static constexpr const char *name = ExpandAndQuoteKey(RTNAME(Nearest16)); static constexpr fir::runtime::FuncTypeBuilderFunc getTypeModel() { return [](mlir::MLIRContext *ctx) { - auto fltTy = mlir::FloatType::getF128(ctx); + auto fltTy = mlir::Float128Type::get(ctx); auto boolTy = mlir::IntegerType::get(ctx, 1); return mlir::FunctionType::get(ctx, {fltTy, boolTy}, {fltTy}); }; @@ -197,7 +197,7 @@ struct ForcedRRSpacing10 { static constexpr const char *name = ExpandAndQuoteKey(RTNAME(RRSpacing10)); static constexpr fir::runtime::FuncTypeBuilderFunc getTypeModel() { return [](mlir::MLIRContext *ctx) { - auto ty = mlir::FloatType::getF80(ctx); + auto ty = mlir::Float80Type::get(ctx); return mlir::FunctionType::get(ctx, {ty}, {ty}); }; } @@ -208,7 +208,7 @@ struct ForcedRRSpacing16 { static constexpr const char *name = ExpandAndQuoteKey(RTNAME(RRSpacing16)); static constexpr fir::runtime::FuncTypeBuilderFunc getTypeModel() { return [](mlir::MLIRContext *ctx) { - auto ty = mlir::FloatType::getF128(ctx); + auto ty = mlir::Float128Type::get(ctx); return mlir::FunctionType::get(ctx, {ty}, 
{ty}); }; } @@ -219,7 +219,7 @@ struct ForcedScale10 { static constexpr const char *name = ExpandAndQuoteKey(RTNAME(Scale10)); static constexpr fir::runtime::FuncTypeBuilderFunc getTypeModel() { return [](mlir::MLIRContext *ctx) { - auto fltTy = mlir::FloatType::getF80(ctx); + auto fltTy = mlir::Float80Type::get(ctx); auto intTy = mlir::IntegerType::get(ctx, 64); return mlir::FunctionType::get(ctx, {fltTy, intTy}, {fltTy}); }; @@ -231,7 +231,7 @@ struct ForcedScale16 { static constexpr const char *name = ExpandAndQuoteKey(RTNAME(Scale16)); static constexpr fir::runtime::FuncTypeBuilderFunc getTypeModel() { return [](mlir::MLIRContext *ctx) { - auto fltTy = mlir::FloatType::getF128(ctx); + auto fltTy = mlir::Float128Type::get(ctx); auto intTy = mlir::IntegerType::get(ctx, 64); return mlir::FunctionType::get(ctx, {fltTy, intTy}, {fltTy}); }; @@ -243,7 +243,7 @@ struct ForcedSetExponent10 { static constexpr const char *name = ExpandAndQuoteKey(RTNAME(SetExponent10)); static constexpr fir::runtime::FuncTypeBuilderFunc getTypeModel() { return [](mlir::MLIRContext *ctx) { - auto fltTy = mlir::FloatType::getF80(ctx); + auto fltTy = mlir::Float80Type::get(ctx); auto intTy = mlir::IntegerType::get(ctx, 64); return mlir::FunctionType::get(ctx, {fltTy, intTy}, {fltTy}); }; @@ -255,7 +255,7 @@ struct ForcedSetExponent16 { static constexpr const char *name = ExpandAndQuoteKey(RTNAME(SetExponent16)); static constexpr fir::runtime::FuncTypeBuilderFunc getTypeModel() { return [](mlir::MLIRContext *ctx) { - auto fltTy = mlir::FloatType::getF128(ctx); + auto fltTy = mlir::Float128Type::get(ctx); auto intTy = mlir::IntegerType::get(ctx, 64); return mlir::FunctionType::get(ctx, {fltTy, intTy}, {fltTy}); }; @@ -267,7 +267,7 @@ struct ForcedSpacing10 { static constexpr const char *name = ExpandAndQuoteKey(RTNAME(Spacing10)); static constexpr fir::runtime::FuncTypeBuilderFunc getTypeModel() { return [](mlir::MLIRContext *ctx) { - auto ty = mlir::FloatType::getF80(ctx); + auto ty = 
mlir::Float80Type::get(ctx); return mlir::FunctionType::get(ctx, {ty}, {ty}); }; } @@ -278,7 +278,7 @@ struct ForcedSpacing16 { static constexpr const char *name = ExpandAndQuoteKey(RTNAME(Spacing16)); static constexpr fir::runtime::FuncTypeBuilderFunc getTypeModel() { return [](mlir::MLIRContext *ctx) { - auto ty = mlir::FloatType::getF128(ctx); + auto ty = mlir::Float128Type::get(ctx); return mlir::FunctionType::get(ctx, {ty}, {ty}); }; } diff --git a/flang/lib/Optimizer/Builder/Runtime/Reduction.cpp b/flang/lib/Optimizer/Builder/Runtime/Reduction.cpp index 1aa941bd2131c..f778b963c59ca 100644 --- a/flang/lib/Optimizer/Builder/Runtime/Reduction.cpp +++ b/flang/lib/Optimizer/Builder/Runtime/Reduction.cpp @@ -27,7 +27,7 @@ struct ForcedMaxvalReal10 { static constexpr const char *name = ExpandAndQuoteKey(RTNAME(MaxvalReal10)); static constexpr fir::runtime::FuncTypeBuilderFunc getTypeModel() { return [](mlir::MLIRContext *ctx) { - auto ty = mlir::FloatType::getF80(ctx); + auto ty = mlir::Float80Type::get(ctx); auto boxTy = fir::runtime::getModel()(ctx); auto strTy = fir::ReferenceType::get(mlir::IntegerType::get(ctx, 8)); @@ -43,7 +43,7 @@ struct ForcedMaxvalReal16 { static constexpr const char *name = ExpandAndQuoteKey(RTNAME(MaxvalReal16)); static constexpr fir::runtime::FuncTypeBuilderFunc getTypeModel() { return [](mlir::MLIRContext *ctx) { - auto ty = mlir::FloatType::getF128(ctx); + auto ty = mlir::Float128Type::get(ctx); auto boxTy = fir::runtime::getModel()(ctx); auto strTy = fir::ReferenceType::get(mlir::IntegerType::get(ctx, 8)); @@ -94,7 +94,7 @@ struct ForcedMinvalReal10 { static constexpr const char *name = ExpandAndQuoteKey(RTNAME(MinvalReal10)); static constexpr fir::runtime::FuncTypeBuilderFunc getTypeModel() { return [](mlir::MLIRContext *ctx) { - auto ty = mlir::FloatType::getF80(ctx); + auto ty = mlir::Float80Type::get(ctx); auto boxTy = fir::runtime::getModel()(ctx); auto strTy = fir::ReferenceType::get(mlir::IntegerType::get(ctx, 8)); @@ -110,7 
+110,7 @@ struct ForcedMinvalReal16 { static constexpr const char *name = ExpandAndQuoteKey(RTNAME(MinvalReal16)); static constexpr fir::runtime::FuncTypeBuilderFunc getTypeModel() { return [](mlir::MLIRContext *ctx) { - auto ty = mlir::FloatType::getF128(ctx); + auto ty = mlir::Float128Type::get(ctx); auto boxTy = fir::runtime::getModel()(ctx); auto strTy = fir::ReferenceType::get(mlir::IntegerType::get(ctx, 8)); @@ -173,7 +173,7 @@ struct ForcedNorm2Real10 { static constexpr const char *name = ExpandAndQuoteKey(RTNAME(Norm2_10)); static constexpr fir::runtime::FuncTypeBuilderFunc getTypeModel() { return [](mlir::MLIRContext *ctx) { - auto ty = mlir::FloatType::getF80(ctx); + auto ty = mlir::Float80Type::get(ctx); auto boxTy = fir::runtime::getModel()(ctx); auto strTy = fir::ReferenceType::get(mlir::IntegerType::get(ctx, 8)); @@ -188,7 +188,7 @@ struct ForcedNorm2Real16 { static constexpr const char *name = ExpandAndQuoteKey(RTNAME(Norm2_16)); static constexpr fir::runtime::FuncTypeBuilderFunc getTypeModel() { return [](mlir::MLIRContext *ctx) { - auto ty = mlir::FloatType::getF128(ctx); + auto ty = mlir::Float128Type::get(ctx); auto boxTy = fir::runtime::getModel()(ctx); auto strTy = fir::ReferenceType::get(mlir::IntegerType::get(ctx, 8)); @@ -209,7 +209,7 @@ struct ForcedNorm2DimReal16 { auto intTy = mlir::IntegerType::get(ctx, 8 * sizeof(int)); return mlir::FunctionType::get( ctx, {fir::ReferenceType::get(boxTy), boxTy, intTy, strTy, intTy}, - mlir::NoneType::get(ctx)); + {}); }; } }; @@ -219,7 +219,7 @@ struct ForcedProductReal10 { static constexpr const char *name = ExpandAndQuoteKey(RTNAME(ProductReal10)); static constexpr fir::runtime::FuncTypeBuilderFunc getTypeModel() { return [](mlir::MLIRContext *ctx) { - auto ty = mlir::FloatType::getF80(ctx); + auto ty = mlir::Float80Type::get(ctx); auto boxTy = fir::runtime::getModel()(ctx); auto strTy = fir::ReferenceType::get(mlir::IntegerType::get(ctx, 8)); @@ -235,7 +235,7 @@ struct ForcedProductReal16 { static 
constexpr const char *name = ExpandAndQuoteKey(RTNAME(ProductReal16)); static constexpr fir::runtime::FuncTypeBuilderFunc getTypeModel() { return [](mlir::MLIRContext *ctx) { - auto ty = mlir::FloatType::getF128(ctx); + auto ty = mlir::Float128Type::get(ctx); auto boxTy = fir::runtime::getModel()(ctx); auto strTy = fir::ReferenceType::get(mlir::IntegerType::get(ctx, 8)); @@ -287,7 +287,7 @@ struct ForcedProductComplex10 { ExpandAndQuoteKey(RTNAME(CppProductComplex10)); static constexpr fir::runtime::FuncTypeBuilderFunc getTypeModel() { return [](mlir::MLIRContext *ctx) { - auto ty = mlir::ComplexType::get(mlir::FloatType::getF80(ctx)); + auto ty = mlir::ComplexType::get(mlir::Float80Type::get(ctx)); auto boxTy = fir::runtime::getModel()(ctx); auto strTy = fir::ReferenceType::get(mlir::IntegerType::get(ctx, 8)); @@ -305,7 +305,7 @@ struct ForcedProductComplex16 { ExpandAndQuoteKey(RTNAME(CppProductComplex16)); static constexpr fir::runtime::FuncTypeBuilderFunc getTypeModel() { return [](mlir::MLIRContext *ctx) { - auto ty = mlir::ComplexType::get(mlir::FloatType::getF128(ctx)); + auto ty = mlir::ComplexType::get(mlir::Float128Type::get(ctx)); auto boxTy = fir::runtime::getModel()(ctx); auto strTy = fir::ReferenceType::get(mlir::IntegerType::get(ctx, 8)); @@ -323,7 +323,7 @@ struct ForcedDotProductReal10 { ExpandAndQuoteKey(RTNAME(DotProductReal10)); static constexpr fir::runtime::FuncTypeBuilderFunc getTypeModel() { return [](mlir::MLIRContext *ctx) { - auto ty = mlir::FloatType::getF80(ctx); + auto ty = mlir::Float80Type::get(ctx); auto boxTy = fir::runtime::getModel()(ctx); auto strTy = fir::ReferenceType::get(mlir::IntegerType::get(ctx, 8)); @@ -339,7 +339,7 @@ struct ForcedDotProductReal16 { ExpandAndQuoteKey(RTNAME(DotProductReal16)); static constexpr fir::runtime::FuncTypeBuilderFunc getTypeModel() { return [](mlir::MLIRContext *ctx) { - auto ty = mlir::FloatType::getF128(ctx); + auto ty = mlir::Float128Type::get(ctx); auto boxTy = 
fir::runtime::getModel()(ctx); auto strTy = fir::ReferenceType::get(mlir::IntegerType::get(ctx, 8)); @@ -355,7 +355,7 @@ struct ForcedDotProductComplex10 { ExpandAndQuoteKey(RTNAME(CppDotProductComplex10)); static constexpr fir::runtime::FuncTypeBuilderFunc getTypeModel() { return [](mlir::MLIRContext *ctx) { - auto ty = mlir::ComplexType::get(mlir::FloatType::getF80(ctx)); + auto ty = mlir::ComplexType::get(mlir::Float80Type::get(ctx)); auto boxTy = fir::runtime::getModel()(ctx); auto strTy = fir::ReferenceType::get(mlir::IntegerType::get(ctx, 8)); @@ -373,7 +373,7 @@ struct ForcedDotProductComplex16 { ExpandAndQuoteKey(RTNAME(CppDotProductComplex16)); static constexpr fir::runtime::FuncTypeBuilderFunc getTypeModel() { return [](mlir::MLIRContext *ctx) { - auto ty = mlir::ComplexType::get(mlir::FloatType::getF128(ctx)); + auto ty = mlir::ComplexType::get(mlir::Float128Type::get(ctx)); auto boxTy = fir::runtime::getModel()(ctx); auto strTy = fir::ReferenceType::get(mlir::IntegerType::get(ctx, 8)); @@ -423,7 +423,7 @@ struct ForcedSumReal10 { static constexpr const char *name = ExpandAndQuoteKey(RTNAME(SumReal10)); static constexpr fir::runtime::FuncTypeBuilderFunc getTypeModel() { return [](mlir::MLIRContext *ctx) { - auto ty = mlir::FloatType::getF80(ctx); + auto ty = mlir::Float80Type::get(ctx); auto boxTy = fir::runtime::getModel()(ctx); auto strTy = fir::ReferenceType::get(mlir::IntegerType::get(ctx, 8)); @@ -439,7 +439,7 @@ struct ForcedSumReal16 { static constexpr const char *name = ExpandAndQuoteKey(RTNAME(SumReal16)); static constexpr fir::runtime::FuncTypeBuilderFunc getTypeModel() { return [](mlir::MLIRContext *ctx) { - auto ty = mlir::FloatType::getF128(ctx); + auto ty = mlir::Float128Type::get(ctx); auto boxTy = fir::runtime::getModel()(ctx); auto strTy = fir::ReferenceType::get(mlir::IntegerType::get(ctx, 8)); @@ -489,7 +489,7 @@ struct ForcedSumComplex10 { ExpandAndQuoteKey(RTNAME(CppSumComplex10)); static constexpr fir::runtime::FuncTypeBuilderFunc 
getTypeModel() { return [](mlir::MLIRContext *ctx) { - auto ty = mlir::ComplexType::get(mlir::FloatType::getF80(ctx)); + auto ty = mlir::ComplexType::get(mlir::Float80Type::get(ctx)); auto boxTy = fir::runtime::getModel()(ctx); auto strTy = fir::ReferenceType::get(mlir::IntegerType::get(ctx, 8)); @@ -507,7 +507,7 @@ struct ForcedSumComplex16 { ExpandAndQuoteKey(RTNAME(CppSumComplex16)); static constexpr fir::runtime::FuncTypeBuilderFunc getTypeModel() { return [](mlir::MLIRContext *ctx) { - auto ty = mlir::ComplexType::get(mlir::FloatType::getF128(ctx)); + auto ty = mlir::ComplexType::get(mlir::Float128Type::get(ctx)); auto boxTy = fir::runtime::getModel()(ctx); auto strTy = fir::ReferenceType::get(mlir::IntegerType::get(ctx, 8)); @@ -573,7 +573,7 @@ struct ForcedReduceReal10Ref { ExpandAndQuoteKey(RTNAME(ReduceReal10Ref)); static constexpr fir::runtime::FuncTypeBuilderFunc getTypeModel() { return [](mlir::MLIRContext *ctx) { - auto ty = mlir::FloatType::getF80(ctx); + auto ty = mlir::Float80Type::get(ctx); auto boxTy = fir::runtime::getModel()(ctx); auto refTy = fir::ReferenceType::get(ty); @@ -593,7 +593,7 @@ struct ForcedReduceReal10Value { ExpandAndQuoteKey(RTNAME(ReduceReal10Value)); static constexpr fir::runtime::FuncTypeBuilderFunc getTypeModel() { return [](mlir::MLIRContext *ctx) { - auto ty = mlir::FloatType::getF80(ctx); + auto ty = mlir::Float80Type::get(ctx); auto boxTy = fir::runtime::getModel()(ctx); auto refTy = fir::ReferenceType::get(ty); @@ -613,7 +613,7 @@ struct ForcedReduceReal16Ref { ExpandAndQuoteKey(RTNAME(ReduceReal16Ref)); static constexpr fir::runtime::FuncTypeBuilderFunc getTypeModel() { return [](mlir::MLIRContext *ctx) { - auto ty = mlir::FloatType::getF128(ctx); + auto ty = mlir::Float128Type::get(ctx); auto boxTy = fir::runtime::getModel()(ctx); auto refTy = fir::ReferenceType::get(ty); @@ -633,7 +633,7 @@ struct ForcedReduceReal16Value { ExpandAndQuoteKey(RTNAME(ReduceReal16Value)); static constexpr 
fir::runtime::FuncTypeBuilderFunc getTypeModel() { return [](mlir::MLIRContext *ctx) { - auto ty = mlir::FloatType::getF128(ctx); + auto ty = mlir::Float128Type::get(ctx); auto boxTy = fir::runtime::getModel()(ctx); auto refTy = fir::ReferenceType::get(ty); @@ -653,7 +653,7 @@ struct ForcedReduceReal10DimRef { ExpandAndQuoteKey(RTNAME(ReduceReal10DimRef)); static constexpr fir::runtime::FuncTypeBuilderFunc getTypeModel() { return [](mlir::MLIRContext *ctx) { - auto ty = mlir::FloatType::getF80(ctx); + auto ty = mlir::Float80Type::get(ctx); auto boxTy = fir::runtime::getModel()(ctx); auto refTy = fir::ReferenceType::get(ty); @@ -675,7 +675,7 @@ struct ForcedReduceReal10DimValue { ExpandAndQuoteKey(RTNAME(ReduceReal10DimValue)); static constexpr fir::runtime::FuncTypeBuilderFunc getTypeModel() { return [](mlir::MLIRContext *ctx) { - auto ty = mlir::FloatType::getF80(ctx); + auto ty = mlir::Float80Type::get(ctx); auto boxTy = fir::runtime::getModel()(ctx); auto refTy = fir::ReferenceType::get(ty); @@ -697,7 +697,7 @@ struct ForcedReduceReal16DimRef { ExpandAndQuoteKey(RTNAME(ReduceReal16DimRef)); static constexpr fir::runtime::FuncTypeBuilderFunc getTypeModel() { return [](mlir::MLIRContext *ctx) { - auto ty = mlir::FloatType::getF128(ctx); + auto ty = mlir::Float128Type::get(ctx); auto boxTy = fir::runtime::getModel()(ctx); auto refTy = fir::ReferenceType::get(ty); @@ -719,7 +719,7 @@ struct ForcedReduceReal16DimValue { ExpandAndQuoteKey(RTNAME(ReduceReal16DimValue)); static constexpr fir::runtime::FuncTypeBuilderFunc getTypeModel() { return [](mlir::MLIRContext *ctx) { - auto ty = mlir::FloatType::getF128(ctx); + auto ty = mlir::Float128Type::get(ctx); auto boxTy = fir::runtime::getModel()(ctx); auto refTy = fir::ReferenceType::get(ty); @@ -911,7 +911,7 @@ struct ForcedReduceComplex10Ref { ExpandAndQuoteKey(RTNAME(CppReduceComplex10Ref)); static constexpr fir::runtime::FuncTypeBuilderFunc getTypeModel() { return [](mlir::MLIRContext *ctx) { - auto ty = 
mlir::ComplexType::get(mlir::FloatType::getF80(ctx)); + auto ty = mlir::ComplexType::get(mlir::Float80Type::get(ctx)); auto boxTy = fir::runtime::getModel()(ctx); auto refTy = fir::ReferenceType::get(ty); @@ -932,7 +932,7 @@ struct ForcedReduceComplex10Value { ExpandAndQuoteKey(RTNAME(CppReduceComplex10Value)); static constexpr fir::runtime::FuncTypeBuilderFunc getTypeModel() { return [](mlir::MLIRContext *ctx) { - auto ty = mlir::ComplexType::get(mlir::FloatType::getF80(ctx)); + auto ty = mlir::ComplexType::get(mlir::Float80Type::get(ctx)); auto boxTy = fir::runtime::getModel()(ctx); auto refTy = fir::ReferenceType::get(ty); @@ -953,7 +953,7 @@ struct ForcedReduceComplex10DimRef { ExpandAndQuoteKey(RTNAME(CppReduceComplex10DimRef)); static constexpr fir::runtime::FuncTypeBuilderFunc getTypeModel() { return [](mlir::MLIRContext *ctx) { - auto ty = mlir::ComplexType::get(mlir::FloatType::getF80(ctx)); + auto ty = mlir::ComplexType::get(mlir::Float80Type::get(ctx)); auto boxTy = fir::runtime::getModel()(ctx); auto refTy = fir::ReferenceType::get(ty); @@ -975,7 +975,7 @@ struct ForcedReduceComplex10DimValue { ExpandAndQuoteKey(RTNAME(CppReduceComplex10DimValue)); static constexpr fir::runtime::FuncTypeBuilderFunc getTypeModel() { return [](mlir::MLIRContext *ctx) { - auto ty = mlir::ComplexType::get(mlir::FloatType::getF80(ctx)); + auto ty = mlir::ComplexType::get(mlir::Float80Type::get(ctx)); auto boxTy = fir::runtime::getModel()(ctx); auto refTy = fir::ReferenceType::get(ty); @@ -997,7 +997,7 @@ struct ForcedReduceComplex16Ref { ExpandAndQuoteKey(RTNAME(CppReduceComplex16Ref)); static constexpr fir::runtime::FuncTypeBuilderFunc getTypeModel() { return [](mlir::MLIRContext *ctx) { - auto ty = mlir::ComplexType::get(mlir::FloatType::getF128(ctx)); + auto ty = mlir::ComplexType::get(mlir::Float128Type::get(ctx)); auto boxTy = fir::runtime::getModel()(ctx); auto refTy = fir::ReferenceType::get(ty); @@ -1018,7 +1018,7 @@ struct ForcedReduceComplex16Value { 
ExpandAndQuoteKey(RTNAME(CppReduceComplex16Value)); static constexpr fir::runtime::FuncTypeBuilderFunc getTypeModel() { return [](mlir::MLIRContext *ctx) { - auto ty = mlir::ComplexType::get(mlir::FloatType::getF128(ctx)); + auto ty = mlir::ComplexType::get(mlir::Float128Type::get(ctx)); auto boxTy = fir::runtime::getModel()(ctx); auto refTy = fir::ReferenceType::get(ty); @@ -1039,7 +1039,7 @@ struct ForcedReduceComplex16DimRef { ExpandAndQuoteKey(RTNAME(CppReduceComplex16DimRef)); static constexpr fir::runtime::FuncTypeBuilderFunc getTypeModel() { return [](mlir::MLIRContext *ctx) { - auto ty = mlir::ComplexType::get(mlir::FloatType::getF128(ctx)); + auto ty = mlir::ComplexType::get(mlir::Float128Type::get(ctx)); auto boxTy = fir::runtime::getModel()(ctx); auto refTy = fir::ReferenceType::get(ty); @@ -1061,7 +1061,7 @@ struct ForcedReduceComplex16DimValue { ExpandAndQuoteKey(RTNAME(CppReduceComplex16DimValue)); static constexpr fir::runtime::FuncTypeBuilderFunc getTypeModel() { return [](mlir::MLIRContext *ctx) { - auto ty = mlir::ComplexType::get(mlir::FloatType::getF128(ctx)); + auto ty = mlir::ComplexType::get(mlir::Float128Type::get(ctx)); auto boxTy = fir::runtime::getModel()(ctx); auto refTy = fir::ReferenceType::get(ty); diff --git a/flang/lib/Optimizer/Builder/Runtime/Transformational.cpp b/flang/lib/Optimizer/Builder/Runtime/Transformational.cpp index 517ba3799798f..978524494af9b 100644 --- a/flang/lib/Optimizer/Builder/Runtime/Transformational.cpp +++ b/flang/lib/Optimizer/Builder/Runtime/Transformational.cpp @@ -25,14 +25,13 @@ struct ForcedBesselJn_10 { static constexpr const char *name = ExpandAndQuoteKey(RTNAME(BesselJn_10)); static constexpr fir::runtime::FuncTypeBuilderFunc getTypeModel() { return [](mlir::MLIRContext *ctx) { - auto ty = mlir::FloatType::getF80(ctx); + auto ty = mlir::Float80Type::get(ctx); auto boxTy = fir::runtime::getModel()(ctx); auto strTy = fir::ReferenceType::get(mlir::IntegerType::get(ctx, 8)); auto intTy = 
mlir::IntegerType::get(ctx, 32); - auto noneTy = mlir::NoneType::get(ctx); return mlir::FunctionType::get( - ctx, {boxTy, intTy, intTy, ty, ty, ty, strTy, intTy}, {noneTy}); + ctx, {boxTy, intTy, intTy, ty, ty, ty, strTy, intTy}, {}); }; } }; @@ -42,14 +41,13 @@ struct ForcedBesselJn_16 { static constexpr const char *name = ExpandAndQuoteKey(RTNAME(BesselJn_16)); static constexpr fir::runtime::FuncTypeBuilderFunc getTypeModel() { return [](mlir::MLIRContext *ctx) { - auto ty = mlir::FloatType::getF128(ctx); + auto ty = mlir::Float128Type::get(ctx); auto boxTy = fir::runtime::getModel()(ctx); auto strTy = fir::ReferenceType::get(mlir::IntegerType::get(ctx, 8)); auto intTy = mlir::IntegerType::get(ctx, 32); - auto noneTy = mlir::NoneType::get(ctx); return mlir::FunctionType::get( - ctx, {boxTy, intTy, intTy, ty, ty, ty, strTy, intTy}, {noneTy}); + ctx, {boxTy, intTy, intTy, ty, ty, ty, strTy, intTy}, {}); }; } }; @@ -63,9 +61,8 @@ struct ForcedBesselJnX0_10 { fir::runtime::getModel()(ctx); auto strTy = fir::ReferenceType::get(mlir::IntegerType::get(ctx, 8)); auto intTy = mlir::IntegerType::get(ctx, 32); - auto noneTy = mlir::NoneType::get(ctx); return mlir::FunctionType::get(ctx, {boxTy, intTy, intTy, strTy, intTy}, - {noneTy}); + {}); }; } }; @@ -79,9 +76,8 @@ struct ForcedBesselJnX0_16 { fir::runtime::getModel()(ctx); auto strTy = fir::ReferenceType::get(mlir::IntegerType::get(ctx, 8)); auto intTy = mlir::IntegerType::get(ctx, 32); - auto noneTy = mlir::NoneType::get(ctx); return mlir::FunctionType::get(ctx, {boxTy, intTy, intTy, strTy, intTy}, - {noneTy}); + {}); }; } }; @@ -91,14 +87,13 @@ struct ForcedBesselYn_10 { static constexpr const char *name = ExpandAndQuoteKey(RTNAME(BesselYn_10)); static constexpr fir::runtime::FuncTypeBuilderFunc getTypeModel() { return [](mlir::MLIRContext *ctx) { - auto ty = mlir::FloatType::getF80(ctx); + auto ty = mlir::Float80Type::get(ctx); auto boxTy = fir::runtime::getModel()(ctx); auto strTy = 
fir::ReferenceType::get(mlir::IntegerType::get(ctx, 8)); auto intTy = mlir::IntegerType::get(ctx, 32); - auto noneTy = mlir::NoneType::get(ctx); return mlir::FunctionType::get( - ctx, {boxTy, intTy, intTy, ty, ty, ty, strTy, intTy}, {noneTy}); + ctx, {boxTy, intTy, intTy, ty, ty, ty, strTy, intTy}, {}); }; } }; @@ -108,14 +103,13 @@ struct ForcedBesselYn_16 { static constexpr const char *name = ExpandAndQuoteKey(RTNAME(BesselYn_16)); static constexpr fir::runtime::FuncTypeBuilderFunc getTypeModel() { return [](mlir::MLIRContext *ctx) { - auto ty = mlir::FloatType::getF128(ctx); + auto ty = mlir::Float128Type::get(ctx); auto boxTy = fir::runtime::getModel()(ctx); auto strTy = fir::ReferenceType::get(mlir::IntegerType::get(ctx, 8)); auto intTy = mlir::IntegerType::get(ctx, 32); - auto noneTy = mlir::NoneType::get(ctx); return mlir::FunctionType::get( - ctx, {boxTy, intTy, intTy, ty, ty, ty, strTy, intTy}, {noneTy}); + ctx, {boxTy, intTy, intTy, ty, ty, ty, strTy, intTy}, {}); }; } }; @@ -129,9 +123,8 @@ struct ForcedBesselYnX0_10 { fir::runtime::getModel()(ctx); auto strTy = fir::ReferenceType::get(mlir::IntegerType::get(ctx, 8)); auto intTy = mlir::IntegerType::get(ctx, 32); - auto noneTy = mlir::NoneType::get(ctx); return mlir::FunctionType::get(ctx, {boxTy, intTy, intTy, strTy, intTy}, - {noneTy}); + {}); }; } }; @@ -145,9 +138,8 @@ struct ForcedBesselYnX0_16 { fir::runtime::getModel()(ctx); auto strTy = fir::ReferenceType::get(mlir::IntegerType::get(ctx, 8)); auto intTy = mlir::IntegerType::get(ctx, 32); - auto noneTy = mlir::NoneType::get(ctx); return mlir::FunctionType::get(ctx, {boxTy, intTy, intTy, strTy, intTy}, - {noneTy}); + {}); }; } }; @@ -339,9 +331,8 @@ struct ForcedMatmulTypeModel { fir::runtime::getModel()(ctx); auto strTy = fir::runtime::getModel()(ctx); auto intTy = fir::runtime::getModel()(ctx); - auto voidTy = fir::runtime::getModel()(ctx); return mlir::FunctionType::get( - ctx, {boxRefTy, boxTy, boxTy, strTy, intTy}, {voidTy}); + ctx, {boxRefTy, 
boxTy, boxTy, strTy, intTy}, {}); }; } }; diff --git a/flang/lib/Optimizer/CodeGen/CMakeLists.txt b/flang/lib/Optimizer/CodeGen/CMakeLists.txt index f47d11875f04d..81c8a68b95367 100644 --- a/flang/lib/Optimizer/CodeGen/CMakeLists.txt +++ b/flang/lib/Optimizer/CodeGen/CMakeLists.txt @@ -21,6 +21,14 @@ add_flang_library(FIRCodeGen FIRDialect FIRDialectSupport FIRSupport + + LINK_COMPONENTS + AsmParser + AsmPrinter + Remarks + TargetParser + + MLIR_LIBS MLIRComplexToLLVM MLIRComplexToStandard MLIRGPUDialect @@ -34,10 +42,4 @@ add_flang_library(FIRCodeGen MLIRLLVMToLLVMIRTranslation MLIRTargetLLVMIRExport MLIRVectorToLLVM - - LINK_COMPONENTS - AsmParser - AsmPrinter - Remarks - TargetParser ) diff --git a/flang/lib/Optimizer/CodeGen/Target.cpp b/flang/lib/Optimizer/CodeGen/Target.cpp index c332493eb8072..1bc673bb34e32 100644 --- a/flang/lib/Optimizer/CodeGen/Target.cpp +++ b/flang/lib/Optimizer/CodeGen/Target.cpp @@ -572,12 +572,12 @@ struct TargetX86_64 : public GenericTarget { // select an fp type of the right size, and it makes things simpler // here. 
if (partByteSize > 8) - return mlir::FloatType::getF128(context); + return mlir::Float128Type::get(context); if (partByteSize > 4) - return mlir::FloatType::getF64(context); + return mlir::Float64Type::get(context); if (partByteSize > 2) - return mlir::FloatType::getF32(context); - return mlir::FloatType::getF16(context); + return mlir::Float32Type::get(context); + return mlir::Float16Type::get(context); } assert(partByteSize <= 8 && "expect integer part of aggregate argument to fit into eight bytes"); diff --git a/flang/lib/Optimizer/Dialect/CMakeLists.txt b/flang/lib/Optimizer/Dialect/CMakeLists.txt index 08caa15700d4c..d39dca8ed0000 100644 --- a/flang/lib/Optimizer/Dialect/CMakeLists.txt +++ b/flang/lib/Optimizer/Dialect/CMakeLists.txt @@ -20,14 +20,16 @@ add_flang_library(FIRDialect LINK_LIBS CUFAttrs FIRDialectSupport - MLIRArithDialect - MLIRBuiltinToLLVMIRTranslation - MLIROpenMPToLLVM - MLIRLLVMToLLVMIRTranslation - MLIRTargetLLVMIRExport LINK_COMPONENTS AsmParser AsmPrinter Remarks + + MLIR_LIBS + MLIRArithDialect + MLIRBuiltinToLLVMIRTranslation + MLIROpenMPToLLVM + MLIRLLVMToLLVMIRTranslation + MLIRTargetLLVMIRExport ) diff --git a/flang/lib/Optimizer/Dialect/CUF/Attributes/CMakeLists.txt b/flang/lib/Optimizer/Dialect/CUF/Attributes/CMakeLists.txt index ec5484c1d6108..a0f58504eff05 100644 --- a/flang/lib/Optimizer/Dialect/CUF/Attributes/CMakeLists.txt +++ b/flang/lib/Optimizer/Dialect/CUF/Attributes/CMakeLists.txt @@ -7,11 +7,11 @@ add_flang_library(CUFAttrs CUFAttrsIncGen CUFOpsIncGen - LINK_LIBS - MLIRTargetLLVMIRExport - LINK_COMPONENTS AsmParser AsmPrinter Remarks + + MLIR_LIBS + MLIRTargetLLVMIRExport ) diff --git a/flang/lib/Optimizer/Dialect/CUF/CMakeLists.txt b/flang/lib/Optimizer/Dialect/CUF/CMakeLists.txt index 5d4bd0785971f..e483b4a164113 100644 --- a/flang/lib/Optimizer/Dialect/CUF/CMakeLists.txt +++ b/flang/lib/Optimizer/Dialect/CUF/CMakeLists.txt @@ -14,12 +14,14 @@ add_flang_library(CUFDialect CUFAttrs FIRDialect FIRDialectSupport - MLIRIR 
- MLIRGPUDialect - MLIRTargetLLVMIRExport LINK_COMPONENTS AsmParser AsmPrinter Remarks + + MLIR_LIBS + MLIRIR + MLIRGPUDialect + MLIRTargetLLVMIRExport ) diff --git a/flang/lib/Optimizer/Dialect/FIRType.cpp b/flang/lib/Optimizer/Dialect/FIRType.cpp index d25e5651f1142..d8ce231d1b5a7 100644 --- a/flang/lib/Optimizer/Dialect/FIRType.cpp +++ b/flang/lib/Optimizer/Dialect/FIRType.cpp @@ -1249,17 +1249,17 @@ mlir::Type fir::fromRealTypeID(mlir::MLIRContext *context, llvm::Type::TypeID typeID, fir::KindTy kind) { switch (typeID) { case llvm::Type::TypeID::HalfTyID: - return mlir::FloatType::getF16(context); + return mlir::Float16Type::get(context); case llvm::Type::TypeID::BFloatTyID: - return mlir::FloatType::getBF16(context); + return mlir::BFloat16Type::get(context); case llvm::Type::TypeID::FloatTyID: - return mlir::FloatType::getF32(context); + return mlir::Float32Type::get(context); case llvm::Type::TypeID::DoubleTyID: - return mlir::FloatType::getF64(context); + return mlir::Float64Type::get(context); case llvm::Type::TypeID::X86_FP80TyID: - return mlir::FloatType::getF80(context); + return mlir::Float80Type::get(context); case llvm::Type::TypeID::FP128TyID: - return mlir::FloatType::getF128(context); + return mlir::Float128Type::get(context); default: mlir::emitError(mlir::UnknownLoc::get(context)) << "unsupported type: !fir.real<" << kind << ">"; diff --git a/flang/lib/Optimizer/Dialect/Support/CMakeLists.txt b/flang/lib/Optimizer/Dialect/Support/CMakeLists.txt index c37b0549f7fc1..bfdd5279b6f29 100644 --- a/flang/lib/Optimizer/Dialect/Support/CMakeLists.txt +++ b/flang/lib/Optimizer/Dialect/Support/CMakeLists.txt @@ -8,6 +8,6 @@ add_flang_library(FIRDialectSupport MLIRIR intrinsics_gen - LINK_LIBS + MLIR_LIBS ${dialect_libs} ) diff --git a/flang/lib/Optimizer/HLFIR/IR/CMakeLists.txt b/flang/lib/Optimizer/HLFIR/IR/CMakeLists.txt index 267d6469ee7ab..8a646bedf94b8 100644 --- a/flang/lib/Optimizer/HLFIR/IR/CMakeLists.txt +++ 
b/flang/lib/Optimizer/HLFIR/IR/CMakeLists.txt @@ -13,11 +13,13 @@ add_flang_library(HLFIRDialect LINK_LIBS CUFAttrs FIRDialect - MLIRIR - ${dialect_libs} LINK_COMPONENTS AsmParser AsmPrinter Remarks + + MLIR_LIBS + MLIRIR + ${dialect_libs} ) diff --git a/flang/lib/Optimizer/HLFIR/IR/HLFIROps.cpp b/flang/lib/Optimizer/HLFIR/IR/HLFIROps.cpp index 82aac7cafa1d0..d93e25280237f 100644 --- a/flang/lib/Optimizer/HLFIR/IR/HLFIROps.cpp +++ b/flang/lib/Optimizer/HLFIR/IR/HLFIROps.cpp @@ -1197,7 +1197,8 @@ hlfir::MatmulOp::canonicalize(MatmulOp matmulOp, mlir::Location loc = matmulOp.getLoc(); mlir::Type resultTy = matmulOp.getResult().getType(); auto matmulTransposeOp = rewriter.create( - loc, resultTy, transposeOp.getArray(), matmulOp.getRhs()); + loc, resultTy, transposeOp.getArray(), matmulOp.getRhs(), + matmulOp.getFastmathAttr()); // we don't need to remove any hlfir.destroy because it will be needed for // the new intrinsic result anyway diff --git a/flang/lib/Optimizer/HLFIR/Transforms/CMakeLists.txt b/flang/lib/Optimizer/HLFIR/Transforms/CMakeLists.txt index 25a532204dd05..09286aced6089 100644 --- a/flang/lib/Optimizer/HLFIR/Transforms/CMakeLists.txt +++ b/flang/lib/Optimizer/HLFIR/Transforms/CMakeLists.txt @@ -27,11 +27,13 @@ add_flang_library(HLFIRTransforms FIRTransforms FlangOpenMPTransforms HLFIRDialect - MLIRIR - ${dialect_libs} LINK_COMPONENTS AsmParser AsmPrinter Remarks + + MLIR_LIBS + MLIRIR + ${dialect_libs} ) diff --git a/flang/lib/Optimizer/HLFIR/Transforms/SimplifyHLFIRIntrinsics.cpp b/flang/lib/Optimizer/HLFIR/Transforms/SimplifyHLFIRIntrinsics.cpp index 314ced8679521..fe7ae0eeed3cc 100644 --- a/flang/lib/Optimizer/HLFIR/Transforms/SimplifyHLFIRIntrinsics.cpp +++ b/flang/lib/Optimizer/HLFIR/Transforms/SimplifyHLFIRIntrinsics.cpp @@ -28,8 +28,88 @@ namespace hlfir { #include "flang/Optimizer/HLFIR/Passes.h.inc" } // namespace hlfir +#define DEBUG_TYPE "simplify-hlfir-intrinsics" + +static llvm::cl::opt forceMatmulAsElemental( + 
"flang-inline-matmul-as-elemental", + llvm::cl::desc("Expand hlfir.matmul as elemental operation"), + llvm::cl::init(false)); + namespace { +// Helper class to generate operations related to computing +// product of values. +class ProductFactory { +public: + ProductFactory(mlir::Location loc, fir::FirOpBuilder &builder) + : loc(loc), builder(builder) {} + + // Generate an update of the inner product value: + // acc += v1 * v2, OR + // acc += CONJ(v1) * v2, OR + // acc ||= v1 && v2 + // + // CONJ parameter specifies whether the first complex product argument + // needs to be conjugated. + template + mlir::Value genAccumulateProduct(mlir::Value acc, mlir::Value v1, + mlir::Value v2) { + mlir::Type resultType = acc.getType(); + acc = castToProductType(acc, resultType); + v1 = castToProductType(v1, resultType); + v2 = castToProductType(v2, resultType); + mlir::Value result; + if (mlir::isa(resultType)) { + result = builder.create( + loc, acc, builder.create(loc, v1, v2)); + } else if (mlir::isa(resultType)) { + if constexpr (CONJ) + result = fir::IntrinsicLibrary{builder, loc}.genConjg(resultType, v1); + else + result = v1; + + result = builder.create( + loc, acc, builder.create(loc, result, v2)); + } else if (mlir::isa(resultType)) { + result = builder.create( + loc, acc, builder.create(loc, v1, v2)); + } else if (mlir::isa(resultType)) { + result = builder.create( + loc, acc, builder.create(loc, v1, v2)); + } else { + llvm_unreachable("unsupported type"); + } + + return builder.createConvert(loc, resultType, result); + } + +private: + mlir::Location loc; + fir::FirOpBuilder &builder; + + mlir::Value castToProductType(mlir::Value value, mlir::Type type) { + if (mlir::isa(type)) + return builder.createConvert(loc, builder.getIntegerType(1), value); + + // TODO: the multiplications/additions by/of zero resulting from + // complex * real are optimized by LLVM under -fno-signed-zeros + // -fno-honor-nans. 
+ // We can make them disappear by default if we: + // * either expand the complex multiplication into real + // operations, OR + // * set nnan nsz fast-math flags to the complex operations. + if (fir::isa_complex(type) && !fir::isa_complex(value.getType())) { + mlir::Value zeroCmplx = fir::factory::createZeroValue(builder, loc, type); + fir::factory::Complex helper(builder, loc); + mlir::Type partType = helper.getComplexPartType(type); + return helper.insertComplexPart(zeroCmplx, + castToProductType(value, partType), + /*isImagPart=*/false); + } + return builder.createConvert(loc, type, value); + } +}; + class TransposeAsElementalConversion : public mlir::OpRewritePattern { public: @@ -83,11 +163,8 @@ class TransposeAsElementalConversion static mlir::Value genResultShape(mlir::Location loc, fir::FirOpBuilder &builder, hlfir::Entity array) { - mlir::Value inShape = hlfir::genShape(loc, builder, array); - llvm::SmallVector inExtents = - hlfir::getExplicitExtentsFromShape(inShape, builder); - if (inShape.getUses().empty()) - inShape.getDefiningOp()->erase(); + llvm::SmallVector inExtents = + hlfir::genExtentsVector(loc, builder, array); // transpose indices assert(inExtents.size() == 2 && "checked in TransposeOp::validate"); @@ -130,7 +207,7 @@ class SumAsElementalConversion : public mlir::OpRewritePattern { mlir::Value resultShape, dimExtent; llvm::SmallVector arrayExtents; if (isTotalReduction) - arrayExtents = genArrayExtents(loc, builder, array); + arrayExtents = hlfir::genExtentsVector(loc, builder, array); else std::tie(resultShape, dimExtent) = genResultShapeForPartialReduction(loc, builder, array, dimVal); @@ -156,7 +233,8 @@ class SumAsElementalConversion : public mlir::OpRewritePattern { // If DIM is not present, do total reduction. // Initial value for the reduction. 
- mlir::Value reductionInitValue = genInitValue(loc, builder, elementType); + mlir::Value reductionInitValue = + fir::factory::createZeroValue(builder, loc, elementType); // The reduction loop may be unordered if FastMathFlags::reassoc // transformations are allowed. The integer reduction is always @@ -257,17 +335,6 @@ class SumAsElementalConversion : public mlir::OpRewritePattern { } private: - static llvm::SmallVector - genArrayExtents(mlir::Location loc, fir::FirOpBuilder &builder, - hlfir::Entity array) { - mlir::Value inShape = hlfir::genShape(loc, builder, array); - llvm::SmallVector inExtents = - hlfir::getExplicitExtentsFromShape(inShape, builder); - if (inShape.getUses().empty()) - inShape.getDefiningOp()->erase(); - return inExtents; - } - // Return fir.shape specifying the shape of the result // of a SUM reduction with DIM=dimVal. The second return value // is the extent of the DIM dimension. @@ -276,7 +343,7 @@ class SumAsElementalConversion : public mlir::OpRewritePattern { fir::FirOpBuilder &builder, hlfir::Entity array, int64_t dimVal) { llvm::SmallVector inExtents = - genArrayExtents(loc, builder, array); + hlfir::genExtentsVector(loc, builder, array); assert(dimVal > 0 && dimVal <= static_cast(inExtents.size()) && "DIM must be present and a positive constant not exceeding " "the array's rank"); @@ -286,26 +353,6 @@ class SumAsElementalConversion : public mlir::OpRewritePattern { return {builder.create(loc, inExtents), dimExtent}; } - // Generate the initial value for a SUM reduction with the given - // data type. 
- static mlir::Value genInitValue(mlir::Location loc, - fir::FirOpBuilder &builder, - mlir::Type elementType) { - if (auto ty = mlir::dyn_cast(elementType)) { - const llvm::fltSemantics &sem = ty.getFloatSemantics(); - return builder.createRealConstant(loc, elementType, - llvm::APFloat::getZero(sem)); - } else if (auto ty = mlir::dyn_cast(elementType)) { - mlir::Value initValue = genInitValue(loc, builder, ty.getElementType()); - return fir::factory::Complex{builder, loc}.createComplex(ty, initValue, - initValue); - } else if (mlir::isa(elementType)) { - return builder.createIntegerConstant(loc, elementType, 0); - } - - llvm_unreachable("unsupported SUM reduction type"); - } - // Generate scalar addition of the two values (of the same data type). static mlir::Value genScalarAdd(mlir::Location loc, fir::FirOpBuilder &builder, @@ -467,9 +514,449 @@ class CShiftAsElementalConversion } }; +template +class MatmulConversion : public mlir::OpRewritePattern { +public: + using mlir::OpRewritePattern::OpRewritePattern; + + llvm::LogicalResult + matchAndRewrite(Op matmul, mlir::PatternRewriter &rewriter) const override { + mlir::Location loc = matmul.getLoc(); + fir::FirOpBuilder builder{rewriter, matmul.getOperation()}; + hlfir::Entity lhs = hlfir::Entity{matmul.getLhs()}; + hlfir::Entity rhs = hlfir::Entity{matmul.getRhs()}; + mlir::Value resultShape, innerProductExtent; + std::tie(resultShape, innerProductExtent) = + genResultShape(loc, builder, lhs, rhs); + + if (forceMatmulAsElemental || isMatmulTranspose) { + // Generate hlfir.elemental that produces the result of + // MATMUL/MATMUL(TRANSPOSE). + // Note that this implementation is very suboptimal for MATMUL, + // but is quite good for MATMUL(TRANSPOSE), e.g.: + // R(1:N) = R(1:N) + MATMUL(TRANSPOSE(X(1:N,1:N)), Y(1:N)) + // Inlining MATMUL(TRANSPOSE) as hlfir.elemental may result + // in merging the inner product computation with the elemental + // addition. 
Note that the inner product computation will + // benefit from processing the lowermost dimensions of X and Y, + // which may be the best when they are contiguous. + // + // This is why we always inline MATMUL(TRANSPOSE) as an elemental. + // MATMUL is inlined below by default unless forceMatmulAsElemental. + hlfir::ExprType resultType = + mlir::cast(matmul.getType()); + hlfir::ElementalOp newOp = genElementalMatmul( + loc, builder, resultType, resultShape, lhs, rhs, innerProductExtent); + rewriter.replaceOp(matmul, newOp); + return mlir::success(); + } + + // Generate hlfir.eval_in_mem to mimic the MATMUL implementation + // from Fortran runtime. The implementation needs to operate + // with the result array as an in-memory object. + hlfir::EvaluateInMemoryOp evalOp = + builder.create( + loc, mlir::cast(matmul.getType()), resultShape); + builder.setInsertionPointToStart(&evalOp.getBody().front()); + + // Embox the raw array pointer to simplify designating it. + // TODO: this currently results in redundant lower bounds + // addition for the designator, but this should be fixed in + // hlfir::Entity::mayHaveNonDefaultLowerBounds(). + mlir::Value resultArray = evalOp.getMemory(); + mlir::Type arrayType = fir::dyn_cast_ptrEleTy(resultArray.getType()); + resultArray = builder.createBox(loc, fir::BoxType::get(arrayType), + resultArray, resultShape, /*slice=*/nullptr, + /*lengths=*/{}, /*tdesc=*/nullptr); + + // The contiguous MATMUL version is best for the cases + // where the input arrays and (maybe) the result are contiguous + // in their lowermost dimensions. + // Especially, when LLVM can recognize the continuity + // and vectorize the loops properly. + // Note that the contiguous MATMUL inlining is correct + // even when the input arrays are not contiguous. + // TODO: we can try to recognize the cases when the continuity + // is not statically obvious and try to generate an explicitly + // continuous version under a dynamic check. 
This should allow + // LLVM to vectorize the loops better. Note that this can + // also be postponed up to the LoopVersioning pass. + // The fallback implementation may use genElementalMatmul() with + // an hlfir.assign into the result of eval_in_mem. + mlir::LogicalResult rewriteResult = + genContiguousMatmul(loc, builder, hlfir::Entity{resultArray}, + resultShape, lhs, rhs, innerProductExtent); + + if (mlir::failed(rewriteResult)) { + // Erase the unclaimed eval_in_mem op. + rewriter.eraseOp(evalOp); + return rewriter.notifyMatchFailure(matmul, + "genContiguousMatmul() failed"); + } + + rewriter.replaceOp(matmul, evalOp); + return mlir::success(); + } + +private: + static constexpr bool isMatmulTranspose = + std::is_same_v; + + // Return a tuple of: + // * A fir.shape operation representing the shape of the result + // of a MATMUL/MATMUL(TRANSPOSE). + // * An extent of the dimensions of the input array + // that are processed during the inner product computation. + static std::tuple + genResultShape(mlir::Location loc, fir::FirOpBuilder &builder, + hlfir::Entity input1, hlfir::Entity input2) { + llvm::SmallVector input1Extents = + hlfir::genExtentsVector(loc, builder, input1); + llvm::SmallVector input2Extents = + hlfir::genExtentsVector(loc, builder, input2); + + llvm::SmallVector newExtents; + mlir::Value innerProduct1Extent, innerProduct2Extent; + if (input1Extents.size() == 1) { + assert(!isMatmulTranspose && + "hlfir.matmul_transpose's first operand must be rank-2 array"); + assert(input2Extents.size() == 2 && + "hlfir.matmul second argument must be rank-2 array"); + newExtents.push_back(input2Extents[1]); + innerProduct1Extent = input1Extents[0]; + innerProduct2Extent = input2Extents[0]; + } else { + if (input2Extents.size() == 1) { + assert(input1Extents.size() == 2 && + "hlfir.matmul first argument must be rank-2 array"); + if constexpr (isMatmulTranspose) + newExtents.push_back(input1Extents[1]); + else + newExtents.push_back(input1Extents[0]); + } else 
{ + assert(input1Extents.size() == 2 && input2Extents.size() == 2 && + "hlfir.matmul arguments must be rank-2 arrays"); + if constexpr (isMatmulTranspose) + newExtents.push_back(input1Extents[1]); + else + newExtents.push_back(input1Extents[0]); + + newExtents.push_back(input2Extents[1]); + } + if constexpr (isMatmulTranspose) + innerProduct1Extent = input1Extents[0]; + else + innerProduct1Extent = input1Extents[1]; + + innerProduct2Extent = input2Extents[0]; + } + // The inner product dimensions of the input arrays + // must match. Pick the best (e.g. constant) out of them + // so that the inner product loop bound can be used in + // optimizations. + llvm::SmallVector innerProductExtent = + fir::factory::deduceOptimalExtents({innerProduct1Extent}, + {innerProduct2Extent}); + return {builder.create(loc, newExtents), + innerProductExtent[0]}; + } + + static mlir::LogicalResult + genContiguousMatmul(mlir::Location loc, fir::FirOpBuilder &builder, + hlfir::Entity result, mlir::Value resultShape, + hlfir::Entity lhs, hlfir::Entity rhs, + mlir::Value innerProductExtent) { + // This code does not support MATMUL(TRANSPOSE), and it is supposed + // to be inlined as hlfir.elemental. + if constexpr (isMatmulTranspose) + return mlir::failure(); + + mlir::OpBuilder::InsertionGuard guard(builder); + mlir::Type resultElementType = result.getFortranElementType(); + llvm::SmallVector resultExtents = + mlir::cast(resultShape.getDefiningOp()).getExtents(); + + // The inner product loop may be unordered if FastMathFlags::reassoc + // transformations are allowed. The integer/logical inner product is + // always unordered. + // Note that isUnordered is currently applied to all loops + // in the loop nests generated below, while it has to be applied + // only to one. 
+ bool isUnordered = mlir::isa(resultElementType) || + mlir::isa(resultElementType) || + static_cast(builder.getFastMathFlags() & + mlir::arith::FastMathFlags::reassoc); + + // Insert the initialization loop nest that fills the whole result with + // zeroes. + mlir::Value initValue = + fir::factory::createZeroValue(builder, loc, resultElementType); + auto genInitBody = [&](mlir::Location loc, fir::FirOpBuilder &builder, + mlir::ValueRange oneBasedIndices, + mlir::ValueRange reductionArgs) + -> llvm::SmallVector { + hlfir::Entity resultElement = + hlfir::getElementAt(loc, builder, result, oneBasedIndices); + builder.create(loc, initValue, resultElement); + return {}; + }; + + hlfir::genLoopNestWithReductions(loc, builder, resultExtents, + /*reductionInits=*/{}, genInitBody, + /*isUnordered=*/true); + + if (lhs.getRank() == 2 && rhs.getRank() == 2) { + // LHS(NROWS,N) * RHS(N,NCOLS) -> RESULT(NROWS,NCOLS) + // + // Insert the computation loop nest: + // DO 2 K = 1, N + // DO 2 J = 1, NCOLS + // DO 2 I = 1, NROWS + // 2 RESULT(I,J) = RESULT(I,J) + LHS(I,K)*RHS(K,J) + auto genMatrixMatrix = [&](mlir::Location loc, fir::FirOpBuilder &builder, + mlir::ValueRange oneBasedIndices, + mlir::ValueRange reductionArgs) + -> llvm::SmallVector { + mlir::Value I = oneBasedIndices[0]; + mlir::Value J = oneBasedIndices[1]; + mlir::Value K = oneBasedIndices[2]; + hlfir::Entity resultElement = + hlfir::getElementAt(loc, builder, result, {I, J}); + hlfir::Entity resultElementValue = + hlfir::loadTrivialScalar(loc, builder, resultElement); + hlfir::Entity lhsElementValue = + hlfir::loadElementAt(loc, builder, lhs, {I, K}); + hlfir::Entity rhsElementValue = + hlfir::loadElementAt(loc, builder, rhs, {K, J}); + mlir::Value productValue = + ProductFactory{loc, builder}.genAccumulateProduct( + resultElementValue, lhsElementValue, rhsElementValue); + builder.create(loc, productValue, resultElement); + return {}; + }; + + // Note that the loops are inserted in reverse order, + // so 
innerProductExtent should be passed as the last extent. + hlfir::genLoopNestWithReductions( + loc, builder, + {resultExtents[0], resultExtents[1], innerProductExtent}, + /*reductionInits=*/{}, genMatrixMatrix, isUnordered); + return mlir::success(); + } + + if (lhs.getRank() == 2 && rhs.getRank() == 1) { + // LHS(NROWS,N) * RHS(N) -> RESULT(NROWS) + // + // Insert the computation loop nest: + // DO 2 K = 1, N + // DO 2 J = 1, NROWS + // 2 RES(J) = RES(J) + LHS(J,K)*RHS(K) + auto genMatrixVector = [&](mlir::Location loc, fir::FirOpBuilder &builder, + mlir::ValueRange oneBasedIndices, + mlir::ValueRange reductionArgs) + -> llvm::SmallVector { + mlir::Value J = oneBasedIndices[0]; + mlir::Value K = oneBasedIndices[1]; + hlfir::Entity resultElement = + hlfir::getElementAt(loc, builder, result, {J}); + hlfir::Entity resultElementValue = + hlfir::loadTrivialScalar(loc, builder, resultElement); + hlfir::Entity lhsElementValue = + hlfir::loadElementAt(loc, builder, lhs, {J, K}); + hlfir::Entity rhsElementValue = + hlfir::loadElementAt(loc, builder, rhs, {K}); + mlir::Value productValue = + ProductFactory{loc, builder}.genAccumulateProduct( + resultElementValue, lhsElementValue, rhsElementValue); + builder.create(loc, productValue, resultElement); + return {}; + }; + hlfir::genLoopNestWithReductions( + loc, builder, {resultExtents[0], innerProductExtent}, + /*reductionInits=*/{}, genMatrixVector, isUnordered); + return mlir::success(); + } + if (lhs.getRank() == 1 && rhs.getRank() == 2) { + // LHS(N) * RHS(N,NCOLS) -> RESULT(NCOLS) + // + // Insert the computation loop nest: + // DO 2 K = 1, N + // DO 2 J = 1, NCOLS + // 2 RES(J) = RES(J) + LHS(K)*RHS(K,J) + auto genVectorMatrix = [&](mlir::Location loc, fir::FirOpBuilder &builder, + mlir::ValueRange oneBasedIndices, + mlir::ValueRange reductionArgs) + -> llvm::SmallVector { + mlir::Value J = oneBasedIndices[0]; + mlir::Value K = oneBasedIndices[1]; + hlfir::Entity resultElement = + hlfir::getElementAt(loc, builder, result, 
{J}); + hlfir::Entity resultElementValue = + hlfir::loadTrivialScalar(loc, builder, resultElement); + hlfir::Entity lhsElementValue = + hlfir::loadElementAt(loc, builder, lhs, {K}); + hlfir::Entity rhsElementValue = + hlfir::loadElementAt(loc, builder, rhs, {K, J}); + mlir::Value productValue = + ProductFactory{loc, builder}.genAccumulateProduct( + resultElementValue, lhsElementValue, rhsElementValue); + builder.create(loc, productValue, resultElement); + return {}; + }; + hlfir::genLoopNestWithReductions( + loc, builder, {resultExtents[0], innerProductExtent}, + /*reductionInits=*/{}, genVectorMatrix, isUnordered); + return mlir::success(); + } + + llvm_unreachable("unsupported MATMUL arguments' ranks"); + } + + static hlfir::ElementalOp + genElementalMatmul(mlir::Location loc, fir::FirOpBuilder &builder, + hlfir::ExprType resultType, mlir::Value resultShape, + hlfir::Entity lhs, hlfir::Entity rhs, + mlir::Value innerProductExtent) { + mlir::OpBuilder::InsertionGuard guard(builder); + mlir::Type resultElementType = resultType.getElementType(); + auto genKernel = [&](mlir::Location loc, fir::FirOpBuilder &builder, + mlir::ValueRange resultIndices) -> hlfir::Entity { + mlir::Value initValue = + fir::factory::createZeroValue(builder, loc, resultElementType); + // The inner product loop may be unordered if FastMathFlags::reassoc + // transformations are allowed. The integer/logical inner product is + // always unordered. 
+ bool isUnordered = mlir::isa(resultElementType) || + mlir::isa(resultElementType) || + static_cast(builder.getFastMathFlags() & + mlir::arith::FastMathFlags::reassoc); + + auto genBody = [&](mlir::Location loc, fir::FirOpBuilder &builder, + mlir::ValueRange oneBasedIndices, + mlir::ValueRange reductionArgs) + -> llvm::SmallVector { + llvm::SmallVector lhsIndices; + llvm::SmallVector rhsIndices; + // MATMUL: + // LHS(NROWS,N) * RHS(N,NCOLS) -> RESULT(NROWS,NCOLS) + // LHS(NROWS,N) * RHS(N) -> RESULT(NROWS) + // LHS(N) * RHS(N,NCOLS) -> RESULT(NCOLS) + // + // MATMUL(TRANSPOSE): + // TRANSPOSE(LHS(N,NROWS)) * RHS(N,NCOLS) -> RESULT(NROWS,NCOLS) + // TRANSPOSE(LHS(N,NROWS)) * RHS(N) -> RESULT(NROWS) + // + // The resultIndices iterate over (NROWS[,NCOLS]). + // The oneBasedIndices iterate over (N). + if (lhs.getRank() > 1) + lhsIndices.push_back(resultIndices[0]); + lhsIndices.push_back(oneBasedIndices[0]); + + if constexpr (isMatmulTranspose) { + // Swap the LHS indices for TRANSPOSE. + std::swap(lhsIndices[0], lhsIndices[1]); + } + + rhsIndices.push_back(oneBasedIndices[0]); + if (rhs.getRank() > 1) + rhsIndices.push_back(resultIndices.back()); + + hlfir::Entity lhsElementValue = + hlfir::loadElementAt(loc, builder, lhs, lhsIndices); + hlfir::Entity rhsElementValue = + hlfir::loadElementAt(loc, builder, rhs, rhsIndices); + mlir::Value productValue = + ProductFactory{loc, builder}.genAccumulateProduct( + reductionArgs[0], lhsElementValue, rhsElementValue); + return {productValue}; + }; + llvm::SmallVector innerProductValue = + hlfir::genLoopNestWithReductions(loc, builder, {innerProductExtent}, + {initValue}, genBody, isUnordered); + return hlfir::Entity{innerProductValue[0]}; + }; + hlfir::ElementalOp elementalOp = hlfir::genElementalOp( + loc, builder, resultElementType, resultShape, /*typeParams=*/{}, + genKernel, + /*isUnordered=*/true, /*polymorphicMold=*/nullptr, resultType); + + return elementalOp; + } +}; + +class DotProductConversion + : public 
mlir::OpRewritePattern { +public: + using mlir::OpRewritePattern::OpRewritePattern; + + llvm::LogicalResult + matchAndRewrite(hlfir::DotProductOp product, + mlir::PatternRewriter &rewriter) const override { + hlfir::Entity op = hlfir::Entity{product}; + if (!op.isScalar()) + return rewriter.notifyMatchFailure(product, "produces non-scalar result"); + + mlir::Location loc = product.getLoc(); + fir::FirOpBuilder builder{rewriter, product.getOperation()}; + hlfir::Entity lhs = hlfir::Entity{product.getLhs()}; + hlfir::Entity rhs = hlfir::Entity{product.getRhs()}; + mlir::Type resultElementType = product.getType(); + bool isUnordered = mlir::isa(resultElementType) || + mlir::isa(resultElementType) || + static_cast(builder.getFastMathFlags() & + mlir::arith::FastMathFlags::reassoc); + + mlir::Value extent = genProductExtent(loc, builder, lhs, rhs); + + auto genBody = [&](mlir::Location loc, fir::FirOpBuilder &builder, + mlir::ValueRange oneBasedIndices, + mlir::ValueRange reductionArgs) + -> llvm::SmallVector { + hlfir::Entity lhsElementValue = + hlfir::loadElementAt(loc, builder, lhs, oneBasedIndices); + hlfir::Entity rhsElementValue = + hlfir::loadElementAt(loc, builder, rhs, oneBasedIndices); + mlir::Value productValue = + ProductFactory{loc, builder}.genAccumulateProduct( + reductionArgs[0], lhsElementValue, rhsElementValue); + return {productValue}; + }; + + mlir::Value initValue = + fir::factory::createZeroValue(builder, loc, resultElementType); + + llvm::SmallVector result = hlfir::genLoopNestWithReductions( + loc, builder, {extent}, + /*reductionInits=*/{initValue}, genBody, isUnordered); + + rewriter.replaceOp(product, result[0]); + return mlir::success(); + } + +private: + static mlir::Value genProductExtent(mlir::Location loc, + fir::FirOpBuilder &builder, + hlfir::Entity input1, + hlfir::Entity input2) { + llvm::SmallVector input1Extents = + hlfir::genExtentsVector(loc, builder, input1); + llvm::SmallVector input2Extents = + hlfir::genExtentsVector(loc, 
builder, input2); + + assert(input1Extents.size() == 1 && input2Extents.size() == 1 && + "hlfir.dot_product arguments must be vectors"); + llvm::SmallVector extent = + fir::factory::deduceOptimalExtents(input1Extents, input2Extents); + return extent[0]; + } +}; + class SimplifyHLFIRIntrinsics : public hlfir::impl::SimplifyHLFIRIntrinsicsBase { public: + using SimplifyHLFIRIntrinsicsBase< + SimplifyHLFIRIntrinsics>::SimplifyHLFIRIntrinsicsBase; + void runOnOperation() override { mlir::MLIRContext *context = &getContext(); @@ -482,6 +969,24 @@ class SimplifyHLFIRIntrinsics patterns.insert(context); patterns.insert(context); patterns.insert(context); + patterns.insert>(context); + + // If forceMatmulAsElemental is false, then hlfir.matmul inlining + // will introduce hlfir.eval_in_mem operation with new memory side + // effects. This conflicts with CSE and optimized bufferization, e.g.: + // A(1:N,1:N) = A(1:N,1:N) - MATMUL(...) + // If we introduce hlfir.eval_in_mem before CSE, then the current + // MLIR CSE won't be able to optimize the trivial loads of 'N' value + // that happen before and after hlfir.matmul. + // If 'N' loads are not optimized, then the optimized bufferization + // won't be able to prove that the slices of A are identical + // on both sides of the assignment. + // This is actually the CSE problem, but we can work it around + // for the time being. 
+ if (forceMatmulAsElemental || this->allowNewSideEffects) + patterns.insert>(context); + + patterns.insert(context); if (mlir::failed(mlir::applyPatternsGreedily( getOperation(), std::move(patterns), config))) { diff --git a/flang/lib/Optimizer/OpenACC/CMakeLists.txt b/flang/lib/Optimizer/OpenACC/CMakeLists.txt index ed673121353c1..04d351ac265d6 100644 --- a/flang/lib/Optimizer/OpenACC/CMakeLists.txt +++ b/flang/lib/Optimizer/OpenACC/CMakeLists.txt @@ -18,5 +18,7 @@ add_flang_library(FIROpenACCSupport FIRDialectSupport FIRSupport HLFIRDialect + + MLIR_LIBS MLIROpenACCDialect ) diff --git a/flang/lib/Optimizer/OpenMP/CMakeLists.txt b/flang/lib/Optimizer/OpenMP/CMakeLists.txt index 026889cca238a..9fe2d3947c26d 100644 --- a/flang/lib/Optimizer/OpenMP/CMakeLists.txt +++ b/flang/lib/Optimizer/OpenMP/CMakeLists.txt @@ -23,9 +23,11 @@ add_flang_library(FlangOpenMPTransforms FIRSupport FortranCommon FortranEvaluate + HLFIRDialect + + MLIR_LIBS MLIRFuncDialect MLIROpenMPDialect - HLFIRDialect MLIRIR MLIRPass MLIRTransformUtils diff --git a/flang/lib/Optimizer/Passes/CMakeLists.txt b/flang/lib/Optimizer/Passes/CMakeLists.txt index 40abbdfbdd651..eb25beba309bf 100644 --- a/flang/lib/Optimizer/Passes/CMakeLists.txt +++ b/flang/lib/Optimizer/Passes/CMakeLists.txt @@ -12,16 +12,18 @@ add_flang_library(flangPasses FIRCodeGen FIRTransforms FlangOpenMPTransforms - ${dialect_libs} - ${extension_libs} FortranCommon HLFIRTransforms + + LINK_COMPONENTS + Passes + + MLIR_LIBS + ${dialect_libs} + ${extension_libs} MLIRPass MLIRReconcileUnrealizedCasts MLIRSCFToControlFlow MLIRSupport MLIRTransforms - - LINK_COMPONENTS - Passes ) diff --git a/flang/lib/Optimizer/Passes/Pipelines.cpp b/flang/lib/Optimizer/Passes/Pipelines.cpp index e1d7376ec3805..1cc3f0b81c20a 100644 --- a/flang/lib/Optimizer/Passes/Pipelines.cpp +++ b/flang/lib/Optimizer/Passes/Pipelines.cpp @@ -232,6 +232,12 @@ void createHLFIRToFIRPassPipeline(mlir::PassManager &pm, bool enableOpenMP, if 
(optLevel.isOptimizingForSpeed()) { addCanonicalizerPassWithoutRegionSimplification(pm); pm.addPass(mlir::createCSEPass()); + // Run SimplifyHLFIRIntrinsics pass late after CSE, + // and allow introducing operations with new side effects. + addNestedPassToAllTopLevelOperations(pm, []() { + return hlfir::createSimplifyHLFIRIntrinsics( + {/*allowNewSideEffects=*/true}); + }); addNestedPassToAllTopLevelOperations( pm, hlfir::createOptimizedBufferization); addNestedPassToAllTopLevelOperations( diff --git a/flang/lib/Optimizer/Support/CMakeLists.txt b/flang/lib/Optimizer/Support/CMakeLists.txt index 8794c24712417..f8e4fc5bcefea 100644 --- a/flang/lib/Optimizer/Support/CMakeLists.txt +++ b/flang/lib/Optimizer/Support/CMakeLists.txt @@ -16,6 +16,11 @@ add_flang_library(FIRSupport LINK_LIBS FIRDialect + + LINK_COMPONENTS + TargetParser + + MLIR_LIBS ${dialect_libs} ${extension_libs} MLIRBuiltinToLLVMIRTranslation @@ -24,7 +29,4 @@ add_flang_library(FIRSupport MLIRLLVMToLLVMIRTranslation MLIRTargetLLVMIRExport MLIRTargetLLVMIRImport - - LINK_COMPONENTS - TargetParser ) diff --git a/flang/lib/Optimizer/Transforms/CMakeLists.txt b/flang/lib/Optimizer/Transforms/CMakeLists.txt index d20d3bc4108ce..9c550f983434a 100644 --- a/flang/lib/Optimizer/Transforms/CMakeLists.txt +++ b/flang/lib/Optimizer/Transforms/CMakeLists.txt @@ -48,6 +48,8 @@ add_flang_library(FIRTransforms FIRSupport FortranCommon HLFIRDialect + + MLIR_LIBS MLIRAffineUtils MLIRFuncDialect MLIRGPUDialect diff --git a/flang/lib/Optimizer/Transforms/CUFOpConversion.cpp b/flang/lib/Optimizer/Transforms/CUFOpConversion.cpp index e93bed37d39f7..8b8c00fa7ecfc 100644 --- a/flang/lib/Optimizer/Transforms/CUFOpConversion.cpp +++ b/flang/lib/Optimizer/Transforms/CUFOpConversion.cpp @@ -224,6 +224,8 @@ static bool inDeviceContext(mlir::Operation *op) { return true; if (auto funcOp = op->getParentOfType()) return true; + if (auto funcOp = op->getParentOfType()) + return true; if (auto funcOp = op->getParentOfType()) { if (auto 
cudaProcAttr = funcOp.getOperation()->getAttrOfType( diff --git a/flang/lib/Support/CMakeLists.txt b/flang/lib/Support/CMakeLists.txt index 9c7887aecafbd..12183f590316d 100644 --- a/flang/lib/Support/CMakeLists.txt +++ b/flang/lib/Support/CMakeLists.txt @@ -1,9 +1,9 @@ add_flang_library(FortranSupport Timing.cpp - LINK_LIBS - MLIRSupport - LINK_COMPONENTS Support + + MLIR_LIBS + MLIRSupport ) diff --git a/flang/module/__cuda_device.f90 b/flang/module/__cuda_device.f90 index 81b1f5aa334bb..73f3d19c98a31 100644 --- a/flang/module/__cuda_device.f90 +++ b/flang/module/__cuda_device.f90 @@ -14,19 +14,4 @@ ! Set PRIVATE by default to explicitly only export what is meant ! to be exported by this MODULE. - interface - attributes(device) function __fadd_rd(x, y) bind(c, name='__nv_fadd_rd') - real, intent(in), value :: x, y - real :: __fadd_rd - end function - end interface - public :: __fadd_rd - - interface - attributes(device) function __fadd_ru(x, y) bind(c, name='__nv_fadd_ru') - real, intent(in), value :: x, y - real :: __fadd_ru - end function - end interface - public :: __fadd_ru end module diff --git a/flang/module/cudadevice.f90 b/flang/module/cudadevice.f90 index b07f82be6a724..3d487fd000a09 100644 --- a/flang/module/cudadevice.f90 +++ b/flang/module/cudadevice.f90 @@ -9,7 +9,10 @@ ! CUDA Fortran procedures available in device subprogram module cudadevice - use __cuda_device, only: __fadd_rd, __fadd_ru + use __cuda_device + use, intrinsic :: __fortran_builtins, only: dim3 => __builtin_dim3 + use, intrinsic :: __fortran_builtins, only: c_devptr => __builtin_c_devptr + use, intrinsic :: __fortran_builtins, only: c_devloc => __builtin_c_devloc implicit none ! Set PRIVATE by default to explicitly only export what is meant @@ -72,4 +75,22 @@ attributes(device) subroutine threadfence_system() end interface public :: threadfence_system + ! 
Math API + + interface + attributes(device) function __fadd_rd(x, y) bind(c, name='__nv_fadd_rd') + real, intent(in), value :: x, y + real :: __fadd_rd + end function + end interface + public :: __fadd_rd + + interface + attributes(device) function __fadd_ru(x, y) bind(c, name='__nv_fadd_ru') + real, intent(in), value :: x, y + real :: __fadd_ru + end function + end interface + public :: __fadd_ru + end module diff --git a/flang/runtime/tools.h b/flang/runtime/tools.h index 3fe3283415ba9..facbd23161057 100644 --- a/flang/runtime/tools.h +++ b/flang/runtime/tools.h @@ -206,6 +206,23 @@ inline RT_API_ATTRS RESULT ApplyType( default: terminator.Crash("not yet implemented: INTEGER(KIND=%d)", kind); } + case TypeCategory::Unsigned: + switch (kind) { + case 1: + return FUNC{}(std::forward(x)...); + case 2: + return FUNC{}(std::forward(x)...); + case 4: + return FUNC{}(std::forward(x)...); + case 8: + return FUNC{}(std::forward(x)...); +#if defined __SIZEOF_INT128__ && !AVOID_NATIVE_UINT128_T + case 16: + return FUNC{}(std::forward(x)...); +#endif + default: + terminator.Crash("not yet implemented: UNSIGNED(KIND=%d)", kind); + } case TypeCategory::Real: switch (kind) { #if 0 // TODO: REAL(2 & 3) diff --git a/flang/test/Analysis/AliasAnalysis/ptr-component.fir b/flang/test/Analysis/AliasAnalysis/ptr-component.fir index 279143a581460..856d8e2d94c98 100644 --- a/flang/test/Analysis/AliasAnalysis/ptr-component.fir +++ b/flang/test/Analysis/AliasAnalysis/ptr-component.fir @@ -101,7 +101,7 @@ func.func @_QMmPfoo.fir(%arg0: !fir.ref>>,i:i32}>>>) -> !fir.ref> %18 = fir.convert %15 : (!fir.box>>,i:i32}>>) -> !fir.box %19 = fir.convert %16 : (!fir.ref>) -> !fir.ref - %20 = fir.call @_FortranAAssign(%17, %18, %19, %c14_i32) : (!fir.ref>, !fir.box, !fir.ref, i32) -> none + fir.call @_FortranAAssign(%17, %18, %19, %c14_i32) : (!fir.ref>, !fir.box, !fir.ref, i32) -> () %21 = fir.field_index next, !fir.type<_QMmTt{next:!fir.box>>,i:i32}> %22 = fir.coordinate_of %5, %21 
{test.ptr="xnext2.fir"}: (!fir.ref>>,i:i32}>>, !fir.field) -> !fir.ref>>,i:i32}>>>> %23 = fir.load %22 : !fir.ref>>,i:i32}>>>> diff --git a/flang/test/Driver/funroll-loops.f90 b/flang/test/Driver/funroll-loops.f90 new file mode 100644 index 0000000000000..5c1a07e7d5d12 --- /dev/null +++ b/flang/test/Driver/funroll-loops.f90 @@ -0,0 +1,5 @@ +! RUN: %flang -### -funroll-loops %s 2>&1 | FileCheck %s -check-prefix UNROLL +! RUN: %flang -### -fno-unroll-loops %s 2>&1 | FileCheck %s -check-prefix NO-UNROLL + +! UNROLL: "-funroll-loops" +! NO-UNROLL: "-fno-unroll-loops" diff --git a/flang/test/Driver/mlir-pass-pipeline.f90 b/flang/test/Driver/mlir-pass-pipeline.f90 index 55e86da2dfdf1..dd46aecb3274c 100644 --- a/flang/test/Driver/mlir-pass-pipeline.f90 +++ b/flang/test/Driver/mlir-pass-pipeline.f90 @@ -35,15 +35,19 @@ ! O2-NEXT: (S) {{.*}} num-dce'd ! O2-NEXT: Pipeline Collection : ['fir.global', 'func.func', 'omp.declare_reduction', 'omp.private'] ! O2-NEXT: 'fir.global' Pipeline +! O2-NEXT: SimplifyHLFIRIntrinsics ! O2-NEXT: OptimizedBufferization ! O2-NEXT: InlineHLFIRAssign ! O2-NEXT: 'func.func' Pipeline +! O2-NEXT: SimplifyHLFIRIntrinsics ! O2-NEXT: OptimizedBufferization ! O2-NEXT: InlineHLFIRAssign ! O2-NEXT: 'omp.declare_reduction' Pipeline +! O2-NEXT: SimplifyHLFIRIntrinsics ! O2-NEXT: OptimizedBufferization ! O2-NEXT: InlineHLFIRAssign ! O2-NEXT: 'omp.private' Pipeline +! O2-NEXT: SimplifyHLFIRIntrinsics ! O2-NEXT: OptimizedBufferization ! O2-NEXT: InlineHLFIRAssign ! 
ALL: LowerHLFIROrderedAssignments diff --git a/flang/test/Fir/CUDA/cuda-alloc-free.fir b/flang/test/Fir/CUDA/cuda-alloc-free.fir index abf2d56695b17..6194f0071cd79 100644 --- a/flang/test/Fir/CUDA/cuda-alloc-free.fir +++ b/flang/test/Fir/CUDA/cuda-alloc-free.fir @@ -15,7 +15,7 @@ func.func @_QPsub1() { // CHECK: %[[CONV:.*]] = fir.convert %3 : (!fir.llvm_ptr) -> !fir.ref // CHECK: %[[DECL:.*]]:2 = hlfir.declare %[[CONV]] {data_attr = #cuf.cuda, uniq_name = "_QFsub1Eidev"} : (!fir.ref) -> (!fir.ref, !fir.ref) // CHECK: %[[DEVPTR:.*]] = fir.convert %[[DECL]]#1 : (!fir.ref) -> !fir.llvm_ptr -// CHECK: fir.call @_FortranACUFMemFree(%[[DEVPTR]], %c0{{.*}}, %{{.*}}, %{{.*}}) : (!fir.llvm_ptr, i32, !fir.ref, i32) -> none +// CHECK: fir.call @_FortranACUFMemFree(%[[DEVPTR]], %c0{{.*}}, %{{.*}}, %{{.*}}) : (!fir.llvm_ptr, i32, !fir.ref, i32) -> () func.func @_QPsub2() { %0 = cuf.alloc !fir.array<10xf32> {bindc_name = "a", data_attr = #cuf.cuda, uniq_name = "_QMcuda_varFcuda_alloc_freeEa"} -> !fir.ref> diff --git a/flang/test/Fir/CUDA/cuda-allocate.fir b/flang/test/Fir/CUDA/cuda-allocate.fir index 2f805d4a2b6bb..35c6e2a77a697 100644 --- a/flang/test/Fir/CUDA/cuda-allocate.fir +++ b/flang/test/Fir/CUDA/cuda-allocate.fir @@ -24,7 +24,7 @@ func.func @_QPsub1() { // CHECK: %[[BOX_NONE:.*]] = fir.convert %[[DECL_DESC]]#1 : (!fir.ref>>>) -> !fir.ref> // CHECK: %{{.*}} = fir.call @_FortranAAllocatableDeallocate(%[[BOX_NONE]], %{{.*}}, %{{.*}}, %{{.*}}, %{{.*}}) : (!fir.ref>, i1, !fir.box, !fir.ref, i32) -> i32 // CHECK: %[[BOX_NONE:.*]] = fir.convert %[[DECL_DESC]]#1 : (!fir.ref>>>) -> !fir.ref> -// CHECK: fir.call @_FortranACUFFreeDescriptor(%[[BOX_NONE]], %{{.*}}, %{{.*}}) : (!fir.ref>, !fir.ref, i32) -> none +// CHECK: fir.call @_FortranACUFFreeDescriptor(%[[BOX_NONE]], %{{.*}}, %{{.*}}) : (!fir.ref>, !fir.ref, i32) -> () fir.global @_QMmod1Ea {data_attr = #cuf.cuda} : !fir.box>> { %0 = fir.zero_bits !fir.heap> @@ -80,7 +80,7 @@ func.func @_QPsub5() { %6 = fir.convert %5#1 : 
(!fir.ref>>>) -> !fir.ref> %7 = fir.convert %c1 : (index) -> i64 %8 = fir.convert %c10_i32 : (i32) -> i64 - %9 = fir.call @_FortranAAllocatableSetBounds(%6, %c0_i32, %7, %8) fastmath : (!fir.ref>, i32, i64, i64) -> none + fir.call @_FortranAAllocatableSetBounds(%6, %c0_i32, %7, %8) fastmath : (!fir.ref>, i32, i64, i64) -> () %10 = cuf.allocate %5#1 : !fir.ref>>> {data_attr = #cuf.cuda} -> i32 %11 = cuf.deallocate %5#1 : !fir.ref>>> {data_attr = #cuf.cuda} -> i32 return @@ -108,7 +108,7 @@ func.func @_QQsub6() attributes {fir.bindc_name = "test"} { %2 = fir.convert %1#1 : (!fir.ref>>>) -> !fir.ref> %3 = fir.convert %c1 : (index) -> i64 %4 = fir.convert %c10_i32 : (i32) -> i64 - %5 = fir.call @_FortranAAllocatableSetBounds(%2, %c0_i32, %3, %4) fastmath : (!fir.ref>, i32, i64, i64) -> none + fir.call @_FortranAAllocatableSetBounds(%2, %c0_i32, %3, %4) fastmath : (!fir.ref>, i32, i64, i64) -> () %6 = cuf.allocate %1#1 : !fir.ref>>> {data_attr = #cuf.cuda} -> i32 return } diff --git a/flang/test/Fir/CUDA/cuda-code-gen.mlir b/flang/test/Fir/CUDA/cuda-code-gen.mlir index 0f1b8b1cd6a8e..3ad28fa7bd517 100644 --- a/flang/test/Fir/CUDA/cuda-code-gen.mlir +++ b/flang/test/Fir/CUDA/cuda-code-gen.mlir @@ -91,8 +91,8 @@ module attributes {dlti.dl_spec = #dlti.dl_spec : vector<2xi64> %16 = fir.convert %6 : (!fir.ref>>>) -> !fir.ref> %17 = fir.convert %c1 : (index) -> i64 %18 = fir.convert %c16_i32 : (i32) -> i64 - %19 = fir.call @_FortranAAllocatableSetBounds(%16, %c0_i32, %17, %18) fastmath : (!fir.ref>, i32, i64, i64) -> none - %20 = fir.call @_FortranAAllocatableSetBounds(%16, %c1_i32, %17, %18) fastmath : (!fir.ref>, i32, i64, i64) -> none + fir.call @_FortranAAllocatableSetBounds(%16, %c0_i32, %17, %18) fastmath : (!fir.ref>, i32, i64, i64) -> () + fir.call @_FortranAAllocatableSetBounds(%16, %c1_i32, %17, %18) fastmath : (!fir.ref>, i32, i64, i64) -> () %21 = fir.address_of(@_QQclX64756D6D792E6D6C697200) : !fir.ref> %c31_i32 = arith.constant 31 : i32 %false = arith.constant 
false @@ -102,7 +102,7 @@ module attributes {dlti.dl_spec = #dlti.dl_spec : vector<2xi64> %24 = fir.convert %21 : (!fir.ref>) -> !fir.ref %25 = fir.call @_FortranACUFAllocatableAllocate(%23, %c-1_i64, %false, %22, %24, %c31_i32) : (!fir.ref>, i64, i1, !fir.box, !fir.ref, i32) -> i32 %26 = fir.convert %13 : (!fir.ref>>>) -> !fir.ref> - %27 = fir.call @_FortranAAllocatableSetBounds(%26, %c0_i32, %17, %18) fastmath : (!fir.ref>, i32, i64, i64) -> none + fir.call @_FortranAAllocatableSetBounds(%26, %c0_i32, %17, %18) fastmath : (!fir.ref>, i32, i64, i64) -> () %28 = fir.address_of(@_QQclX64756D6D792E6D6C697200) : !fir.ref> %c34_i32 = arith.constant 34 : i32 %false_0 = arith.constant false @@ -115,7 +115,7 @@ module attributes {dlti.dl_spec = #dlti.dl_spec : vector<2xi64> %34 = fircg.ext_rebox %33 : (!fir.box>>) -> !fir.box> return } - func.func private @_FortranAAllocatableSetBounds(!fir.ref>, i32, i64, i64) -> none attributes {fir.runtime} + func.func private @_FortranAAllocatableSetBounds(!fir.ref>, i32, i64, i64) -> () attributes {fir.runtime} fir.global linkonce @_QQclX64756D6D792E6D6C697200 constant : !fir.char<1,11> { %0 = fir.string_lit "dummy.mlir\00"(11) : !fir.char<1,11> fir.has_value %0 : !fir.char<1,11> @@ -165,7 +165,7 @@ module attributes {dlti.dl_spec = #dlti.dl_spec = dense<32> : vec fir.has_value %0 : !fir.char<1,8> } func.func private @_FortranACUFAllocDescriptor(i64, !fir.ref, i32) -> !fir.ref> attributes {fir.runtime} - func.func private @_FortranACUFFreeDescriptor(!fir.ref>, !fir.ref, i32) -> none attributes {fir.runtime} + func.func private @_FortranACUFFreeDescriptor(!fir.ref>, !fir.ref, i32) -> () attributes {fir.runtime} } // CHECK-LABEL: llvm.func @_QQmain() diff --git a/flang/test/Fir/CUDA/cuda-constructor-2.f90 b/flang/test/Fir/CUDA/cuda-constructor-2.f90 index eb118ccee311c..89fc99b736f4f 100644 --- a/flang/test/Fir/CUDA/cuda-constructor-2.f90 +++ b/flang/test/Fir/CUDA/cuda-constructor-2.f90 @@ -25,7 +25,7 @@ module attributes {dlti.dl_spec 
= #dlti.dl_spec<#dlti.dl_entry>) -> !fir.ref // CHECK-DAG: %[[CST:.*]] = arith.constant 20 : index // CHECK-DAG: %[[CST2:.*]] = fir.convert %[[CST]] : (index) -> i64 -// CHECK-DAG: fir.call @_FortranACUFRegisterVariable(%[[MODULE2]], %[[VAR_ADDR2]], %[[VAR_NAME2]], %[[CST2]]) : (!fir.ref>, !fir.ref, !fir.ref, i64) -> none +// CHECK-DAG: fir.call @_FortranACUFRegisterVariable(%[[MODULE2]], %[[VAR_ADDR2]], %[[VAR_NAME2]], %[[CST2]]) : (!fir.ref>, !fir.ref, !fir.ref, i64) -> () // CHECK-DAG: %[[BOX:.*]] = fir.address_of(@_QMmtestsEndev) : !fir.ref>>> // CHECK-DAG: %[[BOXREF:.*]] = fir.convert %[[BOX]] : (!fir.ref>>>) -> !fir.ref // CHECK-DAG: fir.call @_FortranACUFRegisterVariable(%[[MODULE:.*]], %[[BOXREF]], %{{.*}}, %{{.*}}) @@ -59,4 +59,4 @@ module attributes {dlti.dl_spec = #dlti.dl_spec : vector<2xi64>, i gpu.return %6 : i32 } } -} \ No newline at end of file +} diff --git a/flang/test/Fir/CUDA/cuda-data-transfer.fir b/flang/test/Fir/CUDA/cuda-data-transfer.fir index 5ed27f1be0a43..415d0015918bb 100644 --- a/flang/test/Fir/CUDA/cuda-data-transfer.fir +++ b/flang/test/Fir/CUDA/cuda-data-transfer.fir @@ -17,7 +17,7 @@ func.func @_QPsub1() { // CHECK: %[[AHOST:.*]]:2 = hlfir.declare %{{.*}} {fortran_attrs = #fir.var_attrs, uniq_name = "_QFsub1Eahost"} : (!fir.ref>>>) -> (!fir.ref>>>, !fir.ref>>>) // CHECK: %[[AHOST_BOX:.*]] = fir.convert %[[AHOST]]#0 : (!fir.ref>>>) -> !fir.ref> // CHECK: %[[ADEV_BOX:.*]] = fir.convert %[[ADEV]]#0 : (!fir.ref>>>) -> !fir.ref> -// CHECK: fir.call @_FortranACUFDataTransferDescDesc(%[[AHOST_BOX]], %[[ADEV_BOX]], %c1{{.*}}, %{{.*}}, %{{.*}}) : (!fir.ref>, !fir.ref>, i32, !fir.ref, i32) -> none +// CHECK: fir.call @_FortranACUFDataTransferDescDesc(%[[AHOST_BOX]], %[[ADEV_BOX]], %c1{{.*}}, %{{.*}}, %{{.*}}) : (!fir.ref>, !fir.ref>, i32, !fir.ref, i32) -> () func.func @_QPsub2() { %0 = cuf.alloc !fir.box>> {bindc_name = "adev", data_attr = #cuf.cuda, uniq_name = "_QFsub2Eadev"} -> !fir.ref>>> @@ -38,7 +38,7 @@ func.func @_QPsub2() { // 
CHECK: fir.store %[[EMBOX]] to %[[TEMP_BOX]] : !fir.ref> // CHECK: %[[ADEV_BOX:.*]] = fir.convert %[[ADEV]]#0 : (!fir.ref>>>) -> !fir.ref> // CHECK: %[[TEMP_CONV:.*]] = fir.convert %[[TEMP_BOX]] : (!fir.ref>) -> !fir.ref> -// CHECK: fir.call @_FortranACUFDataTransferCstDesc(%[[ADEV_BOX]], %[[TEMP_CONV]], %c0{{.*}}, %{{.*}}, %{{.*}}) : (!fir.ref>, !fir.ref>, i32, !fir.ref, i32) -> none +// CHECK: fir.call @_FortranACUFDataTransferCstDesc(%[[ADEV_BOX]], %[[TEMP_CONV]], %c0{{.*}}, %{{.*}}, %{{.*}}) : (!fir.ref>, !fir.ref>, i32, !fir.ref, i32) -> () func.func @_QPsub3() { %0 = cuf.alloc !fir.box>> {bindc_name = "adev", data_attr = #cuf.cuda, uniq_name = "_QFsub3Eadev"} -> !fir.ref>>> @@ -58,7 +58,7 @@ func.func @_QPsub3() { // CHECK: fir.store %[[EMBOX]] to %[[TEMP_BOX]] : !fir.ref> // CHECK: %[[ADEV_BOX:.*]] = fir.convert %[[ADEV]]#0 : (!fir.ref>>>) -> !fir.ref> // CHECK: %[[V_CONV:.*]] = fir.convert %[[TEMP_BOX]] : (!fir.ref>) -> !fir.ref> -// CHECK: fir.call @_FortranACUFDataTransferCstDesc(%[[ADEV_BOX]], %[[V_CONV]], %c0{{.*}}, %{{.*}}, %{{.*}}) : (!fir.ref>, !fir.ref>, i32, !fir.ref, i32) -> none +// CHECK: fir.call @_FortranACUFDataTransferCstDesc(%[[ADEV_BOX]], %[[V_CONV]], %c0{{.*}}, %{{.*}}, %{{.*}}) : (!fir.ref>, !fir.ref>, i32, !fir.ref, i32) -> () func.func @_QPsub4() { %0 = cuf.alloc !fir.box>> {bindc_name = "adev", data_attr = #cuf.cuda, uniq_name = "_QFsub4Eadev"} -> !fir.ref>>> @@ -81,12 +81,12 @@ func.func @_QPsub4() { // CHECK: fir.store %[[EMBOX]] to %[[TEMP_BOX]] : !fir.ref>> // CHECK: %[[ADEV_BOX:.*]] = fir.convert %[[ADEV]]#0 : (!fir.ref>>>) -> !fir.ref> // CHECK: %[[AHOST_BOX:.*]] = fir.convert %[[TEMP_BOX]] : (!fir.ref>>) -> !fir.ref> -// CHECK: fir.call @_FortranACUFDataTransferDescDesc(%[[ADEV_BOX]], %[[AHOST_BOX]], %c0{{.*}}, %{{.*}}, %{{.*}}) : (!fir.ref>, !fir.ref>, i32, !fir.ref, i32) -> none +// CHECK: fir.call @_FortranACUFDataTransferDescDesc(%[[ADEV_BOX]], %[[AHOST_BOX]], %c0{{.*}}, %{{.*}}, %{{.*}}) : (!fir.ref>, !fir.ref>, i32, 
!fir.ref, i32) -> () // CHECK: %[[EMBOX:.*]] = fir.embox %[[AHOST]]#0(%[[AHOST_SHAPE]]) : (!fir.ref>, !fir.shape<1>) -> !fir.box> // CHECK: fir.store %[[EMBOX]] to %[[TEMP_BOX1]] : !fir.ref>> // CHECK: %[[AHOST_BOX:.*]] = fir.convert %[[TEMP_BOX1]] : (!fir.ref>>) -> !fir.ref> // CHECK: %[[ADEV_BOX:.*]] = fir.convert %[[ADEV]]#0 : (!fir.ref>>>) -> !fir.ref> -// CHECK: fir.call @_FortranACUFDataTransferDescDescNoRealloc(%[[AHOST_BOX]], %[[ADEV_BOX]], %c1{{.*}}, %{{.*}}, %{{.*}}) : (!fir.ref>, !fir.ref>, i32, !fir.ref, i32) -> none +// CHECK: fir.call @_FortranACUFDataTransferDescDescNoRealloc(%[[AHOST_BOX]], %[[ADEV_BOX]], %c1{{.*}}, %{{.*}}, %{{.*}}) : (!fir.ref>, !fir.ref>, i32, !fir.ref, i32) -> () func.func @_QPsub5(%arg0: !fir.ref {fir.bindc_name = "n"}) { %0 = fir.dummy_scope : !fir.dscope @@ -123,12 +123,12 @@ func.func @_QPsub5(%arg0: !fir.ref {fir.bindc_name = "n"}) { // CHECK: fir.store %[[EMBOX]] to %[[TEMP_BOX]] : !fir.ref>> // CHECK: %[[ADEV_BOX:.*]] = fir.convert %[[ADEV]]#0 : (!fir.ref>>>) -> !fir.ref> // CHECK: %[[AHOST_BOX:.*]] = fir.convert %[[TEMP_BOX]] : (!fir.ref>>) -> !fir.ref> -// CHECK: fir.call @_FortranACUFDataTransferDescDesc(%[[ADEV_BOX]], %[[AHOST_BOX]], %c0{{.*}}, %{{.*}}, %{{.*}}) : (!fir.ref>, !fir.ref>, i32, !fir.ref, i32) -> none +// CHECK: fir.call @_FortranACUFDataTransferDescDesc(%[[ADEV_BOX]], %[[AHOST_BOX]], %c0{{.*}}, %{{.*}}, %{{.*}}) : (!fir.ref>, !fir.ref>, i32, !fir.ref, i32) -> () // CHECK: %[[EMBOX:.*]] = fir.embox %[[AHOST]]#1(%[[SHAPE]]) : (!fir.ref>, !fir.shape<2>) -> !fir.box> // CHECK: fir.store %[[EMBOX]] to %[[TEMP_BOX1]] : !fir.ref>> // CHECK: %[[AHOST_BOX:.*]] = fir.convert %[[TEMP_BOX1]] : (!fir.ref>>) -> !fir.ref> // CHECK: %[[ADEV_BOX:.*]] = fir.convert %[[ADEV]]#0 : (!fir.ref>>>) -> !fir.ref> -// CHECK: fir.call @_FortranACUFDataTransferDescDescNoRealloc(%[[AHOST_BOX]], %[[ADEV_BOX]], %c1{{.*}}, %{{.*}}, %{{.*}}) : (!fir.ref>, !fir.ref>, i32, !fir.ref, i32) -> none +// CHECK: fir.call 
@_FortranACUFDataTransferDescDescNoRealloc(%[[AHOST_BOX]], %[[ADEV_BOX]], %c1{{.*}}, %{{.*}}, %{{.*}}) : (!fir.ref>, !fir.ref>, i32, !fir.ref, i32) -> () func.func @_QPsub6() { %0 = cuf.alloc i32 {bindc_name = "idev", data_attr = #cuf.cuda, uniq_name = "_QFsub6Eidev"} -> !fir.ref @@ -149,12 +149,12 @@ func.func @_QPsub6() { // CHECK: %[[IHOST:.*]]:2 = hlfir.declare %{{.*}} {uniq_name = "_QFsub6Eihost"} : (!fir.ref) -> (!fir.ref, !fir.ref) // CHECK: %[[DST:.*]] = fir.convert %[[IHOST]]#0 : (!fir.ref) -> !fir.llvm_ptr // CHECK: %[[SRC:.*]] = fir.convert %[[IDEV]]#0 : (!fir.ref) -> !fir.llvm_ptr -// CHECK: fir.call @_FortranACUFDataTransferPtrPtr(%[[DST]], %[[SRC]], %c4{{.*}}, %c1{{.*}}, %{{.*}}, %{{.*}}) : (!fir.llvm_ptr, !fir.llvm_ptr, i64, i32, !fir.ref, i32) -> none +// CHECK: fir.call @_FortranACUFDataTransferPtrPtr(%[[DST]], %[[SRC]], %c4{{.*}}, %c1{{.*}}, %{{.*}}, %{{.*}}) : (!fir.llvm_ptr, !fir.llvm_ptr, i64, i32, !fir.ref, i32) -> () // CHECK: %[[LOAD:.*]] = fir.load %[[IHOST]]#0 : !fir.ref // CHECK: %[[ASSOC:.*]]:3 = hlfir.associate %[[LOAD]] {uniq_name = ".cuf_host_tmp"} : (i32) -> (!fir.ref, !fir.ref, i1) // CHECK: %[[DST:.*]] = fir.convert %[[IDEV]]#0 : (!fir.ref) -> !fir.llvm_ptr // CHECK: %[[SRC:.*]] = fir.convert %[[ASSOC]]#0 : (!fir.ref) -> !fir.llvm_ptr -// CHECK: fir.call @_FortranACUFDataTransferPtrPtr(%[[DST]], %[[SRC]], %c4{{.*}}, %c0{{.*}}, %{{.*}}, %{{.*}}) : (!fir.llvm_ptr, !fir.llvm_ptr, i64, i32, !fir.ref, i32) -> none +// CHECK: fir.call @_FortranACUFDataTransferPtrPtr(%[[DST]], %[[SRC]], %c4{{.*}}, %c0{{.*}}, %{{.*}}, %{{.*}}) : (!fir.llvm_ptr, !fir.llvm_ptr, i64, i32, !fir.ref, i32) -> () func.func @_QPsub7() { %c10 = arith.constant 10 : index @@ -177,11 +177,11 @@ func.func @_QPsub7() { // CHECK: %[[BYTES:.*]] = arith.muli %c10{{.*}}, %c4{{.*}} : i64 // CHECK: %[[DST:.*]] = fir.convert %[[IHOST]]#0 : (!fir.ref>) -> !fir.llvm_ptr // CHECK: %[[SRC:.*]] = fir.convert %[[IDEV]]#0 : (!fir.ref>) -> !fir.llvm_ptr -// CHECK: fir.call 
@_FortranACUFDataTransferPtrPtr(%[[DST]], %[[SRC]], %[[BYTES]], %c1{{.*}}, %{{.*}}, %{{.*}}) : (!fir.llvm_ptr, !fir.llvm_ptr, i64, i32, !fir.ref, i32) -> none +// CHECK: fir.call @_FortranACUFDataTransferPtrPtr(%[[DST]], %[[SRC]], %[[BYTES]], %c1{{.*}}, %{{.*}}, %{{.*}}) : (!fir.llvm_ptr, !fir.llvm_ptr, i64, i32, !fir.ref, i32) -> () // CHECK: %[[BYTES:.*]] = arith.muli %c10{{.*}}, %c4{{.*}} : i64 // CHECK: %[[DST:.*]] = fir.convert %[[IDEV]]#0 : (!fir.ref>) -> !fir.llvm_ptr // CHECK: %[[SRC:.*]] = fir.convert %[[IHOST]]#0 : (!fir.ref>) -> !fir.llvm_ptr -// CHECK: fir.call @_FortranACUFDataTransferPtrPtr(%[[DST]], %[[SRC]], %[[BYTES]], %c0{{.*}}, %{{.*}}, %{{.*}}) : (!fir.llvm_ptr, !fir.llvm_ptr, i64, i32, !fir.ref, i32) -> none +// CHECK: fir.call @_FortranACUFDataTransferPtrPtr(%[[DST]], %[[SRC]], %[[BYTES]], %c0{{.*}}, %{{.*}}, %{{.*}}) : (!fir.llvm_ptr, !fir.llvm_ptr, i64, i32, !fir.ref, i32) -> () fir.global @_QMmtestsEn(dense<[3, 4, 5, 6, 7]> : tensor<5xi32>) {data_attr = #cuf.cuda} : !fir.array<5xi32> func.func @_QPsub8() attributes {fir.bindc_name = "t"} { @@ -206,7 +206,7 @@ func.func @_QPsub8() attributes {fir.bindc_name = "t"} { // CHECK: %[[DECL:.*]] = fir.declare %[[ADDR_CONV]] // CHECK: %[[DST:.*]] = fir.convert %[[LOCAL]] : (!fir.ref>) -> !fir.llvm_ptr // CHECK: %[[SRC:.*]] = fir.convert %[[DECL]] : (!fir.ref>) -> !fir.llvm_ptr -// CHECK: fir.call @_FortranACUFDataTransferPtrPtr(%[[DST]], %[[SRC]], %{{.*}}, %{{.*}}, %{{.*}}, %{{.*}}) : (!fir.llvm_ptr, !fir.llvm_ptr, i64, i32, !fir.ref, i32) -> none +// CHECK: fir.call @_FortranACUFDataTransferPtrPtr(%[[DST]], %[[SRC]], %{{.*}}, %{{.*}}, %{{.*}}, %{{.*}}) : (!fir.llvm_ptr, !fir.llvm_ptr, i64, i32, !fir.ref, i32) -> () func.func @_QPsub9() { @@ -231,7 +231,7 @@ func.func @_QPsub9() { // CHECK: %[[DECL:.*]] = fir.declare %[[ADDR_CONV]] // CHECK: %[[DST:.*]] = fir.convert %[[DECL]] : (!fir.ref>) -> !fir.llvm_ptr // CHECK: %[[SRC:.*]] = fir.convert %[[LOCAL]] : (!fir.ref>) -> !fir.llvm_ptr -// CHECK: 
fir.call @_FortranACUFDataTransferPtrPtr(%[[DST]], %[[SRC]], %{{.*}}, %{{.*}}, %{{.*}}, %{{.*}}) : (!fir.llvm_ptr, !fir.llvm_ptr, i64, i32, !fir.ref, i32) -> none +// CHECK: fir.call @_FortranACUFDataTransferPtrPtr(%[[DST]], %[[SRC]], %{{.*}}, %{{.*}}, %{{.*}}, %{{.*}}) : (!fir.llvm_ptr, !fir.llvm_ptr, i64, i32, !fir.ref, i32) -> () fir.global @_QMmod1Ea {data_attr = #cuf.cuda} : !fir.box>> { %c0 = arith.constant 0 : index @@ -254,7 +254,7 @@ func.func @_QQdesc_global() attributes {fir.bindc_name = "host_sub"} { // CHECK: %[[GLOBAL_ADDRESS:.*]] = fir.address_of(@_QMmod1Ea) : !fir.ref>>> // CHECK: %[[GLOBAL_DECL:.*]]:2 = hlfir.declare %[[GLOBAL_ADDRESS]] {data_attr = #cuf.cuda, fortran_attrs = #fir.var_attrs, uniq_name = "_QMmod1Ea"} : (!fir.ref>>>) -> (!fir.ref>>>, !fir.ref>>>) // CHECK: %[[BOX_NONE:.*]] = fir.convert %[[GLOBAL_DECL:.*]]#0 : (!fir.ref>>>) -> !fir.ref> -// CHECK: fir.call @_FortranACUFDataTransferGlobalDescDesc(%[[BOX_NONE]],{{.*}}) : (!fir.ref>, !fir.ref>, i32, !fir.ref, i32) -> none +// CHECK: fir.call @_FortranACUFDataTransferGlobalDescDesc(%[[BOX_NONE]],{{.*}}) : (!fir.ref>, !fir.ref>, i32, !fir.ref, i32) -> () fir.global @_QMmod2Eadev {data_attr = #cuf.cuda} : !fir.box>> { %c0 = arith.constant 0 : index @@ -285,7 +285,7 @@ func.func @_QPdesc_global_ptr() { // CHECK: fir.store %[[EMBOX]] to %[[TEMP_BOX]] : !fir.ref>> // CHECK: %[[ADEV_BOXNONE:.*]] = fir.convert %[[DECL_ADEV]] : (!fir.ref>>>) -> !fir.ref> // CHECK: %[[AHOST_BOXNONE:.*]] = fir.convert %[[TEMP_BOX]] : (!fir.ref>>) -> !fir.ref> -// CHECK: fir.call @_FortranACUFDataTransferGlobalDescDesc(%[[ADEV_BOXNONE]], %[[AHOST_BOXNONE]], %c0{{.*}}, %{{.*}}, %{{.*}}) : (!fir.ref>, !fir.ref>, i32, !fir.ref, i32) -> none +// CHECK: fir.call @_FortranACUFDataTransferGlobalDescDesc(%[[ADEV_BOXNONE]], %[[AHOST_BOXNONE]], %c0{{.*}}, %{{.*}}, %{{.*}}) : (!fir.ref>, !fir.ref>, i32, !fir.ref, i32) -> () func.func @_QPscalar_to_array() { %c1_i32 = arith.constant 1 : i32 @@ -312,7 +312,7 @@ func.func 
@_QPtest_type() { } // CHECK-LABEL: func.func @_QPtest_type() -// CHECK: fir.call @_FortranACUFDataTransferPtrPtr(%{{.*}}, %{{.*}}, %c12{{.*}}, %c0{{.*}}, %{{.*}}, %{{.*}}) : (!fir.llvm_ptr, !fir.llvm_ptr, i64, i32, !fir.ref, i32) -> none +// CHECK: fir.call @_FortranACUFDataTransferPtrPtr(%{{.*}}, %{{.*}}, %c12{{.*}}, %c0{{.*}}, %{{.*}}, %{{.*}}) : (!fir.llvm_ptr, !fir.llvm_ptr, i64, i32, !fir.ref, i32) -> () func.func @_QPtest_array_type() { %c10 = arith.constant 10 : index @@ -331,7 +331,7 @@ func.func @_QPtest_array_type() { // CHECK: %[[CONV_BYTES:.*]] = fir.convert %[[BYTES]] : (index) -> i64 // CHECK: fir.call @_FortranACUFMemAlloc(%[[CONV_BYTES]], %c0{{.*}}, %{{.*}}, %{{.*}}) : (i64, i32, !fir.ref, i32) -> !fir.llvm_ptr // CHECK: %[[BYTES:.*]] = arith.muli %c10{{.*}}, %c12{{.*}} : i64 -// CHECK: fir.call @_FortranACUFDataTransferPtrPtr(%{{.*}}, %{{.*}}, %[[BYTES]], %c0{{.*}}, %{{.*}}, %{{.*}}) : (!fir.llvm_ptr, !fir.llvm_ptr, i64, i32, !fir.ref, i32) -> none +// CHECK: fir.call @_FortranACUFDataTransferPtrPtr(%{{.*}}, %{{.*}}, %[[BYTES]], %c0{{.*}}, %{{.*}}, %{{.*}}) : (!fir.llvm_ptr, !fir.llvm_ptr, i64, i32, !fir.ref, i32) -> () func.func @_QPshape_shift() { %c0_i32 = arith.constant 0 : i32 @@ -365,7 +365,7 @@ func.func @_QPshape_shift2() { // CHECK-LABEL: func.func @_QPshape_shift2() // CHECK: %[[C10:.*]] = fir.convert %c10{{.*}} : (index) -> i64 // CHECK: %[[BYTES:.*]] = arith.muli %[[C10]], %c4{{.*}} : i64 -// CHECK: fir.call @_FortranACUFDataTransferPtrPtr(%{{.*}}, %{{.*}}, %[[BYTES]], %{{.*}}, %{{.*}}, %{{.*}}) : (!fir.llvm_ptr, !fir.llvm_ptr, i64, i32, !fir.ref, i32) -> none +// CHECK: fir.call @_FortranACUFDataTransferPtrPtr(%{{.*}}, %{{.*}}, %[[BYTES]], %{{.*}}, %{{.*}}, %{{.*}}) : (!fir.llvm_ptr, !fir.llvm_ptr, i64, i32, !fir.ref, i32) -> () fir.global @_QMmod1Ea_dev {data_attr = #cuf.cuda} : !fir.array<4xf32> { %0 = fir.zero_bits !fir.array<4xf32> @@ -407,7 +407,7 @@ func.func @_QQchar_transfer() attributes {fir.bindc_name = "char_transfer"} { // 
CHECK-LABEL: func.func @_QQchar_transfer() // CHECK: fir.call @_FortranACUFMemAlloc // CHECK: %[[BYTES:.*]] = arith.muli %c10{{.*}}, %c1{{.*}} : i64 -// CHECK: fir.call @_FortranACUFDataTransferPtrPtr(%{{.*}}, %{{.*}}, %[[BYTES]], %c0{{.*}}, %{{.*}}, %{{.*}}) : (!fir.llvm_ptr, !fir.llvm_ptr, i64, i32, !fir.ref, i32) -> none +// CHECK: fir.call @_FortranACUFDataTransferPtrPtr(%{{.*}}, %{{.*}}, %[[BYTES]], %c0{{.*}}, %{{.*}}, %{{.*}}) : (!fir.llvm_ptr, !fir.llvm_ptr, i64, i32, !fir.ref, i32) -> () func.func @_QPdevmul(%arg0: !fir.ref> {fir.bindc_name = "b"}, %arg1: !fir.ref {fir.bindc_name = "wa"}, %arg2: !fir.ref {fir.bindc_name = "wb"}) { %c0_i64 = arith.constant 0 : i64 @@ -447,10 +447,10 @@ func.func @_QPdevmul(%arg0: !fir.ref> {fir.bindc_name = "b"} // CHECK: %[[EMBOX:.*]] = fir.embox %{{.*}}(%{{.*}}) [%{{.*}}] : (!fir.ref>, !fir.shape<2>, !fir.slice<2>) -> !fir.box> // CHECK: fir.store %[[EMBOX]] to %[[ALLOCA1]] : !fir.ref>> // CHECK: %[[SRC:.*]] = fir.convert %[[ALLOCA1]] : (!fir.ref>>) -> !fir.ref> -// CHECK: fir.call @_FortranACUFDataTransferDescDesc(%{{.*}}, %[[SRC]], %{{.*}}, %{{.*}}, %{{.*}}) : (!fir.ref>, !fir.ref>, i32, !fir.ref, i32) -> none +// CHECK: fir.call @_FortranACUFDataTransferDescDesc(%{{.*}}, %[[SRC]], %{{.*}}, %{{.*}}, %{{.*}}) : (!fir.ref>, !fir.ref>, i32, !fir.ref, i32) -> () // CHECK: fir.store %[[EMBOX]] to %[[ALLOCA0]] : !fir.ref>> // CHECK: %[[DST:.*]] = fir.convert %[[ALLOCA0]] : (!fir.ref>>) -> !fir.ref> -// CHECK: fir.call @_FortranACUFDataTransferDescDesc(%[[DST]], %{{.*}}, %{{.*}}, %{{.*}}, %{{.*}}) : (!fir.ref>, !fir.ref>, i32, !fir.ref, i32) -> none +// CHECK: fir.call @_FortranACUFDataTransferDescDesc(%[[DST]], %{{.*}}, %{{.*}}, %{{.*}}, %{{.*}}) : (!fir.ref>, !fir.ref>, i32, !fir.ref, i32) -> () func.func @_QPlogical_cst() { %c0_i64 = arith.constant 0 : i64 @@ -470,7 +470,7 @@ func.func @_QPlogical_cst() { // CHECK: %[[EMBOX:.*]] = fir.embox %[[CONST]] : (!fir.ref>) -> !fir.box> // CHECK: fir.store %[[EMBOX]] to %[[DESC]] : 
!fir.ref>> // CHECK: %[[BOX_NONE:.*]] = fir.convert %[[DESC]] : (!fir.ref>>) -> !fir.ref> -// CHECK: fir.call @_FortranACUFDataTransferCstDesc(%{{.*}}, %[[BOX_NONE]], %{{.*}}, %{{.*}}, %{{.*}}) : (!fir.ref>, !fir.ref>, i32, !fir.ref, i32) -> none +// CHECK: fir.call @_FortranACUFDataTransferCstDesc(%{{.*}}, %[[BOX_NONE]], %{{.*}}, %{{.*}}, %{{.*}}) : (!fir.ref>, !fir.ref>, i32, !fir.ref, i32) -> () func.func @_QPcallkernel(%arg0: !fir.box>> {fir.bindc_name = "a"}, %arg1: !fir.ref {fir.bindc_name = "b"}, %arg2: !fir.ref {fir.bindc_name = "c"}) { %c0_i64 = arith.constant 0 : i64 @@ -517,7 +517,7 @@ func.func @_QPcallkernel(%arg0: !fir.box>> {fir.bind // CHECK: %[[REBOX1:.*]] = fir.rebox %[[REBOX0]] [%{{.*}}] : (!fir.box>>, !fir.slice<2>) -> !fir.box>> // CHECK: fir.store %[[REBOX1]] to %[[ALLOCA]] : !fir.ref>>> // CHECK: %[[BOX_NONE:.*]] = fir.convert %[[ALLOCA]] : (!fir.ref>>>) -> !fir.ref> -// CHECK: fir.call @_FortranACUFDataTransferDescDesc(%{{.*}}, %[[BOX_NONE]], %{{.*}}, %{{.*}}, %{{.*}}) : (!fir.ref>, !fir.ref>, i32, !fir.ref, i32) -> none +// CHECK: fir.call @_FortranACUFDataTransferDescDesc(%{{.*}}, %[[BOX_NONE]], %{{.*}}, %{{.*}}, %{{.*}}) : (!fir.ref>, !fir.ref>, i32, !fir.ref, i32) -> () func.func @_QPsrc_cst() { %0 = fir.dummy_scope : !fir.dscope @@ -557,7 +557,7 @@ func.func @_QPsrc_cst() { // CHECK: %[[CST:.*]] = arith.constant -4.000000e+00 : f32 // CHECK: fir.store %[[CST]] to %[[ALLOCA]] : !fir.ref // CHECK: %[[CONV:.*]] = fir.convert %[[ALLOCA]] : (!fir.ref) -> !fir.llvm_ptr -// CHECK: fir.call @_FortranACUFDataTransferPtrPtr(%{{.*}}, %[[CONV]], %{{.*}}, %{{.*}}, %{{.*}}, %{{.*}}) : (!fir.llvm_ptr, !fir.llvm_ptr, i64, i32, !fir.ref, i32) -> none +// CHECK: fir.call @_FortranACUFDataTransferPtrPtr(%{{.*}}, %[[CONV]], %{{.*}}, %{{.*}}, %{{.*}}, %{{.*}}) : (!fir.llvm_ptr, !fir.llvm_ptr, i64, i32, !fir.ref, i32) -> () func.func @_QPchecksums(%arg0: !fir.box> {cuf.data_attr = #cuf.cuda, fir.bindc_name = "a"}, %arg1: !fir.ref {fir.bindc_name = "n"}) { 
%c0 = arith.constant 0 : index @@ -580,6 +580,6 @@ func.func @_QPchecksums(%arg0: !fir.box> {cuf.data_attr = #cuf // CHECK-LABEL: func.func @_QPchecksums // CHECK: %[[DST:.*]] = fir.convert %{{.*}} : (!fir.ref>>) -> !fir.ref> // CHECK: %[[SRC:.*]] = fir.convert %{{.*}} : (!fir.ref>>) -> !fir.ref> -// CHECK: fir.call @_FortranACUFDataTransferDescDescNoRealloc(%[[DST]], %[[SRC]], %{{.*}}, %{{.*}}, %{{.*}}) : (!fir.ref>, !fir.ref>, i32, !fir.ref, i32) -> none +// CHECK: fir.call @_FortranACUFDataTransferDescDescNoRealloc(%[[DST]], %[[SRC]], %{{.*}}, %{{.*}}, %{{.*}}) : (!fir.ref>, !fir.ref>, i32, !fir.ref, i32) -> () } // end of module diff --git a/flang/test/Fir/CUDA/cuda-global-addr.mlir b/flang/test/Fir/CUDA/cuda-global-addr.mlir index 0ccd0c797fb6f..3e50c7a51f49c 100644 --- a/flang/test/Fir/CUDA/cuda-global-addr.mlir +++ b/flang/test/Fir/CUDA/cuda-global-addr.mlir @@ -34,7 +34,7 @@ func.func @_QQmain() attributes {fir.bindc_name = "test"} { // CHECK: %[[DECL:.*]] = fir.declare %[[DEVICE_ADDR_CONV]](%{{.*}}) {data_attr = #cuf.cuda, uniq_name = "_QMmod1Eadev"} : (!fir.ref>, !fir.shape<1>) -> !fir.ref> // CHECK: %[[ARRAY_COOR:.*]] = fir.array_coor %[[DECL]](%{{.*}}) %c4{{.*}} : (!fir.ref>, !fir.shape<1>, index) -> !fir.ref // CHECK: %[[ARRAY_COOR_PTR:.*]] = fir.convert %[[ARRAY_COOR]] : (!fir.ref) -> !fir.llvm_ptr -// CHECK: fir.call @_FortranACUFDataTransferPtrPtr(%[[ARRAY_COOR_PTR]], %{{.*}}, %{{.*}}, %{{.*}}, %{{.*}}, %{{.*}}) : (!fir.llvm_ptr, !fir.llvm_ptr, i64, i32, !fir.ref, i32) -> none +// CHECK: fir.call @_FortranACUFDataTransferPtrPtr(%[[ARRAY_COOR_PTR]], %{{.*}}, %{{.*}}, %{{.*}}, %{{.*}}, %{{.*}}) : (!fir.llvm_ptr, !fir.llvm_ptr, i64, i32, !fir.ref, i32) -> () // ----- @@ -65,3 +65,28 @@ module attributes {dlti.dl_spec = #dlti.dl_spec<#dlti.dl_entry : // There is no symbol for it and the call would result into an unresolved reference. 
// CHECK-NOT: fir.call {{.*}}GetDeviceAddress +// ----- + +module attributes {dlti.dl_spec = #dlti.dl_spec<#dlti.dl_entry : vector<2xi64>>, #dlti.dl_entry : vector<2xi64>>, #dlti.dl_entry : vector<2xi64>>, #dlti.dl_entry, dense<64> : vector<4xi64>>, #dlti.dl_entry, dense<32> : vector<4xi64>>, #dlti.dl_entry, dense<32> : vector<4xi64>>, #dlti.dl_entry : vector<2xi64>>, #dlti.dl_entry : vector<2xi64>>, #dlti.dl_entry : vector<2xi64>>, #dlti.dl_entry : vector<2xi64>>, #dlti.dl_entry : vector<2xi64>>, #dlti.dl_entry : vector<2xi64>>, #dlti.dl_entry : vector<2xi64>>, #dlti.dl_entry : vector<4xi64>>, #dlti.dl_entry<"dlti.endianness", "little">, #dlti.dl_entry<"dlti.stack_alignment", 128 : i64>>} { +fir.global @_QMmod1Eadev {data_attr = #cuf.cuda} : !fir.array<10xi32> { + %0 = fir.zero_bits !fir.array<10xi32> + fir.has_value %0 : !fir.array<10xi32> +} +func.func @_QQmain() attributes {fir.bindc_name = "test"} { + %dim = arith.constant 1 : index + gpu.launch blocks(%bx, %by, %bz) in (%grid_x = %dim, %grid_y = %dim, %grid_z = %dim) + threads(%tx, %ty, %tz) in (%block_x = %dim, %block_y = %dim, %block_z = %dim) { + %c10 = arith.constant 10 : index + %1 = fir.shape %c10 : (index) -> !fir.shape<1> + %3 = fir.address_of(@_QMmod1Eadev) : !fir.ref> + %4 = fir.declare %3(%1) {data_attr = #cuf.cuda, uniq_name = "_QMmod1Eadev"} : (!fir.ref>, !fir.shape<1>) -> !fir.ref> + gpu.terminator + } + return +} + +// CHECK-LABEL: func.func @_QQmain() +// CHECK: gpu.launch +// CHECK-NOT: fir.call {{.*}}GetDeviceAddress + +} diff --git a/flang/test/Fir/abstract-result-2.fir b/flang/test/Fir/abstract-result-2.fir index af13d57476e8c..8e59f60c8c244 100644 --- a/flang/test/Fir/abstract-result-2.fir +++ b/flang/test/Fir/abstract-result-2.fir @@ -21,11 +21,11 @@ func.func @_QMi8Pintrinsic_pack0(%arg0: !fir.box> {fir.bin %9 = fir.convert %arg0 : (!fir.box>) -> !fir.box %10 = fir.convert %5 : (!fir.box>) -> !fir.box %11 = fir.convert %arg2 : (!fir.box>) -> !fir.box - %12 = fir.call @_FortranAPack(%8, 
%9, %10, %11) : (!fir.ref>, !fir.box, !fir.box, !fir.box) -> none + fir.call @_FortranAPack(%8, %9, %10, %11) : (!fir.ref>, !fir.box, !fir.box, !fir.box) -> () %13 = fir.load %1 : !fir.ref>>> return %13 : !fir.box>> } -func.func private @_FortranAPack(!fir.ref>, !fir.box, !fir.box, !fir.box) -> none attributes {fir.runtime} +func.func private @_FortranAPack(!fir.ref>, !fir.box, !fir.box, !fir.box) -> () attributes {fir.runtime} // CHECK-LABEL: func.func private @empty // CHECK-SAME:(!fir.ref>>>, !fir.box> {fir.bindc_name = "array"}, !fir.ref> {fir.bindc_name = "mask"}, !fir.box> {fir.bindc_name = "vector", fir.optional}) diff --git a/flang/test/Fir/array-value-copy-3.fir b/flang/test/Fir/array-value-copy-3.fir index 2840c3c68d701..945a857e46669 100644 --- a/flang/test/Fir/array-value-copy-3.fir +++ b/flang/test/Fir/array-value-copy-3.fir @@ -23,7 +23,7 @@ func.func @test_overlap_with_alloc_components(%arg0: !fir.ref (!fir.array<10x!t_with_alloc_comp>) { %10 = fir.array_access %7, %arg1 : (!fir.array<10x!t_with_alloc_comp>, index) -> !fir.ref %11 = fir.array_access %arg2, %arg1 : (!fir.array<10x!t_with_alloc_comp>, index) -> !fir.ref - fir.call @custom_assign(%11, %10) : (!fir.ref, !fir.ref) -> none + fir.call @custom_assign(%11, %10) : (!fir.ref, !fir.ref) -> () %19 = fir.array_amend %arg2, %11 : (!fir.array<10x!t_with_alloc_comp>, !fir.ref) -> !fir.array<10x!t_with_alloc_comp> fir.result %19 : !fir.array<10x!t_with_alloc_comp> } @@ -41,7 +41,7 @@ func.func @test_overlap_with_alloc_components(%arg0: !fir.ref>>}>> // CHECK: %[[VAL_12:.*]] = fir.embox %[[VAL_11]](%[[VAL_9]]) : (!fir.heap>>}>>>, !fir.shape<1>) -> !fir.box>>}>>>> // CHECK: %[[VAL_16:.*]] = fir.convert %[[VAL_12]] : (!fir.box>>}>>>>) -> !fir.box -// CHECK: fir.call @_FortranAInitialize(%[[VAL_16]], %{{.*}}, %{{.*}}) : (!fir.box, !fir.ref, i32) -> none +// CHECK: fir.call @_FortranAInitialize(%[[VAL_16]], %{{.*}}, %{{.*}}) : (!fir.box, !fir.ref, i32) -> () // CHECK: fir.do_loop {{.*}} { // CHECK: 
fir.call @_FortranAAssign // CHECK: } @@ -52,5 +52,5 @@ func.func @test_overlap_with_alloc_components(%arg0: !fir.ref>>}>>>>) -> !fir.box -// CHECK: %[[VAL_73:.*]] = fir.call @_FortranADestroy(%[[VAL_72]]) : (!fir.box) -> none +// CHECK: fir.call @_FortranADestroy(%[[VAL_72]]) : (!fir.box) -> () // CHECK: fir.freemem %[[VAL_11]] diff --git a/flang/test/Fir/array-value-copy-4.fir b/flang/test/Fir/array-value-copy-4.fir index f120f054f6abd..85def88e6887c 100644 --- a/flang/test/Fir/array-value-copy-4.fir +++ b/flang/test/Fir/array-value-copy-4.fir @@ -43,7 +43,7 @@ func.func @_QMmodPsub1(%arg0: !fir.box>>}>>>) -> !fir.ref> %24 = fir.convert %21 : (!fir.box>>}>>) -> !fir.box %25 = fir.convert %22 : (!fir.ref>) -> !fir.ref - %26 = fir.call @_FortranAAssign(%23, %24, %25, %c9_i32) fastmath : (!fir.ref>, !fir.box, !fir.ref, i32) -> none + fir.call @_FortranAAssign(%23, %24, %25, %c9_i32) fastmath : (!fir.ref>, !fir.box, !fir.ref, i32) -> () %27 = fir.array_amend %arg2, %19 : (!fir.array>>}>>, !fir.ref>>}>>) -> !fir.array>>}>> fir.result %27 : !fir.array>>}>> } diff --git a/flang/test/Fir/basic-program.fir b/flang/test/Fir/basic-program.fir index 29a0f66157971..51e68d2157631 100644 --- a/flang/test/Fir/basic-program.fir +++ b/flang/test/Fir/basic-program.fir @@ -36,15 +36,19 @@ func.func @_QQmain() { // PASSES-NEXT: (S) 0 num-dce'd - Number of operations DCE'd // PASSES-NEXT: Pipeline Collection : ['fir.global', 'func.func', 'omp.declare_reduction', 'omp.private'] // PASSES-NEXT: 'fir.global' Pipeline +// PASSES-NEXT: SimplifyHLFIRIntrinsics // PASSES-NEXT: OptimizedBufferization // PASSES-NEXT: InlineHLFIRAssign // PASSES-NEXT: 'func.func' Pipeline +// PASSES-NEXT: SimplifyHLFIRIntrinsics // PASSES-NEXT: OptimizedBufferization // PASSES-NEXT: InlineHLFIRAssign // PASSES-NEXT: 'omp.declare_reduction' Pipeline +// PASSES-NEXT: SimplifyHLFIRIntrinsics // PASSES-NEXT: OptimizedBufferization // PASSES-NEXT: InlineHLFIRAssign // PASSES-NEXT: 'omp.private' Pipeline +// 
PASSES-NEXT: SimplifyHLFIRIntrinsics // PASSES-NEXT: OptimizedBufferization // PASSES-NEXT: InlineHLFIRAssign // PASSES-NEXT: LowerHLFIROrderedAssignments diff --git a/flang/test/Fir/boxproc-openmp.fir b/flang/test/Fir/boxproc-openmp.fir index 8b714539b5e85..9db053ad93c66 100644 --- a/flang/test/Fir/boxproc-openmp.fir +++ b/flang/test/Fir/boxproc-openmp.fir @@ -14,7 +14,7 @@ omp.private {type = private} @_QFsub1Et1_private_ref_rec__QFsub1Tt : !fir.ref> %4 = fir.convert %2 : (!fir.box ()>}>>) -> !fir.box %5 = fir.convert %3 : (!fir.ref>) -> !fir.ref - %6 = fir.call @_FortranAInitialize(%4, %5, %c1_i32) fastmath : (!fir.box, !fir.ref, i32) -> none + fir.call @_FortranAInitialize(%4, %5, %c1_i32) fastmath : (!fir.box, !fir.ref, i32) -> () //CHECK: omp.yield(%{{.*}} : !fir.ref ()}>>) omp.yield(%1 : !fir.ref ()>}>>) } @@ -52,7 +52,7 @@ omp.private {type = firstprivate} @_QFsub2Et1_firstprivate_ref_box_heap_rec__QFs %4 = fir.convert %arg1 : (!fir.ref ()>}>>>>) -> !fir.ref> %5 = fir.convert %2 : (!fir.box ()>}>>) -> !fir.box %6 = fir.convert %3 : (!fir.ref>) -> !fir.ref - %7 = fir.call @_FortranAAssign(%4, %5, %6, %c5_i32) : (!fir.ref>, !fir.box, !fir.ref, i32) -> none + fir.call @_FortranAAssign(%4, %5, %6, %c5_i32) : (!fir.ref>, !fir.box, !fir.ref, i32) -> () //CHECK: omp.yield(%{{.*}} : !fir.ref ()}>>>>) omp.yield(%arg1 : !fir.ref ()>}>>>>) } dealloc { @@ -78,10 +78,10 @@ func.func @_QPsub2() { } return } -func.func private @_FortranAInitialize(!fir.box, !fir.ref, i32) -> none attributes {fir.runtime} +func.func private @_FortranAInitialize(!fir.box, !fir.ref, i32) -> () attributes {fir.runtime} fir.global linkonce @_QQclXea constant : !fir.char<1,8> { %0 = fir.string_lit "pp.f90\00"(8) : !fir.char<1,8> fir.has_value %0 : !fir.char<1,8> } func.func private @_FortranAAllocatableDeallocate(!fir.ref>, i1, !fir.box, !fir.ref, i32) -> i32 attributes {fir.runtime} -func.func private @_FortranAAssign(!fir.ref>, !fir.box, !fir.ref, i32) -> none attributes {fir.runtime} 
+func.func private @_FortranAAssign(!fir.ref>, !fir.box, !fir.ref, i32) -> () attributes {fir.runtime} diff --git a/flang/test/Fir/polymorphic.fir b/flang/test/Fir/polymorphic.fir index 74b29ed6ca729..f9cf6fab6b707 100644 --- a/flang/test/Fir/polymorphic.fir +++ b/flang/test/Fir/polymorphic.fir @@ -169,16 +169,16 @@ func.func @_QMmod2Pinitp(%arg0: !fir.ref>> {fir.bindc_ %1 = fir.load %arg0 : !fir.ref>> %2 = fir.convert %0 : (!fir.ref>>) -> !fir.ref> %3 = fir.convert %1 : (!fir.class>) -> !fir.box - %4 = fir.call @_FortranAPointerAssociate(%2, %3) fastmath : (!fir.ref>, !fir.box) -> none + fir.call @_FortranAPointerAssociate(%2, %3) fastmath : (!fir.ref>, !fir.box) -> () return } -func.func private @_FortranAPointerAssociate(!fir.ref>, !fir.box) -> none attributes {fir.runtime} +func.func private @_FortranAPointerAssociate(!fir.ref>, !fir.box) -> () attributes {fir.runtime} // CHECK-LABEL: define void @_QMmod2Pinitp( // CHECK-SAME: ptr nocapture %[[ARG0:.*]]){{.*}}{ // CHECK: %[[ALLOCA_CLASS_NONE:.*]] = alloca { ptr, i64, i32, i8, i8, i8, i8, ptr, [1 x i64] } // CHECK: call void @llvm.memcpy.p0.p0.i32(ptr %[[ALLOCA_CLASS_NONE]], ptr %[[ARG0]], i32 40, i1 false) -// CHECK: %{{.*}} = call {} @_FortranAPointerAssociate(ptr @_QMmod2Ep, ptr %[[ALLOCA_CLASS_NONE]]) +// CHECK: call void @_FortranAPointerAssociate(ptr @_QMmod2Ep, ptr %[[ALLOCA_CLASS_NONE]]) // CHECK: ret void fir.global linkonce_odr @_QMmod1E.dt.p1 constant target : 
!fir.type<_QM__fortran_type_infoTderivedtype{binding:!fir.box,name:!fir.box>>}>>>>,name:!fir.box>>,sizeinbytes:i64,uninstantiated:!fir.box>>,kindparameter:!fir.box>>,lenparameterkind:!fir.box>>,component:!fir.box>>,genre:i8,category:i8,kind:i8,rank:i8,__padding0:!fir.array<4xi8>,offset:i64,characterlen:!fir.type<_QM__fortran_type_infoTvalue{genre:i8,__padding0:!fir.array<7xi8>,value:i64}>,derived:!fir.box>>,lenvalue:!fir.box,value:i64}>>>>,bounds:!fir.box,value:i64}>>>>,initialization:!fir.type<_QM__fortran_builtinsT__builtin_c_ptr{__address:i64}>}>>>>,procptr:!fir.box>>,offset:i64,initialization:!fir.type<_QM__fortran_builtinsT__builtin_c_funptr{__address:i64}>}>>>>,special:!fir.box,proc:!fir.type<_QM__fortran_builtinsT__builtin_c_funptr{__address:i64}>}>>>>,specialbitset:i32,hasparent:i8,noinitializationneeded:i8,nodestructionneeded:i8,nofinalizationneeded:i8,__padding0:!fir.array<4xi8>}> { diff --git a/flang/test/Fir/rebox_assumed_rank_codegen.fir b/flang/test/Fir/rebox_assumed_rank_codegen.fir index 3c4de0bef509f..b4336b9279493 100644 --- a/flang/test/Fir/rebox_assumed_rank_codegen.fir +++ b/flang/test/Fir/rebox_assumed_rank_codegen.fir @@ -55,7 +55,7 @@ func.func private @takes_assumed_rank_t(!fir.box>) // CHECK: %[[VAL_4:.*]] = fir.zero_bits !fir.ref // CHECK: %[[VAL_5:.*]] = fir.convert %[[VAL_3]] : (!fir.ref>>) -> !fir.ref> // CHECK: %[[VAL_6:.*]] = fir.convert %[[VAL_0]] : (!fir.box>) -> !fir.box -// CHECK: %[[VAL_7:.*]] = fir.call @_FortranACopyAndUpdateDescriptor(%[[VAL_5]], %[[VAL_6]], %[[VAL_4]], %[[VAL_2]], %[[VAL_1]]) : (!fir.ref>, !fir.box, !fir.ref, i8, i32) -> none +// CHECK: fir.call @_FortranACopyAndUpdateDescriptor(%[[VAL_5]], %[[VAL_6]], %[[VAL_4]], %[[VAL_2]], %[[VAL_1]]) : (!fir.ref>, !fir.box, !fir.ref, i8, i32) -> () // CHECK: %[[VAL_8:.*]] = fir.load %[[VAL_3]] : !fir.ref>> // CHECK: %[[VAL_9:.*]] = fir.convert %[[VAL_8]] : (!fir.box>) -> !fir.box> // CHECK: fir.call @somefunc(%[[VAL_9]]) : (!fir.box>) -> () @@ -78,7 +78,7 @@ func.func 
private @takes_assumed_rank_t(!fir.box>) // CHECK: %[[VAL_4:.*]] = fir.zero_bits !fir.ref // CHECK: %[[VAL_5:.*]] = fir.convert %[[VAL_3]] : (!fir.ref>>>) -> !fir.ref> // CHECK: %[[VAL_6:.*]] = fir.convert %[[VAL_0]] : (!fir.box>) -> !fir.box -// CHECK: %[[VAL_7:.*]] = fir.call @_FortranACopyAndUpdateDescriptor(%[[VAL_5]], %[[VAL_6]], %[[VAL_4]], %[[VAL_2]], %[[VAL_1]]) : (!fir.ref>, !fir.box, !fir.ref, i8, i32) -> none +// CHECK: fir.call @_FortranACopyAndUpdateDescriptor(%[[VAL_5]], %[[VAL_6]], %[[VAL_4]], %[[VAL_2]], %[[VAL_1]]) : (!fir.ref>, !fir.box, !fir.ref, i8, i32) -> () // CHECK: %[[VAL_8:.*]] = fir.load %[[VAL_3]] : !fir.ref>>> // CHECK: %[[VAL_9:.*]] = fir.convert %[[VAL_8]] : (!fir.box>>) -> !fir.box>> // CHECK: fir.call @somefuncalloc(%[[VAL_9]]) : (!fir.box>>) -> () @@ -93,7 +93,7 @@ func.func private @takes_assumed_rank_t(!fir.box>) // CHECK: %[[VAL_4:.*]] = fir.zero_bits !fir.ref // CHECK: %[[VAL_5:.*]] = fir.convert %[[VAL_3]] : (!fir.ref>>>) -> !fir.ref> // CHECK: %[[VAL_6:.*]] = fir.convert %[[VAL_0]] : (!fir.box>) -> !fir.box -// CHECK: %[[VAL_7:.*]] = fir.call @_FortranACopyAndUpdateDescriptor(%[[VAL_5]], %[[VAL_6]], %[[VAL_4]], %[[VAL_2]], %[[VAL_1]]) : (!fir.ref>, !fir.box, !fir.ref, i8, i32) -> none +// CHECK: fir.call @_FortranACopyAndUpdateDescriptor(%[[VAL_5]], %[[VAL_6]], %[[VAL_4]], %[[VAL_2]], %[[VAL_1]]) : (!fir.ref>, !fir.box, !fir.ref, i8, i32) -> () // CHECK: %[[VAL_8:.*]] = fir.load %[[VAL_3]] : !fir.ref>>> // CHECK: %[[VAL_9:.*]] = fir.convert %[[VAL_8]] : (!fir.box>>) -> !fir.box>> // CHECK: fir.call @somefuncpointer(%[[VAL_9]]) : (!fir.box>>) -> () @@ -109,7 +109,7 @@ func.func private @takes_assumed_rank_t(!fir.box>) // CHECK: %[[VAL_5:.*]] = fir.convert %[[VAL_3]] : (!fir.ref>>>) -> !fir.ref> // CHECK: %[[VAL_6:.*]] = fir.convert %[[VAL_0]] : (!fir.box,x:f32}>>>) -> !fir.box // CHECK: %[[VAL_7:.*]] = fir.convert %[[VAL_4]] : (!fir.tdesc>) -> !fir.ref -// CHECK: %[[VAL_8:.*]] = fir.call 
@_FortranACopyAndUpdateDescriptor(%[[VAL_5]], %[[VAL_6]], %[[VAL_7]], %[[VAL_2]], %[[VAL_1]]) : (!fir.ref>, !fir.box, !fir.ref, i8, i32) -> none +// CHECK: fir.call @_FortranACopyAndUpdateDescriptor(%[[VAL_5]], %[[VAL_6]], %[[VAL_7]], %[[VAL_2]], %[[VAL_1]]) : (!fir.ref>, !fir.box, !fir.ref, i8, i32) -> () // CHECK: %[[VAL_9:.*]] = fir.load %[[VAL_3]] : !fir.ref>>> // CHECK: %[[VAL_10:.*]] = fir.convert %[[VAL_9]] : (!fir.box>>) -> !fir.box>> // CHECK: fir.call @somefunct1(%[[VAL_10]]) : (!fir.box>>) -> () @@ -119,6 +119,6 @@ func.func private @takes_assumed_rank_t(!fir.box>) // CHECK-LABEL: func.func @test_poly_to_nonepoly( // CHECK: %[[VAL_4:.*]] = fir.type_desc !fir.type // CHECK: %[[VAL_7:.*]] = fir.convert %[[VAL_4]] : (!fir.tdesc>) -> !fir.ref -// CHECK: %[[VAL_8:.*]] = fir.call @_FortranACopyAndUpdateDescriptor(%{{.*}}, %{{.*}}, %[[VAL_7]], +// CHECK: fir.call @_FortranACopyAndUpdateDescriptor(%{{.*}}, %{{.*}}, %[[VAL_7]], -// CHECK: func.func private @_FortranACopyAndUpdateDescriptor(!fir.ref> {llvm.nocapture}, !fir.box {llvm.nocapture}, !fir.ref, i8, i32) -> none attributes {fir.runtime} +// CHECK: func.func private @_FortranACopyAndUpdateDescriptor(!fir.ref> {llvm.nocapture}, !fir.box {llvm.nocapture}, !fir.ref, i8, i32) attributes {fir.runtime} diff --git a/flang/test/Fir/tbaa-codegen2.fir b/flang/test/Fir/tbaa-codegen2.fir index 12232a29aae4a..8f8b6a29129e7 100644 --- a/flang/test/Fir/tbaa-codegen2.fir +++ b/flang/test/Fir/tbaa-codegen2.fir @@ -43,7 +43,7 @@ module attributes {fir.defaultkind = "a1c4d8i4l4r4", fir.kindmap = "", llvm.targ %9 = fir.convert %0 : (!fir.ref>>) -> !fir.ref> %10 = fir.convert %7 : (!fir.box>) -> !fir.box %11 = fir.convert %8 : (!fir.ref>) -> !fir.ref - %12 = fir.call @_FortranAAssign(%9, %10, %11, %c3_i32) : (!fir.ref>, !fir.box, !fir.ref, i32) -> none + fir.call @_FortranAAssign(%9, %10, %11, %c3_i32) : (!fir.ref>, !fir.box, !fir.ref, i32) -> () fir.freemem %6 : !fir.heap> %13 = fir.array_coor %2 %c2 : (!fir.box>, index) -> 
!fir.ref // load modified not to have tbaa @@ -53,7 +53,7 @@ module attributes {fir.defaultkind = "a1c4d8i4l4r4", fir.kindmap = "", llvm.targ fir.store %14 to %15 {tbaa = [#tbaa_tag]} : !fir.ref return } - func.func private @_FortranAAssign(!fir.ref>, !fir.box, !fir.ref, i32) -> none attributes {fir.runtime} + func.func private @_FortranAAssign(!fir.ref>, !fir.box, !fir.ref, i32) -> () attributes {fir.runtime} fir.global linkonce @_QQclX2F746D702F73696D706C652E66393000 constant : !fir.char<1,16> { %0 = fir.string_lit "/tmp/simple.f90\00"(16) : !fir.char<1,16> fir.has_value %0 : !fir.char<1,16> diff --git a/flang/test/HLFIR/all-lowering.fir b/flang/test/HLFIR/all-lowering.fir index e83378eacf9c9..df6771e565efc 100644 --- a/flang/test/HLFIR/all-lowering.fir +++ b/flang/test/HLFIR/all-lowering.fir @@ -50,7 +50,7 @@ func.func @_QPall2(%arg0: !fir.box>> {fir.bindc_n // CHECK-DAG: %[[RET_ARG:.*]] = fir.convert %[[RET_BOX]] // CHECK-DAG: %[[MASK_ARG:.*]] = fir.convert %[[MASK]]#1 -// CHECK: %[[NONE:.*]] = fir.call @_FortranAAllDim(%[[RET_ARG]], %[[MASK_ARG]], %[[DIM]], %[[LOC_STR:.*]], %[[LOC_N:.*]]) : (!fir.ref>, !fir.box, i32, !fir.ref, i32) -> none +// CHECK: fir.call @_FortranAAllDim(%[[RET_ARG]], %[[MASK_ARG]], %[[DIM]], %[[LOC_STR:.*]], %[[LOC_N:.*]]) : (!fir.ref>, !fir.box, i32, !fir.ref, i32) -> () // CHECK: %[[RET:.*]] = fir.load %[[RET_BOX]] // CHECK: %[[BOX_DIMS:.*]]:3 = fir.box_dims %[[RET]] // CHECK-NEXT: %[[ADDR:.*]] = fir.box_addr %[[RET]] @@ -96,7 +96,7 @@ func.func @_QPall3(%arg0: !fir.ref>> {fir.bindc_nam // CHECK-DAG: %[[RET_ARG:.*]] = fir.convert %[[RET_BOX]] // CHECK-DAG: %[[MASK_ARG:.*]] = fir.convert %[[MASK_BOX]] : (!fir.box>>) -> !fir.box -// CHECK: %[[NONE:.*]] = fir.call @_FortranAAllDim(%[[RET_ARG]], %[[MASK_ARG]], %[[DIM]], %[[LOC_STR:.*]], %[[LOC_N:.*]]) +// CHECK: fir.call @_FortranAAllDim(%[[RET_ARG]], %[[MASK_ARG]], %[[DIM]], %[[LOC_STR:.*]], %[[LOC_N:.*]]) // CHECK: %[[RET:.*]] = fir.load %[[RET_BOX]] // CHECK: %[[BOX_DIMS:.*]]:3 = 
fir.box_dims %[[RET]] // CHECK-NEXT: %[[ADDR:.*]] = fir.box_addr %[[RET]] @@ -144,7 +144,7 @@ func.func @_QPall4(%arg0: !fir.box>> {fir.bindc_n // CHECK-DAG: %[[RET_ARG:.*]] = fir.convert %[[RET_BOX]] // CHECK-DAG: %[[MASK_ARG:.*]] = fir.convert %[[MASK]]#1 -// CHECK: %[[NONE:.*]] = fir.call @_FortranAAllDim(%[[RET_ARG]], %[[MASK_ARG]], %[[DIM]], %[[LOC_STR:.*]], %[[LOC_N:.*]]) : (!fir.ref>, !fir.box, i32, !fir.ref, i32) -> none +// CHECK: fir.call @_FortranAAllDim(%[[RET_ARG]], %[[MASK_ARG]], %[[DIM]], %[[LOC_STR:.*]], %[[LOC_N:.*]]) : (!fir.ref>, !fir.box, i32, !fir.ref, i32) -> () // CHECK: %[[RET:.*]] = fir.load %[[RET_BOX]] // CHECK: %[[BOX_DIMS:.*]]:3 = fir.box_dims %[[RET]] // CHECK-NEXT: %[[ADDR:.*]] = fir.box_addr %[[RET]] diff --git a/flang/test/HLFIR/any-elemental.fir b/flang/test/HLFIR/any-elemental.fir index 6e233068d2e9b..a7c559679d965 100644 --- a/flang/test/HLFIR/any-elemental.fir +++ b/flang/test/HLFIR/any-elemental.fir @@ -161,7 +161,7 @@ func.func @_Qtest_recursive() attributes {fir.bindc_name = "test"} { %25 = arith.xori %24, %true : i1 cf.cond_br %25, ^bb1, ^bb2 ^bb1: // pred: ^bb0 - %26 = fir.call @_FortranAStopStatement(%c2_i32, %false, %false) fastmath : (i32, i1, i1) -> none + fir.call @_FortranAStopStatement(%c2_i32, %false, %false) fastmath : (i32, i1, i1) -> () fir.unreachable ^bb2: // pred: ^bb0 return diff --git a/flang/test/HLFIR/any-lowering.fir b/flang/test/HLFIR/any-lowering.fir index 039146727d3f5..72fcdd37b6193 100644 --- a/flang/test/HLFIR/any-lowering.fir +++ b/flang/test/HLFIR/any-lowering.fir @@ -52,7 +52,7 @@ func.func @_QPany2(%arg0: !fir.box>> {fir.bindc_n // CHECK-DAG: %[[RET_ARG:.*]] = fir.convert %[[RET_BOX]] // CHECK-DAG: %[[MASK_ARG:.*]] = fir.convert %[[MASK]]#1 -// CHECK: %[[NONE:.*]] = fir.call @_FortranAAnyDim(%[[RET_ARG]], %[[MASK_ARG]], %[[DIM]], %[[LOC_STR:.*]], %[[LOC_N:.*]]) : (!fir.ref>, !fir.box, i32, !fir.ref, i32) -> none +// CHECK: fir.call @_FortranAAnyDim(%[[RET_ARG]], %[[MASK_ARG]], %[[DIM]], 
%[[LOC_STR:.*]], %[[LOC_N:.*]]) : (!fir.ref>, !fir.box, i32, !fir.ref, i32) -> () // CHECK: %[[RET:.*]] = fir.load %[[RET_BOX]] // CHECK: %[[BOX_DIMS:.*]]:3 = fir.box_dims %[[RET]] // CHECK-NEXT: %[[ADDR:.*]] = fir.box_addr %[[RET]] @@ -99,7 +99,7 @@ func.func @_QPany3(%arg0: !fir.ref>> {fir.bindc_nam // CHECK-DAG: %[[RET_ARG:.*]] = fir.convert %[[RET_BOX]] // CHECK-DAG: %[[MASK_ARG:.*]] = fir.convert %[[MASK_BOX]] : (!fir.box>>) -> !fir.box -// CHECK: %[[NONE:.*]] = fir.call @_FortranAAnyDim(%[[RET_ARG]], %[[MASK_ARG]], %[[DIM]], %[[LOC_STR:.*]], %[[LOC_N:.*]]) +// CHECK: fir.call @_FortranAAnyDim(%[[RET_ARG]], %[[MASK_ARG]], %[[DIM]], %[[LOC_STR:.*]], %[[LOC_N:.*]]) // CHECK: %[[RET:.*]] = fir.load %[[RET_BOX]] // CHECK: %[[BOX_DIMS:.*]]:3 = fir.box_dims %[[RET]] // CHECK-NEXT: %[[ADDR:.*]] = fir.box_addr %[[RET]] @@ -148,7 +148,7 @@ func.func @_QPany4(%arg0: !fir.box>> {fir.bindc_n // CHECK-DAG: %[[RET_ARG:.*]] = fir.convert %[[RET_BOX]] // CHECK-DAG: %[[MASK_ARG:.*]] = fir.convert %[[MASK]]#1 -// CHECK: %[[NONE:.*]] = fir.call @_FortranAAnyDim(%[[RET_ARG]], %[[MASK_ARG]], %[[DIM]], %[[LOC_STR:.*]], %[[LOC_N:.*]]) : (!fir.ref>, !fir.box, i32, !fir.ref, i32) -> none +// CHECK: fir.call @_FortranAAnyDim(%[[RET_ARG]], %[[MASK_ARG]], %[[DIM]], %[[LOC_STR:.*]], %[[LOC_N:.*]]) : (!fir.ref>, !fir.box, i32, !fir.ref, i32) -> () // CHECK: %[[RET:.*]] = fir.load %[[RET_BOX]] // CHECK: %[[BOX_DIMS:.*]]:3 = fir.box_dims %[[RET]] // CHECK-NEXT: %[[ADDR:.*]] = fir.box_addr %[[RET]] diff --git a/flang/test/HLFIR/assign-codegen.fir b/flang/test/HLFIR/assign-codegen.fir index e0dcc06d75a9c..581d1ab0e7739 100644 --- a/flang/test/HLFIR/assign-codegen.fir +++ b/flang/test/HLFIR/assign-codegen.fir @@ -142,7 +142,7 @@ func.func @array(%arg0: !fir.box>, %arg1: !fir.ref>> // CHECK: %[[VAL_26:.*]] = fir.convert %[[VAL_2]] : (!fir.ref>>) -> !fir.ref> // CHECK: %[[VAL_27:.*]] = fir.convert %[[VAL_10]] : (!fir.box>) -> !fir.box -// CHECK: %[[VAL_29:.*]] = fir.call 
@_FortranAAssign(%[[VAL_26]], %[[VAL_27]], %{{.*}}, %{{.*}}) : (!fir.ref>, !fir.box, !fir.ref, i32) -> none +// CHECK: fir.call @_FortranAAssign(%[[VAL_26]], %[[VAL_27]], %{{.*}}, %{{.*}}) : (!fir.ref>, !fir.box, !fir.ref, i32) -> () func.func @array_temp(%arg0: !fir.box>, %arg1: !fir.ref>) { @@ -167,7 +167,7 @@ func.func @array_temp(%arg0: !fir.box>, %arg1: !fir.ref>> // CHECK: %[[VAL_26:.*]] = fir.convert %[[VAL_2]] : (!fir.ref>>) -> !fir.ref> // CHECK: %[[VAL_27:.*]] = fir.convert %[[VAL_10]] : (!fir.box>) -> !fir.box -// CHECK: %[[VAL_29:.*]] = fir.call @_FortranAAssignTemporary(%[[VAL_26]], %[[VAL_27]], %{{.*}}, %{{.*}}) : (!fir.ref>, !fir.box, !fir.ref, i32) -> none +// CHECK: fir.call @_FortranAAssignTemporary(%[[VAL_26]], %[[VAL_27]], %{{.*}}, %{{.*}}) : (!fir.ref>, !fir.box, !fir.ref, i32) -> () func.func @test_scalar_to_array(%lhs: !fir.box>, %rhs: i32) { @@ -184,7 +184,7 @@ func.func @test_scalar_to_array(%lhs: !fir.box>, %rhs: i32) { // CHECK: fir.store %[[VAL_0]] to %[[VAL_2]] : !fir.ref>> // CHECK: %[[VAL_10:.*]] = fir.convert %[[VAL_2]] : (!fir.ref>>) -> !fir.ref> // CHECK: %[[VAL_11:.*]] = fir.convert %[[VAL_6]] : (!fir.box) -> !fir.box -// CHECK: %[[VAL_13:.*]] = fir.call @_FortranAAssign(%[[VAL_10]], %[[VAL_11]] +// CHECK: fir.call @_FortranAAssign(%[[VAL_10]], %[[VAL_11]] func.func @test_i1_scalar_to_array(%lhs: !fir.box>>, %rhs: i1) { @@ -196,7 +196,7 @@ func.func @test_i1_scalar_to_array(%lhs: !fir.box>> // CHECK: %[[VAL_6:.*]] = fir.alloca !fir.logical<4> // CHECK: %[[VAL_7:.*]] = fir.embox %[[VAL_6]] : (!fir.ref>) -> !fir.box> // CHECK: %[[VAL_12:.*]] = fir.convert %[[VAL_7]] : (!fir.box>) -> !fir.box -// CHECK: %[[VAL_14:.*]] = fir.call @_FortranAAssign(%{{.*}}, %[[VAL_12]] +// CHECK: fir.call @_FortranAAssign(%{{.*}}, %[[VAL_12]] func.func @alloc_assign(%arg0: !fir.ref>>>, %arg1: !fir.box>) { hlfir.assign %arg1 to %arg0 realloc : !fir.box>, !fir.ref>>> @@ -207,7 +207,7 @@ func.func @alloc_assign(%arg0: !fir.ref>>>, // CHECK-SAME: 
%[[VAL_1:.*]]: !fir.box>) { // CHECK: %[[VAL_2:.*]] = fir.convert %[[VAL_0]] : (!fir.ref>>>) -> !fir.ref> // CHECK: %[[VAL_3:.*]] = fir.convert %[[VAL_1]] : (!fir.box>) -> !fir.box -// CHECK: fir.call @_FortranAAssign(%[[VAL_2]], %[[VAL_3]], %{{.*}}, %{{.*}}) : (!fir.ref>, !fir.box, !fir.ref, i32) -> none +// CHECK: fir.call @_FortranAAssign(%[[VAL_2]], %[[VAL_3]], %{{.*}}, %{{.*}}) : (!fir.ref>, !fir.box, !fir.ref, i32) -> () func.func @alloc_assign_temp(%arg0: !fir.ref>>>, %arg1: !fir.box>) { hlfir.assign %arg1 to %arg0 realloc temporary_lhs : !fir.box>, !fir.ref>>> @@ -218,7 +218,7 @@ func.func @alloc_assign_temp(%arg0: !fir.ref>) { // CHECK: %[[VAL_2:.*]] = fir.convert %[[VAL_0]] : (!fir.ref>>>) -> !fir.ref> // CHECK: %[[VAL_3:.*]] = fir.convert %[[VAL_1]] : (!fir.box>) -> !fir.box -// CHECK: fir.call @_FortranAAssignTemporary(%[[VAL_2]], %[[VAL_3]], %{{.*}}, %{{.*}}) : (!fir.ref>, !fir.box, !fir.ref, i32) -> none +// CHECK: fir.call @_FortranAAssignTemporary(%[[VAL_2]], %[[VAL_3]], %{{.*}}, %{{.*}}) : (!fir.ref>, !fir.box, !fir.ref, i32) -> () func.func @test_alloc_assign_explicit_length_character(%lhs: !fir.ref>>>>, %rhs: !fir.box>>) { hlfir.assign %rhs to %lhs realloc keep_lhs_len : !fir.box>>, !fir.ref>>>> @@ -229,7 +229,7 @@ func.func @test_alloc_assign_explicit_length_character(%lhs: !fir.ref>>) { // CHECK: %[[VAL_7:.*]] = fir.convert %[[VAL_0]] : (!fir.ref>>>>) -> !fir.ref> // CHECK: %[[VAL_8:.*]] = fir.convert %[[VAL_1]] : (!fir.box>>) -> !fir.box -// CHECK: %[[VAL_10:.*]] = fir.call @_FortranAAssignExplicitLengthCharacter(%[[VAL_7]], %[[VAL_8]], %{{.*}}, %{{.*}}) : (!fir.ref>, !fir.box, !fir.ref, i32) -> none +// CHECK: fir.call @_FortranAAssignExplicitLengthCharacter(%[[VAL_7]], %[[VAL_8]], %{{.*}}, %{{.*}}) : (!fir.ref>, !fir.box, !fir.ref, i32) -> () func.func @test_alloc_assign_polymorphic(%lhs: !fir.ref>>>>, %rhs: !fir.class>>) { hlfir.assign %rhs to %lhs realloc : !fir.class>>, !fir.ref>>>> @@ -240,7 +240,7 @@ func.func 
@test_alloc_assign_polymorphic(%lhs: !fir.ref>>) { // CHECK: %[[VAL_7:.*]] = fir.convert %[[VAL_0]] : (!fir.ref>>>>) -> !fir.ref> // CHECK: %[[VAL_8:.*]] = fir.convert %[[VAL_1]] : (!fir.class>>) -> !fir.box -// CHECK: %[[VAL_10:.*]] = fir.call @_FortranAAssignPolymorphic(%[[VAL_7]], %[[VAL_8]], %{{.*}}, %{{.*}}) : (!fir.ref>, !fir.box, !fir.ref, i32) -> none +// CHECK: fir.call @_FortranAAssignPolymorphic(%[[VAL_7]], %[[VAL_8]], %{{.*}}, %{{.*}}) : (!fir.ref>, !fir.box, !fir.ref, i32) -> () func.func @assing_scalar_int_to_polymorphic(%arg0: !fir.ref>>) { %c123_i32 = arith.constant 123 : i32 @@ -258,7 +258,7 @@ func.func @assing_scalar_int_to_polymorphic(%arg0: !fir.ref) -> !fir.box // CHECK: %[[VAL_8:.*]] = fir.convert %[[VAL_2]] : (!fir.ref>>) -> !fir.ref> // CHECK: %[[VAL_9:.*]] = fir.convert %[[VAL_4]] : (!fir.box) -> !fir.box -// CHECK: %[[VAL_11:.*]] = fir.call @_FortranAAssignPolymorphic(%[[VAL_8]], %[[VAL_9]], %{{.*}}, %{{.*}}) : (!fir.ref>, !fir.box, !fir.ref, i32) -> none +// CHECK: fir.call @_FortranAAssignPolymorphic(%[[VAL_8]], %[[VAL_9]], %{{.*}}, %{{.*}}) : (!fir.ref>, !fir.box, !fir.ref, i32) -> () func.func @assign_i1_to_polymorphic(%arg0: !fir.ref>>) { %false = arith.constant false @@ -279,7 +279,7 @@ func.func @assign_i1_to_polymorphic(%arg0: !fir.ref>> // CHECK: %[[VAL_6:.*]] = fir.embox %[[VAL_5]] : (!fir.ref>) -> !fir.box> // CHECK: %[[VAL_10:.*]] = fir.convert %[[VAL_2]] : (!fir.ref>>) -> !fir.ref> // CHECK: %[[VAL_11:.*]] = fir.convert %[[VAL_6]] : (!fir.box>) -> !fir.box -// CHECK: %[[VAL_13:.*]] = fir.call @_FortranAAssignPolymorphic(%[[VAL_10]], %[[VAL_11]], %{{.*}}, %{{.*}}) : (!fir.ref>, !fir.box, !fir.ref, i32) -> none +// CHECK: fir.call @_FortranAAssignPolymorphic(%[[VAL_10]], %[[VAL_11]], %{{.*}}, %{{.*}}) : (!fir.ref>, !fir.box, !fir.ref, i32) -> () func.func @test_alloc_assign_polymorphic_temp(%lhs: !fir.ref>>>>, %rhs: !fir.class>>) { hlfir.assign %rhs to %lhs realloc temporary_lhs : !fir.class>>, !fir.ref>>>> @@ -290,7 +290,7 @@ 
func.func @test_alloc_assign_polymorphic_temp(%lhs: !fir.ref>>) { // CHECK: %[[VAL_7:.*]] = fir.convert %[[VAL_0]] : (!fir.ref>>>>) -> !fir.ref> // CHECK: %[[VAL_8:.*]] = fir.convert %[[VAL_1]] : (!fir.class>>) -> !fir.box -// CHECK: %[[VAL_10:.*]] = fir.call @_FortranAAssignTemporary(%[[VAL_7]], %[[VAL_8]], %{{.*}}, %{{.*}}) : (!fir.ref>, !fir.box, !fir.ref, i32) -> none +// CHECK: fir.call @_FortranAAssignTemporary(%[[VAL_7]], %[[VAL_8]], %{{.*}}, %{{.*}}) : (!fir.ref>, !fir.box, !fir.ref, i32) -> () func.func @test_allocatable_component(%arg0: !fir.ref>>}>> {fir.bindc_name = "x", fir.target}, %arg1: !fir.ref>>}>> {fir.bindc_name = "y", fir.target}) { %4:2 = hlfir.declare %arg0 {fortran_attrs = #fir.var_attrs, uniq_name = "_QFtestEx"} : (!fir.ref>>}>>) -> (!fir.ref>>}>>, !fir.ref>>}>>) @@ -310,7 +310,7 @@ func.func @test_allocatable_component(%arg0: !fir.ref>>}>>>) -> !fir.ref> // CHECK: %[[VAL_11:.*]] = fir.convert %[[VAL_6]] : (!fir.box>>}>>) -> !fir.box // CHECK: %[[VAL_12:.*]] = fir.convert %{{.*}} : (!fir.ref>) -> !fir.ref -// CHECK: %[[VAL_13:.*]] = fir.call @_FortranAAssign(%[[VAL_10]], %[[VAL_11]], %[[VAL_12]], %{{.*}}) : (!fir.ref>, !fir.box, !fir.ref, i32) -> none +// CHECK: fir.call @_FortranAAssign(%[[VAL_10]], %[[VAL_11]], %[[VAL_12]], %{{.*}}) : (!fir.ref>, !fir.box, !fir.ref, i32) -> () // CHECK: return // CHECK: } @@ -332,7 +332,7 @@ func.func @test_allocatable_component_temp(%arg0: !fir.ref>>}>>>) -> !fir.ref> // CHECK: %[[VAL_11:.*]] = fir.convert %[[VAL_6]] : (!fir.box>>}>>) -> !fir.box // CHECK: %[[VAL_12:.*]] = fir.convert %{{.*}} : (!fir.ref>) -> !fir.ref -// CHECK: %[[VAL_13:.*]] = fir.call @_FortranAAssignTemporary(%[[VAL_10]], %[[VAL_11]], %[[VAL_12]], %{{.*}}) : (!fir.ref>, !fir.box, !fir.ref, i32) -> none +// CHECK: fir.call @_FortranAAssignTemporary(%[[VAL_10]], %[[VAL_11]], %[[VAL_12]], %{{.*}}) : (!fir.ref>, !fir.box, !fir.ref, i32) -> () // CHECK: return // CHECK: } @@ -357,7 +357,7 @@ func.func 
@_QFPtest_scalar_lhs_finalization(%arg0: !fir.ref>>) -> !fir.ref> // CHECK: %[[VAL_11:.*]] = fir.convert %[[VAL_5]] : (!fir.box>) -> !fir.box // CHECK: %[[VAL_12:.*]] = fir.convert %{{.*}} : (!fir.ref>) -> !fir.ref -// CHECK: %[[VAL_13:.*]] = fir.call @_FortranAAssign(%[[VAL_10]], %[[VAL_11]], %[[VAL_12]], %{{.*}}) : (!fir.ref>, !fir.box, !fir.ref, i32) -> none +// CHECK: fir.call @_FortranAAssign(%[[VAL_10]], %[[VAL_11]], %[[VAL_12]], %{{.*}}) : (!fir.ref>, !fir.box, !fir.ref, i32) -> () // CHECK: return // CHECK: } @@ -423,7 +423,7 @@ func.func @test_upoly_expr_assignment(%arg0: !fir.class> {fir // CHECK: %[[VAL_26:.*]] = fir.convert %[[VAL_1]] : (!fir.ref>) -> !fir.ref> // CHECK: %[[VAL_27:.*]] = fir.convert %[[VAL_12]] : (!fir.class) -> !fir.box // CHECK: %[[VAL_28:.*]] = fir.convert %[[VAL_23]] : (!fir.ref>) -> !fir.ref -// CHECK: %[[VAL_29:.*]] = fir.call @_FortranAAssignTemporary(%[[VAL_26]], %[[VAL_27]], %[[VAL_28]], %[[VAL_25]]) : (!fir.ref>, !fir.box, !fir.ref, i32) -> none +// CHECK: fir.call @_FortranAAssignTemporary(%[[VAL_26]], %[[VAL_27]], %[[VAL_28]], %[[VAL_25]]) : (!fir.ref>, !fir.box, !fir.ref, i32) -> () // CHECK: } // CHECK: return // CHECK: } diff --git a/flang/test/HLFIR/assumed_shape_with_value_keyword.f90 b/flang/test/HLFIR/assumed_shape_with_value_keyword.f90 index feea3712a822c..0f904041b7101 100644 --- a/flang/test/HLFIR/assumed_shape_with_value_keyword.f90 +++ b/flang/test/HLFIR/assumed_shape_with_value_keyword.f90 @@ -140,7 +140,7 @@ subroutine test_optional3(x) ! CHECK: %[[C0_I32:.*]] = arith.constant 0 : i32 ! CHECK: %[[FALSE:.*]] = arith.constant false ! CHECK: %[[FALSE_0:.*]] = arith.constant false -! CHECK: %[[VAL_2:.*]] = fir.call @_FortranAStopStatement(%[[C0_I32]], %[[FALSE]], %[[FALSE]]_0) fastmath : (i32, i1, i1) -> none +! CHECK: fir.call @_FortranAStopStatement(%[[C0_I32]], %[[FALSE]], %[[FALSE]]_0) fastmath : (i32, i1, i1) -> () ! CHECK: fir.unreachable ! CHECK: b2: // pred: ^bb0 ! 
CHECK: return diff --git a/flang/test/HLFIR/boxchar_emboxing.f90 b/flang/test/HLFIR/boxchar_emboxing.f90 index c25a5c283e369..787aa8325a8c8 100644 --- a/flang/test/HLFIR/boxchar_emboxing.f90 +++ b/flang/test/HLFIR/boxchar_emboxing.f90 @@ -22,7 +22,7 @@ ! CHECK: %[[VAL_13:.*]] = fir.convert %[[VAL_10]] : (index) -> i64 ! CHECK: %[[VAL_14:.*]] = arith.constant false ! CHECK: %[[VAL_15:.*]] = arith.constant false -! CHECK: %[[VAL_16:.*]] = fir.call @_FortranAStopStatementText(%[[VAL_12]], %[[VAL_13]], %[[VAL_14]], %[[VAL_15]]) fastmath : (!fir.ref, i64, i1, i1) -> none +! CHECK: fir.call @_FortranAStopStatementText(%[[VAL_12]], %[[VAL_13]], %[[VAL_14]], %[[VAL_15]]) fastmath : (!fir.ref, i64, i1, i1) -> () ! CHECK: fir.unreachable ! CHECK: ^bb3: ! CHECK: return @@ -66,7 +66,7 @@ end subroutine test1 ! CHECK: %[[VAL_15:.*]] = fir.convert %[[VAL_12]] : (index) -> i64 ! CHECK: %[[VAL_16:.*]] = arith.constant false ! CHECK: %[[VAL_17:.*]] = arith.constant false -! CHECK: %[[VAL_18:.*]] = fir.call @_FortranAStopStatementText(%[[VAL_14]], %[[VAL_15]], %[[VAL_16]], %[[VAL_17]]) fastmath : (!fir.ref, i64, i1, i1) -> none +! CHECK: fir.call @_FortranAStopStatementText(%[[VAL_14]], %[[VAL_15]], %[[VAL_16]], %[[VAL_17]]) fastmath : (!fir.ref, i64, i1, i1) -> () ! CHECK: fir.unreachable ! CHECK: ^bb3: ! 
CHECK: return diff --git a/flang/test/HLFIR/bufferize-destroy-for-derived.fir b/flang/test/HLFIR/bufferize-destroy-for-derived.fir index a3c756682777f..618ebf8028225 100644 --- a/flang/test/HLFIR/bufferize-destroy-for-derived.fir +++ b/flang/test/HLFIR/bufferize-destroy-for-derived.fir @@ -25,7 +25,7 @@ func.func @_QPtest1(%arg0: !fir.box>}>>>, !fir.box>}>>> // CHECK-NEXT: %[[VAL_18:.*]] = fir.box_addr %[[VAL_7]]#0 : (!fir.box>}>>>) -> !fir.heap>}>>> // CHECK-NEXT: %[[VAL_19:.*]] = fir.convert %[[VAL_7]]#0 : (!fir.box>}>>>) -> !fir.box -// CHECK-NEXT: %[[VAL_20:.*]] = fir.call @_FortranADestroyWithoutFinalization(%[[VAL_19]]) : (!fir.box) -> none +// CHECK-NEXT: fir.call @_FortranADestroyWithoutFinalization(%[[VAL_19]]) : (!fir.box) -> () // CHECK-NEXT: fir.freemem %[[VAL_18]] : !fir.heap>}>>> // CHECK-NEXT: return // CHECK-NEXT: } @@ -57,9 +57,9 @@ func.func @_QPtest2(%arg0: !fir.box>}>>>) -> !fir.box // CHECK-NEXT: %[[VAL_23:.*]] = fir.convert %[[VAL_19]] : (!fir.ref>) -> !fir.ref -// CHECK-NEXT: %[[VAL_24:.*]] = fir.call @_FortranAFinalize(%[[VAL_22]], %[[VAL_23]], %[[VAL_21]]) : (!fir.box, !fir.ref, i32) -> none +// CHECK-NEXT: fir.call @_FortranAFinalize(%[[VAL_22]], %[[VAL_23]], %[[VAL_21]]) : (!fir.box, !fir.ref, i32) -> () // CHECK-NEXT: %[[VAL_25:.*]] = fir.convert %[[VAL_7]]#0 : (!fir.box>}>>>) -> !fir.box -// CHECK-NEXT: %[[VAL_26:.*]] = fir.call @_FortranADestroyWithoutFinalization(%[[VAL_25]]) : (!fir.box) -> none +// CHECK-NEXT: fir.call @_FortranADestroyWithoutFinalization(%[[VAL_25]]) : (!fir.box) -> () // CHECK-NEXT: fir.freemem %[[VAL_18]] : !fir.heap>}>>> // CHECK-NEXT: return // CHECK-NEXT: } @@ -91,7 +91,7 @@ func.func @_QPtest3(%arg0: !fir.box>> // CHECK-NEXT: %[[VAL_21:.*]] = arith.constant {{[0-9]*}} : i32 // CHECK-NEXT: %[[VAL_22:.*]] = fir.convert %[[VAL_7]]#0 : (!fir.box>>) -> !fir.box // CHECK-NEXT: %[[VAL_23:.*]] = fir.convert %[[VAL_19]] : (!fir.ref>) -> !fir.ref -// CHECK-NEXT: %[[VAL_24:.*]] = fir.call @_FortranAFinalize(%[[VAL_22]], 
%[[VAL_23]], %[[VAL_21]]) : (!fir.box, !fir.ref, i32) -> none +// CHECK-NEXT: fir.call @_FortranAFinalize(%[[VAL_22]], %[[VAL_23]], %[[VAL_21]]) : (!fir.box, !fir.ref, i32) -> () // CHECK-NEXT: fir.freemem %[[VAL_18]] : !fir.heap>> // CHECK-NEXT: return // CHECK-NEXT: } diff --git a/flang/test/HLFIR/bufferize-end-associate-for-derived.fir b/flang/test/HLFIR/bufferize-end-associate-for-derived.fir index 089fe574893db..aad297d0b072f 100644 --- a/flang/test/HLFIR/bufferize-end-associate-for-derived.fir +++ b/flang/test/HLFIR/bufferize-end-associate-for-derived.fir @@ -16,7 +16,7 @@ func.func @_QPtest1(%arg0: !fir.box) -> none +// CHECK: fir.call @_FortranADestroyWithoutFinalization(%{{.*}}) : (!fir.box) -> () // CHECK-NOT: fir.call @_Fortran func.func @_QPtest2(%arg0: !fir.box>}>>> {fir.bindc_name = "x"}) { @@ -33,7 +33,7 @@ func.func @_QPtest2(%arg0: !fir.box) -> none +// CHECK: fir.call @_FortranADestroyWithoutFinalization(%{{.*}}) : (!fir.box) -> () // CHECK-NOT: fir.call @_Fortran func.func @_QPtest3(%arg0: !fir.box>> {fir.bindc_name = "x"}) { diff --git a/flang/test/HLFIR/bufferize-poly-expr.fir b/flang/test/HLFIR/bufferize-poly-expr.fir index dfa62a9ac5ab7..49c2347b2b26d 100644 --- a/flang/test/HLFIR/bufferize-poly-expr.fir +++ b/flang/test/HLFIR/bufferize-poly-expr.fir @@ -24,7 +24,7 @@ func.func @test_poly_expr_without_associate() { // CHECK: %[[VAL_9:.*]] = arith.constant 0 : i32 // CHECK: %[[VAL_10:.*]] = fir.convert %[[VAL_0]] : (!fir.ref>>>) -> !fir.ref> // CHECK: %[[VAL_11:.*]] = fir.convert %[[VAL_4]]#1 : (!fir.class>>) -> !fir.box -// CHECK: %[[VAL_12:.*]] = fir.call @_FortranAAllocatableApplyMold(%[[VAL_10]], %[[VAL_11]], %[[VAL_9]]) : (!fir.ref>, !fir.box, i32) -> none +// CHECK: fir.call @_FortranAAllocatableApplyMold(%[[VAL_10]], %[[VAL_11]], %[[VAL_9]]) : (!fir.ref>, !fir.box, i32) -> () // CHECK: hlfir.assign %[[VAL_4]]#0 to %[[VAL_8]]#0 realloc temporary_lhs : !fir.class>>, !fir.ref>>> // CHECK: %[[VAL_8B:.*]] = fir.load %[[VAL_8]]#0 // CHECK: 
%[[VAL_13:.*]] = fir.undefined tuple>>, i1> @@ -47,7 +47,7 @@ func.func @test_poly_expr_with_associate(%arg1: !fir.class !fir.shape<1> %9:3 = hlfir.associate %6(%8) {uniq_name = ".tmp.assign"} : (!hlfir.expr?>, !fir.shape<1>) -> (!fir.class>>>, !fir.class>>>, i1) %10 = fir.convert %0 : (!fir.ref>>>>) -> !fir.box - %11 = fir.call @_FortranADestroy(%10) fastmath : (!fir.box) -> none + fir.call @_FortranADestroy(%10) fastmath : (!fir.box) -> () %c3 = arith.constant 3 : index %12 = fir.shape %c3 : (index) -> !fir.shape<1> %c1 = arith.constant 1 : index @@ -79,7 +79,7 @@ func.func @test_poly_expr_with_associate(%arg1: !fir.class>>>>) -> !fir.ref> // CHECK: %[[VAL_17:.*]] = fir.convert %[[VAL_5]] : (!fir.class>>>) -> !fir.box -// CHECK: %[[VAL_18:.*]] = fir.call @_FortranAAllocatableApplyMold(%[[VAL_16]], %[[VAL_17]], %[[VAL_15]]) : (!fir.ref>, !fir.box, i32) -> none +// CHECK: fir.call @_FortranAAllocatableApplyMold(%[[VAL_16]], %[[VAL_17]], %[[VAL_15]]) : (!fir.ref>, !fir.box, i32) -> () // CHECK: hlfir.assign %[[VAL_5]] to %[[VAL_14]]#0 realloc temporary_lhs : !fir.class>>>, !fir.ref>>>> // CHECK: %[[VAL_14B:.*]] = fir.load %[[VAL_14]]#0 // CHECK: %[[VAL_19:.*]] = fir.undefined tuple>>>, i1> @@ -89,7 +89,7 @@ func.func @test_poly_expr_with_associate(%arg1: !fir.class>>>, index) -> (index, index, index) // CHECK: %[[VAL_24:.*]] = fir.shape %[[VAL_23]]#1 : (index) -> !fir.shape<1> // CHECK: %[[VAL_27:.*]] = fir.convert %[[VAL_2]] : (!fir.ref>>>>) -> !fir.box -// CHECK: %[[VAL_28:.*]] = fir.call @_FortranADestroy(%[[VAL_27]]) fastmath : (!fir.box) -> none +// CHECK: fir.call @_FortranADestroy(%[[VAL_27]]) fastmath : (!fir.box) -> () // CHECK: %[[VAL_29:.*]] = arith.constant 3 : index // CHECK: %[[VAL_30:.*]] = fir.shape %[[VAL_29]] : (index) -> !fir.shape<1> // CHECK: %[[VAL_31:.*]] = arith.constant 1 : index diff --git a/flang/test/HLFIR/bufferize01.fir b/flang/test/HLFIR/bufferize01.fir index 02ac6076268af..40e2769e459c1 100644 --- a/flang/test/HLFIR/bufferize01.fir 
+++ b/flang/test/HLFIR/bufferize01.fir @@ -27,7 +27,7 @@ // CHECK: %[[VAL_15:.*]] = fir.address_of(@_QQclXce30ef70ff16a711a97719fb946c0b3d) : !fir.ref> // CHECK: %[[VAL_16:.*]] = fir.convert %[[VAL_6]] : (!fir.ref>>>>) -> !fir.ref> // CHECK: %[[VAL_17:.*]] = fir.convert %[[VAL_15]] : (!fir.ref>) -> !fir.ref -// CHECK: %[[VAL_18:.*]] = fir.call @_FortranAInitArrayConstructorVector(%[[VAL_14]], %[[VAL_16]], %[[VAL_2]], %[[VAL_1]], %[[VAL_17]], %[[VAL_0]]) fastmath : (!fir.llvm_ptr, !fir.ref>, i1, i32, !fir.ref, i32) -> none +// CHECK: fir.call @_FortranAInitArrayConstructorVector(%[[VAL_14]], %[[VAL_16]], %[[VAL_2]], %[[VAL_1]], %[[VAL_17]], %[[VAL_0]]) fastmath : (!fir.llvm_ptr, !fir.ref>, i1, i32, !fir.ref, i32) -> () // CHECK: %[[VAL_19:.*]] = fir.load %[[VAL_10]]#0 : !fir.ref>>> // CHECK: %[[VAL_20:.*]] = fir.box_addr %[[VAL_19]] : (!fir.box>>) -> !fir.heap> // CHECK: %[[VAL_21:.*]] = fir.load %[[VAL_10]]#0 : !fir.ref>>> @@ -43,7 +43,7 @@ // CHECK: %[[VAL_31:.*]]:2 = fir.unboxchar %[[VAL_30]] : (!fir.boxchar<1>) -> (!fir.ref>, index) // CHECK: %[[VAL_32:.*]] = fir.embox %[[VAL_31]]#0 typeparams %[[VAL_29]] : (!fir.ref>, index) -> !fir.box> // CHECK: %[[VAL_33:.*]] = fir.convert %[[VAL_32]] : (!fir.box>) -> !fir.box -// CHECK: %[[VAL_34:.*]] = fir.call @_FortranAPushArrayConstructorValue(%[[VAL_14]], %[[VAL_33]]) fastmath : (!fir.llvm_ptr, !fir.box) -> none +// CHECK: fir.call @_FortranAPushArrayConstructorValue(%[[VAL_14]], %[[VAL_33]]) fastmath : (!fir.llvm_ptr, !fir.box) -> () // CHECK: %[[VAL_35:.*]] = fir.load %[[VAL_6]] : !fir.ref>>>> // CHECK: %[[VAL_36:.*]] = fir.undefined tuple>>>, i1> // CHECK: %[[VAL_37:.*]] = fir.insert_value %[[VAL_36]], %[[VAL_2]], [1 : index] : (tuple>>>, i1>, i1) -> tuple>>>, i1> @@ -101,7 +101,7 @@ func.func @_QPtest1() { %10 = fir.address_of(@_QQclXce30ef70ff16a711a97719fb946c0b3d) : !fir.ref> %11 = fir.convert %1 : (!fir.ref>>>>) -> !fir.ref> %12 = fir.convert %10 : (!fir.ref>) -> !fir.ref - %13 = fir.call 
@_FortranAInitArrayConstructorVector(%9, %11, %true, %c80_i32, %12, %c1_i32) fastmath : (!fir.llvm_ptr, !fir.ref>, i1, i32, !fir.ref, i32) -> none + fir.call @_FortranAInitArrayConstructorVector(%9, %11, %true, %c80_i32, %12, %c1_i32) fastmath : (!fir.llvm_ptr, !fir.ref>, i1, i32, !fir.ref, i32) -> () %14 = fir.load %5#0 : !fir.ref>>> %15 = fir.box_addr %14 : (!fir.box>>) -> !fir.heap> %16 = fir.load %5#0 : !fir.ref>>> @@ -117,7 +117,7 @@ func.func @_QPtest1() { %26:2 = fir.unboxchar %25 : (!fir.boxchar<1>) -> (!fir.ref>, index) %27 = fir.embox %26#0 typeparams %24 : (!fir.ref>, index) -> !fir.box> %28 = fir.convert %27 : (!fir.box>) -> !fir.box - %29 = fir.call @_FortranAPushArrayConstructorValue(%9, %28) fastmath : (!fir.llvm_ptr, !fir.box) -> none + fir.call @_FortranAPushArrayConstructorValue(%9, %28) fastmath : (!fir.llvm_ptr, !fir.box) -> () %30 = fir.load %1 : !fir.ref>>>> %31 = hlfir.as_expr %30 move %true : (!fir.box>>>, i1) -> !hlfir.expr<1x!fir.char<1,?>> %32 = fir.box_elesize %30 : (!fir.box>>>) -> index @@ -137,12 +137,12 @@ func.func @_QPtest1() { hlfir.destroy %31 : !hlfir.expr<1x!fir.char<1,?>> return } -func.func private @_FortranAInitArrayConstructorVector(!fir.llvm_ptr, !fir.ref>, i1, i32, !fir.ref, i32) -> none attributes {fir.runtime} +func.func private @_FortranAInitArrayConstructorVector(!fir.llvm_ptr, !fir.ref>, i1, i32, !fir.ref, i32) -> () attributes {fir.runtime} fir.global linkonce @_QQclXce30ef70ff16a711a97719fb946c0b3d constant : !fir.char<1,1> { %0 = fir.string_lit "\00"(1) : !fir.char<1,1> fir.has_value %0 : !fir.char<1,1> } -func.func private @_FortranAPushArrayConstructorValue(!fir.llvm_ptr, !fir.box) -> none attributes {fir.runtime} +func.func private @_FortranAPushArrayConstructorValue(!fir.llvm_ptr, !fir.box) -> () attributes {fir.runtime} // ----- diff --git a/flang/test/HLFIR/copy-in-out-codegen.fir b/flang/test/HLFIR/copy-in-out-codegen.fir index 8031536550bdf..f4ea36c2244e5 100644 --- 
a/flang/test/HLFIR/copy-in-out-codegen.fir +++ b/flang/test/HLFIR/copy-in-out-codegen.fir @@ -16,7 +16,7 @@ func.func @test_copy_in(%box: !fir.box>, %temp: !fir.ref>>>) -> !fir.ref> // CHECK: %[[VAL_9:.*]] = fir.convert %[[VAL_0]] : (!fir.box>) -> !fir.box -// CHECK: %[[VAL_11:.*]] = fir.call @_FortranACopyInAssign(%[[VAL_8]], %[[VAL_9]], +// CHECK: fir.call @_FortranACopyInAssign(%[[VAL_8]], %[[VAL_9]], // CHECK: %[[VAL_12:.*]] = fir.load %[[VAL_1]] : !fir.ref>>> // CHECK: %[[VAL_13:.*]] = fir.rebox %[[VAL_12]] : (!fir.box>>) -> !fir.box> // CHECK: fir.result %[[VAL_13]] : !fir.box> @@ -42,7 +42,7 @@ func.func @test_copy_in_optional(%box: !fir.box>, %temp: !fir. // CHECK: } else { // CHECK: %[[VAL_10:.*]] = fir.convert %[[VAL_1]] : (!fir.ref>>>) -> !fir.ref> // CHECK: %[[VAL_11:.*]] = fir.convert %[[VAL_0]] : (!fir.box>) -> !fir.box -// CHECK: %[[VAL_13:.*]] = fir.call @_FortranACopyInAssign(%[[VAL_10]], %[[VAL_11]], +// CHECK: fir.call @_FortranACopyInAssign(%[[VAL_10]], %[[VAL_11]], // CHECK: %[[VAL_14:.*]] = fir.load %[[VAL_1]] : !fir.ref>>> // CHECK: %[[VAL_15:.*]] = fir.rebox %[[VAL_14]] : (!fir.box>>) -> !fir.box> // CHECK: fir.result %[[VAL_15]] : !fir.box> @@ -68,7 +68,7 @@ func.func @test_copy_out_no_copy_back(%temp: !fir.ref>>> // CHECK: %[[VAL_6:.*]] = fir.convert %[[VAL_2]] : (!fir.ref>>>) -> !fir.ref> // CHECK: %[[VAL_7:.*]] = fir.convert %[[VAL_0]] : (!fir.ref>>>) -> !fir.ref> -// CHECK: %[[VAL_9:.*]] = fir.call @_FortranACopyOutAssign(%[[VAL_6]], %[[VAL_7]], +// CHECK: fir.call @_FortranACopyOutAssign(%[[VAL_6]], %[[VAL_7]], // CHECK: } // CHECK: return // CHECK: } @@ -86,7 +86,7 @@ func.func @test_copy_out_copy_back(%box: !fir.box>, %temp: !fi // CHECK: fir.store %[[VAL_0]] to %[[VAL_3]] : !fir.ref>> // CHECK: %[[VAL_7:.*]] = fir.convert %[[VAL_3]] : (!fir.ref>>) -> !fir.ref> // CHECK: %[[VAL_8:.*]] = fir.convert %[[VAL_1]] : (!fir.ref>>>) -> !fir.ref> -// CHECK: %[[VAL_10:.*]] = fir.call @_FortranACopyOutAssign(%[[VAL_7]], %[[VAL_8]], +// CHECK: 
fir.call @_FortranACopyOutAssign(%[[VAL_7]], %[[VAL_8]], // CHECK: } // CHECK: return // CHECK: } @@ -105,7 +105,7 @@ func.func @test_copy_in_poly(%poly : !fir.class>>>>) -> !fir.ref> // CHECK: %[[VAL_9:.*]] = fir.convert %[[VAL_0]] : (!fir.class>>) -> !fir.box -// CHECK: %[[VAL_11:.*]] = fir.call @_FortranACopyInAssign(%[[VAL_8]], %[[VAL_9]], +// CHECK: fir.call @_FortranACopyInAssign(%[[VAL_8]], %[[VAL_9]], // CHECK: %[[VAL_12:.*]] = fir.load %[[VAL_1]] : !fir.ref>>>> // CHECK: %[[VAL_13:.*]] = fir.rebox %[[VAL_12]] : (!fir.class>>>) -> !fir.class>> // CHECK: fir.result %[[VAL_13]] : !fir.class>> diff --git a/flang/test/HLFIR/count-lowering-default-int-kinds.fir b/flang/test/HLFIR/count-lowering-default-int-kinds.fir index 68bc7fdbaad87..4869ec688c825 100644 --- a/flang/test/HLFIR/count-lowering-default-int-kinds.fir +++ b/flang/test/HLFIR/count-lowering-default-int-kinds.fir @@ -9,7 +9,7 @@ module attributes {fir.defaultkind = "a1c4d8i8l4r4", fir.kindmap = ""} { } // CHECK-LABEL: func.func @test_i8 // CHECK: %[[KIND:.*]] = arith.constant 8 : i32 -// CHECK: fir.call @_FortranACountDim(%{{.*}}, %{{.*}}, %{{.*}}, %[[KIND]], %{{.*}}, %{{.*}}) : (!fir.ref>, !fir.box, i32, i32, !fir.ref, i32) -> none +// CHECK: fir.call @_FortranACountDim(%{{.*}}, %{{.*}}, %{{.*}}, %[[KIND]], %{{.*}}, %{{.*}}) : (!fir.ref>, !fir.box, i32, i32, !fir.ref, i32) -> () module attributes {fir.defaultkind = "a1c4d8i4l4r4", fir.kindmap = ""} { func.func @test_i4(%arg0: !fir.box>> {fir.bindc_name = "x"}, %arg1: i64) -> !hlfir.expr { @@ -19,7 +19,7 @@ module attributes {fir.defaultkind = "a1c4d8i4l4r4", fir.kindmap = ""} { } // CHECK-LABEL: func.func @test_i4 // CHECK: %[[KIND:.*]] = arith.constant 4 : i32 -// CHECK: fir.call @_FortranACountDim(%{{.*}}, %{{.*}}, %{{.*}}, %[[KIND]], %{{.*}}, %{{.*}}) : (!fir.ref>, !fir.box, i32, i32, !fir.ref, i32) -> none +// CHECK: fir.call @_FortranACountDim(%{{.*}}, %{{.*}}, %{{.*}}, %[[KIND]], %{{.*}}, %{{.*}}) : (!fir.ref>, !fir.box, i32, i32, !fir.ref, 
i32) -> () module attributes {fir.defaultkind = "a1c4d8i2l4r4", fir.kindmap = ""} { func.func @test_i2(%arg0: !fir.box>> {fir.bindc_name = "x"}, %arg1: i64) -> !hlfir.expr { @@ -29,7 +29,7 @@ module attributes {fir.defaultkind = "a1c4d8i2l4r4", fir.kindmap = ""} { } // CHECK-LABEL: func.func @test_i2 // CHECK: %[[KIND:.*]] = arith.constant 2 : i32 -// CHECK: fir.call @_FortranACountDim(%{{.*}}, %{{.*}}, %{{.*}}, %[[KIND]], %{{.*}}, %{{.*}}) : (!fir.ref>, !fir.box, i32, i32, !fir.ref, i32) -> none +// CHECK: fir.call @_FortranACountDim(%{{.*}}, %{{.*}}, %{{.*}}, %[[KIND]], %{{.*}}, %{{.*}}) : (!fir.ref>, !fir.box, i32, i32, !fir.ref, i32) -> () module attributes {fir.defaultkind = "a1c4d8i1l4r4", fir.kindmap = ""} { func.func @test_i1(%arg0: !fir.box>> {fir.bindc_name = "x"}, %arg1: i64) -> !hlfir.expr { @@ -39,4 +39,4 @@ module attributes {fir.defaultkind = "a1c4d8i1l4r4", fir.kindmap = ""} { } // CHECK-LABEL: func.func @test_i1 // CHECK: %[[KIND:.*]] = arith.constant 1 : i32 -// CHECK: fir.call @_FortranACountDim(%{{.*}}, %{{.*}}, %{{.*}}, %[[KIND]], %{{.*}}, %{{.*}}) : (!fir.ref>, !fir.box, i32, i32, !fir.ref, i32) -> none +// CHECK: fir.call @_FortranACountDim(%{{.*}}, %{{.*}}, %{{.*}}, %[[KIND]], %{{.*}}, %{{.*}}) : (!fir.ref>, !fir.box, i32, i32, !fir.ref, i32) -> () diff --git a/flang/test/HLFIR/count-lowering.fir b/flang/test/HLFIR/count-lowering.fir index c3309724981a3..a314b507d048c 100644 --- a/flang/test/HLFIR/count-lowering.fir +++ b/flang/test/HLFIR/count-lowering.fir @@ -51,7 +51,7 @@ func.func @_QPcount2(%arg0: !fir.box>> {fir.bindc // CHECK-DAG: %[[RET_ARG:.*]] = fir.convert %[[RET_BOX]] // CHECK-DAG: %[[MASK_ARG:.*]] = fir.convert %[[MASK]]#1 -// CHECK: %[[NONE:.*]] = fir.call @_FortranACountDim(%[[RET_ARG]], %[[MASK_ARG]], %[[DIM]], %[[KIND]], %[[LOC_STR:.*]], %[[LOC_N:.*]]) : (!fir.ref>, !fir.box, i32, i32, !fir.ref, i32) -> none +// CHECK: fir.call @_FortranACountDim(%[[RET_ARG]], %[[MASK_ARG]], %[[DIM]], %[[KIND]], %[[LOC_STR:.*]], 
%[[LOC_N:.*]]) : (!fir.ref>, !fir.box, i32, i32, !fir.ref, i32) -> () // CHECK: %[[RET:.*]] = fir.load %[[RET_BOX]] // CHECK: %[[BOX_DIMS:.*]]:3 = fir.box_dims %[[RET]] // CHECK-NEXT: %[[ADDR:.*]] = fir.box_addr %[[RET]] @@ -99,7 +99,7 @@ func.func @_QPcount3(%arg0: !fir.ref> {fir.bindc_name = "s"}) // CHECK-DAG: %[[RET_ARG:.*]] = fir.convert %[[RET_BOX]] // CHECK-DAG: %[[MASK_ARG:.*]] = fir.convert %[[MASK_BOX]] : (!fir.box>>) -> !fir.box -// CHECK: %[[NONE:.*]] = fir.call @_FortranACountDim(%[[RET_ARG]], %[[MASK_ARG]], %[[DIM]], %[[KIND]], %[[LOC_STR:.*]], %[[LOC_N:.*]]) +// CHECK: fir.call @_FortranACountDim(%[[RET_ARG]], %[[MASK_ARG]], %[[DIM]], %[[KIND]], %[[LOC_STR:.*]], %[[LOC_N:.*]]) // CHECK: %[[RET:.*]] = fir.load %[[RET_BOX]] // CHECK: %[[BOX_DIMS:.*]]:3 = fir.box_dims %[[RET]] // CHECK-NEXT: %[[ADDR:.*]] = fir.box_addr %[[RET]] @@ -150,7 +150,7 @@ func.func @_QPcount4(%arg0: !fir.box>> {fir.bindc // CHECK-DAG: %[[RET_ARG:.*]] = fir.convert %[[RET_BOX]] // CHECK-DAG: %[[MASK_ARG:.*]] = fir.convert %[[MASK]]#1 -// CHECK: %[[NONE:.*]] = fir.call @_FortranACountDim(%[[RET_ARG]], %[[MASK_ARG]], %[[DIM]], %[[KIND]], %[[LOC_STR:.*]], %[[LOC_N:.*]]) : (!fir.ref>, !fir.box, i32, i32, !fir.ref, i32) -> none +// CHECK: fir.call @_FortranACountDim(%[[RET_ARG]], %[[MASK_ARG]], %[[DIM]], %[[KIND]], %[[LOC_STR:.*]], %[[LOC_N:.*]]) : (!fir.ref>, !fir.box, i32, i32, !fir.ref, i32) -> () // CHECK: %[[RET:.*]] = fir.load %[[RET_BOX]] // CHECK: %[[BOX_DIMS:.*]]:3 = fir.box_dims %[[RET]] // CHECK-NEXT: %[[ADDR:.*]] = fir.box_addr %[[RET]] diff --git a/flang/test/HLFIR/cshift-lowering.fir b/flang/test/HLFIR/cshift-lowering.fir index 386b81c4dbff6..44408d785f682 100644 --- a/flang/test/HLFIR/cshift-lowering.fir +++ b/flang/test/HLFIR/cshift-lowering.fir @@ -25,7 +25,7 @@ func.func @cshift1(%arg0: !fir.box> {fir.bindc_name = "a"}, %a // CHECK: %[[VAL_13:.*]] = fir.convert %[[VAL_5]] : (!fir.ref>>>) -> !fir.ref> // CHECK: %[[VAL_14:.*]] = fir.convert %[[VAL_6]]#1 : (!fir.box>) 
-> !fir.box // CHECK: %[[VAL_15:.*]] = fir.convert %[[VAL_11]] : (i32) -> i64 -// CHECK: %[[VAL_17:.*]] = fir.call @_FortranACshiftVector(%[[VAL_13]], %[[VAL_14]], %[[VAL_15]], %{{.*}}, %{{.*}}) : (!fir.ref>, !fir.box, i64, !fir.ref, i32) -> none +// CHECK: fir.call @_FortranACshiftVector(%[[VAL_13]], %[[VAL_14]], %[[VAL_15]], %{{.*}}, %{{.*}}) : (!fir.ref>, !fir.box, i64, !fir.ref, i32) -> () // 2d boxed array shift by scalar func.func @cshift2(%arg0: !fir.box> {fir.bindc_name = "a"}, %arg1: i32 {fir.bindc_name = "sh"}) { @@ -53,7 +53,7 @@ func.func @cshift2(%arg0: !fir.box> {fir.bindc_name = "a"}, // CHECK: %[[VAL_15:.*]] = fir.convert %[[VAL_8]]#1 : (!fir.box>) -> !fir.box // CHECK: %[[VAL_16:.*]] = fir.convert %[[VAL_12]] : (!fir.box) -> !fir.box // CHECK: %[[VAL_17:.*]] = fir.convert %[[VAL_4]] : (index) -> i32 -// CHECK: %[[VAL_19:.*]] = fir.call @_FortranACshift(%[[VAL_14]], %[[VAL_15]], %[[VAL_16]], %[[VAL_17]], %{{.*}}, %{{.*}}) : (!fir.ref>, !fir.box, !fir.box, i32, !fir.ref, i32) -> none +// CHECK: fir.call @_FortranACshift(%[[VAL_14]], %[[VAL_15]], %[[VAL_16]], %[[VAL_17]], %{{.*}}, %{{.*}}) : (!fir.ref>, !fir.box, !fir.box, i32, !fir.ref, i32) -> () // 2d boxed array shift by boxed array func.func @cshift3(%arg0: !fir.box> {fir.bindc_name = "a"}, %arg1: !fir.box> {fir.bindc_name = "sh"}) { @@ -80,7 +80,7 @@ func.func @cshift3(%arg0: !fir.box> {fir.bindc_name = "a"}, // CHECK: %[[VAL_14:.*]] = fir.convert %[[VAL_7]]#1 : (!fir.box>) -> !fir.box // CHECK: %[[VAL_15:.*]] = fir.convert %[[VAL_8]]#1 : (!fir.box>) -> !fir.box // CHECK: %[[VAL_16:.*]] = fir.convert %[[VAL_4]] : (index) -> i32 -// CHECK: %[[VAL_18:.*]] = fir.call @_FortranACshift(%[[VAL_13]], %[[VAL_14]], %[[VAL_15]], %[[VAL_16]], %{{.*}}, %{{.*}}) : (!fir.ref>, !fir.box, !fir.box, i32, !fir.ref, i32) -> none +// CHECK: fir.call @_FortranACshift(%[[VAL_13]], %[[VAL_14]], %[[VAL_15]], %[[VAL_16]], %{{.*}}, %{{.*}}) : (!fir.ref>, !fir.box, !fir.box, i32, !fir.ref, i32) -> () // 2d boxed array 
shift by array expr func.func @cshift4(%arg0: !fir.box> {fir.bindc_name = "a"}, %arg1: !hlfir.expr {fir.bindc_name = "sh"}) { @@ -110,7 +110,7 @@ func.func @cshift4(%arg0: !fir.box> {fir.bindc_name = "a"}, // CHECK: %[[VAL_18:.*]] = fir.convert %[[VAL_7]]#1 : (!fir.box>) -> !fir.box // CHECK: %[[VAL_19:.*]] = fir.convert %[[VAL_15]] : (!fir.box>) -> !fir.box // CHECK: %[[VAL_20:.*]] = fir.convert %[[VAL_4]] : (index) -> i32 -// CHECK: %[[VAL_22:.*]] = fir.call @_FortranACshift(%[[VAL_17]], %[[VAL_18]], %[[VAL_19]], %[[VAL_20]], %{{.*}}, %{{.*}}) : (!fir.ref>, !fir.box, !fir.box, i32, !fir.ref, i32) -> none +// CHECK: fir.call @_FortranACshift(%[[VAL_17]], %[[VAL_18]], %[[VAL_19]], %[[VAL_20]], %{{.*}}, %{{.*}}) : (!fir.ref>, !fir.box, !fir.box, i32, !fir.ref, i32) -> () // 2d array expr shift by array expr func.func @cshift5(%arg0: !hlfir.expr {fir.bindc_name = "a"}, %arg1: !hlfir.expr {fir.bindc_name = "sh"}) { @@ -144,7 +144,7 @@ func.func @cshift5(%arg0: !hlfir.expr {fir.bindc_name = "a"}, %arg1: !h // CHECK: %[[VAL_23:.*]] = fir.convert %[[VAL_15]] : (!fir.box>) -> !fir.box // CHECK: %[[VAL_24:.*]] = fir.convert %[[VAL_20]] : (!fir.box>) -> !fir.box // CHECK: %[[VAL_25:.*]] = fir.convert %[[VAL_4]] : (index) -> i32 -// CHECK: %[[VAL_27:.*]] = fir.call @_FortranACshift(%[[VAL_22]], %[[VAL_23]], %[[VAL_24]], %[[VAL_25]], %{{.*}}, %{{.*}}) : (!fir.ref>, !fir.box, !fir.box, i32, !fir.ref, i32) -> none +// CHECK: fir.call @_FortranACshift(%[[VAL_22]], %[[VAL_23]], %[[VAL_24]], %[[VAL_25]], %{{.*}}, %{{.*}}) : (!fir.ref>, !fir.box, !fir.box, i32, !fir.ref, i32) -> () // 2d array expr shift by array expr with explicit dim func.func @cshift6(%arg0: !hlfir.expr {fir.bindc_name = "a"}, %arg1: !hlfir.expr {fir.bindc_name = "sh"}, %dim : i16) { @@ -179,7 +179,7 @@ func.func @cshift6(%arg0: !hlfir.expr {fir.bindc_name = "a"}, %arg1: !h // CHECK: %[[VAL_24:.*]] = fir.convert %[[VAL_7]] : (!fir.ref>>>) -> !fir.ref> // CHECK: %[[VAL_25:.*]] = fir.convert %[[VAL_17]] : 
(!fir.box>) -> !fir.box // CHECK: %[[VAL_26:.*]] = fir.convert %[[VAL_22]] : (!fir.box>) -> !fir.box -// CHECK: %[[VAL_28:.*]] = fir.call @_FortranACshift(%[[VAL_24]], %[[VAL_25]], %[[VAL_26]], %[[VAL_15]], %{{.*}}, %{{.*}}) : (!fir.ref>, !fir.box, !fir.box, i32, !fir.ref, i32) -> none +// CHECK: fir.call @_FortranACshift(%[[VAL_24]], %[[VAL_25]], %[[VAL_26]], %[[VAL_15]], %{{.*}}, %{{.*}}) : (!fir.ref>, !fir.box, !fir.box, i32, !fir.ref, i32) -> () // shift of polymorphic array func.func @cshift7(%arg0: !fir.ref>>>>, %arg1: !fir.ref) { @@ -214,4 +214,4 @@ func.func @cshift7(%arg0: !fir.ref>>>>) -> !fir.ref> // CHECK: %[[VAL_18:.*]] = fir.convert %[[VAL_11]] : (!fir.class>>>) -> !fir.box // CHECK: %[[VAL_19:.*]] = fir.convert %[[VAL_15]] : (i32) -> i64 -// CHECK: %[[VAL_21:.*]] = fir.call @_FortranACshiftVector(%[[VAL_17]], %[[VAL_18]], %[[VAL_19]], %{{.*}}, %{{.*}}) : (!fir.ref>, !fir.box, i64, !fir.ref, i32) -> none +// CHECK: fir.call @_FortranACshiftVector(%[[VAL_17]], %[[VAL_18]], %[[VAL_19]], %{{.*}}, %{{.*}}) : (!fir.ref>, !fir.box, i64, !fir.ref, i32) -> () diff --git a/flang/test/HLFIR/elemental-codegen.fir b/flang/test/HLFIR/elemental-codegen.fir index 0d5f343cb1771..2443217f557f8 100644 --- a/flang/test/HLFIR/elemental-codegen.fir +++ b/flang/test/HLFIR/elemental-codegen.fir @@ -171,20 +171,20 @@ func.func @test_polymorphic(%arg0: !fir.class> {fir.bindc_ // CHECK: %[[RANK:.*]] = arith.constant 2 : i32 // CHECK: %[[VAL_15:.*]] = fir.convert %[[VAL_4]] : (!fir.ref>>>>) -> !fir.ref> // CHECK: %[[VAL_16:.*]] = fir.convert %[[VAL_5]]#1 : (!fir.class>) -> !fir.box -// CHECK: %[[VAL_17:.*]] = fir.call @_FortranAAllocatableApplyMold(%[[VAL_15]], %[[VAL_16]], %[[RANK]]) : (!fir.ref>, !fir.box, i32) -> none +// CHECK: fir.call @_FortranAAllocatableApplyMold(%[[VAL_15]], %[[VAL_16]], %[[RANK]]) : (!fir.ref>, !fir.box, i32) -> () // CHECK: %[[VAL_18:.*]] = arith.constant 1 : index // CHECK: %[[VAL_19:.*]] = arith.constant 0 : index // CHECK: %[[VAL_20:.*]] = 
fir.convert %[[VAL_4]] : (!fir.ref>>>>) -> !fir.ref> // CHECK: %[[VAL_21:.*]] = fir.convert %[[VAL_19]] : (index) -> i32 // CHECK: %[[VAL_22:.*]] = fir.convert %[[VAL_18]] : (index) -> i64 // CHECK: %[[VAL_23:.*]] = fir.convert %[[EX0]] : (index) -> i64 -// CHECK: %[[VAL_24:.*]] = fir.call @_FortranAAllocatableSetBounds(%[[VAL_20]], %[[VAL_21]], %[[VAL_22]], %[[VAL_23]]) : (!fir.ref>, i32, i64, i64) -> none +// CHECK: fir.call @_FortranAAllocatableSetBounds(%[[VAL_20]], %[[VAL_21]], %[[VAL_22]], %[[VAL_23]]) : (!fir.ref>, i32, i64, i64) -> () // CHECK: %[[VAL_25:.*]] = arith.constant 1 : index // CHECK: %[[VAL_26:.*]] = fir.convert %[[VAL_4]] : (!fir.ref>>>>) -> !fir.ref> // CHECK: %[[VAL_27:.*]] = fir.convert %[[VAL_25]] : (index) -> i32 // CHECK: %[[VAL_28:.*]] = fir.convert %[[VAL_18]] : (index) -> i64 // CHECK: %[[VAL_29:.*]] = fir.convert %[[EX1]] : (index) -> i64 -// CHECK: %[[VAL_30:.*]] = fir.call @_FortranAAllocatableSetBounds(%[[VAL_26]], %[[VAL_27]], %[[VAL_28]], %[[VAL_29]]) : (!fir.ref>, i32, i64, i64) -> none +// CHECK: fir.call @_FortranAAllocatableSetBounds(%[[VAL_26]], %[[VAL_27]], %[[VAL_28]], %[[VAL_29]]) : (!fir.ref>, i32, i64, i64) -> () // CHECK: %[[VAL_31:.*]] = fir.address_of(@_QQclX // CHECK: %[[VAL_32:.*]] = arith.constant {{.*}} : index // CHECK: %[[VAL_33:.*]] = arith.constant {{.*}} : i32 @@ -255,20 +255,20 @@ func.func @test_polymorphic_expr(%arg0: !fir.class> {fir.b // CHECK: %[[VAL_15:.*]] = arith.constant 2 : i32 // CHECK: %[[VAL_16:.*]] = fir.convert %[[VAL_5]] : (!fir.ref>>>>) -> !fir.ref> // CHECK: %[[VAL_17:.*]] = fir.convert %[[VAL_6]]#1 : (!fir.class>) -> !fir.box -// CHECK: %[[VAL_18:.*]] = fir.call @_FortranAAllocatableApplyMold(%[[VAL_16]], %[[VAL_17]], %[[VAL_15]]) : (!fir.ref>, !fir.box, i32) -> none +// CHECK: fir.call @_FortranAAllocatableApplyMold(%[[VAL_16]], %[[VAL_17]], %[[VAL_15]]) : (!fir.ref>, !fir.box, i32) -> () // CHECK: %[[VAL_19:.*]] = arith.constant 1 : index // CHECK: %[[VAL_20:.*]] = arith.constant 0 : 
index // CHECK: %[[VAL_21:.*]] = fir.convert %[[VAL_5]] : (!fir.ref>>>>) -> !fir.ref> // CHECK: %[[VAL_22:.*]] = fir.convert %[[VAL_20]] : (index) -> i32 // CHECK: %[[VAL_23:.*]] = fir.convert %[[VAL_19]] : (index) -> i64 // CHECK: %[[VAL_24:.*]] = fir.convert %[[VAL_2]] : (index) -> i64 -// CHECK: %[[VAL_25:.*]] = fir.call @_FortranAAllocatableSetBounds(%[[VAL_21]], %[[VAL_22]], %[[VAL_23]], %[[VAL_24]]) : (!fir.ref>, i32, i64, i64) -> none +// CHECK: fir.call @_FortranAAllocatableSetBounds(%[[VAL_21]], %[[VAL_22]], %[[VAL_23]], %[[VAL_24]]) : (!fir.ref>, i32, i64, i64) -> () // CHECK: %[[VAL_26:.*]] = arith.constant 1 : index // CHECK: %[[VAL_27:.*]] = fir.convert %[[VAL_5]] : (!fir.ref>>>>) -> !fir.ref> // CHECK: %[[VAL_28:.*]] = fir.convert %[[VAL_26]] : (index) -> i32 // CHECK: %[[VAL_29:.*]] = fir.convert %[[VAL_19]] : (index) -> i64 // CHECK: %[[VAL_30:.*]] = fir.convert %[[VAL_3]] : (index) -> i64 -// CHECK: %[[VAL_31:.*]] = fir.call @_FortranAAllocatableSetBounds(%[[VAL_27]], %[[VAL_28]], %[[VAL_29]], %[[VAL_30]]) : (!fir.ref>, i32, i64, i64) -> none +// CHECK: fir.call @_FortranAAllocatableSetBounds(%[[VAL_27]], %[[VAL_28]], %[[VAL_29]], %[[VAL_30]]) : (!fir.ref>, i32, i64, i64) -> () // CHECK: %[[VAL_32:.*]] = fir.address_of(@_QQcl // CHECK: %[[VAL_33:.*]] = arith.constant {{.*}} : index // CHECK: %[[VAL_34:.*]] = arith.constant {{.*}} : i32 @@ -308,20 +308,20 @@ func.func @test_polymorphic_expr(%arg0: !fir.class> {fir.b // CHECK: %[[VAL_64:.*]] = arith.constant 2 : i32 // CHECK: %[[VAL_65:.*]] = fir.convert %[[VAL_4]] : (!fir.ref>>>>) -> !fir.ref> // CHECK: %[[VAL_66:.*]] = fir.convert %[[VAL_40]] : (!fir.class>>>) -> !fir.box -// CHECK: %[[VAL_67:.*]] = fir.call @_FortranAAllocatableApplyMold(%[[VAL_65]], %[[VAL_66]], %[[VAL_64]]) : (!fir.ref>, !fir.box, i32) -> none +// CHECK: fir.call @_FortranAAllocatableApplyMold(%[[VAL_65]], %[[VAL_66]], %[[VAL_64]]) : (!fir.ref>, !fir.box, i32) -> () // CHECK: %[[VAL_68:.*]] = arith.constant 1 : index // CHECK: 
%[[VAL_69:.*]] = arith.constant 0 : index // CHECK: %[[VAL_70:.*]] = fir.convert %[[VAL_4]] : (!fir.ref>>>>) -> !fir.ref> // CHECK: %[[VAL_71:.*]] = fir.convert %[[VAL_69]] : (index) -> i32 // CHECK: %[[VAL_72:.*]] = fir.convert %[[VAL_68]] : (index) -> i64 // CHECK: %[[VAL_73:.*]] = fir.convert %[[VAL_2]] : (index) -> i64 -// CHECK: %[[VAL_74:.*]] = fir.call @_FortranAAllocatableSetBounds(%[[VAL_70]], %[[VAL_71]], %[[VAL_72]], %[[VAL_73]]) : (!fir.ref>, i32, i64, i64) -> none +// CHECK: fir.call @_FortranAAllocatableSetBounds(%[[VAL_70]], %[[VAL_71]], %[[VAL_72]], %[[VAL_73]]) : (!fir.ref>, i32, i64, i64) -> () // CHECK: %[[VAL_75:.*]] = arith.constant 1 : index // CHECK: %[[VAL_76:.*]] = fir.convert %[[VAL_4]] : (!fir.ref>>>>) -> !fir.ref> // CHECK: %[[VAL_77:.*]] = fir.convert %[[VAL_75]] : (index) -> i32 // CHECK: %[[VAL_78:.*]] = fir.convert %[[VAL_68]] : (index) -> i64 // CHECK: %[[VAL_79:.*]] = fir.convert %[[VAL_3]] : (index) -> i64 -// CHECK: %[[VAL_80:.*]] = fir.call @_FortranAAllocatableSetBounds(%[[VAL_76]], %[[VAL_77]], %[[VAL_78]], %[[VAL_79]]) : (!fir.ref>, i32, i64, i64) -> none +// CHECK: fir.call @_FortranAAllocatableSetBounds(%[[VAL_76]], %[[VAL_77]], %[[VAL_78]], %[[VAL_79]]) : (!fir.ref>, i32, i64, i64) -> () // CHECK: %[[VAL_81:.*]] = fir.address_of(@_QQcl // CHECK: %[[VAL_82:.*]] = arith.constant {{.*}} : index // CHECK: %[[VAL_83:.*]] = arith.constant {{.*}} : i32 diff --git a/flang/test/HLFIR/matmul-lowering.fir b/flang/test/HLFIR/matmul-lowering.fir index fd76db2659516..51a859401bf4a 100644 --- a/flang/test/HLFIR/matmul-lowering.fir +++ b/flang/test/HLFIR/matmul-lowering.fir @@ -29,7 +29,7 @@ func.func @_QPmatmul1(%arg0: !fir.box> {fir.bindc_name = "lh // CHECK: %[[RET_ARG:.*]] = fir.convert %[[RET_BOX]] : (!fir.ref>>>) -> !fir.ref> // CHECK-DAG: %[[LHS_ARG:.*]] = fir.convert %[[LHS_VAR]]#1 : (!fir.box>) -> !fir.box // CHECK-DAG: %[[RHS_ARG:.*]] = fir.convert %[[RHS_VAR]]#1 : (!fir.box>) -> !fir.box -// CHECK: %[[NONE:.*]] = fir.call 
@_FortranAMatmulInteger4Integer4(%[[RET_ARG]], %[[LHS_ARG]], %[[RHS_ARG]], %[[LOC_STR:.*]], %[[LOC_N:.*]]) fastmath +// CHECK: fir.call @_FortranAMatmulInteger4Integer4(%[[RET_ARG]], %[[LHS_ARG]], %[[RHS_ARG]], %[[LOC_STR:.*]], %[[LOC_N:.*]]) fastmath // CHECK: %[[RET:.*]] = fir.load %[[RET_BOX]] // CHECK-DAG: %[[BOX_DIMS:.*]]:3 = fir.box_dims %[[RET]] @@ -71,7 +71,7 @@ func.func @_QPtest(%arg0: !fir.ref> {fir.bindc_name = "a"}, } // just check that we apply the patterns successfully. The details are checked above // CHECK-LABEL: func.func @_QPtest( -// CHECK: fir.call @_FortranAMatmulReal4Real4({{.*}}, {{.*}}, {{.*}}, {{.*}}, {{.*}}) fastmath : (!fir.ref>, !fir.box, !fir.box, !fir.ref, i32) -> none -// CHECK: fir.call @_FortranAMatmulReal4Real4({{.*}}, {{.*}}, {{.*}}, {{.*}}, {{.*}}) fastmath : (!fir.ref>, !fir.box, !fir.box, !fir.ref, i32) -> none +// CHECK: fir.call @_FortranAMatmulReal4Real4({{.*}}, {{.*}}, {{.*}}, {{.*}}, {{.*}}) fastmath : (!fir.ref>, !fir.box, !fir.box, !fir.ref, i32) -> () +// CHECK: fir.call @_FortranAMatmulReal4Real4({{.*}}, {{.*}}, {{.*}}, {{.*}}, {{.*}}) fastmath : (!fir.ref>, !fir.box, !fir.box, !fir.ref, i32) -> () // CHECK: return // CHECK-NEXT: } diff --git a/flang/test/HLFIR/maxloc-lowering.fir b/flang/test/HLFIR/maxloc-lowering.fir index a51c9b483fa05..be52627564c49 100644 --- a/flang/test/HLFIR/maxloc-lowering.fir +++ b/flang/test/HLFIR/maxloc-lowering.fir @@ -28,7 +28,7 @@ func.func @_QPmaxloc1(%arg0: !fir.box> {fir.bindc_name = "a"}, // CHECK: %[[V8:.*]] = fir.convert %[[V0]] : (!fir.ref>>>) -> !fir.ref> // CHECK-NEXT: %[[V9:.*]] = fir.convert %[[V1]]#1 : (!fir.box>) -> !fir.box // CHECK: %[[V12:.*]] = fir.convert %[[V3]] : (!fir.box) -> !fir.box -// CHECK-NEXT: %[[V13:.*]] = fir.call @_FortranAMaxlocInteger4(%[[V8]], %[[V9]], %[[C4]], {{.*}}, {{.*}}, %[[V12]], %[[FALSE]]) fastmath : (!fir.ref>, !fir.box, i32, !fir.ref, i32, !fir.box, i1) -> none +// CHECK-NEXT: fir.call @_FortranAMaxlocInteger4(%[[V8]], %[[V9]], %[[C4]], 
{{.*}}, {{.*}}, %[[V12]], %[[FALSE]]) fastmath : (!fir.ref>, !fir.box, i32, !fir.ref, i32, !fir.box, i1) -> () // CHECK-NEXT: %[[V14:.*]] = fir.load %[[V0]] : !fir.ref>>> // CHECK-NEXT: %[[V15:.*]]:3 = fir.box_dims %[[V14]], %[[C0]] : (!fir.box>>, index) -> (index, index, index) // CHECK-NEXT: %[[V16:.*]] = fir.box_addr %[[V14]] : (!fir.box>>) -> !fir.heap> @@ -72,7 +72,7 @@ func.func @_QPmaxloc2(%arg0: !fir.box> {fir.bindc_name = "a" // CHECK: %[[V11:.*]] = fir.convert %[[V0]] : (!fir.ref>>>) -> !fir.ref> // CHECK-NEXT: %[[V12:.*]] = fir.convert %[[V1]]#1 : (!fir.box>) -> !fir.box // CHECK: %[[V15:.*]] = fir.convert %[[V6]] : (!fir.box) -> !fir.box -// CHECK-NEXT: %[[V16:.*]] = fir.call @_FortranAMaxlocDim(%[[V11]], %[[V12]], %[[C4]], %[[V5]], {{.*}}, {{.*}}, %[[V15]], %[[FALSE]]) fastmath : (!fir.ref>, !fir.box, i32, i32, !fir.ref, i32, !fir.box, i1) -> none +// CHECK-NEXT: fir.call @_FortranAMaxlocDim(%[[V11]], %[[V12]], %[[C4]], %[[V5]], {{.*}}, {{.*}}, %[[V15]], %[[FALSE]]) fastmath : (!fir.ref>, !fir.box, i32, i32, !fir.ref, i32, !fir.box, i1) -> () // CHECK-NEXT: %[[V17:.*]] = fir.load %[[V0]] : !fir.ref>>> // CHECK-NEXT: %[[V18:.*]]:3 = fir.box_dims %[[V17]], %[[C0]] : (!fir.box>>, index) -> (index, index, index) // CHECK-NEXT: %[[V19:.*]] = fir.box_addr %[[V17]] : (!fir.box>>) -> !fir.heap> @@ -114,7 +114,7 @@ func.func @_QPmaxloc3(%arg0: !fir.box> {fir.bindc_name = "a"}, // CHECK: %[[V9:.*]] = fir.convert %[[V0]] : (!fir.ref>>>) -> !fir.ref> // CHECK-NEXT: %[[V10:.*]] = fir.convert %[[V1]]#1 : (!fir.box>) -> !fir.box // CHECK: %[[V13:.*]] = fir.convert %[[V4]] : (!fir.box>) -> !fir.box -// CHECK-NEXT: %[[V14:.*]] = fir.call @_FortranAMaxlocInteger4(%[[V9]], %[[V10]], %[[C4]], {{.*}}, {{.*}}, %[[V13]], %[[FALSE]]) fastmath : (!fir.ref>, !fir.box, i32, !fir.ref, i32, !fir.box, i1) -> none +// CHECK-NEXT: fir.call @_FortranAMaxlocInteger4(%[[V9]], %[[V10]], %[[C4]], {{.*}}, {{.*}}, %[[V13]], %[[FALSE]]) fastmath : (!fir.ref>, !fir.box, i32, !fir.ref, i32, 
!fir.box, i1) -> () // CHECK-NEXT: %[[V15:.*]] = fir.load %[[V0]] : !fir.ref>>> // CHECK-NEXT: %[[V16:.*]]:3 = fir.box_dims %[[V15]], %[[C0]] : (!fir.box>>, index) -> (index, index, index) // CHECK-NEXT: %[[V17:.*]] = fir.box_addr %[[V15]] : (!fir.box>>) -> !fir.heap> @@ -155,7 +155,7 @@ func.func @_QPmaxloc4(%arg0: !fir.box> {fir.bindc_name = "a"}, // CHECK: %[[V8:.*]] = fir.convert %[[V0]] : (!fir.ref>>>) -> !fir.ref> // CHECK-NEXT: %[[V9:.*]] = fir.convert %[[V1]]#1 : (!fir.box>) -> !fir.box // CHECK: %[[V12:.*]] = fir.convert %[[V2]]#1 : (!fir.box>>) -> !fir.box -// CHECK-NEXT: %[[V13:.*]] = fir.call @_FortranAMaxlocInteger4(%[[V8]], %[[V9]], %[[C4]], {{.*}}, {{.*}}, %[[V12]], %[[FALSE]]) fastmath : (!fir.ref>, !fir.box, i32, !fir.ref, i32, !fir.box, i1) -> none +// CHECK-NEXT: fir.call @_FortranAMaxlocInteger4(%[[V8]], %[[V9]], %[[C4]], {{.*}}, {{.*}}, %[[V12]], %[[FALSE]]) fastmath : (!fir.ref>, !fir.box, i32, !fir.ref, i32, !fir.box, i1) -> () // CHECK-NEXT: %[[V14:.*]] = fir.load %[[V0]] : !fir.ref>>> // CHECK-NEXT: %[[V15:.*]]:3 = fir.box_dims %[[V14]], %[[C0]] : (!fir.box>>, index) -> (index, index, index) // CHECK-NEXT: %[[V16:.*]] = fir.box_addr %[[V14]] : (!fir.box>>) -> !fir.heap> @@ -226,7 +226,7 @@ func.func @_QPmaxloc5(%arg0: !fir.ref> {fir.bindc_name = "s"}) // CHECK: %[[V15:.*]] = fir.convert %[[V0]] : (!fir.ref>>>) -> !fir.ref> // CHECK-NEXT: %[[V16:.*]] = fir.convert %[[V8]] : (!fir.box>) -> !fir.box // CHECK: %[[V19:.*]] = fir.convert %[[V10]] : (!fir.box>) -> !fir.box -// CHECK-NEXT: %[[V20:.*]] = fir.call @_FortranAMaxlocDim(%[[V15]], %[[V16]], %[[C4]], %[[C1]], {{.*}}, {{.*}}, %[[V19]], %[[FALSE]]) fastmath : (!fir.ref>, !fir.box, i32, i32, !fir.ref, i32, !fir.box, i1) -> none +// CHECK-NEXT: fir.call @_FortranAMaxlocDim(%[[V15]], %[[V16]], %[[C4]], %[[C1]], {{.*}}, {{.*}}, %[[V19]], %[[FALSE]]) fastmath : (!fir.ref>, !fir.box, i32, i32, !fir.ref, i32, !fir.box, i1) -> () // CHECK-NEXT: %[[V21:.*]] = fir.load %[[V0]] : !fir.ref>>> // 
CHECK-NEXT: %[[V22:.*]]:3 = fir.box_dims %[[V21]], %[[C0]] : (!fir.box>>, index) -> (index, index, index) // CHECK-NEXT: %[[V23:.*]] = fir.box_addr %[[V21]] : (!fir.box>>) -> !fir.heap> @@ -265,7 +265,7 @@ func.func @_QPmaxloc6(%arg0: !fir.box>> {fir.bindc_n // CHECK: %[[V8:.*]] = fir.convert %[[V0]] : (!fir.ref>>>) -> !fir.ref> // CHECK-NEXT: %[[V9:.*]] = fir.convert %[[V1]]#1 : (!fir.box>>) -> !fir.box // CHECK: %[[V12:.*]] = fir.convert %[[V3]] : (!fir.box) -> !fir.box -// CHECK-NEXT: %[[V13:.*]] = fir.call @_FortranAMaxlocCharacter(%[[V8]], %[[V9]], %[[C4]], {{.*}}, {{.*}}, %[[V12]], %[[FALSE]]) fastmath : (!fir.ref>, !fir.box, i32, !fir.ref, i32, !fir.box, i1) -> none +// CHECK-NEXT: fir.call @_FortranAMaxlocCharacter(%[[V8]], %[[V9]], %[[C4]], {{.*}}, {{.*}}, %[[V12]], %[[FALSE]]) fastmath : (!fir.ref>, !fir.box, i32, !fir.ref, i32, !fir.box, i1) -> () // CHECK-NEXT: %[[V14:.*]] = fir.load %[[V0]] : !fir.ref>>> // CHECK-NEXT: %[[V15:.*]]:3 = fir.box_dims %[[V14]], %[[C0]] : (!fir.box>>, index) -> (index, index, index) // CHECK-NEXT: %[[V16:.*]] = fir.box_addr %[[V14]] : (!fir.box>>) -> !fir.heap> @@ -310,7 +310,7 @@ func.func @_QPmaxloc7(%arg0: !fir.box> {fir.bindc_name = "a"}, // CHECK: %[[V10:.*]] = fir.convert %[[V0]] : (!fir.ref>>) -> !fir.ref> // CHECK-NEXT: %[[V11:.*]] = fir.convert %[[V1]]#1 : (!fir.box>) -> !fir.box // CHECK: %[[V14:.*]] = fir.convert %[[V4]]#1 : (!fir.box>>) -> !fir.box -// CHECK-NEXT: %[[V15:.*]] = fir.call @_FortranAMaxlocDim(%[[V10]], %[[V11]], %[[C4]], %[[V6]], {{.*}}, {{.*}}, %[[V14]], %[[FALSE]]) fastmath : (!fir.ref>, !fir.box, i32, i32, !fir.ref, i32, !fir.box, i1) -> none +// CHECK-NEXT: fir.call @_FortranAMaxlocDim(%[[V10]], %[[V11]], %[[C4]], %[[V6]], {{.*}}, {{.*}}, %[[V14]], %[[FALSE]]) fastmath : (!fir.ref>, !fir.box, i32, i32, !fir.ref, i32, !fir.box, i1) -> () // CHECK-NEXT: %[[V16:.*]] = fir.load %[[V0]] : !fir.ref>> // CHECK-NEXT: %[[V17:.*]] = fir.box_addr %[[V16]] : (!fir.box>) -> !fir.heap // CHECK-NEXT: 
%[[V18:.*]] = fir.load %[[V17]] : !fir.heap diff --git a/flang/test/HLFIR/maxval-lowering.fir b/flang/test/HLFIR/maxval-lowering.fir index 5a49ed5273ef8..7e025c41c6aeb 100644 --- a/flang/test/HLFIR/maxval-lowering.fir +++ b/flang/test/HLFIR/maxval-lowering.fir @@ -56,7 +56,7 @@ func.func @_QPmaxval2(%arg0: !fir.box> {fir.bindc_name = "a" // CHECK-DAG: %[[RET_ARG:.*]] = fir.convert %[[RET_BOX]] // CHECK-DAG: %[[ARRAY_ARG:.*]] = fir.convert %[[ARRAY]]#1 : (!fir.box>) -> !fir.box // CHECK-DAG: %[[MASK_ARG:.*]] = fir.convert %[[MASK]] : (!fir.box) -> !fir.box -// CHECK: %[[NONE:.*]] = fir.call @_FortranAMaxvalDim(%[[RET_ARG]], %[[ARRAY_ARG]], %[[DIM]], %[[LOC_STR:.*]], %[[LOC_N:.*]], %[[MASK_ARG]]) fastmath : (!fir.ref>, !fir.box, i32, !fir.ref, i32, !fir.box) -> none +// CHECK: fir.call @_FortranAMaxvalDim(%[[RET_ARG]], %[[ARRAY_ARG]], %[[DIM]], %[[LOC_STR:.*]], %[[LOC_N:.*]], %[[MASK_ARG]]) fastmath : (!fir.ref>, !fir.box, i32, !fir.ref, i32, !fir.box) -> () // CHECK: %[[RET:.*]] = fir.load %[[RET_BOX]] // CHECK: %[[BOX_DIMS:.*]]:3 = fir.box_dims %[[RET]] @@ -175,7 +175,7 @@ func.func @_QPmaxval5(%arg0: !fir.ref> {fir.bindc_name = "s"}) // CHECK-DAG: %[[RET_ARG:.*]] = fir.convert %[[RET_BOX]] // CHECK-DAG: %[[ARRAY_ARG:.*]] = fir.convert %[[ARRAY_BOX]] : (!fir.box>) -> !fir.box // CHECK-DAG: %[[MASK_ARG:.*]] = fir.convert %[[MASK_BOX]] : (!fir.box>) -> !fir.box -// CHECK: %[[NONE:.*]] = fir.call @_FortranAMaxvalDim(%[[RET_ARG]], %[[ARRAY_ARG]], %[[DIM]], %[[LOC_STR:.*]], %[[LOC_N:.*]], %[[MASK_ARG]]) fastmath : (!fir.ref>, !fir.box, i32, !fir.ref, i32, !fir.box) -> none +// CHECK: fir.call @_FortranAMaxvalDim(%[[RET_ARG]], %[[ARRAY_ARG]], %[[DIM]], %[[LOC_STR:.*]], %[[LOC_N:.*]], %[[MASK_ARG]]) fastmath : (!fir.ref>, !fir.box, i32, !fir.ref, i32, !fir.box) -> () // simple one argument maxval for character func.func @_QPmaxval6(%arg0: !fir.box>> {fir.bindc_name = "a"}, %arg1: !fir.boxchar<1> {fir.bindc_name = "s"}) { @@ -205,7 +205,7 @@ func.func @_QPmaxval6(%arg0: 
!fir.box>> {fir.bindc_n // CHECK-DAG: %[[RET_ARG:.*]] = fir.convert %[[RET_BOX]] // CHECK-DAG: %[[ARRAY_ARG:.*]] = fir.convert %[[ARRAY]]#1 : (!fir.box>>) -> !fir.box // CHECK-DAG: %[[MASK_ARG:.*]] = fir.convert %[[MASK]] : (!fir.box) -> !fir.box -// CHECK: %[[NONE:.*]] = fir.call @_FortranAMaxvalCharacter(%[[RET_ARG]], %[[ARRAY_ARG]], %[[LOC_STR:.*]], %[[LOC_N:.*]], %[[MASK_ARG]]) fastmath : (!fir.ref>, !fir.box, !fir.ref, i32, !fir.box) -> none +// CHECK: fir.call @_FortranAMaxvalCharacter(%[[RET_ARG]], %[[ARRAY_ARG]], %[[LOC_STR:.*]], %[[LOC_N:.*]], %[[MASK_ARG]]) fastmath : (!fir.ref>, !fir.box, !fir.ref, i32, !fir.box) -> () // CHECK: %[[RET:.*]] = fir.load %[[RET_BOX]] // CHECK: %[[BOX_ELESIZE:.*]] = fir.box_elesize %[[RET]] diff --git a/flang/test/HLFIR/minloc-lowering.fir b/flang/test/HLFIR/minloc-lowering.fir index 6f3cbd171445c..76d788812e24c 100644 --- a/flang/test/HLFIR/minloc-lowering.fir +++ b/flang/test/HLFIR/minloc-lowering.fir @@ -28,7 +28,7 @@ func.func @_QPminloc1(%arg0: !fir.box> {fir.bindc_name = "a"}, // CHECK: %[[V8:.*]] = fir.convert %[[V0]] : (!fir.ref>>>) -> !fir.ref> // CHECK-NEXT: %[[V9:.*]] = fir.convert %[[V1]]#1 : (!fir.box>) -> !fir.box // CHECK: %[[V12:.*]] = fir.convert %[[V3]] : (!fir.box) -> !fir.box -// CHECK-NEXT: %[[V13:.*]] = fir.call @_FortranAMinlocInteger4(%[[V8]], %[[V9]], %[[C4]], {{.*}}, {{.*}}, %[[V12]], %[[FALSE]]) fastmath : (!fir.ref>, !fir.box, i32, !fir.ref, i32, !fir.box, i1) -> none +// CHECK-NEXT: fir.call @_FortranAMinlocInteger4(%[[V8]], %[[V9]], %[[C4]], {{.*}}, {{.*}}, %[[V12]], %[[FALSE]]) fastmath : (!fir.ref>, !fir.box, i32, !fir.ref, i32, !fir.box, i1) -> () // CHECK-NEXT: %[[V14:.*]] = fir.load %[[V0]] : !fir.ref>>> // CHECK-NEXT: %[[V15:.*]]:3 = fir.box_dims %[[V14]], %[[C0]] : (!fir.box>>, index) -> (index, index, index) // CHECK-NEXT: %[[V16:.*]] = fir.box_addr %[[V14]] : (!fir.box>>) -> !fir.heap> @@ -72,7 +72,7 @@ func.func @_QPminloc2(%arg0: !fir.box> {fir.bindc_name = "a" // CHECK: %[[V11:.*]] = 
fir.convert %[[V0]] : (!fir.ref>>>) -> !fir.ref> // CHECK-NEXT: %[[V12:.*]] = fir.convert %[[V1]]#1 : (!fir.box>) -> !fir.box // CHECK: %[[V15:.*]] = fir.convert %[[V6]] : (!fir.box) -> !fir.box -// CHECK-NEXT: %[[V16:.*]] = fir.call @_FortranAMinlocDim(%[[V11]], %[[V12]], %[[C4]], %[[V5]], {{.*}}, {{.*}}, %[[V15]], %[[FALSE]]) fastmath : (!fir.ref>, !fir.box, i32, i32, !fir.ref, i32, !fir.box, i1) -> none +// CHECK-NEXT: fir.call @_FortranAMinlocDim(%[[V11]], %[[V12]], %[[C4]], %[[V5]], {{.*}}, {{.*}}, %[[V15]], %[[FALSE]]) fastmath : (!fir.ref>, !fir.box, i32, i32, !fir.ref, i32, !fir.box, i1) -> () // CHECK-NEXT: %[[V17:.*]] = fir.load %[[V0]] : !fir.ref>>> // CHECK-NEXT: %[[V18:.*]]:3 = fir.box_dims %[[V17]], %[[C0]] : (!fir.box>>, index) -> (index, index, index) // CHECK-NEXT: %[[V19:.*]] = fir.box_addr %[[V17]] : (!fir.box>>) -> !fir.heap> @@ -114,7 +114,7 @@ func.func @_QPminloc3(%arg0: !fir.box> {fir.bindc_name = "a"}, // CHECK: %[[V9:.*]] = fir.convert %[[V0]] : (!fir.ref>>>) -> !fir.ref> // CHECK-NEXT: %[[V10:.*]] = fir.convert %[[V1]]#1 : (!fir.box>) -> !fir.box // CHECK: %[[V13:.*]] = fir.convert %[[V4]] : (!fir.box>) -> !fir.box -// CHECK-NEXT: %[[V14:.*]] = fir.call @_FortranAMinlocInteger4(%[[V9]], %[[V10]], %[[C4]], {{.*}}, {{.*}}, %[[V13]], %[[FALSE]]) fastmath : (!fir.ref>, !fir.box, i32, !fir.ref, i32, !fir.box, i1) -> none +// CHECK-NEXT: fir.call @_FortranAMinlocInteger4(%[[V9]], %[[V10]], %[[C4]], {{.*}}, {{.*}}, %[[V13]], %[[FALSE]]) fastmath : (!fir.ref>, !fir.box, i32, !fir.ref, i32, !fir.box, i1) -> () // CHECK-NEXT: %[[V15:.*]] = fir.load %[[V0]] : !fir.ref>>> // CHECK-NEXT: %[[V16:.*]]:3 = fir.box_dims %[[V15]], %[[C0]] : (!fir.box>>, index) -> (index, index, index) // CHECK-NEXT: %[[V17:.*]] = fir.box_addr %[[V15]] : (!fir.box>>) -> !fir.heap> @@ -155,7 +155,7 @@ func.func @_QPminloc4(%arg0: !fir.box> {fir.bindc_name = "a"}, // CHECK: %[[V8:.*]] = fir.convert %[[V0]] : (!fir.ref>>>) -> !fir.ref> // CHECK-NEXT: %[[V9:.*]] = fir.convert 
%[[V1]]#1 : (!fir.box>) -> !fir.box // CHECK: %[[V12:.*]] = fir.convert %[[V2]]#1 : (!fir.box>>) -> !fir.box -// CHECK-NEXT: %[[V13:.*]] = fir.call @_FortranAMinlocInteger4(%[[V8]], %[[V9]], %[[C4]], {{.*}}, {{.*}}, %[[V12]], %[[FALSE]]) fastmath : (!fir.ref>, !fir.box, i32, !fir.ref, i32, !fir.box, i1) -> none +// CHECK-NEXT: fir.call @_FortranAMinlocInteger4(%[[V8]], %[[V9]], %[[C4]], {{.*}}, {{.*}}, %[[V12]], %[[FALSE]]) fastmath : (!fir.ref>, !fir.box, i32, !fir.ref, i32, !fir.box, i1) -> () // CHECK-NEXT: %[[V14:.*]] = fir.load %[[V0]] : !fir.ref>>> // CHECK-NEXT: %[[V15:.*]]:3 = fir.box_dims %[[V14]], %[[C0]] : (!fir.box>>, index) -> (index, index, index) // CHECK-NEXT: %[[V16:.*]] = fir.box_addr %[[V14]] : (!fir.box>>) -> !fir.heap> @@ -226,7 +226,7 @@ func.func @_QPminloc5(%arg0: !fir.ref> {fir.bindc_name = "s"}) // CHECK: %[[V15:.*]] = fir.convert %[[V0]] : (!fir.ref>>>) -> !fir.ref> // CHECK-NEXT: %[[V16:.*]] = fir.convert %[[V8]] : (!fir.box>) -> !fir.box // CHECK: %[[V19:.*]] = fir.convert %[[V10]] : (!fir.box>) -> !fir.box -// CHECK-NEXT: %[[V20:.*]] = fir.call @_FortranAMinlocDim(%[[V15]], %[[V16]], %[[C4]], %[[C1]], {{.*}}, {{.*}}, %[[V19]], %[[FALSE]]) fastmath : (!fir.ref>, !fir.box, i32, i32, !fir.ref, i32, !fir.box, i1) -> none +// CHECK-NEXT: fir.call @_FortranAMinlocDim(%[[V15]], %[[V16]], %[[C4]], %[[C1]], {{.*}}, {{.*}}, %[[V19]], %[[FALSE]]) fastmath : (!fir.ref>, !fir.box, i32, i32, !fir.ref, i32, !fir.box, i1) -> () // CHECK-NEXT: %[[V21:.*]] = fir.load %[[V0]] : !fir.ref>>> // CHECK-NEXT: %[[V22:.*]]:3 = fir.box_dims %[[V21]], %[[C0]] : (!fir.box>>, index) -> (index, index, index) // CHECK-NEXT: %[[V23:.*]] = fir.box_addr %[[V21]] : (!fir.box>>) -> !fir.heap> @@ -265,7 +265,7 @@ func.func @_QPminloc6(%arg0: !fir.box>> {fir.bindc_n // CHECK: %[[V8:.*]] = fir.convert %[[V0]] : (!fir.ref>>>) -> !fir.ref> // CHECK-NEXT: %[[V9:.*]] = fir.convert %[[V1]]#1 : (!fir.box>>) -> !fir.box // CHECK: %[[V12:.*]] = fir.convert %[[V3]] : (!fir.box) -> 
!fir.box -// CHECK-NEXT: %[[V13:.*]] = fir.call @_FortranAMinlocCharacter(%[[V8]], %[[V9]], %[[C4]], {{.*}}, {{.*}}, %[[V12]], %[[FALSE]]) fastmath : (!fir.ref>, !fir.box, i32, !fir.ref, i32, !fir.box, i1) -> none +// CHECK-NEXT: fir.call @_FortranAMinlocCharacter(%[[V8]], %[[V9]], %[[C4]], {{.*}}, {{.*}}, %[[V12]], %[[FALSE]]) fastmath : (!fir.ref>, !fir.box, i32, !fir.ref, i32, !fir.box, i1) -> () // CHECK-NEXT: %[[V14:.*]] = fir.load %[[V0]] : !fir.ref>>> // CHECK-NEXT: %[[V15:.*]]:3 = fir.box_dims %[[V14]], %[[C0]] : (!fir.box>>, index) -> (index, index, index) // CHECK-NEXT: %[[V16:.*]] = fir.box_addr %[[V14]] : (!fir.box>>) -> !fir.heap> @@ -310,7 +310,7 @@ func.func @_QPminloc7(%arg0: !fir.box> {fir.bindc_name = "a"}, // CHECK: %[[V10:.*]] = fir.convert %[[V0]] : (!fir.ref>>) -> !fir.ref> // CHECK-NEXT: %[[V11:.*]] = fir.convert %[[V1]]#1 : (!fir.box>) -> !fir.box // CHECK: %[[V14:.*]] = fir.convert %[[V4]]#1 : (!fir.box>>) -> !fir.box -// CHECK-NEXT: %[[V15:.*]] = fir.call @_FortranAMinlocDim(%[[V10]], %[[V11]], %[[C4]], %[[V6]], {{.*}}, {{.*}}, %[[V14]], %[[FALSE]]) fastmath : (!fir.ref>, !fir.box, i32, i32, !fir.ref, i32, !fir.box, i1) -> none +// CHECK-NEXT: fir.call @_FortranAMinlocDim(%[[V10]], %[[V11]], %[[C4]], %[[V6]], {{.*}}, {{.*}}, %[[V14]], %[[FALSE]]) fastmath : (!fir.ref>, !fir.box, i32, i32, !fir.ref, i32, !fir.box, i1) -> () // CHECK-NEXT: %[[V16:.*]] = fir.load %[[V0]] : !fir.ref>> // CHECK-NEXT: %[[V17:.*]] = fir.box_addr %[[V16]] : (!fir.box>) -> !fir.heap // CHECK-NEXT: %[[V18:.*]] = fir.load %[[V17]] : !fir.heap diff --git a/flang/test/HLFIR/minval-lowering.fir b/flang/test/HLFIR/minval-lowering.fir index d03dec1552309..c9c78e3b2e446 100644 --- a/flang/test/HLFIR/minval-lowering.fir +++ b/flang/test/HLFIR/minval-lowering.fir @@ -56,7 +56,7 @@ func.func @_QPminval2(%arg0: !fir.box> {fir.bindc_name = "a" // CHECK-DAG: %[[RET_ARG:.*]] = fir.convert %[[RET_BOX]] // CHECK-DAG: %[[ARRAY_ARG:.*]] = fir.convert %[[ARRAY]]#1 : (!fir.box>) -> 
!fir.box // CHECK-DAG: %[[MASK_ARG:.*]] = fir.convert %[[MASK]] : (!fir.box) -> !fir.box -// CHECK: %[[NONE:.*]] = fir.call @_FortranAMinvalDim(%[[RET_ARG]], %[[ARRAY_ARG]], %[[DIM]], %[[LOC_STR:.*]], %[[LOC_N:.*]], %[[MASK_ARG]]) fastmath : (!fir.ref>, !fir.box, i32, !fir.ref, i32, !fir.box) -> none +// CHECK: fir.call @_FortranAMinvalDim(%[[RET_ARG]], %[[ARRAY_ARG]], %[[DIM]], %[[LOC_STR:.*]], %[[LOC_N:.*]], %[[MASK_ARG]]) fastmath : (!fir.ref>, !fir.box, i32, !fir.ref, i32, !fir.box) -> () // CHECK: %[[RET:.*]] = fir.load %[[RET_BOX]] // CHECK: %[[BOX_DIMS:.*]]:3 = fir.box_dims %[[RET]] @@ -175,7 +175,7 @@ func.func @_QPminval5(%arg0: !fir.ref> {fir.bindc_name = "s"}) // CHECK-DAG: %[[RET_ARG:.*]] = fir.convert %[[RET_BOX]] // CHECK-DAG: %[[ARRAY_ARG:.*]] = fir.convert %[[ARRAY_BOX]] : (!fir.box>) -> !fir.box // CHECK-DAG: %[[MASK_ARG:.*]] = fir.convert %[[MASK_BOX]] : (!fir.box>) -> !fir.box -// CHECK: %[[NONE:.*]] = fir.call @_FortranAMinvalDim(%[[RET_ARG]], %[[ARRAY_ARG]], %[[DIM]], %[[LOC_STR:.*]], %[[LOC_N:.*]], %[[MASK_ARG]]) fastmath : (!fir.ref>, !fir.box, i32, !fir.ref, i32, !fir.box) -> none +// CHECK: fir.call @_FortranAMinvalDim(%[[RET_ARG]], %[[ARRAY_ARG]], %[[DIM]], %[[LOC_STR:.*]], %[[LOC_N:.*]], %[[MASK_ARG]]) fastmath : (!fir.ref>, !fir.box, i32, !fir.ref, i32, !fir.box) -> () // simple one argument minval for character func.func @_QPminval6(%arg0: !fir.box>> {fir.bindc_name = "a"}, %arg1: !fir.boxchar<1> {fir.bindc_name = "s"}) { @@ -205,7 +205,7 @@ func.func @_QPminval6(%arg0: !fir.box>> {fir.bindc_n // CHECK-DAG: %[[RET_ARG:.*]] = fir.convert %[[RET_BOX]] // CHECK-DAG: %[[ARRAY_ARG:.*]] = fir.convert %[[ARRAY]]#1 : (!fir.box>>) -> !fir.box // CHECK-DAG: %[[MASK_ARG:.*]] = fir.convert %[[MASK]] : (!fir.box) -> !fir.box -// CHECK: %[[NONE:.*]] = fir.call @_FortranAMinvalCharacter(%[[RET_ARG]], %[[ARRAY_ARG]], %[[LOC_STR:.*]], %[[LOC_N:.*]], %[[MASK_ARG]]) fastmath : (!fir.ref>, !fir.box, !fir.ref, i32, !fir.box) -> none +// CHECK: fir.call 
@_FortranAMinvalCharacter(%[[RET_ARG]], %[[ARRAY_ARG]], %[[LOC_STR:.*]], %[[LOC_N:.*]], %[[MASK_ARG]]) fastmath : (!fir.ref>, !fir.box, !fir.ref, i32, !fir.box) -> () // CHECK: %[[RET:.*]] = fir.load %[[RET_BOX]] // CHECK: %[[BOX_ELESIZE:.*]] = fir.box_elesize %[[RET]] diff --git a/flang/test/HLFIR/mul_transpose.f90 b/flang/test/HLFIR/mul_transpose.f90 index 7cfbfe39d0ea8..98a9c11e4b9c1 100644 --- a/flang/test/HLFIR/mul_transpose.f90 +++ b/flang/test/HLFIR/mul_transpose.f90 @@ -24,7 +24,7 @@ subroutine mul_transpose(a, b, res) ! CHECK-BASE-NEXT: hlfir.destroy %[[MATMUL_RES]] ! CHECK-BASE-NEXT: hlfir.destroy %[[TRANSPOSE_RES]] -! CHECK-CANONICAL-NEXT: %[[CHAIN_RES:.*]] = hlfir.matmul_transpose %[[A_DECL]]#0 %[[B_DECL]]#0 : (!fir.ref>, !fir.ref>) -> !hlfir.expr<1x2xf32> +! CHECK-CANONICAL-NEXT: %[[CHAIN_RES:.*]] = hlfir.matmul_transpose %[[A_DECL]]#0 %[[B_DECL]]#0 {fastmath = #arith.fastmath} : (!fir.ref>, !fir.ref>) -> !hlfir.expr<1x2xf32> ! CHECK-CANONICAL-NEXT: hlfir.assign %[[CHAIN_RES]] to %[[RES_DECL]]#0 : !hlfir.expr<1x2xf32>, !fir.ref> ! CHECK-CANONICAL-NEXT: hlfir.destroy %[[CHAIN_RES]] diff --git a/flang/test/HLFIR/optional_dummy.f90 b/flang/test/HLFIR/optional_dummy.f90 index 8534a414eaaf5..ecb14f60fd7df 100644 --- a/flang/test/HLFIR/optional_dummy.f90 +++ b/flang/test/HLFIR/optional_dummy.f90 @@ -12,7 +12,7 @@ ! CHECK: %[[VAL_3:.*]] = arith.constant 0 : i32 ! CHECK: %[[VAL_4:.*]] = arith.constant false ! CHECK: %[[VAL_5:.*]] = arith.constant false -! CHECK: %[[VAL_6:.*]] = fir.call @_FortranAStopStatement(%[[VAL_3]], %[[VAL_4]], %[[VAL_5]]) fastmath : (i32, i1, i1) -> none +! CHECK: fir.call @_FortranAStopStatement(%[[VAL_3]], %[[VAL_4]], %[[VAL_5]]) fastmath : (i32, i1, i1) -> () ! CHECK: fir.unreachable ! CHECK: ^bb2: ! 
CHECK: cf.br ^bb3 diff --git a/flang/test/HLFIR/order_assignments/lhs-conflicts-codegen.fir b/flang/test/HLFIR/order_assignments/lhs-conflicts-codegen.fir index 45ceb516a6863..ac6c0d89f73fb 100644 --- a/flang/test/HLFIR/order_assignments/lhs-conflicts-codegen.fir +++ b/flang/test/HLFIR/order_assignments/lhs-conflicts-codegen.fir @@ -96,7 +96,7 @@ func.func @save_box_in_stack(%arg0: !fir.box>) { // CHECK: fir.do_loop {{.*}} { // CHECK: %[[VAL_48:.*]] = hlfir.designate %[[VAL_9]]#0 {{.*}} : (!fir.box>, i32, i32, index, !fir.shape<1>) -> !fir.box> // CHECK: %[[VAL_49:.*]] = fir.convert %[[VAL_48]] : (!fir.box>) -> !fir.box -// CHECK: %[[VAL_50:.*]] = fir.call @_FortranAPushDescriptor(%[[VAL_30]], %[[VAL_49]]) : (!fir.llvm_ptr, !fir.box) -> none +// CHECK: fir.call @_FortranAPushDescriptor(%[[VAL_30]], %[[VAL_49]]) : (!fir.llvm_ptr, !fir.box) -> () // CHECK: } // CHECK: fir.store %{{.*}} to %[[VAL_2]] : !fir.ref // CHECK: fir.do_loop {{.*}} { @@ -104,12 +104,12 @@ func.func @save_box_in_stack(%arg0: !fir.box>) { // CHECK: %[[VAL_61:.*]] = arith.addi %[[VAL_60]], %{{.*}} : i64 // CHECK: fir.store %[[VAL_61]] to %[[VAL_2]] : !fir.ref // CHECK: %[[VAL_62:.*]] = fir.convert %[[VAL_1]] : (!fir.ref>>>) -> !fir.ref> -// CHECK: %[[VAL_63:.*]] = fir.call @_FortranADescriptorAt(%[[VAL_30]], %[[VAL_60]], %[[VAL_62]]) : (!fir.llvm_ptr, i64, !fir.ref>) -> none +// CHECK: fir.call @_FortranADescriptorAt(%[[VAL_30]], %[[VAL_60]], %[[VAL_62]]) : (!fir.llvm_ptr, i64, !fir.ref>) -> () // CHECK: %[[VAL_64:.*]] = fir.load %[[VAL_1]] : !fir.ref>>> // CHECK: %[[VAL_65:.*]] = fir.convert %[[VAL_64]] : (!fir.box>>) -> !fir.box> // CHECK: hlfir.assign %{{.*}} to %[[VAL_65]] : i32, !fir.box> // CHECK: } -// CHECK: fir.call @_FortranADestroyDescriptorStack(%[[VAL_30]]) : (!fir.llvm_ptr) -> none +// CHECK: fir.call @_FortranADestroyDescriptorStack(%[[VAL_30]]) : (!fir.llvm_ptr) -> () // Test simplified IR for: // @@ -171,13 +171,13 @@ func.func @test_vector_subscript_overlap(%arg0: !fir.ref>) { 
// CHECK: %[[VAL_52:.*]] = fir.embox %[[VAL_51]](%[[VAL_48]]) : (!fir.ref>, !fir.shape<1>) -> !fir.box> // CHECK: %[[VAL_55:.*]] = fir.convert %[[VAL_52]] : (!fir.box>) -> !fir.box // Save the vector subscripted designator shape. -// CHECK: %[[VAL_56:.*]] = fir.call @_FortranAPushDescriptor({{.*}}, {{.*}}) : (!fir.llvm_ptr, !fir.box) -> none +// CHECK: fir.call @_FortranAPushDescriptor({{.*}}, {{.*}}) : (!fir.llvm_ptr, !fir.box) -> () // CHECK: fir.do_loop {{.*}} { // CHECK: %[[VAL_60:.*]] = hlfir.designate %[[VAL_11]]#0 (%{{.*}}) : (!fir.box>, i64) -> !fir.ref // CHECK: %[[VAL_61:.*]] = fir.embox %[[VAL_60]] : (!fir.ref) -> !fir.box // CHECK: %[[VAL_62:.*]] = fir.convert %[[VAL_61]] : (!fir.box) -> !fir.box // Save the vector subscripted designator element address. -// CHECK: %[[VAL_63:.*]] = fir.call @_FortranAPushDescriptor(%[[VAL_30]], %[[VAL_62]]) : (!fir.llvm_ptr, !fir.box) -> none +// CHECK: fir.call @_FortranAPushDescriptor(%[[VAL_30]], %[[VAL_62]]) : (!fir.llvm_ptr, !fir.box) -> () // CHECK: } // CHECK: } // CHECK: fir.store %{{.*}} to %[[VAL_4]] : !fir.ref @@ -189,7 +189,7 @@ func.func @test_vector_subscript_overlap(%arg0: !fir.ref>) { // CHECK: fir.store %[[VAL_71]] to %[[VAL_2]] : !fir.ref // CHECK: %[[VAL_72:.*]] = fir.convert %[[VAL_1]] : (!fir.ref>>>) -> !fir.ref> // Fetch the vector subscripted designator shape to create the elemental loop. 
-// CHECK: %[[VAL_73:.*]] = fir.call @_FortranADescriptorAt(%[[VAL_37]], %[[VAL_70]], %[[VAL_72]]) : (!fir.llvm_ptr, i64, !fir.ref>) -> none +// CHECK: fir.call @_FortranADescriptorAt(%[[VAL_37]], %[[VAL_70]], %[[VAL_72]]) : (!fir.llvm_ptr, i64, !fir.ref>) -> () // CHECK: %[[VAL_74:.*]] = fir.load %[[VAL_1]] : !fir.ref>>> // CHECK: %[[VAL_75:.*]] = fir.convert %[[VAL_74]] : (!fir.box>>) -> !fir.box> // CHECK: %[[VAL_76:.*]] = arith.constant 0 : index @@ -201,15 +201,15 @@ func.func @test_vector_subscript_overlap(%arg0: !fir.ref>) { // CHECK: fir.store %[[VAL_82]] to %[[VAL_4]] : !fir.ref // CHECK: %[[VAL_83:.*]] = fir.convert %[[VAL_3]] : (!fir.ref>>) -> !fir.ref> // Fetch the vector subscripted designator element address. -// CHECK: %[[VAL_84:.*]] = fir.call @_FortranADescriptorAt(%[[VAL_30]], %[[VAL_81]], %[[VAL_83]]) : (!fir.llvm_ptr, i64, !fir.ref>) -> none +// CHECK: fir.call @_FortranADescriptorAt(%[[VAL_30]], %[[VAL_81]], %[[VAL_83]]) : (!fir.llvm_ptr, i64, !fir.ref>) -> () // CHECK: %[[VAL_85:.*]] = fir.load %[[VAL_3]] : !fir.ref>> // CHECK: %[[VAL_86:.*]] = fir.box_addr %[[VAL_85]] : (!fir.box>) -> !fir.ptr // CHECK: %[[VAL_87:.*]] = fir.convert %[[VAL_86]] : (!fir.ptr) -> !fir.ref // CHECK: hlfir.assign %{{.*}} to %[[VAL_87]] : i64, !fir.ref // CHECK: } // CHECK: } -// CHECK: %[[VAL_88:.*]] = fir.call @_FortranADestroyDescriptorStack(%[[VAL_30]]) : (!fir.llvm_ptr) -> none -// CHECK: %[[VAL_89:.*]] = fir.call @_FortranADestroyDescriptorStack(%[[VAL_37]]) : (!fir.llvm_ptr) -> none +// CHECK: fir.call @_FortranADestroyDescriptorStack(%[[VAL_30]]) : (!fir.llvm_ptr) -> () +// CHECK: fir.call @_FortranADestroyDescriptorStack(%[[VAL_37]]) : (!fir.llvm_ptr) -> () func.func private @integer_to_real(!fir.ref, !fir.logical<4>) func.func private @foo(!fir.ref>, index) -> index diff --git a/flang/test/HLFIR/order_assignments/runtime-stack-temp.fir b/flang/test/HLFIR/order_assignments/runtime-stack-temp.fir index aa334c5ac56cf..4c2d416836671 100644 --- 
a/flang/test/HLFIR/order_assignments/runtime-stack-temp.fir +++ b/flang/test/HLFIR/order_assignments/runtime-stack-temp.fir @@ -89,7 +89,7 @@ func.func @test_runtime_stack(%arg0: !fir.box>, %n: !fir.ref !fir.shape<1> // CHECK: %[[VAL_37:.*]] = hlfir.designate %[[VAL_11]]#0 (%[[VAL_29]]:%[[VAL_30]]:%[[VAL_6]]) shape %[[VAL_36]] : (!fir.box>, index, index, index, !fir.shape<1>) -> !fir.box> // CHECK: %[[VAL_38:.*]] = fir.convert %[[VAL_37]] : (!fir.box>) -> !fir.box -// CHECK: %[[VAL_39:.*]] = fir.call @_FortranAPushValue(%[[VAL_22]], %[[VAL_38]]) : (!fir.llvm_ptr, !fir.box) -> none +// CHECK: fir.call @_FortranAPushValue(%[[VAL_22]], %[[VAL_38]]) : (!fir.llvm_ptr, !fir.box) -> () // CHECK: } // CHECK: %[[VAL_40:.*]] = fir.convert %[[VAL_9]] : (i32) -> index // CHECK: %[[VAL_41:.*]] = fir.convert %[[VAL_8]] : (i32) -> index @@ -101,7 +101,7 @@ func.func @test_runtime_stack(%arg0: !fir.box>, %n: !fir.ref // CHECK: %[[VAL_47:.*]] = fir.convert %[[VAL_2]] : (!fir.ref>>>) -> !fir.ref> -// CHECK: %[[VAL_48:.*]] = fir.call @_FortranAValueAt(%[[VAL_22]], %[[VAL_45]], %[[VAL_47]]) : (!fir.llvm_ptr, i64, !fir.ref>) -> none +// CHECK: fir.call @_FortranAValueAt(%[[VAL_22]], %[[VAL_45]], %[[VAL_47]]) : (!fir.llvm_ptr, i64, !fir.ref>) -> () // CHECK: %[[VAL_49:.*]] = fir.load %[[VAL_2]] : !fir.ref>>> // CHECK: %[[VAL_50:.*]] = fir.load %[[VAL_10]]#0 : !fir.ref // CHECK: %[[VAL_51:.*]] = arith.addi %[[VAL_44]], %[[VAL_50]] : i32 @@ -116,7 +116,7 @@ func.func @test_runtime_stack(%arg0: !fir.box>, %n: !fir.ref>, index, index, index, !fir.shape<1>) -> !fir.box> // CHECK: hlfir.assign %[[VAL_49]] to %[[VAL_60]] : !fir.box>>, !fir.box> // CHECK: } -// CHECK: %[[VAL_61:.*]] = fir.call @_FortranADestroyValueStack(%[[VAL_22]]) : (!fir.llvm_ptr) -> none +// CHECK: fir.call @_FortranADestroyValueStack(%[[VAL_22]]) : (!fir.llvm_ptr) -> () // CHECK: return // CHECK: } @@ -164,13 +164,13 @@ func.func @_QPdealing_with_i1(%x: !fir.ref>) { // CHECK: fir.store %[[VAL_27]] to %[[VAL_1]] : 
!fir.ref> // CHECK: %[[VAL_28:.*]] = fir.embox %[[VAL_1]] : (!fir.ref>) -> !fir.box> // CHECK: %[[VAL_29:.*]] = fir.convert %[[VAL_28]] : (!fir.box>) -> !fir.box -// CHECK: %[[VAL_30:.*]] = fir.call @_FortranAPushValue(%{{.*}}, %[[VAL_29]]) : (!fir.llvm_ptr, !fir.box) -> none +// CHECK: fir.call @_FortranAPushValue(%{{.*}}, %[[VAL_29]]) : (!fir.llvm_ptr, !fir.box) -> () // CHECK: } // CHECK: } // CHECK: fir.do_loop // CHECK: fir.do_loop // CHECK: %[[VAL_43:.*]] = fir.convert %[[VAL_2]] : (!fir.ref>>>) -> !fir.ref> -// CHECK: %[[VAL_44:.*]] = fir.call @_FortranAValueAt(%{{.*}}, %{{.*}}, %[[VAL_43]]) +// CHECK: fir.call @_FortranAValueAt(%{{.*}}, %{{.*}}, %[[VAL_43]]) // CHECK: %[[VAL_45:.*]] = fir.load %[[VAL_2]] : !fir.ref>>> // CHECK: %[[VAL_46:.*]] = fir.box_addr %[[VAL_45]] : (!fir.box>>) -> !fir.heap> // CHECK: %[[VAL_47:.*]] = fir.load %[[VAL_46]] : !fir.heap> diff --git a/flang/test/HLFIR/order_assignments/user-defined-assignment-finalization.fir b/flang/test/HLFIR/order_assignments/user-defined-assignment-finalization.fir index ae5329a2d2433..fbbc47185757f 100644 --- a/flang/test/HLFIR/order_assignments/user-defined-assignment-finalization.fir +++ b/flang/test/HLFIR/order_assignments/user-defined-assignment-finalization.fir @@ -56,7 +56,7 @@ func.func @_QPtest1() { hlfir.yield %4#0 : !fir.ref>}>> cleanup { %5 = fir.embox %0 : (!fir.ref>}>>) -> !fir.box>}>> %6 = fir.convert %5 : (!fir.box>}>>) -> !fir.box - %7 = fir.call @_FortranADestroy(%6) fastmath : (!fir.box) -> none + fir.call @_FortranADestroy(%6) fastmath : (!fir.box) -> () } } to { hlfir.yield %2#0 : !fir.ref>}>> @@ -86,7 +86,7 @@ func.func @_QPtest1() { // CHECK: hlfir.end_associate %[[VAL_6]]#1, %[[VAL_6]]#2 : !fir.ref>}>>, i1 // CHECK: %[[VAL_11:.*]] = fir.embox %[[VAL_0]] : (!fir.ref>}>>) -> !fir.box>}>> // CHECK: %[[VAL_12:.*]] = fir.convert %[[VAL_11]] : (!fir.box>}>>) -> !fir.box -// CHECK: %[[VAL_13:.*]] = fir.call @_FortranADestroy(%[[VAL_12]]) fastmath : (!fir.box) -> none +// CHECK: 
fir.call @_FortranADestroy(%[[VAL_12]]) fastmath : (!fir.box) -> () // CHECK: return // CHECK: } @@ -115,7 +115,7 @@ func.func @_QPtest2() { hlfir.yield %6#0 : !fir.ref>}>>> cleanup { %7 = fir.embox %0(%2) : (!fir.ref>}>>>, !fir.shape<1>) -> !fir.box>}>>> %8 = fir.convert %7 : (!fir.box>}>>>) -> !fir.box - %9 = fir.call @_FortranADestroy(%8) fastmath : (!fir.box) -> none + fir.call @_FortranADestroy(%8) fastmath : (!fir.box) -> () fir.call @llvm.stackrestore.p0(%4) fastmath : (!fir.ref) -> () } } to { @@ -154,7 +154,7 @@ func.func @_QPtest2() { // CHECK: hlfir.end_associate %[[VAL_9]]#1, %[[VAL_9]]#2 : !fir.ref>}>>>, i1 // CHECK: %[[VAL_18:.*]] = fir.embox %[[VAL_1]](%[[VAL_3]]) : (!fir.ref>}>>>, !fir.shape<1>) -> !fir.box>}>>> // CHECK: %[[VAL_19:.*]] = fir.convert %[[VAL_18]] : (!fir.box>}>>>) -> !fir.box -// CHECK: %[[VAL_20:.*]] = fir.call @_FortranADestroy(%[[VAL_19]]) fastmath : (!fir.box) -> none +// CHECK: fir.call @_FortranADestroy(%[[VAL_19]]) fastmath : (!fir.box) -> () // CHECK: fir.call @llvm.stackrestore.p0(%[[VAL_5]]) fastmath : (!fir.ref) -> () // CHECK: return // CHECK: } @@ -201,7 +201,7 @@ func.func @_QPtest3(%arg0: !fir.ref> {fir.bindc_name = "y"}) { hlfir.yield %9#0 : !fir.ref>}>>> cleanup { %10 = fir.embox %0(%2) : (!fir.ref>}>>>, !fir.shape<1>) -> !fir.box>}>>> %11 = fir.convert %10 : (!fir.box>}>>>) -> !fir.box - %12 = fir.call @_FortranADestroy(%11) fastmath : (!fir.box) -> none + fir.call @_FortranADestroy(%11) fastmath : (!fir.box) -> () fir.call @llvm.stackrestore.p0(%7) fastmath : (!fir.ref) -> () } } @@ -254,7 +254,7 @@ func.func @_QPtest3(%arg0: !fir.ref> {fir.bindc_name = "y"}) { // CHECK: %[[VAL_32:.*]] = hlfir.designate %[[VAL_20B]]#0 (%[[VAL_28]]) : (!fir.ref>}>>>, index) -> !fir.ref>}>> // CHECK: %[[VAL_33:.*]] = fir.embox %[[VAL_32]] : (!fir.ref>}>>) -> !fir.box>}>> // CHECK: %[[VAL_34:.*]] = fir.convert %[[VAL_33]] : (!fir.box>}>>) -> !fir.box -// CHECK: %[[VAL_35:.*]] = fir.call @_FortranAPushValue(%[[VAL_27]], %[[VAL_34]]) : 
(!fir.llvm_ptr, !fir.box) -> none +// CHECK: fir.call @_FortranAPushValue(%[[VAL_27]], %[[VAL_34]]) : (!fir.llvm_ptr, !fir.box) -> () // CHECK: } // CHECK: } // CHECK: fir.do_loop %[[VAL_37:.*]] = %{{.*}} to %[[VAL_4]] step %{{.*}} { @@ -266,7 +266,7 @@ func.func @_QPtest3(%arg0: !fir.ref> {fir.bindc_name = "y"}) { // CHECK: %[[VAL_42:.*]] = arith.addi %[[VAL_41]], %{{.*}} : i64 // CHECK: fir.store %[[VAL_42]] to %[[VAL_2]] : !fir.ref // CHECK: %[[VAL_43:.*]] = fir.convert %[[VAL_1]] : (!fir.ref>}>>>>) -> !fir.ref> -// CHECK: %[[VAL_44:.*]] = fir.call @_FortranAValueAt(%[[VAL_27]], %[[VAL_41]], %[[VAL_43]]) : (!fir.llvm_ptr, i64, !fir.ref>) -> none +// CHECK: fir.call @_FortranAValueAt(%[[VAL_27]], %[[VAL_41]], %[[VAL_43]]) : (!fir.llvm_ptr, i64, !fir.ref>) -> () // CHECK: %[[VAL_45:.*]] = fir.load %[[VAL_1]] : !fir.ref>}>>>> // CHECK: %[[VAL_46:.*]] = fir.box_addr %[[VAL_45]] : (!fir.box>}>>>) -> !fir.heap>}>> // CHECK: %[[VAL_47:.*]] = hlfir.designate %[[VAL_8]]#0 (%[[VAL_37]]) : (!fir.ref>}>>>, index) -> !fir.ref>}>> @@ -279,10 +279,10 @@ func.func @_QPtest3(%arg0: !fir.ref> {fir.bindc_name = "y"}) { // CHECK: } // CHECK: } // CHECK: hlfir.end_associate %[[VAL_16]]#1, %[[VAL_16]]#2 : !fir.ref>>, i1 -// CHECK: %[[VAL_53:.*]] = fir.call @_FortranADestroyValueStack(%[[VAL_27]]) : (!fir.llvm_ptr) -> none +// CHECK: fir.call @_FortranADestroyValueStack(%[[VAL_27]]) : (!fir.llvm_ptr) -> () // CHECK: %[[VAL_54:.*]] = fir.embox %[[VAL_5]](%[[VAL_7]]) : (!fir.ref>}>>>, !fir.shape<1>) -> !fir.box>}>>> // CHECK: %[[VAL_55:.*]] = fir.convert %[[VAL_54]] : (!fir.box>}>>>) -> !fir.box -// CHECK: %[[VAL_56:.*]] = fir.call @_FortranADestroy(%[[VAL_55]]) fastmath : (!fir.box) -> none +// CHECK: fir.call @_FortranADestroy(%[[VAL_55]]) fastmath : (!fir.box) -> () // CHECK: fir.call @llvm.stackrestore.p0(%[[VAL_18]]) fastmath : (!fir.ref) -> () // CHECK: return // CHECK: } diff --git a/flang/test/HLFIR/product-lowering.fir b/flang/test/HLFIR/product-lowering.fir index 
dd3506937cacb..45ae1f7aeaf5a 100644 --- a/flang/test/HLFIR/product-lowering.fir +++ b/flang/test/HLFIR/product-lowering.fir @@ -59,7 +59,7 @@ func.func @_QPproduct2(%arg0: !fir.box> {fir.bindc_name = "a // CHECK-DAG: %[[ARRAY_ARG:.*]] = fir.convert %[[ARRAY]] // CHECK-DAG: %[[MASK_ARG:.*]] = fir.convert %[[MASK]] -// CHECK: %[[NONE:.*]] = fir.call @_FortranAProductDim(%[[RET_ARG]], %[[ARRAY_ARG]], %[[DIM]], %[[LOC_STR:.*]], %[[LOC_N:.*]], %[[MASK_ARG]]) fastmath : (!fir.ref>, !fir.box, i32, !fir.ref, i32, !fir.box) -> none +// CHECK: fir.call @_FortranAProductDim(%[[RET_ARG]], %[[ARRAY_ARG]], %[[DIM]], %[[LOC_STR:.*]], %[[LOC_N:.*]], %[[MASK_ARG]]) fastmath : (!fir.ref>, !fir.box, i32, !fir.ref, i32, !fir.box) -> () // CHECK: %[[RET:.*]] = fir.load %[[RET_BOX]] // CHECK: %[[BOX_DIMS:.*]]:3 = fir.box_dims %[[RET]] // CHECK-NEXT: %[[ADDR:.*]] = fir.box_addr %[[RET]] @@ -165,4 +165,4 @@ func.func @_QPproduct5(%arg0: !fir.ref> {fir.bindc_name = "s"} // CHECK-DAG: %[[RET_ARG:.*]] = fir.convert %[[RET_BOX]] // CHECK-DAG: %[[ARRAY_ARG:.*]] = fir.convert %[[ARRAY_BOX]] : (!fir.box>) -> !fir.box // CHECK-DAG: %[[MASK_ARG:.*]] = fir.convert %[[MASK_BOX]] : (!fir.box>) -> !fir.box -// CHECK: %[[NONE:.*]] = fir.call @_FortranAProductDim(%[[RET_ARG]], %[[ARRAY_ARG]], %[[DIM]], %[[LOC_STR:.*]], %[[LOC_N:.*]], %[[MASK_ARG]]) fastmath : (!fir.ref>, !fir.box, i32, !fir.ref, i32, !fir.box) -> none +// CHECK: fir.call @_FortranAProductDim(%[[RET_ARG]], %[[ARRAY_ARG]], %[[DIM]], %[[LOC_STR:.*]], %[[LOC_N:.*]], %[[MASK_ARG]]) fastmath : (!fir.ref>, !fir.box, i32, !fir.ref, i32, !fir.box) -> () diff --git a/flang/test/HLFIR/simplify-hlfir-intrinsics-dotproduct.fir b/flang/test/HLFIR/simplify-hlfir-intrinsics-dotproduct.fir new file mode 100644 index 0000000000000..f59b1422dbc84 --- /dev/null +++ b/flang/test/HLFIR/simplify-hlfir-intrinsics-dotproduct.fir @@ -0,0 +1,144 @@ +// Test hlfir.dot_product simplification to a reduction loop: +// RUN: fir-opt --simplify-hlfir-intrinsics %s | 
FileCheck %s + +func.func @dot_product_integer(%arg0: !hlfir.expr, %arg1: !hlfir.expr) -> i32 { + %res = hlfir.dot_product %arg0 %arg1 : (!hlfir.expr, !hlfir.expr) -> i32 + return %res : i32 +} +// CHECK-LABEL: func.func @dot_product_integer( +// CHECK-SAME: %[[VAL_0:.*]]: !hlfir.expr, +// CHECK-SAME: %[[VAL_1:.*]]: !hlfir.expr) -> i32 { +// CHECK: %[[VAL_2:.*]] = arith.constant 1 : index +// CHECK: %[[VAL_3:.*]] = arith.constant 0 : i32 +// CHECK: %[[VAL_4:.*]] = hlfir.shape_of %[[VAL_0]] : (!hlfir.expr) -> !fir.shape<1> +// CHECK: %[[VAL_5:.*]] = hlfir.get_extent %[[VAL_4]] {dim = 0 : index} : (!fir.shape<1>) -> index +// CHECK: %[[VAL_6:.*]] = fir.do_loop %[[VAL_7:.*]] = %[[VAL_2]] to %[[VAL_5]] step %[[VAL_2]] unordered iter_args(%[[VAL_8:.*]] = %[[VAL_3]]) -> (i32) { +// CHECK: %[[VAL_9:.*]] = hlfir.apply %[[VAL_0]], %[[VAL_7]] : (!hlfir.expr, index) -> i16 +// CHECK: %[[VAL_10:.*]] = hlfir.apply %[[VAL_1]], %[[VAL_7]] : (!hlfir.expr, index) -> i32 +// CHECK: %[[VAL_11:.*]] = fir.convert %[[VAL_9]] : (i16) -> i32 +// CHECK: %[[VAL_12:.*]] = arith.muli %[[VAL_11]], %[[VAL_10]] : i32 +// CHECK: %[[VAL_13:.*]] = arith.addi %[[VAL_8]], %[[VAL_12]] : i32 +// CHECK: fir.result %[[VAL_13]] : i32 +// CHECK: } +// CHECK: return %[[VAL_6]] : i32 +// CHECK: } + +func.func @dot_product_real(%arg0: !hlfir.expr, %arg1: !hlfir.expr) -> f32 { + %res = hlfir.dot_product %arg0 %arg1 : (!hlfir.expr, !hlfir.expr) -> f32 + return %res : f32 +} +// CHECK-LABEL: func.func @dot_product_real( +// CHECK-SAME: %[[VAL_0:.*]]: !hlfir.expr, +// CHECK-SAME: %[[VAL_1:.*]]: !hlfir.expr) -> f32 { +// CHECK: %[[VAL_2:.*]] = arith.constant 1 : index +// CHECK: %[[VAL_3:.*]] = arith.constant 0.000000e+00 : f32 +// CHECK: %[[VAL_4:.*]] = hlfir.shape_of %[[VAL_0]] : (!hlfir.expr) -> !fir.shape<1> +// CHECK: %[[VAL_5:.*]] = hlfir.get_extent %[[VAL_4]] {dim = 0 : index} : (!fir.shape<1>) -> index +// CHECK: %[[VAL_6:.*]] = fir.do_loop %[[VAL_7:.*]] = %[[VAL_2]] to %[[VAL_5]] step %[[VAL_2]] 
iter_args(%[[VAL_8:.*]] = %[[VAL_3]]) -> (f32) { +// CHECK: %[[VAL_9:.*]] = hlfir.apply %[[VAL_0]], %[[VAL_7]] : (!hlfir.expr, index) -> f32 +// CHECK: %[[VAL_10:.*]] = hlfir.apply %[[VAL_1]], %[[VAL_7]] : (!hlfir.expr, index) -> f16 +// CHECK: %[[VAL_11:.*]] = fir.convert %[[VAL_10]] : (f16) -> f32 +// CHECK: %[[VAL_12:.*]] = arith.mulf %[[VAL_9]], %[[VAL_11]] : f32 +// CHECK: %[[VAL_13:.*]] = arith.addf %[[VAL_8]], %[[VAL_12]] : f32 +// CHECK: fir.result %[[VAL_13]] : f32 +// CHECK: } +// CHECK: return %[[VAL_6]] : f32 +// CHECK: } + +func.func @dot_product_complex(%arg0: !hlfir.expr>, %arg1: !hlfir.expr>) -> complex { + %res = hlfir.dot_product %arg0 %arg1 : (!hlfir.expr>, !hlfir.expr>) -> complex + return %res : complex +} +// CHECK-LABEL: func.func @dot_product_complex( +// CHECK-SAME: %[[VAL_0:.*]]: !hlfir.expr>, +// CHECK-SAME: %[[VAL_1:.*]]: !hlfir.expr>) -> complex { +// CHECK: %[[VAL_2:.*]] = arith.constant 1 : index +// CHECK: %[[VAL_3:.*]] = arith.constant 0.000000e+00 : f32 +// CHECK: %[[VAL_4:.*]] = hlfir.shape_of %[[VAL_0]] : (!hlfir.expr>) -> !fir.shape<1> +// CHECK: %[[VAL_5:.*]] = hlfir.get_extent %[[VAL_4]] {dim = 0 : index} : (!fir.shape<1>) -> index +// CHECK: %[[VAL_6:.*]] = fir.undefined complex +// CHECK: %[[VAL_7:.*]] = fir.insert_value %[[VAL_6]], %[[VAL_3]], [0 : index] : (complex, f32) -> complex +// CHECK: %[[VAL_8:.*]] = fir.insert_value %[[VAL_7]], %[[VAL_3]], [1 : index] : (complex, f32) -> complex +// CHECK: %[[VAL_9:.*]] = fir.do_loop %[[VAL_10:.*]] = %[[VAL_2]] to %[[VAL_5]] step %[[VAL_2]] iter_args(%[[VAL_11:.*]] = %[[VAL_8]]) -> (complex) { +// CHECK: %[[VAL_12:.*]] = hlfir.apply %[[VAL_0]], %[[VAL_10]] : (!hlfir.expr>, index) -> complex +// CHECK: %[[VAL_13:.*]] = hlfir.apply %[[VAL_1]], %[[VAL_10]] : (!hlfir.expr>, index) -> complex +// CHECK: %[[VAL_14:.*]] = fir.convert %[[VAL_13]] : (complex) -> complex +// CHECK: %[[VAL_15:.*]] = fir.extract_value %[[VAL_12]], [1 : index] : (complex) -> f32 +// CHECK: %[[VAL_16:.*]] = 
arith.negf %[[VAL_15]] : f32 +// CHECK: %[[VAL_17:.*]] = fir.insert_value %[[VAL_12]], %[[VAL_16]], [1 : index] : (complex, f32) -> complex +// CHECK: %[[VAL_18:.*]] = fir.mulc %[[VAL_17]], %[[VAL_14]] : complex +// CHECK: %[[VAL_19:.*]] = fir.addc %[[VAL_11]], %[[VAL_18]] : complex +// CHECK: fir.result %[[VAL_19]] : complex +// CHECK: } +// CHECK: return %[[VAL_9]] : complex +// CHECK: } + +func.func @dot_product_real_complex(%arg0: !hlfir.expr, %arg1: !hlfir.expr>) -> complex { + %res = hlfir.dot_product %arg0 %arg1 : (!hlfir.expr, !hlfir.expr>) -> complex + return %res : complex +} +// CHECK-LABEL: func.func @dot_product_real_complex( +// CHECK-SAME: %[[VAL_0:.*]]: !hlfir.expr, +// CHECK-SAME: %[[VAL_1:.*]]: !hlfir.expr>) -> complex { +// CHECK: %[[VAL_2:.*]] = arith.constant 1 : index +// CHECK: %[[VAL_3:.*]] = arith.constant 0.000000e+00 : f32 +// CHECK: %[[VAL_4:.*]] = hlfir.shape_of %[[VAL_0]] : (!hlfir.expr) -> !fir.shape<1> +// CHECK: %[[VAL_5:.*]] = hlfir.get_extent %[[VAL_4]] {dim = 0 : index} : (!fir.shape<1>) -> index +// CHECK: %[[VAL_6:.*]] = fir.undefined complex +// CHECK: %[[VAL_7:.*]] = fir.insert_value %[[VAL_6]], %[[VAL_3]], [0 : index] : (complex, f32) -> complex +// CHECK: %[[VAL_8:.*]] = fir.insert_value %[[VAL_7]], %[[VAL_3]], [1 : index] : (complex, f32) -> complex +// CHECK: %[[VAL_9:.*]] = fir.do_loop %[[VAL_10:.*]] = %[[VAL_2]] to %[[VAL_5]] step %[[VAL_2]] iter_args(%[[VAL_11:.*]] = %[[VAL_8]]) -> (complex) { +// CHECK: %[[VAL_12:.*]] = hlfir.apply %[[VAL_0]], %[[VAL_10]] : (!hlfir.expr, index) -> f32 +// CHECK: %[[VAL_13:.*]] = hlfir.apply %[[VAL_1]], %[[VAL_10]] : (!hlfir.expr>, index) -> complex +// CHECK: %[[VAL_14:.*]] = fir.undefined complex +// CHECK: %[[VAL_15:.*]] = fir.insert_value %[[VAL_14]], %[[VAL_3]], [0 : index] : (complex, f32) -> complex +// CHECK: %[[VAL_16:.*]] = fir.insert_value %[[VAL_15]], %[[VAL_3]], [1 : index] : (complex, f32) -> complex +// CHECK: %[[VAL_17:.*]] = fir.insert_value %[[VAL_16]], %[[VAL_12]], 
[0 : index] : (complex, f32) -> complex +// CHECK: %[[VAL_18:.*]] = fir.convert %[[VAL_13]] : (complex) -> complex +// CHECK: %[[VAL_19:.*]] = fir.extract_value %[[VAL_17]], [1 : index] : (complex) -> f32 +// CHECK: %[[VAL_20:.*]] = arith.negf %[[VAL_19]] : f32 +// CHECK: %[[VAL_21:.*]] = fir.insert_value %[[VAL_17]], %[[VAL_20]], [1 : index] : (complex, f32) -> complex +// CHECK: %[[VAL_22:.*]] = fir.mulc %[[VAL_21]], %[[VAL_18]] : complex +// CHECK: %[[VAL_23:.*]] = fir.addc %[[VAL_11]], %[[VAL_22]] : complex +// CHECK: fir.result %[[VAL_23]] : complex +// CHECK: } +// CHECK: return %[[VAL_9]] : complex +// CHECK: } + +func.func @dot_product_logical(%arg0: !hlfir.expr>, %arg1: !hlfir.expr>) -> !fir.logical<4> { + %res = hlfir.dot_product %arg0 %arg1 : (!hlfir.expr>, !hlfir.expr>) -> !fir.logical<4> + return %res : !fir.logical<4> +} +// CHECK-LABEL: func.func @dot_product_logical( +// CHECK-SAME: %[[VAL_0:.*]]: !hlfir.expr>, +// CHECK-SAME: %[[VAL_1:.*]]: !hlfir.expr>) -> !fir.logical<4> { +// CHECK: %[[VAL_2:.*]] = arith.constant 1 : index +// CHECK: %[[VAL_3:.*]] = arith.constant false +// CHECK: %[[VAL_4:.*]] = hlfir.shape_of %[[VAL_0]] : (!hlfir.expr>) -> !fir.shape<1> +// CHECK: %[[VAL_5:.*]] = hlfir.get_extent %[[VAL_4]] {dim = 0 : index} : (!fir.shape<1>) -> index +// CHECK: %[[VAL_6:.*]] = fir.convert %[[VAL_3]] : (i1) -> !fir.logical<4> +// CHECK: %[[VAL_7:.*]] = fir.do_loop %[[VAL_8:.*]] = %[[VAL_2]] to %[[VAL_5]] step %[[VAL_2]] unordered iter_args(%[[VAL_9:.*]] = %[[VAL_6]]) -> (!fir.logical<4>) { +// CHECK: %[[VAL_10:.*]] = hlfir.apply %[[VAL_0]], %[[VAL_8]] : (!hlfir.expr>, index) -> !fir.logical<1> +// CHECK: %[[VAL_11:.*]] = hlfir.apply %[[VAL_1]], %[[VAL_8]] : (!hlfir.expr>, index) -> !fir.logical<4> +// CHECK: %[[VAL_12:.*]] = fir.convert %[[VAL_9]] : (!fir.logical<4>) -> i1 +// CHECK: %[[VAL_13:.*]] = fir.convert %[[VAL_10]] : (!fir.logical<1>) -> i1 +// CHECK: %[[VAL_14:.*]] = fir.convert %[[VAL_11]] : (!fir.logical<4>) -> i1 +// CHECK: 
%[[VAL_15:.*]] = arith.andi %[[VAL_13]], %[[VAL_14]] : i1 +// CHECK: %[[VAL_16:.*]] = arith.ori %[[VAL_12]], %[[VAL_15]] : i1 +// CHECK: %[[VAL_17:.*]] = fir.convert %[[VAL_16]] : (i1) -> !fir.logical<4> +// CHECK: fir.result %[[VAL_17]] : !fir.logical<4> +// CHECK: } +// CHECK: return %[[VAL_7]] : !fir.logical<4> +// CHECK: } + +func.func @dot_product_known_dim(%arg0: !hlfir.expr<10xf32>, %arg1: !hlfir.expr) -> f32 { + %res1 = hlfir.dot_product %arg0 %arg1 : (!hlfir.expr<10xf32>, !hlfir.expr) -> f32 + %res2 = hlfir.dot_product %arg1 %arg0 : (!hlfir.expr, !hlfir.expr<10xf32>) -> f32 + %res = arith.addf %res1, %res2 : f32 + return %res : f32 +} +// CHECK-LABEL: func.func @dot_product_known_dim( +// CHECK: %[[VAL_2:.*]] = arith.constant 1 : index +// CHECK: %[[VAL_4:.*]] = arith.constant 10 : index +// CHECK: fir.do_loop %{{.*}} = %[[VAL_2]] to %[[VAL_4]] step %[[VAL_2]] +// CHECK: fir.do_loop %{{.*}} = %[[VAL_2]] to %[[VAL_4]] step %[[VAL_2]] diff --git a/flang/test/HLFIR/simplify-hlfir-intrinsics-matmul.fir b/flang/test/HLFIR/simplify-hlfir-intrinsics-matmul.fir new file mode 100644 index 0000000000000..d29e9a26c20ba --- /dev/null +++ b/flang/test/HLFIR/simplify-hlfir-intrinsics-matmul.fir @@ -0,0 +1,660 @@ +// Test hlfir.cshift simplification to hlfir.elemental: +// RUN: fir-opt --simplify-hlfir-intrinsics=allow-new-side-effects=false %s | FileCheck %s --check-prefixes=ALL,NOANSE +// RUN: fir-opt --simplify-hlfir-intrinsics=allow-new-side-effects=true %s | FileCheck %s --check-prefixes=ALL,ANSE +// RUN: fir-opt --simplify-hlfir-intrinsics -flang-inline-matmul-as-elemental %s | FileCheck %s --check-prefixes=ALL,ELEMENTAL + +func.func @matmul_matrix_matrix_integer(%arg0: !hlfir.expr, %arg1: !hlfir.expr) -> !hlfir.expr { + %res = hlfir.matmul %arg0 %arg1 : (!hlfir.expr, !hlfir.expr) -> !hlfir.expr + return %res : !hlfir.expr +} +// ALL-LABEL: func.func @matmul_matrix_matrix_integer( +// ALL-SAME: %[[VAL_0:.*]]: !hlfir.expr, +// ALL-SAME: %[[VAL_1:.*]]: !hlfir.expr) 
-> !hlfir.expr { + +// NOANSE: hlfir.matmul + +// ANSE: %[[VAL_2:.*]] = arith.constant 0 : index +// ANSE: %[[VAL_3:.*]] = arith.constant 1 : index +// ANSE: %[[VAL_4:.*]] = arith.constant 0 : i32 +// ANSE: %[[VAL_5:.*]] = hlfir.shape_of %[[VAL_0]] : (!hlfir.expr) -> !fir.shape<2> +// ANSE: %[[VAL_6:.*]] = hlfir.get_extent %[[VAL_5]] {dim = 0 : index} : (!fir.shape<2>) -> index +// ANSE: %[[VAL_7:.*]] = hlfir.get_extent %[[VAL_5]] {dim = 1 : index} : (!fir.shape<2>) -> index +// ANSE: %[[VAL_8:.*]] = hlfir.shape_of %[[VAL_1]] : (!hlfir.expr) -> !fir.shape<2> +// ANSE: %[[VAL_9:.*]] = hlfir.get_extent %[[VAL_8]] {dim = 1 : index} : (!fir.shape<2>) -> index +// ANSE: %[[VAL_10:.*]] = fir.shape %[[VAL_6]], %[[VAL_9]] : (index, index) -> !fir.shape<2> +// ANSE: %[[VAL_11:.*]] = hlfir.eval_in_mem shape %[[VAL_10]] : (!fir.shape<2>) -> !hlfir.expr { +// ANSE: ^bb0(%[[VAL_12:.*]]: !fir.ref>): +// ANSE: %[[VAL_13:.*]] = fir.embox %[[VAL_12]](%[[VAL_10]]) : (!fir.ref>, !fir.shape<2>) -> !fir.box> +// ANSE: fir.do_loop %[[VAL_14:.*]] = %[[VAL_3]] to %[[VAL_9]] step %[[VAL_3]] unordered { +// ANSE: fir.do_loop %[[VAL_15:.*]] = %[[VAL_3]] to %[[VAL_6]] step %[[VAL_3]] unordered { +// ANSE: %[[VAL_16:.*]]:3 = fir.box_dims %[[VAL_13]], %[[VAL_2]] : (!fir.box>, index) -> (index, index, index) +// ANSE: %[[VAL_17:.*]]:3 = fir.box_dims %[[VAL_13]], %[[VAL_3]] : (!fir.box>, index) -> (index, index, index) +// ANSE: %[[VAL_18:.*]] = arith.subi %[[VAL_16]]#0, %[[VAL_3]] : index +// ANSE: %[[VAL_19:.*]] = arith.addi %[[VAL_15]], %[[VAL_18]] : index +// ANSE: %[[VAL_20:.*]] = arith.subi %[[VAL_17]]#0, %[[VAL_3]] : index +// ANSE: %[[VAL_21:.*]] = arith.addi %[[VAL_14]], %[[VAL_20]] : index +// ANSE: %[[VAL_22:.*]] = hlfir.designate %[[VAL_13]] (%[[VAL_19]], %[[VAL_21]]) : (!fir.box>, index, index) -> !fir.ref +// ANSE: hlfir.assign %[[VAL_4]] to %[[VAL_22]] : i32, !fir.ref +// ANSE: } +// ANSE: } +// ANSE: fir.do_loop %[[VAL_23:.*]] = %[[VAL_3]] to %[[VAL_7]] step %[[VAL_3]] unordered { 
+// ANSE: fir.do_loop %[[VAL_24:.*]] = %[[VAL_3]] to %[[VAL_9]] step %[[VAL_3]] unordered { +// ANSE: fir.do_loop %[[VAL_25:.*]] = %[[VAL_3]] to %[[VAL_6]] step %[[VAL_3]] unordered { +// ANSE: %[[VAL_26:.*]]:3 = fir.box_dims %[[VAL_13]], %[[VAL_2]] : (!fir.box>, index) -> (index, index, index) +// ANSE: %[[VAL_27:.*]]:3 = fir.box_dims %[[VAL_13]], %[[VAL_3]] : (!fir.box>, index) -> (index, index, index) +// ANSE: %[[VAL_28:.*]] = arith.subi %[[VAL_26]]#0, %[[VAL_3]] : index +// ANSE: %[[VAL_29:.*]] = arith.addi %[[VAL_25]], %[[VAL_28]] : index +// ANSE: %[[VAL_30:.*]] = arith.subi %[[VAL_27]]#0, %[[VAL_3]] : index +// ANSE: %[[VAL_31:.*]] = arith.addi %[[VAL_24]], %[[VAL_30]] : index +// ANSE: %[[VAL_32:.*]] = hlfir.designate %[[VAL_13]] (%[[VAL_29]], %[[VAL_31]]) : (!fir.box>, index, index) -> !fir.ref +// ANSE: %[[VAL_33:.*]] = fir.load %[[VAL_32]] : !fir.ref +// ANSE: %[[VAL_34:.*]] = hlfir.apply %[[VAL_0]], %[[VAL_25]], %[[VAL_23]] : (!hlfir.expr, index, index) -> i16 +// ANSE: %[[VAL_35:.*]] = hlfir.apply %[[VAL_1]], %[[VAL_23]], %[[VAL_24]] : (!hlfir.expr, index, index) -> i32 +// ANSE: %[[VAL_36:.*]] = fir.convert %[[VAL_34]] : (i16) -> i32 +// ANSE: %[[VAL_37:.*]] = arith.muli %[[VAL_36]], %[[VAL_35]] : i32 +// ANSE: %[[VAL_38:.*]] = arith.addi %[[VAL_33]], %[[VAL_37]] : i32 +// ANSE: hlfir.assign %[[VAL_38]] to %[[VAL_32]] : i32, !fir.ref +// ANSE: } +// ANSE: } +// ANSE: } +// ANSE: } +// ANSE: return %[[VAL_11]] : !hlfir.expr +// ANSE: } + +// ELEMENTAL: %[[VAL_2:.*]] = arith.constant 1 : index +// ELEMENTAL: %[[VAL_3:.*]] = arith.constant 0 : i32 +// ELEMENTAL: %[[VAL_4:.*]] = hlfir.shape_of %[[VAL_0]] : (!hlfir.expr) -> !fir.shape<2> +// ELEMENTAL: %[[VAL_5:.*]] = hlfir.get_extent %[[VAL_4]] {dim = 0 : index} : (!fir.shape<2>) -> index +// ELEMENTAL: %[[VAL_6:.*]] = hlfir.get_extent %[[VAL_4]] {dim = 1 : index} : (!fir.shape<2>) -> index +// ELEMENTAL: %[[VAL_7:.*]] = hlfir.shape_of %[[VAL_1]] : (!hlfir.expr) -> !fir.shape<2> +// ELEMENTAL: 
%[[VAL_8:.*]] = hlfir.get_extent %[[VAL_7]] {dim = 1 : index} : (!fir.shape<2>) -> index +// ELEMENTAL: %[[VAL_9:.*]] = fir.shape %[[VAL_5]], %[[VAL_8]] : (index, index) -> !fir.shape<2> +// ELEMENTAL: %[[VAL_10:.*]] = hlfir.elemental %[[VAL_9]] unordered : (!fir.shape<2>) -> !hlfir.expr { +// ELEMENTAL: ^bb0(%[[VAL_11:.*]]: index, %[[VAL_12:.*]]: index): +// ELEMENTAL: %[[VAL_13:.*]] = fir.do_loop %[[VAL_14:.*]] = %[[VAL_2]] to %[[VAL_6]] step %[[VAL_2]] unordered iter_args(%[[VAL_15:.*]] = %[[VAL_3]]) -> (i32) { +// ELEMENTAL: %[[VAL_16:.*]] = hlfir.apply %[[VAL_0]], %[[VAL_11]], %[[VAL_14]] : (!hlfir.expr, index, index) -> i16 +// ELEMENTAL: %[[VAL_17:.*]] = hlfir.apply %[[VAL_1]], %[[VAL_14]], %[[VAL_12]] : (!hlfir.expr, index, index) -> i32 +// ELEMENTAL: %[[VAL_18:.*]] = fir.convert %[[VAL_16]] : (i16) -> i32 +// ELEMENTAL: %[[VAL_19:.*]] = arith.muli %[[VAL_18]], %[[VAL_17]] : i32 +// ELEMENTAL: %[[VAL_20:.*]] = arith.addi %[[VAL_15]], %[[VAL_19]] : i32 +// ELEMENTAL: fir.result %[[VAL_20]] : i32 +// ELEMENTAL: } +// ELEMENTAL: hlfir.yield_element %[[VAL_13]] : i32 +// ELEMENTAL: } +// ELEMENTAL: return %[[VAL_10]] : !hlfir.expr +// ELEMENTAL: } + +func.func @matmul_matrix_matrix_real(%arg0: !hlfir.expr, %arg1: !hlfir.expr) -> !hlfir.expr { + %res = hlfir.matmul %arg0 %arg1 : (!hlfir.expr, !hlfir.expr) -> !hlfir.expr + return %res : !hlfir.expr +} +// ALL-LABEL: func.func @matmul_matrix_matrix_real( +// ALL-SAME: %[[VAL_0:.*]]: !hlfir.expr, +// ALL-SAME: %[[VAL_1:.*]]: !hlfir.expr) -> !hlfir.expr { + +// NOANSE: hlfir.matmul + +// ANSE: %[[VAL_2:.*]] = arith.constant 0 : index +// ANSE: %[[VAL_3:.*]] = arith.constant 1 : index +// ANSE: %[[VAL_4:.*]] = arith.constant 0.000000e+00 : f32 +// ANSE: %[[VAL_5:.*]] = hlfir.shape_of %[[VAL_0]] : (!hlfir.expr) -> !fir.shape<2> +// ANSE: %[[VAL_6:.*]] = hlfir.get_extent %[[VAL_5]] {dim = 0 : index} : (!fir.shape<2>) -> index +// ANSE: %[[VAL_7:.*]] = hlfir.get_extent %[[VAL_5]] {dim = 1 : index} : (!fir.shape<2>) -> 
index +// ANSE: %[[VAL_8:.*]] = hlfir.shape_of %[[VAL_1]] : (!hlfir.expr) -> !fir.shape<2> +// ANSE: %[[VAL_9:.*]] = hlfir.get_extent %[[VAL_8]] {dim = 1 : index} : (!fir.shape<2>) -> index +// ANSE: %[[VAL_10:.*]] = fir.shape %[[VAL_6]], %[[VAL_9]] : (index, index) -> !fir.shape<2> +// ANSE: %[[VAL_11:.*]] = hlfir.eval_in_mem shape %[[VAL_10]] : (!fir.shape<2>) -> !hlfir.expr { +// ANSE: ^bb0(%[[VAL_12:.*]]: !fir.ref>): +// ANSE: %[[VAL_13:.*]] = fir.embox %[[VAL_12]](%[[VAL_10]]) : (!fir.ref>, !fir.shape<2>) -> !fir.box> +// ANSE: fir.do_loop %[[VAL_14:.*]] = %[[VAL_3]] to %[[VAL_9]] step %[[VAL_3]] unordered { +// ANSE: fir.do_loop %[[VAL_15:.*]] = %[[VAL_3]] to %[[VAL_6]] step %[[VAL_3]] unordered { +// ANSE: %[[VAL_16:.*]]:3 = fir.box_dims %[[VAL_13]], %[[VAL_2]] : (!fir.box>, index) -> (index, index, index) +// ANSE: %[[VAL_17:.*]]:3 = fir.box_dims %[[VAL_13]], %[[VAL_3]] : (!fir.box>, index) -> (index, index, index) +// ANSE: %[[VAL_18:.*]] = arith.subi %[[VAL_16]]#0, %[[VAL_3]] : index +// ANSE: %[[VAL_19:.*]] = arith.addi %[[VAL_15]], %[[VAL_18]] : index +// ANSE: %[[VAL_20:.*]] = arith.subi %[[VAL_17]]#0, %[[VAL_3]] : index +// ANSE: %[[VAL_21:.*]] = arith.addi %[[VAL_14]], %[[VAL_20]] : index +// ANSE: %[[VAL_22:.*]] = hlfir.designate %[[VAL_13]] (%[[VAL_19]], %[[VAL_21]]) : (!fir.box>, index, index) -> !fir.ref +// ANSE: hlfir.assign %[[VAL_4]] to %[[VAL_22]] : f32, !fir.ref +// ANSE: } +// ANSE: } +// ANSE: fir.do_loop %[[VAL_23:.*]] = %[[VAL_3]] to %[[VAL_7]] step %[[VAL_3]] { +// ANSE: fir.do_loop %[[VAL_24:.*]] = %[[VAL_3]] to %[[VAL_9]] step %[[VAL_3]] { +// ANSE: fir.do_loop %[[VAL_25:.*]] = %[[VAL_3]] to %[[VAL_6]] step %[[VAL_3]] { +// ANSE: %[[VAL_26:.*]]:3 = fir.box_dims %[[VAL_13]], %[[VAL_2]] : (!fir.box>, index) -> (index, index, index) +// ANSE: %[[VAL_27:.*]]:3 = fir.box_dims %[[VAL_13]], %[[VAL_3]] : (!fir.box>, index) -> (index, index, index) +// ANSE: %[[VAL_28:.*]] = arith.subi %[[VAL_26]]#0, %[[VAL_3]] : index +// ANSE: 
%[[VAL_29:.*]] = arith.addi %[[VAL_25]], %[[VAL_28]] : index +// ANSE: %[[VAL_30:.*]] = arith.subi %[[VAL_27]]#0, %[[VAL_3]] : index +// ANSE: %[[VAL_31:.*]] = arith.addi %[[VAL_24]], %[[VAL_30]] : index +// ANSE: %[[VAL_32:.*]] = hlfir.designate %[[VAL_13]] (%[[VAL_29]], %[[VAL_31]]) : (!fir.box>, index, index) -> !fir.ref +// ANSE: %[[VAL_33:.*]] = fir.load %[[VAL_32]] : !fir.ref +// ANSE: %[[VAL_34:.*]] = hlfir.apply %[[VAL_0]], %[[VAL_25]], %[[VAL_23]] : (!hlfir.expr, index, index) -> f32 +// ANSE: %[[VAL_35:.*]] = hlfir.apply %[[VAL_1]], %[[VAL_23]], %[[VAL_24]] : (!hlfir.expr, index, index) -> f16 +// ANSE: %[[VAL_36:.*]] = fir.convert %[[VAL_35]] : (f16) -> f32 +// ANSE: %[[VAL_37:.*]] = arith.mulf %[[VAL_34]], %[[VAL_36]] : f32 +// ANSE: %[[VAL_38:.*]] = arith.addf %[[VAL_33]], %[[VAL_37]] : f32 +// ANSE: hlfir.assign %[[VAL_38]] to %[[VAL_32]] : f32, !fir.ref +// ANSE: } +// ANSE: } +// ANSE: } +// ANSE: } +// ANSE: return %[[VAL_11]] : !hlfir.expr +// ANSE: } + +// ELEMENTAL: %[[VAL_2:.*]] = arith.constant 1 : index +// ELEMENTAL: %[[VAL_3:.*]] = arith.constant 0.000000e+00 : f32 +// ELEMENTAL: %[[VAL_4:.*]] = hlfir.shape_of %[[VAL_0]] : (!hlfir.expr) -> !fir.shape<2> +// ELEMENTAL: %[[VAL_5:.*]] = hlfir.get_extent %[[VAL_4]] {dim = 0 : index} : (!fir.shape<2>) -> index +// ELEMENTAL: %[[VAL_6:.*]] = hlfir.get_extent %[[VAL_4]] {dim = 1 : index} : (!fir.shape<2>) -> index +// ELEMENTAL: %[[VAL_7:.*]] = hlfir.shape_of %[[VAL_1]] : (!hlfir.expr) -> !fir.shape<2> +// ELEMENTAL: %[[VAL_8:.*]] = hlfir.get_extent %[[VAL_7]] {dim = 1 : index} : (!fir.shape<2>) -> index +// ELEMENTAL: %[[VAL_9:.*]] = fir.shape %[[VAL_5]], %[[VAL_8]] : (index, index) -> !fir.shape<2> +// ELEMENTAL: %[[VAL_10:.*]] = hlfir.elemental %[[VAL_9]] unordered : (!fir.shape<2>) -> !hlfir.expr { +// ELEMENTAL: ^bb0(%[[VAL_11:.*]]: index, %[[VAL_12:.*]]: index): +// ELEMENTAL: %[[VAL_13:.*]] = fir.do_loop %[[VAL_14:.*]] = %[[VAL_2]] to %[[VAL_6]] step %[[VAL_2]] iter_args(%[[VAL_15:.*]] = 
%[[VAL_3]]) -> (f32) { +// ELEMENTAL: %[[VAL_16:.*]] = hlfir.apply %[[VAL_0]], %[[VAL_11]], %[[VAL_14]] : (!hlfir.expr, index, index) -> f32 +// ELEMENTAL: %[[VAL_17:.*]] = hlfir.apply %[[VAL_1]], %[[VAL_14]], %[[VAL_12]] : (!hlfir.expr, index, index) -> f16 +// ELEMENTAL: %[[VAL_18:.*]] = fir.convert %[[VAL_17]] : (f16) -> f32 +// ELEMENTAL: %[[VAL_19:.*]] = arith.mulf %[[VAL_16]], %[[VAL_18]] : f32 +// ELEMENTAL: %[[VAL_20:.*]] = arith.addf %[[VAL_15]], %[[VAL_19]] : f32 +// ELEMENTAL: fir.result %[[VAL_20]] : f32 +// ELEMENTAL: } +// ELEMENTAL: hlfir.yield_element %[[VAL_13]] : f32 +// ELEMENTAL: } +// ELEMENTAL: return %[[VAL_10]] : !hlfir.expr +// ELEMENTAL: } + +func.func @matmul_matrix_matrix_complex(%arg0: !hlfir.expr>, %arg1: !hlfir.expr>) -> !hlfir.expr> { + %res = hlfir.matmul %arg0 %arg1 : (!hlfir.expr>, !hlfir.expr>) -> !hlfir.expr> + return %res : !hlfir.expr> +} +// ALL-LABEL: func.func @matmul_matrix_matrix_complex( +// ALL-SAME: %[[VAL_0:.*]]: !hlfir.expr>, +// ALL-SAME: %[[VAL_1:.*]]: !hlfir.expr>) -> !hlfir.expr> { + +// NOANSE: hlfir.matmul + +// ANSE: %[[VAL_2:.*]] = arith.constant 0 : index +// ANSE: %[[VAL_3:.*]] = arith.constant 1 : index +// ANSE: %[[VAL_4:.*]] = arith.constant 0.000000e+00 : f32 +// ANSE: %[[VAL_5:.*]] = hlfir.shape_of %[[VAL_0]] : (!hlfir.expr>) -> !fir.shape<2> +// ANSE: %[[VAL_6:.*]] = hlfir.get_extent %[[VAL_5]] {dim = 0 : index} : (!fir.shape<2>) -> index +// ANSE: %[[VAL_7:.*]] = hlfir.get_extent %[[VAL_5]] {dim = 1 : index} : (!fir.shape<2>) -> index +// ANSE: %[[VAL_8:.*]] = hlfir.shape_of %[[VAL_1]] : (!hlfir.expr>) -> !fir.shape<2> +// ANSE: %[[VAL_9:.*]] = hlfir.get_extent %[[VAL_8]] {dim = 1 : index} : (!fir.shape<2>) -> index +// ANSE: %[[VAL_10:.*]] = fir.shape %[[VAL_6]], %[[VAL_9]] : (index, index) -> !fir.shape<2> +// ANSE: %[[VAL_11:.*]] = hlfir.eval_in_mem shape %[[VAL_10]] : (!fir.shape<2>) -> !hlfir.expr> { +// ANSE: ^bb0(%[[VAL_12:.*]]: !fir.ref>>): +// ANSE: %[[VAL_13:.*]] = fir.embox 
%[[VAL_12]](%[[VAL_10]]) : (!fir.ref>>, !fir.shape<2>) -> !fir.box>> +// ANSE: %[[VAL_14:.*]] = fir.undefined complex +// ANSE: %[[VAL_15:.*]] = fir.insert_value %[[VAL_14]], %[[VAL_4]], [0 : index] : (complex, f32) -> complex +// ANSE: %[[VAL_16:.*]] = fir.insert_value %[[VAL_15]], %[[VAL_4]], [1 : index] : (complex, f32) -> complex +// ANSE: fir.do_loop %[[VAL_17:.*]] = %[[VAL_3]] to %[[VAL_9]] step %[[VAL_3]] unordered { +// ANSE: fir.do_loop %[[VAL_18:.*]] = %[[VAL_3]] to %[[VAL_6]] step %[[VAL_3]] unordered { +// ANSE: %[[VAL_19:.*]]:3 = fir.box_dims %[[VAL_13]], %[[VAL_2]] : (!fir.box>>, index) -> (index, index, index) +// ANSE: %[[VAL_20:.*]]:3 = fir.box_dims %[[VAL_13]], %[[VAL_3]] : (!fir.box>>, index) -> (index, index, index) +// ANSE: %[[VAL_21:.*]] = arith.subi %[[VAL_19]]#0, %[[VAL_3]] : index +// ANSE: %[[VAL_22:.*]] = arith.addi %[[VAL_18]], %[[VAL_21]] : index +// ANSE: %[[VAL_23:.*]] = arith.subi %[[VAL_20]]#0, %[[VAL_3]] : index +// ANSE: %[[VAL_24:.*]] = arith.addi %[[VAL_17]], %[[VAL_23]] : index +// ANSE: %[[VAL_25:.*]] = hlfir.designate %[[VAL_13]] (%[[VAL_22]], %[[VAL_24]]) : (!fir.box>>, index, index) -> !fir.ref> +// ANSE: hlfir.assign %[[VAL_16]] to %[[VAL_25]] : complex, !fir.ref> +// ANSE: } +// ANSE: } +// ANSE: fir.do_loop %[[VAL_26:.*]] = %[[VAL_3]] to %[[VAL_7]] step %[[VAL_3]] { +// ANSE: fir.do_loop %[[VAL_27:.*]] = %[[VAL_3]] to %[[VAL_9]] step %[[VAL_3]] { +// ANSE: fir.do_loop %[[VAL_28:.*]] = %[[VAL_3]] to %[[VAL_6]] step %[[VAL_3]] { +// ANSE: %[[VAL_29:.*]]:3 = fir.box_dims %[[VAL_13]], %[[VAL_2]] : (!fir.box>>, index) -> (index, index, index) +// ANSE: %[[VAL_30:.*]]:3 = fir.box_dims %[[VAL_13]], %[[VAL_3]] : (!fir.box>>, index) -> (index, index, index) +// ANSE: %[[VAL_31:.*]] = arith.subi %[[VAL_29]]#0, %[[VAL_3]] : index +// ANSE: %[[VAL_32:.*]] = arith.addi %[[VAL_28]], %[[VAL_31]] : index +// ANSE: %[[VAL_33:.*]] = arith.subi %[[VAL_30]]#0, %[[VAL_3]] : index +// ANSE: %[[VAL_34:.*]] = arith.addi %[[VAL_27]], 
%[[VAL_33]] : index +// ANSE: %[[VAL_35:.*]] = hlfir.designate %[[VAL_13]] (%[[VAL_32]], %[[VAL_34]]) : (!fir.box>>, index, index) -> !fir.ref> +// ANSE: %[[VAL_36:.*]] = fir.load %[[VAL_35]] : !fir.ref> +// ANSE: %[[VAL_37:.*]] = hlfir.apply %[[VAL_0]], %[[VAL_28]], %[[VAL_26]] : (!hlfir.expr>, index, index) -> complex +// ANSE: %[[VAL_38:.*]] = hlfir.apply %[[VAL_1]], %[[VAL_26]], %[[VAL_27]] : (!hlfir.expr>, index, index) -> complex +// ANSE: %[[VAL_39:.*]] = fir.convert %[[VAL_38]] : (complex) -> complex +// ANSE: %[[VAL_40:.*]] = fir.mulc %[[VAL_37]], %[[VAL_39]] : complex +// ANSE: %[[VAL_41:.*]] = fir.addc %[[VAL_36]], %[[VAL_40]] : complex +// ANSE: hlfir.assign %[[VAL_41]] to %[[VAL_35]] : complex, !fir.ref> +// ANSE: } +// ANSE: } +// ANSE: } +// ANSE: } +// ANSE: return %[[VAL_11]] : !hlfir.expr> +// ANSE: } + +// ELEMENTAL: %[[VAL_2:.*]] = arith.constant 1 : index +// ELEMENTAL: %[[VAL_3:.*]] = arith.constant 0.000000e+00 : f32 +// ELEMENTAL: %[[VAL_4:.*]] = hlfir.shape_of %[[VAL_0]] : (!hlfir.expr>) -> !fir.shape<2> +// ELEMENTAL: %[[VAL_5:.*]] = hlfir.get_extent %[[VAL_4]] {dim = 0 : index} : (!fir.shape<2>) -> index +// ELEMENTAL: %[[VAL_6:.*]] = hlfir.get_extent %[[VAL_4]] {dim = 1 : index} : (!fir.shape<2>) -> index +// ELEMENTAL: %[[VAL_7:.*]] = hlfir.shape_of %[[VAL_1]] : (!hlfir.expr>) -> !fir.shape<2> +// ELEMENTAL: %[[VAL_8:.*]] = hlfir.get_extent %[[VAL_7]] {dim = 1 : index} : (!fir.shape<2>) -> index +// ELEMENTAL: %[[VAL_9:.*]] = fir.shape %[[VAL_5]], %[[VAL_8]] : (index, index) -> !fir.shape<2> +// ELEMENTAL: %[[VAL_10:.*]] = hlfir.elemental %[[VAL_9]] unordered : (!fir.shape<2>) -> !hlfir.expr> { +// ELEMENTAL: ^bb0(%[[VAL_11:.*]]: index, %[[VAL_12:.*]]: index): +// ELEMENTAL: %[[VAL_13:.*]] = fir.undefined complex +// ELEMENTAL: %[[VAL_14:.*]] = fir.insert_value %[[VAL_13]], %[[VAL_3]], [0 : index] : (complex, f32) -> complex +// ELEMENTAL: %[[VAL_15:.*]] = fir.insert_value %[[VAL_14]], %[[VAL_3]], [1 : index] : (complex, f32) -> complex 
+// ELEMENTAL: %[[VAL_16:.*]] = fir.do_loop %[[VAL_17:.*]] = %[[VAL_2]] to %[[VAL_6]] step %[[VAL_2]] iter_args(%[[VAL_18:.*]] = %[[VAL_15]]) -> (complex) { +// ELEMENTAL: %[[VAL_19:.*]] = hlfir.apply %[[VAL_0]], %[[VAL_11]], %[[VAL_17]] : (!hlfir.expr>, index, index) -> complex +// ELEMENTAL: %[[VAL_20:.*]] = hlfir.apply %[[VAL_1]], %[[VAL_17]], %[[VAL_12]] : (!hlfir.expr>, index, index) -> complex +// ELEMENTAL: %[[VAL_21:.*]] = fir.convert %[[VAL_20]] : (complex) -> complex +// ELEMENTAL: %[[VAL_22:.*]] = fir.mulc %[[VAL_19]], %[[VAL_21]] : complex +// ELEMENTAL: %[[VAL_23:.*]] = fir.addc %[[VAL_18]], %[[VAL_22]] : complex +// ELEMENTAL: fir.result %[[VAL_23]] : complex +// ELEMENTAL: } +// ELEMENTAL: hlfir.yield_element %[[VAL_16]] : complex +// ELEMENTAL: } +// ELEMENTAL: return %[[VAL_10]] : !hlfir.expr> +// ELEMENTAL: } + +func.func @matmul_matrix_matrix_complex_real(%arg0: !hlfir.expr>, %arg1: !hlfir.expr) -> !hlfir.expr> { + %res = hlfir.matmul %arg0 %arg1 : (!hlfir.expr>, !hlfir.expr) -> !hlfir.expr> + return %res : !hlfir.expr> +} +// ALL-LABEL: func.func @matmul_matrix_matrix_complex_real( +// ALL-SAME: %[[VAL_0:.*]]: !hlfir.expr>, +// ALL-SAME: %[[VAL_1:.*]]: !hlfir.expr) -> !hlfir.expr> { + +// NOANSE: hlfir.matmul + +// ANSE: %[[VAL_2:.*]] = arith.constant 0 : index +// ANSE: %[[VAL_3:.*]] = arith.constant 1 : index +// ANSE: %[[VAL_4:.*]] = arith.constant 0.000000e+00 : f32 +// ANSE: %[[VAL_5:.*]] = hlfir.shape_of %[[VAL_0]] : (!hlfir.expr>) -> !fir.shape<2> +// ANSE: %[[VAL_6:.*]] = hlfir.get_extent %[[VAL_5]] {dim = 0 : index} : (!fir.shape<2>) -> index +// ANSE: %[[VAL_7:.*]] = hlfir.get_extent %[[VAL_5]] {dim = 1 : index} : (!fir.shape<2>) -> index +// ANSE: %[[VAL_8:.*]] = hlfir.shape_of %[[VAL_1]] : (!hlfir.expr) -> !fir.shape<2> +// ANSE: %[[VAL_9:.*]] = hlfir.get_extent %[[VAL_8]] {dim = 1 : index} : (!fir.shape<2>) -> index +// ANSE: %[[VAL_10:.*]] = fir.shape %[[VAL_6]], %[[VAL_9]] : (index, index) -> !fir.shape<2> +// ANSE: %[[VAL_11:.*]] 
= hlfir.eval_in_mem shape %[[VAL_10]] : (!fir.shape<2>) -> !hlfir.expr> { +// ANSE: ^bb0(%[[VAL_12:.*]]: !fir.ref>>): +// ANSE: %[[VAL_13:.*]] = fir.embox %[[VAL_12]](%[[VAL_10]]) : (!fir.ref>>, !fir.shape<2>) -> !fir.box>> +// ANSE: %[[VAL_14:.*]] = fir.undefined complex +// ANSE: %[[VAL_15:.*]] = fir.insert_value %[[VAL_14]], %[[VAL_4]], [0 : index] : (complex, f32) -> complex +// ANSE: %[[VAL_16:.*]] = fir.insert_value %[[VAL_15]], %[[VAL_4]], [1 : index] : (complex, f32) -> complex +// ANSE: fir.do_loop %[[VAL_17:.*]] = %[[VAL_3]] to %[[VAL_9]] step %[[VAL_3]] unordered { +// ANSE: fir.do_loop %[[VAL_18:.*]] = %[[VAL_3]] to %[[VAL_6]] step %[[VAL_3]] unordered { +// ANSE: %[[VAL_19:.*]]:3 = fir.box_dims %[[VAL_13]], %[[VAL_2]] : (!fir.box>>, index) -> (index, index, index) +// ANSE: %[[VAL_20:.*]]:3 = fir.box_dims %[[VAL_13]], %[[VAL_3]] : (!fir.box>>, index) -> (index, index, index) +// ANSE: %[[VAL_21:.*]] = arith.subi %[[VAL_19]]#0, %[[VAL_3]] : index +// ANSE: %[[VAL_22:.*]] = arith.addi %[[VAL_18]], %[[VAL_21]] : index +// ANSE: %[[VAL_23:.*]] = arith.subi %[[VAL_20]]#0, %[[VAL_3]] : index +// ANSE: %[[VAL_24:.*]] = arith.addi %[[VAL_17]], %[[VAL_23]] : index +// ANSE: %[[VAL_25:.*]] = hlfir.designate %[[VAL_13]] (%[[VAL_22]], %[[VAL_24]]) : (!fir.box>>, index, index) -> !fir.ref> +// ANSE: hlfir.assign %[[VAL_16]] to %[[VAL_25]] : complex, !fir.ref> +// ANSE: } +// ANSE: } +// ANSE: fir.do_loop %[[VAL_26:.*]] = %[[VAL_3]] to %[[VAL_7]] step %[[VAL_3]] { +// ANSE: fir.do_loop %[[VAL_27:.*]] = %[[VAL_3]] to %[[VAL_9]] step %[[VAL_3]] { +// ANSE: fir.do_loop %[[VAL_28:.*]] = %[[VAL_3]] to %[[VAL_6]] step %[[VAL_3]] { +// ANSE: %[[VAL_29:.*]]:3 = fir.box_dims %[[VAL_13]], %[[VAL_2]] : (!fir.box>>, index) -> (index, index, index) +// ANSE: %[[VAL_30:.*]]:3 = fir.box_dims %[[VAL_13]], %[[VAL_3]] : (!fir.box>>, index) -> (index, index, index) +// ANSE: %[[VAL_31:.*]] = arith.subi %[[VAL_29]]#0, %[[VAL_3]] : index +// ANSE: %[[VAL_32:.*]] = arith.addi 
%[[VAL_28]], %[[VAL_31]] : index +// ANSE: %[[VAL_33:.*]] = arith.subi %[[VAL_30]]#0, %[[VAL_3]] : index +// ANSE: %[[VAL_34:.*]] = arith.addi %[[VAL_27]], %[[VAL_33]] : index +// ANSE: %[[VAL_35:.*]] = hlfir.designate %[[VAL_13]] (%[[VAL_32]], %[[VAL_34]]) : (!fir.box>>, index, index) -> !fir.ref> +// ANSE: %[[VAL_36:.*]] = fir.load %[[VAL_35]] : !fir.ref> +// ANSE: %[[VAL_37:.*]] = hlfir.apply %[[VAL_0]], %[[VAL_28]], %[[VAL_26]] : (!hlfir.expr>, index, index) -> complex +// ANSE: %[[VAL_38:.*]] = hlfir.apply %[[VAL_1]], %[[VAL_26]], %[[VAL_27]] : (!hlfir.expr, index, index) -> f16 +// ANSE: %[[VAL_39:.*]] = fir.undefined complex +// ANSE: %[[VAL_40:.*]] = fir.insert_value %[[VAL_39]], %[[VAL_4]], [0 : index] : (complex, f32) -> complex +// ANSE: %[[VAL_41:.*]] = fir.insert_value %[[VAL_40]], %[[VAL_4]], [1 : index] : (complex, f32) -> complex +// ANSE: %[[VAL_42:.*]] = fir.convert %[[VAL_38]] : (f16) -> f32 +// ANSE: %[[VAL_43:.*]] = fir.insert_value %[[VAL_41]], %[[VAL_42]], [0 : index] : (complex, f32) -> complex +// ANSE: %[[VAL_44:.*]] = fir.mulc %[[VAL_37]], %[[VAL_43]] : complex +// ANSE: %[[VAL_45:.*]] = fir.addc %[[VAL_36]], %[[VAL_44]] : complex +// ANSE: hlfir.assign %[[VAL_45]] to %[[VAL_35]] : complex, !fir.ref> +// ANSE: } +// ANSE: } +// ANSE: } +// ANSE: } +// ANSE: return %[[VAL_11]] : !hlfir.expr> +// ANSE: } + +// ELEMENTAL: %[[VAL_2:.*]] = arith.constant 1 : index +// ELEMENTAL: %[[VAL_3:.*]] = arith.constant 0.000000e+00 : f32 +// ELEMENTAL: %[[VAL_4:.*]] = hlfir.shape_of %[[VAL_0]] : (!hlfir.expr>) -> !fir.shape<2> +// ELEMENTAL: %[[VAL_5:.*]] = hlfir.get_extent %[[VAL_4]] {dim = 0 : index} : (!fir.shape<2>) -> index +// ELEMENTAL: %[[VAL_6:.*]] = hlfir.get_extent %[[VAL_4]] {dim = 1 : index} : (!fir.shape<2>) -> index +// ELEMENTAL: %[[VAL_7:.*]] = hlfir.shape_of %[[VAL_1]] : (!hlfir.expr) -> !fir.shape<2> +// ELEMENTAL: %[[VAL_8:.*]] = hlfir.get_extent %[[VAL_7]] {dim = 1 : index} : (!fir.shape<2>) -> index +// ELEMENTAL: %[[VAL_9:.*]] = 
fir.shape %[[VAL_5]], %[[VAL_8]] : (index, index) -> !fir.shape<2> +// ELEMENTAL: %[[VAL_10:.*]] = hlfir.elemental %[[VAL_9]] unordered : (!fir.shape<2>) -> !hlfir.expr> { +// ELEMENTAL: ^bb0(%[[VAL_11:.*]]: index, %[[VAL_12:.*]]: index): +// ELEMENTAL: %[[VAL_13:.*]] = fir.undefined complex +// ELEMENTAL: %[[VAL_14:.*]] = fir.insert_value %[[VAL_13]], %[[VAL_3]], [0 : index] : (complex, f32) -> complex +// ELEMENTAL: %[[VAL_15:.*]] = fir.insert_value %[[VAL_14]], %[[VAL_3]], [1 : index] : (complex, f32) -> complex +// ELEMENTAL: %[[VAL_16:.*]] = fir.do_loop %[[VAL_17:.*]] = %[[VAL_2]] to %[[VAL_6]] step %[[VAL_2]] iter_args(%[[VAL_18:.*]] = %[[VAL_15]]) -> (complex) { +// ELEMENTAL: %[[VAL_19:.*]] = hlfir.apply %[[VAL_0]], %[[VAL_11]], %[[VAL_17]] : (!hlfir.expr>, index, index) -> complex +// ELEMENTAL: %[[VAL_20:.*]] = hlfir.apply %[[VAL_1]], %[[VAL_17]], %[[VAL_12]] : (!hlfir.expr, index, index) -> f16 +// ELEMENTAL: %[[VAL_21:.*]] = fir.undefined complex +// ELEMENTAL: %[[VAL_22:.*]] = fir.insert_value %[[VAL_21]], %[[VAL_3]], [0 : index] : (complex, f32) -> complex +// ELEMENTAL: %[[VAL_23:.*]] = fir.insert_value %[[VAL_22]], %[[VAL_3]], [1 : index] : (complex, f32) -> complex +// ELEMENTAL: %[[VAL_24:.*]] = fir.convert %[[VAL_20]] : (f16) -> f32 +// ELEMENTAL: %[[VAL_25:.*]] = fir.insert_value %[[VAL_23]], %[[VAL_24]], [0 : index] : (complex, f32) -> complex +// ELEMENTAL: %[[VAL_26:.*]] = fir.mulc %[[VAL_19]], %[[VAL_25]] : complex +// ELEMENTAL: %[[VAL_27:.*]] = fir.addc %[[VAL_18]], %[[VAL_26]] : complex +// ELEMENTAL: fir.result %[[VAL_27]] : complex +// ELEMENTAL: } +// ELEMENTAL: hlfir.yield_element %[[VAL_16]] : complex +// ELEMENTAL: } +// ELEMENTAL: return %[[VAL_10]] : !hlfir.expr> +// ELEMENTAL: } + +func.func @matmul_matrix_matrix_logical(%arg0: !hlfir.expr>, %arg1: !hlfir.expr>) -> !hlfir.expr> { + %res = hlfir.matmul %arg0 %arg1 : (!hlfir.expr>, !hlfir.expr>) -> !hlfir.expr> + return %res : !hlfir.expr> +} +// ALL-LABEL: func.func 
@matmul_matrix_matrix_logical( +// ALL-SAME: %[[VAL_0:.*]]: !hlfir.expr>, +// ALL-SAME: %[[VAL_1:.*]]: !hlfir.expr>) -> !hlfir.expr> { + +// NOANSE: hlfir.matmul + +// ANSE: %[[VAL_2:.*]] = arith.constant 0 : index +// ANSE: %[[VAL_3:.*]] = arith.constant 1 : index +// ANSE: %[[VAL_4:.*]] = arith.constant false +// ANSE: %[[VAL_5:.*]] = hlfir.shape_of %[[VAL_0]] : (!hlfir.expr>) -> !fir.shape<2> +// ANSE: %[[VAL_6:.*]] = hlfir.get_extent %[[VAL_5]] {dim = 0 : index} : (!fir.shape<2>) -> index +// ANSE: %[[VAL_7:.*]] = hlfir.get_extent %[[VAL_5]] {dim = 1 : index} : (!fir.shape<2>) -> index +// ANSE: %[[VAL_8:.*]] = hlfir.shape_of %[[VAL_1]] : (!hlfir.expr>) -> !fir.shape<2> +// ANSE: %[[VAL_9:.*]] = hlfir.get_extent %[[VAL_8]] {dim = 1 : index} : (!fir.shape<2>) -> index +// ANSE: %[[VAL_10:.*]] = fir.shape %[[VAL_6]], %[[VAL_9]] : (index, index) -> !fir.shape<2> +// ANSE: %[[VAL_11:.*]] = hlfir.eval_in_mem shape %[[VAL_10]] : (!fir.shape<2>) -> !hlfir.expr> { +// ANSE: ^bb0(%[[VAL_12:.*]]: !fir.ref>>): +// ANSE: %[[VAL_13:.*]] = fir.embox %[[VAL_12]](%[[VAL_10]]) : (!fir.ref>>, !fir.shape<2>) -> !fir.box>> +// ANSE: %[[VAL_14:.*]] = fir.convert %[[VAL_4]] : (i1) -> !fir.logical<4> +// ANSE: fir.do_loop %[[VAL_15:.*]] = %[[VAL_3]] to %[[VAL_9]] step %[[VAL_3]] unordered { +// ANSE: fir.do_loop %[[VAL_16:.*]] = %[[VAL_3]] to %[[VAL_6]] step %[[VAL_3]] unordered { +// ANSE: %[[VAL_17:.*]]:3 = fir.box_dims %[[VAL_13]], %[[VAL_2]] : (!fir.box>>, index) -> (index, index, index) +// ANSE: %[[VAL_18:.*]]:3 = fir.box_dims %[[VAL_13]], %[[VAL_3]] : (!fir.box>>, index) -> (index, index, index) +// ANSE: %[[VAL_19:.*]] = arith.subi %[[VAL_17]]#0, %[[VAL_3]] : index +// ANSE: %[[VAL_20:.*]] = arith.addi %[[VAL_16]], %[[VAL_19]] : index +// ANSE: %[[VAL_21:.*]] = arith.subi %[[VAL_18]]#0, %[[VAL_3]] : index +// ANSE: %[[VAL_22:.*]] = arith.addi %[[VAL_15]], %[[VAL_21]] : index +// ANSE: %[[VAL_23:.*]] = hlfir.designate %[[VAL_13]] (%[[VAL_20]], %[[VAL_22]]) : (!fir.box>>, 
index, index) -> !fir.ref> +// ANSE: hlfir.assign %[[VAL_14]] to %[[VAL_23]] : !fir.logical<4>, !fir.ref> +// ANSE: } +// ANSE: } +// ANSE: fir.do_loop %[[VAL_24:.*]] = %[[VAL_3]] to %[[VAL_7]] step %[[VAL_3]] unordered { +// ANSE: fir.do_loop %[[VAL_25:.*]] = %[[VAL_3]] to %[[VAL_9]] step %[[VAL_3]] unordered { +// ANSE: fir.do_loop %[[VAL_26:.*]] = %[[VAL_3]] to %[[VAL_6]] step %[[VAL_3]] unordered { +// ANSE: %[[VAL_27:.*]]:3 = fir.box_dims %[[VAL_13]], %[[VAL_2]] : (!fir.box>>, index) -> (index, index, index) +// ANSE: %[[VAL_28:.*]]:3 = fir.box_dims %[[VAL_13]], %[[VAL_3]] : (!fir.box>>, index) -> (index, index, index) +// ANSE: %[[VAL_29:.*]] = arith.subi %[[VAL_27]]#0, %[[VAL_3]] : index +// ANSE: %[[VAL_30:.*]] = arith.addi %[[VAL_26]], %[[VAL_29]] : index +// ANSE: %[[VAL_31:.*]] = arith.subi %[[VAL_28]]#0, %[[VAL_3]] : index +// ANSE: %[[VAL_32:.*]] = arith.addi %[[VAL_25]], %[[VAL_31]] : index +// ANSE: %[[VAL_33:.*]] = hlfir.designate %[[VAL_13]] (%[[VAL_30]], %[[VAL_32]]) : (!fir.box>>, index, index) -> !fir.ref> +// ANSE: %[[VAL_34:.*]] = fir.load %[[VAL_33]] : !fir.ref> +// ANSE: %[[VAL_35:.*]] = hlfir.apply %[[VAL_0]], %[[VAL_26]], %[[VAL_24]] : (!hlfir.expr>, index, index) -> !fir.logical<1> +// ANSE: %[[VAL_36:.*]] = hlfir.apply %[[VAL_1]], %[[VAL_24]], %[[VAL_25]] : (!hlfir.expr>, index, index) -> !fir.logical<4> +// ANSE: %[[VAL_37:.*]] = fir.convert %[[VAL_34]] : (!fir.logical<4>) -> i1 +// ANSE: %[[VAL_38:.*]] = fir.convert %[[VAL_35]] : (!fir.logical<1>) -> i1 +// ANSE: %[[VAL_39:.*]] = fir.convert %[[VAL_36]] : (!fir.logical<4>) -> i1 +// ANSE: %[[VAL_40:.*]] = arith.andi %[[VAL_38]], %[[VAL_39]] : i1 +// ANSE: %[[VAL_41:.*]] = arith.ori %[[VAL_37]], %[[VAL_40]] : i1 +// ANSE: %[[VAL_42:.*]] = fir.convert %[[VAL_41]] : (i1) -> !fir.logical<4> +// ANSE: hlfir.assign %[[VAL_42]] to %[[VAL_33]] : !fir.logical<4>, !fir.ref> +// ANSE: } +// ANSE: } +// ANSE: } +// ANSE: } +// ANSE: return %[[VAL_11]] : !hlfir.expr> +// ANSE: } + +// ELEMENTAL: 
%[[VAL_2:.*]] = arith.constant 1 : index +// ELEMENTAL: %[[VAL_3:.*]] = arith.constant false +// ELEMENTAL: %[[VAL_4:.*]] = hlfir.shape_of %[[VAL_0]] : (!hlfir.expr>) -> !fir.shape<2> +// ELEMENTAL: %[[VAL_5:.*]] = hlfir.get_extent %[[VAL_4]] {dim = 0 : index} : (!fir.shape<2>) -> index +// ELEMENTAL: %[[VAL_6:.*]] = hlfir.get_extent %[[VAL_4]] {dim = 1 : index} : (!fir.shape<2>) -> index +// ELEMENTAL: %[[VAL_7:.*]] = hlfir.shape_of %[[VAL_1]] : (!hlfir.expr>) -> !fir.shape<2> +// ELEMENTAL: %[[VAL_8:.*]] = hlfir.get_extent %[[VAL_7]] {dim = 1 : index} : (!fir.shape<2>) -> index +// ELEMENTAL: %[[VAL_9:.*]] = fir.shape %[[VAL_5]], %[[VAL_8]] : (index, index) -> !fir.shape<2> +// ELEMENTAL: %[[VAL_10:.*]] = hlfir.elemental %[[VAL_9]] unordered : (!fir.shape<2>) -> !hlfir.expr> { +// ELEMENTAL: ^bb0(%[[VAL_11:.*]]: index, %[[VAL_12:.*]]: index): +// ELEMENTAL: %[[VAL_13:.*]] = fir.convert %[[VAL_3]] : (i1) -> !fir.logical<4> +// ELEMENTAL: %[[VAL_14:.*]] = fir.do_loop %[[VAL_15:.*]] = %[[VAL_2]] to %[[VAL_6]] step %[[VAL_2]] unordered iter_args(%[[VAL_16:.*]] = %[[VAL_13]]) -> (!fir.logical<4>) { +// ELEMENTAL: %[[VAL_17:.*]] = hlfir.apply %[[VAL_0]], %[[VAL_11]], %[[VAL_15]] : (!hlfir.expr>, index, index) -> !fir.logical<1> +// ELEMENTAL: %[[VAL_18:.*]] = hlfir.apply %[[VAL_1]], %[[VAL_15]], %[[VAL_12]] : (!hlfir.expr>, index, index) -> !fir.logical<4> +// ELEMENTAL: %[[VAL_19:.*]] = fir.convert %[[VAL_16]] : (!fir.logical<4>) -> i1 +// ELEMENTAL: %[[VAL_20:.*]] = fir.convert %[[VAL_17]] : (!fir.logical<1>) -> i1 +// ELEMENTAL: %[[VAL_21:.*]] = fir.convert %[[VAL_18]] : (!fir.logical<4>) -> i1 +// ELEMENTAL: %[[VAL_22:.*]] = arith.andi %[[VAL_20]], %[[VAL_21]] : i1 +// ELEMENTAL: %[[VAL_23:.*]] = arith.ori %[[VAL_19]], %[[VAL_22]] : i1 +// ELEMENTAL: %[[VAL_24:.*]] = fir.convert %[[VAL_23]] : (i1) -> !fir.logical<4> +// ELEMENTAL: fir.result %[[VAL_24]] : !fir.logical<4> +// ELEMENTAL: } +// ELEMENTAL: hlfir.yield_element %[[VAL_14]] : !fir.logical<4> +// 
ELEMENTAL: } +// ELEMENTAL: return %[[VAL_10]] : !hlfir.expr> +// ELEMENTAL: } + +func.func @matmul_matrix_vector_real(%arg0: !hlfir.expr, %arg1: !hlfir.expr) -> !hlfir.expr { + %res = hlfir.matmul %arg0 %arg1 : (!hlfir.expr, !hlfir.expr) -> !hlfir.expr + return %res : !hlfir.expr +} +// ALL-LABEL: func.func @matmul_matrix_vector_real( +// ALL-SAME: %[[VAL_0:.*]]: !hlfir.expr, +// ALL-SAME: %[[VAL_1:.*]]: !hlfir.expr) -> !hlfir.expr { + +// NOANSE: hlfir.matmul + +// ANSE: %[[VAL_2:.*]] = arith.constant 0 : index +// ANSE: %[[VAL_3:.*]] = arith.constant 1 : index +// ANSE: %[[VAL_4:.*]] = arith.constant 0.000000e+00 : f32 +// ANSE: %[[VAL_5:.*]] = hlfir.shape_of %[[VAL_0]] : (!hlfir.expr) -> !fir.shape<2> +// ANSE: %[[VAL_6:.*]] = hlfir.get_extent %[[VAL_5]] {dim = 0 : index} : (!fir.shape<2>) -> index +// ANSE: %[[VAL_7:.*]] = hlfir.get_extent %[[VAL_5]] {dim = 1 : index} : (!fir.shape<2>) -> index +// ANSE: %[[VAL_8:.*]] = fir.shape %[[VAL_6]] : (index) -> !fir.shape<1> +// ANSE: %[[VAL_9:.*]] = hlfir.eval_in_mem shape %[[VAL_8]] : (!fir.shape<1>) -> !hlfir.expr { +// ANSE: ^bb0(%[[VAL_10:.*]]: !fir.ref>): +// ANSE: %[[VAL_11:.*]] = fir.embox %[[VAL_10]](%[[VAL_8]]) : (!fir.ref>, !fir.shape<1>) -> !fir.box> +// ANSE: fir.do_loop %[[VAL_12:.*]] = %[[VAL_3]] to %[[VAL_6]] step %[[VAL_3]] unordered { +// ANSE: %[[VAL_13:.*]]:3 = fir.box_dims %[[VAL_11]], %[[VAL_2]] : (!fir.box>, index) -> (index, index, index) +// ANSE: %[[VAL_14:.*]] = arith.subi %[[VAL_13]]#0, %[[VAL_3]] : index +// ANSE: %[[VAL_15:.*]] = arith.addi %[[VAL_12]], %[[VAL_14]] : index +// ANSE: %[[VAL_16:.*]] = hlfir.designate %[[VAL_11]] (%[[VAL_15]]) : (!fir.box>, index) -> !fir.ref +// ANSE: hlfir.assign %[[VAL_4]] to %[[VAL_16]] : f32, !fir.ref +// ANSE: } +// ANSE: fir.do_loop %[[VAL_17:.*]] = %[[VAL_3]] to %[[VAL_7]] step %[[VAL_3]] { +// ANSE: fir.do_loop %[[VAL_18:.*]] = %[[VAL_3]] to %[[VAL_6]] step %[[VAL_3]] { +// ANSE: %[[VAL_19:.*]]:3 = fir.box_dims %[[VAL_11]], %[[VAL_2]] : (!fir.box>, 
index) -> (index, index, index) +// ANSE: %[[VAL_20:.*]] = arith.subi %[[VAL_19]]#0, %[[VAL_3]] : index +// ANSE: %[[VAL_21:.*]] = arith.addi %[[VAL_18]], %[[VAL_20]] : index +// ANSE: %[[VAL_22:.*]] = hlfir.designate %[[VAL_11]] (%[[VAL_21]]) : (!fir.box>, index) -> !fir.ref +// ANSE: %[[VAL_23:.*]] = fir.load %[[VAL_22]] : !fir.ref +// ANSE: %[[VAL_24:.*]] = hlfir.apply %[[VAL_0]], %[[VAL_18]], %[[VAL_17]] : (!hlfir.expr, index, index) -> f32 +// ANSE: %[[VAL_25:.*]] = hlfir.apply %[[VAL_1]], %[[VAL_17]] : (!hlfir.expr, index) -> f16 +// ANSE: %[[VAL_26:.*]] = fir.convert %[[VAL_25]] : (f16) -> f32 +// ANSE: %[[VAL_27:.*]] = arith.mulf %[[VAL_24]], %[[VAL_26]] : f32 +// ANSE: %[[VAL_28:.*]] = arith.addf %[[VAL_23]], %[[VAL_27]] : f32 +// ANSE: hlfir.assign %[[VAL_28]] to %[[VAL_22]] : f32, !fir.ref +// ANSE: } +// ANSE: } +// ANSE: } +// ANSE: return %[[VAL_9]] : !hlfir.expr +// ANSE: } + +// ELEMENTAL: %[[VAL_2:.*]] = arith.constant 1 : index +// ELEMENTAL: %[[VAL_3:.*]] = arith.constant 0.000000e+00 : f32 +// ELEMENTAL: %[[VAL_4:.*]] = hlfir.shape_of %[[VAL_0]] : (!hlfir.expr) -> !fir.shape<2> +// ELEMENTAL: %[[VAL_5:.*]] = hlfir.get_extent %[[VAL_4]] {dim = 0 : index} : (!fir.shape<2>) -> index +// ELEMENTAL: %[[VAL_6:.*]] = hlfir.get_extent %[[VAL_4]] {dim = 1 : index} : (!fir.shape<2>) -> index +// ELEMENTAL: %[[VAL_7:.*]] = fir.shape %[[VAL_5]] : (index) -> !fir.shape<1> +// ELEMENTAL: %[[VAL_8:.*]] = hlfir.elemental %[[VAL_7]] unordered : (!fir.shape<1>) -> !hlfir.expr { +// ELEMENTAL: ^bb0(%[[VAL_9:.*]]: index): +// ELEMENTAL: %[[VAL_10:.*]] = fir.do_loop %[[VAL_11:.*]] = %[[VAL_2]] to %[[VAL_6]] step %[[VAL_2]] iter_args(%[[VAL_12:.*]] = %[[VAL_3]]) -> (f32) { +// ELEMENTAL: %[[VAL_13:.*]] = hlfir.apply %[[VAL_0]], %[[VAL_9]], %[[VAL_11]] : (!hlfir.expr, index, index) -> f32 +// ELEMENTAL: %[[VAL_14:.*]] = hlfir.apply %[[VAL_1]], %[[VAL_11]] : (!hlfir.expr, index) -> f16 +// ELEMENTAL: %[[VAL_15:.*]] = fir.convert %[[VAL_14]] : (f16) -> f32 +// 
ELEMENTAL: %[[VAL_16:.*]] = arith.mulf %[[VAL_13]], %[[VAL_15]] : f32 +// ELEMENTAL: %[[VAL_17:.*]] = arith.addf %[[VAL_12]], %[[VAL_16]] : f32 +// ELEMENTAL: fir.result %[[VAL_17]] : f32 +// ELEMENTAL: } +// ELEMENTAL: hlfir.yield_element %[[VAL_10]] : f32 +// ELEMENTAL: } +// ELEMENTAL: return %[[VAL_8]] : !hlfir.expr +// ELEMENTAL: } + +func.func @matmul_vector_matrix_real(%arg0: !hlfir.expr, %arg1: !hlfir.expr) -> !hlfir.expr { + %res = hlfir.matmul %arg0 %arg1 : (!hlfir.expr, !hlfir.expr) -> !hlfir.expr + return %res : !hlfir.expr +} +// ALL-LABEL: func.func @matmul_vector_matrix_real( +// ALL-SAME: %[[VAL_0:.*]]: !hlfir.expr, +// ALL-SAME: %[[VAL_1:.*]]: !hlfir.expr) -> !hlfir.expr { + +// NOANSE: hlfir.matmul + +// ANSE: %[[VAL_2:.*]] = arith.constant 0 : index +// ANSE: %[[VAL_3:.*]] = arith.constant 1 : index +// ANSE: %[[VAL_4:.*]] = arith.constant 0.000000e+00 : f32 +// ANSE: %[[VAL_5:.*]] = hlfir.shape_of %[[VAL_0]] : (!hlfir.expr) -> !fir.shape<1> +// ANSE: %[[VAL_6:.*]] = hlfir.get_extent %[[VAL_5]] {dim = 0 : index} : (!fir.shape<1>) -> index +// ANSE: %[[VAL_7:.*]] = hlfir.shape_of %[[VAL_1]] : (!hlfir.expr) -> !fir.shape<2> +// ANSE: %[[VAL_8:.*]] = hlfir.get_extent %[[VAL_7]] {dim = 1 : index} : (!fir.shape<2>) -> index +// ANSE: %[[VAL_9:.*]] = fir.shape %[[VAL_8]] : (index) -> !fir.shape<1> +// ANSE: %[[VAL_10:.*]] = hlfir.eval_in_mem shape %[[VAL_9]] : (!fir.shape<1>) -> !hlfir.expr { +// ANSE: ^bb0(%[[VAL_11:.*]]: !fir.ref>): +// ANSE: %[[VAL_12:.*]] = fir.embox %[[VAL_11]](%[[VAL_9]]) : (!fir.ref>, !fir.shape<1>) -> !fir.box> +// ANSE: fir.do_loop %[[VAL_13:.*]] = %[[VAL_3]] to %[[VAL_8]] step %[[VAL_3]] unordered { +// ANSE: %[[VAL_14:.*]]:3 = fir.box_dims %[[VAL_12]], %[[VAL_2]] : (!fir.box>, index) -> (index, index, index) +// ANSE: %[[VAL_15:.*]] = arith.subi %[[VAL_14]]#0, %[[VAL_3]] : index +// ANSE: %[[VAL_16:.*]] = arith.addi %[[VAL_13]], %[[VAL_15]] : index +// ANSE: %[[VAL_17:.*]] = hlfir.designate %[[VAL_12]] (%[[VAL_16]]) : 
(!fir.box>, index) -> !fir.ref +// ANSE: hlfir.assign %[[VAL_4]] to %[[VAL_17]] : f32, !fir.ref +// ANSE: } +// ANSE: fir.do_loop %[[VAL_18:.*]] = %[[VAL_3]] to %[[VAL_6]] step %[[VAL_3]] { +// ANSE: fir.do_loop %[[VAL_19:.*]] = %[[VAL_3]] to %[[VAL_8]] step %[[VAL_3]] { +// ANSE: %[[VAL_20:.*]]:3 = fir.box_dims %[[VAL_12]], %[[VAL_2]] : (!fir.box>, index) -> (index, index, index) +// ANSE: %[[VAL_21:.*]] = arith.subi %[[VAL_20]]#0, %[[VAL_3]] : index +// ANSE: %[[VAL_22:.*]] = arith.addi %[[VAL_19]], %[[VAL_21]] : index +// ANSE: %[[VAL_23:.*]] = hlfir.designate %[[VAL_12]] (%[[VAL_22]]) : (!fir.box>, index) -> !fir.ref +// ANSE: %[[VAL_24:.*]] = fir.load %[[VAL_23]] : !fir.ref +// ANSE: %[[VAL_25:.*]] = hlfir.apply %[[VAL_0]], %[[VAL_18]] : (!hlfir.expr, index) -> f32 +// ANSE: %[[VAL_26:.*]] = hlfir.apply %[[VAL_1]], %[[VAL_18]], %[[VAL_19]] : (!hlfir.expr, index, index) -> f16 +// ANSE: %[[VAL_27:.*]] = fir.convert %[[VAL_26]] : (f16) -> f32 +// ANSE: %[[VAL_28:.*]] = arith.mulf %[[VAL_25]], %[[VAL_27]] : f32 +// ANSE: %[[VAL_29:.*]] = arith.addf %[[VAL_24]], %[[VAL_28]] : f32 +// ANSE: hlfir.assign %[[VAL_29]] to %[[VAL_23]] : f32, !fir.ref +// ANSE: } +// ANSE: } +// ANSE: } +// ANSE: return %[[VAL_10]] : !hlfir.expr +// ANSE: } + +// ELEMENTAL: %[[VAL_2:.*]] = arith.constant 1 : index +// ELEMENTAL: %[[VAL_3:.*]] = arith.constant 0.000000e+00 : f32 +// ELEMENTAL: %[[VAL_4:.*]] = hlfir.shape_of %[[VAL_0]] : (!hlfir.expr) -> !fir.shape<1> +// ELEMENTAL: %[[VAL_5:.*]] = hlfir.get_extent %[[VAL_4]] {dim = 0 : index} : (!fir.shape<1>) -> index +// ELEMENTAL: %[[VAL_6:.*]] = hlfir.shape_of %[[VAL_1]] : (!hlfir.expr) -> !fir.shape<2> +// ELEMENTAL: %[[VAL_7:.*]] = hlfir.get_extent %[[VAL_6]] {dim = 1 : index} : (!fir.shape<2>) -> index +// ELEMENTAL: %[[VAL_8:.*]] = fir.shape %[[VAL_7]] : (index) -> !fir.shape<1> +// ELEMENTAL: %[[VAL_9:.*]] = hlfir.elemental %[[VAL_8]] unordered : (!fir.shape<1>) -> !hlfir.expr { +// ELEMENTAL: ^bb0(%[[VAL_10:.*]]: index): +// 
ELEMENTAL: %[[VAL_11:.*]] = fir.do_loop %[[VAL_12:.*]] = %[[VAL_2]] to %[[VAL_5]] step %[[VAL_2]] iter_args(%[[VAL_13:.*]] = %[[VAL_3]]) -> (f32) { +// ELEMENTAL: %[[VAL_14:.*]] = hlfir.apply %[[VAL_0]], %[[VAL_12]] : (!hlfir.expr, index) -> f32 +// ELEMENTAL: %[[VAL_15:.*]] = hlfir.apply %[[VAL_1]], %[[VAL_12]], %[[VAL_10]] : (!hlfir.expr, index, index) -> f16 +// ELEMENTAL: %[[VAL_16:.*]] = fir.convert %[[VAL_15]] : (f16) -> f32 +// ELEMENTAL: %[[VAL_17:.*]] = arith.mulf %[[VAL_14]], %[[VAL_16]] : f32 +// ELEMENTAL: %[[VAL_18:.*]] = arith.addf %[[VAL_13]], %[[VAL_17]] : f32 +// ELEMENTAL: fir.result %[[VAL_18]] : f32 +// ELEMENTAL: } +// ELEMENTAL: hlfir.yield_element %[[VAL_11]] : f32 +// ELEMENTAL: } +// ELEMENTAL: return %[[VAL_9]] : !hlfir.expr +// ELEMENTAL: } + +func.func @matmul_transpose_matrix_matrix_integer(%arg0: !hlfir.expr, %arg1: !hlfir.expr) -> !hlfir.expr { + %res = hlfir.matmul_transpose %arg0 %arg1 : (!hlfir.expr, !hlfir.expr) -> !hlfir.expr + return %res : !hlfir.expr +} +// ALL-LABEL: func.func @matmul_transpose_matrix_matrix_integer( +// ALL-SAME: %[[VAL_0:.*]]: !hlfir.expr, +// ALL-SAME: %[[VAL_1:.*]]: !hlfir.expr) -> !hlfir.expr { +// ALL: %[[VAL_2:.*]] = arith.constant 1 : index +// ALL: %[[VAL_3:.*]] = arith.constant 0 : i32 +// ALL: %[[VAL_4:.*]] = hlfir.shape_of %[[VAL_0]] : (!hlfir.expr) -> !fir.shape<2> +// ALL: %[[VAL_5:.*]] = hlfir.get_extent %[[VAL_4]] {dim = 0 : index} : (!fir.shape<2>) -> index +// ALL: %[[VAL_6:.*]] = hlfir.get_extent %[[VAL_4]] {dim = 1 : index} : (!fir.shape<2>) -> index +// ALL: %[[VAL_7:.*]] = hlfir.shape_of %[[VAL_1]] : (!hlfir.expr) -> !fir.shape<2> +// ALL: %[[VAL_8:.*]] = hlfir.get_extent %[[VAL_7]] {dim = 1 : index} : (!fir.shape<2>) -> index +// ALL: %[[VAL_9:.*]] = fir.shape %[[VAL_6]], %[[VAL_8]] : (index, index) -> !fir.shape<2> +// ALL: %[[VAL_10:.*]] = hlfir.elemental %[[VAL_9]] unordered : (!fir.shape<2>) -> !hlfir.expr { +// ALL: ^bb0(%[[VAL_11:.*]]: index, %[[VAL_12:.*]]: index): +// ALL: 
%[[VAL_13:.*]] = fir.do_loop %[[VAL_14:.*]] = %[[VAL_2]] to %[[VAL_5]] step %[[VAL_2]] unordered iter_args(%[[VAL_15:.*]] = %[[VAL_3]]) -> (i32) { +// ALL: %[[VAL_16:.*]] = hlfir.apply %[[VAL_0]], %[[VAL_14]], %[[VAL_11]] : (!hlfir.expr, index, index) -> i16 +// ALL: %[[VAL_17:.*]] = hlfir.apply %[[VAL_1]], %[[VAL_14]], %[[VAL_12]] : (!hlfir.expr, index, index) -> i32 +// ALL: %[[VAL_18:.*]] = fir.convert %[[VAL_16]] : (i16) -> i32 +// ALL: %[[VAL_19:.*]] = arith.muli %[[VAL_18]], %[[VAL_17]] : i32 +// ALL: %[[VAL_20:.*]] = arith.addi %[[VAL_15]], %[[VAL_19]] : i32 +// ALL: fir.result %[[VAL_20]] : i32 +// ALL: } +// ALL: hlfir.yield_element %[[VAL_13]] : i32 +// ALL: } +// ALL: return %[[VAL_10]] : !hlfir.expr +// ALL: } + +func.func @matmul_transpose_matrix_vector_real(%arg0: !hlfir.expr, %arg1: !hlfir.expr) -> !hlfir.expr { + %res = hlfir.matmul_transpose %arg0 %arg1 : (!hlfir.expr, !hlfir.expr) -> !hlfir.expr + return %res : !hlfir.expr +} +// ALL-LABEL: func.func @matmul_transpose_matrix_vector_real( +// ALL-SAME: %[[VAL_0:.*]]: !hlfir.expr, +// ALL-SAME: %[[VAL_1:.*]]: !hlfir.expr) -> !hlfir.expr { +// ALL: %[[VAL_2:.*]] = arith.constant 1 : index +// ALL: %[[VAL_3:.*]] = arith.constant 0.000000e+00 : f32 +// ALL: %[[VAL_4:.*]] = hlfir.shape_of %[[VAL_0]] : (!hlfir.expr) -> !fir.shape<2> +// ALL: %[[VAL_5:.*]] = hlfir.get_extent %[[VAL_4]] {dim = 0 : index} : (!fir.shape<2>) -> index +// ALL: %[[VAL_6:.*]] = hlfir.get_extent %[[VAL_4]] {dim = 1 : index} : (!fir.shape<2>) -> index +// ALL: %[[VAL_7:.*]] = fir.shape %[[VAL_6]] : (index) -> !fir.shape<1> +// ALL: %[[VAL_8:.*]] = hlfir.elemental %[[VAL_7]] unordered : (!fir.shape<1>) -> !hlfir.expr { +// ALL: ^bb0(%[[VAL_9:.*]]: index): +// ALL: %[[VAL_10:.*]] = fir.do_loop %[[VAL_11:.*]] = %[[VAL_2]] to %[[VAL_5]] step %[[VAL_2]] iter_args(%[[VAL_12:.*]] = %[[VAL_3]]) -> (f32) { +// ALL: %[[VAL_13:.*]] = hlfir.apply %[[VAL_0]], %[[VAL_11]], %[[VAL_9]] : (!hlfir.expr, index, index) -> f32 +// ALL: %[[VAL_14:.*]] 
= hlfir.apply %[[VAL_1]], %[[VAL_11]] : (!hlfir.expr, index) -> f16 +// ALL: %[[VAL_15:.*]] = fir.convert %[[VAL_14]] : (f16) -> f32 +// ALL: %[[VAL_16:.*]] = arith.mulf %[[VAL_13]], %[[VAL_15]] : f32 +// ALL: %[[VAL_17:.*]] = arith.addf %[[VAL_12]], %[[VAL_16]] : f32 +// ALL: fir.result %[[VAL_17]] : f32 +// ALL: } +// ALL: hlfir.yield_element %[[VAL_10]] : f32 +// ALL: } +// ALL: return %[[VAL_8]] : !hlfir.expr +// ALL: } + +// Check that the inner-product loop uses the best known extent +// of the input matrices: +func.func @matmul_matrix_matrix_deduce_bounds(%arg0: !hlfir.expr, %arg1: !hlfir.expr, %arg2: !hlfir.expr<10x?xi16>) -> (!hlfir.expr, !hlfir.expr) { + %res1 = hlfir.matmul %arg0 %arg1 : (!hlfir.expr, !hlfir.expr) -> !hlfir.expr + %res2 = hlfir.matmul %arg1 %arg2 : (!hlfir.expr, !hlfir.expr<10x?xi16>) -> !hlfir.expr + return %res1, %res2 : !hlfir.expr, !hlfir.expr +} +// ALL-LABEL: func.func @matmul_matrix_matrix_deduce_bounds( + +// ANSE: %[[VAL_6:.*]] = arith.constant 10 : index +// ANSE: hlfir.eval_in_mem shape {{.*}} +// ANSE: fir.do_loop +// ANSE: fir.do_loop +// ANSE: fir.do_loop %{{.*}} = %{{.*}} to %[[VAL_6]] +// ANSE: fir.do_loop +// ANSE: fir.do_loop +// ANSE: hlfir.eval_in_mem shape {{.*}} +// ANSE: fir.do_loop +// ANSE: fir.do_loop +// ANSE: fir.do_loop %{{.*}} = %{{.*}} to %[[VAL_6]] +// ANSE: fir.do_loop +// ANSE: fir.do_loop + +// ELEMENTAL: %[[VAL_5:.*]] = arith.constant 10 : index +// ELEMENTAL: hlfir.elemental %{{.*}} +// ELEMENTAL: fir.do_loop %{{.*}} = %{{.*}} to %[[VAL_5]] +// ELEMENTAL: hlfir.elemental %{{.*}} +// ELEMENTAL: fir.do_loop %{{.*}} = %{{.*}} to %[[VAL_5]] diff --git a/flang/test/HLFIR/sum-lowering.fir b/flang/test/HLFIR/sum-lowering.fir index d4a79d278acc4..e34ac487e8f9b 100644 --- a/flang/test/HLFIR/sum-lowering.fir +++ b/flang/test/HLFIR/sum-lowering.fir @@ -56,7 +56,7 @@ func.func @_QPsum2(%arg0: !fir.box> {fir.bindc_name = "a"}, // CHECK-DAG: %[[RET_ARG:.*]] = fir.convert %[[RET_BOX]] // CHECK-DAG: %[[ARRAY_ARG:.*]] 
= fir.convert %[[ARRAY]]#1 : (!fir.box>) -> !fir.box // CHECK-DAG: %[[MASK_ARG:.*]] = fir.convert %[[MASK]] : (!fir.box) -> !fir.box -// CHECK: %[[NONE:.*]] = fir.call @_FortranASumDim(%[[RET_ARG]], %[[ARRAY_ARG]], %[[DIM]], %[[LOC_STR:.*]], %[[LOC_N:.*]], %[[MASK_ARG]]) fastmath : (!fir.ref>, !fir.box, i32, !fir.ref, i32, !fir.box) -> none +// CHECK: fir.call @_FortranASumDim(%[[RET_ARG]], %[[ARRAY_ARG]], %[[DIM]], %[[LOC_STR:.*]], %[[LOC_N:.*]], %[[MASK_ARG]]) fastmath : (!fir.ref>, !fir.box, i32, !fir.ref, i32, !fir.box) -> () // CHECK: %[[RET:.*]] = fir.load %[[RET_BOX]] // CHECK: %[[BOX_DIMS:.*]]:3 = fir.box_dims %[[RET]] @@ -175,4 +175,4 @@ func.func @_QPsum5(%arg0: !fir.ref> {fir.bindc_name = "s"}) { // CHECK-DAG: %[[RET_ARG:.*]] = fir.convert %[[RET_BOX]] // CHECK-DAG: %[[ARRAY_ARG:.*]] = fir.convert %[[ARRAY_BOX]] : (!fir.box>) -> !fir.box // CHECK-DAG: %[[MASK_ARG:.*]] = fir.convert %[[MASK_BOX]] : (!fir.box>) -> !fir.box -// CHECK: %[[NONE:.*]] = fir.call @_FortranASumDim(%[[RET_ARG]], %[[ARRAY_ARG]], %[[DIM]], %[[LOC_STR:.*]], %[[LOC_N:.*]], %[[MASK_ARG]]) fastmath : (!fir.ref>, !fir.box, i32, !fir.ref, i32, !fir.box) -> none +// CHECK: fir.call @_FortranASumDim(%[[RET_ARG]], %[[ARRAY_ARG]], %[[DIM]], %[[LOC_STR:.*]], %[[LOC_N:.*]], %[[MASK_ARG]]) fastmath : (!fir.ref>, !fir.box, i32, !fir.ref, i32, !fir.box) -> () diff --git a/flang/test/HLFIR/transpose-lowering.fir b/flang/test/HLFIR/transpose-lowering.fir index 9afe8a058b8b8..76d51c3438282 100644 --- a/flang/test/HLFIR/transpose-lowering.fir +++ b/flang/test/HLFIR/transpose-lowering.fir @@ -33,7 +33,7 @@ func.func @_QPtranspose1(%arg0: !fir.ref> {fir.bindc_name = // CHECK: %[[RET_ARG:.*]] = fir.convert %[[RET_BOX]] : (!fir.ref>>>) // CHECK: %[[M_ARG:.*]] = fir.convert %[[M_BOX]] : (!fir.box>) -> !fir.box -// CHECK: %[[NONE:.*]] = fir.call @_FortranATranspose(%[[RET_ARG]], %[[M_ARG]], %[[LOC_STR:.*]], %[[LOC_N:.*]]) +// CHECK: fir.call @_FortranATranspose(%[[RET_ARG]], %[[M_ARG]], %[[LOC_STR:.*]], 
%[[LOC_N:.*]]) // CHECK: %[[RET:.*]] = fir.load %[[RET_BOX]] // CHECK-DAG: %[[BOX_DIMS:.*]]:3 = fir.box_dims %[[RET]] diff --git a/flang/test/HLFIR/unroll-loops.fir b/flang/test/HLFIR/unroll-loops.fir new file mode 100644 index 0000000000000..83b30d4d72693 --- /dev/null +++ b/flang/test/HLFIR/unroll-loops.fir @@ -0,0 +1,39 @@ +// RUN: %flang_fc1 -emit-llvm -O1 -funroll-loops -mllvm -force-vector-width=2 -o- %s | FileCheck %s --check-prefixes=CHECK,UNROLL +// RUN: %flang_fc1 -emit-llvm -O2 -mllvm -force-vector-width=2 -o- %s | FileCheck %s --check-prefixes=CHECK,UNROLL +// RUN: %flang_fc1 -emit-llvm -O1 -fno-unroll-loops -mllvm -force-vector-width=2 -o- %s | FileCheck %s --check-prefixes=CHECK,NO-UNROLL +// RUN: %flang_fc1 -emit-llvm -O1 -mllvm -force-vector-width=2 -o- %s | FileCheck %s --check-prefixes=CHECK,NO-UNROLL + +// CHECK-LABEL: @unroll +// CHECK-SAME: (ptr nocapture writeonly %[[ARG0:.*]]) +func.func @unroll(%arg0: !fir.ref> {fir.bindc_name = "a"}) { + %scope = fir.dummy_scope : !fir.dscope + %c1000 = arith.constant 1000 : index + %shape = fir.shape %c1000 : (index) -> !fir.shape<1> + %a:2 = hlfir.declare %arg0(%shape) dummy_scope %scope {uniq_name = "unrollEa"} : (!fir.ref>, !fir.shape<1>, !fir.dscope) -> (!fir.ref>, !fir.ref>) + %c1 = arith.constant 1 : index + fir.do_loop %arg1 = %c1 to %c1000 step %c1 { + // CHECK: br label %[[BLK:.*]] + // CHECK: [[BLK]]: + // CHECK-NEXT: %[[IND:.*]] = phi i64 [ 0, %{{.*}} ], [ %[[NIV:.*]], %[[BLK]] ] + // CHECK-NEXT: %[[VIND:.*]] = phi <2 x i64> [ , %{{.*}} ], [ %[[NVIND:.*]], %[[BLK]] ] + + // NO-UNROLL-NEXT: %[[GEP:.*]] = getelementptr i64, ptr %[[ARG0]], i64 %[[IND]] + // NO-UNROLL-NEXT: store <2 x i64> %[[VIND]], ptr %[[GEP]] + // NO-UNROLL-NEXT: %[[NIV:.*]] = add nuw i64 %{{.*}}, 2 + // NO-UNROLL-NEXT: %[[NVIND]] = add <2 x i64> %[[VIND]], splat (i64 2) + + // UNROLL-NEXT: %[[VIND1:.*]] = add <2 x i64> %[[VIND]], splat (i64 2) + // UNROLL-NEXT: %[[GEP0:.*]] = getelementptr i64, ptr %[[ARG0]], i64 %[[IND]] + // 
UNROLL-NEXT: %[[GEP1:.*]] = getelementptr i8, ptr %[[GEP0]], i64 16 + // UNROLL-NEXT: store <2 x i64> %[[VIND]], ptr %[[GEP0]] + // UNROLL-NEXT: store <2 x i64> %[[VIND1]], ptr %[[GEP1]] + // UNROLL-NEXT: %[[NIV:.*]] = add nuw i64 %[[IND]], 4 + // UNROLL-NEXT: %[[NVIND:.*]] = add <2 x i64> %[[VIND]], splat (i64 4) + + // CHECK-NEXT: %[[EXIT:.*]] = icmp eq i64 %[[NIV]], 1000 + // CHECK-NEXT: br i1 %[[EXIT]], label %{{.*}}, label %[[BLK]] + %ai = hlfir.designate %a#0 (%arg1) : (!fir.ref>, index) -> !fir.ref + hlfir.assign %arg1 to %ai : index, !fir.ref + } + return +} diff --git a/flang/test/Integration/OpenMP/parallel-private-reduction-worstcase.f90 b/flang/test/Integration/OpenMP/parallel-private-reduction-worstcase.f90 index fe3a326702e52..0173847b73235 100644 --- a/flang/test/Integration/OpenMP/parallel-private-reduction-worstcase.f90 +++ b/flang/test/Integration/OpenMP/parallel-private-reduction-worstcase.f90 @@ -96,9 +96,12 @@ subroutine worst_case(a, b, c, d) ! CHECK: omp.region.cont13: ; preds = %omp.private.copy16 ! CHECK-NEXT: %{{.*}} = phi ptr +! CHECK-NEXT: br label %omp.region.after_alloca + +! CHECK: omp.region.after_alloca: ! CHECK-NEXT: br label %omp.par.region -! CHECK: omp.par.region: ; preds = %omp.region.cont13 +! CHECK: omp.par.region: ; preds = %omp.region.after_alloca ! CHECK-NEXT: br label %omp.reduction.init ! CHECK: omp.reduction.init: ; preds = %omp.par.region @@ -232,7 +235,7 @@ subroutine worst_case(a, b, c, d) ! CHECK-NEXT: br label %omp.reduction.cleanup42 ! CHECK: omp.par.region28: ; preds = %omp.par.region27 -! CHECK-NEXT: call {} @_FortranAStopStatement +! CHECK-NEXT: call void @_FortranAStopStatement ! CHECK: omp.reduction.neutral23: ; preds = %omp.reduction.neutral22 ! 
[source length was zero: finish initializing array] diff --git a/flang/test/Integration/OpenMP/private-global.f90 b/flang/test/Integration/OpenMP/private-global.f90 index 63ac6fbe05ee0..07dbe86e5ec93 100644 --- a/flang/test/Integration/OpenMP/private-global.f90 +++ b/flang/test/Integration/OpenMP/private-global.f90 @@ -34,7 +34,7 @@ program bug ! CHECK : %[[TABLE_BOX_VAL2:.*]] = load { ptr, i64, i32, i8, i8, i8, i8, [1 x [3 x i64]] }, ptr %[[TABLE_BOX_ADDR]], align 8 ! CHECK : store { ptr, i64, i32, i8, i8, i8, i8, [1 x [3 x i64]] } %[[TABLE_BOX_VAL2]], ptr %[[TABLE_BOX_ADDR2]], align 8 ! CHECK: call void @llvm.memcpy.p0.p0.i32(ptr %[[TABLE_BOX_ADDR2]], ptr %[[TABLE_BOX_ADDR]], i32 48, i1 false) -! CHECK: %[[VAL_26:.*]] = call {} @_FortranAAssign(ptr %[[TABLE_BOX_ADDR2]], ptr %[[BOXED_FIFTY]], ptr @{{.*}}, i32 9) +! CHECK: call void @_FortranAAssign(ptr %[[TABLE_BOX_ADDR2]], ptr %[[BOXED_FIFTY]], ptr @{{.*}}, i32 9) ! ... ! check that we use the private copy of table for table/=50 ! CHECK: omp.par.region3: diff --git a/flang/test/Integration/unroll-loops.f90 b/flang/test/Integration/unroll-loops.f90 new file mode 100644 index 0000000000000..939c96e150690 --- /dev/null +++ b/flang/test/Integration/unroll-loops.f90 @@ -0,0 +1,34 @@ +! RUN: %flang_fc1 -emit-llvm -O1 -funroll-loops -mllvm -force-vector-width=2 -o- %s | FileCheck %s --check-prefixes=CHECK,UNROLL +! RUN: %flang_fc1 -emit-llvm -O2 -mllvm -force-vector-width=2 -o- %s | FileCheck %s --check-prefixes=CHECK,UNROLL +! RUN: %flang_fc1 -emit-llvm -O1 -fno-unroll-loops -mllvm -force-vector-width=2 -o- %s | FileCheck %s --check-prefixes=CHECK,NO-UNROLL +! RUN: %flang_fc1 -emit-llvm -O1 -mllvm -force-vector-width=2 -o- %s | FileCheck %s --check-prefixes=CHECK,NO-UNROLL + +! CHECK-LABEL: @unroll +! CHECK-SAME: (ptr nocapture writeonly %[[ARG0:.*]]) +subroutine unroll(a) + integer(kind=8), intent(out) :: a(1000) + integer(kind=8) :: i + ! CHECK: br label %[[BLK:.*]] + ! CHECK: [[BLK]]: + ! 
CHECK-NEXT: %[[IND:.*]] = phi i64 [ 0, %{{.*}} ], [ %[[NIV:.*]], %[[BLK]] ] + ! CHECK-NEXT: %[[VIND:.*]] = phi <2 x i64> [ , %{{.*}} ], [ %[[NVIND:.*]], %[[BLK]] ] + ! + ! NO-UNROLL-NEXT: %[[GEP:.*]] = getelementptr i64, ptr %[[ARG0]], i64 %[[IND]] + ! NO-UNROLL-NEXT: store <2 x i64> %[[VIND]], ptr %[[GEP]] + ! NO-UNROLL-NEXT: %[[NIV:.*]] = add nuw i64 %{{.*}}, 2 + ! NO-UNROLL-NEXT: %[[NVIND]] = add <2 x i64> %[[VIND]], splat (i64 2) + ! + ! UNROLL-NEXT: %[[VIND1:.*]] = add <2 x i64> %[[VIND]], splat (i64 2) + ! UNROLL-NEXT: %[[GEP0:.*]] = getelementptr i64, ptr %[[ARG0]], i64 %[[IND]] + ! UNROLL-NEXT: %[[GEP1:.*]] = getelementptr i8, ptr %[[GEP0]], i64 16 + ! UNROLL-NEXT: store <2 x i64> %[[VIND]], ptr %[[GEP0]] + ! UNROLL-NEXT: store <2 x i64> %[[VIND1]], ptr %[[GEP1]] + ! UNROLL-NEXT: %[[NIV:.*]] = add nuw i64 %[[IND]], 4 + ! UNROLL-NEXT: %[[NVIND:.*]] = add <2 x i64> %[[VIND]], splat (i64 4) + ! + ! CHECK-NEXT: %[[EXIT:.*]] = icmp eq i64 %[[NIV]], 1000 + ! CHECK-NEXT: br i1 %[[EXIT]], label %{{.*}}, label %[[BLK]] + do i=1,1000 + a(i) = i + end do +end subroutine diff --git a/flang/test/Lower/CUDA/cuda-allocatable.cuf b/flang/test/Lower/CUDA/cuda-allocatable.cuf index 8b287f859aa76..ed78bec1b8f08 100644 --- a/flang/test/Lower/CUDA/cuda-allocatable.cuf +++ b/flang/test/Lower/CUDA/cuda-allocatable.cuf @@ -80,7 +80,7 @@ end subroutine ! CHECK: fir.embox {{.*}} {allocator_idx = 1 : i32} ! CHECK: %[[BOX_DECL:.*]]:2 = hlfir.declare %[[BOX]] {data_attr = #cuf.cuda, fortran_attrs = #fir.var_attrs, uniq_name = "_QFsub3Ea"} : (!fir.ref>>>) -> (!fir.ref>>>, !fir.ref>>>) ! CHECK: %[[PLOG:.*]] = fir.alloca !fir.logical<4> {bindc_name = "plog", uniq_name = "_QFsub3Eplog"} -! CHECK: %[[PLOG_DECL:.*]]:2 = hlfir.declare %5 {uniq_name = "_QFsub3Eplog"} : (!fir.ref>) -> (!fir.ref>, !fir.ref>) +! CHECK: %[[PLOG_DECL:.*]]:2 = hlfir.declare %{{.*}} {uniq_name = "_QFsub3Eplog"} : (!fir.ref>) -> (!fir.ref>, !fir.ref>) ! CHECK-2: fir.call @_FortranAAllocatableSetBounds ! 
CHECK: %{{.*}} = cuf.allocate %[[BOX_DECL]]#1 : !fir.ref>>> pinned(%[[PLOG_DECL]]#1 : !fir.ref>) {data_attr = #cuf.cuda} -> i32 ! CHECK: fir.if %{{.*}} { @@ -97,7 +97,7 @@ end subroutine ! CHECK-LABEL: func.func @_QPsub4() ! CHECK: %[[BOX:.*]] = cuf.alloc !fir.box>> {bindc_name = "a", data_attr = #cuf.cuda, uniq_name = "_QFsub4Ea"} -> !fir.ref>>> ! CHECK: fir.embox {{.*}} {allocator_idx = 2 : i32} -! CHECK: %[[BOX_DECL:.*]]:2 = hlfir.declare %0 {data_attr = #cuf.cuda, fortran_attrs = #fir.var_attrs, uniq_name = "_QFsub4Ea"} : (!fir.ref>>>) -> (!fir.ref>>>, !fir.ref>>>) +! CHECK: %[[BOX_DECL:.*]]:2 = hlfir.declare %{{.*}} {data_attr = #cuf.cuda, fortran_attrs = #fir.var_attrs, uniq_name = "_QFsub4Ea"} : (!fir.ref>>>) -> (!fir.ref>>>, !fir.ref>>>) ! CHECK: %[[ISTREAM:.*]] = fir.alloca i32 {bindc_name = "istream", uniq_name = "_QFsub4Eistream"} ! CHECK: %[[ISTREAM_DECL:.*]]:2 = hlfir.declare %[[ISTREAM]] {uniq_name = "_QFsub4Eistream"} : (!fir.ref) -> (!fir.ref, !fir.ref) ! CHECK: fir.call @_FortranAAllocatableSetBounds @@ -173,7 +173,7 @@ end subroutine ! CHECK: fir.store %[[STAT]] to %[[ISTAT_DECL]]#1 : !fir.ref ! CHECK: %[[ERR_BOX:.*]] = fir.embox %[[ERR_DECL]]#1 : (!fir.ref>) -> !fir.box> -! CHECK: %[[STAT:.*]] = cuf.deallocate %[[BOX_DECL]]#1 : !fir.ref>>> errmsg(%15 : !fir.box>) {data_attr = #cuf.cuda, hasStat} -> i32 +! CHECK: %[[STAT:.*]] = cuf.deallocate %[[BOX_DECL]]#1 : !fir.ref>>> errmsg(%{{.*}} : !fir.box>) {data_attr = #cuf.cuda, hasStat} -> i32 ! CHECK: fir.store %[[STAT]] to %[[ISTAT_DECL]]#1 : !fir.ref ! CHECK: fir.if %{{.*}} { ! CHECK: %{{.*}} = cuf.deallocate %[[BOX_DECL]]#1 : !fir.ref>>> {data_attr = #cuf.cuda} -> i32 diff --git a/flang/test/Lower/CUDA/cuda-devptr.cuf b/flang/test/Lower/CUDA/cuda-devptr.cuf index 561d92ecd3e2e..d61d84d9bc750 100644 --- a/flang/test/Lower/CUDA/cuda-devptr.cuf +++ b/flang/test/Lower/CUDA/cuda-devptr.cuf @@ -1,4 +1,4 @@ -! RUN: bbc -emit-hlfir -fcuda %s -o - | FileCheck %s +! 
RUN: bbc -emit-fir -hlfir -fcuda %s -o - | FileCheck %s ! Test CUDA Fortran specific type @@ -37,12 +37,34 @@ subroutine sub2() end ! CHECK-LABEL: func.func @_QPsub2() -! CHECK: %[[X:.*]]:2 = hlfir.declare %{{.*}} {data_attr = #cuf.cuda, fortran_attrs = #fir.var_attrs, uniq_name = "_QFsub2Ex"} : (!fir.ref>>>) -> (!fir.ref>>>, !fir.ref>>>) +! CHECK: %[[X:.*]] = fir.declare %{{.*}} {data_attr = #cuf.cuda, fortran_attrs = #fir.var_attrs, uniq_name = "_QFsub2Ex"} : (!fir.ref>>>) -> !fir.ref>>> ! CHECK: %[[CPTR:.*]] = fir.field_index cptr, !fir.type<_QM__fortran_builtinsT__builtin_c_devptr{{[<]?}}{cptr:!fir.type<_QM__fortran_builtinsT__builtin_c_ptr{__address:i64}>}{{[>]?}}> -! CHECK: %[[CPTR_COORD:.*]] = fir.coordinate_of %{{.*}}#1, %[[CPTR]] : (!fir.ref}{{[>]?}}>>, !fir.field) -> !fir.ref> +! CHECK: %[[CPTR_COORD:.*]] = fir.coordinate_of %{{.*}}, %[[CPTR]] : (!fir.ref}{{[>]?}}>>, !fir.field) -> !fir.ref> ! CHECK: %[[ADDRESS:.*]] = fir.field_index __address, !fir.type<_QM__fortran_builtinsT__builtin_c_ptr{__address:i64}> ! CHECK: %[[ADDRESS_COORD:.*]] = fir.coordinate_of %[[CPTR_COORD]], %[[ADDRESS]] : (!fir.ref>, !fir.field) -> !fir.ref ! CHECK: %[[ADDRESS_LOADED:.*]] = fir.load %[[ADDRESS_COORD]] : !fir.ref ! CHECK: %[[ADDRESS_IDX:.*]] = fir.convert %[[ADDRESS_LOADED]] : (i64) -> !fir.ptr> ! CHECK: %[[EMBOX:.*]] = fir.embox %[[ADDRESS_IDX]](%{{.*}}) : (!fir.ptr>, !fir.shape<1>) -> !fir.box>> -! CHECK: fir.store %[[EMBOX]] to %[[X]]#1 : !fir.ref>>> +! CHECK: fir.store %[[EMBOX]] to %[[X]] : !fir.ref>>> + +attributes(global) subroutine assign_c_devptr(p, a) + use __fortran_builtins, only: c_devloc => __builtin_c_devloc + use __fortran_builtins, only: c_devptr => __builtin_c_devptr + type (c_devptr), device :: p + complex :: a(10) + p = c_devloc(a(1)) +end subroutine + +! CHECK-LABEL: func.func @_QPassign_c_devptr +! CHECK: %[[P:.*]] = fir.declare %arg0 dummy_scope %{{.*}} {data_attr = #cuf.cuda, uniq_name = "_QFassign_c_devptrEp"} +! 
CHECK: %[[C_DEVLOC_RES:.*]] = fir.declare %15 {uniq_name = ".tmp.intrinsic_result"} : (!fir.ref}>>) -> !fir.ref}>> +! CHECK: %[[CPTR_FIELD:.*]] = fir.field_index cptr, !fir.type<_QM__fortran_builtinsT__builtin_c_devptr{cptr:!fir.type<_QM__fortran_builtinsT__builtin_c_ptr{__address:i64}>}> +! CHECK: %[[RES_CPTR_COORD:.*]] = fir.coordinate_of %[[C_DEVLOC_RES]], %[[CPTR_FIELD]] : (!fir.ref}>>, !fir.field) -> !fir.ref> +! CHECK: %[[CPTR_FIELD:.*]] = fir.field_index cptr, !fir.type<_QM__fortran_builtinsT__builtin_c_devptr{cptr:!fir.type<_QM__fortran_builtinsT__builtin_c_ptr{__address:i64}>}> +! CHECK: %[[P_CPTR_COORD:.*]] = fir.coordinate_of %[[P]], %[[CPTR_FIELD]] : (!fir.ref}>>, !fir.field) -> !fir.ref> +! CHECK: %[[ADDRESS_FIELD:.*]] = fir.field_index __address, !fir.type<_QM__fortran_builtinsT__builtin_c_ptr{__address:i64}> +! CHECK: %[[RES_ADDR_COORD:.*]] = fir.coordinate_of %[[RES_CPTR_COORD]], %[[ADDRESS_FIELD]] : (!fir.ref>, !fir.field) -> !fir.ref +! CHECK: %[[ADDRESS_FIELD:.*]] = fir.field_index __address, !fir.type<_QM__fortran_builtinsT__builtin_c_ptr{__address:i64}> +! CHECK: %[[P_ADDR_COORD:.*]] = fir.coordinate_of %[[P_CPTR_COORD]], %[[ADDRESS_FIELD]] : (!fir.ref>, !fir.field) -> !fir.ref +! CHECK: %[[ADDR:.*]] = fir.load %[[RES_ADDR_COORD]] : !fir.ref +! CHECK: fir.store %[[ADDR]] to %[[P_ADDR_COORD]] : !fir.ref diff --git a/flang/test/Lower/HLFIR/array-ctor-as-runtime-temp.f90 b/flang/test/Lower/HLFIR/array-ctor-as-runtime-temp.f90 index 727eff7613e48..704245caf3d6d 100644 --- a/flang/test/Lower/HLFIR/array-ctor-as-runtime-temp.f90 +++ b/flang/test/Lower/HLFIR/array-ctor-as-runtime-temp.f90 @@ -21,7 +21,7 @@ subroutine test_loops() ! CHECK: %[[VAL_11:.*]] = arith.constant 7 : i32 ! CHECK: %[[VAL_12:.*]] = fir.convert %[[VAL_2]] : (!fir.ref>>>) -> !fir.ref> ! CHECK: %[[VAL_13:.*]] = fir.convert %[[VAL_10]] : (!fir.ref>) -> !fir.ref -! 
CHECK: %[[VAL_14:.*]] = fir.call @_FortranAInitArrayConstructorVector(%[[VAL_8]], %[[VAL_12]], %[[VAL_7]], %[[VAL_13]], %[[VAL_11]]) fastmath : (!fir.llvm_ptr, !fir.ref>, i1, !fir.ref, i32) -> none +! CHECK: fir.call @_FortranAInitArrayConstructorVector(%[[VAL_8]], %[[VAL_12]], %[[VAL_7]], %[[VAL_13]], %[[VAL_11]]) fastmath : (!fir.llvm_ptr, !fir.ref>, i1, !fir.ref, i32) -> () ! CHECK: %[[VAL_15:.*]] = arith.constant 1 : i64 ! CHECK: %[[VAL_16:.*]] = fir.convert %[[VAL_15]] : (i64) -> index ! CHECK: %[[VAL_17:.*]] = fir.call @_QMarrayctorPibar() fastmath : () -> i32 @@ -42,7 +42,7 @@ subroutine test_loops() ! CHECK: %[[VAL_32:.*]] = fir.convert %[[VAL_31]] : (i64) -> i32 ! CHECK: fir.store %[[VAL_32]] to %[[VAL_0]] : !fir.ref ! CHECK: %[[VAL_33:.*]] = fir.convert %[[VAL_0]] : (!fir.ref) -> !fir.llvm_ptr -! CHECK: %[[VAL_34:.*]] = fir.call @_FortranAPushArrayConstructorSimpleScalar(%[[VAL_8]], %[[VAL_33]]) fastmath : (!fir.llvm_ptr, !fir.llvm_ptr) -> none +! CHECK: fir.call @_FortranAPushArrayConstructorSimpleScalar(%[[VAL_8]], %[[VAL_33]]) fastmath : (!fir.llvm_ptr, !fir.llvm_ptr) -> () ! CHECK: } ! CHECK: } ! CHECK: %[[VAL_35:.*]] = arith.constant true @@ -85,11 +85,11 @@ subroutine test_arrays(a) ! CHECK: %[[VAL_26:.*]] = arith.constant false ! CHECK: %[[VAL_27:.*]] = fir.convert %[[VAL_1]] : (!fir.ref>) -> !fir.llvm_ptr ! CHECK: %[[VAL_31:.*]] = fir.convert %[[VAL_2]] : (!fir.ref>>>) -> !fir.ref> -! CHECK: %[[VAL_33:.*]] = fir.call @_FortranAInitArrayConstructorVector(%[[VAL_27]], %[[VAL_31]], %[[VAL_26]], %{{.*}}, %{{.*}}) {{.*}}: (!fir.llvm_ptr, !fir.ref>, i1, !fir.ref, i32) -> none +! CHECK: fir.call @_FortranAInitArrayConstructorVector(%[[VAL_27]], %[[VAL_31]], %[[VAL_26]], %{{.*}}, %{{.*}}) {{.*}}: (!fir.llvm_ptr, !fir.ref>, i1, !fir.ref, i32) -> () ! CHECK: %[[VAL_34:.*]] = fir.convert %[[VAL_3]]#1 : (!fir.box>) -> !fir.box -! 
CHECK: %[[VAL_35:.*]] = fir.call @_FortranAPushArrayConstructorValue(%[[VAL_27]], %[[VAL_34]]) {{.*}}: (!fir.llvm_ptr, !fir.box) -> none +! CHECK: fir.call @_FortranAPushArrayConstructorValue(%[[VAL_27]], %[[VAL_34]]) {{.*}}: (!fir.llvm_ptr, !fir.box) -> () ! CHECK: %[[VAL_36:.*]] = fir.convert %[[VAL_3]]#1 : (!fir.box>) -> !fir.box -! CHECK: %[[VAL_37:.*]] = fir.call @_FortranAPushArrayConstructorValue(%[[VAL_27]], %[[VAL_36]]) {{.*}}: (!fir.llvm_ptr, !fir.box) -> none +! CHECK: fir.call @_FortranAPushArrayConstructorValue(%[[VAL_27]], %[[VAL_36]]) {{.*}}: (!fir.llvm_ptr, !fir.box) -> () ! CHECK: %[[VAL_38:.*]] = arith.constant true ! CHECK: hlfir.as_expr %[[VAL_24]]#0 move %[[VAL_38]] : (!fir.box>, i1) -> !hlfir.expr @@ -106,13 +106,13 @@ subroutine test_arrays_unpredictable_size() ! CHECK: %[[VAL_9:.*]] = arith.constant false ! CHECK: %[[VAL_10:.*]] = fir.convert %[[VAL_3]] : (!fir.ref>) -> !fir.llvm_ptr ! CHECK: %[[VAL_14:.*]] = fir.convert %[[VAL_4]] : (!fir.ref>>>) -> !fir.ref> -! CHECK: %[[VAL_16:.*]] = fir.call @_FortranAInitArrayConstructorVector(%[[VAL_10]], %[[VAL_14]], %[[VAL_9]], %{{.*}}, %{{.*}}) {{.*}}: (!fir.llvm_ptr, !fir.ref>, i1, !fir.ref, i32) -> none +! CHECK: fir.call @_FortranAInitArrayConstructorVector(%[[VAL_10]], %[[VAL_14]], %[[VAL_9]], %{{.*}}, %{{.*}}) {{.*}}: (!fir.llvm_ptr, !fir.ref>, i1, !fir.ref, i32) -> () ! CHECK: fir.call @_QMarrayctorPrank1() {{.*}}: () -> !fir.box>> -! CHECK: %[[VAL_21:.*]] = fir.call @_FortranAPushArrayConstructorValue(%[[VAL_10]], %{{.*}}) {{.*}}: (!fir.llvm_ptr, !fir.box) -> none +! CHECK: fir.call @_FortranAPushArrayConstructorValue(%[[VAL_10]], %{{.*}}) {{.*}}: (!fir.llvm_ptr, !fir.box) -> () ! CHECK: fir.call @_QMarrayctorPrank3() {{.*}}: () -> !fir.box>> -! CHECK: %[[VAL_26:.*]] = fir.call @_FortranAPushArrayConstructorValue(%[[VAL_10]], %{{.*}}) {{.*}}: (!fir.llvm_ptr, !fir.box) -> none +! CHECK: fir.call @_FortranAPushArrayConstructorValue(%[[VAL_10]], %{{.*}}) {{.*}}: (!fir.llvm_ptr, !fir.box) -> () ! 
CHECK: fir.call @_QMarrayctorPrank1() {{.*}}: () -> !fir.box>> -! CHECK: %[[VAL_31:.*]] = fir.call @_FortranAPushArrayConstructorValue(%[[VAL_10]], %{{.*}}) {{.*}}: (!fir.llvm_ptr, !fir.box) -> none +! CHECK: fir.call @_FortranAPushArrayConstructorValue(%[[VAL_10]], %{{.*}}) {{.*}}: (!fir.llvm_ptr, !fir.box) -> () ! CHECK: %[[VAL_32:.*]] = arith.constant true ! CHECK: %[[VAL_33:.*]] = fir.load %[[VAL_4]] : !fir.ref>>> ! CHECK: hlfir.as_expr %[[VAL_33]] move %[[VAL_32]] : (!fir.box>>, i1) -> !hlfir.expr diff --git a/flang/test/Lower/HLFIR/array-ctor-character.f90 b/flang/test/Lower/HLFIR/array-ctor-character.f90 index 7cbad5218f588..5538c6763c310 100644 --- a/flang/test/Lower/HLFIR/array-ctor-character.f90 +++ b/flang/test/Lower/HLFIR/array-ctor-character.f90 @@ -52,11 +52,11 @@ subroutine test_dynamic_length() ! CHECK: %[[VAL_15:.*]] = arith.constant true ! CHECK: %[[VAL_16:.*]] = fir.convert %[[VAL_2]] : (!fir.ref>) -> !fir.llvm_ptr ! CHECK: %[[VAL_20:.*]] = fir.convert %[[VAL_3]] : (!fir.ref>>>>) -> !fir.ref> -! CHECK: %[[VAL_22:.*]] = fir.call @_FortranAInitArrayConstructorVector(%[[VAL_16]], %[[VAL_20]], %[[VAL_15]], %{{.*}}, %{{.*}}) {{.*}}: (!fir.llvm_ptr, !fir.ref>, i1, !fir.ref, i32) -> none +! CHECK: fir.call @_FortranAInitArrayConstructorVector(%[[VAL_16]], %[[VAL_20]], %[[VAL_15]], %{{.*}}, %{{.*}}) {{.*}}: (!fir.llvm_ptr, !fir.ref>, i1, !fir.ref, i32) -> () ! CHECK: fir.call @_QMchararrayctorPchar_pointer( -! CHECK: fir.call @_FortranAPushArrayConstructorValue(%[[VAL_16]], %{{.*}}) {{.*}}: (!fir.llvm_ptr, !fir.box) -> none +! CHECK: fir.call @_FortranAPushArrayConstructorValue(%[[VAL_16]], %{{.*}}) {{.*}}: (!fir.llvm_ptr, !fir.box) -> () ! CHECK: fir.call @_QMchararrayctorPchar_pointer( -! CHECK: fir.call @_FortranAPushArrayConstructorValue(%[[VAL_16]], %{{.*}}) {{.*}}: (!fir.llvm_ptr, !fir.box) -> none +! CHECK: fir.call @_FortranAPushArrayConstructorValue(%[[VAL_16]], %{{.*}}) {{.*}}: (!fir.llvm_ptr, !fir.box) -> () ! 
CHECK: %[[VAL_45:.*]] = arith.constant true ! CHECK: %[[VAL_46:.*]] = fir.load %[[VAL_3]] : !fir.ref>>>> ! CHECK: %[[VAL_47:.*]] = hlfir.as_expr %[[VAL_46]] move %[[VAL_45]] : (!fir.box>>>, i1) -> !hlfir.expr<2x!fir.char<1,?>> diff --git a/flang/test/Lower/HLFIR/array-ctor-derived.f90 b/flang/test/Lower/HLFIR/array-ctor-derived.f90 index 22f7fbd72cb59..08e9abd1ec243 100644 --- a/flang/test/Lower/HLFIR/array-ctor-derived.f90 +++ b/flang/test/Lower/HLFIR/array-ctor-derived.f90 @@ -28,11 +28,11 @@ subroutine test_simple(s1, s2) ! CHECK: %[[VAL_11:.*]] = arith.constant false ! CHECK: %[[VAL_12:.*]] = fir.convert %[[VAL_2]] : (!fir.ref>) -> !fir.llvm_ptr ! CHECK: %[[VAL_16:.*]] = fir.convert %[[VAL_3]] : (!fir.ref>>>>) -> !fir.ref> -! CHECK: %[[VAL_18:.*]] = fir.call @_FortranAInitArrayConstructorVector(%[[VAL_12]], %[[VAL_16]], %[[VAL_11]], %{{.*}}, %{{.*}}) {{.*}}: (!fir.llvm_ptr, !fir.ref>, i1, !fir.ref, i32) -> none +! CHECK: fir.call @_FortranAInitArrayConstructorVector(%[[VAL_12]], %[[VAL_16]], %[[VAL_11]], %{{.*}}, %{{.*}}) {{.*}}: (!fir.llvm_ptr, !fir.ref>, i1, !fir.ref, i32) -> () ! CHECK: %[[VAL_19:.*]] = fir.convert %[[VAL_4]]#1 : (!fir.ref>) -> !fir.llvm_ptr -! CHECK: %[[VAL_20:.*]] = fir.call @_FortranAPushArrayConstructorSimpleScalar(%[[VAL_12]], %[[VAL_19]]) {{.*}}: (!fir.llvm_ptr, !fir.llvm_ptr) -> none +! CHECK: fir.call @_FortranAPushArrayConstructorSimpleScalar(%[[VAL_12]], %[[VAL_19]]) {{.*}}: (!fir.llvm_ptr, !fir.llvm_ptr) -> () ! CHECK: %[[VAL_21:.*]] = fir.convert %[[VAL_5]]#1 : (!fir.ref>) -> !fir.llvm_ptr -! CHECK: %[[VAL_22:.*]] = fir.call @_FortranAPushArrayConstructorSimpleScalar(%[[VAL_12]], %[[VAL_21]]) {{.*}}: (!fir.llvm_ptr, !fir.llvm_ptr) -> none +! CHECK: fir.call @_FortranAPushArrayConstructorSimpleScalar(%[[VAL_12]], %[[VAL_21]]) {{.*}}: (!fir.llvm_ptr, !fir.llvm_ptr) -> () ! CHECK: %[[VAL_23:.*]] = arith.constant true ! 
CHECK: %[[VAL_24:.*]] = hlfir.as_expr %[[VAL_9]]#0 move %[[VAL_23]] : (!fir.heap>>, i1) -> !hlfir.expr<2x!fir.type<_QMtypesTsimple{i:i32,j:i32}>> ! CHECK: fir.call @_QMderivedarrayctorPtakes_simple @@ -56,13 +56,13 @@ subroutine test_with_polymorphic(s1, s2) ! CHECK: %[[VAL_11:.*]] = arith.constant false ! CHECK: %[[VAL_12:.*]] = fir.convert %[[VAL_2]] : (!fir.ref>) -> !fir.llvm_ptr ! CHECK: %[[VAL_16:.*]] = fir.convert %[[VAL_3]] : (!fir.ref>>>>) -> !fir.ref> -! CHECK: %[[VAL_18:.*]] = fir.call @_FortranAInitArrayConstructorVector(%[[VAL_12]], %[[VAL_16]], %[[VAL_11]], %{{.*}}, %{{.*}}) {{.*}}: (!fir.llvm_ptr, !fir.ref>, i1, !fir.ref, i32) -> none +! CHECK: fir.call @_FortranAInitArrayConstructorVector(%[[VAL_12]], %[[VAL_16]], %[[VAL_11]], %{{.*}}, %{{.*}}) {{.*}}: (!fir.llvm_ptr, !fir.ref>, i1, !fir.ref, i32) -> () ! CHECK: %[[VAL_19A:.*]] = fir.box_addr %[[VAL_4]]#1 : (!fir.class>) -> !fir.ref> ! CHECK: %[[VAL_19:.*]] = fir.convert %[[VAL_19A]] : (!fir.ref>) -> !fir.llvm_ptr -! CHECK: %[[VAL_20:.*]] = fir.call @_FortranAPushArrayConstructorSimpleScalar(%[[VAL_12]], %[[VAL_19]]) {{.*}}: (!fir.llvm_ptr, !fir.llvm_ptr) -> none +! CHECK: fir.call @_FortranAPushArrayConstructorSimpleScalar(%[[VAL_12]], %[[VAL_19]]) {{.*}}: (!fir.llvm_ptr, !fir.llvm_ptr) -> () ! CHECK: %[[VAL_21A:.*]] = fir.box_addr %[[VAL_5]]#1 : (!fir.class>) -> !fir.ref> ! CHECK: %[[VAL_21:.*]] = fir.convert %[[VAL_21A]] : (!fir.ref>) -> !fir.llvm_ptr -! CHECK: %[[VAL_22:.*]] = fir.call @_FortranAPushArrayConstructorSimpleScalar(%[[VAL_12]], %[[VAL_21]]) {{.*}}: (!fir.llvm_ptr, !fir.llvm_ptr) -> none +! CHECK: fir.call @_FortranAPushArrayConstructorSimpleScalar(%[[VAL_12]], %[[VAL_21]]) {{.*}}: (!fir.llvm_ptr, !fir.llvm_ptr) -> () ! CHECK: %[[VAL_23:.*]] = arith.constant true ! CHECK: %[[VAL_24:.*]] = hlfir.as_expr %[[VAL_9]]#0 move %[[VAL_23]] : (!fir.heap>>, i1) -> !hlfir.expr<2x!fir.type<_QMtypesTsimple{i:i32,j:i32}>> ! 
CHECK: fir.call @_QMderivedarrayctorPtakes_simple diff --git a/flang/test/Lower/HLFIR/assumed-rank-inquiries-3.f90 b/flang/test/Lower/HLFIR/assumed-rank-inquiries-3.f90 index af89cb833b337..aeb2c124d2628 100644 --- a/flang/test/Lower/HLFIR/assumed-rank-inquiries-3.f90 +++ b/flang/test/Lower/HLFIR/assumed-rank-inquiries-3.f90 @@ -10,7 +10,7 @@ subroutine test_shape(x) ! CHECK: %[[VAL_4:.*]] = arith.constant 4 : i32 ! CHECK: %[[VAL_7:.*]] = fir.convert %[[VAL_1]] : (!fir.ref>) -> !fir.llvm_ptr ! CHECK: %[[VAL_8:.*]] = fir.convert %[[VAL_3:.*]] : (!fir.box>) -> !fir.box -! CHECK: %[[VAL_10:.*]] = fir.call @_FortranAShape(%[[VAL_7]], %[[VAL_8]], %[[VAL_4]], %{{.*}}, %{{.*}}) +! CHECK: fir.call @_FortranAShape(%[[VAL_7]], %[[VAL_8]], %[[VAL_4]], %{{.*}}, %{{.*}}) ! CHECK: %[[VAL_11:.*]] = fir.convert %[[VAL_1]] : (!fir.ref>) -> !fir.ref> ! CHECK: %[[VAL_12:.*]] = fir.box_rank %[[VAL_3]] : (!fir.box>) -> index ! CHECK: %[[VAL_13:.*]] = fir.shape %[[VAL_12]] : (index) -> !fir.shape<1> @@ -33,7 +33,7 @@ subroutine test_shape_kind(x) ! CHECK: %[[VAL_4:.*]] = arith.constant 8 : i32 ! CHECK: %[[VAL_7:.*]] = fir.convert %[[VAL_1]] : (!fir.ref>) -> !fir.llvm_ptr ! CHECK: %[[VAL_8:.*]] = fir.convert %[[VAL_3:.*]] : (!fir.box>) -> !fir.box -! CHECK: %[[VAL_10:.*]] = fir.call @_FortranAShape(%[[VAL_7]], %[[VAL_8]], %[[VAL_4]], %{{.*}}, %{{.*}}) +! CHECK: fir.call @_FortranAShape(%[[VAL_7]], %[[VAL_8]], %[[VAL_4]], %{{.*}}, %{{.*}}) ! CHECK: %[[VAL_11:.*]] = fir.convert %[[VAL_1]] : (!fir.ref>) -> !fir.ref> ! CHECK: %[[VAL_12:.*]] = fir.box_rank %[[VAL_3]] : (!fir.box>) -> index ! CHECK: %[[VAL_13:.*]] = fir.shape %[[VAL_12]] : (index) -> !fir.shape<1> @@ -49,7 +49,7 @@ subroutine test_shape_2(x) ! CHECK: %[[VAL_5:.*]] = arith.constant 4 : i32 ! CHECK: %[[VAL_8:.*]] = fir.convert %[[VAL_1]] : (!fir.ref>) -> !fir.llvm_ptr ! CHECK: %[[VAL_9:.*]] = fir.convert %[[VAL_4]] : (!fir.box>>) -> !fir.box -! 
CHECK: %[[VAL_11:.*]] = fir.call @_FortranAShape(%[[VAL_8]], %[[VAL_9]], %[[VAL_5]], %{{.*}}, %{{.*}}) +! CHECK: fir.call @_FortranAShape(%[[VAL_8]], %[[VAL_9]], %[[VAL_5]], %{{.*}}, %{{.*}}) ! CHECK: %[[VAL_12:.*]] = fir.convert %[[VAL_1]] : (!fir.ref>) -> !fir.ref> ! CHECK: %[[VAL_13:.*]] = fir.box_rank %[[VAL_4]] : (!fir.box>>) -> index ! CHECK: %[[VAL_14:.*]] = fir.shape %[[VAL_13]] : (index) -> !fir.shape<1> @@ -65,7 +65,7 @@ subroutine test_lbound(x) ! CHECK: %[[VAL_4:.*]] = arith.constant 4 : i32 ! CHECK: %[[VAL_7:.*]] = fir.convert %[[VAL_1]] : (!fir.ref>) -> !fir.llvm_ptr ! CHECK: %[[VAL_8:.*]] = fir.convert %[[VAL_3:.*]] : (!fir.box>) -> !fir.box -! CHECK: %[[VAL_10:.*]] = fir.call @_FortranALbound(%[[VAL_7]], %[[VAL_8]], %[[VAL_4]], %{{.*}}, %{{.*}}) +! CHECK: fir.call @_FortranALbound(%[[VAL_7]], %[[VAL_8]], %[[VAL_4]], %{{.*}}, %{{.*}}) ! CHECK: %[[VAL_11:.*]] = fir.convert %[[VAL_1]] : (!fir.ref>) -> !fir.ref> ! CHECK: %[[VAL_12:.*]] = fir.box_rank %[[VAL_3]] : (!fir.box>) -> index ! CHECK: %[[VAL_13:.*]] = fir.shape %[[VAL_12]] : (index) -> !fir.shape<1> @@ -88,7 +88,7 @@ subroutine test_lbound_kind(x) ! CHECK: %[[VAL_4:.*]] = arith.constant 8 : i32 ! CHECK: %[[VAL_7:.*]] = fir.convert %[[VAL_1]] : (!fir.ref>) -> !fir.llvm_ptr ! CHECK: %[[VAL_8:.*]] = fir.convert %[[VAL_3:.*]] : (!fir.box>) -> !fir.box -! CHECK: %[[VAL_10:.*]] = fir.call @_FortranALbound(%[[VAL_7]], %[[VAL_8]], %[[VAL_4]], %{{.*}}, %{{.*}}) +! CHECK: fir.call @_FortranALbound(%[[VAL_7]], %[[VAL_8]], %[[VAL_4]], %{{.*}}, %{{.*}}) ! CHECK: %[[VAL_11:.*]] = fir.convert %[[VAL_1]] : (!fir.ref>) -> !fir.ref> ! CHECK: %[[VAL_12:.*]] = fir.box_rank %[[VAL_3]] : (!fir.box>) -> index ! CHECK: %[[VAL_13:.*]] = fir.shape %[[VAL_12]] : (index) -> !fir.shape<1> @@ -104,7 +104,7 @@ subroutine test_lbound_2(x) ! CHECK: %[[VAL_5:.*]] = arith.constant 4 : i32 ! CHECK: %[[VAL_8:.*]] = fir.convert %[[VAL_1]] : (!fir.ref>) -> !fir.llvm_ptr ! 
CHECK: %[[VAL_9:.*]] = fir.convert %[[VAL_4]] : (!fir.box>>) -> !fir.box -! CHECK: %[[VAL_11:.*]] = fir.call @_FortranALbound(%[[VAL_8]], %[[VAL_9]], %[[VAL_5]], %{{.*}}, %{{.*}}) +! CHECK: fir.call @_FortranALbound(%[[VAL_8]], %[[VAL_9]], %[[VAL_5]], %{{.*}}, %{{.*}}) ! CHECK: %[[VAL_12:.*]] = fir.convert %[[VAL_1]] : (!fir.ref>) -> !fir.ref> ! CHECK: %[[VAL_13:.*]] = fir.box_rank %[[VAL_4]] : (!fir.box>>) -> index ! CHECK: %[[VAL_14:.*]] = fir.shape %[[VAL_13]] : (index) -> !fir.shape<1> @@ -119,7 +119,7 @@ subroutine test_ubound(x) ! CHECK: %[[VAL_4:.*]] = arith.constant 4 : i32 ! CHECK: %[[VAL_7:.*]] = fir.convert %[[VAL_1]] : (!fir.ref>) -> !fir.llvm_ptr ! CHECK: %[[VAL_8:.*]] = fir.convert %[[VAL_3:.*]] : (!fir.box>) -> !fir.box -! CHECK: %[[VAL_10:.*]] = fir.call @_FortranAUbound(%[[VAL_7]], %[[VAL_8]], %[[VAL_4]], %{{.*}}, %{{.*}}) +! CHECK: fir.call @_FortranAUbound(%[[VAL_7]], %[[VAL_8]], %[[VAL_4]], %{{.*}}, %{{.*}}) ! CHECK: %[[VAL_11:.*]] = fir.convert %[[VAL_1]] : (!fir.ref>) -> !fir.ref> ! CHECK: %[[VAL_12:.*]] = fir.box_rank %[[VAL_3]] : (!fir.box>) -> index ! CHECK: %[[VAL_13:.*]] = fir.shape %[[VAL_12]] : (index) -> !fir.shape<1> @@ -142,7 +142,7 @@ subroutine test_ubound_kind(x) ! CHECK: %[[VAL_4:.*]] = arith.constant 8 : i32 ! CHECK: %[[VAL_7:.*]] = fir.convert %[[VAL_1]] : (!fir.ref>) -> !fir.llvm_ptr ! CHECK: %[[VAL_8:.*]] = fir.convert %[[VAL_3:.*]] : (!fir.box>) -> !fir.box -! CHECK: %[[VAL_10:.*]] = fir.call @_FortranAUbound(%[[VAL_7]], %[[VAL_8]], %[[VAL_4]], %{{.*}}, %{{.*}}) +! CHECK: fir.call @_FortranAUbound(%[[VAL_7]], %[[VAL_8]], %[[VAL_4]], %{{.*}}, %{{.*}}) ! CHECK: %[[VAL_11:.*]] = fir.convert %[[VAL_1]] : (!fir.ref>) -> !fir.ref> ! CHECK: %[[VAL_12:.*]] = fir.box_rank %[[VAL_3]] : (!fir.box>) -> index ! CHECK: %[[VAL_13:.*]] = fir.shape %[[VAL_12]] : (index) -> !fir.shape<1> @@ -158,7 +158,7 @@ subroutine test_ubound_2(x) ! CHECK: %[[VAL_5:.*]] = arith.constant 4 : i32 ! 
CHECK: %[[VAL_8:.*]] = fir.convert %[[VAL_1]] : (!fir.ref>) -> !fir.llvm_ptr ! CHECK: %[[VAL_9:.*]] = fir.convert %[[VAL_4]] : (!fir.box>>) -> !fir.box -! CHECK: %[[VAL_11:.*]] = fir.call @_FortranAUbound(%[[VAL_8]], %[[VAL_9]], %[[VAL_5]], %{{.*}}, %{{.*}}) +! CHECK: fir.call @_FortranAUbound(%[[VAL_8]], %[[VAL_9]], %[[VAL_5]], %{{.*}}, %{{.*}}) ! CHECK: %[[VAL_12:.*]] = fir.convert %[[VAL_1]] : (!fir.ref>) -> !fir.ref> ! CHECK: %[[VAL_13:.*]] = fir.box_rank %[[VAL_4]] : (!fir.box>>) -> index ! CHECK: %[[VAL_14:.*]] = fir.shape %[[VAL_13]] : (index) -> !fir.shape<1> @@ -171,7 +171,7 @@ subroutine test_lbound_dim(x) ! CHECK-LABEL: func.func @_QPtest_lbound_dim( ! CHECK: %[[VAL_3:.*]] = arith.constant 2 : i32 ! CHECK: %[[VAL_6:.*]] = fir.convert %[[VAL_2:.*]]#0 : (!fir.box>) -> !fir.box -! CHECK: %[[VAL_8:.*]] = fir.call @_FortranALboundDim(%[[VAL_6]], %[[VAL_3]], +! CHECK: fir.call @_FortranALboundDim(%[[VAL_6]], %[[VAL_3]], ! CHECK: %[[VAL_9:.*]] = fir.convert %[[VAL_8]] : (i64) -> i32 ! CHECK: %[[VAL_10:.*]]:3 = hlfir.associate %[[VAL_9]] @@ -186,7 +186,7 @@ subroutine test_ubound_dim(x) ! CHECK: %[[VAL_8:.*]] = fir.call @_FortranASizeDim(%[[VAL_6]], %[[VAL_3]], ! CHECK: %[[VAL_9:.*]] = fir.convert %[[VAL_8]] : (i64) -> i32 ! CHECK: %[[VAL_12:.*]] = fir.convert %[[VAL_2]]#0 : (!fir.box>) -> !fir.box -! CHECK: %[[VAL_14:.*]] = fir.call @_FortranALboundDim(%[[VAL_12]], %[[VAL_3]], +! CHECK: fir.call @_FortranALboundDim(%[[VAL_12]], %[[VAL_3]], ! CHECK: %[[VAL_15:.*]] = fir.convert %[[VAL_14]] : (i64) -> i32 ! CHECK: %[[VAL_16:.*]] = arith.constant 1 : i32 ! CHECK: %[[VAL_17:.*]] = arith.subi %[[VAL_15]], %[[VAL_16]] : i32 diff --git a/flang/test/Lower/HLFIR/assumed-rank-inquiries.f90 b/flang/test/Lower/HLFIR/assumed-rank-inquiries.f90 index a94ae7da36593..6a44cbd86e80d 100644 --- a/flang/test/Lower/HLFIR/assumed-rank-inquiries.f90 +++ b/flang/test/Lower/HLFIR/assumed-rank-inquiries.f90 @@ -210,7 +210,7 @@ subroutine c_loc_2(x) ! 
CHECK: %[[VAL_6:.*]] = arith.constant 0 : i64 ! CHECK: %[[VAL_7:.*]] = arith.cmpi eq, %[[VAL_5]], %[[VAL_6]] : i64 ! CHECK: fir.if %[[VAL_7]] { -! CHECK: %[[VAL_13:.*]] = fir.call @_FortranAReportFatalUserError +! CHECK: fir.call @_FortranAReportFatalUserError ! CHECK: } ! CHECK: %[[VAL_14:.*]] = fir.load %[[VAL_2]]#0 : !fir.ref>>> ! CHECK: %[[VAL_15:.*]] = fir.box_elesize %[[VAL_14]] : (!fir.class>>) -> i32 diff --git a/flang/test/Lower/HLFIR/cray-pointers.f90 b/flang/test/Lower/HLFIR/cray-pointers.f90 index bb49977dd2227..29b4f7b52ac09 100644 --- a/flang/test/Lower/HLFIR/cray-pointers.f90 +++ b/flang/test/Lower/HLFIR/cray-pointers.f90 @@ -21,7 +21,7 @@ end subroutine test1 ! CHECK: %[[VAL_16:.*]] = fir.load %[[VAL_15]] : !fir.ref> ! CHECK: %[[VAL_17:.*]] = fir.convert %[[VAL_12]]#0 : (!fir.ref>>>) -> !fir.ref> ! CHECK: %[[VAL_18:.*]] = fir.convert %[[VAL_16]] : (!fir.ptr) -> !fir.llvm_ptr -! CHECK: %[[VAL_19:.*]] = fir.call @_FortranAPointerAssociateScalar(%[[VAL_17]], %[[VAL_18]]) fastmath : (!fir.ref>, !fir.llvm_ptr) -> none +! CHECK: fir.call @_FortranAPointerAssociateScalar(%[[VAL_17]], %[[VAL_18]]) fastmath : (!fir.ref>, !fir.llvm_ptr) -> () ! CHECK: %[[VAL_20:.*]] = fir.load %[[VAL_12]]#0 : !fir.ref>>> ! CHECK: %[[VAL_21:.*]] = arith.constant 7 : index ! CHECK: %[[VAL_22:.*]] = hlfir.designate %[[VAL_20]] (%[[VAL_21]]) : (!fir.box>>, index) -> !fir.ref @@ -47,7 +47,7 @@ end subroutine test2 ! CHECK: %[[VAL_28:.*]] = fir.load %[[VAL_27]] : !fir.ref> ! CHECK: %[[VAL_29:.*]] = fir.convert %[[VAL_24]]#0 : (!fir.ref>>>) -> !fir.ref> ! CHECK: %[[VAL_30:.*]] = fir.convert %[[VAL_28]] : (!fir.ptr) -> !fir.llvm_ptr -! CHECK: %[[VAL_31:.*]] = fir.call @_FortranAPointerAssociateScalar(%[[VAL_29]], %[[VAL_30]]) fastmath : (!fir.ref>, !fir.llvm_ptr) -> none +! CHECK: fir.call @_FortranAPointerAssociateScalar(%[[VAL_29]], %[[VAL_30]]) fastmath : (!fir.ref>, !fir.llvm_ptr) -> () ! CHECK: %[[VAL_32:.*]] = fir.load %[[VAL_24]]#0 : !fir.ref>>> ! 
CHECK: %[[VAL_33:.*]] = arith.constant 7 : index ! CHECK: %[[VAL_34:.*]] = hlfir.designate %[[VAL_32]] (%[[VAL_33]]) : (!fir.box>>, index) -> !fir.ref @@ -75,7 +75,7 @@ end subroutine test3 ! CHECK: %[[VAL_33:.*]] = fir.load %[[VAL_32]] : !fir.ref> ! CHECK: %[[VAL_34:.*]] = fir.convert %[[VAL_29]]#0 : (!fir.ref>>>>) -> !fir.ref> ! CHECK: %[[VAL_35:.*]] = fir.convert %[[VAL_33]] : (!fir.ptr) -> !fir.llvm_ptr -! CHECK: %[[VAL_36:.*]] = fir.call @_FortranAPointerAssociateScalar(%[[VAL_34]], %[[VAL_35]]) fastmath : (!fir.ref>, !fir.llvm_ptr) -> none +! CHECK: fir.call @_FortranAPointerAssociateScalar(%[[VAL_34]], %[[VAL_35]]) fastmath : (!fir.ref>, !fir.llvm_ptr) -> () ! CHECK: %[[VAL_37:.*]] = fir.load %[[VAL_29]]#0 : !fir.ref>>>> ! CHECK: %[[VAL_38:.*]] = arith.constant 7 : index ! CHECK: %[[VAL_39:.*]] = hlfir.designate %[[VAL_37]] (%[[VAL_38]]) typeparams %[[VAL_8]] : (!fir.box>>>, index, index) -> !fir.ref> @@ -103,7 +103,7 @@ end subroutine test4 ! CHECK: %[[VAL_23:.*]] = fir.load %[[VAL_22]] : !fir.ref> ! CHECK: %[[VAL_24:.*]] = fir.convert %[[VAL_13]]#0 : (!fir.ref>>>) -> !fir.ref> ! CHECK: %[[VAL_25:.*]] = fir.convert %[[VAL_23]] : (!fir.ptr) -> !fir.llvm_ptr -! CHECK: %[[VAL_26:.*]] = fir.call @_FortranAPointerAssociateScalar(%[[VAL_24]], %[[VAL_25]]) fastmath : (!fir.ref>, !fir.llvm_ptr) -> none +! CHECK: fir.call @_FortranAPointerAssociateScalar(%[[VAL_24]], %[[VAL_25]]) fastmath : (!fir.ref>, !fir.llvm_ptr) -> () ! CHECK: %[[VAL_27:.*]] = fir.load %[[VAL_13]]#0 : !fir.ref>>> ! CHECK: %[[VAL_28:.*]] = fir.box_addr %[[VAL_27]] : (!fir.box>>) -> !fir.ptr> ! CHECK: %[[VAL_29:.*]] = fir.emboxchar %[[VAL_28]], %[[VAL_8]] : (!fir.ptr>, i32) -> !fir.boxchar<1> @@ -134,7 +134,7 @@ end subroutine test5 ! CHECK: %[[VAL_17:.*]] = fir.load %[[VAL_16]] : !fir.ref> ! CHECK: %[[VAL_18:.*]] = fir.convert %[[VAL_13]]#0 : (!fir.ref>>>>) -> !fir.ref> ! CHECK: %[[VAL_19:.*]] = fir.convert %[[VAL_17]] : (!fir.ptr) -> !fir.llvm_ptr -! 
CHECK: %[[VAL_20:.*]] = fir.call @_FortranAPointerAssociateScalar(%[[VAL_18]], %[[VAL_19]]) fastmath : (!fir.ref>, !fir.llvm_ptr) -> none +! CHECK: fir.call @_FortranAPointerAssociateScalar(%[[VAL_18]], %[[VAL_19]]) fastmath : (!fir.ref>, !fir.llvm_ptr) -> () ! CHECK: %[[VAL_21:.*]] = fir.load %[[VAL_13]]#0 : !fir.ref>>>> ! CHECK: %[[VAL_22:.*]] = arith.constant 7 : index ! CHECK: %[[VAL_23:.*]] = hlfir.designate %[[VAL_21]] (%[[VAL_22]]) : (!fir.box>>>, index) -> !fir.ref> @@ -178,7 +178,7 @@ end subroutine test6 ! CHECK: %[[VAL_50:.*]] = fir.load %[[VAL_49]] : !fir.ref> ! CHECK: %[[VAL_51:.*]] = fir.convert %[[VAL_20]]#0 : (!fir.ref>>>>) -> !fir.ref> ! CHECK: %[[VAL_52:.*]] = fir.convert %[[VAL_50]] : (!fir.ptr) -> !fir.llvm_ptr -! CHECK: %[[VAL_53:.*]] = fir.call @_FortranAPointerAssociateScalar(%[[VAL_51]], %[[VAL_52]]) fastmath : (!fir.ref>, !fir.llvm_ptr) -> none +! CHECK: fir.call @_FortranAPointerAssociateScalar(%[[VAL_51]], %[[VAL_52]]) fastmath : (!fir.ref>, !fir.llvm_ptr) -> () ! CHECK: %[[VAL_54:.*]] = fir.load %[[VAL_20]]#0 : !fir.ref>>>> ! CHECK: %[[VAL_55:.*]] = arith.constant 9 : index ! CHECK: %[[VAL_56:.*]] = hlfir.designate %[[VAL_54]] (%[[VAL_55]]) typeparams %[[VAL_11]] : (!fir.box>>>, index, i32) -> !fir.boxchar<1> @@ -187,7 +187,7 @@ end subroutine test6 ! CHECK: %[[VAL_58:.*]] = fir.load %[[VAL_57]] : !fir.ref> ! CHECK: %[[VAL_59:.*]] = fir.convert %[[VAL_46]]#0 : (!fir.ref>>>) -> !fir.ref> ! CHECK: %[[VAL_60:.*]] = fir.convert %[[VAL_58]] : (!fir.ptr) -> !fir.llvm_ptr -! CHECK: %[[VAL_61:.*]] = fir.call @_FortranAPointerAssociateScalar(%[[VAL_59]], %[[VAL_60]]) fastmath : (!fir.ref>, !fir.llvm_ptr) -> none +! CHECK: fir.call @_FortranAPointerAssociateScalar(%[[VAL_59]], %[[VAL_60]]) fastmath : (!fir.ref>, !fir.llvm_ptr) -> () ! CHECK: %[[VAL_62:.*]] = fir.load %[[VAL_46]]#0 : !fir.ref>>> ! CHECK: %[[VAL_63:.*]] = arith.constant 5 : index ! 
CHECK: %[[VAL_64:.*]] = hlfir.designate %[[VAL_62]] (%[[VAL_63]]) : (!fir.box>>, index) -> !fir.ref @@ -216,7 +216,7 @@ end subroutine test7 ! CHECK: %[[VAL_15:.*]] = fir.load %[[VAL_14]] : !fir.ref> ! CHECK: %[[VAL_16:.*]] = fir.convert %[[VAL_9]]#0 : (!fir.ref>>>) -> !fir.ref> ! CHECK: %[[VAL_17:.*]] = fir.convert %[[VAL_15]] : (!fir.ptr) -> !fir.llvm_ptr -! CHECK: %[[VAL_18:.*]] = fir.call @_FortranAPointerAssociateScalar(%[[VAL_16]], %[[VAL_17]]) fastmath : (!fir.ref>, !fir.llvm_ptr) -> none +! CHECK: fir.call @_FortranAPointerAssociateScalar(%[[VAL_16]], %[[VAL_17]]) fastmath : (!fir.ref>, !fir.llvm_ptr) -> () ! CHECK: %[[VAL_19:.*]] = fir.load %[[VAL_9]]#0 : !fir.ref>>> subroutine test8() @@ -238,7 +238,7 @@ end subroutine test8 ! CHECK: %[[VAL_11:.*]] = fir.load %[[VAL_10]] : !fir.ref> ! CHECK: %[[VAL_12:.*]] = fir.convert %[[VAL_5]]#0 : (!fir.ref>>>) -> !fir.ref> ! CHECK: %[[VAL_13:.*]] = fir.convert %[[VAL_11]] : (!fir.ptr) -> !fir.llvm_ptr -! CHECK: %[[VAL_14:.*]] = fir.call @_FortranAPointerAssociateScalar(%[[VAL_12]], %[[VAL_13]]) fastmath : (!fir.ref>, !fir.llvm_ptr) -> none +! CHECK: fir.call @_FortranAPointerAssociateScalar(%[[VAL_12]], %[[VAL_13]]) fastmath : (!fir.ref>, !fir.llvm_ptr) -> () ! CHECK: %[[VAL_15:.*]] = fir.load %[[VAL_5]]#0 : !fir.ref>>> ! CHECK: %[[VAL_16:.*]] = fir.box_addr %[[VAL_15]] : (!fir.box>>) -> !fir.ptr> ! CHECK: %[[VAL_17:.*]] = fir.convert %[[VAL_16]] : (!fir.ptr>) -> !fir.ref> @@ -268,7 +268,7 @@ end subroutine test9 ! CHECK: %[[VAL_11:.*]] = fir.load %[[VAL_10]] : !fir.ref> ! CHECK: %[[VAL_12:.*]] = fir.convert %[[VAL_5]]#0 : (!fir.ref>>>) -> !fir.ref> ! CHECK: %[[VAL_13:.*]] = fir.convert %[[VAL_11]] : (!fir.ptr) -> !fir.llvm_ptr -! CHECK: %[[VAL_14:.*]] = fir.call @_FortranAPointerAssociateScalar(%[[VAL_12]], %[[VAL_13]]) fastmath : (!fir.ref>, !fir.llvm_ptr) -> none +! CHECK: fir.call @_FortranAPointerAssociateScalar(%[[VAL_12]], %[[VAL_13]]) fastmath : (!fir.ref>, !fir.llvm_ptr) -> () ! 
CHECK: %[[VAL_15:.*]] = fir.load %[[VAL_5]]#0 : !fir.ref>>> ! CHECK: %[[VAL_16:.*]] = hlfir.as_expr %[[VAL_15]] : (!fir.box>>) -> !hlfir.expr ! CHECK: %[[VAL_17:.*]] = arith.constant 0 : index @@ -297,7 +297,7 @@ end subroutine test10 ! CHECK: %[[VAL_9:.*]] = fir.load %[[VAL_8]] : !fir.ref> ! CHECK: %[[VAL_10:.*]] = fir.convert %[[VAL_3]]#0 : (!fir.ref>>) -> !fir.ref> ! CHECK: %[[VAL_11:.*]] = fir.convert %[[VAL_9]] : (!fir.ptr) -> !fir.llvm_ptr -! CHECK: %[[VAL_12:.*]] = fir.call @_FortranAPointerAssociateScalar(%[[VAL_10]], %[[VAL_11]]) fastmath : (!fir.ref>, !fir.llvm_ptr) -> none +! CHECK: fir.call @_FortranAPointerAssociateScalar(%[[VAL_10]], %[[VAL_11]]) fastmath : (!fir.ref>, !fir.llvm_ptr) -> () ! CHECK: %[[VAL_13:.*]] = fir.load %[[VAL_3]]#0 : !fir.ref>> ! CHECK: %[[VAL_14:.*]] = fir.box_addr %[[VAL_13]] : (!fir.box>) -> !fir.ptr ! CHECK: %[[VAL_15:.*]] = fir.convert %[[VAL_14]] : (!fir.ptr) -> !fir.ref @@ -325,7 +325,7 @@ end subroutine test11 ! CHECK: %[[VAL_9:.*]] = fir.load %[[VAL_8]] : !fir.ref> ! CHECK: %[[VAL_10:.*]] = fir.convert %[[VAL_3]]#0 : (!fir.ref>>) -> !fir.ref> ! CHECK: %[[VAL_11:.*]] = fir.convert %[[VAL_9]] : (!fir.ptr) -> !fir.llvm_ptr -! CHECK: %[[VAL_12:.*]] = fir.call @_FortranAPointerAssociateScalar(%[[VAL_10]], %[[VAL_11]]) fastmath : (!fir.ref>, !fir.llvm_ptr) -> none +! CHECK: fir.call @_FortranAPointerAssociateScalar(%[[VAL_10]], %[[VAL_11]]) fastmath : (!fir.ref>, !fir.llvm_ptr) -> () ! CHECK: %[[VAL_13:.*]] = fir.load %[[VAL_3]]#0 : !fir.ref>> ! CHECK: %[[VAL_14:.*]] = fir.box_addr %[[VAL_13]] : (!fir.box>) -> !fir.ptr ! CHECK: %[[VAL_15:.*]] = fir.load %[[VAL_14]] : !fir.ptr @@ -355,7 +355,7 @@ subroutine test_hidden_pointer ! CHECK: %[[VAL_7:.*]] = fir.load %[[VAL_6]] : !fir.ref> ! CHECK: %[[VAL_8:.*]] = fir.convert %[[VAL_3]]#0 : (!fir.ref>>) -> !fir.ref> ! CHECK: %[[VAL_9:.*]] = fir.convert %[[VAL_7]] : (!fir.ptr) -> !fir.llvm_ptr -! 
CHECK: %[[VAL_10:.*]] = fir.call @_FortranAPointerAssociateScalar(%[[VAL_8]], %[[VAL_9]]) fastmath : (!fir.ref>, !fir.llvm_ptr) -> none +! CHECK: fir.call @_FortranAPointerAssociateScalar(%[[VAL_8]], %[[VAL_9]]) fastmath : (!fir.ref>, !fir.llvm_ptr) -> () ! CHECK: %[[VAL_11:.*]] = fir.load %[[VAL_3]]#0 : !fir.ref>> ! CHECK: %[[VAL_12:.*]] = fir.box_addr %[[VAL_11]] : (!fir.box>) -> !fir.ptr ! CHECK: %[[VAL_13:.*]] = fir.convert %[[VAL_12]] : (!fir.ptr) -> !fir.ref @@ -417,7 +417,7 @@ subroutine internal() ! CHECK: %[[VAL_12:.*]] = fir.load %[[VAL_11]] : !fir.ref> ! CHECK: %[[VAL_13:.*]] = fir.convert %[[VAL_6]]#0 : (!fir.ref>>>) -> !fir.ref> ! CHECK: %[[VAL_14:.*]] = fir.convert %[[VAL_12]] : (!fir.ptr) -> !fir.llvm_ptr -! CHECK: %[[VAL_15:.*]] = fir.call @_FortranAPointerAssociateScalar(%[[VAL_13]], %[[VAL_14]]) fastmath : (!fir.ref>, !fir.llvm_ptr) -> none +! CHECK: fir.call @_FortranAPointerAssociateScalar(%[[VAL_13]], %[[VAL_14]]) fastmath : (!fir.ref>, !fir.llvm_ptr) -> () ! CHECK: %[[VAL_16:.*]] = fir.load %[[VAL_6]]#0 : !fir.ref>>> ! CHECK: %[[VAL_17:.*]] = fir.box_addr %[[VAL_16]] : (!fir.box>>) -> !fir.ptr> ! CHECK: %[[VAL_18:.*]] = fir.emboxchar %[[VAL_17]], %[[VAL_5]] : (!fir.ptr>, index) -> !fir.boxchar<1> diff --git a/flang/test/Lower/HLFIR/function-return-as-expr.f90 b/flang/test/Lower/HLFIR/function-return-as-expr.f90 index 95a0c090ef043..41c489decf158 100644 --- a/flang/test/Lower/HLFIR/function-return-as-expr.f90 +++ b/flang/test/Lower/HLFIR/function-return-as-expr.f90 @@ -70,7 +70,7 @@ end subroutine test4 ! CHECK: %[[VAL_7:.*]]:2 = hlfir.declare %[[VAL_6]] {uniq_name = ".tmp.func_result"} : (!fir.class>) -> (!fir.class>, !fir.class>) ! CHECK: hlfir.assign %[[VAL_7]]#0 to %{{.*}}#0 realloc : !fir.class>, !fir.ref>> ! CHECK: %[[VAL_10:.*]] = fir.convert %[[VAL_0]] : (!fir.ref>>) -> !fir.box -! CHECK: fir.call @_FortranADestroy(%[[VAL_10]]) fastmath : (!fir.box) -> none +! 
CHECK: fir.call @_FortranADestroy(%[[VAL_10]]) fastmath : (!fir.box) -> () subroutine test4b class(*), allocatable :: p(:, :) @@ -85,7 +85,7 @@ end subroutine test4b ! CHECK: %[[VAL_7:.*]]:2 = hlfir.declare %[[VAL_6]] {uniq_name = ".tmp.func_result"} : (!fir.class>>) -> (!fir.class>>, !fir.class>>) ! CHECK: hlfir.assign %[[VAL_7]]#0 to %{{.*}}#0 realloc : !fir.class>>, !fir.ref>>> ! CHECK: %[[VAL_10:.*]] = fir.convert %[[VAL_0]] : (!fir.ref>>>) -> !fir.box -! CHECK: fir.call @_FortranADestroy(%[[VAL_10]]) fastmath : (!fir.box) -> none +! CHECK: fir.call @_FortranADestroy(%[[VAL_10]]) fastmath : (!fir.box) -> () subroutine test5 use types diff --git a/flang/test/Lower/HLFIR/ignore-rank-unlimited-polymorphic.f90 b/flang/test/Lower/HLFIR/ignore-rank-unlimited-polymorphic.f90 index c2118432a9813..bb3ce49059027 100644 --- a/flang/test/Lower/HLFIR/ignore-rank-unlimited-polymorphic.f90 +++ b/flang/test/Lower/HLFIR/ignore-rank-unlimited-polymorphic.f90 @@ -100,7 +100,7 @@ end subroutine test_derived_explicit_shape_array ! CHECK: %[[VAL_4:.*]] = fir.shape %[[VAL_0]] : (index) -> !fir.shape<1> ! CHECK: %[[VAL_5:.*]] = fir.embox %[[VAL_3]]#1(%[[VAL_4]]) : (!fir.ref>}>>>, !fir.shape<1>) -> !fir.box>}>>> ! CHECK: %[[VAL_8:.*]] = fir.convert %[[VAL_5]] : (!fir.box>}>>>) -> !fir.box -! CHECK: %[[VAL_10:.*]] = fir.call @_FortranAInitialize(%[[VAL_8]], %{{.*}}, %{{.*}}) fastmath : (!fir.box, !fir.ref, i32) -> none +! CHECK: fir.call @_FortranAInitialize(%[[VAL_8]], %{{.*}}, %{{.*}}) fastmath : (!fir.box, !fir.ref, i32) -> () ! CHECK: %[[VAL_11:.*]] = fir.embox %[[VAL_3]]#0(%[[VAL_2]]) : (!fir.ref>}>>>, !fir.shape<1>) -> !fir.box>}>>> ! CHECK: %[[VAL_12:.*]] = fir.convert %[[VAL_11]] : (!fir.box>}>>>) -> !fir.class> ! 
CHECK: %[[VAL_13:.*]] = fir.convert %[[VAL_12]] : (!fir.class>) -> !fir.class diff --git a/flang/test/Lower/HLFIR/intentout-allocatable-components.f90 b/flang/test/Lower/HLFIR/intentout-allocatable-components.f90 index 9d4bedbd9be60..647f1c26c8dad 100644 --- a/flang/test/Lower/HLFIR/intentout-allocatable-components.f90 +++ b/flang/test/Lower/HLFIR/intentout-allocatable-components.f90 @@ -13,7 +13,7 @@ subroutine test_intentout_component_deallocate(a) ! CHECK: %[[VAL_1:.*]]:2 = hlfir.declare %[[VAL_0]] dummy_scope %{{[0-9]+}} {fortran_attrs = #fir.var_attrs, uniq_name = "_QFtest_intentout_component_deallocateEa"} ! CHECK: %[[VAL_2:.*]] = fir.embox %[[VAL_1]]#1 : (!fir.ref>}>>) -> !fir.box>}>> ! CHECK: %[[VAL_3:.*]] = fir.convert %[[VAL_2]] : (!fir.box>}>>) -> !fir.box -! CHECK: %[[VAL_4:.*]] = fir.call @_FortranADestroy(%[[VAL_3]]) fastmath : (!fir.box) -> none +! CHECK: fir.call @_FortranADestroy(%[[VAL_3]]) fastmath : (!fir.box) -> () subroutine test_intentout_optional_component_deallocate(a) type :: t @@ -28,5 +28,5 @@ subroutine test_intentout_optional_component_deallocate(a) ! CHECK: fir.if %[[VAL_2]] { ! CHECK: %[[VAL_3:.*]] = fir.embox %[[VAL_1]]#1 : (!fir.ref>}>>) -> !fir.box>}>> ! CHECK: %[[VAL_4:.*]] = fir.convert %[[VAL_3]] : (!fir.box>}>>) -> !fir.box -! CHECK: %[[VAL_5:.*]] = fir.call @_FortranADestroy(%[[VAL_4]]) fastmath : (!fir.box) -> none +! CHECK: fir.call @_FortranADestroy(%[[VAL_4]]) fastmath : (!fir.box) -> () ! CHECK: } diff --git a/flang/test/Lower/HLFIR/internal-procedures-polymorphic.f90 b/flang/test/Lower/HLFIR/internal-procedures-polymorphic.f90 index 5763d84cfd605..ed1d0a954d82f 100644 --- a/flang/test/Lower/HLFIR/internal-procedures-polymorphic.f90 +++ b/flang/test/Lower/HLFIR/internal-procedures-polymorphic.f90 @@ -50,7 +50,7 @@ subroutine internal() ! CHECK: %[[VAL_20:.*]] = fir.convert %[[VAL_18]] : (!fir.tdesc>) -> !fir.ref ! CHECK: %[[VAL_21:.*]] = arith.constant 1 : i32 ! CHECK: %[[VAL_22:.*]] = arith.constant 0 : i32 -! 
CHECK: %[[VAL_23:.*]] = fir.call @_FortranAPointerNullifyDerived(%[[VAL_19]], %[[VAL_20]], %[[VAL_21]], %[[VAL_22]]) fastmath : (!fir.ref>, !fir.ref, i32, i32) -> none +! CHECK: fir.call @_FortranAPointerNullifyDerived(%[[VAL_19]], %[[VAL_20]], %[[VAL_21]], %[[VAL_22]]) fastmath : (!fir.ref>, !fir.ref, i32, i32) -> () ! CHECK: } ! CHECK: fir.call @_QMcaptured_optional_polymorphicFtestPinternal(%[[VAL_7]]) diff --git a/flang/test/Lower/HLFIR/intrinsic-subroutines.f90 b/flang/test/Lower/HLFIR/intrinsic-subroutines.f90 index 216e044ec9cab..2812d124c4b7a 100644 --- a/flang/test/Lower/HLFIR/intrinsic-subroutines.f90 +++ b/flang/test/Lower/HLFIR/intrinsic-subroutines.f90 @@ -81,7 +81,7 @@ program main ! CHECK: %[[VAL_45:.*]] = arith.constant 1 : i32 ! CHECK: %[[VAL_46:.*]] = arith.constant false ! CHECK: %[[VAL_47:.*]] = arith.constant false -! CHECK: %[[VAL_48:.*]] = fir.call @_FortranAStopStatement(%[[VAL_45]], %[[VAL_46]], %[[VAL_47]]) fastmath : (i32, i1, i1) -> none +! CHECK: fir.call @_FortranAStopStatement(%[[VAL_45]], %[[VAL_46]], %[[VAL_47]]) fastmath : (i32, i1, i1) -> () ! CHECK: fir.unreachable ! CHECK: ^bb2: ! CHECK: return diff --git a/flang/test/Lower/HLFIR/local-end-of-scope-component-dealloc.f90 b/flang/test/Lower/HLFIR/local-end-of-scope-component-dealloc.f90 index 06884138d28c3..246d18b243974 100644 --- a/flang/test/Lower/HLFIR/local-end-of-scope-component-dealloc.f90 +++ b/flang/test/Lower/HLFIR/local-end-of-scope-component-dealloc.f90 @@ -25,7 +25,7 @@ subroutine test1() type(t1) :: x1 end subroutine test1 ! CHECK-LABEL: func.func @_QPtest1() { -! CHECK-DAG: %[[VAL_10:.*]] = fir.call @_FortranADestroy(%[[VAL_9:.*]]) fastmath : (!fir.box) -> none +! CHECK-DAG: fir.call @_FortranADestroy(%[[VAL_9:.*]]) fastmath : (!fir.box) -> () ! CHECK-DAG: %[[VAL_9]] = fir.convert %{{.*}} : (!fir.box>}>>) -> !fir.box subroutine test1b() @@ -35,7 +35,7 @@ subroutine test1b() end block end subroutine test1b ! CHECK-LABEL: func.func @_QPtest1b() { -! 
CHECK-DAG: %[[VAL_11:.*]] = fir.call @_FortranADestroy(%[[VAL_10:.*]]) fastmath : (!fir.box) -> none +! CHECK-DAG: fir.call @_FortranADestroy(%[[VAL_10:.*]]) fastmath : (!fir.box) -> () ! CHECK-DAG: %[[VAL_10]] = fir.convert %{{.*}} : (!fir.box>}>>) -> !fir.box subroutine test2() @@ -43,7 +43,7 @@ subroutine test2() type(t2) :: x2 end subroutine test2 ! CHECK-LABEL: func.func @_QPtest2() { -! CHECK-DAG: %[[VAL_10:.*]] = fir.call @_FortranADestroy(%[[VAL_9:.*]]) fastmath : (!fir.box) -> none +! CHECK-DAG: fir.call @_FortranADestroy(%[[VAL_9:.*]]) fastmath : (!fir.box) -> () ! CHECK-DAG: %[[VAL_9]] = fir.convert %{{.*}} : (!fir.box>}>}>>) -> !fir.box subroutine test2b() @@ -53,7 +53,7 @@ subroutine test2b() end block end subroutine test2b ! CHECK-LABEL: func.func @_QPtest2b() { -! CHECK-DAG: %[[VAL_11:.*]] = fir.call @_FortranADestroy(%[[VAL_10:.*]]) fastmath : (!fir.box) -> none +! CHECK-DAG: fir.call @_FortranADestroy(%[[VAL_10:.*]]) fastmath : (!fir.box) -> () ! CHECK-DAG: %[[VAL_10]] = fir.convert %{{.*}} : (!fir.box>}>}>>) -> !fir.box subroutine test3() @@ -61,7 +61,7 @@ subroutine test3() type(t3) :: x3 end subroutine test3 ! CHECK-LABEL: func.func @_QPtest3() { -! CHECK-DAG: %[[VAL_10:.*]] = fir.call @_FortranADestroy(%[[VAL_9:.*]]) fastmath : (!fir.box) -> none +! CHECK-DAG: fir.call @_FortranADestroy(%[[VAL_9:.*]]) fastmath : (!fir.box) -> () ! CHECK-DAG: %[[VAL_9]] = fir.convert %{{.*}} : (!fir.box>}>}>>) -> !fir.box subroutine test3b() @@ -71,7 +71,7 @@ subroutine test3b() end block end subroutine test3b ! CHECK-LABEL: func.func @_QPtest3b() { -! CHECK-DAG: %[[VAL_11:.*]] = fir.call @_FortranADestroy(%[[VAL_10:.*]]) fastmath : (!fir.box) -> none +! CHECK-DAG: fir.call @_FortranADestroy(%[[VAL_10:.*]]) fastmath : (!fir.box) -> () ! CHECK-DAG: %[[VAL_10]] = fir.convert %{{.*}} : (!fir.box>}>}>>) -> !fir.box subroutine test4() @@ -79,7 +79,7 @@ subroutine test4() type(t4) :: x4 end subroutine test4 ! CHECK-LABEL: func.func @_QPtest4() { -! 
CHECK-DAG: %[[VAL_10:.*]] = fir.call @_FortranADestroy(%[[VAL_9:.*]]) fastmath : (!fir.box) -> none +! CHECK-DAG: fir.call @_FortranADestroy(%[[VAL_9:.*]]) fastmath : (!fir.box) -> () ! CHECK-DAG: %[[VAL_9]] = fir.convert %{{.*}} : (!fir.box>}>}>}>>) -> !fir.box subroutine test4b() @@ -89,7 +89,7 @@ subroutine test4b() end block end subroutine test4b ! CHECK-LABEL: func.func @_QPtest4b() { -! CHECK-DAG: %[[VAL_11:.*]] = fir.call @_FortranADestroy(%[[VAL_10:.*]]) fastmath : (!fir.box) -> none +! CHECK-DAG: fir.call @_FortranADestroy(%[[VAL_10:.*]]) fastmath : (!fir.box) -> () ! CHECK-DAG: %[[VAL_10]] = fir.convert %{{.*}} : (!fir.box>}>}>}>>) -> !fir.box subroutine test5() @@ -97,7 +97,7 @@ subroutine test5() type(t5) :: x5 end subroutine test5 ! CHECK-LABEL: func.func @_QPtest5() { -! CHECK-DAG: %[[VAL_10:.*]] = fir.call @_FortranADestroy(%[[VAL_9:.*]]) fastmath : (!fir.box) -> none +! CHECK-DAG: fir.call @_FortranADestroy(%[[VAL_9:.*]]) fastmath : (!fir.box) -> () ! CHECK-DAG: %[[VAL_9]] = fir.convert %{{.*}} : (!fir.box>}>}>}>>) -> !fir.box subroutine test5b() @@ -107,5 +107,5 @@ subroutine test5b() end block end subroutine test5b ! CHECK-LABEL: func.func @_QPtest5b() { -! CHECK-DAG: %[[VAL_11:.*]] = fir.call @_FortranADestroy(%[[VAL_10:.*]]) fastmath : (!fir.box) -> none +! CHECK-DAG: fir.call @_FortranADestroy(%[[VAL_10:.*]]) fastmath : (!fir.box) -> () ! CHECK-DAG: %[[VAL_10]] = fir.convert %{{.*}} : (!fir.box>}>}>}>>) -> !fir.box diff --git a/flang/test/Lower/HLFIR/structure-constructor.f90 b/flang/test/Lower/HLFIR/structure-constructor.f90 index ed9ee5d0ac363..3a82145ddf4f6 100644 --- a/flang/test/Lower/HLFIR/structure-constructor.f90 +++ b/flang/test/Lower/HLFIR/structure-constructor.f90 @@ -50,7 +50,7 @@ end subroutine test1 ! CHECK: %[[VAL_11:.*]] = arith.constant {{[0-9]*}} : i32 ! CHECK: %[[VAL_12:.*]] = fir.convert %[[VAL_9]] : (!fir.box}>>) -> !fir.box ! CHECK: %[[VAL_13:.*]] = fir.convert %[[VAL_10]] : (!fir.ref>) -> !fir.ref -! 
CHECK: %[[VAL_14:.*]] = fir.call @_FortranAInitialize(%[[VAL_12]], %[[VAL_13]], %[[VAL_11]]) fastmath : (!fir.box, !fir.ref, i32) -> none +! CHECK: fir.call @_FortranAInitialize(%[[VAL_12]], %[[VAL_13]], %[[VAL_11]]) fastmath : (!fir.box, !fir.ref, i32) -> () ! CHECK: %[[VAL_15:.*]] = arith.constant 4 : index ! CHECK: %[[VAL_16:.*]] = hlfir.designate %[[VAL_8]]#0{"c"} typeparams %[[VAL_15]] : (!fir.ref}>>, index) -> !fir.ref> ! CHECK: hlfir.assign %[[VAL_7]]#0 to %[[VAL_16]] temporary_lhs : !fir.ref>, !fir.ref> @@ -78,7 +78,7 @@ end subroutine test2 ! CHECK: %[[VAL_10:.*]] = arith.constant {{[0-9]*}} : i32 ! CHECK: %[[VAL_11:.*]] = fir.convert %[[VAL_8]] : (!fir.box}>>) -> !fir.box ! CHECK: %[[VAL_12:.*]] = fir.convert %[[VAL_9]] : (!fir.ref>) -> !fir.ref -! CHECK: %[[VAL_13:.*]] = fir.call @_FortranAInitialize(%[[VAL_11]], %[[VAL_12]], %[[VAL_10]]) fastmath : (!fir.box, !fir.ref, i32) -> none +! CHECK: fir.call @_FortranAInitialize(%[[VAL_11]], %[[VAL_12]], %[[VAL_10]]) fastmath : (!fir.box, !fir.ref, i32) -> () ! CHECK: %[[VAL_14:.*]] = arith.constant 10 : index ! CHECK: %[[VAL_15:.*]] = fir.shape %[[VAL_14]] : (index) -> !fir.shape<1> ! CHECK: %[[VAL_16:.*]] = hlfir.designate %[[VAL_7]]#0{"i"} <%[[VAL_15]]> shape %[[VAL_15]] : (!fir.ref}>>, !fir.shape<1>, !fir.shape<1>) -> !fir.ref> @@ -103,7 +103,7 @@ end subroutine test3 ! CHECK: %[[VAL_6:.*]] = arith.constant {{[0-9]*}} : i32 ! CHECK: %[[VAL_7:.*]] = fir.convert %[[VAL_4]] : (!fir.box>>}>>) -> !fir.box ! CHECK: %[[VAL_8:.*]] = fir.convert %[[VAL_5]] : (!fir.ref>) -> !fir.ref -! CHECK: %[[VAL_9:.*]] = fir.call @_FortranAInitialize(%[[VAL_7]], %[[VAL_8]], %[[VAL_6]]) fastmath : (!fir.box, !fir.ref, i32) -> none +! CHECK: fir.call @_FortranAInitialize(%[[VAL_7]], %[[VAL_8]], %[[VAL_6]]) fastmath : (!fir.box, !fir.ref, i32) -> () ! 
CHECK: %[[VAL_10:.*]]:2 = hlfir.declare %[[VAL_0]] dummy_scope %{{[0-9]+}} {fortran_attrs = #fir.var_attrs, uniq_name = "_QFtest3Ex"} : (!fir.ref>>>, !fir.dscope) -> (!fir.ref>>>, !fir.ref>>>) ! CHECK: %[[VAL_11:.*]]:2 = hlfir.declare %[[VAL_1]] {uniq_name = "ctor.temp"} : (!fir.ref>>}>>) -> (!fir.ref>>}>>, !fir.ref>>}>>) ! CHECK: %[[VAL_12:.*]] = fir.embox %[[VAL_11]]#0 : (!fir.ref>>}>>) -> !fir.box>>}>> @@ -111,7 +111,7 @@ end subroutine test3 ! CHECK: %[[VAL_14:.*]] = arith.constant {{[0-9]*}} : i32 ! CHECK: %[[VAL_15:.*]] = fir.convert %[[VAL_12]] : (!fir.box>>}>>) -> !fir.box ! CHECK: %[[VAL_16:.*]] = fir.convert %[[VAL_13]] : (!fir.ref>) -> !fir.ref -! CHECK: %[[VAL_17:.*]] = fir.call @_FortranAInitialize(%[[VAL_15]], %[[VAL_16]], %[[VAL_14]]) fastmath : (!fir.box, !fir.ref, i32) -> none +! CHECK: fir.call @_FortranAInitialize(%[[VAL_15]], %[[VAL_16]], %[[VAL_14]]) fastmath : (!fir.box, !fir.ref, i32) -> () ! CHECK: %[[VAL_18:.*]] = hlfir.designate %[[VAL_11]]#0{"r"} {fortran_attrs = #fir.var_attrs} : (!fir.ref>>}>>) -> !fir.ref>>> ! CHECK: %[[VAL_19:.*]] = fir.load %[[VAL_10]]#0 : !fir.ref>>> ! CHECK: %[[VAL_20:.*]] = arith.constant 0 : index @@ -139,7 +139,7 @@ end subroutine test4 ! CHECK: %[[VAL_6:.*]] = arith.constant {{[0-9]*}} : i32 ! CHECK: %[[VAL_7:.*]] = fir.convert %[[VAL_4]] : (!fir.box>>>}>>) -> !fir.box ! CHECK: %[[VAL_8:.*]] = fir.convert %[[VAL_5]] : (!fir.ref>) -> !fir.ref -! CHECK: %[[VAL_9:.*]] = fir.call @_FortranAInitialize(%[[VAL_7]], %[[VAL_8]], %[[VAL_6]]) fastmath : (!fir.box, !fir.ref, i32) -> none +! CHECK: fir.call @_FortranAInitialize(%[[VAL_7]], %[[VAL_8]], %[[VAL_6]]) fastmath : (!fir.box, !fir.ref, i32) -> () ! CHECK: %[[VAL_10:.*]] = arith.constant 2 : index ! CHECK: %[[VAL_11:.*]]:2 = hlfir.declare %[[VAL_0]] typeparams %[[VAL_10]] dummy_scope %{{[0-9]+}} {fortran_attrs = #fir.var_attrs, uniq_name = "_QFtest4Ex"} : (!fir.ref>>>>, index, !fir.dscope) -> (!fir.ref>>>>, !fir.ref>>>>) ! 
CHECK: %[[VAL_12:.*]]:2 = hlfir.declare %[[VAL_1]] {uniq_name = "ctor.temp"} : (!fir.ref>>>}>>) -> (!fir.ref>>>}>>, !fir.ref>>>}>>) @@ -148,7 +148,7 @@ end subroutine test4 ! CHECK: %[[VAL_15:.*]] = arith.constant {{[0-9]*}} : i32 ! CHECK: %[[VAL_16:.*]] = fir.convert %[[VAL_13]] : (!fir.box>>>}>>) -> !fir.box ! CHECK: %[[VAL_17:.*]] = fir.convert %[[VAL_14]] : (!fir.ref>) -> !fir.ref -! CHECK: %[[VAL_18:.*]] = fir.call @_FortranAInitialize(%[[VAL_16]], %[[VAL_17]], %[[VAL_15]]) fastmath : (!fir.box, !fir.ref, i32) -> none +! CHECK: fir.call @_FortranAInitialize(%[[VAL_16]], %[[VAL_17]], %[[VAL_15]]) fastmath : (!fir.box, !fir.ref, i32) -> () ! CHECK: %[[VAL_19:.*]] = arith.constant 2 : index ! CHECK: %[[VAL_20:.*]] = hlfir.designate %[[VAL_12]]#0{"c"} typeparams %[[VAL_19]] {fortran_attrs = #fir.var_attrs} : (!fir.ref>>>}>>, index) -> !fir.ref>>>> ! CHECK: %[[VAL_21:.*]] = fir.load %[[VAL_11]]#1 : !fir.ref>>>> @@ -183,7 +183,7 @@ end subroutine test5 ! CHECK: %[[VAL_6:.*]] = arith.constant {{[0-9]*}} : i32 ! CHECK: %[[VAL_7:.*]] = fir.convert %[[VAL_4]] : (!fir.box>>>}>>>>}>>) -> !fir.box ! CHECK: %[[VAL_8:.*]] = fir.convert %[[VAL_5]] : (!fir.ref>) -> !fir.ref -! CHECK: %[[VAL_9:.*]] = fir.call @_FortranAInitialize(%[[VAL_7]], %[[VAL_8]], %[[VAL_6]]) fastmath : (!fir.box, !fir.ref, i32) -> none +! CHECK: fir.call @_FortranAInitialize(%[[VAL_7]], %[[VAL_8]], %[[VAL_6]]) fastmath : (!fir.box, !fir.ref, i32) -> () ! CHECK: %[[VAL_10:.*]]:2 = hlfir.declare %[[VAL_0]] dummy_scope %{{[0-9]+}} {fortran_attrs = #fir.var_attrs, uniq_name = "_QFtest5Ex"} : (!fir.ref>>>}>>>>>, !fir.dscope) -> (!fir.ref>>>}>>>>>, !fir.ref>>>}>>>>>) ! CHECK: %[[VAL_11:.*]]:2 = hlfir.declare %[[VAL_1]] {uniq_name = "ctor.temp"} : (!fir.ref>>>}>>>>}>>) -> (!fir.ref>>>}>>>>}>>, !fir.ref>>>}>>>>}>>) ! CHECK: %[[VAL_12:.*]] = fir.embox %[[VAL_11]]#0 : (!fir.ref>>>}>>>>}>>) -> !fir.box>>>}>>>>}>> @@ -191,7 +191,7 @@ end subroutine test5 ! CHECK: %[[VAL_14:.*]] = arith.constant {{[0-9]*}} : i32 ! 
CHECK: %[[VAL_15:.*]] = fir.convert %[[VAL_12]] : (!fir.box>>>}>>>>}>>) -> !fir.box ! CHECK: %[[VAL_16:.*]] = fir.convert %[[VAL_13]] : (!fir.ref>) -> !fir.ref -! CHECK: %[[VAL_17:.*]] = fir.call @_FortranAInitialize(%[[VAL_15]], %[[VAL_16]], %[[VAL_14]]) fastmath : (!fir.box, !fir.ref, i32) -> none +! CHECK: fir.call @_FortranAInitialize(%[[VAL_15]], %[[VAL_16]], %[[VAL_14]]) fastmath : (!fir.box, !fir.ref, i32) -> () ! CHECK: %[[VAL_18:.*]] = hlfir.designate %[[VAL_11]]#0{"t5m"} {fortran_attrs = #fir.var_attrs} : (!fir.ref>>>}>>>>}>>) -> !fir.ref>>>}>>>>> ! CHECK: %[[VAL_19:.*]] = fir.load %[[VAL_10]]#1 : !fir.ref>>>}>>>>> ! CHECK: %[[VAL_20:.*]] = fir.box_addr %[[VAL_19]] : (!fir.box>>>}>>>>) -> !fir.heap>>>}>>> @@ -234,7 +234,7 @@ end subroutine test6 ! CHECK: %[[VAL_15:.*]] = arith.constant {{[0-9]*}} : i32 ! CHECK: %[[VAL_16:.*]] = fir.convert %[[VAL_13]] : (!fir.box>>>}>>>>}>,t6m:!fir.array<1x!fir.type<_QMtypesTt1{c:!fir.char<1,4>}>>}>>) -> !fir.box ! CHECK: %[[VAL_17:.*]] = fir.convert %[[VAL_14]] : (!fir.ref>) -> !fir.ref -! CHECK: %[[VAL_18:.*]] = fir.call @_FortranAInitialize(%[[VAL_16]], %[[VAL_17]], %[[VAL_15]]) fastmath : (!fir.box, !fir.ref, i32) -> none +! CHECK: fir.call @_FortranAInitialize(%[[VAL_16]], %[[VAL_17]], %[[VAL_15]]) fastmath : (!fir.box, !fir.ref, i32) -> () ! CHECK: %[[VAL_19:.*]]:2 = hlfir.declare %[[VAL_0]] dummy_scope %{{[0-9]+}} {fortran_attrs = #fir.var_attrs, uniq_name = "_QFtest6Ex"} : (!fir.ref>>>}>>>>>, !fir.dscope) -> (!fir.ref>>>}>>>>>, !fir.ref>>>}>>>>>) ! CHECK: %[[VAL_20:.*]]:2 = hlfir.declare %[[VAL_6]] {uniq_name = "ctor.temp"} : (!fir.ref>>>}>>>>}>,t6m:!fir.array<1x!fir.type<_QMtypesTt1{c:!fir.char<1,4>}>>}>>) -> (!fir.ref>>>}>>>>}>,t6m:!fir.array<1x!fir.type<_QMtypesTt1{c:!fir.char<1,4>}>>}>>, !fir.ref>>>}>>>>}>,t6m:!fir.array<1x!fir.type<_QMtypesTt1{c:!fir.char<1,4>}>>}>>) ! 
CHECK: %[[VAL_21:.*]] = fir.embox %[[VAL_20]]#0 : (!fir.ref>>>}>>>>}>,t6m:!fir.array<1x!fir.type<_QMtypesTt1{c:!fir.char<1,4>}>>}>>) -> !fir.box>>>}>>>>}>,t6m:!fir.array<1x!fir.type<_QMtypesTt1{c:!fir.char<1,4>}>>}>> @@ -242,7 +242,7 @@ end subroutine test6 ! CHECK: %[[VAL_23:.*]] = arith.constant {{[0-9]*}} : i32 ! CHECK: %[[VAL_24:.*]] = fir.convert %[[VAL_21]] : (!fir.box>>>}>>>>}>,t6m:!fir.array<1x!fir.type<_QMtypesTt1{c:!fir.char<1,4>}>>}>>) -> !fir.box ! CHECK: %[[VAL_25:.*]] = fir.convert %[[VAL_22]] : (!fir.ref>) -> !fir.ref -! CHECK: %[[VAL_26:.*]] = fir.call @_FortranAInitialize(%[[VAL_24]], %[[VAL_25]], %[[VAL_23]]) fastmath : (!fir.box, !fir.ref, i32) -> none +! CHECK: fir.call @_FortranAInitialize(%[[VAL_24]], %[[VAL_25]], %[[VAL_23]]) fastmath : (!fir.box, !fir.ref, i32) -> () ! CHECK: %[[VAL_27:.*]] = hlfir.designate %[[VAL_20]]#0{"t5"} : (!fir.ref>>>}>>>>}>,t6m:!fir.array<1x!fir.type<_QMtypesTt1{c:!fir.char<1,4>}>>}>>) -> !fir.ref>>>}>>>>}>> ! CHECK: %[[VAL_28:.*]]:2 = hlfir.declare %[[VAL_5]] {uniq_name = "ctor.temp"} : (!fir.ref>>>}>>>>}>>) -> (!fir.ref>>>}>>>>}>>, !fir.ref>>>}>>>>}>>) ! CHECK: %[[VAL_29:.*]] = fir.embox %[[VAL_28]]#0 : (!fir.ref>>>}>>>>}>>) -> !fir.box>>>}>>>>}>> @@ -250,7 +250,7 @@ end subroutine test6 ! CHECK: %[[VAL_31:.*]] = arith.constant {{[0-9]*}} : i32 ! CHECK: %[[VAL_32:.*]] = fir.convert %[[VAL_29]] : (!fir.box>>>}>>>>}>>) -> !fir.box ! CHECK: %[[VAL_33:.*]] = fir.convert %[[VAL_30]] : (!fir.ref>) -> !fir.ref -! CHECK: %[[VAL_34:.*]] = fir.call @_FortranAInitialize(%[[VAL_32]], %[[VAL_33]], %[[VAL_31]]) fastmath : (!fir.box, !fir.ref, i32) -> none +! CHECK: fir.call @_FortranAInitialize(%[[VAL_32]], %[[VAL_33]], %[[VAL_31]]) fastmath : (!fir.box, !fir.ref, i32) -> () ! CHECK: %[[VAL_35:.*]] = hlfir.designate %[[VAL_28]]#0{"t5m"} {fortran_attrs = #fir.var_attrs} : (!fir.ref>>>}>>>>}>>) -> !fir.ref>>>}>>>>> ! CHECK: %[[VAL_36:.*]] = fir.load %[[VAL_19]]#1 : !fir.ref>>>}>>>>> ! 
CHECK: %[[VAL_37:.*]] = fir.box_addr %[[VAL_36]] : (!fir.box>>>}>>>>) -> !fir.heap>>>}>>> @@ -277,19 +277,19 @@ end subroutine test6 ! CHECK: %[[VAL_54:.*]] = arith.constant {{[0-9]*}} : i32 ! CHECK: %[[VAL_55:.*]] = fir.convert %[[VAL_4]] : (!fir.ref}>>>>>) -> !fir.ref> ! CHECK: %[[VAL_56:.*]] = fir.convert %[[VAL_53]] : (!fir.ref>) -> !fir.ref -! CHECK: %[[VAL_57:.*]] = fir.call @_FortranAInitArrayConstructorVector(%[[VAL_51]], %[[VAL_55]], %[[VAL_50]], %[[VAL_56]], %[[VAL_54]]) fastmath : (!fir.llvm_ptr, !fir.ref>, i1, !fir.ref, i32) -> none +! CHECK: fir.call @_FortranAInitArrayConstructorVector(%[[VAL_51]], %[[VAL_55]], %[[VAL_50]], %[[VAL_56]], %[[VAL_54]]) fastmath : (!fir.llvm_ptr, !fir.ref>, i1, !fir.ref, i32) -> () ! CHECK: %[[VAL_58:.*]]:2 = hlfir.declare %[[VAL_2]] {uniq_name = "ctor.temp"} : (!fir.ref}>>) -> (!fir.ref}>>, !fir.ref}>>) ! CHECK: %[[VAL_59:.*]] = fir.embox %[[VAL_58]]#0 : (!fir.ref}>>) -> !fir.box}>> ! CHECK: %[[VAL_60:.*]] = fir.address_of(@_QQclX{{.*}}) : !fir.ref> ! CHECK: %[[VAL_61:.*]] = arith.constant {{[0-9]*}} : i32 ! CHECK: %[[VAL_62:.*]] = fir.convert %[[VAL_59]] : (!fir.box}>>) -> !fir.box ! CHECK: %[[VAL_63:.*]] = fir.convert %[[VAL_60]] : (!fir.ref>) -> !fir.ref -! CHECK: %[[VAL_64:.*]] = fir.call @_FortranAInitialize(%[[VAL_62]], %[[VAL_63]], %[[VAL_61]]) fastmath : (!fir.box, !fir.ref, i32) -> none +! CHECK: fir.call @_FortranAInitialize(%[[VAL_62]], %[[VAL_63]], %[[VAL_61]]) fastmath : (!fir.box, !fir.ref, i32) -> () ! CHECK: %[[VAL_65:.*]] = arith.constant 4 : index ! CHECK: %[[VAL_66:.*]] = hlfir.designate %[[VAL_58]]#0{"c"} typeparams %[[VAL_65]] : (!fir.ref}>>, index) -> !fir.ref> ! CHECK: hlfir.assign %[[VAL_10]]#0 to %[[VAL_66]] temporary_lhs : !fir.ref>, !fir.ref> ! CHECK: %[[VAL_67:.*]] = fir.convert %[[VAL_58]]#1 : (!fir.ref}>>) -> !fir.llvm_ptr -! CHECK: %[[VAL_68:.*]] = fir.call @_FortranAPushArrayConstructorSimpleScalar(%[[VAL_51]], %[[VAL_67]]) fastmath : (!fir.llvm_ptr, !fir.llvm_ptr) -> none +! 
CHECK: fir.call @_FortranAPushArrayConstructorSimpleScalar(%[[VAL_51]], %[[VAL_67]]) fastmath : (!fir.llvm_ptr, !fir.llvm_ptr) -> () ! CHECK: %[[VAL_69:.*]] = arith.constant true ! CHECK: %[[VAL_70:.*]] = hlfir.as_expr %[[VAL_48]]#0 move %[[VAL_69]] : (!fir.heap}>>>, i1) -> !hlfir.expr<1x!fir.type<_QMtypesTt1{c:!fir.char<1,4>}>> ! CHECK: hlfir.assign %[[VAL_70]] to %[[VAL_44]] temporary_lhs : !hlfir.expr<1x!fir.type<_QMtypesTt1{c:!fir.char<1,4>}>>, !fir.ref}>>> @@ -323,14 +323,14 @@ end subroutine test7 ! CHECK: %[[VAL_7:.*]] = arith.constant {{[0-9]*}} : i32 ! CHECK: %[[VAL_8:.*]] = fir.convert %[[VAL_5]] : (!fir.box>>}>>) -> !fir.box ! CHECK: %[[VAL_9:.*]] = fir.convert %[[VAL_6]] : (!fir.ref>) -> !fir.ref -! CHECK: %[[VAL_10:.*]] = fir.call @_FortranAInitialize(%[[VAL_8]], %[[VAL_9]], %[[VAL_7]]) fastmath : (!fir.box, !fir.ref, i32) -> none +! CHECK: fir.call @_FortranAInitialize(%[[VAL_8]], %[[VAL_9]], %[[VAL_7]]) fastmath : (!fir.box, !fir.ref, i32) -> () ! CHECK: %[[VAL_11:.*]]:2 = hlfir.declare %[[VAL_1]] {uniq_name = "ctor.temp"} : (!fir.ref>>}>>) -> (!fir.ref>>}>>, !fir.ref>>}>>) ! CHECK: %[[VAL_12:.*]] = fir.embox %[[VAL_11]]#0 : (!fir.ref>>}>>) -> !fir.box>>}>> ! CHECK: %[[VAL_13:.*]] = fir.address_of(@_QQclX{{.*}}) : !fir.ref> ! CHECK: %[[VAL_14:.*]] = arith.constant {{[0-9]*}} : i32 ! CHECK: %[[VAL_15:.*]] = fir.convert %[[VAL_12]] : (!fir.box>>}>>) -> !fir.box ! CHECK: %[[VAL_16:.*]] = fir.convert %[[VAL_13]] : (!fir.ref>) -> !fir.ref -! CHECK: %[[VAL_17:.*]] = fir.call @_FortranAInitialize(%[[VAL_15]], %[[VAL_16]], %[[VAL_14]]) fastmath : (!fir.box, !fir.ref, i32) -> none +! CHECK: fir.call @_FortranAInitialize(%[[VAL_15]], %[[VAL_16]], %[[VAL_14]]) fastmath : (!fir.box, !fir.ref, i32) -> () ! CHECK: %[[VAL_18:.*]] = hlfir.designate %[[VAL_11]]#0{"c1"} : (!fir.ref>>}>>) -> !fir.ref ! CHECK: %[[VAL_19:.*]] = fir.load %[[VAL_2]]#0 : !fir.ref ! 
CHECK: hlfir.assign %[[VAL_19]] to %[[VAL_18]] temporary_lhs : i32, !fir.ref @@ -355,7 +355,7 @@ end subroutine test8 ! CHECK: %[[VAL_5:.*]] = arith.constant {{[0-9]*}} : i32 ! CHECK: %[[VAL_6:.*]] = fir.convert %[[VAL_3]] : (!fir.box>>}>>) -> !fir.box ! CHECK: %[[VAL_7:.*]] = fir.convert %[[VAL_4]] : (!fir.ref>) -> !fir.ref -! CHECK: %[[VAL_8:.*]] = fir.call @_FortranAInitialize(%[[VAL_6]], %[[VAL_7]], %[[VAL_5]]) fastmath : (!fir.box, !fir.ref, i32) -> none +! CHECK: fir.call @_FortranAInitialize(%[[VAL_6]], %[[VAL_7]], %[[VAL_5]]) fastmath : (!fir.box, !fir.ref, i32) -> () ! CHECK: %[[VAL_9:.*]] = fir.alloca !fir.box>> {bindc_name = "x", uniq_name = "_QFtest8Ex"} ! CHECK: %[[VAL_10:.*]] = arith.constant 12 : index ! CHECK: %[[VAL_11:.*]] = fir.zero_bits !fir.heap> @@ -368,7 +368,7 @@ end subroutine test8 ! CHECK: %[[VAL_17:.*]] = arith.constant {{[0-9]*}} : i32 ! CHECK: %[[VAL_18:.*]] = fir.convert %[[VAL_15]] : (!fir.box>>}>>) -> !fir.box ! CHECK: %[[VAL_19:.*]] = fir.convert %[[VAL_16]] : (!fir.ref>) -> !fir.ref -! CHECK: %[[VAL_20:.*]] = fir.call @_FortranAInitialize(%[[VAL_18]], %[[VAL_19]], %[[VAL_17]]) fastmath : (!fir.box, !fir.ref, i32) -> none +! CHECK: fir.call @_FortranAInitialize(%[[VAL_18]], %[[VAL_19]], %[[VAL_17]]) fastmath : (!fir.box, !fir.ref, i32) -> () ! CHECK: %[[VAL_21:.*]] = arith.constant 11 : index ! CHECK: %[[VAL_22:.*]] = hlfir.designate %[[VAL_14]]#0{"c"} typeparams %[[VAL_21]] {fortran_attrs = #fir.var_attrs} : (!fir.ref>>}>>, index) -> !fir.ref>>> ! CHECK: %[[VAL_23:.*]] = fir.load %[[VAL_13]]#1 : !fir.ref>>> @@ -404,7 +404,7 @@ end subroutine test9 ! CHECK: %[[VAL_5:.*]] = arith.constant {{[0-9]*}} : i32 ! CHECK: %[[VAL_6:.*]] = fir.convert %[[VAL_3]] : (!fir.box>>}>>) -> !fir.box ! CHECK: %[[VAL_7:.*]] = fir.convert %[[VAL_4]] : (!fir.ref>) -> !fir.ref -! CHECK: %[[VAL_8:.*]] = fir.call @_FortranAInitialize(%[[VAL_6]], %[[VAL_7]], %[[VAL_5]]) fastmath : (!fir.box, !fir.ref, i32) -> none +! 
CHECK: fir.call @_FortranAInitialize(%[[VAL_6]], %[[VAL_7]], %[[VAL_5]]) fastmath : (!fir.box, !fir.ref, i32) -> () ! CHECK: %[[VAL_9:.*]] = arith.constant 12 : index ! CHECK: %[[VAL_10:.*]] = fir.alloca !fir.char<1,12> {bindc_name = "x", uniq_name = "_QFtest9Ex"} ! CHECK: %[[VAL_11:.*]]:2 = hlfir.declare %[[VAL_10]] typeparams %[[VAL_9]] {uniq_name = "_QFtest9Ex"} : (!fir.ref>, index) -> (!fir.ref>, !fir.ref>) @@ -414,7 +414,7 @@ end subroutine test9 ! CHECK: %[[VAL_15:.*]] = arith.constant {{[0-9]*}} : i32 ! CHECK: %[[VAL_16:.*]] = fir.convert %[[VAL_13]] : (!fir.box>>}>>) -> !fir.box ! CHECK: %[[VAL_17:.*]] = fir.convert %[[VAL_14]] : (!fir.ref>) -> !fir.ref -! CHECK: %[[VAL_18:.*]] = fir.call @_FortranAInitialize(%[[VAL_16]], %[[VAL_17]], %[[VAL_15]]) fastmath : (!fir.box, !fir.ref, i32) -> none +! CHECK: fir.call @_FortranAInitialize(%[[VAL_16]], %[[VAL_17]], %[[VAL_15]]) fastmath : (!fir.box, !fir.ref, i32) -> () ! CHECK: %[[VAL_19:.*]] = arith.constant 11 : index ! CHECK: %[[VAL_20:.*]] = hlfir.designate %[[VAL_12]]#0{"c"} typeparams %[[VAL_19]] {fortran_attrs = #fir.var_attrs} : (!fir.ref>>}>>, index) -> !fir.ref>>> ! CHECK: hlfir.assign %[[VAL_11]]#0 to %[[VAL_20]] realloc keep_lhs_len temporary_lhs : !fir.ref>, !fir.ref>>> @@ -449,7 +449,7 @@ end subroutine test10 ! CHECK: %[[VAL_11:.*]] = arith.constant {{[0-9]*}} : i32 ! CHECK: %[[VAL_12:.*]] = fir.convert %[[VAL_9]] : (!fir.box}>>) -> !fir.box ! CHECK: %[[VAL_13:.*]] = fir.convert %[[VAL_10]] : (!fir.ref>) -> !fir.ref -! CHECK: %[[VAL_14:.*]] = fir.call @_FortranAInitialize(%[[VAL_12]], %[[VAL_13]], %[[VAL_11]]) fastmath : (!fir.box, !fir.ref, i32) -> none +! CHECK: fir.call @_FortranAInitialize(%[[VAL_12]], %[[VAL_13]], %[[VAL_11]]) fastmath : (!fir.box, !fir.ref, i32) -> () ! CHECK: %[[VAL_15:.*]] = arith.constant 4 : index ! CHECK: %[[VAL_16:.*]] = hlfir.designate %[[VAL_8]]#0{"c"} typeparams %[[VAL_15]] : (!fir.ref}>>, index) -> !fir.ref> ! 
CHECK: %[[VAL_17:.*]] = fir.load %[[VAL_7]]#0 : !fir.ref>>> diff --git a/flang/test/Lower/Intrinsics/abort.f90 b/flang/test/Lower/Intrinsics/abort.f90 index 942d3c8cd9af6..1b51708cbf7e7 100644 --- a/flang/test/Lower/Intrinsics/abort.f90 +++ b/flang/test/Lower/Intrinsics/abort.f90 @@ -1,7 +1,7 @@ ! RUN: bbc -emit-fir %s -o - | FileCheck %s ! CHECK-LABEL: func.func @_QPabort_test() { -! CHECK: %[[VAL_0:.*]] = fir.call @_FortranAAbort() {{.*}}: () -> none +! CHECK: fir.call @_FortranAAbort() {{.*}}: () -> () ! CHECK: return ! CHECK: } diff --git a/flang/test/Lower/Intrinsics/adjustl.f90 b/flang/test/Lower/Intrinsics/adjustl.f90 index a8d004cd52665..56c93996015f4 100644 --- a/flang/test/Lower/Intrinsics/adjustl.f90 +++ b/flang/test/Lower/Intrinsics/adjustl.f90 @@ -13,7 +13,7 @@ subroutine adjustl_test ! CHECK: %[[r3:.*]] = fir.convert %[[strBox]] : (!fir.ref>>>) -> !fir.ref> ! CHECK: %[[r4:.*]] = fir.convert %[[eBox]] : (!fir.box>) -> !fir.box ! CHECK: %[[r5:.*]] = fir.convert %[[r2]] : (!fir.ref>) -> !fir.ref - ! CHECK: %[[r6:.*]] = fir.call @_FortranAAdjustl(%[[r3]], %[[r4]], %[[r5]], %{{.*}}) {{.*}}: (!fir.ref>, !fir.box, !fir.ref, i32) -> none + ! CHECK: fir.call @_FortranAAdjustl(%[[r3]], %[[r4]], %[[r5]], %{{.*}}) {{.*}}: (!fir.ref>, !fir.box, !fir.ref, i32) -> () adjust_str = adjustl(adjust_str) end subroutine diff --git a/flang/test/Lower/Intrinsics/adjustr.f90 b/flang/test/Lower/Intrinsics/adjustr.f90 index 07aa08c994586..17c2a1647bb8d 100644 --- a/flang/test/Lower/Intrinsics/adjustr.f90 +++ b/flang/test/Lower/Intrinsics/adjustr.f90 @@ -13,7 +13,7 @@ subroutine adjustr_test ! CHECK: %[[r3:.*]] = fir.convert %[[strBox]] : (!fir.ref>>>) -> !fir.ref> ! CHECK: %[[r4:.*]] = fir.convert %[[eBox]] : (!fir.box>) -> !fir.box ! CHECK: %[[r5:.*]] = fir.convert %[[r2]] : (!fir.ref>) -> !fir.ref - ! CHECK: %[[r6:.*]] = fir.call @_FortranAAdjustr(%[[r3]], %[[r4]], %[[r5]], %{{.*}}) {{.*}}: (!fir.ref>, !fir.box, !fir.ref, i32) -> none + ! 
CHECK: fir.call @_FortranAAdjustr(%[[r3]], %[[r4]], %[[r5]], %{{.*}}) {{.*}}: (!fir.ref>, !fir.box, !fir.ref, i32) -> () adjust_str = adjustr(adjust_str) end subroutine diff --git a/flang/test/Lower/Intrinsics/all.f90 b/flang/test/Lower/Intrinsics/all.f90 index 3eb7ea70dfb16..343169f0b10ea 100644 --- a/flang/test/Lower/Intrinsics/all.f90 +++ b/flang/test/Lower/Intrinsics/all.f90 @@ -24,7 +24,7 @@ subroutine all_test2(mask, d, rslt) ! CHECK: %[[a6:.*]] = fir.convert %[[a0:.*]] : (!fir.ref>>>>) -> !fir.ref> ! CHECK: %[[a7:.*]] = fir.convert %[[arg0:.*]]: (!fir.box>>) -> !fir.box rslt = all(mask, d) -! CHECK: %[[r1:.*]] = fir.call @_FortranAAllDim(%[[a6:.*]], %[[a7:.*]], %[[a1:.*]], %{{.*}}, %{{.*}}) {{.*}}: (!fir.ref>, !fir.box, i32, !fir.ref, i32) -> none +! CHECK: fir.call @_FortranAAllDim(%[[a6:.*]], %[[a7:.*]], %[[a1:.*]], %{{.*}}, %{{.*}}) {{.*}}: (!fir.ref>, !fir.box, i32, !fir.ref, i32) -> () ! CHECK: %[[a10:.*]] = fir.load %[[a0:.*]] : !fir.ref>>>> ! CHECK: %[[a12:.*]] = fir.box_addr %[[a10:.*]] : (!fir.box>>>) -> !fir.heap>> ! CHECK: fir.freemem %[[a12:.*]] diff --git a/flang/test/Lower/Intrinsics/any.f90 b/flang/test/Lower/Intrinsics/any.f90 index 1ea22d9706744..e4dc20e3de2f3 100644 --- a/flang/test/Lower/Intrinsics/any.f90 +++ b/flang/test/Lower/Intrinsics/any.f90 @@ -24,7 +24,7 @@ subroutine any_test2(mask, d, rslt) ! CHECK-DAG: %[[a6:.*]] = fir.convert %[[a0:.*]] : (!fir.ref>>>>) -> !fir.ref> ! CHECK-DAG: %[[a7:.*]] = fir.convert %[[arg0:.*]]: (!fir.box>>) -> !fir.box rslt = any(mask, d) -! CHECK: %[[r1:.*]] = fir.call @_FortranAAnyDim(%[[a6:.*]], %[[a7:.*]], %[[a1:.*]], %{{.*}}, %{{.*}}) {{.*}}: (!fir.ref>, !fir.box, i32, !fir.ref, i32) -> none +! CHECK: fir.call @_FortranAAnyDim(%[[a6:.*]], %[[a7:.*]], %[[a1:.*]], %{{.*}}, %{{.*}}) {{.*}}: (!fir.ref>, !fir.box, i32, !fir.ref, i32) -> () ! CHECK-DAG: %[[a10:.*]] = fir.load %[[a0:.*]] : !fir.ref>>>> ! CHECK-DAG: %[[a12:.*]] = fir.box_addr %[[a10:.*]] : (!fir.box>>>) -> !fir.heap>> ! 
CHECK-DAG fir.freemem %[[a12:.*]] diff --git a/flang/test/Lower/Intrinsics/bessel_jn.f90 b/flang/test/Lower/Intrinsics/bessel_jn.f90 index 428733d547d7b..f6ea8d296d576 100644 --- a/flang/test/Lower/Intrinsics/bessel_jn.f90 +++ b/flang/test/Lower/Intrinsics/bessel_jn.f90 @@ -47,22 +47,22 @@ subroutine test_transformational_real4(x, n1, n2, r) ! ALL-DAG: %[[n1eqn2:.*]] = arith.cmpi eq, %[[n1]], %[[n2]] : i32 ! ALL: fir.if %[[xeq0]] { ! ALL: %[[resxeq0:.*]] = fir.convert %[[r]] {{.*}} - ! ALL: fir.call @_FortranABesselJnX0_4(%[[resxeq0]], %[[n1]], %[[n2]], {{.*}}, {{.*}}) {{.*}} : (!fir.ref>, i32, i32, !fir.ref, i32) -> none + ! ALL: fir.call @_FortranABesselJnX0_4(%[[resxeq0]], %[[n1]], %[[n2]], {{.*}}, {{.*}}) {{.*}} : (!fir.ref>, i32, i32, !fir.ref, i32) -> () ! ALL-NEXT: } else { ! ALL-NEXT: fir.if %[[n1ltn2]] { ! ALL-DAG: %[[n2_1:.*]] = arith.subi %[[n2]], %[[one]] : i32 ! ALL-DAG: %[[bn2:.*]] = fir.call @jnf(%[[n2]], %[[x]]) {{.*}} : (i32, f32) -> f32 ! ALL-DAG: %[[bn2_1:.*]] = fir.call @jnf(%[[n2_1]], %[[x]]) {{.*}} : (i32, f32) -> f32 ! ALL-DAG: %[[resn1ltn2:.*]] = fir.convert %[[r]] {{.*}} - ! ALL: fir.call @_FortranABesselJn_4(%[[resn1ltn2]], %[[n1]], %[[n2]], %[[x]], %[[bn2]], %[[bn2_1]], {{.*}}, {{.*}}) {{.*}} : (!fir.ref>, i32, i32, f32, f32, f32, !fir.ref, i32) -> none + ! ALL: fir.call @_FortranABesselJn_4(%[[resn1ltn2]], %[[n1]], %[[n2]], %[[x]], %[[bn2]], %[[bn2_1]], {{.*}}, {{.*}}) {{.*}} : (!fir.ref>, i32, i32, f32, f32, f32, !fir.ref, i32) -> () ! ALL-NEXT: } else { ! ALL-NEXT: fir.if %[[n1eqn2]] { ! ALL-DAG: %[[bn2:.*]] = fir.call @jnf(%[[n2]], %[[x]]) {{.*}} : (i32, f32) -> f32 ! ALL-DAG: %[[resn1eqn2:.*]] = fir.convert %[[r]] {{.*}} - ! ALL: fir.call @_FortranABesselJn_4(%[[resn1eqn2]], %[[n1]], %[[n2]], %[[x]], %[[bn2]], %[[zero]], {{.*}}, {{.*}}) {{.*}} : (!fir.ref>, i32, i32, f32, f32, f32, !fir.ref, i32) -> none + ! 
ALL: fir.call @_FortranABesselJn_4(%[[resn1eqn2]], %[[n1]], %[[n2]], %[[x]], %[[bn2]], %[[zero]], {{.*}}, {{.*}}) {{.*}} : (!fir.ref>, i32, i32, f32, f32, f32, !fir.ref, i32) -> () ! ALL-NEXT: } else { ! ALL-DAG: %[[resn1gtn2:.*]] = fir.convert %[[r]] {{.*}} - ! ALL: fir.call @_FortranABesselJn_4(%[[resn1gtn2]], %[[n1]], %[[n2]], %[[x]], %[[zero]], %[[zero]], {{.*}}, {{.*}}) {{.*}} : (!fir.ref>, i32, i32, f32, f32, f32, !fir.ref, i32) -> none + ! ALL: fir.call @_FortranABesselJn_4(%[[resn1gtn2]], %[[n1]], %[[n2]], %[[x]], %[[zero]], %[[zero]], {{.*}}, {{.*}}) {{.*}} : (!fir.ref>, i32, i32, f32, f32, f32, !fir.ref, i32) -> () ! ALL-NEXT: } ! ALL-NEXT: } ! ALL-NEXT: } @@ -90,22 +90,22 @@ subroutine test_transformational_real8(x, n1, n2, r) ! ALL-DAG: %[[n1eqn2:.*]] = arith.cmpi eq, %[[n1]], %[[n2]] : i32 ! ALL: fir.if %[[xeq0]] { ! ALL: %[[resxeq0:.*]] = fir.convert %[[r]] {{.*}} - ! ALL: fir.call @_FortranABesselJnX0_8(%[[resxeq0]], %[[n1]], %[[n2]], {{.*}}, {{.*}}) {{.*}} : (!fir.ref>, i32, i32, !fir.ref, i32) -> none + ! ALL: fir.call @_FortranABesselJnX0_8(%[[resxeq0]], %[[n1]], %[[n2]], {{.*}}, {{.*}}) {{.*}} : (!fir.ref>, i32, i32, !fir.ref, i32) -> () ! ALL-NEXT: } else { ! ALL-NEXT: fir.if %[[n1ltn2]] { ! ALL-DAG: %[[n2_1:.*]] = arith.subi %[[n2]], %[[one]] : i32 ! ALL-DAG: %[[bn2:.*]] = fir.call @jn(%[[n2]], %[[x]]) {{.*}} : (i32, f64) -> f64 ! ALL-DAG: %[[bn2_1:.*]] = fir.call @jn(%[[n2_1]], %[[x]]) {{.*}} : (i32, f64) -> f64 ! ALL-DAG: %[[resn1ltn2:.*]] = fir.convert %[[r]] {{.*}} - ! ALL: fir.call @_FortranABesselJn_8(%[[resn1ltn2]], %[[n1]], %[[n2]], %[[x]], %[[bn2]], %[[bn2_1]], {{.*}}, {{.*}}) {{.*}} : (!fir.ref>, i32, i32, f64, f64, f64, !fir.ref, i32) -> none + ! ALL: fir.call @_FortranABesselJn_8(%[[resn1ltn2]], %[[n1]], %[[n2]], %[[x]], %[[bn2]], %[[bn2_1]], {{.*}}, {{.*}}) {{.*}} : (!fir.ref>, i32, i32, f64, f64, f64, !fir.ref, i32) -> () ! ALL-NEXT: } else { ! ALL-NEXT: fir.if %[[n1eqn2]] { ! 
ALL-DAG: %[[bn2:.*]] = fir.call @jn(%[[n2]], %[[x]]) {{.*}} : (i32, f64) -> f64 ! ALL-DAG: %[[resn1eqn2:.*]] = fir.convert %[[r]] {{.*}} - ! ALL: fir.call @_FortranABesselJn_8(%[[resn1eqn2]], %[[n1]], %[[n2]], %[[x]], %[[bn2]], %[[zero]], {{.*}}, {{.*}}) {{.*}} : (!fir.ref>, i32, i32, f64, f64, f64, !fir.ref, i32) -> none + ! ALL: fir.call @_FortranABesselJn_8(%[[resn1eqn2]], %[[n1]], %[[n2]], %[[x]], %[[bn2]], %[[zero]], {{.*}}, {{.*}}) {{.*}} : (!fir.ref>, i32, i32, f64, f64, f64, !fir.ref, i32) -> () ! ALL-NEXT: } else { ! ALL-DAG: %[[resn1gtn2:.*]] = fir.convert %[[r]] {{.*}} - ! ALL: fir.call @_FortranABesselJn_8(%[[resn1gtn2]], %[[n1]], %[[n2]], %[[x]], %[[zero]], %[[zero]], {{.*}}, {{.*}}) {{.*}} : (!fir.ref>, i32, i32, f64, f64, f64, !fir.ref, i32) -> none + ! ALL: fir.call @_FortranABesselJn_8(%[[resn1gtn2]], %[[n1]], %[[n2]], %[[x]], %[[zero]], %[[zero]], {{.*}}, {{.*}}) {{.*}} : (!fir.ref>, i32, i32, f64, f64, f64, !fir.ref, i32) -> () ! ALL-NEXT: } ! ALL-NEXT: } ! ALL-NEXT: } diff --git a/flang/test/Lower/Intrinsics/bessel_yn.f90 b/flang/test/Lower/Intrinsics/bessel_yn.f90 index ac77e4db5614d..2097fb9006ff5 100644 --- a/flang/test/Lower/Intrinsics/bessel_yn.f90 +++ b/flang/test/Lower/Intrinsics/bessel_yn.f90 @@ -47,22 +47,22 @@ subroutine test_transformational_real4(x, n1, n2, r) ! ALL-DAG: %[[n1eqn2:.*]] = arith.cmpi eq, %[[n1]], %[[n2]] : i32 ! ALL: fir.if %[[xeq0]] { ! ALL: %[[resxeq0:.*]] = fir.convert %[[r]] {{.*}} - ! ALL: fir.call @_FortranABesselYnX0_4(%[[resxeq0]], %[[n1]], %[[n2]], {{.*}}, {{.*}}) {{.*}} : (!fir.ref>, i32, i32, !fir.ref, i32) -> none + ! ALL: fir.call @_FortranABesselYnX0_4(%[[resxeq0]], %[[n1]], %[[n2]], {{.*}}, {{.*}}) {{.*}} : (!fir.ref>, i32, i32, !fir.ref, i32) -> () ! ALL-NEXT: } else { ! ALL-NEXT: fir.if %[[n1ltn2]] { ! ALL-DAG: %[[n1_1:.*]] = arith.addi %[[n1]], %[[one]] : i32 ! ALL-DAG: %[[bn1:.*]] = fir.call @ynf(%[[n1]], %[[x]]) {{.*}} : (i32, f32) -> f32 ! 
ALL-DAG: %[[bn1_1:.*]] = fir.call @ynf(%[[n1_1]], %[[x]]) {{.*}} : (i32, f32) -> f32 ! ALL-DAG: %[[resn1ltn2:.*]] = fir.convert %[[r]] {{.*}} - ! ALL: fir.call @_FortranABesselYn_4(%[[resn1ltn2]], %[[n1]], %[[n2]], %[[x]], %[[bn1]], %[[bn1_1]], {{.*}}, {{.*}}) {{.*}} : (!fir.ref>, i32, i32, f32, f32, f32, !fir.ref, i32) -> none + ! ALL: fir.call @_FortranABesselYn_4(%[[resn1ltn2]], %[[n1]], %[[n2]], %[[x]], %[[bn1]], %[[bn1_1]], {{.*}}, {{.*}}) {{.*}} : (!fir.ref>, i32, i32, f32, f32, f32, !fir.ref, i32) -> () ! ALL-NEXT: } else { ! ALL-NEXT: fir.if %[[n1eqn2]] { ! ALL-DAG: %[[bn1:.*]] = fir.call @ynf(%[[n1]], %[[x]]) {{.*}} : (i32, f32) -> f32 ! ALL-DAG: %[[resn1eqn2:.*]] = fir.convert %[[r]] {{.*}} - ! ALL: fir.call @_FortranABesselYn_4(%[[resn1eqn2]], %[[n1]], %[[n2]], %[[x]], %[[bn1]], %[[zero]], {{.*}}, {{.*}}) {{.*}} : (!fir.ref>, i32, i32, f32, f32, f32, !fir.ref, i32) -> none + ! ALL: fir.call @_FortranABesselYn_4(%[[resn1eqn2]], %[[n1]], %[[n2]], %[[x]], %[[bn1]], %[[zero]], {{.*}}, {{.*}}) {{.*}} : (!fir.ref>, i32, i32, f32, f32, f32, !fir.ref, i32) -> () ! ALL-NEXT: } else { ! ALL-DAG: %[[resn1gtn2:.*]] = fir.convert %[[r]] {{.*}} - ! ALL: fir.call @_FortranABesselYn_4(%[[resn1gtn2]], %[[n1]], %[[n2]], %[[x]], %[[zero]], %[[zero]], {{.*}}, {{.*}}) {{.*}} : (!fir.ref>, i32, i32, f32, f32, f32, !fir.ref, i32) -> none + ! ALL: fir.call @_FortranABesselYn_4(%[[resn1gtn2]], %[[n1]], %[[n2]], %[[x]], %[[zero]], %[[zero]], {{.*}}, {{.*}}) {{.*}} : (!fir.ref>, i32, i32, f32, f32, f32, !fir.ref, i32) -> () ! ALL-NEXT: } ! ALL-NEXT: } ! ALL-NEXT: } @@ -90,22 +90,22 @@ subroutine test_transformational_real8(x, n1, n2, r) ! ALL-DAG: %[[n1eqn2:.*]] = arith.cmpi eq, %[[n1]], %[[n2]] : i32 ! ALL: fir.if %[[xeq0]] { ! ALL: %[[resxeq0:.*]] = fir.convert %[[r]] {{.*}} - ! ALL: fir.call @_FortranABesselYnX0_8(%[[resxeq0]], %[[n1]], %[[n2]], {{.*}}, {{.*}}) {{.*}} : (!fir.ref>, i32, i32, !fir.ref, i32) -> none + ! 
ALL: fir.call @_FortranABesselYnX0_8(%[[resxeq0]], %[[n1]], %[[n2]], {{.*}}, {{.*}}) {{.*}} : (!fir.ref>, i32, i32, !fir.ref, i32) -> () ! ALL-NEXT: } else { ! ALL-NEXT: fir.if %[[n1ltn2]] { ! ALL-DAG: %[[n1_1:.*]] = arith.addi %[[n1]], %[[one]] : i32 ! ALL-DAG: %[[bn1:.*]] = fir.call @yn(%[[n1]], %[[x]]) {{.*}} : (i32, f64) -> f64 ! ALL-DAG: %[[bn1_1:.*]] = fir.call @yn(%[[n1_1]], %[[x]]) {{.*}} : (i32, f64) -> f64 ! ALL-DAG: %[[resn1ltn2:.*]] = fir.convert %[[r]] {{.*}} - ! ALL: fir.call @_FortranABesselYn_8(%[[resn1ltn2]], %[[n1]], %[[n2]], %[[x]], %[[bn1]], %[[bn1_1]], {{.*}}, {{.*}}) {{.*}} : (!fir.ref>, i32, i32, f64, f64, f64, !fir.ref, i32) -> none + ! ALL: fir.call @_FortranABesselYn_8(%[[resn1ltn2]], %[[n1]], %[[n2]], %[[x]], %[[bn1]], %[[bn1_1]], {{.*}}, {{.*}}) {{.*}} : (!fir.ref>, i32, i32, f64, f64, f64, !fir.ref, i32) -> () ! ALL-NEXT: } else { ! ALL-NEXT: fir.if %[[n1eqn2]] { ! ALL-DAG: %[[bn1:.*]] = fir.call @yn(%[[n1]], %[[x]]) {{.*}} : (i32, f64) -> f64 ! ALL-DAG: %[[resn1eqn2:.*]] = fir.convert %[[r]] {{.*}} - ! ALL: fir.call @_FortranABesselYn_8(%[[resn1eqn2]], %[[n1]], %[[n2]], %[[x]], %[[bn1]], %[[zero]], {{.*}}, {{.*}}) {{.*}} : (!fir.ref>, i32, i32, f64, f64, f64, !fir.ref, i32) -> none + ! ALL: fir.call @_FortranABesselYn_8(%[[resn1eqn2]], %[[n1]], %[[n2]], %[[x]], %[[bn1]], %[[zero]], {{.*}}, {{.*}}) {{.*}} : (!fir.ref>, i32, i32, f64, f64, f64, !fir.ref, i32) -> () ! ALL-NEXT: } else { ! ALL-DAG: %[[resn1gtn2:.*]] = fir.convert %[[r]] {{.*}} - ! ALL: fir.call @_FortranABesselYn_8(%[[resn1gtn2]], %[[n1]], %[[n2]], %[[x]], %[[zero]], %[[zero]], {{.*}}, {{.*}}) {{.*}} : (!fir.ref>, i32, i32, f64, f64, f64, !fir.ref, i32) -> none + ! ALL: fir.call @_FortranABesselYn_8(%[[resn1gtn2]], %[[n1]], %[[n2]], %[[x]], %[[zero]], %[[zero]], {{.*}}, {{.*}}) {{.*}} : (!fir.ref>, i32, i32, f64, f64, f64, !fir.ref, i32) -> () ! ALL-NEXT: } ! ALL-NEXT: } ! 
ALL-NEXT: } diff --git a/flang/test/Lower/Intrinsics/count.f90 b/flang/test/Lower/Intrinsics/count.f90 index 1eef676e79244..c3efe6b4bf077 100644 --- a/flang/test/Lower/Intrinsics/count.f90 +++ b/flang/test/Lower/Intrinsics/count.f90 @@ -24,7 +24,7 @@ subroutine test_count2(rslt, mask) ! CHECK: %[[a6:.*]] = fir.convert %[[arg1]] : (!fir.box>>) -> !fir.box ! CHECK: %[[a7:.*]] = fir.convert %[[c4]] : (index) -> i32 rslt = count(mask, dim=1) - ! CHECK: %{{.*}} = fir.call @_FortranACountDim(%[[a5]], %[[a6]], %[[c1_i32]], %[[a7]], %{{.*}}, %{{.*}}) {{.*}}: (!fir.ref>, !fir.box, i32, i32, !fir.ref, i32) -> none + ! CHECK: fir.call @_FortranACountDim(%[[a5]], %[[a6]], %[[c1_i32]], %[[a7]], %{{.*}}, %{{.*}}) {{.*}}: (!fir.ref>, !fir.box, i32, i32, !fir.ref, i32) -> () ! CHECK: %[[a10:.*]] = fir.load %[[a0]] : !fir.ref>>> ! CHECK: %[[a12:.*]] = fir.box_addr %[[a10]] : (!fir.box>>) -> !fir.heap> ! CHECK: fir.freemem %[[a12]] diff --git a/flang/test/Lower/Intrinsics/date_and_time.f90 b/flang/test/Lower/Intrinsics/date_and_time.f90 index 900880f778447..55b1383766cb8 100644 --- a/flang/test/Lower/Intrinsics/date_and_time.f90 +++ b/flang/test/Lower/Intrinsics/date_and_time.f90 @@ -15,7 +15,7 @@ subroutine date_and_time_test(date, time, zone, values) ! CHECK: %[[zoneBuffer:.*]] = fir.convert %[[zoneUnbox]]#0 : (!fir.ref>) -> !fir.ref ! CHECK: %[[zoneLen:.*]] = fir.convert %[[zoneUnbox]]#1 : (index) -> i64 ! CHECK: %[[valuesCast:.*]] = fir.convert %[[values]] : (!fir.box>) -> !fir.box - ! CHECK: fir.call @_FortranADateAndTime(%[[dateBuffer]], %[[dateLen]], %[[timeBuffer]], %[[timeLen]], %[[zoneBuffer]], %[[zoneLen]], %{{.*}}, %{{.*}}, %[[valuesCast]]) {{.*}}: (!fir.ref, i64, !fir.ref, i64, !fir.ref, i64, !fir.ref, i32, !fir.box) -> none + ! 
CHECK: fir.call @_FortranADateAndTime(%[[dateBuffer]], %[[dateLen]], %[[timeBuffer]], %[[timeLen]], %[[zoneBuffer]], %[[zoneLen]], %{{.*}}, %{{.*}}, %[[valuesCast]]) {{.*}}: (!fir.ref, i64, !fir.ref, i64, !fir.ref, i64, !fir.ref, i32, !fir.box) -> () call date_and_time(date, time, zone, values) end subroutine @@ -31,7 +31,7 @@ subroutine date_and_time_test2(date) ! CHECK: %[[timeLen:.*]] = fir.convert %c0{{.*}} : (index) -> i64 ! CHECK: %[[zoneBuffer:.*]] = fir.convert %c0{{.*}} : (index) -> !fir.ref ! CHECK: %[[zoneLen:.*]] = fir.convert %c0{{.*}} : (index) -> i64 - ! CHECK: fir.call @_FortranADateAndTime(%[[dateBuffer]], %[[dateLen]], %[[timeBuffer]], %[[timeLen]], %[[zoneBuffer]], %[[zoneLen]], %{{.*}}, %{{.*}}, %[[values]]) {{.*}}: (!fir.ref, i64, !fir.ref, i64, !fir.ref, i64, !fir.ref, i32, !fir.box) -> none + ! CHECK: fir.call @_FortranADateAndTime(%[[dateBuffer]], %[[dateLen]], %[[timeBuffer]], %[[timeLen]], %[[zoneBuffer]], %[[zoneLen]], %{{.*}}, %{{.*}}, %[[values]]) {{.*}}: (!fir.ref, i64, !fir.ref, i64, !fir.ref, i64, !fir.ref, i32, !fir.box) -> () call date_and_time(date) end subroutine @@ -69,5 +69,5 @@ subroutine date_and_time_dynamic_optional(date, time, zone, values) ! CHECK: %[[VAL_23:.*]] = fir.convert %[[VAL_5]]#0 : (!fir.ref>) -> !fir.ref ! CHECK: %[[VAL_24:.*]] = fir.convert %[[VAL_5]]#1 : (index) -> i64 ! CHECK: %[[VAL_26:.*]] = fir.convert %[[VAL_16]] : (!fir.box>>) -> !fir.box - ! CHECK: %[[VAL_28:.*]] = fir.call @_FortranADateAndTime(%[[VAL_19]], %[[VAL_20]], %[[VAL_21]], %[[VAL_22]], %[[VAL_23]], %[[VAL_24]], %{{.*}}, %{{.*}}, %[[VAL_26]]) {{.*}}: (!fir.ref, i64, !fir.ref, i64, !fir.ref, i64, !fir.ref, i32, !fir.box) -> none + ! 
CHECK: fir.call @_FortranADateAndTime(%[[VAL_19]], %[[VAL_20]], %[[VAL_21]], %[[VAL_22]], %[[VAL_23]], %[[VAL_24]], %{{.*}}, %{{.*}}, %[[VAL_26]]) {{.*}}: (!fir.ref, i64, !fir.ref, i64, !fir.ref, i64, !fir.ref, i32, !fir.box) -> () end subroutine diff --git a/flang/test/Lower/Intrinsics/dot_product.f90 b/flang/test/Lower/Intrinsics/dot_product.f90 index e67e9d598cd84..9a825c4b9acf1 100644 --- a/flang/test/Lower/Intrinsics/dot_product.f90 +++ b/flang/test/Lower/Intrinsics/dot_product.f90 @@ -168,7 +168,7 @@ subroutine dot_prod_complex_default (x, y, z) ! CHECK-DAG: %[[res:.*]] = fir.alloca complex ! CHECK-DAG: %[[x_conv:.*]] = fir.convert %[[x]] : (!fir.box>>) -> !fir.box ! CHECK-DAG: %[[y_conv:.*]] = fir.convert %[[y]] : (!fir.box>>) -> !fir.box - ! CHECK-DAG: fir.call @_FortranACppDotProductComplex4(%[[res]], %[[x_conv]], %[[y_conv]], %{{[0-9]+}}, %{{.*}}) {{.*}}: (!fir.ref>, !fir.box, !fir.box, !fir.ref, i32) -> none + ! CHECK-DAG: fir.call @_FortranACppDotProductComplex4(%[[res]], %[[x_conv]], %[[y_conv]], %{{[0-9]+}}, %{{.*}}) {{.*}}: (!fir.ref>, !fir.box, !fir.box, !fir.ref, i32) -> () z = dot_product(x,y) end subroutine @@ -182,7 +182,7 @@ subroutine dot_prod_complex_kind_4 (x, y, z) ! CHECK-DAG: %[[res:.*]] = fir.alloca complex ! CHECK-DAG: %[[x_conv:.*]] = fir.convert %[[x]] : (!fir.box>>) -> !fir.box ! CHECK-DAG: %[[y_conv:.*]] = fir.convert %[[y]] : (!fir.box>>) -> !fir.box - ! CHECK-DAG: fir.call @_FortranACppDotProductComplex4(%[[res]], %[[x_conv]], %[[y_conv]], %{{[0-9]+}}, %{{.*}}) {{.*}}: (!fir.ref>, !fir.box, !fir.box, !fir.ref, i32) -> none + ! CHECK-DAG: fir.call @_FortranACppDotProductComplex4(%[[res]], %[[x_conv]], %[[y_conv]], %{{[0-9]+}}, %{{.*}}) {{.*}}: (!fir.ref>, !fir.box, !fir.box, !fir.ref, i32) -> () z = dot_product(x,y) end subroutine @@ -196,7 +196,7 @@ subroutine dot_prod_complex_kind_8 (x, y, z) ! CHECK-DAG: %[[res:.*]] = fir.alloca complex ! CHECK-DAG: %[[x_conv:.*]] = fir.convert %[[x]] : (!fir.box>>) -> !fir.box ! 
CHECK-DAG: %[[y_conv:.*]] = fir.convert %[[y]] : (!fir.box>>) -> !fir.box - ! CHECK-DAG: fir.call @_FortranACppDotProductComplex8(%[[res]], %[[x_conv]], %[[y_conv]], %{{[0-9]+}}, %{{.*}}) {{.*}}: (!fir.ref>, !fir.box, !fir.box, !fir.ref, i32) -> none + ! CHECK-DAG: fir.call @_FortranACppDotProductComplex8(%[[res]], %[[x_conv]], %[[y_conv]], %{{[0-9]+}}, %{{.*}}) {{.*}}: (!fir.ref>, !fir.box, !fir.box, !fir.ref, i32) -> () z = dot_product(x,y) end subroutine @@ -264,7 +264,7 @@ subroutine dot_product_mixed_int_complex(x, y, z) ! CHECK-DAG: %[[res:.*]] = fir.alloca complex ! CHECK-DAG: %[[x_conv:.*]] = fir.convert %[[x]] : (!fir.box>) -> !fir.box ! CHECK-DAG: %[[y_conv:.*]] = fir.convert %[[y]] : (!fir.box>>) -> !fir.box - ! CHECK-DAG: fir.call @_FortranACppDotProductComplex4(%[[res]], %[[x_conv]], %[[y_conv]], %{{[0-9]+}}, %{{.*}}) {{.*}}: (!fir.ref>, !fir.box, !fir.box, !fir.ref, i32) -> none + ! CHECK-DAG: fir.call @_FortranACppDotProductComplex4(%[[res]], %[[x_conv]], %[[y_conv]], %{{[0-9]+}}, %{{.*}}) {{.*}}: (!fir.ref>, !fir.box, !fir.box, !fir.ref, i32) -> () z = dot_product(x,y) end subroutine @@ -278,6 +278,6 @@ subroutine dot_product_mixed_real_complex(x, y, z) ! CHECK-DAG: %[[res:.*]] = fir.alloca complex ! CHECK-DAG: %[[x_conv:.*]] = fir.convert %[[x]] : (!fir.box>) -> !fir.box ! CHECK-DAG: %[[y_conv:.*]] = fir.convert %[[y]] : (!fir.box>>) -> !fir.box - ! CHECK-DAG: fir.call @_FortranACppDotProductComplex4(%[[res]], %[[x_conv]], %[[y_conv]], %{{[0-9]+}}, %{{.*}}) {{.*}}: (!fir.ref>, !fir.box, !fir.box, !fir.ref, i32) -> none + ! 
CHECK-DAG: fir.call @_FortranACppDotProductComplex4(%[[res]], %[[x_conv]], %[[y_conv]], %{{[0-9]+}}, %{{.*}}) {{.*}}: (!fir.ref>, !fir.box, !fir.box, !fir.ref, i32) -> () z = dot_product(x,y) end subroutine diff --git a/flang/test/Lower/Intrinsics/eoshift.f90 b/flang/test/Lower/Intrinsics/eoshift.f90 index 5d916dcdb56c2..9cd0b86fadc52 100644 --- a/flang/test/Lower/Intrinsics/eoshift.f90 +++ b/flang/test/Lower/Intrinsics/eoshift.f90 @@ -19,7 +19,7 @@ subroutine eoshift_test1(arr, shift) ! CHECK: %[[resIRBox:.*]] = fir.convert %[[resBox]] : (!fir.ref>>>>) -> !fir.ref> ! CHECK: %[[arrBox:.*]] = fir.convert %[[arr]] : (!fir.box>>) -> !fir.box ! CHECK: %[[shiftBox:.*]] = fir.convert %[[shift]] : (i32) -> i64 - ! CHECK: %[[tmp:.*]] = fir.call @_FortranAEoshiftVector(%[[resIRBox]], %[[arrBox]], %[[shiftBox]], %[[boundBox]], {{.*}}, {{.*}}) {{.*}}: (!fir.ref>, !fir.box, i64, !fir.box, !fir.ref, i32) -> none + ! CHECK: fir.call @_FortranAEoshiftVector(%[[resIRBox]], %[[arrBox]], %[[shiftBox]], %[[boundBox]], {{.*}}, {{.*}}) {{.*}}: (!fir.ref>, !fir.box, i64, !fir.box, !fir.ref, i32) -> () ! CHECK: fir.array_merge_store %[[resLoad]], {{.*}} to %[[res]] : !fir.array<3x!fir.logical<4>>, !fir.array<3x!fir.logical<4>>, !fir.ref>> end subroutine eoshift_test1 @@ -43,7 +43,7 @@ subroutine eoshift_test2(arr, shift, bound, dim) ! CHECK: %[[shiftBoxNone:.*]] = fir.convert %[[shiftBox]] : (!fir.box>) -> !fir.box ! CHECK: %[[boundBoxNone:.*]] = fir.convert %[[boundBox]] : (!fir.box) -> !fir.box - ! CHECK: %[[tmp:.*]] = fir.call @_FortranAEoshift(%[[resIRBox]], %[[arrBox]], %[[shiftBoxNone]], %[[boundBoxNone]], %[[dim]], {{.*}}, {{.*}}) {{.*}}: (!fir.ref>, !fir.box, !fir.box, !fir.box, i32, !fir.ref, i32) -> none + ! CHECK: fir.call @_FortranAEoshift(%[[resIRBox]], %[[arrBox]], %[[shiftBoxNone]], %[[boundBoxNone]], %[[dim]], {{.*}}, {{.*}}) {{.*}}: (!fir.ref>, !fir.box, !fir.box, !fir.box, i32, !fir.ref, i32) -> () ! 
CHECK: fir.array_merge_store %[[resLoad]], {{.*}} to %[[res]] : !fir.array<3x3xi32>, !fir.array<3x3xi32>, !fir.ref> end subroutine eoshift_test2 @@ -67,7 +67,7 @@ subroutine eoshift_test3(arr, shift, dim) ! CHECK: %[[resIRBox:.*]] = fir.convert %[[resBox]] : (!fir.ref>>>>) -> !fir.ref> ! CHECK: %[[arrayBoxNone:.*]] = fir.convert %[[arrayBox]] : (!fir.box>>) -> !fir.box ! CHECK: %[[shiftBoxNone:.*]] = fir.convert %[[shiftBox]] : (!fir.box) -> !fir.box - ! CHECK: %[[tmp:.*]] = fir.call @_FortranAEoshift(%[[resIRBox]], %[[arrayBoxNone]], %[[shiftBoxNone]], %[[boundBox]], %[[dim]], {{.*}}, {{.*}}) {{.*}}: (!fir.ref>, !fir.box, !fir.box, !fir.box, i32, !fir.ref, i32) -> none + ! CHECK: fir.call @_FortranAEoshift(%[[resIRBox]], %[[arrayBoxNone]], %[[shiftBoxNone]], %[[boundBox]], %[[dim]], {{.*}}, {{.*}}) {{.*}}: (!fir.ref>, !fir.box, !fir.box, !fir.box, i32, !fir.ref, i32) -> () ! CHECK: fir.array_merge_store %[[resLoad]], {{.*}} to %[[res]] : !fir.array<3x3x!fir.char<1,4>>, !fir.array<3x3x!fir.char<1,4>>, !fir.ref>> end subroutine eoshift_test3 @@ -90,5 +90,5 @@ subroutine eoshift_test_dynamic_optional(array, shift, boundary) ! CHECK: %[[VAL_8:.*]] = fir.absent !fir.box> ! CHECK: %[[VAL_9:.*]] = arith.select %[[VAL_5]], %[[VAL_7]], %[[VAL_8]] : !fir.box> ! CHECK: %[[VAL_21:.*]] = fir.convert %[[VAL_9]] : (!fir.box>) -> !fir.box - ! CHECK: fir.call @_FortranAEoshift(%{{.*}}, %{{.*}}, %{{.*}}, %[[VAL_21]], %{{.*}}, %{{.*}}, %{{.*}}) {{.*}}: (!fir.ref>, !fir.box, !fir.box, !fir.box, i32, !fir.ref, i32) -> none + ! 
CHECK: fir.call @_FortranAEoshift(%{{.*}}, %{{.*}}, %{{.*}}, %[[VAL_21]], %{{.*}}, %{{.*}}, %{{.*}}) {{.*}}: (!fir.ref>, !fir.box, !fir.box, !fir.box, i32, !fir.ref, i32) -> () end subroutine diff --git a/flang/test/Lower/Intrinsics/etime-function.f90 b/flang/test/Lower/Intrinsics/etime-function.f90 index e3279189c7523..f4594cee7525d 100644 --- a/flang/test/Lower/Intrinsics/etime-function.f90 +++ b/flang/test/Lower/Intrinsics/etime-function.f90 @@ -18,8 +18,8 @@ subroutine etime_test(values, time) ! CHECK-NEXT: %[[timeTmpBox:.*]] = fir.embox %[[timeTmpAddr]] : (!fir.ref) -> !fir.box ! CHECK: %[[values:.*]] = fir.convert %[[valuesBox]] : (!fir.box>) -> !fir.box ! CHECK: %[[timeTmp:.*]] = fir.convert %[[timeTmpBox]] : (!fir.box) -> !fir.box - ! CHECK: %[[VAL_9:.*]] = fir.call @_FortranAEtime(%[[values]], %[[timeTmp]], %[[VAL_7:.*]], %[[c9]]) fastmath : (!fir.box, !fir.box, !fir.ref, i32) -> none + ! CHECK: fir.call @_FortranAEtime(%[[values]], %[[timeTmp]], %[[VAL_7:.*]], %[[c9]]) fastmath : (!fir.box, !fir.box, !fir.ref, i32) -> () ! CHECK-NEXT: %[[timeValue:.*]] = fir.load %[[timeTmpAddr]] : !fir.ref ! CHECK-NEXT: fir.store %[[timeValue]] to %[[timeDeclare]] : !fir.ref ! CHECK-NEXT: return -end subroutine etime_test \ No newline at end of file +end subroutine etime_test diff --git a/flang/test/Lower/Intrinsics/etime.f90 b/flang/test/Lower/Intrinsics/etime.f90 index 3e7ae0e9a406d..fe5d16b64cd0c 100644 --- a/flang/test/Lower/Intrinsics/etime.f90 +++ b/flang/test/Lower/Intrinsics/etime.f90 @@ -17,6 +17,6 @@ subroutine etime_test(values, time) ! CHECK-NEXT: %[[timeBox:.*]] = fir.embox %[[timeDeclare]] : (!fir.ref) -> !fir.box ! CHECK: %[[values:.*]] = fir.convert %[[valuesBox]] : (!fir.box>) -> !fir.box ! CHECK: %[[time:.*]] = fir.convert %[[timeBox]] : (!fir.box) -> !fir.box - ! CHECK: %[[VAL_9:.*]] = fir.call @_FortranAEtime(%[[values]], %[[time]], %[[VAL_7:.*]], %[[c9]]) fastmath : (!fir.box, !fir.box, !fir.ref, i32) -> none + ! 
CHECK: fir.call @_FortranAEtime(%[[values]], %[[time]], %[[VAL_7:.*]], %[[c9]]) fastmath : (!fir.box, !fir.box, !fir.ref, i32) -> () ! CHECK-NEXT: return -end subroutine etime_test \ No newline at end of file +end subroutine etime_test diff --git a/flang/test/Lower/Intrinsics/execute_command_line-optional.f90 b/flang/test/Lower/Intrinsics/execute_command_line-optional.f90 index e4f9a241197c8..00a3258c9a647 100644 --- a/flang/test/Lower/Intrinsics/execute_command_line-optional.f90 +++ b/flang/test/Lower/Intrinsics/execute_command_line-optional.f90 @@ -50,6 +50,6 @@ subroutine all_args_optional(command, isWait, exitVal, cmdVal, msg) ! CHECK-NEXT: %[[exitstat:.*]] = fir.convert %[[exitstatBox]] : (!fir.box) -> !fir.box ! CHECK-NEXT: %[[cmdstat:.*]] = fir.convert %[[cmdstatBox]] : (!fir.box) -> !fir.box ! CHECK-NEXT: %[[cmdmsg:.*]] = fir.convert %[[cmdmsgBox]] : (!fir.box>) -> !fir.box -! CHECK: %[[VAL_30:.*]] = fir.call @_FortranAExecuteCommandLine(%[[command]], %[[wait]], %[[exitstat]], %[[cmdstat]], %[[cmdmsg]], %[[VAL_29:.*]], %[[c14]]) fastmath : (!fir.box, i1, !fir.box, !fir.box, !fir.box, !fir.ref, i32) -> none +! CHECK: fir.call @_FortranAExecuteCommandLine(%[[command]], %[[wait]], %[[exitstat]], %[[cmdstat]], %[[cmdmsg]], %[[VAL_29:.*]], %[[c14]]) fastmath : (!fir.box, i1, !fir.box, !fir.box, !fir.box, !fir.ref, i32) -> () ! CHECK-NEXT: return end subroutine all_args_optional diff --git a/flang/test/Lower/Intrinsics/execute_command_line.f90 b/flang/test/Lower/Intrinsics/execute_command_line.f90 index 6bde50e807b28..77f1750c504bd 100644 --- a/flang/test/Lower/Intrinsics/execute_command_line.f90 +++ b/flang/test/Lower/Intrinsics/execute_command_line.f90 @@ -42,7 +42,7 @@ subroutine all_args(command, isWait, exitVal, cmdVal, msg) ! CHECK-NEXT: %[[exitstat:.*]] = fir.convert %[[exitstatBox]] : (!fir.box) -> !fir.box ! CHECK-NEXT: %[[cmdstat:.*]] = fir.convert %[[cmdstatBox]] : (!fir.box) -> !fir.box ! 
CHECK-NEXT: %[[cmdmsg:.*]] = fir.convert %[[cmdmsgBox]] : (!fir.box>) -> !fir.box -! CHECK: %[[VAL_22:.*]] = fir.call @_FortranAExecuteCommandLine(%[[command]], %[[wait]], %[[exitstat]], %[[cmdstat]], %[[cmdmsg]], %[[VAL_20:.*]], %[[c13]]) fastmath : (!fir.box, i1, !fir.box, !fir.box, !fir.box, !fir.ref, i32) -> none +! CHECK: fir.call @_FortranAExecuteCommandLine(%[[command]], %[[wait]], %[[exitstat]], %[[cmdstat]], %[[cmdmsg]], %[[VAL_20:.*]], %[[c13]]) fastmath : (!fir.box, i1, !fir.box, !fir.box, !fir.box, !fir.ref, i32) -> () ! CHECK-NEXT: return end subroutine all_args @@ -61,6 +61,6 @@ subroutine only_command_default_wait_true(command) ! CHECK-NEXT: %[[commandBox:.*]] = fir.embox %[[commandDeclare]] : (!fir.ref>) -> !fir.box> ! CHECK-NEXT: %[[absent:.*]] = fir.absent !fir.box ! CHECK: %[[command:.*]] = fir.convert %[[commandBox]] : (!fir.box>) -> !fir.box -! CHECK: %[[VAL_8:.*]] = fir.call @_FortranAExecuteCommandLine(%[[command]], %true, %[[absent]], %[[absent]], %[[absent]], %[[VAL_7:.*]], %[[c52]]) fastmath : (!fir.box, i1, !fir.box, !fir.box, !fir.box, !fir.ref, i32) -> none +! CHECK: fir.call @_FortranAExecuteCommandLine(%[[command]], %true, %[[absent]], %[[absent]], %[[absent]], %[[VAL_7:.*]], %[[c52]]) fastmath : (!fir.box, i1, !fir.box, !fir.box, !fir.box, !fir.ref, i32) -> () ! CHECK-NEXT: return end subroutine only_command_default_wait_true diff --git a/flang/test/Lower/Intrinsics/exit-2.f90 b/flang/test/Lower/Intrinsics/exit-2.f90 index 7158eeb70db81..458d2a83cf76b 100644 --- a/flang/test/Lower/Intrinsics/exit-2.f90 +++ b/flang/test/Lower/Intrinsics/exit-2.f90 @@ -14,7 +14,7 @@ subroutine exit_opt_dummy(status) ! CHECK: %[[VAL_4:.*]] = arith.constant 0 : i32 ! CHECK: fir.result %[[VAL_4]] : i32 ! CHECK: } -! CHECK: %[[VAL_5:.*]] = fir.call @_FortranAExit(%[[VAL_6:.*]]) {{.*}}: (i32) -> none +! CHECK: fir.call @_FortranAExit(%[[VAL_6:.*]]) {{.*}}: (i32) -> () end subroutine ! 
CHECK-LABEL: func @_QPexit_pointer( @@ -36,5 +36,5 @@ subroutine exit_pointer(status) ! CHECK: %[[VAL_10:.*]] = arith.constant 0 : i32 ! CHECK: fir.result %[[VAL_10]] : i32 ! CHECK: } -! CHECK: %[[VAL_11:.*]] = fir.call @_FortranAExit(%[[VAL_12:.*]]) {{.*}}: (i32) -> none +! CHECK: fir.call @_FortranAExit(%[[VAL_12:.*]]) {{.*}}: (i32) -> () end subroutine diff --git a/flang/test/Lower/Intrinsics/exit.f90 b/flang/test/Lower/Intrinsics/exit.f90 index bd551f7318a84..d80efc556f95e 100644 --- a/flang/test/Lower/Intrinsics/exit.f90 +++ b/flang/test/Lower/Intrinsics/exit.f90 @@ -7,8 +7,8 @@ subroutine exit_test1 call exit() ! CHECK: %[[status:.*]] = arith.constant 0 : i[[DEFAULT_INTEGER_SIZE]] ! CHECK-64: %[[statusConvert:.*]] = fir.convert %[[status]] : (i64) -> i32 - ! CHECK-32: %{{[0-9]+}} = fir.call @_FortranAExit(%[[status]]) {{.*}}: (i32) -> none - ! CHECK-64: %{{[0-9]+}} = fir.call @_FortranAExit(%[[statusConvert]]) {{.*}}: (i32) -> none + ! CHECK-32: fir.call @_FortranAExit(%[[status]]) {{.*}}: (i32) -> () + ! CHECK-64: fir.call @_FortranAExit(%[[statusConvert]]) {{.*}}: (i32) -> () end subroutine exit_test1 ! CHECK-LABEL: func @_QPexit_test2( @@ -18,6 +18,6 @@ subroutine exit_test2(status) call exit(status) ! CHECK: %[[status:.*]] = fir.load %[[statusArg]] : !fir.ref ! CHECK-64: %[[statusConv:.*]] = fir.convert %[[status]] : (i64) -> i32 - ! CHECK-32: %{{[0-9]+}} = fir.call @_FortranAExit(%[[status]]) {{.*}}: (i32) -> none - ! CHECK-64: %{{[0-9]+}} = fir.call @_FortranAExit(%[[statusConv]]) {{.*}}: (i32) -> none + ! CHECK-32: fir.call @_FortranAExit(%[[status]]) {{.*}}: (i32) -> () + ! CHECK-64: fir.call @_FortranAExit(%[[statusConv]]) {{.*}}: (i32) -> () end subroutine exit_test2 diff --git a/flang/test/Lower/Intrinsics/findloc.f90 b/flang/test/Lower/Intrinsics/findloc.f90 index b8b337e8f085f..a82a5277753ac 100644 --- a/flang/test/Lower/Intrinsics/findloc.f90 +++ b/flang/test/Lower/Intrinsics/findloc.f90 @@ -18,7 +18,7 @@ function findloc_test_1d(a, v) ! 
CHECK-DAG: %[[mask:.*]] = fir.convert %[[m]] : (!fir.box) -> !fir.box ! CHECK-DAG: %[[kind:.*]] = fir.convert %[[c4]] : (index) -> i32 findloc_test_1d = findloc(a, v) - ! CHECK: %{{.*}} = fir.call @_FortranAFindloc(%[[res]], %[[arr]], %[[val]], %[[kind]], %{{.*}}, %{{.*}}, %[[mask]], %false) fastmath : (!fir.ref>, !fir.box, !fir.box, i32, !fir.ref, i32, !fir.box, i1) -> none + ! CHECK: fir.call @_FortranAFindloc(%[[res]], %[[arr]], %[[val]], %[[kind]], %{{.*}}, %{{.*}}, %[[mask]], %false) fastmath : (!fir.ref>, !fir.box, !fir.box, i32, !fir.ref, i32, !fir.box, i1) -> () ! CHECK: %[[box:.*]] = fir.load %[[r]] : !fir.ref>>> ! CHECK: %[[addr:.*]] = fir.box_addr %[[box]] : (!fir.box>>) -> !fir.heap> ! CHECK: fir.freemem %[[addr]] : !fir.heap> @@ -41,7 +41,7 @@ function findloc_test_2d(a, v) ! CHECK-DAG: %[[mask:.*]] = fir.convert %[[m]] : (!fir.box) -> !fir.box ! CHECK-DAG: %[[kind:.*]] = fir.convert %[[c4]] : (index) -> i32 findloc_test_2d = findloc(a, v) - ! CHECK: %{{.*}} = fir.call @_FortranAFindloc(%[[res]], %[[arr]], %[[val]], %[[kind]], %{{.*}}, %{{.*}}, %[[mask]], %false) fastmath : (!fir.ref>, !fir.box, !fir.box, i32, !fir.ref, i32, !fir.box, i1) -> none + ! CHECK: fir.call @_FortranAFindloc(%[[res]], %[[arr]], %[[val]], %[[kind]], %{{.*}}, %{{.*}}, %[[mask]], %false) fastmath : (!fir.ref>, !fir.box, !fir.box, i32, !fir.ref, i32, !fir.box, i1) -> () ! CHECK: %[[box:.*]] = fir.load %[[r]] : !fir.ref>>> ! CHECK: %[[addr:.*]] = fir.box_addr %[[box]] : (!fir.box>>) -> !fir.heap> ! CHECK: fir.freemem %[[addr]] : !fir.heap> @@ -66,7 +66,7 @@ function findloc_test_byval(a, v) ! CHECK-DAG: %[[mask:.*]] = fir.convert %[[m]] : (!fir.box) -> !fir.box ! CHECK-DAG: %[[kind:.*]] = fir.convert %[[c4]] : (index) -> i32 findloc_test_byval = findloc(a, v) - ! CHECK: %{{.*}} = fir.call @_FortranAFindloc(%[[res]], %[[arr]], %[[val]], %[[kind]], %{{.*}}, %{{.*}}, %[[mask]], %false) fastmath : (!fir.ref>, !fir.box, !fir.box, i32, !fir.ref, i32, !fir.box, i1) -> none + ! 
CHECK: fir.call @_FortranAFindloc(%[[res]], %[[arr]], %[[val]], %[[kind]], %{{.*}}, %{{.*}}, %[[mask]], %false) fastmath : (!fir.ref>, !fir.box, !fir.box, i32, !fir.ref, i32, !fir.box, i1) -> () ! CHECK: %[[box:.*]] = fir.load %[[r]] : !fir.ref>>> ! CHECK: %[[addr:.*]] = fir.box_addr %[[box]] : (!fir.box>>) -> !fir.heap> ! CHECK: fir.freemem %[[addr]] : !fir.heap> @@ -89,7 +89,7 @@ function findloc_test_back_true(a, v) ! CHECK-DAG: %[[mask:.*]] = fir.convert %[[m]] : (!fir.box) -> !fir.box ! CHECK-DAG: %[[kind:.*]] = fir.convert %[[c4]] : (index) -> i32 findloc_test_back_true = findloc(a, v, back=.true.) - ! CHECK: %{{.*}} = fir.call @_FortranAFindloc(%[[res]], %[[arr]], %[[val]], %[[kind]], %{{.*}}, %{{.*}}, %[[mask]], %true) fastmath : (!fir.ref>, !fir.box, !fir.box, i32, !fir.ref, i32, !fir.box, i1) -> none + ! CHECK: fir.call @_FortranAFindloc(%[[res]], %[[arr]], %[[val]], %[[kind]], %{{.*}}, %{{.*}}, %[[mask]], %true) fastmath : (!fir.ref>, !fir.box, !fir.box, i32, !fir.ref, i32, !fir.box, i1) -> () ! CHECK: %[[box:.*]] = fir.load %[[r]] : !fir.ref>>> ! CHECK: %[[addr:.*]] = fir.box_addr %[[box]] : (!fir.box>>) -> !fir.heap> ! CHECK: fir.freemem %[[addr]] : !fir.heap> @@ -114,7 +114,7 @@ function findloc_test_back(a, v, back) ! CHECK-DAG: %[[kind:.*]] = fir.convert %[[c4]] : (index) -> i32 ! CHECK-DAG: %[[back:.*]] = fir.convert %[[b]] : (!fir.logical<4>) -> i1 findloc_test_back = findloc(a, v, back=back) - ! CHECK: %{{.*}} = fir.call @_FortranAFindloc(%[[res]], %[[arr]], %[[val]], %[[kind]], %{{.*}}, %{{.*}}, %[[mask]], %[[back]]) fastmath : (!fir.ref>, !fir.box, !fir.box, i32, !fir.ref, i32, !fir.box, i1) -> none + ! CHECK: fir.call @_FortranAFindloc(%[[res]], %[[arr]], %[[val]], %[[kind]], %{{.*}}, %{{.*}}, %[[mask]], %[[back]]) fastmath : (!fir.ref>, !fir.box, !fir.box, i32, !fir.ref, i32, !fir.box, i1) -> () ! CHECK: %[[box:.*]] = fir.load %[[r]] : !fir.ref>>> ! CHECK: %[[addr:.*]] = fir.box_addr %[[box]] : (!fir.box>>) -> !fir.heap> ! 
CHECK: fir.freemem %[[addr]] : !fir.heap> @@ -138,7 +138,7 @@ subroutine findloc_test_dim(a, v, res) ! CHECK-DAG: %[[mask:.*]] = fir.convert %[[m]] : (!fir.box) -> !fir.box ! CHECK-DAG: %[[kind:.*]] = fir.convert %[[c4]] : (index) -> i32 res = findloc(a, v, dim=1) - ! CHECK: %{{.*}} = fir.call @_FortranAFindlocDim(%[[res]], %[[arr]], %[[val]], %[[kind]], %[[c1]], %{{.*}}, %{{.*}}, %[[mask]], %false) fastmath : (!fir.ref>, !fir.box, !fir.box, i32, i32, !fir.ref, i32, !fir.box, i1) -> none + ! CHECK: fir.call @_FortranAFindlocDim(%[[res]], %[[arr]], %[[val]], %[[kind]], %[[c1]], %{{.*}}, %{{.*}}, %[[mask]], %false) fastmath : (!fir.ref>, !fir.box, !fir.box, i32, i32, !fir.ref, i32, !fir.box, i1) -> () ! CHECK: %[[box:.*]] = fir.load %[[r]] : !fir.ref>>> ! CHECK: %[[addr:.*]] = fir.box_addr %[[box]] : (!fir.box>>) -> !fir.heap> ! CHECK: fir.freemem %[[addr]] : !fir.heap> @@ -163,7 +163,7 @@ subroutine findloc_test_dim_unknown(a, v, dim, res) ! CHECK-DAG: %[[mask:.*]] = fir.convert %[[m]] : (!fir.box) -> !fir.box ! CHECK-DAG: %[[kind:.*]] = fir.convert %[[c4]] : (index) -> i32 res = findloc(a, v, dim=dim) - ! CHECK: %{{.*}} = fir.call @_FortranAFindlocDim(%[[res]], %[[arr]], %[[val]], %[[kind]], %[[dim]], %{{.*}}, %{{.*}}, %[[mask]], %false) fastmath : (!fir.ref>, !fir.box, !fir.box, i32, i32, !fir.ref, i32, !fir.box, i1) -> none + ! CHECK: fir.call @_FortranAFindlocDim(%[[res]], %[[arr]], %[[val]], %[[kind]], %[[dim]], %{{.*}}, %{{.*}}, %[[mask]], %false) fastmath : (!fir.ref>, !fir.box, !fir.box, i32, i32, !fir.ref, i32, !fir.box, i1) -> () ! CHECK: %[[box:.*]] = fir.load %[[r]] : !fir.ref>>> ! CHECK: %[[addr:.*]] = fir.box_addr %[[box]] : (!fir.box>>) -> !fir.heap> ! CHECK: fir.freemem %[[addr]] : !fir.heap> @@ -185,7 +185,7 @@ subroutine findloc_test_kind(a, v, res) ! CHECK-DAG: %[[val:.*]] = fir.convert %[[v]] : (!fir.box) -> !fir.box ! CHECK-DAG: %[[mask:.*]] = fir.convert %[[m]] : (!fir.box) -> !fir.box res = findloc(a, v, kind=8) - ! 
CHECK: %{{.*}} = fir.call @_FortranAFindloc(%[[res]], %[[arr]], %[[val]], %[[kind]], %{{.*}}, %{{.*}}, %[[mask]], %false) fastmath : (!fir.ref>, !fir.box, !fir.box, i32, !fir.ref, i32, !fir.box, i1) -> none + ! CHECK: fir.call @_FortranAFindloc(%[[res]], %[[arr]], %[[val]], %[[kind]], %{{.*}}, %{{.*}}, %[[mask]], %false) fastmath : (!fir.ref>, !fir.box, !fir.box, i32, !fir.ref, i32, !fir.box, i1) -> () ! CHECK: %[[box:.*]] = fir.load %[[r]] : !fir.ref>>> ! CHECK: %[[addr:.*]] = fir.box_addr %[[box]] : (!fir.box>>) -> !fir.heap> ! CHECK: fir.freemem %[[addr]] : !fir.heap> @@ -208,7 +208,7 @@ subroutine findloc_test_non_scalar_mask(a, v, mask, res) ! CHECK-DAG: %[[mask:.*]] = fir.convert %[[arg2]] : (!fir.box>>) -> !fir.box ! CHECK-DAG: %[[kind:.*]] = fir.convert %[[c4]] : (index) -> i32 res = findloc(a, v, mask=mask) - ! CHECK: %{{.*}} = fir.call @_FortranAFindloc(%[[res]], %[[arr]], %[[val]], %[[kind]], %{{.*}}, %{{.*}}, %[[mask]], %false) fastmath : (!fir.ref>, !fir.box, !fir.box, i32, !fir.ref, i32, !fir.box, i1) -> none + ! CHECK: fir.call @_FortranAFindloc(%[[res]], %[[arr]], %[[val]], %[[kind]], %{{.*}}, %{{.*}}, %[[mask]], %false) fastmath : (!fir.ref>, !fir.box, !fir.box, i32, !fir.ref, i32, !fir.box, i1) -> () ! CHECK: %[[box:.*]] = fir.load %[[r]] : !fir.ref>>> ! CHECK: %[[addr:.*]] = fir.box_addr %[[box]] : (!fir.box>>) -> !fir.heap> ! CHECK: fir.freemem %[[addr]] : !fir.heap> @@ -232,7 +232,7 @@ subroutine findloc_test_scalar_mask(a, v, mask, res) ! CHECK-DAG: %[[mask:.*]] = fir.convert %[[m]] : (!fir.box>) -> !fir.box ! CHECK-DAG: %[[kind:.*]] = fir.convert %[[c4]] : (index) -> i32 res = findloc(a, v, mask=mask) - ! CHECK: %{{.*}} = fir.call @_FortranAFindloc(%[[res]], %[[arr]], %[[val]], %[[kind]], %{{.*}}, %{{.*}}, %[[mask]], %false) fastmath : (!fir.ref>, !fir.box, !fir.box, i32, !fir.ref, i32, !fir.box, i1) -> none + ! 
CHECK: fir.call @_FortranAFindloc(%[[res]], %[[arr]], %[[val]], %[[kind]], %{{.*}}, %{{.*}}, %[[mask]], %false) fastmath : (!fir.ref>, !fir.box, !fir.box, i32, !fir.ref, i32, !fir.box, i1) -> () ! CHECK: %[[box:.*]] = fir.load %[[r]] : !fir.ref>>> ! CHECK: %[[addr:.*]] = fir.box_addr %[[box]] : (!fir.box>>) -> !fir.heap> ! CHECK: fir.freemem %[[addr]] : !fir.heap> @@ -258,7 +258,7 @@ subroutine findloc_test_all(a, v, dim, mask, back, res) ! CHECK-DAG: %[[mask:.*]] = fir.convert %[[arg3]] : (!fir.box>>) -> !fir.box ! CHECK-DAG: %[[back:.*]] = fir.convert %[[b]] : (!fir.logical<4>) -> i1 res = findloc(a, v, dim=dim, mask=mask, kind=8, back=back) - ! CHECK: %{{.*}} = fir.call @_FortranAFindlocDim(%[[res]], %[[arr]], %[[val]], %[[kind]], %[[dim]], %{{.*}}, %{{.*}}, %[[mask]], %[[back]]) fastmath : (!fir.ref>, !fir.box, !fir.box, i32, i32, !fir.ref, i32, !fir.box, i1) -> none + ! CHECK: fir.call @_FortranAFindlocDim(%[[res]], %[[arr]], %[[val]], %[[kind]], %[[dim]], %{{.*}}, %{{.*}}, %[[mask]], %[[back]]) fastmath : (!fir.ref>, !fir.box, !fir.box, i32, i32, !fir.ref, i32, !fir.box, i1) -> () ! CHECK: %[[box:.*]] = fir.load %[[r]] : !fir.ref>>> ! CHECK: %[[addr:.*]] = fir.box_addr %[[box]] : (!fir.box>>) -> !fir.heap> ! CHECK: fir.freemem %[[addr]] : !fir.heap> diff --git a/flang/test/Lower/Intrinsics/free.f90 b/flang/test/Lower/Intrinsics/free.f90 index bb8d38e737aa7..1bfe48f550754 100644 --- a/flang/test/Lower/Intrinsics/free.f90 +++ b/flang/test/Lower/Intrinsics/free.f90 @@ -10,7 +10,7 @@ subroutine free_ptr() ! CHECK: %[[X_PTR_DECL:.*]]:2 = hlfir.declare %[[X_PTR]] {uniq_name = "_QFfree_ptrEptr_x"} : (!fir.ref) -> (!fir.ref, !fir.ref) ! CHECK: %[[X_DECL:.*]]:2 = hlfir.declare %[[X]] {fortran_attrs = #fir.var_attrs, uniq_name = "_QFfree_ptrEx"} : (!fir.ref>>) -> (!fir.ref>>, !fir.ref>>) ! CHECK: %[[X_LD:.*]] = fir.load %[[X_PTR_DECL]]#0 : !fir.ref - ! CHECK: %[[VOID:.*]] = fir.call @_FortranAFree(%[[X_LD]]) fastmath : (i64) -> none + ! 
CHECK: fir.call @_FortranAFree(%[[X_LD]]) fastmath : (i64) -> () ! CHECK: return call free(ptr_x) end subroutine @@ -24,7 +24,7 @@ subroutine free_i8 ! CHECK: %[[X_DECL:.*]]:2 = hlfir.declare %[[X]] {uniq_name = "_QFfree_i8Ex"} : (!fir.ref) -> (!fir.ref, !fir.ref) ! CHECK: %[[X_LD:.*]] = fir.load %[[X_DECL]]#0 : !fir.ref ! CHECK: %[[X_I64:.*]] = fir.convert %[[X_LD]] : (i8) -> i64 - ! CHECK: %[[VOID:.*]] = fir.call @_FortranAFree(%[[X_I64]]) fastmath : (i64) -> none + ! CHECK: fir.call @_FortranAFree(%[[X_I64]]) fastmath : (i64) -> () ! CHECK: return call free(x) end subroutine @@ -37,7 +37,7 @@ subroutine free_i16 ! CHECK: %[[X_DECL:.*]]:2 = hlfir.declare %[[X]] {uniq_name = "_QFfree_i16Ex"} : (!fir.ref) -> (!fir.ref, !fir.ref) ! CHECK: %[[X_LD:.*]] = fir.load %[[X_DECL]]#0 : !fir.ref ! CHECK: %[[X_I64:.*]] = fir.convert %[[X_LD]] : (i16) -> i64 - ! CHECK: %[[VOID:.*]] = fir.call @_FortranAFree(%[[X_I64]]) fastmath : (i64) -> none + ! CHECK: fir.call @_FortranAFree(%[[X_I64]]) fastmath : (i64) -> () ! CHECK: return call free(x) end subroutine @@ -49,7 +49,7 @@ subroutine free_i32 ! CHECK: %[[X_DECL:.*]]:2 = hlfir.declare %[[X]] {uniq_name = "_QFfree_i32Ex"} : (!fir.ref) -> (!fir.ref, !fir.ref) ! CHECK: %[[X_LD:.*]] = fir.load %[[X_DECL]]#0 : !fir.ref ! CHECK: %[[X_I64:.*]] = fir.convert %[[X_LD]] : (i32) -> i64 - ! CHECK: %[[VOID:.*]] = fir.call @_FortranAFree(%[[X_I64]]) fastmath : (i64) -> none + ! CHECK: fir.call @_FortranAFree(%[[X_I64]]) fastmath : (i64) -> () ! CHECK: return call free(x) end subroutine @@ -60,7 +60,7 @@ subroutine free_i64 ! CHECK: %[[X:.*]] = fir.alloca i64 {bindc_name = "x", uniq_name = "_QFfree_i64Ex"} ! CHECK: %[[X_DECL:.*]]:2 = hlfir.declare %[[X]] {uniq_name = "_QFfree_i64Ex"} : (!fir.ref) -> (!fir.ref, !fir.ref) ! CHECK: %[[X_LD:.*]] = fir.load %[[X_DECL]]#0 : !fir.ref - ! CHECK: %[[VOID:.*]] = fir.call @_FortranAFree(%[[X_LD]]) fastmath : (i64) -> none + ! CHECK: fir.call @_FortranAFree(%[[X_LD]]) fastmath : (i64) -> () ! 
CHECK: return call free(x) end subroutine diff --git a/flang/test/Lower/Intrinsics/iall.f90 b/flang/test/Lower/Intrinsics/iall.f90 index 119cb90a52da2..a1d320ea8b1a2 100644 --- a/flang/test/Lower/Intrinsics/iall.f90 +++ b/flang/test/Lower/Intrinsics/iall.f90 @@ -77,7 +77,7 @@ subroutine iall_test2(a,r) ! CHECK-DAG: %[[a7:.*]] = fir.convert %[[arg0]] : (!fir.box>) -> !fir.box ! CHECK-DAG: %[[a9:.*]] = fir.convert %[[a1]] : (!fir.box) -> !fir.box r = iall(a,dim=2) -! CHECK: %{{.*}} = fir.call @_FortranAIAllDim(%[[a6]], %[[a7]], %[[c2_i32]], %{{.*}}, %{{.*}}, %[[a9]]) {{.*}}: (!fir.ref>, !fir.box, i32, !fir.ref, i32, !fir.box) -> none +! CHECK: fir.call @_FortranAIAllDim(%[[a6]], %[[a7]], %[[c2_i32]], %{{.*}}, %{{.*}}, %[[a9]]) {{.*}}: (!fir.ref>, !fir.box, i32, !fir.ref, i32, !fir.box) -> () ! CHECK-DAG: %[[a11:.*]] = fir.load %[[a0]] : !fir.ref>>> ! CHECK-DAG: %[[a13:.*]] = fir.box_addr %[[a11]] : (!fir.box>>) -> !fir.heap> ! CHECK-DAG: fir.freemem %[[a13]] diff --git a/flang/test/Lower/Intrinsics/iand.f90 b/flang/test/Lower/Intrinsics/iand.f90 index a6e4fbcdfe9f4..0954948a62af4 100644 --- a/flang/test/Lower/Intrinsics/iand.f90 +++ b/flang/test/Lower/Intrinsics/iand.f90 @@ -74,6 +74,6 @@ subroutine iand_test6(s1, s2) ! CHECK-DAG: %[[S2_VAL:.*]] = fir.load %[[S2]] : !fir.ref stop iand(s1,s2) ! CHECK-DAG: %[[ANDI:.*]] = arith.andi %[[S1_VAL]], %[[S2_VAL]] : i32 -! CHECK: fir.call @_FortranAStopStatement(%[[ANDI]], {{.*}}, {{.*}}) {{.*}}: (i32, i1, i1) -> none +! CHECK: fir.call @_FortranAStopStatement(%[[ANDI]], {{.*}}, {{.*}}) {{.*}}: (i32, i1, i1) -> () ! CHECK-NEXT: fir.unreachable end subroutine iand_test6 diff --git a/flang/test/Lower/Intrinsics/iany.f90 b/flang/test/Lower/Intrinsics/iany.f90 index 1f33a7a5c5cfb..3b9036bb670fe 100644 --- a/flang/test/Lower/Intrinsics/iany.f90 +++ b/flang/test/Lower/Intrinsics/iany.f90 @@ -77,7 +77,7 @@ subroutine iany_test2(a,r) ! CHECK-DAG: %[[a7:.*]] = fir.convert %[[arg0]] : (!fir.box>) -> !fir.box ! 
CHECK-DAG: %[[a9:.*]] = fir.convert %[[a1]] : (!fir.box) -> !fir.box r = iany(a,dim=2) -! CHECK: %{{.*}} = fir.call @_FortranAIAnyDim(%[[a6]], %[[a7]], %[[c2_i32]], %{{.*}}, %{{.*}}, %[[a9]]) {{.*}}: (!fir.ref>, !fir.box, i32, !fir.ref, i32, !fir.box) -> none +! CHECK: fir.call @_FortranAIAnyDim(%[[a6]], %[[a7]], %[[c2_i32]], %{{.*}}, %{{.*}}, %[[a9]]) {{.*}}: (!fir.ref>, !fir.box, i32, !fir.ref, i32, !fir.box) -> () ! CHECK-DAG: %[[a11:.*]] = fir.load %[[a0]] : !fir.ref>>> ! CHECK-DAG: %[[a13:.*]] = fir.box_addr %[[a11]] : (!fir.box>>) -> !fir.heap> ! CHECK-DAG: fir.freemem %[[a13]] diff --git a/flang/test/Lower/Intrinsics/index.f90 b/flang/test/Lower/Intrinsics/index.f90 index f1204458f7a40..0ec8cfad83adf 100644 --- a/flang/test/Lower/Intrinsics/index.f90 +++ b/flang/test/Lower/Intrinsics/index.f90 @@ -31,7 +31,7 @@ integer function index_test2(s1, s2) ! CHECK: %[[a2:.*]] = fir.convert %[[ssb]] : (!fir.box>) -> !fir.box ! CHECK: %[[a3:.*]] = fir.convert %[[back]] : (!fir.box>) -> !fir.box ! CHECK: %[[a5:.*]] = fir.convert %{{.*}} : (!fir.ref>) -> !fir.ref - ! CHECK: fir.call @_FortranAIndex(%[[a0]], %[[a1]], %[[a2]], %[[a3]], %{{.*}}, %[[a5]], %{{.*}}) {{.*}}: (!fir.ref>, !fir.box, !fir.box, !fir.box, i32, !fir.ref, i32) -> none + ! CHECK: fir.call @_FortranAIndex(%[[a0]], %[[a1]], %[[a2]], %[[a3]], %{{.*}}, %[[a5]], %{{.*}}) {{.*}}: (!fir.ref>, !fir.box, !fir.box, !fir.box, i32, !fir.ref, i32) -> () index_test2 = index(s1, s2, .true., 4) ! CHECK: %[[ld1:.*]] = fir.load %[[mut]] : !fir.ref>> ! CHECK: %[[ad1:.*]] = fir.box_addr %[[ld1]] : (!fir.box>) -> !fir.heap diff --git a/flang/test/Lower/Intrinsics/iparity.f90 b/flang/test/Lower/Intrinsics/iparity.f90 index 46b9ca5fc86fa..fab2b07e859c0 100644 --- a/flang/test/Lower/Intrinsics/iparity.f90 +++ b/flang/test/Lower/Intrinsics/iparity.f90 @@ -77,7 +77,7 @@ subroutine iparity_test2(a,r) ! CHECK-DAG: %[[a7:.*]] = fir.convert %[[arg0]] : (!fir.box>) -> !fir.box ! 
CHECK-DAG: %[[a9:.*]] = fir.convert %[[a1]] : (!fir.box) -> !fir.box r = iparity(a,dim=2) -! CHECK: %{{.*}} = fir.call @_FortranAIParityDim(%[[a6]], %[[a7]], %[[c2_i32]], %{{.*}}, %{{.*}}, %[[a9]]) {{.*}}: (!fir.ref>, !fir.box, i32, !fir.ref, i32, !fir.box) -> none +! CHECK: fir.call @_FortranAIParityDim(%[[a6]], %[[a7]], %[[c2_i32]], %{{.*}}, %{{.*}}, %[[a9]]) {{.*}}: (!fir.ref>, !fir.box, i32, !fir.ref, i32, !fir.box) -> () ! CHECK-DAG: %[[a11:.*]] = fir.load %[[a0]] : !fir.ref>>> ! CHECK-DAG: %[[a13:.*]] = fir.box_addr %[[a11]] : (!fir.box>>) -> !fir.heap> ! CHECK-DAG: fir.freemem %[[a13]] diff --git a/flang/test/Lower/Intrinsics/matmul.f90 b/flang/test/Lower/Intrinsics/matmul.f90 index db60963320144..77e7fa213717e 100644 --- a/flang/test/Lower/Intrinsics/matmul.f90 +++ b/flang/test/Lower/Intrinsics/matmul.f90 @@ -23,7 +23,7 @@ ! CHECK: %[[RESULT_BOX_ADDR_RUNTIME:.*]] = fir.convert %[[RESULT_BOX_ADDR]] : (!fir.ref>>>) -> !fir.ref> ! CHECK: %[[X_BOX_RUNTIME:.*]] = fir.convert %[[X_BOX]] : (!fir.box>) -> !fir.box ! CHECK: %[[Y_BOX_RUNTIME:.*]] = fir.convert %[[Y_BOX]] : (!fir.box>) -> !fir.box -! CHECK: {{.*}}fir.call @_FortranAMatmulReal4Real4(%[[RESULT_BOX_ADDR_RUNTIME]], %[[X_BOX_RUNTIME]], %[[Y_BOX_RUNTIME]], {{.*}}, {{.*}} {{.*}}: (!fir.ref>, !fir.box, !fir.box, !fir.ref, i32) -> none +! CHECK: {{.*}}fir.call @_FortranAMatmulReal4Real4(%[[RESULT_BOX_ADDR_RUNTIME]], %[[X_BOX_RUNTIME]], %[[Y_BOX_RUNTIME]], {{.*}}, {{.*}} {{.*}}: (!fir.ref>, !fir.box, !fir.box, !fir.ref, i32) -> () ! CHECK: %[[RESULT_BOX:.*]] = fir.load %[[RESULT_BOX_ADDR]] : !fir.ref>>> ! CHECK: %[[RESULT_TMP:.*]] = fir.box_addr %[[RESULT_BOX]] : (!fir.box>>) -> !fir.heap> ! 
CHECK: %[[Z_COPY_FROM_RESULT:.*]] = fir.do_loop @@ -50,7 +50,7 @@ subroutine matmul_test(x,y,z) !CHECK: %[[RESULT_BOX_RUNTIME:.*]] = fir.convert %[[RESULT_BOX_ADDR]] : (!fir.ref>>>>) -> !fir.ref> !CHECK: %[[X_BOX_RUNTIME:.*]] = fir.convert %[[X_BOX]] : (!fir.box>>) -> !fir.box !CHECK: %[[Y_BOX_RUNTIME:.*]] = fir.convert %[[Y_BOX]] : (!fir.box>>) -> !fir.box -!CHECK: {{.*}}fir.call @_FortranAMatmulLogical4Logical4(%[[RESULT_BOX_RUNTIME]], %[[X_BOX_RUNTIME]], %[[Y_BOX_RUNTIME]], {{.*}}, {{.*}}) {{.*}}: (!fir.ref>, !fir.box, !fir.box, !fir.ref, i32) -> none +!CHECK: {{.*}}fir.call @_FortranAMatmulLogical4Logical4(%[[RESULT_BOX_RUNTIME]], %[[X_BOX_RUNTIME]], %[[Y_BOX_RUNTIME]], {{.*}}, {{.*}}) {{.*}}: (!fir.ref>, !fir.box, !fir.box, !fir.ref, i32) -> () !CHECK: %[[RESULT_BOX:.*]] = fir.load %[[RESULT_BOX_ADDR]] : !fir.ref>>>> !CHECK: %[[RESULT_TMP:.*]] = fir.box_addr %[[RESULT_BOX]] : (!fir.box>>>) -> !fir.heap>> !CHECK: %[[Z_COPY_FROM_RESULT:.*]] = fir.do_loop diff --git a/flang/test/Lower/Intrinsics/maxloc.f90 b/flang/test/Lower/Intrinsics/maxloc.f90 index e299e5ab63e64..87f17881e0476 100644 --- a/flang/test/Lower/Intrinsics/maxloc.f90 +++ b/flang/test/Lower/Intrinsics/maxloc.f90 @@ -13,7 +13,7 @@ subroutine maxloc_test(arr,res) ! CHECK-DAG: %[[a8:.*]] = fir.convert %[[c4]] : (index) -> i32 ! CHECK-DAG: %[[a10:.*]] = fir.convert %[[a1]] : (!fir.box) -> !fir.box res = maxloc(arr) - ! CHECK: %{{.*}} = fir.call @_FortranAMaxlocInteger4(%[[a6]], %[[a7]], %[[a8]], %{{.*}}, %{{.*}}, %[[a10]], %false) {{.*}}: (!fir.ref>, !fir.box, i32, !fir.ref, i32, !fir.box, i1) -> none + ! CHECK: fir.call @_FortranAMaxlocInteger4(%[[a6]], %[[a7]], %[[a8]], %{{.*}}, %{{.*}}, %[[a10]], %false) {{.*}}: (!fir.ref>, !fir.box, i32, !fir.ref, i32, !fir.box, i1) -> () ! CHECK-DAG: %[[a12:.*]] = fir.load %[[a0]] : !fir.ref>>> ! CHECK-DAG: %[[a14:.*]] = fir.box_addr %[[a12]] : (!fir.box>>) -> !fir.heap> ! CHECK-DAG: fir.freemem %[[a14]] @@ -34,7 +34,7 @@ subroutine maxloc_test2(arr,res,d) ! 
CHECK-DAG: %[[a8:.*]] = fir.convert %[[c4]] : (index) -> i32 ! CHECK-DAG: %[[a10:.*]] = fir.convert %[[a2]] : (!fir.box) -> !fir.box res = maxloc(arr, dim=d) - ! CHECK: %{{.*}} = fir.call @_FortranAMaxlocDim(%[[a6]], %[[a7]], %[[a8]], %[[a1]], %{{.*}}, %{{.*}}, %[[a10]], %false) {{.*}}: (!fir.ref>, !fir.box, i32, i32, !fir.ref, i32, !fir.box, i1) -> none + ! CHECK: fir.call @_FortranAMaxlocDim(%[[a6]], %[[a7]], %[[a8]], %[[a1]], %{{.*}}, %{{.*}}, %[[a10]], %false) {{.*}}: (!fir.ref>, !fir.box, i32, i32, !fir.ref, i32, !fir.box, i1) -> () ! CHECK: %[[a12:.*]] = fir.load %0 : !fir.ref>> ! CHECK: %[[a13:.*]] = fir.box_addr %[[a12]] : (!fir.box>) -> !fir.heap ! CHECK: fir.freemem %[[a13]] @@ -63,7 +63,7 @@ subroutine test_maxloc_optional_scalar_mask(mask, back, array) ! CHECK: } ! CHECK: %[[VAL_29:.*]] = fir.convert %[[VAL_12]] : (!fir.box>) -> !fir.box ! CHECK: %[[VAL_30:.*]] = fir.convert %[[VAL_14]] : (!fir.logical<4>) -> i1 - ! CHECK: fir.call @_FortranAMaxlocInteger4(%{{.*}}, %{{.*}}, %{{.*}}, %{{.*}}, %{{.*}}, %[[VAL_29]], %[[VAL_30]]) {{.*}}: (!fir.ref>, !fir.box, i32, !fir.ref, i32, !fir.box, i1) -> none + ! CHECK: fir.call @_FortranAMaxlocInteger4(%{{.*}}, %{{.*}}, %{{.*}}, %{{.*}}, %{{.*}}, %[[VAL_29]], %[[VAL_30]]) {{.*}}: (!fir.ref>, !fir.box, i32, !fir.ref, i32, !fir.box, i1) -> () end subroutine ! CHECK-LABEL: func @_QPtest_maxloc_optional_array_mask( @@ -85,5 +85,5 @@ subroutine test_maxloc_optional_array_mask(mask, back, array) ! CHECK: } ! CHECK: %[[VAL_25:.*]] = fir.convert %[[VAL_0]] : (!fir.box>>) -> !fir.box ! CHECK: %[[VAL_26:.*]] = fir.convert %[[VAL_10]] : (!fir.logical<4>) -> i1 - ! CHECK: fir.call @_FortranAMaxlocInteger4(%{{.*}}, %{{.*}}, %{{.*}}, %{{.*}}, %{{.*}}, %[[VAL_25]], %[[VAL_26]]) {{.*}}: (!fir.ref>, !fir.box, i32, !fir.ref, i32, !fir.box, i1) -> none + ! 
CHECK: fir.call @_FortranAMaxlocInteger4(%{{.*}}, %{{.*}}, %{{.*}}, %{{.*}}, %{{.*}}, %[[VAL_25]], %[[VAL_26]]) {{.*}}: (!fir.ref>, !fir.box, i32, !fir.ref, i32, !fir.box, i1) -> () end subroutine diff --git a/flang/test/Lower/Intrinsics/maxval.f90 b/flang/test/Lower/Intrinsics/maxval.f90 index 9e8b6e04cd684..92c868bd1fd01 100644 --- a/flang/test/Lower/Intrinsics/maxval.f90 +++ b/flang/test/Lower/Intrinsics/maxval.f90 @@ -23,7 +23,7 @@ integer function maxval_test(a) ! CHECK: %[[a6:.*]] = fir.convert %[[arg2]] : (!fir.box>>) -> !fir.box ! CHECK-DAG: %[[a8:.*]] = fir.convert %[[a1]] : (!fir.box) -> !fir.box maxval_test2 = maxval(a) -! CHECK: %{{.*}} = fir.call @_FortranAMaxvalCharacter(%[[a5]], %[[a6]], %{{.*}}, %{{.*}}, %[[a8]]) {{.*}}: (!fir.ref>, !fir.box, !fir.ref, i32, !fir.box) -> none +! CHECK: fir.call @_FortranAMaxvalCharacter(%[[a5]], %[[a6]], %{{.*}}, %{{.*}}, %[[a8]]) {{.*}}: (!fir.ref>, !fir.box, !fir.ref, i32, !fir.box) -> () end function ! CHECK-LABEL: func @_QPmaxval_test3( @@ -38,7 +38,7 @@ subroutine maxval_test3(a,r) ! CHECK: %[[a7:.*]] = fir.convert %[[arg0]] : (!fir.box>) -> !fir.box ! CHECK-DAG: %[[a9:.*]] = fir.convert %[[a1]] : (!fir.box) -> !fir.box r = maxval(a,dim=2) -! CHECK: %{{.*}} = fir.call @_FortranAMaxvalDim(%[[a6]], %[[a7]], %[[c2_i32]], %{{.*}}, %{{.*}}, %[[a9]]) {{.*}}: (!fir.ref>, !fir.box, i32, !fir.ref, i32, !fir.box) -> none +! CHECK: fir.call @_FortranAMaxvalDim(%[[a6]], %[[a7]], %[[c2_i32]], %{{.*}}, %{{.*}}, %[[a9]]) {{.*}}: (!fir.ref>, !fir.box, i32, !fir.ref, i32, !fir.box) -> () ! CHECK: %[[a11:.*]] = fir.load %[[a0]] : !fir.ref>>> ! CHECK-DAG: %[[a13:.*]] = fir.box_addr %[[a11]] : (!fir.box>>) -> !fir.heap> ! CHECK-DAG: fir.freemem %[[a13]] diff --git a/flang/test/Lower/Intrinsics/minloc.f90 b/flang/test/Lower/Intrinsics/minloc.f90 index 2a361cc94639f..caab36d0f8138 100644 --- a/flang/test/Lower/Intrinsics/minloc.f90 +++ b/flang/test/Lower/Intrinsics/minloc.f90 @@ -13,7 +13,7 @@ subroutine minloc_test(arr,res) ! 
CHECK-DAG: %[[a8:.*]] = fir.convert %[[c4]] : (index) -> i32 ! CHECK-DAG: %[[a10:.*]] = fir.convert %[[a1]] : (!fir.box) -> !fir.box res = minloc(arr) - ! CHECK: %{{.*}} = fir.call @_FortranAMinlocInteger4(%[[a6]], %[[a7]], %[[a8]], %{{.*}}, %{{.*}}, %[[a10]], %false) {{.*}}: (!fir.ref>, !fir.box, i32, !fir.ref, i32, !fir.box, i1) -> none + ! CHECK: fir.call @_FortranAMinlocInteger4(%[[a6]], %[[a7]], %[[a8]], %{{.*}}, %{{.*}}, %[[a10]], %false) {{.*}}: (!fir.ref>, !fir.box, i32, !fir.ref, i32, !fir.box, i1) -> () ! CHECK-DAG: %[[a12:.*]] = fir.load %[[a0]] : !fir.ref>>> ! CHECK-DAG: %[[a14:.*]] = fir.box_addr %[[a12]] : (!fir.box>>) -> !fir.heap> ! CHECK-DAG: fir.freemem %[[a14]] @@ -34,7 +34,7 @@ subroutine minloc_test2(arr,res,d) ! CHECK-DAG: %[[a8:.*]] = fir.convert %[[c4]] : (index) -> i32 ! CHECK-DAG: %[[a10:.*]] = fir.convert %[[a2]] : (!fir.box) -> !fir.box res = minloc(arr, dim=d) - ! CHECK: %{{.*}} = fir.call @_FortranAMinlocDim(%[[a6]], %[[a7]], %[[a8]], %[[a1]], %{{.*}}, %{{.*}}, %[[a10]], %false) {{.*}}: (!fir.ref>, !fir.box, i32, i32, !fir.ref, i32, !fir.box, i1) -> none + ! CHECK: fir.call @_FortranAMinlocDim(%[[a6]], %[[a7]], %[[a8]], %[[a1]], %{{.*}}, %{{.*}}, %[[a10]], %false) {{.*}}: (!fir.ref>, !fir.box, i32, i32, !fir.ref, i32, !fir.box, i1) -> () ! CHECK: %[[a12:.*]] = fir.load %0 : !fir.ref>> ! CHECK: %[[a13:.*]] = fir.box_addr %[[a12]] : (!fir.box>) -> !fir.heap ! CHECK: fir.freemem %[[a13]] @@ -63,7 +63,7 @@ subroutine test_minloc_optional_scalar_mask(mask, back, array) ! CHECK: } ! CHECK: %[[VAL_29:.*]] = fir.convert %[[VAL_12]] : (!fir.box>) -> !fir.box ! CHECK: %[[VAL_30:.*]] = fir.convert %[[VAL_14]] : (!fir.logical<4>) -> i1 - ! CHECK: fir.call @_FortranAMinlocInteger4(%{{.*}}, %{{.*}}, %{{.*}}, %{{.*}}, %{{.*}}, %[[VAL_29]], %[[VAL_30]]) {{.*}}: (!fir.ref>, !fir.box, i32, !fir.ref, i32, !fir.box, i1) -> none + ! 
CHECK: fir.call @_FortranAMinlocInteger4(%{{.*}}, %{{.*}}, %{{.*}}, %{{.*}}, %{{.*}}, %[[VAL_29]], %[[VAL_30]]) {{.*}}: (!fir.ref>, !fir.box, i32, !fir.ref, i32, !fir.box, i1) -> () end subroutine ! CHECK-LABEL: func @_QPtest_minloc_optional_array_mask( @@ -85,5 +85,5 @@ subroutine test_minloc_optional_array_mask(mask, back, array) ! CHECK: } ! CHECK: %[[VAL_25:.*]] = fir.convert %[[VAL_0]] : (!fir.box>>) -> !fir.box ! CHECK: %[[VAL_26:.*]] = fir.convert %[[VAL_10]] : (!fir.logical<4>) -> i1 - ! CHECK: fir.call @_FortranAMinlocInteger4(%{{.*}}, %{{.*}}, %{{.*}}, %{{.*}}, %{{.*}}, %[[VAL_25]], %[[VAL_26]]) {{.*}}: (!fir.ref>, !fir.box, i32, !fir.ref, i32, !fir.box, i1) -> none + ! CHECK: fir.call @_FortranAMinlocInteger4(%{{.*}}, %{{.*}}, %{{.*}}, %{{.*}}, %{{.*}}, %[[VAL_25]], %[[VAL_26]]) {{.*}}: (!fir.ref>, !fir.box, i32, !fir.ref, i32, !fir.box, i1) -> () end subroutine diff --git a/flang/test/Lower/Intrinsics/minval.f90 b/flang/test/Lower/Intrinsics/minval.f90 index cff34a4f1e7e6..59132f1813673 100644 --- a/flang/test/Lower/Intrinsics/minval.f90 +++ b/flang/test/Lower/Intrinsics/minval.f90 @@ -23,7 +23,7 @@ integer function minval_test(a) ! CHECK: %[[a6:.*]] = fir.convert %[[arg2]] : (!fir.box>>) -> !fir.box ! CHECK-DAG: %[[a8:.*]] = fir.convert %[[a1]] : (!fir.box) -> !fir.box minval_test2 = minval(a) -! CHECK: %{{.*}} = fir.call @_FortranAMinvalCharacter(%[[a5]], %[[a6]], %{{.*}}, %{{.*}}, %[[a8]]) {{.*}}: (!fir.ref>, !fir.box, !fir.ref, i32, !fir.box) -> none +! CHECK: fir.call @_FortranAMinvalCharacter(%[[a5]], %[[a6]], %{{.*}}, %{{.*}}, %[[a8]]) {{.*}}: (!fir.ref>, !fir.box, !fir.ref, i32, !fir.box) -> () end function ! CHECK-LABEL: func @_QPminval_test3( @@ -38,7 +38,7 @@ subroutine minval_test3(a,r) ! CHECK: %[[a7:.*]] = fir.convert %[[arg0]] : (!fir.box>) -> !fir.box ! CHECK-DAG: %[[a9:.*]] = fir.convert %[[a1]] : (!fir.box) -> !fir.box r = minval(a,dim=2) -! 
CHECK: %{{.*}} = fir.call @_FortranAMinvalDim(%[[a6]], %[[a7]], %[[c2_i32]], %{{.*}}, %{{.*}}, %[[a9]]) {{.*}}: (!fir.ref>, !fir.box, i32, !fir.ref, i32, !fir.box) -> none +! CHECK: fir.call @_FortranAMinvalDim(%[[a6]], %[[a7]], %[[c2_i32]], %{{.*}}, %{{.*}}, %[[a9]]) {{.*}}: (!fir.ref>, !fir.box, i32, !fir.ref, i32, !fir.box) -> () ! CHECK: %[[a11:.*]] = fir.load %[[a0]] : !fir.ref>>> ! CHECK-DAG: %[[a13:.*]] = fir.box_addr %[[a11]] : (!fir.box>>) -> !fir.heap> ! CHECK-DAG: fir.freemem %[[a13]] diff --git a/flang/test/Lower/Intrinsics/norm2.f90 b/flang/test/Lower/Intrinsics/norm2.f90 index 0d125e36f6650..ac761ae3f5381 100644 --- a/flang/test/Lower/Intrinsics/norm2.f90 +++ b/flang/test/Lower/Intrinsics/norm2.f90 @@ -55,7 +55,7 @@ subroutine norm2_test_dim_2(a,r) ! CHECK-DAG: %[[res:.*]] = fir.convert %[[r]] : (!fir.ref>>>) -> !fir.ref> ! CHECK: %[[arr:.*]] = fir.convert %[[arg0]] : (!fir.box>) -> !fir.box r = norm2(a,dim=1) - ! CHECK: %{{.*}} = fir.call @_FortranANorm2Dim(%[[res]], %[[arr]], %[[dim]], %{{.*}}, %{{.*}}) {{.*}} : (!fir.ref>, !fir.box, i32, !fir.ref, i32) -> none + ! CHECK: fir.call @_FortranANorm2Dim(%[[res]], %[[arr]], %[[dim]], %{{.*}}, %{{.*}}) {{.*}} : (!fir.ref>, !fir.box, i32, !fir.ref, i32) -> () ! CHECK: %[[box:.*]] = fir.load %[[r]] : !fir.ref>>> ! CHECK-DAG: %[[addr:.*]] = fir.box_addr %[[box]] : (!fir.box>>) -> !fir.heap> ! CHECK-DAG: fir.freemem %[[addr]] @@ -71,7 +71,7 @@ subroutine norm2_test_dim_3(a,r) ! CHECK-DAG: %[[res:.*]] = fir.convert %[[r]] : (!fir.ref>>>) -> !fir.ref> ! CHECK: %[[arr:.*]] = fir.convert %[[arg0]] : (!fir.box>) -> !fir.box r = norm2(a,dim=3) - ! CHECK: %{{.*}} = fir.call @_FortranANorm2Dim(%[[res]], %[[arr]], %[[dim]], %{{.*}}, %{{.*}}) {{.*}} : (!fir.ref>, !fir.box, i32, !fir.ref, i32) -> none + ! CHECK: fir.call @_FortranANorm2Dim(%[[res]], %[[arr]], %[[dim]], %{{.*}}, %{{.*}}) {{.*}} : (!fir.ref>, !fir.box, i32, !fir.ref, i32) -> () ! CHECK: %[[box:.*]] = fir.load %[[r]] : !fir.ref>>> ! 
CHECK-DAG: %[[addr:.*]] = fir.box_addr %[[box]] : (!fir.box>>) -> !fir.heap> ! CHECK-DAG: fir.freemem %[[addr]] @@ -87,7 +87,7 @@ subroutine norm2_test_real16(a,r) ! CHECK-DAG: %[[res:.*]] = fir.convert %[[r]] : (!fir.ref>>>) -> !fir.ref> ! CHECK: %[[arr:.*]] = fir.convert %[[arg0]] : (!fir.box>) -> !fir.box r = norm2(a,dim=3) - ! CHECK: %{{.*}} = fir.call @_FortranANorm2DimReal16(%[[res]], %[[arr]], %[[dim]], %{{.*}}, %{{.*}}) {{.*}} : (!fir.ref>, !fir.box, i32, !fir.ref, i32) -> none + ! CHECK: fir.call @_FortranANorm2DimReal16(%[[res]], %[[arr]], %[[dim]], %{{.*}}, %{{.*}}) {{.*}} : (!fir.ref>, !fir.box, i32, !fir.ref, i32) -> () ! CHECK: %[[box:.*]] = fir.load %[[r]] : !fir.ref>>> ! CHECK-DAG: %[[addr:.*]] = fir.box_addr %[[box]] : (!fir.box>>) -> !fir.heap> ! CHECK-DAG: fir.freemem %[[addr]] diff --git a/flang/test/Lower/Intrinsics/pack.f90 b/flang/test/Lower/Intrinsics/pack.f90 index 37e3170316b4c..a00c10dc2e959 100644 --- a/flang/test/Lower/Intrinsics/pack.f90 +++ b/flang/test/Lower/Intrinsics/pack.f90 @@ -16,7 +16,7 @@ subroutine pack_test(a,m,v,r) ! CHECK: %[[a7:.*]] = fir.convert %[[arg1]] : (!fir.box>>) -> !fir.box ! CHECK: %[[a8:.*]] = fir.convert %[[arg2]] : (!fir.box>) -> !fir.box r = pack(a,m,v) - ! CHECK: %{{.*}} = fir.call @_FortranAPack(%[[a5]], %[[a6]], %[[a7]], %[[a8]], %{{.*}}, %{{.*}}) {{.*}}: (!fir.ref>, !fir.box, !fir.box, !fir.box, !fir.ref, i32) -> none + ! CHECK: fir.call @_FortranAPack(%[[a5]], %[[a6]], %[[a7]], %[[a8]], %{{.*}}, %{{.*}}) {{.*}}: (!fir.ref>, !fir.box, !fir.box, !fir.box, !fir.ref, i32) -> () ! CHECK: %[[a11:.*]] = fir.load %[[a0]] : !fir.ref>>> ! CHECK: %[[a13:.*]] = fir.box_addr %[[a11]] : (!fir.box>>) -> !fir.heap> ! CHECK: fir.freemem %[[a13]] @@ -38,5 +38,5 @@ subroutine test_pack_optional(vector, array, mask) ! CHECK: %[[VAL_15:.*]] = fir.absent !fir.box>> ! CHECK: %[[VAL_16:.*]] = arith.select %[[VAL_13]], %[[VAL_14]], %[[VAL_15]] : !fir.box>> ! 
CHECK: %[[VAL_26:.*]] = fir.convert %[[VAL_16]] : (!fir.box>>) -> !fir.box - ! CHECK: fir.call @_FortranAPack(%{{.*}}, %{{.*}}, %{{.*}}, %[[VAL_26]], %{{.*}}, %{{.*}}) {{.*}}: (!fir.ref>, !fir.box, !fir.box, !fir.box, !fir.ref, i32) -> none + ! CHECK: fir.call @_FortranAPack(%{{.*}}, %{{.*}}, %{{.*}}, %[[VAL_26]], %{{.*}}, %{{.*}}) {{.*}}: (!fir.ref>, !fir.box, !fir.box, !fir.box, !fir.ref, i32) -> () end subroutine diff --git a/flang/test/Lower/Intrinsics/parity.f90 b/flang/test/Lower/Intrinsics/parity.f90 index 6771b7d703275..91b168ee5662d 100644 --- a/flang/test/Lower/Intrinsics/parity.f90 +++ b/flang/test/Lower/Intrinsics/parity.f90 @@ -25,7 +25,7 @@ subroutine parity_test2(mask, d, rslt) ! CHECK-DAG: %[[a6:.*]] = fir.convert %[[a0:.*]] : (!fir.ref>>>>) -> !fir.ref> ! CHECK-DAG: %[[a7:.*]] = fir.convert %[[arg0:.*]]: (!fir.box>>) -> !fir.box rslt = parity(mask, d) - ! CHECK: %[[r1:.*]] = fir.call @_FortranAParityDim(%[[a6:.*]], %[[a7:.*]], %[[a1:.*]], %{{.*}}, %{{.*}}) {{.*}}: (!fir.ref>, !fir.box, i32, !fir.ref, i32) -> none + ! CHECK: fir.call @_FortranAParityDim(%[[a6:.*]], %[[a7:.*]], %[[a1:.*]], %{{.*}}, %{{.*}}) {{.*}}: (!fir.ref>, !fir.box, i32, !fir.ref, i32) -> () ! CHECK-DAG: %[[a10:.*]] = fir.load %[[a0:.*]] : !fir.ref>>>> ! CHECK-DAG: %[[a12:.*]] = fir.box_addr %[[a10:.*]] : (!fir.box>>>) -> !fir.heap>> ! CHECK-DAG fir.freemem %[[a12:.*]] diff --git a/flang/test/Lower/Intrinsics/product.f90 b/flang/test/Lower/Intrinsics/product.f90 index 77b8ab8e7f5a9..b2fc809b15ef3 100644 --- a/flang/test/Lower/Intrinsics/product.f90 +++ b/flang/test/Lower/Intrinsics/product.f90 @@ -25,7 +25,7 @@ subroutine product_test2(a,r) ! CHECK-DAG: %[[a7:.*]] = fir.convert %[[arg0]] : (!fir.box>) -> !fir.box ! CHECK-DAG: %[[a9:.*]] = fir.convert %[[a1]] : (!fir.box) -> !fir.box r = product(a,dim=2) -! CHECK: %{{.*}} = fir.call @_FortranAProductDim(%[[a6]], %[[a7]], %[[c2_i32]], %{{.*}}, %{{.*}}, %[[a9]]) {{.*}}: (!fir.ref>, !fir.box, i32, !fir.ref, i32, !fir.box) -> none +! 
CHECK: fir.call @_FortranAProductDim(%[[a6]], %[[a7]], %[[c2_i32]], %{{.*}}, %{{.*}}, %[[a9]]) {{.*}}: (!fir.ref>, !fir.box, i32, !fir.ref, i32, !fir.box) -> () ! CHECK-DAG: %[[a11:.*]] = fir.load %[[a0]] : !fir.ref>>> ! CHECK-DAG: %[[a13:.*]] = fir.box_addr %[[a11]] : (!fir.box>>) -> !fir.heap> ! CHECK-DAG: fir.freemem %[[a13]] @@ -42,7 +42,7 @@ subroutine product_test2(a,r) ! CHECK-DAG: %[[a8:.*]] = fir.convert %[[c0]] : (index) -> i32 ! CHECK-DAG: %[[a9:.*]] = fir.convert %[[a3]] : (!fir.box) -> !fir.box product_test3 = product(a) -! CHECK: %{{.*}} = fir.call @_FortranACppProductComplex4(%[[a0]], %[[a6]], %{{.*}}, %{{.*}}, %[[a8]], %[[a9]]) {{.*}}: (!fir.ref>, !fir.box, !fir.ref, i32, i32, !fir.box) -> none +! CHECK: fir.call @_FortranACppProductComplex4(%[[a0]], %[[a6]], %{{.*}}, %{{.*}}, %[[a8]], %[[a9]]) {{.*}}: (!fir.ref>, !fir.box, !fir.ref, i32, i32, !fir.box) -> () end function ! CHECK-LABEL: func @_QPproduct_test4( diff --git a/flang/test/Lower/Intrinsics/random.f90 b/flang/test/Lower/Intrinsics/random.f90 index 4fb1a9a5da27a..5f7d6414e606d 100644 --- a/flang/test/Lower/Intrinsics/random.f90 +++ b/flang/test/Lower/Intrinsics/random.f90 @@ -8,14 +8,14 @@ subroutine random_test_1 ! CHECK-DAG: [[rr:%[0-9]+]] = fir.alloca {{.*}}random_test_1Err ! CHECK-DAG: [[aa:%[0-9]+]] = fir.alloca {{.*}}random_test_1Eaa real rr, aa(5) - ! CHECK: fir.call @_FortranARandomInit(%true{{.*}}, %false{{.*}}) {{.*}}: (i1, i1) -> none + ! CHECK: fir.call @_FortranARandomInit(%true{{.*}}, %false{{.*}}) {{.*}}: (i1, i1) -> () call random_init(.true., .false.) ! CHECK: [[box:%[0-9]+]] = fir.embox [[ss]] ! CHECK: [[argbox:%[0-9]+]] = fir.convert [[box]] ! CHECK: fir.call @_FortranARandomSeedSize([[argbox]] call random_seed(size=ss) print*, 'size: ', ss - ! CHECK: fir.call @_FortranARandomSeedDefaultPut() {{.*}}: () -> none + ! CHECK: fir.call @_FortranARandomSeedDefaultPut() {{.*}}: () -> () call random_seed() ! CHECK: [[box:%[0-9]+]] = fir.embox [[rr]] ! 
CHECK: [[argbox:%[0-9]+]] = fir.convert [[box]] diff --git a/flang/test/Lower/Intrinsics/random_number_real16.f90 b/flang/test/Lower/Intrinsics/random_number_real16.f90 index 76fed258d8afc..060574d5b3b3f 100644 --- a/flang/test/Lower/Intrinsics/random_number_real16.f90 +++ b/flang/test/Lower/Intrinsics/random_number_real16.f90 @@ -2,14 +2,14 @@ ! RUN: %flang_fc1 -emit-fir %s -o - | FileCheck %s ! CHECK-LABEL: func @_QPtest_scalar -! CHECK: fir.call @_FortranARandomNumber16({{.*}}){{.*}}: (!fir.box, !fir.ref, i32) -> none +! CHECK: fir.call @_FortranARandomNumber16({{.*}}){{.*}}: (!fir.box, !fir.ref, i32) -> () subroutine test_scalar real(16) :: r call random_number(r) end ! CHECK-LABEL: func @_QPtest_array -! CHECK: fir.call @_FortranARandomNumber16({{.*}}){{.*}}: (!fir.box, !fir.ref, i32) -> none +! CHECK: fir.call @_FortranARandomNumber16({{.*}}){{.*}}: (!fir.box, !fir.ref, i32) -> () subroutine test_array(r) real(16) :: r(:) call random_number(r) diff --git a/flang/test/Lower/Intrinsics/rename.f90 b/flang/test/Lower/Intrinsics/rename.f90 index 75042217c6202..66fab9efae9f6 100644 --- a/flang/test/Lower/Intrinsics/rename.f90 +++ b/flang/test/Lower/Intrinsics/rename.f90 @@ -20,7 +20,7 @@ subroutine test_rename(src, dst) !CHECK-NEXT: %[[src:.*]] = fir.convert %[[srcBox]] : (!fir.box>) -> !fir.box !CHECK-NEXT: %[[dst:.*]] = fir.convert %[[dstBox]] : (!fir.box>) -> !fir.box !CHECK-NEXT: %[[loc:.*]] = fir.convert %[[sourceFileConv:.*]]: (!fir.ref>) -> !fir.ref - !CHECK-NEXT: %[[result:.*]] = fir.call @_FortranARename(%[[src]], %[[dst]], %[[statusBox]], %[[loc]], %[[c10_i32]]) fastmath : (!fir.box, !fir.box, !fir.box, !fir.ref, i32) -> none + !CHECK-NEXT: fir.call @_FortranARename(%[[src]], %[[dst]], %[[statusBox]], %[[loc]], %[[c10_i32]]) fastmath : (!fir.box, !fir.box, !fir.box, !fir.ref, i32) -> () end subroutine test_rename !CHECK-LABEL: func.func @_QPtest_rename_status @@ -47,5 +47,5 @@ subroutine test_rename_status(src, dst) !CHECK-NEXT: %[[dst:.*]] = fir.convert 
%[[dstBox]] : (!fir.box>) -> !fir.box !CHECK-NEXT: %[[status:.*]] = fir.convert %[[statusBox]] : (!fir.box) -> !fir.box !CHECK-NEXT: %[[loc:.*]] = fir.convert %[[sourceFileConv:.*]]: (!fir.ref>) -> !fir.ref - !CHECK-NEXT: %[[result:.*]] = fir.call @_FortranARename(%[[src]], %[[dst]], %[[status]], %[[loc]], %[[c10_i32]]) fastmath : (!fir.box, !fir.box, !fir.box, !fir.ref, i32) -> none + !CHECK-NEXT: fir.call @_FortranARename(%[[src]], %[[dst]], %[[status]], %[[loc]], %[[c10_i32]]) fastmath : (!fir.box, !fir.box, !fir.box, !fir.ref, i32) -> () end subroutine test_rename_status diff --git a/flang/test/Lower/Intrinsics/repeat.f90 b/flang/test/Lower/Intrinsics/repeat.f90 index dd37fbbc8c54c..e95221405b664 100644 --- a/flang/test/Lower/Intrinsics/repeat.f90 +++ b/flang/test/Lower/Intrinsics/repeat.f90 @@ -13,7 +13,7 @@ subroutine repeat_test(c, n) ! CHECK-DAG: %[[cBox:.*]] = fir.embox %[[c]]#0 typeparams %[[c]]#1 : (!fir.ref>, index) -> !fir.box> ! CHECK-DAG: %[[cBoxNone:.*]] = fir.convert %[[cBox]] : (!fir.box>) -> !fir.box ! CHECK-DAG: %[[resBox:.*]] = fir.convert %[[tmpBox]] : (!fir.ref>>>) -> !fir.ref> - ! CHECK: fir.call @{{.*}}Repeat(%[[resBox]], %[[cBoxNone]], %[[n]], {{.*}}, {{.*}}) {{.*}}: (!fir.ref>, !fir.box, i64, !fir.ref, i32) -> none + ! CHECK: fir.call @{{.*}}Repeat(%[[resBox]], %[[cBoxNone]], %[[n]], {{.*}}, {{.*}}) {{.*}}: (!fir.ref>, !fir.box, i64, !fir.ref, i32) -> () ! CHECK-DAG: %[[tmpAddr:.*]] = fir.box_addr ! CHECK-DAG: fir.box_elesize ! CHECK: fir.call @{{.*}}bar_repeat_test diff --git a/flang/test/Lower/Intrinsics/reshape.f90 b/flang/test/Lower/Intrinsics/reshape.f90 index 6fe95963b7acc..4f4f50965dd1b 100644 --- a/flang/test/Lower/Intrinsics/reshape.f90 +++ b/flang/test/Lower/Intrinsics/reshape.f90 @@ -19,7 +19,7 @@ subroutine reshape_test(x, source, pd, sh, ord) ! CHECK-DAG: %[[a11:.*]] = fir.convert %[[arg2]] : (!fir.box>) -> !fir.box ! CHECK-DAG: %[[a12:.*]] = fir.convert %[[a3]] : (!fir.box>) -> !fir.box x = reshape(source, sh, pd, ord) - ! 
CHECK: %{{.*}} = fir.call @_FortranAReshape(%[[a8]], %[[a9]], %[[a10]], %[[a11]], %[[a12]], %{{.*}}, %{{.*}}) {{.*}}: (!fir.ref>, !fir.box, !fir.box, !fir.box, !fir.box, !fir.ref, i32) -> none + ! CHECK: fir.call @_FortranAReshape(%[[a8]], %[[a9]], %[[a10]], %[[a11]], %[[a12]], %{{.*}}, %{{.*}}) {{.*}}: (!fir.ref>, !fir.box, !fir.box, !fir.box, !fir.box, !fir.ref, i32) -> () ! CHECK-DAG: %[[a15:.*]] = fir.load %[[a0]] : !fir.ref>>> ! CHECK-DAG: %[[a18:.*]] = fir.box_addr %[[a15]] : (!fir.box>>) -> !fir.heap> ! CHECK-DAG: fir.freemem %[[a18]] @@ -52,7 +52,7 @@ subroutine test_reshape_optional(pad, order, source, shape) ! CHECK: %[[VAL_28:.*]] = arith.select %[[VAL_25]], %[[VAL_26]], %[[VAL_27]] : !fir.box>> ! CHECK: %[[VAL_38:.*]] = fir.convert %[[VAL_20]] : (!fir.box>>) -> !fir.box ! CHECK: %[[VAL_39:.*]] = fir.convert %[[VAL_28]] : (!fir.box>>) -> !fir.box - ! CHECK: %[[VAL_41:.*]] = fir.call @_FortranAReshape({{.*}}, {{.*}}, %{{.*}}, %[[VAL_38]], %[[VAL_39]], %{{.*}}, %{{.*}}) {{.*}}: (!fir.ref>, !fir.box, !fir.box, !fir.box, !fir.box, !fir.ref, i32) -> none + ! CHECK: fir.call @_FortranAReshape({{.*}}, {{.*}}, %{{.*}}, %[[VAL_38]], %[[VAL_39]], %{{.*}}, %{{.*}}) {{.*}}: (!fir.ref>, !fir.box, !fir.box, !fir.box, !fir.box, !fir.ref, i32) -> () end subroutine ! CHECK-LABEL: func.func @_QPtest_reshape_shape_slice() { @@ -73,7 +73,7 @@ subroutine test_reshape_optional(pad, order, source, shape) ! CHECK: %[[VAL_15:.*]] = fir.embox %[[VAL_1]](%[[VAL_13]]) [%[[VAL_14]]] : (!fir.ref>, !fir.shape<1>, !fir.slice<1>) -> !fir.box> ! CHECK: %[[VAL_25:.*]] = fir.convert %[[VAL_6]] : (!fir.box>) -> !fir.box ! CHECK: %[[VAL_26:.*]] = fir.convert %[[VAL_15]] : (!fir.box>) -> !fir.box -! CHECK: %[[VAL_30:.*]] = fir.call @_FortranAReshape(%{{.*}}, %[[VAL_25]], %[[VAL_26]], %{{.*}}, %{{.*}}, %{{.*}}, %{{.*}}) {{.*}}: (!fir.ref>, !fir.box, !fir.box, !fir.box, !fir.box, !fir.ref, i32) -> none +! 
CHECK: fir.call @_FortranAReshape(%{{.*}}, %[[VAL_25]], %[[VAL_26]], %{{.*}}, %{{.*}}, %{{.*}}, %{{.*}}) {{.*}}: (!fir.ref>, !fir.box, !fir.box, !fir.box, !fir.box, !fir.ref, i32) -> () subroutine test_reshape_shape_slice() integer, parameter :: i = 1 real :: tmp(4) = [1,2,3,4] diff --git a/flang/test/Lower/Intrinsics/scan.f90 b/flang/test/Lower/Intrinsics/scan.f90 index 2dd6933bc46fa..1773a395f1b24 100644 --- a/flang/test/Lower/Intrinsics/scan.f90 +++ b/flang/test/Lower/Intrinsics/scan.f90 @@ -15,7 +15,7 @@ integer function scan_test(s1, s2) ! CHECK-DAG: %[[backBox:.*]] = fir.convert %[[backOptBox]] : (!fir.box) -> !fir.box ! CHECK-DAG: %[[kindConstant:.*]] = arith.constant 4 : i32 ! CHECK-DAG: %[[resBox:.*]] = fir.convert %[[tmpBox:.*]] : (!fir.ref>>) -> !fir.ref> -! CHECK: fir.call @{{.*}}Scan(%[[resBox]], %[[cBoxNone]], %[[cBoxNone2]], %[[backBox]], %[[kindConstant]], {{.*}}) {{.*}}: (!fir.ref>, !fir.box, !fir.box, !fir.box, i32, !fir.ref, i32) -> none +! CHECK: fir.call @{{.*}}Scan(%[[resBox]], %[[cBoxNone]], %[[cBoxNone2]], %[[backBox]], %[[kindConstant]], {{.*}}) {{.*}}: (!fir.ref>, !fir.box, !fir.box, !fir.box, i32, !fir.ref, i32) -> () scan_test = scan(s1, s2, kind=4) ! CHECK-DAG: %[[tmpAddr:.*]] = fir.box_addr ! CHECK: fir.freemem %[[tmpAddr]] diff --git a/flang/test/Lower/Intrinsics/sleep.f90 b/flang/test/Lower/Intrinsics/sleep.f90 index c4a7b381602ca..0b7d11a803650 100644 --- a/flang/test/Lower/Intrinsics/sleep.f90 +++ b/flang/test/Lower/Intrinsics/sleep.f90 @@ -6,22 +6,22 @@ subroutine test_sleep() call sleep(1_2) ! CHECK: %[[VAL_0:.*]] = arith.constant 1 : i16 ! CHECK: %[[VAL_1:.*]] = fir.convert %[[VAL_0]] : (i16) -> i64 -! CHECK: %[[VAL_2:.*]] = fir.call @_FortranASleep(%[[VAL_1]]) fastmath : (i64) -> none +! CHECK: fir.call @_FortranASleep(%[[VAL_1]]) fastmath : (i64) -> () call sleep(1_4) ! CHECK: %[[VAL_3:.*]] = arith.constant 1 : i32 ! CHECK: %[[VAL_4:.*]] = fir.convert %[[VAL_3]] : (i32) -> i64 -! 
CHECK: %[[VAL_5:.*]] = fir.call @_FortranASleep(%[[VAL_4]]) fastmath : (i64) -> none +! CHECK: fir.call @_FortranASleep(%[[VAL_4]]) fastmath : (i64) -> () call sleep(1_8) ! CHECK: %[[VAL_6:.*]] = arith.constant 1 : i64 ! CHECK: %[[VAL_7:.*]] = fir.convert %[[VAL_6]] : (i64) -> i64 -! CHECK: %[[VAL_8:.*]] = fir.call @_FortranASleep(%[[VAL_7]]) fastmath : (i64) -> none +! CHECK: fir.call @_FortranASleep(%[[VAL_7]]) fastmath : (i64) -> () call sleep(1_16) ! CHECK: %[[VAL_9:.*]] = arith.constant 1 : i128 ! CHECK: %[[VAL_10:.*]] = fir.convert %[[VAL_9]] : (i128) -> i64 -! CHECK: %[[VAL_11:.*]] = fir.call @_FortranASleep(%[[VAL_10]]) fastmath : (i64) -> none +! CHECK: fir.call @_FortranASleep(%[[VAL_10]]) fastmath : (i64) -> () end ! CHECK: return ! CHECK: } diff --git a/flang/test/Lower/Intrinsics/spread.f90 b/flang/test/Lower/Intrinsics/spread.f90 index d58725aba6987..3c20ec29ebc11 100644 --- a/flang/test/Lower/Intrinsics/spread.f90 +++ b/flang/test/Lower/Intrinsics/spread.f90 @@ -25,7 +25,7 @@ subroutine spread_test(s,d,n,r) ! CHECK-DAG: %[[a9:.*]] = fir.convert %[[a3]] : (!fir.box) -> !fir.box ! CHECK-DAG: %[[a10:.*]] = fir.convert %[[a2]] : (i32) -> i64 r = spread(s,d,n) - ! CHECK: %{{.*}} = fir.call @_FortranASpread(%[[a8]], %[[a9]], %[[a1]], %[[a10]], %{{.*}}, %{{.*}}) {{.*}}: (!fir.ref>, !fir.box, i32, i64, !fir.ref, i32) -> none + ! CHECK: fir.call @_FortranASpread(%[[a8]], %[[a9]], %[[a1]], %[[a10]], %{{.*}}, %{{.*}}) {{.*}}: (!fir.ref>, !fir.box, i32, i64, !fir.ref, i32) -> () ! CHECK-DAG: %[[a13:.*]] = fir.load %[[a0]] : !fir.ref>>> ! CHECK-DAG: %[[a15:.*]] = fir.box_addr %[[a13]] : (!fir.box>>) -> !fir.heap> ! CHECK: fir.freemem %[[a15]] @@ -43,7 +43,7 @@ subroutine spread_test2(s,d,n,r) ! CHECK-DAG: %[[a8:.*]] = fir.convert %[[arg0]] : (!fir.box>) -> !fir.box ! CHECK-DAG: %[[a9:.*]] = fir.convert %[[a2]] : (i32) -> i64 r = spread(s,d,n) - ! 
CHECK: %{{.*}} = fir.call @_FortranASpread(%[[a7]], %[[a8]], %[[a1]], %[[a9]], %{{.*}}, %{{.*}}) {{.*}}: (!fir.ref>, !fir.box, i32, i64, !fir.ref, i32) -> none + ! CHECK: fir.call @_FortranASpread(%[[a7]], %[[a8]], %[[a1]], %[[a9]], %{{.*}}, %{{.*}}) {{.*}}: (!fir.ref>, !fir.box, i32, i64, !fir.ref, i32) -> () ! CHECK-DAG: %[[a12:.*]] = fir.load %[[a0]] : !fir.ref>>> ! CHECK-DAG: %[[a15:.*]] = fir.box_addr %[[a12]] : (!fir.box>>) -> !fir.heap> ! CHECK: fir.freemem %[[a15:.*]] @@ -62,7 +62,7 @@ subroutine spread_test_polymorphic_source(p) ! CHECK: fir.store %[[embox]] to %[[res]] : !fir.ref>>> ! CHECK: %[[res_box_none:.*]] = fir.convert %[[res]] : (!fir.ref>>>) -> !fir.ref> ! CHECK: %[[source_box_none:.*]] = fir.convert %[[source_box]] : (!fir.class>) -> !fir.box -! CHECK: %{{.*}} = fir.call @_FortranASpread(%[[res_box_none]], %[[source_box_none]], %{{.*}}, %{{.*}}, %{{.*}}, %{{.*}}) fastmath : (!fir.ref>, !fir.box, i32, i64, !fir.ref, i32) -> none +! CHECK: fir.call @_FortranASpread(%[[res_box_none]], %[[source_box_none]], %{{.*}}, %{{.*}}, %{{.*}}, %{{.*}}) fastmath : (!fir.ref>, !fir.box, i32, i64, !fir.ref, i32) -> () end subroutine diff --git a/flang/test/Lower/Intrinsics/storage_size.f90 b/flang/test/Lower/Intrinsics/storage_size.f90 index b0c9d51f95328..3dc135bbf6fbc 100644 --- a/flang/test/Lower/Intrinsics/storage_size.f90 +++ b/flang/test/Lower/Intrinsics/storage_size.f90 @@ -29,7 +29,7 @@ integer function unlimited_polymorphic_pointer(p) result(size) ! CHECK: %[[C0:.*]] = arith.constant 0 : i64 ! CHECK: %[[IS_NULL_ADDR:.*]] = arith.cmpi eq, %[[P_ADDR_I64]], %[[C0]] : i64 ! CHECK: fir.if %[[IS_NULL_ADDR]] { -! CHECK: %{{.*}} = fir.call @_FortranAReportFatalUserError(%{{.*}}, %{{.*}}, %{{.*}}) {{.*}} : (!fir.ref, !fir.ref, i32) -> none +! CHECK: fir.call @_FortranAReportFatalUserError(%{{.*}}, %{{.*}}, %{{.*}}) {{.*}} : (!fir.ref, !fir.ref, i32) -> () ! CHECK: } ! CHECK: %[[LOAD_P:.*]] = fir.load %[[P]] : !fir.ref>> ! 
CHECK: %[[ELE_SIZE:.*]] = fir.box_elesize %[[LOAD_P]] : (!fir.class>) -> i32 @@ -53,7 +53,7 @@ integer function unlimited_polymorphic_allocatable(p) result(size) ! CHECK: %[[C0:.*]] = arith.constant 0 : i64 ! CHECK: %[[IS_NULL_ADDR:.*]] = arith.cmpi eq, %[[P_ADDR_I64]], %[[C0]] : i64 ! CHECK: fir.if %[[IS_NULL_ADDR]] { -! CHECK: %{{.*}} = fir.call @_FortranAReportFatalUserError(%{{.*}}, %{{.*}}, %{{.*}}) {{.*}} : (!fir.ref, !fir.ref, i32) -> none +! CHECK: fir.call @_FortranAReportFatalUserError(%{{.*}}, %{{.*}}, %{{.*}}) {{.*}} : (!fir.ref, !fir.ref, i32) -> () ! CHECK: } ! CHECK: %[[LOAD_P:.*]] = fir.load %[[P]] : !fir.ref>> ! CHECK: %[[ELE_SIZE:.*]] = fir.box_elesize %[[LOAD_P]] : (!fir.class>) -> i32 diff --git a/flang/test/Lower/Intrinsics/sum.f90 b/flang/test/Lower/Intrinsics/sum.f90 index ab5da34b3c7bb..785f20b861f13 100644 --- a/flang/test/Lower/Intrinsics/sum.f90 +++ b/flang/test/Lower/Intrinsics/sum.f90 @@ -25,7 +25,7 @@ subroutine sum_test2(a,r) ! CHECK-DAG: %[[a7:.*]] = fir.convert %[[arg0]] : (!fir.box>) -> !fir.box ! CHECK-DAG: %[[a9:.*]] = fir.convert %[[a1]] : (!fir.box) -> !fir.box r = sum(a,dim=2) -! CHECK: %{{.*}} = fir.call @_FortranASumDim(%[[a6]], %[[a7]], %[[c2_i32]], %{{.*}}, %{{.*}}, %[[a9]]) {{.*}}: (!fir.ref>, !fir.box, i32, !fir.ref, i32, !fir.box) -> none +! CHECK: fir.call @_FortranASumDim(%[[a6]], %[[a7]], %[[c2_i32]], %{{.*}}, %{{.*}}, %[[a9]]) {{.*}}: (!fir.ref>, !fir.box, i32, !fir.ref, i32, !fir.box) -> () ! CHECK-DAG: %[[a11:.*]] = fir.load %[[a0]] : !fir.ref>>> ! CHECK-DAG: %[[a13:.*]] = fir.box_addr %[[a11]] : (!fir.box>>) -> !fir.heap> ! CHECK-DAG: fir.freemem %[[a13]] @@ -42,7 +42,7 @@ subroutine sum_test2(a,r) ! CHECK-DAG: %[[a8:.*]] = fir.convert %[[c0]] : (index) -> i32 ! CHECK-DAG: %[[a9:.*]] = fir.convert %[[a3]] : (!fir.box) -> !fir.box sum_test3 = sum(a) -! 
CHECK: %{{.*}} = fir.call @_FortranACppSumComplex4(%[[a0]], %[[a6]], %{{.*}}, %{{.*}}, %[[a8]], %[[a9]]) {{.*}}: (!fir.ref>, !fir.box, !fir.ref, i32, i32, !fir.box) -> none +! CHECK: fir.call @_FortranACppSumComplex4(%[[a0]], %[[a6]], %{{.*}}, %{{.*}}, %[[a8]], %[[a9]]) {{.*}}: (!fir.ref>, !fir.box, !fir.ref, i32, i32, !fir.box) -> () end function ! CHECK-LABEL: func @_QPsum_test4( diff --git a/flang/test/Lower/Intrinsics/system-optional.f90 b/flang/test/Lower/Intrinsics/system-optional.f90 index 8001e76fb93bd..55f63a913a532 100644 --- a/flang/test/Lower/Intrinsics/system-optional.f90 +++ b/flang/test/Lower/Intrinsics/system-optional.f90 @@ -27,7 +27,7 @@ subroutine all_args(command, exitstat) ! CHECK-NEXT: %[[command:.*]] = fir.convert %[[commandBox]] : (!fir.box>) -> !fir.box ! CHECK-NEXT: %[[exitstat:.*]] = fir.convert %[[exitstatRealBox]] : (!fir.box) -> !fir.box ! CHECK-NEXT: %[[cmdstat:.*]] = fir.convert %[[cmdstatBox]] : (!fir.box) -> !fir.box -! CHECK: %[[VAL_16:.*]] = fir.call @_FortranAExecuteCommandLine(%[[command]], %[[true]], %[[exitstat]], %[[cmdstat]], %[[absentBox]], %[[VAL_15:.*]], %[[c9_i32]]) fastmath : (!fir.box, i1, !fir.box, !fir.box, !fir.box, !fir.ref, i32) -> none +! CHECK: fir.call @_FortranAExecuteCommandLine(%[[command]], %[[true]], %[[exitstat]], %[[cmdstat]], %[[absentBox]], %[[VAL_15:.*]], %[[c9_i32]]) fastmath : (!fir.box, i1, !fir.box, !fir.box, !fir.box, !fir.ref, i32) -> () ! CHECK-NEXT: return ! CHECK-NEXT: } diff --git a/flang/test/Lower/Intrinsics/system.f90 b/flang/test/Lower/Intrinsics/system.f90 index 87ac8d9c7e6f9..d2a95acf2b120 100644 --- a/flang/test/Lower/Intrinsics/system.f90 +++ b/flang/test/Lower/Intrinsics/system.f90 @@ -23,7 +23,7 @@ subroutine all_args(command, exitstat) ! CHECK-NEXT: %[[command:.*]] = fir.convert %[[commandBox]] : (!fir.box>) -> !fir.box ! CHECK-NEXT: %[[exitstat:.*]] = fir.convert %[[exitstatBox]] : (!fir.box) -> !fir.box ! 
CHECK-NEXT: %[[cmdstat:.*]] = fir.convert %[[cmdstatBox]] : (!fir.box) -> !fir.box -! CHECK: %[[VAL_13:.*]] = fir.call @_FortranAExecuteCommandLine(%[[command]], %[[true]], %[[exitstat]], %[[cmdstat]], %[[absentBox]], %[[VAL_12:.*]], %[[c9_i32]]) fastmath : (!fir.box, i1, !fir.box, !fir.box, !fir.box, !fir.ref, i32) -> none +! CHECK: fir.call @_FortranAExecuteCommandLine(%[[command]], %[[true]], %[[exitstat]], %[[cmdstat]], %[[absentBox]], %[[VAL_12:.*]], %[[c9_i32]]) fastmath : (!fir.box, i1, !fir.box, !fir.box, !fir.box, !fir.ref, i32) -> () ! CHECK-NEXT: return ! CHECK-NEXT: } end subroutine all_args @@ -47,7 +47,7 @@ subroutine only_command(command) ! CHECK: %[[c35_i32:.*]] = arith.constant {{[0-9]+}} : i32 ! CHECK-NEXT: %[[command:.*]] = fir.convert %[[commandBox]] : (!fir.box>) -> !fir.box ! CHECK-NEXT: %[[cmdstat:.*]] = fir.convert %[[cmdstatBox]] : (!fir.box) -> !fir.box -! CHECK: %[[VAL_12:.*]] = fir.call @_FortranAExecuteCommandLine(%[[command]], %[[true]], %[[absentBox]], %[[cmdstat]], %[[absentBox2]], %[[VAL_11:.*]], %[[c35_i32]]) fastmath : (!fir.box, i1, !fir.box, !fir.box, !fir.box, !fir.ref, i32) -> none +! CHECK: fir.call @_FortranAExecuteCommandLine(%[[command]], %[[true]], %[[absentBox]], %[[cmdstat]], %[[absentBox2]], %[[VAL_11:.*]], %[[c35_i32]]) fastmath : (!fir.box, i1, !fir.box, !fir.box, !fir.box, !fir.ref, i32) -> () ! CHECK-NEXT: return ! CHECK-NEXT: } end subroutine only_command @@ -77,7 +77,7 @@ subroutine as_function(command) ! CHECK-NEXT: %[[command:.*]] = fir.convert %[[commandBox]] : (!fir.box>) -> !fir.box ! CHECK-NEXT: %[[exitstat:.*]] = fir.convert %[[exitstatBox]] : (!fir.box) -> !fir.box ! CHECK-NEXT: %[[cmdstat:.*]] = fir.convert %[[cmdstatBox]] : (!fir.box) -> !fir.box -! CHECK: %[[VAL_13:.*]] = fir.call @_FortranAExecuteCommandLine(%[[command]], %[[true]], %[[exitstat]], %[[cmdstat]], %[[absentBox]], %[[VAL_12:.*]], %[[LINE_NO]]) fastmath : (!fir.box, i1, !fir.box, !fir.box, !fir.box, !fir.ref, i32) -> none +! 
CHECK: fir.call @_FortranAExecuteCommandLine(%[[command]], %[[true]], %[[exitstat]], %[[cmdstat]], %[[absentBox]], %[[VAL_12:.*]], %[[LINE_NO]]) fastmath : (!fir.box, i1, !fir.box, !fir.box, !fir.box, !fir.ref, i32) -> () ! CHECK-NEXT: %[[RET_ADDR:.*]] = fir.box_addr %[[exitstatBox]] : (!fir.box) -> !fir.ref ! CHECK-NEXT: %[[RET:.*]] = fir.load %[[RET_ADDR]] : !fir.ref ! CHECK-NEXT: hlfir.assign %[[RET]] to %[[exitstatDeclare]]#0 : i32, !fir.ref diff --git a/flang/test/Lower/Intrinsics/transfer.f90 b/flang/test/Lower/Intrinsics/transfer.f90 index 812946f106476..b75fe2e826561 100644 --- a/flang/test/Lower/Intrinsics/transfer.f90 +++ b/flang/test/Lower/Intrinsics/transfer.f90 @@ -15,7 +15,7 @@ subroutine trans_test(store, word) ! CHECK: %[[VAL_10:.*]] = fir.convert %[[VAL_3]] : (!fir.box) -> !fir.box ! CHECK: %[[VAL_11:.*]] = fir.convert %[[VAL_4]] : (!fir.box) -> !fir.box ! CHECK: %[[VAL_12:.*]] = fir.convert %[[VAL_7]] : (!fir.ref>) -> !fir.ref - ! CHECK: %[[VAL_13:.*]] = fir.call @_FortranATransfer(%[[VAL_9]], %[[VAL_10]], %[[VAL_11]], %[[VAL_12]], %[[VAL_8]]) {{.*}}: (!fir.ref>, !fir.box, !fir.box, !fir.ref, i32) -> none + ! CHECK: fir.call @_FortranATransfer(%[[VAL_9]], %[[VAL_10]], %[[VAL_11]], %[[VAL_12]], %[[VAL_8]]) {{.*}}: (!fir.ref>, !fir.box, !fir.box, !fir.ref, i32) -> () ! CHECK: %[[VAL_14:.*]] = fir.load %[[VAL_2]] : !fir.ref>> ! CHECK: %[[VAL_15:.*]] = fir.box_addr %[[VAL_14]] : (!fir.box>) -> !fir.heap ! CHECK: %[[VAL_16:.*]] = fir.load %[[VAL_15]] : !fir.heap @@ -50,7 +50,7 @@ subroutine trans_test(store, word) ! CHECK: %[[VAL_18:.*]] = fir.convert %[[VAL_9]] : (!fir.box>) -> !fir.box ! CHECK: %[[VAL_19:.*]] = fir.convert %[[VAL_14]] : (!fir.ref>) -> !fir.ref ! CHECK: %[[VAL_20:.*]] = fir.convert %[[VAL_6]] : (i32) -> i64 - ! CHECK: %[[VAL_21:.*]] = fir.call @_FortranATransferSize(%[[VAL_16]], %[[VAL_17]], %[[VAL_18]], %[[VAL_19]], %[[VAL_15]], %[[VAL_20]]) {{.*}}: (!fir.ref>, !fir.box, !fir.box, !fir.ref, i32, i64) -> none + ! 
CHECK: fir.call @_FortranATransferSize(%[[VAL_16]], %[[VAL_17]], %[[VAL_18]], %[[VAL_19]], %[[VAL_15]], %[[VAL_20]]) {{.*}}: (!fir.ref>, !fir.box, !fir.box, !fir.ref, i32, i64) -> () ! CHECK: %[[VAL_22:.*]] = fir.load %[[VAL_2]] : !fir.ref>>> ! CHECK: %[[VAL_23:.*]] = arith.constant 0 : index ! CHECK: %[[VAL_24:.*]]:3 = fir.box_dims %[[VAL_22]], %[[VAL_23]] : (!fir.box>>, index) -> (index, index, index) @@ -94,7 +94,7 @@ integer function trans_test3(p) ! CHECK: %[[VAL_12:.*]] = fir.convert %[[VAL_5]] : (!fir.box) -> !fir.box ! CHECK: %[[VAL_13:.*]] = fir.convert %[[VAL_6]] : (!fir.box>) -> !fir.box ! CHECK: %[[VAL_14:.*]] = fir.convert %[[VAL_9]] : (!fir.ref>) -> !fir.ref - ! CHECK: %[[VAL_15:.*]] = fir.call @_FortranATransfer(%[[VAL_11]], %[[VAL_12]], %[[VAL_13]], %[[VAL_14]], %[[VAL_10]]) {{.*}}: (!fir.ref>, !fir.box, !fir.box, !fir.ref, i32) -> none + ! CHECK: fir.call @_FortranATransfer(%[[VAL_11]], %[[VAL_12]], %[[VAL_13]], %[[VAL_14]], %[[VAL_10]]) {{.*}}: (!fir.ref>, !fir.box, !fir.box, !fir.ref, i32) -> () ! CHECK: %[[VAL_16:.*]] = fir.load %[[VAL_2]] : !fir.ref>>> ! CHECK: %[[VAL_17:.*]] = fir.box_addr %[[VAL_16]] : (!fir.box>>) -> !fir.heap> ! CHECK: %[[VAL_18:.*]] = fir.embox %[[VAL_3]] : (!fir.ref>) -> !fir.box> @@ -104,7 +104,7 @@ integer function trans_test3(p) ! CHECK: %[[VAL_21:.*]] = fir.convert %[[VAL_1]] : (!fir.ref>>) -> !fir.ref> ! CHECK: %[[VAL_22:.*]] = fir.convert %[[VAL_16]] : (!fir.box>>) -> !fir.box ! CHECK: %[[VAL_23:.*]] = fir.convert %[[VAL_19]] : (!fir.ref>) -> !fir.ref - ! CHECK: %[[VAL_24:.*]] = fir.call @_FortranAAssign(%[[VAL_21]], %[[VAL_22]], %[[VAL_23]], %[[VAL_20]]) {{.*}}: (!fir.ref>, !fir.box, !fir.ref, i32) -> none + ! CHECK: fir.call @_FortranAAssign(%[[VAL_21]], %[[VAL_22]], %[[VAL_23]], %[[VAL_20]]) {{.*}}: (!fir.ref>, !fir.box, !fir.ref, i32) -> () ! CHECK: fir.freemem %[[VAL_17]] ! CHECK: %[[VAL_25:.*]] = fir.field_index x, !fir.type<_QFtrans_test3Tobj{x:i32}> ! 
CHECK: %[[VAL_26:.*]] = fir.coordinate_of %[[VAL_3]], %[[VAL_25]] : (!fir.ref>, !fir.field) -> !fir.ref diff --git a/flang/test/Lower/Intrinsics/transpose.f90 b/flang/test/Lower/Intrinsics/transpose.f90 index 41c94edb77e7b..cf2c2ba5bde83 100644 --- a/flang/test/Lower/Intrinsics/transpose.f90 +++ b/flang/test/Lower/Intrinsics/transpose.f90 @@ -15,7 +15,7 @@ subroutine transpose_test(mat) ! CHECK: fir.store %[[resultBox]] to %[[resultDescr]] : !fir.ref>>> ! CHECK: %[[resultOpaque:.*]] = fir.convert %[[resultDescr]] : (!fir.ref>>>) -> !fir.ref> ! CHECK: %[[sourceOpaque:.*]] = fir.convert %[[sourceBox]] : (!fir.box>) -> !fir.box -! CHECK: %{{.*}} = fir.call @_FortranATranspose(%[[resultOpaque]], %[[sourceOpaque]], %{{.*}}, %{{.*}}) {{.*}}: (!fir.ref>, !fir.box, !fir.ref, i32) -> none +! CHECK: fir.call @_FortranATranspose(%[[resultOpaque]], %[[sourceOpaque]], %{{.*}}, %{{.*}}) {{.*}}: (!fir.ref>, !fir.box, !fir.ref, i32) -> () ! CHECK: %[[tmp1:.*]] = fir.load %[[resultDescr]] : !fir.ref>>> ! CHECK: %[[tmp2:.*]] = fir.box_addr %[[tmp1]] : (!fir.box>>) -> !fir.heap> ! CHECK: %[[tmp3:.*]] = fir.convert %[[tmp2]] : (!fir.heap>) -> !fir.ref> diff --git a/flang/test/Lower/Intrinsics/trim.f90 b/flang/test/Lower/Intrinsics/trim.f90 index 9ecb7945097be..c88e07f2a4e25 100644 --- a/flang/test/Lower/Intrinsics/trim.f90 +++ b/flang/test/Lower/Intrinsics/trim.f90 @@ -10,7 +10,7 @@ subroutine trim_test(c) ! CHECK-DAG: %[[cBox:.*]] = fir.embox %[[c]]#0 typeparams %[[c]]#1 : (!fir.ref>, index) -> !fir.box> ! CHECK-DAG: %[[cBoxNone:.*]] = fir.convert %[[cBox]] : (!fir.box>) -> !fir.box ! CHECK-DAG: %[[resBox:.*]] = fir.convert %[[tmpBox]] : (!fir.ref>>>) -> !fir.ref> - ! CHECK: fir.call @{{.*}}Trim(%[[resBox]], %[[cBoxNone]], {{.*}}) {{.*}}: (!fir.ref>, !fir.box, !fir.ref, i32) -> none + ! CHECK: fir.call @{{.*}}Trim(%[[resBox]], %[[cBoxNone]], {{.*}}) {{.*}}: (!fir.ref>, !fir.box, !fir.ref, i32) -> () ! CHECK-DAG: %[[tmpAddr:.*]] = fir.box_addr ! CHECK-DAG: fir.box_elesize ! 
CHECK: fir.call @{{.*}}bar_trim_test diff --git a/flang/test/Lower/Intrinsics/ubound01.f90 b/flang/test/Lower/Intrinsics/ubound01.f90 index e933075cc0bf2..bd7961553da83 100644 --- a/flang/test/Lower/Intrinsics/ubound01.f90 +++ b/flang/test/Lower/Intrinsics/ubound01.f90 @@ -20,4 +20,4 @@ subroutine s2(a,n,n2) ! CHECK-SAME: %[[ARG0:.*]]: !fir.box> ! CHECK: %[[BOX:.*]] = fir.rebox %[[ARG0]](%{{.*}}) : (!fir.box>, !fir.shift<2>) -> !fir.box> ! CHECK: %[[BOX_NONE:.*]] = fir.convert %[[BOX]] : (!fir.box>) -> !fir.box -! CHECK: %{{.*}} = fir.call @_FortranAUbound(%{{.*}}, %[[BOX_NONE]], %{{.*}}, %{{.*}}, %{{.*}}) {{.*}}: (!fir.llvm_ptr, !fir.box, i32, !fir.ref, i32) -> none +! CHECK: fir.call @_FortranAUbound(%{{.*}}, %[[BOX_NONE]], %{{.*}}, %{{.*}}, %{{.*}}) {{.*}}: (!fir.llvm_ptr, !fir.box, i32, !fir.ref, i32) -> () diff --git a/flang/test/Lower/Intrinsics/verify.f90 b/flang/test/Lower/Intrinsics/verify.f90 index eb1454c001f70..7d0f97023e63f 100644 --- a/flang/test/Lower/Intrinsics/verify.f90 +++ b/flang/test/Lower/Intrinsics/verify.f90 @@ -21,7 +21,7 @@ integer function verify_test(s1, s2) ! CHECK: %[[VAL_16:.*]] = fir.convert %[[VAL_9]] : (!fir.box>) -> !fir.box ! CHECK: %[[VAL_17:.*]] = fir.convert %[[VAL_7]] : (!fir.box) -> !fir.box ! CHECK: %[[VAL_18:.*]] = fir.convert %[[VAL_12]] : (!fir.ref>) -> !fir.ref -! CHECK: %[[VAL_19:.*]] = fir.call @_FortranAVerify(%[[VAL_14]], %[[VAL_15]], %[[VAL_16]], %[[VAL_17]], %[[VAL_6]], %[[VAL_18]], %[[VAL_13]]) {{.*}}: (!fir.ref>, !fir.box, !fir.box, !fir.box, i32, !fir.ref, i32) -> none +! CHECK: fir.call @_FortranAVerify(%[[VAL_14]], %[[VAL_15]], %[[VAL_16]], %[[VAL_17]], %[[VAL_6]], %[[VAL_18]], %[[VAL_13]]) {{.*}}: (!fir.ref>, !fir.box, !fir.box, !fir.box, i32, !fir.ref, i32) -> () ! CHECK: %[[VAL_20:.*]] = fir.load %[[VAL_2]] : !fir.ref>> ! CHECK: %[[VAL_21:.*]] = fir.box_addr %[[VAL_20]] : (!fir.box>) -> !fir.heap ! 
CHECK: %[[VAL_22:.*]] = fir.load %[[VAL_21]] : !fir.heap diff --git a/flang/test/Lower/OpenACC/stop-stmt-in-region.f90 b/flang/test/Lower/OpenACC/stop-stmt-in-region.f90 index 2694a1531d169..89d0d4a484cc1 100644 --- a/flang/test/Lower/OpenACC/stop-stmt-in-region.f90 +++ b/flang/test/Lower/OpenACC/stop-stmt-in-region.f90 @@ -8,7 +8,7 @@ ! CHECK: %[[VAL_0:.*]] = arith.constant 1 : i32 ! CHECK: %[[VAL_1:.*]] = arith.constant false ! CHECK: %[[VAL_2:.*]] = arith.constant false -! CHECK: %[[VAL_3:.*]] = fir.call @_FortranAStopStatement(%[[VAL_0]], %[[VAL_1]], %[[VAL_2]]) {{.*}} : (i32, i1, i1) -> none +! CHECK: fir.call @_FortranAStopStatement(%[[VAL_0]], %[[VAL_1]], %[[VAL_2]]) {{.*}} : (i32, i1, i1) -> () ! CHECK: acc.yield ! CHECK: } ! CHECK: return @@ -26,7 +26,7 @@ subroutine test_stop_in_region1() ! CHECK: %[[VAL_1:.*]] = arith.constant 1 : i32 ! CHECK: %[[VAL_2:.*]] = arith.constant false ! CHECK: %[[VAL_3:.*]] = arith.constant false -! CHECK: %[[VAL_4:.*]] = fir.call @_FortranAStopStatement(%[[VAL_1]], %[[VAL_2]], %[[VAL_3]]) {{.*}} : (i32, i1, i1) -> none +! CHECK: fir.call @_FortranAStopStatement(%[[VAL_1]], %[[VAL_2]], %[[VAL_3]]) {{.*}} : (i32, i1, i1) -> () ! CHECK: acc.yield ! CHECK: } ! CHECK: return diff --git a/flang/test/Lower/OpenMP/DelayedPrivatization/wsloop.f90 b/flang/test/Lower/OpenMP/DelayedPrivatization/wsloop.f90 index 66fd120085c78..c98850b8000d3 100644 --- a/flang/test/Lower/OpenMP/DelayedPrivatization/wsloop.f90 +++ b/flang/test/Lower/OpenMP/DelayedPrivatization/wsloop.f90 @@ -1,6 +1,6 @@ -! RUN: %flang_fc1 -emit-hlfir -fopenmp -mmlir --openmp-enable-delayed-privatization-staging \ +! RUN: %flang_fc1 -emit-hlfir -fopenmp \ ! RUN: -o - %s 2>&1 | FileCheck %s -! RUN: bbc -emit-hlfir -fopenmp --openmp-enable-delayed-privatization-staging -o - %s 2>&1 \ +! RUN: bbc -emit-hlfir -fopenmp -o - %s 2>&1 \ ! 
RUN: | FileCheck %s subroutine wsloop_private diff --git a/flang/test/Lower/OpenMP/Todo/omp-default-clause-inner-loop.f90 b/flang/test/Lower/OpenMP/Todo/omp-default-clause-inner-loop.f90 index 77a1304f39a48..10879c53dc0c5 100644 --- a/flang/test/Lower/OpenMP/Todo/omp-default-clause-inner-loop.f90 +++ b/flang/test/Lower/OpenMP/Todo/omp-default-clause-inner-loop.f90 @@ -9,11 +9,10 @@ ! The string "EXPECTED" denotes the expected FIR ! CHECK: omp.parallel private(@{{.*}} %{{.*}} -> %[[PRIVATE_Y:.*]], @{{.*}} %{{.*}} -> %[[PRIVATE_Y:.*]] : !fir.ref, !fir.ref) { -! CHECK: %[[TEMP:.*]] = fir.alloca i32 {bindc_name = "x", pinned, {{.*}}} ! CHECK: %[[const_1:.*]] = arith.constant 1 : i32 ! CHECK: %[[const_2:.*]] = arith.constant 10 : i32 ! CHECK: %[[const_3:.*]] = arith.constant 1 : i32 -! CHECK: omp.wsloop { +! CHECK: omp.wsloop private(@{{.*}} %{{.*}} -> %[[TEMP:.*]] : !fir.ref) { ! CHECK-NEXT: omp.loop_nest (%[[ARG:.*]]) : i32 = (%[[const_1]]) to (%[[const_2]]) inclusive step (%[[const_3]]) { ! CHECK: fir.store %[[ARG]] to %[[TEMP]] : !fir.ref ! 
EXPECTED: %[[temp_1:.*]] = fir.load %[[PRIVATE_Z]] : !fir.ref diff --git a/flang/test/Lower/OpenMP/associate.f90 b/flang/test/Lower/OpenMP/associate.f90 index 4964890a6842c..d497b4ade782e 100644 --- a/flang/test/Lower/OpenMP/associate.f90 +++ b/flang/test/Lower/OpenMP/associate.f90 @@ -6,12 +6,12 @@ !CHECK: omp.parallel { !CHECK-NOT: hlfir.declare {{.*}} {uniq_name = "_QFtest_parallel_assocEa"} !CHECK-NOT: hlfir.declare {{.*}} {uniq_name = "_QFtest_parallel_assocEb"} -!CHECK: omp.wsloop { +!CHECK: omp.wsloop private({{.*}}) { !CHECK: } !CHECK: } !CHECK: omp.parallel {{.*}} { !CHECK-NOT: hlfir.declare {{.*}} {uniq_name = "_QFtest_parallel_assocEb"} -!CHECK: omp.wsloop { +!CHECK: omp.wsloop private({{.*}}) { !CHECK: } !CHECK: } subroutine test_parallel_assoc() diff --git a/flang/test/Lower/OpenMP/copyin.f90 b/flang/test/Lower/OpenMP/copyin.f90 index 9e9ccf8e3d914..5ad45f1f5ba6f 100644 --- a/flang/test/Lower/OpenMP/copyin.f90 +++ b/flang/test/Lower/OpenMP/copyin.f90 @@ -154,14 +154,13 @@ subroutine copyin_derived_type() ! CHECK: omp.barrier -! CHECK: %[[VAL_6:.*]] = fir.alloca i32 {bindc_name = "i", pinned, {{.*}}} -! CHECK: %[[VAL_7:.*]]:2 = hlfir.declare %[[VAL_6]] {uniq_name = "_QFcombined_parallel_worksharing_loopEi"} : (!fir.ref) -> (!fir.ref, !fir.ref) ! CHECK: %[[VAL_11:.*]] = arith.constant 1 : i32 ! CHECK: %[[VAL_12:.*]] = fir.load %[[VAL_9]]#0 : !fir.ref ! CHECK: %[[VAL_13:.*]] = arith.constant 1 : i32 -! CHECK: omp.wsloop { +! CHECK: omp.wsloop private(@{{.*}} %{{.*}} -> %[[VAL_6:.*]] : !fir.ref) { ! CHECK-NEXT: omp.loop_nest (%[[VAL_14:.*]]) : i32 = (%[[VAL_11]]) to (%[[VAL_12]]) inclusive step (%[[VAL_13]]) { +! CHECK: %[[VAL_7:.*]]:2 = hlfir.declare %[[VAL_6]] {uniq_name = "_QFcombined_parallel_worksharing_loopEi"} : (!fir.ref) -> (!fir.ref, !fir.ref) ! CHECK: fir.store %[[VAL_14]] to %[[VAL_7]]#1 : !fir.ref ! CHECK: fir.call @_QPsub4(%[[VAL_9]]#1) fastmath : (!fir.ref) -> () ! CHECK: omp.yield @@ -321,15 +320,12 @@ subroutine common_1() ! 
CHECK: %[[VAL_33:.*]] = fir.load %[[VAL_18]]#0 : !fir.ref ! CHECK: hlfir.assign %[[VAL_33]] to %[[VAL_31]]#0 : i32, !fir.ref ! CHECK: omp.barrier - -! CHECK: %[[VAL_19:.*]] = fir.alloca i32 {bindc_name = "i", pinned, {{.*}}} -! CHECK: %[[VAL_20:.*]]:2 = hlfir.declare %[[VAL_19]] {uniq_name = "_QFcommon_2Ei"} : (!fir.ref) -> (!fir.ref, !fir.ref) - ! CHECK: %[[VAL_34:.*]] = arith.constant 1 : i32 ! CHECK: %[[VAL_35:.*]] = fir.load %[[VAL_26]]#0 : !fir.ref ! CHECK: %[[VAL_36:.*]] = arith.constant 1 : i32 -! CHECK: omp.wsloop { +! CHECK: omp.wsloop private(@{{.*}} %{{.*}} -> %[[VAL_19:.*]] : !fir.ref) { ! CHECK-NEXT: omp.loop_nest (%[[VAL_37:.*]]) : i32 = (%[[VAL_34]]) to (%[[VAL_35]]) inclusive step (%[[VAL_36]]) { +! CHECK: %[[VAL_20:.*]]:2 = hlfir.declare %[[VAL_19]] {uniq_name = "_QFcommon_2Ei"} : (!fir.ref) -> (!fir.ref, !fir.ref) ! CHECK: fir.store %[[VAL_37]] to %[[VAL_20]]#1 : !fir.ref ! CHECK: %[[VAL_38:.*]] = fir.load %[[VAL_31]]#0 : !fir.ref ! CHECK: %[[VAL_39:.*]] = fir.load %[[VAL_20]]#0 : !fir.ref diff --git a/flang/test/Lower/OpenMP/critical.f90 b/flang/test/Lower/OpenMP/critical.f90 index 051d378210646..99a4426ab0453 100644 --- a/flang/test/Lower/OpenMP/critical.f90 +++ b/flang/test/Lower/OpenMP/critical.f90 @@ -38,11 +38,10 @@ subroutine predetermined_privatization() !CHECK: omp.parallel !$omp parallel do - !CHECK: %[[PRIV_I_ALLOC:.*]] = fir.alloca i32 {bindc_name = "i", pinned, {{.*}}} - !CHECK: %[[PRIV_I_DECL:.*]]:2 = hlfir.declare %[[PRIV_I_ALLOC]] do i = 2, 10 - !CHECK: omp.wsloop + !CHECK: omp.wsloop private(@{{.*}} %{{.*}} -> %[[PRIV_I_ALLOC:.*]] : !fir.ref) !CHECK: omp.loop_nest (%[[IV:[^[:space:]]+]]) + !CHECK: %[[PRIV_I_DECL:.*]]:2 = hlfir.declare %[[PRIV_I_ALLOC]] !CHECK: fir.store %[[IV]] to %[[PRIV_I_DECL]]#1 !CHECK: omp.critical !$omp critical diff --git a/flang/test/Lower/OpenMP/default-clause-byref.f90 b/flang/test/Lower/OpenMP/default-clause-byref.f90 index 654c13ada9e39..10e62005f42ba 100644 --- 
a/flang/test/Lower/OpenMP/default-clause-byref.f90 +++ b/flang/test/Lower/OpenMP/default-clause-byref.f90 @@ -346,7 +346,7 @@ subroutine skipped_default_clause_checks() type(it)::iii !CHECK: omp.parallel {{.*}} { -!CHECK: omp.wsloop reduction(byref @min_byref_i32 %[[VAL_Z_DECLARE]]#0 -> %[[PRV:.+]] : !fir.ref) { +!CHECK: omp.wsloop private({{.*}}) reduction(byref @min_byref_i32 %[[VAL_Z_DECLARE]]#0 -> %[[PRV:.+]] : !fir.ref) { !CHECK-NEXT: omp.loop_nest (%[[ARG:.*]]) {{.*}} { !CHECK: omp.yield !CHECK: } diff --git a/flang/test/Lower/OpenMP/default-clause.f90 b/flang/test/Lower/OpenMP/default-clause.f90 index c004813a911f7..fcc8d033eea0f 100644 --- a/flang/test/Lower/OpenMP/default-clause.f90 +++ b/flang/test/Lower/OpenMP/default-clause.f90 @@ -284,16 +284,13 @@ subroutine nested_default_clause_test4 !CHECK-LABEL: func @_QPnested_default_clause_test5 !CHECK: omp.parallel { -!CHECK: %[[X_ALLOCA:.*]] = fir.alloca i32 {bindc_name = "x", pinned, uniq_name = "_QFnested_default_clause_test5Ex"} -!CHECK: %[[X_DECLARE:.*]]:2 = hlfir.declare %[[X_ALLOCA]] {{.*}} - -!CHECK: %[[LOOP_VAR_ALLOCA:.*]] = fir.alloca i32 {bindc_name = "i", pinned, {{.*}}} -!CHECK: %[[LOOP_VAR_DECLARE:.*]]:2 = hlfir.declare %[[LOOP_VAR_ALLOCA]] {{.*}} - !CHECK: %[[CONST_LB:.*]] = arith.constant 1 : i32 !CHECK: %[[CONST_UB:.*]] = arith.constant 50 : i32 !CHECK: %[[CONST_STEP:.*]] = arith.constant 1 : i32 +! 
CHECK: omp.wsloop private(@{{.*}} %{{.*}} -> %[[X_ALLOCA:.*]], @{{.*}} %{{.*}} -> %[[LOOP_VAR_ALLOCA:.*]] : !fir.ref, !fir.ref) { !CHECK: omp.loop_nest (%[[ARG:.*]]) : i32 = (%[[CONST_LB]]) to (%[[CONST_UB]]) inclusive step (%[[CONST_STEP]]) { +!CHECK: %[[X_DECLARE:.*]]:2 = hlfir.declare %[[X_ALLOCA]] {{.*}} +!CHECK: %[[LOOP_VAR_DECLARE:.*]]:2 = hlfir.declare %[[LOOP_VAR_ALLOCA]] {{.*}} !CHECK: fir.store %[[ARG]] to %[[LOOP_VAR_DECLARE]]#1 : !fir.ref !CHECK: %[[LOADED_X:.*]] = fir.load %[[X_DECLARE]]#0 : !fir.ref !CHECK: %[[CONST:.*]] = arith.constant 1 : i32 @@ -321,13 +318,12 @@ subroutine nested_default_clause_test5 !CHECK: %[[Z_VAR_DECLARE:.*]]:2 = hlfir.declare %[[Z_VAR]] {{.*}} -!CHECK: %[[LOOP_VAR:.*]] = fir.alloca i32 {bindc_name = "i", pinned, {{.*}}} -!CHECK: %[[LOOP_VAR_DECLARE:.*]]:2 = hlfir.declare %[[LOOP_VAR]] {{.*}} - !CHECK: %[[CONST_LB:.*]] = arith.constant 1 : i32 !CHECK: %[[CONST_UB:.*]] = arith.constant 10 : i32 !CHECK: %[[CONST_STEP:.*]] = arith.constant 1 : i32 +! 
CHECK: omp.wsloop private(@{{.*}} %{{.*}} -> %[[LOOP_VAR:.*]] : !fir.ref) { !CHECK: omp.loop_nest (%[[ARG:.*]]) : i32 = (%[[CONST_LB]]) to (%[[CONST_UB]]) inclusive step (%[[CONST_STEP]]) { +!CHECK: %[[LOOP_VAR_DECLARE:.*]]:2 = hlfir.declare %[[LOOP_VAR]] {{.*}} !CHECK: fir.store %[[ARG]] to %[[LOOP_VAR_DECLARE]]#1 : !fir.ref !CHECK: %[[LOADED_X:.*]] = fir.load %[[X_VAR_DECLARE]]#0 : !fir.ref !CHECK: %[[CONST:.*]] = arith.constant 1 : i32 @@ -386,7 +382,7 @@ subroutine skipped_default_clause_checks() type(it)::iii !CHECK: omp.parallel {{.*}} { -!CHECK: omp.wsloop reduction(@min_i32 %[[VAL_Z_DECLARE]]#0 -> %[[PRV:.+]] : !fir.ref) { +!CHECK: omp.wsloop private({{.*}}) reduction(@min_i32 %[[VAL_Z_DECLARE]]#0 -> %[[PRV:.+]] : !fir.ref) { !CHECK-NEXT: omp.loop_nest (%[[ARG:.*]]) {{.*}} { !CHECK: omp.yield !CHECK: } diff --git a/flang/test/Lower/OpenMP/delayed-privatization-default-init.f90 b/flang/test/Lower/OpenMP/delayed-privatization-default-init.f90 index 0eeebe0afea54..022b592db74b8 100644 --- a/flang/test/Lower/OpenMP/delayed-privatization-default-init.f90 +++ b/flang/test/Lower/OpenMP/delayed-privatization-default-init.f90 @@ -41,7 +41,7 @@ subroutine delayed_privatization_default_init_firstprivate ! CHECK: %[[VAL_1:.*]] = fir.alloca !fir.type<_QFdelayed_privatization_default_initTt{i:i32}> {bindc_name = "a", pinned, uniq_name = "_QFdelayed_privatization_default_initEa"} ! CHECK: %[[VAL_2:.*]] = fir.embox %[[VAL_1]] : (!fir.ref>) -> !fir.box> ! CHECK: %[[VAL_6:.*]] = fir.convert %[[VAL_2]] : (!fir.box>) -> !fir.box -! CHECK: %[[VAL_8:.*]] = fir.call @_FortranAInitialize(%[[VAL_6]],{{.*}} +! CHECK: fir.call @_FortranAInitialize(%[[VAL_6]],{{.*}} ! CHECK-NEXT: %[[VAL_9:.*]]:2 = hlfir.declare %[[VAL_1]] {uniq_name = "_QFdelayed_privatization_default_initEa"} : (!fir.ref>) -> (!fir.ref>, !fir.ref>) ! CHECK: omp.yield(%[[VAL_9]]#0 : !fir.ref>) ! 
CHECK: } diff --git a/flang/test/Lower/OpenMP/hlfir-wsloop.f90 b/flang/test/Lower/OpenMP/hlfir-wsloop.f90 index f7b0ba681efeb..786ab916d000c 100644 --- a/flang/test/Lower/OpenMP/hlfir-wsloop.f90 +++ b/flang/test/Lower/OpenMP/hlfir-wsloop.f90 @@ -10,12 +10,11 @@ subroutine simple_loop ! CHECK-DAG: %[[WS_END:.*]] = arith.constant 9 : i32 ! CHECK: omp.parallel !$OMP PARALLEL - ! CHECK-DAG: %[[ALLOCA_IV:.*]] = fir.alloca i32 {{{.*}}, pinned, {{.*}}} - ! CHECK: %[[IV:.*]] = fir.declare %[[ALLOCA_IV]] {uniq_name = "_QFsimple_loopEi"} : (!fir.ref) -> !fir.ref - ! CHECK: omp.wsloop { + ! CHECK: omp.wsloop private(@{{.*}} %{{.*}} -> %[[ALLOCA_IV:.*]] : !fir.ref) { ! CHECK-NEXT: omp.loop_nest (%[[I:.*]]) : i32 = (%[[WS_ST]]) to (%[[WS_END]]) inclusive step (%[[WS_ST]]) { !$OMP DO do i=1, 9 + ! CHECK: %[[IV:.*]] = fir.declare %[[ALLOCA_IV]] {uniq_name = "_QFsimple_loopEi"} : (!fir.ref) -> !fir.ref ! CHECK: fir.store %[[I]] to %[[IV:.*]] : !fir.ref ! CHECK: %[[LOAD_IV:.*]] = fir.load %[[IV]] : !fir.ref ! CHECK: fir.call @_FortranAioOutputInteger32({{.*}}, %[[LOAD_IV]]) {{.*}}: (!fir.ref, i32) -> i1 diff --git a/flang/test/Lower/OpenMP/lastprivate-allocatable.f90 b/flang/test/Lower/OpenMP/lastprivate-allocatable.f90 index 6b7d849fde93c..fd8338393dd88 100644 --- a/flang/test/Lower/OpenMP/lastprivate-allocatable.f90 +++ b/flang/test/Lower/OpenMP/lastprivate-allocatable.f90 @@ -8,12 +8,11 @@ ! CHECK: fir.store %[[VAL_2]] to %[[VAL_0]] : !fir.ref>> ! CHECK: %[[VAL_3:.*]]:2 = hlfir.declare %[[VAL_0]] {fortran_attrs = {{.*}}, uniq_name = "_QFEa"} : (!fir.ref>>) -> (!fir.ref>>, !fir.ref>>) ! CHECK: omp.parallel { -! create original copy of private variable -! CHECK: %[[VAL_16:.*]]:2 = hlfir.declare %{{.*}} {fortran_attrs = {{.*}}, uniq_name = "_QFEa"} : (!fir.ref>>) -> (!fir.ref>>, !fir.ref>>) -! CHECK: %[[VAL_17:.*]] = fir.alloca i32 {bindc_name = "i", pinned, uniq_name = "_QFEi"} -! 
CHECK: %[[VAL_18:.*]]:2 = hlfir.declare %[[VAL_17]] {uniq_name = "_QFEi"} : (!fir.ref) -> (!fir.ref, !fir.ref) -! CHECK: omp.wsloop { +! CHECK: omp.wsloop private(@{{.*}} %{{.*}} -> %{{.*}}, @{{.*}} %{{.*}} -> %[[VAL_17:.*]] : !fir.ref>>, !fir.ref) { ! CHECK: omp.loop_nest +! CHECK: %[[VAL_16:.*]]:2 = hlfir.declare %{{.*}} {fortran_attrs = {{.*}}, uniq_name = "_QFEa"} : (!fir.ref>>) -> (!fir.ref>>, !fir.ref>>) +! CHECK: %[[VAL_18:.*]]:2 = hlfir.declare %[[VAL_17]] {uniq_name = "_QFEi"} : (!fir.ref) -> (!fir.ref, !fir.ref) + ! [...] ! if this is the last iteration ! CHECK: fir.if %{{.*}} { diff --git a/flang/test/Lower/OpenMP/lastprivate-commonblock.f90 b/flang/test/Lower/OpenMP/lastprivate-commonblock.f90 index faa3d3e053f34..c059382bf634c 100644 --- a/flang/test/Lower/OpenMP/lastprivate-commonblock.f90 +++ b/flang/test/Lower/OpenMP/lastprivate-commonblock.f90 @@ -11,12 +11,10 @@ !CHECK: %[[CB_C_Y_COOR:.*]] = fir.coordinate_of %[[CB_C_REF_CVT]], %{{.*}} : (!fir.ref>, index) -> !fir.ref !CHECK: %[[CB_C_Y_ADDR:.*]] = fir.convert %[[CB_C_Y_COOR]] : (!fir.ref) -> !fir.ref !CHECK: %[[Y_DECL:.*]]:2 = hlfir.declare %[[CB_C_Y_ADDR]] {uniq_name = "_QFlastprivate_commonEy"} : (!fir.ref) -> (!fir.ref, !fir.ref) -!CHECK: %[[PRIVATE_X_REF:.*]] = fir.alloca f32 {bindc_name = "x", pinned, uniq_name = "_QFlastprivate_commonEx"} +!CHECK: omp.wsloop private(@{{.*}} %{{.*}} -> %[[PRIVATE_X_REF:.*]], @{{.*}} %{{.*}} -> %[[PRIVATE_Y_REF:.*]], @{{.*}} %{{.*}} -> %{{.*}} : !{{.*}}, !{{.*}}, !{{.*}}) { +!CHECK-NEXT: omp.loop_nest (%[[I:.*]]) : i32 = (%{{.*}}) to (%{{.*}}) inclusive step (%{{.*}}) { !CHECK: %[[PRIVATE_X_DECL:.*]]:2 = hlfir.declare %[[PRIVATE_X_REF]] {uniq_name = "_QFlastprivate_commonEx"} : (!fir.ref) -> (!fir.ref, !fir.ref) -!CHECK: %[[PRIVATE_Y_REF:.*]] = fir.alloca f32 {bindc_name = "y", pinned, uniq_name = "_QFlastprivate_commonEy"} !CHECK: %[[PRIVATE_Y_DECL:.*]]:2 = hlfir.declare %[[PRIVATE_Y_REF]] {uniq_name = "_QFlastprivate_commonEy"} : (!fir.ref) -> (!fir.ref, 
!fir.ref) -!CHECK: omp.wsloop { -!CHECK-NEXT: omp.loop_nest (%[[I:.*]]) : i32 = (%{{.*}}) to (%{{.*}}) inclusive step (%{{.*}}) { !CHECK: %[[V:.*]] = arith.addi %[[I]], %{{.*}} : i32 !CHECK: %[[C0:.*]] = arith.constant 0 : i32 !CHECK: %[[NEG_STEP:.*]] = arith.cmpi slt, %{{.*}}, %[[C0]] : i32 diff --git a/flang/test/Lower/OpenMP/lastprivate-iv.f90 b/flang/test/Lower/OpenMP/lastprivate-iv.f90 index 63a81e818bc8b..aacefd8b59c0f 100644 --- a/flang/test/Lower/OpenMP/lastprivate-iv.f90 +++ b/flang/test/Lower/OpenMP/lastprivate-iv.f90 @@ -6,14 +6,12 @@ !CHECK: %[[I2_MEM:.*]] = fir.alloca i32 {bindc_name = "i", uniq_name = "_QFlastprivate_iv_incEi"} !CHECK: %[[I2:.*]]:2 = hlfir.declare %[[I2_MEM]] {uniq_name = "_QFlastprivate_iv_incEi"} : (!fir.ref) -> (!fir.ref, !fir.ref) -!CHECK: %[[I_MEM:.*]] = fir.alloca i32 {bindc_name = "i", pinned, {{.*}}} -!CHECK: %[[I:.*]]:2 = hlfir.declare %[[I_MEM]] {uniq_name = "_QFlastprivate_iv_incEi"} : (!fir.ref) -> (!fir.ref, !fir.ref) - !CHECK: %[[LB:.*]] = arith.constant 4 : i32 !CHECK: %[[UB:.*]] = arith.constant 10 : i32 !CHECK: %[[STEP:.*]] = arith.constant 3 : i32 -!CHECK: omp.wsloop { +!CHECK: omp.wsloop private(@{{.*}} %{{.*}} -> %[[I_MEM:.*]] : !fir.ref) { !CHECK-NEXT: omp.loop_nest (%[[IV:.*]]) : i32 = (%[[LB]]) to (%[[UB]]) inclusive step (%[[STEP]]) { +!CHECK: %[[I:.*]]:2 = hlfir.declare %[[I_MEM]] {uniq_name = "_QFlastprivate_iv_incEi"} : (!fir.ref) -> (!fir.ref, !fir.ref) !CHECK: fir.store %[[IV]] to %[[I]]#1 : !fir.ref !CHECK: %[[V:.*]] = arith.addi %[[IV]], %[[STEP]] : i32 !CHECK: %[[C0:.*]] = arith.constant 0 : i32 @@ -42,15 +40,12 @@ subroutine lastprivate_iv_inc() !CHECK: %[[I2_MEM:.*]] = fir.alloca i32 {bindc_name = "i", uniq_name = "_QFlastprivate_iv_decEi"} !CHECK: %[[I2:.*]]:2 = hlfir.declare %[[I2_MEM]] {uniq_name = "_QFlastprivate_iv_decEi"} : (!fir.ref) -> (!fir.ref, !fir.ref) - -!CHECK: %[[I_MEM:.*]] = fir.alloca i32 {bindc_name = "i", pinned, {{.*}}} -!CHECK: %[[I:.*]]:2 = hlfir.declare %[[I_MEM]] {uniq_name = 
"_QFlastprivate_iv_decEi"} : (!fir.ref) -> (!fir.ref, !fir.ref) - !CHECK: %[[LB:.*]] = arith.constant 10 : i32 !CHECK: %[[UB:.*]] = arith.constant 1 : i32 !CHECK: %[[STEP:.*]] = arith.constant -3 : i32 -!CHECK: omp.wsloop { +!CHECK: omp.wsloop private(@{{.*}} %{{.*}} -> %[[I_MEM:.*]] : !fir.ref) { !CHECK-NEXT: omp.loop_nest (%[[IV:.*]]) : i32 = (%[[LB]]) to (%[[UB]]) inclusive step (%[[STEP]]) { +!CHECK: %[[I:.*]]:2 = hlfir.declare %[[I_MEM]] {uniq_name = "_QFlastprivate_iv_decEi"} : (!fir.ref) -> (!fir.ref, !fir.ref) !CHECK: fir.store %[[IV]] to %[[I]]#1 : !fir.ref !CHECK: %[[V:.*]] = arith.addi %[[IV]], %[[STEP]] : i32 !CHECK: %[[C0:.*]] = arith.constant 0 : i32 @@ -80,7 +75,7 @@ subroutine lastprivate_iv_dec() subroutine lastprivate_iv_i1 integer*1 :: i1 i1=0 -!CHECK: omp.wsloop +!CHECK: omp.wsloop private({{.*}}) !CHECK: omp.loop_nest !CHECK: fir.if %{{.*}} { !CHECK: %[[I8_VAL:.*]] = fir.convert %{{.*}} : (i32) -> i8 diff --git a/flang/test/Lower/OpenMP/location.f90 b/flang/test/Lower/OpenMP/location.f90 index 2dab22a1c1f90..fc7dd43499863 100644 --- a/flang/test/Lower/OpenMP/location.f90 +++ b/flang/test/Lower/OpenMP/location.f90 @@ -28,7 +28,7 @@ subroutine sub_target() !CHECK-LABEL: sub_loop subroutine sub_loop() -!CHECK: omp.wsloop { +!CHECK: omp.wsloop private({{.*}}) { !CHECK-NEXT: omp.loop_nest {{.*}} { !$omp do do i=1,10 diff --git a/flang/test/Lower/OpenMP/order-clause.f90 b/flang/test/Lower/OpenMP/order-clause.f90 index a30d82979021d..1f678e02708da 100644 --- a/flang/test/Lower/OpenMP/order-clause.f90 +++ b/flang/test/Lower/OpenMP/order-clause.f90 @@ -20,15 +20,15 @@ end subroutine simd_order !CHECK-LABEL: func.func @_QPdo_order() { subroutine do_order - !CHECK: omp.wsloop order(reproducible:concurrent) { + !CHECK: omp.wsloop order(reproducible:concurrent) private({{.*}}) { !$omp do order(concurrent) do i = 1, 10 end do - !CHECK: omp.wsloop order(reproducible:concurrent) { + !CHECK: omp.wsloop order(reproducible:concurrent) private({{.*}}) { !$omp do 
order(reproducible:concurrent) do i = 1, 10 end do - !CHECK: omp.wsloop order(unconstrained:concurrent) { + !CHECK: omp.wsloop order(unconstrained:concurrent) private({{.*}}) { !$omp do order(unconstrained:concurrent) do i = 1, 10 end do diff --git a/flang/test/Lower/OpenMP/parallel-lastprivate-clause-scalar.f90 b/flang/test/Lower/OpenMP/parallel-lastprivate-clause-scalar.f90 index 86309a24f91a0..531413c124f81 100644 --- a/flang/test/Lower/OpenMP/parallel-lastprivate-clause-scalar.f90 +++ b/flang/test/Lower/OpenMP/parallel-lastprivate-clause-scalar.f90 @@ -10,12 +10,12 @@ !CHECK-DAG: %[[ARG1_DECL:.*]]:2 = hlfir.declare %[[ARG1_REF]] typeparams %[[FIVE]] dummy_scope %{{[0-9]+}} {uniq_name = "_QFlastprivate_characterEarg1"} : (!fir.ref>, index, !fir.dscope) -> (!fir.ref>, !fir.ref>) !CHECK: omp.parallel { -!CHECK-DAG: %[[ARG1_PVT:.*]] = fir.alloca !fir.char<1,5> {bindc_name = "arg1", pinned, {{.*}}} -!CHECK-DAG: %[[ARG1_PVT_DECL:.*]]:2 = hlfir.declare %[[ARG1_PVT]] typeparams %[[FIVE]] {uniq_name = "_QFlastprivate_characterEarg1"} : (!fir.ref>, index) -> (!fir.ref>, !fir.ref>) ! 
Check that we are accessing the clone inside the loop -!CHECK: omp.wsloop { +!CHECK: omp.wsloop private(@{{.*}} %{{.*}}#0 -> %[[ARG1_PVT:.*]], @{{.*}} %{{.*}}#0 -> %{{.*}} : !fir.ref>, !{{.*}}) { !CHECK-NEXT: omp.loop_nest (%[[INDX_WS:.*]]) : {{.*}} { +!CHECK: %[[FIVE:.*]] = arith.constant 5 : index +!CHECK: %[[ARG1_PVT_DECL:.*]]:2 = hlfir.declare %[[ARG1_PVT]] typeparams %[[FIVE]] {uniq_name = "_QFlastprivate_characterEarg1"} : (!fir.ref>, index) -> (!fir.ref>, !fir.ref>) !CHECK: %[[UNIT:.*]] = arith.constant 6 : i32 !CHECK-NEXT: %[[ADDR:.*]] = fir.address_of(@_QQclX !CHECK-NEXT: %[[CVT0:.*]] = fir.convert %[[ADDR]] @@ -58,10 +58,9 @@ subroutine lastprivate_character(arg1) !CHECK: func @_QPlastprivate_int(%[[ARG1:.*]]: !fir.ref {fir.bindc_name = "arg1"}) { !CHECK: %[[ARG1_DECL:.*]]:2 = hlfir.declare %[[ARG1]] dummy_scope %{{[0-9]+}} {uniq_name = "_QFlastprivate_intEarg1"} : (!fir.ref, !fir.dscope) -> (!fir.ref, !fir.ref) !CHECK-DAG: omp.parallel { -!CHECK-DAG: %[[CLONE:.*]] = fir.alloca i32 {bindc_name = "arg1", pinned, {{.*}}} -!CHECK-DAG: %[[CLONE_DECL:.*]]:2 = hlfir.declare %[[CLONE]] {uniq_name = "_QFlastprivate_intEarg1"} : (!fir.ref) -> (!fir.ref, !fir.ref) -!CHECK: omp.wsloop { +!CHECK: omp.wsloop private(@{{.*}} %{{.*}}#0 -> %[[CLONE:.*]], @{{.*}} %{{.*}}#0 -> %{{.*}} : !fir.ref, !{{.*}}) { !CHECK-NEXT: omp.loop_nest (%[[INDX_WS:.*]]) : {{.*}} { +!CHECK: %[[CLONE_DECL:.*]]:2 = hlfir.declare %[[CLONE]] {uniq_name = "_QFlastprivate_intEarg1"} : (!fir.ref) -> (!fir.ref, !fir.ref) ! 
Testing last iteration check !CHECK: %[[V:.*]] = arith.addi %[[INDX_WS]], %{{.*}} : i32 @@ -98,12 +97,10 @@ subroutine lastprivate_int(arg1) !CHECK: %[[ARG1_DECL:.*]]:2 = hlfir.declare %[[ARG1]] dummy_scope %{{[0-9]+}} {uniq_name = "_QFmult_lastprivate_intEarg1"} : (!fir.ref, !fir.dscope) -> (!fir.ref, !fir.ref) !CHECK: %[[ARG2_DECL:.*]]:2 = hlfir.declare %[[ARG2]] dummy_scope %{{[0-9]+}} {uniq_name = "_QFmult_lastprivate_intEarg2"} : (!fir.ref, !fir.dscope) -> (!fir.ref, !fir.ref) !CHECK: omp.parallel { -!CHECK-DAG: %[[CLONE1:.*]] = fir.alloca i32 {bindc_name = "arg1", pinned, {{.*}}} +!CHECK: omp.wsloop private(@{{.*}} %{{.*}}#0 -> %[[CLONE1:.*]], @{{.*}} %{{.*}}#0 -> %[[CLONE2:.*]], @{{.*}} %{{.*}}#0 -> %{{.*}} : !fir.ref, !fir.ref, !{{.*}}) { +!CHECK-NEXT: omp.loop_nest (%[[INDX_WS:.*]]) : {{.*}} { !CHECK-DAG: %[[CLONE1_DECL:.*]]:2 = hlfir.declare %[[CLONE1]] {uniq_name = "_QFmult_lastprivate_intEarg1"} : (!fir.ref) -> (!fir.ref, !fir.ref) -!CHECK-DAG: %[[CLONE2:.*]] = fir.alloca i32 {bindc_name = "arg2", pinned, {{.*}}} !CHECK-DAG: %[[CLONE2_DECL:.*]]:2 = hlfir.declare %[[CLONE2]] {uniq_name = "_QFmult_lastprivate_intEarg2"} : (!fir.ref) -> (!fir.ref, !fir.ref) -!CHECK: omp.wsloop { -!CHECK-NEXT: omp.loop_nest (%[[INDX_WS:.*]]) : {{.*}} { ! 
Testing last iteration check !CHECK: %[[V:.*]] = arith.addi %[[INDX_WS]], %{{.*}} : i32 @@ -142,12 +139,10 @@ subroutine mult_lastprivate_int(arg1, arg2) !CHECK: %[[ARG1_DECL:.*]]:2 = hlfir.declare %[[ARG1]] dummy_scope %{{[0-9]+}} {uniq_name = "_QFmult_lastprivate_int2Earg1"} : (!fir.ref, !fir.dscope) -> (!fir.ref, !fir.ref) !CHECK: %[[ARG2_DECL:.*]]:2 = hlfir.declare %[[ARG2]] dummy_scope %{{[0-9]+}} {uniq_name = "_QFmult_lastprivate_int2Earg2"} : (!fir.ref, !fir.dscope) -> (!fir.ref, !fir.ref) !CHECK: omp.parallel { -!CHECK-DAG: %[[CLONE2:.*]] = fir.alloca i32 {bindc_name = "arg2", pinned, {{.*}}} -!CHECK-DAG: %[[CLONE2_DECL:.*]]:2 = hlfir.declare %[[CLONE2]] {uniq_name = "_QFmult_lastprivate_int2Earg2"} : (!fir.ref) -> (!fir.ref, !fir.ref) -!CHECK-DAG: %[[CLONE1:.*]] = fir.alloca i32 {bindc_name = "arg1", pinned, {{.*}}} -!CHECK-DAG: %[[CLONE1_DECL:.*]]:2 = hlfir.declare %[[CLONE1]] {uniq_name = "_QFmult_lastprivate_int2Earg1"} : (!fir.ref) -> (!fir.ref, !fir.ref) -!CHECK: omp.wsloop { +!CHECK: omp.wsloop private(@{{.*}} %{{.*}}#0 -> %[[CLONE1:.*]], @{{.*}} %{{.*}}#0 -> %[[CLONE2:.*]], @{{.*}} %{{.*}}#0 -> %{{.*}} : !fir.ref, !fir.ref, !{{.*}}) { !CHECK-NEXT: omp.loop_nest (%[[INDX_WS:.*]]) : {{.*}} { +!CHECK-DAG: %[[CLONE1_DECL:.*]]:2 = hlfir.declare %[[CLONE1]] {uniq_name = "_QFmult_lastprivate_int2Earg1"} : (!fir.ref) -> (!fir.ref, !fir.ref) +!CHECK-DAG: %[[CLONE2_DECL:.*]]:2 = hlfir.declare %[[CLONE2]] {uniq_name = "_QFmult_lastprivate_int2Earg2"} : (!fir.ref) -> (!fir.ref, !fir.ref) !Testing last iteration check !CHECK: %[[V:.*]] = arith.addi %[[INDX_WS]], %{{.*}} : i32 @@ -187,16 +182,11 @@ subroutine mult_lastprivate_int2(arg1, arg2) !CHECK: %[[ARG2_DECL:.*]]:2 = hlfir.declare %[[ARG2]] dummy_scope %{{[0-9]+}} {uniq_name = "_QFfirstpriv_lastpriv_intEarg2"} : (!fir.ref, !fir.dscope) -> (!fir.ref, !fir.ref) !CHECK: omp.parallel { ! 
Firstprivate update -!CHECK: %[[CLONE1:.*]] = fir.alloca i32 {bindc_name = "arg1", pinned, {{.*}}} -!CHECK: %[[CLONE1_DECL:.*]]:2 = hlfir.declare %[[CLONE1]] {uniq_name = "_QFfirstpriv_lastpriv_intEarg1"} : (!fir.ref) -> (!fir.ref, !fir.ref) -!CHECK: %[[FPV_LD:.*]] = fir.load %[[ARG1_DECL]]#0 : !fir.ref -!CHECK: hlfir.assign %[[FPV_LD]] to %[[CLONE1_DECL]]#0 : i32, !fir.ref -! Lastprivate Allocation -!CHECK: %[[CLONE2:.*]] = fir.alloca i32 {bindc_name = "arg2", pinned, {{.*}}} -!CHECK: %[[CLONE2_DECL:.*]]:2 = hlfir.declare %[[CLONE2]] {uniq_name = "_QFfirstpriv_lastpriv_intEarg2"} : (!fir.ref) -> (!fir.ref, !fir.ref) !CHECK-NOT: omp.barrier -!CHECK: omp.wsloop { +!CHECK: omp.wsloop private(@{{.*}} %{{.*}}#0 -> %[[CLONE1:.*]], @{{.*}} %{{.*}}#0 -> %[[CLONE2:.*]], @{{.*}} %{{.*}}#0 -> %{{.*}} : !fir.ref, !fir.ref, !{{.*}}) { !CHECK-NEXT: omp.loop_nest (%[[INDX_WS:.*]]) : {{.*}} { +!CHECK: %[[CLONE1_DECL:.*]]:2 = hlfir.declare %[[CLONE1]] {uniq_name = "_QFfirstpriv_lastpriv_intEarg1"} : (!fir.ref) -> (!fir.ref, !fir.ref) +!CHECK: %[[CLONE2_DECL:.*]]:2 = hlfir.declare %[[CLONE2]] {uniq_name = "_QFfirstpriv_lastpriv_intEarg2"} : (!fir.ref) -> (!fir.ref, !fir.ref) ! Testing last iteration check !CHECK: %[[V:.*]] = arith.addi %[[INDX_WS]], %{{.*}} : i32 @@ -234,17 +224,14 @@ subroutine firstpriv_lastpriv_int(arg1, arg2) !CHECK: omp.parallel { ! 
Firstprivate update -!CHECK: %[[CLONE1:.*]] = fir.alloca i32 {bindc_name = "arg1", pinned, {{.*}}} -!CHECK: %[[CLONE1_DECL:.*]]:2 = hlfir.declare %[[CLONE1]] {uniq_name = "_QFfirstpriv_lastpriv_int2Earg1"} : (!fir.ref) -> (!fir.ref, !fir.ref) -!CHECK-NEXT: %[[FPV_LD:.*]] = fir.load %[[ARG1_DECL]]#0 : !fir.ref -!CHECK-NEXT: hlfir.assign %[[FPV_LD]] to %[[CLONE1_DECL]]#0 : i32, !fir.ref -!CHECK-NEXT: %[[IV:.*]] = fir.alloca i32 {bindc_name = "n", pinned, {{.*}}} -!CHECK-NEXT: hlfir.declare %[[IV]] !CHECK-NEXT: omp.barrier -!CHECK: omp.wsloop { +!CHECK: omp.wsloop private(@{{.*}} %{{.*}}#0 -> %[[CLONE1:.*]], @{{.*}} %{{.*}}#0 -> %[[IV:.*]] : !fir.ref, !fir.ref) { !CHECK-NEXT: omp.loop_nest (%[[INDX_WS:.*]]) : {{.*}} { +!CHECK: %[[CLONE1_DECL:.*]]:2 = hlfir.declare %[[CLONE1]] {uniq_name = "_QFfirstpriv_lastpriv_int2Earg1"} : (!fir.ref) -> (!fir.ref, !fir.ref) + +!CHECK-NEXT: hlfir.declare %[[IV]] ! Testing last iteration check !CHECK: %[[V:.*]] = arith.addi %[[INDX_WS]], %{{.*}} : i32 !CHECK: %[[C0:.*]] = arith.constant 0 : i32 diff --git a/flang/test/Lower/OpenMP/parallel-private-clause-fixes.f90 b/flang/test/Lower/OpenMP/parallel-private-clause-fixes.f90 index 99323e69113bc..e8ac8e7f62122 100644 --- a/flang/test/Lower/OpenMP/parallel-private-clause-fixes.f90 +++ b/flang/test/Lower/OpenMP/parallel-private-clause-fixes.f90 @@ -56,17 +56,15 @@ ! CHECK-DAG: %[[VAL_2:.*]] = fir.alloca i32 {bindc_name = "x", uniq_name = "_QFmultiple_private_fixEx"} ! CHECK-DAG: %[[X_DECL:.*]]:2 = hlfir.declare %[[VAL_2]] {uniq_name = "_QFmultiple_private_fixEx"} : (!fir.ref) -> (!fir.ref, !fir.ref) ! CHECK: omp.parallel { -! CHECK-DAG: %[[PRIV_I:.*]] = fir.alloca i32 {bindc_name = "i", pinned, {{.*}}} -! CHECK-DAG: %[[PRIV_I_DECL:.*]]:2 = hlfir.declare %[[PRIV_I]] {uniq_name = "_QFmultiple_private_fixEi"} : (!fir.ref) -> (!fir.ref, !fir.ref) -! CHECK-DAG: %[[PRIV_J:.*]] = fir.alloca i32 {bindc_name = "j", pinned, uniq_name = "_QFmultiple_private_fixEj"} -! 
CHECK-DAG: %[[PRIV_J_DECL:.*]]:2 = hlfir.declare %[[PRIV_J]] {uniq_name = "_QFmultiple_private_fixEj"} : (!fir.ref) -> (!fir.ref, !fir.ref) -! CHECK-DAG: %[[PRIV_X:.*]] = fir.alloca i32 {bindc_name = "x", pinned, {{.*}}} -! CHECK-DAG: %[[PRIV_X_DECL:.*]]:2 = hlfir.declare %[[PRIV_X]] {uniq_name = "_QFmultiple_private_fixEx"} : (!fir.ref) -> (!fir.ref, !fir.ref) + ! CHECK: %[[ONE:.*]] = arith.constant 1 : i32 ! CHECK: %[[VAL_3:.*]] = fir.load %[[GAMA_DECL]]#0 : !fir.ref ! CHECK: %[[VAL_5:.*]] = arith.constant 1 : i32 -! CHECK: omp.wsloop { +! CHECK: omp.wsloop private(@{{.*}} %{{.*}}#0 -> %[[PRIV_J:.*]], @{{.*}} %{{.*}}#0 -> %[[PRIV_X:.*]], @{{.*}} %{{.*}}#0 -> %[[PRIV_I:.*]] : !fir.ref, !fir.ref, !fir.ref) { ! CHECK-NEXT: omp.loop_nest (%[[VAL_6:.*]]) : i32 = (%[[ONE]]) to (%[[VAL_3]]) inclusive step (%[[VAL_5]]) { +! CHECK-DAG: %[[PRIV_I_DECL:.*]]:2 = hlfir.declare %[[PRIV_I]] {uniq_name = "_QFmultiple_private_fixEi"} : (!fir.ref) -> (!fir.ref, !fir.ref) +! CHECK-DAG: %[[PRIV_J_DECL:.*]]:2 = hlfir.declare %[[PRIV_J]] {uniq_name = "_QFmultiple_private_fixEj"} : (!fir.ref) -> (!fir.ref, !fir.ref) +! CHECK-DAG: %[[PRIV_X_DECL:.*]]:2 = hlfir.declare %[[PRIV_X]] {uniq_name = "_QFmultiple_private_fixEx"} : (!fir.ref) -> (!fir.ref, !fir.ref) ! CHECK: fir.store %[[VAL_6]] to %[[PRIV_I_DECL]]#1 : !fir.ref ! CHECK: %[[VAL_7:.*]] = arith.constant 1 : i32 ! CHECK: %[[VAL_8:.*]] = fir.convert %[[VAL_7]] : (i32) -> index diff --git a/flang/test/Lower/OpenMP/parallel-private-clause.f90 b/flang/test/Lower/OpenMP/parallel-private-clause.f90 index 7114314df05d3..f26b97b55d51a 100644 --- a/flang/test/Lower/OpenMP/parallel-private-clause.f90 +++ b/flang/test/Lower/OpenMP/parallel-private-clause.f90 @@ -271,17 +271,16 @@ subroutine simple_loop_1 !$OMP PARALLEL PRIVATE(r) ! FIRDialect: %[[R_DECL:.*]]:2 = hlfir.declare %[[R]] {fortran_attrs = #fir.var_attrs, uniq_name = "_QFsimple_loop_1Er"} : (!fir.ref>>) -> (!fir.ref>>, !fir.ref>>) - ! 
FIRDialect: %[[ALLOCA_IV:.*]] = fir.alloca i32 {{{.*}}, pinned, {{.*}}} - ! FIRDialect: %[[ALLOCA_IV_DECL:.*]]:2 = hlfir.declare %[[ALLOCA_IV]] {uniq_name = "_QFsimple_loop_1Ei"} : (!fir.ref) -> (!fir.ref, !fir.ref) ! FIRDialect: %[[WS_LB:.*]] = arith.constant 1 : i32 ! FIRDialect: %[[WS_UB:.*]] = arith.constant 9 : i32 ! FIRDialect: %[[WS_STEP:.*]] = arith.constant 1 : i32 - ! FIRDialect: omp.wsloop { + ! FIRDialect: omp.wsloop private(@{{.*}} %{{.*}}#0 -> %[[ALLOCA_IV:.*]] : !fir.ref) { ! FIRDialect-NEXT: omp.loop_nest (%[[I:.*]]) : i32 = (%[[WS_LB]]) to (%[[WS_UB]]) inclusive step (%[[WS_STEP]]) { !$OMP DO do i=1, 9 + ! FIRDialect: %[[ALLOCA_IV_DECL:.*]]:2 = hlfir.declare %[[ALLOCA_IV]] {uniq_name = "_QFsimple_loop_1Ei"} : (!fir.ref) -> (!fir.ref, !fir.ref) ! FIRDialect: fir.store %[[I]] to %[[ALLOCA_IV_DECL]]#1 : !fir.ref ! FIRDialect: %[[LOAD_IV:.*]] = fir.load %[[ALLOCA_IV_DECL]]#0 : !fir.ref ! FIRDialect: fir.call @_FortranAioOutputInteger32({{.*}}, %[[LOAD_IV]]) {{.*}} : (!fir.ref, i32) -> i1 @@ -299,34 +298,23 @@ subroutine simple_loop_2 real, allocatable :: r; ! FIRDialect: omp.parallel !$OMP PARALLEL - ! FIRDialect: [[R:%.*]] = fir.alloca !fir.box> {bindc_name = "r", pinned, uniq_name = "{{.*}}Er"} - ! FIRDialect: fir.store {{%.*}} to [[R]] : !fir.ref>> - ! FIRDialect: fir.store {{%.*}} to [[R]] : !fir.ref>> - ! FIRDialect: %[[R_DECL:.*]]:2 = hlfir.declare [[R]] {fortran_attrs = #fir.var_attrs, uniq_name = "{{.*}}Er"} : (!fir.ref>>) -> (!fir.ref>>, !fir.ref>>) - - ! FIRDialect: %[[ALLOCA_IV:.*]] = fir.alloca i32 {{{.*}}, pinned, {{.*}}} - ! FIRDialect: %[[ALLOCA_IV_DECL:.*]]:2 = hlfir.declare %[[ALLOCA_IV]] {uniq_name = "{{.*}}Ei"} : (!fir.ref) -> (!fir.ref, !fir.ref) ! FIRDialect: %[[WS_LB:.*]] = arith.constant 1 : i32 ! FIRDialect: %[[WS_UB:.*]] = arith.constant 9 : i32 ! FIRDialect: %[[WS_STEP:.*]] = arith.constant 1 : i32 - ! FIRDialect: omp.wsloop { + ! 
FIRDialect: omp.wsloop private(@{{.*}} %{{.*}}#0 -> %[[R:.*]], @{{.*}} %{{.*}}#0 -> %[[ALLOCA_IV:.*]] : !fir.ref>>, !fir.ref) { ! FIRDialect-NEXT: omp.loop_nest (%[[I:.*]]) : i32 = (%[[WS_LB]]) to (%[[WS_UB]]) inclusive step (%[[WS_STEP]]) { !$OMP DO PRIVATE(r) do i=1, 9 + ! FIRDialect: %[[R_DECL:.*]]:2 = hlfir.declare %[[R]] {fortran_attrs = #fir.var_attrs, uniq_name = "{{.*}}Er"} : (!fir.ref>>) -> (!fir.ref>>, !fir.ref>>) + ! FIRDialect: %[[ALLOCA_IV_DECL:.*]]:2 = hlfir.declare %[[ALLOCA_IV]] {uniq_name = "{{.*}}Ei"} : (!fir.ref) -> (!fir.ref, !fir.ref) ! FIRDialect: fir.store %[[I]] to %[[ALLOCA_IV_DECL]]#1 : !fir.ref ! FIRDialect: %[[LOAD_IV:.*]] = fir.load %[[ALLOCA_IV_DECL]]#0 : !fir.ref ! FIRDialect: fir.call @_FortranAioOutputInteger32({{.*}}, %[[LOAD_IV]]) {{.*}}: (!fir.ref, i32) -> i1 print*, i end do ! FIRDialect: omp.yield - ! FIRDialect: {{%.*}} = fir.load %[[R_DECL]]#0 : !fir.ref>> - ! FIRDialect: fir.if {{%.*}} { - ! FIRDialect: [[LD:%.*]] = fir.load %[[R_DECL]]#0 : !fir.ref>> - ! FIRDialect: [[AD:%.*]] = fir.box_addr [[LD]] : (!fir.box>) -> !fir.heap - ! FIRDialect: fir.freemem [[AD]] : !fir.heap - ! FIRDialect: fir.store {{%.*}} to %[[R_DECL]]#0 : !fir.ref>> !$OMP END DO ! FIRDialect: omp.terminator !$OMP END PARALLEL @@ -337,35 +325,24 @@ subroutine simple_loop_3 integer :: i real, allocatable :: r; ! FIRDialect: omp.parallel - - ! FIRDialect: [[R:%.*]] = fir.alloca !fir.box> {bindc_name = "r", pinned, uniq_name = "{{.*}}Er"} - ! FIRDialect: fir.store {{%.*}} to [[R]] : !fir.ref>> - ! FIRDialect: fir.store {{%.*}} to [[R]] : !fir.ref>> - ! FIRDialect: [[R_DECL:%.*]]:2 = hlfir.declare [[R]] {fortran_attrs = #fir.var_attrs, uniq_name = "{{.*}}Er"} : (!fir.ref>>) -> (!fir.ref>>, !fir.ref>>) - - ! FIRDialect: %[[ALLOCA_IV:.*]] = fir.alloca i32 {{{.*}}, pinned, {{.*}}} - ! FIRDialect: %[[ALLOCA_IV_DECL:.*]]:2 = hlfir.declare %[[ALLOCA_IV]] {uniq_name = "{{.*}}Ei"} : (!fir.ref) -> (!fir.ref, !fir.ref) - ! 
FIRDialect: %[[WS_LB:.*]] = arith.constant 1 : i32 ! FIRDialect: %[[WS_UB:.*]] = arith.constant 9 : i32 ! FIRDialect: %[[WS_STEP:.*]] = arith.constant 1 : i32 - ! FIRDialect: omp.wsloop { + ! FIRDialect: omp.wsloop private(@{{.*}} %{{.*}}#0 -> %[[R:.*]], @{{.*}} %{{.*}}#0 -> %[[ALLOCA_IV:.*]] : !fir.ref>>, !fir.ref) { ! FIRDialect-NEXT: omp.loop_nest (%[[I:.*]]) : i32 = (%[[WS_LB]]) to (%[[WS_UB]]) inclusive step (%[[WS_STEP]]) { !$OMP PARALLEL DO PRIVATE(r) do i=1, 9 + ! FIRDialect: [[R_DECL:%.*]]:2 = hlfir.declare %[[R]] {fortran_attrs = #fir.var_attrs, uniq_name = "{{.*}}Er"} : (!fir.ref>>) -> (!fir.ref>>, !fir.ref>>) + + ! FIRDialect: %[[ALLOCA_IV_DECL:.*]]:2 = hlfir.declare %[[ALLOCA_IV]] {uniq_name = "{{.*}}Ei"} : (!fir.ref) -> (!fir.ref, !fir.ref) + ! FIRDialect: fir.store %[[I]] to %[[ALLOCA_IV_DECL:.*]]#1 : !fir.ref ! FIRDialect: %[[LOAD_IV:.*]] = fir.load %[[ALLOCA_IV_DECL]]#0 : !fir.ref ! FIRDialect: fir.call @_FortranAioOutputInteger32({{.*}}, %[[LOAD_IV]]) {{.*}}: (!fir.ref, i32) -> i1 print*, i end do ! FIRDialect: omp.yield - ! FIRDialect: {{%.*}} = fir.load [[R_DECL]]#0 : !fir.ref>> - ! FIRDialect: fir.if {{%.*}} { - ! FIRDialect: [[LD:%.*]] = fir.load [[R_DECL]]#0 : !fir.ref>> - ! FIRDialect: [[AD:%.*]] = fir.box_addr [[LD]] : (!fir.box>) -> !fir.heap - ! FIRDialect: fir.freemem [[AD]] : !fir.heap - ! FIRDialect: fir.store {{%.*}} to [[R_DECL]]#0 : !fir.ref>> !$OMP END PARALLEL DO ! FIRDialect: omp.terminator end subroutine diff --git a/flang/test/Lower/OpenMP/parallel-reduction-allocatable-array.f90 b/flang/test/Lower/OpenMP/parallel-reduction-allocatable-array.f90 index dabd495d733b5..11d5682209676 100644 --- a/flang/test/Lower/OpenMP/parallel-reduction-allocatable-array.f90 +++ b/flang/test/Lower/OpenMP/parallel-reduction-allocatable-array.f90 @@ -92,13 +92,12 @@ program reduce ! CHECK: %[[VAL_11:.*]] = fir.embox %[[VAL_9]](%[[VAL_10]]) : (!fir.heap>, !fir.shape<1>) -> !fir.box>> ! CHECK: fir.store %[[VAL_11]] to %[[VAL_3]]#1 : !fir.ref>>> ! 
CHECK: omp.parallel { -! CHECK: %[[VAL_12:.*]] = fir.alloca i32 {bindc_name = "i", pinned, {{.*}}} -! CHECK: %[[VAL_13:.*]]:2 = hlfir.declare %[[VAL_12]] {uniq_name = "_QFEi"} : (!fir.ref) -> (!fir.ref, !fir.ref) ! CHECK: %[[VAL_14:.*]] = arith.constant 0 : i32 ! CHECK: %[[VAL_15:.*]] = arith.constant 10 : i32 ! CHECK: %[[VAL_16:.*]] = arith.constant 1 : i32 -! CHECK: omp.wsloop reduction(byref @add_reduction_byref_box_heap_Uxi32 %[[VAL_3]]#0 -> %[[VAL_17:.*]] : !fir.ref>>>) { +! CHECK: omp.wsloop private(@{{.*}} %{{.*}}#0 -> %[[VAL_12:.*]] : !fir.ref) reduction(byref @add_reduction_byref_box_heap_Uxi32 %[[VAL_3]]#0 -> %[[VAL_17:.*]] : !fir.ref>>>) { ! CHECK-NEXT: omp.loop_nest (%[[VAL_18:.*]]) : i32 = (%[[VAL_14]]) to (%[[VAL_15]]) inclusive step (%[[VAL_16]]) { +! CHECK: %[[VAL_13:.*]]:2 = hlfir.declare %[[VAL_12]] {uniq_name = "_QFEi"} : (!fir.ref) -> (!fir.ref, !fir.ref) ! CHECK: %[[VAL_19:.*]]:2 = hlfir.declare %[[VAL_17]] {fortran_attrs = {{.*}}, uniq_name = "_QFEr"} : (!fir.ref>>>) -> (!fir.ref>>>, !fir.ref>>>) ! CHECK: fir.store %[[VAL_18]] to %[[VAL_13]]#1 : !fir.ref ! CHECK: %[[VAL_20:.*]] = fir.load %[[VAL_13]]#0 : !fir.ref diff --git a/flang/test/Lower/OpenMP/parallel-reduction-pointer-array.f90 b/flang/test/Lower/OpenMP/parallel-reduction-pointer-array.f90 index 01d8dc33f40e6..54fe53b5d6f6a 100644 --- a/flang/test/Lower/OpenMP/parallel-reduction-pointer-array.f90 +++ b/flang/test/Lower/OpenMP/parallel-reduction-pointer-array.f90 @@ -99,18 +99,17 @@ program reduce ! CHECK: %[[VAL_15:.*]] = fir.convert %[[VAL_3]]#1 : (!fir.ref>>>) -> !fir.ref> ! CHECK: %[[VAL_16:.*]] = fir.convert %[[VAL_12]] : (index) -> i64 ! CHECK: %[[VAL_17:.*]] = fir.convert %[[VAL_13]] : (i32) -> i64 -! CHECK: %[[VAL_18:.*]] = fir.call @_FortranAPointerSetBounds(%[[VAL_15]], %[[VAL_14]], %[[VAL_16]], %[[VAL_17]]) fastmath : (!fir.ref>, i32, i64, i64) -> none +! 
CHECK: fir.call @_FortranAPointerSetBounds(%[[VAL_15]], %[[VAL_14]], %[[VAL_16]], %[[VAL_17]]) fastmath : (!fir.ref>, i32, i64, i64) -> () ! CHECK: %[[VAL_19:.*]] = fir.convert %[[VAL_3]]#1 : (!fir.ref>>>) -> !fir.ref> ! CHECK: %[[VAL_20:.*]] = fir.convert %[[VAL_6]] : (!fir.ref>) -> !fir.ref ! CHECK: %[[VAL_21:.*]] = fir.call @_FortranAPointerAllocate(%[[VAL_19]], %[[VAL_4]], %[[VAL_5]], %[[VAL_20]], %[[VAL_7]]) fastmath : (!fir.ref>, i1, !fir.box, !fir.ref, i32) -> i32 ! CHECK: omp.parallel { -! CHECK: %[[VAL_22:.*]] = fir.alloca i32 {bindc_name = "i", pinned, uniq_name = "_QFEi"} -! CHECK: %[[VAL_23:.*]]:2 = hlfir.declare %[[VAL_22]] {uniq_name = "_QFEi"} : (!fir.ref) -> (!fir.ref, !fir.ref) ! CHECK: %[[VAL_24:.*]] = arith.constant 0 : i32 ! CHECK: %[[VAL_25:.*]] = arith.constant 10 : i32 ! CHECK: %[[VAL_26:.*]] = arith.constant 1 : i32 -! CHECK: omp.wsloop reduction(byref @add_reduction_byref_box_ptr_Uxi32 %[[VAL_3]]#0 -> %[[VAL_27:.*]] : !fir.ref>>>) { +! CHECK: omp.wsloop private(@{{.*}} %{{.*}}#0 -> %[[VAL_22:.*]] : !fir.ref) reduction(byref @add_reduction_byref_box_ptr_Uxi32 %[[VAL_3]]#0 -> %[[VAL_27:.*]] : !fir.ref>>>) { ! CHECK: omp.loop_nest (%[[VAL_28:.*]]) : i32 = (%[[VAL_24]]) to (%[[VAL_25]]) inclusive step (%[[VAL_26]]) { +! CHECK: %[[VAL_23:.*]]:2 = hlfir.declare %[[VAL_22]] {uniq_name = "_QFEi"} : (!fir.ref) -> (!fir.ref, !fir.ref) ! CHECK: %[[VAL_29:.*]]:2 = hlfir.declare %[[VAL_27]] {fortran_attrs = {{.*}}, uniq_name = "_QFEr"} : (!fir.ref>>>) -> (!fir.ref>>>, !fir.ref>>>) ! CHECK: fir.store %[[VAL_28]] to %[[VAL_23]]#1 : !fir.ref ! CHECK: %[[VAL_30:.*]] = fir.load %[[VAL_23]]#0 : !fir.ref diff --git a/flang/test/Lower/OpenMP/parallel-reduction3.f90 b/flang/test/Lower/OpenMP/parallel-reduction3.f90 index 70b4f0f12820b..194b3fdd98201 100644 --- a/flang/test/Lower/OpenMP/parallel-reduction3.f90 +++ b/flang/test/Lower/OpenMP/parallel-reduction3.f90 @@ -71,13 +71,12 @@ ! CHECK: omp.parallel { ! CHECK: %[[VAL_14:.*]] = fir.alloca !fir.box> ! 
CHECK: fir.store %[[VAL_12]]#0 to %[[VAL_14]] : !fir.ref>> -! CHECK: %[[VAL_15:.*]] = fir.alloca i32 {bindc_name = "i", pinned, {{.*}}} -! CHECK: %[[VAL_16:.*]]:2 = hlfir.declare %[[VAL_15]] {uniq_name = "_QFsEi"} : (!fir.ref) -> (!fir.ref, !fir.ref) ! CHECK: %[[VAL_17:.*]] = arith.constant 1 : i32 ! CHECK: %[[VAL_18:.*]] = arith.constant 100 : i32 ! CHECK: %[[VAL_19:.*]] = arith.constant 1 : i32 -! CHECK: omp.wsloop reduction(byref @add_reduction_byref_box_Uxi32 %[[VAL_14]] -> %[[VAL_20:.*]] : !fir.ref>>) { +! CHECK: omp.wsloop private(@{{.*}} %{{.*}}#0 -> %[[VAL_15:.*]] : !fir.ref) reduction(byref @add_reduction_byref_box_Uxi32 %[[VAL_14]] -> %[[VAL_20:.*]] : !fir.ref>>) { ! CHECK-NEXT: omp.loop_nest (%[[VAL_21:.*]]) : i32 = (%[[VAL_17]]) to (%[[VAL_18]]) inclusive step (%[[VAL_19]]) { +! CHECK: %[[VAL_16:.*]]:2 = hlfir.declare %[[VAL_15]] {uniq_name = "_QFsEi"} : (!fir.ref) -> (!fir.ref, !fir.ref) ! CHECK: %[[VAL_22:.*]]:2 = hlfir.declare %[[VAL_20]] {uniq_name = "_QFsEc"} : (!fir.ref>>) -> (!fir.ref>>, !fir.ref>>) ! CHECK: fir.store %[[VAL_21]] to %[[VAL_16]]#1 : !fir.ref ! CHECK: %[[VAL_23:.*]] = fir.load %[[VAL_22]]#0 : !fir.ref>> @@ -115,12 +114,12 @@ ! CHECK: %[[VAL_44:.*]] = arith.constant 1 : i32 ! CHECK: %[[VAL_45:.*]] = arith.constant false ! CHECK: %[[VAL_46:.*]] = arith.constant false -! CHECK: %[[VAL_47:.*]] = fir.call @_FortranAStopStatement(%[[VAL_44]], %[[VAL_45]], %[[VAL_46]]) fastmath : (i32, i1, i1) -> none +! CHECK: fir.call @_FortranAStopStatement(%[[VAL_44]], %[[VAL_45]], %[[VAL_46]]) fastmath : (i32, i1, i1) -> () ! CHECK: fir.unreachable ! CHECK: ^bb2: ! CHECK: return ! CHECK: } -! CHECK: func.func private @_FortranAStopStatement(i32, i1, i1) -> none attributes {fir.runtime} +! 
CHECK: func.func private @_FortranAStopStatement(i32, i1, i1) attributes {fir.runtime} subroutine s(x) integer :: x diff --git a/flang/test/Lower/OpenMP/parallel-wsloop-firstpriv.f90 b/flang/test/Lower/OpenMP/parallel-wsloop-firstpriv.f90 index 4e7c2c15df743..a0cdaaa4c7b09 100644 --- a/flang/test/Lower/OpenMP/parallel-wsloop-firstpriv.f90 +++ b/flang/test/Lower/OpenMP/parallel-wsloop-firstpriv.f90 @@ -11,20 +11,13 @@ subroutine omp_do_firstprivate(a) n = a+1 !$omp parallel do firstprivate(a) ! CHECK: omp.parallel { - - ! CHECK: %[[A_PVT_REF:.*]] = fir.alloca i32 {bindc_name = "a", pinned, uniq_name = "_QFomp_do_firstprivateEa"} - ! CHECK: %[[A_PVT_DECL:.*]]:2 = hlfir.declare %[[A_PVT_REF]] {uniq_name = "_QFomp_do_firstprivateEa"} : (!fir.ref) -> (!fir.ref, !fir.ref) - ! CHECK-NEXT: %[[LD:.*]] = fir.load %[[ARG0_DECL]]#0 : !fir.ref - ! CHECK-NEXT: hlfir.assign %[[LD]] to %[[A_PVT_DECL]]#0 : i32, !fir.ref - - ! CHECK: %[[I_PVT_REF:.*]] = fir.alloca i32 {bindc_name = "i", pinned, {{.*}}} - ! CHECK: %[[I_PVT_DECL:.*]]:2 = hlfir.declare %[[I_PVT_REF]] {uniq_name = "_QFomp_do_firstprivateEi"} : (!fir.ref) -> (!fir.ref, !fir.ref) - ! CHECK: %[[LB:.*]] = arith.constant 1 : i32 - ! CHECK-NEXT: %[[UB:.*]] = fir.load %[[A_PVT_DECL]]#0 : !fir.ref + ! CHECK-NEXT: %[[UB:.*]] = fir.load %[[ARG0_DECL]]#0 : !fir.ref ! CHECK-NEXT: %[[STEP:.*]] = arith.constant 1 : i32 - ! CHECK-NEXT: omp.wsloop { + ! CHECK-NEXT: omp.wsloop private(@{{.*a_firstprivate_ref_i32.*}} %{{.*}}#0 -> %[[A_PVT_REF:.*]], @{{.*i_private_ref_i32.*}} %{{.*}}#0 -> %[[I_PVT_REF:.*]] : !fir.ref, !fir.ref) { ! CHECK-NEXT: omp.loop_nest (%[[ARG1:.*]]) : i32 = (%[[LB]]) to (%[[UB]]) inclusive step (%[[STEP]]) { + ! CHECK: %[[A_PVT_DECL:.*]]:2 = hlfir.declare %[[A_PVT_REF]] {uniq_name = "_QFomp_do_firstprivateEa"} : (!fir.ref) -> (!fir.ref, !fir.ref) + ! CHECK: %[[I_PVT_DECL:.*]]:2 = hlfir.declare %[[I_PVT_REF]] {uniq_name = "_QFomp_do_firstprivateEi"} : (!fir.ref) -> (!fir.ref, !fir.ref) ! 
CHECK-NEXT: fir.store %[[ARG1]] to %[[I_PVT_DECL]]#1 : !fir.ref ! CHECK-NEXT: fir.call @_QPfoo(%[[I_PVT_DECL]]#1, %[[A_PVT_DECL]]#1) {{.*}}: (!fir.ref, !fir.ref) -> () ! CHECK-NEXT: omp.yield @@ -47,25 +40,14 @@ subroutine omp_do_firstprivate2(a, n) n = a+1 !$omp parallel do firstprivate(a, n) ! CHECK: omp.parallel { - - ! CHECK: %[[A_PVT_REF:.*]] = fir.alloca i32 {bindc_name = "a", pinned, {{.*}}} + ! CHECK: %[[LB:.*]] = fir.load %[[ARG0_DECL]]#0 : !fir.ref + ! CHECK: %[[UB:.*]] = fir.load %[[ARG1_DECL]]#0 : !fir.ref + ! CHECK: %[[STEP:.*]] = arith.constant 1 : i32 + ! CHECK: omp.wsloop private(@{{.*a_firstprivate_ref_i32}} %{{.*}}#0 -> %[[A_PVT_REF:.*]], @{{.*n_firstprivate_ref_i32}} %{{.*}}#0 -> %[[N_PVT_REF:.*]], @{{.*i_private_ref_i32}} %{{.*}}#0 -> %[[I_PVT_REF:.*]] : !fir.ref, !fir.ref, !fir.ref) { + ! CHECK-NEXT: omp.loop_nest (%[[ARG2:.*]]) : i32 = (%[[LB]]) to (%[[UB]]) inclusive step (%[[STEP]]) { ! CHECK: %[[A_PVT_DECL:.*]]:2 = hlfir.declare %[[A_PVT_REF]] {uniq_name = "_QFomp_do_firstprivate2Ea"} : (!fir.ref) -> (!fir.ref, !fir.ref) - ! CHECK: %[[LD:.*]] = fir.load %[[ARG0_DECL]]#0 : !fir.ref - ! CHECK: hlfir.assign %[[LD]] to %[[A_PVT_DECL]]#0 : i32, !fir.ref - - ! CHECK: %[[N_PVT_REF:.*]] = fir.alloca i32 {bindc_name = "n", pinned, uniq_name = "_QFomp_do_firstprivate2En"} ! CHECK: %[[N_PVT_DECL:.*]]:2 = hlfir.declare %[[N_PVT_REF]] {uniq_name = "_QFomp_do_firstprivate2En"} : (!fir.ref) -> (!fir.ref, !fir.ref) - ! CHECK: %[[LD1:.*]] = fir.load %[[ARG1_DECL]]#0 : !fir.ref - ! CHECK: hlfir.assign %[[LD1]] to %[[N_PVT_DECL]]#0 : i32, !fir.ref - - ! CHECK: %[[I_PVT_REF:.*]] = fir.alloca i32 {bindc_name = "i", pinned, {{.*}}} ! CHECK: %[[I_PVT_DECL:.*]]:2 = hlfir.declare %[[I_PVT_REF]] {uniq_name = "_QFomp_do_firstprivate2Ei"} : (!fir.ref) -> (!fir.ref, !fir.ref) - - ! CHECK: %[[LB:.*]] = fir.load %[[A_PVT_DECL]]#0 : !fir.ref - ! CHECK: %[[UB:.*]] = fir.load %[[N_PVT_DECL]]#0 : !fir.ref - ! CHECK: %[[STEP:.*]] = arith.constant 1 : i32 - ! 
CHECK: omp.wsloop { - ! CHECK-NEXT: omp.loop_nest (%[[ARG2:.*]]) : i32 = (%[[LB]]) to (%[[UB]]) inclusive step (%[[STEP]]) { ! CHECK: fir.store %[[ARG2]] to %[[I_PVT_DECL]]#1 : !fir.ref ! CHECK: fir.call @_QPfoo(%[[I_PVT_DECL]]#1, %[[A_PVT_DECL]]#1) {{.*}}: (!fir.ref, !fir.ref) -> () ! CHECK: omp.yield diff --git a/flang/test/Lower/OpenMP/parallel-wsloop-lastpriv.f90 b/flang/test/Lower/OpenMP/parallel-wsloop-lastpriv.f90 index dbde5291c01c8..a7c0dc3b1b406 100644 --- a/flang/test/Lower/OpenMP/parallel-wsloop-lastpriv.f90 +++ b/flang/test/Lower/OpenMP/parallel-wsloop-lastpriv.f90 @@ -12,17 +12,15 @@ subroutine omp_do_lastprivate(a) !$omp parallel do lastprivate(a) ! CHECK: omp.parallel { - ! CHECK: %[[A_PVT_REF:.*]] = fir.alloca i32 {bindc_name = "a", pinned, uniq_name = "_QFomp_do_lastprivateEa"} - ! CHECK: %[[A_PVT_DECL:.*]]:2 = hlfir.declare %[[A_PVT_REF]] {uniq_name = "_QFomp_do_lastprivateEa"} : (!fir.ref) -> (!fir.ref, !fir.ref) - - ! CHECK: %[[I_PVT_REF:.*]] = fir.alloca i32 {bindc_name = "i", pinned, {{.*}}} - ! CHECK: %[[I_PVT_DECL:.*]]:2 = hlfir.declare %[[I_PVT_REF]] {uniq_name = "_QFomp_do_lastprivateEi"} : (!fir.ref) -> (!fir.ref, !fir.ref) ! CHECK: %[[LB:.*]] = arith.constant 1 : i32 - ! CHECK-NEXT: %[[UB:.*]] = fir.load %[[A_PVT_DECL]]#0 : !fir.ref + ! CHECK-NEXT: %[[UB:.*]] = fir.load %[[ARG0_DECL]]#0 : !fir.ref ! CHECK-NEXT: %[[STEP:.*]] = arith.constant 1 : i32 - ! CHECK-NEXT: omp.wsloop { + ! CHECK-NEXT: omp.wsloop private(@{{.*}} %{{.*}}#0 -> %[[A_PVT_REF:.*]], @{{.*}} %{{.*}}#0 -> %[[I_PVT_REF:.*]] : !fir.ref, !fir.ref) { ! CHECK-NEXT: omp.loop_nest (%[[ARG1:.*]]) : i32 = (%[[LB]]) to (%[[UB]]) inclusive step (%[[STEP]]) { + ! CHECK: %[[A_PVT_DECL:.*]]:2 = hlfir.declare %[[A_PVT_REF]] {uniq_name = "_QFomp_do_lastprivateEa"} : (!fir.ref) -> (!fir.ref, !fir.ref) + ! CHECK: %[[I_PVT_DECL:.*]]:2 = hlfir.declare %[[I_PVT_REF]] {uniq_name = "_QFomp_do_lastprivateEi"} : (!fir.ref) -> (!fir.ref, !fir.ref) + ! 
CHECK-NEXT: fir.store %[[ARG1]] to %[[I_PVT_DECL]]#1 : !fir.ref ! CHECK-NEXT: fir.call @_QPfoo(%[[I_PVT_DECL]]#1, %[[A_PVT_DECL]]#1) {{.*}}: (!fir.ref, !fir.ref) -> () ! CHECK: %[[NEXT_ARG1:.*]] = arith.addi %[[ARG1]], %[[STEP]] : i32 @@ -58,20 +56,15 @@ subroutine omp_do_lastprivate2(a, n) !$omp parallel do lastprivate(a, n) ! CHECK: omp.parallel { - ! CHECK: %[[A_PVT_REF:.*]] = fir.alloca i32 {bindc_name = "a", pinned, {{.*}}} + ! CHECK: %[[LB:.*]] = fir.load %[[ARG0_DECL]]#0 : !fir.ref + ! CHECK: %[[UB:.*]] = fir.load %[[ARG1_DECL]]#0 : !fir.ref + ! CHECK: %[[STEP:.*]] = arith.constant 1 : i32 + ! CHECK: omp.wsloop private(@{{.*}} %{{.*}}#0 -> %[[A_PVT_REF:.*]], @{{.*}} %{{.*}}#0 -> %[[N_PVT_REF:.*]], @{{.*}} %{{.*}}#0 -> %[[I_PVT_REF:.*]] : !fir.ref, !fir.ref, !fir.ref) { + ! CHECK-NEXT: omp.loop_nest (%[[ARG2:.*]]) : i32 = (%[[LB]]) to (%[[UB]]) inclusive step (%[[STEP]]) { ! CHECK: %[[A_PVT_DECL:.*]]:2 = hlfir.declare %[[A_PVT_REF]] {uniq_name = "_QFomp_do_lastprivate2Ea"} : (!fir.ref) -> (!fir.ref, !fir.ref) - - ! CHECK: %[[N_PVT_REF:.*]] = fir.alloca i32 {bindc_name = "n", pinned, uniq_name = "_QFomp_do_lastprivate2En"} ! CHECK: %[[N_PVT_DECL:.*]]:2 = hlfir.declare %[[N_PVT_REF]] {uniq_name = "_QFomp_do_lastprivate2En"} : (!fir.ref) -> (!fir.ref, !fir.ref) - - ! CHECK: %[[I_PVT_REF:.*]] = fir.alloca i32 {bindc_name = "i", pinned, {{.*}}} ! CHECK: %[[I_PVT_DECL:.*]]:2 = hlfir.declare %[[I_PVT_REF]] {uniq_name = "_QFomp_do_lastprivate2Ei"} : (!fir.ref) -> (!fir.ref, !fir.ref) - ! CHECK: %[[LB:.*]] = fir.load %[[A_PVT_DECL]]#0 : !fir.ref - ! CHECK: %[[UB:.*]] = fir.load %[[N_PVT_DECL]]#0 : !fir.ref - ! CHECK: %[[STEP:.*]] = arith.constant 1 : i32 - ! CHECK: omp.wsloop { - ! CHECK-NEXT: omp.loop_nest (%[[ARG2:.*]]) : i32 = (%[[LB]]) to (%[[UB]]) inclusive step (%[[STEP]]) { ! CHECK: fir.store %[[ARG2]] to %[[I_PVT_DECL]]#1 : !fir.ref ! CHECK: fir.call @_QPfoo(%[[I_PVT_DECL]]#1, %[[A_PVT_DECL]]#1) {{.*}}: (!fir.ref, !fir.ref) -> () ! 
CHECK: %[[NEXT_ARG2:.*]] = arith.addi %[[ARG2]], %[[STEP]] : i32 @@ -104,23 +97,18 @@ subroutine omp_do_lastprivate_collapse2(a) !$omp parallel do lastprivate(a) collapse(2) ! CHECK: omp.parallel { - ! CHECK: %[[A_PVT_REF:.*]] = fir.alloca i32 {bindc_name = "a", pinned, uniq_name = "_QFomp_do_lastprivate_collapse2Ea"} - ! CHECK: %[[A_PVT_DECL:.*]]:2 = hlfir.declare %[[A_PVT_REF]] {uniq_name = "_QFomp_do_lastprivate_collapse2Ea"} : (!fir.ref) -> (!fir.ref, !fir.ref) - - ! CHECK: %[[I_PVT_REF:.*]] = fir.alloca i32 {bindc_name = "i", pinned, {{.*}}} - ! CHECK: %[[I_PVT_DECL:.*]]:2 = hlfir.declare %[[I_PVT_REF]] {uniq_name = "_QFomp_do_lastprivate_collapse2Ei"} : (!fir.ref) -> (!fir.ref, !fir.ref) - ! - ! CHECK: %[[J_PVT_REF:.*]] = fir.alloca i32 {bindc_name = "j", pinned, {{.*}}} - ! CHECK: %[[J_PVT_DECL:.*]]:2 = hlfir.declare %[[J_PVT_REF]] {uniq_name = "_QFomp_do_lastprivate_collapse2Ej"} : (!fir.ref) -> (!fir.ref, !fir.ref) - ! CHECK: %[[LB1:.*]] = arith.constant 1 : i32 - ! CHECK-NEXT: %[[UB1:.*]] = fir.load %[[A_PVT_DECL]]#0 : !fir.ref + ! CHECK-NEXT: %[[UB1:.*]] = fir.load %[[ARG0_DECL]]#0 : !fir.ref ! CHECK-NEXT: %[[STEP1:.*]] = arith.constant 1 : i32 ! CHECK: %[[LB2:.*]] = arith.constant 1 : i32 - ! CHECK-NEXT: %[[UB2:.*]] = fir.load %[[A_PVT_DECL]]#0 : !fir.ref + ! CHECK-NEXT: %[[UB2:.*]] = fir.load %[[ARG0_DECL]]#0 : !fir.ref ! CHECK-NEXT: %[[STEP2:.*]] = arith.constant 1 : i32 - ! CHECK-NEXT: omp.wsloop { + ! CHECK-NEXT: omp.wsloop private(@{{.*}} %{{.*}}#0 -> %[[A_PVT_REF:.*]], @{{.*}} %{{.*}}#0 -> %[[I_PVT_REF:.*]], @{{.*}} %{{.*}}#0 -> %[[J_PVT_REF:.*]] : !fir.ref, !fir.ref, !fir.ref) { ! CHECK-NEXT: omp.loop_nest (%[[ARG1:.*]], %[[ARG2:.*]]) : i32 = (%[[LB1]], %[[LB2]]) to (%[[UB1]], %[[UB2]]) inclusive step (%[[STEP1]], %[[STEP2]]) { + ! CHECK: %[[A_PVT_DECL:.*]]:2 = hlfir.declare %[[A_PVT_REF]] {uniq_name = "_QFomp_do_lastprivate_collapse2Ea"} : (!fir.ref) -> (!fir.ref, !fir.ref) + ! 
CHECK: %[[I_PVT_DECL:.*]]:2 = hlfir.declare %[[I_PVT_REF]] {uniq_name = "_QFomp_do_lastprivate_collapse2Ei"} : (!fir.ref) -> (!fir.ref, !fir.ref) + ! CHECK: %[[J_PVT_DECL:.*]]:2 = hlfir.declare %[[J_PVT_REF]] {uniq_name = "_QFomp_do_lastprivate_collapse2Ej"} : (!fir.ref) -> (!fir.ref, !fir.ref) + ! CHECK-NEXT: fir.store %[[ARG1]] to %[[I_PVT_DECL]]#1 : !fir.ref ! CHECK-NEXT: fir.store %[[ARG2]] to %[[J_PVT_DECL]]#1 : !fir.ref ! CHECK-NEXT: fir.call @_QPfoo(%[[I_PVT_DECL]]#1, %[[A_PVT_DECL]]#1) {{.*}}: (!fir.ref, !fir.ref) -> () @@ -164,29 +152,23 @@ subroutine omp_do_lastprivate_collapse3(a) !$omp parallel do lastprivate(a) collapse(3) ! CHECK: omp.parallel { - ! CHECK: %[[A_PVT_REF:.*]] = fir.alloca i32 {bindc_name = "a", pinned, uniq_name = "_QFomp_do_lastprivate_collapse3Ea"} - ! CHECK: %[[A_PVT_DECL:.*]]:2 = hlfir.declare %[[A_PVT_REF]] {uniq_name = "_QFomp_do_lastprivate_collapse3Ea"} : (!fir.ref) -> (!fir.ref, !fir.ref) - - ! CHECK: %[[I_PVT_REF:.*]] = fir.alloca i32 {bindc_name = "i", pinned, {{.*}}} - ! CHECK: %[[I_PVT_DECL:.*]]:2 = hlfir.declare %[[I_PVT_REF]] {uniq_name = "_QFomp_do_lastprivate_collapse3Ei"} : (!fir.ref) -> (!fir.ref, !fir.ref) - - ! CHECK: %[[J_PVT_REF:.*]] = fir.alloca i32 {bindc_name = "j", pinned, {{.*}}} - ! CHECK: %[[J_PVT_DECL:.*]]:2 = hlfir.declare %[[J_PVT_REF]] {uniq_name = "_QFomp_do_lastprivate_collapse3Ej"} : (!fir.ref) -> (!fir.ref, !fir.ref) - - ! CHECK: %[[K_PVT_REF:.*]] = fir.alloca i32 {bindc_name = "k", pinned, {{.*}}} - ! CHECK: %[[K_PVT_DECL:.*]]:2 = hlfir.declare %[[K_PVT_REF]] {uniq_name = "_QFomp_do_lastprivate_collapse3Ek"} : (!fir.ref) -> (!fir.ref, !fir.ref) ! CHECK: %[[LB1:.*]] = arith.constant 1 : i32 - ! CHECK-NEXT: %[[UB1:.*]] = fir.load %[[A_PVT_DECL]]#0 : !fir.ref + ! CHECK-NEXT: %[[UB1:.*]] = fir.load %[[ARG0_DECL]]#0 : !fir.ref ! CHECK-NEXT: %[[STEP1:.*]] = arith.constant 1 : i32 ! CHECK: %[[LB2:.*]] = arith.constant 1 : i32 - ! CHECK-NEXT: %[[UB2:.*]] = fir.load %[[A_PVT_DECL]]#0 : !fir.ref + ! 
CHECK-NEXT: %[[UB2:.*]] = fir.load %[[ARG0_DECL]]#0 : !fir.ref ! CHECK-NEXT: %[[STEP2:.*]] = arith.constant 1 : i32 ! CHECK: %[[LB3:.*]] = arith.constant 1 : i32 - ! CHECK-NEXT: %[[UB3:.*]] = fir.load %[[A_PVT_DECL]]#0 : !fir.ref + ! CHECK-NEXT: %[[UB3:.*]] = fir.load %[[ARG0_DECL]]#0 : !fir.ref ! CHECK-NEXT: %[[STEP3:.*]] = arith.constant 1 : i32 - ! CHECK-NEXT: omp.wsloop { + ! CHECK-NEXT: omp.wsloop private(@{{.*}} %{{.*}}#0 -> %[[A_PVT_REF:.*]], @{{.*}} %{{.*}}#0 -> %[[I_PVT_REF:.*]], @{{.*}} %{{.*}}#0 -> %[[J_PVT_REF:.*]], @{{.*}} %{{.*}}#0 -> %[[K_PVT_REF:.*]] : !fir.ref, !fir.ref, !fir.ref, !fir.ref) { ! CHECK-NEXT: omp.loop_nest (%[[ARG1:.*]], %[[ARG2:.*]], %[[ARG3:.*]]) : i32 = (%[[LB1]], %[[LB2]], %[[LB3]]) to (%[[UB1]], %[[UB2]], %[[UB3]]) inclusive step (%[[STEP1]], %[[STEP2]], %[[STEP3]]) { + ! CHECK: %[[A_PVT_DECL:.*]]:2 = hlfir.declare %[[A_PVT_REF]] {uniq_name = "_QFomp_do_lastprivate_collapse3Ea"} : (!fir.ref) -> (!fir.ref, !fir.ref) + ! CHECK: %[[I_PVT_DECL:.*]]:2 = hlfir.declare %[[I_PVT_REF]] {uniq_name = "_QFomp_do_lastprivate_collapse3Ei"} : (!fir.ref) -> (!fir.ref, !fir.ref) + ! CHECK: %[[J_PVT_DECL:.*]]:2 = hlfir.declare %[[J_PVT_REF]] {uniq_name = "_QFomp_do_lastprivate_collapse3Ej"} : (!fir.ref) -> (!fir.ref, !fir.ref) + ! CHECK: %[[K_PVT_DECL:.*]]:2 = hlfir.declare %[[K_PVT_REF]] {uniq_name = "_QFomp_do_lastprivate_collapse3Ek"} : (!fir.ref) -> (!fir.ref, !fir.ref) + ! CHECK-NEXT: fir.store %[[ARG1]] to %[[I_PVT_DECL]]#1 : !fir.ref ! CHECK-NEXT: fir.store %[[ARG2]] to %[[J_PVT_DECL]]#1 : !fir.ref ! CHECK-NEXT: fir.store %[[ARG3]] to %[[K_PVT_DECL]]#1 : !fir.ref diff --git a/flang/test/Lower/OpenMP/parallel-wsloop-reduction-byref.f90 b/flang/test/Lower/OpenMP/parallel-wsloop-reduction-byref.f90 index 99c521406a777..3f44f292cb6a0 100644 --- a/flang/test/Lower/OpenMP/parallel-wsloop-reduction-byref.f90 +++ b/flang/test/Lower/OpenMP/parallel-wsloop-reduction-byref.f90 @@ -4,7 +4,7 @@ ! 
RUN: flang -fc1 -fopenmp -mmlir --force-byref-reduction -emit-hlfir %s -o - | FileCheck %s ! CHECK: omp.parallel { -! CHECK: omp.wsloop reduction(byref @add_reduction_byref_i32 +! CHECK: omp.wsloop private({{.*}}) reduction(byref @add_reduction_byref_i32 subroutine sb integer :: x x = 0 diff --git a/flang/test/Lower/OpenMP/parallel-wsloop-reduction.f90 b/flang/test/Lower/OpenMP/parallel-wsloop-reduction.f90 index cfeb5de83f4e8..a206eef52da5a 100644 --- a/flang/test/Lower/OpenMP/parallel-wsloop-reduction.f90 +++ b/flang/test/Lower/OpenMP/parallel-wsloop-reduction.f90 @@ -4,7 +4,7 @@ ! RUN: flang -fc1 -fopenmp -emit-hlfir %s -o - | FileCheck %s ! CHECK: omp.parallel { -! CHECK: omp.wsloop reduction(@add_reduction_i32 +! CHECK: omp.wsloop private({{.*}}) reduction(@add_reduction_i32 subroutine sb integer :: x x = 0 diff --git a/flang/test/Lower/OpenMP/parallel-wsloop.f90 b/flang/test/Lower/OpenMP/parallel-wsloop.f90 index cba5209f85989..7116069e8daa6 100644 --- a/flang/test/Lower/OpenMP/parallel-wsloop.f90 +++ b/flang/test/Lower/OpenMP/parallel-wsloop.f90 @@ -10,7 +10,7 @@ subroutine simple_parallel_do ! CHECK: %[[WS_LB:.*]] = arith.constant 1 : i32 ! CHECK: %[[WS_UB:.*]] = arith.constant 9 : i32 ! CHECK: %[[WS_STEP:.*]] = arith.constant 1 : i32 - ! CHECK: omp.wsloop { + ! CHECK: omp.wsloop private({{.*}}) { ! CHECK-NEXT: omp.loop_nest (%[[I:.*]]) : i32 = (%[[WS_LB]]) to (%[[WS_UB]]) inclusive step (%[[WS_STEP]]) { !$OMP PARALLEL DO do i=1, 9 @@ -39,7 +39,7 @@ subroutine parallel_do_with_parallel_clauses(cond, nt) ! CHECK: %[[WS_LB:.*]] = arith.constant 1 : i32 ! CHECK: %[[WS_UB:.*]] = arith.constant 9 : i32 ! CHECK: %[[WS_STEP:.*]] = arith.constant 1 : i32 - ! CHECK: omp.wsloop { + ! CHECK: omp.wsloop private({{.*}}) { ! CHECK-NEXT: omp.loop_nest (%[[I:.*]]) : i32 = (%[[WS_LB]]) to (%[[WS_UB]]) inclusive step (%[[WS_STEP]]) { !$OMP PARALLEL DO IF(cond) NUM_THREADS(nt) PROC_BIND(close) do i=1, 9 @@ -64,7 +64,7 @@ subroutine parallel_do_with_clauses(nt) ! 
CHECK: %[[WS_LB:.*]] = arith.constant 1 : i32 ! CHECK: %[[WS_UB:.*]] = arith.constant 9 : i32 ! CHECK: %[[WS_STEP:.*]] = arith.constant 1 : i32 - ! CHECK: omp.wsloop schedule(dynamic) { + ! CHECK: omp.wsloop schedule(dynamic) private({{.*}}) { ! CHECK-NEXT: omp.loop_nest (%[[I:.*]]) : i32 = (%[[WS_LB]]) to (%[[WS_UB]]) inclusive step (%[[WS_STEP]]) { !$OMP PARALLEL DO NUM_THREADS(nt) SCHEDULE(dynamic) do i=1, 9 @@ -92,19 +92,16 @@ subroutine parallel_do_with_privatisation_clauses(cond,nt) integer :: nt integer :: i ! CHECK: omp.parallel - ! CHECK: %[[PRIVATE_COND_REF:.*]] = fir.alloca !fir.logical<4> {bindc_name = "cond", pinned, uniq_name = "_QFparallel_do_with_privatisation_clausesEcond"} - ! CHECK: %[[PRIVATE_COND_DECL:.*]]:2 = hlfir.declare %[[PRIVATE_COND_REF]] {uniq_name = "_QFparallel_do_with_privatisation_clausesEcond"} : (!fir.ref>) -> (!fir.ref>, !fir.ref>) - ! CHECK: %[[PRIVATE_NT_REF:.*]] = fir.alloca i32 {bindc_name = "nt", pinned, uniq_name = "_QFparallel_do_with_privatisation_clausesEnt"} - ! CHECK: %[[PRIVATE_NT_DECL:.*]]:2 = hlfir.declare %[[PRIVATE_NT_REF]] {uniq_name = "_QFparallel_do_with_privatisation_clausesEnt"} : (!fir.ref) -> (!fir.ref, !fir.ref) - ! CHECK: %[[NT_VAL:.*]] = fir.load %[[NT_DECL]]#0 : !fir.ref - ! CHECK: hlfir.assign %[[NT_VAL]] to %[[PRIVATE_NT_DECL]]#0 : i32, !fir.ref + ! CHECK: %[[WS_LB:.*]] = arith.constant 1 : i32 ! CHECK: %[[WS_UB:.*]] = arith.constant 9 : i32 ! CHECK: %[[WS_STEP:.*]] = arith.constant 1 : i32 - ! CHECK: omp.wsloop { + ! CHECK: omp.wsloop private(@{{.*}} %{{.*}}#0 -> %[[PRIVATE_COND_REF:.*]], @{{.*}} %{{.*}}#0 -> %[[PRIVATE_NT_REF:.*]], @{{.*}} %3#0 -> %{{.*}} : !fir.ref>, !fir.ref, !fir.ref) { ! CHECK-NEXT: omp.loop_nest (%[[I:.*]]) : i32 = (%[[WS_LB]]) to (%[[WS_UB]]) inclusive step (%[[WS_STEP]]) { !$OMP PARALLEL DO PRIVATE(cond) FIRSTPRIVATE(nt) do i=1, 9 + ! 
CHECK: %[[PRIVATE_COND_DECL:.*]]:2 = hlfir.declare %[[PRIVATE_COND_REF]] {uniq_name = "_QFparallel_do_with_privatisation_clausesEcond"} : (!fir.ref>) -> (!fir.ref>, !fir.ref>) + ! CHECK: %[[PRIVATE_NT_DECL:.*]]:2 = hlfir.declare %[[PRIVATE_NT_REF]] {uniq_name = "_QFparallel_do_with_privatisation_clausesEnt"} : (!fir.ref) -> (!fir.ref, !fir.ref) ! CHECK: fir.store %[[I]] to %[[IV_ADDR:.*]]#1 : !fir.ref ! CHECK: %[[LOAD_IV:.*]] = fir.load %[[IV_ADDR]]#0 : !fir.ref ! CHECK: fir.call @_FortranAioOutputInteger32({{.*}}, %[[LOAD_IV]]) {{.*}}: (!fir.ref, i32) -> i1 @@ -149,14 +146,12 @@ end subroutine parallel_private_do ! CHECK: %[[NT_PRIV_DECL:.*]]:2 = hlfir.declare %[[NT_PRIV_ADDR]] {uniq_name = "_QFparallel_private_doEnt"} : (!fir.ref) -> (!fir.ref, !fir.ref) -! CHECK: %[[I_PRIV:.*]] = fir.alloca i32 {bindc_name = "i", pinned, {{.*}}} -! CHECK: %[[I_PRIV_DECL:.*]]:2 = hlfir.declare %[[I_PRIV]] {uniq_name = "_QFparallel_private_doEi"} : (!fir.ref) -> (!fir.ref, !fir.ref) - ! CHECK: %[[VAL_7:.*]] = arith.constant 1 : i32 ! CHECK: %[[VAL_8:.*]] = arith.constant 9 : i32 ! CHECK: %[[VAL_9:.*]] = arith.constant 1 : i32 -! CHECK: omp.wsloop { +! CHECK: omp.wsloop private(@{{.*}} %{{.*}}#0 -> %[[I_PRIV:.*]] : !fir.ref) { ! CHECK-NEXT: omp.loop_nest (%[[I:.*]]) : i32 = (%[[VAL_7]]) to (%[[VAL_8]]) inclusive step (%[[VAL_9]]) { +! CHECK: %[[I_PRIV_DECL:.*]]:2 = hlfir.declare %[[I_PRIV]] {uniq_name = "_QFparallel_private_doEi"} : (!fir.ref) -> (!fir.ref, !fir.ref) ! CHECK: fir.store %[[I]] to %[[I_PRIV_DECL]]#1 : !fir.ref ! CHECK: fir.call @_QPfoo(%[[I_PRIV_DECL]]#1, %[[COND_DECL]]#1, %[[NT_PRIV_DECL]]#1) {{.*}}: (!fir.ref, !fir.ref>, !fir.ref) -> () ! CHECK: omp.yield @@ -195,14 +190,13 @@ end subroutine omp_parallel_multiple_firstprivate_do ! CHECK: %[[B_PRIV_DECL:.*]]:2 = hlfir.declare %[[B_PRIV_ADDR]] {uniq_name = "_QFomp_parallel_multiple_firstprivate_doEb"} : (!fir.ref) -> (!fir.ref, !fir.ref) -! 
CHECK: %[[I_PRIV_ADDR:.*]] = fir.alloca i32 {bindc_name = "i", pinned, {{.*}}} -! CHECK: %[[I_PRIV_DECL:.*]]:2 = hlfir.declare %[[I_PRIV_ADDR]] {uniq_name = "_QFomp_parallel_multiple_firstprivate_doEi"} : (!fir.ref) -> (!fir.ref, !fir.ref) ! CHECK: %[[VAL_8:.*]] = arith.constant 1 : i32 ! CHECK: %[[VAL_9:.*]] = arith.constant 10 : i32 ! CHECK: %[[VAL_10:.*]] = arith.constant 1 : i32 -! CHECK: omp.wsloop { +! CHECK: omp.wsloop private(@{{.*}} %{{.*}}#0 -> %[[I_PRIV_ADDR:.*]] : !fir.ref) { ! CHECK-NEXT: omp.loop_nest (%[[I:.*]]) : i32 = (%[[VAL_8]]) to (%[[VAL_9]]) inclusive step (%[[VAL_10]]) { +! CHECK: %[[I_PRIV_DECL:.*]]:2 = hlfir.declare %[[I_PRIV_ADDR]] {uniq_name = "_QFomp_parallel_multiple_firstprivate_doEi"} : (!fir.ref) -> (!fir.ref, !fir.ref) ! CHECK: fir.store %[[I]] to %[[I_PRIV_DECL]]#1 : !fir.ref ! CHECK: fir.call @_QPbar(%[[I_PRIV_DECL]]#1, %[[A_PRIV_DECL]]#1) {{.*}}: (!fir.ref, !fir.ref) -> () ! CHECK: omp.yield @@ -237,23 +231,15 @@ end subroutine parallel_do_private ! CHECK-SAME: %[[VAL_1:.*]]: !fir.ref {fir.bindc_name = "nt"}) { ! CHECK: %[[NT_DECL:.*]]:2 = hlfir.declare %[[VAL_1]] dummy_scope %{{[0-9]+}} {uniq_name = "_QFparallel_do_privateEnt"} : (!fir.ref, !fir.dscope) -> (!fir.ref, !fir.ref) ! CHECK: omp.parallel { - -! CHECK: %[[COND_PRIV_ADDR:.*]] = fir.alloca !fir.logical<4> {bindc_name = "cond", pinned, uniq_name = "_QFparallel_do_privateEcond"} -! CHECK: %[[COND_PRIV_DECL:.*]]:2 = hlfir.declare %[[COND_PRIV_ADDR]] {uniq_name = "_QFparallel_do_privateEcond"} : (!fir.ref>) -> (!fir.ref>, !fir.ref>) - -! CHECK: %[[NT_PRIV_ADDR:.*]] = fir.alloca i32 {bindc_name = "nt", pinned, uniq_name = "_QFparallel_do_privateEnt"} -! CHECK: %[[NT_PRIV_DECL:.*]]:2 = hlfir.declare %[[NT_PRIV_ADDR]] {uniq_name = "_QFparallel_do_privateEnt"} : (!fir.ref) -> (!fir.ref, !fir.ref) -! CHECK: %[[NT_VAL:.*]] = fir.load %[[NT_DECL]]#0 : !fir.ref -! CHECK: hlfir.assign %[[NT_VAL]] to %[[NT_PRIV_DECL]]#0 : i32, !fir.ref - -! 
CHECK: %[[I_PRIV_ADDR:.*]] = fir.alloca i32 {bindc_name = "i", pinned, {{.*}}} -! CHECK: %[[I_PRIV_DECL:.*]]:2 = hlfir.declare %[[I_PRIV_ADDR]] {uniq_name = "_QFparallel_do_privateEi"} : (!fir.ref) -> (!fir.ref, !fir.ref) - ! CHECK: %[[VAL_7:.*]] = arith.constant 1 : i32 ! CHECK: %[[VAL_8:.*]] = arith.constant 9 : i32 ! CHECK: %[[VAL_9:.*]] = arith.constant 1 : i32 -! CHECK: omp.wsloop { +! CHECK: omp.wsloop private(@{{.*}} %{{.*}}#0 -> %[[COND_PRIV_ADDR:.*]], @{{.*}} %{{.*}}#0 -> %[[NT_PRIV_ADDR:.*]], @{{.*}} %3#0 -> %[[I_PRIV_ADDR:.*]] : !fir.ref>, !fir.ref, !fir.ref) { ! CHECK-NEXT: omp.loop_nest (%[[I:.*]]) : i32 = (%[[VAL_7]]) to (%[[VAL_8]]) inclusive step (%[[VAL_9]]) { +! CHECK: %[[COND_PRIV_DECL:.*]]:2 = hlfir.declare %[[COND_PRIV_ADDR]] {uniq_name = "_QFparallel_do_privateEcond"} : (!fir.ref>) -> (!fir.ref>, !fir.ref>) +! CHECK: %[[NT_PRIV_DECL:.*]]:2 = hlfir.declare %[[NT_PRIV_ADDR]] {uniq_name = "_QFparallel_do_privateEnt"} : (!fir.ref) -> (!fir.ref, !fir.ref) +! CHECK: %[[I_PRIV_DECL:.*]]:2 = hlfir.declare %[[I_PRIV_ADDR]] {uniq_name = "_QFparallel_do_privateEi"} : (!fir.ref) -> (!fir.ref, !fir.ref) + ! CHECK: fir.store %[[I]] to %[[I_PRIV_DECL]]#1 : !fir.ref ! CHECK: fir.call @_QPfoo(%[[I_PRIV_DECL]]#1, %[[COND_PRIV_DECL]]#1, %[[NT_PRIV_DECL]]#1) {{.*}}: (!fir.ref, !fir.ref>, !fir.ref) -> () ! CHECK: omp.yield @@ -287,25 +273,15 @@ end subroutine omp_parallel_do_multiple_firstprivate ! CHECK: %[[A_DECL:.*]]:2 = hlfir.declare %[[A_ADDR]] dummy_scope %{{[0-9]+}} {uniq_name = "_QFomp_parallel_do_multiple_firstprivateEa"} : (!fir.ref, !fir.dscope) -> (!fir.ref, !fir.ref) ! CHECK: %[[B_DECL:.*]]:2 = hlfir.declare %[[B_ADDR]] dummy_scope %{{[0-9]+}} {uniq_name = "_QFomp_parallel_do_multiple_firstprivateEb"} : (!fir.ref, !fir.dscope) -> (!fir.ref, !fir.ref ! CHECK: omp.parallel { - -! CHECK: %[[A_PRIV_ADDR:.*]] = fir.alloca i32 {bindc_name = "a", pinned, uniq_name = "_QFomp_parallel_do_multiple_firstprivateEa"} -! 
CHECK: %[[A_PRIV_DECL:.*]]:2 = hlfir.declare %[[A_PRIV_ADDR]] {uniq_name = "_QFomp_parallel_do_multiple_firstprivateEa"} : (!fir.ref) -> (!fir.ref, !fir.ref) -! CHECK: %[[A:.*]] = fir.load %[[A_DECL]]#0 : !fir.ref -! CHECK: hlfir.assign %[[A]] to %[[A_PRIV_DECL]]#0 : i32, !fir.ref - -! CHECK: %[[B_PRIV_ADDR:.*]] = fir.alloca i32 {bindc_name = "b", pinned, uniq_name = "_QFomp_parallel_do_multiple_firstprivateEb"} -! CHECK: %[[B_PRIV_DECL:.*]]:2 = hlfir.declare %[[B_PRIV_ADDR]] {uniq_name = "_QFomp_parallel_do_multiple_firstprivateEb"} : (!fir.ref) -> (!fir.ref, !fir.ref) -! CHECK: %[[B:.*]] = fir.load %[[B_DECL]]#0 : !fir.ref -! CHECK: hlfir.assign %[[B]] to %[[B_PRIV_DECL]]#0 : i32, !fir.ref - -! CHECK: %[[I_PRIV_ADDR:.*]] = fir.alloca i32 {bindc_name = "i", pinned, {{.*}}} -! CHECK: %[[I_PRIV_DECL:.*]]:2 = hlfir.declare %[[I_PRIV_ADDR]] {uniq_name = "_QFomp_parallel_do_multiple_firstprivateEi"} : (!fir.ref) -> (!fir.ref, !fir.ref) - ! CHECK: %[[VAL_8:.*]] = arith.constant 1 : i32 ! CHECK: %[[VAL_9:.*]] = arith.constant 10 : i32 ! CHECK: %[[VAL_10:.*]] = arith.constant 1 : i32 -! CHECK: omp.wsloop { +! CHECK: omp.wsloop private(@{{.*}} %{{.*}}#0 -> %[[A_PRIV_ADDR:.*]], @{{.*}} %{{.}}#0 -> %[[B_PRIV_ADDR:.*]], @{{.*}} %{{.}}#0 -> %[[I_PRIV_ADDR:.*]] : !fir.ref, !fir.ref, !fir.ref) { ! CHECK-NEXT: omp.loop_nest (%[[I:.*]]) : i32 = (%[[VAL_8]]) to (%[[VAL_9]]) inclusive step (%[[VAL_10]]) { +! CHECK: %[[A_PRIV_DECL:.*]]:2 = hlfir.declare %[[A_PRIV_ADDR]] {uniq_name = "_QFomp_parallel_do_multiple_firstprivateEa"} : (!fir.ref) -> (!fir.ref, !fir.ref) +! CHECK: %[[B_PRIV_DECL:.*]]:2 = hlfir.declare %[[B_PRIV_ADDR]] {uniq_name = "_QFomp_parallel_do_multiple_firstprivateEb"} : (!fir.ref) -> (!fir.ref, !fir.ref) +! CHECK: %[[I_PRIV_DECL:.*]]:2 = hlfir.declare %[[I_PRIV_ADDR]] {uniq_name = "_QFomp_parallel_do_multiple_firstprivateEi"} : (!fir.ref) -> (!fir.ref, !fir.ref) + ! CHECK: fir.store %[[I]] to %[[I_PRIV_DECL]]#1 : !fir.ref ! 
CHECK: fir.call @_QPbar(%[[I_PRIV_DECL]]#1, %[[A_PRIV_DECL]]#1) {{.*}}: (!fir.ref, !fir.ref) -> () ! CHECK: omp.yield diff --git a/flang/test/Lower/OpenMP/private-derived-type.f90 b/flang/test/Lower/OpenMP/private-derived-type.f90 index 9d680cd5d6114..7e0a3f14639f6 100644 --- a/flang/test/Lower/OpenMP/private-derived-type.f90 +++ b/flang/test/Lower/OpenMP/private-derived-type.f90 @@ -15,6 +15,17 @@ subroutine s4 !$omp end parallel end subroutine s4 +! CHECK: omp.private {type = private} @[[DERIVED_PRIV:.*]] : !fir.ref>}>> alloc { +! CHECK: %[[VAL_23:.*]] = fir.alloca !fir.type<_QFs4Ty3{x:!fir.box>}> {bindc_name = "v", pinned, uniq_name = "_QFs4Ev"} +! CHECK: %[[VAL_25:.*]] = fir.embox %[[VAL_23]] : (!fir.ref>}>>) -> !fir.box>}>> +! CHECK: %[[VAL_26:.*]] = fir.address_of +! CHECK: %[[VAL_27:.*]] = arith.constant 8 : i32 +! CHECK: %[[VAL_28:.*]] = fir.convert %[[VAL_25]] : (!fir.box>}>>) -> !fir.box +! CHECK: %[[VAL_29:.*]] = fir.convert %[[VAL_26]] : (!fir.ref>) -> !fir.ref +! Check we do call FortranAInitialize on the derived type +! CHECK: fir.call @_FortranAInitialize(%[[VAL_28]], %[[VAL_29]], %[[VAL_27]]) fastmath : (!fir.box, !fir.ref, i32) -> () +! CHECK: %[[VAL_24:.*]]:2 = hlfir.declare %[[VAL_23]] {uniq_name = "_QFs4Ev"} : (!fir.ref>}>>) -> (!fir.ref>}>>, !fir.ref>}>>) +! CHECK: } ! CHECK-LABEL: func.func @_QPs4() { ! Example of how the lowering for regular derived type variables: @@ -25,22 +36,13 @@ end subroutine s4 ! CHECK: %[[VAL_12:.*]] = arith.constant 4 : i32 ! CHECK: %[[VAL_13:.*]] = fir.convert %[[VAL_10]] : (!fir.box>}>>) -> !fir.box ! CHECK: %[[VAL_14:.*]] = fir.convert %[[VAL_11]] : (!fir.ref>) -> !fir.ref -! CHECK: %[[VAL_15:.*]] = fir.call @_FortranAInitialize(%[[VAL_13]], %[[VAL_14]], %[[VAL_12]]) fastmath : (!fir.box, !fir.ref, i32) -> none +! CHECK: fir.call @_FortranAInitialize(%[[VAL_13]], %[[VAL_14]], %[[VAL_12]]) fastmath : (!fir.box, !fir.ref, i32) -> () ! CHECK: omp.parallel { -! 
CHECK: %[[VAL_23:.*]] = fir.alloca !fir.type<_QFs4Ty3{x:!fir.box>}> {bindc_name = "v", pinned, uniq_name = "_QFs4Ev"} -! CHECK: %[[VAL_25:.*]] = fir.embox %[[VAL_23]] : (!fir.ref>}>>) -> !fir.box>}>> -! CHECK: %[[VAL_26:.*]] = fir.address_of -! CHECK: %[[VAL_27:.*]] = arith.constant 8 : i32 -! CHECK: %[[VAL_28:.*]] = fir.convert %[[VAL_25]] : (!fir.box>}>>) -> !fir.box -! CHECK: %[[VAL_29:.*]] = fir.convert %[[VAL_26]] : (!fir.ref>) -> !fir.ref -! Check we do call FortranAInitialize on the derived type -! CHECK: %[[VAL_30:.*]] = fir.call @_FortranAInitialize(%[[VAL_28]], %[[VAL_29]], %[[VAL_27]]) fastmath : (!fir.box, !fir.ref, i32) -> none -! CHECK: %[[VAL_24:.*]]:2 = hlfir.declare %[[VAL_23]] {uniq_name = "_QFs4Ev"} : (!fir.ref>}>>) -> (!fir.ref>}>>, !fir.ref>}>>) -! CHECK: omp.wsloop { +! CHECK: omp.wsloop private(@[[DERIVED_PRIV]] %{{.*}}#0 -> %{{.*}}, @{{.*}} %{{.*}}#0 -> %{{.*}} : !fir.ref>}>>, !fir.ref) { ! CHECK: } ! CHECK: %[[VAL_39:.*]] = fir.embox %[[VAL_9]]#1 : (!fir.ref>}>>) -> !fir.box>}>> ! CHECK: %[[VAL_40:.*]] = fir.convert %[[VAL_39]] : (!fir.box>}>>) -> !fir.box ! Check the derived type is destroyed -! CHECK: %[[VAL_41:.*]] = fir.call @_FortranADestroy(%[[VAL_40]]) fastmath : (!fir.box) -> none +! CHECK: fir.call @_FortranADestroy(%[[VAL_40]]) fastmath : (!fir.box) -> () ! CHECK: return ! CHECK: } diff --git a/flang/test/Lower/OpenMP/same_var_first_lastprivate.f90 b/flang/test/Lower/OpenMP/same_var_first_lastprivate.f90 new file mode 100644 index 0000000000000..c49a0908b721e --- /dev/null +++ b/flang/test/Lower/OpenMP/same_var_first_lastprivate.f90 @@ -0,0 +1,39 @@ +! RUN: %flang_fc1 -fopenmp -emit-hlfir %s -o - | FileCheck %s + +subroutine first_and_lastprivate + integer i + integer :: var = 1 + + !$omp parallel do firstprivate(var) lastprivate(var) + do i=1,1 + end do + !$omp end parallel do +end subroutine + +! CHECK: omp.private {type = firstprivate} @{{.*}}Evar_firstprivate_ref_i32 : {{.*}} alloc { +! 
CHECK: %[[ALLOC:.*]] = fir.alloca i32 {{.*}} +! CHECK: %[[ALLOC_DECL:.*]]:2 = hlfir.declare %[[ALLOC]] +! CHECK: omp.yield(%[[ALLOC_DECL]]#0 : !fir.ref) +! CHECK: } copy { +! CHECK: ^{{.*}}(%[[ORIG_REF:.*]]: {{.*}}, %[[PRIV_REF:.*]]: {{.*}}): +! CHECK: %[[ORIG_VAL:.*]] = fir.load %[[ORIG_REF]] +! CHECK: hlfir.assign %[[ORIG_VAL]] to %[[PRIV_REF]] +! CHECK: omp.yield(%[[PRIV_REF]] : !fir.ref) +! CHECK: } + +! CHECK: func.func @{{.*}}first_and_lastprivate() +! CHECK: %[[ORIG_VAR_DECL:.*]]:2 = hlfir.declare %{{.*}} {uniq_name = "{{.*}}Evar"} +! CHECK: omp.parallel { +! CHECK: omp.barrier +! CHECK: omp.wsloop private(@{{.*}}var_firstprivate_ref_i32 {{.*}}) { +! CHECK: omp.loop_nest {{.*}} { +! CHECK: %[[PRIV_VAR_DECL:.*]]:2 = hlfir.declare %{{.*}} {uniq_name = "{{.*}}Evar"} +! CHECK: fir.if %{{.*}} { +! CHECK: %[[PRIV_VAR_VAL:.*]] = fir.load %[[PRIV_VAR_DECL]]#0 : !fir.ref +! CHECK: hlfir.assign %[[PRIV_VAR_VAL]] to %[[ORIG_VAR_DECL]]#0 +! CHECK: } +! CHECK: omp.yield +! CHECK: } +! CHECK: } +! CHECK: omp.terminator +! CHECK: } diff --git a/flang/test/Lower/OpenMP/stop-stmt-in-region.f90 b/flang/test/Lower/OpenMP/stop-stmt-in-region.f90 index d119c2120c7c5..d817c4e771b31 100644 --- a/flang/test/Lower/OpenMP/stop-stmt-in-region.f90 +++ b/flang/test/Lower/OpenMP/stop-stmt-in-region.f90 @@ -9,7 +9,7 @@ ! CHECK: %[[VAL_0:.*]] = arith.constant 1 : i32 ! CHECK: %[[VAL_1:.*]] = arith.constant false ! CHECK: %[[VAL_2:.*]] = arith.constant false -! CHECK: %[[VAL_3:.*]] = fir.call @_FortranAStopStatement(%[[VAL_0]], %[[VAL_1]], %[[VAL_2]]) {{.*}} : (i32, i1, i1) -> none +! CHECK: fir.call @_FortranAStopStatement(%[[VAL_0]], %[[VAL_1]], %[[VAL_2]]) {{.*}} : (i32, i1, i1) -> () ! CHECK-NOT: fir.unreachable ! CHECK: omp.terminator ! CHECK: } @@ -28,7 +28,7 @@ subroutine test_stop_in_region1() ! CHECK: %[[VAL_1:.*]] = arith.constant 1 : i32 ! CHECK: %[[VAL_2:.*]] = arith.constant false ! CHECK: %[[VAL_3:.*]] = arith.constant false -! 
CHECK: %[[VAL_4:.*]] = fir.call @_FortranAStopStatement(%[[VAL_1]], %[[VAL_2]], %[[VAL_3]]) {{.*}} : (i32, i1, i1) -> none +! CHECK: fir.call @_FortranAStopStatement(%[[VAL_1]], %[[VAL_2]], %[[VAL_3]]) {{.*}} : (i32, i1, i1) -> () ! CHECK: omp.terminator ! CHECK: } ! CHECK: return @@ -56,7 +56,7 @@ subroutine test_stop_in_region2() ! CHECK: %[[VAL_5:.*]] = fir.load %[[VAL_0_DECL]]#0 : !fir.ref ! CHECK: %[[VAL_6:.*]] = arith.constant false ! CHECK: %[[VAL_7:.*]] = arith.constant false -! CHECK: %[[VAL_8:.*]] = fir.call @_FortranAStopStatement(%[[VAL_5]], %[[VAL_6]], %[[VAL_7]]) {{.*}} : (i32, i1, i1) -> none +! CHECK: fir.call @_FortranAStopStatement(%[[VAL_5]], %[[VAL_6]], %[[VAL_7]]) {{.*}} : (i32, i1, i1) -> () ! CHECK: omp.terminator ! CHECK: ^bb2: ! CHECK: omp.terminator @@ -80,14 +80,13 @@ subroutine test_stop_in_region3() ! CHECK: %[[VAL_2:.*]] = fir.alloca i32 {bindc_name = "x", uniq_name = "_QFtest_stop_in_region4Ex"} ! CHECK: %[[VAL_2_DECL:.*]]:2 = hlfir.declare %[[VAL_2]] {uniq_name = "_QFtest_stop_in_region4Ex"} : (!fir.ref) -> (!fir.ref, !fir.ref) -! CHECK: %[[VAL_0:.*]] = fir.alloca i32 {bindc_name = "i", pinned, {{.*}}} -! CHECK: %[[VAL_0_DECL:.*]]:2 = hlfir.declare %[[VAL_0]] {uniq_name = "_QFtest_stop_in_region4Ei"} : (!fir.ref) -> (!fir.ref, !fir.ref) ! CHECK: %[[VAL_3:.*]] = arith.constant 1 : i32 ! CHECK: %[[VAL_4:.*]] = arith.constant 10 : i32 ! CHECK: %[[VAL_5:.*]] = arith.constant 1 : i32 -! CHECK: omp.wsloop { +! CHECK: omp.wsloop private(@{{.*}} %{{.*}}#0 -> %[[VAL_0:.*]] : !fir.ref) { ! CHECK-NEXT: omp.loop_nest (%[[VAL_6:.*]]) : i32 = (%[[VAL_3]]) to (%[[VAL_4]]) inclusive step (%[[VAL_5]]) { +! CHECK: %[[VAL_0_DECL:.*]]:2 = hlfir.declare %[[VAL_0]] {uniq_name = "_QFtest_stop_in_region4Ei"} : (!fir.ref) -> (!fir.ref, !fir.ref) ! CHECK: fir.store %[[VAL_6]] to %[[VAL_0_DECL]]#1 : !fir.ref ! CHECK: cf.br ^bb1 ! CHECK: ^bb1: @@ -101,7 +100,7 @@ subroutine test_stop_in_region3() ! CHECK: %[[VAL_11:.*]] = fir.load %[[VAL_2_DECL]]#0 : !fir.ref ! 
CHECK: %[[VAL_12:.*]] = arith.constant false ! CHECK: %[[VAL_13:.*]] = arith.constant false -! CHECK: %[[VAL_14:.*]] = fir.call @_FortranAStopStatement(%[[VAL_11]], %[[VAL_12]], %[[VAL_13]]) {{.*}} : (i32, i1, i1) -> none +! CHECK: fir.call @_FortranAStopStatement(%[[VAL_11]], %[[VAL_12]], %[[VAL_13]]) {{.*}} : (i32, i1, i1) -> () ! CHECK: omp.yield ! CHECK: ^bb3: ! CHECK: omp.yield @@ -125,7 +124,7 @@ subroutine test_stop_in_region4() !CHECK-LABEL: func.func @_QPtest_stop_in_region5 !CHECK: omp.parallel { -!CHECK: {{.*}} fir.call @_FortranAStopStatement({{.*}}, {{.*}}, {{.*}}) fastmath : (i32, i1, i1) -> none +!CHECK: {{.*}} fir.call @_FortranAStopStatement({{.*}}, {{.*}}, {{.*}}) fastmath : (i32, i1, i1) -> () !CHECK: omp.terminator !CHECK: } !CHECK: return @@ -142,10 +141,10 @@ subroutine test_stop_in_region5() !CHECK: omp.parallel { !CHECK: cf.cond_br %{{.*}}, ^[[BB1:.*]], ^[[BB2:.*]] !CHECK: ^[[BB1]]: -!CHECK: {{.*}}fir.call @_FortranAStopStatement({{.*}}, {{.*}}, {{.*}}) fastmath : (i32, i1, i1) -> none +!CHECK: {{.*}}fir.call @_FortranAStopStatement({{.*}}, {{.*}}, {{.*}}) fastmath : (i32, i1, i1) -> () !CHECK: omp.terminator !CHECK: ^[[BB2]]: -!CHECK: {{.*}}fir.call @_FortranAStopStatement({{.*}}, {{.*}}, {{.*}}) fastmath : (i32, i1, i1) -> none +!CHECK: {{.*}}fir.call @_FortranAStopStatement({{.*}}, {{.*}}, {{.*}}) fastmath : (i32, i1, i1) -> () !CHECK: omp.terminator !CHECK: } !CHECK: return diff --git a/flang/test/Lower/OpenMP/target.f90 b/flang/test/Lower/OpenMP/target.f90 index 04764be2293c1..bf801e69405b9 100644 --- a/flang/test/Lower/OpenMP/target.f90 +++ b/flang/test/Lower/OpenMP/target.f90 @@ -586,11 +586,10 @@ subroutine omp_target_parallel_do !CHECK: %[[VAL_0_DECL:.*]]:2 = hlfir.declare %[[ARG_0]](%{{.*}}) {uniq_name = "_QFomp_target_parallel_doEa"} : (!fir.ref>, !fir.shape<1>) -> (!fir.ref>, !fir.ref>) !CHECK: omp.parallel !$omp target parallel do map(tofrom: a) - !CHECK: %[[I_PVT_ALLOCA:.*]] = fir.alloca i32 {bindc_name = "i", pinned, {{.*}}} - 
!CHECK: %[[I_PVT_DECL:.*]]:2 = hlfir.declare %[[I_PVT_ALLOCA]] {uniq_name = "_QFomp_target_parallel_doEi"} : (!fir.ref) -> (!fir.ref, !fir.ref) - !CHECK: omp.wsloop { + !CHECK: omp.wsloop private(@{{.*}} %{{.*}}#0 -> %[[I_PVT_ALLOCA:.*]] : !fir.ref) { !CHECK-NEXT: omp.loop_nest (%[[I_VAL:.*]]) : i32 do i = 1, 1024 + !CHECK: %[[I_PVT_DECL:.*]]:2 = hlfir.declare %[[I_PVT_ALLOCA]] {uniq_name = "_QFomp_target_parallel_doEi"} : (!fir.ref) -> (!fir.ref, !fir.ref) !CHECK: fir.store %[[I_VAL]] to %[[I_PVT_DECL]]#1 : !fir.ref !CHECK: %[[C10:.*]] = arith.constant 10 : i32 !CHECK: %[[I_PVT_VAL:.*]] = fir.load %[[I_PVT_DECL]]#0 : !fir.ref diff --git a/flang/test/Lower/OpenMP/unstructured.f90 b/flang/test/Lower/OpenMP/unstructured.f90 index a0955c8440c1c..a9925a484eb1c 100644 --- a/flang/test/Lower/OpenMP/unstructured.f90 +++ b/flang/test/Lower/OpenMP/unstructured.f90 @@ -69,10 +69,9 @@ subroutine ss2(n) ! unstructured OpenMP construct; loop exit inside construct ! CHECK: cond_br %{{[0-9]*}}, ^bb2, ^bb4 ! CHECK: ^bb2: // pred: ^bb1 -! CHECK: %[[ALLOCA_2:.*]] = fir.alloca i32 {{{.*}}, pinned, {{.*}}} -! CHECK: %[[OMP_LOOP_K_DECL:.*]]:2 = hlfir.declare %[[ALLOCA_2]] {uniq_name = "_QFss3Ek"} : (!fir.ref) -> (!fir.ref, !fir.ref) -! CHECK: omp.wsloop { +! CHECK: omp.wsloop private(@{{.*}} %{{.*}}#0 -> %[[ALLOCA_2:.*]] : !fir.ref) { ! CHECK: omp.loop_nest (%[[ARG1:.*]]) : {{.*}} { +! CHECK: %[[OMP_LOOP_K_DECL:.*]]:2 = hlfir.declare %[[ALLOCA_2]] {uniq_name = "_QFss3Ek"} : (!fir.ref) -> (!fir.ref, !fir.ref) ! CHECK: fir.store %[[ARG1]] to %[[OMP_LOOP_K_DECL]]#1 : !fir.ref ! CHECK: @_FortranAioBeginExternalListOutput ! CHECK: %[[LOAD_1:.*]] = fir.load %[[OMP_LOOP_K_DECL]]#0 : !fir.ref @@ -81,11 +80,9 @@ subroutine ss2(n) ! unstructured OpenMP construct; loop exit inside construct ! CHECK: } ! CHECK: } -! CHECK: %[[ALLOCA_1:.*]] = fir.alloca i32 {{{.*}}, pinned, {{.*}}} -! 
CHECK: %[[OMP_LOOP_J_DECL:.*]]:2 = hlfir.declare %[[ALLOCA_1]] {uniq_name = "_QFss3Ej"} : (!fir.ref) -> (!fir.ref, !fir.ref) - -! CHECK: omp.wsloop { +! CHECK: omp.wsloop private(@{{.*}} %{{.*}}#0 -> %[[ALLOCA_1:.*]] : !fir.ref) { ! CHECK: omp.loop_nest (%[[ARG2:.*]]) : {{.*}} { +! CHECK: %[[OMP_LOOP_J_DECL:.*]]:2 = hlfir.declare %[[ALLOCA_1]] {uniq_name = "_QFss3Ej"} : (!fir.ref) -> (!fir.ref, !fir.ref) ! CHECK: fir.store %[[ARG2]] to %[[OMP_LOOP_J_DECL]]#1 : !fir.ref ! CHECK: br ^bb1 ! CHECK: ^bb2: // 2 preds: ^bb1, ^bb5 @@ -128,10 +125,9 @@ subroutine ss3(n) ! nested unstructured OpenMP constructs ! CHECK-LABEL: func @_QPss4{{.*}} { ! CHECK: omp.parallel private(@{{.*}} %{{.*}}#0 -> %{{.*}} : {{.*}}) { -! CHECK: %[[ALLOCA:.*]] = fir.alloca i32 {{{.*}}, pinned, uniq_name = "_QFss4Ej"} -! CHECK: %[[OMP_LOOP_J_DECL:.*]]:2 = hlfir.declare %[[ALLOCA]] {uniq_name = "_QFss4Ej"} : (!fir.ref) -> (!fir.ref, !fir.ref) -! CHECK: omp.wsloop { +! CHECK: omp.wsloop private(@{{.*}} %{{.*}}#0 -> %[[ALLOCA:.*]] : !fir.ref) { ! CHECK-NEXT: omp.loop_nest (%[[ARG:.*]]) : {{.*}} { +! CHECK: %[[OMP_LOOP_J_DECL:.*]]:2 = hlfir.declare %[[ALLOCA]] {uniq_name = "_QFss4Ej"} : (!fir.ref) -> (!fir.ref, !fir.ref) ! CHECK: fir.store %[[ARG]] to %[[OMP_LOOP_J_DECL]]#1 : !fir.ref ! CHECK: %[[COND:.*]] = arith.cmpi eq, %{{.*}}, %{{.*}} ! CHECK: %[[COND_XOR:.*]] = arith.xori %[[COND]], %{{.*}} @@ -160,7 +156,7 @@ subroutine ss4(n) ! CYCLE in OpenMP wsloop constructs ! CHECK-LABEL: func @_QPss5() { ! CHECK: omp.parallel private(@{{.*}} %{{.*}}#0 -> %{{.*}} : {{.*}}) { -! CHECK: omp.wsloop { +! CHECK: omp.wsloop private({{.*}}) { ! CHECK: omp.loop_nest {{.*}} { ! CHECK: br ^[[BB1:.*]] ! CHECK: ^[[BB1]]: @@ -202,7 +198,7 @@ subroutine ss5() ! EXIT inside OpenMP wsloop (inside parallel) ! CHECK: ^[[BB1_OUTER]]: ! CHECK: cond_br %{{.*}}, ^[[BB2_OUTER:.*]], ^[[BB3_OUTER:.*]] ! CHECK: ^[[BB2_OUTER]]: -! CHECK: omp.wsloop { +! CHECK: omp.wsloop private({{.*}}) { ! CHECK: omp.loop_nest {{.*}} { ! 
CHECK: br ^[[BB1:.*]] ! CHECK: ^[[BB1]]: @@ -248,7 +244,7 @@ subroutine ss6() ! EXIT inside OpenMP wsloop in a do loop (inside parallel) ! CHECK: cond_br %{{.*}}, ^[[BB2_OUTER:.*]], ^[[BB3_OUTER:.*]] ! CHECK-NEXT: ^[[BB2_OUTER:.*]]: ! CHECK: omp.parallel { -! CHECK: omp.wsloop { +! CHECK: omp.wsloop private({{.*}}) { ! CHECK: omp.loop_nest {{.*}} { ! CHECK: br ^[[BB1:.*]] ! CHECK-NEXT: ^[[BB1]]: @@ -288,7 +284,7 @@ subroutine ss7() ! EXIT inside OpenMP parallel do (inside do loop) ! CHECK-LABEL: func @_QPss8() { ! CHECK: omp.parallel { -! CHECK: omp.wsloop { +! CHECK: omp.wsloop private({{.*}}) { ! CHECK: omp.loop_nest {{.*}} { ! CHECK: br ^[[BB1:.*]] ! CHECK-NEXT: ^[[BB1]]: diff --git a/flang/test/Lower/OpenMP/wsloop-chunks.f90 b/flang/test/Lower/OpenMP/wsloop-chunks.f90 index 0fb7d6f1b64fa..0a2b962761acb 100644 --- a/flang/test/Lower/OpenMP/wsloop-chunks.f90 +++ b/flang/test/Lower/OpenMP/wsloop-chunks.f90 @@ -20,7 +20,7 @@ program wsloop ! CHECK: %[[VAL_3:.*]] = arith.constant 1 : i32 ! CHECK: %[[VAL_4:.*]] = arith.constant 9 : i32 ! CHECK: %[[VAL_5:.*]] = arith.constant 1 : i32 -! CHECK: omp.wsloop nowait schedule(static = %[[VAL_2]] : i32) { +! CHECK: omp.wsloop nowait schedule(static = %[[VAL_2]] : i32) private({{.*}}) { ! CHECK-NEXT: omp.loop_nest (%[[ARG0:.*]]) : i32 = (%[[VAL_3]]) to (%[[VAL_4]]) inclusive step (%[[VAL_5]]) { ! CHECK: fir.store %[[ARG0]] to %[[STORE_IV:.*]]#1 : !fir.ref ! CHECK: %[[LOAD_IV:.*]] = fir.load %[[STORE_IV]]#0 : !fir.ref @@ -40,7 +40,7 @@ program wsloop ! CHECK: %[[VAL_15:.*]] = arith.constant 1 : i32 ! CHECK: %[[VAL_16:.*]] = arith.constant 9 : i32 ! CHECK: %[[VAL_17:.*]] = arith.constant 1 : i32 -! CHECK: omp.wsloop nowait schedule(static = %[[VAL_14]] : i32) { +! CHECK: omp.wsloop nowait schedule(static = %[[VAL_14]] : i32) private({{.*}}) { ! CHECK-NEXT: omp.loop_nest (%[[ARG1:.*]]) : i32 = (%[[VAL_15]]) to (%[[VAL_16]]) inclusive step (%[[VAL_17]]) { ! CHECK: fir.store %[[ARG1]] to %[[STORE_IV1:.*]]#1 : !fir.ref ! 
CHECK: %[[VAL_24:.*]] = arith.constant 2 : i32 @@ -66,7 +66,7 @@ program wsloop ! CHECK: %[[VAL_30:.*]] = arith.constant 1 : i32 ! CHECK: %[[VAL_31:.*]] = arith.constant 9 : i32 ! CHECK: %[[VAL_32:.*]] = arith.constant 1 : i32 -! CHECK: omp.wsloop nowait schedule(static = %[[VAL_29]] : i32) { +! CHECK: omp.wsloop nowait schedule(static = %[[VAL_29]] : i32) private({{.*}}) { ! CHECK-NEXT: omp.loop_nest (%[[ARG2:.*]]) : i32 = (%[[VAL_30]]) to (%[[VAL_31]]) inclusive step (%[[VAL_32]]) { ! CHECK: fir.store %[[ARG2]] to %[[STORE_IV2:.*]]#1 : !fir.ref ! CHECK: %[[VAL_39:.*]] = arith.constant 3 : i32 diff --git a/flang/test/Lower/OpenMP/wsloop-collapse.f90 b/flang/test/Lower/OpenMP/wsloop-collapse.f90 index 61ee76d589107..6d9862e625400 100644 --- a/flang/test/Lower/OpenMP/wsloop-collapse.f90 +++ b/flang/test/Lower/OpenMP/wsloop-collapse.f90 @@ -38,15 +38,6 @@ program wsloop_collapse !CHECK: %[[VAL_23:.*]] = arith.constant 0 : i32 !CHECK: hlfir.assign %[[VAL_23]] to %[[VAL_19]]#0 : i32, !fir.ref -!CHECK: %[[VAL_4:.*]] = fir.alloca i32 {bindc_name = "i", pinned, {{.*}}} -!CHECK: %[[VAL_5:.*]]:2 = hlfir.declare %[[VAL_4]] {uniq_name = "_QFEi"} : (!fir.ref) -> (!fir.ref, !fir.ref) - -!CHECK: %[[VAL_2:.*]] = fir.alloca i32 {bindc_name = "j", pinned, {{.*}}} -!CHECK: %[[VAL_3:.*]]:2 = hlfir.declare %[[VAL_2]] {uniq_name = "_QFEj"} : (!fir.ref) -> (!fir.ref, !fir.ref) - -!CHECK: %[[VAL_0:.*]] = fir.alloca i32 {bindc_name = "k", pinned, {{.*}}} -!CHECK: %[[VAL_1:.*]]:2 = hlfir.declare %[[VAL_0]] {uniq_name = "_QFEk"} : (!fir.ref) -> (!fir.ref, !fir.ref) - integer :: i, j, k integer :: a, b, c integer :: x @@ -65,12 +56,17 @@ program wsloop_collapse !CHECK: %[[VAL_30:.*]] = arith.constant 1 : i32 !CHECK: %[[VAL_31:.*]] = fir.load %[[VAL_11]]#0 : !fir.ref !CHECK: %[[VAL_32:.*]] = arith.constant 1 : i32 -!CHECK: omp.wsloop { +!CHECK: omp.wsloop private(@{{.*}} %{{.*}}#0 -> %[[VAL_4:.*]], @{{.*}} %{{.*}}#0 -> %[[VAL_2:.*]], @{{.*}} %{{.*}}#0 -> %[[VAL_0:.*]] : !fir.ref, !fir.ref, 
!fir.ref) { !CHECK-NEXT: omp.loop_nest (%[[VAL_33:.*]], %[[VAL_34:.*]], %[[VAL_35:.*]]) : i32 = (%[[VAL_24]], %[[VAL_27]], %[[VAL_30]]) to (%[[VAL_25]], %[[VAL_28]], %[[VAL_31]]) inclusive step (%[[VAL_26]], %[[VAL_29]], %[[VAL_32]]) { !$omp do collapse(3) do i = 1, a do j= 1, b do k = 1, c + +!CHECK: %[[VAL_5:.*]]:2 = hlfir.declare %[[VAL_4]] {uniq_name = "_QFEi"} : (!fir.ref) -> (!fir.ref, !fir.ref) +!CHECK: %[[VAL_3:.*]]:2 = hlfir.declare %[[VAL_2]] {uniq_name = "_QFEj"} : (!fir.ref) -> (!fir.ref, !fir.ref) +!CHECK: %[[VAL_1:.*]]:2 = hlfir.declare %[[VAL_0]] {uniq_name = "_QFEk"} : (!fir.ref) -> (!fir.ref, !fir.ref) + !CHECK: fir.store %[[VAL_33]] to %[[VAL_5]]#1 : !fir.ref !CHECK: fir.store %[[VAL_34]] to %[[VAL_3]]#1 : !fir.ref !CHECK: fir.store %[[VAL_35]] to %[[VAL_1]]#1 : !fir.ref diff --git a/flang/test/Lower/OpenMP/wsloop-monotonic.f90 b/flang/test/Lower/OpenMP/wsloop-monotonic.f90 index 9659fff2d42e7..e21aa4c678f42 100644 --- a/flang/test/Lower/OpenMP/wsloop-monotonic.f90 +++ b/flang/test/Lower/OpenMP/wsloop-monotonic.f90 @@ -11,11 +11,10 @@ program wsloop_dynamic !CHECK: omp.parallel { !$OMP DO SCHEDULE(monotonic:dynamic) -!CHECK: %[[ALLOCA_IV:.*]] = fir.alloca i32 {{{.*}}, pinned, {{.*}}} !CHECK: %[[WS_LB:.*]] = arith.constant 1 : i32 !CHECK: %[[WS_UB:.*]] = arith.constant 9 : i32 !CHECK: %[[WS_STEP:.*]] = arith.constant 1 : i32 -!CHECK: omp.wsloop nowait schedule(dynamic, monotonic) { +!CHECK: omp.wsloop nowait schedule(dynamic, monotonic) private({{.*}}) { !CHECK-NEXT: omp.loop_nest (%[[I:.*]]) : i32 = (%[[WS_LB]]) to (%[[WS_UB]]) inclusive step (%[[WS_STEP]]) { !CHECK: fir.store %[[I]] to %[[ALLOCA_IV:.*]]#1 : !fir.ref diff --git a/flang/test/Lower/OpenMP/wsloop-nonmonotonic.f90 b/flang/test/Lower/OpenMP/wsloop-nonmonotonic.f90 index b1bea525ff489..23d3c49c00786 100644 --- a/flang/test/Lower/OpenMP/wsloop-nonmonotonic.f90 +++ b/flang/test/Lower/OpenMP/wsloop-nonmonotonic.f90 @@ -12,13 +12,12 @@ program wsloop_dynamic !CHECK: omp.parallel { !$OMP DO 
SCHEDULE(nonmonotonic:dynamic) -!CHECK: %[[I_REF:.*]] = fir.alloca i32 {{{.*}}, pinned, {{.*}}} -!CHECK: %[[ALLOCA_IV:.*]]:2 = hlfir.declare %[[I_REF]] {uniq_name = "_QFEi"} : (!fir.ref) -> (!fir.ref, !fir.ref) !CHECK: %[[WS_LB:.*]] = arith.constant 1 : i32 !CHECK: %[[WS_UB:.*]] = arith.constant 9 : i32 !CHECK: %[[WS_STEP:.*]] = arith.constant 1 : i32 -!CHECK: omp.wsloop nowait schedule(dynamic, nonmonotonic) { +!CHECK: omp.wsloop nowait schedule(dynamic, nonmonotonic) private(@{{.*}} %{{.*}}#0 -> %[[I_REF:.*]] : !fir.ref) { !CHECK-NEXT: omp.loop_nest (%[[I:.*]]) : i32 = (%[[WS_LB]]) to (%[[WS_UB]]) inclusive step (%[[WS_STEP]]) { +!CHECK: %[[ALLOCA_IV:.*]]:2 = hlfir.declare %[[I_REF]] {uniq_name = "_QFEi"} : (!fir.ref) -> (!fir.ref, !fir.ref) !CHECK: fir.store %[[I]] to %[[ALLOCA_IV]]#1 : !fir.ref do i=1, 9 diff --git a/flang/test/Lower/OpenMP/wsloop-ordered.f90 b/flang/test/Lower/OpenMP/wsloop-ordered.f90 index 5fa53f7b28447..4862b7296a9bc 100644 --- a/flang/test/Lower/OpenMP/wsloop-ordered.f90 +++ b/flang/test/Lower/OpenMP/wsloop-ordered.f90 @@ -6,7 +6,7 @@ subroutine wsloop_ordered_no_para() integer :: a(10), i -! CHECK: omp.wsloop ordered(0) { +! CHECK: omp.wsloop ordered(0) private({{.*}}) { ! CHECK-NEXT: omp.loop_nest (%{{.*}}) : i32 = (%{{.*}}) to (%{{.*}}) inclusive step (%{{.*}}) { ! CHECK: omp.yield ! CHECK: } @@ -27,7 +27,7 @@ subroutine wsloop_ordered_with_para() integer :: a(10), i ! CHECK: func @_QPwsloop_ordered_with_para() { -! CHECK: omp.wsloop ordered(1) { +! CHECK: omp.wsloop ordered(1) private({{.*}}) { ! CHECK-NEXT: omp.loop_nest (%{{.*}}) : i32 = (%{{.*}}) to (%{{.*}}) inclusive step (%{{.*}}) { ! CHECK: omp.yield ! CHECK: } diff --git a/flang/test/Lower/OpenMP/wsloop-reduction-add-byref.f90 b/flang/test/Lower/OpenMP/wsloop-reduction-add-byref.f90 index c38a79191bc4e..bc021e7a3b273 100644 --- a/flang/test/Lower/OpenMP/wsloop-reduction-add-byref.f90 +++ b/flang/test/Lower/OpenMP/wsloop-reduction-add-byref.f90 @@ -85,13 +85,12 @@ ! 
CHECK: %[[VAL_4:.*]] = arith.constant 0 : i32 ! CHECK: hlfir.assign %[[VAL_4]] to %[[VAL_3]]#0 : i32, !fir.ref ! CHECK: omp.parallel { -! CHECK: %[[VAL_5:.*]] = fir.alloca i32 {bindc_name = "i", pinned, {{.*}}} -! CHECK: %[[VAL_6:.*]]:2 = hlfir.declare %[[VAL_5]] {uniq_name = "_QFsimple_int_reductionEi"} : (!fir.ref) -> (!fir.ref, !fir.ref) ! CHECK: %[[VAL_7:.*]] = arith.constant 1 : i32 ! CHECK: %[[VAL_8:.*]] = arith.constant 100 : i32 ! CHECK: %[[VAL_9:.*]] = arith.constant 1 : i32 -! CHECK: omp.wsloop reduction(byref @add_reduction_byref_i32 %[[VAL_3]]#0 -> %[[VAL_10:.*]] : !fir.ref) { +! CHECK: omp.wsloop private(@{{.*}} %{{.*}}#0 -> %[[VAL_5:.*]] : !fir.ref) reduction(byref @add_reduction_byref_i32 %[[VAL_3]]#0 -> %[[VAL_10:.*]] : !fir.ref) { ! CHECK-NEXT: omp.loop_nest (%[[VAL_11:.*]]) : i32 = (%[[VAL_7]]) to (%[[VAL_8]]) inclusive step (%[[VAL_9]]) { +! CHECK: %[[VAL_6:.*]]:2 = hlfir.declare %[[VAL_5]] {uniq_name = "_QFsimple_int_reductionEi"} : (!fir.ref) -> (!fir.ref, !fir.ref) ! CHECK: %[[VAL_12:.*]]:2 = hlfir.declare %[[VAL_10]] {uniq_name = "_QFsimple_int_reductionEx"} : (!fir.ref) -> (!fir.ref, !fir.ref) ! CHECK: fir.store %[[VAL_11]] to %[[VAL_6]]#1 : !fir.ref ! CHECK: %[[VAL_13:.*]] = fir.load %[[VAL_12]]#0 : !fir.ref @@ -127,13 +126,12 @@ subroutine simple_int_reduction ! CHECK: %[[VAL_4:.*]] = arith.constant 0.000000e+00 : f32 ! CHECK: hlfir.assign %[[VAL_4]] to %[[VAL_3]]#0 : f32, !fir.ref ! CHECK: omp.parallel { -! CHECK: %[[VAL_5:.*]] = fir.alloca i32 {bindc_name = "i", pinned, {{.*}}} -! CHECK: %[[VAL_6:.*]]:2 = hlfir.declare %[[VAL_5]] {uniq_name = "_QFsimple_real_reductionEi"} : (!fir.ref) -> (!fir.ref, !fir.ref) ! CHECK: %[[VAL_7:.*]] = arith.constant 1 : i32 ! CHECK: %[[VAL_8:.*]] = arith.constant 100 : i32 ! CHECK: %[[VAL_9:.*]] = arith.constant 1 : i32 -! CHECK: omp.wsloop reduction(byref @add_reduction_byref_f32 %[[VAL_3]]#0 -> %[[VAL_10:.*]] : !fir.ref) { +! 
CHECK: omp.wsloop private(@{{.*}} %{{.*}}#0 -> %[[VAL_5:.*]] : !fir.ref) reduction(byref @add_reduction_byref_f32 %[[VAL_3]]#0 -> %[[VAL_10:.*]] : !fir.ref) { ! CHECK-NEXT: omp.loop_nest (%[[VAL_11:.*]]) : i32 = (%[[VAL_7]]) to (%[[VAL_8]]) inclusive step (%[[VAL_9]]) { +! CHECK: %[[VAL_6:.*]]:2 = hlfir.declare %[[VAL_5]] {uniq_name = "_QFsimple_real_reductionEi"} : (!fir.ref) -> (!fir.ref, !fir.ref) ! CHECK: %[[VAL_12:.*]]:2 = hlfir.declare %[[VAL_10]] {uniq_name = "_QFsimple_real_reductionEx"} : (!fir.ref) -> (!fir.ref, !fir.ref) ! CHECK: fir.store %[[VAL_11]] to %[[VAL_6]]#1 : !fir.ref ! CHECK: %[[VAL_13:.*]] = fir.load %[[VAL_12]]#0 : !fir.ref @@ -170,13 +168,12 @@ subroutine simple_real_reduction ! CHECK: %[[VAL_4:.*]] = arith.constant 0 : i32 ! CHECK: hlfir.assign %[[VAL_4]] to %[[VAL_3]]#0 : i32, !fir.ref ! CHECK: omp.parallel { -! CHECK: %[[VAL_5:.*]] = fir.alloca i32 {bindc_name = "i", pinned, {{.*}}} -! CHECK: %[[VAL_6:.*]]:2 = hlfir.declare %[[VAL_5]] {uniq_name = "_QFsimple_int_reduction_switch_orderEi"} : (!fir.ref) -> (!fir.ref, !fir.ref) ! CHECK: %[[VAL_7:.*]] = arith.constant 1 : i32 ! CHECK: %[[VAL_8:.*]] = arith.constant 100 : i32 ! CHECK: %[[VAL_9:.*]] = arith.constant 1 : i32 -! CHECK: omp.wsloop reduction(byref @add_reduction_byref_i32 %[[VAL_3]]#0 -> %[[VAL_10:.*]] : !fir.ref) { +! CHECK: omp.wsloop private(@{{.*}} %{{.*}}#0 -> %[[VAL_5:.*]] : !fir.ref) reduction(byref @add_reduction_byref_i32 %[[VAL_3]]#0 -> %[[VAL_10:.*]] : !fir.ref) { ! CHECK-NEXT: omp.loop_nest (%[[VAL_11:.*]]) : i32 = (%[[VAL_7]]) to (%[[VAL_8]]) inclusive step (%[[VAL_9]]) { +! CHECK: %[[VAL_6:.*]]:2 = hlfir.declare %[[VAL_5]] {uniq_name = "_QFsimple_int_reduction_switch_orderEi"} : (!fir.ref) -> (!fir.ref, !fir.ref) ! CHECK: %[[VAL_12:.*]]:2 = hlfir.declare %[[VAL_10]] {uniq_name = "_QFsimple_int_reduction_switch_orderEx"} : (!fir.ref) -> (!fir.ref, !fir.ref) ! CHECK: fir.store %[[VAL_11]] to %[[VAL_6]]#1 : !fir.ref ! 
CHECK: %[[VAL_13:.*]] = fir.load %[[VAL_6]]#0 : !fir.ref @@ -211,13 +208,12 @@ subroutine simple_int_reduction_switch_order ! CHECK: %[[VAL_4:.*]] = arith.constant 0.000000e+00 : f32 ! CHECK: hlfir.assign %[[VAL_4]] to %[[VAL_3]]#0 : f32, !fir.ref ! CHECK: omp.parallel { -! CHECK: %[[VAL_5:.*]] = fir.alloca i32 {bindc_name = "i", pinned, {{.*}}} -! CHECK: %[[VAL_6:.*]]:2 = hlfir.declare %[[VAL_5]] {uniq_name = "_QFsimple_real_reduction_switch_orderEi"} : (!fir.ref) -> (!fir.ref, !fir.ref) ! CHECK: %[[VAL_7:.*]] = arith.constant 1 : i32 ! CHECK: %[[VAL_8:.*]] = arith.constant 100 : i32 ! CHECK: %[[VAL_9:.*]] = arith.constant 1 : i32 -! CHECK: omp.wsloop reduction(byref @add_reduction_byref_f32 %[[VAL_3]]#0 -> %[[VAL_10:.*]] : !fir.ref) { +! CHECK: omp.wsloop private(@{{.*}} %{{.*}}#0 -> %[[VAL_5:.*]] : !fir.ref) reduction(byref @add_reduction_byref_f32 %[[VAL_3]]#0 -> %[[VAL_10:.*]] : !fir.ref) { ! CHECK-NEXT: omp.loop_nest (%[[VAL_11:.*]]) : i32 = (%[[VAL_7]]) to (%[[VAL_8]]) inclusive step (%[[VAL_9]]) { +! CHECK: %[[VAL_6:.*]]:2 = hlfir.declare %[[VAL_5]] {uniq_name = "_QFsimple_real_reduction_switch_orderEi"} : (!fir.ref) -> (!fir.ref, !fir.ref) ! CHECK: %[[VAL_12:.*]]:2 = hlfir.declare %[[VAL_10]] {uniq_name = "_QFsimple_real_reduction_switch_orderEx"} : (!fir.ref) -> (!fir.ref, !fir.ref) ! CHECK: fir.store %[[VAL_11]] to %[[VAL_6]]#1 : !fir.ref ! CHECK: %[[VAL_13:.*]] = fir.load %[[VAL_6]]#0 : !fir.ref @@ -261,13 +257,12 @@ subroutine simple_real_reduction_switch_order ! CHECK: %[[VAL_10:.*]] = arith.constant 0 : i32 ! CHECK: hlfir.assign %[[VAL_10]] to %[[VAL_7]]#0 : i32, !fir.ref ! CHECK: omp.parallel { -! CHECK: %[[VAL_11:.*]] = fir.alloca i32 {bindc_name = "i", pinned, {{.*}}} -! CHECK: %[[VAL_12:.*]]:2 = hlfir.declare %[[VAL_11]] {uniq_name = "_QFmultiple_int_reductions_same_typeEi"} : (!fir.ref) -> (!fir.ref, !fir.ref) ! CHECK: %[[VAL_13:.*]] = arith.constant 1 : i32 ! CHECK: %[[VAL_14:.*]] = arith.constant 100 : i32 ! 
CHECK: %[[VAL_15:.*]] = arith.constant 1 : i32 -! CHECK: omp.wsloop reduction(byref @add_reduction_byref_i32 %[[VAL_3]]#0 -> %[[VAL_16:.*]], byref @add_reduction_byref_i32 %[[VAL_5]]#0 -> %[[VAL_17:.*]], byref @add_reduction_byref_i32 %[[VAL_7]]#0 -> %[[VAL_18:.*]] : !fir.ref, !fir.ref, !fir.ref) { +! CHECK: omp.wsloop private(@{{.*}} %{{.*}}#0 -> %[[VAL_11:.*]] : !fir.ref) reduction(byref @add_reduction_byref_i32 %[[VAL_3]]#0 -> %[[VAL_16:.*]], byref @add_reduction_byref_i32 %[[VAL_5]]#0 -> %[[VAL_17:.*]], byref @add_reduction_byref_i32 %[[VAL_7]]#0 -> %[[VAL_18:.*]] : !fir.ref, !fir.ref, !fir.ref) { ! CHECK-NEXT: omp.loop_nest (%[[VAL_19:.*]]) : i32 = (%[[VAL_13]]) to (%[[VAL_14]]) inclusive step (%[[VAL_15]]) { +! CHECK: %[[VAL_12:.*]]:2 = hlfir.declare %[[VAL_11]] {uniq_name = "_QFmultiple_int_reductions_same_typeEi"} : (!fir.ref) -> (!fir.ref, !fir.ref) ! CHECK: %[[VAL_20:.*]]:2 = hlfir.declare %[[VAL_16]] {uniq_name = "_QFmultiple_int_reductions_same_typeEx"} : (!fir.ref) -> (!fir.ref, !fir.ref) ! CHECK: %[[VAL_21:.*]]:2 = hlfir.declare %[[VAL_17]] {uniq_name = "_QFmultiple_int_reductions_same_typeEy"} : (!fir.ref) -> (!fir.ref, !fir.ref) ! CHECK: %[[VAL_22:.*]]:2 = hlfir.declare %[[VAL_18]] {uniq_name = "_QFmultiple_int_reductions_same_typeEz"} : (!fir.ref) -> (!fir.ref, !fir.ref) @@ -324,13 +319,12 @@ subroutine multiple_int_reductions_same_type ! CHECK: %[[VAL_10:.*]] = arith.constant 0.000000e+00 : f32 ! CHECK: hlfir.assign %[[VAL_10]] to %[[VAL_7]]#0 : f32, !fir.ref ! CHECK: omp.parallel { -! CHECK: %[[VAL_11:.*]] = fir.alloca i32 {bindc_name = "i", pinned, {{.*}}} -! CHECK: %[[VAL_12:.*]]:2 = hlfir.declare %[[VAL_11]] {uniq_name = "_QFmultiple_real_reductions_same_typeEi"} : (!fir.ref) -> (!fir.ref, !fir.ref) ! CHECK: %[[VAL_13:.*]] = arith.constant 1 : i32 ! CHECK: %[[VAL_14:.*]] = arith.constant 100 : i32 ! CHECK: %[[VAL_15:.*]] = arith.constant 1 : i32 -! 
CHECK: omp.wsloop reduction(byref @add_reduction_byref_f32 %[[VAL_3]]#0 -> %[[VAL_16:.*]], byref @add_reduction_byref_f32 %[[VAL_5]]#0 -> %[[VAL_17:.*]], byref @add_reduction_byref_f32 %[[VAL_7]]#0 -> %[[VAL_18:.*]] : !fir.ref, !fir.ref, !fir.ref) { +! CHECK: omp.wsloop private(@{{.*}} %{{.*}}#0 -> %[[VAL_11:.*]] : !fir.ref) reduction(byref @add_reduction_byref_f32 %[[VAL_3]]#0 -> %[[VAL_16:.*]], byref @add_reduction_byref_f32 %[[VAL_5]]#0 -> %[[VAL_17:.*]], byref @add_reduction_byref_f32 %[[VAL_7]]#0 -> %[[VAL_18:.*]] : !fir.ref, !fir.ref, !fir.ref) { ! CHECK-NEXT: omp.loop_nest (%[[VAL_19:.*]]) : i32 = (%[[VAL_13]]) to (%[[VAL_14]]) inclusive step (%[[VAL_15]]) { +! CHECK: %[[VAL_12:.*]]:2 = hlfir.declare %[[VAL_11]] {uniq_name = "_QFmultiple_real_reductions_same_typeEi"} : (!fir.ref) -> (!fir.ref, !fir.ref) ! CHECK: %[[VAL_20:.*]]:2 = hlfir.declare %[[VAL_16]] {uniq_name = "_QFmultiple_real_reductions_same_typeEx"} : (!fir.ref) -> (!fir.ref, !fir.ref) ! CHECK: %[[VAL_21:.*]]:2 = hlfir.declare %[[VAL_17]] {uniq_name = "_QFmultiple_real_reductions_same_typeEy"} : (!fir.ref) -> (!fir.ref, !fir.ref) ! CHECK: %[[VAL_22:.*]]:2 = hlfir.declare %[[VAL_18]] {uniq_name = "_QFmultiple_real_reductions_same_typeEz"} : (!fir.ref) -> (!fir.ref, !fir.ref) @@ -394,13 +388,12 @@ subroutine multiple_real_reductions_same_type ! CHECK: %[[VAL_13:.*]] = arith.constant 0.000000e+00 : f64 ! CHECK: hlfir.assign %[[VAL_13]] to %[[VAL_3]]#0 : f64, !fir.ref ! CHECK: omp.parallel { -! CHECK: %[[VAL_14:.*]] = fir.alloca i32 {bindc_name = "i", pinned, {{.*}}} -! CHECK: %[[VAL_15:.*]]:2 = hlfir.declare %[[VAL_14]] {uniq_name = "_QFmultiple_reductions_different_typeEi"} : (!fir.ref) -> (!fir.ref, !fir.ref) ! CHECK: %[[VAL_16:.*]] = arith.constant 1 : i32 ! CHECK: %[[VAL_17:.*]] = arith.constant 100 : i32 ! CHECK: %[[VAL_18:.*]] = arith.constant 1 : i32 -! 
CHECK: omp.wsloop reduction(byref @add_reduction_byref_i32 %[[VAL_5]]#0 -> %[[VAL_19:.*]], byref @add_reduction_byref_i64 %[[VAL_7]]#0 -> %[[VAL_20:.*]], byref @add_reduction_byref_f32 %[[VAL_9]]#0 -> %[[VAL_21:.*]], byref @add_reduction_byref_f64 %[[VAL_3]]#0 -> %[[VAL_22:.*]] : !fir.ref, !fir.ref, !fir.ref, !fir.ref) { +! CHECK: omp.wsloop private(@{{.*}} %{{.*}}#0 -> %[[VAL_14:.*]] : !fir.ref) reduction(byref @add_reduction_byref_i32 %[[VAL_5]]#0 -> %[[VAL_19:.*]], byref @add_reduction_byref_i64 %[[VAL_7]]#0 -> %[[VAL_20:.*]], byref @add_reduction_byref_f32 %[[VAL_9]]#0 -> %[[VAL_21:.*]], byref @add_reduction_byref_f64 %[[VAL_3]]#0 -> %[[VAL_22:.*]] : !fir.ref, !fir.ref, !fir.ref, !fir.ref) { ! CHECK-NEXT: omp.loop_nest (%[[VAL_23:.*]]) : i32 = (%[[VAL_16]]) to (%[[VAL_17]]) inclusive step (%[[VAL_18]]) { +! CHECK: %[[VAL_15:.*]]:2 = hlfir.declare %[[VAL_14]] {uniq_name = "_QFmultiple_reductions_different_typeEi"} : (!fir.ref) -> (!fir.ref, !fir.ref) ! CHECK: %[[VAL_24:.*]]:2 = hlfir.declare %[[VAL_19]] {uniq_name = "_QFmultiple_reductions_different_typeEx"} : (!fir.ref) -> (!fir.ref, !fir.ref) ! CHECK: %[[VAL_25:.*]]:2 = hlfir.declare %[[VAL_20]] {uniq_name = "_QFmultiple_reductions_different_typeEy"} : (!fir.ref) -> (!fir.ref, !fir.ref) ! CHECK: %[[VAL_26:.*]]:2 = hlfir.declare %[[VAL_21]] {uniq_name = "_QFmultiple_reductions_different_typeEz"} : (!fir.ref) -> (!fir.ref, !fir.ref) diff --git a/flang/test/Lower/OpenMP/wsloop-reduction-add.f90 b/flang/test/Lower/OpenMP/wsloop-reduction-add.f90 index c5278e0ef8815..a355e968b4146 100644 --- a/flang/test/Lower/OpenMP/wsloop-reduction-add.f90 +++ b/flang/test/Lower/OpenMP/wsloop-reduction-add.f90 @@ -53,13 +53,12 @@ ! CHECK: %[[VAL_4:.*]] = arith.constant 0 : i32 ! CHECK: hlfir.assign %[[VAL_4]] to %[[VAL_3]]#0 : i32, !fir.ref ! CHECK: omp.parallel { -! CHECK: %[[VAL_5:.*]] = fir.alloca i32 {bindc_name = "i", pinned, {{.*}}} -! 
CHECK: %[[VAL_6:.*]]:2 = hlfir.declare %[[VAL_5]] {uniq_name = "_QFsimple_int_reductionEi"} : (!fir.ref) -> (!fir.ref, !fir.ref) ! CHECK: %[[VAL_7:.*]] = arith.constant 1 : i32 ! CHECK: %[[VAL_8:.*]] = arith.constant 100 : i32 ! CHECK: %[[VAL_9:.*]] = arith.constant 1 : i32 -! CHECK: omp.wsloop reduction(@add_reduction_i32 %[[VAL_3]]#0 -> %[[VAL_10:.*]] : !fir.ref) { +! CHECK: omp.wsloop private(@{{.*}} %{{.*}}#0 -> %[[VAL_5:.*]] : !fir.ref) reduction(@add_reduction_i32 %[[VAL_3]]#0 -> %[[VAL_10:.*]] : !fir.ref) { ! CHECK-NEXT: omp.loop_nest (%[[VAL_11:.*]]) : i32 = (%[[VAL_7]]) to (%[[VAL_8]]) inclusive step (%[[VAL_9]]) { +! CHECK: %[[VAL_6:.*]]:2 = hlfir.declare %[[VAL_5]] {uniq_name = "_QFsimple_int_reductionEi"} : (!fir.ref) -> (!fir.ref, !fir.ref) ! CHECK: %[[VAL_12:.*]]:2 = hlfir.declare %[[VAL_10]] {uniq_name = "_QFsimple_int_reductionEx"} : (!fir.ref) -> (!fir.ref, !fir.ref) ! CHECK: fir.store %[[VAL_11]] to %[[VAL_6]]#1 : !fir.ref ! CHECK: %[[VAL_13:.*]] = fir.load %[[VAL_12]]#0 : !fir.ref @@ -95,13 +94,12 @@ subroutine simple_int_reduction ! CHECK: %[[VAL_4:.*]] = arith.constant 0.000000e+00 : f32 ! CHECK: hlfir.assign %[[VAL_4]] to %[[VAL_3]]#0 : f32, !fir.ref ! CHECK: omp.parallel { -! CHECK: %[[VAL_5:.*]] = fir.alloca i32 {bindc_name = "i", pinned, {{.*}}} -! CHECK: %[[VAL_6:.*]]:2 = hlfir.declare %[[VAL_5]] {uniq_name = "_QFsimple_real_reductionEi"} : (!fir.ref) -> (!fir.ref, !fir.ref) ! CHECK: %[[VAL_7:.*]] = arith.constant 1 : i32 ! CHECK: %[[VAL_8:.*]] = arith.constant 100 : i32 ! CHECK: %[[VAL_9:.*]] = arith.constant 1 : i32 -! CHECK: omp.wsloop reduction(@add_reduction_f32 %[[VAL_3]]#0 -> %[[VAL_10:.*]] : !fir.ref) { +! CHECK: omp.wsloop private(@{{.*}} %{{.*}}#0 -> %[[VAL_5:.*]] : !fir.ref) reduction(@add_reduction_f32 %[[VAL_3]]#0 -> %[[VAL_10:.*]] : !fir.ref) { ! CHECK-NEXT: omp.loop_nest (%[[VAL_11:.*]]) : i32 = (%[[VAL_7]]) to (%[[VAL_8]]) inclusive step (%[[VAL_9]]) { +! 
CHECK: %[[VAL_6:.*]]:2 = hlfir.declare %[[VAL_5]] {uniq_name = "_QFsimple_real_reductionEi"} : (!fir.ref) -> (!fir.ref, !fir.ref) ! CHECK: %[[VAL_12:.*]]:2 = hlfir.declare %[[VAL_10]] {uniq_name = "_QFsimple_real_reductionEx"} : (!fir.ref) -> (!fir.ref, !fir.ref) ! CHECK: fir.store %[[VAL_11]] to %[[VAL_6]]#1 : !fir.ref ! CHECK: %[[VAL_13:.*]] = fir.load %[[VAL_12]]#0 : !fir.ref @@ -138,13 +136,12 @@ subroutine simple_real_reduction ! CHECK: %[[VAL_4:.*]] = arith.constant 0 : i32 ! CHECK: hlfir.assign %[[VAL_4]] to %[[VAL_3]]#0 : i32, !fir.ref ! CHECK: omp.parallel { -! CHECK: %[[VAL_5:.*]] = fir.alloca i32 {bindc_name = "i", pinned, {{.*}}} -! CHECK: %[[VAL_6:.*]]:2 = hlfir.declare %[[VAL_5]] {uniq_name = "_QFsimple_int_reduction_switch_orderEi"} : (!fir.ref) -> (!fir.ref, !fir.ref) ! CHECK: %[[VAL_7:.*]] = arith.constant 1 : i32 ! CHECK: %[[VAL_8:.*]] = arith.constant 100 : i32 ! CHECK: %[[VAL_9:.*]] = arith.constant 1 : i32 -! CHECK: omp.wsloop reduction(@add_reduction_i32 %[[VAL_3]]#0 -> %[[VAL_10:.*]] : !fir.ref) { +! CHECK: omp.wsloop private(@{{.*}} %{{.*}}#0 -> %[[VAL_5:.*]] : !fir.ref) reduction(@add_reduction_i32 %[[VAL_3]]#0 -> %[[VAL_10:.*]] : !fir.ref) { ! CHECK-NEXT: omp.loop_nest (%[[VAL_11:.*]]) : i32 = (%[[VAL_7]]) to (%[[VAL_8]]) inclusive step (%[[VAL_9]]) { +! CHECK: %[[VAL_6:.*]]:2 = hlfir.declare %[[VAL_5]] {uniq_name = "_QFsimple_int_reduction_switch_orderEi"} : (!fir.ref) -> (!fir.ref, !fir.ref) ! CHECK: %[[VAL_12:.*]]:2 = hlfir.declare %[[VAL_10]] {uniq_name = "_QFsimple_int_reduction_switch_orderEx"} : (!fir.ref) -> (!fir.ref, !fir.ref) ! CHECK: fir.store %[[VAL_11]] to %[[VAL_6]]#1 : !fir.ref ! CHECK: %[[VAL_13:.*]] = fir.load %[[VAL_6]]#0 : !fir.ref @@ -179,13 +176,12 @@ subroutine simple_int_reduction_switch_order ! CHECK: %[[VAL_4:.*]] = arith.constant 0.000000e+00 : f32 ! CHECK: hlfir.assign %[[VAL_4]] to %[[VAL_3]]#0 : f32, !fir.ref ! CHECK: omp.parallel { -! CHECK: %[[VAL_5:.*]] = fir.alloca i32 {bindc_name = "i", pinned, {{.*}}} -! 
CHECK: %[[VAL_6:.*]]:2 = hlfir.declare %[[VAL_5]] {uniq_name = "_QFsimple_real_reduction_switch_orderEi"} : (!fir.ref) -> (!fir.ref, !fir.ref) ! CHECK: %[[VAL_7:.*]] = arith.constant 1 : i32 ! CHECK: %[[VAL_8:.*]] = arith.constant 100 : i32 ! CHECK: %[[VAL_9:.*]] = arith.constant 1 : i32 -! CHECK: omp.wsloop reduction(@add_reduction_f32 %[[VAL_3]]#0 -> %[[VAL_10:.*]] : !fir.ref) { +! CHECK: omp.wsloop private(@{{.*}} %{{.*}}#0 -> %[[VAL_5:.*]] : !fir.ref) reduction(@add_reduction_f32 %[[VAL_3]]#0 -> %[[VAL_10:.*]] : !fir.ref) { ! CHECK-NEXT: omp.loop_nest (%[[VAL_11:.*]]) : i32 = (%[[VAL_7]]) to (%[[VAL_8]]) inclusive step (%[[VAL_9]]) { +! CHECK: %[[VAL_6:.*]]:2 = hlfir.declare %[[VAL_5]] {uniq_name = "_QFsimple_real_reduction_switch_orderEi"} : (!fir.ref) -> (!fir.ref, !fir.ref) ! CHECK: %[[VAL_12:.*]]:2 = hlfir.declare %[[VAL_10]] {uniq_name = "_QFsimple_real_reduction_switch_orderEx"} : (!fir.ref) -> (!fir.ref, !fir.ref) ! CHECK: fir.store %[[VAL_11]] to %[[VAL_6]]#1 : !fir.ref ! CHECK: %[[VAL_13:.*]] = fir.load %[[VAL_6]]#0 : !fir.ref @@ -229,13 +225,12 @@ subroutine simple_real_reduction_switch_order ! CHECK: %[[VAL_10:.*]] = arith.constant 0 : i32 ! CHECK: hlfir.assign %[[VAL_10]] to %[[VAL_7]]#0 : i32, !fir.ref ! CHECK: omp.parallel { -! CHECK: %[[VAL_11:.*]] = fir.alloca i32 {bindc_name = "i", pinned, {{.*}}} -! CHECK: %[[VAL_12:.*]]:2 = hlfir.declare %[[VAL_11]] {uniq_name = "_QFmultiple_int_reductions_same_typeEi"} : (!fir.ref) -> (!fir.ref, !fir.ref) ! CHECK: %[[VAL_13:.*]] = arith.constant 1 : i32 ! CHECK: %[[VAL_14:.*]] = arith.constant 100 : i32 ! CHECK: %[[VAL_15:.*]] = arith.constant 1 : i32 -! CHECK: omp.wsloop reduction(@add_reduction_i32 %[[VAL_3]]#0 -> %[[VAL_16:.*]], @add_reduction_i32 %[[VAL_5]]#0 -> %[[VAL_17:.*]], @add_reduction_i32 %[[VAL_7]]#0 -> %[[VAL_18:.*]] : !fir.ref, !fir.ref, !fir.ref) { +! 
CHECK: omp.wsloop private(@{{.*}} %{{.*}}#0 -> %[[VAL_11:.*]] : !fir.ref) reduction(@add_reduction_i32 %[[VAL_3]]#0 -> %[[VAL_16:.*]], @add_reduction_i32 %[[VAL_5]]#0 -> %[[VAL_17:.*]], @add_reduction_i32 %[[VAL_7]]#0 -> %[[VAL_18:.*]] : !fir.ref, !fir.ref, !fir.ref) { ! CHECK-NEXT: omp.loop_nest (%[[VAL_19:.*]]) : i32 = (%[[VAL_13]]) to (%[[VAL_14]]) inclusive step (%[[VAL_15]]) { +! CHECK: %[[VAL_12:.*]]:2 = hlfir.declare %[[VAL_11]] {uniq_name = "_QFmultiple_int_reductions_same_typeEi"} : (!fir.ref) -> (!fir.ref, !fir.ref) ! CHECK: %[[VAL_20:.*]]:2 = hlfir.declare %[[VAL_16]] {uniq_name = "_QFmultiple_int_reductions_same_typeEx"} : (!fir.ref) -> (!fir.ref, !fir.ref) ! CHECK: %[[VAL_21:.*]]:2 = hlfir.declare %[[VAL_17]] {uniq_name = "_QFmultiple_int_reductions_same_typeEy"} : (!fir.ref) -> (!fir.ref, !fir.ref) ! CHECK: %[[VAL_22:.*]]:2 = hlfir.declare %[[VAL_18]] {uniq_name = "_QFmultiple_int_reductions_same_typeEz"} : (!fir.ref) -> (!fir.ref, !fir.ref) @@ -292,13 +287,12 @@ subroutine multiple_int_reductions_same_type ! CHECK: %[[VAL_10:.*]] = arith.constant 0.000000e+00 : f32 ! CHECK: hlfir.assign %[[VAL_10]] to %[[VAL_7]]#0 : f32, !fir.ref ! CHECK: omp.parallel { -! CHECK: %[[VAL_11:.*]] = fir.alloca i32 {bindc_name = "i", pinned, {{.*}}} -! CHECK: %[[VAL_12:.*]]:2 = hlfir.declare %[[VAL_11]] {uniq_name = "_QFmultiple_real_reductions_same_typeEi"} : (!fir.ref) -> (!fir.ref, !fir.ref) ! CHECK: %[[VAL_13:.*]] = arith.constant 1 : i32 ! CHECK: %[[VAL_14:.*]] = arith.constant 100 : i32 ! CHECK: %[[VAL_15:.*]] = arith.constant 1 : i32 -! CHECK: omp.wsloop reduction(@add_reduction_f32 %[[VAL_3]]#0 -> %[[VAL_16:.*]], @add_reduction_f32 %[[VAL_5]]#0 -> %[[VAL_17:.*]], @add_reduction_f32 %[[VAL_7]]#0 -> %[[VAL_18:.*]] : !fir.ref, !fir.ref, !fir.ref) { +! 
CHECK: omp.wsloop private(@{{.*}} %{{.*}}#0 -> %[[VAL_11:.*]] : !fir.ref) reduction(@add_reduction_f32 %[[VAL_3]]#0 -> %[[VAL_16:.*]], @add_reduction_f32 %[[VAL_5]]#0 -> %[[VAL_17:.*]], @add_reduction_f32 %[[VAL_7]]#0 -> %[[VAL_18:.*]] : !fir.ref, !fir.ref, !fir.ref) { ! CHECK-NEXT: omp.loop_nest (%[[VAL_19:.*]]) : i32 = (%[[VAL_13]]) to (%[[VAL_14]]) inclusive step (%[[VAL_15]]) { +! CHECK: %[[VAL_12:.*]]:2 = hlfir.declare %[[VAL_11]] {uniq_name = "_QFmultiple_real_reductions_same_typeEi"} : (!fir.ref) -> (!fir.ref, !fir.ref) ! CHECK: %[[VAL_20:.*]]:2 = hlfir.declare %[[VAL_16]] {uniq_name = "_QFmultiple_real_reductions_same_typeEx"} : (!fir.ref) -> (!fir.ref, !fir.ref) ! CHECK: %[[VAL_21:.*]]:2 = hlfir.declare %[[VAL_17]] {uniq_name = "_QFmultiple_real_reductions_same_typeEy"} : (!fir.ref) -> (!fir.ref, !fir.ref) ! CHECK: %[[VAL_22:.*]]:2 = hlfir.declare %[[VAL_18]] {uniq_name = "_QFmultiple_real_reductions_same_typeEz"} : (!fir.ref) -> (!fir.ref, !fir.ref) @@ -362,13 +356,12 @@ subroutine multiple_real_reductions_same_type ! CHECK: %[[VAL_13:.*]] = arith.constant 0.000000e+00 : f64 ! CHECK: hlfir.assign %[[VAL_13]] to %[[VAL_3]]#0 : f64, !fir.ref ! CHECK: omp.parallel { -! CHECK: %[[VAL_14:.*]] = fir.alloca i32 {bindc_name = "i", pinned, {{.*}}} -! CHECK: %[[VAL_15:.*]]:2 = hlfir.declare %[[VAL_14]] {uniq_name = "_QFmultiple_reductions_different_typeEi"} : (!fir.ref) -> (!fir.ref, !fir.ref) ! CHECK: %[[VAL_16:.*]] = arith.constant 1 : i32 ! CHECK: %[[VAL_17:.*]] = arith.constant 100 : i32 ! CHECK: %[[VAL_18:.*]] = arith.constant 1 : i32 -! CHECK: omp.wsloop reduction(@add_reduction_i32 %[[VAL_5]]#0 -> %[[VAL_19:.*]], @add_reduction_i64 %[[VAL_7]]#0 -> %[[VAL_20:.*]], @add_reduction_f32 %[[VAL_9]]#0 -> %[[VAL_21:.*]], @add_reduction_f64 %[[VAL_3]]#0 -> %[[VAL_22:.*]] : !fir.ref, !fir.ref, !fir.ref, !fir.ref) { +! 
CHECK: omp.wsloop private(@{{.*}} %{{.*}}#0 -> %[[VAL_14:.*]] : !fir.ref) reduction(@add_reduction_i32 %[[VAL_5]]#0 -> %[[VAL_19:.*]], @add_reduction_i64 %[[VAL_7]]#0 -> %[[VAL_20:.*]], @add_reduction_f32 %[[VAL_9]]#0 -> %[[VAL_21:.*]], @add_reduction_f64 %[[VAL_3]]#0 -> %[[VAL_22:.*]] : !fir.ref, !fir.ref, !fir.ref, !fir.ref) { ! CHECK-NEXT: omp.loop_nest (%[[VAL_23:.*]]) : i32 = (%[[VAL_16]]) to (%[[VAL_17]]) inclusive step (%[[VAL_18]]) { +! CHECK: %[[VAL_15:.*]]:2 = hlfir.declare %[[VAL_14]] {uniq_name = "_QFmultiple_reductions_different_typeEi"} : (!fir.ref) -> (!fir.ref, !fir.ref) ! CHECK: %[[VAL_24:.*]]:2 = hlfir.declare %[[VAL_19]] {uniq_name = "_QFmultiple_reductions_different_typeEx"} : (!fir.ref) -> (!fir.ref, !fir.ref) ! CHECK: %[[VAL_25:.*]]:2 = hlfir.declare %[[VAL_20]] {uniq_name = "_QFmultiple_reductions_different_typeEy"} : (!fir.ref) -> (!fir.ref, !fir.ref) ! CHECK: %[[VAL_26:.*]]:2 = hlfir.declare %[[VAL_21]] {uniq_name = "_QFmultiple_reductions_different_typeEz"} : (!fir.ref) -> (!fir.ref, !fir.ref) diff --git a/flang/test/Lower/OpenMP/wsloop-reduction-allocatable-array-minmax.f90 b/flang/test/Lower/OpenMP/wsloop-reduction-allocatable-array-minmax.f90 index ce45d09d77a22..f09130152fb28 100644 --- a/flang/test/Lower/OpenMP/wsloop-reduction-allocatable-array-minmax.f90 +++ b/flang/test/Lower/OpenMP/wsloop-reduction-allocatable-array-minmax.f90 @@ -214,13 +214,12 @@ program reduce15 ! CHECK: } ! CHECK: fir.store %[[VAL_54:.*]]#1 to %[[VAL_3]]#1 : !fir.ref ! CHECK: omp.parallel { -! CHECK: %[[VAL_55:.*]] = fir.alloca i32 {bindc_name = "i", pinned, uniq_name = "_QFEi"} -! CHECK: %[[VAL_56:.*]]:2 = hlfir.declare %[[VAL_55]] {uniq_name = "_QFEi"} : (!fir.ref) -> (!fir.ref, !fir.ref) ! CHECK: %[[VAL_57:.*]] = arith.constant 1 : i32 ! CHECK: %[[VAL_58:.*]] = arith.constant 10 : i32 ! CHECK: %[[VAL_59:.*]] = arith.constant 1 : i32 -! CHECK: omp.wsloop reduction(byref @max_byref_box_heap_Uxi32 %[[VAL_5]]#0 -> %[[VAL_60:.*]] : !fir.ref>>>) { +! 
CHECK: omp.wsloop private(@{{.*}} %{{.*}}#0 -> %[[VAL_55:.*]] : !fir.ref) reduction(byref @max_byref_box_heap_Uxi32 %[[VAL_5]]#0 -> %[[VAL_60:.*]] : !fir.ref>>>) { ! CHECK: omp.loop_nest (%[[VAL_61:.*]]) : i32 = (%[[VAL_57]]) to (%[[VAL_58]]) inclusive step (%[[VAL_59]]) { +! CHECK: %[[VAL_56:.*]]:2 = hlfir.declare %[[VAL_55]] {uniq_name = "_QFEi"} : (!fir.ref) -> (!fir.ref, !fir.ref) ! CHECK: %[[VAL_62:.*]]:2 = hlfir.declare %[[VAL_60]] {fortran_attrs = {{.*}}, uniq_name = "_QFEmaxes"} : (!fir.ref>>>) -> (!fir.ref>>>, !fir.ref>>>) ! CHECK: fir.store %[[VAL_61]] to %[[VAL_56]]#1 : !fir.ref ! CHECK: %[[VAL_63:.*]] = fir.load %[[VAL_1]]#0 : !fir.ref>>> @@ -256,13 +255,12 @@ program reduce15 ! CHECK: omp.terminator ! CHECK: } ! CHECK: omp.parallel { -! CHECK: %[[VAL_87:.*]] = fir.alloca i32 {bindc_name = "i", pinned, uniq_name = "_QFEi"} -! CHECK: %[[VAL_88:.*]]:2 = hlfir.declare %[[VAL_87]] {uniq_name = "_QFEi"} : (!fir.ref) -> (!fir.ref, !fir.ref) ! CHECK: %[[VAL_89:.*]] = arith.constant 1 : i32 ! CHECK: %[[VAL_90:.*]] = arith.constant 10 : i32 ! CHECK: %[[VAL_91:.*]] = arith.constant 1 : i32 -! CHECK: omp.wsloop reduction(byref @min_byref_box_heap_Uxi32 %[[VAL_7]]#0 -> %[[VAL_92:.*]] : !fir.ref>>>) { +! CHECK: omp.wsloop private(@{{.*}} %{{.*}}#0 -> %[[VAL_87:.*]] : !fir.ref) reduction(byref @min_byref_box_heap_Uxi32 %[[VAL_7]]#0 -> %[[VAL_92:.*]] : !fir.ref>>>) { ! CHECK: omp.loop_nest (%[[VAL_93:.*]]) : i32 = (%[[VAL_89]]) to (%[[VAL_90]]) inclusive step (%[[VAL_91]]) { +! CHECK: %[[VAL_88:.*]]:2 = hlfir.declare %[[VAL_87]] {uniq_name = "_QFEi"} : (!fir.ref) -> (!fir.ref, !fir.ref) ! CHECK: %[[VAL_94:.*]]:2 = hlfir.declare %[[VAL_92]] {fortran_attrs = {{.*}}, uniq_name = "_QFEmins"} : (!fir.ref>>>) -> (!fir.ref>>>, !fir.ref>>>) ! CHECK: fir.store %[[VAL_93]] to %[[VAL_88]]#1 : !fir.ref ! 
CHECK: %[[VAL_95:.*]] = fir.load %[[VAL_1]]#0 : !fir.ref>>> diff --git a/flang/test/Lower/OpenMP/wsloop-reduction-allocatable.f90 b/flang/test/Lower/OpenMP/wsloop-reduction-allocatable.f90 index ba7aea0d96c5b..7e6d7fddff5a1 100644 --- a/flang/test/Lower/OpenMP/wsloop-reduction-allocatable.f90 +++ b/flang/test/Lower/OpenMP/wsloop-reduction-allocatable.f90 @@ -77,13 +77,12 @@ program reduce ! CHECK: %[[VAL_8:.*]] = arith.constant 0 : i32 ! CHECK: hlfir.assign %[[VAL_8]] to %[[VAL_5]]#0 realloc : i32, !fir.ref>> ! CHECK: omp.parallel { -! CHECK: %[[VAL_9:.*]] = fir.alloca i32 {bindc_name = "i", pinned, {{.*}}} -! CHECK: %[[VAL_10:.*]]:2 = hlfir.declare %[[VAL_9]] {uniq_name = "_QFEi"} : (!fir.ref) -> (!fir.ref, !fir.ref) ! CHECK: %[[VAL_11:.*]] = arith.constant 0 : i32 ! CHECK: %[[VAL_12:.*]] = arith.constant 10 : i32 ! CHECK: %[[VAL_13:.*]] = arith.constant 1 : i32 -! CHECK: omp.wsloop reduction(byref @add_reduction_byref_box_heap_i32 %[[VAL_5]]#0 -> %[[VAL_14:.*]] : !fir.ref>>) { +! CHECK: omp.wsloop private(@{{.*}} %{{.*}}#0 -> %[[VAL_9:.*]] : !fir.ref) reduction(byref @add_reduction_byref_box_heap_i32 %[[VAL_5]]#0 -> %[[VAL_14:.*]] : !fir.ref>>) { ! CHECK-NEXT: omp.loop_nest (%[[VAL_15:.*]]) : i32 = (%[[VAL_11]]) to (%[[VAL_12]]) inclusive step (%[[VAL_13]]) { +! CHECK: %[[VAL_10:.*]]:2 = hlfir.declare %[[VAL_9]] {uniq_name = "_QFEi"} : (!fir.ref) -> (!fir.ref, !fir.ref) ! CHECK: %[[VAL_16:.*]]:2 = hlfir.declare %[[VAL_14]] {fortran_attrs = {{.*}}, uniq_name = "_QFEr"} : (!fir.ref>>) -> (!fir.ref>>, !fir.ref>>) ! CHECK: fir.store %[[VAL_15]] to %[[VAL_10]]#1 : !fir.ref ! 
CHECK: %[[VAL_17:.*]] = fir.load %[[VAL_10]]#0 : !fir.ref diff --git a/flang/test/Lower/OpenMP/wsloop-reduction-array-assumed-shape.f90 b/flang/test/Lower/OpenMP/wsloop-reduction-array-assumed-shape.f90 index 9785f77c0e091..0e2fc3a24ee1b 100644 --- a/flang/test/Lower/OpenMP/wsloop-reduction-array-assumed-shape.f90 +++ b/flang/test/Lower/OpenMP/wsloop-reduction-array-assumed-shape.f90 @@ -81,13 +81,12 @@ subroutine reduce(r) ! CHECK: omp.parallel { ! CHECK: %[[VAL_4:.*]] = fir.alloca !fir.box> ! CHECK: fir.store %[[VAL_3]]#1 to %[[VAL_4]] : !fir.ref>> -! CHECK: %[[VAL_5:.*]] = fir.alloca i32 {bindc_name = "i", pinned, {{.*}}} -! CHECK: %[[VAL_6:.*]]:2 = hlfir.declare %[[VAL_5]] {uniq_name = "_QFFreduceEi"} : (!fir.ref) -> (!fir.ref, !fir.ref) ! CHECK: %[[VAL_7:.*]] = arith.constant 0 : i32 ! CHECK: %[[VAL_8:.*]] = arith.constant 10 : i32 ! CHECK: %[[VAL_9:.*]] = arith.constant 1 : i32 -! CHECK: omp.wsloop reduction(byref @add_reduction_byref_box_Uxf64 %[[VAL_4]] -> %[[VAL_10:.*]] : !fir.ref>>) { +! CHECK: omp.wsloop private(@{{.*}} %{{.*}}#0 -> %[[VAL_5:.*]] : !fir.ref) reduction(byref @add_reduction_byref_box_Uxf64 %[[VAL_4]] -> %[[VAL_10:.*]] : !fir.ref>>) { ! CHECK-NEXT: omp.loop_nest (%[[VAL_11:.*]]) : i32 = (%[[VAL_7]]) to (%[[VAL_8]]) inclusive step (%[[VAL_9]]) { +! CHECK: %[[VAL_6:.*]]:2 = hlfir.declare %[[VAL_5]] {uniq_name = "_QFFreduceEi"} : (!fir.ref) -> (!fir.ref, !fir.ref) ! CHECK: %[[VAL_12:.*]]:2 = hlfir.declare %[[VAL_10]] {fortran_attrs = {{.*}}, uniq_name = "_QFFreduceEr"} : (!fir.ref>>) -> (!fir.ref>>, !fir.ref>>) ! CHECK: fir.store %[[VAL_11]] to %[[VAL_6]]#1 : !fir.ref ! CHECK: %[[VAL_13:.*]] = fir.load %[[VAL_6]]#0 : !fir.ref diff --git a/flang/test/Lower/OpenMP/wsloop-reduction-array.f90 b/flang/test/Lower/OpenMP/wsloop-reduction-array.f90 index ea5df5a836972..07debb9f6b9e0 100644 --- a/flang/test/Lower/OpenMP/wsloop-reduction-array.f90 +++ b/flang/test/Lower/OpenMP/wsloop-reduction-array.f90 @@ -76,13 +76,12 @@ program reduce ! 
CHECK: %[[VAL_6:.*]] = fir.embox %[[VAL_5]]#0(%[[VAL_4]]) : (!fir.ref>, !fir.shape<1>) -> !fir.box> ! CHECK: %[[VAL_7:.*]] = fir.alloca !fir.box> ! CHECK: fir.store %[[VAL_6]] to %[[VAL_7]] : !fir.ref>> -! CHECK: %[[VAL_8:.*]] = fir.alloca i32 {bindc_name = "i", pinned, {{.*}}} -! CHECK: %[[VAL_9:.*]]:2 = hlfir.declare %[[VAL_8]] {uniq_name = "_QFEi"} : (!fir.ref) -> (!fir.ref, !fir.ref) ! CHECK: %[[VAL_10:.*]] = arith.constant 0 : i32 ! CHECK: %[[VAL_11:.*]] = arith.constant 10 : i32 ! CHECK: %[[VAL_12:.*]] = arith.constant 1 : i32 -! CHECK: omp.wsloop reduction(byref @add_reduction_byref_box_2xi32 %[[VAL_7]] -> %[[VAL_13:.*]] : !fir.ref>>) { +! CHECK: omp.wsloop private(@{{.*}} %{{.*}}#0 -> %[[VAL_8:.*]] : !fir.ref) reduction(byref @add_reduction_byref_box_2xi32 %[[VAL_7]] -> %[[VAL_13:.*]] : !fir.ref>>) { ! CHECK-NEXT: omp.loop_nest (%[[VAL_14:.*]]) : i32 = (%[[VAL_10]]) to (%[[VAL_11]]) inclusive step (%[[VAL_12]]) { +! CHECK: %[[VAL_9:.*]]:2 = hlfir.declare %[[VAL_8]] {uniq_name = "_QFEi"} : (!fir.ref) -> (!fir.ref, !fir.ref) ! CHECK: %[[VAL_15:.*]]:2 = hlfir.declare %[[VAL_13]] {uniq_name = "_QFEr"} : (!fir.ref>>) -> (!fir.ref>>, !fir.ref>>) ! CHECK: fir.store %[[VAL_14]] to %[[VAL_9]]#1 : !fir.ref ! CHECK: %[[VAL_16:.*]] = fir.load %[[VAL_9]]#0 : !fir.ref diff --git a/flang/test/Lower/OpenMP/wsloop-reduction-array2.f90 b/flang/test/Lower/OpenMP/wsloop-reduction-array2.f90 index 9815cfa9c3150..a25bedb359f4e 100644 --- a/flang/test/Lower/OpenMP/wsloop-reduction-array2.f90 +++ b/flang/test/Lower/OpenMP/wsloop-reduction-array2.f90 @@ -76,13 +76,12 @@ program reduce ! CHECK: %[[VAL_6:.*]] = fir.embox %[[VAL_5]]#0(%[[VAL_4]]) : (!fir.ref>, !fir.shape<1>) -> !fir.box> ! CHECK: %[[VAL_7:.*]] = fir.alloca !fir.box> ! CHECK: fir.store %[[VAL_6]] to %[[VAL_7]] : !fir.ref>> -! CHECK: %[[VAL_8:.*]] = fir.alloca i32 {bindc_name = "i", pinned, {{.*}}} -! CHECK: %[[VAL_9:.*]]:2 = hlfir.declare %[[VAL_8]] {uniq_name = "_QFEi"} : (!fir.ref) -> (!fir.ref, !fir.ref) ! 
CHECK: %[[VAL_10:.*]] = arith.constant 0 : i32 ! CHECK: %[[VAL_11:.*]] = arith.constant 10 : i32 ! CHECK: %[[VAL_12:.*]] = arith.constant 1 : i32 -! CHECK: omp.wsloop reduction(byref @add_reduction_byref_box_2xi32 %[[VAL_7]] -> %[[VAL_13:.*]] : !fir.ref>>) { +! CHECK: omp.wsloop private(@{{.*}} %{{.*}}#0 -> %[[VAL_8:.*]] : !fir.ref) reduction(byref @add_reduction_byref_box_2xi32 %[[VAL_7]] -> %[[VAL_13:.*]] : !fir.ref>>) { ! CHECK-NEXT: omp.loop_nest (%[[VAL_14:.*]]) : i32 = (%[[VAL_10]]) to (%[[VAL_11]]) inclusive step (%[[VAL_12]]) { +! CHECK: %[[VAL_9:.*]]:2 = hlfir.declare %[[VAL_8]] {uniq_name = "_QFEi"} : (!fir.ref) -> (!fir.ref, !fir.ref) ! CHECK: %[[VAL_15:.*]]:2 = hlfir.declare %[[VAL_13]] {uniq_name = "_QFEr"} : (!fir.ref>>) -> (!fir.ref>>, !fir.ref>>) ! CHECK: fir.store %[[VAL_14]] to %[[VAL_9]]#1 : !fir.ref ! CHECK: %[[VAL_16:.*]] = fir.load %[[VAL_15]]#0 : !fir.ref>> diff --git a/flang/test/Lower/OpenMP/wsloop-reduction-iand-byref.f90 b/flang/test/Lower/OpenMP/wsloop-reduction-iand-byref.f90 index 829229807698a..18dcc3d722886 100644 --- a/flang/test/Lower/OpenMP/wsloop-reduction-iand-byref.f90 +++ b/flang/test/Lower/OpenMP/wsloop-reduction-iand-byref.f90 @@ -32,13 +32,12 @@ ! CHECK: %[[VAL_6:.*]] = arith.constant 0 : i32 ! CHECK: hlfir.assign %[[VAL_6]] to %[[VAL_4]]#0 : i32, !fir.ref ! CHECK: omp.parallel { -! CHECK: %[[VAL_7:.*]] = fir.alloca i32 {bindc_name = "i", pinned, {{.*}}} -! CHECK: %[[VAL_8:.*]]:2 = hlfir.declare %[[VAL_7]] {uniq_name = "_QFreduction_iandEi"} : (!fir.ref) -> (!fir.ref, !fir.ref) ! CHECK: %[[VAL_9:.*]] = arith.constant 1 : i32 ! CHECK: %[[VAL_10:.*]] = arith.constant 100 : i32 ! CHECK: %[[VAL_11:.*]] = arith.constant 1 : i32 -! CHECK: omp.wsloop reduction(byref @iand_byref_i32 %[[VAL_4]]#0 -> %[[VAL_12:.*]] : !fir.ref) { +! CHECK: omp.wsloop private(@{{.*}} %{{.*}}#0 -> %[[VAL_7:.*]] : !fir.ref) reduction(byref @iand_byref_i32 %[[VAL_4]]#0 -> %[[VAL_12:.*]] : !fir.ref) { ! 
CHECK-NEXT: omp.loop_nest (%[[VAL_13:.*]]) : i32 = (%[[VAL_9]]) to (%[[VAL_10]]) inclusive step (%[[VAL_11]]) { +! CHECK: %[[VAL_8:.*]]:2 = hlfir.declare %[[VAL_7]] {uniq_name = "_QFreduction_iandEi"} : (!fir.ref) -> (!fir.ref, !fir.ref) ! CHECK: %[[VAL_14:.*]]:2 = hlfir.declare %[[VAL_12]] {uniq_name = "_QFreduction_iandEx"} : (!fir.ref) -> (!fir.ref, !fir.ref) ! CHECK: fir.store %[[VAL_13]] to %[[VAL_8]]#1 : !fir.ref ! CHECK: %[[VAL_15:.*]] = fir.load %[[VAL_8]]#0 : !fir.ref diff --git a/flang/test/Lower/OpenMP/wsloop-reduction-iand.f90 b/flang/test/Lower/OpenMP/wsloop-reduction-iand.f90 index 6c060f2e5292a..eaf07f93c7474 100644 --- a/flang/test/Lower/OpenMP/wsloop-reduction-iand.f90 +++ b/flang/test/Lower/OpenMP/wsloop-reduction-iand.f90 @@ -24,13 +24,12 @@ ! CHECK: %[[VAL_6:.*]] = arith.constant 0 : i32 ! CHECK: hlfir.assign %[[VAL_6]] to %[[VAL_4]]#0 : i32, !fir.ref ! CHECK: omp.parallel { -! CHECK: %[[VAL_7:.*]] = fir.alloca i32 {bindc_name = "i", pinned, {{.*}}} -! CHECK: %[[VAL_8:.*]]:2 = hlfir.declare %[[VAL_7]] {uniq_name = "_QFreduction_iandEi"} : (!fir.ref) -> (!fir.ref, !fir.ref) ! CHECK: %[[VAL_9:.*]] = arith.constant 1 : i32 ! CHECK: %[[VAL_10:.*]] = arith.constant 100 : i32 ! CHECK: %[[VAL_11:.*]] = arith.constant 1 : i32 -! CHECK: omp.wsloop reduction(@iand_i32 %[[VAL_4]]#0 -> %[[VAL_12:.*]] : !fir.ref) { +! CHECK: omp.wsloop private(@{{.*}} %{{.*}}#0 -> %[[VAL_7:.*]] : !fir.ref) reduction(@iand_i32 %[[VAL_4]]#0 -> %[[VAL_12:.*]] : !fir.ref) { ! CHECK-NEXT: omp.loop_nest (%[[VAL_13:.*]]) : i32 = (%[[VAL_9]]) to (%[[VAL_10]]) inclusive step (%[[VAL_11]]) { +! CHECK: %[[VAL_8:.*]]:2 = hlfir.declare %[[VAL_7]] {uniq_name = "_QFreduction_iandEi"} : (!fir.ref) -> (!fir.ref, !fir.ref) ! CHECK: %[[VAL_14:.*]]:2 = hlfir.declare %[[VAL_12]] {uniq_name = "_QFreduction_iandEx"} : (!fir.ref) -> (!fir.ref, !fir.ref) ! CHECK: fir.store %[[VAL_13]] to %[[VAL_8]]#1 : !fir.ref ! 
CHECK: %[[VAL_15:.*]] = fir.load %[[VAL_8]]#0 : !fir.ref diff --git a/flang/test/Lower/OpenMP/wsloop-reduction-ieor-byref.f90 b/flang/test/Lower/OpenMP/wsloop-reduction-ieor-byref.f90 index 284ada404bd60..6be6913f91a33 100644 --- a/flang/test/Lower/OpenMP/wsloop-reduction-ieor-byref.f90 +++ b/flang/test/Lower/OpenMP/wsloop-reduction-ieor-byref.f90 @@ -28,10 +28,9 @@ !CHECK: omp.parallel -!CHECK: %[[I_REF:.*]] = fir.alloca i32 {bindc_name = "i", pinned, {{.*}}} -!CHECK: %[[I_DECL:.*]]:2 = hlfir.declare %[[I_REF]] {uniq_name = "_QFreduction_ieorEi"} : (!fir.ref) -> (!fir.ref, !fir.ref) -!CHECK: omp.wsloop reduction(byref @ieor_byref_i32 %[[X_DECL]]#0 -> %[[PRV:.+]] : !fir.ref) +!CHECK: omp.wsloop private(@{{.*}} %{{.*}}#0 -> %[[I_REF:.*]] : !fir.ref) reduction(byref @ieor_byref_i32 %[[X_DECL]]#0 -> %[[PRV:.+]] : !fir.ref) !CHECK-NEXT: omp.loop_nest +!CHECK: %[[I_DECL:.*]]:2 = hlfir.declare %[[I_REF]] {uniq_name = "_QFreduction_ieorEi"} : (!fir.ref) -> (!fir.ref, !fir.ref) !CHECK: %[[PRV_DECL:.+]]:2 = hlfir.declare %[[PRV]] {{.*}} : (!fir.ref) -> (!fir.ref, !fir.ref) !CHECK: fir.store %{{.*}} to %[[I_DECL]]#1 : !fir.ref !CHECK: %[[I_32:.*]] = fir.load %[[I_DECL]]#0 : !fir.ref diff --git a/flang/test/Lower/OpenMP/wsloop-reduction-ieor.f90 b/flang/test/Lower/OpenMP/wsloop-reduction-ieor.f90 index e67253a413ce2..632dbcf1348ec 100644 --- a/flang/test/Lower/OpenMP/wsloop-reduction-ieor.f90 +++ b/flang/test/Lower/OpenMP/wsloop-reduction-ieor.f90 @@ -17,10 +17,9 @@ !CHECK: omp.parallel -!CHECK: %[[I_REF:.*]] = fir.alloca i32 {bindc_name = "i", pinned, {{.*}}} -!CHECK: %[[I_DECL:.*]]:2 = hlfir.declare %[[I_REF]] {uniq_name = "_QFreduction_ieorEi"} : (!fir.ref) -> (!fir.ref, !fir.ref) -!CHECK: omp.wsloop reduction(@[[IEOR_DECLARE_I]] %[[X_DECL]]#0 -> %[[PRV:.+]] : !fir.ref) +!CHECK: omp.wsloop private(@{{.*}} %{{.*}}#0 -> %[[I_REF:.*]] : !fir.ref) reduction(@[[IEOR_DECLARE_I]] %[[X_DECL]]#0 -> %[[PRV:.+]] : !fir.ref) !CHECK-NEXT: omp.loop_nest +!CHECK: %[[I_DECL:.*]]:2 = 
hlfir.declare %[[I_REF]] {uniq_name = "_QFreduction_ieorEi"} : (!fir.ref) -> (!fir.ref, !fir.ref) !CHECK: %[[PRV_DECL:.+]]:2 = hlfir.declare %[[PRV]] {{.*}} : (!fir.ref) -> (!fir.ref, !fir.ref) !CHECK: fir.store %{{.*}} to %[[I_DECL]]#1 : !fir.ref !CHECK: %[[I_32:.*]] = fir.load %[[I_DECL]]#0 : !fir.ref diff --git a/flang/test/Lower/OpenMP/wsloop-reduction-ior-byref.f90 b/flang/test/Lower/OpenMP/wsloop-reduction-ior-byref.f90 index 315121cc7beb7..90b9d2f61f930 100644 --- a/flang/test/Lower/OpenMP/wsloop-reduction-ior-byref.f90 +++ b/flang/test/Lower/OpenMP/wsloop-reduction-ior-byref.f90 @@ -30,13 +30,12 @@ ! CHECK: %[[VAL_6:.*]] = arith.constant 0 : i32 ! CHECK: hlfir.assign %[[VAL_6]] to %[[VAL_4]]#0 : i32, !fir.ref ! CHECK: omp.parallel -! CHECK: %[[VAL_7:.*]] = fir.alloca i32 {bindc_name = "i", pinned, {{.*}}} -! CHECK: %[[VAL_8:.*]]:2 = hlfir.declare %[[VAL_7]] {uniq_name = "_QFreduction_iorEi"} : (!fir.ref) -> (!fir.ref, !fir.ref) ! CHECK: %[[VAL_9:.*]] = arith.constant 1 : i32 ! CHECK: %[[VAL_10:.*]] = arith.constant 100 : i32 ! CHECK: %[[VAL_11:.*]] = arith.constant 1 : i32 -! CHECK: omp.wsloop reduction(byref @ior_byref_i32 %[[VAL_4]]#0 -> %[[VAL_12:.*]] : !fir.ref) { +! CHECK: omp.wsloop private(@{{.*}} %{{.*}}#0 -> %[[VAL_7:.*]] : !fir.ref) reduction(byref @ior_byref_i32 %[[VAL_4]]#0 -> %[[VAL_12:.*]] : !fir.ref) { ! CHECK-NEXT: omp.loop_nest (%[[VAL_13:.*]]) : i32 = (%[[VAL_9]]) to (%[[VAL_10]]) inclusive step (%[[VAL_11]]) { +! CHECK: %[[VAL_8:.*]]:2 = hlfir.declare %[[VAL_7]] {uniq_name = "_QFreduction_iorEi"} : (!fir.ref) -> (!fir.ref, !fir.ref) ! CHECK: %[[VAL_14:.*]]:2 = hlfir.declare %[[VAL_12]] {uniq_name = "_QFreduction_iorEx"} : (!fir.ref) -> (!fir.ref, !fir.ref) ! CHECK: fir.store %[[VAL_13]] to %[[VAL_8]]#1 : !fir.ref ! 
CHECK: %[[VAL_15:.*]] = fir.load %[[VAL_8]]#0 : !fir.ref diff --git a/flang/test/Lower/OpenMP/wsloop-reduction-ior.f90 b/flang/test/Lower/OpenMP/wsloop-reduction-ior.f90 index 3da250da9703d..144bc17cf8b31 100644 --- a/flang/test/Lower/OpenMP/wsloop-reduction-ior.f90 +++ b/flang/test/Lower/OpenMP/wsloop-reduction-ior.f90 @@ -24,13 +24,12 @@ ! CHECK: %[[VAL_6:.*]] = arith.constant 0 : i32 ! CHECK: hlfir.assign %[[VAL_6]] to %[[VAL_4]]#0 : i32, !fir.ref ! CHECK: omp.parallel -! CHECK: %[[VAL_7:.*]] = fir.alloca i32 {bindc_name = "i", pinned, {{.*}}} -! CHECK: %[[VAL_8:.*]]:2 = hlfir.declare %[[VAL_7]] {uniq_name = "_QFreduction_iorEi"} : (!fir.ref) -> (!fir.ref, !fir.ref) ! CHECK: %[[VAL_9:.*]] = arith.constant 1 : i32 ! CHECK: %[[VAL_10:.*]] = arith.constant 100 : i32 ! CHECK: %[[VAL_11:.*]] = arith.constant 1 : i32 -! CHECK: omp.wsloop reduction(@ior_i32 %[[VAL_4]]#0 -> %[[VAL_12:.*]] : !fir.ref) { +! CHECK: omp.wsloop private(@{{.*}} %{{.*}}#0 -> %[[VAL_7:.*]] : !fir.ref) reduction(@ior_i32 %[[VAL_4]]#0 -> %[[VAL_12:.*]] : !fir.ref) { ! CHECK-NEXT: omp.loop_nest (%[[VAL_13:.*]]) : i32 = (%[[VAL_9]]) to (%[[VAL_10]]) inclusive step (%[[VAL_11]]) { +! CHECK: %[[VAL_8:.*]]:2 = hlfir.declare %[[VAL_7]] {uniq_name = "_QFreduction_iorEi"} : (!fir.ref) -> (!fir.ref, !fir.ref) ! CHECK: %[[VAL_14:.*]]:2 = hlfir.declare %[[VAL_12]] {uniq_name = "_QFreduction_iorEx"} : (!fir.ref) -> (!fir.ref, !fir.ref) ! CHECK: fir.store %[[VAL_13]] to %[[VAL_8]]#1 : !fir.ref ! CHECK: %[[VAL_15:.*]] = fir.load %[[VAL_8]]#0 : !fir.ref diff --git a/flang/test/Lower/OpenMP/wsloop-reduction-logical-and-byref.f90 b/flang/test/Lower/OpenMP/wsloop-reduction-logical-and-byref.f90 index 30908b6bdd4ce..e73540a93a71b 100644 --- a/flang/test/Lower/OpenMP/wsloop-reduction-logical-and-byref.f90 +++ b/flang/test/Lower/OpenMP/wsloop-reduction-logical-and-byref.f90 @@ -39,13 +39,12 @@ ! CHECK: %[[VAL_9:.*]] = fir.convert %[[VAL_8]] : (i1) -> !fir.logical<4> ! 
CHECK: hlfir.assign %[[VAL_9]] to %[[VAL_4]]#0 : !fir.logical<4>, !fir.ref> ! CHECK: omp.parallel { -! CHECK: %[[VAL_10:.*]] = fir.alloca i32 {bindc_name = "i", pinned, {{.*}}} -! CHECK: %[[VAL_11:.*]]:2 = hlfir.declare %[[VAL_10]] {uniq_name = "_QFsimple_reductionEi"} : (!fir.ref) -> (!fir.ref, !fir.ref) ! CHECK: %[[VAL_12:.*]] = arith.constant 1 : i32 ! CHECK: %[[VAL_13:.*]] = arith.constant 100 : i32 ! CHECK: %[[VAL_14:.*]] = arith.constant 1 : i32 -! CHECK: omp.wsloop reduction(byref @and_reduction %[[VAL_4]]#0 -> %[[VAL_15:.*]] : !fir.ref>) { +! CHECK: omp.wsloop private(@{{.*}} %{{.*}}#0 -> %[[VAL_10:.*]] : !fir.ref) reduction(byref @and_reduction %[[VAL_4]]#0 -> %[[VAL_15:.*]] : !fir.ref>) { ! CHECK-NEXT: omp.loop_nest (%[[VAL_16:.*]]) : i32 = (%[[VAL_12]]) to (%[[VAL_13]]) inclusive step (%[[VAL_14]]) { +! CHECK: %[[VAL_11:.*]]:2 = hlfir.declare %[[VAL_10]] {uniq_name = "_QFsimple_reductionEi"} : (!fir.ref) -> (!fir.ref, !fir.ref) ! CHECK: %[[VAL_17:.*]]:2 = hlfir.declare %[[VAL_15]] {uniq_name = "_QFsimple_reductionEx"} : (!fir.ref>) -> (!fir.ref>, !fir.ref>) ! CHECK: fir.store %[[VAL_16]] to %[[VAL_11]]#1 : !fir.ref ! CHECK: %[[VAL_18:.*]] = fir.load %[[VAL_17]]#0 : !fir.ref> @@ -88,13 +87,12 @@ end subroutine simple_reduction ! CHECK: %[[VAL_9:.*]] = fir.convert %[[VAL_8]] : (i1) -> !fir.logical<4> ! CHECK: hlfir.assign %[[VAL_9]] to %[[VAL_4]]#0 : !fir.logical<4>, !fir.ref> ! CHECK: omp.parallel { -! CHECK: %[[VAL_10:.*]] = fir.alloca i32 {bindc_name = "i", pinned, {{.*}}} -! CHECK: %[[VAL_11:.*]]:2 = hlfir.declare %[[VAL_10]] {uniq_name = "_QFsimple_reduction_switch_orderEi"} : (!fir.ref) -> (!fir.ref, !fir.ref) ! CHECK: %[[VAL_12:.*]] = arith.constant 1 : i32 ! CHECK: %[[VAL_13:.*]] = arith.constant 100 : i32 ! CHECK: %[[VAL_14:.*]] = arith.constant 1 : i32 -! CHECK: omp.wsloop reduction(byref @and_reduction %[[VAL_4]]#0 -> %[[VAL_15:.*]] : !fir.ref>) { +! 
CHECK: omp.wsloop private(@{{.*}} %{{.*}}#0 -> %[[VAL_10:.*]] : !fir.ref) reduction(byref @and_reduction %[[VAL_4]]#0 -> %[[VAL_15:.*]] : !fir.ref>) { ! CHECK-NEXT: omp.loop_nest (%[[VAL_16:.*]]) : i32 = (%[[VAL_12]]) to (%[[VAL_13]]) inclusive step (%[[VAL_14]]) { +! CHECK: %[[VAL_11:.*]]:2 = hlfir.declare %[[VAL_10]] {uniq_name = "_QFsimple_reduction_switch_orderEi"} : (!fir.ref) -> (!fir.ref, !fir.ref) ! CHECK: %[[VAL_17:.*]]:2 = hlfir.declare %[[VAL_15]] {uniq_name = "_QFsimple_reduction_switch_orderEx"} : (!fir.ref>) -> (!fir.ref>, !fir.ref>) ! CHECK: fir.store %[[VAL_16]] to %[[VAL_11]]#1 : !fir.ref ! CHECK: %[[VAL_18:.*]] = fir.load %[[VAL_11]]#0 : !fir.ref @@ -146,13 +144,12 @@ subroutine simple_reduction_switch_order(y) ! CHECK: %[[VAL_17:.*]] = fir.convert %[[VAL_16]] : (i1) -> !fir.logical<4> ! CHECK: hlfir.assign %[[VAL_17]] to %[[VAL_11]]#0 : !fir.logical<4>, !fir.ref> ! CHECK: omp.parallel { -! CHECK: %[[VAL_18:.*]] = fir.alloca i32 {bindc_name = "i", pinned, {{.*}}} -! CHECK: %[[VAL_19:.*]]:2 = hlfir.declare %[[VAL_18]] {uniq_name = "_QFmultiple_reductionsEi"} : (!fir.ref) -> (!fir.ref, !fir.ref) ! CHECK: %[[VAL_20:.*]] = arith.constant 1 : i32 ! CHECK: %[[VAL_21:.*]] = arith.constant 100 : i32 ! CHECK: %[[VAL_22:.*]] = arith.constant 1 : i32 -! CHECK: omp.wsloop reduction(byref @and_reduction %[[VAL_7]]#0 -> %[[VAL_23:.*]], byref @and_reduction %[[VAL_9]]#0 -> %[[VAL_24:.*]], byref @and_reduction %[[VAL_11]]#0 -> %[[VAL_25:.*]] : !fir.ref>, !fir.ref>, !fir.ref>) { +! CHECK: omp.wsloop private(@{{.*}} %{{.*}}#0 -> %[[VAL_18:.*]] : !fir.ref) reduction(byref @and_reduction %[[VAL_7]]#0 -> %[[VAL_23:.*]], byref @and_reduction %[[VAL_9]]#0 -> %[[VAL_24:.*]], byref @and_reduction %[[VAL_11]]#0 -> %[[VAL_25:.*]] : !fir.ref>, !fir.ref>, !fir.ref>) { ! CHECK-NEXT: omp.loop_nest (%[[VAL_26:.*]]) : i32 = (%[[VAL_20]]) to (%[[VAL_21]]) inclusive step (%[[VAL_22]]) { +! 
CHECK: %[[VAL_19:.*]]:2 = hlfir.declare %[[VAL_18]] {uniq_name = "_QFmultiple_reductionsEi"} : (!fir.ref) -> (!fir.ref, !fir.ref) ! CHECK: %[[VAL_27:.*]]:2 = hlfir.declare %[[VAL_23]] {uniq_name = "_QFmultiple_reductionsEx"} : (!fir.ref>) -> (!fir.ref>, !fir.ref>) ! CHECK: %[[VAL_28:.*]]:2 = hlfir.declare %[[VAL_24]] {uniq_name = "_QFmultiple_reductionsEy"} : (!fir.ref>) -> (!fir.ref>, !fir.ref>) ! CHECK: %[[VAL_29:.*]]:2 = hlfir.declare %[[VAL_25]] {uniq_name = "_QFmultiple_reductionsEz"} : (!fir.ref>) -> (!fir.ref>, !fir.ref>) diff --git a/flang/test/Lower/OpenMP/wsloop-reduction-logical-and.f90 b/flang/test/Lower/OpenMP/wsloop-reduction-logical-and.f90 index 367683de02080..c059dab5bff5a 100644 --- a/flang/test/Lower/OpenMP/wsloop-reduction-logical-and.f90 +++ b/flang/test/Lower/OpenMP/wsloop-reduction-logical-and.f90 @@ -31,13 +31,12 @@ ! CHECK: %[[VAL_9:.*]] = fir.convert %[[VAL_8]] : (i1) -> !fir.logical<4> ! CHECK: hlfir.assign %[[VAL_9]] to %[[VAL_4]]#0 : !fir.logical<4>, !fir.ref> ! CHECK: omp.parallel { -! CHECK: %[[VAL_10:.*]] = fir.alloca i32 {bindc_name = "i", pinned, {{.*}}} -! CHECK: %[[VAL_11:.*]]:2 = hlfir.declare %[[VAL_10]] {uniq_name = "_QFsimple_reductionEi"} : (!fir.ref) -> (!fir.ref, !fir.ref) ! CHECK: %[[VAL_12:.*]] = arith.constant 1 : i32 ! CHECK: %[[VAL_13:.*]] = arith.constant 100 : i32 ! CHECK: %[[VAL_14:.*]] = arith.constant 1 : i32 -! CHECK: omp.wsloop reduction(@and_reduction %[[VAL_4]]#0 -> %[[VAL_15:.*]] : !fir.ref>) { +! CHECK: omp.wsloop private(@{{.*}} %{{.*}}#0 -> %[[VAL_10:.*]] : !fir.ref) reduction(@and_reduction %[[VAL_4]]#0 -> %[[VAL_15:.*]] : !fir.ref>) { ! CHECK-NEXT: omp.loop_nest (%[[VAL_16:.*]]) : i32 = (%[[VAL_12]]) to (%[[VAL_13]]) inclusive step (%[[VAL_14]]) { +! CHECK: %[[VAL_11:.*]]:2 = hlfir.declare %[[VAL_10]] {uniq_name = "_QFsimple_reductionEi"} : (!fir.ref) -> (!fir.ref, !fir.ref) ! 
CHECK: %[[VAL_17:.*]]:2 = hlfir.declare %[[VAL_15]] {uniq_name = "_QFsimple_reductionEx"} : (!fir.ref>) -> (!fir.ref>, !fir.ref>) ! CHECK: fir.store %[[VAL_16]] to %[[VAL_11]]#1 : !fir.ref ! CHECK: %[[VAL_18:.*]] = fir.load %[[VAL_17]]#0 : !fir.ref> @@ -80,13 +79,12 @@ end subroutine simple_reduction ! CHECK: %[[VAL_9:.*]] = fir.convert %[[VAL_8]] : (i1) -> !fir.logical<4> ! CHECK: hlfir.assign %[[VAL_9]] to %[[VAL_4]]#0 : !fir.logical<4>, !fir.ref> ! CHECK: omp.parallel { -! CHECK: %[[VAL_10:.*]] = fir.alloca i32 {bindc_name = "i", pinned, {{.*}}} -! CHECK: %[[VAL_11:.*]]:2 = hlfir.declare %[[VAL_10]] {uniq_name = "_QFsimple_reduction_switch_orderEi"} : (!fir.ref) -> (!fir.ref, !fir.ref) ! CHECK: %[[VAL_12:.*]] = arith.constant 1 : i32 ! CHECK: %[[VAL_13:.*]] = arith.constant 100 : i32 ! CHECK: %[[VAL_14:.*]] = arith.constant 1 : i32 -! CHECK: omp.wsloop reduction(@and_reduction %[[VAL_4]]#0 -> %[[VAL_15:.*]] : !fir.ref>) { +! CHECK: omp.wsloop private(@{{.*}} %{{.*}}#0 -> %[[VAL_10:.*]] : !fir.ref) reduction(@and_reduction %[[VAL_4]]#0 -> %[[VAL_15:.*]] : !fir.ref>) { ! CHECK-NEXT: omp.loop_nest (%[[VAL_16:.*]]) : i32 = (%[[VAL_12]]) to (%[[VAL_13]]) inclusive step (%[[VAL_14]]) { +! CHECK: %[[VAL_11:.*]]:2 = hlfir.declare %[[VAL_10]] {uniq_name = "_QFsimple_reduction_switch_orderEi"} : (!fir.ref) -> (!fir.ref, !fir.ref) ! CHECK: %[[VAL_17:.*]]:2 = hlfir.declare %[[VAL_15]] {uniq_name = "_QFsimple_reduction_switch_orderEx"} : (!fir.ref>) -> (!fir.ref>, !fir.ref>) ! CHECK: fir.store %[[VAL_16]] to %[[VAL_11]]#1 : !fir.ref ! CHECK: %[[VAL_18:.*]] = fir.load %[[VAL_11]]#0 : !fir.ref @@ -138,13 +136,12 @@ subroutine simple_reduction_switch_order(y) ! CHECK: %[[VAL_17:.*]] = fir.convert %[[VAL_16]] : (i1) -> !fir.logical<4> ! CHECK: hlfir.assign %[[VAL_17]] to %[[VAL_11]]#0 : !fir.logical<4>, !fir.ref> ! CHECK: omp.parallel { -! CHECK: %[[VAL_18:.*]] = fir.alloca i32 {bindc_name = "i", pinned, {{.*}}} -! 
CHECK: %[[VAL_19:.*]]:2 = hlfir.declare %[[VAL_18]] {uniq_name = "_QFmultiple_reductionsEi"} : (!fir.ref) -> (!fir.ref, !fir.ref) ! CHECK: %[[VAL_20:.*]] = arith.constant 1 : i32 ! CHECK: %[[VAL_21:.*]] = arith.constant 100 : i32 ! CHECK: %[[VAL_22:.*]] = arith.constant 1 : i32 -! CHECK: omp.wsloop reduction(@and_reduction %[[VAL_7]]#0 -> %[[VAL_23:.*]], @and_reduction %[[VAL_9]]#0 -> %[[VAL_24:.*]], @and_reduction %[[VAL_11]]#0 -> %[[VAL_25:.*]] : !fir.ref>, !fir.ref>, !fir.ref>) { +! CHECK: omp.wsloop private(@{{.*}} %{{.*}}#0 -> %[[VAL_18:.*]] : !fir.ref) reduction(@and_reduction %[[VAL_7]]#0 -> %[[VAL_23:.*]], @and_reduction %[[VAL_9]]#0 -> %[[VAL_24:.*]], @and_reduction %[[VAL_11]]#0 -> %[[VAL_25:.*]] : !fir.ref>, !fir.ref>, !fir.ref>) { ! CHECK-NEXT: omp.loop_nest (%[[VAL_26:.*]]) : i32 = (%[[VAL_20]]) to (%[[VAL_21]]) inclusive step (%[[VAL_22]]) { +! CHECK: %[[VAL_19:.*]]:2 = hlfir.declare %[[VAL_18]] {uniq_name = "_QFmultiple_reductionsEi"} : (!fir.ref) -> (!fir.ref, !fir.ref) ! CHECK: %[[VAL_27:.*]]:2 = hlfir.declare %[[VAL_23]] {uniq_name = "_QFmultiple_reductionsEx"} : (!fir.ref>) -> (!fir.ref>, !fir.ref>) ! CHECK: %[[VAL_28:.*]]:2 = hlfir.declare %[[VAL_24]] {uniq_name = "_QFmultiple_reductionsEy"} : (!fir.ref>) -> (!fir.ref>, !fir.ref>) ! CHECK: %[[VAL_29:.*]]:2 = hlfir.declare %[[VAL_25]] {uniq_name = "_QFmultiple_reductionsEz"} : (!fir.ref>) -> (!fir.ref>, !fir.ref>) diff --git a/flang/test/Lower/OpenMP/wsloop-reduction-logical-eqv-byref.f90 b/flang/test/Lower/OpenMP/wsloop-reduction-logical-eqv-byref.f90 index 9137dd8ff4454..5e24ad6f7bb63 100644 --- a/flang/test/Lower/OpenMP/wsloop-reduction-logical-eqv-byref.f90 +++ b/flang/test/Lower/OpenMP/wsloop-reduction-logical-eqv-byref.f90 @@ -39,13 +39,12 @@ ! CHECK: %[[VAL_9:.*]] = fir.convert %[[VAL_8]] : (i1) -> !fir.logical<4> ! CHECK: hlfir.assign %[[VAL_9]] to %[[VAL_4]]#0 : !fir.logical<4>, !fir.ref> ! CHECK: omp.parallel { -! 
CHECK: %[[VAL_10:.*]] = fir.alloca i32 {bindc_name = "i", pinned, {{.*}}} -! CHECK: %[[VAL_11:.*]]:2 = hlfir.declare %[[VAL_10]] {uniq_name = "_QFsimple_reductionEi"} : (!fir.ref) -> (!fir.ref, !fir.ref) ! CHECK: %[[VAL_12:.*]] = arith.constant 1 : i32 ! CHECK: %[[VAL_13:.*]] = arith.constant 100 : i32 ! CHECK: %[[VAL_14:.*]] = arith.constant 1 : i32 -! CHECK: omp.wsloop reduction(byref @eqv_reduction %[[VAL_4]]#0 -> %[[VAL_15:.*]] : !fir.ref>) { +! CHECK: omp.wsloop private(@{{.*}} %{{.*}}#0 -> %[[VAL_10:.*]] : !fir.ref) reduction(byref @eqv_reduction %[[VAL_4]]#0 -> %[[VAL_15:.*]] : !fir.ref>) { ! CHECK-NEXT: omp.loop_nest (%[[VAL_16:.*]]) : i32 = (%[[VAL_12]]) to (%[[VAL_13]]) inclusive step (%[[VAL_14]]) { +! CHECK: %[[VAL_11:.*]]:2 = hlfir.declare %[[VAL_10]] {uniq_name = "_QFsimple_reductionEi"} : (!fir.ref) -> (!fir.ref, !fir.ref) ! CHECK: %[[VAL_17:.*]]:2 = hlfir.declare %[[VAL_15]] {uniq_name = "_QFsimple_reductionEx"} : (!fir.ref>) -> (!fir.ref>, !fir.ref>) ! CHECK: fir.store %[[VAL_16]] to %[[VAL_11]]#1 : !fir.ref ! CHECK: %[[VAL_18:.*]] = fir.load %[[VAL_17]]#0 : !fir.ref> @@ -87,13 +86,12 @@ subroutine simple_reduction(y) ! CHECK: %[[VAL_9:.*]] = fir.convert %[[VAL_8]] : (i1) -> !fir.logical<4> ! CHECK: hlfir.assign %[[VAL_9]] to %[[VAL_4]]#0 : !fir.logical<4>, !fir.ref> ! CHECK: omp.parallel { -! CHECK: %[[VAL_10:.*]] = fir.alloca i32 {bindc_name = "i", pinned, {{.*}}} -! CHECK: %[[VAL_11:.*]]:2 = hlfir.declare %[[VAL_10]] {uniq_name = "_QFsimple_reduction_switch_orderEi"} : (!fir.ref) -> (!fir.ref, !fir.ref) ! CHECK: %[[VAL_12:.*]] = arith.constant 1 : i32 ! CHECK: %[[VAL_13:.*]] = arith.constant 100 : i32 ! CHECK: %[[VAL_14:.*]] = arith.constant 1 : i32 -! CHECK: omp.wsloop reduction(byref @eqv_reduction %[[VAL_4]]#0 -> %[[VAL_15:.*]] : !fir.ref>) { +! CHECK: omp.wsloop private(@{{.*}} %{{.*}}#0 -> %[[VAL_10:.*]] : !fir.ref) reduction(byref @eqv_reduction %[[VAL_4]]#0 -> %[[VAL_15:.*]] : !fir.ref>) { ! 
CHECK-NEXT: omp.loop_nest (%[[VAL_16:.*]]) : i32 = (%[[VAL_12]]) to (%[[VAL_13]]) inclusive step (%[[VAL_14]]) { +! CHECK: %[[VAL_11:.*]]:2 = hlfir.declare %[[VAL_10]] {uniq_name = "_QFsimple_reduction_switch_orderEi"} : (!fir.ref) -> (!fir.ref, !fir.ref) ! CHECK: %[[VAL_17:.*]]:2 = hlfir.declare %[[VAL_15]] {uniq_name = "_QFsimple_reduction_switch_orderEx"} : (!fir.ref>) -> (!fir.ref>, !fir.ref>) ! CHECK: fir.store %[[VAL_16]] to %[[VAL_11]]#1 : !fir.ref ! CHECK: %[[VAL_18:.*]] = fir.load %[[VAL_11]]#0 : !fir.ref @@ -145,13 +143,12 @@ subroutine simple_reduction_switch_order(y) ! CHECK: %[[VAL_17:.*]] = fir.convert %[[VAL_16]] : (i1) -> !fir.logical<4> ! CHECK: hlfir.assign %[[VAL_17]] to %[[VAL_11]]#0 : !fir.logical<4>, !fir.ref> ! CHECK: omp.parallel { -! CHECK: %[[VAL_18:.*]] = fir.alloca i32 {bindc_name = "i", pinned, {{.*}}} -! CHECK: %[[VAL_19:.*]]:2 = hlfir.declare %[[VAL_18]] {uniq_name = "_QFmultiple_reductionsEi"} : (!fir.ref) -> (!fir.ref, !fir.ref) ! CHECK: %[[VAL_20:.*]] = arith.constant 1 : i32 ! CHECK: %[[VAL_21:.*]] = arith.constant 100 : i32 ! CHECK: %[[VAL_22:.*]] = arith.constant 1 : i32 -! CHECK: omp.wsloop reduction(byref @eqv_reduction %[[VAL_7]]#0 -> %[[VAL_23:.*]], byref @eqv_reduction %[[VAL_9]]#0 -> %[[VAL_24:.*]], byref @eqv_reduction %[[VAL_11]]#0 -> %[[VAL_25:.*]] : !fir.ref>, !fir.ref>, !fir.ref>) { +! CHECK: omp.wsloop private(@{{.*}} %{{.*}}#0 -> %[[VAL_18:.*]] : !fir.ref) reduction(byref @eqv_reduction %[[VAL_7]]#0 -> %[[VAL_23:.*]], byref @eqv_reduction %[[VAL_9]]#0 -> %[[VAL_24:.*]], byref @eqv_reduction %[[VAL_11]]#0 -> %[[VAL_25:.*]] : !fir.ref>, !fir.ref>, !fir.ref>) { ! CHECK-NEXT: omp.loop_nest (%[[VAL_26:.*]]) : i32 = (%[[VAL_20]]) to (%[[VAL_21]]) inclusive step (%[[VAL_22]]) { +! CHECK: %[[VAL_19:.*]]:2 = hlfir.declare %[[VAL_18]] {uniq_name = "_QFmultiple_reductionsEi"} : (!fir.ref) -> (!fir.ref, !fir.ref) ! 
CHECK: %[[VAL_27:.*]]:2 = hlfir.declare %[[VAL_23]] {uniq_name = "_QFmultiple_reductionsEx"} : (!fir.ref>) -> (!fir.ref>, !fir.ref>) ! CHECK: %[[VAL_28:.*]]:2 = hlfir.declare %[[VAL_24]] {uniq_name = "_QFmultiple_reductionsEy"} : (!fir.ref>) -> (!fir.ref>, !fir.ref>) ! CHECK: %[[VAL_29:.*]]:2 = hlfir.declare %[[VAL_25]] {uniq_name = "_QFmultiple_reductionsEz"} : (!fir.ref>) -> (!fir.ref>, !fir.ref>) diff --git a/flang/test/Lower/OpenMP/wsloop-reduction-logical-eqv.f90 b/flang/test/Lower/OpenMP/wsloop-reduction-logical-eqv.f90 index d1ef676c37407..ad9e869984eac 100644 --- a/flang/test/Lower/OpenMP/wsloop-reduction-logical-eqv.f90 +++ b/flang/test/Lower/OpenMP/wsloop-reduction-logical-eqv.f90 @@ -31,13 +31,12 @@ ! CHECK: %[[VAL_9:.*]] = fir.convert %[[VAL_8]] : (i1) -> !fir.logical<4> ! CHECK: hlfir.assign %[[VAL_9]] to %[[VAL_4]]#0 : !fir.logical<4>, !fir.ref> ! CHECK: omp.parallel { -! CHECK: %[[VAL_10:.*]] = fir.alloca i32 {bindc_name = "i", pinned, {{.*}}} -! CHECK: %[[VAL_11:.*]]:2 = hlfir.declare %[[VAL_10]] {uniq_name = "_QFsimple_reductionEi"} : (!fir.ref) -> (!fir.ref, !fir.ref) ! CHECK: %[[VAL_12:.*]] = arith.constant 1 : i32 ! CHECK: %[[VAL_13:.*]] = arith.constant 100 : i32 ! CHECK: %[[VAL_14:.*]] = arith.constant 1 : i32 -! CHECK: omp.wsloop reduction(@eqv_reduction %[[VAL_4]]#0 -> %[[VAL_15:.*]] : !fir.ref>) { +! CHECK: omp.wsloop private(@{{.*}} %{{.*}}#0 -> %[[VAL_10:.*]] : !fir.ref) reduction(@eqv_reduction %[[VAL_4]]#0 -> %[[VAL_15:.*]] : !fir.ref>) { ! CHECK-NEXT: omp.loop_nest (%[[VAL_16:.*]]) : i32 = (%[[VAL_12]]) to (%[[VAL_13]]) inclusive step (%[[VAL_14]]) { +! CHECK: %[[VAL_11:.*]]:2 = hlfir.declare %[[VAL_10]] {uniq_name = "_QFsimple_reductionEi"} : (!fir.ref) -> (!fir.ref, !fir.ref) ! CHECK: %[[VAL_17:.*]]:2 = hlfir.declare %[[VAL_15]] {uniq_name = "_QFsimple_reductionEx"} : (!fir.ref>) -> (!fir.ref>, !fir.ref>) ! CHECK: fir.store %[[VAL_16]] to %[[VAL_11]]#1 : !fir.ref ! 
CHECK: %[[VAL_18:.*]] = fir.load %[[VAL_17]]#0 : !fir.ref> @@ -79,13 +78,12 @@ subroutine simple_reduction(y) ! CHECK: %[[VAL_9:.*]] = fir.convert %[[VAL_8]] : (i1) -> !fir.logical<4> ! CHECK: hlfir.assign %[[VAL_9]] to %[[VAL_4]]#0 : !fir.logical<4>, !fir.ref> ! CHECK: omp.parallel { -! CHECK: %[[VAL_10:.*]] = fir.alloca i32 {bindc_name = "i", pinned, {{.*}}} -! CHECK: %[[VAL_11:.*]]:2 = hlfir.declare %[[VAL_10]] {uniq_name = "_QFsimple_reduction_switch_orderEi"} : (!fir.ref) -> (!fir.ref, !fir.ref) ! CHECK: %[[VAL_12:.*]] = arith.constant 1 : i32 ! CHECK: %[[VAL_13:.*]] = arith.constant 100 : i32 ! CHECK: %[[VAL_14:.*]] = arith.constant 1 : i32 -! CHECK: omp.wsloop reduction(@eqv_reduction %[[VAL_4]]#0 -> %[[VAL_15:.*]] : !fir.ref>) { +! CHECK: omp.wsloop private(@{{.*}} %{{.*}}#0 -> %[[VAL_10:.*]] : !fir.ref) reduction(@eqv_reduction %[[VAL_4]]#0 -> %[[VAL_15:.*]] : !fir.ref>) { ! CHECK-NEXT: omp.loop_nest (%[[VAL_16:.*]]) : i32 = (%[[VAL_12]]) to (%[[VAL_13]]) inclusive step (%[[VAL_14]]) { +! CHECK: %[[VAL_11:.*]]:2 = hlfir.declare %[[VAL_10]] {uniq_name = "_QFsimple_reduction_switch_orderEi"} : (!fir.ref) -> (!fir.ref, !fir.ref) ! CHECK: %[[VAL_17:.*]]:2 = hlfir.declare %[[VAL_15]] {uniq_name = "_QFsimple_reduction_switch_orderEx"} : (!fir.ref>) -> (!fir.ref>, !fir.ref>) ! CHECK: fir.store %[[VAL_16]] to %[[VAL_11]]#1 : !fir.ref ! CHECK: %[[VAL_18:.*]] = fir.load %[[VAL_11]]#0 : !fir.ref @@ -137,13 +135,12 @@ subroutine simple_reduction_switch_order(y) ! CHECK: %[[VAL_17:.*]] = fir.convert %[[VAL_16]] : (i1) -> !fir.logical<4> ! CHECK: hlfir.assign %[[VAL_17]] to %[[VAL_11]]#0 : !fir.logical<4>, !fir.ref> ! CHECK: omp.parallel { -! CHECK: %[[VAL_18:.*]] = fir.alloca i32 {bindc_name = "i", pinned, {{.*}}} -! CHECK: %[[VAL_19:.*]]:2 = hlfir.declare %[[VAL_18]] {uniq_name = "_QFmultiple_reductionsEi"} : (!fir.ref) -> (!fir.ref, !fir.ref) ! CHECK: %[[VAL_20:.*]] = arith.constant 1 : i32 ! CHECK: %[[VAL_21:.*]] = arith.constant 100 : i32 ! 
CHECK: %[[VAL_22:.*]] = arith.constant 1 : i32 -! CHECK: omp.wsloop reduction(@eqv_reduction %[[VAL_7]]#0 -> %[[VAL_23:.*]], @eqv_reduction %[[VAL_9]]#0 -> %[[VAL_24:.*]], @eqv_reduction %[[VAL_11]]#0 -> %[[VAL_25:.*]] : !fir.ref>, !fir.ref>, !fir.ref>) { +! CHECK: omp.wsloop private(@{{.*}} %{{.*}}#0 -> %[[VAL_18:.*]] : !fir.ref) reduction(@eqv_reduction %[[VAL_7]]#0 -> %[[VAL_23:.*]], @eqv_reduction %[[VAL_9]]#0 -> %[[VAL_24:.*]], @eqv_reduction %[[VAL_11]]#0 -> %[[VAL_25:.*]] : !fir.ref>, !fir.ref>, !fir.ref>) { ! CHECK-NEXT: omp.loop_nest (%[[VAL_26:.*]]) : i32 = (%[[VAL_20]]) to (%[[VAL_21]]) inclusive step (%[[VAL_22]]) { +! CHECK: %[[VAL_19:.*]]:2 = hlfir.declare %[[VAL_18]] {uniq_name = "_QFmultiple_reductionsEi"} : (!fir.ref) -> (!fir.ref, !fir.ref) ! CHECK: %[[VAL_27:.*]]:2 = hlfir.declare %[[VAL_23]] {uniq_name = "_QFmultiple_reductionsEx"} : (!fir.ref>) -> (!fir.ref>, !fir.ref>) ! CHECK: %[[VAL_28:.*]]:2 = hlfir.declare %[[VAL_24]] {uniq_name = "_QFmultiple_reductionsEy"} : (!fir.ref>) -> (!fir.ref>, !fir.ref>) ! CHECK: %[[VAL_29:.*]]:2 = hlfir.declare %[[VAL_25]] {uniq_name = "_QFmultiple_reductionsEz"} : (!fir.ref>) -> (!fir.ref>, !fir.ref>) diff --git a/flang/test/Lower/OpenMP/wsloop-reduction-logical-neqv-byref.f90 b/flang/test/Lower/OpenMP/wsloop-reduction-logical-neqv-byref.f90 index d1491a0f5561d..b5bf1d0d0b589 100644 --- a/flang/test/Lower/OpenMP/wsloop-reduction-logical-neqv-byref.f90 +++ b/flang/test/Lower/OpenMP/wsloop-reduction-logical-neqv-byref.f90 @@ -39,13 +39,12 @@ ! CHECK: %[[VAL_9:.*]] = fir.convert %[[VAL_8]] : (i1) -> !fir.logical<4> ! CHECK: hlfir.assign %[[VAL_9]] to %[[VAL_4]]#0 : !fir.logical<4>, !fir.ref> ! CHECK: omp.parallel { -! CHECK: %[[VAL_10:.*]] = fir.alloca i32 {bindc_name = "i", pinned, {{.*}}} -! CHECK: %[[VAL_11:.*]]:2 = hlfir.declare %[[VAL_10]] {uniq_name = "_QFsimple_reductionEi"} : (!fir.ref) -> (!fir.ref, !fir.ref) ! CHECK: %[[VAL_12:.*]] = arith.constant 1 : i32 ! 
CHECK: %[[VAL_13:.*]] = arith.constant 100 : i32 ! CHECK: %[[VAL_14:.*]] = arith.constant 1 : i32 -! CHECK: omp.wsloop reduction(byref @neqv_reduction %[[VAL_4]]#0 -> %[[VAL_15:.*]] : !fir.ref>) { +! CHECK: omp.wsloop private(@{{.*}} %{{.*}}#0 -> %[[VAL_10:.*]] : !fir.ref) reduction(byref @neqv_reduction %[[VAL_4]]#0 -> %[[VAL_15:.*]] : !fir.ref>) { ! CHECK-NEXT: omp.loop_nest (%[[VAL_16:.*]]) : i32 = (%[[VAL_12]]) to (%[[VAL_13]]) inclusive step (%[[VAL_14]]) { +! CHECK: %[[VAL_11:.*]]:2 = hlfir.declare %[[VAL_10]] {uniq_name = "_QFsimple_reductionEi"} : (!fir.ref) -> (!fir.ref, !fir.ref) ! CHECK: %[[VAL_17:.*]]:2 = hlfir.declare %[[VAL_15]] {uniq_name = "_QFsimple_reductionEx"} : (!fir.ref>) -> (!fir.ref>, !fir.ref>) ! CHECK: fir.store %[[VAL_16]] to %[[VAL_11]]#1 : !fir.ref ! CHECK: %[[VAL_18:.*]] = fir.load %[[VAL_17]]#0 : !fir.ref> @@ -88,13 +87,12 @@ subroutine simple_reduction(y) ! CHECK: %[[VAL_9:.*]] = fir.convert %[[VAL_8]] : (i1) -> !fir.logical<4> ! CHECK: hlfir.assign %[[VAL_9]] to %[[VAL_4]]#0 : !fir.logical<4>, !fir.ref> ! CHECK: omp.parallel { -! CHECK: %[[VAL_10:.*]] = fir.alloca i32 {bindc_name = "i", pinned, {{.*}}} -! CHECK: %[[VAL_11:.*]]:2 = hlfir.declare %[[VAL_10]] {uniq_name = "_QFsimple_reduction_switch_orderEi"} : (!fir.ref) -> (!fir.ref, !fir.ref) ! CHECK: %[[VAL_12:.*]] = arith.constant 1 : i32 ! CHECK: %[[VAL_13:.*]] = arith.constant 100 : i32 ! CHECK: %[[VAL_14:.*]] = arith.constant 1 : i32 -! CHECK: omp.wsloop reduction(byref @neqv_reduction %[[VAL_4]]#0 -> %[[VAL_15:.*]] : !fir.ref>) { +! CHECK: omp.wsloop private(@{{.*}} %{{.*}}#0 -> %[[VAL_10:.*]] : !fir.ref) reduction(byref @neqv_reduction %[[VAL_4]]#0 -> %[[VAL_15:.*]] : !fir.ref>) { ! CHECK-NEXT: omp.loop_nest (%[[VAL_16:.*]]) : i32 = (%[[VAL_12]]) to (%[[VAL_13]]) inclusive step (%[[VAL_14]]) { +! CHECK: %[[VAL_11:.*]]:2 = hlfir.declare %[[VAL_10]] {uniq_name = "_QFsimple_reduction_switch_orderEi"} : (!fir.ref) -> (!fir.ref, !fir.ref) ! 
CHECK: %[[VAL_17:.*]]:2 = hlfir.declare %[[VAL_15]] {uniq_name = "_QFsimple_reduction_switch_orderEx"} : (!fir.ref>) -> (!fir.ref>, !fir.ref>) ! CHECK: fir.store %[[VAL_16]] to %[[VAL_11]]#1 : !fir.ref ! CHECK: %[[VAL_18:.*]] = fir.load %[[VAL_11]]#0 : !fir.ref @@ -148,13 +146,12 @@ subroutine simple_reduction_switch_order(y) ! CHECK: %[[VAL_17:.*]] = fir.convert %[[VAL_16]] : (i1) -> !fir.logical<4> ! CHECK: hlfir.assign %[[VAL_17]] to %[[VAL_11]]#0 : !fir.logical<4>, !fir.ref> ! CHECK: omp.parallel { -! CHECK: %[[VAL_18:.*]] = fir.alloca i32 {bindc_name = "i", pinned, {{.*}}} -! CHECK: %[[VAL_19:.*]]:2 = hlfir.declare %[[VAL_18]] {uniq_name = "_QFmultiple_reductionsEi"} : (!fir.ref) -> (!fir.ref, !fir.ref) ! CHECK: %[[VAL_20:.*]] = arith.constant 1 : i32 ! CHECK: %[[VAL_21:.*]] = arith.constant 100 : i32 ! CHECK: %[[VAL_22:.*]] = arith.constant 1 : i32 -! CHECK: omp.wsloop reduction(byref @neqv_reduction %[[VAL_7]]#0 -> %[[VAL_23:.*]], byref @neqv_reduction %[[VAL_9]]#0 -> %[[VAL_24:.*]], byref @neqv_reduction %[[VAL_11]]#0 -> %[[VAL_25:.*]] : !fir.ref>, !fir.ref>, !fir.ref>) { +! CHECK: omp.wsloop private(@{{.*}} %{{.*}}#0 -> %[[VAL_18:.*]] : !fir.ref) reduction(byref @neqv_reduction %[[VAL_7]]#0 -> %[[VAL_23:.*]], byref @neqv_reduction %[[VAL_9]]#0 -> %[[VAL_24:.*]], byref @neqv_reduction %[[VAL_11]]#0 -> %[[VAL_25:.*]] : !fir.ref>, !fir.ref>, !fir.ref>) { ! CHECK-NEXT: omp.loop_nest (%[[VAL_26:.*]]) : i32 = (%[[VAL_20]]) to (%[[VAL_21]]) inclusive step (%[[VAL_22]]) { +! CHECK: %[[VAL_19:.*]]:2 = hlfir.declare %[[VAL_18]] {uniq_name = "_QFmultiple_reductionsEi"} : (!fir.ref) -> (!fir.ref, !fir.ref) ! CHECK: %[[VAL_27:.*]]:2 = hlfir.declare %[[VAL_23]] {uniq_name = "_QFmultiple_reductionsEx"} : (!fir.ref>) -> (!fir.ref>, !fir.ref>) ! CHECK: %[[VAL_28:.*]]:2 = hlfir.declare %[[VAL_24]] {uniq_name = "_QFmultiple_reductionsEy"} : (!fir.ref>) -> (!fir.ref>, !fir.ref>) ! 
CHECK: %[[VAL_29:.*]]:2 = hlfir.declare %[[VAL_25]] {uniq_name = "_QFmultiple_reductionsEz"} : (!fir.ref>) -> (!fir.ref>, !fir.ref>) diff --git a/flang/test/Lower/OpenMP/wsloop-reduction-logical-neqv.f90 b/flang/test/Lower/OpenMP/wsloop-reduction-logical-neqv.f90 index b4df699c49ffa..ac9fc7f051d88 100644 --- a/flang/test/Lower/OpenMP/wsloop-reduction-logical-neqv.f90 +++ b/flang/test/Lower/OpenMP/wsloop-reduction-logical-neqv.f90 @@ -31,13 +31,12 @@ ! CHECK: %[[VAL_9:.*]] = fir.convert %[[VAL_8]] : (i1) -> !fir.logical<4> ! CHECK: hlfir.assign %[[VAL_9]] to %[[VAL_4]]#0 : !fir.logical<4>, !fir.ref> ! CHECK: omp.parallel { -! CHECK: %[[VAL_10:.*]] = fir.alloca i32 {bindc_name = "i", pinned, {{.*}}} -! CHECK: %[[VAL_11:.*]]:2 = hlfir.declare %[[VAL_10]] {uniq_name = "_QFsimple_reductionEi"} : (!fir.ref) -> (!fir.ref, !fir.ref) ! CHECK: %[[VAL_12:.*]] = arith.constant 1 : i32 ! CHECK: %[[VAL_13:.*]] = arith.constant 100 : i32 ! CHECK: %[[VAL_14:.*]] = arith.constant 1 : i32 -! CHECK: omp.wsloop reduction(@neqv_reduction %[[VAL_4]]#0 -> %[[VAL_15:.*]] : !fir.ref>) { +! CHECK: omp.wsloop private(@{{.*}} %{{.*}}#0 -> %[[VAL_10:.*]] : !fir.ref) reduction(@neqv_reduction %[[VAL_4]]#0 -> %[[VAL_15:.*]] : !fir.ref>) { ! CHECK-NEXT: omp.loop_nest (%[[VAL_16:.*]]) : i32 = (%[[VAL_12]]) to (%[[VAL_13]]) inclusive step (%[[VAL_14]]) { +! CHECK: %[[VAL_11:.*]]:2 = hlfir.declare %[[VAL_10]] {uniq_name = "_QFsimple_reductionEi"} : (!fir.ref) -> (!fir.ref, !fir.ref) ! CHECK: %[[VAL_17:.*]]:2 = hlfir.declare %[[VAL_15]] {uniq_name = "_QFsimple_reductionEx"} : (!fir.ref>) -> (!fir.ref>, !fir.ref>) ! CHECK: fir.store %[[VAL_16]] to %[[VAL_11]]#1 : !fir.ref ! CHECK: %[[VAL_18:.*]] = fir.load %[[VAL_17]]#0 : !fir.ref> @@ -80,13 +79,12 @@ subroutine simple_reduction(y) ! CHECK: %[[VAL_9:.*]] = fir.convert %[[VAL_8]] : (i1) -> !fir.logical<4> ! CHECK: hlfir.assign %[[VAL_9]] to %[[VAL_4]]#0 : !fir.logical<4>, !fir.ref> ! CHECK: omp.parallel { -! 
CHECK: %[[VAL_10:.*]] = fir.alloca i32 {bindc_name = "i", pinned, {{.*}}} -! CHECK: %[[VAL_11:.*]]:2 = hlfir.declare %[[VAL_10]] {uniq_name = "_QFsimple_reduction_switch_orderEi"} : (!fir.ref) -> (!fir.ref, !fir.ref) ! CHECK: %[[VAL_12:.*]] = arith.constant 1 : i32 ! CHECK: %[[VAL_13:.*]] = arith.constant 100 : i32 ! CHECK: %[[VAL_14:.*]] = arith.constant 1 : i32 -! CHECK: omp.wsloop reduction(@neqv_reduction %[[VAL_4]]#0 -> %[[VAL_15:.*]] : !fir.ref>) { +! CHECK: omp.wsloop private(@{{.*}} %{{.*}}#0 -> %[[VAL_10:.*]] : !fir.ref) reduction(@neqv_reduction %[[VAL_4]]#0 -> %[[VAL_15:.*]] : !fir.ref>) { ! CHECK-NEXT: omp.loop_nest (%[[VAL_16:.*]]) : i32 = (%[[VAL_12]]) to (%[[VAL_13]]) inclusive step (%[[VAL_14]]) { +! CHECK: %[[VAL_11:.*]]:2 = hlfir.declare %[[VAL_10]] {uniq_name = "_QFsimple_reduction_switch_orderEi"} : (!fir.ref) -> (!fir.ref, !fir.ref) ! CHECK: %[[VAL_17:.*]]:2 = hlfir.declare %[[VAL_15]] {uniq_name = "_QFsimple_reduction_switch_orderEx"} : (!fir.ref>) -> (!fir.ref>, !fir.ref>) ! CHECK: fir.store %[[VAL_16]] to %[[VAL_11]]#1 : !fir.ref ! CHECK: %[[VAL_18:.*]] = fir.load %[[VAL_11]]#0 : !fir.ref @@ -140,13 +138,12 @@ subroutine simple_reduction_switch_order(y) ! CHECK: %[[VAL_17:.*]] = fir.convert %[[VAL_16]] : (i1) -> !fir.logical<4> ! CHECK: hlfir.assign %[[VAL_17]] to %[[VAL_11]]#0 : !fir.logical<4>, !fir.ref> ! CHECK: omp.parallel { -! CHECK: %[[VAL_18:.*]] = fir.alloca i32 {bindc_name = "i", pinned, {{.*}}} -! CHECK: %[[VAL_19:.*]]:2 = hlfir.declare %[[VAL_18]] {uniq_name = "_QFmultiple_reductionsEi"} : (!fir.ref) -> (!fir.ref, !fir.ref) ! CHECK: %[[VAL_20:.*]] = arith.constant 1 : i32 ! CHECK: %[[VAL_21:.*]] = arith.constant 100 : i32 ! CHECK: %[[VAL_22:.*]] = arith.constant 1 : i32 -! CHECK: omp.wsloop reduction(@neqv_reduction %[[VAL_7]]#0 -> %[[VAL_23:.*]], @neqv_reduction %[[VAL_9]]#0 -> %[[VAL_24:.*]], @neqv_reduction %[[VAL_11]]#0 -> %[[VAL_25:.*]] : !fir.ref>, !fir.ref>, !fir.ref>) { +! 
CHECK: omp.wsloop private(@{{.*}} %{{.*}}#0 -> %[[VAL_18:.*]] : !fir.ref) reduction(@neqv_reduction %[[VAL_7]]#0 -> %[[VAL_23:.*]], @neqv_reduction %[[VAL_9]]#0 -> %[[VAL_24:.*]], @neqv_reduction %[[VAL_11]]#0 -> %[[VAL_25:.*]] : !fir.ref>, !fir.ref>, !fir.ref>) { ! CHECK-NEXT: omp.loop_nest (%[[VAL_26:.*]]) : i32 = (%[[VAL_20]]) to (%[[VAL_21]]) inclusive step (%[[VAL_22]]) { +! CHECK: %[[VAL_19:.*]]:2 = hlfir.declare %[[VAL_18]] {uniq_name = "_QFmultiple_reductionsEi"} : (!fir.ref) -> (!fir.ref, !fir.ref) ! CHECK: %[[VAL_27:.*]]:2 = hlfir.declare %[[VAL_23]] {uniq_name = "_QFmultiple_reductionsEx"} : (!fir.ref>) -> (!fir.ref>, !fir.ref>) ! CHECK: %[[VAL_28:.*]]:2 = hlfir.declare %[[VAL_24]] {uniq_name = "_QFmultiple_reductionsEy"} : (!fir.ref>) -> (!fir.ref>, !fir.ref>) ! CHECK: %[[VAL_29:.*]]:2 = hlfir.declare %[[VAL_25]] {uniq_name = "_QFmultiple_reductionsEz"} : (!fir.ref>) -> (!fir.ref>, !fir.ref>) diff --git a/flang/test/Lower/OpenMP/wsloop-reduction-logical-or-byref.f90 b/flang/test/Lower/OpenMP/wsloop-reduction-logical-or-byref.f90 index 8f4a6c22c1d74..883064884b637 100644 --- a/flang/test/Lower/OpenMP/wsloop-reduction-logical-or-byref.f90 +++ b/flang/test/Lower/OpenMP/wsloop-reduction-logical-or-byref.f90 @@ -38,13 +38,12 @@ ! CHECK: %[[VAL_9:.*]] = fir.convert %[[VAL_8]] : (i1) -> !fir.logical<4> ! CHECK: hlfir.assign %[[VAL_9]] to %[[VAL_4]]#0 : !fir.logical<4>, !fir.ref> ! CHECK: omp.parallel { -! CHECK: %[[VAL_10:.*]] = fir.alloca i32 {bindc_name = "i", pinned, {{.*}}} -! CHECK: %[[VAL_11:.*]]:2 = hlfir.declare %[[VAL_10]] {uniq_name = "_QFsimple_reductionEi"} : (!fir.ref) -> (!fir.ref, !fir.ref) ! CHECK: %[[VAL_12:.*]] = arith.constant 1 : i32 ! CHECK: %[[VAL_13:.*]] = arith.constant 100 : i32 ! CHECK: %[[VAL_14:.*]] = arith.constant 1 : i32 -! CHECK: omp.wsloop reduction(byref @or_reduction %[[VAL_4]]#0 -> %[[VAL_15:.*]] : !fir.ref>) { +! 
CHECK: omp.wsloop private(@{{.*}} %{{.*}}#0 -> %[[VAL_10:.*]] : !fir.ref) reduction(byref @or_reduction %[[VAL_4]]#0 -> %[[VAL_15:.*]] : !fir.ref>) { ! CHECK-NEXT: omp.loop_nest (%[[VAL_16:.*]]) : i32 = (%[[VAL_12]]) to (%[[VAL_13]]) inclusive step (%[[VAL_14]]) { +! CHECK: %[[VAL_11:.*]]:2 = hlfir.declare %[[VAL_10]] {uniq_name = "_QFsimple_reductionEi"} : (!fir.ref) -> (!fir.ref, !fir.ref) ! CHECK: %[[VAL_17:.*]]:2 = hlfir.declare %[[VAL_15]] {uniq_name = "_QFsimple_reductionEx"} : (!fir.ref>) -> (!fir.ref>, !fir.ref>) ! CHECK: fir.store %[[VAL_16]] to %[[VAL_11]]#1 : !fir.ref ! CHECK: %[[VAL_18:.*]] = fir.load %[[VAL_17]]#0 : !fir.ref> @@ -86,13 +85,12 @@ subroutine simple_reduction(y) ! CHECK: %[[VAL_9:.*]] = fir.convert %[[VAL_8]] : (i1) -> !fir.logical<4> ! CHECK: hlfir.assign %[[VAL_9]] to %[[VAL_4]]#0 : !fir.logical<4>, !fir.ref> ! CHECK: omp.parallel { -! CHECK: %[[VAL_10:.*]] = fir.alloca i32 {bindc_name = "i", pinned, {{.*}}} -! CHECK: %[[VAL_11:.*]]:2 = hlfir.declare %[[VAL_10]] {uniq_name = "_QFsimple_reduction_switch_orderEi"} : (!fir.ref) -> (!fir.ref, !fir.ref) ! CHECK: %[[VAL_12:.*]] = arith.constant 1 : i32 ! CHECK: %[[VAL_13:.*]] = arith.constant 100 : i32 ! CHECK: %[[VAL_14:.*]] = arith.constant 1 : i32 -! CHECK: omp.wsloop reduction(byref @or_reduction %[[VAL_4]]#0 -> %[[VAL_15:.*]] : !fir.ref>) { +! CHECK: omp.wsloop private(@{{.*}} %{{.*}}#0 -> %[[VAL_10:.*]] : !fir.ref) reduction(byref @or_reduction %[[VAL_4]]#0 -> %[[VAL_15:.*]] : !fir.ref>) { ! CHECK-NEXT: omp.loop_nest (%[[VAL_16:.*]]) : i32 = (%[[VAL_12]]) to (%[[VAL_13]]) inclusive step (%[[VAL_14]]) { +! CHECK: %[[VAL_11:.*]]:2 = hlfir.declare %[[VAL_10]] {uniq_name = "_QFsimple_reduction_switch_orderEi"} : (!fir.ref) -> (!fir.ref, !fir.ref) ! CHECK: %[[VAL_17:.*]]:2 = hlfir.declare %[[VAL_15]] {uniq_name = "_QFsimple_reduction_switch_orderEx"} : (!fir.ref>) -> (!fir.ref>, !fir.ref>) ! CHECK: fir.store %[[VAL_16]] to %[[VAL_11]]#1 : !fir.ref ! 
CHECK: %[[VAL_18:.*]] = fir.load %[[VAL_11]]#0 : !fir.ref @@ -144,13 +142,12 @@ subroutine simple_reduction_switch_order(y) ! CHECK: %[[VAL_17:.*]] = fir.convert %[[VAL_16]] : (i1) -> !fir.logical<4> ! CHECK: hlfir.assign %[[VAL_17]] to %[[VAL_11]]#0 : !fir.logical<4>, !fir.ref> ! CHECK: omp.parallel { -! CHECK: %[[VAL_18:.*]] = fir.alloca i32 {bindc_name = "i", pinned, {{.*}}} -! CHECK: %[[VAL_19:.*]]:2 = hlfir.declare %[[VAL_18]] {uniq_name = "_QFmultiple_reductionsEi"} : (!fir.ref) -> (!fir.ref, !fir.ref) ! CHECK: %[[VAL_20:.*]] = arith.constant 1 : i32 ! CHECK: %[[VAL_21:.*]] = arith.constant 100 : i32 ! CHECK: %[[VAL_22:.*]] = arith.constant 1 : i32 -! CHECK: omp.wsloop reduction(byref @or_reduction %[[VAL_7]]#0 -> %[[VAL_23:.*]], byref @or_reduction %[[VAL_9]]#0 -> %[[VAL_24:.*]], byref @or_reduction %[[VAL_11]]#0 -> %[[VAL_25:.*]] : !fir.ref>, !fir.ref>, !fir.ref>) { +! CHECK: omp.wsloop private(@{{.*}} %{{.*}}#0 -> %[[VAL_18:.*]] : !fir.ref) reduction(byref @or_reduction %[[VAL_7]]#0 -> %[[VAL_23:.*]], byref @or_reduction %[[VAL_9]]#0 -> %[[VAL_24:.*]], byref @or_reduction %[[VAL_11]]#0 -> %[[VAL_25:.*]] : !fir.ref>, !fir.ref>, !fir.ref>) { ! CHECK-NEXT: omp.loop_nest (%[[VAL_26:.*]]) : i32 = (%[[VAL_20]]) to (%[[VAL_21]]) inclusive step (%[[VAL_22]]) { +! CHECK: %[[VAL_19:.*]]:2 = hlfir.declare %[[VAL_18]] {uniq_name = "_QFmultiple_reductionsEi"} : (!fir.ref) -> (!fir.ref, !fir.ref) ! CHECK: %[[VAL_27:.*]]:2 = hlfir.declare %[[VAL_23]] {uniq_name = "_QFmultiple_reductionsEx"} : (!fir.ref>) -> (!fir.ref>, !fir.ref>) ! CHECK: %[[VAL_28:.*]]:2 = hlfir.declare %[[VAL_24]] {uniq_name = "_QFmultiple_reductionsEy"} : (!fir.ref>) -> (!fir.ref>, !fir.ref>) ! 
CHECK: %[[VAL_29:.*]]:2 = hlfir.declare %[[VAL_25]] {uniq_name = "_QFmultiple_reductionsEz"} : (!fir.ref>) -> (!fir.ref>, !fir.ref>) diff --git a/flang/test/Lower/OpenMP/wsloop-reduction-logical-or.f90 b/flang/test/Lower/OpenMP/wsloop-reduction-logical-or.f90 index 9d367797ec216..312c08d17a14d 100644 --- a/flang/test/Lower/OpenMP/wsloop-reduction-logical-or.f90 +++ b/flang/test/Lower/OpenMP/wsloop-reduction-logical-or.f90 @@ -31,13 +31,12 @@ ! CHECK: %[[VAL_9:.*]] = fir.convert %[[VAL_8]] : (i1) -> !fir.logical<4> ! CHECK: hlfir.assign %[[VAL_9]] to %[[VAL_4]]#0 : !fir.logical<4>, !fir.ref> ! CHECK: omp.parallel { -! CHECK: %[[VAL_10:.*]] = fir.alloca i32 {bindc_name = "i", pinned, {{.*}}} -! CHECK: %[[VAL_11:.*]]:2 = hlfir.declare %[[VAL_10]] {uniq_name = "_QFsimple_reductionEi"} : (!fir.ref) -> (!fir.ref, !fir.ref) ! CHECK: %[[VAL_12:.*]] = arith.constant 1 : i32 ! CHECK: %[[VAL_13:.*]] = arith.constant 100 : i32 ! CHECK: %[[VAL_14:.*]] = arith.constant 1 : i32 -! CHECK: omp.wsloop reduction(@or_reduction %[[VAL_4]]#0 -> %[[VAL_15:.*]] : !fir.ref>) { +! CHECK: omp.wsloop private(@{{.*}} %{{.*}}#0 -> %[[VAL_10:.*]] : !fir.ref) reduction(@or_reduction %[[VAL_4]]#0 -> %[[VAL_15:.*]] : !fir.ref>) { ! CHECK-NEXT: omp.loop_nest (%[[VAL_16:.*]]) : i32 = (%[[VAL_12]]) to (%[[VAL_13]]) inclusive step (%[[VAL_14]]) { +! CHECK: %[[VAL_11:.*]]:2 = hlfir.declare %[[VAL_10]] {uniq_name = "_QFsimple_reductionEi"} : (!fir.ref) -> (!fir.ref, !fir.ref) ! CHECK: %[[VAL_17:.*]]:2 = hlfir.declare %[[VAL_15]] {uniq_name = "_QFsimple_reductionEx"} : (!fir.ref>) -> (!fir.ref>, !fir.ref>) ! CHECK: fir.store %[[VAL_16]] to %[[VAL_11]]#1 : !fir.ref ! CHECK: %[[VAL_18:.*]] = fir.load %[[VAL_17]]#0 : !fir.ref> @@ -79,13 +78,12 @@ subroutine simple_reduction(y) ! CHECK: %[[VAL_9:.*]] = fir.convert %[[VAL_8]] : (i1) -> !fir.logical<4> ! CHECK: hlfir.assign %[[VAL_9]] to %[[VAL_4]]#0 : !fir.logical<4>, !fir.ref> ! CHECK: omp.parallel { -! 
CHECK: %[[VAL_10:.*]] = fir.alloca i32 {bindc_name = "i", pinned, {{.*}}} -! CHECK: %[[VAL_11:.*]]:2 = hlfir.declare %[[VAL_10]] {uniq_name = "_QFsimple_reduction_switch_orderEi"} : (!fir.ref) -> (!fir.ref, !fir.ref) ! CHECK: %[[VAL_12:.*]] = arith.constant 1 : i32 ! CHECK: %[[VAL_13:.*]] = arith.constant 100 : i32 ! CHECK: %[[VAL_14:.*]] = arith.constant 1 : i32 -! CHECK: omp.wsloop reduction(@or_reduction %[[VAL_4]]#0 -> %[[VAL_15:.*]] : !fir.ref>) { +! CHECK: omp.wsloop private(@{{.*}} %{{.*}}#0 -> %[[VAL_10:.*]] : !fir.ref) reduction(@or_reduction %[[VAL_4]]#0 -> %[[VAL_15:.*]] : !fir.ref>) { ! CHECK-NEXT: omp.loop_nest (%[[VAL_16:.*]]) : i32 = (%[[VAL_12]]) to (%[[VAL_13]]) inclusive step (%[[VAL_14]]) { +! CHECK: %[[VAL_11:.*]]:2 = hlfir.declare %[[VAL_10]] {uniq_name = "_QFsimple_reduction_switch_orderEi"} : (!fir.ref) -> (!fir.ref, !fir.ref) ! CHECK: %[[VAL_17:.*]]:2 = hlfir.declare %[[VAL_15]] {uniq_name = "_QFsimple_reduction_switch_orderEx"} : (!fir.ref>) -> (!fir.ref>, !fir.ref>) ! CHECK: fir.store %[[VAL_16]] to %[[VAL_11]]#1 : !fir.ref ! CHECK: %[[VAL_18:.*]] = fir.load %[[VAL_11]]#0 : !fir.ref @@ -137,13 +135,12 @@ subroutine simple_reduction_switch_order(y) ! CHECK: %[[VAL_17:.*]] = fir.convert %[[VAL_16]] : (i1) -> !fir.logical<4> ! CHECK: hlfir.assign %[[VAL_17]] to %[[VAL_11]]#0 : !fir.logical<4>, !fir.ref> ! CHECK: omp.parallel { -! CHECK: %[[VAL_18:.*]] = fir.alloca i32 {bindc_name = "i", pinned, {{.*}}} -! CHECK: %[[VAL_19:.*]]:2 = hlfir.declare %[[VAL_18]] {uniq_name = "_QFmultiple_reductionsEi"} : (!fir.ref) -> (!fir.ref, !fir.ref) ! CHECK: %[[VAL_20:.*]] = arith.constant 1 : i32 ! CHECK: %[[VAL_21:.*]] = arith.constant 100 : i32 ! CHECK: %[[VAL_22:.*]] = arith.constant 1 : i32 -! CHECK: omp.wsloop reduction(@or_reduction %[[VAL_7]]#0 -> %[[VAL_23:.*]], @or_reduction %[[VAL_9]]#0 -> %[[VAL_24:.*]], @or_reduction %[[VAL_11]]#0 -> %[[VAL_25:.*]] : !fir.ref>, !fir.ref>, !fir.ref>) { +! 
CHECK: omp.wsloop private(@{{.*}} %{{.*}}#0 -> %[[VAL_18:.*]] : !fir.ref) reduction(@or_reduction %[[VAL_7]]#0 -> %[[VAL_23:.*]], @or_reduction %[[VAL_9]]#0 -> %[[VAL_24:.*]], @or_reduction %[[VAL_11]]#0 -> %[[VAL_25:.*]] : !fir.ref>, !fir.ref>, !fir.ref>) { ! CHECK-NEXT: omp.loop_nest (%[[VAL_26:.*]]) : i32 = (%[[VAL_20]]) to (%[[VAL_21]]) inclusive step (%[[VAL_22]]) { +! CHECK: %[[VAL_19:.*]]:2 = hlfir.declare %[[VAL_18]] {uniq_name = "_QFmultiple_reductionsEi"} : (!fir.ref) -> (!fir.ref, !fir.ref) ! CHECK: %[[VAL_27:.*]]:2 = hlfir.declare %[[VAL_23]] {uniq_name = "_QFmultiple_reductionsEx"} : (!fir.ref>) -> (!fir.ref>, !fir.ref>) ! CHECK: %[[VAL_28:.*]]:2 = hlfir.declare %[[VAL_24]] {uniq_name = "_QFmultiple_reductionsEy"} : (!fir.ref>) -> (!fir.ref>, !fir.ref>) ! CHECK: %[[VAL_29:.*]]:2 = hlfir.declare %[[VAL_25]] {uniq_name = "_QFmultiple_reductionsEz"} : (!fir.ref>) -> (!fir.ref>, !fir.ref>) diff --git a/flang/test/Lower/OpenMP/wsloop-reduction-max-2-byref.f90 b/flang/test/Lower/OpenMP/wsloop-reduction-max-2-byref.f90 index 28c70899e6ccc..0438e19f34391 100644 --- a/flang/test/Lower/OpenMP/wsloop-reduction-max-2-byref.f90 +++ b/flang/test/Lower/OpenMP/wsloop-reduction-max-2-byref.f90 @@ -1,7 +1,7 @@ ! RUN: bbc -emit-hlfir -fopenmp --force-byref-reduction -o - %s 2>&1 | FileCheck %s ! RUN: %flang_fc1 -emit-hlfir -fopenmp -mmlir --force-byref-reduction -o - %s 2>&1 | FileCheck %s -! CHECK: omp.wsloop reduction(byref @max_byref_i32 +! CHECK: omp.wsloop private({{.*}}) reduction(byref @max_byref_i32 ! CHECK: arith.cmpi sgt ! CHECK: arith.select diff --git a/flang/test/Lower/OpenMP/wsloop-reduction-max-2.f90 b/flang/test/Lower/OpenMP/wsloop-reduction-max-2.f90 index abd7ca1ae555d..66c75bbe38f10 100644 --- a/flang/test/Lower/OpenMP/wsloop-reduction-max-2.f90 +++ b/flang/test/Lower/OpenMP/wsloop-reduction-max-2.f90 @@ -1,7 +1,7 @@ ! RUN: bbc -emit-hlfir -fopenmp -o - %s 2>&1 | FileCheck %s ! RUN: %flang_fc1 -emit-hlfir -fopenmp -o - %s 2>&1 | FileCheck %s -! 
CHECK: omp.wsloop reduction(@max_i32 +! CHECK: omp.wsloop private({{.*}}) reduction(@max_i32 ! CHECK: arith.cmpi sgt ! CHECK: arith.select diff --git a/flang/test/Lower/OpenMP/wsloop-reduction-max-byref.f90 b/flang/test/Lower/OpenMP/wsloop-reduction-max-byref.f90 index 9abff8ccfa3b6..07c18f90480bf 100644 --- a/flang/test/Lower/OpenMP/wsloop-reduction-max-byref.f90 +++ b/flang/test/Lower/OpenMP/wsloop-reduction-max-byref.f90 @@ -45,13 +45,12 @@ ! CHECK: %[[VAL_6:.*]] = arith.constant 0 : i32 ! CHECK: hlfir.assign %[[VAL_6]] to %[[VAL_4]]#0 : i32, !fir.ref ! CHECK: omp.parallel { -! CHECK: %[[VAL_7:.*]] = fir.alloca i32 {bindc_name = "i", pinned, {{.*}}} -! CHECK: %[[VAL_8:.*]]:2 = hlfir.declare %[[VAL_7]] {uniq_name = "_QFreduction_max_intEi"} : (!fir.ref) -> (!fir.ref, !fir.ref) ! CHECK: %[[VAL_9:.*]] = arith.constant 1 : i32 ! CHECK: %[[VAL_10:.*]] = arith.constant 100 : i32 ! CHECK: %[[VAL_11:.*]] = arith.constant 1 : i32 -! CHECK: omp.wsloop reduction(byref @max_byref_i32 %[[VAL_4]]#0 -> %[[VAL_12:.*]] : !fir.ref) { +! CHECK: omp.wsloop private(@{{.*}} %{{.*}}#0 -> %[[VAL_7:.*]] : !fir.ref) reduction(byref @max_byref_i32 %[[VAL_4]]#0 -> %[[VAL_12:.*]] : !fir.ref) { ! CHECK-NEXT: omp.loop_nest (%[[VAL_13:.*]]) : i32 = (%[[VAL_9]]) to (%[[VAL_10]]) inclusive step (%[[VAL_11]]) { +! CHECK: %[[VAL_8:.*]]:2 = hlfir.declare %[[VAL_7]] {uniq_name = "_QFreduction_max_intEi"} : (!fir.ref) -> (!fir.ref, !fir.ref) ! CHECK: %[[VAL_14:.*]]:2 = hlfir.declare %[[VAL_12]] {uniq_name = "_QFreduction_max_intEx"} : (!fir.ref) -> (!fir.ref, !fir.ref) ! CHECK: fir.store %[[VAL_13]] to %[[VAL_8]]#1 : !fir.ref ! CHECK: %[[VAL_15:.*]] = fir.load %[[VAL_8]]#0 : !fir.ref @@ -75,13 +74,12 @@ ! CHECK: %[[VAL_6:.*]] = arith.constant 0.000000e+00 : f32 ! CHECK: hlfir.assign %[[VAL_6]] to %[[VAL_4]]#0 : f32, !fir.ref ! CHECK: omp.parallel { -! CHECK: %[[VAL_7:.*]] = fir.alloca i32 {bindc_name = "i", pinned, {{.*}}} -! 
CHECK: %[[VAL_8:.*]]:2 = hlfir.declare %[[VAL_7]] {uniq_name = "_QFreduction_max_realEi"} : (!fir.ref) -> (!fir.ref, !fir.ref) ! CHECK: %[[VAL_9:.*]] = arith.constant 1 : i32 ! CHECK: %[[VAL_10:.*]] = arith.constant 100 : i32 ! CHECK: %[[VAL_11:.*]] = arith.constant 1 : i32 -! CHECK: omp.wsloop reduction(byref @max_byref_f32 %[[VAL_4]]#0 -> %[[VAL_12:.*]] : !fir.ref) { +! CHECK: omp.wsloop private(@{{.*}} %{{.*}}#0 -> %[[VAL_7:.*]] : !fir.ref) reduction(byref @max_byref_f32 %[[VAL_4]]#0 -> %[[VAL_12:.*]] : !fir.ref) { ! CHECK-NEXT: omp.loop_nest (%[[VAL_13:.*]]) : i32 = (%[[VAL_9]]) to (%[[VAL_10]]) inclusive step (%[[VAL_11]]) { +! CHECK: %[[VAL_8:.*]]:2 = hlfir.declare %[[VAL_7]] {uniq_name = "_QFreduction_max_realEi"} : (!fir.ref) -> (!fir.ref, !fir.ref) ! CHECK: %[[VAL_14:.*]]:2 = hlfir.declare %[[VAL_12]] {uniq_name = "_QFreduction_max_realEx"} : (!fir.ref) -> (!fir.ref, !fir.ref) ! CHECK: fir.store %[[VAL_13]] to %[[VAL_8]]#1 : !fir.ref ! CHECK: %[[VAL_15:.*]] = fir.load %[[VAL_8]]#0 : !fir.ref @@ -95,13 +93,12 @@ ! CHECK: omp.yield ! CHECK: omp.terminator ! CHECK: omp.parallel { -! CHECK: %[[VAL_30:.*]] = fir.alloca i32 {bindc_name = "i", pinned, {{.*}}} -! CHECK: %[[VAL_31:.*]]:2 = hlfir.declare %[[VAL_30]] {uniq_name = "_QFreduction_max_realEi"} : (!fir.ref) -> (!fir.ref, !fir.ref) ! CHECK: %[[VAL_32:.*]] = arith.constant 1 : i32 ! CHECK: %[[VAL_33:.*]] = arith.constant 100 : i32 ! CHECK: %[[VAL_34:.*]] = arith.constant 1 : i32 -! CHECK: omp.wsloop reduction(byref @max_byref_f32 %[[VAL_4]]#0 -> %[[VAL_35:.*]] : !fir.ref) { +! CHECK: omp.wsloop private(@{{.*}} %{{.*}}#0 -> %[[VAL_30:.*]] : !fir.ref) reduction(byref @max_byref_f32 %[[VAL_4]]#0 -> %[[VAL_35:.*]] : !fir.ref) { ! CHECK-NEXT: omp.loop_nest (%[[VAL_36:.*]]) : i32 = (%[[VAL_32]]) to (%[[VAL_33]]) inclusive step (%[[VAL_34]]) { +! CHECK: %[[VAL_31:.*]]:2 = hlfir.declare %[[VAL_30]] {uniq_name = "_QFreduction_max_realEi"} : (!fir.ref) -> (!fir.ref, !fir.ref) ! 
CHECK: %[[VAL_37:.*]]:2 = hlfir.declare %[[VAL_35]] {uniq_name = "_QFreduction_max_realEx"} : (!fir.ref) -> (!fir.ref, !fir.ref) ! CHECK: fir.store %[[VAL_36]] to %[[VAL_31]]#1 : !fir.ref ! CHECK: %[[VAL_38:.*]] = fir.load %[[VAL_31]]#0 : !fir.ref diff --git a/flang/test/Lower/OpenMP/wsloop-reduction-max.f90 b/flang/test/Lower/OpenMP/wsloop-reduction-max.f90 index 7237d3f903b74..7bdfa0948c747 100644 --- a/flang/test/Lower/OpenMP/wsloop-reduction-max.f90 +++ b/flang/test/Lower/OpenMP/wsloop-reduction-max.f90 @@ -35,13 +35,12 @@ ! CHECK: %[[VAL_6:.*]] = arith.constant 0 : i32 ! CHECK: hlfir.assign %[[VAL_6]] to %[[VAL_4]]#0 : i32, !fir.ref ! CHECK: omp.parallel { -! CHECK: %[[VAL_7:.*]] = fir.alloca i32 {bindc_name = "i", pinned, {{.*}}} -! CHECK: %[[VAL_8:.*]]:2 = hlfir.declare %[[VAL_7]] {uniq_name = "_QFreduction_max_intEi"} : (!fir.ref) -> (!fir.ref, !fir.ref) ! CHECK: %[[VAL_9:.*]] = arith.constant 1 : i32 ! CHECK: %[[VAL_10:.*]] = arith.constant 100 : i32 ! CHECK: %[[VAL_11:.*]] = arith.constant 1 : i32 -! CHECK: omp.wsloop reduction(@max_i32 %[[VAL_4]]#0 -> %[[VAL_12:.*]] : !fir.ref) { +! CHECK: omp.wsloop private(@{{.*}} %{{.*}}#0 -> %[[VAL_7:.*]] : !fir.ref) reduction(@max_i32 %[[VAL_4]]#0 -> %[[VAL_12:.*]] : !fir.ref) { ! CHECK-NEXT: omp.loop_nest (%[[VAL_13:.*]]) : i32 = (%[[VAL_9]]) to (%[[VAL_10]]) inclusive step (%[[VAL_11]]) { +! CHECK: %[[VAL_8:.*]]:2 = hlfir.declare %[[VAL_7]] {uniq_name = "_QFreduction_max_intEi"} : (!fir.ref) -> (!fir.ref, !fir.ref) ! CHECK: %[[VAL_14:.*]]:2 = hlfir.declare %[[VAL_12]] {uniq_name = "_QFreduction_max_intEx"} : (!fir.ref) -> (!fir.ref, !fir.ref) ! CHECK: fir.store %[[VAL_13]] to %[[VAL_8]]#1 : !fir.ref ! CHECK: %[[VAL_15:.*]] = fir.load %[[VAL_8]]#0 : !fir.ref @@ -65,13 +64,12 @@ ! CHECK: %[[VAL_6:.*]] = arith.constant 0.000000e+00 : f32 ! CHECK: hlfir.assign %[[VAL_6]] to %[[VAL_4]]#0 : f32, !fir.ref ! CHECK: omp.parallel { -! CHECK: %[[VAL_7:.*]] = fir.alloca i32 {bindc_name = "i", pinned, {{.*}}} -! 
CHECK: %[[VAL_8:.*]]:2 = hlfir.declare %[[VAL_7]] {uniq_name = "_QFreduction_max_realEi"} : (!fir.ref) -> (!fir.ref, !fir.ref) ! CHECK: %[[VAL_9:.*]] = arith.constant 1 : i32 ! CHECK: %[[VAL_10:.*]] = arith.constant 100 : i32 ! CHECK: %[[VAL_11:.*]] = arith.constant 1 : i32 -! CHECK: omp.wsloop reduction(@max_f32 %[[VAL_4]]#0 -> %[[VAL_12:.*]] : !fir.ref) { +! CHECK: omp.wsloop private(@{{.*}} %{{.*}}#0 -> %[[VAL_7:.*]] : !fir.ref) reduction(@max_f32 %[[VAL_4]]#0 -> %[[VAL_12:.*]] : !fir.ref) { ! CHECK-NEXT: omp.loop_nest (%[[VAL_13:.*]]) : i32 = (%[[VAL_9]]) to (%[[VAL_10]]) inclusive step (%[[VAL_11]]) { +! CHECK: %[[VAL_8:.*]]:2 = hlfir.declare %[[VAL_7]] {uniq_name = "_QFreduction_max_realEi"} : (!fir.ref) -> (!fir.ref, !fir.ref) ! CHECK: %[[VAL_14:.*]]:2 = hlfir.declare %[[VAL_12]] {uniq_name = "_QFreduction_max_realEx"} : (!fir.ref) -> (!fir.ref, !fir.ref) ! CHECK: fir.store %[[VAL_13]] to %[[VAL_8]]#1 : !fir.ref ! CHECK: %[[VAL_15:.*]] = fir.load %[[VAL_8]]#0 : !fir.ref @@ -85,13 +83,12 @@ ! CHECK: omp.yield ! CHECK: omp.terminator ! CHECK: omp.parallel { -! CHECK: %[[VAL_30:.*]] = fir.alloca i32 {bindc_name = "i", pinned, {{.*}}} -! CHECK: %[[VAL_31:.*]]:2 = hlfir.declare %[[VAL_30]] {uniq_name = "_QFreduction_max_realEi"} : (!fir.ref) -> (!fir.ref, !fir.ref) ! CHECK: %[[VAL_32:.*]] = arith.constant 1 : i32 ! CHECK: %[[VAL_33:.*]] = arith.constant 100 : i32 ! CHECK: %[[VAL_34:.*]] = arith.constant 1 : i32 -! CHECK: omp.wsloop reduction(@max_f32 %[[VAL_4]]#0 -> %[[VAL_35:.*]] : !fir.ref) { +! CHECK: omp.wsloop private(@{{.*}} %{{.*}}#0 -> %[[VAL_30:.*]] : !fir.ref) reduction(@max_f32 %[[VAL_4]]#0 -> %[[VAL_35:.*]] : !fir.ref) { ! CHECK-NEXT: omp.loop_nest (%[[VAL_36:.*]]) : i32 = (%[[VAL_32]]) to (%[[VAL_33]]) inclusive step (%[[VAL_34]]) { +! CHECK: %[[VAL_31:.*]]:2 = hlfir.declare %[[VAL_30]] {uniq_name = "_QFreduction_max_realEi"} : (!fir.ref) -> (!fir.ref, !fir.ref) ! 
CHECK: %[[VAL_37:.*]]:2 = hlfir.declare %[[VAL_35]] {uniq_name = "_QFreduction_max_realEx"} : (!fir.ref) -> (!fir.ref, !fir.ref) ! CHECK: fir.store %[[VAL_36]] to %[[VAL_31]]#1 : !fir.ref ! CHECK: %[[VAL_38:.*]] = fir.load %[[VAL_31]]#0 : !fir.ref diff --git a/flang/test/Lower/OpenMP/wsloop-reduction-min-byref.f90 b/flang/test/Lower/OpenMP/wsloop-reduction-min-byref.f90 index a4bfbaa09d2fa..88a455f4b45ac 100644 --- a/flang/test/Lower/OpenMP/wsloop-reduction-min-byref.f90 +++ b/flang/test/Lower/OpenMP/wsloop-reduction-min-byref.f90 @@ -45,13 +45,12 @@ ! CHECK: %[[VAL_6:.*]] = arith.constant 0 : i32 ! CHECK: hlfir.assign %[[VAL_6]] to %[[VAL_4]]#0 : i32, !fir.ref ! CHECK: omp.parallel { -! CHECK: %[[VAL_7:.*]] = fir.alloca i32 {bindc_name = "i", pinned, {{.*}}} -! CHECK: %[[VAL_8:.*]]:2 = hlfir.declare %[[VAL_7]] {uniq_name = "_QFreduction_min_intEi"} : (!fir.ref) -> (!fir.ref, !fir.ref) ! CHECK: %[[VAL_9:.*]] = arith.constant 1 : i32 ! CHECK: %[[VAL_10:.*]] = arith.constant 100 : i32 ! CHECK: %[[VAL_11:.*]] = arith.constant 1 : i32 -! CHECK: omp.wsloop reduction(byref @min_byref_i32 %[[VAL_4]]#0 -> %[[VAL_12:.*]] : !fir.ref) { +! CHECK: omp.wsloop private(@{{.*}} %{{.*}}#0 -> %[[VAL_7:.*]] : !fir.ref) reduction(byref @min_byref_i32 %[[VAL_4]]#0 -> %[[VAL_12:.*]] : !fir.ref) { ! CHECK-NEXT: omp.loop_nest (%[[VAL_13:.*]]) : i32 = (%[[VAL_9]]) to (%[[VAL_10]]) inclusive step (%[[VAL_11]]) { +! CHECK: %[[VAL_8:.*]]:2 = hlfir.declare %[[VAL_7]] {uniq_name = "_QFreduction_min_intEi"} : (!fir.ref) -> (!fir.ref, !fir.ref) ! CHECK: %[[VAL_14:.*]]:2 = hlfir.declare %[[VAL_12]] {uniq_name = "_QFreduction_min_intEx"} : (!fir.ref) -> (!fir.ref, !fir.ref) ! CHECK: fir.store %[[VAL_13]] to %[[VAL_8]]#1 : !fir.ref ! CHECK: %[[VAL_15:.*]] = fir.load %[[VAL_8]]#0 : !fir.ref @@ -75,13 +74,12 @@ ! CHECK: %[[VAL_6:.*]] = arith.constant 0.000000e+00 : f32 ! CHECK: hlfir.assign %[[VAL_6]] to %[[VAL_4]]#0 : f32, !fir.ref ! CHECK: omp.parallel { -! 
CHECK: %[[VAL_7:.*]] = fir.alloca i32 {bindc_name = "i", pinned, {{.*}}} -! CHECK: %[[VAL_8:.*]]:2 = hlfir.declare %[[VAL_7]] {uniq_name = "_QFreduction_min_realEi"} : (!fir.ref) -> (!fir.ref, !fir.ref) ! CHECK: %[[VAL_9:.*]] = arith.constant 1 : i32 ! CHECK: %[[VAL_10:.*]] = arith.constant 100 : i32 ! CHECK: %[[VAL_11:.*]] = arith.constant 1 : i32 -! CHECK: omp.wsloop reduction(byref @min_byref_f32 %[[VAL_4]]#0 -> %[[VAL_12:.*]] : !fir.ref) { +! CHECK: omp.wsloop private(@{{.*}} %{{.*}}#0 -> %[[VAL_7:.*]] : !fir.ref) reduction(byref @min_byref_f32 %[[VAL_4]]#0 -> %[[VAL_12:.*]] : !fir.ref) { ! CHECK-NEXT: omp.loop_nest (%[[VAL_13:.*]]) : i32 = (%[[VAL_9]]) to (%[[VAL_10]]) inclusive step (%[[VAL_11]]) { +! CHECK: %[[VAL_8:.*]]:2 = hlfir.declare %[[VAL_7]] {uniq_name = "_QFreduction_min_realEi"} : (!fir.ref) -> (!fir.ref, !fir.ref) ! CHECK: %[[VAL_14:.*]]:2 = hlfir.declare %[[VAL_12]] {uniq_name = "_QFreduction_min_realEx"} : (!fir.ref) -> (!fir.ref, !fir.ref) ! CHECK: fir.store %[[VAL_13]] to %[[VAL_8]]#1 : !fir.ref ! CHECK: %[[VAL_15:.*]] = fir.load %[[VAL_8]]#0 : !fir.ref @@ -97,13 +95,12 @@ ! CHECK: omp.terminator ! CHECK: } ! CHECK: omp.parallel { -! CHECK: %[[VAL_30:.*]] = fir.alloca i32 {bindc_name = "i", pinned, {{.*}}} -! CHECK: %[[VAL_31:.*]]:2 = hlfir.declare %[[VAL_30]] {uniq_name = "_QFreduction_min_realEi"} : (!fir.ref) -> (!fir.ref, !fir.ref) ! CHECK: %[[VAL_32:.*]] = arith.constant 1 : i32 ! CHECK: %[[VAL_33:.*]] = arith.constant 100 : i32 ! CHECK: %[[VAL_34:.*]] = arith.constant 1 : i32 -! CHECK: omp.wsloop reduction(byref @min_byref_f32 %[[VAL_4]]#0 -> %[[VAL_35:.*]] : !fir.ref) { +! CHECK: omp.wsloop private(@{{.*}} %{{.*}}#0 -> %[[VAL_30:.*]] : !fir.ref) reduction(byref @min_byref_f32 %[[VAL_4]]#0 -> %[[VAL_35:.*]] : !fir.ref) { ! CHECK-NEXT: omp.loop_nest (%[[VAL_36:.*]]) : i32 = (%[[VAL_32]]) to (%[[VAL_33]]) inclusive step (%[[VAL_34]]) { +! 
CHECK: %[[VAL_31:.*]]:2 = hlfir.declare %[[VAL_30]] {uniq_name = "_QFreduction_min_realEi"} : (!fir.ref) -> (!fir.ref, !fir.ref) ! CHECK: %[[VAL_37:.*]]:2 = hlfir.declare %[[VAL_35]] {uniq_name = "_QFreduction_min_realEx"} : (!fir.ref) -> (!fir.ref, !fir.ref) ! CHECK: fir.store %[[VAL_36]] to %[[VAL_31]]#1 : !fir.ref ! CHECK: %[[VAL_38:.*]] = fir.load %[[VAL_31]]#0 : !fir.ref diff --git a/flang/test/Lower/OpenMP/wsloop-reduction-min.f90 b/flang/test/Lower/OpenMP/wsloop-reduction-min.f90 index ce9e53a17523c..6d4dcf1ab68eb 100644 --- a/flang/test/Lower/OpenMP/wsloop-reduction-min.f90 +++ b/flang/test/Lower/OpenMP/wsloop-reduction-min.f90 @@ -35,13 +35,12 @@ ! CHECK: %[[VAL_6:.*]] = arith.constant 0 : i32 ! CHECK: hlfir.assign %[[VAL_6]] to %[[VAL_4]]#0 : i32, !fir.ref ! CHECK: omp.parallel { -! CHECK: %[[VAL_7:.*]] = fir.alloca i32 {bindc_name = "i", pinned, {{.*}}} -! CHECK: %[[VAL_8:.*]]:2 = hlfir.declare %[[VAL_7]] {uniq_name = "_QFreduction_min_intEi"} : (!fir.ref) -> (!fir.ref, !fir.ref) ! CHECK: %[[VAL_9:.*]] = arith.constant 1 : i32 ! CHECK: %[[VAL_10:.*]] = arith.constant 100 : i32 ! CHECK: %[[VAL_11:.*]] = arith.constant 1 : i32 -! CHECK: omp.wsloop reduction(@min_i32 %[[VAL_4]]#0 -> %[[VAL_12:.*]] : !fir.ref) { +! CHECK: omp.wsloop private(@{{.*}} %{{.*}}#0 -> %[[VAL_7:.*]] : !fir.ref) reduction(@min_i32 %[[VAL_4]]#0 -> %[[VAL_12:.*]] : !fir.ref) { ! CHECK-NEXT: omp.loop_nest (%[[VAL_13:.*]]) : i32 = (%[[VAL_9]]) to (%[[VAL_10]]) inclusive step (%[[VAL_11]]) { +! CHECK: %[[VAL_8:.*]]:2 = hlfir.declare %[[VAL_7]] {uniq_name = "_QFreduction_min_intEi"} : (!fir.ref) -> (!fir.ref, !fir.ref) ! CHECK: %[[VAL_14:.*]]:2 = hlfir.declare %[[VAL_12]] {uniq_name = "_QFreduction_min_intEx"} : (!fir.ref) -> (!fir.ref, !fir.ref) ! CHECK: fir.store %[[VAL_13]] to %[[VAL_8]]#1 : !fir.ref ! CHECK: %[[VAL_15:.*]] = fir.load %[[VAL_8]]#0 : !fir.ref @@ -65,13 +64,12 @@ ! CHECK: %[[VAL_6:.*]] = arith.constant 0.000000e+00 : f32 ! 
CHECK: hlfir.assign %[[VAL_6]] to %[[VAL_4]]#0 : f32, !fir.ref ! CHECK: omp.parallel { -! CHECK: %[[VAL_7:.*]] = fir.alloca i32 {bindc_name = "i", pinned, {{.*}}} -! CHECK: %[[VAL_8:.*]]:2 = hlfir.declare %[[VAL_7]] {uniq_name = "_QFreduction_min_realEi"} : (!fir.ref) -> (!fir.ref, !fir.ref) ! CHECK: %[[VAL_9:.*]] = arith.constant 1 : i32 ! CHECK: %[[VAL_10:.*]] = arith.constant 100 : i32 ! CHECK: %[[VAL_11:.*]] = arith.constant 1 : i32 -! CHECK: omp.wsloop reduction(@min_f32 %[[VAL_4]]#0 -> %[[VAL_12:.*]] : !fir.ref) { +! CHECK: omp.wsloop private(@{{.*}} %{{.*}}#0 -> %[[VAL_7:.*]] : !fir.ref) reduction(@min_f32 %[[VAL_4]]#0 -> %[[VAL_12:.*]] : !fir.ref) { ! CHECK-NEXT: omp.loop_nest (%[[VAL_13:.*]]) : i32 = (%[[VAL_9]]) to (%[[VAL_10]]) inclusive step (%[[VAL_11]]) { +! CHECK: %[[VAL_8:.*]]:2 = hlfir.declare %[[VAL_7]] {uniq_name = "_QFreduction_min_realEi"} : (!fir.ref) -> (!fir.ref, !fir.ref) ! CHECK: %[[VAL_14:.*]]:2 = hlfir.declare %[[VAL_12]] {uniq_name = "_QFreduction_min_realEx"} : (!fir.ref) -> (!fir.ref, !fir.ref) ! CHECK: fir.store %[[VAL_13]] to %[[VAL_8]]#1 : !fir.ref ! CHECK: %[[VAL_15:.*]] = fir.load %[[VAL_8]]#0 : !fir.ref @@ -87,13 +85,12 @@ ! CHECK: omp.terminator ! CHECK: } ! CHECK: omp.parallel { -! CHECK: %[[VAL_30:.*]] = fir.alloca i32 {bindc_name = "i", pinned, {{.*}}} -! CHECK: %[[VAL_31:.*]]:2 = hlfir.declare %[[VAL_30]] {uniq_name = "_QFreduction_min_realEi"} : (!fir.ref) -> (!fir.ref, !fir.ref) ! CHECK: %[[VAL_32:.*]] = arith.constant 1 : i32 ! CHECK: %[[VAL_33:.*]] = arith.constant 100 : i32 ! CHECK: %[[VAL_34:.*]] = arith.constant 1 : i32 -! CHECK: omp.wsloop reduction(@min_f32 %[[VAL_4]]#0 -> %[[VAL_35:.*]] : !fir.ref) { +! CHECK: omp.wsloop private(@{{.*}} %{{.*}}#0 -> %[[VAL_30:.*]] : !fir.ref) reduction(@min_f32 %[[VAL_4]]#0 -> %[[VAL_35:.*]] : !fir.ref) { ! CHECK-NEXT: omp.loop_nest (%[[VAL_36:.*]]) : i32 = (%[[VAL_32]]) to (%[[VAL_33]]) inclusive step (%[[VAL_34]]) { +! 
CHECK: %[[VAL_31:.*]]:2 = hlfir.declare %[[VAL_30]] {uniq_name = "_QFreduction_min_realEi"} : (!fir.ref) -> (!fir.ref, !fir.ref) ! CHECK: %[[VAL_37:.*]]:2 = hlfir.declare %[[VAL_35]] {uniq_name = "_QFreduction_min_realEx"} : (!fir.ref) -> (!fir.ref, !fir.ref) ! CHECK: fir.store %[[VAL_36]] to %[[VAL_31]]#1 : !fir.ref ! CHECK: %[[VAL_38:.*]] = fir.load %[[VAL_31]]#0 : !fir.ref diff --git a/flang/test/Lower/OpenMP/wsloop-reduction-min2.f90 b/flang/test/Lower/OpenMP/wsloop-reduction-min2.f90 index d83ebb77af3eb..db8e59cb09dfa 100644 --- a/flang/test/Lower/OpenMP/wsloop-reduction-min2.f90 +++ b/flang/test/Lower/OpenMP/wsloop-reduction-min2.f90 @@ -34,13 +34,12 @@ program reduce ! CHECK: %[[VAL_2:.*]] = fir.address_of(@_QFEr) : !fir.ref ! CHECK: %[[VAL_3:.*]]:2 = hlfir.declare %[[VAL_2]] {uniq_name = "_QFEr"} : (!fir.ref) -> (!fir.ref, !fir.ref) ! CHECK: omp.parallel { -! CHECK: %[[VAL_4:.*]] = fir.alloca i32 {bindc_name = "i", pinned, {{.*}}} -! CHECK: %[[VAL_5:.*]]:2 = hlfir.declare %[[VAL_4]] {uniq_name = "_QFEi"} : (!fir.ref) -> (!fir.ref, !fir.ref) ! CHECK: %[[VAL_6:.*]] = arith.constant 0 : i32 ! CHECK: %[[VAL_7:.*]] = arith.constant 10 : i32 ! CHECK: %[[VAL_8:.*]] = arith.constant 1 : i32 -! CHECK: omp.wsloop reduction(@min_i32 %[[VAL_3]]#0 -> %[[VAL_9:.*]] : !fir.ref) { +! CHECK: omp.wsloop private(@{{.*}} %{{.*}}#0 -> %[[VAL_4:.*]] : !fir.ref) reduction(@min_i32 %[[VAL_3]]#0 -> %[[VAL_9:.*]] : !fir.ref) { ! CHECK-NEXT: omp.loop_nest (%[[VAL_10:.*]]) : i32 = (%[[VAL_6]]) to (%[[VAL_7]]) inclusive step (%[[VAL_8]]) { +! CHECK: %[[VAL_5:.*]]:2 = hlfir.declare %[[VAL_4]] {uniq_name = "_QFEi"} : (!fir.ref) -> (!fir.ref, !fir.ref) ! CHECK: %[[VAL_11:.*]]:2 = hlfir.declare %[[VAL_9]] {uniq_name = "_QFEr"} : (!fir.ref) -> (!fir.ref, !fir.ref) ! CHECK: fir.store %[[VAL_10]] to %[[VAL_5]]#1 : !fir.ref ! 
CHECK: %[[VAL_12:.*]] = fir.load %[[VAL_5]]#0 : !fir.ref diff --git a/flang/test/Lower/OpenMP/wsloop-reduction-mul-byref.f90 b/flang/test/Lower/OpenMP/wsloop-reduction-mul-byref.f90 index 18554fbb72aee..85df29e83f75d 100644 --- a/flang/test/Lower/OpenMP/wsloop-reduction-mul-byref.f90 +++ b/flang/test/Lower/OpenMP/wsloop-reduction-mul-byref.f90 @@ -88,13 +88,12 @@ ! CHECK: %[[VAL_4:.*]] = arith.constant 1 : i32 ! CHECK: hlfir.assign %[[VAL_4]] to %[[VAL_3]]#0 : i32, !fir.ref ! CHECK: omp.parallel { -! CHECK: %[[VAL_5:.*]] = fir.alloca i32 {bindc_name = "i", pinned, {{.*}}} -! CHECK: %[[VAL_6:.*]]:2 = hlfir.declare %[[VAL_5]] {uniq_name = "_QFsimple_int_reductionEi"} : (!fir.ref) -> (!fir.ref, !fir.ref) ! CHECK: %[[VAL_7:.*]] = arith.constant 1 : i32 ! CHECK: %[[VAL_8:.*]] = arith.constant 10 : i32 ! CHECK: %[[VAL_9:.*]] = arith.constant 1 : i32 -! CHECK: omp.wsloop reduction(byref @multiply_reduction_byref_i32 %[[VAL_3]]#0 -> %[[VAL_10:.*]] : !fir.ref) { +! CHECK: omp.wsloop private(@{{.*}} %{{.*}}#0 -> %[[VAL_5:.*]] : !fir.ref) reduction(byref @multiply_reduction_byref_i32 %[[VAL_3]]#0 -> %[[VAL_10:.*]] : !fir.ref) { ! CHECK-NEXT: omp.loop_nest (%[[VAL_11:.*]]) : i32 = (%[[VAL_7]]) to (%[[VAL_8]]) inclusive step (%[[VAL_9]]) { +! CHECK: %[[VAL_6:.*]]:2 = hlfir.declare %[[VAL_5]] {uniq_name = "_QFsimple_int_reductionEi"} : (!fir.ref) -> (!fir.ref, !fir.ref) ! CHECK: %[[VAL_12:.*]]:2 = hlfir.declare %[[VAL_10]] {uniq_name = "_QFsimple_int_reductionEx"} : (!fir.ref) -> (!fir.ref, !fir.ref) ! CHECK: fir.store %[[VAL_11]] to %[[VAL_6]]#1 : !fir.ref ! CHECK: %[[VAL_13:.*]] = fir.load %[[VAL_12]]#0 : !fir.ref @@ -125,13 +124,12 @@ subroutine simple_int_reduction ! CHECK: %[[VAL_4:.*]] = arith.constant 1.000000e+00 : f32 ! CHECK: hlfir.assign %[[VAL_4]] to %[[VAL_3]]#0 : f32, !fir.ref ! CHECK: omp.parallel { -! CHECK: %[[VAL_5:.*]] = fir.alloca i32 {bindc_name = "i", pinned, {{.*}}} -! 
CHECK: %[[VAL_6:.*]]:2 = hlfir.declare %[[VAL_5]] {uniq_name = "_QFsimple_real_reductionEi"} : (!fir.ref) -> (!fir.ref, !fir.ref) ! CHECK: %[[VAL_7:.*]] = arith.constant 1 : i32 ! CHECK: %[[VAL_8:.*]] = arith.constant 10 : i32 ! CHECK: %[[VAL_9:.*]] = arith.constant 1 : i32 -! CHECK: omp.wsloop reduction(byref @multiply_reduction_byref_f32 %[[VAL_3]]#0 -> %[[VAL_10:.*]] : !fir.ref) { +! CHECK: omp.wsloop private(@{{.*}} %{{.*}}#0 -> %[[VAL_5:.*]] : !fir.ref) reduction(byref @multiply_reduction_byref_f32 %[[VAL_3]]#0 -> %[[VAL_10:.*]] : !fir.ref) { ! CHECK-NEXT: omp.loop_nest (%[[VAL_11:.*]]) : i32 = (%[[VAL_7]]) to (%[[VAL_8]]) inclusive step (%[[VAL_9]]) { +! CHECK: %[[VAL_6:.*]]:2 = hlfir.declare %[[VAL_5]] {uniq_name = "_QFsimple_real_reductionEi"} : (!fir.ref) -> (!fir.ref, !fir.ref) ! CHECK: %[[VAL_12:.*]]:2 = hlfir.declare %[[VAL_10]] {uniq_name = "_QFsimple_real_reductionEx"} : (!fir.ref) -> (!fir.ref, !fir.ref) ! CHECK: fir.store %[[VAL_11]] to %[[VAL_6]]#1 : !fir.ref ! CHECK: %[[VAL_13:.*]] = fir.load %[[VAL_12]]#0 : !fir.ref @@ -163,13 +161,12 @@ subroutine simple_real_reduction ! CHECK: %[[VAL_4:.*]] = arith.constant 1 : i32 ! CHECK: hlfir.assign %[[VAL_4]] to %[[VAL_3]]#0 : i32, !fir.ref ! CHECK: omp.parallel { -! CHECK: %[[VAL_5:.*]] = fir.alloca i32 {bindc_name = "i", pinned, {{.*}}} -! CHECK: %[[VAL_6:.*]]:2 = hlfir.declare %[[VAL_5]] {uniq_name = "_QFsimple_int_reduction_switch_orderEi"} : (!fir.ref) -> (!fir.ref, !fir.ref) ! CHECK: %[[VAL_7:.*]] = arith.constant 1 : i32 ! CHECK: %[[VAL_8:.*]] = arith.constant 10 : i32 ! CHECK: %[[VAL_9:.*]] = arith.constant 1 : i32 -! CHECK: omp.wsloop reduction(byref @multiply_reduction_byref_i32 %[[VAL_3]]#0 -> %[[VAL_10:.*]] : !fir.ref) { +! CHECK: omp.wsloop private(@{{.*}} %{{.*}}#0 -> %[[VAL_5:.*]] : !fir.ref) reduction(byref @multiply_reduction_byref_i32 %[[VAL_3]]#0 -> %[[VAL_10:.*]] : !fir.ref) { ! 
CHECK-NEXT: omp.loop_nest (%[[VAL_11:.*]]) : i32 = (%[[VAL_7]]) to (%[[VAL_8]]) inclusive step (%[[VAL_9]]) { +! CHECK: %[[VAL_6:.*]]:2 = hlfir.declare %[[VAL_5]] {uniq_name = "_QFsimple_int_reduction_switch_orderEi"} : (!fir.ref) -> (!fir.ref, !fir.ref) ! CHECK: %[[VAL_12:.*]]:2 = hlfir.declare %[[VAL_10]] {uniq_name = "_QFsimple_int_reduction_switch_orderEx"} : (!fir.ref) -> (!fir.ref, !fir.ref) ! CHECK: fir.store %[[VAL_11]] to %[[VAL_6]]#1 : !fir.ref ! CHECK: %[[VAL_13:.*]] = fir.load %[[VAL_6]]#0 : !fir.ref @@ -200,13 +197,12 @@ subroutine simple_int_reduction_switch_order ! CHECK: %[[VAL_4:.*]] = arith.constant 1.000000e+00 : f32 ! CHECK: hlfir.assign %[[VAL_4]] to %[[VAL_3]]#0 : f32, !fir.ref ! CHECK: omp.parallel { -! CHECK: %[[VAL_5:.*]] = fir.alloca i32 {bindc_name = "i", pinned, {{.*}}} -! CHECK: %[[VAL_6:.*]]:2 = hlfir.declare %[[VAL_5]] {uniq_name = "_QFsimple_real_reduction_switch_orderEi"} : (!fir.ref) -> (!fir.ref, !fir.ref) ! CHECK: %[[VAL_7:.*]] = arith.constant 1 : i32 ! CHECK: %[[VAL_8:.*]] = arith.constant 10 : i32 ! CHECK: %[[VAL_9:.*]] = arith.constant 1 : i32 -! CHECK: omp.wsloop reduction(byref @multiply_reduction_byref_f32 %[[VAL_3]]#0 -> %[[VAL_10:.*]] : !fir.ref) { +! CHECK: omp.wsloop private(@{{.*}} %{{.*}}#0 -> %[[VAL_5:.*]] : !fir.ref) reduction(byref @multiply_reduction_byref_f32 %[[VAL_3]]#0 -> %[[VAL_10:.*]] : !fir.ref) { ! CHECK-NEXT: omp.loop_nest (%[[VAL_11:.*]]) : i32 = (%[[VAL_7]]) to (%[[VAL_8]]) inclusive step (%[[VAL_9]]) { +! CHECK: %[[VAL_6:.*]]:2 = hlfir.declare %[[VAL_5]] {uniq_name = "_QFsimple_real_reduction_switch_orderEi"} : (!fir.ref) -> (!fir.ref, !fir.ref) ! CHECK: %[[VAL_12:.*]]:2 = hlfir.declare %[[VAL_10]] {uniq_name = "_QFsimple_real_reduction_switch_orderEx"} : (!fir.ref) -> (!fir.ref, !fir.ref) ! CHECK: fir.store %[[VAL_11]] to %[[VAL_6]]#1 : !fir.ref ! CHECK: %[[VAL_13:.*]] = fir.load %[[VAL_6]]#0 : !fir.ref @@ -246,13 +242,12 @@ subroutine simple_real_reduction_switch_order ! 
CHECK: %[[VAL_10:.*]] = arith.constant 1 : i32 ! CHECK: hlfir.assign %[[VAL_10]] to %[[VAL_7]]#0 : i32, !fir.ref ! CHECK: omp.parallel { -! CHECK: %[[VAL_11:.*]] = fir.alloca i32 {bindc_name = "i", pinned, {{.*}}} -! CHECK: %[[VAL_12:.*]]:2 = hlfir.declare %[[VAL_11]] {uniq_name = "_QFmultiple_int_reductions_same_typeEi"} : (!fir.ref) -> (!fir.ref, !fir.ref) ! CHECK: %[[VAL_13:.*]] = arith.constant 1 : i32 ! CHECK: %[[VAL_14:.*]] = arith.constant 10 : i32 ! CHECK: %[[VAL_15:.*]] = arith.constant 1 : i32 -! CHECK: omp.wsloop reduction(byref @multiply_reduction_byref_i32 %[[VAL_3]]#0 -> %[[VAL_16:.*]], byref @multiply_reduction_byref_i32 %[[VAL_5]]#0 -> %[[VAL_17:.*]], byref @multiply_reduction_byref_i32 %[[VAL_7]]#0 -> %[[VAL_18:.*]] : !fir.ref, !fir.ref, !fir.ref) { +! CHECK: omp.wsloop private(@{{.*}} %{{.*}}#0 -> %[[VAL_11:.*]] : !fir.ref) reduction(byref @multiply_reduction_byref_i32 %[[VAL_3]]#0 -> %[[VAL_16:.*]], byref @multiply_reduction_byref_i32 %[[VAL_5]]#0 -> %[[VAL_17:.*]], byref @multiply_reduction_byref_i32 %[[VAL_7]]#0 -> %[[VAL_18:.*]] : !fir.ref, !fir.ref, !fir.ref) { ! CHECK-NEXT: omp.loop_nest (%[[VAL_19:.*]]) : i32 = (%[[VAL_13]]) to (%[[VAL_14]]) inclusive step (%[[VAL_15]]) { +! CHECK: %[[VAL_12:.*]]:2 = hlfir.declare %[[VAL_11]] {uniq_name = "_QFmultiple_int_reductions_same_typeEi"} : (!fir.ref) -> (!fir.ref, !fir.ref) ! CHECK: %[[VAL_20:.*]]:2 = hlfir.declare %[[VAL_16]] {uniq_name = "_QFmultiple_int_reductions_same_typeEx"} : (!fir.ref) -> (!fir.ref, !fir.ref) ! CHECK: %[[VAL_21:.*]]:2 = hlfir.declare %[[VAL_17]] {uniq_name = "_QFmultiple_int_reductions_same_typeEy"} : (!fir.ref) -> (!fir.ref, !fir.ref) ! CHECK: %[[VAL_22:.*]]:2 = hlfir.declare %[[VAL_18]] {uniq_name = "_QFmultiple_int_reductions_same_typeEz"} : (!fir.ref) -> (!fir.ref, !fir.ref) @@ -305,13 +300,12 @@ subroutine multiple_int_reductions_same_type ! CHECK: %[[VAL_10:.*]] = arith.constant 1.000000e+00 : f32 ! CHECK: hlfir.assign %[[VAL_10]] to %[[VAL_7]]#0 : f32, !fir.ref ! 
CHECK: omp.parallel { -! CHECK: %[[VAL_11:.*]] = fir.alloca i32 {bindc_name = "i", pinned, {{.*}}} -! CHECK: %[[VAL_12:.*]]:2 = hlfir.declare %[[VAL_11]] {uniq_name = "_QFmultiple_real_reductions_same_typeEi"} : (!fir.ref) -> (!fir.ref, !fir.ref) ! CHECK: %[[VAL_13:.*]] = arith.constant 1 : i32 ! CHECK: %[[VAL_14:.*]] = arith.constant 10 : i32 ! CHECK: %[[VAL_15:.*]] = arith.constant 1 : i32 -! CHECK: omp.wsloop reduction(byref @multiply_reduction_byref_f32 %[[VAL_3]]#0 -> %[[VAL_16:.*]], byref @multiply_reduction_byref_f32 %[[VAL_5]]#0 -> %[[VAL_17:.*]], byref @multiply_reduction_byref_f32 %[[VAL_7]]#0 -> %[[VAL_18:.*]] : !fir.ref, !fir.ref, !fir.ref) { +! CHECK: omp.wsloop private(@{{.*}} %{{.*}}#0 -> %[[VAL_11:.*]] : !fir.ref) reduction(byref @multiply_reduction_byref_f32 %[[VAL_3]]#0 -> %[[VAL_16:.*]], byref @multiply_reduction_byref_f32 %[[VAL_5]]#0 -> %[[VAL_17:.*]], byref @multiply_reduction_byref_f32 %[[VAL_7]]#0 -> %[[VAL_18:.*]] : !fir.ref, !fir.ref, !fir.ref) { ! CHECK-NEXT: omp.loop_nest (%[[VAL_19:.*]]) : i32 = (%[[VAL_13]]) to (%[[VAL_14]]) inclusive step (%[[VAL_15]]) { +! CHECK: %[[VAL_12:.*]]:2 = hlfir.declare %[[VAL_11]] {uniq_name = "_QFmultiple_real_reductions_same_typeEi"} : (!fir.ref) -> (!fir.ref, !fir.ref) ! CHECK: %[[VAL_20:.*]]:2 = hlfir.declare %[[VAL_16]] {uniq_name = "_QFmultiple_real_reductions_same_typeEx"} : (!fir.ref) -> (!fir.ref, !fir.ref) ! CHECK: %[[VAL_21:.*]]:2 = hlfir.declare %[[VAL_17]] {uniq_name = "_QFmultiple_real_reductions_same_typeEy"} : (!fir.ref) -> (!fir.ref, !fir.ref) ! CHECK: %[[VAL_22:.*]]:2 = hlfir.declare %[[VAL_18]] {uniq_name = "_QFmultiple_real_reductions_same_typeEz"} : (!fir.ref) -> (!fir.ref, !fir.ref) @@ -371,13 +365,12 @@ subroutine multiple_real_reductions_same_type ! CHECK: %[[VAL_13:.*]] = arith.constant 1.000000e+00 : f64 ! CHECK: hlfir.assign %[[VAL_13]] to %[[VAL_3]]#0 : f64, !fir.ref ! CHECK: omp.parallel { -! CHECK: %[[VAL_14:.*]] = fir.alloca i32 {bindc_name = "i", pinned, {{.*}}} -! 
CHECK: %[[VAL_15:.*]]:2 = hlfir.declare %[[VAL_14]] {uniq_name = "_QFmultiple_reductions_different_typeEi"} : (!fir.ref) -> (!fir.ref, !fir.ref) ! CHECK: %[[VAL_16:.*]] = arith.constant 1 : i32 ! CHECK: %[[VAL_17:.*]] = arith.constant 10 : i32 ! CHECK: %[[VAL_18:.*]] = arith.constant 1 : i32 -! CHECK: omp.wsloop reduction(byref @multiply_reduction_byref_i32 %[[VAL_5]]#0 -> %[[VAL_19:.*]], byref @multiply_reduction_byref_i64 %[[VAL_7]]#0 -> %[[VAL_20:.*]], byref @multiply_reduction_byref_f32 %[[VAL_9]]#0 -> %[[VAL_21:.*]], byref @multiply_reduction_byref_f64 %[[VAL_3]]#0 -> %[[VAL_22:.*]] : !fir.ref, !fir.ref, !fir.ref, !fir.ref) { +! CHECK: omp.wsloop private(@{{.*}} %{{.*}}#0 -> %[[VAL_14:.*]] : !fir.ref) reduction(byref @multiply_reduction_byref_i32 %[[VAL_5]]#0 -> %[[VAL_19:.*]], byref @multiply_reduction_byref_i64 %[[VAL_7]]#0 -> %[[VAL_20:.*]], byref @multiply_reduction_byref_f32 %[[VAL_9]]#0 -> %[[VAL_21:.*]], byref @multiply_reduction_byref_f64 %[[VAL_3]]#0 -> %[[VAL_22:.*]] : !fir.ref, !fir.ref, !fir.ref, !fir.ref) { ! CHECK-NEXT: omp.loop_nest (%[[VAL_23:.*]]) : i32 = (%[[VAL_16]]) to (%[[VAL_17]]) inclusive step (%[[VAL_18]]) { +! CHECK: %[[VAL_15:.*]]:2 = hlfir.declare %[[VAL_14]] {uniq_name = "_QFmultiple_reductions_different_typeEi"} : (!fir.ref) -> (!fir.ref, !fir.ref) ! CHECK: %[[VAL_24:.*]]:2 = hlfir.declare %[[VAL_19]] {uniq_name = "_QFmultiple_reductions_different_typeEx"} : (!fir.ref) -> (!fir.ref, !fir.ref) ! CHECK: %[[VAL_25:.*]]:2 = hlfir.declare %[[VAL_20]] {uniq_name = "_QFmultiple_reductions_different_typeEy"} : (!fir.ref) -> (!fir.ref, !fir.ref) ! 
CHECK: %[[VAL_26:.*]]:2 = hlfir.declare %[[VAL_21]] {uniq_name = "_QFmultiple_reductions_different_typeEz"} : (!fir.ref) -> (!fir.ref, !fir.ref) diff --git a/flang/test/Lower/OpenMP/wsloop-reduction-mul.f90 b/flang/test/Lower/OpenMP/wsloop-reduction-mul.f90 index f5c12ccf61f76..09c44f187f4a2 100644 --- a/flang/test/Lower/OpenMP/wsloop-reduction-mul.f90 +++ b/flang/test/Lower/OpenMP/wsloop-reduction-mul.f90 @@ -55,13 +55,12 @@ ! CHECK: %[[VAL_4:.*]] = arith.constant 1 : i32 ! CHECK: hlfir.assign %[[VAL_4]] to %[[VAL_3]]#0 : i32, !fir.ref ! CHECK: omp.parallel { -! CHECK: %[[VAL_5:.*]] = fir.alloca i32 {bindc_name = "i", pinned, {{.*}}} -! CHECK: %[[VAL_6:.*]]:2 = hlfir.declare %[[VAL_5]] {uniq_name = "_QFsimple_int_reductionEi"} : (!fir.ref) -> (!fir.ref, !fir.ref) ! CHECK: %[[VAL_7:.*]] = arith.constant 1 : i32 ! CHECK: %[[VAL_8:.*]] = arith.constant 10 : i32 ! CHECK: %[[VAL_9:.*]] = arith.constant 1 : i32 -! CHECK: omp.wsloop reduction(@multiply_reduction_i32 %[[VAL_3]]#0 -> %[[VAL_10:.*]] : !fir.ref) { +! CHECK: omp.wsloop private(@{{.*}} %{{.*}}#0 -> %[[VAL_5:.*]] : !fir.ref) reduction(@multiply_reduction_i32 %[[VAL_3]]#0 -> %[[VAL_10:.*]] : !fir.ref) { ! CHECK-NEXT: omp.loop_nest (%[[VAL_11:.*]]) : i32 = (%[[VAL_7]]) to (%[[VAL_8]]) inclusive step (%[[VAL_9]]) { +! CHECK: %[[VAL_6:.*]]:2 = hlfir.declare %[[VAL_5]] {uniq_name = "_QFsimple_int_reductionEi"} : (!fir.ref) -> (!fir.ref, !fir.ref) ! CHECK: %[[VAL_12:.*]]:2 = hlfir.declare %[[VAL_10]] {uniq_name = "_QFsimple_int_reductionEx"} : (!fir.ref) -> (!fir.ref, !fir.ref) ! CHECK: fir.store %[[VAL_11]] to %[[VAL_6]]#1 : !fir.ref ! CHECK: %[[VAL_13:.*]] = fir.load %[[VAL_12]]#0 : !fir.ref @@ -92,13 +91,12 @@ subroutine simple_int_reduction ! CHECK: %[[VAL_4:.*]] = arith.constant 1.000000e+00 : f32 ! CHECK: hlfir.assign %[[VAL_4]] to %[[VAL_3]]#0 : f32, !fir.ref ! CHECK: omp.parallel { -! CHECK: %[[VAL_5:.*]] = fir.alloca i32 {bindc_name = "i", pinned, {{.*}}} -! 
CHECK: %[[VAL_6:.*]]:2 = hlfir.declare %[[VAL_5]] {uniq_name = "_QFsimple_real_reductionEi"} : (!fir.ref) -> (!fir.ref, !fir.ref) ! CHECK: %[[VAL_7:.*]] = arith.constant 1 : i32 ! CHECK: %[[VAL_8:.*]] = arith.constant 10 : i32 ! CHECK: %[[VAL_9:.*]] = arith.constant 1 : i32 -! CHECK: omp.wsloop reduction(@multiply_reduction_f32 %[[VAL_3]]#0 -> %[[VAL_10:.*]] : !fir.ref) { +! CHECK: omp.wsloop private(@{{.*}} %{{.*}}#0 -> %[[VAL_5:.*]] : !fir.ref) reduction(@multiply_reduction_f32 %[[VAL_3]]#0 -> %[[VAL_10:.*]] : !fir.ref) { ! CHECK-NEXT: omp.loop_nest (%[[VAL_11:.*]]) : i32 = (%[[VAL_7]]) to (%[[VAL_8]]) inclusive step (%[[VAL_9]]) { +! CHECK: %[[VAL_6:.*]]:2 = hlfir.declare %[[VAL_5]] {uniq_name = "_QFsimple_real_reductionEi"} : (!fir.ref) -> (!fir.ref, !fir.ref) ! CHECK: %[[VAL_12:.*]]:2 = hlfir.declare %[[VAL_10]] {uniq_name = "_QFsimple_real_reductionEx"} : (!fir.ref) -> (!fir.ref, !fir.ref) ! CHECK: fir.store %[[VAL_11]] to %[[VAL_6]]#1 : !fir.ref ! CHECK: %[[VAL_13:.*]] = fir.load %[[VAL_12]]#0 : !fir.ref @@ -130,13 +128,12 @@ subroutine simple_real_reduction ! CHECK: %[[VAL_4:.*]] = arith.constant 1 : i32 ! CHECK: hlfir.assign %[[VAL_4]] to %[[VAL_3]]#0 : i32, !fir.ref ! CHECK: omp.parallel { -! CHECK: %[[VAL_5:.*]] = fir.alloca i32 {bindc_name = "i", pinned, {{.*}}} -! CHECK: %[[VAL_6:.*]]:2 = hlfir.declare %[[VAL_5]] {uniq_name = "_QFsimple_int_reduction_switch_orderEi"} : (!fir.ref) -> (!fir.ref, !fir.ref) ! CHECK: %[[VAL_7:.*]] = arith.constant 1 : i32 ! CHECK: %[[VAL_8:.*]] = arith.constant 10 : i32 ! CHECK: %[[VAL_9:.*]] = arith.constant 1 : i32 -! CHECK: omp.wsloop reduction(@multiply_reduction_i32 %[[VAL_3]]#0 -> %[[VAL_10:.*]] : !fir.ref) { +! CHECK: omp.wsloop private(@{{.*}} %{{.*}}#0 -> %[[VAL_5:.*]] : !fir.ref) reduction(@multiply_reduction_i32 %[[VAL_3]]#0 -> %[[VAL_10:.*]] : !fir.ref) { ! CHECK-NEXT: omp.loop_nest (%[[VAL_11:.*]]) : i32 = (%[[VAL_7]]) to (%[[VAL_8]]) inclusive step (%[[VAL_9]]) { +! 
CHECK: %[[VAL_6:.*]]:2 = hlfir.declare %[[VAL_5]] {uniq_name = "_QFsimple_int_reduction_switch_orderEi"} : (!fir.ref) -> (!fir.ref, !fir.ref) ! CHECK: %[[VAL_12:.*]]:2 = hlfir.declare %[[VAL_10]] {uniq_name = "_QFsimple_int_reduction_switch_orderEx"} : (!fir.ref) -> (!fir.ref, !fir.ref) ! CHECK: fir.store %[[VAL_11]] to %[[VAL_6]]#1 : !fir.ref ! CHECK: %[[VAL_13:.*]] = fir.load %[[VAL_6]]#0 : !fir.ref @@ -167,13 +164,12 @@ subroutine simple_int_reduction_switch_order ! CHECK: %[[VAL_4:.*]] = arith.constant 1.000000e+00 : f32 ! CHECK: hlfir.assign %[[VAL_4]] to %[[VAL_3]]#0 : f32, !fir.ref ! CHECK: omp.parallel { -! CHECK: %[[VAL_5:.*]] = fir.alloca i32 {bindc_name = "i", pinned, {{.*}}} -! CHECK: %[[VAL_6:.*]]:2 = hlfir.declare %[[VAL_5]] {uniq_name = "_QFsimple_real_reduction_switch_orderEi"} : (!fir.ref) -> (!fir.ref, !fir.ref) ! CHECK: %[[VAL_7:.*]] = arith.constant 1 : i32 ! CHECK: %[[VAL_8:.*]] = arith.constant 10 : i32 ! CHECK: %[[VAL_9:.*]] = arith.constant 1 : i32 -! CHECK: omp.wsloop reduction(@multiply_reduction_f32 %[[VAL_3]]#0 -> %[[VAL_10:.*]] : !fir.ref) { +! CHECK: omp.wsloop private(@{{.*}} %{{.*}}#0 -> %[[VAL_5:.*]] : !fir.ref) reduction(@multiply_reduction_f32 %[[VAL_3]]#0 -> %[[VAL_10:.*]] : !fir.ref) { ! CHECK-NEXT: omp.loop_nest (%[[VAL_11:.*]]) : i32 = (%[[VAL_7]]) to (%[[VAL_8]]) inclusive step (%[[VAL_9]]) { +! CHECK: %[[VAL_6:.*]]:2 = hlfir.declare %[[VAL_5]] {uniq_name = "_QFsimple_real_reduction_switch_orderEi"} : (!fir.ref) -> (!fir.ref, !fir.ref) ! CHECK: %[[VAL_12:.*]]:2 = hlfir.declare %[[VAL_10]] {uniq_name = "_QFsimple_real_reduction_switch_orderEx"} : (!fir.ref) -> (!fir.ref, !fir.ref) ! CHECK: fir.store %[[VAL_11]] to %[[VAL_6]]#1 : !fir.ref ! CHECK: %[[VAL_13:.*]] = fir.load %[[VAL_6]]#0 : !fir.ref @@ -213,13 +209,12 @@ subroutine simple_real_reduction_switch_order ! CHECK: %[[VAL_10:.*]] = arith.constant 1 : i32 ! CHECK: hlfir.assign %[[VAL_10]] to %[[VAL_7]]#0 : i32, !fir.ref ! CHECK: omp.parallel { -! 
CHECK: %[[VAL_11:.*]] = fir.alloca i32 {bindc_name = "i", pinned, {{.*}}} -! CHECK: %[[VAL_12:.*]]:2 = hlfir.declare %[[VAL_11]] {uniq_name = "_QFmultiple_int_reductions_same_typeEi"} : (!fir.ref) -> (!fir.ref, !fir.ref) ! CHECK: %[[VAL_13:.*]] = arith.constant 1 : i32 ! CHECK: %[[VAL_14:.*]] = arith.constant 10 : i32 ! CHECK: %[[VAL_15:.*]] = arith.constant 1 : i32 -! CHECK: omp.wsloop reduction(@multiply_reduction_i32 %[[VAL_3]]#0 -> %[[VAL_16:.*]], @multiply_reduction_i32 %[[VAL_5]]#0 -> %[[VAL_17:.*]], @multiply_reduction_i32 %[[VAL_7]]#0 -> %[[VAL_18:.*]] : !fir.ref, !fir.ref, !fir.ref) { +! CHECK: omp.wsloop private(@{{.*}} %{{.*}}#0 -> %[[VAL_11:.*]] : !fir.ref) reduction(@multiply_reduction_i32 %[[VAL_3]]#0 -> %[[VAL_16:.*]], @multiply_reduction_i32 %[[VAL_5]]#0 -> %[[VAL_17:.*]], @multiply_reduction_i32 %[[VAL_7]]#0 -> %[[VAL_18:.*]] : !fir.ref, !fir.ref, !fir.ref) { ! CHECK-NEXT: omp.loop_nest (%[[VAL_19:.*]]) : i32 = (%[[VAL_13]]) to (%[[VAL_14]]) inclusive step (%[[VAL_15]]) { +! CHECK: %[[VAL_12:.*]]:2 = hlfir.declare %[[VAL_11]] {uniq_name = "_QFmultiple_int_reductions_same_typeEi"} : (!fir.ref) -> (!fir.ref, !fir.ref) ! CHECK: %[[VAL_20:.*]]:2 = hlfir.declare %[[VAL_16]] {uniq_name = "_QFmultiple_int_reductions_same_typeEx"} : (!fir.ref) -> (!fir.ref, !fir.ref) ! CHECK: %[[VAL_21:.*]]:2 = hlfir.declare %[[VAL_17]] {uniq_name = "_QFmultiple_int_reductions_same_typeEy"} : (!fir.ref) -> (!fir.ref, !fir.ref) ! CHECK: %[[VAL_22:.*]]:2 = hlfir.declare %[[VAL_18]] {uniq_name = "_QFmultiple_int_reductions_same_typeEz"} : (!fir.ref) -> (!fir.ref, !fir.ref) @@ -272,13 +267,12 @@ subroutine multiple_int_reductions_same_type ! CHECK: %[[VAL_10:.*]] = arith.constant 1.000000e+00 : f32 ! CHECK: hlfir.assign %[[VAL_10]] to %[[VAL_7]]#0 : f32, !fir.ref ! CHECK: omp.parallel { -! CHECK: %[[VAL_11:.*]] = fir.alloca i32 {bindc_name = "i", pinned, {{.*}}} -! 
CHECK: %[[VAL_12:.*]]:2 = hlfir.declare %[[VAL_11]] {uniq_name = "_QFmultiple_real_reductions_same_typeEi"} : (!fir.ref) -> (!fir.ref, !fir.ref) ! CHECK: %[[VAL_13:.*]] = arith.constant 1 : i32 ! CHECK: %[[VAL_14:.*]] = arith.constant 10 : i32 ! CHECK: %[[VAL_15:.*]] = arith.constant 1 : i32 -! CHECK: omp.wsloop reduction(@multiply_reduction_f32 %[[VAL_3]]#0 -> %[[VAL_16:.*]], @multiply_reduction_f32 %[[VAL_5]]#0 -> %[[VAL_17:.*]], @multiply_reduction_f32 %[[VAL_7]]#0 -> %[[VAL_18:.*]] : !fir.ref, !fir.ref, !fir.ref) { +! CHECK: omp.wsloop private(@{{.*}} %{{.*}}#0 -> %[[VAL_11:.*]] : !fir.ref) reduction(@multiply_reduction_f32 %[[VAL_3]]#0 -> %[[VAL_16:.*]], @multiply_reduction_f32 %[[VAL_5]]#0 -> %[[VAL_17:.*]], @multiply_reduction_f32 %[[VAL_7]]#0 -> %[[VAL_18:.*]] : !fir.ref, !fir.ref, !fir.ref) { ! CHECK-NEXT: omp.loop_nest (%[[VAL_19:.*]]) : i32 = (%[[VAL_13]]) to (%[[VAL_14]]) inclusive step (%[[VAL_15]]) { +! CHECK: %[[VAL_12:.*]]:2 = hlfir.declare %[[VAL_11]] {uniq_name = "_QFmultiple_real_reductions_same_typeEi"} : (!fir.ref) -> (!fir.ref, !fir.ref) ! CHECK: %[[VAL_20:.*]]:2 = hlfir.declare %[[VAL_16]] {uniq_name = "_QFmultiple_real_reductions_same_typeEx"} : (!fir.ref) -> (!fir.ref, !fir.ref) ! CHECK: %[[VAL_21:.*]]:2 = hlfir.declare %[[VAL_17]] {uniq_name = "_QFmultiple_real_reductions_same_typeEy"} : (!fir.ref) -> (!fir.ref, !fir.ref) ! CHECK: %[[VAL_22:.*]]:2 = hlfir.declare %[[VAL_18]] {uniq_name = "_QFmultiple_real_reductions_same_typeEz"} : (!fir.ref) -> (!fir.ref, !fir.ref) @@ -338,13 +332,12 @@ subroutine multiple_real_reductions_same_type ! CHECK: %[[VAL_13:.*]] = arith.constant 1.000000e+00 : f64 ! CHECK: hlfir.assign %[[VAL_13]] to %[[VAL_3]]#0 : f64, !fir.ref ! CHECK: omp.parallel { -! CHECK: %[[VAL_14:.*]] = fir.alloca i32 {bindc_name = "i", pinned, {{.*}}} -! CHECK: %[[VAL_15:.*]]:2 = hlfir.declare %[[VAL_14]] {uniq_name = "_QFmultiple_reductions_different_typeEi"} : (!fir.ref) -> (!fir.ref, !fir.ref) ! 
CHECK: %[[VAL_16:.*]] = arith.constant 1 : i32 ! CHECK: %[[VAL_17:.*]] = arith.constant 10 : i32 ! CHECK: %[[VAL_18:.*]] = arith.constant 1 : i32 -! CHECK: omp.wsloop reduction(@multiply_reduction_i32 %[[VAL_5]]#0 -> %[[VAL_19:.*]], @multiply_reduction_i64 %[[VAL_7]]#0 -> %[[VAL_20:.*]], @multiply_reduction_f32 %[[VAL_9]]#0 -> %[[VAL_21:.*]], @multiply_reduction_f64 %[[VAL_3]]#0 -> %[[VAL_22:.*]] : !fir.ref, !fir.ref, !fir.ref, !fir.ref) { +! CHECK: omp.wsloop private(@{{.*}} %{{.*}}#0 -> %[[VAL_14:.*]] : !fir.ref) reduction(@multiply_reduction_i32 %[[VAL_5]]#0 -> %[[VAL_19:.*]], @multiply_reduction_i64 %[[VAL_7]]#0 -> %[[VAL_20:.*]], @multiply_reduction_f32 %[[VAL_9]]#0 -> %[[VAL_21:.*]], @multiply_reduction_f64 %[[VAL_3]]#0 -> %[[VAL_22:.*]] : !fir.ref, !fir.ref, !fir.ref, !fir.ref) { ! CHECK-NEXT: omp.loop_nest (%[[VAL_23:.*]]) : i32 = (%[[VAL_16]]) to (%[[VAL_17]]) inclusive step (%[[VAL_18]]) { +! CHECK: %[[VAL_15:.*]]:2 = hlfir.declare %[[VAL_14]] {uniq_name = "_QFmultiple_reductions_different_typeEi"} : (!fir.ref) -> (!fir.ref, !fir.ref) ! CHECK: %[[VAL_24:.*]]:2 = hlfir.declare %[[VAL_19]] {uniq_name = "_QFmultiple_reductions_different_typeEx"} : (!fir.ref) -> (!fir.ref, !fir.ref) ! CHECK: %[[VAL_25:.*]]:2 = hlfir.declare %[[VAL_20]] {uniq_name = "_QFmultiple_reductions_different_typeEy"} : (!fir.ref) -> (!fir.ref, !fir.ref) ! 
CHECK: %[[VAL_26:.*]]:2 = hlfir.declare %[[VAL_21]] {uniq_name = "_QFmultiple_reductions_different_typeEz"} : (!fir.ref) -> (!fir.ref, !fir.ref) diff --git a/flang/test/Lower/OpenMP/wsloop-reduction-multi.f90 b/flang/test/Lower/OpenMP/wsloop-reduction-multi.f90 index 659ba06005670..66229259adf82 100644 --- a/flang/test/Lower/OpenMP/wsloop-reduction-multi.f90 +++ b/flang/test/Lower/OpenMP/wsloop-reduction-multi.f90 @@ -41,7 +41,7 @@ !CHECK: %[[Y_DECL:.*]]:2 = hlfir.declare %[[Y_REF]] {uniq_name = "_QFmultiple_reductionEy"} : (!fir.ref) -> (!fir.ref, !fir.ref) !CHECK: %[[Z_REF:.*]] = fir.alloca i32 {bindc_name = "z", uniq_name = "_QFmultiple_reductionEz"} !CHECK: %[[Z_DECL:.*]]:2 = hlfir.declare %[[Z_REF]] {uniq_name = "_QFmultiple_reductionEz"} : (!fir.ref) -> (!fir.ref, !fir.ref) -!CHECK: omp.wsloop reduction( +!CHECK: omp.wsloop private({{.*}}) reduction( !CHECK-SAME: @[[ADD_RED_I32_NAME]] %[[X_DECL]]#0 -> %[[PRV_X:[^,]+]], !CHECK-SAME: @[[ADD_RED_F32_NAME]] %[[Y_DECL]]#0 -> %[[PRV_Y:[^,]+]], !CHECK-SAME: @[[MIN_RED_I32_NAME]] %[[Z_DECL]]#0 -> %[[PRV_Z:.+]] : diff --git a/flang/test/Lower/OpenMP/wsloop-reduction-multiple-clauses.f90 b/flang/test/Lower/OpenMP/wsloop-reduction-multiple-clauses.f90 index 5b6ab095b45b6..75773416e4840 100644 --- a/flang/test/Lower/OpenMP/wsloop-reduction-multiple-clauses.f90 +++ b/flang/test/Lower/OpenMP/wsloop-reduction-multiple-clauses.f90 @@ -112,13 +112,12 @@ program main ! CHECK: %[[VAL_11:.*]] = fir.embox %[[VAL_4]]#0(%[[VAL_3]]) : (!fir.ref>, !fir.shape<2>) -> !fir.box> ! CHECK: %[[VAL_12:.*]] = fir.alloca !fir.box> ! CHECK: fir.store %[[VAL_11]] to %[[VAL_12]] : !fir.ref>> -! CHECK: %[[VAL_13:.*]] = fir.alloca i32 {bindc_name = "i", pinned, {{.*}}} -! CHECK: %[[VAL_14:.*]]:2 = hlfir.declare %[[VAL_13]] {uniq_name = "_QFEi"} : (!fir.ref) -> (!fir.ref, !fir.ref) ! CHECK: %[[VAL_15:.*]] = arith.constant 1 : i32 ! CHECK: %[[VAL_16:.*]] = arith.constant 10 : i32 ! CHECK: %[[VAL_17:.*]] = arith.constant 1 : i32 -! 
CHECK: omp.wsloop reduction(@add_reduction_f64 %[[VAL_8]]#0 -> %[[VAL_18:.*]], byref @add_reduction_byref_box_3x3xf64 %[[VAL_12]] -> %[[VAL_19:.*]] : !fir.ref, !fir.ref>>) { +! CHECK: omp.wsloop private(@{{.*}} %{{.*}}#0 -> %[[VAL_13:.*]] : !fir.ref) reduction(@add_reduction_f64 %[[VAL_8]]#0 -> %[[VAL_18:.*]], byref @add_reduction_byref_box_3x3xf64 %[[VAL_12]] -> %[[VAL_19:.*]] : !fir.ref, !fir.ref>>) { ! CHECK: omp.loop_nest (%[[VAL_20:.*]]) : i32 = (%[[VAL_15]]) to (%[[VAL_16]]) inclusive step (%[[VAL_17]]) { +! CHECK: %[[VAL_14:.*]]:2 = hlfir.declare %[[VAL_13]] {uniq_name = "_QFEi"} : (!fir.ref) -> (!fir.ref, !fir.ref) ! CHECK: %[[VAL_21:.*]]:2 = hlfir.declare %[[VAL_18]] {uniq_name = "_QFEscalar"} : (!fir.ref) -> (!fir.ref, !fir.ref) ! CHECK: %[[VAL_22:.*]]:2 = hlfir.declare %[[VAL_19]] {uniq_name = "_QFEarray"} : (!fir.ref>>) -> (!fir.ref>>, !fir.ref>>) ! CHECK: fir.store %[[VAL_20]] to %[[VAL_14]]#1 : !fir.ref diff --git a/flang/test/Lower/OpenMP/wsloop-reduction-pointer.f90 b/flang/test/Lower/OpenMP/wsloop-reduction-pointer.f90 index 2c126bb8962c2..f706e48b8fda8 100644 --- a/flang/test/Lower/OpenMP/wsloop-reduction-pointer.f90 +++ b/flang/test/Lower/OpenMP/wsloop-reduction-pointer.f90 @@ -87,13 +87,12 @@ program reduce_pointer ! CHECK: %[[VAL_17:.*]] = fir.box_addr %[[VAL_16]] : (!fir.box>) -> !fir.ptr ! CHECK: hlfir.assign %[[VAL_15]] to %[[VAL_17]] : i32, !fir.ptr ! CHECK: omp.parallel { -! CHECK: %[[VAL_18:.*]] = fir.alloca i32 {bindc_name = "i", pinned, uniq_name = "_QFEi"} -! CHECK: %[[VAL_19:.*]]:2 = hlfir.declare %[[VAL_18]] {uniq_name = "_QFEi"} : (!fir.ref) -> (!fir.ref, !fir.ref) ! CHECK: %[[VAL_20:.*]] = arith.constant 1 : i32 ! CHECK: %[[VAL_21:.*]] = arith.constant 5 : i32 ! CHECK: %[[VAL_22:.*]] = arith.constant 1 : i32 -! CHECK: omp.wsloop reduction(byref @add_reduction_byref_box_ptr_i32 %[[VAL_5]]#0 -> %[[VAL_23:.*]] : !fir.ref>>) { +! 
CHECK: omp.wsloop private(@{{.*}} %{{.*}}#0 -> %[[VAL_18:.*]] : !fir.ref) reduction(byref @add_reduction_byref_box_ptr_i32 %[[VAL_5]]#0 -> %[[VAL_23:.*]] : !fir.ref>>) { ! CHECK: omp.loop_nest (%[[VAL_24:.*]]) : i32 = (%[[VAL_20]]) to (%[[VAL_21]]) inclusive step (%[[VAL_22]]) { +! CHECK: %[[VAL_19:.*]]:2 = hlfir.declare %[[VAL_18]] {uniq_name = "_QFEi"} : (!fir.ref) -> (!fir.ref, !fir.ref) ! CHECK: %[[VAL_25:.*]]:2 = hlfir.declare %[[VAL_23]] {fortran_attrs = {{.*}}, uniq_name = "_QFEv"} : (!fir.ref>>) -> (!fir.ref>>, !fir.ref>>) ! CHECK: fir.store %[[VAL_24]] to %[[VAL_19]]#1 : !fir.ref ! CHECK: %[[VAL_26:.*]] = fir.load %[[VAL_25]]#0 : !fir.ref>> diff --git a/flang/test/Lower/OpenMP/wsloop-schedule.f90 b/flang/test/Lower/OpenMP/wsloop-schedule.f90 index ae854a2de0c9d..0ff4ce7c3ede3 100644 --- a/flang/test/Lower/OpenMP/wsloop-schedule.f90 +++ b/flang/test/Lower/OpenMP/wsloop-schedule.f90 @@ -14,7 +14,7 @@ program wsloop_dynamic !CHECK: %[[WS_LB:.*]] = arith.constant 1 : i32 !CHECK: %[[WS_UB:.*]] = arith.constant 9 : i32 !CHECK: %[[WS_STEP:.*]] = arith.constant 1 : i32 -!CHECK: omp.wsloop nowait schedule(runtime, simd) { +!CHECK: omp.wsloop nowait schedule(runtime, simd) private({{.*}}) { !CHECK-NEXT: omp.loop_nest (%[[I:.*]]) : i32 = (%[[WS_LB]]) to (%[[WS_UB]]) inclusive step (%[[WS_STEP]]) { !CHECK: fir.store %[[I]] to %[[STORE:.*]]#1 : !fir.ref diff --git a/flang/test/Lower/OpenMP/wsloop-unstructured.f90 b/flang/test/Lower/OpenMP/wsloop-unstructured.f90 index 8c89f863ab877..6174718c08758 100644 --- a/flang/test/Lower/OpenMP/wsloop-unstructured.f90 +++ b/flang/test/Lower/OpenMP/wsloop-unstructured.f90 @@ -29,7 +29,7 @@ end subroutine sub ! CHECK-SAME: %[[VAL_2:.*]]: !fir.ref> {fir.bindc_name = "x"}, ! CHECK-SAME: %[[VAL_3:.*]]: !fir.ref> {fir.bindc_name = "y"}) { ! [...] -! CHECK: omp.wsloop { +! CHECK: omp.wsloop private({{.*}}) { ! CHECK-NEXT: omp.loop_nest (%[[VAL_53:.*]], %[[VAL_54:.*]]) : i32 = ({{.*}}) to ({{.*}}) inclusive step ({{.*}}) { ! [...] ! 
CHECK: cf.br ^bb1 diff --git a/flang/test/Lower/OpenMP/wsloop-variable.f90 b/flang/test/Lower/OpenMP/wsloop-variable.f90 index cc77ce754d97e..50b2b3a21ff1e 100644 --- a/flang/test/Lower/OpenMP/wsloop-variable.f90 +++ b/flang/test/Lower/OpenMP/wsloop-variable.f90 @@ -22,7 +22,7 @@ program wsloop_variable !CHECK: %[[TMP5:.*]] = fir.convert %{{.*}} : (i128) -> i64 !CHECK: %[[TMP6:.*]] = fir.convert %[[TMP1]] : (i32) -> i64 !CHECK: %[[TMP7:.*]] = fir.convert %{{.*}} : (i32) -> i64 -!CHECK: omp.wsloop { +!CHECK: omp.wsloop private({{.*}}) { !CHECK-NEXT: omp.loop_nest (%[[ARG0:.*]], %[[ARG1:.*]]) : i64 = (%[[TMP2]], %[[TMP5]]) to (%[[TMP3]], %[[TMP6]]) inclusive step (%[[TMP4]], %[[TMP7]]) { !CHECK: %[[ARG0_I16:.*]] = fir.convert %[[ARG0]] : (i64) -> i16 !CHECK: fir.store %[[ARG0_I16]] to %[[STORE_IV0:.*]]#1 : !fir.ref @@ -48,7 +48,7 @@ program wsloop_variable !CHECK: %[[TMP12:.*]] = arith.constant 1 : i32 !CHECK: %[[TMP13:.*]] = fir.convert %{{.*}} : (i8) -> i32 !CHECK: %[[TMP14:.*]] = fir.convert %{{.*}} : (i64) -> i32 -!CHECK: omp.wsloop { +!CHECK: omp.wsloop private({{.*}}) { !CHECK-NEXT: omp.loop_nest (%[[ARG0:.*]]) : i32 = (%[[TMP12]]) to (%[[TMP13]]) inclusive step (%[[TMP14]]) { !CHECK: %[[ARG0_I16:.*]] = fir.convert %[[ARG0]] : (i32) -> i16 !CHECK: fir.store %[[ARG0_I16]] to %[[STORE3:.*]]#1 : !fir.ref @@ -68,7 +68,7 @@ program wsloop_variable !CHECK: %[[TMP17:.*]] = fir.convert %{{.*}} : (i8) -> i64 !CHECK: %[[TMP18:.*]] = fir.convert %{{.*}} : (i16) -> i64 !CHECK: %[[TMP19:.*]] = fir.convert %{{.*}} : (i32) -> i64 -!CHECK: omp.wsloop { +!CHECK: omp.wsloop private({{.*}}) { !CHECK-NEXT: omp.loop_nest (%[[ARG1:.*]]) : i64 = (%[[TMP17]]) to (%[[TMP18]]) inclusive step (%[[TMP19]]) { !CHECK: %[[ARG1_I128:.*]] = fir.convert %[[ARG1]] : (i64) -> i128 !CHECK: fir.store %[[ARG1_I128]] to %[[STORE4:.*]]#1 : !fir.ref @@ -123,16 +123,14 @@ subroutine wsloop_variable_sub integer(kind=16) :: i16_lb real :: x -!CHECK: %[[VAL_2:.*]] = fir.alloca i16 {bindc_name = "i2", 
pinned, {{.*}}} -!CHECK: %[[VAL_3:.*]]:2 = hlfir.declare %[[VAL_2]] {uniq_name = "_QFwsloop_variable_subEi2"} : (!fir.ref) -> (!fir.ref, !fir.ref) - !CHECK: %[[VAL_22:.*]] = arith.constant 1 : i32 !CHECK: %[[VAL_23:.*]] = fir.load %[[VAL_9]]#0 : !fir.ref !CHECK: %[[VAL_24:.*]] = fir.load %[[VAL_13]]#0 : !fir.ref !CHECK: %[[VAL_25:.*]] = fir.convert %[[VAL_23]] : (i8) -> i32 !CHECK: %[[VAL_26:.*]] = fir.convert %[[VAL_24]] : (i16) -> i32 -!CHECK: omp.wsloop { +!CHECK: omp.wsloop private(@{{.*}} %{{.*}}#0 -> %[[VAL_2:.*]] : !fir.ref) { !CHECK-NEXT: omp.loop_nest (%[[VAL_27:.*]]) : i32 = (%[[VAL_22]]) to (%[[VAL_25]]) inclusive step (%[[VAL_26]]) { +!CHECK: %[[VAL_3:.*]]:2 = hlfir.declare %[[VAL_2]] {uniq_name = "_QFwsloop_variable_subEi2"} : (!fir.ref) -> (!fir.ref, !fir.ref) !CHECK: %[[VAL_28:.*]] = fir.convert %[[VAL_27]] : (i32) -> i16 !CHECK: fir.store %[[VAL_28]] to %[[VAL_3]]#1 : !fir.ref !CHECK: %[[VAL_29:.*]] = fir.load %[[VAL_7]]#0 : !fir.ref @@ -172,14 +170,13 @@ subroutine wsloop_variable_sub !CHECK: %[[VAL_49:.*]] = arith.constant 5 : i8 !CHECK: hlfir.assign %[[VAL_49]] to %[[VAL_19]]#0 : i8, !fir.ref -!CHECK: %[[VAL_0:.*]] = fir.alloca i8 {bindc_name = "i1", pinned, {{.*}}} -!CHECK: %[[VAL_1:.*]]:2 = hlfir.declare %[[VAL_0]] {uniq_name = "_QFwsloop_variable_subEi1"} : (!fir.ref) -> (!fir.ref, !fir.ref) !CHECK: %[[VAL_50:.*]] = arith.constant 1 : i32 !CHECK: %[[VAL_51:.*]] = arith.constant 10 : i32 !CHECK: %[[VAL_52:.*]] = arith.constant 1 : i32 -!CHECK: omp.wsloop { +!CHECK: omp.wsloop private(@{{.*}} %{{.*}}#0 -> %[[VAL_0:.*]] : !fir.ref) { !CHECK-NEXT: omp.loop_nest (%[[VAL_53:.*]]) : i32 = (%[[VAL_50]]) to (%[[VAL_51]]) inclusive step (%[[VAL_52]]) { +!CHECK: %[[VAL_1:.*]]:2 = hlfir.declare %[[VAL_0]] {uniq_name = "_QFwsloop_variable_subEi1"} : (!fir.ref) -> (!fir.ref, !fir.ref) !CHECK: %[[VAL_54:.*]] = fir.convert %[[VAL_53]] : (i32) -> i8 !CHECK: fir.store %[[VAL_54]] to %[[VAL_1]]#1 : !fir.ref !CHECK: %[[VAL_55:.*]] = fir.load %[[VAL_1]]#0 : 
!fir.ref diff --git a/flang/test/Lower/OpenMP/wsloop.f90 b/flang/test/Lower/OpenMP/wsloop.f90 index 4378233a622ed..44b2f585b3a67 100644 --- a/flang/test/Lower/OpenMP/wsloop.f90 +++ b/flang/test/Lower/OpenMP/wsloop.f90 @@ -7,15 +7,14 @@ subroutine simple_loop integer :: i ! CHECK: omp.parallel !$OMP PARALLEL - ! CHECK: %[[ALLOCA_IV:.*]] = fir.alloca i32 {{{.*}}, pinned, {{.*}}} - ! CHECK: %[[IV_DECL:.*]]:2 = hlfir.declare %[[ALLOCA_IV]] {uniq_name = "_QFsimple_loopEi"} : (!fir.ref) -> (!fir.ref, !fir.ref) ! CHECK: %[[WS_LB:.*]] = arith.constant 1 : i32 ! CHECK: %[[WS_UB:.*]] = arith.constant 9 : i32 ! CHECK: %[[WS_STEP:.*]] = arith.constant 1 : i32 - ! CHECK: omp.wsloop { + ! CHECK: omp.wsloop private(@{{.*}} %{{.*}}#0 -> %[[ALLOCA_IV:.*]] : !fir.ref) { ! CHECK-NEXT: omp.loop_nest (%[[I:.*]]) : i32 = (%[[WS_LB]]) to (%[[WS_UB]]) inclusive step (%[[WS_STEP]]) { !$OMP DO do i=1, 9 + ! CHECK: %[[IV_DECL:.*]]:2 = hlfir.declare %[[ALLOCA_IV]] {uniq_name = "_QFsimple_loopEi"} : (!fir.ref) -> (!fir.ref, !fir.ref) ! CHECK: fir.store %[[I]] to %[[IV_DECL:.*]]#1 : !fir.ref ! CHECK: %[[LOAD_IV:.*]] = fir.load %[[IV_DECL]]#0 : !fir.ref ! CHECK: fir.call @_FortranAioOutputInteger32({{.*}}, %[[LOAD_IV]]) {{.*}}: (!fir.ref, i32) -> i1 @@ -32,13 +31,12 @@ subroutine simple_loop_with_step integer :: i ! CHECK: omp.parallel !$OMP PARALLEL - ! CHECK: %[[ALLOCA_IV:.*]] = fir.alloca i32 {{{.*}}, pinned, {{.*}}} - ! CHECK: %[[IV_DECL:.*]]:2 = hlfir.declare %[[ALLOCA_IV]] {uniq_name = "_QFsimple_loop_with_stepEi"} : (!fir.ref) -> (!fir.ref, !fir.ref) ! CHECK: %[[WS_LB:.*]] = arith.constant 1 : i32 ! CHECK: %[[WS_UB:.*]] = arith.constant 9 : i32 ! CHECK: %[[WS_STEP:.*]] = arith.constant 2 : i32 - ! CHECK: omp.wsloop { + ! CHECK: omp.wsloop private(@{{.*}} %{{.*}}#0 -> %[[ALLOCA_IV:.*]] : !fir.ref) { ! CHECK-NEXT: omp.loop_nest (%[[I:.*]]) : i32 = (%[[WS_LB]]) to (%[[WS_UB]]) inclusive step (%[[WS_STEP]]) { + ! 
CHECK: %[[IV_DECL:.*]]:2 = hlfir.declare %[[ALLOCA_IV]] {uniq_name = "_QFsimple_loop_with_stepEi"} : (!fir.ref) -> (!fir.ref, !fir.ref) ! CHECK: fir.store %[[I]] to %[[IV_DECL]]#1 : !fir.ref ! CHECK: %[[LOAD_IV:.*]] = fir.load %[[IV_DECL]]#0 : !fir.ref !$OMP DO @@ -57,15 +55,14 @@ subroutine loop_with_schedule_nowait integer :: i ! CHECK: omp.parallel !$OMP PARALLEL - ! CHECK: %[[ALLOCA_IV:.*]] = fir.alloca i32 {{{.*}}, pinned, {{.*}}} - ! CHECK: %[[IV_DECL:.*]]:2 = hlfir.declare %[[ALLOCA_IV]] {uniq_name = "_QFloop_with_schedule_nowaitEi"} : (!fir.ref) -> (!fir.ref, !fir.ref) ! CHECK: %[[WS_LB:.*]] = arith.constant 1 : i32 ! CHECK: %[[WS_UB:.*]] = arith.constant 9 : i32 ! CHECK: %[[WS_STEP:.*]] = arith.constant 1 : i32 - ! CHECK: omp.wsloop nowait schedule(runtime) { + ! CHECK: omp.wsloop nowait schedule(runtime) private(@{{.*}} %{{.*}}#0 -> %[[ALLOCA_IV:.*]] : !fir.ref) { ! CHECK-NEXT: omp.loop_nest (%[[I:.*]]) : i32 = (%[[WS_LB]]) to (%[[WS_UB]]) inclusive step (%[[WS_STEP]]) { !$OMP DO SCHEDULE(runtime) do i=1, 9 + ! CHECK: %[[IV_DECL:.*]]:2 = hlfir.declare %[[ALLOCA_IV]] {uniq_name = "_QFloop_with_schedule_nowaitEi"} : (!fir.ref) -> (!fir.ref, !fir.ref) ! CHECK: fir.store %[[I]] to %[[IV_DECL]]#1 : !fir.ref ! CHECK: %[[LOAD_IV:.*]] = fir.load %[[IV_DECL]]#0 : !fir.ref ! CHECK: fir.call @_FortranAioOutputInteger32({{.*}}, %[[LOAD_IV]]) {{.*}}: (!fir.ref, i32) -> i1 diff --git a/flang/test/Lower/allocatable-assignment.f90 b/flang/test/Lower/allocatable-assignment.f90 index 5c9887c507b67..7fe7aa4ebae34 100644 --- a/flang/test/Lower/allocatable-assignment.f90 +++ b/flang/test/Lower/allocatable-assignment.f90 @@ -678,7 +678,7 @@ subroutine test_scalar_rhs(x, y) ! CHECK: } else { ! CHECK: %[[error_msg_addr:.*]] = fir.address_of(@[[error_message:.*]]) : !fir.ref> ! CHECK: %[[msg_addr_cast:.*]] = fir.convert %[[error_msg_addr]] : (!fir.ref>) -> !fir.ref - ! 
CHECK: %{{.*}} = fir.call @_FortranAReportFatalUserError(%[[msg_addr_cast]], %{{.*}}, %{{.*}}) {{.*}}: (!fir.ref, !fir.ref, i32) -> none + ! CHECK: fir.call @_FortranAReportFatalUserError(%[[msg_addr_cast]], %{{.*}}, %{{.*}}) {{.*}}: (!fir.ref, !fir.ref, i32) -> () ! CHECK-NOT: allocmem ! CHECK: } x = y @@ -1049,7 +1049,7 @@ subroutine test_derived_with_init(x, y) ! CHECK: %[[VAL_11:.*]] = fir.allocmem !fir.type<_QMalloc_assignFtest_derived_with_initTt{a:!fir.box>>}> {uniq_name = ".auto.alloc"} ! CHECK: %[[VAL_12:.*]] = fir.embox %[[VAL_11]] : (!fir.heap>>}>>) -> !fir.box>>}>>> ! CHECK: %[[VAL_15:.*]] = fir.convert %[[VAL_12]] : (!fir.box>>}>>>) -> !fir.box -! CHECK: fir.call @_FortranAInitialize(%[[VAL_15]], %{{.*}}, %{{.*}}) {{.*}}: (!fir.box, !fir.ref, i32) -> none +! CHECK: fir.call @_FortranAInitialize(%[[VAL_15]], %{{.*}}, %{{.*}}) {{.*}}: (!fir.box, !fir.ref, i32) -> () ! CHECK: fir.result %[[VAL_11]] : !fir.heap>>}>> ! CHECK: } else { ! CHECK: fir.result %{{.*}} : !fir.heap>>}>> diff --git a/flang/test/Lower/allocatable-polymorphic.f90 b/flang/test/Lower/allocatable-polymorphic.f90 index bbc54754ca1ab..db518c541918a 100644 --- a/flang/test/Lower/allocatable-polymorphic.f90 +++ b/flang/test/Lower/allocatable-polymorphic.f90 @@ -102,7 +102,7 @@ subroutine test_pointer() ! CHECK: %[[TYPE_DESC_P1_CAST:.*]] = fir.convert %[[TYPE_DESC_P1]] : (!fir.tdesc>) -> !fir.ref ! CHECK: %[[RANK:.*]] = arith.constant 0 : i32 ! CHECK: %[[CORANK:.*]] = arith.constant 0 : i32 -! CHECK: %{{.*}} = fir.call @_FortranAPointerNullifyDerived(%[[P_DESC_CAST]], %[[TYPE_DESC_P1_CAST]], %[[RANK]], %[[CORANK]]) {{.*}}: (!fir.ref>, !fir.ref, i32, i32) -> none +! CHECK: fir.call @_FortranAPointerNullifyDerived(%[[P_DESC_CAST]], %[[TYPE_DESC_P1_CAST]], %[[RANK]], %[[CORANK]]) {{.*}}: (!fir.ref>, !fir.ref, i32, i32) -> () ! CHECK: %[[P_DESC_CAST:.*]] = fir.convert %[[P_DECL]]#1 : (!fir.ref>>>) -> !fir.ref> ! 
CHECK: %{{.*}} = fir.call @_FortranAPointerAllocate(%[[P_DESC_CAST]], %{{.*}}, %{{.*}}, %{{.*}}, %{{.*}}) {{.*}}: (!fir.ref>, i1, !fir.box, !fir.ref, i32) -> i32 @@ -115,7 +115,7 @@ subroutine test_pointer() ! CHECK: %[[TYPE_DESC_P1_CAST:.*]] = fir.convert %[[TYPE_DESC_P1]] : (!fir.tdesc>) -> !fir.ref ! CHECK: %[[RANK:.*]] = arith.constant 0 : i32 ! CHECK: %[[CORANK:.*]] = arith.constant 0 : i32 -! CHECK: %{{.*}} = fir.call @_FortranAPointerNullifyDerived(%[[C1_DESC_CAST]], %[[TYPE_DESC_P1_CAST]], %[[RANK]], %[[CORANK]]) {{.*}}: (!fir.ref>, !fir.ref, i32, i32) -> none +! CHECK: fir.call @_FortranAPointerNullifyDerived(%[[C1_DESC_CAST]], %[[TYPE_DESC_P1_CAST]], %[[RANK]], %[[CORANK]]) {{.*}}: (!fir.ref>, !fir.ref, i32, i32) -> () ! CHECK: %[[C1_DESC_CAST:.*]] = fir.convert %[[C1_DECL]]#1 : (!fir.ref>>>) -> !fir.ref> ! CHECK: %{{.*}} = fir.call @_FortranAPointerAllocate(%[[C1_DESC_CAST]], %{{.*}}, %{{.*}}, %{{.*}}, %{{.*}}) {{.*}}: (!fir.ref>, i1, !fir.box, !fir.ref, i32) -> i32 @@ -124,7 +124,7 @@ subroutine test_pointer() ! CHECK: %[[TYPE_DESC_P2_CAST:.*]] = fir.convert %[[TYPE_DESC_P2]] : (!fir.tdesc,c:i32}>>) -> !fir.ref ! CHECK: %[[RANK:.*]] = arith.constant 0 : i32 ! CHECK: %[[CORANK:.*]] = arith.constant 0 : i32 -! CHECK: %{{.*}} = fir.call @_FortranAPointerNullifyDerived(%[[C2_DESC_CAST]], %[[TYPE_DESC_P2_CAST]], %[[RANK]], %[[CORANK]]) {{.*}}: (!fir.ref>, !fir.ref, i32, i32) -> none +! CHECK: fir.call @_FortranAPointerNullifyDerived(%[[C2_DESC_CAST]], %[[TYPE_DESC_P2_CAST]], %[[RANK]], %[[CORANK]]) {{.*}}: (!fir.ref>, !fir.ref, i32, i32) -> () ! CHECK: %[[C2_DESC_CAST:.*]] = fir.convert %[[C2_DECL]]#1 : (!fir.ref>>>) -> !fir.ref> ! CHECK: %{{.*}} = fir.call @_FortranAPointerAllocate(%[[C2_DESC_CAST]], %{{.*}}, %{{.*}}, %{{.*}}, %{{.*}}) {{.*}}: (!fir.ref>, i1, !fir.box, !fir.ref, i32) -> i32 @@ -151,9 +151,9 @@ subroutine test_pointer() ! CHECK: %[[TYPE_DESC_P1_CAST:.*]] = fir.convert %[[TYPE_DESC_P1]] : (!fir.tdesc>) -> !fir.ref ! 
CHECK: %[[RANK:.*]] = arith.constant 1 : i32 ! CHECK: %[[CORANK:.*]] = arith.constant 0 : i32 -! CHECK: %{{.*}} = fir.call @_FortranAPointerNullifyDerived(%[[C3_CAST]], %[[TYPE_DESC_P1_CAST]], %[[RANK]], %[[CORANK]]) {{.*}}: (!fir.ref>, !fir.ref, i32, i32) -> none +! CHECK: fir.call @_FortranAPointerNullifyDerived(%[[C3_CAST]], %[[TYPE_DESC_P1_CAST]], %[[RANK]], %[[CORANK]]) {{.*}}: (!fir.ref>, !fir.ref, i32, i32) -> () ! CHECK: %[[C3_CAST:.*]] = fir.convert %[[C3_DECL]]#1 : (!fir.ref>>>>) -> !fir.ref> -! CHECK: %{{.*}} = fir.call @_FortranAPointerSetBounds(%[[C3_CAST]], %{{.*}}, %{{.*}}, %{{.*}}) {{.*}}: (!fir.ref>, i32, i64, i64) -> none +! CHECK: fir.call @_FortranAPointerSetBounds(%[[C3_CAST]], %{{.*}}, %{{.*}}, %{{.*}}) {{.*}}: (!fir.ref>, i32, i64, i64) -> () ! CHECK: %[[C3_CAST:.*]] = fir.convert %[[C3_DECL]]#1 : (!fir.ref>>>>) -> !fir.ref> ! CHECK: %{{.*}} = fir.call @_FortranAPointerAllocate(%[[C3_CAST]], %{{.*}}, %{{.*}}, %{{.*}}, %{{.*}}) {{.*}}: (!fir.ref>, i1, !fir.box, !fir.ref, i32) -> i32 @@ -162,9 +162,9 @@ subroutine test_pointer() ! CHECK: %[[TYPE_DESC_P2_CAST:.*]] = fir.convert %[[TYPE_DESC_P2]] : (!fir.tdesc,c:i32}>>) -> !fir.ref ! CHECK: %[[RANK:.*]] = arith.constant 1 : i32 ! CHECK: %[[CORANK:.*]] = arith.constant 0 : i32 -! CHECK: %{{.*}} = fir.call @_FortranAPointerNullifyDerived(%[[C4_CAST]], %[[TYPE_DESC_P2_CAST]], %[[RANK]], %[[CORANK]]) {{.*}}: (!fir.ref>, !fir.ref, i32, i32) -> none +! CHECK: fir.call @_FortranAPointerNullifyDerived(%[[C4_CAST]], %[[TYPE_DESC_P2_CAST]], %[[RANK]], %[[CORANK]]) {{.*}}: (!fir.ref>, !fir.ref, i32, i32) -> () ! CHECK: %[[C4_CAST:.*]] = fir.convert %[[C4_DECL]]#1 : (!fir.ref>>>>) -> !fir.ref> -! CHECK: %{{.*}} = fir.call @_FortranAPointerSetBounds(%[[C4_CAST]], %{{.*}}, %{{.*}}, %{{.*}}) {{.*}}: (!fir.ref>, i32, i64, i64) -> none +! CHECK: fir.call @_FortranAPointerSetBounds(%[[C4_CAST]], %{{.*}}, %{{.*}}, %{{.*}}) {{.*}}: (!fir.ref>, i32, i64, i64) -> () ! 
CHECK: %[[C4_CAST:.*]] = fir.convert %[[C4_DECL]]#1 : (!fir.ref>>>>) -> !fir.ref> ! CHECK: %{{.*}} = fir.call @_FortranAPointerAllocate(%[[C4_CAST]], %{{.*}}, %{{.*}}, %{{.*}}, %{{.*}}) {{.*}}: (!fir.ref>, i1, !fir.box, !fir.ref, i32) -> i32 @@ -265,7 +265,7 @@ subroutine test_allocatable() ! CHECK: %[[TYPE_DESC_P1_CAST:.*]] = fir.convert %[[TYPE_DESC_P1]] : (!fir.tdesc>) -> !fir.ref ! CHECK: %[[RANK:.*]] = arith.constant 0 : i32 ! CHECK: %[[C0:.*]] = arith.constant 0 : i32 -! CHECK: fir.call @_FortranAAllocatableInitDerivedForAllocate(%[[P_CAST]], %[[TYPE_DESC_P1_CAST]], %[[RANK]], %[[C0]]) {{.*}}: (!fir.ref>, !fir.ref, i32, i32) -> none +! CHECK: fir.call @_FortranAAllocatableInitDerivedForAllocate(%[[P_CAST]], %[[TYPE_DESC_P1_CAST]], %[[RANK]], %[[C0]]) {{.*}}: (!fir.ref>, !fir.ref, i32, i32) -> () ! CHECK: %[[P_CAST:.*]] = fir.convert %[[P_DECL]]#1 : (!fir.ref>>>) -> !fir.ref> ! CHECK: %{{.*}} = fir.call @_FortranAAllocatableAllocate(%[[P_CAST]], %{{.*}}, %{{.*}}, %{{.*}}, %{{.*}}) {{.*}}: (!fir.ref>, i1, !fir.box, !fir.ref, i32) -> i32 @@ -274,7 +274,7 @@ subroutine test_allocatable() ! CHECK: %[[TYPE_DESC_P1_CAST:.*]] = fir.convert %[[TYPE_DESC_P1]] : (!fir.tdesc>) -> !fir.ref ! CHECK: %[[RANK:.*]] = arith.constant 0 : i32 ! CHECK: %[[C0:.*]] = arith.constant 0 : i32 -! CHECK: fir.call @_FortranAAllocatableInitDerivedForAllocate(%[[C1_CAST]], %[[TYPE_DESC_P1_CAST]], %[[RANK]], %[[C0]]) {{.*}}: (!fir.ref>, !fir.ref, i32, i32) -> none +! CHECK: fir.call @_FortranAAllocatableInitDerivedForAllocate(%[[C1_CAST]], %[[TYPE_DESC_P1_CAST]], %[[RANK]], %[[C0]]) {{.*}}: (!fir.ref>, !fir.ref, i32, i32) -> () ! CHECK: %[[C1_CAST:.*]] = fir.convert %[[C1_DECL]]#1 : (!fir.ref>>>) -> !fir.ref> ! CHECK: %{{.*}} = fir.call @_FortranAAllocatableAllocate(%[[C1_CAST]], %{{.*}}, %{{.*}}, %{{.*}}, %{{.*}}) {{.*}}: (!fir.ref>, i1, !fir.box, !fir.ref, i32) -> i32 @@ -283,7 +283,7 @@ subroutine test_allocatable() ! 
CHECK: %[[TYPE_DESC_P2_CAST:.*]] = fir.convert %[[TYPE_DESC_P2]] : (!fir.tdesc,c:i32}>>) -> !fir.ref ! CHECK: %[[RANK:.*]] = arith.constant 0 : i32 ! CHECK: %[[C0:.*]] = arith.constant 0 : i32 -! CHECK: fir.call @_FortranAAllocatableInitDerivedForAllocate(%[[C2_CAST]], %[[TYPE_DESC_P2_CAST]], %[[RANK]], %[[C0]]) {{.*}}: (!fir.ref>, !fir.ref, i32, i32) -> none +! CHECK: fir.call @_FortranAAllocatableInitDerivedForAllocate(%[[C2_CAST]], %[[TYPE_DESC_P2_CAST]], %[[RANK]], %[[C0]]) {{.*}}: (!fir.ref>, !fir.ref, i32, i32) -> () ! CHECK: %[[C2_CAST:.*]] = fir.convert %[[C2_DECL]]#1 : (!fir.ref>>>) -> !fir.ref> ! CHECK: %{{.*}} = fir.call @_FortranAAllocatableAllocate(%[[C2_CAST]], %{{.*}}, %{{.*}}, %{{.*}}, %{{.*}}) {{.*}}: (!fir.ref>, i1, !fir.box, !fir.ref, i32) -> i32 @@ -292,13 +292,13 @@ subroutine test_allocatable() ! CHECK: %[[TYPE_DESC_P1_CAST:.*]] = fir.convert %[[TYPE_DESC_P1]] : (!fir.tdesc>) -> !fir.ref ! CHECK: %[[RANK:.*]] = arith.constant 1 : i32 ! CHECK: %[[C0:.*]] = arith.constant 0 : i32 -! CHECK: fir.call @_FortranAAllocatableInitDerivedForAllocate(%[[C3_CAST]], %[[TYPE_DESC_P1_CAST]], %[[RANK]], %[[C0]]) {{.*}}: (!fir.ref>, !fir.ref, i32, i32) -> none +! CHECK: fir.call @_FortranAAllocatableInitDerivedForAllocate(%[[C3_CAST]], %[[TYPE_DESC_P1_CAST]], %[[RANK]], %[[C0]]) {{.*}}: (!fir.ref>, !fir.ref, i32, i32) -> () ! CHECK: %[[C10:.*]] = arith.constant 10 : i32 ! CHECK: %[[C0:.*]] = arith.constant 0 : i32 ! CHECK: %[[C3_CAST:.*]] = fir.convert %[[C3_DECL]]#1 : (!fir.ref>>>>) -> !fir.ref> ! CHECK: %[[C1_I64:.*]] = fir.convert %c1{{.*}} : (index) -> i64 ! CHECK: %[[C10_I64:.*]] = fir.convert %[[C10]] : (i32) -> i64 -! CHECK: %{{.*}} = fir.call @_FortranAAllocatableSetBounds(%[[C3_CAST]], %[[C0]], %[[C1_I64]], %[[C10_I64]]) {{.*}}: (!fir.ref>, i32, i64, i64) -> none +! CHECK: fir.call @_FortranAAllocatableSetBounds(%[[C3_CAST]], %[[C0]], %[[C1_I64]], %[[C10_I64]]) {{.*}}: (!fir.ref>, i32, i64, i64) -> () ! 
CHECK: %[[C3_CAST:.*]] = fir.convert %[[C3_DECL]]#1 : (!fir.ref>>>>) -> !fir.ref> ! CHECK: %{{.*}} = fir.call @_FortranAAllocatableAllocate(%[[C3_CAST]], %{{.*}}, %{{.*}}, %{{.*}}, %{{.*}}) {{.*}}: (!fir.ref>, i1, !fir.box, !fir.ref, i32) -> i32 @@ -307,14 +307,14 @@ subroutine test_allocatable() ! CHECK: %[[TYPE_DESC_P2_CAST:.*]] = fir.convert %[[TYPE_DESC_P2]] : (!fir.tdesc,c:i32}>>) -> !fir.ref ! CHECK: %[[RANK:.*]] = arith.constant 1 : i32 ! CHECK: %[[C0:.*]] = arith.constant 0 : i32 -! CHECK: fir.call @_FortranAAllocatableInitDerivedForAllocate(%[[C4_CAST]], %[[TYPE_DESC_P2_CAST]], %[[RANK]], %[[C0]]) {{.*}}: (!fir.ref>, !fir.ref, i32, i32) -> none +! CHECK: fir.call @_FortranAAllocatableInitDerivedForAllocate(%[[C4_CAST]], %[[TYPE_DESC_P2_CAST]], %[[RANK]], %[[C0]]) {{.*}}: (!fir.ref>, !fir.ref, i32, i32) -> () ! CHECK: %[[CST1:.*]] = arith.constant 1 : index ! CHECK: %[[C20:.*]] = arith.constant 20 : i32 ! CHECK: %[[C0:.*]] = arith.constant 0 : i32 ! CHECK: %[[C4_CAST:.*]] = fir.convert %[[C4_DECL]]#1 : (!fir.ref>>>>) -> !fir.ref> ! CHECK: %[[C1_I64:.*]] = fir.convert %[[CST1]] : (index) -> i64 ! CHECK: %[[C20_I64:.*]] = fir.convert %[[C20]] : (i32) -> i64 -! CHECK: %{{.*}} = fir.call @_FortranAAllocatableSetBounds(%[[C4_CAST]], %[[C0]], %[[C1_I64]], %[[C20_I64]]) {{.*}}: (!fir.ref>, i32, i64, i64) -> none +! CHECK: fir.call @_FortranAAllocatableSetBounds(%[[C4_CAST]], %[[C0]], %[[C1_I64]], %[[C20_I64]]) {{.*}}: (!fir.ref>, i32, i64, i64) -> () ! CHECK: %[[C4_CAST:.*]] = fir.convert %[[C4_DECL]]#1 : (!fir.ref>>>>) -> !fir.ref> ! CHECK: %{{.*}} = fir.call @_FortranAAllocatableAllocate(%[[C4_CAST]], %{{.*}}, %{{.*}}, %{{.*}}, %{{.*}}) {{.*}}: (!fir.ref>, i1, !fir.box, !fir.ref, i32) -> i32 @@ -388,7 +388,7 @@ subroutine test_unlimited_polymorphic_with_intrinsic_type_spec() ! CHECK: %[[KIND:.*]] = arith.constant 4 : i32 ! CHECK: %[[RANK:.*]] = arith.constant 0 : i32 ! CHECK: %[[CORANK:.*]] = arith.constant 0 : i32 -! 
CHECK: %{{.*}} = fir.call @_FortranAAllocatableInitIntrinsicForAllocate(%[[BOX_NONE]], %[[CAT]], %[[KIND]], %[[RANK]], %[[CORANK]]) {{.*}} : (!fir.ref>, i32, i32, i32, i32) -> none +! CHECK: fir.call @_FortranAAllocatableInitIntrinsicForAllocate(%[[BOX_NONE]], %[[CAT]], %[[KIND]], %[[RANK]], %[[CORANK]]) {{.*}} : (!fir.ref>, i32, i32, i32, i32) -> () ! CHECK: %[[BOX_NONE:.*]] = fir.convert %[[P_DECL]]#1 : (!fir.ref>>) -> !fir.ref> ! CHECK: %{{.*}} = fir.call @_FortranAAllocatableAllocate(%[[BOX_NONE]], %{{.*}}, %{{.*}}, %{{.*}}, %{{.*}}) {{.*}} : (!fir.ref>, i1, !fir.box, !fir.ref, i32) -> i32 @@ -397,7 +397,7 @@ subroutine test_unlimited_polymorphic_with_intrinsic_type_spec() ! CHECK: %[[KIND:.*]] = arith.constant 4 : i32 ! CHECK: %[[RANK:.*]] = arith.constant 0 : i32 ! CHECK: %[[CORANK:.*]] = arith.constant 0 : i32 -! CHECK: %{{.*}} = fir.call @_FortranAPointerNullifyIntrinsic(%[[BOX_NONE]], %[[CAT]], %[[KIND]], %[[RANK]], %[[CORANK]]) {{.*}} : (!fir.ref>, i32, i32, i32, i32) -> none +! CHECK: fir.call @_FortranAPointerNullifyIntrinsic(%[[BOX_NONE]], %[[CAT]], %[[KIND]], %[[RANK]], %[[CORANK]]) {{.*}} : (!fir.ref>, i32, i32, i32, i32) -> () ! CHECK: %[[BOX_NONE:.*]] = fir.convert %[[PTR_DECL]]#1 : (!fir.ref>>) -> !fir.ref> ! CHECK: %{{.*}} = fir.call @_FortranAPointerAllocate(%[[BOX_NONE]], %{{.*}}, %{{.*}}, %{{.*}}, %{{.*}}) {{.*}} : (!fir.ref>, i1, !fir.box, !fir.ref, i32) -> i32 @@ -435,7 +435,7 @@ subroutine test_type_with_polymorphic_pointer_component() ! CHECK: %[[TYPE_DESC_P1_CAST:.*]] = fir.convert %[[TYPE_DESC_P1]] : (!fir.tdesc>) -> !fir.ref ! CHECK: %[[RANK:.*]] = arith.constant 0 : i32 ! CHECK: %[[CORANK:.*]] = arith.constant 0 : i32 -! CHECK: %{{.*}} = fir.call @_FortranAPointerNullifyDerived(%[[ELEMENT_DESC_CAST]], %[[TYPE_DESC_P1_CAST]], %[[RANK]], %[[CORANK]]) {{.*}} : (!fir.ref>, !fir.ref, i32, i32) -> none +! 
CHECK: fir.call @_FortranAPointerNullifyDerived(%[[ELEMENT_DESC_CAST]], %[[TYPE_DESC_P1_CAST]], %[[RANK]], %[[CORANK]]) {{.*}} : (!fir.ref>, !fir.ref, i32, i32) -> () ! CHECK: %[[ELEMENT_DESC_CAST:.*]] = fir.convert %[[ELEMENT]] : (!fir.ref>>>) -> !fir.ref> ! CHECK: %{{.*}} = fir.call @_FortranAPointerAllocate(%[[ELEMENT_DESC_CAST]], %{{.*}}, %{{.*}}, %{{.*}}, %{{.*}}) {{.*}} : (!fir.ref>, i1, !fir.box, !fir.ref, i32) -> i32 @@ -463,14 +463,14 @@ subroutine test_allocate_with_mold() ! CHECK: %[[RANK:.*]] = arith.constant 1 : i32 ! CHECK: %[[P_BOX_NONE:.*]] = fir.convert %[[P_DECL]]#1 : (!fir.ref>>>>) -> !fir.ref> ! CHECK: %[[X_BOX_NONE:.*]] = fir.convert %[[EMBOX_X]] : (!fir.box,c:i32}>>>) -> !fir.box -! CHECK: %{{.*}} = fir.call @_FortranAPointerApplyMold(%[[P_BOX_NONE]], %[[X_BOX_NONE]], %[[RANK]]) {{.*}} : (!fir.ref>, !fir.box, i32) -> none +! CHECK: fir.call @_FortranAPointerApplyMold(%[[P_BOX_NONE]], %[[X_BOX_NONE]], %[[RANK]]) {{.*}} : (!fir.ref>, !fir.box, i32) -> () ! CHECK: %[[P_BOX_NONE:.*]] = fir.convert %[[P_DECL]]#1 : (!fir.ref>>>>) -> !fir.ref> ! CHECK: %{{.*}} = fir.call @_FortranAPointerAllocate(%[[P_BOX_NONE]], %{{.*}}, %{{.*}}, %{{.*}}, %{{.*}}) {{.*}} : (!fir.ref>, i1, !fir.box, !fir.ref, i32) -> i32 ! CHECK: %[[EMBOX_I:.*]] = fir.embox %[[I_DECL]]#1(%{{.*}}) : (!fir.ref>, !fir.shape<1>) -> !fir.box> ! CHECK: %[[RANK:.*]] = arith.constant 1 : i32 ! CHECK: %[[UP_BOX_NONE:.*]] = fir.convert %[[UP_DECL]]#1 : (!fir.ref>>>) -> !fir.ref> ! CHECK: %[[I_BOX_NONE:.*]] = fir.convert %[[EMBOX_I]] : (!fir.box>) -> !fir.box -! CHECK: %{{.*}} = fir.call @_FortranAPointerApplyMold(%[[UP_BOX_NONE]], %[[I_BOX_NONE]], %[[RANK]]) {{.*}} : (!fir.ref>, !fir.box, i32) -> none +! CHECK: fir.call @_FortranAPointerApplyMold(%[[UP_BOX_NONE]], %[[I_BOX_NONE]], %[[RANK]]) {{.*}} : (!fir.ref>, !fir.box, i32) -> () ! CHECK: %[[UP_BOX_NONE:.*]] = fir.convert %[[UP_DECL]]#1 : (!fir.ref>>>) -> !fir.ref> ! 
CHECK: %{{.*}} = fir.call @_FortranAPointerAllocate(%[[UP_BOX_NONE]], %{{.*}}, %{{.*}}, %{{.*}}, %{{.*}}) {{.*}} : (!fir.ref>, i1, !fir.box, !fir.ref, i32) -> i32 @@ -497,8 +497,8 @@ subroutine test_allocate_with_source() ! CHECK: %[[RANK:.*]] = arith.constant 1 : i32 ! CHECK: %[[P_BOX_NONE:.*]] = fir.convert %[[P_DECL]]#1 : (!fir.ref>>>>) -> !fir.ref> ! CHECK: %[[X_BOX_NONE:.*]] = fir.convert %[[EMBOX_X]] : (!fir.box,c:i32}>>>) -> !fir.box -! CHECK: %{{.*}} = fir.call @_FortranAPointerApplyMold(%[[P_BOX_NONE]], %[[X_BOX_NONE]], %[[RANK]]) {{.*}} : (!fir.ref>, !fir.box, i32) -> none -! CHECK: %{{.*}} = fir.call @_FortranAPointerSetBounds +! CHECK: fir.call @_FortranAPointerApplyMold(%[[P_BOX_NONE]], %[[X_BOX_NONE]], %[[RANK]]) {{.*}} : (!fir.ref>, !fir.box, i32) -> () +! CHECK: fir.call @_FortranAPointerSetBounds ! CHECK: %[[BOX_NONE_P:.*]] = fir.convert %[[P_DECL]]#1 : (!fir.ref>>>>) -> !fir.ref> ! CHECK: %[[BOX_NONE_X:.*]] = fir.convert %[[EMBOX_X]] : (!fir.box,c:i32}>>>) -> !fir.box ! CHECK: %{{.*}} = fir.call @_FortranAPointerAllocateSource(%[[BOX_NONE_P]], %[[BOX_NONE_X]], %{{.*}}, %{{.*}}, %{{.*}}, %{{.*}}) {{.*}} : (!fir.ref>, !fir.box, i1, !fir.box, !fir.ref, i32) -> i32 @@ -506,8 +506,8 @@ subroutine test_allocate_with_source() ! CHECK: %[[RANK:.*]] = arith.constant 1 : i32 ! CHECK: %[[UP_BOX_NONE:.*]] = fir.convert %[[UP_DECL]]#1 : (!fir.ref>>>) -> !fir.ref> ! CHECK: %[[I_BOX_NONE:.*]] = fir.convert %[[EMBOX_I]] : (!fir.box>) -> !fir.box -! CHECK: %{{.*}} = fir.call @_FortranAPointerApplyMold(%[[UP_BOX_NONE]], %[[I_BOX_NONE]], %[[RANK]]) {{.*}} : (!fir.ref>, !fir.box, i32) -> none -! CHECK: %{{.*}} = fir.call @_FortranAPointerSetBounds +! CHECK: fir.call @_FortranAPointerApplyMold(%[[UP_BOX_NONE]], %[[I_BOX_NONE]], %[[RANK]]) {{.*}} : (!fir.ref>, !fir.box, i32) -> () +! CHECK: fir.call @_FortranAPointerSetBounds ! CHECK: %[[UP_BOX_NONE:.*]] = fir.convert %[[UP_DECL]]#1 : (!fir.ref>>>) -> !fir.ref> ! 
CHECK: %[[I_BOX_NONE:.*]] = fir.convert %[[EMBOX_I]] : (!fir.box>) -> !fir.box ! CHECK: %{{.*}} = fir.call @_FortranAPointerAllocateSource(%[[UP_BOX_NONE]], %[[I_BOX_NONE]], %{{.*}}, %{{.*}}, %{{.*}}, %{{.*}}) {{.*}} : (!fir.ref>, !fir.box, i1, !fir.box, !fir.ref, i32) -> i32 @@ -526,7 +526,7 @@ subroutine test_allocatable_up_from_up_mold(a, b) ! CHECK: %[[RANK:.*]] = arith.constant 0 : i32 ! CHECK: %[[A_BOX_NONE:.*]] = fir.convert %[[A_DECL]]#1 : (!fir.ref>>) -> !fir.ref> ! CHECK: %[[B_BOX_NONE:.*]] = fir.convert %[[LOAD_B]] : (!fir.class>) -> !fir.box -! CHECK: %{{.*}} = fir.call @_FortranAAllocatableApplyMold(%[[A_BOX_NONE]], %[[B_BOX_NONE]], %[[RANK]]) {{.*}} : (!fir.ref>, !fir.box, i32) -> none +! CHECK: fir.call @_FortranAAllocatableApplyMold(%[[A_BOX_NONE]], %[[B_BOX_NONE]], %[[RANK]]) {{.*}} : (!fir.ref>, !fir.box, i32) -> () ! CHECK: %[[A_BOX_NONE:.*]] = fir.convert %[[A_DECL]]#1 : (!fir.ref>>) -> !fir.ref> ! CHECK: %[[B_BOX_NONE:.*]] = fir.convert %[[LOAD_B]] : (!fir.class>) -> !fir.box ! CHECK: %{{.*}} = fir.call @_FortranAAllocatableAllocateSource(%[[A_BOX_NONE]], %[[B_BOX_NONE]], %{{.*}}, %{{.*}}, %{{.*}}, %{{.*}}) {{.*}} : (!fir.ref>, !fir.box, i1, !fir.box, !fir.ref, i32) -> i32 @@ -546,14 +546,14 @@ subroutine test_allocatable_up_from_mold_rank(a) ! CHECK: %[[RANK:.*]] = arith.constant 1 : i32 ! CHECK: %[[A_BOX_NONE:.*]] = fir.convert %[[A_DECL]]#1 : (!fir.ref>>>) -> !fir.ref> ! CHECK: %[[BOX_NONE_10:.*]] = fir.convert %[[EMBOX_10]] : (!fir.box) -> !fir.box -! CHECK: %{{.*}} = fir.call @_FortranAAllocatableApplyMold(%[[A_BOX_NONE]], %[[BOX_NONE_10]], %[[RANK]]) {{.*}} : (!fir.ref>, !fir.box, i32) -> none +! CHECK: fir.call @_FortranAAllocatableApplyMold(%[[A_BOX_NONE]], %[[BOX_NONE_10]], %[[RANK]]) {{.*}} : (!fir.ref>, !fir.box, i32) -> () ! CHECK: %[[C1:.*]] = arith.constant 1 : index ! CHECK: %[[C2:.*]] = arith.constant 20 : i32 ! CHECK: %[[C0:.*]] = arith.constant 0 : i32 ! 
CHECK: %[[A_BOX_NONE:.*]] = fir.convert %[[A_DECL]]#1 : (!fir.ref>>>) -> !fir.ref> ! CHECK: %[[C1_I64:.*]] = fir.convert %[[C1]] : (index) -> i64 ! CHECK: %[[C20_I64:.*]] = fir.convert %[[C20]] : (i32) -> i64 -! CHECK: %{{.*}} = fir.call @_FortranAAllocatableSetBounds(%[[A_BOX_NONE]], %[[C0]], %[[C1_I64]], %[[C20_I64]]) {{.*}} : (!fir.ref>, i32, i64, i64) -> none +! CHECK: fir.call @_FortranAAllocatableSetBounds(%[[A_BOX_NONE]], %[[C0]], %[[C1_I64]], %[[C20_I64]]) {{.*}} : (!fir.ref>, i32, i64, i64) -> () ! CHECK: %[[A_BOX_NONE:.*]] = fir.convert %[[A_DECL]]#1 : (!fir.ref>>>) -> !fir.ref> ! CHECK: %[[BOX_NONE_10:.*]] = fir.convert %[[EMBOX_10]] : (!fir.box) -> !fir.box ! CHECK: %{{.*}} = fir.call @_FortranAAllocatableAllocateSource(%[[A_BOX_NONE]], %[[BOX_NONE_10]], %{{.*}}, %{{.*}}, %{{.*}}, %{{.*}}) {{.*}} : (!fir.ref>, !fir.box, i1, !fir.box, !fir.ref, i32) -> i32 @@ -571,7 +571,7 @@ subroutine test_allocatable_up_character() ! CHECK: %[[KIND:.*]] = arith.constant 1 : i32 ! CHECK: %[[RANK:.*]] = arith.constant 0 : i32 ! CHECK: %[[CORANK:.*]] = arith.constant 0 : i32 -! CHECK: %{{.*}} = fir.call @_FortranAAllocatableInitCharacterForAllocate(%[[A_NONE]], %[[LEN]], %[[KIND]], %[[RANK]], %[[CORANK]]) {{.*}} : (!fir.ref>, i64, i32, i32, i32) -> none +! CHECK: fir.call @_FortranAAllocatableInitCharacterForAllocate(%[[A_NONE]], %[[LEN]], %[[KIND]], %[[RANK]], %[[CORANK]]) {{.*}} : (!fir.ref>, i64, i32, i32, i32) -> () ! CHECK: %[[A_NONE:.*]] = fir.convert %[[A_DECL]]#1 : (!fir.ref>>) -> !fir.ref> ! CHECK: %{{.*}} = fir.call @_FortranAAllocatableAllocate(%[[A_NONE]], %{{.*}}, %{{.*}}, %{{.*}}, %{{.*}}) {{.*}} : (!fir.ref>, i1, !fir.box, !fir.ref, i32) -> i32 @@ -591,17 +591,17 @@ program test_alloc ! LLVM-LABEL: define void @_QMpolyPtest_allocatable() -! LLVM: %{{.*}} = call {} @_FortranAAllocatableInitDerivedForAllocate(ptr %{{.*}}, ptr @_QMpolyEXdtXp1, i32 0, i32 0) +! 
LLVM: call void @_FortranAAllocatableInitDerivedForAllocate(ptr %{{.*}}, ptr @_QMpolyEXdtXp1, i32 0, i32 0) ! LLVM: %{{.*}} = call i32 @_FortranAAllocatableAllocate(ptr %{{.*}}, i1 false, ptr null, ptr @_QQclX{{.*}}, i32 {{.*}}) -! LLVM: %{{.*}} = call {} @_FortranAAllocatableInitDerivedForAllocate(ptr %{{.*}}, ptr @_QMpolyEXdtXp1, i32 0, i32 0) +! LLVM: call void @_FortranAAllocatableInitDerivedForAllocate(ptr %{{.*}}, ptr @_QMpolyEXdtXp1, i32 0, i32 0) ! LLVM: %{{.*}} = call i32 @_FortranAAllocatableAllocate(ptr %{{.*}}, i1 false, ptr null, ptr @_QQclX{{.*}}, i32 {{.*}}) -! LLVM: %{{.*}} = call {} @_FortranAAllocatableInitDerivedForAllocate(ptr %{{.*}}, ptr @_QMpolyEXdtXp2, i32 0, i32 0) +! LLVM: call void @_FortranAAllocatableInitDerivedForAllocate(ptr %{{.*}}, ptr @_QMpolyEXdtXp2, i32 0, i32 0) ! LLVM: %{{.*}} = call i32 @_FortranAAllocatableAllocate(ptr %{{.*}}, i1 false, ptr null, ptr @_QQclX{{.*}}, i32 {{.*}}) -! LLVM: %{{.*}} = call {} @_FortranAAllocatableInitDerivedForAllocate(ptr %{{.*}}, ptr @_QMpolyEXdtXp1, i32 1, i32 0) -! LLVM: %{{.*}} = call {} @_FortranAAllocatableSetBounds(ptr %{{.*}}, i32 0, i64 1, i64 10) +! LLVM: call void @_FortranAAllocatableInitDerivedForAllocate(ptr %{{.*}}, ptr @_QMpolyEXdtXp1, i32 1, i32 0) +! LLVM: call void @_FortranAAllocatableSetBounds(ptr %{{.*}}, i32 0, i64 1, i64 10) ! LLVM: %{{.*}} = call i32 @_FortranAAllocatableAllocate(ptr %{{.*}}, i1 false, ptr null, ptr @_QQclX{{.*}}, i32 {{.*}}) -! LLVM: %{{.*}} = call {} @_FortranAAllocatableInitDerivedForAllocate(ptr %{{.*}}, ptr @_QMpolyEXdtXp2, i32 1, i32 0) -! LLVM: %{{.*}} = call {} @_FortranAAllocatableSetBounds(ptr %{{.*}}, i32 0, i64 1, i64 20) +! LLVM: call void @_FortranAAllocatableInitDerivedForAllocate(ptr %{{.*}}, ptr @_QMpolyEXdtXp2, i32 1, i32 0) +! LLVM: call void @_FortranAAllocatableSetBounds(ptr %{{.*}}, i32 0, i64 1, i64 20) ! LLVM: %{{.*}} = call i32 @_FortranAAllocatableAllocate(ptr %{{.*}}, i1 false, ptr null, ptr @_QQclX{{.*}}, i32 {{.*}}) ! 
LLVM-COUNT-2: call void %{{[0-9]*}}() @@ -682,6 +682,6 @@ program test_alloc ! LLVM-LABEL: define void @_QMpolyPtest_deallocate() ! LLVM: store { ptr, i64, i32, i8, i8, i8, i8, ptr, [1 x i64] } { ptr null, i64 ptrtoint (ptr getelementptr (%_QMpolyTp1, ptr null, i32 1) to i64), i32 20240719, i8 0, i8 42, i8 2, i8 1, ptr @_QMpolyEXdtXp1, [1 x i64] zeroinitializer }, ptr %[[ALLOCA1:[0-9]*]] ! LLVM: call void @llvm.memcpy.p0.p0.i32(ptr %[[ALLOCA2:[0-9]+]], ptr %[[ALLOCA1]], i32 40, i1 false) -! LLVM: %{{.*}} = call {} @_FortranAAllocatableInitDerivedForAllocate(ptr %[[ALLOCA2]], ptr @_QMpolyEXdtXp1, i32 0, i32 0) +! LLVM: call void @_FortranAAllocatableInitDerivedForAllocate(ptr %[[ALLOCA2]], ptr @_QMpolyEXdtXp1, i32 0, i32 0) ! LLVM: %{{.*}} = call i32 @_FortranAAllocatableAllocate(ptr %[[ALLOCA2]], i1 false, ptr null, ptr @_QQclX{{.*}}, i32 {{.*}}) ! LLVM: %{{.*}} = call i32 @_FortranAAllocatableDeallocatePolymorphic(ptr %[[ALLOCA2]], ptr {{.*}}, i1 false, ptr null, ptr @_QQclX{{.*}}, i32 {{.*}}) diff --git a/flang/test/Lower/allocatable-runtime.f90 b/flang/test/Lower/allocatable-runtime.f90 index 3f1f8a86b7d07..9670a1e0e716e 100644 --- a/flang/test/Lower/allocatable-runtime.f90 +++ b/flang/test/Lower/allocatable-runtime.f90 @@ -28,7 +28,7 @@ subroutine foo() ! CHECK-DAG: %[[xBoxCast2:.*]] = fir.convert %[[xBoxAddr]] : (!fir.ref>>>) -> !fir.ref> ! CHECK-DAG: %[[xlbCast:.*]] = fir.convert %[[xlb]] : (i32) -> i64 ! CHECK-DAG: %[[xubCast:.*]] = fir.convert %[[xub]] : (i32) -> i64 - ! CHECK: fir.call @{{.*}}AllocatableSetBounds(%[[xBoxCast2]], %c0{{.*}}, %[[xlbCast]], %[[xubCast]]) {{.*}}: (!fir.ref>, i32, i64, i64) -> none + ! CHECK: fir.call @{{.*}}AllocatableSetBounds(%[[xBoxCast2]], %c0{{.*}}, %[[xlbCast]], %[[xubCast]]) {{.*}}: (!fir.ref>, i32, i64, i64) -> () ! CHECK-DAG: %[[xBoxCast3:.*]] = fir.convert %[[xBoxAddr]] : (!fir.ref>>>) -> !fir.ref> ! CHECK-DAG: %[[sourceFile:.*]] = fir.convert %{{.*}} -> !fir.ref ! 
CHECK: fir.call @{{.*}}AllocatableAllocate(%[[xBoxCast3]], %false{{.*}}, %[[errMsg]], %[[sourceFile]], %{{.*}}) {{.*}}: (!fir.ref>, i1, !fir.box, !fir.ref, i32) -> i32 @@ -178,6 +178,6 @@ subroutine mold_allocation() ! CHECK: %[[RANK:.*]] = arith.constant 1 : i32 ! CHECK: %[[A_BOX_NONE:.*]] = fir.convert %[[A]] : (!fir.ref>>>) -> !fir.ref> ! CHECK: %[[M_BOX_NONE:.*]] = fir.convert %[[EMBOX_M]] : (!fir.box>) -> !fir.box -! CHECK: %{{.*}} = fir.call @_FortranAAllocatableApplyMold(%[[A_BOX_NONE]], %[[M_BOX_NONE]], %[[RANK]]) {{.*}} : (!fir.ref>, !fir.box, i32) -> none +! CHECK: fir.call @_FortranAAllocatableApplyMold(%[[A_BOX_NONE]], %[[M_BOX_NONE]], %[[RANK]]) {{.*}} : (!fir.ref>, !fir.box, i32) -> () ! CHECK: %[[A_BOX_NONE:.*]] = fir.convert %[[A]] : (!fir.ref>>>) -> !fir.ref> ! CHECK: %{{.*}} = fir.call @_FortranAAllocatableAllocate(%[[A_BOX_NONE]], %{{.*}}, %{{.*}}, %{{.*}}, %{{.*}}) {{.*}} : (!fir.ref>, i1, !fir.box, !fir.ref, i32) -> i32 diff --git a/flang/test/Lower/allocate-mold.f90 b/flang/test/Lower/allocate-mold.f90 index 0cc10fc9016de..e50861a4ce76b 100644 --- a/flang/test/Lower/allocate-mold.f90 +++ b/flang/test/Lower/allocate-mold.f90 @@ -14,7 +14,7 @@ subroutine scalar_mold_allocation() ! CHECK: %[[BOX_ADDR_A:.*]] = fir.embox %[[ADDR_A]] : (!fir.heap) -> !fir.box> ! CHECK: fir.store %[[BOX_ADDR_A]] to %[[A]] : !fir.ref>> ! CHECK: %[[A_REF_BOX_NONE1:.*]] = fir.convert %[[A]] : (!fir.ref>>) -> !fir.ref> -! CHECK: %{{.*}} = fir.call @_FortranAAllocatableApplyMold(%[[A_REF_BOX_NONE1]], %{{.*}}, %{{.*}}) {{.*}} : (!fir.ref>, !fir.box, i32) -> none +! CHECK: fir.call @_FortranAAllocatableApplyMold(%[[A_REF_BOX_NONE1]], %{{.*}}, %{{.*}}) {{.*}} : (!fir.ref>, !fir.box, i32) -> () ! CHECK: %[[A_REF_BOX_NONE2:.*]] = fir.convert %[[A]] : (!fir.ref>>) -> !fir.ref> ! 
CHECK: %{{.*}} = fir.call @_FortranAAllocatableAllocate(%[[A_REF_BOX_NONE2]], %{{.*}}, %{{.*}}, %{{.*}}, %{{.*}}) {{.*}} : (!fir.ref>, i1, !fir.box, !fir.ref, i32) -> i32 @@ -35,9 +35,9 @@ end subroutine array_scalar_mold_allocation ! CHECK: %[[BOX_SHAPESHIFT:.*]] = fir.embox %[[LOADED_A]](%[[SHAPESHIFT]]) : (!fir.heap>, !fir.shapeshift<1>) -> !fir.box>> ! CHECK: fir.store %[[BOX_SHAPESHIFT]] to %[[A]] : !fir.ref>>> ! CHECK: %[[REF_BOX_A0:.*]] = fir.convert %1 : (!fir.ref>>>) -> !fir.ref> -! CHECK: %{{.*}} = fir.call @_FortranAAllocatableApplyMold(%[[REF_BOX_A0]], {{.*}}, {{.*}}) fastmath : (!fir.ref>, !fir.box, i32) -> none +! CHECK: fir.call @_FortranAAllocatableApplyMold(%[[REF_BOX_A0]], {{.*}}, {{.*}}) fastmath : (!fir.ref>, !fir.box, i32) -> () ! CHECK: %[[C10:.*]] = arith.constant 10 : i32 ! CHECK: %[[REF_BOX_A1:.*]] = fir.convert %1 : (!fir.ref>>>) -> !fir.ref> -! CHECK: %{{.*}} = fir.call @_FortranAAllocatableSetBounds(%[[REF_BOX_A1]], {{.*}},{{.*}}, {{.*}}) fastmath : (!fir.ref>, i32, i64, i64) -> none +! CHECK: fir.call @_FortranAAllocatableSetBounds(%[[REF_BOX_A1]], {{.*}},{{.*}}, {{.*}}) fastmath : (!fir.ref>, i32, i64, i64) -> () ! CHECK: %[[REF_BOX_A2:.*]] = fir.convert %[[A]] : (!fir.ref>>>) -> !fir.ref> ! CHECK: %{{.*}} = fir.call @_FortranAAllocatableAllocate(%[[REF_BOX_A2]], %{{.*}}, %{{.*}}, %{{.*}}, %{{.*}}) {{.*}} : (!fir.ref>, i1, !fir.box, !fir.ref, i32) -> i32 diff --git a/flang/test/Lower/allocate-source-allocatables-2.f90 b/flang/test/Lower/allocate-source-allocatables-2.f90 index 39b9f04a5f67a..e84b367f29bc0 100644 --- a/flang/test/Lower/allocate-source-allocatables-2.f90 +++ b/flang/test/Lower/allocate-source-allocatables-2.f90 @@ -25,7 +25,7 @@ subroutine test() ! CHECK: %[[VAL_29:.*]] = arith.constant 1 : i32 ! CHECK: %[[VAL_30:.*]] = arith.constant 0 : i32 ! CHECK: %[[VAL_31:.*]] = arith.constant 0 : i32 -! 
CHECK: %[[VAL_32:.*]] = fir.call @_FortranAAllocatableInitCharacterForAllocate(%[[VAL_27]], %[[VAL_28]], %[[VAL_29]], %[[VAL_30]], %[[VAL_31]] +! CHECK: fir.call @_FortranAAllocatableInitCharacterForAllocate(%[[VAL_27]], %[[VAL_28]], %[[VAL_29]], %[[VAL_30]], %[[VAL_31]] ! CHECK: %[[VAL_33:.*]] = fir.convert %[[VAL_4]]#1 : (!fir.ref>>>) -> !fir.ref> ! CHECK: %[[VAL_34:.*]] = fir.convert %[[VAL_22]] : (!fir.box>) -> !fir.box ! CHECK: %[[VAL_36:.*]] = fir.call @_FortranAAllocatableAllocateSource(%[[VAL_33]], %[[VAL_34]], %[[VAL_18]], diff --git a/flang/test/Lower/allocate-source-allocatables.f90 b/flang/test/Lower/allocate-source-allocatables.f90 index f09612c3197da..29b00b79a69d4 100644 --- a/flang/test/Lower/allocate-source-allocatables.f90 +++ b/flang/test/Lower/allocate-source-allocatables.f90 @@ -72,7 +72,7 @@ subroutine test_allocatable_scalar(a) ! CHECK: %[[VAL_55:.*]] = fir.convert %[[VAL_3]] : (!fir.ref>>>) -> !fir.ref> ! CHECK: %[[VAL_56:.*]] = fir.convert %[[VAL_49]] : (index) -> i64 ! CHECK: %[[VAL_57:.*]] = fir.convert %[[VAL_53]] : (index) -> i64 -! CHECK: %[[VAL_58:.*]] = fir.call @_FortranAAllocatableSetBounds(%[[VAL_55]], %[[VAL_54]], %[[VAL_56]], %[[VAL_57]]) {{.*}}: (!fir.ref>, i32, i64, i64) -> none +! CHECK: fir.call @_FortranAAllocatableSetBounds(%[[VAL_55]], %[[VAL_54]], %[[VAL_56]], %[[VAL_57]]) {{.*}}: (!fir.ref>, i32, i64, i64) -> () ! CHECK: %[[VAL_59:.*]] = arith.constant 1 : index ! CHECK: %[[VAL_60:.*]]:3 = fir.box_dims %[[VAL_41]], %[[VAL_59]] : (!fir.box>, index) -> (index, index, index) ! CHECK: %[[VAL_61:.*]] = arith.addi %[[VAL_60]]#1, %[[VAL_49]] : index @@ -81,16 +81,16 @@ subroutine test_allocatable_scalar(a) ! CHECK: %[[VAL_64:.*]] = fir.convert %[[VAL_3]] : (!fir.ref>>>) -> !fir.ref> ! CHECK: %[[VAL_65:.*]] = fir.convert %[[VAL_49]] : (index) -> i64 ! CHECK: %[[VAL_66:.*]] = fir.convert %[[VAL_62]] : (index) -> i64 -! 
CHECK: %[[VAL_67:.*]] = fir.call @_FortranAAllocatableSetBounds(%[[VAL_64]], %[[VAL_63]], %[[VAL_65]], %[[VAL_66]]) {{.*}}: (!fir.ref>, i32, i64, i64) -> none +! CHECK: fir.call @_FortranAAllocatableSetBounds(%[[VAL_64]], %[[VAL_63]], %[[VAL_65]], %[[VAL_66]]) {{.*}}: (!fir.ref>, i32, i64, i64) -> () ! CHECK: %[[VAL_68:.*]] = fir.convert %[[VAL_3]] : (!fir.ref>>>) -> !fir.ref> ! CHECK: %[[VAL_69:.*]] = fir.convert %[[VAL_41]] : (!fir.box>) -> !fir.box ! CHECK: %[[VAL_71:.*]] = fir.call @_FortranAAllocatableAllocateSource(%[[VAL_68]], %[[VAL_69]], %[[VAL_36]], %[[VAL_37]], %{{.*}}, %{{.*}}) {{.*}}: (!fir.ref>, !fir.box, i1, !fir.box, !fir.ref, i32) -> i32 -! CHECK: %[[VAL_94:.*]] = fir.call @_FortranAAllocatableSetBounds( -! CHECK: %[[VAL_103:.*]] = fir.call @_FortranAAllocatableSetBounds( +! CHECK: fir.call @_FortranAAllocatableSetBounds( +! CHECK: fir.call @_FortranAAllocatableSetBounds( ! CHECK: %[[VAL_107:.*]] = fir.call @_FortranAAllocatableAllocateSource( ! CHECK: %[[VAL_114:.*]] = arith.constant true -! CHECK: %[[VAL_149:.*]] = fir.call @_FortranAAllocatableSetBounds( -! CHECK: %[[VAL_158:.*]] = fir.call @_FortranAAllocatableSetBounds( +! CHECK: fir.call @_FortranAAllocatableSetBounds( +! CHECK: fir.call @_FortranAAllocatableSetBounds( ! CHECK: %[[VAL_162:.*]] = fir.call @_FortranAAllocatableAllocateSource(%{{.*}}, %{{.*}}, %[[VAL_114]] subroutine test_allocatable_2d_array(n, a) @@ -139,7 +139,7 @@ subroutine test_allocatable_2d_array(n, a) ! CHECK: %[[VAL_33:.*]] = fir.convert %[[VAL_3]] : (!fir.ref>>>) -> !fir.ref> ! CHECK: %[[VAL_34:.*]] = fir.convert %[[VAL_30]] : (i32) -> i64 ! CHECK: %[[VAL_35:.*]] = fir.convert %[[VAL_31]] : (i32) -> i64 -! CHECK: %[[VAL_36:.*]] = fir.call @_FortranAAllocatableSetBounds(%[[VAL_33]], %[[VAL_32]], %[[VAL_34]], %[[VAL_35]]) {{.*}}: (!fir.ref>, i32, i64, i64) -> none +! CHECK: fir.call @_FortranAAllocatableSetBounds(%[[VAL_33]], %[[VAL_32]], %[[VAL_34]], %[[VAL_35]]) {{.*}}: (!fir.ref>, i32, i64, i64) -> () ! 
CHECK: %[[VAL_37:.*]] = fir.convert %[[VAL_3]] : (!fir.ref>>>) -> !fir.ref> ! CHECK: %[[VAL_38:.*]] = fir.convert %[[VAL_24]] : (!fir.box>) -> !fir.box ! CHECK: %[[VAL_40:.*]] = fir.call @_FortranAAllocatableAllocateSource(%[[VAL_37]], %[[VAL_38]], %[[VAL_19]], %[[VAL_20]], %{{.*}}, %{{.*}}) {{.*}}: (!fir.ref>, !fir.box, i1, !fir.box, !fir.ref, i32) -> i32 @@ -162,7 +162,7 @@ subroutine test_allocatable_2d_array(n, a) ! CHECK: %[[VAL_53:.*]] = fir.convert %[[VAL_8]] : (!fir.ref>>>) -> !fir.ref> ! CHECK: %[[VAL_54:.*]] = fir.convert %[[VAL_50]] : (index) -> i64 ! CHECK: %[[VAL_55:.*]] = fir.convert %[[VAL_51]] : (i32) -> i64 -! CHECK: %[[VAL_56:.*]] = fir.call @_FortranAAllocatableSetBounds(%[[VAL_53]], %[[VAL_52]], %[[VAL_54]], %[[VAL_55]]) {{.*}}: (!fir.ref>, i32, i64, i64) -> none +! CHECK: fir.call @_FortranAAllocatableSetBounds(%[[VAL_53]], %[[VAL_52]], %[[VAL_54]], %[[VAL_55]]) {{.*}}: (!fir.ref>, i32, i64, i64) -> () ! CHECK: %[[VAL_57:.*]] = fir.convert %[[VAL_8]] : (!fir.ref>>>) -> !fir.ref> ! CHECK: %[[VAL_58:.*]] = fir.convert %[[VAL_24]] : (!fir.box>) -> !fir.box ! CHECK: %[[VAL_60:.*]] = fir.call @_FortranAAllocatableAllocateSource(%[[VAL_57]], %[[VAL_58]], %[[VAL_19]], %[[VAL_20]], %{{.*}}, %{{.*}}) {{.*}}: (!fir.ref>, !fir.box, i1, !fir.box, !fir.ref, i32) -> i32 @@ -218,7 +218,7 @@ subroutine test_allocatable_with_shapespec(n, a, m) ! CHECK: %[[VAL_41:.*]] = fir.convert %[[VAL_2]] : (!fir.ref>>>) -> !fir.ref> ! CHECK: %[[VAL_42:.*]] = fir.convert %[[VAL_35]] : (index) -> i64 ! CHECK: %[[VAL_43:.*]] = fir.convert %[[VAL_39]] : (index) -> i64 -! CHECK: %[[VAL_44:.*]] = fir.call @_FortranAAllocatableSetBounds(%[[VAL_41]], %[[VAL_40]], %[[VAL_42]], %[[VAL_43]]) {{.*}}: (!fir.ref>, i32, i64, i64) -> none +! CHECK: fir.call @_FortranAAllocatableSetBounds(%[[VAL_41]], %[[VAL_40]], %[[VAL_42]], %[[VAL_43]]) {{.*}}: (!fir.ref>, i32, i64, i64) -> () ! CHECK: %[[VAL_45:.*]] = fir.convert %[[VAL_2]] : (!fir.ref>>>) -> !fir.ref> ! 
CHECK: %[[VAL_46:.*]] = fir.convert %[[VAL_29]] : (!fir.box>) -> !fir.box ! CHECK: %[[VAL_48:.*]] = fir.call @_FortranAAllocatableAllocateSource(%[[VAL_45]], %[[VAL_46]], %[[VAL_7]], %[[VAL_8]], %{{.*}}, %{{.*}}) {{.*}}: (!fir.ref>, !fir.box, i1, !fir.box, !fir.ref, i32) -> i32 @@ -276,7 +276,7 @@ subroutine test_allocatable_from_const(n, a) ! CHECK: %[[VAL_32:.*]] = fir.convert %[[VAL_2]] : (!fir.ref>>>>) -> !fir.ref> ! CHECK: %[[VAL_33:.*]] = fir.convert %[[VAL_26]] : (index) -> i64 ! CHECK: %[[VAL_34:.*]] = fir.convert %[[VAL_30]] : (index) -> i64 -! CHECK: %[[VAL_35:.*]] = fir.call @_FortranAAllocatableSetBounds(%[[VAL_32]], %[[VAL_31]], %[[VAL_33]], %[[VAL_34]]) {{.*}}: (!fir.ref>, i32, i64, i64) -> none +! CHECK: fir.call @_FortranAAllocatableSetBounds(%[[VAL_32]], %[[VAL_31]], %[[VAL_33]], %[[VAL_34]]) {{.*}}: (!fir.ref>, i32, i64, i64) -> () ! CHECK: %[[VAL_36:.*]] = fir.convert %[[VAL_2]] : (!fir.ref>>>>) -> !fir.ref> ! CHECK: %[[VAL_37:.*]] = fir.convert %[[VAL_20]] : (!fir.box>>) -> !fir.box ! CHECK: %[[VAL_39:.*]] = fir.call @_FortranAAllocatableAllocateSource(%[[VAL_36]], %[[VAL_37]], %[[VAL_15]], %[[VAL_16]], %{{.*}}, %{{.*}}) {{.*}}: (!fir.ref>, !fir.box, i1, !fir.box, !fir.ref, i32) -> i32 @@ -311,7 +311,7 @@ subroutine test_allocatable_chararray(n, a) ! CHECK: %[[VAL_18:.*]] = arith.constant 1 : i32 ! CHECK: %[[VAL_19:.*]] = arith.constant 0 : i32 ! CHECK: %[[VAL_20:.*]] = arith.constant 0 : i32 -! CHECK: %[[VAL_21:.*]] = fir.call @_FortranAAllocatableInitCharacterForAllocate(%[[VAL_16]], %[[VAL_17]], %[[VAL_18]], %[[VAL_19]], %[[VAL_20]]) {{.*}}: (!fir.ref>, i64, i32, i32, i32) -> none +! CHECK: fir.call @_FortranAAllocatableInitCharacterForAllocate(%[[VAL_16]], %[[VAL_17]], %[[VAL_18]], %[[VAL_19]], %[[VAL_20]]) {{.*}}: (!fir.ref>, i64, i32, i32, i32) -> () ! CHECK: %[[VAL_22:.*]] = fir.convert %[[VAL_3]] : (!fir.ref>>>) -> !fir.ref> ! CHECK: %[[VAL_23:.*]] = fir.convert %[[VAL_11]] : (!fir.box>) -> !fir.box ! 
CHECK: %[[VAL_25:.*]] = fir.call @_FortranAAllocatableAllocateSource(%[[VAL_22]], %[[VAL_23]], %[[VAL_7]], %[[VAL_8]], %{{.*}}, %{{.*}}) {{.*}}: (!fir.ref>, !fir.box, i1, !fir.box, !fir.ref, i32) -> i32 @@ -354,7 +354,7 @@ subroutine test_allocatable_char(n, a) ! CHECK: %[[VAL_26:.*]] = fir.convert %[[VAL_1]] : (!fir.ref>>}>>>>>) -> !fir.ref> ! CHECK: %[[VAL_27:.*]] = fir.convert %[[VAL_12]]#0 : (index) -> i64 ! CHECK: %[[VAL_28:.*]] = fir.convert %[[VAL_24]] : (index) -> i64 -! CHECK: %[[VAL_29:.*]] = fir.call @_FortranAAllocatableSetBounds(%[[VAL_26]], %[[VAL_25]], %[[VAL_27]], %[[VAL_28]]) {{.*}}: (!fir.ref>, i32, i64, i64) -> none +! CHECK: fir.call @_FortranAAllocatableSetBounds(%[[VAL_26]], %[[VAL_25]], %[[VAL_27]], %[[VAL_28]]) {{.*}}: (!fir.ref>, i32, i64, i64) -> () ! CHECK: %[[VAL_30:.*]] = fir.convert %[[VAL_1]] : (!fir.ref>>}>>>>>) -> !fir.ref> ! CHECK: %[[VAL_31:.*]] = fir.convert %[[VAL_14]] : (!fir.box>>}>>>) -> !fir.box ! CHECK: %[[VAL_33:.*]] = fir.call @_FortranAAllocatableAllocateSource(%[[VAL_30]], %[[VAL_31]], %[[VAL_6]], %[[VAL_7]], %{{.*}}, %{{.*}}) {{.*}}: (!fir.ref>, !fir.box, i1, !fir.box, !fir.ref, i32) -> i32 diff --git a/flang/test/Lower/allocate-source-pointers.f90 b/flang/test/Lower/allocate-source-pointers.f90 index 1beb420c53191..e6359dba81eb4 100644 --- a/flang/test/Lower/allocate-source-pointers.f90 +++ b/flang/test/Lower/allocate-source-pointers.f90 @@ -65,7 +65,7 @@ subroutine test_pointer_scalar(a) ! CHECK: %[[VAL_46:.*]] = fir.convert %[[VAL_3]] : (!fir.ref>>>) -> !fir.ref> ! CHECK: %[[VAL_47:.*]] = fir.convert %[[VAL_40]] : (index) -> i64 ! CHECK: %[[VAL_48:.*]] = fir.convert %[[VAL_44]] : (index) -> i64 -! CHECK: %[[VAL_49:.*]] = fir.call @_FortranAPointerSetBounds(%[[VAL_46]], %[[VAL_45]], %[[VAL_47]], %[[VAL_48]]) {{.*}}: (!fir.ref>, i32, i64, i64) -> none +! CHECK: fir.call @_FortranAPointerSetBounds(%[[VAL_46]], %[[VAL_45]], %[[VAL_47]], %[[VAL_48]]) {{.*}}: (!fir.ref>, i32, i64, i64) -> () ! 
CHECK: %[[VAL_50:.*]] = arith.constant 1 : index ! CHECK: %[[VAL_51:.*]]:3 = fir.box_dims %[[VAL_35]], %[[VAL_50]] : (!fir.box>, index) -> (index, index, index) ! CHECK: %[[VAL_52:.*]] = arith.addi %[[VAL_51]]#1, %[[VAL_40]] : index @@ -74,16 +74,16 @@ subroutine test_pointer_scalar(a) ! CHECK: %[[VAL_55:.*]] = fir.convert %[[VAL_3]] : (!fir.ref>>>) -> !fir.ref> ! CHECK: %[[VAL_56:.*]] = fir.convert %[[VAL_40]] : (index) -> i64 ! CHECK: %[[VAL_57:.*]] = fir.convert %[[VAL_53]] : (index) -> i64 -! CHECK: %[[VAL_58:.*]] = fir.call @_FortranAPointerSetBounds(%[[VAL_55]], %[[VAL_54]], %[[VAL_56]], %[[VAL_57]]) {{.*}}: (!fir.ref>, i32, i64, i64) -> none +! CHECK: fir.call @_FortranAPointerSetBounds(%[[VAL_55]], %[[VAL_54]], %[[VAL_56]], %[[VAL_57]]) {{.*}}: (!fir.ref>, i32, i64, i64) -> () ! CHECK: %[[VAL_59:.*]] = fir.convert %[[VAL_3]] : (!fir.ref>>>) -> !fir.ref> ! CHECK: %[[VAL_60:.*]] = fir.convert %[[VAL_35]] : (!fir.box>) -> !fir.box ! CHECK: %[[VAL_62:.*]] = fir.call @_FortranAPointerAllocateSource(%[[VAL_59]], %[[VAL_60]], %[[VAL_30]], %[[VAL_31]], %{{.*}}, %{{.*}}) {{.*}}: (!fir.ref>, !fir.box, i1, !fir.box, !fir.ref, i32) -> i32 -! CHECK: %[[VAL_76:.*]] = fir.call @_FortranAPointerSetBounds( -! CHECK: %[[VAL_85:.*]] = fir.call @_FortranAPointerSetBounds( +! CHECK: fir.call @_FortranAPointerSetBounds( +! CHECK: fir.call @_FortranAPointerSetBounds( ! CHECK: %[[VAL_89:.*]] = fir.call @_FortranAPointerAllocateSource( ! CHECK: %[[VAL_90:.*]] = arith.constant true -! CHECK: %[[VAL_122:.*]] = fir.call @_FortranAPointerSetBounds( -! CHECK: %[[VAL_131:.*]] = fir.call @_FortranAPointerSetBounds( +! CHECK: fir.call @_FortranAPointerSetBounds( +! CHECK: fir.call @_FortranAPointerSetBounds( ! CHECK: %[[VAL_135:.*]] = fir.call @_FortranAPointerAllocateSource(%{{.*}}, %{{.*}}, %[[VAL_90]] subroutine test_pointer_2d_array(n, a) @@ -131,7 +131,7 @@ subroutine test_pointer_2d_array(n, a) ! CHECK: %[[VAL_32:.*]] = fir.convert %[[VAL_3]] : (!fir.ref>>>) -> !fir.ref> ! 
CHECK: %[[VAL_33:.*]] = fir.convert %[[VAL_29]] : (i32) -> i64 ! CHECK: %[[VAL_34:.*]] = fir.convert %[[VAL_30]] : (i32) -> i64 -! CHECK: %[[VAL_35:.*]] = fir.call @_FortranAPointerSetBounds(%[[VAL_32]], %[[VAL_31]], %[[VAL_33]], %[[VAL_34]]) {{.*}}: (!fir.ref>, i32, i64, i64) -> none +! CHECK: fir.call @_FortranAPointerSetBounds(%[[VAL_32]], %[[VAL_31]], %[[VAL_33]], %[[VAL_34]]) {{.*}}: (!fir.ref>, i32, i64, i64) -> () ! CHECK: %[[VAL_36:.*]] = fir.convert %[[VAL_3]] : (!fir.ref>>>) -> !fir.ref> ! CHECK: %[[VAL_37:.*]] = fir.convert %[[VAL_24]] : (!fir.box>) -> !fir.box ! CHECK: %[[VAL_39:.*]] = fir.call @_FortranAPointerAllocateSource(%[[VAL_36]], %[[VAL_37]], %[[VAL_19]], %[[VAL_20]], %{{.*}}, %{{.*}}) {{.*}}: (!fir.ref>, !fir.box, i1, !fir.box, !fir.ref, i32) -> i32 @@ -146,7 +146,7 @@ subroutine test_pointer_2d_array(n, a) ! CHECK: %[[VAL_47:.*]] = fir.convert %[[VAL_8]] : (!fir.ref>>>) -> !fir.ref> ! CHECK: %[[VAL_48:.*]] = fir.convert %[[VAL_44]] : (index) -> i64 ! CHECK: %[[VAL_49:.*]] = fir.convert %[[VAL_45]] : (i32) -> i64 -! CHECK: %[[VAL_50:.*]] = fir.call @_FortranAPointerSetBounds(%[[VAL_47]], %[[VAL_46]], %[[VAL_48]], %[[VAL_49]]) {{.*}}: (!fir.ref>, i32, i64, i64) -> none +! CHECK: fir.call @_FortranAPointerSetBounds(%[[VAL_47]], %[[VAL_46]], %[[VAL_48]], %[[VAL_49]]) {{.*}}: (!fir.ref>, i32, i64, i64) -> () ! CHECK: %[[VAL_51:.*]] = fir.convert %[[VAL_8]] : (!fir.ref>>>) -> !fir.ref> ! CHECK: %[[VAL_52:.*]] = fir.convert %[[VAL_24]] : (!fir.box>) -> !fir.box ! CHECK: %[[VAL_54:.*]] = fir.call @_FortranAPointerAllocateSource(%[[VAL_51]], %[[VAL_52]], %[[VAL_19]], %[[VAL_20]], %{{.*}}, %{{.*}}) {{.*}}: (!fir.ref>, !fir.box, i1, !fir.box, !fir.ref, i32) -> i32 @@ -203,7 +203,7 @@ subroutine test_pointer_with_shapespec(n, a, m) ! CHECK: %[[VAL_40:.*]] = fir.convert %[[VAL_2]] : (!fir.ref>>>) -> !fir.ref> ! CHECK: %[[VAL_41:.*]] = fir.convert %[[VAL_34]] : (index) -> i64 ! CHECK: %[[VAL_42:.*]] = fir.convert %[[VAL_38]] : (index) -> i64 -! 
CHECK: %[[VAL_43:.*]] = fir.call @_FortranAPointerSetBounds(%[[VAL_40]], %[[VAL_39]], %[[VAL_41]], %[[VAL_42]]) {{.*}}: (!fir.ref>, i32, i64, i64) -> none +! CHECK: fir.call @_FortranAPointerSetBounds(%[[VAL_40]], %[[VAL_39]], %[[VAL_41]], %[[VAL_42]]) {{.*}}: (!fir.ref>, i32, i64, i64) -> () ! CHECK: %[[VAL_44:.*]] = fir.convert %[[VAL_2]] : (!fir.ref>>>) -> !fir.ref> ! CHECK: %[[VAL_45:.*]] = fir.convert %[[VAL_29]] : (!fir.box>) -> !fir.box ! CHECK: %[[VAL_47:.*]] = fir.call @_FortranAPointerAllocateSource(%[[VAL_44]], %[[VAL_45]], %[[VAL_7]], %[[VAL_8]], %{{.*}}, %{{.*}}) {{.*}}: (!fir.ref>, !fir.box, i1, !fir.box, !fir.ref, i32) -> i32 @@ -253,7 +253,7 @@ subroutine test_pointer_from_const(n, a) ! CHECK: %[[VAL_31:.*]] = fir.convert %[[VAL_2]] : (!fir.ref>>>>) -> !fir.ref> ! CHECK: %[[VAL_32:.*]] = fir.convert %[[VAL_25]] : (index) -> i64 ! CHECK: %[[VAL_33:.*]] = fir.convert %[[VAL_29]] : (index) -> i64 -! CHECK: %[[VAL_34:.*]] = fir.call @_FortranAPointerSetBounds(%[[VAL_31]], %[[VAL_30]], %[[VAL_32]], %[[VAL_33]]) {{.*}}: (!fir.ref>, i32, i64, i64) -> none +! CHECK: fir.call @_FortranAPointerSetBounds(%[[VAL_31]], %[[VAL_30]], %[[VAL_32]], %[[VAL_33]]) {{.*}}: (!fir.ref>, i32, i64, i64) -> () ! CHECK: %[[VAL_35:.*]] = fir.convert %[[VAL_2]] : (!fir.ref>>>>) -> !fir.ref> ! CHECK: %[[VAL_36:.*]] = fir.convert %[[VAL_20]] : (!fir.box>>) -> !fir.box ! CHECK: %[[VAL_38:.*]] = fir.call @_FortranAPointerAllocateSource(%[[VAL_35]], %[[VAL_36]], %[[VAL_15]], %[[VAL_16]], %{{.*}}, %{{.*}}) {{.*}}: (!fir.ref>, !fir.box, i1, !fir.box, !fir.ref, i32) -> i32 @@ -290,7 +290,7 @@ subroutine test_pointer_chararray(n, a) ! CHECK: %[[VAL_18:.*]] = arith.constant 1 : i32 ! CHECK: %[[VAL_19:.*]] = arith.constant 0 : i32 ! CHECK: %[[VAL_20:.*]] = arith.constant 0 : i32 -! CHECK: %[[VAL_21:.*]] = fir.call @_FortranAPointerNullifyCharacter(%[[VAL_16]], %[[VAL_17]], %[[VAL_18]], %[[VAL_19]], %[[VAL_20]]) {{.*}}: (!fir.ref>, i64, i32, i32, i32) -> none +! 
CHECK: fir.call @_FortranAPointerNullifyCharacter(%[[VAL_16]], %[[VAL_17]], %[[VAL_18]], %[[VAL_19]], %[[VAL_20]]) {{.*}}: (!fir.ref>, i64, i32, i32, i32) -> () ! CHECK: %[[VAL_22:.*]] = fir.convert %[[VAL_3]] : (!fir.ref>>>) -> !fir.ref> ! CHECK: %[[VAL_23:.*]] = fir.convert %[[VAL_11]] : (!fir.box>) -> !fir.box ! CHECK: %[[VAL_25:.*]] = fir.call @_FortranAPointerAllocateSource(%[[VAL_22]], %[[VAL_23]], %[[VAL_7]], %[[VAL_8]], %{{.*}}, %{{.*}}) {{.*}}: (!fir.ref>, !fir.box, i1, !fir.box, !fir.ref, i32) -> i32 @@ -339,7 +339,7 @@ subroutine test_pointer_char(n, a) ! CHECK: %[[VAL_25:.*]] = fir.convert %[[VAL_1]] : (!fir.ref>>}>>>>>) -> !fir.ref> ! CHECK: %[[VAL_26:.*]] = fir.convert %[[VAL_12]]#0 : (index) -> i64 ! CHECK: %[[VAL_27:.*]] = fir.convert %[[VAL_23]] : (index) -> i64 -! CHECK: %[[VAL_28:.*]] = fir.call @_FortranAPointerSetBounds(%[[VAL_25]], %[[VAL_24]], %[[VAL_26]], %[[VAL_27]]) {{.*}}: (!fir.ref>, i32, i64, i64) -> none +! CHECK: fir.call @_FortranAPointerSetBounds(%[[VAL_25]], %[[VAL_24]], %[[VAL_26]], %[[VAL_27]]) {{.*}}: (!fir.ref>, i32, i64, i64) -> () ! CHECK: %[[VAL_29:.*]] = fir.convert %[[VAL_1]] : (!fir.ref>>}>>>>>) -> !fir.ref> ! CHECK: %[[VAL_30:.*]] = fir.convert %[[VAL_14]] : (!fir.box>>}>>>) -> !fir.box ! CHECK: %[[VAL_32:.*]] = fir.call @_FortranAPointerAllocateSource(%[[VAL_29]], %[[VAL_30]], %[[VAL_6]], %[[VAL_7]], %{{.*}}, %{{.*}}) {{.*}}: (!fir.ref>, !fir.box, i1, !fir.box, !fir.ref, i32) -> i32 diff --git a/flang/test/Lower/array-derived-assignments.f90 b/flang/test/Lower/array-derived-assignments.f90 index f4e51271d5936..3a66a0824666b 100644 --- a/flang/test/Lower/array-derived-assignments.f90 +++ b/flang/test/Lower/array-derived-assignments.f90 @@ -92,7 +92,7 @@ subroutine test_deep_copy(t1, t2) ! CHECK: %[[VAL_17:.*]] = fir.convert %[[VAL_6]] : (!fir.ref>>}>>>) -> !fir.ref> ! CHECK: %[[VAL_18:.*]] = fir.convert %[[VAL_15]] : (!fir.box>>}>>) -> !fir.box ! 
CHECK: %[[VAL_19:.*]] = fir.convert %[[VAL_16]] : (!fir.ref>) -> !fir.ref - ! CHECK: %[[VAL_20:.*]] = fir.call @_FortranAAssign(%[[VAL_17]], %[[VAL_18]], %[[VAL_19]], %{{.*}}) {{.*}}: (!fir.ref>, !fir.box, !fir.ref, i32) -> none + ! CHECK: fir.call @_FortranAAssign(%[[VAL_17]], %[[VAL_18]], %[[VAL_19]], %{{.*}}) {{.*}}: (!fir.ref>, !fir.box, !fir.ref, i32) -> () ! CHECK: %[[VAL_21:.*]] = arith.subi %[[VAL_9]], %[[VAL_5]] : index ! CHECK: br ^bb1(%[[VAL_11]], %[[VAL_21]] : index, index) type(deep_copy) :: t1(10), t2(10) diff --git a/flang/test/Lower/basic-function.f90 b/flang/test/Lower/basic-function.f90 index f5f81545c899d..5f2fabe1b325d 100644 --- a/flang/test/Lower/basic-function.f90 +++ b/flang/test/Lower/basic-function.f90 @@ -45,7 +45,7 @@ integer function fct_body() ! CHECK-LABEL: func @_QPfct_body() -> i32 ! CHECK: cf.br ^bb1 ! CHECK: ^bb1 -! CHECK: %{{.*}} = fir.call @_FortranAStopStatement +! CHECK: fir.call @_FortranAStopStatement ! CHECK: fir.unreachable function fct_iarr1() diff --git a/flang/test/Lower/call-by-value-attr.f90 b/flang/test/Lower/call-by-value-attr.f90 index 09fc32fbf71ae..97028edfb8d77 100644 --- a/flang/test/Lower/call-by-value-attr.f90 +++ b/flang/test/Lower/call-by-value-attr.f90 @@ -78,7 +78,7 @@ end subroutine subra !CHECK: fir.store %[[TEMP_BOX]] to %[[TEMP_BOX_LOC:.*]] : !fir.ref>> !CHECK: %[[TEMP_BOX_ADDR:.*]] = fir.convert %[[TEMP_BOX_LOC]] : (!fir.ref>>) -> !fir.ref> !CHECK: %[[BOX_ADDR:.*]] = fir.convert %[[BOX]] : (!fir.box>) -> !fir.box - !CHECK: fir.call @_FortranAAssignTemporary(%[[TEMP_BOX_ADDR]], %[[BOX_ADDR]], %{{.*}}, %{{.*}}){{.*}}: (!fir.ref>, !fir.box, !fir.ref, i32) -> none + !CHECK: fir.call @_FortranAAssignTemporary(%[[TEMP_BOX_ADDR]], %[[BOX_ADDR]], %{{.*}}, %{{.*}}){{.*}}: (!fir.ref>, !fir.box, !fir.ref, i32) -> () !CHECK: fir.result %[[ARRAY_COPY_2]] : !fir.heap> !CHECK: %[[CONVERT_B:.*]] = fir.convert %[[ADDR]] : (!fir.heap>) -> !fir.ref> !CHECK: fir.call @_QPsubra(%[[CONVERT_B]]) diff --git 
a/flang/test/Lower/call-copy-in-out.f90 b/flang/test/Lower/call-copy-in-out.f90 index 253db7f05a6b6..fd3b5c342a48f 100644 --- a/flang/test/Lower/call-copy-in-out.f90 +++ b/flang/test/Lower/call-copy-in-out.f90 @@ -23,7 +23,7 @@ subroutine test_assumed_shape_to_array(x) ! CHECK-DAG: fir.store %[[temp_box]] to %[[temp_box_loc:.*]] : !fir.ref>> ! CHECK-DAG: %[[temp_box_addr:.*]] = fir.convert %[[temp_box_loc]] : (!fir.ref>>) -> !fir.ref> ! CHECK-DAG: %[[arg_box:.*]] = fir.convert %[[x]] : (!fir.box>) -> !fir.box -! CHECK-DAG: fir.call @_FortranAAssignTemporary(%[[temp_box_addr]], %[[arg_box]], %{{.*}}, %{{.*}}){{.*}}: (!fir.ref>, !fir.box, !fir.ref, i32) -> none +! CHECK-DAG: fir.call @_FortranAAssignTemporary(%[[temp_box_addr]], %[[arg_box]], %{{.*}}, %{{.*}}){{.*}}: (!fir.ref>, !fir.box, !fir.ref, i32) -> () ! CHECK: fir.result %[[temp]] : !fir.heap> ! CHECK: %[[dim:.*]]:3 = fir.box_dims %[[x]], %c0{{.*}} : (!fir.box>, index) -> (index, index, index) @@ -38,7 +38,7 @@ subroutine test_assumed_shape_to_array(x) ! CHECK-DAG: fir.store %[[x]] to %[[arg_box_loc:.*]] : !fir.ref>> ! CHECK-DAG: %[[arg_box_addr:.*]] = fir.convert %[[arg_box_loc]] : (!fir.ref>>) -> !fir.ref> ! CHECK-DAG: %[[temp_box_cast:.*]] = fir.convert %[[temp_box_ref]] : (!fir.ref>>>) -> !fir.ref> -! CHECK-DAG: fir.call @_FortranACopyOutAssign(%[[arg_box_addr]], %[[temp_box_cast]], %{{.*}}, %{{.*}}){{.*}}: (!fir.ref>, !fir.ref>, !fir.ref, i32) -> none +! CHECK-DAG: fir.call @_FortranACopyOutAssign(%[[arg_box_addr]], %[[temp_box_cast]], %{{.*}}, %{{.*}}){{.*}}: (!fir.ref>, !fir.ref>, !fir.ref, i32) -> () call bar(x) end subroutine @@ -204,7 +204,7 @@ subroutine test_char(x) ! CHECK: fir.store %[[VAL_12]] to %[[VAL_2]] : !fir.ref>>> ! CHECK: %[[VAL_15:.*]] = fir.convert %[[VAL_2]] : (!fir.ref>>>) -> !fir.ref> ! CHECK: %[[VAL_16:.*]] = fir.convert %[[VAL_0]] : (!fir.box>>) -> !fir.box -! 
CHECK: %[[VAL_18:.*]] = fir.call @_FortranAAssignTemporary(%[[VAL_15]], %[[VAL_16]], %{{.*}}, %{{.*}}) fastmath : (!fir.ref>, !fir.box, !fir.ref, i32) -> none +! CHECK: fir.call @_FortranAAssignTemporary(%[[VAL_15]], %[[VAL_16]], %{{.*}}, %{{.*}}) fastmath : (!fir.ref>, !fir.box, !fir.ref, i32) -> () ! CHECK: fir.result %[[VAL_10]] : !fir.heap>> ! CHECK: } ! CHECK: %[[VAL_19:.*]] = arith.constant 0 : index @@ -222,7 +222,7 @@ subroutine test_char(x) ! CHECK: fir.store %[[VAL_0]] to %[[VAL_1]] : !fir.ref>>> ! CHECK: %[[VAL_31:.*]] = fir.convert %[[VAL_1]] : (!fir.ref>>>) -> !fir.ref> ! CHECK: %[[VAL_32:.*]] = fir.convert %[[TMP_BOX_REF]] : (!fir.ref>>>>) -> !fir.ref> -! CHECK: %[[VAL_34:.*]] = fir.call @_FortranACopyOutAssign(%[[VAL_31]], %[[VAL_32]], %{{.*}}, %{{.*}}) fastmath : (!fir.ref>, !fir.ref>, !fir.ref, i32) -> none +! CHECK: fir.call @_FortranACopyOutAssign(%[[VAL_31]], %[[VAL_32]], %{{.*}}, %{{.*}}) fastmath : (!fir.ref>, !fir.ref>, !fir.ref, i32) -> () ! CHECK: } character(10) :: x(:) diff --git a/flang/test/Lower/default-initialization.f90 b/flang/test/Lower/default-initialization.f90 index 7a6133452b3a2..a3a36d5a1c3de 100644 --- a/flang/test/Lower/default-initialization.f90 +++ b/flang/test/Lower/default-initialization.f90 @@ -24,7 +24,7 @@ subroutine local ! CHECK: %[[x:.*]] = fir.alloca !fir.type<_QMtest_dinitTt{i:i32}> ! CHECK: %[[xbox:.*]] = fir.embox %[[x]] : (!fir.ref>) -> !fir.box> ! CHECK: %[[xboxNone:.*]] = fir.convert %[[xbox]] - ! CHECK: fir.call @_FortranAInitialize(%[[xboxNone]], %{{.*}}, %{{.*}}) {{.*}}: (!fir.box, !fir.ref, i32) -> none + ! CHECK: fir.call @_FortranAInitialize(%[[xboxNone]], %{{.*}}, %{{.*}}) {{.*}}: (!fir.box, !fir.ref, i32) -> () type(t) :: x print *, x%i end subroutine @@ -36,7 +36,7 @@ subroutine local_array() ! CHECK: %[[xshape:.*]] = fir.shape %c4{{.*}} : (index) -> !fir.shape<1> ! CHECK: %[[xbox:.*]] = fir.embox %[[x]](%[[xshape]]) : (!fir.ref>>, !fir.shape<1>) -> !fir.box>> ! 
CHECK: %[[xboxNone:.*]] = fir.convert %[[xbox]] - ! CHECK: fir.call @_FortranAInitialize(%[[xboxNone]], %{{.*}}, %{{.*}}) {{.*}}: (!fir.box, !fir.ref, i32) -> none + ! CHECK: fir.call @_FortranAInitialize(%[[xboxNone]], %{{.*}}, %{{.*}}) {{.*}}: (!fir.box, !fir.ref, i32) -> () type(t) :: x(4) print *, x(2)%i end subroutine @@ -48,7 +48,7 @@ subroutine local_alloc_comp ! CHECK: %[[x:.*]] = fir.alloca !fir.type<_QMtest_dinitTt_alloc_comp{i:!fir.box>>}> ! CHECK: %[[xbox:.*]] = fir.embox %[[x]] : (!fir.ref>>}>>) -> !fir.box>>}>> ! CHECK: %[[xboxNone:.*]] = fir.convert %[[xbox]] - ! CHECK: fir.call @_FortranAInitialize(%[[xboxNone]], %{{.*}}, %{{.*}}) {{.*}}: (!fir.box, !fir.ref, i32) -> none + ! CHECK: fir.call @_FortranAInitialize(%[[xboxNone]], %{{.*}}, %{{.*}}) {{.*}}: (!fir.box, !fir.ref, i32) -> () type(t_alloc_comp) :: x end subroutine @@ -58,7 +58,7 @@ function result() ! CHECK: %[[x:.*]] = fir.alloca !fir.type<_QMtest_dinitTt{i:i32}> ! CHECK: %[[xbox:.*]] = fir.embox %[[x]] : (!fir.ref>) -> !fir.box> ! CHECK: %[[xboxNone:.*]] = fir.convert %[[xbox]] - ! CHECK: fir.call @_FortranAInitialize(%[[xboxNone]], %{{.*}}, %{{.*}}) {{.*}}: (!fir.box, !fir.ref, i32) -> none + ! CHECK: fir.call @_FortranAInitialize(%[[xboxNone]], %{{.*}}, %{{.*}}) {{.*}}: (!fir.box, !fir.ref, i32) -> () type(t) :: result end function @@ -68,7 +68,7 @@ function result() subroutine intent_out(x) ! CHECK: %[[xbox:.*]] = fir.embox %[[x]] : (!fir.ref>) -> !fir.box> ! CHECK: %[[xboxNone:.*]] = fir.convert %[[xbox]] - ! CHECK: fir.call @_FortranAInitialize(%[[xboxNone]], %{{.*}}, %{{.*}}) {{.*}}: (!fir.box, !fir.ref, i32) -> none + ! CHECK: fir.call @_FortranAInitialize(%[[xboxNone]], %{{.*}}, %{{.*}}) {{.*}}: (!fir.box, !fir.ref, i32) -> () type(t), intent(out) :: x end subroutine @@ -81,7 +81,7 @@ subroutine intent_out_optional(x) ! CHECK: fir.if %[[isPresent]] { ! CHECK: %[[xbox:.*]] = fir.embox %[[x]] : (!fir.ref>) -> !fir.box> ! CHECK: %[[xboxNone:.*]] = fir.convert %[[xbox]] - ! 
CHECK: fir.call @_FortranAInitialize(%[[xboxNone]], %{{.*}}, %{{.*}}) {{.*}}: (!fir.box, !fir.ref, i32) -> none + ! CHECK: fir.call @_FortranAInitialize(%[[xboxNone]], %{{.*}}, %{{.*}}) {{.*}}: (!fir.box, !fir.ref, i32) -> () ! CHECK: } type(t), intent(out), optional :: x end subroutine @@ -96,7 +96,7 @@ subroutine local_eq() ! CHECK: %[[x:.*]] = fir.convert %[[xcoor]] : (!fir.ref) -> !fir.ptr> ! CHECK: %[[xbox:.*]] = fir.embox %[[x]] : (!fir.ptr>) -> !fir.box> ! CHECK: %[[xboxNone:.*]] = fir.convert %[[xbox]] - ! CHECK: fir.call @_FortranAInitialize(%[[xboxNone]], %{{.*}}, %{{.*}}) {{.*}}: (!fir.box, !fir.ref, i32) -> none + ! CHECK: fir.call @_FortranAInitialize(%[[xboxNone]], %{{.*}}, %{{.*}}) {{.*}}: (!fir.box, !fir.ref, i32) -> () equivalence (x, zi) print *, i end subroutine @@ -114,14 +114,14 @@ subroutine local_eq2() ! CHECK: %[[x:.*]] = fir.convert %[[xcoor]] : (!fir.ref) -> !fir.ptr> ! CHECK: %[[xbox:.*]] = fir.embox %[[x]] : (!fir.ptr>) -> !fir.box> ! CHECK: %[[xboxNone:.*]] = fir.convert %[[xbox]] - ! CHECK: fir.call @_FortranAInitialize(%[[xboxNone]], %{{.*}}, %{{.*}}) {{.*}}: (!fir.box, !fir.ref, i32) -> none + ! CHECK: fir.call @_FortranAInitialize(%[[xboxNone]], %{{.*}}, %{{.*}}) {{.*}}: (!fir.box, !fir.ref, i32) -> () ! CHECK: %[[ycoor:.*]] = fir.coordinate_of %[[equiv]], %c0{{.*}} : (!fir.ref>, index) -> !fir.ref ! CHECK: %[[y:.*]] = fir.convert %[[ycoor]] : (!fir.ref) -> !fir.ptr> ! CHECK: %[[ybox:.*]] = fir.embox %[[y]] : (!fir.ptr>) -> !fir.box> ! CHECK: %[[yboxNone:.*]] = fir.convert %[[ybox]] - ! CHECK: fir.call @_FortranAInitialize(%[[yboxNone]], %{{.*}}, %{{.*}}) {{.*}}: (!fir.box, !fir.ref, i32) -> none + ! 
CHECK: fir.call @_FortranAInitialize(%[[yboxNone]], %{{.*}}, %{{.*}}) {{.*}}: (!fir.box, !fir.ref, i32) -> () equivalence (x, y) print *, y%i end subroutine diff --git a/flang/test/Lower/derived-assignments.f90 b/flang/test/Lower/derived-assignments.f90 index 4465a7eecc886..1048e6199451a 100644 --- a/flang/test/Lower/derived-assignments.f90 +++ b/flang/test/Lower/derived-assignments.f90 @@ -170,7 +170,7 @@ subroutine test_box_assign(t1, t2) ! CHECK: fir.store %[[t1Load]] to %[[tmpBox]] : !fir.ref>>> ! CHECK: %[[lhs:.*]] = fir.convert %[[tmpBox]] : (!fir.ref>>>) -> !fir.ref> ! CHECK: %[[rhs:.*]] = fir.convert %[[t2Load]] : (!fir.box>>) -> !fir.box - ! CHECK: fir.call @_FortranAAssign(%[[lhs]], %[[rhs]], %{{.*}}, %{{.*}}) {{.*}}: (!fir.ref>, !fir.box, !fir.ref, i32) -> none + ! CHECK: fir.call @_FortranAAssign(%[[lhs]], %[[rhs]], %{{.*}}, %{{.*}}) {{.*}}: (!fir.ref>, !fir.box, !fir.ref, i32) -> () t1 = t2 end subroutine @@ -190,7 +190,7 @@ subroutine test_alloc_comp(t1, t2) ! CHECK: fir.store %[[t1Box]] to %[[tmpBox]] : !fir.ref>> ! CHECK: %[[lhs:.*]] = fir.convert %[[tmpBox]] : (!fir.ref>>) -> !fir.ref> ! CHECK: %[[rhs:.*]] = fir.convert %[[t2Box]] : (!fir.box>) -> !fir.box - ! CHECK: fir.call @_FortranAAssign(%[[lhs]], %[[rhs]], %{{.*}}, %{{.*}}) {{.*}}: (!fir.ref>, !fir.box, !fir.ref, i32) -> none + ! CHECK: fir.call @_FortranAAssign(%[[lhs]], %[[rhs]], %{{.*}}, %{{.*}}) {{.*}}: (!fir.ref>, !fir.box, !fir.ref, i32) -> () t1 = t2 end subroutine @@ -230,7 +230,7 @@ subroutine test_alloc_comp(t1, t2) ! ! cHECK: fir.store %[[t1Box]] to %[[tmpBox]] : !fir.ref>> ! ! cHECK: %[[lhs:.*]] = fir.convert %[[tmpBox]] : (!fir.ref>>) -> !fir.ref> ! ! cHECK: %[[rhs:.*]] = fir.convert %[[t2Box]] : (!fir.box>) -> !fir.box -! ! cHECK: fir.call @_FortranAAssign(%[[lhs]], %[[rhs]], %{{.*}}, %{{.*}}) {{.*}}: (!fir.ref>, !fir.box, !fir.ref, i32) -> none +! ! cHECK: fir.call @_FortranAAssign(%[[lhs]], %[[rhs]], %{{.*}}, %{{.*}}) {{.*}}: (!fir.ref>, !fir.box, !fir.ref, i32) -> () ! 
t1 = t2 ! end subroutine !end module diff --git a/flang/test/Lower/derived-type-finalization.f90 b/flang/test/Lower/derived-type-finalization.f90 index e7ade0d8145bb..b38fcd8ba5766 100644 --- a/flang/test/Lower/derived-type-finalization.f90 +++ b/flang/test/Lower/derived-type-finalization.f90 @@ -60,7 +60,7 @@ subroutine test_lhs_allocatable() ! CHECK: %[[EMBOX:.*]] = fir.embox %[[LHS]] : (!fir.ref>) -> !fir.box> ! CHECK: fir.store %[[EMBOX]] to %[[BOXREF]] : !fir.ref>> ! CHECK: %[[BOX_NONE:.*]] = fir.convert %[[BOXREF]] : (!fir.ref>>) -> !fir.ref> -! CHECK: %{{.*}} = fir.call @_FortranAAssign(%[[BOX_NONE]], {{.*}} +! CHECK: fir.call @_FortranAAssign(%[[BOX_NONE]], {{.*}} ! CHECK-LABEL: func.func @_QMderived_type_finalizationPtest_lhs_allocatable() { ! CHECK: %[[LHS:.*]] = fir.alloca !fir.box>> {bindc_name = "lhs", uniq_name = "_QMderived_type_finalizationFtest_lhs_allocatableElhs"} @@ -72,7 +72,7 @@ subroutine test_lhs_allocatable() ! CHECK: %[[IS_NULL:.*]] = arith.cmpi ne, %[[ADDR_I64]], %[[C0]] : i64 ! CHECK: fir.if %[[IS_NULL]] { ! CHECK: %[[BOX_NONE:.*]] = fir.convert %[[LHS]] : (!fir.ref>>>) -> !fir.box -! CHECK: %{{.*}} = fir.call @_FortranADestroy(%[[BOX_NONE]]) {{.*}} : (!fir.box) -> none +! CHECK: fir.call @_FortranADestroy(%[[BOX_NONE]]) {{.*}} : (!fir.box) -> () ! CHECK: } ! 7.5.6.3 point 2. Finalization on explicit deallocation. @@ -111,7 +111,7 @@ subroutine test_end_finalization() ! CHECK: %[[LOCAL_T:.*]] = fir.alloca !fir.type<_QMderived_type_finalizationTt1{a:i32}> {bindc_name = "t", uniq_name = "_QMderived_type_finalizationFtest_end_finalizationEt"} ! CHECK: %[[EMBOX:.*]] = fir.embox %[[LOCAL_T]] : (!fir.ref>) -> !fir.box> ! CHECK: %[[BOX_NONE:.*]] = fir.convert %[[EMBOX]] : (!fir.box>) -> !fir.box -! CHECK: %{{.*}} = fir.call @_FortranADestroy(%[[BOX_NONE]]) {{.*}} : (!fir.box) -> none +! CHECK: fir.call @_FortranADestroy(%[[BOX_NONE]]) {{.*}} : (!fir.box) -> () ! CHECK: return ! test with multiple return. 
@@ -139,7 +139,7 @@ subroutine test_end_finalization2(a) ! CHECK: ^bb3: ! CHECK: %[[EMBOX:.*]] = fir.embox %[[T]] : (!fir.ref>) -> !fir.box> ! CHECK: %[[BOX_NONE:.*]] = fir.convert %[[EMBOX]] : (!fir.box>) -> !fir.box -! CHECK: %{{.*}} = fir.call @_FortranADestroy(%[[BOX_NONE]]) {{.*}} : (!fir.box) -> none +! CHECK: fir.call @_FortranADestroy(%[[BOX_NONE]]) {{.*}} : (!fir.box) -> () ! CHECK: return ! CHECK: } @@ -159,7 +159,7 @@ subroutine test_fct_ref() ! CHECK: fir.save_result %[[CALL_RES]] to %[[RESULT]] : !fir.type<_QMderived_type_finalizationTt1{a:i32}>, !fir.ref> ! CHECK: %[[EMBOX:.*]] = fir.embox %[[RESULT]] : (!fir.ref>) -> !fir.box> ! CHECK: %[[BOX_NONE:.*]] = fir.convert %[[EMBOX]] : (!fir.box>) -> !fir.box -! CHECK: %{{.*}} = fir.call @_FortranADestroy(%[[BOX_NONE]]) {{.*}} : (!fir.box) -> none +! CHECK: fir.call @_FortranADestroy(%[[BOX_NONE]]) {{.*}} : (!fir.box) -> () ! CHECK: return subroutine test_finalize_intent_out(t) @@ -170,7 +170,7 @@ subroutine test_finalize_intent_out(t) ! CHECK-SAME: %[[T:.*]]: !fir.ref> {fir.bindc_name = "t"}) { ! CHECK: %[[EMBOX:.*]] = fir.embox %[[T]] : (!fir.ref>) -> !fir.box> ! CHECK: %[[BOX_NONE:.*]] = fir.convert %[[EMBOX]] : (!fir.box>) -> !fir.box -! CHECK: %{{.*}} = fir.call @_FortranADestroy(%[[BOX_NONE]]) {{.*}}: (!fir.box) -> none +! CHECK: fir.call @_FortranADestroy(%[[BOX_NONE]]) {{.*}}: (!fir.box) -> () ! CHECK: return function get_t1(i) @@ -189,7 +189,7 @@ subroutine test_nonpointer_function() ! CHECK: %[[RES:.*]] = fir.call @_QMderived_type_finalizationPget_t1(%{{.*}}) {{.*}} : (!fir.ref) -> !fir.box>> ! CHECK: fir.save_result %[[RES]] to %[[TMP]] : !fir.box>>, !fir.ref>>> ! CHECK: %{{.*}} = fir.call @_FortranAioOutputDerivedType -! CHECK-NOT: %{{.*}} = fir.call @_FortranADestroy +! CHECK-NOT: fir.call @_FortranADestroy ! CHECK: %{{.*}} = fir.call @_FortranAioEndIoStatement ! CHECK: return @@ -201,9 +201,9 @@ subroutine test_avoid_double_finalization(a) ! 
CHECK-LABEL: func.func @_QMderived_type_finalizationPtest_avoid_double_finalization( ! CHECK: fir.call @_FortranAInitialize( -! CHECK-NOT: %{{.*}} = fir.call @_FortranADestroy -! CHECK: %{{.*}} = fir.call @_FortranAAssign( -! CHECK: %{{.*}} = fir.call @_FortranADestroy( +! CHECK-NOT: fir.call @_FortranADestroy +! CHECK: fir.call @_FortranAAssign( +! CHECK: fir.call @_FortranADestroy( function no_func_ret_finalize() result(ty) type(t1) :: ty @@ -211,7 +211,7 @@ function no_func_ret_finalize() result(ty) end function ! CHECK-LABEL: func.func @_QMderived_type_finalizationPno_func_ret_finalize() -> !fir.type<_QMderived_type_finalizationTt1{a:i32}> { -! CHECK: %{{.*}} = fir.call @_FortranAAssign +! CHECK: fir.call @_FortranAAssign ! CHECK-NOT: fir.call @_FortranADestroy ! CHECK: return %{{.*}} : !fir.type<_QMderived_type_finalizationTt1{a:i32}> @@ -232,7 +232,7 @@ subroutine test_avoid_double_free() ! CHECK: fir.call @_FortranAAllocatableAllocateSource( ! CHECK-NOT: fir.freemem %{{.*}} : !fir.heap>> ! CHECK: %[[RES_CONV:.*]] = fir.convert %[[RES]] : (!fir.ref>>>>) -> !fir.box -! CHECK: %{{.*}} = fir.call @_FortranADestroy(%[[RES_CONV]]) {{.*}} : (!fir.box) -> none +! CHECK: fir.call @_FortranADestroy(%[[RES_CONV]]) {{.*}} : (!fir.box) -> () subroutine t4_final(this) type(t4) :: this @@ -243,7 +243,7 @@ subroutine local_t4() end subroutine ! CHECK-LABEL: func.func @_QMderived_type_finalizationPlocal_t4() -! CHECK: %{{.*}} = fir.call @_FortranADestroy(%2) fastmath : (!fir.box) -> none +! CHECK: fir.call @_FortranADestroy(%2) fastmath : (!fir.box) -> () end module diff --git a/flang/test/Lower/derived-type-temp.f90 b/flang/test/Lower/derived-type-temp.f90 index 18bcacf10753c..4f1d4f2d51cf5 100644 --- a/flang/test/Lower/derived-type-temp.f90 +++ b/flang/test/Lower/derived-type-temp.f90 @@ -16,4 +16,4 @@ program derived_temp_init ! CHECK: %[[temp:.*]] = fir.alloca !fir.type<_QFTt1{i:!fir.box>}> {bindc_name = "x", uniq_name = "_QFEx"} ! 
CHECK: %[[box:.*]] = fir.embox %[[temp]] : (!fir.ref>}>>) -> !fir.box>}>> ! CHECK: %[[box_none:.*]] = fir.convert %[[box]] : (!fir.box>}>>) -> !fir.box -! CHECK: %{{.*}} = fir.call @_FortranAInitialize(%[[box_none]], %{{.*}}, %{{.*}}) {{.*}}: (!fir.box, !fir.ref, i32) -> none +! CHECK: fir.call @_FortranAInitialize(%[[box_none]], %{{.*}}, %{{.*}}) {{.*}}: (!fir.box, !fir.ref, i32) -> () diff --git a/flang/test/Lower/do_concurrent_local_default_init.f90 b/flang/test/Lower/do_concurrent_local_default_init.f90 index 1766e0a104ff6..7652e4fcd0402 100644 --- a/flang/test/Lower/do_concurrent_local_default_init.f90 +++ b/flang/test/Lower/do_concurrent_local_default_init.f90 @@ -47,6 +47,6 @@ subroutine test_default_init() ! CHECK: %[[VAL_26:.*]] = fir.alloca !fir.type<_QFtest_default_initTt{i:i32}> {bindc_name = "a", pinned, uniq_name = "_QFtest_default_initEa"} ! CHECK: %[[VAL_27:.*]] = fir.embox %[[VAL_26]] : (!fir.ref>) -> !fir.box> ! CHECK: %[[VAL_30:.*]] = fir.convert %[[VAL_27]] : (!fir.box>) -> !fir.box -! CHECK: %[[VAL_32:.*]] = fir.call @_FortranAInitialize(%[[VAL_30]], {{.*}} +! CHECK: fir.call @_FortranAInitialize(%[[VAL_30]], {{.*}} ! CHECK: %[[VAL_33:.*]]:2 = hlfir.declare %[[VAL_26]] {uniq_name = "_QFtest_default_initEa"} : (!fir.ref>) -> (!fir.ref>, !fir.ref>) ! CHECK: } diff --git a/flang/test/Lower/fail_image.f90 b/flang/test/Lower/fail_image.f90 index a4708cbbf47cc..08be8e19402d2 100644 --- a/flang/test/Lower/fail_image.f90 +++ b/flang/test/Lower/fail_image.f90 @@ -7,7 +7,7 @@ subroutine fail_image_test(fail) ! CHECK: cond_br {{.*}}, ^[[BB1:.*]], ^[[BB2:.*]] ! CHECK: ^[[BB1]]: if (fail) then -! CHECK: {{.*}} = fir.call @_FortranAFailImageStatement() {{.*}}: () -> none +! CHECK: fir.call @_FortranAFailImageStatement() {{.*}}: ! CHECK-NEXT: fir.unreachable FAIL IMAGE end if @@ -17,4 +17,4 @@ subroutine fail_image_test(fail) ! CHECK-NEXT: return return end subroutine -! 
CHECK-LABEL: func private @_FortranAFailImageStatement() -> none attributes {fir.runtime} +! CHECK-LABEL: func private @_FortranAFailImageStatement() attributes {fir.runtime} diff --git a/flang/test/Lower/forall/forall-allocatable-2.f90 b/flang/test/Lower/forall/forall-allocatable-2.f90 index 95bd290f27350..f7c46acf87275 100644 --- a/flang/test/Lower/forall/forall-allocatable-2.f90 +++ b/flang/test/Lower/forall/forall-allocatable-2.f90 @@ -23,7 +23,7 @@ end subroutine forall_with_allocatable2 ! CHECK: %[[VAL_5:.*]] = arith.constant {{.*}} : i32 ! CHECK: %[[VAL_6:.*]] = fir.convert %[[VAL_3]] : (!fir.box>>}>>) -> !fir.box ! CHECK: %[[VAL_7:.*]] = fir.convert %[[VAL_4]] : (!fir.ref>) -> !fir.ref -! CHECK: %[[VAL_8:.*]] = fir.call @_FortranAInitialize(%[[VAL_6]], %[[VAL_7]], %[[VAL_5]]) {{.*}}: (!fir.box, !fir.ref, i32) -> none +! CHECK: fir.call @_FortranAInitialize(%[[VAL_6]], %[[VAL_7]], %[[VAL_5]]) {{.*}}: (!fir.box, !fir.ref, i32) -> () ! CHECK: %[[VAL_9:.*]] = arith.constant 5 : i32 ! CHECK: %[[VAL_10:.*]] = fir.convert %[[VAL_9]] : (i32) -> index ! CHECK: %[[VAL_11:.*]] = arith.constant 15 : i32 diff --git a/flang/test/Lower/forall/forall-where.f90 b/flang/test/Lower/forall/forall-where.f90 index af309e63535fa..b1dd72fdfb4f2 100644 --- a/flang/test/Lower/forall/forall-where.f90 +++ b/flang/test/Lower/forall/forall-where.f90 @@ -380,6 +380,6 @@ end subroutine test_nested_forall_where ! CHECK: } ! CHECK: fir.array_merge_store %[[VAL_248]], %[[VAL_340:.*]] to %[[VAL_0]] : !fir.array}>>, !fir.array}>>, !fir.box}>>> ! CHECK: %[[VAL_341:.*]] = fir.convert %[[VAL_8]] : (!fir.ref>, !fir.heap>>>) -> !fir.llvm_ptr -! CHECK: %[[VAL_342:.*]] = fir.call @_FortranARaggedArrayDeallocate(%[[VAL_341]]) {{.*}}: (!fir.llvm_ptr) -> none +! CHECK: fir.call @_FortranARaggedArrayDeallocate(%[[VAL_341]]) {{.*}}: (!fir.llvm_ptr) -> () ! CHECK: return ! 
CHECK: } diff --git a/flang/test/Lower/goto-statement.f90 b/flang/test/Lower/goto-statement.f90 index f69ed6ba656a2..5591c09da9122 100644 --- a/flang/test/Lower/goto-statement.f90 +++ b/flang/test/Lower/goto-statement.f90 @@ -8,7 +8,7 @@ subroutine sub1() ! CHECK-LABEL: sub1 ! CHECK: cf.br ^[[BB1:.*]] ! CHECK: ^[[BB1]]: -! CHECK: {{.*}} fir.call @_FortranAStopStatement({{.*}}, {{.*}}, {{.*}}) {{.*}}: (i32, i1, i1) -> none +! CHECK: {{.*}} fir.call @_FortranAStopStatement({{.*}}, {{.*}}, {{.*}}) {{.*}}: (i32, i1, i1) -> () ! CHECK: } ! Test multiple goto statements @@ -25,7 +25,7 @@ subroutine sub2() ! CHECK: ^[[BB2]]: ! CHECK: cf.br ^[[BB3:.*]] ! CHECK: ^[[BB3]]: -! CHECK: {{.*}} fir.call @_FortranAStopStatement({{.*}}, {{.*}}, {{.*}}) {{.*}}: (i32, i1, i1) -> none +! CHECK: {{.*}} fir.call @_FortranAStopStatement({{.*}}, {{.*}}, {{.*}}) {{.*}}: (i32, i1, i1) -> () ! CHECK: } ! Test goto which branches to a previous label @@ -36,10 +36,10 @@ subroutine sub3() 3 goto 2 end subroutine ! CHECK: sub3 -! CHECK: {{.*}} fir.call @_FortranAPauseStatement() {{.*}}: () -> none +! CHECK: {{.*}} fir.call @_FortranAPauseStatement() {{.*}}: () -> () ! CHECK: cf.br ^[[BB2:.*]] ! CHECK: ^[[BB1:.*]]: // -! CHECK: {{.*}} fir.call @_FortranAStopStatement({{.*}}, {{.*}}, {{.*}}) {{.*}}: (i32, i1, i1) -> none +! CHECK: {{.*}} fir.call @_FortranAStopStatement({{.*}}, {{.*}}, {{.*}}) {{.*}}: (i32, i1, i1) -> () ! CHECK: ^[[BB2]]: ! CHECK: cf.br ^[[BB1]] ! CHECK: } @@ -55,7 +55,7 @@ subroutine sub4() pause end subroutine ! CHECK-LABEL: sub4 -! CHECK: {{.*}} fir.call @_FortranAPauseStatement() {{.*}}: () -> none +! CHECK: {{.*}} fir.call @_FortranAPauseStatement() {{.*}}: () -> () ! CHECK-NEXT: cf.br ^[[BB1:.*]] ! CHECK-NEXT: ^[[BB1]]: ! 
CHECK-NEXT: cf.br ^[[BB2:.*]] diff --git a/flang/test/Lower/io-statement-big-unit-checks.f90 b/flang/test/Lower/io-statement-big-unit-checks.f90 index 471fe399aee01..2be658c1f76d2 100644 --- a/flang/test/Lower/io-statement-big-unit-checks.f90 +++ b/flang/test/Lower/io-statement-big-unit-checks.f90 @@ -177,7 +177,7 @@ subroutine open_8_error_recovery_1(n, ios) ! CHECK: %[[VAL_20:.*]] = arith.constant false ! CHECK: %[[VAL_21:.*]] = arith.constant false ! CHECK: %[[VAL_22:.*]] = arith.constant false -! CHECK: %[[VAL_23:.*]] = fir.call @_FortranAioEnableHandlers(%[[VAL_17]], %[[VAL_18]], %[[VAL_19]], %[[VAL_20]], %[[VAL_21]], %[[VAL_22]]) {{.*}}: (!fir.ref, i1, i1, i1, i1, i1) -> none +! CHECK: fir.call @_FortranAioEnableHandlers(%[[VAL_17]], %[[VAL_18]], %[[VAL_19]], %[[VAL_20]], %[[VAL_21]], %[[VAL_22]]) {{.*}}: (!fir.ref, i1, i1, i1, i1, i1) -> () ! CHECK: %[[VAL_24:.*]] = fir.call @_FortranAioEndIoStatement(%[[VAL_17]]) {{.*}}: (!fir.ref) -> i32 ! CHECK: fir.result %[[VAL_24]] : i32 ! CHECK: } else { @@ -209,10 +209,10 @@ subroutine open_8_error_recovery_2(n, msg) ! CHECK: %[[VAL_21:.*]] = arith.constant false ! CHECK: %[[VAL_22:.*]] = arith.constant false ! CHECK: %[[VAL_23:.*]] = arith.constant true -! CHECK: %[[VAL_24:.*]] = fir.call @_FortranAioEnableHandlers(%[[VAL_18]], %[[VAL_19]], %[[VAL_20]], %[[VAL_21]], %[[VAL_22]], %[[VAL_23]]) {{.*}}: (!fir.ref, i1, i1, i1, i1, i1) -> none +! CHECK: fir.call @_FortranAioEnableHandlers(%[[VAL_18]], %[[VAL_19]], %[[VAL_20]], %[[VAL_21]], %[[VAL_22]], %[[VAL_23]]) {{.*}}: (!fir.ref, i1, i1, i1, i1, i1) -> () ! CHECK: %[[VAL_25:.*]] = fir.convert %[[VAL_2]]#0 : (!fir.ref>) -> !fir.ref ! CHECK: %[[VAL_26:.*]] = fir.convert %[[VAL_2]]#1 : (index) -> i64 -! CHECK: %[[VAL_27:.*]] = fir.call @_FortranAioGetIoMsg(%[[VAL_18]], %[[VAL_25]], %[[VAL_26]]) {{.*}}: (!fir.ref, !fir.ref, i64) -> none +! CHECK: fir.call @_FortranAioGetIoMsg(%[[VAL_18]], %[[VAL_25]], %[[VAL_26]]) {{.*}}: (!fir.ref, !fir.ref, i64) -> () ! 
CHECK: %[[VAL_28:.*]] = fir.call @_FortranAioEndIoStatement(%[[VAL_18]]) {{.*}}: (!fir.ref) -> i32 ! CHECK: fir.result %[[VAL_28]] : i32 ! CHECK: } else { diff --git a/flang/test/Lower/module_use.f90 b/flang/test/Lower/module_use.f90 index ad43865470b68..b976663239ef5 100644 --- a/flang/test/Lower/module_use.f90 +++ b/flang/test/Lower/module_use.f90 @@ -1,5 +1,6 @@ -! RUN: bbc -emit-fir %S/module_definition.f90 -! RUN: bbc -emit-fir %s -o - | FileCheck %s +! RUN: rm -fr %t && mkdir -p %t +! RUN: bbc -emit-fir -module %t %S/module_definition.f90 +! RUN: bbc -emit-fir -J %t %s -o - | FileCheck %s ! Test use of module data not defined in this file. ! The modules are defined in module_definition.f90 diff --git a/flang/test/Lower/nested-where.f90 b/flang/test/Lower/nested-where.f90 index b1b6367174ebd..ab457280b80ce 100644 --- a/flang/test/Lower/nested-where.f90 +++ b/flang/test/Lower/nested-where.f90 @@ -310,9 +310,9 @@ program nested_where ! CHECK: } ! CHECK: fir.array_merge_store %[[VAL_35]], %[[VAL_277:.*]] to %[[VAL_5]] : !fir.array<3xi32>, !fir.array<3xi32>, !fir.ref> ! CHECK: %[[VAL_278:.*]] = fir.convert %[[VAL_3]] : (!fir.ref>, !fir.heap>>>) -> !fir.llvm_ptr - ! CHECK: %[[VAL_279:.*]] = fir.call @_FortranARaggedArrayDeallocate(%[[VAL_278]]) {{.*}}: (!fir.llvm_ptr) -> none + ! CHECK: fir.call @_FortranARaggedArrayDeallocate(%[[VAL_278]]) {{.*}}: (!fir.llvm_ptr) -> () ! CHECK: %[[VAL_280:.*]] = fir.convert %[[VAL_4]] : (!fir.ref>, !fir.heap>>>) -> !fir.llvm_ptr - ! CHECK: %[[VAL_281:.*]] = fir.call @_FortranARaggedArrayDeallocate(%[[VAL_280]]) {{.*}}: (!fir.llvm_ptr) -> none + ! CHECK: fir.call @_FortranARaggedArrayDeallocate(%[[VAL_280]]) {{.*}}: (!fir.llvm_ptr) -> () integer :: a(3) = 0 logical :: mask1(3) = (/ .true.,.false.,.true. 
/) diff --git a/flang/test/Lower/nullify-polymorphic.f90 b/flang/test/Lower/nullify-polymorphic.f90 index 5cb966810f1b9..99470ad48d272 100644 --- a/flang/test/Lower/nullify-polymorphic.f90 +++ b/flang/test/Lower/nullify-polymorphic.f90 @@ -50,4 +50,4 @@ program test ! CHECK: %[[TYPE_DESC_CAST:.*]] = fir.convert %[[DECLARED_TYPE_DESC]] : (!fir.tdesc>) -> !fir.ref ! CHECK: %[[RANK:.*]] = arith.constant 0 : i32 ! CHECK: %[[CORANK:.*]] = arith.constant 0 : i32 -! CHECK: %{{.*}} = fir.call @_FortranAPointerNullifyDerived(%[[C_DESC_CAST]], %[[TYPE_DESC_CAST]], %[[RANK]], %[[CORANK]]) {{.*}}: (!fir.ref>, !fir.ref, i32, i32) -> none +! CHECK: fir.call @_FortranAPointerNullifyDerived(%[[C_DESC_CAST]], %[[TYPE_DESC_CAST]], %[[RANK]], %[[CORANK]]) {{.*}}: (!fir.ref>, !fir.ref, i32, i32) -> () diff --git a/flang/test/Lower/optional-value-caller.f90 b/flang/test/Lower/optional-value-caller.f90 index 31bf326dd1df1..d3ad5cf85e6b9 100644 --- a/flang/test/Lower/optional-value-caller.f90 +++ b/flang/test/Lower/optional-value-caller.f90 @@ -333,7 +333,7 @@ subroutine test_array_ptr(i) ! CHECK: %[[VAL_20:.*]] = fir.convert %[[VAL_1]] : (!fir.ref>>) -> !fir.ref> ! CHECK: %[[VAL_21:.*]] = fir.convert %[[VAL_7]] : (!fir.box>>) -> !fir.box ! CHECK: %[[VAL_22:.*]] = fir.convert %[[VAL_18]] : (!fir.ref>) -> !fir.ref -! CHECK: %[[VAL_23:.*]] = fir.call @_FortranAAssignTemporary(%[[VAL_20]], %[[VAL_21]], %[[VAL_22]], %[[VAL_19]]) fastmath : (!fir.ref>, !fir.box, !fir.ref, i32) -> none +! CHECK: fir.call @_FortranAAssignTemporary(%[[VAL_20]], %[[VAL_21]], %[[VAL_22]], %[[VAL_19]]) fastmath : (!fir.ref>, !fir.box, !fir.ref, i32) -> () ! CHECK: fir.result %[[VAL_15]] : !fir.heap> ! CHECK: } ! CHECK: fir.result %[[VAL_24:.*]] : !fir.heap> @@ -440,7 +440,7 @@ subroutine test_char_array(c) ! CHECK: %[[VAL_23:.*]] = fir.convert %[[VAL_1]] : (!fir.ref>>>) -> !fir.ref> ! CHECK: %[[VAL_24:.*]] = fir.convert %[[VAL_9]] : (!fir.box>>) -> !fir.box ! 
CHECK: %[[VAL_25:.*]] = fir.convert %[[VAL_21]] : (!fir.ref>) -> !fir.ref -! CHECK: %[[VAL_26:.*]] = fir.call @_FortranAAssignTemporary(%[[VAL_23]], %[[VAL_24]], %[[VAL_25]], %[[VAL_22]]) fastmath : (!fir.ref>, !fir.box, !fir.ref, i32) -> none +! CHECK: fir.call @_FortranAAssignTemporary(%[[VAL_23]], %[[VAL_24]], %[[VAL_25]], %[[VAL_22]]) fastmath : (!fir.ref>, !fir.box, !fir.ref, i32) -> () ! CHECK: fir.result %[[VAL_18]] : !fir.heap>> ! CHECK: } ! CHECK: fir.result %[[VAL_27:.*]] : !fir.heap>> diff --git a/flang/test/Lower/parent-component.f90 b/flang/test/Lower/parent-component.f90 index c6bc53340643f..3cb23f277c9a3 100644 --- a/flang/test/Lower/parent-component.f90 +++ b/flang/test/Lower/parent-component.f90 @@ -192,6 +192,6 @@ subroutine parent_comp_lhs() ! CHECK: fir.store %[[EMBOX_A]] to %[[BOX]] : !fir.ref>> ! CHECK: %[[A_NONE:.*]] = fir.convert %[[BOX]] : (!fir.ref>>) -> !fir.ref> ! CHECK: %[[B_NONE:.*]] = fir.convert %[[EMBOX_B]] : (!fir.box>) -> !fir.box -! CHECK: %{{.*}} = fir.call @_FortranAAssign(%[[A_NONE]], %[[B_NONE]], %{{.*}}, %{{.*}}) {{.*}} : (!fir.ref>, !fir.box, !fir.ref, i32) -> none +! CHECK: fir.call @_FortranAAssign(%[[A_NONE]], %[[B_NONE]], %{{.*}}, %{{.*}}) {{.*}} : (!fir.ref>, !fir.box, !fir.ref, i32) -> () end diff --git a/flang/test/Lower/pointer-association-polymorphic.f90 b/flang/test/Lower/pointer-association-polymorphic.f90 index 6c56db892d1b8..7d166e1423cfa 100644 --- a/flang/test/Lower/pointer-association-polymorphic.f90 +++ b/flang/test/Lower/pointer-association-polymorphic.f90 @@ -87,7 +87,7 @@ subroutine test_pointer() ! CHECK: %[[C1_DESC_LOAD:.*]] = fir.load %[[C1_DESC]] : !fir.ref>>> ! CHECK: %[[P_CONV:.*]] = fir.convert %[[P_DESC]] : (!fir.ref>>>) -> !fir.ref> ! CHECK: %[[C1_DESC_CONV:.*]] = fir.convert %[[C1_DESC_LOAD]] : (!fir.class>>) -> !fir.box -! CHECK: %{{.*}} = fir.call @_FortranAPointerAssociate(%[[P_CONV]], %[[C1_DESC_CONV]]) {{.*}} : (!fir.ref>, !fir.box) -> none +! 
CHECK: fir.call @_FortranAPointerAssociate(%[[P_CONV]], %[[C1_DESC_CONV]]) {{.*}} : (!fir.ref>, !fir.box) -> () ! CHECK: %[[P_DESC_LOAD:.*]] = fir.load %[[P_DESC]] : !fir.ref>>> ! CHECK: %[[P_REBOX:.*]] = fir.rebox %[[P_DESC_LOAD]] : (!fir.class>>) -> !fir.class> ! CHECK: fir.dispatch "proc"(%[[P_DESC_LOAD]] : !fir.class>>) (%[[P_REBOX]] : !fir.class>) {pass_arg_pos = 0 : i32} @@ -95,7 +95,7 @@ subroutine test_pointer() ! CHECK: %[[C2_DESC_LOAD:.*]] = fir.load %[[C2_DESC]] : !fir.ref>>> ! CHECK: %[[P_CONV:.*]] = fir.convert %[[P_DESC]] : (!fir.ref>>>) -> !fir.ref> ! CHECK: %[[C2_DESC_CONV:.*]] = fir.convert %[[C2_DESC_LOAD]] : (!fir.class>>) -> !fir.box -! CHECK: %{{.*}} = fir.call @_FortranAPointerAssociate(%[[P_CONV]], %[[C2_DESC_CONV]]) {{.*}} : (!fir.ref>, !fir.box) -> none +! CHECK: fir.call @_FortranAPointerAssociate(%[[P_CONV]], %[[C2_DESC_CONV]]) {{.*}} : (!fir.ref>, !fir.box) -> () ! CHECK: %[[P_DESC_LOAD:.*]] = fir.load %[[P_DESC]] : !fir.ref>>> ! CHECK: %[[P_REBOX:.*]] = fir.rebox %[[P_DESC_LOAD]] : (!fir.class>>) -> !fir.class> ! CHECK: fir.dispatch "proc"(%[[P_DESC_LOAD]] : !fir.class>>) (%[[P_REBOX]] : !fir.class>) {pass_arg_pos = 0 : i32} @@ -110,7 +110,7 @@ subroutine test_pointer() ! CHECK: %[[C3_EMBOX:.*]] = fir.embox %[[C3_COORD]] source_box %[[C3_LOAD]] : (!fir.ref>, !fir.class>>>) -> !fir.class> ! CHECK: %[[P_CONV:.*]] = fir.convert %[[P_DESC]] : (!fir.ref>>>) -> !fir.ref> ! CHECK: %[[C3_EMBOX_CONV:.*]] = fir.convert %[[C3_EMBOX]] : (!fir.class>) -> !fir.box -! CHECK: %{{.*}} = fir.call @_FortranAPointerAssociate(%[[P_CONV]], %[[C3_EMBOX_CONV]]) {{.*}} : (!fir.ref>, !fir.box) -> none +! CHECK: fir.call @_FortranAPointerAssociate(%[[P_CONV]], %[[C3_EMBOX_CONV]]) {{.*}} : (!fir.ref>, !fir.box) -> () ! CHECK: %[[P_DESC_LOAD:.*]] = fir.load %[[P_DESC]] : !fir.ref>>> ! CHECK: %[[P_REBOX:.*]] = fir.rebox %[[P_DESC_LOAD]] : (!fir.class>>) -> !fir.class> ! 
CHECK: fir.dispatch "proc"(%[[P_DESC_LOAD]] : !fir.class>>) (%[[P_REBOX]] : !fir.class>) {pass_arg_pos = 0 : i32} @@ -125,7 +125,7 @@ subroutine test_pointer() ! CHECK: %[[C4_EMBOX:.*]] = fir.embox %[[C4_COORD]] source_box %[[C4_LOAD]] : (!fir.ref>, !fir.class>>>) -> !fir.class> ! CHECK: %[[P_CONV:.*]] = fir.convert %[[P_DESC]] : (!fir.ref>>>) -> !fir.ref> ! CHECK: %[[C4_EMBOX_CONV:.*]] = fir.convert %[[C4_EMBOX]] : (!fir.class>) -> !fir.box -! CHECK: %{{.*}} = fir.call @_FortranAPointerAssociate(%[[P_CONV]], %[[C4_EMBOX_CONV]]) {{.*}} : (!fir.ref>, !fir.box) -> none +! CHECK: fir.call @_FortranAPointerAssociate(%[[P_CONV]], %[[C4_EMBOX_CONV]]) {{.*}} : (!fir.ref>, !fir.box) -> () ! CHECK: %[[P_DESC_LOAD:.*]] = fir.load %[[P_DESC]] : !fir.ref>>> ! CHECK: %[[P_REBOX:.*]] = fir.rebox %[[P_DESC_LOAD]] : (!fir.class>>) -> !fir.class> ! CHECK: fir.dispatch "proc"(%[[P_DESC_LOAD]] : !fir.class>>) (%[[P_REBOX]] : !fir.class>) {pass_arg_pos = 0 : i32} @@ -134,7 +134,7 @@ subroutine test_pointer() ! CHECK: %[[C3_REBOX:.*]] = fir.rebox %[[C3_LOAD]](%{{.*}}) : (!fir.class>>>, !fir.shift<1>) -> !fir.class>> ! CHECK: %[[PA_CONV:.*]] = fir.convert %[[PA_DESC]] : (!fir.ref>>>>) -> !fir.ref> ! CHECK: %[[C3_REBOX_CONV:.*]] = fir.convert %[[C3_REBOX]] : (!fir.class>>) -> !fir.box -! CHECK: %{{.*}} = fir.call @_FortranAPointerAssociate(%[[PA_CONV]], %[[C3_REBOX_CONV]]) {{.*}} : (!fir.ref>, !fir.box) -> none +! CHECK: fir.call @_FortranAPointerAssociate(%[[PA_CONV]], %[[C3_REBOX_CONV]]) {{.*}} : (!fir.ref>, !fir.box) -> () ! CHECK-LABEL: fir.do_loop ! CHECK: %[[PA_LOAD:.*]] = fir.load %[[PA_DESC]] : !fir.ref>>>> ! CHECK: %[[PA_COORD:.*]] = fir.coordinate_of %[[PA_LOAD]], %{{.*}} : (!fir.class>>>, i64) -> !fir.ref> @@ -145,7 +145,7 @@ subroutine test_pointer() ! CHECK: %[[C4_REBOX:.*]] = fir.rebox %[[C4_LOAD]](%{{.*}}) : (!fir.class>>>, !fir.shift<1>) -> !fir.class>> ! CHECK: %[[PA_CONV:.*]] = fir.convert %[[PA_DESC]] : (!fir.ref>>>>) -> !fir.ref> ! 
CHECK: %[[C4_REBOX_CONV:.*]] = fir.convert %[[C4_REBOX]] : (!fir.class>>) -> !fir.box -! CHECK: %{{.*}} = fir.call @_FortranAPointerAssociate(%[[PA_CONV]], %[[C4_REBOX_CONV]]) {{.*}} : (!fir.ref>, !fir.box) -> none +! CHECK: fir.call @_FortranAPointerAssociate(%[[PA_CONV]], %[[C4_REBOX_CONV]]) {{.*}} : (!fir.ref>, !fir.box) -> () ! CHECK-LABEL: fir.do_loop ! CHECK: %[[PA_LOAD:.*]] = fir.load %[[PA_DESC]] : !fir.ref>>>> ! CHECK: %[[PA_COORD:.*]] = fir.coordinate_of %[[PA_LOAD]], %{{.*}} : (!fir.class>>>, i64) -> !fir.ref> @@ -166,7 +166,7 @@ subroutine test_pointer() ! CHECK: %[[SLICE_REBOX:.*]] = fir.rebox %[[C4_LOAD]](%[[SHIFT]]) [%[[SLICE]]] : (!fir.class>>>, !fir.shift<1>, !fir.slice<1>) -> !fir.class>> ! CHECK: %[[PA_CONV:.*]] = fir.convert %[[PA_DESC]] : (!fir.ref>>>>) -> !fir.ref> ! CHECK: %[[SLICE_REBOX_CONV:.*]] = fir.convert %[[SLICE_REBOX]] : (!fir.class>>) -> !fir.box -! CHECK: %{{.*}} = fir.call @_FortranAPointerAssociate(%[[PA_CONV]], %[[SLICE_REBOX_CONV]]) {{.*}} : (!fir.ref>, !fir.box) -> none +! CHECK: fir.call @_FortranAPointerAssociate(%[[PA_CONV]], %[[SLICE_REBOX_CONV]]) {{.*}} : (!fir.ref>, !fir.box) -> () ! CHECK-LABEL: fir.do_loop ! CHECK: %[[PA_LOAD:.*]] = fir.load %[[PA_DESC]] : !fir.ref>>>> ! CHECK: %[[PA_COORD:.*]] = fir.coordinate_of %[[PA_LOAD]], %{{.*}} : (!fir.class>>>, i64) -> !fir.ref> diff --git a/flang/test/Lower/pointer-disassociate.f90 b/flang/test/Lower/pointer-disassociate.f90 index e341bca5cd89b..fb70fd7795b2e 100644 --- a/flang/test/Lower/pointer-disassociate.f90 +++ b/flang/test/Lower/pointer-disassociate.f90 @@ -118,7 +118,7 @@ subroutine test_polymorphic_null(p) ! CHECK: %[[VAL_3:.*]] = fir.convert %[[VAL_1]] : (!fir.tdesc>) -> !fir.ref ! CHECK: %[[VAL_4:.*]] = arith.constant 1 : i32 ! CHECK: %[[VAL_5:.*]] = arith.constant 0 : i32 -! CHECK: %[[VAL_6:.*]] = fir.call @_FortranAPointerNullifyDerived(%[[VAL_2]], %[[VAL_3]], %[[VAL_4]], %[[VAL_5]]) {{.*}}: (!fir.ref>, !fir.ref, i32, i32) -> none +! 
CHECK: fir.call @_FortranAPointerNullifyDerived(%[[VAL_2]], %[[VAL_3]], %[[VAL_4]], %[[VAL_5]]) {{.*}}: (!fir.ref>, !fir.ref, i32, i32) -> () subroutine test_unlimited_polymorphic_null(p) class(*), pointer :: p(:) diff --git a/flang/test/Lower/polymorphic-temp.f90 b/flang/test/Lower/polymorphic-temp.f90 index 8633620e8430e..5e2937e1f5f65 100644 --- a/flang/test/Lower/polymorphic-temp.f90 +++ b/flang/test/Lower/polymorphic-temp.f90 @@ -46,7 +46,7 @@ subroutine test_temp_from_intrinsic_spread() ! CHECK: %[[RES_BOX_NONE:.*]] = fir.convert %[[TEMP_RES0]] : (!fir.ref>>>) -> !fir.ref> ! CHECK: %[[P_BOX_NONE:.*]] = fir.convert %[[LOAD_P]] : (!fir.class>) -> !fir.box ! CHECK: %[[C2_I64:.*]] = fir.convert %[[C2]] : (i32) -> i64 -! CHECK: %{{.*}} = fir.call @_FortranASpread(%[[RES_BOX_NONE]], %[[P_BOX_NONE]], %[[C1]], %[[C2_I64]], %{{.*}}, %{{.*}}) fastmath : (!fir.ref>, !fir.box, i32, i64, !fir.ref, i32) -> none +! CHECK: fir.call @_FortranASpread(%[[RES_BOX_NONE]], %[[P_BOX_NONE]], %[[C1]], %[[C2_I64]], %{{.*}}, %{{.*}}) fastmath : (!fir.ref>, !fir.box, i32, i64, !fir.ref, i32) -> () ! CHECK: %[[LOAD_RES:.*]] = fir.load %[[TEMP_RES0]] : !fir.ref>>> ! CHECK: %[[RES_ADDR:.*]] = fir.box_addr %[[LOAD_RES]] : (!fir.class>>) -> !fir.heap> ! CHECK: %[[REBOX:.*]] = fir.rebox %[[LOAD_RES]] : (!fir.class>>) -> !fir.class> @@ -57,7 +57,7 @@ subroutine test_temp_from_intrinsic_spread() ! CHECK: %[[EMBOX_PA_1:.*]] = fir.embox %[[COORD_PA_1]] source_box %[[LOAD_PA]] : (!fir.ref, !fir.class>>) -> !fir.class ! CHECK: %[[RES1_BOX_NONE:.*]] = fir.convert %[[TEMP_RES1]] : (!fir.ref>>>) -> !fir.ref> ! CHECK: %[[PA1_BOX_NONE:.*]] = fir.convert %[[EMBOX_PA_1]] : (!fir.class) -> !fir.box -! CHECK: %{{.*}} = fir.call @_FortranASpread(%[[RES1_BOX_NONE]], %[[PA1_BOX_NONE]], %{{.*}}, %{{.*}}, %{{.*}}, %{{.*}}) {{.*}} : (!fir.ref>, !fir.box, i32, i64, !fir.ref, i32) -> none +! 
CHECK: fir.call @_FortranASpread(%[[RES1_BOX_NONE]], %[[PA1_BOX_NONE]], %{{.*}}, %{{.*}}, %{{.*}}, %{{.*}}) {{.*}} : (!fir.ref>, !fir.box, i32, i64, !fir.ref, i32) -> () subroutine test_temp_from_intrinsic_reshape(i) class(*), allocatable :: a(:,:) @@ -73,12 +73,12 @@ subroutine test_temp_from_intrinsic_reshape(i) ! CHECK: fir.store %[[EMBOX_WITH_SOURCE]] to %[[TMP_RES]] : !fir.ref>>> ! CHECK: %[[RES_BOX_NONE:.*]] = fir.convert %[[TMP_RES]] : (!fir.ref>>>) -> !fir.ref> ! CHECK: %[[I_BOX_NONE:.*]] = fir.convert %[[I]] : (!fir.class>) -> !fir.box -! CHECK: %{{.*}} = fir.call @_FortranAReshape(%[[RES_BOX_NONE]], %[[I_BOX_NONE]], %{{.*}}, %{{.*}}, %{{.*}}, %{{.*}}, %{{.*}}) {{.*}} : (!fir.ref>, !fir.box, !fir.box, !fir.box, !fir.box, !fir.ref, i32) -> none +! CHECK: fir.call @_FortranAReshape(%[[RES_BOX_NONE]], %[[I_BOX_NONE]], %{{.*}}, %{{.*}}, %{{.*}}, %{{.*}}, %{{.*}}) {{.*}} : (!fir.ref>, !fir.box, !fir.box, !fir.box, !fir.box, !fir.ref, i32) -> () ! CHECK: %[[LOAD_RES:.*]] = fir.load %[[TMP_RES]] : !fir.ref>>> ! CHECK: %[[RANK:.*]] = arith.constant 2 : i32 ! CHECK: %[[A_BOX_NONE:.*]] = fir.convert %[[A]] : (!fir.ref>>>) -> !fir.ref> ! CHECK: %[[RES_BOX_NONE:.*]] = fir.convert %[[LOAD_RES]] : (!fir.class>>) -> !fir.box -! CHECK: %{{.*}} = fir.call @_FortranAAllocatableApplyMold(%[[A_BOX_NONE]], %[[RES_BOX_NONE]], %[[RANK]]) {{.*}} : (!fir.ref>, !fir.box, i32) -> none +! CHECK: fir.call @_FortranAAllocatableApplyMold(%[[A_BOX_NONE]], %[[RES_BOX_NONE]], %[[RANK]]) {{.*}} : (!fir.ref>, !fir.box, i32) -> () subroutine check(r) class(p1) :: r(:) @@ -100,7 +100,7 @@ subroutine test_temp_from_intrinsic_pack(i, mask) ! CHECK: %[[RES_BOX_NONE:.*]] = fir.convert %[[TMP_RES]] : (!fir.ref>>>>) -> !fir.ref> ! CHECK: %[[I_BOX_NONE:.*]] = fir.convert %[[I]] : (!fir.class>>) -> !fir.box ! CHECK: %[[MASK_BOX_NONE:.*]] = fir.convert %[[EMBOXED_MASK]] : (!fir.box>>) -> !fir.box -! 
CHECK: %{{.*}} = fir.call @_FortranAPack(%[[RES_BOX_NONE]], %[[I_BOX_NONE]], %[[MASK_BOX_NONE]], %{{.*}}, %{{.*}}, %{{.*}}) {{.*}} : (!fir.ref>, !fir.box, !fir.box, !fir.box, !fir.ref, i32) -> none +! CHECK: fir.call @_FortranAPack(%[[RES_BOX_NONE]], %[[I_BOX_NONE]], %[[MASK_BOX_NONE]], %{{.*}}, %{{.*}}, %{{.*}}) {{.*}} : (!fir.ref>, !fir.box, !fir.box, !fir.box, !fir.ref, i32) -> () subroutine check_rank2(r) class(p1), intent(in) :: r(:,:) @@ -119,7 +119,7 @@ subroutine test_temp_from_unpack(v, m, f) ! CHECK: %[[V_BOX_NONE:.*]] = fir.convert %[[V]] : (!fir.class>>) -> !fir.box ! CHECK: %[[M_BOX_NONE:.*]] = fir.convert %[[M]] : (!fir.box>>) -> !fir.box ! CHECK: %[[F_BOX_NONE:.*]] = fir.convert %[[F]] : (!fir.class>>) -> !fir.box -! CHECK: %{{.*}} = fir.call @_FortranAUnpack(%[[TMP_BOX_NONE]], %[[V_BOX_NONE]], %[[M_BOX_NONE]], %[[F_BOX_NONE]], %{{.*}}, %{{.*}}) {{.*}} : (!fir.ref>, !fir.box, !fir.box, !fir.box, !fir.ref, i32) -> none +! CHECK: fir.call @_FortranAUnpack(%[[TMP_BOX_NONE]], %[[V_BOX_NONE]], %[[M_BOX_NONE]], %[[F_BOX_NONE]], %{{.*}}, %{{.*}}) {{.*}} : (!fir.ref>, !fir.box, !fir.box, !fir.box, !fir.ref, i32) -> () subroutine check_cshift(r) class(p1) :: r(:) @@ -139,7 +139,7 @@ subroutine test_temp_from_intrinsic_cshift(a, shift) ! CHECK: %[[RES_BOX_NONE:.*]] = fir.convert %[[TMP_RES]] : (!fir.ref>>>>) -> !fir.ref> ! CHECK: %[[ARRAY_NONE:.*]] = fir.convert %[[ARRAY]] : (!fir.class>>) -> !fir.box ! CHECK: %[[SHIFT_I64:.*]] = fir.convert %[[LOAD_SHIFT]] : (i32) -> i64 -! CHECK: %{{.*}} = fir.call @_FortranACshiftVector(%[[RES_BOX_NONE]], %[[ARRAY_NONE]], %[[SHIFT_I64]], %{{.*}}, %{{.*}}) {{.*}} : (!fir.ref>, !fir.box, i64, !fir.ref, i32) -> none +! 
CHECK: fir.call @_FortranACshiftVector(%[[RES_BOX_NONE]], %[[ARRAY_NONE]], %[[SHIFT_I64]], %{{.*}}, %{{.*}}) {{.*}} : (!fir.ref>, !fir.box, i64, !fir.ref, i32) -> () subroutine test_temp_from_intrinsic_eoshift(a, shift, b) class(p1), intent(in) :: a(20) @@ -157,7 +157,7 @@ subroutine test_temp_from_intrinsic_eoshift(a, shift, b) ! CHECK: %[[ARRAY_NONE:.*]] = fir.convert %[[ARRAY]] : (!fir.class>>) -> !fir.box ! CHECK: %[[SHIFT_I64:.*]] = fir.convert %[[LOAD_SHIFT]] : (i32) -> i64 ! CHECK: %[[BOUNDARY_NONE:.*]] = fir.convert %[[BOUNDARY]] : (!fir.class>) -> !fir.box -! CHECK: %{{.*}} = fir.call @_FortranAEoshiftVector(%[[RES_BOX_NONE]], %[[ARRAY_NONE]], %[[SHIFT_I64]], %[[BOUNDARY_NONE]], %{{.*}}, %{{.*}}) {{.*}} : (!fir.ref>, !fir.box, i64, !fir.box, !fir.ref, i32) -> none +! CHECK: fir.call @_FortranAEoshiftVector(%[[RES_BOX_NONE]], %[[ARRAY_NONE]], %[[SHIFT_I64]], %[[BOUNDARY_NONE]], %{{.*}}, %{{.*}}) {{.*}} : (!fir.ref>, !fir.box, i64, !fir.box, !fir.ref, i32) -> () subroutine test_temp_from_intrinsic_transfer(source, mold) class(p1), intent(in) :: source(:) @@ -171,7 +171,7 @@ subroutine test_temp_from_intrinsic_transfer(source, mold) ! CHECK: %[[RES_BOX_NONE:.*]] = fir.convert %[[TMP_RES]] : (!fir.ref>>>>) -> !fir.ref> ! CHECK: %[[SOURCE_NONE:.*]] = fir.convert %[[SOURCE]] : (!fir.class>>) -> !fir.box ! CHECK: %[[MOLD_NONE:.*]] = fir.convert %[[MOLD]] : (!fir.class>>) -> !fir.box -! CHECK: %{{.*}} = fir.call @_FortranATransfer(%[[RES_BOX_NONE]], %[[SOURCE_NONE]], %[[MOLD_NONE]], %{{.*}}, %{{.*}}) {{.*}} : (!fir.ref>, !fir.box, !fir.box, !fir.ref, i32) -> none +! CHECK: fir.call @_FortranATransfer(%[[RES_BOX_NONE]], %[[SOURCE_NONE]], %[[MOLD_NONE]], %{{.*}}, %{{.*}}) {{.*}} : (!fir.ref>, !fir.box, !fir.box, !fir.ref, i32) -> () subroutine test_temp_from_intrinsic_transpose(matrix) class(p1), intent(in) :: matrix(:,:) @@ -183,7 +183,7 @@ subroutine test_temp_from_intrinsic_transpose(matrix) ! CHECK: %[[TMP_RES:.*]] = fir.alloca !fir.class>>> ! 
CHECK: %[[RES_BOX_NONE:.*]] = fir.convert %[[TMP_RES]] : (!fir.ref>>>>) -> !fir.ref> ! CHECK: %[[MATRIX_NONE:.*]] = fir.convert %[[MATRIX]] : (!fir.class>>) -> !fir.box -! CHECK: %{{.*}} = fir.call @_FortranATranspose(%[[RES_BOX_NONE]], %[[MATRIX_NONE]], %{{.*}}, %{{.*}}) {{.*}} : (!fir.ref>, !fir.box, !fir.ref, i32) -> none +! CHECK: fir.call @_FortranATranspose(%[[RES_BOX_NONE]], %[[MATRIX_NONE]], %{{.*}}, %{{.*}}) {{.*}} : (!fir.ref>, !fir.box, !fir.ref, i32) -> () subroutine check_scalar(a) class(p1), intent(in) :: a diff --git a/flang/test/Lower/polymorphic.f90 b/flang/test/Lower/polymorphic.f90 index 8c40c91bc3baa..73603d7ee7bee 100644 --- a/flang/test/Lower/polymorphic.f90 +++ b/flang/test/Lower/polymorphic.f90 @@ -211,7 +211,7 @@ subroutine associate_up_pointer(r) ! CHECK: %[[REBOX_RP:.*]] = fir.rebox %[[LOAD_RP]](%{{.*}}) : (!fir.box>>, !fir.shift<1>) -> !fir.box> ! CHECK: %[[CONV_P:.*]] = fir.convert %[[P]] : (!fir.ref>>>) -> !fir.ref> ! CHECK: %[[RP_BOX_NONE:.*]] = fir.convert %[[REBOX_RP]] : (!fir.box>) -> !fir.box -! CHECK: %{{.*}} = fir.call @_FortranAPointerAssociate(%[[CONV_P]], %[[RP_BOX_NONE]]) {{.*}} : (!fir.ref>, !fir.box) -> none +! CHECK: fir.call @_FortranAPointerAssociate(%[[CONV_P]], %[[RP_BOX_NONE]]) {{.*}} : (!fir.ref>, !fir.box) -> () ! CHECK: return ! Test that the fir.dispatch operation is created with the correct pass object @@ -315,7 +315,7 @@ subroutine nullify_pointer_array(a) ! CHECK: %[[CONV_TDESC:.*]] = fir.convert %[[TYPE_DESC]] : (!fir.tdesc>>>}>>) -> !fir.ref ! CHECK: %[[C1:.*]] = arith.constant 1 : i32 ! CHECK: %[[C0:.*]] = arith.constant 0 : i32 -! CHECK: %{{.*}} = fir.call @_FortranAPointerNullifyDerived(%[[CONV_P]], %[[CONV_TDESC]], %[[C1]], %[[C0]]) {{.*}} : (!fir.ref>, !fir.ref, i32, i32) -> none +! 
CHECK: fir.call @_FortranAPointerNullifyDerived(%[[CONV_P]], %[[CONV_TDESC]], %[[C1]], %[[C0]]) {{.*}} : (!fir.ref>, !fir.ref, i32, i32) -> () subroutine up_input(a) class(*), intent(in) :: a @@ -400,7 +400,7 @@ subroutine assign_polymorphic_allocatable() ! CHECK: %[[BOXED_T:.*]] = fir.embox %[[T]](%[[SHAPE]]) : (!fir.ref>>, !fir.shape<2>) -> !fir.box>> ! CHECK: %[[CONV_C:.*]] = fir.convert %[[C]] : (!fir.ref>>>>) -> !fir.ref> ! CHECK: %[[CONV_BOXED_T:.*]] = fir.convert %[[BOXED_T]] : (!fir.box>>) -> !fir.box -! CHECK: %{{.*}} = fir.call @_FortranAAssignPolymorphic(%[[CONV_C]], %[[CONV_BOXED_T]], %{{.*}}, %{{.*}}) fastmath : (!fir.ref>, !fir.box, !fir.ref, i32) -> none +! CHECK: fir.call @_FortranAAssignPolymorphic(%[[CONV_C]], %[[CONV_BOXED_T]], %{{.*}}, %{{.*}}) fastmath : (!fir.ref>, !fir.box, !fir.ref, i32) -> () ! CHECK: return subroutine pointer_assign_remap() @@ -436,7 +436,7 @@ subroutine pointer_assign_remap() ! CHECK: %[[ARG0:.*]] = fir.convert %[[P]] : (!fir.ref>>>>) -> !fir.ref> ! CHECK: %[[ARG1:.*]] = fir.convert %[[REBOX_A]] : (!fir.class>>) -> !fir.box ! CHECK: %[[ARG2:.*]] = fir.convert %[[BOXED_BOUND_ARRAY]] : (!fir.box>) -> !fir.box -! CHECK: %{{.*}} = fir.call @_FortranAPointerAssociateRemapping(%[[ARG0]], %[[ARG1]], %[[ARG2]], %{{.*}}, %{{.*}}) {{.*}} : (!fir.ref>, !fir.box, !fir.box, !fir.ref, i32) -> none +! CHECK: fir.call @_FortranAPointerAssociateRemapping(%[[ARG0]], %[[ARG1]], %[[ARG2]], %{{.*}}, %{{.*}}) {{.*}} : (!fir.ref>, !fir.box, !fir.box, !fir.ref, i32) -> () ! CHECK: %[[C0:.*]] = arith.constant 0 : i64 ! CHECK: %[[C99:.*]] = arith.constant 99 : i64 @@ -454,7 +454,7 @@ subroutine pointer_assign_remap() ! CHECK: %[[ARG0:.*]] = fir.convert %[[Q]] : (!fir.ref>>>>) -> !fir.ref> ! CHECK: %[[ARG1:.*]] = fir.convert %[[REBOX_A]] : (!fir.class>>) -> !fir.box ! CHECK: %[[ARG2:.*]] = fir.convert %[[BOXED_BOUND_ARRAY]] : (!fir.box>) -> !fir.box -! 
CHECK: %{{.*}} = fir.call @_FortranAPointerAssociateRemapping(%[[ARG0]], %[[ARG1]], %[[ARG2]], %{{.*}}, %{{.*}}) {{.*}} : (!fir.ref>, !fir.box, !fir.box, !fir.ref, i32) -> none +! CHECK: fir.call @_FortranAPointerAssociateRemapping(%[[ARG0]], %[[ARG1]], %[[ARG2]], %{{.*}}, %{{.*}}) {{.*}} : (!fir.ref>, !fir.box, !fir.box, !fir.ref, i32) -> () subroutine pointer_assign_lower_bounds() class(p1), allocatable, target :: a(:) @@ -467,7 +467,7 @@ subroutine pointer_assign_lower_bounds() ! CHECK: %[[A:.*]] = fir.alloca !fir.class>>> {bindc_name = "a", fir.target, uniq_name = "_QMpolymorphic_testFpointer_assign_lower_boundsEa"} ! CHECK: %[[P:.*]] = fir.alloca !fir.class>>> {bindc_name = "p", uniq_name = "_QMpolymorphic_testFpointer_assign_lower_boundsEp"} ! CHECK: %[[LB:.*]] = arith.constant -50 : i64 -! CHECK: %[[REBOX_A:.*]] = fir.rebox %21(%23) : (!fir.class>>>, !fir.shift<1>) -> !fir.class>> +! CHECK: %[[REBOX_A:.*]] = fir.rebox %{{.*}}(%{{.*}}) : (!fir.class>>>, !fir.shift<1>) -> !fir.class>> ! CHECK: %[[LBOUND_ARRAY:.*]] = fir.alloca !fir.array<1xi64> ! CHECK: %[[ARRAY:.*]] = fir.undefined !fir.array<1xi64> ! CHECK: %[[ARRAY0:.*]] = fir.insert_value %[[ARRAY]], %[[LB]], [0 : index] : (!fir.array<1xi64>, i64) -> !fir.array<1xi64> @@ -478,7 +478,7 @@ subroutine pointer_assign_lower_bounds() ! CHECK: %[[P_BOX_NONE:.*]] = fir.convert %[[P]] : (!fir.ref>>>>) -> !fir.ref> ! CHECK: %[[A_BOX_NONE:.*]] = fir.convert %[[REBOX_A]] : (!fir.class>>) -> !fir.box ! CHECK: %[[LBOUNDS_BOX_NONE:.*]] = fir.convert %[[LBOUND_ARRAY_BOXED]] : (!fir.box>) -> !fir.box -! CHECK: %{{.*}} = fir.call @_FortranAPointerAssociateLowerBounds(%[[P_BOX_NONE]], %[[A_BOX_NONE]], %[[LBOUNDS_BOX_NONE]]) {{.*}} : (!fir.ref>, !fir.box, !fir.box) -> none +! 
CHECK: fir.call @_FortranAPointerAssociateLowerBounds(%[[P_BOX_NONE]], %[[A_BOX_NONE]], %[[LBOUNDS_BOX_NONE]]) {{.*}} : (!fir.ref>, !fir.box, !fir.box) -> () subroutine test_elemental_assign() type(p1) :: pa(3) @@ -501,7 +501,7 @@ subroutine test_elemental_assign() ! CHECK: %[[DO_RES:.*]] = fir.do_loop %[[ARG0:.*]] = %[[C0]] to %[[UB]] step %[[C1]] unordered iter_args(%[[ARG1:.*]] = %[[LOAD_PA]]) -> (!fir.array<3x!fir.type<_QMpolymorphic_testTp1{a:i32,b:i32}>>) { ! CHECK: %[[FETCH_INT:.*]] = fir.array_fetch %[[LOAD_INT_ARRAY]], %[[ARG0]] : (!fir.array<3xi32>, index) -> i32 ! CHECK: %[[ARRAY_MOD:.*]]:2 = fir.array_modify %[[ARG1]], %[[ARG0]] : (!fir.array<3x!fir.type<_QMpolymorphic_testTp1{a:i32,b:i32}>>, index) -> (!fir.ref>, !fir.array<3x!fir.type<_QMpolymorphic_testTp1{a:i32,b:i32}>>) -! CHECK: %[[EMBOXED:.*]] = fir.embox %10#0 : (!fir.ref>) -> !fir.class> +! CHECK: %[[EMBOXED:.*]] = fir.embox %{{.*}}#0 : (!fir.ref>) -> !fir.class> ! CHECK: fir.store %[[FETCH_INT]] to %[[INT]] : !fir.ref ! CHECK: fir.call @_QMpolymorphic_testPassign_p1_int(%[[EMBOXED]], %[[INT]]) fastmath : (!fir.class>, !fir.ref) -> () ! CHECK: fir.result %[[ARRAY_MOD]]#1 : !fir.array<3x!fir.type<_QMpolymorphic_testTp1{a:i32,b:i32}>> @@ -791,9 +791,9 @@ subroutine test_unlimited_polymorphic_intentout(a) ! CHECK-LABEL: func.func @_QMpolymorphic_testPtest_unlimited_polymorphic_intentout( ! CHECK-SAME: %[[ARG0:.*]]: !fir.class {fir.bindc_name = "a"}) { ! CHECK: %[[BOX_NONE:.*]] = fir.convert %[[ARG0]] : (!fir.class) -> !fir.box -! CHECK: %{{.*}} = fir.call @_FortranADestroy(%[[BOX_NONE]]) {{.*}} : (!fir.box) -> none +! CHECK: fir.call @_FortranADestroy(%[[BOX_NONE]]) {{.*}} : (!fir.box) -> () ! CHECK: %[[BOX_NONE:.*]] = fir.convert %[[ARG0]] : (!fir.class) -> !fir.box -! CHECK: %{{.*}} = fir.call @_FortranAInitialize(%[[BOX_NONE]], %{{.*}}, %{{.*}}) {{.*}} : (!fir.box, !fir.ref, i32) -> none +! 
CHECK: fir.call @_FortranAInitialize(%[[BOX_NONE]], %{{.*}}, %{{.*}}) {{.*}} : (!fir.box, !fir.ref, i32) -> () subroutine test_polymorphic_intentout(a) class(p1), intent(out) :: a @@ -802,9 +802,9 @@ subroutine test_polymorphic_intentout(a) ! CHECK-LABEL: func.func @_QMpolymorphic_testPtest_polymorphic_intentout( ! CHECK-SAME: %[[ARG0:.*]]: !fir.class> {fir.bindc_name = "a"}) { ! CHECK: %[[BOX_NONE:.*]] = fir.convert %[[ARG0]] : (!fir.class>) -> !fir.box -! CHECK: %{{.*}} = fir.call @_FortranADestroy(%[[BOX_NONE]]) {{.*}} : (!fir.box) -> none +! CHECK: fir.call @_FortranADestroy(%[[BOX_NONE]]) {{.*}} : (!fir.box) -> () ! CHECK: %[[BOX_NONE:.*]] = fir.convert %[[ARG0]] : (!fir.class>) -> !fir.box -! CHECK: %{{.*}} = fir.call @_FortranAInitialize(%[[BOX_NONE]], %{{.*}}, %{{.*}}) {{.*}} : (!fir.box, !fir.ref, i32) -> none +! CHECK: fir.call @_FortranAInitialize(%[[BOX_NONE]], %{{.*}}, %{{.*}}) {{.*}} : (!fir.box, !fir.ref, i32) -> () subroutine rebox_up_to_record_type(p) class(*), allocatable, target :: p(:,:) @@ -944,7 +944,7 @@ subroutine test_rhs_assign(a) ! CHECK: %[[LOAD_RES:.*]] = fir.load %[[RES]] : !fir.ref>>> ! CHECK: %[[A_NONE:.*]] = fir.convert %[[A]] : (!fir.box>) -> !fir.ref> ! CHECK: %[[RES_NONE:.*]] = fir.convert %[[LOAD_RES]] : (!fir.class>>) -> !fir.box -! CHECK: %{{.*}} = fir.call @_FortranAAssign(%[[A_NONE]], %[[RES_NONE]], %{{.*}}, %{{.*}}) {{.*}} : (!fir.ref>, !fir.box, !fir.ref, i32) -> none +! CHECK: fir.call @_FortranAAssign(%[[A_NONE]], %[[RES_NONE]], %{{.*}}, %{{.*}}) {{.*}} : (!fir.ref>, !fir.box, !fir.ref, i32) -> () subroutine type_with_polymorphic_components(a, b) type(p4) :: a, b @@ -959,7 +959,7 @@ subroutine type_with_polymorphic_components(a, b) ! CHECK: fir.store %[[EMBOX_A]] to %[[ALLOCA]] : !fir.ref>>>}>>> ! CHECK: %[[BOX_NONE1:.*]] = fir.convert %[[ALLOCA]] : (!fir.ref>>>}>>>) -> !fir.ref> ! CHECK: %[[BOX_NONE2:.*]] = fir.convert %[[EMBOX_B]] : (!fir.box>>>}>>) -> !fir.box -! 
CHECK: %{{.*}} = fir.call @_FortranAAssign(%[[BOX_NONE1]], %[[BOX_NONE2]], %{{.*}}, %{{.*}}) {{.*}} : (!fir.ref>, !fir.box, !fir.ref, i32) -> none +! CHECK: fir.call @_FortranAAssign(%[[BOX_NONE1]], %[[BOX_NONE2]], %{{.*}}, %{{.*}}) {{.*}} : (!fir.ref>, !fir.box, !fir.ref, i32) -> () subroutine up_pointer(p) class(*), pointer, intent(in) :: p @@ -1015,7 +1015,7 @@ subroutine test_parent_comp_in_select_type(s) ! CHECK: %[[LOAD_P:.*]] = fir.load %[[P]] : !fir.ref>>> ! CHECK: %[[LHS_CONV:.*]] = fir.convert %[[REBOX_P1]] : (!fir.box>) -> !fir.ref> ! CHECK: %[[RHS_CONV:.*]] = fir.convert %[[LOAD_P]] : (!fir.class>>) -> !fir.box -! CHECK: %{{.*}} = fir.call @_FortranAAssign(%[[LHS_CONV]], %[[RHS_CONV]], %{{.*}}, %{{.*}}) {{.*}} : (!fir.ref>, !fir.box, !fir.ref, i32) -> none +! CHECK: fir.call @_FortranAAssign(%[[LHS_CONV]], %[[RHS_CONV]], %{{.*}}, %{{.*}}) {{.*}} : (!fir.ref>, !fir.box, !fir.ref, i32) -> () subroutine move_alloc_unlimited_poly(a, b) class(*), allocatable :: a, b @@ -1158,7 +1158,7 @@ program test ! CHECK: %[[O:.*]] = fir.load %[[ADDR_O]] : !fir.ref}>>>> ! CHECK: %[[FIELD_INNER:.*]] = fir.field_index inner, !fir.type<_QMpolymorphic_testTouter{inner:!fir.type<_QMpolymorphic_testTp1{a:i32,b:i32}>}> ! CHECK: %[[COORD_INNER:.*]] = fir.coordinate_of %[[O]], %[[FIELD_INNER]] : (!fir.box}>>>, !fir.field) -> !fir.ref> -! CHECK: %{{.*}} = fir.do_loop %{{.*}} = %{{.*}} to %{{.*}} step %{{.*}} unordered iter_args(%arg1 = %9) -> (!fir.array<5x!fir.logical<4>>) { +! CHECK: %{{.*}} = fir.do_loop %{{.*}} = %{{.*}} to %{{.*}} step %{{.*}} unordered iter_args(%arg1 = %{{.*}}) -> (!fir.array<5x!fir.logical<4>>) { ! CHECK: %[[EMBOXED:.*]] = fir.embox %[[COORD_INNER]] : (!fir.ref>) -> !fir.class> -! CHECK: %{{.*}} = fir.call @_QMpolymorphic_testPlt(%17, %[[EMBOXED]]) {{.*}} : (!fir.ref, !fir.class>) -> !fir.logical<4> +! CHECK: %{{.*}} = fir.call @_QMpolymorphic_testPlt(%{{.*}}, %[[EMBOXED]]) {{.*}} : (!fir.ref, !fir.class>) -> !fir.logical<4> ! 
CHECK: } diff --git a/flang/test/Lower/select-type-2.fir b/flang/test/Lower/select-type-2.fir index fbc1eb07bbd2e..87b42611752f9 100644 --- a/flang/test/Lower/select-type-2.fir +++ b/flang/test/Lower/select-type-2.fir @@ -63,7 +63,7 @@ // CHECK: %[[VAL_19:.*]] = fir.address_of(@_QQclX6661696C2074797065) : !fir.ref> // CHECK: %[[VAL_20:.*]] = fir.convert %[[VAL_19]] : (!fir.ref>) -> !fir.ref // CHECK: %[[VAL_21:.*]] = fir.convert %[[VAL_2]] : (index) -> i64 -// CHECK: %[[VAL_22:.*]] = fir.call @_FortranAStopStatementText(%[[VAL_20]], %[[VAL_21]], %[[VAL_1]], %[[VAL_1]]) fastmath : (!fir.ref, i64, i1, i1) -> none +// CHECK: fir.call @_FortranAStopStatementText(%[[VAL_20]], %[[VAL_21]], %[[VAL_1]], %[[VAL_1]]) fastmath : (!fir.ref, i64, i1, i1) -> () // CHECK: fir.unreachable // CHECK: ^bb4: // CHECK: cf.br ^bb3(%[[VAL_13]] : !fir.class>) @@ -96,7 +96,7 @@ func.func @_QPtest() { %9 = fir.address_of(@_QQclX6661696C2074797065) : !fir.ref> %10 = fir.convert %9 : (!fir.ref>) -> !fir.ref %11 = fir.convert %c9 : (index) -> i64 - %12 = fir.call @_FortranAStopStatementText(%10, %11, %false, %false) fastmath : (!fir.ref, i64, i1, i1) -> none + fir.call @_FortranAStopStatementText(%10, %11, %false, %false) fastmath : (!fir.ref, i64, i1, i1) -> () fir.unreachable ^bb3: // pred: ^bb1 %13 = fir.box_addr %6 : (!fir.class>) -> !fir.ptr diff --git a/flang/test/Lower/stop-statement.f90 b/flang/test/Lower/stop-statement.f90 index cf0665cf5dbd1..0cbb01dd8a742 100644 --- a/flang/test/Lower/stop-statement.f90 +++ b/flang/test/Lower/stop-statement.f90 @@ -75,5 +75,5 @@ subroutine stop_char_lit stop 'crash' end subroutine stop_char_lit -! CHECK-DAG: func private @_Fortran{{.*}}StopStatement(i32, i1, i1) -> none -! CHECK-DAG: func private @_Fortran{{.*}}StopStatementText(!fir.ref, i64, i1, i1) -> none +! CHECK-DAG: func private @_Fortran{{.*}}StopStatement(i32, i1, i1) +! 
CHECK-DAG: func private @_Fortran{{.*}}StopStatementText(!fir.ref, i64, i1, i1) diff --git a/flang/test/Lower/structure-constructors-alloc-comp.f90 b/flang/test/Lower/structure-constructors-alloc-comp.f90 index 5b1bca317c94f..8887ed4851045 100644 --- a/flang/test/Lower/structure-constructors-alloc-comp.f90 +++ b/flang/test/Lower/structure-constructors-alloc-comp.f90 @@ -31,7 +31,7 @@ subroutine test_alloc1(y) ! HLFIR: %[[CONS_6:.*]] = arith.constant {{.*}} : i32 ! HLFIR: %[[VAL_16:.*]] = fir.convert %[[VAL_14]] : (!fir.box>>}>>) -> !fir.box ! HLFIR: %[[VAL_17:.*]] = fir.convert %[[VAL_15]] : (!fir.ref>) -> !fir.ref -! HLFIR: %{{.*}} = fir.call @_FortranAInitialize(%[[VAL_16]], %[[VAL_17]], %[[CONS_6]]) fastmath : (!fir.box, !fir.ref, i32) -> none +! HLFIR: fir.call @_FortranAInitialize(%[[VAL_16]], %[[VAL_17]], %[[CONS_6]]) fastmath : (!fir.box, !fir.ref, i32) -> () ! HLFIR: %[[VAL_18:.*]] = hlfir.designate %[[VAL_13]]#0{"x"} : (!fir.ref>>}>>) -> !fir.ref ! HLFIR: %[[VAL_19:.*]] = fir.load %[[VAL_12]]#0 : !fir.ref ! HLFIR: hlfir.assign %[[VAL_19]] to %[[VAL_18]] temporary_lhs : f32, !fir.ref @@ -57,7 +57,7 @@ subroutine test_alloc2(y, b) ! HLFIR: %[[CONS_7:.*]] = arith.constant {{.*}} : i32 ! HLFIR: %[[VAL_18:.*]] = fir.convert %[[VAL_16]] : (!fir.box>>}>>) -> !fir.box ! HLFIR: %[[VAL_19:.*]] = fir.convert %[[VAL_17]] : (!fir.ref>) -> !fir.ref -! HLFIR: {{.*}} = fir.call @_FortranAInitialize(%[[VAL_18]], %[[VAL_19]], %[[CONS_7]]) fastmath : (!fir.box, !fir.ref, i32) -> none +! HLFIR: fir.call @_FortranAInitialize(%[[VAL_18]], %[[VAL_19]], %[[CONS_7]]) fastmath : (!fir.box, !fir.ref, i32) -> () ! HLFIR: %[[VAL_20:.*]] = hlfir.designate %[[VAL_15]]#0{"x"} : (!fir.ref>>}>>) -> !fir.ref ! HLFIR: %[[VAL_21:.*]] = fir.load %[[VAL_14]]#0 : !fir.ref ! HLFIR: hlfir.assign %[[VAL_21]] to %[[VAL_20]] temporary_lhs : f32, !fir.ref @@ -111,7 +111,7 @@ subroutine takes_ta_alloc_char(x) ! 
HLFIR: %[[VAL_1:.*]]:2 = hlfir.declare %[[VAL_0]] {uniq_name = "ctor.temp"} : (!fir.ref>>}>>) -> (!fir.ref>>}>>, !fir.ref>>}>>) ! HLFIR: %[[VAL_2:.*]] = fir.embox %[[VAL_1]]#0 : (!fir.ref>>}>>) -> !fir.box>>}>> ! HLFIR: %[[VAL_5:.*]] = fir.convert %[[VAL_2]] : (!fir.box>>}>>) -> !fir.box -! HLFIR: %[[VAL_7:.*]] = fir.call @_FortranAInitialize(%[[VAL_5]], +! HLFIR: fir.call @_FortranAInitialize(%[[VAL_5]], ! HLFIR: %[[VAL_8:.*]] = hlfir.designate %[[VAL_1]]#0{"a"} {fortran_attrs = #fir.var_attrs} : (!fir.ref>>}>>) -> !fir.ref>>> ! HLFIR: %[[VAL_9:.*]] = fir.address_of(@_QQclX68656C6C6F) : !fir.ref> ! HLFIR: %[[VAL_10:.*]] = arith.constant 5 : index @@ -134,7 +134,7 @@ subroutine takes_ta_alloc_char_cst_len(x) ! HLFIR: %[[VAL_1:.*]]:2 = hlfir.declare %[[VAL_0]] {uniq_name = "ctor.temp"} : (!fir.ref>>}>>) -> (!fir.ref>>}>>, !fir.ref>>}>>) ! HLFIR: %[[VAL_2:.*]] = fir.embox %[[VAL_1]]#0 : (!fir.ref>>}>>) -> !fir.box>>}>> ! HLFIR: %[[VAL_5:.*]] = fir.convert %[[VAL_2]] : (!fir.box>>}>>) -> !fir.box -! HLFIR: %[[VAL_7:.*]] = fir.call @_FortranAInitialize(%[[VAL_5]], +! HLFIR: fir.call @_FortranAInitialize(%[[VAL_5]], ! HLFIR: %[[VAL_8:.*]] = arith.constant 2 : index ! HLFIR: %[[VAL_9:.*]] = hlfir.designate %[[VAL_1]]#0{"a"} typeparams %[[VAL_8]] {fortran_attrs = #fir.var_attrs} : (!fir.ref>>}>>, index) -> !fir.ref>>> ! HLFIR: %[[VAL_10:.*]] = fir.address_of(@_QQclX68656C6C6F) : !fir.ref> diff --git a/flang/test/Lower/structure-constructors.f90 b/flang/test/Lower/structure-constructors.f90 index 14d8bfe04d1f0..86581ce51bf45 100644 --- a/flang/test/Lower/structure-constructors.f90 +++ b/flang/test/Lower/structure-constructors.f90 @@ -346,7 +346,7 @@ subroutine test_parent_component2() ! CHECK: fir.store %[[VAL_11]] to %[[VAL_1]] : !fir.ref>}>>> ! CHECK: %[[VAL_14:.*]] = fir.convert %[[VAL_1]] : (!fir.ref>}>>>) -> !fir.ref> ! CHECK: %[[VAL_15:.*]] = fir.convert %[[VAL_9]] : (!fir.box>}>>>) -> !fir.box -! 
CHECK: %[[VAL_17:.*]] = fir.call @_FortranAAssign(%[[VAL_14]], %[[VAL_15]], %{{.*}}, %{{.*}}) {{.*}}: (!fir.ref>, !fir.box, !fir.ref, i32) -> none +! CHECK: fir.call @_FortranAAssign(%[[VAL_14]], %[[VAL_15]], %{{.*}}, %{{.*}}) {{.*}}: (!fir.ref>, !fir.box, !fir.ref, i32) -> () ! CHECK: %[[VAL_18:.*]] = fir.field_index mask, !fir.type<_QFtest_parent_component3Tmid{m:!fir.array<2x!fir.char<1,5>>,mask:!fir.logical<4>}> ! CHECK: %[[VAL_19:.*]] = fir.coordinate_of %[[VAL_2]], %[[VAL_18]] : (!fir.ref>,mask:!fir.logical<4>}>>, !fir.field) -> !fir.ref> ! CHECK: %[[VAL_20:.*]] = arith.constant true diff --git a/flang/test/Lower/transformational-intrinsics.f90 b/flang/test/Lower/transformational-intrinsics.f90 index 3dfb689f18d81..5e10f0f510720 100644 --- a/flang/test/Lower/transformational-intrinsics.f90 +++ b/flang/test/Lower/transformational-intrinsics.f90 @@ -24,7 +24,7 @@ subroutine in_io(x) ! CHECK: %[[res_desc:.]] = fir.alloca !fir.box>>> ! CHECK-DAG: %[[res_arg:.*]] = fir.convert %[[res_desc]] : (!fir.ref>>>>) -> !fir.ref> ! CHECK-DAG: %[[x_arg:.*]] = fir.convert %[[arg0]] : (!fir.box>>) -> !fir.box - ! CHECK: fir.call @_Fortran{{.*}}AllDim(%[[res_arg]], %[[x_arg]], {{.*}}) {{.*}}: (!fir.ref>, !fir.box, i32, !fir.ref, i32) -> none + ! CHECK: fir.call @_Fortran{{.*}}AllDim(%[[res_arg]], %[[x_arg]], {{.*}}) {{.*}}: (!fir.ref>, !fir.box, i32, !fir.ref, i32) -> () ! CHECK: %[[res_desc_load:.*]] = fir.load %[[res_desc]] : !fir.ref>>>> ! CHECK-DAG: %[[dims:.*]]:3 = fir.box_dims %[[res_desc_load]], %c0{{.*}} : (!fir.box>>>, index) -> (index, index, index) ! CHECK-DAG: %[[res_addr:.*]] = fir.box_addr %[[res_desc_load]] : (!fir.box>>>) -> !fir.heap>> @@ -44,7 +44,7 @@ subroutine in_call(x) ! CHECK: %[[res_desc:.]] = fir.alloca !fir.box>>> ! CHECK-DAG: %[[res_arg:.*]] = fir.convert %[[res_desc]] : (!fir.ref>>>>) -> !fir.ref> ! CHECK-DAG: %[[x_arg:.*]] = fir.convert %[[arg0]] : (!fir.box>>) -> !fir.box - ! 
CHECK: fir.call @_Fortran{{.*}}AllDim(%[[res_arg]], %[[x_arg]], {{.*}}) {{.*}}: (!fir.ref>, !fir.box, i32, !fir.ref, i32) -> none + ! CHECK: fir.call @_Fortran{{.*}}AllDim(%[[res_arg]], %[[x_arg]], {{.*}}) {{.*}}: (!fir.ref>, !fir.box, i32, !fir.ref, i32) -> () ! CHECK: %[[res_desc_load:.*]] = fir.load %[[res_desc]] : !fir.ref>>>> ! CHECK-DAG: %[[dims:.*]]:3 = fir.box_dims %[[res_desc_load]], %c0{{.*}} : (!fir.box>>>, index) -> (index, index, index) ! CHECK-DAG: %[[res_addr:.*]] = fir.box_addr %[[res_desc_load]] : (!fir.box>>>) -> !fir.heap>> @@ -157,7 +157,7 @@ subroutine in_elem_expr(x, y, z) ! CHECK: %[[VAL_29:.*]] = fir.convert %[[VAL_19]] : (!fir.box>) -> !fir.box ! CHECK: %[[VAL_30:.*]] = fir.convert %[[VAL_25]] : (!fir.box>) -> !fir.box ! CHECK: %[[VAL_31:.*]] = fir.convert %[[VAL_26]] : (!fir.ref>) -> !fir.ref - ! CHECK: %[[VAL_32:.*]] = fir.call @_FortranACshift(%[[VAL_28]], %[[VAL_29]], %[[VAL_30]], %[[VAL_17]], %[[VAL_31]], %[[VAL_27]]) {{.*}}: (!fir.ref>, !fir.box, !fir.box, i32, !fir.ref, i32) -> none + ! CHECK: fir.call @_FortranACshift(%[[VAL_28]], %[[VAL_29]], %[[VAL_30]], %[[VAL_17]], %[[VAL_31]], %[[VAL_27]]) {{.*}}: (!fir.ref>, !fir.box, !fir.box, i32, !fir.ref, i32) -> () ! CHECK: %[[VAL_33:.*]] = fir.load %[[VAL_2]] : !fir.ref>>> ! CHECK: %[[VAL_34:.*]] = arith.constant 0 : index ! CHECK: %[[VAL_35:.*]]:3 = fir.box_dims %[[VAL_33]], %[[VAL_34]] : (!fir.box>>, index) -> (index, index, index) @@ -198,7 +198,7 @@ subroutine in_elem_expr(x, y, z) ! CHECK: %[[VAL_68:.*]] = fir.convert %[[VAL_59]] : (!fir.box>) -> !fir.box ! CHECK: %[[VAL_69:.*]] = fir.convert %[[VAL_64]] : (i32) -> i64 ! CHECK: %[[VAL_70:.*]] = fir.convert %[[VAL_65]] : (!fir.ref>) -> !fir.ref - ! CHECK: %[[VAL_71:.*]] = fir.call @_FortranACshiftVector(%[[VAL_67]], %[[VAL_68]], %[[VAL_69]], %[[VAL_70]], %[[VAL_66]]) {{.*}}: (!fir.ref>, !fir.box, i64, !fir.ref, i32) -> none + ! 
CHECK: fir.call @_FortranACshiftVector(%[[VAL_67]], %[[VAL_68]], %[[VAL_69]], %[[VAL_70]], %[[VAL_66]]) {{.*}}: (!fir.ref>, !fir.box, i64, !fir.ref, i32) -> () ! CHECK: %[[VAL_72:.*]] = fir.load %[[VAL_0]] : !fir.ref>>> ! CHECK: %[[VAL_73:.*]] = arith.constant 0 : index ! CHECK: %[[VAL_74:.*]]:3 = fir.box_dims %[[VAL_72]], %[[VAL_73]] : (!fir.box>>, index) -> (index, index, index) @@ -260,7 +260,7 @@ subroutine unpack_test() ! CHECK-DAG: %[[a20:.*]] = fir.convert %[[a10]] : (!fir.box>) -> !fir.box ! CHECK-DAG: %[[a21:.*]] = fir.convert %[[a12]] : (!fir.box>>) -> !fir.box ! CHECK-DAG: %[[a22:.*]] = fir.convert %[[a14]] : (!fir.box>) -> !fir.box - ! CHECK: fir.call @_FortranAUnpack(%[[a19]], %[[a20]], %[[a21]], %[[a22]], %{{.*}}, %{{.*}}) {{.*}}: (!fir.ref>, !fir.box, !fir.box, !fir.box, !fir.ref, i32) -> none + ! CHECK: fir.call @_FortranAUnpack(%[[a19]], %[[a20]], %[[a21]], %[[a22]], %{{.*}}, %{{.*}}) {{.*}}: (!fir.ref>, !fir.box, !fir.box, !fir.box, !fir.ref, i32) -> () ! CHECK-NEXT: %[[a22:.*]] = fir.load %{{.*}} : !fir.ref>>> ! CHECK: %[[a25:.*]] = fir.box_addr %[[a22]] : (!fir.box>>) -> !fir.heap> ! CHECK: fir.freemem %[[a25]] : !fir.heap> @@ -279,7 +279,7 @@ subroutine unpack_test() ! CHECK: %[[a49:.*]] = fir.convert %[[a41]] : (!fir.box>>) -> !fir.box ! CHECK: %[[a50:.*]] = fir.convert %[[a42]] : (!fir.box) -> !fir.box result = unpack(vector, mask, 343) - ! CHECK: fir.call @_FortranAUnpack(%[[a47]], %[[a48]], %[[a49]], %[[a50]], %{{.*}}, %{{.*}}) {{.*}}: (!fir.ref>, !fir.box, !fir.box, !fir.box, !fir.ref, i32) -> none + ! CHECK: fir.call @_FortranAUnpack(%[[a47]], %[[a48]], %[[a49]], %[[a50]], %{{.*}}, %{{.*}}) {{.*}}: (!fir.ref>, !fir.box, !fir.box, !fir.box, !fir.ref, i32) -> () ! CHECK: %[[a53:.*]] = fir.load %[[a0]] : !fir.ref>>> ! CHECK: %[[a56:.*]] = fir.box_addr %[[a53]] : (!fir.box>>) -> !fir.heap> ! 
CHECK: fir.freemem %[[a56]] : !fir.heap> diff --git a/flang/test/Lower/vector-subscript-io.f90 b/flang/test/Lower/vector-subscript-io.f90 index 9a041af16c88c..0f64e99e03a20 100644 --- a/flang/test/Lower/vector-subscript-io.f90 +++ b/flang/test/Lower/vector-subscript-io.f90 @@ -489,7 +489,7 @@ subroutine simple_iostat(x, y, j, stat) ! CHECK: %[[VAL_341:.*]] = fir.address_of(@_QQ{{.*}}) : !fir.ref> ! CHECK: %[[VAL_342:.*]] = fir.convert %[[VAL_341]] : (!fir.ref>) -> !fir.ref ! CHECK: %[[VAL_343:.*]] = fir.call @_FortranAioBeginExternalListInput(%[[VAL_334]], %[[VAL_342]], %{{.*}}) {{.*}}: (i32, !fir.ref, i32) -> !fir.ref -! CHECK: %[[VAL_344:.*]] = fir.call @_FortranAioEnableHandlers(%[[VAL_343]], %[[VAL_337]], %[[VAL_336]], %[[VAL_336]], %[[VAL_336]], %[[VAL_336]]) {{.*}}: (!fir.ref, i1, i1, i1, i1, i1) -> none +! CHECK: fir.call @_FortranAioEnableHandlers(%[[VAL_343]], %[[VAL_337]], %[[VAL_336]], %[[VAL_336]], %[[VAL_336]], %[[VAL_336]]) {{.*}}: (!fir.ref, i1, i1, i1, i1, i1) -> () ! CHECK: %[[VAL_345:.*]]:3 = fir.box_dims %[[VAL_346]], %[[VAL_339]] : (!fir.box>, index) -> (index, index, index) ! CHECK: %[[VAL_347:.*]] = fir.slice %[[VAL_338]], %[[VAL_345]]#1, %[[VAL_338]] : (index, index, index) -> !fir.slice<1> ! CHECK: %[[VAL_348:.*]] = arith.subi %[[VAL_345]]#1, %[[VAL_338]] : index @@ -538,7 +538,7 @@ subroutine iostat_in_io_loop(k, j, stat) ! CHECK: %[[VAL_376:.*]] = fir.address_of(@_QQ{{.*}}) : !fir.ref> ! CHECK: %[[VAL_377:.*]] = fir.convert %[[VAL_376]] : (!fir.ref>) -> !fir.ref ! CHECK: %[[VAL_378:.*]] = fir.call @_FortranAioBeginExternalListInput(%[[VAL_366]], %[[VAL_377]], %{{.*}}) {{.*}}: (i32, !fir.ref, i32) -> !fir.ref -! CHECK: %[[VAL_379:.*]] = fir.call @_FortranAioEnableHandlers(%[[VAL_378]], %[[VAL_369]], %[[VAL_370]], %[[VAL_370]], %[[VAL_370]], %[[VAL_370]]) {{.*}}: (!fir.ref, i1, i1, i1, i1, i1) -> none +! 
CHECK: fir.call @_FortranAioEnableHandlers(%[[VAL_378]], %[[VAL_369]], %[[VAL_370]], %[[VAL_370]], %[[VAL_370]], %[[VAL_370]]) {{.*}}: (!fir.ref, i1, i1, i1, i1, i1) -> () ! CHECK: cf.br ^bb1(%[[VAL_371]], %[[VAL_369]] : index, i1) ! CHECK: ^bb1(%[[VAL_380:.*]]: index, %[[VAL_381:.*]]: i1): ! CHECK: %[[VAL_382:.*]] = arith.cmpi sle, %[[VAL_380]], %[[VAL_368]] : index diff --git a/flang/test/Preprocessing/bug129131.F b/flang/test/Preprocessing/bug129131.F index 00aba5da2c7cb..43bbfdc232f92 100644 --- a/flang/test/Preprocessing/bug129131.F +++ b/flang/test/Preprocessing/bug129131.F @@ -1,4 +1,4 @@ -! RUN: %flang -fc1 -fdebug-unparse %s 2>&1 | FileCheck %s +! RUN: %flang_fc1 -fdebug-unparse %s 2>&1 | FileCheck %s ! CHECK: PRINT *, 2_4 ! CHECK: PRINT *, 1_4 #define a ,3 diff --git a/flang/test/Transforms/omp-reduction-cfg-conversion.fir b/flang/test/Transforms/omp-reduction-cfg-conversion.fir index 707e665132afb..cbe599f0da5d2 100644 --- a/flang/test/Transforms/omp-reduction-cfg-conversion.fir +++ b/flang/test/Transforms/omp-reduction-cfg-conversion.fir @@ -18,7 +18,7 @@ omp.declare_reduction @add_reduction_i_32_box_3_byref : !fir.ref>>) -> !fir.ref> %9 = fir.convert %6 : (!fir.box) -> !fir.box %10 = fir.convert %7 : (!fir.ref>) -> !fir.ref - %11 = fir.call @_FortranAAssign(%8, %9, %10, %c4_i32) : (!fir.ref>, !fir.box, !fir.ref, i32) -> none + fir.call @_FortranAAssign(%8, %9, %10, %c4_i32) : (!fir.ref>, !fir.box, !fir.ref, i32) -> () %12 = fir.alloca !fir.box> fir.store %4 to %12 : !fir.ref>> omp.yield(%12 : !fir.ref>>) diff --git a/flang/test/Transforms/simplifyintrinsics.fir b/flang/test/Transforms/simplifyintrinsics.fir index a3930566035b3..b2af152e5a913 100644 --- a/flang/test/Transforms/simplifyintrinsics.fir +++ b/flang/test/Transforms/simplifyintrinsics.fir @@ -262,13 +262,13 @@ module attributes {fir.defaultkind = "a1c4d8i4l4r4", fir.kindmap = "", llvm.targ %8 = fir.convert %5 : (!fir.ref>) -> !fir.ref %9 = fir.convert %c0 : (index) -> i32 %10 = fir.convert %4 
: (!fir.box) -> !fir.box - %11 = fir.call @_FortranACppSumComplex4(%6, %7, %8, %c5_i32, %9, %10) : (!fir.ref>, !fir.box, !fir.ref, i32, i32, !fir.box) -> none + fir.call @_FortranACppSumComplex4(%6, %7, %8, %c5_i32, %9, %10) : (!fir.ref>, !fir.box, !fir.ref, i32, i32, !fir.box) -> () %12 = fir.load %0 : !fir.ref> fir.store %12 to %1 : !fir.ref> %13 = fir.load %1 : !fir.ref> return %13 : complex } - func.func private @_FortranACppSumComplex4(!fir.ref>, !fir.box, !fir.ref, i32, i32, !fir.box) -> none attributes {fir.runtime} + func.func private @_FortranACppSumComplex4(!fir.ref>, !fir.box, !fir.ref, i32, i32, !fir.box) -> () attributes {fir.runtime} fir.global linkonce @_QQclX2E2F6973756D5F362E66393000 constant : !fir.char<1,13> { %0 = fir.string_lit "./isum_6.f90\00"(13) : !fir.char<1,13> fir.has_value %0 : !fir.char<1,13> @@ -277,7 +277,7 @@ module attributes {fir.defaultkind = "a1c4d8i4l4r4", fir.kindmap = "", llvm.targ // CHECK-LABEL: func.func @sum_1d_complex(%{{.*}}: !fir.ref>> {fir.bindc_name = "a"}) -> complex { // CHECK-NOT: fir.call @_FortranACppSumComplex4x1_simplified({{.*}}) -// CHECK: fir.call @_FortranACppSumComplex4({{.*}}) : (!fir.ref>, !fir.box, !fir.ref, i32, i32, !fir.box) -> none +// CHECK: fir.call @_FortranACppSumComplex4({{.*}}) : (!fir.ref>, !fir.box, !fir.ref, i32, i32, !fir.box) -> () // CHECK-NOT: fir.call @_FortranACppSumComplex4x1_simplified({{.*}}) // ----- @@ -422,7 +422,7 @@ module attributes {fir.defaultkind = "a1c4d8i4l4r4", fir.kindmap = "", llvm.targ %file = fir.address_of(@filename) : !fir.ref> %file_ref = fir.convert %file : (!fir.ref>) -> !fir.ref %absent_none = fir.convert %absent : (!fir.box) -> !fir.box - %res = fir.call @_FortranASumDim(%box_none, %box_none2, %c1_i32, %file_ref, %lineno, %absent_none) : (!fir.ref>, !fir.box, i32, !fir.ref, i32, !fir.box) -> none + fir.call @_FortranASumDim(%box_none, %box_none2, %c1_i32, %file_ref, %lineno, %absent_none) : (!fir.ref>, !fir.box, i32, !fir.ref, i32, !fir.box) -> () 
func.return } } @@ -1236,7 +1236,7 @@ func.func @_QMtestPcount_generate_mask(%arg0: !fir.ref>>) -> !fir.box %12 = fir.convert %c4 : (index) -> i32 %13 = fir.convert %9 : (!fir.ref>) -> !fir.ref - %14 = fir.call @_FortranACountDim(%10, %11, %c2_i32, %12, %13, %c11_i32) fastmath : (!fir.ref>, !fir.box, i32, i32, !fir.ref, i32) -> none + fir.call @_FortranACountDim(%10, %11, %c2_i32, %12, %13, %c11_i32) fastmath : (!fir.ref>, !fir.box, i32, i32, !fir.ref, i32) -> () %15 = fir.load %0 : !fir.ref>>> %c0_2 = arith.constant 0 : index %16:3 = fir.box_dims %15, %c0_2 : (!fir.box>>, index) -> (index, index, index) @@ -1256,12 +1256,12 @@ func.func @_QMtestPcount_generate_mask(%arg0: !fir.ref> return %22 : !fir.array<10xi32> } -func.func private @_FortranACountDim(!fir.ref>, !fir.box, i32, i32, !fir.ref, i32) -> none attributes {fir.runtime} +func.func private @_FortranACountDim(!fir.ref>, !fir.box, i32, i32, !fir.ref, i32) -> () attributes {fir.runtime} // CHECK-LABEL: func.func @_QMtestPcount_generate_mask( // CHECK-SAME: %[[A:.*]]: !fir.ref>> {fir.bindc_name = "mask"}) -> !fir.array<10xi32> { // CHECK-NOT fir.call @_FortranACountDimLogical4_simplified({{.*}}) -// CHECK: %[[RES:.*]] = fir.call @_FortranACountDim({{.*}}) fastmath : (!fir.ref>, !fir.box, i32, i32, !fir.ref, i32) -> none +// CHECK: fir.call @_FortranACountDim({{.*}}) fastmath : (!fir.ref>, !fir.box, i32, i32, !fir.ref, i32) -> () // CHECK-NOT fir.call @_FortranACountDimLogical4_simplified({{.*}}) // ----- @@ -1419,7 +1419,7 @@ func.func @_QPtestAny_DimArg(%arg0: !fir.ref>> %10 = fir.convert %0 : (!fir.ref>>>>) -> !fir.ref> %11 = fir.convert %5 : (!fir.box>>) -> !fir.box %12 = fir.convert %9 : (!fir.ref>) -> !fir.ref - %13 = fir.call @_FortranAAnyDim(%10, %11, %c2_i32, %12, %c3_i32) fastmath : (!fir.ref>, !fir.box, i32, !fir.ref, i32) -> none + fir.call @_FortranAAnyDim(%10, %11, %c2_i32, %12, %c3_i32) fastmath : (!fir.ref>, !fir.box, i32, !fir.ref, i32) -> () %14 = fir.load %0 : !fir.ref>>>> %c0_2 = 
arith.constant 0 : index %15:3 = fir.box_dims %14, %c0_2 : (!fir.box>>>, index) -> (index, index, index) @@ -1439,12 +1439,12 @@ func.func @_QPtestAny_DimArg(%arg0: !fir.ref>> %21 = fir.load %1 : !fir.ref>> return %21 : !fir.array<10x!fir.logical<4>> } -func.func private @_FortranAAnyDim(!fir.ref>, !fir.box, i32, !fir.ref, i32) -> none attributes {fir.runtime} +func.func private @_FortranAAnyDim(!fir.ref>, !fir.box, i32, !fir.ref, i32) -> () attributes {fir.runtime} // CHECK-LABEL: func.func @_QPtestAny_DimArg( // CHECK-SAME: %[[ARR:.*]]: !fir.ref>> {fir.bindc_name = "a"}) -> !fir.array<10x!fir.logical<4>> { // CHECK-NOT fir.call @_FortranAAnyDimLogical4x1_simplified({{.*}}) -// CHECK: fir.call @_FortranAAnyDim({{.*}}) fastmath : (!fir.ref>, !fir.box, i32, !fir.ref, i32) -> none +// CHECK: fir.call @_FortranAAnyDim({{.*}}) fastmath : (!fir.ref>, !fir.box, i32, !fir.ref, i32) -> () // CHECK-NOT fir.call @_FortranAAnyDimLogical4x1_simplified({{.*}}) // ----- @@ -1658,7 +1658,7 @@ func.func @_QPtestAll_DimArg(%arg0: !fir.ref>> %10 = fir.convert %0 : (!fir.ref>>>>) -> !fir.ref> %11 = fir.convert %5 : (!fir.box>>) -> !fir.box %12 = fir.convert %9 : (!fir.ref>) -> !fir.ref - %13 = fir.call @_FortranAAllDim(%10, %11, %c1_i32, %12, %c3_i32) fastmath : (!fir.ref>, !fir.box, i32, !fir.ref, i32) -> none + fir.call @_FortranAAllDim(%10, %11, %c1_i32, %12, %c3_i32) fastmath : (!fir.ref>, !fir.box, i32, !fir.ref, i32) -> () %14 = fir.load %0 : !fir.ref>>>> %c0_2 = arith.constant 0 : index %15:3 = fir.box_dims %14, %c0_2 : (!fir.box>>>, index) -> (index, index, index) @@ -1678,12 +1678,12 @@ func.func @_QPtestAll_DimArg(%arg0: !fir.ref>> %21 = fir.load %1 : !fir.ref>> return %21 : !fir.array<10x!fir.logical<4>> } -func.func private @_FortranAAllDim(!fir.ref>, !fir.box, i32, !fir.ref, i32) -> none attributes {fir.runtime} +func.func private @_FortranAAllDim(!fir.ref>, !fir.box, i32, !fir.ref, i32) -> () attributes {fir.runtime} // CHECK-LABEL: func.func @_QPtestAll_DimArg( // 
CHECK-SAME: %[[ARR:.*]]: !fir.ref>> {fir.bindc_name = "a"}) -> !fir.array<10x!fir.logical<4>> { // CHECK-NOT fir.call @_FortranAAllDimLogical4x1_simplified({{.*}}) -// CHECK: fir.call @_FortranAAllDim({{.*}}) fastmath : (!fir.ref>, !fir.box, i32, !fir.ref, i32) -> none +// CHECK: fir.call @_FortranAAllDim({{.*}}) fastmath : (!fir.ref>, !fir.box, i32, !fir.ref, i32) -> () // CHECK-NOT fir.call @_FortranAAllDimLogical4x1_simplified({{.*}}) // ----- @@ -1714,7 +1714,7 @@ func.func @_QPtestminloc_works1d(%arg0: !fir.ref> {fir.bindc_ %14 = fir.convert %c4 : (index) -> i32 %15 = fir.convert %11 : (!fir.ref>) -> !fir.ref %16 = fir.convert %7 : (!fir.box>>) -> !fir.box - %17 = fir.call @_FortranAMinlocInteger4(%12, %13, %14, %15, %c5_i32, %16, %false) fastmath : (!fir.ref>, !fir.box, i32, !fir.ref, i32, !fir.box, i1) -> none + fir.call @_FortranAMinlocInteger4(%12, %13, %14, %15, %c5_i32, %16, %false) fastmath : (!fir.ref>, !fir.box, i32, !fir.ref, i32, !fir.box, i1) -> () %18 = fir.load %0 : !fir.ref>>> %c0_1 = arith.constant 0 : index %19:3 = fir.box_dims %18, %c0_1 : (!fir.box>>, index) -> (index, index, index) @@ -1837,7 +1837,7 @@ func.func @_QPtestminloc_works2d_nomask(%arg0: !fir.ref> { %12 = fir.convert %5 : (!fir.box>) -> !fir.box %13 = fir.convert %10 : (!fir.ref>) -> !fir.ref %14 = fir.convert %6 : (!fir.box) -> !fir.box - %15 = fir.call @_FortranAMinlocInteger4(%11, %12, %c8_i32, %13, %c4_i32, %14, %false) fastmath : (!fir.ref>, !fir.box, i32, !fir.ref, i32, !fir.box, i1) -> none + fir.call @_FortranAMinlocInteger4(%11, %12, %c8_i32, %13, %c4_i32, %14, %false) fastmath : (!fir.ref>, !fir.box, i32, !fir.ref, i32, !fir.box, i1) -> () %16 = fir.load %0 : !fir.ref>>> %c0_1 = arith.constant 0 : index %17:3 = fir.box_dims %16, %c0_1 : (!fir.box>>, index) -> (index, index, index) @@ -1957,7 +1957,7 @@ func.func @_QPtestminloc_works1d_scalarmask_f64(%arg0: !fir.ref i32 %14 = fir.convert %10 : (!fir.ref>) -> !fir.ref %15 = fir.convert %6 : (!fir.box>) -> !fir.box - %16 
= fir.call @_FortranAMinlocReal8(%11, %12, %13, %14, %c6_i32, %15, %false) fastmath : (!fir.ref>, !fir.box, i32, !fir.ref, i32, !fir.box, i1) -> none + fir.call @_FortranAMinlocReal8(%11, %12, %13, %14, %c6_i32, %15, %false) fastmath : (!fir.ref>, !fir.box, i32, !fir.ref, i32, !fir.box, i1) -> () %17 = fir.load %0 : !fir.ref>>> %c0_0 = arith.constant 0 : index %18:3 = fir.box_dims %17, %c0_0 : (!fir.box>>, index) -> (index, index, index) @@ -2070,7 +2070,7 @@ func.func @_QPtestminloc_doesntwork1d_back(%arg0: !fir.ref> { %13 = fir.convert %c4 : (index) -> i32 %14 = fir.convert %10 : (!fir.ref>) -> !fir.ref %15 = fir.convert %6 : (!fir.box) -> !fir.box - %16 = fir.call @_FortranAMinlocInteger4(%11, %12, %13, %14, %c4_i32, %15, %true) fastmath : (!fir.ref>, !fir.box, i32, !fir.ref, i32, !fir.box, i1) -> none + fir.call @_FortranAMinlocInteger4(%11, %12, %13, %14, %c4_i32, %15, %true) fastmath : (!fir.ref>, !fir.box, i32, !fir.ref, i32, !fir.box, i1) -> () %17 = fir.load %0 : !fir.ref>>> %c0_0 = arith.constant 0 : index %18:3 = fir.box_dims %17, %c0_0 : (!fir.box>>, index) -> (index, index, index) @@ -2094,7 +2094,7 @@ func.func @_QPtestminloc_doesntwork1d_back(%arg0: !fir.ref> { // CHECK-LABEL: func.func @_QPtestminloc_doesntwork1d_back( // CHECK-SAME: %[[ARR:.*]]: !fir.ref> {fir.bindc_name = "a"}) -> !fir.array<1xi32> { // CHECK-NOT: fir.call @_FortranAMinlocInteger4x1_i32_contract_simplified({{.*}}) fastmath : (!fir.ref>, !fir.box, !fir.box) -> () -// CHECK: fir.call @_FortranAMinlocInteger4({{.*}}) fastmath : (!fir.ref>, !fir.box, i32, !fir.ref, i32, !fir.box, i1) -> none +// CHECK: fir.call @_FortranAMinlocInteger4({{.*}}) fastmath : (!fir.ref>, !fir.box, i32, !fir.ref, i32, !fir.box, i1) -> () // CHECK-NOT: fir.call @_FortranAMinlocInteger4x1_i32_contract_simplified({{.*}}) fastmath : (!fir.ref>, !fir.box, !fir.box) -> () // ----- @@ -2123,7 +2123,7 @@ func.func @_QPtestminloc_1d_dim(%arg0: !fir.ref> {fir.bindc_n %12 = fir.convert %c4 : (index) -> i32 %13 = 
fir.convert %9 : (!fir.ref>) -> !fir.ref %14 = fir.convert %6 : (!fir.box) -> !fir.box - %15 = fir.call @_FortranAMinlocDim(%10, %11, %12, %c1_i32, %13, %c4_i32, %14, %false) fastmath : (!fir.ref>, !fir.box, i32, i32, !fir.ref, i32, !fir.box, i1) -> none + fir.call @_FortranAMinlocDim(%10, %11, %12, %c1_i32, %13, %c4_i32, %14, %false) fastmath : (!fir.ref>, !fir.box, i32, i32, !fir.ref, i32, !fir.box, i1) -> () %16 = fir.load %0 : !fir.ref>> %17 = fir.box_addr %16 : (!fir.box>) -> !fir.heap %18 = fir.load %17 : !fir.heap @@ -2220,7 +2220,7 @@ func.func @_QPtestminloc_doesntwork1d_unknownsize(%arg0: !fir.box i32 %12 = fir.convert %8 : (!fir.ref>) -> !fir.ref %13 = fir.convert %4 : (!fir.box) -> !fir.box - %14 = fir.call @_FortranAMinlocInteger4(%9, %10, %11, %12, %c4_i32, %13, %false) fastmath : (!fir.ref>, !fir.box, i32, !fir.ref, i32, !fir.box, i1) -> none + fir.call @_FortranAMinlocInteger4(%9, %10, %11, %12, %c4_i32, %13, %false) fastmath : (!fir.ref>, !fir.box, i32, !fir.ref, i32, !fir.box, i1) -> () %15 = fir.load %0 : !fir.ref>>> %c0_0 = arith.constant 0 : index %16:3 = fir.box_dims %15, %c0_0 : (!fir.box>>, index) -> (index, index, index) @@ -2243,7 +2243,7 @@ func.func @_QPtestminloc_doesntwork1d_unknownsize(%arg0: !fir.box> {fir.bindc_name = "a"}) -> !fir.array<1xi32> { // CHECK-NOT: fir.call @_FortranAMinlocInteger4x1_i32_contract_simplified({{.*}}) fastmath : (!fir.ref>, !fir.box, !fir.box) -> () -// CHECK: fir.call @_FortranAMinlocInteger4({{.*}}) fastmath : (!fir.ref>, !fir.box, i32, !fir.ref, i32, !fir.box, i1) -> none +// CHECK: fir.call @_FortranAMinlocInteger4({{.*}}) fastmath : (!fir.ref>, !fir.box, i32, !fir.ref, i32, !fir.box, i1) -> () // CHECK-NOT: fir.call @_FortranAMinlocInteger4x1_i32_contract_simplified({{.*}}) fastmath : (!fir.ref>, !fir.box, !fir.box) -> () // ----- @@ -2275,7 +2275,7 @@ func.func @_QPtestminloc_doesntwork1d_chars(%arg0: !fir.boxchar<1> {fir.bindc_na %15 = fir.convert %c4 : (index) -> i32 %16 = fir.convert %12 : 
(!fir.ref>) -> !fir.ref %17 = fir.convert %8 : (!fir.box) -> !fir.box - %18 = fir.call @_FortranAMinlocCharacter(%13, %14, %15, %16, %c4_i32, %17, %false) fastmath : (!fir.ref>, !fir.box, i32, !fir.ref, i32, !fir.box, i1) -> none + fir.call @_FortranAMinlocCharacter(%13, %14, %15, %16, %c4_i32, %17, %false) fastmath : (!fir.ref>, !fir.box, i32, !fir.ref, i32, !fir.box, i1) -> () %19 = fir.load %0 : !fir.ref>>> %c0_0 = arith.constant 0 : index %20:3 = fir.box_dims %19, %c0_0 : (!fir.box>>, index) -> (index, index, index) @@ -2299,7 +2299,7 @@ func.func @_QPtestminloc_doesntwork1d_chars(%arg0: !fir.boxchar<1> {fir.bindc_na // CHECK-LABEL: func.func @_QPtestminloc_doesntwork1d_chars( // CHECK-SAME: %[[ARR:.*]]: !fir.boxchar<1> {fir.bindc_name = "a"}) -> !fir.array<1xi32> { // CHECK-NOT: fir.call @_FortranAMinlocCharacterx1_i32_contract_simplified({{.*}}) fastmath : (!fir.ref>, !fir.box, !fir.box) -> () -// CHECK: fir.call @_FortranAMinlocCharacter({{.*}}) fastmath : (!fir.ref>, !fir.box, i32, !fir.ref, i32, !fir.box, i1) -> none +// CHECK: fir.call @_FortranAMinlocCharacter({{.*}}) fastmath : (!fir.ref>, !fir.box, i32, !fir.ref, i32, !fir.box, i1) -> () // CHECK-NOT: fir.call @_FortranAMinlocCharacterx1_i32_contract_simplified({{.*}}) fastmath : (!fir.ref>, !fir.box, !fir.box) -> () // ----- @@ -2356,7 +2356,7 @@ func.func @_QPtestminloc_doesntwork1d_unknownmask(%arg0: !fir.ref i32 %34 = fir.convert %30 : (!fir.ref>) -> !fir.ref %35 = fir.convert %26 : (!fir.box>>) -> !fir.box - %36 = fir.call @_FortranAMinlocInteger4(%31, %32, %33, %34, %c7_i32, %35, %false) fastmath : (!fir.ref>, !fir.box, i32, !fir.ref, i32, !fir.box, i1) -> none + fir.call @_FortranAMinlocInteger4(%31, %32, %33, %34, %c7_i32, %35, %false) fastmath : (!fir.ref>, !fir.box, i32, !fir.ref, i32, !fir.box, i1) -> () %37 = fir.load %0 : !fir.ref>>> %c0_2 = arith.constant 0 : index %38:3 = fir.box_dims %37, %c0_2 : (!fir.box>>, index) -> (index, index, index) @@ -2380,7 +2380,7 @@ func.func 
@_QPtestminloc_doesntwork1d_unknownmask(%arg0: !fir.ref> {fir.bindc_name = "a"}) -> !fir.array<1xi32> { // CHECK-NOT: fir.call @_FortranAMinlocInteger4x1_i32_contract_simplified({{.*}}) fastmath : (!fir.ref>, !fir.box, !fir.box) -> () -// CHECK: fir.call @_FortranAMinlocInteger4({{.*}}) fastmath : (!fir.ref>, !fir.box, i32, !fir.ref, i32, !fir.box, i1) -> none +// CHECK: fir.call @_FortranAMinlocInteger4({{.*}}) fastmath : (!fir.ref>, !fir.box, i32, !fir.ref, i32, !fir.box, i1) -> () // CHECK-NOT: fir.call @_FortranAMinlocInteger4x1_i32_contract_simplified({{.*}}) fastmath : (!fir.ref>, !fir.box, !fir.box) -> () // ----- @@ -2411,7 +2411,7 @@ func.func @_QPtestmaxloc_works1d(%arg0: !fir.ref> {fir.bindc_ %14 = fir.convert %c4 : (index) -> i32 %15 = fir.convert %11 : (!fir.ref>) -> !fir.ref %16 = fir.convert %7 : (!fir.box>>) -> !fir.box - %17 = fir.call @_FortranAMaxlocInteger4(%12, %13, %14, %15, %c5_i32, %16, %false) fastmath : (!fir.ref>, !fir.box, i32, !fir.ref, i32, !fir.box, i1) -> none + fir.call @_FortranAMaxlocInteger4(%12, %13, %14, %15, %c5_i32, %16, %false) fastmath : (!fir.ref>, !fir.box, i32, !fir.ref, i32, !fir.box, i1) -> () %18 = fir.load %0 : !fir.ref>>> %c0_1 = arith.constant 0 : index %19:3 = fir.box_dims %18, %c0_1 : (!fir.box>>, index) -> (index, index, index) @@ -2534,7 +2534,7 @@ func.func @_QPtestmaxloc_works1d_scalarmask_f64(%arg0: !fir.ref i32 %14 = fir.convert %10 : (!fir.ref>) -> !fir.ref %15 = fir.convert %6 : (!fir.box>) -> !fir.box - %16 = fir.call @_FortranAMaxlocReal8(%11, %12, %13, %14, %c6_i32, %15, %false) fastmath : (!fir.ref>, !fir.box, i32, !fir.ref, i32, !fir.box, i1) -> none + fir.call @_FortranAMaxlocReal8(%11, %12, %13, %14, %c6_i32, %15, %false) fastmath : (!fir.ref>, !fir.box, i32, !fir.ref, i32, !fir.box, i1) -> () %17 = fir.load %0 : !fir.ref>>> %c0_0 = arith.constant 0 : index %18:3 = fir.box_dims %17, %c0_0 : (!fir.box>>, index) -> (index, index, index) diff --git a/flang/test/Transforms/stack-arrays.fir 
b/flang/test/Transforms/stack-arrays.fir index a784cea9bc3a4..4a417ed981ab1 100644 --- a/flang/test/Transforms/stack-arrays.fir +++ b/flang/test/Transforms/stack-arrays.fir @@ -356,14 +356,14 @@ func.func @stop_terminator() { fir.freemem %0 : !fir.heap> %c0_i32 = arith.constant 0 : i32 %false = arith.constant false - %none = fir.call @_FortranAStopStatement(%c0_i32, %false, %false) : (i32, i1, i1) -> none + fir.call @_FortranAStopStatement(%c0_i32, %false, %false) : (i32, i1, i1) -> () fir.unreachable } // CHECK: func.func @stop_terminator() { // CHECK-NEXT: fir.alloca !fir.array<42xi32> // CHECK-NEXT: %[[ZERO:.*]] = arith.constant 0 : i32 // CHECK-NEXT: %[[FALSE:.*]] = arith.constant false -// CHECK-NEXT: %[[NONE:.*]] = fir.call @_FortranAStopStatement(%[[ZERO]], %[[FALSE]], %[[FALSE]]) : (i32, i1, i1) -> none +// CHECK-NEXT: fir.call @_FortranAStopStatement(%[[ZERO]], %[[FALSE]], %[[FALSE]]) : (i32, i1, i1) -> () // CHECK-NEXT: fir.unreachable // CHECK-NEXT: } diff --git a/flang/test/lib/Analysis/AliasAnalysis/CMakeLists.txt b/flang/test/lib/Analysis/AliasAnalysis/CMakeLists.txt index c4b3838c9a23e..cba47a4114517 100644 --- a/flang/test/lib/Analysis/AliasAnalysis/CMakeLists.txt +++ b/flang/test/lib/Analysis/AliasAnalysis/CMakeLists.txt @@ -16,14 +16,16 @@ add_flang_library(FIRTestAnalysis FIRSupport FIRTransforms FIRAnalysis + MLIRTestAnalysis + + MLIR_LIBS ${dialect_libs} MLIRFuncDialect MLIRLLVMDialect MLIRAnalysis - MLIRTestAnalysis ) target_include_directories(FIRTestAnalysis PRIVATE ${MLIR_MAIN_SRC_DIR}/.. 
- ) \ No newline at end of file + ) diff --git a/flang/test/lib/OpenACC/CMakeLists.txt b/flang/test/lib/OpenACC/CMakeLists.txt index 8aa3c7689af4e..e296827ef53be 100644 --- a/flang/test/lib/OpenACC/CMakeLists.txt +++ b/flang/test/lib/OpenACC/CMakeLists.txt @@ -14,6 +14,8 @@ add_flang_library(FIRTestOpenACCInterfaces FIRDialect FIROpenACCSupport FIRSupport + + MLIR_LIBS MLIRIR MLIROpenACCDialect MLIRPass diff --git a/flang/tools/bbc/CMakeLists.txt b/flang/tools/bbc/CMakeLists.txt index 3d92abdc60114..85aeb85e0c530 100644 --- a/flang/tools/bbc/CMakeLists.txt +++ b/flang/tools/bbc/CMakeLists.txt @@ -29,6 +29,9 @@ target_link_libraries(bbc PRIVATE flangFrontend flangPasses FlangOpenMPTransforms +) + +mlir_target_link_libraries(bbc PRIVATE ${dialect_libs} ${extension_libs} MLIRAffineToStandard diff --git a/flang/tools/fir-lsp-server/CMakeLists.txt b/flang/tools/fir-lsp-server/CMakeLists.txt index d5445d8f8e99b..6f095e24524b7 100644 --- a/flang/tools/fir-lsp-server/CMakeLists.txt +++ b/flang/tools/fir-lsp-server/CMakeLists.txt @@ -12,7 +12,9 @@ target_link_libraries(fir-lsp-server PRIVATE CUFDialect FIRDialect FIROpenACCSupport - HLFIRDialect + HLFIRDialect) + +mlir_target_link_libraries(fir-lsp-server PRIVATE MLIRLspServerLib ${dialect_libs} ${extension_libs}) diff --git a/flang/tools/fir-opt/CMakeLists.txt b/flang/tools/fir-opt/CMakeLists.txt index f0741ca282169..efbde329b8b8c 100644 --- a/flang/tools/fir-opt/CMakeLists.txt +++ b/flang/tools/fir-opt/CMakeLists.txt @@ -24,6 +24,9 @@ target_link_libraries(fir-opt PRIVATE FlangOpenMPTransforms FIRAnalysis ${test_libs} +) + +mlir_target_link_libraries(fir-opt PRIVATE ${dialect_libs} ${extension_libs} diff --git a/flang/tools/tco/CMakeLists.txt b/flang/tools/tco/CMakeLists.txt index 0ac18734be2ce..220f908d2f108 100644 --- a/flang/tools/tco/CMakeLists.txt +++ b/flang/tools/tco/CMakeLists.txt @@ -21,6 +21,9 @@ target_link_libraries(tco PRIVATE FIROpenACCSupport FlangOpenMPTransforms FortranCommon +) + 
+mlir_target_link_libraries(tco PRIVATE ${dialect_libs} ${extension_libs} MLIRIR diff --git a/flang/unittests/Frontend/CMakeLists.txt b/flang/unittests/Frontend/CMakeLists.txt index 22c568af3d121..9177997f41f53 100644 --- a/flang/unittests/Frontend/CMakeLists.txt +++ b/flang/unittests/Frontend/CMakeLists.txt @@ -20,5 +20,9 @@ target_link_libraries(FlangFrontendTests FortranSemantics FortranCommon FortranEvaluate +) + +mlir_target_link_libraries(FlangFrontendTests + PRIVATE MLIRIR ) diff --git a/flang/unittests/Optimizer/Builder/ComplexTest.cpp b/flang/unittests/Optimizer/Builder/ComplexTest.cpp index eefab118e255a..689af4642b0b6 100644 --- a/flang/unittests/Optimizer/Builder/ComplexTest.cpp +++ b/flang/unittests/Optimizer/Builder/ComplexTest.cpp @@ -34,7 +34,7 @@ struct ComplexTest : public testing::Test { helper = std::make_unique(*firBuilder, loc); // Init commonly used types - realTy1 = mlir::FloatType::getF32(&context); + realTy1 = mlir::Float32Type::get(&context); complexTy1 = mlir::ComplexType::get(realTy1); integerTy1 = mlir::IntegerType::get(&context, 32); diff --git a/flang/unittests/Optimizer/Builder/FIRBuilderTest.cpp b/flang/unittests/Optimizer/Builder/FIRBuilderTest.cpp index 05407d96998a2..3e2af24c47b96 100644 --- a/flang/unittests/Optimizer/Builder/FIRBuilderTest.cpp +++ b/flang/unittests/Optimizer/Builder/FIRBuilderTest.cpp @@ -146,7 +146,7 @@ TEST_F(FIRBuilderTest, createRealZeroConstant) { auto builder = getBuilder(); auto ctx = builder.getContext(); auto loc = builder.getUnknownLoc(); - auto realTy = mlir::FloatType::getF64(ctx); + auto realTy = mlir::Float64Type::get(ctx); auto cst = builder.createRealZeroConstant(loc, realTy); EXPECT_TRUE(mlir::isa(cst.getDefiningOp())); auto cstOp = dyn_cast(cst.getDefiningOp()); @@ -434,7 +434,7 @@ TEST_F(FIRBuilderTest, createZeroValue) { auto intAttr = mlir::dyn_cast(cst.getValue()); EXPECT_TRUE(intAttr && intAttr.getInt() == 0); - mlir::Type f32Ty = mlir::FloatType::getF32(builder.getContext()); + 
mlir::Type f32Ty = mlir::Float32Type::get(builder.getContext()); mlir::Value zeroFloat = fir::factory::createZeroValue(builder, loc, f32Ty); EXPECT_TRUE(zeroFloat.getType() == f32Ty); auto cst2 = mlir::dyn_cast_or_null( @@ -494,7 +494,7 @@ TEST_F(FIRBuilderTest, getBaseTypeOf) { return {scalars, arrays}; }; - auto f32Ty = mlir::FloatType::getF32(builder.getContext()); + auto f32Ty = mlir::Float32Type::get(builder.getContext()); mlir::Type f32SeqTy = builder.getVarLenSeqTy(f32Ty); auto [f32Scalars, f32Arrays] = makeExv(f32Ty, f32SeqTy); for (const auto &scalar : f32Scalars) { @@ -537,7 +537,7 @@ TEST_F(FIRBuilderTest, genArithFastMath) { auto ctx = builder.getContext(); auto loc = builder.getUnknownLoc(); - auto realTy = mlir::FloatType::getF32(ctx); + auto realTy = mlir::Float32Type::get(ctx); auto arg = builder.create(loc, realTy); // Test that FastMathFlags is 'none' by default. diff --git a/flang/unittests/Optimizer/Builder/HLFIRToolsTest.cpp b/flang/unittests/Optimizer/Builder/HLFIRToolsTest.cpp index 640b7ecc1e565..29700d2d3dbff 100644 --- a/flang/unittests/Optimizer/Builder/HLFIRToolsTest.cpp +++ b/flang/unittests/Optimizer/Builder/HLFIRToolsTest.cpp @@ -59,7 +59,7 @@ struct HLFIRToolsTest : public testing::Test { TEST_F(HLFIRToolsTest, testScalarRoundTrip) { auto &builder = getBuilder(); mlir::Location loc = getLoc(); - mlir::Type f32Type = mlir::FloatType::getF32(&context); + mlir::Type f32Type = mlir::Float32Type::get(&context); mlir::Type scalarf32Type = builder.getRefType(f32Type); mlir::Value scalarf32Addr = builder.create(loc, scalarf32Type); fir::ExtendedValue scalarf32{scalarf32Addr}; @@ -82,7 +82,7 @@ TEST_F(HLFIRToolsTest, testArrayRoundTrip) { llvm::SmallVector lbounds{ createConstant(-1), createConstant(-2)}; - mlir::Type f32Type = mlir::FloatType::getF32(&context); + mlir::Type f32Type = mlir::Float32Type::get(&context); mlir::Type seqf32Type = builder.getVarLenSeqTy(f32Type, 2); mlir::Type arrayf32Type = builder.getRefType(seqf32Type); 
mlir::Value arrayf32Addr = builder.create(loc, arrayf32Type); diff --git a/flang/unittests/Optimizer/FIRTypesTest.cpp b/flang/unittests/Optimizer/FIRTypesTest.cpp index a07c018a8afd5..b3151b4aa7efb 100644 --- a/flang/unittests/Optimizer/FIRTypesTest.cpp +++ b/flang/unittests/Optimizer/FIRTypesTest.cpp @@ -227,7 +227,7 @@ TEST_F(FIRTypesTest, updateTypeForUnlimitedPolymorphic) { mlir::Type ptrArrNone = fir::PointerType::get(arrNone); mlir::Type i32Ty = mlir::IntegerType::get(&context, 32); - mlir::Type f32Ty = mlir::FloatType::getF32(&context); + mlir::Type f32Ty = mlir::Float32Type::get(&context); mlir::Type l1Ty = fir::LogicalType::get(&context, 1); mlir::Type cplx32Ty = mlir::ComplexType::get(f32Ty); mlir::Type char1Ty = fir::CharacterType::get(&context, 1, 10); @@ -268,12 +268,12 @@ TEST_F(FIRTypesTest, getTypeAsString) { fir::ReferenceType::get(mlir::IntegerType::get(&context, 32)), *kindMap)); EXPECT_EQ( - "f64", fir::getTypeAsString(mlir::FloatType::getF64(&context), *kindMap)); + "f64", fir::getTypeAsString(mlir::Float64Type::get(&context), *kindMap)); EXPECT_EQ( "l8", fir::getTypeAsString(fir::LogicalType::get(&context, 1), *kindMap)); EXPECT_EQ("z32", fir::getTypeAsString( - mlir::ComplexType::get(mlir::FloatType::getF32(&context)), *kindMap)); + mlir::ComplexType::get(mlir::Float32Type::get(&context)), *kindMap)); EXPECT_EQ("c8", fir::getTypeAsString(fir::CharacterType::get(&context, 1, 1), *kindMap)); EXPECT_EQ("c8x10", diff --git a/flang/unittests/Optimizer/FortranVariableTest.cpp b/flang/unittests/Optimizer/FortranVariableTest.cpp index 4ba9359a07e4d..30c23b63b4d56 100644 --- a/flang/unittests/Optimizer/FortranVariableTest.cpp +++ b/flang/unittests/Optimizer/FortranVariableTest.cpp @@ -45,7 +45,7 @@ struct FortranVariableTest : public testing::Test { TEST_F(FortranVariableTest, SimpleScalar) { mlir::Location loc = getLoc(); - mlir::Type eleType = mlir::FloatType::getF32(&context); + mlir::Type eleType = mlir::Float32Type::get(&context); mlir::Value 
addr = builder->create(loc, eleType); auto name = mlir::StringAttr::get(&context, "x"); auto declare = builder->create(loc, addr.getType(), addr, @@ -96,7 +96,7 @@ TEST_F(FortranVariableTest, CharacterScalar) { TEST_F(FortranVariableTest, SimpleArray) { mlir::Location loc = getLoc(); - mlir::Type eleType = mlir::FloatType::getF32(&context); + mlir::Type eleType = mlir::Float32Type::get(&context); llvm::SmallVector extents{ createConstant(10), createConstant(20), createConstant(30)}; fir::SequenceType::Shape typeShape( diff --git a/flang/unittests/Optimizer/RTBuilder.cpp b/flang/unittests/Optimizer/RTBuilder.cpp index 35b9f1a6d5dcb..00960801928f7 100644 --- a/flang/unittests/Optimizer/RTBuilder.cpp +++ b/flang/unittests/Optimizer/RTBuilder.cpp @@ -31,7 +31,7 @@ TEST(RTBuilderTest, ComplexRuntimeInterface) { auto c99_cacosf_funcTy = mlir::cast(c99_cacosf_signature); EXPECT_EQ(c99_cacosf_funcTy.getNumInputs(), 1u); EXPECT_EQ(c99_cacosf_funcTy.getNumResults(), 1u); - auto cplx_ty = mlir::ComplexType::get(mlir::FloatType::getF32(&ctx)); + auto cplx_ty = mlir::ComplexType::get(mlir::Float32Type::get(&ctx)); EXPECT_EQ(c99_cacosf_funcTy.getInput(0), cplx_ty); EXPECT_EQ(c99_cacosf_funcTy.getResult(0), cplx_ty); } diff --git a/flang/unittests/Runtime/Support.cpp b/flang/unittests/Runtime/Support.cpp index 9d1a417fdbf42..8c8de73b5b979 100644 --- a/flang/unittests/Runtime/Support.cpp +++ b/flang/unittests/Runtime/Support.cpp @@ -67,3 +67,14 @@ TEST(IsAssumedSize, Basic) { std::vector{}, std::vector{0})}; EXPECT_FALSE(RTNAME(IsAssumedSize)(*scalar)); } + +TEST(DescriptorBytesFor, Basic) { + for (size_t i = 0; i < Fortran::common::TypeCategory_enumSize; ++i) { + auto tc{static_cast(i)}; + if (tc == TypeCategory::Derived) + continue; + + auto b{Descriptor::BytesFor(tc, 4)}; + EXPECT_GT(b, 0U); + } +} diff --git a/libc/docs/CMakeLists.txt b/libc/docs/CMakeLists.txt index fca8caec004f7..f88d7c27f9f6b 100644 --- a/libc/docs/CMakeLists.txt +++ b/libc/docs/CMakeLists.txt @@ -55,6 
+55,10 @@ if (SPHINX_FOUND) strings sys/mman sys/resource + sys/stat + sys/time + sys/wait + termios threads uchar wchar diff --git a/libc/docs/headers/index.rst b/libc/docs/headers/index.rst index 67325e40dcbbd..858b2142defa9 100644 --- a/libc/docs/headers/index.rst +++ b/libc/docs/headers/index.rst @@ -28,6 +28,10 @@ Implementation Status strings sys/mman sys/resource + sys/stat + sys/time + sys/wait + termios threads time uchar diff --git a/libc/include/__llvm-libc-common.h b/libc/include/__llvm-libc-common.h index c63eb134a5e5d..b5a23c5765f4d 100644 --- a/libc/include/__llvm-libc-common.h +++ b/libc/include/__llvm-libc-common.h @@ -39,7 +39,11 @@ #define _Thread_local thread_local #undef __NOEXCEPT +#if __cplusplus >= 201103L #define __NOEXCEPT noexcept +#else +#define __NOEXCEPT throw() +#endif #else // not __cplusplus diff --git a/libc/include/llvm-libc-types/sigset_t.h b/libc/include/llvm-libc-types/sigset_t.h index 1f601488db4c3..8c4d3b49533db 100644 --- a/libc/include/llvm-libc-types/sigset_t.h +++ b/libc/include/llvm-libc-types/sigset_t.h @@ -13,8 +13,8 @@ // This definition can be adjusted/specialized for different targets and // platforms as necessary. This definition works for Linux on most targets. -struct sigset_t { +typedef struct { unsigned long __signals[__NSIGSET_WORDS]; -}; +} sigset_t; #endif // LLVM_LIBC_TYPES_SIGSET_T_H diff --git a/libc/include/llvm-libc-types/struct_sigaction.h b/libc/include/llvm-libc-types/struct_sigaction.h index 4257cfecd643a..b4d0c965a4c63 100644 --- a/libc/include/llvm-libc-types/struct_sigaction.h +++ b/libc/include/llvm-libc-types/struct_sigaction.h @@ -17,7 +17,7 @@ struct sigaction { void (*sa_handler)(int); void (*sa_sigaction)(int, siginfo_t *, void *); }; - struct sigset_t sa_mask; + sigset_t sa_mask; int sa_flags; #ifdef __linux__ // This field is present on linux for most targets. 
diff --git a/libc/src/__support/CMakeLists.txt b/libc/src/__support/CMakeLists.txt index 5090dc218cda4..148484052dcad 100644 --- a/libc/src/__support/CMakeLists.txt +++ b/libc/src/__support/CMakeLists.txt @@ -12,6 +12,7 @@ add_header_library( libc.src.__support.CPP.optional libc.src.__support.CPP.span libc.src.__support.CPP.type_traits + libc.src.__support.math_extras ) add_object_library( diff --git a/libc/src/__support/block.h b/libc/src/__support/block.h index 9ca3f11530c4b..a58c38bbb7acb 100644 --- a/libc/src/__support/block.h +++ b/libc/src/__support/block.h @@ -18,46 +18,22 @@ #include "src/__support/CPP/type_traits.h" #include "src/__support/libc_assert.h" #include "src/__support/macros/config.h" +#include "src/__support/math_extras.h" #include namespace LIBC_NAMESPACE_DECL { -namespace internal { -// Types of corrupted blocks, and functions to crash with an error message -// corresponding to each type. -enum class BlockStatus { - VALID, - MISALIGNED, - PREV_MISMATCHED, - NEXT_MISMATCHED, -}; -} // namespace internal - /// Returns the value rounded down to the nearest multiple of alignment. LIBC_INLINE constexpr size_t align_down(size_t value, size_t alignment) { // Note this shouldn't overflow since the result will always be <= value. return (value / alignment) * alignment; } -/// Returns the value rounded down to the nearest multiple of alignment. -template -LIBC_INLINE constexpr T *align_down(T *value, size_t alignment) { - return reinterpret_cast( - align_down(reinterpret_cast(value), alignment)); -} - -/// Returns the value rounded up to the nearest multiple of alignment. +/// Returns the value rounded up to the nearest multiple of alignment. May wrap +/// around. LIBC_INLINE constexpr size_t align_up(size_t value, size_t alignment) { - __builtin_add_overflow(value, alignment - 1, &value); - return align_down(value, alignment); -} - -/// Returns the value rounded up to the nearest multiple of alignment. 
-template -LIBC_INLINE constexpr T *align_up(T *value, size_t alignment) { - return reinterpret_cast( - align_up(reinterpret_cast(value), alignment)); + return align_down(value + alignment - 1, alignment); } using ByteSpan = cpp::span; @@ -68,8 +44,8 @@ using cpp::optional; /// The blocks store their offsets to the previous and next blocks. The latter /// is also the block's size. /// -/// Blocks will always be aligned to a `ALIGNMENT` boundary. Block sizes will -/// always be rounded up to a multiple of `ALIGNMENT`. +/// All blocks have their usable space aligned to some multiple of max_align_t. +/// This also implies that block outer sizes are aligned to max_align_t. /// /// As an example, the diagram below represents two contiguous `Block`s. The /// indices indicate byte offsets: @@ -122,15 +98,13 @@ class Block { static constexpr size_t SIZE_MASK = ~(PREV_FREE_MASK | LAST_MASK); public: - static constexpr size_t ALIGNMENT = cpp::max(alignof(max_align_t), size_t{4}); - static const size_t BLOCK_OVERHEAD; - // No copy or move. Block(const Block &other) = delete; Block &operator=(const Block &other) = delete; - /// Creates the first block for a given memory region, followed by a sentinel - /// last block. Returns the first block. + /// Initializes a given memory region into a first block and a sentinel last + /// block. Returns the first block, which has its usable space aligned to + /// max_align_t. static optional init(ByteSpan region); /// @returns A pointer to a `Block`, given a pointer to the start of the @@ -142,11 +116,11 @@ class Block { /// pointer will return a non-null pointer. 
LIBC_INLINE static Block *from_usable_space(void *usable_space) { auto *bytes = reinterpret_cast(usable_space); - return reinterpret_cast(bytes - BLOCK_OVERHEAD); + return reinterpret_cast(bytes - sizeof(Block)); } LIBC_INLINE static const Block *from_usable_space(const void *usable_space) { const auto *bytes = reinterpret_cast(usable_space); - return reinterpret_cast(bytes - BLOCK_OVERHEAD); + return reinterpret_cast(bytes - sizeof(Block)); } /// @returns The total size of the block in bytes, including the header. @@ -154,7 +128,7 @@ class Block { LIBC_INLINE static size_t outer_size(size_t inner_size) { // The usable region includes the prev_ field of the next block. - return inner_size - sizeof(prev_) + BLOCK_OVERHEAD; + return inner_size - sizeof(prev_) + sizeof(Block); } /// @returns The number of usable bytes inside the block were it to be @@ -182,15 +156,23 @@ class Block { /// @returns The number of usable bytes inside a block with the given outer /// size if it remains free. LIBC_INLINE static size_t inner_size_free(size_t outer_size) { - return outer_size - BLOCK_OVERHEAD; + return outer_size - sizeof(Block); } /// @returns A pointer to the usable space inside this block. + /// + /// Aligned to some multiple of max_align_t. LIBC_INLINE cpp::byte *usable_space() { - return reinterpret_cast(this) + BLOCK_OVERHEAD; + auto *s = reinterpret_cast(this) + sizeof(Block); + LIBC_ASSERT(reinterpret_cast(s) % alignof(max_align_t) == 0 && + "usable space must be aligned to a multiple of max_align_t"); + return s; } LIBC_INLINE const cpp::byte *usable_space() const { - return reinterpret_cast(this) + BLOCK_OVERHEAD; + const auto *s = reinterpret_cast(this) + sizeof(Block); + LIBC_ASSERT(reinterpret_cast(s) % alignof(max_align_t) == 0 && + "usable space must be aligned to a multiple of max_align_t"); + return s; } // @returns The region of memory the block manages, including the header. @@ -201,11 +183,12 @@ class Block { /// Attempts to split this block. 
/// /// If successful, the block will have an inner size of at least - /// `new_inner_size`, rounded to ensure that the split point is on an - /// ALIGNMENT boundary. The remaining space will be returned as a new block. - /// Note that the prev_ field of the next block counts as part of the inner - /// size of the returnd block. - optional split(size_t new_inner_size); + /// `new_inner_size`. The remaining space will be returned as a new block, + /// with usable space aligned to `usable_space_alignment`. Note that the prev_ + /// field of the next block counts as part of the inner size of the block. + /// `usable_space_alignment` must be a multiple of max_align_t. + optional split(size_t new_inner_size, + size_t usable_space_alignment = alignof(max_align_t)); /// Merges this block with the one that comes after it. bool merge_next(); @@ -244,50 +227,63 @@ class Block { *new (&next()->prev_) size_t = outer_size(); } - /// Marks this block as the last one in the chain. Makes next() return - /// nullptr. - LIBC_INLINE void mark_last() { next_ |= LAST_MASK; } - - LIBC_INLINE constexpr Block(size_t outer_size) : next_(outer_size) { - LIBC_ASSERT(outer_size % ALIGNMENT == 0 && "block sizes must be aligned"); + LIBC_INLINE Block(size_t outer_size, bool is_last) : next_(outer_size) { + // Last blocks are not usable, so they need not have sizes aligned to + // max_align_t. Their lower bits must still be free, so they must be aligned + // to Block. + LIBC_ASSERT( + outer_size % (is_last ? 
alignof(Block) : alignof(max_align_t)) == 0 && + "block sizes must be aligned"); + LIBC_ASSERT(is_usable_space_aligned(alignof(max_align_t)) && + "usable space must be aligned to a multiple of max_align_t"); + if (is_last) + next_ |= LAST_MASK; } LIBC_INLINE bool is_usable_space_aligned(size_t alignment) const { return reinterpret_cast(usable_space()) % alignment == 0; } - /// @returns The new inner size of this block that would give the usable - /// space of the next block the given alignment. - LIBC_INLINE size_t padding_for_alignment(size_t alignment) const { - if (is_usable_space_aligned(alignment)) + // Returns the minimum inner size necessary for a block of that size to + // always be able to allocate at the given size and alignment. + // + // Returns 0 if there is no such size. + LIBC_INLINE static size_t min_size_for_allocation(size_t alignment, + size_t size) { + LIBC_ASSERT(alignment >= alignof(max_align_t) && + alignment % alignof(max_align_t) == 0 && + "alignment must be multiple of max_align_t"); + + if (alignment == alignof(max_align_t)) + return size; + + // We must create a new block inside this one (splitting). This requires a + // block header in addition to the requested size. + if (add_overflow(size, sizeof(Block), size)) return 0; - // We need to ensure we can always split this block into a "padding" block - // and the aligned block. To do this, we need enough extra space for at - // least one block. + // Beyond that, padding space may need to remain in this block to ensure + // that the usable space of the next block is aligned. // - // |block |usable_space | - // |........|......................................| - // ^ - // Alignment requirement + // Consider a position P of some lesser alignment, L, with maximal distance + // to the next position of some greater alignment, G, where G is a multiple + // of L. P must be one L unit past a G-aligned point. If it were one L-unit + // earlier, its distance would be zero. 
If it were one L-unit later, its + // distance would not be maximal. If it were not some integral number of L + // units away, it would not be L-aligned. // + // So the maximum distance would be G - L. As a special case, if L is 1 + // (unaligned), the max distance is G - 1. // - // |block |space |block |usable_space | - // |........|........|........|....................| - // ^ - // Alignment requirement - // - alignment = cpp::max(alignment, ALIGNMENT); - uintptr_t start = reinterpret_cast(usable_space()); - uintptr_t next_usable_space = align_up(start + BLOCK_OVERHEAD, alignment); - uintptr_t next_block = next_usable_space - BLOCK_OVERHEAD; - return next_block - start + sizeof(prev_); + // This block's usable space is aligned to max_align_t >= Block. With zero + // padding, the next block's usable space is sizeof(Block) past it, which is + // a point aligned to Block. Thus the max padding needed is alignment - + // alignof(Block). + if (add_overflow(size, alignment - alignof(Block), size)) + return 0; + return size; } - // Check that we can `allocate` a block with a given alignment and size from - // this existing block. - bool can_allocate(size_t alignment, size_t size) const; - // This is the return type for `allocate` which can split one block into up to // three blocks. struct BlockInfo { @@ -309,20 +305,36 @@ class Block { Block *next; }; - // Divide a block into up to 3 blocks according to `BlockInfo`. This should - // only be called if `can_allocate` returns true. + // Divide a block into up to 3 blocks according to `BlockInfo`. Behavior is + // undefined if allocation is not possible for the given size and alignment. static BlockInfo allocate(Block *block, size_t alignment, size_t size); + // These two functions may wrap around. 
+ LIBC_INLINE static uintptr_t next_possible_block_start( + uintptr_t ptr, size_t usable_space_alignment = alignof(max_align_t)) { + return align_up(ptr + sizeof(Block), usable_space_alignment) - + sizeof(Block); + } + LIBC_INLINE static uintptr_t prev_possible_block_start( + uintptr_t ptr, size_t usable_space_alignment = alignof(max_align_t)) { + return align_down(ptr, usable_space_alignment) - sizeof(Block); + } + private: /// Construct a block to represent a span of bytes. Overwrites only enough /// memory for the block header; the rest of the span is left alone. LIBC_INLINE static Block *as_block(ByteSpan bytes) { - return ::new (bytes.data()) Block(bytes.size()); + LIBC_ASSERT(reinterpret_cast(bytes.data()) % alignof(Block) == + 0 && + "block start must be suitably aligned"); + return ::new (bytes.data()) Block(bytes.size(), /*is_last=*/false); } - /// Like `split`, but assumes the caller has already checked to parameters to - /// ensure the split will succeed. - Block *split_impl(size_t new_inner_size); + LIBC_INLINE static void make_last_block(cpp::byte *start) { + LIBC_ASSERT(reinterpret_cast(start) % alignof(Block) == 0 && + "block start must be suitably aligned"); + ::new (start) Block(sizeof(Block), /*is_last=*/true); + } /// Offset from this block to the previous block. 0 if this is the first /// block. This field is only alive when the previous block is free; @@ -343,81 +355,57 @@ class Block { /// previous block is free. /// * If the `last` flag is set, the block is the sentinel last block. It is /// summarily considered used and has no next block. 
-} __attribute__((packed, aligned(cpp::max(alignof(max_align_t), size_t{4})))); - -inline constexpr size_t Block::BLOCK_OVERHEAD = - align_up(sizeof(Block), ALIGNMENT); - -LIBC_INLINE ByteSpan get_aligned_subspan(ByteSpan bytes, size_t alignment) { - if (bytes.data() == nullptr) - return ByteSpan(); - - auto unaligned_start = reinterpret_cast(bytes.data()); - auto aligned_start = align_up(unaligned_start, alignment); - auto unaligned_end = unaligned_start + bytes.size(); - auto aligned_end = align_down(unaligned_end, alignment); - if (aligned_end <= aligned_start) - return ByteSpan(); +public: + /// Only for testing. + static constexpr size_t PREV_FIELD_SIZE = sizeof(prev_); +}; - return bytes.subspan(aligned_start - unaligned_start, - aligned_end - aligned_start); -} +static_assert(alignof(Block) >= 4, + "at least 2 bits must be available in block sizes for flags"); LIBC_INLINE optional Block::init(ByteSpan region) { - optional result = get_aligned_subspan(region, ALIGNMENT); - if (!result) + if (!region.data()) return {}; - region = result.value(); - // Two blocks are allocated: a free block and a sentinel last block. - if (region.size() < 2 * BLOCK_OVERHEAD) + uintptr_t start = reinterpret_cast(region.data()); + uintptr_t end = start + region.size(); + if (end < start) return {}; - if (cpp::numeric_limits::max() < region.size()) + uintptr_t block_start = next_possible_block_start(start); + if (block_start < start) return {}; - Block *block = as_block(region.first(region.size() - BLOCK_OVERHEAD)); - Block *last = as_block(region.last(BLOCK_OVERHEAD)); - block->mark_free(); - last->mark_last(); - return block; -} - -LIBC_INLINE -bool Block::can_allocate(size_t alignment, size_t size) const { - if (inner_size() < size) - return false; - if (is_usable_space_aligned(alignment)) - return true; + uintptr_t last_start = prev_possible_block_start(end); + if (last_start >= end) + return {}; - // Alignment isn't met, so a padding block is needed. 
Determine amount of - // inner_size() consumed by the padding block. - size_t padding_size = padding_for_alignment(alignment) - sizeof(prev_); + if (block_start + sizeof(Block) > last_start) + return {}; - // Check that there is room for the allocation in the following aligned block. - size_t aligned_inner_size = inner_size() - padding_size - BLOCK_OVERHEAD; - return size <= aligned_inner_size; + auto *last_start_ptr = reinterpret_cast(last_start); + Block *block = + as_block({reinterpret_cast(block_start), last_start_ptr}); + make_last_block(last_start_ptr); + block->mark_free(); + return block; } LIBC_INLINE Block::BlockInfo Block::allocate(Block *block, size_t alignment, size_t size) { - LIBC_ASSERT( - block->can_allocate(alignment, size) && - "Calls to this function for a given alignment and size should only be " - "done if `can_allocate` for these parameters returns true."); + LIBC_ASSERT(alignment % alignof(max_align_t) == 0 && + "alignment must be a multiple of max_align_t"); BlockInfo info{block, /*prev=*/nullptr, /*next=*/nullptr}; if (!info.block->is_usable_space_aligned(alignment)) { Block *original = info.block; - optional maybe_aligned_block = - original->split(info.block->padding_for_alignment(alignment)); + // The padding block has no minimum size requirement. + optional maybe_aligned_block = original->split(0, alignment); LIBC_ASSERT(maybe_aligned_block.has_value() && - "This split should always result in a new block. 
The check in " - "`can_allocate` ensures that we have enough space here to make " - "two blocks."); + "it should always be possible to split for alignment"); if (Block *prev = original->prev_free()) { // If there is a free block before this, we can merge the current one with @@ -441,37 +429,40 @@ Block::BlockInfo Block::allocate(Block *block, size_t alignment, size_t size) { } LIBC_INLINE -optional Block::split(size_t new_inner_size) { +optional Block::split(size_t new_inner_size, + size_t usable_space_alignment) { + LIBC_ASSERT(usable_space_alignment % alignof(max_align_t) == 0 && + "alignment must be a multiple of max_align_t"); if (used()) return {}; - // The prev_ field of the next block is always available, so there is a - // minimum size to a block created through splitting. - if (new_inner_size < sizeof(prev_)) - new_inner_size = sizeof(prev_); - - size_t old_inner_size = inner_size(); - new_inner_size = - align_up(new_inner_size - sizeof(prev_), ALIGNMENT) + sizeof(prev_); - if (old_inner_size < new_inner_size) - return {}; - if (old_inner_size - new_inner_size < BLOCK_OVERHEAD) + // Compute the minimum outer size that produces a block of at least + // `new_inner_size`. 
+ size_t min_outer_size = outer_size(cpp::max(new_inner_size, sizeof(prev_))); + + uintptr_t start = reinterpret_cast(this); + uintptr_t next_block_start = + next_possible_block_start(start + min_outer_size, usable_space_alignment); + if (next_block_start < start) return {}; + size_t new_outer_size = next_block_start - start; + LIBC_ASSERT(new_outer_size % alignof(max_align_t) == 0 && + "new size must be aligned to max_align_t"); - return split_impl(new_inner_size); -} + if (outer_size() < new_outer_size || + outer_size() - new_outer_size < sizeof(Block)) + return {}; -LIBC_INLINE -Block *Block::split_impl(size_t new_inner_size) { - size_t outer_size1 = outer_size(new_inner_size); - LIBC_ASSERT(outer_size1 % ALIGNMENT == 0 && "new size must be aligned"); - ByteSpan new_region = region().subspan(outer_size1); + ByteSpan new_region = region().subspan(new_outer_size); next_ &= ~SIZE_MASK; - next_ |= outer_size1; + next_ |= new_outer_size; Block *new_block = as_block(new_region); mark_free(); // Free status for this block is now stored in new_block. new_block->next()->prev_ = new_region.size(); + + LIBC_ASSERT(new_block->is_usable_space_aligned(usable_space_alignment) && + "usable space must have requested alignment"); return new_block; } diff --git a/libc/src/__support/freelist_heap.h b/libc/src/__support/freelist_heap.h index 8fa36257cb91a..d58685194aeb8 100644 --- a/libc/src/__support/freelist_heap.h +++ b/libc/src/__support/freelist_heap.h @@ -89,28 +89,14 @@ LIBC_INLINE void *FreeListHeap::allocate_impl(size_t alignment, size_t size) { if (!is_initialized) init(); - size_t request_size = size; - - // TODO: usable_space should always be aligned to max_align_t. - if (alignment > alignof(max_align_t) || - (Block::BLOCK_OVERHEAD % alignof(max_align_t) != 0)) { - // TODO: This bound isn't precisely calculated yet. It assumes one extra - // Block::ALIGNMENT to accomodate the possibility for padding block - // overhead. 
(alignment - 1) ensures that there is an aligned point - // somewhere in usable_space, but this isn't tight either, since - // usable_space is also already somewhat aligned. - if (add_overflow(size, (alignment - 1) + Block::ALIGNMENT, request_size)) - return nullptr; - } + size_t request_size = Block::min_size_for_allocation(alignment, size); + if (!request_size) + return nullptr; Block *block = free_store.remove_best_fit(request_size); if (!block) return nullptr; - LIBC_ASSERT(block->can_allocate(alignment, size) && - "block should always be large enough to allocate at the correct " - "alignment"); - auto block_info = Block::allocate(block, alignment, size); if (block_info.next) free_store.insert(block_info.next); @@ -135,6 +121,9 @@ LIBC_INLINE void *FreeListHeap::aligned_allocate(size_t alignment, if (size % alignment != 0) return nullptr; + // The minimum alignment supported by Block is max_align_t. + alignment = cpp::max(alignment, alignof(max_align_t)); + return allocate_impl(alignment, size); } diff --git a/libc/src/__support/freestore.h b/libc/src/__support/freestore.h index 97197dda4b546..09f2479debb36 100644 --- a/libc/src/__support/freestore.h +++ b/libc/src/__support/freestore.h @@ -40,13 +40,12 @@ class FreeStore { Block *remove_best_fit(size_t size); private: - static constexpr size_t ALIGNMENT = alignof(max_align_t); static constexpr size_t MIN_OUTER_SIZE = - align_up(Block::BLOCK_OVERHEAD + sizeof(FreeList::Node), ALIGNMENT); + align_up(sizeof(Block) + sizeof(FreeList::Node), alignof(max_align_t)); static constexpr size_t MIN_LARGE_OUTER_SIZE = - align_up(Block::BLOCK_OVERHEAD + sizeof(FreeTrie::Node), ALIGNMENT); + align_up(sizeof(Block) + sizeof(FreeTrie::Node), alignof(max_align_t)); static constexpr size_t NUM_SMALL_SIZES = - (MIN_LARGE_OUTER_SIZE - MIN_OUTER_SIZE) / ALIGNMENT; + (MIN_LARGE_OUTER_SIZE - MIN_OUTER_SIZE) / alignof(max_align_t); LIBC_INLINE static bool too_small(Block *block) { return block->outer_size() < MIN_OUTER_SIZE; @@ -99,7 
+98,8 @@ LIBC_INLINE Block *FreeStore::remove_best_fit(size_t size) { LIBC_INLINE FreeList &FreeStore::small_list(Block *block) { LIBC_ASSERT(is_small(block) && "only legal for small blocks"); - return small_lists[(block->outer_size() - MIN_OUTER_SIZE) / ALIGNMENT]; + return small_lists[(block->outer_size() - MIN_OUTER_SIZE) / + alignof(max_align_t)]; } LIBC_INLINE FreeList *FreeStore::find_best_small_fit(size_t size) { diff --git a/libc/test/src/__support/block_test.cpp b/libc/test/src/__support/block_test.cpp index 5e437db51b609..904ac5c66994d 100644 --- a/libc/test/src/__support/block_test.cpp +++ b/libc/test/src/__support/block_test.cpp @@ -22,23 +22,28 @@ using LIBC_NAMESPACE::cpp::span; TEST(LlvmLibcBlockTest, CanCreateSingleAlignedBlock) { constexpr size_t kN = 1024; - alignas(Block::ALIGNMENT) array bytes; + alignas(max_align_t) array bytes; auto result = Block::init(bytes); ASSERT_TRUE(result.has_value()); Block *block = *result; + EXPECT_EQ(reinterpret_cast(block) % alignof(Block), size_t{0}); + EXPECT_TRUE(block->is_usable_space_aligned(alignof(max_align_t))); + Block *last = block->next(); ASSERT_NE(last, static_cast(nullptr)); - constexpr size_t last_outer_size = Block::BLOCK_OVERHEAD; - EXPECT_EQ(last->outer_size(), last_outer_size); + EXPECT_EQ(reinterpret_cast(last) % alignof(Block), size_t{0}); + + EXPECT_EQ(last->outer_size(), sizeof(Block)); EXPECT_EQ(last->prev_free(), block); EXPECT_TRUE(last->used()); - EXPECT_EQ(block->outer_size(), kN - last_outer_size); - constexpr size_t last_prev_field_size = sizeof(size_t); - EXPECT_EQ(block->inner_size(), kN - last_outer_size - Block::BLOCK_OVERHEAD + - last_prev_field_size); + size_t block_outer_size = + reinterpret_cast(last) - reinterpret_cast(block); + EXPECT_EQ(block->outer_size(), block_outer_size); + EXPECT_EQ(block->inner_size(), + block_outer_size - sizeof(Block) + Block::PREV_FIELD_SIZE); EXPECT_EQ(block->prev_free(), static_cast(nullptr)); EXPECT_FALSE(block->used()); } @@ -47,11 +52,19 @@ 
TEST(LlvmLibcBlockTest, CanCreateUnalignedSingleBlock) { constexpr size_t kN = 1024; // Force alignment, so we can un-force it below - alignas(Block::ALIGNMENT) array bytes; + alignas(max_align_t) array bytes; span aligned(bytes); auto result = Block::init(aligned.subspan(1)); EXPECT_TRUE(result.has_value()); + + Block *block = *result; + EXPECT_EQ(reinterpret_cast(block) % alignof(Block), size_t{0}); + EXPECT_TRUE(block->is_usable_space_aligned(alignof(max_align_t))); + + Block *last = block->next(); + ASSERT_NE(last, static_cast(nullptr)); + EXPECT_EQ(reinterpret_cast(last) % alignof(Block), size_t{0}); } TEST(LlvmLibcBlockTest, CannotCreateTooSmallBlock) { @@ -62,11 +75,13 @@ TEST(LlvmLibcBlockTest, CannotCreateTooSmallBlock) { TEST(LlvmLibcBlockTest, CanSplitBlock) { constexpr size_t kN = 1024; - constexpr size_t prev_field_size = sizeof(size_t); - // Give the split position a large alignment. - constexpr size_t kSplitN = 512 + prev_field_size; - alignas(Block::ALIGNMENT) array bytes; + // Choose a split position such that the next block's usable space is 512 + // bytes from this one's. This should be sufficient for any machine's + // alignment. 
+ const size_t kSplitN = Block::inner_size(512); + + array bytes; auto result = Block::init(bytes); ASSERT_TRUE(result.has_value()); auto *block1 = *result; @@ -78,10 +93,12 @@ TEST(LlvmLibcBlockTest, CanSplitBlock) { EXPECT_EQ(block1->inner_size(), kSplitN); EXPECT_EQ(block1->outer_size(), - kSplitN - prev_field_size + Block::BLOCK_OVERHEAD); + kSplitN - Block::PREV_FIELD_SIZE + sizeof(Block)); EXPECT_EQ(block2->outer_size(), orig_size - block1->outer_size()); EXPECT_FALSE(block2->used()); + EXPECT_EQ(reinterpret_cast(block2) % alignof(Block), size_t{0}); + EXPECT_TRUE(block2->is_usable_space_aligned(alignof(max_align_t))); EXPECT_EQ(block1->next(), block2); EXPECT_EQ(block2->prev_free(), block1); @@ -90,28 +107,24 @@ TEST(LlvmLibcBlockTest, CanSplitBlock) { TEST(LlvmLibcBlockTest, CanSplitBlockUnaligned) { constexpr size_t kN = 1024; - alignas(Block::ALIGNMENT) array bytes; + array bytes; auto result = Block::init(bytes); ASSERT_TRUE(result.has_value()); Block *block1 = *result; size_t orig_size = block1->outer_size(); constexpr size_t kSplitN = 513; - constexpr size_t prev_field_size = sizeof(size_t); - uintptr_t split_addr = - reinterpret_cast(block1) + (kSplitN - prev_field_size); - // Round split_addr up to a multiple of the alignment. 
- split_addr += alignof(Block) - (split_addr % alignof(Block)); - uintptr_t split_len = split_addr - (uintptr_t)&bytes + prev_field_size; result = block1->split(kSplitN); ASSERT_TRUE(result.has_value()); Block *block2 = *result; - EXPECT_EQ(block1->inner_size(), split_len); + EXPECT_GE(block1->inner_size(), kSplitN); EXPECT_EQ(block2->outer_size(), orig_size - block1->outer_size()); EXPECT_FALSE(block2->used()); + EXPECT_EQ(reinterpret_cast(block2) % alignof(Block), size_t{0}); + EXPECT_TRUE(block2->is_usable_space_aligned(alignof(max_align_t))); EXPECT_EQ(block1->next(), block2); EXPECT_EQ(block2->prev_free(), block1); @@ -131,7 +144,7 @@ TEST(LlvmLibcBlockTest, CanSplitMidBlock) { constexpr size_t kSplit1 = 512; constexpr size_t kSplit2 = 256; - alignas(Block::ALIGNMENT) array bytes; + array bytes; auto result = Block::init(bytes); ASSERT_TRUE(result.has_value()); Block *block1 = *result; @@ -152,27 +165,25 @@ TEST(LlvmLibcBlockTest, CanSplitMidBlock) { TEST(LlvmLibcBlockTest, CannotSplitTooSmallBlock) { constexpr size_t kN = 64; - constexpr size_t kSplitN = kN + 1; - alignas(Block::ALIGNMENT) array bytes; + array bytes; auto result = Block::init(bytes); ASSERT_TRUE(result.has_value()); Block *block = *result; - result = block->split(kSplitN); + result = block->split(block->inner_size() + 1); ASSERT_FALSE(result.has_value()); } TEST(LlvmLibcBlockTest, CannotSplitBlockWithoutHeaderSpace) { constexpr size_t kN = 1024; - constexpr size_t kSplitN = kN - 2 * Block::BLOCK_OVERHEAD - 1; - alignas(Block::ALIGNMENT) array bytes; + array bytes; auto result = Block::init(bytes); ASSERT_TRUE(result.has_value()); Block *block = *result; - result = block->split(kSplitN); + result = block->split(block->inner_size() - sizeof(Block) + 1); ASSERT_FALSE(result.has_value()); } @@ -180,7 +191,7 @@ TEST(LlvmLibcBlockTest, CannotMakeBlockLargerInSplit) { // Ensure that we can't ask for more space than the block actually has... 
constexpr size_t kN = 1024; - alignas(Block::ALIGNMENT) array bytes; + array bytes; auto result = Block::init(bytes); ASSERT_TRUE(result.has_value()); Block *block = *result; @@ -189,55 +200,41 @@ TEST(LlvmLibcBlockTest, CannotMakeBlockLargerInSplit) { ASSERT_FALSE(result.has_value()); } -TEST(LlvmLibcBlockTest, CannotMakeSecondBlockLargerInSplit) { - // Ensure that the second block in split is at least of the size of header. - constexpr size_t kN = 1024; - - alignas(Block::ALIGNMENT) array bytes; - auto result = Block::init(bytes); - ASSERT_TRUE(result.has_value()); - Block *block = *result; - - result = block->split(block->inner_size() - Block::BLOCK_OVERHEAD + 1); - ASSERT_FALSE(result.has_value()); -} - TEST(LlvmLibcBlockTest, CanMakeMinimalSizeFirstBlock) { // This block does support splitting with minimal payload size. constexpr size_t kN = 1024; - constexpr size_t minimal_size = sizeof(size_t); - alignas(Block::ALIGNMENT) array bytes; + array bytes; auto result = Block::init(bytes); ASSERT_TRUE(result.has_value()); Block *block = *result; - result = block->split(minimal_size); + result = block->split(0); ASSERT_TRUE(result.has_value()); - EXPECT_EQ(block->inner_size(), minimal_size); + EXPECT_LE(block->outer_size(), sizeof(Block) + alignof(max_align_t)); } TEST(LlvmLibcBlockTest, CanMakeMinimalSizeSecondBlock) { // Likewise, the split block can be minimal-width. 
constexpr size_t kN = 1024; - constexpr size_t minimal_size = sizeof(size_t); - alignas(Block::ALIGNMENT) array bytes; + array bytes; auto result = Block::init(bytes); ASSERT_TRUE(result.has_value()); Block *block1 = *result; - result = block1->split(block1->inner_size() - Block::BLOCK_OVERHEAD); + result = block1->split(Block::prev_possible_block_start( + reinterpret_cast(block1->next())) - + reinterpret_cast(block1->usable_space()) + + Block::PREV_FIELD_SIZE); ASSERT_TRUE(result.has_value()); - Block *block2 = *result; - - EXPECT_EQ(block2->inner_size(), minimal_size); + EXPECT_LE((*result)->outer_size(), sizeof(Block) + alignof(max_align_t)); } TEST(LlvmLibcBlockTest, CanMarkBlockUsed) { constexpr size_t kN = 1024; - alignas(Block::ALIGNMENT) array bytes; + array bytes; auto result = Block::init(bytes); ASSERT_TRUE(result.has_value()); Block *block = *result; @@ -255,7 +252,7 @@ TEST(LlvmLibcBlockTest, CannotSplitUsedBlock) { constexpr size_t kN = 1024; constexpr size_t kSplitN = 512; - alignas(Block::ALIGNMENT) array bytes; + array bytes; auto result = Block::init(bytes); ASSERT_TRUE(result.has_value()); Block *block = *result; @@ -269,21 +266,19 @@ TEST(LlvmLibcBlockTest, CanMergeWithNextBlock) { // Do the three way merge from "CanSplitMidBlock", and let's // merge block 3 and 2 constexpr size_t kN = 1024; - // Give the split positions large alignments. 
- constexpr size_t prev_field_size = sizeof(size_t); - constexpr size_t kSplit1 = 512 + prev_field_size; - constexpr size_t kSplit2 = 256 + prev_field_size; - - alignas(Block::ALIGNMENT) array bytes; + constexpr size_t kSplit1 = 512; + constexpr size_t kSplit2 = 256; + array bytes; auto result = Block::init(bytes); ASSERT_TRUE(result.has_value()); Block *block1 = *result; - size_t orig_size = block1->outer_size(); + size_t total_size = block1->outer_size(); result = block1->split(kSplit1); ASSERT_TRUE(result.has_value()); result = block1->split(kSplit2); + size_t block1_size = block1->outer_size(); ASSERT_TRUE(result.has_value()); Block *block3 = *result; @@ -291,15 +286,15 @@ TEST(LlvmLibcBlockTest, CanMergeWithNextBlock) { EXPECT_EQ(block1->next(), block3); EXPECT_EQ(block3->prev_free(), block1); - EXPECT_EQ(block1->inner_size(), kSplit2); - EXPECT_EQ(block3->outer_size(), orig_size - block1->outer_size()); + EXPECT_EQ(block1->outer_size(), block1_size); + EXPECT_EQ(block3->outer_size(), total_size - block1->outer_size()); } TEST(LlvmLibcBlockTest, CannotMergeWithFirstOrLastBlock) { constexpr size_t kN = 1024; constexpr size_t kSplitN = 512; - alignas(Block::ALIGNMENT) array bytes; + array bytes; auto result = Block::init(bytes); ASSERT_TRUE(result.has_value()); Block *block1 = *result; @@ -316,7 +311,7 @@ TEST(LlvmLibcBlockTest, CannotMergeUsedBlock) { constexpr size_t kN = 1024; constexpr size_t kSplitN = 512; - alignas(Block::ALIGNMENT) array bytes; + array bytes; auto result = Block::init(bytes); ASSERT_TRUE(result.has_value()); Block *block = *result; @@ -330,9 +325,7 @@ TEST(LlvmLibcBlockTest, CannotMergeUsedBlock) { } TEST(LlvmLibcBlockTest, CanGetBlockFromUsableSpace) { - constexpr size_t kN = 1024; - - array bytes{}; + array bytes; auto result = Block::init(bytes); ASSERT_TRUE(result.has_value()); Block *block1 = *result; @@ -355,93 +348,85 @@ TEST(LlvmLibcBlockTest, CanGetConstBlockFromUsableSpace) { EXPECT_EQ(block1, block2); } -TEST(LlvmLibcBlockTest, 
CanAllocate) { - constexpr size_t kN = 1024 + Block::BLOCK_OVERHEAD; +TEST(LlvmLibcBlockTest, Allocate) { + constexpr size_t kN = 1024; // Ensure we can allocate everything up to the block size within this block. - for (size_t i = 0; i < kN - 2 * Block::BLOCK_OVERHEAD; ++i) { - alignas(Block::ALIGNMENT) array bytes{}; + for (size_t i = 0; i < kN; ++i) { + array bytes; auto result = Block::init(bytes); ASSERT_TRUE(result.has_value()); Block *block = *result; - constexpr size_t ALIGN = 1; // Effectively ignores alignment. - EXPECT_TRUE(block->can_allocate(ALIGN, i)); + if (i > block->inner_size()) + continue; - // For each can_allocate, we should be able to do a successful call to - // allocate. - auto info = Block::allocate(block, ALIGN, i); + auto info = Block::allocate(block, alignof(max_align_t), i); EXPECT_NE(info.block, static_cast(nullptr)); } - alignas(Block::ALIGNMENT) array bytes{}; - auto result = Block::init(bytes); - ASSERT_TRUE(result.has_value()); - Block *block = *result; + // Ensure we can allocate a byte at every guaranteeable alignment. + for (size_t i = 1; i < kN / alignof(max_align_t); ++i) { + array bytes; + auto result = Block::init(bytes); + ASSERT_TRUE(result.has_value()); + Block *block = *result; - // Given a block of size N (assuming it's also a power of two), we should be - // able to allocate a block within it that's aligned to N/2. This is - // because regardless of where the buffer is located, we can always find a - // starting location within it that meets this alignment. 
- EXPECT_TRUE(block->can_allocate(block->outer_size() / 2, 1)); - auto info = Block::allocate(block, block->outer_size() / 2, 1); - EXPECT_NE(info.block, static_cast(nullptr)); + size_t alignment = i * alignof(max_align_t); + if (Block::min_size_for_allocation(alignment, 1) > block->inner_size()) + continue; + + auto info = Block::allocate(block, alignment, 1); + EXPECT_NE(info.block, static_cast(nullptr)); + } } TEST(LlvmLibcBlockTest, AllocateAlreadyAligned) { constexpr size_t kN = 1024; - alignas(Block::ALIGNMENT) array bytes{}; + array bytes; auto result = Block::init(bytes); ASSERT_TRUE(result.has_value()); Block *block = *result; + uintptr_t orig_end = reinterpret_cast(block) + block->outer_size(); - // This should result in no new blocks. - constexpr size_t kAlignment = Block::ALIGNMENT; - constexpr size_t prev_field_size = sizeof(size_t); - constexpr size_t kExpectedSize = Block::ALIGNMENT + prev_field_size; - EXPECT_TRUE(block->can_allocate(kAlignment, kExpectedSize)); + constexpr size_t SIZE = Block::PREV_FIELD_SIZE + 1; auto [aligned_block, prev, next] = - Block::allocate(block, Block::ALIGNMENT, kExpectedSize); + Block::allocate(block, alignof(max_align_t), SIZE); // Since this is already aligned, there should be no previous block. EXPECT_EQ(prev, static_cast(nullptr)); - // Ensure we the block is aligned and the size we expect. + // Ensure we the block is aligned and large enough. EXPECT_NE(aligned_block, static_cast(nullptr)); - EXPECT_TRUE(aligned_block->is_usable_space_aligned(Block::ALIGNMENT)); - EXPECT_EQ(aligned_block->inner_size(), kExpectedSize); + EXPECT_TRUE(aligned_block->is_usable_space_aligned(alignof(max_align_t))); + EXPECT_GE(aligned_block->inner_size(), SIZE); // Check the next block. 
EXPECT_NE(next, static_cast(nullptr)); EXPECT_EQ(aligned_block->next(), next); - EXPECT_EQ(reinterpret_cast(next) + next->outer_size(), - bytes.data() + bytes.size() - Block::BLOCK_OVERHEAD); + EXPECT_EQ(reinterpret_cast(next) + next->outer_size(), orig_end); } TEST(LlvmLibcBlockTest, AllocateNeedsAlignment) { constexpr size_t kN = 1024; - alignas(kN) array bytes{}; + array bytes; auto result = Block::init(bytes); ASSERT_TRUE(result.has_value()); Block *block = *result; - // Ensure first the usable_data is only aligned to the block alignment. - ASSERT_EQ(block->usable_space(), bytes.data() + Block::BLOCK_OVERHEAD); - ASSERT_EQ(block->prev_free(), static_cast(nullptr)); + uintptr_t orig_end = reinterpret_cast(block) + block->outer_size(); // Now pick an alignment such that the usable space is not already aligned to // it. We want to explicitly test that the block will split into one before // it. - constexpr size_t kAlignment = bit_ceil(Block::BLOCK_OVERHEAD) * 8; - ASSERT_FALSE(block->is_usable_space_aligned(kAlignment)); - - constexpr size_t kSize = 10; - EXPECT_TRUE(block->can_allocate(kAlignment, kSize)); + size_t alignment = alignof(max_align_t); + while (block->is_usable_space_aligned(alignment)) + alignment += alignof(max_align_t); - auto [aligned_block, prev, next] = Block::allocate(block, kAlignment, kSize); + auto [aligned_block, prev, next] = Block::allocate(block, alignment, 10); // Check the previous block was created appropriately. Since this block is the // first block, a new one should be made before this. @@ -453,19 +438,18 @@ TEST(LlvmLibcBlockTest, AllocateNeedsAlignment) { // Ensure we the block is aligned and the size we expect. EXPECT_NE(next, static_cast(nullptr)); - EXPECT_TRUE(aligned_block->is_usable_space_aligned(kAlignment)); + EXPECT_TRUE(aligned_block->is_usable_space_aligned(alignment)); // Check the next block. 
EXPECT_NE(next, static_cast(nullptr)); EXPECT_EQ(aligned_block->next(), next); - EXPECT_EQ(reinterpret_cast(next) + next->outer_size(), - bytes.data() + bytes.size() - Block::BLOCK_OVERHEAD); + EXPECT_EQ(reinterpret_cast(next) + next->outer_size(), orig_end); } TEST(LlvmLibcBlockTest, PreviousBlockMergedIfNotFirst) { constexpr size_t kN = 1024; - alignas(kN) array bytes{}; + array bytes; auto result = Block::init(bytes); ASSERT_TRUE(result.has_value()); Block *block = *result; @@ -480,15 +464,12 @@ TEST(LlvmLibcBlockTest, PreviousBlockMergedIfNotFirst) { // Now pick an alignment such that the usable space is not already aligned to // it. We want to explicitly test that the block will split into one before // it. - constexpr size_t kAlignment = bit_ceil(Block::BLOCK_OVERHEAD) * 8; - ASSERT_FALSE(newblock->is_usable_space_aligned(kAlignment)); + size_t alignment = alignof(max_align_t); + while (newblock->is_usable_space_aligned(alignment)) + alignment += alignof(max_align_t); // Ensure we can allocate in the new block. - constexpr size_t kSize = Block::ALIGNMENT; - EXPECT_TRUE(newblock->can_allocate(kAlignment, kSize)); - - auto [aligned_block, prev, next] = - Block::allocate(newblock, kAlignment, kSize); + auto [aligned_block, prev, next] = Block::allocate(newblock, alignment, 1); // Now there should be no new previous block. Instead, the padding we did // create should be merged into the original previous block. @@ -505,26 +486,26 @@ TEST(LlvmLibcBlockTest, CanRemergeBlockAllocations) { // This is the same setup as with the `AllocateNeedsAlignment` test case. constexpr size_t kN = 1024; - alignas(kN) array bytes{}; + array bytes; auto result = Block::init(bytes); ASSERT_TRUE(result.has_value()); Block *block = *result; + + Block *orig_block = block; + size_t orig_size = orig_block->outer_size(); + Block *last = block->next(); - // Ensure first the usable_data is only aligned to the block alignment. 
- ASSERT_EQ(block->usable_space(), bytes.data() + Block::BLOCK_OVERHEAD); ASSERT_EQ(block->prev_free(), static_cast(nullptr)); // Now pick an alignment such that the usable space is not already aligned to // it. We want to explicitly test that the block will split into one before // it. - constexpr size_t kAlignment = bit_ceil(Block::BLOCK_OVERHEAD) * 8; - ASSERT_FALSE(block->is_usable_space_aligned(kAlignment)); - - constexpr size_t kSize = Block::ALIGNMENT; - EXPECT_TRUE(block->can_allocate(kAlignment, kSize)); + size_t alignment = alignof(max_align_t); + while (block->is_usable_space_aligned(alignment)) + alignment += alignof(max_align_t); - auto [aligned_block, prev, next] = Block::allocate(block, kAlignment, kSize); + auto [aligned_block, prev, next] = Block::allocate(block, alignment, 1); // Check we have the appropriate blocks. ASSERT_NE(prev, static_cast(nullptr)); @@ -540,8 +521,6 @@ TEST(LlvmLibcBlockTest, CanRemergeBlockAllocations) { EXPECT_EQ(prev->next(), last); // We should have the original buffer. 
- EXPECT_EQ(reinterpret_cast(prev), &*bytes.begin()); - EXPECT_EQ(prev->outer_size(), bytes.size() - Block::BLOCK_OVERHEAD); - EXPECT_EQ(reinterpret_cast(prev) + prev->outer_size(), - &*bytes.end() - Block::BLOCK_OVERHEAD); + EXPECT_EQ(prev, orig_block); + EXPECT_EQ(prev->outer_size(), orig_size); } diff --git a/libc/test/src/__support/freelist_heap_test.cpp b/libc/test/src/__support/freelist_heap_test.cpp index 991c158825a88..0623272dd5b9f 100644 --- a/libc/test/src/__support/freelist_heap_test.cpp +++ b/libc/test/src/__support/freelist_heap_test.cpp @@ -42,7 +42,7 @@ using LIBC_NAMESPACE::cpp::span; void RunTest(FreeListHeap &allocator, [[maybe_unused]] size_t N); \ }; \ TEST_F(LlvmLibcFreeListHeapTest##TestCase, TestCase) { \ - alignas(Block) byte buf[BufferSize] = {byte(0)}; \ + byte buf[BufferSize] = {byte(0)}; \ FreeListHeap allocator(buf); \ RunTest(allocator, BufferSize); \ RunTest(*freelist_heap, freelist_heap->region().size()); \ @@ -95,30 +95,31 @@ TEST_FOR_EACH_ALLOCATOR(ReturnsNullWhenAllocationTooLarge, 2048) { // is used for other test cases and we don't explicitly free them. TEST(LlvmLibcFreeListHeap, ReturnsNullWhenFull) { constexpr size_t N = 2048; - alignas(Block) byte buf[N] = {byte(0)}; + byte buf[N]; FreeListHeap allocator(buf); - // Use aligned_allocate so we don't need to worry about ensuring the `buf` - // being aligned to max_align_t. 
- EXPECT_NE(allocator.aligned_allocate(1, N - 2 * Block::BLOCK_OVERHEAD), - static_cast(nullptr)); + bool went_null = false; + for (size_t i = 0; i < N; i++) { + if (!allocator.allocate(1)) { + went_null = true; + break; + } + } + EXPECT_TRUE(went_null); EXPECT_EQ(allocator.allocate(1), static_cast(nullptr)); } TEST_FOR_EACH_ALLOCATOR(ReturnedPointersAreAligned, 2048) { void *ptr1 = allocator.allocate(1); - // Should be aligned to native pointer alignment uintptr_t ptr1_start = reinterpret_cast(ptr1); - size_t alignment = alignof(void *); - - EXPECT_EQ(ptr1_start % alignment, static_cast(0)); + EXPECT_EQ(ptr1_start % alignof(max_align_t), static_cast(0)); void *ptr2 = allocator.allocate(1); uintptr_t ptr2_start = reinterpret_cast(ptr2); - EXPECT_EQ(ptr2_start % alignment, static_cast(0)); + EXPECT_EQ(ptr2_start % alignof(max_align_t), static_cast(0)); } TEST_FOR_EACH_ALLOCATOR(CanRealloc, 2048) { @@ -241,16 +242,14 @@ TEST_FOR_EACH_ALLOCATOR(AlignedAlloc, 2048) { // This test is not part of the TEST_FOR_EACH_ALLOCATOR since we want to // explicitly ensure that the buffer can still return aligned allocations even -// if the underlying buffer is at most aligned to the Block alignment. This -// is so we can check that we can still get aligned allocations even if the -// underlying buffer is not aligned to the alignments we request. -TEST(LlvmLibcFreeListHeap, AlignedAllocOnlyBlockAligned) { - constexpr size_t BUFFER_SIZE = 4096; - constexpr size_t BUFFER_ALIGNMENT = alignof(Block) * 2; - alignas(BUFFER_ALIGNMENT) byte buf[BUFFER_SIZE] = {byte(0)}; - - // Ensure the underlying buffer is at most aligned to the block type. - FreeListHeap allocator(span(buf).subspan(alignof(Block))); +// if the underlying buffer is unaligned. This is so we can check that we can +// still get aligned allocations even if the underlying buffer is not aligned to +// the alignments we request. 
+TEST(LlvmLibcFreeListHeap, AlignedAllocUnalignedBuffer) { + byte buf[4096] = {byte(0)}; + + // Ensure the underlying buffer is poorly aligned. + FreeListHeap allocator(span(buf).subspan(1)); constexpr size_t ALIGNMENTS[] = {1, 2, 4, 8, 16, 32, 64, 128, 256}; constexpr size_t SIZE_SCALES[] = {1, 2, 3, 4, 5}; diff --git a/libc/test/src/__support/freestore_test.cpp b/libc/test/src/__support/freestore_test.cpp index 7960d32c8bbf0..39292b6a1211b 100644 --- a/libc/test/src/__support/freestore_test.cpp +++ b/libc/test/src/__support/freestore_test.cpp @@ -24,8 +24,12 @@ TEST(LlvmLibcFreeStore, TooSmall) { optional maybeBlock = Block::init(mem); ASSERT_TRUE(maybeBlock.has_value()); Block *too_small = *maybeBlock; - maybeBlock = too_small->split(sizeof(size_t)); + maybeBlock = too_small->split(Block::PREV_FIELD_SIZE); ASSERT_TRUE(maybeBlock.has_value()); + // On platforms with high alignment the smallest legal block may be large + // enough for a node. + if (too_small->outer_size() >= sizeof(Block) + sizeof(FreeList::Node)) + return; Block *remainder = *maybeBlock; FreeStore store; @@ -43,12 +47,12 @@ TEST(LlvmLibcFreeStore, RemoveBestFit) { ASSERT_TRUE(maybeBlock.has_value()); Block *smallest = *maybeBlock; - maybeBlock = smallest->split(sizeof(FreeList::Node) + sizeof(size_t)); + maybeBlock = smallest->split(sizeof(FreeList::Node) + Block::PREV_FIELD_SIZE); ASSERT_TRUE(maybeBlock.has_value()); Block *largest_small = *maybeBlock; - maybeBlock = largest_small->split(sizeof(FreeTrie::Node) + sizeof(size_t) - - alignof(max_align_t)); + maybeBlock = largest_small->split( + sizeof(FreeTrie::Node) + Block::PREV_FIELD_SIZE - alignof(max_align_t)); ASSERT_TRUE(maybeBlock.has_value()); if (largest_small->inner_size() == smallest->inner_size()) largest_small = smallest; @@ -86,7 +90,7 @@ TEST(LlvmLibcFreeStore, Remove) { ASSERT_TRUE(maybeBlock.has_value()); Block *small = *maybeBlock; - maybeBlock = small->split(sizeof(FreeList::Node) + sizeof(size_t)); + maybeBlock = 
small->split(sizeof(FreeList::Node) + Block::PREV_FIELD_SIZE); ASSERT_TRUE(maybeBlock.has_value()); Block *remainder = *maybeBlock; diff --git a/libc/test/src/unistd/getopt_test.cpp b/libc/test/src/unistd/getopt_test.cpp index 8217f7bb6e731..1a31094e98fc8 100644 --- a/libc/test/src/unistd/getopt_test.cpp +++ b/libc/test/src/unistd/getopt_test.cpp @@ -79,7 +79,7 @@ struct LlvmLibcGetoptTest : public LIBC_NAMESPACE::testing::Test { // This is safe because getopt doesn't currently permute argv like GNU's getopt // does so this just helps silence warnings. -char *operator"" _c(const char *c, size_t) { return const_cast(c); } +char *operator""_c(const char *c, size_t) { return const_cast(c); } TEST_F(LlvmLibcGetoptTest, NoMatch) { array argv{"prog"_c, "arg1"_c, nullptr}; diff --git a/libc/utils/docgen/sys/stat.yaml b/libc/utils/docgen/sys/stat.yaml new file mode 100644 index 0000000000000..86dc84a1e06d2 --- /dev/null +++ b/libc/utils/docgen/sys/stat.yaml @@ -0,0 +1,118 @@ +macros: + S_IFMT: + in-latest-posix: '' + S_IFBLK: + in-latest-posix: '' + S_IFCHR: + in-latest-posix: '' + S_IFIFO: + in-latest-posix: '' + S_IFREG: + in-latest-posix: '' + S_IFDIR: + in-latest-posix: '' + S_IFLNK: + in-latest-posix: '' + S_IFSOCK: + in-latest-posix: '' + st_atime: + in-latest-posix: '' + st_ctime: + in-latest-posix: '' + st_mtime: + in-latest-posix: '' + UTIME_NOW: + in-latest-posix: '' + UTIME_OMIT: + in-latest-posix: '' + + S_IRWXU: + in-latest-posix: '' + S_IRUSR: + in-latest-posix: '' + S_IWUSR: + in-latest-posix: '' + S_IXUSR: + in-latest-posix: '' + S_IRWXG: + in-latest-posix: '' + S_IRGRP: + in-latest-posix: '' + S_IWGRP: + in-latest-posix: '' + S_IXGRP: + in-latest-posix: '' + + S_IRWXO: + in-latest-posix: '' + S_IROTH: + in-latest-posix: '' + S_IWOTH: + in-latest-posix: '' + S_IXOTH: + in-latest-posix: '' + S_ISUID: + in-latest-posix: '' + S_ISGID: + in-latest-posix: '' + S_ISVTX: + in-latest-posix: '' + + S_ISBLK: + in-latest-posix: '' + S_ISCHR: + in-latest-posix: '' + 
S_ISDIR: + in-latest-posix: '' + S_ISFIFO: + in-latest-posix: '' + S_ISREG: + in-latest-posix: '' + S_ISLNK: + in-latest-posix: '' + S_ISSOCK: + in-latest-posix: '' + + S_TYPEISMQ: + in-latest-posix: '' + S_TYPEISSEM: + in-latest-posix: '' + S_TYPEISSHM: + in-latest-posix: '' + + S_TYPEISTMO: + in-latest-posix: '' + +functions: + chmod: + in-latest-posix: '' + fchmod: + in-latest-posix: '' + fchmodat: + in-latest-posix: '' + fstat: + in-latest-posix: '' + fstatat: + in-latest-posix: '' + futimens: + in-latest-posix: '' + lstat: + in-latest-posix: '' + mkdir: + in-latest-posix: '' + mkdirat: + in-latest-posix: '' + mkfifo: + in-latest-posix: '' + mkfifoat: + in-latest-posix: '' + mknod: + in-latest-posix: '' + mknodat: + in-latest-posix: '' + stat: + in-latest-posix: '' + umask: + in-latest-posix: '' + utimensat: + in-latest-posix: '' \ No newline at end of file diff --git a/libc/utils/docgen/sys/time.yaml b/libc/utils/docgen/sys/time.yaml new file mode 100644 index 0000000000000..a1d19c3fc636c --- /dev/null +++ b/libc/utils/docgen/sys/time.yaml @@ -0,0 +1,5 @@ +functions: + select: + in-latest-posix: '' + utimes: + in-latest-posix: '' \ No newline at end of file diff --git a/libc/utils/docgen/sys/wait.yaml b/libc/utils/docgen/sys/wait.yaml new file mode 100644 index 0000000000000..91d67ad4a358b --- /dev/null +++ b/libc/utils/docgen/sys/wait.yaml @@ -0,0 +1,37 @@ +functions: + wait: + in-latest-posix: '' + waitid: + in-latest-posix: '' + waitpid: + in-latest-posix: '' + +macros: + WCONTINUED: + in-latest-posix: '' + WEXITED: + in-latest-posix: '' + WEXITSTATUS: + in-latest-posix: '' + WIFCONTINUED: + in-latest-posix: '' + WIFEXITED: + in-latest-posix: '' + WIFSIGNALED: + in-latest-posix: '' + WIFSTOPPED: + in-latest-posix: '' + WNOHANG: + in-latest-posix: '' + WNOWAIT: + in-latest-posix: '' + WSTOPPED: + in-latest-posix: '' + WSTOPSIG: + in-latest-posix: '' + WTERMSIG: + in-latest-posix: '' + WUNTRACED: + in-latest-posix: '' + WCORE_DUMPED: + in-latest-posix: '' \ 
No newline at end of file diff --git a/libc/utils/docgen/termios.yaml b/libc/utils/docgen/termios.yaml new file mode 100644 index 0000000000000..81dd8da9f240c --- /dev/null +++ b/libc/utils/docgen/termios.yaml @@ -0,0 +1,243 @@ +macros: + NCCS: + in-latest-posix: '' + + VEOF: + in-latest-posix: '' + VEOL: + in-latest-posix: '' + VERASE: + in-latest-posix: '' + VINTR: + in-latest-posix: '' + VKILL: + in-latest-posix: '' + VMIN: + in-latest-posix: '' + VQUIT: + in-latest-posix: '' + VSTART: + in-latest-posix: '' + VSTOP: + in-latest-posix: '' + VSUSP: + in-latest-posix: '' + VTIME: + in-latest-posix: '' + + BRKINT: + in-latest-posix: '' + ICRNL: + in-latest-posix: '' + IGNBRK: + in-latest-posix: '' + IGNCR: + in-latest-posix: '' + IGNPAR: + in-latest-posix: '' + INLCR: + in-latest-posix: '' + INPCK: + in-latest-posix: '' + ISTRIP: + in-latest-posix: '' + IXANY: + in-latest-posix: '' + IXOFF: + in-latest-posix: '' + IXON: + in-latest-posix: '' + PARMRK: + in-latest-posix: '' + + OPOST: + in-latest-posix: '' + ONLCR: + in-latest-posix: '' + OCRNL: + in-latest-posix: '' + ONOCR: + in-latest-posix: '' + ONLRET: + in-latest-posix: '' + OFDEL: + in-latest-posix: '' + OFILL: + in-latest-posix: '' + + NLDLY: + in-latest-posix: '' + NL0: + in-latest-posix: '' + NL1: + in-latest-posix: '' + + CRDLY: + in-latest-posix: '' + CR0: + in-latest-posix: '' + CR1: + in-latest-posix: '' + CR2: + in-latest-posix: '' + CR3: + in-latest-posix: '' + + TABDLY: + in-latest-posix: '' + TAB0: + in-latest-posix: '' + TAB1: + in-latest-posix: '' + TAB2: + in-latest-posix: '' + TAB3: + in-latest-posix: '' + + BSDLY: + in-latest-posix: '' + BS0: + in-latest-posix: '' + BS1: + in-latest-posix: '' + + VTDLY: + in-latest-posix: '' + VT0: + in-latest-posix: '' + VT1: + in-latest-posix: '' + + FFDLY: + in-latest-posix: '' + FF0: + in-latest-posix: '' + FF1: + in-latest-posix: '' + + B0: + in-latest-posix: '' + B50: + in-latest-posix: '' + B75: + in-latest-posix: '' + B110: + in-latest-posix: '' + B134: 
+ in-latest-posix: '' + B150: + in-latest-posix: '' + B200: + in-latest-posix: '' + B300: + in-latest-posix: '' + B600: + in-latest-posix: '' + B1200: + in-latest-posix: '' + B1800: + in-latest-posix: '' + B2400: + in-latest-posix: '' + B4800: + in-latest-posix: '' + B9600: + in-latest-posix: '' + B19200: + in-latest-posix: '' + B38400: + in-latest-posix: '' + + CSIZE: + in-latest-posix: '' + CS5: + in-latest-posix: '' + CS6: + in-latest-posix: '' + CS7: + in-latest-posix: '' + CS8: + in-latest-posix: '' + + CSTOPB: + in-latest-posix: '' + CREAD: + in-latest-posix: '' + PARENB: + in-latest-posix: '' + PARODD: + in-latest-posix: '' + HUPCL: + in-latest-posix: '' + CLOCAL: + in-latest-posix: '' + + ECHO: + in-latest-posix: '' + ECHOE: + in-latest-posix: '' + ECHOK: + in-latest-posix: '' + ECHONL: + in-latest-posix: '' + ICANON: + in-latest-posix: '' + IEXTEN: + in-latest-posix: '' + ISIG: + in-latest-posix: '' + NOFLSH: + in-latest-posix: '' + TOSTOP: + in-latest-posix: '' + + TCSANOW: + in-latest-posix: '' + TCSADRAIN: + in-latest-posix: '' + TCSAFLUSH: + in-latest-posix: '' + + TCIFLUSH: + in-latest-posix: '' + TCIOFLUSH: + in-latest-posix: '' + TCOFLUSH: + in-latest-posix: '' + + TCIOFF: + in-latest-posix: '' + TCION: + in-latest-posix: '' + TCOOFF: + in-latest-posix: '' + TCOON: + in-latest-posix: '' + +functions: + cfgetispeed: + in-latest-posix: '' + cfgetospeed: + in-latest-posix: '' + cfsetispeed: + in-latest-posix: '' + cfsetospeed: + in-latest-posix: '' + tcdrain: + in-latest-posix: '' + tcflow: + in-latest-posix: '' + tcflush: + in-latest-posix: '' + tcgetattr: + in-latest-posix: '' + tcgetsid: + in-latest-posix: '' + tcgetwinsize: + in-latest-posix: '' + tcsendbreak: + in-latest-posix: '' + tcsetattr: + in-latest-posix: '' + tcsetwinsize: + in-latest-posix: '' \ No newline at end of file diff --git a/libc/utils/hdrgen/function.py b/libc/utils/hdrgen/function.py index d97df7f8a50ec..8ae47e574785f 100644 --- a/libc/utils/hdrgen/function.py +++ 
b/libc/utils/hdrgen/function.py @@ -22,7 +22,7 @@ def __init__( def __str__(self): attributes_str = " ".join(self.attributes) - arguments_str = ", ".join(self.arguments) + arguments_str = ", ".join(self.arguments) if self.arguments else "void" if attributes_str == "": result = f"{self.return_type} {self.name}({arguments_str})" else: diff --git a/libc/utils/hdrgen/tests/expected_output/test_header.h b/libc/utils/hdrgen/tests/expected_output/test_header.h index a777976134b04..d730078fba064 100644 --- a/libc/utils/hdrgen/tests/expected_output/test_header.h +++ b/libc/utils/hdrgen/tests/expected_output/test_header.h @@ -28,10 +28,10 @@ enum { __BEGIN_C_DECLS -CONST_FUNC_A void func_a() __NOEXCEPT; +CONST_FUNC_A void func_a(void) __NOEXCEPT; #ifdef LIBC_TYPES_HAS_FLOAT128 -float128 func_b() __NOEXCEPT; +float128 func_b(void) __NOEXCEPT; #endif // LIBC_TYPES_HAS_FLOAT128 #ifdef LIBC_TYPES_HAS_FLOAT16 diff --git a/libclc/clc/include/clc/clcmacro.h b/libclc/clc/include/clc/clcmacro.h index c6583749eca66..3c3a69f4f848b 100644 --- a/libclc/clc/include/clc/clcmacro.h +++ b/libclc/clc/include/clc/clcmacro.h @@ -102,29 +102,6 @@ FUNCTION(x.hi, y.hi, z.hi)); \ } -#define _CLC_V_S_S_V_VECTORIZE(DECLSPEC, RET_TYPE, FUNCTION, ARG1_TYPE, \ - ARG2_TYPE, ARG3_TYPE) \ - DECLSPEC RET_TYPE##2 FUNCTION(ARG1_TYPE x, ARG2_TYPE y, ARG3_TYPE##2 z) { \ - return (RET_TYPE##2)(FUNCTION(x, y, z.lo), FUNCTION(x, y, z.hi)); \ - } \ - \ - DECLSPEC RET_TYPE##3 FUNCTION(ARG1_TYPE x, ARG2_TYPE y, ARG3_TYPE##3 z) { \ - return (RET_TYPE##3)(FUNCTION(x, y, z.x), FUNCTION(x, y, z.y), \ - FUNCTION(x, y, z.z)); \ - } \ - \ - DECLSPEC RET_TYPE##4 FUNCTION(ARG1_TYPE x, ARG2_TYPE y, ARG3_TYPE##4 z) { \ - return (RET_TYPE##4)(FUNCTION(x, y, z.lo), FUNCTION(x, y, z.hi)); \ - } \ - \ - DECLSPEC RET_TYPE##8 FUNCTION(ARG1_TYPE x, ARG2_TYPE y, ARG3_TYPE##8 z) { \ - return (RET_TYPE##8)(FUNCTION(x, y, z.lo), FUNCTION(x, y, z.hi)); \ - } \ - \ - DECLSPEC RET_TYPE##16 FUNCTION(ARG1_TYPE x, ARG2_TYPE y, ARG3_TYPE##16 z) 
{ \ - return (RET_TYPE##16)(FUNCTION(x, y, z.lo), FUNCTION(x, y, z.hi)); \ - } - #define _CLC_V_V_VP_VECTORIZE(DECLSPEC, RET_TYPE, FUNCTION, ARG1_TYPE, \ ADDR_SPACE, ARG2_TYPE) \ DECLSPEC __CLC_XCONCAT(RET_TYPE, 2) \ diff --git a/libclc/clc/include/clc/common/clc_degrees.h b/libclc/clc/include/clc/common/clc_degrees.h new file mode 100644 index 0000000000000..e8bb684fcd4d7 --- /dev/null +++ b/libclc/clc/include/clc/common/clc_degrees.h @@ -0,0 +1,12 @@ +#ifndef __CLC_MATH_CLC_DEGREES_H__ +#define __CLC_MATH_CLC_DEGREES_H__ + +#define __CLC_BODY +#define __CLC_FUNCTION __clc_degrees + +#include + +#undef __CLC_BODY +#undef __CLC_FUNCTION + +#endif // __CLC_MATH_CLC_DEGREES_H__ diff --git a/libclc/clc/include/clc/common/clc_radians.h b/libclc/clc/include/clc/common/clc_radians.h new file mode 100644 index 0000000000000..80d481e8de723 --- /dev/null +++ b/libclc/clc/include/clc/common/clc_radians.h @@ -0,0 +1,12 @@ +#ifndef __CLC_MATH_CLC_RADIANS_H__ +#define __CLC_MATH_CLC_RADIANS_H__ + +#define __CLC_BODY +#define __CLC_FUNCTION __clc_radians + +#include + +#undef __CLC_BODY +#undef __CLC_FUNCTION + +#endif // __CLC_MATH_CLC_RADIANS_H__ diff --git a/libclc/clc/include/clc/common/clc_smoothstep.h b/libclc/clc/include/clc/common/clc_smoothstep.h new file mode 100644 index 0000000000000..fa212245e0794 --- /dev/null +++ b/libclc/clc/include/clc/common/clc_smoothstep.h @@ -0,0 +1,11 @@ +#ifndef __CLC_COMMON_CLC_SMOOTHSTEP_H__ +#define __CLC_COMMON_CLC_SMOOTHSTEP_H__ + +// note: Unlike OpenCL __clc_smoothstep is only defined for three matching +// argument types. 
+ +#define __CLC_BODY +#include +#undef __CLC_BODY + +#endif // __CLC_COMMON_CLC_SMOOTHSTEP_H__ diff --git a/libclc/clc/include/clc/common/clc_smoothstep.inc b/libclc/clc/include/clc/common/clc_smoothstep.inc new file mode 100644 index 0000000000000..3ce33c5573f6c --- /dev/null +++ b/libclc/clc/include/clc/common/clc_smoothstep.inc @@ -0,0 +1,3 @@ +_CLC_OVERLOAD _CLC_DECL __CLC_GENTYPE __clc_smoothstep(__CLC_GENTYPE edge0, + __CLC_GENTYPE edge1, + __CLC_GENTYPE x); diff --git a/libclc/clc/include/clc/shared/clc_clamp.h b/libclc/clc/include/clc/shared/clc_clamp.h index d9d39413c5618..7fd22771c09c0 100644 --- a/libclc/clc/include/clc/shared/clc_clamp.h +++ b/libclc/clc/include/clc/shared/clc_clamp.h @@ -1,17 +1,10 @@ #ifndef __CLC_SHARED_CLC_CLAMP_H__ #define __CLC_SHARED_CLC_CLAMP_H__ -#if defined(CLC_CLSPV) || defined(CLC_SPIRV) -// clspv and spir-v targets provide their own OpenCL-compatible clamp -#define __clc_clamp clamp -#else - #define __CLC_BODY #include #define __CLC_BODY #include -#endif - #endif // __CLC_SHARED_CLC_CLAMP_H__ diff --git a/libclc/clc/lib/clspv/SOURCES b/libclc/clc/lib/clspv/SOURCES index 393e8d773cda0..e6573f586080c 100644 --- a/libclc/clc/lib/clspv/SOURCES +++ b/libclc/clc/lib/clspv/SOURCES @@ -3,3 +3,4 @@ ../generic/math/clc_floor.cl ../generic/math/clc_rint.cl ../generic/math/clc_trunc.cl +../generic/shared/clc_clamp.cl diff --git a/libclc/clc/lib/generic/SOURCES b/libclc/clc/lib/generic/SOURCES index 3916ea15f5c45..d74bff20ba87b 100644 --- a/libclc/clc/lib/generic/SOURCES +++ b/libclc/clc/lib/generic/SOURCES @@ -1,3 +1,6 @@ +common/clc_degrees.cl +common/clc_radians.cl +common/clc_smoothstep.cl geometric/clc_dot.cl integer/clc_abs.cl integer/clc_abs_diff.cl diff --git a/libclc/clc/lib/generic/common/clc_degrees.cl b/libclc/clc/lib/generic/common/clc_degrees.cl new file mode 100644 index 0000000000000..ce705982072e8 --- /dev/null +++ b/libclc/clc/lib/generic/common/clc_degrees.cl @@ -0,0 +1,56 @@ +/* + * Copyright (c) 2014,2015 Advanced 
Micro Devices, Inc. + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN + * THE SOFTWARE. 
+ */ + +#include +#include + +#define DEGREES_SINGLE_DEF(TYPE, LITERAL) \ + _CLC_OVERLOAD _CLC_DEF TYPE __clc_degrees(TYPE radians) { \ + return (TYPE)LITERAL * radians; \ + } + +#define DEGREES_DEF(TYPE, LITERAL) \ + DEGREES_SINGLE_DEF(TYPE, LITERAL) \ + DEGREES_SINGLE_DEF(TYPE##2, LITERAL) \ + DEGREES_SINGLE_DEF(TYPE##3, LITERAL) \ + DEGREES_SINGLE_DEF(TYPE##4, LITERAL) \ + DEGREES_SINGLE_DEF(TYPE##8, LITERAL) \ + DEGREES_SINGLE_DEF(TYPE##16, LITERAL) + +// 180/pi = ~57.29577951308232087685 or 0x1.ca5dc1a63c1f8p+5 or 0x1.ca5dc2p+5F +DEGREES_DEF(float, 0x1.ca5dc2p+5F) + +#ifdef cl_khr_fp64 +#pragma OPENCL EXTENSION cl_khr_fp64 : enable + +// 180/pi = ~57.29577951308232087685 or 0x1.ca5dc1a63c1f8p+5 or 0x1.ca5dc2p+5F +DEGREES_DEF(double, 0x1.ca5dc1a63c1f8p+5) + +#endif + +#ifdef cl_khr_fp16 +#pragma OPENCL EXTENSION cl_khr_fp16 : enable + +// 180/pi = ~57.29577951308232087685 or 0x1.ca5dc1a63c1f8p+5 or 0x1.ca5dc2p+5F +DEGREES_DEF(half, (half)0x1.ca5dc1a63c1f8p+5) + +#endif diff --git a/libclc/clc/lib/generic/common/clc_radians.cl b/libclc/clc/lib/generic/common/clc_radians.cl new file mode 100644 index 0000000000000..850b8eb84f9da --- /dev/null +++ b/libclc/clc/lib/generic/common/clc_radians.cl @@ -0,0 +1,56 @@ +/* + * Copyright (c) 2014,2015 Advanced Micro Devices, Inc. + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. 
+ * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN + * THE SOFTWARE. + */ + +#include +#include + +#define RADIANS_SINGLE_DEF(TYPE, LITERAL) \ + _CLC_OVERLOAD _CLC_DEF TYPE __clc_radians(TYPE radians) { \ + return (TYPE)LITERAL * radians; \ + } + +#define RADIANS_DEF(TYPE, LITERAL) \ + RADIANS_SINGLE_DEF(TYPE, LITERAL) \ + RADIANS_SINGLE_DEF(TYPE##2, LITERAL) \ + RADIANS_SINGLE_DEF(TYPE##3, LITERAL) \ + RADIANS_SINGLE_DEF(TYPE##4, LITERAL) \ + RADIANS_SINGLE_DEF(TYPE##8, LITERAL) \ + RADIANS_SINGLE_DEF(TYPE##16, LITERAL) + +// pi/180 = ~0.01745329251994329577 or 0x1.1df46a2529d39p-6 or 0x1.1df46ap-6F +RADIANS_DEF(float, 0x1.1df46ap-6F) + +#ifdef cl_khr_fp64 +#pragma OPENCL EXTENSION cl_khr_fp64 : enable + +// pi/180 = ~0.01745329251994329577 or 0x1.1df46a2529d39p-6 or 0x1.1df46ap-6F +RADIANS_DEF(double, 0x1.1df46a2529d39p-6) + +#endif + +#ifdef cl_khr_fp16 +#pragma OPENCL EXTENSION cl_khr_fp16 : enable + +// pi/180 = ~0.01745329251994329577 or 0x1.1df46a2529d39p-6 or 0x1.1df46ap-6F +RADIANS_DEF(half, (half)0x1.1df46a2529d39p-6) + +#endif diff --git a/libclc/clc/lib/generic/common/clc_smoothstep.cl b/libclc/clc/lib/generic/common/clc_smoothstep.cl new file mode 100644 index 0000000000000..ea0e9ed3bb19c --- /dev/null +++ b/libclc/clc/lib/generic/common/clc_smoothstep.cl @@ -0,0 +1,52 @@ +/* + * Copyright (c) 2014,2015 Advanced Micro Devices, Inc. 
+ * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN + * THE SOFTWARE. 
+ */ +#include +#include +#include + +#define SMOOTHSTEP_SINGLE_DEF(edge_type, x_type, lit_suff) \ + _CLC_OVERLOAD _CLC_DEF x_type __clc_smoothstep(edge_type edge0, \ + edge_type edge1, x_type x) { \ + x_type t = __clc_clamp((x - edge0) / (edge1 - edge0), 0.0##lit_suff, \ + 1.0##lit_suff); \ + return t * t * (3.0##lit_suff - 2.0##lit_suff * t); \ + } + +#define SMOOTHSTEP_DEF(type, lit_suffix) \ + SMOOTHSTEP_SINGLE_DEF(type, type, lit_suffix) \ + SMOOTHSTEP_SINGLE_DEF(type##2, type##2, lit_suffix) \ + SMOOTHSTEP_SINGLE_DEF(type##3, type##3, lit_suffix) \ + SMOOTHSTEP_SINGLE_DEF(type##4, type##4, lit_suffix) \ + SMOOTHSTEP_SINGLE_DEF(type##8, type##8, lit_suffix) \ + SMOOTHSTEP_SINGLE_DEF(type##16, type##16, lit_suffix) + +SMOOTHSTEP_DEF(float, F) + +#ifdef cl_khr_fp64 +#pragma OPENCL EXTENSION cl_khr_fp64 : enable +SMOOTHSTEP_DEF(double, ); +#endif + +#ifdef cl_khr_fp16 +#pragma OPENCL EXTENSION cl_khr_fp16 : enable +SMOOTHSTEP_DEF(half, H); +#endif diff --git a/libclc/clc/lib/spirv/SOURCES b/libclc/clc/lib/spirv/SOURCES index 3b29fa0a91624..ac855ea5184ed 100644 --- a/libclc/clc/lib/spirv/SOURCES +++ b/libclc/clc/lib/spirv/SOURCES @@ -1,6 +1,10 @@ +../generic/common/clc_degrees.cl +../generic/common/clc_radians.cl +../generic/common/clc_smoothstep.cl ../generic/geometric/clc_dot.cl ../generic/math/clc_ceil.cl ../generic/math/clc_fabs.cl ../generic/math/clc_floor.cl ../generic/math/clc_rint.cl ../generic/math/clc_trunc.cl +../generic/shared/clc_clamp.cl diff --git a/libclc/clc/lib/spirv64/SOURCES b/libclc/clc/lib/spirv64/SOURCES index 3b29fa0a91624..ac855ea5184ed 100644 --- a/libclc/clc/lib/spirv64/SOURCES +++ b/libclc/clc/lib/spirv64/SOURCES @@ -1,6 +1,10 @@ +../generic/common/clc_degrees.cl +../generic/common/clc_radians.cl +../generic/common/clc_smoothstep.cl ../generic/geometric/clc_dot.cl ../generic/math/clc_ceil.cl ../generic/math/clc_fabs.cl ../generic/math/clc_floor.cl ../generic/math/clc_rint.cl ../generic/math/clc_trunc.cl +../generic/shared/clc_clamp.cl 
diff --git a/libclc/generic/lib/common/degrees.cl b/libclc/generic/lib/common/degrees.cl index cf49b190c76b3..a9715d64f507a 100644 --- a/libclc/generic/lib/common/degrees.cl +++ b/libclc/generic/lib/common/degrees.cl @@ -22,23 +22,20 @@ #include #include +#include -_CLC_OVERLOAD _CLC_DEF float degrees(float radians) { - // 180/pi = ~57.29577951308232087685 or 0x1.ca5dc1a63c1f8p+5 or 0x1.ca5dc2p+5F - return 0x1.ca5dc2p+5F * radians; -} - -_CLC_UNARY_VECTORIZE(_CLC_OVERLOAD _CLC_DEF, float, degrees, float); - +_CLC_DEFINE_UNARY_BUILTIN(float, degrees, __clc_degrees, float) #ifdef cl_khr_fp64 #pragma OPENCL EXTENSION cl_khr_fp64 : enable -_CLC_OVERLOAD _CLC_DEF double degrees(double radians) { - // 180/pi = ~57.29577951308232087685 or 0x1.ca5dc1a63c1f8p+5 or 0x1.ca5dc2p+5F - return 0x1.ca5dc1a63c1f8p+5 * radians; -} +_CLC_DEFINE_UNARY_BUILTIN(double, degrees, __clc_degrees, double) + +#endif + +#ifdef cl_khr_fp16 +#pragma OPENCL EXTENSION cl_khr_fp16 : enable -_CLC_UNARY_VECTORIZE(_CLC_OVERLOAD _CLC_DEF, double, degrees, double); +_CLC_DEFINE_UNARY_BUILTIN(half, degrees, __clc_degrees, half) #endif diff --git a/libclc/generic/lib/common/radians.cl b/libclc/generic/lib/common/radians.cl index 645a30549afed..b5dcbfe6e3fd2 100644 --- a/libclc/generic/lib/common/radians.cl +++ b/libclc/generic/lib/common/radians.cl @@ -22,23 +22,20 @@ #include #include +#include -_CLC_OVERLOAD _CLC_DEF float radians(float degrees) { - // pi/180 = ~0.01745329251994329577 or 0x1.1df46a2529d39p-6 or 0x1.1df46ap-6F - return 0x1.1df46ap-6F * degrees; -} - -_CLC_UNARY_VECTORIZE(_CLC_OVERLOAD _CLC_DEF, float, radians, float); - +_CLC_DEFINE_UNARY_BUILTIN(float, radians, __clc_radians, float) #ifdef cl_khr_fp64 #pragma OPENCL EXTENSION cl_khr_fp64 : enable -_CLC_OVERLOAD _CLC_DEF double radians(double degrees) { - // pi/180 = ~0.01745329251994329577 or 0x1.1df46a2529d39p-6 or 0x1.1df46ap-6F - return 0x1.1df46a2529d39p-6 * degrees; -} +_CLC_DEFINE_UNARY_BUILTIN(double, radians, __clc_radians, 
double) + +#endif + +#ifdef cl_khr_fp16 +#pragma OPENCL EXTENSION cl_khr_fp16 : enable -_CLC_UNARY_VECTORIZE(_CLC_OVERLOAD _CLC_DEF, double, radians, double); +_CLC_DEFINE_UNARY_BUILTIN(half, radians, __clc_radians, half) #endif diff --git a/libclc/generic/lib/common/smoothstep.cl b/libclc/generic/lib/common/smoothstep.cl index 4cdecfc4abe26..78d62044f439b 100644 --- a/libclc/generic/lib/common/smoothstep.cl +++ b/libclc/generic/lib/common/smoothstep.cl @@ -22,35 +22,61 @@ #include #include +#include -_CLC_OVERLOAD _CLC_DEF float smoothstep(float edge0, float edge1, float x) { - float t = clamp((x - edge0) / (edge1 - edge0), 0.0f, 1.0f); - return t * t * (3.0f - 2.0f * t); -} +#define SMOOTHSTEP_SINGLE_DEF(X_TYPE) \ + _CLC_OVERLOAD _CLC_DEF X_TYPE smoothstep(X_TYPE edge0, X_TYPE edge1, \ + X_TYPE x) { \ + return __clc_smoothstep(edge0, edge1, x); \ + } -_CLC_TERNARY_VECTORIZE(_CLC_OVERLOAD _CLC_DEF, float, smoothstep, float, float, float); +#define SMOOTHSTEP_S_S_V_DEFS(X_TYPE) \ + _CLC_OVERLOAD _CLC_DEF X_TYPE##2 smoothstep(X_TYPE x, X_TYPE y, \ + X_TYPE##2 z) { \ + return __clc_smoothstep((X_TYPE##2)x, (X_TYPE##2)y, z); \ + } \ + \ + _CLC_OVERLOAD _CLC_DEF X_TYPE##3 smoothstep(X_TYPE x, X_TYPE y, \ + X_TYPE##3 z) { \ + return __clc_smoothstep((X_TYPE##3)x, (X_TYPE##3)y, z); \ + } \ + \ + _CLC_OVERLOAD _CLC_DEF X_TYPE##4 smoothstep(X_TYPE x, X_TYPE y, \ + X_TYPE##4 z) { \ + return __clc_smoothstep((X_TYPE##4)x, (X_TYPE##4)y, z); \ + } \ + \ + _CLC_OVERLOAD _CLC_DEF X_TYPE##8 smoothstep(X_TYPE x, X_TYPE y, \ + X_TYPE##8 z) { \ + return __clc_smoothstep((X_TYPE##8)x, (X_TYPE##8)y, z); \ + } \ + \ + _CLC_OVERLOAD _CLC_DEF X_TYPE##16 smoothstep(X_TYPE x, X_TYPE y, \ + X_TYPE##16 z) { \ + return __clc_smoothstep((X_TYPE##16)x, (X_TYPE##16)y, z); \ + } -_CLC_V_S_S_V_VECTORIZE(_CLC_OVERLOAD _CLC_DEF, float, smoothstep, float, float, float); +#define SMOOTHSTEP_DEF(type) \ + SMOOTHSTEP_SINGLE_DEF(type) \ + SMOOTHSTEP_SINGLE_DEF(type##2) \ + SMOOTHSTEP_SINGLE_DEF(type##3) 
\ + SMOOTHSTEP_SINGLE_DEF(type##4) \ + SMOOTHSTEP_SINGLE_DEF(type##8) \ + SMOOTHSTEP_SINGLE_DEF(type##16) \ + SMOOTHSTEP_S_S_V_DEFS(type) + +SMOOTHSTEP_DEF(float) #ifdef cl_khr_fp64 #pragma OPENCL EXTENSION cl_khr_fp64 : enable -#define SMOOTH_STEP_DEF(edge_type, x_type, impl) \ - _CLC_OVERLOAD _CLC_DEF x_type smoothstep(edge_type edge0, edge_type edge1, x_type x) { \ - double t = clamp((x - edge0) / (edge1 - edge0), 0.0, 1.0); \ - return t * t * (3.0 - 2.0 * t); \ - } - -SMOOTH_STEP_DEF(double, double, SMOOTH_STEP_IMPL_D); +SMOOTHSTEP_DEF(double); -_CLC_TERNARY_VECTORIZE(_CLC_OVERLOAD _CLC_DEF, double, smoothstep, double, double, double); +#endif -#if !defined(CLC_SPIRV) -SMOOTH_STEP_DEF(float, double, SMOOTH_STEP_IMPL_D); -SMOOTH_STEP_DEF(double, float, SMOOTH_STEP_IMPL_D); +#ifdef cl_khr_fp16 +#pragma OPENCL EXTENSION cl_khr_fp16 : enable -_CLC_V_S_S_V_VECTORIZE(_CLC_OVERLOAD _CLC_DEF, double, smoothstep, float, float, double); -_CLC_V_S_S_V_VECTORIZE(_CLC_OVERLOAD _CLC_DEF, float, smoothstep, double, double, float); -#endif +SMOOTHSTEP_DEF(half); #endif diff --git a/libcxx/docs/ReleaseNotes/20.rst b/libcxx/docs/ReleaseNotes/20.rst index be330a9afc331..2736061544c53 100644 --- a/libcxx/docs/ReleaseNotes/20.rst +++ b/libcxx/docs/ReleaseNotes/20.rst @@ -55,7 +55,8 @@ Improvements and New Features - The ``_LIBCPP_ENABLE_CXX20_REMOVED_TEMPORARY_BUFFER`` macro has been added to make ``std::get_temporary_buffer`` and ``std::return_temporary_buffer`` available. -- The ``_LIBCPP_ENABLE_CXX20_REMOVED_UNCAUGHT_EXCEPTION`` macro has been added to make ``std::uncaught_exception`` +- The ``std::uncaught_exception`` function was marked as deprecated since C++17 and removed since C++20. The + ``_LIBCPP_ENABLE_CXX20_REMOVED_UNCAUGHT_EXCEPTION`` macro has been added to make ``std::uncaught_exception`` available in C++20 and later modes. 
- The internal structure ``__compressed_pair`` has been replaced with ``[[no_unique_address]]``, resulting in reduced @@ -69,12 +70,12 @@ Improvements and New Features - The ``_LIBCPP_ABI_BOUNDED_ITERATORS_IN_STD_ARRAY`` ABI configuration was added, which allows storing valid bounds in ``std::array::iterator`` and detecting OOB accesses when the appropriate hardening mode is enabled. -- The ``input_iterator``-pair overload of ``void assign(InputIt, InputIt)`` has been optimized for ``std::vector``, - resulting in a performance improvement of up to 2x for trivial element types (e.g., ``std::vector``), and up +- The ``input_iterator``-pair overload of ``void assign(InputIt, InputIt)`` has been optimized for ``std::vector``, + resulting in a performance improvement of up to 2x for trivial element types (e.g., ``std::vector``), and up to 3.4x for non-trivial element types (e.g., ``std::vector>``). -- The ``input_iterator``-pair overload of ``iterator insert(const_iterator, InputIt, InputIt)`` has been optimized - for ``std::vector``, resulting in a performance improvement of up to 10x for ``std::vector``, and up to 2.3x +- The ``input_iterator``-pair overload of ``iterator insert(const_iterator, InputIt, InputIt)`` has been optimized + for ``std::vector``, resulting in a performance improvement of up to 10x for ``std::vector``, and up to 2.3x for ``std::vector>``. - On Windows, ````'s ``std::system_category`` is now distinct from ``std::generic_category``. 
The behavior diff --git a/libcxx/include/__config b/libcxx/include/__config index 658a7e16fae91..5d5c90d7b87a7 100644 --- a/libcxx/include/__config +++ b/libcxx/include/__config @@ -1166,9 +1166,7 @@ typedef __char32_t char32_t; # define _LIBCPP_NOESCAPE # endif -// FIXME: Expand this to [[__gnu__::__nodebug__]] again once the testcase reported in -// https://github.com/llvm/llvm-project/pull/118710 has been analyzed -# define _LIBCPP_NODEBUG +# define _LIBCPP_NODEBUG [[__gnu__::__nodebug__]] # if __has_attribute(__standalone_debug__) # define _LIBCPP_STANDALONE_DEBUG __attribute__((__standalone_debug__)) diff --git a/libcxx/include/__flat_map/key_value_iterator.h b/libcxx/include/__flat_map/key_value_iterator.h index 06a23f3429974..3ebb653deb197 100644 --- a/libcxx/include/__flat_map/key_value_iterator.h +++ b/libcxx/include/__flat_map/key_value_iterator.h @@ -15,9 +15,7 @@ #include <__config> #include <__iterator/iterator_traits.h> #include <__memory/addressof.h> -#include <__ranges/access.h> #include <__type_traits/conditional.h> -#include <__type_traits/maybe_const.h> #include <__utility/move.h> #include <__utility/pair.h> @@ -41,9 +39,10 @@ _LIBCPP_BEGIN_NAMESPACE_STD template struct __key_value_iterator { private: - using __key_iterator _LIBCPP_NODEBUG = ranges::iterator_t; - using __mapped_iterator _LIBCPP_NODEBUG = ranges::iterator_t<__maybe_const<_Const, _MappedContainer>>; - using __reference _LIBCPP_NODEBUG = _If<_Const, typename _Owner::const_reference, typename _Owner::reference>; + using __key_iterator _LIBCPP_NODEBUG = typename _KeyContainer::const_iterator; + using __mapped_iterator _LIBCPP_NODEBUG = + _If<_Const, typename _MappedContainer::const_iterator, typename _MappedContainer::iterator>; + using __reference _LIBCPP_NODEBUG = _If<_Const, typename _Owner::const_reference, typename _Owner::reference>; struct __arrow_proxy { __reference __ref_; @@ -71,8 +70,8 @@ struct __key_value_iterator { _LIBCPP_HIDE_FROM_ABI __key_value_iterator() = default; 
_LIBCPP_HIDE_FROM_ABI __key_value_iterator(__key_value_iterator<_Owner, _KeyContainer, _MappedContainer, !_Const> __i) - requires _Const && convertible_to, __key_iterator> && - convertible_to, __mapped_iterator> + requires _Const && convertible_to && + convertible_to : __key_iter_(std::move(__i.__key_iter_)), __mapped_iter_(std::move(__i.__mapped_iter_)) {} _LIBCPP_HIDE_FROM_ABI __key_value_iterator(__key_iterator __key_iter, __mapped_iterator __mapped_iter) diff --git a/libcxx/include/__vector/vector.h b/libcxx/include/__vector/vector.h index 3a7ae53178596..28e9495a314a2 100644 --- a/libcxx/include/__vector/vector.h +++ b/libcxx/include/__vector/vector.h @@ -51,6 +51,7 @@ #include <__type_traits/is_constructible.h> #include <__type_traits/is_nothrow_assignable.h> #include <__type_traits/is_nothrow_constructible.h> +#include <__type_traits/is_pointer.h> #include <__type_traits/is_same.h> #include <__type_traits/is_trivially_relocatable.h> #include <__type_traits/type_identity.h> @@ -341,13 +342,17 @@ class _LIBCPP_TEMPLATE_VIS vector { // // Iterators // - _LIBCPP_CONSTEXPR_SINCE_CXX20 _LIBCPP_HIDE_FROM_ABI iterator begin() _NOEXCEPT { return __make_iter(this->__begin_); } + _LIBCPP_CONSTEXPR_SINCE_CXX20 _LIBCPP_HIDE_FROM_ABI iterator begin() _NOEXCEPT { + return __make_iter(__add_alignment_assumption(this->__begin_)); + } _LIBCPP_CONSTEXPR_SINCE_CXX20 _LIBCPP_HIDE_FROM_ABI const_iterator begin() const _NOEXCEPT { - return __make_iter(this->__begin_); + return __make_iter(__add_alignment_assumption(this->__begin_)); + } + _LIBCPP_CONSTEXPR_SINCE_CXX20 _LIBCPP_HIDE_FROM_ABI iterator end() _NOEXCEPT { + return __make_iter(__add_alignment_assumption(this->__end_)); } - _LIBCPP_CONSTEXPR_SINCE_CXX20 _LIBCPP_HIDE_FROM_ABI iterator end() _NOEXCEPT { return __make_iter(this->__end_); } _LIBCPP_CONSTEXPR_SINCE_CXX20 _LIBCPP_HIDE_FROM_ABI const_iterator end() const _NOEXCEPT { - return __make_iter(this->__end_); + return 
__make_iter(__add_alignment_assumption(this->__end_)); } _LIBCPP_CONSTEXPR_SINCE_CXX20 _LIBCPP_HIDE_FROM_ABI reverse_iterator rbegin() _NOEXCEPT { @@ -775,6 +780,17 @@ class _LIBCPP_TEMPLATE_VIS vector { } _LIBCPP_CONSTEXPR_SINCE_CXX20 _LIBCPP_HIDE_FROM_ABI void __move_assign_alloc(vector&, false_type) _NOEXCEPT {} + + static _LIBCPP_CONSTEXPR_SINCE_CXX20 _LIBCPP_HIDE_FROM_ABI pointer __add_alignment_assumption(pointer __p) _NOEXCEPT { +#ifndef _LIBCPP_CXX03_LANG + if constexpr (is_pointer::value) { + if (!__libcpp_is_constant_evaluated()) { + return static_cast(__builtin_assume_aligned(__p, alignof(decltype(*__p)))); + } + } +#endif + return __p; + } }; #if _LIBCPP_STD_VER >= 17 diff --git a/libcxx/include/atomic b/libcxx/include/atomic index 80f9e437bfaab..75af5de33ca4c 100644 --- a/libcxx/include/atomic +++ b/libcxx/include/atomic @@ -592,6 +592,16 @@ template #else # include <__config> +# if defined(_LIBCPP_STDATOMIC_H) || defined(kill_dependency) || defined(atomic_load) +# define _LIBCPP_STDATOMIC_H_HAS_DEFINITELY_BEEN_INCLUDED 1 +# else +# define _LIBCPP_STDATOMIC_H_HAS_DEFINITELY_BEEN_INCLUDED 0 +# endif + +# if _LIBCPP_STD_VER < 23 && _LIBCPP_STDATOMIC_H_HAS_DEFINITELY_BEEN_INCLUDED +# error is incompatible with before C++23. Please compile with -std=c++23. 
+# endif + # include <__atomic/aliases.h> # include <__atomic/atomic.h> # include <__atomic/atomic_flag.h> diff --git a/libcxx/include/stdatomic.h b/libcxx/include/stdatomic.h index a0b46e3b7bc17..2991030eee456 100644 --- a/libcxx/include/stdatomic.h +++ b/libcxx/include/stdatomic.h @@ -126,7 +126,7 @@ using std::atomic_signal_fence // see below # pragma GCC system_header # endif -# if defined(__cplusplus) +# if defined(__cplusplus) && _LIBCPP_STD_VER >= 23 # include # include @@ -231,13 +231,17 @@ using std::atomic_store_explicit _LIBCPP_USING_IF_EXISTS; using std::atomic_signal_fence _LIBCPP_USING_IF_EXISTS; using std::atomic_thread_fence _LIBCPP_USING_IF_EXISTS; -# else +# elif defined(_LIBCPP_COMPILER_CLANG_BASED) +// Before C++23, we include the next on the path to avoid hijacking +// the header. We do this because Clang has historically shipped a +// header that would be available in all Standard modes, and we don't want to +// break that use case. # if __has_include_next() # include_next # endif -# endif // defined(__cplusplus) +# endif // defined(__cplusplus) && _LIBCPP_STD_VER >= 23 #endif // defined(__cplusplus) && __cplusplus < 201103L && defined(_LIBCPP_USE_FROZEN_CXX03_HEADERS) #endif // _LIBCPP_STDATOMIC_H diff --git a/libcxx/test/benchmarks/containers/string.bench.cpp b/libcxx/test/benchmarks/containers/string.bench.cpp index f7da3e2da312b..0b62c87acf7a2 100644 --- a/libcxx/test/benchmarks/containers/string.bench.cpp +++ b/libcxx/test/benchmarks/containers/string.bench.cpp @@ -237,29 +237,6 @@ struct StringMove { static std::string name() { return "BM_StringMove" + Length::name(); } }; -template -struct StringResizeDefaultInit { - static void run(benchmark::State& state) { - constexpr bool opaque = Opaque{} == Opacity::Opaque; - constexpr int kNumStrings = 4 << 10; - size_t length = makeString(Length()).size(); - std::string strings[kNumStrings]; - while (state.KeepRunningBatch(kNumStrings)) { - state.PauseTiming(); - for (int i = 0; i < kNumStrings; 
++i) { - std::string().swap(strings[i]); - } - benchmark::DoNotOptimize(strings); - state.ResumeTiming(); - for (int i = 0; i < kNumStrings; ++i) { - strings[i].__resize_default_init(maybeOpaque(length, opaque)); - } - } - } - - static std::string name() { return "BM_StringResizeDefaultInit" + Length::name() + Opaque::name(); } -}; - template struct StringAssignStr { static void run(benchmark::State& state) { @@ -577,7 +554,6 @@ int main(int argc, char** argv) { makeCartesianProductBenchmark(); makeCartesianProductBenchmark(); makeCartesianProductBenchmark(); - makeCartesianProductBenchmark(); makeCartesianProductBenchmark(); makeCartesianProductBenchmark(); makeCartesianProductBenchmark(); diff --git a/libcxx/test/configs/stdlib-libstdc++.cfg.in b/libcxx/test/configs/stdlib-libstdc++.cfg.in index d89254ab47d6a..b9672f038a763 100644 --- a/libcxx/test/configs/stdlib-libstdc++.cfg.in +++ b/libcxx/test/configs/stdlib-libstdc++.cfg.in @@ -1,15 +1,15 @@ # # This testing configuration runs the test suite using the libstdc++ Standard library. # -# The additional '--param libstdcxx-install-prefix=', '--param libstdcxx-triple=' and -# '--param libstdcxx-version=' lit parameters must be provided when invoking lit for the +# The additional '--param libstdcxx_install_prefix=', '--param libstdcxx_triple=' and +# '--param libstdcxx_version=' lit parameters must be provided when invoking lit for the # configuration to find the appropriate headers and library. 
# # For example: # -# $ ./libcxx/utils/libcxx-lit -sv libcxx/test/std --param libstdcxx-install-prefix=/opt/homebrew/Cellar/gcc/14.1.0_1 \ -# --param libstdcxx-version=14 \ -# --param libstdcxx-triple=aarch64-apple-darwin22 +# $ ./libcxx/utils/libcxx-lit -sv libcxx/test/std --param libstdcxx_install_prefix=/opt/homebrew/Cellar/gcc/14.1.0_1 \ +# --param libstdcxx_version=14 \ +# --param libstdcxx_triple=aarch64-apple-darwin22 # lit_config.load_config(config, '@CMAKE_CURRENT_BINARY_DIR@/cmake-bridge.cfg') @@ -20,19 +20,19 @@ import libcxx.test.params, libcxx.test.config, libcxx.test.dsl # Additional parameters for libstdc++ LIBSTDCXX_PARAMETERS = [ - libcxx.test.dsl.Parameter(name='libstdcxx-install-prefix', type=str, + libcxx.test.dsl.Parameter(name='libstdcxx_install_prefix', type=str, actions=lambda path: [libcxx.test.dsl.AddSubstitution('%{libstdcxx-install-prefix}', path)], help=""" The installation prefix where libstdc++ was installed. This is used to find the libstdc++ headers, link against its built library, etc. """), - libcxx.test.dsl.Parameter(name='libstdcxx-triple', type=str, + libcxx.test.dsl.Parameter(name='libstdcxx_triple', type=str, actions=lambda triple: [libcxx.test.dsl.AddSubstitution('%{libstdcxx-triple}', triple)], help=""" The target triple used for the target-specific include directory of libstdc++. This is used to find the libstdc++ headers. """), - libcxx.test.dsl.Parameter(name='libstdcxx-version', type=str, + libcxx.test.dsl.Parameter(name='libstdcxx_version', type=str, actions=lambda version: [libcxx.test.dsl.AddSubstitution('%{libstdcxx-version}', version)], help=""" The version of libstdc++. This is used to find the libstdc++ headers and library. 
diff --git a/libcxx/test/libcxx/atomics/atomics.syn/compatible_with_stdatomic.compile.pass.cpp b/libcxx/test/libcxx/atomics/atomics.syn/compatible_with_stdatomic.compile.pass.cpp index 323072da14463..30e9672a25683 100644 --- a/libcxx/test/libcxx/atomics/atomics.syn/compatible_with_stdatomic.compile.pass.cpp +++ b/libcxx/test/libcxx/atomics/atomics.syn/compatible_with_stdatomic.compile.pass.cpp @@ -7,15 +7,16 @@ //===----------------------------------------------------------------------===// // UNSUPPORTED: no-threads +// UNSUPPORTED: c++03, c++11, c++14, c++17, c++20 // XFAIL: FROZEN-CXX03-HEADERS-FIXME -// This test verifies that redirects to . As an extension, -// libc++ enables this redirection even before C++23. +// This test verifies that redirects to . -// Ordinarily, can be included after , but including it -// first doesn't work because its macros break . Verify that -// can be included first. +// Before C++23, can be included after , but including it +// first doesn't work because its macros break . Fixing that is the point +// of the C++23 change that added to C++. Thus, this test verifies +// that can be included first. #include #include diff --git a/libcxx/test/libcxx/atomics/atomics.syn/incompatible_with_stdatomic.verify.cpp b/libcxx/test/libcxx/atomics/atomics.syn/incompatible_with_stdatomic.verify.cpp new file mode 100644 index 0000000000000..ca092d9c60275 --- /dev/null +++ b/libcxx/test/libcxx/atomics/atomics.syn/incompatible_with_stdatomic.verify.cpp @@ -0,0 +1,22 @@ +//===----------------------------------------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +// UNSUPPORTED: no-threads +// REQUIRES: c++03 || c++11 || c++14 || c++17 || c++20 + +// This test ensures that we issue a reasonable diagnostic when including after +// has been included. Before C++23, this otherwise leads to obscure errors +// because may try to redefine things defined by . + +// Ignore additional weird errors that happen when the two headers are mixed. +// ADDITIONAL_COMPILE_FLAGS: -Xclang -verify-ignore-unexpected=error -Xclang -verify-ignore-unexpected=warning + +#include +#include + +// expected-error@*:* {{ is incompatible with before C++23.}} diff --git a/libcxx/test/libcxx/atomics/stdatomic.h.syn/dont_hijack_header.compile.pass.cpp b/libcxx/test/libcxx/atomics/stdatomic.h.syn/dont_hijack_header.compile.pass.cpp new file mode 100644 index 0000000000000..6df80daf9414e --- /dev/null +++ b/libcxx/test/libcxx/atomics/stdatomic.h.syn/dont_hijack_header.compile.pass.cpp @@ -0,0 +1,24 @@ +//===----------------------------------------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +// UNSUPPORTED: no-threads + +// This test ensures that we don't hijack the header (e.g. by providing +// an empty header) even when compiling before C++23, since some users were using the +// Clang or platform provided header before libc++ added its own. + +// On GCC, the compiler-provided is not C++ friendly, so including +// doesn't work at all if we don't use the provided by libc++ in C++23 and above. 
+// XFAIL: (c++11 || c++14 || c++17 || c++20) && gcc + +#include + +void f() { + atomic_int i; // just make sure the header isn't empty + (void)i; +} diff --git a/libcxx/test/libcxx/atomics/stdatomic.h.syn/dont_hijack_header.cxx23.compile.pass.cpp b/libcxx/test/libcxx/atomics/stdatomic.h.syn/dont_hijack_header.cxx23.compile.pass.cpp new file mode 100644 index 0000000000000..a8a99e6937f31 --- /dev/null +++ b/libcxx/test/libcxx/atomics/stdatomic.h.syn/dont_hijack_header.cxx23.compile.pass.cpp @@ -0,0 +1,28 @@ +//===----------------------------------------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +// UNSUPPORTED: no-threads + +// This test verifies that DOES NOT redirect to before C++23, +// since doing so is a breaking change. Several things can break when that happens, +// because the type of _Atomic(T) changes from _Atomic(T) to std::atomic. +// +// For example, redeclarations can become invalid depending on whether they +// have been declared with in scope or not. + +// REQUIRES: c++03 || c++11 || c++14 || c++17 || c++20 + +// On GCC, the compiler-provided is not C++ friendly, so including +// doesn't work at all if we don't use the provided by libc++ in C++23 and above. +// XFAIL: (c++11 || c++14 || c++17 || c++20) && gcc + +#include +#include +#include + +static_assert(!std::is_same<_Atomic(int), std::atomic >::value, ""); diff --git a/libcxx/test/std/depr/depr.c.headers/stdlib_h.pass.cpp b/libcxx/test/std/depr/depr.c.headers/stdlib_h.pass.cpp index 587c6b6e10ddb..662331558c121 100644 --- a/libcxx/test/std/depr/depr.c.headers/stdlib_h.pass.cpp +++ b/libcxx/test/std/depr/depr.c.headers/stdlib_h.pass.cpp @@ -8,6 +8,11 @@ // test +// mblen was added in Android API 26. 
+// TODO: Switch from UNSUPPORTED to XFAIL once the Android CI Docker sysroot is +// updated. +// UNSUPPORTED: LIBCXX-ANDROID-FIXME && target={{.+}}-android{{(eabi)?(21|22|23|24|25)}} + #include #include #include diff --git a/libcxx/test/std/depr/depr.c.headers/wctype_h.compile.pass.cpp b/libcxx/test/std/depr/depr.c.headers/wctype_h.compile.pass.cpp index 35b294532b2bd..4e2fb319336f1 100644 --- a/libcxx/test/std/depr/depr.c.headers/wctype_h.compile.pass.cpp +++ b/libcxx/test/std/depr/depr.c.headers/wctype_h.compile.pass.cpp @@ -8,6 +8,11 @@ // UNSUPPORTED: no-wide-characters +// towctrans and wctrans were added in Android API 26. +// TODO: Switch from UNSUPPORTED to XFAIL once the Android CI Docker sysroot is +// updated. +// UNSUPPORTED: LIBCXX-ANDROID-FIXME && target={{.+}}-android{{(eabi)?(21|22|23|24|25)}} + // #include diff --git a/libcxx/test/std/input.output/file.streams/fstreams/ifstream.members/offset_range.pass.cpp b/libcxx/test/std/input.output/file.streams/fstreams/ifstream.members/offset_range.pass.cpp new file mode 100644 index 0000000000000..5afd4465db31e --- /dev/null +++ b/libcxx/test/std/input.output/file.streams/fstreams/ifstream.members/offset_range.pass.cpp @@ -0,0 +1,82 @@ +//===----------------------------------------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +// + +// Test that we can seek using offsets larger than 32 bit, and that we can +// retrieve file offsets larger than 32 bit. + +// On MSVC targets, we only use the 32 bit fseek/ftell functions. For MinGW +// targets, we use fseeko/ftello, but the user needs to define +// _FILE_OFFSET_BITS=64 to make them 64 bit. 
+// +// XFAIL: target={{.*}}-windows{{.*}} + +// On 32 bit Android platforms, off_t is 32 bit by default. By defining +// _FILE_OFFSET_BITS=64, one gets a 64 bit off_t, but the corresponding +// 64 bit ftello/fseeko functions are only available since Android API 24 (7.0). +// (On 64 bit Android platforms, off_t has always been 64 bit.) +// +// XFAIL: target={{i686|arm.*}}-{{.+}}-android{{.*}} + +// Writing the >4 GB test file fails on 32 bit AIX. +// +// XFAIL: target=powerpc-{{.+}}-aix{{.*}} + +#include +#include +#include +#include + +#include "assert_macros.h" +#include "platform_support.h" +#include "test_macros.h" + +void test_tellg(std::streamoff total_size) { + std::vector data(8192); + for (std::size_t i = 0; i < data.size(); ++i) + data[i] = static_cast(i % (1 << 8 * sizeof(char))); + std::string p = get_temp_file_name(); + { + std::ofstream ofs; + ofs.open(p, std::ios::out | std::ios::binary); + assert(ofs.is_open()); + for (std::streamoff size = 0; size < total_size;) { + std::size_t n = std::min(static_cast(data.size()), total_size - size); + ofs.write(data.data(), n); + size += n; + } + assert(!ofs.fail()); + ofs.close(); + } + { + std::ifstream ifs; + ifs.open(p, std::ios::binary); + assert(ifs.is_open()); + std::streamoff in_off = ifs.tellg(); + TEST_REQUIRE(in_off == 0, "in_off not zero at start"); + ifs.seekg(total_size - 20, std::ios::beg); + in_off = ifs.tellg(); + TEST_REQUIRE(in_off == total_size - 20, "in_off incorrect after >32 bit seek"); + ifs.seekg(10, std::ios::cur); + in_off = ifs.tellg(); + TEST_REQUIRE(in_off == total_size - 10, "in_off incorrect after incremental seek"); + ifs.seekg(0, std::ios::end); + in_off = ifs.tellg(); + TEST_REQUIRE(in_off == total_size, "in_off incorrect after seek to end"); + } + std::remove(p.c_str()); +} + +int main(int, char**) { + // This test assumes and requires that std::streamoff is larger than + // 32 bit - this is not required in the standard itself. 
+ static_assert(sizeof(std::streamoff) > 4, ""); + test_tellg(0x100000042ULL); + return 0; +} diff --git a/libcxx/test/std/language.support/support.runtime/cstdlib.pass.cpp b/libcxx/test/std/language.support/support.runtime/cstdlib.pass.cpp index a1f7e1143a1e9..9d3e6d892daf0 100644 --- a/libcxx/test/std/language.support/support.runtime/cstdlib.pass.cpp +++ b/libcxx/test/std/language.support/support.runtime/cstdlib.pass.cpp @@ -8,6 +8,11 @@ // test +// mblen was added in Android API 26. +// TODO: Switch from UNSUPPORTED to XFAIL once the Android CI Docker sysroot is +// updated. +// UNSUPPORTED: LIBCXX-ANDROID-FIXME && target={{.+}}-android{{(eabi)?(21|22|23|24|25)}} + #include #include #include diff --git a/libcxx/test/std/strings/c.strings/cwctype.pass.cpp b/libcxx/test/std/strings/c.strings/cwctype.pass.cpp index 5bc2531d6f6ac..0deabf51ed59c 100644 --- a/libcxx/test/std/strings/c.strings/cwctype.pass.cpp +++ b/libcxx/test/std/strings/c.strings/cwctype.pass.cpp @@ -10,6 +10,11 @@ // UNSUPPORTED: no-wide-characters +// towctrans and wctrans were added in Android API 26. +// TODO: Switch from UNSUPPORTED to XFAIL once the Android CI Docker sysroot is +// updated. 
+// UNSUPPORTED: LIBCXX-ANDROID-FIXME && target={{.+}}-android{{(eabi)?(21|22|23|24|25)}} + #include #include diff --git a/libcxx/test/tools/clang_tidy_checks/libcpp_module.cpp b/libcxx/test/tools/clang_tidy_checks/libcpp_module.cpp index f49f3e3c615ca..bc7c8ce7ec443 100644 --- a/libcxx/test/tools/clang_tidy_checks/libcpp_module.cpp +++ b/libcxx/test/tools/clang_tidy_checks/libcpp_module.cpp @@ -27,7 +27,7 @@ class LibcxxTestModule : public clang::tidy::ClangTidyModule { check_factories.registerCheck("libcpp-header-exportable-declarations"); check_factories.registerCheck("libcpp-hide-from-abi"); check_factories.registerCheck("libcpp-internal-ftms"); - // check_factories.registerCheck("libcpp-nodebug-on-aliases"); + check_factories.registerCheck("libcpp-nodebug-on-aliases"); check_factories.registerCheck("libcpp-cpp-version-check"); check_factories.registerCheck("libcpp-robust-against-adl"); check_factories.registerCheck("libcpp-uglify-attributes"); diff --git a/libcxx/utils/libcxx-compare-benchmarks b/libcxx/utils/libcxx-compare-benchmarks index e04820fc57ed9..08c53b2420c8e 100755 --- a/libcxx/utils/libcxx-compare-benchmarks +++ b/libcxx/utils/libcxx-compare-benchmarks @@ -7,15 +7,16 @@ MONOREPO_ROOT="$(realpath $(dirname "${PROGNAME}"))" function usage() { cat < benchmarks... +${PROGNAME} [-h|--help] benchmarks... [-- gbench-args...] Compare the given benchmarks between the baseline and the candidate build directories. This requires those benchmarks to have already been generated in both build directories. - The path to the build directory considered the baseline. - The path to the build directory considered the candidate. -benchmarks... Paths of the benchmarks to compare. Those paths are relative to ''. + The path to the build directory considered the baseline. + The path to the build directory considered the candidate. +benchmarks... Paths of the benchmarks to compare. Those paths are relative to ''. +[-- gbench-args...] 
Any arguments provided after '--' will be passed as-is to GoogleBenchmark's compare.py tool. Example ======= @@ -45,7 +46,17 @@ python3 -m venv /tmp/libcxx-compare-benchmarks-venv source /tmp/libcxx-compare-benchmarks-venv/bin/activate pip3 install -r ${GBENCH}/tools/requirements.txt -for benchmark in ${@}; do +benchmarks="" +while [[ $# -gt 0 ]]; do + if [[ "${1}" == "--" ]]; then + shift + break + fi + benchmarks+=" ${1}" + shift +done + +for benchmark in ${benchmarks}; do base="$(${MONOREPO_ROOT}/libcxx/utils/libcxx-benchmark-json ${baseline} ${benchmark})" cand="$(${MONOREPO_ROOT}/libcxx/utils/libcxx-benchmark-json ${candidate} ${benchmark})" @@ -58,5 +69,5 @@ for benchmark in ${@}; do continue fi - "${GBENCH}/tools/compare.py" benchmarks "${base}" "${cand}" + "${GBENCH}/tools/compare.py" benchmarks "${base}" "${cand}" ${@} done diff --git a/libunwind/src/CMakeLists.txt b/libunwind/src/CMakeLists.txt index e7ea57734cca9..ecbd019bb29ea 100644 --- a/libunwind/src/CMakeLists.txt +++ b/libunwind/src/CMakeLists.txt @@ -20,7 +20,12 @@ set(LIBUNWIND_C_SOURCES ) set_source_files_properties(${LIBUNWIND_C_SOURCES} PROPERTIES - COMPILE_FLAGS "-std=c99") + # We need to set `-fexceptions` here so that key + # unwinding functions, like + # _UNWIND_RaiseException, are not marked as + # `nounwind`, which breaks LTO builds of + # libunwind. See #56825 and #120657 for context. 
+ COMPILE_FLAGS "-std=c99 -fexceptions") set(LIBUNWIND_ASM_SOURCES UnwindRegistersRestore.S diff --git a/lld/COFF/COFFLinkerContext.h b/lld/COFF/COFFLinkerContext.h index bdd625b8c3916..8322f829d4055 100644 --- a/lld/COFF/COFFLinkerContext.h +++ b/lld/COFF/COFFLinkerContext.h @@ -56,7 +56,6 @@ class COFFLinkerContext : public CommonLinkerContext { std::vector objFileInstances; std::map pdbInputFileInstances; std::vector importFileInstances; - std::vector bitcodeFileInstances; MergeChunk *mergeChunkInstances[Log2MaxSectionAlignment + 1] = {}; diff --git a/lld/COFF/Config.h b/lld/COFF/Config.h index 9e6b17e87c9e7..924560fef0231 100644 --- a/lld/COFF/Config.h +++ b/lld/COFF/Config.h @@ -120,7 +120,6 @@ struct Configuration { size_t wordsize; bool verbose = false; WindowsSubsystem subsystem = llvm::COFF::IMAGE_SUBSYSTEM_UNKNOWN; - Symbol *entry = nullptr; bool noEntry = false; std::string outputFile; std::string importName; diff --git a/lld/COFF/Driver.cpp b/lld/COFF/Driver.cpp index 6af6b4f730766..898c6c17d2062 100644 --- a/lld/COFF/Driver.cpp +++ b/lld/COFF/Driver.cpp @@ -175,6 +175,15 @@ llvm::Triple::ArchType LinkerDriver::getArch() { return getMachineArchType(ctx.config.machine); } +std::vector LinkerDriver::getChunks() const { + std::vector res; + for (ObjFile *file : ctx.objFileInstances) { + ArrayRef v = file->getChunks(); + res.insert(res.end(), v.begin(), v.end()); + } + return res; +} + static bool compatibleMachineType(COFFLinkerContext &ctx, MachineTypes mt) { if (mt == IMAGE_FILE_MACHINE_UNKNOWN) return true; @@ -209,7 +218,7 @@ void LinkerDriver::addFile(InputFile *file) { << " linked in after " "doing LTO compilation."; } - ctx.bitcodeFileInstances.push_back(f); + f->symtab.bitcodeFileInstances.push_back(f); } else if (auto *f = dyn_cast(file)) { ctx.importFileInstances.push_back(f); } @@ -276,7 +285,7 @@ void LinkerDriver::addBuffer(std::unique_ptr mb, addFile(make(ctx, mbref)); break; case file_magic::bitcode: - addFile(make(ctx, mbref, "", 0, lazy)); 
+ addFile(BitcodeFile::create(ctx, mbref, "", 0, lazy)); break; case file_magic::coff_object: case file_magic::coff_import_library: @@ -365,8 +374,8 @@ void LinkerDriver::addArchiveBuffer(MemoryBufferRef mb, StringRef symName, if (magic == file_magic::coff_object) { obj = ObjFile::create(ctx, mb); } else if (magic == file_magic::bitcode) { - obj = - make(ctx, mb, parentName, offsetInArchive, /*lazy=*/false); + obj = BitcodeFile::create(ctx, mb, parentName, offsetInArchive, + /*lazy=*/false); } else if (magic == file_magic::coff_cl_gl_object) { Err(ctx) << mb.getBufferIdentifier() << ": is not a native COFF file. Recompile without /GL?"; @@ -491,8 +500,9 @@ void LinkerDriver::parseDirectives(InputFile *file) { case OPT_entry: if (!arg->getValue()[0]) Fatal(ctx) << "missing entry point symbol name"; - ctx.config.entry = - file->symtab.addGCRoot(file->symtab.mangle(arg->getValue()), true); + ctx.forEachSymtab([&](SymbolTable &symtab) { + symtab.entry = symtab.addGCRoot(symtab.mangle(arg->getValue()), true); + }); break; case OPT_failifmismatch: checkFailIfMismatch(arg->getValue(), file); @@ -1092,7 +1102,7 @@ void LinkerDriver::parseOrderFile(StringRef arg) { // Get a list of all comdat sections for error checking. DenseSet set; - for (Chunk *c : ctx.symtab.getChunks()) + for (Chunk *c : ctx.driver.getChunks()) if (auto *sec = dyn_cast(c)) if (sec->sym) set.insert(sec->sym->getName()); @@ -1394,8 +1404,9 @@ void LinkerDriver::createECExportThunks() { } } - if (ctx.config.entry) - maybeCreateECExportThunk(ctx.config.entry->getName(), ctx.config.entry); + if (ctx.symtabEC->entry) + maybeCreateECExportThunk(ctx.symtabEC->entry->getName(), + ctx.symtabEC->entry); for (Export &e : ctx.config.exports) { if (!e.data) maybeCreateECExportThunk(e.extName.empty() ? e.name : e.extName, e.sym); @@ -2351,39 +2362,38 @@ void LinkerDriver::linkerMain(ArrayRef argsArr) { // and after the early return when just writing an import library. 
if (config->subsystem == IMAGE_SUBSYSTEM_UNKNOWN) { llvm::TimeTraceScope timeScope("Infer subsystem"); - config->subsystem = ctx.symtab.inferSubsystem(); + config->subsystem = mainSymtab.inferSubsystem(); if (config->subsystem == IMAGE_SUBSYSTEM_UNKNOWN) Fatal(ctx) << "subsystem must be defined"; } // Handle /entry and /dll - { + ctx.forEachSymtab([&](SymbolTable &symtab) { llvm::TimeTraceScope timeScope("Entry point"); if (auto *arg = args.getLastArg(OPT_entry)) { if (!arg->getValue()[0]) Fatal(ctx) << "missing entry point symbol name"; - config->entry = - ctx.symtab.addGCRoot(ctx.symtab.mangle(arg->getValue()), true); - } else if (!config->entry && !config->noEntry) { + symtab.entry = symtab.addGCRoot(symtab.mangle(arg->getValue()), true); + } else if (!symtab.entry && !config->noEntry) { if (args.hasArg(OPT_dll)) { StringRef s = (config->machine == I386) ? "__DllMainCRTStartup@12" : "_DllMainCRTStartup"; - config->entry = ctx.symtab.addGCRoot(s, true); + symtab.entry = symtab.addGCRoot(s, true); } else if (config->driverWdm) { // /driver:wdm implies /entry:_NtProcessStartup - config->entry = - ctx.symtab.addGCRoot(ctx.symtab.mangle("_NtProcessStartup"), true); + symtab.entry = + symtab.addGCRoot(symtab.mangle("_NtProcessStartup"), true); } else { // Windows specific -- If entry point name is not given, we need to // infer that from user-defined entry name. - StringRef s = ctx.symtab.findDefaultEntry(); + StringRef s = symtab.findDefaultEntry(); if (s.empty()) Fatal(ctx) << "entry point must be defined"; - config->entry = ctx.symtab.addGCRoot(s, true); + symtab.entry = symtab.addGCRoot(s, true); Log(ctx) << "Entry name inferred: " << s; } } - } + }); // Handle /delayload { @@ -2522,10 +2532,12 @@ void LinkerDriver::linkerMain(ArrayRef argsArr) { { llvm::TimeTraceScope timeScope("Add unresolved symbols"); do { - // Windows specific -- if entry point is not found, - // search for its mangled names. 
- if (config->entry) - ctx.symtab.mangleMaybe(config->entry); + ctx.forEachSymtab([&](SymbolTable &symtab) { + // Windows specific -- if entry point is not found, + // search for its mangled names. + if (symtab.entry) + symtab.mangleMaybe(symtab.entry); + }); // Windows specific -- Make sure we resolve all dllexported symbols. for (Export &e : config->exports) { @@ -2559,21 +2571,24 @@ void LinkerDriver::linkerMain(ArrayRef argsArr) { } } - // If any inputs are bitcode files, the LTO code generator may create - // references to library functions that are not explicit in the bitcode - // file's symbol table. If any of those library functions are defined in a - // bitcode file in an archive member, we need to arrange to use LTO to - // compile those archive members by adding them to the link beforehand. - if (!ctx.bitcodeFileInstances.empty()) { - llvm::Triple TT( - ctx.bitcodeFileInstances.front()->obj->getTargetTriple()); - for (auto *s : lto::LTO::getRuntimeLibcallSymbols(TT)) - ctx.symtab.addLibcall(s); - } + ctx.forEachSymtab([&](SymbolTable &symtab) { + // If any inputs are bitcode files, the LTO code generator may create + // references to library functions that are not explicit in the bitcode + // file's symbol table. If any of those library functions are defined in + // a bitcode file in an archive member, we need to arrange to use LTO to + // compile those archive members by adding them to the link beforehand. + if (!symtab.bitcodeFileInstances.empty()) { + llvm::Triple TT( + symtab.bitcodeFileInstances.front()->obj->getTargetTriple()); + for (auto *s : lto::LTO::getRuntimeLibcallSymbols(TT)) + symtab.addLibcall(s); + } - // Windows specific -- if __load_config_used can be resolved, resolve it. - if (ctx.symtab.findUnderscore("_load_config_used")) - ctx.symtab.addGCRoot(ctx.symtab.mangle("_load_config_used")); + // Windows specific -- if __load_config_used can be resolved, resolve + // it. 
+ if (symtab.findUnderscore("_load_config_used")) + symtab.addGCRoot(symtab.mangle("_load_config_used")); + }); if (args.hasArg(OPT_include_optional)) { // Handle /includeoptional @@ -2624,8 +2639,11 @@ void LinkerDriver::linkerMain(ArrayRef argsArr) { // If we are going to do codegen for link-time optimization, check for // unresolvable symbols first, so we don't spend time generating code that // will fail to link anyway. - if (!ctx.bitcodeFileInstances.empty() && !config->forceUnresolved) - ctx.symtab.reportUnresolvable(); + if (!config->forceUnresolved) + ctx.forEachSymtab([](SymbolTable &symtab) { + if (!symtab.bitcodeFileInstances.empty()) + symtab.reportUnresolvable(); + }); if (errorCount()) return; @@ -2640,7 +2658,7 @@ void LinkerDriver::linkerMain(ArrayRef argsArr) { // link those files (unless -thinlto-index-only was given, in which case we // resolve symbols and write indices, but don't generate native code or link). ltoCompilationDone = true; - ctx.symtab.compileBitcodeFiles(); + ctx.forEachSymtab([](SymbolTable &symtab) { symtab.compileBitcodeFiles(); }); if (Defined *d = dyn_cast_or_null(ctx.symtab.findUnderscore("_tls_used"))) diff --git a/lld/COFF/Driver.h b/lld/COFF/Driver.h index 4558f68c041fa..8ce2e13129ba6 100644 --- a/lld/COFF/Driver.h +++ b/lld/COFF/Driver.h @@ -94,6 +94,9 @@ class LinkerDriver { void enqueuePath(StringRef path, bool wholeArchive, bool lazy); + // Returns a list of chunks of selected symbols. + std::vector getChunks() const; + std::unique_ptr tar; // for /linkrepro void pullArm64ECIcallHelper(); diff --git a/lld/COFF/ICF.cpp b/lld/COFF/ICF.cpp index 796d3a4108ba4..e6c965160e4ef 100644 --- a/lld/COFF/ICF.cpp +++ b/lld/COFF/ICF.cpp @@ -264,7 +264,7 @@ void ICF::run() { // Collect only mergeable sections and group by hash value. 
uint32_t nextId = 1; - for (Chunk *c : ctx.symtab.getChunks()) { + for (Chunk *c : ctx.driver.getChunks()) { if (auto *sc = dyn_cast(c)) { if (isEligible(sc)) chunks.push_back(sc); diff --git a/lld/COFF/InputFiles.cpp b/lld/COFF/InputFiles.cpp index e7891d3e67f3e..5ee73d4dc4f8b 100644 --- a/lld/COFF/InputFiles.cpp +++ b/lld/COFF/InputFiles.cpp @@ -746,6 +746,26 @@ std::optional ObjFile::createDefined( if (sectionNumber == llvm::COFF::IMAGE_SYM_DEBUG) return nullptr; + if (sym.isEmptySectionDeclaration()) { + // As there is no coff_section in the object file for these, make a + // new virtual one, with everything zeroed out (i.e. an empty section), + // with only the name and characteristics set. + StringRef name = getName(); + auto *hdr = make(); + memset(hdr, 0, sizeof(*hdr)); + strncpy(hdr->Name, name.data(), + std::min(name.size(), (size_t)COFF::NameSize)); + // We have no idea what characteristics should be assumed here; pick + // a default. This matches what is used for .idata sections in the regular + // object files in import libraries. 
+ hdr->Characteristics = IMAGE_SCN_CNT_INITIALIZED_DATA | IMAGE_SCN_MEM_READ | + IMAGE_SCN_MEM_WRITE | IMAGE_SCN_ALIGN_4BYTES; + auto *sc = make(this, hdr); + chunks.push_back(sc); + return make(this, /*name=*/"", /*isCOMDAT=*/false, + /*isExternal=*/false, sym.getGeneric(), sc); + } + if (llvm::COFF::isReservedSectionNumber(sectionNumber)) Fatal(ctx) << toString(this) << ": " << getName() << " should not refer to special section " @@ -1209,10 +1229,15 @@ void ImportFile::parse() { } } -BitcodeFile::BitcodeFile(COFFLinkerContext &ctx, MemoryBufferRef mb, - StringRef archiveName, uint64_t offsetInArchive, - bool lazy) - : InputFile(ctx.symtab, BitcodeKind, mb, lazy) { +BitcodeFile::BitcodeFile(SymbolTable &symtab, MemoryBufferRef mb, + std::unique_ptr &o, bool lazy) + : InputFile(symtab, BitcodeKind, mb, lazy) { + obj.swap(o); +} + +BitcodeFile *BitcodeFile::create(COFFLinkerContext &ctx, MemoryBufferRef mb, + StringRef archiveName, + uint64_t offsetInArchive, bool lazy) { std::string path = mb.getBufferIdentifier().str(); if (ctx.config.thinLTOIndexOnly) path = replaceThinLTOSuffix(mb.getBufferIdentifier(), @@ -1232,7 +1257,9 @@ BitcodeFile::BitcodeFile(COFFLinkerContext &ctx, MemoryBufferRef mb, sys::path::filename(path) + utostr(offsetInArchive))); - obj = check(lto::InputFile::create(mbref)); + std::unique_ptr obj = check(lto::InputFile::create(mbref)); + return make(ctx.getSymtab(getMachineType(obj.get())), mb, obj, + lazy); } BitcodeFile::~BitcodeFile() = default; @@ -1309,7 +1336,7 @@ void BitcodeFile::parseLazy() { } } -MachineTypes BitcodeFile::getMachineType() const { +MachineTypes BitcodeFile::getMachineType(const llvm::lto::InputFile *obj) { Triple t(obj->getTargetTriple()); switch (t.getArch()) { case Triple::x86_64: diff --git a/lld/COFF/InputFiles.h b/lld/COFF/InputFiles.h index d3075c5e0a338..823561cda247a 100644 --- a/lld/COFF/InputFiles.h +++ b/lld/COFF/InputFiles.h @@ -386,13 +386,19 @@ class ImportFile : public InputFile { // Used for LTO. 
class BitcodeFile : public InputFile { public: - explicit BitcodeFile(COFFLinkerContext &ctx, MemoryBufferRef mb, - StringRef archiveName, uint64_t offsetInArchive, - bool lazy); + explicit BitcodeFile(SymbolTable &symtab, MemoryBufferRef mb, + std::unique_ptr &obj, bool lazy); ~BitcodeFile(); + + static BitcodeFile *create(COFFLinkerContext &ctx, MemoryBufferRef mb, + StringRef archiveName, uint64_t offsetInArchive, + bool lazy); static bool classof(const InputFile *f) { return f->kind() == BitcodeKind; } ArrayRef getSymbols() { return symbols; } - MachineTypes getMachineType() const override; + MachineTypes getMachineType() const override { + return getMachineType(obj.get()); + } + static MachineTypes getMachineType(const llvm::lto::InputFile *obj); void parseLazy(); std::unique_ptr obj; diff --git a/lld/COFF/MapFile.cpp b/lld/COFF/MapFile.cpp index e3531c04e7747..af87587d143d5 100644 --- a/lld/COFF/MapFile.cpp +++ b/lld/COFF/MapFile.cpp @@ -301,7 +301,7 @@ void lld::coff::writeMapFile(COFFLinkerContext &ctx) { uint64_t entryAddress = 0; if (!ctx.config.noEntry) { - Defined *entry = dyn_cast_or_null(ctx.config.entry); + Defined *entry = dyn_cast_or_null(ctx.symtab.entry); if (entry) { Chunk *chunk = entry->getChunk(); entrySecIndex = chunk->getOutputSectionIdx(); diff --git a/lld/COFF/MarkLive.cpp b/lld/COFF/MarkLive.cpp index 3c09baa73a9f7..ad50536892ebb 100644 --- a/lld/COFF/MarkLive.cpp +++ b/lld/COFF/MarkLive.cpp @@ -31,7 +31,7 @@ void markLive(COFFLinkerContext &ctx) { // COMDAT section chunks are dead by default. Add non-COMDAT chunks. Do not // traverse DWARF sections. They are live, but they should not keep other // sections alive. 
- for (Chunk *c : ctx.symtab.getChunks()) + for (Chunk *c : ctx.driver.getChunks()) if (auto *sc = dyn_cast(c)) if (sc->live && !sc->isDWARF()) worklist.push_back(sc); diff --git a/lld/COFF/SymbolTable.cpp b/lld/COFF/SymbolTable.cpp index 7c43ada3d136e..bf965e8a2332d 100644 --- a/lld/COFF/SymbolTable.cpp +++ b/lld/COFF/SymbolTable.cpp @@ -347,8 +347,8 @@ bool SymbolTable::handleMinGWAutomaticImport(Symbol *sym, StringRef name) { /// defined symbol imported" diagnostic for symbols in localImports. /// objFiles and bitcodeFiles (if not nullptr) are used to report where /// undefined symbols are referenced. -static void reportProblemSymbols( - COFFLinkerContext &ctx, const SmallPtrSetImpl &undefs, +void SymbolTable::reportProblemSymbols( + const SmallPtrSetImpl &undefs, const DenseMap *localImports, bool needBitcodeFiles) { // Return early if there is nothing to report (which should be // the common case). @@ -392,7 +392,7 @@ static void reportProblemSymbols( processFile(file, file->getSymbols()); if (needBitcodeFiles) - for (BitcodeFile *file : ctx.bitcodeFileInstances) + for (BitcodeFile *file : bitcodeFileInstances) processFile(file, file->getSymbols()); for (const UndefinedDiag &undefDiag : undefDiags) @@ -423,8 +423,7 @@ void SymbolTable::reportUnresolvable() { undefs.insert(sym); } - reportProblemSymbols(ctx, undefs, - /* localImports */ nullptr, true); + reportProblemSymbols(undefs, /*localImports=*/nullptr, true); } bool SymbolTable::resolveRemainingUndefines() { @@ -506,8 +505,8 @@ bool SymbolTable::resolveRemainingUndefines() { } reportProblemSymbols( - ctx, undefs, - ctx.config.warnLocallyDefinedImported ? &localImports : nullptr, false); + undefs, ctx.config.warnLocallyDefinedImported ? 
&localImports : nullptr, + false); return foundLazy; } @@ -945,15 +944,6 @@ void SymbolTable::addLibcall(StringRef name) { } } -std::vector SymbolTable::getChunks() const { - std::vector res; - for (ObjFile *file : ctx.objFileInstances) { - ArrayRef v = file->getChunks(); - res.insert(res.end(), v.begin(), v.end()); - } - return res; -} - Symbol *SymbolTable::find(StringRef name) const { return symMap.lookup(CachedHashStringRef(name)); } @@ -1133,13 +1123,13 @@ Symbol *SymbolTable::addUndefined(StringRef name) { } void SymbolTable::compileBitcodeFiles() { - if (ctx.bitcodeFileInstances.empty()) + if (bitcodeFileInstances.empty()) return; llvm::TimeTraceScope timeScope("Compile bitcode"); ScopedTimer t(ctx.ltoTimer); lto.reset(new BitcodeCompiler(ctx)); - for (BitcodeFile *f : ctx.bitcodeFileInstances) + for (BitcodeFile *f : bitcodeFileInstances) lto->add(*f); for (InputFile *newObj : lto->compile()) { ObjFile *obj = cast(newObj); diff --git a/lld/COFF/SymbolTable.h b/lld/COFF/SymbolTable.h index 1de0b3e1deac3..66bca0d63e5ff 100644 --- a/lld/COFF/SymbolTable.h +++ b/lld/COFF/SymbolTable.h @@ -14,6 +14,7 @@ #include "llvm/ADT/CachedHashString.h" #include "llvm/ADT/DenseMap.h" #include "llvm/ADT/DenseMapInfo.h" +#include "llvm/ADT/SmallPtrSet.h" #include "llvm/Support/raw_ostream.h" namespace llvm { @@ -67,9 +68,6 @@ class SymbolTable { void loadMinGWSymbols(); bool handleMinGWAutomaticImport(Symbol *sym, StringRef name); - // Returns a list of chunks of selected symbols. - std::vector getChunks() const; - // Returns a symbol for a given name. Returns a nullptr if not found. Symbol *find(StringRef name) const; Symbol *findUnderscore(StringRef name) const; @@ -143,6 +141,9 @@ class SymbolTable { bool isEC() const { return machine == ARM64EC; } + // An entry point symbol. + Symbol *entry = nullptr; + // A list of chunks which to be added to .rdata. 
std::vector localImportChunks; @@ -155,6 +156,8 @@ class SymbolTable { callback(pair.second); } + std::vector bitcodeFileInstances; + DefinedRegular *loadConfigSym = nullptr; uint32_t loadConfigSize = 0; void initializeLoadConfig(); @@ -175,6 +178,11 @@ class SymbolTable { std::unique_ptr lto; std::vector> entryThunks; llvm::DenseMap exitThunks; + + void + reportProblemSymbols(const llvm::SmallPtrSetImpl &undefs, + const llvm::DenseMap *localImports, + bool needBitcodeFiles); }; std::vector getSymbolLocations(ObjFile *file, uint32_t symIndex); diff --git a/lld/COFF/Writer.cpp b/lld/COFF/Writer.cpp index eb82a9cc01593..8247f131dcf07 100644 --- a/lld/COFF/Writer.cpp +++ b/lld/COFF/Writer.cpp @@ -1077,7 +1077,7 @@ void Writer::createSections() { dtorsSec = createSection(".dtors", data | r | w); // Then bin chunks by name and output characteristics. - for (Chunk *c : ctx.symtab.getChunks()) { + for (Chunk *c : ctx.driver.getChunks()) { auto *sc = dyn_cast(c); if (sc && !sc->live) { if (ctx.config.verbose) @@ -1748,7 +1748,7 @@ template void Writer::writeHeader() { pe->SizeOfImage = sizeOfImage; pe->SizeOfHeaders = sizeOfHeaders; if (!config->noEntry) { - Defined *entry = cast(config->entry); + Defined *entry = cast(ctx.symtab.entry); pe->AddressOfEntryPoint = entry->getRVA(); // Pointer to thumb code must have the LSB set, so adjust it. if (config->machine == ARMNT) @@ -2031,8 +2031,10 @@ void Writer::createGuardCFTables() { } // Mark the image entry as address-taken. - if (config->entry) - maybeAddAddressTakenFunction(addressTakenSyms, config->entry); + ctx.forEachSymtab([&](SymbolTable &symtab) { + if (symtab.entry) + maybeAddAddressTakenFunction(addressTakenSyms, symtab.entry); + }); // Mark exported symbols in executable sections as address-taken. 
for (Export &e : config->exports) @@ -2217,7 +2219,7 @@ void Writer::createECChunks() { void Writer::createRuntimePseudoRelocs() { std::vector rels; - for (Chunk *c : ctx.symtab.getChunks()) { + for (Chunk *c : ctx.driver.getChunks()) { auto *sc = dyn_cast(c); if (!sc || !sc->live) continue; @@ -2584,6 +2586,12 @@ void Writer::createDynamicRelocs() { coffHeaderOffset + offsetof(coff_file_header, Machine), AMD64); + if (ctx.symtab.entry != ctx.hybridSymtab->entry) + ctx.dynamicRelocs->add(IMAGE_DVRT_ARM64X_FIXUP_TYPE_VALUE, sizeof(uint32_t), + peHeaderOffset + + offsetof(pe32plus_header, AddressOfEntryPoint), + cast_or_null(ctx.hybridSymtab->entry)); + // Set the hybrid load config to the EC load config. ctx.dynamicRelocs->add(IMAGE_DVRT_ARM64X_FIXUP_TYPE_VALUE, sizeof(uint32_t), dataDirOffset64 + diff --git a/lld/Common/Strings.cpp b/lld/Common/Strings.cpp index 41cbbf36f38cb..58839e4afe5db 100644 --- a/lld/Common/Strings.cpp +++ b/lld/Common/Strings.cpp @@ -19,10 +19,14 @@ using namespace llvm; using namespace lld; -SingleStringMatcher::SingleStringMatcher(StringRef Pattern) { - if (Pattern.size() > 2 && Pattern.starts_with("\"") && - Pattern.ends_with("\"")) { - ExactMatch = true; +static bool isExact(StringRef Pattern) { + return Pattern.size() > 2 && Pattern.starts_with("\"") && + Pattern.ends_with("\""); +} + +SingleStringMatcher::SingleStringMatcher(StringRef Pattern) + : ExactMatch(isExact(Pattern)) { + if (ExactMatch) { ExactPattern = Pattern.substr(1, Pattern.size() - 2); } else { Expected Glob = GlobPattern::create(Pattern); @@ -30,7 +34,6 @@ SingleStringMatcher::SingleStringMatcher(StringRef Pattern) { error(toString(Glob.takeError()) + ": " + Pattern); return; } - ExactMatch = false; GlobPatternMatcher = *Glob; } } diff --git a/lld/MachO/BPSectionOrderer.h b/lld/MachO/BPSectionOrderer.h index 4facb652d4c87..69c6b260f044c 100644 --- a/lld/MachO/BPSectionOrderer.h +++ b/lld/MachO/BPSectionOrderer.h @@ -19,7 +19,10 @@ #include "Symbols.h" #include 
"lld/Common/BPSectionOrdererBase.h" #include "llvm/ADT/DenseMap.h" +#include "llvm/ADT/StableHashing.h" #include "llvm/ADT/StringRef.h" +#include "llvm/Support/Endian.h" +#include "llvm/Support/xxhash.h" namespace lld::macho { @@ -90,23 +93,24 @@ class BPSectionMacho : public BPSectionBase { §ionToIdx) const override { constexpr unsigned windowSize = 4; - // Calculate content hashes - size_t dataSize = isec->data.size(); - for (size_t i = 0; i < dataSize; i++) { - auto window = isec->data.drop_front(i).take_front(windowSize); - hashes.push_back(xxHash64(window)); - } + // Calculate content hashes: k-mers and the last k-1 bytes. + ArrayRef data = isec->data; + if (data.size() >= windowSize) + for (size_t i = 0; i <= data.size() - windowSize; ++i) + hashes.push_back(llvm::support::endian::read32le(data.data() + i)); + for (uint8_t byte : data.take_back(windowSize - 1)) + hashes.push_back(byte); // Calculate relocation hashes for (const auto &r : isec->relocs) { - if (r.length == 0 || r.referent.isNull() || r.offset >= isec->data.size()) + if (r.length == 0 || r.referent.isNull() || r.offset >= data.size()) continue; uint64_t relocHash = getRelocHash(r, sectionToIdx); uint32_t start = (r.offset < windowSize) ? 
0 : r.offset - windowSize + 1; for (uint32_t i = start; i < r.offset + r.length; i++) { - auto window = isec->data.drop_front(i).take_front(windowSize); - hashes.push_back(xxHash64(window) + relocHash); + auto window = data.drop_front(i).take_front(windowSize); + hashes.push_back(xxh3_64bits(window) ^ relocHash); } } @@ -124,19 +128,17 @@ class BPSectionMacho : public BPSectionBase { std::optional sectionIdx; if (auto it = sectionToIdx.find(isec); it != sectionToIdx.end()) sectionIdx = it->second; - std::string kind; + uint64_t kind = -1, value = 0; if (isec) - kind = ("Section " + Twine(isec->kind())).str(); + kind = uint64_t(isec->kind()); if (auto *sym = reloc.referent.dyn_cast()) { - kind += (" Symbol " + Twine(sym->kind())).str(); - if (auto *d = llvm::dyn_cast(sym)) { - return BPSectionBase::getRelocHash(kind, sectionIdx.value_or(0), - d->value, reloc.addend); - } + kind = (kind << 8) | uint8_t(sym->kind()); + if (auto *d = llvm::dyn_cast(sym)) + value = d->value; } - return BPSectionBase::getRelocHash(kind, sectionIdx.value_or(0), 0, - reloc.addend); + return llvm::stable_hash_combine(kind, sectionIdx.value_or(0), value, + reloc.addend); } }; diff --git a/lld/MachO/MapFile.cpp b/lld/MachO/MapFile.cpp index 12417df8cecb8..8919c8d2f9b9c 100644 --- a/lld/MachO/MapFile.cpp +++ b/lld/MachO/MapFile.cpp @@ -161,20 +161,6 @@ static uint64_t getSymSizeForMap(Defined *sym) { return sym->size; } -// Merges two vectors of input sections in order of their outSecOff values. -// This approach creates a new (temporary) vector which is not ideal but the -// ideal approach leads to a lot of code duplication. 
-static std::vector -mergeOrderedInputs(ArrayRef inputs1, - ArrayRef inputs2) { - std::vector vec(inputs1.size() + inputs2.size()); - std::merge(inputs1.begin(), inputs1.end(), inputs2.begin(), inputs2.end(), - vec.begin(), [](ConcatInputSection *a, ConcatInputSection *b) { - return a->outSecOff < b->outSecOff; - }); - return vec; -} - void macho::writeMapFile() { if (config->mapFile.empty()) return; @@ -217,15 +203,32 @@ void macho::writeMapFile() { seg->name.str().c_str(), osec->name.str().c_str()); } - // Shared function to print an array of symbols. - auto printIsecArrSyms = [&](const std::vector &arr) { - for (const ConcatInputSection *isec : arr) { - for (Defined *sym : isec->symbols) { - if (!(isPrivateLabel(sym->getName()) && getSymSizeForMap(sym) == 0)) - os << format("0x%08llX\t0x%08llX\t[%3u] %s\n", sym->getVA(), - getSymSizeForMap(sym), - readerToFileOrdinal[sym->getFile()], - sym->getName().str().data()); + // Helper lambda that prints all symbols from one ConcatInputSection. + auto printOne = [&](const ConcatInputSection *isec) { + for (Defined *sym : isec->symbols) { + if (!(isPrivateLabel(sym->getName()) && getSymSizeForMap(sym) == 0)) { + os << format("0x%08llX\t0x%08llX\t[%3u] %s\n", sym->getVA(), + getSymSizeForMap(sym), + readerToFileOrdinal.lookup(sym->getFile()), + sym->getName().str().data()); + } + } + }; + // Shared function to print one or two arrays of ConcatInputSection in + // ascending outSecOff order. The second array is optional; if provided, we + // interleave the printing in sorted order without allocating a merged temp + // array. + auto printIsecArrSyms = [&](ArrayRef arr1, + ArrayRef arr2 = {}) { + // Print both arrays in sorted order, interleaving as necessary. 
+ while (!arr1.empty() || !arr2.empty()) { + if (!arr1.empty() && (arr2.empty() || arr1.front()->outSecOff <= + arr2.front()->outSecOff)) { + printOne(arr1.front()); + arr1 = arr1.drop_front(); + } else if (!arr2.empty()) { + printOne(arr2.front()); + arr2 = arr2.drop_front(); } } }; @@ -235,9 +238,7 @@ void macho::writeMapFile() { for (const OutputSegment *seg : outputSegments) { for (const OutputSection *osec : seg->getSections()) { if (auto *textOsec = dyn_cast(osec)) { - auto inputsAndThunks = - mergeOrderedInputs(textOsec->inputs, textOsec->getThunks()); - printIsecArrSyms(inputsAndThunks); + printIsecArrSyms(textOsec->inputs, textOsec->getThunks()); } else if (auto *concatOsec = dyn_cast(osec)) { printIsecArrSyms(concatOsec->inputs); } else if (osec == in.cStringSection || osec == in.objcMethnameSection) { diff --git a/lld/MachO/Options.td b/lld/MachO/Options.td index 4b1e9e4391070..4c89f96c3ebaa 100644 --- a/lld/MachO/Options.td +++ b/lld/MachO/Options.td @@ -82,8 +82,12 @@ def print_dylib_search: Flag<["--"], "print-dylib-search">, HelpText<"Print which paths lld searched when trying to find dylibs">, Group; def icf_eq: Joined<["--"], "icf=">, - HelpText<"Set level for identical code folding (default: none)">, - MetaVarName<"[none,safe,all]">, + HelpText<"Set level for identical code folding (default: none). Possible values:\n" + " none - Disable ICF\n" + " safe - Only folds non-address significant functions (as described by `__addrsig` section)\n" + " safe_thunks - Like safe, but replaces address-significant functions with thunks\n" + " all - Fold all identical functions">, + MetaVarName<"[none,safe,safe_thunks,all]">, Group; def keep_icf_stabs: Joined<["--"], "keep-icf-stabs">, HelpText<"Generate STABS entries for symbols folded by ICF. 
These entries can then be used by dsymutil to discover the address range where folded symbols are located.">, @@ -129,7 +133,7 @@ def print_symbol_order_eq: Joined<["--"], "print-symbol-order=">, def irpgo_profile: Separate<["--"], "irpgo-profile">, Group; def irpgo_profile_eq: Joined<["--"], "irpgo-profile=">, Alias(irpgo_profile)>, MetaVarName<"">, - HelpText<"Read the IRPGO for use with -bp-startup-sort and other profile-guided optimizations">, + HelpText<"Read the IRPGO for use with --bp-startup-sort and other profile-guided optimizations">, Group; def bp_startup_sort: Joined<["--"], "bp-startup-sort=">, MetaVarName<"[none,function]">, diff --git a/lld/include/lld/Common/BPSectionOrdererBase.h b/lld/include/lld/Common/BPSectionOrdererBase.h index bd5bd638ccd2a..bbd05edc5e55e 100644 --- a/lld/include/lld/Common/BPSectionOrdererBase.h +++ b/lld/include/lld/Common/BPSectionOrdererBase.h @@ -18,7 +18,6 @@ #include "llvm/ADT/SmallVector.h" #include "llvm/ADT/StringRef.h" #include "llvm/ADT/Twine.h" -#include "llvm/Support/xxhash.h" #include #include @@ -56,14 +55,6 @@ class BPSectionBase { return P1; } - static uint64_t getRelocHash(llvm::StringRef kind, uint64_t sectionIdx, - uint64_t offset, uint64_t addend) { - return llvm::xxHash64((kind + ": " + llvm::Twine::utohexstr(sectionIdx) + - " + " + llvm::Twine::utohexstr(offset) + " + " + - llvm::Twine::utohexstr(addend)) - .str()); - } - /// Reorders sections using balanced partitioning algorithm based on profile /// data. 
static llvm::DenseMap diff --git a/lld/test/COFF/arm64x-entry.test b/lld/test/COFF/arm64x-entry.test new file mode 100644 index 0000000000000..d5363c66544a5 --- /dev/null +++ b/lld/test/COFF/arm64x-entry.test @@ -0,0 +1,92 @@ +REQUIRES: aarch64, x86 +RUN: split-file %s %t.dir && cd %t.dir + +RUN: llvm-mc -filetype=obj -triple=arm64ec-windows arm64ec-dllmain.s -o arm64ec-dllmain.obj +RUN: llvm-mc -filetype=obj -triple=aarch64-windows arm64-dllmain.s -o arm64-dllmain.obj +RUN: llvm-mc -filetype=obj -triple=arm64ec-windows arm64ec-func.s -o arm64ec-func.obj +RUN: llvm-mc -filetype=obj -triple=aarch64-windows arm64-func.s -o arm64-func.obj +RUN: llvm-mc -filetype=obj -triple=arm64ec-windows arm64-drectve.s -o arm64ec-drectve.obj +RUN: llvm-mc -filetype=obj -triple=aarch64-windows arm64-drectve.s -o arm64-drectve.obj +RUN: llvm-mc -filetype=obj -triple=arm64ec-windows %S/Inputs/loadconfig-arm64ec.s -o loadconfig-arm64ec.obj +RUN: llvm-mc -filetype=obj -triple=aarch64-windows %S/Inputs/loadconfig-arm64.s -o loadconfig-arm64.obj + +RUN: lld-link -machine:arm64x -dll -out:out.dll arm64ec-dllmain.obj arm64-dllmain.obj \ +RUN: loadconfig-arm64.obj loadconfig-arm64ec.obj + +RUN: llvm-objdump -d out.dll | FileCheck --check-prefix=DISASM %s +DISASM: Disassembly of section .text: +DISASM-EMPTY: +DISASM-NEXT: 0000000180001000 <.text>: +DISASM-NEXT: 180001000: 52800020 mov w0, #0x1 // =1 +DISASM-NEXT: 180001004: d65f03c0 ret +DISASM-NEXT: ... 
+DISASM-NEXT: 180002000: 52800040 mov w0, #0x2 // =2 +DISASM-NEXT: 180002004: d65f03c0 ret +DISASM-EMPTY: +DISASM-NEXT: Disassembly of section .hexpthk: +DISASM-EMPTY: +DISASM-NEXT: 0000000180003000 <.hexpthk>: +DISASM-NEXT: 180003000: 48 8b c4 movq %rsp, %rax +DISASM-NEXT: 180003003: 48 89 58 20 movq %rbx, 0x20(%rax) +DISASM-NEXT: 180003007: 55 pushq %rbp +DISASM-NEXT: 180003008: 5d popq %rbp +DISASM-NEXT: 180003009: e9 f2 ef ff ff jmp 0x180002000 <.text+0x1000> +DISASM-NEXT: 18000300e: cc int3 +DISASM-NEXT: 18000300f: cc int3 + +RUN: llvm-readobj --headers out.dll | FileCheck --check-prefix=READOBJ %s +READOBJ: AddressOfEntryPoint: 0x1000 +READOBJ: HybridObject { +READOBJ: AddressOfEntryPoint: 0x3000 +READOBJ: } + +RUN: lld-link -machine:arm64x -dll -out:out2.dll arm64ec-func.obj arm64-func.obj \ +RUN: arm64ec-drectve.obj loadconfig-arm64.obj loadconfig-arm64ec.obj +RUN: llvm-objdump -d out2.dll | FileCheck --check-prefix=DISASM %s +RUN: llvm-readobj --headers --coff-load-config out2.dll | FileCheck --check-prefix=READOBJ %s + +RUN: lld-link -machine:arm64x -dll -out:out3.dll arm64ec-func.obj arm64-func.obj \ +RUN: arm64-drectve.obj loadconfig-arm64.obj loadconfig-arm64ec.obj +RUN: llvm-objdump -d out3.dll | FileCheck --check-prefix=DISASM %s +RUN: llvm-readobj --headers --coff-load-config out3.dll | FileCheck --check-prefix=READOBJ %s + +RUN: lld-link -machine:arm64x -dll -out:out4.dll arm64ec-func.obj arm64-func.obj \ +RUN: loadconfig-arm64.obj loadconfig-arm64ec.obj -entry:func +RUN: llvm-objdump -d out4.dll | FileCheck --check-prefix=DISASM %s +RUN: llvm-readobj --headers --coff-load-config out4.dll | FileCheck --check-prefix=READOBJ %s + +#--- arm64-dllmain.s + .section .text,"xr",discard,_DllMainCRTStartup + .globl _DllMainCRTStartup + .p2align 2 +_DllMainCRTStartup: + mov w0, #1 + ret + +#--- arm64ec-dllmain.s + .section .text,"xr",discard,_DllMainCRTStartup + .globl _DllMainCRTStartup + .p2align 2 +_DllMainCRTStartup: + mov w0, #2 + ret + +#--- 
arm64-func.s + .section .text,"xr",discard,func + .globl func + .p2align 2 +func: + mov w0, #1 + ret + +#--- arm64ec-func.s + .section .text,"xr",discard,func + .globl func + .p2align 2 +func: + mov w0, #2 + ret + +#--- arm64-drectve.s +.section .drectve + .ascii "-entry:func" diff --git a/lld/test/COFF/arm64x-loadconfig.s b/lld/test/COFF/arm64x-loadconfig.s index f413adff2868c..12f7387d9ecdc 100644 --- a/lld/test/COFF/arm64x-loadconfig.s +++ b/lld/test/COFF/arm64x-loadconfig.s @@ -7,6 +7,7 @@ // RUN: llvm-mc -filetype=obj -triple=arm64ec-windows loadconfig-ec.s -o loadconfig-ec.obj // RUN: llvm-mc -filetype=obj -triple=aarch64-windows loadconfig-short.s -o loadconfig-short.obj // RUN: llvm-mc -filetype=obj -triple=arm64ec-windows loadconfig-short.s -o loadconfig-short-arm64ec.obj +// RUN: llvm-lib -machine:arm64x -out:loadconfig.lib loadconfig.obj loadconfig-ec.obj // RUN: lld-link -machine:arm64x -out:out-warn.dll -dll -noentry test.obj \ // RUN: 2>&1 | FileCheck --check-prefixes=WARN-LOADCFG,WARN-EC-LOADCFG %s @@ -158,6 +159,10 @@ // BASERELOC-NEXT: Address: 0x2074 // BASERELOC-NEXT: } +// RUN: lld-link -machine:arm64x -out:out-hyb-lib.dll -dll -noentry loadconfig.lib chpe.obj test.obj +// RUN: llvm-readobj --coff-load-config out-hyb-lib.dll | FileCheck --check-prefix=LOADCFG %s +// RUN: llvm-readobj --coff-basereloc out-hyb-lib.dll | FileCheck --check-prefix=BASERELOC %s + #--- test.s .data sym: diff --git a/lld/test/COFF/empty-section-decl.yaml b/lld/test/COFF/empty-section-decl.yaml new file mode 100644 index 0000000000000..320df34000028 --- /dev/null +++ b/lld/test/COFF/empty-section-decl.yaml @@ -0,0 +1,56 @@ +# REQUIRES: x86 + +# RUN: yaml2obj %s -o %t.obj +# RUN: lld-link -dll -out:%t.dll %t.obj -noentry -subsystem:console -lldmap:%t.map +# RUN: llvm-objdump -s %t.dll | FileCheck %s +# RUN: FileCheck %s --check-prefix=MAP < %t.map + +# CHECK: Contents of section .itest: +# CHECK-NEXT: 180001000 0c100080 01000000 00000000 01000000 + +# MAP: 00001000 
0000000a 4 {{.*}}:(.itest$2) +# MAP: 00001000 00000000 0 .itest$2 +# MAP: 0000100c 00000000 4 {{.*}}:(.itest$4) +# MAP: 0000100c 00000000 0 .itest$4 +# MAP: 0000100c 00000004 2 {{.*}}:(.itest$6) +# MAP: 0000100c 00000000 0 .itest$6 + +--- !COFF +header: + Machine: IMAGE_FILE_MACHINE_AMD64 + Characteristics: [ ] +sections: + - Name: '.itest$2' + Characteristics: [ IMAGE_SCN_CNT_INITIALIZED_DATA, IMAGE_SCN_MEM_READ, IMAGE_SCN_MEM_WRITE ] + Alignment: 4 + SectionData: '00000000000000000000' + SizeOfRawData: 10 + Relocations: + - VirtualAddress: 0 + SymbolName: '.itest$4' + Type: IMAGE_REL_AMD64_ADDR64 + - Name: '.itest$6' + Characteristics: [ IMAGE_SCN_CNT_INITIALIZED_DATA, IMAGE_SCN_MEM_READ, IMAGE_SCN_MEM_WRITE ] + Alignment: 2 + SectionData: 01000000 + SizeOfRawData: 4 +symbols: + - Name: '.itest$2' + Value: 0 + SectionNumber: 1 + SimpleType: IMAGE_SYM_TYPE_NULL + ComplexType: IMAGE_SYM_DTYPE_NULL + StorageClass: IMAGE_SYM_CLASS_SECTION + - Name: '.itest$6' + Value: 0 + SectionNumber: 2 + SimpleType: IMAGE_SYM_TYPE_NULL + ComplexType: IMAGE_SYM_DTYPE_NULL + StorageClass: IMAGE_SYM_CLASS_STATIC + - Name: '.itest$4' + Value: 0 + SectionNumber: 0 + SimpleType: IMAGE_SYM_TYPE_NULL + ComplexType: IMAGE_SYM_DTYPE_NULL + StorageClass: IMAGE_SYM_CLASS_SECTION +... 
diff --git a/lld/test/COFF/lto-arm64x.ll b/lld/test/COFF/lto-arm64x.ll new file mode 100644 index 0000000000000..bbfc6b64c6fce --- /dev/null +++ b/lld/test/COFF/lto-arm64x.ll @@ -0,0 +1,47 @@ +; REQUIRES: aarch64, x86 +; RUN: split-file %s %t.dir && cd %t.dir + +; RUN: llvm-as arm64ec.ll -o arm64ec.obj +; RUN: llvm-as aarch64.ll -o aarch64.obj +; RUN: llvm-mc -filetype=obj -triple=aarch64-windows %S/Inputs/loadconfig-arm64.s -o loadconfig-arm64.obj +; RUN: llvm-mc -filetype=obj -triple=arm64ec-windows %S/Inputs/loadconfig-arm64ec.s -o loadconfig-arm64ec.obj + +; RUN: lld-link -machine:arm64x aarch64.obj arm64ec.obj loadconfig-arm64.obj loadconfig-arm64ec.obj -out:out.exe -subsystem:console +; RUN: llvm-objdump -d out.exe | FileCheck %s + +; CHECK: 0000000140001000 <.text>: +; CHECK-NEXT: 140001000: 52800020 mov w0, #0x1 // =1 +; CHECK-NEXT: 140001004: d65f03c0 ret +; CHECK-NEXT: ... +; CHECK-NEXT: 140002000: 00000009 udf #0x9 +; CHECK-NEXT: 140002004: 52800040 mov w0, #0x2 // =2 +; CHECK-NEXT: 140002008: d65f03c0 ret + +; CHECK: 0000000140003000 <.hexpthk>: +; CHECK-NEXT: 140003000: 48 8b c4 movq %rsp, %rax +; CHECK-NEXT: 140003003: 48 89 58 20 movq %rbx, 0x20(%rax) +; CHECK-NEXT: 140003007: 55 pushq %rbp +; CHECK-NEXT: 140003008: 5d popq %rbp +; CHECK-NEXT: 140003009: e9 f6 ef ff ff jmp 0x140002004 <.text+0x1004> +; CHECK-NEXT: 14000300e: cc int3 +; CHECK-NEXT: 14000300f: cc int3 + +#--- arm64ec.ll + +target datalayout = "e-m:w-p270:32:32-p271:32:32-p272:64:64-p:64:64-i32:32-i64:64-i128:128-n32:64-S128-Fn32" +target triple = "arm64ec-unknown-windows-msvc" + +define dso_local i32 @mainCRTStartup() { +entry: + ret i32 2 +} + +#--- aarch64.ll + +target datalayout = "e-m:w-p270:32:32-p271:32:32-p272:64:64-p:64:64-i32:32-i64:64-i128:128-n32:64-S128-Fn32" +target triple = "aarch64-unknown-windows-msvc" + +define dso_local i32 @mainCRTStartup() { +entry: + ret i32 1 +} diff --git a/lld/test/COFF/subsystem-arm64x.test b/lld/test/COFF/subsystem-arm64x.test new file mode 
100644 index 0000000000000..68438b6f6f43b --- /dev/null +++ b/lld/test/COFF/subsystem-arm64x.test @@ -0,0 +1,41 @@ +REQUIRES: aarch64 +RUN: split-file %s %t.dir && cd %t.dir + +RUN: llvm-mc -filetype=obj -triple=arm64ec-windows main.s -o main-arm64ec.obj +RUN: llvm-mc -filetype=obj -triple=aarch64-windows main.s -o main-arm64.obj +RUN: llvm-mc -filetype=obj -triple=arm64ec-windows winmain.s -o winmain-arm64ec.obj +RUN: llvm-mc -filetype=obj -triple=aarch64-windows winmain.s -o winmain-arm64.obj +RUN: llvm-mc -filetype=obj -triple=arm64ec-windows %S/Inputs/loadconfig-arm64ec.s -o loadconfig-arm64ec.obj +RUN: llvm-mc -filetype=obj -triple=aarch64-windows %S/Inputs/loadconfig-arm64.s -o loadconfig-arm64.obj + +Check that the subsystem is inferred from EC symbols. + +RUN: lld-link -machine:arm64x -entry:entry -out:out.exe main-arm64.obj winmain-arm64ec.obj \ +RUN: loadconfig-arm64.obj loadconfig-arm64ec.obj +RUN: llvm-readobj --headers out.exe | FileCheck --check-prefix=GUI %s +GUI: Subsystem: IMAGE_SUBSYSTEM_WINDOWS_GUI (0x2) +GUI: Subsystem: IMAGE_SUBSYSTEM_WINDOWS_GUI (0x2) + +RUN: lld-link -machine:arm64x -entry:entry -out:out.exe main-arm64ec.obj winmain-arm64.obj \ +RUN: loadconfig-arm64.obj loadconfig-arm64ec.obj +RUN: llvm-readobj --headers out.exe | FileCheck --check-prefix=CUI %s +CUI: Subsystem: IMAGE_SUBSYSTEM_WINDOWS_CUI (0x3) +CUI: Subsystem: IMAGE_SUBSYSTEM_WINDOWS_CUI (0x3) + +#--- main.s + .globl "#main" + .globl main + .globl entry +"#main": +main: +entry: + ret + +#--- winmain.s + .globl "#WinMain" + .globl WinMain + .globl entry +"#WinMain": +WinMain: +entry: + ret diff --git a/lld/test/COFF/wholearchive-implib.s b/lld/test/COFF/wholearchive-implib.s new file mode 100644 index 0000000000000..0c98ca0ddef07 --- /dev/null +++ b/lld/test/COFF/wholearchive-implib.s @@ -0,0 +1,35 @@ +// REQUIRES: x86 +// RUN: split-file %s %t.dir +// RUN: llvm-lib -machine:amd64 -out:%t.lib -def:%t.dir/lib.def +// RUN: llvm-mc -filetype=obj -triple=x86_64-windows 
%t.dir/main.s -o %t.main.obj + +// RUN: lld-link -out:%t.exe %t.main.obj -wholearchive:%t.lib -entry:entry -subsystem:console +// RUN: llvm-readobj --coff-imports %t.exe | FileCheck %s + +// As LLD usually doesn't use the header/trailer object files from import +// libraries, but instead synthesizes those structures, we end up with two +// import directory entries if we force those objects to be included. + +// CHECK: Import { +// CHECK-NEXT: Name: lib.dll +// CHECK-NEXT: ImportLookupTableRVA: 0x2050 +// CHECK-NEXT: ImportAddressTableRVA: 0x2068 +// CHECK-NEXT: } +// CHECK-NEXT: Import { +// CHECK-NEXT: Name: lib.dll +// CHECK-NEXT: ImportLookupTableRVA: 0x2058 +// CHECK-NEXT: ImportAddressTableRVA: 0x2070 +// CHECK-NEXT: Symbol: func (0) +// CHECK-NEXT: } + + +#--- main.s +.global entry +entry: + call func + ret + +#--- lib.def +LIBRARY lib.dll +EXPORTS +func diff --git a/lld/test/MachO/arm64-thunks.s b/lld/test/MachO/arm64-thunks.s index 858a27dfe36af..76c7d108104d1 100644 --- a/lld/test/MachO/arm64-thunks.s +++ b/lld/test/MachO/arm64-thunks.s @@ -17,13 +17,7 @@ # RUN: %lld -arch arm64 -dead_strip -lSystem -U _extern_sym -map %t/thunk.map -o %t/thunk %t/input.o # RUN: llvm-objdump --no-print-imm-hex -d --no-show-raw-insn %t/thunk | FileCheck %s -## Check that the thunks appear in the map file and that everything is sorted by address -# Because of the `.space` instructions, there will end up being a lot of dead symbols in the -# linker map (linker map will be ~2.7GB). So to avoid the test trying to (slowly) match regex -# across all the ~2.7GB of the linker map - generate a version of the linker map without dead symbols. 
-# RUN: awk '/# Dead Stripped Symbols:/ {exit} {print}' %t/thunk.map > %t/thunk_no_dead_syms.map - -# RUN: FileCheck %s --input-file %t/thunk_no_dead_syms.map --check-prefix=MAP +# RUN: FileCheck %s --input-file %t/thunk.map --check-prefix=MAP # MAP: 0x{{[[:xdigit:]]+}} {{.*}} _b # MAP-NEXT: 0x{{[[:xdigit:]]+}} {{.*}} _c @@ -339,7 +333,12 @@ _main: ret .section __TEXT,__cstring - .space 0x4000000 + # The .space below has to be composed of non-zero characters. Otherwise, the + # linker will create a symbol for every '0' in the section, leading to + # dramatic memory usage and a huge linker map file + .space 0x4000000, 'A' + .byte 0 + .section __TEXT,__lcxx_override,regular,pure_instructions diff --git a/lld/wasm/SyntheticSections.cpp b/lld/wasm/SyntheticSections.cpp index 715fba1ee6da5..7fb44b9f0c009 100644 --- a/lld/wasm/SyntheticSections.cpp +++ b/lld/wasm/SyntheticSections.cpp @@ -594,7 +594,7 @@ void ElemSection::writeBody() { } writeInitExpr(os, initExpr); - if (flags & WASM_ELEM_SEGMENT_MASK_HAS_ELEM_KIND) { + if (flags & WASM_ELEM_SEGMENT_MASK_HAS_ELEM_DESC) { // We only write active function table initializers, for which the elem kind // is specified to be written as 0x00 and interpreted to mean "funcref". 
const uint8_t elemKind = 0; diff --git a/lldb/bindings/headers.swig b/lldb/bindings/headers.swig index c0dde905f986b..5e7c54d1eb839 100644 --- a/lldb/bindings/headers.swig +++ b/lldb/bindings/headers.swig @@ -52,6 +52,7 @@ #include "lldb/API/SBProcess.h" #include "lldb/API/SBProcessInfo.h" #include "lldb/API/SBProcessInfoList.h" +#include "lldb/API/SBProgress.h" #include "lldb/API/SBQueue.h" #include "lldb/API/SBQueueItem.h" #include "lldb/API/SBReproducer.h" diff --git a/lldb/bindings/interface/SBProgressDocstrings.i b/lldb/bindings/interface/SBProgressDocstrings.i new file mode 100644 index 0000000000000..2997fe619fcc7 --- /dev/null +++ b/lldb/bindings/interface/SBProgressDocstrings.i @@ -0,0 +1,14 @@ +%feature("docstring", +"A Progress indicator helper class. + +Any potentially long running sections of code in LLDB should report +progress so that clients are aware of delays that might appear during +debugging. Delays commonly include indexing debug information, parsing +symbol tables for object files, downloading symbols from remote +repositories, and many more things. 
+ +The Progress class helps make sure that progress is correctly reported +and will always send an initial progress update, updates when +Progress::Increment() is called, and also will make sure that a progress +completed update is reported even if the user doesn't explicitly cause one +to be sent.") lldb::SBProgress; diff --git a/lldb/bindings/interfaces.swig b/lldb/bindings/interfaces.swig index 8a6fed95f0b72..08df9a1a8d539 100644 --- a/lldb/bindings/interfaces.swig +++ b/lldb/bindings/interfaces.swig @@ -54,6 +54,7 @@ %include "./interface/SBPlatformDocstrings.i" %include "./interface/SBProcessDocstrings.i" %include "./interface/SBProcessInfoDocstrings.i" +%include "./interface/SBProgressDocstrings.i" %include "./interface/SBQueueDocstrings.i" %include "./interface/SBQueueItemDocstrings.i" %include "./interface/SBReproducerDocstrings.i" @@ -133,6 +134,7 @@ %include "lldb/API/SBProcess.h" %include "lldb/API/SBProcessInfo.h" %include "lldb/API/SBProcessInfoList.h" +%include "lldb/API/SBProgress.h" %include "lldb/API/SBQueue.h" %include "lldb/API/SBQueueItem.h" %include "lldb/API/SBReproducer.h" diff --git a/lldb/examples/synthetic/gnu_libstdcpp.py b/lldb/examples/synthetic/gnu_libstdcpp.py index a6605a7a7eb5b..20b9488af5597 100644 --- a/lldb/examples/synthetic/gnu_libstdcpp.py +++ b/lldb/examples/synthetic/gnu_libstdcpp.py @@ -61,19 +61,11 @@ class StdUnorderedMapSynthProvider: def __init__(self, valobj, dict): self.valobj = valobj self.count = None - self.kind = self.get_object_kind(valobj) - - def get_object_kind(self, valobj): - type_name = valobj.GetTypeName() - return "set" if "set" in type_name else "map" def extract_type(self): type = self.valobj.GetType() - # type of std::pair is the first template - # argument type of the 4th template argument to std::map and - # 3rd template argument for std::set. That's why - # we need to know kind of the object - template_arg_num = 4 if self.kind == "map" else 3 + # The last template argument is the allocator type. 
+ template_arg_num = type.GetNumberOfTemplateArguments() - 1 allocator_type = type.GetTemplateArgumentType(template_arg_num) data_type = allocator_type.GetTemplateArgumentType(0) return data_type diff --git a/lldb/include/lldb/API/SBDebugger.h b/lldb/include/lldb/API/SBDebugger.h index 787bd040dd15b..eb371e33c4951 100644 --- a/lldb/include/lldb/API/SBDebugger.h +++ b/lldb/include/lldb/API/SBDebugger.h @@ -203,7 +203,7 @@ class LLDB_API SBDebugger { lldb::SBCommandInterpreter GetCommandInterpreter(); void HandleCommand(const char *command); - + void RequestInterrupt(); void CancelInterruptRequest(); bool InterruptRequested(); @@ -517,6 +517,7 @@ class LLDB_API SBDebugger { friend class SBPlatform; friend class SBTarget; friend class SBTrace; + friend class SBProgress; lldb::SBTarget FindTargetWithLLDBProcess(const lldb::ProcessSP &processSP); diff --git a/lldb/include/lldb/API/SBProgress.h b/lldb/include/lldb/API/SBProgress.h new file mode 100644 index 0000000000000..d2eaf0a743cb3 --- /dev/null +++ b/lldb/include/lldb/API/SBProgress.h @@ -0,0 +1,66 @@ +//===-- SBProgress.h --------------------------------------------*- C++ -*-===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#ifndef LLDB_API_SBPROGRESS_H +#define LLDB_API_SBPROGRESS_H + +#include "lldb/API/SBDebugger.h" +#include "lldb/API/SBDefines.h" + +namespace lldb { + +/// A Progress indicator helper class. +/// +/// Any potentially long running sections of code in LLDB should report +/// progress so that clients are aware of delays that might appear during +/// debugging. Delays commonly include indexing debug information, parsing +/// symbol tables for object files, downloading symbols from remote +/// repositories, and many more things. 
+/// +/// The Progress class helps make sure that progress is correctly reported +/// and will always send an initial progress update, updates when +/// Progress::Increment() is called, and also will make sure that a progress +/// completed update is reported even if the user doesn't explicitly cause one +/// to be sent. +class LLDB_API SBProgress { +public: + /// Construct a progress object with a title, details and a given debugger. + /// \param title + /// The title of the progress object. + /// \param details + /// The details of the progress object. + /// \param debugger + /// The debugger for this progress object to report to. + SBProgress(const char *title, const char *details, SBDebugger &debugger); + + /// Construct a progress object with a title, details, the total units of work + /// to be done, and a given debugger. + /// \param title + /// The title of the progress object. + /// \param details + /// The details of the progress object. + /// \param total_units + /// The total number of units of work to be done. + /// \param debugger + /// The debugger for this progress object to report to. + SBProgress(const char *title, const char *details, uint64_t total_units, + SBDebugger &debugger); + + ~SBProgress(); + + void Increment(uint64_t amount, const char *description = nullptr); + +protected: + lldb_private::Progress &ref() const; + +private: + std::unique_ptr m_opaque_up; +}; // SBProgress +} // namespace lldb + +#endif // LLDB_API_SBPROGRESS_H diff --git a/lldb/include/lldb/API/SBSaveCoreOptions.h b/lldb/include/lldb/API/SBSaveCoreOptions.h index 74aa2fe5bd5f9..7852858f8ade9 100644 --- a/lldb/include/lldb/API/SBSaveCoreOptions.h +++ b/lldb/include/lldb/API/SBSaveCoreOptions.h @@ -14,6 +14,7 @@ #include "lldb/API/SBFileSpec.h" #include "lldb/API/SBProcess.h" #include "lldb/API/SBThread.h" +#include "lldb/API/SBThreadCollection.h" namespace lldb { @@ -111,11 +112,19 @@ class LLDB_API SBSaveCoreOptions { /// style specific regions. 
SBError AddMemoryRegionToSave(const SBMemoryRegionInfo ®ion); + /// Get an unsorted copy of all threads to save + /// + /// \returns + /// An unsorted copy of all threads to save. If no process is specified + /// an empty collection will be returned. + SBThreadCollection GetThreadsToSave() const; + /// Reset all options. void Clear(); protected: friend class SBProcess; + friend class SBThreadCollection; lldb_private::SaveCoreOptions &ref() const; private: diff --git a/lldb/include/lldb/API/SBThreadCollection.h b/lldb/include/lldb/API/SBThreadCollection.h index fe57a6b95d909..5a052e6246026 100644 --- a/lldb/include/lldb/API/SBThreadCollection.h +++ b/lldb/include/lldb/API/SBThreadCollection.h @@ -48,7 +48,7 @@ class LLDB_API SBThreadCollection { private: friend class SBProcess; friend class SBThread; - + friend class SBSaveCoreOptions; lldb::ThreadCollectionSP m_opaque_sp; }; diff --git a/lldb/include/lldb/Interpreter/Interfaces/OperatingSystemInterface.h b/lldb/include/lldb/Interpreter/Interfaces/OperatingSystemInterface.h index 3c46f99f3b356..58aab7ec914dd 100644 --- a/lldb/include/lldb/Interpreter/Interfaces/OperatingSystemInterface.h +++ b/lldb/include/lldb/Interpreter/Interfaces/OperatingSystemInterface.h @@ -27,6 +27,8 @@ class OperatingSystemInterface : virtual public ScriptedThreadInterface { virtual std::optional GetRegisterContextForTID(lldb::tid_t tid) { return std::nullopt; } + + virtual std::optional DoesPluginReportAllThreads() { return {}; } }; } // namespace lldb_private diff --git a/lldb/include/lldb/Symbol/Function.h b/lldb/include/lldb/Symbol/Function.h index 157c007bdf0e8..d0b27269568b0 100644 --- a/lldb/include/lldb/Symbol/Function.h +++ b/lldb/include/lldb/Symbol/Function.h @@ -454,6 +454,11 @@ class Function : public UserID, public SymbolContextScope { /// and variables). 
const Address &GetAddress() const { return m_address; } + bool GetRangeContainingLoadAddress(lldb::addr_t load_addr, Target &target, + AddressRange &range) { + return m_block.GetRangeContainingLoadAddress(load_addr, target, range); + } + lldb::LanguageType GetLanguage() const; /// Find the file and line number of the source location of the start of the /// function. This will use the declaration if present and fall back on the diff --git a/lldb/include/lldb/Symbol/SaveCoreOptions.h b/lldb/include/lldb/Symbol/SaveCoreOptions.h index d90d08026016d..bcf0087fbea5c 100644 --- a/lldb/include/lldb/Symbol/SaveCoreOptions.h +++ b/lldb/include/lldb/Symbol/SaveCoreOptions.h @@ -9,11 +9,11 @@ #ifndef LLDB_SOURCE_PLUGINS_OBJECTFILE_SaveCoreOPTIONS_H #define LLDB_SOURCE_PLUGINS_OBJECTFILE_SaveCoreOPTIONS_H +#include "lldb/Target/ThreadCollection.h" #include "lldb/Utility/FileSpec.h" #include "lldb/Utility/RangeMap.h" #include -#include #include #include @@ -47,6 +47,8 @@ class SaveCoreOptions { void AddMemoryRegionToSave(const lldb_private::MemoryRegionInfo ®ion); + lldb_private::ThreadCollection::collection GetThreadsToSave() const; + void Clear(); private: diff --git a/lldb/include/lldb/Symbol/SymbolContext.h b/lldb/include/lldb/Symbol/SymbolContext.h index f65f57b0d1103..07769cd8dffae 100644 --- a/lldb/include/lldb/Symbol/SymbolContext.h +++ b/lldb/include/lldb/Symbol/SymbolContext.h @@ -91,15 +91,6 @@ class SymbolContext { /// their default state. void Clear(bool clear_target); - /// Dump a description of this object to a Stream. - /// - /// Dump a description of the contents of this object to the supplied stream - /// \a s. - /// - /// \param[in] s - /// The stream to which to dump the object description. - void Dump(Stream *s, Target *target) const; - /// Dump the stop context in this object to a Stream. /// /// Dump the best description of this object to the stream. 
The information diff --git a/lldb/include/lldb/Target/OperatingSystem.h b/lldb/include/lldb/Target/OperatingSystem.h index ceeddceb0f2c1..128239569790f 100644 --- a/lldb/include/lldb/Target/OperatingSystem.h +++ b/lldb/include/lldb/Target/OperatingSystem.h @@ -61,6 +61,8 @@ class OperatingSystem : public PluginInterface { virtual bool IsOperatingSystemPluginThread(const lldb::ThreadSP &thread_sp); + virtual bool DoesPluginReportAllThreads() = 0; + protected: // Member variables. Process diff --git a/lldb/include/lldb/Target/Thread.h b/lldb/include/lldb/Target/Thread.h index 38b65b2bc5849..ef66fa11574db 100644 --- a/lldb/include/lldb/Target/Thread.h +++ b/lldb/include/lldb/Target/Thread.h @@ -200,11 +200,14 @@ class Thread : public std::enable_shared_from_this, /// The User resume state for this thread. lldb::StateType GetResumeState() const { return m_resume_state; } - // This function is called on all the threads before "ShouldResume" and - // "WillResume" in case a thread needs to change its state before the - // ThreadList polls all the threads to figure out which ones actually will - // get to run and how. - void SetupForResume(); + /// This function is called on all the threads before "ShouldResume" and + /// "WillResume" in case a thread needs to change its state before the + /// ThreadList polls all the threads to figure out which ones actually will + /// get to run and how. + /// + /// \return + /// True if we pushed a ThreadPlanStepOverBreakpoint + bool SetupForResume(); // Do not override this function, it is for thread plan logic only bool ShouldResume(lldb::StateType resume_state); diff --git a/lldb/include/lldb/Utility/LLDBAssert.h b/lldb/include/lldb/Utility/LLDBAssert.h index aeef3e51e20a8..21dbdb3b3202d 100644 --- a/lldb/include/lldb/Utility/LLDBAssert.h +++ b/lldb/include/lldb/Utility/LLDBAssert.h @@ -19,24 +19,30 @@ // __FILE__ but only renders the last path component (the filename) instead of // an invocation dependent full path to that file. 
#define lldbassert(x) \ - lldb_private::lldb_assert(static_cast(x), #x, __FUNCTION__, \ - __FILE_NAME__, __LINE__) + lldb_private::_lldb_assert(static_cast(x), #x, __FUNCTION__, \ + __FILE_NAME__, __LINE__) #else #define lldbassert(x) \ - lldb_private::lldb_assert(static_cast(x), #x, __FUNCTION__, __FILE__, \ - __LINE__) + lldb_private::_lldb_assert(static_cast(x), #x, __FUNCTION__, __FILE__, \ + __LINE__) #endif #endif namespace lldb_private { -void lldb_assert(bool expression, const char *expr_text, const char *func, - const char *file, unsigned int line); +/// Don't use _lldb_assert directly. Use the lldbassert macro instead so that +/// LLDB asserts become regular asserts in NDEBUG builds. +void _lldb_assert(bool expression, const char *expr_text, const char *func, + const char *file, unsigned int line); + +/// The default LLDB assert callback, which prints to stderr. typedef void (*LLDBAssertCallback)(llvm::StringRef message, llvm::StringRef backtrace, llvm::StringRef prompt); +/// Replace the LLDB assert callback. 
void SetLLDBAssertCallback(LLDBAssertCallback callback); + } // namespace lldb_private #endif // LLDB_UTILITY_LLDBASSERT_H diff --git a/lldb/include/lldb/lldb-forward.h b/lldb/include/lldb/lldb-forward.h index d09edeeccaff1..fc7456a4b9a32 100644 --- a/lldb/include/lldb/lldb-forward.h +++ b/lldb/include/lldb/lldb-forward.h @@ -233,6 +233,7 @@ class Symtab; class SyntheticChildren; class SyntheticChildrenFrontEnd; class SystemRuntime; +class Progress; class Target; class TargetList; class TargetProperties; diff --git a/lldb/source/API/CMakeLists.txt b/lldb/source/API/CMakeLists.txt index d8308841c05db..147b30f3b0026 100644 --- a/lldb/source/API/CMakeLists.txt +++ b/lldb/source/API/CMakeLists.txt @@ -83,6 +83,7 @@ add_lldb_library(liblldb SHARED ${option_framework} SBModule.cpp SBModuleSpec.cpp SBPlatform.cpp + SBProgress.cpp SBProcess.cpp SBProcessInfo.cpp SBProcessInfoList.cpp diff --git a/lldb/source/API/SBProgress.cpp b/lldb/source/API/SBProgress.cpp new file mode 100644 index 0000000000000..d6ed5f0d15fc9 --- /dev/null +++ b/lldb/source/API/SBProgress.cpp @@ -0,0 +1,43 @@ +//===-- SBProgress.cpp --------------------------------------------------*-===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#include "lldb/API/SBProgress.h" +#include "lldb/Core/Progress.h" +#include "lldb/Utility/Instrumentation.h" + +using namespace lldb; + +SBProgress::SBProgress(const char *title, const char *details, + SBDebugger &debugger) { + LLDB_INSTRUMENT_VA(this, title, details, debugger); + + m_opaque_up = std::make_unique( + title, details, /*total=*/std::nullopt, debugger.get(), + /*minimum_report_time=*/std::nullopt, + lldb_private::Progress::Origin::eExternal); +} + +SBProgress::SBProgress(const char *title, const char *details, + uint64_t total_units, SBDebugger &debugger) { + LLDB_INSTRUMENT_VA(this, title, details, total_units, debugger); + + m_opaque_up = std::make_unique( + title, details, total_units, debugger.get(), + /*minimum_report_time=*/std::nullopt, + lldb_private::Progress::Origin::eExternal); +} + +SBProgress::~SBProgress() = default; + +void SBProgress::Increment(uint64_t amount, const char *description) { + LLDB_INSTRUMENT_VA(amount, description); + + m_opaque_up->Increment(amount, description); +} + +lldb_private::Progress &SBProgress::ref() const { return *m_opaque_up; } diff --git a/lldb/source/API/SBSaveCoreOptions.cpp b/lldb/source/API/SBSaveCoreOptions.cpp index c79b57fa62c2b..35b9da569dfa1 100644 --- a/lldb/source/API/SBSaveCoreOptions.cpp +++ b/lldb/source/API/SBSaveCoreOptions.cpp @@ -10,6 +10,7 @@ #include "lldb/API/SBMemoryRegionInfo.h" #include "lldb/Host/FileSystem.h" #include "lldb/Symbol/SaveCoreOptions.h" +#include "lldb/Target/ThreadCollection.h" #include "lldb/Utility/Instrumentation.h" #include "Utils.h" @@ -100,6 +101,14 @@ SBSaveCoreOptions::AddMemoryRegionToSave(const SBMemoryRegionInfo ®ion) { return SBError(); } +lldb::SBThreadCollection SBSaveCoreOptions::GetThreadsToSave() const { + LLDB_INSTRUMENT_VA(this); + lldb::ThreadCollectionSP threadcollection_sp = + std::make_shared( + 
m_opaque_up->GetThreadsToSave()); + return SBThreadCollection(threadcollection_sp); +} + void SBSaveCoreOptions::Clear() { LLDB_INSTRUMENT_VA(this); m_opaque_up->Clear(); diff --git a/lldb/source/API/SBThread.cpp b/lldb/source/API/SBThread.cpp index 4e61c83889b0b..cc848076dab5f 100644 --- a/lldb/source/API/SBThread.cpp +++ b/lldb/source/API/SBThread.cpp @@ -842,7 +842,6 @@ SBError SBThread::StepOverUntil(lldb::SBFrame &sb_frame, // appropriate error message. bool all_in_function = true; - AddressRange fun_range = frame_sc.function->GetAddressRange(); std::vector step_over_until_addrs; const bool abort_other_plans = false; @@ -859,7 +858,9 @@ SBError SBThread::StepOverUntil(lldb::SBFrame &sb_frame, addr_t step_addr = sc.line_entry.range.GetBaseAddress().GetLoadAddress(target); if (step_addr != LLDB_INVALID_ADDRESS) { - if (fun_range.ContainsLoadAddress(step_addr, target)) + AddressRange unused_range; + if (frame_sc.function->GetRangeContainingLoadAddress(step_addr, *target, + unused_range)) step_over_until_addrs.push_back(step_addr); else all_in_function = false; diff --git a/lldb/source/Core/Debugger.cpp b/lldb/source/Core/Debugger.cpp index 6ceb209269c9e..2df2aeb20aa26 100644 --- a/lldb/source/Core/Debugger.cpp +++ b/lldb/source/Core/Debugger.cpp @@ -1952,7 +1952,8 @@ lldb::thread_result_t Debugger::DefaultEventHandler() { listener_sp->StartListeningForEvents( &m_broadcaster, lldb::eBroadcastBitProgress | lldb::eBroadcastBitWarning | lldb::eBroadcastBitError | - lldb::eBroadcastSymbolChange); + lldb::eBroadcastSymbolChange | + lldb::eBroadcastBitExternalProgress); // Let the thread that spawned us know that we have started up and that we // are now listening to all required events so no events get missed diff --git a/lldb/source/Core/Mangled.cpp b/lldb/source/Core/Mangled.cpp index 387c4fac6b0f8..51c22495a16d7 100644 --- a/lldb/source/Core/Mangled.cpp +++ b/lldb/source/Core/Mangled.cpp @@ -79,8 +79,10 @@ Mangled::ManglingScheme 
Mangled::GetManglingScheme(llvm::StringRef const name) { // Swift 4.2 used "$S" and "_$S". // Swift 5 and onward uses "$s" and "_$s". // Swift also uses "@__swiftmacro_" as a prefix for mangling filenames. + // Embedded Swift introduced "$e" and "_$e" as Swift mangling prefixes. if (name.starts_with("$S") || name.starts_with("_$S") || name.starts_with("$s") || name.starts_with("_$s") || + name.starts_with("$e") || name.starts_with("_$e") || name.starts_with("@__swiftmacro_")) return Mangled::eManglingSchemeSwift; diff --git a/lldb/source/Expression/LLVMUserExpression.cpp b/lldb/source/Expression/LLVMUserExpression.cpp index 529ac462dfd8f..fac3ce6f5799d 100644 --- a/lldb/source/Expression/LLVMUserExpression.cpp +++ b/lldb/source/Expression/LLVMUserExpression.cpp @@ -187,18 +187,22 @@ LLVMUserExpression::DoExecute(DiagnosticManager &diagnostic_manager, if (execution_result == lldb::eExpressionInterrupted || execution_result == lldb::eExpressionHitBreakpoint) { const char *error_desc = nullptr; + const char *explanation = execution_result == lldb::eExpressionInterrupted + ? 
"was interrupted" + : "hit a breakpoint"; if (user_expression_plan) { if (auto real_stop_info_sp = user_expression_plan->GetRealStopInfo()) error_desc = real_stop_info_sp->GetDescription(); } + if (error_desc) diagnostic_manager.Printf(lldb::eSeverityError, - "Execution was interrupted, reason: %s.", + "Expression execution %s: %s.", explanation, error_desc); else - diagnostic_manager.PutString(lldb::eSeverityError, - "Execution was interrupted."); + diagnostic_manager.Printf(lldb::eSeverityError, + "Expression execution %s.", explanation); if ((execution_result == lldb::eExpressionInterrupted && options.DoesUnwindOnError()) || @@ -212,31 +216,35 @@ LLVMUserExpression::DoExecute(DiagnosticManager &diagnostic_manager, user_expression_plan->TransferExpressionOwnership(); diagnostic_manager.AppendMessageToDiagnostic( "The process has been left at the point where it was " - "interrupted, " - "use \"thread return -x\" to return to the state before " - "expression evaluation."); + "interrupted, use \"thread return -x\" to return to the state " + "before expression evaluation."); } return execution_result; - } else if (execution_result == lldb::eExpressionStoppedForDebug) { + } + + if (execution_result == lldb::eExpressionStoppedForDebug) { diagnostic_manager.PutString( lldb::eSeverityInfo, - "Execution was halted at the first instruction of the expression " - "function because \"debug\" was requested.\n" + "Expression execution was halted at the first instruction of the " + "expression function because \"debug\" was requested.\n" "Use \"thread return -x\" to return to the state before expression " "evaluation."); return execution_result; - } else if (execution_result == lldb::eExpressionThreadVanished) { - diagnostic_manager.Printf( - lldb::eSeverityError, - "Couldn't complete execution; the thread " - "on which the expression was being run: 0x%" PRIx64 - " exited during its execution.", - expr_thread_id); + } + + if (execution_result == lldb::eExpressionThreadVanished) 
{ + diagnostic_manager.Printf(lldb::eSeverityError, + "Couldn't execute expression: the thread on " + "which the expression was being run (0x%" PRIx64 + ") exited during its execution.", + expr_thread_id); return execution_result; - } else if (execution_result != lldb::eExpressionCompleted) { + } + + if (execution_result != lldb::eExpressionCompleted) { diagnostic_manager.Printf(lldb::eSeverityError, - "Couldn't execute function; result was %s", + "Couldn't execute expression: result was %s", toString(execution_result).c_str()); return execution_result; } @@ -245,9 +253,9 @@ LLVMUserExpression::DoExecute(DiagnosticManager &diagnostic_manager, if (FinalizeJITExecution(diagnostic_manager, exe_ctx, result, function_stack_bottom, function_stack_top)) { return lldb::eExpressionCompleted; - } else { - return lldb::eExpressionResultUnavailable; } + + return lldb::eExpressionResultUnavailable; } bool LLVMUserExpression::FinalizeJITExecution( diff --git a/lldb/source/Plugins/ExpressionParser/Clang/ASTUtils.h b/lldb/source/Plugins/ExpressionParser/Clang/ASTUtils.h index 7403b79be6cc0..a1f02dc3d1b09 100644 --- a/lldb/source/Plugins/ExpressionParser/Clang/ASTUtils.h +++ b/lldb/source/Plugins/ExpressionParser/Clang/ASTUtils.h @@ -70,10 +70,10 @@ class ExternalASTSourceWrapper : public clang::ExternalSemaSource { m_Source->updateOutOfDateIdentifier(II); } - bool FindExternalVisibleDeclsByName(const clang::DeclContext *DC, - clang::DeclarationName Name, - clang::Module *NamedModule) override { - return m_Source->FindExternalVisibleDeclsByName(DC, Name, NamedModule); + bool FindExternalVisibleDeclsByName( + const clang::DeclContext *DC, clang::DeclarationName Name, + const clang::DeclContext *OriginalDC) override { + return m_Source->FindExternalVisibleDeclsByName(DC, Name, OriginalDC); } bool LoadExternalSpecializations(const clang::Decl *D, @@ -388,11 +388,11 @@ class SemaSourceWithPriorities : public clang::ExternalSemaSource { return EK_ReplyHazy; } - bool 
FindExternalVisibleDeclsByName(const clang::DeclContext *DC, - clang::DeclarationName Name, - clang::Module *NamedModule) override { + bool FindExternalVisibleDeclsByName( + const clang::DeclContext *DC, clang::DeclarationName Name, + const clang::DeclContext *OriginalDC) override { for (size_t i = 0; i < Sources.size(); ++i) - if (Sources[i]->FindExternalVisibleDeclsByName(DC, Name, NamedModule)) + if (Sources[i]->FindExternalVisibleDeclsByName(DC, Name, OriginalDC)) return true; return false; } diff --git a/lldb/source/Plugins/ExpressionParser/Clang/ClangASTSource.cpp b/lldb/source/Plugins/ExpressionParser/Clang/ClangASTSource.cpp index 94ce867ef4a0f..34129807277d5 100644 --- a/lldb/source/Plugins/ExpressionParser/Clang/ClangASTSource.cpp +++ b/lldb/source/Plugins/ExpressionParser/Clang/ClangASTSource.cpp @@ -100,7 +100,7 @@ void ClangASTSource::StartTranslationUnit(ASTConsumer *Consumer) { // The core lookup interface. bool ClangASTSource::FindExternalVisibleDeclsByName( const DeclContext *decl_ctx, DeclarationName clang_decl_name, - clang::Module *NamedModule) { + const clang::DeclContext *original_dc) { if (!m_ast_context) { SetNoExternalVisibleDeclsForName(decl_ctx, clang_decl_name); return false; diff --git a/lldb/source/Plugins/ExpressionParser/Clang/ClangASTSource.h b/lldb/source/Plugins/ExpressionParser/Clang/ClangASTSource.h index 6dc4ecc94e0ed..dd89bae96f629 100644 --- a/lldb/source/Plugins/ExpressionParser/Clang/ClangASTSource.h +++ b/lldb/source/Plugins/ExpressionParser/Clang/ClangASTSource.h @@ -83,9 +83,10 @@ class ClangASTSource : public clang::ExternalASTSource, /// /// \return /// Whatever SetExternalVisibleDeclsForName returns. 
- bool FindExternalVisibleDeclsByName(const clang::DeclContext *DC, - clang::DeclarationName Name, - clang::Module *NamedModule) override; + bool + FindExternalVisibleDeclsByName(const clang::DeclContext *DC, + clang::DeclarationName Name, + const clang::DeclContext *OriginalDC) override; /// Enumerate all Decls in a given lexical context. /// @@ -212,10 +213,10 @@ class ClangASTSource : public clang::ExternalASTSource, public: ClangASTSourceProxy(ClangASTSource &original) : m_original(original) {} - bool FindExternalVisibleDeclsByName(const clang::DeclContext *DC, - clang::DeclarationName Name, - clang::Module *NamedModule) override { - return m_original.FindExternalVisibleDeclsByName(DC, Name, NamedModule); + bool FindExternalVisibleDeclsByName( + const clang::DeclContext *DC, clang::DeclarationName Name, + const clang::DeclContext *OriginalDC) override { + return m_original.FindExternalVisibleDeclsByName(DC, Name, OriginalDC); } void FindExternalLexicalDecls( diff --git a/lldb/source/Plugins/ExpressionParser/Clang/ClangExternalASTSourceCallbacks.cpp b/lldb/source/Plugins/ExpressionParser/Clang/ClangExternalASTSourceCallbacks.cpp index bf4537e69eb63..3eddf49a8b7e7 100644 --- a/lldb/source/Plugins/ExpressionParser/Clang/ClangExternalASTSourceCallbacks.cpp +++ b/lldb/source/Plugins/ExpressionParser/Clang/ClangExternalASTSourceCallbacks.cpp @@ -51,7 +51,7 @@ void ClangExternalASTSourceCallbacks::FindExternalLexicalDecls( bool ClangExternalASTSourceCallbacks::FindExternalVisibleDeclsByName( const clang::DeclContext *DC, clang::DeclarationName Name, - clang::Module *NamedModule) { + const clang::DeclContext *OriginalDC) { llvm::SmallVector decls; // Objective-C methods are not added into the LookupPtr when they originate // from an external source. SetExternalVisibleDeclsForName() adds them. 
diff --git a/lldb/source/Plugins/ExpressionParser/Clang/ClangExternalASTSourceCallbacks.h b/lldb/source/Plugins/ExpressionParser/Clang/ClangExternalASTSourceCallbacks.h index d2e9c1552fd38..d0eabb509455c 100644 --- a/lldb/source/Plugins/ExpressionParser/Clang/ClangExternalASTSourceCallbacks.h +++ b/lldb/source/Plugins/ExpressionParser/Clang/ClangExternalASTSourceCallbacks.h @@ -37,9 +37,10 @@ class ClangExternalASTSourceCallbacks : public clang::ExternalASTSource { llvm::function_ref IsKindWeWant, llvm::SmallVectorImpl &Result) override; - bool FindExternalVisibleDeclsByName(const clang::DeclContext *DC, - clang::DeclarationName Name, - clang::Module *NamedModule) override; + bool + FindExternalVisibleDeclsByName(const clang::DeclContext *DC, + clang::DeclarationName Name, + const clang::DeclContext *OriginalDC) override; void CompleteType(clang::TagDecl *tag_decl) override; diff --git a/lldb/source/Plugins/Language/CPlusPlus/LibCxxUnorderedMap.cpp b/lldb/source/Plugins/Language/CPlusPlus/LibCxxUnorderedMap.cpp index bf91fc42482f3..be520ee27af06 100644 --- a/lldb/source/Plugins/Language/CPlusPlus/LibCxxUnorderedMap.cpp +++ b/lldb/source/Plugins/Language/CPlusPlus/LibCxxUnorderedMap.cpp @@ -111,7 +111,8 @@ CompilerType lldb_private::formatters::LibcxxStdUnorderedMapSyntheticFrontEnd:: // that wraps a std::pair. Peel away the internal wrapper type - whose // structure is of no value to users, to expose the std::pair. This // matches the structure returned by the std::map synthetic provider. 
- if (isUnorderedMap(m_backend.GetTypeName())) { + if (isUnorderedMap( + m_backend.GetCompilerType().GetCanonicalType().GetTypeName())) { std::string name; CompilerType field_type = element_type.GetFieldAtIndex(0, name, nullptr, nullptr, nullptr); diff --git a/lldb/source/Plugins/LanguageRuntime/ObjC/AppleObjCRuntime/AppleObjCDeclVendor.cpp b/lldb/source/Plugins/LanguageRuntime/ObjC/AppleObjCRuntime/AppleObjCDeclVendor.cpp index 24fc5bb2c047f..e4b20b30a069f 100644 --- a/lldb/source/Plugins/LanguageRuntime/ObjC/AppleObjCRuntime/AppleObjCDeclVendor.cpp +++ b/lldb/source/Plugins/LanguageRuntime/ObjC/AppleObjCRuntime/AppleObjCDeclVendor.cpp @@ -29,9 +29,9 @@ class lldb_private::AppleObjCExternalASTSource AppleObjCExternalASTSource(AppleObjCDeclVendor &decl_vendor) : m_decl_vendor(decl_vendor) {} - bool FindExternalVisibleDeclsByName(const clang::DeclContext *decl_ctx, - clang::DeclarationName name, - clang::Module *NamedModule) override { + bool FindExternalVisibleDeclsByName( + const clang::DeclContext *decl_ctx, clang::DeclarationName name, + const clang::DeclContext *original_dc) override { Log *log(GetLog( LLDBLog::Expressions)); // FIXME - a more appropriate log channel? diff --git a/lldb/source/Plugins/OperatingSystem/Python/OperatingSystemPython.cpp b/lldb/source/Plugins/OperatingSystem/Python/OperatingSystemPython.cpp index 3848a2b1deb97..aff521890858c 100644 --- a/lldb/source/Plugins/OperatingSystem/Python/OperatingSystemPython.cpp +++ b/lldb/source/Plugins/OperatingSystem/Python/OperatingSystemPython.cpp @@ -386,4 +386,12 @@ lldb::ThreadSP OperatingSystemPython::CreateThread(lldb::tid_t tid, return ThreadSP(); } +bool OperatingSystemPython::DoesPluginReportAllThreads() { + // If the python plugin has a "DoesPluginReportAllThreads" method, use it. 
+ if (std::optional plugin_answer = + m_operating_system_interface_sp->DoesPluginReportAllThreads()) + return *plugin_answer; + return m_process->GetOSPluginReportsAllThreads(); +} + #endif // #if LLDB_ENABLE_PYTHON diff --git a/lldb/source/Plugins/OperatingSystem/Python/OperatingSystemPython.h b/lldb/source/Plugins/OperatingSystem/Python/OperatingSystemPython.h index 90973acde3ebf..980a544241de4 100644 --- a/lldb/source/Plugins/OperatingSystem/Python/OperatingSystemPython.h +++ b/lldb/source/Plugins/OperatingSystem/Python/OperatingSystemPython.h @@ -60,6 +60,8 @@ class OperatingSystemPython : public lldb_private::OperatingSystem { // Method for lazy creation of threads on demand lldb::ThreadSP CreateThread(lldb::tid_t tid, lldb::addr_t context) override; + bool DoesPluginReportAllThreads() override; + protected: bool IsValid() const { return m_script_object_sp && m_script_object_sp->IsValid(); diff --git a/lldb/source/Plugins/ScriptInterpreter/Python/Interfaces/OperatingSystemPythonInterface.cpp b/lldb/source/Plugins/ScriptInterpreter/Python/Interfaces/OperatingSystemPythonInterface.cpp index c3379e774a0b8..d8b2ea984fd88 100644 --- a/lldb/source/Plugins/ScriptInterpreter/Python/Interfaces/OperatingSystemPythonInterface.cpp +++ b/lldb/source/Plugins/ScriptInterpreter/Python/Interfaces/OperatingSystemPythonInterface.cpp @@ -82,6 +82,16 @@ OperatingSystemPythonInterface::GetRegisterContextForTID(lldb::tid_t tid) { return obj->GetAsString()->GetValue().str(); } +std::optional OperatingSystemPythonInterface::DoesPluginReportAllThreads() { + Status error; + StructuredData::ObjectSP obj = Dispatch("does_plugin_report_all_threads", error); + if (!ScriptedInterface::CheckStructuredDataObject(LLVM_PRETTY_FUNCTION, obj, + error)) + return {}; + + return obj->GetAsBoolean()->GetValue(); +} + void OperatingSystemPythonInterface::Initialize() { const std::vector ci_usages = { "settings set target.process.python-os-plugin-path ", diff --git 
a/lldb/source/Plugins/ScriptInterpreter/Python/Interfaces/OperatingSystemPythonInterface.h b/lldb/source/Plugins/ScriptInterpreter/Python/Interfaces/OperatingSystemPythonInterface.h index 102c3c3953768..8df48f1b64cc9 100644 --- a/lldb/source/Plugins/ScriptInterpreter/Python/Interfaces/OperatingSystemPythonInterface.h +++ b/lldb/source/Plugins/ScriptInterpreter/Python/Interfaces/OperatingSystemPythonInterface.h @@ -45,6 +45,8 @@ class OperatingSystemPythonInterface std::optional GetRegisterContextForTID(lldb::tid_t tid) override; + std::optional DoesPluginReportAllThreads() override; + static void Initialize(); static void Terminate(); diff --git a/lldb/source/Plugins/SymbolFile/DWARF/DWARFASTParserClang.cpp b/lldb/source/Plugins/SymbolFile/DWARF/DWARFASTParserClang.cpp index e2f76e88dd6f0..fb3af44abfa8d 100644 --- a/lldb/source/Plugins/SymbolFile/DWARF/DWARFASTParserClang.cpp +++ b/lldb/source/Plugins/SymbolFile/DWARF/DWARFASTParserClang.cpp @@ -377,7 +377,12 @@ ParsedDWARFTypeAttributes::ParsedDWARFTypeAttributes(const DWARFDIE &die) { break; case DW_AT_object_pointer: - object_pointer = form_value.Reference(); + // GetAttributes follows DW_AT_specification. + // DW_TAG_subprogram definitions and declarations may both + // have a DW_AT_object_pointer. Don't overwrite the one + // we parsed for the definition with the one from the declaration. 
+ if (!object_pointer.IsValid()) + object_pointer = form_value.Reference(); break; case DW_AT_signature: diff --git a/lldb/source/Plugins/SymbolFile/DWARF/DWARFDIE.cpp b/lldb/source/Plugins/SymbolFile/DWARF/DWARFDIE.cpp index 6857878b354a0..1e2564cb22f25 100644 --- a/lldb/source/Plugins/SymbolFile/DWARF/DWARFDIE.cpp +++ b/lldb/source/Plugins/SymbolFile/DWARF/DWARFDIE.cpp @@ -19,6 +19,8 @@ #include "llvm/ADT/iterator.h" #include "llvm/BinaryFormat/Dwarf.h" #include "llvm/DebugInfo/DWARF/DWARFAddressRange.h" +#include "llvm/DebugInfo/DWARF/DWARFTypePrinter.h" +#include "llvm/Support/raw_ostream.h" using namespace lldb_private; using namespace lldb_private::dwarf; @@ -376,7 +378,8 @@ lldb_private::Type *DWARFDIE::ResolveTypeUID(const DWARFDIE &die) const { return nullptr; } -static CompilerContext GetContextEntry(DWARFDIE die) { +static CompilerContext GetContextEntry(DWARFDIE die, + bool derive_template_names) { auto ctx = [die](CompilerContextKind kind) { return CompilerContext(kind, ConstString(die.GetName())); }; @@ -386,11 +389,6 @@ static CompilerContext GetContextEntry(DWARFDIE die) { return ctx(CompilerContextKind::Module); case DW_TAG_namespace: return ctx(CompilerContextKind::Namespace); - case DW_TAG_class_type: - case DW_TAG_structure_type: - return ctx(CompilerContextKind::ClassOrStruct); - case DW_TAG_union_type: - return ctx(CompilerContextKind::Union); case DW_TAG_enumeration_type: return ctx(CompilerContextKind::Enum); case DW_TAG_subprogram: @@ -401,12 +399,28 @@ static CompilerContext GetContextEntry(DWARFDIE die) { return ctx(CompilerContextKind::Typedef); case DW_TAG_base_type: return ctx(CompilerContextKind::Builtin); + case DW_TAG_class_type: + case DW_TAG_structure_type: + case DW_TAG_union_type: { + CompilerContextKind kind = die.Tag() == DW_TAG_union_type + ? 
CompilerContextKind::Union + : CompilerContextKind::ClassOrStruct; + llvm::StringRef name = die.GetName(); + if (!derive_template_names || name.contains('<')) + return CompilerContext(kind, ConstString(name)); + + std::string name_storage = name.str(); + llvm::raw_string_ostream os(name_storage); + llvm::DWARFTypePrinter(os).appendAndTerminateTemplateParameters( + die); + return CompilerContext(kind, ConstString(os.str())); + } default: llvm_unreachable("Check tag type in the caller!"); } } -static void GetDeclContextImpl(DWARFDIE die, +static void GetDeclContextImpl(DWARFDIE die, bool derive_template_names, llvm::SmallSet &seen, std::vector &context) { // Stop if we hit a cycle. @@ -428,7 +442,7 @@ static void GetDeclContextImpl(DWARFDIE die, case DW_TAG_subprogram: case DW_TAG_variable: case DW_TAG_typedef: - context.push_back(GetContextEntry(die)); + context.push_back(GetContextEntry(die, derive_template_names)); break; default: break; @@ -438,15 +452,16 @@ static void GetDeclContextImpl(DWARFDIE die, } } -std::vector DWARFDIE::GetDeclContext() const { +std::vector +DWARFDIE::GetDeclContext(bool derive_template_names) const { llvm::SmallSet seen; std::vector context; - GetDeclContextImpl(*this, seen, context); + GetDeclContextImpl(*this, derive_template_names, seen, context); std::reverse(context.begin(), context.end()); return context; } -static void GetTypeLookupContextImpl(DWARFDIE die, +static void GetTypeLookupContextImpl(DWARFDIE die, bool derive_template_names, llvm::SmallSet &seen, std::vector &context) { // Stop if we hit a cycle. 
@@ -461,7 +476,7 @@ static void GetTypeLookupContextImpl(DWARFDIE die, case DW_TAG_variable: case DW_TAG_typedef: case DW_TAG_base_type: - context.push_back(GetContextEntry(die)); + context.push_back(GetContextEntry(die, derive_template_names)); break; // If any of the tags below appear in the parent chain, stop the decl @@ -484,10 +499,11 @@ static void GetTypeLookupContextImpl(DWARFDIE die, } } -std::vector DWARFDIE::GetTypeLookupContext() const { +std::vector +DWARFDIE::GetTypeLookupContext(bool derive_template_names) const { llvm::SmallSet seen; std::vector context; - GetTypeLookupContextImpl(*this, seen, context); + GetTypeLookupContextImpl(*this, derive_template_names, seen, context); std::reverse(context.begin(), context.end()); return context; } diff --git a/lldb/source/Plugins/SymbolFile/DWARF/DWARFDIE.h b/lldb/source/Plugins/SymbolFile/DWARF/DWARFDIE.h index c3239b5b121f9..8785ac09b1f14 100644 --- a/lldb/source/Plugins/SymbolFile/DWARF/DWARFDIE.h +++ b/lldb/source/Plugins/SymbolFile/DWARF/DWARFDIE.h @@ -73,7 +73,15 @@ class DWARFDIE : public DWARFBaseDIE { /// Return this DIE's decl context as it is needed to look up types /// in Clang modules. This context will include any modules or functions that /// the type is declared in so an exact module match can be efficiently made. - std::vector GetDeclContext() const; + /// + /// \param[in] derive_template_names + /// If true, augments the returned names with template arguments derived + /// from the child DIEs, if the names don't contained template arguments + /// already. If false, the returned context will contain the names exactly + /// as they are spelled in the debug info, regardless of whether that + /// includes template arguments or not. + std::vector + GetDeclContext(bool derive_template_names = false) const; /// Get a context to a type so it can be looked up. 
/// @@ -85,7 +93,15 @@ class DWARFDIE : public DWARFBaseDIE { /// appropriate time, like either the translation unit or at a function /// context. This is designed to allow users to efficiently look for types /// using a full or partial CompilerContext array. - std::vector GetTypeLookupContext() const; + /// + /// \param[in] derive_template_names + /// If true, augments the returned names with template arguments derived + /// from the child DIEs, if the names don't contained template arguments + /// already. If false, the returned context will contain the names exactly + /// as they are spelled in the debug info, regardless of whether that + /// includes template arguments or not. + std::vector + GetTypeLookupContext(bool derive_template_names = false) const; DWARFDeclContext GetDWARFDeclContext() const; diff --git a/lldb/source/Plugins/SymbolFile/DWARF/DWARFDebugInfoEntry.cpp b/lldb/source/Plugins/SymbolFile/DWARF/DWARFDebugInfoEntry.cpp index 6d073411de876..aafdd2ec68309 100644 --- a/lldb/source/Plugins/SymbolFile/DWARF/DWARFDebugInfoEntry.cpp +++ b/lldb/source/Plugins/SymbolFile/DWARF/DWARFDebugInfoEntry.cpp @@ -281,22 +281,34 @@ bool DWARFDebugInfoEntry::GetDIENamesAndRanges( return !ranges.empty(); } -// Get all attribute values for a given DIE, including following any -// specification or abstract origin attributes and including those in the -// results. Any duplicate attributes will have the first instance take -// precedence (this can happen for declaration attributes). -void DWARFDebugInfoEntry::GetAttributes(DWARFUnit *cu, - DWARFAttributes &attributes, - Recurse recurse, - uint32_t curr_depth) const { - const auto *abbrevDecl = GetAbbreviationDeclarationPtr(cu); - if (!abbrevDecl) { - attributes.Clear(); - return; - } +/// Helper for the public \ref DWARFDebugInfoEntry::GetAttributes API. +/// Adds all attributes of the DIE at the top of the \c worklist to the +/// \c attributes list. 
Specifcations and abstract origins are added +/// to the \c worklist if the referenced DIE has not been seen before. +static bool GetAttributes(llvm::SmallVectorImpl &worklist, + llvm::SmallSet &seen, + DWARFAttributes &attributes) { + assert(!worklist.empty() && "Need at least one DIE to visit."); + assert(seen.size() >= 1 && + "Need to have seen at least the currently visited entry."); + + DWARFDIE current = worklist.pop_back_val(); + + const auto *cu = current.GetCU(); + assert(cu); + + const auto *entry = current.GetDIE(); + assert(entry); + + const auto *abbrevDecl = + entry->GetAbbreviationDeclarationPtr(current.GetCU()); + if (!abbrevDecl) + return false; const DWARFDataExtractor &data = cu->GetData(); - lldb::offset_t offset = GetFirstAttributeOffset(); + lldb::offset_t offset = current.GetDIE()->GetFirstAttributeOffset(); + + const bool is_first_die = seen.size() == 1; for (const auto &attribute : abbrevDecl->attributes()) { DWARFFormValue form_value(cu); @@ -309,10 +321,10 @@ void DWARFDebugInfoEntry::GetAttributes(DWARFUnit *cu, switch (attr) { case DW_AT_sibling: case DW_AT_declaration: - if (curr_depth > 0) { + if (!is_first_die) { // This attribute doesn't make sense when combined with the DIE that // references this DIE. We know a DIE is referencing this DIE because - // curr_depth is not zero + // we've visited more than one DIE already. 
break; } [[fallthrough]]; @@ -321,13 +333,12 @@ void DWARFDebugInfoEntry::GetAttributes(DWARFUnit *cu, break; } - if (recurse == Recurse::yes && - ((attr == DW_AT_specification) || (attr == DW_AT_abstract_origin))) { + if (attr == DW_AT_specification || attr == DW_AT_abstract_origin) { if (form_value.ExtractValue(data, &offset)) { - DWARFDIE spec_die = form_value.Reference(); - if (spec_die) - spec_die.GetDIE()->GetAttributes(spec_die.GetCU(), attributes, - recurse, curr_depth + 1); + if (DWARFDIE spec_die = form_value.Reference()) { + if (seen.insert(spec_die.GetDIE()).second) + worklist.push_back(spec_die); + } } } else { const dw_form_t form = form_value.Form(); @@ -339,6 +350,34 @@ void DWARFDebugInfoEntry::GetAttributes(DWARFUnit *cu, DWARFFormValue::SkipValue(form, data, &offset, cu); } } + + return true; +} + +DWARFAttributes DWARFDebugInfoEntry::GetAttributes(const DWARFUnit *cu, + Recurse recurse) const { + // FIXME: use ElaboratingDIEIterator to follow specifications/abstract origins + // instead of maintaining our own worklist/seen list. + + DWARFAttributes attributes; + + llvm::SmallVector worklist; + worklist.emplace_back(cu, this); + + // Keep track if DIEs already seen to prevent infinite recursion. + // Value of '3' was picked for the same reason that + // DWARFDie::findRecursively does. 
+ llvm::SmallSet seen; + seen.insert(this); + + do { + if (!::GetAttributes(worklist, seen, attributes)) { + attributes.Clear(); + break; + } + } while (!worklist.empty() && recurse == Recurse::yes); + + return attributes; } // GetAttributeValue diff --git a/lldb/source/Plugins/SymbolFile/DWARF/DWARFDebugInfoEntry.h b/lldb/source/Plugins/SymbolFile/DWARF/DWARFDebugInfoEntry.h index de6bbf1d52789..72aeb2743b1e2 100644 --- a/lldb/source/Plugins/SymbolFile/DWARF/DWARFDebugInfoEntry.h +++ b/lldb/source/Plugins/SymbolFile/DWARF/DWARFDebugInfoEntry.h @@ -52,12 +52,28 @@ class DWARFDebugInfoEntry { lldb::offset_t *offset_ptr); using Recurse = DWARFBaseDIE::Recurse; - DWARFAttributes GetAttributes(DWARFUnit *cu, - Recurse recurse = Recurse::yes) const { - DWARFAttributes attrs; - GetAttributes(cu, attrs, recurse, 0 /* curr_depth */); - return attrs; - } + + /// Get all attribute values for a given DIE, optionally following any + /// specifications and abstract origins and including their attributes + /// in the result too. + /// + /// When following specifications/abstract origins, the attributes + /// on the referring DIE are guaranteed to be visited before the attributes of + /// the referenced DIE. + /// + /// \param[in] cu DWARFUnit that this entry belongs to. + /// + /// \param[in] recurse If set to \c Recurse::yes, will include attributes + /// on DIEs referenced via \c DW_AT_specification and \c DW_AT_abstract_origin + /// (including across multiple levels of indirection). + /// + /// \returns DWARFAttributes that include all attributes found on this DIE + /// (and possibly referenced DIEs). Attributes may appear multiple times + /// (e.g., if a declaration and definition both specify the same attribute). + /// On failure, the returned DWARFAttributes will be empty. 
+ /// + DWARFAttributes GetAttributes(const DWARFUnit *cu, + Recurse recurse = Recurse::yes) const; dw_offset_t GetAttributeValue(const DWARFUnit *cu, const dw_attr_t attr, DWARFFormValue &formValue, @@ -178,10 +194,6 @@ class DWARFDebugInfoEntry { /// A copy of the DW_TAG value so we don't have to go through the compile /// unit abbrev table dw_tag_t m_tag = llvm::dwarf::DW_TAG_null; - -private: - void GetAttributes(DWARFUnit *cu, DWARFAttributes &attrs, Recurse recurse, - uint32_t curr_depth) const; }; } // namespace dwarf } // namespace lldb_private::plugin diff --git a/lldb/source/Plugins/SymbolFile/DWARF/SymbolFileDWARF.cpp b/lldb/source/Plugins/SymbolFile/DWARF/SymbolFileDWARF.cpp index 360dbaa1beb5e..ad5005b660c64 100644 --- a/lldb/source/Plugins/SymbolFile/DWARF/SymbolFileDWARF.cpp +++ b/lldb/source/Plugins/SymbolFile/DWARF/SymbolFileDWARF.cpp @@ -10,10 +10,8 @@ #include "llvm/ADT/STLExtras.h" #include "llvm/DebugInfo/DWARF/DWARFAddressRange.h" #include "llvm/DebugInfo/DWARF/DWARFDebugLoc.h" -#include "llvm/DebugInfo/DWARF/DWARFTypePrinter.h" #include "llvm/Support/Casting.h" #include "llvm/Support/FileUtilities.h" -#include "llvm/Support/Format.h" #include "llvm/Support/FormatAdapters.h" #include "llvm/Support/Threading.h" @@ -2740,18 +2738,11 @@ void SymbolFileDWARF::FindTypes(const TypeQuery &query, TypeResults &results) { // Copy our match's context and update the basename we are looking for // so we can use this only to compare the context correctly. m_index->GetTypesWithQuery(query_simple, [&](DWARFDIE die) { - // Check the language, but only if we have a language filter. - if (query.HasLanguage()) { - if (!query.LanguageMatches(GetLanguageFamily(*die.GetCU()))) - return true; // Keep iterating over index types, language mismatch. 
- } - - std::string qualified_name; - llvm::raw_string_ostream os(qualified_name); - llvm::DWARFTypePrinter type_printer(os); - type_printer.appendQualifiedName(die); - TypeQuery die_query(qualified_name, e_exact_match); - if (query.ContextMatches(die_query.GetContextRef())) + std::vector qualified_context = + query.GetModuleSearch() + ? die.GetDeclContext(/*derive_template_names=*/true) + : die.GetTypeLookupContext(/*derive_template_names=*/true); + if (query.ContextMatches(qualified_context)) if (Type *matching_type = ResolveType(die, true, true)) results.InsertUnique(matching_type->shared_from_this()); return !results.Done(query); // Keep iterating if we aren't done. @@ -3423,7 +3414,10 @@ VariableSP SymbolFileDWARF::ParseVariableDIE(const SymbolContext &sc, mangled = form_value.AsCString(); break; case DW_AT_type: - type_die_form = form_value; + // DW_AT_type on declaration may be less accurate than + // that of definition, so don't overwrite it. + if (!type_die_form.IsValid()) + type_die_form = form_value; break; case DW_AT_external: is_external = form_value.Boolean(); diff --git a/lldb/source/Symbol/CompilerType.cpp b/lldb/source/Symbol/CompilerType.cpp index e9e6e3bf2600c..09820fb3f0101 100644 --- a/lldb/source/Symbol/CompilerType.cpp +++ b/lldb/source/Symbol/CompilerType.cpp @@ -1105,8 +1105,11 @@ bool CompilerType::GetValueAsScalar(const lldb_private::DataExtractor &data, return false; std::optional byte_size = GetByteSize(exe_scope); - if (!byte_size) + // A bit or byte size of 0 is not a bug, but it doesn't make sense to read a + // scalar of zero size. 
+ if (!byte_size || *byte_size == 0) return false; + lldb::offset_t offset = data_byte_offset; switch (encoding) { case lldb::eEncodingInvalid: diff --git a/lldb/source/Symbol/SaveCoreOptions.cpp b/lldb/source/Symbol/SaveCoreOptions.cpp index 8d9aadece2152..c9f6efeb25d22 100644 --- a/lldb/source/Symbol/SaveCoreOptions.cpp +++ b/lldb/source/Symbol/SaveCoreOptions.cpp @@ -114,9 +114,8 @@ void SaveCoreOptions::AddMemoryRegionToSave( const MemoryRanges &SaveCoreOptions::GetCoreFileMemoryRanges() const { return m_regions_to_save; } - -Status SaveCoreOptions::EnsureValidConfiguration( - lldb::ProcessSP process_sp) const { +Status +SaveCoreOptions::EnsureValidConfiguration(lldb::ProcessSP process_sp) const { Status error; std::string error_str; if (!m_threads_to_save.empty() && GetStyle() == lldb::eSaveCoreFull) @@ -132,10 +131,24 @@ Status SaveCoreOptions::EnsureValidConfiguration( return error; } -void SaveCoreOptions::ClearProcessSpecificData() { +lldb_private::ThreadCollection::collection +SaveCoreOptions::GetThreadsToSave() const { + lldb_private::ThreadCollection::collection thread_collection; + // In cases where no process is set, such as when no threads are specified. + if (!m_process_sp) + return thread_collection; + + ThreadList &thread_list = m_process_sp->GetThreadList(); + for (const auto &tid : m_threads_to_save) + thread_collection.push_back(thread_list.FindThreadByID(tid)); + + return thread_collection; +} + +void SaveCoreOptions::ClearProcessSpecificData() { // Deliberately not following the formatter style here to indicate that // this method will be expanded in the future. 
- m_threads_to_save.clear(); + m_threads_to_save.clear(); } void SaveCoreOptions::Clear() { diff --git a/lldb/source/Symbol/SymbolContext.cpp b/lldb/source/Symbol/SymbolContext.cpp index 19b6ff6a5302b..f4270ee839676 100644 --- a/lldb/source/Symbol/SymbolContext.cpp +++ b/lldb/source/Symbol/SymbolContext.cpp @@ -317,65 +317,6 @@ uint32_t SymbolContext::GetResolvedMask() const { return resolved_mask; } -void SymbolContext::Dump(Stream *s, Target *target) const { - *s << this << ": "; - s->Indent(); - s->PutCString("SymbolContext"); - s->IndentMore(); - s->EOL(); - s->IndentMore(); - s->Indent(); - *s << "Module = " << module_sp.get() << ' '; - if (module_sp) - module_sp->GetFileSpec().Dump(s->AsRawOstream()); - s->EOL(); - s->Indent(); - *s << "CompileUnit = " << comp_unit; - if (comp_unit != nullptr) - s->Format(" {{{0:x-16}} {1}", comp_unit->GetID(), - comp_unit->GetPrimaryFile()); - s->EOL(); - s->Indent(); - *s << "Function = " << function; - if (function != nullptr) { - s->Format(" {{{0:x-16}} {1}, address-range = ", function->GetID(), - function->GetType()->GetName()); - function->GetAddressRange().Dump(s, target, Address::DumpStyleLoadAddress, - Address::DumpStyleModuleWithFileAddress); - s->EOL(); - s->Indent(); - Type *func_type = function->GetType(); - if (func_type) { - *s << " Type = "; - func_type->Dump(s, false); - } - } - s->EOL(); - s->Indent(); - *s << "Block = " << block; - if (block != nullptr) - s->Format(" {{{0:x-16}}", block->GetID()); - s->EOL(); - s->Indent(); - *s << "LineEntry = "; - line_entry.Dump(s, target, true, Address::DumpStyleLoadAddress, - Address::DumpStyleModuleWithFileAddress, true); - s->EOL(); - s->Indent(); - *s << "Symbol = " << symbol; - if (symbol != nullptr && symbol->GetMangled()) - *s << ' ' << symbol->GetName().AsCString(); - s->EOL(); - *s << "Variable = " << variable; - if (variable != nullptr) { - s->Format(" {{{0:x-16}} {1}", variable->GetID(), - variable->GetType()->GetName()); - s->EOL(); - } - s->IndentLess(); - 
s->IndentLess(); -} - bool lldb_private::operator==(const SymbolContext &lhs, const SymbolContext &rhs) { return lhs.function == rhs.function && lhs.symbol == rhs.symbol && diff --git a/lldb/source/Target/Process.cpp b/lldb/source/Target/Process.cpp index 68485a40a3fcc..c47e728fdf716 100644 --- a/lldb/source/Target/Process.cpp +++ b/lldb/source/Target/Process.cpp @@ -1182,7 +1182,7 @@ void Process::UpdateThreadListIfNeeded() { // See if the OS plugin reports all threads. If it does, then // it is safe to clear unseen thread's plans here. Otherwise we // should preserve them in case they show up again: - clear_unused_threads = GetOSPluginReportsAllThreads(); + clear_unused_threads = os->DoesPluginReportAllThreads(); // Turn off dynamic types to ensure we don't run any expressions. // Objective-C can run an expression to determine if a SBValue is a diff --git a/lldb/source/Target/TargetProperties.td b/lldb/source/Target/TargetProperties.td index bb3b500d5fdfb..38a345dfd8849 100644 --- a/lldb/source/Target/TargetProperties.td +++ b/lldb/source/Target/TargetProperties.td @@ -223,7 +223,7 @@ let Definition = "process_experimental" in { def OSPluginReportsAllThreads: Property<"os-plugin-reports-all-threads", "Boolean">, Global, DefaultTrue, - Desc<"Set to False if your OS Plugins doesn't report all threads on each stop.">; + Desc<"Set to False if your Python OS Plugin doesn't report all threads on each stop.">; } let Definition = "process" in { diff --git a/lldb/source/Target/Thread.cpp b/lldb/source/Target/Thread.cpp index a6130f6b925bb..b526131097061 100644 --- a/lldb/source/Target/Thread.cpp +++ b/lldb/source/Target/Thread.cpp @@ -617,7 +617,7 @@ void Thread::WillStop() { current_plan->WillStop(); } -void Thread::SetupForResume() { +bool Thread::SetupForResume() { if (GetResumeState() != eStateSuspended) { // First check whether this thread is going to "actually" resume at all. 
// For instance, if we're stepping from one level to the next of an @@ -625,7 +625,7 @@ void Thread::SetupForResume() { // without actually running this thread. In that case, for this thread we // shouldn't push a step over breakpoint plan or do that work. if (GetCurrentPlan()->IsVirtualStep()) - return; + return false; // If we're at a breakpoint push the step-over breakpoint plan. Do this // before telling the current plan it will resume, since we might change @@ -663,11 +663,13 @@ void Thread::SetupForResume() { step_bp_plan->SetAutoContinue(true); } QueueThreadPlan(step_bp_plan_sp, false); + return true; } } } } } + return false; } bool Thread::ShouldResume(StateType resume_state) { diff --git a/lldb/source/Target/ThreadList.cpp b/lldb/source/Target/ThreadList.cpp index 1a2d7dd61c778..6cbef330bf488 100644 --- a/lldb/source/Target/ThreadList.cpp +++ b/lldb/source/Target/ThreadList.cpp @@ -518,58 +518,7 @@ bool ThreadList::WillResume() { collection::iterator pos, end = m_threads.end(); - // See if any thread wants to run stopping others. If it does, then we won't - // setup the other threads for resume, since they aren't going to get a - // chance to run. This is necessary because the SetupForResume might add - // "StopOthers" plans which would then get to be part of the who-gets-to-run - // negotiation, but they're coming in after the fact, and the threads that - // are already set up should take priority. 
- - bool wants_solo_run = false; - - for (pos = m_threads.begin(); pos != end; ++pos) { - lldbassert((*pos)->GetCurrentPlan() && - "thread should not have null thread plan"); - if ((*pos)->GetResumeState() != eStateSuspended && - (*pos)->GetCurrentPlan()->StopOthers()) { - if ((*pos)->IsOperatingSystemPluginThread() && - !(*pos)->GetBackingThread()) - continue; - wants_solo_run = true; - break; - } - } - - if (wants_solo_run) { - Log *log = GetLog(LLDBLog::Step); - if (log && log->GetVerbose()) - LLDB_LOGF(log, "Turning on notification of new threads while single " - "stepping a thread."); - m_process.StartNoticingNewThreads(); - } else { - Log *log = GetLog(LLDBLog::Step); - if (log && log->GetVerbose()) - LLDB_LOGF(log, "Turning off notification of new threads while single " - "stepping a thread."); - m_process.StopNoticingNewThreads(); - } - - // Give all the threads that are likely to run a last chance to set up their - // state before we negotiate who is actually going to get a chance to run... - // Don't set to resume suspended threads, and if any thread wanted to stop - // others, only call setup on the threads that request StopOthers... - - for (pos = m_threads.begin(); pos != end; ++pos) { - if ((*pos)->GetResumeState() != eStateSuspended && - (!wants_solo_run || (*pos)->GetCurrentPlan()->StopOthers())) { - if ((*pos)->IsOperatingSystemPluginThread() && - !(*pos)->GetBackingThread()) - continue; - (*pos)->SetupForResume(); - } - } - - // Now go through the threads and see if any thread wants to run just itself. + // Go through the threads and see if any thread wants to run just itself. // if so then pick one and run it. ThreadList run_me_only_list(m_process); @@ -582,14 +531,13 @@ bool ThreadList::WillResume() { // There are two special kinds of thread that have priority for "StopOthers": // a "ShouldRunBeforePublicStop thread, or the currently selected thread. If // we find one satisfying that critereon, put it here. 
- ThreadSP stop_others_thread_sp; - + ThreadSP thread_to_run; for (pos = m_threads.begin(); pos != end; ++pos) { ThreadSP thread_sp(*pos); if (thread_sp->GetResumeState() != eStateSuspended && thread_sp->GetCurrentPlan()->StopOthers()) { - if ((*pos)->IsOperatingSystemPluginThread() && - !(*pos)->GetBackingThread()) + if (thread_sp->IsOperatingSystemPluginThread() && + !thread_sp->GetBackingThread()) continue; // You can't say "stop others" and also want yourself to be suspended. @@ -597,19 +545,76 @@ bool ThreadList::WillResume() { run_me_only_list.AddThread(thread_sp); if (thread_sp == GetSelectedThread()) - stop_others_thread_sp = thread_sp; - + thread_to_run = thread_sp; + if (thread_sp->ShouldRunBeforePublicStop()) { // This takes precedence, so if we find one of these, service it: - stop_others_thread_sp = thread_sp; + thread_to_run = thread_sp; break; } } } + if (run_me_only_list.GetSize(false) > 0 && !thread_to_run) { + if (run_me_only_list.GetSize(false) == 1) { + thread_to_run = run_me_only_list.GetThreadAtIndex(0); + } else { + int random_thread = + (int)((run_me_only_list.GetSize(false) * (double)rand()) / + (RAND_MAX + 1.0)); + thread_to_run = run_me_only_list.GetThreadAtIndex(random_thread); + } + } + + // Give all the threads that are likely to run a last chance to set up their + // state before we negotiate who is actually going to get a chance to run... + // Don't set to resume suspended threads, and if any thread wanted to stop + // others, only call setup on the threads that request StopOthers... + if (thread_to_run != nullptr) { + // See if any thread wants to run stopping others. If it does, then we + // won't setup the other threads for resume, since they aren't going to get + // a chance to run. 
This is necessary because the SetupForResume might add + // "StopOthers" plans which would then get to be part of the who-gets-to-run + // negotiation, but they're coming in after the fact, and the threads that + // are already set up should take priority. + thread_to_run->SetupForResume(); + } else { + for (pos = m_threads.begin(); pos != end; ++pos) { + ThreadSP thread_sp(*pos); + if (thread_sp->GetResumeState() != eStateSuspended) { + if (thread_sp->IsOperatingSystemPluginThread() && + !thread_sp->GetBackingThread()) + continue; + if (thread_sp->SetupForResume()) { + // You can't say "stop others" and also want yourself to be suspended. + assert(thread_sp->GetCurrentPlan()->RunState() != eStateSuspended); + thread_to_run = thread_sp; + if (thread_sp->ShouldRunBeforePublicStop()) { + // This takes precedence, so if we find one of these, service it: + break; + } + } + } + } + } + + if (thread_to_run != nullptr) { + Log *log = GetLog(LLDBLog::Step); + if (log && log->GetVerbose()) + LLDB_LOGF(log, "Turning on notification of new threads while single " + "stepping a thread."); + m_process.StartNoticingNewThreads(); + } else { + Log *log = GetLog(LLDBLog::Step); + if (log && log->GetVerbose()) + LLDB_LOGF(log, "Turning off notification of new threads while single " + "stepping a thread."); + m_process.StopNoticingNewThreads(); + } + bool need_to_resume = true; - if (run_me_only_list.GetSize(false) == 0) { + if (thread_to_run == nullptr) { // Everybody runs as they wish: for (pos = m_threads.begin(); pos != end; ++pos) { ThreadSP thread_sp(*pos); @@ -622,19 +627,6 @@ bool ThreadList::WillResume() { need_to_resume = false; } } else { - ThreadSP thread_to_run; - - if (stop_others_thread_sp) { - thread_to_run = stop_others_thread_sp; - } else if (run_me_only_list.GetSize(false) == 1) { - thread_to_run = run_me_only_list.GetThreadAtIndex(0); - } else { - int random_thread = - (int)((run_me_only_list.GetSize(false) * (double)rand()) / - (RAND_MAX + 1.0)); - thread_to_run = 
run_me_only_list.GetThreadAtIndex(random_thread); - } - for (pos = m_threads.begin(); pos != end; ++pos) { ThreadSP thread_sp(*pos); if (thread_sp == thread_to_run) { diff --git a/lldb/source/Utility/LLDBAssert.cpp b/lldb/source/Utility/LLDBAssert.cpp index b0c39a284910b..d7adb52f95fa4 100644 --- a/lldb/source/Utility/LLDBAssert.cpp +++ b/lldb/source/Utility/LLDBAssert.cpp @@ -20,6 +20,7 @@ namespace lldb_private { +/// The default callback prints to stderr. static void DefaultAssertCallback(llvm::StringRef message, llvm::StringRef backtrace, llvm::StringRef prompt) { @@ -31,8 +32,8 @@ static void DefaultAssertCallback(llvm::StringRef message, static std::atomic g_lldb_assert_callback = &DefaultAssertCallback; -void lldb_assert(bool expression, const char *expr_text, const char *func, - const char *file, unsigned int line) { +void _lldb_assert(bool expression, const char *expr_text, const char *func, + const char *file, unsigned int line) { if (LLVM_LIKELY(expression)) return; @@ -44,8 +45,6 @@ void lldb_assert(bool expression, const char *expr_text, const char *func, } #endif - // Print a warning and encourage the user to file a bug report, similar to - // LLVM’s crash handler, and then return execution. 
std::string buffer; llvm::raw_string_ostream backtrace(buffer); llvm::sys::PrintStackTrace(backtrace); diff --git a/lldb/test/API/commands/expression/call-function/TestCallStopAndContinue.py b/lldb/test/API/commands/expression/call-function/TestCallStopAndContinue.py index 69f02ec99f64b..d856b5c23a5ea 100644 --- a/lldb/test/API/commands/expression/call-function/TestCallStopAndContinue.py +++ b/lldb/test/API/commands/expression/call-function/TestCallStopAndContinue.py @@ -31,7 +31,7 @@ def test(self): self.expect( "expr -i false -- returnsFive()", error=True, - substrs=["Execution was interrupted, reason: breakpoint"], + substrs=["Expression execution hit a breakpoint: breakpoint"], ) self.runCmd("continue", "Continue completed") diff --git a/lldb/test/API/commands/expression/import-std-module/vector-dbg-info-content/TestDbgInfoContentVectorFromStdModule.py b/lldb/test/API/commands/expression/import-std-module/vector-dbg-info-content/TestDbgInfoContentVectorFromStdModule.py index 1c32222e64f14..759077302bfca 100644 --- a/lldb/test/API/commands/expression/import-std-module/vector-dbg-info-content/TestDbgInfoContentVectorFromStdModule.py +++ b/lldb/test/API/commands/expression/import-std-module/vector-dbg-info-content/TestDbgInfoContentVectorFromStdModule.py @@ -23,13 +23,6 @@ def test(self): self.runCmd("settings set target.import-std-module true") - if self.expectedCompiler(["clang"]) and self.expectedCompilerVersion( - [">", "16.0"] - ): - vector_type = "std::vector" - else: - vector_type = "std::vector >" - size_type = "size_type" value_type = "value_type" iterator = "iterator" @@ -41,13 +34,14 @@ def test(self): ValueCheck(name="current"), ] - self.expect_expr( - "a", - result_type=vector_type, - result_children=[ - ValueCheck(children=[ValueCheck(value="3")]), - ValueCheck(children=[ValueCheck(value="1")]), - ValueCheck(children=[ValueCheck(value="2")]), + self.expect( + "expr a", + patterns=[ + """\(std::vector )*>\) \$0 = size=3 \{ + \[0\] = \(a = 3\) + \[1\] 
= \(a = 1\) + \[2\] = \(a = 2\) +\}""" ], ) diff --git a/lldb/test/API/commands/expression/import-std-module/vector-of-vectors/TestVectorOfVectorsFromStdModule.py b/lldb/test/API/commands/expression/import-std-module/vector-of-vectors/TestVectorOfVectorsFromStdModule.py index a1f33271f39d2..e18785ec1359c 100644 --- a/lldb/test/API/commands/expression/import-std-module/vector-of-vectors/TestVectorOfVectorsFromStdModule.py +++ b/lldb/test/API/commands/expression/import-std-module/vector-of-vectors/TestVectorOfVectorsFromStdModule.py @@ -17,42 +17,26 @@ def test(self): self, "// Set break point at this line.", lldb.SBFileSpec("main.cpp") ) - if self.expectedCompiler(["clang"]) and self.expectedCompilerVersion( - [">", "16.0"] - ): - vector_type = "std::vector" - vector_of_vector_type = "std::vector >" - else: - vector_type = "std::vector" - vector_of_vector_type = ( - "std::vector, std::allocator > >" - ) - size_type = "size_type" value_type = "value_type" self.runCmd("settings set target.import-std-module true") - self.expect_expr( - "a", - result_type=vector_of_vector_type, - result_children=[ - ValueCheck( - type=vector_type, - children=[ - ValueCheck(value="1"), - ValueCheck(value="2"), - ValueCheck(value="3"), - ], - ), - ValueCheck( - type=vector_type, - children=[ - ValueCheck(value="3"), - ValueCheck(value="2"), - ValueCheck(value="1"), - ], - ), + self.expect( + "expr a", + patterns=[ + """\(std::vector(, std::allocator )* >\) \$0 = size=2 \{ + \[0\] = size=3 \{ + \[0\] = 1 + \[1\] = 2 + \[2\] = 3 + \} + \[1\] = size=3 \{ + \[0\] = 3 + \[1\] = 2 + \[2\] = 1 + \} +\}""" ], ) self.expect_expr("a.size()", result_type=size_type, result_value="2") diff --git a/lldb/test/API/commands/expression/unwind_expression/TestUnwindExpression.py b/lldb/test/API/commands/expression/unwind_expression/TestUnwindExpression.py index 82f062876a773..c61fe5d01fd5b 100644 --- a/lldb/test/API/commands/expression/unwind_expression/TestUnwindExpression.py +++ 
b/lldb/test/API/commands/expression/unwind_expression/TestUnwindExpression.py @@ -2,7 +2,6 @@ Test stopping at a breakpoint in an expression, and unwinding from there. """ - import lldb from lldbsuite.test.decorators import * from lldbsuite.test.lldbtest import * @@ -71,7 +70,7 @@ def do_unwind_test(self, thread, bkpt, timeout): self.assertTrue(val.GetError().Fail(), "We did not complete the execution.") error_str = val.GetError().GetCString() self.assertIn( - "Execution was interrupted, reason: breakpoint", + "Expression execution hit a breakpoint: breakpoint", error_str, "And the reason was right.", ) diff --git a/lldb/test/API/functionalities/data-formatter/data-formatter-stl/generic/unordered/TestDataFormatterGenericUnordered.py b/lldb/test/API/functionalities/data-formatter/data-formatter-stl/generic/unordered/TestDataFormatterGenericUnordered.py index 59c24bcead4a4..c3043b489d951 100644 --- a/lldb/test/API/functionalities/data-formatter/data-formatter-stl/generic/unordered/TestDataFormatterGenericUnordered.py +++ b/lldb/test/API/functionalities/data-formatter/data-formatter-stl/generic/unordered/TestDataFormatterGenericUnordered.py @@ -54,7 +54,7 @@ def cleanup(): self.look_for_content_and_continue( "map", [ - "%s::unordered_map" % ns, + "UnorderedMap", children_are_key_value, "size=5 {", "hello", @@ -68,7 +68,7 @@ def cleanup(): self.look_for_content_and_continue( "mmap", [ - "%s::unordered_multimap" % ns, + "UnorderedMultiMap", children_are_key_value, "size=6 {", "first = 3", @@ -81,7 +81,7 @@ def cleanup(): self.look_for_content_and_continue( "iset", [ - "%s::unordered_set" % ns, + "IntsUnorderedSet", "size=5 {", "\[\d\] = 5", "\[\d\] = 3", @@ -92,7 +92,7 @@ def cleanup(): self.look_for_content_and_continue( "sset", [ - "%s::unordered_set" % ns, + "StringsUnorderedSet", "size=5 {", '\[\d\] = "is"', '\[\d\] = "world"', @@ -103,7 +103,7 @@ def cleanup(): self.look_for_content_and_continue( "imset", [ - "%s::unordered_multiset" % ns, + 
"IntsUnorderedMultiSet", "size=6 {", "(\[\d\] = 3(\\n|.)+){3}", "\[\d\] = 2", @@ -114,7 +114,7 @@ def cleanup(): self.look_for_content_and_continue( "smset", [ - "%s::unordered_multiset" % ns, + "StringsUnorderedMultiSet", "size=5 {", '(\[\d\] = "is"(\\n|.)+){2}', '(\[\d\] = "world"(\\n|.)+){2}', diff --git a/lldb/test/API/functionalities/data-formatter/data-formatter-stl/generic/unordered/main.cpp b/lldb/test/API/functionalities/data-formatter/data-formatter-stl/generic/unordered/main.cpp index 00d37dcb4bd04..59a5166c505b3 100644 --- a/lldb/test/API/functionalities/data-formatter/data-formatter-stl/generic/unordered/main.cpp +++ b/lldb/test/API/functionalities/data-formatter/data-formatter-stl/generic/unordered/main.cpp @@ -18,7 +18,9 @@ int main() { char buffer[sizeof(std::unordered_map)] = {0}; std::unordered_map &corrupt_map = *(std::unordered_map *)buffer; - std::unordered_map map; // Set break point at this line. + // Make a typedef to ensure functionality when typedefs are used. + typedef std::unordered_map UnorderedMap; + UnorderedMap map; // Set break point at this line. map.emplace(1, "hello"); map.emplace(2, "world"); map.emplace(3, "this"); @@ -26,7 +28,9 @@ int main() { map.emplace(5, "me"); thefoo_rw(); // Set break point at this line. - std::unordered_multimap mmap; + // Make a typedef to ensure functionality when typedefs are used. + typedef std::unordered_multimap UnorderedMultiMap; + UnorderedMultiMap mmap; mmap.emplace(1, "hello"); mmap.emplace(2, "hello"); mmap.emplace(2, "world"); @@ -35,7 +39,9 @@ int main() { mmap.emplace(3, "this"); thefoo_rw(); // Set break point at this line. - std::unordered_set iset; + // Make a typedef to ensure functionality when typedefs are used. + typedef std::unordered_set IntsUnorderedSet; + IntsUnorderedSet iset; iset.emplace(1); iset.emplace(2); iset.emplace(3); @@ -43,7 +49,9 @@ int main() { iset.emplace(5); thefoo_rw(); // Set break point at this line. 
- std::unordered_set sset; + // Make a typedef to ensure functionality when typedefs are used. + typedef std::unordered_set StringsUnorderedSet; + StringsUnorderedSet sset; sset.emplace("hello"); sset.emplace("world"); sset.emplace("this"); @@ -51,7 +59,9 @@ int main() { sset.emplace("me"); thefoo_rw(); // Set break point at this line. - std::unordered_multiset imset; + // Make a typedef to ensure functionality when typedefs are used. + typedef std::unordered_multiset IntsUnorderedMultiSet; + IntsUnorderedMultiSet imset; imset.emplace(1); imset.emplace(2); imset.emplace(2); @@ -60,7 +70,9 @@ int main() { imset.emplace(3); thefoo_rw(); // Set break point at this line. - std::unordered_multiset smset; + // Make a typedef to ensure functionality when typedefs are used. + typedef std::unordered_multiset StringsUnorderedMultiSet; + StringsUnorderedMultiSet smset; smset.emplace("hello"); smset.emplace("world"); smset.emplace("world"); diff --git a/lldb/test/API/functionalities/plugins/python_os_plugin/stepping_plugin_threads/TestOSPluginStepping.py b/lldb/test/API/functionalities/plugins/python_os_plugin/stepping_plugin_threads/TestOSPluginStepping.py index 47d6f5d68bbe6..0d06a9da6535c 100644 --- a/lldb/test/API/functionalities/plugins/python_os_plugin/stepping_plugin_threads/TestOSPluginStepping.py +++ b/lldb/test/API/functionalities/plugins/python_os_plugin/stepping_plugin_threads/TestOSPluginStepping.py @@ -40,11 +40,6 @@ def is_os_thread(self, thread): def run_python_os_step_missing_thread(self, do_prune): """Test that the Python operating system plugin works correctly""" - # Our OS plugin does NOT report all threads: - result = self.dbg.HandleCommand( - "settings set process.experimental.os-plugin-reports-all-threads false" - ) - python_os_plugin_path = os.path.join(self.getSourceDir(), "operating_system.py") (target, self.process, thread, thread_bkpt) = lldbutil.run_to_source_breakpoint( self, "first stop in thread - do a step out", self.main_file diff --git 
a/lldb/test/API/functionalities/plugins/python_os_plugin/stepping_plugin_threads/operating_system.py b/lldb/test/API/functionalities/plugins/python_os_plugin/stepping_plugin_threads/operating_system.py index eb02ff534f210..855cdbaf7cdc8 100644 --- a/lldb/test/API/functionalities/plugins/python_os_plugin/stepping_plugin_threads/operating_system.py +++ b/lldb/test/API/functionalities/plugins/python_os_plugin/stepping_plugin_threads/operating_system.py @@ -34,6 +34,9 @@ def __init__(self, process): if not self.g_value.IsValid(): print("Could not find g_value") + def does_plugin_report_all_threads(self): + return False + def create_thread(self, tid, context): print("Called create thread with tid: ", tid) return None diff --git a/lldb/test/API/functionalities/thread/step_until/TestStepUntilAPI.py b/lldb/test/API/functionalities/thread/step_until/TestStepUntilAPI.py new file mode 100644 index 0000000000000..59e028acf014c --- /dev/null +++ b/lldb/test/API/functionalities/thread/step_until/TestStepUntilAPI.py @@ -0,0 +1,136 @@ +import lldb +from lldbsuite.test.decorators import * +from lldbsuite.test.lldbtest import * +from lldbsuite.test import lldbutil + + +class TestStepUntilAPI(TestBase): + NO_DEBUG_INFO_TESTCASE = True + + def setUp(self): + super().setUp() + + self.main_source = "main.c" + self.main_spec = lldb.SBFileSpec(self.main_source) + self.less_than_two = line_number("main.c", "Less than 2") + self.greater_than_two = line_number("main.c", "Greater than or equal to 2.") + self.back_out_in_main = line_number("main.c", "Back out in main") + self.in_foo = line_number("main.c", "In foo") + + def _build_dict_for_discontinuity(self): + return dict( + CFLAGS_EXTRAS="-funique-basic-block-section-names " + + "-ffunction-sections -fbasic-block-sections=list=" + + self.getSourcePath("function.list"), + LD_EXTRAS="-Wl,--script=" + self.getSourcePath("symbol.order"), + ) + + def _do_until(self, build_dict, args, until_line, expected_line): + 
self.build(dictionary=build_dict) + launch_info = lldb.SBLaunchInfo(args) + _, _, thread, _ = lldbutil.run_to_source_breakpoint( + self, "At the start", self.main_spec, launch_info + ) + + self.assertSuccess( + thread.StepOverUntil(self.frame(), self.main_spec, until_line) + ) + + self.runCmd("process status") + + line = self.frame().GetLineEntry().GetLine() + self.assertEqual( + line, expected_line, "Did not get the expected stop line number" + ) + + def _assertDiscontinuity(self): + target = self.target() + foo = target.FindFunctions("foo") + self.assertEqual(len(foo), 1) + foo = foo[0] + + call_me = self.target().FindFunctions("call_me") + self.assertEqual(len(call_me), 1) + call_me = call_me[0] + + foo_addr = foo.function.GetStartAddress().GetLoadAddress(target) + found_before = False + found_after = False + for range in call_me.function.GetRanges(): + addr = range.GetBaseAddress().GetLoadAddress(target) + if addr < foo_addr: + found_before = True + if addr > foo_addr: + found_after = True + + self.assertTrue( + found_before and found_after, + "'foo' is not between 'call_me'" + str(foo) + str(call_me), + ) + + def test_hitting(self): + """Test SBThread.StepOverUntil - targeting a line and hitting it.""" + self._do_until(None, None, self.less_than_two, self.less_than_two) + + @skipIf(oslist=lldbplatformutil.getDarwinOSTriples() + ["windows"]) + @skipIf(archs=no_match(["x86_64", "aarch64"])) + def test_hitting_discontinuous(self): + """Test SBThread.StepOverUntil - targeting a line and hitting it -- with + discontinuous functions""" + self._do_until( + self._build_dict_for_discontinuity(), + None, + self.less_than_two, + self.less_than_two, + ) + self._assertDiscontinuity() + + def test_missing(self): + """Test SBThread.StepOverUntil - targeting a line and missing it by stepping out to call site""" + self._do_until( + None, ["foo", "bar", "baz"], self.less_than_two, self.back_out_in_main + ) + + @skipIf(oslist=lldbplatformutil.getDarwinOSTriples() + ["windows"]) 
+ @skipIf(archs=no_match(["x86_64", "aarch64"])) + def test_missing_discontinuous(self): + """Test SBThread.StepOverUntil - targeting a line and missing it by + stepping out to call site -- with discontinuous functions""" + self._do_until( + self._build_dict_for_discontinuity(), + ["foo", "bar", "baz"], + self.less_than_two, + self.back_out_in_main, + ) + self._assertDiscontinuity() + + def test_bad_line(self): + """Test that we get an error if attempting to step outside the current + function""" + self.build() + _, _, thread, _ = lldbutil.run_to_source_breakpoint( + self, "At the start", self.main_spec + ) + self.assertIn( + "step until target not in current function", + thread.StepOverUntil( + self.frame(), self.main_spec, self.in_foo + ).GetCString(), + ) + + @skipIf(oslist=lldbplatformutil.getDarwinOSTriples() + ["windows"]) + @skipIf(archs=no_match(["x86_64", "aarch64"])) + def test_bad_line_discontinuous(self): + """Test that we get an error if attempting to step outside the current + function -- and the function is discontinuous""" + self.build(dictionary=self._build_dict_for_discontinuity()) + _, _, thread, _ = lldbutil.run_to_source_breakpoint( + self, "At the start", self.main_spec + ) + self.assertIn( + "step until target not in current function", + thread.StepOverUntil( + self.frame(), self.main_spec, self.in_foo + ).GetCString(), + ) + self._assertDiscontinuity() diff --git a/lldb/test/API/functionalities/thread/step_until/function.list b/lldb/test/API/functionalities/thread/step_until/function.list new file mode 100644 index 0000000000000..5900fe8c35069 --- /dev/null +++ b/lldb/test/API/functionalities/thread/step_until/function.list @@ -0,0 +1 @@ +!call_me diff --git a/lldb/test/API/functionalities/thread/step_until/main.c b/lldb/test/API/functionalities/thread/step_until/main.c index bb866079cf5f5..4c52308f030e9 100644 --- a/lldb/test/API/functionalities/thread/step_until/main.c +++ b/lldb/test/API/functionalities/thread/step_until/main.c @@ -4,6 
+4,9 @@ * unrelated to the program, just to achieve consistent * debug line tables, across platforms, that are not * dependent on compiler optimzations. */ + +int foo(int x) { return x; /* In foo */ } + int call_me(int argc) { printf ("At the start, argc: %d.\n", argc); diff --git a/lldb/test/API/functionalities/thread/step_until/symbol.order b/lldb/test/API/functionalities/thread/step_until/symbol.order new file mode 100644 index 0000000000000..dcc9607a4188f --- /dev/null +++ b/lldb/test/API/functionalities/thread/step_until/symbol.order @@ -0,0 +1,9 @@ +SECTIONS { + .text.ordered : { + *(.text.call_me) + *(.text.foo) + *(.text.call_me.call_me.__part.1) + *(.text.call_me.call_me.__part.2) + *(.text.call_me.call_me.__part.3) + } +} INSERT BEFORE .text; diff --git a/lldb/test/API/lang/cpp/nested-template/TestNestedTemplate.py b/lldb/test/API/lang/cpp/nested-template/TestNestedTemplate.py index 42db060529a81..055a8e6e21042 100644 --- a/lldb/test/API/lang/cpp/nested-template/TestNestedTemplate.py +++ b/lldb/test/API/lang/cpp/nested-template/TestNestedTemplate.py @@ -17,6 +17,16 @@ def do_test(self, debug_flags): DATA_TYPES_DISPLAYED_CORRECTLY, substrs=["1 match found"], ) + self.expect( + "image lookup -A -t 'NS::Struct'", + DATA_TYPES_DISPLAYED_CORRECTLY, + substrs=["1 match found"], + ) + self.expect( + "image lookup -A -t 'NS::Union'", + DATA_TYPES_DISPLAYED_CORRECTLY, + substrs=["1 match found"], + ) @skipIf(compiler=no_match("clang")) @skipIf(compiler_version=["<", "15.0"]) diff --git a/lldb/test/API/lang/cpp/nested-template/main.cpp b/lldb/test/API/lang/cpp/nested-template/main.cpp index 06d1094880964..9bef73052825f 100644 --- a/lldb/test/API/lang/cpp/nested-template/main.cpp +++ b/lldb/test/API/lang/cpp/nested-template/main.cpp @@ -5,6 +5,15 @@ struct Outer { struct Inner {}; }; +namespace NS { +namespace { +template struct Struct {}; +template struct Union {}; +} // namespace +} // namespace NS + int main() { Outer::Inner oi; + NS::Struct ns_struct; + 
NS::Union ns_union; } diff --git a/lldb/test/API/python_api/sbprogress/TestSBProgress.py b/lldb/test/API/python_api/sbprogress/TestSBProgress.py new file mode 100644 index 0000000000000..c456247da80c6 --- /dev/null +++ b/lldb/test/API/python_api/sbprogress/TestSBProgress.py @@ -0,0 +1,35 @@ +"""Test the SBProgress API.""" + +import lldb +from lldbsuite.test.lldbtest import * + + +class SBProgressTestCase(TestBase): + def test_with_external_bit_set(self): + """Test SBProgress events are listened to when the external bit is set.""" + + progress = lldb.SBProgress("Test SBProgress", "Test progress", self.dbg) + listener = lldb.SBListener("Test listener") + broadcaster = self.dbg.GetBroadcaster() + broadcaster.AddListener(listener, lldb.eBroadcastBitExternalProgress) + event = lldb.SBEvent() + + expected_string = "Test progress first increment" + progress.Increment(1, expected_string) + self.assertTrue(listener.PeekAtNextEvent(event)) + stream = lldb.SBStream() + event.GetDescription(stream) + self.assertIn(expected_string, stream.GetData()) + + def test_without_external_bit_set(self): + """Test SBProgress events are not listened to on the internal progress bit.""" + + progress = lldb.SBProgress("Test SBProgress", "Test progress", self.dbg) + listener = lldb.SBListener("Test listener") + broadcaster = self.dbg.GetBroadcaster() + broadcaster.AddListener(listener, lldb.eBroadcastBitProgress) + event = lldb.SBEvent() + + expected_string = "Test progress first increment" + progress.Increment(1, expected_string) + self.assertFalse(listener.PeekAtNextEvent(event)) diff --git a/lldb/test/API/python_api/sbsavecoreoptions/TestSBSaveCoreOptions.py b/lldb/test/API/python_api/sbsavecoreoptions/TestSBSaveCoreOptions.py index 40d0cc7e96ff4..ace84e8497a59 100644 --- a/lldb/test/API/python_api/sbsavecoreoptions/TestSBSaveCoreOptions.py +++ b/lldb/test/API/python_api/sbsavecoreoptions/TestSBSaveCoreOptions.py @@ -4,15 +4,18 @@ from lldbsuite.test.decorators import * from 
lldbsuite.test.lldbtest import * + class SBSaveCoreOptionsAPICase(TestBase): basic_minidump = "basic_minidump.yaml" basic_minidump_different_pid = "basic_minidump_different_pid.yaml" def get_process_from_yaml(self, yaml_file): minidump_path = self.getBuildArtifact(os.path.basename(yaml_file) + ".dmp") - print ("minidump_path: " + minidump_path) + print("minidump_path: " + minidump_path) self.yaml2obj(yaml_file, minidump_path) - self.assertTrue(os.path.exists(minidump_path), "yaml2obj did not emit a minidump file") + self.assertTrue( + os.path.exists(minidump_path), "yaml2obj did not emit a minidump file" + ) target = self.dbg.CreateTarget(None) process = target.LoadCore(minidump_path) self.assertTrue(process.IsValid(), "Process is not valid") @@ -59,7 +62,6 @@ def test_adding_and_removing_thread(self): removed_success = options.RemoveThread(thread) self.assertFalse(removed_success) - def test_adding_thread_different_process(self): """Test adding and removing a thread from save core options.""" options = lldb.SBSaveCoreOptions() @@ -79,3 +81,26 @@ def test_adding_thread_different_process(self): self.assertTrue(error.Fail()) error = options.AddThread(thread) self.assertTrue(error.Success()) + + def test_removing_and_adding_insertion_order(self): + """Test insertion order is maintained when removing and adding threads.""" + options = lldb.SBSaveCoreOptions() + process = self.get_basic_process() + threads = [] + for x in range(0, 3): + thread = process.GetThreadAtIndex(x) + threads.append(thread) + error = options.AddThread(thread) + self.assertTrue(error.Success()) + + # Get the middle thread, remove it, and insert it back. 
+ middle_thread = threads[1] + self.assertTrue(options.RemoveThread(middle_thread)) + thread_collection = options.GetThreadsToSave() + self.assertTrue(thread_collection is not None) + self.assertEqual(thread_collection.GetSize(), 2) + error = options.AddThread(middle_thread) + self.assertTrue(error.Success()) + thread_collection = options.GetThreadsToSave() + self.assertEqual(thread_collection.GetSize(), 3) + self.assertIn(middle_thread, thread_collection) diff --git a/lldb/test/API/python_api/sbsavecoreoptions/basic_minidump.yaml b/lldb/test/API/python_api/sbsavecoreoptions/basic_minidump.yaml index 993c7da21225a..96302fbfb6b5c 100644 --- a/lldb/test/API/python_api/sbsavecoreoptions/basic_minidump.yaml +++ b/lldb/test/API/python_api/sbsavecoreoptions/basic_minidump.yaml @@ -24,3 +24,13 @@ Streams: Stack: Start of Memory Range: 0x00007FFFC8D0E000 Content: 'DEADBEEF' + - Thread Id: 0x000074DE + Context: 0000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000B0010000000000033000000000000000000000002020100000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000040109600000000000100000000000000000000000000000068E7D0C8FF7F000068E7D0C8FF7F000097E6D0C8FF7F000010109600000000000000000000000000020000000000000088E4D0C8FF7F0000603FFF85C77F0000F00340000000000080E7D0C8FF7F000000000000000000000000000000000000E0034000000000007F0300000000000000000000000000000000000000000000801F0000FFFF00000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF252525252525252525252525252525250000000000000000000000000000000000000000000000000000000000000000FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF00000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000
00000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000FF00000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000 + Stack: + Start of Memory Range: 0x00007FFFC8D0A000 + Content: 'BEEFDEAD' + - Thread Id: 0x000074DF + Context: 
0000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000B0010000000000033000000000000000000000002020100000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000040109600000000000100000000000000000000000000000068E7D0C8FF7F000068E7D0C8FF7F000097E6D0C8FF7F000010109600000000000000000000000000020000000000000088E4D0C8FF7F0000603FFF85C77F0000F00340000000000080E7D0C8FF7F000000000000000000000000000000000000E0034000000000007F0300000000000000000000000000000000000000000000801F0000FFFF00000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF252525252525252525252525252525250000000000000000000000000000000000000000000000000000000000000000FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF0000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000FF000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000
00000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000 + Stack: + Start of Memory Range: 0x00007FFFC8DFF000 + Content: 'BAADBEEF' diff --git a/lldb/test/API/tools/lldb-dap/variables/TestDAP_variables.py b/lldb/test/API/tools/lldb-dap/variables/TestDAP_variables.py index f4f30b6677e53..580ad38ab51c1 100644 --- a/lldb/test/API/tools/lldb-dap/variables/TestDAP_variables.py +++ b/lldb/test/API/tools/lldb-dap/variables/TestDAP_variables.py @@ -672,6 +672,7 @@ def test_indexedVariables_with_raw_child_for_synthetics(self): self.do_test_indexedVariables(enableSyntheticChildDebugging=True) @skipIfWindows + @skipIfAsan # FIXME this fails with a non-asan issue on green dragon. def test_registers(self): """ Test that registers whose byte size is the size of a pointer on diff --git a/lldb/unittests/SymbolFile/DWARF/DWARFASTParserClangTests.cpp b/lldb/unittests/SymbolFile/DWARF/DWARFASTParserClangTests.cpp index f22d76b3973e5..b31f56aa372d5 100644 --- a/lldb/unittests/SymbolFile/DWARF/DWARFASTParserClangTests.cpp +++ b/lldb/unittests/SymbolFile/DWARF/DWARFASTParserClangTests.cpp @@ -741,3 +741,164 @@ TEST_F(DWARFASTParserClangTests, TestUniqueDWARFASTTypeMap_CppInsertMapFind) { ASSERT_EQ(type_sp, reparsed_type_sp); } + +TEST_F(DWARFASTParserClangTests, TestParseDWARFAttributes_ObjectPointer) { + // This tests the behaviour of ParsedDWARFTypeAttributes + // for DW_TAG_subprogram definitions which have a DW_AT_object_pointer + // *and* a DW_AT_specification that also has a DW_AT_object_pointer. 
+ // We don't want the declaration DW_AT_object_pointer to overwrite the + // one from the more specific definition's. + + const char *yamldata = R"( +--- !ELF +FileHeader: + Class: ELFCLASS64 + Data: ELFDATA2LSB + Type: ET_EXEC + Machine: EM_AARCH64 +DWARF: + debug_str: + - Context + - func + - this + debug_abbrev: + - ID: 0 + Table: + - Code: 0x1 + Tag: DW_TAG_compile_unit + Children: DW_CHILDREN_yes + Attributes: + - Attribute: DW_AT_language + Form: DW_FORM_data2 + - Code: 0x2 + Tag: DW_TAG_structure_type + Children: DW_CHILDREN_yes + Attributes: + - Attribute: DW_AT_name + Form: DW_FORM_strp + - Code: 0x3 + Tag: DW_TAG_subprogram + Children: DW_CHILDREN_yes + Attributes: + - Attribute: DW_AT_name + Form: DW_FORM_strp + - Attribute: DW_AT_declaration + Form: DW_FORM_flag_present + - Attribute: DW_AT_object_pointer + Form: DW_FORM_ref4 + - Attribute: DW_AT_artificial + Form: DW_FORM_flag_present + - Attribute: DW_AT_external + Form: DW_FORM_flag_present + - Code: 0x4 + Tag: DW_TAG_formal_parameter + Children: DW_CHILDREN_no + Attributes: + - Attribute: DW_AT_artificial + Form: DW_FORM_flag_present + - Code: 0x5 + Tag: DW_TAG_subprogram + Children: DW_CHILDREN_yes + Attributes: + - Attribute: DW_AT_object_pointer + Form: DW_FORM_ref4 + - Attribute: DW_AT_specification + Form: DW_FORM_ref4 + - Code: 0x6 + Tag: DW_TAG_formal_parameter + Children: DW_CHILDREN_no + Attributes: + - Attribute: DW_AT_name + Form: DW_FORM_strp + - Attribute: DW_AT_artificial + Form: DW_FORM_flag_present + debug_info: + - Version: 5 + UnitType: DW_UT_compile + AddrSize: 8 + Entries: + +# DW_TAG_compile_unit +# DW_AT_language [DW_FORM_data2] (DW_LANG_C_plus_plus) + + - AbbrCode: 0x1 + Values: + - Value: 0x04 + +# DW_TAG_structure_type +# DW_AT_name [DW_FORM_strp] ("Context") + + - AbbrCode: 0x2 + Values: + - Value: 0x0 + +# DW_TAG_subprogram +# DW_AT_name [DW_FORM_strp] ("func") +# DW_AT_object_pointer [DW_FORM_ref4] + - AbbrCode: 0x3 + Values: + - Value: 0x8 + - Value: 0x1 + - Value: 0x1d 
+ - Value: 0x1 + - Value: 0x1 + +# DW_TAG_formal_parameter +# DW_AT_artificial + - AbbrCode: 0x4 + Values: + - Value: 0x1 + + - AbbrCode: 0x0 + - AbbrCode: 0x0 + +# DW_TAG_subprogram +# DW_AT_object_pointer [DW_FORM_ref4] ("this") +# DW_AT_specification [DW_FORM_ref4] ("func") + - AbbrCode: 0x5 + Values: + - Value: 0x29 + - Value: 0x14 + +# DW_TAG_formal_parameter +# DW_AT_name [DW_FORM_strp] ("this") +# DW_AT_artificial + - AbbrCode: 0x6 + Values: + - Value: 0xd + - Value: 0x1 + + - AbbrCode: 0x0 + - AbbrCode: 0x0 +... +)"; + YAMLModuleTester t(yamldata); + + DWARFUnit *unit = t.GetDwarfUnit(); + ASSERT_NE(unit, nullptr); + const DWARFDebugInfoEntry *cu_entry = unit->DIE().GetDIE(); + ASSERT_EQ(cu_entry->Tag(), DW_TAG_compile_unit); + ASSERT_EQ(unit->GetDWARFLanguageType(), DW_LANG_C_plus_plus); + DWARFDIE cu_die(unit, cu_entry); + + auto holder = std::make_unique("ast"); + auto &ast_ctx = *holder->GetAST(); + DWARFASTParserClangStub ast_parser(ast_ctx); + + auto context_die = cu_die.GetFirstChild(); + ASSERT_TRUE(context_die.IsValid()); + ASSERT_EQ(context_die.Tag(), DW_TAG_structure_type); + + auto subprogram_definition = context_die.GetSibling(); + ASSERT_TRUE(subprogram_definition.IsValid()); + ASSERT_EQ(subprogram_definition.Tag(), DW_TAG_subprogram); + ASSERT_FALSE(subprogram_definition.GetAttributeValueAsOptionalUnsigned( + DW_AT_external)); + + auto param_die = subprogram_definition.GetFirstChild(); + ASSERT_TRUE(param_die.IsValid()); + + ParsedDWARFTypeAttributes attrs(subprogram_definition); + EXPECT_TRUE(attrs.object_pointer.IsValid()); + EXPECT_EQ(attrs.object_pointer, param_die); +} diff --git a/lldb/unittests/SymbolFile/DWARF/DWARFDIETest.cpp b/lldb/unittests/SymbolFile/DWARF/DWARFDIETest.cpp index 1e4c8f3ba0778..3f61d1607073c 100644 --- a/lldb/unittests/SymbolFile/DWARF/DWARFDIETest.cpp +++ b/lldb/unittests/SymbolFile/DWARF/DWARFDIETest.cpp @@ -394,3 +394,643 @@ TEST(DWARFDIETest, GetContextInFunction) { 
EXPECT_THAT(foo_struct_die.GetTypeLookupContext(), testing::ElementsAre(make_struct("struct_t"))); } + +struct GetAttributesTestFixture : public testing::TestWithParam {}; + +TEST_P(GetAttributesTestFixture, TestGetAttributes_IterationOrder) { + // Tests that we accumulate all current DIE's attributes first + // before checking the attributes of the specification. + + const char *yamldata = R"( +--- !ELF +FileHeader: + Class: ELFCLASS64 + Data: ELFDATA2LSB + Type: ET_EXEC + Machine: EM_AARCH64 +DWARF: + debug_str: + - func + debug_abbrev: + - ID: 0 + Table: + - Code: 0x1 + Tag: DW_TAG_compile_unit + Children: DW_CHILDREN_yes + Attributes: + - Attribute: DW_AT_language + Form: DW_FORM_data2 + - Code: 0x2 + Tag: DW_TAG_subprogram + Children: DW_CHILDREN_no + Attributes: + - Attribute: DW_AT_high_pc + Form: DW_FORM_data4 + - Attribute: DW_AT_name + Form: DW_FORM_strp + - Attribute: DW_AT_declaration + Form: DW_FORM_flag_present + - Attribute: DW_AT_external + Form: DW_FORM_flag_present + - Attribute: DW_AT_low_pc + Form: DW_FORM_data4 + - Code: 0x3 + Tag: DW_TAG_subprogram + Children: DW_CHILDREN_no + Attributes: + - Attribute: DW_AT_high_pc + Form: DW_FORM_data4 + - Attribute: {0} + Form: DW_FORM_ref4 + - Attribute: DW_AT_low_pc + Form: DW_FORM_data4 + debug_info: + - Version: 5 + UnitType: DW_UT_compile + AddrSize: 8 + Entries: + +# DW_TAG_compile_unit +# DW_AT_language [DW_FORM_data2] (DW_LANG_C_plus_plus) + + - AbbrCode: 0x1 + Values: + - Value: 0x04 + +# DW_TAG_subprogram +# DW_AT_high_pc [DW_FORM_data4] +# DW_AT_name [DW_FORM_strp] ("func") +# DW_AT_low_pc [DW_FORM_data4] + - AbbrCode: 0x2 + Values: + - Value: 0xdeadbeef + - Value: 0x0 + - Value: 0x1 + - Value: 0x1 + - Value: 0xdeadbeef + +# DW_TAG_subprogram +# DW_AT_high_pc [DW_FORM_data4] +# DW_AT_specification [DW_FORM_ref4] ("func") +# DW_AT_low_pc [DW_FORM_data4] + - AbbrCode: 0x3 + Values: + - Value: 0xf00dcafe + - Value: 0xf + - Value: 0xf00dcafe + + - AbbrCode: 0x0 +... 
+)"; + YAMLModuleTester t(llvm::formatv(yamldata, GetParam()).str()); + + DWARFUnit *unit = t.GetDwarfUnit(); + ASSERT_NE(unit, nullptr); + const DWARFDebugInfoEntry *cu_entry = unit->DIE().GetDIE(); + ASSERT_EQ(cu_entry->Tag(), DW_TAG_compile_unit); + ASSERT_EQ(unit->GetDWARFLanguageType(), DW_LANG_C_plus_plus); + DWARFDIE cu_die(unit, cu_entry); + + auto declaration = cu_die.GetFirstChild(); + ASSERT_TRUE(declaration.IsValid()); + ASSERT_EQ(declaration.Tag(), DW_TAG_subprogram); + + auto definition = declaration.GetSibling(); + ASSERT_TRUE(definition.IsValid()); + ASSERT_EQ(definition.Tag(), DW_TAG_subprogram); + ASSERT_FALSE(definition.GetAttributeValueAsOptionalUnsigned(DW_AT_external)); + + auto attrs = definition.GetAttributes(DWARFDebugInfoEntry::Recurse::yes); + EXPECT_EQ(attrs.Size(), 7U); + + // Check that the attributes on the definition (that are also present + // on the declaration) take precedence. + for (auto attr : {DW_AT_low_pc, DW_AT_high_pc}) { + auto idx = attrs.FindAttributeIndex(attr); + EXPECT_NE(idx, UINT32_MAX); + + DWARFFormValue form_value; + auto success = attrs.ExtractFormValueAtIndex(idx, form_value); + EXPECT_TRUE(success); + + EXPECT_EQ(form_value.Unsigned(), 0xf00dcafe); + } +} + +TEST_P(GetAttributesTestFixture, TestGetAttributes_Cycle) { + // Tests that GetAttributes can deal with cycles in + // specifications/abstract origins. 
+ // + // Contrived example: + // + // func1 -> func3 + // ^ | + // | v + // +------func2 + + const char *yamldata = R"( +--- !ELF +FileHeader: + Class: ELFCLASS64 + Data: ELFDATA2LSB + Type: ET_EXEC + Machine: EM_AARCH64 +DWARF: + debug_abbrev: + - ID: 0 + Table: + - Code: 0x1 + Tag: DW_TAG_compile_unit + Children: DW_CHILDREN_yes + Attributes: + - Attribute: DW_AT_language + Form: DW_FORM_data2 + - Code: 0x2 + Tag: DW_TAG_subprogram + Children: DW_CHILDREN_no + Attributes: + - Attribute: {0} + Form: DW_FORM_ref4 + debug_info: + - Version: 5 + UnitType: DW_UT_compile + AddrSize: 8 + Entries: + + - AbbrCode: 0x1 + Values: + - Value: 0x04 + + - AbbrCode: 0x2 + Values: + - Value: 0x19 + + - AbbrCode: 0x2 + Values: + - Value: 0xf + + - AbbrCode: 0x2 + Values: + - Value: 0x14 + + - AbbrCode: 0x0 +... +)"; + YAMLModuleTester t(llvm::formatv(yamldata, GetParam()).str()); + + DWARFUnit *unit = t.GetDwarfUnit(); + ASSERT_NE(unit, nullptr); + const DWARFDebugInfoEntry *cu_entry = unit->DIE().GetDIE(); + ASSERT_EQ(cu_entry->Tag(), DW_TAG_compile_unit); + ASSERT_EQ(unit->GetDWARFLanguageType(), DW_LANG_C_plus_plus); + DWARFDIE cu_die(unit, cu_entry); + + auto func1 = cu_die.GetFirstChild(); + ASSERT_TRUE(func1.IsValid()); + ASSERT_EQ(func1.Tag(), DW_TAG_subprogram); + + auto func2 = func1.GetSibling(); + ASSERT_TRUE(func2.IsValid()); + ASSERT_EQ(func2.Tag(), DW_TAG_subprogram); + + auto func3 = func2.GetSibling(); + ASSERT_TRUE(func3.IsValid()); + ASSERT_EQ(func3.Tag(), DW_TAG_subprogram); + + auto attrs = func1.GetAttributes(DWARFDebugInfoEntry::Recurse::yes); + EXPECT_EQ(attrs.Size(), 3U); + + // Confirm that the specifications do form a cycle. 
+ { + DWARFFormValue form_value; + auto success = attrs.ExtractFormValueAtIndex(0, form_value); + ASSERT_TRUE(success); + + EXPECT_EQ(form_value.Reference(), func3); + } + + { + DWARFFormValue form_value; + auto success = attrs.ExtractFormValueAtIndex(1, form_value); + ASSERT_TRUE(success); + + EXPECT_EQ(form_value.Reference(), func2); + } + + { + DWARFFormValue form_value; + auto success = attrs.ExtractFormValueAtIndex(2, form_value); + ASSERT_TRUE(success); + + EXPECT_EQ(form_value.Reference(), func1); + } +} + +TEST_P(GetAttributesTestFixture, + TestGetAttributes_SkipNonApplicableAttributes) { + // Tests that GetAttributes will omit attributes found through + // specifications/abstract origins which are not applicable. + + const char *yamldata = R"( +--- !ELF +FileHeader: + Class: ELFCLASS64 + Data: ELFDATA2LSB + Type: ET_EXEC + Machine: EM_AARCH64 +DWARF: + debug_str: + - func + debug_abbrev: + - ID: 0 + Table: + - Code: 0x1 + Tag: DW_TAG_compile_unit + Children: DW_CHILDREN_yes + Attributes: + - Attribute: DW_AT_language + Form: DW_FORM_data2 + - Code: 0x2 + Tag: DW_TAG_subprogram + Children: DW_CHILDREN_no + Attributes: + - Attribute: DW_AT_declaration + Form: DW_FORM_flag_present + - Attribute: DW_AT_name + Form: DW_FORM_strp + - Attribute: DW_AT_sibling + Form: DW_FORM_ref4 + - Code: 0x3 + Tag: DW_TAG_subprogram + Children: DW_CHILDREN_no + Attributes: + - Attribute: DW_AT_declaration + Form: DW_FORM_flag_present + - Attribute: {0} + Form: DW_FORM_ref4 + - Attribute: DW_AT_sibling + Form: DW_FORM_ref4 + debug_info: + - Version: 5 + UnitType: DW_UT_compile + AddrSize: 8 + Entries: + +# DW_TAG_compile_unit +# DW_AT_language [DW_FORM_data2] (DW_LANG_C_plus_plus) + + - AbbrCode: 0x1 + Values: + - Value: 0x04 + +# DW_TAG_subprogram +# DW_AT_declaration +# DW_AT_name [DW_FORM_strp] ("func") +# DW_AT_sibling + - AbbrCode: 0x2 + Values: + - Value: 0x1 + - Value: 0x0 + - Value: 0x18 + +# DW_TAG_subprogram +# DW_AT_declaration +# DW_AT_specification [DW_FORM_ref4] 
("func") +# DW_AT_sibling + - AbbrCode: 0x3 + Values: + - Value: 0x1 + - Value: 0xf + - Value: 0xdeadbeef + + - AbbrCode: 0x0 +... +)"; + YAMLModuleTester t(llvm::formatv(yamldata, GetParam()).str()); + + DWARFUnit *unit = t.GetDwarfUnit(); + ASSERT_NE(unit, nullptr); + const DWARFDebugInfoEntry *cu_entry = unit->DIE().GetDIE(); + ASSERT_EQ(cu_entry->Tag(), DW_TAG_compile_unit); + ASSERT_EQ(unit->GetDWARFLanguageType(), DW_LANG_C_plus_plus); + DWARFDIE cu_die(unit, cu_entry); + + auto declaration = cu_die.GetFirstChild(); + ASSERT_TRUE(declaration.IsValid()); + ASSERT_EQ(declaration.Tag(), DW_TAG_subprogram); + + auto definition = declaration.GetSibling(); + ASSERT_TRUE(definition.IsValid()); + ASSERT_EQ(definition.Tag(), DW_TAG_subprogram); + + auto attrs = definition.GetAttributes(DWARFDebugInfoEntry::Recurse::yes); + EXPECT_EQ(attrs.Size(), 4U); + EXPECT_NE(attrs.FindAttributeIndex(DW_AT_name), UINT32_MAX); + EXPECT_NE(attrs.FindAttributeIndex(GetParam()), UINT32_MAX); + + auto sibling_idx = attrs.FindAttributeIndex(DW_AT_sibling); + EXPECT_NE(sibling_idx, UINT32_MAX); + + DWARFFormValue form_value; + auto success = attrs.ExtractFormValueAtIndex(sibling_idx, form_value); + ASSERT_TRUE(success); + + EXPECT_EQ(form_value.Unsigned(), 0xdeadbeef); +} + +TEST_P(GetAttributesTestFixture, TestGetAttributes_NoRecurse) { + // Tests that GetAttributes will not recurse if Recurse::No is passed to it. 
+ + const char *yamldata = R"( +--- !ELF +FileHeader: + Class: ELFCLASS64 + Data: ELFDATA2LSB + Type: ET_EXEC + Machine: EM_AARCH64 +DWARF: + debug_str: + - func + debug_abbrev: + - ID: 0 + Table: + - Code: 0x1 + Tag: DW_TAG_compile_unit + Children: DW_CHILDREN_yes + Attributes: + - Attribute: DW_AT_language + Form: DW_FORM_data2 + - Code: 0x2 + Tag: DW_TAG_subprogram + Children: DW_CHILDREN_no + Attributes: + - Attribute: DW_AT_name + Form: DW_FORM_strp + - Code: 0x3 + Tag: DW_TAG_subprogram + Children: DW_CHILDREN_no + Attributes: + - Attribute: DW_AT_low_pc + Form: DW_FORM_data4 + - Attribute: {0} + Form: DW_FORM_ref4 + debug_info: + - Version: 5 + UnitType: DW_UT_compile + AddrSize: 8 + Entries: + +# DW_TAG_compile_unit +# DW_AT_language [DW_FORM_data2] (DW_LANG_C_plus_plus) + + - AbbrCode: 0x1 + Values: + - Value: 0x04 + +# DW_TAG_subprogram +# DW_AT_name [DW_FORM_strp] ("func") + - AbbrCode: 0x2 + Values: + - Value: 0x0 + +# DW_TAG_subprogram +# DW_AT_low_pc [DW_FORM_data4] +# DW_AT_specification [DW_FORM_ref4] + - AbbrCode: 0x3 + Values: + - Value: 0xdeadbeef + - Value: 0xf + + - AbbrCode: 0x0 +... 
+)"; + YAMLModuleTester t(llvm::formatv(yamldata, GetParam()).str()); + + DWARFUnit *unit = t.GetDwarfUnit(); + ASSERT_NE(unit, nullptr); + const DWARFDebugInfoEntry *cu_entry = unit->DIE().GetDIE(); + ASSERT_EQ(cu_entry->Tag(), DW_TAG_compile_unit); + ASSERT_EQ(unit->GetDWARFLanguageType(), DW_LANG_C_plus_plus); + DWARFDIE cu_die(unit, cu_entry); + + auto declaration = cu_die.GetFirstChild(); + ASSERT_TRUE(declaration.IsValid()); + ASSERT_EQ(declaration.Tag(), DW_TAG_subprogram); + + auto definition = declaration.GetSibling(); + ASSERT_TRUE(definition.IsValid()); + ASSERT_EQ(definition.Tag(), DW_TAG_subprogram); + + auto attrs = definition.GetAttributes(DWARFDebugInfoEntry::Recurse::no); + EXPECT_EQ(attrs.Size(), 2U); + EXPECT_EQ(attrs.FindAttributeIndex(DW_AT_name), UINT32_MAX); + EXPECT_NE(attrs.FindAttributeIndex(GetParam()), UINT32_MAX); + EXPECT_NE(attrs.FindAttributeIndex(DW_AT_low_pc), UINT32_MAX); +} + +TEST_P(GetAttributesTestFixture, TestGetAttributes_InvalidSpec) { + // Test that GetAttributes doesn't try following invalid + // specifications (but still add it to the list of attributes). 
+ + const char *yamldata = R"( +--- !ELF +FileHeader: + Class: ELFCLASS64 + Data: ELFDATA2LSB + Type: ET_EXEC + Machine: EM_AARCH64 +DWARF: + debug_str: + - func + debug_abbrev: + - ID: 0 + Table: + - Code: 0x1 + Tag: DW_TAG_compile_unit + Children: DW_CHILDREN_yes + Attributes: + - Attribute: DW_AT_language + Form: DW_FORM_data2 + - Code: 0x2 + Tag: DW_TAG_subprogram + Children: DW_CHILDREN_no + Attributes: + - Attribute: DW_AT_name + Form: DW_FORM_strp + - Code: 0x3 + Tag: DW_TAG_subprogram + Children: DW_CHILDREN_no + Attributes: + - Attribute: {0} + Form: DW_FORM_ref4 + debug_info: + - Version: 5 + UnitType: DW_UT_compile + AddrSize: 8 + Entries: + +# DW_TAG_compile_unit +# DW_AT_language [DW_FORM_data2] (DW_LANG_C_plus_plus) + + - AbbrCode: 0x1 + Values: + - Value: 0x04 + +# DW_TAG_subprogram +# DW_AT_name [DW_FORM_strp] ("func") + - AbbrCode: 0x2 + Values: + - Value: 0x0 + +# DW_TAG_subprogram +# DW_AT_specification [DW_FORM_ref4] + - AbbrCode: 0x3 + Values: + - Value: 0xdeadbeef + + - AbbrCode: 0x0 +... 
+)"; + YAMLModuleTester t(llvm::formatv(yamldata, GetParam()).str()); + + DWARFUnit *unit = t.GetDwarfUnit(); + ASSERT_NE(unit, nullptr); + const DWARFDebugInfoEntry *cu_entry = unit->DIE().GetDIE(); + ASSERT_EQ(cu_entry->Tag(), DW_TAG_compile_unit); + ASSERT_EQ(unit->GetDWARFLanguageType(), DW_LANG_C_plus_plus); + DWARFDIE cu_die(unit, cu_entry); + + auto declaration = cu_die.GetFirstChild(); + ASSERT_TRUE(declaration.IsValid()); + ASSERT_EQ(declaration.Tag(), DW_TAG_subprogram); + + auto definition = declaration.GetSibling(); + ASSERT_TRUE(definition.IsValid()); + ASSERT_EQ(definition.Tag(), DW_TAG_subprogram); + + auto attrs = definition.GetAttributes(DWARFDebugInfoEntry::Recurse::yes); + EXPECT_EQ(attrs.Size(), 1U); + EXPECT_EQ(attrs.FindAttributeIndex(DW_AT_name), UINT32_MAX); + EXPECT_NE(attrs.FindAttributeIndex(GetParam()), UINT32_MAX); +} + +TEST(DWARFDIETest, TestGetAttributes_Worklist) { + // Test that GetAttributes will follow both the abstract origin + // and specification on a single DIE correctly (omitting non-applicable + // attributes in the process). 
+ + // Contrived example where + // f1---> f2 --> f4 + // `-> f3 `-> f5 + // + const char *yamldata = R"( +--- !ELF +FileHeader: + Class: ELFCLASS64 + Data: ELFDATA2LSB + Type: ET_EXEC + Machine: EM_AARCH64 +DWARF: + debug_str: + - foo + - bar + debug_abbrev: + - ID: 0 + Table: + - Code: 0x1 + Tag: DW_TAG_compile_unit + Children: DW_CHILDREN_yes + Attributes: + - Attribute: DW_AT_language + Form: DW_FORM_data2 + - Code: 0x2 + Tag: DW_TAG_subprogram + Children: DW_CHILDREN_no + Attributes: + - Attribute: DW_AT_specification + Form: DW_FORM_ref4 + - Attribute: DW_AT_abstract_origin + Form: DW_FORM_ref4 + - Code: 0x3 + Tag: DW_TAG_subprogram + Children: DW_CHILDREN_no + Attributes: + - Attribute: DW_AT_declaration + Form: DW_FORM_flag_present + - Attribute: DW_AT_artificial + Form: DW_FORM_flag_present + + debug_info: + - Version: 5 + UnitType: DW_UT_compile + AddrSize: 8 + Entries: + + - AbbrCode: 0x1 + Values: + - Value: 0x04 + +# DW_TAG_subprogram ("f1") +# DW_AT_specification [DW_FORM_ref4] ("f2") +# DW_AT_abstract_origin [DW_FORM_ref4] ("f3") + - AbbrCode: 0x2 + Values: + - Value: 0x18 + - Value: 0x21 + +# DW_TAG_subprogram ("f2") +# DW_AT_specification [DW_FORM_ref4] ("f4") +# DW_AT_abstract_origin [DW_FORM_ref4] ("f5") + - AbbrCode: 0x2 + Values: + - Value: 0x22 + - Value: 0x23 + +# DW_TAG_subprogram ("f3") +# DW_AT_declaration [DW_FORM_flag_present] +# DW_AT_artificial [DW_FORM_flag_present] + - AbbrCode: 0x3 + Values: + - Value: 0x1 + - Value: 0x1 + +# DW_TAG_subprogram ("f4") +# DW_AT_declaration [DW_FORM_flag_present] +# DW_AT_artificial [DW_FORM_flag_present] + - AbbrCode: 0x3 + Values: + - Value: 0x1 + - Value: 0x1 + +# DW_TAG_subprogram ("f5") +# DW_AT_declaration [DW_FORM_flag_present] +# DW_AT_artificial [DW_FORM_flag_present] + - AbbrCode: 0x3 + Values: + - Value: 0x1 + - Value: 0x1 + + - AbbrCode: 0x0 +... 
+)"; + YAMLModuleTester t(yamldata); + + DWARFUnit *unit = t.GetDwarfUnit(); + ASSERT_NE(unit, nullptr); + const DWARFDebugInfoEntry *cu_entry = unit->DIE().GetDIE(); + ASSERT_EQ(cu_entry->Tag(), DW_TAG_compile_unit); + ASSERT_EQ(unit->GetDWARFLanguageType(), DW_LANG_C_plus_plus); + DWARFDIE cu_die(unit, cu_entry); + + auto f1 = cu_die.GetFirstChild(); + ASSERT_TRUE(f1.IsValid()); + ASSERT_EQ(f1.Tag(), DW_TAG_subprogram); + + auto attrs = f1.GetAttributes(DWARFDebugInfoEntry::Recurse::yes); + EXPECT_EQ(attrs.Size(), 7U); + EXPECT_EQ(attrs.FindAttributeIndex(DW_AT_declaration), UINT32_MAX); +} + +INSTANTIATE_TEST_SUITE_P(GetAttributeTests, GetAttributesTestFixture, + testing::Values(DW_AT_specification, + DW_AT_abstract_origin)); diff --git a/llvm/Maintainers.md b/llvm/Maintainers.md index e2af991ed37b1..10714b508ca68 100644 --- a/llvm/Maintainers.md +++ b/llvm/Maintainers.md @@ -169,10 +169,26 @@ rnk@google.com (email), [rnk](https://github.com/rnk) (GitHub) ### Backends / Targets -#### AArch64 backend +#### ARM and AArch64 backends -Tim Northover \ -t.p.northover@gmail.com (email), [TNorthover](https://github.com/TNorthover) (GitHub) +David Green \ +david.green@arm.com (email), [davemgreen](https://github.com/davemgreen) (GitHub) \ +Amara Emerson (esp. AArch64 GlobalISel) \ +amara@apple.com (email), [aemerson](https://github.com/aemerson) (GitHub) \ +Eli Friedman (esp. ARM64EC) \ +efriedma@quicinc.com (email), [efriedma-quic](https://github.com/efriedma-quic) (GitHub) \ +Sjoerd Meijer \ +smeijer@nvidia.com (email), [sjoerdmeijer](https://github.com/sjoerdmeijer) (GitHub) \ +Nashe Mncube \ +nashe.mncube@arm.com (email), [nasherm](https://github.com/nasherm) (GitHub) \ +Sander de Smalen (esp. scalable vectorization/SVE/SME) \ +sander.desmalen@arm.com (email), [sdesmalen-arm](https://github.com/sdesmalen-arm) (GitHub) \ +Peter Smith (Anything ABI) \ +peter.smith@arm.com (email), [smithp35](https://github.com/smithp35) (GitHub) \ +Oliver Stannard (esp. 
assembly/disassembly) \
+oliver.stannard@arm.com (email), [ostannard](https://github.com/ostannard) (GitHub) \
+Ties Stuij (Arm GlobalISel and early arch support) \
+ties.stuij@arm.com (email), [stuij](https://github.com/stuij) (GitHub)
 
 #### AMDGPU backend
 
@@ -184,19 +200,6 @@ Matthew.Arsenault@amd.com, arsenm2@gmail.com (email), [arsenm](https://github.co
 Mark Schimmel \
 marksl@synopsys.com (email), [markschimmel](https://github.com/markschimmel) (GitHub)
 
-#### ARM backend
-
-David Green \
-david.green@arm.com (email), [davemgreen](https://github.com/davemgreen) (GitHub) \
-Oliver Stannard (Especially assembly/dissassembly) \
-oliver.stannard@arm.com (email), [ostannard](https://github.com/ostannard) (GitHub) \
-Nashe Mncube \
-nashe.mncube@arm.com (email), [nasherm](https://github.com/nasherm) (GitHub) \
-Peter Smith (Anything ABI) \
-peter.smith@arm.com (email), [smithp35](https://github.com/smithp35) (GitHub) \
-Ties Stuij (GlobalISel and early arch support) \
-ties.stuij@arm.com (email), [stuij](https://github.com/stuij) (GitHub)
-
 #### AVR backend
 
 Ben Shi \
@@ -480,6 +483,7 @@ James Grosbach (grosbach@apple.com) -- MC layer \
 Anton Korobeynikov (anton@korobeynikov.info, [asl](https://github.com/asl)) -- ARM EABI, Windows codegen \
 Benjamin Kramer (benny.kra@gmail.com, [d0k](https://github.com/d0k)) -- DWARF Parser \
 David Majnemer (david.majnemer@gmail.com, [majnemer](https://github.com/majnemer)) -- InstCombine, ConstantFold \
+Tim Northover (t.p.northover@gmail.com, [TNorthover](https://github.com/TNorthover)) -- AArch64 backend \
 Chad Rosier (mcrosier@codeaurora.org) -- FastISel \
 Hans Wennborg (hans@chromium.org, [zmodem](https://github.com/zmodem)) -- Release management \
 Kostya Serebryany ([kcc](https://github.com/kcc)) -- Sanitizers \
diff --git a/llvm/cmake/config-ix.cmake b/llvm/cmake/config-ix.cmake
index 64878d28d9e1e..0cc3a4aa5cccd 100644
--- a/llvm/cmake/config-ix.cmake
+++ b/llvm/cmake/config-ix.cmake
@@ -17,6 +17,51 @@
include(CheckCompilerVersion) include(CheckProblematicConfigurations) include(HandleLLVMStdlib) +if (ANDROID OR CYGWIN OR CMAKE_SYSTEM_NAME MATCHES "AIX|DragonFly|FreeBSD|Haiku|Linux|NetBSD|OpenBSD|SunOS") + set(HAVE_DLFCN_H 1) + set(HAVE_MACH_MACH_H 0) + set(HAVE_MALLOC_MALLOC_H 0) + set(HAVE_PTHREAD_H 1) + set(HAVE_SYS_MMAN_H 1) + set(HAVE_SYSEXITS_H 1) + set(HAVE_UNISTD_H 1) +elseif (APPLE) + set(HAVE_DLFCN_H 1) + set(HAVE_MACH_MACH_H 1) + set(HAVE_MALLOC_MALLOC_H 1) + set(HAVE_PTHREAD_H 1) + set(HAVE_SYS_MMAN_H 1) + set(HAVE_SYSEXITS_H 1) + set(HAVE_UNISTD_H 1) +elseif (PURE_WINDOWS) + set(HAVE_DLFCN_H 0) + set(HAVE_MACH_MACH_H 0) + set(HAVE_MALLOC_MALLOC_H 0) + set(HAVE_PTHREAD_H 0) + set(HAVE_SYS_MMAN_H 0) + set(HAVE_SYSEXITS_H 0) + set(HAVE_UNISTD_H 0) +elseif (ZOS) + # Confirmed in + # https://github.com/llvm/llvm-project/pull/104706#issuecomment-2297109613 + set(HAVE_DLFCN_H 1) + set(HAVE_MACH_MACH_H 0) + set(HAVE_MALLOC_MALLOC_H 0) + set(HAVE_PTHREAD_H 1) + set(HAVE_SYS_MMAN_H 1) + set(HAVE_SYSEXITS_H 0) + set(HAVE_UNISTD_H 1) +else() + # Other platforms that we don't promise support for. 
+ check_include_file(dlfcn.h HAVE_DLFCN_H) + check_include_file(mach/mach.h HAVE_MACH_MACH_H) + check_include_file(malloc/malloc.h HAVE_MALLOC_MALLOC_H) + check_include_file(pthread.h HAVE_PTHREAD_H) + check_include_file(sys/mman.h HAVE_SYS_MMAN_H) + check_include_file(sysexits.h HAVE_SYSEXITS_H) + check_include_file(unistd.h HAVE_UNISTD_H) +endif() + if( UNIX AND NOT (APPLE OR BEOS OR HAIKU) ) # Used by check_symbol_exists: list(APPEND CMAKE_REQUIRED_LIBRARIES "m") @@ -58,24 +103,7 @@ if(LLVM_USING_GLIBC) endif() # include checks -check_include_file(dlfcn.h HAVE_DLFCN_H) -check_include_file(errno.h HAVE_ERRNO_H) -check_include_file(fcntl.h HAVE_FCNTL_H) -check_include_file(malloc/malloc.h HAVE_MALLOC_MALLOC_H) -if( NOT PURE_WINDOWS ) - check_include_file(pthread.h HAVE_PTHREAD_H) -endif() -check_include_file(signal.h HAVE_SIGNAL_H) -check_include_file(sys/ioctl.h HAVE_SYS_IOCTL_H) -check_include_file(sys/mman.h HAVE_SYS_MMAN_H) -check_include_file(sys/resource.h HAVE_SYS_RESOURCE_H) -check_include_file(sys/stat.h HAVE_SYS_STAT_H) -check_include_file(sys/time.h HAVE_SYS_TIME_H) -check_include_file(sysexits.h HAVE_SYSEXITS_H) -check_include_file(termios.h HAVE_TERMIOS_H) -check_include_file(unistd.h HAVE_UNISTD_H) check_include_file(valgrind/valgrind.h HAVE_VALGRIND_VALGRIND_H) -check_include_file(fenv.h HAVE_FENV_H) check_symbol_exists(FE_ALL_EXCEPT "fenv.h" HAVE_DECL_FE_ALL_EXCEPT) check_symbol_exists(FE_INEXACT "fenv.h" HAVE_DECL_FE_INEXACT) check_c_source_compiles(" @@ -90,7 +118,6 @@ check_c_source_compiles(" int main(void) { return 0; }" HAVE_BUILTIN_THREAD_POINTER) -check_include_file(mach/mach.h HAVE_MACH_MACH_H) check_include_file(CrashReporterClient.h HAVE_CRASHREPORTERCLIENT_H) if(APPLE) check_c_source_compiles(" @@ -294,7 +321,6 @@ check_symbol_exists(_Unwind_Backtrace "unwind.h" HAVE__UNWIND_BACKTRACE) check_symbol_exists(getpagesize unistd.h HAVE_GETPAGESIZE) check_symbol_exists(sysconf unistd.h HAVE_SYSCONF) check_symbol_exists(getrusage 
sys/resource.h HAVE_GETRUSAGE) -check_symbol_exists(setrlimit sys/resource.h HAVE_SETRLIMIT) check_symbol_exists(isatty unistd.h HAVE_ISATTY) check_symbol_exists(futimens sys/stat.h HAVE_FUTIMENS) check_symbol_exists(futimes sys/time.h HAVE_FUTIMES) @@ -302,7 +328,7 @@ check_symbol_exists(futimes sys/time.h HAVE_FUTIMES) # Avoid sigaltstack on Apple platforms, where backtrace() cannot handle it # (rdar://7089625) and _Unwind_Backtrace is unusable because it cannot unwind # past the signal handler after an assertion failure (rdar://29866587). -if( HAVE_SIGNAL_H AND NOT LLVM_USE_SANITIZER MATCHES ".*Address.*" AND NOT APPLE ) +if( NOT LLVM_USE_SANITIZER MATCHES ".*Address.*" AND NOT APPLE ) check_symbol_exists(sigaltstack signal.h HAVE_SIGALTSTACK) endif() check_symbol_exists(mallctl malloc_np.h HAVE_MALLCTL) @@ -310,7 +336,6 @@ check_symbol_exists(mallinfo malloc.h HAVE_MALLINFO) check_symbol_exists(mallinfo2 malloc.h HAVE_MALLINFO2) check_symbol_exists(malloc_zone_statistics malloc/malloc.h HAVE_MALLOC_ZONE_STATISTICS) -check_symbol_exists(getrlimit "sys/types.h;sys/time.h;sys/resource.h" HAVE_GETRLIMIT) check_symbol_exists(posix_spawn spawn.h HAVE_POSIX_SPAWN) check_symbol_exists(pread unistd.h HAVE_PREAD) check_symbol_exists(sbrk unistd.h HAVE_SBRK) diff --git a/llvm/cmake/modules/CrossCompile.cmake b/llvm/cmake/modules/CrossCompile.cmake index c2cc84bec1521..3b31d3e218a37 100644 --- a/llvm/cmake/modules/CrossCompile.cmake +++ b/llvm/cmake/modules/CrossCompile.cmake @@ -12,6 +12,14 @@ function(llvm_create_cross_target project_name target_name toolchain buildtype) message(STATUS "Setting native build dir to " ${${project_name}_${target_name}_BUILD}) endif(NOT DEFINED ${project_name}_${target_name}_BUILD) + if(NOT DEFINED ${project_name}_${target_name}_STAMP) + set(${project_name}_${target_name}_STAMP + "${CMAKE_CURRENT_BINARY_DIR}/${target_name}-stamps") + set(${project_name}_${target_name}_STAMP + ${${project_name}_${target_name}_STAMP} PARENT_SCOPE) + 
message(STATUS "Setting native stamp dir to " ${${project_name}_${target_name}_STAMP}) + endif(NOT DEFINED ${project_name}_${target_name}_STAMP) + if (EXISTS ${LLVM_MAIN_SRC_DIR}/cmake/platforms/${toolchain}.cmake) set(CROSS_TOOLCHAIN_FLAGS_INIT -DCMAKE_TOOLCHAIN_FILE=\"${LLVM_MAIN_SRC_DIR}/cmake/platforms/${toolchain}.cmake\") @@ -130,13 +138,16 @@ function(build_native_tool target output_path_var) set_property(GLOBAL APPEND PROPERTY ${PROJECT_NAME}_HOST_TARGETS ${output_path}) endif() - llvm_ExternalProject_BuildCmd(build_cmd ${target} ${${PROJECT_NAME}_NATIVE_BUILD} + llvm_ExternalProject_BuildCmd(build_cmd ${target} + ${${PROJECT_NAME}_NATIVE_BUILD} + ${${PROJECT_NAME}_NATIVE_STAMP} CONFIGURATION Release) add_custom_command(OUTPUT "${output_path}" COMMAND ${build_cmd} DEPENDS CONFIGURE_${PROJECT_NAME}_NATIVE ${ARG_DEPENDS} ${host_targets} WORKING_DIRECTORY "${${PROJECT_NAME}_NATIVE_BUILD}" COMMENT "Building native ${target}..." - USES_TERMINAL) + USES_TERMINAL + VERBATIM) set(${output_path_var} "${output_path}" PARENT_SCOPE) endfunction() diff --git a/llvm/cmake/modules/FileLock.cmake b/llvm/cmake/modules/FileLock.cmake new file mode 100644 index 0000000000000..1e403a625847e --- /dev/null +++ b/llvm/cmake/modules/FileLock.cmake @@ -0,0 +1,9 @@ +# CMake script that synchronizes process execution on a given file lock. +# +# Input variables: +# LOCK_FILE_PATH - The file to be locked for the scope of the process of this cmake script. +# COMMAND - The command to be executed. 
+ +file(LOCK ${LOCK_FILE_PATH}) +string(REPLACE "@" ";" command_args ${COMMAND}) +execute_process(COMMAND ${command_args} COMMAND_ERROR_IS_FATAL ANY) diff --git a/llvm/cmake/modules/GetHostTriple.cmake b/llvm/cmake/modules/GetHostTriple.cmake index e58d5b1ef14d4..2a2f84ada098f 100644 --- a/llvm/cmake/modules/GetHostTriple.cmake +++ b/llvm/cmake/modules/GetHostTriple.cmake @@ -2,7 +2,7 @@ # Invokes config.guess function( get_host_triple var ) - if( MSVC OR (CMAKE_SYSTEM_NAME STREQUAL "Windows" AND CMAKE_C_COMPILER_ID MATCHES "Clang" AND NOT MINGW AND NOT MSYS)) + if( MSVC OR CMAKE_CXX_SIMULATE_ID STREQUAL "MSVC") if( CMAKE_C_COMPILER_ARCHITECTURE_ID MATCHES "ARM64.*" ) set( value "aarch64-pc-windows-msvc" ) elseif( CMAKE_C_COMPILER_ARCHITECTURE_ID MATCHES "ARM.*" ) diff --git a/llvm/cmake/modules/LLVMExternalProjectUtils.cmake b/llvm/cmake/modules/LLVMExternalProjectUtils.cmake index f7e1165bc4b94..55422c2a4c023 100644 --- a/llvm/cmake/modules/LLVMExternalProjectUtils.cmake +++ b/llvm/cmake/modules/LLVMExternalProjectUtils.cmake @@ -2,14 +2,18 @@ include(ExternalProject) # llvm_ExternalProject_BuildCmd(out_var target) # Utility function for constructing command lines for external project targets -function(llvm_ExternalProject_BuildCmd out_var target bin_dir) +function(llvm_ExternalProject_BuildCmd out_var target bin_dir stamp_dir) cmake_parse_arguments(ARG "" "CONFIGURATION" "" ${ARGN}) if(NOT ARG_CONFIGURATION) set(ARG_CONFIGURATION "$") endif() if (CMAKE_GENERATOR MATCHES "Make") # Use special command for Makefiles to support parallelism. 
- set(${out_var} "$(MAKE)" "-C" "${bin_dir}" "${target}" PARENT_SCOPE) + string(JOIN "@" make_cmd "$(MAKE)" "-C" "${bin_dir}" "${target}") + set(file_lock_script "${LLVM_CMAKE_DIR}/FileLock.cmake") + set(${out_var} ${CMAKE_COMMAND} "-DLOCK_FILE_PATH=${stamp_dir}/cmake.lock" + "-DCOMMAND=${make_cmd}" + "-P" "${file_lock_script}" PARENT_SCOPE) else() set(tool_args "${LLVM_EXTERNAL_PROJECT_BUILD_TOOL_ARGS}") if(NOT tool_args STREQUAL "") @@ -409,7 +413,7 @@ function(llvm_ExternalProject_Add name source_dir) set(force_deps DEPENDS ${TOOLCHAIN_BINS}) endif() - llvm_ExternalProject_BuildCmd(run_clean clean ${BINARY_DIR}) + llvm_ExternalProject_BuildCmd(run_clean clean ${BINARY_DIR} ${STAMP_DIR}) ExternalProject_Add_Step(${name} clean COMMAND ${run_clean} COMMENT "Cleaning ${name}..." @@ -449,7 +453,7 @@ function(llvm_ExternalProject_Add name source_dir) else() set(external_target "${target}") endif() - llvm_ExternalProject_BuildCmd(build_runtime_cmd ${external_target} ${BINARY_DIR}) + llvm_ExternalProject_BuildCmd(build_runtime_cmd ${external_target} ${BINARY_DIR} ${STAMP_DIR}) add_custom_target(${target} COMMAND ${build_runtime_cmd} DEPENDS ${name}-configure diff --git a/llvm/docs/ReleaseNotes.md b/llvm/docs/ReleaseNotes.md index ebfa453d92d37..8f88b824f965a 100644 --- a/llvm/docs/ReleaseNotes.md +++ b/llvm/docs/ReleaseNotes.md @@ -459,6 +459,25 @@ Changes to LLDB * [New Core File API](https://lldb.llvm.org/python_api/lldb.SBSaveCoreOptions.html). This gives greater control on the data captured into the core file, relative to the existing `process save-core` styles. +* When opening ELF core files, LLDB will print additional information about the + signal that killed the process and the disassembly view will display actual + (relocated) targets of the jump instructions instead of raw offsets encoded in + the instruction. This matches existing behavior for live processes. 
+ + Old: + ``` + * thread #1: tid = 329384, 0x0000000000401262, name = 'a.out', stop reason = signal SIGSEGV + + 0x7f1e3193e0a7 <+23>: ja 0xfe100 ; <+112> + ``` + + New: + ``` + * thread #1: tid = 329384, 0x0000000000401262, name = 'a.out', stop reason = SIGSEGV: address not mapped to object (fault address: 0x0) + + 0x7f1e3193e0a7 <+23>: ja 0x7f1e3193e100 ; <+112> + ``` + * `lldb-server` now listens to a single port for gdbserver connections and provides that port to the connection handler processes. This means that only 2 ports need to be opened in the firewall (one for the `lldb-server` platform, one for gdbserver connections). diff --git a/llvm/include/llvm-c/DebugInfo.h b/llvm/include/llvm-c/DebugInfo.h index 07f87d44088e7..ac7ee5a7cc9a1 100644 --- a/llvm/include/llvm-c/DebugInfo.h +++ b/llvm/include/llvm-c/DebugInfo.h @@ -870,13 +870,16 @@ LLVMDIBuilderCreateObjCProperty(LLVMDIBuilderRef Builder, LLVMMetadataRef Ty); /** - * Create a uniqued DIType* clone with FlagObjectPointer and FlagArtificial set. + * Create a uniqued DIType* clone with FlagObjectPointer. If \c Implicit + * is true, then also set FlagArtificial. * \param Builder The DIBuilder. * \param Type The underlying type to which this pointer points. + * \param Implicit Indicates whether this pointer was implicitly generated + * (i.e., not spelled out in source). */ -LLVMMetadataRef -LLVMDIBuilderCreateObjectPointerType(LLVMDIBuilderRef Builder, - LLVMMetadataRef Type); +LLVMMetadataRef LLVMDIBuilderCreateObjectPointerType(LLVMDIBuilderRef Builder, + LLVMMetadataRef Type, + LLVMBool Implicit); /** * Create debugging information entry for a qualified diff --git a/llvm/include/llvm/ADT/APInt.h b/llvm/include/llvm/ADT/APInt.h index 225390f1af60b..02d58d8c3d31c 100644 --- a/llvm/include/llvm/ADT/APInt.h +++ b/llvm/include/llvm/ADT/APInt.h @@ -2263,6 +2263,10 @@ APInt mulhs(const APInt &C1, const APInt &C2); /// Returns the high N bits of the multiplication result. 
APInt mulhu(const APInt &C1, const APInt &C2); +/// Compute X^N for N>=0. +/// 0^0 is supported and returns 1. +APInt pow(const APInt &X, int64_t N); + /// Compute GCD of two unsigned APInt values. /// /// This function returns the greatest common divisor of the two APInt values diff --git a/llvm/include/llvm/Analysis/BlockFrequencyInfoImpl.h b/llvm/include/llvm/Analysis/BlockFrequencyInfoImpl.h index 4aa922635c374..7a7a9594f4760 100644 --- a/llvm/include/llvm/Analysis/BlockFrequencyInfoImpl.h +++ b/llvm/include/llvm/Analysis/BlockFrequencyInfoImpl.h @@ -1409,11 +1409,10 @@ template void BlockFrequencyInfoImpl::applyIterativeInference() { auto Node = getNode(&BB); if (!Node.isValid()) continue; - if (BlockIndex.count(&BB)) { - Freqs[Node.Index].Scaled = Freq[BlockIndex[&BB]]; - } else { + if (auto It = BlockIndex.find(&BB); It != BlockIndex.end()) + Freqs[Node.Index].Scaled = Freq[It->second]; + else Freqs[Node.Index].Scaled = Scaled64::getZero(); - } } } @@ -1764,8 +1763,8 @@ void BlockFrequencyInfoImpl::verifyMatch( for (auto &Entry : ValidNodes) { const BlockT *BB = Entry.first; BlockNode Node = Entry.second; - if (OtherValidNodes.count(BB)) { - BlockNode OtherNode = OtherValidNodes[BB]; + if (auto It = OtherValidNodes.find(BB); It != OtherValidNodes.end()) { + BlockNode OtherNode = It->second; const auto &Freq = Freqs[Node.Index]; const auto &OtherFreq = Other.Freqs[OtherNode.Index]; if (Freq.Integer != OtherFreq.Integer) { diff --git a/llvm/include/llvm/Analysis/CmpInstAnalysis.h b/llvm/include/llvm/Analysis/CmpInstAnalysis.h index c7862a6d39d07..aeda58ac7535d 100644 --- a/llvm/include/llvm/Analysis/CmpInstAnalysis.h +++ b/llvm/include/llvm/Analysis/CmpInstAnalysis.h @@ -108,6 +108,12 @@ namespace llvm { bool LookThroughTrunc = true, bool AllowNonZeroC = false); + /// Decompose an icmp into the form ((X & Mask) pred C) if + /// possible. Unless \p AllowNonZeroC is true, C will always be 0. 
+ std::optional + decomposeBitTest(Value *Cond, bool LookThroughTrunc = true, + bool AllowNonZeroC = false); + } // end namespace llvm #endif diff --git a/llvm/include/llvm/Analysis/CtxProfAnalysis.h b/llvm/include/llvm/Analysis/CtxProfAnalysis.h index be040d5eca5f3..ea292250c63a9 100644 --- a/llvm/include/llvm/Analysis/CtxProfAnalysis.h +++ b/llvm/include/llvm/Analysis/CtxProfAnalysis.h @@ -140,7 +140,7 @@ class CtxProfAnalysis : public AnalysisInfoMixin { class CtxProfAnalysisPrinterPass : public PassInfoMixin { public: - enum class PrintMode { Everything, JSON }; + enum class PrintMode { Everything, YAML }; explicit CtxProfAnalysisPrinterPass(raw_ostream &OS); PreservedAnalyses run(Module &M, ModuleAnalysisManager &MAM); diff --git a/llvm/include/llvm/Analysis/TargetTransformInfo.h b/llvm/include/llvm/Analysis/TargetTransformInfo.h index fe13fc676e303..71b204f9c3fec 100644 --- a/llvm/include/llvm/Analysis/TargetTransformInfo.h +++ b/llvm/include/llvm/Analysis/TargetTransformInfo.h @@ -1870,6 +1870,13 @@ class TargetTransformInfo { /// false, but it shouldn't matter what it returns anyway. bool hasArmWideBranch(bool Thumb) const; + /// Returns a bitmask constructed from the target-features or fmv-features + /// metadata of a function. + uint64_t getFeatureMask(const Function &F) const; + + /// Returns true if this is an instance of a function with multiple versions. + bool isMultiversionedFunction(const Function &F) const; + /// \return The maximum number of function arguments the target supports. 
unsigned getMaxNumArgs() const; @@ -2312,6 +2319,8 @@ class TargetTransformInfo::Concept { virtual VPLegalization getVPLegalizationStrategy(const VPIntrinsic &PI) const = 0; virtual bool hasArmWideBranch(bool Thumb) const = 0; + virtual uint64_t getFeatureMask(const Function &F) const = 0; + virtual bool isMultiversionedFunction(const Function &F) const = 0; virtual unsigned getMaxNumArgs() const = 0; virtual unsigned getNumBytesToPadGlobalArray(unsigned Size, Type *ArrayType) const = 0; @@ -3144,6 +3153,14 @@ class TargetTransformInfo::Model final : public TargetTransformInfo::Concept { return Impl.hasArmWideBranch(Thumb); } + uint64_t getFeatureMask(const Function &F) const override { + return Impl.getFeatureMask(F); + } + + bool isMultiversionedFunction(const Function &F) const override { + return Impl.isMultiversionedFunction(F); + } + unsigned getMaxNumArgs() const override { return Impl.getMaxNumArgs(); } diff --git a/llvm/include/llvm/Analysis/TargetTransformInfoImpl.h b/llvm/include/llvm/Analysis/TargetTransformInfoImpl.h index 7ac3063ca9a37..dcef4a1abcfa3 100644 --- a/llvm/include/llvm/Analysis/TargetTransformInfoImpl.h +++ b/llvm/include/llvm/Analysis/TargetTransformInfoImpl.h @@ -1039,6 +1039,10 @@ class TargetTransformInfoImplBase { bool hasArmWideBranch(bool) const { return false; } + uint64_t getFeatureMask(const Function &F) const { return 0; } + + bool isMultiversionedFunction(const Function &F) const { return false; } + unsigned getMaxNumArgs() const { return UINT_MAX; } unsigned getNumBytesToPadGlobalArray(unsigned Size, Type *ArrayType) const { diff --git a/llvm/include/llvm/BinaryFormat/Wasm.h b/llvm/include/llvm/BinaryFormat/Wasm.h index 759e432125091..ede2d692a5949 100644 --- a/llvm/include/llvm/BinaryFormat/Wasm.h +++ b/llvm/include/llvm/BinaryFormat/Wasm.h @@ -170,7 +170,7 @@ enum : unsigned { WASM_ELEM_SEGMENT_HAS_TABLE_NUMBER = 0x02, // if passive == 0 WASM_ELEM_SEGMENT_HAS_INIT_EXPRS = 0x04, }; -const unsigned 
WASM_ELEM_SEGMENT_MASK_HAS_ELEM_KIND = 0x3; +const unsigned WASM_ELEM_SEGMENT_MASK_HAS_ELEM_DESC = 0x3; // Feature policy prefixes used in the custom "target_features" section enum : uint8_t { @@ -415,6 +415,10 @@ struct WasmDataSegment { uint32_t Comdat; // from the "comdat info" section }; +// 3 different element segment modes are encodable. This class is currently +// only used during decoding (see WasmElemSegment below). +enum class ElemSegmentMode { Active, Passive, Declarative }; + // Represents a Wasm element segment, with some limitations compared the spec: // 1) Does not model passive or declarative segments (Segment will end up with // an Offset field of i32.const 0) diff --git a/llvm/include/llvm/CodeGen/AsmPrinter.h b/llvm/include/llvm/CodeGen/AsmPrinter.h index bf491096e3c47..5291369b3b9f1 100644 --- a/llvm/include/llvm/CodeGen/AsmPrinter.h +++ b/llvm/include/llvm/CodeGen/AsmPrinter.h @@ -44,6 +44,7 @@ class DebugHandlerBase; class DIE; class DIEAbbrev; class DwarfDebug; +class EHStreamer; class GCMetadataPrinter; class GCStrategy; class GlobalAlias; @@ -187,15 +188,17 @@ class AsmPrinter : public MachineFunctionPass { /// For dso_local functions, the current $local alias for the function. MCSymbol *CurrentFnBeginLocal = nullptr; - /// A vector of all debug/EH info emitters we should use. This vector - /// maintains ownership of the emitters. + /// A handle to the EH info emitter (if present). + // Only for EHStreamer subtypes, but some C++ compilers will incorrectly warn + // us if we declare that directly. + SmallVector, 1> EHHandlers; + + // A vector of all Debuginfo emitters we should use. Protected so that + // targets can add their own. This vector maintains ownership of the + // emitters. SmallVector, 2> Handlers; size_t NumUserHandlers = 0; - /// Debuginfo handler. Protected so that targets can add their own. 
- SmallVector, 1> DebugHandlers; - size_t NumUserDebugHandlers = 0; - StackMaps SM; private: @@ -527,8 +530,6 @@ class AsmPrinter : public MachineFunctionPass { void addAsmPrinterHandler(std::unique_ptr Handler); - void addDebugHandler(std::unique_ptr Handler); - // Targets can, or in the case of EmitInstruction, must implement these to // customize output. diff --git a/llvm/include/llvm/CodeGen/AsmPrinterHandler.h b/llvm/include/llvm/CodeGen/AsmPrinterHandler.h index ed73e618431de..bf3f6c53027a7 100644 --- a/llvm/include/llvm/CodeGen/AsmPrinterHandler.h +++ b/llvm/include/llvm/CodeGen/AsmPrinterHandler.h @@ -64,6 +64,18 @@ class AsmPrinterHandler { /// immediately prior to markFunctionEnd. virtual void endBasicBlockSection(const MachineBasicBlock &MBB) {} + /// For symbols that have a size designated (e.g. common symbols), + /// this tracks that size. + virtual void setSymbolSize(const MCSymbol *Sym, uint64_t Size) {} + + /// Process beginning of an instruction. + virtual void beginInstruction(const MachineInstr *MI) {} + + /// Process end of an instruction. + virtual void endInstruction() {} + + virtual void beginCodeAlignment(const MachineBasicBlock &MBB) {} + /// Emit target-specific EH funclet machinery. virtual void beginFunclet(const MachineBasicBlock &MBB, MCSymbol *Sym = nullptr) {} diff --git a/llvm/include/llvm/CodeGen/DebugHandlerBase.h b/llvm/include/llvm/CodeGen/DebugHandlerBase.h index d39e7e68cb255..f669bd311ff56 100644 --- a/llvm/include/llvm/CodeGen/DebugHandlerBase.h +++ b/llvm/include/llvm/CodeGen/DebugHandlerBase.h @@ -50,14 +50,10 @@ struct DbgVariableLocation { /// Base class for debug information backends. Common functionality related to /// tracking which variables and scopes are alive at a given PC live here. -class DebugHandlerBase { +class DebugHandlerBase : public AsmPrinterHandler { protected: DebugHandlerBase(AsmPrinter *A); -public: - virtual ~DebugHandlerBase(); - -protected: /// Target of debug info emission. 
AsmPrinter *Asm = nullptr; @@ -120,24 +116,20 @@ class DebugHandlerBase { private: InstructionOrdering InstOrdering; + // AsmPrinterHandler overrides. public: - /// For symbols that have a size designated (e.g. common symbols), - /// this tracks that size. Only used by DWARF. - virtual void setSymbolSize(const MCSymbol *Sym, uint64_t Size) {} - - virtual void beginModule(Module *M); - virtual void endModule() = 0; + virtual ~DebugHandlerBase() override; - virtual void beginInstruction(const MachineInstr *MI); - virtual void endInstruction(); + void beginModule(Module *M) override; - void beginFunction(const MachineFunction *MF); - void endFunction(const MachineFunction *MF); + void beginInstruction(const MachineInstr *MI) override; + void endInstruction() override; - void beginBasicBlockSection(const MachineBasicBlock &MBB); - void endBasicBlockSection(const MachineBasicBlock &MBB); + void beginFunction(const MachineFunction *MF) override; + void endFunction(const MachineFunction *MF) override; - virtual void beginCodeAlignment(const MachineBasicBlock &MBB) {} + void beginBasicBlockSection(const MachineBasicBlock &MBB) override; + void endBasicBlockSection(const MachineBasicBlock &MBB) override; /// Return Label preceding the instruction. 
MCSymbol *getLabelBeforeInsn(const MachineInstr *MI); diff --git a/llvm/include/llvm/CodeGen/SDPatternMatch.h b/llvm/include/llvm/CodeGen/SDPatternMatch.h index 4faa090901a6a..4488a6152117c 100644 --- a/llvm/include/llvm/CodeGen/SDPatternMatch.h +++ b/llvm/include/llvm/CodeGen/SDPatternMatch.h @@ -896,6 +896,10 @@ inline UnaryOpc_match m_ChainedUnaryOp(unsigned Opc, return UnaryOpc_match(Opc, Op); } +template inline UnaryOpc_match m_BitCast(const Opnd &Op) { + return UnaryOpc_match(ISD::BITCAST, Op); +} + template inline UnaryOpc_match m_BSwap(const Opnd &Op) { return UnaryOpc_match(ISD::BSWAP, Op); diff --git a/llvm/include/llvm/Config/config.h.cmake b/llvm/include/llvm/Config/config.h.cmake index 3e6b94dfbe545..1d2d00a3b758b 100644 --- a/llvm/include/llvm/Config/config.h.cmake +++ b/llvm/include/llvm/Config/config.h.cmake @@ -72,15 +72,6 @@ /* Define if __unw_add_dynamic_fde() is available on this platform. */ #cmakedefine HAVE_UNW_ADD_DYNAMIC_FDE ${HAVE_UNW_ADD_DYNAMIC_FDE} -/* Define to 1 if you have the header file. */ -#cmakedefine HAVE_ERRNO_H ${HAVE_ERRNO_H} - -/* Define to 1 if you have the header file. */ -#cmakedefine HAVE_FCNTL_H ${HAVE_FCNTL_H} - -/* Define to 1 if you have the header file. */ -#cmakedefine HAVE_FENV_H ${HAVE_FENV_H} - /* Define if libffi is available on this platform. */ #cmakedefine HAVE_FFI_CALL ${HAVE_FFI_CALL} @@ -99,9 +90,6 @@ /* Define to 1 if you have the `getpagesize' function. */ #cmakedefine HAVE_GETPAGESIZE ${HAVE_GETPAGESIZE} -/* Define to 1 if you have the `getrlimit' function. */ -#cmakedefine HAVE_GETRLIMIT ${HAVE_GETRLIMIT} - /* Define to 1 if you have the `getrusage' function. */ #cmakedefine HAVE_GETRUSAGE ${HAVE_GETRUSAGE} @@ -174,45 +162,24 @@ /* Define to 1 if you have the `setenv' function. */ #cmakedefine HAVE_SETENV ${HAVE_SETENV} -/* Define to 1 if you have the `setrlimit' function. */ -#cmakedefine HAVE_SETRLIMIT ${HAVE_SETRLIMIT} - /* Define to 1 if you have the `sigaltstack' function. 
*/ #cmakedefine HAVE_SIGALTSTACK ${HAVE_SIGALTSTACK} -/* Define to 1 if you have the header file. */ -#cmakedefine HAVE_SIGNAL_H ${HAVE_SIGNAL_H} - /* Define to 1 if you have the `strerror_r' function. */ #cmakedefine HAVE_STRERROR_R ${HAVE_STRERROR_R} /* Define to 1 if you have the `sysconf' function. */ #cmakedefine HAVE_SYSCONF ${HAVE_SYSCONF} -/* Define to 1 if you have the header file. */ -#cmakedefine HAVE_SYS_IOCTL_H ${HAVE_SYS_IOCTL_H} - /* Define to 1 if you have the header file. */ #cmakedefine HAVE_SYS_MMAN_H ${HAVE_SYS_MMAN_H} -/* Define to 1 if you have the header file. */ -#cmakedefine HAVE_SYS_RESOURCE_H ${HAVE_SYS_RESOURCE_H} - -/* Define to 1 if you have the header file. */ -#cmakedefine HAVE_SYS_STAT_H ${HAVE_SYS_STAT_H} - -/* Define to 1 if you have the header file. */ -#cmakedefine HAVE_SYS_TIME_H ${HAVE_SYS_TIME_H} - /* Define to 1 if stat struct has st_mtimespec member .*/ #cmakedefine HAVE_STRUCT_STAT_ST_MTIMESPEC_TV_NSEC ${HAVE_STRUCT_STAT_ST_MTIMESPEC_TV_NSEC} /* Define to 1 if stat struct has st_mtim member. */ #cmakedefine HAVE_STRUCT_STAT_ST_MTIM_TV_NSEC ${HAVE_STRUCT_STAT_ST_MTIM_TV_NSEC} -/* Define to 1 if you have the header file. */ -#cmakedefine HAVE_TERMIOS_H ${HAVE_TERMIOS_H} - /* Define to 1 if you have the header file. */ #cmakedefine HAVE_UNISTD_H ${HAVE_UNISTD_H} diff --git a/llvm/include/llvm/ExecutionEngine/JITLink/JITLink.h b/llvm/include/llvm/ExecutionEngine/JITLink/JITLink.h index 67bcb00787312..bd82fadea1027 100644 --- a/llvm/include/llvm/ExecutionEngine/JITLink/JITLink.h +++ b/llvm/include/llvm/ExecutionEngine/JITLink/JITLink.h @@ -336,6 +336,18 @@ class Block : public Addressable { return make_range(Edges.begin(), Edges.end()); } + /// Returns an iterator over all edges at the given offset within the block. + auto edges_at(Edge::OffsetT O) { + return make_filter_range(edges(), + [O](const Edge &E) { return E.getOffset() == O; }); + } + + /// Returns an iterator over all edges at the given offset within the block. 
+ auto edges_at(Edge::OffsetT O) const { + return make_filter_range(edges(), + [O](const Edge &E) { return E.getOffset() == O; }); + } + /// Return the size of the edges list. size_t edges_size() const { return Edges.size(); } diff --git a/llvm/include/llvm/ExecutionEngine/JITLink/x86_64.h b/llvm/include/llvm/ExecutionEngine/JITLink/x86_64.h index 356b8cd70aec5..e10242bb2d42c 100644 --- a/llvm/include/llvm/ExecutionEngine/JITLink/x86_64.h +++ b/llvm/include/llvm/ExecutionEngine/JITLink/x86_64.h @@ -671,6 +671,11 @@ class GOTTableManager : public TableManager { public: static StringRef getSectionName() { return "$__GOT"; } + GOTTableManager(LinkGraph &G) { + if ((GOTSection = G.findSectionByName(getSectionName()))) + registerExistingEntries(); + } + bool visitEdge(LinkGraph &G, Block *B, Edge &E) { Edge::Kind KindToSet = Edge::Invalid; switch (E.getKind()) { @@ -721,16 +726,21 @@ class GOTTableManager : public TableManager { return *GOTSection; } + void registerExistingEntries(); + Section *GOTSection = nullptr; }; /// Procedure Linkage Table Builder. 
class PLTTableManager : public TableManager { public: - PLTTableManager(GOTTableManager &GOT) : GOT(GOT) {} - static StringRef getSectionName() { return "$__STUBS"; } + PLTTableManager(LinkGraph &G, GOTTableManager &GOT) : GOT(GOT) { + if ((StubsSection = G.findSectionByName(getSectionName()))) + registerExistingEntries(); + } + bool visitEdge(LinkGraph &G, Block *B, Edge &E) { if (E.getKind() == x86_64::BranchPCRel32 && !E.getTarget().isDefined()) { DEBUG_WITH_TYPE("jitlink", { @@ -754,14 +764,16 @@ class PLTTableManager : public TableManager { public: Section &getStubsSection(LinkGraph &G) { - if (!PLTSection) - PLTSection = &G.createSection(getSectionName(), - orc::MemProt::Read | orc::MemProt::Exec); - return *PLTSection; + if (!StubsSection) + StubsSection = &G.createSection(getSectionName(), + orc::MemProt::Read | orc::MemProt::Exec); + return *StubsSection; } + void registerExistingEntries(); + GOTTableManager &GOT; - Section *PLTSection = nullptr; + Section *StubsSection = nullptr; }; /// Optimize the GOT and Stub relocations if the edge target address is in range diff --git a/llvm/include/llvm/FileCheck/FileCheck.h b/llvm/include/llvm/FileCheck/FileCheck.h index 321ce1d26e163..72d0b91b27ad0 100644 --- a/llvm/include/llvm/FileCheck/FileCheck.h +++ b/llvm/include/llvm/FileCheck/FileCheck.h @@ -180,8 +180,7 @@ struct FileCheckString; class FileCheck { FileCheckRequest Req; std::unique_ptr PatternContext; - // C++17 TODO: make this a plain std::vector. 
- std::unique_ptr> CheckStrings; + std::vector CheckStrings; public: explicit FileCheck(FileCheckRequest Req); diff --git a/llvm/include/llvm/Frontend/OpenMP/OMPIRBuilder.h b/llvm/include/llvm/Frontend/OpenMP/OMPIRBuilder.h index 6b6e5bc19d95a..9802cbe8b7b94 100644 --- a/llvm/include/llvm/Frontend/OpenMP/OMPIRBuilder.h +++ b/llvm/include/llvm/Frontend/OpenMP/OMPIRBuilder.h @@ -1264,12 +1264,14 @@ class OpenMPIRBuilder { /// \param EventHandle If present, signifies the event handle as part of /// the detach clause /// \param Mergeable If the given task is `mergeable` + /// \param Priority `priority-value` specifies the execution order of the + /// tasks that are generated by the construct InsertPointOrErrorTy createTask(const LocationDescription &Loc, InsertPointTy AllocaIP, BodyGenCallbackTy BodyGenCB, bool Tied = true, Value *Final = nullptr, Value *IfCondition = nullptr, SmallVector Dependencies = {}, bool Mergeable = false, - Value *EventHandle = nullptr); + Value *EventHandle = nullptr, Value *Priority = nullptr); /// Generator for the taskgroup construct /// diff --git a/llvm/include/llvm/IR/DIBuilder.h b/llvm/include/llvm/IR/DIBuilder.h index cb1150c269a1d..6c479415b9ed2 100644 --- a/llvm/include/llvm/IR/DIBuilder.h +++ b/llvm/include/llvm/IR/DIBuilder.h @@ -662,9 +662,9 @@ namespace llvm { /// Create a uniqued clone of \p Ty with FlagArtificial set. static DIType *createArtificialType(DIType *Ty); - /// Create a uniqued clone of \p Ty with FlagObjectPointer and - /// FlagArtificial set. - static DIType *createObjectPointerType(DIType *Ty); + /// Create a uniqued clone of \p Ty with FlagObjectPointer set. + /// If \p Implicit is true, also set FlagArtificial. + static DIType *createObjectPointerType(DIType *Ty, bool Implicit); /// Create a permanent forward-declared type. 
DICompositeType *createForwardDecl(unsigned Tag, StringRef Name, diff --git a/llvm/include/llvm/IR/InstrTypes.h b/llvm/include/llvm/IR/InstrTypes.h index b8d9cc10292f4..47ddc7555594c 100644 --- a/llvm/include/llvm/IR/InstrTypes.h +++ b/llvm/include/llvm/IR/InstrTypes.h @@ -1719,6 +1719,11 @@ class CallBase : public Instruction { // FIXME: Once this API is no longer duplicated in `CallSite`, rename this to // better indicate that this may return a conservative answer. bool onlyReadsMemory(unsigned OpNo) const { + // If the argument is passed byval, the callee does not have access to the + // original pointer and thus cannot write to it. + if (OpNo < arg_size() && isByValArgument(OpNo)) + return true; + return dataOperandHasImpliedAttr(OpNo, Attribute::ReadOnly) || dataOperandHasImpliedAttr(OpNo, Attribute::ReadNone); } diff --git a/llvm/include/llvm/IR/IntrinsicsDirectX.td b/llvm/include/llvm/IR/IntrinsicsDirectX.td index 2a56ba78ce88e..f21948697c8a6 100644 --- a/llvm/include/llvm/IR/IntrinsicsDirectX.td +++ b/llvm/include/llvm/IR/IntrinsicsDirectX.td @@ -105,6 +105,8 @@ def int_dx_wave_active_countbits : DefaultAttrsIntrinsic<[llvm_i32_ty], [llvm_i1 def int_dx_wave_all : DefaultAttrsIntrinsic<[llvm_i1_ty], [llvm_i1_ty], [IntrConvergent, IntrNoMem]>; def int_dx_wave_any : DefaultAttrsIntrinsic<[llvm_i1_ty], [llvm_i1_ty], [IntrConvergent, IntrNoMem]>; def int_dx_wave_getlaneindex : DefaultAttrsIntrinsic<[llvm_i32_ty], [], [IntrConvergent, IntrNoMem]>; +def int_dx_wave_reduce_sum : DefaultAttrsIntrinsic<[llvm_any_ty], [LLVMMatchType<0>], [IntrConvergent, IntrNoMem]>; +def int_dx_wave_reduce_usum : DefaultAttrsIntrinsic<[llvm_anyint_ty], [LLVMMatchType<0>], [IntrConvergent, IntrNoMem]>; def int_dx_wave_is_first_lane : DefaultAttrsIntrinsic<[llvm_i1_ty], [], [IntrConvergent]>; def int_dx_wave_readlane : DefaultAttrsIntrinsic<[llvm_any_ty], [LLVMMatchType<0>, llvm_i32_ty], [IntrConvergent, IntrNoMem]>; def int_dx_sign : 
DefaultAttrsIntrinsic<[LLVMScalarOrSameVectorWidth<0, llvm_i32_ty>], [llvm_any_ty], [IntrNoMem]>; @@ -115,6 +117,7 @@ def int_dx_radians : DefaultAttrsIntrinsic<[llvm_anyfloat_ty], [LLVMMatchType<0> def int_dx_discard : DefaultAttrsIntrinsic<[], [llvm_i1_ty], []>; def int_dx_firstbituhigh : DefaultAttrsIntrinsic<[LLVMScalarOrSameVectorWidth<0, llvm_i32_ty>], [llvm_anyint_ty], [IntrNoMem]>; def int_dx_firstbitshigh : DefaultAttrsIntrinsic<[LLVMScalarOrSameVectorWidth<0, llvm_i32_ty>], [llvm_anyint_ty], [IntrNoMem]>; +def int_dx_firstbitlow : DefaultAttrsIntrinsic<[LLVMScalarOrSameVectorWidth<0, llvm_i32_ty>], [llvm_anyint_ty], [IntrNoMem]>; def int_dx_group_memory_barrier_with_group_sync : DefaultAttrsIntrinsic<[], [], []>; } diff --git a/llvm/include/llvm/IR/IntrinsicsSPIRV.td b/llvm/include/llvm/IR/IntrinsicsSPIRV.td index 37057271b6c28..be337dbccaf8a 100644 --- a/llvm/include/llvm/IR/IntrinsicsSPIRV.td +++ b/llvm/include/llvm/IR/IntrinsicsSPIRV.td @@ -91,6 +91,7 @@ let TargetPrefix = "spv" in { def int_spv_wave_active_countbits : DefaultAttrsIntrinsic<[llvm_i32_ty], [llvm_i1_ty], [IntrConvergent, IntrNoMem]>; def int_spv_wave_all : DefaultAttrsIntrinsic<[llvm_i1_ty], [llvm_i1_ty], [IntrConvergent, IntrNoMem]>; def int_spv_wave_any : DefaultAttrsIntrinsic<[llvm_i1_ty], [llvm_i1_ty], [IntrConvergent, IntrNoMem]>; + def int_spv_wave_reduce_sum : DefaultAttrsIntrinsic<[llvm_any_ty], [LLVMMatchType<0>], [IntrConvergent, IntrNoMem]>; def int_spv_wave_is_first_lane : DefaultAttrsIntrinsic<[llvm_i1_ty], [], [IntrConvergent]>; def int_spv_wave_readlane : DefaultAttrsIntrinsic<[llvm_any_ty], [LLVMMatchType<0>, llvm_i32_ty], [IntrConvergent, IntrNoMem]>; def int_spv_sign : DefaultAttrsIntrinsic<[LLVMScalarOrSameVectorWidth<0, llvm_i32_ty>], [llvm_any_ty], [IntrNoMem]>; @@ -113,6 +114,7 @@ let TargetPrefix = "spv" in { def int_spv_firstbituhigh : DefaultAttrsIntrinsic<[LLVMScalarOrSameVectorWidth<0, llvm_i32_ty>], [llvm_anyint_ty], [IntrNoMem]>; def int_spv_firstbitshigh : 
DefaultAttrsIntrinsic<[LLVMScalarOrSameVectorWidth<0, llvm_i32_ty>], [llvm_anyint_ty], [IntrNoMem]>; + def int_spv_firstbitlow : DefaultAttrsIntrinsic<[LLVMScalarOrSameVectorWidth<0, llvm_i32_ty>], [llvm_anyint_ty], [IntrNoMem]>; def int_spv_resource_updatecounter : DefaultAttrsIntrinsic<[llvm_i32_ty], [llvm_any_ty, llvm_i8_ty], diff --git a/llvm/include/llvm/IR/IntrinsicsX86.td b/llvm/include/llvm/IR/IntrinsicsX86.td index 42b211e0e1f75..fb12949e10c7e 100644 --- a/llvm/include/llvm/IR/IntrinsicsX86.td +++ b/llvm/include/llvm/IR/IntrinsicsX86.td @@ -5999,10 +5999,10 @@ let TargetPrefix = "x86" in { def int_x86_tcvtrowd2ps : ClangBuiltin<"__builtin_ia32_tcvtrowd2ps">, Intrinsic<[llvm_v16f32_ty], [llvm_i8_ty, llvm_i32_ty], [ImmArg>]>; - def int_x86_tcvtrowps2pbf16h : ClangBuiltin<"__builtin_ia32_tcvtrowps2pbf16h">, + def int_x86_tcvtrowps2bf16h : ClangBuiltin<"__builtin_ia32_tcvtrowps2bf16h">, Intrinsic<[llvm_v32bf16_ty], [llvm_i8_ty, llvm_i32_ty], [ImmArg>]>; - def int_x86_tcvtrowps2pbf16l : ClangBuiltin<"__builtin_ia32_tcvtrowps2pbf16l">, + def int_x86_tcvtrowps2bf16l : ClangBuiltin<"__builtin_ia32_tcvtrowps2bf16l">, Intrinsic<[llvm_v32bf16_ty], [llvm_i8_ty, llvm_i32_ty], [ImmArg>]>; def int_x86_tcvtrowps2phh : ClangBuiltin<"__builtin_ia32_tcvtrowps2phh">, @@ -6181,13 +6181,13 @@ let TargetPrefix = "x86" in { Intrinsic<[llvm_v16f32_ty], [llvm_i16_ty, llvm_i16_ty, llvm_x86amx_ty, llvm_i32_ty], []>; - def int_x86_tcvtrowps2pbf16h_internal : - ClangBuiltin<"__builtin_ia32_tcvtrowps2pbf16h_internal">, + def int_x86_tcvtrowps2bf16h_internal : + ClangBuiltin<"__builtin_ia32_tcvtrowps2bf16h_internal">, Intrinsic<[llvm_v32bf16_ty], [llvm_i16_ty, llvm_i16_ty, llvm_x86amx_ty, llvm_i32_ty], []>; - def int_x86_tcvtrowps2pbf16l_internal : - ClangBuiltin<"__builtin_ia32_tcvtrowps2pbf16l_internal">, + def int_x86_tcvtrowps2bf16l_internal : + ClangBuiltin<"__builtin_ia32_tcvtrowps2bf16l_internal">, Intrinsic<[llvm_v32bf16_ty], [llvm_i16_ty, llvm_i16_ty, llvm_x86amx_ty, 
llvm_i32_ty], []>; @@ -7893,4 +7893,4 @@ def int_x86_movrsdi : ClangBuiltin<"__builtin_ia32_movrsdi">, [IntrReadMem]>; def int_x86_prefetchrs : ClangBuiltin<"__builtin_ia32_prefetchrs">, Intrinsic<[], [llvm_ptr_ty], []>; -} \ No newline at end of file +} diff --git a/llvm/include/llvm/IR/Metadata.h b/llvm/include/llvm/IR/Metadata.h index df2384c5f6e69..ec7d030a20de8 100644 --- a/llvm/include/llvm/IR/Metadata.h +++ b/llvm/include/llvm/IR/Metadata.h @@ -138,6 +138,10 @@ class Metadata { void printAsOperand(raw_ostream &OS, ModuleSlotTracker &MST, const Module *M = nullptr) const; /// @} + + /// Metadata IDs that may generate poison. + constexpr static const unsigned PoisonGeneratingIDs[] = { + LLVMContext::MD_range, LLVMContext::MD_nonnull, LLVMContext::MD_align}; }; // Create wrappers for C Binding types (see CBindingWrapping.h). diff --git a/llvm/include/llvm/IR/NVVMIntrinsicUtils.h b/llvm/include/llvm/IR/NVVMIntrinsicUtils.h index 8ca073ba82253..ce794e2573637 100644 --- a/llvm/include/llvm/IR/NVVMIntrinsicUtils.h +++ b/llvm/include/llvm/IR/NVVMIntrinsicUtils.h @@ -38,9 +38,8 @@ enum class TMAReductionOp : uint8_t { XOR = 7, }; -inline bool IntrinsicShouldFTZ(Intrinsic::ID IntrinsicID) { +inline bool FPToIntegerIntrinsicShouldFTZ(Intrinsic::ID IntrinsicID) { switch (IntrinsicID) { - // Float to i32 / i64 conversion intrinsics: case Intrinsic::nvvm_f2i_rm_ftz: case Intrinsic::nvvm_f2i_rn_ftz: case Intrinsic::nvvm_f2i_rp_ftz: @@ -61,11 +60,53 @@ inline bool IntrinsicShouldFTZ(Intrinsic::ID IntrinsicID) { case Intrinsic::nvvm_f2ull_rp_ftz: case Intrinsic::nvvm_f2ull_rz_ftz: return true; + + case Intrinsic::nvvm_f2i_rm: + case Intrinsic::nvvm_f2i_rn: + case Intrinsic::nvvm_f2i_rp: + case Intrinsic::nvvm_f2i_rz: + + case Intrinsic::nvvm_f2ui_rm: + case Intrinsic::nvvm_f2ui_rn: + case Intrinsic::nvvm_f2ui_rp: + case Intrinsic::nvvm_f2ui_rz: + + case Intrinsic::nvvm_d2i_rm: + case Intrinsic::nvvm_d2i_rn: + case Intrinsic::nvvm_d2i_rp: + case Intrinsic::nvvm_d2i_rz: + + 
case Intrinsic::nvvm_d2ui_rm: + case Intrinsic::nvvm_d2ui_rn: + case Intrinsic::nvvm_d2ui_rp: + case Intrinsic::nvvm_d2ui_rz: + + case Intrinsic::nvvm_f2ll_rm: + case Intrinsic::nvvm_f2ll_rn: + case Intrinsic::nvvm_f2ll_rp: + case Intrinsic::nvvm_f2ll_rz: + + case Intrinsic::nvvm_f2ull_rm: + case Intrinsic::nvvm_f2ull_rn: + case Intrinsic::nvvm_f2ull_rp: + case Intrinsic::nvvm_f2ull_rz: + + case Intrinsic::nvvm_d2ll_rm: + case Intrinsic::nvvm_d2ll_rn: + case Intrinsic::nvvm_d2ll_rp: + case Intrinsic::nvvm_d2ll_rz: + + case Intrinsic::nvvm_d2ull_rm: + case Intrinsic::nvvm_d2ull_rn: + case Intrinsic::nvvm_d2ull_rp: + case Intrinsic::nvvm_d2ull_rz: + return false; } + llvm_unreachable("Checking FTZ flag for invalid f2i/d2i intrinsic"); return false; } -inline bool IntrinsicConvertsToSignedInteger(Intrinsic::ID IntrinsicID) { +inline bool FPToIntegerIntrinsicResultIsSigned(Intrinsic::ID IntrinsicID) { switch (IntrinsicID) { // f2i case Intrinsic::nvvm_f2i_rm: @@ -96,12 +137,44 @@ inline bool IntrinsicConvertsToSignedInteger(Intrinsic::ID IntrinsicID) { case Intrinsic::nvvm_d2ll_rp: case Intrinsic::nvvm_d2ll_rz: return true; + + // f2ui + case Intrinsic::nvvm_f2ui_rm: + case Intrinsic::nvvm_f2ui_rm_ftz: + case Intrinsic::nvvm_f2ui_rn: + case Intrinsic::nvvm_f2ui_rn_ftz: + case Intrinsic::nvvm_f2ui_rp: + case Intrinsic::nvvm_f2ui_rp_ftz: + case Intrinsic::nvvm_f2ui_rz: + case Intrinsic::nvvm_f2ui_rz_ftz: + // d2ui + case Intrinsic::nvvm_d2ui_rm: + case Intrinsic::nvvm_d2ui_rn: + case Intrinsic::nvvm_d2ui_rp: + case Intrinsic::nvvm_d2ui_rz: + // f2ull + case Intrinsic::nvvm_f2ull_rm: + case Intrinsic::nvvm_f2ull_rm_ftz: + case Intrinsic::nvvm_f2ull_rn: + case Intrinsic::nvvm_f2ull_rn_ftz: + case Intrinsic::nvvm_f2ull_rp: + case Intrinsic::nvvm_f2ull_rp_ftz: + case Intrinsic::nvvm_f2ull_rz: + case Intrinsic::nvvm_f2ull_rz_ftz: + // d2ull + case Intrinsic::nvvm_d2ull_rm: + case Intrinsic::nvvm_d2ull_rn: + case Intrinsic::nvvm_d2ull_rp: + case Intrinsic::nvvm_d2ull_rz: + 
return false; } + llvm_unreachable( + "Checking invalid f2i/d2i intrinsic for signed int conversion"); return false; } inline APFloat::roundingMode -IntrinsicGetRoundingMode(Intrinsic::ID IntrinsicID) { +GetFPToIntegerRoundingMode(Intrinsic::ID IntrinsicID) { switch (IntrinsicID) { // RM: case Intrinsic::nvvm_f2i_rm: @@ -167,10 +240,100 @@ IntrinsicGetRoundingMode(Intrinsic::ID IntrinsicID) { case Intrinsic::nvvm_d2ull_rz: return APFloat::rmTowardZero; } - llvm_unreachable("Invalid f2i/d2i rounding mode intrinsic"); + llvm_unreachable("Checking rounding mode for invalid f2i/d2i intrinsic"); return APFloat::roundingMode::Invalid; } +inline bool FMinFMaxShouldFTZ(Intrinsic::ID IntrinsicID) { + switch (IntrinsicID) { + case Intrinsic::nvvm_fmax_ftz_f: + case Intrinsic::nvvm_fmax_ftz_nan_f: + case Intrinsic::nvvm_fmax_ftz_nan_xorsign_abs_f: + case Intrinsic::nvvm_fmax_ftz_xorsign_abs_f: + + case Intrinsic::nvvm_fmin_ftz_f: + case Intrinsic::nvvm_fmin_ftz_nan_f: + case Intrinsic::nvvm_fmin_ftz_nan_xorsign_abs_f: + case Intrinsic::nvvm_fmin_ftz_xorsign_abs_f: + return true; + + case Intrinsic::nvvm_fmax_d: + case Intrinsic::nvvm_fmax_f: + case Intrinsic::nvvm_fmax_nan_f: + case Intrinsic::nvvm_fmax_nan_xorsign_abs_f: + case Intrinsic::nvvm_fmax_xorsign_abs_f: + + case Intrinsic::nvvm_fmin_d: + case Intrinsic::nvvm_fmin_f: + case Intrinsic::nvvm_fmin_nan_f: + case Intrinsic::nvvm_fmin_nan_xorsign_abs_f: + case Intrinsic::nvvm_fmin_xorsign_abs_f: + return false; + } + llvm_unreachable("Checking FTZ flag for invalid fmin/fmax intrinsic"); + return false; +} + +inline bool FMinFMaxPropagatesNaNs(Intrinsic::ID IntrinsicID) { + switch (IntrinsicID) { + case Intrinsic::nvvm_fmax_ftz_nan_f: + case Intrinsic::nvvm_fmax_nan_f: + case Intrinsic::nvvm_fmax_ftz_nan_xorsign_abs_f: + case Intrinsic::nvvm_fmax_nan_xorsign_abs_f: + + case Intrinsic::nvvm_fmin_ftz_nan_f: + case Intrinsic::nvvm_fmin_nan_f: + case Intrinsic::nvvm_fmin_ftz_nan_xorsign_abs_f: + case 
Intrinsic::nvvm_fmin_nan_xorsign_abs_f: + return true; + + case Intrinsic::nvvm_fmax_d: + case Intrinsic::nvvm_fmax_f: + case Intrinsic::nvvm_fmax_ftz_f: + case Intrinsic::nvvm_fmax_ftz_xorsign_abs_f: + case Intrinsic::nvvm_fmax_xorsign_abs_f: + + case Intrinsic::nvvm_fmin_d: + case Intrinsic::nvvm_fmin_f: + case Intrinsic::nvvm_fmin_ftz_f: + case Intrinsic::nvvm_fmin_ftz_xorsign_abs_f: + case Intrinsic::nvvm_fmin_xorsign_abs_f: + return false; + } + llvm_unreachable("Checking NaN flag for invalid fmin/fmax intrinsic"); + return false; +} + +inline bool FMinFMaxIsXorSignAbs(Intrinsic::ID IntrinsicID) { + switch (IntrinsicID) { + case Intrinsic::nvvm_fmax_ftz_nan_xorsign_abs_f: + case Intrinsic::nvvm_fmax_ftz_xorsign_abs_f: + case Intrinsic::nvvm_fmax_nan_xorsign_abs_f: + case Intrinsic::nvvm_fmax_xorsign_abs_f: + + case Intrinsic::nvvm_fmin_ftz_nan_xorsign_abs_f: + case Intrinsic::nvvm_fmin_ftz_xorsign_abs_f: + case Intrinsic::nvvm_fmin_nan_xorsign_abs_f: + case Intrinsic::nvvm_fmin_xorsign_abs_f: + return true; + + case Intrinsic::nvvm_fmax_d: + case Intrinsic::nvvm_fmax_f: + case Intrinsic::nvvm_fmax_ftz_f: + case Intrinsic::nvvm_fmax_ftz_nan_f: + case Intrinsic::nvvm_fmax_nan_f: + + case Intrinsic::nvvm_fmin_d: + case Intrinsic::nvvm_fmin_f: + case Intrinsic::nvvm_fmin_ftz_f: + case Intrinsic::nvvm_fmin_ftz_nan_f: + case Intrinsic::nvvm_fmin_nan_f: + return false; + } + llvm_unreachable("Checking XorSignAbs flag for invalid fmin/fmax intrinsic"); + return false; +} + } // namespace nvvm } // namespace llvm #endif // LLVM_IR_NVVMINTRINSICUTILS_H diff --git a/llvm/include/llvm/IR/PrintPasses.h b/llvm/include/llvm/IR/PrintPasses.h index 95b97e76c867c..0aa1b379c35cf 100644 --- a/llvm/include/llvm/IR/PrintPasses.h +++ b/llvm/include/llvm/IR/PrintPasses.h @@ -51,6 +51,9 @@ std::vector printAfterPasses(); // Returns true if we should always print the entire module. bool forcePrintModuleIR(); +// Returns true if we should print the entire function for loop passes. 
+bool forcePrintFuncIR(); + // Return true if -filter-passes is empty or contains the pass name. bool isPassInPrintList(StringRef PassName); bool isFilterPassesEmpty(); diff --git a/llvm/include/llvm/Object/COFF.h b/llvm/include/llvm/Object/COFF.h index 05b3587224c29..4de2c680f57b1 100644 --- a/llvm/include/llvm/Object/COFF.h +++ b/llvm/include/llvm/Object/COFF.h @@ -392,6 +392,11 @@ class COFFSymbolRef { getValue() == 0; } + bool isEmptySectionDeclaration() const { + return isSection() && getSectionNumber() == COFF::IMAGE_SYM_UNDEFINED && + getValue() == 0; + } + bool isWeakExternal() const { return getStorageClass() == COFF::IMAGE_SYM_CLASS_WEAK_EXTERNAL; } diff --git a/llvm/include/llvm/ProfileData/Coverage/CoverageMapping.h b/llvm/include/llvm/ProfileData/Coverage/CoverageMapping.h index 81307d7b025d9..5a20a9ef63287 100644 --- a/llvm/include/llvm/ProfileData/Coverage/CoverageMapping.h +++ b/llvm/include/llvm/ProfileData/Coverage/CoverageMapping.h @@ -748,10 +748,15 @@ struct FunctionRecord { }; /// Iterator over Functions, optionally filtered to a single file. +/// When filtering to a single file, the iterator requires a list of potential +/// indices where to find the desired records to avoid quadratic behavior when +/// repeatedly iterating over functions from different files. class FunctionRecordIterator : public iterator_facade_base { ArrayRef Records; + ArrayRef RecordIndices; + ArrayRef::iterator CurrentIndex; ArrayRef::iterator Current; StringRef Filename; @@ -760,8 +765,17 @@ class FunctionRecordIterator public: FunctionRecordIterator(ArrayRef Records_, - StringRef Filename = "") - : Records(Records_), Current(Records.begin()), Filename(Filename) { + StringRef Filename = "", + ArrayRef RecordIndices_ = {}) + : Records(Records_), RecordIndices(RecordIndices_), + CurrentIndex(RecordIndices.begin()), + // If `RecordIndices` is provided, we can skip directly to the first + // index it provides. + Current(CurrentIndex == RecordIndices.end() ? 
Records.begin() + : &Records[*CurrentIndex]), + Filename(Filename) { + assert(Filename.empty() == RecordIndices_.empty() && + "If `Filename` is specified, `RecordIndices` must also be provided"); skipOtherFiles(); } @@ -774,11 +788,29 @@ class FunctionRecordIterator const FunctionRecord &operator*() const { return *Current; } FunctionRecordIterator &operator++() { - assert(Current != Records.end() && "incremented past end"); - ++Current; + advanceOne(); skipOtherFiles(); return *this; } + +private: + void advanceOne() { + if (RecordIndices.empty()) { + // Iteration over all entries, advance in the list of records. + assert(Current != Records.end() && "incremented past end"); + ++Current; + } else { + // Iterator over entries filtered by file name. Advance in the list of + // indices, and adjust the cursor in the list of records accordingly. + assert(CurrentIndex != RecordIndices.end() && "incremented past end"); + ++CurrentIndex; + if (CurrentIndex == RecordIndices.end()) { + Current = Records.end(); + } else { + Current = &Records[*CurrentIndex]; + } + } + } }; /// Coverage information for a macro expansion or #included file. @@ -1037,8 +1069,10 @@ class CoverageMapping { /// Gets all of the functions in a particular file. iterator_range getCoveredFunctions(StringRef Filename) const { - return make_range(FunctionRecordIterator(Functions, Filename), - FunctionRecordIterator()); + return make_range( + FunctionRecordIterator(Functions, Filename, + getImpreciseRecordIndicesForFilename(Filename)), + FunctionRecordIterator()); } /// Get the list of function instantiation groups in a particular file. 
diff --git a/llvm/include/llvm/ProfileData/PGOCtxProfReader.h b/llvm/include/llvm/ProfileData/PGOCtxProfReader.h index d358041e3a001..ffffae1a872a5 100644 --- a/llvm/include/llvm/ProfileData/PGOCtxProfReader.h +++ b/llvm/include/llvm/ProfileData/PGOCtxProfReader.h @@ -183,5 +183,8 @@ class PGOCtxProfileReader final { Expected> loadContexts(); }; + +void convertCtxProfToYaml(raw_ostream &OS, + const PGOCtxProfContext::CallTargetMapTy &); } // namespace llvm #endif diff --git a/llvm/include/llvm/ProfileData/PGOCtxProfWriter.h b/llvm/include/llvm/ProfileData/PGOCtxProfWriter.h index f6158609c1285..43a190ae0aa05 100644 --- a/llvm/include/llvm/ProfileData/PGOCtxProfWriter.h +++ b/llvm/include/llvm/ProfileData/PGOCtxProfWriter.h @@ -81,14 +81,6 @@ class PGOCtxProfileWriter final { static constexpr StringRef ContainerMagic = "CTXP"; }; -/// Representation of the context node suitable for yaml / json serialization / -/// deserialization. -struct SerializableCtxRepresentation { - ctx_profile::GUID Guid = 0; - std::vector Counters; - std::vector> Callsites; -}; - Error createCtxProfFromYAML(StringRef Profile, raw_ostream &Out); } // namespace llvm #endif diff --git a/llvm/include/llvm/SandboxIR/Instruction.h b/llvm/include/llvm/SandboxIR/Instruction.h index 34a7feb63bec4..49ea6707ecd82 100644 --- a/llvm/include/llvm/SandboxIR/Instruction.h +++ b/llvm/include/llvm/SandboxIR/Instruction.h @@ -2478,13 +2478,12 @@ class CmpInst : public SingleLLVMInstructionImpl { public: using Predicate = llvm::CmpInst::Predicate; - static CmpInst *create(Predicate Pred, Value *S1, Value *S2, - InsertPosition Pos, Context &Ctx, - const Twine &Name = ""); - static CmpInst *createWithCopiedFlags(Predicate Pred, Value *S1, Value *S2, - const Instruction *FlagsSource, - InsertPosition Pos, Context &Ctx, - const Twine &Name = ""); + static Value *create(Predicate Pred, Value *S1, Value *S2, InsertPosition Pos, + Context &Ctx, const Twine &Name = ""); + static Value *createWithCopiedFlags(Predicate 
Pred, Value *S1, Value *S2, + const Instruction *FlagsSource, + InsertPosition Pos, Context &Ctx, + const Twine &Name = ""); void setPredicate(Predicate P); void swapOperands(); diff --git a/llvm/include/llvm/Support/FileSystem.h b/llvm/include/llvm/Support/FileSystem.h index 9cf53360b4e96..245e4a24c70df 100644 --- a/llvm/include/llvm/Support/FileSystem.h +++ b/llvm/include/llvm/Support/FileSystem.h @@ -44,10 +44,6 @@ #include #include -#ifdef HAVE_SYS_STAT_H -#include -#endif - namespace llvm { namespace sys { namespace fs { diff --git a/llvm/include/llvm/Support/MathExtras.h b/llvm/include/llvm/Support/MathExtras.h index a52a9f07bacd4..574e9a6116603 100644 --- a/llvm/include/llvm/Support/MathExtras.h +++ b/llvm/include/llvm/Support/MathExtras.h @@ -44,14 +44,15 @@ using common_sint = namespace numbers { // TODO: Track C++20 std::numbers. // TODO: Favor using the hexadecimal FP constants (requires C++17). -constexpr double e = 2.7182818284590452354, // (0x1.5bf0a8b145749P+1) https://oeis.org/A001113 +// clang-format off +constexpr double e = 2.7182818284590452354, // (0x1.5bf0a8b145769P+1) https://oeis.org/A001113 egamma = .57721566490153286061, // (0x1.2788cfc6fb619P-1) https://oeis.org/A001620 ln2 = .69314718055994530942, // (0x1.62e42fefa39efP-1) https://oeis.org/A002162 - ln10 = 2.3025850929940456840, // (0x1.24bb1bbb55516P+1) https://oeis.org/A002392 + ln10 = 2.3025850929940456840, // (0x1.26bb1bbb55516P+1) https://oeis.org/A002392 log2e = 1.4426950408889634074, // (0x1.71547652b82feP+0) log10e = .43429448190325182765, // (0x1.bcb7b1526e50eP-2) pi = 3.1415926535897932385, // (0x1.921fb54442d18P+1) https://oeis.org/A000796 - inv_pi = .31830988618379067154, // (0x1.45f306bc9c883P-2) https://oeis.org/A049541 + inv_pi = .31830988618379067154, // (0x1.45f306dc9c883P-2) https://oeis.org/A049541 sqrtpi = 1.7724538509055160273, // (0x1.c5bf891b4ef6bP+0) https://oeis.org/A002161 inv_sqrtpi = .56418958354775628695, // (0x1.20dd750429b6dP-1) https://oeis.org/A087197 
sqrt2 = 1.4142135623730950488, // (0x1.6a09e667f3bcdP+0) https://oeis.org/A00219 @@ -74,6 +75,7 @@ constexpr float ef = 2.71828183F, // (0x1.5bf0a8P+1) https://oeis.org/A sqrt3f = 1.73205081F, // (0x1.bb67aeP+0) https://oeis.org/A002194 inv_sqrt3f = .577350269F, // (0x1.279a74P-1) phif = 1.61803399F; // (0x1.9e377aP+0) https://oeis.org/A001622 +// clang-format on } // namespace numbers /// Create a bitmask with the N right-most bits set to 1, and all other diff --git a/llvm/include/llvm/TableGen/Record.h b/llvm/include/llvm/TableGen/Record.h index b15e9fc7328da..d9930a48e8084 100644 --- a/llvm/include/llvm/TableGen/Record.h +++ b/llvm/include/llvm/TableGen/Record.h @@ -1816,7 +1816,7 @@ class Record { assert(!CorrespondingDefInit && "changing type of record after it has been referenced"); assert(!isSubClassOf(R) && "Already subclassing record!"); - SuperClasses.push_back(std::make_pair(R, Range)); + SuperClasses.emplace_back(R, Range); } /// If there are any field references that refer to fields that have been @@ -1971,21 +1971,20 @@ class RecordKeeper { } void addClass(std::unique_ptr R) { - bool Ins = Classes.insert(std::make_pair(std::string(R->getName()), - std::move(R))).second; + bool Ins = + Classes.try_emplace(std::string(R->getName()), std::move(R)).second; (void)Ins; assert(Ins && "Class already exists"); } void addDef(std::unique_ptr R) { - bool Ins = Defs.insert(std::make_pair(std::string(R->getName()), - std::move(R))).second; + bool Ins = Defs.try_emplace(std::string(R->getName()), std::move(R)).second; (void)Ins; assert(Ins && "Record already exists"); } void addExtraGlobal(StringRef Name, const Init *I) { - bool Ins = ExtraGlobals.insert(std::make_pair(std::string(Name), I)).second; + bool Ins = ExtraGlobals.try_emplace(std::string(Name), I).second; (void)Ins; assert(!getDef(Name)); assert(Ins && "Global already exists"); @@ -2071,14 +2070,14 @@ struct LessRecordRegister { for (size_t I = 0, E = Rec.size(); I != E; ++I, ++Len) { bool IsDigit = 
isDigit(Curr[I]); if (IsDigit != IsDigitPart) { - Parts.push_back(std::make_pair(IsDigitPart, StringRef(Start, Len))); + Parts.emplace_back(IsDigitPart, StringRef(Start, Len)); Len = 0; Start = &Curr[I]; IsDigitPart = isDigit(Curr[I]); } } // Push the last part. - Parts.push_back(std::make_pair(IsDigitPart, StringRef(Start, Len))); + Parts.emplace_back(IsDigitPart, StringRef(Start, Len)); } size_t size() { return Parts.size(); } diff --git a/llvm/include/llvm/TargetParser/AArch64TargetParser.h b/llvm/include/llvm/TargetParser/AArch64TargetParser.h index 63f06a3a69298..0338770593bc4 100644 --- a/llvm/include/llvm/TargetParser/AArch64TargetParser.h +++ b/llvm/include/llvm/TargetParser/AArch64TargetParser.h @@ -270,13 +270,16 @@ void fillValidCPUArchList(SmallVectorImpl &Values); bool isX18ReservedByDefault(const Triple &TT); -// Return the priority for a given set of FMV features. +// For a given set of feature names, which can be either target-features, or +// fmv-features metadata, expand their dependencies and then return a bitmask +// corresponding to the entries of AArch64::FeatPriorities. uint64_t getFMVPriority(ArrayRef Features); -// For given feature names, return a bitmask corresponding to the entries of -// AArch64::CPUFeatures. The values in CPUFeatures are not bitmasks themselves, -// they are sequential (0, 1, 2, 3, ...). The resulting bitmask is used at -// runtime to test whether a certain FMV feature is available on the host. +// For a given set of FMV feature names, expand their dependencies and then +// return a bitmask corresponding to the entries of AArch64::CPUFeatures. +// The values in CPUFeatures are not bitmasks themselves, they are sequential +// (0, 1, 2, 3, ...). The resulting bitmask is used at runtime to test whether +// a certain FMV feature is available on the host. 
uint64_t getCpuSupportsMask(ArrayRef Features); void PrintSupportedExtensions(); diff --git a/llvm/include/llvm/Transforms/Vectorize/SandboxVectorizer/InstrMaps.h b/llvm/include/llvm/Transforms/Vectorize/SandboxVectorizer/InstrMaps.h new file mode 100644 index 0000000000000..c931319d3b002 --- /dev/null +++ b/llvm/include/llvm/Transforms/Vectorize/SandboxVectorizer/InstrMaps.h @@ -0,0 +1,110 @@ +//===- InstrMaps.h ----------------------------------------------*- C++ -*-===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#ifndef LLVM_TRANSFORMS_VECTORIZE_SANDBOXVEC_PASSES_INSTRMAPS_H +#define LLVM_TRANSFORMS_VECTORIZE_SANDBOXVEC_PASSES_INSTRMAPS_H + +#include "llvm/ADT/ArrayRef.h" +#include "llvm/ADT/DenseMap.h" +#include "llvm/ADT/SmallSet.h" +#include "llvm/ADT/SmallVector.h" +#include "llvm/SandboxIR/Context.h" +#include "llvm/SandboxIR/Instruction.h" +#include "llvm/SandboxIR/Value.h" +#include "llvm/Support/Casting.h" +#include "llvm/Support/raw_ostream.h" +#include + +namespace llvm::sandboxir { + +/// Maps the original instructions to the vectorized instrs and the reverse. +/// For now an original instr can only map to a single vector. +class InstrMaps { + /// A map from the original values that got combined into vectors, to the + /// vector value(s). + DenseMap OrigToVectorMap; + /// A map from the vector value to a map of the original value to its lane. + /// Please note that for constant vectors, there may be multiple original values + /// with the same lane, as they may be coming from vectorizing different + /// original values. + DenseMap> VectorToOrigLaneMap; + Context &Ctx; + std::optional EraseInstrCB; + +private: + void notifyEraseInstr(Value *V) { + // We don't know if V is an original or a vector value. 
+ auto It = OrigToVectorMap.find(V); + if (It != OrigToVectorMap.end()) { + // V is an original value. + // Remove it from VectorToOrigLaneMap. + Value *Vec = It->second; + VectorToOrigLaneMap[Vec].erase(V); + // Now erase V from OrigToVectorMap. + OrigToVectorMap.erase(It); + } else { + // V is a vector value. + // Go over the original values it came from and remove them from + // OrigToVectorMap. + for (auto [Orig, Lane] : VectorToOrigLaneMap[V]) + OrigToVectorMap.erase(Orig); + // Now erase V from VectorToOrigLaneMap. + VectorToOrigLaneMap.erase(V); + } + } + +public: + InstrMaps(Context &Ctx) : Ctx(Ctx) { + EraseInstrCB = Ctx.registerEraseInstrCallback( + [this](Instruction *I) { notifyEraseInstr(I); }); + } + ~InstrMaps() { Ctx.unregisterEraseInstrCallback(*EraseInstrCB); } + /// \Returns the vector value that we got from vectorizing \p Orig, or + /// nullptr if not found. + Value *getVectorForOrig(Value *Orig) const { + auto It = OrigToVectorMap.find(Orig); + return It != OrigToVectorMap.end() ? It->second : nullptr; + } + /// \Returns the lane of \p Orig before it got vectorized into \p Vec, or + /// nullopt if not found. + std::optional getOrigLane(Value *Vec, Value *Orig) const { + auto It1 = VectorToOrigLaneMap.find(Vec); + if (It1 == VectorToOrigLaneMap.end()) + return std::nullopt; + const auto &OrigToLaneMap = It1->second; + auto It2 = OrigToLaneMap.find(Orig); + if (It2 == OrigToLaneMap.end()) + return std::nullopt; + return It2->second; + } + /// Update the map to reflect that \p Origs got vectorized into \p Vec. 
+ void registerVector(ArrayRef Origs, Value *Vec) { + auto &OrigToLaneMap = VectorToOrigLaneMap[Vec]; + for (auto [Lane, Orig] : enumerate(Origs)) { + auto Pair = OrigToVectorMap.try_emplace(Orig, Vec); + assert(Pair.second && "Orig already exists in the map!"); + (void)Pair; + OrigToLaneMap[Orig] = Lane; + } + } + void clear() { + OrigToVectorMap.clear(); + VectorToOrigLaneMap.clear(); + } +#ifndef NDEBUG + void print(raw_ostream &OS) const { + OS << "OrigToVectorMap:\n"; + for (auto [Orig, Vec] : OrigToVectorMap) + OS << *Orig << " : " << *Vec << "\n"; + } + LLVM_DUMP_METHOD void dump() const; +#endif +}; +} // namespace llvm::sandboxir + +#endif // LLVM_TRANSFORMS_VECTORIZE_SANDBOXVEC_PASSES_INSTRMAPS_H diff --git a/llvm/include/llvm/Transforms/Vectorize/SandboxVectorizer/Interval.h b/llvm/include/llvm/Transforms/Vectorize/SandboxVectorizer/Interval.h index 922dd2c3a1f89..18cd29e9e14ee 100644 --- a/llvm/include/llvm/Transforms/Vectorize/SandboxVectorizer/Interval.h +++ b/llvm/include/llvm/Transforms/Vectorize/SandboxVectorizer/Interval.h @@ -136,13 +136,7 @@ template class Interval { return bottom()->comesBefore(Other.top()); } /// \Returns true if this and \p Other have nothing in common. - bool disjoint(const Interval &Other) const { - if (Other.empty()) - return true; - if (empty()) - return true; - return Other.Bottom->comesBefore(Top) || Bottom->comesBefore(Other.Top); - } + bool disjoint(const Interval &Other) const; /// \Returns the intersection between this and \p Other. 
// Example: // |----| this @@ -232,23 +226,7 @@ template class Interval { } #ifndef NDEBUG - void print(raw_ostream &OS) const { - auto *Top = top(); - auto *Bot = bottom(); - OS << "Top: "; - if (Top != nullptr) - OS << *Top; - else - OS << "nullptr"; - OS << "\n"; - - OS << "Bot: "; - if (Bot != nullptr) - OS << *Bot; - else - OS << "nullptr"; - OS << "\n"; - } + void print(raw_ostream &OS) const; LLVM_DUMP_METHOD void dump() const; #endif }; diff --git a/llvm/include/llvm/Transforms/Vectorize/SandboxVectorizer/Legality.h b/llvm/include/llvm/Transforms/Vectorize/SandboxVectorizer/Legality.h index 233cf82a1b3df..4858ebaf0770a 100644 --- a/llvm/include/llvm/Transforms/Vectorize/SandboxVectorizer/Legality.h +++ b/llvm/include/llvm/Transforms/Vectorize/SandboxVectorizer/Legality.h @@ -23,10 +23,64 @@ namespace llvm::sandboxir { class LegalityAnalysis; class Value; +class InstrMaps; + +class ShuffleMask { +public: + using IndicesVecT = SmallVector; + +private: + IndicesVecT Indices; + +public: + ShuffleMask(SmallVectorImpl &&Indices) : Indices(std::move(Indices)) {} + ShuffleMask(std::initializer_list Indices) : Indices(Indices) {} + explicit ShuffleMask(ArrayRef Indices) : Indices(Indices) {} + operator ArrayRef() const { return Indices; } + /// Creates and returns an identity shuffle mask of size \p Sz. + /// For example if Sz == 4 the returned mask is {0, 1, 2, 3}. + static ShuffleMask getIdentity(unsigned Sz) { + IndicesVecT Indices; + Indices.reserve(Sz); + for (auto Idx : seq(0, (int)Sz)) + Indices.push_back(Idx); + return ShuffleMask(std::move(Indices)); + } + /// \Returns true if the mask is a perfect identity mask with consecutive + /// indices, i.e., performs no lane shuffling, like 0,1,2,3... 
+ bool isIdentity() const { + for (auto [Idx, Elm] : enumerate(Indices)) { + if ((int)Idx != Elm) + return false; + } + return true; + } + bool operator==(const ShuffleMask &Other) const { + return Indices == Other.Indices; + } + bool operator!=(const ShuffleMask &Other) const { return !(*this == Other); } + size_t size() const { return Indices.size(); } + int operator[](int Idx) const { return Indices[Idx]; } + using const_iterator = IndicesVecT::const_iterator; + const_iterator begin() const { return Indices.begin(); } + const_iterator end() const { return Indices.end(); } +#ifndef NDEBUG + friend raw_ostream &operator<<(raw_ostream &OS, const ShuffleMask &Mask) { + Mask.print(OS); + return OS; + } + void print(raw_ostream &OS) const { + interleave(Indices, OS, [&OS](auto Elm) { OS << Elm; }, ","); + } + LLVM_DUMP_METHOD void dump() const; +#endif +}; enum class LegalityResultID { - Pack, ///> Collect scalar values. - Widen, ///> Vectorize by combining scalars to a vector. + Pack, ///> Collect scalar values. + Widen, ///> Vectorize by combining scalars to a vector. + DiamondReuse, ///> Don't generate new code, reuse existing vector. + DiamondReuseWithShuffle, ///> Reuse the existing vector but add a shuffle. }; /// The reason for vectorizing or not vectorizing. 
@@ -50,6 +104,10 @@ struct ToStr { return "Pack"; case LegalityResultID::Widen: return "Widen"; + case LegalityResultID::DiamondReuse: + return "DiamondReuse"; + case LegalityResultID::DiamondReuseWithShuffle: + return "DiamondReuseWithShuffle"; } llvm_unreachable("Unknown LegalityResultID enum"); } @@ -137,6 +195,35 @@ class Widen final : public LegalityResult { } }; +class DiamondReuse final : public LegalityResult { + friend class LegalityAnalysis; + Value *Vec; + DiamondReuse(Value *Vec) + : LegalityResult(LegalityResultID::DiamondReuse), Vec(Vec) {} + +public: + static bool classof(const LegalityResult *From) { + return From->getSubclassID() == LegalityResultID::DiamondReuse; + } + Value *getVector() const { return Vec; } +}; + +class DiamondReuseWithShuffle final : public LegalityResult { + friend class LegalityAnalysis; + Value *Vec; + ShuffleMask Mask; + DiamondReuseWithShuffle(Value *Vec, const ShuffleMask &Mask) + : LegalityResult(LegalityResultID::DiamondReuseWithShuffle), Vec(Vec), + Mask(Mask) {} + +public: + static bool classof(const LegalityResult *From) { + return From->getSubclassID() == LegalityResultID::DiamondReuseWithShuffle; + } + Value *getVector() const { return Vec; } + const ShuffleMask &getMask() const { return Mask; } +}; + class Pack final : public LegalityResultWithReason { Pack(ResultReason Reason) : LegalityResultWithReason(LegalityResultID::Pack, Reason) {} @@ -148,6 +235,58 @@ class Pack final : public LegalityResultWithReason { } }; +/// Describes how to collect the values needed by each lane. +class CollectDescr { +public: + /// Describes how to get a value element. If the value is a vector then it + /// also provides the index to extract it from. + class ExtractElementDescr { + Value *V; + /// The index in `V` that the value can be extracted from. + /// This is nullopt if we need to use `V` as a whole. 
+ std::optional ExtractIdx; + + public: + ExtractElementDescr(Value *V, int ExtractIdx) + : V(V), ExtractIdx(ExtractIdx) {} + ExtractElementDescr(Value *V) : V(V), ExtractIdx(std::nullopt) {} + Value *getValue() const { return V; } + bool needsExtract() const { return ExtractIdx.has_value(); } + int getExtractIdx() const { return *ExtractIdx; } + }; + + using DescrVecT = SmallVector; + DescrVecT Descrs; + +public: + CollectDescr(SmallVectorImpl &&Descrs) + : Descrs(std::move(Descrs)) {} + /// If all elements come from a single vector input, then return that vector + /// and also the shuffle mask required to get them in order. + std::optional> getSingleInput() const { + const auto &Descr0 = *Descrs.begin(); + Value *V0 = Descr0.getValue(); + if (!Descr0.needsExtract()) + return std::nullopt; + ShuffleMask::IndicesVecT MaskIndices; + MaskIndices.push_back(Descr0.getExtractIdx()); + for (const auto &Descr : drop_begin(Descrs)) { + if (!Descr.needsExtract()) + return std::nullopt; + if (Descr.getValue() != V0) + return std::nullopt; + MaskIndices.push_back(Descr.getExtractIdx()); + } + return std::make_pair(V0, ShuffleMask(std::move(MaskIndices))); + } + bool hasVectorInputs() const { + return any_of(Descrs, [](const auto &D) { return D.needsExtract(); }); + } + const SmallVector &getDescrs() const { + return Descrs; + } +}; + /// Performs the legality analysis and returns a LegalityResult object. class LegalityAnalysis { Scheduler Sched; @@ -160,11 +299,17 @@ class LegalityAnalysis { ScalarEvolution &SE; const DataLayout &DL; + InstrMaps &IMaps; + + /// Finds how we can collect the values in \p Bndl from the vectorized or + /// non-vectorized code. It returns a map of the value we should extract from + /// and the corresponding shuffle mask we need to use. 
+ CollectDescr getHowToCollectValues(ArrayRef Bndl) const; public: LegalityAnalysis(AAResults &AA, ScalarEvolution &SE, const DataLayout &DL, - Context &Ctx) - : Sched(AA, Ctx), SE(SE), DL(DL) {} + Context &Ctx, InstrMaps &IMaps) + : Sched(AA, Ctx), SE(SE), DL(DL), IMaps(IMaps) {} /// A LegalityResult factory. template ResultT &createLegalityResult(ArgsT... Args) { @@ -177,7 +322,7 @@ class LegalityAnalysis { // TODO: Try to remove the SkipScheduling argument by refactoring the tests. const LegalityResult &canVectorize(ArrayRef Bndl, bool SkipScheduling = false); - void clear() { Sched.clear(); } + void clear(); }; } // namespace llvm::sandboxir diff --git a/llvm/include/llvm/Transforms/Vectorize/SandboxVectorizer/Passes/BottomUpVec.h b/llvm/include/llvm/Transforms/Vectorize/SandboxVectorizer/Passes/BottomUpVec.h index 1a53ca6e06f5f..ac051c3b6570f 100644 --- a/llvm/include/llvm/Transforms/Vectorize/SandboxVectorizer/Passes/BottomUpVec.h +++ b/llvm/include/llvm/Transforms/Vectorize/SandboxVectorizer/Passes/BottomUpVec.h @@ -18,6 +18,7 @@ #include "llvm/SandboxIR/Pass.h" #include "llvm/SandboxIR/PassManager.h" #include "llvm/Support/raw_ostream.h" +#include "llvm/Transforms/Vectorize/SandboxVectorizer/InstrMaps.h" #include "llvm/Transforms/Vectorize/SandboxVectorizer/Legality.h" namespace llvm::sandboxir { @@ -26,6 +27,8 @@ class BottomUpVec final : public FunctionPass { bool Change = false; std::unique_ptr Legality; DenseSet DeadInstrCandidates; + /// Maps scalars to vectors. + std::unique_ptr IMaps; /// Creates and returns a vector instruction that replaces the instructions in /// \p Bndl. \p Operands are the already vectorized operands. @@ -33,6 +36,8 @@ class BottomUpVec final : public FunctionPass { /// Erases all dead instructions from the dead instruction candidates /// collected during vectorization. void tryEraseDeadInstrs(); + /// Creates a shuffle instruction that shuffles \p VecOp according to \p Mask. 
+ Value *createShuffle(Value *VecOp, const ShuffleMask &Mask); /// Packs all elements of \p ToPack into a vector and returns that vector. Value *createPack(ArrayRef ToPack); void collectPotentiallyDeadInstrs(ArrayRef Bndl); diff --git a/llvm/include/llvm/Transforms/Vectorize/SandboxVectorizer/Scheduler.h b/llvm/include/llvm/Transforms/Vectorize/SandboxVectorizer/Scheduler.h index 3ec0ac0f78a74..52891c3f7535c 100644 --- a/llvm/include/llvm/Transforms/Vectorize/SandboxVectorizer/Scheduler.h +++ b/llvm/include/llvm/Transforms/Vectorize/SandboxVectorizer/Scheduler.h @@ -109,8 +109,16 @@ class SchedBundle { /// The list scheduler. class Scheduler { + /// This is a list-scheduler and this is the list containing the instructions + /// that are ready, meaning that all their dependency successors have already + /// been scheduled. ReadyListContainer ReadyList; + /// The dependency graph is used by the scheduler to determine the legal + /// ordering of instructions. DependencyGraph DAG; + /// This is the top of the schedule, i.e. the location where the scheduler + /// is about to place the scheduled instructions. It gets updated as we + /// schedule. std::optional ScheduleTopItOpt; // TODO: This is wasting memory in exchange for fast removal using a raw ptr. DenseMap> Bndls; @@ -145,7 +153,11 @@ class Scheduler { public: Scheduler(AAResults &AA, Context &Ctx) : DAG(AA, Ctx) {} ~Scheduler() {} - + /// Tries to build a schedule that includes all of \p Instrs scheduled at the + /// same scheduling cycle. This essentially checks that there are no + /// dependencies among \p Instrs. This function may involve scheduling + /// intermediate instructions or canceling and re-scheduling if needed. + /// \Returns true on success, false otherwise. bool trySchedule(ArrayRef Instrs); /// Clear the scheduler's state, including the DAG. 
void clear() { diff --git a/llvm/include/llvm/Transforms/Vectorize/SandboxVectorizer/VecUtils.h b/llvm/include/llvm/Transforms/Vectorize/SandboxVectorizer/VecUtils.h index 28fa33656dd5f..6cbbb396ea823 100644 --- a/llvm/include/llvm/Transforms/Vectorize/SandboxVectorizer/VecUtils.h +++ b/llvm/include/llvm/Transforms/Vectorize/SandboxVectorizer/VecUtils.h @@ -134,15 +134,13 @@ class VecUtils { return ScalarTy; } /// \Returns the first integer power of 2 that is <= Num. - static unsigned getFloorPowerOf2(unsigned Num) { - if (Num == 0) - return Num; - unsigned Mask = Num; - Mask >>= 1; - for (unsigned ShiftBy = 1; ShiftBy < sizeof(Num) * 8; ShiftBy <<= 1) - Mask |= Mask >> ShiftBy; - return Num & ~Mask; - } + static unsigned getFloorPowerOf2(unsigned Num); + +#ifndef NDEBUG + /// Helper dump function for debugging. + LLVM_DUMP_METHOD static void dump(ArrayRef Bndl); + LLVM_DUMP_METHOD static void dump(ArrayRef Bndl); +#endif // NDEBUG }; } // namespace llvm::sandboxir diff --git a/llvm/lib/Analysis/CmpInstAnalysis.cpp b/llvm/lib/Analysis/CmpInstAnalysis.cpp index 2580ea7e97248..3599428c5ff41 100644 --- a/llvm/lib/Analysis/CmpInstAnalysis.cpp +++ b/llvm/lib/Analysis/CmpInstAnalysis.cpp @@ -165,3 +165,17 @@ llvm::decomposeBitTestICmp(Value *LHS, Value *RHS, CmpInst::Predicate Pred, return Result; } + +std::optional +llvm::decomposeBitTest(Value *Cond, bool LookThruTrunc, bool AllowNonZeroC) { + if (auto *ICmp = dyn_cast(Cond)) { + // Don't allow pointers. Splat vectors are fine. 
+ if (!ICmp->getOperand(0)->getType()->isIntOrIntVectorTy()) + return std::nullopt; + return decomposeBitTestICmp(ICmp->getOperand(0), ICmp->getOperand(1), + ICmp->getPredicate(), LookThruTrunc, + AllowNonZeroC); + } + + return std::nullopt; +} diff --git a/llvm/lib/Analysis/ConstantFolding.cpp b/llvm/lib/Analysis/ConstantFolding.cpp index 031d675c330ec..3e87ea0e90fd5 100644 --- a/llvm/lib/Analysis/ConstantFolding.cpp +++ b/llvm/lib/Analysis/ConstantFolding.cpp @@ -1689,6 +1689,28 @@ bool llvm::canConstantFoldCallTo(const CallBase *Call, const Function *F) { case Intrinsic::x86_avx512_cvttsd2usi64: return !Call->isStrictFP(); + // NVVM FMax intrinsics + case Intrinsic::nvvm_fmax_d: + case Intrinsic::nvvm_fmax_f: + case Intrinsic::nvvm_fmax_ftz_f: + case Intrinsic::nvvm_fmax_ftz_nan_f: + case Intrinsic::nvvm_fmax_ftz_nan_xorsign_abs_f: + case Intrinsic::nvvm_fmax_ftz_xorsign_abs_f: + case Intrinsic::nvvm_fmax_nan_f: + case Intrinsic::nvvm_fmax_nan_xorsign_abs_f: + case Intrinsic::nvvm_fmax_xorsign_abs_f: + + // NVVM FMin intrinsics + case Intrinsic::nvvm_fmin_d: + case Intrinsic::nvvm_fmin_f: + case Intrinsic::nvvm_fmin_ftz_f: + case Intrinsic::nvvm_fmin_ftz_nan_f: + case Intrinsic::nvvm_fmin_ftz_nan_xorsign_abs_f: + case Intrinsic::nvvm_fmin_ftz_xorsign_abs_f: + case Intrinsic::nvvm_fmin_nan_f: + case Intrinsic::nvvm_fmin_nan_xorsign_abs_f: + case Intrinsic::nvvm_fmin_xorsign_abs_f: + // NVVM float/double to int32/uint32 conversion intrinsics case Intrinsic::nvvm_f2i_rm: case Intrinsic::nvvm_f2i_rn: @@ -1885,7 +1907,7 @@ Constant *GetConstantFoldFPValue128(float128 V, Type *Ty) { /// Clear the floating-point exception state. 
inline void llvm_fenv_clearexcept() { -#if defined(HAVE_FENV_H) && HAVE_DECL_FE_ALL_EXCEPT +#if HAVE_DECL_FE_ALL_EXCEPT feclearexcept(FE_ALL_EXCEPT); #endif errno = 0; @@ -1896,7 +1918,7 @@ inline bool llvm_fenv_testexcept() { int errno_val = errno; if (errno_val == ERANGE || errno_val == EDOM) return true; -#if defined(HAVE_FENV_H) && HAVE_DECL_FE_ALL_EXCEPT && HAVE_DECL_FE_INEXACT +#if HAVE_DECL_FE_ALL_EXCEPT && HAVE_DECL_FE_INEXACT if (fetestexcept(FE_ALL_EXCEPT & ~FE_INEXACT)) return true; #endif @@ -2431,9 +2453,10 @@ static Constant *ConstantFoldScalarCall1(StringRef Name, if (U.isNaN()) return ConstantInt::get(Ty, 0); - APFloat::roundingMode RMode = nvvm::IntrinsicGetRoundingMode(IntrinsicID); - bool IsFTZ = nvvm::IntrinsicShouldFTZ(IntrinsicID); - bool IsSigned = nvvm::IntrinsicConvertsToSignedInteger(IntrinsicID); + APFloat::roundingMode RMode = + nvvm::GetFPToIntegerRoundingMode(IntrinsicID); + bool IsFTZ = nvvm::FPToIntegerIntrinsicShouldFTZ(IntrinsicID); + bool IsSigned = nvvm::FPToIntegerIntrinsicResultIsSigned(IntrinsicID); APSInt ResInt(Ty->getIntegerBitWidth(), !IsSigned); auto FloatToRound = IsFTZ ? FTZPreserveSign(U) : U; @@ -2892,12 +2915,49 @@ static Constant *ConstantFoldIntrinsicCall2(Intrinsic::ID IntrinsicID, Type *Ty, case Intrinsic::minnum: case Intrinsic::maximum: case Intrinsic::minimum: + case Intrinsic::nvvm_fmax_d: + case Intrinsic::nvvm_fmin_d: // If one argument is undef, return the other argument. 
if (IsOp0Undef) return Operands[1]; if (IsOp1Undef) return Operands[0]; break; + + case Intrinsic::nvvm_fmax_f: + case Intrinsic::nvvm_fmax_ftz_f: + case Intrinsic::nvvm_fmax_ftz_nan_f: + case Intrinsic::nvvm_fmax_ftz_nan_xorsign_abs_f: + case Intrinsic::nvvm_fmax_ftz_xorsign_abs_f: + case Intrinsic::nvvm_fmax_nan_f: + case Intrinsic::nvvm_fmax_nan_xorsign_abs_f: + case Intrinsic::nvvm_fmax_xorsign_abs_f: + + case Intrinsic::nvvm_fmin_f: + case Intrinsic::nvvm_fmin_ftz_f: + case Intrinsic::nvvm_fmin_ftz_nan_f: + case Intrinsic::nvvm_fmin_ftz_nan_xorsign_abs_f: + case Intrinsic::nvvm_fmin_ftz_xorsign_abs_f: + case Intrinsic::nvvm_fmin_nan_f: + case Intrinsic::nvvm_fmin_nan_xorsign_abs_f: + case Intrinsic::nvvm_fmin_xorsign_abs_f: + // If one arg is undef, the other arg can be returned only if it is + // constant, as we may need to flush it to sign-preserving zero or + // canonicalize the NaN. + if (!IsOp0Undef && !IsOp1Undef) + break; + if (auto *Op = dyn_cast(Operands[IsOp0Undef ? 1 : 0])) { + if (Op->isNaN()) { + APInt NVCanonicalNaN(32, 0x7fffffff); + return ConstantFP::get( + Ty, APFloat(Ty->getFltSemantics(), NVCanonicalNaN)); + } + if (nvvm::FMinFMaxShouldFTZ(IntrinsicID)) + return ConstantFP::get(Ty, FTZPreserveSign(Op->getValueAPF())); + else + return Op; + } + break; } } @@ -2955,6 +3015,79 @@ static Constant *ConstantFoldIntrinsicCall2(Intrinsic::ID IntrinsicID, Type *Ty, return ConstantFP::get(Ty->getContext(), minimum(Op1V, Op2V)); case Intrinsic::maximum: return ConstantFP::get(Ty->getContext(), maximum(Op1V, Op2V)); + + case Intrinsic::nvvm_fmax_d: + case Intrinsic::nvvm_fmax_f: + case Intrinsic::nvvm_fmax_ftz_f: + case Intrinsic::nvvm_fmax_ftz_nan_f: + case Intrinsic::nvvm_fmax_ftz_nan_xorsign_abs_f: + case Intrinsic::nvvm_fmax_ftz_xorsign_abs_f: + case Intrinsic::nvvm_fmax_nan_f: + case Intrinsic::nvvm_fmax_nan_xorsign_abs_f: + case Intrinsic::nvvm_fmax_xorsign_abs_f: + + case Intrinsic::nvvm_fmin_d: + case Intrinsic::nvvm_fmin_f: + case 
Intrinsic::nvvm_fmin_ftz_f: + case Intrinsic::nvvm_fmin_ftz_nan_f: + case Intrinsic::nvvm_fmin_ftz_nan_xorsign_abs_f: + case Intrinsic::nvvm_fmin_ftz_xorsign_abs_f: + case Intrinsic::nvvm_fmin_nan_f: + case Intrinsic::nvvm_fmin_nan_xorsign_abs_f: + case Intrinsic::nvvm_fmin_xorsign_abs_f: { + + bool ShouldCanonicalizeNaNs = !(IntrinsicID == Intrinsic::nvvm_fmax_d || + IntrinsicID == Intrinsic::nvvm_fmin_d); + bool IsFTZ = nvvm::FMinFMaxShouldFTZ(IntrinsicID); + bool IsNaNPropagating = nvvm::FMinFMaxPropagatesNaNs(IntrinsicID); + bool IsXorSignAbs = nvvm::FMinFMaxIsXorSignAbs(IntrinsicID); + + APFloat A = IsFTZ ? FTZPreserveSign(Op1V) : Op1V; + APFloat B = IsFTZ ? FTZPreserveSign(Op2V) : Op2V; + + bool XorSign = false; + if (IsXorSignAbs) { + XorSign = A.isNegative() ^ B.isNegative(); + A = abs(A); + B = abs(B); + } + + bool IsFMax = false; + switch (IntrinsicID) { + case Intrinsic::nvvm_fmax_d: + case Intrinsic::nvvm_fmax_f: + case Intrinsic::nvvm_fmax_ftz_f: + case Intrinsic::nvvm_fmax_ftz_nan_f: + case Intrinsic::nvvm_fmax_ftz_nan_xorsign_abs_f: + case Intrinsic::nvvm_fmax_ftz_xorsign_abs_f: + case Intrinsic::nvvm_fmax_nan_f: + case Intrinsic::nvvm_fmax_nan_xorsign_abs_f: + case Intrinsic::nvvm_fmax_xorsign_abs_f: + IsFMax = true; + break; + } + APFloat Res = IsFMax ? 
maximum(A, B) : minimum(A, B); + + if (ShouldCanonicalizeNaNs) { + APFloat NVCanonicalNaN(Res.getSemantics(), APInt(32, 0x7fffffff)); + if (A.isNaN() && B.isNaN()) + return ConstantFP::get(Ty, NVCanonicalNaN); + else if (IsNaNPropagating && (A.isNaN() || B.isNaN())) + return ConstantFP::get(Ty, NVCanonicalNaN); + } + + if (A.isNaN() && B.isNaN()) + return Operands[1]; + else if (A.isNaN()) + Res = B; + else if (B.isNaN()) + Res = A; + + if (IsXorSignAbs && XorSign != Res.isNegative()) + Res.changeSign(); + + return ConstantFP::get(Ty->getContext(), Res); + } } if (!Ty->isHalfTy() && !Ty->isFloatTy() && !Ty->isDoubleTy()) diff --git a/llvm/lib/Analysis/CtxProfAnalysis.cpp b/llvm/lib/Analysis/CtxProfAnalysis.cpp index 825e517cd09f5..bbf29e0d370e7 100644 --- a/llvm/lib/Analysis/CtxProfAnalysis.cpp +++ b/llvm/lib/Analysis/CtxProfAnalysis.cpp @@ -19,7 +19,6 @@ #include "llvm/IR/PassManager.h" #include "llvm/ProfileData/PGOCtxProfReader.h" #include "llvm/Support/CommandLine.h" -#include "llvm/Support/JSON.h" #include "llvm/Support/MemoryBuffer.h" #define DEBUG_TYPE "ctx_prof" @@ -31,49 +30,13 @@ cl::opt static cl::opt PrintLevel( "ctx-profile-printer-level", - cl::init(CtxProfAnalysisPrinterPass::PrintMode::JSON), cl::Hidden, + cl::init(CtxProfAnalysisPrinterPass::PrintMode::YAML), cl::Hidden, cl::values(clEnumValN(CtxProfAnalysisPrinterPass::PrintMode::Everything, "everything", "print everything - most verbose"), - clEnumValN(CtxProfAnalysisPrinterPass::PrintMode::JSON, "json", - "just the json representation of the profile")), + clEnumValN(CtxProfAnalysisPrinterPass::PrintMode::YAML, "yaml", + "just the yaml representation of the profile")), cl::desc("Verbosity level of the contextual profile printer pass.")); -namespace llvm { -namespace json { -Value toJSON(const PGOCtxProfContext &P) { - Object Ret; - Ret["Guid"] = P.guid(); - Ret["Counters"] = Array(P.counters()); - if (P.callsites().empty()) - return Ret; - auto AllCS = - ::llvm::map_range(P.callsites(), [](const 
auto &P) { return P.first; }); - auto MaxIt = ::llvm::max_element(AllCS); - assert(MaxIt != AllCS.end() && "We should have a max value because the " - "callsites collection is not empty."); - Array CSites; - // Iterate to, and including, the maximum index. - for (auto I = 0U, Max = *MaxIt; I <= Max; ++I) { - CSites.push_back(Array()); - Array &Targets = *CSites.back().getAsArray(); - if (P.hasCallsite(I)) - for (const auto &[_, Ctx] : P.callsite(I)) - Targets.push_back(toJSON(Ctx)); - } - Ret["Callsites"] = std::move(CSites); - - return Ret; -} - -Value toJSON(const PGOCtxProfContext::CallTargetMapTy &P) { - Array Ret; - for (const auto &[_, Ctx] : P) - Ret.push_back(toJSON(Ctx)); - return Ret; -} -} // namespace json -} // namespace llvm - const char *AssignGUIDPass::GUIDMetadataName = "guid"; PreservedAnalyses AssignGUIDPass::run(Module &M, ModuleAnalysisManager &MAM) { @@ -214,15 +177,13 @@ PreservedAnalyses CtxProfAnalysisPrinterPass::run(Module &M, << ". MaxCallsiteID: " << FuncInfo.NextCallsiteIndex << "\n"; } - const auto JSONed = ::llvm::json::toJSON(C.profiles()); - if (Mode == PrintMode::Everything) OS << "\nCurrent Profile:\n"; - OS << formatv("{0:2}", JSONed); - if (Mode == PrintMode::JSON) + convertCtxProfToYaml(OS, C.profiles()); + OS << "\n"; + if (Mode == PrintMode::YAML) return PreservedAnalyses::all(); - OS << "\n"; OS << "\nFlat Profile:\n"; auto Flat = C.flatten(); for (const auto &[Guid, Counters] : Flat) { diff --git a/llvm/lib/Analysis/InstructionPrecedenceTracking.cpp b/llvm/lib/Analysis/InstructionPrecedenceTracking.cpp index fba5859b74cef..9555e2c8dd5dd 100644 --- a/llvm/lib/Analysis/InstructionPrecedenceTracking.cpp +++ b/llvm/lib/Analysis/InstructionPrecedenceTracking.cpp @@ -115,8 +115,9 @@ void InstructionPrecedenceTracking::insertInstructionTo(const Instruction *Inst, void InstructionPrecedenceTracking::removeInstruction(const Instruction *Inst) { auto *BB = Inst->getParent(); assert(BB && "must be called before instruction is 
actually removed"); - if (FirstSpecialInsts.count(BB) && FirstSpecialInsts[BB] == Inst) - FirstSpecialInsts.erase(BB); + auto It = FirstSpecialInsts.find(BB); + if (It != FirstSpecialInsts.end() && It->second == Inst) + FirstSpecialInsts.erase(It); } void InstructionPrecedenceTracking::removeUsersOf(const Instruction *Inst) { diff --git a/llvm/lib/Analysis/Loads.cpp b/llvm/lib/Analysis/Loads.cpp index 7bbd469bd035d..11ccfa33821ca 100644 --- a/llvm/lib/Analysis/Loads.cpp +++ b/llvm/lib/Analysis/Loads.cpp @@ -25,6 +25,8 @@ using namespace llvm; +extern cl::opt UseDerefAtPointSemantics; + static bool isAligned(const Value *Base, Align Alignment, const DataLayout &DL) { return Base->getPointerAlignment(DL) >= Alignment; @@ -168,7 +170,7 @@ static bool isDereferenceableAndAlignedPointer( Size, DL, CtxI, AC, DT, TLI, Visited, MaxDepth); - if (CtxI) { + if (CtxI && (!UseDerefAtPointSemantics || !V->canBeFreed())) { /// Look through assumes to see if both dereferencability and alignment can /// be proven by an assume if needed. RetainedKnowledge AlignRK; diff --git a/llvm/lib/Analysis/LoopInfo.cpp b/llvm/lib/Analysis/LoopInfo.cpp index 6bb5f001e9bd1..7bd5e1e0cfac8 100644 --- a/llvm/lib/Analysis/LoopInfo.cpp +++ b/llvm/lib/Analysis/LoopInfo.cpp @@ -999,6 +999,18 @@ void llvm::printLoop(Loop &L, raw_ostream &OS, const std::string &Banner) { return; } + if (forcePrintFuncIR()) { + // handling -print-loop-func-scope. + // -print-module-scope overrides this. + OS << Banner << " (loop: "; + L.getHeader()->printAsOperand(OS, false); + OS << ")\n"; + + // printing whole function. 
+ OS << *L.getHeader()->getParent(); + return; + } + OS << Banner; auto *PreHeader = L.getLoopPreheader(); diff --git a/llvm/lib/Analysis/TargetTransformInfo.cpp b/llvm/lib/Analysis/TargetTransformInfo.cpp index df42dc2746daf..8b9722d047edc 100644 --- a/llvm/lib/Analysis/TargetTransformInfo.cpp +++ b/llvm/lib/Analysis/TargetTransformInfo.cpp @@ -1383,6 +1383,14 @@ bool TargetTransformInfo::hasArmWideBranch(bool Thumb) const { return TTIImpl->hasArmWideBranch(Thumb); } +uint64_t TargetTransformInfo::getFeatureMask(const Function &F) const { + return TTIImpl->getFeatureMask(F); +} + +bool TargetTransformInfo::isMultiversionedFunction(const Function &F) const { + return TTIImpl->isMultiversionedFunction(F); +} + unsigned TargetTransformInfo::getMaxNumArgs() const { return TTIImpl->getMaxNumArgs(); } diff --git a/llvm/lib/Analysis/ValueTracking.cpp b/llvm/lib/Analysis/ValueTracking.cpp index 1bab9b32525c3..6e2f0ebde9bb6 100644 --- a/llvm/lib/Analysis/ValueTracking.cpp +++ b/llvm/lib/Analysis/ValueTracking.cpp @@ -6192,9 +6192,9 @@ Value *llvm::isBytewiseValue(Value *V, const DataLayout &DL) { if (isa(V)) return UndefInt8; - // Return Undef for zero-sized type. + // Return poison for zero-sized type. if (DL.getTypeStoreSize(V->getType()).isZero()) - return UndefInt8; + return PoisonValue::get(Type::getInt8Ty(Ctx)); Constant *C = dyn_cast(V); if (!C) { @@ -6886,7 +6886,7 @@ const Value *llvm::getUnderlyingObjectAggressive(const Value *V) { return FirstObject; } while (!Worklist.empty()); - return Object; + return Object ? 
Object : FirstObject; } /// This is the function that does the work of looking through basic diff --git a/llvm/lib/CodeGen/AsmPrinter/AsmPrinter.cpp b/llvm/lib/CodeGen/AsmPrinter/AsmPrinter.cpp index 55c1d12a6fa8f..b2a4721f37b26 100644 --- a/llvm/lib/CodeGen/AsmPrinter/AsmPrinter.cpp +++ b/llvm/lib/CodeGen/AsmPrinter/AsmPrinter.cpp @@ -561,11 +561,11 @@ bool AsmPrinter::doInitialization(Module &M) { if (MAI->doesSupportDebugInformation()) { bool EmitCodeView = M.getCodeViewFlag(); if (EmitCodeView && TM.getTargetTriple().isOSWindows()) - DebugHandlers.push_back(std::make_unique(this)); + Handlers.push_back(std::make_unique(this)); if (!EmitCodeView || M.getDwarfVersion()) { if (hasDebugInfo()) { DD = new DwarfDebug(this); - DebugHandlers.push_back(std::unique_ptr(DD)); + Handlers.push_back(std::unique_ptr(DD)); } } } @@ -632,12 +632,12 @@ bool AsmPrinter::doInitialization(Module &M) { // Emit tables for any value of cfguard flag (i.e. cfguard=1 or cfguard=2). if (mdconst::extract_or_null(M.getModuleFlag("cfguard"))) - Handlers.push_back(std::make_unique(this)); + EHHandlers.push_back(std::make_unique(this)); - for (auto &Handler : DebugHandlers) - Handler->beginModule(&M); for (auto &Handler : Handlers) Handler->beginModule(&M); + for (auto &Handler : EHHandlers) + Handler->beginModule(&M); return false; } @@ -784,7 +784,7 @@ void AsmPrinter::emitGlobalVariable(const GlobalVariable *GV) { // sections and expected to be contiguous (e.g. ObjC metadata). const Align Alignment = getGVAlignment(GV, DL); - for (auto &Handler : DebugHandlers) + for (auto &Handler : Handlers) Handler->setSymbolSize(GVSym, Size); // Handle common symbols @@ -1054,14 +1054,14 @@ void AsmPrinter::emitFunctionHeader() { } // Emit pre-function debug and/or EH information. 
- for (auto &Handler : DebugHandlers) { + for (auto &Handler : Handlers) { Handler->beginFunction(MF); Handler->beginBasicBlockSection(MF->front()); } - for (auto &Handler : Handlers) + for (auto &Handler : EHHandlers) { Handler->beginFunction(MF); - for (auto &Handler : Handlers) Handler->beginBasicBlockSection(MF->front()); + } // Emit the prologue data. if (F.hasPrologueData()) @@ -1836,7 +1836,7 @@ void AsmPrinter::emitFunctionBody() { if (MDNode *MD = MI.getPCSections()) emitPCSectionsLabel(*MF, *MD); - for (auto &Handler : DebugHandlers) + for (auto &Handler : Handlers) Handler->beginInstruction(&MI); if (isVerbose()) @@ -1952,7 +1952,7 @@ void AsmPrinter::emitFunctionBody() { if (MCSymbol *S = MI.getPostInstrSymbol()) OutStreamer->emitLabel(S); - for (auto &Handler : DebugHandlers) + for (auto &Handler : Handlers) Handler->endInstruction(); } @@ -2089,13 +2089,15 @@ void AsmPrinter::emitFunctionBody() { // Call endBasicBlockSection on the last block now, if it wasn't already // called. if (!MF->back().isEndSection()) { - for (auto &Handler : DebugHandlers) - Handler->endBasicBlockSection(MF->back()); for (auto &Handler : Handlers) Handler->endBasicBlockSection(MF->back()); + for (auto &Handler : EHHandlers) + Handler->endBasicBlockSection(MF->back()); } for (auto &Handler : Handlers) Handler->markFunctionEnd(); + for (auto &Handler : EHHandlers) + Handler->markFunctionEnd(); // Update the end label of the entry block's section. MBBSectionRanges[MF->front().getSectionID()].EndLabel = CurrentFnEnd; @@ -2103,10 +2105,10 @@ void AsmPrinter::emitFunctionBody() { emitJumpTableInfo(); // Emit post-function debug and/or EH information. - for (auto &Handler : DebugHandlers) - Handler->endFunction(MF); for (auto &Handler : Handlers) Handler->endFunction(MF); + for (auto &Handler : EHHandlers) + Handler->endFunction(MF); // Emit section containing BB address offsets and their metadata, when // BB labels are requested for this function. Skip empty functions. 
@@ -2583,17 +2585,16 @@ bool AsmPrinter::doFinalization(Module &M) { emitGlobalIFunc(M, IFunc); // Finalize debug and EH information. - for (auto &Handler : DebugHandlers) - Handler->endModule(); for (auto &Handler : Handlers) Handler->endModule(); + for (auto &Handler : EHHandlers) + Handler->endModule(); // This deletes all the ephemeral handlers that AsmPrinter added, while // keeping all the user-added handlers alive until the AsmPrinter is // destroyed. + EHHandlers.clear(); Handlers.erase(Handlers.begin() + NumUserHandlers, Handlers.end()); - DebugHandlers.erase(DebugHandlers.begin() + NumUserDebugHandlers, - DebugHandlers.end()); DD = nullptr; // If the target wants to know about weak references, print them all. @@ -4196,6 +4197,10 @@ void AsmPrinter::emitBasicBlockStart(const MachineBasicBlock &MBB) { Handler->endFunclet(); Handler->beginFunclet(MBB); } + for (auto &Handler : EHHandlers) { + Handler->endFunclet(); + Handler->beginFunclet(MBB); + } } // Switch to a new section if this basic block must begin a section. The @@ -4208,7 +4213,7 @@ void AsmPrinter::emitBasicBlockStart(const MachineBasicBlock &MBB) { CurrentSectionBeginSym = MBB.getSymbol(); } - for (auto &Handler : DebugHandlers) + for (auto &Handler : Handlers) Handler->beginCodeAlignment(MBB); // Emit an alignment directive for this block, if needed. @@ -4268,10 +4273,10 @@ void AsmPrinter::emitBasicBlockStart(const MachineBasicBlock &MBB) { // if it begins a section (Entry block call is handled separately, next to // beginFunction). if (MBB.isBeginSection() && !MBB.isEntryBlock()) { - for (auto &Handler : DebugHandlers) - Handler->beginBasicBlockSection(MBB); for (auto &Handler : Handlers) Handler->beginBasicBlockSection(MBB); + for (auto &Handler : EHHandlers) + Handler->beginBasicBlockSection(MBB); } } @@ -4279,10 +4284,10 @@ void AsmPrinter::emitBasicBlockEnd(const MachineBasicBlock &MBB) { // Check if CFI information needs to be updated for this MBB with basic block // sections. 
if (MBB.isEndSection()) { - for (auto &Handler : DebugHandlers) - Handler->endBasicBlockSection(MBB); for (auto &Handler : Handlers) Handler->endBasicBlockSection(MBB); + for (auto &Handler : EHHandlers) + Handler->endBasicBlockSection(MBB); } } @@ -4415,12 +4420,7 @@ void AsmPrinter::addAsmPrinterHandler( NumUserHandlers++; } -void AsmPrinter::addDebugHandler(std::unique_ptr Handler) { - DebugHandlers.insert(DebugHandlers.begin(), std::move(Handler)); - NumUserDebugHandlers++; -} - -/// Pin vtable to this file. +/// Pin vtables to this file. AsmPrinterHandler::~AsmPrinterHandler() = default; void AsmPrinterHandler::markFunctionEnd() {} diff --git a/llvm/lib/CodeGen/AsmPrinter/DwarfUnit.cpp b/llvm/lib/CodeGen/AsmPrinter/DwarfUnit.cpp index 0a8a1ad38c959..d3450b8b0556f 100644 --- a/llvm/lib/CodeGen/AsmPrinter/DwarfUnit.cpp +++ b/llvm/lib/CodeGen/AsmPrinter/DwarfUnit.cpp @@ -849,7 +849,9 @@ void DwarfUnit::constructTypeDIE(DIE &Buffer, const DIDerivedType *DTy) { } } -void DwarfUnit::constructSubprogramArguments(DIE &Buffer, DITypeRefArray Args) { +DIE *DwarfUnit::constructSubprogramArguments(DIE &Buffer, DITypeRefArray Args) { + // Args[0] is the return type. + DIE *ObjectPointer = nullptr; for (unsigned i = 1, N = Args.size(); i < N; ++i) { const DIType *Ty = Args[i]; if (!Ty) { @@ -860,8 +862,14 @@ void DwarfUnit::constructSubprogramArguments(DIE &Buffer, DITypeRefArray Args) { addType(Arg, Ty); if (Ty->isArtificial()) addFlag(Arg, dwarf::DW_AT_artificial); + if (Ty->isObjectPointer()) { + assert(!ObjectPointer && "Can't have more than one object pointer"); + ObjectPointer = &Arg; + } } } + + return ObjectPointer; } void DwarfUnit::constructTypeDIE(DIE &Buffer, const DISubroutineType *CTy) { @@ -1358,7 +1366,8 @@ void DwarfUnit::applySubprogramAttributes(const DISubprogram *SP, DIE &SPDie, // Add arguments. Do not add arguments for subprogram definition. They will // be handled while processing variables. 
- constructSubprogramArguments(SPDie, Args); + if (auto *ObjectPointer = constructSubprogramArguments(SPDie, Args)) + addDIEEntry(SPDie, dwarf::DW_AT_object_pointer, *ObjectPointer); } addThrownTypes(SPDie, SP->getThrownTypes()); diff --git a/llvm/lib/CodeGen/AsmPrinter/DwarfUnit.h b/llvm/lib/CodeGen/AsmPrinter/DwarfUnit.h index 163205378fb4b..7a5295d826a48 100644 --- a/llvm/lib/CodeGen/AsmPrinter/DwarfUnit.h +++ b/llvm/lib/CodeGen/AsmPrinter/DwarfUnit.h @@ -268,7 +268,9 @@ class DwarfUnit : public DIEUnit { void constructContainingTypeDIEs(); /// Construct function argument DIEs. - void constructSubprogramArguments(DIE &Buffer, DITypeRefArray Args); + /// + /// \returns DIE of the object pointer if one exists. Nullptr otherwise. + DIE *constructSubprogramArguments(DIE &Buffer, DITypeRefArray Args); /// Create a DIE with the given Tag, add the DIE to its parent, and /// call insertDIE if MD is not null. diff --git a/llvm/lib/CodeGen/AsmPrinter/PseudoProbePrinter.h b/llvm/lib/CodeGen/AsmPrinter/PseudoProbePrinter.h index 35461e53fbf19..f11b552387501 100644 --- a/llvm/lib/CodeGen/AsmPrinter/PseudoProbePrinter.h +++ b/llvm/lib/CodeGen/AsmPrinter/PseudoProbePrinter.h @@ -14,7 +14,6 @@ #define LLVM_LIB_CODEGEN_ASMPRINTER_PSEUDOPROBEPRINTER_H #include "llvm/ADT/DenseMap.h" -#include "llvm/CodeGen/AsmPrinterHandler.h" namespace llvm { diff --git a/llvm/lib/CodeGen/AssignmentTrackingAnalysis.cpp b/llvm/lib/CodeGen/AssignmentTrackingAnalysis.cpp index a465f52bfd593..dbc724629d3be 100644 --- a/llvm/lib/CodeGen/AssignmentTrackingAnalysis.cpp +++ b/llvm/lib/CodeGen/AssignmentTrackingAnalysis.cpp @@ -230,9 +230,10 @@ void FunctionVarLocs::init(FunctionVarLocsBuilder &Builder) { for (const DbgVariableRecord &DVR : filterDbgVars(I->getDbgRecordRange())) { // Even though DVR defines a variable location, VarLocsBeforeInst can // still be empty if that VarLoc was redundant. 
- if (!Builder.VarLocsBeforeInst.count(&DVR)) + auto It = Builder.VarLocsBeforeInst.find(&DVR); + if (It == Builder.VarLocsBeforeInst.end()) continue; - for (const VarLocInfo &VarLoc : Builder.VarLocsBeforeInst[&DVR]) + for (const VarLocInfo &VarLoc : It->second) VarLocRecords.emplace_back(VarLoc); } for (const VarLocInfo &VarLoc : P.second) diff --git a/llvm/lib/CodeGen/CodeGenPrepare.cpp b/llvm/lib/CodeGen/CodeGenPrepare.cpp index a3392b7110989..7106e53bd5516 100644 --- a/llvm/lib/CodeGen/CodeGenPrepare.cpp +++ b/llvm/lib/CodeGen/CodeGenPrepare.cpp @@ -7983,8 +7983,8 @@ class VectorPromoteHelper { /// \p UseSplat defines whether or not \p Val should be replicated /// across the whole vector. /// In other words, if UseSplat == true, we generate , - /// otherwise we generate a vector with as many undef as possible: - /// where \p Val is only + /// otherwise we generate a vector with as many poison as possible: + /// where \p Val is only /// used at the index of the extract. Value *getConstantVector(Constant *Val, bool UseSplat) const { unsigned ExtractIdx = std::numeric_limits::max(); @@ -8004,12 +8004,12 @@ class VectorPromoteHelper { if (!EC.isScalable()) { SmallVector ConstVec; - UndefValue *UndefVal = UndefValue::get(Val->getType()); + PoisonValue *PoisonVal = PoisonValue::get(Val->getType()); for (unsigned Idx = 0; Idx != EC.getKnownMinValue(); ++Idx) { if (Idx == ExtractIdx) ConstVec.push_back(Val); else - ConstVec.push_back(UndefVal); + ConstVec.push_back(PoisonVal); } return ConstantVector::get(ConstVec); } else diff --git a/llvm/lib/CodeGen/ExpandVectorPredication.cpp b/llvm/lib/CodeGen/ExpandVectorPredication.cpp index 5ca223852cbde..67df1cd5246ef 100644 --- a/llvm/lib/CodeGen/ExpandVectorPredication.cpp +++ b/llvm/lib/CodeGen/ExpandVectorPredication.cpp @@ -136,17 +136,6 @@ namespace { struct CachingVPExpander { const TargetTransformInfo &TTI; - /// \returns A (fixed length) vector with ascending integer indices - /// (<0, 1, ..., NumElems-1>). 
- /// \p Builder - /// Used for instruction creation. - /// \p LaneTy - /// Integer element type of the result vector. - /// \p NumElems - /// Number of vector elements. - Value *createStepVector(IRBuilder<> &Builder, Type *LaneTy, - unsigned NumElems); - /// \returns A bitmask that is true where the lane position is less-than \p /// EVLParam /// @@ -216,17 +205,6 @@ struct CachingVPExpander { //// CachingVPExpander { -Value *CachingVPExpander::createStepVector(IRBuilder<> &Builder, Type *LaneTy, - unsigned NumElems) { - // TODO add caching - SmallVector ConstElems; - - for (unsigned Idx = 0; Idx < NumElems; ++Idx) - ConstElems.push_back(ConstantInt::get(LaneTy, Idx, false)); - - return ConstantVector::get(ConstElems); -} - Value *CachingVPExpander::convertEVLToMask(IRBuilder<> &Builder, Value *EVLParam, ElementCount ElemCount) { @@ -245,7 +223,7 @@ Value *CachingVPExpander::convertEVLToMask(IRBuilder<> &Builder, Type *LaneTy = EVLParam->getType(); unsigned NumElems = ElemCount.getFixedValue(); Value *VLSplat = Builder.CreateVectorSplat(NumElems, EVLParam); - Value *IdxVec = createStepVector(Builder, LaneTy, NumElems); + Value *IdxVec = Builder.CreateStepVector(VectorType::get(LaneTy, ElemCount)); return Builder.CreateICmp(CmpInst::ICMP_ULT, IdxVec, VLSplat); } @@ -609,6 +587,15 @@ Value *CachingVPExpander::expandPredication(VPIntrinsic &VPI) { case Intrinsic::vp_umin: case Intrinsic::vp_bswap: case Intrinsic::vp_bitreverse: + case Intrinsic::vp_ctpop: + case Intrinsic::vp_ctlz: + case Intrinsic::vp_cttz: + case Intrinsic::vp_sadd_sat: + case Intrinsic::vp_uadd_sat: + case Intrinsic::vp_ssub_sat: + case Intrinsic::vp_usub_sat: + case Intrinsic::vp_fshl: + case Intrinsic::vp_fshr: return expandPredicationToIntCall(Builder, VPI); case Intrinsic::vp_fabs: case Intrinsic::vp_sqrt: diff --git a/llvm/lib/CodeGen/MachineCSE.cpp b/llvm/lib/CodeGen/MachineCSE.cpp index 0a547050e91a8..728fd2f5f7cd4 100644 --- a/llvm/lib/CodeGen/MachineCSE.cpp +++ 
b/llvm/lib/CodeGen/MachineCSE.cpp @@ -832,12 +832,11 @@ bool MachineCSEImpl::ProcessBlockPRE(MachineDominatorTree *DT, if (!isPRECandidate(&MI, PhysRefs)) continue; - if (!PREMap.count(&MI)) { - PREMap[&MI] = MBB; + auto [It, Inserted] = PREMap.try_emplace(&MI, MBB); + if (Inserted) continue; - } - auto MBB1 = PREMap[&MI]; + auto *MBB1 = It->second; assert( !DT->properlyDominates(MBB, MBB1) && "MBB cannot properly dominate MBB1 while DFS through dominators tree!"); diff --git a/llvm/lib/CodeGen/MachineCopyPropagation.cpp b/llvm/lib/CodeGen/MachineCopyPropagation.cpp index 49ce4b660c3ae..0afd73d8ecdcc 100644 --- a/llvm/lib/CodeGen/MachineCopyPropagation.cpp +++ b/llvm/lib/CodeGen/MachineCopyPropagation.cpp @@ -117,7 +117,32 @@ class CopyTracker { DenseMap Copies; + // Memoised sets of register units which are preserved by each register mask, + // needed to efficiently remove copies which are invalidated by call + // instructions. + DenseMap RegMaskToPreservedRegUnits; + public: + /// Get the set of register units which are preserved by RegMaskOp. + BitVector &getPreservedRegUnits(const MachineOperand &RegMaskOp, + const TargetRegisterInfo &TRI) { + const uint32_t *RegMask = RegMaskOp.getRegMask(); + auto Existing = RegMaskToPreservedRegUnits.find(RegMask); + if (Existing != RegMaskToPreservedRegUnits.end()) { + return Existing->second; + } else { + BitVector &PreservedRegUnits = RegMaskToPreservedRegUnits[RegMask]; + + PreservedRegUnits.resize(TRI.getNumRegUnits()); + for (unsigned SafeReg = 0, E = TRI.getNumRegs(); SafeReg < E; ++SafeReg) + if (!RegMaskOp.clobbersPhysReg(SafeReg)) + for (auto SafeUnit : TRI.regunits(SafeReg)) + PreservedRegUnits.set(SafeUnit); + + return PreservedRegUnits; + } + } + /// Mark all of the given registers and their subregisters as unavailable for /// copying. void markRegsUnavailable(ArrayRef Regs, @@ -164,64 +189,70 @@ class CopyTracker { Copies.erase(Unit); } - /// Clobber a single register, removing it from the tracker's copy maps. 
- void clobberRegister(MCRegister Reg, const TargetRegisterInfo &TRI, - const TargetInstrInfo &TII, bool UseCopyInstr) { - for (MCRegUnit Unit : TRI.regunits(Reg)) { - auto I = Copies.find(Unit); - if (I != Copies.end()) { - // When we clobber the source of a copy, we need to clobber everything - // it defined. - markRegsUnavailable(I->second.DefRegs, TRI); - // When we clobber the destination of a copy, we need to clobber the - // whole register it defined. - if (MachineInstr *MI = I->second.MI) { - std::optional CopyOperands = - isCopyInstr(*MI, TII, UseCopyInstr); - - MCRegister Def = CopyOperands->Destination->getReg().asMCReg(); - MCRegister Src = CopyOperands->Source->getReg().asMCReg(); - - markRegsUnavailable(Def, TRI); - - // Since we clobber the destination of a copy, the semantic of Src's - // "DefRegs" to contain Def is no longer effectual. We will also need - // to remove the record from the copy maps that indicates Src defined - // Def. Failing to do so might cause the target to miss some - // opportunities to further eliminate redundant copy instructions. - // Consider the following sequence during the - // ForwardCopyPropagateBlock procedure: - // L1: r0 = COPY r9 <- TrackMI - // L2: r0 = COPY r8 <- TrackMI (Remove r9 defined r0 from tracker) - // L3: use r0 <- Remove L2 from MaybeDeadCopies - // L4: early-clobber r9 <- Clobber r9 (L2 is still valid in tracker) - // L5: r0 = COPY r8 <- Remove NopCopy - for (MCRegUnit SrcUnit : TRI.regunits(Src)) { - auto SrcCopy = Copies.find(SrcUnit); - if (SrcCopy != Copies.end() && SrcCopy->second.LastSeenUseInCopy) { - // If SrcCopy defines multiple values, we only need - // to erase the record for Def in DefRegs. - for (auto itr = SrcCopy->second.DefRegs.begin(); - itr != SrcCopy->second.DefRegs.end(); itr++) { - if (*itr == Def) { - SrcCopy->second.DefRegs.erase(itr); - // If DefReg becomes empty after removal, we can remove the - // SrcCopy from the tracker's copy maps. 
We only remove those - // entries solely record the Def is defined by Src. If an - // entry also contains the definition record of other Def' - // registers, it cannot be cleared. - if (SrcCopy->second.DefRegs.empty() && !SrcCopy->second.MI) { - Copies.erase(SrcCopy); - } - break; + /// Clobber a single register unit, removing it from the tracker's copy maps. + void clobberRegUnit(MCRegUnit Unit, const TargetRegisterInfo &TRI, + const TargetInstrInfo &TII, bool UseCopyInstr) { + auto I = Copies.find(Unit); + if (I != Copies.end()) { + // When we clobber the source of a copy, we need to clobber everything + // it defined. + markRegsUnavailable(I->second.DefRegs, TRI); + // When we clobber the destination of a copy, we need to clobber the + // whole register it defined. + if (MachineInstr *MI = I->second.MI) { + std::optional CopyOperands = + isCopyInstr(*MI, TII, UseCopyInstr); + + MCRegister Def = CopyOperands->Destination->getReg().asMCReg(); + MCRegister Src = CopyOperands->Source->getReg().asMCReg(); + + markRegsUnavailable(Def, TRI); + + // Since we clobber the destination of a copy, the semantic of Src's + // "DefRegs" to contain Def is no longer effectual. We will also need + // to remove the record from the copy maps that indicates Src defined + // Def. Failing to do so might cause the target to miss some + // opportunities to further eliminate redundant copy instructions. 
+ // Consider the following sequence during the + // ForwardCopyPropagateBlock procedure: + // L1: r0 = COPY r9 <- TrackMI + // L2: r0 = COPY r8 <- TrackMI (Remove r9 defined r0 from tracker) + // L3: use r0 <- Remove L2 from MaybeDeadCopies + // L4: early-clobber r9 <- Clobber r9 (L2 is still valid in tracker) + // L5: r0 = COPY r8 <- Remove NopCopy + for (MCRegUnit SrcUnit : TRI.regunits(Src)) { + auto SrcCopy = Copies.find(SrcUnit); + if (SrcCopy != Copies.end() && SrcCopy->second.LastSeenUseInCopy) { + // If SrcCopy defines multiple values, we only need + // to erase the record for Def in DefRegs. + for (auto itr = SrcCopy->second.DefRegs.begin(); + itr != SrcCopy->second.DefRegs.end(); itr++) { + if (*itr == Def) { + SrcCopy->second.DefRegs.erase(itr); + // If DefReg becomes empty after removal, we can remove the + // SrcCopy from the tracker's copy maps. We only remove those + // entries that solely record that Def is defined by Src. If an + // entry also contains the definition record of other Def' + // registers, it cannot be cleared. + if (SrcCopy->second.DefRegs.empty() && !SrcCopy->second.MI) { + Copies.erase(SrcCopy); } + break; } } } } } - // Now we can erase the copy. - Copies.erase(I); } + // Now we can erase the copy. + Copies.erase(I); + } + } + + /// Clobber a single register, removing it from the tracker's copy maps. + void clobberRegister(MCRegister Reg, const TargetRegisterInfo &TRI, + const TargetInstrInfo &TII, bool UseCopyInstr) { + for (MCRegUnit Unit : TRI.regunits(Reg)) { + clobberRegUnit(Unit, TRI, TII, UseCopyInstr); } } @@ -960,6 +991,9 @@ void MachineCopyPropagation::ForwardCopyPropagateBlock(MachineBasicBlock &MBB) { // a large set of registers. Treat clobbered registers the same way as // defined registers. if (RegMask) { + BitVector &PreservedRegUnits = + Tracker.getPreservedRegUnits(*RegMask, *TRI); + + // Erase any MaybeDeadCopies whose destination register is clobbered. 
for (SmallSetVector::iterator DI = MaybeDeadCopies.begin(); @@ -978,9 +1012,11 @@ void MachineCopyPropagation::ForwardCopyPropagateBlock(MachineBasicBlock &MBB) { LLVM_DEBUG(dbgs() << "MCP: Removing copy due to regmask clobbering: "; MaybeDead->dump()); - // Make sure we invalidate any entries in the copy maps before erasing - // the instruction. - Tracker.clobberRegister(Reg, *TRI, *TII, UseCopyInstr); + // Invalidate all entries in the copy map which are not preserved by + // this register mask. + for (unsigned RegUnit : TRI->regunits(Reg)) + if (!PreservedRegUnits.test(RegUnit)) + Tracker.clobberRegUnit(RegUnit, *TRI, *TII, UseCopyInstr); // erase() will return the next valid iterator pointing to the next // element after the erased one. diff --git a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp index 6805e0cb23ace..49e5b7d9ef014 100644 --- a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp +++ b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp @@ -15770,7 +15770,7 @@ SDValue DAGCombiner::foldBitcastedFPLogic(SDNode *N, SelectionDAG &DAG, // FIXME: I don't think looking for bitcast intrinsically makes sense, but // removing this would require more changes. auto IsBitCastOrFree = [&TLI, FPOpcode](SDValue Op, EVT VT) { - if (Op.getOpcode() == ISD::BITCAST && Op.getOperand(0).getValueType() == VT) + if (sd_match(Op, m_BitCast(m_SpecificVT(VT)))) return true; return FPOpcode == ISD::FABS ? TLI.isFAbsFree(VT) : TLI.isFNegFree(VT); @@ -23807,6 +23807,13 @@ SDValue DAGCombiner::reduceBuildVecToShuffle(SDNode *N) { SmallVector VecIn; VecIn.push_back(SDValue()); + // If we have a single extract_element with a constant index, track the index + // value. + unsigned OneConstExtractIndex = ~0u; + + // Count the number of extract_vector_elt sources (i.e. 
non-constant or undef) + unsigned NumExtracts = 0; + for (unsigned i = 0; i != NumElems; ++i) { SDValue Op = N->getOperand(i); @@ -23824,16 +23831,18 @@ SDValue DAGCombiner::reduceBuildVecToShuffle(SDNode *N) { // Not an undef or zero. If the input is something other than an // EXTRACT_VECTOR_ELT with an in-range constant index, bail out. - if (Op.getOpcode() != ISD::EXTRACT_VECTOR_ELT || - !isa(Op.getOperand(1))) + if (Op.getOpcode() != ISD::EXTRACT_VECTOR_ELT) return SDValue(); - SDValue ExtractedFromVec = Op.getOperand(0); + SDValue ExtractedFromVec = Op.getOperand(0); if (ExtractedFromVec.getValueType().isScalableVector()) return SDValue(); + auto *ExtractIdx = dyn_cast(Op.getOperand(1)); + if (!ExtractIdx) + return SDValue(); - const APInt &ExtractIdx = Op.getConstantOperandAPInt(1); - if (ExtractIdx.uge(ExtractedFromVec.getValueType().getVectorNumElements())) + if (ExtractIdx->getAsAPIntVal().uge( + ExtractedFromVec.getValueType().getVectorNumElements())) return SDValue(); // All inputs must have the same element type as the output. @@ -23841,6 +23850,9 @@ SDValue DAGCombiner::reduceBuildVecToShuffle(SDNode *N) { ExtractedFromVec.getValueType().getVectorElementType()) return SDValue(); + OneConstExtractIndex = ExtractIdx->getZExtValue(); + ++NumExtracts; + // Have we seen this input vector before? // The vectors are expected to be tiny (usually 1 or 2 elements), so using // a map back from SDValues to numbers isn't worth it. @@ -23863,6 +23875,20 @@ SDValue DAGCombiner::reduceBuildVecToShuffle(SDNode *N) { // VecIn accordingly. bool DidSplitVec = false; if (VecIn.size() == 2) { + // If we only found a single constant indexed extract_vector_elt feeding the + // build_vector, do not produce a more complicated shuffle if the extract is + // cheap with other constant/undef elements. Skip broadcast patterns with + // multiple uses in the build_vector. 
+ + // TODO: This should be more aggressive about skipping the shuffle + // formation, particularly if VecIn[1].hasOneUse(), and regardless of the + // index. + if (NumExtracts == 1 && + TLI.isOperationLegalOrCustom(ISD::EXTRACT_VECTOR_ELT, VT) && + TLI.isTypeLegal(VT.getVectorElementType()) && + TLI.isExtractVecEltCheap(VT, OneConstExtractIndex)) + return SDValue(); + unsigned MaxIndex = 0; unsigned NearestPow2 = 0; SDValue Vec = VecIn.back(); @@ -27526,23 +27552,27 @@ static SDValue scalarizeBinOpOfSplats(SDNode *N, SelectionDAG &DAG, if ((Opcode == ISD::MULHS || Opcode == ISD::MULHU) && !TLI.isTypeLegal(EltVT)) return SDValue(); + if (N0.getOpcode() == ISD::BUILD_VECTOR && N0.getOpcode() == N1.getOpcode()) { + // All but one element should have an undef input, which will fold to a + // constant or undef. Avoid splatting which would over-define potentially + // undefined elements. + + // bo (build_vec ..undef, X, undef...), (build_vec ..undef, Y, undef...) --> + // build_vec ..undef, (bo X, Y), undef... + SmallVector EltsX, EltsY, EltsResult; + DAG.ExtractVectorElements(Src0, EltsX); + DAG.ExtractVectorElements(Src1, EltsY); + + for (auto [X, Y] : zip(EltsX, EltsY)) + EltsResult.push_back(DAG.getNode(Opcode, DL, EltVT, X, Y, N->getFlags())); + return DAG.getBuildVector(VT, DL, EltsResult); + } + SDValue IndexC = DAG.getVectorIdxConstant(Index0, DL); SDValue X = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, EltVT, Src0, IndexC); SDValue Y = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, EltVT, Src1, IndexC); SDValue ScalarBO = DAG.getNode(Opcode, DL, EltVT, X, Y, N->getFlags()); - // If all lanes but 1 are undefined, no need to splat the scalar result. - // TODO: Keep track of undefs and use that info in the general case. 
- if (N0.getOpcode() == ISD::BUILD_VECTOR && N0.getOpcode() == N1.getOpcode() && - count_if(N0->ops(), [](SDValue V) { return !V.isUndef(); }) == 1 && - count_if(N1->ops(), [](SDValue V) { return !V.isUndef(); }) == 1) { - // bo (build_vec ..undef, X, undef...), (build_vec ..undef, Y, undef...) --> - // build_vec ..undef, (bo X, Y), undef... - SmallVector Ops(VT.getVectorNumElements(), DAG.getUNDEF(EltVT)); - Ops[Index0] = ScalarBO; - return DAG.getBuildVector(VT, DL, Ops); - } - // bo (splat X, Index), (splat Y, Index) --> splat (bo X, Y), Index return DAG.getSplat(VT, DL, ScalarBO); } diff --git a/llvm/lib/ExecutionEngine/JITLink/ELF_x86_64.cpp b/llvm/lib/ExecutionEngine/JITLink/ELF_x86_64.cpp index bccda8e90a1fb..2c8790273f8b2 100644 --- a/llvm/lib/ExecutionEngine/JITLink/ELF_x86_64.cpp +++ b/llvm/lib/ExecutionEngine/JITLink/ELF_x86_64.cpp @@ -88,8 +88,8 @@ const uint8_t TLSInfoTableManager_ELF_x86_64::TLSInfoEntryContent[16] = { Error buildTables_ELF_x86_64(LinkGraph &G) { LLVM_DEBUG(dbgs() << "Visiting edges in graph:\n"); - x86_64::GOTTableManager GOT; - x86_64::PLTTableManager PLT(GOT); + x86_64::GOTTableManager GOT(G); + x86_64::PLTTableManager PLT(G, GOT); TLSInfoTableManager_ELF_x86_64 TLSInfo; visitExistingEdges(G, GOT, PLT, TLSInfo); return Error::success(); diff --git a/llvm/lib/ExecutionEngine/JITLink/MachO_x86_64.cpp b/llvm/lib/ExecutionEngine/JITLink/MachO_x86_64.cpp index 113b1953e36a6..9547266dc9789 100644 --- a/llvm/lib/ExecutionEngine/JITLink/MachO_x86_64.cpp +++ b/llvm/lib/ExecutionEngine/JITLink/MachO_x86_64.cpp @@ -459,8 +459,8 @@ class MachOLinkGraphBuilder_x86_64 : public MachOLinkGraphBuilder { }; Error buildGOTAndStubs_MachO_x86_64(LinkGraph &G) { - x86_64::GOTTableManager GOT; - x86_64::PLTTableManager PLT(GOT); + x86_64::GOTTableManager GOT(G); + x86_64::PLTTableManager PLT(G, GOT); visitExistingEdges(G, GOT, PLT); return Error::success(); } diff --git a/llvm/lib/ExecutionEngine/JITLink/x86_64.cpp 
b/llvm/lib/ExecutionEngine/JITLink/x86_64.cpp index a84e0001f115a..6ac991651f082 100644 --- a/llvm/lib/ExecutionEngine/JITLink/x86_64.cpp +++ b/llvm/lib/ExecutionEngine/JITLink/x86_64.cpp @@ -89,6 +89,26 @@ const char ReentryTrampolineContent[5] = { static_cast(0xe8), 0x00, 0x00, 0x00, 0x00 }; +void GOTTableManager::registerExistingEntries() { + for (auto *EntrySym : GOTSection->symbols()) { + assert(EntrySym->getBlock().edges_size() == 1 && + "GOT block edge count != 1"); + registerPreExistingEntry(EntrySym->getBlock().edges().begin()->getTarget(), + *EntrySym); + } +} + +void PLTTableManager::registerExistingEntries() { + for (auto *EntrySym : StubsSection->symbols()) { + assert(EntrySym->getBlock().edges_size() == 1 && + "PLT block edge count != 1"); + auto &GOTSym = EntrySym->getBlock().edges().begin()->getTarget(); + assert(GOTSym.getBlock().edges_size() == 1 && "GOT block edge count != 1"); + registerPreExistingEntry(GOTSym.getBlock().edges().begin()->getTarget(), + *EntrySym); + } +} + Error optimizeGOTAndStubAccesses(LinkGraph &G) { LLVM_DEBUG(dbgs() << "Optimizing GOT entries and stubs:\n"); diff --git a/llvm/lib/ExecutionEngine/RuntimeDyld/RTDyldMemoryManager.cpp b/llvm/lib/ExecutionEngine/RuntimeDyld/RTDyldMemoryManager.cpp index 71036f33cf929..10160ddbf826e 100644 --- a/llvm/lib/ExecutionEngine/RuntimeDyld/RTDyldMemoryManager.cpp +++ b/llvm/lib/ExecutionEngine/RuntimeDyld/RTDyldMemoryManager.cpp @@ -20,11 +20,9 @@ #ifdef __linux__ // These includes used by RTDyldMemoryManager::getPointerToNamedFunction() // for Glibc trickery. See comments in this function for more information. 
- #ifdef HAVE_SYS_STAT_H - #include - #endif - #include - #include +#include +#include +#include #endif namespace llvm { diff --git a/llvm/lib/FileCheck/FileCheck.cpp b/llvm/lib/FileCheck/FileCheck.cpp index b6c28385ebb09..a6df9672f8100 100644 --- a/llvm/lib/FileCheck/FileCheck.cpp +++ b/llvm/lib/FileCheck/FileCheck.cpp @@ -1766,8 +1766,7 @@ void FileCheckPatternContext::createLineVariable() { } FileCheck::FileCheck(FileCheckRequest Req) - : Req(Req), PatternContext(std::make_unique()), - CheckStrings(std::make_unique>()) {} + : Req(Req), PatternContext(std::make_unique()) {} FileCheck::~FileCheck() = default; @@ -1916,7 +1915,7 @@ bool FileCheck::readCheckFile( // Verify that CHECK-NEXT/SAME/EMPTY lines have at least one CHECK line before them. if ((CheckTy == Check::CheckNext || CheckTy == Check::CheckSame || CheckTy == Check::CheckEmpty) && - CheckStrings->empty()) { + CheckStrings.empty()) { StringRef Type = CheckTy == Check::CheckNext ? "NEXT" : CheckTy == Check::CheckEmpty ? "EMPTY" : "SAME"; @@ -1934,8 +1933,8 @@ bool FileCheck::readCheckFile( } // Okay, add the string we captured to the output vector and move on. - CheckStrings->emplace_back(P, UsedPrefix, PatternLoc); - std::swap(DagNotMatches, CheckStrings->back().DagNotStrings); + CheckStrings.emplace_back(P, UsedPrefix, PatternLoc); + std::swap(DagNotMatches, CheckStrings.back().DagNotStrings); DagNotMatches = ImplicitNegativeChecks; } @@ -1962,10 +1961,10 @@ bool FileCheck::readCheckFile( // Add an EOF pattern for any trailing --implicit-check-not/CHECK-DAG/-NOTs, // and use the first prefix as a filler for the error message. 
if (!DagNotMatches.empty()) { - CheckStrings->emplace_back( + CheckStrings.emplace_back( Pattern(Check::CheckEOF, PatternContext.get(), LineNumber + 1), *Req.CheckPrefixes.begin(), SMLoc::getFromPointer(Buffer.data())); - std::swap(DagNotMatches, CheckStrings->back().DagNotStrings); + std::swap(DagNotMatches, CheckStrings.back().DagNotStrings); } return false; @@ -2676,13 +2675,13 @@ bool FileCheck::checkInput(SourceMgr &SM, StringRef Buffer, std::vector *Diags) { bool ChecksFailed = false; - unsigned i = 0, j = 0, e = CheckStrings->size(); + unsigned i = 0, j = 0, e = CheckStrings.size(); while (true) { StringRef CheckRegion; if (j == e) { CheckRegion = Buffer; } else { - const FileCheckString &CheckLabelStr = (*CheckStrings)[j]; + const FileCheckString &CheckLabelStr = CheckStrings[j]; if (CheckLabelStr.Pat.getCheckTy() != Check::CheckLabel) { ++j; continue; @@ -2708,7 +2707,7 @@ bool FileCheck::checkInput(SourceMgr &SM, StringRef Buffer, PatternContext->clearLocalVars(); for (; i != j; ++i) { - const FileCheckString &CheckStr = (*CheckStrings)[i]; + const FileCheckString &CheckStr = CheckStrings[i]; // Check each string within the scanned region, including a second check // of any final CHECK-LABEL (to verify CHECK-NOT and CHECK-DAG) diff --git a/llvm/lib/Frontend/OpenMP/OMPIRBuilder.cpp b/llvm/lib/Frontend/OpenMP/OMPIRBuilder.cpp index c6603635d5e28..7dbf65fbf055b 100644 --- a/llvm/lib/Frontend/OpenMP/OMPIRBuilder.cpp +++ b/llvm/lib/Frontend/OpenMP/OMPIRBuilder.cpp @@ -264,6 +264,33 @@ computeOpenMPScheduleType(ScheduleKind ClauseKind, bool HasChunks, return Result; } +/// Emit an implicit cast to convert \p XRead to type of variable \p V +static llvm::Value *emitImplicitCast(IRBuilder<> &Builder, llvm::Value *XRead, + llvm::Value *V) { + // TODO: Add this functionality to the `AtomicInfo` interface + llvm::Type *XReadType = XRead->getType(); + llvm::Type *VType = V->getType(); + if (llvm::AllocaInst *vAlloca = dyn_cast(V)) + VType = 
vAlloca->getAllocatedType(); + + if (XReadType->isStructTy() && VType->isStructTy()) + // No need to extract or convert. A direct + // `store` will suffice. + return XRead; + + if (XReadType->isStructTy()) + XRead = Builder.CreateExtractValue(XRead, /*Idxs=*/0); + if (VType->isIntegerTy() && XReadType->isFloatingPointTy()) + XRead = Builder.CreateFPToSI(XRead, VType); + else if (VType->isFloatingPointTy() && XReadType->isIntegerTy()) + XRead = Builder.CreateSIToFP(XRead, VType); + else if (VType->isIntegerTy() && XReadType->isIntegerTy()) + XRead = Builder.CreateIntCast(XRead, VType, true); + else if (VType->isFloatingPointTy() && XReadType->isFloatingPointTy()) + XRead = Builder.CreateFPCast(XRead, VType); + return XRead; +} + /// Make \p Source branch to \p Target. /// /// Handles two situations: @@ -1850,7 +1877,8 @@ static Value *emitTaskDependencies( OpenMPIRBuilder::InsertPointOrErrorTy OpenMPIRBuilder::createTask( const LocationDescription &Loc, InsertPointTy AllocaIP, BodyGenCallbackTy BodyGenCB, bool Tied, Value *Final, Value *IfCondition, - SmallVector Dependencies, bool Mergeable, Value *EventHandle) { + SmallVector Dependencies, bool Mergeable, Value *EventHandle, + Value *Priority) { if (!updateToLocation(Loc)) return InsertPointTy(); @@ -1896,7 +1924,7 @@ OpenMPIRBuilder::InsertPointOrErrorTy OpenMPIRBuilder::createTask( Builder, AllocaIP, ToBeDeleted, TaskAllocaIP, "global.tid", false)); OI.PostOutlineCB = [this, Ident, Tied, Final, IfCondition, Dependencies, - Mergeable, EventHandle, TaskAllocaBB, + Mergeable, Priority, EventHandle, TaskAllocaBB, ToBeDeleted](Function &OutlinedFn) mutable { // Replace the Stale CI by appropriate RTL function call. assert(OutlinedFn.getNumUses() == 1 && @@ -1924,6 +1952,8 @@ OpenMPIRBuilder::InsertPointOrErrorTy OpenMPIRBuilder::createTask( // Task is not final iff (Flags & 2) == 0. // Task is mergeable iff (Flags & 4) == 4. // Task is not mergeable iff (Flags & 4) == 0. + // Task is priority iff (Flags & 32) == 32. 
+ // Task is not priority iff (Flags & 32) == 0. // TODO: Handle the other flags. Value *Flags = Builder.getInt32(Tied); if (Final) { @@ -1934,6 +1964,8 @@ OpenMPIRBuilder::InsertPointOrErrorTy OpenMPIRBuilder::createTask( if (Mergeable) Flags = Builder.CreateOr(Builder.getInt32(4), Flags); + if (Priority) + Flags = Builder.CreateOr(Builder.getInt32(32), Flags); // Argument - `sizeof_kmp_task_t` (TaskSize) // Tasksize refers to the size in bytes of kmp_task_t data structure @@ -1990,6 +2022,33 @@ OpenMPIRBuilder::InsertPointOrErrorTy OpenMPIRBuilder::createTask( SharedsSize); } + if (Priority) { + // + // The return type of "__kmpc_omp_task_alloc" is "kmp_task_t *", + // we populate the priority information into the "kmp_task_t" here + // + // The struct "kmp_task_t" definition is available in kmp.h + // kmp_task_t = { shareds, routine, part_id, data1, data2 } + // data2 is used for priority + // + Type *Int32Ty = Builder.getInt32Ty(); + Constant *Zero = ConstantInt::get(Int32Ty, 0); + // kmp_task_t* => { ptr } + Type *TaskPtr = StructType::get(VoidPtr); + Value *TaskGEP = + Builder.CreateInBoundsGEP(TaskPtr, TaskData, {Zero, Zero}); + // kmp_task_t => { ptr, ptr, i32, ptr, ptr } + Type *TaskStructType = StructType::get( + VoidPtr, VoidPtr, Builder.getInt32Ty(), VoidPtr, VoidPtr); + Value *PriorityData = Builder.CreateInBoundsGEP( + TaskStructType, TaskGEP, {Zero, ConstantInt::get(Int32Ty, 4)}); + // kmp_cmplrdata_t => { ptr, ptr } + Type *CmplrStructType = StructType::get(VoidPtr, VoidPtr); + Value *CmplrData = Builder.CreateInBoundsGEP(CmplrStructType, + PriorityData, {Zero, Zero}); + Builder.CreateStore(Priority, CmplrData); + } + Value *DepArray = nullptr; if (Dependencies.size()) { InsertPointTy OldIP = Builder.saveIP(); @@ -8469,6 +8528,8 @@ OpenMPIRBuilder::createAtomicRead(const LocationDescription &Loc, } } checkAndEmitFlushAfterAtomic(Loc, AO, AtomicKind::Read); + if (XRead->getType() != V.Var->getType()) + XRead = emitImplicitCast(Builder, XRead, V.Var); 
Builder.CreateStore(XRead, V.Var, V.IsVolatile); return Builder.saveIP(); } @@ -8753,6 +8814,8 @@ OpenMPIRBuilder::InsertPointOrErrorTy OpenMPIRBuilder::createAtomicCapture( return AtomicResult.takeError(); Value *CapturedVal = (IsPostfixUpdate ? AtomicResult->first : AtomicResult->second); + if (CapturedVal->getType() != V.Var->getType()) + CapturedVal = emitImplicitCast(Builder, CapturedVal, V.Var); Builder.CreateStore(CapturedVal, V.Var, V.IsVolatile); checkAndEmitFlushAfterAtomic(Loc, AO, AtomicKind::Capture); diff --git a/llvm/lib/IR/DIBuilder.cpp b/llvm/lib/IR/DIBuilder.cpp index b240a2a39de36..d9bd4f11e89a3 100644 --- a/llvm/lib/IR/DIBuilder.cpp +++ b/llvm/lib/IR/DIBuilder.cpp @@ -644,11 +644,15 @@ DIType *DIBuilder::createArtificialType(DIType *Ty) { return createTypeWithFlags(Ty, DINode::FlagArtificial); } -DIType *DIBuilder::createObjectPointerType(DIType *Ty) { +DIType *DIBuilder::createObjectPointerType(DIType *Ty, bool Implicit) { // FIXME: Restrict this to the nodes where it's valid. 
if (Ty->isObjectPointer()) return Ty; - DINode::DIFlags Flags = DINode::FlagObjectPointer | DINode::FlagArtificial; + DINode::DIFlags Flags = DINode::FlagObjectPointer; + + if (Implicit) + Flags |= DINode::FlagArtificial; + return createTypeWithFlags(Ty, Flags); } diff --git a/llvm/lib/IR/DebugInfo.cpp b/llvm/lib/IR/DebugInfo.cpp index e5b45e0082a82..4ce518009bd3e 100644 --- a/llvm/lib/IR/DebugInfo.cpp +++ b/llvm/lib/IR/DebugInfo.cpp @@ -1432,10 +1432,11 @@ LLVMDIBuilderCreateObjCProperty(LLVMDIBuilderRef Builder, PropertyAttributes, unwrapDI(Ty))); } -LLVMMetadataRef -LLVMDIBuilderCreateObjectPointerType(LLVMDIBuilderRef Builder, - LLVMMetadataRef Type) { - return wrap(unwrap(Builder)->createObjectPointerType(unwrapDI(Type))); +LLVMMetadataRef LLVMDIBuilderCreateObjectPointerType(LLVMDIBuilderRef Builder, + LLVMMetadataRef Type, + LLVMBool Implicit) { + return wrap(unwrap(Builder)->createObjectPointerType(unwrapDI(Type), + Implicit)); } LLVMMetadataRef diff --git a/llvm/lib/IR/Instruction.cpp b/llvm/lib/IR/Instruction.cpp index 147cd84125c8d..9eaae62a6390b 100644 --- a/llvm/lib/IR/Instruction.cpp +++ b/llvm/lib/IR/Instruction.cpp @@ -458,9 +458,8 @@ void Instruction::dropPoisonGeneratingFlags() { } bool Instruction::hasPoisonGeneratingMetadata() const { - return hasMetadata(LLVMContext::MD_range) || - hasMetadata(LLVMContext::MD_nonnull) || - hasMetadata(LLVMContext::MD_align); + return any_of(Metadata::PoisonGeneratingIDs, + [this](unsigned ID) { return hasMetadata(ID); }); } bool Instruction::hasNonDebugLocLoopMetadata() const { @@ -487,9 +486,8 @@ bool Instruction::hasNonDebugLocLoopMetadata() const { } void Instruction::dropPoisonGeneratingMetadata() { - eraseMetadata(LLVMContext::MD_range); - eraseMetadata(LLVMContext::MD_nonnull); - eraseMetadata(LLVMContext::MD_align); + for (unsigned ID : Metadata::PoisonGeneratingIDs) + eraseMetadata(ID); } bool Instruction::hasPoisonGeneratingReturnAttributes() const { diff --git a/llvm/lib/IR/PrintPasses.cpp 
b/llvm/lib/IR/PrintPasses.cpp index e2ef20bb81ba7..610411a3cf978 100644 --- a/llvm/lib/IR/PrintPasses.cpp +++ b/llvm/lib/IR/PrintPasses.cpp @@ -88,6 +88,12 @@ static cl::opt "always print a module IR"), cl::init(false), cl::Hidden); +static cl::opt LoopPrintFuncScope( + "print-loop-func-scope", + cl::desc("When printing IR for print-[before|after]{-all} " + "for a loop pass, always print function IR"), + cl::init(false), cl::Hidden); + // See the description for -print-changed for an explanation of the use // of this option. static cl::list FilterPasses( @@ -141,6 +147,8 @@ std::vector llvm::printAfterPasses() { bool llvm::forcePrintModuleIR() { return PrintModuleScope; } +bool llvm::forcePrintFuncIR() { return LoopPrintFuncScope; } + bool llvm::isPassInPrintList(StringRef PassName) { static std::unordered_set Set(FilterPasses.begin(), FilterPasses.end()); diff --git a/llvm/lib/IR/Value.cpp b/llvm/lib/IR/Value.cpp index b2ee75811fbb7..eddb67282fca4 100644 --- a/llvm/lib/IR/Value.cpp +++ b/llvm/lib/IR/Value.cpp @@ -36,7 +36,7 @@ using namespace llvm; -static cl::opt UseDerefAtPointSemantics( +cl::opt UseDerefAtPointSemantics( "use-dereferenceable-at-point-semantics", cl::Hidden, cl::init(false), cl::desc("Deref attributes and metadata infer facts at definition only")); diff --git a/llvm/lib/MC/MCTargetOptionsCommandFlags.cpp b/llvm/lib/MC/MCTargetOptionsCommandFlags.cpp index abaf0f0246183..2adc29172f9dd 100644 --- a/llvm/lib/MC/MCTargetOptionsCommandFlags.cpp +++ b/llvm/lib/MC/MCTargetOptionsCommandFlags.cpp @@ -156,7 +156,7 @@ llvm::mc::RegisterMCTargetOptionsFlags::RegisterMCTargetOptionsFlags() { MCBINDOPT(X86Sse2Avx); static cl::opt ABIName( - "target-abi", cl::Hidden, + "target-abi", cl::desc("The name of the ABI to be targeted from the backend."), cl::init("")); MCBINDOPT(ABIName); diff --git a/llvm/lib/MC/WasmObjectWriter.cpp b/llvm/lib/MC/WasmObjectWriter.cpp index 29a8c53d350a4..8ddbe929e68b9 100644 --- a/llvm/lib/MC/WasmObjectWriter.cpp +++ 
b/llvm/lib/MC/WasmObjectWriter.cpp @@ -1019,7 +1019,7 @@ void WasmObjectWriter::writeElemSection( encodeSLEB128(InitialTableOffset, W->OS); W->OS << char(wasm::WASM_OPCODE_END); - if (Flags & wasm::WASM_ELEM_SEGMENT_MASK_HAS_ELEM_KIND) { + if (Flags & wasm::WASM_ELEM_SEGMENT_MASK_HAS_ELEM_DESC) { // We only write active function table initializers, for which the elem kind // is specified to be written as 0x00 and interpreted to mean "funcref". const uint8_t ElemKind = 0; diff --git a/llvm/lib/MC/WinCOFFObjectWriter.cpp b/llvm/lib/MC/WinCOFFObjectWriter.cpp index 39e02d0522bcf..da0c0661117b2 100644 --- a/llvm/lib/MC/WinCOFFObjectWriter.cpp +++ b/llvm/lib/MC/WinCOFFObjectWriter.cpp @@ -1193,6 +1193,11 @@ void WinCOFFObjectWriter::recordRelocation(MCAssembler &Asm, } uint64_t WinCOFFObjectWriter::writeObject(MCAssembler &Asm) { + // If the assember had an error, then layout will not have completed, so we + // cannot write an object file. + if (Asm.getContext().hadError()) + return 0; + uint64_t TotalSize = ObjWriter->writeObject(Asm); if (DwoWriter) TotalSize += DwoWriter->writeObject(Asm); diff --git a/llvm/lib/Object/WasmObjectFile.cpp b/llvm/lib/Object/WasmObjectFile.cpp index 2c9b878a4cde9..0f6fd5612f9d8 100644 --- a/llvm/lib/Object/WasmObjectFile.cpp +++ b/llvm/lib/Object/WasmObjectFile.cpp @@ -1440,15 +1440,20 @@ Error WasmObjectFile::parseExportSection(ReadContext &Ctx) { Info.Flags = 0; switch (Ex.Kind) { case wasm::WASM_EXTERNAL_FUNCTION: { - if (!isDefinedFunctionIndex(Ex.Index)) + if (!isValidFunctionIndex(Ex.Index)) return make_error("invalid function export", object_error::parse_failed); - getDefinedFunction(Ex.Index).ExportName = Ex.Name; Info.Kind = wasm::WASM_SYMBOL_TYPE_FUNCTION; Info.ElementIndex = Ex.Index; - unsigned FuncIndex = Info.ElementIndex - NumImportedFunctions; - wasm::WasmFunction &Function = Functions[FuncIndex]; - Signature = &Signatures[Function.SigIndex]; + if (isDefinedFunctionIndex(Ex.Index)) { + 
getDefinedFunction(Ex.Index).ExportName = Ex.Name; + unsigned FuncIndex = Info.ElementIndex - NumImportedFunctions; + wasm::WasmFunction &Function = Functions[FuncIndex]; + Signature = &Signatures[Function.SigIndex]; + } + // Else the function is imported. LLVM object files don't use this + // pattern and we still treat this as an undefined symbol, but we want to + // parse it without crashing. break; } case wasm::WASM_EXTERNAL_GLOBAL: { @@ -1645,17 +1650,25 @@ Error WasmObjectFile::parseElemSection(ReadContext &Ctx) { return make_error( "Unsupported flags for element segment", object_error::parse_failed); - bool IsPassive = (Segment.Flags & wasm::WASM_ELEM_SEGMENT_IS_PASSIVE) != 0; - bool IsDeclarative = - IsPassive && (Segment.Flags & wasm::WASM_ELEM_SEGMENT_IS_DECLARATIVE); + wasm::ElemSegmentMode Mode; + if ((Segment.Flags & wasm::WASM_ELEM_SEGMENT_IS_PASSIVE) == 0) { + Mode = wasm::ElemSegmentMode::Active; + } else if (Segment.Flags & wasm::WASM_ELEM_SEGMENT_IS_DECLARATIVE) { + Mode = wasm::ElemSegmentMode::Declarative; + } else { + Mode = wasm::ElemSegmentMode::Passive; + } bool HasTableNumber = - !IsPassive && + Mode == wasm::ElemSegmentMode::Active && (Segment.Flags & wasm::WASM_ELEM_SEGMENT_HAS_TABLE_NUMBER); + bool HasElemKind = + (Segment.Flags & wasm::WASM_ELEM_SEGMENT_MASK_HAS_ELEM_DESC) && + !(Segment.Flags & wasm::WASM_ELEM_SEGMENT_HAS_INIT_EXPRS); + bool HasElemType = + (Segment.Flags & wasm::WASM_ELEM_SEGMENT_MASK_HAS_ELEM_DESC) && + (Segment.Flags & wasm::WASM_ELEM_SEGMENT_HAS_INIT_EXPRS); bool HasInitExprs = (Segment.Flags & wasm::WASM_ELEM_SEGMENT_HAS_INIT_EXPRS); - bool HasElemKind = - (Segment.Flags & wasm::WASM_ELEM_SEGMENT_MASK_HAS_ELEM_KIND) && - !HasInitExprs; if (HasTableNumber) Segment.TableNumber = readVaruint32(Ctx); @@ -1666,7 +1679,7 @@ Error WasmObjectFile::parseElemSection(ReadContext &Ctx) { return make_error("invalid TableNumber", object_error::parse_failed); - if (IsPassive || IsDeclarative) { + if (Mode != 
wasm::ElemSegmentMode::Active) { Segment.Offset.Extended = false; Segment.Offset.Inst.Opcode = wasm::WASM_OPCODE_I32_CONST; Segment.Offset.Inst.Value.Int32 = 0; @@ -1692,7 +1705,7 @@ Error WasmObjectFile::parseElemSection(ReadContext &Ctx) { object_error::parse_failed); Segment.ElemKind = wasm::ValType::FUNCREF; } - } else if (HasInitExprs) { + } else if (HasElemType) { auto ElemType = parseValType(Ctx, readVaruint32(Ctx)); Segment.ElemKind = ElemType; } else { diff --git a/llvm/lib/ObjectYAML/WasmEmitter.cpp b/llvm/lib/ObjectYAML/WasmEmitter.cpp index 817d364694b43..bd016764f5862 100644 --- a/llvm/lib/ObjectYAML/WasmEmitter.cpp +++ b/llvm/lib/ObjectYAML/WasmEmitter.cpp @@ -497,7 +497,7 @@ void WasmWriter::writeSectionContent(raw_ostream &OS, writeInitExpr(OS, Segment.Offset); - if (Segment.Flags & wasm::WASM_ELEM_SEGMENT_MASK_HAS_ELEM_KIND) { + if (Segment.Flags & wasm::WASM_ELEM_SEGMENT_MASK_HAS_ELEM_DESC) { // We only support active function table initializers, for which the elem // kind is specified to be written as 0x00 and interpreted to mean // "funcref". diff --git a/llvm/lib/ObjectYAML/WasmYAML.cpp b/llvm/lib/ObjectYAML/WasmYAML.cpp index 0636e19e05353..6af66ba62be18 100644 --- a/llvm/lib/ObjectYAML/WasmYAML.cpp +++ b/llvm/lib/ObjectYAML/WasmYAML.cpp @@ -381,7 +381,7 @@ void MappingTraits::mapping( Segment.Flags & wasm::WASM_ELEM_SEGMENT_HAS_TABLE_NUMBER) IO.mapOptional("TableNumber", Segment.TableNumber); if (!IO.outputting() || - Segment.Flags & wasm::WASM_ELEM_SEGMENT_MASK_HAS_ELEM_KIND) + Segment.Flags & wasm::WASM_ELEM_SEGMENT_MASK_HAS_ELEM_DESC) IO.mapOptional("ElemKind", Segment.ElemKind); // TODO: Omit "offset" for passive segments? It's neither meaningful nor // encoded. 
diff --git a/llvm/lib/ProfileData/Coverage/CoverageMapping.cpp b/llvm/lib/ProfileData/Coverage/CoverageMapping.cpp index 6d6678e9e4afe..c39585681911a 100644 --- a/llvm/lib/ProfileData/Coverage/CoverageMapping.cpp +++ b/llvm/lib/ProfileData/Coverage/CoverageMapping.cpp @@ -618,7 +618,7 @@ unsigned CounterMappingContext::getMaxCounterID(const Counter &C) const { void FunctionRecordIterator::skipOtherFiles() { while (Current != Records.end() && !Filename.empty() && Filename != Current->Filenames[0]) - ++Current; + advanceOne(); if (Current == Records.end()) *this = FunctionRecordIterator(); } diff --git a/llvm/lib/ProfileData/PGOCtxProfReader.cpp b/llvm/lib/ProfileData/PGOCtxProfReader.cpp index eb89d7c2f6d1d..e1363cfafdfd4 100644 --- a/llvm/lib/ProfileData/PGOCtxProfReader.cpp +++ b/llvm/lib/ProfileData/PGOCtxProfReader.cpp @@ -17,6 +17,10 @@ #include "llvm/ProfileData/InstrProf.h" #include "llvm/ProfileData/PGOCtxProfWriter.h" #include "llvm/Support/Error.h" +#include "llvm/Support/ErrorHandling.h" +#include "llvm/Support/YAMLTraits.h" +#include +#include using namespace llvm; @@ -176,3 +180,86 @@ PGOCtxProfileReader::loadContexts() { } return std::move(Ret); } + +namespace { +// We want to pass `const` values PGOCtxProfContext references to the yaml +// converter, and the regular yaml mapping APIs are designed to handle both +// serialization and deserialization, which prevents using const for +// serialization. Using an intermediate datastructure is overkill, both +// space-wise and design complexity-wise. Instead, we use the lower-level APIs. 
+void toYaml(yaml::Output &Out, const PGOCtxProfContext &Ctx); + +void toYaml(yaml::Output &Out, + const PGOCtxProfContext::CallTargetMapTy &CallTargets) { + Out.beginSequence(); + size_t Index = 0; + void *SaveData = nullptr; + for (const auto &[_, Ctx] : CallTargets) { + Out.preflightElement(Index++, SaveData); + toYaml(Out, Ctx); + Out.postflightElement(nullptr); + } + Out.endSequence(); +} + +void toYaml(yaml::Output &Out, + const PGOCtxProfContext::CallsiteMapTy &Callsites) { + auto AllCS = ::llvm::make_first_range(Callsites); + auto MaxIt = ::llvm::max_element(AllCS); + assert(MaxIt != AllCS.end() && "We should have a max value because the " + "callsites collection is not empty."); + void *SaveData = nullptr; + Out.beginSequence(); + for (auto I = 0U; I <= *MaxIt; ++I) { + Out.preflightElement(I, SaveData); + auto It = Callsites.find(I); + if (It == Callsites.end()) { + // This will produce a `[ ]` sequence, which is what we want here. + Out.beginFlowSequence(); + Out.endFlowSequence(); + } else { + toYaml(Out, It->second); + } + Out.postflightElement(nullptr); + } + Out.endSequence(); +} + +void toYaml(yaml::Output &Out, const PGOCtxProfContext &Ctx) { + yaml::EmptyContext Empty; + Out.beginMapping(); + void *SaveInfo = nullptr; + bool UseDefault = false; + { + Out.preflightKey("Guid", /*Required=*/true, /*SameAsDefault=*/false, + UseDefault, SaveInfo); + auto Guid = Ctx.guid(); + yaml::yamlize(Out, Guid, true, Empty); + Out.postflightKey(nullptr); + } + { + Out.preflightKey("Counters", true, false, UseDefault, SaveInfo); + Out.beginFlowSequence(); + for (size_t I = 0U, E = Ctx.counters().size(); I < E; ++I) { + Out.preflightFlowElement(I, SaveInfo); + uint64_t V = Ctx.counters()[I]; + yaml::yamlize(Out, V, true, Empty); + Out.postflightFlowElement(SaveInfo); + } + Out.endFlowSequence(); + Out.postflightKey(nullptr); + } + if (!Ctx.callsites().empty()) { + Out.preflightKey("Callsites", true, false, UseDefault, SaveInfo); + toYaml(Out, Ctx.callsites()); + 
Out.postflightKey(nullptr); + } + Out.endMapping(); +} +} // namespace + +void llvm::convertCtxProfToYaml( + raw_ostream &OS, const PGOCtxProfContext::CallTargetMapTy &Profiles) { + yaml::Output Out(OS); + toYaml(Out, Profiles); +} \ No newline at end of file diff --git a/llvm/lib/ProfileData/PGOCtxProfWriter.cpp b/llvm/lib/ProfileData/PGOCtxProfWriter.cpp index d22aadd6bd7eb..3d3da84817489 100644 --- a/llvm/lib/ProfileData/PGOCtxProfWriter.cpp +++ b/llvm/lib/ProfileData/PGOCtxProfWriter.cpp @@ -14,7 +14,6 @@ #include "llvm/Bitstream/BitCodeEnums.h" #include "llvm/ProfileData/CtxInstrContextNode.h" #include "llvm/Support/Error.h" -#include "llvm/Support/JSON.h" #include "llvm/Support/MemoryBuffer.h" #include "llvm/Support/YAMLTraits.h" #include "llvm/Support/raw_ostream.h" @@ -89,6 +88,15 @@ void PGOCtxProfileWriter::write(const ContextNode &RootNode) { } namespace { + +/// Representation of the context node suitable for yaml serialization / +/// deserialization. +struct SerializableCtxRepresentation { + ctx_profile::GUID Guid = 0; + std::vector Counters; + std::vector> Callsites; +}; + ctx_profile::ContextNode * createNode(std::vector> &Nodes, const std::vector &DCList); diff --git a/llvm/lib/SandboxIR/Instruction.cpp b/llvm/lib/SandboxIR/Instruction.cpp index 0a7cd95124bb5..cc961418600e3 100644 --- a/llvm/lib/SandboxIR/Instruction.cpp +++ b/llvm/lib/SandboxIR/Instruction.cpp @@ -926,21 +926,26 @@ void PHINode::removeIncomingValueIf(function_ref Predicate) { } } -CmpInst *CmpInst::create(Predicate P, Value *S1, Value *S2, InsertPosition Pos, - Context &Ctx, const Twine &Name) { +Value *CmpInst::create(Predicate P, Value *S1, Value *S2, InsertPosition Pos, + Context &Ctx, const Twine &Name) { auto &Builder = setInsertPos(Pos); - auto *LLVMI = Builder.CreateCmp(P, S1->Val, S2->Val, Name); - if (dyn_cast(LLVMI)) - return Ctx.createICmpInst(cast(LLVMI)); - return Ctx.createFCmpInst(cast(LLVMI)); -} -CmpInst *CmpInst::createWithCopiedFlags(Predicate P, Value *S1, Value 
*S2, - const Instruction *F, - InsertPosition Pos, Context &Ctx, - const Twine &Name) { - CmpInst *Inst = create(P, S1, S2, Pos, Ctx, Name); - cast(Inst->Val)->copyIRFlags(F->Val); - return Inst; + auto *LLVMV = Builder.CreateCmp(P, S1->Val, S2->Val, Name); + // It may have been folded into a constant. + if (auto *LLVMC = dyn_cast(LLVMV)) + return Ctx.getOrCreateConstant(LLVMC); + if (isa(LLVMV)) + return Ctx.createICmpInst(cast(LLVMV)); + return Ctx.createFCmpInst(cast(LLVMV)); +} + +Value *CmpInst::createWithCopiedFlags(Predicate P, Value *S1, Value *S2, + const Instruction *F, InsertPosition Pos, + Context &Ctx, const Twine &Name) { + Value *V = create(P, S1, S2, Pos, Ctx, Name); + if (auto *C = dyn_cast(V)) + return C; + cast(V->Val)->copyIRFlags(F->Val); + return V; } Type *CmpInst::makeCmpResultType(Type *OpndType) { diff --git a/llvm/lib/Support/APInt.cpp b/llvm/lib/Support/APInt.cpp index ea8295f95c751..4e45416b4598f 100644 --- a/llvm/lib/Support/APInt.cpp +++ b/llvm/lib/Support/APInt.cpp @@ -3108,3 +3108,21 @@ APInt APIntOps::mulhu(const APInt &C1, const APInt &C2) { APInt C2Ext = C2.zext(FullWidth); return (C1Ext * C2Ext).extractBits(C1.getBitWidth(), C1.getBitWidth()); } + +APInt APIntOps::pow(const APInt &X, int64_t N) { + assert(N >= 0 && "negative exponents not supported."); + APInt Acc = APInt(X.getBitWidth(), 1); + if (N == 0) + return Acc; + APInt Base = X; + int64_t RemainingExponent = N; + while (RemainingExponent > 0) { + while (RemainingExponent % 2 == 0) { + Base *= Base; + RemainingExponent /= 2; + } + --RemainingExponent; + Acc *= Base; + } + return Acc; +} diff --git a/llvm/lib/Support/Errno.cpp b/llvm/lib/Support/Errno.cpp index 60a7e536b6c5c..0ef8d1ef1c99a 100644 --- a/llvm/lib/Support/Errno.cpp +++ b/llvm/lib/Support/Errno.cpp @@ -13,10 +13,7 @@ #include "llvm/Support/Errno.h" #include "llvm/Config/config.h" #include - -#if HAVE_ERRNO_H #include -#endif //===----------------------------------------------------------------------===// 
//=== WARNING: Implementation here must contain only TRULY operating system @@ -26,11 +23,9 @@ namespace llvm { namespace sys { -#if HAVE_ERRNO_H std::string StrError() { return StrError(errno); } -#endif // HAVE_ERRNO_H std::string StrError(int errnum) { std::string str; diff --git a/llvm/lib/Support/ErrorHandling.cpp b/llvm/lib/Support/ErrorHandling.cpp index 8659f9492d5a3..afe3b37cc3431 100644 --- a/llvm/lib/Support/ErrorHandling.cpp +++ b/llvm/lib/Support/ErrorHandling.cpp @@ -33,7 +33,7 @@ #if defined(HAVE_UNISTD_H) # include #endif -#if defined(_MSC_VER) +#if defined(_WIN32) # include # include #endif diff --git a/llvm/lib/Support/Unix/Path.inc b/llvm/lib/Support/Unix/Path.inc index 44097bad7b46e..280f290e906c2 100644 --- a/llvm/lib/Support/Unix/Path.inc +++ b/llvm/lib/Support/Unix/Path.inc @@ -18,12 +18,8 @@ #include "Unix.h" #include #include -#if HAVE_SYS_STAT_H #include -#endif -#if HAVE_FCNTL_H #include -#endif #ifdef HAVE_UNISTD_H #include #endif diff --git a/llvm/lib/Support/Unix/Process.inc b/llvm/lib/Support/Unix/Process.inc index 3c07cba7122bf..550b0de2e0455 100644 --- a/llvm/lib/Support/Unix/Process.inc +++ b/llvm/lib/Support/Unix/Process.inc @@ -17,21 +17,11 @@ #include "llvm/Config/llvm-config.h" // for LLVM_ENABLE_THREADS #include #include -#if HAVE_FCNTL_H #include -#endif -#ifdef HAVE_SYS_TIME_H #include -#endif -#ifdef HAVE_SYS_RESOURCE_H #include -#endif -#ifdef HAVE_SYS_STAT_H #include -#endif -#if HAVE_SIGNAL_H #include -#endif #if defined(HAVE_MALLINFO) || defined(HAVE_MALLINFO2) #include #endif @@ -41,12 +31,6 @@ #ifdef HAVE_MALLOC_MALLOC_H #include #endif -#ifdef HAVE_SYS_IOCTL_H -#include -#endif -#ifdef HAVE_TERMIOS_H -#include -#endif //===----------------------------------------------------------------------===// //=== WARNING: Implementation here must contain only generic UNIX code that @@ -142,7 +126,6 @@ void Process::GetTimeUsage(TimePoint<> &elapsed, // their operation. 
To prevent the disk from filling up, this function // does what's necessary to prevent their generation. void Process::PreventCoreFiles() { -#if HAVE_SETRLIMIT struct rlimit rlim; getrlimit(RLIMIT_CORE, &rlim); #ifdef __linux__ @@ -165,7 +148,6 @@ void Process::PreventCoreFiles() { rlim.rlim_cur = 0; #endif setrlimit(RLIMIT_CORE, &rlim); -#endif #if defined(HAVE_MACH_MACH_H) && !defined(__GNU__) // Disable crash reporting on Mac OS X 10.0-10.4 diff --git a/llvm/lib/Support/Unix/Program.inc b/llvm/lib/Support/Unix/Program.inc index ec0fad7076b45..0708df1eed0a3 100644 --- a/llvm/lib/Support/Unix/Program.inc +++ b/llvm/lib/Support/Unix/Program.inc @@ -28,18 +28,10 @@ #include "llvm/Support/StringSaver.h" #include "llvm/Support/SystemZ/zOSSupport.h" #include "llvm/Support/raw_ostream.h" -#if HAVE_SYS_STAT_H #include -#endif -#if HAVE_SYS_RESOURCE_H #include -#endif -#if HAVE_SIGNAL_H #include -#endif -#if HAVE_FCNTL_H #include -#endif #if HAVE_UNISTD_H #include #endif @@ -146,7 +138,6 @@ static bool RedirectIO_PS(const std::string *Path, int FD, std::string *ErrMsg, static void TimeOutHandler(int Sig) {} static void SetMemoryLimits(unsigned size) { -#if HAVE_SYS_RESOURCE_H && HAVE_GETRLIMIT && HAVE_SETRLIMIT struct rlimit r; __typeof__(r.rlim_cur) limit = (__typeof__(r.rlim_cur))(size)*1048576; @@ -160,7 +151,6 @@ static void SetMemoryLimits(unsigned size) { r.rlim_cur = limit; setrlimit(RLIMIT_RSS, &r); #endif -#endif } static std::vector diff --git a/llvm/lib/Support/Unix/Signals.inc b/llvm/lib/Support/Unix/Signals.inc index 50d6248ba6af8..b2f68d25221a2 100644 --- a/llvm/lib/Support/Unix/Signals.inc +++ b/llvm/lib/Support/Unix/Signals.inc @@ -50,12 +50,8 @@ #ifdef HAVE_BACKTRACE #include BACKTRACE_HEADER // For backtrace(). 
#endif -#if HAVE_SIGNAL_H #include -#endif -#if HAVE_SYS_STAT_H #include -#endif #if HAVE_DLFCN_H #include #endif diff --git a/llvm/lib/Support/Unix/Unix.h b/llvm/lib/Support/Unix/Unix.h index 1599241a344af..f16c7fcda22c3 100644 --- a/llvm/lib/Support/Unix/Unix.h +++ b/llvm/lib/Support/Unix/Unix.h @@ -36,18 +36,14 @@ #include #endif -#ifdef HAVE_SYS_TIME_H -# include -#endif +#include #include #ifdef HAVE_DLFCN_H # include #endif -#ifdef HAVE_FCNTL_H # include -#endif /// This function builds an error message into \p ErrMsg using the \p prefix /// string and the Unix error number given by \p errnum. If errnum is -1, the diff --git a/llvm/lib/Support/raw_ostream.cpp b/llvm/lib/Support/raw_ostream.cpp index 5d30c797ebf5b..e75ddc66b7d16 100644 --- a/llvm/lib/Support/raw_ostream.cpp +++ b/llvm/lib/Support/raw_ostream.cpp @@ -30,9 +30,7 @@ #include // may provide O_BINARY. -#if defined(HAVE_FCNTL_H) # include -#endif #if defined(HAVE_UNISTD_H) # include diff --git a/llvm/lib/TableGen/Record.cpp b/llvm/lib/TableGen/Record.cpp index 597ccb7ca144b..b76d7bcc95a56 100644 --- a/llvm/lib/TableGen/Record.cpp +++ b/llvm/lib/TableGen/Record.cpp @@ -671,7 +671,7 @@ const StringInit *StringInit::get(RecordKeeper &RK, StringRef V, detail::RecordKeeperImpl &RKImpl = RK.getImpl(); auto &InitMap = Fmt == SF_String ? 
RKImpl.StringInitStringPool : RKImpl.StringInitCodePool; - auto &Entry = *InitMap.insert(std::make_pair(V, nullptr)).first; + auto &Entry = *InitMap.try_emplace(V, nullptr).first; if (!Entry.second) Entry.second = new (RKImpl.Allocator) StringInit(RK, Entry.getKey(), Fmt); return Entry.second; @@ -1674,7 +1674,7 @@ static const Init *ForeachDagApply(const Init *LHS, const DagInit *MHSd, else NewArg = ItemApply(LHS, Arg, RHS, CurRec); - NewArgs.push_back(std::make_pair(NewArg, ArgName)); + NewArgs.emplace_back(NewArg, ArgName); if (Arg != NewArg) Change = true; } @@ -2260,7 +2260,7 @@ const VarInit *VarInit::get(StringRef VN, const RecTy *T) { const VarInit *VarInit::get(const Init *VN, const RecTy *T) { detail::RecordKeeperImpl &RK = T->getRecordKeeper().getImpl(); - VarInit *&I = RK.TheVarInitPool[std::make_pair(T, VN)]; + VarInit *&I = RK.TheVarInitPool[{T, VN}]; if (!I) I = new (RK.Allocator) VarInit(VN, T); return I; @@ -2285,7 +2285,7 @@ const Init *VarInit::resolveReferences(Resolver &R) const { const VarBitInit *VarBitInit::get(const TypedInit *T, unsigned B) { detail::RecordKeeperImpl &RK = T->getRecordKeeper().getImpl(); - VarBitInit *&I = RK.TheVarBitInitPool[std::make_pair(T, B)]; + VarBitInit *&I = RK.TheVarBitInitPool[{T, B}]; if (!I) I = new (RK.Allocator) VarBitInit(T, B); return I; @@ -2461,7 +2461,7 @@ std::string VarDefInit::getAsString() const { const FieldInit *FieldInit::get(const Init *R, const StringInit *FN) { detail::RecordKeeperImpl &RK = R->getRecordKeeper().getImpl(); - FieldInit *&I = RK.TheFieldInitPool[std::make_pair(R, FN)]; + FieldInit *&I = RK.TheFieldInitPool[{R, FN}]; if (!I) I = new (RK.Allocator) FieldInit(R, FN); return I; diff --git a/llvm/lib/TableGen/TGLexer.h b/llvm/lib/TableGen/TGLexer.h index bac583c4e33a1..6680915211205 100644 --- a/llvm/lib/TableGen/TGLexer.h +++ b/llvm/lib/TableGen/TGLexer.h @@ -234,7 +234,7 @@ class TGLexer { std::pair getCurBinaryIntVal() const { assert(CurCode == tgtok::BinaryIntVal && "This token 
isn't a binary integer"); - return std::make_pair(CurIntVal, (CurPtr - TokStart)-2); + return {CurIntVal, (CurPtr - TokStart) - 2}; } SMLoc getLoc() const; diff --git a/llvm/lib/TableGen/TGParser.cpp b/llvm/lib/TableGen/TGParser.cpp index 60ae11b7f4261..d2115ab7627da 100644 --- a/llvm/lib/TableGen/TGParser.cpp +++ b/llvm/lib/TableGen/TGParser.cpp @@ -3162,7 +3162,7 @@ void TGParser::ParseDagArgList( Lex.Lex(); // eat the VarName. } - Result.push_back(std::make_pair(Val, VarName)); + Result.emplace_back(Val, VarName); } if (!consume(tgtok::comma)) break; @@ -4152,9 +4152,8 @@ bool TGParser::ParseMultiClass() { return TokError("expected identifier after multiclass for name"); std::string Name = Lex.getCurStrVal(); - auto Result = - MultiClasses.insert(std::make_pair(Name, - std::make_unique(Name, Lex.getLoc(),Records))); + auto Result = MultiClasses.try_emplace( + Name, std::make_unique(Name, Lex.getLoc(), Records)); if (!Result.second) return TokError("multiclass '" + Name + "' already defined"); diff --git a/llvm/lib/TableGen/TGParser.h b/llvm/lib/TableGen/TGParser.h index 4509893eefc2c..6094bba84fa55 100644 --- a/llvm/lib/TableGen/TGParser.h +++ b/llvm/lib/TableGen/TGParser.h @@ -131,7 +131,7 @@ class TGVarScope { } void addVar(StringRef Name, const Init *I) { - bool Ins = Vars.insert(std::make_pair(std::string(Name), I)).second; + bool Ins = Vars.try_emplace(std::string(Name), I).second; (void)Ins; assert(Ins && "Local variable already exists"); } diff --git a/llvm/lib/Target/AArch64/AArch64.td b/llvm/lib/Target/AArch64/AArch64.td index 8644264f5fb1c..3677f669c3481 100644 --- a/llvm/lib/Target/AArch64/AArch64.td +++ b/llvm/lib/Target/AArch64/AArch64.td @@ -58,34 +58,34 @@ include "AArch64SystemOperands.td" class AArch64Unsupported { list F; } -let F = [HasSVE2p1, HasSVE2p1_or_HasSME2, HasSVE2p1_or_HasSME2p1] in +let F = [HasSVE2p1, HasSVE2p1_or_SME2, HasSVE2p1_or_SME2p1] in def SVE2p1Unsupported : AArch64Unsupported; def SVE2Unsupported : AArch64Unsupported { - 
let F = !listconcat([HasSVE2, HasSVE2orSME, HasSVE2orSME2, HasSSVE_FP8FMA, HasSMEF8F16, + let F = !listconcat([HasSVE2, HasSVE2_or_SME, HasSVE2_or_SME2, HasSSVE_FP8FMA, HasSMEF8F16, HasSMEF8F32, HasSVEAES, HasSVE2SHA3, HasSVE2SM4, HasSVEBitPerm, HasSVEB16B16], SVE2p1Unsupported.F); } def SVEUnsupported : AArch64Unsupported { - let F = !listconcat([HasSVE, HasSVEorSME], + let F = !listconcat([HasSVE, HasSVE_or_SME], SVE2Unsupported.F); } -let F = [HasSME2p2, HasSVE2p2orSME2p2, HasNonStreamingSVEorSME2p2, - HasNonStreamingSVE2p2orSME2p2, HasNonStreamingSVE2orSSVE_BitPerm, +let F = [HasSME2p2, HasSVE2p2_or_SME2p2, HasNonStreamingSVE_or_SME2p2, + HasNonStreamingSVE2p2_or_SME2p2, HasNonStreamingSVE2_or_SSVE_BitPerm, HasSME_MOP4, HasSME_TMOP] in def SME2p2Unsupported : AArch64Unsupported; def SME2p1Unsupported : AArch64Unsupported { - let F = !listconcat([HasSME2p1, HasSVE2p1_or_HasSME2p1, HasNonStreamingSVE2p1orSSVE_AES], + let F = !listconcat([HasSME2p1, HasSVE2p1_or_SME2p1, HasNonStreamingSVE2p1_or_SSVE_AES], SME2p2Unsupported.F); } def SME2Unsupported : AArch64Unsupported { - let F = !listconcat([HasSME2, HasSVE2orSME2, HasSVE2p1_or_HasSME2, HasSSVE_FP8FMA, - HasSMEF8F16, HasSMEF8F32, HasSMEF16F16orSMEF8F16, HasSMEB16B16], + let F = !listconcat([HasSME2, HasSVE2_or_SME2, HasSVE2p1_or_SME2, HasSSVE_FP8FMA, + HasSMEF8F16, HasSMEF8F32, HasSMEF16F16_or_SMEF8F16, HasSMEB16B16], SME2p1Unsupported.F); } diff --git a/llvm/lib/Target/AArch64/AArch64FrameLowering.cpp b/llvm/lib/Target/AArch64/AArch64FrameLowering.cpp index 206e410047db5..1582d1999ca1d 100644 --- a/llvm/lib/Target/AArch64/AArch64FrameLowering.cpp +++ b/llvm/lib/Target/AArch64/AArch64FrameLowering.cpp @@ -2926,26 +2926,12 @@ struct RegPairInfo { int FrameIdx; int Offset; enum RegType { GPR, FPR64, FPR128, PPR, ZPR, VG } Type; + const TargetRegisterClass *RC; RegPairInfo() = default; bool isPaired() const { return Reg2 != AArch64::NoRegister; } - unsigned getScale() const { - switch (Type) { - case PPR: - return 
2; - case GPR: - case FPR64: - case VG: - return 8; - case ZPR: - case FPR128: - return 16; - } - llvm_unreachable("Unsupported type"); - } - bool isScalable() const { return Type == PPR || Type == ZPR; } }; @@ -3023,20 +3009,27 @@ static void computeCalleeSaveRegisterPairs( RegPairInfo RPI; RPI.Reg1 = CSI[i].getReg(); - if (AArch64::GPR64RegClass.contains(RPI.Reg1)) + if (AArch64::GPR64RegClass.contains(RPI.Reg1)) { RPI.Type = RegPairInfo::GPR; - else if (AArch64::FPR64RegClass.contains(RPI.Reg1)) + RPI.RC = &AArch64::GPR64RegClass; + } else if (AArch64::FPR64RegClass.contains(RPI.Reg1)) { RPI.Type = RegPairInfo::FPR64; - else if (AArch64::FPR128RegClass.contains(RPI.Reg1)) + RPI.RC = &AArch64::FPR64RegClass; + } else if (AArch64::FPR128RegClass.contains(RPI.Reg1)) { RPI.Type = RegPairInfo::FPR128; - else if (AArch64::ZPRRegClass.contains(RPI.Reg1)) + RPI.RC = &AArch64::FPR128RegClass; + } else if (AArch64::ZPRRegClass.contains(RPI.Reg1)) { RPI.Type = RegPairInfo::ZPR; - else if (AArch64::PPRRegClass.contains(RPI.Reg1)) + RPI.RC = &AArch64::ZPRRegClass; + } else if (AArch64::PPRRegClass.contains(RPI.Reg1)) { RPI.Type = RegPairInfo::PPR; - else if (RPI.Reg1 == AArch64::VG) + RPI.RC = &AArch64::PPRRegClass; + } else if (RPI.Reg1 == AArch64::VG) { RPI.Type = RegPairInfo::VG; - else + RPI.RC = &AArch64::FIXED_REGSRegClass; + } else { llvm_unreachable("Unsupported register class."); + } // Add the stack hazard size as we transition from GPR->FPR CSRs. if (AFI->hasStackHazardSlotIndex() && @@ -3045,7 +3038,7 @@ static void computeCalleeSaveRegisterPairs( ByteOffset += StackFillDir * StackHazardSize; LastReg = RPI.Reg1; - int Scale = RPI.getScale(); + int Scale = TRI->getSpillSize(*RPI.RC); // Add the next reg to the pair if it is in the same register class. 
if (unsigned(i + RegInc) < Count && !AFI->hasStackHazardSlotIndex()) { Register NextReg = CSI[i + RegInc].getReg(); @@ -3254,38 +3247,26 @@ bool AArch64FrameLowering::spillCalleeSavedRegisters( // Rationale: This sequence saves uop updates compared to a sequence of // pre-increment spills like stp xi,xj,[sp,#-16]! // Note: Similar rationale and sequence for restores in epilog. - unsigned Size; - Align Alignment; + unsigned Size = TRI->getSpillSize(*RPI.RC); + Align Alignment = TRI->getSpillAlign(*RPI.RC); switch (RPI.Type) { case RegPairInfo::GPR: StrOpc = RPI.isPaired() ? AArch64::STPXi : AArch64::STRXui; - Size = 8; - Alignment = Align(8); break; case RegPairInfo::FPR64: StrOpc = RPI.isPaired() ? AArch64::STPDi : AArch64::STRDui; - Size = 8; - Alignment = Align(8); break; case RegPairInfo::FPR128: StrOpc = RPI.isPaired() ? AArch64::STPQi : AArch64::STRQui; - Size = 16; - Alignment = Align(16); break; case RegPairInfo::ZPR: StrOpc = RPI.isPaired() ? AArch64::ST1B_2Z_IMM : AArch64::STR_ZXI; - Size = 16; - Alignment = Align(16); break; case RegPairInfo::PPR: StrOpc = AArch64::STR_PXI; - Size = 2; - Alignment = Align(2); break; case RegPairInfo::VG: StrOpc = AArch64::STRXui; - Size = 8; - Alignment = Align(8); break; } @@ -3495,33 +3476,23 @@ bool AArch64FrameLowering::restoreCalleeSavedRegisters( // ldp x22, x21, [sp, #0] // addImm(+0) // Note: see comment in spillCalleeSavedRegisters() unsigned LdrOpc; - unsigned Size; - Align Alignment; + unsigned Size = TRI->getSpillSize(*RPI.RC); + Align Alignment = TRI->getSpillAlign(*RPI.RC); switch (RPI.Type) { case RegPairInfo::GPR: LdrOpc = RPI.isPaired() ? AArch64::LDPXi : AArch64::LDRXui; - Size = 8; - Alignment = Align(8); break; case RegPairInfo::FPR64: LdrOpc = RPI.isPaired() ? AArch64::LDPDi : AArch64::LDRDui; - Size = 8; - Alignment = Align(8); break; case RegPairInfo::FPR128: LdrOpc = RPI.isPaired() ? 
AArch64::LDPQi : AArch64::LDRQui; - Size = 16; - Alignment = Align(16); break; case RegPairInfo::ZPR: LdrOpc = RPI.isPaired() ? AArch64::LD1B_2Z_IMM : AArch64::LDR_ZXI; - Size = 16; - Alignment = Align(16); break; case RegPairInfo::PPR: LdrOpc = AArch64::LDR_PXI; - Size = 2; - Alignment = Align(2); break; case RegPairInfo::VG: continue; @@ -3795,14 +3766,15 @@ void AArch64FrameLowering::determineCalleeSaves(MachineFunction &MF, unsigned CSStackSize = 0; unsigned SVECSStackSize = 0; const TargetRegisterInfo *TRI = MF.getSubtarget().getRegisterInfo(); - const MachineRegisterInfo &MRI = MF.getRegInfo(); for (unsigned Reg : SavedRegs.set_bits()) { - auto RegSize = TRI->getRegSizeInBits(Reg, MRI) / 8; + auto *RC = TRI->getMinimalPhysRegClass(Reg); + assert(RC && "expected register class!"); + auto SpillSize = TRI->getSpillSize(*RC); if (AArch64::PPRRegClass.contains(Reg) || AArch64::ZPRRegClass.contains(Reg)) - SVECSStackSize += RegSize; + SVECSStackSize += SpillSize; else - CSStackSize += RegSize; + CSStackSize += SpillSize; } // Increase the callee-saved stack size if the function has streaming mode diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp index d4a114c275fb7..0c096711bf3bd 100644 --- a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp +++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp @@ -1140,8 +1140,6 @@ AArch64TargetLowering::AArch64TargetLowering(const TargetMachine &TM, setTargetDAGCombine(ISD::SCALAR_TO_VECTOR); - setTargetDAGCombine(ISD::SHL); - // In case of strict alignment, avoid an excessive number of byte wide stores. MaxStoresPerMemsetOptSize = 8; MaxStoresPerMemset = @@ -2373,8 +2371,9 @@ bool AArch64TargetLowering::targetShrinkDemandedConstant( return false; unsigned Size = VT.getSizeInBits(); - assert((Size == 32 || Size == 64) && - "i32 or i64 is expected after legalization."); + + if (Size != 32 && Size != 64) + return false; // Exit early if we demand all bits. 
if (DemandedBits.popcount() == Size) @@ -26473,43 +26472,6 @@ performScalarToVectorCombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI, return NVCAST; } -/// If the operand is a bitwise AND with a constant RHS, and the shift has a -/// constant RHS and is the only use, we can pull it out of the shift, i.e. -/// -/// (shl (and X, C1), C2) -> (and (shl X, C2), (shl C1, C2)) -/// -/// We prefer this canonical form to match existing isel patterns. -static SDValue performSHLCombine(SDNode *N, - TargetLowering::DAGCombinerInfo &DCI, - SelectionDAG &DAG) { - if (DCI.isBeforeLegalizeOps()) - return SDValue(); - - SDValue Op0 = N->getOperand(0); - if (Op0.getOpcode() != ISD::AND || !Op0.hasOneUse()) - return SDValue(); - - SDValue C1 = Op0->getOperand(1); - SDValue C2 = N->getOperand(1); - if (!isa(C1) || !isa(C2)) - return SDValue(); - - // Might be folded into shifted op, do not lower. - if (N->hasOneUse()) { - unsigned UseOpc = N->user_begin()->getOpcode(); - if (UseOpc == ISD::ADD || UseOpc == ISD::SUB || UseOpc == ISD::SETCC || - UseOpc == AArch64ISD::ADDS || UseOpc == AArch64ISD::SUBS) - return SDValue(); - } - - SDLoc DL(N); - EVT VT = N->getValueType(0); - SDValue X = Op0->getOperand(0); - SDValue NewRHS = DAG.getNode(ISD::SHL, DL, VT, C1, C2); - SDValue NewShift = DAG.getNode(ISD::SHL, DL, VT, X, C2); - return DAG.getNode(ISD::AND, DL, VT, NewShift, NewRHS); -} - SDValue AArch64TargetLowering::PerformDAGCombine(SDNode *N, DAGCombinerInfo &DCI) const { SelectionDAG &DAG = DCI.DAG; @@ -26855,8 +26817,6 @@ SDValue AArch64TargetLowering::PerformDAGCombine(SDNode *N, return performCTLZCombine(N, DAG, Subtarget); case ISD::SCALAR_TO_VECTOR: return performScalarToVectorCombine(N, DCI, DAG); - case ISD::SHL: - return performSHLCombine(N, DCI, DAG); } return SDValue(); } diff --git a/llvm/lib/Target/AArch64/AArch64InstrInfo.cpp b/llvm/lib/Target/AArch64/AArch64InstrInfo.cpp index fd24e49f948a2..a2fd4963db108 100644 --- a/llvm/lib/Target/AArch64/AArch64InstrInfo.cpp +++ 
b/llvm/lib/Target/AArch64/AArch64InstrInfo.cpp @@ -9742,9 +9742,11 @@ AArch64InstrInfo::isCopyInstrImpl(const MachineInstr &MI) const { // AArch64::ORRWrs and AArch64::ORRXrs with WZR/XZR reg // and zero immediate operands used as an alias for mov instruction. - if (MI.getOpcode() == AArch64::ORRWrs && - MI.getOperand(1).getReg() == AArch64::WZR && - MI.getOperand(3).getImm() == 0x0 && + if (((MI.getOpcode() == AArch64::ORRWrs && + MI.getOperand(1).getReg() == AArch64::WZR && + MI.getOperand(3).getImm() == 0x0) || + (MI.getOpcode() == AArch64::ORRWrr && + MI.getOperand(1).getReg() == AArch64::WZR)) && // Check that the w->w move is not a zero-extending w->x mov. (!MI.getOperand(0).getReg().isVirtual() || MI.getOperand(0).getSubReg() == 0) && @@ -9764,9 +9766,11 @@ AArch64InstrInfo::isCopyInstrImpl(const MachineInstr &MI) const { std::optional AArch64InstrInfo::isCopyLikeInstrImpl(const MachineInstr &MI) const { - if (MI.getOpcode() == AArch64::ORRWrs && - MI.getOperand(1).getReg() == AArch64::WZR && - MI.getOperand(3).getImm() == 0x0) + if ((MI.getOpcode() == AArch64::ORRWrs && + MI.getOperand(1).getReg() == AArch64::WZR && + MI.getOperand(3).getImm() == 0x0) || + (MI.getOpcode() == AArch64::ORRWrr && + MI.getOperand(1).getReg() == AArch64::WZR)) return DestSourcePair{MI.getOperand(0), MI.getOperand(2)}; return std::nullopt; } diff --git a/llvm/lib/Target/AArch64/AArch64InstrInfo.td b/llvm/lib/Target/AArch64/AArch64InstrInfo.td index 9c7dc7784e939..8215f3a4fdae1 100644 --- a/llvm/lib/Target/AArch64/AArch64InstrInfo.td +++ b/llvm/lib/Target/AArch64/AArch64InstrInfo.td @@ -244,53 +244,53 @@ def HasOCCMO : Predicate<"Subtarget->hasOCCMO()">, // A subset of SVE(2) instructions are legal in Streaming SVE execution mode, // they should be enabled if either has been specified. 
-def HasSVEorSME +def HasSVE_or_SME : Predicate<"Subtarget->hasSVE() || (Subtarget->isStreaming() && Subtarget->hasSME())">, AssemblerPredicateWithAll<(any_of FeatureSVE, FeatureSME), "sve or sme">; -def HasNonStreamingSVEorSME2p2 +def HasNonStreamingSVE_or_SME2p2 : Predicate<"(Subtarget->isSVEAvailable() && Subtarget->hasSVE()) ||" "(Subtarget->isSVEorStreamingSVEAvailable() && Subtarget->hasSME2p2())">, AssemblerPredicateWithAll<(any_of FeatureSVE, FeatureSME2p2), "sve or sme2p2">; -def HasSVE2orSME +def HasSVE2_or_SME : Predicate<"Subtarget->hasSVE2() || (Subtarget->isStreaming() && Subtarget->hasSME())">, AssemblerPredicateWithAll<(any_of FeatureSVE2, FeatureSME), "sve2 or sme">; -def HasSVE2orSME2 +def HasSVE2_or_SME2 : Predicate<"Subtarget->hasSVE2() || (Subtarget->isStreaming() && Subtarget->hasSME2())">, AssemblerPredicateWithAll<(any_of FeatureSVE2, FeatureSME2), "sve2 or sme2">; -def HasNonStreamingSVE2orSSVE_AES +def HasNonStreamingSVE2_or_SSVE_AES : Predicate<"(Subtarget->isSVEAvailable() && Subtarget->hasSVE2()) ||" "(Subtarget->isSVEorStreamingSVEAvailable() && Subtarget->hasSSVE_AES())">, AssemblerPredicateWithAll<(any_of FeatureSVE2, FeatureSSVE_AES), "sve2 or ssve-aes">; -def HasSVE2p1_or_HasSME +def HasSVE2p1_or_SME : Predicate<"Subtarget->hasSVE2p1() || (Subtarget->isStreaming() && Subtarget->hasSME())">, AssemblerPredicateWithAll<(any_of FeatureSME, FeatureSVE2p1), "sme or sve2p1">; -def HasSVE2p1_or_HasSME2 +def HasSVE2p1_or_SME2 : Predicate<"Subtarget->hasSVE2p1() || (Subtarget->isStreaming() && Subtarget->hasSME2())">, AssemblerPredicateWithAll<(any_of FeatureSME2, FeatureSVE2p1), "sme2 or sve2p1">; -def HasSVE2p1_or_HasSME2p1 +def HasSVE2p1_or_SME2p1 : Predicate<"Subtarget->hasSVE2p1() || (Subtarget->isStreaming() && Subtarget->hasSME2p1())">, AssemblerPredicateWithAll<(any_of FeatureSME2p1, FeatureSVE2p1), "sme2p1 or sve2p1">; -def HasSVE2p2orSME2p2 +def HasSVE2p2_or_SME2p2 : Predicate<"Subtarget->isSVEorStreamingSVEAvailable() && 
(Subtarget->hasSVE2p2() || Subtarget->hasSME2p2())">, AssemblerPredicateWithAll<(any_of FeatureSME2p2, FeatureSVE2p2), "sme2p2 or sve2p2">; -def HasNonStreamingSVE2p1orSSVE_AES +def HasNonStreamingSVE2p1_or_SSVE_AES : Predicate<"(Subtarget->isSVEAvailable() && Subtarget->hasSVE2p1()) ||" "(Subtarget->isSVEorStreamingSVEAvailable() && Subtarget->hasSSVE_AES())">, AssemblerPredicateWithAll<(any_of FeatureSVE2p1, FeatureSSVE_AES), "sve2p1 or ssve-aes">; -def HasSMEF16F16orSMEF8F16 +def HasSMEF16F16_or_SMEF8F16 : Predicate<"Subtarget->isStreaming() && (Subtarget->hasSMEF16F16() || Subtarget->hasSMEF8F16())">, AssemblerPredicateWithAll<(any_of FeatureSMEF16F16, FeatureSMEF8F16), "sme-f16f16 or sme-f8f16">; -def HasNonStreamingSVE2p2orSME2p2 +def HasNonStreamingSVE2p2_or_SME2p2 : Predicate<"(Subtarget->isSVEAvailable() && Subtarget->hasSVE2p2()) ||" "(Subtarget->isSVEorStreamingSVEAvailable() && Subtarget->hasSME2p2())">, AssemblerPredicateWithAll<(any_of FeatureSVE2p2, FeatureSME2p2), "sme2p2 or sve2p2">; -def HasNonStreamingSVE2orSSVE_BitPerm +def HasNonStreamingSVE2_or_SSVE_BitPerm : Predicate<"(Subtarget->isSVEAvailable() && Subtarget->hasSVE2()) ||" "(Subtarget->isSVEorStreamingSVEAvailable() && Subtarget->hasSSVE_BitPerm())">, AssemblerPredicateWithAll<(any_of FeatureSVE2, FeatureSSVE_BitPerm), "sve2 or ssve-bitperm">; diff --git a/llvm/lib/Target/AArch64/AArch64SMEInstrInfo.td b/llvm/lib/Target/AArch64/AArch64SMEInstrInfo.td index 98b027862383d..d2aa86f388db2 100644 --- a/llvm/lib/Target/AArch64/AArch64SMEInstrInfo.td +++ b/llvm/lib/Target/AArch64/AArch64SMEInstrInfo.td @@ -882,7 +882,7 @@ defm LUTI4_S_2ZTZI : sme2p1_luti4_vector_vg2_index<"luti4">; defm LUTI4_S_4ZTZI : sme2p1_luti4_vector_vg4_index<"luti4">; } -let Predicates = [HasSMEF16F16orSMEF8F16] in { +let Predicates = [HasSMEF16F16_or_SMEF8F16] in { defm FADD_VG2_M2Z_H : sme2_multivec_accum_add_sub_vg2<"fadd", 0b0100, MatrixOp16, ZZ_h_mul_r, nxv8f16, int_aarch64_sme_add_za16_vg1x2>; defm FADD_VG4_M4Z_H : 
sme2_multivec_accum_add_sub_vg4<"fadd", 0b0100, MatrixOp16, ZZZZ_h_mul_r, nxv8f16, int_aarch64_sme_add_za16_vg1x4>; defm FSUB_VG2_M2Z_H : sme2_multivec_accum_add_sub_vg2<"fsub", 0b0101, MatrixOp16, ZZ_h_mul_r, nxv8f16, int_aarch64_sme_sub_za16_vg1x2>; diff --git a/llvm/lib/Target/AArch64/AArch64SVEInstrInfo.td b/llvm/lib/Target/AArch64/AArch64SVEInstrInfo.td index 22715c61126d1..27c88a55919e6 100644 --- a/llvm/lib/Target/AArch64/AArch64SVEInstrInfo.td +++ b/llvm/lib/Target/AArch64/AArch64SVEInstrInfo.td @@ -563,7 +563,7 @@ let Predicates = [HasSVE] in { def WRFFR : sve_int_wrffr<"wrffr", int_aarch64_sve_wrffr>; } // End HasSVE -let Predicates = [HasSVEorSME] in { +let Predicates = [HasSVE_or_SME] in { defm ADD_ZZZ : sve_int_bin_cons_arit_0<0b000, "add", add>; defm SUB_ZZZ : sve_int_bin_cons_arit_0<0b001, "sub", sub>; defm SQADD_ZZZ : sve_int_bin_cons_arit_0<0b100, "sqadd", saddsat>; @@ -584,9 +584,9 @@ let Predicates = [HasSVEorSME] in { defm EOR_ZPmZ : sve_int_bin_pred_log<0b001, "eor", "EOR_ZPZZ", AArch64eor_m1, DestructiveBinaryComm>; defm AND_ZPmZ : sve_int_bin_pred_log<0b010, "and", "AND_ZPZZ", AArch64and_m1, DestructiveBinaryComm>; defm BIC_ZPmZ : sve_int_bin_pred_log<0b011, "bic", "BIC_ZPZZ", int_aarch64_sve_bic, DestructiveBinary>; -} // End HasSVEorSME +} // End HasSVE_or_SME -let Predicates = [HasSVEorSME, UseExperimentalZeroingPseudos] in { +let Predicates = [HasSVE_or_SME, UseExperimentalZeroingPseudos] in { defm ADD_ZPZZ : sve_int_bin_pred_zeroing_bhsd; defm SUB_ZPZZ : sve_int_bin_pred_zeroing_bhsd; defm SUBR_ZPZZ : sve_int_bin_pred_zeroing_bhsd; @@ -595,9 +595,9 @@ let Predicates = [HasSVEorSME, UseExperimentalZeroingPseudos] in { defm EOR_ZPZZ : sve_int_bin_pred_zeroing_bhsd; defm AND_ZPZZ : sve_int_bin_pred_zeroing_bhsd; defm BIC_ZPZZ : sve_int_bin_pred_zeroing_bhsd; -} // End HasSVEorSME, UseExperimentalZeroingPseudos +} // End HasSVE_or_SME, UseExperimentalZeroingPseudos -let Predicates = [HasSVEorSME] in { +let Predicates = [HasSVE_or_SME] in { 
defm ADD_ZI : sve_int_arith_imm0<0b000, "add", add>; defm SUB_ZI : sve_int_arith_imm0<0b001, "sub", sub>; defm SUBR_ZI : sve_int_arith_imm0<0b011, "subr", AArch64subr>; @@ -764,9 +764,9 @@ let Predicates = [HasSVEorSME] in { defm FABD_ZPZZ : sve_fp_bin_pred_hfd; defm FMULX_ZPZZ : sve_fp_bin_pred_hfd; defm FDIV_ZPZZ : sve_fp_bin_pred_hfd; -} // End HasSVEorSME +} // End HasSVE_or_SME -let Predicates = [HasSVEorSME, UseExperimentalZeroingPseudos] in { +let Predicates = [HasSVE_or_SME, UseExperimentalZeroingPseudos] in { defm FADD_ZPZZ : sve_fp_2op_p_zds_zeroing_hsd; defm FSUB_ZPZZ : sve_fp_2op_p_zds_zeroing_hsd; defm FMUL_ZPZZ : sve_fp_2op_p_zds_zeroing_hsd; @@ -779,28 +779,28 @@ let Predicates = [HasSVEorSME, UseExperimentalZeroingPseudos] in { defm FMULX_ZPZZ : sve_fp_2op_p_zds_zeroing_hsd; defm FDIVR_ZPZZ : sve_fp_2op_p_zds_zeroing_hsd; defm FDIV_ZPZZ : sve_fp_2op_p_zds_zeroing_hsd; -} // End HasSVEorSME, UseExperimentalZeroingPseudos +} // End HasSVE_or_SME, UseExperimentalZeroingPseudos -let Predicates = [HasSVEorSME] in { +let Predicates = [HasSVE_or_SME] in { defm FADD_ZZZ : sve_fp_3op_u_zd<0b000, "fadd", AArch64fadd>; defm FSUB_ZZZ : sve_fp_3op_u_zd<0b001, "fsub", AArch64fsub>; defm FMUL_ZZZ : sve_fp_3op_u_zd<0b010, "fmul", AArch64fmul>; -} // End HasSVEorSME +} // End HasSVE_or_SME let Predicates = [HasSVE] in { defm FTSMUL_ZZZ : sve_fp_3op_u_zd_ftsmul<0b011, "ftsmul", int_aarch64_sve_ftsmul_x>; } // End HasSVE -let Predicates = [HasSVEorSME] in { +let Predicates = [HasSVE_or_SME] in { defm FRECPS_ZZZ : sve_fp_3op_u_zd<0b110, "frecps", AArch64frecps>; defm FRSQRTS_ZZZ : sve_fp_3op_u_zd<0b111, "frsqrts", AArch64frsqrts>; -} // End HasSVEorSME +} // End HasSVE_or_SME let Predicates = [HasSVE] in { defm FTSSEL_ZZZ : sve_int_bin_cons_misc_0_b<"ftssel", int_aarch64_sve_ftssel_x>; } // End HasSVE -let Predicates = [HasSVEorSME] in { +let Predicates = [HasSVE_or_SME] in { defm FCADD_ZPmZ : sve_fp_fcadd<"fcadd", int_aarch64_sve_fcadd>; defm FCMLA_ZPmZZ : 
sve_fp_fcmla<"fcmla", int_aarch64_sve_fcmla>; @@ -818,26 +818,26 @@ let Predicates = [HasSVEorSME] in { defm FMLS_ZPZZZ : sve_fp_3op_pred_hfd; defm FNMLA_ZPZZZ : sve_fp_3op_pred_hfd; defm FNMLS_ZPZZZ : sve_fp_3op_pred_hfd; -} // End HasSVEorSME +} // End HasSVE_or_SME let Predicates = [HasSVE] in { defm FTMAD_ZZI : sve_fp_ftmad<"ftmad", int_aarch64_sve_ftmad_x>; } // End HasSVE -let Predicates = [HasSVEorSME] in { +let Predicates = [HasSVE_or_SME] in { defm FMLA_ZZZI : sve_fp_fma_by_indexed_elem<0b00, "fmla", int_aarch64_sve_fmla_lane>; defm FMLS_ZZZI : sve_fp_fma_by_indexed_elem<0b01, "fmls", int_aarch64_sve_fmls_lane>; defm FCMLA_ZZZI : sve_fp_fcmla_by_indexed_elem<"fcmla", int_aarch64_sve_fcmla_lane>; defm FMUL_ZZZI : sve_fp_fmul_by_indexed_elem<"fmul", int_aarch64_sve_fmul_lane>; -} // End HasSVEorSME +} // End HasSVE_or_SME let Predicates = [HasSVE] in { // SVE floating point reductions. defm FADDA_VPZ : sve_fp_2op_p_vd<0b000, "fadda", AArch64fadda_p>; } // End HasSVE -let Predicates = [HasSVEorSME] in { +let Predicates = [HasSVE_or_SME] in { defm FADDV_VPZ : sve_fp_fast_red<0b000, "faddv", AArch64faddv_p>; defm FMAXNMV_VPZ : sve_fp_fast_red<0b100, "fmaxnmv", AArch64fmaxnmv_p>; defm FMINNMV_VPZ : sve_fp_fast_red<0b101, "fminnmv", AArch64fminnmv_p>; @@ -937,14 +937,14 @@ let Predicates = [HasSVEorSME] in { defm SEL_ZPZZ : sve_int_sel_vvv<"sel", vselect>; defm SPLICE_ZPZ : sve_int_perm_splice<"splice", AArch64splice>; -} // End HasSVEorSME +} // End HasSVE_or_SME // COMPACT - word and doubleword -let Predicates = [HasNonStreamingSVEorSME2p2] in { +let Predicates = [HasNonStreamingSVE_or_SME2p2] in { defm COMPACT_ZPZ : sve_int_perm_compact_sd<"compact", int_aarch64_sve_compact>; } -let Predicates = [HasSVEorSME] in { +let Predicates = [HasSVE_or_SME] in { defm INSR_ZR : sve_int_perm_insrs<"insr", AArch64insr>; defm INSR_ZV : sve_int_perm_insrv<"insr", AArch64insr>; defm EXT_ZZI : sve_int_perm_extract_i<"ext", AArch64ext>; @@ -973,13 +973,13 @@ let Predicates = 
[HasSVEorSME] in { defm MOVPRFX_ZPzZ : sve_int_movprfx_pred_zero<0b000, "movprfx">; defm MOVPRFX_ZPmZ : sve_int_movprfx_pred_merge<0b001, "movprfx">; def MOVPRFX_ZZ : sve_int_bin_cons_misc_0_c<0b00000001, "movprfx", ZPRAny>; -} // End HasSVEorSME +} // End HasSVE_or_SME -let Predicates = [HasNonStreamingSVEorSME2p2] in { +let Predicates = [HasNonStreamingSVE_or_SME2p2] in { defm FEXPA_ZZ : sve_int_bin_cons_misc_0_c_fexpa<"fexpa", int_aarch64_sve_fexpa_x>; } // End HasSVE -let Predicates = [HasSVEorSME] in { +let Predicates = [HasSVE_or_SME] in { defm BRKPA_PPzPP : sve_int_brkp<0b00, "brkpa", int_aarch64_sve_brkpa_z>; defm BRKPAS_PPzPP : sve_int_brkp<0b10, "brkpas", null_frag>; defm BRKPB_PPzPP : sve_int_brkp<0b01, "brkpb", int_aarch64_sve_brkpb_z>; @@ -1118,7 +1118,7 @@ let Predicates = [HasSVEorSME] in { let Predicates = [HasSVE2p1] in { defm LD1D_Q : sve_mem_128b_cld_ss<0b11, "ld1d", GPR64NoXZRshifted64>; } -} // End HasSVEorSME +} // End HasSVE_or_SME let Predicates = [HasSVE] in { // non-faulting continuous load with reg+immediate @@ -1158,7 +1158,7 @@ let Predicates = [HasSVE] in { defm LDFF1D : sve_mem_cldff_ss<0b1111, "ldff1d", Z_d, ZPR64, GPR64shifted64>; } // End HasSVE -let Predicates = [HasSVEorSME] in { +let Predicates = [HasSVE_or_SME] in { // LD(2|3|4) structured loads with reg+immediate defm LD2B_IMM : sve_mem_eld_si<0b00, 0b001, ZZ_b, "ld2b", simm4s2>; defm LD3B_IMM : sve_mem_eld_si<0b00, 0b010, ZZZ_b, "ld3b", simm4s3>; @@ -1172,7 +1172,7 @@ let Predicates = [HasSVEorSME] in { defm LD2D_IMM : sve_mem_eld_si<0b11, 0b001, ZZ_d, "ld2d", simm4s2>; defm LD3D_IMM : sve_mem_eld_si<0b11, 0b010, ZZZ_d, "ld3d", simm4s3>; defm LD4D_IMM : sve_mem_eld_si<0b11, 0b011, ZZZZ_d, "ld4d", simm4s4>; - let Predicates = [HasSVE2p1_or_HasSME2p1] in { + let Predicates = [HasSVE2p1_or_SME2p1] in { defm LD2Q_IMM : sve_mem_eld_si<0b01, 0b100, ZZ_q, "ld2q", simm4s2>; defm LD3Q_IMM : sve_mem_eld_si<0b10, 0b100, ZZZ_q, "ld3q", simm4s3>; defm LD4Q_IMM : sve_mem_eld_si<0b11, 
0b100, ZZZZ_q, "ld4q", simm4s4>; @@ -1191,12 +1191,12 @@ let Predicates = [HasSVEorSME] in { def LD2D : sve_mem_eld_ss<0b11, 0b101, ZZ_d, "ld2d", GPR64NoXZRshifted64>; def LD3D : sve_mem_eld_ss<0b11, 0b110, ZZZ_d, "ld3d", GPR64NoXZRshifted64>; def LD4D : sve_mem_eld_ss<0b11, 0b111, ZZZZ_d, "ld4d", GPR64NoXZRshifted64>; - let Predicates = [HasSVE2p1_or_HasSME2p1] in { + let Predicates = [HasSVE2p1_or_SME2p1] in { def LD2Q : sve_mem_eld_ss<0b01, 0b001, ZZ_q, "ld2q", GPR64NoXZRshifted128>; def LD3Q : sve_mem_eld_ss<0b10, 0b001, ZZZ_q, "ld3q", GPR64NoXZRshifted128>; def LD4Q : sve_mem_eld_ss<0b11, 0b001, ZZZZ_q, "ld4q", GPR64NoXZRshifted128>; } -} // End HasSVEorSME +} // End HasSVE_or_SME let Predicates = [HasSVE] in { // Gathers using unscaled 32-bit offsets, e.g. @@ -1401,7 +1401,7 @@ let Predicates = [HasSVE] in { defm : sve_masked_gather_x4; } // End HasSVE -let Predicates = [HasSVEorSME] in { +let Predicates = [HasSVE_or_SME] in { // Non-temporal contiguous loads (register + immediate) defm LDNT1B_ZRI : sve_mem_cldnt_si<0b00, "ldnt1b", Z_b, ZPR8>; defm LDNT1H_ZRI : sve_mem_cldnt_si<0b01, "ldnt1h", Z_h, ZPR16>; @@ -1492,7 +1492,7 @@ let Predicates = [HasSVEorSME] in { defm : sve_st1q_pat; defm : sve_st1q_pat; -} // End HasSVEorSME +} // End HasSVE_or_SME let Predicates = [HasSVE] in { // Scatters using unpacked, unscaled 32-bit offsets, e.g. 
@@ -1624,7 +1624,7 @@ let Predicates = [HasSVE] in { defm : sve_masked_scatter_x4; } // End HasSVE -let Predicates = [HasSVEorSME] in { +let Predicates = [HasSVE_or_SME] in { // ST(2|3|4) structured stores (register + immediate) defm ST2B_IMM : sve_mem_est_si<0b00, 0b01, ZZ_b, "st2b", simm4s2>; defm ST3B_IMM : sve_mem_est_si<0b00, 0b10, ZZZ_b, "st3b", simm4s3>; @@ -1638,7 +1638,7 @@ let Predicates = [HasSVEorSME] in { defm ST2D_IMM : sve_mem_est_si<0b11, 0b01, ZZ_d, "st2d", simm4s2>; defm ST3D_IMM : sve_mem_est_si<0b11, 0b10, ZZZ_d, "st3d", simm4s3>; defm ST4D_IMM : sve_mem_est_si<0b11, 0b11, ZZZZ_d, "st4d", simm4s4>; - let Predicates = [HasSVE2p1_or_HasSME2p1] in { + let Predicates = [HasSVE2p1_or_SME2p1] in { defm ST2Q_IMM : sve_mem_128b_est_si<0b01, ZZ_q, "st2q", simm4s2>; defm ST3Q_IMM : sve_mem_128b_est_si<0b10, ZZZ_q, "st3q", simm4s3>; defm ST4Q_IMM : sve_mem_128b_est_si<0b11, ZZZZ_q, "st4q", simm4s4>; @@ -1657,7 +1657,7 @@ let Predicates = [HasSVEorSME] in { def ST2D : sve_mem_est_ss<0b11, 0b01, ZZ_d, "st2d", GPR64NoXZRshifted64>; def ST3D : sve_mem_est_ss<0b11, 0b10, ZZZ_d, "st3d", GPR64NoXZRshifted64>; def ST4D : sve_mem_est_ss<0b11, 0b11, ZZZZ_d, "st4d", GPR64NoXZRshifted64>; - let Predicates = [HasSVE2p1_or_HasSME2p1] in { + let Predicates = [HasSVE2p1_or_SME2p1] in { def ST2Q : sve_mem_128b_est_ss<0b01, ZZ_q, "st2q", GPR64NoXZRshifted128>; def ST3Q : sve_mem_128b_est_ss<0b10, ZZZ_q, "st3q", GPR64NoXZRshifted128>; def ST4Q : sve_mem_128b_est_ss<0b11, ZZZZ_q, "st4q", GPR64NoXZRshifted128>; @@ -1714,7 +1714,7 @@ let Predicates = [HasSVEorSME] in { defm : sve_prefetch; defm : sve_prefetch; defm : sve_prefetch; -} // End HasSVEorSME +} // End HasSVE_or_SME let Predicates = [HasSVE] in { // Gather prefetch using scaled 32-bit offsets, e.g. 
@@ -1820,7 +1820,7 @@ let Predicates = [HasSVE] in { defm : adrXtwShiftPat; } // End HasSVE -let Predicates = [HasSVEorSME] in { +let Predicates = [HasSVE_or_SME] in { defm TBL_ZZZ : sve_int_perm_tbl<"tbl", AArch64tbl>; defm ZIP1_ZZZ : sve_int_perm_bin_perm_zz<0b000, "zip1", AArch64zip1>; @@ -2168,7 +2168,7 @@ let Predicates = [HasSVEorSME] in { defm INCD_XPiI : sve_int_pred_pattern_a<0b110, "incd", add, int_aarch64_sve_cntd>; defm DECD_XPiI : sve_int_pred_pattern_a<0b111, "decd", sub, int_aarch64_sve_cntd>; -let Predicates = [HasSVEorSME] in { +let Predicates = [HasSVE_or_SME] in { defm SQINCB_XPiWdI : sve_int_pred_pattern_b_s32<0b00000, "sqincb", int_aarch64_sve_sqincb_n32>; defm UQINCB_WPiI : sve_int_pred_pattern_b_u32<0b00001, "uqincb", int_aarch64_sve_uqincb_n32>; defm SQDECB_XPiWdI : sve_int_pred_pattern_b_s32<0b00010, "sqdecb", int_aarch64_sve_sqdecb_n32>; @@ -2297,9 +2297,9 @@ let Predicates = [HasSVEorSME] in { defm ASR_ZPZI : sve_int_shift_pred_bhsd; defm LSR_ZPZI : sve_int_shift_pred_bhsd; defm LSL_ZPZI : sve_int_shift_pred_bhsd; -} // End HasSVEorSME +} // End HasSVE_or_SME -let Predicates = [HasSVEorSME, UseExperimentalZeroingPseudos] in { +let Predicates = [HasSVE_or_SME, UseExperimentalZeroingPseudos] in { defm ASR_ZPZZ : sve_int_bin_pred_zeroing_bhsd; defm LSR_ZPZZ : sve_int_bin_pred_zeroing_bhsd; defm LSL_ZPZZ : sve_int_bin_pred_zeroing_bhsd; @@ -2308,9 +2308,9 @@ let Predicates = [HasSVEorSME, UseExperimentalZeroingPseudos] in { defm ASR_ZPZI : sve_int_bin_pred_imm_zeroing_bhsd; defm LSR_ZPZI : sve_int_bin_pred_imm_zeroing_bhsd; defm LSL_ZPZI : sve_int_bin_pred_imm_zeroing_bhsd; -} // End HasSVEorSME, UseExperimentalZeroingPseudos +} // End HasSVE_or_SME, UseExperimentalZeroingPseudos -let Predicates = [HasSVEorSME] in { +let Predicates = [HasSVE_or_SME] in { defm ASR_ZPmZ : sve_int_bin_pred_shift<0b000, "asr", "ASR_ZPZZ", int_aarch64_sve_asr, "ASRR_ZPmZ">; defm LSR_ZPmZ : sve_int_bin_pred_shift<0b001, "lsr", "LSR_ZPZZ", int_aarch64_sve_lsr, 
"LSRR_ZPmZ">; defm LSL_ZPmZ : sve_int_bin_pred_shift<0b011, "lsl", "LSL_ZPZZ", int_aarch64_sve_lsl, "LSLR_ZPmZ">; @@ -2431,18 +2431,18 @@ let Predicates = [HasSVEorSME] in { defm FRINTI_ZPmZ : sve_fp_2op_p_zd_HSD<0b00111, "frinti", AArch64frinti_mt>; defm FRECPX_ZPmZ : sve_fp_2op_p_zd_HSD<0b01100, "frecpx", AArch64frecpx_mt>; defm FSQRT_ZPmZ : sve_fp_2op_p_zd_HSD<0b01101, "fsqrt", AArch64fsqrt_mt>; -} // End HasSVEorSME +} // End HasSVE_or_SME -let Predicates = [HasBF16, HasSVEorSME] in { +let Predicates = [HasBF16, HasSVE_or_SME] in { defm BFDOT_ZZZ : sve_float_dot<0b1, 0b0, ZPR32, ZPR16, "bfdot", nxv8bf16, int_aarch64_sve_bfdot>; defm BFDOT_ZZI : sve_float_dot_indexed<0b1, 0b00, ZPR16, ZPR3b16, "bfdot", nxv8bf16, int_aarch64_sve_bfdot_lane_v2>; -} // End HasBF16, HasSVEorSME +} // End HasBF16, HasSVE_or_SME let Predicates = [HasBF16, HasSVE] in { defm BFMMLA_ZZZ : sve_fp_matrix_mla<0b01, "bfmmla", ZPR32, ZPR16, int_aarch64_sve_bfmmla, nxv4f32, nxv8bf16>; } // End HasBF16, HasSVE -let Predicates = [HasBF16, HasSVEorSME] in { +let Predicates = [HasBF16, HasSVE_or_SME] in { defm BFMLALB_ZZZ : sve2_fp_mla_long<0b100, "bfmlalb", nxv4f32, nxv8bf16, int_aarch64_sve_bfmlalb>; defm BFMLALT_ZZZ : sve2_fp_mla_long<0b101, "bfmlalt", nxv4f32, nxv8bf16, int_aarch64_sve_bfmlalt>; defm BFMLALB_ZZZI : sve2_fp_mla_long_by_indexed_elem<0b100, "bfmlalb", nxv4f32, nxv8bf16, int_aarch64_sve_bfmlalb_lane_v2>; @@ -2450,9 +2450,9 @@ let Predicates = [HasBF16, HasSVEorSME] in { defm BFCVT_ZPmZ : sve_bfloat_convert<"bfcvt", int_aarch64_sve_fcvt_bf16f32_v2, AArch64fcvtr_mt>; defm BFCVTNT_ZPmZ : sve_bfloat_convert_top<"bfcvtnt", int_aarch64_sve_fcvtnt_bf16f32_v2>; -} // End HasBF16, HasSVEorSME +} // End HasBF16, HasSVE_or_SME -let Predicates = [HasSVEorSME] in { +let Predicates = [HasSVE_or_SME] in { // InstAliases def : InstAlias<"mov $Zd, $Zn", (ORR_ZZZ ZPR64:$Zd, ZPR64:$Zn, ZPR64:$Zn), 1>; @@ -2588,7 +2588,7 @@ let Predicates = [HasSVEorSME] in { // LDR1 of 64-bit data defm : LD1RPat; - 
let Predicates = [HasSVEorSME, UseSVEFPLD1R] in { + let Predicates = [HasSVE_or_SME, UseSVEFPLD1R] in { // LD1R of FP data defm : LD1RPat; defm : LD1RPat; @@ -2640,7 +2640,7 @@ let Predicates = [HasSVEorSME] in { } // Add NoUseScalarIncVL to avoid affecting for patterns with UseScalarIncVL - let Predicates = [HasSVEorSME, NoUseScalarIncVL] in { + let Predicates = [HasSVE_or_SME, NoUseScalarIncVL] in { def : Pat<(add GPR64:$op, (vscale (sve_cnth_imm_neg i32:$imm))), (SUBXrs GPR64:$op, (CNTH_XPiI 31, $imm), 0)>; def : Pat<(add GPR64:$op, (vscale (sve_cntw_imm_neg i32:$imm))), @@ -2672,7 +2672,7 @@ let Predicates = [HasSVEorSME] in { (DECD_ZPiI ZPR:$op, 31, $imm)>; } - let Predicates = [HasSVEorSME, UseScalarIncVL], AddedComplexity = 5 in { + let Predicates = [HasSVE_or_SME, UseScalarIncVL], AddedComplexity = 5 in { def : Pat<(add GPR64:$op, (vscale (sve_rdvl_imm i32:$imm))), (ADDVL_XXI GPR64:$op, $imm)>; @@ -3059,7 +3059,7 @@ let Predicates = [HasSVEorSME] in { // 16-element contiguous loads defm : ld1; -} // End HasSVEorSME +} // End HasSVE_or_SME let Predicates = [HasSVE] in { multiclass ldnf1 { @@ -3144,7 +3144,7 @@ let Predicates = [HasSVE] in { defm : ldff1; } // End HasSVE -let Predicates = [HasSVEorSME] in { +let Predicates = [HasSVE_or_SME] in { multiclass st1 { // reg + reg @@ -3462,7 +3462,7 @@ let Predicates = [HasSVEorSME] in { (SUB_ZPmZ_S PPR:$pred, ZPR:$op, (DUP_ZI_S 255, 0))>; def : Pat<(nxv2i64 (sub ZPR:$op, (sext nxv2i1:$pred))), (SUB_ZPmZ_D PPR:$pred, ZPR:$op, (DUP_ZI_D 255, 0))>; -} // End HasSVEorSME +} // End HasSVE_or_SME let Predicates = [HasSVE, HasMatMulInt8] in { defm SMMLA_ZZZ : sve_int_matmul<0b00, "smmla", int_aarch64_sve_smmla>; @@ -3470,11 +3470,11 @@ let Predicates = [HasSVE, HasMatMulInt8] in { defm USMMLA_ZZZ : sve_int_matmul<0b10, "usmmla", int_aarch64_sve_usmmla>; } // End HasSVE, HasMatMulInt8 -let Predicates = [HasSVEorSME, HasMatMulInt8] in { +let Predicates = [HasSVE_or_SME, HasMatMulInt8] in { defm USDOT_ZZZ : 
sve_int_dot_mixed<"usdot", AArch64usdot>; defm USDOT_ZZZI : sve_int_dot_mixed_indexed<0, "usdot", int_aarch64_sve_usdot_lane>; defm SUDOT_ZZZI : sve_int_dot_mixed_indexed<1, "sudot", int_aarch64_sve_sudot_lane>; -} // End HasSVEorSME, HasMatMulInt8 +} // End HasSVE_or_SME, HasMatMulInt8 let Predicates = [HasSVE, HasMatMulFP32] in { defm FMMLA_ZZZ_S : sve_fp_matrix_mla<0b10, "fmmla", ZPR32, ZPR32, int_aarch64_sve_fmmla, nxv4f32, nxv4f32>; @@ -3496,16 +3496,16 @@ let Predicates = [HasSVE, HasMatMulFP64] in { defm LD1RO_D : sve_mem_ldor_ss<0b11, "ld1rod", Z_d, ZPR64, GPR64NoXZRshifted64, nxv2i64, nxv2i1, AArch64ld1ro_z, am_sve_regreg_lsl3>; } // End HasSVE, HasMatMulFP64 -let Predicates = [HasSVEorSME, HasMatMulFP64] in { +let Predicates = [HasSVE_or_SME, HasMatMulFP64] in { defm ZIP1_ZZZ_Q : sve_int_perm_bin_perm_128_zz<0b00, 0, "zip1", int_aarch64_sve_zip1q>; defm ZIP2_ZZZ_Q : sve_int_perm_bin_perm_128_zz<0b00, 1, "zip2", int_aarch64_sve_zip2q>; defm UZP1_ZZZ_Q : sve_int_perm_bin_perm_128_zz<0b01, 0, "uzp1", int_aarch64_sve_uzp1q>; defm UZP2_ZZZ_Q : sve_int_perm_bin_perm_128_zz<0b01, 1, "uzp2", int_aarch64_sve_uzp2q>; defm TRN1_ZZZ_Q : sve_int_perm_bin_perm_128_zz<0b11, 0, "trn1", int_aarch64_sve_trn1q>; defm TRN2_ZZZ_Q : sve_int_perm_bin_perm_128_zz<0b11, 1, "trn2", int_aarch64_sve_trn2q>; -} // End HasSVEorSME, HasMatMulFP64 +} // End HasSVE_or_SME, HasMatMulFP64 -let Predicates = [HasSVE2orSME] in { +let Predicates = [HasSVE2_or_SME] in { // SVE2 integer multiply-add (indexed) defm MLA_ZZZI : sve2_int_mla_by_indexed_elem<0b01, 0b0, "mla", int_aarch64_sve_mla_lane>; defm MLS_ZZZI : sve2_int_mla_by_indexed_elem<0b01, 0b1, "mls", int_aarch64_sve_mls_lane>; @@ -3653,17 +3653,17 @@ let Predicates = [HasSVE2orSME] in { defm UQSHL_ZPZZ : sve_int_bin_pred_all_active_bhsd; defm SQRSHL_ZPZZ : sve_int_bin_pred_all_active_bhsd; defm UQRSHL_ZPZZ : sve_int_bin_pred_all_active_bhsd; -} // End HasSVE2orSME +} // End HasSVE2_or_SME -let Predicates = [HasSVE2orSME, 
UseExperimentalZeroingPseudos] in { +let Predicates = [HasSVE2_or_SME, UseExperimentalZeroingPseudos] in { defm SQSHL_ZPZI : sve_int_bin_pred_shift_imm_left_zeroing_bhsd; defm UQSHL_ZPZI : sve_int_bin_pred_shift_imm_left_zeroing_bhsd; defm SRSHR_ZPZI : sve_int_bin_pred_shift_imm_right_zeroing_bhsd; defm URSHR_ZPZI : sve_int_bin_pred_shift_imm_right_zeroing_bhsd; defm SQSHLU_ZPZI : sve_int_bin_pred_shift_imm_left_zeroing_bhsd; -} // End HasSVE2orSME, UseExperimentalZeroingPseudos +} // End HasSVE2_or_SME, UseExperimentalZeroingPseudos -let Predicates = [HasSVE2orSME] in { +let Predicates = [HasSVE2_or_SME] in { // SVE2 predicated shifts defm SQSHL_ZPmI : sve_int_bin_pred_shift_imm_left_dup<0b0110, "sqshl", "SQSHL_ZPZI", int_aarch64_sve_sqshl>; defm UQSHL_ZPmI : sve_int_bin_pred_shift_imm_left_dup<0b0111, "uqshl", "UQSHL_ZPZI", int_aarch64_sve_uqshl>; @@ -3776,7 +3776,7 @@ let Predicates = [HasSVE2orSME] in { defm SQXTNT_ZZ : sve2_int_sat_extract_narrow_top<0b00, "sqxtnt", int_aarch64_sve_sqxtnt>; defm UQXTNT_ZZ : sve2_int_sat_extract_narrow_top<0b01, "uqxtnt", int_aarch64_sve_uqxtnt>; defm SQXTUNT_ZZ : sve2_int_sat_extract_narrow_top<0b10, "sqxtunt", int_aarch64_sve_sqxtunt>; -} // End HasSVE2orSME +} // End HasSVE2_or_SME let Predicates = [HasSVE2] in { // SVE2 character match @@ -3784,7 +3784,7 @@ let Predicates = [HasSVE2] in { defm NMATCH_PPzZZ : sve2_char_match<0b1, "nmatch", int_aarch64_sve_nmatch>; } // End HasSVE2 -let Predicates = [HasSVE2orSME] in { +let Predicates = [HasSVE2_or_SME] in { // SVE2 bitwise exclusive-or interleaved defm EORBT_ZZZ : sve2_bitwise_xor_interleaved<0b0, "eorbt", int_aarch64_sve_eorbt>; defm EORTB_ZZZ : sve2_bitwise_xor_interleaved<0b1, "eortb", int_aarch64_sve_eortb>; @@ -3799,7 +3799,7 @@ let Predicates = [HasSVE2orSME] in { defm SADDLBT_ZZZ : sve2_misc_int_addsub_long_interleaved<0b00, "saddlbt", int_aarch64_sve_saddlbt>; defm SSUBLBT_ZZZ : sve2_misc_int_addsub_long_interleaved<0b10, "ssublbt", int_aarch64_sve_ssublbt>; defm 
SSUBLTB_ZZZ : sve2_misc_int_addsub_long_interleaved<0b11, "ssubltb", int_aarch64_sve_ssubltb>; -} // End HasSVE2orSME +} // End HasSVE2_or_SME let Predicates = [HasSVE2] in { // SVE2 histogram generation (segment) @@ -3809,16 +3809,16 @@ let Predicates = [HasSVE2] in { defm HISTCNT_ZPzZZ : sve2_hist_gen_vector<"histcnt", int_aarch64_sve_histcnt>; } // End HasSVE2 -let Predicates = [HasSVE2orSME] in { +let Predicates = [HasSVE2_or_SME] in { // SVE2 floating-point base 2 logarithm as integer defm FLOGB_ZPmZ : sve2_fp_flogb<"flogb", "FLOGB_ZPZZ", int_aarch64_sve_flogb>; } -let Predicates = [HasSVE2orSME, UseExperimentalZeroingPseudos] in { +let Predicates = [HasSVE2_or_SME, UseExperimentalZeroingPseudos] in { defm FLOGB_ZPZZ : sve2_fp_un_pred_zeroing_hsd; -} // End HasSVE2orSME, UseExperimentalZeroingPseudos +} // End HasSVE2_or_SME, UseExperimentalZeroingPseudos -let Predicates = [HasSVE2orSME] in { +let Predicates = [HasSVE2_or_SME] in { // SVE2 floating-point convert precision defm FCVTXNT_ZPmZ : sve2_fp_convert_down_odd_rounding_top<"fcvtxnt", "int_aarch64_sve_fcvtxnt">; defm FCVTX_ZPmZ : sve2_fp_convert_down_odd_rounding<"fcvtx", "int_aarch64_sve_fcvtx", AArch64fcvtx_mt>; @@ -3861,7 +3861,7 @@ let Predicates = [HasSVE2orSME] in { def : Pat<(nxv16i8 (AArch64ext nxv16i8:$zn1, nxv16i8:$zn2, (i32 imm0_255:$imm))), (EXT_ZZI_B (REG_SEQUENCE ZPR2, $zn1, zsub0, $zn2, zsub1), imm0_255:$imm)>; } -} // End HasSVE2orSME +} // End HasSVE2_or_SME let Predicates = [HasSVE2] in { // SVE2 non-temporal gather loads @@ -3880,10 +3880,10 @@ let Predicates = [HasSVE2] in { defm LDNT1D_ZZR_D : sve2_mem_gldnt_vs_64_ptrs<0b11110, "ldnt1d", AArch64ldnt1_gather_z, nxv2i64>; } // End HasSVE2 -let Predicates = [HasSVE2orSME] in { +let Predicates = [HasSVE2_or_SME] in { // SVE2 vector splice (constructive) defm SPLICE_ZPZZ : sve2_int_perm_splice_cons<"splice", AArch64splice>; -} // End HasSVE2orSME +} // End HasSVE2_or_SME let Predicates = [HasSVE2] in { // SVE2 non-temporal scatter stores 
@@ -3897,7 +3897,7 @@ let Predicates = [HasSVE2] in { defm STNT1D_ZZR_D : sve2_mem_sstnt_vs_64_ptrs<0b110, "stnt1d", AArch64stnt1_scatter, nxv2i64>; } // End HasSVE2 -let Predicates = [HasSVE2orSME] in { +let Predicates = [HasSVE2_or_SME] in { // SVE2 table lookup (three sources) defm TBL_ZZZZ : sve2_int_perm_tbl<"tbl", int_aarch64_sve_tbl2>; defm TBX_ZZZ : sve2_int_perm_tbx<"tbx", 0b01, int_aarch64_sve_tbx>; @@ -3916,9 +3916,9 @@ let Predicates = [HasSVE2orSME] in { // SVE2 pointer conflict compare defm WHILEWR_PXX : sve2_int_while_rr<0b0, "whilewr", "int_aarch64_sve_whilewr">; defm WHILERW_PXX : sve2_int_while_rr<0b1, "whilerw", "int_aarch64_sve_whilerw">; -} // End HasSVE2orSME +} // End HasSVE2_or_SME -let Predicates = [HasSVEAES, HasNonStreamingSVE2orSSVE_AES] in { +let Predicates = [HasSVEAES, HasNonStreamingSVE2_or_SSVE_AES] in { // SVE2 crypto destructive binary operations defm AESE_ZZZ_B : sve2_crypto_des_bin_op<0b00, "aese", ZPR8, int_aarch64_sve_aese, nxv16i8>; defm AESD_ZZZ_B : sve2_crypto_des_bin_op<0b01, "aesd", ZPR8, int_aarch64_sve_aesd, nxv16i8>; @@ -3946,14 +3946,14 @@ let Predicates = [HasSVE2SHA3] in { defm RAX1_ZZZ_D : sve2_crypto_cons_bin_op<0b1, "rax1", ZPR64, int_aarch64_sve_rax1, nxv2i64>; } // End HasSVE2SHA3 -let Predicates = [HasSVEBitPerm, HasNonStreamingSVE2orSSVE_BitPerm] in { +let Predicates = [HasSVEBitPerm, HasNonStreamingSVE2_or_SSVE_BitPerm] in { // SVE2 bitwise permute defm BEXT_ZZZ : sve2_misc_bitwise<0b1100, "bext", int_aarch64_sve_bext_x>; defm BDEP_ZZZ : sve2_misc_bitwise<0b1101, "bdep", int_aarch64_sve_bdep_x>; defm BGRP_ZZZ : sve2_misc_bitwise<0b1110, "bgrp", int_aarch64_sve_bgrp_x>; } -let Predicates = [HasSVEAES2, HasNonStreamingSVE2p1orSSVE_AES] in { +let Predicates = [HasSVEAES2, HasNonStreamingSVE2p1_or_SSVE_AES] in { // SVE_AES2 multi-vector instructions (x2) def AESE_2ZZI_B : sve_crypto_binary_multi2<0b000, "aese">; def AESD_2ZZI_B : sve_crypto_binary_multi2<0b010, "aesd">; @@ -3974,20 +3974,20 @@ let Predicates = 
[HasSVEAES2, HasNonStreamingSVE2p1orSSVE_AES] in { // SME or SVE2.1 instructions //===----------------------------------------------------------------------===// -let Predicates = [HasSVE2p1_or_HasSME] in { +let Predicates = [HasSVE2p1_or_SME] in { defm REVD_ZPmZ : sve2_int_perm_revd<"revd", AArch64revd_mt>; defm SCLAMP_ZZZ : sve2_clamp<"sclamp", 0b0, AArch64sclamp>; defm UCLAMP_ZZZ : sve2_clamp<"uclamp", 0b1, AArch64uclamp>; defm PSEL_PPPRI : sve2_int_perm_sel_p<"psel", int_aarch64_sve_psel>; -} // End HasSVE2p1_or_HasSME +} // End HasSVE2p1_or_SME //===----------------------------------------------------------------------===// // SME2 or SVE2.1 instructions //===----------------------------------------------------------------------===// -let Predicates = [HasSVE2p1_or_HasSME2] in { +let Predicates = [HasSVE2p1_or_SME2] in { defm FCLAMP_ZZZ : sve_fp_clamp<"fclamp", AArch64fclamp>; defm FDOT_ZZZ_S : sve_float_dot<0b0, 0b0, ZPR32, ZPR16, "fdot", nxv8f16, int_aarch64_sve_fdot_x2>; @@ -4154,9 +4154,9 @@ defm WHILEHS_CXX : sve2p1_int_while_rr_pn<"whilehs", 0b100>; defm WHILEHI_CXX : sve2p1_int_while_rr_pn<"whilehi", 0b101>; defm WHILELO_CXX : sve2p1_int_while_rr_pn<"whilelo", 0b110>; defm WHILELS_CXX : sve2p1_int_while_rr_pn<"whilels", 0b111>; -} // End HasSVE2p1_or_HasSME2 +} // End HasSVE2p1_or_SME2 -let Predicates = [HasSVEorSME] in { +let Predicates = [HasSVE_or_SME] in { // Aliases for existing SVE instructions for which predicate-as-counter are // accepted as an operand to the instruction @@ -4222,7 +4222,7 @@ let Predicates = [HasSVEBFSCALE] in { //===----------------------------------------------------------------------===// // SME2.1 or SVE2.1 instructions //===----------------------------------------------------------------------===// -let Predicates = [HasSVE2p1_or_HasSME2p1] in { +let Predicates = [HasSVE2p1_or_SME2p1] in { defm FADDQV : sve2p1_fp_reduction_q<0b000, "faddqv", int_aarch64_sve_faddqv>; defm FMAXNMQV : sve2p1_fp_reduction_q<0b100, "fmaxnmqv", 
int_aarch64_sve_fmaxnmqv>; defm FMINNMQV : sve2p1_fp_reduction_q<0b101, "fminnmqv", int_aarch64_sve_fminnmqv>; @@ -4250,13 +4250,13 @@ defm UZPQ1_ZZZ : sve2p1_permute_vec_elems_q<0b010, "uzpq1", int_aarch64_sve_uzpq defm UZPQ2_ZZZ : sve2p1_permute_vec_elems_q<0b011, "uzpq2", int_aarch64_sve_uzpq2>; defm TBXQ_ZZZ : sve2_int_perm_tbx<"tbxq", 0b10, int_aarch64_sve_tbxq>; defm TBLQ_ZZZ : sve2p1_tblq<"tblq", int_aarch64_sve_tblq>; -} // End HasSVE2p1_or_HasSME2p1 +} // End HasSVE2p1_or_SME2p1 //===----------------------------------------------------------------------===// // SME2.2 or SVE2.2 instructions //===----------------------------------------------------------------------===// -let Predicates = [HasSVE2p2orSME2p2] in { +let Predicates = [HasSVE2p2_or_SME2p2] in { // SVE Floating-point convert precision, zeroing predicate defm FCVT_ZPzZ : sve_fp_z2op_p_zd_b_0<"fcvt", "int_aarch64_sve_fcvt">; @@ -4349,7 +4349,7 @@ let Predicates = [HasSVE2p2orSME2p2] in { //===----------------------------------------------------------------------===// // SME2.2 or SVE2.2 instructions - Legal in streaming mode iff target has SME2p2 //===----------------------------------------------------------------------===// -let Predicates = [HasNonStreamingSVE2p2orSME2p2] in { +let Predicates = [HasNonStreamingSVE2p2_or_SME2p2] in { // SVE2 EXPAND defm EXPAND_ZPZ : sve2_int_perm_expand<"expand">; // SVE COMPACT - byte and halfword @@ -4359,7 +4359,7 @@ let Predicates = [HasNonStreamingSVE2p2orSME2p2] in { //===----------------------------------------------------------------------===// // SVE2 FP8 instructions //===----------------------------------------------------------------------===// -let Predicates = [HasSVE2orSME2, HasFP8] in { +let Predicates = [HasSVE2_or_SME2, HasFP8] in { // FP8 upconvert defm F1CVT_ZZ : sve2_fp8_cvt_single<0b0, 0b00, "f1cvt", nxv8f16, int_aarch64_sve_fp8_cvt1>; defm F2CVT_ZZ : sve2_fp8_cvt_single<0b0, 0b01, "f2cvt", nxv8f16, int_aarch64_sve_fp8_cvt2>; @@ -4376,15 
+4376,15 @@ defm FCVTNB_Z2Z_StoB : sve2_fp8_down_cvt_single<0b01, "fcvtnb", ZZ_s_mul_r, nxv4 defm BFCVTN_Z2Z_HtoB : sve2_fp8_down_cvt_single<0b10, "bfcvtn", ZZ_h_mul_r, nxv8bf16, int_aarch64_sve_fp8_cvtn>; defm FCVTNT_Z2Z_StoB : sve2_fp8_down_cvt_single_top<0b11, "fcvtnt", ZZ_s_mul_r, nxv4f32, int_aarch64_sve_fp8_cvtnt>; -} // End HasSVE2orSME2, HasFP8 +} // End HasSVE2_or_SME2, HasFP8 -let Predicates = [HasSVE2orSME2, HasFAMINMAX] in { +let Predicates = [HasSVE2_or_SME2, HasFAMINMAX] in { defm FAMIN_ZPmZ : sve_fp_2op_p_zds<0b1111, "famin", "FAMIN_ZPZZ", int_aarch64_sve_famin, DestructiveBinaryComm>; defm FAMAX_ZPmZ : sve_fp_2op_p_zds<0b1110, "famax", "FAMAX_ZPZZ", int_aarch64_sve_famax, DestructiveBinaryComm>; defm FAMAX_ZPZZ : sve_fp_bin_pred_hfd; defm FAMIN_ZPZZ : sve_fp_bin_pred_hfd; -} // End HasSVE2orSME2, HasFAMINMAX +} // End HasSVE2_or_SME2, HasFAMINMAX let Predicates = [HasSSVE_FP8FMA] in { // FP8 Widening Multiply-Add Long - Indexed Group @@ -4428,14 +4428,14 @@ defm FDOT_ZZZI_BtoS : sve2_fp8_dot_indexed_s<"fdot", int_aarch64_sve_fp8_fdot_la defm FDOT_ZZZ_BtoS : sve_fp8_dot<0b1, ZPR32, "fdot", nxv4f32, int_aarch64_sve_fp8_fdot>; } -let Predicates = [HasSVE2orSME2, HasLUT] in { +let Predicates = [HasSVE2_or_SME2, HasLUT] in { // LUTI2 defm LUTI2_ZZZI : sve2_luti2_vector_index<"luti2">; // LUTI4 defm LUTI4_ZZZI : sve2_luti4_vector_index<"luti4">; // LUTI4 (two contiguous registers) defm LUTI4_Z2ZZI : sve2_luti4_vector_vg2_index<"luti4">; -} // End HasSVE2orSME2, HasLUT +} // End HasSVE2_or_SME2, HasLUT //===----------------------------------------------------------------------===// // Checked Pointer Arithmetic (FEAT_CPA) diff --git a/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp b/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp index 932a6f9ce23fd..7f10bfed739b4 100644 --- a/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp +++ b/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp @@ -23,6 +23,7 @@ #include 
"llvm/IR/IntrinsicsAArch64.h" #include "llvm/IR/PatternMatch.h" #include "llvm/Support/Debug.h" +#include "llvm/TargetParser/AArch64TargetParser.h" #include "llvm/Transforms/InstCombine/InstCombiner.h" #include "llvm/Transforms/Vectorize/LoopVectorizationLegality.h" #include @@ -248,6 +249,19 @@ static bool hasPossibleIncompatibleOps(const Function *F) { return false; } +uint64_t AArch64TTIImpl::getFeatureMask(const Function &F) const { + StringRef AttributeStr = + isMultiversionedFunction(F) ? "fmv-features" : "target-features"; + StringRef FeatureStr = F.getFnAttribute(AttributeStr).getValueAsString(); + SmallVector Features; + FeatureStr.split(Features, ","); + return AArch64::getFMVPriority(Features); +} + +bool AArch64TTIImpl::isMultiversionedFunction(const Function &F) const { + return F.hasFnAttribute("fmv-features"); +} + bool AArch64TTIImpl::areInlineCompatible(const Function *Caller, const Function *Callee) const { SMEAttrs CallerAttrs(*Caller), CalleeAttrs(*Callee); diff --git a/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.h b/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.h index 8e7e590c173ff..1eb805ae00b1b 100644 --- a/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.h +++ b/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.h @@ -89,6 +89,10 @@ class AArch64TTIImpl : public BasicTTIImplBase { unsigned getInlineCallPenalty(const Function *F, const CallBase &Call, unsigned DefaultCallPenalty) const; + uint64_t getFeatureMask(const Function &F) const; + + bool isMultiversionedFunction(const Function &F) const; + /// \name Scalar TTI Implementations /// @{ diff --git a/llvm/lib/Target/AArch64/GISel/AArch64CallLowering.cpp b/llvm/lib/Target/AArch64/GISel/AArch64CallLowering.cpp index 15f1c99e87246..e4719b26cab52 100644 --- a/llvm/lib/Target/AArch64/GISel/AArch64CallLowering.cpp +++ b/llvm/lib/Target/AArch64/GISel/AArch64CallLowering.cpp @@ -165,7 +165,7 @@ struct IncomingArgHandler : public CallLowering::IncomingValueHandler { void 
assignValueToReg(Register ValVReg, Register PhysReg, const CCValAssign &VA) override { - markPhysRegUsed(PhysReg); + markRegUsed(PhysReg); IncomingValueHandler::assignValueToReg(ValVReg, PhysReg, VA); } @@ -207,16 +207,16 @@ struct IncomingArgHandler : public CallLowering::IncomingValueHandler { /// How the physical register gets marked varies between formal /// parameters (it's a basic-block live-in), and a call instruction /// (it's an implicit-def of the BL). - virtual void markPhysRegUsed(MCRegister PhysReg) = 0; + virtual void markRegUsed(Register Reg) = 0; }; struct FormalArgHandler : public IncomingArgHandler { FormalArgHandler(MachineIRBuilder &MIRBuilder, MachineRegisterInfo &MRI) : IncomingArgHandler(MIRBuilder, MRI) {} - void markPhysRegUsed(MCRegister PhysReg) override { - MIRBuilder.getMRI()->addLiveIn(PhysReg); - MIRBuilder.getMBB().addLiveIn(PhysReg); + void markRegUsed(Register Reg) override { + MIRBuilder.getMRI()->addLiveIn(Reg.asMCReg()); + MIRBuilder.getMBB().addLiveIn(Reg.asMCReg()); } }; @@ -225,8 +225,8 @@ struct CallReturnHandler : public IncomingArgHandler { MachineInstrBuilder MIB) : IncomingArgHandler(MIRBuilder, MRI), MIB(MIB) {} - void markPhysRegUsed(MCRegister PhysReg) override { - MIB.addDef(PhysReg, RegState::Implicit); + void markRegUsed(Register Reg) override { + MIB.addDef(Reg, RegState::Implicit); } MachineInstrBuilder MIB; @@ -239,7 +239,7 @@ struct ReturnedArgCallReturnHandler : public CallReturnHandler { MachineInstrBuilder MIB) : CallReturnHandler(MIRBuilder, MRI, MIB) {} - void markPhysRegUsed(MCRegister PhysReg) override {} + void markRegUsed(Register Reg) override {} }; struct OutgoingArgHandler : public CallLowering::OutgoingValueHandler { diff --git a/llvm/lib/Target/AArch64/SVEInstrFormats.td b/llvm/lib/Target/AArch64/SVEInstrFormats.td index 0ef862fc1a27c..873fbf7dd346b 100644 --- a/llvm/lib/Target/AArch64/SVEInstrFormats.td +++ b/llvm/lib/Target/AArch64/SVEInstrFormats.td @@ -469,7 +469,7 @@ multiclass sve_int_ptrue 
opc, string asm, SDPatternOperator op> { def SDT_AArch64PTrue : SDTypeProfile<1, 1, [SDTCisVec<0>, SDTCisVT<1, i32>]>; def AArch64ptrue : SDNode<"AArch64ISD::PTRUE", SDT_AArch64PTrue>; -let Predicates = [HasSVEorSME] in { +let Predicates = [HasSVE_or_SME] in { defm PTRUE : sve_int_ptrue<0b000, "ptrue", AArch64ptrue>; defm PTRUES : sve_int_ptrue<0b001, "ptrues", null_frag>; @@ -1263,7 +1263,7 @@ class sve_int_pred_pattern_a opc, string asm> multiclass sve_int_pred_pattern_a opc, string asm, SDPatternOperator op, SDPatternOperator opcnt> { - let Predicates = [HasSVEorSME] in { + let Predicates = [HasSVE_or_SME] in { def NAME : sve_int_pred_pattern_a; def : InstAlias opc, string asm, (!cast(NAME) GPR64:$Rdn, 0b11111, 1), 2>; } - let Predicates = [HasSVEorSME, UseScalarIncVL] in { + let Predicates = [HasSVE_or_SME, UseScalarIncVL] in { def : Pat<(i64 (op GPR64:$Rdn, (opcnt sve_pred_enum:$pattern))), (!cast(NAME) GPR64:$Rdn, sve_pred_enum:$pattern, 1)>; diff --git a/llvm/lib/Target/AMDGPU/AMDGPU.h b/llvm/lib/Target/AMDGPU/AMDGPU.h index 400c5f219cc70..89356df39724a 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPU.h +++ b/llvm/lib/Target/AMDGPU/AMDGPU.h @@ -207,8 +207,8 @@ extern char &SILoadStoreOptimizerLegacyID; void initializeSIWholeQuadModePass(PassRegistry &); extern char &SIWholeQuadModeID; -void initializeSILowerControlFlowPass(PassRegistry &); -extern char &SILowerControlFlowID; +void initializeSILowerControlFlowLegacyPass(PassRegistry &); +extern char &SILowerControlFlowLegacyID; void initializeSIPreEmitPeepholePass(PassRegistry &); extern char &SIPreEmitPeepholeID; diff --git a/llvm/lib/Target/AMDGPU/AMDGPUPassRegistry.def b/llvm/lib/Target/AMDGPU/AMDGPUPassRegistry.def index 6f322074ba74c..fbcf83e2fdd60 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUPassRegistry.def +++ b/llvm/lib/Target/AMDGPU/AMDGPUPassRegistry.def @@ -102,6 +102,7 @@ MACHINE_FUNCTION_PASS("si-i1-copies", SILowerI1CopiesPass()) MACHINE_FUNCTION_PASS("si-fold-operands", SIFoldOperandsPass()); 
MACHINE_FUNCTION_PASS("gcn-dpp-combine", GCNDPPCombinePass()) MACHINE_FUNCTION_PASS("si-load-store-opt", SILoadStoreOptimizerPass()) +MACHINE_FUNCTION_PASS("si-lower-control-flow", SILowerControlFlowPass()) MACHINE_FUNCTION_PASS("si-lower-sgpr-spills", SILowerSGPRSpillsPass()) MACHINE_FUNCTION_PASS("si-opt-vgpr-liverange", SIOptimizeVGPRLiveRangePass()) MACHINE_FUNCTION_PASS("si-peephole-sdwa", SIPeepholeSDWAPass()) diff --git a/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp b/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp index 6d4547dbc82c3..98268b848f5ce 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp @@ -38,6 +38,7 @@ #include "SIFixSGPRCopies.h" #include "SIFoldOperands.h" #include "SILoadStoreOptimizer.h" +#include "SILowerControlFlow.h" #include "SILowerSGPRSpills.h" #include "SIMachineFunctionInfo.h" #include "SIMachineScheduler.h" @@ -523,7 +524,7 @@ extern "C" LLVM_EXTERNAL_VISIBILITY void LLVMInitializeAMDGPUTarget() { initializeSIInsertWaitcntsPass(*PR); initializeSIModeRegisterPass(*PR); initializeSIWholeQuadModePass(*PR); - initializeSILowerControlFlowPass(*PR); + initializeSILowerControlFlowLegacyPass(*PR); initializeSIPreEmitPeepholePass(*PR); initializeSILateBranchLoweringPass(*PR); initializeSIMemoryLegalizerPass(*PR); @@ -1459,7 +1460,7 @@ void GCNPassConfig::addFastRegAlloc() { // This must be run immediately after phi elimination and before // TwoAddressInstructions, otherwise the processing of the tied operand of // SI_ELSE will introduce a copy of the tied operand source after the else. 
- insertPass(&PHIEliminationID, &SILowerControlFlowID); + insertPass(&PHIEliminationID, &SILowerControlFlowLegacyID); insertPass(&TwoAddressInstructionPassID, &SIWholeQuadModeID); @@ -1480,7 +1481,7 @@ void GCNPassConfig::addOptimizedRegAlloc() { // This must be run immediately after phi elimination and before // TwoAddressInstructions, otherwise the processing of the tied operand of // SI_ELSE will introduce a copy of the tied operand source after the else. - insertPass(&PHIEliminationID, &SILowerControlFlowID); + insertPass(&PHIEliminationID, &SILowerControlFlowLegacyID); if (EnableRewritePartialRegUses) insertPass(&RenameIndependentSubregsID, &GCNRewritePartialRegUsesID); diff --git a/llvm/lib/Target/AMDGPU/GCNHazardRecognizer.cpp b/llvm/lib/Target/AMDGPU/GCNHazardRecognizer.cpp index 5207201e14c09..6baef137df5e1 100644 --- a/llvm/lib/Target/AMDGPU/GCNHazardRecognizer.cpp +++ b/llvm/lib/Target/AMDGPU/GCNHazardRecognizer.cpp @@ -3007,8 +3007,8 @@ bool GCNHazardRecognizer::fixVALUMaskWriteHazard(MachineInstr *MI) { switch (I.getOpcode()) { case AMDGPU::V_ADDC_U32_e32: case AMDGPU::V_ADDC_U32_dpp: - case AMDGPU::V_CNDMASK_B16_e32: - case AMDGPU::V_CNDMASK_B16_dpp: + case AMDGPU::V_CNDMASK_B16_fake16_e32: + case AMDGPU::V_CNDMASK_B16_fake16_dpp: case AMDGPU::V_CNDMASK_B32_e32: case AMDGPU::V_CNDMASK_B32_dpp: case AMDGPU::V_DIV_FMAS_F32_e64: @@ -3023,8 +3023,8 @@ bool GCNHazardRecognizer::fixVALUMaskWriteHazard(MachineInstr *MI) { HazardReg == AMDGPU::VCC_HI; case AMDGPU::V_ADDC_U32_e64: case AMDGPU::V_ADDC_U32_e64_dpp: - case AMDGPU::V_CNDMASK_B16_e64: - case AMDGPU::V_CNDMASK_B16_e64_dpp: + case AMDGPU::V_CNDMASK_B16_fake16_e64: + case AMDGPU::V_CNDMASK_B16_fake16_e64_dpp: case AMDGPU::V_CNDMASK_B32_e64: case AMDGPU::V_CNDMASK_B32_e64_dpp: case AMDGPU::V_SUBB_U32_e64: diff --git a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp index 21f1f20e5e69a..e068b5f0b8769 100644 --- a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp +++ 
b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp @@ -1952,6 +1952,13 @@ bool SITargetLowering::isExtractSubvectorCheap(EVT ResVT, EVT SrcVT, return Index == 0; } +bool SITargetLowering::isExtractVecEltCheap(EVT VT, unsigned Index) const { + // TODO: This should be more aggressive, particular for 16-bit element + // vectors. However there are some mixed improvements and regressions. + EVT EltTy = VT.getVectorElementType(); + return EltTy.getSizeInBits() % 32 == 0; +} + bool SITargetLowering::isTypeDesirableForOp(unsigned Op, EVT VT) const { if (Subtarget->has16BitInsts() && VT == MVT::i16) { switch (Op) { @@ -13877,6 +13884,37 @@ static SDValue getMad64_32(SelectionDAG &DAG, const SDLoc &SL, EVT VT, return DAG.getNode(ISD::TRUNCATE, SL, VT, Mad); } +// Fold +// y = lshr i64 x, 32 +// res = add (mul i64 y, Const), x where "Const" is a 64-bit constant +// with Const.hi == -1 +// To +// res = mad_u64_u32 y.lo ,Const.lo, x.lo +static SDValue tryFoldMADwithSRL(SelectionDAG &DAG, const SDLoc &SL, + SDValue MulLHS, SDValue MulRHS, + SDValue AddRHS) { + if (MulRHS.getOpcode() == ISD::SRL) + std::swap(MulLHS, MulRHS); + + if (MulLHS.getValueType() != MVT::i64 || MulLHS.getOpcode() != ISD::SRL) + return SDValue(); + + ConstantSDNode *ShiftVal = dyn_cast(MulLHS.getOperand(1)); + if (!ShiftVal || ShiftVal->getAsZExtVal() != 32 || + MulLHS.getOperand(0) != AddRHS) + return SDValue(); + + ConstantSDNode *Const = dyn_cast(MulRHS.getNode()); + if (!Const || Hi_32(Const->getZExtValue()) != uint32_t(-1)) + return SDValue(); + + SDValue ConstMul = + DAG.getConstant(Lo_32(Const->getZExtValue()), SL, MVT::i32); + return getMad64_32(DAG, SL, MVT::i64, + DAG.getNode(ISD::TRUNCATE, SL, MVT::i32, MulLHS), ConstMul, + DAG.getZeroExtendInReg(AddRHS, SL, MVT::i32), false); +} + // Fold (add (mul x, y), z) --> (mad_[iu]64_[iu]32 x, y, z) plus high // multiplies, if any. 
// @@ -13935,6 +13973,9 @@ SDValue SITargetLowering::tryFoldToMad64_32(SDNode *N, SDValue MulRHS = LHS.getOperand(1); SDValue AddRHS = RHS; + if (SDValue FoldedMAD = tryFoldMADwithSRL(DAG, SL, MulLHS, MulRHS, AddRHS)) + return FoldedMAD; + // Always check whether operands are small unsigned values, since that // knowledge is useful in more cases. Check for small signed values only if // doing so can unlock a shorter code sequence. diff --git a/llvm/lib/Target/AMDGPU/SIISelLowering.h b/llvm/lib/Target/AMDGPU/SIISelLowering.h index 5c215f76552d9..bbb96d9115a0a 100644 --- a/llvm/lib/Target/AMDGPU/SIISelLowering.h +++ b/llvm/lib/Target/AMDGPU/SIISelLowering.h @@ -365,6 +365,7 @@ class SITargetLowering final : public AMDGPUTargetLowering { bool isExtractSubvectorCheap(EVT ResVT, EVT SrcVT, unsigned Index) const override; + bool isExtractVecEltCheap(EVT VT, unsigned Index) const override; bool isTypeDesirableForOp(unsigned Op, EVT VT) const override; diff --git a/llvm/lib/Target/AMDGPU/SIInstructions.td b/llvm/lib/Target/AMDGPU/SIInstructions.td index cdc1132579d8d..40a20fa9cb15e 100644 --- a/llvm/lib/Target/AMDGPU/SIInstructions.td +++ b/llvm/lib/Target/AMDGPU/SIInstructions.td @@ -1245,11 +1245,22 @@ class VOPSelectPat : GCNPat < (vt (select i1:$src0, vt:$src1, vt:$src2)), (V_CNDMASK_B32_e64 0, VSrc_b32:$src2, 0, VSrc_b32:$src1, SSrc_i1:$src0) >; +class VOPSelectPat_t16 : GCNPat < + (vt (select i1:$src0, vt:$src1, vt:$src2)), + (V_CNDMASK_B16_t16_e64 0, VSrcT_b16:$src2, 0, VSrcT_b16:$src1, SSrc_i1:$src0) +>; def : VOPSelectModsPat ; def : VOPSelectModsPat ; -def : VOPSelectPat ; -def : VOPSelectPat ; +foreach p = [NotHasTrue16BitInsts, UseFakeTrue16Insts] in +let True16Predicate = p in { + def : VOPSelectPat ; + def : VOPSelectPat ; +} // End True16Predicate = p +let True16Predicate = UseRealTrue16Insts in { + def : VOPSelectPat_t16 ; + def : VOPSelectPat_t16 ; +} // End True16Predicate = UseRealTrue16Insts let AddedComplexity = 1 in { def : GCNPat < @@ -3030,6 +3041,8 
@@ def : GCNPat < // Magic number: 1 | (0 << 8) | (12 << 16) | (12 << 24) // The 12s emit 0s. +foreach p = [NotHasTrue16BitInsts, UseFakeTrue16Insts] in +let True16Predicate = p in { def : GCNPat < (i16 (bswap i16:$a)), (V_PERM_B32_e64 (i32 0), VSrc_b32:$a, (S_MOV_B32 (i32 0x0c0c0001))) @@ -3039,6 +3052,19 @@ def : GCNPat < (i32 (zext (bswap i16:$a))), (V_PERM_B32_e64 (i32 0), VSrc_b32:$a, (S_MOV_B32 (i32 0x0c0c0001))) >; +} + +let True16Predicate = UseRealTrue16Insts in { +def : GCNPat < + (i16 (bswap i16:$a)), + (EXTRACT_SUBREG (V_PERM_B32_e64 (i32 0), (COPY VGPR_16:$a), (S_MOV_B32 (i32 0x0c0c0001))), lo16) +>; + +def : GCNPat < + (i32 (zext (bswap i16:$a))), + (V_PERM_B32_e64 (i32 0), (COPY VGPR_16:$a), (S_MOV_B32 (i32 0x0c0c0001))) +>; +} // Magic number: 1 | (0 << 8) | (3 << 16) | (2 << 24) def : GCNPat < diff --git a/llvm/lib/Target/AMDGPU/SILowerControlFlow.cpp b/llvm/lib/Target/AMDGPU/SILowerControlFlow.cpp index 722a79be915dc..f8878f32f829d 100644 --- a/llvm/lib/Target/AMDGPU/SILowerControlFlow.cpp +++ b/llvm/lib/Target/AMDGPU/SILowerControlFlow.cpp @@ -48,6 +48,7 @@ /// %exec = S_OR_B64 %exec, %sgpr0 // Re-enable saved exec mask bits //===----------------------------------------------------------------------===// +#include "SILowerControlFlow.h" #include "AMDGPU.h" #include "GCNSubtarget.h" #include "MCTargetDesc/AMDGPUMCTargetDesc.h" @@ -68,7 +69,7 @@ RemoveRedundantEndcf("amdgpu-remove-redundant-endcf", namespace { -class SILowerControlFlow : public MachineFunctionPass { +class SILowerControlFlow { private: const SIRegisterInfo *TRI = nullptr; const SIInstrInfo *TII = nullptr; @@ -135,10 +136,18 @@ class SILowerControlFlow : public MachineFunctionPass { // Remove redundant SI_END_CF instructions. 
void optimizeEndCf(); +public: + SILowerControlFlow(LiveIntervals *LIS, LiveVariables *LV, + MachineDominatorTree *MDT) + : LIS(LIS), LV(LV), MDT(MDT) {} + bool run(MachineFunction &MF); +}; + +class SILowerControlFlowLegacy : public MachineFunctionPass { public: static char ID; - SILowerControlFlow() : MachineFunctionPass(ID) {} + SILowerControlFlowLegacy() : MachineFunctionPass(ID) {} bool runOnMachineFunction(MachineFunction &MF) override; @@ -152,17 +161,17 @@ class SILowerControlFlow : public MachineFunctionPass { AU.addPreserved(); AU.addPreserved(); AU.addPreserved(); - AU.addPreservedID(LiveVariablesID); + AU.addPreserved(); MachineFunctionPass::getAnalysisUsage(AU); } }; } // end anonymous namespace -char SILowerControlFlow::ID = 0; +char SILowerControlFlowLegacy::ID = 0; -INITIALIZE_PASS(SILowerControlFlow, DEBUG_TYPE, - "SI lower control flow", false, false) +INITIALIZE_PASS(SILowerControlFlowLegacy, DEBUG_TYPE, "SI lower control flow", + false, false) static void setImpSCCDefDead(MachineInstr &MI, bool IsDead) { MachineOperand &ImpDefSCC = MI.getOperand(3); @@ -171,7 +180,7 @@ static void setImpSCCDefDead(MachineInstr &MI, bool IsDead) { ImpDefSCC.setIsDead(IsDead); } -char &llvm::SILowerControlFlowID = SILowerControlFlow::ID; +char &llvm::SILowerControlFlowLegacyID = SILowerControlFlowLegacy::ID; bool SILowerControlFlow::hasKill(const MachineBasicBlock *Begin, const MachineBasicBlock *End) { @@ -753,21 +762,13 @@ bool SILowerControlFlow::removeMBBifRedundant(MachineBasicBlock &MBB) { return true; } -bool SILowerControlFlow::runOnMachineFunction(MachineFunction &MF) { +bool SILowerControlFlow::run(MachineFunction &MF) { const GCNSubtarget &ST = MF.getSubtarget(); TII = ST.getInstrInfo(); TRI = &TII->getRegisterInfo(); EnableOptimizeEndCf = RemoveRedundantEndcf && MF.getTarget().getOptLevel() > CodeGenOptLevel::None; - // This doesn't actually need LiveIntervals, but we can preserve them. 
- auto *LISWrapper = getAnalysisIfAvailable(); - LIS = LISWrapper ? &LISWrapper->getLIS() : nullptr; - // This doesn't actually need LiveVariables, but we can preserve them. - auto *LVWrapper = getAnalysisIfAvailable(); - LV = LVWrapper ? &LVWrapper->getLV() : nullptr; - auto *MDTWrapper = getAnalysisIfAvailable(); - MDT = MDTWrapper ? &MDTWrapper->getDomTree() : nullptr; MRI = &MF.getRegInfo(); BoolRC = TRI->getBoolRC(); @@ -864,3 +865,35 @@ bool SILowerControlFlow::runOnMachineFunction(MachineFunction &MF) { return Changed; } + +bool SILowerControlFlowLegacy::runOnMachineFunction(MachineFunction &MF) { + // This doesn't actually need LiveIntervals, but we can preserve them. + auto *LISWrapper = getAnalysisIfAvailable(); + LiveIntervals *LIS = LISWrapper ? &LISWrapper->getLIS() : nullptr; + // This doesn't actually need LiveVariables, but we can preserve them. + auto *LVWrapper = getAnalysisIfAvailable(); + LiveVariables *LV = LVWrapper ? &LVWrapper->getLV() : nullptr; + auto *MDTWrapper = getAnalysisIfAvailable(); + MachineDominatorTree *MDT = MDTWrapper ? 
&MDTWrapper->getDomTree() : nullptr; + return SILowerControlFlow(LIS, LV, MDT).run(MF); +} + +PreservedAnalyses +SILowerControlFlowPass::run(MachineFunction &MF, + MachineFunctionAnalysisManager &MFAM) { + LiveIntervals *LIS = MFAM.getCachedResult(MF); + LiveVariables *LV = MFAM.getCachedResult(MF); + MachineDominatorTree *MDT = + MFAM.getCachedResult(MF); + + bool Changed = SILowerControlFlow(LIS, LV, MDT).run(MF); + if (!Changed) + return PreservedAnalyses::all(); + + auto PA = getMachineFunctionPassPreservedAnalyses(); + PA.preserve(); + PA.preserve(); + PA.preserve(); + PA.preserve(); + return PA; +} diff --git a/llvm/lib/Target/AMDGPU/SILowerControlFlow.h b/llvm/lib/Target/AMDGPU/SILowerControlFlow.h new file mode 100644 index 0000000000000..23803c679c246 --- /dev/null +++ b/llvm/lib/Target/AMDGPU/SILowerControlFlow.h @@ -0,0 +1,22 @@ +//===- SILowerControlFlow.h -------------------------------------*- C++- *-===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#ifndef LLVM_LIB_TARGET_AMDGPU_SILOWERCONTROLFLOW_H +#define LLVM_LIB_TARGET_AMDGPU_SILOWERCONTROLFLOW_H + +#include "llvm/CodeGen/MachinePassManager.h" + +namespace llvm { +class SILowerControlFlowPass : public PassInfoMixin { +public: + PreservedAnalyses run(MachineFunction &MF, + MachineFunctionAnalysisManager &MFAM); +}; +} // namespace llvm + +#endif // LLVM_LIB_TARGET_AMDGPU_SILOWERCONTROLFLOW_H diff --git a/llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.cpp b/llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.cpp index 169f1369fb543..7de64bddf7884 100644 --- a/llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.cpp +++ b/llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.cpp @@ -715,7 +715,7 @@ yaml::SIMachineFunctionInfo::SIMachineFunctionInfo( ArgInfo(convertArgumentInfo(MFI.getArgInfo(), TRI)), PSInputAddr(MFI.getPSInputAddr()), PSInputEnable(MFI.getPSInputEnable()), MaxMemoryClusterDWords(MFI.getMaxMemoryClusterDWords()), - Mode(MFI.getMode()) { + Mode(MFI.getMode()), HasInitWholeWave(MFI.hasInitWholeWave()) { for (Register Reg : MFI.getSGPRSpillPhysVGPRs()) SpillPhysVGPRS.push_back(regToString(Reg, TRI)); diff --git a/llvm/lib/Target/AMDGPU/VOP2Instructions.td b/llvm/lib/Target/AMDGPU/VOP2Instructions.td index 6bbf19179b7f6..900c91731aa1b 100644 --- a/llvm/lib/Target/AMDGPU/VOP2Instructions.td +++ b/llvm/lib/Target/AMDGPU/VOP2Instructions.td @@ -714,6 +714,26 @@ class VOP2e_SGPR ArgVT> : VOPProfile { def VOP2e_I32_I32_I32_I1 : VOP2e_SGPR<[i32, i32, i32, i1]>; def VOP2e_I16_I16_I16_I1 : VOP2e_SGPR<[i16, i16, i16, i1]>; // V_CNDMASK_B16 is VOP3 only +def VOP2e_I16_I16_I16_I1_true16 : VOP2e_SGPR<[i16, i16, i16, i1]> { + let IsTrue16 = 1; + let IsRealTrue16 = 1; + let HasOpSel = 1; + let DstRC64 = getVALUDstForVT.ret; + let Src0RC64 = getVOP3SrcForVT.ret; + let Src1RC64 = getVOP3SrcForVT.ret; + let Src2RC64 = getVOP3SrcForVT.ret; + 
let Src0Mod = getSrc0Mod.ret; + let Src1Mod = getSrcMod.ret; + let HasSrc2Mods = 0; + let InsVOP3OpSel = getInsVOP3Base.ret; + let Src0VOP3DPP = VGPRSrc_16; + let Src1VOP3DPP = getVOP3DPPSrcForVT.ret; + let Src0ModVOP3DPP = getSrc0ModVOP3DPP.ret; + let Src1ModVOP3DPP = getSrcModVOP3DPP.ret; +} def VOP2e_I16_I16_I16_I1_fake16 : VOP2e_SGPR<[i16, i16, i16, i1]> { let IsTrue16 = 1; let DstRC64 = getVALUDstForVT.ret; @@ -765,8 +785,10 @@ def VOP_WRITELANE : VOPProfile<[i32, i32, i32, i32]> { // VOP2 Instructions //===----------------------------------------------------------------------===// -let SubtargetPredicate = isGFX11Plus in -defm V_CNDMASK_B16 : VOP2eInst <"v_cndmask_b16", VOP2e_I16_I16_I16_I1_fake16>; +let SubtargetPredicate = isGFX11Plus, True16Predicate = UseRealTrue16Insts in +defm V_CNDMASK_B16_t16 : VOP2eInst <"v_cndmask_b16_t16", VOP2e_I16_I16_I16_I1_true16>; +let SubtargetPredicate = isGFX11Plus, True16Predicate = UseFakeTrue16Insts in +defm V_CNDMASK_B16_fake16 : VOP2eInst <"v_cndmask_b16_fake16", VOP2e_I16_I16_I16_I1_fake16>; defm V_CNDMASK_B32 : VOP2eInst_VOPD <"v_cndmask_b32", VOP2e_I32_I32_I32_I1, 0x9, "v_cndmask_b32">; let SubtargetPredicate = HasMadMacF32Insts, isReMaterializable = 1 in def V_MADMK_F32 : VOP2_Pseudo <"v_madmk_f32", VOP_MADMK_F32, []>; @@ -1846,7 +1868,7 @@ defm V_FMAMK_F16 : VOP2Only_Real_MADK_t16_and_fake16_gfx11_gfx12<0x037 defm V_FMAAK_F16 : VOP2Only_Real_MADK_t16_and_fake16_gfx11_gfx12<0x038, "v_fmaak_f16">; // VOP3 only. 
-defm V_CNDMASK_B16 : VOP3Only_Realtriple_gfx11_gfx12<0x25d>; +defm V_CNDMASK_B16 : VOP3Only_Realtriple_t16_and_fake16_gfx11_gfx12<0x25d, "v_cndmask_b16">; defm V_LDEXP_F32 : VOP3Only_Realtriple_gfx11_gfx12<0x31c>; defm V_BFM_B32 : VOP3Only_Realtriple_gfx11_gfx12<0x31d>; defm V_BCNT_U32_B32 : VOP3Only_Realtriple_gfx11_gfx12<0x31e>; diff --git a/llvm/lib/Target/AMDGPU/VOPCInstructions.td b/llvm/lib/Target/AMDGPU/VOPCInstructions.td index 14e34c9e00ec6..bba8aa570d2b5 100644 --- a/llvm/lib/Target/AMDGPU/VOPCInstructions.td +++ b/llvm/lib/Target/AMDGPU/VOPCInstructions.td @@ -153,8 +153,7 @@ class VOPC_NoSdst_Profile sched, ValueType vt0, let InsSDWA = (ins Src0ModSDWA:$src0_modifiers, Src0SDWA:$src0, Src1ModSDWA:$src1_modifiers, Src1SDWA:$src1, src0_sel:$src0_sel, src1_sel:$src1_sel); - let AsmVOP3Base = !if(Src0VT.isFP, "$src0_modifiers, $src1_modifiers$clamp", - "$src0, $src1"); + let HasDst = 0; let AsmSDWA9 = "$src0_modifiers, $src1_modifiers $src0_sel $src1_sel"; let EmitDst = 0; } @@ -164,23 +163,53 @@ multiclass VOPC_NoSdst_Profile_t16 sched, ValueType vt0, Va def _t16 : VOPC_NoSdst_Profile { let IsTrue16 = 1; let IsRealTrue16 = 1; - let Src1RC32 = getVregSrcForVT.ret; - let Src0DPP = getVregSrcForVT.ret; - let Src1DPP = getVregSrcForVT.ret; - let Src2DPP = getVregSrcForVT.ret; - let Src0ModDPP = getSrcModDPP_t16.ret; - let Src1ModDPP = getSrcModDPP_t16.ret; - let Src2ModDPP = getSrcModDPP_t16.ret; + let HasOpSel = 1; + let HasModifiers = 1; // All instructions at least have OpSel + let Src0RC32 = getVOPSrc0ForVT.ret; + let Src1RC32 = getVregSrcForVT.ret; + let Src0DPP = getVregSrcForVT.ret; + let Src1DPP = getVregSrcForVT.ret; + let Src2DPP = getVregSrcForVT.ret; + let Src0ModDPP = getSrcModDPP_t16.ret; + let Src1ModDPP = getSrcModDPP_t16.ret; + let Src2ModDPP = getSrcModDPP_t16.ret; + let Src0VOP3DPP = VGPRSrc_16; + let Src1VOP3DPP = getVOP3DPPSrcForVT.ret; + let Src2VOP3DPP = getVOP3DPPSrcForVT.ret; + + let Src0RC64 = getVOP3SrcForVT.ret; + let Src1RC64 = 
getVOP3SrcForVT.ret; + let Src2RC64 = getVOP3SrcForVT.ret; + let Src0Mod = getSrc0Mod.ret; + let Src1Mod = getSrcMod.ret; + let Src2Mod = getSrcMod.ret; + let Src0ModVOP3DPP = getSrc0ModVOP3DPP.ret; + let Src1ModVOP3DPP = getSrcModVOP3DPP.ret; + let Src2ModVOP3DPP = getSrcModVOP3DPP.ret; } def _fake16 : VOPC_NoSdst_Profile { let IsTrue16 = 1; + let Src0RC32 = getVOPSrc0ForVT.ret; let Src1RC32 = getVregSrcForVT.ret; let Src0DPP = getVregSrcForVT.ret; let Src1DPP = getVregSrcForVT.ret; let Src2DPP = getVregSrcForVT.ret; - let Src0ModDPP = getSrcModDPP_t16.ret; - let Src1ModDPP = getSrcModDPP_t16.ret; - let Src2ModDPP = getSrcModDPP_t16.ret; + let Src0ModDPP = getSrcModDPP_t16.ret; + let Src1ModDPP = getSrcModDPP_t16.ret; + let Src2ModDPP = getSrcModDPP_t16.ret; + let Src0VOP3DPP = VGPRSrc_32; + let Src1VOP3DPP = getVOP3DPPSrcForVT.ret; + let Src2VOP3DPP = getVOP3DPPSrcForVT.ret; + + let Src0RC64 = getVOP3SrcForVT.ret; + let Src1RC64 = getVOP3SrcForVT.ret; + let Src2RC64 = getVOP3SrcForVT.ret; + let Src0Mod = getSrc0Mod.ret; + let Src1Mod = getSrcMod.ret; + let Src2Mod = getSrcMod.ret; + let Src0ModVOP3DPP = getSrc0ModVOP3DPP.ret; + let Src1ModVOP3DPP = getSrcModVOP3DPP.ret; + let Src2ModVOP3DPP = getSrcModVOP3DPP.ret; } } @@ -1408,7 +1437,7 @@ class VOPC64_DPP16_Dst op, VOP_DPP_Pseudo ps, class VOPC64_DPP16_NoDst op, VOP_DPP_Pseudo ps, string opName = ps.OpName> : VOPC64_DPP, VOP3_DPP_Enc { - let Inst{7-0} = ? 
; + let Inst{7-0} = ?; } class VOPC64_DPP16_Dst_t16 op, VOP_DPP_Pseudo ps, @@ -1419,6 +1448,13 @@ class VOPC64_DPP16_Dst_t16 op, VOP_DPP_Pseudo ps, let Inst{14} = 0; } +class VOPC64_DPP16_NoDst_t16 op, VOP_DPP_Pseudo ps, + string opName = ps.OpName> + : VOPC64_DPP, VOP3_DPP_Enc_t16 { + let Inst{7-0} = ?; + let Inst{14} = 0; +} + class VOPC64_DPP8 : VOP3_DPP8_Base { Instruction Opcode = !cast(NAME); @@ -1440,7 +1476,7 @@ class VOPC64_DPP8_Dst op, VOP_Pseudo ps, string opName = ps.OpName> class VOPC64_DPP8_NoDst op, VOP_Pseudo ps, string opName = ps.OpName> : VOPC64_DPP8, VOP3_DPP8_Enc { - let Inst{7-0} = ? ; + let Inst{7-0} = ?; let Constraints = ""; } @@ -1452,6 +1488,13 @@ class VOPC64_DPP8_Dst_t16 op, VOP_Pseudo ps, string opName = ps.OpName> let Constraints = ""; } +class VOPC64_DPP8_NoDst_t16 op, VOP_Pseudo ps, string opName = ps.OpName> + : VOPC64_DPP8, VOP3_DPP8_Enc_t16 { + let Inst{7-0} = ?; + let Inst{14} = 0; + let Constraints = ""; +} + //===----------------------------------------------------------------------===// // Target-specific instruction encodings. //===----------------------------------------------------------------------===// @@ -1619,7 +1662,7 @@ multiclass VOPCX_Real op> { # " " # ps32.AsmOperands; } def _e64#Gen.Suffix : - VOP3_Real, + VOP3_Real_Gen, VOP3a_gfx11_gfx12<{0, op}, ps64.Pfl> { let Inst{7-0} = ?; // sdst let AsmString = !subst("_nosdst", "", ps64.Mnemonic) @@ -1677,11 +1720,22 @@ multiclass VOPCX_Real_with_name op, string OpName, VOPCe { let AsmString = asm_name # "{_e32} " # ps32.AsmOperands; } - def _e64#Gen.Suffix - : VOP3_Real_Gen, - VOP3a_gfx11_gfx12<{0, op}, ps64.Pfl> { - let Inst{7-0} = ? 
; // sdst - let AsmString = asm_name # "{_e64} " # ps64.AsmOperands; + + if ps64.Pfl.IsRealTrue16 then { + def _e64#Gen.Suffix + : VOP3_Real_Gen, + VOP3e_t16_gfx11_gfx12<{0, op}, ps64.Pfl> { + let Inst{7-0} = ?; // sdst + let Inst{14} = 0; + let AsmString = asm_name # "{_e64} " # ps64.AsmOperands; + } + } else { + def _e64#Gen.Suffix + : VOP3_Real_Gen, + VOP3a_gfx11_gfx12<{0, op}, ps64.Pfl> { + let Inst{7-0} = ?; // sdst + let AsmString = asm_name # "{_e64} " # ps64.AsmOperands; + } } defm : VOPCXInstAliases; @@ -1695,14 +1749,25 @@ multiclass VOPCX_Real_with_name op, string OpName, if ps64.Pfl.HasExtVOP3DPP then { defvar psDPP = !cast(OpName#"_nosdst_e64"#"_dpp"); defvar AsmDPP = ps64.Pfl.AsmVOP3DPP16; - def _e64_dpp#Gen.Suffix - : VOPC64_DPP16_NoDst<{0, op}, psDPP, asm_name>, - SIMCInstr { - let AsmString = asm_name # "{_e64_dpp} " # AsmDPP; - } defvar AsmDPP8 = ps64.Pfl.AsmVOP3DPP8; - def _e64_dpp8#Gen.Suffix : VOPC64_DPP8_NoDst<{0, op}, ps64, asm_name> { - let AsmString = asm_name # "{_e64_dpp} " # AsmDPP8; + if ps64.Pfl.IsRealTrue16 then { + def _e64_dpp#Gen.Suffix + : VOPC64_DPP16_NoDst_t16<{0, op}, psDPP, asm_name>, + SIMCInstr { + let AsmString = asm_name # "{_e64_dpp} " # AsmDPP; + } + def _e64_dpp8#Gen.Suffix : VOPC64_DPP8_NoDst_t16<{0, op}, ps64, asm_name> { + let AsmString = asm_name # "{_e64_dpp} " # AsmDPP8; + } + } else { + def _e64_dpp#Gen.Suffix + : VOPC64_DPP16_NoDst<{0, op}, psDPP, asm_name>, + SIMCInstr { + let AsmString = asm_name # "{_e64_dpp} " # AsmDPP; + } + def _e64_dpp8#Gen.Suffix : VOPC64_DPP8_NoDst<{0, op}, ps64, asm_name> { + let AsmString = asm_name # "{_e64_dpp} " # AsmDPP8; + } } } // End if ps64.Pfl.HasExtVOP3DPP } // End DecoderNamespace @@ -1756,11 +1821,23 @@ multiclass VOPCX_Real_t16_gfx11 op, string asm_name, string OpName = NAME, string pseudo_mnemonic = ""> : VOPCX_Real_t16; +multiclass VOPCX_Real_t16_and_fake16_gfx11 op, string asm_name, + string OpName = NAME, string pseudo_mnemonic = ""> { + defm _t16: 
VOPCX_Real_t16_gfx11; + defm _fake16: VOPCX_Real_t16_gfx11; +} + multiclass VOPCX_Real_t16_gfx11_gfx12 op, string asm_name, string OpName = NAME, string pseudo_mnemonic = ""> : VOPCX_Real_t16, VOPCX_Real_t16; +multiclass VOPCX_Real_t16_and_fake16_gfx11_gfx12 op, string asm_name, + string OpName = NAME, string pseudo_mnemonic = ""> { + defm _t16: VOPCX_Real_t16_gfx11_gfx12; + defm _fake16: VOPCX_Real_t16_gfx11_gfx12; +} + defm V_CMP_F_F16_fake16 : VOPC_Real_t16_gfx11<0x000, "v_cmp_f_f16">; defm V_CMP_LT_F16 : VOPC_Real_t16_and_fake16_gfx11_gfx12<0x001, "v_cmp_lt_f16">; defm V_CMP_EQ_F16_fake16 : VOPC_Real_t16_gfx11_gfx12<0x002, "v_cmp_eq_f16">; @@ -1848,7 +1925,7 @@ defm V_CMP_CLASS_F32 : VOPC_Real_gfx11_gfx12<0x07e>; defm V_CMP_CLASS_F64 : VOPC_Real_gfx11_gfx12<0x07f>; defm V_CMPX_F_F16_fake16 : VOPCX_Real_t16_gfx11<0x080, "v_cmpx_f_f16">; -defm V_CMPX_LT_F16_fake16 : VOPCX_Real_t16_gfx11_gfx12<0x081, "v_cmpx_lt_f16">; +defm V_CMPX_LT_F16 : VOPCX_Real_t16_and_fake16_gfx11_gfx12<0x081, "v_cmpx_lt_f16">; defm V_CMPX_EQ_F16_fake16 : VOPCX_Real_t16_gfx11_gfx12<0x082, "v_cmpx_eq_f16">; defm V_CMPX_LE_F16_fake16 : VOPCX_Real_t16_gfx11_gfx12<0x083, "v_cmpx_le_f16">; defm V_CMPX_GT_F16_fake16 : VOPCX_Real_t16_gfx11_gfx12<0x084, "v_cmpx_gt_f16">; diff --git a/llvm/lib/Target/BPF/BPFAsmPrinter.cpp b/llvm/lib/Target/BPF/BPFAsmPrinter.cpp index ab03a4e56ea07..b3c27a3d1d6fa 100644 --- a/llvm/lib/Target/BPF/BPFAsmPrinter.cpp +++ b/llvm/lib/Target/BPF/BPFAsmPrinter.cpp @@ -60,7 +60,7 @@ bool BPFAsmPrinter::doInitialization(Module &M) { // Only emit BTF when debuginfo available. 
if (MAI->doesSupportDebugInformation() && !M.debug_compile_units().empty()) { BTF = new BTFDebug(this); - DebugHandlers.push_back(std::unique_ptr(BTF)); + Handlers.push_back(std::unique_ptr(BTF)); } return false; diff --git a/llvm/lib/Target/DirectX/DXIL.td b/llvm/lib/Target/DirectX/DXIL.td index 6fdd83c4dc877..4b20a64cb0722 100644 --- a/llvm/lib/Target/DirectX/DXIL.td +++ b/llvm/lib/Target/DirectX/DXIL.td @@ -303,6 +303,14 @@ defvar BarrierMode_GroupMemoryBarrierWithGroupSync = 9; defvar BarrierMode_AllMemoryBarrier = 10; defvar BarrierMode_AllMemoryBarrierWithGroupSync = 11; +defvar WaveOpKind_Sum = 0; +defvar WaveOpKind_Product = 1; +defvar WaveOpKind_Min = 2; +defvar WaveOpKind_Max = 3; + +defvar SignedOpKind_Signed = 0; +defvar SignedOpKind_Unsigned = 1; + // Intrinsic arg selection class IntrinArgSelectType; def IntrinArgSelect_Index : IntrinArgSelectType; @@ -340,7 +348,7 @@ class IntrinArgI32 : IntrinArgSelect; // IntrinSelect, IntrinArgI8<0>, IntrinArgI8<1> ] // >, -// IntrinSelect, IntrinArgI8<0>, IntrinArgI8<0> ] // >, // ] @@ -620,6 +628,18 @@ def CountBits : DXILOp<31, unaryBits> { let attributes = [Attributes]; } +def FirstbitLo : DXILOp<32, unaryBits> { + let Doc = "Returns the location of the first set bit starting from " + "the lowest order bit and working upward."; + let intrinsics = [ IntrinSelect ]; + let arguments = [OverloadTy]; + let result = Int32Ty; + let overloads = + [Overloads]; + let stages = [Stages]; + let attributes = [Attributes]; +} + def FirstbitHi : DXILOp<33, unaryBits> { let Doc = "Returns the location of the first set bit starting from " "the highest order bit and working downward."; @@ -979,6 +999,24 @@ def WaveActiveAnyTrue : DXILOp<113, waveAnyTrue> { let stages = [Stages]; } +def WaveActiveOp : DXILOp<119, waveActiveOp> { + let Doc = "returns the result of the operation across waves"; + let intrinsics = [ + IntrinSelect< + int_dx_wave_reduce_sum, + [ IntrinArgIndex<0>, IntrinArgI8, IntrinArgI8 ]>, + IntrinSelect< + 
int_dx_wave_reduce_usum, + [ IntrinArgIndex<0>, IntrinArgI8, IntrinArgI8 ]>, + ]; + + let arguments = [OverloadTy, Int8Ty, Int8Ty]; + let result = OverloadTy; + let overloads = [Overloads]; + let stages = [Stages]; + let attributes = [Attributes]; +} + def WaveIsFirstLane : DXILOp<110, waveIsFirstLane> { let Doc = "returns 1 for the first lane in the wave"; let intrinsics = [ IntrinSelect ]; diff --git a/llvm/lib/Target/DirectX/DirectXTargetTransformInfo.cpp b/llvm/lib/Target/DirectX/DirectXTargetTransformInfo.cpp index 4be1326085bc0..4e6e01bc5edbc 100644 --- a/llvm/lib/Target/DirectX/DirectXTargetTransformInfo.cpp +++ b/llvm/lib/Target/DirectX/DirectXTargetTransformInfo.cpp @@ -40,11 +40,14 @@ bool DirectXTTIImpl::isTargetIntrinsicTriviallyScalarizable( switch (ID) { case Intrinsic::dx_frac: case Intrinsic::dx_rsqrt: + case Intrinsic::dx_wave_reduce_sum: + case Intrinsic::dx_wave_reduce_usum: case Intrinsic::dx_wave_readlane: case Intrinsic::dx_asdouble: case Intrinsic::dx_splitdouble: case Intrinsic::dx_firstbituhigh: case Intrinsic::dx_firstbitshigh: + case Intrinsic::dx_firstbitlow: return true; default: return false; diff --git a/llvm/lib/Target/Hexagon/HexagonISelLowering.cpp b/llvm/lib/Target/Hexagon/HexagonISelLowering.cpp index 900a9054fc2c3..a19f9749cd9e3 100644 --- a/llvm/lib/Target/Hexagon/HexagonISelLowering.cpp +++ b/llvm/lib/Target/Hexagon/HexagonISelLowering.cpp @@ -1861,13 +1861,6 @@ HexagonTargetLowering::HexagonTargetLowering(const TargetMachine &TM, setLibcallName(RTLIB::UREM_I32, "__hexagon_umodsi3"); setLibcallName(RTLIB::UREM_I64, "__hexagon_umoddi3"); - setLibcallName(RTLIB::SINTTOFP_I128_F64, "__hexagon_floattidf"); - setLibcallName(RTLIB::SINTTOFP_I128_F32, "__hexagon_floattisf"); - setLibcallName(RTLIB::FPTOUINT_F32_I128, "__hexagon_fixunssfti"); - setLibcallName(RTLIB::FPTOUINT_F64_I128, "__hexagon_fixunsdfti"); - setLibcallName(RTLIB::FPTOSINT_F32_I128, "__hexagon_fixsfti"); - setLibcallName(RTLIB::FPTOSINT_F64_I128, 
"__hexagon_fixdfti"); - // This is the only fast library function for sqrtd. if (FastMath) setLibcallName(RTLIB::SQRT_F64, "__hexagon_fast2_sqrtdf2"); diff --git a/llvm/lib/Target/LoongArch/AsmParser/LoongArchAsmParser.cpp b/llvm/lib/Target/LoongArch/AsmParser/LoongArchAsmParser.cpp index efc8b77f8d8fa..420b98b8a9c1f 100644 --- a/llvm/lib/Target/LoongArch/AsmParser/LoongArchAsmParser.cpp +++ b/llvm/lib/Target/LoongArch/AsmParser/LoongArchAsmParser.cpp @@ -1009,7 +1009,8 @@ void LoongArchAsmParser::emitLoadAddressPcrel(MCInst &Inst, SMLoc IDLoc, Insts.push_back( LoongArchAsmParser::Inst(ADDI, LoongArchMCExpr::VK_LoongArch_PCALA_LO12)); - emitLAInstSeq(DestReg, DestReg, Symbol, Insts, IDLoc, Out, true); + emitLAInstSeq(DestReg, DestReg, Symbol, Insts, IDLoc, Out, + /*RelaxHint=*/true); } void LoongArchAsmParser::emitLoadAddressPcrelLarge(MCInst &Inst, SMLoc IDLoc, @@ -1083,7 +1084,8 @@ void LoongArchAsmParser::emitLoadAddressGot(MCInst &Inst, SMLoc IDLoc, Insts.push_back( LoongArchAsmParser::Inst(LD, LoongArchMCExpr::VK_LoongArch_GOT_PC_LO12)); - emitLAInstSeq(DestReg, DestReg, Symbol, Insts, IDLoc, Out, true); + emitLAInstSeq(DestReg, DestReg, Symbol, Insts, IDLoc, Out, + /*RelaxHint=*/true); } void LoongArchAsmParser::emitLoadAddressGotLarge(MCInst &Inst, SMLoc IDLoc, @@ -1176,7 +1178,8 @@ void LoongArchAsmParser::emitLoadAddressTLSIE(MCInst &Inst, SMLoc IDLoc, Insts.push_back(LoongArchAsmParser::Inst( LD, LoongArchMCExpr::VK_LoongArch_TLS_IE_PC_LO12)); - emitLAInstSeq(DestReg, DestReg, Symbol, Insts, IDLoc, Out); + emitLAInstSeq(DestReg, DestReg, Symbol, Insts, IDLoc, Out, + /*RelaxHint=*/true); } void LoongArchAsmParser::emitLoadAddressTLSIELarge(MCInst &Inst, SMLoc IDLoc, @@ -1248,7 +1251,8 @@ void LoongArchAsmParser::emitLoadAddressTLSLD(MCInst &Inst, SMLoc IDLoc, Insts.push_back(LoongArchAsmParser::Inst( ADDI, LoongArchMCExpr::VK_LoongArch_GOT_PC_LO12)); - emitLAInstSeq(DestReg, DestReg, Symbol, Insts, IDLoc, Out); + emitLAInstSeq(DestReg, DestReg, Symbol, 
Insts, IDLoc, Out, + /*RelaxHint=*/true); } void LoongArchAsmParser::emitLoadAddressTLSLDLarge(MCInst &Inst, SMLoc IDLoc, @@ -1320,7 +1324,8 @@ void LoongArchAsmParser::emitLoadAddressTLSGD(MCInst &Inst, SMLoc IDLoc, Insts.push_back(LoongArchAsmParser::Inst( ADDI, LoongArchMCExpr::VK_LoongArch_GOT_PC_LO12)); - emitLAInstSeq(DestReg, DestReg, Symbol, Insts, IDLoc, Out); + emitLAInstSeq(DestReg, DestReg, Symbol, Insts, IDLoc, Out, + /*RelaxHint=*/true); } void LoongArchAsmParser::emitLoadAddressTLSGDLarge(MCInst &Inst, SMLoc IDLoc, @@ -1409,7 +1414,8 @@ void LoongArchAsmParser::emitLoadAddressTLSDesc(MCInst &Inst, SMLoc IDLoc, Insts.push_back(LoongArchAsmParser::Inst( LoongArch::JIRL, LoongArchMCExpr::VK_LoongArch_TLS_DESC_CALL)); - emitLAInstSeq(DestReg, DestReg, Symbol, Insts, IDLoc, Out); + emitLAInstSeq(DestReg, DestReg, Symbol, Insts, IDLoc, Out, + /*RelaxHint=*/true); } void LoongArchAsmParser::emitLoadAddressTLSDescLarge(MCInst &Inst, SMLoc IDLoc, @@ -1500,8 +1506,9 @@ void LoongArchAsmParser::emitFuncCall36(MCInst &Inst, SMLoc IDLoc, IsTailCall ? Inst.getOperand(0).getReg() : MCRegister(LoongArch::R1); const MCExpr *Sym = IsTailCall ? 
Inst.getOperand(1).getExpr() : Inst.getOperand(0).getExpr(); - const LoongArchMCExpr *LE = LoongArchMCExpr::create( - Sym, llvm::LoongArchMCExpr::VK_LoongArch_CALL36, getContext()); + const LoongArchMCExpr *LE = + LoongArchMCExpr::create(Sym, llvm::LoongArchMCExpr::VK_LoongArch_CALL36, + getContext(), /*RelaxHint=*/true); Out.emitInstruction( MCInstBuilder(LoongArch::PCADDU18I).addReg(ScratchReg).addExpr(LE), diff --git a/llvm/lib/Target/LoongArch/MCTargetDesc/LoongArchMCCodeEmitter.cpp b/llvm/lib/Target/LoongArch/MCTargetDesc/LoongArchMCCodeEmitter.cpp index 359bde1244429..04d57f0fe7457 100644 --- a/llvm/lib/Target/LoongArch/MCTargetDesc/LoongArchMCCodeEmitter.cpp +++ b/llvm/lib/Target/LoongArch/MCTargetDesc/LoongArchMCCodeEmitter.cpp @@ -282,9 +282,11 @@ LoongArchMCCodeEmitter::getExprOpValue(const MCInst &MI, const MCOperand &MO, break; case LoongArchMCExpr::VK_LoongArch_TLS_LE_HI20_R: FixupKind = LoongArch::fixup_loongarch_tls_le_hi20_r; + RelaxCandidate = true; break; case LoongArchMCExpr::VK_LoongArch_TLS_LE_LO12_R: FixupKind = LoongArch::fixup_loongarch_tls_le_lo12_r; + RelaxCandidate = true; break; case LoongArchMCExpr::VK_LoongArch_PCREL20_S2: FixupKind = LoongArch::fixup_loongarch_pcrel20_s2; @@ -387,11 +389,17 @@ void LoongArchMCCodeEmitter::expandAddTPRel(const MCInst &MI, "Expected %le_add_r relocation on TP-relative symbol"); // Emit the correct %le_add_r relocation for the symbol. - // TODO: Emit R_LARCH_RELAX for %le_add_r where the relax feature is enabled. Fixups.push_back(MCFixup::create( 0, Expr, MCFixupKind(LoongArch::fixup_loongarch_tls_le_add_r), MI.getLoc())); + // Emit R_LARCH_RELAX for %le_add_r when the relax feature is enabled. + if (STI.hasFeature(LoongArch::FeatureRelax)) { + const MCConstantExpr *Dummy = MCConstantExpr::create(0, Ctx); + Fixups.push_back(MCFixup::create( + 0, Dummy, MCFixupKind(LoongArch::fixup_loongarch_relax), MI.getLoc())); + } + // Emit a normal ADD instruction with the given operands. 
unsigned ADD = MI.getOpcode() == LoongArch::PseudoAddTPRel_D ? LoongArch::ADD_D diff --git a/llvm/lib/Target/NVPTX/NVPTXISelDAGToDAG.cpp b/llvm/lib/Target/NVPTX/NVPTXISelDAGToDAG.cpp index 2e66b67dfdcc7..8f6adf2c22f92 100644 --- a/llvm/lib/Target/NVPTX/NVPTXISelDAGToDAG.cpp +++ b/llvm/lib/Target/NVPTX/NVPTXISelDAGToDAG.cpp @@ -11,6 +11,7 @@ //===----------------------------------------------------------------------===// #include "NVPTXISelDAGToDAG.h" +#include "NVPTX.h" #include "NVPTXUtilities.h" #include "llvm/Analysis/ValueTracking.h" #include "llvm/CodeGen/ISDOpcodes.h" @@ -191,6 +192,12 @@ void NVPTXDAGToDAGISel::Select(SDNode *N) { } break; } + case ISD::FADD: + case ISD::FMUL: + case ISD::FSUB: + if (tryBF16ArithToFMA(N)) + return; + break; default: break; } @@ -2450,6 +2457,62 @@ bool NVPTXDAGToDAGISel::tryBFE(SDNode *N) { return true; } +// Select bf16/bf16v2 FADD, FSUB, FMUL as fma on targets with only fma +bool NVPTXDAGToDAGISel::tryBF16ArithToFMA(SDNode *N) { + EVT VT = SDValue(N, 0).getValueType(); + if (VT.getScalarType() != MVT::bf16) + return false; + + const NVPTXSubtarget *STI = TM.getSubtargetImpl(); + if (STI->hasNativeBF16Support(N->getOpcode())) + return false; + + const bool IsVec = VT.isVector(); + assert(!IsVec || VT.getVectorNumElements() == 2); + SDLoc DL(N); + SDValue N0 = N->getOperand(0); + SDValue N1 = N->getOperand(1); + SmallVector Operands; + auto GetConstant = [&](float Value) -> SDValue { + // BF16 immediates must be legalized to integer register values + APFloat APF(Value); + bool LosesInfo; + APF.convert(APFloat::BFloat(), APFloat::rmNearestTiesToEven, &LosesInfo); + assert(!LosesInfo); + if (IsVec) { + auto API = APF.bitcastToAPInt(); + API = API.concat(API); + auto Const = CurDAG->getTargetConstant(API, DL, MVT::i32); + return SDValue(CurDAG->getMachineNode(NVPTX::IMOV32ri, DL, VT, Const), 0); + } + auto Const = CurDAG->getTargetConstantFP(APF, DL, VT); + return SDValue(CurDAG->getMachineNode(NVPTX::BFMOV16ri, DL, VT, Const), 
0); + }; + + switch (N->getOpcode()) { + case ISD::FADD: + // add(a, b) -> fma(a, 1.0, b) + Operands = {N0, GetConstant(1.0), N1}; + break; + case ISD::FSUB: + // sub(a, b) -> fma(b, -1.0, a) + Operands = {N1, GetConstant(-1.0), N0}; + break; + case ISD::FMUL: + // mul(a, b) -> fma(a, b, -0.0) + // NOTE: The identity is -0, not 0, because -0 + 0 == 0 for floats + Operands = {N0, N1, GetConstant(-0.0)}; + break; + default: + llvm_unreachable("Unexpected opcode"); + }; + + int Opcode = IsVec ? NVPTX::BFMA16x2rrr : NVPTX::BFMA16rrr; + MachineSDNode *FMA = CurDAG->getMachineNode(Opcode, DL, VT, Operands); + ReplaceNode(N, FMA); + return true; +} + static inline bool isAddLike(const SDValue V) { return V.getOpcode() == ISD::ADD || (V->getOpcode() == ISD::OR && V->getFlags().hasDisjoint()); diff --git a/llvm/lib/Target/NVPTX/NVPTXISelDAGToDAG.h b/llvm/lib/Target/NVPTX/NVPTXISelDAGToDAG.h index 8cadde8a82264..7661f153238fc 100644 --- a/llvm/lib/Target/NVPTX/NVPTXISelDAGToDAG.h +++ b/llvm/lib/Target/NVPTX/NVPTXISelDAGToDAG.h @@ -84,6 +84,7 @@ class LLVM_LIBRARY_VISIBILITY NVPTXDAGToDAGISel : public SelectionDAGISel { bool tryFence(SDNode *N); void SelectAddrSpaceCast(SDNode *N); bool tryBFE(SDNode *N); + bool tryBF16ArithToFMA(SDNode *N); bool tryConstantFP(SDNode *N); bool SelectSETP_F16X2(SDNode *N); bool SelectSETP_BF16X2(SDNode *N); diff --git a/llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp b/llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp index 208d724f7ae28..c40c09c204fd7 100644 --- a/llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp +++ b/llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp @@ -94,6 +94,13 @@ static cl::opt UsePrecSqrtF32( cl::desc("NVPTX Specific: 0 use sqrt.approx, 1 use sqrt.rn."), cl::init(true)); +/// Whereas CUDA's implementation (see libdevice) uses ex2.approx for exp2(), it +/// does NOT use lg2.approx for log2, so this is disabled by default. 
+static cl::opt UseApproxLog2F32( + "nvptx-approx-log2f32", + cl::desc("NVPTX Specific: whether to use lg2.approx for log2"), + cl::init(false)); + static cl::opt ForceMinByValParamAlign( "nvptx-force-min-byval-param-align", cl::Hidden, cl::desc("NVPTX Specific: force 4-byte minimal alignment for byval" @@ -529,40 +536,16 @@ NVPTXTargetLowering::NVPTXTargetLowering(const NVPTXTargetMachine &TM, case ISD::FMINIMUM: IsOpSupported &= STI.getSmVersion() >= 80 && STI.getPTXVersion() >= 70; break; + case ISD::FEXP2: + IsOpSupported &= STI.getSmVersion() >= 75 && STI.getPTXVersion() >= 70; + break; } setOperationAction(Op, VT, IsOpSupported ? Action : NoF16Action); }; auto setBF16OperationAction = [&](unsigned Op, MVT VT, LegalizeAction Action, LegalizeAction NoBF16Action) { - bool IsOpSupported = STI.hasBF16Math(); - switch (Op) { - // Several BF16 instructions are available on sm_90 only. - case ISD::FADD: - case ISD::FMUL: - case ISD::FSUB: - case ISD::SELECT: - case ISD::SELECT_CC: - case ISD::SETCC: - case ISD::FEXP2: - case ISD::FCEIL: - case ISD::FFLOOR: - case ISD::FNEARBYINT: - case ISD::FRINT: - case ISD::FROUNDEVEN: - case ISD::FTRUNC: - IsOpSupported = STI.getSmVersion() >= 90 && STI.getPTXVersion() >= 78; - break; - // Several BF16 instructions are available on sm_80 only. - case ISD::FMINNUM: - case ISD::FMAXNUM: - case ISD::FMAXNUM_IEEE: - case ISD::FMINNUM_IEEE: - case ISD::FMAXIMUM: - case ISD::FMINIMUM: - IsOpSupported &= STI.getSmVersion() >= 80 && STI.getPTXVersion() >= 70; - break; - } + bool IsOpSupported = STI.hasNativeBF16Support(Op); setOperationAction( Op, VT, IsOpSupported ? 
Action : NoBF16Action); }; @@ -862,6 +845,15 @@ NVPTXTargetLowering::NVPTXTargetLowering(const NVPTXTargetMachine &TM, AddPromotedToType(Op, MVT::bf16, MVT::f32); } + // On SM80, we select add/mul/sub as fma to avoid promotion to float + for (const auto &Op : {ISD::FADD, ISD::FMUL, ISD::FSUB}) { + for (const auto &VT : {MVT::bf16, MVT::v2bf16}) { + if (!STI.hasNativeBF16Support(Op) && STI.hasNativeBF16Support(ISD::FMA)) { + setOperationAction(Op, VT, Custom); + } + } + } + // f16/f16x2 neg was introduced in PTX 60, SM_53. const bool IsFP16FP16x2NegAvailable = STI.getSmVersion() >= 53 && STI.getPTXVersion() >= 60 && @@ -977,7 +969,26 @@ NVPTXTargetLowering::NVPTXTargetLowering(const NVPTXTargetMachine &TM, setOperationAction(ISD::CopyToReg, MVT::i128, Custom); setOperationAction(ISD::CopyFromReg, MVT::i128, Custom); - // No FEXP2, FLOG2. The PTX ex2 and log2 functions are always approximate. + // FEXP2 support: + // - f32 + // - f16/f16x2 (sm_70+, PTX 7.0+) + // - bf16/bf16x2 (sm_90+, PTX 7.8+) + // When f16/bf16 types aren't supported, they are promoted/expanded to f32. + setOperationAction(ISD::FEXP2, MVT::f32, Legal); + setFP16OperationAction(ISD::FEXP2, MVT::f16, Legal, Promote); + setFP16OperationAction(ISD::FEXP2, MVT::v2f16, Legal, Expand); + setBF16OperationAction(ISD::FEXP2, MVT::bf16, Legal, Promote); + setBF16OperationAction(ISD::FEXP2, MVT::v2bf16, Legal, Expand); + + // FLOG2 supports f32 only + // f16/bf16 types aren't supported, but they are promoted/expanded to f32. + if (UseApproxLog2F32) { + setOperationAction(ISD::FLOG2, MVT::f32, Legal); + setOperationPromotedToType(ISD::FLOG2, MVT::f16, MVT::f32); + setOperationPromotedToType(ISD::FLOG2, MVT::bf16, MVT::f32); + setOperationAction(ISD::FLOG2, {MVT::v2f16, MVT::v2bf16}, Expand); + } + // No FPOW or FREM in PTX. 
// Now deduce the information based on the above mentioned @@ -1046,7 +1057,6 @@ const char *NVPTXTargetLowering::getTargetNodeName(unsigned Opcode) const { MAKE_CASE(NVPTXISD::StoreV4) MAKE_CASE(NVPTXISD::FSHL_CLAMP) MAKE_CASE(NVPTXISD::FSHR_CLAMP) - MAKE_CASE(NVPTXISD::IMAD) MAKE_CASE(NVPTXISD::BFE) MAKE_CASE(NVPTXISD::BFI) MAKE_CASE(NVPTXISD::PRMT) @@ -2499,6 +2509,27 @@ SDValue NVPTXTargetLowering::LowerFROUND64(SDValue Op, return DAG.getNode(ISD::SELECT, SL, VT, IsLarge, A, RoundedA); } +static SDValue PromoteBinOpToF32(SDNode *N, SelectionDAG &DAG) { + EVT VT = N->getValueType(0); + EVT NVT = MVT::f32; + if (VT.isVector()) { + NVT = EVT::getVectorVT(*DAG.getContext(), NVT, VT.getVectorElementCount()); + } + SDLoc DL(N); + SDValue Tmp0 = DAG.getFPExtendOrRound(N->getOperand(0), DL, NVT); + SDValue Tmp1 = DAG.getFPExtendOrRound(N->getOperand(1), DL, NVT); + SDValue Res = DAG.getNode(N->getOpcode(), DL, NVT, Tmp0, Tmp1, N->getFlags()); + return DAG.getFPExtendOrRound(Res, DL, VT); +} + +SDValue NVPTXTargetLowering::PromoteBinOpIfF32FTZ(SDValue Op, + SelectionDAG &DAG) const { + if (useF32FTZ(DAG.getMachineFunction())) { + return PromoteBinOpToF32(Op.getNode(), DAG); + } + return Op; +} + SDValue NVPTXTargetLowering::LowerINT_TO_FP(SDValue Op, SelectionDAG &DAG) const { assert(STI.getSmVersion() < 90 || STI.getPTXVersion() < 78); @@ -2690,6 +2721,12 @@ NVPTXTargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const { return LowerSTACKSAVE(Op, DAG); case ISD::CopyToReg: return LowerCopyToReg_128(Op, DAG); + case ISD::FADD: + case ISD::FSUB: + case ISD::FMUL: + // Used only for bf16 on SM80, where we select fma for non-ftz operation + return PromoteBinOpIfF32FTZ(Op, DAG); + default: llvm_unreachable("Custom lowering not defined for operation"); } @@ -4451,14 +4488,8 @@ PerformADDCombineWithOperands(SDNode *N, SDValue N0, SDValue N1, if (!N0.getNode()->hasOneUse()) return SDValue(); - // fold (add (mul a, b), c) -> (mad a, b, c) - // - if (N0.getOpcode() == 
ISD::MUL) - return DCI.DAG.getNode(NVPTXISD::IMAD, SDLoc(N), VT, N0.getOperand(0), - N0.getOperand(1), N1); - // fold (add (select cond, 0, (mul a, b)), c) - // -> (select cond, c, (mad a, b, c)) + // -> (select cond, c, (add (mul a, b), c)) // if (N0.getOpcode() == ISD::SELECT) { unsigned ZeroOpNum; @@ -4473,8 +4504,10 @@ PerformADDCombineWithOperands(SDNode *N, SDValue N0, SDValue N1, if (M->getOpcode() != ISD::MUL || !M.getNode()->hasOneUse()) return SDValue(); - SDValue MAD = DCI.DAG.getNode(NVPTXISD::IMAD, SDLoc(N), VT, - M->getOperand(0), M->getOperand(1), N1); + SDLoc DL(N); + SDValue Mul = + DCI.DAG.getNode(ISD::MUL, DL, VT, M->getOperand(0), M->getOperand(1)); + SDValue MAD = DCI.DAG.getNode(ISD::ADD, DL, VT, Mul, N1); return DCI.DAG.getSelect(SDLoc(N), VT, N0->getOperand(0), ((ZeroOpNum == 1) ? N1 : MAD), ((ZeroOpNum == 1) ? MAD : N1)); @@ -4911,8 +4944,10 @@ static SDValue matchMADConstOnePattern(SDValue Add) { static SDValue combineMADConstOne(SDValue X, SDValue Add, EVT VT, SDLoc DL, TargetLowering::DAGCombinerInfo &DCI) { - if (SDValue Y = matchMADConstOnePattern(Add)) - return DCI.DAG.getNode(NVPTXISD::IMAD, DL, VT, X, Y, X); + if (SDValue Y = matchMADConstOnePattern(Add)) { + SDValue Mul = DCI.DAG.getNode(ISD::MUL, DL, VT, X, Y); + return DCI.DAG.getNode(ISD::ADD, DL, VT, Mul, X); + } return SDValue(); } @@ -4959,7 +4994,7 @@ PerformMULCombineWithOperands(SDNode *N, SDValue N0, SDValue N1, SDLoc DL(N); - // (mul x, (add y, 1)) -> (mad x, y, x) + // (mul x, (add y, 1)) -> (add (mul x, y), x) if (SDValue Res = combineMADConstOne(N0, N1, VT, DL, DCI)) return Res; if (SDValue Res = combineMADConstOne(N1, N0, VT, DL, DCI)) diff --git a/llvm/lib/Target/NVPTX/NVPTXISelLowering.h b/llvm/lib/Target/NVPTX/NVPTXISelLowering.h index 4a98fe21b81dc..5adf69d621552 100644 --- a/llvm/lib/Target/NVPTX/NVPTXISelLowering.h +++ b/llvm/lib/Target/NVPTX/NVPTXISelLowering.h @@ -55,7 +55,6 @@ enum NodeType : unsigned { FSHR_CLAMP, MUL_WIDE_SIGNED, MUL_WIDE_UNSIGNED, - IMAD, 
SETP_F16X2, SETP_BF16X2, BFE, @@ -279,6 +278,8 @@ class NVPTXTargetLowering : public TargetLowering { SDValue LowerFROUND32(SDValue Op, SelectionDAG &DAG) const; SDValue LowerFROUND64(SDValue Op, SelectionDAG &DAG) const; + SDValue PromoteBinOpIfF32FTZ(SDValue Op, SelectionDAG &DAG) const; + SDValue LowerINT_TO_FP(SDValue Op, SelectionDAG &DAG) const; SDValue LowerFP_TO_INT(SDValue Op, SelectionDAG &DAG) const; diff --git a/llvm/lib/Target/NVPTX/NVPTXInstrInfo.td b/llvm/lib/Target/NVPTX/NVPTXInstrInfo.td index f8dc66d598025..a076fde8ee767 100644 --- a/llvm/lib/Target/NVPTX/NVPTXInstrInfo.td +++ b/llvm/lib/Target/NVPTX/NVPTXInstrInfo.td @@ -141,6 +141,7 @@ def hasLDG : Predicate<"Subtarget->hasLDG()">; def hasLDU : Predicate<"Subtarget->hasLDU()">; def hasPTXASUnreachableBug : Predicate<"Subtarget->hasPTXASUnreachableBug()">; def noPTXASUnreachableBug : Predicate<"!Subtarget->hasPTXASUnreachableBug()">; +def hasOptEnabled : Predicate<"TM.getOptLevel() != CodeGenOptLevel::None">; def doF32FTZ : Predicate<"useF32FTZ()">; def doNoF32FTZ : Predicate<"!useF32FTZ()">; @@ -568,6 +569,18 @@ multiclass F2_Support_Half { } +// Variant where only .ftz.bf16 is supported. +multiclass F2_Support_Half_BF { + def bf16_ftz : NVPTXInst<(outs Int16Regs:$dst), (ins Int16Regs:$a), + OpcStr # ".ftz.bf16 \t$dst, $a;", + [(set bf16:$dst, (OpNode bf16:$a))]>, + Requires<[hasSM<90>, hasPTX<78>]>; + def bf16x2_ftz: NVPTXInst<(outs Int32Regs:$dst), (ins Int32Regs:$a), + OpcStr # ".ftz.bf16x2 \t$dst, $a;", + [(set v2bf16:$dst, (OpNode v2bf16:$a))]>, + Requires<[hasSM<90>, hasPTX<78>]>; +} + //===----------------------------------------------------------------------===// // NVPTX Instructions. 
//===----------------------------------------------------------------------===// @@ -1092,73 +1105,39 @@ def : Pat<(mul (zext i16:$a), (i32 UInt16Const:$b)), // // Integer multiply-add // -def SDTIMAD : - SDTypeProfile<1, 3, [SDTCisSameAs<0, 1>, SDTCisInt<0>, SDTCisInt<2>, - SDTCisSameAs<0, 2>, SDTCisSameAs<0, 3>]>; -def imad : SDNode<"NVPTXISD::IMAD", SDTIMAD>; - -def MAD16rrr : - NVPTXInst<(outs Int16Regs:$dst), - (ins Int16Regs:$a, Int16Regs:$b, Int16Regs:$c), - "mad.lo.s16 \t$dst, $a, $b, $c;", - [(set i16:$dst, (imad i16:$a, i16:$b, i16:$c))]>; -def MAD16rri : - NVPTXInst<(outs Int16Regs:$dst), - (ins Int16Regs:$a, Int16Regs:$b, i16imm:$c), - "mad.lo.s16 \t$dst, $a, $b, $c;", - [(set i16:$dst, (imad i16:$a, i16:$b, imm:$c))]>; -def MAD16rir : - NVPTXInst<(outs Int16Regs:$dst), - (ins Int16Regs:$a, i16imm:$b, Int16Regs:$c), - "mad.lo.s16 \t$dst, $a, $b, $c;", - [(set i16:$dst, (imad i16:$a, imm:$b, i16:$c))]>; -def MAD16rii : - NVPTXInst<(outs Int16Regs:$dst), - (ins Int16Regs:$a, i16imm:$b, i16imm:$c), - "mad.lo.s16 \t$dst, $a, $b, $c;", - [(set i16:$dst, (imad i16:$a, imm:$b, imm:$c))]>; - -def MAD32rrr : - NVPTXInst<(outs Int32Regs:$dst), - (ins Int32Regs:$a, Int32Regs:$b, Int32Regs:$c), - "mad.lo.s32 \t$dst, $a, $b, $c;", - [(set i32:$dst, (imad i32:$a, i32:$b, i32:$c))]>; -def MAD32rri : - NVPTXInst<(outs Int32Regs:$dst), - (ins Int32Regs:$a, Int32Regs:$b, i32imm:$c), - "mad.lo.s32 \t$dst, $a, $b, $c;", - [(set i32:$dst, (imad i32:$a, i32:$b, imm:$c))]>; -def MAD32rir : - NVPTXInst<(outs Int32Regs:$dst), - (ins Int32Regs:$a, i32imm:$b, Int32Regs:$c), - "mad.lo.s32 \t$dst, $a, $b, $c;", - [(set i32:$dst, (imad i32:$a, imm:$b, i32:$c))]>; -def MAD32rii : - NVPTXInst<(outs Int32Regs:$dst), - (ins Int32Regs:$a, i32imm:$b, i32imm:$c), - "mad.lo.s32 \t$dst, $a, $b, $c;", - [(set i32:$dst, (imad i32:$a, imm:$b, imm:$c))]>; - -def MAD64rrr : - NVPTXInst<(outs Int64Regs:$dst), - (ins Int64Regs:$a, Int64Regs:$b, Int64Regs:$c), - "mad.lo.s64 \t$dst, $a, $b, $c;", - 
[(set i64:$dst, (imad i64:$a, i64:$b, i64:$c))]>; -def MAD64rri : - NVPTXInst<(outs Int64Regs:$dst), - (ins Int64Regs:$a, Int64Regs:$b, i64imm:$c), - "mad.lo.s64 \t$dst, $a, $b, $c;", - [(set i64:$dst, (imad i64:$a, i64:$b, imm:$c))]>; -def MAD64rir : - NVPTXInst<(outs Int64Regs:$dst), - (ins Int64Regs:$a, i64imm:$b, Int64Regs:$c), - "mad.lo.s64 \t$dst, $a, $b, $c;", - [(set i64:$dst, (imad i64:$a, imm:$b, i64:$c))]>; -def MAD64rii : - NVPTXInst<(outs Int64Regs:$dst), - (ins Int64Regs:$a, i64imm:$b, i64imm:$c), - "mad.lo.s64 \t$dst, $a, $b, $c;", - [(set i64:$dst, (imad i64:$a, imm:$b, imm:$c))]>; +def mul_oneuse : PatFrag<(ops node:$a, node:$b), (mul node:$a, node:$b), [{ + return N->hasOneUse(); +}]>; + +multiclass MAD { + def rrr: + NVPTXInst<(outs Reg:$dst), + (ins Reg:$a, Reg:$b, Reg:$c), + Ptx # " \t$dst, $a, $b, $c;", + [(set VT:$dst, (add (mul_oneuse VT:$a, VT:$b), VT:$c))]>; + + def rir: + NVPTXInst<(outs Reg:$dst), + (ins Reg:$a, Imm:$b, Reg:$c), + Ptx # " \t$dst, $a, $b, $c;", + [(set VT:$dst, (add (mul_oneuse VT:$a, imm:$b), VT:$c))]>; + def rri: + NVPTXInst<(outs Reg:$dst), + (ins Reg:$a, Reg:$b, Imm:$c), + Ptx # " \t$dst, $a, $b, $c;", + [(set VT:$dst, (add (mul_oneuse VT:$a, VT:$b), imm:$c))]>; + def rii: + NVPTXInst<(outs Reg:$dst), + (ins Reg:$a, Imm:$b, Imm:$c), + Ptx # " \t$dst, $a, $b, $c;", + [(set VT:$dst, (add (mul_oneuse VT:$a, imm:$b), imm:$c))]>; +} + +let Predicates = [hasOptEnabled] in { +defm MAD16 : MAD<"mad.lo.s16", i16, Int16Regs, i16imm>; +defm MAD32 : MAD<"mad.lo.s32", i32, Int32Regs, i32imm>; +defm MAD64 : MAD<"mad.lo.s64", i64, Int64Regs, i64imm>; +} def INEG16 : NVPTXInst<(outs Int16Regs:$dst), (ins Int16Regs:$src), @@ -1216,6 +1195,8 @@ defm FNEG_H: F2_Support_Half<"neg", fneg>; defm FSQRT : F2<"sqrt.rn", fsqrt>; +defm FEXP2_H: F2_Support_Half_BF<"ex2.approx", fexp2>; + // // F16 NEG // diff --git a/llvm/lib/Target/NVPTX/NVPTXIntrinsics.td b/llvm/lib/Target/NVPTX/NVPTXIntrinsics.td index 2d6ee2e28b4df..48d75728aef8e 100644 --- 
a/llvm/lib/Target/NVPTX/NVPTXIntrinsics.td +++ b/llvm/lib/Target/NVPTX/NVPTXIntrinsics.td @@ -1304,11 +1304,21 @@ def INT_NVVM_EX2_APPROX_F : F_MATH_1<"ex2.approx.f32 \t$dst, $src0;", Float32Regs, Float32Regs, int_nvvm_ex2_approx_f>; def INT_NVVM_EX2_APPROX_D : F_MATH_1<"ex2.approx.f64 \t$dst, $src0;", Float64Regs, Float64Regs, int_nvvm_ex2_approx_d>; + def INT_NVVM_EX2_APPROX_F16 : F_MATH_1<"ex2.approx.f16 \t$dst, $src0;", Int16Regs, Int16Regs, int_nvvm_ex2_approx_f16, [hasPTX<70>, hasSM<75>]>; def INT_NVVM_EX2_APPROX_F16X2 : F_MATH_1<"ex2.approx.f16x2 \t$dst, $src0;", Int32Regs, Int32Regs, int_nvvm_ex2_approx_f16x2, [hasPTX<70>, hasSM<75>]>; +def : Pat<(fexp2 f32:$a), + (INT_NVVM_EX2_APPROX_FTZ_F $a)>, Requires<[doF32FTZ]>; +def : Pat<(fexp2 f32:$a), + (INT_NVVM_EX2_APPROX_F $a)>, Requires<[doNoF32FTZ]>; +def : Pat<(fexp2 f16:$a), + (INT_NVVM_EX2_APPROX_F16 $a)>, Requires<[useFP16Math]>; +def : Pat<(fexp2 v2f16:$a), + (INT_NVVM_EX2_APPROX_F16X2 $a)>, Requires<[useFP16Math]>; + def INT_NVVM_LG2_APPROX_FTZ_F : F_MATH_1<"lg2.approx.ftz.f32 \t$dst, $src0;", Float32Regs, Float32Regs, int_nvvm_lg2_approx_ftz_f>; def INT_NVVM_LG2_APPROX_F : F_MATH_1<"lg2.approx.f32 \t$dst, $src0;", @@ -1316,6 +1326,11 @@ def INT_NVVM_LG2_APPROX_F : F_MATH_1<"lg2.approx.f32 \t$dst, $src0;", def INT_NVVM_LG2_APPROX_D : F_MATH_1<"lg2.approx.f64 \t$dst, $src0;", Float64Regs, Float64Regs, int_nvvm_lg2_approx_d>; +def : Pat<(flog2 f32:$a), (INT_NVVM_LG2_APPROX_FTZ_F $a)>, + Requires<[doF32FTZ]>; +def : Pat<(flog2 f32:$a), (INT_NVVM_LG2_APPROX_F $a)>, + Requires<[doNoF32FTZ]>; + // // Sin Cos // diff --git a/llvm/lib/Target/NVPTX/NVPTXSubtarget.cpp b/llvm/lib/Target/NVPTX/NVPTXSubtarget.cpp index 74ce6a9fc4ac0..e5d680c19d921 100644 --- a/llvm/lib/Target/NVPTX/NVPTXSubtarget.cpp +++ b/llvm/lib/Target/NVPTX/NVPTXSubtarget.cpp @@ -70,6 +70,38 @@ bool NVPTXSubtarget::allowFP16Math() const { return hasFP16Math() && NoF16Math == false; } +bool NVPTXSubtarget::hasNativeBF16Support(int Opcode) const { 
+ if (!hasBF16Math()) + return false; + + switch (Opcode) { + // Several BF16 instructions are available on sm_90 only. + case ISD::FADD: + case ISD::FMUL: + case ISD::FSUB: + case ISD::SELECT: + case ISD::SELECT_CC: + case ISD::SETCC: + case ISD::FEXP2: + case ISD::FCEIL: + case ISD::FFLOOR: + case ISD::FNEARBYINT: + case ISD::FRINT: + case ISD::FROUNDEVEN: + case ISD::FTRUNC: + return getSmVersion() >= 90 && getPTXVersion() >= 78; + // Several BF16 instructions are available on sm_80 only. + case ISD::FMINNUM: + case ISD::FMAXNUM: + case ISD::FMAXNUM_IEEE: + case ISD::FMINNUM_IEEE: + case ISD::FMAXIMUM: + case ISD::FMINIMUM: + return getSmVersion() >= 80 && getPTXVersion() >= 70; + } + return true; +} + void NVPTXSubtarget::failIfClustersUnsupported( std::string const &FailureMessage) const { if (hasClusters()) diff --git a/llvm/lib/Target/NVPTX/NVPTXSubtarget.h b/llvm/lib/Target/NVPTX/NVPTXSubtarget.h index bbc1cca7c12d8..3b5c28e357e0c 100644 --- a/llvm/lib/Target/NVPTX/NVPTXSubtarget.h +++ b/llvm/lib/Target/NVPTX/NVPTXSubtarget.h @@ -118,6 +118,8 @@ class NVPTXSubtarget : public NVPTXGenSubtargetInfo { } bool hasTargetName() const { return !TargetName.empty(); } + bool hasNativeBF16Support(int Opcode) const; + // Get maximum value of required alignments among the supported data types. 
// From the PTX ISA doc, section 8.2.3: // The memory consistency model relates operations executed on memory diff --git a/llvm/lib/Target/RISCV/RISCVFrameLowering.cpp b/llvm/lib/Target/RISCV/RISCVFrameLowering.cpp index 911cea27a48ac..333c8060f37f4 100644 --- a/llvm/lib/Target/RISCV/RISCVFrameLowering.cpp +++ b/llvm/lib/Target/RISCV/RISCVFrameLowering.cpp @@ -502,7 +502,7 @@ getPushOrLibCallsSavedInfo(const MachineFunction &MF, void RISCVFrameLowering::allocateAndProbeStackForRVV( MachineFunction &MF, MachineBasicBlock &MBB, MachineBasicBlock::iterator MBBI, const DebugLoc &DL, int64_t Amount, - MachineInstr::MIFlag Flag, bool EmitCFI) const { + MachineInstr::MIFlag Flag, bool EmitCFI, bool DynAllocation) const { assert(Amount != 0 && "Did not need to adjust stack pointer for RVV."); // Emit a variable-length allocation probing loop. @@ -545,6 +545,15 @@ void RISCVFrameLowering::allocateAndProbeStackForRVV( .addReg(SPReg) .addReg(TargetReg) .setMIFlag(Flag); + + // If we have a dynamic allocation later we need to probe any residuals. + if (DynAllocation) { + BuildMI(MBB, MBBI, DL, TII->get(STI.is64Bit() ? RISCV::SD : RISCV::SW)) + .addReg(RISCV::X0) + .addReg(SPReg) + .addImm(0) + .setMIFlags(MachineInstr::FrameSetup); + } } static void appendScalableVectorExpression(const TargetRegisterInfo &TRI, @@ -634,11 +643,12 @@ void RISCVFrameLowering::allocateStack(MachineBasicBlock &MBB, MachineBasicBlock::iterator MBBI, MachineFunction &MF, uint64_t Offset, uint64_t RealStackSize, bool EmitCFI, - bool NeedProbe, - uint64_t ProbeSize) const { + bool NeedProbe, uint64_t ProbeSize, + bool DynAllocation) const { DebugLoc DL; const RISCVRegisterInfo *RI = STI.getRegisterInfo(); const RISCVInstrInfo *TII = STI.getInstrInfo(); + bool IsRV64 = STI.is64Bit(); // Simply allocate the stack if it's not big enough to require a probe. 
if (!NeedProbe || Offset <= ProbeSize) { @@ -654,13 +664,21 @@ void RISCVFrameLowering::allocateStack(MachineBasicBlock &MBB, .setMIFlag(MachineInstr::FrameSetup); } + if (NeedProbe && DynAllocation) { + // s[d|w] zero, 0(sp) + BuildMI(MBB, MBBI, DL, TII->get(IsRV64 ? RISCV::SD : RISCV::SW)) + .addReg(RISCV::X0) + .addReg(SPReg) + .addImm(0) + .setMIFlags(MachineInstr::FrameSetup); + } + return; } // Unroll the probe loop depending on the number of iterations. if (Offset < ProbeSize * 5) { uint64_t CurrentOffset = 0; - bool IsRV64 = STI.is64Bit(); while (CurrentOffset + ProbeSize <= Offset) { RI->adjustReg(MBB, MBBI, DL, SPReg, SPReg, StackOffset::getFixed(-ProbeSize), MachineInstr::FrameSetup, @@ -696,6 +714,15 @@ void RISCVFrameLowering::allocateStack(MachineBasicBlock &MBB, .addCFIIndex(CFIIndex) .setMIFlag(MachineInstr::FrameSetup); } + + if (DynAllocation) { + // s[d|w] zero, 0(sp) + BuildMI(MBB, MBBI, DL, TII->get(IsRV64 ? RISCV::SD : RISCV::SW)) + .addReg(RISCV::X0) + .addReg(SPReg) + .addImm(0) + .setMIFlags(MachineInstr::FrameSetup); + } } return; @@ -736,9 +763,18 @@ void RISCVFrameLowering::allocateStack(MachineBasicBlock &MBB, .setMIFlags(MachineInstr::FrameSetup); } - if (Residual) + if (Residual) { RI->adjustReg(MBB, MBBI, DL, SPReg, SPReg, StackOffset::getFixed(-Residual), MachineInstr::FrameSetup, getStackAlign()); + if (DynAllocation) { + // s[d|w] zero, 0(sp) + BuildMI(MBB, MBBI, DL, TII->get(IsRV64 ? 
RISCV::SD : RISCV::SW)) + .addReg(RISCV::X0) + .addReg(SPReg) + .addImm(0) + .setMIFlags(MachineInstr::FrameSetup); + } + } if (EmitCFI) { // Emit ".cfi_def_cfa_offset Offset" @@ -869,9 +905,11 @@ void RISCVFrameLowering::emitPrologue(MachineFunction &MF, const RISCVTargetLowering *TLI = Subtarget.getTargetLowering(); bool NeedProbe = TLI->hasInlineStackProbe(MF); uint64_t ProbeSize = TLI->getStackProbeSize(MF, getStackAlign()); + bool DynAllocation = + MF.getInfo()->hasDynamicAllocation(); if (StackSize != 0) allocateStack(MBB, MBBI, MF, StackSize, RealStackSize, /*EmitCFI=*/true, - NeedProbe, ProbeSize); + NeedProbe, ProbeSize, DynAllocation); // The frame pointer is callee-saved, and code has been generated for us to // save it to the stack. We need to skip over the storing of callee-saved @@ -914,13 +952,14 @@ void RISCVFrameLowering::emitPrologue(MachineFunction &MF, allocateStack(MBB, MBBI, MF, SecondSPAdjustAmount, getStackSizeWithRVVPadding(MF), !hasFP(MF), NeedProbe, - ProbeSize); + ProbeSize, DynAllocation); } if (RVVStackSize) { if (NeedProbe) { allocateAndProbeStackForRVV(MF, MBB, MBBI, DL, RVVStackSize, - MachineInstr::FrameSetup, !hasFP(MF)); + MachineInstr::FrameSetup, !hasFP(MF), + DynAllocation); } else { // We must keep the stack pointer aligned through any intermediate // updates. 
@@ -2148,6 +2187,7 @@ static void emitStackProbeInline(MachineFunction &MF, MachineBasicBlock &MBB, } ExitMBB->splice(ExitMBB->end(), &MBB, std::next(MBBI), MBB.end()); + ExitMBB->transferSuccessorsAndUpdatePHIs(&MBB); LoopTestMBB->addSuccessor(ExitMBB); LoopTestMBB->addSuccessor(LoopTestMBB); diff --git a/llvm/lib/Target/RISCV/RISCVFrameLowering.h b/llvm/lib/Target/RISCV/RISCVFrameLowering.h index 26d2a26d681c3..d013755ce58a0 100644 --- a/llvm/lib/Target/RISCV/RISCVFrameLowering.h +++ b/llvm/lib/Target/RISCV/RISCVFrameLowering.h @@ -81,7 +81,7 @@ class RISCVFrameLowering : public TargetFrameLowering { void allocateStack(MachineBasicBlock &MBB, MachineBasicBlock::iterator MBBI, MachineFunction &MF, uint64_t Offset, uint64_t RealStackSize, bool EmitCFI, bool NeedProbe, - uint64_t ProbeSize) const; + uint64_t ProbeSize, bool DynAllocation) const; protected: const RISCVSubtarget &STI; @@ -110,8 +110,8 @@ class RISCVFrameLowering : public TargetFrameLowering { void allocateAndProbeStackForRVV(MachineFunction &MF, MachineBasicBlock &MBB, MachineBasicBlock::iterator MBBI, const DebugLoc &DL, int64_t Amount, - MachineInstr::MIFlag Flag, - bool EmitCFI) const; + MachineInstr::MIFlag Flag, bool EmitCFI, + bool DynAllocation) const; }; } // namespace llvm #endif diff --git a/llvm/lib/Target/RISCV/RISCVGatherScatterLowering.cpp b/llvm/lib/Target/RISCV/RISCVGatherScatterLowering.cpp index a71e6bbb93638..39c0af7985971 100644 --- a/llvm/lib/Target/RISCV/RISCVGatherScatterLowering.cpp +++ b/llvm/lib/Target/RISCV/RISCVGatherScatterLowering.cpp @@ -211,10 +211,6 @@ bool RISCVGatherScatterLowering::matchStridedRecurrence(Value *Index, Loop *L, assert(Phi->getIncomingValue(IncrementingBlock) == Inc && "Expected one operand of phi to be Inc"); - // Only proceed if the step is loop invariant. - if (!L->isLoopInvariant(Step)) - return false; - // Step should be a splat. 
Step = getSplatValue(Step); if (!Step) @@ -298,6 +294,7 @@ bool RISCVGatherScatterLowering::matchStridedRecurrence(Value *Index, Loop *L, BasePtr->getIncomingBlock(StartBlock)->getTerminator()); Builder.SetCurrentDebugLocation(DebugLoc()); + // TODO: Share this switch with matchStridedStart? switch (BO->getOpcode()) { default: llvm_unreachable("Unexpected opcode!"); @@ -310,18 +307,32 @@ bool RISCVGatherScatterLowering::matchStridedRecurrence(Value *Index, Loop *L, } case Instruction::Mul: { Start = Builder.CreateMul(Start, SplatOp, "start"); - Step = Builder.CreateMul(Step, SplatOp, "step"); Stride = Builder.CreateMul(Stride, SplatOp, "stride"); break; } case Instruction::Shl: { Start = Builder.CreateShl(Start, SplatOp, "start"); - Step = Builder.CreateShl(Step, SplatOp, "step"); Stride = Builder.CreateShl(Stride, SplatOp, "stride"); break; } } + // If the Step was defined inside the loop, adjust it before its definition + // instead of in the preheader. + if (auto *StepI = dyn_cast(Step); StepI && L->contains(StepI)) + Builder.SetInsertPoint(*StepI->getInsertionPointAfterDef()); + + switch (BO->getOpcode()) { + default: + break; + case Instruction::Mul: + Step = Builder.CreateMul(Step, SplatOp, "step"); + break; + case Instruction::Shl: + Step = Builder.CreateShl(Step, SplatOp, "step"); + break; + } + Inc->setOperand(StepIndex, Step); BasePtr->setIncomingValue(StartBlock, Start); return true; diff --git a/llvm/lib/Target/RISCV/RISCVISelDAGToDAG.cpp b/llvm/lib/Target/RISCV/RISCVISelDAGToDAG.cpp index 9ccf95970e5b5..36292e3d572cb 100644 --- a/llvm/lib/Target/RISCV/RISCVISelDAGToDAG.cpp +++ b/llvm/lib/Target/RISCV/RISCVISelDAGToDAG.cpp @@ -3791,15 +3791,6 @@ static bool isImplicitDef(SDValue V) { return V.getMachineOpcode() == TargetOpcode::IMPLICIT_DEF; } -static bool hasGPROut(unsigned Opc) { - switch (RISCV::getRVVMCOpcode(Opc)) { - case RISCV::VCPOP_M: - case RISCV::VFIRST_M: - return true; - } - return false; -} - // Optimize masked RVV pseudo instructions with 
a known all-ones mask to their // corresponding "unmasked" pseudo versions. The mask we're interested in will // take the form of a V0 physical register operand, with a glued @@ -3818,19 +3809,22 @@ bool RISCVDAGToDAGISel::doPeepholeMaskedRVV(MachineSDNode *N) { // everything else. See the comment on RISCVMaskedPseudo for details. const unsigned Opc = I->UnmaskedPseudo; const MCInstrDesc &MCID = TII->get(Opc); - const bool UseTUPseudo = RISCVII::hasVecPolicyOp(MCID.TSFlags); -#ifndef NDEBUG + const bool HasPassthru = RISCVII::isFirstDefTiedToFirstUse(MCID); + const MCInstrDesc &MaskedMCID = TII->get(N->getMachineOpcode()); + const bool MaskedHasPassthru = RISCVII::isFirstDefTiedToFirstUse(MaskedMCID); + assert(RISCVII::hasVecPolicyOp(MaskedMCID.TSFlags) == RISCVII::hasVecPolicyOp(MCID.TSFlags) && "Masked and unmasked pseudos are inconsistent"); - const bool HasTiedDest = RISCVII::isFirstDefTiedToFirstUse(MCID); - assert(UseTUPseudo == HasTiedDest && "Unexpected pseudo structure"); -#endif + assert(RISCVII::hasVecPolicyOp(MCID.TSFlags) == HasPassthru && + "Unexpected pseudo structure"); + assert(!(HasPassthru && !MaskedHasPassthru) && + "Unmasked pseudo has passthru but masked pseudo doesn't?"); SmallVector Ops; - // Skip the passthru operand at index 0 if !UseTUPseudo and no GPR out. - bool ShouldSkip = !UseTUPseudo && !hasGPROut(Opc); + // Skip the passthru operand at index 0 if the unmasked don't have one. + bool ShouldSkip = !HasPassthru && MaskedHasPassthru; for (unsigned I = ShouldSkip, E = N->getNumOperands(); I != E; I++) { // Skip the mask, and the Glue. 
SDValue Op = N->getOperand(I); diff --git a/llvm/lib/Target/RISCV/RISCVISelLowering.cpp b/llvm/lib/Target/RISCV/RISCVISelLowering.cpp index b25cb128bce9f..f4f511a7368f8 100644 --- a/llvm/lib/Target/RISCV/RISCVISelLowering.cpp +++ b/llvm/lib/Target/RISCV/RISCVISelLowering.cpp @@ -280,7 +280,7 @@ RISCVTargetLowering::RISCVTargetLowering(const TargetMachine &TM, MVT::i1, Promote); // TODO: add all necessary setOperationAction calls. - setOperationAction(ISD::DYNAMIC_STACKALLOC, XLenVT, Expand); + setOperationAction(ISD::DYNAMIC_STACKALLOC, XLenVT, Custom); setOperationAction(ISD::BR_JT, MVT::Other, Expand); setOperationAction(ISD::BR_CC, XLenVT, Expand); @@ -1524,13 +1524,18 @@ RISCVTargetLowering::RISCVTargetLowering(const TargetMachine &TM, setTargetDAGCombine({ISD::ZERO_EXTEND, ISD::FP_TO_SINT, ISD::FP_TO_UINT, ISD::FP_TO_SINT_SAT, ISD::FP_TO_UINT_SAT}); if (Subtarget.hasVInstructions()) - setTargetDAGCombine({ISD::FCOPYSIGN, ISD::MGATHER, ISD::MSCATTER, - ISD::VP_GATHER, ISD::VP_SCATTER, ISD::SRA, ISD::SRL, - ISD::SHL, ISD::STORE, ISD::SPLAT_VECTOR, - ISD::BUILD_VECTOR, ISD::CONCAT_VECTORS, - ISD::EXPERIMENTAL_VP_REVERSE, ISD::MUL, - ISD::SDIV, ISD::UDIV, ISD::SREM, ISD::UREM, - ISD::INSERT_VECTOR_ELT, ISD::ABS, ISD::CTPOP}); + setTargetDAGCombine({ISD::FCOPYSIGN, ISD::MGATHER, + ISD::MSCATTER, ISD::VP_GATHER, + ISD::VP_SCATTER, ISD::SRA, + ISD::SRL, ISD::SHL, + ISD::STORE, ISD::SPLAT_VECTOR, + ISD::BUILD_VECTOR, ISD::CONCAT_VECTORS, + ISD::VP_STORE, ISD::EXPERIMENTAL_VP_REVERSE, + ISD::MUL, ISD::SDIV, + ISD::UDIV, ISD::SREM, + ISD::UREM, ISD::INSERT_VECTOR_ELT, + ISD::ABS, ISD::CTPOP, + ISD::VECTOR_SHUFFLE}); if (Subtarget.hasVendorXTHeadMemPair()) setTargetDAGCombine({ISD::LOAD, ISD::STORE}); if (Subtarget.useRVVForFixedLengthVectors()) @@ -5728,14 +5733,14 @@ static SDValue lowerVECTOR_SHUFFLE(SDValue Op, SelectionDAG &DAG, } bool RISCVTargetLowering::isShuffleMaskLegal(ArrayRef M, EVT VT) const { - // Support splats for any type. 
These should type legalize well. - if (ShuffleVectorSDNode::isSplatMask(M.data(), VT)) - return true; - // Only support legal VTs for other shuffles for now. if (!isTypeLegal(VT)) return false; + // Support splats for any type. These should type legalize well. + if (ShuffleVectorSDNode::isSplatMask(M.data(), VT)) + return true; + MVT SVT = VT.getSimpleVT(); // Not for i1 vectors. @@ -7727,6 +7732,8 @@ SDValue RISCVTargetLowering::LowerOperation(SDValue Op, return emitFlushICache(DAG, Op.getOperand(0), Op.getOperand(1), Op.getOperand(2), Flags, DL); } + case ISD::DYNAMIC_STACKALLOC: + return lowerDYNAMIC_STACKALLOC(Op, DAG); case ISD::INIT_TRAMPOLINE: return lowerINIT_TRAMPOLINE(Op, DAG); case ISD::ADJUST_TRAMPOLINE: @@ -16229,6 +16236,127 @@ static SDValue performBITREVERSECombine(SDNode *N, SelectionDAG &DAG, return DAG.getNode(RISCVISD::BREV8, DL, VT, Src.getOperand(0)); } +static SDValue performVP_REVERSECombine(SDNode *N, SelectionDAG &DAG, + const RISCVSubtarget &Subtarget) { + // Fold: + // vp.reverse(vp.load(ADDR, MASK)) -> vp.strided.load(ADDR, -1, MASK) + + // Check if its first operand is a vp.load. + auto *VPLoad = dyn_cast(N->getOperand(0)); + if (!VPLoad) + return SDValue(); + + EVT LoadVT = VPLoad->getValueType(0); + // We do not have a strided_load version for masks, and the evl of vp.reverse + // and vp.load should always be the same. + if (!LoadVT.getVectorElementType().isByteSized() || + N->getOperand(2) != VPLoad->getVectorLength() || + !N->getOperand(0).hasOneUse()) + return SDValue(); + + // Check if the mask of outer vp.reverse are all 1's. + if (!isOneOrOneSplat(N->getOperand(1))) + return SDValue(); + + SDValue LoadMask = VPLoad->getMask(); + // If Mask is all ones, then load is unmasked and can be reversed. + if (!isOneOrOneSplat(LoadMask)) { + // If the mask is not all ones, we can reverse the load if the mask was also + // reversed by an unmasked vp.reverse with the same EVL. 
+ if (LoadMask.getOpcode() != ISD::EXPERIMENTAL_VP_REVERSE || + !isOneOrOneSplat(LoadMask.getOperand(1)) || + LoadMask.getOperand(2) != VPLoad->getVectorLength()) + return SDValue(); + LoadMask = LoadMask.getOperand(0); + } + + // Base = LoadAddr + (NumElem - 1) * ElemWidthByte + SDLoc DL(N); + MVT XLenVT = Subtarget.getXLenVT(); + SDValue NumElem = VPLoad->getVectorLength(); + uint64_t ElemWidthByte = VPLoad->getValueType(0).getScalarSizeInBits() / 8; + + SDValue Temp1 = DAG.getNode(ISD::SUB, DL, XLenVT, NumElem, + DAG.getConstant(1, DL, XLenVT)); + SDValue Temp2 = DAG.getNode(ISD::MUL, DL, XLenVT, Temp1, + DAG.getConstant(ElemWidthByte, DL, XLenVT)); + SDValue Base = DAG.getNode(ISD::ADD, DL, XLenVT, VPLoad->getBasePtr(), Temp2); + SDValue Stride = DAG.getConstant(-ElemWidthByte, DL, XLenVT); + + MachineFunction &MF = DAG.getMachineFunction(); + MachinePointerInfo PtrInfo(VPLoad->getAddressSpace()); + MachineMemOperand *MMO = MF.getMachineMemOperand( + PtrInfo, VPLoad->getMemOperand()->getFlags(), + LocationSize::beforeOrAfterPointer(), VPLoad->getAlign()); + + SDValue Ret = DAG.getStridedLoadVP( + LoadVT, DL, VPLoad->getChain(), Base, Stride, LoadMask, + VPLoad->getVectorLength(), MMO, VPLoad->isExpandingLoad()); + + DAG.ReplaceAllUsesOfValueWith(SDValue(VPLoad, 1), Ret.getValue(1)); + + return Ret; +} + +static SDValue performVP_STORECombine(SDNode *N, SelectionDAG &DAG, + const RISCVSubtarget &Subtarget) { + // Fold: + // vp.store(vp.reverse(VAL), ADDR, MASK) -> vp.strided.store(VAL, NEW_ADDR, + // -1, MASK) + auto *VPStore = cast(N); + + if (VPStore->getValue().getOpcode() != ISD::EXPERIMENTAL_VP_REVERSE) + return SDValue(); + + SDValue VPReverse = VPStore->getValue(); + EVT ReverseVT = VPReverse->getValueType(0); + + // We do not have a strided_store version for masks, and the evl of vp.reverse + // and vp.store should always be the same. 
+ if (!ReverseVT.getVectorElementType().isByteSized() || + VPStore->getVectorLength() != VPReverse.getOperand(2) || + !VPReverse.hasOneUse()) + return SDValue(); + + SDValue StoreMask = VPStore->getMask(); + // If Mask is all ones, then load is unmasked and can be reversed. + if (!isOneOrOneSplat(StoreMask)) { + // If the mask is not all ones, we can reverse the store if the mask was + // also reversed by an unmasked vp.reverse with the same EVL. + if (StoreMask.getOpcode() != ISD::EXPERIMENTAL_VP_REVERSE || + !isOneOrOneSplat(StoreMask.getOperand(1)) || + StoreMask.getOperand(2) != VPStore->getVectorLength()) + return SDValue(); + StoreMask = StoreMask.getOperand(0); + } + + // Base = StoreAddr + (NumElem - 1) * ElemWidthByte + SDLoc DL(N); + MVT XLenVT = Subtarget.getXLenVT(); + SDValue NumElem = VPStore->getVectorLength(); + uint64_t ElemWidthByte = VPReverse.getValueType().getScalarSizeInBits() / 8; + + SDValue Temp1 = DAG.getNode(ISD::SUB, DL, XLenVT, NumElem, + DAG.getConstant(1, DL, XLenVT)); + SDValue Temp2 = DAG.getNode(ISD::MUL, DL, XLenVT, Temp1, + DAG.getConstant(ElemWidthByte, DL, XLenVT)); + SDValue Base = + DAG.getNode(ISD::ADD, DL, XLenVT, VPStore->getBasePtr(), Temp2); + SDValue Stride = DAG.getConstant(-ElemWidthByte, DL, XLenVT); + + MachineFunction &MF = DAG.getMachineFunction(); + MachinePointerInfo PtrInfo(VPStore->getAddressSpace()); + MachineMemOperand *MMO = MF.getMachineMemOperand( + PtrInfo, VPStore->getMemOperand()->getFlags(), + LocationSize::beforeOrAfterPointer(), VPStore->getAlign()); + + return DAG.getStridedStoreVP( + VPStore->getChain(), DL, VPReverse.getOperand(0), Base, + VPStore->getOffset(), Stride, StoreMask, VPStore->getVectorLength(), + VPStore->getMemoryVT(), MMO, VPStore->getAddressingMode(), + VPStore->isTruncatingStore(), VPStore->isCompressingStore()); +} + // Convert from one FMA opcode to another based on whether we are negating the // multiply result and/or the accumulator. 
// NOTE: Only supports RVV operations with VL. @@ -17012,6 +17140,37 @@ static SDValue performCONCAT_VECTORSCombine(SDNode *N, SelectionDAG &DAG, return DAG.getBitcast(VT.getSimpleVT(), StridedLoad); } +/// Custom legalize or to . This runs +/// during the combine phase before type legalization, and relies on +/// DAGCombine not undoing the transform if isShuffleMaskLegal returns false +/// for the source mask. +static SDValue performVECTOR_SHUFFLECombine(SDNode *N, SelectionDAG &DAG, + const RISCVSubtarget &Subtarget, + const RISCVTargetLowering &TLI) { + SDLoc DL(N); + EVT VT = N->getValueType(0); + const unsigned ElementSize = VT.getScalarSizeInBits(); + SDValue V1 = N->getOperand(0); + SDValue V2 = N->getOperand(1); + ArrayRef Mask = cast(N)->getMask(); + + if (TLI.isTypeLegal(VT) || ElementSize <= Subtarget.getELen() || + !isPowerOf2_64(ElementSize) || VT.getVectorNumElements() % 2 != 0 || + VT.isFloatingPoint() || TLI.isShuffleMaskLegal(Mask, VT)) + return SDValue(); + + SmallVector NewMask; + narrowShuffleMaskElts(2, Mask, NewMask); + + LLVMContext &C = *DAG.getContext(); + EVT NewEltVT = EVT::getIntegerVT(C, ElementSize / 2); + EVT NewVT = EVT::getVectorVT(C, NewEltVT, VT.getVectorNumElements() * 2); + SDValue Res = DAG.getVectorShuffle(NewVT, DL, DAG.getBitcast(NewVT, V1), + DAG.getBitcast(NewVT, V2), NewMask); + return DAG.getBitcast(VT, Res); +} + + static SDValue combineToVWMACC(SDNode *N, SelectionDAG &DAG, const RISCVSubtarget &Subtarget) { @@ -18241,6 +18400,10 @@ SDValue RISCVTargetLowering::PerformDAGCombine(SDNode *N, if (SDValue V = performCONCAT_VECTORSCombine(N, DAG, Subtarget, *this)) return V; break; + case ISD::VECTOR_SHUFFLE: + if (SDValue V = performVECTOR_SHUFFLECombine(N, DAG, Subtarget, *this)) + return V; + break; case ISD::INSERT_VECTOR_ELT: if (SDValue V = performINSERT_VECTOR_ELTCombine(N, DAG, Subtarget, *this)) return V; @@ -18372,6 +18535,10 @@ SDValue RISCVTargetLowering::PerformDAGCombine(SDNode *N, } } } + case 
ISD::EXPERIMENTAL_VP_REVERSE: + return performVP_REVERSECombine(N, DAG, Subtarget); + case ISD::VP_STORE: + return performVP_STORECombine(N, DAG, Subtarget); case ISD::BITCAST: { assert(Subtarget.useRVVForFixedLengthVectors()); SDValue N0 = N->getOperand(0); @@ -19641,6 +19808,8 @@ RISCVTargetLowering::EmitInstrWithCustomInserter(MachineInstr &MI, case RISCV::PseudoFROUND_D_INX: case RISCV::PseudoFROUND_D_IN32X: return emitFROUND(MI, BB, Subtarget); + case RISCV::PROBED_STACKALLOC_DYN: + return emitDynamicProbedAlloc(MI, BB); case TargetOpcode::STATEPOINT: // STATEPOINT is a pseudo instruction which has no implicit defs/uses // while jal call instruction (where statepoint will be lowered at the end) @@ -20873,6 +21042,7 @@ const char *RISCVTargetLowering::getTargetNodeName(unsigned Opcode) const { NODE_NAME_CASE(SF_VC_V_IVW_SE) NODE_NAME_CASE(SF_VC_V_VVW_SE) NODE_NAME_CASE(SF_VC_V_FVW_SE) + NODE_NAME_CASE(PROBED_ALLOCA) } // clang-format on return nullptr; @@ -22602,3 +22772,95 @@ unsigned RISCVTargetLowering::getStackProbeSize(const MachineFunction &MF, StackProbeSize = alignDown(StackProbeSize, StackAlign.value()); return StackProbeSize ? StackProbeSize : StackAlign.value(); } + +SDValue RISCVTargetLowering::lowerDYNAMIC_STACKALLOC(SDValue Op, + SelectionDAG &DAG) const { + MachineFunction &MF = DAG.getMachineFunction(); + if (!hasInlineStackProbe(MF)) + return SDValue(); + + MVT XLenVT = Subtarget.getXLenVT(); + // Get the inputs. + SDValue Chain = Op.getOperand(0); + SDValue Size = Op.getOperand(1); + + MaybeAlign Align = + cast(Op.getOperand(2))->getMaybeAlignValue(); + SDLoc dl(Op); + EVT VT = Op.getValueType(); + + // Construct the new SP value in a GPR. 
+ SDValue SP = DAG.getCopyFromReg(Chain, dl, RISCV::X2, XLenVT); + Chain = SP.getValue(1); + SP = DAG.getNode(ISD::SUB, dl, XLenVT, SP, Size); + if (Align) + SP = DAG.getNode(ISD::AND, dl, VT, SP.getValue(0), + DAG.getSignedConstant(-(uint64_t)Align->value(), dl, VT)); + + // Set the real SP to the new value with a probing loop. + Chain = DAG.getNode(RISCVISD::PROBED_ALLOCA, dl, MVT::Other, Chain, SP); + return DAG.getMergeValues({SP, Chain}, dl); +} + +MachineBasicBlock * +RISCVTargetLowering::emitDynamicProbedAlloc(MachineInstr &MI, + MachineBasicBlock *MBB) const { + MachineFunction &MF = *MBB->getParent(); + MachineBasicBlock::iterator MBBI = MI.getIterator(); + DebugLoc DL = MBB->findDebugLoc(MBBI); + Register TargetReg = MI.getOperand(1).getReg(); + + const RISCVInstrInfo *TII = Subtarget.getInstrInfo(); + bool IsRV64 = Subtarget.is64Bit(); + Align StackAlign = Subtarget.getFrameLowering()->getStackAlign(); + const RISCVTargetLowering *TLI = Subtarget.getTargetLowering(); + uint64_t ProbeSize = TLI->getStackProbeSize(MF, StackAlign); + + MachineFunction::iterator MBBInsertPoint = std::next(MBB->getIterator()); + MachineBasicBlock *LoopTestMBB = + MF.CreateMachineBasicBlock(MBB->getBasicBlock()); + MF.insert(MBBInsertPoint, LoopTestMBB); + MachineBasicBlock *ExitMBB = MF.CreateMachineBasicBlock(MBB->getBasicBlock()); + MF.insert(MBBInsertPoint, ExitMBB); + Register SPReg = RISCV::X2; + Register ScratchReg = + MF.getRegInfo().createVirtualRegister(&RISCV::GPRRegClass); + + // ScratchReg = ProbeSize + TII->movImm(*MBB, MBBI, DL, ScratchReg, ProbeSize, MachineInstr::NoFlags); + + // LoopTest: + // SUB SP, SP, ProbeSize + BuildMI(*LoopTestMBB, LoopTestMBB->end(), DL, TII->get(RISCV::SUB), SPReg) + .addReg(SPReg) + .addReg(ScratchReg); + + // s[d|w] zero, 0(sp) + BuildMI(*LoopTestMBB, LoopTestMBB->end(), DL, + TII->get(IsRV64 ? 
RISCV::SD : RISCV::SW)) + .addReg(RISCV::X0) + .addReg(SPReg) + .addImm(0); + + // BLT TargetReg, SP, LoopTest + BuildMI(*LoopTestMBB, LoopTestMBB->end(), DL, TII->get(RISCV::BLT)) + .addReg(TargetReg) + .addReg(SPReg) + .addMBB(LoopTestMBB); + + // Adjust with: MV SP, TargetReg. + BuildMI(*ExitMBB, ExitMBB->end(), DL, TII->get(RISCV::ADDI), SPReg) + .addReg(TargetReg) + .addImm(0); + + ExitMBB->splice(ExitMBB->end(), MBB, std::next(MBBI), MBB->end()); + ExitMBB->transferSuccessorsAndUpdatePHIs(MBB); + + LoopTestMBB->addSuccessor(ExitMBB); + LoopTestMBB->addSuccessor(LoopTestMBB); + MBB->addSuccessor(LoopTestMBB); + + MI.eraseFromParent(); + MF.getInfo()->setDynamicAllocation(); + return ExitMBB->begin()->getParent(); +} diff --git a/llvm/lib/Target/RISCV/RISCVISelLowering.h b/llvm/lib/Target/RISCV/RISCVISelLowering.h index ea077c7d2d23a..892c1cd96ca61 100644 --- a/llvm/lib/Target/RISCV/RISCVISelLowering.h +++ b/llvm/lib/Target/RISCV/RISCVISelLowering.h @@ -461,6 +461,10 @@ enum NodeType : unsigned { SF_VC_V_VVW_SE, SF_VC_V_FVW_SE, + // To avoid stack clash, allocation is performed by block and each block is + // probed. + PROBED_ALLOCA, + // RISC-V vector tuple type version of INSERT_SUBVECTOR/EXTRACT_SUBVECTOR. 
TUPLE_INSERT, TUPLE_EXTRACT, @@ -922,6 +926,9 @@ class RISCVTargetLowering : public TargetLowering { unsigned getStackProbeSize(const MachineFunction &MF, Align StackAlign) const; + MachineBasicBlock *emitDynamicProbedAlloc(MachineInstr &MI, + MachineBasicBlock *MBB) const; + private: void analyzeInputArgs(MachineFunction &MF, CCState &CCInfo, const SmallVectorImpl &Ins, bool IsRet, @@ -1015,6 +1022,8 @@ class RISCVTargetLowering : public TargetLowering { SDValue lowerVectorStrictFSetcc(SDValue Op, SelectionDAG &DAG) const; + SDValue lowerDYNAMIC_STACKALLOC(SDValue Op, SelectionDAG &DAG) const; + SDValue expandUnalignedRVVLoad(SDValue Op, SelectionDAG &DAG) const; SDValue expandUnalignedRVVStore(SDValue Op, SelectionDAG &DAG) const; diff --git a/llvm/lib/Target/RISCV/RISCVInstrInfo.td b/llvm/lib/Target/RISCV/RISCVInstrInfo.td index ee86f53a5c8a8..bb5bb6352c32a 100644 --- a/llvm/lib/Target/RISCV/RISCVInstrInfo.td +++ b/llvm/lib/Target/RISCV/RISCVInstrInfo.td @@ -100,6 +100,11 @@ def riscv_add_tprel : SDNode<"RISCVISD::ADD_TPREL", SDTCisSameAs<0, 3>, SDTCisInt<0>]>>; +def riscv_probed_alloca : SDNode<"RISCVISD::PROBED_ALLOCA", + SDTypeProfile<1, 1, [SDTCisSameAs<0, 1>, + SDTCisVT<0, i32>]>, + [SDNPHasChain, SDNPMayStore]>; + //===----------------------------------------------------------------------===// // Operand and SDNode transformation definitions. //===----------------------------------------------------------------------===// @@ -1428,6 +1433,11 @@ def PROBED_STACKALLOC_RVV : Pseudo<(outs GPR:$sp), (ins GPR:$scratch), []>, Sched<[]>; +let usesCustomInserter = 1 in +def PROBED_STACKALLOC_DYN : Pseudo<(outs GPR:$rd), + (ins GPR:$scratch), + [(set GPR:$rd, (riscv_probed_alloca GPR:$scratch))]>, + Sched<[]>; } /// HI and ADD_LO address nodes. 
diff --git a/llvm/lib/Target/RISCV/RISCVMachineFunctionInfo.h b/llvm/lib/Target/RISCV/RISCVMachineFunctionInfo.h index 8909f2f3bd317..27a13bb7cace1 100644 --- a/llvm/lib/Target/RISCV/RISCVMachineFunctionInfo.h +++ b/llvm/lib/Target/RISCV/RISCVMachineFunctionInfo.h @@ -78,6 +78,9 @@ class RISCVMachineFunctionInfo : public MachineFunctionInfo { int64_t StackProbeSize = 0; + /// Does it probe the stack for a dynamic allocation? + bool HasDynamicAllocation = false; + public: RISCVMachineFunctionInfo(const Function &F, const RISCVSubtarget *STI); @@ -159,6 +162,9 @@ class RISCVMachineFunctionInfo : public MachineFunctionInfo { bool isVectorCall() const { return IsVectorCall; } void setIsVectorCall() { IsVectorCall = true; } + + bool hasDynamicAllocation() const { return HasDynamicAllocation; } + void setDynamicAllocation() { HasDynamicAllocation = true; } }; } // end namespace llvm diff --git a/llvm/lib/Target/RISCV/RISCVVLOptimizer.cpp b/llvm/lib/Target/RISCV/RISCVVLOptimizer.cpp index 8156eaff8a04c..54ca8ccd8d9e9 100644 --- a/llvm/lib/Target/RISCV/RISCVVLOptimizer.cpp +++ b/llvm/lib/Target/RISCV/RISCVVLOptimizer.cpp @@ -206,6 +206,7 @@ getOperandLog2EEW(const MachineOperand &MO, const MachineRegisterInfo *MRI) { MI.getOperand(RISCVII::getSEWOpNum(MI.getDesc())).getImm(); const bool HasPassthru = RISCVII::isFirstDefTiedToFirstUse(MI.getDesc()); + const bool IsTied = RISCVII::isTiedPseudo(MI.getDesc().TSFlags); // We bail out early for instructions that have passthru with non NoRegister, // which means they are using TU policy. We are not interested in these @@ -568,7 +569,8 @@ getOperandLog2EEW(const MachineOperand &MO, const MachineRegisterInfo *MRI) { case RISCV::VFWADD_WV: case RISCV::VFWSUB_WF: case RISCV::VFWSUB_WV: { - bool IsOp1 = HasPassthru ? MO.getOperandNo() == 2 : MO.getOperandNo() == 1; + bool IsOp1 = (HasPassthru && !IsTied) ? MO.getOperandNo() == 2 + : MO.getOperandNo() == 1; bool TwoTimes = IsMODef || IsOp1; return TwoTimes ? 
MILog2SEW + 1 : MILog2SEW; } @@ -610,6 +612,7 @@ getOperandLog2EEW(const MachineOperand &MO, const MachineRegisterInfo *MRI) { case RISCV::VFNCVT_F_F_W: case RISCV::VFNCVT_ROD_F_F_W: case RISCV::VFNCVTBF16_F_F_W: { + assert(!IsTied); bool IsOp1 = HasPassthru ? MO.getOperandNo() == 2 : MO.getOperandNo() == 1; bool TwoTimes = IsOp1; return TwoTimes ? MILog2SEW + 1 : MILog2SEW; diff --git a/llvm/lib/Target/SPIRV/CMakeLists.txt b/llvm/lib/Target/SPIRV/CMakeLists.txt index a79e19fcd753d..efdd8c8d24fbd 100644 --- a/llvm/lib/Target/SPIRV/CMakeLists.txt +++ b/llvm/lib/Target/SPIRV/CMakeLists.txt @@ -10,6 +10,8 @@ tablegen(LLVM SPIRVGenRegisterBank.inc -gen-register-bank) tablegen(LLVM SPIRVGenRegisterInfo.inc -gen-register-info) tablegen(LLVM SPIRVGenSubtargetInfo.inc -gen-subtarget) tablegen(LLVM SPIRVGenTables.inc -gen-searchable-tables) +tablegen(LLVM SPIRVGenPreLegalizeGICombiner.inc -gen-global-isel-combiner + -combiners="SPIRVPreLegalizerCombiner") add_public_tablegen_target(SPIRVCommonTableGen) @@ -33,6 +35,7 @@ add_llvm_target(SPIRVCodeGen SPIRVModuleAnalysis.cpp SPIRVStructurizer.cpp SPIRVPreLegalizer.cpp + SPIRVPreLegalizerCombiner.cpp SPIRVPostLegalizer.cpp SPIRVPrepareFunctions.cpp SPIRVRegisterBankInfo.cpp diff --git a/llvm/lib/Target/SPIRV/SPIRV.h b/llvm/lib/Target/SPIRV/SPIRV.h index 81b5720264425..6d00a046ff7ca 100644 --- a/llvm/lib/Target/SPIRV/SPIRV.h +++ b/llvm/lib/Target/SPIRV/SPIRV.h @@ -24,6 +24,7 @@ FunctionPass *createSPIRVStructurizerPass(); FunctionPass *createSPIRVMergeRegionExitTargetsPass(); FunctionPass *createSPIRVStripConvergenceIntrinsicsPass(); FunctionPass *createSPIRVRegularizerPass(); +FunctionPass *createSPIRVPreLegalizerCombiner(); FunctionPass *createSPIRVPreLegalizerPass(); FunctionPass *createSPIRVPostLegalizerPass(); ModulePass *createSPIRVEmitIntrinsicsPass(SPIRVTargetMachine *TM); @@ -36,6 +37,7 @@ createSPIRVInstructionSelector(const SPIRVTargetMachine &TM, void initializeSPIRVModuleAnalysisPass(PassRegistry &); void 
initializeSPIRVConvergenceRegionAnalysisWrapperPassPass(PassRegistry &); void initializeSPIRVPreLegalizerPass(PassRegistry &); +void initializeSPIRVPreLegalizerCombinerPass(PassRegistry &); void initializeSPIRVPostLegalizerPass(PassRegistry &); void initializeSPIRVStructurizerPass(PassRegistry &); void initializeSPIRVEmitIntrinsicsPass(PassRegistry &); diff --git a/llvm/lib/Target/SPIRV/SPIRV.td b/llvm/lib/Target/SPIRV/SPIRV.td index 108c7e6d3861f..39a4131c7f1bd 100644 --- a/llvm/lib/Target/SPIRV/SPIRV.td +++ b/llvm/lib/Target/SPIRV/SPIRV.td @@ -11,6 +11,7 @@ include "llvm/Target/Target.td" include "SPIRVRegisterInfo.td" include "SPIRVRegisterBanks.td" include "SPIRVInstrInfo.td" +include "SPIRVCombine.td" include "SPIRVBuiltins.td" def SPIRVInstrInfo : InstrInfo; diff --git a/llvm/lib/Target/SPIRV/SPIRVCombine.td b/llvm/lib/Target/SPIRV/SPIRVCombine.td new file mode 100644 index 0000000000000..6f726e024de52 --- /dev/null +++ b/llvm/lib/Target/SPIRV/SPIRVCombine.td @@ -0,0 +1,22 @@ +//=- SPIRVCombine.td - Define SPIRV Combine Rules -------------*-tablegen -*-=// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// + +include "llvm/Target/GlobalISel/Combine.td" + + +def vector_length_sub_to_distance_lowering : GICombineRule < + (defs root:$root), + (match (wip_match_opcode G_INTRINSIC):$root, + [{ return matchLengthToDistance(*${root}, MRI); }]), + (apply [{ applySPIRVDistance(*${root}, MRI, B); }]) +>; + +def SPIRVPreLegalizerCombiner + : GICombiner<"SPIRVPreLegalizerCombinerImpl", + [vector_length_sub_to_distance_lowering]> { + let CombineAllMethodName = "tryCombineAllImpl"; +} diff --git a/llvm/lib/Target/SPIRV/SPIRVEmitIntrinsics.cpp b/llvm/lib/Target/SPIRV/SPIRVEmitIntrinsics.cpp index d2b14d6d058c9..1c1acd29ee0e6 100644 --- a/llvm/lib/Target/SPIRV/SPIRVEmitIntrinsics.cpp +++ b/llvm/lib/Target/SPIRV/SPIRVEmitIntrinsics.cpp @@ -264,7 +264,14 @@ bool expectIgnoredInIRTranslation(const Instruction *I) { const auto *II = dyn_cast(I); if (!II) return false; - return II->getIntrinsicID() == Intrinsic::invariant_start; + switch (II->getIntrinsicID()) { + case Intrinsic::invariant_start: + case Intrinsic::spv_resource_handlefrombinding: + case Intrinsic::spv_resource_getpointer: + return true; + default: + return false; + } } bool allowEmitFakeUse(const Value *Arg) { @@ -737,7 +744,13 @@ Type *SPIRVEmitIntrinsics::deduceElementTypeHelper( {"__spirv_GenericCastToPtrExplicit_ToLocal", 0}, {"__spirv_GenericCastToPtrExplicit_ToPrivate", 0}}; // TODO: maybe improve performance by caching demangled names - if (Function *CalledF = CI->getCalledFunction()) { + + auto *II = dyn_cast(I); + if (II && II->getIntrinsicID() == Intrinsic::spv_resource_getpointer) { + auto *ImageType = cast(II->getOperand(0)->getType()); + assert(ImageType->getTargetExtName() == "spirv.Image"); + Ty = ImageType->getTypeParameter(0); + } else if (Function *CalledF = CI->getCalledFunction()) { std::string DemangledName = getOclOrSpirvBuiltinDemangledName(CalledF->getName()); if (DemangledName.length() > 0) diff --git 
a/llvm/lib/Target/SPIRV/SPIRVGlobalRegistry.cpp b/llvm/lib/Target/SPIRV/SPIRVGlobalRegistry.cpp index a06c62e68d106..874894ae98726 100644 --- a/llvm/lib/Target/SPIRV/SPIRVGlobalRegistry.cpp +++ b/llvm/lib/Target/SPIRV/SPIRVGlobalRegistry.cpp @@ -1114,9 +1114,12 @@ SPIRVGlobalRegistry::getSPIRVTypeForVReg(Register VReg, return nullptr; } -SPIRVType *SPIRVGlobalRegistry::getResultType(Register VReg) { - MachineInstr *Instr = getVRegDef(CurMF->getRegInfo(), VReg); - return getSPIRVTypeForVReg(Instr->getOperand(1).getReg()); +SPIRVType *SPIRVGlobalRegistry::getResultType(Register VReg, + MachineFunction *MF) { + if (!MF) + MF = CurMF; + MachineInstr *Instr = getVRegDef(MF->getRegInfo(), VReg); + return getSPIRVTypeForVReg(Instr->getOperand(1).getReg(), MF); } SPIRVType *SPIRVGlobalRegistry::getOrCreateSPIRVType( diff --git a/llvm/lib/Target/SPIRV/SPIRVGlobalRegistry.h b/llvm/lib/Target/SPIRV/SPIRVGlobalRegistry.h index 528baf5f8d9e2..0c94ec4df97f5 100644 --- a/llvm/lib/Target/SPIRV/SPIRVGlobalRegistry.h +++ b/llvm/lib/Target/SPIRV/SPIRVGlobalRegistry.h @@ -377,7 +377,7 @@ class SPIRVGlobalRegistry { const MachineFunction *MF = nullptr) const; // Return the result type of the instruction defining the register. - SPIRVType *getResultType(Register VReg); + SPIRVType *getResultType(Register VReg, MachineFunction *MF = nullptr); // Whether the given VReg has a SPIR-V type mapped to it yet. 
bool hasSPIRVTypeForVReg(Register VReg) const { diff --git a/llvm/lib/Target/SPIRV/SPIRVInstructionSelector.cpp b/llvm/lib/Target/SPIRV/SPIRVInstructionSelector.cpp index 1d6be7619ecf4..f5409c27d6ea3 100644 --- a/llvm/lib/Target/SPIRV/SPIRVInstructionSelector.cpp +++ b/llvm/lib/Target/SPIRV/SPIRVInstructionSelector.cpp @@ -109,15 +109,25 @@ class SPIRVInstructionSelector : public InstructionSelector { bool selectFirstBitHigh(Register ResVReg, const SPIRVType *ResType, MachineInstr &I, bool IsSigned) const; - bool selectFirstBitHigh16(Register ResVReg, const SPIRVType *ResType, - MachineInstr &I, bool IsSigned) const; + bool selectFirstBitLow(Register ResVReg, const SPIRVType *ResType, + MachineInstr &I) const; + + bool selectFirstBitSet16(Register ResVReg, const SPIRVType *ResType, + MachineInstr &I, unsigned ExtendOpcode, + unsigned BitSetOpcode) const; + + bool selectFirstBitSet32(Register ResVReg, const SPIRVType *ResType, + MachineInstr &I, Register SrcReg, + unsigned BitSetOpcode) const; - bool selectFirstBitHigh32(Register ResVReg, const SPIRVType *ResType, - MachineInstr &I, Register SrcReg, - bool IsSigned) const; + bool selectFirstBitSet64(Register ResVReg, const SPIRVType *ResType, + MachineInstr &I, Register SrcReg, + unsigned BitSetOpcode, bool SwapPrimarySide) const; - bool selectFirstBitHigh64(Register ResVReg, const SPIRVType *ResType, - MachineInstr &I, bool IsSigned) const; + bool selectFirstBitSet64Overflow(Register ResVReg, const SPIRVType *ResType, + MachineInstr &I, Register SrcReg, + unsigned BitSetOpcode, + bool SwapPrimarySide) const; bool selectGlobalValue(Register ResVReg, MachineInstr &I, const MachineInstr *Init = nullptr) const; @@ -205,6 +215,9 @@ class SPIRVInstructionSelector : public InstructionSelector { bool selectDot4AddPackedExpansion(Register ResVReg, const SPIRVType *ResType, MachineInstr &I) const; + bool selectWaveReduceSum(Register ResVReg, const SPIRVType *ResType, + MachineInstr &I) const; + void 
renderImm32(MachineInstrBuilder &MIB, const MachineInstr &I, int OpIdx) const; void renderFImm64(MachineInstrBuilder &MIB, const MachineInstr &I, @@ -288,8 +301,9 @@ class SPIRVInstructionSelector : public InstructionSelector { bool selectReadImageIntrinsic(Register &ResVReg, const SPIRVType *ResType, MachineInstr &I) const; - bool selectImageWriteIntrinsic(MachineInstr &I) const; + bool selectResourceGetPointer(Register &ResVReg, const SPIRVType *ResType, + MachineInstr &I) const; // Utilities std::pair @@ -319,10 +333,15 @@ class SPIRVInstructionSelector : public InstructionSelector { SPIRVType *widenTypeToVec4(const SPIRVType *Type, MachineInstr &I) const; bool extractSubvector(Register &ResVReg, const SPIRVType *ResType, Register &ReadReg, MachineInstr &InsertionPoint) const; + bool generateImageRead(Register &ResVReg, const SPIRVType *ResType, + Register ImageReg, Register IdxReg, DebugLoc Loc, + MachineInstr &Pos) const; bool BuildCOPY(Register DestReg, Register SrcReg, MachineInstr &I) const; bool loadVec3BuiltinInputID(SPIRV::BuiltIn::BuiltIn BuiltInValue, Register ResVReg, const SPIRVType *ResType, MachineInstr &I) const; + bool loadHandleBeforePosition(Register &HandleReg, const SPIRVType *ResType, + GIntrinsic &HandleDef, MachineInstr &Pos) const; }; } // end anonymous namespace @@ -1030,6 +1049,25 @@ bool SPIRVInstructionSelector::selectLoad(Register ResVReg, MachineInstr &I) const { unsigned OpOffset = isa(I) ? 
1 : 0; Register Ptr = I.getOperand(1 + OpOffset).getReg(); + + auto *PtrDef = getVRegDef(*MRI, Ptr); + auto *IntPtrDef = dyn_cast(PtrDef); + if (IntPtrDef && + IntPtrDef->getIntrinsicID() == Intrinsic::spv_resource_getpointer) { + Register ImageReg = IntPtrDef->getOperand(2).getReg(); + Register NewImageReg = + MRI->createVirtualRegister(MRI->getRegClass(ImageReg)); + auto *ImageDef = cast(getVRegDef(*MRI, ImageReg)); + if (!loadHandleBeforePosition(NewImageReg, GR.getSPIRVTypeForVReg(ImageReg), + *ImageDef, I)) { + return false; + } + + Register IdxReg = IntPtrDef->getOperand(3).getReg(); + return generateImageRead(ResVReg, ResType, NewImageReg, IdxReg, + I.getDebugLoc(), I); + } + auto MIB = BuildMI(*I.getParent(), I, I.getDebugLoc(), TII.get(SPIRV::OpLoad)) .addDef(ResVReg) .addUse(GR.getSPIRVTypeID(ResType)) @@ -1049,6 +1087,29 @@ bool SPIRVInstructionSelector::selectStore(MachineInstr &I) const { unsigned OpOffset = isa(I) ? 1 : 0; Register StoreVal = I.getOperand(0 + OpOffset).getReg(); Register Ptr = I.getOperand(1 + OpOffset).getReg(); + + auto *PtrDef = getVRegDef(*MRI, Ptr); + auto *IntPtrDef = dyn_cast(PtrDef); + if (IntPtrDef && + IntPtrDef->getIntrinsicID() == Intrinsic::spv_resource_getpointer) { + Register ImageReg = IntPtrDef->getOperand(2).getReg(); + Register NewImageReg = + MRI->createVirtualRegister(MRI->getRegClass(ImageReg)); + auto *ImageDef = cast(getVRegDef(*MRI, ImageReg)); + if (!loadHandleBeforePosition(NewImageReg, GR.getSPIRVTypeForVReg(ImageReg), + *ImageDef, I)) { + return false; + } + + Register IdxReg = IntPtrDef->getOperand(3).getReg(); + return BuildMI(*I.getParent(), I, I.getDebugLoc(), + TII.get(SPIRV::OpImageWrite)) + .addUse(NewImageReg) + .addUse(IdxReg) + .addUse(StoreVal) + .constrainAllUses(TII, TRI, RBI); + } + MachineBasicBlock &BB = *I.getParent(); auto MIB = BuildMI(BB, I, I.getDebugLoc(), TII.get(SPIRV::OpStore)) .addUse(Ptr) @@ -2071,6 +2132,31 @@ bool SPIRVInstructionSelector::selectWaveActiveCountBits( return 
Result; } +bool SPIRVInstructionSelector::selectWaveReduceSum(Register ResVReg, + const SPIRVType *ResType, + MachineInstr &I) const { + assert(I.getNumOperands() == 3); + assert(I.getOperand(2).isReg()); + MachineBasicBlock &BB = *I.getParent(); + Register InputRegister = I.getOperand(2).getReg(); + SPIRVType *InputType = GR.getSPIRVTypeForVReg(InputRegister); + + if (!InputType) + report_fatal_error("Input Type could not be determined."); + + SPIRVType *IntTy = GR.getOrCreateSPIRVIntegerType(32, I, TII); + // Retreive the operation to use based on input type + bool IsFloatTy = GR.isScalarOrVectorOfType(InputRegister, SPIRV::OpTypeFloat); + auto Opcode = + IsFloatTy ? SPIRV::OpGroupNonUniformFAdd : SPIRV::OpGroupNonUniformIAdd; + return BuildMI(BB, I, I.getDebugLoc(), TII.get(Opcode)) + .addDef(ResVReg) + .addUse(GR.getSPIRVTypeID(ResType)) + .addUse(GR.getOrCreateConstInt(SPIRV::Scope::Subgroup, I, IntTy, TII)) + .addImm(SPIRV::GroupOperation::Reduce) + .addUse(I.getOperand(2).getReg()); +} + bool SPIRVInstructionSelector::selectBitreverse(Register ResVReg, const SPIRVType *ResType, MachineInstr &I) const { @@ -2952,6 +3038,8 @@ bool SPIRVInstructionSelector::selectIntrinsic(Register ResVReg, return selectFirstBitHigh(ResVReg, ResType, I, /*IsSigned=*/false); case Intrinsic::spv_firstbitshigh: // There is no CL equivalent of FindSMsb return selectFirstBitHigh(ResVReg, ResType, I, /*IsSigned=*/true); + case Intrinsic::spv_firstbitlow: // There is no CL equivlent of FindILsb + return selectFirstBitLow(ResVReg, ResType, I); case Intrinsic::spv_group_memory_barrier_with_group_sync: { bool Result = true; auto MemSemConstant = @@ -2998,6 +3086,8 @@ bool SPIRVInstructionSelector::selectIntrinsic(Register ResVReg, return selectWaveOpInst(ResVReg, ResType, I, SPIRV::OpGroupNonUniformAny); case Intrinsic::spv_wave_is_first_lane: return selectWaveOpInst(ResVReg, ResType, I, SPIRV::OpGroupNonUniformElect); + case Intrinsic::spv_wave_reduce_sum: + return 
selectWaveReduceSum(ResVReg, ResType, I); case Intrinsic::spv_wave_readlane: return selectWaveOpInst(ResVReg, ResType, I, SPIRV::OpGroupNonUniformShuffle); @@ -3024,6 +3114,9 @@ bool SPIRVInstructionSelector::selectIntrinsic(Register ResVReg, case Intrinsic::spv_resource_load_typedbuffer: { return selectReadImageIntrinsic(ResVReg, ResType, I); } + case Intrinsic::spv_resource_getpointer: { + return selectResourceGetPointer(ResVReg, ResType, I); + } case Intrinsic::spv_discard: { return selectDiscard(ResVReg, ResType, I); } @@ -3041,27 +3134,7 @@ bool SPIRVInstructionSelector::selectIntrinsic(Register ResVReg, bool SPIRVInstructionSelector::selectHandleFromBinding(Register &ResVReg, const SPIRVType *ResType, MachineInstr &I) const { - - uint32_t Set = foldImm(I.getOperand(2), MRI); - uint32_t Binding = foldImm(I.getOperand(3), MRI); - uint32_t ArraySize = foldImm(I.getOperand(4), MRI); - Register IndexReg = I.getOperand(5).getReg(); - bool IsNonUniform = ArraySize > 1 && foldImm(I.getOperand(6), MRI); - - MachineIRBuilder MIRBuilder(I); - Register VarReg = buildPointerToResource(ResType, Set, Binding, ArraySize, - IndexReg, IsNonUniform, MIRBuilder); - - if (IsNonUniform) - buildOpDecorate(ResVReg, I, TII, SPIRV::Decoration::NonUniformEXT, {}); - - // TODO: For now we assume the resource is an image, which needs to be - // loaded to get the handle. That will not be true for storage buffers. - return BuildMI(*I.getParent(), I, I.getDebugLoc(), TII.get(SPIRV::OpLoad)) - .addDef(ResVReg) - .addUse(GR.getSPIRVTypeID(ResType)) - .addUse(VarReg) - .constrainAllUses(TII, TRI, RBI); + return true; } bool SPIRVInstructionSelector::selectReadImageIntrinsic( @@ -3074,34 +3147,49 @@ bool SPIRVInstructionSelector::selectReadImageIntrinsic( // We will do that when we can, but for now trying to move forward with other // issues. 
Register ImageReg = I.getOperand(2).getReg(); - assert(MRI->getVRegDef(ImageReg)->getParent() == I.getParent() && - "The image must be loaded in the same basic block as its use."); + auto *ImageDef = cast(getVRegDef(*MRI, ImageReg)); + Register NewImageReg = MRI->createVirtualRegister(MRI->getRegClass(ImageReg)); + if (!loadHandleBeforePosition(NewImageReg, GR.getSPIRVTypeForVReg(ImageReg), + *ImageDef, I)) { + return false; + } + + Register IdxReg = I.getOperand(3).getReg(); + DebugLoc Loc = I.getDebugLoc(); + MachineInstr &Pos = I; + + return generateImageRead(ResVReg, ResType, NewImageReg, IdxReg, Loc, Pos); +} +bool SPIRVInstructionSelector::generateImageRead(Register &ResVReg, + const SPIRVType *ResType, + Register ImageReg, + Register IdxReg, DebugLoc Loc, + MachineInstr &Pos) const { uint64_t ResultSize = GR.getScalarOrVectorComponentCount(ResType); if (ResultSize == 4) { - return BuildMI(*I.getParent(), I, I.getDebugLoc(), - TII.get(SPIRV::OpImageRead)) + return BuildMI(*Pos.getParent(), Pos, Loc, TII.get(SPIRV::OpImageRead)) .addDef(ResVReg) .addUse(GR.getSPIRVTypeID(ResType)) .addUse(ImageReg) - .addUse(I.getOperand(3).getReg()) + .addUse(IdxReg) .constrainAllUses(TII, TRI, RBI); } - SPIRVType *ReadType = widenTypeToVec4(ResType, I); + SPIRVType *ReadType = widenTypeToVec4(ResType, Pos); Register ReadReg = MRI->createVirtualRegister(GR.getRegClass(ReadType)); bool Succeed = - BuildMI(*I.getParent(), I, I.getDebugLoc(), TII.get(SPIRV::OpImageRead)) + BuildMI(*Pos.getParent(), Pos, Loc, TII.get(SPIRV::OpImageRead)) .addDef(ReadReg) .addUse(GR.getSPIRVTypeID(ReadType)) .addUse(ImageReg) - .addUse(I.getOperand(3).getReg()) + .addUse(IdxReg) .constrainAllUses(TII, TRI, RBI); if (!Succeed) return false; if (ResultSize == 1) { - return BuildMI(*I.getParent(), I, I.getDebugLoc(), + return BuildMI(*Pos.getParent(), Pos, Loc, TII.get(SPIRV::OpCompositeExtract)) .addDef(ResVReg) .addUse(GR.getSPIRVTypeID(ResType)) @@ -3109,7 +3197,25 @@ bool 
SPIRVInstructionSelector::selectReadImageIntrinsic( .addImm(0) .constrainAllUses(TII, TRI, RBI); } - return extractSubvector(ResVReg, ResType, ReadReg, I); + return extractSubvector(ResVReg, ResType, ReadReg, Pos); +} + +bool SPIRVInstructionSelector::selectResourceGetPointer( + Register &ResVReg, const SPIRVType *ResType, MachineInstr &I) const { +#ifdef ASSERT + // For now, the operand is an image. This will change once we start handling + // more resource types. + Register ResourcePtr = I.getOperand(2).getReg(); + SPIRVType *RegType = GR.getResultType(ResourcePtr); + assert(RegType->getOpcode() == SPIRV::OpTypeImage && + "Can only handle texel buffers for now."); +#endif + + // For texel buffers, the index into the image is part of the OpImageRead or + // OpImageWrite instructions. So we will do nothing in this case. This + // intrinsic will be combined with the load or store when selecting the load + // or store. + return true; } bool SPIRVInstructionSelector::extractSubvector( @@ -3161,15 +3267,20 @@ bool SPIRVInstructionSelector::selectImageWriteIntrinsic( // We will do that when we can, but for now trying to move forward with other // issues. 
Register ImageReg = I.getOperand(1).getReg(); - assert(MRI->getVRegDef(ImageReg)->getParent() == I.getParent() && - "The image must be loaded in the same basic block as its use."); + auto *ImageDef = cast(getVRegDef(*MRI, ImageReg)); + Register NewImageReg = MRI->createVirtualRegister(MRI->getRegClass(ImageReg)); + if (!loadHandleBeforePosition(NewImageReg, GR.getSPIRVTypeForVReg(ImageReg), + *ImageDef, I)) { + return false; + } + Register CoordinateReg = I.getOperand(2).getReg(); Register DataReg = I.getOperand(3).getReg(); assert(GR.getResultType(DataReg)->getOpcode() == SPIRV::OpTypeVector); assert(GR.getScalarOrVectorComponentCount(GR.getResultType(DataReg)) == 4); return BuildMI(*I.getParent(), I, I.getDebugLoc(), TII.get(SPIRV::OpImageWrite)) - .addUse(ImageReg) + .addUse(NewImageReg) .addUse(CoordinateReg) .addUse(DataReg) .constrainAllUses(TII, TRI, RBI); @@ -3208,136 +3319,249 @@ Register SPIRVInstructionSelector::buildPointerToResource( return AcReg; } -bool SPIRVInstructionSelector::selectFirstBitHigh16(Register ResVReg, - const SPIRVType *ResType, - MachineInstr &I, - bool IsSigned) const { - unsigned Opcode = IsSigned ? 
SPIRV::OpSConvert : SPIRV::OpUConvert; - // zero or sign extend +bool SPIRVInstructionSelector::selectFirstBitSet16( + Register ResVReg, const SPIRVType *ResType, MachineInstr &I, + unsigned ExtendOpcode, unsigned BitSetOpcode) const { Register ExtReg = MRI->createVirtualRegister(GR.getRegClass(ResType)); - bool Result = - selectOpWithSrcs(ExtReg, ResType, I, {I.getOperand(2).getReg()}, Opcode); - return Result && selectFirstBitHigh32(ResVReg, ResType, I, ExtReg, IsSigned); + bool Result = selectOpWithSrcs(ExtReg, ResType, I, {I.getOperand(2).getReg()}, + ExtendOpcode); + + return Result && + selectFirstBitSet32(ResVReg, ResType, I, ExtReg, BitSetOpcode); } -bool SPIRVInstructionSelector::selectFirstBitHigh32(Register ResVReg, - const SPIRVType *ResType, - MachineInstr &I, - Register SrcReg, - bool IsSigned) const { - unsigned Opcode = IsSigned ? GL::FindSMsb : GL::FindUMsb; +bool SPIRVInstructionSelector::selectFirstBitSet32( + Register ResVReg, const SPIRVType *ResType, MachineInstr &I, + Register SrcReg, unsigned BitSetOpcode) const { return BuildMI(*I.getParent(), I, I.getDebugLoc(), TII.get(SPIRV::OpExtInst)) .addDef(ResVReg) .addUse(GR.getSPIRVTypeID(ResType)) .addImm(static_cast(SPIRV::InstructionSet::GLSL_std_450)) - .addImm(Opcode) + .addImm(BitSetOpcode) .addUse(SrcReg) .constrainAllUses(TII, TRI, RBI); } -bool SPIRVInstructionSelector::selectFirstBitHigh64(Register ResVReg, - const SPIRVType *ResType, - MachineInstr &I, - bool IsSigned) const { - Register OpReg = I.getOperand(2).getReg(); - // 1. split our int64 into 2 pieces using a bitcast - unsigned count = GR.getScalarOrVectorComponentCount(ResType); - SPIRVType *baseType = GR.retrieveScalarOrVectorIntType(ResType); +bool SPIRVInstructionSelector::selectFirstBitSet64Overflow( + Register ResVReg, const SPIRVType *ResType, MachineInstr &I, + Register SrcReg, unsigned BitSetOpcode, bool SwapPrimarySide) const { + + // SPIR-V allow vectors of size 2,3,4 only. 
Calling with a larger vectors + // requires creating a param register and return register with an invalid + // vector size. If that is resolved, then this function can be used for + // vectors of any component size. + unsigned ComponentCount = GR.getScalarOrVectorComponentCount(ResType); + assert(ComponentCount < 5 && "Vec 5+ will generate invalid SPIR-V ops"); + MachineIRBuilder MIRBuilder(I); - SPIRVType *postCastT = - GR.getOrCreateSPIRVVectorType(baseType, 2 * count, MIRBuilder); - Register bitcastReg = MRI->createVirtualRegister(GR.getRegClass(postCastT)); - bool Result = - selectOpWithSrcs(bitcastReg, postCastT, I, {OpReg}, SPIRV::OpBitcast); + SPIRVType *BaseType = GR.retrieveScalarOrVectorIntType(ResType); + SPIRVType *I64Type = GR.getOrCreateSPIRVIntegerType(64, MIRBuilder); + SPIRVType *I64x2Type = GR.getOrCreateSPIRVVectorType(I64Type, 2, MIRBuilder); + SPIRVType *Vec2ResType = + GR.getOrCreateSPIRVVectorType(BaseType, 2, MIRBuilder); + + std::vector PartialRegs; + + // Loops 0, 2, 4, ... 
but stops one loop early when ComponentCount is odd + unsigned CurrentComponent = 0; + for (; CurrentComponent + 1 < ComponentCount; CurrentComponent += 2) { + // This register holds the firstbitX result for each of the i64x2 vectors + // extracted from SrcReg + Register BitSetResult = + MRI->createVirtualRegister(GR.getRegClass(I64x2Type)); + + auto MIB = BuildMI(*I.getParent(), I, I.getDebugLoc(), + TII.get(SPIRV::OpVectorShuffle)) + .addDef(BitSetResult) + .addUse(GR.getSPIRVTypeID(I64x2Type)) + .addUse(SrcReg) + .addUse(SrcReg) + .addImm(CurrentComponent) + .addImm(CurrentComponent + 1); + + if (!MIB.constrainAllUses(TII, TRI, RBI)) + return false; + + Register SubVecBitSetReg = + MRI->createVirtualRegister(GR.getRegClass(Vec2ResType)); + + if (!selectFirstBitSet64(SubVecBitSetReg, Vec2ResType, I, BitSetResult, + BitSetOpcode, SwapPrimarySide)) + return false; + + PartialRegs.push_back(SubVecBitSetReg); + } + + // On odd component counts we need to handle one more component + if (CurrentComponent != ComponentCount) { + bool ZeroAsNull = STI.isOpenCLEnv(); + Register FinalElemReg = MRI->createVirtualRegister(GR.getRegClass(I64Type)); + Register ConstIntLastIdx = GR.getOrCreateConstInt( + ComponentCount - 1, I, BaseType, TII, ZeroAsNull); + + if (!selectOpWithSrcs(FinalElemReg, I64Type, I, {SrcReg, ConstIntLastIdx}, + SPIRV::OpVectorExtractDynamic)) + return false; + + Register FinalElemBitSetReg = + MRI->createVirtualRegister(GR.getRegClass(BaseType)); - // 2. 
call firstbithigh - Register FBHReg = MRI->createVirtualRegister(GR.getRegClass(postCastT)); - Result &= selectFirstBitHigh32(FBHReg, postCastT, I, bitcastReg, IsSigned); + if (!selectFirstBitSet64(FinalElemBitSetReg, BaseType, I, FinalElemReg, + BitSetOpcode, SwapPrimarySide)) + return false; + + PartialRegs.push_back(FinalElemBitSetReg); + } + + // Join all the resulting registers back into the return type in order + // (ie i32x2, i32x2, i32x1 -> i32x5) + return selectOpWithSrcs(ResVReg, ResType, I, PartialRegs, + SPIRV::OpCompositeConstruct); +} - // 3. split result vector into high bits and low bits +bool SPIRVInstructionSelector::selectFirstBitSet64( + Register ResVReg, const SPIRVType *ResType, MachineInstr &I, + Register SrcReg, unsigned BitSetOpcode, bool SwapPrimarySide) const { + unsigned ComponentCount = GR.getScalarOrVectorComponentCount(ResType); + SPIRVType *BaseType = GR.retrieveScalarOrVectorIntType(ResType); + bool ZeroAsNull = STI.isOpenCLEnv(); + Register ConstIntZero = + GR.getOrCreateConstInt(0, I, BaseType, TII, ZeroAsNull); + Register ConstIntOne = + GR.getOrCreateConstInt(1, I, BaseType, TII, ZeroAsNull); + + // SPIRV doesn't support vectors with more than 4 components. Since the + // algoritm below converts i64 -> i32x2 and i64x4 -> i32x8 it can only + // operate on vectors with 2 or less components. When largers vectors are + // seen. Split them, recurse, then recombine them. + if (ComponentCount > 2) { + return selectFirstBitSet64Overflow(ResVReg, ResType, I, SrcReg, + BitSetOpcode, SwapPrimarySide); + } + + // 1. Split int64 into 2 pieces using a bitcast + MachineIRBuilder MIRBuilder(I); + SPIRVType *PostCastType = + GR.getOrCreateSPIRVVectorType(BaseType, 2 * ComponentCount, MIRBuilder); + Register BitcastReg = + MRI->createVirtualRegister(GR.getRegClass(PostCastType)); + + if (!selectOpWithSrcs(BitcastReg, PostCastType, I, {SrcReg}, + SPIRV::OpBitcast)) + return false; + + // 2. 
Find the first set bit from the primary side for all the pieces in #1 + Register FBSReg = MRI->createVirtualRegister(GR.getRegClass(PostCastType)); + if (!selectFirstBitSet32(FBSReg, PostCastType, I, BitcastReg, BitSetOpcode)) + return false; + + // 3. Split result vector into high bits and low bits Register HighReg = MRI->createVirtualRegister(GR.getRegClass(ResType)); Register LowReg = MRI->createVirtualRegister(GR.getRegClass(ResType)); - bool ZeroAsNull = STI.isOpenCLEnv(); - bool isScalarRes = ResType->getOpcode() != SPIRV::OpTypeVector; - if (isScalarRes) { + bool IsScalarRes = ResType->getOpcode() != SPIRV::OpTypeVector; + if (IsScalarRes) { // if scalar do a vector extract - Result &= selectOpWithSrcs( - HighReg, ResType, I, - {FBHReg, GR.getOrCreateConstInt(0, I, ResType, TII, ZeroAsNull)}, - SPIRV::OpVectorExtractDynamic); - Result &= selectOpWithSrcs( - LowReg, ResType, I, - {FBHReg, GR.getOrCreateConstInt(1, I, ResType, TII, ZeroAsNull)}, - SPIRV::OpVectorExtractDynamic); - } else { // vector case do a shufflevector + if (!selectOpWithSrcs(HighReg, ResType, I, {FBSReg, ConstIntZero}, + SPIRV::OpVectorExtractDynamic)) + return false; + if (!selectOpWithSrcs(LowReg, ResType, I, {FBSReg, ConstIntOne}, + SPIRV::OpVectorExtractDynamic)) + return false; + } else { + // if vector do a shufflevector auto MIB = BuildMI(*I.getParent(), I, I.getDebugLoc(), TII.get(SPIRV::OpVectorShuffle)) .addDef(HighReg) .addUse(GR.getSPIRVTypeID(ResType)) - .addUse(FBHReg) - .addUse(FBHReg); - // ^^ this vector will not be selected from; could be empty - unsigned j; - for (j = 0; j < count * 2; j += 2) { - MIB.addImm(j); + .addUse(FBSReg) + // Per the spec, repeat the vector if only one vec is needed + .addUse(FBSReg); + + // high bits are stored in even indexes. 
Extract them from FBSReg + for (unsigned J = 0; J < ComponentCount * 2; J += 2) { + MIB.addImm(J); } - Result &= MIB.constrainAllUses(TII, TRI, RBI); - // get low bits + if (!MIB.constrainAllUses(TII, TRI, RBI)) + return false; + MIB = BuildMI(*I.getParent(), I, I.getDebugLoc(), TII.get(SPIRV::OpVectorShuffle)) .addDef(LowReg) .addUse(GR.getSPIRVTypeID(ResType)) - .addUse(FBHReg) - .addUse(FBHReg); - // ^^ this vector will not be selected from; could be empty - for (j = 1; j < count * 2; j += 2) { - MIB.addImm(j); + .addUse(FBSReg) + // Per the spec, repeat the vector if only one vec is needed + .addUse(FBSReg); + + // low bits are stored in odd indexes. Extract them from FBSReg + for (unsigned J = 1; J < ComponentCount * 2; J += 2) { + MIB.addImm(J); } - Result &= MIB.constrainAllUses(TII, TRI, RBI); + if (!MIB.constrainAllUses(TII, TRI, RBI)) + return false; } - // 4. check if result of each top 32 bits is == -1 + // 4. Check the result. When primary bits == -1 use secondary, otherwise use + // primary SPIRVType *BoolType = GR.getOrCreateSPIRVBoolType(I, TII); Register NegOneReg; Register Reg0; Register Reg32; - unsigned selectOp; - unsigned addOp; - if (isScalarRes) { + unsigned SelectOp; + unsigned AddOp; + + if (IsScalarRes) { NegOneReg = GR.getOrCreateConstInt((unsigned)-1, I, ResType, TII, ZeroAsNull); Reg0 = GR.getOrCreateConstInt(0, I, ResType, TII, ZeroAsNull); Reg32 = GR.getOrCreateConstInt(32, I, ResType, TII, ZeroAsNull); - selectOp = SPIRV::OpSelectSISCond; - addOp = SPIRV::OpIAddS; + SelectOp = SPIRV::OpSelectSISCond; + AddOp = SPIRV::OpIAddS; } else { - BoolType = GR.getOrCreateSPIRVVectorType(BoolType, count, MIRBuilder); + BoolType = + GR.getOrCreateSPIRVVectorType(BoolType, ComponentCount, MIRBuilder); NegOneReg = GR.getOrCreateConstVector((unsigned)-1, I, ResType, TII, ZeroAsNull); Reg0 = GR.getOrCreateConstVector(0, I, ResType, TII, ZeroAsNull); Reg32 = GR.getOrCreateConstVector(32, I, ResType, TII, ZeroAsNull); - selectOp = 
SPIRV::OpSelectVIVCond; - addOp = SPIRV::OpIAddV; + SelectOp = SPIRV::OpSelectVIVCond; + AddOp = SPIRV::OpIAddV; + } + + Register PrimaryReg = HighReg; + Register SecondaryReg = LowReg; + Register PrimaryShiftReg = Reg32; + Register SecondaryShiftReg = Reg0; + + // By default the emitted opcodes check for the set bit from the MSB side. + // Setting SwapPrimarySide checks the set bit from the LSB side + if (SwapPrimarySide) { + PrimaryReg = LowReg; + SecondaryReg = HighReg; + PrimaryShiftReg = Reg0; + SecondaryShiftReg = Reg32; } - // check if the high bits are == -1; true if -1 + // Check if the primary bits are == -1 Register BReg = MRI->createVirtualRegister(GR.getRegClass(BoolType)); - Result &= selectOpWithSrcs(BReg, BoolType, I, {HighReg, NegOneReg}, - SPIRV::OpIEqual); + if (!selectOpWithSrcs(BReg, BoolType, I, {PrimaryReg, NegOneReg}, + SPIRV::OpIEqual)) + return false; - // Select low bits if true in BReg, otherwise high bits + // Select secondary bits if true in BReg, otherwise primary bits Register TmpReg = MRI->createVirtualRegister(GR.getRegClass(ResType)); - Result &= - selectOpWithSrcs(TmpReg, ResType, I, {BReg, LowReg, HighReg}, selectOp); + if (!selectOpWithSrcs(TmpReg, ResType, I, {BReg, SecondaryReg, PrimaryReg}, + SelectOp)) + return false; - // Add 32 for high bits, 0 for low bits + // 5. 
Add 32 when high bits are used, otherwise 0 for low bits Register ValReg = MRI->createVirtualRegister(GR.getRegClass(ResType)); - Result &= selectOpWithSrcs(ValReg, ResType, I, {BReg, Reg0, Reg32}, selectOp); + if (!selectOpWithSrcs(ValReg, ResType, I, + {BReg, SecondaryShiftReg, PrimaryShiftReg}, SelectOp)) + return false; - return Result && - selectOpWithSrcs(ResVReg, ResType, I, {ValReg, TmpReg}, addOp); + return selectOpWithSrcs(ResVReg, ResType, I, {ValReg, TmpReg}, AddOp); } bool SPIRVInstructionSelector::selectFirstBitHigh(Register ResVReg, @@ -3347,20 +3571,49 @@ bool SPIRVInstructionSelector::selectFirstBitHigh(Register ResVReg, // FindUMsb and FindSMsb intrinsics only support 32 bit integers Register OpReg = I.getOperand(2).getReg(); SPIRVType *OpType = GR.getSPIRVTypeForVReg(OpReg); + // zero or sign extend + unsigned ExtendOpcode = IsSigned ? SPIRV::OpSConvert : SPIRV::OpUConvert; + unsigned BitSetOpcode = IsSigned ? GL::FindSMsb : GL::FindUMsb; switch (GR.getScalarOrVectorBitWidth(OpType)) { case 16: - return selectFirstBitHigh16(ResVReg, ResType, I, IsSigned); + return selectFirstBitSet16(ResVReg, ResType, I, ExtendOpcode, BitSetOpcode); case 32: - return selectFirstBitHigh32(ResVReg, ResType, I, OpReg, IsSigned); + return selectFirstBitSet32(ResVReg, ResType, I, OpReg, BitSetOpcode); case 64: - return selectFirstBitHigh64(ResVReg, ResType, I, IsSigned); + return selectFirstBitSet64(ResVReg, ResType, I, OpReg, BitSetOpcode, + /*SwapPrimarySide=*/false); default: report_fatal_error( "spv_firstbituhigh and spv_firstbitshigh only support 16,32,64 bits."); } } +bool SPIRVInstructionSelector::selectFirstBitLow(Register ResVReg, + const SPIRVType *ResType, + MachineInstr &I) const { + // FindILsb intrinsic only supports 32 bit integers + Register OpReg = I.getOperand(2).getReg(); + SPIRVType *OpType = GR.getSPIRVTypeForVReg(OpReg); + // OpUConvert treats the operand bits as an unsigned i16 and zero extends it + // to an unsigned i32. 
As this leaves all the least significant bits unchanged + // so the first set bit from the LSB side doesn't change. + unsigned ExtendOpcode = SPIRV::OpUConvert; + unsigned BitSetOpcode = GL::FindILsb; + + switch (GR.getScalarOrVectorBitWidth(OpType)) { + case 16: + return selectFirstBitSet16(ResVReg, ResType, I, ExtendOpcode, BitSetOpcode); + case 32: + return selectFirstBitSet32(ResVReg, ResType, I, OpReg, BitSetOpcode); + case 64: + return selectFirstBitSet64(ResVReg, ResType, I, OpReg, BitSetOpcode, + /*SwapPrimarySide=*/true); + default: + report_fatal_error("spv_firstbitlow only supports 16,32,64 bits."); + } +} + bool SPIRVInstructionSelector::selectAllocaArray(Register ResVReg, const SPIRVType *ResType, MachineInstr &I) const { @@ -3694,6 +3947,36 @@ SPIRVType *SPIRVInstructionSelector::widenTypeToVec4(const SPIRVType *Type, return GR.getOrCreateSPIRVVectorType(ScalarType, 4, MIRBuilder); } +bool SPIRVInstructionSelector::loadHandleBeforePosition( + Register &HandleReg, const SPIRVType *ResType, GIntrinsic &HandleDef, + MachineInstr &Pos) const { + + assert(HandleDef.getIntrinsicID() == + Intrinsic::spv_resource_handlefrombinding); + uint32_t Set = foldImm(HandleDef.getOperand(2), MRI); + uint32_t Binding = foldImm(HandleDef.getOperand(3), MRI); + uint32_t ArraySize = foldImm(HandleDef.getOperand(4), MRI); + Register IndexReg = HandleDef.getOperand(5).getReg(); + bool IsNonUniform = ArraySize > 1 && foldImm(HandleDef.getOperand(6), MRI); + + MachineIRBuilder MIRBuilder(HandleDef); + Register VarReg = buildPointerToResource(ResType, Set, Binding, ArraySize, + IndexReg, IsNonUniform, MIRBuilder); + + if (IsNonUniform) + buildOpDecorate(HandleReg, HandleDef, TII, SPIRV::Decoration::NonUniformEXT, + {}); + + // TODO: For now we assume the resource is an image, which needs to be + // loaded to get the handle. That will not be true for storage buffers. 
+ return BuildMI(*Pos.getParent(), Pos, HandleDef.getDebugLoc(), + TII.get(SPIRV::OpLoad)) + .addDef(HandleReg) + .addUse(GR.getSPIRVTypeID(ResType)) + .addUse(VarReg) + .constrainAllUses(TII, TRI, RBI); +} + namespace llvm { InstructionSelector * createSPIRVInstructionSelector(const SPIRVTargetMachine &TM, diff --git a/llvm/lib/Target/SPIRV/SPIRVModuleAnalysis.cpp b/llvm/lib/Target/SPIRV/SPIRVModuleAnalysis.cpp index 020c11a3af4e1..bc00d5032544f 100644 --- a/llvm/lib/Target/SPIRV/SPIRVModuleAnalysis.cpp +++ b/llvm/lib/Target/SPIRV/SPIRVModuleAnalysis.cpp @@ -1352,9 +1352,7 @@ void addInstrRequirements(const MachineInstr &MI, case SPIRV::GroupOperation::Reduce: case SPIRV::GroupOperation::InclusiveScan: case SPIRV::GroupOperation::ExclusiveScan: - Reqs.addCapability(SPIRV::Capability::Kernel); Reqs.addCapability(SPIRV::Capability::GroupNonUniformArithmetic); - Reqs.addCapability(SPIRV::Capability::GroupNonUniformBallot); break; case SPIRV::GroupOperation::ClusteredReduce: Reqs.addCapability(SPIRV::Capability::GroupNonUniformClustered); @@ -1696,14 +1694,16 @@ void addInstrRequirements(const MachineInstr &MI, break; case SPIRV::OpImageRead: { Register ImageReg = MI.getOperand(2).getReg(); - SPIRVType *TypeDef = ST.getSPIRVGlobalRegistry()->getResultType(ImageReg); + SPIRVType *TypeDef = ST.getSPIRVGlobalRegistry()->getResultType( + ImageReg, const_cast(MI.getMF())); if (isImageTypeWithUnknownFormat(TypeDef)) Reqs.addCapability(SPIRV::Capability::StorageImageReadWithoutFormat); break; } case SPIRV::OpImageWrite: { Register ImageReg = MI.getOperand(0).getReg(); - SPIRVType *TypeDef = ST.getSPIRVGlobalRegistry()->getResultType(ImageReg); + SPIRVType *TypeDef = ST.getSPIRVGlobalRegistry()->getResultType( + ImageReg, const_cast(MI.getMF())); if (isImageTypeWithUnknownFormat(TypeDef)) Reqs.addCapability(SPIRV::Capability::StorageImageWriteWithoutFormat); break; diff --git a/llvm/lib/Target/SPIRV/SPIRVPreLegalizer.cpp b/llvm/lib/Target/SPIRV/SPIRVPreLegalizer.cpp index 
5b4c84918ab48..b5ef8d2a9286f 100644 --- a/llvm/lib/Target/SPIRV/SPIRVPreLegalizer.cpp +++ b/llvm/lib/Target/SPIRV/SPIRVPreLegalizer.cpp @@ -17,6 +17,8 @@ #include "SPIRVUtils.h" #include "llvm/ADT/PostOrderIterator.h" #include "llvm/Analysis/OptimizationRemarkEmitter.h" +#include "llvm/CodeGen/GlobalISel/CSEInfo.h" +#include "llvm/CodeGen/GlobalISel/GISelKnownBits.h" #include "llvm/IR/Attributes.h" #include "llvm/IR/Constants.h" #include "llvm/IR/DebugInfoMetadata.h" @@ -35,9 +37,15 @@ class SPIRVPreLegalizer : public MachineFunctionPass { initializeSPIRVPreLegalizerPass(*PassRegistry::getPassRegistry()); } bool runOnMachineFunction(MachineFunction &MF) override; + void getAnalysisUsage(AnalysisUsage &AU) const override; }; } // namespace +void SPIRVPreLegalizer::getAnalysisUsage(AnalysisUsage &AU) const { + AU.addPreserved(); + MachineFunctionPass::getAnalysisUsage(AU); +} + static void addConstantsToTrack(MachineFunction &MF, SPIRVGlobalRegistry *GR, const SPIRVSubtarget &STI, diff --git a/llvm/lib/Target/SPIRV/SPIRVPreLegalizerCombiner.cpp b/llvm/lib/Target/SPIRV/SPIRVPreLegalizerCombiner.cpp new file mode 100644 index 0000000000000..269524b2410c2 --- /dev/null +++ b/llvm/lib/Target/SPIRV/SPIRVPreLegalizerCombiner.cpp @@ -0,0 +1,244 @@ + +//===-- SPIRVPreLegalizerCombiner.cpp - combine legalization ----*- C++ -*-===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// +// This pass does combining of machine instructions at the generic MI level, +// before the legalizer. 
+// +//===----------------------------------------------------------------------===// + +#include "SPIRV.h" +#include "SPIRVTargetMachine.h" +#include "llvm/ADT/STLExtras.h" +#include "llvm/CodeGen/GlobalISel/CSEInfo.h" +#include "llvm/CodeGen/GlobalISel/CSEMIRBuilder.h" +#include "llvm/CodeGen/GlobalISel/Combiner.h" +#include "llvm/CodeGen/GlobalISel/CombinerHelper.h" +#include "llvm/CodeGen/GlobalISel/CombinerInfo.h" +#include "llvm/CodeGen/GlobalISel/GIMatchTableExecutorImpl.h" +#include "llvm/CodeGen/GlobalISel/GISelChangeObserver.h" +#include "llvm/CodeGen/GlobalISel/GISelKnownBits.h" +#include "llvm/CodeGen/GlobalISel/GenericMachineInstrs.h" +#include "llvm/CodeGen/GlobalISel/MIPatternMatch.h" +#include "llvm/CodeGen/GlobalISel/MachineIRBuilder.h" +#include "llvm/CodeGen/GlobalISel/Utils.h" +#include "llvm/CodeGen/MachineDominators.h" +#include "llvm/CodeGen/MachineFunctionPass.h" +#include "llvm/CodeGen/MachineRegisterInfo.h" +#include "llvm/CodeGen/TargetOpcodes.h" +#include "llvm/CodeGen/TargetPassConfig.h" +#include "llvm/IR/IntrinsicsSPIRV.h" +#include "llvm/Support/Debug.h" + +#define GET_GICOMBINER_DEPS +#include "SPIRVGenPreLegalizeGICombiner.inc" +#undef GET_GICOMBINER_DEPS + +#define DEBUG_TYPE "spirv-prelegalizer-combiner" + +using namespace llvm; +using namespace MIPatternMatch; + +namespace { + +#define GET_GICOMBINER_TYPES +#include "SPIRVGenPreLegalizeGICombiner.inc" +#undef GET_GICOMBINER_TYPES + +/// This match is part of a combine that +/// rewrites length(X - Y) to distance(X, Y) +/// (f32 (g_intrinsic length +/// (g_fsub (vXf32 X) (vXf32 Y)))) +/// -> +/// (f32 (g_intrinsic distance +/// (vXf32 X) (vXf32 Y))) +/// +bool matchLengthToDistance(MachineInstr &MI, MachineRegisterInfo &MRI) { + if (MI.getOpcode() != TargetOpcode::G_INTRINSIC || + cast(MI).getIntrinsicID() != Intrinsic::spv_length) + return false; + + // First operand of MI is `G_INTRINSIC` so start at operand 2. 
+ Register SubReg = MI.getOperand(2).getReg(); + MachineInstr *SubInstr = MRI.getVRegDef(SubReg); + if (!SubInstr || SubInstr->getOpcode() != TargetOpcode::G_FSUB) + return false; + + return true; +} +void applySPIRVDistance(MachineInstr &MI, MachineRegisterInfo &MRI, + MachineIRBuilder &B) { + + // Extract the operands for X and Y from the match criteria. + Register SubDestReg = MI.getOperand(2).getReg(); + MachineInstr *SubInstr = MRI.getVRegDef(SubDestReg); + Register SubOperand1 = SubInstr->getOperand(1).getReg(); + Register SubOperand2 = SubInstr->getOperand(2).getReg(); + + // Remove the original `spv_length` instruction. + + Register ResultReg = MI.getOperand(0).getReg(); + DebugLoc DL = MI.getDebugLoc(); + MachineBasicBlock &MBB = *MI.getParent(); + MachineBasicBlock::iterator InsertPt = MI.getIterator(); + + // Build the `spv_distance` intrinsic. + MachineInstrBuilder NewInstr = + BuildMI(MBB, InsertPt, DL, B.getTII().get(TargetOpcode::G_INTRINSIC)); + NewInstr + .addDef(ResultReg) // Result register + .addIntrinsicID(Intrinsic::spv_distance) // Intrinsic ID + .addUse(SubOperand1) // Operand X + .addUse(SubOperand2); // Operand Y + + auto RemoveAllUses = [&](Register Reg) { + SmallVector UsesToErase; + for (auto &UseMI : MRI.use_instructions(Reg)) + UsesToErase.push_back(&UseMI); + + // calling eraseFromParent to early invalidates the iterator. 
+ for (auto *MIToErase : UsesToErase) + MIToErase->eraseFromParent(); + }; + RemoveAllUses(SubDestReg); // remove all uses of FSUB Result + SubInstr->eraseFromParent(); // remove FSUB instruction +} + +class SPIRVPreLegalizerCombinerImpl : public Combiner { +protected: + const CombinerHelper Helper; + const SPIRVPreLegalizerCombinerImplRuleConfig &RuleConfig; + const SPIRVSubtarget &STI; + +public: + SPIRVPreLegalizerCombinerImpl( + MachineFunction &MF, CombinerInfo &CInfo, const TargetPassConfig *TPC, + GISelKnownBits &KB, GISelCSEInfo *CSEInfo, + const SPIRVPreLegalizerCombinerImplRuleConfig &RuleConfig, + const SPIRVSubtarget &STI, MachineDominatorTree *MDT, + const LegalizerInfo *LI); + + static const char *getName() { return "SPIRVPreLegalizerCombiner"; } + + bool tryCombineAll(MachineInstr &I) const override; + + bool tryCombineAllImpl(MachineInstr &I) const; + +private: +#define GET_GICOMBINER_CLASS_MEMBERS +#include "SPIRVGenPreLegalizeGICombiner.inc" +#undef GET_GICOMBINER_CLASS_MEMBERS +}; + +#define GET_GICOMBINER_IMPL +#include "SPIRVGenPreLegalizeGICombiner.inc" +#undef GET_GICOMBINER_IMPL + +SPIRVPreLegalizerCombinerImpl::SPIRVPreLegalizerCombinerImpl( + MachineFunction &MF, CombinerInfo &CInfo, const TargetPassConfig *TPC, + GISelKnownBits &KB, GISelCSEInfo *CSEInfo, + const SPIRVPreLegalizerCombinerImplRuleConfig &RuleConfig, + const SPIRVSubtarget &STI, MachineDominatorTree *MDT, + const LegalizerInfo *LI) + : Combiner(MF, CInfo, TPC, &KB, CSEInfo), + Helper(Observer, B, /*IsPreLegalize*/ true, &KB, MDT, LI), + RuleConfig(RuleConfig), STI(STI), +#define GET_GICOMBINER_CONSTRUCTOR_INITS +#include "SPIRVGenPreLegalizeGICombiner.inc" +#undef GET_GICOMBINER_CONSTRUCTOR_INITS +{ +} + +bool SPIRVPreLegalizerCombinerImpl::tryCombineAll(MachineInstr &MI) const { + return tryCombineAllImpl(MI); +} + +// Pass boilerplate +// ================ + +class SPIRVPreLegalizerCombiner : public MachineFunctionPass { +public: + static char ID; + + 
SPIRVPreLegalizerCombiner(); + + StringRef getPassName() const override { return "SPIRVPreLegalizerCombiner"; } + + bool runOnMachineFunction(MachineFunction &MF) override; + + void getAnalysisUsage(AnalysisUsage &AU) const override; + +private: + SPIRVPreLegalizerCombinerImplRuleConfig RuleConfig; +}; + +} // end anonymous namespace + +void SPIRVPreLegalizerCombiner::getAnalysisUsage(AnalysisUsage &AU) const { + AU.addRequired(); + AU.setPreservesCFG(); + getSelectionDAGFallbackAnalysisUsage(AU); + AU.addRequired(); + AU.addPreserved(); + AU.addRequired(); + AU.addPreserved(); + MachineFunctionPass::getAnalysisUsage(AU); +} + +SPIRVPreLegalizerCombiner::SPIRVPreLegalizerCombiner() + : MachineFunctionPass(ID) { + initializeSPIRVPreLegalizerCombinerPass(*PassRegistry::getPassRegistry()); + + if (!RuleConfig.parseCommandLineOption()) + report_fatal_error("Invalid rule identifier"); +} + +bool SPIRVPreLegalizerCombiner::runOnMachineFunction(MachineFunction &MF) { + if (MF.getProperties().hasProperty( + MachineFunctionProperties::Property::FailedISel)) + return false; + auto &TPC = getAnalysis(); + + const SPIRVSubtarget &ST = MF.getSubtarget(); + const auto *LI = ST.getLegalizerInfo(); + + const Function &F = MF.getFunction(); + bool EnableOpt = + MF.getTarget().getOptLevel() != CodeGenOptLevel::None && !skipFunction(F); + GISelKnownBits *KB = &getAnalysis().get(MF); + MachineDominatorTree *MDT = + &getAnalysis().getDomTree(); + CombinerInfo CInfo(/*AllowIllegalOps*/ true, /*ShouldLegalizeIllegal*/ false, + /*LegalizerInfo*/ nullptr, EnableOpt, F.hasOptSize(), + F.hasMinSize()); + // Disable fixed-point iteration to reduce compile-time + CInfo.MaxIterations = 1; + CInfo.ObserverLvl = CombinerInfo::ObserverLevel::SinglePass; + // This is the first Combiner, so the input IR might contain dead + // instructions. 
+ CInfo.EnableFullDCE = false; + SPIRVPreLegalizerCombinerImpl Impl(MF, CInfo, &TPC, *KB, /*CSEInfo*/ nullptr, + RuleConfig, ST, MDT, LI); + return Impl.combineMachineInstrs(); +} + +char SPIRVPreLegalizerCombiner::ID = 0; +INITIALIZE_PASS_BEGIN(SPIRVPreLegalizerCombiner, DEBUG_TYPE, + "Combine SPIRV machine instrs before legalization", false, + false) +INITIALIZE_PASS_DEPENDENCY(TargetPassConfig) +INITIALIZE_PASS_DEPENDENCY(GISelKnownBitsAnalysis) +INITIALIZE_PASS_END(SPIRVPreLegalizerCombiner, DEBUG_TYPE, + "Combine SPIRV machine instrs before legalization", false, + false) + +namespace llvm { +FunctionPass *createSPIRVPreLegalizerCombiner() { + return new SPIRVPreLegalizerCombiner(); +} +} // end namespace llvm diff --git a/llvm/lib/Target/SPIRV/SPIRVTargetMachine.cpp b/llvm/lib/Target/SPIRV/SPIRVTargetMachine.cpp index dca67cb6c632b..098c7a6fba50e 100644 --- a/llvm/lib/Target/SPIRV/SPIRVTargetMachine.cpp +++ b/llvm/lib/Target/SPIRV/SPIRVTargetMachine.cpp @@ -48,6 +48,7 @@ extern "C" LLVM_EXTERNAL_VISIBILITY void LLVMInitializeSPIRVTarget() { initializeSPIRVModuleAnalysisPass(PR); initializeSPIRVConvergenceRegionAnalysisWrapperPassPass(PR); initializeSPIRVStructurizerPass(PR); + initializeSPIRVPreLegalizerCombinerPass(PR); } static std::string computeDataLayout(const Triple &TT) { @@ -217,6 +218,7 @@ bool SPIRVPassConfig::addIRTranslator() { } void SPIRVPassConfig::addPreLegalizeMachineIR() { + addPass(createSPIRVPreLegalizerCombiner()); addPass(createSPIRVPreLegalizerPass()); } diff --git a/llvm/lib/Target/X86/X86ExpandPseudo.cpp b/llvm/lib/Target/X86/X86ExpandPseudo.cpp index fc8a0eaed140d..78db8413e62c9 100644 --- a/llvm/lib/Target/X86/X86ExpandPseudo.cpp +++ b/llvm/lib/Target/X86/X86ExpandPseudo.cpp @@ -563,10 +563,10 @@ bool X86ExpandPseudo::expandMI(MachineBasicBlock &MBB, case X86::PTILELOADDRST1V: case X86::PTCVTROWD2PSrreV: case X86::PTCVTROWD2PSrriV: - case X86::PTCVTROWPS2PBF16HrreV: - case X86::PTCVTROWPS2PBF16HrriV: - case 
X86::PTCVTROWPS2PBF16LrreV: - case X86::PTCVTROWPS2PBF16LrriV: + case X86::PTCVTROWPS2BF16HrreV: + case X86::PTCVTROWPS2BF16HrriV: + case X86::PTCVTROWPS2BF16LrreV: + case X86::PTCVTROWPS2BF16LrriV: case X86::PTCVTROWPS2PHHrreV: case X86::PTCVTROWPS2PHHrriV: case X86::PTCVTROWPS2PHLrreV: @@ -578,10 +578,10 @@ bool X86ExpandPseudo::expandMI(MachineBasicBlock &MBB, unsigned Opc; switch (Opcode) { case X86::PTILELOADDRSV: - Opc = X86::TILELOADDRS; + Opc = GET_EGPR_IF_ENABLED(X86::TILELOADDRS); break; case X86::PTILELOADDRST1V: - Opc = X86::TILELOADDRST1; + Opc = GET_EGPR_IF_ENABLED(X86::TILELOADDRST1); break; case X86::PTILELOADDV: Opc = GET_EGPR_IF_ENABLED(X86::TILELOADD); @@ -595,17 +595,17 @@ bool X86ExpandPseudo::expandMI(MachineBasicBlock &MBB, case X86::PTCVTROWD2PSrriV: Opc = X86::TCVTROWD2PSrri; break; - case X86::PTCVTROWPS2PBF16HrreV: - Opc = X86::TCVTROWPS2PBF16Hrre; + case X86::PTCVTROWPS2BF16HrreV: + Opc = X86::TCVTROWPS2BF16Hrre; break; - case X86::PTCVTROWPS2PBF16HrriV: - Opc = X86::TCVTROWPS2PBF16Hrri; + case X86::PTCVTROWPS2BF16HrriV: + Opc = X86::TCVTROWPS2BF16Hrri; break; - case X86::PTCVTROWPS2PBF16LrreV: - Opc = X86::TCVTROWPS2PBF16Lrre; + case X86::PTCVTROWPS2BF16LrreV: + Opc = X86::TCVTROWPS2BF16Lrre; break; - case X86::PTCVTROWPS2PBF16LrriV: - Opc = X86::TCVTROWPS2PBF16Lrri; + case X86::PTCVTROWPS2BF16LrriV: + Opc = X86::TCVTROWPS2BF16Lrri; break; case X86::PTCVTROWPS2PHHrreV: Opc = X86::TCVTROWPS2PHHrre; @@ -737,28 +737,28 @@ bool X86ExpandPseudo::expandMI(MachineBasicBlock &MBB, unsigned Opc; switch (Opcode) { case X86::PT2RPNTLVWZ0V: - Opc = X86::T2RPNTLVWZ0; + Opc = GET_EGPR_IF_ENABLED(X86::T2RPNTLVWZ0); break; case X86::PT2RPNTLVWZ0T1V: - Opc = X86::T2RPNTLVWZ0T1; + Opc = GET_EGPR_IF_ENABLED(X86::T2RPNTLVWZ0T1); break; case X86::PT2RPNTLVWZ1V: - Opc = X86::T2RPNTLVWZ1; + Opc = GET_EGPR_IF_ENABLED(X86::T2RPNTLVWZ1); break; case X86::PT2RPNTLVWZ1T1V: - Opc = X86::T2RPNTLVWZ1T1; + Opc = GET_EGPR_IF_ENABLED(X86::T2RPNTLVWZ1T1); break; case 
X86::PT2RPNTLVWZ0RSV: - Opc = X86::T2RPNTLVWZ0RS; + Opc = GET_EGPR_IF_ENABLED(X86::T2RPNTLVWZ0RS); break; case X86::PT2RPNTLVWZ0RST1V: - Opc = X86::T2RPNTLVWZ0RST1; + Opc = GET_EGPR_IF_ENABLED(X86::T2RPNTLVWZ0RST1); break; case X86::PT2RPNTLVWZ1RSV: - Opc = X86::T2RPNTLVWZ1RS; + Opc = GET_EGPR_IF_ENABLED(X86::T2RPNTLVWZ1RS); break; case X86::PT2RPNTLVWZ1RST1V: - Opc = X86::T2RPNTLVWZ1RST1; + Opc = GET_EGPR_IF_ENABLED(X86::T2RPNTLVWZ1RST1); break; default: llvm_unreachable("Impossible Opcode!"); diff --git a/llvm/lib/Target/X86/X86ISelDAGToDAG.cpp b/llvm/lib/Target/X86/X86ISelDAGToDAG.cpp index 9b340a778b36a..84bcdae520885 100644 --- a/llvm/lib/Target/X86/X86ISelDAGToDAG.cpp +++ b/llvm/lib/Target/X86/X86ISelDAGToDAG.cpp @@ -1800,10 +1800,10 @@ void X86DAGToDAGISel::emitFunctionEntryCode() { emitSpecialCodeForMain(); } -static bool isDispSafeForFrameIndex(int64_t Val) { - // On 64-bit platforms, we can run into an issue where a frame index +static bool isDispSafeForFrameIndexOrRegBase(int64_t Val) { + // We can run into an issue where a frame index or a register base // includes a displacement that, when added to the explicit displacement, - // will overflow the displacement field. Assuming that the frame index + // will overflow the displacement field. Assuming that the // displacement fits into a 31-bit integer (which is only slightly more // aggressive than the current fundamental assumption that it fits into // a 32-bit integer), a 31-bit disp should always be safe. @@ -1831,7 +1831,7 @@ bool X86DAGToDAGISel::foldOffsetIntoAddress(uint64_t Offset, // In addition to the checks required for a register base, check that // we do not try to use an unsafe Disp with a frame index. if (AM.BaseType == X86ISelAddressMode::FrameIndexBase && - !isDispSafeForFrameIndex(Val)) + !isDispSafeForFrameIndexOrRegBase(Val)) return true; // In ILP32 (x32) mode, pointers are 32 bits and need to be zero-extended to // 64 bits. 
Instructions with 32-bit register addresses perform this zero @@ -1849,10 +1849,14 @@ bool X86DAGToDAGISel::foldOffsetIntoAddress(uint64_t Offset, // to get an address size override to be emitted. However, this // pseudo-register is not part of any register class and therefore causes // MIR verification to fail. - if (Subtarget->isTarget64BitILP32() && !isUInt<31>(Val) && + if (Subtarget->isTarget64BitILP32() && + !isDispSafeForFrameIndexOrRegBase((uint32_t)Val) && !AM.hasBaseOrIndexReg()) return true; - } + } else if (AM.hasBaseOrIndexReg() && !isDispSafeForFrameIndexOrRegBase(Val)) + // For 32-bit X86, make sure the displacement still isn't close to the + // expressible limit. + return true; AM.Disp = Val; return false; } @@ -2553,7 +2557,7 @@ bool X86DAGToDAGISel::matchAddressRecursively(SDValue N, X86ISelAddressMode &AM, case ISD::FrameIndex: if (AM.BaseType == X86ISelAddressMode::RegBase && AM.Base_Reg.getNode() == nullptr && - (!Subtarget->is64Bit() || isDispSafeForFrameIndex(AM.Disp))) { + (!Subtarget->is64Bit() || isDispSafeForFrameIndexOrRegBase(AM.Disp))) { AM.BaseType = X86ISelAddressMode::FrameIndexBase; AM.Base_FrameIndex = cast(N)->getIndex(); return false; diff --git a/llvm/lib/Target/X86/X86ISelLowering.cpp b/llvm/lib/Target/X86/X86ISelLowering.cpp index 90e3e15b1fb46..84736f18011a9 100644 --- a/llvm/lib/Target/X86/X86ISelLowering.cpp +++ b/llvm/lib/Target/X86/X86ISelLowering.cpp @@ -26438,7 +26438,7 @@ SDValue X86TargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op, switch (CC) { case ISD::SETEQ: { SetCC = getSETCC(X86::COND_E, Comi, dl, DAG); - if (HasAVX10_2_COMX & HasAVX10_2_COMX_Ty) // ZF == 1 + if (HasAVX10_2_COMX && HasAVX10_2_COMX_Ty) // ZF == 1 break; // (ZF = 1 and PF = 0) SDValue SetNP = getSETCC(X86::COND_NP, Comi, dl, DAG); @@ -26447,7 +26447,7 @@ SDValue X86TargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op, } case ISD::SETNE: { SetCC = getSETCC(X86::COND_NE, Comi, dl, DAG); - if (HasAVX10_2_COMX & HasAVX10_2_COMX_Ty) // ZF == 0 + if 
(HasAVX10_2_COMX && HasAVX10_2_COMX_Ty) // ZF == 0 break; // (ZF = 0 or PF = 1) SDValue SetP = getSETCC(X86::COND_P, Comi, dl, DAG); @@ -37800,14 +37800,14 @@ X86TargetLowering::EmitInstrWithCustomInserter(MachineInstr &MI, case X86::PTILESTORED: Opc = GET_EGPR_IF_ENABLED(X86::TILESTORED); break; -#undef GET_EGPR_IF_ENABLED case X86::PTILELOADDRS: - Opc = X86::TILELOADDRS; + Opc = GET_EGPR_IF_ENABLED(X86::TILELOADDRS); break; case X86::PTILELOADDRST1: - Opc = X86::TILELOADDRST1; + Opc = GET_EGPR_IF_ENABLED(X86::TILELOADDRST1); break; } +#undef GET_EGPR_IF_ENABLED MachineInstrBuilder MIB = BuildMI(*BB, MI, MIMD, TII->get(Opc)); unsigned CurOp = 0; @@ -37838,34 +37838,36 @@ X86TargetLowering::EmitInstrWithCustomInserter(MachineInstr &MI, case X86::PT2RPNTLVWZ1RST1: { const DebugLoc &DL = MI.getDebugLoc(); unsigned Opc; +#define GET_EGPR_IF_ENABLED(OPC) (Subtarget.hasEGPR() ? OPC##_EVEX : OPC) switch (MI.getOpcode()) { default: llvm_unreachable("Unexpected instruction!"); case X86::PT2RPNTLVWZ0: - Opc = X86::T2RPNTLVWZ0; + Opc = GET_EGPR_IF_ENABLED(X86::T2RPNTLVWZ0); break; case X86::PT2RPNTLVWZ0T1: - Opc = X86::T2RPNTLVWZ0T1; + Opc = GET_EGPR_IF_ENABLED(X86::T2RPNTLVWZ0T1); break; case X86::PT2RPNTLVWZ1: - Opc = X86::T2RPNTLVWZ1; + Opc = GET_EGPR_IF_ENABLED(X86::T2RPNTLVWZ1); break; case X86::PT2RPNTLVWZ1T1: - Opc = X86::T2RPNTLVWZ1T1; + Opc = GET_EGPR_IF_ENABLED(X86::T2RPNTLVWZ1T1); break; case X86::PT2RPNTLVWZ0RS: - Opc = X86::T2RPNTLVWZ0RS; + Opc = GET_EGPR_IF_ENABLED(X86::T2RPNTLVWZ0RS); break; case X86::PT2RPNTLVWZ0RST1: - Opc = X86::T2RPNTLVWZ0RST1; + Opc = GET_EGPR_IF_ENABLED(X86::T2RPNTLVWZ0RST1); break; case X86::PT2RPNTLVWZ1RS: - Opc = X86::T2RPNTLVWZ1RS; + Opc = GET_EGPR_IF_ENABLED(X86::T2RPNTLVWZ1RS); break; case X86::PT2RPNTLVWZ1RST1: - Opc = X86::T2RPNTLVWZ1RST1; + Opc = GET_EGPR_IF_ENABLED(X86::T2RPNTLVWZ1RST1); break; } +#undef GET_EGPR_IF_ENABLED MachineInstrBuilder MIB = BuildMI(*BB, MI, DL, TII->get(Opc)); 
MIB.addReg(TMMImmToTMMPair(MI.getOperand(0).getImm()), RegState::Define); @@ -37890,8 +37892,8 @@ X86TargetLowering::EmitInstrWithCustomInserter(MachineInstr &MI, MI.eraseFromParent(); // The pseudo is gone now. return BB; } - case X86::PTCVTROWPS2PBF16Hrri: - case X86::PTCVTROWPS2PBF16Lrri: + case X86::PTCVTROWPS2BF16Hrri: + case X86::PTCVTROWPS2BF16Lrri: case X86::PTCVTROWPS2PHHrri: case X86::PTCVTROWPS2PHLrri: case X86::PTCVTROWD2PSrri: @@ -37904,14 +37906,14 @@ X86TargetLowering::EmitInstrWithCustomInserter(MachineInstr &MI, case X86::PTCVTROWD2PSrri: Opc = X86::TCVTROWD2PSrri; break; - case X86::PTCVTROWPS2PBF16Hrri: - Opc = X86::TCVTROWPS2PBF16Hrri; + case X86::PTCVTROWPS2BF16Hrri: + Opc = X86::TCVTROWPS2BF16Hrri; break; case X86::PTCVTROWPS2PHHrri: Opc = X86::TCVTROWPS2PHHrri; break; - case X86::PTCVTROWPS2PBF16Lrri: - Opc = X86::TCVTROWPS2PBF16Lrri; + case X86::PTCVTROWPS2BF16Lrri: + Opc = X86::TCVTROWPS2BF16Lrri; break; case X86::PTCVTROWPS2PHLrri: Opc = X86::TCVTROWPS2PHLrri; @@ -37928,8 +37930,8 @@ X86TargetLowering::EmitInstrWithCustomInserter(MachineInstr &MI, MI.eraseFromParent(); // The pseudo is gone now. 
return BB; } - case X86::PTCVTROWPS2PBF16Hrre: - case X86::PTCVTROWPS2PBF16Lrre: + case X86::PTCVTROWPS2BF16Hrre: + case X86::PTCVTROWPS2BF16Lrre: case X86::PTCVTROWPS2PHHrre: case X86::PTCVTROWPS2PHLrre: case X86::PTCVTROWD2PSrre: @@ -37942,11 +37944,11 @@ X86TargetLowering::EmitInstrWithCustomInserter(MachineInstr &MI, case X86::PTCVTROWD2PSrre: Opc = X86::TCVTROWD2PSrre; break; - case X86::PTCVTROWPS2PBF16Hrre: - Opc = X86::TCVTROWPS2PBF16Hrre; + case X86::PTCVTROWPS2BF16Hrre: + Opc = X86::TCVTROWPS2BF16Hrre; break; - case X86::PTCVTROWPS2PBF16Lrre: - Opc = X86::TCVTROWPS2PBF16Lrre; + case X86::PTCVTROWPS2BF16Lrre: + Opc = X86::TCVTROWPS2BF16Lrre; break; case X86::PTCVTROWPS2PHHrre: Opc = X86::TCVTROWPS2PHHrre; @@ -58570,8 +58572,9 @@ static SDValue combineEXTRACT_SUBVECTOR(SDNode *N, SelectionDAG &DAG, return SDValue(); } -static SDValue combineScalarToVector(SDNode *N, SelectionDAG &DAG, - const X86Subtarget &Subtarget) { +static SDValue combineSCALAR_TO_VECTOR(SDNode *N, SelectionDAG &DAG, + const X86Subtarget &Subtarget) { + using namespace SDPatternMatch; EVT VT = N->getValueType(0); SDValue Src = N->getOperand(0); SDLoc DL(N); @@ -58639,6 +58642,16 @@ static SDValue combineScalarToVector(SDNode *N, SelectionDAG &DAG, return DAG.getNode(X86ISD::MOVQ2DQ, DL, VT, SrcOp); } + if (VT == MVT::v4i32) { + SDValue HalfSrc; + // Combine (v4i32 (scalar_to_vector (i32 (anyext (bitcast (f16)))))) + // to remove XMM->GPR->XMM moves. + if (sd_match(Src, m_AnyExt(m_BitCast( + m_AllOf(m_SpecificVT(MVT::f16), m_Value(HalfSrc)))))) + return DAG.getBitcast( + VT, DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, MVT::v8f16, HalfSrc)); + } + // See if we're broadcasting the scalar value, in which case just reuse that. // Ensure the same SDValue from the SDNode use is being used. 
if (VT.getScalarType() == Src.getValueType()) @@ -59264,7 +59277,7 @@ SDValue X86TargetLowering::PerformDAGCombine(SDNode *N, // clang-format off default: break; case ISD::SCALAR_TO_VECTOR: - return combineScalarToVector(N, DAG, Subtarget); + return combineSCALAR_TO_VECTOR(N, DAG, Subtarget); case ISD::EXTRACT_VECTOR_ELT: case X86ISD::PEXTRW: case X86ISD::PEXTRB: diff --git a/llvm/lib/Target/X86/X86InstrAMX.td b/llvm/lib/Target/X86/X86InstrAMX.td index a055ba91d3e17..1beaaafb159e3 100644 --- a/llvm/lib/Target/X86/X86InstrAMX.td +++ b/llvm/lib/Target/X86/X86InstrAMX.td @@ -345,26 +345,33 @@ let Predicates = [HasAMXTILE, In64BitMode], isPseudo = true, SchedRW = [WriteSys def PTILEPAIRLOAD : PseudoI<(outs TILEPair:$dst), (ins opaquemem:$src), []>; } -let Predicates = [HasAMXTRANSPOSE, In64BitMode] in { - let SchedRW = [WriteSystem] in { - def T2RPNTLVWZ0 : I<0x6e, MRMSrcMemFSIB, (outs TILEPair:$dst), - (ins sibmem:$src), "t2rpntlvwz0\t{$src, $dst|$dst, $src}", - []>, VEX, WIG, T8,PS; +multiclass T2RPNTLVW_Base op1, bits<8> op2, string rs, string suffix> { + def Z0#rs#suffix : I, PS; + def Z0#rs#T1#suffix : I, PS; + def Z1#rs#suffix : I, PD; + def Z1#rs#T1#suffix : I, PD; +} - def T2RPNTLVWZ0T1 : I<0x6f, MRMSrcMemFSIB, (outs TILEPair:$dst), - (ins sibmem:$src), "t2rpntlvwz0t1\t{$src, $dst|$dst, $src}", - []>, VEX, T8,PS; +let Predicates = [HasAMXTRANSPOSE, In64BitMode], SchedRW = [WriteSystem] in + defm T2RPNTLVW : T2RPNTLVW_Base<0x6e, 0x6f, "", "">, T8, VEX; - def T2RPNTLVWZ1 : I<0x6e, MRMSrcMemFSIB, (outs TILEPair:$dst), - (ins sibmem:$src), "t2rpntlvwz1\t{$src, $dst|$dst, $src}", - []>, VEX, T8,PD; +let Predicates = [HasAMXTRANSPOSE, HasEGPR, In64BitMode], SchedRW = [WriteSystem] in + defm T2RPNTLVW : T2RPNTLVW_Base<0x6e, 0x6f, "", "_EVEX">, T8, EVEX, NoCD8; - def T2RPNTLVWZ1T1 : I<0x6f, MRMSrcMemFSIB, (outs TILEPair:$dst), - (ins sibmem:$src), "t2rpntlvwz1t1\t{$src, $dst|$dst, $src}", - []>, VEX, T8,PD; +let Predicates = [HasAMXMOVRS, HasAMXTRANSPOSE, In64BitMode], 
SchedRW = [WriteSystem] in + defm T2RPNTLVW : T2RPNTLVW_Base<0xf8, 0xf9, "RS", "">, T_MAP5, VEX; +let Predicates = [HasAMXMOVRS, HasAMXTRANSPOSE, HasEGPR, In64BitMode], SchedRW = [WriteSystem] in + defm T2RPNTLVW : T2RPNTLVW_Base<0xf8, 0xf9, "RS", "_EVEX">, T_MAP5, EVEX, NoCD8; + +let Predicates = [HasAMXTRANSPOSE, In64BitMode] in { + let SchedRW = [WriteSystem] in { def TTRANSPOSED : I<0x5f, MRMSrcReg, (outs TILE:$dst), (ins TILE:$src), - "ttransposed\t{$src, $dst|$dst, $src}", []>, VEX, T8,XS; + "ttransposed\t{$src, $dst|$dst, $src}", []>, VEX, T8, XS; let isPseudo = true in { def PT2RPNTLVWZ0V : PseudoI<(outs TILEPair:$dst), (ins GR16:$src1, GR16:$src2, GR16:$src3, opaquemem:$src4), @@ -491,22 +498,6 @@ let Predicates = [HasAMXCOMPLEX, HasAMXTRANSPOSE, In64BitMode], SchedRW = [Write } let Predicates = [HasAMXMOVRS, HasAMXTRANSPOSE, In64BitMode], SchedRW = [WriteSystem] in { - def T2RPNTLVWZ0RS : I<0xf8, MRMSrcMemFSIB, (outs TILEPair:$dst), - (ins sibmem:$src1), - "t2rpntlvwz0rs\t{$src1, $dst|$dst, $src1}", - []>, VEX, T_MAP5; - def T2RPNTLVWZ0RST1 : I<0xf9, MRMSrcMemFSIB, (outs TILEPair:$dst), - (ins sibmem:$src1), - "t2rpntlvwz0rst1\t{$src1, $dst|$dst, $src1}", - []>, VEX, T_MAP5; - def T2RPNTLVWZ1RS : I<0xf8, MRMSrcMemFSIB, (outs TILEPair:$dst), - (ins sibmem:$src1), - "t2rpntlvwz1rs\t{$src1, $dst|$dst, $src1}", - []>, VEX, T_MAP5, PD; - def T2RPNTLVWZ1RST1 : I<0xf9, MRMSrcMemFSIB, (outs TILEPair:$dst), - (ins sibmem:$src1), - "t2rpntlvwz1rst1\t{$src1, $dst|$dst, $src1}", - []>, VEX, T_MAP5, PD; let isPseudo = true in { def PT2RPNTLVWZ0RSV : PseudoI<(outs TILEPair:$dst), (ins GR16:$src1, GR16:$src2, GR16:$src3, opaquemem:$src4), @@ -529,16 +520,20 @@ let Predicates = [HasAMXMOVRS, HasAMXTRANSPOSE, In64BitMode], SchedRW = [WriteSy } } // HasAMXMOVRS, HasAMXTRANSPOSE -let Predicates = [HasAMXMOVRS, In64BitMode], SchedRW = [WriteSystem] in { - def TILELOADDRS : I<0x4a, MRMSrcMemFSIB, (outs TILE:$dst), - (ins sibmem:$src1), - "tileloaddrs\t{$src1, $dst|$dst, 
$src1}", - []>, VEX, T8, XD; - def TILELOADDRST1 : I<0x4a, MRMSrcMemFSIB, (outs TILE:$dst), - (ins sibmem:$src1), - "tileloaddrst1\t{$src1, $dst|$dst, $src1}", - []>, VEX, T8, PD; +multiclass TILELOADDRS_Base { + def suffix : I<0x4a, MRMSrcMemFSIB, (outs TILE:$dst), (ins sibmem:$src1), + "tileloaddrs\t{$src1, $dst|$dst, $src1}", []>, T8, XD; + def T1#suffix : I<0x4a, MRMSrcMemFSIB, (outs TILE:$dst), (ins sibmem:$src1), + "tileloaddrst1\t{$src1, $dst|$dst, $src1}", []>, T8, PD; +} + +let Predicates = [HasAMXMOVRS, In64BitMode], SchedRW = [WriteSystem] in + defm TILELOADDRS : TILELOADDRS_Base<"">, VEX; +let Predicates = [HasAMXMOVRS, HasEGPR, In64BitMode], SchedRW = [WriteSystem] in + defm TILELOADDRS : TILELOADDRS_Base<"_EVEX">, EVEX, NoCD8; + +let Predicates = [HasAMXMOVRS, In64BitMode], SchedRW = [WriteSystem] in { let isPseudo = true, mayLoad = 1 in { def PTILELOADDRSV : PseudoI<(outs TILE:$dst), (ins GR16:$src1, GR16:$src2, @@ -590,26 +585,26 @@ let Predicates = [HasAMXAVX512, HasAVX10_2_512, In64BitMode] in { [(set VR512: $dst, (int_x86_tcvtrowd2ps_internal GR16:$src1, GR16:$src2, TILE:$src3, GR32:$src4))]>; - def PTCVTROWPS2PBF16HrriV : PseudoI<(outs VR512:$dst), - (ins GR16:$src1, GR16:$src2, TILE:$src3, i32u8imm:$src4), - [(set VR512: $dst, - (int_x86_tcvtrowps2pbf16h_internal GR16:$src1, GR16:$src2, - TILE:$src3, imm:$src4))]>; - def PTCVTROWPS2PBF16HrreV : PseudoI<(outs VR512:$dst), - (ins GR16:$src1, GR16:$src2, TILE:$src3, GR32:$src4), - [(set VR512: $dst, - (int_x86_tcvtrowps2pbf16h_internal GR16:$src1, GR16:$src2, - TILE:$src3, GR32:$src4))]>; - def PTCVTROWPS2PBF16LrriV : PseudoI<(outs VR512:$dst), - (ins GR16:$src1, GR16:$src2, TILE:$src3, i32u8imm:$src4), - [(set VR512: $dst, - (int_x86_tcvtrowps2pbf16l_internal GR16:$src1, GR16:$src2, - TILE:$src3, imm:$src4))]>; - def PTCVTROWPS2PBF16LrreV : PseudoI<(outs VR512:$dst), - (ins GR16:$src1, GR16:$src2, TILE:$src3, GR32:$src4), - [(set VR512: $dst, - (int_x86_tcvtrowps2pbf16l_internal GR16:$src1, 
GR16:$src2, - TILE:$src3, GR32:$src4))]>; + def PTCVTROWPS2BF16HrriV : PseudoI<(outs VR512:$dst), + (ins GR16:$src1, GR16:$src2, TILE:$src3, i32u8imm:$src4), + [(set VR512: $dst, + (int_x86_tcvtrowps2bf16h_internal GR16:$src1, GR16:$src2, + TILE:$src3, imm:$src4))]>; + def PTCVTROWPS2BF16HrreV : PseudoI<(outs VR512:$dst), + (ins GR16:$src1, GR16:$src2, TILE:$src3, GR32:$src4), + [(set VR512: $dst, + (int_x86_tcvtrowps2bf16h_internal GR16:$src1, GR16:$src2, + TILE:$src3, GR32:$src4))]>; + def PTCVTROWPS2BF16LrriV : PseudoI<(outs VR512:$dst), + (ins GR16:$src1, GR16:$src2, TILE:$src3, i32u8imm:$src4), + [(set VR512: $dst, + (int_x86_tcvtrowps2bf16l_internal GR16:$src1, GR16:$src2, + TILE:$src3, imm:$src4))]>; + def PTCVTROWPS2BF16LrreV : PseudoI<(outs VR512:$dst), + (ins GR16:$src1, GR16:$src2, TILE:$src3, GR32:$src4), + [(set VR512: $dst, + (int_x86_tcvtrowps2bf16l_internal GR16:$src1, GR16:$src2, + TILE:$src3, GR32:$src4))]>; def PTCVTROWPS2PHHrriV : PseudoI<(outs VR512:$dst), (ins GR16:$src1, GR16:$src2, TILE:$src3, i32u8imm:$src4), [(set VR512: $dst, @@ -659,8 +654,8 @@ multiclass AMXAVX512_BASE Opcode1, bits<8> Opcode2, string Opstr, defm TCVTROWPS2PHH : AMXAVX512_BASE<0x6d, 0x07, "tcvtrowps2phh", PS, PS>; defm TCVTROWPS2PHL : AMXAVX512_BASE<0x6d, 0x77, "tcvtrowps2phl", PD, XD>; -defm TCVTROWPS2PBF16H : AMXAVX512_BASE<0x6d, 0x07, "tcvtrowps2pbf16h", XD, XD>; -defm TCVTROWPS2PBF16L : AMXAVX512_BASE<0x6d, 0x77, "tcvtrowps2pbf16l", XS, XS>; +defm TCVTROWPS2BF16H : AMXAVX512_BASE<0x6d, 0x07, "tcvtrowps2bf16h", XD, XD>; +defm TCVTROWPS2BF16L : AMXAVX512_BASE<0x6d, 0x77, "tcvtrowps2bf16l", XS, XS>; multiclass m_tilemovrow { let Predicates = [HasAMXAVX512, HasAVX10_2_512, In64BitMode] in { diff --git a/llvm/lib/Target/X86/X86InstrAVX10.td b/llvm/lib/Target/X86/X86InstrAVX10.td index 127016184bc17..edbcb17297603 100644 --- a/llvm/lib/Target/X86/X86InstrAVX10.td +++ b/llvm/lib/Target/X86/X86InstrAVX10.td @@ -1767,9 +1767,9 @@ multiclass vmovrs_p opc, string OpStr, 
X86VectorVTInfo _> { } multiclass vmovrs_p_vl opc, string OpStr, AVX512VLVectorVTInfo _Vec> { - let Predicates = [HasMOVRS, HasAVX10_2_512] in + let Predicates = [HasMOVRS, HasAVX10_2_512, In64BitMode] in defm Z : vmovrs_p, EVEX_V512; - let Predicates = [HasMOVRS, HasAVX10_2] in { + let Predicates = [HasMOVRS, HasAVX10_2, In64BitMode] in { defm Z128 : vmovrs_p, EVEX_V128; defm Z256 : vmovrs_p, EVEX_V256; } diff --git a/llvm/lib/Target/X86/X86InstrMisc.td b/llvm/lib/Target/X86/X86InstrMisc.td index 9fabe2acf0019..43c02c4f85844 100644 --- a/llvm/lib/Target/X86/X86InstrMisc.td +++ b/llvm/lib/Target/X86/X86InstrMisc.td @@ -1733,7 +1733,7 @@ def CLDEMOTE : I<0x1C, MRM0m, (outs), (ins i8mem:$src), "cldemote\t$src", // let SchedRW = [WriteLoad] in { -let Predicates = [HasMOVRS, NoEGPR] in { +let Predicates = [HasMOVRS, NoEGPR, In64BitMode] in { def MOVRS8rm : I<0x8A, MRMSrcMem, (outs GR8 :$dst), (ins i8mem :$src), "movrs{b}\t{$src, $dst|$dst, $src}", [(set GR8:$dst, (int_x86_movrsqi addr:$src))]>, T8; @@ -1746,8 +1746,25 @@ def MOVRS32rm : I<0x8B, MRMSrcMem, (outs GR32:$dst), (ins i32mem:$src), def MOVRS64rm : RI<0x8B, MRMSrcMem, (outs GR64:$dst), (ins i64mem:$src), "movrs{q}\t{$src, $dst|$dst, $src}", [(set GR64:$dst, (int_x86_movrsdi addr:$src))]>, T8; +} + +let Predicates = [HasMOVRS] in def PREFETCHRST2 : I<0x18, MRM4m, (outs), (ins i8mem:$src), "prefetchrst2\t$src", [(int_x86_prefetchrs addr:$src)]>, TB; + +let Predicates = [HasMOVRS, HasEGPR, In64BitMode] in { +def MOVRS8rm_EVEX : I<0x8A, MRMSrcMem, (outs GR8 :$dst), (ins i8mem :$src), + "movrs{b}\t{$src, $dst|$dst, $src}", + [(set GR8:$dst, (int_x86_movrsqi addr:$src))]>, EVEX, NoCD8, T_MAP4; +def MOVRS16rm_EVEX : I<0x8B, MRMSrcMem, (outs GR16:$dst), (ins i16mem:$src), + "movrs{w}\t{$src, $dst|$dst, $src}", + [(set GR16:$dst, (int_x86_movrshi addr:$src))]>, EVEX, NoCD8, PD, T_MAP4; +def MOVRS32rm_EVEX : I<0x8B, MRMSrcMem, (outs GR32:$dst), (ins i32mem:$src), + "movrs{l}\t{$src, $dst|$dst, $src}", + [(set GR32:$dst, 
(int_x86_movrssi addr:$src))]>, EVEX, NoCD8, T_MAP4; +def MOVRS64rm_EVEX : I<0x8B, MRMSrcMem, (outs GR64:$dst), (ins i64mem:$src), + "movrs{q}\t{$src, $dst|$dst, $src}", + [(set GR64:$dst, (int_x86_movrsdi addr:$src))]>, EVEX, NoCD8, T_MAP4, REX_W; +} } -} \ No newline at end of file diff --git a/llvm/lib/Target/X86/X86LowerAMXType.cpp b/llvm/lib/Target/X86/X86LowerAMXType.cpp index fe963dddaac1f..cd5813a5338ea 100644 --- a/llvm/lib/Target/X86/X86LowerAMXType.cpp +++ b/llvm/lib/Target/X86/X86LowerAMXType.cpp @@ -273,8 +273,8 @@ std::pair ShapeCalculator::getShape(IntrinsicInst *II, break; } case Intrinsic::x86_tcvtrowd2ps_internal: - case Intrinsic::x86_tcvtrowps2pbf16h_internal: - case Intrinsic::x86_tcvtrowps2pbf16l_internal: + case Intrinsic::x86_tcvtrowps2bf16h_internal: + case Intrinsic::x86_tcvtrowps2bf16l_internal: case Intrinsic::x86_tcvtrowps2phh_internal: case Intrinsic::x86_tcvtrowps2phl_internal: case Intrinsic::x86_tilemovrow_internal: { diff --git a/llvm/lib/Target/X86/X86PreTileConfig.cpp b/llvm/lib/Target/X86/X86PreTileConfig.cpp index d232a1d706549..96801636deb9e 100644 --- a/llvm/lib/Target/X86/X86PreTileConfig.cpp +++ b/llvm/lib/Target/X86/X86PreTileConfig.cpp @@ -122,10 +122,10 @@ class X86PreTileConfig : public MachineFunctionPass { case X86::PTILESTOREDV: case X86::PTCVTROWD2PSrreV: case X86::PTCVTROWD2PSrriV: - case X86::PTCVTROWPS2PBF16HrreV: - case X86::PTCVTROWPS2PBF16HrriV: - case X86::PTCVTROWPS2PBF16LrreV: - case X86::PTCVTROWPS2PBF16LrriV: + case X86::PTCVTROWPS2BF16HrreV: + case X86::PTCVTROWPS2BF16HrriV: + case X86::PTCVTROWPS2BF16LrreV: + case X86::PTCVTROWPS2BF16LrriV: case X86::PTCVTROWPS2PHHrreV: case X86::PTCVTROWPS2PHHrriV: case X86::PTCVTROWPS2PHLrreV: diff --git a/llvm/lib/TargetParser/AArch64TargetParser.cpp b/llvm/lib/TargetParser/AArch64TargetParser.cpp index 34ca03a47e0a4..e13c6e6d28c2b 100644 --- a/llvm/lib/TargetParser/AArch64TargetParser.cpp +++ b/llvm/lib/TargetParser/AArch64TargetParser.cpp @@ -48,12 +48,33 @@ 
std::optional AArch64::ArchInfo::findBySubArch(StringRef SubA return {}; } +std::optional lookupFMVByID(AArch64::ArchExtKind ExtID) { + for (const AArch64::FMVInfo &Info : AArch64::getFMVInfo()) + if (Info.ID && *Info.ID == ExtID) + return Info; + return {}; +} + uint64_t AArch64::getFMVPriority(ArrayRef Features) { - uint64_t Priority = 0; - for (StringRef Feature : Features) - if (std::optional Info = parseFMVExtension(Feature)) - Priority |= (1ULL << Info->PriorityBit); - return Priority; + // Transitively enable the Arch Extensions which correspond to each feature. + ExtensionSet FeatureBits; + for (const StringRef Feature : Features) { + std::optional FMV = parseFMVExtension(Feature); + if (!FMV) { + if (std::optional Info = targetFeatureToExtension(Feature)) + FMV = lookupFMVByID(Info->ID); + } + if (FMV && FMV->ID) + FeatureBits.enable(*FMV->ID); + } + + // Construct a bitmask for all the transitively enabled Arch Extensions. + uint64_t PriorityMask = 0; + for (const FMVInfo &Info : getFMVInfo()) + if (Info.ID && FeatureBits.Enabled.test(*Info.ID)) + PriorityMask |= (1ULL << Info.PriorityBit); + + return PriorityMask; } uint64_t AArch64::getCpuSupportsMask(ArrayRef Features) { diff --git a/llvm/lib/TargetParser/Host.cpp b/llvm/lib/TargetParser/Host.cpp index 9d1b7b8b0e7cd..979b44b22338e 100644 --- a/llvm/lib/TargetParser/Host.cpp +++ b/llvm/lib/TargetParser/Host.cpp @@ -1509,6 +1509,18 @@ StringRef sys::getHostCPUName() { return getCPUNameFromS390Model(Id, HaveVectorSupport); } #elif defined(__APPLE__) && (defined(__arm__) || defined(__aarch64__)) +// Copied from in the macOS SDK. +// +// Also available here, though usually not as up-to-date: +// https://github.com/apple-oss-distributions/xnu/blob/xnu-11215.41.3/osfmk/mach/machine.h#L403-L452. 
+#define CPUFAMILY_UNKNOWN 0 +#define CPUFAMILY_ARM_9 0xe73283ae +#define CPUFAMILY_ARM_11 0x8ff620d8 +#define CPUFAMILY_ARM_XSCALE 0x53b005f5 +#define CPUFAMILY_ARM_12 0xbd1b0ae9 +#define CPUFAMILY_ARM_13 0x0cc90e64 +#define CPUFAMILY_ARM_14 0x96077ef1 +#define CPUFAMILY_ARM_15 0xa8511bca #define CPUFAMILY_ARM_SWIFT 0x1e2d6381 #define CPUFAMILY_ARM_CYCLONE 0x37a09642 #define CPUFAMILY_ARM_TYPHOON 0x2c91a47e @@ -1520,13 +1532,46 @@ StringRef sys::getHostCPUName() { #define CPUFAMILY_ARM_FIRESTORM_ICESTORM 0x1b588bb3 #define CPUFAMILY_ARM_BLIZZARD_AVALANCHE 0xda33d83d #define CPUFAMILY_ARM_EVEREST_SAWTOOTH 0x8765edea +#define CPUFAMILY_ARM_IBIZA 0xfa33415e +#define CPUFAMILY_ARM_PALMA 0x72015832 +#define CPUFAMILY_ARM_COLL 0x2876f5b5 +#define CPUFAMILY_ARM_LOBOS 0x5f4dea93 +#define CPUFAMILY_ARM_DONAN 0x6f5129ac +#define CPUFAMILY_ARM_BRAVA 0x17d5b93a +#define CPUFAMILY_ARM_TAHITI 0x75d4acb9 +#define CPUFAMILY_ARM_TUPAI 0x204526d0 StringRef sys::getHostCPUName() { uint32_t Family; size_t Length = sizeof(Family); sysctlbyname("hw.cpufamily", &Family, &Length, NULL, 0); + // This is found by testing on actual hardware, and by looking at: + // https://github.com/apple-oss-distributions/xnu/blob/xnu-11215.41.3/osfmk/arm/cpuid.c#L109-L231. + // + // Another great resource is + // https://github.com/AsahiLinux/docs/wiki/Codenames. + // + // NOTE: We choose to return `apple-mX` instead of `apple-aX`, since the M1, + // M2, M3 etc. aliases are more widely known to users than A14, A15, A16 etc. + // (and this code is basically only used on host macOS anyways). 
switch (Family) { + case CPUFAMILY_UNKNOWN: + return "generic"; + case CPUFAMILY_ARM_9: + return "arm920t"; // or arm926ej-s + case CPUFAMILY_ARM_11: + return "arm1136jf-s"; + case CPUFAMILY_ARM_XSCALE: + return "xscale"; + case CPUFAMILY_ARM_12: // Seems unused by the kernel + return "generic"; + case CPUFAMILY_ARM_13: + return "cortex-a8"; + case CPUFAMILY_ARM_14: + return "cortex-a9"; + case CPUFAMILY_ARM_15: + return "cortex-a7"; case CPUFAMILY_ARM_SWIFT: return "swift"; case CPUFAMILY_ARM_CYCLONE: @@ -1543,15 +1588,25 @@ StringRef sys::getHostCPUName() { return "apple-a12"; case CPUFAMILY_ARM_LIGHTNING_THUNDER: return "apple-a13"; - case CPUFAMILY_ARM_FIRESTORM_ICESTORM: + case CPUFAMILY_ARM_FIRESTORM_ICESTORM: // A14 / M1 return "apple-m1"; - case CPUFAMILY_ARM_BLIZZARD_AVALANCHE: + case CPUFAMILY_ARM_BLIZZARD_AVALANCHE: // A15 / M2 return "apple-m2"; - case CPUFAMILY_ARM_EVEREST_SAWTOOTH: + case CPUFAMILY_ARM_EVEREST_SAWTOOTH: // A16 + case CPUFAMILY_ARM_IBIZA: // M3 + case CPUFAMILY_ARM_PALMA: // M3 Max + case CPUFAMILY_ARM_LOBOS: // M3 Pro return "apple-m3"; + case CPUFAMILY_ARM_COLL: // A17 Pro + return "apple-a17"; + case CPUFAMILY_ARM_DONAN: // M4 + case CPUFAMILY_ARM_BRAVA: // M4 Max + case CPUFAMILY_ARM_TAHITI: // A18 Pro + case CPUFAMILY_ARM_TUPAI: // A18 + return "apple-m4"; default: // Default to the newest CPU we know about. - return "apple-m3"; + return "apple-m4"; } } #elif defined(_AIX) diff --git a/llvm/lib/Transforms/IPO/ArgumentPromotion.cpp b/llvm/lib/Transforms/IPO/ArgumentPromotion.cpp index d540e6ca86154..c440638884322 100644 --- a/llvm/lib/Transforms/IPO/ArgumentPromotion.cpp +++ b/llvm/lib/Transforms/IPO/ArgumentPromotion.cpp @@ -259,8 +259,7 @@ doPromotion(Function *F, FunctionAnalysisManager &FAM, // all promoted loads. 
if (LI->hasMetadata(LLVMContext::MD_noundef)) LI->copyMetadata(*Pair.second.MustExecInstr, - {LLVMContext::MD_range, LLVMContext::MD_nonnull, - LLVMContext::MD_align}); + Metadata::PoisonGeneratingIDs); } Args.push_back(LI); ArgAttrVec.push_back(AttributeSet()); diff --git a/llvm/lib/Transforms/IPO/FunctionAttrs.cpp b/llvm/lib/Transforms/IPO/FunctionAttrs.cpp index 03cb14c1270c2..56bfc8432cbb2 100644 --- a/llvm/lib/Transforms/IPO/FunctionAttrs.cpp +++ b/llvm/lib/Transforms/IPO/FunctionAttrs.cpp @@ -890,14 +890,11 @@ determinePointerAccessAttrs(Argument *A, // can participate in the speculation. break; - const bool IsByVal = - CB.isArgOperand(U) && CB.isByValArgument(CB.getArgOperandNo(U)); - // The accessors used on call site here do the right thing for calls and // invokes with operand bundles. if (CB.doesNotAccessMemory(UseIndex)) { /* nop */ - } else if (!isModSet(ArgMR) || CB.onlyReadsMemory(UseIndex) || IsByVal) { + } else if (!isModSet(ArgMR) || CB.onlyReadsMemory(UseIndex)) { IsRead = true; } else if (!isRefSet(ArgMR) || CB.dataOperandHasImpliedAttr(UseIndex, Attribute::WriteOnly)) { diff --git a/llvm/lib/Transforms/IPO/GlobalOpt.cpp b/llvm/lib/Transforms/IPO/GlobalOpt.cpp index 78cd249c9c16a..eb97d8b4a74f3 100644 --- a/llvm/lib/Transforms/IPO/GlobalOpt.cpp +++ b/llvm/lib/Transforms/IPO/GlobalOpt.cpp @@ -2641,6 +2641,173 @@ DeleteDeadIFuncs(Module &M, return Changed; } +// Follows the use-def chain of \p V backwards until it finds a Function, +// in which case it collects in \p Versions. Return true on successful +// use-def chain traversal, false otherwise. 
+static bool collectVersions(TargetTransformInfo &TTI, Value *V, + SmallVectorImpl &Versions) { + if (auto *F = dyn_cast(V)) { + if (!TTI.isMultiversionedFunction(*F)) + return false; + Versions.push_back(F); + } else if (auto *Sel = dyn_cast(V)) { + if (!collectVersions(TTI, Sel->getTrueValue(), Versions)) + return false; + if (!collectVersions(TTI, Sel->getFalseValue(), Versions)) + return false; + } else if (auto *Phi = dyn_cast(V)) { + for (unsigned I = 0, E = Phi->getNumIncomingValues(); I != E; ++I) + if (!collectVersions(TTI, Phi->getIncomingValue(I), Versions)) + return false; + } else { + // Unknown instruction type. Bail. + return false; + } + return true; +} + +// Bypass the IFunc Resolver of MultiVersioned functions when possible. To +// deduce whether the optimization is legal we need to compare the target +// features between caller and callee versions. The criteria for bypassing +// the resolver are the following: +// +// * If the callee's feature set is a subset of the caller's feature set, +// then the callee is a candidate for direct call. +// +// * Among such candidates the one of highest priority is the best match +// and it shall be picked, unless there is a version of the callee with +// higher priority than the best match which cannot be picked from a +// higher priority caller (directly or through the resolver). +// +// * For every higher priority callee version than the best match, there +// is a higher priority caller version whose feature set availability +// is implied by the callee's feature set. +// +static bool OptimizeNonTrivialIFuncs( + Module &M, function_ref GetTTI) { + bool Changed = false; + + // Cache containing the mask constructed from a function's target features. 
+ DenseMap FeatureMask; + + for (GlobalIFunc &IF : M.ifuncs()) { + if (IF.isInterposable()) + continue; + + Function *Resolver = IF.getResolverFunction(); + if (!Resolver) + continue; + + if (Resolver->isInterposable()) + continue; + + TargetTransformInfo &TTI = GetTTI(*Resolver); + + // Discover the callee versions. + SmallVector Callees; + if (any_of(*Resolver, [&TTI, &Callees](BasicBlock &BB) { + if (auto *Ret = dyn_cast_or_null(BB.getTerminator())) + if (!collectVersions(TTI, Ret->getReturnValue(), Callees)) + return true; + return false; + })) + continue; + + assert(!Callees.empty() && "Expecting successful collection of versions"); + + // Cache the feature mask for each callee. + for (Function *Callee : Callees) { + auto [It, Inserted] = FeatureMask.try_emplace(Callee); + if (Inserted) + It->second = TTI.getFeatureMask(*Callee); + } + + // Sort the callee versions in decreasing priority order. + sort(Callees, [&](auto *LHS, auto *RHS) { + return FeatureMask[LHS] > FeatureMask[RHS]; + }); + + // Find the callsites and cache the feature mask for each caller. + SmallVector Callers; + DenseMap> CallSites; + for (User *U : IF.users()) { + if (auto *CB = dyn_cast(U)) { + if (CB->getCalledOperand() == &IF) { + Function *Caller = CB->getFunction(); + auto [FeatIt, FeatInserted] = FeatureMask.try_emplace(Caller); + if (FeatInserted) + FeatIt->second = TTI.getFeatureMask(*Caller); + auto [CallIt, CallInserted] = CallSites.try_emplace(Caller); + if (CallInserted) + Callers.push_back(Caller); + CallIt->second.push_back(CB); + } + } + } + + // Sort the caller versions in decreasing priority order. + sort(Callers, [&](auto *LHS, auto *RHS) { + return FeatureMask[LHS] > FeatureMask[RHS]; + }); + + auto implies = [](uint64_t A, uint64_t B) { return (A & B) == B; }; + + // Index to the highest priority candidate. + unsigned I = 0; + // Now try to redirect calls starting from higher priority callers. 
+ for (Function *Caller : Callers) { + assert(I < Callees.size() && "Found callers of equal priority"); + + Function *Callee = Callees[I]; + uint64_t CallerBits = FeatureMask[Caller]; + uint64_t CalleeBits = FeatureMask[Callee]; + + // In the case of FMV callers, we know that all higher priority callers + // than the current one did not get selected at runtime, which helps + // reason about the callees (if they have versions that mandate presence + // of the features which we already know are unavailable on this target). + if (TTI.isMultiversionedFunction(*Caller)) { + // If the feature set of the caller implies the feature set of the + // highest priority candidate then it shall be picked. In case of + // identical sets advance the candidate index one position. + if (CallerBits == CalleeBits) + ++I; + else if (!implies(CallerBits, CalleeBits)) { + // Keep advancing the candidate index as long as the caller's + // features are a subset of the current candidate's. + while (implies(CalleeBits, CallerBits)) { + if (++I == Callees.size()) + break; + CalleeBits = FeatureMask[Callees[I]]; + } + continue; + } + } else { + // We can't reason much about non-FMV callers. Just pick the highest + // priority callee if it matches, otherwise bail. + // if (I > 0 || !implies(CallerBits, CalleeBits)) + // + // FIXME: This is causing a regression in the llvm test suite, + // specifically a 'predres' version is unexpectedly trapping on + // GravitonG4. My explanation is that when the caller is not a + // versioned function, the compiler exclusively relies on the + // command line option, or target attribute to deduce whether a + // feature is available. However, there is no guarantee that in + // reality the host supports those implied features.
+ continue; + } + auto &Calls = CallSites[Caller]; + for (CallBase *CS : Calls) + CS->setCalledOperand(Callee); + Changed = true; + } + if (IF.use_empty() || + all_of(IF.users(), [](User *U) { return isa(U); })) + NumIFuncsResolved++; + } + return Changed; +} + static bool optimizeGlobalsInModule(Module &M, const DataLayout &DL, function_ref GetTLI, @@ -2707,6 +2874,9 @@ optimizeGlobalsInModule(Module &M, const DataLayout &DL, // Optimize IFuncs whose callee's are statically known. LocalChange |= OptimizeStaticIFuncs(M); + // Optimize IFuncs based on the target features of the caller. + LocalChange |= OptimizeNonTrivialIFuncs(M, GetTTI); + // Remove any IFuncs that are now dead. LocalChange |= DeleteDeadIFuncs(M, NotDiscardableComdats); diff --git a/llvm/lib/Transforms/InstCombine/InstCombineAndOrXor.cpp b/llvm/lib/Transforms/InstCombine/InstCombineAndOrXor.cpp index f82a557e5760c..ca8a20b4b7312 100644 --- a/llvm/lib/Transforms/InstCombine/InstCombineAndOrXor.cpp +++ b/llvm/lib/Transforms/InstCombine/InstCombineAndOrXor.cpp @@ -179,10 +179,10 @@ static unsigned conjugateICmpMask(unsigned Mask) { } // Adapts the external decomposeBitTestICmp for local use. -static bool decomposeBitTestICmp(Value *LHS, Value *RHS, CmpInst::Predicate &Pred, +static bool decomposeBitTestICmp(Value *Cond, CmpInst::Predicate &Pred, Value *&X, Value *&Y, Value *&Z) { - auto Res = llvm::decomposeBitTestICmp( - LHS, RHS, Pred, /*LookThroughTrunc=*/true, /*AllowNonZeroC=*/true); + auto Res = llvm::decomposeBitTest(Cond, /*LookThroughTrunc=*/true, + /*AllowNonZeroC=*/true); if (!Res) return false; @@ -198,13 +198,10 @@ static bool decomposeBitTestICmp(Value *LHS, Value *RHS, CmpInst::Predicate &Pre /// the right hand side as a pair. /// LHS and RHS are the left hand side and the right hand side ICmps and PredL /// and PredR are their predicates, respectively. 
-static std::optional> getMaskedTypeForICmpPair( - Value *&A, Value *&B, Value *&C, Value *&D, Value *&E, ICmpInst *LHS, - ICmpInst *RHS, ICmpInst::Predicate &PredL, ICmpInst::Predicate &PredR) { - // Don't allow pointers. Splat vectors are fine. - if (!LHS->getOperand(0)->getType()->isIntOrIntVectorTy() || - !RHS->getOperand(0)->getType()->isIntOrIntVectorTy()) - return std::nullopt; +static std::optional> +getMaskedTypeForICmpPair(Value *&A, Value *&B, Value *&C, Value *&D, Value *&E, + Value *LHS, Value *RHS, ICmpInst::Predicate &PredL, + ICmpInst::Predicate &PredR) { // Here comes the tricky part: // LHS might be of the form L11 & L12 == X, X == L21 & L22, @@ -212,13 +209,23 @@ static std::optional> getMaskedTypeForICmpPair( // Now we must find those components L** and R**, that are equal, so // that we can extract the parameters A, B, C, D, and E for the canonical // above. - Value *L1 = LHS->getOperand(0); - Value *L2 = LHS->getOperand(1); - Value *L11, *L12, *L21, *L22; + // Check whether the icmp can be decomposed into a bit test. - if (decomposeBitTestICmp(L1, L2, PredL, L11, L12, L2)) { + Value *L1, *L11, *L12, *L2, *L21, *L22; + if (decomposeBitTestICmp(LHS, PredL, L11, L12, L2)) { L21 = L22 = L1 = nullptr; } else { + auto *LHSCMP = dyn_cast(LHS); + if (!LHSCMP) + return std::nullopt; + + // Don't allow pointers. Splat vectors are fine. + if (!LHSCMP->getOperand(0)->getType()->isIntOrIntVectorTy()) + return std::nullopt; + + PredL = LHSCMP->getPredicate(); + L1 = LHSCMP->getOperand(0); + L2 = LHSCMP->getOperand(1); // Look for ANDs in the LHS icmp. 
if (!match(L1, m_And(m_Value(L11), m_Value(L12)))) { // Any icmp can be viewed as being trivially masked; if it allows us to @@ -237,11 +244,8 @@ static std::optional> getMaskedTypeForICmpPair( if (!ICmpInst::isEquality(PredL)) return std::nullopt; - Value *R1 = RHS->getOperand(0); - Value *R2 = RHS->getOperand(1); - Value *R11, *R12; - bool Ok = false; - if (decomposeBitTestICmp(R1, R2, PredR, R11, R12, R2)) { + Value *R11, *R12, *R2; + if (decomposeBitTestICmp(RHS, PredR, R11, R12, R2)) { if (R11 == L11 || R11 == L12 || R11 == L21 || R11 == L22) { A = R11; D = R12; @@ -252,9 +256,19 @@ static std::optional> getMaskedTypeForICmpPair( return std::nullopt; } E = R2; - R1 = nullptr; - Ok = true; } else { + auto *RHSCMP = dyn_cast(RHS); + if (!RHSCMP) + return std::nullopt; + // Don't allow pointers. Splat vectors are fine. + if (!RHSCMP->getOperand(0)->getType()->isIntOrIntVectorTy()) + return std::nullopt; + + PredR = RHSCMP->getPredicate(); + + Value *R1 = RHSCMP->getOperand(0); + R2 = RHSCMP->getOperand(1); + bool Ok = false; if (!match(R1, m_And(m_Value(R11), m_Value(R12)))) { // As before, model no mask as a trivial mask if it'll let us do an // optimization. @@ -277,36 +291,32 @@ static std::optional> getMaskedTypeForICmpPair( // Avoid matching against the -1 value we created for unmasked operand. if (Ok && match(A, m_AllOnes())) Ok = false; + + // Look for ANDs on the right side of the RHS icmp. + if (!Ok) { + if (!match(R2, m_And(m_Value(R11), m_Value(R12)))) { + R11 = R2; + R12 = Constant::getAllOnesValue(R2->getType()); + } + + if (R11 == L11 || R11 == L12 || R11 == L21 || R11 == L22) { + A = R11; + D = R12; + E = R1; + } else if (R12 == L11 || R12 == L12 || R12 == L21 || R12 == L22) { + A = R12; + D = R11; + E = R1; + } else { + return std::nullopt; + } + } } // Bail if RHS was a icmp that can't be decomposed into an equality. if (!ICmpInst::isEquality(PredR)) return std::nullopt; - // Look for ANDs on the right side of the RHS icmp. 
- if (!Ok) { - if (!match(R2, m_And(m_Value(R11), m_Value(R12)))) { - R11 = R2; - R12 = Constant::getAllOnesValue(R2->getType()); - } - - if (R11 == L11 || R11 == L12 || R11 == L21 || R11 == L22) { - A = R11; - D = R12; - E = R1; - Ok = true; - } else if (R12 == L11 || R12 == L12 || R12 == L21 || R12 == L22) { - A = R12; - D = R11; - E = R1; - Ok = true; - } else { - return std::nullopt; - } - - assert(Ok && "Failed to find AND on the right side of the RHS icmp."); - } - if (L11 == A) { B = L12; C = L2; @@ -333,8 +343,8 @@ static std::optional> getMaskedTypeForICmpPair( /// (icmp (A & 12) != 0) & (icmp (A & 15) == 8) -> (icmp (A & 15) == 8). /// Also used for logical and/or, must be poison safe. static Value *foldLogOpOfMaskedICmps_NotAllZeros_BMask_Mixed( - ICmpInst *LHS, ICmpInst *RHS, bool IsAnd, Value *A, Value *B, Value *D, - Value *E, ICmpInst::Predicate PredL, ICmpInst::Predicate PredR, + Value *LHS, Value *RHS, bool IsAnd, Value *A, Value *B, Value *D, Value *E, + ICmpInst::Predicate PredL, ICmpInst::Predicate PredR, InstCombiner::BuilderTy &Builder) { // We are given the canonical form: // (icmp ne (A & B), 0) & (icmp eq (A & D), E). @@ -457,7 +467,8 @@ static Value *foldLogOpOfMaskedICmps_NotAllZeros_BMask_Mixed( // (icmp ne (A & 15), 0) & (icmp eq (A & 15), 8) -> (icmp eq (A & 15), 8). if (IsSuperSetOrEqual(BCst, DCst)) { // We can't guarantee that samesign hold after this fold. - RHS->setSameSign(false); + if (auto *ICmp = dyn_cast(RHS)) + ICmp->setSameSign(false); return RHS; } // Otherwise, B is a subset of D. If B and E have a common bit set, @@ -466,7 +477,8 @@ static Value *foldLogOpOfMaskedICmps_NotAllZeros_BMask_Mixed( assert(IsSubSetOrEqual(BCst, DCst) && "Precondition due to above code"); if ((*BCst & ECst) != 0) { // We can't guarantee that samesign hold after this fold. 
- RHS->setSameSign(false); + if (auto *ICmp = dyn_cast(RHS)) + ICmp->setSameSign(false); return RHS; } // Otherwise, LHS and RHS contradict and the whole expression becomes false @@ -481,8 +493,8 @@ static Value *foldLogOpOfMaskedICmps_NotAllZeros_BMask_Mixed( /// aren't of the common mask pattern type. /// Also used for logical and/or, must be poison safe. static Value *foldLogOpOfMaskedICmpsAsymmetric( - ICmpInst *LHS, ICmpInst *RHS, bool IsAnd, Value *A, Value *B, Value *C, - Value *D, Value *E, ICmpInst::Predicate PredL, ICmpInst::Predicate PredR, + Value *LHS, Value *RHS, bool IsAnd, Value *A, Value *B, Value *C, Value *D, + Value *E, ICmpInst::Predicate PredL, ICmpInst::Predicate PredR, unsigned LHSMask, unsigned RHSMask, InstCombiner::BuilderTy &Builder) { assert(ICmpInst::isEquality(PredL) && ICmpInst::isEquality(PredR) && "Expected equality predicates for masked type of icmps."); @@ -511,12 +523,12 @@ static Value *foldLogOpOfMaskedICmpsAsymmetric( /// Try to fold (icmp(A & B) ==/!= C) &/| (icmp(A & D) ==/!= E) /// into a single (icmp(A & X) ==/!= Y). 
-static Value *foldLogOpOfMaskedICmps(ICmpInst *LHS, ICmpInst *RHS, bool IsAnd, +static Value *foldLogOpOfMaskedICmps(Value *LHS, Value *RHS, bool IsAnd, bool IsLogical, InstCombiner::BuilderTy &Builder, const SimplifyQuery &Q) { Value *A = nullptr, *B = nullptr, *C = nullptr, *D = nullptr, *E = nullptr; - ICmpInst::Predicate PredL = LHS->getPredicate(), PredR = RHS->getPredicate(); + ICmpInst::Predicate PredL, PredR; std::optional> MaskPair = getMaskedTypeForICmpPair(A, B, C, D, E, LHS, RHS, PredL, PredR); if (!MaskPair) @@ -1066,8 +1078,7 @@ static Value *foldPowerOf2AndShiftedMask(ICmpInst *Cmp0, ICmpInst *Cmp1, if (!JoinedByAnd) return nullptr; Value *A = nullptr, *B = nullptr, *C = nullptr, *D = nullptr, *E = nullptr; - ICmpInst::Predicate CmpPred0 = Cmp0->getPredicate(), - CmpPred1 = Cmp1->getPredicate(); + ICmpInst::Predicate CmpPred0, CmpPred1; // Assuming P is a 2^n, getMaskedTypeForICmpPair will normalize (icmp X u< // 2^n) into (icmp (X & ~(2^n-1)) == 0) and (icmp X s> -1) into (icmp (X & // SignMask) == 0). @@ -1672,12 +1683,9 @@ static Instruction *reassociateFCmps(BinaryOperator &BO, // and (fcmp ord X, 0), (and (fcmp ord Y, 0), Z) --> and (fcmp ord X, Y), Z // or (fcmp uno X, 0), (or (fcmp uno Y, 0), Z) --> or (fcmp uno X, Y), Z - Value *NewFCmp = Builder.CreateFCmp(NanPred, X, Y); - if (auto *NewFCmpInst = dyn_cast(NewFCmp)) { - // Intersect FMF from the 2 source fcmps. - NewFCmpInst->copyIRFlags(Op0); - NewFCmpInst->andIRFlags(BO10); - } + // Intersect FMF from the 2 source fcmps. 
+ Value *NewFCmp = + Builder.CreateFCmpFMF(NanPred, X, Y, FMFSource::intersect(Op0, BO10)); return BinaryOperator::Create(Opcode, NewFCmp, BO11); } diff --git a/llvm/lib/Transforms/InstCombine/InstCombineCalls.cpp b/llvm/lib/Transforms/InstCombine/InstCombineCalls.cpp index dd5a4ba5a4724..842881156dc67 100644 --- a/llvm/lib/Transforms/InstCombine/InstCombineCalls.cpp +++ b/llvm/lib/Transforms/InstCombine/InstCombineCalls.cpp @@ -2522,13 +2522,12 @@ Instruction *InstCombinerImpl::visitCallInst(CallInst &CI) { default: llvm_unreachable("unexpected intrinsic ID"); } - Value *V = Builder.CreateBinaryIntrinsic( - IID, X, ConstantFP::get(Arg0->getType(), Res), II); // TODO: Conservatively intersecting FMF. If Res == C2, the transform // was a simplification (so Arg0 and its original flags could // propagate?) - if (auto *CI = dyn_cast(V)) - CI->andIRFlags(M); + Value *V = Builder.CreateBinaryIntrinsic( + IID, X, ConstantFP::get(Arg0->getType(), Res), + FMFSource::intersect(II, M)); return replaceInstUsesWith(*II, V); } } @@ -2623,13 +2622,11 @@ Instruction *InstCombinerImpl::visitCallInst(CallInst &CI) { } case Intrinsic::fmuladd: { // Try to simplify the underlying FMul. - if (Value *V = simplifyFMulInst(II->getArgOperand(0), II->getArgOperand(1), - II->getFastMathFlags(), - SQ.getWithInstruction(II))) { - auto *FAdd = BinaryOperator::CreateFAdd(V, II->getArgOperand(2)); - FAdd->copyFastMathFlags(II); - return FAdd; - } + if (Value *V = + simplifyFMulInst(II->getArgOperand(0), II->getArgOperand(1), + II->getFastMathFlags(), SQ.getWithInstruction(II))) + return BinaryOperator::CreateFAddFMF(V, II->getArgOperand(2), + II->getFastMathFlags()); [[fallthrough]]; } @@ -2656,11 +2653,8 @@ Instruction *InstCombinerImpl::visitCallInst(CallInst &CI) { // Try to simplify the underlying FMul. We can only apply simplifications // that do not require rounding. 
if (Value *V = simplifyFMAFMul(Src0, Src1, II->getFastMathFlags(), - SQ.getWithInstruction(II))) { - auto *FAdd = BinaryOperator::CreateFAdd(V, Src2); - FAdd->copyFastMathFlags(II); - return FAdd; - } + SQ.getWithInstruction(II))) + return BinaryOperator::CreateFAddFMF(V, Src2, II->getFastMathFlags()); // fma x, y, 0 -> fmul x, y // This is always valid for -0.0, but requires nsz for +0.0 as @@ -2754,8 +2748,7 @@ Instruction *InstCombinerImpl::visitCallInst(CallInst &CI) { m_CopySign(m_Value(Magnitude), m_Value(Sign)))) { // fabs (copysign x, y) -> (fabs x) CallInst *AbsSign = - Builder.CreateCall(II->getCalledFunction(), {Magnitude}); - AbsSign->copyFastMathFlags(II); + Builder.CreateUnaryIntrinsic(Intrinsic::fabs, Magnitude, II); return replaceInstUsesWith(*II, AbsSign); } @@ -2862,16 +2855,15 @@ Instruction *InstCombinerImpl::visitCallInst(CallInst &CI) { Value *NewLdexp = nullptr; Value *Select = nullptr; if (match(SelectRHS, m_ZeroInt())) { - NewLdexp = Builder.CreateLdexp(Src, SelectLHS); + NewLdexp = Builder.CreateLdexp(Src, SelectLHS, II); Select = Builder.CreateSelect(SelectCond, NewLdexp, Src); } else if (match(SelectLHS, m_ZeroInt())) { - NewLdexp = Builder.CreateLdexp(Src, SelectRHS); + NewLdexp = Builder.CreateLdexp(Src, SelectRHS, II); Select = Builder.CreateSelect(SelectCond, Src, NewLdexp); } if (NewLdexp) { Select->takeName(II); - cast(NewLdexp)->copyFastMathFlags(II); return replaceInstUsesWith(*II, Select); } } @@ -3253,12 +3245,13 @@ Instruction *InstCombinerImpl::visitCallInst(CallInst &CI) { // call void @llvm.assume(i1 %D) // into // call void @llvm.assume(i1 true) [ "align"(i32* [[A]], i64 Constant + 1)] - uint64_t AlignMask; + uint64_t AlignMask = 1; if (EnableKnowledgeRetention && - match(IIOperand, - m_SpecificICmp(ICmpInst::ICMP_EQ, - m_And(m_Value(A), m_ConstantInt(AlignMask)), - m_Zero()))) { + (match(IIOperand, m_Not(m_Trunc(m_Value(A)))) || + match(IIOperand, + m_SpecificICmp(ICmpInst::ICMP_EQ, + m_And(m_Value(A), 
m_ConstantInt(AlignMask)), + m_Zero())))) { if (isPowerOf2_64(AlignMask + 1)) { uint64_t Offset = 0; match(A, m_Add(m_Value(A), m_ConstantInt(Offset))); diff --git a/llvm/lib/Transforms/InstCombine/InstCombineLoadStoreAlloca.cpp b/llvm/lib/Transforms/InstCombine/InstCombineLoadStoreAlloca.cpp index 93d183837d6f4..61f1c17592e96 100644 --- a/llvm/lib/Transforms/InstCombine/InstCombineLoadStoreAlloca.cpp +++ b/llvm/lib/Transforms/InstCombine/InstCombineLoadStoreAlloca.cpp @@ -112,11 +112,6 @@ isOnlyCopiedFromConstantMemory(AAResults *AA, AllocaInst *V, if ((Call->onlyReadsMemory() && (Call->use_empty() || NoCapture)) || (Call->onlyReadsMemory(DataOpNo) && NoCapture)) continue; - - // If this is being passed as a byval argument, the caller is making a - // copy, so it is only a read of the alloca. - if (IsArgOperand && Call->isByValArgument(DataOpNo)) - continue; } // Lifetime intrinsics can be handled by the caller. @@ -1065,6 +1060,10 @@ Instruction *InstCombinerImpl::visitLoadInst(LoadInst &LI) { V1->setAtomic(LI.getOrdering(), LI.getSyncScopeID()); V2->setAlignment(Alignment); V2->setAtomic(LI.getOrdering(), LI.getSyncScopeID()); + // It is safe to copy any metadata that does not trigger UB. Copy any + // poison-generating metadata. 
+ V1->copyMetadata(LI, Metadata::PoisonGeneratingIDs); + V2->copyMetadata(LI, Metadata::PoisonGeneratingIDs); return SelectInst::Create(SI->getCondition(), V1, V2); } diff --git a/llvm/lib/Transforms/InstCombine/InstCombineMulDivRem.cpp b/llvm/lib/Transforms/InstCombine/InstCombineMulDivRem.cpp index d0b2ded127ff7..df5f9833a2ff9 100644 --- a/llvm/lib/Transforms/InstCombine/InstCombineMulDivRem.cpp +++ b/llvm/lib/Transforms/InstCombine/InstCombineMulDivRem.cpp @@ -13,6 +13,7 @@ #include "InstCombineInternal.h" #include "llvm/ADT/APInt.h" +#include "llvm/ADT/SmallPtrSet.h" #include "llvm/ADT/SmallVector.h" #include "llvm/Analysis/InstructionSimplify.h" #include "llvm/Analysis/ValueTracking.h" @@ -657,6 +658,94 @@ Instruction *InstCombinerImpl::foldPowiReassoc(BinaryOperator &I) { return nullptr; } +// If we have the following pattern, +// X = 1.0/sqrt(a) +// R1 = X * X +// R2 = a/sqrt(a) +// then this method collects all the instructions that match R1 and R2. +static bool getFSqrtDivOptPattern(Instruction *Div, + SmallPtrSetImpl &R1, + SmallPtrSetImpl &R2) { + Value *A; + if (match(Div, m_FDiv(m_FPOne(), m_Sqrt(m_Value(A)))) || + match(Div, m_FDiv(m_SpecificFP(-1.0), m_Sqrt(m_Value(A))))) { + for (User *U : Div->users()) { + Instruction *I = cast(U); + if (match(I, m_FMul(m_Specific(Div), m_Specific(Div)))) + R1.insert(I); + } + + CallInst *CI = cast(Div->getOperand(1)); + for (User *U : CI->users()) { + Instruction *I = cast(U); + if (match(I, m_FDiv(m_Specific(A), m_Sqrt(m_Specific(A))))) + R2.insert(I); + } + } + return !R1.empty() && !R2.empty(); +} + +// Check legality for transforming +// x = 1.0/sqrt(a) +// r1 = x * x; +// r2 = a/sqrt(a); +// +// TO +// +// r1 = 1/a +// r2 = sqrt(a) +// x = r1 * r2 +// This transform works only when 'a' is known positive. +static bool isFSqrtDivToFMulLegal(Instruction *X, + SmallPtrSetImpl &R1, + SmallPtrSetImpl &R2) { + // Check if the required pattern for the transformation exists. 
+ if (!getFSqrtDivOptPattern(X, R1, R2)) + return false; + + BasicBlock *BBx = X->getParent(); + BasicBlock *BBr1 = (*R1.begin())->getParent(); + BasicBlock *BBr2 = (*R2.begin())->getParent(); + + CallInst *FSqrt = cast(X->getOperand(1)); + if (!FSqrt->hasAllowReassoc() || !FSqrt->hasNoNaNs() || + !FSqrt->hasNoSignedZeros() || !FSqrt->hasNoInfs()) + return false; + + // We change x = 1/sqrt(a) to x = sqrt(a) * 1/a. This change isn't allowed + // by recip fp as it is strictly meant to transform ops of type a/b to + // a * 1/b. So, this can be considered an algebraic rewrite and reassoc flag + // has been used (rather abused) in the past for algebraic rewrites. + if (!X->hasAllowReassoc() || !X->hasAllowReciprocal() || !X->hasNoInfs()) + return false; + + // Check the constraints on X, R1 and R2 combined. + // fdiv instruction and one of the multiplications must reside in the same + // block. If not, the optimized code may execute more ops than before and + // this may hamper the performance. + if (BBx != BBr1 && BBx != BBr2) + return false; + + // Check the constraints on instructions in R1. + if (any_of(R1, [BBr1](Instruction *I) { + // When you have multiple instructions residing in R1 and R2 + // respectively, it's difficult to generate combinations of (R1,R2) and + // then check if we have the required pattern. So, for now, just be + // conservative. + return (I->getParent() != BBr1 || !I->hasAllowReassoc()); + })) + return false; + + // Check the constraints on instructions in R2. + return all_of(R2, [BBr2](Instruction *I) { + // When you have multiple instructions residing in R1 and R2 + // respectively, it's difficult to generate combinations of (R1,R2) and + // then check if we have the required pattern. So, for now, just be + // conservative. 
+ return (I->getParent() == BBr2 && I->hasAllowReassoc()); + }); +} + Instruction *InstCombinerImpl::foldFMulReassoc(BinaryOperator &I) { Value *Op0 = I.getOperand(0); Value *Op1 = I.getOperand(1); @@ -1913,6 +2002,74 @@ static Instruction *foldFDivSqrtDivisor(BinaryOperator &I, return BinaryOperator::CreateFMulFMF(Op0, NewSqrt, &I); } +// Change +// X = 1/sqrt(a) +// R1 = X * X +// R2 = a * X +// +// TO +// +// FDiv = 1/a +// FSqrt = sqrt(a) +// FMul = FDiv * FSqrt +// Replace Uses Of R1 With FDiv +// Replace Uses Of R2 With FSqrt +// Replace Uses Of X With FMul +static Instruction * +convertFSqrtDivIntoFMul(CallInst *CI, Instruction *X, + const SmallPtrSetImpl &R1, + const SmallPtrSetImpl &R2, + InstCombiner::BuilderTy &B, InstCombinerImpl *IC) { + + B.SetInsertPoint(X); + + // Have an instruction that is representative of all of the instructions in R1 + // and get the most common fpmath metadata and fast-math flags on it. + Value *SqrtOp = CI->getArgOperand(0); + auto *FDiv = cast( + B.CreateFDiv(ConstantFP::get(X->getType(), 1.0), SqrtOp)); + auto *R1FPMathMDNode = (*R1.begin())->getMetadata(LLVMContext::MD_fpmath); + FastMathFlags R1FMF = (*R1.begin())->getFastMathFlags(); // Common FMF + for (Instruction *I : R1) { + R1FPMathMDNode = MDNode::getMostGenericFPMath( + R1FPMathMDNode, I->getMetadata(LLVMContext::MD_fpmath)); + R1FMF &= I->getFastMathFlags(); + IC->replaceInstUsesWith(*I, FDiv); + IC->eraseInstFromFunction(*I); + } + FDiv->setMetadata(LLVMContext::MD_fpmath, R1FPMathMDNode); + FDiv->copyFastMathFlags(R1FMF); + + // Have a single sqrt call instruction that is representative of all of + // the instructions in R2 and get the most common fpmath metadata and + // fast-math flags on it. 
+ auto *FSqrt = cast(CI->clone()); + FSqrt->insertBefore(CI); + auto *R2FPMathMDNode = (*R2.begin())->getMetadata(LLVMContext::MD_fpmath); + FastMathFlags R2FMF = (*R2.begin())->getFastMathFlags(); // Common FMF + for (Instruction *I : R2) { + R2FPMathMDNode = MDNode::getMostGenericFPMath( + R2FPMathMDNode, I->getMetadata(LLVMContext::MD_fpmath)); + R2FMF &= I->getFastMathFlags(); + IC->replaceInstUsesWith(*I, FSqrt); + IC->eraseInstFromFunction(*I); + } + FSqrt->setMetadata(LLVMContext::MD_fpmath, R2FPMathMDNode); + FSqrt->copyFastMathFlags(R2FMF); + + Instruction *FMul; + // If X = -1/sqrt(a) initially, then FMul = -(FDiv * FSqrt) + if (match(X, m_FDiv(m_SpecificFP(-1.0), m_Specific(CI)))) { + Value *Mul = B.CreateFMul(FDiv, FSqrt); + FMul = cast(B.CreateFNeg(Mul)); + } else + FMul = cast(B.CreateFMul(FDiv, FSqrt)); + FMul->copyMetadata(*X); + FMul->copyFastMathFlags(FastMathFlags::intersectRewrite(R1FMF, R2FMF) | + FastMathFlags::unionValue(R1FMF, R2FMF)); + return IC->replaceInstUsesWith(*X, FMul); +} + Instruction *InstCombinerImpl::visitFDiv(BinaryOperator &I) { Module *M = I.getModule(); @@ -1937,6 +2094,24 @@ Instruction *InstCombinerImpl::visitFDiv(BinaryOperator &I) { return R; Value *Op0 = I.getOperand(0), *Op1 = I.getOperand(1); + + // Convert + // x = 1.0/sqrt(a) + // r1 = x * x; + // r2 = a/sqrt(a); + // + // TO + // + // r1 = 1/a + // r2 = sqrt(a) + // x = r1 * r2 + SmallPtrSet R1, R2; + if (isFSqrtDivToFMulLegal(&I, R1, R2)) { + CallInst *CI = cast(I.getOperand(1)); + if (Instruction *D = convertFSqrtDivIntoFMul(CI, &I, R1, R2, Builder, this)) + return D; + } + if (isa(Op0)) if (SelectInst *SI = dyn_cast(Op1)) if (Instruction *R = FoldOpIntoSelect(I, SI)) diff --git a/llvm/lib/Transforms/InstCombine/InstructionCombining.cpp b/llvm/lib/Transforms/InstCombine/InstructionCombining.cpp index 2fb60ef11499c..fb21576722461 100644 --- a/llvm/lib/Transforms/InstCombine/InstructionCombining.cpp +++ b/llvm/lib/Transforms/InstCombine/InstructionCombining.cpp @@ 
-281,28 +281,33 @@ bool InstCombinerImpl::shouldChangeType(Type *From, Type *To) const { // Return true, if No Signed Wrap should be maintained for I. // The No Signed Wrap flag can be kept if the operation "B (I.getOpcode) C", // where both B and C should be ConstantInts, results in a constant that does -// not overflow. This function only handles the Add and Sub opcodes. For +// not overflow. This function only handles the Add/Sub/Mul opcodes. For // all other opcodes, the function conservatively returns false. static bool maintainNoSignedWrap(BinaryOperator &I, Value *B, Value *C) { auto *OBO = dyn_cast(&I); if (!OBO || !OBO->hasNoSignedWrap()) return false; - // We reason about Add and Sub Only. - Instruction::BinaryOps Opcode = I.getOpcode(); - if (Opcode != Instruction::Add && Opcode != Instruction::Sub) - return false; - const APInt *BVal, *CVal; if (!match(B, m_APInt(BVal)) || !match(C, m_APInt(CVal))) return false; + // We reason about Add/Sub/Mul Only. bool Overflow = false; - if (Opcode == Instruction::Add) + switch (I.getOpcode()) { + case Instruction::Add: (void)BVal->sadd_ov(*CVal, Overflow); - else + break; + case Instruction::Sub: (void)BVal->ssub_ov(*CVal, Overflow); - + break; + case Instruction::Mul: + (void)BVal->smul_ov(*CVal, Overflow); + break; + default: + // Conservatively return false for other opcodes. + return false; + } return !Overflow; } diff --git a/llvm/lib/Transforms/Instrumentation/MemorySanitizer.cpp b/llvm/lib/Transforms/Instrumentation/MemorySanitizer.cpp index 0169320deae46..6daee7a3b6e81 100644 --- a/llvm/lib/Transforms/Instrumentation/MemorySanitizer.cpp +++ b/llvm/lib/Transforms/Instrumentation/MemorySanitizer.cpp @@ -318,6 +318,13 @@ static cl::opt ClDumpStrictInstructions( cl::desc("print out instructions with default strict semantics"), cl::Hidden, cl::init(false)); +static cl::opt ClDumpStrictIntrinsics( + "msan-dump-strict-intrinsics", + cl::desc("Prints 'unknown' intrinsics that were handled heuristically. 
" + "Use -msan-dump-strict-instructions to print intrinsics that " + "could not be handled exactly nor heuristically."), + cl::Hidden, cl::init(false)); + static cl::opt ClInstrumentationWithCallThreshold( "msan-instrumentation-with-call-threshold", cl::desc( @@ -3014,7 +3021,7 @@ struct MemorySanitizerVisitor : public InstVisitor { /// /// We special-case intrinsics where this approach fails. See llvm.bswap /// handling as an example of that. - bool handleUnknownIntrinsic(IntrinsicInst &I) { + bool handleUnknownIntrinsicUnlogged(IntrinsicInst &I) { unsigned NumArgOperands = I.arg_size(); if (NumArgOperands == 0) return false; @@ -3040,6 +3047,18 @@ struct MemorySanitizerVisitor : public InstVisitor { return false; } + bool handleUnknownIntrinsic(IntrinsicInst &I) { + if (handleUnknownIntrinsicUnlogged(I)) { + if (ClDumpStrictIntrinsics) + dumpInst(I); + + LLVM_DEBUG(dbgs() << "UNKNOWN INTRINSIC HANDLED HEURISTICALLY: " << I + << "\n"); + return true; + } else + return false; + } + void handleInvariantGroup(IntrinsicInst &I) { setShadow(&I, getShadow(&I, 0)); setOrigin(&I, getOrigin(&I, 0)); diff --git a/llvm/lib/Transforms/Vectorize/CMakeLists.txt b/llvm/lib/Transforms/Vectorize/CMakeLists.txt index d769d5100afd2..e5fabd318b82c 100644 --- a/llvm/lib/Transforms/Vectorize/CMakeLists.txt +++ b/llvm/lib/Transforms/Vectorize/CMakeLists.txt @@ -4,6 +4,7 @@ add_llvm_component_library(LLVMVectorize LoopVectorizationLegality.cpp LoopVectorize.cpp SandboxVectorizer/DependencyGraph.cpp + SandboxVectorizer/InstrMaps.cpp SandboxVectorizer/Interval.cpp SandboxVectorizer/Legality.cpp SandboxVectorizer/Passes/BottomUpVec.cpp @@ -12,6 +13,7 @@ add_llvm_component_library(LLVMVectorize SandboxVectorizer/SandboxVectorizerPassBuilder.cpp SandboxVectorizer/Scheduler.cpp SandboxVectorizer/SeedCollector.cpp + SandboxVectorizer/VecUtils.cpp SLPVectorizer.cpp Vectorize.cpp VectorCombine.cpp diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp 
b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp index 99f6a8860f0f4..d79d9e8445b3d 100644 --- a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp +++ b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp @@ -987,7 +987,7 @@ class LoopVectorizationCostModel { InterleavedAccessInfo &IAI) : ScalarEpilogueStatus(SEL), TheLoop(L), PSE(PSE), LI(LI), Legal(Legal), TTI(TTI), TLI(TLI), DB(DB), AC(AC), ORE(ORE), TheFunction(F), - Hints(Hints), InterleaveInfo(IAI) {} + Hints(Hints), InterleaveInfo(IAI), CostKind(TTI::TCK_RecipThroughput) {} /// \return An upper bound for the vectorization factors (both fixed and /// scalable). If the factors are 0, vectorization and interleaving should be @@ -1447,9 +1447,11 @@ class LoopVectorizationCostModel { // Override forced styles if needed. // FIXME: use actual opcode/data type for analysis here. // FIXME: Investigate opportunity for fixed vector factor. - bool EVLIsLegal = UserIC <= 1 && - TTI.hasActiveVectorLength(0, nullptr, Align()) && - !EnableVPlanNativePath; + bool EVLIsLegal = + UserIC <= 1 && TTI.hasActiveVectorLength(0, nullptr, Align()) && + !EnableVPlanNativePath && + // FIXME: remove this once fixed-ordered recurrence is supported. + Legal->getFixedOrderRecurrences().empty(); if (!EVLIsLegal) { // If for some reason EVL mode is unsupported, fallback to // DataWithoutLaneMask to try to vectorize the loop with folded tail @@ -1553,9 +1555,9 @@ class LoopVectorizationCostModel { /// Return the cost of instructions in an inloop reduction pattern, if I is /// part of that pattern. - std::optional - getReductionPatternCost(Instruction *I, ElementCount VF, Type *VectorTy, - TTI::TargetCostKind CostKind) const; + std::optional getReductionPatternCost(Instruction *I, + ElementCount VF, + Type *VectorTy) const; /// Returns true if \p Op should be considered invariant and if it is /// trivially hoistable. @@ -1614,8 +1616,8 @@ class LoopVectorizationCostModel { /// Estimate the overhead of scalarizing an instruction. 
This is a /// convenience wrapper for the type-based getScalarizationOverhead API. - InstructionCost getScalarizationOverhead(Instruction *I, ElementCount VF, - TTI::TargetCostKind CostKind) const; + InstructionCost getScalarizationOverhead(Instruction *I, + ElementCount VF) const; /// Returns true if an artificially high cost for emulated masked memrefs /// should be used. @@ -1796,6 +1798,9 @@ class LoopVectorizationCostModel { /// All element types found in the loop. SmallPtrSet ElementTypesInLoop; + + /// The kind of cost that we are calculating + TTI::TargetCostKind CostKind; }; } // end namespace llvm @@ -1836,13 +1841,17 @@ class GeneratedRTChecks { PredicatedScalarEvolution &PSE; + /// The kind of cost that we are calculating + TTI::TargetCostKind CostKind; + public: GeneratedRTChecks(PredicatedScalarEvolution &PSE, DominatorTree *DT, LoopInfo *LI, TargetTransformInfo *TTI, - const DataLayout &DL, bool AddBranchWeights) + const DataLayout &DL, bool AddBranchWeights, + TTI::TargetCostKind CostKind) : DT(DT), LI(LI), TTI(TTI), SCEVExp(*PSE.getSE(), DL, "scev.check"), MemCheckExp(*PSE.getSE(), DL, "scev.check"), - AddBranchWeights(AddBranchWeights), PSE(PSE) {} + AddBranchWeights(AddBranchWeights), PSE(PSE), CostKind(CostKind) {} /// Generate runtime checks in SCEVCheckBlock and MemCheckBlock, so we can /// accurately estimate the cost of the runtime checks. 
The blocks are @@ -1954,8 +1963,7 @@ class GeneratedRTChecks { for (Instruction &I : *SCEVCheckBlock) { if (SCEVCheckBlock->getTerminator() == &I) continue; - InstructionCost C = - TTI->getInstructionCost(&I, TTI::TCK_RecipThroughput); + InstructionCost C = TTI->getInstructionCost(&I, CostKind); LLVM_DEBUG(dbgs() << " " << C << " for " << I << "\n"); RTCheckCost += C; } @@ -1964,8 +1972,7 @@ class GeneratedRTChecks { for (Instruction &I : *MemCheckBlock) { if (MemCheckBlock->getTerminator() == &I) continue; - InstructionCost C = - TTI->getInstructionCost(&I, TTI::TCK_RecipThroughput); + InstructionCost C = TTI->getInstructionCost(&I, CostKind); LLVM_DEBUG(dbgs() << " " << C << " for " << I << "\n"); MemCheckCost += C; } @@ -2926,10 +2933,9 @@ LoopVectorizationCostModel::getVectorCallCost(CallInst *CI, if (!VF.isScalar()) return CallWideningDecisions.at(std::make_pair(CI, VF)).Cost; - TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput; Type *RetTy = CI->getType(); if (RecurrenceDescriptor::isFMulAddIntrinsic(CI)) - if (auto RedCost = getReductionPatternCost(CI, VF, RetTy, CostKind)) + if (auto RedCost = getReductionPatternCost(CI, VF, RetTy)) return *RedCost; SmallVector Tys; @@ -2972,8 +2978,7 @@ LoopVectorizationCostModel::getVectorIntrinsicCost(CallInst *CI, IntrinsicCostAttributes CostAttrs(ID, RetTy, Arguments, ParamTys, FMF, dyn_cast(CI)); - return TTI.getIntrinsicInstrCost(CostAttrs, - TargetTransformInfo::TCK_RecipThroughput); + return TTI.getIntrinsicInstrCost(CostAttrs, CostKind); } void InnerLoopVectorizer::fixVectorizedLoop(VPTransformState &State) { @@ -3430,8 +3435,6 @@ LoopVectorizationCostModel::getDivRemSpeculationCost(Instruction *I, I->getOpcode() == Instruction::URem); assert(!isSafeToSpeculativelyExecute(I)); - const TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput; - // Scalarization isn't legal for scalable vector types InstructionCost ScalarizationCost = InstructionCost::getInvalid(); if (!VF.isScalable()) { @@ -3453,7 +3456,7 @@ 
LoopVectorizationCostModel::getDivRemSpeculationCost(Instruction *I, // The cost of insertelement and extractelement instructions needed for // scalarization. - ScalarizationCost += getScalarizationOverhead(I, VF, CostKind); + ScalarizationCost += getScalarizationOverhead(I, VF); // Scale the cost by the probability of executing the predicated blocks. // This assumes the predicated block for each vector lane is equally @@ -3503,10 +3506,10 @@ bool LoopVectorizationCostModel::interleavedAccessCanBeWidened( if (hasIrregularType(ScalarTy, DL)) return false; - // We currently only know how to emit interleave/deinterleave with - // Factor=2 for scalable vectors. This is purely an implementation - // limit. - if (VF.isScalable() && InterleaveFactor != 2) + // For scalable vectors, the only interleave factor currently supported + // must be power of 2 since we require the (de)interleave2 intrinsics + // instead of shufflevectors. + if (VF.isScalable() && !isPowerOf2_32(InterleaveFactor)) return false; // If the group involves a non-integral pointer, we may not be able to @@ -4443,7 +4446,7 @@ void LoopVectorizationPlanner::emitInvalidCostRemarks( for (const auto &Plan : VPlans) { for (ElementCount VF : Plan->vectorFactors()) { VPCostContext CostCtx(CM.TTI, *CM.TLI, Legal->getWidestInductionType(), - CM); + CM, CM.CostKind); precomputeCosts(*Plan, VF, CostCtx); auto Iter = vp_depth_first_deep(Plan->getVectorLoopRegion()->getEntry()); for (VPBasicBlock *VPBB : VPBlockUtils::blocksOnly(Iter)) { @@ -5593,7 +5596,6 @@ InstructionCost LoopVectorizationCostModel::computePredInstDiscount( // Compute the scalarization overhead of needed insertelement instructions // and phi nodes. 
- TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput; if (isScalarWithPredication(I, VF) && !I->getType()->isVoidTy()) { ScalarCost += TTI.getScalarizationOverhead( cast(toVectorTy(I->getType(), VF)), @@ -5740,7 +5742,6 @@ LoopVectorizationCostModel::getMemInstScalarizationCost(Instruction *I, // Don't pass *I here, since it is scalar but will actually be part of a // vectorized loop where the user of it is a vectorized instruction. - TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput; const Align Alignment = getLoadStoreAlignment(I); Cost += VF.getKnownMinValue() * TTI.getMemoryOpCost(I->getOpcode(), ValTy->getScalarType(), @@ -5748,7 +5749,7 @@ LoopVectorizationCostModel::getMemInstScalarizationCost(Instruction *I, // Get the overhead of the extractelement and insertelement instructions // we might create due to scalarization. - Cost += getScalarizationOverhead(I, VF, CostKind); + Cost += getScalarizationOverhead(I, VF); // If we have a predicated load/store, it will need extra i1 extracts and // conditional branches, but may not be executed for each vector lane. 
Scale @@ -5781,7 +5782,6 @@ LoopVectorizationCostModel::getConsecutiveMemOpCost(Instruction *I, Value *Ptr = getLoadStorePointerOperand(I); unsigned AS = getLoadStoreAddressSpace(I); int ConsecutiveStride = Legal->isConsecutivePtr(ValTy, Ptr); - enum TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput; assert((ConsecutiveStride == 1 || ConsecutiveStride == -1) && "Stride should be 1 or -1 for consecutive memory access"); @@ -5812,12 +5812,12 @@ LoopVectorizationCostModel::getUniformMemOpCost(Instruction *I, auto *VectorTy = cast(toVectorTy(ValTy, VF)); const Align Alignment = getLoadStoreAlignment(I); unsigned AS = getLoadStoreAddressSpace(I); - enum TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput; if (isa(I)) { return TTI.getAddressComputationCost(ValTy) + TTI.getMemoryOpCost(Instruction::Load, ValTy, Alignment, AS, CostKind) + - TTI.getShuffleCost(TargetTransformInfo::SK_Broadcast, VectorTy); + TTI.getShuffleCost(TargetTransformInfo::SK_Broadcast, VectorTy, {}, + CostKind); } StoreInst *SI = cast(I); @@ -5840,9 +5840,9 @@ LoopVectorizationCostModel::getGatherScatterCost(Instruction *I, const Value *Ptr = getLoadStorePointerOperand(I); return TTI.getAddressComputationCost(VectorTy) + - TTI.getGatherScatterOpCost( - I->getOpcode(), VectorTy, Ptr, Legal->isMaskRequired(I), Alignment, - TargetTransformInfo::TCK_RecipThroughput, I); + TTI.getGatherScatterOpCost(I->getOpcode(), VectorTy, Ptr, + Legal->isMaskRequired(I), Alignment, + CostKind, I); } InstructionCost @@ -5855,7 +5855,6 @@ LoopVectorizationCostModel::getInterleaveGroupCost(Instruction *I, Type *ValTy = getLoadStoreType(InsertPos); auto *VectorTy = cast(toVectorTy(ValTy, VF)); unsigned AS = getLoadStoreAddressSpace(InsertPos); - enum TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput; unsigned InterleaveFactor = Group->getFactor(); auto *WideVecTy = VectorType::get(ValTy, VF * InterleaveFactor); @@ -5887,9 +5886,9 @@ LoopVectorizationCostModel::getInterleaveGroupCost(Instruction *I, } 
std::optional -LoopVectorizationCostModel::getReductionPatternCost( - Instruction *I, ElementCount VF, Type *Ty, - TTI::TargetCostKind CostKind) const { +LoopVectorizationCostModel::getReductionPatternCost(Instruction *I, + ElementCount VF, + Type *Ty) const { using namespace llvm::PatternMatch; // Early exit for no inloop reductions if (InLoopReductions.empty() || VF.isScalar() || !isa(Ty)) @@ -6080,14 +6079,15 @@ LoopVectorizationCostModel::getMemoryInstructionCost(Instruction *I, TTI::OperandValueInfo OpInfo = TTI::getOperandInfo(I->getOperand(0)); return TTI.getAddressComputationCost(ValTy) + - TTI.getMemoryOpCost(I->getOpcode(), ValTy, Alignment, AS, - TTI::TCK_RecipThroughput, OpInfo, I); + TTI.getMemoryOpCost(I->getOpcode(), ValTy, Alignment, AS, CostKind, + OpInfo, I); } return getWideningCost(I, VF); } -InstructionCost LoopVectorizationCostModel::getScalarizationOverhead( - Instruction *I, ElementCount VF, TTI::TargetCostKind CostKind) const { +InstructionCost +LoopVectorizationCostModel::getScalarizationOverhead(Instruction *I, + ElementCount VF) const { // There is no mechanism yet to create a scalable scalarization loop, // so this is currently Invalid. @@ -6330,7 +6330,6 @@ void LoopVectorizationCostModel::setVectorizedCallDecision(ElementCount VF) { InstructionCost ScalarCost = InstructionCost::getInvalid(); InstructionCost VectorCost = InstructionCost::getInvalid(); InstructionCost IntrinsicCost = InstructionCost::getInvalid(); - TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput; Function *ScalarFunc = CI->getCalledFunction(); Type *ScalarRetTy = CI->getType(); SmallVector Tys, ScalarTys; @@ -6346,8 +6345,7 @@ void LoopVectorizationCostModel::setVectorizedCallDecision(ElementCount VF) { // Compute costs of unpacking argument values for the scalar calls and // packing the return values to a vector. 
- InstructionCost ScalarizationCost = - getScalarizationOverhead(CI, VF, CostKind); + InstructionCost ScalarizationCost = getScalarizationOverhead(CI, VF); ScalarCost = ScalarCallCost * VF.getKnownMinValue() + ScalarizationCost; // Honor ForcedScalars and UniformAfterVectorization decisions. @@ -6371,7 +6369,7 @@ void LoopVectorizationCostModel::setVectorizedCallDecision(ElementCount VF) { // An in-loop reduction using an fmuladd intrinsic is a special case; // we don't want the normal cost for that intrinsic. if (RecurrenceDescriptor::isFMulAddIntrinsic(CI)) - if (auto RedCost = getReductionPatternCost(CI, VF, RetTy, CostKind)) { + if (auto RedCost = getReductionPatternCost(CI, VF, RetTy)) { setCallWideningDecision(CI, VF, CM_IntrinsicCall, nullptr, getVectorIntrinsicIDForCall(CI, TLI), std::nullopt, *RedCost); @@ -6456,7 +6454,8 @@ void LoopVectorizationCostModel::setVectorizedCallDecision(ElementCount VF) { TargetTransformInfo::SK_Broadcast, VectorType::get(IntegerType::getInt1Ty( VecFunc->getFunctionType()->getContext()), - VF)); + VF), + {}, CostKind); if (TLI && VecFunc && !CI->isNoBuiltin()) VectorCost = @@ -6524,7 +6523,6 @@ LoopVectorizationCostModel::getInstructionCost(Instruction *I, if (canTruncateToMinimalBitwidth(I, VF)) RetTy = IntegerType::get(RetTy->getContext(), MinBWs[I]); auto *SE = PSE.getSE(); - TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput; auto HasSingleCopyAfterVectorization = [this](Instruction *I, ElementCount VF) -> bool { @@ -6700,7 +6698,8 @@ LoopVectorizationCostModel::getInstructionCost(Instruction *I, InstructionCost MulCost = TTI::TCC_Free; ConstantInt *RHS = dyn_cast(I->getOperand(1)); if (!RHS || RHS->getZExtValue() != 1) - MulCost = TTI.getArithmeticInstrCost(Instruction::Mul, VectorTy); + MulCost = + TTI.getArithmeticInstrCost(Instruction::Mul, VectorTy, CostKind); // Find the cost of the histogram operation itself. 
Type *PtrTy = VectorType::get(HGram->Load->getPointerOperandType(), VF); @@ -6711,9 +6710,8 @@ LoopVectorizationCostModel::getInstructionCost(Instruction *I, {PtrTy, ScalarTy, MaskTy}); // Add the costs together with the add/sub operation. - return TTI.getIntrinsicInstrCost( - ICA, TargetTransformInfo::TCK_RecipThroughput) + - MulCost + TTI.getArithmeticInstrCost(I->getOpcode(), VectorTy); + return TTI.getIntrinsicInstrCost(ICA, CostKind) + MulCost + + TTI.getArithmeticInstrCost(I->getOpcode(), VectorTy, CostKind); } [[fallthrough]]; } @@ -6738,7 +6736,7 @@ LoopVectorizationCostModel::getInstructionCost(Instruction *I, return 0; // Detect reduction patterns - if (auto RedCost = getReductionPatternCost(I, VF, VectorTy, CostKind)) + if (auto RedCost = getReductionPatternCost(I, VF, VectorTy)) return *RedCost; // Certain instructions can be cheaper to vectorize if they have a constant @@ -6903,7 +6901,7 @@ LoopVectorizationCostModel::getInstructionCost(Instruction *I, } // Detect reduction patterns - if (auto RedCost = getReductionPatternCost(I, VF, VectorTy, CostKind)) + if (auto RedCost = getReductionPatternCost(I, VF, VectorTy)) return *RedCost; Type *SrcScalarTy = I->getOperand(0)->getType(); @@ -6928,7 +6926,7 @@ LoopVectorizationCostModel::getInstructionCost(Instruction *I, case Instruction::Call: return getVectorCallCost(cast(I), VF); case Instruction::ExtractValue: - return TTI.getInstructionCost(I, TTI::TCK_RecipThroughput); + return TTI.getInstructionCost(I, CostKind); case Instruction::Alloca: // We cannot easily widen alloca to a scalable alloca, as // the result would need to be a vector of pointers. @@ -7440,8 +7438,8 @@ LoopVectorizationPlanner::precomputeCosts(VPlan &Plan, ElementCount VF, // Pre-compute the cost for I, if it has a reduction pattern cost. 
for (Instruction *I : ChainOpsAndOperands) { - auto ReductionCost = CM.getReductionPatternCost( - I, VF, toVectorTy(I->getType(), VF), TTI::TCK_RecipThroughput); + auto ReductionCost = + CM.getReductionPatternCost(I, VF, toVectorTy(I->getType(), VF)); if (!ReductionCost) continue; @@ -7499,7 +7497,8 @@ LoopVectorizationPlanner::precomputeCosts(VPlan &Plan, ElementCount VF, InstructionCost LoopVectorizationPlanner::cost(VPlan &Plan, ElementCount VF) const { - VPCostContext CostCtx(CM.TTI, *CM.TLI, Legal->getWidestInductionType(), CM); + VPCostContext CostCtx(CM.TTI, *CM.TLI, Legal->getWidestInductionType(), CM, + CM.CostKind); InstructionCost Cost = precomputeCosts(Plan, VF, CostCtx); // Now compute and add the VPlan-based cost. @@ -7579,6 +7578,16 @@ VectorizationFactor LoopVectorizationPlanner::computeBestVF() { if (VPlans.size() == 1 && size(FirstPlan.vectorFactors()) == 1) return {*FirstPlan.vectorFactors().begin(), 0, 0}; + LLVM_DEBUG(dbgs() << "LV: Computing best VF using cost kind: " + << (CM.CostKind == TTI::TCK_RecipThroughput + ? "Reciprocal Throughput\n" + : CM.CostKind == TTI::TCK_Latency + ? "Instruction Latency\n" + : CM.CostKind == TTI::TCK_CodeSize ? "Code Size\n" + : CM.CostKind == TTI::TCK_SizeAndLatency + ? "Code Size and Latency\n" + : "Unknown\n")); + ElementCount ScalarVF = ElementCount::getFixed(1); assert(hasPlanWithVF(ScalarVF) && "More than a single plan/VF w/o any plan having scalar VF"); @@ -7632,7 +7641,8 @@ VectorizationFactor LoopVectorizationPlanner::computeBestVF() { // simplifications not accounted for in the legacy cost model. If that's the // case, don't trigger the assertion, as the extra simplifications may cause a // different VF to be picked by the VPlan-based cost model. 
- VPCostContext CostCtx(CM.TTI, *CM.TLI, Legal->getWidestInductionType(), CM); + VPCostContext CostCtx(CM.TTI, *CM.TLI, Legal->getWidestInductionType(), CM, + CM.CostKind); precomputeCosts(BestPlan, BestFactor.Width, CostCtx); assert((BestFactor.Width == LegacyVF.Width || planContainsAdditionalSimplifications(getPlanFor(BestFactor.Width), @@ -9433,9 +9443,9 @@ LoopVectorizationPlanner::tryToBuildVPlanWithVPRecipes(VFRange &Range) { CM.getWideningDecision(IG->getInsertPos(), VF) == LoopVectorizationCostModel::CM_Interleave); // For scalable vectors, the only interleave factor currently supported - // is 2 since we require the (de)interleave2 intrinsics instead of - // shufflevectors. - assert((!Result || !VF.isScalable() || IG->getFactor() == 2) && + // must be power of 2 since we require the (de)interleave2 intrinsics + // instead of shufflevectors. + assert((!Result || !VF.isScalable() || isPowerOf2_32(IG->getFactor())) && "Unsupported interleave factor for scalable vectors"); return Result; }; @@ -10153,7 +10163,7 @@ static bool processLoopInVPlanNativePath( bool AddBranchWeights = hasBranchWeightMD(*L->getLoopLatch()->getTerminator()); GeneratedRTChecks Checks(PSE, DT, LI, TTI, F->getDataLayout(), - AddBranchWeights); + AddBranchWeights, CM.CostKind); InnerLoopVectorizer LB(L, PSE, LI, DT, TLI, TTI, AC, ORE, VF.Width, VF.Width, 1, LVL, &CM, BFI, PSI, Checks, BestPlan); LLVM_DEBUG(dbgs() << "Vectorizing outer loop in \"" @@ -10690,7 +10700,7 @@ bool LoopVectorizePass::processLoop(Loop *L) { bool AddBranchWeights = hasBranchWeightMD(*L->getLoopLatch()->getTerminator()); GeneratedRTChecks Checks(PSE, DT, LI, TTI, F->getDataLayout(), - AddBranchWeights); + AddBranchWeights, CM.CostKind); if (LVP.hasPlanWithVF(VF.Width)) { // Select the interleave count. 
IC = CM.selectInterleaveCount(VF.Width, VF.Cost); diff --git a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp index b0b8f8249d657..34d9abb4dc7a3 100644 --- a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp +++ b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp @@ -2414,15 +2414,17 @@ class BoUpSLP { } /// Go through the instructions in VL and append their operands. - void appendOperandsOfVL(ArrayRef VL, Instruction *VL0) { + void appendOperandsOfVL(ArrayRef VL, const InstructionsState &S) { assert(!VL.empty() && "Bad VL"); assert((empty() || VL.size() == getNumLanes()) && "Expected same number of lanes"); + assert(S.valid() && "InstructionsState is invalid."); // IntrinsicInst::isCommutative returns true if swapping the first "two" // arguments to the intrinsic produces the same result. constexpr unsigned IntrinsicNumOperands = 2; - unsigned NumOperands = VL0->getNumOperands(); - ArgSize = isa(VL0) ? IntrinsicNumOperands : NumOperands; + Instruction *MainOp = S.getMainOp(); + unsigned NumOperands = MainOp->getNumOperands(); + ArgSize = isa(MainOp) ? IntrinsicNumOperands : NumOperands; OpsVec.resize(NumOperands); unsigned NumLanes = VL.size(); for (unsigned OpIdx = 0; OpIdx != NumOperands; ++OpIdx) { @@ -2441,19 +2443,19 @@ class BoUpSLP { // operations or alternating sequences (e.g., +, -), we can safely // tell the inverse operations by checking commutativity. 
if (isa(VL[Lane])) { - if (auto *EI = dyn_cast(VL0)) { + if (auto *EI = dyn_cast(MainOp)) { if (OpIdx == 0) { OpsVec[OpIdx][Lane] = {EI->getVectorOperand(), true, false}; continue; } - } else if (auto *EV = dyn_cast(VL0)) { + } else if (auto *EV = dyn_cast(MainOp)) { if (OpIdx == 0) { OpsVec[OpIdx][Lane] = {EV->getAggregateOperand(), true, false}; continue; } } OpsVec[OpIdx][Lane] = { - PoisonValue::get(VL0->getOperand(OpIdx)->getType()), true, + PoisonValue::get(MainOp->getOperand(OpIdx)->getType()), true, false}; continue; } @@ -2566,11 +2568,12 @@ class BoUpSLP { public: /// Initialize with all the operands of the instruction vector \p RootVL. - VLOperands(ArrayRef RootVL, Instruction *VL0, const BoUpSLP &R) + VLOperands(ArrayRef RootVL, const InstructionsState &S, + const BoUpSLP &R) : TLI(*R.TLI), DL(*R.DL), SE(*R.SE), R(R), - L(R.LI->getLoopFor((VL0->getParent()))) { + L(R.LI->getLoopFor(S.getMainOp()->getParent())) { // Append all the operands of RootVL. - appendOperandsOfVL(RootVL, VL0); + appendOperandsOfVL(RootVL, S); } /// \Returns a value vector with the operands across all lanes for the @@ -3043,7 +3046,7 @@ class BoUpSLP { /// non-identity permutation that allows to reuse extract instructions. /// \param ResizeAllowed indicates whether it is allowed to handle subvector /// extract order. - bool canReuseExtract(ArrayRef VL, Value *OpValue, + bool canReuseExtract(ArrayRef VL, SmallVectorImpl &CurrentOrder, bool ResizeAllowed = false) const; @@ -3270,7 +3273,7 @@ class BoUpSLP { }; /// Checks if the current node is a gather node. - bool isGather() const {return State == NeedToGather; } + bool isGather() const { return State == NeedToGather; } /// A vector of scalars. ValueList Scalars; @@ -3334,9 +3337,9 @@ class BoUpSLP { /// reordering of operands during buildTree_rec() and vectorizeTree(). SmallVector Operands; - /// The main/alternate instruction. 
- Instruction *MainOp = nullptr; - Instruction *AltOp = nullptr; + /// MainOp and AltOp are recorded inside. S should be obtained from + /// newTreeEntry. + InstructionsState S = InstructionsState::invalid(); /// Interleaving factor for interleaved loads Vectorize nodes. unsigned InterleaveFactor = 0; @@ -3360,10 +3363,10 @@ class BoUpSLP { /// Set this bundle's operand from Scalars. void setOperand(const BoUpSLP &R, bool RequireReorder = false) { - VLOperands Ops(Scalars, MainOp, R); + VLOperands Ops(Scalars, S, R); if (RequireReorder) Ops.reorder(); - for (unsigned I : seq(MainOp->getNumOperands())) + for (unsigned I : seq(S.getMainOp()->getNumOperands())) setOperand(I, Ops.getVL(I)); } @@ -3396,13 +3399,9 @@ class BoUpSLP { } /// Some of the instructions in the list have alternate opcodes. - bool isAltShuffle() const { return MainOp != AltOp; } + bool isAltShuffle() const { return S.isAltShuffle(); } - bool isOpcodeOrAlt(Instruction *I) const { - unsigned CheckedOpcode = I->getOpcode(); - return (getOpcode() == CheckedOpcode || - getAltOpcode() == CheckedOpcode); - } + bool isOpcodeOrAlt(Instruction *I) const { return S.isOpcodeOrAlt(I); } /// Chooses the correct key for scheduling data. If \p Op has the same (or /// alternate) opcode as \p OpValue, the key is \p Op. Otherwise the key is @@ -3411,31 +3410,24 @@ class BoUpSLP { auto *I = dyn_cast(Op); if (I && isOpcodeOrAlt(I)) return Op; - return MainOp; + return S.getMainOp(); } void setOperations(const InstructionsState &S) { assert(S && "InstructionsState is invalid."); - MainOp = S.getMainOp(); - AltOp = S.getAltOp(); + this->S = S; } - Instruction *getMainOp() const { - return MainOp; - } + Instruction *getMainOp() const { return S.getMainOp(); } - Instruction *getAltOp() const { - return AltOp; - } + Instruction *getAltOp() const { return S.getAltOp(); } /// The main/alternate opcodes for the list of instructions. - unsigned getOpcode() const { - return MainOp ? 
MainOp->getOpcode() : 0; - } + unsigned getOpcode() const { return S.getOpcode(); } - unsigned getAltOpcode() const { - return AltOp ? AltOp->getOpcode() : 0; - } + unsigned getAltOpcode() const { return S.getAltOpcode(); } + + bool hasState() const { return S.valid(); } /// When ReuseReorderShuffleIndices is empty it just returns position of \p /// V within vector of Scalars. Otherwise, try to remap on its reuse index. @@ -3531,16 +3523,13 @@ class BoUpSLP { dbgs() << "CombinedVectorize\n"; break; } - dbgs() << "MainOp: "; - if (MainOp) - dbgs() << *MainOp << "\n"; - else - dbgs() << "NULL\n"; - dbgs() << "AltOp: "; - if (AltOp) - dbgs() << *AltOp << "\n"; - else - dbgs() << "NULL\n"; + if (S) { + dbgs() << "MainOp: " << *S.getMainOp() << "\n"; + dbgs() << "AltOp: " << *S.getAltOp() << "\n"; + } else { + dbgs() << "MainOp: NULL\n"; + dbgs() << "AltOp: NULL\n"; + } dbgs() << "VectorizedValue: "; if (VectorizedValue) dbgs() << *VectorizedValue << "\n"; @@ -3715,9 +3704,13 @@ class BoUpSLP { } #endif - TreeEntry *getTreeEntry(Value *V) { return ScalarToTreeEntry.lookup(V); } + TreeEntry *getTreeEntry(Value *V) { + assert(V && "V cannot be nullptr."); + return ScalarToTreeEntry.lookup(V); + } const TreeEntry *getTreeEntry(Value *V) const { + assert(V && "V cannot be nullptr."); return ScalarToTreeEntry.lookup(V); } @@ -4995,6 +4988,23 @@ static Value *createInsertVector( return Vec; } +/// Correctly creates extract_subvector, checking that the index is multiple of +/// the subvectors length. Otherwise, generates shuffle using \p Generator or +/// using default shuffle. 
+static Value *createExtractVector(IRBuilderBase &Builder, Value *Vec, + unsigned SubVecVF, unsigned Index) { + if (Index % SubVecVF == 0) { + VectorType *SubVecTy = + getWidenedType(Vec->getType()->getScalarType(), SubVecVF); + return Builder.CreateExtractVector(SubVecTy, Vec, Builder.getInt64(Index)); + } + // Create shuffle, extract_subvector requires that index is multiple of + // the subvector length. + SmallVector Mask(SubVecVF, PoisonMaskElem); + std::iota(Mask.begin(), Mask.end(), Index); + return Builder.CreateShuffleVector(Vec, Mask); +} + BoUpSLP::LoadsState BoUpSLP::canVectorizeLoads(ArrayRef VL, const Value *VL0, SmallVectorImpl &Order, @@ -5598,7 +5608,7 @@ BoUpSLP::getReorderingData(const TreeEntry &TE, bool TopToBottom) { // Try build correct order for extractelement instructions. SmallVector ReusedMask(TE.ReuseShuffleIndices.begin(), TE.ReuseShuffleIndices.end()); - if (TE.getOpcode() == Instruction::ExtractElement && + if (TE.hasState() && TE.getOpcode() == Instruction::ExtractElement && all_of(TE.Scalars, [Sz](Value *V) { if (isa(V)) return true; @@ -5760,10 +5770,11 @@ BoUpSLP::getReorderingData(const TreeEntry &TE, bool TopToBottom) { return std::nullopt; // No need to reorder. return std::move(Phis); } - if (TE.isGather() && !TE.isAltShuffle() && allSameType(TE.Scalars)) { + if (TE.isGather() && (!TE.hasState() || !TE.isAltShuffle()) && + allSameType(TE.Scalars)) { // TODO: add analysis of other gather nodes with extractelement // instructions and other values/instructions, not only undefs. - if ((TE.getOpcode() == Instruction::ExtractElement || + if (((TE.hasState() && TE.getOpcode() == Instruction::ExtractElement) || (all_of(TE.Scalars, IsaPred) && any_of(TE.Scalars, IsaPred))) && all_of(TE.Scalars, [](Value *V) { @@ -5773,8 +5784,8 @@ BoUpSLP::getReorderingData(const TreeEntry &TE, bool TopToBottom) { // Check that gather of extractelements can be represented as // just a shuffle of a single vector. 
OrdersType CurrentOrder; - bool Reuse = canReuseExtract(TE.Scalars, TE.getMainOp(), CurrentOrder, - /*ResizeAllowed=*/true); + bool Reuse = + canReuseExtract(TE.Scalars, CurrentOrder, /*ResizeAllowed=*/true); if (Reuse || !CurrentOrder.empty()) return std::move(CurrentOrder); } @@ -5823,7 +5834,7 @@ BoUpSLP::getReorderingData(const TreeEntry &TE, bool TopToBottom) { return Order; // Check if can include the order of vectorized loads. For masked gathers do // extra analysis later, so include such nodes into a special list. - if (TE.isGather() && TE.getOpcode() == Instruction::Load) { + if (TE.hasState() && TE.getOpcode() == Instruction::Load) { SmallVector PointerOps; OrdersType CurrentOrder; LoadsState Res = canVectorizeLoads(TE.Scalars, TE.Scalars.front(), @@ -5938,7 +5949,7 @@ void BoUpSLP::reorderTopToBottom() { // Patterns like [fadd,fsub] can be combined into a single instruction in // x86. Reordering them into [fsub,fadd] blocks this pattern. So we need // to take into account their order when looking for the most used order. - if (TE->isAltShuffle()) { + if (TE->hasState() && TE->isAltShuffle()) { VectorType *VecTy = getWidenedType(TE->Scalars[0]->getType(), TE->Scalars.size()); unsigned Opcode0 = TE->getOpcode(); @@ -6017,7 +6028,7 @@ void BoUpSLP::reorderTopToBottom() { if (It != GathersToOrders.end()) return It->second; } - if (OpTE->isAltShuffle()) { + if (OpTE->hasState() && OpTE->isAltShuffle()) { auto It = AltShufflesToOrders.find(OpTE); if (It != AltShufflesToOrders.end()) return It->second; @@ -7620,7 +7631,7 @@ BoUpSLP::TreeEntry::EntryState BoUpSLP::getScalarsVectorizationState( } case Instruction::ExtractValue: case Instruction::ExtractElement: { - bool Reuse = canReuseExtract(VL, VL0, CurrentOrder); + bool Reuse = canReuseExtract(VL, CurrentOrder); // FIXME: Vectorizing is not supported yet for non-power-of-2 ops. 
if (!has_single_bit(VL.size())) return TreeEntry::NeedToGather; @@ -8640,7 +8651,7 @@ void BoUpSLP::buildTree_rec(ArrayRef VL, unsigned Depth, TE->dump()); ValueList Left, Right; - VLOperands Ops(VL, VL0, *this); + VLOperands Ops(VL, S, *this); if (cast(VL0)->isCommutative()) { // Commutative predicate - collect + sort operands of the instructions // so that each side is more likely to have the same opcode. @@ -8908,7 +8919,7 @@ unsigned BoUpSLP::canMapToVector(Type *T) const { return N; } -bool BoUpSLP::canReuseExtract(ArrayRef VL, Value *OpValue, +bool BoUpSLP::canReuseExtract(ArrayRef VL, SmallVectorImpl &CurrentOrder, bool ResizeAllowed) const { const auto *It = find_if(VL, IsaPred); @@ -9562,7 +9573,7 @@ void BoUpSLP::reorderGatherNode(TreeEntry &TE) { // Do not reorder nodes if it small (just 2 elements), all-constant or all // instructions have same opcode already. - if (TE.Scalars.size() == 2 || (TE.getOpcode() && !TE.isAltShuffle()) || + if (TE.Scalars.size() == 2 || (TE.hasState() && !TE.isAltShuffle()) || all_of(TE.Scalars, isConstant)) return; @@ -9781,7 +9792,7 @@ void BoUpSLP::transformNodes() { // Do not try partial vectorization for small nodes (<= 2), nodes with the // same opcode and same parent block or all constants. 
if (VL.size() <= 2 || LoadEntriesToVectorize.contains(Idx) || - !(!E.getOpcode() || E.getOpcode() == Instruction::Load || + !(!E.hasState() || E.getOpcode() == Instruction::Load || E.isAltShuffle() || !allSameBlock(VL)) || allConstant(VL) || isSplat(VL)) continue; @@ -9904,6 +9915,7 @@ void BoUpSLP::transformNodes() { buildTree_rec(Slice, 0, EdgeInfo(&E, UINT_MAX)); if (PrevSize + 1 == VectorizableTree.size() && VectorizableTree[PrevSize]->isGather() && + VectorizableTree[PrevSize]->hasState() && VectorizableTree[PrevSize]->getOpcode() != Instruction::ExtractElement && !isSplat(Slice)) { @@ -9924,6 +9936,8 @@ void BoUpSLP::transformNodes() { E.ReorderIndices.clear(); } } + if (!E.hasState()) + continue; switch (E.getOpcode()) { case Instruction::Load: { // No need to reorder masked gather loads, just reorder the scalar @@ -10027,7 +10041,7 @@ void BoUpSLP::transformNodes() { if (LoadEntriesToVectorize.empty()) { // Single load node - exit. - if (VectorizableTree.size() <= 1 && + if (VectorizableTree.size() <= 1 && VectorizableTree.front()->hasState() && VectorizableTree.front()->getOpcode() == Instruction::Load) return; // Small graph with small VF - exit. 
@@ -10043,7 +10057,7 @@ void BoUpSLP::transformNodes() { getCanonicalGraphSize() <= SmallTree && count_if(ArrayRef(VectorizableTree).drop_front(getCanonicalGraphSize()), [](const std::unique_ptr &TE) { - return TE->isGather() && + return TE->isGather() && TE->hasState() && TE->getOpcode() == Instruction::Load && !allSameBlock(TE->Scalars); }) == 1) @@ -10059,13 +10073,13 @@ void BoUpSLP::transformNodes() { for (std::unique_ptr &TE : VectorizableTree) { TreeEntry &E = *TE; if (E.isGather() && - (E.getOpcode() == Instruction::Load || - (!E.getOpcode() && any_of(E.Scalars, - [&](Value *V) { - return isa(V) && - !isVectorized(V) && - !isDeleted(cast(V)); - }))) && + ((E.hasState() && E.getOpcode() == Instruction::Load) || + (!E.hasState() && any_of(E.Scalars, + [&](Value *V) { + return isa(V) && + !isVectorized(V) && + !isDeleted(cast(V)); + }))) && !isSplat(E.Scalars)) { for (Value *V : E.Scalars) { auto *LI = dyn_cast(V); @@ -10659,7 +10673,7 @@ class BoUpSLP::ShuffleCostEstimator : public BaseShuffleAnalysis { bool PrevNodeFound = any_of( ArrayRef(R.VectorizableTree).take_front(E->Idx), [&](const std::unique_ptr &TE) { - return ((!TE->isAltShuffle() && + return ((TE->hasState() && !TE->isAltShuffle() && TE->getOpcode() == Instruction::ExtractElement) || TE->isGather()) && all_of(enumerate(TE->Scalars), [&](auto &&Data) { @@ -11784,7 +11798,7 @@ BoUpSLP::getEntryCost(const TreeEntry *E, ArrayRef VectorizedVals, for (const std::unique_ptr &TE : VectorizableTree) { if (TE.get() == E) break; - if (TE->isAltShuffle() && + if (TE->hasState() && TE->isAltShuffle() && ((TE->getOpcode() == E->getOpcode() && TE->getAltOpcode() == E->getAltOpcode()) || (TE->getOpcode() == E->getAltOpcode() && @@ -11946,10 +11960,12 @@ bool BoUpSLP::isFullyVectorizableTinyTree(bool ForReduction) const { [this](Value *V) { return EphValues.contains(V); }) && (allConstant(TE->Scalars) || isSplat(TE->Scalars) || TE->Scalars.size() < Limit || - ((TE->getOpcode() == Instruction::ExtractElement || + 
(((TE->hasState() && + TE->getOpcode() == Instruction::ExtractElement) || all_of(TE->Scalars, IsaPred)) && isFixedVectorShuffle(TE->Scalars, Mask, AC)) || - (TE->getOpcode() == Instruction::Load && !TE->isAltShuffle()) || + (TE->hasState() && TE->getOpcode() == Instruction::Load && + !TE->isAltShuffle()) || any_of(TE->Scalars, IsaPred)); }; @@ -12078,9 +12094,10 @@ bool BoUpSLP::isTreeTinyAndNotFullyVectorizable(bool ForReduction) const { !VectorizableTree.empty() && all_of(VectorizableTree, [&](const std::unique_ptr &TE) { return (TE->isGather() && - TE->getOpcode() != Instruction::ExtractElement && + (!TE->hasState() || + TE->getOpcode() != Instruction::ExtractElement) && count_if(TE->Scalars, IsaPred) <= Limit) || - TE->getOpcode() == Instruction::PHI; + (TE->hasState() && TE->getOpcode() == Instruction::PHI); })) return true; @@ -12098,7 +12115,7 @@ bool BoUpSLP::isTreeTinyAndNotFullyVectorizable(bool ForReduction) const { // somewhere. bool IsAllowedSingleBVNode = VectorizableTree.size() > 1 || - (VectorizableTree.size() == 1 && VectorizableTree.front()->getOpcode() && + (VectorizableTree.size() == 1 && VectorizableTree.front()->hasState() && !VectorizableTree.front()->isAltShuffle() && VectorizableTree.front()->getOpcode() != Instruction::PHI && VectorizableTree.front()->getOpcode() != Instruction::GetElementPtr && @@ -12114,6 +12131,7 @@ bool BoUpSLP::isTreeTinyAndNotFullyVectorizable(bool ForReduction) const { return false; if (VectorizableTree.back()->isGather() && + VectorizableTree.back()->hasState() && VectorizableTree.back()->isAltShuffle() && VectorizableTree.back()->getVectorFactor() > 2 && allSameBlock(VectorizableTree.back()->Scalars) && @@ -12138,7 +12156,7 @@ bool BoUpSLP::isTreeNotExtendable() const { getCanonicalGraphSize() <= SmallTree && count_if(ArrayRef(VectorizableTree).drop_front(getCanonicalGraphSize()), [](const std::unique_ptr &TE) { - return TE->isGather() && + return TE->isGather() && TE->hasState() && TE->getOpcode() == 
Instruction::Load && !allSameBlock(TE->Scalars); }) == 1) @@ -12150,7 +12168,7 @@ bool BoUpSLP::isTreeNotExtendable() const { TreeEntry &E = *VectorizableTree[Idx]; if (!E.isGather()) continue; - if (E.getOpcode() && E.getOpcode() != Instruction::Load) + if (E.hasState() && E.getOpcode() != Instruction::Load) return false; if (isSplat(E.Scalars) || allConstant(E.Scalars)) continue; @@ -12460,7 +12478,7 @@ InstructionCost BoUpSLP::getTreeCost(ArrayRef VectorizedVals) { TE.dump(); dbgs() << "SLP: Current total cost = " << Cost << "\n"); continue; } - if (TE.isGather()) { + if (TE.isGather() && TE.hasState()) { if (const TreeEntry *E = getTreeEntry(TE.getMainOp()); E && E->getVectorFactor() == TE.getVectorFactor() && E->isSame(TE.Scalars)) { @@ -13609,9 +13627,11 @@ BoUpSLP::isGatherShuffledEntry( if (!TE->UserTreeIndices.empty() && TE->UserTreeIndices.front().UserTE->isGather() && TE->UserTreeIndices.front().EdgeIdx == UINT_MAX) { - assert((TE->Idx == 0 || TE->getOpcode() == Instruction::ExtractElement || - isSplat(TE->Scalars)) && - "Expected splat or extractelements only node."); + assert( + (TE->Idx == 0 || + (TE->hasState() && TE->getOpcode() == Instruction::ExtractElement) || + isSplat(TE->Scalars)) && + "Expected splat or extractelements only node."); return {}; } unsigned SliceSize = getPartNumElems(VL.size(), NumParts); @@ -14904,14 +14924,15 @@ ResTy BoUpSLP::processBuildVector(const TreeEntry *E, Type *ScalarTy, } } // Gather extracts after we check for full matched gathers only. 
- if (!ExtractShuffles.empty() || E->getOpcode() != Instruction::Load || - ((E->getOpcode() == Instruction::Load || + if (!ExtractShuffles.empty() || !E->hasState() || + E->getOpcode() != Instruction::Load || + (((E->hasState() && E->getOpcode() == Instruction::Load) || any_of(E->Scalars, IsaPred)) && any_of(E->Scalars, [this](Value *V) { return isa(V) && getTreeEntry(V); })) || - E->isAltShuffle() || + (E->hasState() && E->isAltShuffle()) || all_of(E->Scalars, [this](Value *V) { return getTreeEntry(V); }) || isSplat(E->Scalars) || (E->Scalars != GatheredScalars && GatheredScalars.size() <= 2)) { @@ -15291,7 +15312,7 @@ Value *BoUpSLP::vectorizeTree(TreeEntry *E, bool PostponedPHIs) { auto *VecTy = getWidenedType(ScalarTy, E->Scalars.size()); if (E->isGather()) { // Set insert point for non-reduction initial nodes. - if (E->getMainOp() && E->Idx == 0 && !UserIgnoreList) + if (E->hasState() && E->Idx == 0 && !UserIgnoreList) setInsertPointAfterBundle(E); Value *Vec = createBuildVector(E, ScalarTy, PostponedPHIs); E->VectorizedValue = Vec; @@ -16550,10 +16571,8 @@ BoUpSLP::vectorizeTree(const ExtraValueToDebugLocsMap &ExternallyUsedValues, // When REVEC is enabled, we need to extract a vector. // Note: The element size of Scalar may be different from the // element size of Vec. - Ex = Builder.CreateExtractVector( - FixedVectorType::get(Vec->getType()->getScalarType(), - VecTyNumElements), - Vec, Builder.getInt64(ExternalUse.Lane * VecTyNumElements)); + Ex = createExtractVector(Builder, Vec, VecTyNumElements, + ExternalUse.Lane * VecTyNumElements); } else { Ex = Builder.CreateExtractElement(Vec, Lane); } @@ -18138,8 +18157,9 @@ static RecurKind getRdxKind(Value *V); void BoUpSLP::computeMinimumValueSizes() { // We only attempt to truncate integer expressions. 
bool IsStoreOrInsertElt = - VectorizableTree.front()->getOpcode() == Instruction::Store || - VectorizableTree.front()->getOpcode() == Instruction::InsertElement; + VectorizableTree.front()->hasState() && + (VectorizableTree.front()->getOpcode() == Instruction::Store || + VectorizableTree.front()->getOpcode() == Instruction::InsertElement); if ((IsStoreOrInsertElt || UserIgnoreList) && ExtraBitWidthNodes.size() <= 1 && (!CastMaxMinBWSizes || CastMaxMinBWSizes->second == 0 || @@ -18180,10 +18200,9 @@ void BoUpSLP::computeMinimumValueSizes() { return; SmallVector ToDemote; - auto ComputeMaxBitWidth = [&](const TreeEntry &E, bool IsTopRoot, - bool IsProfitableToDemoteRoot, unsigned Opcode, - unsigned Limit, bool IsTruncRoot, - bool IsSignedCmp) -> unsigned { + auto ComputeMaxBitWidth = + [&](const TreeEntry &E, bool IsTopRoot, bool IsProfitableToDemoteRoot, + unsigned Limit, bool IsTruncRoot, bool IsSignedCmp) -> unsigned { ToDemote.clear(); // Check if the root is trunc and the next node is gather/buildvector, then // keep trunc in scalars, which is free in most cases. 
@@ -18224,11 +18243,14 @@ void BoUpSLP::computeMinimumValueSizes() { return MaxBitWidth; } + if (!E.hasState()) + return 0u; + unsigned VF = E.getVectorFactor(); Type *ScalarTy = E.Scalars.front()->getType(); unsigned ScalarTyNumElements = getNumElements(ScalarTy); auto *TreeRootIT = dyn_cast(ScalarTy->getScalarType()); - if (!TreeRootIT || !Opcode) + if (!TreeRootIT) return 0u; if (any_of(E.Scalars, @@ -18300,6 +18322,7 @@ void BoUpSLP::computeMinimumValueSizes() { IntegerType::get(F->getContext(), bit_ceil(MaxBitWidth)), VF))) return 0u; + unsigned Opcode = E.getOpcode(); bool IsProfitableToDemote = Opcode == Instruction::Trunc || Opcode == Instruction::SExt || Opcode == Instruction::ZExt || NumParts > 1; @@ -18380,15 +18403,14 @@ void BoUpSLP::computeMinimumValueSizes() { while (NodeIdx < VectorizableTree.size()) { ArrayRef TreeRoot = VectorizableTree[NodeIdx]->Scalars; unsigned Limit = 2; - unsigned Opcode = VectorizableTree[NodeIdx]->getOpcode(); if (IsTopRoot && ReductionBitWidth == DL->getTypeSizeInBits( VectorizableTree.front()->Scalars.front()->getType())) Limit = 3; unsigned MaxBitWidth = ComputeMaxBitWidth( - *VectorizableTree[NodeIdx], IsTopRoot, IsProfitableToDemoteRoot, Opcode, - Limit, IsTruncRoot, IsSignedCmp); + *VectorizableTree[NodeIdx], IsTopRoot, IsProfitableToDemoteRoot, Limit, + IsTruncRoot, IsSignedCmp); if (ReductionBitWidth != 0 && (IsTopRoot || !RootDemotes.empty())) { if (MaxBitWidth != 0 && ReductionBitWidth < MaxBitWidth) ReductionBitWidth = bit_ceil(MaxBitWidth); @@ -18431,19 +18453,21 @@ void BoUpSLP::computeMinimumValueSizes() { }); IsSignedCmp = NodeIdx < VectorizableTree.size() && - any_of(VectorizableTree[NodeIdx]->UserTreeIndices, - [&](const EdgeInfo &EI) { - return EI.UserTE->getOpcode() == Instruction::ICmp && - any_of(EI.UserTE->Scalars, [&](Value *V) { - auto *IC = dyn_cast(V); - return IC && - (IC->isSigned() || - !isKnownNonNegative(IC->getOperand(0), - SimplifyQuery(*DL)) || - !isKnownNonNegative(IC->getOperand(1), - 
SimplifyQuery(*DL))); - }); - }); + any_of( + VectorizableTree[NodeIdx]->UserTreeIndices, + [&](const EdgeInfo &EI) { + return (EI.UserTE->hasState() && + EI.UserTE->getOpcode() == Instruction::ICmp) && + any_of(EI.UserTE->Scalars, [&](Value *V) { + auto *IC = dyn_cast(V); + return IC && + (IC->isSigned() || + !isKnownNonNegative(IC->getOperand(0), + SimplifyQuery(*DL)) || + !isKnownNonNegative(IC->getOperand(1), + SimplifyQuery(*DL))); + }); + }); } // If the maximum bit width we compute is less than the width of the roots' @@ -20080,6 +20104,7 @@ class HorizontalReduction { NumRegs = TTI.getNumberOfRegisters(TTI.getRegisterClassForType(true, Tp)); while (NumParts > NumRegs) { + assert(ReduxWidth > 0 && "ReduxWidth is unexpectedly 0."); ReduxWidth = bit_floor(ReduxWidth - 1); VectorType *Tp = getWidenedType(ScalarTy, ReduxWidth); NumParts = TTI.getNumberOfParts(Tp); diff --git a/llvm/lib/Transforms/Vectorize/SandboxVectorizer/InstrMaps.cpp b/llvm/lib/Transforms/Vectorize/SandboxVectorizer/InstrMaps.cpp new file mode 100644 index 0000000000000..4df4829a04c41 --- /dev/null +++ b/llvm/lib/Transforms/Vectorize/SandboxVectorizer/InstrMaps.cpp @@ -0,0 +1,21 @@ +//===- InstructionMaps.cpp - Maps scalars to vectors and reverse ----------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#include "llvm/Transforms/Vectorize/SandboxVectorizer/InstrMaps.h" +#include "llvm/Support/Debug.h" + +namespace llvm::sandboxir { + +#ifndef NDEBUG +void InstrMaps::dump() const { + print(dbgs()); + dbgs() << "\n"; +} +#endif // NDEBUG + +} // namespace llvm::sandboxir diff --git a/llvm/lib/Transforms/Vectorize/SandboxVectorizer/Interval.cpp b/llvm/lib/Transforms/Vectorize/SandboxVectorizer/Interval.cpp index 79b3744419535..b2bfbf5c2e8b9 100644 --- a/llvm/lib/Transforms/Vectorize/SandboxVectorizer/Interval.cpp +++ b/llvm/lib/Transforms/Vectorize/SandboxVectorizer/Interval.cpp @@ -13,10 +13,36 @@ namespace llvm::sandboxir { -template class Interval; -template class Interval; +template bool Interval::disjoint(const Interval &Other) const { + if (Other.empty()) + return true; + if (empty()) + return true; + return Other.Bottom->comesBefore(Top) || Bottom->comesBefore(Other.Top); +} #ifndef NDEBUG +template void Interval::print(raw_ostream &OS) const { + auto *Top = top(); + auto *Bot = bottom(); + OS << "Top: "; + if (Top != nullptr) + OS << *Top; + else + OS << "nullptr"; + OS << "\n"; + + OS << "Bot: "; + if (Bot != nullptr) + OS << *Bot; + else + OS << "nullptr"; + OS << "\n"; +} template void Interval::dump() const { print(dbgs()); } #endif + +template class Interval; +template class Interval; + } // namespace llvm::sandboxir diff --git a/llvm/lib/Transforms/Vectorize/SandboxVectorizer/Legality.cpp b/llvm/lib/Transforms/Vectorize/SandboxVectorizer/Legality.cpp index 8c6deeb7df249..ad3e38e2f1d92 100644 --- a/llvm/lib/Transforms/Vectorize/SandboxVectorizer/Legality.cpp +++ b/llvm/lib/Transforms/Vectorize/SandboxVectorizer/Legality.cpp @@ -12,6 +12,7 @@ #include "llvm/SandboxIR/Utils.h" #include "llvm/SandboxIR/Value.h" #include "llvm/Support/Debug.h" +#include "llvm/Transforms/Vectorize/SandboxVectorizer/InstrMaps.h" 
#include "llvm/Transforms/Vectorize/SandboxVectorizer/VecUtils.h" namespace llvm::sandboxir { @@ -19,6 +20,11 @@ namespace llvm::sandboxir { #define DEBUG_TYPE "SBVec:Legality" #ifndef NDEBUG +void ShuffleMask::dump() const { + print(dbgs()); + dbgs() << "\n"; +} + void LegalityResult::dump() const { print(dbgs()); dbgs() << "\n"; @@ -184,6 +190,22 @@ static void dumpBndl(ArrayRef Bndl) { } #endif // NDEBUG +CollectDescr +LegalityAnalysis::getHowToCollectValues(ArrayRef Bndl) const { + SmallVector Vec; + Vec.reserve(Bndl.size()); + for (auto [Lane, V] : enumerate(Bndl)) { + if (auto *VecOp = IMaps.getVectorForOrig(V)) { + // If there is a vector containing `V`, then get the lane it came from. + std::optional ExtractIdxOpt = IMaps.getOrigLane(VecOp, V); + Vec.emplace_back(VecOp, ExtractIdxOpt ? *ExtractIdxOpt : -1); + } else { + Vec.emplace_back(V); + } + } + return CollectDescr(std::move(Vec)); +} + const LegalityResult &LegalityAnalysis::canVectorize(ArrayRef Bndl, bool SkipScheduling) { // If Bndl contains values other than instructions, we need to Pack. @@ -193,11 +215,20 @@ const LegalityResult &LegalityAnalysis::canVectorize(ArrayRef Bndl, return createLegalityResult(ResultReason::NotInstructions); } + auto CollectDescrs = getHowToCollectValues(Bndl); + if (CollectDescrs.hasVectorInputs()) { + if (auto ValueShuffleOpt = CollectDescrs.getSingleInput()) { + auto [Vec, Mask] = *ValueShuffleOpt; + if (Mask.isIdentity()) + return createLegalityResult(Vec); + return createLegalityResult(Vec, Mask); + } + llvm_unreachable("TODO: Unimplemented"); + } + if (auto ReasonOpt = notVectorizableBasedOnOpcodesAndTypes(Bndl)) return createLegalityResult(*ReasonOpt); - // TODO: Check for existing vectors containing values in Bndl. - if (!SkipScheduling) { // TODO: Try to remove the IBndl vector. 
SmallVector IBndl; @@ -210,4 +241,9 @@ const LegalityResult &LegalityAnalysis::canVectorize(ArrayRef Bndl, return createLegalityResult(); } + +void LegalityAnalysis::clear() { + Sched.clear(); + IMaps.clear(); +} } // namespace llvm::sandboxir diff --git a/llvm/lib/Transforms/Vectorize/SandboxVectorizer/Passes/BottomUpVec.cpp b/llvm/lib/Transforms/Vectorize/SandboxVectorizer/Passes/BottomUpVec.cpp index d44199609838d..d62023ea01884 100644 --- a/llvm/lib/Transforms/Vectorize/SandboxVectorizer/Passes/BottomUpVec.cpp +++ b/llvm/lib/Transforms/Vectorize/SandboxVectorizer/Passes/BottomUpVec.cpp @@ -56,103 +56,114 @@ getInsertPointAfterInstrs(ArrayRef Instrs) { Value *BottomUpVec::createVectorInstr(ArrayRef Bndl, ArrayRef Operands) { - Change = true; - assert(all_of(Bndl, [](auto *V) { return isa(V); }) && - "Expect Instructions!"); - auto &Ctx = Bndl[0]->getContext(); + auto CreateVectorInstr = [](ArrayRef Bndl, + ArrayRef Operands) -> Value * { + assert(all_of(Bndl, [](auto *V) { return isa(V); }) && + "Expect Instructions!"); + auto &Ctx = Bndl[0]->getContext(); - Type *ScalarTy = VecUtils::getElementType(Utils::getExpectedType(Bndl[0])); - auto *VecTy = VecUtils::getWideType(ScalarTy, VecUtils::getNumLanes(Bndl)); + Type *ScalarTy = VecUtils::getElementType(Utils::getExpectedType(Bndl[0])); + auto *VecTy = VecUtils::getWideType(ScalarTy, VecUtils::getNumLanes(Bndl)); - BasicBlock::iterator WhereIt = getInsertPointAfterInstrs(Bndl); + BasicBlock::iterator WhereIt = getInsertPointAfterInstrs(Bndl); - auto Opcode = cast(Bndl[0])->getOpcode(); - switch (Opcode) { - case Instruction::Opcode::ZExt: - case Instruction::Opcode::SExt: - case Instruction::Opcode::FPToUI: - case Instruction::Opcode::FPToSI: - case Instruction::Opcode::FPExt: - case Instruction::Opcode::PtrToInt: - case Instruction::Opcode::IntToPtr: - case Instruction::Opcode::SIToFP: - case Instruction::Opcode::UIToFP: - case Instruction::Opcode::Trunc: - case Instruction::Opcode::FPTrunc: - case 
Instruction::Opcode::BitCast: { - assert(Operands.size() == 1u && "Casts are unary!"); - return CastInst::create(VecTy, Opcode, Operands[0], WhereIt, Ctx, "VCast"); - } - case Instruction::Opcode::FCmp: - case Instruction::Opcode::ICmp: { - auto Pred = cast(Bndl[0])->getPredicate(); - assert(all_of(drop_begin(Bndl), - [Pred](auto *SBV) { - return cast(SBV)->getPredicate() == Pred; - }) && - "Expected same predicate across bundle."); - return CmpInst::create(Pred, Operands[0], Operands[1], WhereIt, Ctx, - "VCmp"); - } - case Instruction::Opcode::Select: { - return SelectInst::create(Operands[0], Operands[1], Operands[2], WhereIt, - Ctx, "Vec"); - } - case Instruction::Opcode::FNeg: { - auto *UOp0 = cast(Bndl[0]); - auto OpC = UOp0->getOpcode(); - return UnaryOperator::createWithCopiedFlags(OpC, Operands[0], UOp0, WhereIt, - Ctx, "Vec"); - } - case Instruction::Opcode::Add: - case Instruction::Opcode::FAdd: - case Instruction::Opcode::Sub: - case Instruction::Opcode::FSub: - case Instruction::Opcode::Mul: - case Instruction::Opcode::FMul: - case Instruction::Opcode::UDiv: - case Instruction::Opcode::SDiv: - case Instruction::Opcode::FDiv: - case Instruction::Opcode::URem: - case Instruction::Opcode::SRem: - case Instruction::Opcode::FRem: - case Instruction::Opcode::Shl: - case Instruction::Opcode::LShr: - case Instruction::Opcode::AShr: - case Instruction::Opcode::And: - case Instruction::Opcode::Or: - case Instruction::Opcode::Xor: { - auto *BinOp0 = cast(Bndl[0]); - auto *LHS = Operands[0]; - auto *RHS = Operands[1]; - return BinaryOperator::createWithCopiedFlags(BinOp0->getOpcode(), LHS, RHS, - BinOp0, WhereIt, Ctx, "Vec"); - } - case Instruction::Opcode::Load: { - auto *Ld0 = cast(Bndl[0]); - Value *Ptr = Ld0->getPointerOperand(); - return LoadInst::create(VecTy, Ptr, Ld0->getAlign(), WhereIt, Ctx, "VecL"); - } - case Instruction::Opcode::Store: { - auto Align = cast(Bndl[0])->getAlign(); - Value *Val = Operands[0]; - Value *Ptr = Operands[1]; - return 
StoreInst::create(Val, Ptr, Align, WhereIt, Ctx); - } - case Instruction::Opcode::Br: - case Instruction::Opcode::Ret: - case Instruction::Opcode::PHI: - case Instruction::Opcode::AddrSpaceCast: - case Instruction::Opcode::Call: - case Instruction::Opcode::GetElementPtr: - llvm_unreachable("Unimplemented"); - break; - default: - llvm_unreachable("Unimplemented"); - break; + auto Opcode = cast(Bndl[0])->getOpcode(); + switch (Opcode) { + case Instruction::Opcode::ZExt: + case Instruction::Opcode::SExt: + case Instruction::Opcode::FPToUI: + case Instruction::Opcode::FPToSI: + case Instruction::Opcode::FPExt: + case Instruction::Opcode::PtrToInt: + case Instruction::Opcode::IntToPtr: + case Instruction::Opcode::SIToFP: + case Instruction::Opcode::UIToFP: + case Instruction::Opcode::Trunc: + case Instruction::Opcode::FPTrunc: + case Instruction::Opcode::BitCast: { + assert(Operands.size() == 1u && "Casts are unary!"); + return CastInst::create(VecTy, Opcode, Operands[0], WhereIt, Ctx, + "VCast"); + } + case Instruction::Opcode::FCmp: + case Instruction::Opcode::ICmp: { + auto Pred = cast(Bndl[0])->getPredicate(); + assert(all_of(drop_begin(Bndl), + [Pred](auto *SBV) { + return cast(SBV)->getPredicate() == Pred; + }) && + "Expected same predicate across bundle."); + return CmpInst::create(Pred, Operands[0], Operands[1], WhereIt, Ctx, + "VCmp"); + } + case Instruction::Opcode::Select: { + return SelectInst::create(Operands[0], Operands[1], Operands[2], WhereIt, + Ctx, "Vec"); + } + case Instruction::Opcode::FNeg: { + auto *UOp0 = cast(Bndl[0]); + auto OpC = UOp0->getOpcode(); + return UnaryOperator::createWithCopiedFlags(OpC, Operands[0], UOp0, + WhereIt, Ctx, "Vec"); + } + case Instruction::Opcode::Add: + case Instruction::Opcode::FAdd: + case Instruction::Opcode::Sub: + case Instruction::Opcode::FSub: + case Instruction::Opcode::Mul: + case Instruction::Opcode::FMul: + case Instruction::Opcode::UDiv: + case Instruction::Opcode::SDiv: + case Instruction::Opcode::FDiv: + 
case Instruction::Opcode::URem: + case Instruction::Opcode::SRem: + case Instruction::Opcode::FRem: + case Instruction::Opcode::Shl: + case Instruction::Opcode::LShr: + case Instruction::Opcode::AShr: + case Instruction::Opcode::And: + case Instruction::Opcode::Or: + case Instruction::Opcode::Xor: { + auto *BinOp0 = cast(Bndl[0]); + auto *LHS = Operands[0]; + auto *RHS = Operands[1]; + return BinaryOperator::createWithCopiedFlags( + BinOp0->getOpcode(), LHS, RHS, BinOp0, WhereIt, Ctx, "Vec"); + } + case Instruction::Opcode::Load: { + auto *Ld0 = cast(Bndl[0]); + Value *Ptr = Ld0->getPointerOperand(); + return LoadInst::create(VecTy, Ptr, Ld0->getAlign(), WhereIt, Ctx, + "VecL"); + } + case Instruction::Opcode::Store: { + auto Align = cast(Bndl[0])->getAlign(); + Value *Val = Operands[0]; + Value *Ptr = Operands[1]; + return StoreInst::create(Val, Ptr, Align, WhereIt, Ctx); + } + case Instruction::Opcode::Br: + case Instruction::Opcode::Ret: + case Instruction::Opcode::PHI: + case Instruction::Opcode::AddrSpaceCast: + case Instruction::Opcode::Call: + case Instruction::Opcode::GetElementPtr: + llvm_unreachable("Unimplemented"); + break; + default: + llvm_unreachable("Unimplemented"); + break; + } + llvm_unreachable("Missing switch case!"); + // TODO: Propagate debug info. + }; + + auto *VecI = CreateVectorInstr(Bndl, Operands); + if (VecI != nullptr) { + Change = true; + IMaps->registerVector(Bndl, VecI); } - llvm_unreachable("Missing switch case!"); - // TODO: Propagate debug info. 
+ return VecI; } void BottomUpVec::tryEraseDeadInstrs() { @@ -168,6 +179,12 @@ void BottomUpVec::tryEraseDeadInstrs() { DeadInstrCandidates.clear(); } +Value *BottomUpVec::createShuffle(Value *VecOp, const ShuffleMask &Mask) { + BasicBlock::iterator WhereIt = getInsertPointAfterInstrs({VecOp}); + return ShuffleVectorInst::create(VecOp, VecOp, Mask, WhereIt, + VecOp->getContext(), "VShuf"); +} + Value *BottomUpVec::createPack(ArrayRef ToPack) { BasicBlock::iterator WhereIt = getInsertPointAfterInstrs(ToPack); @@ -280,6 +297,17 @@ Value *BottomUpVec::vectorizeRec(ArrayRef Bndl, unsigned Depth) { collectPotentiallyDeadInstrs(Bndl); break; } + case LegalityResultID::DiamondReuse: { + NewVec = cast(LegalityRes).getVector(); + break; + } + case LegalityResultID::DiamondReuseWithShuffle: { + auto *VecOp = cast(LegalityRes).getVector(); + const ShuffleMask &Mask = + cast(LegalityRes).getMask(); + NewVec = createShuffle(VecOp, Mask); + break; + } case LegalityResultID::Pack: { // If we can't vectorize the seeds then just return. if (Depth == 0) @@ -300,9 +328,10 @@ bool BottomUpVec::tryVectorize(ArrayRef Bndl) { } bool BottomUpVec::runOnFunction(Function &F, const Analyses &A) { + IMaps = std::make_unique(F.getContext()); Legality = std::make_unique( A.getAA(), A.getScalarEvolution(), F.getParent()->getDataLayout(), - F.getContext()); + F.getContext(), *IMaps); Change = false; const auto &DL = F.getParent()->getDataLayout(); unsigned VecRegBits = diff --git a/llvm/lib/Transforms/Vectorize/SandboxVectorizer/VecUtils.cpp b/llvm/lib/Transforms/Vectorize/SandboxVectorizer/VecUtils.cpp new file mode 100644 index 0000000000000..6f9ef07e467d2 --- /dev/null +++ b/llvm/lib/Transforms/Vectorize/SandboxVectorizer/VecUtils.cpp @@ -0,0 +1,32 @@ +//===- VecUtils.cpp -------------------------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#include "llvm/Transforms/Vectorize/SandboxVectorizer/VecUtils.h" + +namespace llvm::sandboxir { + +unsigned VecUtils::getFloorPowerOf2(unsigned Num) { + if (Num == 0) + return Num; + unsigned Mask = Num; + Mask >>= 1; + for (unsigned ShiftBy = 1; ShiftBy < sizeof(Num) * 8; ShiftBy <<= 1) + Mask |= Mask >> ShiftBy; + return Num & ~Mask; +} + +#ifndef NDEBUG +template static void dumpImpl(ArrayRef Bndl) { + for (auto [Idx, V] : enumerate(Bndl)) + dbgs() << Idx << "." << *V << "\n"; +} +void VecUtils::dump(ArrayRef Bndl) { dumpImpl(Bndl); } +void VecUtils::dump(ArrayRef Bndl) { dumpImpl(Bndl); } +#endif // NDEBUG + +} // namespace llvm::sandboxir diff --git a/llvm/lib/Transforms/Vectorize/VPlan.cpp b/llvm/lib/Transforms/Vectorize/VPlan.cpp index aa41c41e90c4c..f1228368804be 100644 --- a/llvm/lib/Transforms/Vectorize/VPlan.cpp +++ b/llvm/lib/Transforms/Vectorize/VPlan.cpp @@ -770,7 +770,7 @@ InstructionCost VPRegionBlock::cost(ElementCount VF, VPCostContext &Ctx) { InstructionCost BackedgeCost = ForceTargetInstructionCost.getNumOccurrences() ? 
InstructionCost(ForceTargetInstructionCost.getNumOccurrences()) - : Ctx.TTI.getCFInstrCost(Instruction::Br, TTI::TCK_RecipThroughput); + : Ctx.TTI.getCFInstrCost(Instruction::Br, Ctx.CostKind); LLVM_DEBUG(dbgs() << "Cost of " << BackedgeCost << " for VF " << VF << ": vector loop backedge\n"); Cost += BackedgeCost; diff --git a/llvm/lib/Transforms/Vectorize/VPlan.h b/llvm/lib/Transforms/Vectorize/VPlan.h index 87f87bf143719..784cee6ed4b06 100644 --- a/llvm/lib/Transforms/Vectorize/VPlan.h +++ b/llvm/lib/Transforms/Vectorize/VPlan.h @@ -686,11 +686,13 @@ struct VPCostContext { LLVMContext &LLVMCtx; LoopVectorizationCostModel &CM; SmallPtrSet SkipCostComputation; + TargetTransformInfo::TargetCostKind CostKind; VPCostContext(const TargetTransformInfo &TTI, const TargetLibraryInfo &TLI, - Type *CanIVTy, LoopVectorizationCostModel &CM) + Type *CanIVTy, LoopVectorizationCostModel &CM, + TargetTransformInfo::TargetCostKind CostKind) : TTI(TTI), TLI(TLI), Types(CanIVTy), LLVMCtx(CanIVTy->getContext()), - CM(CM) {} + CM(CM), CostKind(CostKind) {} /// Return the cost for \p UI with \p VF using the legacy cost model as /// fallback until computing the cost of all recipes migrates to VPlan. @@ -1223,8 +1225,8 @@ class VPInstruction : public VPRecipeWithIRFlags, // operand). Only generates scalar values (either for the first lane only or // for all lanes, depending on its uses). PtrAdd, - // Returns a scalar boolean value, which is true if any lane of its single - // operand is true. + // Returns a scalar boolean value, which is true if any lane of its (only + // boolean) vector operand is true. AnyOf, }; @@ -3526,6 +3528,15 @@ class VPScalarIVStepsRecipe : public VPRecipeWithIRFlags, /// holds a sequence of zero or more VPRecipe's each representing a sequence of /// output IR instructions. All PHI-like recipes must come before any non-PHI recipes. class VPBasicBlock : public VPBlockBase { + friend class VPlan; + + /// Use VPlan::createVPBasicBlock to create VPBasicBlocks. 
+ VPBasicBlock(const Twine &Name = "", VPRecipeBase *Recipe = nullptr) + : VPBlockBase(VPBasicBlockSC, Name.str()) { + if (Recipe) + appendRecipe(Recipe); + } + public: using RecipeListTy = iplist; @@ -3537,12 +3548,6 @@ class VPBasicBlock : public VPBlockBase { : VPBlockBase(BlockSC, Name.str()) {} public: - VPBasicBlock(const Twine &Name = "", VPRecipeBase *Recipe = nullptr) - : VPBlockBase(VPBasicBlockSC, Name.str()) { - if (Recipe) - appendRecipe(Recipe); - } - ~VPBasicBlock() override { while (!Recipes.empty()) Recipes.pop_back(); @@ -3665,14 +3670,17 @@ class VPBasicBlock : public VPBlockBase { /// Note: At the moment, VPIRBasicBlock can only be used to wrap VPlan's /// preheader block. class VPIRBasicBlock : public VPBasicBlock { + friend class VPlan; + BasicBlock *IRBB; -public: + /// Use VPlan::createVPIRBasicBlock to create VPIRBasicBlocks. VPIRBasicBlock(BasicBlock *IRBB) : VPBasicBlock(VPIRBasicBlockSC, (Twine("ir-bb<") + IRBB->getName() + Twine(">")).str()), IRBB(IRBB) {} +public: ~VPIRBasicBlock() override {} static inline bool classof(const VPBlockBase *V) { @@ -3697,6 +3705,8 @@ class VPIRBasicBlock : public VPBasicBlock { /// candidate VF's. The actual replication takes place only once the desired VF /// and UF have been determined. class VPRegionBlock : public VPBlockBase { + friend class VPlan; + /// Hold the Single Entry of the SESE region modelled by the VPRegionBlock. VPBlockBase *Entry; @@ -3708,7 +3718,7 @@ class VPRegionBlock : public VPBlockBase { /// instances of output IR corresponding to its VPBlockBases. bool IsReplicator; -public: + /// Use VPlan::createVPRegionBlock to create VPRegionBlocks. 
VPRegionBlock(VPBlockBase *Entry, VPBlockBase *Exiting, const std::string &Name = "", bool IsReplicator = false) : VPBlockBase(VPRegionBlockSC, Name), Entry(Entry), Exiting(Exiting), @@ -3722,6 +3732,7 @@ class VPRegionBlock : public VPBlockBase { : VPBlockBase(VPRegionBlockSC, Name), Entry(nullptr), Exiting(nullptr), IsReplicator(IsReplicator) {} +public: ~VPRegionBlock() override {} /// Method to support type inquiry through isa, cast, and dyn_cast. diff --git a/llvm/lib/Transforms/Vectorize/VPlanAnalysis.cpp b/llvm/lib/Transforms/Vectorize/VPlanAnalysis.cpp index 8fea2c6fd33b6..27357ff04b5f2 100644 --- a/llvm/lib/Transforms/Vectorize/VPlanAnalysis.cpp +++ b/llvm/lib/Transforms/Vectorize/VPlanAnalysis.cpp @@ -60,7 +60,10 @@ Type *VPTypeAnalysis::inferScalarTypeForRecipe(const VPInstruction *R) { } case Instruction::ICmp: case VPInstruction::ActiveLaneMask: - return inferScalarType(R->getOperand(1)); + assert(inferScalarType(R->getOperand(0)) == + inferScalarType(R->getOperand(1)) && + "different types inferred for different operands"); + return IntegerType::get(Ctx, 1); case VPInstruction::ComputeReductionResult: { auto *PhiR = cast(R->getOperand(0)); auto *OrigPhi = cast(PhiR->getUnderlyingValue()); @@ -71,6 +74,9 @@ Type *VPTypeAnalysis::inferScalarTypeForRecipe(const VPInstruction *R) { case VPInstruction::FirstOrderRecurrenceSplice: case VPInstruction::Not: case VPInstruction::ResumePhi: + case VPInstruction::CalculateTripCountMinusVF: + case VPInstruction::CanonicalIVIncrementForPart: + case VPInstruction::AnyOf: return SetResultTyFromOp(); case VPInstruction::ExtractFromEnd: { Type *BaseTy = inferScalarType(R->getOperand(0)); @@ -79,6 +85,9 @@ Type *VPTypeAnalysis::inferScalarTypeForRecipe(const VPInstruction *R) { return BaseTy; } case VPInstruction::LogicalAnd: + assert(inferScalarType(R->getOperand(0))->isIntegerTy(1) && + inferScalarType(R->getOperand(1))->isIntegerTy(1) && + "LogicalAnd operands should be bool"); return IntegerType::get(Ctx, 1); case 
VPInstruction::PtrAdd: // Return the type based on the pointer argument (i.e. first operand). diff --git a/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp b/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp index 979a8e0768a99..aa5f92b235555 100644 --- a/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp +++ b/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp @@ -991,10 +991,9 @@ void VPWidenCallRecipe::execute(VPTransformState &State) { InstructionCost VPWidenCallRecipe::computeCost(ElementCount VF, VPCostContext &Ctx) const { - TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput; return Ctx.TTI.getCallInstrCost(nullptr, Variant->getReturnType(), Variant->getFunctionType()->params(), - CostKind); + Ctx.CostKind); } #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) @@ -1072,8 +1071,6 @@ void VPWidenIntrinsicRecipe::execute(VPTransformState &State) { InstructionCost VPWidenIntrinsicRecipe::computeCost(ElementCount VF, VPCostContext &Ctx) const { - TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput; - // Some backends analyze intrinsic arguments to determine cost. Use the // underlying value for the operand if it has one. Otherwise try to use the // operand of the underlying call instruction, if there is one. Otherwise @@ -1113,7 +1110,7 @@ InstructionCost VPWidenIntrinsicRecipe::computeCost(ElementCount VF, IntrinsicCostAttributes CostAttrs( VectorIntrinsicID, RetTy, Arguments, ParamTys, FMF, dyn_cast_or_null(getUnderlyingValue())); - return Ctx.TTI.getIntrinsicInstrCost(CostAttrs, CostKind); + return Ctx.TTI.getIntrinsicInstrCost(CostAttrs, Ctx.CostKind); } StringRef VPWidenIntrinsicRecipe::getIntrinsicName() const { @@ -1196,7 +1193,7 @@ InstructionCost VPHistogramRecipe::computeCost(ElementCount VF, // Assume that a non-constant update value (or a constant != 1) requires // a multiply, and add that into the cost. 
InstructionCost MulCost = - Ctx.TTI.getArithmeticInstrCost(Instruction::Mul, VTy); + Ctx.TTI.getArithmeticInstrCost(Instruction::Mul, VTy, Ctx.CostKind); if (IncAmt->isLiveIn()) { ConstantInt *CI = dyn_cast(IncAmt->getLiveInIRValue()); @@ -1212,9 +1209,8 @@ InstructionCost VPHistogramRecipe::computeCost(ElementCount VF, {PtrTy, IncTy, MaskTy}); // Add the costs together with the add/sub operation. - return Ctx.TTI.getIntrinsicInstrCost( - ICA, TargetTransformInfo::TCK_RecipThroughput) + - MulCost + Ctx.TTI.getArithmeticInstrCost(Opcode, VTy); + return Ctx.TTI.getIntrinsicInstrCost(ICA, Ctx.CostKind) + MulCost + + Ctx.TTI.getArithmeticInstrCost(Opcode, VTy, Ctx.CostKind); } #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) @@ -1278,7 +1274,6 @@ InstructionCost VPWidenSelectRecipe::computeCost(ElementCount VF, bool ScalarCond = getOperand(0)->isDefinedOutsideLoopRegions(); Type *ScalarTy = Ctx.Types.inferScalarType(this); Type *VectorTy = toVectorTy(Ctx.Types.inferScalarType(this), VF); - TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput; VPValue *Op0, *Op1; using namespace llvm::VPlanPatternMatch; @@ -1296,8 +1291,8 @@ InstructionCost VPWidenSelectRecipe::computeCost(ElementCount VF, Operands.append(SI->op_begin(), SI->op_end()); bool IsLogicalOr = match(this, m_LogicalOr(m_VPValue(Op0), m_VPValue(Op1))); return Ctx.TTI.getArithmeticInstrCost( - IsLogicalOr ? Instruction::Or : Instruction::And, VectorTy, CostKind, - {Op1VK, Op1VP}, {Op2VK, Op2VP}, Operands, SI); + IsLogicalOr ? 
Instruction::Or : Instruction::And, VectorTy, + Ctx.CostKind, {Op1VK, Op1VP}, {Op2VK, Op2VP}, Operands, SI); } Type *CondTy = Ctx.Types.inferScalarType(getOperand(0)); @@ -1307,9 +1302,9 @@ InstructionCost VPWidenSelectRecipe::computeCost(ElementCount VF, CmpInst::Predicate Pred = CmpInst::BAD_ICMP_PREDICATE; if (auto *Cmp = dyn_cast(SI->getCondition())) Pred = Cmp->getPredicate(); - return Ctx.TTI.getCmpSelInstrCost(Instruction::Select, VectorTy, CondTy, Pred, - CostKind, {TTI::OK_AnyValue, TTI::OP_None}, - {TTI::OK_AnyValue, TTI::OP_None}, SI); + return Ctx.TTI.getCmpSelInstrCost( + Instruction::Select, VectorTy, CondTy, Pred, Ctx.CostKind, + {TTI::OK_AnyValue, TTI::OP_None}, {TTI::OK_AnyValue, TTI::OP_None}, SI); } VPRecipeWithIRFlags::FastMathFlagsTy::FastMathFlagsTy( @@ -1454,12 +1449,11 @@ void VPWidenRecipe::execute(VPTransformState &State) { InstructionCost VPWidenRecipe::computeCost(ElementCount VF, VPCostContext &Ctx) const { - TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput; switch (Opcode) { case Instruction::FNeg: { Type *VectorTy = toVectorTy(Ctx.Types.inferScalarType(this), VF); return Ctx.TTI.getArithmeticInstrCost( - Opcode, VectorTy, CostKind, + Opcode, VectorTy, Ctx.CostKind, {TargetTransformInfo::OK_AnyValue, TargetTransformInfo::OP_None}, {TargetTransformInfo::OK_AnyValue, TargetTransformInfo::OP_None}); } @@ -1502,21 +1496,22 @@ InstructionCost VPWidenRecipe::computeCost(ElementCount VF, if (CtxI) Operands.append(CtxI->value_op_begin(), CtxI->value_op_end()); return Ctx.TTI.getArithmeticInstrCost( - Opcode, VectorTy, CostKind, + Opcode, VectorTy, Ctx.CostKind, {TargetTransformInfo::OK_AnyValue, TargetTransformInfo::OP_None}, RHSInfo, Operands, CtxI, &Ctx.TLI); } case Instruction::Freeze: { // This opcode is unknown. Assume that it is the same as 'mul'. 
Type *VectorTy = toVectorTy(Ctx.Types.inferScalarType(this), VF); - return Ctx.TTI.getArithmeticInstrCost(Instruction::Mul, VectorTy, CostKind); + return Ctx.TTI.getArithmeticInstrCost(Instruction::Mul, VectorTy, + Ctx.CostKind); } case Instruction::ICmp: case Instruction::FCmp: { Instruction *CtxI = dyn_cast_or_null(getUnderlyingValue()); Type *VectorTy = toVectorTy(Ctx.Types.inferScalarType(getOperand(0)), VF); return Ctx.TTI.getCmpSelInstrCost(Opcode, VectorTy, nullptr, getPredicate(), - CostKind, + Ctx.CostKind, {TTI::OK_AnyValue, TTI::OP_None}, {TTI::OK_AnyValue, TTI::OP_None}, CtxI); } @@ -1646,7 +1641,7 @@ InstructionCost VPWidenCastRecipe::computeCost(ElementCount VF, auto *DestTy = cast(toVectorTy(getResultType(), VF)); // Arm TTI will use the underlying instruction to determine the cost. return Ctx.TTI.getCastInstrCost( - Opcode, DestTy, SrcTy, CCH, TTI::TCK_RecipThroughput, + Opcode, DestTy, SrcTy, CCH, Ctx.CostKind, dyn_cast_if_present(getUnderlyingValue())); } @@ -1664,7 +1659,7 @@ void VPWidenCastRecipe::print(raw_ostream &O, const Twine &Indent, InstructionCost VPHeaderPHIRecipe::computeCost(ElementCount VF, VPCostContext &Ctx) const { - return Ctx.TTI.getCFInstrCost(Instruction::PHI, TTI::TCK_RecipThroughput); + return Ctx.TTI.getCFInstrCost(Instruction::PHI, Ctx.CostKind); } /// This function adds @@ -2143,18 +2138,16 @@ void VPBlendRecipe::execute(VPTransformState &State) { InstructionCost VPBlendRecipe::computeCost(ElementCount VF, VPCostContext &Ctx) const { - TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput; - // Handle cases where only the first lane is used the same way as the legacy // cost model. 
if (vputils::onlyFirstLaneUsed(this)) - return Ctx.TTI.getCFInstrCost(Instruction::PHI, CostKind); + return Ctx.TTI.getCFInstrCost(Instruction::PHI, Ctx.CostKind); Type *ResultTy = toVectorTy(Ctx.Types.inferScalarType(this), VF); Type *CmpTy = toVectorTy(Type::getInt1Ty(Ctx.Types.getContext()), VF); return (getNumIncomingValues() - 1) * Ctx.TTI.getCmpSelInstrCost(Instruction::Select, ResultTy, CmpTy, - CmpInst::BAD_ICMP_PREDICATE, CostKind); + CmpInst::BAD_ICMP_PREDICATE, Ctx.CostKind); } #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) @@ -2274,7 +2267,6 @@ InstructionCost VPReductionRecipe::computeCost(ElementCount VF, RecurKind RdxKind = RdxDesc.getRecurrenceKind(); Type *ElementTy = Ctx.Types.inferScalarType(this); auto *VectorTy = cast(toVectorTy(ElementTy, VF)); - TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput; unsigned Opcode = RdxDesc.getOpcode(); // TODO: Support any-of and in-loop reductions. @@ -2292,15 +2284,15 @@ InstructionCost VPReductionRecipe::computeCost(ElementCount VF, // Cost = Reduction cost + BinOp cost InstructionCost Cost = - Ctx.TTI.getArithmeticInstrCost(Opcode, ElementTy, CostKind); + Ctx.TTI.getArithmeticInstrCost(Opcode, ElementTy, Ctx.CostKind); if (RecurrenceDescriptor::isMinMaxRecurrenceKind(RdxKind)) { Intrinsic::ID Id = getMinMaxReductionIntrinsicOp(RdxKind); return Cost + Ctx.TTI.getMinMaxReductionCost( - Id, VectorTy, RdxDesc.getFastMathFlags(), CostKind); + Id, VectorTy, RdxDesc.getFastMathFlags(), Ctx.CostKind); } return Cost + Ctx.TTI.getArithmeticReductionCost( - Opcode, VectorTy, RdxDesc.getFastMathFlags(), CostKind); + Opcode, VectorTy, RdxDesc.getFastMathFlags(), Ctx.CostKind); } #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) @@ -2531,7 +2523,6 @@ InstructionCost VPWidenMemoryRecipe::computeCost(ElementCount VF, getLoadStoreAlignment(const_cast(&Ingredient)); unsigned AS = getLoadStoreAddressSpace(const_cast(&Ingredient)); - TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput; if (!Consecutive) { // TODO: 
Using the original IR may not be accurate. @@ -2542,25 +2533,26 @@ InstructionCost VPWidenMemoryRecipe::computeCost(ElementCount VF, "Inconsecutive memory access should not have the order."); return Ctx.TTI.getAddressComputationCost(Ty) + Ctx.TTI.getGatherScatterOpCost(Ingredient.getOpcode(), Ty, Ptr, - IsMasked, Alignment, CostKind, + IsMasked, Alignment, Ctx.CostKind, &Ingredient); } InstructionCost Cost = 0; if (IsMasked) { Cost += Ctx.TTI.getMaskedMemoryOpCost(Ingredient.getOpcode(), Ty, Alignment, - AS, CostKind); + AS, Ctx.CostKind); } else { TTI::OperandValueInfo OpInfo = Ctx.TTI.getOperandInfo(Ingredient.getOperand(0)); Cost += Ctx.TTI.getMemoryOpCost(Ingredient.getOpcode(), Ty, Alignment, AS, - CostKind, OpInfo, &Ingredient); + Ctx.CostKind, OpInfo, &Ingredient); } if (!Reverse) return Cost; - return Cost += Ctx.TTI.getShuffleCost(TargetTransformInfo::SK_Reverse, - cast(Ty), {}, CostKind, 0); + return Cost += + Ctx.TTI.getShuffleCost(TargetTransformInfo::SK_Reverse, + cast(Ty), {}, Ctx.CostKind, 0); } void VPWidenLoadRecipe::execute(VPTransformState &State) { @@ -2678,14 +2670,14 @@ InstructionCost VPWidenLoadEVLRecipe::computeCost(ElementCount VF, getLoadStoreAlignment(const_cast(&Ingredient)); unsigned AS = getLoadStoreAddressSpace(const_cast(&Ingredient)); - TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput; InstructionCost Cost = Ctx.TTI.getMaskedMemoryOpCost( - Ingredient.getOpcode(), Ty, Alignment, AS, CostKind); + Ingredient.getOpcode(), Ty, Alignment, AS, Ctx.CostKind); if (!Reverse) return Cost; return Cost + Ctx.TTI.getShuffleCost(TargetTransformInfo::SK_Reverse, - cast(Ty), {}, CostKind, 0); + cast(Ty), {}, Ctx.CostKind, + 0); } #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) @@ -2799,14 +2791,14 @@ InstructionCost VPWidenStoreEVLRecipe::computeCost(ElementCount VF, getLoadStoreAlignment(const_cast(&Ingredient)); unsigned AS = getLoadStoreAddressSpace(const_cast(&Ingredient)); - TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput; 
InstructionCost Cost = Ctx.TTI.getMaskedMemoryOpCost( - Ingredient.getOpcode(), Ty, Alignment, AS, CostKind); + Ingredient.getOpcode(), Ty, Alignment, AS, Ctx.CostKind); if (!Reverse) return Cost; return Cost + Ctx.TTI.getShuffleCost(TargetTransformInfo::SK_Reverse, - cast(Ty), {}, CostKind, 0); + cast(Ty), {}, Ctx.CostKind, + 0); } #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) @@ -2863,10 +2855,21 @@ static Value *interleaveVectors(IRBuilderBase &Builder, ArrayRef Vals, // Scalable vectors cannot use arbitrary shufflevectors (only splats), so // must use intrinsics to interleave. if (VecTy->isScalableTy()) { - VectorType *WideVecTy = VectorType::getDoubleElementsVectorType(VecTy); - return Builder.CreateIntrinsic(WideVecTy, Intrinsic::vector_interleave2, - Vals, - /*FMFSource=*/nullptr, Name); + assert(isPowerOf2_32(Factor) && "Unsupported interleave factor for " + "scalable vectors, must be power of 2"); + SmallVector InterleavingValues(Vals); + // When interleaving, the number of values will be shrunk until we have the + // single final interleaved value. + auto *InterleaveTy = cast(InterleavingValues[0]->getType()); + for (unsigned Midpoint = Factor / 2; Midpoint > 0; Midpoint /= 2) { + InterleaveTy = VectorType::getDoubleElementsVectorType(InterleaveTy); + for (unsigned I = 0; I < Midpoint; ++I) + InterleavingValues[I] = Builder.CreateIntrinsic( + InterleaveTy, Intrinsic::vector_interleave2, + {InterleavingValues[I], InterleavingValues[Midpoint + I]}, + /*FMFSource=*/nullptr, Name); + } + return InterleavingValues[0]; } // Fixed length. Start by concatenating all vectors into a wide vector. 
@@ -2952,15 +2955,11 @@ void VPInterleaveRecipe::execute(VPTransformState &State) { &InterleaveFactor](Value *MaskForGaps) -> Value * { if (State.VF.isScalable()) { assert(!MaskForGaps && "Interleaved groups with gaps are not supported."); - assert(InterleaveFactor == 2 && + assert(isPowerOf2_32(InterleaveFactor) && "Unsupported deinterleave factor for scalable vectors"); auto *ResBlockInMask = State.get(BlockInMask); - SmallVector Ops = {ResBlockInMask, ResBlockInMask}; - auto *MaskTy = VectorType::get(State.Builder.getInt1Ty(), - State.VF.getKnownMinValue() * 2, true); - return State.Builder.CreateIntrinsic( - MaskTy, Intrinsic::vector_interleave2, Ops, - /*FMFSource=*/nullptr, "interleaved.mask"); + SmallVector Ops(InterleaveFactor, ResBlockInMask); + return interleaveVectors(State.Builder, Ops, "interleaved.mask"); } if (!BlockInMask) @@ -3000,22 +2999,48 @@ void VPInterleaveRecipe::execute(VPTransformState &State) { ArrayRef VPDefs = definedValues(); const DataLayout &DL = State.CFG.PrevBB->getDataLayout(); if (VecTy->isScalableTy()) { - assert(InterleaveFactor == 2 && + assert(isPowerOf2_32(InterleaveFactor) && "Unsupported deinterleave factor for scalable vectors"); - // Scalable vectors cannot use arbitrary shufflevectors (only splats), - // so must use intrinsics to deinterleave. - Value *DI = State.Builder.CreateIntrinsic( - Intrinsic::vector_deinterleave2, VecTy, NewLoad, - /*FMFSource=*/nullptr, "strided.vec"); - unsigned J = 0; - for (unsigned I = 0; I < InterleaveFactor; ++I) { - Instruction *Member = Group->getMember(I); + // Scalable vectors cannot use arbitrary shufflevectors (only splats), + // so must use intrinsics to deinterleave. 
+ SmallVector DeinterleavedValues(InterleaveFactor); + DeinterleavedValues[0] = NewLoad; + // For the case of InterleaveFactor > 2, we will have to do recursive + // deinterleaving, because the current available deinterleave intrinsic + // supports only Factor of 2, otherwise it will bailout after first + // iteration. + // When deinterleaving, the number of values will double until we + // have "InterleaveFactor". + for (unsigned NumVectors = 1; NumVectors < InterleaveFactor; + NumVectors *= 2) { + // Deinterleave the elements within the vector + SmallVector TempDeinterleavedValues(NumVectors); + for (unsigned I = 0; I < NumVectors; ++I) { + auto *DiTy = DeinterleavedValues[I]->getType(); + TempDeinterleavedValues[I] = State.Builder.CreateIntrinsic( + Intrinsic::vector_deinterleave2, DiTy, DeinterleavedValues[I], + /*FMFSource=*/nullptr, "strided.vec"); + } + // Extract the deinterleaved values: + for (unsigned I = 0; I < 2; ++I) + for (unsigned J = 0; J < NumVectors; ++J) + DeinterleavedValues[NumVectors * I + J] = + State.Builder.CreateExtractValue(TempDeinterleavedValues[J], I); + } - if (!Member) +#ifndef NDEBUG + for (Value *Val : DeinterleavedValues) + assert(Val && "NULL Deinterleaved Value"); +#endif + for (unsigned I = 0, J = 0; I < InterleaveFactor; ++I) { + Instruction *Member = Group->getMember(I); + Value *StridedVec = DeinterleavedValues[I]; + if (!Member) { + // This value is not needed as it's not used + cast(StridedVec)->eraseFromParent(); continue; - - Value *StridedVec = State.Builder.CreateExtractValue(DI, I); + } // If this member has different type, cast the result type. 
if (Member->getType() != ScalarTy) { VectorType *OtherVTy = VectorType::get(Member->getType(), State.VF); @@ -3164,7 +3189,6 @@ InstructionCost VPInterleaveRecipe::computeCost(ElementCount VF, : getStoredValues()[InsertPosIdx]); auto *VectorTy = cast(toVectorTy(ValTy, VF)); unsigned AS = getLoadStoreAddressSpace(InsertPos); - enum TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput; unsigned InterleaveFactor = IG->getFactor(); auto *WideVecTy = VectorType::get(ValTy, VF * InterleaveFactor); @@ -3178,14 +3202,15 @@ InstructionCost VPInterleaveRecipe::computeCost(ElementCount VF, // Calculate the cost of the whole interleaved group. InstructionCost Cost = Ctx.TTI.getInterleavedMemoryOpCost( InsertPos->getOpcode(), WideVecTy, IG->getFactor(), Indices, - IG->getAlign(), AS, CostKind, getMask(), NeedsMaskForGaps); + IG->getAlign(), AS, Ctx.CostKind, getMask(), NeedsMaskForGaps); if (!IG->isReverse()) return Cost; return Cost + IG->getNumMembers() * Ctx.TTI.getShuffleCost(TargetTransformInfo::SK_Reverse, - VectorTy, std::nullopt, CostKind, 0); + VectorTy, std::nullopt, Ctx.CostKind, + 0); } #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) @@ -3395,9 +3420,8 @@ void VPFirstOrderRecurrencePHIRecipe::execute(VPTransformState &State) { InstructionCost VPFirstOrderRecurrencePHIRecipe::computeCost(ElementCount VF, VPCostContext &Ctx) const { - TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput; if (VF.isScalar()) - return Ctx.TTI.getCFInstrCost(Instruction::PHI, CostKind); + return Ctx.TTI.getCFInstrCost(Instruction::PHI, Ctx.CostKind); if (VF.isScalable() && VF.getKnownMinValue() == 1) return InstructionCost::getInvalid(); @@ -3408,7 +3432,7 @@ VPFirstOrderRecurrencePHIRecipe::computeCost(ElementCount VF, toVectorTy(Ctx.Types.inferScalarType(this->getVPSingleValue()), VF); return Ctx.TTI.getShuffleCost(TargetTransformInfo::SK_Splice, - cast(VectorTy), Mask, CostKind, + cast(VectorTy), Mask, Ctx.CostKind, VF.getKnownMinValue() - 1); } diff --git 
a/llvm/lib/Transforms/Vectorize/VPlanValue.h b/llvm/lib/Transforms/Vectorize/VPlanValue.h index 7aaf4002b8b3e..23e39ce89a3a4 100644 --- a/llvm/lib/Transforms/Vectorize/VPlanValue.h +++ b/llvm/lib/Transforms/Vectorize/VPlanValue.h @@ -33,9 +33,11 @@ namespace llvm { class raw_ostream; class Value; class VPDef; +struct VPDoubleValueDef; class VPSlotTracker; class VPUser; class VPRecipeBase; +class VPInterleaveRecipe; // This is the base class of the VPlan Def/Use graph, used for modeling the data // flow into, within and out of the VPlan. VPValues can stand for live-ins @@ -44,12 +46,15 @@ class VPRecipeBase; class VPValue { friend class VPBuilder; friend class VPDef; + friend struct VPDoubleValueDef; friend class VPInstruction; + friend class VPInterleaveRecipe; friend struct VPlanTransforms; friend class VPBasicBlock; friend class VPInterleavedAccessInfo; friend class VPSlotTracker; friend class VPRecipeBase; + friend class VPlan; const unsigned char SubclassID; ///< Subclass identifier (for isa/dyn_cast). @@ -65,6 +70,13 @@ class VPValue { VPValue(const unsigned char SC, Value *UV = nullptr, VPDef *Def = nullptr); + /// Create a live-in VPValue. + VPValue(Value *UV = nullptr) : VPValue(VPValueSC, UV, nullptr) {} + /// Create a VPValue for a \p Def which is a subclass of VPValue. + VPValue(VPDef *Def, Value *UV = nullptr) : VPValue(VPVRecipeSC, UV, Def) {} + /// Create a VPValue for a \p Def which defines multiple values. + VPValue(Value *UV, VPDef *Def) : VPValue(VPValueSC, UV, Def) {} + // DESIGN PRINCIPLE: Access to the underlying IR must be strictly limited to // the front-end and back-end of VPlan so that the middle-end is as // independent as possible of the underlying IR. We grant access to the @@ -84,12 +96,6 @@ class VPValue { VPVRecipeSC /// A VPValue sub-class that is a VPRecipeBase. }; - /// Create a live-in VPValue. - VPValue(Value *UV = nullptr) : VPValue(VPValueSC, UV, nullptr) {} - /// Create a VPValue for a \p Def which is a subclass of VPValue. 
- VPValue(VPDef *Def, Value *UV = nullptr) : VPValue(VPVRecipeSC, UV, Def) {} - /// Create a VPValue for a \p Def which defines multiple values. - VPValue(Value *UV, VPDef *Def) : VPValue(VPValueSC, UV, Def) {} VPValue(const VPValue &) = delete; VPValue &operator=(const VPValue &) = delete; diff --git a/llvm/lib/Transforms/Vectorize/VPlanVerifier.cpp b/llvm/lib/Transforms/Vectorize/VPlanVerifier.cpp index be420a873bef5..a30bc82cbde85 100644 --- a/llvm/lib/Transforms/Vectorize/VPlanVerifier.cpp +++ b/llvm/lib/Transforms/Vectorize/VPlanVerifier.cpp @@ -26,6 +26,7 @@ using namespace llvm; namespace { class VPlanVerifier { const VPDominatorTree &VPDT; + VPTypeAnalysis &TypeInfo; SmallPtrSet WrappedIRBBs; @@ -58,7 +59,8 @@ class VPlanVerifier { bool verifyRegionRec(const VPRegionBlock *Region); public: - VPlanVerifier(VPDominatorTree &VPDT) : VPDT(VPDT) {} + VPlanVerifier(VPDominatorTree &VPDT, VPTypeAnalysis &TypeInfo) + : VPDT(VPDT), TypeInfo(TypeInfo) {} bool verify(const VPlan &Plan); }; @@ -195,6 +197,14 @@ bool VPlanVerifier::verifyVPBasicBlock(const VPBasicBlock *VPBB) { return false; } for (const VPValue *V : R.definedValues()) { + // Verify that we can infer a scalar type for each defined value. With + // assertions enabled, inferScalarType will perform some consistency + // checks during type inference. + if (!TypeInfo.inferScalarType(V)) { + errs() << "Failed to infer scalar type!\n"; + return false; + } + for (const VPUser *U : V->users()) { auto *UI = dyn_cast(U); // TODO: check dominance of incoming values for phis properly. 
@@ -406,6 +416,8 @@ bool VPlanVerifier::verify(const VPlan &Plan) { bool llvm::verifyVPlanIsValid(const VPlan &Plan) { VPDominatorTree VPDT; VPDT.recalculate(const_cast(Plan)); - VPlanVerifier Verifier(VPDT); + VPTypeAnalysis TypeInfo( + const_cast(Plan).getCanonicalIV()->getScalarType()); + VPlanVerifier Verifier(VPDT, TypeInfo); return Verifier.verify(Plan); } diff --git a/llvm/test/Analysis/BasicAA/call-attrs.ll b/llvm/test/Analysis/BasicAA/call-attrs.ll index c42c908310746..f6e92dd34ff7f 100644 --- a/llvm/test/Analysis/BasicAA/call-attrs.ll +++ b/llvm/test/Analysis/BasicAA/call-attrs.ll @@ -3,6 +3,7 @@ declare void @readonly_attr(ptr readonly nocapture) declare void @writeonly_attr(ptr writeonly nocapture) declare void @readnone_attr(ptr readnone nocapture) +declare void @byval_attr(ptr byval(i32)) declare void @readonly_func(ptr nocapture) readonly declare void @writeonly_func(ptr nocapture) writeonly @@ -24,6 +25,8 @@ entry: call void @readnone_attr(ptr %p) call void @readnone_func(ptr %p) + call void @byval_attr(ptr %p) + call void @read_write(ptr %p, ptr %p, ptr %p) call void @func() ["deopt" (ptr %p)] @@ -38,6 +41,7 @@ entry: ; CHECK: Just Mod: Ptr: i8* %p <-> call void @writeonly_func(ptr %p) ; CHECK: NoModRef: Ptr: i8* %p <-> call void @readnone_attr(ptr %p) ; CHECK: NoModRef: Ptr: i8* %p <-> call void @readnone_func(ptr %p) +; CHECK: Just Ref: Ptr: i8* %p <-> call void @byval_attr(ptr %p) ; CHECK: Both ModRef: Ptr: i8* %p <-> call void @read_write(ptr %p, ptr %p, ptr %p) ; CHECK: Just Ref: Ptr: i8* %p <-> call void @func() [ "deopt"(ptr %p) ] ; CHECK: Both ModRef: Ptr: i8* %p <-> call void @writeonly_attr(ptr %p) [ "deopt"(ptr %p) ] diff --git a/llvm/test/Analysis/BasicAA/dereferenceable.ll b/llvm/test/Analysis/BasicAA/dereferenceable.ll index 98bd5e3d5aa6a..8df2e4c6bda3a 100644 --- a/llvm/test/Analysis/BasicAA/dereferenceable.ll +++ b/llvm/test/Analysis/BasicAA/dereferenceable.ll @@ -1,5 +1,5 @@ ; RUN: opt -aa-pipeline=basic-aa 
-print-all-alias-modref-info -passes=aa-eval < %s 2>&1 | FileCheck %s -; RUN: opt -aa-pipeline=basic-aa -print-all-alias-modref-info -passes=aa-eval -use-dereferenceable-at-point-semantics=1 < %s 2>&1 | FileCheck %s +; RUN: opt -aa-pipeline=basic-aa -print-all-alias-modref-info -passes=aa-eval -use-dereferenceable-at-point-semantics < %s 2>&1 | FileCheck %s target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128" diff --git a/llvm/test/Analysis/BasicAA/tail-byval.ll b/llvm/test/Analysis/BasicAA/tail-byval.ll index 5c4c563a9a5a8..06c77f4454055 100644 --- a/llvm/test/Analysis/BasicAA/tail-byval.ll +++ b/llvm/test/Analysis/BasicAA/tail-byval.ll @@ -12,4 +12,4 @@ entry: } ; FIXME: This should be Just Ref. ; CHECK-LABEL: Function: tailbyval: 1 pointers, 1 call sites -; CHECK-NEXT: Both ModRef: Ptr: i32* %p <-> tail call void @takebyval(ptr byval(i32) %p) +; CHECK-NEXT: Just Ref: Ptr: i32* %p <-> tail call void @takebyval(ptr byval(i32) %p) diff --git a/llvm/test/Analysis/CtxProfAnalysis/full-cycle.ll b/llvm/test/Analysis/CtxProfAnalysis/full-cycle.ll index 5a8a2f4cad84b..49d34e71c5d08 100644 --- a/llvm/test/Analysis/CtxProfAnalysis/full-cycle.ll +++ b/llvm/test/Analysis/CtxProfAnalysis/full-cycle.ll @@ -88,54 +88,20 @@ Function Info: 10507721908651011566 : entrypoint. MaxCounterID: 1. 
MaxCallsiteID: 2 Current Profile: -[ - { - "Callsites": [ - [ - { - "Callsites": [ - [ - { - "Counters": [ - 10, - 7 - ], - "Guid": 3087265239403591524 - } - ] - ], - "Counters": [ - 7 - ], - "Guid": 2072045998141807037 - } - ], - [ - { - "Callsites": [ - [ - { - "Counters": [ - 1, - 2 - ], - "Guid": 3087265239403591524 - } - ] - ], - "Counters": [ - 2 - ], - "Guid": 4197650231481825559 - } - ] - ], - "Counters": [ - 1 - ], - "Guid": 10507721908651011566 - } -] + +- Guid: 10507721908651011566 + Counters: [ 1 ] + Callsites: + - - Guid: 2072045998141807037 + Counters: [ 7 ] + Callsites: + - - Guid: 3087265239403591524 + Counters: [ 10, 7 ] + - - Guid: 4197650231481825559 + Counters: [ 2 ] + Callsites: + - - Guid: 3087265239403591524 + Counters: [ 1, 2 ] Flat Profile: 2072045998141807037 : 7 diff --git a/llvm/test/Analysis/CtxProfAnalysis/inline.ll b/llvm/test/Analysis/CtxProfAnalysis/inline.ll index 6c1e199c2ba1c..2b774ebfab5d0 100644 --- a/llvm/test/Analysis/CtxProfAnalysis/inline.ll +++ b/llvm/test/Analysis/CtxProfAnalysis/inline.ll @@ -1,11 +1,12 @@ +; REQUIRES: x86_64-linux ; RUN: rm -rf %t ; RUN: split-file %s %t ; RUN: llvm-ctxprof-util fromYAML --input=%t/profile.yaml --output=%t/profile.ctxprofdata ; RUN: opt -passes='module-inline,print' -ctx-profile-printer-level=everything %t/module.ll -S \ -; RUN: -use-ctx-profile=%t/profile.ctxprofdata -ctx-profile-printer-level=json \ -; RUN: -o - 2> %t/profile-final.txt | FileCheck %s -; RUN: %python %S/json_equals.py %t/profile-final.txt %t/expected.json +; RUN: -use-ctx-profile=%t/profile.ctxprofdata -ctx-profile-printer-level=yaml \ +; RUN: -o - 2> %t/profile-final.yaml | FileCheck %s +; RUN: diff %t/profile-final.yaml %t/expected.yaml ; There are 2 calls to @a from @entrypoint. We only inline the one callsite ; marked as alwaysinline, the rest are blocked (marked noinline). 
After the inline, @@ -109,17 +110,16 @@ define i32 @b() !guid !2 { Callsites: - - Guid: 1002 Counters: [500] -;--- expected.json -[ - { "Guid": 1000, - "Counters": [10, 2, 8, 100], - "Callsites": [ - [], - [ { "Guid": 1001, - "Counters": [8, 500], - "Callsites": [[{"Guid": 1002, "Counters": [500]}]]} - ], - [{ "Guid": 1002, "Counters": [100]}] - ] - } -] +;--- expected.yaml + +- Guid: 1000 + Counters: [ 10, 2, 8, 100 ] + Callsites: + - [ ] + - - Guid: 1001 + Counters: [ 8, 500 ] + Callsites: + - - Guid: 1002 + Counters: [ 500 ] + - - Guid: 1002 + Counters: [ 100 ] diff --git a/llvm/test/Analysis/CtxProfAnalysis/json_equals.py b/llvm/test/Analysis/CtxProfAnalysis/json_equals.py deleted file mode 100644 index 8b94dda5528c5..0000000000000 --- a/llvm/test/Analysis/CtxProfAnalysis/json_equals.py +++ /dev/null @@ -1,15 +0,0 @@ -import json -import sys - - -def to_json(fname: str): - with open(fname) as f: - return json.load(f) - - -a = to_json(sys.argv[1]) -b = to_json(sys.argv[2]) - -if a == b: - exit(0) -exit(1) diff --git a/llvm/test/Analysis/CtxProfAnalysis/load.ll b/llvm/test/Analysis/CtxProfAnalysis/load.ll index 62c6344ed3fec..2618903bd62a8 100644 --- a/llvm/test/Analysis/CtxProfAnalysis/load.ll +++ b/llvm/test/Analysis/CtxProfAnalysis/load.ll @@ -40,31 +40,14 @@ Function Info: 12074870348631550642 : another_entrypoint_no_callees. MaxCounterID: 1. 
MaxCallsiteID: 0 Current Profile: -[ - { - "Callsites": [ - [ - { - "Counters": [ - 6, - 7 - ], - "Guid": 728453322856651412 - } - ] - ], - "Counters": [ - 1 - ], - "Guid": 11872291593386833696 - }, - { - "Counters": [ - 5 - ], - "Guid": 12074870348631550642 - } -] + +- Guid: 11872291593386833696 + Counters: [ 1 ] + Callsites: + - - Guid: 728453322856651412 + Counters: [ 6, 7 ] +- Guid: 12074870348631550642 + Counters: [ 5 ] Flat Profile: 728453322856651412 : 6 7 diff --git a/llvm/test/Analysis/ScalarEvolution/implied-via-addition.ll b/llvm/test/Analysis/ScalarEvolution/implied-via-addition.ll new file mode 100644 index 0000000000000..7ab6221d0da53 --- /dev/null +++ b/llvm/test/Analysis/ScalarEvolution/implied-via-addition.ll @@ -0,0 +1,111 @@ +; NOTE: Assertions have been autogenerated by utils/update_analyze_test_checks.py UTC_ARGS: --version 5 +; RUN: opt < %s -disable-output -passes="print" \ +; RUN: -scalar-evolution-classify-expressions=0 2>&1 | FileCheck %s + +define void @implied1(i32 %n) { +; Prove that (n s> 1) ===> (n - 1 s> 0). +; CHECK-LABEL: 'implied1' +; CHECK-NEXT: Determining loop execution counts for: @implied1 +; CHECK-NEXT: Loop %header: backedge-taken count is (-2 + %n) +; CHECK-NEXT: Loop %header: constant max backedge-taken count is i32 2147483645 +; CHECK-NEXT: Loop %header: symbolic max backedge-taken count is (-2 + %n) +; CHECK-NEXT: Loop %header: Trip multiple is 1 +; +entry: + %cmp1 = icmp sgt i32 %n, 1 + %n.minus.1 = sub nsw i32 %n, 1 + call void @llvm.assume(i1 %cmp1) + br label %header + +header: + %indvar = phi i32 [ %indvar.next, %header ], [ 0, %entry ] + %indvar.next = add i32 %indvar, 1 + %exitcond = icmp sgt i32 %n.minus.1, %indvar.next + br i1 %exitcond, label %header, label %exit + +exit: + ret void +} + +define void @implied1_neg(i32 %n) { +; Prove that (n s> 0) =\=> (n - 1 s> 0). 
+; CHECK-LABEL: 'implied1_neg' +; CHECK-NEXT: Determining loop execution counts for: @implied1_neg +; CHECK-NEXT: Loop %header: backedge-taken count is (-1 + (1 smax (-1 + %n))) +; CHECK-NEXT: Loop %header: constant max backedge-taken count is i32 2147483645 +; CHECK-NEXT: Loop %header: symbolic max backedge-taken count is (-1 + (1 smax (-1 + %n))) +; CHECK-NEXT: Loop %header: Trip multiple is 1 +; +entry: + %cmp1 = icmp sgt i32 %n, 0 + %n.minus.1 = sub nsw i32 %n, 1 + call void @llvm.assume(i1 %cmp1) + br label %header + +header: + %indvar = phi i32 [ %indvar.next, %header ], [ 0, %entry ] + %indvar.next = add i32 %indvar, 1 + %exitcond = icmp sgt i32 %n.minus.1, %indvar.next + br i1 %exitcond, label %header, label %exit + +exit: + ret void +} + +define void @implied2(i32 %n) { +; Prove that (n u>= -1) ===> (n + 1 u>= 0). +; CHECK-LABEL: 'implied2' +; CHECK-NEXT: Determining loop execution counts for: @implied2 +; CHECK-NEXT: Loop %header: Unpredictable backedge-taken count. +; CHECK-NEXT: Loop %header: Unpredictable constant max backedge-taken count. +; CHECK-NEXT: Loop %header: Unpredictable symbolic max backedge-taken count. 
+; CHECK-NEXT: Loop %header: Predicated backedge-taken count is (1 + (zext i32 %n to i64)) +; CHECK-NEXT: Predicates: +; CHECK-NEXT: {1,+,1}<%header> Added Flags: +; CHECK-NEXT: Loop %header: Predicated constant max backedge-taken count is i64 4294967296 +; CHECK-NEXT: Predicates: +; CHECK-NEXT: {1,+,1}<%header> Added Flags: +; CHECK-NEXT: Loop %header: Predicated symbolic max backedge-taken count is (1 + (zext i32 %n to i64)) +; CHECK-NEXT: Predicates: +; CHECK-NEXT: {1,+,1}<%header> Added Flags: +; +entry: + %cmp1 = icmp uge i32 %n, -1 + %n.1 = add nuw i32 %n, 1 + call void @llvm.assume(i1 %cmp1) + br label %header + +header: + %indvar = phi i32 [ %indvar.next, %header ], [ 0, %entry ] + %indvar.next = add i32 %indvar, 1 + %exitcond = icmp uge i32 %n.1, %indvar.next + br i1 %exitcond, label %header, label %exit + +exit: + ret void +} + +define void @implied2_neg(i32 %n) { +; Prove that (n u>= -1) =\=> (n - 1 s>= 0). +; CHECK-LABEL: 'implied2_neg' +; CHECK-NEXT: Determining loop execution counts for: @implied2_neg +; CHECK-NEXT: Loop %header: backedge-taken count is (-1 + (1 smax %n)) +; CHECK-NEXT: Loop %header: constant max backedge-taken count is i32 2147483646 +; CHECK-NEXT: Loop %header: symbolic max backedge-taken count is (-1 + (1 smax %n)) +; CHECK-NEXT: Loop %header: Trip multiple is 1 +; +entry: + %cmp1 = icmp uge i32 %n, -1 + %n.minus.1 = sub nuw nsw i32 %n, 1 + call void @llvm.assume(i1 %cmp1) + br label %header + +header: + %indvar = phi i32 [ %indvar.next, %header ], [ 0, %entry ] + %indvar.next = add i32 %indvar, 1 + %exitcond = icmp sge i32 %n.minus.1, %indvar.next + br i1 %exitcond, label %header, label %exit + +exit: + ret void +} diff --git a/llvm/test/Analysis/ValueTracking/deref-abstract-gc.ll b/llvm/test/Analysis/ValueTracking/deref-abstract-gc.ll index 1efe4a90ea780..4b63c036f5491 100644 --- a/llvm/test/Analysis/ValueTracking/deref-abstract-gc.ll +++ b/llvm/test/Analysis/ValueTracking/deref-abstract-gc.ll @@ -1,4 +1,4 @@ -; RUN: opt 
-passes=print-memderefs -S < %s -disable-output -use-dereferenceable-at-point-semantics=1 2>&1 | FileCheck %s --check-prefixes=CHECK +; RUN: opt -passes=print-memderefs -S < %s -disable-output -use-dereferenceable-at-point-semantics 2>&1 | FileCheck %s --check-prefixes=CHECK target datalayout = "e-i32:32:64" diff --git a/llvm/test/Analysis/ValueTracking/memory-dereferenceable.ll b/llvm/test/Analysis/ValueTracking/memory-dereferenceable.ll index 3da1aaa8a68a8..8c5216e0c45d9 100644 --- a/llvm/test/Analysis/ValueTracking/memory-dereferenceable.ll +++ b/llvm/test/Analysis/ValueTracking/memory-dereferenceable.ll @@ -1,5 +1,5 @@ -; RUN: opt -passes=print-memderefs -S < %s -disable-output -use-dereferenceable-at-point-semantics=0 2>&1 | FileCheck %s --check-prefixes=CHECK,GLOBAL -; RUN: opt -passes=print-memderefs -S < %s -disable-output -use-dereferenceable-at-point-semantics=1 2>&1 | FileCheck %s --check-prefixes=CHECK,POINT +; RUN: opt -passes=print-memderefs -S < %s -disable-output -use-dereferenceable-at-point-semantics=false 2>&1 | FileCheck %s --check-prefixes=CHECK,GLOBAL +; RUN: opt -passes=print-memderefs -S < %s -disable-output -use-dereferenceable-at-point-semantics 2>&1 | FileCheck %s --check-prefixes=CHECK,POINT ; Uses the print-deref (+ analyze to print) pass to run diff --git a/llvm/test/CodeGen/AArch64/GlobalISel/legalize-reduce-add.mir b/llvm/test/CodeGen/AArch64/GlobalISel/legalize-reduce-add.mir index 253e6ebe793ce..76fdfd0c301f6 100644 --- a/llvm/test/CodeGen/AArch64/GlobalISel/legalize-reduce-add.mir +++ b/llvm/test/CodeGen/AArch64/GlobalISel/legalize-reduce-add.mir @@ -6,15 +6,15 @@ tracksRegLiveness: true body: | bb.1: liveins: $x0 - ; CHECK-LABEL: name: add_v16s8 ; CHECK: liveins: $x0 - ; CHECK: [[COPY:%[0-9]+]]:_(p0) = COPY $x0 - ; CHECK: [[LOAD:%[0-9]+]]:_(<16 x s8>) = G_LOAD [[COPY]](p0) :: (load (<16 x s8>)) - ; CHECK: [[VECREDUCE_ADD:%[0-9]+]]:_(s8) = G_VECREDUCE_ADD [[LOAD]](<16 x s8>) - ; CHECK: [[ANYEXT:%[0-9]+]]:_(s32) = G_ANYEXT 
[[VECREDUCE_ADD]](s8) - ; CHECK: $w0 = COPY [[ANYEXT]](s32) - ; CHECK: RET_ReallyLR implicit $w0 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(p0) = COPY $x0 + ; CHECK-NEXT: [[LOAD:%[0-9]+]]:_(<16 x s8>) = G_LOAD [[COPY]](p0) :: (load (<16 x s8>)) + ; CHECK-NEXT: [[VECREDUCE_ADD:%[0-9]+]]:_(s8) = G_VECREDUCE_ADD [[LOAD]](<16 x s8>) + ; CHECK-NEXT: [[ANYEXT:%[0-9]+]]:_(s32) = G_ANYEXT [[VECREDUCE_ADD]](s8) + ; CHECK-NEXT: $w0 = COPY [[ANYEXT]](s32) + ; CHECK-NEXT: RET_ReallyLR implicit $w0 %0:_(p0) = COPY $x0 %1:_(<16 x s8>) = G_LOAD %0(p0) :: (load (<16 x s8>)) %2:_(s8) = G_VECREDUCE_ADD %1(<16 x s8>) @@ -29,15 +29,15 @@ tracksRegLiveness: true body: | bb.1: liveins: $x0 - ; CHECK-LABEL: name: add_v8s16 ; CHECK: liveins: $x0 - ; CHECK: [[COPY:%[0-9]+]]:_(p0) = COPY $x0 - ; CHECK: [[LOAD:%[0-9]+]]:_(<8 x s16>) = G_LOAD [[COPY]](p0) :: (load (<8 x s16>)) - ; CHECK: [[VECREDUCE_ADD:%[0-9]+]]:_(s16) = G_VECREDUCE_ADD [[LOAD]](<8 x s16>) - ; CHECK: [[ANYEXT:%[0-9]+]]:_(s32) = G_ANYEXT [[VECREDUCE_ADD]](s16) - ; CHECK: $w0 = COPY [[ANYEXT]](s32) - ; CHECK: RET_ReallyLR implicit $w0 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(p0) = COPY $x0 + ; CHECK-NEXT: [[LOAD:%[0-9]+]]:_(<8 x s16>) = G_LOAD [[COPY]](p0) :: (load (<8 x s16>)) + ; CHECK-NEXT: [[VECREDUCE_ADD:%[0-9]+]]:_(s16) = G_VECREDUCE_ADD [[LOAD]](<8 x s16>) + ; CHECK-NEXT: [[ANYEXT:%[0-9]+]]:_(s32) = G_ANYEXT [[VECREDUCE_ADD]](s16) + ; CHECK-NEXT: $w0 = COPY [[ANYEXT]](s32) + ; CHECK-NEXT: RET_ReallyLR implicit $w0 %0:_(p0) = COPY $x0 %1:_(<8 x s16>) = G_LOAD %0(p0) :: (load (<8 x s16>)) %2:_(s16) = G_VECREDUCE_ADD %1(<8 x s16>) @@ -52,14 +52,14 @@ tracksRegLiveness: true body: | bb.1: liveins: $x0 - ; CHECK-LABEL: name: add_v4s32 ; CHECK: liveins: $x0 - ; CHECK: [[COPY:%[0-9]+]]:_(p0) = COPY $x0 - ; CHECK: [[LOAD:%[0-9]+]]:_(<4 x s32>) = G_LOAD [[COPY]](p0) :: (load (<4 x s32>)) - ; CHECK: [[VECREDUCE_ADD:%[0-9]+]]:_(s32) = G_VECREDUCE_ADD [[LOAD]](<4 x s32>) - ; CHECK: $w0 = COPY 
[[VECREDUCE_ADD]](s32) - ; CHECK: RET_ReallyLR implicit $w0 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(p0) = COPY $x0 + ; CHECK-NEXT: [[LOAD:%[0-9]+]]:_(<4 x s32>) = G_LOAD [[COPY]](p0) :: (load (<4 x s32>)) + ; CHECK-NEXT: [[VECREDUCE_ADD:%[0-9]+]]:_(s32) = G_VECREDUCE_ADD [[LOAD]](<4 x s32>) + ; CHECK-NEXT: $w0 = COPY [[VECREDUCE_ADD]](s32) + ; CHECK-NEXT: RET_ReallyLR implicit $w0 %0:_(p0) = COPY $x0 %1:_(<4 x s32>) = G_LOAD %0(p0) :: (load (<4 x s32>)) %2:_(s32) = G_VECREDUCE_ADD %1(<4 x s32>) @@ -73,14 +73,14 @@ tracksRegLiveness: true body: | bb.1: liveins: $x0 - ; CHECK-LABEL: name: add_v2s64 ; CHECK: liveins: $x0 - ; CHECK: [[COPY:%[0-9]+]]:_(p0) = COPY $x0 - ; CHECK: [[LOAD:%[0-9]+]]:_(<2 x s64>) = G_LOAD [[COPY]](p0) :: (load (<2 x s64>)) - ; CHECK: [[VECREDUCE_ADD:%[0-9]+]]:_(s64) = G_VECREDUCE_ADD [[LOAD]](<2 x s64>) - ; CHECK: $x0 = COPY [[VECREDUCE_ADD]](s64) - ; CHECK: RET_ReallyLR implicit $x0 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(p0) = COPY $x0 + ; CHECK-NEXT: [[LOAD:%[0-9]+]]:_(<2 x s64>) = G_LOAD [[COPY]](p0) :: (load (<2 x s64>)) + ; CHECK-NEXT: [[VECREDUCE_ADD:%[0-9]+]]:_(s64) = G_VECREDUCE_ADD [[LOAD]](<2 x s64>) + ; CHECK-NEXT: $x0 = COPY [[VECREDUCE_ADD]](s64) + ; CHECK-NEXT: RET_ReallyLR implicit $x0 %0:_(p0) = COPY $x0 %1:_(<2 x s64>) = G_LOAD %0(p0) :: (load (<2 x s64>)) %2:_(s64) = G_VECREDUCE_ADD %1(<2 x s64>) @@ -94,14 +94,14 @@ tracksRegLiveness: true body: | bb.1: liveins: $x0 - ; CHECK-LABEL: name: add_v2s32 ; CHECK: liveins: $x0 - ; CHECK: [[COPY:%[0-9]+]]:_(p0) = COPY $x0 - ; CHECK: [[LOAD:%[0-9]+]]:_(<2 x s32>) = G_LOAD [[COPY]](p0) :: (load (<2 x s32>)) - ; CHECK: [[VECREDUCE_ADD:%[0-9]+]]:_(s32) = G_VECREDUCE_ADD [[LOAD]](<2 x s32>) - ; CHECK: $w0 = COPY [[VECREDUCE_ADD]](s32) - ; CHECK: RET_ReallyLR implicit $w0 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(p0) = COPY $x0 + ; CHECK-NEXT: [[LOAD:%[0-9]+]]:_(<2 x s32>) = G_LOAD [[COPY]](p0) :: (load (<2 x s32>)) + ; CHECK-NEXT: 
[[VECREDUCE_ADD:%[0-9]+]]:_(s32) = G_VECREDUCE_ADD [[LOAD]](<2 x s32>) + ; CHECK-NEXT: $w0 = COPY [[VECREDUCE_ADD]](s32) + ; CHECK-NEXT: RET_ReallyLR implicit $w0 %0:_(p0) = COPY $x0 %1:_(<2 x s32>) = G_LOAD %0(p0) :: (load (<2 x s32>)) %2:_(s32) = G_VECREDUCE_ADD %1(<2 x s32>) @@ -111,24 +111,25 @@ body: | ... --- name: test_v8i64 +# This is a power-of-2 legalization, so use a tree reduction. alignment: 4 tracksRegLiveness: true body: | bb.1: liveins: $q0, $q1, $q2, $q3 - ; This is a power-of-2 legalization, so use a tree reduction. ; CHECK-LABEL: name: test_v8i64 ; CHECK: liveins: $q0, $q1, $q2, $q3 - ; CHECK: [[COPY:%[0-9]+]]:_(<2 x s64>) = COPY $q0 - ; CHECK: [[COPY1:%[0-9]+]]:_(<2 x s64>) = COPY $q1 - ; CHECK: [[COPY2:%[0-9]+]]:_(<2 x s64>) = COPY $q2 - ; CHECK: [[COPY3:%[0-9]+]]:_(<2 x s64>) = COPY $q3 - ; CHECK: [[ADD:%[0-9]+]]:_(<2 x s64>) = G_ADD [[COPY]], [[COPY1]] - ; CHECK: [[ADD1:%[0-9]+]]:_(<2 x s64>) = G_ADD [[COPY2]], [[COPY3]] - ; CHECK: [[ADD2:%[0-9]+]]:_(<2 x s64>) = G_ADD [[ADD]], [[ADD1]] - ; CHECK: [[VECREDUCE_ADD:%[0-9]+]]:_(s64) = G_VECREDUCE_ADD [[ADD2]](<2 x s64>) - ; CHECK: $x0 = COPY [[VECREDUCE_ADD]](s64) - ; CHECK: RET_ReallyLR implicit $x0 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(<2 x s64>) = COPY $q0 + ; CHECK-NEXT: [[COPY1:%[0-9]+]]:_(<2 x s64>) = COPY $q1 + ; CHECK-NEXT: [[COPY2:%[0-9]+]]:_(<2 x s64>) = COPY $q2 + ; CHECK-NEXT: [[COPY3:%[0-9]+]]:_(<2 x s64>) = COPY $q3 + ; CHECK-NEXT: [[ADD:%[0-9]+]]:_(<2 x s64>) = G_ADD [[COPY]], [[COPY1]] + ; CHECK-NEXT: [[ADD1:%[0-9]+]]:_(<2 x s64>) = G_ADD [[COPY2]], [[COPY3]] + ; CHECK-NEXT: [[ADD2:%[0-9]+]]:_(<2 x s64>) = G_ADD [[ADD]], [[ADD1]] + ; CHECK-NEXT: [[VECREDUCE_ADD:%[0-9]+]]:_(s64) = G_VECREDUCE_ADD [[ADD2]](<2 x s64>) + ; CHECK-NEXT: $x0 = COPY [[VECREDUCE_ADD]](s64) + ; CHECK-NEXT: RET_ReallyLR implicit $x0 %0:_(<2 x s64>) = COPY $q0 %1:_(<2 x s64>) = COPY $q1 %2:_(<2 x s64>) = COPY $q2 @@ -143,25 +144,26 @@ body: | ... 
--- name: test_v6i64 +# This is a non-power-of-2 legalization, generate multiple vector reductions +# and combine them with scalar ops. alignment: 4 tracksRegLiveness: true body: | bb.1: liveins: $q0, $q1, $q2, $q3 - ; This is a non-power-of-2 legalization, generate multiple vector reductions - ; and combine them with scalar ops. ; CHECK-LABEL: name: test_v6i64 ; CHECK: liveins: $q0, $q1, $q2, $q3 - ; CHECK: [[COPY:%[0-9]+]]:_(<2 x s64>) = COPY $q0 - ; CHECK: [[COPY1:%[0-9]+]]:_(<2 x s64>) = COPY $q1 - ; CHECK: [[COPY2:%[0-9]+]]:_(<2 x s64>) = COPY $q2 - ; CHECK: [[VECREDUCE_ADD:%[0-9]+]]:_(s64) = G_VECREDUCE_ADD [[COPY]](<2 x s64>) - ; CHECK: [[VECREDUCE_ADD1:%[0-9]+]]:_(s64) = G_VECREDUCE_ADD [[COPY1]](<2 x s64>) - ; CHECK: [[VECREDUCE_ADD2:%[0-9]+]]:_(s64) = G_VECREDUCE_ADD [[COPY2]](<2 x s64>) - ; CHECK: [[ADD:%[0-9]+]]:_(s64) = G_ADD [[VECREDUCE_ADD]], [[VECREDUCE_ADD1]] - ; CHECK: [[ADD1:%[0-9]+]]:_(s64) = G_ADD [[ADD]], [[VECREDUCE_ADD2]] - ; CHECK: $x0 = COPY [[ADD1]](s64) - ; CHECK: RET_ReallyLR implicit $x0 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(<2 x s64>) = COPY $q0 + ; CHECK-NEXT: [[COPY1:%[0-9]+]]:_(<2 x s64>) = COPY $q1 + ; CHECK-NEXT: [[COPY2:%[0-9]+]]:_(<2 x s64>) = COPY $q2 + ; CHECK-NEXT: [[VECREDUCE_ADD:%[0-9]+]]:_(s64) = G_VECREDUCE_ADD [[COPY]](<2 x s64>) + ; CHECK-NEXT: [[VECREDUCE_ADD1:%[0-9]+]]:_(s64) = G_VECREDUCE_ADD [[COPY1]](<2 x s64>) + ; CHECK-NEXT: [[VECREDUCE_ADD2:%[0-9]+]]:_(s64) = G_VECREDUCE_ADD [[COPY2]](<2 x s64>) + ; CHECK-NEXT: [[ADD:%[0-9]+]]:_(s64) = G_ADD [[VECREDUCE_ADD]], [[VECREDUCE_ADD1]] + ; CHECK-NEXT: [[ADD1:%[0-9]+]]:_(s64) = G_ADD [[ADD]], [[VECREDUCE_ADD2]] + ; CHECK-NEXT: $x0 = COPY [[ADD1]](s64) + ; CHECK-NEXT: RET_ReallyLR implicit $x0 %0:_(<2 x s64>) = COPY $q0 %1:_(<2 x s64>) = COPY $q1 %2:_(<2 x s64>) = COPY $q2 diff --git a/llvm/test/CodeGen/AArch64/aarch64-addv.ll b/llvm/test/CodeGen/AArch64/aarch64-addv.ll index def4192b0e005..aba284b4e0d29 100644 --- 
a/llvm/test/CodeGen/AArch64/aarch64-addv.ll +++ b/llvm/test/CodeGen/AArch64/aarch64-addv.ll @@ -1,8 +1,7 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py -; RUN: llc < %s -mtriple=aarch64 -aarch64-neon-syntax=generic | FileCheck %s -check-prefixes=CHECK,SDAG -; RUN: llc < %s -global-isel=1 -global-isel-abort=2 -mtriple=aarch64 -aarch64-neon-syntax=generic 2>&1 | FileCheck %s --check-prefixes=CHECK,GISEL +; RUN: llc < %s -mtriple=aarch64 -aarch64-neon-syntax=generic | FileCheck %s -check-prefixes=CHECK,CHECK-SD +; RUN: llc < %s -mtriple=aarch64 -global-isel=1 -global-isel-abort=2 -aarch64-neon-syntax=generic 2>&1 | FileCheck %s --check-prefixes=CHECK,CHECK-GI -; Function Attrs: nounwind readnone declare i8 @llvm.vector.reduce.add.v2i8(<2 x i8>) declare i8 @llvm.vector.reduce.add.v3i8(<3 x i8>) declare i8 @llvm.vector.reduce.add.v4i8(<4 x i8>) @@ -23,14 +22,14 @@ declare i64 @llvm.vector.reduce.add.v3i64(<3 x i64>) declare i64 @llvm.vector.reduce.add.v4i64(<4 x i64>) declare i128 @llvm.vector.reduce.add.v2i128(<2 x i128>) -; GISEL: warning: Instruction selection used fallback path for addv_v2i8 -; GISEL-NEXT: warning: Instruction selection used fallback path for addv_v3i8 -; GISEL-NEXT: warning: Instruction selection used fallback path for addv_v4i8 -; GISEL-NEXT: warning: Instruction selection used fallback path for addv_v2i16 -; GISEL-NEXT: warning: Instruction selection used fallback path for addv_v3i16 -; GISEL-NEXT: warning: Instruction selection used fallback path for addv_v3i32 -; GISEL-NEXT: warning: Instruction selection used fallback path for addv_v3i64 -; GISEL-NEXT: warning: Instruction selection used fallback path for addv_v2i128 +; CHECK-GI: warning: Instruction selection used fallback path for addv_v2i8 +; CHECK-GI-NEXT: warning: Instruction selection used fallback path for addv_v3i8 +; CHECK-GI-NEXT: warning: Instruction selection used fallback path for addv_v4i8 +; CHECK-GI-NEXT: warning: Instruction selection used 
fallback path for addv_v2i16 +; CHECK-GI-NEXT: warning: Instruction selection used fallback path for addv_v3i16 +; CHECK-GI-NEXT: warning: Instruction selection used fallback path for addv_v3i32 +; CHECK-GI-NEXT: warning: Instruction selection used fallback path for addv_v3i64 +; CHECK-GI-NEXT: warning: Instruction selection used fallback path for addv_v2i128 define i8 @add_B(ptr %arr) { @@ -83,34 +82,34 @@ define i64 @add_D(ptr %arr) { define i32 @oversized_ADDV_256(ptr noalias nocapture readonly %arg1, ptr noalias nocapture readonly %arg2) { -; SDAG-LABEL: oversized_ADDV_256: -; SDAG: // %bb.0: // %entry -; SDAG-NEXT: ldr d0, [x0] -; SDAG-NEXT: ldr d1, [x1] -; SDAG-NEXT: uabdl v0.8h, v0.8b, v1.8b -; SDAG-NEXT: uaddlv s0, v0.8h -; SDAG-NEXT: fmov w0, s0 -; SDAG-NEXT: ret +; CHECK-SD-LABEL: oversized_ADDV_256: +; CHECK-SD: // %bb.0: // %entry +; CHECK-SD-NEXT: ldr d0, [x0] +; CHECK-SD-NEXT: ldr d1, [x1] +; CHECK-SD-NEXT: uabdl v0.8h, v0.8b, v1.8b +; CHECK-SD-NEXT: uaddlv s0, v0.8h +; CHECK-SD-NEXT: fmov w0, s0 +; CHECK-SD-NEXT: ret ; -; GISEL-LABEL: oversized_ADDV_256: -; GISEL: // %bb.0: // %entry -; GISEL-NEXT: ldr d1, [x0] -; GISEL-NEXT: ldr d2, [x1] -; GISEL-NEXT: movi v0.2d, #0000000000000000 -; GISEL-NEXT: usubl v1.8h, v1.8b, v2.8b -; GISEL-NEXT: sshll v2.4s, v1.4h, #0 -; GISEL-NEXT: sshll2 v3.4s, v1.8h, #0 -; GISEL-NEXT: ssubw2 v0.4s, v0.4s, v1.8h -; GISEL-NEXT: cmlt v4.4s, v2.4s, #0 -; GISEL-NEXT: cmlt v5.4s, v3.4s, #0 -; GISEL-NEXT: neg v6.4s, v2.4s -; GISEL-NEXT: mov v1.16b, v4.16b -; GISEL-NEXT: bif v0.16b, v3.16b, v5.16b -; GISEL-NEXT: bsl v1.16b, v6.16b, v2.16b -; GISEL-NEXT: add v0.4s, v1.4s, v0.4s -; GISEL-NEXT: addv s0, v0.4s -; GISEL-NEXT: fmov w0, s0 -; GISEL-NEXT: ret +; CHECK-GI-LABEL: oversized_ADDV_256: +; CHECK-GI: // %bb.0: // %entry +; CHECK-GI-NEXT: ldr d1, [x0] +; CHECK-GI-NEXT: ldr d2, [x1] +; CHECK-GI-NEXT: movi v0.2d, #0000000000000000 +; CHECK-GI-NEXT: usubl v1.8h, v1.8b, v2.8b +; CHECK-GI-NEXT: sshll v2.4s, v1.4h, #0 +; 
CHECK-GI-NEXT: sshll2 v3.4s, v1.8h, #0 +; CHECK-GI-NEXT: ssubw2 v0.4s, v0.4s, v1.8h +; CHECK-GI-NEXT: cmlt v4.4s, v2.4s, #0 +; CHECK-GI-NEXT: cmlt v5.4s, v3.4s, #0 +; CHECK-GI-NEXT: neg v6.4s, v2.4s +; CHECK-GI-NEXT: mov v1.16b, v4.16b +; CHECK-GI-NEXT: bif v0.16b, v3.16b, v5.16b +; CHECK-GI-NEXT: bsl v1.16b, v6.16b, v2.16b +; CHECK-GI-NEXT: add v0.4s, v1.4s, v0.4s +; CHECK-GI-NEXT: addv s0, v0.4s +; CHECK-GI-NEXT: fmov w0, s0 +; CHECK-GI-NEXT: ret entry: %0 = load <8 x i8>, ptr %arg1, align 1 %1 = zext <8 x i8> %0 to <8 x i32> @@ -127,48 +126,48 @@ entry: declare i32 @llvm.vector.reduce.add.v16i32(<16 x i32>) define i32 @oversized_ADDV_512(ptr %arr) { -; SDAG-LABEL: oversized_ADDV_512: -; SDAG: // %bb.0: -; SDAG-NEXT: ldp q0, q1, [x0, #32] -; SDAG-NEXT: ldp q2, q3, [x0] -; SDAG-NEXT: add v1.4s, v3.4s, v1.4s -; SDAG-NEXT: add v0.4s, v2.4s, v0.4s -; SDAG-NEXT: add v0.4s, v0.4s, v1.4s -; SDAG-NEXT: addv s0, v0.4s -; SDAG-NEXT: fmov w0, s0 -; SDAG-NEXT: ret +; CHECK-SD-LABEL: oversized_ADDV_512: +; CHECK-SD: // %bb.0: +; CHECK-SD-NEXT: ldp q0, q1, [x0, #32] +; CHECK-SD-NEXT: ldp q2, q3, [x0] +; CHECK-SD-NEXT: add v1.4s, v3.4s, v1.4s +; CHECK-SD-NEXT: add v0.4s, v2.4s, v0.4s +; CHECK-SD-NEXT: add v0.4s, v0.4s, v1.4s +; CHECK-SD-NEXT: addv s0, v0.4s +; CHECK-SD-NEXT: fmov w0, s0 +; CHECK-SD-NEXT: ret ; -; GISEL-LABEL: oversized_ADDV_512: -; GISEL: // %bb.0: -; GISEL-NEXT: ldp q0, q1, [x0] -; GISEL-NEXT: ldp q2, q3, [x0, #32] -; GISEL-NEXT: add v0.4s, v0.4s, v1.4s -; GISEL-NEXT: add v1.4s, v2.4s, v3.4s -; GISEL-NEXT: add v0.4s, v0.4s, v1.4s -; GISEL-NEXT: addv s0, v0.4s -; GISEL-NEXT: fmov w0, s0 -; GISEL-NEXT: ret +; CHECK-GI-LABEL: oversized_ADDV_512: +; CHECK-GI: // %bb.0: +; CHECK-GI-NEXT: ldp q0, q1, [x0] +; CHECK-GI-NEXT: ldp q2, q3, [x0, #32] +; CHECK-GI-NEXT: add v0.4s, v0.4s, v1.4s +; CHECK-GI-NEXT: add v1.4s, v2.4s, v3.4s +; CHECK-GI-NEXT: add v0.4s, v0.4s, v1.4s +; CHECK-GI-NEXT: addv s0, v0.4s +; CHECK-GI-NEXT: fmov w0, s0 +; CHECK-GI-NEXT: ret %bin.rdx = 
load <16 x i32>, ptr %arr %r = call i32 @llvm.vector.reduce.add.v16i32(<16 x i32> %bin.rdx) ret i32 %r } define i8 @addv_combine_i8(<8 x i8> %a1, <8 x i8> %a2) { -; SDAG-LABEL: addv_combine_i8: -; SDAG: // %bb.0: // %entry -; SDAG-NEXT: add v0.8b, v0.8b, v1.8b -; SDAG-NEXT: addv b0, v0.8b -; SDAG-NEXT: fmov w0, s0 -; SDAG-NEXT: ret +; CHECK-SD-LABEL: addv_combine_i8: +; CHECK-SD: // %bb.0: // %entry +; CHECK-SD-NEXT: add v0.8b, v0.8b, v1.8b +; CHECK-SD-NEXT: addv b0, v0.8b +; CHECK-SD-NEXT: fmov w0, s0 +; CHECK-SD-NEXT: ret ; -; GISEL-LABEL: addv_combine_i8: -; GISEL: // %bb.0: // %entry -; GISEL-NEXT: addv b0, v0.8b -; GISEL-NEXT: addv b1, v1.8b -; GISEL-NEXT: fmov w8, s0 -; GISEL-NEXT: fmov w9, s1 -; GISEL-NEXT: add w0, w9, w8, uxtb -; GISEL-NEXT: ret +; CHECK-GI-LABEL: addv_combine_i8: +; CHECK-GI: // %bb.0: // %entry +; CHECK-GI-NEXT: addv b0, v0.8b +; CHECK-GI-NEXT: addv b1, v1.8b +; CHECK-GI-NEXT: fmov w8, s0 +; CHECK-GI-NEXT: fmov w9, s1 +; CHECK-GI-NEXT: add w0, w9, w8, uxtb +; CHECK-GI-NEXT: ret entry: %rdx.1 = call i8 @llvm.vector.reduce.add.v8i8(<8 x i8> %a1) %rdx.2 = call i8 @llvm.vector.reduce.add.v8i8(<8 x i8> %a2) @@ -177,21 +176,21 @@ entry: } define i16 @addv_combine_i16(<4 x i16> %a1, <4 x i16> %a2) { -; SDAG-LABEL: addv_combine_i16: -; SDAG: // %bb.0: // %entry -; SDAG-NEXT: add v0.4h, v0.4h, v1.4h -; SDAG-NEXT: addv h0, v0.4h -; SDAG-NEXT: fmov w0, s0 -; SDAG-NEXT: ret +; CHECK-SD-LABEL: addv_combine_i16: +; CHECK-SD: // %bb.0: // %entry +; CHECK-SD-NEXT: add v0.4h, v0.4h, v1.4h +; CHECK-SD-NEXT: addv h0, v0.4h +; CHECK-SD-NEXT: fmov w0, s0 +; CHECK-SD-NEXT: ret ; -; GISEL-LABEL: addv_combine_i16: -; GISEL: // %bb.0: // %entry -; GISEL-NEXT: addv h0, v0.4h -; GISEL-NEXT: addv h1, v1.4h -; GISEL-NEXT: fmov w8, s0 -; GISEL-NEXT: fmov w9, s1 -; GISEL-NEXT: add w0, w9, w8, uxth -; GISEL-NEXT: ret +; CHECK-GI-LABEL: addv_combine_i16: +; CHECK-GI: // %bb.0: // %entry +; CHECK-GI-NEXT: addv h0, v0.4h +; CHECK-GI-NEXT: addv h1, v1.4h +; CHECK-GI-NEXT: 
fmov w8, s0 +; CHECK-GI-NEXT: fmov w9, s1 +; CHECK-GI-NEXT: add w0, w9, w8, uxth +; CHECK-GI-NEXT: ret entry: %rdx.1 = call i16 @llvm.vector.reduce.add.v4i16(<4 x i16> %a1) %rdx.2 = call i16 @llvm.vector.reduce.add.v4i16(<4 x i16> %a2) @@ -200,21 +199,21 @@ entry: } define i32 @addv_combine_i32(<4 x i32> %a1, <4 x i32> %a2) { -; SDAG-LABEL: addv_combine_i32: -; SDAG: // %bb.0: // %entry -; SDAG-NEXT: add v0.4s, v0.4s, v1.4s -; SDAG-NEXT: addv s0, v0.4s -; SDAG-NEXT: fmov w0, s0 -; SDAG-NEXT: ret +; CHECK-SD-LABEL: addv_combine_i32: +; CHECK-SD: // %bb.0: // %entry +; CHECK-SD-NEXT: add v0.4s, v0.4s, v1.4s +; CHECK-SD-NEXT: addv s0, v0.4s +; CHECK-SD-NEXT: fmov w0, s0 +; CHECK-SD-NEXT: ret ; -; GISEL-LABEL: addv_combine_i32: -; GISEL: // %bb.0: // %entry -; GISEL-NEXT: addv s0, v0.4s -; GISEL-NEXT: addv s1, v1.4s -; GISEL-NEXT: fmov w8, s0 -; GISEL-NEXT: fmov w9, s1 -; GISEL-NEXT: add w0, w8, w9 -; GISEL-NEXT: ret +; CHECK-GI-LABEL: addv_combine_i32: +; CHECK-GI: // %bb.0: // %entry +; CHECK-GI-NEXT: addv s0, v0.4s +; CHECK-GI-NEXT: addv s1, v1.4s +; CHECK-GI-NEXT: fmov w8, s0 +; CHECK-GI-NEXT: fmov w9, s1 +; CHECK-GI-NEXT: add w0, w8, w9 +; CHECK-GI-NEXT: ret entry: %rdx.1 = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %a1) %rdx.2 = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %a2) @@ -223,21 +222,21 @@ entry: } define i64 @addv_combine_i64(<2 x i64> %a1, <2 x i64> %a2) { -; SDAG-LABEL: addv_combine_i64: -; SDAG: // %bb.0: // %entry -; SDAG-NEXT: add v0.2d, v0.2d, v1.2d -; SDAG-NEXT: addp d0, v0.2d -; SDAG-NEXT: fmov x0, d0 -; SDAG-NEXT: ret +; CHECK-SD-LABEL: addv_combine_i64: +; CHECK-SD: // %bb.0: // %entry +; CHECK-SD-NEXT: add v0.2d, v0.2d, v1.2d +; CHECK-SD-NEXT: addp d0, v0.2d +; CHECK-SD-NEXT: fmov x0, d0 +; CHECK-SD-NEXT: ret ; -; GISEL-LABEL: addv_combine_i64: -; GISEL: // %bb.0: // %entry -; GISEL-NEXT: addp d0, v0.2d -; GISEL-NEXT: addp d1, v1.2d -; GISEL-NEXT: fmov x8, d0 -; GISEL-NEXT: fmov x9, d1 -; GISEL-NEXT: add x0, x8, x9 -; GISEL-NEXT: 
ret +; CHECK-GI-LABEL: addv_combine_i64: +; CHECK-GI: // %bb.0: // %entry +; CHECK-GI-NEXT: addp d0, v0.2d +; CHECK-GI-NEXT: addp d1, v1.2d +; CHECK-GI-NEXT: fmov x8, d0 +; CHECK-GI-NEXT: fmov x9, d1 +; CHECK-GI-NEXT: add x0, x8, x9 +; CHECK-GI-NEXT: ret entry: %rdx.1 = call i64 @llvm.vector.reduce.add.v2i64(<2 x i64> %a1) %rdx.2 = call i64 @llvm.vector.reduce.add.v2i64(<2 x i64> %a2) @@ -471,3 +470,6 @@ entry: ret i128 %arg1 } +;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line: +; GISEL: {{.*}} +; SDAG: {{.*}} diff --git a/llvm/test/CodeGen/AArch64/arm64-ldxr-stxr.ll b/llvm/test/CodeGen/AArch64/arm64-ldxr-stxr.ll index b498611242d46..d69d1b6eb4a2a 100644 --- a/llvm/test/CodeGen/AArch64/arm64-ldxr-stxr.ll +++ b/llvm/test/CodeGen/AArch64/arm64-ldxr-stxr.ll @@ -1,6 +1,6 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5 -; RUN: llc < %s -mtriple=arm64-linux-gnu | FileCheck %s --check-prefixes=CHECK,SDAG -; RUN: llc < %s -global-isel -global-isel-abort=1 -pass-remarks-missed=gisel* -mtriple=arm64-linux-gnu 2>&1 | FileCheck %s --check-prefixes=CHECK,GISEL,FALLBACK +; RUN: llc < %s -mtriple=arm64-linux-gnu | FileCheck %s --check-prefixes=CHECK,CHECK-SD +; RUN: llc < %s -mtriple=arm64-linux-gnu -global-isel | FileCheck %s --check-prefixes=CHECK,CHECK-GI %0 = type { i64, i64 } @@ -39,22 +39,21 @@ declare i32 @llvm.aarch64.stxp(i64, i64, ptr) nounwind @var = dso_local global i64 0, align 8 -; FALLBACK-NOT: remark:{{.*}}test_load_i8 define dso_local void @test_load_i8(ptr %addr) { -; SDAG-LABEL: test_load_i8: -; SDAG: // %bb.0: -; SDAG-NEXT: ldxrb w8, [x0] -; SDAG-NEXT: adrp x9, var -; SDAG-NEXT: str x8, [x9, :lo12:var] -; SDAG-NEXT: ret +; CHECK-SD-LABEL: test_load_i8: +; CHECK-SD: // %bb.0: +; CHECK-SD-NEXT: ldxrb w8, [x0] +; CHECK-SD-NEXT: adrp x9, var +; CHECK-SD-NEXT: str x8, [x9, :lo12:var] +; CHECK-SD-NEXT: ret ; -; GISEL-LABEL: test_load_i8: -; GISEL: // %bb.0: -; 
GISEL-NEXT: ldxrb w9, [x0] -; GISEL-NEXT: adrp x8, var -; GISEL-NEXT: and x9, x9, #0xff -; GISEL-NEXT: str x9, [x8, :lo12:var] -; GISEL-NEXT: ret +; CHECK-GI-LABEL: test_load_i8: +; CHECK-GI: // %bb.0: +; CHECK-GI-NEXT: ldxrb w9, [x0] +; CHECK-GI-NEXT: adrp x8, var +; CHECK-GI-NEXT: and x9, x9, #0xff +; CHECK-GI-NEXT: str x9, [x8, :lo12:var] +; CHECK-GI-NEXT: ret %val = call i64 @llvm.aarch64.ldxr.p0(ptr elementtype(i8) %addr) %shortval = trunc i64 %val to i8 @@ -63,22 +62,21 @@ define dso_local void @test_load_i8(ptr %addr) { ret void } -; FALLBACK-NOT: remark:{{.*}}test_load_i16 define dso_local void @test_load_i16(ptr %addr) { -; SDAG-LABEL: test_load_i16: -; SDAG: // %bb.0: -; SDAG-NEXT: ldxrh w8, [x0] -; SDAG-NEXT: adrp x9, var -; SDAG-NEXT: str x8, [x9, :lo12:var] -; SDAG-NEXT: ret +; CHECK-SD-LABEL: test_load_i16: +; CHECK-SD: // %bb.0: +; CHECK-SD-NEXT: ldxrh w8, [x0] +; CHECK-SD-NEXT: adrp x9, var +; CHECK-SD-NEXT: str x8, [x9, :lo12:var] +; CHECK-SD-NEXT: ret ; -; GISEL-LABEL: test_load_i16: -; GISEL: // %bb.0: -; GISEL-NEXT: ldxrh w9, [x0] -; GISEL-NEXT: adrp x8, var -; GISEL-NEXT: and x9, x9, #0xffff -; GISEL-NEXT: str x9, [x8, :lo12:var] -; GISEL-NEXT: ret +; CHECK-GI-LABEL: test_load_i16: +; CHECK-GI: // %bb.0: +; CHECK-GI-NEXT: ldxrh w9, [x0] +; CHECK-GI-NEXT: adrp x8, var +; CHECK-GI-NEXT: and x9, x9, #0xffff +; CHECK-GI-NEXT: str x9, [x8, :lo12:var] +; CHECK-GI-NEXT: ret %val = call i64 @llvm.aarch64.ldxr.p0(ptr elementtype(i16) %addr) %shortval = trunc i64 %val to i16 @@ -87,22 +85,21 @@ define dso_local void @test_load_i16(ptr %addr) { ret void } -; FALLBACK-NOT: remark:{{.*}}test_load_i32 define dso_local void @test_load_i32(ptr %addr) { -; SDAG-LABEL: test_load_i32: -; SDAG: // %bb.0: -; SDAG-NEXT: ldxr w8, [x0] -; SDAG-NEXT: adrp x9, var -; SDAG-NEXT: str x8, [x9, :lo12:var] -; SDAG-NEXT: ret +; CHECK-SD-LABEL: test_load_i32: +; CHECK-SD: // %bb.0: +; CHECK-SD-NEXT: ldxr w8, [x0] +; CHECK-SD-NEXT: adrp x9, var +; CHECK-SD-NEXT: str x8, [x9, 
:lo12:var] +; CHECK-SD-NEXT: ret ; -; GISEL-LABEL: test_load_i32: -; GISEL: // %bb.0: -; GISEL-NEXT: ldxr w9, [x0] -; GISEL-NEXT: adrp x8, var -; GISEL-NEXT: mov w9, w9 -; GISEL-NEXT: str x9, [x8, :lo12:var] -; GISEL-NEXT: ret +; CHECK-GI-LABEL: test_load_i32: +; CHECK-GI: // %bb.0: +; CHECK-GI-NEXT: ldxr w9, [x0] +; CHECK-GI-NEXT: adrp x8, var +; CHECK-GI-NEXT: mov w9, w9 +; CHECK-GI-NEXT: str x9, [x8, :lo12:var] +; CHECK-GI-NEXT: ret %val = call i64 @llvm.aarch64.ldxr.p0(ptr elementtype(i32) %addr) %shortval = trunc i64 %val to i32 @@ -111,7 +108,6 @@ define dso_local void @test_load_i32(ptr %addr) { ret void } -; FALLBACK-NOT: remark:{{.*}}test_load_i64 define dso_local void @test_load_i64(ptr %addr) { ; CHECK-LABEL: test_load_i64: ; CHECK: // %bb.0: @@ -128,7 +124,6 @@ define dso_local void @test_load_i64(ptr %addr) { declare i64 @llvm.aarch64.ldxr.p0(ptr) nounwind -; FALLBACK-NOT: remark:{{.*}}test_store_i8 define dso_local i32 @test_store_i8(i32, i8 %val, ptr %addr) { ; CHECK-LABEL: test_store_i8: ; CHECK: // %bb.0: @@ -140,7 +135,6 @@ define dso_local i32 @test_store_i8(i32, i8 %val, ptr %addr) { ret i32 %res } -; FALLBACK-NOT: remark:{{.*}}test_store_i16 define dso_local i32 @test_store_i16(i32, i16 %val, ptr %addr) { ; CHECK-LABEL: test_store_i16: ; CHECK: // %bb.0: @@ -152,7 +146,6 @@ define dso_local i32 @test_store_i16(i32, i16 %val, ptr %addr) { ret i32 %res } -; FALLBACK-NOT: remark:{{.*}}test_store_i32 define dso_local i32 @test_store_i32(i32, i32 %val, ptr %addr) { ; CHECK-LABEL: test_store_i32: ; CHECK: // %bb.0: @@ -163,7 +156,6 @@ define dso_local i32 @test_store_i32(i32, i32 %val, ptr %addr) { ret i32 %res } -; FALLBACK-NOT: remark:{{.*}}test_store_i64 define dso_local i32 @test_store_i64(i32, i64 %val, ptr %addr) { ; CHECK-LABEL: test_store_i64: ; CHECK: // %bb.0: @@ -219,22 +211,21 @@ entry: declare %0 @llvm.aarch64.ldaxp(ptr) nounwind declare i32 @llvm.aarch64.stlxp(i64, i64, ptr) nounwind -; FALLBACK-NOT: remark:{{.*}}test_load_acquire_i8 
define dso_local void @test_load_acquire_i8(ptr %addr) { -; SDAG-LABEL: test_load_acquire_i8: -; SDAG: // %bb.0: -; SDAG-NEXT: ldaxrb w8, [x0] -; SDAG-NEXT: adrp x9, var -; SDAG-NEXT: str x8, [x9, :lo12:var] -; SDAG-NEXT: ret +; CHECK-SD-LABEL: test_load_acquire_i8: +; CHECK-SD: // %bb.0: +; CHECK-SD-NEXT: ldaxrb w8, [x0] +; CHECK-SD-NEXT: adrp x9, var +; CHECK-SD-NEXT: str x8, [x9, :lo12:var] +; CHECK-SD-NEXT: ret ; -; GISEL-LABEL: test_load_acquire_i8: -; GISEL: // %bb.0: -; GISEL-NEXT: ldaxrb w9, [x0] -; GISEL-NEXT: adrp x8, var -; GISEL-NEXT: and x9, x9, #0xff -; GISEL-NEXT: str x9, [x8, :lo12:var] -; GISEL-NEXT: ret +; CHECK-GI-LABEL: test_load_acquire_i8: +; CHECK-GI: // %bb.0: +; CHECK-GI-NEXT: ldaxrb w9, [x0] +; CHECK-GI-NEXT: adrp x8, var +; CHECK-GI-NEXT: and x9, x9, #0xff +; CHECK-GI-NEXT: str x9, [x8, :lo12:var] +; CHECK-GI-NEXT: ret %val = call i64 @llvm.aarch64.ldaxr.p0(ptr elementtype(i8) %addr) %shortval = trunc i64 %val to i8 @@ -243,22 +234,21 @@ define dso_local void @test_load_acquire_i8(ptr %addr) { ret void } -; FALLBACK-NOT: remark:{{.*}}test_load_acquire_i16 define dso_local void @test_load_acquire_i16(ptr %addr) { -; SDAG-LABEL: test_load_acquire_i16: -; SDAG: // %bb.0: -; SDAG-NEXT: ldaxrh w8, [x0] -; SDAG-NEXT: adrp x9, var -; SDAG-NEXT: str x8, [x9, :lo12:var] -; SDAG-NEXT: ret +; CHECK-SD-LABEL: test_load_acquire_i16: +; CHECK-SD: // %bb.0: +; CHECK-SD-NEXT: ldaxrh w8, [x0] +; CHECK-SD-NEXT: adrp x9, var +; CHECK-SD-NEXT: str x8, [x9, :lo12:var] +; CHECK-SD-NEXT: ret ; -; GISEL-LABEL: test_load_acquire_i16: -; GISEL: // %bb.0: -; GISEL-NEXT: ldaxrh w9, [x0] -; GISEL-NEXT: adrp x8, var -; GISEL-NEXT: and x9, x9, #0xffff -; GISEL-NEXT: str x9, [x8, :lo12:var] -; GISEL-NEXT: ret +; CHECK-GI-LABEL: test_load_acquire_i16: +; CHECK-GI: // %bb.0: +; CHECK-GI-NEXT: ldaxrh w9, [x0] +; CHECK-GI-NEXT: adrp x8, var +; CHECK-GI-NEXT: and x9, x9, #0xffff +; CHECK-GI-NEXT: str x9, [x8, :lo12:var] +; CHECK-GI-NEXT: ret %val = call i64 
@llvm.aarch64.ldaxr.p0(ptr elementtype(i16) %addr) %shortval = trunc i64 %val to i16 @@ -267,22 +257,21 @@ define dso_local void @test_load_acquire_i16(ptr %addr) { ret void } -; FALLBACK-NOT: remark:{{.*}}test_load_acquire_i32 define dso_local void @test_load_acquire_i32(ptr %addr) { -; SDAG-LABEL: test_load_acquire_i32: -; SDAG: // %bb.0: -; SDAG-NEXT: ldaxr w8, [x0] -; SDAG-NEXT: adrp x9, var -; SDAG-NEXT: str x8, [x9, :lo12:var] -; SDAG-NEXT: ret +; CHECK-SD-LABEL: test_load_acquire_i32: +; CHECK-SD: // %bb.0: +; CHECK-SD-NEXT: ldaxr w8, [x0] +; CHECK-SD-NEXT: adrp x9, var +; CHECK-SD-NEXT: str x8, [x9, :lo12:var] +; CHECK-SD-NEXT: ret ; -; GISEL-LABEL: test_load_acquire_i32: -; GISEL: // %bb.0: -; GISEL-NEXT: ldaxr w9, [x0] -; GISEL-NEXT: adrp x8, var -; GISEL-NEXT: mov w9, w9 -; GISEL-NEXT: str x9, [x8, :lo12:var] -; GISEL-NEXT: ret +; CHECK-GI-LABEL: test_load_acquire_i32: +; CHECK-GI: // %bb.0: +; CHECK-GI-NEXT: ldaxr w9, [x0] +; CHECK-GI-NEXT: adrp x8, var +; CHECK-GI-NEXT: mov w9, w9 +; CHECK-GI-NEXT: str x9, [x8, :lo12:var] +; CHECK-GI-NEXT: ret %val = call i64 @llvm.aarch64.ldaxr.p0(ptr elementtype(i32) %addr) %shortval = trunc i64 %val to i32 @@ -291,7 +280,6 @@ define dso_local void @test_load_acquire_i32(ptr %addr) { ret void } -; FALLBACK-NOT: remark:{{.*}}test_load_acquire_i64 define dso_local void @test_load_acquire_i64(ptr %addr) { ; CHECK-LABEL: test_load_acquire_i64: ; CHECK: // %bb.0: @@ -308,7 +296,6 @@ define dso_local void @test_load_acquire_i64(ptr %addr) { declare i64 @llvm.aarch64.ldaxr.p0(ptr) nounwind -; FALLBACK-NOT: remark:{{.*}}test_store_release_i8 define dso_local i32 @test_store_release_i8(i32, i8 %val, ptr %addr) { ; CHECK-LABEL: test_store_release_i8: ; CHECK: // %bb.0: @@ -320,7 +307,6 @@ define dso_local i32 @test_store_release_i8(i32, i8 %val, ptr %addr) { ret i32 %res } -; FALLBACK-NOT: remark:{{.*}}test_store_release_i16 define dso_local i32 @test_store_release_i16(i32, i16 %val, ptr %addr) { ; CHECK-LABEL: 
test_store_release_i16: ; CHECK: // %bb.0: @@ -332,7 +318,6 @@ define dso_local i32 @test_store_release_i16(i32, i16 %val, ptr %addr) { ret i32 %res } -; FALLBACK-NOT: remark:{{.*}}test_store_release_i32 define dso_local i32 @test_store_release_i32(i32, i32 %val, ptr %addr) { ; CHECK-LABEL: test_store_release_i32: ; CHECK: // %bb.0: @@ -343,7 +328,6 @@ define dso_local i32 @test_store_release_i32(i32, i32 %val, ptr %addr) { ret i32 %res } -; FALLBACK-NOT: remark:{{.*}}test_store_release_i64 define dso_local i32 @test_store_release_i64(i32, i64 %val, ptr %addr) { ; CHECK-LABEL: test_store_release_i64: ; CHECK: // %bb.0: @@ -378,5 +362,3 @@ define dso_local i32 @test_stxp_undef_inline_asm(ptr %p, i64 %x) nounwind { } declare i32 @llvm.aarch64.stlxr.p0(i64, ptr) nounwind -;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line: -; FALLBACK: {{.*}} diff --git a/llvm/test/CodeGen/AArch64/const-shift-of-constmasked.ll b/llvm/test/CodeGen/AArch64/const-shift-of-constmasked.ll index 1fffcdda4b416..66a6745cda8f7 100644 --- a/llvm/test/CodeGen/AArch64/const-shift-of-constmasked.ll +++ b/llvm/test/CodeGen/AArch64/const-shift-of-constmasked.ll @@ -190,7 +190,8 @@ define i8 @test_i8_224_mask_ashr_6(i8 %a0) { define i8 @test_i8_7_mask_shl_1(i8 %a0) { ; CHECK-LABEL: test_i8_7_mask_shl_1: ; CHECK: // %bb.0: -; CHECK-NEXT: ubfiz w0, w0, #1, #3 +; CHECK-NEXT: and w8, w0, #0x7 +; CHECK-NEXT: lsl w0, w8, #1 ; CHECK-NEXT: ret %t0 = and i8 %a0, 7 %t1 = shl i8 %t0, 1 @@ -199,7 +200,8 @@ define i8 @test_i8_7_mask_shl_1(i8 %a0) { define i8 @test_i8_7_mask_shl_4(i8 %a0) { ; CHECK-LABEL: test_i8_7_mask_shl_4: ; CHECK: // %bb.0: -; CHECK-NEXT: ubfiz w0, w0, #4, #3 +; CHECK-NEXT: and w8, w0, #0x7 +; CHECK-NEXT: lsl w0, w8, #4 ; CHECK-NEXT: ret %t0 = and i8 %a0, 7 %t1 = shl i8 %t0, 4 @@ -227,8 +229,8 @@ define i8 @test_i8_7_mask_shl_6(i8 %a0) { define i8 @test_i8_28_mask_shl_1(i8 %a0) { ; CHECK-LABEL: test_i8_28_mask_shl_1: ; CHECK: // %bb.0: -; 
CHECK-NEXT: lsl w8, w0, #1 -; CHECK-NEXT: and w0, w8, #0x38 +; CHECK-NEXT: and w8, w0, #0x1c +; CHECK-NEXT: lsl w0, w8, #1 ; CHECK-NEXT: ret %t0 = and i8 %a0, 28 %t1 = shl i8 %t0, 1 @@ -237,8 +239,8 @@ define i8 @test_i8_28_mask_shl_1(i8 %a0) { define i8 @test_i8_28_mask_shl_2(i8 %a0) { ; CHECK-LABEL: test_i8_28_mask_shl_2: ; CHECK: // %bb.0: -; CHECK-NEXT: lsl w8, w0, #2 -; CHECK-NEXT: and w0, w8, #0x70 +; CHECK-NEXT: and w8, w0, #0x1c +; CHECK-NEXT: lsl w0, w8, #2 ; CHECK-NEXT: ret %t0 = and i8 %a0, 28 %t1 = shl i8 %t0, 2 @@ -247,8 +249,8 @@ define i8 @test_i8_28_mask_shl_2(i8 %a0) { define i8 @test_i8_28_mask_shl_3(i8 %a0) { ; CHECK-LABEL: test_i8_28_mask_shl_3: ; CHECK: // %bb.0: -; CHECK-NEXT: lsl w8, w0, #3 -; CHECK-NEXT: and w0, w8, #0xe0 +; CHECK-NEXT: and w8, w0, #0x1c +; CHECK-NEXT: lsl w0, w8, #3 ; CHECK-NEXT: ret %t0 = and i8 %a0, 28 %t1 = shl i8 %t0, 3 @@ -257,8 +259,8 @@ define i8 @test_i8_28_mask_shl_3(i8 %a0) { define i8 @test_i8_28_mask_shl_4(i8 %a0) { ; CHECK-LABEL: test_i8_28_mask_shl_4: ; CHECK: // %bb.0: -; CHECK-NEXT: lsl w8, w0, #4 -; CHECK-NEXT: and w0, w8, #0xc0 +; CHECK-NEXT: and w8, w0, #0xc +; CHECK-NEXT: lsl w0, w8, #4 ; CHECK-NEXT: ret %t0 = and i8 %a0, 28 %t1 = shl i8 %t0, 4 @@ -268,8 +270,8 @@ define i8 @test_i8_28_mask_shl_4(i8 %a0) { define i8 @test_i8_224_mask_shl_1(i8 %a0) { ; CHECK-LABEL: test_i8_224_mask_shl_1: ; CHECK: // %bb.0: -; CHECK-NEXT: lsl w8, w0, #1 -; CHECK-NEXT: and w0, w8, #0xc0 +; CHECK-NEXT: and w8, w0, #0x60 +; CHECK-NEXT: lsl w0, w8, #1 ; CHECK-NEXT: ret %t0 = and i8 %a0, 224 %t1 = shl i8 %t0, 1 @@ -463,7 +465,8 @@ define i16 @test_i16_65024_mask_ashr_10(i16 %a0) { define i16 @test_i16_127_mask_shl_1(i16 %a0) { ; CHECK-LABEL: test_i16_127_mask_shl_1: ; CHECK: // %bb.0: -; CHECK-NEXT: ubfiz w0, w0, #1, #7 +; CHECK-NEXT: and w8, w0, #0x7f +; CHECK-NEXT: lsl w0, w8, #1 ; CHECK-NEXT: ret %t0 = and i16 %a0, 127 %t1 = shl i16 %t0, 1 @@ -472,7 +475,8 @@ define i16 @test_i16_127_mask_shl_1(i16 %a0) { define i16 
@test_i16_127_mask_shl_8(i16 %a0) { ; CHECK-LABEL: test_i16_127_mask_shl_8: ; CHECK: // %bb.0: -; CHECK-NEXT: ubfiz w0, w0, #8, #7 +; CHECK-NEXT: and w8, w0, #0x7f +; CHECK-NEXT: lsl w0, w8, #8 ; CHECK-NEXT: ret %t0 = and i16 %a0, 127 %t1 = shl i16 %t0, 8 @@ -500,8 +504,8 @@ define i16 @test_i16_127_mask_shl_10(i16 %a0) { define i16 @test_i16_2032_mask_shl_3(i16 %a0) { ; CHECK-LABEL: test_i16_2032_mask_shl_3: ; CHECK: // %bb.0: -; CHECK-NEXT: lsl w8, w0, #3 -; CHECK-NEXT: and w0, w8, #0x3f80 +; CHECK-NEXT: and w8, w0, #0x7f0 +; CHECK-NEXT: lsl w0, w8, #3 ; CHECK-NEXT: ret %t0 = and i16 %a0, 2032 %t1 = shl i16 %t0, 3 @@ -510,8 +514,8 @@ define i16 @test_i16_2032_mask_shl_3(i16 %a0) { define i16 @test_i16_2032_mask_shl_4(i16 %a0) { ; CHECK-LABEL: test_i16_2032_mask_shl_4: ; CHECK: // %bb.0: -; CHECK-NEXT: lsl w8, w0, #4 -; CHECK-NEXT: and w0, w8, #0x7f00 +; CHECK-NEXT: and w8, w0, #0x7f0 +; CHECK-NEXT: lsl w0, w8, #4 ; CHECK-NEXT: ret %t0 = and i16 %a0, 2032 %t1 = shl i16 %t0, 4 @@ -520,8 +524,8 @@ define i16 @test_i16_2032_mask_shl_4(i16 %a0) { define i16 @test_i16_2032_mask_shl_5(i16 %a0) { ; CHECK-LABEL: test_i16_2032_mask_shl_5: ; CHECK: // %bb.0: -; CHECK-NEXT: lsl w8, w0, #5 -; CHECK-NEXT: and w0, w8, #0xfe00 +; CHECK-NEXT: and w8, w0, #0x7f0 +; CHECK-NEXT: lsl w0, w8, #5 ; CHECK-NEXT: ret %t0 = and i16 %a0, 2032 %t1 = shl i16 %t0, 5 @@ -530,8 +534,8 @@ define i16 @test_i16_2032_mask_shl_5(i16 %a0) { define i16 @test_i16_2032_mask_shl_6(i16 %a0) { ; CHECK-LABEL: test_i16_2032_mask_shl_6: ; CHECK: // %bb.0: -; CHECK-NEXT: lsl w8, w0, #6 -; CHECK-NEXT: and w0, w8, #0xfc00 +; CHECK-NEXT: and w8, w0, #0x3f0 +; CHECK-NEXT: lsl w0, w8, #6 ; CHECK-NEXT: ret %t0 = and i16 %a0, 2032 %t1 = shl i16 %t0, 6 @@ -541,8 +545,8 @@ define i16 @test_i16_2032_mask_shl_6(i16 %a0) { define i16 @test_i16_65024_mask_shl_1(i16 %a0) { ; CHECK-LABEL: test_i16_65024_mask_shl_1: ; CHECK: // %bb.0: -; CHECK-NEXT: lsl w8, w0, #1 -; CHECK-NEXT: and w0, w8, #0xfc00 +; CHECK-NEXT: and w8, w0, 
#0x7e00 +; CHECK-NEXT: lsl w0, w8, #1 ; CHECK-NEXT: ret %t0 = and i16 %a0, 65024 %t1 = shl i16 %t0, 1 @@ -736,7 +740,8 @@ define i32 @test_i32_4294836224_mask_ashr_18(i32 %a0) { define i32 @test_i32_32767_mask_shl_1(i32 %a0) { ; CHECK-LABEL: test_i32_32767_mask_shl_1: ; CHECK: // %bb.0: -; CHECK-NEXT: ubfiz w0, w0, #1, #15 +; CHECK-NEXT: and w8, w0, #0x7fff +; CHECK-NEXT: lsl w0, w8, #1 ; CHECK-NEXT: ret %t0 = and i32 %a0, 32767 %t1 = shl i32 %t0, 1 @@ -745,7 +750,8 @@ define i32 @test_i32_32767_mask_shl_1(i32 %a0) { define i32 @test_i32_32767_mask_shl_16(i32 %a0) { ; CHECK-LABEL: test_i32_32767_mask_shl_16: ; CHECK: // %bb.0: -; CHECK-NEXT: ubfiz w0, w0, #16, #15 +; CHECK-NEXT: and w8, w0, #0x7fff +; CHECK-NEXT: lsl w0, w8, #16 ; CHECK-NEXT: ret %t0 = and i32 %a0, 32767 %t1 = shl i32 %t0, 16 @@ -773,8 +779,8 @@ define i32 @test_i32_32767_mask_shl_18(i32 %a0) { define i32 @test_i32_8388352_mask_shl_7(i32 %a0) { ; CHECK-LABEL: test_i32_8388352_mask_shl_7: ; CHECK: // %bb.0: -; CHECK-NEXT: lsl w8, w0, #7 -; CHECK-NEXT: and w0, w8, #0x3fff8000 +; CHECK-NEXT: and w8, w0, #0x7fff00 +; CHECK-NEXT: lsl w0, w8, #7 ; CHECK-NEXT: ret %t0 = and i32 %a0, 8388352 %t1 = shl i32 %t0, 7 @@ -783,8 +789,8 @@ define i32 @test_i32_8388352_mask_shl_7(i32 %a0) { define i32 @test_i32_8388352_mask_shl_8(i32 %a0) { ; CHECK-LABEL: test_i32_8388352_mask_shl_8: ; CHECK: // %bb.0: -; CHECK-NEXT: lsl w8, w0, #8 -; CHECK-NEXT: and w0, w8, #0x7fff0000 +; CHECK-NEXT: and w8, w0, #0x7fff00 +; CHECK-NEXT: lsl w0, w8, #8 ; CHECK-NEXT: ret %t0 = and i32 %a0, 8388352 %t1 = shl i32 %t0, 8 @@ -793,8 +799,8 @@ define i32 @test_i32_8388352_mask_shl_8(i32 %a0) { define i32 @test_i32_8388352_mask_shl_9(i32 %a0) { ; CHECK-LABEL: test_i32_8388352_mask_shl_9: ; CHECK: // %bb.0: -; CHECK-NEXT: lsl w8, w0, #9 -; CHECK-NEXT: and w0, w8, #0xfffe0000 +; CHECK-NEXT: and w8, w0, #0x7fff00 +; CHECK-NEXT: lsl w0, w8, #9 ; CHECK-NEXT: ret %t0 = and i32 %a0, 8388352 %t1 = shl i32 %t0, 9 @@ -803,8 +809,8 @@ define i32 
@test_i32_8388352_mask_shl_9(i32 %a0) { define i32 @test_i32_8388352_mask_shl_10(i32 %a0) { ; CHECK-LABEL: test_i32_8388352_mask_shl_10: ; CHECK: // %bb.0: -; CHECK-NEXT: lsl w8, w0, #10 -; CHECK-NEXT: and w0, w8, #0xfffc0000 +; CHECK-NEXT: and w8, w0, #0x3fff00 +; CHECK-NEXT: lsl w0, w8, #10 ; CHECK-NEXT: ret %t0 = and i32 %a0, 8388352 %t1 = shl i32 %t0, 10 @@ -814,8 +820,8 @@ define i32 @test_i32_8388352_mask_shl_10(i32 %a0) { define i32 @test_i32_4294836224_mask_shl_1(i32 %a0) { ; CHECK-LABEL: test_i32_4294836224_mask_shl_1: ; CHECK: // %bb.0: -; CHECK-NEXT: lsl w8, w0, #1 -; CHECK-NEXT: and w0, w8, #0xfffc0000 +; CHECK-NEXT: and w8, w0, #0x7ffe0000 +; CHECK-NEXT: lsl w0, w8, #1 ; CHECK-NEXT: ret %t0 = and i32 %a0, 4294836224 %t1 = shl i32 %t0, 1 @@ -1009,7 +1015,8 @@ define i64 @test_i64_18446744065119617024_mask_ashr_34(i64 %a0) { define i64 @test_i64_2147483647_mask_shl_1(i64 %a0) { ; CHECK-LABEL: test_i64_2147483647_mask_shl_1: ; CHECK: // %bb.0: -; CHECK-NEXT: lsl w0, w0, #1 +; CHECK-NEXT: and x8, x0, #0x7fffffff +; CHECK-NEXT: lsl x0, x8, #1 ; CHECK-NEXT: ret %t0 = and i64 %a0, 2147483647 %t1 = shl i64 %t0, 1 @@ -1047,8 +1054,8 @@ define i64 @test_i64_2147483647_mask_shl_34(i64 %a0) { define i64 @test_i64_140737488289792_mask_shl_15(i64 %a0) { ; CHECK-LABEL: test_i64_140737488289792_mask_shl_15: ; CHECK: // %bb.0: -; CHECK-NEXT: lsl x8, x0, #15 -; CHECK-NEXT: and x0, x8, #0x3fffffff80000000 +; CHECK-NEXT: and x8, x0, #0x7fffffff0000 +; CHECK-NEXT: lsl x0, x8, #15 ; CHECK-NEXT: ret %t0 = and i64 %a0, 140737488289792 %t1 = shl i64 %t0, 15 @@ -1057,8 +1064,8 @@ define i64 @test_i64_140737488289792_mask_shl_15(i64 %a0) { define i64 @test_i64_140737488289792_mask_shl_16(i64 %a0) { ; CHECK-LABEL: test_i64_140737488289792_mask_shl_16: ; CHECK: // %bb.0: -; CHECK-NEXT: lsl x8, x0, #16 -; CHECK-NEXT: and x0, x8, #0x7fffffff00000000 +; CHECK-NEXT: and x8, x0, #0x7fffffff0000 +; CHECK-NEXT: lsl x0, x8, #16 ; CHECK-NEXT: ret %t0 = and i64 %a0, 140737488289792 %t1 = 
shl i64 %t0, 16 @@ -1067,8 +1074,8 @@ define i64 @test_i64_140737488289792_mask_shl_16(i64 %a0) { define i64 @test_i64_140737488289792_mask_shl_17(i64 %a0) { ; CHECK-LABEL: test_i64_140737488289792_mask_shl_17: ; CHECK: // %bb.0: -; CHECK-NEXT: lsl x8, x0, #17 -; CHECK-NEXT: and x0, x8, #0xfffffffe00000000 +; CHECK-NEXT: and x8, x0, #0x7fffffff0000 +; CHECK-NEXT: lsl x0, x8, #17 ; CHECK-NEXT: ret %t0 = and i64 %a0, 140737488289792 %t1 = shl i64 %t0, 17 @@ -1077,8 +1084,8 @@ define i64 @test_i64_140737488289792_mask_shl_17(i64 %a0) { define i64 @test_i64_140737488289792_mask_shl_18(i64 %a0) { ; CHECK-LABEL: test_i64_140737488289792_mask_shl_18: ; CHECK: // %bb.0: -; CHECK-NEXT: lsl x8, x0, #18 -; CHECK-NEXT: and x0, x8, #0xfffffffc00000000 +; CHECK-NEXT: and x8, x0, #0x3fffffff0000 +; CHECK-NEXT: lsl x0, x8, #18 ; CHECK-NEXT: ret %t0 = and i64 %a0, 140737488289792 %t1 = shl i64 %t0, 18 @@ -1088,8 +1095,8 @@ define i64 @test_i64_140737488289792_mask_shl_18(i64 %a0) { define i64 @test_i64_18446744065119617024_mask_shl_1(i64 %a0) { ; CHECK-LABEL: test_i64_18446744065119617024_mask_shl_1: ; CHECK: // %bb.0: -; CHECK-NEXT: lsl x8, x0, #1 -; CHECK-NEXT: and x0, x8, #0xfffffffc00000000 +; CHECK-NEXT: and x8, x0, #0x7ffffffe00000000 +; CHECK-NEXT: lsl x0, x8, #1 ; CHECK-NEXT: ret %t0 = and i64 %a0, 18446744065119617024 %t1 = shl i64 %t0, 1 diff --git a/llvm/test/CodeGen/AArch64/double_reduct.ll b/llvm/test/CodeGen/AArch64/double_reduct.ll index 0c356b1d98287..f30895db2c098 100644 --- a/llvm/test/CodeGen/AArch64/double_reduct.ll +++ b/llvm/test/CodeGen/AArch64/double_reduct.ll @@ -1,111 +1,291 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py -; RUN: llc --mtriple=aarch64-eabi < %s | FileCheck %s +; RUN: llc --mtriple=aarch64-eabi < %s -global-isel=false | FileCheck %s --check-prefixes=CHECK,CHECK-SD +; RUN: llc --mtriple=aarch64-eabi < %s -global-isel=true | FileCheck %s --check-prefixes=CHECK,CHECK-GI define float @add_f32(<8 x float> %a, <4 
x float> %b) { -; CHECK-LABEL: add_f32: -; CHECK: // %bb.0: -; CHECK-NEXT: fadd v0.4s, v0.4s, v1.4s -; CHECK-NEXT: fadd v0.4s, v0.4s, v2.4s -; CHECK-NEXT: faddp v0.4s, v0.4s, v0.4s -; CHECK-NEXT: faddp s0, v0.2s -; CHECK-NEXT: ret +; CHECK-SD-LABEL: add_f32: +; CHECK-SD: // %bb.0: +; CHECK-SD-NEXT: fadd v0.4s, v0.4s, v1.4s +; CHECK-SD-NEXT: fadd v0.4s, v0.4s, v2.4s +; CHECK-SD-NEXT: faddp v0.4s, v0.4s, v0.4s +; CHECK-SD-NEXT: faddp s0, v0.2s +; CHECK-SD-NEXT: ret +; +; CHECK-GI-LABEL: add_f32: +; CHECK-GI: // %bb.0: +; CHECK-GI-NEXT: fadd v0.4s, v0.4s, v1.4s +; CHECK-GI-NEXT: faddp v1.4s, v2.4s, v2.4s +; CHECK-GI-NEXT: faddp v0.4s, v0.4s, v0.4s +; CHECK-GI-NEXT: faddp s1, v1.2s +; CHECK-GI-NEXT: faddp s0, v0.2s +; CHECK-GI-NEXT: fadd s0, s0, s1 +; CHECK-GI-NEXT: ret %r1 = call fast float @llvm.vector.reduce.fadd.f32.v8f32(float -0.0, <8 x float> %a) %r2 = call fast float @llvm.vector.reduce.fadd.f32.v4f32(float -0.0, <4 x float> %b) %r = fadd fast float %r1, %r2 ret float %r } +define float @add_f32_same(<4 x float> %a, <4 x float> %b) { +; CHECK-SD-LABEL: add_f32_same: +; CHECK-SD: // %bb.0: +; CHECK-SD-NEXT: fadd v0.4s, v0.4s, v1.4s +; CHECK-SD-NEXT: faddp v0.4s, v0.4s, v0.4s +; CHECK-SD-NEXT: faddp s0, v0.2s +; CHECK-SD-NEXT: ret +; +; CHECK-GI-LABEL: add_f32_same: +; CHECK-GI: // %bb.0: +; CHECK-GI-NEXT: faddp v0.4s, v0.4s, v0.4s +; CHECK-GI-NEXT: faddp v1.4s, v1.4s, v1.4s +; CHECK-GI-NEXT: faddp s0, v0.2s +; CHECK-GI-NEXT: faddp s1, v1.2s +; CHECK-GI-NEXT: fadd s0, s0, s1 +; CHECK-GI-NEXT: ret + %r1 = call fast float @llvm.vector.reduce.fadd.f32.v4f32(float -0.0, <4 x float> %a) + %r2 = call fast float @llvm.vector.reduce.fadd.f32.v4f32(float -0.0, <4 x float> %b) + %r = fadd fast float %r1, %r2 + ret float %r +} + define float @fmul_f32(<8 x float> %a, <4 x float> %b) { -; CHECK-LABEL: fmul_f32: -; CHECK: // %bb.0: -; CHECK-NEXT: fmul v0.4s, v0.4s, v1.4s -; CHECK-NEXT: fmul v0.4s, v0.4s, v2.4s -; CHECK-NEXT: ext v1.16b, v0.16b, v0.16b, #8 -; CHECK-NEXT: fmul 
v0.2s, v0.2s, v1.2s -; CHECK-NEXT: fmul s0, s0, v0.s[1] -; CHECK-NEXT: ret +; CHECK-SD-LABEL: fmul_f32: +; CHECK-SD: // %bb.0: +; CHECK-SD-NEXT: fmul v0.4s, v0.4s, v1.4s +; CHECK-SD-NEXT: fmul v0.4s, v0.4s, v2.4s +; CHECK-SD-NEXT: ext v1.16b, v0.16b, v0.16b, #8 +; CHECK-SD-NEXT: fmul v0.2s, v0.2s, v1.2s +; CHECK-SD-NEXT: fmul s0, s0, v0.s[1] +; CHECK-SD-NEXT: ret +; +; CHECK-GI-LABEL: fmul_f32: +; CHECK-GI: // %bb.0: +; CHECK-GI-NEXT: fmul v0.4s, v0.4s, v1.4s +; CHECK-GI-NEXT: mov d3, v2.d[1] +; CHECK-GI-NEXT: mov d1, v0.d[1] +; CHECK-GI-NEXT: fmul v0.2s, v0.2s, v1.2s +; CHECK-GI-NEXT: fmul v1.2s, v2.2s, v3.2s +; CHECK-GI-NEXT: mov s2, v0.s[1] +; CHECK-GI-NEXT: mov s3, v1.s[1] +; CHECK-GI-NEXT: fmul s0, s0, s2 +; CHECK-GI-NEXT: fmul s1, s1, s3 +; CHECK-GI-NEXT: fmul s0, s0, s1 +; CHECK-GI-NEXT: ret %r1 = call fast float @llvm.vector.reduce.fmul.f32.v8f32(float 1.0, <8 x float> %a) %r2 = call fast float @llvm.vector.reduce.fmul.f32.v4f32(float 1.0, <4 x float> %b) %r = fmul fast float %r1, %r2 ret float %r } +define float @fmul_f32_same(<4 x float> %a, <4 x float> %b) { +; CHECK-SD-LABEL: fmul_f32_same: +; CHECK-SD: // %bb.0: +; CHECK-SD-NEXT: fmul v0.4s, v0.4s, v1.4s +; CHECK-SD-NEXT: ext v1.16b, v0.16b, v0.16b, #8 +; CHECK-SD-NEXT: fmul v0.2s, v0.2s, v1.2s +; CHECK-SD-NEXT: fmul s0, s0, v0.s[1] +; CHECK-SD-NEXT: ret +; +; CHECK-GI-LABEL: fmul_f32_same: +; CHECK-GI: // %bb.0: +; CHECK-GI-NEXT: mov d2, v0.d[1] +; CHECK-GI-NEXT: mov d3, v1.d[1] +; CHECK-GI-NEXT: fmul v0.2s, v0.2s, v2.2s +; CHECK-GI-NEXT: fmul v1.2s, v1.2s, v3.2s +; CHECK-GI-NEXT: mov s2, v0.s[1] +; CHECK-GI-NEXT: mov s3, v1.s[1] +; CHECK-GI-NEXT: fmul s0, s0, s2 +; CHECK-GI-NEXT: fmul s1, s1, s3 +; CHECK-GI-NEXT: fmul s0, s0, s1 +; CHECK-GI-NEXT: ret + %r1 = call fast float @llvm.vector.reduce.fmul.f32.v4f32(float 1.0, <4 x float> %a) + %r2 = call fast float @llvm.vector.reduce.fmul.f32.v4f32(float 1.0, <4 x float> %b) + %r = fmul fast float %r1, %r2 + ret float %r +} + define float @fmin_f32(<8 x 
float> %a, <4 x float> %b) { -; CHECK-LABEL: fmin_f32: -; CHECK: // %bb.0: -; CHECK-NEXT: fminnm v0.4s, v0.4s, v1.4s -; CHECK-NEXT: fminnm v0.4s, v0.4s, v2.4s -; CHECK-NEXT: fminnmv s0, v0.4s -; CHECK-NEXT: ret +; CHECK-SD-LABEL: fmin_f32: +; CHECK-SD: // %bb.0: +; CHECK-SD-NEXT: fminnm v0.4s, v0.4s, v1.4s +; CHECK-SD-NEXT: fminnm v0.4s, v0.4s, v2.4s +; CHECK-SD-NEXT: fminnmv s0, v0.4s +; CHECK-SD-NEXT: ret +; +; CHECK-GI-LABEL: fmin_f32: +; CHECK-GI: // %bb.0: +; CHECK-GI-NEXT: fminnm v0.4s, v0.4s, v1.4s +; CHECK-GI-NEXT: fminnmv s1, v2.4s +; CHECK-GI-NEXT: fminnmv s0, v0.4s +; CHECK-GI-NEXT: fminnm s0, s0, s1 +; CHECK-GI-NEXT: ret %r1 = call float @llvm.vector.reduce.fmin.v8f32(<8 x float> %a) %r2 = call float @llvm.vector.reduce.fmin.v4f32(<4 x float> %b) %r = call float @llvm.minnum.f32(float %r1, float %r2) ret float %r } +define float @fmin_f32_same(<4 x float> %a, <4 x float> %b) { +; CHECK-SD-LABEL: fmin_f32_same: +; CHECK-SD: // %bb.0: +; CHECK-SD-NEXT: fminnm v0.4s, v0.4s, v1.4s +; CHECK-SD-NEXT: fminnmv s0, v0.4s +; CHECK-SD-NEXT: ret +; +; CHECK-GI-LABEL: fmin_f32_same: +; CHECK-GI: // %bb.0: +; CHECK-GI-NEXT: fminnmv s0, v0.4s +; CHECK-GI-NEXT: fminnmv s1, v1.4s +; CHECK-GI-NEXT: fminnm s0, s0, s1 +; CHECK-GI-NEXT: ret + %r1 = call float @llvm.vector.reduce.fmin.v4f32(<4 x float> %a) + %r2 = call float @llvm.vector.reduce.fmin.v4f32(<4 x float> %b) + %r = call float @llvm.minnum.f32(float %r1, float %r2) + ret float %r +} + define float @fmax_f32(<8 x float> %a, <4 x float> %b) { -; CHECK-LABEL: fmax_f32: -; CHECK: // %bb.0: -; CHECK-NEXT: fmaxnm v0.4s, v0.4s, v1.4s -; CHECK-NEXT: fmaxnm v0.4s, v0.4s, v2.4s -; CHECK-NEXT: fmaxnmv s0, v0.4s -; CHECK-NEXT: ret +; CHECK-SD-LABEL: fmax_f32: +; CHECK-SD: // %bb.0: +; CHECK-SD-NEXT: fmaxnm v0.4s, v0.4s, v1.4s +; CHECK-SD-NEXT: fmaxnm v0.4s, v0.4s, v2.4s +; CHECK-SD-NEXT: fmaxnmv s0, v0.4s +; CHECK-SD-NEXT: ret +; +; CHECK-GI-LABEL: fmax_f32: +; CHECK-GI: // %bb.0: +; CHECK-GI-NEXT: fmaxnm v0.4s, v0.4s, v1.4s 
+; CHECK-GI-NEXT: fmaxnmv s1, v2.4s +; CHECK-GI-NEXT: fmaxnmv s0, v0.4s +; CHECK-GI-NEXT: fmaxnm s0, s0, s1 +; CHECK-GI-NEXT: ret %r1 = call float @llvm.vector.reduce.fmax.v8f32(<8 x float> %a) %r2 = call float @llvm.vector.reduce.fmax.v4f32(<4 x float> %b) %r = call float @llvm.maxnum.f32(float %r1, float %r2) ret float %r } +define float @fmax_f32_same(<4 x float> %a, <4 x float> %b) { +; CHECK-SD-LABEL: fmax_f32_same: +; CHECK-SD: // %bb.0: +; CHECK-SD-NEXT: fmaxnm v0.4s, v0.4s, v1.4s +; CHECK-SD-NEXT: fmaxnmv s0, v0.4s +; CHECK-SD-NEXT: ret +; +; CHECK-GI-LABEL: fmax_f32_same: +; CHECK-GI: // %bb.0: +; CHECK-GI-NEXT: fmaxnmv s0, v0.4s +; CHECK-GI-NEXT: fmaxnmv s1, v1.4s +; CHECK-GI-NEXT: fmaxnm s0, s0, s1 +; CHECK-GI-NEXT: ret + %r1 = call float @llvm.vector.reduce.fmax.v4f32(<4 x float> %a) + %r2 = call float @llvm.vector.reduce.fmax.v4f32(<4 x float> %b) + %r = call float @llvm.maxnum.f32(float %r1, float %r2) + ret float %r +} + define float @fminimum_f32(<8 x float> %a, <4 x float> %b) { -; CHECK-LABEL: fminimum_f32: -; CHECK: // %bb.0: -; CHECK-NEXT: fmin v0.4s, v0.4s, v1.4s -; CHECK-NEXT: fmin v0.4s, v0.4s, v2.4s -; CHECK-NEXT: fminv s0, v0.4s -; CHECK-NEXT: ret +; CHECK-SD-LABEL: fminimum_f32: +; CHECK-SD: // %bb.0: +; CHECK-SD-NEXT: fmin v0.4s, v0.4s, v1.4s +; CHECK-SD-NEXT: fmin v0.4s, v0.4s, v2.4s +; CHECK-SD-NEXT: fminv s0, v0.4s +; CHECK-SD-NEXT: ret +; +; CHECK-GI-LABEL: fminimum_f32: +; CHECK-GI: // %bb.0: +; CHECK-GI-NEXT: fmin v0.4s, v0.4s, v1.4s +; CHECK-GI-NEXT: fminv s1, v2.4s +; CHECK-GI-NEXT: fminv s0, v0.4s +; CHECK-GI-NEXT: fmin s0, s0, s1 +; CHECK-GI-NEXT: ret %r1 = call float @llvm.vector.reduce.fminimum.v8f32(<8 x float> %a) %r2 = call float @llvm.vector.reduce.fminimum.v4f32(<4 x float> %b) %r = call float @llvm.minimum.f32(float %r1, float %r2) ret float %r } +define float @fminimum_f32_same(<4 x float> %a, <4 x float> %b) { +; CHECK-SD-LABEL: fminimum_f32_same: +; CHECK-SD: // %bb.0: +; CHECK-SD-NEXT: fmin v0.4s, v0.4s, v1.4s +; 
CHECK-SD-NEXT: fminv s0, v0.4s +; CHECK-SD-NEXT: ret +; +; CHECK-GI-LABEL: fminimum_f32_same: +; CHECK-GI: // %bb.0: +; CHECK-GI-NEXT: fminv s0, v0.4s +; CHECK-GI-NEXT: fminv s1, v1.4s +; CHECK-GI-NEXT: fmin s0, s0, s1 +; CHECK-GI-NEXT: ret + %r1 = call float @llvm.vector.reduce.fminimum.v4f32(<4 x float> %a) + %r2 = call float @llvm.vector.reduce.fminimum.v4f32(<4 x float> %b) + %r = call float @llvm.minimum.f32(float %r1, float %r2) + ret float %r +} + define float @fmaximum_f32(<8 x float> %a, <4 x float> %b) { -; CHECK-LABEL: fmaximum_f32: -; CHECK: // %bb.0: -; CHECK-NEXT: fmax v0.4s, v0.4s, v1.4s -; CHECK-NEXT: fmax v0.4s, v0.4s, v2.4s -; CHECK-NEXT: fmaxv s0, v0.4s -; CHECK-NEXT: ret +; CHECK-SD-LABEL: fmaximum_f32: +; CHECK-SD: // %bb.0: +; CHECK-SD-NEXT: fmax v0.4s, v0.4s, v1.4s +; CHECK-SD-NEXT: fmax v0.4s, v0.4s, v2.4s +; CHECK-SD-NEXT: fmaxv s0, v0.4s +; CHECK-SD-NEXT: ret +; +; CHECK-GI-LABEL: fmaximum_f32: +; CHECK-GI: // %bb.0: +; CHECK-GI-NEXT: fmax v0.4s, v0.4s, v1.4s +; CHECK-GI-NEXT: fmaxv s1, v2.4s +; CHECK-GI-NEXT: fmaxv s0, v0.4s +; CHECK-GI-NEXT: fmax s0, s0, s1 +; CHECK-GI-NEXT: ret %r1 = call float @llvm.vector.reduce.fmaximum.v8f32(<8 x float> %a) %r2 = call float @llvm.vector.reduce.fmaximum.v4f32(<4 x float> %b) %r = call float @llvm.maximum.f32(float %r1, float %r2) ret float %r } +define float @fmaximum_f32_same(<4 x float> %a, <4 x float> %b) { +; CHECK-SD-LABEL: fmaximum_f32_same: +; CHECK-SD: // %bb.0: +; CHECK-SD-NEXT: fmax v0.4s, v0.4s, v1.4s +; CHECK-SD-NEXT: fmaxv s0, v0.4s +; CHECK-SD-NEXT: ret +; +; CHECK-GI-LABEL: fmaximum_f32_same: +; CHECK-GI: // %bb.0: +; CHECK-GI-NEXT: fmaxv s0, v0.4s +; CHECK-GI-NEXT: fmaxv s1, v1.4s +; CHECK-GI-NEXT: fmax s0, s0, s1 +; CHECK-GI-NEXT: ret + %r1 = call float @llvm.vector.reduce.fmaximum.v4f32(<4 x float> %a) + %r2 = call float @llvm.vector.reduce.fmaximum.v4f32(<4 x float> %b) + %r = call float @llvm.maximum.f32(float %r1, float %r2) + ret float %r +} + ; These next two tests have 
incorrect minnum/minimum combinations -define float @fminimumnum_f32(<8 x float> %a, <4 x float> %b) { +define float @fminimumnum_f32(<4 x float> %a, <4 x float> %b) { ; CHECK-LABEL: fminimumnum_f32: ; CHECK: // %bb.0: -; CHECK-NEXT: fmin v0.4s, v0.4s, v1.4s -; CHECK-NEXT: fminv s1, v2.4s ; CHECK-NEXT: fminv s0, v0.4s +; CHECK-NEXT: fminv s1, v1.4s ; CHECK-NEXT: fminnm s0, s0, s1 ; CHECK-NEXT: ret - %r1 = call float @llvm.vector.reduce.fminimum.v8f32(<8 x float> %a) + %r1 = call float @llvm.vector.reduce.fminimum.v4f32(<4 x float> %a) %r2 = call float @llvm.vector.reduce.fminimum.v4f32(<4 x float> %b) %r = call float @llvm.minnum.f32(float %r1, float %r2) ret float %r } -define float @fmaxnumimum_f32(<8 x float> %a, <4 x float> %b) { +define float @fmaxnumimum_f32(<4 x float> %a, <4 x float> %b) { ; CHECK-LABEL: fmaxnumimum_f32: ; CHECK: // %bb.0: -; CHECK-NEXT: fmaxnm v0.4s, v0.4s, v1.4s -; CHECK-NEXT: fmaxnmv s1, v2.4s ; CHECK-NEXT: fmaxnmv s0, v0.4s +; CHECK-NEXT: fmaxnmv s1, v1.4s ; CHECK-NEXT: fmax s0, s0, s1 ; CHECK-NEXT: ret - %r1 = call float @llvm.vector.reduce.fmax.v8f32(<8 x float> %a) + %r1 = call float @llvm.vector.reduce.fmax.v4f32(<4 x float> %a) %r2 = call float @llvm.vector.reduce.fmax.v4f32(<4 x float> %b) %r = call float @llvm.maximum.f32(float %r1, float %r2) ret float %r @@ -113,13 +293,23 @@ define float @fmaxnumimum_f32(<8 x float> %a, <4 x float> %b) { define i32 @add_i32(<8 x i32> %a, <4 x i32> %b) { -; CHECK-LABEL: add_i32: -; CHECK: // %bb.0: -; CHECK-NEXT: add v0.4s, v0.4s, v1.4s -; CHECK-NEXT: add v0.4s, v0.4s, v2.4s -; CHECK-NEXT: addv s0, v0.4s -; CHECK-NEXT: fmov w0, s0 -; CHECK-NEXT: ret +; CHECK-SD-LABEL: add_i32: +; CHECK-SD: // %bb.0: +; CHECK-SD-NEXT: add v0.4s, v0.4s, v1.4s +; CHECK-SD-NEXT: add v0.4s, v0.4s, v2.4s +; CHECK-SD-NEXT: addv s0, v0.4s +; CHECK-SD-NEXT: fmov w0, s0 +; CHECK-SD-NEXT: ret +; +; CHECK-GI-LABEL: add_i32: +; CHECK-GI: // %bb.0: +; CHECK-GI-NEXT: add v0.4s, v0.4s, v1.4s +; CHECK-GI-NEXT: addv s1, v2.4s +; 
CHECK-GI-NEXT: addv s0, v0.4s +; CHECK-GI-NEXT: fmov w9, s1 +; CHECK-GI-NEXT: fmov w8, s0 +; CHECK-GI-NEXT: add w0, w8, w9 +; CHECK-GI-NEXT: ret %r1 = call i32 @llvm.vector.reduce.add.i32.v8i32(<8 x i32> %a) %r2 = call i32 @llvm.vector.reduce.add.i32.v4i32(<4 x i32> %b) %r = add i32 %r1, %r2 @@ -127,13 +317,22 @@ define i32 @add_i32(<8 x i32> %a, <4 x i32> %b) { } define i16 @add_ext_i16(<16 x i8> %a, <16 x i8> %b) { -; CHECK-LABEL: add_ext_i16: -; CHECK: // %bb.0: -; CHECK-NEXT: uaddlp v1.8h, v1.16b -; CHECK-NEXT: uadalp v1.8h, v0.16b -; CHECK-NEXT: addv h0, v1.8h -; CHECK-NEXT: fmov w0, s0 -; CHECK-NEXT: ret +; CHECK-SD-LABEL: add_ext_i16: +; CHECK-SD: // %bb.0: +; CHECK-SD-NEXT: uaddlp v1.8h, v1.16b +; CHECK-SD-NEXT: uadalp v1.8h, v0.16b +; CHECK-SD-NEXT: addv h0, v1.8h +; CHECK-SD-NEXT: fmov w0, s0 +; CHECK-SD-NEXT: ret +; +; CHECK-GI-LABEL: add_ext_i16: +; CHECK-GI: // %bb.0: +; CHECK-GI-NEXT: uaddlv h0, v0.16b +; CHECK-GI-NEXT: uaddlv h1, v1.16b +; CHECK-GI-NEXT: fmov w8, s0 +; CHECK-GI-NEXT: fmov w9, s1 +; CHECK-GI-NEXT: add w0, w8, w9 +; CHECK-GI-NEXT: ret %ae = zext <16 x i8> %a to <16 x i16> %be = zext <16 x i8> %b to <16 x i16> %r1 = call i16 @llvm.vector.reduce.add.i16.v16i16(<16 x i16> %ae) @@ -143,15 +342,27 @@ define i16 @add_ext_i16(<16 x i8> %a, <16 x i8> %b) { } define i16 @add_ext_v32i16(<32 x i8> %a, <16 x i8> %b) { -; CHECK-LABEL: add_ext_v32i16: -; CHECK: // %bb.0: -; CHECK-NEXT: uaddl2 v3.8h, v0.16b, v1.16b -; CHECK-NEXT: uaddl v0.8h, v0.8b, v1.8b -; CHECK-NEXT: add v0.8h, v0.8h, v3.8h -; CHECK-NEXT: uadalp v0.8h, v2.16b -; CHECK-NEXT: addv h0, v0.8h -; CHECK-NEXT: fmov w0, s0 -; CHECK-NEXT: ret +; CHECK-SD-LABEL: add_ext_v32i16: +; CHECK-SD: // %bb.0: +; CHECK-SD-NEXT: uaddl2 v3.8h, v0.16b, v1.16b +; CHECK-SD-NEXT: uaddl v0.8h, v0.8b, v1.8b +; CHECK-SD-NEXT: add v0.8h, v0.8h, v3.8h +; CHECK-SD-NEXT: uadalp v0.8h, v2.16b +; CHECK-SD-NEXT: addv h0, v0.8h +; CHECK-SD-NEXT: fmov w0, s0 +; CHECK-SD-NEXT: ret +; +; CHECK-GI-LABEL: add_ext_v32i16: 
+; CHECK-GI: // %bb.0: +; CHECK-GI-NEXT: uaddlv h0, v0.16b +; CHECK-GI-NEXT: uaddlv h1, v1.16b +; CHECK-GI-NEXT: uaddlv h2, v2.16b +; CHECK-GI-NEXT: fmov w8, s0 +; CHECK-GI-NEXT: fmov w9, s1 +; CHECK-GI-NEXT: add w8, w8, w9 +; CHECK-GI-NEXT: fmov w9, s2 +; CHECK-GI-NEXT: add w0, w8, w9 +; CHECK-GI-NEXT: ret %ae = zext <32 x i8> %a to <32 x i16> %be = zext <16 x i8> %b to <16 x i16> %r1 = call i16 @llvm.vector.reduce.add.i16.v32i16(<32 x i16> %ae) @@ -161,141 +372,486 @@ define i16 @add_ext_v32i16(<32 x i8> %a, <16 x i8> %b) { } define i32 @mul_i32(<8 x i32> %a, <4 x i32> %b) { -; CHECK-LABEL: mul_i32: -; CHECK: // %bb.0: -; CHECK-NEXT: mul v0.4s, v0.4s, v1.4s -; CHECK-NEXT: mul v0.4s, v0.4s, v2.4s -; CHECK-NEXT: ext v1.16b, v0.16b, v0.16b, #8 -; CHECK-NEXT: mul v0.2s, v0.2s, v1.2s -; CHECK-NEXT: mov w8, v0.s[1] -; CHECK-NEXT: fmov w9, s0 -; CHECK-NEXT: mul w0, w9, w8 -; CHECK-NEXT: ret +; CHECK-SD-LABEL: mul_i32: +; CHECK-SD: // %bb.0: +; CHECK-SD-NEXT: mul v0.4s, v0.4s, v1.4s +; CHECK-SD-NEXT: mul v0.4s, v0.4s, v2.4s +; CHECK-SD-NEXT: ext v1.16b, v0.16b, v0.16b, #8 +; CHECK-SD-NEXT: mul v0.2s, v0.2s, v1.2s +; CHECK-SD-NEXT: mov w8, v0.s[1] +; CHECK-SD-NEXT: fmov w9, s0 +; CHECK-SD-NEXT: mul w0, w9, w8 +; CHECK-SD-NEXT: ret +; +; CHECK-GI-LABEL: mul_i32: +; CHECK-GI: // %bb.0: +; CHECK-GI-NEXT: mov d3, v0.d[1] +; CHECK-GI-NEXT: mov d4, v1.d[1] +; CHECK-GI-NEXT: mul v0.2s, v0.2s, v3.2s +; CHECK-GI-NEXT: mul v1.2s, v1.2s, v4.2s +; CHECK-GI-NEXT: mov d3, v2.d[1] +; CHECK-GI-NEXT: mul v0.2s, v0.2s, v1.2s +; CHECK-GI-NEXT: mul v1.2s, v2.2s, v3.2s +; CHECK-GI-NEXT: mov w8, v0.s[1] +; CHECK-GI-NEXT: fmov w10, s0 +; CHECK-GI-NEXT: mov w9, v1.s[1] +; CHECK-GI-NEXT: mul w8, w10, w8 +; CHECK-GI-NEXT: fmov w10, s1 +; CHECK-GI-NEXT: mul w9, w10, w9 +; CHECK-GI-NEXT: mul w0, w8, w9 +; CHECK-GI-NEXT: ret %r1 = call i32 @llvm.vector.reduce.mul.i32.v8i32(<8 x i32> %a) %r2 = call i32 @llvm.vector.reduce.mul.i32.v4i32(<4 x i32> %b) %r = mul i32 %r1, %r2 ret i32 %r } +define i32 
@mul_i32_same(<4 x i32> %a, <4 x i32> %b) { +; CHECK-SD-LABEL: mul_i32_same: +; CHECK-SD: // %bb.0: +; CHECK-SD-NEXT: mul v0.4s, v0.4s, v1.4s +; CHECK-SD-NEXT: ext v1.16b, v0.16b, v0.16b, #8 +; CHECK-SD-NEXT: mul v0.2s, v0.2s, v1.2s +; CHECK-SD-NEXT: mov w8, v0.s[1] +; CHECK-SD-NEXT: fmov w9, s0 +; CHECK-SD-NEXT: mul w0, w9, w8 +; CHECK-SD-NEXT: ret +; +; CHECK-GI-LABEL: mul_i32_same: +; CHECK-GI: // %bb.0: +; CHECK-GI-NEXT: mov d2, v0.d[1] +; CHECK-GI-NEXT: mov d3, v1.d[1] +; CHECK-GI-NEXT: mul v0.2s, v0.2s, v2.2s +; CHECK-GI-NEXT: mul v1.2s, v1.2s, v3.2s +; CHECK-GI-NEXT: mov w8, v0.s[1] +; CHECK-GI-NEXT: mov w9, v1.s[1] +; CHECK-GI-NEXT: fmov w10, s0 +; CHECK-GI-NEXT: fmov w11, s1 +; CHECK-GI-NEXT: mul w8, w10, w8 +; CHECK-GI-NEXT: mul w9, w11, w9 +; CHECK-GI-NEXT: mul w0, w8, w9 +; CHECK-GI-NEXT: ret + %r1 = call i32 @llvm.vector.reduce.mul.i32.v4i32(<4 x i32> %a) + %r2 = call i32 @llvm.vector.reduce.mul.i32.v4i32(<4 x i32> %b) + %r = mul i32 %r1, %r2 + ret i32 %r +} + define i32 @and_i32(<8 x i32> %a, <4 x i32> %b) { -; CHECK-LABEL: and_i32: -; CHECK: // %bb.0: -; CHECK-NEXT: and v0.16b, v0.16b, v1.16b -; CHECK-NEXT: and v0.16b, v0.16b, v2.16b -; CHECK-NEXT: ext v1.16b, v0.16b, v0.16b, #8 -; CHECK-NEXT: and v0.8b, v0.8b, v1.8b -; CHECK-NEXT: fmov x8, d0 -; CHECK-NEXT: lsr x9, x8, #32 -; CHECK-NEXT: and w0, w8, w9 -; CHECK-NEXT: ret +; CHECK-SD-LABEL: and_i32: +; CHECK-SD: // %bb.0: +; CHECK-SD-NEXT: and v0.16b, v0.16b, v1.16b +; CHECK-SD-NEXT: and v0.16b, v0.16b, v2.16b +; CHECK-SD-NEXT: ext v1.16b, v0.16b, v0.16b, #8 +; CHECK-SD-NEXT: and v0.8b, v0.8b, v1.8b +; CHECK-SD-NEXT: fmov x8, d0 +; CHECK-SD-NEXT: lsr x9, x8, #32 +; CHECK-SD-NEXT: and w0, w8, w9 +; CHECK-SD-NEXT: ret +; +; CHECK-GI-LABEL: and_i32: +; CHECK-GI: // %bb.0: +; CHECK-GI-NEXT: and v0.16b, v0.16b, v1.16b +; CHECK-GI-NEXT: mov d1, v2.d[1] +; CHECK-GI-NEXT: mov d3, v0.d[1] +; CHECK-GI-NEXT: and v1.8b, v2.8b, v1.8b +; CHECK-GI-NEXT: and v0.8b, v0.8b, v3.8b +; CHECK-GI-NEXT: mov w8, v1.s[1] +; 
CHECK-GI-NEXT: fmov w10, s1 +; CHECK-GI-NEXT: mov w9, v0.s[1] +; CHECK-GI-NEXT: fmov w11, s0 +; CHECK-GI-NEXT: and w8, w10, w8 +; CHECK-GI-NEXT: and w8, w11, w8 +; CHECK-GI-NEXT: and w0, w8, w9 +; CHECK-GI-NEXT: ret %r1 = call i32 @llvm.vector.reduce.and.i32.v8i32(<8 x i32> %a) %r2 = call i32 @llvm.vector.reduce.and.i32.v4i32(<4 x i32> %b) %r = and i32 %r1, %r2 ret i32 %r } +define i32 @and_i32_same(<4 x i32> %a, <4 x i32> %b) { +; CHECK-SD-LABEL: and_i32_same: +; CHECK-SD: // %bb.0: +; CHECK-SD-NEXT: and v0.16b, v0.16b, v1.16b +; CHECK-SD-NEXT: ext v1.16b, v0.16b, v0.16b, #8 +; CHECK-SD-NEXT: and v0.8b, v0.8b, v1.8b +; CHECK-SD-NEXT: fmov x8, d0 +; CHECK-SD-NEXT: lsr x9, x8, #32 +; CHECK-SD-NEXT: and w0, w8, w9 +; CHECK-SD-NEXT: ret +; +; CHECK-GI-LABEL: and_i32_same: +; CHECK-GI: // %bb.0: +; CHECK-GI-NEXT: mov d2, v0.d[1] +; CHECK-GI-NEXT: mov d3, v1.d[1] +; CHECK-GI-NEXT: and v0.8b, v0.8b, v2.8b +; CHECK-GI-NEXT: and v1.8b, v1.8b, v3.8b +; CHECK-GI-NEXT: mov w8, v0.s[1] +; CHECK-GI-NEXT: mov w9, v1.s[1] +; CHECK-GI-NEXT: fmov w10, s0 +; CHECK-GI-NEXT: fmov w11, s1 +; CHECK-GI-NEXT: and w8, w10, w8 +; CHECK-GI-NEXT: and w9, w11, w9 +; CHECK-GI-NEXT: and w0, w8, w9 +; CHECK-GI-NEXT: ret + %r1 = call i32 @llvm.vector.reduce.and.i32.v4i32(<4 x i32> %a) + %r2 = call i32 @llvm.vector.reduce.and.i32.v4i32(<4 x i32> %b) + %r = and i32 %r1, %r2 + ret i32 %r +} + define i32 @or_i32(<8 x i32> %a, <4 x i32> %b) { -; CHECK-LABEL: or_i32: -; CHECK: // %bb.0: -; CHECK-NEXT: orr v0.16b, v0.16b, v1.16b -; CHECK-NEXT: orr v0.16b, v0.16b, v2.16b -; CHECK-NEXT: ext v1.16b, v0.16b, v0.16b, #8 -; CHECK-NEXT: orr v0.8b, v0.8b, v1.8b -; CHECK-NEXT: fmov x8, d0 -; CHECK-NEXT: lsr x9, x8, #32 -; CHECK-NEXT: orr w0, w8, w9 -; CHECK-NEXT: ret +; CHECK-SD-LABEL: or_i32: +; CHECK-SD: // %bb.0: +; CHECK-SD-NEXT: orr v0.16b, v0.16b, v1.16b +; CHECK-SD-NEXT: orr v0.16b, v0.16b, v2.16b +; CHECK-SD-NEXT: ext v1.16b, v0.16b, v0.16b, #8 +; CHECK-SD-NEXT: orr v0.8b, v0.8b, v1.8b +; CHECK-SD-NEXT: 
fmov x8, d0 +; CHECK-SD-NEXT: lsr x9, x8, #32 +; CHECK-SD-NEXT: orr w0, w8, w9 +; CHECK-SD-NEXT: ret +; +; CHECK-GI-LABEL: or_i32: +; CHECK-GI: // %bb.0: +; CHECK-GI-NEXT: orr v0.16b, v0.16b, v1.16b +; CHECK-GI-NEXT: mov d1, v2.d[1] +; CHECK-GI-NEXT: mov d3, v0.d[1] +; CHECK-GI-NEXT: orr v1.8b, v2.8b, v1.8b +; CHECK-GI-NEXT: orr v0.8b, v0.8b, v3.8b +; CHECK-GI-NEXT: mov w8, v1.s[1] +; CHECK-GI-NEXT: fmov w10, s1 +; CHECK-GI-NEXT: mov w9, v0.s[1] +; CHECK-GI-NEXT: fmov w11, s0 +; CHECK-GI-NEXT: orr w8, w10, w8 +; CHECK-GI-NEXT: orr w8, w11, w8 +; CHECK-GI-NEXT: orr w0, w8, w9 +; CHECK-GI-NEXT: ret %r1 = call i32 @llvm.vector.reduce.or.i32.v8i32(<8 x i32> %a) %r2 = call i32 @llvm.vector.reduce.or.i32.v4i32(<4 x i32> %b) %r = or i32 %r1, %r2 ret i32 %r } +define i32 @or_i32_same(<4 x i32> %a, <4 x i32> %b) { +; CHECK-SD-LABEL: or_i32_same: +; CHECK-SD: // %bb.0: +; CHECK-SD-NEXT: orr v0.16b, v0.16b, v1.16b +; CHECK-SD-NEXT: ext v1.16b, v0.16b, v0.16b, #8 +; CHECK-SD-NEXT: orr v0.8b, v0.8b, v1.8b +; CHECK-SD-NEXT: fmov x8, d0 +; CHECK-SD-NEXT: lsr x9, x8, #32 +; CHECK-SD-NEXT: orr w0, w8, w9 +; CHECK-SD-NEXT: ret +; +; CHECK-GI-LABEL: or_i32_same: +; CHECK-GI: // %bb.0: +; CHECK-GI-NEXT: mov d2, v0.d[1] +; CHECK-GI-NEXT: mov d3, v1.d[1] +; CHECK-GI-NEXT: orr v0.8b, v0.8b, v2.8b +; CHECK-GI-NEXT: orr v1.8b, v1.8b, v3.8b +; CHECK-GI-NEXT: mov w8, v0.s[1] +; CHECK-GI-NEXT: mov w9, v1.s[1] +; CHECK-GI-NEXT: fmov w10, s0 +; CHECK-GI-NEXT: fmov w11, s1 +; CHECK-GI-NEXT: orr w8, w10, w8 +; CHECK-GI-NEXT: orr w9, w11, w9 +; CHECK-GI-NEXT: orr w0, w8, w9 +; CHECK-GI-NEXT: ret + %r1 = call i32 @llvm.vector.reduce.or.i32.v4i32(<4 x i32> %a) + %r2 = call i32 @llvm.vector.reduce.or.i32.v4i32(<4 x i32> %b) + %r = or i32 %r1, %r2 + ret i32 %r +} + define i32 @xor_i32(<8 x i32> %a, <4 x i32> %b) { -; CHECK-LABEL: xor_i32: -; CHECK: // %bb.0: -; CHECK-NEXT: eor v0.16b, v0.16b, v1.16b -; CHECK-NEXT: eor v0.16b, v0.16b, v2.16b -; CHECK-NEXT: ext v1.16b, v0.16b, v0.16b, #8 -; CHECK-NEXT: 
eor v0.8b, v0.8b, v1.8b -; CHECK-NEXT: fmov x8, d0 -; CHECK-NEXT: lsr x9, x8, #32 -; CHECK-NEXT: eor w0, w8, w9 -; CHECK-NEXT: ret +; CHECK-SD-LABEL: xor_i32: +; CHECK-SD: // %bb.0: +; CHECK-SD-NEXT: eor v0.16b, v0.16b, v1.16b +; CHECK-SD-NEXT: eor v0.16b, v0.16b, v2.16b +; CHECK-SD-NEXT: ext v1.16b, v0.16b, v0.16b, #8 +; CHECK-SD-NEXT: eor v0.8b, v0.8b, v1.8b +; CHECK-SD-NEXT: fmov x8, d0 +; CHECK-SD-NEXT: lsr x9, x8, #32 +; CHECK-SD-NEXT: eor w0, w8, w9 +; CHECK-SD-NEXT: ret +; +; CHECK-GI-LABEL: xor_i32: +; CHECK-GI: // %bb.0: +; CHECK-GI-NEXT: eor v0.16b, v0.16b, v1.16b +; CHECK-GI-NEXT: mov d1, v2.d[1] +; CHECK-GI-NEXT: mov d3, v0.d[1] +; CHECK-GI-NEXT: eor v1.8b, v2.8b, v1.8b +; CHECK-GI-NEXT: eor v0.8b, v0.8b, v3.8b +; CHECK-GI-NEXT: mov w8, v1.s[1] +; CHECK-GI-NEXT: fmov w10, s1 +; CHECK-GI-NEXT: mov w9, v0.s[1] +; CHECK-GI-NEXT: fmov w11, s0 +; CHECK-GI-NEXT: eor w8, w10, w8 +; CHECK-GI-NEXT: eor w8, w11, w8 +; CHECK-GI-NEXT: eor w0, w8, w9 +; CHECK-GI-NEXT: ret %r1 = call i32 @llvm.vector.reduce.xor.i32.v8i32(<8 x i32> %a) %r2 = call i32 @llvm.vector.reduce.xor.i32.v4i32(<4 x i32> %b) %r = xor i32 %r1, %r2 ret i32 %r } +define i32 @xor_i32_same(<4 x i32> %a, <4 x i32> %b) { +; CHECK-SD-LABEL: xor_i32_same: +; CHECK-SD: // %bb.0: +; CHECK-SD-NEXT: eor v0.16b, v0.16b, v1.16b +; CHECK-SD-NEXT: ext v1.16b, v0.16b, v0.16b, #8 +; CHECK-SD-NEXT: eor v0.8b, v0.8b, v1.8b +; CHECK-SD-NEXT: fmov x8, d0 +; CHECK-SD-NEXT: lsr x9, x8, #32 +; CHECK-SD-NEXT: eor w0, w8, w9 +; CHECK-SD-NEXT: ret +; +; CHECK-GI-LABEL: xor_i32_same: +; CHECK-GI: // %bb.0: +; CHECK-GI-NEXT: mov d2, v0.d[1] +; CHECK-GI-NEXT: mov d3, v1.d[1] +; CHECK-GI-NEXT: eor v0.8b, v0.8b, v2.8b +; CHECK-GI-NEXT: eor v1.8b, v1.8b, v3.8b +; CHECK-GI-NEXT: mov w8, v0.s[1] +; CHECK-GI-NEXT: mov w9, v1.s[1] +; CHECK-GI-NEXT: fmov w10, s0 +; CHECK-GI-NEXT: fmov w11, s1 +; CHECK-GI-NEXT: eor w8, w10, w8 +; CHECK-GI-NEXT: eor w9, w11, w9 +; CHECK-GI-NEXT: eor w0, w8, w9 +; CHECK-GI-NEXT: ret + %r1 = call i32 
@llvm.vector.reduce.xor.i32.v4i32(<4 x i32> %a) + %r2 = call i32 @llvm.vector.reduce.xor.i32.v4i32(<4 x i32> %b) + %r = xor i32 %r1, %r2 + ret i32 %r +} + define i32 @umin_i32(<8 x i32> %a, <4 x i32> %b) { -; CHECK-LABEL: umin_i32: -; CHECK: // %bb.0: -; CHECK-NEXT: umin v0.4s, v0.4s, v1.4s -; CHECK-NEXT: umin v0.4s, v0.4s, v2.4s -; CHECK-NEXT: uminv s0, v0.4s -; CHECK-NEXT: fmov w0, s0 -; CHECK-NEXT: ret +; CHECK-SD-LABEL: umin_i32: +; CHECK-SD: // %bb.0: +; CHECK-SD-NEXT: umin v0.4s, v0.4s, v1.4s +; CHECK-SD-NEXT: umin v0.4s, v0.4s, v2.4s +; CHECK-SD-NEXT: uminv s0, v0.4s +; CHECK-SD-NEXT: fmov w0, s0 +; CHECK-SD-NEXT: ret +; +; CHECK-GI-LABEL: umin_i32: +; CHECK-GI: // %bb.0: +; CHECK-GI-NEXT: umin v0.4s, v0.4s, v1.4s +; CHECK-GI-NEXT: uminv s1, v2.4s +; CHECK-GI-NEXT: uminv s0, v0.4s +; CHECK-GI-NEXT: fmov w9, s1 +; CHECK-GI-NEXT: fmov w8, s0 +; CHECK-GI-NEXT: cmp w8, w9 +; CHECK-GI-NEXT: fcsel s0, s0, s1, lo +; CHECK-GI-NEXT: fmov w0, s0 +; CHECK-GI-NEXT: ret %r1 = call i32 @llvm.vector.reduce.umin.i32.v8i32(<8 x i32> %a) %r2 = call i32 @llvm.vector.reduce.umin.i32.v4i32(<4 x i32> %b) %r = call i32 @llvm.umin.i32(i32 %r1, i32 %r2) ret i32 %r } +define i32 @umin_i32_same(<4 x i32> %a, <4 x i32> %b) { +; CHECK-SD-LABEL: umin_i32_same: +; CHECK-SD: // %bb.0: +; CHECK-SD-NEXT: umin v0.4s, v0.4s, v1.4s +; CHECK-SD-NEXT: uminv s0, v0.4s +; CHECK-SD-NEXT: fmov w0, s0 +; CHECK-SD-NEXT: ret +; +; CHECK-GI-LABEL: umin_i32_same: +; CHECK-GI: // %bb.0: +; CHECK-GI-NEXT: uminv s0, v0.4s +; CHECK-GI-NEXT: uminv s1, v1.4s +; CHECK-GI-NEXT: fmov w8, s0 +; CHECK-GI-NEXT: fmov w9, s1 +; CHECK-GI-NEXT: cmp w8, w9 +; CHECK-GI-NEXT: fcsel s0, s0, s1, lo +; CHECK-GI-NEXT: fmov w0, s0 +; CHECK-GI-NEXT: ret + %r1 = call i32 @llvm.vector.reduce.umin.i32.v4i32(<4 x i32> %a) + %r2 = call i32 @llvm.vector.reduce.umin.i32.v4i32(<4 x i32> %b) + %r = call i32 @llvm.umin.i32(i32 %r1, i32 %r2) + ret i32 %r +} + define i32 @umax_i32(<8 x i32> %a, <4 x i32> %b) { -; CHECK-LABEL: umax_i32: -; 
CHECK: // %bb.0: -; CHECK-NEXT: umax v0.4s, v0.4s, v1.4s -; CHECK-NEXT: umax v0.4s, v0.4s, v2.4s -; CHECK-NEXT: umaxv s0, v0.4s -; CHECK-NEXT: fmov w0, s0 -; CHECK-NEXT: ret +; CHECK-SD-LABEL: umax_i32: +; CHECK-SD: // %bb.0: +; CHECK-SD-NEXT: umax v0.4s, v0.4s, v1.4s +; CHECK-SD-NEXT: umax v0.4s, v0.4s, v2.4s +; CHECK-SD-NEXT: umaxv s0, v0.4s +; CHECK-SD-NEXT: fmov w0, s0 +; CHECK-SD-NEXT: ret +; +; CHECK-GI-LABEL: umax_i32: +; CHECK-GI: // %bb.0: +; CHECK-GI-NEXT: umax v0.4s, v0.4s, v1.4s +; CHECK-GI-NEXT: umaxv s1, v2.4s +; CHECK-GI-NEXT: umaxv s0, v0.4s +; CHECK-GI-NEXT: fmov w9, s1 +; CHECK-GI-NEXT: fmov w8, s0 +; CHECK-GI-NEXT: cmp w8, w9 +; CHECK-GI-NEXT: fcsel s0, s0, s1, hi +; CHECK-GI-NEXT: fmov w0, s0 +; CHECK-GI-NEXT: ret %r1 = call i32 @llvm.vector.reduce.umax.i32.v8i32(<8 x i32> %a) %r2 = call i32 @llvm.vector.reduce.umax.i32.v4i32(<4 x i32> %b) %r = call i32 @llvm.umax.i32(i32 %r1, i32 %r2) ret i32 %r } +define i32 @umax_i32_same(<4 x i32> %a, <4 x i32> %b) { +; CHECK-SD-LABEL: umax_i32_same: +; CHECK-SD: // %bb.0: +; CHECK-SD-NEXT: umax v0.4s, v0.4s, v1.4s +; CHECK-SD-NEXT: umaxv s0, v0.4s +; CHECK-SD-NEXT: fmov w0, s0 +; CHECK-SD-NEXT: ret +; +; CHECK-GI-LABEL: umax_i32_same: +; CHECK-GI: // %bb.0: +; CHECK-GI-NEXT: umaxv s0, v0.4s +; CHECK-GI-NEXT: umaxv s1, v1.4s +; CHECK-GI-NEXT: fmov w8, s0 +; CHECK-GI-NEXT: fmov w9, s1 +; CHECK-GI-NEXT: cmp w8, w9 +; CHECK-GI-NEXT: fcsel s0, s0, s1, hi +; CHECK-GI-NEXT: fmov w0, s0 +; CHECK-GI-NEXT: ret + %r1 = call i32 @llvm.vector.reduce.umax.i32.v4i32(<4 x i32> %a) + %r2 = call i32 @llvm.vector.reduce.umax.i32.v4i32(<4 x i32> %b) + %r = call i32 @llvm.umax.i32(i32 %r1, i32 %r2) + ret i32 %r +} + define i32 @smin_i32(<8 x i32> %a, <4 x i32> %b) { -; CHECK-LABEL: smin_i32: -; CHECK: // %bb.0: -; CHECK-NEXT: smin v0.4s, v0.4s, v1.4s -; CHECK-NEXT: smin v0.4s, v0.4s, v2.4s -; CHECK-NEXT: sminv s0, v0.4s -; CHECK-NEXT: fmov w0, s0 -; CHECK-NEXT: ret +; CHECK-SD-LABEL: smin_i32: +; CHECK-SD: // %bb.0: +; 
CHECK-SD-NEXT: smin v0.4s, v0.4s, v1.4s +; CHECK-SD-NEXT: smin v0.4s, v0.4s, v2.4s +; CHECK-SD-NEXT: sminv s0, v0.4s +; CHECK-SD-NEXT: fmov w0, s0 +; CHECK-SD-NEXT: ret +; +; CHECK-GI-LABEL: smin_i32: +; CHECK-GI: // %bb.0: +; CHECK-GI-NEXT: smin v0.4s, v0.4s, v1.4s +; CHECK-GI-NEXT: sminv s1, v2.4s +; CHECK-GI-NEXT: sminv s0, v0.4s +; CHECK-GI-NEXT: fmov w9, s1 +; CHECK-GI-NEXT: fmov w8, s0 +; CHECK-GI-NEXT: cmp w8, w9 +; CHECK-GI-NEXT: fcsel s0, s0, s1, lt +; CHECK-GI-NEXT: fmov w0, s0 +; CHECK-GI-NEXT: ret %r1 = call i32 @llvm.vector.reduce.smin.i32.v8i32(<8 x i32> %a) %r2 = call i32 @llvm.vector.reduce.smin.i32.v4i32(<4 x i32> %b) %r = call i32 @llvm.smin.i32(i32 %r1, i32 %r2) ret i32 %r } +define i32 @smin_i32_same(<4 x i32> %a, <4 x i32> %b) { +; CHECK-SD-LABEL: smin_i32_same: +; CHECK-SD: // %bb.0: +; CHECK-SD-NEXT: smin v0.4s, v0.4s, v1.4s +; CHECK-SD-NEXT: sminv s0, v0.4s +; CHECK-SD-NEXT: fmov w0, s0 +; CHECK-SD-NEXT: ret +; +; CHECK-GI-LABEL: smin_i32_same: +; CHECK-GI: // %bb.0: +; CHECK-GI-NEXT: sminv s0, v0.4s +; CHECK-GI-NEXT: sminv s1, v1.4s +; CHECK-GI-NEXT: fmov w8, s0 +; CHECK-GI-NEXT: fmov w9, s1 +; CHECK-GI-NEXT: cmp w8, w9 +; CHECK-GI-NEXT: fcsel s0, s0, s1, lt +; CHECK-GI-NEXT: fmov w0, s0 +; CHECK-GI-NEXT: ret + %r1 = call i32 @llvm.vector.reduce.smin.i32.v4i32(<4 x i32> %a) + %r2 = call i32 @llvm.vector.reduce.smin.i32.v4i32(<4 x i32> %b) + %r = call i32 @llvm.smin.i32(i32 %r1, i32 %r2) + ret i32 %r +} + define i32 @smax_i32(<8 x i32> %a, <4 x i32> %b) { -; CHECK-LABEL: smax_i32: -; CHECK: // %bb.0: -; CHECK-NEXT: smax v0.4s, v0.4s, v1.4s -; CHECK-NEXT: smax v0.4s, v0.4s, v2.4s -; CHECK-NEXT: smaxv s0, v0.4s -; CHECK-NEXT: fmov w0, s0 -; CHECK-NEXT: ret +; CHECK-SD-LABEL: smax_i32: +; CHECK-SD: // %bb.0: +; CHECK-SD-NEXT: smax v0.4s, v0.4s, v1.4s +; CHECK-SD-NEXT: smax v0.4s, v0.4s, v2.4s +; CHECK-SD-NEXT: smaxv s0, v0.4s +; CHECK-SD-NEXT: fmov w0, s0 +; CHECK-SD-NEXT: ret +; +; CHECK-GI-LABEL: smax_i32: +; CHECK-GI: // %bb.0: +; 
CHECK-GI-NEXT: smax v0.4s, v0.4s, v1.4s +; CHECK-GI-NEXT: smaxv s1, v2.4s +; CHECK-GI-NEXT: smaxv s0, v0.4s +; CHECK-GI-NEXT: fmov w9, s1 +; CHECK-GI-NEXT: fmov w8, s0 +; CHECK-GI-NEXT: cmp w8, w9 +; CHECK-GI-NEXT: fcsel s0, s0, s1, gt +; CHECK-GI-NEXT: fmov w0, s0 +; CHECK-GI-NEXT: ret %r1 = call i32 @llvm.vector.reduce.smax.i32.v8i32(<8 x i32> %a) %r2 = call i32 @llvm.vector.reduce.smax.i32.v4i32(<4 x i32> %b) %r = call i32 @llvm.smax.i32(i32 %r1, i32 %r2) ret i32 %r } +define i32 @smax_i32_same(<4 x i32> %a, <4 x i32> %b) { +; CHECK-SD-LABEL: smax_i32_same: +; CHECK-SD: // %bb.0: +; CHECK-SD-NEXT: smax v0.4s, v0.4s, v1.4s +; CHECK-SD-NEXT: smaxv s0, v0.4s +; CHECK-SD-NEXT: fmov w0, s0 +; CHECK-SD-NEXT: ret +; +; CHECK-GI-LABEL: smax_i32_same: +; CHECK-GI: // %bb.0: +; CHECK-GI-NEXT: smaxv s0, v0.4s +; CHECK-GI-NEXT: smaxv s1, v1.4s +; CHECK-GI-NEXT: fmov w8, s0 +; CHECK-GI-NEXT: fmov w9, s1 +; CHECK-GI-NEXT: cmp w8, w9 +; CHECK-GI-NEXT: fcsel s0, s0, s1, gt +; CHECK-GI-NEXT: fmov w0, s0 +; CHECK-GI-NEXT: ret + %r1 = call i32 @llvm.vector.reduce.smax.i32.v4i32(<4 x i32> %a) + %r2 = call i32 @llvm.vector.reduce.smax.i32.v4i32(<4 x i32> %b) + %r = call i32 @llvm.smax.i32(i32 %r1, i32 %r2) + ret i32 %r +} + define float @nested_fadd_f32(<4 x float> %a, <4 x float> %b, float %c, float %d) { -; CHECK-LABEL: nested_fadd_f32: -; CHECK: // %bb.0: -; CHECK-NEXT: faddp v1.4s, v1.4s, v1.4s -; CHECK-NEXT: faddp v0.4s, v0.4s, v0.4s -; CHECK-NEXT: faddp s1, v1.2s -; CHECK-NEXT: faddp s0, v0.2s -; CHECK-NEXT: fadd s1, s1, s3 -; CHECK-NEXT: fadd s0, s0, s2 -; CHECK-NEXT: fadd s0, s0, s1 -; CHECK-NEXT: ret +; CHECK-SD-LABEL: nested_fadd_f32: +; CHECK-SD: // %bb.0: +; CHECK-SD-NEXT: faddp v1.4s, v1.4s, v1.4s +; CHECK-SD-NEXT: faddp v0.4s, v0.4s, v0.4s +; CHECK-SD-NEXT: faddp s1, v1.2s +; CHECK-SD-NEXT: faddp s0, v0.2s +; CHECK-SD-NEXT: fadd s1, s1, s3 +; CHECK-SD-NEXT: fadd s0, s0, s2 +; CHECK-SD-NEXT: fadd s0, s0, s1 +; CHECK-SD-NEXT: ret +; +; CHECK-GI-LABEL: nested_fadd_f32: +; 
CHECK-GI: // %bb.0: +; CHECK-GI-NEXT: faddp v0.4s, v0.4s, v0.4s +; CHECK-GI-NEXT: faddp v1.4s, v1.4s, v1.4s +; CHECK-GI-NEXT: faddp s0, v0.2s +; CHECK-GI-NEXT: faddp s1, v1.2s +; CHECK-GI-NEXT: fadd s0, s0, s2 +; CHECK-GI-NEXT: fadd s1, s1, s3 +; CHECK-GI-NEXT: fadd s0, s0, s1 +; CHECK-GI-NEXT: ret %r1 = call fast float @llvm.vector.reduce.fadd.f32.v4f32(float -0.0, <4 x float> %a) %a1 = fadd fast float %r1, %c %r2 = call fast float @llvm.vector.reduce.fadd.f32.v4f32(float -0.0, <4 x float> %b) @@ -305,22 +861,39 @@ define float @nested_fadd_f32(<4 x float> %a, <4 x float> %b, float %c, float %d } define float @nested_fadd_f32_slow(<4 x float> %a, <4 x float> %b, float %c, float %d) { -; CHECK-LABEL: nested_fadd_f32_slow: -; CHECK: // %bb.0: -; CHECK-NEXT: mov s4, v1.s[2] -; CHECK-NEXT: mov s5, v0.s[2] -; CHECK-NEXT: faddp s6, v0.2s -; CHECK-NEXT: faddp s7, v1.2s -; CHECK-NEXT: mov s1, v1.s[3] -; CHECK-NEXT: mov s0, v0.s[3] -; CHECK-NEXT: fadd s5, s6, s5 -; CHECK-NEXT: fadd s4, s7, s4 -; CHECK-NEXT: fadd s0, s5, s0 -; CHECK-NEXT: fadd s1, s4, s1 -; CHECK-NEXT: fadd s0, s0, s2 -; CHECK-NEXT: fadd s1, s1, s3 -; CHECK-NEXT: fadd s0, s0, s1 -; CHECK-NEXT: ret +; CHECK-SD-LABEL: nested_fadd_f32_slow: +; CHECK-SD: // %bb.0: +; CHECK-SD-NEXT: mov s4, v1.s[2] +; CHECK-SD-NEXT: mov s5, v0.s[2] +; CHECK-SD-NEXT: faddp s6, v0.2s +; CHECK-SD-NEXT: faddp s7, v1.2s +; CHECK-SD-NEXT: mov s1, v1.s[3] +; CHECK-SD-NEXT: mov s0, v0.s[3] +; CHECK-SD-NEXT: fadd s5, s6, s5 +; CHECK-SD-NEXT: fadd s4, s7, s4 +; CHECK-SD-NEXT: fadd s0, s5, s0 +; CHECK-SD-NEXT: fadd s1, s4, s1 +; CHECK-SD-NEXT: fadd s0, s0, s2 +; CHECK-SD-NEXT: fadd s1, s1, s3 +; CHECK-SD-NEXT: fadd s0, s0, s1 +; CHECK-SD-NEXT: ret +; +; CHECK-GI-LABEL: nested_fadd_f32_slow: +; CHECK-GI: // %bb.0: +; CHECK-GI-NEXT: mov s4, v0.s[2] +; CHECK-GI-NEXT: faddp s5, v0.2s +; CHECK-GI-NEXT: mov s6, v1.s[2] +; CHECK-GI-NEXT: faddp s7, v1.2s +; CHECK-GI-NEXT: mov s0, v0.s[3] +; CHECK-GI-NEXT: mov s1, v1.s[3] +; CHECK-GI-NEXT: fadd s4, 
s5, s4 +; CHECK-GI-NEXT: fadd s5, s7, s6 +; CHECK-GI-NEXT: fadd s0, s4, s0 +; CHECK-GI-NEXT: fadd s1, s5, s1 +; CHECK-GI-NEXT: fadd s0, s0, s2 +; CHECK-GI-NEXT: fadd s1, s1, s3 +; CHECK-GI-NEXT: fadd s0, s0, s1 +; CHECK-GI-NEXT: ret %r1 = call float @llvm.vector.reduce.fadd.f32.v4f32(float -0.0, <4 x float> %a) %a1 = fadd float %r1, %c %r2 = call float @llvm.vector.reduce.fadd.f32.v4f32(float -0.0, <4 x float> %b) @@ -330,18 +903,33 @@ define float @nested_fadd_f32_slow(<4 x float> %a, <4 x float> %b, float %c, flo } define float @nested_mul_f32(<4 x float> %a, <4 x float> %b, float %c, float %d) { -; CHECK-LABEL: nested_mul_f32: -; CHECK: // %bb.0: -; CHECK-NEXT: ext v4.16b, v1.16b, v1.16b, #8 -; CHECK-NEXT: ext v5.16b, v0.16b, v0.16b, #8 -; CHECK-NEXT: fmul v1.2s, v1.2s, v4.2s -; CHECK-NEXT: fmul v0.2s, v0.2s, v5.2s -; CHECK-NEXT: fmul s1, s1, v1.s[1] -; CHECK-NEXT: fmul s0, s0, v0.s[1] -; CHECK-NEXT: fmul s1, s1, s3 -; CHECK-NEXT: fmul s0, s0, s2 -; CHECK-NEXT: fmul s0, s0, s1 -; CHECK-NEXT: ret +; CHECK-SD-LABEL: nested_mul_f32: +; CHECK-SD: // %bb.0: +; CHECK-SD-NEXT: ext v4.16b, v1.16b, v1.16b, #8 +; CHECK-SD-NEXT: ext v5.16b, v0.16b, v0.16b, #8 +; CHECK-SD-NEXT: fmul v1.2s, v1.2s, v4.2s +; CHECK-SD-NEXT: fmul v0.2s, v0.2s, v5.2s +; CHECK-SD-NEXT: fmul s1, s1, v1.s[1] +; CHECK-SD-NEXT: fmul s0, s0, v0.s[1] +; CHECK-SD-NEXT: fmul s1, s1, s3 +; CHECK-SD-NEXT: fmul s0, s0, s2 +; CHECK-SD-NEXT: fmul s0, s0, s1 +; CHECK-SD-NEXT: ret +; +; CHECK-GI-LABEL: nested_mul_f32: +; CHECK-GI: // %bb.0: +; CHECK-GI-NEXT: mov d4, v0.d[1] +; CHECK-GI-NEXT: mov d5, v1.d[1] +; CHECK-GI-NEXT: fmul v0.2s, v0.2s, v4.2s +; CHECK-GI-NEXT: fmul v1.2s, v1.2s, v5.2s +; CHECK-GI-NEXT: mov s4, v0.s[1] +; CHECK-GI-NEXT: mov s5, v1.s[1] +; CHECK-GI-NEXT: fmul s0, s0, s4 +; CHECK-GI-NEXT: fmul s1, s1, s5 +; CHECK-GI-NEXT: fmul s0, s0, s2 +; CHECK-GI-NEXT: fmul s1, s1, s3 +; CHECK-GI-NEXT: fmul s0, s0, s1 +; CHECK-GI-NEXT: ret %r1 = call fast float @llvm.vector.reduce.fmul.f32.v4f32(float 
1.0, <4 x float> %a) %a1 = fmul fast float %r1, %c %r2 = call fast float @llvm.vector.reduce.fmul.f32.v4f32(float 1.0, <4 x float> %b) @@ -351,16 +939,27 @@ define float @nested_mul_f32(<4 x float> %a, <4 x float> %b, float %c, float %d) } define i32 @nested_add_i32(<4 x i32> %a, <4 x i32> %b, i32 %c, i32 %d) { -; CHECK-LABEL: nested_add_i32: -; CHECK: // %bb.0: -; CHECK-NEXT: addv s1, v1.4s -; CHECK-NEXT: addv s0, v0.4s -; CHECK-NEXT: fmov w8, s1 -; CHECK-NEXT: fmov w9, s0 -; CHECK-NEXT: add w9, w9, w0 -; CHECK-NEXT: add w8, w8, w1 -; CHECK-NEXT: add w0, w9, w8 -; CHECK-NEXT: ret +; CHECK-SD-LABEL: nested_add_i32: +; CHECK-SD: // %bb.0: +; CHECK-SD-NEXT: addv s1, v1.4s +; CHECK-SD-NEXT: addv s0, v0.4s +; CHECK-SD-NEXT: fmov w8, s1 +; CHECK-SD-NEXT: fmov w9, s0 +; CHECK-SD-NEXT: add w9, w9, w0 +; CHECK-SD-NEXT: add w8, w8, w1 +; CHECK-SD-NEXT: add w0, w9, w8 +; CHECK-SD-NEXT: ret +; +; CHECK-GI-LABEL: nested_add_i32: +; CHECK-GI: // %bb.0: +; CHECK-GI-NEXT: addv s0, v0.4s +; CHECK-GI-NEXT: addv s1, v1.4s +; CHECK-GI-NEXT: fmov w8, s0 +; CHECK-GI-NEXT: fmov w9, s1 +; CHECK-GI-NEXT: add w8, w8, w0 +; CHECK-GI-NEXT: add w9, w9, w1 +; CHECK-GI-NEXT: add w0, w8, w9 +; CHECK-GI-NEXT: ret %r1 = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %a) %a1 = add i32 %r1, %c %r2 = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %b) @@ -370,16 +969,27 @@ define i32 @nested_add_i32(<4 x i32> %a, <4 x i32> %b, i32 %c, i32 %d) { } define i32 @nested_add_c1_i32(<4 x i32> %a, <4 x i32> %b, i32 %c, i32 %d) { -; CHECK-LABEL: nested_add_c1_i32: -; CHECK: // %bb.0: -; CHECK-NEXT: addv s1, v1.4s -; CHECK-NEXT: addv s0, v0.4s -; CHECK-NEXT: fmov w8, s1 -; CHECK-NEXT: fmov w9, s0 -; CHECK-NEXT: add w9, w0, w9 -; CHECK-NEXT: add w8, w8, w1 -; CHECK-NEXT: add w0, w9, w8 -; CHECK-NEXT: ret +; CHECK-SD-LABEL: nested_add_c1_i32: +; CHECK-SD: // %bb.0: +; CHECK-SD-NEXT: addv s1, v1.4s +; CHECK-SD-NEXT: addv s0, v0.4s +; CHECK-SD-NEXT: fmov w8, s1 +; CHECK-SD-NEXT: fmov w9, s0 +; CHECK-SD-NEXT: 
add w9, w0, w9 +; CHECK-SD-NEXT: add w8, w8, w1 +; CHECK-SD-NEXT: add w0, w9, w8 +; CHECK-SD-NEXT: ret +; +; CHECK-GI-LABEL: nested_add_c1_i32: +; CHECK-GI: // %bb.0: +; CHECK-GI-NEXT: addv s0, v0.4s +; CHECK-GI-NEXT: addv s1, v1.4s +; CHECK-GI-NEXT: fmov w8, s0 +; CHECK-GI-NEXT: fmov w9, s1 +; CHECK-GI-NEXT: add w8, w0, w8 +; CHECK-GI-NEXT: add w9, w9, w1 +; CHECK-GI-NEXT: add w0, w8, w9 +; CHECK-GI-NEXT: ret %r1 = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %a) %a1 = add i32 %c, %r1 %r2 = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %b) @@ -389,16 +999,27 @@ define i32 @nested_add_c1_i32(<4 x i32> %a, <4 x i32> %b, i32 %c, i32 %d) { } define i32 @nested_add_c2_i32(<4 x i32> %a, <4 x i32> %b, i32 %c, i32 %d) { -; CHECK-LABEL: nested_add_c2_i32: -; CHECK: // %bb.0: -; CHECK-NEXT: addv s1, v1.4s -; CHECK-NEXT: addv s0, v0.4s -; CHECK-NEXT: fmov w8, s1 -; CHECK-NEXT: fmov w9, s0 -; CHECK-NEXT: add w9, w9, w0 -; CHECK-NEXT: add w8, w1, w8 -; CHECK-NEXT: add w0, w9, w8 -; CHECK-NEXT: ret +; CHECK-SD-LABEL: nested_add_c2_i32: +; CHECK-SD: // %bb.0: +; CHECK-SD-NEXT: addv s1, v1.4s +; CHECK-SD-NEXT: addv s0, v0.4s +; CHECK-SD-NEXT: fmov w8, s1 +; CHECK-SD-NEXT: fmov w9, s0 +; CHECK-SD-NEXT: add w9, w9, w0 +; CHECK-SD-NEXT: add w8, w1, w8 +; CHECK-SD-NEXT: add w0, w9, w8 +; CHECK-SD-NEXT: ret +; +; CHECK-GI-LABEL: nested_add_c2_i32: +; CHECK-GI: // %bb.0: +; CHECK-GI-NEXT: addv s0, v0.4s +; CHECK-GI-NEXT: addv s1, v1.4s +; CHECK-GI-NEXT: fmov w8, s0 +; CHECK-GI-NEXT: fmov w9, s1 +; CHECK-GI-NEXT: add w8, w8, w0 +; CHECK-GI-NEXT: add w9, w1, w9 +; CHECK-GI-NEXT: add w0, w8, w9 +; CHECK-GI-NEXT: ret %r1 = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %a) %a1 = add i32 %r1, %c %r2 = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %b) @@ -408,14 +1029,29 @@ define i32 @nested_add_c2_i32(<4 x i32> %a, <4 x i32> %b, i32 %c, i32 %d) { } define i32 @nested_add_manyreduct_i32(<4 x i32> %a, <4 x i32> %b, <4 x i32> %c, <4 x i32> %d) { -; CHECK-LABEL: 
nested_add_manyreduct_i32: -; CHECK: // %bb.0: -; CHECK-NEXT: add v1.4s, v1.4s, v3.4s -; CHECK-NEXT: add v0.4s, v0.4s, v2.4s -; CHECK-NEXT: add v0.4s, v0.4s, v1.4s -; CHECK-NEXT: addv s0, v0.4s -; CHECK-NEXT: fmov w0, s0 -; CHECK-NEXT: ret +; CHECK-SD-LABEL: nested_add_manyreduct_i32: +; CHECK-SD: // %bb.0: +; CHECK-SD-NEXT: add v1.4s, v1.4s, v3.4s +; CHECK-SD-NEXT: add v0.4s, v0.4s, v2.4s +; CHECK-SD-NEXT: add v0.4s, v0.4s, v1.4s +; CHECK-SD-NEXT: addv s0, v0.4s +; CHECK-SD-NEXT: fmov w0, s0 +; CHECK-SD-NEXT: ret +; +; CHECK-GI-LABEL: nested_add_manyreduct_i32: +; CHECK-GI: // %bb.0: +; CHECK-GI-NEXT: addv s0, v0.4s +; CHECK-GI-NEXT: addv s2, v2.4s +; CHECK-GI-NEXT: addv s1, v1.4s +; CHECK-GI-NEXT: addv s3, v3.4s +; CHECK-GI-NEXT: fmov w8, s0 +; CHECK-GI-NEXT: fmov w9, s2 +; CHECK-GI-NEXT: fmov w10, s1 +; CHECK-GI-NEXT: fmov w11, s3 +; CHECK-GI-NEXT: add w8, w8, w9 +; CHECK-GI-NEXT: add w9, w10, w11 +; CHECK-GI-NEXT: add w0, w8, w9 +; CHECK-GI-NEXT: ret %r1 = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %a) %r3 = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %c) %a1 = add i32 %r1, %r3 @@ -427,22 +1063,39 @@ define i32 @nested_add_manyreduct_i32(<4 x i32> %a, <4 x i32> %b, <4 x i32> %c, } define i32 @nested_mul_i32(<4 x i32> %a, <4 x i32> %b, i32 %c, i32 %d) { -; CHECK-LABEL: nested_mul_i32: -; CHECK: // %bb.0: -; CHECK-NEXT: ext v3.16b, v0.16b, v0.16b, #8 -; CHECK-NEXT: ext v2.16b, v1.16b, v1.16b, #8 -; CHECK-NEXT: mul v0.2s, v0.2s, v3.2s -; CHECK-NEXT: mul v1.2s, v1.2s, v2.2s -; CHECK-NEXT: mov w8, v0.s[1] -; CHECK-NEXT: fmov w10, s0 -; CHECK-NEXT: mov w9, v1.s[1] -; CHECK-NEXT: mul w8, w10, w8 -; CHECK-NEXT: fmov w10, s1 -; CHECK-NEXT: mul w9, w10, w9 -; CHECK-NEXT: mul w8, w8, w0 -; CHECK-NEXT: mul w9, w9, w1 -; CHECK-NEXT: mul w0, w8, w9 -; CHECK-NEXT: ret +; CHECK-SD-LABEL: nested_mul_i32: +; CHECK-SD: // %bb.0: +; CHECK-SD-NEXT: ext v3.16b, v0.16b, v0.16b, #8 +; CHECK-SD-NEXT: ext v2.16b, v1.16b, v1.16b, #8 +; CHECK-SD-NEXT: mul v0.2s, v0.2s, v3.2s +; 
CHECK-SD-NEXT: mul v1.2s, v1.2s, v2.2s +; CHECK-SD-NEXT: mov w8, v0.s[1] +; CHECK-SD-NEXT: fmov w10, s0 +; CHECK-SD-NEXT: mov w9, v1.s[1] +; CHECK-SD-NEXT: mul w8, w10, w8 +; CHECK-SD-NEXT: fmov w10, s1 +; CHECK-SD-NEXT: mul w9, w10, w9 +; CHECK-SD-NEXT: mul w8, w8, w0 +; CHECK-SD-NEXT: mul w9, w9, w1 +; CHECK-SD-NEXT: mul w0, w8, w9 +; CHECK-SD-NEXT: ret +; +; CHECK-GI-LABEL: nested_mul_i32: +; CHECK-GI: // %bb.0: +; CHECK-GI-NEXT: mov d2, v0.d[1] +; CHECK-GI-NEXT: mov d3, v1.d[1] +; CHECK-GI-NEXT: mul v0.2s, v0.2s, v2.2s +; CHECK-GI-NEXT: mul v1.2s, v1.2s, v3.2s +; CHECK-GI-NEXT: mov w8, v0.s[1] +; CHECK-GI-NEXT: fmov w10, s0 +; CHECK-GI-NEXT: mov w9, v1.s[1] +; CHECK-GI-NEXT: mul w8, w10, w8 +; CHECK-GI-NEXT: fmov w10, s1 +; CHECK-GI-NEXT: mul w9, w10, w9 +; CHECK-GI-NEXT: mul w8, w8, w0 +; CHECK-GI-NEXT: mul w9, w9, w1 +; CHECK-GI-NEXT: mul w0, w8, w9 +; CHECK-GI-NEXT: ret %r1 = call i32 @llvm.vector.reduce.mul.v4i32(<4 x i32> %a) %a1 = mul i32 %r1, %c %r2 = call i32 @llvm.vector.reduce.mul.v4i32(<4 x i32> %b) @@ -452,22 +1105,39 @@ define i32 @nested_mul_i32(<4 x i32> %a, <4 x i32> %b, i32 %c, i32 %d) { } define i32 @nested_and_i32(<4 x i32> %a, <4 x i32> %b, i32 %c, i32 %d) { -; CHECK-LABEL: nested_and_i32: -; CHECK: // %bb.0: -; CHECK-NEXT: ext v2.16b, v1.16b, v1.16b, #8 -; CHECK-NEXT: ext v3.16b, v0.16b, v0.16b, #8 -; CHECK-NEXT: and v1.8b, v1.8b, v2.8b -; CHECK-NEXT: and v0.8b, v0.8b, v3.8b -; CHECK-NEXT: fmov x8, d1 -; CHECK-NEXT: fmov x9, d0 -; CHECK-NEXT: lsr x10, x9, #32 -; CHECK-NEXT: lsr x11, x8, #32 -; CHECK-NEXT: and w9, w9, w0 -; CHECK-NEXT: and w8, w8, w1 -; CHECK-NEXT: and w9, w9, w10 -; CHECK-NEXT: and w8, w8, w11 -; CHECK-NEXT: and w0, w9, w8 -; CHECK-NEXT: ret +; CHECK-SD-LABEL: nested_and_i32: +; CHECK-SD: // %bb.0: +; CHECK-SD-NEXT: ext v2.16b, v1.16b, v1.16b, #8 +; CHECK-SD-NEXT: ext v3.16b, v0.16b, v0.16b, #8 +; CHECK-SD-NEXT: and v1.8b, v1.8b, v2.8b +; CHECK-SD-NEXT: and v0.8b, v0.8b, v3.8b +; CHECK-SD-NEXT: fmov x8, d1 +; CHECK-SD-NEXT: 
fmov x9, d0 +; CHECK-SD-NEXT: lsr x10, x9, #32 +; CHECK-SD-NEXT: lsr x11, x8, #32 +; CHECK-SD-NEXT: and w9, w9, w0 +; CHECK-SD-NEXT: and w8, w8, w1 +; CHECK-SD-NEXT: and w9, w9, w10 +; CHECK-SD-NEXT: and w8, w8, w11 +; CHECK-SD-NEXT: and w0, w9, w8 +; CHECK-SD-NEXT: ret +; +; CHECK-GI-LABEL: nested_and_i32: +; CHECK-GI: // %bb.0: +; CHECK-GI-NEXT: mov d2, v0.d[1] +; CHECK-GI-NEXT: mov d3, v1.d[1] +; CHECK-GI-NEXT: and v0.8b, v0.8b, v2.8b +; CHECK-GI-NEXT: and v1.8b, v1.8b, v3.8b +; CHECK-GI-NEXT: mov w8, v0.s[1] +; CHECK-GI-NEXT: mov w9, v1.s[1] +; CHECK-GI-NEXT: fmov w10, s0 +; CHECK-GI-NEXT: fmov w11, s1 +; CHECK-GI-NEXT: and w10, w10, w0 +; CHECK-GI-NEXT: and w11, w11, w1 +; CHECK-GI-NEXT: and w8, w10, w8 +; CHECK-GI-NEXT: and w9, w11, w9 +; CHECK-GI-NEXT: and w0, w8, w9 +; CHECK-GI-NEXT: ret %r1 = call i32 @llvm.vector.reduce.and.v4i32(<4 x i32> %a) %a1 = and i32 %r1, %c %r2 = call i32 @llvm.vector.reduce.and.v4i32(<4 x i32> %b) @@ -477,22 +1147,39 @@ define i32 @nested_and_i32(<4 x i32> %a, <4 x i32> %b, i32 %c, i32 %d) { } define i32 @nested_or_i32(<4 x i32> %a, <4 x i32> %b, i32 %c, i32 %d) { -; CHECK-LABEL: nested_or_i32: -; CHECK: // %bb.0: -; CHECK-NEXT: ext v2.16b, v1.16b, v1.16b, #8 -; CHECK-NEXT: ext v3.16b, v0.16b, v0.16b, #8 -; CHECK-NEXT: orr v1.8b, v1.8b, v2.8b -; CHECK-NEXT: orr v0.8b, v0.8b, v3.8b -; CHECK-NEXT: fmov x8, d1 -; CHECK-NEXT: fmov x9, d0 -; CHECK-NEXT: lsr x10, x9, #32 -; CHECK-NEXT: lsr x11, x8, #32 -; CHECK-NEXT: orr w9, w9, w0 -; CHECK-NEXT: orr w8, w8, w1 -; CHECK-NEXT: orr w9, w9, w10 -; CHECK-NEXT: orr w8, w8, w11 -; CHECK-NEXT: orr w0, w9, w8 -; CHECK-NEXT: ret +; CHECK-SD-LABEL: nested_or_i32: +; CHECK-SD: // %bb.0: +; CHECK-SD-NEXT: ext v2.16b, v1.16b, v1.16b, #8 +; CHECK-SD-NEXT: ext v3.16b, v0.16b, v0.16b, #8 +; CHECK-SD-NEXT: orr v1.8b, v1.8b, v2.8b +; CHECK-SD-NEXT: orr v0.8b, v0.8b, v3.8b +; CHECK-SD-NEXT: fmov x8, d1 +; CHECK-SD-NEXT: fmov x9, d0 +; CHECK-SD-NEXT: lsr x10, x9, #32 +; CHECK-SD-NEXT: lsr x11, x8, #32 +; 
CHECK-SD-NEXT: orr w9, w9, w0 +; CHECK-SD-NEXT: orr w8, w8, w1 +; CHECK-SD-NEXT: orr w9, w9, w10 +; CHECK-SD-NEXT: orr w8, w8, w11 +; CHECK-SD-NEXT: orr w0, w9, w8 +; CHECK-SD-NEXT: ret +; +; CHECK-GI-LABEL: nested_or_i32: +; CHECK-GI: // %bb.0: +; CHECK-GI-NEXT: mov d2, v0.d[1] +; CHECK-GI-NEXT: mov d3, v1.d[1] +; CHECK-GI-NEXT: orr v0.8b, v0.8b, v2.8b +; CHECK-GI-NEXT: orr v1.8b, v1.8b, v3.8b +; CHECK-GI-NEXT: mov w8, v0.s[1] +; CHECK-GI-NEXT: mov w9, v1.s[1] +; CHECK-GI-NEXT: fmov w10, s0 +; CHECK-GI-NEXT: fmov w11, s1 +; CHECK-GI-NEXT: orr w10, w10, w0 +; CHECK-GI-NEXT: orr w11, w11, w1 +; CHECK-GI-NEXT: orr w8, w10, w8 +; CHECK-GI-NEXT: orr w9, w11, w9 +; CHECK-GI-NEXT: orr w0, w8, w9 +; CHECK-GI-NEXT: ret %r1 = call i32 @llvm.vector.reduce.or.v4i32(<4 x i32> %a) %a1 = or i32 %r1, %c %r2 = call i32 @llvm.vector.reduce.or.v4i32(<4 x i32> %b) @@ -502,22 +1189,39 @@ define i32 @nested_or_i32(<4 x i32> %a, <4 x i32> %b, i32 %c, i32 %d) { } define i32 @nested_xor_i32(<4 x i32> %a, <4 x i32> %b, i32 %c, i32 %d) { -; CHECK-LABEL: nested_xor_i32: -; CHECK: // %bb.0: -; CHECK-NEXT: ext v2.16b, v1.16b, v1.16b, #8 -; CHECK-NEXT: ext v3.16b, v0.16b, v0.16b, #8 -; CHECK-NEXT: eor v1.8b, v1.8b, v2.8b -; CHECK-NEXT: eor v0.8b, v0.8b, v3.8b -; CHECK-NEXT: fmov x8, d1 -; CHECK-NEXT: fmov x9, d0 -; CHECK-NEXT: lsr x10, x9, #32 -; CHECK-NEXT: lsr x11, x8, #32 -; CHECK-NEXT: eor w9, w9, w0 -; CHECK-NEXT: eor w8, w8, w1 -; CHECK-NEXT: eor w9, w9, w10 -; CHECK-NEXT: eor w8, w8, w11 -; CHECK-NEXT: eor w0, w9, w8 -; CHECK-NEXT: ret +; CHECK-SD-LABEL: nested_xor_i32: +; CHECK-SD: // %bb.0: +; CHECK-SD-NEXT: ext v2.16b, v1.16b, v1.16b, #8 +; CHECK-SD-NEXT: ext v3.16b, v0.16b, v0.16b, #8 +; CHECK-SD-NEXT: eor v1.8b, v1.8b, v2.8b +; CHECK-SD-NEXT: eor v0.8b, v0.8b, v3.8b +; CHECK-SD-NEXT: fmov x8, d1 +; CHECK-SD-NEXT: fmov x9, d0 +; CHECK-SD-NEXT: lsr x10, x9, #32 +; CHECK-SD-NEXT: lsr x11, x8, #32 +; CHECK-SD-NEXT: eor w9, w9, w0 +; CHECK-SD-NEXT: eor w8, w8, w1 +; CHECK-SD-NEXT: eor 
w9, w9, w10 +; CHECK-SD-NEXT: eor w8, w8, w11 +; CHECK-SD-NEXT: eor w0, w9, w8 +; CHECK-SD-NEXT: ret +; +; CHECK-GI-LABEL: nested_xor_i32: +; CHECK-GI: // %bb.0: +; CHECK-GI-NEXT: mov d2, v0.d[1] +; CHECK-GI-NEXT: mov d3, v1.d[1] +; CHECK-GI-NEXT: eor v0.8b, v0.8b, v2.8b +; CHECK-GI-NEXT: eor v1.8b, v1.8b, v3.8b +; CHECK-GI-NEXT: mov w8, v0.s[1] +; CHECK-GI-NEXT: mov w9, v1.s[1] +; CHECK-GI-NEXT: fmov w10, s0 +; CHECK-GI-NEXT: fmov w11, s1 +; CHECK-GI-NEXT: eor w10, w10, w0 +; CHECK-GI-NEXT: eor w11, w11, w1 +; CHECK-GI-NEXT: eor w8, w10, w8 +; CHECK-GI-NEXT: eor w9, w11, w9 +; CHECK-GI-NEXT: eor w0, w8, w9 +; CHECK-GI-NEXT: ret %r1 = call i32 @llvm.vector.reduce.xor.v4i32(<4 x i32> %a) %a1 = xor i32 %r1, %c %r2 = call i32 @llvm.vector.reduce.xor.v4i32(<4 x i32> %b) @@ -527,19 +1231,33 @@ define i32 @nested_xor_i32(<4 x i32> %a, <4 x i32> %b, i32 %c, i32 %d) { } define i32 @nested_smin_i32(<4 x i32> %a, <4 x i32> %b, i32 %c, i32 %d) { -; CHECK-LABEL: nested_smin_i32: -; CHECK: // %bb.0: -; CHECK-NEXT: sminv s0, v0.4s -; CHECK-NEXT: sminv s1, v1.4s -; CHECK-NEXT: fmov w9, s0 -; CHECK-NEXT: fmov w8, s1 -; CHECK-NEXT: cmp w9, w0 -; CHECK-NEXT: csel w9, w9, w0, lt -; CHECK-NEXT: cmp w8, w1 -; CHECK-NEXT: csel w8, w8, w1, lt -; CHECK-NEXT: cmp w9, w8 -; CHECK-NEXT: csel w0, w9, w8, lt -; CHECK-NEXT: ret +; CHECK-SD-LABEL: nested_smin_i32: +; CHECK-SD: // %bb.0: +; CHECK-SD-NEXT: sminv s0, v0.4s +; CHECK-SD-NEXT: sminv s1, v1.4s +; CHECK-SD-NEXT: fmov w9, s0 +; CHECK-SD-NEXT: fmov w8, s1 +; CHECK-SD-NEXT: cmp w9, w0 +; CHECK-SD-NEXT: csel w9, w9, w0, lt +; CHECK-SD-NEXT: cmp w8, w1 +; CHECK-SD-NEXT: csel w8, w8, w1, lt +; CHECK-SD-NEXT: cmp w9, w8 +; CHECK-SD-NEXT: csel w0, w9, w8, lt +; CHECK-SD-NEXT: ret +; +; CHECK-GI-LABEL: nested_smin_i32: +; CHECK-GI: // %bb.0: +; CHECK-GI-NEXT: sminv s0, v0.4s +; CHECK-GI-NEXT: sminv s1, v1.4s +; CHECK-GI-NEXT: fmov w8, s0 +; CHECK-GI-NEXT: fmov w9, s1 +; CHECK-GI-NEXT: cmp w8, w0 +; CHECK-GI-NEXT: csel w8, w8, w0, lt +; 
CHECK-GI-NEXT: cmp w9, w1 +; CHECK-GI-NEXT: csel w9, w9, w1, lt +; CHECK-GI-NEXT: cmp w8, w9 +; CHECK-GI-NEXT: csel w0, w8, w9, lt +; CHECK-GI-NEXT: ret %r1 = call i32 @llvm.vector.reduce.smin.v4i32(<4 x i32> %a) %a1 = call i32 @llvm.smin.i32(i32 %r1, i32 %c) %r2 = call i32 @llvm.vector.reduce.smin.v4i32(<4 x i32> %b) @@ -549,19 +1267,33 @@ define i32 @nested_smin_i32(<4 x i32> %a, <4 x i32> %b, i32 %c, i32 %d) { } define i32 @nested_smax_i32(<4 x i32> %a, <4 x i32> %b, i32 %c, i32 %d) { -; CHECK-LABEL: nested_smax_i32: -; CHECK: // %bb.0: -; CHECK-NEXT: smaxv s0, v0.4s -; CHECK-NEXT: smaxv s1, v1.4s -; CHECK-NEXT: fmov w9, s0 -; CHECK-NEXT: fmov w8, s1 -; CHECK-NEXT: cmp w9, w0 -; CHECK-NEXT: csel w9, w9, w0, gt -; CHECK-NEXT: cmp w8, w1 -; CHECK-NEXT: csel w8, w8, w1, gt -; CHECK-NEXT: cmp w9, w8 -; CHECK-NEXT: csel w0, w9, w8, gt -; CHECK-NEXT: ret +; CHECK-SD-LABEL: nested_smax_i32: +; CHECK-SD: // %bb.0: +; CHECK-SD-NEXT: smaxv s0, v0.4s +; CHECK-SD-NEXT: smaxv s1, v1.4s +; CHECK-SD-NEXT: fmov w9, s0 +; CHECK-SD-NEXT: fmov w8, s1 +; CHECK-SD-NEXT: cmp w9, w0 +; CHECK-SD-NEXT: csel w9, w9, w0, gt +; CHECK-SD-NEXT: cmp w8, w1 +; CHECK-SD-NEXT: csel w8, w8, w1, gt +; CHECK-SD-NEXT: cmp w9, w8 +; CHECK-SD-NEXT: csel w0, w9, w8, gt +; CHECK-SD-NEXT: ret +; +; CHECK-GI-LABEL: nested_smax_i32: +; CHECK-GI: // %bb.0: +; CHECK-GI-NEXT: smaxv s0, v0.4s +; CHECK-GI-NEXT: smaxv s1, v1.4s +; CHECK-GI-NEXT: fmov w8, s0 +; CHECK-GI-NEXT: fmov w9, s1 +; CHECK-GI-NEXT: cmp w8, w0 +; CHECK-GI-NEXT: csel w8, w8, w0, gt +; CHECK-GI-NEXT: cmp w9, w1 +; CHECK-GI-NEXT: csel w9, w9, w1, gt +; CHECK-GI-NEXT: cmp w8, w9 +; CHECK-GI-NEXT: csel w0, w8, w9, gt +; CHECK-GI-NEXT: ret %r1 = call i32 @llvm.vector.reduce.smax.v4i32(<4 x i32> %a) %a1 = call i32 @llvm.smax.i32(i32 %r1, i32 %c) %r2 = call i32 @llvm.vector.reduce.smax.v4i32(<4 x i32> %b) @@ -571,19 +1303,33 @@ define i32 @nested_smax_i32(<4 x i32> %a, <4 x i32> %b, i32 %c, i32 %d) { } define i32 @nested_umin_i32(<4 x i32> %a, <4 x 
i32> %b, i32 %c, i32 %d) { -; CHECK-LABEL: nested_umin_i32: -; CHECK: // %bb.0: -; CHECK-NEXT: uminv s0, v0.4s -; CHECK-NEXT: uminv s1, v1.4s -; CHECK-NEXT: fmov w9, s0 -; CHECK-NEXT: fmov w8, s1 -; CHECK-NEXT: cmp w9, w0 -; CHECK-NEXT: csel w9, w9, w0, lo -; CHECK-NEXT: cmp w8, w1 -; CHECK-NEXT: csel w8, w8, w1, lo -; CHECK-NEXT: cmp w9, w8 -; CHECK-NEXT: csel w0, w9, w8, lo -; CHECK-NEXT: ret +; CHECK-SD-LABEL: nested_umin_i32: +; CHECK-SD: // %bb.0: +; CHECK-SD-NEXT: uminv s0, v0.4s +; CHECK-SD-NEXT: uminv s1, v1.4s +; CHECK-SD-NEXT: fmov w9, s0 +; CHECK-SD-NEXT: fmov w8, s1 +; CHECK-SD-NEXT: cmp w9, w0 +; CHECK-SD-NEXT: csel w9, w9, w0, lo +; CHECK-SD-NEXT: cmp w8, w1 +; CHECK-SD-NEXT: csel w8, w8, w1, lo +; CHECK-SD-NEXT: cmp w9, w8 +; CHECK-SD-NEXT: csel w0, w9, w8, lo +; CHECK-SD-NEXT: ret +; +; CHECK-GI-LABEL: nested_umin_i32: +; CHECK-GI: // %bb.0: +; CHECK-GI-NEXT: uminv s0, v0.4s +; CHECK-GI-NEXT: uminv s1, v1.4s +; CHECK-GI-NEXT: fmov w8, s0 +; CHECK-GI-NEXT: fmov w9, s1 +; CHECK-GI-NEXT: cmp w8, w0 +; CHECK-GI-NEXT: csel w8, w8, w0, lo +; CHECK-GI-NEXT: cmp w9, w1 +; CHECK-GI-NEXT: csel w9, w9, w1, lo +; CHECK-GI-NEXT: cmp w8, w9 +; CHECK-GI-NEXT: csel w0, w8, w9, lo +; CHECK-GI-NEXT: ret %r1 = call i32 @llvm.vector.reduce.umin.v4i32(<4 x i32> %a) %a1 = call i32 @llvm.umin.i32(i32 %r1, i32 %c) %r2 = call i32 @llvm.vector.reduce.umin.v4i32(<4 x i32> %b) @@ -593,19 +1339,33 @@ define i32 @nested_umin_i32(<4 x i32> %a, <4 x i32> %b, i32 %c, i32 %d) { } define i32 @nested_umax_i32(<4 x i32> %a, <4 x i32> %b, i32 %c, i32 %d) { -; CHECK-LABEL: nested_umax_i32: -; CHECK: // %bb.0: -; CHECK-NEXT: umaxv s0, v0.4s -; CHECK-NEXT: umaxv s1, v1.4s -; CHECK-NEXT: fmov w9, s0 -; CHECK-NEXT: fmov w8, s1 -; CHECK-NEXT: cmp w9, w0 -; CHECK-NEXT: csel w9, w9, w0, hi -; CHECK-NEXT: cmp w8, w1 -; CHECK-NEXT: csel w8, w8, w1, hi -; CHECK-NEXT: cmp w9, w8 -; CHECK-NEXT: csel w0, w9, w8, hi -; CHECK-NEXT: ret +; CHECK-SD-LABEL: nested_umax_i32: +; CHECK-SD: // %bb.0: +; 
CHECK-SD-NEXT: umaxv s0, v0.4s +; CHECK-SD-NEXT: umaxv s1, v1.4s +; CHECK-SD-NEXT: fmov w9, s0 +; CHECK-SD-NEXT: fmov w8, s1 +; CHECK-SD-NEXT: cmp w9, w0 +; CHECK-SD-NEXT: csel w9, w9, w0, hi +; CHECK-SD-NEXT: cmp w8, w1 +; CHECK-SD-NEXT: csel w8, w8, w1, hi +; CHECK-SD-NEXT: cmp w9, w8 +; CHECK-SD-NEXT: csel w0, w9, w8, hi +; CHECK-SD-NEXT: ret +; +; CHECK-GI-LABEL: nested_umax_i32: +; CHECK-GI: // %bb.0: +; CHECK-GI-NEXT: umaxv s0, v0.4s +; CHECK-GI-NEXT: umaxv s1, v1.4s +; CHECK-GI-NEXT: fmov w8, s0 +; CHECK-GI-NEXT: fmov w9, s1 +; CHECK-GI-NEXT: cmp w8, w0 +; CHECK-GI-NEXT: csel w8, w8, w0, hi +; CHECK-GI-NEXT: cmp w9, w1 +; CHECK-GI-NEXT: csel w9, w9, w1, hi +; CHECK-GI-NEXT: cmp w8, w9 +; CHECK-GI-NEXT: csel w0, w8, w9, hi +; CHECK-GI-NEXT: ret %r1 = call i32 @llvm.vector.reduce.umax.v4i32(<4 x i32> %a) %a1 = call i32 @llvm.umax.i32(i32 %r1, i32 %c) %r2 = call i32 @llvm.vector.reduce.umax.v4i32(<4 x i32> %b) @@ -615,14 +1375,23 @@ define i32 @nested_umax_i32(<4 x i32> %a, <4 x i32> %b, i32 %c, i32 %d) { } define float @nested_fmin_float(<4 x float> %a, <4 x float> %b, float %c, float %d) { -; CHECK-LABEL: nested_fmin_float: -; CHECK: // %bb.0: -; CHECK-NEXT: fminnmv s1, v1.4s -; CHECK-NEXT: fminnmv s0, v0.4s -; CHECK-NEXT: fminnm s1, s1, s3 -; CHECK-NEXT: fminnm s0, s0, s2 -; CHECK-NEXT: fminnm s0, s0, s1 -; CHECK-NEXT: ret +; CHECK-SD-LABEL: nested_fmin_float: +; CHECK-SD: // %bb.0: +; CHECK-SD-NEXT: fminnmv s1, v1.4s +; CHECK-SD-NEXT: fminnmv s0, v0.4s +; CHECK-SD-NEXT: fminnm s1, s1, s3 +; CHECK-SD-NEXT: fminnm s0, s0, s2 +; CHECK-SD-NEXT: fminnm s0, s0, s1 +; CHECK-SD-NEXT: ret +; +; CHECK-GI-LABEL: nested_fmin_float: +; CHECK-GI: // %bb.0: +; CHECK-GI-NEXT: fminnmv s0, v0.4s +; CHECK-GI-NEXT: fminnmv s1, v1.4s +; CHECK-GI-NEXT: fminnm s0, s0, s2 +; CHECK-GI-NEXT: fminnm s1, s1, s3 +; CHECK-GI-NEXT: fminnm s0, s0, s1 +; CHECK-GI-NEXT: ret %r1 = call float @llvm.vector.reduce.fmin.v4f32(<4 x float> %a) %a1 = call float @llvm.minnum.f32(float %r1, float %c) 
%r2 = call float @llvm.vector.reduce.fmin.v4f32(<4 x float> %b) @@ -632,14 +1401,23 @@ define float @nested_fmin_float(<4 x float> %a, <4 x float> %b, float %c, float } define float @nested_fmax_float(<4 x float> %a, <4 x float> %b, float %c, float %d) { -; CHECK-LABEL: nested_fmax_float: -; CHECK: // %bb.0: -; CHECK-NEXT: fmaxnmv s1, v1.4s -; CHECK-NEXT: fmaxnmv s0, v0.4s -; CHECK-NEXT: fmaxnm s1, s1, s3 -; CHECK-NEXT: fmaxnm s0, s0, s2 -; CHECK-NEXT: fmaxnm s0, s0, s1 -; CHECK-NEXT: ret +; CHECK-SD-LABEL: nested_fmax_float: +; CHECK-SD: // %bb.0: +; CHECK-SD-NEXT: fmaxnmv s1, v1.4s +; CHECK-SD-NEXT: fmaxnmv s0, v0.4s +; CHECK-SD-NEXT: fmaxnm s1, s1, s3 +; CHECK-SD-NEXT: fmaxnm s0, s0, s2 +; CHECK-SD-NEXT: fmaxnm s0, s0, s1 +; CHECK-SD-NEXT: ret +; +; CHECK-GI-LABEL: nested_fmax_float: +; CHECK-GI: // %bb.0: +; CHECK-GI-NEXT: fmaxnmv s0, v0.4s +; CHECK-GI-NEXT: fmaxnmv s1, v1.4s +; CHECK-GI-NEXT: fmaxnm s0, s0, s2 +; CHECK-GI-NEXT: fmaxnm s1, s1, s3 +; CHECK-GI-NEXT: fmaxnm s0, s0, s1 +; CHECK-GI-NEXT: ret %r1 = call float @llvm.vector.reduce.fmax.v4f32(<4 x float> %a) %a1 = call float @llvm.maxnum.f32(float %r1, float %c) %r2 = call float @llvm.vector.reduce.fmax.v4f32(<4 x float> %b) diff --git a/llvm/test/CodeGen/AArch64/extract-bits.ll b/llvm/test/CodeGen/AArch64/extract-bits.ll index aaa6c7eb4a30f..b87157a183835 100644 --- a/llvm/test/CodeGen/AArch64/extract-bits.ll +++ b/llvm/test/CodeGen/AArch64/extract-bits.ll @@ -1013,8 +1013,8 @@ define i32 @c1_i32(i32 %arg) nounwind { define i32 @c2_i32(i32 %arg) nounwind { ; CHECK-LABEL: c2_i32: ; CHECK: // %bb.0: -; CHECK-NEXT: lsr w8, w0, #17 -; CHECK-NEXT: and w0, w8, #0xffc +; CHECK-NEXT: ubfx w8, w0, #19, #10 +; CHECK-NEXT: lsl w0, w8, #2 ; CHECK-NEXT: ret %tmp0 = lshr i32 %arg, 19 %tmp1 = and i32 %tmp0, 1023 @@ -1063,8 +1063,8 @@ define i64 @c1_i64(i64 %arg) nounwind { define i64 @c2_i64(i64 %arg) nounwind { ; CHECK-LABEL: c2_i64: ; CHECK: // %bb.0: -; CHECK-NEXT: lsr x8, x0, #49 -; CHECK-NEXT: and x0, x8, #0xffc 
+; CHECK-NEXT: ubfx x8, x0, #51, #10 +; CHECK-NEXT: lsl x0, x8, #2 ; CHECK-NEXT: ret %tmp0 = lshr i64 %arg, 51 %tmp1 = and i64 %tmp0, 1023 @@ -1120,8 +1120,8 @@ define void @c6_i32(i32 %arg, ptr %ptr) nounwind { define void @c7_i32(i32 %arg, ptr %ptr) nounwind { ; CHECK-LABEL: c7_i32: ; CHECK: // %bb.0: -; CHECK-NEXT: lsr w8, w0, #17 -; CHECK-NEXT: and w8, w8, #0xffc +; CHECK-NEXT: ubfx w8, w0, #19, #10 +; CHECK-NEXT: lsl w8, w8, #2 ; CHECK-NEXT: str w8, [x1] ; CHECK-NEXT: ret %tmp0 = lshr i32 %arg, 19 @@ -1163,8 +1163,8 @@ define void @c6_i64(i64 %arg, ptr %ptr) nounwind { define void @c7_i64(i64 %arg, ptr %ptr) nounwind { ; CHECK-LABEL: c7_i64: ; CHECK: // %bb.0: -; CHECK-NEXT: lsr x8, x0, #49 -; CHECK-NEXT: and x8, x8, #0xffc +; CHECK-NEXT: ubfx x8, x0, #51, #10 +; CHECK-NEXT: lsl x8, x8, #2 ; CHECK-NEXT: str x8, [x1] ; CHECK-NEXT: ret %tmp0 = lshr i64 %arg, 51 diff --git a/llvm/test/CodeGen/AArch64/fpenv.ll b/llvm/test/CodeGen/AArch64/fpenv.ll index 3351565d8dd89..3a307f7731037 100644 --- a/llvm/test/CodeGen/AArch64/fpenv.ll +++ b/llvm/test/CodeGen/AArch64/fpenv.ll @@ -4,11 +4,11 @@ define void @func_set_rounding_dyn(i32 %rm) { ; CHECK-LABEL: func_set_rounding_dyn: ; CHECK: // %bb.0: -; CHECK-NEXT: lsl w9, w0, #22 +; CHECK-NEXT: sub w9, w0, #1 ; CHECK-NEXT: mrs x8, FPCR +; CHECK-NEXT: and w9, w9, #0x3 ; CHECK-NEXT: and x8, x8, #0xffffffffff3fffff -; CHECK-NEXT: sub w9, w9, #1024, lsl #12 // =4194304 -; CHECK-NEXT: and w9, w9, #0xc00000 +; CHECK-NEXT: lsl w9, w9, #22 ; CHECK-NEXT: orr x8, x8, x9 ; CHECK-NEXT: msr FPCR, x8 ; CHECK-NEXT: ret diff --git a/llvm/test/CodeGen/AArch64/half-precision-signof-no-assert.ll b/llvm/test/CodeGen/AArch64/half-precision-signof-no-assert.ll new file mode 100644 index 0000000000000..92e15e78d8c41 --- /dev/null +++ b/llvm/test/CodeGen/AArch64/half-precision-signof-no-assert.ll @@ -0,0 +1,48 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5 +; RUN: llc 
--mtriple=aarch64-unknown-linux-gnu < %s | FileCheck %s + +; Check that the following does not crash +; See https://github.com/llvm/llvm-project/issues/123029 for details + +define ptr @fn(ptr %in, ptr %out) { +; CHECK-LABEL: fn: +; CHECK: // %bb.0: // %fn +; CHECK-NEXT: ldr d1, [x0] +; CHECK-NEXT: movi v0.4h, #60, lsl #8 +; CHECK-NEXT: adrp x8, .LCPI0_0 +; CHECK-NEXT: fcvtl v1.4s, v1.4h +; CHECK-NEXT: fcmgt v2.4s, v1.4s, #0.0 +; CHECK-NEXT: fcmlt v1.4s, v1.4s, #0.0 +; CHECK-NEXT: orr v1.16b, v1.16b, v2.16b +; CHECK-NEXT: ldr h2, [x8, :lo12:.LCPI0_0] +; CHECK-NEXT: xtn v1.4h, v1.4s +; CHECK-NEXT: and v0.8b, v1.8b, v0.8b +; CHECK-NEXT: movi d1, #0000000000000000 +; CHECK-NEXT: str d0, [x1] +; CHECK-NEXT: ldr h0, [x0, #8] +; CHECK-NEXT: mov x0, xzr +; CHECK-NEXT: fcvt s0, h0 +; CHECK-NEXT: fcmp s0, #0.0 +; CHECK-NEXT: fcsel s1, s2, s1, mi +; CHECK-NEXT: fcsel s1, s2, s1, gt +; CHECK-NEXT: mvni v2.4s, #128, lsl #24 +; CHECK-NEXT: fcvt s1, h1 +; CHECK-NEXT: bit v0.16b, v1.16b, v2.16b +; CHECK-NEXT: fcvt h0, s0 +; CHECK-NEXT: str h0, [x1, #8] +; CHECK-NEXT: ret +fn: + %1 = load <4 x half>, ptr %in + %2 = fcmp one <4 x half> %1, zeroinitializer + %3 = uitofp <4 x i1> %2 to <4 x half> + store <4 x half> %3, ptr %out + + %4 = getelementptr inbounds nuw i8, ptr %in, i64 8 + %5 = load half, ptr %4 + %6 = fcmp one half %5, 0xH0000 + %7 = uitofp i1 %6 to half + %8 = call half @llvm.copysign.f16(half %7, half %5) + %9 = getelementptr inbounds nuw i8, ptr %out, i64 8 + store half %8, ptr %9 + ret ptr null +} diff --git a/llvm/test/CodeGen/AArch64/instr-ref-ldv.ll b/llvm/test/CodeGen/AArch64/instr-ref-ldv.ll new file mode 100644 index 0000000000000..fa00c75e2928b --- /dev/null +++ b/llvm/test/CodeGen/AArch64/instr-ref-ldv.ll @@ -0,0 +1,70 @@ +; RUN: llc -O2 -experimental-debug-variable-locations %s -stop-after=livedebugvalues -mtriple=arm64-apple-macosx15.0.0 -o - | FileCheck %s + +; CHECK: $w{{[0-9]+}} = ORRWrs $wzr, killed $w{{[0-9]+}}, 0 +; CHECK-NEXT: DBG_INSTR_REF 
!{{[0-9]+}}, !DIExpression(DW_OP_LLVM_arg, 0), dbg-instr-ref({{[0-9]+}}, 0), debug-location !{{[0-9]+}} + +; This test makes sure that instruction referenced livedebugvalues pass doesn't crash when an ORRWrr is present before +; aarch64-isel and is converted to an ORRWrs with a shift amount immediate value of 0 before livedebugvalues, in this +; test case the MIR before both passes is shown below: + +; Before aarch64-isel +; %11:gpr32 = ORRWrr $wzr, killed %10:gpr32, debug-location !5; :0 +; %0:gpr64all = SUBREG_TO_REG 0, killed %11:gpr32, %subreg.sub_32, debug-location !5; :0 +; DBG_INSTR_REF !7, !DIExpression(DW_OP_LLVM_arg, 0), %0:gpr64all, debug-location !11; :0 @[ :0 ] line no:0 + +; Before livedebugvalues +; $w0 = ORRWrs $wzr, killed $w3, 0 +; DBG_INSTR_REF !7, !DIExpression(DW_OP_LLVM_arg, 0), dbg-instr-ref(3, 0), debug-location !11; :0 @[ :0 ] line no:0 + +; The livedebugvalues pass will consider the ORRWrs variant as a copy, therefore the aarch64-isel call to +; salvageCopySSA should do the same. 
+ +%"class.llvm::iterator_range.53" = type { %"class.llvm::opt::arg_iterator.54", %"class.llvm::opt::arg_iterator.54" } +%"class.llvm::opt::arg_iterator.54" = type { %"class.std::__1::reverse_iterator", %"class.std::__1::reverse_iterator", [2 x %"class.llvm::opt::OptSpecifier"] } +%"class.std::__1::reverse_iterator" = type { ptr, ptr } +%"class.llvm::opt::OptSpecifier" = type { i32 } +declare noundef zeroext i1 @_ZNK4llvm3opt6Option7matchesENS0_12OptSpecifierE(ptr noundef nonnull align 8 dereferenceable(16), i64) local_unnamed_addr #1 +define noundef zeroext i1 @_ZNK4llvm3opt7ArgList14hasFlagNoClaimENS0_12OptSpecifierES2_b(ptr noundef nonnull align 8 dereferenceable(184) %this, i64 %Pos.coerce, i64 %Neg.coerce, i1 noundef zeroext %Default) local_unnamed_addr #2 !dbg !9383 { +entry: + %ref.tmp.i = alloca %"class.llvm::iterator_range.53", align 8 + %coerce.val.ii6 = and i64 %Pos.coerce, 4294967295, !dbg !9393 + #dbg_value(i64 %coerce.val.ii6, !9452, !DIExpression(), !9480) + %__begin0.sroa.4.0.ref.tmp.sroa_idx.i = getelementptr inbounds i8, ptr %ref.tmp.i, i64 8, !dbg !9480 + %__begin0.sroa.4.0.copyload.i = load ptr, ptr %__begin0.sroa.4.0.ref.tmp.sroa_idx.i, align 8, !dbg !9480 + %__end0.sroa.4.0.end_iterator.i.sroa_idx.i = getelementptr inbounds i8, ptr %ref.tmp.i, i64 48, !dbg !9480 + %__end0.sroa.4.0.copyload.i = load ptr, ptr %__end0.sroa.4.0.end_iterator.i.sroa_idx.i, align 8, !dbg !9480 + %cmp.i.i.i.not.i = icmp eq ptr %__begin0.sroa.4.0.copyload.i, %__end0.sroa.4.0.copyload.i, !dbg !9480 + br i1 %cmp.i.i.i.not.i, label %_ZNK4llvm3opt7ArgList17getLastArgNoClaimIJNS0_12OptSpecifierES3_EEEPNS0_3ArgEDpT_.exit.thread, label %_ZNK4llvm3opt7ArgList17getLastArgNoClaimIJNS0_12OptSpecifierES3_EEEPNS0_3ArgEDpT_.exit, !dbg !9480 +_ZNK4llvm3opt7ArgList17getLastArgNoClaimIJNS0_12OptSpecifierES3_EEEPNS0_3ArgEDpT_.exit.thread: ; preds = %entry + br label %1, !dbg !9480 +_ZNK4llvm3opt7ArgList17getLastArgNoClaimIJNS0_12OptSpecifierES3_EEEPNS0_3ArgEDpT_.exit: ; preds = %entry + 
%incdec.ptr.i.i.i = getelementptr inbounds i8, ptr %__begin0.sroa.4.0.copyload.i, i64 -8, !dbg !9480 + %0 = load ptr, ptr %incdec.ptr.i.i.i, align 8, !dbg !9527, !tbaa !9528 + %tobool.not.not = icmp eq ptr %0, null, !dbg !9480 + br i1 %tobool.not.not, label %1, label %cleanup, !dbg !9480 +cleanup: ; preds = %_ZNK4llvm3opt7ArgList17getLastArgNoClaimIJNS0_12OptSpecifierES3_EEEPNS0_3ArgEDpT_.exit + %call13 = call noundef zeroext i1 @_ZNK4llvm3opt6Option7matchesENS0_12OptSpecifierE(ptr noundef nonnull align 8 dereferenceable(16) %0, i64 %coerce.val.ii6) #3, !dbg !9480 + br label %1 + %2 = phi i1 [ %call13, %cleanup ], [ %Default, %_ZNK4llvm3opt7ArgList17getLastArgNoClaimIJNS0_12OptSpecifierES3_EEEPNS0_3ArgEDpT_.exit ], [ %Default, %_ZNK4llvm3opt7ArgList17getLastArgNoClaimIJNS0_12OptSpecifierES3_EEEPNS0_3ArgEDpT_.exit.thread ] + ret i1 %2, !dbg !9480 +} +!llvm.module.flags = !{!2, !6} +!llvm.dbg.cu = !{!7} +!2 = !{i32 2, !"Debug Info Version", i32 3} +!6 = !{i32 7, !"frame-pointer", i32 1} +!7 = distinct !DICompileUnit(language: DW_LANG_C_plus_plus_14, file: !8, emissionKind: FullDebug, sdk: "MacOSX15.3.sdk") +!8 = !DIFile(filename: "/Users/shubhamrastogi/Development/llvm-project-instr-ref/llvm-project/llvm/lib/Option/ArgList.cpp", directory: "/Users/shubhamrastogi/Development/llvm-project-instr-ref/llvm-project/build-instr-ref-stage2", checksumkind: CSK_MD5, checksum: "a3198e8ace679c7b1581a26b5583c658") +!3116 = distinct !DICompositeType(tag: DW_TAG_class_type, size: 32) +!9383 = distinct !DISubprogram(unit: !7, flags: DIFlagArtificial | DIFlagObjectPointer) +!9391 = distinct !DILexicalBlock(scope: !9383, line: 80, column: 12) +!9393 = !DILocation(scope: !9391) +!9440 = distinct !DILexicalBlock(scope: !9441, line: 269, column: 5) +!9441 = distinct !DILexicalBlock(scope: !9442, line: 269, column: 5) +!9442 = distinct !DISubprogram(unit: !7, retainedNodes: !9450) +!9450 = !{} +!9452 = !DILocalVariable(scope: !9442, type: !3116) +!9478 = distinct !DILocation(scope: !9391) 
+!9480 = !DILocation(scope: !9441, inlinedAt: !9478) +!9527 = !DILocation(scope: !9440, inlinedAt: !9478) +!9528 = !{!"any pointer", !9530, i64 0} +!9530 = !{} diff --git a/llvm/test/CodeGen/AArch64/machine-cp-sub-reg.mir b/llvm/test/CodeGen/AArch64/machine-cp-sub-reg.mir index 5b379c2bd5629..c166b6b48f981 100644 --- a/llvm/test/CodeGen/AArch64/machine-cp-sub-reg.mir +++ b/llvm/test/CodeGen/AArch64/machine-cp-sub-reg.mir @@ -1,6 +1,17 @@ # NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py UTC_ARGS: --version 4 # RUN: llc -o - %s --run-pass=machine-cp -mcp-use-is-copy-instr -mtriple=arm64-apple-macos --verify-machineinstrs | FileCheck %s +--- | + declare void @foo() + + define void @test() { + unreachable + } + define void @test2() { + unreachable + } +... + --- name: test tracksRegLiveness: true @@ -30,3 +41,22 @@ body: | RET undef $lr, implicit $x0 ... +--- +name: test2 +tracksRegLiveness: true +body: | + bb.0: + liveins: $q14, $d29, $x0, $x1 + ; CHECK-LABEL: name: test2 + ; CHECK: liveins: $q14, $d29, $x0, $x1 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: renamable $d8 = COPY killed renamable $d29 + ; CHECK-NEXT: BL @foo, csr_aarch64_aapcs, implicit-def dead $lr, implicit $sp, implicit-def $sp + ; CHECK-NEXT: renamable $b0 = SMAXVv8i8v killed renamable $d8, implicit-def $q0 + ; CHECK-NEXT: RET_ReallyLR implicit $b0 + renamable $q8 = COPY renamable $q14 + renamable $d8 = COPY killed renamable $d29 + BL @foo, csr_aarch64_aapcs, implicit-def dead $lr, implicit $sp, implicit-def $sp + renamable $b0 = SMAXVv8i8v killed renamable $d8, implicit-def $q0 + RET_ReallyLR implicit $b0 +... 
diff --git a/llvm/test/CodeGen/AArch64/vec-combine-compare-to-bitmask.ll b/llvm/test/CodeGen/AArch64/vec-combine-compare-to-bitmask.ll index 7f3c1fdc93380..c9fe258f11556 100644 --- a/llvm/test/CodeGen/AArch64/vec-combine-compare-to-bitmask.ll +++ b/llvm/test/CodeGen/AArch64/vec-combine-compare-to-bitmask.ll @@ -1,86 +1,87 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 2 -; RUN: llc -mtriple=aarch64-apple-darwin -mattr=+neon -aarch64-enable-collect-loh=false -verify-machineinstrs < %s | FileCheck %s --check-prefixes=CHECK,SDAG -; RUN: llc -mtriple=aarch64-apple-darwin -mattr=+neon -aarch64-enable-collect-loh=false -global-isel -global-isel-abort=2 -verify-machineinstrs < %s 2>&1 | FileCheck %s --check-prefixes=CHECK,GISEL +; RUN: llc -mtriple=aarch64-apple-darwin -mattr=+neon -aarch64-enable-collect-loh=false -verify-machineinstrs < %s | FileCheck %s --check-prefixes=CHECK,CHECK-SD +; RUN: llc -mtriple=aarch64-apple-darwin -mattr=+neon -aarch64-enable-collect-loh=false -global-isel -global-isel-abort=2 -verify-machineinstrs < %s 2>&1 | FileCheck %s --check-prefixes=CHECK,CHECK-GI ; Basic tests from input vector to bitmask ; IR generated from clang for: ; __builtin_convertvector + reinterpret_cast -; GISEL: warning: Instruction selection used fallback path for clang_builtins_undef_concat_convert_to_bitmask4 -; GISEL-NEXT: warning: Instruction selection used fallback path for convert_to_bitmask_2xi32 -; GISEL-NEXT: warning: Instruction selection used fallback path for convert_to_bitmask_8xi2 -; GISEL-NEXT: warning: Instruction selection used fallback path for no_direct_convert_for_bad_concat +; CHECK-GI: warning: Instruction selection used fallback path for convert_to_bitmask2 +; CHECK-GI-NEXT: warning: Instruction selection used fallback path for clang_builtins_undef_concat_convert_to_bitmask4 +; CHECK-GI-NEXT: warning: Instruction selection used fallback path for convert_to_bitmask_2xi32 +; CHECK-GI-NEXT: 
warning: Instruction selection used fallback path for convert_to_bitmask_8xi2 +; CHECK-GI-NEXT: warning: Instruction selection used fallback path for no_direct_convert_for_bad_concat define i16 @convert_to_bitmask16(<16 x i8> %vec) { ; Bits used in mask -; SDAG-LABEL: convert_to_bitmask16: -; SDAG: ; %bb.0: -; SDAG-NEXT: adrp x8, lCPI0_0@PAGE -; SDAG-NEXT: cmeq.16b v0, v0, #0 -; SDAG-NEXT: ldr q1, [x8, lCPI0_0@PAGEOFF] -; SDAG-NEXT: bic.16b v0, v1, v0 -; SDAG-NEXT: ext.16b v1, v0, v0, #8 -; SDAG-NEXT: zip1.16b v0, v0, v1 -; SDAG-NEXT: addv.8h h0, v0 -; SDAG-NEXT: fmov w0, s0 -; SDAG-NEXT: ret +; CHECK-SD-LABEL: convert_to_bitmask16: +; CHECK-SD: ; %bb.0: +; CHECK-SD-NEXT: adrp x8, lCPI0_0@PAGE +; CHECK-SD-NEXT: cmeq.16b v0, v0, #0 +; CHECK-SD-NEXT: ldr q1, [x8, lCPI0_0@PAGEOFF] +; CHECK-SD-NEXT: bic.16b v0, v1, v0 +; CHECK-SD-NEXT: ext.16b v1, v0, v0, #8 +; CHECK-SD-NEXT: zip1.16b v0, v0, v1 +; CHECK-SD-NEXT: addv.8h h0, v0 +; CHECK-SD-NEXT: fmov w0, s0 +; CHECK-SD-NEXT: ret ; -; GISEL-LABEL: convert_to_bitmask16: -; GISEL: ; %bb.0: -; GISEL-NEXT: sub sp, sp, #16 -; GISEL-NEXT: .cfi_def_cfa_offset 16 -; GISEL-NEXT: cmeq.16b v0, v0, #0 -; GISEL-NEXT: mvn.16b v0, v0 -; GISEL-NEXT: umov.b w8, v0[1] -; GISEL-NEXT: umov.b w9, v0[0] -; GISEL-NEXT: umov.b w10, v0[2] -; GISEL-NEXT: umov.b w11, v0[3] -; GISEL-NEXT: and w8, w8, #0x1 -; GISEL-NEXT: bfi w9, w8, #1, #31 -; GISEL-NEXT: and w8, w10, #0x1 -; GISEL-NEXT: umov.b w10, v0[4] -; GISEL-NEXT: orr w8, w9, w8, lsl #2 -; GISEL-NEXT: and w9, w11, #0x1 -; GISEL-NEXT: umov.b w11, v0[5] -; GISEL-NEXT: orr w8, w8, w9, lsl #3 -; GISEL-NEXT: and w9, w10, #0x1 -; GISEL-NEXT: umov.b w10, v0[6] -; GISEL-NEXT: orr w8, w8, w9, lsl #4 -; GISEL-NEXT: and w9, w11, #0x1 -; GISEL-NEXT: umov.b w11, v0[7] -; GISEL-NEXT: orr w8, w8, w9, lsl #5 -; GISEL-NEXT: and w9, w10, #0x1 -; GISEL-NEXT: umov.b w10, v0[8] -; GISEL-NEXT: orr w8, w8, w9, lsl #6 -; GISEL-NEXT: and w9, w11, #0x1 -; GISEL-NEXT: umov.b w11, v0[9] -; GISEL-NEXT: orr w8, w8, w9, 
lsl #7 -; GISEL-NEXT: and w9, w10, #0x1 -; GISEL-NEXT: umov.b w10, v0[10] -; GISEL-NEXT: orr w8, w8, w9, lsl #8 -; GISEL-NEXT: and w9, w11, #0x1 -; GISEL-NEXT: umov.b w11, v0[11] -; GISEL-NEXT: orr w8, w8, w9, lsl #9 -; GISEL-NEXT: and w9, w10, #0x1 -; GISEL-NEXT: umov.b w10, v0[12] -; GISEL-NEXT: orr w8, w8, w9, lsl #10 -; GISEL-NEXT: and w9, w11, #0x1 -; GISEL-NEXT: umov.b w11, v0[13] -; GISEL-NEXT: orr w8, w8, w9, lsl #11 -; GISEL-NEXT: and w9, w10, #0x1 -; GISEL-NEXT: umov.b w10, v0[14] -; GISEL-NEXT: orr w8, w8, w9, lsl #12 -; GISEL-NEXT: and w9, w11, #0x1 -; GISEL-NEXT: umov.b w11, v0[15] -; GISEL-NEXT: orr w8, w8, w9, lsl #13 -; GISEL-NEXT: and w9, w10, #0x1 -; GISEL-NEXT: orr w8, w8, w9, lsl #14 -; GISEL-NEXT: and w9, w11, #0x1 -; GISEL-NEXT: orr w8, w8, w9, lsl #15 -; GISEL-NEXT: strh w8, [sp, #14] -; GISEL-NEXT: and w0, w8, #0xffff -; GISEL-NEXT: add sp, sp, #16 -; GISEL-NEXT: ret +; CHECK-GI-LABEL: convert_to_bitmask16: +; CHECK-GI: ; %bb.0: +; CHECK-GI-NEXT: sub sp, sp, #16 +; CHECK-GI-NEXT: .cfi_def_cfa_offset 16 +; CHECK-GI-NEXT: cmeq.16b v0, v0, #0 +; CHECK-GI-NEXT: mvn.16b v0, v0 +; CHECK-GI-NEXT: umov.b w8, v0[1] +; CHECK-GI-NEXT: umov.b w9, v0[0] +; CHECK-GI-NEXT: umov.b w10, v0[2] +; CHECK-GI-NEXT: umov.b w11, v0[3] +; CHECK-GI-NEXT: and w8, w8, #0x1 +; CHECK-GI-NEXT: bfi w9, w8, #1, #31 +; CHECK-GI-NEXT: and w8, w10, #0x1 +; CHECK-GI-NEXT: umov.b w10, v0[4] +; CHECK-GI-NEXT: orr w8, w9, w8, lsl #2 +; CHECK-GI-NEXT: and w9, w11, #0x1 +; CHECK-GI-NEXT: umov.b w11, v0[5] +; CHECK-GI-NEXT: orr w8, w8, w9, lsl #3 +; CHECK-GI-NEXT: and w9, w10, #0x1 +; CHECK-GI-NEXT: umov.b w10, v0[6] +; CHECK-GI-NEXT: orr w8, w8, w9, lsl #4 +; CHECK-GI-NEXT: and w9, w11, #0x1 +; CHECK-GI-NEXT: umov.b w11, v0[7] +; CHECK-GI-NEXT: orr w8, w8, w9, lsl #5 +; CHECK-GI-NEXT: and w9, w10, #0x1 +; CHECK-GI-NEXT: umov.b w10, v0[8] +; CHECK-GI-NEXT: orr w8, w8, w9, lsl #6 +; CHECK-GI-NEXT: and w9, w11, #0x1 +; CHECK-GI-NEXT: umov.b w11, v0[9] +; CHECK-GI-NEXT: orr w8, w8, w9, 
lsl #7 +; CHECK-GI-NEXT: and w9, w10, #0x1 +; CHECK-GI-NEXT: umov.b w10, v0[10] +; CHECK-GI-NEXT: orr w8, w8, w9, lsl #8 +; CHECK-GI-NEXT: and w9, w11, #0x1 +; CHECK-GI-NEXT: umov.b w11, v0[11] +; CHECK-GI-NEXT: orr w8, w8, w9, lsl #9 +; CHECK-GI-NEXT: and w9, w10, #0x1 +; CHECK-GI-NEXT: umov.b w10, v0[12] +; CHECK-GI-NEXT: orr w8, w8, w9, lsl #10 +; CHECK-GI-NEXT: and w9, w11, #0x1 +; CHECK-GI-NEXT: umov.b w11, v0[13] +; CHECK-GI-NEXT: orr w8, w8, w9, lsl #11 +; CHECK-GI-NEXT: and w9, w10, #0x1 +; CHECK-GI-NEXT: umov.b w10, v0[14] +; CHECK-GI-NEXT: orr w8, w8, w9, lsl #12 +; CHECK-GI-NEXT: and w9, w11, #0x1 +; CHECK-GI-NEXT: umov.b w11, v0[15] +; CHECK-GI-NEXT: orr w8, w8, w9, lsl #13 +; CHECK-GI-NEXT: and w9, w10, #0x1 +; CHECK-GI-NEXT: orr w8, w8, w9, lsl #14 +; CHECK-GI-NEXT: and w9, w11, #0x1 +; CHECK-GI-NEXT: orr w8, w8, w9, lsl #15 +; CHECK-GI-NEXT: strh w8, [sp, #14] +; CHECK-GI-NEXT: and w0, w8, #0xffff +; CHECK-GI-NEXT: add sp, sp, #16 +; CHECK-GI-NEXT: ret ; Actual conversion @@ -90,50 +91,50 @@ define i16 @convert_to_bitmask16(<16 x i8> %vec) { } define i16 @convert_to_bitmask8(<8 x i16> %vec) { -; SDAG-LABEL: convert_to_bitmask8: -; SDAG: ; %bb.0: -; SDAG-NEXT: adrp x8, lCPI1_0@PAGE -; SDAG-NEXT: cmeq.8h v0, v0, #0 -; SDAG-NEXT: ldr q1, [x8, lCPI1_0@PAGEOFF] -; SDAG-NEXT: bic.16b v0, v1, v0 -; SDAG-NEXT: addv.8h h0, v0 -; SDAG-NEXT: fmov w8, s0 -; SDAG-NEXT: and w0, w8, #0xff -; SDAG-NEXT: ret +; CHECK-SD-LABEL: convert_to_bitmask8: +; CHECK-SD: ; %bb.0: +; CHECK-SD-NEXT: adrp x8, lCPI1_0@PAGE +; CHECK-SD-NEXT: cmeq.8h v0, v0, #0 +; CHECK-SD-NEXT: ldr q1, [x8, lCPI1_0@PAGEOFF] +; CHECK-SD-NEXT: bic.16b v0, v1, v0 +; CHECK-SD-NEXT: addv.8h h0, v0 +; CHECK-SD-NEXT: fmov w8, s0 +; CHECK-SD-NEXT: and w0, w8, #0xff +; CHECK-SD-NEXT: ret ; -; GISEL-LABEL: convert_to_bitmask8: -; GISEL: ; %bb.0: -; GISEL-NEXT: sub sp, sp, #16 -; GISEL-NEXT: .cfi_def_cfa_offset 16 -; GISEL-NEXT: cmeq.8h v0, v0, #0 -; GISEL-NEXT: mvn.16b v0, v0 -; GISEL-NEXT: xtn.8b v0, v0 -; 
GISEL-NEXT: umov.b w8, v0[1] -; GISEL-NEXT: umov.b w9, v0[0] -; GISEL-NEXT: umov.b w10, v0[2] -; GISEL-NEXT: umov.b w11, v0[3] -; GISEL-NEXT: and w8, w8, #0x1 -; GISEL-NEXT: bfi w9, w8, #1, #31 -; GISEL-NEXT: and w8, w10, #0x1 -; GISEL-NEXT: umov.b w10, v0[4] -; GISEL-NEXT: orr w8, w9, w8, lsl #2 -; GISEL-NEXT: and w9, w11, #0x1 -; GISEL-NEXT: umov.b w11, v0[5] -; GISEL-NEXT: orr w8, w8, w9, lsl #3 -; GISEL-NEXT: and w9, w10, #0x1 -; GISEL-NEXT: umov.b w10, v0[6] -; GISEL-NEXT: orr w8, w8, w9, lsl #4 -; GISEL-NEXT: and w9, w11, #0x1 -; GISEL-NEXT: umov.b w11, v0[7] -; GISEL-NEXT: orr w8, w8, w9, lsl #5 -; GISEL-NEXT: and w9, w10, #0x1 -; GISEL-NEXT: orr w8, w8, w9, lsl #6 -; GISEL-NEXT: and w9, w11, #0x1 -; GISEL-NEXT: orr w8, w8, w9, lsl #7 -; GISEL-NEXT: strb w8, [sp, #15] -; GISEL-NEXT: and w0, w8, #0xff -; GISEL-NEXT: add sp, sp, #16 -; GISEL-NEXT: ret +; CHECK-GI-LABEL: convert_to_bitmask8: +; CHECK-GI: ; %bb.0: +; CHECK-GI-NEXT: sub sp, sp, #16 +; CHECK-GI-NEXT: .cfi_def_cfa_offset 16 +; CHECK-GI-NEXT: cmeq.8h v0, v0, #0 +; CHECK-GI-NEXT: mvn.16b v0, v0 +; CHECK-GI-NEXT: xtn.8b v0, v0 +; CHECK-GI-NEXT: umov.b w8, v0[1] +; CHECK-GI-NEXT: umov.b w9, v0[0] +; CHECK-GI-NEXT: umov.b w10, v0[2] +; CHECK-GI-NEXT: umov.b w11, v0[3] +; CHECK-GI-NEXT: and w8, w8, #0x1 +; CHECK-GI-NEXT: bfi w9, w8, #1, #31 +; CHECK-GI-NEXT: and w8, w10, #0x1 +; CHECK-GI-NEXT: umov.b w10, v0[4] +; CHECK-GI-NEXT: orr w8, w9, w8, lsl #2 +; CHECK-GI-NEXT: and w9, w11, #0x1 +; CHECK-GI-NEXT: umov.b w11, v0[5] +; CHECK-GI-NEXT: orr w8, w8, w9, lsl #3 +; CHECK-GI-NEXT: and w9, w10, #0x1 +; CHECK-GI-NEXT: umov.b w10, v0[6] +; CHECK-GI-NEXT: orr w8, w8, w9, lsl #4 +; CHECK-GI-NEXT: and w9, w11, #0x1 +; CHECK-GI-NEXT: umov.b w11, v0[7] +; CHECK-GI-NEXT: orr w8, w8, w9, lsl #5 +; CHECK-GI-NEXT: and w9, w10, #0x1 +; CHECK-GI-NEXT: orr w8, w8, w9, lsl #6 +; CHECK-GI-NEXT: and w9, w11, #0x1 +; CHECK-GI-NEXT: orr w8, w8, w9, lsl #7 +; CHECK-GI-NEXT: strb w8, [sp, #15] +; CHECK-GI-NEXT: and w0, w8, 
#0xff +; CHECK-GI-NEXT: add sp, sp, #16 +; CHECK-GI-NEXT: ret %cmp_result = icmp ne <8 x i16> %vec, zeroinitializer @@ -143,36 +144,36 @@ define i16 @convert_to_bitmask8(<8 x i16> %vec) { } define i4 @convert_to_bitmask4(<4 x i32> %vec) { -; SDAG-LABEL: convert_to_bitmask4: -; SDAG: ; %bb.0: -; SDAG-NEXT: adrp x8, lCPI2_0@PAGE -; SDAG-NEXT: cmeq.4s v0, v0, #0 -; SDAG-NEXT: ldr q1, [x8, lCPI2_0@PAGEOFF] -; SDAG-NEXT: bic.16b v0, v1, v0 -; SDAG-NEXT: addv.4s s0, v0 -; SDAG-NEXT: fmov w0, s0 -; SDAG-NEXT: ret +; CHECK-SD-LABEL: convert_to_bitmask4: +; CHECK-SD: ; %bb.0: +; CHECK-SD-NEXT: adrp x8, lCPI2_0@PAGE +; CHECK-SD-NEXT: cmeq.4s v0, v0, #0 +; CHECK-SD-NEXT: ldr q1, [x8, lCPI2_0@PAGEOFF] +; CHECK-SD-NEXT: bic.16b v0, v1, v0 +; CHECK-SD-NEXT: addv.4s s0, v0 +; CHECK-SD-NEXT: fmov w0, s0 +; CHECK-SD-NEXT: ret ; -; GISEL-LABEL: convert_to_bitmask4: -; GISEL: ; %bb.0: -; GISEL-NEXT: sub sp, sp, #16 -; GISEL-NEXT: .cfi_def_cfa_offset 16 -; GISEL-NEXT: cmeq.4s v0, v0, #0 -; GISEL-NEXT: mvn.16b v0, v0 -; GISEL-NEXT: mov.s w8, v0[1] -; GISEL-NEXT: mov.s w9, v0[2] -; GISEL-NEXT: fmov w11, s0 -; GISEL-NEXT: mov.s w10, v0[3] -; GISEL-NEXT: and w8, w8, #0x1 -; GISEL-NEXT: bfi w11, w8, #1, #31 -; GISEL-NEXT: and w8, w9, #0x1 -; GISEL-NEXT: and w9, w10, #0x1 -; GISEL-NEXT: orr w8, w11, w8, lsl #2 -; GISEL-NEXT: orr w8, w8, w9, lsl #3 -; GISEL-NEXT: strb w8, [sp, #15] -; GISEL-NEXT: and w0, w8, #0xff -; GISEL-NEXT: add sp, sp, #16 -; GISEL-NEXT: ret +; CHECK-GI-LABEL: convert_to_bitmask4: +; CHECK-GI: ; %bb.0: +; CHECK-GI-NEXT: sub sp, sp, #16 +; CHECK-GI-NEXT: .cfi_def_cfa_offset 16 +; CHECK-GI-NEXT: cmeq.4s v0, v0, #0 +; CHECK-GI-NEXT: mvn.16b v0, v0 +; CHECK-GI-NEXT: mov.s w8, v0[1] +; CHECK-GI-NEXT: mov.s w9, v0[2] +; CHECK-GI-NEXT: fmov w11, s0 +; CHECK-GI-NEXT: mov.s w10, v0[3] +; CHECK-GI-NEXT: and w8, w8, #0x1 +; CHECK-GI-NEXT: bfi w11, w8, #1, #31 +; CHECK-GI-NEXT: and w8, w9, #0x1 +; CHECK-GI-NEXT: and w9, w10, #0x1 +; CHECK-GI-NEXT: orr w8, w11, w8, lsl #2 +; 
CHECK-GI-NEXT: orr w8, w8, w9, lsl #3 +; CHECK-GI-NEXT: strb w8, [sp, #15] +; CHECK-GI-NEXT: and w0, w8, #0xff +; CHECK-GI-NEXT: add sp, sp, #16 +; CHECK-GI-NEXT: ret %cmp_result = icmp ne <4 x i32> %vec, zeroinitializer @@ -220,37 +221,37 @@ define i8 @clang_builtins_undef_concat_convert_to_bitmask4(<4 x i32> %vec) { define i4 @convert_to_bitmask_no_compare(<4 x i32> %vec1, <4 x i32> %vec2) { -; SDAG-LABEL: convert_to_bitmask_no_compare: -; SDAG: ; %bb.0: -; SDAG-NEXT: and.16b v0, v0, v1 -; SDAG-NEXT: adrp x8, lCPI5_0@PAGE -; SDAG-NEXT: ldr q1, [x8, lCPI5_0@PAGEOFF] -; SDAG-NEXT: shl.4s v0, v0, #31 -; SDAG-NEXT: cmlt.4s v0, v0, #0 -; SDAG-NEXT: and.16b v0, v0, v1 -; SDAG-NEXT: addv.4s s0, v0 -; SDAG-NEXT: fmov w0, s0 -; SDAG-NEXT: ret +; CHECK-SD-LABEL: convert_to_bitmask_no_compare: +; CHECK-SD: ; %bb.0: +; CHECK-SD-NEXT: and.16b v0, v0, v1 +; CHECK-SD-NEXT: adrp x8, lCPI5_0@PAGE +; CHECK-SD-NEXT: ldr q1, [x8, lCPI5_0@PAGEOFF] +; CHECK-SD-NEXT: shl.4s v0, v0, #31 +; CHECK-SD-NEXT: cmlt.4s v0, v0, #0 +; CHECK-SD-NEXT: and.16b v0, v0, v1 +; CHECK-SD-NEXT: addv.4s s0, v0 +; CHECK-SD-NEXT: fmov w0, s0 +; CHECK-SD-NEXT: ret ; -; GISEL-LABEL: convert_to_bitmask_no_compare: -; GISEL: ; %bb.0: -; GISEL-NEXT: sub sp, sp, #16 -; GISEL-NEXT: .cfi_def_cfa_offset 16 -; GISEL-NEXT: and.16b v0, v0, v1 -; GISEL-NEXT: mov.s w8, v0[1] -; GISEL-NEXT: mov.s w9, v0[2] -; GISEL-NEXT: fmov w11, s0 -; GISEL-NEXT: mov.s w10, v0[3] -; GISEL-NEXT: and w8, w8, #0x1 -; GISEL-NEXT: bfi w11, w8, #1, #31 -; GISEL-NEXT: and w8, w9, #0x1 -; GISEL-NEXT: and w9, w10, #0x1 -; GISEL-NEXT: orr w8, w11, w8, lsl #2 -; GISEL-NEXT: orr w8, w8, w9, lsl #3 -; GISEL-NEXT: strb w8, [sp, #15] -; GISEL-NEXT: and w0, w8, #0xff -; GISEL-NEXT: add sp, sp, #16 -; GISEL-NEXT: ret +; CHECK-GI-LABEL: convert_to_bitmask_no_compare: +; CHECK-GI: ; %bb.0: +; CHECK-GI-NEXT: sub sp, sp, #16 +; CHECK-GI-NEXT: .cfi_def_cfa_offset 16 +; CHECK-GI-NEXT: and.16b v0, v0, v1 +; CHECK-GI-NEXT: mov.s w8, v0[1] +; CHECK-GI-NEXT: 
mov.s w9, v0[2] +; CHECK-GI-NEXT: fmov w11, s0 +; CHECK-GI-NEXT: mov.s w10, v0[3] +; CHECK-GI-NEXT: and w8, w8, #0x1 +; CHECK-GI-NEXT: bfi w11, w8, #1, #31 +; CHECK-GI-NEXT: and w8, w9, #0x1 +; CHECK-GI-NEXT: and w9, w10, #0x1 +; CHECK-GI-NEXT: orr w8, w11, w8, lsl #2 +; CHECK-GI-NEXT: orr w8, w8, w9, lsl #3 +; CHECK-GI-NEXT: strb w8, [sp, #15] +; CHECK-GI-NEXT: and w0, w8, #0xff +; CHECK-GI-NEXT: add sp, sp, #16 +; CHECK-GI-NEXT: ret %cmp = and <4 x i32> %vec1, %vec2 @@ -260,39 +261,39 @@ define i4 @convert_to_bitmask_no_compare(<4 x i32> %vec1, <4 x i32> %vec2) { } define i4 @convert_to_bitmask_with_compare_chain(<4 x i32> %vec1, <4 x i32> %vec2) { -; SDAG-LABEL: convert_to_bitmask_with_compare_chain: -; SDAG: ; %bb.0: -; SDAG-NEXT: cmeq.4s v2, v0, #0 -; SDAG-NEXT: cmeq.4s v0, v0, v1 -; SDAG-NEXT: adrp x8, lCPI6_0@PAGE -; SDAG-NEXT: ldr q1, [x8, lCPI6_0@PAGEOFF] -; SDAG-NEXT: bic.16b v0, v0, v2 -; SDAG-NEXT: and.16b v0, v0, v1 -; SDAG-NEXT: addv.4s s0, v0 -; SDAG-NEXT: fmov w0, s0 -; SDAG-NEXT: ret +; CHECK-SD-LABEL: convert_to_bitmask_with_compare_chain: +; CHECK-SD: ; %bb.0: +; CHECK-SD-NEXT: cmeq.4s v2, v0, #0 +; CHECK-SD-NEXT: cmeq.4s v0, v0, v1 +; CHECK-SD-NEXT: adrp x8, lCPI6_0@PAGE +; CHECK-SD-NEXT: ldr q1, [x8, lCPI6_0@PAGEOFF] +; CHECK-SD-NEXT: bic.16b v0, v0, v2 +; CHECK-SD-NEXT: and.16b v0, v0, v1 +; CHECK-SD-NEXT: addv.4s s0, v0 +; CHECK-SD-NEXT: fmov w0, s0 +; CHECK-SD-NEXT: ret ; -; GISEL-LABEL: convert_to_bitmask_with_compare_chain: -; GISEL: ; %bb.0: -; GISEL-NEXT: sub sp, sp, #16 -; GISEL-NEXT: .cfi_def_cfa_offset 16 -; GISEL-NEXT: cmeq.4s v2, v0, #0 -; GISEL-NEXT: cmeq.4s v0, v0, v1 -; GISEL-NEXT: bic.16b v0, v0, v2 -; GISEL-NEXT: mov.s w8, v0[1] -; GISEL-NEXT: mov.s w9, v0[2] -; GISEL-NEXT: fmov w11, s0 -; GISEL-NEXT: mov.s w10, v0[3] -; GISEL-NEXT: and w8, w8, #0x1 -; GISEL-NEXT: bfi w11, w8, #1, #31 -; GISEL-NEXT: and w8, w9, #0x1 -; GISEL-NEXT: and w9, w10, #0x1 -; GISEL-NEXT: orr w8, w11, w8, lsl #2 -; GISEL-NEXT: orr w8, w8, w9, lsl #3 -; 
GISEL-NEXT: strb w8, [sp, #15] -; GISEL-NEXT: and w0, w8, #0xff -; GISEL-NEXT: add sp, sp, #16 -; GISEL-NEXT: ret +; CHECK-GI-LABEL: convert_to_bitmask_with_compare_chain: +; CHECK-GI: ; %bb.0: +; CHECK-GI-NEXT: sub sp, sp, #16 +; CHECK-GI-NEXT: .cfi_def_cfa_offset 16 +; CHECK-GI-NEXT: cmeq.4s v2, v0, #0 +; CHECK-GI-NEXT: cmeq.4s v0, v0, v1 +; CHECK-GI-NEXT: bic.16b v0, v0, v2 +; CHECK-GI-NEXT: mov.s w8, v0[1] +; CHECK-GI-NEXT: mov.s w9, v0[2] +; CHECK-GI-NEXT: fmov w11, s0 +; CHECK-GI-NEXT: mov.s w10, v0[3] +; CHECK-GI-NEXT: and w8, w8, #0x1 +; CHECK-GI-NEXT: bfi w11, w8, #1, #31 +; CHECK-GI-NEXT: and w8, w9, #0x1 +; CHECK-GI-NEXT: and w9, w10, #0x1 +; CHECK-GI-NEXT: orr w8, w11, w8, lsl #2 +; CHECK-GI-NEXT: orr w8, w8, w9, lsl #3 +; CHECK-GI-NEXT: strb w8, [sp, #15] +; CHECK-GI-NEXT: and w0, w8, #0xff +; CHECK-GI-NEXT: add sp, sp, #16 +; CHECK-GI-NEXT: ret %cmp1 = icmp ne <4 x i32> %vec1, zeroinitializer @@ -303,39 +304,39 @@ define i4 @convert_to_bitmask_with_compare_chain(<4 x i32> %vec1, <4 x i32> %vec } define i4 @convert_to_bitmask_with_trunc_in_chain(<4 x i32> %vec1, <4 x i32> %vec2) { -; SDAG-LABEL: convert_to_bitmask_with_trunc_in_chain: -; SDAG: ; %bb.0: -; SDAG-NEXT: cmeq.4s v0, v0, #0 -; SDAG-NEXT: adrp x8, lCPI7_0@PAGE -; SDAG-NEXT: bic.16b v0, v1, v0 -; SDAG-NEXT: ldr q1, [x8, lCPI7_0@PAGEOFF] -; SDAG-NEXT: shl.4s v0, v0, #31 -; SDAG-NEXT: cmlt.4s v0, v0, #0 -; SDAG-NEXT: and.16b v0, v0, v1 -; SDAG-NEXT: addv.4s s0, v0 -; SDAG-NEXT: fmov w0, s0 -; SDAG-NEXT: ret +; CHECK-SD-LABEL: convert_to_bitmask_with_trunc_in_chain: +; CHECK-SD: ; %bb.0: +; CHECK-SD-NEXT: cmeq.4s v0, v0, #0 +; CHECK-SD-NEXT: adrp x8, lCPI7_0@PAGE +; CHECK-SD-NEXT: bic.16b v0, v1, v0 +; CHECK-SD-NEXT: ldr q1, [x8, lCPI7_0@PAGEOFF] +; CHECK-SD-NEXT: shl.4s v0, v0, #31 +; CHECK-SD-NEXT: cmlt.4s v0, v0, #0 +; CHECK-SD-NEXT: and.16b v0, v0, v1 +; CHECK-SD-NEXT: addv.4s s0, v0 +; CHECK-SD-NEXT: fmov w0, s0 +; CHECK-SD-NEXT: ret ; -; GISEL-LABEL: convert_to_bitmask_with_trunc_in_chain: 
-; GISEL: ; %bb.0: -; GISEL-NEXT: sub sp, sp, #16 -; GISEL-NEXT: .cfi_def_cfa_offset 16 -; GISEL-NEXT: cmeq.4s v0, v0, #0 -; GISEL-NEXT: bic.16b v0, v1, v0 -; GISEL-NEXT: mov.s w8, v0[1] -; GISEL-NEXT: mov.s w9, v0[2] -; GISEL-NEXT: fmov w11, s0 -; GISEL-NEXT: mov.s w10, v0[3] -; GISEL-NEXT: and w8, w8, #0x1 -; GISEL-NEXT: bfi w11, w8, #1, #31 -; GISEL-NEXT: and w8, w9, #0x1 -; GISEL-NEXT: and w9, w10, #0x1 -; GISEL-NEXT: orr w8, w11, w8, lsl #2 -; GISEL-NEXT: orr w8, w8, w9, lsl #3 -; GISEL-NEXT: strb w8, [sp, #15] -; GISEL-NEXT: and w0, w8, #0xff -; GISEL-NEXT: add sp, sp, #16 -; GISEL-NEXT: ret +; CHECK-GI-LABEL: convert_to_bitmask_with_trunc_in_chain: +; CHECK-GI: ; %bb.0: +; CHECK-GI-NEXT: sub sp, sp, #16 +; CHECK-GI-NEXT: .cfi_def_cfa_offset 16 +; CHECK-GI-NEXT: cmeq.4s v0, v0, #0 +; CHECK-GI-NEXT: bic.16b v0, v1, v0 +; CHECK-GI-NEXT: mov.s w8, v0[1] +; CHECK-GI-NEXT: mov.s w9, v0[2] +; CHECK-GI-NEXT: fmov w11, s0 +; CHECK-GI-NEXT: mov.s w10, v0[3] +; CHECK-GI-NEXT: and w8, w8, #0x1 +; CHECK-GI-NEXT: bfi w11, w8, #1, #31 +; CHECK-GI-NEXT: and w8, w9, #0x1 +; CHECK-GI-NEXT: and w9, w10, #0x1 +; CHECK-GI-NEXT: orr w8, w11, w8, lsl #2 +; CHECK-GI-NEXT: orr w8, w8, w9, lsl #3 +; CHECK-GI-NEXT: strb w8, [sp, #15] +; CHECK-GI-NEXT: and w0, w8, #0xff +; CHECK-GI-NEXT: add sp, sp, #16 +; CHECK-GI-NEXT: ret %cmp1 = icmp ne <4 x i32> %vec1, zeroinitializer @@ -346,82 +347,82 @@ define i4 @convert_to_bitmask_with_trunc_in_chain(<4 x i32> %vec1, <4 x i32> %ve } define i4 @convert_to_bitmask_with_unknown_type_in_long_chain(<4 x i32> %vec1, <4 x i32> %vec2) { -; SDAG-LABEL: convert_to_bitmask_with_unknown_type_in_long_chain: -; SDAG: ; %bb.0: -; SDAG-NEXT: cmeq.4s v0, v0, #0 -; SDAG-NEXT: cmeq.4s v1, v1, #0 -; SDAG-NEXT: adrp x8, lCPI8_0@PAGE -; SDAG-NEXT: movi d2, #0x000000ffffffff -; SDAG-NEXT: movi d3, #0x00ffffffffffff -; SDAG-NEXT: bic.16b v0, v1, v0 -; SDAG-NEXT: movi d1, #0xffff0000ffff0000 -; SDAG-NEXT: xtn.4h v0, v0 -; SDAG-NEXT: orr.8b v0, v0, v2 -; SDAG-NEXT: 
movi d2, #0x00ffffffff0000 -; SDAG-NEXT: eor.8b v1, v0, v1 -; SDAG-NEXT: eor.8b v0, v0, v2 -; SDAG-NEXT: mov.h v1[2], wzr -; SDAG-NEXT: orr.8b v0, v0, v3 -; SDAG-NEXT: orr.8b v0, v1, v0 -; SDAG-NEXT: ldr d1, [x8, lCPI8_0@PAGEOFF] -; SDAG-NEXT: shl.4h v0, v0, #15 -; SDAG-NEXT: cmlt.4h v0, v0, #0 -; SDAG-NEXT: and.8b v0, v0, v1 -; SDAG-NEXT: addv.4h h0, v0 -; SDAG-NEXT: fmov w0, s0 -; SDAG-NEXT: ret +; CHECK-SD-LABEL: convert_to_bitmask_with_unknown_type_in_long_chain: +; CHECK-SD: ; %bb.0: +; CHECK-SD-NEXT: cmeq.4s v0, v0, #0 +; CHECK-SD-NEXT: cmeq.4s v1, v1, #0 +; CHECK-SD-NEXT: adrp x8, lCPI8_0@PAGE +; CHECK-SD-NEXT: movi d2, #0x000000ffffffff +; CHECK-SD-NEXT: movi d3, #0x00ffffffffffff +; CHECK-SD-NEXT: bic.16b v0, v1, v0 +; CHECK-SD-NEXT: movi d1, #0xffff0000ffff0000 +; CHECK-SD-NEXT: xtn.4h v0, v0 +; CHECK-SD-NEXT: orr.8b v0, v0, v2 +; CHECK-SD-NEXT: movi d2, #0x00ffffffff0000 +; CHECK-SD-NEXT: eor.8b v1, v0, v1 +; CHECK-SD-NEXT: eor.8b v0, v0, v2 +; CHECK-SD-NEXT: mov.h v1[2], wzr +; CHECK-SD-NEXT: orr.8b v0, v0, v3 +; CHECK-SD-NEXT: orr.8b v0, v1, v0 +; CHECK-SD-NEXT: ldr d1, [x8, lCPI8_0@PAGEOFF] +; CHECK-SD-NEXT: shl.4h v0, v0, #15 +; CHECK-SD-NEXT: cmlt.4h v0, v0, #0 +; CHECK-SD-NEXT: and.8b v0, v0, v1 +; CHECK-SD-NEXT: addv.4h h0, v0 +; CHECK-SD-NEXT: fmov w0, s0 +; CHECK-SD-NEXT: ret ; -; GISEL-LABEL: convert_to_bitmask_with_unknown_type_in_long_chain: -; GISEL: ; %bb.0: -; GISEL-NEXT: sub sp, sp, #16 -; GISEL-NEXT: .cfi_def_cfa_offset 16 -; GISEL-NEXT: mov w8, #1 ; =0x1 -; GISEL-NEXT: mov w9, #0 ; =0x0 -; GISEL-NEXT: cmeq.4s v5, v0, #0 -; GISEL-NEXT: fmov s2, w8 -; GISEL-NEXT: fmov s4, w9 -; GISEL-NEXT: cmeq.4s v1, v1, #0 -; GISEL-NEXT: mov.16b v3, v2 -; GISEL-NEXT: mov.16b v0, v4 -; GISEL-NEXT: mov.h v4[1], w8 -; GISEL-NEXT: bic.16b v1, v1, v5 -; GISEL-NEXT: mov.16b v5, v2 -; GISEL-NEXT: mov.h v2[1], w8 -; GISEL-NEXT: mov.h v3[1], w8 -; GISEL-NEXT: mov.h v0[1], w8 -; GISEL-NEXT: mov.h v5[1], w8 -; GISEL-NEXT: mov.h v4[2], w8 -; GISEL-NEXT: xtn.4h v1, 
v1 -; GISEL-NEXT: mov.h v2[2], w8 -; GISEL-NEXT: mov.h v3[2], w9 -; GISEL-NEXT: mov.h v0[2], w9 -; GISEL-NEXT: mov.h v5[2], w9 -; GISEL-NEXT: mov.h v4[3], w9 -; GISEL-NEXT: mov.h v2[3], w9 -; GISEL-NEXT: mov.h v3[3], w9 -; GISEL-NEXT: mov.h v0[3], w8 -; GISEL-NEXT: mov.h v5[3], w8 -; GISEL-NEXT: orr.8b v1, v1, v3 -; GISEL-NEXT: eor.8b v0, v1, v0 -; GISEL-NEXT: eor.8b v1, v4, v1 -; GISEL-NEXT: and.8b v0, v0, v5 -; GISEL-NEXT: orr.8b v1, v2, v1 -; GISEL-NEXT: orr.8b v0, v0, v1 -; GISEL-NEXT: ushll.4s v0, v0, #0 -; GISEL-NEXT: mov.s w8, v0[1] -; GISEL-NEXT: mov.s w9, v0[2] -; GISEL-NEXT: fmov w11, s0 -; GISEL-NEXT: mov.s w10, v0[3] -; GISEL-NEXT: and w8, w8, #0x1 -; GISEL-NEXT: bfi w11, w8, #1, #31 -; GISEL-NEXT: and w8, w9, #0x1 -; GISEL-NEXT: and w9, w10, #0x1 -; GISEL-NEXT: orr w8, w11, w8, lsl #2 -; GISEL-NEXT: orr w8, w8, w9, lsl #3 -; GISEL-NEXT: strb w8, [sp, #15] -; GISEL-NEXT: and w0, w8, #0xff -; GISEL-NEXT: add sp, sp, #16 -; GISEL-NEXT: ret +; CHECK-GI-LABEL: convert_to_bitmask_with_unknown_type_in_long_chain: +; CHECK-GI: ; %bb.0: +; CHECK-GI-NEXT: sub sp, sp, #16 +; CHECK-GI-NEXT: .cfi_def_cfa_offset 16 +; CHECK-GI-NEXT: mov w8, #1 ; =0x1 +; CHECK-GI-NEXT: mov w9, #0 ; =0x0 +; CHECK-GI-NEXT: cmeq.4s v5, v0, #0 +; CHECK-GI-NEXT: fmov s2, w8 +; CHECK-GI-NEXT: fmov s4, w9 +; CHECK-GI-NEXT: cmeq.4s v1, v1, #0 +; CHECK-GI-NEXT: mov.16b v3, v2 +; CHECK-GI-NEXT: mov.16b v0, v4 +; CHECK-GI-NEXT: mov.h v4[1], w8 +; CHECK-GI-NEXT: bic.16b v1, v1, v5 +; CHECK-GI-NEXT: mov.16b v5, v2 +; CHECK-GI-NEXT: mov.h v2[1], w8 +; CHECK-GI-NEXT: mov.h v3[1], w8 +; CHECK-GI-NEXT: mov.h v0[1], w8 +; CHECK-GI-NEXT: mov.h v5[1], w8 +; CHECK-GI-NEXT: mov.h v4[2], w8 +; CHECK-GI-NEXT: xtn.4h v1, v1 +; CHECK-GI-NEXT: mov.h v2[2], w8 +; CHECK-GI-NEXT: mov.h v3[2], w9 +; CHECK-GI-NEXT: mov.h v0[2], w9 +; CHECK-GI-NEXT: mov.h v5[2], w9 +; CHECK-GI-NEXT: mov.h v4[3], w9 +; CHECK-GI-NEXT: mov.h v2[3], w9 +; CHECK-GI-NEXT: mov.h v3[3], w9 +; CHECK-GI-NEXT: mov.h v0[3], w8 +; CHECK-GI-NEXT: 
mov.h v5[3], w8 +; CHECK-GI-NEXT: orr.8b v1, v1, v3 +; CHECK-GI-NEXT: eor.8b v0, v1, v0 +; CHECK-GI-NEXT: eor.8b v1, v4, v1 +; CHECK-GI-NEXT: and.8b v0, v0, v5 +; CHECK-GI-NEXT: orr.8b v1, v2, v1 +; CHECK-GI-NEXT: orr.8b v0, v0, v1 +; CHECK-GI-NEXT: ushll.4s v0, v0, #0 +; CHECK-GI-NEXT: mov.s w8, v0[1] +; CHECK-GI-NEXT: mov.s w9, v0[2] +; CHECK-GI-NEXT: fmov w11, s0 +; CHECK-GI-NEXT: mov.s w10, v0[3] +; CHECK-GI-NEXT: and w8, w8, #0x1 +; CHECK-GI-NEXT: bfi w11, w8, #1, #31 +; CHECK-GI-NEXT: and w8, w9, #0x1 +; CHECK-GI-NEXT: and w9, w10, #0x1 +; CHECK-GI-NEXT: orr w8, w11, w8, lsl #2 +; CHECK-GI-NEXT: orr w8, w8, w9, lsl #3 +; CHECK-GI-NEXT: strb w8, [sp, #15] +; CHECK-GI-NEXT: and w0, w8, #0xff +; CHECK-GI-NEXT: add sp, sp, #16 +; CHECK-GI-NEXT: ret %cmp1 = icmp ne <4 x i32> %vec1, zeroinitializer @@ -440,42 +441,42 @@ define i4 @convert_to_bitmask_with_unknown_type_in_long_chain(<4 x i32> %vec1, < } define i4 @convert_to_bitmask_with_different_types_in_chain(<4 x i16> %vec1, <4 x i32> %vec2) { -; SDAG-LABEL: convert_to_bitmask_with_different_types_in_chain: -; SDAG: ; %bb.0: -; SDAG-NEXT: cmeq.4s v1, v1, #0 -; SDAG-NEXT: cmeq.4h v0, v0, #0 -; SDAG-NEXT: adrp x8, lCPI9_0@PAGE -; SDAG-NEXT: xtn.4h v1, v1 -; SDAG-NEXT: orn.8b v0, v1, v0 -; SDAG-NEXT: ldr d1, [x8, lCPI9_0@PAGEOFF] -; SDAG-NEXT: and.8b v0, v0, v1 -; SDAG-NEXT: addv.4h h0, v0 -; SDAG-NEXT: fmov w0, s0 -; SDAG-NEXT: ret +; CHECK-SD-LABEL: convert_to_bitmask_with_different_types_in_chain: +; CHECK-SD: ; %bb.0: +; CHECK-SD-NEXT: cmeq.4s v1, v1, #0 +; CHECK-SD-NEXT: cmeq.4h v0, v0, #0 +; CHECK-SD-NEXT: adrp x8, lCPI9_0@PAGE +; CHECK-SD-NEXT: xtn.4h v1, v1 +; CHECK-SD-NEXT: orn.8b v0, v1, v0 +; CHECK-SD-NEXT: ldr d1, [x8, lCPI9_0@PAGEOFF] +; CHECK-SD-NEXT: and.8b v0, v0, v1 +; CHECK-SD-NEXT: addv.4h h0, v0 +; CHECK-SD-NEXT: fmov w0, s0 +; CHECK-SD-NEXT: ret ; -; GISEL-LABEL: convert_to_bitmask_with_different_types_in_chain: -; GISEL: ; %bb.0: -; GISEL-NEXT: sub sp, sp, #16 -; GISEL-NEXT: .cfi_def_cfa_offset 
16 -; GISEL-NEXT: cmeq.4s v1, v1, #0 -; GISEL-NEXT: cmeq.4h v0, v0, #0 -; GISEL-NEXT: xtn.4h v1, v1 -; GISEL-NEXT: orn.8b v0, v1, v0 -; GISEL-NEXT: ushll.4s v0, v0, #0 -; GISEL-NEXT: mov.s w8, v0[1] -; GISEL-NEXT: mov.s w9, v0[2] -; GISEL-NEXT: fmov w11, s0 -; GISEL-NEXT: mov.s w10, v0[3] -; GISEL-NEXT: and w8, w8, #0x1 -; GISEL-NEXT: bfi w11, w8, #1, #31 -; GISEL-NEXT: and w8, w9, #0x1 -; GISEL-NEXT: and w9, w10, #0x1 -; GISEL-NEXT: orr w8, w11, w8, lsl #2 -; GISEL-NEXT: orr w8, w8, w9, lsl #3 -; GISEL-NEXT: strb w8, [sp, #15] -; GISEL-NEXT: and w0, w8, #0xff -; GISEL-NEXT: add sp, sp, #16 -; GISEL-NEXT: ret +; CHECK-GI-LABEL: convert_to_bitmask_with_different_types_in_chain: +; CHECK-GI: ; %bb.0: +; CHECK-GI-NEXT: sub sp, sp, #16 +; CHECK-GI-NEXT: .cfi_def_cfa_offset 16 +; CHECK-GI-NEXT: cmeq.4s v1, v1, #0 +; CHECK-GI-NEXT: cmeq.4h v0, v0, #0 +; CHECK-GI-NEXT: xtn.4h v1, v1 +; CHECK-GI-NEXT: orn.8b v0, v1, v0 +; CHECK-GI-NEXT: ushll.4s v0, v0, #0 +; CHECK-GI-NEXT: mov.s w8, v0[1] +; CHECK-GI-NEXT: mov.s w9, v0[2] +; CHECK-GI-NEXT: fmov w11, s0 +; CHECK-GI-NEXT: mov.s w10, v0[3] +; CHECK-GI-NEXT: and w8, w8, #0x1 +; CHECK-GI-NEXT: bfi w11, w8, #1, #31 +; CHECK-GI-NEXT: and w8, w9, #0x1 +; CHECK-GI-NEXT: and w9, w10, #0x1 +; CHECK-GI-NEXT: orr w8, w11, w8, lsl #2 +; CHECK-GI-NEXT: orr w8, w8, w9, lsl #3 +; CHECK-GI-NEXT: strb w8, [sp, #15] +; CHECK-GI-NEXT: and w0, w8, #0xff +; CHECK-GI-NEXT: add sp, sp, #16 +; CHECK-GI-NEXT: ret %cmp1 = icmp ne <4 x i16> %vec1, zeroinitializer @@ -486,73 +487,73 @@ define i4 @convert_to_bitmask_with_different_types_in_chain(<4 x i16> %vec1, <4 } define i16 @convert_to_bitmask_without_knowing_type(<16 x i1> %vec) { -; SDAG-LABEL: convert_to_bitmask_without_knowing_type: -; SDAG: ; %bb.0: -; SDAG-NEXT: shl.16b v0, v0, #7 -; SDAG-NEXT: adrp x8, lCPI10_0@PAGE -; SDAG-NEXT: ldr q1, [x8, lCPI10_0@PAGEOFF] -; SDAG-NEXT: cmlt.16b v0, v0, #0 -; SDAG-NEXT: and.16b v0, v0, v1 -; SDAG-NEXT: ext.16b v1, v0, v0, #8 -; SDAG-NEXT: zip1.16b v0, 
v0, v1 -; SDAG-NEXT: addv.8h h0, v0 -; SDAG-NEXT: fmov w0, s0 -; SDAG-NEXT: ret +; CHECK-SD-LABEL: convert_to_bitmask_without_knowing_type: +; CHECK-SD: ; %bb.0: +; CHECK-SD-NEXT: shl.16b v0, v0, #7 +; CHECK-SD-NEXT: adrp x8, lCPI10_0@PAGE +; CHECK-SD-NEXT: ldr q1, [x8, lCPI10_0@PAGEOFF] +; CHECK-SD-NEXT: cmlt.16b v0, v0, #0 +; CHECK-SD-NEXT: and.16b v0, v0, v1 +; CHECK-SD-NEXT: ext.16b v1, v0, v0, #8 +; CHECK-SD-NEXT: zip1.16b v0, v0, v1 +; CHECK-SD-NEXT: addv.8h h0, v0 +; CHECK-SD-NEXT: fmov w0, s0 +; CHECK-SD-NEXT: ret ; -; GISEL-LABEL: convert_to_bitmask_without_knowing_type: -; GISEL: ; %bb.0: -; GISEL-NEXT: sub sp, sp, #16 -; GISEL-NEXT: .cfi_def_cfa_offset 16 -; GISEL-NEXT: umov.b w8, v0[1] -; GISEL-NEXT: umov.b w9, v0[0] -; GISEL-NEXT: umov.b w10, v0[2] -; GISEL-NEXT: umov.b w11, v0[3] -; GISEL-NEXT: and w8, w8, #0x1 -; GISEL-NEXT: bfi w9, w8, #1, #31 -; GISEL-NEXT: and w8, w10, #0x1 -; GISEL-NEXT: umov.b w10, v0[4] -; GISEL-NEXT: orr w8, w9, w8, lsl #2 -; GISEL-NEXT: and w9, w11, #0x1 -; GISEL-NEXT: umov.b w11, v0[5] -; GISEL-NEXT: orr w8, w8, w9, lsl #3 -; GISEL-NEXT: and w9, w10, #0x1 -; GISEL-NEXT: umov.b w10, v0[6] -; GISEL-NEXT: orr w8, w8, w9, lsl #4 -; GISEL-NEXT: and w9, w11, #0x1 -; GISEL-NEXT: umov.b w11, v0[7] -; GISEL-NEXT: orr w8, w8, w9, lsl #5 -; GISEL-NEXT: and w9, w10, #0x1 -; GISEL-NEXT: umov.b w10, v0[8] -; GISEL-NEXT: orr w8, w8, w9, lsl #6 -; GISEL-NEXT: and w9, w11, #0x1 -; GISEL-NEXT: umov.b w11, v0[9] -; GISEL-NEXT: orr w8, w8, w9, lsl #7 -; GISEL-NEXT: and w9, w10, #0x1 -; GISEL-NEXT: umov.b w10, v0[10] -; GISEL-NEXT: orr w8, w8, w9, lsl #8 -; GISEL-NEXT: and w9, w11, #0x1 -; GISEL-NEXT: umov.b w11, v0[11] -; GISEL-NEXT: orr w8, w8, w9, lsl #9 -; GISEL-NEXT: and w9, w10, #0x1 -; GISEL-NEXT: umov.b w10, v0[12] -; GISEL-NEXT: orr w8, w8, w9, lsl #10 -; GISEL-NEXT: and w9, w11, #0x1 -; GISEL-NEXT: umov.b w11, v0[13] -; GISEL-NEXT: orr w8, w8, w9, lsl #11 -; GISEL-NEXT: and w9, w10, #0x1 -; GISEL-NEXT: umov.b w10, v0[14] -; GISEL-NEXT: 
orr w8, w8, w9, lsl #12 -; GISEL-NEXT: and w9, w11, #0x1 -; GISEL-NEXT: umov.b w11, v0[15] -; GISEL-NEXT: orr w8, w8, w9, lsl #13 -; GISEL-NEXT: and w9, w10, #0x1 -; GISEL-NEXT: orr w8, w8, w9, lsl #14 -; GISEL-NEXT: and w9, w11, #0x1 -; GISEL-NEXT: orr w8, w8, w9, lsl #15 -; GISEL-NEXT: strh w8, [sp, #14] -; GISEL-NEXT: and w0, w8, #0xffff -; GISEL-NEXT: add sp, sp, #16 -; GISEL-NEXT: ret +; CHECK-GI-LABEL: convert_to_bitmask_without_knowing_type: +; CHECK-GI: ; %bb.0: +; CHECK-GI-NEXT: sub sp, sp, #16 +; CHECK-GI-NEXT: .cfi_def_cfa_offset 16 +; CHECK-GI-NEXT: umov.b w8, v0[1] +; CHECK-GI-NEXT: umov.b w9, v0[0] +; CHECK-GI-NEXT: umov.b w10, v0[2] +; CHECK-GI-NEXT: umov.b w11, v0[3] +; CHECK-GI-NEXT: and w8, w8, #0x1 +; CHECK-GI-NEXT: bfi w9, w8, #1, #31 +; CHECK-GI-NEXT: and w8, w10, #0x1 +; CHECK-GI-NEXT: umov.b w10, v0[4] +; CHECK-GI-NEXT: orr w8, w9, w8, lsl #2 +; CHECK-GI-NEXT: and w9, w11, #0x1 +; CHECK-GI-NEXT: umov.b w11, v0[5] +; CHECK-GI-NEXT: orr w8, w8, w9, lsl #3 +; CHECK-GI-NEXT: and w9, w10, #0x1 +; CHECK-GI-NEXT: umov.b w10, v0[6] +; CHECK-GI-NEXT: orr w8, w8, w9, lsl #4 +; CHECK-GI-NEXT: and w9, w11, #0x1 +; CHECK-GI-NEXT: umov.b w11, v0[7] +; CHECK-GI-NEXT: orr w8, w8, w9, lsl #5 +; CHECK-GI-NEXT: and w9, w10, #0x1 +; CHECK-GI-NEXT: umov.b w10, v0[8] +; CHECK-GI-NEXT: orr w8, w8, w9, lsl #6 +; CHECK-GI-NEXT: and w9, w11, #0x1 +; CHECK-GI-NEXT: umov.b w11, v0[9] +; CHECK-GI-NEXT: orr w8, w8, w9, lsl #7 +; CHECK-GI-NEXT: and w9, w10, #0x1 +; CHECK-GI-NEXT: umov.b w10, v0[10] +; CHECK-GI-NEXT: orr w8, w8, w9, lsl #8 +; CHECK-GI-NEXT: and w9, w11, #0x1 +; CHECK-GI-NEXT: umov.b w11, v0[11] +; CHECK-GI-NEXT: orr w8, w8, w9, lsl #9 +; CHECK-GI-NEXT: and w9, w10, #0x1 +; CHECK-GI-NEXT: umov.b w10, v0[12] +; CHECK-GI-NEXT: orr w8, w8, w9, lsl #10 +; CHECK-GI-NEXT: and w9, w11, #0x1 +; CHECK-GI-NEXT: umov.b w11, v0[13] +; CHECK-GI-NEXT: orr w8, w8, w9, lsl #11 +; CHECK-GI-NEXT: and w9, w10, #0x1 +; CHECK-GI-NEXT: umov.b w10, v0[14] +; CHECK-GI-NEXT: orr w8, 
w8, w9, lsl #12 +; CHECK-GI-NEXT: and w9, w11, #0x1 +; CHECK-GI-NEXT: umov.b w11, v0[15] +; CHECK-GI-NEXT: orr w8, w8, w9, lsl #13 +; CHECK-GI-NEXT: and w9, w10, #0x1 +; CHECK-GI-NEXT: orr w8, w8, w9, lsl #14 +; CHECK-GI-NEXT: and w9, w11, #0x1 +; CHECK-GI-NEXT: orr w8, w8, w9, lsl #15 +; CHECK-GI-NEXT: strh w8, [sp, #14] +; CHECK-GI-NEXT: and w0, w8, #0xffff +; CHECK-GI-NEXT: add sp, sp, #16 +; CHECK-GI-NEXT: ret %bitmask = bitcast <16 x i1> %vec to i16 ret i16 %bitmask @@ -575,51 +576,51 @@ define i2 @convert_to_bitmask_2xi32(<2 x i32> %vec) { } define i4 @convert_to_bitmask_4xi8(<4 x i8> %vec) { -; SDAG-LABEL: convert_to_bitmask_4xi8: -; SDAG: ; %bb.0: -; SDAG-NEXT: bic.4h v0, #255, lsl #8 -; SDAG-NEXT: adrp x8, lCPI12_0@PAGE -; SDAG-NEXT: ldr d1, [x8, lCPI12_0@PAGEOFF] -; SDAG-NEXT: cmeq.4h v0, v0, #0 -; SDAG-NEXT: bic.8b v0, v1, v0 -; SDAG-NEXT: addv.4h h0, v0 -; SDAG-NEXT: fmov w0, s0 -; SDAG-NEXT: ret +; CHECK-SD-LABEL: convert_to_bitmask_4xi8: +; CHECK-SD: ; %bb.0: +; CHECK-SD-NEXT: bic.4h v0, #255, lsl #8 +; CHECK-SD-NEXT: adrp x8, lCPI12_0@PAGE +; CHECK-SD-NEXT: ldr d1, [x8, lCPI12_0@PAGEOFF] +; CHECK-SD-NEXT: cmeq.4h v0, v0, #0 +; CHECK-SD-NEXT: bic.8b v0, v1, v0 +; CHECK-SD-NEXT: addv.4h h0, v0 +; CHECK-SD-NEXT: fmov w0, s0 +; CHECK-SD-NEXT: ret ; -; GISEL-LABEL: convert_to_bitmask_4xi8: -; GISEL: ; %bb.0: -; GISEL-NEXT: sub sp, sp, #16 -; GISEL-NEXT: .cfi_def_cfa_offset 16 -; GISEL-NEXT: mov w8, #0 ; =0x0 -; GISEL-NEXT: uzp1.8b v0, v0, v0 -; GISEL-NEXT: fmov s1, w8 -; GISEL-NEXT: mov.b v1[1], w8 -; GISEL-NEXT: mov.b v1[2], w8 -; GISEL-NEXT: mov.b v1[3], w8 -; GISEL-NEXT: cmeq.8b v0, v0, v1 -; GISEL-NEXT: mvn.8b v0, v0 -; GISEL-NEXT: umov.b w8, v0[0] -; GISEL-NEXT: umov.b w9, v0[1] -; GISEL-NEXT: mov.s v1[0], w8 -; GISEL-NEXT: umov.b w8, v0[2] -; GISEL-NEXT: mov.s v1[1], w9 -; GISEL-NEXT: umov.b w9, v0[3] -; GISEL-NEXT: mov.s v1[2], w8 -; GISEL-NEXT: mov.s v1[3], w9 -; GISEL-NEXT: mov.s w8, v1[1] -; GISEL-NEXT: mov.s w9, v1[2] -; GISEL-NEXT: fmov w11, 
s1 -; GISEL-NEXT: mov.s w10, v1[3] -; GISEL-NEXT: and w8, w8, #0x1 -; GISEL-NEXT: bfi w11, w8, #1, #31 -; GISEL-NEXT: and w8, w9, #0x1 -; GISEL-NEXT: and w9, w10, #0x1 -; GISEL-NEXT: orr w8, w11, w8, lsl #2 -; GISEL-NEXT: orr w8, w8, w9, lsl #3 -; GISEL-NEXT: strb w8, [sp, #15] -; GISEL-NEXT: and w0, w8, #0xff -; GISEL-NEXT: add sp, sp, #16 -; GISEL-NEXT: ret +; CHECK-GI-LABEL: convert_to_bitmask_4xi8: +; CHECK-GI: ; %bb.0: +; CHECK-GI-NEXT: sub sp, sp, #16 +; CHECK-GI-NEXT: .cfi_def_cfa_offset 16 +; CHECK-GI-NEXT: mov w8, #0 ; =0x0 +; CHECK-GI-NEXT: uzp1.8b v0, v0, v0 +; CHECK-GI-NEXT: fmov s1, w8 +; CHECK-GI-NEXT: mov.b v1[1], w8 +; CHECK-GI-NEXT: mov.b v1[2], w8 +; CHECK-GI-NEXT: mov.b v1[3], w8 +; CHECK-GI-NEXT: cmeq.8b v0, v0, v1 +; CHECK-GI-NEXT: mvn.8b v0, v0 +; CHECK-GI-NEXT: umov.b w8, v0[0] +; CHECK-GI-NEXT: umov.b w9, v0[1] +; CHECK-GI-NEXT: mov.s v1[0], w8 +; CHECK-GI-NEXT: umov.b w8, v0[2] +; CHECK-GI-NEXT: mov.s v1[1], w9 +; CHECK-GI-NEXT: umov.b w9, v0[3] +; CHECK-GI-NEXT: mov.s v1[2], w8 +; CHECK-GI-NEXT: mov.s v1[3], w9 +; CHECK-GI-NEXT: mov.s w8, v1[1] +; CHECK-GI-NEXT: mov.s w9, v1[2] +; CHECK-GI-NEXT: fmov w11, s1 +; CHECK-GI-NEXT: mov.s w10, v1[3] +; CHECK-GI-NEXT: and w8, w8, #0x1 +; CHECK-GI-NEXT: bfi w11, w8, #1, #31 +; CHECK-GI-NEXT: and w8, w9, #0x1 +; CHECK-GI-NEXT: and w9, w10, #0x1 +; CHECK-GI-NEXT: orr w8, w11, w8, lsl #2 +; CHECK-GI-NEXT: orr w8, w8, w9, lsl #3 +; CHECK-GI-NEXT: strb w8, [sp, #15] +; CHECK-GI-NEXT: and w0, w8, #0xff +; CHECK-GI-NEXT: add sp, sp, #16 +; CHECK-GI-NEXT: ret %cmp_result = icmp ne <4 x i8> %vec, zeroinitializer %bitmask = bitcast <4 x i1> %cmp_result to i4 @@ -645,39 +646,39 @@ define i8 @convert_to_bitmask_8xi2(<8 x i2> %vec) { } define i4 @convert_to_bitmask_float(<4 x float> %vec) { -; SDAG-LABEL: convert_to_bitmask_float: -; SDAG: ; %bb.0: -; SDAG-NEXT: fcmgt.4s v1, v0, #0.0 -; SDAG-NEXT: fcmlt.4s v0, v0, #0.0 -; SDAG-NEXT: adrp x8, lCPI14_0@PAGE -; SDAG-NEXT: orr.16b v0, v0, v1 -; SDAG-NEXT: ldr q1, 
[x8, lCPI14_0@PAGEOFF] -; SDAG-NEXT: and.16b v0, v0, v1 -; SDAG-NEXT: addv.4s s0, v0 -; SDAG-NEXT: fmov w0, s0 -; SDAG-NEXT: ret +; CHECK-SD-LABEL: convert_to_bitmask_float: +; CHECK-SD: ; %bb.0: +; CHECK-SD-NEXT: fcmgt.4s v1, v0, #0.0 +; CHECK-SD-NEXT: fcmlt.4s v0, v0, #0.0 +; CHECK-SD-NEXT: adrp x8, lCPI14_0@PAGE +; CHECK-SD-NEXT: orr.16b v0, v0, v1 +; CHECK-SD-NEXT: ldr q1, [x8, lCPI14_0@PAGEOFF] +; CHECK-SD-NEXT: and.16b v0, v0, v1 +; CHECK-SD-NEXT: addv.4s s0, v0 +; CHECK-SD-NEXT: fmov w0, s0 +; CHECK-SD-NEXT: ret ; -; GISEL-LABEL: convert_to_bitmask_float: -; GISEL: ; %bb.0: -; GISEL-NEXT: sub sp, sp, #16 -; GISEL-NEXT: .cfi_def_cfa_offset 16 -; GISEL-NEXT: fcmgt.4s v1, v0, #0.0 -; GISEL-NEXT: fcmlt.4s v0, v0, #0.0 -; GISEL-NEXT: orr.16b v0, v0, v1 -; GISEL-NEXT: mov.s w8, v0[1] -; GISEL-NEXT: mov.s w9, v0[2] -; GISEL-NEXT: fmov w11, s0 -; GISEL-NEXT: mov.s w10, v0[3] -; GISEL-NEXT: and w8, w8, #0x1 -; GISEL-NEXT: bfi w11, w8, #1, #31 -; GISEL-NEXT: and w8, w9, #0x1 -; GISEL-NEXT: and w9, w10, #0x1 -; GISEL-NEXT: orr w8, w11, w8, lsl #2 -; GISEL-NEXT: orr w8, w8, w9, lsl #3 -; GISEL-NEXT: strb w8, [sp, #15] -; GISEL-NEXT: and w0, w8, #0xff -; GISEL-NEXT: add sp, sp, #16 -; GISEL-NEXT: ret +; CHECK-GI-LABEL: convert_to_bitmask_float: +; CHECK-GI: ; %bb.0: +; CHECK-GI-NEXT: sub sp, sp, #16 +; CHECK-GI-NEXT: .cfi_def_cfa_offset 16 +; CHECK-GI-NEXT: fcmgt.4s v1, v0, #0.0 +; CHECK-GI-NEXT: fcmlt.4s v0, v0, #0.0 +; CHECK-GI-NEXT: orr.16b v0, v0, v1 +; CHECK-GI-NEXT: mov.s w8, v0[1] +; CHECK-GI-NEXT: mov.s w9, v0[2] +; CHECK-GI-NEXT: fmov w11, s0 +; CHECK-GI-NEXT: mov.s w10, v0[3] +; CHECK-GI-NEXT: and w8, w8, #0x1 +; CHECK-GI-NEXT: bfi w11, w8, #1, #31 +; CHECK-GI-NEXT: and w8, w9, #0x1 +; CHECK-GI-NEXT: and w9, w10, #0x1 +; CHECK-GI-NEXT: orr w8, w11, w8, lsl #2 +; CHECK-GI-NEXT: orr w8, w8, w9, lsl #3 +; CHECK-GI-NEXT: strb w8, [sp, #15] +; CHECK-GI-NEXT: and w0, w8, #0xff +; CHECK-GI-NEXT: add sp, sp, #16 +; CHECK-GI-NEXT: ret %cmp_result = fcmp one <4 x float> 
%vec, zeroinitializer @@ -688,58 +689,58 @@ define i4 @convert_to_bitmask_float(<4 x float> %vec) { ; Larger vector types don't map directly, but the can be split/truncated and then converted. ; After the comparison against 0, this is truncated to <8 x i16>, which is valid again. define i8 @convert_large_vector(<8 x i32> %vec) { -; SDAG-LABEL: convert_large_vector: -; SDAG: ; %bb.0: -; SDAG-NEXT: sub sp, sp, #16 -; SDAG-NEXT: .cfi_def_cfa_offset 16 -; SDAG-NEXT: cmeq.4s v1, v1, #0 -; SDAG-NEXT: cmeq.4s v0, v0, #0 -; SDAG-NEXT: adrp x8, lCPI15_0@PAGE -; SDAG-NEXT: uzp1.8h v0, v0, v1 -; SDAG-NEXT: ldr q1, [x8, lCPI15_0@PAGEOFF] -; SDAG-NEXT: bic.16b v0, v1, v0 -; SDAG-NEXT: addv.8h h0, v0 -; SDAG-NEXT: fmov w8, s0 -; SDAG-NEXT: and w0, w8, #0xff -; SDAG-NEXT: add sp, sp, #16 -; SDAG-NEXT: ret +; CHECK-SD-LABEL: convert_large_vector: +; CHECK-SD: ; %bb.0: +; CHECK-SD-NEXT: sub sp, sp, #16 +; CHECK-SD-NEXT: .cfi_def_cfa_offset 16 +; CHECK-SD-NEXT: cmeq.4s v1, v1, #0 +; CHECK-SD-NEXT: cmeq.4s v0, v0, #0 +; CHECK-SD-NEXT: adrp x8, lCPI15_0@PAGE +; CHECK-SD-NEXT: uzp1.8h v0, v0, v1 +; CHECK-SD-NEXT: ldr q1, [x8, lCPI15_0@PAGEOFF] +; CHECK-SD-NEXT: bic.16b v0, v1, v0 +; CHECK-SD-NEXT: addv.8h h0, v0 +; CHECK-SD-NEXT: fmov w8, s0 +; CHECK-SD-NEXT: and w0, w8, #0xff +; CHECK-SD-NEXT: add sp, sp, #16 +; CHECK-SD-NEXT: ret ; -; GISEL-LABEL: convert_large_vector: -; GISEL: ; %bb.0: -; GISEL-NEXT: sub sp, sp, #16 -; GISEL-NEXT: .cfi_def_cfa_offset 16 -; GISEL-NEXT: cmeq.4s v0, v0, #0 -; GISEL-NEXT: cmeq.4s v1, v1, #0 -; GISEL-NEXT: mvn.16b v0, v0 -; GISEL-NEXT: mvn.16b v1, v1 -; GISEL-NEXT: uzp1.8h v0, v0, v1 -; GISEL-NEXT: xtn.8b v0, v0 -; GISEL-NEXT: umov.b w8, v0[1] -; GISEL-NEXT: umov.b w9, v0[0] -; GISEL-NEXT: umov.b w10, v0[2] -; GISEL-NEXT: umov.b w11, v0[3] -; GISEL-NEXT: and w8, w8, #0x1 -; GISEL-NEXT: bfi w9, w8, #1, #31 -; GISEL-NEXT: and w8, w10, #0x1 -; GISEL-NEXT: umov.b w10, v0[4] -; GISEL-NEXT: orr w8, w9, w8, lsl #2 -; GISEL-NEXT: and w9, w11, #0x1 -; 
GISEL-NEXT: umov.b w11, v0[5] -; GISEL-NEXT: orr w8, w8, w9, lsl #3 -; GISEL-NEXT: and w9, w10, #0x1 -; GISEL-NEXT: umov.b w10, v0[6] -; GISEL-NEXT: orr w8, w8, w9, lsl #4 -; GISEL-NEXT: and w9, w11, #0x1 -; GISEL-NEXT: umov.b w11, v0[7] -; GISEL-NEXT: orr w8, w8, w9, lsl #5 -; GISEL-NEXT: and w9, w10, #0x1 -; GISEL-NEXT: orr w8, w8, w9, lsl #6 -; GISEL-NEXT: and w9, w11, #0x1 -; GISEL-NEXT: orr w8, w8, w9, lsl #7 -; GISEL-NEXT: strb w8, [sp, #15] -; GISEL-NEXT: and w0, w8, #0xff -; GISEL-NEXT: add sp, sp, #16 -; GISEL-NEXT: ret +; CHECK-GI-LABEL: convert_large_vector: +; CHECK-GI: ; %bb.0: +; CHECK-GI-NEXT: sub sp, sp, #16 +; CHECK-GI-NEXT: .cfi_def_cfa_offset 16 +; CHECK-GI-NEXT: cmeq.4s v0, v0, #0 +; CHECK-GI-NEXT: cmeq.4s v1, v1, #0 +; CHECK-GI-NEXT: mvn.16b v0, v0 +; CHECK-GI-NEXT: mvn.16b v1, v1 +; CHECK-GI-NEXT: uzp1.8h v0, v0, v1 +; CHECK-GI-NEXT: xtn.8b v0, v0 +; CHECK-GI-NEXT: umov.b w8, v0[1] +; CHECK-GI-NEXT: umov.b w9, v0[0] +; CHECK-GI-NEXT: umov.b w10, v0[2] +; CHECK-GI-NEXT: umov.b w11, v0[3] +; CHECK-GI-NEXT: and w8, w8, #0x1 +; CHECK-GI-NEXT: bfi w9, w8, #1, #31 +; CHECK-GI-NEXT: and w8, w10, #0x1 +; CHECK-GI-NEXT: umov.b w10, v0[4] +; CHECK-GI-NEXT: orr w8, w9, w8, lsl #2 +; CHECK-GI-NEXT: and w9, w11, #0x1 +; CHECK-GI-NEXT: umov.b w11, v0[5] +; CHECK-GI-NEXT: orr w8, w8, w9, lsl #3 +; CHECK-GI-NEXT: and w9, w10, #0x1 +; CHECK-GI-NEXT: umov.b w10, v0[6] +; CHECK-GI-NEXT: orr w8, w8, w9, lsl #4 +; CHECK-GI-NEXT: and w9, w11, #0x1 +; CHECK-GI-NEXT: umov.b w11, v0[7] +; CHECK-GI-NEXT: orr w8, w8, w9, lsl #5 +; CHECK-GI-NEXT: and w9, w10, #0x1 +; CHECK-GI-NEXT: orr w8, w8, w9, lsl #6 +; CHECK-GI-NEXT: and w9, w11, #0x1 +; CHECK-GI-NEXT: orr w8, w8, w9, lsl #7 +; CHECK-GI-NEXT: strb w8, [sp, #15] +; CHECK-GI-NEXT: and w0, w8, #0xff +; CHECK-GI-NEXT: add sp, sp, #16 +; CHECK-GI-NEXT: ret %cmp_result = icmp ne <8 x i32> %vec, zeroinitializer @@ -748,40 +749,40 @@ define i8 @convert_large_vector(<8 x i32> %vec) { } define i4 
@convert_legalized_illegal_element_size(<4 x i22> %vec) { -; SDAG-LABEL: convert_legalized_illegal_element_size: -; SDAG: ; %bb.0: -; SDAG-NEXT: movi.4s v1, #63, msl #16 -; SDAG-NEXT: adrp x8, lCPI16_0@PAGE -; SDAG-NEXT: cmtst.4s v0, v0, v1 -; SDAG-NEXT: ldr d1, [x8, lCPI16_0@PAGEOFF] -; SDAG-NEXT: xtn.4h v0, v0 -; SDAG-NEXT: and.8b v0, v0, v1 -; SDAG-NEXT: addv.4h h0, v0 -; SDAG-NEXT: fmov w0, s0 -; SDAG-NEXT: ret +; CHECK-SD-LABEL: convert_legalized_illegal_element_size: +; CHECK-SD: ; %bb.0: +; CHECK-SD-NEXT: movi.4s v1, #63, msl #16 +; CHECK-SD-NEXT: adrp x8, lCPI16_0@PAGE +; CHECK-SD-NEXT: cmtst.4s v0, v0, v1 +; CHECK-SD-NEXT: ldr d1, [x8, lCPI16_0@PAGEOFF] +; CHECK-SD-NEXT: xtn.4h v0, v0 +; CHECK-SD-NEXT: and.8b v0, v0, v1 +; CHECK-SD-NEXT: addv.4h h0, v0 +; CHECK-SD-NEXT: fmov w0, s0 +; CHECK-SD-NEXT: ret ; -; GISEL-LABEL: convert_legalized_illegal_element_size: -; GISEL: ; %bb.0: -; GISEL-NEXT: sub sp, sp, #16 -; GISEL-NEXT: .cfi_def_cfa_offset 16 -; GISEL-NEXT: movi.4s v1, #63, msl #16 -; GISEL-NEXT: and.16b v0, v0, v1 -; GISEL-NEXT: cmeq.4s v0, v0, #0 -; GISEL-NEXT: mvn.16b v0, v0 -; GISEL-NEXT: mov.s w8, v0[1] -; GISEL-NEXT: mov.s w9, v0[2] -; GISEL-NEXT: fmov w11, s0 -; GISEL-NEXT: mov.s w10, v0[3] -; GISEL-NEXT: and w8, w8, #0x1 -; GISEL-NEXT: bfi w11, w8, #1, #31 -; GISEL-NEXT: and w8, w9, #0x1 -; GISEL-NEXT: and w9, w10, #0x1 -; GISEL-NEXT: orr w8, w11, w8, lsl #2 -; GISEL-NEXT: orr w8, w8, w9, lsl #3 -; GISEL-NEXT: strb w8, [sp, #15] -; GISEL-NEXT: and w0, w8, #0xff -; GISEL-NEXT: add sp, sp, #16 -; GISEL-NEXT: ret +; CHECK-GI-LABEL: convert_legalized_illegal_element_size: +; CHECK-GI: ; %bb.0: +; CHECK-GI-NEXT: sub sp, sp, #16 +; CHECK-GI-NEXT: .cfi_def_cfa_offset 16 +; CHECK-GI-NEXT: movi.4s v1, #63, msl #16 +; CHECK-GI-NEXT: and.16b v0, v0, v1 +; CHECK-GI-NEXT: cmeq.4s v0, v0, #0 +; CHECK-GI-NEXT: mvn.16b v0, v0 +; CHECK-GI-NEXT: mov.s w8, v0[1] +; CHECK-GI-NEXT: mov.s w9, v0[2] +; CHECK-GI-NEXT: fmov w11, s0 +; CHECK-GI-NEXT: mov.s w10, v0[3] +; 
CHECK-GI-NEXT: and w8, w8, #0x1 +; CHECK-GI-NEXT: bfi w11, w8, #1, #31 +; CHECK-GI-NEXT: and w8, w9, #0x1 +; CHECK-GI-NEXT: and w9, w10, #0x1 +; CHECK-GI-NEXT: orr w8, w11, w8, lsl #2 +; CHECK-GI-NEXT: orr w8, w8, w9, lsl #3 +; CHECK-GI-NEXT: strb w8, [sp, #15] +; CHECK-GI-NEXT: and w0, w8, #0xff +; CHECK-GI-NEXT: add sp, sp, #16 +; CHECK-GI-NEXT: ret %cmp_result = icmp ne <4 x i22> %vec, zeroinitializer %bitmask = bitcast <4 x i1> %cmp_result to i4 @@ -818,101 +819,101 @@ define i8 @no_direct_convert_for_bad_concat(<4 x i32> %vec) { } define <8 x i1> @no_convert_without_direct_bitcast(<8 x i16> %vec) { -; SDAG-LABEL: no_convert_without_direct_bitcast: -; SDAG: ; %bb.0: -; SDAG-NEXT: cmtst.8h v0, v0, v0 -; SDAG-NEXT: xtn.8b v0, v0 -; SDAG-NEXT: ret +; CHECK-SD-LABEL: no_convert_without_direct_bitcast: +; CHECK-SD: ; %bb.0: +; CHECK-SD-NEXT: cmtst.8h v0, v0, v0 +; CHECK-SD-NEXT: xtn.8b v0, v0 +; CHECK-SD-NEXT: ret ; -; GISEL-LABEL: no_convert_without_direct_bitcast: -; GISEL: ; %bb.0: -; GISEL-NEXT: cmeq.8h v0, v0, #0 -; GISEL-NEXT: mvn.16b v0, v0 -; GISEL-NEXT: xtn.8b v0, v0 -; GISEL-NEXT: ret +; CHECK-GI-LABEL: no_convert_without_direct_bitcast: +; CHECK-GI: ; %bb.0: +; CHECK-GI-NEXT: cmeq.8h v0, v0, #0 +; CHECK-GI-NEXT: mvn.16b v0, v0 +; CHECK-GI-NEXT: xtn.8b v0, v0 +; CHECK-GI-NEXT: ret %cmp_result = icmp ne <8 x i16> %vec, zeroinitializer ret <8 x i1> %cmp_result } define i6 @no_combine_illegal_num_elements(<6 x i32> %vec) { -; SDAG-LABEL: no_combine_illegal_num_elements: -; SDAG: ; %bb.0: -; SDAG-NEXT: sub sp, sp, #16 -; SDAG-NEXT: .cfi_def_cfa_offset 16 -; SDAG-NEXT: fmov s0, w0 -; SDAG-NEXT: fmov s1, w4 -; SDAG-NEXT: mov.s v0[1], w1 -; SDAG-NEXT: mov.s v1[1], w5 -; SDAG-NEXT: mov.s v0[2], w2 -; SDAG-NEXT: cmeq.4s v1, v1, #0 -; SDAG-NEXT: mov.s v0[3], w3 -; SDAG-NEXT: cmeq.4s v0, v0, #0 -; SDAG-NEXT: uzp1.8h v0, v0, v1 -; SDAG-NEXT: mvn.16b v0, v0 -; SDAG-NEXT: xtn.8b v0, v0 -; SDAG-NEXT: umov.b w8, v0[0] -; SDAG-NEXT: umov.b w9, v0[1] -; SDAG-NEXT: umov.b 
w10, v0[2] -; SDAG-NEXT: and w8, w8, #0x1 -; SDAG-NEXT: bfi w8, w9, #1, #1 -; SDAG-NEXT: umov.b w9, v0[3] -; SDAG-NEXT: bfi w8, w10, #2, #1 -; SDAG-NEXT: umov.b w10, v0[4] -; SDAG-NEXT: bfi w8, w9, #3, #1 -; SDAG-NEXT: umov.b w9, v0[5] -; SDAG-NEXT: bfi w8, w10, #4, #1 -; SDAG-NEXT: orr w8, w8, w9, lsl #5 -; SDAG-NEXT: and w0, w8, #0x3f -; SDAG-NEXT: add sp, sp, #16 -; SDAG-NEXT: ret +; CHECK-SD-LABEL: no_combine_illegal_num_elements: +; CHECK-SD: ; %bb.0: +; CHECK-SD-NEXT: sub sp, sp, #16 +; CHECK-SD-NEXT: .cfi_def_cfa_offset 16 +; CHECK-SD-NEXT: fmov s0, w0 +; CHECK-SD-NEXT: fmov s1, w4 +; CHECK-SD-NEXT: mov.s v0[1], w1 +; CHECK-SD-NEXT: mov.s v1[1], w5 +; CHECK-SD-NEXT: mov.s v0[2], w2 +; CHECK-SD-NEXT: cmeq.4s v1, v1, #0 +; CHECK-SD-NEXT: mov.s v0[3], w3 +; CHECK-SD-NEXT: cmeq.4s v0, v0, #0 +; CHECK-SD-NEXT: uzp1.8h v0, v0, v1 +; CHECK-SD-NEXT: mvn.16b v0, v0 +; CHECK-SD-NEXT: xtn.8b v0, v0 +; CHECK-SD-NEXT: umov.b w8, v0[0] +; CHECK-SD-NEXT: umov.b w9, v0[1] +; CHECK-SD-NEXT: umov.b w10, v0[2] +; CHECK-SD-NEXT: and w8, w8, #0x1 +; CHECK-SD-NEXT: bfi w8, w9, #1, #1 +; CHECK-SD-NEXT: umov.b w9, v0[3] +; CHECK-SD-NEXT: bfi w8, w10, #2, #1 +; CHECK-SD-NEXT: umov.b w10, v0[4] +; CHECK-SD-NEXT: bfi w8, w9, #3, #1 +; CHECK-SD-NEXT: umov.b w9, v0[5] +; CHECK-SD-NEXT: bfi w8, w10, #4, #1 +; CHECK-SD-NEXT: orr w8, w8, w9, lsl #5 +; CHECK-SD-NEXT: and w0, w8, #0x3f +; CHECK-SD-NEXT: add sp, sp, #16 +; CHECK-SD-NEXT: ret ; -; GISEL-LABEL: no_combine_illegal_num_elements: -; GISEL: ; %bb.0: -; GISEL-NEXT: sub sp, sp, #16 -; GISEL-NEXT: .cfi_def_cfa_offset 16 -; GISEL-NEXT: mov.s v0[0], w0 -; GISEL-NEXT: mov.s v1[0], w4 -; GISEL-NEXT: mov.s v2[0], wzr -; GISEL-NEXT: mov.s v0[1], w1 -; GISEL-NEXT: mov.s v1[1], w5 -; GISEL-NEXT: mov.s v2[1], wzr -; GISEL-NEXT: mov.s v0[2], w2 -; GISEL-NEXT: cmeq.4s v1, v1, v2 -; GISEL-NEXT: mvn.16b v1, v1 -; GISEL-NEXT: mov.s v0[3], w3 -; GISEL-NEXT: cmeq.4s v0, v0, #0 -; GISEL-NEXT: mvn.16b v0, v0 -; GISEL-NEXT: mov.s w8, v0[1] -; 
GISEL-NEXT: mov.s w9, v0[2] -; GISEL-NEXT: mov.s w10, v0[3] -; GISEL-NEXT: mov.h v0[1], w8 -; GISEL-NEXT: mov.s w8, v1[1] -; GISEL-NEXT: mov.h v0[2], w9 -; GISEL-NEXT: mov.h v0[3], w10 -; GISEL-NEXT: mov.h v0[4], v1[0] -; GISEL-NEXT: mov.h v0[5], w8 -; GISEL-NEXT: umov.h w8, v0[1] -; GISEL-NEXT: umov.h w9, v0[0] -; GISEL-NEXT: umov.h w10, v0[2] -; GISEL-NEXT: umov.h w11, v0[3] -; GISEL-NEXT: and w8, w8, #0x1 -; GISEL-NEXT: bfi w9, w8, #1, #31 -; GISEL-NEXT: and w8, w10, #0x1 -; GISEL-NEXT: umov.h w10, v0[4] -; GISEL-NEXT: orr w8, w9, w8, lsl #2 -; GISEL-NEXT: and w9, w11, #0x1 -; GISEL-NEXT: umov.h w11, v0[5] -; GISEL-NEXT: orr w8, w8, w9, lsl #3 -; GISEL-NEXT: and w9, w10, #0x1 -; GISEL-NEXT: orr w8, w8, w9, lsl #4 -; GISEL-NEXT: and w9, w11, #0x1 -; GISEL-NEXT: orr w8, w8, w9, lsl #5 -; GISEL-NEXT: and w8, w8, #0x3f -; GISEL-NEXT: strb w8, [sp, #15] -; GISEL-NEXT: and w0, w8, #0xff -; GISEL-NEXT: add sp, sp, #16 -; GISEL-NEXT: ret +; CHECK-GI-LABEL: no_combine_illegal_num_elements: +; CHECK-GI: ; %bb.0: +; CHECK-GI-NEXT: sub sp, sp, #16 +; CHECK-GI-NEXT: .cfi_def_cfa_offset 16 +; CHECK-GI-NEXT: mov.s v0[0], w0 +; CHECK-GI-NEXT: mov.s v1[0], w4 +; CHECK-GI-NEXT: mov.s v2[0], wzr +; CHECK-GI-NEXT: mov.s v0[1], w1 +; CHECK-GI-NEXT: mov.s v1[1], w5 +; CHECK-GI-NEXT: mov.s v2[1], wzr +; CHECK-GI-NEXT: mov.s v0[2], w2 +; CHECK-GI-NEXT: cmeq.4s v1, v1, v2 +; CHECK-GI-NEXT: mvn.16b v1, v1 +; CHECK-GI-NEXT: mov.s v0[3], w3 +; CHECK-GI-NEXT: cmeq.4s v0, v0, #0 +; CHECK-GI-NEXT: mvn.16b v0, v0 +; CHECK-GI-NEXT: mov.s w8, v0[1] +; CHECK-GI-NEXT: mov.s w9, v0[2] +; CHECK-GI-NEXT: mov.s w10, v0[3] +; CHECK-GI-NEXT: mov.h v0[1], w8 +; CHECK-GI-NEXT: mov.s w8, v1[1] +; CHECK-GI-NEXT: mov.h v0[2], w9 +; CHECK-GI-NEXT: mov.h v0[3], w10 +; CHECK-GI-NEXT: mov.h v0[4], v1[0] +; CHECK-GI-NEXT: mov.h v0[5], w8 +; CHECK-GI-NEXT: umov.h w8, v0[1] +; CHECK-GI-NEXT: umov.h w9, v0[0] +; CHECK-GI-NEXT: umov.h w10, v0[2] +; CHECK-GI-NEXT: umov.h w11, v0[3] +; CHECK-GI-NEXT: and w8, w8, #0x1 
+; CHECK-GI-NEXT: bfi w9, w8, #1, #31 +; CHECK-GI-NEXT: and w8, w10, #0x1 +; CHECK-GI-NEXT: umov.h w10, v0[4] +; CHECK-GI-NEXT: orr w8, w9, w8, lsl #2 +; CHECK-GI-NEXT: and w9, w11, #0x1 +; CHECK-GI-NEXT: umov.h w11, v0[5] +; CHECK-GI-NEXT: orr w8, w8, w9, lsl #3 +; CHECK-GI-NEXT: and w9, w10, #0x1 +; CHECK-GI-NEXT: orr w8, w8, w9, lsl #4 +; CHECK-GI-NEXT: and w9, w11, #0x1 +; CHECK-GI-NEXT: orr w8, w8, w9, lsl #5 +; CHECK-GI-NEXT: and w8, w8, #0x3f +; CHECK-GI-NEXT: strb w8, [sp, #15] +; CHECK-GI-NEXT: and w0, w8, #0xff +; CHECK-GI-NEXT: add sp, sp, #16 +; CHECK-GI-NEXT: ret %cmp_result = icmp ne <6 x i32> %vec, zeroinitializer %bitmask = bitcast <6 x i1> %cmp_result to i6 @@ -921,220 +922,220 @@ define i6 @no_combine_illegal_num_elements(<6 x i32> %vec) { ; Only apply the combine when casting a vector to a scalar. define <2 x i8> @vector_to_vector_cast(<16 x i1> %arg) nounwind { -; SDAG-LABEL: vector_to_vector_cast: -; SDAG: ; %bb.0: -; SDAG-NEXT: sub sp, sp, #16 -; SDAG-NEXT: shl.16b v0, v0, #7 -; SDAG-NEXT: adrp x8, lCPI20_0@PAGE -; SDAG-NEXT: ldr q1, [x8, lCPI20_0@PAGEOFF] -; SDAG-NEXT: add x8, sp, #14 -; SDAG-NEXT: cmlt.16b v0, v0, #0 -; SDAG-NEXT: and.16b v0, v0, v1 -; SDAG-NEXT: ext.16b v1, v0, v0, #8 -; SDAG-NEXT: zip1.16b v0, v0, v1 -; SDAG-NEXT: addv.8h h0, v0 -; SDAG-NEXT: str h0, [sp, #14] -; SDAG-NEXT: ld1.b { v0 }[0], [x8] -; SDAG-NEXT: orr x8, x8, #0x1 -; SDAG-NEXT: ld1.b { v0 }[4], [x8] -; SDAG-NEXT: ; kill: def $d0 killed $d0 killed $q0 -; SDAG-NEXT: add sp, sp, #16 -; SDAG-NEXT: ret +; CHECK-SD-LABEL: vector_to_vector_cast: +; CHECK-SD: ; %bb.0: +; CHECK-SD-NEXT: sub sp, sp, #16 +; CHECK-SD-NEXT: shl.16b v0, v0, #7 +; CHECK-SD-NEXT: adrp x8, lCPI20_0@PAGE +; CHECK-SD-NEXT: ldr q1, [x8, lCPI20_0@PAGEOFF] +; CHECK-SD-NEXT: add x8, sp, #14 +; CHECK-SD-NEXT: cmlt.16b v0, v0, #0 +; CHECK-SD-NEXT: and.16b v0, v0, v1 +; CHECK-SD-NEXT: ext.16b v1, v0, v0, #8 +; CHECK-SD-NEXT: zip1.16b v0, v0, v1 +; CHECK-SD-NEXT: addv.8h h0, v0 +; CHECK-SD-NEXT: str h0, 
[sp, #14] +; CHECK-SD-NEXT: ld1.b { v0 }[0], [x8] +; CHECK-SD-NEXT: orr x8, x8, #0x1 +; CHECK-SD-NEXT: ld1.b { v0 }[4], [x8] +; CHECK-SD-NEXT: ; kill: def $d0 killed $d0 killed $q0 +; CHECK-SD-NEXT: add sp, sp, #16 +; CHECK-SD-NEXT: ret ; -; GISEL-LABEL: vector_to_vector_cast: -; GISEL: ; %bb.0: -; GISEL-NEXT: sub sp, sp, #16 -; GISEL-NEXT: umov.b w8, v0[1] -; GISEL-NEXT: mov d1, v0[1] -; GISEL-NEXT: umov.b w10, v0[1] -; GISEL-NEXT: umov.b w9, v0[0] -; GISEL-NEXT: umov.b w13, v0[0] -; GISEL-NEXT: umov.b w14, v0[2] -; GISEL-NEXT: umov.b w15, v0[3] -; GISEL-NEXT: umov.b w11, v0[2] -; GISEL-NEXT: umov.b w16, v0[4] -; GISEL-NEXT: umov.b w17, v0[5] -; GISEL-NEXT: umov.b w12, v0[3] -; GISEL-NEXT: and w8, w8, #0x1 -; GISEL-NEXT: and w10, w10, #0x1 -; GISEL-NEXT: umov.b w0, v1[1] -; GISEL-NEXT: bfi w9, w8, #1, #31 -; GISEL-NEXT: bfi w13, w10, #1, #31 -; GISEL-NEXT: and w14, w14, #0x1 -; GISEL-NEXT: umov.b w8, v1[0] -; GISEL-NEXT: umov.b w10, v1[2] -; GISEL-NEXT: and w15, w15, #0x1 -; GISEL-NEXT: orr w13, w13, w14, lsl #2 -; GISEL-NEXT: umov.b w14, v1[3] -; GISEL-NEXT: and w11, w11, #0x1 -; GISEL-NEXT: and w0, w0, #0x1 -; GISEL-NEXT: and w16, w16, #0x1 -; GISEL-NEXT: orr w9, w9, w11, lsl #2 -; GISEL-NEXT: orr w13, w13, w15, lsl #3 -; GISEL-NEXT: umov.b w15, v1[4] -; GISEL-NEXT: umov.b w11, v0[6] -; GISEL-NEXT: bfi w8, w0, #1, #31 -; GISEL-NEXT: and w10, w10, #0x1 -; GISEL-NEXT: and w17, w17, #0x1 -; GISEL-NEXT: orr w13, w13, w16, lsl #4 -; GISEL-NEXT: and w14, w14, #0x1 -; GISEL-NEXT: umov.b w0, v0[7] -; GISEL-NEXT: orr w8, w8, w10, lsl #2 -; GISEL-NEXT: umov.b w10, v1[5] -; GISEL-NEXT: umov.b w16, v1[6] -; GISEL-NEXT: orr w13, w13, w17, lsl #5 -; GISEL-NEXT: umov.b w17, v0[4] -; GISEL-NEXT: and w15, w15, #0x1 -; GISEL-NEXT: orr w8, w8, w14, lsl #3 -; GISEL-NEXT: and w12, w12, #0x1 -; GISEL-NEXT: and w11, w11, #0x1 -; GISEL-NEXT: umov.b w14, v1[7] -; GISEL-NEXT: orr w9, w9, w12, lsl #3 -; GISEL-NEXT: orr w11, w13, w11, lsl #6 -; GISEL-NEXT: orr w8, w8, w15, lsl #4 -; 
GISEL-NEXT: umov.b w15, v0[5] -; GISEL-NEXT: and w10, w10, #0x1 -; GISEL-NEXT: and w0, w0, #0x1 -; GISEL-NEXT: and w12, w17, #0x1 -; GISEL-NEXT: umov.b w13, v0[1] -; GISEL-NEXT: orr w8, w8, w10, lsl #5 -; GISEL-NEXT: and w16, w16, #0x1 -; GISEL-NEXT: orr w9, w9, w12, lsl #4 -; GISEL-NEXT: umov.b w10, v0[0] -; GISEL-NEXT: orr w11, w11, w0, lsl #7 -; GISEL-NEXT: and w14, w14, #0x1 -; GISEL-NEXT: and w12, w15, #0x1 -; GISEL-NEXT: umov.b w15, v0[2] -; GISEL-NEXT: orr w8, w8, w16, lsl #6 -; GISEL-NEXT: orr w9, w9, w12, lsl #5 -; GISEL-NEXT: umov.b w12, v0[6] -; GISEL-NEXT: strb w11, [sp, #8] -; GISEL-NEXT: and w11, w13, #0x1 -; GISEL-NEXT: umov.b w13, v0[3] -; GISEL-NEXT: orr w8, w8, w14, lsl #7 -; GISEL-NEXT: umov.b w14, v0[7] -; GISEL-NEXT: ldr b0, [sp, #8] -; GISEL-NEXT: bfi w10, w11, #1, #31 -; GISEL-NEXT: and w11, w15, #0x1 -; GISEL-NEXT: strb w8, [sp, #9] -; GISEL-NEXT: umov.b w15, v0[4] -; GISEL-NEXT: and w8, w12, #0x1 -; GISEL-NEXT: orr w10, w10, w11, lsl #2 -; GISEL-NEXT: orr w8, w9, w8, lsl #6 -; GISEL-NEXT: and w9, w13, #0x1 -; GISEL-NEXT: umov.b w11, v0[1] -; GISEL-NEXT: orr w9, w10, w9, lsl #3 -; GISEL-NEXT: umov.b w10, v0[5] -; GISEL-NEXT: umov.b w12, v0[0] -; GISEL-NEXT: and w13, w14, #0x1 -; GISEL-NEXT: umov.b w16, v0[2] -; GISEL-NEXT: umov.b w17, v0[3] -; GISEL-NEXT: and w14, w15, #0x1 -; GISEL-NEXT: umov.b w15, v0[2] -; GISEL-NEXT: orr w8, w8, w13, lsl #7 -; GISEL-NEXT: orr w9, w9, w14, lsl #4 -; GISEL-NEXT: umov.b w13, v0[6] -; GISEL-NEXT: and w11, w11, #0x1 -; GISEL-NEXT: umov.b w14, v0[3] -; GISEL-NEXT: strb w8, [sp, #10] -; GISEL-NEXT: and w8, w10, #0x1 -; GISEL-NEXT: bfi w12, w11, #1, #31 -; GISEL-NEXT: orr w8, w9, w8, lsl #5 -; GISEL-NEXT: umov.b w10, v0[4] -; GISEL-NEXT: and w9, w15, #0x1 -; GISEL-NEXT: umov.b w11, v0[7] -; GISEL-NEXT: umov.b w15, v0[1] -; GISEL-NEXT: orr w9, w12, w9, lsl #2 -; GISEL-NEXT: umov.b w12, v0[5] -; GISEL-NEXT: and w13, w13, #0x1 -; GISEL-NEXT: and w14, w14, #0x1 -; GISEL-NEXT: orr w8, w8, w13, lsl #6 -; GISEL-NEXT: 
umov.b w13, v0[0] -; GISEL-NEXT: orr w9, w9, w14, lsl #3 -; GISEL-NEXT: and w10, w10, #0x1 -; GISEL-NEXT: umov.b w14, v0[6] -; GISEL-NEXT: and w11, w11, #0x1 -; GISEL-NEXT: and w15, w15, #0x1 -; GISEL-NEXT: umov.b w0, v0[3] -; GISEL-NEXT: orr w9, w9, w10, lsl #4 -; GISEL-NEXT: and w10, w12, #0x1 -; GISEL-NEXT: umov.b w12, v0[7] -; GISEL-NEXT: orr w8, w8, w11, lsl #7 -; GISEL-NEXT: bfi w13, w15, #1, #31 -; GISEL-NEXT: and w11, w16, #0x1 -; GISEL-NEXT: orr w9, w9, w10, lsl #5 -; GISEL-NEXT: and w10, w14, #0x1 -; GISEL-NEXT: umov.b w14, v0[4] -; GISEL-NEXT: strb w8, [sp, #11] -; GISEL-NEXT: umov.b w15, v0[1] -; GISEL-NEXT: umov.b w16, v0[3] -; GISEL-NEXT: orr w8, w9, w10, lsl #6 -; GISEL-NEXT: orr w9, w13, w11, lsl #2 -; GISEL-NEXT: and w10, w12, #0x1 -; GISEL-NEXT: and w11, w17, #0x1 -; GISEL-NEXT: umov.b w12, v0[5] -; GISEL-NEXT: umov.b w17, v0[0] -; GISEL-NEXT: orr w8, w8, w10, lsl #7 -; GISEL-NEXT: orr w9, w9, w11, lsl #3 -; GISEL-NEXT: umov.b w10, v0[1] -; GISEL-NEXT: and w11, w14, #0x1 -; GISEL-NEXT: umov.b w14, v0[0] -; GISEL-NEXT: and w15, w15, #0x1 -; GISEL-NEXT: orr w9, w9, w11, lsl #4 -; GISEL-NEXT: umov.b w11, v0[2] -; GISEL-NEXT: umov.b w13, v0[6] -; GISEL-NEXT: and w12, w12, #0x1 -; GISEL-NEXT: bfi w17, w15, #1, #31 -; GISEL-NEXT: umov.b w15, v0[5] -; GISEL-NEXT: orr w9, w9, w12, lsl #5 -; GISEL-NEXT: and w10, w10, #0x1 -; GISEL-NEXT: umov.b w12, v0[2] -; GISEL-NEXT: bfi w14, w10, #1, #31 -; GISEL-NEXT: umov.b w10, v0[4] -; GISEL-NEXT: ldr b1, [sp, #9] -; GISEL-NEXT: and w11, w11, #0x1 -; GISEL-NEXT: and w13, w13, #0x1 -; GISEL-NEXT: strb w8, [sp, #12] -; GISEL-NEXT: orr w11, w14, w11, lsl #2 -; GISEL-NEXT: and w14, w16, #0x1 -; GISEL-NEXT: umov.b w16, v0[4] -; GISEL-NEXT: and w12, w12, #0x1 -; GISEL-NEXT: and w15, w15, #0x1 -; GISEL-NEXT: orr w9, w9, w13, lsl #6 -; GISEL-NEXT: orr w11, w11, w14, lsl #3 -; GISEL-NEXT: orr w12, w17, w12, lsl #2 -; GISEL-NEXT: and w10, w10, #0x1 -; GISEL-NEXT: and w17, w0, #0x1 -; GISEL-NEXT: umov.b w0, v0[5] -; 
GISEL-NEXT: umov.b w14, v0[6] -; GISEL-NEXT: orr w10, w11, w10, lsl #4 -; GISEL-NEXT: orr w12, w12, w17, lsl #3 -; GISEL-NEXT: umov.b w11, v0[7] -; GISEL-NEXT: and w16, w16, #0x1 -; GISEL-NEXT: umov.b w17, v0[6] -; GISEL-NEXT: orr w10, w10, w15, lsl #5 -; GISEL-NEXT: umov.b w15, v0[7] -; GISEL-NEXT: orr w12, w12, w16, lsl #4 -; GISEL-NEXT: and w16, w0, #0x1 -; GISEL-NEXT: umov.b w0, v0[7] -; GISEL-NEXT: and w14, w14, #0x1 -; GISEL-NEXT: orr w12, w12, w16, lsl #5 -; GISEL-NEXT: orr w10, w10, w14, lsl #6 -; GISEL-NEXT: and w11, w11, #0x1 -; GISEL-NEXT: and w13, w17, #0x1 -; GISEL-NEXT: orr w9, w9, w11, lsl #7 -; GISEL-NEXT: mov.s v0[1], v1[0] -; GISEL-NEXT: orr w11, w12, w13, lsl #6 -; GISEL-NEXT: and w12, w15, #0x1 -; GISEL-NEXT: ; kill: def $d0 killed $d0 killed $q0 -; GISEL-NEXT: orr w8, w10, w12, lsl #7 -; GISEL-NEXT: and w10, w0, #0x1 -; GISEL-NEXT: strb w9, [sp, #13] -; GISEL-NEXT: orr w9, w11, w10, lsl #7 -; GISEL-NEXT: strb w8, [sp, #14] -; GISEL-NEXT: strb w9, [sp, #15] -; GISEL-NEXT: add sp, sp, #16 -; GISEL-NEXT: ret +; CHECK-GI-LABEL: vector_to_vector_cast: +; CHECK-GI: ; %bb.0: +; CHECK-GI-NEXT: sub sp, sp, #16 +; CHECK-GI-NEXT: umov.b w8, v0[1] +; CHECK-GI-NEXT: mov d1, v0[1] +; CHECK-GI-NEXT: umov.b w10, v0[1] +; CHECK-GI-NEXT: umov.b w9, v0[0] +; CHECK-GI-NEXT: umov.b w13, v0[0] +; CHECK-GI-NEXT: umov.b w14, v0[2] +; CHECK-GI-NEXT: umov.b w15, v0[3] +; CHECK-GI-NEXT: umov.b w11, v0[2] +; CHECK-GI-NEXT: umov.b w16, v0[4] +; CHECK-GI-NEXT: umov.b w17, v0[5] +; CHECK-GI-NEXT: umov.b w12, v0[3] +; CHECK-GI-NEXT: and w8, w8, #0x1 +; CHECK-GI-NEXT: and w10, w10, #0x1 +; CHECK-GI-NEXT: umov.b w0, v1[1] +; CHECK-GI-NEXT: bfi w9, w8, #1, #31 +; CHECK-GI-NEXT: bfi w13, w10, #1, #31 +; CHECK-GI-NEXT: and w14, w14, #0x1 +; CHECK-GI-NEXT: umov.b w8, v1[0] +; CHECK-GI-NEXT: umov.b w10, v1[2] +; CHECK-GI-NEXT: and w15, w15, #0x1 +; CHECK-GI-NEXT: orr w13, w13, w14, lsl #2 +; CHECK-GI-NEXT: umov.b w14, v1[3] +; CHECK-GI-NEXT: and w11, w11, #0x1 +; CHECK-GI-NEXT: and 
w0, w0, #0x1 +; CHECK-GI-NEXT: and w16, w16, #0x1 +; CHECK-GI-NEXT: orr w9, w9, w11, lsl #2 +; CHECK-GI-NEXT: orr w13, w13, w15, lsl #3 +; CHECK-GI-NEXT: umov.b w15, v1[4] +; CHECK-GI-NEXT: umov.b w11, v0[6] +; CHECK-GI-NEXT: bfi w8, w0, #1, #31 +; CHECK-GI-NEXT: and w10, w10, #0x1 +; CHECK-GI-NEXT: and w17, w17, #0x1 +; CHECK-GI-NEXT: orr w13, w13, w16, lsl #4 +; CHECK-GI-NEXT: and w14, w14, #0x1 +; CHECK-GI-NEXT: umov.b w0, v0[7] +; CHECK-GI-NEXT: orr w8, w8, w10, lsl #2 +; CHECK-GI-NEXT: umov.b w10, v1[5] +; CHECK-GI-NEXT: umov.b w16, v1[6] +; CHECK-GI-NEXT: orr w13, w13, w17, lsl #5 +; CHECK-GI-NEXT: umov.b w17, v0[4] +; CHECK-GI-NEXT: and w15, w15, #0x1 +; CHECK-GI-NEXT: orr w8, w8, w14, lsl #3 +; CHECK-GI-NEXT: and w12, w12, #0x1 +; CHECK-GI-NEXT: and w11, w11, #0x1 +; CHECK-GI-NEXT: umov.b w14, v1[7] +; CHECK-GI-NEXT: orr w9, w9, w12, lsl #3 +; CHECK-GI-NEXT: orr w11, w13, w11, lsl #6 +; CHECK-GI-NEXT: orr w8, w8, w15, lsl #4 +; CHECK-GI-NEXT: umov.b w15, v0[5] +; CHECK-GI-NEXT: and w10, w10, #0x1 +; CHECK-GI-NEXT: and w0, w0, #0x1 +; CHECK-GI-NEXT: and w12, w17, #0x1 +; CHECK-GI-NEXT: umov.b w13, v0[1] +; CHECK-GI-NEXT: orr w8, w8, w10, lsl #5 +; CHECK-GI-NEXT: and w16, w16, #0x1 +; CHECK-GI-NEXT: orr w9, w9, w12, lsl #4 +; CHECK-GI-NEXT: umov.b w10, v0[0] +; CHECK-GI-NEXT: orr w11, w11, w0, lsl #7 +; CHECK-GI-NEXT: and w14, w14, #0x1 +; CHECK-GI-NEXT: and w12, w15, #0x1 +; CHECK-GI-NEXT: umov.b w15, v0[2] +; CHECK-GI-NEXT: orr w8, w8, w16, lsl #6 +; CHECK-GI-NEXT: orr w9, w9, w12, lsl #5 +; CHECK-GI-NEXT: umov.b w12, v0[6] +; CHECK-GI-NEXT: strb w11, [sp, #8] +; CHECK-GI-NEXT: and w11, w13, #0x1 +; CHECK-GI-NEXT: umov.b w13, v0[3] +; CHECK-GI-NEXT: orr w8, w8, w14, lsl #7 +; CHECK-GI-NEXT: umov.b w14, v0[7] +; CHECK-GI-NEXT: ldr b0, [sp, #8] +; CHECK-GI-NEXT: bfi w10, w11, #1, #31 +; CHECK-GI-NEXT: and w11, w15, #0x1 +; CHECK-GI-NEXT: strb w8, [sp, #9] +; CHECK-GI-NEXT: umov.b w15, v0[4] +; CHECK-GI-NEXT: and w8, w12, #0x1 +; CHECK-GI-NEXT: orr w10, w10, 
w11, lsl #2 +; CHECK-GI-NEXT: orr w8, w9, w8, lsl #6 +; CHECK-GI-NEXT: and w9, w13, #0x1 +; CHECK-GI-NEXT: umov.b w11, v0[1] +; CHECK-GI-NEXT: orr w9, w10, w9, lsl #3 +; CHECK-GI-NEXT: umov.b w10, v0[5] +; CHECK-GI-NEXT: umov.b w12, v0[0] +; CHECK-GI-NEXT: and w13, w14, #0x1 +; CHECK-GI-NEXT: umov.b w16, v0[2] +; CHECK-GI-NEXT: umov.b w17, v0[3] +; CHECK-GI-NEXT: and w14, w15, #0x1 +; CHECK-GI-NEXT: umov.b w15, v0[2] +; CHECK-GI-NEXT: orr w8, w8, w13, lsl #7 +; CHECK-GI-NEXT: orr w9, w9, w14, lsl #4 +; CHECK-GI-NEXT: umov.b w13, v0[6] +; CHECK-GI-NEXT: and w11, w11, #0x1 +; CHECK-GI-NEXT: umov.b w14, v0[3] +; CHECK-GI-NEXT: strb w8, [sp, #10] +; CHECK-GI-NEXT: and w8, w10, #0x1 +; CHECK-GI-NEXT: bfi w12, w11, #1, #31 +; CHECK-GI-NEXT: orr w8, w9, w8, lsl #5 +; CHECK-GI-NEXT: umov.b w10, v0[4] +; CHECK-GI-NEXT: and w9, w15, #0x1 +; CHECK-GI-NEXT: umov.b w11, v0[7] +; CHECK-GI-NEXT: umov.b w15, v0[1] +; CHECK-GI-NEXT: orr w9, w12, w9, lsl #2 +; CHECK-GI-NEXT: umov.b w12, v0[5] +; CHECK-GI-NEXT: and w13, w13, #0x1 +; CHECK-GI-NEXT: and w14, w14, #0x1 +; CHECK-GI-NEXT: orr w8, w8, w13, lsl #6 +; CHECK-GI-NEXT: umov.b w13, v0[0] +; CHECK-GI-NEXT: orr w9, w9, w14, lsl #3 +; CHECK-GI-NEXT: and w10, w10, #0x1 +; CHECK-GI-NEXT: umov.b w14, v0[6] +; CHECK-GI-NEXT: and w11, w11, #0x1 +; CHECK-GI-NEXT: and w15, w15, #0x1 +; CHECK-GI-NEXT: umov.b w0, v0[3] +; CHECK-GI-NEXT: orr w9, w9, w10, lsl #4 +; CHECK-GI-NEXT: and w10, w12, #0x1 +; CHECK-GI-NEXT: umov.b w12, v0[7] +; CHECK-GI-NEXT: orr w8, w8, w11, lsl #7 +; CHECK-GI-NEXT: bfi w13, w15, #1, #31 +; CHECK-GI-NEXT: and w11, w16, #0x1 +; CHECK-GI-NEXT: orr w9, w9, w10, lsl #5 +; CHECK-GI-NEXT: and w10, w14, #0x1 +; CHECK-GI-NEXT: umov.b w14, v0[4] +; CHECK-GI-NEXT: strb w8, [sp, #11] +; CHECK-GI-NEXT: umov.b w15, v0[1] +; CHECK-GI-NEXT: umov.b w16, v0[3] +; CHECK-GI-NEXT: orr w8, w9, w10, lsl #6 +; CHECK-GI-NEXT: orr w9, w13, w11, lsl #2 +; CHECK-GI-NEXT: and w10, w12, #0x1 +; CHECK-GI-NEXT: and w11, w17, #0x1 +; 
CHECK-GI-NEXT: umov.b w12, v0[5] +; CHECK-GI-NEXT: umov.b w17, v0[0] +; CHECK-GI-NEXT: orr w8, w8, w10, lsl #7 +; CHECK-GI-NEXT: orr w9, w9, w11, lsl #3 +; CHECK-GI-NEXT: umov.b w10, v0[1] +; CHECK-GI-NEXT: and w11, w14, #0x1 +; CHECK-GI-NEXT: umov.b w14, v0[0] +; CHECK-GI-NEXT: and w15, w15, #0x1 +; CHECK-GI-NEXT: orr w9, w9, w11, lsl #4 +; CHECK-GI-NEXT: umov.b w11, v0[2] +; CHECK-GI-NEXT: umov.b w13, v0[6] +; CHECK-GI-NEXT: and w12, w12, #0x1 +; CHECK-GI-NEXT: bfi w17, w15, #1, #31 +; CHECK-GI-NEXT: umov.b w15, v0[5] +; CHECK-GI-NEXT: orr w9, w9, w12, lsl #5 +; CHECK-GI-NEXT: and w10, w10, #0x1 +; CHECK-GI-NEXT: umov.b w12, v0[2] +; CHECK-GI-NEXT: bfi w14, w10, #1, #31 +; CHECK-GI-NEXT: umov.b w10, v0[4] +; CHECK-GI-NEXT: ldr b1, [sp, #9] +; CHECK-GI-NEXT: and w11, w11, #0x1 +; CHECK-GI-NEXT: and w13, w13, #0x1 +; CHECK-GI-NEXT: strb w8, [sp, #12] +; CHECK-GI-NEXT: orr w11, w14, w11, lsl #2 +; CHECK-GI-NEXT: and w14, w16, #0x1 +; CHECK-GI-NEXT: umov.b w16, v0[4] +; CHECK-GI-NEXT: and w12, w12, #0x1 +; CHECK-GI-NEXT: and w15, w15, #0x1 +; CHECK-GI-NEXT: orr w9, w9, w13, lsl #6 +; CHECK-GI-NEXT: orr w11, w11, w14, lsl #3 +; CHECK-GI-NEXT: orr w12, w17, w12, lsl #2 +; CHECK-GI-NEXT: and w10, w10, #0x1 +; CHECK-GI-NEXT: and w17, w0, #0x1 +; CHECK-GI-NEXT: umov.b w0, v0[5] +; CHECK-GI-NEXT: umov.b w14, v0[6] +; CHECK-GI-NEXT: orr w10, w11, w10, lsl #4 +; CHECK-GI-NEXT: orr w12, w12, w17, lsl #3 +; CHECK-GI-NEXT: umov.b w11, v0[7] +; CHECK-GI-NEXT: and w16, w16, #0x1 +; CHECK-GI-NEXT: umov.b w17, v0[6] +; CHECK-GI-NEXT: orr w10, w10, w15, lsl #5 +; CHECK-GI-NEXT: umov.b w15, v0[7] +; CHECK-GI-NEXT: orr w12, w12, w16, lsl #4 +; CHECK-GI-NEXT: and w16, w0, #0x1 +; CHECK-GI-NEXT: umov.b w0, v0[7] +; CHECK-GI-NEXT: and w14, w14, #0x1 +; CHECK-GI-NEXT: orr w12, w12, w16, lsl #5 +; CHECK-GI-NEXT: orr w10, w10, w14, lsl #6 +; CHECK-GI-NEXT: and w11, w11, #0x1 +; CHECK-GI-NEXT: and w13, w17, #0x1 +; CHECK-GI-NEXT: orr w9, w9, w11, lsl #7 +; CHECK-GI-NEXT: mov.s v0[1], v1[0] 
+; CHECK-GI-NEXT: orr w11, w12, w13, lsl #6 +; CHECK-GI-NEXT: and w12, w15, #0x1 +; CHECK-GI-NEXT: ; kill: def $d0 killed $d0 killed $q0 +; CHECK-GI-NEXT: orr w8, w10, w12, lsl #7 +; CHECK-GI-NEXT: and w10, w0, #0x1 +; CHECK-GI-NEXT: strb w9, [sp, #13] +; CHECK-GI-NEXT: orr w9, w11, w10, lsl #7 +; CHECK-GI-NEXT: strb w8, [sp, #14] +; CHECK-GI-NEXT: strb w9, [sp, #15] +; CHECK-GI-NEXT: add sp, sp, #16 +; CHECK-GI-NEXT: ret %bc = bitcast <16 x i1> %arg to <2 x i8> ret <2 x i8> %bc } diff --git a/llvm/test/CodeGen/AArch64/xbfiz.ll b/llvm/test/CodeGen/AArch64/xbfiz.ll index 05567e3425840..b777ddcb7efcc 100644 --- a/llvm/test/CodeGen/AArch64/xbfiz.ll +++ b/llvm/test/CodeGen/AArch64/xbfiz.ll @@ -69,19 +69,3 @@ define i64 @lsl32_not_ubfiz64(i64 %v) { %and = and i64 %shl, 4294967295 ret i64 %and } - -define i64 @lsl_zext_i8_i64(i8 %b) { -; CHECK-LABEL: lsl_zext_i8_i64: -; CHECK: ubfiz x0, x0, #1, #8 - %1 = zext i8 %b to i64 - %2 = shl i64 %1, 1 - ret i64 %2 -} - -define i64 @lsl_zext_i16_i64(i16 %b) { -; CHECK-LABEL: lsl_zext_i16_i64: -; CHECK: ubfiz x0, x0, #1, #16 - %1 = zext i16 %b to i64 - %2 = shl i64 %1, 1 - ret i64 %2 -} diff --git a/llvm/test/CodeGen/AMDGPU/bf16.ll b/llvm/test/CodeGen/AMDGPU/bf16.ll index bc359d6ff3aaa..8e3c905b0eae5 100644 --- a/llvm/test/CodeGen/AMDGPU/bf16.ll +++ b/llvm/test/CodeGen/AMDGPU/bf16.ll @@ -34508,14 +34508,25 @@ define bfloat @v_select_bf16(i1 %cond, bfloat %a, bfloat %b) { ; GFX10-NEXT: v_cndmask_b32_e32 v0, v2, v1, vcc_lo ; GFX10-NEXT: s_setpc_b64 s[30:31] ; -; GFX11-LABEL: v_select_bf16: -; GFX11: ; %bb.0: -; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: v_and_b32_e32 v0, 1, v0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v0 -; GFX11-NEXT: v_cndmask_b32_e32 v0, v2, v1, vcc_lo -; GFX11-NEXT: s_setpc_b64 s[30:31] +; GFX11TRUE16-LABEL: v_select_bf16: +; GFX11TRUE16: ; %bb.0: +; GFX11TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11TRUE16-NEXT: v_and_b32_e32 
v3, 1, v0 +; GFX11TRUE16-NEXT: v_mov_b16_e32 v0.l, v2.l +; GFX11TRUE16-NEXT: v_mov_b16_e32 v0.h, v1.l +; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v3 +; GFX11TRUE16-NEXT: v_cndmask_b16 v0.l, v0.l, v0.h, vcc_lo +; GFX11TRUE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX11FAKE16-LABEL: v_select_bf16: +; GFX11FAKE16: ; %bb.0: +; GFX11FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11FAKE16-NEXT: v_and_b32_e32 v0, 1, v0 +; GFX11FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v0 +; GFX11FAKE16-NEXT: v_cndmask_b32_e32 v0, v2, v1, vcc_lo +; GFX11FAKE16-NEXT: s_setpc_b64 s[30:31] %op = select i1 %cond, bfloat %a, bfloat %b ret bfloat %op } @@ -34573,11 +34584,14 @@ define bfloat @v_select_fneg_lhs_bf16(i1 %cond, bfloat %a, bfloat %b) { ; GFX11TRUE16-LABEL: v_select_fneg_lhs_bf16: ; GFX11TRUE16: ; %bb.0: ; GFX11TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11TRUE16-NEXT: v_and_b32_e32 v0, 1, v0 -; GFX11TRUE16-NEXT: v_xor_b16 v1.l, 0x8000, v1.l -; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v0 -; GFX11TRUE16-NEXT: v_cndmask_b32_e32 v0, v2, v1, vcc_lo +; GFX11TRUE16-NEXT: v_and_b32_e32 v3, 1, v0 +; GFX11TRUE16-NEXT: v_mov_b16_e32 v0.l, v1.l +; GFX11TRUE16-NEXT: v_mov_b16_e32 v0.h, v2.l +; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX11TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v3 +; GFX11TRUE16-NEXT: v_xor_b16 v0.l, 0x8000, v0.l +; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11TRUE16-NEXT: v_cndmask_b16 v0.l, v0.h, v0.l, vcc_lo ; GFX11TRUE16-NEXT: s_setpc_b64 s[30:31] ; ; GFX11FAKE16-LABEL: v_select_fneg_lhs_bf16: @@ -34647,11 +34661,14 @@ define bfloat @v_select_fneg_rhs_bf16(i1 %cond, bfloat %a, bfloat %b) { ; GFX11TRUE16-LABEL: v_select_fneg_rhs_bf16: ; GFX11TRUE16: 
; %bb.0: ; GFX11TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11TRUE16-NEXT: v_and_b32_e32 v0, 1, v0 -; GFX11TRUE16-NEXT: v_xor_b16 v2.l, 0x8000, v2.l -; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v0 -; GFX11TRUE16-NEXT: v_cndmask_b32_e32 v0, v2, v1, vcc_lo +; GFX11TRUE16-NEXT: v_and_b32_e32 v3, 1, v0 +; GFX11TRUE16-NEXT: v_mov_b16_e32 v0.l, v2.l +; GFX11TRUE16-NEXT: v_mov_b16_e32 v0.h, v1.l +; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX11TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v3 +; GFX11TRUE16-NEXT: v_xor_b16 v0.l, 0x8000, v0.l +; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11TRUE16-NEXT: v_cndmask_b16 v0.l, v0.l, v0.h, vcc_lo ; GFX11TRUE16-NEXT: s_setpc_b64 s[30:31] ; ; GFX11FAKE16-LABEL: v_select_fneg_rhs_bf16: @@ -34749,11 +34766,15 @@ define <2 x bfloat> @v_select_v2bf16(i1 %cond, <2 x bfloat> %a, <2 x bfloat> %b) ; GFX11TRUE16: ; %bb.0: ; GFX11TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11TRUE16-NEXT: v_and_b32_e32 v0, 1, v0 -; GFX11TRUE16-NEXT: v_lshrrev_b32_e32 v3, 16, v2 -; GFX11TRUE16-NEXT: v_lshrrev_b32_e32 v4, 16, v1 +; GFX11TRUE16-NEXT: v_lshrrev_b32_e32 v3, 16, v1 +; GFX11TRUE16-NEXT: v_lshrrev_b32_e32 v4, 16, v2 ; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX11TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v0 -; GFX11TRUE16-NEXT: v_dual_cndmask_b32 v0, v2, v1 :: v_dual_cndmask_b32 v1, v3, v4 +; GFX11TRUE16-NEXT: v_cndmask_b16 v0.l, v4.l, v3.l, vcc_lo +; GFX11TRUE16-NEXT: v_cndmask_b16 v0.h, v2.l, v1.l, vcc_lo +; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11TRUE16-NEXT: v_mov_b16_e32 v1.l, v0.l +; GFX11TRUE16-NEXT: v_mov_b16_e32 v0.l, v0.h ; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11TRUE16-NEXT: v_perm_b32 v0, v1, v0, 0x5040100 ; GFX11TRUE16-NEXT: s_setpc_b64 
s[30:31] @@ -34856,14 +34877,19 @@ define <2 x bfloat> @v_vselect_v2bf16(<2 x i1> %cond, <2 x bfloat> %a, <2 x bflo ; GFX11TRUE16-LABEL: v_vselect_v2bf16: ; GFX11TRUE16: ; %bb.0: ; GFX11TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11TRUE16-NEXT: v_lshrrev_b32_e32 v4, 16, v3 -; GFX11TRUE16-NEXT: v_lshrrev_b32_e32 v5, 16, v2 +; GFX11TRUE16-NEXT: v_and_b32_e32 v1, 1, v1 ; GFX11TRUE16-NEXT: v_and_b32_e32 v0, 1, v0 -; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) -; GFX11TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v0 -; GFX11TRUE16-NEXT: v_dual_cndmask_b32 v0, v3, v2 :: v_dual_and_b32 v1, 1, v1 +; GFX11TRUE16-NEXT: v_lshrrev_b32_e32 v4, 16, v2 +; GFX11TRUE16-NEXT: v_lshrrev_b32_e32 v5, 16, v3 +; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) ; GFX11TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v1 -; GFX11TRUE16-NEXT: v_cndmask_b32_e32 v1, v4, v5, vcc_lo +; GFX11TRUE16-NEXT: v_cmp_eq_u32_e64 s0, 1, v0 +; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11TRUE16-NEXT: v_cndmask_b16 v0.l, v5.l, v4.l, vcc_lo +; GFX11TRUE16-NEXT: v_cndmask_b16 v0.h, v3.l, v2.l, s0 +; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11TRUE16-NEXT: v_mov_b16_e32 v1.l, v0.l +; GFX11TRUE16-NEXT: v_mov_b16_e32 v0.l, v0.h ; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11TRUE16-NEXT: v_perm_b32 v0, v1, v0, 0x5040100 ; GFX11TRUE16-NEXT: s_setpc_b64 s[30:31] @@ -34936,16 +34962,27 @@ define amdgpu_ps i32 @s_select_bf16(bfloat inreg %a, bfloat inreg %b, i32 %c) { ; GFX10-NEXT: v_readfirstlane_b32 s0, v0 ; GFX10-NEXT: ; return to shader part epilog ; -; GFX11-LABEL: s_select_bf16: -; GFX11: ; %bb.0: -; GFX11-NEXT: v_mov_b32_e32 v1, s0 -; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_cndmask_b32_e32 v0, s1, v1, 
vcc_lo -; GFX11-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_readfirstlane_b32 s0, v0 -; GFX11-NEXT: ; return to shader part epilog +; GFX11TRUE16-LABEL: s_select_bf16: +; GFX11TRUE16: ; %bb.0: +; GFX11TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 +; GFX11TRUE16-NEXT: v_mov_b16_e32 v0.l, s0 +; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11TRUE16-NEXT: v_cndmask_b16 v0.l, s1, v0.l, vcc_lo +; GFX11TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11TRUE16-NEXT: v_readfirstlane_b32 s0, v0 +; GFX11TRUE16-NEXT: ; return to shader part epilog +; +; GFX11FAKE16-LABEL: s_select_bf16: +; GFX11FAKE16: ; %bb.0: +; GFX11FAKE16-NEXT: v_mov_b32_e32 v1, s0 +; GFX11FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 +; GFX11FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11FAKE16-NEXT: v_cndmask_b32_e32 v0, s1, v1, vcc_lo +; GFX11FAKE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX11FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11FAKE16-NEXT: v_readfirstlane_b32 s0, v0 +; GFX11FAKE16-NEXT: ; return to shader part epilog %cond = icmp eq i32 %c, 0 %op = select i1 %cond, bfloat %a, bfloat %b %cast = bitcast bfloat %op to i16 @@ -35038,17 +35075,21 @@ define amdgpu_ps i32 @s_select_v2bf16(<2 x bfloat> inreg %a, <2 x bfloat> inreg ; ; GFX11TRUE16-LABEL: s_select_v2bf16: ; GFX11TRUE16: ; %bb.0: -; GFX11TRUE16-NEXT: s_lshr_b32 s2, s1, 16 -; GFX11TRUE16-NEXT: s_lshr_b32 s3, s0, 16 -; GFX11TRUE16-NEXT: v_mov_b16_e32 v1.l, s2 -; GFX11TRUE16-NEXT: v_mov_b16_e32 v2.l, s3 -; GFX11TRUE16-NEXT: v_mov_b16_e32 v3.l, s1 -; GFX11TRUE16-NEXT: v_mov_b16_e32 v4.l, s0 +; GFX11TRUE16-NEXT: s_lshr_b32 s2, s0, 16 +; GFX11TRUE16-NEXT: s_lshr_b32 s3, s1, 16 ; GFX11TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 -; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11TRUE16-NEXT: 
v_dual_cndmask_b32 v0, v1, v2 :: v_dual_cndmask_b32 v1, v3, v4 -; GFX11TRUE16-NEXT: v_perm_b32 v0, v0, v1, 0x5040100 -; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11TRUE16-NEXT: v_mov_b16_e32 v0.l, s3 +; GFX11TRUE16-NEXT: v_mov_b16_e32 v0.h, s2 +; GFX11TRUE16-NEXT: v_mov_b16_e32 v1.l, s1 +; GFX11TRUE16-NEXT: v_mov_b16_e32 v1.h, s0 +; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11TRUE16-NEXT: v_cndmask_b16 v0.l, v0.l, v0.h, vcc_lo +; GFX11TRUE16-NEXT: v_cndmask_b16 v0.h, v1.l, v1.h, vcc_lo +; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11TRUE16-NEXT: v_mov_b16_e32 v1.l, v0.l +; GFX11TRUE16-NEXT: v_mov_b16_e32 v0.l, v0.h +; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11TRUE16-NEXT: v_perm_b32 v0, v1, v0, 0x5040100 ; GFX11TRUE16-NEXT: v_readfirstlane_b32 s0, v0 ; GFX11TRUE16-NEXT: ; return to shader part epilog ; @@ -35156,17 +35197,20 @@ define amdgpu_ps i32 @s_vselect_v2bf16(<2 x bfloat> inreg %a, <2 x bfloat> inreg ; ; GFX11TRUE16-LABEL: s_vselect_v2bf16: ; GFX11TRUE16: ; %bb.0: -; GFX11TRUE16-NEXT: s_lshr_b32 s2, s1, 16 -; GFX11TRUE16-NEXT: s_lshr_b32 s3, s0, 16 -; GFX11TRUE16-NEXT: v_mov_b16_e32 v2.l, s2 -; GFX11TRUE16-NEXT: v_mov_b16_e32 v3.l, s3 -; GFX11TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v1 -; GFX11TRUE16-NEXT: v_mov_b16_e32 v4.l, s1 -; GFX11TRUE16-NEXT: v_mov_b16_e32 v5.l, s0 -; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_3) -; GFX11TRUE16-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc_lo +; GFX11TRUE16-NEXT: s_lshr_b32 s3, s1, 16 +; GFX11TRUE16-NEXT: s_lshr_b32 s4, s0, 16 ; GFX11TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 -; GFX11TRUE16-NEXT: v_cndmask_b32_e32 v0, v4, v5, vcc_lo +; GFX11TRUE16-NEXT: v_cmp_eq_u32_e64 s2, 0, v1 +; GFX11TRUE16-NEXT: v_mov_b16_e32 v0.l, s3 +; GFX11TRUE16-NEXT: v_mov_b16_e32 v0.h, s4 +; GFX11TRUE16-NEXT: v_mov_b16_e32 v1.l, 
s1 +; GFX11TRUE16-NEXT: v_mov_b16_e32 v1.h, s0 +; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11TRUE16-NEXT: v_cndmask_b16 v0.l, v0.l, v0.h, s2 +; GFX11TRUE16-NEXT: v_cndmask_b16 v0.h, v1.l, v1.h, vcc_lo +; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11TRUE16-NEXT: v_mov_b16_e32 v1.l, v0.l +; GFX11TRUE16-NEXT: v_mov_b16_e32 v0.l, v0.h ; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11TRUE16-NEXT: v_perm_b32 v0, v1, v0, 0x5040100 ; GFX11TRUE16-NEXT: v_readfirstlane_b32 s0, v0 @@ -36876,33 +36920,38 @@ define amdgpu_ps <2 x i32> @s_vselect_v4bf16(<4 x bfloat> inreg %a, <4 x bfloat> ; ; GFX11TRUE16-LABEL: s_vselect_v4bf16: ; GFX11TRUE16: ; %bb.0: -; GFX11TRUE16-NEXT: s_lshr_b32 s4, s3, 16 -; GFX11TRUE16-NEXT: s_lshr_b32 s5, s1, 16 -; GFX11TRUE16-NEXT: v_mov_b16_e32 v4.l, s4 -; GFX11TRUE16-NEXT: v_mov_b16_e32 v5.l, s5 -; GFX11TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v3 -; GFX11TRUE16-NEXT: s_lshr_b32 s4, s0, 16 -; GFX11TRUE16-NEXT: v_mov_b16_e32 v7.l, s2 -; GFX11TRUE16-NEXT: v_mov_b16_e32 v8.l, s0 -; GFX11TRUE16-NEXT: v_mov_b16_e32 v6.l, s4 -; GFX11TRUE16-NEXT: v_cndmask_b32_e32 v3, v4, v5, vcc_lo -; GFX11TRUE16-NEXT: v_mov_b16_e32 v4.l, s3 -; GFX11TRUE16-NEXT: s_lshr_b32 s3, s2, 16 -; GFX11TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v1 -; GFX11TRUE16-NEXT: v_mov_b16_e32 v5.l, s3 -; GFX11TRUE16-NEXT: v_mov_b16_e32 v9.l, s1 -; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_4) | instid1(VALU_DEP_3) -; GFX11TRUE16-NEXT: v_cndmask_b32_e32 v1, v5, v6, vcc_lo +; GFX11TRUE16-NEXT: s_lshr_b32 s7, s3, 16 ; GFX11TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 -; GFX11TRUE16-NEXT: v_cndmask_b32_e32 v0, v7, v8, vcc_lo -; GFX11TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v2 -; GFX11TRUE16-NEXT: v_cndmask_b32_e32 v2, v4, v9, vcc_lo -; GFX11TRUE16-NEXT: v_perm_b32 v0, v1, v0, 0x5040100 +; GFX11TRUE16-NEXT: v_cmp_eq_u32_e64 s4, 
0, v1 +; GFX11TRUE16-NEXT: s_lshr_b32 s8, s1, 16 +; GFX11TRUE16-NEXT: v_mov_b16_e32 v0.l, s7 +; GFX11TRUE16-NEXT: v_mov_b16_e32 v1.l, s3 +; GFX11TRUE16-NEXT: s_lshr_b32 s3, s2, 16 +; GFX11TRUE16-NEXT: s_lshr_b32 s7, s0, 16 +; GFX11TRUE16-NEXT: v_cmp_eq_u32_e64 s5, 0, v2 +; GFX11TRUE16-NEXT: v_cmp_eq_u32_e64 s6, 0, v3 +; GFX11TRUE16-NEXT: v_mov_b16_e32 v0.h, s8 +; GFX11TRUE16-NEXT: v_mov_b16_e32 v1.h, s3 +; GFX11TRUE16-NEXT: v_mov_b16_e32 v2.l, s7 +; GFX11TRUE16-NEXT: v_mov_b16_e32 v2.h, s2 +; GFX11TRUE16-NEXT: v_mov_b16_e32 v3.l, s0 +; GFX11TRUE16-NEXT: v_mov_b16_e32 v3.h, s1 +; GFX11TRUE16-NEXT: v_cndmask_b16 v0.l, v0.l, v0.h, s6 +; GFX11TRUE16-NEXT: v_cndmask_b16 v0.h, v1.h, v2.l, s4 +; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX11TRUE16-NEXT: v_cndmask_b16 v1.h, v2.h, v3.l, vcc_lo +; GFX11TRUE16-NEXT: v_cndmask_b16 v1.l, v1.l, v3.h, s5 +; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX11TRUE16-NEXT: v_mov_b16_e32 v2.l, v0.l +; GFX11TRUE16-NEXT: v_mov_b16_e32 v0.l, v0.h +; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX11TRUE16-NEXT: v_mov_b16_e32 v3.l, v1.h +; GFX11TRUE16-NEXT: v_perm_b32 v1, v2, v1, 0x5040100 ; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11TRUE16-NEXT: v_perm_b32 v1, v3, v2, 0x5040100 -; GFX11TRUE16-NEXT: v_readfirstlane_b32 s0, v0 -; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX11TRUE16-NEXT: v_perm_b32 v0, v0, v3, 0x5040100 ; GFX11TRUE16-NEXT: v_readfirstlane_b32 s1, v1 +; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX11TRUE16-NEXT: v_readfirstlane_b32 s0, v0 ; GFX11TRUE16-NEXT: ; return to shader part epilog ; ; GFX11FAKE16-LABEL: s_vselect_v4bf16: @@ -37078,29 +37127,33 @@ define <4 x bfloat> @v_vselect_v4bf16(<4 x i1> %cond, <4 x bfloat> %a, <4 x bflo ; GFX11TRUE16-LABEL: v_vselect_v4bf16: ; GFX11TRUE16: ; %bb.0: ; 
GFX11TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11TRUE16-NEXT: v_and_b32_e32 v2, 1, v2 -; GFX11TRUE16-NEXT: v_mov_b16_e32 v8.l, v7.l -; GFX11TRUE16-NEXT: v_mov_b16_e32 v9.l, v5.l -; GFX11TRUE16-NEXT: v_lshrrev_b32_e32 v7, 16, v7 -; GFX11TRUE16-NEXT: v_lshrrev_b32_e32 v5, 16, v5 ; GFX11TRUE16-NEXT: v_and_b32_e32 v0, 1, v0 -; GFX11TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v2 -; GFX11TRUE16-NEXT: v_and_b32_e32 v3, 1, v3 -; GFX11TRUE16-NEXT: v_cndmask_b32_e32 v2, v8, v9, vcc_lo -; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_4) | instid1(VALU_DEP_4) -; GFX11TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v0 ; GFX11TRUE16-NEXT: v_and_b32_e32 v1, 1, v1 -; GFX11TRUE16-NEXT: v_lshrrev_b32_e32 v8, 16, v6 -; GFX11TRUE16-NEXT: v_lshrrev_b32_e32 v9, 16, v4 -; GFX11TRUE16-NEXT: v_cndmask_b32_e32 v0, v6, v4, vcc_lo +; GFX11TRUE16-NEXT: v_and_b32_e32 v3, 1, v3 +; GFX11TRUE16-NEXT: v_lshrrev_b32_e32 v8, 16, v7 +; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) +; GFX11TRUE16-NEXT: v_cmp_eq_u32_e64 s0, 1, v0 +; GFX11TRUE16-NEXT: v_and_b32_e32 v0, 1, v2 ; GFX11TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v1 -; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_2) -; GFX11TRUE16-NEXT: v_cndmask_b32_e32 v1, v8, v9, vcc_lo -; GFX11TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v3 -; GFX11TRUE16-NEXT: v_perm_b32 v0, v1, v0, 0x5040100 -; GFX11TRUE16-NEXT: v_cndmask_b32_e32 v3, v7, v5, vcc_lo -; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11TRUE16-NEXT: v_perm_b32 v1, v3, v2, 0x5040100 +; GFX11TRUE16-NEXT: v_cmp_eq_u32_e64 s1, 1, v3 +; GFX11TRUE16-NEXT: v_lshrrev_b32_e32 v1, 16, v4 +; GFX11TRUE16-NEXT: v_lshrrev_b32_e32 v2, 16, v6 +; GFX11TRUE16-NEXT: v_lshrrev_b32_e32 v3, 16, v5 +; GFX11TRUE16-NEXT: v_cmp_eq_u32_e64 s2, 1, v0 +; GFX11TRUE16-NEXT: v_cndmask_b16 v0.h, v6.l, v4.l, s0 +; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX11TRUE16-NEXT: v_cndmask_b16 v0.l, v2.l, 
v1.l, vcc_lo +; GFX11TRUE16-NEXT: v_cndmask_b16 v1.l, v8.l, v3.l, s1 +; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX11TRUE16-NEXT: v_cndmask_b16 v1.h, v7.l, v5.l, s2 +; GFX11TRUE16-NEXT: v_mov_b16_e32 v2.l, v0.l +; GFX11TRUE16-NEXT: v_mov_b16_e32 v0.l, v0.h +; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX11TRUE16-NEXT: v_mov_b16_e32 v3.l, v1.l +; GFX11TRUE16-NEXT: v_mov_b16_e32 v1.l, v1.h +; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11TRUE16-NEXT: v_perm_b32 v0, v2, v0, 0x5040100 +; GFX11TRUE16-NEXT: v_perm_b32 v1, v3, v1, 0x5040100 ; GFX11TRUE16-NEXT: s_setpc_b64 s[30:31] ; ; GFX11FAKE16-LABEL: v_vselect_v4bf16: @@ -37368,51 +37421,51 @@ define <8 x bfloat> @v_vselect_v8bf16(<8 x i1> %cond, <8 x bfloat> %a, <8 x bflo ; GFX11TRUE16-LABEL: v_vselect_v8bf16: ; GFX11TRUE16: ; %bb.0: ; GFX11TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11TRUE16-NEXT: v_mov_b16_e32 v16.l, v15.l -; GFX11TRUE16-NEXT: v_mov_b16_e32 v17.l, v11.l -; GFX11TRUE16-NEXT: v_lshrrev_b32_e32 v15, 16, v15 -; GFX11TRUE16-NEXT: v_lshrrev_b32_e32 v11, 16, v11 -; GFX11TRUE16-NEXT: v_and_b32_e32 v6, 1, v6 -; GFX11TRUE16-NEXT: v_and_b32_e32 v4, 1, v4 -; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_4) -; GFX11TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v6 -; GFX11TRUE16-NEXT: v_and_b32_e32 v3, 1, v3 -; GFX11TRUE16-NEXT: v_cndmask_b32_e32 v6, v16, v17, vcc_lo -; GFX11TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v4 ; GFX11TRUE16-NEXT: v_and_b32_e32 v1, 1, v1 -; GFX11TRUE16-NEXT: v_lshrrev_b32_e32 v16, 16, v14 -; GFX11TRUE16-NEXT: v_lshrrev_b32_e32 v17, 16, v10 -; GFX11TRUE16-NEXT: v_cndmask_b32_e32 v4, v14, v10, vcc_lo -; GFX11TRUE16-NEXT: v_mov_b16_e32 v10.l, v13.l -; GFX11TRUE16-NEXT: v_mov_b16_e32 v14.l, v9.l -; GFX11TRUE16-NEXT: v_and_b32_e32 v5, 1, v5 -; GFX11TRUE16-NEXT: v_lshrrev_b32_e32 v13, 16, v13 
-; GFX11TRUE16-NEXT: v_lshrrev_b32_e32 v9, 16, v9 -; GFX11TRUE16-NEXT: v_and_b32_e32 v2, 1, v2 -; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_2) | instid1(VALU_DEP_4) -; GFX11TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v5 ; GFX11TRUE16-NEXT: v_and_b32_e32 v0, 1, v0 -; GFX11TRUE16-NEXT: v_cndmask_b32_e32 v5, v16, v17, vcc_lo -; GFX11TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v2 -; GFX11TRUE16-NEXT: v_cndmask_b32_e32 v2, v10, v14, vcc_lo -; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) -; GFX11TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v0 -; GFX11TRUE16-NEXT: v_and_b32_e32 v7, 1, v7 -; GFX11TRUE16-NEXT: v_lshrrev_b32_e32 v10, 16, v12 -; GFX11TRUE16-NEXT: v_lshrrev_b32_e32 v14, 16, v8 -; GFX11TRUE16-NEXT: v_cndmask_b32_e32 v0, v12, v8, vcc_lo +; GFX11TRUE16-NEXT: v_and_b32_e32 v3, 1, v3 +; GFX11TRUE16-NEXT: v_and_b32_e32 v2, 1, v2 +; GFX11TRUE16-NEXT: v_and_b32_e32 v4, 1, v4 ; GFX11TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v1 -; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_2) -; GFX11TRUE16-NEXT: v_cndmask_b32_e32 v1, v10, v14, vcc_lo -; GFX11TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v3 -; GFX11TRUE16-NEXT: v_perm_b32 v0, v1, v0, 0x5040100 -; GFX11TRUE16-NEXT: v_cndmask_b32_e32 v3, v13, v9, vcc_lo -; GFX11TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v7 -; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_2) -; GFX11TRUE16-NEXT: v_perm_b32 v1, v3, v2, 0x5040100 -; GFX11TRUE16-NEXT: v_cndmask_b32_e32 v7, v15, v11, vcc_lo -; GFX11TRUE16-NEXT: v_perm_b32 v2, v5, v4, 0x5040100 -; GFX11TRUE16-NEXT: v_perm_b32 v3, v7, v6, 0x5040100 +; GFX11TRUE16-NEXT: v_cmp_eq_u32_e64 s0, 1, v0 +; GFX11TRUE16-NEXT: v_and_b32_e32 v0, 1, v7 +; GFX11TRUE16-NEXT: v_and_b32_e32 v1, 1, v6 +; GFX11TRUE16-NEXT: v_cmp_eq_u32_e64 s1, 1, v3 +; GFX11TRUE16-NEXT: v_and_b32_e32 v3, 1, v5 +; GFX11TRUE16-NEXT: v_lshrrev_b32_e32 v5, 16, v15 +; GFX11TRUE16-NEXT: v_cmp_eq_u32_e64 s2, 1, v0 +; GFX11TRUE16-NEXT: 
v_cmp_eq_u32_e64 s3, 1, v1 +; GFX11TRUE16-NEXT: v_lshrrev_b32_e32 v1, 16, v11 +; GFX11TRUE16-NEXT: v_cmp_eq_u32_e64 s4, 1, v4 +; GFX11TRUE16-NEXT: v_cmp_eq_u32_e64 s5, 1, v2 +; GFX11TRUE16-NEXT: v_cmp_eq_u32_e64 s6, 1, v3 +; GFX11TRUE16-NEXT: v_lshrrev_b32_e32 v2, 16, v8 +; GFX11TRUE16-NEXT: v_cndmask_b16 v0.h, v5.l, v1.l, s2 +; GFX11TRUE16-NEXT: v_lshrrev_b32_e32 v3, 16, v12 +; GFX11TRUE16-NEXT: v_lshrrev_b32_e32 v4, 16, v9 +; GFX11TRUE16-NEXT: v_lshrrev_b32_e32 v5, 16, v13 +; GFX11TRUE16-NEXT: v_lshrrev_b32_e32 v6, 16, v10 +; GFX11TRUE16-NEXT: v_lshrrev_b32_e32 v7, 16, v14 +; GFX11TRUE16-NEXT: v_cndmask_b16 v0.l, v15.l, v11.l, s3 +; GFX11TRUE16-NEXT: v_cndmask_b16 v1.l, v14.l, v10.l, s4 +; GFX11TRUE16-NEXT: v_cndmask_b16 v1.h, v3.l, v2.l, vcc_lo +; GFX11TRUE16-NEXT: v_cndmask_b16 v2.l, v12.l, v8.l, s0 +; GFX11TRUE16-NEXT: v_cndmask_b16 v2.h, v5.l, v4.l, s1 +; GFX11TRUE16-NEXT: v_cndmask_b16 v3.l, v13.l, v9.l, s5 +; GFX11TRUE16-NEXT: v_cndmask_b16 v3.h, v7.l, v6.l, s6 +; GFX11TRUE16-NEXT: v_mov_b16_e32 v4.l, v1.h +; GFX11TRUE16-NEXT: v_mov_b16_e32 v5.l, v2.l +; GFX11TRUE16-NEXT: v_mov_b16_e32 v2.l, v2.h +; GFX11TRUE16-NEXT: v_mov_b16_e32 v6.l, v3.l +; GFX11TRUE16-NEXT: v_mov_b16_e32 v3.l, v3.h +; GFX11TRUE16-NEXT: v_mov_b16_e32 v7.l, v1.l +; GFX11TRUE16-NEXT: v_mov_b16_e32 v8.l, v0.h +; GFX11TRUE16-NEXT: v_mov_b16_e32 v9.l, v0.l +; GFX11TRUE16-NEXT: v_perm_b32 v0, v4, v5, 0x5040100 +; GFX11TRUE16-NEXT: v_perm_b32 v1, v2, v6, 0x5040100 +; GFX11TRUE16-NEXT: v_perm_b32 v2, v3, v7, 0x5040100 +; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) +; GFX11TRUE16-NEXT: v_perm_b32 v3, v8, v9, 0x5040100 ; GFX11TRUE16-NEXT: s_setpc_b64 s[30:31] ; ; GFX11FAKE16-LABEL: v_vselect_v8bf16: @@ -38024,101 +38077,96 @@ define <16 x bfloat> @v_vselect_v16bf16(<16 x i1> %cond, <16 x bfloat> %a, <16 x ; GFX11TRUE16: ; %bb.0: ; GFX11TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11TRUE16-NEXT: scratch_load_b32 v31, off, s32 -; GFX11TRUE16-NEXT: v_mov_b16_e32 v49.l, v26.l -; 
GFX11TRUE16-NEXT: v_mov_b16_e32 v50.l, v18.l -; GFX11TRUE16-NEXT: v_lshrrev_b32_e32 v18, 16, v18 -; GFX11TRUE16-NEXT: v_lshrrev_b32_e32 v26, 16, v26 -; GFX11TRUE16-NEXT: v_and_b32_e32 v12, 1, v12 -; GFX11TRUE16-NEXT: v_mov_b16_e32 v33.l, v30.l -; GFX11TRUE16-NEXT: v_mov_b16_e32 v34.l, v22.l -; GFX11TRUE16-NEXT: v_mov_b16_e32 v53.l, v24.l -; GFX11TRUE16-NEXT: v_mov_b16_e32 v54.l, v16.l -; GFX11TRUE16-NEXT: v_lshrrev_b32_e32 v16, 16, v16 -; GFX11TRUE16-NEXT: v_lshrrev_b32_e32 v24, 16, v24 -; GFX11TRUE16-NEXT: v_and_b32_e32 v10, 1, v10 -; GFX11TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v12 -; GFX11TRUE16-NEXT: v_and_b32_e32 v1, 1, v1 -; GFX11TRUE16-NEXT: v_mov_b16_e32 v35.l, v29.l -; GFX11TRUE16-NEXT: v_mov_b16_e32 v36.l, v21.l -; GFX11TRUE16-NEXT: v_mov_b16_e32 v51.l, v25.l -; GFX11TRUE16-NEXT: v_cndmask_b32_e32 v12, v33, v34, vcc_lo -; GFX11TRUE16-NEXT: v_mov_b16_e32 v52.l, v17.l -; GFX11TRUE16-NEXT: v_lshrrev_b32_e32 v17, 16, v17 -; GFX11TRUE16-NEXT: v_lshrrev_b32_e32 v25, 16, v25 +; GFX11TRUE16-NEXT: v_and_b32_e32 v9, 1, v9 ; GFX11TRUE16-NEXT: v_and_b32_e32 v8, 1, v8 -; GFX11TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v10 -; GFX11TRUE16-NEXT: v_and_b32_e32 v3, 1, v3 -; GFX11TRUE16-NEXT: v_mov_b16_e32 v37.l, v28.l -; GFX11TRUE16-NEXT: v_mov_b16_e32 v38.l, v20.l -; GFX11TRUE16-NEXT: v_lshrrev_b32_e32 v20, 16, v20 -; GFX11TRUE16-NEXT: v_cndmask_b32_e32 v10, v35, v36, vcc_lo -; GFX11TRUE16-NEXT: v_lshrrev_b32_e32 v28, 16, v28 +; GFX11TRUE16-NEXT: v_and_b32_e32 v1, 1, v1 +; GFX11TRUE16-NEXT: v_and_b32_e32 v0, 1, v0 +; GFX11TRUE16-NEXT: v_and_b32_e32 v2, 1, v2 +; GFX11TRUE16-NEXT: v_and_b32_e32 v7, 1, v7 ; GFX11TRUE16-NEXT: v_and_b32_e32 v6, 1, v6 -; GFX11TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v8 +; GFX11TRUE16-NEXT: v_lshrrev_b32_e32 v37, 16, v20 +; GFX11TRUE16-NEXT: v_lshrrev_b32_e32 v38, 16, v28 +; GFX11TRUE16-NEXT: v_cmp_eq_u32_e64 s7, 1, v9 +; GFX11TRUE16-NEXT: v_cmp_eq_u32_e64 s8, 1, v8 +; GFX11TRUE16-NEXT: v_and_b32_e32 v3, 1, v3 ; GFX11TRUE16-NEXT: v_and_b32_e32 
v5, 1, v5 -; GFX11TRUE16-NEXT: v_mov_b16_e32 v39.l, v27.l -; GFX11TRUE16-NEXT: v_mov_b16_e32 v48.l, v19.l -; GFX11TRUE16-NEXT: v_lshrrev_b32_e32 v19, 16, v19 -; GFX11TRUE16-NEXT: v_cndmask_b32_e32 v8, v37, v38, vcc_lo -; GFX11TRUE16-NEXT: v_lshrrev_b32_e32 v27, 16, v27 ; GFX11TRUE16-NEXT: v_and_b32_e32 v4, 1, v4 -; GFX11TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v6 -; GFX11TRUE16-NEXT: v_and_b32_e32 v7, 1, v7 -; GFX11TRUE16-NEXT: v_lshrrev_b32_e32 v21, 16, v21 -; GFX11TRUE16-NEXT: v_lshrrev_b32_e32 v29, 16, v29 -; GFX11TRUE16-NEXT: v_and_b32_e32 v2, 1, v2 -; GFX11TRUE16-NEXT: v_cndmask_b32_e32 v6, v39, v48, vcc_lo -; GFX11TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v4 -; GFX11TRUE16-NEXT: v_and_b32_e32 v9, 1, v9 -; GFX11TRUE16-NEXT: v_lshrrev_b32_e32 v22, 16, v22 -; GFX11TRUE16-NEXT: v_lshrrev_b32_e32 v30, 16, v30 -; GFX11TRUE16-NEXT: v_and_b32_e32 v0, 1, v0 -; GFX11TRUE16-NEXT: v_cndmask_b32_e32 v4, v49, v50, vcc_lo -; GFX11TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v2 ; GFX11TRUE16-NEXT: v_and_b32_e32 v11, 1, v11 -; GFX11TRUE16-NEXT: v_mov_b16_e32 v32.l, v23.l -; GFX11TRUE16-NEXT: v_lshrrev_b32_e32 v23, 16, v23 -; GFX11TRUE16-NEXT: v_cndmask_b32_e32 v2, v51, v52, vcc_lo -; GFX11TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v0 +; GFX11TRUE16-NEXT: v_and_b32_e32 v10, 1, v10 ; GFX11TRUE16-NEXT: v_and_b32_e32 v13, 1, v13 -; GFX11TRUE16-NEXT: v_cndmask_b32_e32 v0, v53, v54, vcc_lo -; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) -; GFX11TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v13 -; GFX11TRUE16-NEXT: v_cndmask_b32_e32 v13, v30, v22, vcc_lo -; GFX11TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v11 -; GFX11TRUE16-NEXT: v_cndmask_b32_e32 v11, v29, v21, vcc_lo -; GFX11TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v9 -; GFX11TRUE16-NEXT: v_cndmask_b32_e32 v9, v28, v20, vcc_lo -; GFX11TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v7 -; GFX11TRUE16-NEXT: v_cndmask_b32_e32 v7, v27, v19, vcc_lo +; GFX11TRUE16-NEXT: v_and_b32_e32 v12, 1, v12 +; GFX11TRUE16-NEXT: v_and_b32_e32 v15, 1, v15 +; 
GFX11TRUE16-NEXT: v_and_b32_e32 v14, 1, v14 +; GFX11TRUE16-NEXT: v_lshrrev_b32_e32 v39, 16, v19 +; GFX11TRUE16-NEXT: v_lshrrev_b32_e32 v48, 16, v27 +; GFX11TRUE16-NEXT: v_lshrrev_b32_e32 v53, 16, v16 +; GFX11TRUE16-NEXT: v_lshrrev_b32_e32 v54, 16, v24 ; GFX11TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v1 -; GFX11TRUE16-NEXT: v_cndmask_b32_e32 v1, v24, v16, vcc_lo -; GFX11TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v3 -; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_2) -; GFX11TRUE16-NEXT: v_perm_b32 v0, v1, v0, 0x5040100 -; GFX11TRUE16-NEXT: v_cndmask_b32_e32 v3, v25, v17, vcc_lo -; GFX11TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v5 -; GFX11TRUE16-NEXT: v_perm_b32 v1, v3, v2, 0x5040100 -; GFX11TRUE16-NEXT: v_cndmask_b32_e32 v5, v26, v18, vcc_lo -; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11TRUE16-NEXT: v_perm_b32 v2, v5, v4, 0x5040100 -; GFX11TRUE16-NEXT: v_perm_b32 v4, v9, v8, 0x5040100 -; GFX11TRUE16-NEXT: v_perm_b32 v5, v11, v10, 0x5040100 +; GFX11TRUE16-NEXT: v_cmp_eq_u32_e64 s0, 1, v0 +; GFX11TRUE16-NEXT: v_cmp_eq_u32_e64 s2, 1, v2 +; GFX11TRUE16-NEXT: v_cmp_eq_u32_e64 s5, 1, v7 +; GFX11TRUE16-NEXT: v_cmp_eq_u32_e64 s6, 1, v6 +; GFX11TRUE16-NEXT: v_cndmask_b16 v2.l, v28.l, v20.l, s8 +; GFX11TRUE16-NEXT: v_cndmask_b16 v2.h, v38.l, v37.l, s7 +; GFX11TRUE16-NEXT: v_lshrrev_b32_e32 v32, 16, v23 +; GFX11TRUE16-NEXT: v_lshrrev_b32_e32 v33, 16, v22 +; GFX11TRUE16-NEXT: v_lshrrev_b32_e32 v34, 16, v30 +; GFX11TRUE16-NEXT: v_lshrrev_b32_e32 v35, 16, v21 +; GFX11TRUE16-NEXT: v_lshrrev_b32_e32 v36, 16, v29 +; GFX11TRUE16-NEXT: v_lshrrev_b32_e32 v49, 16, v18 +; GFX11TRUE16-NEXT: v_lshrrev_b32_e32 v50, 16, v26 +; GFX11TRUE16-NEXT: v_lshrrev_b32_e32 v51, 16, v17 +; GFX11TRUE16-NEXT: v_lshrrev_b32_e32 v52, 16, v25 +; GFX11TRUE16-NEXT: v_cmp_eq_u32_e64 s1, 1, v3 +; GFX11TRUE16-NEXT: v_cmp_eq_u32_e64 s3, 1, v5 +; GFX11TRUE16-NEXT: v_cmp_eq_u32_e64 s4, 1, v4 +; GFX11TRUE16-NEXT: v_cmp_eq_u32_e64 s9, 1, v11 +; GFX11TRUE16-NEXT: 
v_cmp_eq_u32_e64 s10, 1, v12 +; GFX11TRUE16-NEXT: v_cmp_eq_u32_e64 s11, 1, v13 +; GFX11TRUE16-NEXT: v_cmp_eq_u32_e64 s12, 1, v10 +; GFX11TRUE16-NEXT: v_cmp_eq_u32_e64 s13, 1, v15 +; GFX11TRUE16-NEXT: v_cmp_eq_u32_e64 s14, 1, v14 +; GFX11TRUE16-NEXT: v_cndmask_b16 v3.l, v27.l, v19.l, s6 +; GFX11TRUE16-NEXT: v_cndmask_b16 v3.h, v48.l, v39.l, s5 +; GFX11TRUE16-NEXT: v_cndmask_b16 v4.h, v54.l, v53.l, vcc_lo +; GFX11TRUE16-NEXT: v_cndmask_b16 v5.l, v24.l, v16.l, s0 +; GFX11TRUE16-NEXT: v_mov_b16_e32 v12.l, v2.h +; GFX11TRUE16-NEXT: v_mov_b16_e32 v13.l, v2.l +; GFX11TRUE16-NEXT: v_cndmask_b16 v0.l, v30.l, v22.l, s10 +; GFX11TRUE16-NEXT: v_cndmask_b16 v0.h, v34.l, v33.l, s11 +; GFX11TRUE16-NEXT: v_cndmask_b16 v1.l, v29.l, v21.l, s12 +; GFX11TRUE16-NEXT: v_cndmask_b16 v1.h, v36.l, v35.l, s9 +; GFX11TRUE16-NEXT: v_cndmask_b16 v5.h, v52.l, v51.l, s1 +; GFX11TRUE16-NEXT: v_cndmask_b16 v6.l, v25.l, v17.l, s2 +; GFX11TRUE16-NEXT: v_cndmask_b16 v6.h, v50.l, v49.l, s3 +; GFX11TRUE16-NEXT: v_mov_b16_e32 v7.l, v4.h +; GFX11TRUE16-NEXT: v_mov_b16_e32 v8.l, v5.l +; GFX11TRUE16-NEXT: v_mov_b16_e32 v10.l, v3.h +; GFX11TRUE16-NEXT: v_mov_b16_e32 v11.l, v3.l +; GFX11TRUE16-NEXT: v_cndmask_b16 v4.l, v26.l, v18.l, s4 +; GFX11TRUE16-NEXT: v_mov_b16_e32 v5.l, v5.h +; GFX11TRUE16-NEXT: v_mov_b16_e32 v9.l, v6.l +; GFX11TRUE16-NEXT: v_mov_b16_e32 v6.l, v6.h +; GFX11TRUE16-NEXT: v_mov_b16_e32 v14.l, v1.h +; GFX11TRUE16-NEXT: v_mov_b16_e32 v15.l, v1.l +; GFX11TRUE16-NEXT: v_mov_b16_e32 v16.l, v0.h +; GFX11TRUE16-NEXT: v_mov_b16_e32 v17.l, v0.l +; GFX11TRUE16-NEXT: v_perm_b32 v0, v7, v8, 0x5040100 +; GFX11TRUE16-NEXT: v_perm_b32 v1, v5, v9, 0x5040100 +; GFX11TRUE16-NEXT: v_perm_b32 v5, v14, v15, 0x5040100 ; GFX11TRUE16-NEXT: s_waitcnt vmcnt(0) -; GFX11TRUE16-NEXT: v_lshrrev_b32_e32 v16, 16, v31 -; GFX11TRUE16-NEXT: v_mov_b16_e32 v17.l, v31.l -; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_1) -; GFX11TRUE16-NEXT: v_mov_b16_e32 v3.l, v16.l -; 
GFX11TRUE16-NEXT: v_and_b32_e32 v14, 1, v14 -; GFX11TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v14 -; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11TRUE16-NEXT: v_dual_cndmask_b32 v14, v17, v32 :: v_dual_and_b32 v15, 1, v15 -; GFX11TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v15 -; GFX11TRUE16-NEXT: v_cndmask_b32_e32 v15, v3, v23, vcc_lo -; GFX11TRUE16-NEXT: v_perm_b32 v3, v7, v6, 0x5040100 -; GFX11TRUE16-NEXT: v_perm_b32 v6, v13, v12, 0x5040100 -; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) -; GFX11TRUE16-NEXT: v_perm_b32 v7, v15, v14, 0x5040100 +; GFX11TRUE16-NEXT: v_lshrrev_b32_e32 v2, 16, v31 +; GFX11TRUE16-NEXT: v_cndmask_b16 v3.l, v31.l, v23.l, s14 +; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11TRUE16-NEXT: v_cndmask_b16 v3.h, v2.l, v32.l, s13 +; GFX11TRUE16-NEXT: v_mov_b16_e32 v7.l, v3.l +; GFX11TRUE16-NEXT: v_perm_b32 v2, v6, v4, 0x5040100 +; GFX11TRUE16-NEXT: v_perm_b32 v4, v12, v13, 0x5040100 +; GFX11TRUE16-NEXT: v_perm_b32 v6, v16, v17, 0x5040100 +; GFX11TRUE16-NEXT: v_mov_b16_e32 v8.l, v3.h +; GFX11TRUE16-NEXT: v_perm_b32 v3, v10, v11, 0x5040100 +; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX11TRUE16-NEXT: v_perm_b32 v7, v8, v7, 0x5040100 ; GFX11TRUE16-NEXT: s_setpc_b64 s[30:31] ; ; GFX11FAKE16-LABEL: v_vselect_v16bf16: @@ -39660,217 +39708,197 @@ define <32 x bfloat> @v_vselect_v32bf16(<32 x i1> %cond, <32 x bfloat> %a, <32 x ; GFX11TRUE16-NEXT: scratch_load_b32 v85, off, s32 offset:8 ; GFX11TRUE16-NEXT: scratch_load_b32 v86, off, s32 offset:68 ; GFX11TRUE16-NEXT: scratch_load_b32 v87, off, s32 offset:4 +; GFX11TRUE16-NEXT: v_and_b32_e32 v0, 1, v0 +; GFX11TRUE16-NEXT: v_and_b32_e32 v8, 1, v8 +; GFX11TRUE16-NEXT: v_and_b32_e32 v22, 1, v22 +; GFX11TRUE16-NEXT: v_and_b32_e32 v24, 1, v24 +; GFX11TRUE16-NEXT: v_and_b32_e32 v26, 1, v26 +; GFX11TRUE16-NEXT: v_and_b32_e32 v28, 1, v28 +; GFX11TRUE16-NEXT: v_and_b32_e32 v30, 1, v30 +; 
GFX11TRUE16-NEXT: v_and_b32_e32 v1, 1, v1 +; GFX11TRUE16-NEXT: v_and_b32_e32 v3, 1, v3 +; GFX11TRUE16-NEXT: v_and_b32_e32 v2, 1, v2 +; GFX11TRUE16-NEXT: v_and_b32_e32 v5, 1, v5 +; GFX11TRUE16-NEXT: v_and_b32_e32 v4, 1, v4 +; GFX11TRUE16-NEXT: v_and_b32_e32 v7, 1, v7 +; GFX11TRUE16-NEXT: v_and_b32_e32 v9, 1, v9 +; GFX11TRUE16-NEXT: v_and_b32_e32 v11, 1, v11 +; GFX11TRUE16-NEXT: v_and_b32_e32 v10, 1, v10 +; GFX11TRUE16-NEXT: v_and_b32_e32 v13, 1, v13 +; GFX11TRUE16-NEXT: v_and_b32_e32 v12, 1, v12 +; GFX11TRUE16-NEXT: v_and_b32_e32 v15, 1, v15 +; GFX11TRUE16-NEXT: v_and_b32_e32 v14, 1, v14 +; GFX11TRUE16-NEXT: v_and_b32_e32 v17, 1, v17 +; GFX11TRUE16-NEXT: v_and_b32_e32 v16, 1, v16 +; GFX11TRUE16-NEXT: v_and_b32_e32 v19, 1, v19 +; GFX11TRUE16-NEXT: v_and_b32_e32 v18, 1, v18 +; GFX11TRUE16-NEXT: v_and_b32_e32 v21, 1, v21 +; GFX11TRUE16-NEXT: v_and_b32_e32 v20, 1, v20 +; GFX11TRUE16-NEXT: v_and_b32_e32 v23, 1, v23 +; GFX11TRUE16-NEXT: v_and_b32_e32 v25, 1, v25 +; GFX11TRUE16-NEXT: v_and_b32_e32 v27, 1, v27 +; GFX11TRUE16-NEXT: v_and_b32_e32 v29, 1, v29 +; GFX11TRUE16-NEXT: v_cmp_eq_u32_e64 s0, 1, v0 +; GFX11TRUE16-NEXT: v_cmp_eq_u32_e64 s8, 1, v8 +; GFX11TRUE16-NEXT: v_cmp_eq_u32_e64 s22, 1, v22 +; GFX11TRUE16-NEXT: v_cmp_eq_u32_e64 s24, 1, v24 +; GFX11TRUE16-NEXT: v_cmp_eq_u32_e64 s26, 1, v30 +; GFX11TRUE16-NEXT: v_cmp_eq_u32_e64 s27, 1, v26 +; GFX11TRUE16-NEXT: v_cmp_eq_u32_e64 s29, 1, v28 +; GFX11TRUE16-NEXT: v_and_b32_e32 v6, 1, v6 +; GFX11TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v1 +; GFX11TRUE16-NEXT: v_cmp_eq_u32_e64 s1, 1, v3 +; GFX11TRUE16-NEXT: v_cmp_eq_u32_e64 s2, 1, v2 +; GFX11TRUE16-NEXT: v_cmp_eq_u32_e64 s3, 1, v5 +; GFX11TRUE16-NEXT: v_cmp_eq_u32_e64 s4, 1, v4 +; GFX11TRUE16-NEXT: v_cmp_eq_u32_e64 s5, 1, v7 +; GFX11TRUE16-NEXT: v_cmp_eq_u32_e64 s7, 1, v9 +; GFX11TRUE16-NEXT: v_cmp_eq_u32_e64 s9, 1, v11 +; GFX11TRUE16-NEXT: v_cmp_eq_u32_e64 s10, 1, v10 +; GFX11TRUE16-NEXT: v_cmp_eq_u32_e64 s11, 1, v13 +; GFX11TRUE16-NEXT: v_cmp_eq_u32_e64 s12, 1, v12 +; 
GFX11TRUE16-NEXT: v_cmp_eq_u32_e64 s13, 1, v15 +; GFX11TRUE16-NEXT: v_cmp_eq_u32_e64 s14, 1, v14 +; GFX11TRUE16-NEXT: v_cmp_eq_u32_e64 s15, 1, v17 +; GFX11TRUE16-NEXT: v_cmp_eq_u32_e64 s16, 1, v16 +; GFX11TRUE16-NEXT: v_cmp_eq_u32_e64 s17, 1, v19 +; GFX11TRUE16-NEXT: v_cmp_eq_u32_e64 s18, 1, v18 +; GFX11TRUE16-NEXT: v_cmp_eq_u32_e64 s19, 1, v21 +; GFX11TRUE16-NEXT: v_cmp_eq_u32_e64 s20, 1, v20 +; GFX11TRUE16-NEXT: v_cmp_eq_u32_e64 s21, 1, v23 +; GFX11TRUE16-NEXT: v_cmp_eq_u32_e64 s23, 1, v25 +; GFX11TRUE16-NEXT: v_cmp_eq_u32_e64 s25, 1, v27 +; GFX11TRUE16-NEXT: v_cmp_eq_u32_e64 s28, 1, v29 +; GFX11TRUE16-NEXT: v_cmp_eq_u32_e64 s6, 1, v6 +; GFX11TRUE16-NEXT: s_waitcnt vmcnt(32) +; GFX11TRUE16-NEXT: v_and_b32_e32 v8, 1, v31 ; GFX11TRUE16-NEXT: s_waitcnt vmcnt(31) -; GFX11TRUE16-NEXT: v_mov_b16_e32 v96.l, v32.l +; GFX11TRUE16-NEXT: v_lshrrev_b32_e32 v17, 16, v32 ; GFX11TRUE16-NEXT: s_waitcnt vmcnt(30) -; GFX11TRUE16-NEXT: v_mov_b16_e32 v97.l, v33.l -; GFX11TRUE16-NEXT: s_waitcnt vmcnt(29) -; GFX11TRUE16-NEXT: v_mov_b16_e32 v98.l, v34.l +; GFX11TRUE16-NEXT: v_cndmask_b16 v0.l, v32.l, v33.l, s26 +; GFX11TRUE16-NEXT: v_lshrrev_b32_e32 v16, 16, v33 ; GFX11TRUE16-NEXT: s_waitcnt vmcnt(28) -; GFX11TRUE16-NEXT: v_mov_b16_e32 v99.l, v35.l -; GFX11TRUE16-NEXT: s_waitcnt vmcnt(27) -; GFX11TRUE16-NEXT: v_mov_b16_e32 v100.l, v36.l +; GFX11TRUE16-NEXT: v_cndmask_b16 v0.h, v34.l, v35.l, s29 +; GFX11TRUE16-NEXT: v_lshrrev_b32_e32 v9, 16, v35 +; GFX11TRUE16-NEXT: v_lshrrev_b32_e32 v10, 16, v34 ; GFX11TRUE16-NEXT: s_waitcnt vmcnt(26) -; GFX11TRUE16-NEXT: v_mov_b16_e32 v101.l, v37.l -; GFX11TRUE16-NEXT: s_waitcnt vmcnt(25) -; GFX11TRUE16-NEXT: v_mov_b16_e32 v102.l, v38.l +; GFX11TRUE16-NEXT: v_cndmask_b16 v1.l, v36.l, v37.l, s27 +; GFX11TRUE16-NEXT: v_lshrrev_b32_e32 v11, 16, v37 +; GFX11TRUE16-NEXT: v_lshrrev_b32_e32 v12, 16, v36 ; GFX11TRUE16-NEXT: s_waitcnt vmcnt(24) -; GFX11TRUE16-NEXT: v_mov_b16_e32 v103.l, v39.l -; GFX11TRUE16-NEXT: s_waitcnt vmcnt(23) -; GFX11TRUE16-NEXT: 
v_mov_b16_e32 v112.l, v48.l +; GFX11TRUE16-NEXT: v_cndmask_b16 v1.h, v38.l, v39.l, s24 +; GFX11TRUE16-NEXT: v_lshrrev_b32_e32 v13, 16, v39 +; GFX11TRUE16-NEXT: v_lshrrev_b32_e32 v14, 16, v38 ; GFX11TRUE16-NEXT: s_waitcnt vmcnt(22) -; GFX11TRUE16-NEXT: v_mov_b16_e32 v113.l, v49.l -; GFX11TRUE16-NEXT: s_waitcnt vmcnt(21) -; GFX11TRUE16-NEXT: v_mov_b16_e32 v114.l, v50.l -; GFX11TRUE16-NEXT: s_waitcnt vmcnt(20) -; GFX11TRUE16-NEXT: v_mov_b16_e32 v115.l, v51.l -; GFX11TRUE16-NEXT: s_waitcnt vmcnt(19) -; GFX11TRUE16-NEXT: v_mov_b16_e32 v116.l, v52.l +; GFX11TRUE16-NEXT: v_cndmask_b16 v2.l, v48.l, v49.l, s22 +; GFX11TRUE16-NEXT: v_lshrrev_b32_e32 v15, 16, v49 +; GFX11TRUE16-NEXT: v_lshrrev_b32_e32 v18, 16, v48 ; GFX11TRUE16-NEXT: s_waitcnt vmcnt(18) -; GFX11TRUE16-NEXT: v_mov_b16_e32 v117.l, v53.l -; GFX11TRUE16-NEXT: s_waitcnt vmcnt(17) -; GFX11TRUE16-NEXT: v_mov_b16_e32 v118.l, v54.l -; GFX11TRUE16-NEXT: s_waitcnt vmcnt(16) -; GFX11TRUE16-NEXT: v_mov_b16_e32 v119.l, v55.l -; GFX11TRUE16-NEXT: s_waitcnt vmcnt(15) -; GFX11TRUE16-NEXT: v_mov_b16_e64 v128.l, v64.l +; GFX11TRUE16-NEXT: v_lshrrev_b32_e32 v21, 16, v53 +; GFX11TRUE16-NEXT: v_lshrrev_b32_e32 v22, 16, v52 ; GFX11TRUE16-NEXT: s_waitcnt vmcnt(14) -; GFX11TRUE16-NEXT: v_mov_b16_e64 v129.l, v65.l -; GFX11TRUE16-NEXT: s_waitcnt vmcnt(13) -; GFX11TRUE16-NEXT: v_mov_b16_e64 v130.l, v66.l -; GFX11TRUE16-NEXT: s_waitcnt vmcnt(12) -; GFX11TRUE16-NEXT: v_mov_b16_e64 v131.l, v67.l +; GFX11TRUE16-NEXT: v_lshrrev_b32_e32 v25, 16, v65 +; GFX11TRUE16-NEXT: v_lshrrev_b32_e32 v26, 16, v64 +; GFX11TRUE16-NEXT: v_cndmask_b16 v2.h, v50.l, v51.l, s20 ; GFX11TRUE16-NEXT: s_waitcnt vmcnt(11) -; GFX11TRUE16-NEXT: v_mov_b16_e64 v132.l, v68.l +; GFX11TRUE16-NEXT: v_lshrrev_b32_e32 v30, 16, v68 ; GFX11TRUE16-NEXT: s_waitcnt vmcnt(10) -; GFX11TRUE16-NEXT: v_mov_b16_e64 v133.l, v69.l +; GFX11TRUE16-NEXT: v_lshrrev_b32_e32 v29, 16, v69 ; GFX11TRUE16-NEXT: s_waitcnt vmcnt(9) -; GFX11TRUE16-NEXT: v_mov_b16_e64 v134.l, v70.l +; GFX11TRUE16-NEXT: 
v_lshrrev_b32_e32 v32, 16, v70 ; GFX11TRUE16-NEXT: s_waitcnt vmcnt(8) -; GFX11TRUE16-NEXT: v_mov_b16_e64 v135.l, v71.l -; GFX11TRUE16-NEXT: v_lshrrev_b32_e32 v71, 16, v71 -; GFX11TRUE16-NEXT: v_lshrrev_b32_e32 v70, 16, v70 -; GFX11TRUE16-NEXT: v_and_b32_e32 v30, 1, v30 +; GFX11TRUE16-NEXT: v_lshrrev_b32_e32 v31, 16, v71 +; GFX11TRUE16-NEXT: s_waitcnt vmcnt(7) +; GFX11TRUE16-NEXT: v_lshrrev_b32_e32 v34, 16, v80 +; GFX11TRUE16-NEXT: s_waitcnt vmcnt(6) +; GFX11TRUE16-NEXT: v_lshrrev_b32_e32 v33, 16, v81 ; GFX11TRUE16-NEXT: s_waitcnt vmcnt(5) -; GFX11TRUE16-NEXT: v_mov_b16_e64 v146.l, v82.l +; GFX11TRUE16-NEXT: v_lshrrev_b32_e32 v36, 16, v82 ; GFX11TRUE16-NEXT: s_waitcnt vmcnt(4) -; GFX11TRUE16-NEXT: v_mov_b16_e64 v147.l, v83.l -; GFX11TRUE16-NEXT: v_lshrrev_b32_e32 v83, 16, v83 -; GFX11TRUE16-NEXT: v_lshrrev_b32_e32 v82, 16, v82 -; GFX11TRUE16-NEXT: v_and_b32_e32 v28, 1, v28 -; GFX11TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v30 -; GFX11TRUE16-NEXT: v_and_b32_e32 v3, 1, v3 +; GFX11TRUE16-NEXT: v_lshrrev_b32_e32 v35, 16, v83 ; GFX11TRUE16-NEXT: s_waitcnt vmcnt(3) -; GFX11TRUE16-NEXT: v_mov_b16_e32 v30.l, v84.l -; GFX11TRUE16-NEXT: v_lshrrev_b32_e32 v84, 16, v84 -; GFX11TRUE16-NEXT: v_and_b32_e32 v26, 1, v26 -; GFX11TRUE16-NEXT: v_cndmask_b32_e32 v96, v96, v97, vcc_lo -; GFX11TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v28 -; GFX11TRUE16-NEXT: v_and_b32_e32 v1, 1, v1 +; GFX11TRUE16-NEXT: v_lshrrev_b32_e32 v38, 16, v84 ; GFX11TRUE16-NEXT: s_waitcnt vmcnt(2) -; GFX11TRUE16-NEXT: v_mov_b16_e32 v97.l, v85.l -; GFX11TRUE16-NEXT: v_lshrrev_b32_e32 v85, 16, v85 -; GFX11TRUE16-NEXT: v_and_b32_e32 v24, 1, v24 -; GFX11TRUE16-NEXT: v_cndmask_b32_e32 v98, v98, v99, vcc_lo -; GFX11TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v26 -; GFX11TRUE16-NEXT: v_and_b32_e32 v7, 1, v7 -; GFX11TRUE16-NEXT: v_mov_b16_e64 v144.l, v80.l -; GFX11TRUE16-NEXT: v_mov_b16_e64 v145.l, v81.l -; GFX11TRUE16-NEXT: v_lshrrev_b32_e32 v81, 16, v81 -; GFX11TRUE16-NEXT: v_cndmask_b32_e32 v26, v100, v101, vcc_lo -; 
GFX11TRUE16-NEXT: v_lshrrev_b32_e32 v80, 16, v80 -; GFX11TRUE16-NEXT: v_and_b32_e32 v22, 1, v22 -; GFX11TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v24 -; GFX11TRUE16-NEXT: v_and_b32_e32 v5, 1, v5 -; GFX11TRUE16-NEXT: v_lshrrev_b32_e32 v69, 16, v69 -; GFX11TRUE16-NEXT: v_lshrrev_b32_e32 v68, 16, v68 -; GFX11TRUE16-NEXT: v_and_b32_e32 v20, 1, v20 -; GFX11TRUE16-NEXT: v_cndmask_b32_e32 v24, v102, v103, vcc_lo -; GFX11TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v22 -; GFX11TRUE16-NEXT: v_and_b32_e32 v11, 1, v11 -; GFX11TRUE16-NEXT: v_lshrrev_b32_e32 v67, 16, v67 -; GFX11TRUE16-NEXT: v_lshrrev_b32_e32 v66, 16, v66 -; GFX11TRUE16-NEXT: v_and_b32_e32 v18, 1, v18 -; GFX11TRUE16-NEXT: v_cndmask_b32_e32 v22, v112, v113, vcc_lo -; GFX11TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v20 -; GFX11TRUE16-NEXT: v_and_b32_e32 v9, 1, v9 -; GFX11TRUE16-NEXT: v_lshrrev_b32_e32 v65, 16, v65 -; GFX11TRUE16-NEXT: v_lshrrev_b32_e32 v64, 16, v64 -; GFX11TRUE16-NEXT: v_and_b32_e32 v16, 1, v16 -; GFX11TRUE16-NEXT: v_cndmask_b32_e32 v20, v114, v115, vcc_lo -; GFX11TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v18 -; GFX11TRUE16-NEXT: v_and_b32_e32 v15, 1, v15 -; GFX11TRUE16-NEXT: v_lshrrev_b32_e32 v55, 16, v55 -; GFX11TRUE16-NEXT: v_lshrrev_b32_e32 v54, 16, v54 -; GFX11TRUE16-NEXT: v_and_b32_e32 v14, 1, v14 -; GFX11TRUE16-NEXT: v_cndmask_b32_e32 v18, v116, v117, vcc_lo -; GFX11TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v16 -; GFX11TRUE16-NEXT: v_and_b32_e32 v13, 1, v13 -; GFX11TRUE16-NEXT: v_lshrrev_b32_e32 v53, 16, v53 -; GFX11TRUE16-NEXT: v_lshrrev_b32_e32 v52, 16, v52 -; GFX11TRUE16-NEXT: v_and_b32_e32 v12, 1, v12 -; GFX11TRUE16-NEXT: v_cndmask_b32_e32 v16, v118, v119, vcc_lo -; GFX11TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v14 -; GFX11TRUE16-NEXT: v_and_b32_e32 v19, 1, v19 -; GFX11TRUE16-NEXT: v_lshrrev_b32_e32 v51, 16, v51 -; GFX11TRUE16-NEXT: v_lshrrev_b32_e32 v50, 16, v50 -; GFX11TRUE16-NEXT: v_and_b32_e32 v10, 1, v10 -; GFX11TRUE16-NEXT: v_cndmask_b32_e32 v14, v128, v129, vcc_lo -; GFX11TRUE16-NEXT: 
v_cmp_eq_u32_e32 vcc_lo, 1, v12 -; GFX11TRUE16-NEXT: v_and_b32_e32 v17, 1, v17 -; GFX11TRUE16-NEXT: v_lshrrev_b32_e32 v49, 16, v49 -; GFX11TRUE16-NEXT: v_lshrrev_b32_e32 v48, 16, v48 -; GFX11TRUE16-NEXT: v_and_b32_e32 v8, 1, v8 -; GFX11TRUE16-NEXT: v_cndmask_b32_e32 v12, v130, v131, vcc_lo -; GFX11TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v10 -; GFX11TRUE16-NEXT: v_and_b32_e32 v23, 1, v23 -; GFX11TRUE16-NEXT: v_lshrrev_b32_e32 v39, 16, v39 -; GFX11TRUE16-NEXT: v_lshrrev_b32_e32 v38, 16, v38 -; GFX11TRUE16-NEXT: v_and_b32_e32 v6, 1, v6 -; GFX11TRUE16-NEXT: v_cndmask_b32_e32 v10, v132, v133, vcc_lo -; GFX11TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v8 -; GFX11TRUE16-NEXT: v_and_b32_e32 v21, 1, v21 -; GFX11TRUE16-NEXT: v_lshrrev_b32_e32 v37, 16, v37 -; GFX11TRUE16-NEXT: v_lshrrev_b32_e32 v36, 16, v36 -; GFX11TRUE16-NEXT: v_and_b32_e32 v4, 1, v4 -; GFX11TRUE16-NEXT: v_cndmask_b32_e32 v8, v134, v135, vcc_lo -; GFX11TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v6 -; GFX11TRUE16-NEXT: v_and_b32_e32 v27, 1, v27 -; GFX11TRUE16-NEXT: v_lshrrev_b32_e32 v35, 16, v35 -; GFX11TRUE16-NEXT: v_lshrrev_b32_e32 v34, 16, v34 -; GFX11TRUE16-NEXT: v_and_b32_e32 v2, 1, v2 -; GFX11TRUE16-NEXT: v_cndmask_b32_e32 v6, v144, v145, vcc_lo -; GFX11TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v4 -; GFX11TRUE16-NEXT: v_and_b32_e32 v25, 1, v25 -; GFX11TRUE16-NEXT: v_lshrrev_b32_e32 v33, 16, v33 -; GFX11TRUE16-NEXT: v_lshrrev_b32_e32 v32, 16, v32 -; GFX11TRUE16-NEXT: v_and_b32_e32 v0, 1, v0 -; GFX11TRUE16-NEXT: v_cndmask_b32_e32 v4, v146, v147, vcc_lo -; GFX11TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v2 -; GFX11TRUE16-NEXT: v_and_b32_e32 v31, 1, v31 +; GFX11TRUE16-NEXT: v_lshrrev_b32_e32 v37, 16, v85 ; GFX11TRUE16-NEXT: s_waitcnt vmcnt(1) -; GFX11TRUE16-NEXT: v_mov_b16_e32 v28.l, v86.l +; GFX11TRUE16-NEXT: v_lshrrev_b32_e32 v48, 16, v86 ; GFX11TRUE16-NEXT: s_waitcnt vmcnt(0) -; GFX11TRUE16-NEXT: v_mov_b16_e32 v99.l, v87.l -; GFX11TRUE16-NEXT: v_lshrrev_b32_e32 v87, 16, v87 -; GFX11TRUE16-NEXT: 
v_cndmask_b32_e32 v2, v30, v97, vcc_lo -; GFX11TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v0 -; GFX11TRUE16-NEXT: v_and_b32_e32 v29, 1, v29 -; GFX11TRUE16-NEXT: v_lshrrev_b32_e32 v86, 16, v86 -; GFX11TRUE16-NEXT: v_mov_b16_e32 v30.l, v84.l -; GFX11TRUE16-NEXT: v_mov_b16_e32 v84.l, v85.l -; GFX11TRUE16-NEXT: v_cndmask_b32_e32 v0, v28, v99, vcc_lo -; GFX11TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v31 -; GFX11TRUE16-NEXT: v_mov_b16_e32 v28.l, v86.l -; GFX11TRUE16-NEXT: v_mov_b16_e32 v85.l, v87.l -; GFX11TRUE16-NEXT: v_cndmask_b32_e32 v31, v32, v33, vcc_lo -; GFX11TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v29 -; GFX11TRUE16-NEXT: v_cndmask_b32_e32 v29, v34, v35, vcc_lo -; GFX11TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v27 -; GFX11TRUE16-NEXT: v_cndmask_b32_e32 v27, v36, v37, vcc_lo -; GFX11TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v25 -; GFX11TRUE16-NEXT: v_cndmask_b32_e32 v25, v38, v39, vcc_lo -; GFX11TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v23 -; GFX11TRUE16-NEXT: v_cndmask_b32_e32 v23, v48, v49, vcc_lo -; GFX11TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v21 -; GFX11TRUE16-NEXT: v_cndmask_b32_e32 v21, v50, v51, vcc_lo -; GFX11TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v19 -; GFX11TRUE16-NEXT: v_cndmask_b32_e32 v19, v52, v53, vcc_lo -; GFX11TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v17 -; GFX11TRUE16-NEXT: v_cndmask_b32_e32 v17, v54, v55, vcc_lo -; GFX11TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v15 -; GFX11TRUE16-NEXT: v_cndmask_b32_e32 v15, v64, v65, vcc_lo -; GFX11TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v13 -; GFX11TRUE16-NEXT: v_cndmask_b32_e32 v13, v66, v67, vcc_lo -; GFX11TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v11 -; GFX11TRUE16-NEXT: v_cndmask_b32_e32 v11, v68, v69, vcc_lo -; GFX11TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v7 -; GFX11TRUE16-NEXT: v_cndmask_b32_e32 v7, v80, v81, vcc_lo -; GFX11TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v3 -; GFX11TRUE16-NEXT: v_cndmask_b32_e32 v3, v30, v84, vcc_lo -; GFX11TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v1 -; GFX11TRUE16-NEXT: 
v_cndmask_b32_e32 v1, v28, v85, vcc_lo -; GFX11TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v5 -; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) -; GFX11TRUE16-NEXT: v_perm_b32 v0, v1, v0, 0x5040100 -; GFX11TRUE16-NEXT: v_cndmask_b32_e32 v5, v82, v83, vcc_lo -; GFX11TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v9 -; GFX11TRUE16-NEXT: v_perm_b32 v1, v3, v2, 0x5040100 -; GFX11TRUE16-NEXT: v_perm_b32 v3, v7, v6, 0x5040100 -; GFX11TRUE16-NEXT: v_perm_b32 v6, v13, v12, 0x5040100 -; GFX11TRUE16-NEXT: v_perm_b32 v2, v5, v4, 0x5040100 -; GFX11TRUE16-NEXT: v_cndmask_b32_e32 v9, v70, v71, vcc_lo -; GFX11TRUE16-NEXT: v_perm_b32 v5, v11, v10, 0x5040100 -; GFX11TRUE16-NEXT: v_perm_b32 v7, v15, v14, 0x5040100 -; GFX11TRUE16-NEXT: v_perm_b32 v10, v21, v20, 0x5040100 -; GFX11TRUE16-NEXT: v_perm_b32 v11, v23, v22, 0x5040100 -; GFX11TRUE16-NEXT: v_perm_b32 v4, v9, v8, 0x5040100 -; GFX11TRUE16-NEXT: v_perm_b32 v8, v17, v16, 0x5040100 -; GFX11TRUE16-NEXT: v_perm_b32 v9, v19, v18, 0x5040100 -; GFX11TRUE16-NEXT: v_perm_b32 v12, v25, v24, 0x5040100 -; GFX11TRUE16-NEXT: v_perm_b32 v13, v27, v26, 0x5040100 -; GFX11TRUE16-NEXT: v_perm_b32 v14, v29, v98, 0x5040100 -; GFX11TRUE16-NEXT: v_perm_b32 v15, v31, v96, 0x5040100 +; GFX11TRUE16-NEXT: v_cndmask_b16 v7.h, v86.l, v87.l, s0 +; GFX11TRUE16-NEXT: v_lshrrev_b32_e32 v39, 16, v87 +; GFX11TRUE16-NEXT: v_cmp_eq_u32_e64 s0, 1, v8 +; GFX11TRUE16-NEXT: v_lshrrev_b32_e32 v19, 16, v51 +; GFX11TRUE16-NEXT: v_lshrrev_b32_e32 v20, 16, v50 +; GFX11TRUE16-NEXT: v_cndmask_b16 v3.l, v52.l, v53.l, s18 +; GFX11TRUE16-NEXT: v_cndmask_b16 v3.h, v54.l, v55.l, s16 +; GFX11TRUE16-NEXT: v_lshrrev_b32_e32 v23, 16, v55 +; GFX11TRUE16-NEXT: v_lshrrev_b32_e32 v24, 16, v54 +; GFX11TRUE16-NEXT: v_cndmask_b16 v4.l, v64.l, v65.l, s14 +; GFX11TRUE16-NEXT: v_cndmask_b16 v4.h, v66.l, v67.l, s12 +; GFX11TRUE16-NEXT: v_lshrrev_b32_e32 v27, 16, v67 +; GFX11TRUE16-NEXT: v_lshrrev_b32_e32 v28, 16, v66 +; GFX11TRUE16-NEXT: v_cndmask_b16 v5.h, v70.l, v71.l, s8 +; GFX11TRUE16-NEXT: 
v_cndmask_b16 v6.h, v82.l, v83.l, s4 +; GFX11TRUE16-NEXT: v_cndmask_b16 v8.l, v10.l, v9.l, s28 +; GFX11TRUE16-NEXT: v_cndmask_b16 v8.h, v12.l, v11.l, s25 +; GFX11TRUE16-NEXT: v_cndmask_b16 v9.l, v14.l, v13.l, s23 +; GFX11TRUE16-NEXT: v_cndmask_b16 v9.h, v18.l, v15.l, s21 +; GFX11TRUE16-NEXT: v_cndmask_b16 v10.h, v22.l, v21.l, s17 +; GFX11TRUE16-NEXT: v_cndmask_b16 v11.h, v26.l, v25.l, s13 +; GFX11TRUE16-NEXT: v_cndmask_b16 v12.h, v30.l, v29.l, s9 +; GFX11TRUE16-NEXT: v_cndmask_b16 v13.l, v32.l, v31.l, s7 +; GFX11TRUE16-NEXT: v_cndmask_b16 v13.h, v34.l, v33.l, s5 +; GFX11TRUE16-NEXT: v_cndmask_b16 v14.l, v36.l, v35.l, s3 +; GFX11TRUE16-NEXT: v_cndmask_b16 v14.h, v38.l, v37.l, s1 +; GFX11TRUE16-NEXT: v_cndmask_b16 v15.l, v48.l, v39.l, vcc_lo +; GFX11TRUE16-NEXT: v_cndmask_b16 v15.h, v17.l, v16.l, s0 +; GFX11TRUE16-NEXT: v_cndmask_b16 v5.l, v68.l, v69.l, s10 +; GFX11TRUE16-NEXT: v_cndmask_b16 v6.l, v80.l, v81.l, s6 +; GFX11TRUE16-NEXT: v_cndmask_b16 v7.l, v84.l, v85.l, s2 +; GFX11TRUE16-NEXT: v_cndmask_b16 v10.l, v20.l, v19.l, s19 +; GFX11TRUE16-NEXT: v_cndmask_b16 v11.l, v24.l, v23.l, s15 +; GFX11TRUE16-NEXT: v_cndmask_b16 v12.l, v28.l, v27.l, s11 +; GFX11TRUE16-NEXT: v_mov_b16_e32 v18.l, v7.h +; GFX11TRUE16-NEXT: v_mov_b16_e32 v19.l, v6.h +; GFX11TRUE16-NEXT: v_mov_b16_e32 v20.l, v5.h +; GFX11TRUE16-NEXT: v_mov_b16_e32 v21.l, v4.h +; GFX11TRUE16-NEXT: v_mov_b16_e32 v22.l, v4.l +; GFX11TRUE16-NEXT: v_mov_b16_e32 v23.l, v3.h +; GFX11TRUE16-NEXT: v_mov_b16_e32 v24.l, v3.l +; GFX11TRUE16-NEXT: v_mov_b16_e32 v25.l, v2.h +; GFX11TRUE16-NEXT: v_mov_b16_e32 v26.l, v2.l +; GFX11TRUE16-NEXT: v_mov_b16_e32 v27.l, v1.h +; GFX11TRUE16-NEXT: v_mov_b16_e32 v28.l, v1.l +; GFX11TRUE16-NEXT: v_mov_b16_e32 v29.l, v0.h +; GFX11TRUE16-NEXT: v_mov_b16_e32 v30.l, v0.l +; GFX11TRUE16-NEXT: v_mov_b16_e32 v0.l, v15.l +; GFX11TRUE16-NEXT: v_mov_b16_e32 v1.l, v14.h +; GFX11TRUE16-NEXT: v_mov_b16_e32 v2.l, v14.l +; GFX11TRUE16-NEXT: v_mov_b16_e32 v3.l, v13.h +; GFX11TRUE16-NEXT: v_mov_b16_e32 
v4.l, v13.l +; GFX11TRUE16-NEXT: v_mov_b16_e32 v13.l, v12.h +; GFX11TRUE16-NEXT: v_mov_b16_e32 v14.l, v11.h +; GFX11TRUE16-NEXT: v_mov_b16_e32 v16.l, v10.h +; GFX11TRUE16-NEXT: v_mov_b16_e32 v17.l, v9.h +; GFX11TRUE16-NEXT: v_mov_b16_e32 v31.l, v9.l +; GFX11TRUE16-NEXT: v_mov_b16_e32 v32.l, v8.h +; GFX11TRUE16-NEXT: v_mov_b16_e32 v33.l, v8.l +; GFX11TRUE16-NEXT: v_mov_b16_e32 v15.l, v15.h +; GFX11TRUE16-NEXT: v_perm_b32 v0, v0, v18, 0x5040100 +; GFX11TRUE16-NEXT: v_perm_b32 v1, v1, v7, 0x5040100 +; GFX11TRUE16-NEXT: v_perm_b32 v2, v2, v19, 0x5040100 +; GFX11TRUE16-NEXT: v_perm_b32 v3, v3, v6, 0x5040100 +; GFX11TRUE16-NEXT: v_perm_b32 v4, v4, v20, 0x5040100 +; GFX11TRUE16-NEXT: v_perm_b32 v5, v13, v5, 0x5040100 +; GFX11TRUE16-NEXT: v_perm_b32 v6, v12, v21, 0x5040100 +; GFX11TRUE16-NEXT: v_perm_b32 v7, v14, v22, 0x5040100 +; GFX11TRUE16-NEXT: v_perm_b32 v8, v11, v23, 0x5040100 +; GFX11TRUE16-NEXT: v_perm_b32 v9, v16, v24, 0x5040100 +; GFX11TRUE16-NEXT: v_perm_b32 v10, v10, v25, 0x5040100 +; GFX11TRUE16-NEXT: v_perm_b32 v11, v17, v26, 0x5040100 +; GFX11TRUE16-NEXT: v_perm_b32 v12, v31, v27, 0x5040100 +; GFX11TRUE16-NEXT: v_perm_b32 v13, v32, v28, 0x5040100 +; GFX11TRUE16-NEXT: v_perm_b32 v14, v33, v29, 0x5040100 +; GFX11TRUE16-NEXT: v_perm_b32 v15, v15, v30, 0x5040100 ; GFX11TRUE16-NEXT: s_setpc_b64 s[30:31] ; ; GFX11FAKE16-LABEL: v_vselect_v32bf16: diff --git a/llvm/test/CodeGen/AMDGPU/bswap.ll b/llvm/test/CodeGen/AMDGPU/bswap.ll index 30c8e94c9a27f..a95a1aba0c914 100644 --- a/llvm/test/CodeGen/AMDGPU/bswap.ll +++ b/llvm/test/CodeGen/AMDGPU/bswap.ll @@ -1,7 +1,8 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py ; RUN: llc < %s -mtriple=amdgcn-- -verify-machineinstrs | FileCheck %s --check-prefix=SI ; RUN: llc < %s -mtriple=amdgcn-- -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs | FileCheck %s --check-prefix=VI -; RUN: llc < %s -mtriple=amdgcn-- -mcpu=gfx1100 -mattr=-flat-for-global -verify-machineinstrs | FileCheck %s 
--check-prefix=GFX11 +; RUN: llc < %s -mtriple=amdgcn-- -mcpu=gfx1100 -mattr=-flat-for-global,+real-true16 -verify-machineinstrs | FileCheck %s --check-prefixes=GFX11,GFX11-REAL16 +; RUN: llc < %s -mtriple=amdgcn-- -mcpu=gfx1100 -mattr=-flat-for-global,-real-true16 -verify-machineinstrs | FileCheck %s --check-prefixes=GFX11,GFX11-FAKE16 declare i16 @llvm.bswap.i16(i16) nounwind readnone declare <2 x i16> @llvm.bswap.v2i16(<2 x i16>) nounwind readnone @@ -490,13 +491,21 @@ define float @missing_truncate_promote_bswap(i32 %arg) { ; VI-NEXT: v_cvt_f32_f16_e32 v0, v0 ; VI-NEXT: s_setpc_b64 s[30:31] ; -; GFX11-LABEL: missing_truncate_promote_bswap: -; GFX11: ; %bb.0: ; %bb -; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: v_perm_b32 v0, 0, v0, 0xc0c0001 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_cvt_f32_f16_e32 v0, v0 -; GFX11-NEXT: s_setpc_b64 s[30:31] +; GFX11-REAL16-LABEL: missing_truncate_promote_bswap: +; GFX11-REAL16: ; %bb.0: ; %bb +; GFX11-REAL16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-REAL16-NEXT: v_perm_b32 v0, 0, v0, 0xc0c0001 +; GFX11-REAL16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-REAL16-NEXT: v_cvt_f32_f16_e32 v0, v0.l +; GFX11-REAL16-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-FAKE16-LABEL: missing_truncate_promote_bswap: +; GFX11-FAKE16: ; %bb.0: ; %bb +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-FAKE16-NEXT: v_perm_b32 v0, 0, v0, 0xc0c0001 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_cvt_f32_f16_e32 v0, v0 +; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31] bb: %tmp = trunc i32 %arg to i16 %tmp1 = call i16 @llvm.bswap.i16(i16 %tmp) diff --git a/llvm/test/CodeGen/AMDGPU/collapse-endcf.mir b/llvm/test/CodeGen/AMDGPU/collapse-endcf.mir index 48ca53732ed06..b278bfca7f7a3 100644 --- a/llvm/test/CodeGen/AMDGPU/collapse-endcf.mir +++ b/llvm/test/CodeGen/AMDGPU/collapse-endcf.mir @@ -1,5 +1,6 @@ # NOTE: Assertions have been autogenerated by 
utils/update_mir_test_checks.py # RUN: llc -mtriple=amdgcn -verify-machineinstrs -run-pass=si-lower-control-flow -amdgpu-remove-redundant-endcf %s -o - | FileCheck -check-prefix=GCN %s +# RUN: llc -mtriple=amdgcn -passes=si-lower-control-flow -amdgpu-remove-redundant-endcf %s -o - | FileCheck -check-prefix=GCN %s # Make sure dbg_value doesn't change codeegn when collapsing end_cf --- diff --git a/llvm/test/CodeGen/AMDGPU/insert_vector_dynelt.ll b/llvm/test/CodeGen/AMDGPU/insert_vector_dynelt.ll index 7912d1cf8dc0d..add8c0f75bf33 100644 --- a/llvm/test/CodeGen/AMDGPU/insert_vector_dynelt.ll +++ b/llvm/test/CodeGen/AMDGPU/insert_vector_dynelt.ll @@ -452,11 +452,11 @@ define amdgpu_kernel void @byte8_inselt(ptr addrspace(1) %out, <8 x i8> %vec, i3 ; GCN-NEXT: s_and_b32 s6, s4, 0x1010101 ; GCN-NEXT: s_andn2_b64 s[2:3], s[2:3], s[4:5] ; GCN-NEXT: s_or_b64 s[2:3], s[6:7], s[2:3] -; GCN-NEXT: v_mov_b32_e32 v3, s1 -; GCN-NEXT: v_mov_b32_e32 v0, s2 -; GCN-NEXT: v_mov_b32_e32 v1, s3 -; GCN-NEXT: v_mov_b32_e32 v2, s0 -; GCN-NEXT: flat_store_dwordx2 v[2:3], v[0:1] +; GCN-NEXT: v_mov_b32_e32 v0, s0 +; GCN-NEXT: v_mov_b32_e32 v2, s2 +; GCN-NEXT: v_mov_b32_e32 v1, s1 +; GCN-NEXT: v_mov_b32_e32 v3, s3 +; GCN-NEXT: flat_store_dwordx2 v[0:1], v[2:3] ; GCN-NEXT: s_endpgm entry: %v = insertelement <8 x i8> %vec, i8 1, i32 %sel diff --git a/llvm/test/CodeGen/AMDGPU/licm-wwm.mir b/llvm/test/CodeGen/AMDGPU/licm-wwm.mir new file mode 100644 index 0000000000000..fc20674971a71 --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/licm-wwm.mir @@ -0,0 +1,46 @@ +# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py UTC_ARGS: --version 5 +# RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1100 -run-pass=early-machinelicm,si-wqm -o - %s | FileCheck -check-prefix=GCN %s + +# Machine LICM may hoist an intruction from a WWM region, which will force SI-WQM pass +# to create a second WWM region. This is an unwanted hoisting. 
+ +--- +name: licm_move_wwm +tracksRegLiveness: true +body: | + ; GCN-LABEL: name: licm_move_wwm + ; GCN: bb.0: + ; GCN-NEXT: successors: %bb.1(0x80000000) + ; GCN-NEXT: {{ $}} + ; GCN-NEXT: [[ENTER_STRICT_WWM:%[0-9]+]]:sreg_32 = ENTER_STRICT_WWM -1, implicit-def $exec, implicit-def $scc, implicit $exec + ; GCN-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 1, implicit $exec + ; GCN-NEXT: $exec_lo = EXIT_STRICT_WWM [[ENTER_STRICT_WWM]] + ; GCN-NEXT: S_BRANCH %bb.1 + ; GCN-NEXT: {{ $}} + ; GCN-NEXT: bb.1: + ; GCN-NEXT: successors: %bb.1(0x40000000), %bb.2(0x40000000) + ; GCN-NEXT: {{ $}} + ; GCN-NEXT: [[ENTER_STRICT_WWM1:%[0-9]+]]:sreg_32 = ENTER_STRICT_WWM -1, implicit-def $exec, implicit-def $scc, implicit $exec + ; GCN-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[V_MOV_B32_e32_]], implicit $exec + ; GCN-NEXT: $exec_lo = EXIT_STRICT_WWM [[ENTER_STRICT_WWM1]] + ; GCN-NEXT: [[COPY:%[0-9]+]]:sreg_32 = COPY [[V_READFIRSTLANE_B32_]] + ; GCN-NEXT: $exec_lo = S_OR_B32 $exec_lo, [[COPY]], implicit-def $scc + ; GCN-NEXT: S_CBRANCH_EXECNZ %bb.1, implicit $exec + ; GCN-NEXT: S_BRANCH %bb.2 + ; GCN-NEXT: {{ $}} + ; GCN-NEXT: bb.2: + ; GCN-NEXT: S_ENDPGM 0 + bb.0: + S_BRANCH %bb.1 + + bb.1: + %0:vgpr_32 = V_MOV_B32_e32 1, implicit $exec + %1:sreg_32 = V_READFIRSTLANE_B32 killed %0:vgpr_32, implicit $exec + early-clobber %2:sreg_32 = STRICT_WWM killed %1:sreg_32, implicit $exec + $exec_lo = S_OR_B32 $exec_lo, %2, implicit-def $scc + S_CBRANCH_EXECNZ %bb.1, implicit $exec + S_BRANCH %bb.2 + + bb.2: + S_ENDPGM 0 +... 
diff --git a/llvm/test/CodeGen/AMDGPU/lower-control-flow-live-intervals.mir b/llvm/test/CodeGen/AMDGPU/lower-control-flow-live-intervals.mir index 9eeec4fa3a93d..d156a0aef6c17 100644 --- a/llvm/test/CodeGen/AMDGPU/lower-control-flow-live-intervals.mir +++ b/llvm/test/CodeGen/AMDGPU/lower-control-flow-live-intervals.mir @@ -1,5 +1,6 @@ # NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py UTC_ARGS: --version 3 # RUN: llc -run-pass=liveintervals -run-pass=si-lower-control-flow -mtriple=amdgcn--amdpal -mcpu=gfx1030 -verify-machineinstrs -o - %s | FileCheck %s +# RUN: llc -passes='require,si-lower-control-flow' -mtriple=amdgcn--amdpal -mcpu=gfx1030 -o - %s | FileCheck %s # Check that verifier passes for the following. diff --git a/llvm/test/CodeGen/AMDGPU/lower-control-flow-other-terminators.mir b/llvm/test/CodeGen/AMDGPU/lower-control-flow-other-terminators.mir index 914cc8ae8844c..eaf398fd51723 100644 --- a/llvm/test/CodeGen/AMDGPU/lower-control-flow-other-terminators.mir +++ b/llvm/test/CodeGen/AMDGPU/lower-control-flow-other-terminators.mir @@ -1,5 +1,6 @@ # NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py # RUN: llc -mtriple=amdgcn -mcpu=fiji -verify-machineinstrs -run-pass=si-lower-control-flow -o - %s | FileCheck %s +# RUN: llc -mtriple=amdgcn -mcpu=fiji -passes=si-lower-control-flow -o - %s | FileCheck %s # Test si-lower-control-flow insertion points when other terminator # instructions are present besides the control flow pseudo and a diff --git a/llvm/test/CodeGen/AMDGPU/mad-mix.ll b/llvm/test/CodeGen/AMDGPU/mad-mix.ll index b520dd1060ec8..30e3bc3ba5da8 100644 --- a/llvm/test/CodeGen/AMDGPU/mad-mix.ll +++ b/llvm/test/CodeGen/AMDGPU/mad-mix.ll @@ -385,17 +385,15 @@ define <2 x float> @v_mad_mix_v2f32_shuffle(<2 x half> %src0, <2 x half> %src1, ; SDAG-CI: ; %bb.0: ; SDAG-CI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; SDAG-CI-NEXT: v_cvt_f16_f32_e32 v3, v3 -; SDAG-CI-NEXT: v_cvt_f16_f32_e32 v4, v5 ; 
SDAG-CI-NEXT: v_cvt_f16_f32_e32 v2, v2 -; SDAG-CI-NEXT: v_cvt_f16_f32_e32 v5, v1 +; SDAG-CI-NEXT: v_cvt_f16_f32_e32 v1, v1 ; SDAG-CI-NEXT: v_cvt_f16_f32_e32 v0, v0 ; SDAG-CI-NEXT: v_cvt_f32_f16_e32 v3, v3 -; SDAG-CI-NEXT: v_cvt_f32_f16_e32 v1, v4 ; SDAG-CI-NEXT: v_cvt_f32_f16_e32 v2, v2 -; SDAG-CI-NEXT: v_cvt_f32_f16_e32 v4, v5 -; SDAG-CI-NEXT: v_cvt_f32_f16_e32 v5, v0 -; SDAG-CI-NEXT: v_mad_f32 v0, v4, v2, v1 -; SDAG-CI-NEXT: v_mac_f32_e32 v1, v5, v3 +; SDAG-CI-NEXT: v_cvt_f32_f16_e32 v1, v1 +; SDAG-CI-NEXT: v_cvt_f32_f16_e32 v4, v0 +; SDAG-CI-NEXT: v_mad_f32 v0, v1, v2, v5 +; SDAG-CI-NEXT: v_mad_f32 v1, v4, v3, v5 ; SDAG-CI-NEXT: s_setpc_b64 s[30:31] ; ; GISEL-CI-LABEL: v_mad_mix_v2f32_shuffle: diff --git a/llvm/test/CodeGen/AMDGPU/mad_64_32.ll b/llvm/test/CodeGen/AMDGPU/mad_64_32.ll index 3be17f9538d0f..7d18739fd0c32 100644 --- a/llvm/test/CodeGen/AMDGPU/mad_64_32.ll +++ b/llvm/test/CodeGen/AMDGPU/mad_64_32.ll @@ -1337,10 +1337,10 @@ define i64 @lshr_mad_i64_1(i64 %arg0, i64 %arg1) #0 { ; CI-LABEL: lshr_mad_i64_1: ; CI: ; %bb.0: ; CI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; CI-NEXT: v_mov_b32_e32 v2, v1 +; CI-NEXT: v_mov_b32_e32 v1, 0 ; CI-NEXT: s_movk_i32 s4, 0xfc19 -; CI-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v1, s4, v[0:1] -; CI-NEXT: v_sub_i32_e32 v1, vcc, v3, v1 -; CI-NEXT: v_mov_b32_e32 v0, v2 +; CI-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v2, s4, v[0:1] ; CI-NEXT: s_setpc_b64 s[30:31] ; ; SI-LABEL: lshr_mad_i64_1: @@ -1357,20 +1357,28 @@ define i64 @lshr_mad_i64_1(i64 %arg0, i64 %arg1) #0 { ; GFX9-LABEL: lshr_mad_i64_1: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_mov_b32_e32 v2, v1 +; GFX9-NEXT: v_mov_b32_e32 v1, 0 ; GFX9-NEXT: s_movk_i32 s4, 0xfc19 -; GFX9-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v1, s4, v[0:1] -; GFX9-NEXT: v_sub_u32_e32 v1, v3, v1 -; GFX9-NEXT: v_mov_b32_e32 v0, v2 +; GFX9-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v2, s4, v[0:1] ; GFX9-NEXT: s_setpc_b64 s[30:31] ; -; GFX11-LABEL: lshr_mad_i64_1: -; GFX11: 
; %bb.0: -; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: v_mad_u64_u32 v[2:3], null, 0xfffffc19, v1, v[0:1] -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-NEXT: v_sub_nc_u32_e32 v1, v3, v1 -; GFX11-NEXT: v_mov_b32_e32 v0, v2 -; GFX11-NEXT: s_setpc_b64 s[30:31] +; GFX1100-LABEL: lshr_mad_i64_1: +; GFX1100: ; %bb.0: +; GFX1100-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX1100-NEXT: v_dual_mov_b32 v4, v1 :: v_dual_mov_b32 v1, 0 +; GFX1100-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1100-NEXT: v_mad_u64_u32 v[2:3], null, 0xfffffc19, v4, v[0:1] +; GFX1100-NEXT: v_dual_mov_b32 v0, v2 :: v_dual_mov_b32 v1, v3 +; GFX1100-NEXT: s_setpc_b64 s[30:31] +; +; GFX1150-LABEL: lshr_mad_i64_1: +; GFX1150: ; %bb.0: +; GFX1150-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX1150-NEXT: v_dual_mov_b32 v2, v1 :: v_dual_mov_b32 v1, 0 +; GFX1150-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1150-NEXT: v_mad_u64_u32 v[0:1], null, 0xfffffc19, v2, v[0:1] +; GFX1150-NEXT: s_setpc_b64 s[30:31] ; ; GFX12-LABEL: lshr_mad_i64_1: ; GFX12: ; %bb.0: @@ -1379,10 +1387,9 @@ define i64 @lshr_mad_i64_1(i64 %arg0, i64 %arg1) #0 { ; GFX12-NEXT: s_wait_samplecnt 0x0 ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: v_mad_co_u64_u32 v[2:3], null, 0xfffffc19, v1, v[0:1] -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX12-NEXT: v_sub_nc_u32_e32 v1, v3, v1 -; GFX12-NEXT: v_mov_b32_e32 v0, v2 +; GFX12-NEXT: v_dual_mov_b32 v2, v1 :: v_dual_mov_b32 v1, 0 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-NEXT: v_mad_co_u64_u32 v[0:1], null, 0xfffffc19, v2, v[0:1] ; GFX12-NEXT: s_setpc_b64 s[30:31] %lsh = lshr i64 %arg0, 32 %mul = mul i64 %lsh, s0xfffffffffffffc19 @@ -1395,10 +1402,10 @@ define i64 @lshr_mad_i64_2(i64 %arg0) #0 { ; CI-LABEL: lshr_mad_i64_2: ; CI: ; %bb.0: ; CI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; 
CI-NEXT: v_mov_b32_e32 v2, v1 +; CI-NEXT: v_mov_b32_e32 v1, 0 ; CI-NEXT: s_movk_i32 s4, 0xd1 -; CI-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v1, s4, v[0:1] -; CI-NEXT: v_sub_i32_e32 v1, vcc, v3, v1 -; CI-NEXT: v_mov_b32_e32 v0, v2 +; CI-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v2, s4, v[0:1] ; CI-NEXT: s_setpc_b64 s[30:31] ; ; SI-LABEL: lshr_mad_i64_2: @@ -1415,20 +1422,28 @@ define i64 @lshr_mad_i64_2(i64 %arg0) #0 { ; GFX9-LABEL: lshr_mad_i64_2: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_mov_b32_e32 v2, v1 +; GFX9-NEXT: v_mov_b32_e32 v1, 0 ; GFX9-NEXT: s_movk_i32 s4, 0xd1 -; GFX9-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v1, s4, v[0:1] -; GFX9-NEXT: v_sub_u32_e32 v1, v3, v1 -; GFX9-NEXT: v_mov_b32_e32 v0, v2 +; GFX9-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v2, s4, v[0:1] ; GFX9-NEXT: s_setpc_b64 s[30:31] ; -; GFX11-LABEL: lshr_mad_i64_2: -; GFX11: ; %bb.0: -; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: v_mad_u64_u32 v[2:3], null, 0xd1, v1, v[0:1] -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-NEXT: v_sub_nc_u32_e32 v1, v3, v1 -; GFX11-NEXT: v_mov_b32_e32 v0, v2 -; GFX11-NEXT: s_setpc_b64 s[30:31] +; GFX1100-LABEL: lshr_mad_i64_2: +; GFX1100: ; %bb.0: +; GFX1100-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX1100-NEXT: v_dual_mov_b32 v4, v1 :: v_dual_mov_b32 v1, 0 +; GFX1100-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1100-NEXT: v_mad_u64_u32 v[2:3], null, 0xd1, v4, v[0:1] +; GFX1100-NEXT: v_dual_mov_b32 v0, v2 :: v_dual_mov_b32 v1, v3 +; GFX1100-NEXT: s_setpc_b64 s[30:31] +; +; GFX1150-LABEL: lshr_mad_i64_2: +; GFX1150: ; %bb.0: +; GFX1150-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX1150-NEXT: v_dual_mov_b32 v2, v1 :: v_dual_mov_b32 v1, 0 +; GFX1150-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1150-NEXT: v_mad_u64_u32 v[0:1], null, 0xd1, v2, v[0:1] +; GFX1150-NEXT: s_setpc_b64 s[30:31] ; ; GFX12-LABEL: lshr_mad_i64_2: ; 
GFX12: ; %bb.0: @@ -1437,10 +1452,9 @@ define i64 @lshr_mad_i64_2(i64 %arg0) #0 { ; GFX12-NEXT: s_wait_samplecnt 0x0 ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: v_mad_co_u64_u32 v[2:3], null, 0xd1, v1, v[0:1] -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX12-NEXT: v_sub_nc_u32_e32 v1, v3, v1 -; GFX12-NEXT: v_mov_b32_e32 v0, v2 +; GFX12-NEXT: v_dual_mov_b32 v2, v1 :: v_dual_mov_b32 v1, 0 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-NEXT: v_mad_co_u64_u32 v[0:1], null, 0xd1, v2, v[0:1] ; GFX12-NEXT: s_setpc_b64 s[30:31] %lsh = lshr i64 %arg0, 32 %mul = mul i64 %lsh, s0xffffffff000000d1 @@ -1453,10 +1467,10 @@ define i64 @lshr_mad_i64_3(i64 %arg0) #0 { ; CI-LABEL: lshr_mad_i64_3: ; CI: ; %bb.0: ; CI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; CI-NEXT: v_mov_b32_e32 v2, v1 +; CI-NEXT: v_mov_b32_e32 v1, 0 ; CI-NEXT: s_movk_i32 s4, 0xfc88 -; CI-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v1, s4, v[0:1] -; CI-NEXT: v_sub_i32_e32 v1, vcc, v3, v1 -; CI-NEXT: v_mov_b32_e32 v0, v2 +; CI-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v2, s4, v[0:1] ; CI-NEXT: s_setpc_b64 s[30:31] ; ; SI-LABEL: lshr_mad_i64_3: @@ -1473,20 +1487,28 @@ define i64 @lshr_mad_i64_3(i64 %arg0) #0 { ; GFX9-LABEL: lshr_mad_i64_3: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_mov_b32_e32 v2, v1 +; GFX9-NEXT: v_mov_b32_e32 v1, 0 ; GFX9-NEXT: s_movk_i32 s4, 0xfc88 -; GFX9-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v1, s4, v[0:1] -; GFX9-NEXT: v_sub_u32_e32 v1, v3, v1 -; GFX9-NEXT: v_mov_b32_e32 v0, v2 +; GFX9-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v2, s4, v[0:1] ; GFX9-NEXT: s_setpc_b64 s[30:31] ; -; GFX11-LABEL: lshr_mad_i64_3: -; GFX11: ; %bb.0: -; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: v_mad_u64_u32 v[2:3], null, 0xfffffc88, v1, v[0:1] -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-NEXT: v_sub_nc_u32_e32 v1, v3, v1 -; 
GFX11-NEXT: v_mov_b32_e32 v0, v2 -; GFX11-NEXT: s_setpc_b64 s[30:31] +; GFX1100-LABEL: lshr_mad_i64_3: +; GFX1100: ; %bb.0: +; GFX1100-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX1100-NEXT: v_dual_mov_b32 v4, v1 :: v_dual_mov_b32 v1, 0 +; GFX1100-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1100-NEXT: v_mad_u64_u32 v[2:3], null, 0xfffffc88, v4, v[0:1] +; GFX1100-NEXT: v_dual_mov_b32 v0, v2 :: v_dual_mov_b32 v1, v3 +; GFX1100-NEXT: s_setpc_b64 s[30:31] +; +; GFX1150-LABEL: lshr_mad_i64_3: +; GFX1150: ; %bb.0: +; GFX1150-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX1150-NEXT: v_dual_mov_b32 v2, v1 :: v_dual_mov_b32 v1, 0 +; GFX1150-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1150-NEXT: v_mad_u64_u32 v[0:1], null, 0xfffffc88, v2, v[0:1] +; GFX1150-NEXT: s_setpc_b64 s[30:31] ; ; GFX12-LABEL: lshr_mad_i64_3: ; GFX12: ; %bb.0: @@ -1495,10 +1517,9 @@ define i64 @lshr_mad_i64_3(i64 %arg0) #0 { ; GFX12-NEXT: s_wait_samplecnt 0x0 ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: v_mad_co_u64_u32 v[2:3], null, 0xfffffc88, v1, v[0:1] -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX12-NEXT: v_sub_nc_u32_e32 v1, v3, v1 -; GFX12-NEXT: v_mov_b32_e32 v0, v2 +; GFX12-NEXT: v_dual_mov_b32 v2, v1 :: v_dual_mov_b32 v1, 0 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-NEXT: v_mad_co_u64_u32 v[0:1], null, 0xfffffc88, v2, v[0:1] ; GFX12-NEXT: s_setpc_b64 s[30:31] %lsh = lshr i64 %arg0, 32 %mul = mul i64 s0xfffffffffffffc88, %lsh @@ -1511,12 +1532,12 @@ define i64 @lshr_mad_i64_4(i32 %arg0, i64 %arg1) #0 { ; CI-LABEL: lshr_mad_i64_4: ; CI: ; %bb.0: ; CI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; CI-NEXT: v_mul_lo_u32 v3, v2, v0 -; CI-NEXT: v_mad_u64_u32 v[1:2], s[4:5], v1, v0, 0 +; CI-NEXT: v_mul_lo_u32 v2, v2, v0 +; CI-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v1, v0, 0 ; CI-NEXT: s_movk_i32 s4, 0xfc88 -; CI-NEXT: v_add_i32_e32 v2, vcc, v2, v3 -; CI-NEXT: 
v_mad_u64_u32 v[0:1], s[4:5], v2, s4, v[1:2] -; CI-NEXT: v_sub_i32_e32 v1, vcc, v1, v2 +; CI-NEXT: v_add_i32_e32 v2, vcc, v1, v2 +; CI-NEXT: v_mov_b32_e32 v1, 0 +; CI-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v2, s4, v[0:1] ; CI-NEXT: s_setpc_b64 s[30:31] ; ; SI-LABEL: lshr_mad_i64_4: @@ -1539,26 +1560,33 @@ define i64 @lshr_mad_i64_4(i32 %arg0, i64 %arg1) #0 { ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX9-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v1, v0, 0 ; GFX9-NEXT: v_mov_b32_e32 v6, v5 -; GFX9-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v2, v0, v[6:7] -; GFX9-NEXT: v_mov_b32_e32 v5, v2 +; GFX9-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v2, v0, v[6:7] +; GFX9-NEXT: v_mov_b32_e32 v5, 0 ; GFX9-NEXT: s_movk_i32 s4, 0xfc88 -; GFX9-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v2, s4, v[4:5] -; GFX9-NEXT: v_sub_u32_e32 v1, v1, v2 +; GFX9-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v0, s4, v[4:5] ; GFX9-NEXT: s_setpc_b64 s[30:31] ; -; GFX11-LABEL: lshr_mad_i64_4: -; GFX11: ; %bb.0: -; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: v_mad_u64_u32 v[3:4], null, v1, v0, 0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_mov_b32_e32 v1, v4 -; GFX11-NEXT: v_mad_u64_u32 v[5:6], null, v2, v0, v[1:2] -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_mov_b32_e32 v4, v5 -; GFX11-NEXT: v_mad_u64_u32 v[0:1], null, 0xfffffc88, v5, v[3:4] -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_sub_nc_u32_e32 v1, v1, v5 -; GFX11-NEXT: s_setpc_b64 s[30:31] +; GFX1100-LABEL: lshr_mad_i64_4: +; GFX1100: ; %bb.0: +; GFX1100-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX1100-NEXT: v_mad_u64_u32 v[3:4], null, v1, v0, 0 +; GFX1100-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1100-NEXT: v_dual_mov_b32 v1, v4 :: v_dual_mov_b32 v4, 0 +; GFX1100-NEXT: v_mad_u64_u32 v[5:6], null, v2, v0, v[1:2] +; GFX1100-NEXT: s_delay_alu instid0(VALU_DEP_1) +; 
GFX1100-NEXT: v_mad_u64_u32 v[0:1], null, 0xfffffc88, v5, v[3:4] +; GFX1100-NEXT: s_setpc_b64 s[30:31] +; +; GFX1150-LABEL: lshr_mad_i64_4: +; GFX1150: ; %bb.0: +; GFX1150-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX1150-NEXT: v_mad_u64_u32 v[3:4], null, v1, v0, 0 +; GFX1150-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1150-NEXT: v_dual_mov_b32 v1, v4 :: v_dual_mov_b32 v4, 0 +; GFX1150-NEXT: v_mad_u64_u32 v[0:1], null, v2, v0, v[1:2] +; GFX1150-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1150-NEXT: v_mad_u64_u32 v[0:1], null, 0xfffffc88, v0, v[3:4] +; GFX1150-NEXT: s_setpc_b64 s[30:31] ; ; GFX12-LABEL: lshr_mad_i64_4: ; GFX12: ; %bb.0: @@ -1569,13 +1597,10 @@ define i64 @lshr_mad_i64_4(i32 %arg0, i64 %arg1) #0 { ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: v_mad_co_u64_u32 v[3:4], null, v1, v0, 0 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_mov_b32_e32 v1, v4 -; GFX12-NEXT: v_mad_co_u64_u32 v[5:6], null, v2, v0, v[1:2] -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_mov_b32_e32 v4, v5 -; GFX12-NEXT: v_mad_co_u64_u32 v[0:1], null, 0xfffffc88, v5, v[3:4] +; GFX12-NEXT: v_dual_mov_b32 v1, v4 :: v_dual_mov_b32 v4, 0 +; GFX12-NEXT: v_mad_co_u64_u32 v[0:1], null, v2, v0, v[1:2] ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX12-NEXT: v_sub_nc_u32_e32 v1, v1, v5 +; GFX12-NEXT: v_mad_co_u64_u32 v[0:1], null, 0xfffffc88, v0, v[3:4] ; GFX12-NEXT: s_setpc_b64 s[30:31] %ext = zext i32 %arg0 to i64 %mul1 = mul i64 %arg1, %ext @@ -1862,10 +1887,9 @@ define amdgpu_ps i64 @lshr_mad_i64_sgpr(i64 inreg %arg0) #0 { ; CI-LABEL: lshr_mad_i64_sgpr: ; CI: ; %bb.0: ; CI-NEXT: v_mov_b32_e32 v0, s0 +; CI-NEXT: v_mov_b32_e32 v1, 0 ; CI-NEXT: v_mov_b32_e32 v2, 0xffff1c18 -; CI-NEXT: v_mov_b32_e32 v1, s1 -; CI-NEXT: v_mad_u64_u32 v[0:1], s[2:3], s1, v2, v[0:1] -; CI-NEXT: v_subrev_i32_e32 v1, vcc, s1, v1 +; CI-NEXT: v_mad_u64_u32 
v[0:1], s[0:1], s1, v2, v[0:1] ; CI-NEXT: v_readfirstlane_b32 s0, v0 ; CI-NEXT: v_readfirstlane_b32 s1, v1 ; CI-NEXT: ; return to shader part epilog @@ -1920,14 +1944,16 @@ define <2 x i64> @lshr_mad_i64_vec(<2 x i64> %arg0) #0 { ; CI-LABEL: lshr_mad_i64_vec: ; CI: ; %bb.0: ; CI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; CI-NEXT: v_mov_b32_e32 v6, v3 +; CI-NEXT: v_mov_b32_e32 v3, v1 +; CI-NEXT: v_mov_b32_e32 v1, 0 ; CI-NEXT: s_mov_b32 s4, 0xffff1c18 -; CI-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v1, s4, v[0:1] +; CI-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v3, s4, v[0:1] +; CI-NEXT: v_mov_b32_e32 v3, v1 ; CI-NEXT: s_mov_b32 s4, 0xffff1118 -; CI-NEXT: v_mad_u64_u32 v[6:7], s[4:5], v3, s4, v[2:3] -; CI-NEXT: v_sub_i32_e32 v1, vcc, v5, v1 -; CI-NEXT: v_sub_i32_e32 v3, vcc, v7, v3 +; CI-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v6, s4, v[2:3] ; CI-NEXT: v_mov_b32_e32 v0, v4 -; CI-NEXT: v_mov_b32_e32 v2, v6 +; CI-NEXT: v_mov_b32_e32 v1, v5 ; CI-NEXT: s_setpc_b64 s[30:31] ; ; SI-LABEL: lshr_mad_i64_vec: @@ -1950,28 +1976,44 @@ define <2 x i64> @lshr_mad_i64_vec(<2 x i64> %arg0) #0 { ; GFX9-LABEL: lshr_mad_i64_vec: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_mov_b32_e32 v6, v3 +; GFX9-NEXT: v_mov_b32_e32 v3, v1 +; GFX9-NEXT: v_mov_b32_e32 v1, 0 ; GFX9-NEXT: s_mov_b32 s4, 0xffff1c18 -; GFX9-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v1, s4, v[0:1] +; GFX9-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v3, s4, v[0:1] +; GFX9-NEXT: v_mov_b32_e32 v3, v1 ; GFX9-NEXT: s_mov_b32 s4, 0xffff1118 -; GFX9-NEXT: v_mad_u64_u32 v[6:7], s[4:5], v3, s4, v[2:3] -; GFX9-NEXT: v_sub_u32_e32 v1, v5, v1 -; GFX9-NEXT: v_sub_u32_e32 v3, v7, v3 +; GFX9-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v6, s4, v[2:3] ; GFX9-NEXT: v_mov_b32_e32 v0, v4 -; GFX9-NEXT: v_mov_b32_e32 v2, v6 +; GFX9-NEXT: v_mov_b32_e32 v1, v5 ; GFX9-NEXT: s_setpc_b64 s[30:31] ; -; GFX11-LABEL: lshr_mad_i64_vec: -; GFX11: ; %bb.0: -; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: v_mad_u64_u32 v[4:5], 
null, 0xffff1c18, v1, v[0:1] -; GFX11-NEXT: v_mad_u64_u32 v[6:7], null, 0xffff1118, v3, v[2:3] -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_3) -; GFX11-NEXT: v_sub_nc_u32_e32 v1, v5, v1 -; GFX11-NEXT: v_mov_b32_e32 v0, v4 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_4) -; GFX11-NEXT: v_sub_nc_u32_e32 v3, v7, v3 -; GFX11-NEXT: v_mov_b32_e32 v2, v6 -; GFX11-NEXT: s_setpc_b64 s[30:31] +; GFX1100-LABEL: lshr_mad_i64_vec: +; GFX1100: ; %bb.0: +; GFX1100-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX1100-NEXT: v_mov_b32_e32 v8, v3 +; GFX1100-NEXT: v_dual_mov_b32 v6, v1 :: v_dual_mov_b32 v1, 0 +; GFX1100-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1100-NEXT: v_mad_u64_u32 v[4:5], null, 0xffff1c18, v6, v[0:1] +; GFX1100-NEXT: v_dual_mov_b32 v3, v1 :: v_dual_mov_b32 v0, v4 +; GFX1100-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1100-NEXT: v_mad_u64_u32 v[6:7], null, 0xffff1118, v8, v[2:3] +; GFX1100-NEXT: v_dual_mov_b32 v1, v5 :: v_dual_mov_b32 v2, v6 +; GFX1100-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX1100-NEXT: v_mov_b32_e32 v3, v7 +; GFX1100-NEXT: s_setpc_b64 s[30:31] +; +; GFX1150-LABEL: lshr_mad_i64_vec: +; GFX1150: ; %bb.0: +; GFX1150-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX1150-NEXT: v_dual_mov_b32 v4, v3 :: v_dual_mov_b32 v5, v1 +; GFX1150-NEXT: v_mov_b32_e32 v1, 0 +; GFX1150-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX1150-NEXT: v_mov_b32_e32 v3, v1 +; GFX1150-NEXT: v_mad_u64_u32 v[0:1], null, 0xffff1c18, v5, v[0:1] +; GFX1150-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX1150-NEXT: v_mad_u64_u32 v[2:3], null, 0xffff1118, v4, v[2:3] +; GFX1150-NEXT: s_setpc_b64 s[30:31] ; ; GFX12-LABEL: lshr_mad_i64_vec: ; GFX12: ; %bb.0: @@ -1980,14 +2022,13 @@ define <2 x i64> @lshr_mad_i64_vec(<2 x i64> %arg0) #0 { ; GFX12-NEXT: s_wait_samplecnt 0x0 ; GFX12-NEXT: 
s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: v_mad_co_u64_u32 v[4:5], null, 0xffff1c18, v1, v[0:1] -; GFX12-NEXT: v_mad_co_u64_u32 v[6:7], null, 0xffff1118, v3, v[2:3] -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_3) -; GFX12-NEXT: v_sub_nc_u32_e32 v1, v5, v1 -; GFX12-NEXT: v_mov_b32_e32 v0, v4 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_4) -; GFX12-NEXT: v_sub_nc_u32_e32 v3, v7, v3 -; GFX12-NEXT: v_mov_b32_e32 v2, v6 +; GFX12-NEXT: v_dual_mov_b32 v4, v3 :: v_dual_mov_b32 v5, v1 +; GFX12-NEXT: v_mov_b32_e32 v1, 0 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX12-NEXT: v_mov_b32_e32 v3, v1 +; GFX12-NEXT: v_mad_co_u64_u32 v[0:1], null, 0xffff1c18, v5, v[0:1] +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX12-NEXT: v_mad_co_u64_u32 v[2:3], null, 0xffff1118, v4, v[2:3] ; GFX12-NEXT: s_setpc_b64 s[30:31] %lsh = lshr <2 x i64> %arg0, %mul = mul <2 x i64> %lsh, diff --git a/llvm/test/CodeGen/AMDGPU/packed-fp32.ll b/llvm/test/CodeGen/AMDGPU/packed-fp32.ll index 6b7eff316fe95..0833dada43e4d 100644 --- a/llvm/test/CodeGen/AMDGPU/packed-fp32.ll +++ b/llvm/test/CodeGen/AMDGPU/packed-fp32.ll @@ -549,17 +549,19 @@ bb: ret void } -; GCN-LABEL: {{^}}fadd_fadd_fsub: +; GCN-LABEL: {{^}}fadd_fadd_fsub_0: ; GFX900: v_add_f32_e64 v{{[0-9]+}}, s{{[0-9]+}}, 0 ; GFX900: v_add_f32_e32 v{{[0-9]+}}, 0, v{{[0-9]+}} -; PACKED-SDAG: v_pk_add_f32 v[{{[0-9:]+}}], s[{{[0-9:]+}}], 0 op_sel_hi:[1,0]{{$}} -; PACKED-SDAG: v_pk_add_f32 v[{{[0-9:]+}}], v[{{[0-9:]+}}], 0 op_sel_hi:[1,0]{{$}} + +; PACKED-SDAG: v_add_f32_e64 v{{[0-9]+}}, s{{[0-9]+}}, 0 +; PACKED-SDAG: v_add_f32_e32 v{{[0-9]+}}, 0, v{{[0-9]+}} + ; PACKED-GISEL: v_pk_add_f32 v[{{[0-9:]+}}], s[{{[0-9:]+}}], v[{{[0-9:]+}}]{{$}} ; PACKED-GISEL: v_pk_add_f32 v[{{[0-9:]+}}], v[{{[0-9:]+}}], s[{{[0-9:]+}}]{{$}} -define amdgpu_kernel void @fadd_fadd_fsub(<2 x float> %arg) { +define amdgpu_kernel void 
@fadd_fadd_fsub_0(<2 x float> %arg) { bb: %i12 = fadd <2 x float> zeroinitializer, %arg - %shift8 = shufflevector <2 x float> %i12, <2 x float> undef, <2 x i32> + %shift8 = shufflevector <2 x float> %i12, <2 x float> poison, <2 x i32> %i13 = fadd <2 x float> zeroinitializer, %shift8 %i14 = shufflevector <2 x float> %arg, <2 x float> %i13, <2 x i32> %i15 = fsub <2 x float> %i14, zeroinitializer @@ -567,6 +569,26 @@ bb: ret void } +; GCN-LABEL: {{^}}fadd_fadd_fsub: +; GFX900: v_add_f32_e32 v{{[0-9]+}}, s{{[0-9]+}}, v{{[0-9]+}} +; GFX900: v_add_f32_e32 v{{[0-9]+}}, s{{[0-9]+}}, v{{[0-9]+}} + +; PACKED-SDAG: v_add_f32_e32 v{{[0-9]+}}, s{{[0-9]+}}, v{{[0-9]+}} +; PACKED-SDAG: v_pk_add_f32 v[{{[0-9:]+}}], s[{{[0-9:]+}}], v[{{[0-9:]+}}] op_sel_hi:[1,0]{{$}} + +; PACKED-GISEL: v_pk_add_f32 v[{{[0-9:]+}}], s[{{[0-9:]+}}], v[{{[0-9:]+}}]{{$}} +; PACKED-GISEL: v_pk_add_f32 v[{{[0-9:]+}}], s[{{[0-9:]+}}], v[{{[0-9:]+}}]{{$}} +define amdgpu_kernel void @fadd_fadd_fsub(<2 x float> %arg, <2 x float> %arg1, ptr addrspace(1) %ptr) { +bb: + %i12 = fadd <2 x float> %arg, %arg1 + %shift8 = shufflevector <2 x float> %i12, <2 x float> poison, <2 x i32> + %i13 = fadd <2 x float> %arg1, %shift8 + %i14 = shufflevector <2 x float> %arg, <2 x float> %i13, <2 x i32> + %i15 = fsub <2 x float> %i14, %arg1 + store <2 x float> %i15, ptr addrspace(1) %ptr + ret void +} + ; GCN-LABEL: {{^}}fadd_shuffle_v4: ; GFX900-COUNT-4: v_add_f32_e32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} ; PACKED-SDAG-COUNT-2: v_pk_add_f32 v[{{[0-9:]+}}], v[{{[0-9:]+}}], v[{{[0-9:]+}}] op_sel_hi:[1,0]{{$}} diff --git a/llvm/test/CodeGen/AMDGPU/trunc-combine.ll b/llvm/test/CodeGen/AMDGPU/trunc-combine.ll index aa3e05fdbdb36..15d172eb17688 100644 --- a/llvm/test/CodeGen/AMDGPU/trunc-combine.ll +++ b/llvm/test/CodeGen/AMDGPU/trunc-combine.ll @@ -156,3 +156,335 @@ define <2 x i16> @trunc_v2i64_arg_to_v2i16(<2 x i64> %arg0) #0 { %trunc = trunc <2 x i64> %arg0 to <2 x i16> ret <2 x i16> %trunc } + +; Test for regression where an 
unnecessary v_alignbit_b32 was inserted +; on the final result, due to losing the fact that the upper half of +; the lhs vector was undef. +define <2 x i16> @vector_trunc_high_bits_undef_lshr_lhs_alignbit_regression(i32 %arg0) { +; SI-LABEL: vector_trunc_high_bits_undef_lshr_lhs_alignbit_regression: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v0 +; SI-NEXT: v_mov_b32_e32 v1, 0 +; SI-NEXT: s_setpc_b64 s[30:31] +; +; VI-LABEL: vector_trunc_high_bits_undef_lshr_lhs_alignbit_regression: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: v_lshrrev_b32_e32 v0, 16, v0 +; VI-NEXT: s_setpc_b64 s[30:31] + %undef.hi.elt = insertelement <2 x i32> poison, i32 %arg0, i32 0 + %lshr = lshr <2 x i32> %undef.hi.elt, splat (i32 16) + %trunc = trunc <2 x i32> %lshr to <2 x i16> + ret <2 x i16> %trunc +} + +define <2 x i16> @vector_trunc_high_bits_undef_lshr_rhs_alignbit_regression(i32 %arg0) { +; SI-LABEL: vector_trunc_high_bits_undef_lshr_rhs_alignbit_regression: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_lshr_b32_e32 v0, 16, v0 +; SI-NEXT: v_mov_b32_e32 v1, 0 +; SI-NEXT: s_setpc_b64 s[30:31] +; +; VI-LABEL: vector_trunc_high_bits_undef_lshr_rhs_alignbit_regression: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: v_lshrrev_b32_e64 v0, v0, 16 +; VI-NEXT: s_setpc_b64 s[30:31] + %undef.hi.elt = insertelement <2 x i32> poison, i32 %arg0, i32 0 + %lshr = lshr <2 x i32> splat (i32 16), %undef.hi.elt + %trunc = trunc <2 x i32> %lshr to <2 x i16> + ret <2 x i16> %trunc +} + +define <2 x i16> @vector_trunc_high_bits_undef_ashr_lhs_alignbit_regression(i32 %arg0) { +; SI-LABEL: vector_trunc_high_bits_undef_ashr_lhs_alignbit_regression: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v0 +; SI-NEXT: v_mov_b32_e32 v1, 0 +; SI-NEXT: s_setpc_b64 s[30:31] +; +; VI-LABEL: 
vector_trunc_high_bits_undef_ashr_lhs_alignbit_regression: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: v_lshrrev_b32_e32 v0, 16, v0 +; VI-NEXT: s_setpc_b64 s[30:31] + %undef.hi.elt = insertelement <2 x i32> poison, i32 %arg0, i32 0 + %ashr = ashr <2 x i32> %undef.hi.elt, splat (i32 16) + %trunc = trunc <2 x i32> %ashr to <2 x i16> + ret <2 x i16> %trunc +} + +define <2 x i16> @vector_trunc_high_bits_undef_ashr_rhs_alignbit_regression(i32 %arg0) { +; SI-LABEL: vector_trunc_high_bits_undef_ashr_rhs_alignbit_regression: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_ashr_i32_e32 v0, -4, v0 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_mov_b32_e32 v1, 0 +; SI-NEXT: s_setpc_b64 s[30:31] +; +; VI-LABEL: vector_trunc_high_bits_undef_ashr_rhs_alignbit_regression: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: v_ashrrev_i32_e64 v0, v0, -4 +; VI-NEXT: s_setpc_b64 s[30:31] + %undef.hi.elt = insertelement <2 x i32> poison, i32 %arg0, i32 0 + %lshr = ashr <2 x i32> splat (i32 -4), %undef.hi.elt + %trunc = trunc <2 x i32> %lshr to <2 x i16> + ret <2 x i16> %trunc +} + +define <2 x i16> @vector_trunc_high_bits_undef_add_lhs_alignbit_regression(i32 %arg0) { +; SI-LABEL: vector_trunc_high_bits_undef_add_lhs_alignbit_regression: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_add_i32_e32 v0, vcc, 16, v0 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_mov_b32_e32 v1, 0 +; SI-NEXT: s_setpc_b64 s[30:31] +; +; VI-LABEL: vector_trunc_high_bits_undef_add_lhs_alignbit_regression: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: v_add_u32_e32 v0, vcc, 16, v0 +; VI-NEXT: s_setpc_b64 s[30:31] + %undef.hi.elt = insertelement <2 x i32> poison, i32 %arg0, i32 0 + %lshr = add <2 x i32> %undef.hi.elt, splat (i32 16) + %trunc = trunc <2 x i32> %lshr to <2 x i16> + ret <2 x i16> %trunc +} + +define <2 x i16> 
@vector_trunc_high_bits_undef_shl_rhs_alignbit_regression(i32 %arg0) { +; SI-LABEL: vector_trunc_high_bits_undef_shl_rhs_alignbit_regression: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_lshl_b32_e32 v0, 2, v0 +; SI-NEXT: v_and_b32_e32 v0, 0xfffe, v0 +; SI-NEXT: v_mov_b32_e32 v1, 0 +; SI-NEXT: s_setpc_b64 s[30:31] +; +; VI-LABEL: vector_trunc_high_bits_undef_shl_rhs_alignbit_regression: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: v_lshlrev_b32_e64 v0, v0, 2 +; VI-NEXT: s_setpc_b64 s[30:31] + %undef.hi.elt = insertelement <2 x i32> poison, i32 %arg0, i32 0 + %lshr = shl <2 x i32> splat (i32 2), %undef.hi.elt + %trunc = trunc <2 x i32> %lshr to <2 x i16> + ret <2 x i16> %trunc +} + +define <2 x i16> @vector_trunc_high_bits_undef_sub_lhs_alignbit_regression(i32 %arg0) { +; SI-LABEL: vector_trunc_high_bits_undef_sub_lhs_alignbit_regression: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_add_i32_e32 v0, vcc, -16, v0 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_mov_b32_e32 v1, 0 +; SI-NEXT: s_setpc_b64 s[30:31] +; +; VI-LABEL: vector_trunc_high_bits_undef_sub_lhs_alignbit_regression: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: v_add_u32_e32 v0, vcc, -16, v0 +; VI-NEXT: s_setpc_b64 s[30:31] + %undef.hi.elt = insertelement <2 x i32> poison, i32 %arg0, i32 0 + %lshr = sub <2 x i32> %undef.hi.elt, splat (i32 16) + %trunc = trunc <2 x i32> %lshr to <2 x i16> + ret <2 x i16> %trunc +} + +define <2 x i16> @vector_trunc_high_bits_undef_or_lhs_alignbit_regression(i32 %arg0) { +; SI-LABEL: vector_trunc_high_bits_undef_or_lhs_alignbit_regression: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_or_b32_e32 v0, 0xffff0011, v0 +; SI-NEXT: v_mov_b32_e32 v1, 0xffff +; SI-NEXT: s_setpc_b64 s[30:31] +; +; VI-LABEL: vector_trunc_high_bits_undef_or_lhs_alignbit_regression: +; VI: ; %bb.0: +; VI-NEXT: 
s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: v_or_b32_e32 v0, 0xffff0011, v0 +; VI-NEXT: s_setpc_b64 s[30:31] + %undef.hi.elt = insertelement <2 x i32> poison, i32 %arg0, i32 0 + %lshr = or <2 x i32> %undef.hi.elt, splat (i32 17) + %trunc = trunc <2 x i32> %lshr to <2 x i16> + ret <2 x i16> %trunc +} + +define <2 x i16> @vector_trunc_high_bits_undef_xor_lhs_alignbit_regression(i32 %arg0) { +; SI-LABEL: vector_trunc_high_bits_undef_xor_lhs_alignbit_regression: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_xor_b32_e32 v0, 17, v0 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_mov_b32_e32 v1, 0 +; SI-NEXT: s_setpc_b64 s[30:31] +; +; VI-LABEL: vector_trunc_high_bits_undef_xor_lhs_alignbit_regression: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: v_xor_b32_e32 v0, 17, v0 +; VI-NEXT: s_setpc_b64 s[30:31] + %undef.hi.elt = insertelement <2 x i32> poison, i32 %arg0, i32 0 + %lshr = xor <2 x i32> %undef.hi.elt, splat (i32 17) + %trunc = trunc <2 x i32> %lshr to <2 x i16> + ret <2 x i16> %trunc +} + +define <2 x i16> @vector_trunc_high_bits_undef_shl_lhs_alignbit_regression(i32 %arg0) { +; SI-LABEL: vector_trunc_high_bits_undef_shl_lhs_alignbit_regression: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; SI-NEXT: v_and_b32_e32 v0, 0xfffc, v0 +; SI-NEXT: v_mov_b32_e32 v1, 0 +; SI-NEXT: s_setpc_b64 s[30:31] +; +; VI-LABEL: vector_trunc_high_bits_undef_shl_lhs_alignbit_regression: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: v_lshlrev_b16_e32 v0, 2, v0 +; VI-NEXT: s_setpc_b64 s[30:31] + %undef.hi.elt = insertelement <2 x i32> poison, i32 %arg0, i32 0 + %shl = shl <2 x i32> %undef.hi.elt, splat (i32 2) + %trunc = trunc <2 x i32> %shl to <2 x i16> + ret <2 x i16> %trunc +} + +define <2 x i16> @vector_trunc_high_bits_undef_mul_lhs_alignbit_regression(i32 %arg0) { +; SI-LABEL: 
vector_trunc_high_bits_undef_mul_lhs_alignbit_regression: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_mul_lo_u32 v0, v0, 18 +; SI-NEXT: v_mov_b32_e32 v1, 0 +; SI-NEXT: v_and_b32_e32 v0, 0xfffe, v0 +; SI-NEXT: s_setpc_b64 s[30:31] +; +; VI-LABEL: vector_trunc_high_bits_undef_mul_lhs_alignbit_regression: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: v_mul_lo_u32 v0, v0, 18 +; VI-NEXT: v_and_b32_e32 v0, 0xfffe, v0 +; VI-NEXT: s_setpc_b64 s[30:31] + %undef.hi.elt = insertelement <2 x i32> poison, i32 %arg0, i32 0 + %lshr = mul <2 x i32> %undef.hi.elt, splat (i32 18) + %trunc = trunc <2 x i32> %lshr to <2 x i16> + ret <2 x i16> %trunc +} + +define <2 x i16> @vector_trunc_high_bits_undef_sdiv_lhs_alignbit_regression(i32 %arg0) { +; SI-LABEL: vector_trunc_high_bits_undef_sdiv_lhs_alignbit_regression: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: s_mov_b32 s4, 0x38e38e39 +; SI-NEXT: v_mul_hi_i32 v0, v0, s4 +; SI-NEXT: v_lshrrev_b32_e32 v1, 31, v0 +; SI-NEXT: v_lshrrev_b32_e32 v0, 2, v0 +; SI-NEXT: v_add_i32_e32 v0, vcc, v0, v1 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_mov_b32_e32 v1, 0 +; SI-NEXT: s_setpc_b64 s[30:31] +; +; VI-LABEL: vector_trunc_high_bits_undef_sdiv_lhs_alignbit_regression: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: s_mov_b32 s4, 0x38e38e39 +; VI-NEXT: v_mul_hi_i32 v0, v0, s4 +; VI-NEXT: v_lshrrev_b32_e32 v1, 31, v0 +; VI-NEXT: v_ashrrev_i32_e32 v0, 2, v0 +; VI-NEXT: v_add_u32_e32 v0, vcc, v0, v1 +; VI-NEXT: s_setpc_b64 s[30:31] + %undef.hi.elt = insertelement <2 x i32> poison, i32 %arg0, i32 0 + %lshr = sdiv <2 x i32> %undef.hi.elt, splat (i32 18) + %trunc = trunc <2 x i32> %lshr to <2 x i16> + ret <2 x i16> %trunc +} + +define <2 x i16> @vector_trunc_high_bits_undef_srem_lhs_alignbit_regression(i32 %arg0) { +; SI-LABEL: vector_trunc_high_bits_undef_srem_lhs_alignbit_regression: +; SI: ; %bb.0: +; 
SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: s_mov_b32 s4, 0x38e38e39 +; SI-NEXT: v_mul_hi_i32 v1, v0, s4 +; SI-NEXT: v_lshrrev_b32_e32 v2, 31, v1 +; SI-NEXT: v_lshrrev_b32_e32 v1, 2, v1 +; SI-NEXT: v_add_i32_e32 v1, vcc, v1, v2 +; SI-NEXT: v_mul_lo_u32 v1, v1, 18 +; SI-NEXT: v_sub_i32_e32 v0, vcc, v0, v1 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_mov_b32_e32 v1, 0 +; SI-NEXT: s_setpc_b64 s[30:31] +; +; VI-LABEL: vector_trunc_high_bits_undef_srem_lhs_alignbit_regression: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: s_mov_b32 s4, 0x38e38e39 +; VI-NEXT: v_mul_hi_i32 v1, v0, s4 +; VI-NEXT: v_lshrrev_b32_e32 v2, 31, v1 +; VI-NEXT: v_ashrrev_i32_e32 v1, 2, v1 +; VI-NEXT: v_add_u32_e32 v1, vcc, v1, v2 +; VI-NEXT: v_mul_lo_u32 v1, v1, 18 +; VI-NEXT: v_sub_u32_e32 v0, vcc, v0, v1 +; VI-NEXT: s_setpc_b64 s[30:31] + %undef.hi.elt = insertelement <2 x i32> poison, i32 %arg0, i32 0 + %lshr = srem <2 x i32> %undef.hi.elt, splat (i32 18) + %trunc = trunc <2 x i32> %lshr to <2 x i16> + ret <2 x i16> %trunc +} + + +define <2 x i16> @vector_trunc_high_bits_undef_udiv_lhs_alignbit_regression(i32 %arg0) { +; SI-LABEL: vector_trunc_high_bits_undef_udiv_lhs_alignbit_regression: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: s_mov_b32 s4, 0x38e38e39 +; SI-NEXT: v_mul_hi_u32 v0, v0, s4 +; SI-NEXT: v_mov_b32_e32 v1, 0 +; SI-NEXT: v_bfe_u32 v0, v0, 2, 16 +; SI-NEXT: s_setpc_b64 s[30:31] +; +; VI-LABEL: vector_trunc_high_bits_undef_udiv_lhs_alignbit_regression: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: s_mov_b32 s4, 0x38e38e39 +; VI-NEXT: v_mul_hi_u32 v0, v0, s4 +; VI-NEXT: v_lshrrev_b32_e32 v0, 2, v0 +; VI-NEXT: s_setpc_b64 s[30:31] + %undef.hi.elt = insertelement <2 x i32> poison, i32 %arg0, i32 0 + %lshr = udiv <2 x i32> %undef.hi.elt, splat (i32 18) + %trunc = trunc <2 x i32> %lshr to <2 x i16> + ret <2 x i16> %trunc +} + +define <2 x i16> 
@vector_trunc_high_bits_undef_urem_lhs_alignbit_regression(i32 %arg0) { +; SI-LABEL: vector_trunc_high_bits_undef_urem_lhs_alignbit_regression: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: s_mov_b32 s4, 0x38e38e39 +; SI-NEXT: v_mul_hi_u32 v1, v0, s4 +; SI-NEXT: v_lshrrev_b32_e32 v1, 2, v1 +; SI-NEXT: v_mul_lo_u32 v1, v1, 18 +; SI-NEXT: v_sub_i32_e32 v0, vcc, v0, v1 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_mov_b32_e32 v1, 0 +; SI-NEXT: s_setpc_b64 s[30:31] +; +; VI-LABEL: vector_trunc_high_bits_undef_urem_lhs_alignbit_regression: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: s_mov_b32 s4, 0x38e38e39 +; VI-NEXT: v_mul_hi_u32 v1, v0, s4 +; VI-NEXT: v_lshrrev_b32_e32 v1, 2, v1 +; VI-NEXT: v_mul_lo_u32 v1, v1, 18 +; VI-NEXT: v_sub_u32_e32 v0, vcc, v0, v1 +; VI-NEXT: s_setpc_b64 s[30:31] + %undef.hi.elt = insertelement <2 x i32> poison, i32 %arg0, i32 0 + %lshr = urem <2 x i32> %undef.hi.elt, splat (i32 18) + %trunc = trunc <2 x i32> %lshr to <2 x i16> + ret <2 x i16> %trunc +} diff --git a/llvm/test/CodeGen/AMDGPU/v_cndmask.ll b/llvm/test/CodeGen/AMDGPU/v_cndmask.ll index f20c1ccb2d63e..c6cc479b5deb1 100644 --- a/llvm/test/CodeGen/AMDGPU/v_cndmask.ll +++ b/llvm/test/CodeGen/AMDGPU/v_cndmask.ll @@ -3,6 +3,7 @@ ; RUN: llc -mtriple=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=VI %s ; RUN: llc -mtriple=amdgcn -mcpu=gfx1010 -mattr=+wavefrontsize64 -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX10 %s ; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -mattr=+wavefrontsize64 -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX11 %s +; RUN: llc -mtriple=amdgcn -mcpu=gfx1200 -mattr=+wavefrontsize64 -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX12 %s declare i32 @llvm.amdgcn.workitem.id.x() #1 declare half @llvm.fabs.f16(half) @@ -90,6 +91,24 @@ define amdgpu_kernel void 
@v_cnd_nan_nosgpr(ptr addrspace(1) %out, i32 %c, ptr a ; GFX11-NEXT: v_cndmask_b32_e32 v0, -1, v0, vcc ; GFX11-NEXT: global_store_b32 v1, v0, s[0:1] ; GFX11-NEXT: s_endpgm +; +; GFX12-LABEL: v_cnd_nan_nosgpr: +; GFX12: ; %bb.0: +; GFX12-NEXT: s_load_b64 s[0:1], s[4:5], 0x34 +; GFX12-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GFX12-NEXT: v_mov_b32_e32 v1, 0 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX12-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: global_load_b32 v0, v0, s[0:1] +; GFX12-NEXT: s_load_b96 s[0:2], s[4:5], 0x24 +; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: s_cmp_eq_u32 s2, 0 +; GFX12-NEXT: s_cselect_b64 vcc, -1, 0 +; GFX12-NEXT: s_wait_loadcnt 0x0 +; GFX12-NEXT: v_cndmask_b32_e32 v0, -1, v0, vcc +; GFX12-NEXT: global_store_b32 v1, v0, s[0:1] +; GFX12-NEXT: s_endpgm %idx = call i32 @llvm.amdgcn.workitem.id.x() #1 %f.gep = getelementptr float, ptr addrspace(1) %fptr, i32 %idx %f = load float, ptr addrspace(1) %f.gep @@ -155,6 +174,18 @@ define amdgpu_kernel void @v_cnd_nan(ptr addrspace(1) %out, i32 %c, float %f) #0 ; GFX11-NEXT: v_cndmask_b32_e64 v1, -1, s3, s[4:5] ; GFX11-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX11-NEXT: s_endpgm +; +; GFX12-LABEL: v_cnd_nan: +; GFX12: ; %bb.0: +; GFX12-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 +; GFX12-NEXT: v_mov_b32_e32 v0, 0 +; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: s_cmp_eq_u32 s2, 0 +; GFX12-NEXT: s_cselect_b32 s2, s3, -1 +; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX12-NEXT: v_mov_b32_e32 v1, s2 +; GFX12-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX12-NEXT: s_endpgm %setcc = icmp ne i32 %c, 0 %select = select i1 %setcc, float 0xFFFFFFFFE0000000, float %f store float %select, ptr addrspace(1) %out @@ -220,6 +251,21 @@ define amdgpu_kernel void @fcmp_sgprX_k0_select_k1_sgprZ_f32(ptr addrspace(1) %o ; GFX11-NEXT: v_cndmask_b32_e64 v1, 1.0, s1, s[4:5] ; GFX11-NEXT: global_store_b32 v0, v1, s[2:3] ; GFX11-NEXT: s_endpgm +; +; GFX12-LABEL: 
fcmp_sgprX_k0_select_k1_sgprZ_f32: +; GFX12: ; %bb.0: +; GFX12-NEXT: s_clause 0x1 +; GFX12-NEXT: s_load_b64 s[0:1], s[4:5], 0x4c +; GFX12-NEXT: s_load_b64 s[2:3], s[4:5], 0x24 +; GFX12-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(SALU_CYCLE_1) +; GFX12-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: s_cmp_nlg_f32 s0, 0 +; GFX12-NEXT: s_cselect_b32 s0, s1, 1.0 +; GFX12-NEXT: v_mov_b32_e32 v1, s0 +; GFX12-NEXT: global_store_b32 v0, v1, s[2:3] +; GFX12-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() #1 %tid.ext = sext i32 %tid to i64 %out.gep = getelementptr inbounds float, ptr addrspace(1) %out, i64 %tid.ext @@ -285,6 +331,19 @@ define amdgpu_kernel void @fcmp_sgprX_k0_select_k1_sgprX_f32(ptr addrspace(1) %o ; GFX11-NEXT: v_cndmask_b32_e64 v1, 1.0, s6, s[2:3] ; GFX11-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX11-NEXT: s_endpgm +; +; GFX12-LABEL: fcmp_sgprX_k0_select_k1_sgprX_f32: +; GFX12: ; %bb.0: +; GFX12-NEXT: s_load_b96 s[0:2], s[4:5], 0x24 +; GFX12-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(SALU_CYCLE_1) +; GFX12-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: s_cmp_nlg_f32 s2, 0 +; GFX12-NEXT: s_cselect_b32 s2, s2, 1.0 +; GFX12-NEXT: v_mov_b32_e32 v1, s2 +; GFX12-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX12-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() #1 %tid.ext = sext i32 %tid to i64 %out.gep = getelementptr inbounds float, ptr addrspace(1) %out, i64 %tid.ext @@ -350,6 +409,21 @@ define amdgpu_kernel void @fcmp_sgprX_k0_select_k0_sgprZ_f32(ptr addrspace(1) %o ; GFX11-NEXT: v_cndmask_b32_e64 v1, 0, s1, s[4:5] ; GFX11-NEXT: global_store_b32 v0, v1, s[2:3] ; GFX11-NEXT: s_endpgm +; +; GFX12-LABEL: fcmp_sgprX_k0_select_k0_sgprZ_f32: +; GFX12: ; %bb.0: +; GFX12-NEXT: s_clause 0x1 +; GFX12-NEXT: s_load_b64 s[0:1], s[4:5], 0x4c +; GFX12-NEXT: 
s_load_b64 s[2:3], s[4:5], 0x24 +; GFX12-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(SALU_CYCLE_1) +; GFX12-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: s_cmp_nlg_f32 s0, 0 +; GFX12-NEXT: s_cselect_b32 s0, s1, 0 +; GFX12-NEXT: v_mov_b32_e32 v1, s0 +; GFX12-NEXT: global_store_b32 v0, v1, s[2:3] +; GFX12-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() #1 %tid.ext = sext i32 %tid to i64 %out.gep = getelementptr inbounds float, ptr addrspace(1) %out, i64 %tid.ext @@ -415,6 +489,19 @@ define amdgpu_kernel void @fcmp_sgprX_k0_select_k0_sgprX_f32(ptr addrspace(1) %o ; GFX11-NEXT: v_cndmask_b32_e64 v1, 0, s6, s[2:3] ; GFX11-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX11-NEXT: s_endpgm +; +; GFX12-LABEL: fcmp_sgprX_k0_select_k0_sgprX_f32: +; GFX12: ; %bb.0: +; GFX12-NEXT: s_load_b96 s[0:2], s[4:5], 0x24 +; GFX12-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(SALU_CYCLE_1) +; GFX12-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: s_cmp_nlg_f32 s2, 0 +; GFX12-NEXT: s_cselect_b32 s2, s2, 0 +; GFX12-NEXT: v_mov_b32_e32 v1, s2 +; GFX12-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX12-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() #1 %tid.ext = sext i32 %tid to i64 %out.gep = getelementptr inbounds float, ptr addrspace(1) %out, i64 %tid.ext @@ -498,6 +585,23 @@ define amdgpu_kernel void @fcmp_sgprX_k0_select_k0_vgprZ_f32(ptr addrspace(1) %o ; GFX11-NEXT: v_cndmask_b32_e32 v1, 0, v1, vcc ; GFX11-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX11-NEXT: s_endpgm +; +; GFX12-LABEL: fcmp_sgprX_k0_select_k0_vgprZ_f32: +; GFX12: ; %bb.0: +; GFX12-NEXT: s_load_b64 s[0:1], s[4:5], 0x34 +; GFX12-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: global_load_b32 
v1, v0, s[0:1] +; GFX12-NEXT: s_load_b96 s[0:2], s[4:5], 0x24 +; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: s_cmp_nlg_f32 s2, 0 +; GFX12-NEXT: s_cselect_b64 vcc, -1, 0 +; GFX12-NEXT: s_wait_loadcnt 0x0 +; GFX12-NEXT: v_cndmask_b32_e32 v1, 0, v1, vcc +; GFX12-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX12-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() #1 %tid.ext = sext i32 %tid to i64 %z.gep = getelementptr inbounds float, ptr addrspace(1) %z.ptr, i64 %tid.ext @@ -583,6 +687,23 @@ define amdgpu_kernel void @fcmp_sgprX_k0_select_k1_vgprZ_f32(ptr addrspace(1) %o ; GFX11-NEXT: v_cndmask_b32_e32 v1, 1.0, v1, vcc ; GFX11-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX11-NEXT: s_endpgm +; +; GFX12-LABEL: fcmp_sgprX_k0_select_k1_vgprZ_f32: +; GFX12: ; %bb.0: +; GFX12-NEXT: s_load_b64 s[0:1], s[4:5], 0x34 +; GFX12-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: global_load_b32 v1, v0, s[0:1] +; GFX12-NEXT: s_load_b96 s[0:2], s[4:5], 0x24 +; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: s_cmp_nlg_f32 s2, 0 +; GFX12-NEXT: s_cselect_b64 vcc, -1, 0 +; GFX12-NEXT: s_wait_loadcnt 0x0 +; GFX12-NEXT: v_cndmask_b32_e32 v1, 1.0, v1, vcc +; GFX12-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX12-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() #1 %tid.ext = sext i32 %tid to i64 %z.gep = getelementptr inbounds float, ptr addrspace(1) %z.ptr, i64 %tid.ext @@ -661,6 +782,21 @@ define amdgpu_kernel void @fcmp_vgprX_k0_select_k1_sgprZ_f32(ptr addrspace(1) %o ; GFX11-NEXT: v_cndmask_b32_e64 v1, 1.0, s4, vcc ; GFX11-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX11-NEXT: s_endpgm +; +; GFX12-LABEL: fcmp_vgprX_k0_select_k1_sgprZ_f32: +; GFX12: ; %bb.0: +; GFX12-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 +; GFX12-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GFX12-NEXT: s_load_b32 s4, s[4:5], 0x34 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-NEXT: 
v_lshlrev_b32_e32 v0, 2, v0 +; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: global_load_b32 v1, v0, s[2:3] +; GFX12-NEXT: s_wait_loadcnt 0x0 +; GFX12-NEXT: v_cmp_ngt_f32_e32 vcc, 0, v1 +; GFX12-NEXT: v_cndmask_b32_e64 v1, 1.0, s4, vcc +; GFX12-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX12-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() #1 %tid.ext = sext i32 %tid to i64 %x.gep = getelementptr inbounds float, ptr addrspace(1) %x.ptr, i64 %tid.ext @@ -751,6 +887,24 @@ define amdgpu_kernel void @fcmp_vgprX_k0_select_k1_vgprZ_f32(ptr addrspace(1) %o ; GFX11-NEXT: v_cndmask_b32_e32 v1, 1.0, v2, vcc ; GFX11-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX11-NEXT: s_endpgm +; +; GFX12-LABEL: fcmp_vgprX_k0_select_k1_vgprZ_f32: +; GFX12: ; %bb.0: +; GFX12-NEXT: s_clause 0x1 +; GFX12-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 +; GFX12-NEXT: s_load_b64 s[4:5], s[4:5], 0x34 +; GFX12-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: global_load_b32 v1, v0, s[2:3] scope:SCOPE_SYS +; GFX12-NEXT: s_wait_loadcnt 0x0 +; GFX12-NEXT: global_load_b32 v2, v0, s[4:5] scope:SCOPE_SYS +; GFX12-NEXT: s_wait_loadcnt 0x0 +; GFX12-NEXT: v_cmp_le_f32_e32 vcc, 0, v1 +; GFX12-NEXT: v_cndmask_b32_e32 v1, 1.0, v2, vcc +; GFX12-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX12-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() #1 %tid.ext = sext i32 %tid to i64 %x.gep = getelementptr inbounds float, ptr addrspace(1) %x.ptr, i64 %tid.ext @@ -843,6 +997,24 @@ define amdgpu_kernel void @icmp_vgprX_k0_select_k1_vgprZ_i32(ptr addrspace(1) %o ; GFX11-NEXT: v_cndmask_b32_e32 v1, 2, v2, vcc ; GFX11-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX11-NEXT: s_endpgm +; +; GFX12-LABEL: icmp_vgprX_k0_select_k1_vgprZ_i32: +; GFX12: ; %bb.0: +; GFX12-NEXT: s_clause 0x1 +; GFX12-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 +; GFX12-NEXT: s_load_b64 s[4:5], s[4:5], 0x34 +; GFX12-NEXT: v_and_b32_e32 
v0, 0x3ff, v0 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: global_load_b32 v1, v0, s[2:3] scope:SCOPE_SYS +; GFX12-NEXT: s_wait_loadcnt 0x0 +; GFX12-NEXT: global_load_b32 v2, v0, s[4:5] scope:SCOPE_SYS +; GFX12-NEXT: s_wait_loadcnt 0x0 +; GFX12-NEXT: v_cmp_lt_i32_e32 vcc, -1, v1 +; GFX12-NEXT: v_cndmask_b32_e32 v1, 2, v2, vcc +; GFX12-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX12-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() #1 %tid.ext = sext i32 %tid to i64 %x.gep = getelementptr inbounds i32, ptr addrspace(1) %x.ptr, i64 %tid.ext @@ -939,6 +1111,25 @@ define amdgpu_kernel void @icmp_vgprX_k0_select_k1_vgprZ_i64(ptr addrspace(1) %o ; GFX11-NEXT: v_cndmask_b32_e32 v0, 2, v2, vcc ; GFX11-NEXT: global_store_b64 v4, v[0:1], s[0:1] ; GFX11-NEXT: s_endpgm +; +; GFX12-LABEL: icmp_vgprX_k0_select_k1_vgprZ_i64: +; GFX12: ; %bb.0: +; GFX12-NEXT: s_clause 0x1 +; GFX12-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 +; GFX12-NEXT: s_load_b64 s[4:5], s[4:5], 0x34 +; GFX12-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-NEXT: v_lshlrev_b32_e32 v4, 3, v0 +; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: global_load_b64 v[0:1], v4, s[2:3] scope:SCOPE_SYS +; GFX12-NEXT: s_wait_loadcnt 0x0 +; GFX12-NEXT: global_load_b64 v[2:3], v4, s[4:5] scope:SCOPE_SYS +; GFX12-NEXT: s_wait_loadcnt 0x0 +; GFX12-NEXT: v_cmp_lt_i64_e32 vcc, -1, v[0:1] +; GFX12-NEXT: v_cndmask_b32_e32 v1, 0, v3, vcc +; GFX12-NEXT: v_cndmask_b32_e32 v0, 2, v2, vcc +; GFX12-NEXT: global_store_b64 v4, v[0:1], s[0:1] +; GFX12-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() #1 %tid.ext = sext i32 %tid to i64 %x.gep = getelementptr inbounds i64, ptr addrspace(1) %x.ptr, i64 %tid.ext @@ -1048,6 +1239,28 @@ define amdgpu_kernel void @fcmp_vgprX_k0_select_vgprZ_k1_v4f32(ptr addrspace(1) ; GFX11-NEXT: v_cndmask_b32_e32 v0, 1.0, v0, vcc ; GFX11-NEXT: global_store_b128 v4, v[0:3], 
s[0:1] ; GFX11-NEXT: s_endpgm +; +; GFX12-LABEL: fcmp_vgprX_k0_select_vgprZ_k1_v4f32: +; GFX12: ; %bb.0: +; GFX12-NEXT: s_clause 0x1 +; GFX12-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 +; GFX12-NEXT: s_load_b64 s[4:5], s[4:5], 0x34 +; GFX12-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-NEXT: v_lshlrev_b32_e32 v1, 2, v0 +; GFX12-NEXT: v_lshlrev_b32_e32 v4, 4, v0 +; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: global_load_b32 v5, v1, s[2:3] scope:SCOPE_SYS +; GFX12-NEXT: s_wait_loadcnt 0x0 +; GFX12-NEXT: global_load_b128 v[0:3], v4, s[4:5] scope:SCOPE_SYS +; GFX12-NEXT: s_wait_loadcnt 0x0 +; GFX12-NEXT: v_cmp_nge_f32_e32 vcc, 4.0, v5 +; GFX12-NEXT: v_cndmask_b32_e32 v3, 4.0, v3, vcc +; GFX12-NEXT: v_cndmask_b32_e32 v2, -0.5, v2, vcc +; GFX12-NEXT: v_cndmask_b32_e32 v1, 2.0, v1, vcc +; GFX12-NEXT: v_cndmask_b32_e32 v0, 1.0, v0, vcc +; GFX12-NEXT: global_store_b128 v4, v[0:3], s[0:1] +; GFX12-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() #1 %tid.ext = sext i32 %tid to i64 %x.gep = getelementptr inbounds float, ptr addrspace(1) %x.ptr, i64 %tid.ext @@ -1157,6 +1370,28 @@ define amdgpu_kernel void @fcmp_vgprX_k0_select_k1_vgprZ_v4f32(ptr addrspace(1) ; GFX11-NEXT: v_cndmask_b32_e32 v0, 1.0, v0, vcc ; GFX11-NEXT: global_store_b128 v4, v[0:3], s[0:1] ; GFX11-NEXT: s_endpgm +; +; GFX12-LABEL: fcmp_vgprX_k0_select_k1_vgprZ_v4f32: +; GFX12: ; %bb.0: +; GFX12-NEXT: s_clause 0x1 +; GFX12-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 +; GFX12-NEXT: s_load_b64 s[4:5], s[4:5], 0x34 +; GFX12-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-NEXT: v_lshlrev_b32_e32 v1, 2, v0 +; GFX12-NEXT: v_lshlrev_b32_e32 v4, 4, v0 +; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: global_load_b32 v5, v1, s[2:3] scope:SCOPE_SYS +; GFX12-NEXT: s_wait_loadcnt 0x0 +; GFX12-NEXT: global_load_b128 v[0:3], v4, s[4:5] scope:SCOPE_SYS +; GFX12-NEXT: s_wait_loadcnt 0x0 +; GFX12-NEXT: v_cmp_ge_f32_e32 vcc, 4.0, v5 +; 
GFX12-NEXT: v_cndmask_b32_e32 v3, 4.0, v3, vcc +; GFX12-NEXT: v_cndmask_b32_e32 v2, -0.5, v2, vcc +; GFX12-NEXT: v_cndmask_b32_e32 v1, 2.0, v1, vcc +; GFX12-NEXT: v_cndmask_b32_e32 v0, 1.0, v0, vcc +; GFX12-NEXT: global_store_b128 v4, v[0:3], s[0:1] +; GFX12-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() #1 %tid.ext = sext i32 %tid to i64 %x.gep = getelementptr inbounds float, ptr addrspace(1) %x.ptr, i64 %tid.ext @@ -1268,6 +1503,28 @@ define amdgpu_kernel void @fcmp_k0_vgprX_select_k1_vgprZ_v4f32(ptr addrspace(1) ; GFX11-NEXT: v_cndmask_b32_e32 v0, 1.0, v0, vcc ; GFX11-NEXT: global_store_b128 v4, v[0:3], s[0:1] ; GFX11-NEXT: s_endpgm +; +; GFX12-LABEL: fcmp_k0_vgprX_select_k1_vgprZ_v4f32: +; GFX12: ; %bb.0: +; GFX12-NEXT: s_clause 0x1 +; GFX12-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 +; GFX12-NEXT: s_load_b64 s[4:5], s[4:5], 0x34 +; GFX12-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-NEXT: v_lshlrev_b32_e32 v1, 2, v0 +; GFX12-NEXT: v_lshlrev_b32_e32 v4, 4, v0 +; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: global_load_b32 v5, v1, s[2:3] scope:SCOPE_SYS +; GFX12-NEXT: s_wait_loadcnt 0x0 +; GFX12-NEXT: global_load_b128 v[0:3], v4, s[4:5] scope:SCOPE_SYS +; GFX12-NEXT: s_wait_loadcnt 0x0 +; GFX12-NEXT: v_cmp_le_f32_e32 vcc, 4.0, v5 +; GFX12-NEXT: v_cndmask_b32_e32 v3, 4.0, v3, vcc +; GFX12-NEXT: v_cndmask_b32_e32 v2, -0.5, v2, vcc +; GFX12-NEXT: v_cndmask_b32_e32 v1, 2.0, v1, vcc +; GFX12-NEXT: v_cndmask_b32_e32 v0, 1.0, v0, vcc +; GFX12-NEXT: global_store_b128 v4, v[0:3], s[0:1] +; GFX12-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() #1 %tid.ext = sext i32 %tid to i64 %x.gep = getelementptr inbounds float, ptr addrspace(1) %x.ptr, i64 %tid.ext @@ -1375,6 +1632,29 @@ define amdgpu_kernel void @icmp_vgprX_k0_select_k1_vgprZ_i1(ptr addrspace(1) %ou ; GFX11-NEXT: v_cndmask_b32_e64 v1, 0, 1, s[0:1] ; GFX11-NEXT: global_store_b8 v0, v1, s[8:9] ; GFX11-NEXT: s_endpgm +; +; GFX12-LABEL: 
icmp_vgprX_k0_select_k1_vgprZ_i1: +; GFX12: ; %bb.0: +; GFX12-NEXT: s_clause 0x1 +; GFX12-NEXT: s_load_b128 s[8:11], s[4:5], 0x24 +; GFX12-NEXT: s_load_b64 s[0:1], s[4:5], 0x34 +; GFX12-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-NEXT: v_lshlrev_b32_e32 v1, 2, v0 +; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: global_load_b32 v1, v1, s[10:11] scope:SCOPE_SYS +; GFX12-NEXT: s_wait_loadcnt 0x0 +; GFX12-NEXT: global_load_u8 v2, v0, s[0:1] scope:SCOPE_SYS +; GFX12-NEXT: s_wait_loadcnt 0x0 +; GFX12-NEXT: v_cmp_gt_i32_e32 vcc, 0, v1 +; GFX12-NEXT: v_and_b32_e32 v2, 1, v2 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-NEXT: v_cmp_eq_u32_e64 s[0:1], 1, v2 +; GFX12-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX12-NEXT: v_cndmask_b32_e64 v1, 0, 1, s[0:1] +; GFX12-NEXT: global_store_b8 v0, v1, s[8:9] +; GFX12-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() #1 %tid.ext = sext i32 %tid to i64 %x.gep = getelementptr inbounds i32, ptr addrspace(1) %x.ptr, i64 %tid.ext @@ -1479,6 +1759,26 @@ define amdgpu_kernel void @fcmp_vgprX_k0_selectf64_k1_vgprZ_f32(ptr addrspace(1) ; GFX11-NEXT: v_cndmask_b32_e32 v0, 0, v0, vcc ; GFX11-NEXT: global_store_b64 v2, v[0:1], s[0:1] ; GFX11-NEXT: s_endpgm +; +; GFX12-LABEL: fcmp_vgprX_k0_selectf64_k1_vgprZ_f32: +; GFX12: ; %bb.0: +; GFX12-NEXT: s_clause 0x1 +; GFX12-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 +; GFX12-NEXT: s_load_b64 s[4:5], s[4:5], 0x34 +; GFX12-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-NEXT: v_lshlrev_b32_e32 v1, 2, v0 +; GFX12-NEXT: v_lshlrev_b32_e32 v2, 3, v0 +; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: global_load_b32 v3, v1, s[2:3] scope:SCOPE_SYS +; GFX12-NEXT: s_wait_loadcnt 0x0 +; GFX12-NEXT: global_load_b64 v[0:1], v2, s[4:5] scope:SCOPE_SYS +; GFX12-NEXT: s_wait_loadcnt 0x0 +; GFX12-NEXT: v_cmp_le_f32_e32 vcc, 0, v3 +; 
GFX12-NEXT: v_cndmask_b32_e32 v1, 0x3ff00000, v1, vcc +; GFX12-NEXT: v_cndmask_b32_e32 v0, 0, v0, vcc +; GFX12-NEXT: global_store_b64 v2, v[0:1], s[0:1] +; GFX12-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() #1 %tid.ext = sext i32 %tid to i64 %x.gep = getelementptr inbounds float, ptr addrspace(1) %x.ptr, i64 %tid.ext @@ -1581,6 +1881,26 @@ define amdgpu_kernel void @fcmp_vgprX_k0_selecti64_k1_vgprZ_f32(ptr addrspace(1) ; GFX11-NEXT: v_cndmask_b32_e32 v0, 3, v0, vcc ; GFX11-NEXT: global_store_b64 v2, v[0:1], s[0:1] ; GFX11-NEXT: s_endpgm +; +; GFX12-LABEL: fcmp_vgprX_k0_selecti64_k1_vgprZ_f32: +; GFX12: ; %bb.0: +; GFX12-NEXT: s_clause 0x1 +; GFX12-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 +; GFX12-NEXT: s_load_b64 s[4:5], s[4:5], 0x34 +; GFX12-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-NEXT: v_lshlrev_b32_e32 v1, 2, v0 +; GFX12-NEXT: v_lshlrev_b32_e32 v2, 3, v0 +; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: global_load_b32 v3, v1, s[2:3] scope:SCOPE_SYS +; GFX12-NEXT: s_wait_loadcnt 0x0 +; GFX12-NEXT: global_load_b64 v[0:1], v2, s[4:5] scope:SCOPE_SYS +; GFX12-NEXT: s_wait_loadcnt 0x0 +; GFX12-NEXT: v_cmp_nlg_f32_e32 vcc, 0, v3 +; GFX12-NEXT: v_cndmask_b32_e32 v1, 0, v1, vcc +; GFX12-NEXT: v_cndmask_b32_e32 v0, 3, v0, vcc +; GFX12-NEXT: global_store_b64 v2, v[0:1], s[0:1] +; GFX12-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() #1 %tid.ext = sext i32 %tid to i64 %x.gep = getelementptr inbounds float, ptr addrspace(1) %x.ptr, i64 %tid.ext @@ -1674,6 +1994,24 @@ define amdgpu_kernel void @icmp_vgprX_k0_selectf32_k1_vgprZ_i32(ptr addrspace(1) ; GFX11-NEXT: v_cndmask_b32_e32 v1, 4.0, v2, vcc ; GFX11-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX11-NEXT: s_endpgm +; +; GFX12-LABEL: icmp_vgprX_k0_selectf32_k1_vgprZ_i32: +; GFX12: ; %bb.0: +; GFX12-NEXT: s_clause 0x1 +; GFX12-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 +; GFX12-NEXT: s_load_b64 s[4:5], s[4:5], 0x34 +; GFX12-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; 
GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: global_load_b32 v1, v0, s[2:3] scope:SCOPE_SYS +; GFX12-NEXT: s_wait_loadcnt 0x0 +; GFX12-NEXT: global_load_b32 v2, v0, s[4:5] scope:SCOPE_SYS +; GFX12-NEXT: s_wait_loadcnt 0x0 +; GFX12-NEXT: v_cmp_gt_u32_e32 vcc, 2, v1 +; GFX12-NEXT: v_cndmask_b32_e32 v1, 4.0, v2, vcc +; GFX12-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX12-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() #1 %tid.ext = sext i32 %tid to i64 %x.gep = getelementptr inbounds i32, ptr addrspace(1) %x.ptr, i64 %tid.ext @@ -1783,6 +2121,28 @@ define amdgpu_kernel void @fcmp_k0_vgprX_select_k1_vgprZ_f32_cond_use_x2(ptr add ; GFX11-NEXT: global_store_b32 v0, v2, s[0:1] dlc ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-NEXT: s_endpgm +; +; GFX12-LABEL: fcmp_k0_vgprX_select_k1_vgprZ_f32_cond_use_x2: +; GFX12: ; %bb.0: +; GFX12-NEXT: s_clause 0x1 +; GFX12-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 +; GFX12-NEXT: s_load_b64 s[4:5], s[4:5], 0x34 +; GFX12-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: global_load_b32 v1, v0, s[2:3] scope:SCOPE_SYS +; GFX12-NEXT: s_wait_loadcnt 0x0 +; GFX12-NEXT: global_load_b32 v2, v0, s[4:5] scope:SCOPE_SYS +; GFX12-NEXT: s_wait_loadcnt 0x0 +; GFX12-NEXT: v_cmp_nle_f32_e32 vcc, 4.0, v1 +; GFX12-NEXT: v_cndmask_b32_e64 v1, v2, -1.0, vcc +; GFX12-NEXT: v_cndmask_b32_e64 v2, v2, -2.0, vcc +; GFX12-NEXT: global_store_b32 v0, v1, s[0:1] scope:SCOPE_SYS +; GFX12-NEXT: s_wait_storecnt 0x0 +; GFX12-NEXT: global_store_b32 v0, v2, s[0:1] scope:SCOPE_SYS +; GFX12-NEXT: s_wait_storecnt 0x0 +; GFX12-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() #1 %tid.ext = sext i32 %tid to i64 %x.gep = getelementptr inbounds float, ptr addrspace(1) %x.ptr, i64 %tid.ext @@ -1890,6 +2250,27 @@ define amdgpu_kernel void 
@v_cndmask_abs_neg_f16(ptr addrspace(1) %out, i32 %c, ; GFX11-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc ; GFX11-NEXT: global_store_b16 v2, v0, s[0:1] ; GFX11-NEXT: s_endpgm +; +; GFX12-LABEL: v_cndmask_abs_neg_f16: +; GFX12: ; %bb.0: +; GFX12-NEXT: s_load_b64 s[0:1], s[4:5], 0x34 +; GFX12-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GFX12-NEXT: v_mov_b32_e32 v2, 0 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX12-NEXT: v_lshlrev_b32_e32 v0, 1, v0 +; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: global_load_u16 v0, v0, s[0:1] +; GFX12-NEXT: s_load_b96 s[0:2], s[4:5], 0x24 +; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: s_cmp_lg_u32 s2, 0 +; GFX12-NEXT: s_cselect_b64 vcc, -1, 0 +; GFX12-NEXT: s_wait_loadcnt 0x0 +; GFX12-NEXT: v_and_b32_e32 v1, 0x7fff, v0 +; GFX12-NEXT: v_xor_b32_e32 v0, 0x8000, v0 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc +; GFX12-NEXT: global_store_b16 v2, v0, s[0:1] +; GFX12-NEXT: s_endpgm %idx = call i32 @llvm.amdgcn.workitem.id.x() #1 %f.gep = getelementptr half, ptr addrspace(1) %fptr, i32 %idx %f = load half, ptr addrspace(1) %f.gep @@ -1981,6 +2362,24 @@ define amdgpu_kernel void @v_cndmask_abs_neg_f32(ptr addrspace(1) %out, i32 %c, ; GFX11-NEXT: v_cndmask_b32_e64 v0, -v0, |v0|, s[2:3] ; GFX11-NEXT: global_store_b32 v1, v0, s[0:1] ; GFX11-NEXT: s_endpgm +; +; GFX12-LABEL: v_cndmask_abs_neg_f32: +; GFX12: ; %bb.0: +; GFX12-NEXT: s_load_b64 s[0:1], s[4:5], 0x34 +; GFX12-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GFX12-NEXT: v_mov_b32_e32 v1, 0 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX12-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: global_load_b32 v0, v0, s[0:1] +; GFX12-NEXT: s_load_b96 s[0:2], s[4:5], 0x24 +; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: s_cmp_lg_u32 s2, 0 +; GFX12-NEXT: s_cselect_b64 s[2:3], -1, 0 +; GFX12-NEXT: s_wait_loadcnt 0x0 +; GFX12-NEXT: v_cndmask_b32_e64 v0, -v0, |v0|, s[2:3] +; GFX12-NEXT: global_store_b32 v1, v0, s[0:1] +; 
GFX12-NEXT: s_endpgm %idx = call i32 @llvm.amdgcn.workitem.id.x() #1 %f.gep = getelementptr float, ptr addrspace(1) %fptr, i32 %idx %f = load float, ptr addrspace(1) %f.gep @@ -2086,6 +2485,28 @@ define amdgpu_kernel void @v_cndmask_abs_neg_f64(ptr addrspace(1) %out, i32 %c, ; GFX11-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc ; GFX11-NEXT: global_store_b64 v3, v[0:1], s[0:1] ; GFX11-NEXT: s_endpgm +; +; GFX12-LABEL: v_cndmask_abs_neg_f64: +; GFX12: ; %bb.0: +; GFX12-NEXT: s_load_b64 s[0:1], s[4:5], 0x34 +; GFX12-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GFX12-NEXT: v_mov_b32_e32 v3, 0 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX12-NEXT: v_lshlrev_b32_e32 v0, 3, v0 +; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: global_load_b64 v[0:1], v0, s[0:1] +; GFX12-NEXT: s_load_b96 s[0:2], s[4:5], 0x24 +; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: s_cmp_lg_u32 s2, 0 +; GFX12-NEXT: s_cselect_b64 vcc, -1, 0 +; GFX12-NEXT: s_wait_loadcnt 0x0 +; GFX12-NEXT: v_and_b32_e32 v2, 0x7fffffff, v1 +; GFX12-NEXT: v_xor_b32_e32 v1, 0x80000000, v1 +; GFX12-NEXT: v_cndmask_b32_e32 v0, v0, v0, vcc +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX12-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc +; GFX12-NEXT: global_store_b64 v3, v[0:1], s[0:1] +; GFX12-NEXT: s_endpgm %idx = call i32 @llvm.amdgcn.workitem.id.x() #1 %f.gep = getelementptr double, ptr addrspace(1) %fptr, i32 %idx %f = load double, ptr addrspace(1) %f.gep diff --git a/llvm/test/CodeGen/AMDGPU/valu-mask-write-hazard.mir b/llvm/test/CodeGen/AMDGPU/valu-mask-write-hazard.mir index c936c13ac6c66..d91ee54215924 100644 --- a/llvm/test/CodeGen/AMDGPU/valu-mask-write-hazard.mir +++ b/llvm/test/CodeGen/AMDGPU/valu-mask-write-hazard.mir @@ -159,16 +159,16 @@ name: mask_hazard_cndmask_dpp3 body: | bb.0: ; GFX11-LABEL: name: mask_hazard_cndmask_dpp3 - ; GFX11: $vgpr0 = V_CNDMASK_B16_e64_dpp $vgpr0, 0, $vgpr1, 0, $vgpr2, $sgpr2_sgpr3, 1, 15, 15, 1, implicit $exec + ; GFX11: $vgpr0 = V_CNDMASK_B16_fake16_e64_dpp $vgpr0, 0, $vgpr1, 0, $vgpr2, 
$sgpr2_sgpr3, 1, 15, 15, 1, implicit $exec ; GFX11-NEXT: $sgpr2_sgpr3 = S_CSELECT_B64 -1, 0, implicit $scc ; GFX11-NEXT: S_WAITCNT_DEPCTR 65534 ; GFX11-NEXT: S_ENDPGM 0 ; ; GFX12-LABEL: name: mask_hazard_cndmask_dpp3 - ; GFX12: $vgpr0 = V_CNDMASK_B16_e64_dpp $vgpr0, 0, $vgpr1, 0, $vgpr2, $sgpr2_sgpr3, 1, 15, 15, 1, implicit $exec + ; GFX12: $vgpr0 = V_CNDMASK_B16_fake16_e64_dpp $vgpr0, 0, $vgpr1, 0, $vgpr2, $sgpr2_sgpr3, 1, 15, 15, 1, implicit $exec ; GFX12-NEXT: $sgpr2_sgpr3 = S_CSELECT_B64 -1, 0, implicit $scc ; GFX12-NEXT: S_ENDPGM 0 - $vgpr0 = V_CNDMASK_B16_e64_dpp $vgpr0, 0, $vgpr1, 0, $vgpr2, $sgpr2_sgpr3, 1, 15, 15, 1, implicit $exec + $vgpr0 = V_CNDMASK_B16_fake16_e64_dpp $vgpr0, 0, $vgpr1, 0, $vgpr2, $sgpr2_sgpr3, 1, 15, 15, 1, implicit $exec $sgpr2_sgpr3 = S_CSELECT_B64 -1, 0, implicit $scc S_ENDPGM 0 ... diff --git a/llvm/test/CodeGen/ARM/vector-promotion.ll b/llvm/test/CodeGen/ARM/vector-promotion.ll index a9a8f58963a37..344014ad80449 100644 --- a/llvm/test/CodeGen/ARM/vector-promotion.ll +++ b/llvm/test/CodeGen/ARM/vector-promotion.ll @@ -4,7 +4,7 @@ ; IR-BOTH-LABEL: @simpleOneInstructionPromotion ; IR-BOTH: [[LOAD:%[a-zA-Z_0-9-]+]] = load <2 x i32>, ptr %addr1 -; IR-BOTH-NEXT: [[VECTOR_OR:%[a-zA-Z_0-9-]+]] = or <2 x i32> [[LOAD]], +; IR-BOTH-NEXT: [[VECTOR_OR:%[a-zA-Z_0-9-]+]] = or <2 x i32> [[LOAD]], ; IR-BOTH-NEXT: [[EXTRACT:%[a-zA-Z_0-9-]+]] = extractelement <2 x i32> [[VECTOR_OR]], i32 1 ; IR-BOTH-NEXT: store i32 [[EXTRACT]], ptr %dest ; IR-BOTH-NEXT: ret @@ -71,13 +71,13 @@ end: ; IR-LABEL: @chainOfInstructionsToPromote ; IR-BOTH: [[LOAD:%[a-zA-Z_0-9-]+]] = load <2 x i32>, ptr %addr1 -; IR-BOTH-NEXT: [[VECTOR_OR1:%[a-zA-Z_0-9-]+]] = or <2 x i32> [[LOAD]], -; IR-BOTH-NEXT: [[VECTOR_OR2:%[a-zA-Z_0-9-]+]] = or <2 x i32> [[VECTOR_OR1]], -; IR-BOTH-NEXT: [[VECTOR_OR3:%[a-zA-Z_0-9-]+]] = or <2 x i32> [[VECTOR_OR2]], -; IR-BOTH-NEXT: [[VECTOR_OR4:%[a-zA-Z_0-9-]+]] = or <2 x i32> [[VECTOR_OR3]], -; IR-BOTH-NEXT: [[VECTOR_OR5:%[a-zA-Z_0-9-]+]] = or <2 x 
i32> [[VECTOR_OR4]], -; IR-BOTH-NEXT: [[VECTOR_OR6:%[a-zA-Z_0-9-]+]] = or <2 x i32> [[VECTOR_OR5]], -; IR-BOTH-NEXT: [[VECTOR_OR7:%[a-zA-Z_0-9-]+]] = or <2 x i32> [[VECTOR_OR6]], +; IR-BOTH-NEXT: [[VECTOR_OR1:%[a-zA-Z_0-9-]+]] = or <2 x i32> [[LOAD]], +; IR-BOTH-NEXT: [[VECTOR_OR2:%[a-zA-Z_0-9-]+]] = or <2 x i32> [[VECTOR_OR1]], +; IR-BOTH-NEXT: [[VECTOR_OR3:%[a-zA-Z_0-9-]+]] = or <2 x i32> [[VECTOR_OR2]], +; IR-BOTH-NEXT: [[VECTOR_OR4:%[a-zA-Z_0-9-]+]] = or <2 x i32> [[VECTOR_OR3]], +; IR-BOTH-NEXT: [[VECTOR_OR5:%[a-zA-Z_0-9-]+]] = or <2 x i32> [[VECTOR_OR4]], +; IR-BOTH-NEXT: [[VECTOR_OR6:%[a-zA-Z_0-9-]+]] = or <2 x i32> [[VECTOR_OR5]], +; IR-BOTH-NEXT: [[VECTOR_OR7:%[a-zA-Z_0-9-]+]] = or <2 x i32> [[VECTOR_OR6]], ; IR-BOTH-NEXT: [[EXTRACT:%[a-zA-Z_0-9-]+]] = extractelement <2 x i32> [[VECTOR_OR7]], i32 0 ; IR-BOTH-NEXT: store i32 [[EXTRACT]], ptr %dest ; IR-BOTH-NEXT: ret @@ -276,7 +276,7 @@ define void @undefRemCase(ptr %addr1, ptr %dest) { ; IR-NORMAL-NEXT: [[EXTRACT:%[a-zA-Z_0-9-]+]] = extractelement <2 x float> [[LOAD]], i32 1 ; IR-NORMAL-NEXT: [[RES:%[a-zA-Z_0-9-]+]] = frem nnan float [[EXTRACT]], 7.0 ; Vector version: -; IR-STRESS-NEXT: [[DIV:%[a-zA-Z_0-9-]+]] = frem nnan <2 x float> [[LOAD]], +; IR-STRESS-NEXT: [[DIV:%[a-zA-Z_0-9-]+]] = frem nnan <2 x float> [[LOAD]], ; IR-STRESS-NEXT: [[RES:%[a-zA-Z_0-9-]+]] = extractelement <2 x float> [[DIV]], i32 1 ; ; IR-BOTH-NEXT: store float [[RES]], ptr %dest @@ -297,7 +297,7 @@ define void @undefConstantFRemCaseWithFastMath(ptr %addr1, ptr %dest) { ; IR-NORMAL-NEXT: [[EXTRACT:%[a-zA-Z_0-9-]+]] = extractelement <2 x float> [[LOAD]], i32 1 ; IR-NORMAL-NEXT: [[RES:%[a-zA-Z_0-9-]+]] = frem nnan float 7.000000e+00, [[EXTRACT]] ; Vector version: -; IR-STRESS-NEXT: [[DIV:%[a-zA-Z_0-9-]+]] = frem nnan <2 x float> , [[LOAD]] +; IR-STRESS-NEXT: [[DIV:%[a-zA-Z_0-9-]+]] = frem nnan <2 x float> , [[LOAD]] ; IR-STRESS-NEXT: [[RES:%[a-zA-Z_0-9-]+]] = extractelement <2 x float> [[DIV]], i32 1 ; ; IR-BOTH-NEXT: store float 
[[RES]], ptr %dest @@ -319,7 +319,7 @@ define void @undefVectorFRemCaseWithFastMath(ptr %addr1, ptr %dest) { ; IR-NORMAL-NEXT: [[EXTRACT:%[a-zA-Z_0-9-]+]] = extractelement <2 x float> [[LOAD]], i32 1 ; IR-NORMAL-NEXT: [[RES:%[a-zA-Z_0-9-]+]] = fadd float [[EXTRACT]], 1.0 ; Vector version: -; IR-STRESS-NEXT: [[DIV:%[a-zA-Z_0-9-]+]] = fadd <2 x float> [[LOAD]], +; IR-STRESS-NEXT: [[DIV:%[a-zA-Z_0-9-]+]] = fadd <2 x float> [[LOAD]], ; IR-STRESS-NEXT: [[RES:%[a-zA-Z_0-9-]+]] = extractelement <2 x float> [[DIV]], i32 1 ; ; IR-BOTH-NEXT: store float [[RES]], ptr %dest @@ -358,7 +358,7 @@ define void @simpleOneInstructionPromotionVariableIdx(ptr %addr1, ptr %dest, i32 ; Check a vector with more than 2 elements. ; IR-BOTH-LABEL: @simpleOneInstructionPromotion8x8 ; IR-BOTH: [[LOAD:%[a-zA-Z_0-9-]+]] = load <8 x i8>, ptr %addr1 -; IR-BOTH-NEXT: [[OR:%[a-zA-Z_0-9-]+]] = or <8 x i8> [[LOAD]], +; IR-BOTH-NEXT: [[OR:%[a-zA-Z_0-9-]+]] = or <8 x i8> [[LOAD]], ; IR-BOTH-NEXT: [[RES:%[a-zA-Z_0-9-]+]] = extractelement <8 x i8> [[OR]], i32 1 ; IR-BOTH-NEXT: store i8 [[RES]], ptr %dest ; IR-BOTH-NEXT: ret @@ -374,7 +374,7 @@ define void @simpleOneInstructionPromotion8x8(ptr %addr1, ptr %dest) { ; lowered on a Q register. 
; IR-BOTH-LABEL: @simpleOneInstructionPromotion ; IR-BOTH: [[LOAD:%[a-zA-Z_0-9-]+]] = load <4 x i32>, ptr %addr1 -; IR-BOTH-NEXT: [[VECTOR_OR:%[a-zA-Z_0-9-]+]] = or <4 x i32> [[LOAD]], +; IR-BOTH-NEXT: [[VECTOR_OR:%[a-zA-Z_0-9-]+]] = or <4 x i32> [[LOAD]], ; IR-BOTH-NEXT: [[EXTRACT:%[a-zA-Z_0-9-]+]] = extractelement <4 x i32> [[VECTOR_OR]], i32 1 ; IR-BOTH-NEXT: store i32 [[EXTRACT]], ptr %dest ; IR-BOTH-NEXT: ret diff --git a/llvm/test/CodeGen/DirectX/WaveActiveSum.ll b/llvm/test/CodeGen/DirectX/WaveActiveSum.ll new file mode 100644 index 0000000000000..d5180eb10c699 --- /dev/null +++ b/llvm/test/CodeGen/DirectX/WaveActiveSum.ll @@ -0,0 +1,143 @@ +; RUN: opt -S -scalarizer -dxil-op-lower -mtriple=dxil-pc-shadermodel6.3-library < %s | FileCheck %s + +; Test that for scalar values, WaveActiveSum maps down to the DirectX op + +define noundef half @wave_active_sum_half(half noundef %expr) { +entry: +; CHECK: call half @dx.op.waveActiveOp.f16(i32 119, half %expr, i8 0, i8 0) + %ret = call half @llvm.dx.wave.reduce.sum.f16(half %expr) + ret half %ret +} + +define noundef float @wave_active_sum_float(float noundef %expr) { +entry: +; CHECK: call float @dx.op.waveActiveOp.f32(i32 119, float %expr, i8 0, i8 0) + %ret = call float @llvm.dx.wave.reduce.sum.f32(float %expr) + ret float %ret +} + +define noundef double @wave_active_sum_double(double noundef %expr) { +entry: +; CHECK: call double @dx.op.waveActiveOp.f64(i32 119, double %expr, i8 0, i8 0) + %ret = call double @llvm.dx.wave.reduce.sum.f64(double %expr) + ret double %ret +} + +define noundef i16 @wave_active_sum_i16(i16 noundef %expr) { +entry: +; CHECK: call i16 @dx.op.waveActiveOp.i16(i32 119, i16 %expr, i8 0, i8 0) + %ret = call i16 @llvm.dx.wave.reduce.sum.i16(i16 %expr) + ret i16 %ret +} + +define noundef i32 @wave_active_sum_i32(i32 noundef %expr) { +entry: +; CHECK: call i32 @dx.op.waveActiveOp.i32(i32 119, i32 %expr, i8 0, i8 0) + %ret = call i32 @llvm.dx.wave.reduce.sum.i32(i32 %expr) + ret i32 %ret +} + 
+define noundef i64 @wave_active_sum_i64(i64 noundef %expr) { +entry: +; CHECK: call i64 @dx.op.waveActiveOp.i64(i32 119, i64 %expr, i8 0, i8 0) + %ret = call i64 @llvm.dx.wave.reduce.sum.i64(i64 %expr) + ret i64 %ret +} + +define noundef i16 @wave_active_usum_i16(i16 noundef %expr) { +entry: +; CHECK: call i16 @dx.op.waveActiveOp.i16(i32 119, i16 %expr, i8 0, i8 1) + %ret = call i16 @llvm.dx.wave.reduce.usum.i16(i16 %expr) + ret i16 %ret +} + +define noundef i32 @wave_active_usum_i32(i32 noundef %expr) { +entry: +; CHECK: call i32 @dx.op.waveActiveOp.i32(i32 119, i32 %expr, i8 0, i8 1) + %ret = call i32 @llvm.dx.wave.reduce.usum.i32(i32 %expr) + ret i32 %ret +} + +define noundef i64 @wave_active_usum_i64(i64 noundef %expr) { +entry: +; CHECK: call i64 @dx.op.waveActiveOp.i64(i32 119, i64 %expr, i8 0, i8 1) + %ret = call i64 @llvm.dx.wave.reduce.usum.i64(i64 %expr) + ret i64 %ret +} + +declare half @llvm.dx.wave.reduce.sum.f16(half) +declare float @llvm.dx.wave.reduce.sum.f32(float) +declare double @llvm.dx.wave.reduce.sum.f64(double) + +declare i16 @llvm.dx.wave.reduce.sum.i16(i16) +declare i32 @llvm.dx.wave.reduce.sum.i32(i32) +declare i64 @llvm.dx.wave.reduce.sum.i64(i64) + +declare i16 @llvm.dx.wave.reduce.usum.i16(i16) +declare i32 @llvm.dx.wave.reduce.usum.i32(i32) +declare i64 @llvm.dx.wave.reduce.usum.i64(i64) + +; Test that for vector values, WaveActiveSum scalarizes and maps down to the +; DirectX op + +define noundef <2 x half> @wave_active_sum_v2half(<2 x half> noundef %expr) { +entry: +; CHECK: call half @dx.op.waveActiveOp.f16(i32 119, half %expr.i0, i8 0, i8 0) +; CHECK: call half @dx.op.waveActiveOp.f16(i32 119, half %expr.i1, i8 0, i8 0) + %ret = call <2 x half> @llvm.dx.wave.reduce.sum.v2f16(<2 x half> %expr) + ret <2 x half> %ret +} + +define noundef <3 x i32> @wave_active_sum_v3i32(<3 x i32> noundef %expr) { +entry: +; CHECK: call i32 @dx.op.waveActiveOp.i32(i32 119, i32 %expr.i0, i8 0, i8 0) +; CHECK: call i32 @dx.op.waveActiveOp.i32(i32 119, 
i32 %expr.i1, i8 0, i8 0) +; CHECK: call i32 @dx.op.waveActiveOp.i32(i32 119, i32 %expr.i2, i8 0, i8 0) + %ret = call <3 x i32> @llvm.dx.wave.reduce.sum.v3i32(<3 x i32> %expr) + ret <3 x i32> %ret +} + +define noundef <4 x double> @wave_active_sum_v4f64(<4 x double> noundef %expr) { +entry: +; CHECK: call double @dx.op.waveActiveOp.f64(i32 119, double %expr.i0, i8 0, i8 0) +; CHECK: call double @dx.op.waveActiveOp.f64(i32 119, double %expr.i1, i8 0, i8 0) +; CHECK: call double @dx.op.waveActiveOp.f64(i32 119, double %expr.i2, i8 0, i8 0) +; CHECK: call double @dx.op.waveActiveOp.f64(i32 119, double %expr.i3, i8 0, i8 0) + %ret = call <4 x double> @llvm.dx.wave.reduce.sum.v464(<4 x double> %expr) + ret <4 x double> %ret +} + +declare <2 x half> @llvm.dx.wave.reduce.sum.v2f16(<2 x half>) +declare <3 x i32> @llvm.dx.wave.reduce.sum.v3i32(<3 x i32>) +declare <4 x double> @llvm.dx.wave.reduce.sum.v4f64(<4 x double>) + +define noundef <2 x i16> @wave_active_usum_v2i16(<2 x i16> noundef %expr) { +entry: +; CHECK: call i16 @dx.op.waveActiveOp.i16(i32 119, i16 %expr.i0, i8 0, i8 1) +; CHECK: call i16 @dx.op.waveActiveOp.i16(i32 119, i16 %expr.i1, i8 0, i8 1) + %ret = call <2 x i16> @llvm.dx.wave.reduce.usum.v2f16(<2 x i16> %expr) + ret <2 x i16> %ret +} + +define noundef <3 x i32> @wave_active_usum_v3i32(<3 x i32> noundef %expr) { +entry: +; CHECK: call i32 @dx.op.waveActiveOp.i32(i32 119, i32 %expr.i0, i8 0, i8 1) +; CHECK: call i32 @dx.op.waveActiveOp.i32(i32 119, i32 %expr.i1, i8 0, i8 1) +; CHECK: call i32 @dx.op.waveActiveOp.i32(i32 119, i32 %expr.i2, i8 0, i8 1) + %ret = call <3 x i32> @llvm.dx.wave.reduce.usum.v3i32(<3 x i32> %expr) + ret <3 x i32> %ret +} + +define noundef <4 x i64> @wave_active_usum_v4f64(<4 x i64> noundef %expr) { +entry: +; CHECK: call i64 @dx.op.waveActiveOp.i64(i32 119, i64 %expr.i0, i8 0, i8 1) +; CHECK: call i64 @dx.op.waveActiveOp.i64(i32 119, i64 %expr.i1, i8 0, i8 1) +; CHECK: call i64 @dx.op.waveActiveOp.i64(i32 119, i64 %expr.i2, i8 0, 
i8 1) +; CHECK: call i64 @dx.op.waveActiveOp.i64(i32 119, i64 %expr.i3, i8 0, i8 1) + %ret = call <4 x i64> @llvm.dx.wave.reduce.usum.v464(<4 x i64> %expr) + ret <4 x i64> %ret +} + +declare <2 x i16> @llvm.dx.wave.reduce.usum.v2f16(<2 x i16>) +declare <3 x i32> @llvm.dx.wave.reduce.usum.v3i32(<3 x i32>) +declare <4 x i64> @llvm.dx.wave.reduce.usum.v4f64(<4 x i64>) diff --git a/llvm/test/CodeGen/DirectX/firstbitlow.ll b/llvm/test/CodeGen/DirectX/firstbitlow.ll new file mode 100644 index 0000000000000..884ec1164fc99 --- /dev/null +++ b/llvm/test/CodeGen/DirectX/firstbitlow.ll @@ -0,0 +1,47 @@ +; RUN: opt -S -scalarizer -dxil-op-lower -mtriple=dxil-pc-shadermodel6.3-library %s | FileCheck %s + +; Make sure dxil operation function calls for firstbitlow are generated for all integer types. + +define noundef i32 @test_firstbitlow_short(i16 noundef %a) { +entry: +; CHECK: call i32 @dx.op.unaryBits.i16(i32 32, i16 %{{.*}}) + %elt.firstbitlow = call i32 @llvm.dx.firstbitlow.i16(i16 %a) + ret i32 %elt.firstbitlow +} + +define noundef i32 @test_firstbitlow_int(i32 noundef %a) { +entry: +; CHECK: call i32 @dx.op.unaryBits.i32(i32 32, i32 %{{.*}}) + %elt.firstbitlow = call i32 @llvm.dx.firstbitlow.i32(i32 %a) + ret i32 %elt.firstbitlow +} + +define noundef i32 @test_firstbitlow_long(i64 noundef %a) { +entry: +; CHECK: call i32 @dx.op.unaryBits.i64(i32 32, i64 %{{.*}}) + %elt.firstbitlow = call i32 @llvm.dx.firstbitlow.i64(i64 %a) + ret i32 %elt.firstbitlow +} + +define noundef <4 x i32> @test_firstbitlow_vec4_i32(<4 x i32> noundef %a) { +entry: + ; CHECK: [[ee0:%.*]] = extractelement <4 x i32> %a, i64 0 + ; CHECK: [[ie0:%.*]] = call i32 @dx.op.unaryBits.i32(i32 32, i32 [[ee0]]) + ; CHECK: [[ee1:%.*]] = extractelement <4 x i32> %a, i64 1 + ; CHECK: [[ie1:%.*]] = call i32 @dx.op.unaryBits.i32(i32 32, i32 [[ee1]]) + ; CHECK: [[ee2:%.*]] = extractelement <4 x i32> %a, i64 2 + ; CHECK: [[ie2:%.*]] = call i32 @dx.op.unaryBits.i32(i32 32, i32 [[ee2]]) + ; CHECK: [[ee3:%.*]] = 
extractelement <4 x i32> %a, i64 3 + ; CHECK: [[ie3:%.*]] = call i32 @dx.op.unaryBits.i32(i32 32, i32 [[ee3]]) + ; CHECK: insertelement <4 x i32> poison, i32 [[ie0]], i64 0 + ; CHECK: insertelement <4 x i32> %{{.*}}, i32 [[ie1]], i64 1 + ; CHECK: insertelement <4 x i32> %{{.*}}, i32 [[ie2]], i64 2 + ; CHECK: insertelement <4 x i32> %{{.*}}, i32 [[ie3]], i64 3 + %2 = call <4 x i32> @llvm.dx.firstbitlow.v4i32(<4 x i32> %a) + ret <4 x i32> %2 +} + +declare i32 @llvm.dx.firstbitlow.i16(i16) +declare i32 @llvm.dx.firstbitlow.i32(i32) +declare i32 @llvm.dx.firstbitlow.i64(i64) +declare <4 x i32> @llvm.dx.firstbitlow.v4i32(<4 x i32>) diff --git a/llvm/test/CodeGen/DirectX/firstbitlow_error.ll b/llvm/test/CodeGen/DirectX/firstbitlow_error.ll new file mode 100644 index 0000000000000..d8b9333067f4a --- /dev/null +++ b/llvm/test/CodeGen/DirectX/firstbitlow_error.ll @@ -0,0 +1,10 @@ +; RUN: not opt -S -dxil-op-lower -mtriple=dxil-pc-shadermodel6.3-library %s 2>&1 | FileCheck %s + +; DXIL operation firstbitshigh does not support double overload type +; CHECK: invalid intrinsic signature + +define noundef double @firstbitlow_double(double noundef %a) { +entry: + %1 = call double @llvm.dx.firstbitlow.f64(double %a) + ret double %1 +} diff --git a/llvm/test/CodeGen/Hexagon/i128-fpconv-strict.ll b/llvm/test/CodeGen/Hexagon/i128-fpconv-strict.ll new file mode 100644 index 0000000000000..01ae98009d78c --- /dev/null +++ b/llvm/test/CodeGen/Hexagon/i128-fpconv-strict.ll @@ -0,0 +1,81 @@ +; RUN: llc < %s -mtriple=hexagon-unknown-linux-musl \ +; RUN: | FileCheck %s -check-prefix=CHECK + +define i64 @double_to_i128(double %d) nounwind strictfp { +; CHECK-LABEL: double_to_i128: +; CHECK: // %bb.0: +; CHECK: call __fixdfti +; CHECK: dealloc_return + %1 = tail call i128 @llvm.experimental.constrained.fptosi.i128.f64(double %d, metadata !"fpexcept.strict") + %2 = trunc i128 %1 to i64 + ret i64 %2 +} + +define i64 @double_to_ui128(double %d) nounwind strictfp { +; CHECK-LABEL: double_to_ui128: 
+; CHECK: // %bb.0: +; CHECK: call __fixunsdfti +; CHECK: dealloc_return + %1 = tail call i128 @llvm.experimental.constrained.fptoui.i128.f64(double %d, metadata !"fpexcept.strict") + %2 = trunc i128 %1 to i64 + ret i64 %2 +} + +define i64 @float_to_i128(float %d) nounwind strictfp { +; CHECK-LABEL: float_to_i128: +; CHECK: // %bb.0: +; CHECK: call __fixsfti +; CHECK: dealloc_return + %1 = tail call i128 @llvm.experimental.constrained.fptosi.i128.f32(float %d, metadata !"fpexcept.strict") + %2 = trunc i128 %1 to i64 + ret i64 %2 +} + +define i64 @float_to_ui128(float %d) nounwind strictfp { +; CHECK-LABEL: float_to_ui128: +; CHECK: // %bb.0: +; CHECK: call __fixunssfti +; CHECK: dealloc_return + %1 = tail call i128 @llvm.experimental.constrained.fptoui.i128.f32(float %d, metadata !"fpexcept.strict") + %2 = trunc i128 %1 to i64 + ret i64 %2 +} + +define double @ui128_to_double(ptr nocapture readonly %0) nounwind strictfp { +; CHECK-LABEL: ui128_to_double: +; CHECK: // %bb.0: +; CHECK: call __floatuntidf +; CHECK: dealloc_return + %2 = load i128, ptr %0, align 16 + %3 = tail call double @llvm.experimental.constrained.uitofp.f64.i128(i128 %2, metadata !"round.dynamic", metadata !"fpexcept.strict") + ret double %3 +} + +define float @i128_to_float(ptr nocapture readonly %0) nounwind strictfp { +; CHECK-LABEL: i128_to_float: +; CHECK: // %bb.0: +; CHECK: call __floattisf +; CHECK: dealloc_return + %2 = load i128, ptr %0, align 16 + %3 = tail call float @llvm.experimental.constrained.sitofp.f32.i128(i128 %2, metadata !"round.dynamic", metadata !"fpexcept.strict") + ret float %3 +} + +define float @ui128_to_float(ptr nocapture readonly %0) nounwind strictfp { +; CHECK-LABEL: ui128_to_float: +; CHECK: // %bb.0: +; CHECK: call __floatuntisf +; CHECK: dealloc_return + %2 = load i128, ptr %0, align 16 + %3 = tail call float @llvm.experimental.constrained.uitofp.f32.i128(i128 %2, metadata !"round.dynamic", metadata !"fpexcept.strict") + ret float %3 +} + +declare i128 
@llvm.experimental.constrained.fptosi.i128.f64(double, metadata) +declare i128 @llvm.experimental.constrained.fptoui.i128.f64(double, metadata) +declare i128 @llvm.experimental.constrained.fptosi.i128.f32(float, metadata) +declare i128 @llvm.experimental.constrained.fptoui.i128.f32(float, metadata) +declare double @llvm.experimental.constrained.sitofp.f64.i128(i128, metadata, metadata) +declare double @llvm.experimental.constrained.uitofp.f64.i128(i128, metadata, metadata) +declare float @llvm.experimental.constrained.sitofp.f32.i128(i128, metadata, metadata) +declare float @llvm.experimental.constrained.uitofp.f32.i128(i128, metadata, metadata) diff --git a/llvm/test/CodeGen/MIR/AMDGPU/init-whole.wave.ll b/llvm/test/CodeGen/MIR/AMDGPU/init-whole.wave.ll new file mode 100644 index 0000000000000..f3b8deff61918 --- /dev/null +++ b/llvm/test/CodeGen/MIR/AMDGPU/init-whole.wave.ll @@ -0,0 +1,17 @@ +; RUN: llc -global-isel=0 -march=amdgcn -mcpu=gfx1100 -stop-after=finalize-isel < %s | FileCheck --check-prefix=GCN %s +; RUN: llc -global-isel=1 -march=amdgcn -mcpu=gfx1100 -stop-after=finalize-isel < %s | FileCheck --check-prefix=GCN %s + +; GCN-LABEL: name: init_wwm +; GCN: hasInitWholeWave: true +define void @init_wwm(ptr addrspace(1) inreg %p) { +entry: + %entry_exec = call i1 @llvm.amdgcn.init.whole.wave() + br i1 %entry_exec, label %bb.1, label %bb.2 + +bb.1: + store i32 1, ptr addrspace(1) %p + br label %bb.2 + +bb.2: + ret void +} diff --git a/llvm/test/CodeGen/NVPTX/atomics-sm90.ll b/llvm/test/CodeGen/NVPTX/atomics-sm90.ll index f81b785f13225..67552b95e0491 100644 --- a/llvm/test/CodeGen/NVPTX/atomics-sm90.ll +++ b/llvm/test/CodeGen/NVPTX/atomics-sm90.ll @@ -46,58 +46,52 @@ define void @test(ptr %dp0, ptr addrspace(1) %dp1, ptr addrspace(3) %dp3, bfloat ; CHECKPTX71-LABEL: test( ; CHECKPTX71: { ; CHECKPTX71-NEXT: .reg .pred %p<5>; -; CHECKPTX71-NEXT: .reg .b16 %rs<22>; +; CHECKPTX71-NEXT: .reg .b16 %rs<26>; ; CHECKPTX71-NEXT: .reg .b32 %r<4>; -; CHECKPTX71-NEXT: 
.reg .f32 %f<12>; ; CHECKPTX71-EMPTY: ; CHECKPTX71-NEXT: // %bb.0: ; CHECKPTX71-NEXT: ld.param.b16 %rs13, [test_param_3]; ; CHECKPTX71-NEXT: ld.param.u32 %r3, [test_param_2]; ; CHECKPTX71-NEXT: ld.param.u32 %r2, [test_param_1]; ; CHECKPTX71-NEXT: ld.param.u32 %r1, [test_param_0]; -; CHECKPTX71-NEXT: ld.b16 %rs18, [%r1]; -; CHECKPTX71-NEXT: cvt.f32.bf16 %f1, %rs13; +; CHECKPTX71-NEXT: ld.b16 %rs22, [%r1]; ; CHECKPTX71-NEXT: $L__BB0_1: // %atomicrmw.start14 ; CHECKPTX71-NEXT: // =>This Inner Loop Header: Depth=1 -; CHECKPTX71-NEXT: cvt.f32.bf16 %f2, %rs18; -; CHECKPTX71-NEXT: add.rn.f32 %f3, %f2, %f1; -; CHECKPTX71-NEXT: cvt.rn.bf16.f32 %rs14, %f3; -; CHECKPTX71-NEXT: atom.cas.b16 %rs3, [%r1], %rs18, %rs14; -; CHECKPTX71-NEXT: setp.ne.s16 %p1, %rs3, %rs18; -; CHECKPTX71-NEXT: mov.u16 %rs18, %rs3; +; CHECKPTX71-NEXT: mov.b16 %rs14, 0x3F80; +; CHECKPTX71-NEXT: fma.rn.bf16 %rs15, %rs22, %rs14, %rs13; +; CHECKPTX71-NEXT: atom.cas.b16 %rs3, [%r1], %rs22, %rs15; +; CHECKPTX71-NEXT: setp.ne.s16 %p1, %rs3, %rs22; +; CHECKPTX71-NEXT: mov.u16 %rs22, %rs3; ; CHECKPTX71-NEXT: @%p1 bra $L__BB0_1; ; CHECKPTX71-NEXT: // %bb.2: // %atomicrmw.end13 -; CHECKPTX71-NEXT: ld.b16 %rs19, [%r1]; +; CHECKPTX71-NEXT: ld.b16 %rs23, [%r1]; ; CHECKPTX71-NEXT: $L__BB0_3: // %atomicrmw.start8 ; CHECKPTX71-NEXT: // =>This Inner Loop Header: Depth=1 -; CHECKPTX71-NEXT: cvt.f32.bf16 %f4, %rs19; -; CHECKPTX71-NEXT: add.rn.f32 %f5, %f4, 0f3F800000; -; CHECKPTX71-NEXT: cvt.rn.bf16.f32 %rs15, %f5; -; CHECKPTX71-NEXT: atom.cas.b16 %rs6, [%r1], %rs19, %rs15; -; CHECKPTX71-NEXT: setp.ne.s16 %p2, %rs6, %rs19; -; CHECKPTX71-NEXT: mov.u16 %rs19, %rs6; +; CHECKPTX71-NEXT: mov.b16 %rs16, 0x3F80; +; CHECKPTX71-NEXT: fma.rn.bf16 %rs17, %rs23, %rs16, %rs16; +; CHECKPTX71-NEXT: atom.cas.b16 %rs6, [%r1], %rs23, %rs17; +; CHECKPTX71-NEXT: setp.ne.s16 %p2, %rs6, %rs23; +; CHECKPTX71-NEXT: mov.u16 %rs23, %rs6; ; CHECKPTX71-NEXT: @%p2 bra $L__BB0_3; ; CHECKPTX71-NEXT: // %bb.4: // %atomicrmw.end7 -; CHECKPTX71-NEXT: 
ld.global.b16 %rs20, [%r2]; +; CHECKPTX71-NEXT: ld.global.b16 %rs24, [%r2]; ; CHECKPTX71-NEXT: $L__BB0_5: // %atomicrmw.start2 ; CHECKPTX71-NEXT: // =>This Inner Loop Header: Depth=1 -; CHECKPTX71-NEXT: cvt.f32.bf16 %f7, %rs20; -; CHECKPTX71-NEXT: add.rn.f32 %f8, %f7, %f1; -; CHECKPTX71-NEXT: cvt.rn.bf16.f32 %rs16, %f8; -; CHECKPTX71-NEXT: atom.global.cas.b16 %rs9, [%r2], %rs20, %rs16; -; CHECKPTX71-NEXT: setp.ne.s16 %p3, %rs9, %rs20; -; CHECKPTX71-NEXT: mov.u16 %rs20, %rs9; +; CHECKPTX71-NEXT: mov.b16 %rs18, 0x3F80; +; CHECKPTX71-NEXT: fma.rn.bf16 %rs19, %rs24, %rs18, %rs13; +; CHECKPTX71-NEXT: atom.global.cas.b16 %rs9, [%r2], %rs24, %rs19; +; CHECKPTX71-NEXT: setp.ne.s16 %p3, %rs9, %rs24; +; CHECKPTX71-NEXT: mov.u16 %rs24, %rs9; ; CHECKPTX71-NEXT: @%p3 bra $L__BB0_5; ; CHECKPTX71-NEXT: // %bb.6: // %atomicrmw.end1 -; CHECKPTX71-NEXT: ld.shared.b16 %rs21, [%r3]; +; CHECKPTX71-NEXT: ld.shared.b16 %rs25, [%r3]; ; CHECKPTX71-NEXT: $L__BB0_7: // %atomicrmw.start ; CHECKPTX71-NEXT: // =>This Inner Loop Header: Depth=1 -; CHECKPTX71-NEXT: cvt.f32.bf16 %f10, %rs21; -; CHECKPTX71-NEXT: add.rn.f32 %f11, %f10, %f1; -; CHECKPTX71-NEXT: cvt.rn.bf16.f32 %rs17, %f11; -; CHECKPTX71-NEXT: atom.shared.cas.b16 %rs12, [%r3], %rs21, %rs17; -; CHECKPTX71-NEXT: setp.ne.s16 %p4, %rs12, %rs21; -; CHECKPTX71-NEXT: mov.u16 %rs21, %rs12; +; CHECKPTX71-NEXT: mov.b16 %rs20, 0x3F80; +; CHECKPTX71-NEXT: fma.rn.bf16 %rs21, %rs25, %rs20, %rs13; +; CHECKPTX71-NEXT: atom.shared.cas.b16 %rs12, [%r3], %rs25, %rs21; +; CHECKPTX71-NEXT: setp.ne.s16 %p4, %rs12, %rs25; +; CHECKPTX71-NEXT: mov.u16 %rs25, %rs12; ; CHECKPTX71-NEXT: @%p4 bra $L__BB0_7; ; CHECKPTX71-NEXT: // %bb.8: // %atomicrmw.end ; CHECKPTX71-NEXT: ret; diff --git a/llvm/test/CodeGen/NVPTX/bf16-instructions.ll b/llvm/test/CodeGen/NVPTX/bf16-instructions.ll index 6828bac18cad7..0c1b1e2166928 100644 --- a/llvm/test/CodeGen/NVPTX/bf16-instructions.ll +++ b/llvm/test/CodeGen/NVPTX/bf16-instructions.ll @@ -42,17 +42,14 @@ define bfloat 
@test_fadd(bfloat %0, bfloat %1) { ; ; SM80-LABEL: test_fadd( ; SM80: { -; SM80-NEXT: .reg .b16 %rs<4>; -; SM80-NEXT: .reg .f32 %f<4>; +; SM80-NEXT: .reg .b16 %rs<5>; ; SM80-EMPTY: ; SM80-NEXT: // %bb.0: ; SM80-NEXT: ld.param.b16 %rs1, [test_fadd_param_0]; ; SM80-NEXT: ld.param.b16 %rs2, [test_fadd_param_1]; -; SM80-NEXT: cvt.f32.bf16 %f1, %rs2; -; SM80-NEXT: cvt.f32.bf16 %f2, %rs1; -; SM80-NEXT: add.rn.f32 %f3, %f2, %f1; -; SM80-NEXT: cvt.rn.bf16.f32 %rs3, %f3; -; SM80-NEXT: st.param.b16 [func_retval0], %rs3; +; SM80-NEXT: mov.b16 %rs3, 0x3F80; +; SM80-NEXT: fma.rn.bf16 %rs4, %rs1, %rs3, %rs2; +; SM80-NEXT: st.param.b16 [func_retval0], %rs4; ; SM80-NEXT: ret; ; ; SM80-FTZ-LABEL: test_fadd( @@ -113,17 +110,14 @@ define bfloat @test_fsub(bfloat %0, bfloat %1) { ; ; SM80-LABEL: test_fsub( ; SM80: { -; SM80-NEXT: .reg .b16 %rs<4>; -; SM80-NEXT: .reg .f32 %f<4>; +; SM80-NEXT: .reg .b16 %rs<5>; ; SM80-EMPTY: ; SM80-NEXT: // %bb.0: ; SM80-NEXT: ld.param.b16 %rs1, [test_fsub_param_0]; -; SM80-NEXT: ld.param.b16 %rs2, [test_fsub_param_1]; -; SM80-NEXT: cvt.f32.bf16 %f1, %rs2; -; SM80-NEXT: cvt.f32.bf16 %f2, %rs1; -; SM80-NEXT: sub.rn.f32 %f3, %f2, %f1; -; SM80-NEXT: cvt.rn.bf16.f32 %rs3, %f3; -; SM80-NEXT: st.param.b16 [func_retval0], %rs3; +; SM80-NEXT: mov.b16 %rs2, 0xBF80; +; SM80-NEXT: ld.param.b16 %rs3, [test_fsub_param_1]; +; SM80-NEXT: fma.rn.bf16 %rs4, %rs3, %rs2, %rs1; +; SM80-NEXT: st.param.b16 [func_retval0], %rs4; ; SM80-NEXT: ret; ; ; SM80-FTZ-LABEL: test_fsub( @@ -202,23 +196,14 @@ define <2 x bfloat> @test_faddx2(<2 x bfloat> %a, <2 x bfloat> %b) #0 { ; ; SM80-LABEL: test_faddx2( ; SM80: { -; SM80-NEXT: .reg .b16 %rs<5>; -; SM80-NEXT: .reg .b32 %r<4>; -; SM80-NEXT: .reg .f32 %f<7>; +; SM80-NEXT: .reg .b32 %r<5>; ; SM80-EMPTY: ; SM80-NEXT: // %bb.0: -; SM80-NEXT: ld.param.b32 %r1, [test_faddx2_param_0]; -; SM80-NEXT: ld.param.b32 %r2, [test_faddx2_param_1]; -; SM80-NEXT: mov.b32 {%rs1, %rs2}, %r2; -; SM80-NEXT: cvt.f32.bf16 %f1, %rs1; -; SM80-NEXT: mov.b32 
{%rs3, %rs4}, %r1; -; SM80-NEXT: cvt.f32.bf16 %f2, %rs3; -; SM80-NEXT: add.rn.f32 %f3, %f2, %f1; -; SM80-NEXT: cvt.f32.bf16 %f4, %rs2; -; SM80-NEXT: cvt.f32.bf16 %f5, %rs4; -; SM80-NEXT: add.rn.f32 %f6, %f5, %f4; -; SM80-NEXT: cvt.rn.bf16x2.f32 %r3, %f6, %f3; -; SM80-NEXT: st.param.b32 [func_retval0], %r3; +; SM80-NEXT: ld.param.b32 %r1, [test_faddx2_param_1]; +; SM80-NEXT: ld.param.b32 %r2, [test_faddx2_param_0]; +; SM80-NEXT: mov.b32 %r3, 1065369472; +; SM80-NEXT: fma.rn.bf16x2 %r4, %r2, %r3, %r1; +; SM80-NEXT: st.param.b32 [func_retval0], %r4; ; SM80-NEXT: ret; ; ; SM80-FTZ-LABEL: test_faddx2( @@ -303,23 +288,14 @@ define <2 x bfloat> @test_fsubx2(<2 x bfloat> %a, <2 x bfloat> %b) #0 { ; ; SM80-LABEL: test_fsubx2( ; SM80: { -; SM80-NEXT: .reg .b16 %rs<5>; -; SM80-NEXT: .reg .b32 %r<4>; -; SM80-NEXT: .reg .f32 %f<7>; +; SM80-NEXT: .reg .b32 %r<5>; ; SM80-EMPTY: ; SM80-NEXT: // %bb.0: ; SM80-NEXT: ld.param.b32 %r1, [test_fsubx2_param_0]; ; SM80-NEXT: ld.param.b32 %r2, [test_fsubx2_param_1]; -; SM80-NEXT: mov.b32 {%rs1, %rs2}, %r2; -; SM80-NEXT: cvt.f32.bf16 %f1, %rs1; -; SM80-NEXT: mov.b32 {%rs3, %rs4}, %r1; -; SM80-NEXT: cvt.f32.bf16 %f2, %rs3; -; SM80-NEXT: sub.rn.f32 %f3, %f2, %f1; -; SM80-NEXT: cvt.f32.bf16 %f4, %rs2; -; SM80-NEXT: cvt.f32.bf16 %f5, %rs4; -; SM80-NEXT: sub.rn.f32 %f6, %f5, %f4; -; SM80-NEXT: cvt.rn.bf16x2.f32 %r3, %f6, %f3; -; SM80-NEXT: st.param.b32 [func_retval0], %r3; +; SM80-NEXT: mov.b32 %r3, -1082081408; +; SM80-NEXT: fma.rn.bf16x2 %r4, %r2, %r3, %r1; +; SM80-NEXT: st.param.b32 [func_retval0], %r4; ; SM80-NEXT: ret; ; ; SM80-FTZ-LABEL: test_fsubx2( @@ -404,23 +380,14 @@ define <2 x bfloat> @test_fmulx2(<2 x bfloat> %a, <2 x bfloat> %b) #0 { ; ; SM80-LABEL: test_fmulx2( ; SM80: { -; SM80-NEXT: .reg .b16 %rs<5>; -; SM80-NEXT: .reg .b32 %r<4>; -; SM80-NEXT: .reg .f32 %f<7>; +; SM80-NEXT: .reg .b32 %r<5>; ; SM80-EMPTY: ; SM80-NEXT: // %bb.0: -; SM80-NEXT: ld.param.b32 %r1, [test_fmulx2_param_0]; -; SM80-NEXT: ld.param.b32 %r2, 
[test_fmulx2_param_1]; -; SM80-NEXT: mov.b32 {%rs1, %rs2}, %r2; -; SM80-NEXT: cvt.f32.bf16 %f1, %rs1; -; SM80-NEXT: mov.b32 {%rs3, %rs4}, %r1; -; SM80-NEXT: cvt.f32.bf16 %f2, %rs3; -; SM80-NEXT: mul.rn.f32 %f3, %f2, %f1; -; SM80-NEXT: cvt.f32.bf16 %f4, %rs2; -; SM80-NEXT: cvt.f32.bf16 %f5, %rs4; -; SM80-NEXT: mul.rn.f32 %f6, %f5, %f4; -; SM80-NEXT: cvt.rn.bf16x2.f32 %r3, %f6, %f3; -; SM80-NEXT: st.param.b32 [func_retval0], %r3; +; SM80-NEXT: ld.param.b32 %r1, [test_fmulx2_param_1]; +; SM80-NEXT: ld.param.b32 %r2, [test_fmulx2_param_0]; +; SM80-NEXT: mov.b32 %r3, -2147450880; +; SM80-NEXT: fma.rn.bf16x2 %r4, %r2, %r1, %r3; +; SM80-NEXT: st.param.b32 [func_retval0], %r4; ; SM80-NEXT: ret; ; ; SM80-FTZ-LABEL: test_fmulx2( @@ -727,15 +694,13 @@ define bfloat @test_fadd_imm_1(bfloat %a) #0 { ; ; SM80-LABEL: test_fadd_imm_1( ; SM80: { -; SM80-NEXT: .reg .b16 %rs<3>; -; SM80-NEXT: .reg .f32 %f<3>; +; SM80-NEXT: .reg .b16 %rs<4>; ; SM80-EMPTY: ; SM80-NEXT: // %bb.0: ; SM80-NEXT: ld.param.b16 %rs1, [test_fadd_imm_1_param_0]; -; SM80-NEXT: cvt.f32.bf16 %f1, %rs1; -; SM80-NEXT: add.rn.f32 %f2, %f1, 0f3F800000; -; SM80-NEXT: cvt.rn.bf16.f32 %rs2, %f2; -; SM80-NEXT: st.param.b16 [func_retval0], %rs2; +; SM80-NEXT: mov.b16 %rs2, 0x3F80; +; SM80-NEXT: fma.rn.bf16 %rs3, %rs1, %rs2, %rs2; +; SM80-NEXT: st.param.b16 [func_retval0], %rs3; ; SM80-NEXT: ret; ; ; SM80-FTZ-LABEL: test_fadd_imm_1( diff --git a/llvm/test/CodeGen/NVPTX/bf16x2-instructions.ll b/llvm/test/CodeGen/NVPTX/bf16x2-instructions.ll index 03cdeb9683aba..e6d35bd5ba536 100644 --- a/llvm/test/CodeGen/NVPTX/bf16x2-instructions.ll +++ b/llvm/test/CodeGen/NVPTX/bf16x2-instructions.ll @@ -22,19 +22,14 @@ define <2 x bfloat> @test_ret_const() #0 { define <2 x bfloat> @test_fadd_imm_0(<2 x bfloat> %a) #0 { ; SM80-LABEL: test_fadd_imm_0( ; SM80: { -; SM80-NEXT: .reg .b16 %rs<3>; -; SM80-NEXT: .reg .b32 %r<3>; -; SM80-NEXT: .reg .f32 %f<5>; +; SM80-NEXT: .reg .b32 %r<5>; ; SM80-EMPTY: ; SM80-NEXT: // %bb.0: ; SM80-NEXT: 
ld.param.b32 %r1, [test_fadd_imm_0_param_0]; -; SM80-NEXT: mov.b32 {%rs1, %rs2}, %r1; -; SM80-NEXT: cvt.f32.bf16 %f1, %rs1; -; SM80-NEXT: add.rn.f32 %f2, %f1, 0f3F800000; -; SM80-NEXT: cvt.f32.bf16 %f3, %rs2; -; SM80-NEXT: add.rn.f32 %f4, %f3, 0f40000000; -; SM80-NEXT: cvt.rn.bf16x2.f32 %r2, %f4, %f2; -; SM80-NEXT: st.param.b32 [func_retval0], %r2; +; SM80-NEXT: mov.b32 %r2, 1065369472; +; SM80-NEXT: mov.b32 %r3, 1073758080; +; SM80-NEXT: fma.rn.bf16x2 %r4, %r1, %r2, %r3; +; SM80-NEXT: st.param.b32 [func_retval0], %r4; ; SM80-NEXT: ret; ; ; SM90-LABEL: test_fadd_imm_0( @@ -54,15 +49,13 @@ define <2 x bfloat> @test_fadd_imm_0(<2 x bfloat> %a) #0 { define bfloat @test_fadd_imm_1(bfloat %a) #0 { ; SM80-LABEL: test_fadd_imm_1( ; SM80: { -; SM80-NEXT: .reg .b16 %rs<3>; -; SM80-NEXT: .reg .f32 %f<3>; +; SM80-NEXT: .reg .b16 %rs<4>; ; SM80-EMPTY: ; SM80-NEXT: // %bb.0: ; SM80-NEXT: ld.param.b16 %rs1, [test_fadd_imm_1_param_0]; -; SM80-NEXT: cvt.f32.bf16 %f1, %rs1; -; SM80-NEXT: add.rn.f32 %f2, %f1, 0f3F800000; -; SM80-NEXT: cvt.rn.bf16.f32 %rs2, %f2; -; SM80-NEXT: st.param.b16 [func_retval0], %rs2; +; SM80-NEXT: mov.b16 %rs2, 0x3F80; +; SM80-NEXT: fma.rn.bf16 %rs3, %rs1, %rs2, %rs2; +; SM80-NEXT: st.param.b16 [func_retval0], %rs3; ; SM80-NEXT: ret; ; ; SM90-LABEL: test_fadd_imm_1( @@ -82,23 +75,14 @@ define bfloat @test_fadd_imm_1(bfloat %a) #0 { define <2 x bfloat> @test_fsubx2(<2 x bfloat> %a, <2 x bfloat> %b) #0 { ; SM80-LABEL: test_fsubx2( ; SM80: { -; SM80-NEXT: .reg .b16 %rs<5>; -; SM80-NEXT: .reg .b32 %r<4>; -; SM80-NEXT: .reg .f32 %f<7>; +; SM80-NEXT: .reg .b32 %r<5>; ; SM80-EMPTY: ; SM80-NEXT: // %bb.0: ; SM80-NEXT: ld.param.b32 %r1, [test_fsubx2_param_0]; ; SM80-NEXT: ld.param.b32 %r2, [test_fsubx2_param_1]; -; SM80-NEXT: mov.b32 {%rs1, %rs2}, %r2; -; SM80-NEXT: cvt.f32.bf16 %f1, %rs1; -; SM80-NEXT: mov.b32 {%rs3, %rs4}, %r1; -; SM80-NEXT: cvt.f32.bf16 %f2, %rs3; -; SM80-NEXT: sub.rn.f32 %f3, %f2, %f1; -; SM80-NEXT: cvt.f32.bf16 %f4, %rs2; -; SM80-NEXT: 
cvt.f32.bf16 %f5, %rs4; -; SM80-NEXT: sub.rn.f32 %f6, %f5, %f4; -; SM80-NEXT: cvt.rn.bf16x2.f32 %r3, %f6, %f3; -; SM80-NEXT: st.param.b32 [func_retval0], %r3; +; SM80-NEXT: mov.b32 %r3, -1082081408; +; SM80-NEXT: fma.rn.bf16x2 %r4, %r2, %r3, %r1; +; SM80-NEXT: st.param.b32 [func_retval0], %r4; ; SM80-NEXT: ret; ; ; SM90-LABEL: test_fsubx2( @@ -118,23 +102,14 @@ define <2 x bfloat> @test_fsubx2(<2 x bfloat> %a, <2 x bfloat> %b) #0 { define <2 x bfloat> @test_fmulx2(<2 x bfloat> %a, <2 x bfloat> %b) #0 { ; SM80-LABEL: test_fmulx2( ; SM80: { -; SM80-NEXT: .reg .b16 %rs<5>; -; SM80-NEXT: .reg .b32 %r<4>; -; SM80-NEXT: .reg .f32 %f<7>; +; SM80-NEXT: .reg .b32 %r<5>; ; SM80-EMPTY: ; SM80-NEXT: // %bb.0: -; SM80-NEXT: ld.param.b32 %r1, [test_fmulx2_param_0]; -; SM80-NEXT: ld.param.b32 %r2, [test_fmulx2_param_1]; -; SM80-NEXT: mov.b32 {%rs1, %rs2}, %r2; -; SM80-NEXT: cvt.f32.bf16 %f1, %rs1; -; SM80-NEXT: mov.b32 {%rs3, %rs4}, %r1; -; SM80-NEXT: cvt.f32.bf16 %f2, %rs3; -; SM80-NEXT: mul.rn.f32 %f3, %f2, %f1; -; SM80-NEXT: cvt.f32.bf16 %f4, %rs2; -; SM80-NEXT: cvt.f32.bf16 %f5, %rs4; -; SM80-NEXT: mul.rn.f32 %f6, %f5, %f4; -; SM80-NEXT: cvt.rn.bf16x2.f32 %r3, %f6, %f3; -; SM80-NEXT: st.param.b32 [func_retval0], %r3; +; SM80-NEXT: ld.param.b32 %r1, [test_fmulx2_param_1]; +; SM80-NEXT: ld.param.b32 %r2, [test_fmulx2_param_0]; +; SM80-NEXT: mov.b32 %r3, -2147450880; +; SM80-NEXT: fma.rn.bf16x2 %r4, %r2, %r1, %r3; +; SM80-NEXT: st.param.b32 [func_retval0], %r4; ; SM80-NEXT: ret; ; ; SM90-LABEL: test_fmulx2( @@ -543,30 +518,16 @@ define <2 x bfloat> @test_fabs(<2 x bfloat> %a) #0 { define <2 x bfloat> @test_fabs_add(<2 x bfloat> %a, <2 x bfloat> %b) #0 { ; SM80-LABEL: test_fabs_add( ; SM80: { -; SM80-NEXT: .reg .b16 %rs<7>; -; SM80-NEXT: .reg .b32 %r<6>; -; SM80-NEXT: .reg .f32 %f<11>; +; SM80-NEXT: .reg .b32 %r<7>; ; SM80-EMPTY: ; SM80-NEXT: // %bb.0: ; SM80-NEXT: ld.param.b32 %r1, [test_fabs_add_param_1]; ; SM80-NEXT: ld.param.b32 %r2, [test_fabs_add_param_0]; -; SM80-NEXT: 
mov.b32 {%rs1, %rs2}, %r2; -; SM80-NEXT: cvt.f32.bf16 %f1, %rs1; -; SM80-NEXT: add.rn.f32 %f2, %f1, %f1; -; SM80-NEXT: cvt.f32.bf16 %f3, %rs2; -; SM80-NEXT: add.rn.f32 %f4, %f3, %f3; -; SM80-NEXT: cvt.rn.bf16x2.f32 %r3, %f4, %f2; -; SM80-NEXT: abs.bf16x2 %r4, %r3; -; SM80-NEXT: mov.b32 {%rs3, %rs4}, %r4; -; SM80-NEXT: cvt.f32.bf16 %f5, %rs3; -; SM80-NEXT: mov.b32 {%rs5, %rs6}, %r1; -; SM80-NEXT: cvt.f32.bf16 %f6, %rs5; -; SM80-NEXT: add.rn.f32 %f7, %f5, %f6; -; SM80-NEXT: cvt.f32.bf16 %f8, %rs4; -; SM80-NEXT: cvt.f32.bf16 %f9, %rs6; -; SM80-NEXT: add.rn.f32 %f10, %f8, %f9; -; SM80-NEXT: cvt.rn.bf16x2.f32 %r5, %f10, %f7; -; SM80-NEXT: st.param.b32 [func_retval0], %r5; +; SM80-NEXT: mov.b32 %r3, 1065369472; +; SM80-NEXT: fma.rn.bf16x2 %r4, %r2, %r3, %r2; +; SM80-NEXT: abs.bf16x2 %r5, %r4; +; SM80-NEXT: fma.rn.bf16x2 %r6, %r5, %r3, %r1; +; SM80-NEXT: st.param.b32 [func_retval0], %r6; ; SM80-NEXT: ret; ; ; SM90-LABEL: test_fabs_add( @@ -802,45 +763,18 @@ define <2 x bfloat> @test_round(<2 x bfloat> %a) #0 { } define <2 x bfloat> @test_copysign(<2 x bfloat> %a, <2 x bfloat> %b) #0 { -; SM80-LABEL: test_copysign( -; SM80: { -; SM80-NEXT: .reg .pred %p<3>; -; SM80-NEXT: .reg .b16 %rs<15>; -; SM80-NEXT: .reg .b32 %r<4>; -; SM80-EMPTY: -; SM80-NEXT: // %bb.0: -; SM80-NEXT: ld.param.b32 %r1, [test_copysign_param_1]; -; SM80-NEXT: ld.param.b32 %r2, [test_copysign_param_0]; -; SM80-NEXT: mov.b32 {%rs1, %rs2}, %r2; -; SM80-NEXT: abs.bf16 %rs3, %rs2; -; SM80-NEXT: neg.bf16 %rs4, %rs3; -; SM80-NEXT: mov.b32 {%rs5, %rs6}, %r1; -; SM80-NEXT: shr.u16 %rs7, %rs6, 15; -; SM80-NEXT: and.b16 %rs8, %rs7, 1; -; SM80-NEXT: setp.eq.b16 %p1, %rs8, 1; -; SM80-NEXT: selp.b16 %rs9, %rs4, %rs3, %p1; -; SM80-NEXT: abs.bf16 %rs10, %rs1; -; SM80-NEXT: neg.bf16 %rs11, %rs10; -; SM80-NEXT: shr.u16 %rs12, %rs5, 15; -; SM80-NEXT: and.b16 %rs13, %rs12, 1; -; SM80-NEXT: setp.eq.b16 %p2, %rs13, 1; -; SM80-NEXT: selp.b16 %rs14, %rs11, %rs10, %p2; -; SM80-NEXT: mov.b32 %r3, {%rs14, %rs9}; -; SM80-NEXT: 
st.param.b32 [func_retval0], %r3; -; SM80-NEXT: ret; -; -; SM90-LABEL: test_copysign( -; SM90: { -; SM90-NEXT: .reg .b32 %r<6>; -; SM90-EMPTY: -; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b32 %r1, [test_copysign_param_0]; -; SM90-NEXT: ld.param.b32 %r2, [test_copysign_param_1]; -; SM90-NEXT: and.b32 %r3, %r2, -2147450880; -; SM90-NEXT: and.b32 %r4, %r1, 2147450879; -; SM90-NEXT: or.b32 %r5, %r4, %r3; -; SM90-NEXT: st.param.b32 [func_retval0], %r5; -; SM90-NEXT: ret; +; CHECK-LABEL: test_copysign( +; CHECK: { +; CHECK-NEXT: .reg .b32 %r<6>; +; CHECK-EMPTY: +; CHECK-NEXT: // %bb.0: +; CHECK-NEXT: ld.param.b32 %r1, [test_copysign_param_0]; +; CHECK-NEXT: ld.param.b32 %r2, [test_copysign_param_1]; +; CHECK-NEXT: and.b32 %r3, %r2, -2147450880; +; CHECK-NEXT: and.b32 %r4, %r1, 2147450879; +; CHECK-NEXT: or.b32 %r5, %r4, %r3; +; CHECK-NEXT: st.param.b32 [func_retval0], %r5; +; CHECK-NEXT: ret; %r = call <2 x bfloat> @llvm.copysign.f16(<2 x bfloat> %a, <2 x bfloat> %b) ret <2 x bfloat> %r } diff --git a/llvm/test/CodeGen/NVPTX/combine-mad.ll b/llvm/test/CodeGen/NVPTX/combine-mad.ll index 1b22cfde39725..304025fdb15fe 100644 --- a/llvm/test/CodeGen/NVPTX/combine-mad.ll +++ b/llvm/test/CodeGen/NVPTX/combine-mad.ll @@ -183,3 +183,58 @@ define i32 @test4_rev(i32 %a, i32 %b, i32 %c, i1 %p) { %add = add i32 %c, %sel ret i32 %add } + +declare i32 @use(i32 %0, i32 %1) + +define i32 @test_mad_multi_use(i32 %a, i32 %b, i32 %c) { +; CHECK-LABEL: test_mad_multi_use( +; CHECK: { +; CHECK-NEXT: .reg .b32 %r<8>; +; CHECK-EMPTY: +; CHECK-NEXT: // %bb.0: +; CHECK-NEXT: ld.param.u32 %r1, [test_mad_multi_use_param_0]; +; CHECK-NEXT: ld.param.u32 %r2, [test_mad_multi_use_param_1]; +; CHECK-NEXT: mul.lo.s32 %r3, %r1, %r2; +; CHECK-NEXT: ld.param.u32 %r4, [test_mad_multi_use_param_2]; +; CHECK-NEXT: add.s32 %r5, %r3, %r4; +; CHECK-NEXT: { // callseq 0, 0 +; CHECK-NEXT: .param .b32 param0; +; CHECK-NEXT: st.param.b32 [param0], %r3; +; CHECK-NEXT: .param .b32 param1; +; CHECK-NEXT: st.param.b32 
[param1], %r5; +; CHECK-NEXT: .param .b32 retval0; +; CHECK-NEXT: call.uni (retval0), +; CHECK-NEXT: use, +; CHECK-NEXT: ( +; CHECK-NEXT: param0, +; CHECK-NEXT: param1 +; CHECK-NEXT: ); +; CHECK-NEXT: ld.param.b32 %r6, [retval0]; +; CHECK-NEXT: } // callseq 0 +; CHECK-NEXT: st.param.b32 [func_retval0], %r6; +; CHECK-NEXT: ret; + %mul = mul i32 %a, %b + %add = add i32 %mul, %c + %res = call i32 @use(i32 %mul, i32 %add) + ret i32 %res +} + +;; This case relies on mad x 1 y => add x y, previously we emit: +;; mad.lo.s32 %r3, %r1, 1, %r2; +define i32 @test_mad_fold(i32 %x) { +; CHECK-LABEL: test_mad_fold( +; CHECK: { +; CHECK-NEXT: .reg .b32 %r<7>; +; CHECK-EMPTY: +; CHECK-NEXT: // %bb.0: +; CHECK-NEXT: ld.param.u32 %r1, [test_mad_fold_param_0]; +; CHECK-NEXT: mul.hi.s32 %r2, %r1, -2147221471; +; CHECK-NEXT: add.s32 %r3, %r2, %r1; +; CHECK-NEXT: shr.u32 %r4, %r3, 31; +; CHECK-NEXT: shr.s32 %r5, %r3, 12; +; CHECK-NEXT: add.s32 %r6, %r5, %r4; +; CHECK-NEXT: st.param.b32 [func_retval0], %r6; +; CHECK-NEXT: ret; + %div = sdiv i32 %x, 8191 + ret i32 %div +} diff --git a/llvm/test/CodeGen/NVPTX/dynamic_stackalloc.ll b/llvm/test/CodeGen/NVPTX/dynamic_stackalloc.ll index 27a523b9dd91d..de19d2983f343 100644 --- a/llvm/test/CodeGen/NVPTX/dynamic_stackalloc.ll +++ b/llvm/test/CodeGen/NVPTX/dynamic_stackalloc.ll @@ -12,7 +12,7 @@ ; CHECK-NOT: __local_depot ; CHECK-32: ld.param.u32 %r[[SIZE:[0-9]]], [test_dynamic_stackalloc_param_0]; -; CHECK-32-NEXT: mad.lo.s32 %r[[SIZE2:[0-9]]], %r[[SIZE]], 1, 7; +; CHECK-32-NEXT: add.s32 %r[[SIZE2:[0-9]]], %r[[SIZE]], 7; ; CHECK-32-NEXT: and.b32 %r[[SIZE3:[0-9]]], %r[[SIZE2]], -8; ; CHECK-32-NEXT: alloca.u32 %r[[ALLOCA:[0-9]]], %r[[SIZE3]], 16; ; CHECK-32-NEXT: cvta.local.u32 %r[[ALLOCA]], %r[[ALLOCA]]; diff --git a/llvm/test/CodeGen/NVPTX/f16-ex2.ll b/llvm/test/CodeGen/NVPTX/f16-ex2.ll index df3a36db52b1a..ae70946b4b1dc 100644 --- a/llvm/test/CodeGen/NVPTX/f16-ex2.ll +++ b/llvm/test/CodeGen/NVPTX/f16-ex2.ll @@ -1,21 +1,37 @@ -; RUN: llc < %s 
-mtriple=nvptx64 -mcpu=sm_75 -mattr=+ptx70 | FileCheck %s -; RUN: %if ptxas-11.0 %{ llc < %s -mtriple=nvptx64 -mcpu=sm_75 -mattr=+ptx70 | %ptxas-verify -arch=sm_75 %} +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5 +; RUN: llc < %s -mcpu=sm_75 -mattr=+ptx70 | FileCheck --check-prefixes=CHECK-FP16 %s +; RUN: %if ptxas-11.0 %{ llc < %s -mcpu=sm_75 -mattr=+ptx70 | %ptxas-verify -arch=sm_75 %} +target triple = "nvptx64-nvidia-cuda" declare half @llvm.nvvm.ex2.approx.f16(half) declare <2 x half> @llvm.nvvm.ex2.approx.f16x2(<2 x half>) -; CHECK-LABEL: exp2_half -define half @exp2_half(half %0) { - ; CHECK-NOT: call - ; CHECK: ex2.approx.f16 - %res = call half @llvm.nvvm.ex2.approx.f16(half %0); +; CHECK-LABEL: ex2_half +define half @ex2_half(half %0) { +; CHECK-FP16-LABEL: ex2_half( +; CHECK-FP16: { +; CHECK-FP16-NEXT: .reg .b16 %rs<3>; +; CHECK-FP16-EMPTY: +; CHECK-FP16-NEXT: // %bb.0: +; CHECK-FP16-NEXT: ld.param.b16 %rs1, [ex2_half_param_0]; +; CHECK-FP16-NEXT: ex2.approx.f16 %rs2, %rs1; +; CHECK-FP16-NEXT: st.param.b16 [func_retval0], %rs2; +; CHECK-FP16-NEXT: ret; + %res = call half @llvm.nvvm.ex2.approx.f16(half %0) ret half %res } -; CHECK-LABEL: exp2_2xhalf -define <2 x half> @exp2_2xhalf(<2 x half> %0) { - ; CHECK-NOT: call - ; CHECK: ex2.approx.f16x2 - %res = call <2 x half> @llvm.nvvm.ex2.approx.f16x2(<2 x half> %0); +; CHECK-LABEL: ex2_2xhalf +define <2 x half> @ex2_2xhalf(<2 x half> %0) { +; CHECK-FP16-LABEL: ex2_2xhalf( +; CHECK-FP16: { +; CHECK-FP16-NEXT: .reg .b32 %r<3>; +; CHECK-FP16-EMPTY: +; CHECK-FP16-NEXT: // %bb.0: +; CHECK-FP16-NEXT: ld.param.b32 %r1, [ex2_2xhalf_param_0]; +; CHECK-FP16-NEXT: ex2.approx.f16x2 %r2, %r1; +; CHECK-FP16-NEXT: st.param.b32 [func_retval0], %r2; +; CHECK-FP16-NEXT: ret; + %res = call <2 x half> @llvm.nvvm.ex2.approx.f16x2(<2 x half> %0) ret <2 x half> %res } diff --git a/llvm/test/CodeGen/NVPTX/f32-ex2.ll b/llvm/test/CodeGen/NVPTX/f32-ex2.ll new file mode 100644 index 
0000000000000..c9eff2a8ff17d --- /dev/null +++ b/llvm/test/CodeGen/NVPTX/f32-ex2.ll @@ -0,0 +1,36 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5 +; RUN: llc < %s -mcpu=sm_50 -mattr=+ptx32 | FileCheck --check-prefixes=CHECK %s +; RUN: %if ptxas-11.0 %{ llc < %s -mtriple=nvptx64 -mcpu=sm_50 -mattr=+ptx32 | %ptxas-verify -arch=sm_50 %} +target triple = "nvptx-nvidia-cuda" + +declare float @llvm.nvvm.ex2.approx.f(float) + +; CHECK-LABEL: ex2_float +define float @ex2_float(float %0) { +; CHECK-LABEL: ex2_float( +; CHECK: { +; CHECK-NEXT: .reg .f32 %f<3>; +; CHECK-EMPTY: +; CHECK-NEXT: // %bb.0: +; CHECK-NEXT: ld.param.f32 %f1, [ex2_float_param_0]; +; CHECK-NEXT: ex2.approx.f32 %f2, %f1; +; CHECK-NEXT: st.param.f32 [func_retval0], %f2; +; CHECK-NEXT: ret; + %res = call float @llvm.nvvm.ex2.approx.f(float %0) + ret float %res +} + +; CHECK-LABEL: ex2_float_ftz +define float @ex2_float_ftz(float %0) { +; CHECK-LABEL: ex2_float_ftz( +; CHECK: { +; CHECK-NEXT: .reg .f32 %f<3>; +; CHECK-EMPTY: +; CHECK-NEXT: // %bb.0: +; CHECK-NEXT: ld.param.f32 %f1, [ex2_float_ftz_param_0]; +; CHECK-NEXT: ex2.approx.ftz.f32 %f2, %f1; +; CHECK-NEXT: st.param.f32 [func_retval0], %f2; +; CHECK-NEXT: ret; + %res = call float @llvm.nvvm.ex2.approx.ftz.f(float %0) + ret float %res +} diff --git a/llvm/test/CodeGen/NVPTX/f32-lg2.ll b/llvm/test/CodeGen/NVPTX/f32-lg2.ll new file mode 100644 index 0000000000000..13324c6860926 --- /dev/null +++ b/llvm/test/CodeGen/NVPTX/f32-lg2.ll @@ -0,0 +1,37 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5 +; RUN: llc < %s -mcpu=sm_20 -mattr=+ptx32 | FileCheck --check-prefixes=CHECK %s +; RUN: %if ptxas %{ llc < %s -mcpu=sm_20 -mattr=+ptx32 | %ptxas-verify %} +target triple = "nvptx-nvidia-cuda" + +declare float @llvm.nvvm.lg2.approx.f(float) +declare float @llvm.nvvm.lg2.approx.ftz.f(float) + +; CHECK-LABEL: lg2_float +define float @lg2_float(float %0) { 
+; CHECK-LABEL: lg2_float( +; CHECK: { +; CHECK-NEXT: .reg .f32 %f<3>; +; CHECK-EMPTY: +; CHECK-NEXT: // %bb.0: +; CHECK-NEXT: ld.param.f32 %f1, [lg2_float_param_0]; +; CHECK-NEXT: lg2.approx.f32 %f2, %f1; +; CHECK-NEXT: st.param.f32 [func_retval0], %f2; +; CHECK-NEXT: ret; + %res = call float @llvm.nvvm.lg2.approx.f(float %0) + ret float %res +} + +; CHECK-LABEL: lg2_float_ftz +define float @lg2_float_ftz(float %0) { +; CHECK-LABEL: lg2_float_ftz( +; CHECK: { +; CHECK-NEXT: .reg .f32 %f<3>; +; CHECK-EMPTY: +; CHECK-NEXT: // %bb.0: +; CHECK-NEXT: ld.param.f32 %f1, [lg2_float_ftz_param_0]; +; CHECK-NEXT: lg2.approx.ftz.f32 %f2, %f1; +; CHECK-NEXT: st.param.f32 [func_retval0], %f2; +; CHECK-NEXT: ret; + %res = call float @llvm.nvvm.lg2.approx.ftz.f(float %0) + ret float %res +} diff --git a/llvm/test/CodeGen/NVPTX/fexp2.ll b/llvm/test/CodeGen/NVPTX/fexp2.ll new file mode 100644 index 0000000000000..7e485dca65764 --- /dev/null +++ b/llvm/test/CodeGen/NVPTX/fexp2.ll @@ -0,0 +1,414 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5 +; RUN: llc < %s -mcpu=sm_50 -mattr=+ptx32 | FileCheck --check-prefixes=CHECK %s +; RUN: llc < %s -mcpu=sm_75 -mattr=+ptx70 | FileCheck --check-prefixes=CHECK-FP16 %s +; RUN: llc < %s -mcpu=sm_90 -mattr=+ptx78 | FileCheck --check-prefixes=CHECK-BF16 %s +; RUN: %if ptxas-12.0 %{ llc < %s -mcpu=sm_50 -mattr=+ptx32 | %ptxas-verify -arch=sm_50 %} +; RUN: %if ptxas-12.0 %{ llc < %s -mcpu=sm_75 -mattr=+ptx70 | %ptxas-verify -arch=sm_75 %} +; RUN: %if ptxas-12.0 %{ llc < %s -mcpu=sm_90 -mattr=+ptx78 | %ptxas-verify -arch=sm_90 %} +target triple = "nvptx64-nvidia-cuda" + +; --- f32 --- + +; CHECK-LABEL: exp2_test +define float @exp2_test(float %in) { +; CHECK-LABEL: exp2_test( +; CHECK: { +; CHECK-NEXT: .reg .f32 %f<3>; +; CHECK-EMPTY: +; CHECK-NEXT: // %bb.0: // %entry +; CHECK-NEXT: ld.param.f32 %f1, [exp2_test_param_0]; +; CHECK-NEXT: ex2.approx.f32 %f2, %f1; +; CHECK-NEXT: st.param.f32 
[func_retval0], %f2; +; CHECK-NEXT: ret; +; +; CHECK-FP16-LABEL: exp2_test( +; CHECK-FP16: { +; CHECK-FP16-NEXT: .reg .f32 %f<3>; +; CHECK-FP16-EMPTY: +; CHECK-FP16-NEXT: // %bb.0: // %entry +; CHECK-FP16-NEXT: ld.param.f32 %f1, [exp2_test_param_0]; +; CHECK-FP16-NEXT: ex2.approx.f32 %f2, %f1; +; CHECK-FP16-NEXT: st.param.f32 [func_retval0], %f2; +; CHECK-FP16-NEXT: ret; +; +; CHECK-BF16-LABEL: exp2_test( +; CHECK-BF16: { +; CHECK-BF16-NEXT: .reg .f32 %f<3>; +; CHECK-BF16-EMPTY: +; CHECK-BF16-NEXT: // %bb.0: // %entry +; CHECK-BF16-NEXT: ld.param.f32 %f1, [exp2_test_param_0]; +; CHECK-BF16-NEXT: ex2.approx.f32 %f2, %f1; +; CHECK-BF16-NEXT: st.param.f32 [func_retval0], %f2; +; CHECK-BF16-NEXT: ret; +entry: + %exp2 = call float @llvm.exp2.f32(float %in) + ret float %exp2 +} + +; CHECK-LABEL: exp2_ftz_test +define float @exp2_ftz_test(float %in) #0 { +; CHECK-LABEL: exp2_ftz_test( +; CHECK: { +; CHECK-NEXT: .reg .f32 %f<3>; +; CHECK-EMPTY: +; CHECK-NEXT: // %bb.0: // %entry +; CHECK-NEXT: ld.param.f32 %f1, [exp2_ftz_test_param_0]; +; CHECK-NEXT: ex2.approx.ftz.f32 %f2, %f1; +; CHECK-NEXT: st.param.f32 [func_retval0], %f2; +; CHECK-NEXT: ret; +; +; CHECK-FP16-LABEL: exp2_ftz_test( +; CHECK-FP16: { +; CHECK-FP16-NEXT: .reg .f32 %f<3>; +; CHECK-FP16-EMPTY: +; CHECK-FP16-NEXT: // %bb.0: // %entry +; CHECK-FP16-NEXT: ld.param.f32 %f1, [exp2_ftz_test_param_0]; +; CHECK-FP16-NEXT: ex2.approx.ftz.f32 %f2, %f1; +; CHECK-FP16-NEXT: st.param.f32 [func_retval0], %f2; +; CHECK-FP16-NEXT: ret; +; +; CHECK-BF16-LABEL: exp2_ftz_test( +; CHECK-BF16: { +; CHECK-BF16-NEXT: .reg .f32 %f<3>; +; CHECK-BF16-EMPTY: +; CHECK-BF16-NEXT: // %bb.0: // %entry +; CHECK-BF16-NEXT: ld.param.f32 %f1, [exp2_ftz_test_param_0]; +; CHECK-BF16-NEXT: ex2.approx.ftz.f32 %f2, %f1; +; CHECK-BF16-NEXT: st.param.f32 [func_retval0], %f2; +; CHECK-BF16-NEXT: ret; +entry: + %exp2 = call float @llvm.exp2.f32(float %in) + ret float %exp2 +} + +; CHECK-LABEL: exp2_test_v +define <2 x float> @exp2_test_v(<2 x float> 
%in) { +; CHECK-LABEL: exp2_test_v( +; CHECK: { +; CHECK-NEXT: .reg .f32 %f<5>; +; CHECK-EMPTY: +; CHECK-NEXT: // %bb.0: // %entry +; CHECK-NEXT: ld.param.v2.f32 {%f1, %f2}, [exp2_test_v_param_0]; +; CHECK-NEXT: ex2.approx.f32 %f3, %f2; +; CHECK-NEXT: ex2.approx.f32 %f4, %f1; +; CHECK-NEXT: st.param.v2.f32 [func_retval0], {%f4, %f3}; +; CHECK-NEXT: ret; +; +; CHECK-FP16-LABEL: exp2_test_v( +; CHECK-FP16: { +; CHECK-FP16-NEXT: .reg .f32 %f<5>; +; CHECK-FP16-EMPTY: +; CHECK-FP16-NEXT: // %bb.0: // %entry +; CHECK-FP16-NEXT: ld.param.v2.f32 {%f1, %f2}, [exp2_test_v_param_0]; +; CHECK-FP16-NEXT: ex2.approx.f32 %f3, %f2; +; CHECK-FP16-NEXT: ex2.approx.f32 %f4, %f1; +; CHECK-FP16-NEXT: st.param.v2.f32 [func_retval0], {%f4, %f3}; +; CHECK-FP16-NEXT: ret; +; +; CHECK-BF16-LABEL: exp2_test_v( +; CHECK-BF16: { +; CHECK-BF16-NEXT: .reg .f32 %f<5>; +; CHECK-BF16-EMPTY: +; CHECK-BF16-NEXT: // %bb.0: // %entry +; CHECK-BF16-NEXT: ld.param.v2.f32 {%f1, %f2}, [exp2_test_v_param_0]; +; CHECK-BF16-NEXT: ex2.approx.f32 %f3, %f2; +; CHECK-BF16-NEXT: ex2.approx.f32 %f4, %f1; +; CHECK-BF16-NEXT: st.param.v2.f32 [func_retval0], {%f4, %f3}; +; CHECK-BF16-NEXT: ret; +entry: + %exp2 = call <2 x float> @llvm.exp2.v2f32(<2 x float> %in) + ret <2 x float> %exp2 +} + +; --- f16 --- + +; CHECK-LABEL: exp2_f16_test +define half @exp2_f16_test(half %in) { +; CHECK-LABEL: exp2_f16_test( +; CHECK: { +; CHECK-NEXT: .reg .b16 %rs<3>; +; CHECK-NEXT: .reg .f32 %f<3>; +; CHECK-EMPTY: +; CHECK-NEXT: // %bb.0: // %entry +; CHECK-NEXT: ld.param.b16 %rs1, [exp2_f16_test_param_0]; +; CHECK-NEXT: cvt.f32.f16 %f1, %rs1; +; CHECK-NEXT: ex2.approx.f32 %f2, %f1; +; CHECK-NEXT: cvt.rn.f16.f32 %rs2, %f2; +; CHECK-NEXT: st.param.b16 [func_retval0], %rs2; +; CHECK-NEXT: ret; +; +; CHECK-FP16-LABEL: exp2_f16_test( +; CHECK-FP16: { +; CHECK-FP16-NEXT: .reg .b16 %rs<3>; +; CHECK-FP16-EMPTY: +; CHECK-FP16-NEXT: // %bb.0: // %entry +; CHECK-FP16-NEXT: ld.param.b16 %rs1, [exp2_f16_test_param_0]; +; CHECK-FP16-NEXT: 
ex2.approx.f16 %rs2, %rs1; +; CHECK-FP16-NEXT: st.param.b16 [func_retval0], %rs2; +; CHECK-FP16-NEXT: ret; +; +; CHECK-BF16-LABEL: exp2_f16_test( +; CHECK-BF16: { +; CHECK-BF16-NEXT: .reg .b16 %rs<3>; +; CHECK-BF16-EMPTY: +; CHECK-BF16-NEXT: // %bb.0: // %entry +; CHECK-BF16-NEXT: ld.param.b16 %rs1, [exp2_f16_test_param_0]; +; CHECK-BF16-NEXT: ex2.approx.f16 %rs2, %rs1; +; CHECK-BF16-NEXT: st.param.b16 [func_retval0], %rs2; +; CHECK-BF16-NEXT: ret; +entry: + %exp2 = call half @llvm.exp2.f16(half %in) + ret half %exp2 +} + +; COM: we should never have .ftz for f16 +; CHECK-LABEL: exp2_f16_ftz_test +define half @exp2_f16_ftz_test(half %in) #0 { +; CHECK-LABEL: exp2_f16_ftz_test( +; CHECK: { +; CHECK-NEXT: .reg .b16 %rs<3>; +; CHECK-NEXT: .reg .f32 %f<3>; +; CHECK-EMPTY: +; CHECK-NEXT: // %bb.0: // %entry +; CHECK-NEXT: ld.param.b16 %rs1, [exp2_f16_ftz_test_param_0]; +; CHECK-NEXT: cvt.ftz.f32.f16 %f1, %rs1; +; CHECK-NEXT: ex2.approx.ftz.f32 %f2, %f1; +; CHECK-NEXT: cvt.rn.f16.f32 %rs2, %f2; +; CHECK-NEXT: st.param.b16 [func_retval0], %rs2; +; CHECK-NEXT: ret; +; +; CHECK-FP16-LABEL: exp2_f16_ftz_test( +; CHECK-FP16: { +; CHECK-FP16-NEXT: .reg .b16 %rs<3>; +; CHECK-FP16-EMPTY: +; CHECK-FP16-NEXT: // %bb.0: // %entry +; CHECK-FP16-NEXT: ld.param.b16 %rs1, [exp2_f16_ftz_test_param_0]; +; CHECK-FP16-NEXT: ex2.approx.f16 %rs2, %rs1; +; CHECK-FP16-NEXT: st.param.b16 [func_retval0], %rs2; +; CHECK-FP16-NEXT: ret; +; +; CHECK-BF16-LABEL: exp2_f16_ftz_test( +; CHECK-BF16: { +; CHECK-BF16-NEXT: .reg .b16 %rs<3>; +; CHECK-BF16-EMPTY: +; CHECK-BF16-NEXT: // %bb.0: // %entry +; CHECK-BF16-NEXT: ld.param.b16 %rs1, [exp2_f16_ftz_test_param_0]; +; CHECK-BF16-NEXT: ex2.approx.f16 %rs2, %rs1; +; CHECK-BF16-NEXT: st.param.b16 [func_retval0], %rs2; +; CHECK-BF16-NEXT: ret; +entry: + %exp2 = call half @llvm.exp2.f16(half %in) + ret half %exp2 +} + +; CHECK-LABEL: exp2_f16_test_v +define <2 x half> @exp2_f16_test_v(<2 x half> %in) { +; CHECK-LABEL: exp2_f16_test_v( +; CHECK: { +; 
CHECK-NEXT: .reg .b16 %rs<5>; +; CHECK-NEXT: .reg .b32 %r<3>; +; CHECK-NEXT: .reg .f32 %f<5>; +; CHECK-EMPTY: +; CHECK-NEXT: // %bb.0: // %entry +; CHECK-NEXT: ld.param.b32 %r1, [exp2_f16_test_v_param_0]; +; CHECK-NEXT: mov.b32 {%rs1, %rs2}, %r1; +; CHECK-NEXT: cvt.f32.f16 %f1, %rs2; +; CHECK-NEXT: ex2.approx.f32 %f2, %f1; +; CHECK-NEXT: cvt.rn.f16.f32 %rs3, %f2; +; CHECK-NEXT: cvt.f32.f16 %f3, %rs1; +; CHECK-NEXT: ex2.approx.f32 %f4, %f3; +; CHECK-NEXT: cvt.rn.f16.f32 %rs4, %f4; +; CHECK-NEXT: mov.b32 %r2, {%rs4, %rs3}; +; CHECK-NEXT: st.param.b32 [func_retval0], %r2; +; CHECK-NEXT: ret; +; +; CHECK-FP16-LABEL: exp2_f16_test_v( +; CHECK-FP16: { +; CHECK-FP16-NEXT: .reg .b32 %r<3>; +; CHECK-FP16-EMPTY: +; CHECK-FP16-NEXT: // %bb.0: // %entry +; CHECK-FP16-NEXT: ld.param.b32 %r1, [exp2_f16_test_v_param_0]; +; CHECK-FP16-NEXT: ex2.approx.f16x2 %r2, %r1; +; CHECK-FP16-NEXT: st.param.b32 [func_retval0], %r2; +; CHECK-FP16-NEXT: ret; +; +; CHECK-BF16-LABEL: exp2_f16_test_v( +; CHECK-BF16: { +; CHECK-BF16-NEXT: .reg .b32 %r<3>; +; CHECK-BF16-EMPTY: +; CHECK-BF16-NEXT: // %bb.0: // %entry +; CHECK-BF16-NEXT: ld.param.b32 %r1, [exp2_f16_test_v_param_0]; +; CHECK-BF16-NEXT: ex2.approx.f16x2 %r2, %r1; +; CHECK-BF16-NEXT: st.param.b32 [func_retval0], %r2; +; CHECK-BF16-NEXT: ret; +entry: + %exp2 = call <2 x half> @llvm.exp2.v2f16(<2 x half> %in) + ret <2 x half> %exp2 +} + +; --- bf16 --- + +; COM: we should always have .ftz for bf16 +; CHECK-LABEL: exp2_bf16_test +define bfloat @exp2_bf16_test(bfloat %in) { +; CHECK-LABEL: exp2_bf16_test( +; CHECK: { +; CHECK-NEXT: .reg .pred %p<2>; +; CHECK-NEXT: .reg .b16 %rs<2>; +; CHECK-NEXT: .reg .b32 %r<9>; +; CHECK-NEXT: .reg .f32 %f<3>; +; CHECK-EMPTY: +; CHECK-NEXT: // %bb.0: // %entry +; CHECK-NEXT: ld.param.u16 %r1, [exp2_bf16_test_param_0]; +; CHECK-NEXT: shl.b32 %r2, %r1, 16; +; CHECK-NEXT: mov.b32 %f1, %r2; +; CHECK-NEXT: ex2.approx.f32 %f2, %f1; +; CHECK-NEXT: mov.b32 %r3, %f2; +; CHECK-NEXT: bfe.u32 %r4, %r3, 16, 1; +; 
CHECK-NEXT: add.s32 %r5, %r4, %r3; +; CHECK-NEXT: add.s32 %r6, %r5, 32767; +; CHECK-NEXT: setp.nan.f32 %p1, %f2, %f2; +; CHECK-NEXT: or.b32 %r7, %r3, 4194304; +; CHECK-NEXT: selp.b32 %r8, %r7, %r6, %p1; +; CHECK-NEXT: { .reg .b16 tmp; mov.b32 {tmp, %rs1}, %r8; } +; CHECK-NEXT: st.param.b16 [func_retval0], %rs1; +; CHECK-NEXT: ret; +; +; CHECK-FP16-LABEL: exp2_bf16_test( +; CHECK-FP16: { +; CHECK-FP16-NEXT: .reg .pred %p<2>; +; CHECK-FP16-NEXT: .reg .b16 %rs<2>; +; CHECK-FP16-NEXT: .reg .b32 %r<9>; +; CHECK-FP16-NEXT: .reg .f32 %f<3>; +; CHECK-FP16-EMPTY: +; CHECK-FP16-NEXT: // %bb.0: // %entry +; CHECK-FP16-NEXT: ld.param.u16 %r1, [exp2_bf16_test_param_0]; +; CHECK-FP16-NEXT: shl.b32 %r2, %r1, 16; +; CHECK-FP16-NEXT: mov.b32 %f1, %r2; +; CHECK-FP16-NEXT: ex2.approx.f32 %f2, %f1; +; CHECK-FP16-NEXT: mov.b32 %r3, %f2; +; CHECK-FP16-NEXT: bfe.u32 %r4, %r3, 16, 1; +; CHECK-FP16-NEXT: add.s32 %r5, %r4, %r3; +; CHECK-FP16-NEXT: add.s32 %r6, %r5, 32767; +; CHECK-FP16-NEXT: setp.nan.f32 %p1, %f2, %f2; +; CHECK-FP16-NEXT: or.b32 %r7, %r3, 4194304; +; CHECK-FP16-NEXT: selp.b32 %r8, %r7, %r6, %p1; +; CHECK-FP16-NEXT: { .reg .b16 tmp; mov.b32 {tmp, %rs1}, %r8; } +; CHECK-FP16-NEXT: st.param.b16 [func_retval0], %rs1; +; CHECK-FP16-NEXT: ret; +; +; CHECK-BF16-LABEL: exp2_bf16_test( +; CHECK-BF16: { +; CHECK-BF16-NEXT: .reg .b16 %rs<3>; +; CHECK-BF16-EMPTY: +; CHECK-BF16-NEXT: // %bb.0: // %entry +; CHECK-BF16-NEXT: ld.param.b16 %rs1, [exp2_bf16_test_param_0]; +; CHECK-BF16-NEXT: ex2.approx.ftz.bf16 %rs2, %rs1; +; CHECK-BF16-NEXT: st.param.b16 [func_retval0], %rs2; +; CHECK-BF16-NEXT: ret; +entry: + %exp2 = call bfloat @llvm.exp2.bf16(bfloat %in) + ret bfloat %exp2 +} + +; CHECK-LABEL: exp2_bf16_test_v +define <2 x bfloat> @exp2_bf16_test_v(<2 x bfloat> %in) { +; CHECK-LABEL: exp2_bf16_test_v( +; CHECK: { +; CHECK-NEXT: .reg .pred %p<3>; +; CHECK-NEXT: .reg .b16 %rs<3>; +; CHECK-NEXT: .reg .b32 %r<19>; +; CHECK-NEXT: .reg .f32 %f<5>; +; CHECK-EMPTY: +; CHECK-NEXT: // %bb.0: // 
%entry +; CHECK-NEXT: ld.param.b32 %r1, [exp2_bf16_test_v_param_0]; +; CHECK-NEXT: mov.b32 {%rs1, %rs2}, %r1; +; CHECK-NEXT: cvt.u32.u16 %r2, %rs2; +; CHECK-NEXT: shl.b32 %r3, %r2, 16; +; CHECK-NEXT: mov.b32 %f1, %r3; +; CHECK-NEXT: ex2.approx.f32 %f2, %f1; +; CHECK-NEXT: mov.b32 %r4, %f2; +; CHECK-NEXT: bfe.u32 %r5, %r4, 16, 1; +; CHECK-NEXT: add.s32 %r6, %r5, %r4; +; CHECK-NEXT: add.s32 %r7, %r6, 32767; +; CHECK-NEXT: setp.nan.f32 %p1, %f2, %f2; +; CHECK-NEXT: or.b32 %r8, %r4, 4194304; +; CHECK-NEXT: selp.b32 %r9, %r8, %r7, %p1; +; CHECK-NEXT: cvt.u32.u16 %r10, %rs1; +; CHECK-NEXT: shl.b32 %r11, %r10, 16; +; CHECK-NEXT: mov.b32 %f3, %r11; +; CHECK-NEXT: ex2.approx.f32 %f4, %f3; +; CHECK-NEXT: mov.b32 %r12, %f4; +; CHECK-NEXT: bfe.u32 %r13, %r12, 16, 1; +; CHECK-NEXT: add.s32 %r14, %r13, %r12; +; CHECK-NEXT: add.s32 %r15, %r14, 32767; +; CHECK-NEXT: setp.nan.f32 %p2, %f4, %f4; +; CHECK-NEXT: or.b32 %r16, %r12, 4194304; +; CHECK-NEXT: selp.b32 %r17, %r16, %r15, %p2; +; CHECK-NEXT: prmt.b32 %r18, %r17, %r9, 0x7632U; +; CHECK-NEXT: st.param.b32 [func_retval0], %r18; +; CHECK-NEXT: ret; +; +; CHECK-FP16-LABEL: exp2_bf16_test_v( +; CHECK-FP16: { +; CHECK-FP16-NEXT: .reg .pred %p<3>; +; CHECK-FP16-NEXT: .reg .b16 %rs<3>; +; CHECK-FP16-NEXT: .reg .b32 %r<19>; +; CHECK-FP16-NEXT: .reg .f32 %f<5>; +; CHECK-FP16-EMPTY: +; CHECK-FP16-NEXT: // %bb.0: // %entry +; CHECK-FP16-NEXT: ld.param.b32 %r1, [exp2_bf16_test_v_param_0]; +; CHECK-FP16-NEXT: mov.b32 {%rs1, %rs2}, %r1; +; CHECK-FP16-NEXT: cvt.u32.u16 %r2, %rs2; +; CHECK-FP16-NEXT: shl.b32 %r3, %r2, 16; +; CHECK-FP16-NEXT: mov.b32 %f1, %r3; +; CHECK-FP16-NEXT: ex2.approx.f32 %f2, %f1; +; CHECK-FP16-NEXT: mov.b32 %r4, %f2; +; CHECK-FP16-NEXT: bfe.u32 %r5, %r4, 16, 1; +; CHECK-FP16-NEXT: add.s32 %r6, %r5, %r4; +; CHECK-FP16-NEXT: add.s32 %r7, %r6, 32767; +; CHECK-FP16-NEXT: setp.nan.f32 %p1, %f2, %f2; +; CHECK-FP16-NEXT: or.b32 %r8, %r4, 4194304; +; CHECK-FP16-NEXT: selp.b32 %r9, %r8, %r7, %p1; +; CHECK-FP16-NEXT: cvt.u32.u16 
%r10, %rs1; +; CHECK-FP16-NEXT: shl.b32 %r11, %r10, 16; +; CHECK-FP16-NEXT: mov.b32 %f3, %r11; +; CHECK-FP16-NEXT: ex2.approx.f32 %f4, %f3; +; CHECK-FP16-NEXT: mov.b32 %r12, %f4; +; CHECK-FP16-NEXT: bfe.u32 %r13, %r12, 16, 1; +; CHECK-FP16-NEXT: add.s32 %r14, %r13, %r12; +; CHECK-FP16-NEXT: add.s32 %r15, %r14, 32767; +; CHECK-FP16-NEXT: setp.nan.f32 %p2, %f4, %f4; +; CHECK-FP16-NEXT: or.b32 %r16, %r12, 4194304; +; CHECK-FP16-NEXT: selp.b32 %r17, %r16, %r15, %p2; +; CHECK-FP16-NEXT: prmt.b32 %r18, %r17, %r9, 0x7632U; +; CHECK-FP16-NEXT: st.param.b32 [func_retval0], %r18; +; CHECK-FP16-NEXT: ret; +; +; CHECK-BF16-LABEL: exp2_bf16_test_v( +; CHECK-BF16: { +; CHECK-BF16-NEXT: .reg .b32 %r<3>; +; CHECK-BF16-EMPTY: +; CHECK-BF16-NEXT: // %bb.0: // %entry +; CHECK-BF16-NEXT: ld.param.b32 %r1, [exp2_bf16_test_v_param_0]; +; CHECK-BF16-NEXT: ex2.approx.ftz.bf16x2 %r2, %r1; +; CHECK-BF16-NEXT: st.param.b32 [func_retval0], %r2; +; CHECK-BF16-NEXT: ret; +entry: + %exp2 = call <2 x bfloat> @llvm.exp2.v2bf16(<2 x bfloat> %in) + ret <2 x bfloat> %exp2 +} + +declare float @llvm.exp2.f32(float %val) + +declare <2 x float> @llvm.exp2.v2f32(<2 x float> %val) + +declare half @llvm.exp2.f16(half %val) + +declare <2 x half> @llvm.exp2.v2f16(<2 x half> %val) + +declare bfloat @llvm.exp2.bf16(bfloat %val) + +declare <2 x bfloat> @llvm.exp2.v2bf16(<2 x bfloat> %val) + +attributes #0 = {"denormal-fp-math"="preserve-sign"} diff --git a/llvm/test/CodeGen/NVPTX/flog2.ll b/llvm/test/CodeGen/NVPTX/flog2.ll new file mode 100644 index 0000000000000..ff762dcf74b2f --- /dev/null +++ b/llvm/test/CodeGen/NVPTX/flog2.ll @@ -0,0 +1,234 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5 +; RUN: llc < %s -mcpu=sm_50 -mattr=+ptx32 -nvptx-approx-log2f32 | FileCheck --check-prefixes=CHECK %s +; RUN: %if ptxas-12.0 %{ llc < %s -mcpu=sm_50 -mattr=+ptx32 -nvptx-approx-log2f32 | %ptxas-verify -arch=sm_50 %} +target triple = "nvptx64-nvidia-cuda" + +; 
CHECK-LABEL: log2_test +define float @log2_test(float %in) { +; CHECK-LABEL: log2_test( +; CHECK: { +; CHECK-NEXT: .reg .f32 %f<3>; +; CHECK-EMPTY: +; CHECK-NEXT: // %bb.0: // %entry +; CHECK-NEXT: ld.param.f32 %f1, [log2_test_param_0]; +; CHECK-NEXT: lg2.approx.f32 %f2, %f1; +; CHECK-NEXT: st.param.f32 [func_retval0], %f2; +; CHECK-NEXT: ret; +entry: + %log2 = call float @llvm.log2.f32(float %in) + ret float %log2 +} + +; CHECK-LABEL: log2_ftz_test +define float @log2_ftz_test(float %in) #0 { +; CHECK-LABEL: log2_ftz_test( +; CHECK: { +; CHECK-NEXT: .reg .f32 %f<3>; +; CHECK-EMPTY: +; CHECK-NEXT: // %bb.0: // %entry +; CHECK-NEXT: ld.param.f32 %f1, [log2_ftz_test_param_0]; +; CHECK-NEXT: lg2.approx.ftz.f32 %f2, %f1; +; CHECK-NEXT: st.param.f32 [func_retval0], %f2; +; CHECK-NEXT: ret; +entry: + %log2 = call float @llvm.log2.f32(float %in) + ret float %log2 +} + +; CHECK-LABEL: log2_test_v +define <2 x float> @log2_test_v(<2 x float> %in) { +; CHECK-LABEL: log2_test_v( +; CHECK: { +; CHECK-NEXT: .reg .f32 %f<5>; +; CHECK-EMPTY: +; CHECK-NEXT: // %bb.0: // %entry +; CHECK-NEXT: ld.param.v2.f32 {%f1, %f2}, [log2_test_v_param_0]; +; CHECK-NEXT: lg2.approx.f32 %f3, %f2; +; CHECK-NEXT: lg2.approx.f32 %f4, %f1; +; CHECK-NEXT: st.param.v2.f32 [func_retval0], {%f4, %f3}; +; CHECK-NEXT: ret; +entry: + %log2 = call <2 x float> @llvm.log2.v2f32(<2 x float> %in) + ret <2 x float> %log2 +} + +; --- f16 --- + +; CHECK-LABEL: log2_f16_test +define half @log2_f16_test(half %in) { +; CHECK-LABEL: log2_f16_test( +; CHECK: { +; CHECK-NEXT: .reg .b16 %rs<3>; +; CHECK-NEXT: .reg .f32 %f<3>; +; CHECK-EMPTY: +; CHECK-NEXT: // %bb.0: // %entry +; CHECK-NEXT: ld.param.b16 %rs1, [log2_f16_test_param_0]; +; CHECK-NEXT: cvt.f32.f16 %f1, %rs1; +; CHECK-NEXT: lg2.approx.f32 %f2, %f1; +; CHECK-NEXT: cvt.rn.f16.f32 %rs2, %f2; +; CHECK-NEXT: st.param.b16 [func_retval0], %rs2; +; CHECK-NEXT: ret; +entry: + %log2 = call half @llvm.log2.f16(half %in) + ret half %log2 +} + +; CHECK-LABEL: 
log2_f16_ftz_test +define half @log2_f16_ftz_test(half %in) #0 { +; CHECK-LABEL: log2_f16_ftz_test( +; CHECK: { +; CHECK-NEXT: .reg .b16 %rs<3>; +; CHECK-NEXT: .reg .f32 %f<3>; +; CHECK-EMPTY: +; CHECK-NEXT: // %bb.0: // %entry +; CHECK-NEXT: ld.param.b16 %rs1, [log2_f16_ftz_test_param_0]; +; CHECK-NEXT: cvt.ftz.f32.f16 %f1, %rs1; +; CHECK-NEXT: lg2.approx.ftz.f32 %f2, %f1; +; CHECK-NEXT: cvt.rn.f16.f32 %rs2, %f2; +; CHECK-NEXT: st.param.b16 [func_retval0], %rs2; +; CHECK-NEXT: ret; +entry: + %log2 = call half @llvm.log2.f16(half %in) + ret half %log2 +} + +; CHECK-LABEL: log2_f16_test_v +define <2 x half> @log2_f16_test_v(<2 x half> %in) { +; CHECK-LABEL: log2_f16_test_v( +; CHECK: { +; CHECK-NEXT: .reg .b16 %rs<5>; +; CHECK-NEXT: .reg .b32 %r<3>; +; CHECK-NEXT: .reg .f32 %f<5>; +; CHECK-EMPTY: +; CHECK-NEXT: // %bb.0: // %entry +; CHECK-NEXT: ld.param.b32 %r1, [log2_f16_test_v_param_0]; +; CHECK-NEXT: mov.b32 {%rs1, %rs2}, %r1; +; CHECK-NEXT: cvt.f32.f16 %f1, %rs2; +; CHECK-NEXT: lg2.approx.f32 %f2, %f1; +; CHECK-NEXT: cvt.rn.f16.f32 %rs3, %f2; +; CHECK-NEXT: cvt.f32.f16 %f3, %rs1; +; CHECK-NEXT: lg2.approx.f32 %f4, %f3; +; CHECK-NEXT: cvt.rn.f16.f32 %rs4, %f4; +; CHECK-NEXT: mov.b32 %r2, {%rs4, %rs3}; +; CHECK-NEXT: st.param.b32 [func_retval0], %r2; +; CHECK-NEXT: ret; +entry: + %log2 = call <2 x half> @llvm.log2.v2f16(<2 x half> %in) + ret <2 x half> %log2 +} + +; --- bf16 --- + +; CHECK-LABEL: log2_bf16_test +define bfloat @log2_bf16_test(bfloat %in) { +; CHECK-LABEL: log2_bf16_test( +; CHECK: { +; CHECK-NEXT: .reg .pred %p<2>; +; CHECK-NEXT: .reg .b16 %rs<2>; +; CHECK-NEXT: .reg .b32 %r<9>; +; CHECK-NEXT: .reg .f32 %f<3>; +; CHECK-EMPTY: +; CHECK-NEXT: // %bb.0: // %entry +; CHECK-NEXT: ld.param.u16 %r1, [log2_bf16_test_param_0]; +; CHECK-NEXT: shl.b32 %r2, %r1, 16; +; CHECK-NEXT: mov.b32 %f1, %r2; +; CHECK-NEXT: lg2.approx.f32 %f2, %f1; +; CHECK-NEXT: mov.b32 %r3, %f2; +; CHECK-NEXT: bfe.u32 %r4, %r3, 16, 1; +; CHECK-NEXT: add.s32 %r5, %r4, %r3; +; 
CHECK-NEXT: add.s32 %r6, %r5, 32767; +; CHECK-NEXT: setp.nan.f32 %p1, %f2, %f2; +; CHECK-NEXT: or.b32 %r7, %r3, 4194304; +; CHECK-NEXT: selp.b32 %r8, %r7, %r6, %p1; +; CHECK-NEXT: { .reg .b16 tmp; mov.b32 {tmp, %rs1}, %r8; } +; CHECK-NEXT: st.param.b16 [func_retval0], %rs1; +; CHECK-NEXT: ret; +entry: + %log2 = call bfloat @llvm.log2.bf16(bfloat %in) + ret bfloat %log2 +} + +; CHECK-LABEL: log2_bf16_ftz_test +define bfloat @log2_bf16_ftz_test(bfloat %in) #0 { +; CHECK-LABEL: log2_bf16_ftz_test( +; CHECK: { +; CHECK-NEXT: .reg .pred %p<2>; +; CHECK-NEXT: .reg .b16 %rs<2>; +; CHECK-NEXT: .reg .b32 %r<9>; +; CHECK-NEXT: .reg .f32 %f<3>; +; CHECK-EMPTY: +; CHECK-NEXT: // %bb.0: // %entry +; CHECK-NEXT: ld.param.u16 %r1, [log2_bf16_ftz_test_param_0]; +; CHECK-NEXT: shl.b32 %r2, %r1, 16; +; CHECK-NEXT: mov.b32 %f1, %r2; +; CHECK-NEXT: lg2.approx.ftz.f32 %f2, %f1; +; CHECK-NEXT: mov.b32 %r3, %f2; +; CHECK-NEXT: bfe.u32 %r4, %r3, 16, 1; +; CHECK-NEXT: add.s32 %r5, %r4, %r3; +; CHECK-NEXT: add.s32 %r6, %r5, 32767; +; CHECK-NEXT: setp.nan.ftz.f32 %p1, %f2, %f2; +; CHECK-NEXT: or.b32 %r7, %r3, 4194304; +; CHECK-NEXT: selp.b32 %r8, %r7, %r6, %p1; +; CHECK-NEXT: { .reg .b16 tmp; mov.b32 {tmp, %rs1}, %r8; } +; CHECK-NEXT: st.param.b16 [func_retval0], %rs1; +; CHECK-NEXT: ret; +entry: + %log2 = call bfloat @llvm.log2.bf16(bfloat %in) + ret bfloat %log2 +} + +; CHECK-LABEL: log2_bf16_test_v +define <2 x bfloat> @log2_bf16_test_v(<2 x bfloat> %in) { +; CHECK-LABEL: log2_bf16_test_v( +; CHECK: { +; CHECK-NEXT: .reg .pred %p<3>; +; CHECK-NEXT: .reg .b16 %rs<3>; +; CHECK-NEXT: .reg .b32 %r<19>; +; CHECK-NEXT: .reg .f32 %f<5>; +; CHECK-EMPTY: +; CHECK-NEXT: // %bb.0: // %entry +; CHECK-NEXT: ld.param.b32 %r1, [log2_bf16_test_v_param_0]; +; CHECK-NEXT: mov.b32 {%rs1, %rs2}, %r1; +; CHECK-NEXT: cvt.u32.u16 %r2, %rs2; +; CHECK-NEXT: shl.b32 %r3, %r2, 16; +; CHECK-NEXT: mov.b32 %f1, %r3; +; CHECK-NEXT: lg2.approx.f32 %f2, %f1; +; CHECK-NEXT: mov.b32 %r4, %f2; +; CHECK-NEXT: bfe.u32 %r5, 
%r4, 16, 1; +; CHECK-NEXT: add.s32 %r6, %r5, %r4; +; CHECK-NEXT: add.s32 %r7, %r6, 32767; +; CHECK-NEXT: setp.nan.f32 %p1, %f2, %f2; +; CHECK-NEXT: or.b32 %r8, %r4, 4194304; +; CHECK-NEXT: selp.b32 %r9, %r8, %r7, %p1; +; CHECK-NEXT: cvt.u32.u16 %r10, %rs1; +; CHECK-NEXT: shl.b32 %r11, %r10, 16; +; CHECK-NEXT: mov.b32 %f3, %r11; +; CHECK-NEXT: lg2.approx.f32 %f4, %f3; +; CHECK-NEXT: mov.b32 %r12, %f4; +; CHECK-NEXT: bfe.u32 %r13, %r12, 16, 1; +; CHECK-NEXT: add.s32 %r14, %r13, %r12; +; CHECK-NEXT: add.s32 %r15, %r14, 32767; +; CHECK-NEXT: setp.nan.f32 %p2, %f4, %f4; +; CHECK-NEXT: or.b32 %r16, %r12, 4194304; +; CHECK-NEXT: selp.b32 %r17, %r16, %r15, %p2; +; CHECK-NEXT: prmt.b32 %r18, %r17, %r9, 0x7632U; +; CHECK-NEXT: st.param.b32 [func_retval0], %r18; +; CHECK-NEXT: ret; +entry: + %log2 = call <2 x bfloat> @llvm.log2.v2bf16(<2 x bfloat> %in) + ret <2 x bfloat> %log2 +} + +declare float @llvm.log2.f32(float %val) + +declare <2 x float> @llvm.log2.v2f32(<2 x float> %val) + +declare half @llvm.log2.f16(half %val) + +declare <2 x half> @llvm.log2.v2f16(<2 x half> %val) + +declare bfloat @llvm.log2.bf16(bfloat %val) + +declare <2 x bfloat> @llvm.log2.v2bf16(<2 x bfloat> %val) + +attributes #0 = {"denormal-fp-math"="preserve-sign"} diff --git a/llvm/test/CodeGen/NVPTX/fma-relu-contract.ll b/llvm/test/CodeGen/NVPTX/fma-relu-contract.ll index 48c94f275274b..7dce894620e6b 100644 --- a/llvm/test/CodeGen/NVPTX/fma-relu-contract.ll +++ b/llvm/test/CodeGen/NVPTX/fma-relu-contract.ll @@ -352,9 +352,7 @@ define bfloat @fma_bf16_expanded_no_nans(bfloat %a, bfloat %b, bfloat %c) #0 { define bfloat @fma_bf16_expanded_no_nans_multiple_uses_of_fma(bfloat %a, bfloat %b, bfloat %c) #0 { ; CHECK-LABEL: fma_bf16_expanded_no_nans_multiple_uses_of_fma( ; CHECK: { -; CHECK-NEXT: .reg .b16 %rs<9>; -; CHECK-NEXT: .reg .b32 %r<7>; -; CHECK-NEXT: .reg .f32 %f<6>; +; CHECK-NEXT: .reg .b16 %rs<11>; ; CHECK-EMPTY: ; CHECK-NEXT: // %bb.0: ; CHECK-NEXT: ld.param.b16 %rs1, 
[fma_bf16_expanded_no_nans_multiple_uses_of_fma_param_0]; @@ -363,20 +361,11 @@ define bfloat @fma_bf16_expanded_no_nans_multiple_uses_of_fma(bfloat %a, bfloat ; CHECK-NEXT: fma.rn.bf16 %rs4, %rs1, %rs2, %rs3; ; CHECK-NEXT: mov.b16 %rs5, 0x0000; ; CHECK-NEXT: max.bf16 %rs6, %rs4, %rs5; -; CHECK-NEXT: cvt.u32.u16 %r1, %rs4; -; CHECK-NEXT: shl.b32 %r2, %r1, 16; -; CHECK-NEXT: mov.b32 %f1, %r2; -; CHECK-NEXT: add.f32 %f2, %f1, 0f40E00000; -; CHECK-NEXT: cvt.rn.bf16.f32 %rs7, %f2; -; CHECK-NEXT: cvt.u32.u16 %r3, %rs6; -; CHECK-NEXT: shl.b32 %r4, %r3, 16; -; CHECK-NEXT: mov.b32 %f3, %r4; -; CHECK-NEXT: cvt.u32.u16 %r5, %rs7; -; CHECK-NEXT: shl.b32 %r6, %r5, 16; -; CHECK-NEXT: mov.b32 %f4, %r6; -; CHECK-NEXT: add.f32 %f5, %f3, %f4; -; CHECK-NEXT: cvt.rn.bf16.f32 %rs8, %f5; -; CHECK-NEXT: st.param.b16 [func_retval0], %rs8; +; CHECK-NEXT: mov.b16 %rs7, 0x3F80; +; CHECK-NEXT: mov.b16 %rs8, 0x40E0; +; CHECK-NEXT: fma.rn.bf16 %rs9, %rs4, %rs7, %rs8; +; CHECK-NEXT: fma.rn.bf16 %rs10, %rs6, %rs7, %rs9; +; CHECK-NEXT: st.param.b16 [func_retval0], %rs10; ; CHECK-NEXT: ret; ; ; CHECK-FTZ-LABEL: fma_bf16_expanded_no_nans_multiple_uses_of_fma( @@ -959,9 +948,7 @@ define <2 x bfloat> @fma_bf16x2_expanded_no_nans(<2 x bfloat> %a, <2 x bfloat> % define <2 x bfloat> @fma_bf16x2_expanded_no_nans_multiple_uses_of_fma(<2 x bfloat> %a, <2 x bfloat> %b, <2 x bfloat> %c) #0 { ; CHECK-LABEL: fma_bf16x2_expanded_no_nans_multiple_uses_of_fma( ; CHECK: { -; CHECK-NEXT: .reg .b16 %rs<7>; -; CHECK-NEXT: .reg .b32 %r<20>; -; CHECK-NEXT: .reg .f32 %f<11>; +; CHECK-NEXT: .reg .b32 %r<11>; ; CHECK-EMPTY: ; CHECK-NEXT: // %bb.0: ; CHECK-NEXT: ld.param.b32 %r1, [fma_bf16x2_expanded_no_nans_multiple_uses_of_fma_param_2]; @@ -970,34 +957,11 @@ define <2 x bfloat> @fma_bf16x2_expanded_no_nans_multiple_uses_of_fma(<2 x bfloa ; CHECK-NEXT: fma.rn.bf16x2 %r4, %r3, %r2, %r1; ; CHECK-NEXT: mov.b32 %r5, 0; ; CHECK-NEXT: max.bf16x2 %r6, %r4, %r5; -; CHECK-NEXT: mov.b32 {%rs1, %rs2}, %r4; -; CHECK-NEXT: cvt.u32.u16 
%r7, %rs2; -; CHECK-NEXT: shl.b32 %r8, %r7, 16; -; CHECK-NEXT: mov.b32 %f1, %r8; -; CHECK-NEXT: add.f32 %f2, %f1, 0f40E00000; -; CHECK-NEXT: cvt.rn.bf16.f32 %rs3, %f2; -; CHECK-NEXT: cvt.u32.u16 %r9, %rs1; -; CHECK-NEXT: shl.b32 %r10, %r9, 16; -; CHECK-NEXT: mov.b32 %f3, %r10; -; CHECK-NEXT: add.f32 %f4, %f3, 0f40E00000; -; CHECK-NEXT: cvt.rn.bf16.f32 %rs4, %f4; -; CHECK-NEXT: mov.b32 {%rs5, %rs6}, %r6; -; CHECK-NEXT: cvt.u32.u16 %r11, %rs5; -; CHECK-NEXT: shl.b32 %r12, %r11, 16; -; CHECK-NEXT: mov.b32 %f5, %r12; -; CHECK-NEXT: cvt.u32.u16 %r13, %rs4; -; CHECK-NEXT: shl.b32 %r14, %r13, 16; -; CHECK-NEXT: mov.b32 %f6, %r14; -; CHECK-NEXT: add.f32 %f7, %f5, %f6; -; CHECK-NEXT: cvt.u32.u16 %r15, %rs6; -; CHECK-NEXT: shl.b32 %r16, %r15, 16; -; CHECK-NEXT: mov.b32 %f8, %r16; -; CHECK-NEXT: cvt.u32.u16 %r17, %rs3; -; CHECK-NEXT: shl.b32 %r18, %r17, 16; -; CHECK-NEXT: mov.b32 %f9, %r18; -; CHECK-NEXT: add.f32 %f10, %f8, %f9; -; CHECK-NEXT: cvt.rn.bf16x2.f32 %r19, %f10, %f7; -; CHECK-NEXT: st.param.b32 [func_retval0], %r19; +; CHECK-NEXT: mov.b32 %r7, 1065369472; +; CHECK-NEXT: mov.b32 %r8, 1088438496; +; CHECK-NEXT: fma.rn.bf16x2 %r9, %r4, %r7, %r8; +; CHECK-NEXT: fma.rn.bf16x2 %r10, %r6, %r7, %r9; +; CHECK-NEXT: st.param.b32 [func_retval0], %r10; ; CHECK-NEXT: ret; ; ; CHECK-FTZ-LABEL: fma_bf16x2_expanded_no_nans_multiple_uses_of_fma( diff --git a/llvm/test/CodeGen/NVPTX/fma-relu-fma-intrinsic.ll b/llvm/test/CodeGen/NVPTX/fma-relu-fma-intrinsic.ll index 561f2b0cc0673..eb51d7db81372 100644 --- a/llvm/test/CodeGen/NVPTX/fma-relu-fma-intrinsic.ll +++ b/llvm/test/CodeGen/NVPTX/fma-relu-fma-intrinsic.ll @@ -221,26 +221,18 @@ define bfloat @fma_bf16_no_nans(bfloat %a, bfloat %b, bfloat %c) #0 { define bfloat @fma_bf16_no_nans_multiple_uses_of_fma(bfloat %a, bfloat %b, bfloat %c) #0 { ; CHECK-LABEL: fma_bf16_no_nans_multiple_uses_of_fma( ; CHECK: { -; CHECK-NEXT: .reg .b16 %rs<7>; -; CHECK-NEXT: .reg .b32 %r<5>; -; CHECK-NEXT: .reg .f32 %f<5>; +; CHECK-NEXT: .reg .b16 %rs<9>; ; 
CHECK-EMPTY: ; CHECK-NEXT: // %bb.0: ; CHECK-NEXT: ld.param.b16 %rs1, [fma_bf16_no_nans_multiple_uses_of_fma_param_0]; ; CHECK-NEXT: ld.param.b16 %rs2, [fma_bf16_no_nans_multiple_uses_of_fma_param_1]; ; CHECK-NEXT: ld.param.b16 %rs3, [fma_bf16_no_nans_multiple_uses_of_fma_param_2]; ; CHECK-NEXT: fma.rn.bf16 %rs4, %rs1, %rs2, %rs3; -; CHECK-NEXT: cvt.u32.u16 %r1, %rs4; -; CHECK-NEXT: shl.b32 %r2, %r1, 16; -; CHECK-NEXT: mov.b32 %f1, %r2; -; CHECK-NEXT: add.f32 %f2, %f1, 0f40E00000; -; CHECK-NEXT: cvt.rn.bf16.f32 %rs5, %f2; -; CHECK-NEXT: cvt.u32.u16 %r3, %rs5; -; CHECK-NEXT: shl.b32 %r4, %r3, 16; -; CHECK-NEXT: mov.b32 %f3, %r4; -; CHECK-NEXT: add.f32 %f4, %f3, %f1; -; CHECK-NEXT: cvt.rn.bf16.f32 %rs6, %f4; -; CHECK-NEXT: st.param.b16 [func_retval0], %rs6; +; CHECK-NEXT: mov.b16 %rs5, 0x3F80; +; CHECK-NEXT: mov.b16 %rs6, 0x40E0; +; CHECK-NEXT: fma.rn.bf16 %rs7, %rs4, %rs5, %rs6; +; CHECK-NEXT: fma.rn.bf16 %rs8, %rs7, %rs5, %rs4; +; CHECK-NEXT: st.param.b16 [func_retval0], %rs8; ; CHECK-NEXT: ret; ; ; CHECK-FTZ-LABEL: fma_bf16_no_nans_multiple_uses_of_fma( @@ -642,36 +634,18 @@ define <2 x bfloat> @fma_bf16x2_no_nans(<2 x bfloat> %a, <2 x bfloat> %b, <2 x b define <2 x bfloat> @fma_bf16x2_no_nans_multiple_uses_of_fma(<2 x bfloat> %a, <2 x bfloat> %b, <2 x bfloat> %c) #0 { ; CHECK-LABEL: fma_bf16x2_no_nans_multiple_uses_of_fma( ; CHECK: { -; CHECK-NEXT: .reg .b16 %rs<5>; -; CHECK-NEXT: .reg .b32 %r<14>; -; CHECK-NEXT: .reg .f32 %f<9>; +; CHECK-NEXT: .reg .b32 %r<9>; ; CHECK-EMPTY: ; CHECK-NEXT: // %bb.0: ; CHECK-NEXT: ld.param.b32 %r1, [fma_bf16x2_no_nans_multiple_uses_of_fma_param_2]; ; CHECK-NEXT: ld.param.b32 %r2, [fma_bf16x2_no_nans_multiple_uses_of_fma_param_1]; ; CHECK-NEXT: ld.param.b32 %r3, [fma_bf16x2_no_nans_multiple_uses_of_fma_param_0]; ; CHECK-NEXT: fma.rn.bf16x2 %r4, %r3, %r2, %r1; -; CHECK-NEXT: mov.b32 {%rs1, %rs2}, %r4; -; CHECK-NEXT: cvt.u32.u16 %r5, %rs2; -; CHECK-NEXT: shl.b32 %r6, %r5, 16; -; CHECK-NEXT: mov.b32 %f1, %r6; -; CHECK-NEXT: add.f32 
%f2, %f1, 0f40E00000; -; CHECK-NEXT: cvt.rn.bf16.f32 %rs3, %f2; -; CHECK-NEXT: cvt.u32.u16 %r7, %rs1; -; CHECK-NEXT: shl.b32 %r8, %r7, 16; -; CHECK-NEXT: mov.b32 %f3, %r8; -; CHECK-NEXT: add.f32 %f4, %f3, 0f40E00000; -; CHECK-NEXT: cvt.rn.bf16.f32 %rs4, %f4; -; CHECK-NEXT: cvt.u32.u16 %r9, %rs4; -; CHECK-NEXT: shl.b32 %r10, %r9, 16; -; CHECK-NEXT: mov.b32 %f5, %r10; -; CHECK-NEXT: add.f32 %f6, %f5, %f3; -; CHECK-NEXT: cvt.u32.u16 %r11, %rs3; -; CHECK-NEXT: shl.b32 %r12, %r11, 16; -; CHECK-NEXT: mov.b32 %f7, %r12; -; CHECK-NEXT: add.f32 %f8, %f7, %f1; -; CHECK-NEXT: cvt.rn.bf16x2.f32 %r13, %f8, %f6; -; CHECK-NEXT: st.param.b32 [func_retval0], %r13; +; CHECK-NEXT: mov.b32 %r5, 1065369472; +; CHECK-NEXT: mov.b32 %r6, 1088438496; +; CHECK-NEXT: fma.rn.bf16x2 %r7, %r4, %r5, %r6; +; CHECK-NEXT: fma.rn.bf16x2 %r8, %r7, %r5, %r4; +; CHECK-NEXT: st.param.b32 [func_retval0], %r8; ; CHECK-NEXT: ret; ; ; CHECK-FTZ-LABEL: fma_bf16x2_no_nans_multiple_uses_of_fma( diff --git a/llvm/test/CodeGen/NVPTX/fma-relu-instruction-flag.ll b/llvm/test/CodeGen/NVPTX/fma-relu-instruction-flag.ll index b20ca24dd91a0..a3545f5171425 100644 --- a/llvm/test/CodeGen/NVPTX/fma-relu-instruction-flag.ll +++ b/llvm/test/CodeGen/NVPTX/fma-relu-instruction-flag.ll @@ -233,9 +233,7 @@ define bfloat @fma_bf16_expanded_no_nans(bfloat %a, bfloat %b, bfloat %c) { define bfloat @fma_bf16_expanded_no_nans_multiple_uses_of_fma(bfloat %a, bfloat %b, bfloat %c) { ; CHECK-LABEL: fma_bf16_expanded_no_nans_multiple_uses_of_fma( ; CHECK: { -; CHECK-NEXT: .reg .b16 %rs<9>; -; CHECK-NEXT: .reg .b32 %r<7>; -; CHECK-NEXT: .reg .f32 %f<6>; +; CHECK-NEXT: .reg .b16 %rs<11>; ; CHECK-EMPTY: ; CHECK-NEXT: // %bb.0: ; CHECK-NEXT: ld.param.b16 %rs1, [fma_bf16_expanded_no_nans_multiple_uses_of_fma_param_0]; @@ -244,20 +242,11 @@ define bfloat @fma_bf16_expanded_no_nans_multiple_uses_of_fma(bfloat %a, bfloat ; CHECK-NEXT: fma.rn.bf16 %rs4, %rs1, %rs2, %rs3; ; CHECK-NEXT: mov.b16 %rs5, 0x0000; ; CHECK-NEXT: max.bf16 %rs6, %rs4, 
%rs5; -; CHECK-NEXT: cvt.u32.u16 %r1, %rs4; -; CHECK-NEXT: shl.b32 %r2, %r1, 16; -; CHECK-NEXT: mov.b32 %f1, %r2; -; CHECK-NEXT: add.rn.f32 %f2, %f1, 0f40E00000; -; CHECK-NEXT: cvt.rn.bf16.f32 %rs7, %f2; -; CHECK-NEXT: cvt.u32.u16 %r3, %rs6; -; CHECK-NEXT: shl.b32 %r4, %r3, 16; -; CHECK-NEXT: mov.b32 %f3, %r4; -; CHECK-NEXT: cvt.u32.u16 %r5, %rs7; -; CHECK-NEXT: shl.b32 %r6, %r5, 16; -; CHECK-NEXT: mov.b32 %f4, %r6; -; CHECK-NEXT: add.rn.f32 %f5, %f3, %f4; -; CHECK-NEXT: cvt.rn.bf16.f32 %rs8, %f5; -; CHECK-NEXT: st.param.b16 [func_retval0], %rs8; +; CHECK-NEXT: mov.b16 %rs7, 0x3F80; +; CHECK-NEXT: mov.b16 %rs8, 0x40E0; +; CHECK-NEXT: fma.rn.bf16 %rs9, %rs4, %rs7, %rs8; +; CHECK-NEXT: fma.rn.bf16 %rs10, %rs6, %rs7, %rs9; +; CHECK-NEXT: st.param.b16 [func_retval0], %rs10; ; CHECK-NEXT: ret; ; ; CHECK-FTZ-LABEL: fma_bf16_expanded_no_nans_multiple_uses_of_fma( @@ -694,9 +683,7 @@ define <2 x bfloat> @fma_bf16x2_expanded_no_nans(<2 x bfloat> %a, <2 x bfloat> % define <2 x bfloat> @fma_bf16x2_expanded_no_nans_multiple_uses_of_fma(<2 x bfloat> %a, <2 x bfloat> %b, <2 x bfloat> %c) { ; CHECK-LABEL: fma_bf16x2_expanded_no_nans_multiple_uses_of_fma( ; CHECK: { -; CHECK-NEXT: .reg .b16 %rs<7>; -; CHECK-NEXT: .reg .b32 %r<20>; -; CHECK-NEXT: .reg .f32 %f<11>; +; CHECK-NEXT: .reg .b32 %r<11>; ; CHECK-EMPTY: ; CHECK-NEXT: // %bb.0: ; CHECK-NEXT: ld.param.b32 %r1, [fma_bf16x2_expanded_no_nans_multiple_uses_of_fma_param_2]; @@ -705,34 +692,11 @@ define <2 x bfloat> @fma_bf16x2_expanded_no_nans_multiple_uses_of_fma(<2 x bfloa ; CHECK-NEXT: fma.rn.bf16x2 %r4, %r3, %r2, %r1; ; CHECK-NEXT: mov.b32 %r5, 0; ; CHECK-NEXT: max.bf16x2 %r6, %r4, %r5; -; CHECK-NEXT: mov.b32 {%rs1, %rs2}, %r4; -; CHECK-NEXT: cvt.u32.u16 %r7, %rs2; -; CHECK-NEXT: shl.b32 %r8, %r7, 16; -; CHECK-NEXT: mov.b32 %f1, %r8; -; CHECK-NEXT: add.rn.f32 %f2, %f1, 0f40E00000; -; CHECK-NEXT: cvt.rn.bf16.f32 %rs3, %f2; -; CHECK-NEXT: cvt.u32.u16 %r9, %rs1; -; CHECK-NEXT: shl.b32 %r10, %r9, 16; -; CHECK-NEXT: mov.b32 %f3, 
%r10; -; CHECK-NEXT: add.rn.f32 %f4, %f3, 0f40E00000; -; CHECK-NEXT: cvt.rn.bf16.f32 %rs4, %f4; -; CHECK-NEXT: mov.b32 {%rs5, %rs6}, %r6; -; CHECK-NEXT: cvt.u32.u16 %r11, %rs5; -; CHECK-NEXT: shl.b32 %r12, %r11, 16; -; CHECK-NEXT: mov.b32 %f5, %r12; -; CHECK-NEXT: cvt.u32.u16 %r13, %rs4; -; CHECK-NEXT: shl.b32 %r14, %r13, 16; -; CHECK-NEXT: mov.b32 %f6, %r14; -; CHECK-NEXT: add.rn.f32 %f7, %f5, %f6; -; CHECK-NEXT: cvt.u32.u16 %r15, %rs6; -; CHECK-NEXT: shl.b32 %r16, %r15, 16; -; CHECK-NEXT: mov.b32 %f8, %r16; -; CHECK-NEXT: cvt.u32.u16 %r17, %rs3; -; CHECK-NEXT: shl.b32 %r18, %r17, 16; -; CHECK-NEXT: mov.b32 %f9, %r18; -; CHECK-NEXT: add.rn.f32 %f10, %f8, %f9; -; CHECK-NEXT: cvt.rn.bf16x2.f32 %r19, %f10, %f7; -; CHECK-NEXT: st.param.b32 [func_retval0], %r19; +; CHECK-NEXT: mov.b32 %r7, 1065369472; +; CHECK-NEXT: mov.b32 %r8, 1088438496; +; CHECK-NEXT: fma.rn.bf16x2 %r9, %r4, %r7, %r8; +; CHECK-NEXT: fma.rn.bf16x2 %r10, %r6, %r7, %r9; +; CHECK-NEXT: st.param.b32 [func_retval0], %r10; ; CHECK-NEXT: ret; ; ; CHECK-FTZ-LABEL: fma_bf16x2_expanded_no_nans_multiple_uses_of_fma( @@ -1204,26 +1168,18 @@ define bfloat @fma_bf16_no_nans(bfloat %a, bfloat %b, bfloat %c) { define bfloat @fma_bf16_no_nans_multiple_uses_of_fma(bfloat %a, bfloat %b, bfloat %c) { ; CHECK-LABEL: fma_bf16_no_nans_multiple_uses_of_fma( ; CHECK: { -; CHECK-NEXT: .reg .b16 %rs<7>; -; CHECK-NEXT: .reg .b32 %r<5>; -; CHECK-NEXT: .reg .f32 %f<5>; +; CHECK-NEXT: .reg .b16 %rs<9>; ; CHECK-EMPTY: ; CHECK-NEXT: // %bb.0: ; CHECK-NEXT: ld.param.b16 %rs1, [fma_bf16_no_nans_multiple_uses_of_fma_param_0]; ; CHECK-NEXT: ld.param.b16 %rs2, [fma_bf16_no_nans_multiple_uses_of_fma_param_1]; ; CHECK-NEXT: ld.param.b16 %rs3, [fma_bf16_no_nans_multiple_uses_of_fma_param_2]; ; CHECK-NEXT: fma.rn.bf16 %rs4, %rs1, %rs2, %rs3; -; CHECK-NEXT: cvt.u32.u16 %r1, %rs4; -; CHECK-NEXT: shl.b32 %r2, %r1, 16; -; CHECK-NEXT: mov.b32 %f1, %r2; -; CHECK-NEXT: add.rn.f32 %f2, %f1, 0f40E00000; -; CHECK-NEXT: cvt.rn.bf16.f32 %rs5, %f2; -; 
CHECK-NEXT: cvt.u32.u16 %r3, %rs5; -; CHECK-NEXT: shl.b32 %r4, %r3, 16; -; CHECK-NEXT: mov.b32 %f3, %r4; -; CHECK-NEXT: add.rn.f32 %f4, %f3, %f1; -; CHECK-NEXT: cvt.rn.bf16.f32 %rs6, %f4; -; CHECK-NEXT: st.param.b16 [func_retval0], %rs6; +; CHECK-NEXT: mov.b16 %rs5, 0x3F80; +; CHECK-NEXT: mov.b16 %rs6, 0x40E0; +; CHECK-NEXT: fma.rn.bf16 %rs7, %rs4, %rs5, %rs6; +; CHECK-NEXT: fma.rn.bf16 %rs8, %rs7, %rs5, %rs4; +; CHECK-NEXT: st.param.b16 [func_retval0], %rs8; ; CHECK-NEXT: ret; ; ; CHECK-FTZ-LABEL: fma_bf16_no_nans_multiple_uses_of_fma( @@ -1629,36 +1585,18 @@ define <2 x bfloat> @fma_bf16x2_no_nans(<2 x bfloat> %a, <2 x bfloat> %b, <2 x b define <2 x bfloat> @fma_bf16x2_no_nans_multiple_uses_of_fma(<2 x bfloat> %a, <2 x bfloat> %b, <2 x bfloat> %c) { ; CHECK-LABEL: fma_bf16x2_no_nans_multiple_uses_of_fma( ; CHECK: { -; CHECK-NEXT: .reg .b16 %rs<5>; -; CHECK-NEXT: .reg .b32 %r<14>; -; CHECK-NEXT: .reg .f32 %f<9>; +; CHECK-NEXT: .reg .b32 %r<9>; ; CHECK-EMPTY: ; CHECK-NEXT: // %bb.0: ; CHECK-NEXT: ld.param.b32 %r1, [fma_bf16x2_no_nans_multiple_uses_of_fma_param_2]; ; CHECK-NEXT: ld.param.b32 %r2, [fma_bf16x2_no_nans_multiple_uses_of_fma_param_1]; ; CHECK-NEXT: ld.param.b32 %r3, [fma_bf16x2_no_nans_multiple_uses_of_fma_param_0]; ; CHECK-NEXT: fma.rn.bf16x2 %r4, %r3, %r2, %r1; -; CHECK-NEXT: mov.b32 {%rs1, %rs2}, %r4; -; CHECK-NEXT: cvt.u32.u16 %r5, %rs2; -; CHECK-NEXT: shl.b32 %r6, %r5, 16; -; CHECK-NEXT: mov.b32 %f1, %r6; -; CHECK-NEXT: add.rn.f32 %f2, %f1, 0f40E00000; -; CHECK-NEXT: cvt.rn.bf16.f32 %rs3, %f2; -; CHECK-NEXT: cvt.u32.u16 %r7, %rs1; -; CHECK-NEXT: shl.b32 %r8, %r7, 16; -; CHECK-NEXT: mov.b32 %f3, %r8; -; CHECK-NEXT: add.rn.f32 %f4, %f3, 0f40E00000; -; CHECK-NEXT: cvt.rn.bf16.f32 %rs4, %f4; -; CHECK-NEXT: cvt.u32.u16 %r9, %rs4; -; CHECK-NEXT: shl.b32 %r10, %r9, 16; -; CHECK-NEXT: mov.b32 %f5, %r10; -; CHECK-NEXT: add.rn.f32 %f6, %f5, %f3; -; CHECK-NEXT: cvt.u32.u16 %r11, %rs3; -; CHECK-NEXT: shl.b32 %r12, %r11, 16; -; CHECK-NEXT: mov.b32 %f7, %r12; -; 
CHECK-NEXT: add.rn.f32 %f8, %f7, %f1; -; CHECK-NEXT: cvt.rn.bf16x2.f32 %r13, %f8, %f6; -; CHECK-NEXT: st.param.b32 [func_retval0], %r13; +; CHECK-NEXT: mov.b32 %r5, 1065369472; +; CHECK-NEXT: mov.b32 %r6, 1088438496; +; CHECK-NEXT: fma.rn.bf16x2 %r7, %r4, %r5, %r6; +; CHECK-NEXT: fma.rn.bf16x2 %r8, %r7, %r5, %r4; +; CHECK-NEXT: st.param.b32 [func_retval0], %r8; ; CHECK-NEXT: ret; ; ; CHECK-FTZ-LABEL: fma_bf16x2_no_nans_multiple_uses_of_fma( diff --git a/llvm/test/CodeGen/NVPTX/i128.ll b/llvm/test/CodeGen/NVPTX/i128.ll index 7ece0ccbd844e..ca1b5fdabbf8f 100644 --- a/llvm/test/CodeGen/NVPTX/i128.ll +++ b/llvm/test/CodeGen/NVPTX/i128.ll @@ -7,20 +7,20 @@ define i128 @srem_i128(i128 %lhs, i128 %rhs) { ; CHECK: { ; CHECK-NEXT: .reg .pred %p<19>; ; CHECK-NEXT: .reg .b32 %r<16>; -; CHECK-NEXT: .reg .b64 %rd<129>; +; CHECK-NEXT: .reg .b64 %rd<127>; ; CHECK-EMPTY: ; CHECK-NEXT: // %bb.0: // %_udiv-special-cases ; CHECK-NEXT: ld.param.v2.u64 {%rd45, %rd46}, [srem_i128_param_0]; ; CHECK-NEXT: ld.param.v2.u64 {%rd49, %rd50}, [srem_i128_param_1]; ; CHECK-NEXT: shr.s64 %rd2, %rd46, 63; -; CHECK-NEXT: mov.b64 %rd119, 0; -; CHECK-NEXT: sub.cc.s64 %rd52, %rd119, %rd45; -; CHECK-NEXT: subc.cc.s64 %rd53, %rd119, %rd46; +; CHECK-NEXT: mov.b64 %rd117, 0; +; CHECK-NEXT: sub.cc.s64 %rd52, %rd117, %rd45; +; CHECK-NEXT: subc.cc.s64 %rd53, %rd117, %rd46; ; CHECK-NEXT: setp.lt.s64 %p1, %rd46, 0; ; CHECK-NEXT: selp.b64 %rd4, %rd53, %rd46, %p1; ; CHECK-NEXT: selp.b64 %rd3, %rd52, %rd45, %p1; -; CHECK-NEXT: sub.cc.s64 %rd54, %rd119, %rd49; -; CHECK-NEXT: subc.cc.s64 %rd55, %rd119, %rd50; +; CHECK-NEXT: sub.cc.s64 %rd54, %rd117, %rd49; +; CHECK-NEXT: subc.cc.s64 %rd55, %rd117, %rd50; ; CHECK-NEXT: setp.lt.s64 %p2, %rd50, 0; ; CHECK-NEXT: selp.b64 %rd6, %rd55, %rd50, %p2; ; CHECK-NEXT: selp.b64 %rd5, %rd54, %rd49, %p2; @@ -44,7 +44,7 @@ define i128 @srem_i128(i128 %lhs, i128 %rhs) { ; CHECK-NEXT: add.s64 %rd64, %rd63, 64; ; CHECK-NEXT: selp.b64 %rd65, %rd62, %rd64, %p7; ; CHECK-NEXT: sub.cc.s64 
%rd66, %rd61, %rd65; -; CHECK-NEXT: subc.cc.s64 %rd67, %rd119, 0; +; CHECK-NEXT: subc.cc.s64 %rd67, %rd117, 0; ; CHECK-NEXT: setp.eq.s64 %p8, %rd67, 0; ; CHECK-NEXT: setp.ne.s64 %p9, %rd67, 0; ; CHECK-NEXT: selp.u32 %r5, -1, 0, %p9; @@ -57,14 +57,14 @@ define i128 @srem_i128(i128 %lhs, i128 %rhs) { ; CHECK-NEXT: xor.b64 %rd68, %rd66, 127; ; CHECK-NEXT: or.b64 %rd69, %rd68, %rd67; ; CHECK-NEXT: setp.eq.s64 %p13, %rd69, 0; -; CHECK-NEXT: selp.b64 %rd128, 0, %rd4, %p12; -; CHECK-NEXT: selp.b64 %rd127, 0, %rd3, %p12; +; CHECK-NEXT: selp.b64 %rd126, 0, %rd4, %p12; +; CHECK-NEXT: selp.b64 %rd125, 0, %rd3, %p12; ; CHECK-NEXT: or.pred %p14, %p12, %p13; ; CHECK-NEXT: @%p14 bra $L__BB0_5; ; CHECK-NEXT: // %bb.3: // %udiv-bb1 -; CHECK-NEXT: add.cc.s64 %rd121, %rd66, 1; -; CHECK-NEXT: addc.cc.s64 %rd122, %rd67, 0; -; CHECK-NEXT: or.b64 %rd72, %rd121, %rd122; +; CHECK-NEXT: add.cc.s64 %rd119, %rd66, 1; +; CHECK-NEXT: addc.cc.s64 %rd120, %rd67, 0; +; CHECK-NEXT: or.b64 %rd72, %rd119, %rd120; ; CHECK-NEXT: setp.eq.s64 %p15, %rd72, 0; ; CHECK-NEXT: cvt.u32.u64 %r9, %rd66; ; CHECK-NEXT: sub.s32 %r10, 127, %r9; @@ -75,12 +75,12 @@ define i128 @srem_i128(i128 %lhs, i128 %rhs) { ; CHECK-NEXT: sub.s32 %r12, 63, %r9; ; CHECK-NEXT: shl.b64 %rd76, %rd3, %r12; ; CHECK-NEXT: setp.gt.s32 %p16, %r10, 63; -; CHECK-NEXT: selp.b64 %rd126, %rd76, %rd75, %p16; -; CHECK-NEXT: shl.b64 %rd125, %rd3, %r10; -; CHECK-NEXT: mov.u64 %rd116, %rd119; +; CHECK-NEXT: selp.b64 %rd124, %rd76, %rd75, %p16; +; CHECK-NEXT: shl.b64 %rd123, %rd3, %r10; +; CHECK-NEXT: mov.u64 %rd114, %rd117; ; CHECK-NEXT: @%p15 bra $L__BB0_4; ; CHECK-NEXT: // %bb.1: // %udiv-preheader -; CHECK-NEXT: cvt.u32.u64 %r13, %rd121; +; CHECK-NEXT: cvt.u32.u64 %r13, %rd119; ; CHECK-NEXT: shr.u64 %rd79, %rd3, %r13; ; CHECK-NEXT: sub.s32 %r14, 64, %r13; ; CHECK-NEXT: shl.b64 %rd80, %rd4, %r14; @@ -88,61 +88,59 @@ define i128 @srem_i128(i128 %lhs, i128 %rhs) { ; CHECK-NEXT: add.s32 %r15, %r13, -64; ; CHECK-NEXT: shr.u64 %rd82, %rd4, %r15; ; 
CHECK-NEXT: setp.gt.s32 %p17, %r13, 63; -; CHECK-NEXT: selp.b64 %rd123, %rd82, %rd81, %p17; -; CHECK-NEXT: shr.u64 %rd124, %rd4, %r13; +; CHECK-NEXT: selp.b64 %rd121, %rd82, %rd81, %p17; +; CHECK-NEXT: shr.u64 %rd122, %rd4, %r13; ; CHECK-NEXT: add.cc.s64 %rd35, %rd5, -1; ; CHECK-NEXT: addc.cc.s64 %rd36, %rd6, -1; -; CHECK-NEXT: mov.b64 %rd116, 0; -; CHECK-NEXT: mov.u64 %rd119, %rd116; +; CHECK-NEXT: mov.b64 %rd114, 0; +; CHECK-NEXT: mov.u64 %rd117, %rd114; ; CHECK-NEXT: $L__BB0_2: // %udiv-do-while ; CHECK-NEXT: // =>This Inner Loop Header: Depth=1 -; CHECK-NEXT: shr.u64 %rd83, %rd123, 63; -; CHECK-NEXT: shl.b64 %rd84, %rd124, 1; +; CHECK-NEXT: shr.u64 %rd83, %rd121, 63; +; CHECK-NEXT: shl.b64 %rd84, %rd122, 1; ; CHECK-NEXT: or.b64 %rd85, %rd84, %rd83; -; CHECK-NEXT: shl.b64 %rd86, %rd123, 1; -; CHECK-NEXT: shr.u64 %rd87, %rd126, 63; +; CHECK-NEXT: shl.b64 %rd86, %rd121, 1; +; CHECK-NEXT: shr.u64 %rd87, %rd124, 63; ; CHECK-NEXT: or.b64 %rd88, %rd86, %rd87; -; CHECK-NEXT: shr.u64 %rd89, %rd125, 63; -; CHECK-NEXT: shl.b64 %rd90, %rd126, 1; +; CHECK-NEXT: shr.u64 %rd89, %rd123, 63; +; CHECK-NEXT: shl.b64 %rd90, %rd124, 1; ; CHECK-NEXT: or.b64 %rd91, %rd90, %rd89; -; CHECK-NEXT: shl.b64 %rd92, %rd125, 1; -; CHECK-NEXT: or.b64 %rd125, %rd119, %rd92; -; CHECK-NEXT: or.b64 %rd126, %rd116, %rd91; +; CHECK-NEXT: shl.b64 %rd92, %rd123, 1; +; CHECK-NEXT: or.b64 %rd123, %rd117, %rd92; +; CHECK-NEXT: or.b64 %rd124, %rd114, %rd91; ; CHECK-NEXT: sub.cc.s64 %rd93, %rd35, %rd88; ; CHECK-NEXT: subc.cc.s64 %rd94, %rd36, %rd85; ; CHECK-NEXT: shr.s64 %rd95, %rd94, 63; -; CHECK-NEXT: and.b64 %rd119, %rd95, 1; +; CHECK-NEXT: and.b64 %rd117, %rd95, 1; ; CHECK-NEXT: and.b64 %rd96, %rd95, %rd5; ; CHECK-NEXT: and.b64 %rd97, %rd95, %rd6; -; CHECK-NEXT: sub.cc.s64 %rd123, %rd88, %rd96; -; CHECK-NEXT: subc.cc.s64 %rd124, %rd85, %rd97; -; CHECK-NEXT: add.cc.s64 %rd121, %rd121, -1; -; CHECK-NEXT: addc.cc.s64 %rd122, %rd122, -1; -; CHECK-NEXT: or.b64 %rd98, %rd121, %rd122; +; CHECK-NEXT: 
sub.cc.s64 %rd121, %rd88, %rd96; +; CHECK-NEXT: subc.cc.s64 %rd122, %rd85, %rd97; +; CHECK-NEXT: add.cc.s64 %rd119, %rd119, -1; +; CHECK-NEXT: addc.cc.s64 %rd120, %rd120, -1; +; CHECK-NEXT: or.b64 %rd98, %rd119, %rd120; ; CHECK-NEXT: setp.eq.s64 %p18, %rd98, 0; ; CHECK-NEXT: @%p18 bra $L__BB0_4; ; CHECK-NEXT: bra.uni $L__BB0_2; ; CHECK-NEXT: $L__BB0_4: // %udiv-loop-exit -; CHECK-NEXT: shr.u64 %rd99, %rd125, 63; -; CHECK-NEXT: shl.b64 %rd100, %rd126, 1; +; CHECK-NEXT: shr.u64 %rd99, %rd123, 63; +; CHECK-NEXT: shl.b64 %rd100, %rd124, 1; ; CHECK-NEXT: or.b64 %rd101, %rd100, %rd99; -; CHECK-NEXT: shl.b64 %rd102, %rd125, 1; -; CHECK-NEXT: or.b64 %rd127, %rd119, %rd102; -; CHECK-NEXT: or.b64 %rd128, %rd116, %rd101; +; CHECK-NEXT: shl.b64 %rd102, %rd123, 1; +; CHECK-NEXT: or.b64 %rd125, %rd117, %rd102; +; CHECK-NEXT: or.b64 %rd126, %rd114, %rd101; ; CHECK-NEXT: $L__BB0_5: // %udiv-end -; CHECK-NEXT: mul.hi.u64 %rd103, %rd5, %rd127; -; CHECK-NEXT: mul.lo.s64 %rd104, %rd5, %rd128; -; CHECK-NEXT: add.s64 %rd105, %rd103, %rd104; -; CHECK-NEXT: mul.lo.s64 %rd106, %rd6, %rd127; -; CHECK-NEXT: add.s64 %rd107, %rd105, %rd106; -; CHECK-NEXT: mul.lo.s64 %rd108, %rd5, %rd127; -; CHECK-NEXT: sub.cc.s64 %rd109, %rd3, %rd108; -; CHECK-NEXT: subc.cc.s64 %rd110, %rd4, %rd107; -; CHECK-NEXT: xor.b64 %rd111, %rd109, %rd2; -; CHECK-NEXT: xor.b64 %rd112, %rd110, %rd2; -; CHECK-NEXT: sub.cc.s64 %rd113, %rd111, %rd2; -; CHECK-NEXT: subc.cc.s64 %rd114, %rd112, %rd2; -; CHECK-NEXT: st.param.v2.b64 [func_retval0], {%rd113, %rd114}; +; CHECK-NEXT: mul.hi.u64 %rd103, %rd5, %rd125; +; CHECK-NEXT: mad.lo.s64 %rd104, %rd5, %rd126, %rd103; +; CHECK-NEXT: mad.lo.s64 %rd105, %rd6, %rd125, %rd104; +; CHECK-NEXT: mul.lo.s64 %rd106, %rd5, %rd125; +; CHECK-NEXT: sub.cc.s64 %rd107, %rd3, %rd106; +; CHECK-NEXT: subc.cc.s64 %rd108, %rd4, %rd105; +; CHECK-NEXT: xor.b64 %rd109, %rd107, %rd2; +; CHECK-NEXT: xor.b64 %rd110, %rd108, %rd2; +; CHECK-NEXT: sub.cc.s64 %rd111, %rd109, %rd2; +; CHECK-NEXT: subc.cc.s64 
%rd112, %rd110, %rd2; +; CHECK-NEXT: st.param.v2.b64 [func_retval0], {%rd111, %rd112}; ; CHECK-NEXT: ret; %div = srem i128 %lhs, %rhs ret i128 %div @@ -153,7 +151,7 @@ define i128 @urem_i128(i128 %lhs, i128 %rhs) { ; CHECK: { ; CHECK-NEXT: .reg .pred %p<17>; ; CHECK-NEXT: .reg .b32 %r<16>; -; CHECK-NEXT: .reg .b64 %rd<115>; +; CHECK-NEXT: .reg .b64 %rd<113>; ; CHECK-EMPTY: ; CHECK-NEXT: // %bb.0: // %_udiv-special-cases ; CHECK-NEXT: ld.param.v2.u64 {%rd41, %rd42}, [urem_i128_param_0]; @@ -177,9 +175,9 @@ define i128 @urem_i128(i128 %lhs, i128 %rhs) { ; CHECK-NEXT: cvt.u64.u32 %rd52, %r4; ; CHECK-NEXT: add.s64 %rd53, %rd52, 64; ; CHECK-NEXT: selp.b64 %rd54, %rd51, %rd53, %p5; -; CHECK-NEXT: mov.b64 %rd105, 0; +; CHECK-NEXT: mov.b64 %rd103, 0; ; CHECK-NEXT: sub.cc.s64 %rd56, %rd50, %rd54; -; CHECK-NEXT: subc.cc.s64 %rd57, %rd105, 0; +; CHECK-NEXT: subc.cc.s64 %rd57, %rd103, 0; ; CHECK-NEXT: setp.eq.s64 %p6, %rd57, 0; ; CHECK-NEXT: setp.ne.s64 %p7, %rd57, 0; ; CHECK-NEXT: selp.u32 %r5, -1, 0, %p7; @@ -192,14 +190,14 @@ define i128 @urem_i128(i128 %lhs, i128 %rhs) { ; CHECK-NEXT: xor.b64 %rd58, %rd56, 127; ; CHECK-NEXT: or.b64 %rd59, %rd58, %rd57; ; CHECK-NEXT: setp.eq.s64 %p11, %rd59, 0; -; CHECK-NEXT: selp.b64 %rd114, 0, %rd42, %p10; -; CHECK-NEXT: selp.b64 %rd113, 0, %rd41, %p10; +; CHECK-NEXT: selp.b64 %rd112, 0, %rd42, %p10; +; CHECK-NEXT: selp.b64 %rd111, 0, %rd41, %p10; ; CHECK-NEXT: or.pred %p12, %p10, %p11; ; CHECK-NEXT: @%p12 bra $L__BB1_5; ; CHECK-NEXT: // %bb.3: // %udiv-bb1 -; CHECK-NEXT: add.cc.s64 %rd107, %rd56, 1; -; CHECK-NEXT: addc.cc.s64 %rd108, %rd57, 0; -; CHECK-NEXT: or.b64 %rd62, %rd107, %rd108; +; CHECK-NEXT: add.cc.s64 %rd105, %rd56, 1; +; CHECK-NEXT: addc.cc.s64 %rd106, %rd57, 0; +; CHECK-NEXT: or.b64 %rd62, %rd105, %rd106; ; CHECK-NEXT: setp.eq.s64 %p13, %rd62, 0; ; CHECK-NEXT: cvt.u32.u64 %r9, %rd56; ; CHECK-NEXT: sub.s32 %r10, 127, %r9; @@ -210,12 +208,12 @@ define i128 @urem_i128(i128 %lhs, i128 %rhs) { ; CHECK-NEXT: sub.s32 %r12, 63, 
%r9; ; CHECK-NEXT: shl.b64 %rd66, %rd41, %r12; ; CHECK-NEXT: setp.gt.s32 %p14, %r10, 63; -; CHECK-NEXT: selp.b64 %rd112, %rd66, %rd65, %p14; -; CHECK-NEXT: shl.b64 %rd111, %rd41, %r10; -; CHECK-NEXT: mov.u64 %rd102, %rd105; +; CHECK-NEXT: selp.b64 %rd110, %rd66, %rd65, %p14; +; CHECK-NEXT: shl.b64 %rd109, %rd41, %r10; +; CHECK-NEXT: mov.u64 %rd100, %rd103; ; CHECK-NEXT: @%p13 bra $L__BB1_4; ; CHECK-NEXT: // %bb.1: // %udiv-preheader -; CHECK-NEXT: cvt.u32.u64 %r13, %rd107; +; CHECK-NEXT: cvt.u32.u64 %r13, %rd105; ; CHECK-NEXT: shr.u64 %rd69, %rd41, %r13; ; CHECK-NEXT: sub.s32 %r14, 64, %r13; ; CHECK-NEXT: shl.b64 %rd70, %rd42, %r14; @@ -223,57 +221,55 @@ define i128 @urem_i128(i128 %lhs, i128 %rhs) { ; CHECK-NEXT: add.s32 %r15, %r13, -64; ; CHECK-NEXT: shr.u64 %rd72, %rd42, %r15; ; CHECK-NEXT: setp.gt.s32 %p15, %r13, 63; -; CHECK-NEXT: selp.b64 %rd109, %rd72, %rd71, %p15; -; CHECK-NEXT: shr.u64 %rd110, %rd42, %r13; +; CHECK-NEXT: selp.b64 %rd107, %rd72, %rd71, %p15; +; CHECK-NEXT: shr.u64 %rd108, %rd42, %r13; ; CHECK-NEXT: add.cc.s64 %rd33, %rd3, -1; ; CHECK-NEXT: addc.cc.s64 %rd34, %rd4, -1; -; CHECK-NEXT: mov.b64 %rd102, 0; -; CHECK-NEXT: mov.u64 %rd105, %rd102; +; CHECK-NEXT: mov.b64 %rd100, 0; +; CHECK-NEXT: mov.u64 %rd103, %rd100; ; CHECK-NEXT: $L__BB1_2: // %udiv-do-while ; CHECK-NEXT: // =>This Inner Loop Header: Depth=1 -; CHECK-NEXT: shr.u64 %rd73, %rd109, 63; -; CHECK-NEXT: shl.b64 %rd74, %rd110, 1; +; CHECK-NEXT: shr.u64 %rd73, %rd107, 63; +; CHECK-NEXT: shl.b64 %rd74, %rd108, 1; ; CHECK-NEXT: or.b64 %rd75, %rd74, %rd73; -; CHECK-NEXT: shl.b64 %rd76, %rd109, 1; -; CHECK-NEXT: shr.u64 %rd77, %rd112, 63; +; CHECK-NEXT: shl.b64 %rd76, %rd107, 1; +; CHECK-NEXT: shr.u64 %rd77, %rd110, 63; ; CHECK-NEXT: or.b64 %rd78, %rd76, %rd77; -; CHECK-NEXT: shr.u64 %rd79, %rd111, 63; -; CHECK-NEXT: shl.b64 %rd80, %rd112, 1; +; CHECK-NEXT: shr.u64 %rd79, %rd109, 63; +; CHECK-NEXT: shl.b64 %rd80, %rd110, 1; ; CHECK-NEXT: or.b64 %rd81, %rd80, %rd79; -; CHECK-NEXT: shl.b64 
%rd82, %rd111, 1; -; CHECK-NEXT: or.b64 %rd111, %rd105, %rd82; -; CHECK-NEXT: or.b64 %rd112, %rd102, %rd81; +; CHECK-NEXT: shl.b64 %rd82, %rd109, 1; +; CHECK-NEXT: or.b64 %rd109, %rd103, %rd82; +; CHECK-NEXT: or.b64 %rd110, %rd100, %rd81; ; CHECK-NEXT: sub.cc.s64 %rd83, %rd33, %rd78; ; CHECK-NEXT: subc.cc.s64 %rd84, %rd34, %rd75; ; CHECK-NEXT: shr.s64 %rd85, %rd84, 63; -; CHECK-NEXT: and.b64 %rd105, %rd85, 1; +; CHECK-NEXT: and.b64 %rd103, %rd85, 1; ; CHECK-NEXT: and.b64 %rd86, %rd85, %rd3; ; CHECK-NEXT: and.b64 %rd87, %rd85, %rd4; -; CHECK-NEXT: sub.cc.s64 %rd109, %rd78, %rd86; -; CHECK-NEXT: subc.cc.s64 %rd110, %rd75, %rd87; -; CHECK-NEXT: add.cc.s64 %rd107, %rd107, -1; -; CHECK-NEXT: addc.cc.s64 %rd108, %rd108, -1; -; CHECK-NEXT: or.b64 %rd88, %rd107, %rd108; +; CHECK-NEXT: sub.cc.s64 %rd107, %rd78, %rd86; +; CHECK-NEXT: subc.cc.s64 %rd108, %rd75, %rd87; +; CHECK-NEXT: add.cc.s64 %rd105, %rd105, -1; +; CHECK-NEXT: addc.cc.s64 %rd106, %rd106, -1; +; CHECK-NEXT: or.b64 %rd88, %rd105, %rd106; ; CHECK-NEXT: setp.eq.s64 %p16, %rd88, 0; ; CHECK-NEXT: @%p16 bra $L__BB1_4; ; CHECK-NEXT: bra.uni $L__BB1_2; ; CHECK-NEXT: $L__BB1_4: // %udiv-loop-exit -; CHECK-NEXT: shr.u64 %rd89, %rd111, 63; -; CHECK-NEXT: shl.b64 %rd90, %rd112, 1; +; CHECK-NEXT: shr.u64 %rd89, %rd109, 63; +; CHECK-NEXT: shl.b64 %rd90, %rd110, 1; ; CHECK-NEXT: or.b64 %rd91, %rd90, %rd89; -; CHECK-NEXT: shl.b64 %rd92, %rd111, 1; -; CHECK-NEXT: or.b64 %rd113, %rd105, %rd92; -; CHECK-NEXT: or.b64 %rd114, %rd102, %rd91; +; CHECK-NEXT: shl.b64 %rd92, %rd109, 1; +; CHECK-NEXT: or.b64 %rd111, %rd103, %rd92; +; CHECK-NEXT: or.b64 %rd112, %rd100, %rd91; ; CHECK-NEXT: $L__BB1_5: // %udiv-end -; CHECK-NEXT: mul.hi.u64 %rd93, %rd3, %rd113; -; CHECK-NEXT: mul.lo.s64 %rd94, %rd3, %rd114; -; CHECK-NEXT: add.s64 %rd95, %rd93, %rd94; -; CHECK-NEXT: mul.lo.s64 %rd96, %rd4, %rd113; -; CHECK-NEXT: add.s64 %rd97, %rd95, %rd96; -; CHECK-NEXT: mul.lo.s64 %rd98, %rd3, %rd113; -; CHECK-NEXT: sub.cc.s64 %rd99, %rd41, %rd98; -; 
CHECK-NEXT: subc.cc.s64 %rd100, %rd42, %rd97; -; CHECK-NEXT: st.param.v2.b64 [func_retval0], {%rd99, %rd100}; +; CHECK-NEXT: mul.hi.u64 %rd93, %rd3, %rd111; +; CHECK-NEXT: mad.lo.s64 %rd94, %rd3, %rd112, %rd93; +; CHECK-NEXT: mad.lo.s64 %rd95, %rd4, %rd111, %rd94; +; CHECK-NEXT: mul.lo.s64 %rd96, %rd3, %rd111; +; CHECK-NEXT: sub.cc.s64 %rd97, %rd41, %rd96; +; CHECK-NEXT: subc.cc.s64 %rd98, %rd42, %rd95; +; CHECK-NEXT: st.param.v2.b64 [func_retval0], {%rd97, %rd98}; ; CHECK-NEXT: ret; %div = urem i128 %lhs, %rhs ret i128 %div diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-int-shuffles.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-int-shuffles.ll index 2ffb079e83b0b..8b26c58d5bee1 100644 --- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-int-shuffles.ll +++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-int-shuffles.ll @@ -1133,233 +1133,210 @@ define <16 x i32> @shuffle_disjoint_lanes_one_splat(i32 %v, <16 x i32> %w) { define <4 x i128> @shuffle_i128(<4 x i128> %a) { ; RV32-LABEL: shuffle_i128: ; RV32: # %bb.0: -; RV32-NEXT: lw a2, 0(a1) -; RV32-NEXT: lw a3, 4(a1) -; RV32-NEXT: lw a4, 8(a1) -; RV32-NEXT: lw a5, 12(a1) -; RV32-NEXT: lw a6, 48(a1) -; RV32-NEXT: lw a7, 52(a1) -; RV32-NEXT: lw t0, 56(a1) -; RV32-NEXT: lw t1, 60(a1) -; RV32-NEXT: lw t2, 32(a1) -; RV32-NEXT: lw t3, 36(a1) -; RV32-NEXT: lw t4, 40(a1) -; RV32-NEXT: lw a1, 44(a1) -; RV32-NEXT: sw t2, 48(a0) -; RV32-NEXT: sw t3, 52(a0) -; RV32-NEXT: sw t4, 56(a0) -; RV32-NEXT: sw a1, 60(a0) -; RV32-NEXT: sw a6, 32(a0) -; RV32-NEXT: sw a7, 36(a0) -; RV32-NEXT: sw t0, 40(a0) -; RV32-NEXT: sw t1, 44(a0) -; RV32-NEXT: sw a2, 16(a0) -; RV32-NEXT: sw a3, 20(a0) -; RV32-NEXT: sw a4, 24(a0) -; RV32-NEXT: sw a5, 28(a0) -; RV32-NEXT: sw a2, 0(a0) -; RV32-NEXT: sw a3, 4(a0) -; RV32-NEXT: sw a4, 8(a0) -; RV32-NEXT: sw a5, 12(a0) +; RV32-NEXT: addi sp, sp, -128 +; RV32-NEXT: .cfi_def_cfa_offset 128 +; RV32-NEXT: sw ra, 124(sp) # 4-byte Folded Spill +; RV32-NEXT: sw s0, 120(sp) # 4-byte Folded Spill +; RV32-NEXT: 
.cfi_offset ra, -4 +; RV32-NEXT: .cfi_offset s0, -8 +; RV32-NEXT: addi s0, sp, 128 +; RV32-NEXT: .cfi_def_cfa s0, 0 +; RV32-NEXT: andi sp, sp, -64 +; RV32-NEXT: lw a2, 60(a1) +; RV32-NEXT: sw a2, 60(sp) +; RV32-NEXT: lw a2, 56(a1) +; RV32-NEXT: sw a2, 56(sp) +; RV32-NEXT: lw a2, 52(a1) +; RV32-NEXT: sw a2, 52(sp) +; RV32-NEXT: lw a2, 48(a1) +; RV32-NEXT: sw a2, 48(sp) +; RV32-NEXT: lw a2, 44(a1) +; RV32-NEXT: sw a2, 44(sp) +; RV32-NEXT: lw a2, 40(a1) +; RV32-NEXT: sw a2, 40(sp) +; RV32-NEXT: lw a2, 36(a1) +; RV32-NEXT: sw a2, 36(sp) +; RV32-NEXT: lw a2, 32(a1) +; RV32-NEXT: sw a2, 32(sp) +; RV32-NEXT: lw a2, 12(a1) +; RV32-NEXT: sw a2, 12(sp) +; RV32-NEXT: lw a2, 8(a1) +; RV32-NEXT: sw a2, 8(sp) +; RV32-NEXT: lw a2, 4(a1) +; RV32-NEXT: sw a2, 4(sp) +; RV32-NEXT: lw a1, 0(a1) +; RV32-NEXT: mv a2, sp +; RV32-NEXT: sw a1, 0(sp) +; RV32-NEXT: lui a1, %hi(.LCPI78_0) +; RV32-NEXT: addi a1, a1, %lo(.LCPI78_0) +; RV32-NEXT: vsetivli zero, 16, e32, m4, ta, ma +; RV32-NEXT: vle32.v v8, (a2) +; RV32-NEXT: vsetivli zero, 8, e64, m4, ta, ma +; RV32-NEXT: vle16.v v12, (a1) +; RV32-NEXT: vrgatherei16.vv v16, v8, v12 +; RV32-NEXT: vse64.v v16, (a0) +; RV32-NEXT: addi sp, s0, -128 +; RV32-NEXT: .cfi_def_cfa sp, 128 +; RV32-NEXT: lw ra, 124(sp) # 4-byte Folded Reload +; RV32-NEXT: lw s0, 120(sp) # 4-byte Folded Reload +; RV32-NEXT: .cfi_restore ra +; RV32-NEXT: .cfi_restore s0 +; RV32-NEXT: addi sp, sp, 128 +; RV32-NEXT: .cfi_def_cfa_offset 0 ; RV32-NEXT: ret ; ; RV64-LABEL: shuffle_i128: ; RV64: # %bb.0: +; RV64-NEXT: addi sp, sp, -128 +; RV64-NEXT: .cfi_def_cfa_offset 128 +; RV64-NEXT: sd ra, 120(sp) # 8-byte Folded Spill +; RV64-NEXT: sd s0, 112(sp) # 8-byte Folded Spill +; RV64-NEXT: .cfi_offset ra, -8 +; RV64-NEXT: .cfi_offset s0, -16 +; RV64-NEXT: addi s0, sp, 128 +; RV64-NEXT: .cfi_def_cfa s0, 0 +; RV64-NEXT: andi sp, sp, -64 +; RV64-NEXT: ld a2, 56(a1) +; RV64-NEXT: sd a2, 56(sp) ; RV64-NEXT: ld a2, 48(a1) -; RV64-NEXT: ld a3, 56(a1) -; RV64-NEXT: ld a4, 0(a1) -; RV64-NEXT: 
ld a5, 8(a1) -; RV64-NEXT: ld a6, 32(a1) -; RV64-NEXT: ld a1, 40(a1) -; RV64-NEXT: sd a2, 32(a0) -; RV64-NEXT: sd a3, 40(a0) -; RV64-NEXT: sd a6, 48(a0) -; RV64-NEXT: sd a1, 56(a0) -; RV64-NEXT: sd a4, 0(a0) -; RV64-NEXT: sd a5, 8(a0) -; RV64-NEXT: sd a4, 16(a0) -; RV64-NEXT: sd a5, 24(a0) +; RV64-NEXT: sd a2, 48(sp) +; RV64-NEXT: ld a2, 40(a1) +; RV64-NEXT: sd a2, 40(sp) +; RV64-NEXT: ld a2, 32(a1) +; RV64-NEXT: sd a2, 32(sp) +; RV64-NEXT: ld a2, 8(a1) +; RV64-NEXT: sd a2, 8(sp) +; RV64-NEXT: ld a1, 0(a1) +; RV64-NEXT: mv a2, sp +; RV64-NEXT: sd a1, 0(sp) +; RV64-NEXT: lui a1, %hi(.LCPI78_0) +; RV64-NEXT: addi a1, a1, %lo(.LCPI78_0) +; RV64-NEXT: vsetivli zero, 8, e64, m4, ta, ma +; RV64-NEXT: vle64.v v8, (a2) +; RV64-NEXT: vle16.v v12, (a1) +; RV64-NEXT: vrgatherei16.vv v16, v8, v12 +; RV64-NEXT: vse64.v v16, (a0) +; RV64-NEXT: addi sp, s0, -128 +; RV64-NEXT: .cfi_def_cfa sp, 128 +; RV64-NEXT: ld ra, 120(sp) # 8-byte Folded Reload +; RV64-NEXT: ld s0, 112(sp) # 8-byte Folded Reload +; RV64-NEXT: .cfi_restore ra +; RV64-NEXT: .cfi_restore s0 +; RV64-NEXT: addi sp, sp, 128 +; RV64-NEXT: .cfi_def_cfa_offset 0 ; RV64-NEXT: ret %res = shufflevector <4 x i128> %a, <4 x i128> poison, <4 x i32> ret <4 x i128> %res } define void @shuffle_i128_ldst(ptr %p) { -; RV32-LABEL: shuffle_i128_ldst: +; CHECK-LABEL: shuffle_i128_ldst: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetivli zero, 8, e64, m4, ta, ma +; CHECK-NEXT: vle64.v v8, (a0) +; CHECK-NEXT: lui a1, %hi(.LCPI79_0) +; CHECK-NEXT: addi a1, a1, %lo(.LCPI79_0) +; CHECK-NEXT: vle16.v v12, (a1) +; CHECK-NEXT: vrgatherei16.vv v16, v8, v12 +; CHECK-NEXT: vse64.v v16, (a0) +; CHECK-NEXT: ret + %a = load <4 x i128>, ptr %p + %res = shufflevector <4 x i128> %a, <4 x i128> poison, <4 x i32> + store <4 x i128> %res, ptr %p + ret void +} + +define void @shuffle_i256_ldst(ptr %p) { +; CHECK-LABEL: shuffle_i256_ldst: +; CHECK: # %bb.0: +; CHECK-NEXT: lui a1, %hi(.LCPI80_0) +; CHECK-NEXT: addi a1, a1, %lo(.LCPI80_0) +; CHECK-NEXT: vsetivli 
zero, 16, e16, m2, ta, ma +; CHECK-NEXT: vle8.v v8, (a1) +; CHECK-NEXT: vle64.v v16, (a0) +; CHECK-NEXT: vsext.vf2 v10, v8 +; CHECK-NEXT: vsetvli zero, zero, e64, m8, ta, ma +; CHECK-NEXT: vrgatherei16.vv v24, v16, v10 +; CHECK-NEXT: vse64.v v24, (a0) +; CHECK-NEXT: ret + %a = load <4 x i256>, ptr %p + %res = shufflevector <4 x i256> %a, <4 x i256> poison, <4 x i32> + store <4 x i256> %res, ptr %p + ret void +} + +define void @shuffle_i64_splat(ptr %p) nounwind { +; RV32-LABEL: shuffle_i64_splat: ; RV32: # %bb.0: -; RV32-NEXT: lw a1, 48(a0) -; RV32-NEXT: lw a2, 52(a0) -; RV32-NEXT: lw a3, 56(a0) -; RV32-NEXT: lw a4, 60(a0) -; RV32-NEXT: lw a5, 0(a0) -; RV32-NEXT: lw a6, 4(a0) -; RV32-NEXT: lw a7, 8(a0) -; RV32-NEXT: lw t0, 12(a0) -; RV32-NEXT: lw t1, 32(a0) -; RV32-NEXT: lw t2, 36(a0) -; RV32-NEXT: lw t3, 40(a0) -; RV32-NEXT: lw t4, 44(a0) -; RV32-NEXT: sw t1, 48(a0) -; RV32-NEXT: sw t2, 52(a0) -; RV32-NEXT: sw t3, 56(a0) -; RV32-NEXT: sw t4, 60(a0) -; RV32-NEXT: sw a5, 16(a0) -; RV32-NEXT: sw a6, 20(a0) -; RV32-NEXT: sw a7, 24(a0) -; RV32-NEXT: sw t0, 28(a0) -; RV32-NEXT: sw a1, 32(a0) -; RV32-NEXT: sw a2, 36(a0) -; RV32-NEXT: sw a3, 40(a0) -; RV32-NEXT: sw a4, 44(a0) +; RV32-NEXT: vsetivli zero, 4, e64, m2, ta, ma +; RV32-NEXT: vlse64.v v8, (a0), zero +; RV32-NEXT: vse64.v v8, (a0) ; RV32-NEXT: ret ; -; RV64-LABEL: shuffle_i128_ldst: +; RV64-LABEL: shuffle_i64_splat: ; RV64: # %bb.0: ; RV64-NEXT: ld a1, 0(a0) -; RV64-NEXT: ld a2, 8(a0) -; RV64-NEXT: ld a3, 32(a0) -; RV64-NEXT: ld a4, 40(a0) -; RV64-NEXT: ld a5, 48(a0) -; RV64-NEXT: ld a6, 56(a0) -; RV64-NEXT: sd a3, 48(a0) -; RV64-NEXT: sd a4, 56(a0) -; RV64-NEXT: sd a1, 16(a0) -; RV64-NEXT: sd a2, 24(a0) -; RV64-NEXT: sd a5, 32(a0) -; RV64-NEXT: sd a6, 40(a0) +; RV64-NEXT: vsetivli zero, 4, e64, m2, ta, ma +; RV64-NEXT: vmv.v.x v8, a1 +; RV64-NEXT: vse64.v v8, (a0) ; RV64-NEXT: ret + %a = load <4 x i64>, ptr %p + %res = shufflevector <4 x i64> %a, <4 x i64> poison, <4 x i32> + store <4 x i64> %res, ptr %p + ret 
void +} + +define void @shuffle_i128_splat(ptr %p) nounwind { +; CHECK-LABEL: shuffle_i128_splat: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetivli zero, 8, e64, m4, ta, ma +; CHECK-NEXT: vle64.v v8, (a0) +; CHECK-NEXT: lui a1, 16 +; CHECK-NEXT: vsetivli zero, 4, e32, m1, ta, ma +; CHECK-NEXT: vmv.v.x v12, a1 +; CHECK-NEXT: vsetivli zero, 8, e64, m4, ta, ma +; CHECK-NEXT: vrgatherei16.vv v16, v8, v12 +; CHECK-NEXT: vse64.v v16, (a0) +; CHECK-NEXT: ret %a = load <4 x i128>, ptr %p - %res = shufflevector <4 x i128> %a, <4 x i128> poison, <4 x i32> + %res = shufflevector <4 x i128> %a, <4 x i128> poison, <4 x i32> store <4 x i128> %res, ptr %p ret void } -define void @shuffle_i256_ldst(ptr %p) { -; RV32-LABEL: shuffle_i256_ldst: +define void @shuffle_i256_splat(ptr %p) nounwind { +; RV32-LABEL: shuffle_i256_splat: ; RV32: # %bb.0: -; RV32-NEXT: addi sp, sp, -48 -; RV32-NEXT: .cfi_def_cfa_offset 48 -; RV32-NEXT: sw s0, 44(sp) # 4-byte Folded Spill -; RV32-NEXT: sw s1, 40(sp) # 4-byte Folded Spill -; RV32-NEXT: sw s2, 36(sp) # 4-byte Folded Spill -; RV32-NEXT: sw s3, 32(sp) # 4-byte Folded Spill -; RV32-NEXT: sw s4, 28(sp) # 4-byte Folded Spill -; RV32-NEXT: sw s5, 24(sp) # 4-byte Folded Spill -; RV32-NEXT: sw s6, 20(sp) # 4-byte Folded Spill -; RV32-NEXT: sw s7, 16(sp) # 4-byte Folded Spill -; RV32-NEXT: sw s8, 12(sp) # 4-byte Folded Spill -; RV32-NEXT: sw s9, 8(sp) # 4-byte Folded Spill -; RV32-NEXT: .cfi_offset s0, -4 -; RV32-NEXT: .cfi_offset s1, -8 -; RV32-NEXT: .cfi_offset s2, -12 -; RV32-NEXT: .cfi_offset s3, -16 -; RV32-NEXT: .cfi_offset s4, -20 -; RV32-NEXT: .cfi_offset s5, -24 -; RV32-NEXT: .cfi_offset s6, -28 -; RV32-NEXT: .cfi_offset s7, -32 -; RV32-NEXT: .cfi_offset s8, -36 -; RV32-NEXT: .cfi_offset s9, -40 -; RV32-NEXT: lw a1, 0(a0) -; RV32-NEXT: lw a2, 4(a0) -; RV32-NEXT: lw a3, 8(a0) -; RV32-NEXT: lw a4, 12(a0) -; RV32-NEXT: lw a5, 16(a0) -; RV32-NEXT: lw a6, 20(a0) -; RV32-NEXT: lw a7, 24(a0) -; RV32-NEXT: lw t0, 28(a0) -; RV32-NEXT: lw t1, 96(a0) -; 
RV32-NEXT: lw t2, 100(a0) -; RV32-NEXT: lw t3, 104(a0) -; RV32-NEXT: lw t4, 108(a0) -; RV32-NEXT: lw t5, 112(a0) -; RV32-NEXT: lw t6, 116(a0) -; RV32-NEXT: lw s0, 120(a0) -; RV32-NEXT: lw s1, 124(a0) -; RV32-NEXT: lw s2, 64(a0) -; RV32-NEXT: lw s3, 68(a0) -; RV32-NEXT: lw s4, 72(a0) -; RV32-NEXT: lw s5, 76(a0) -; RV32-NEXT: lw s6, 80(a0) -; RV32-NEXT: lw s7, 84(a0) -; RV32-NEXT: lw s8, 88(a0) -; RV32-NEXT: lw s9, 92(a0) -; RV32-NEXT: sw s6, 112(a0) -; RV32-NEXT: sw s7, 116(a0) -; RV32-NEXT: sw s8, 120(a0) -; RV32-NEXT: sw s9, 124(a0) -; RV32-NEXT: sw s2, 96(a0) -; RV32-NEXT: sw s3, 100(a0) -; RV32-NEXT: sw s4, 104(a0) -; RV32-NEXT: sw s5, 108(a0) -; RV32-NEXT: sw t5, 80(a0) -; RV32-NEXT: sw t6, 84(a0) -; RV32-NEXT: sw s0, 88(a0) -; RV32-NEXT: sw s1, 92(a0) -; RV32-NEXT: sw t1, 64(a0) -; RV32-NEXT: sw t2, 68(a0) -; RV32-NEXT: sw t3, 72(a0) -; RV32-NEXT: sw t4, 76(a0) -; RV32-NEXT: sw a5, 48(a0) -; RV32-NEXT: sw a6, 52(a0) -; RV32-NEXT: sw a7, 56(a0) -; RV32-NEXT: sw t0, 60(a0) -; RV32-NEXT: sw a1, 32(a0) -; RV32-NEXT: sw a2, 36(a0) -; RV32-NEXT: sw a3, 40(a0) -; RV32-NEXT: sw a4, 44(a0) -; RV32-NEXT: lw s0, 44(sp) # 4-byte Folded Reload -; RV32-NEXT: lw s1, 40(sp) # 4-byte Folded Reload -; RV32-NEXT: lw s2, 36(sp) # 4-byte Folded Reload -; RV32-NEXT: lw s3, 32(sp) # 4-byte Folded Reload -; RV32-NEXT: lw s4, 28(sp) # 4-byte Folded Reload -; RV32-NEXT: lw s5, 24(sp) # 4-byte Folded Reload -; RV32-NEXT: lw s6, 20(sp) # 4-byte Folded Reload -; RV32-NEXT: lw s7, 16(sp) # 4-byte Folded Reload -; RV32-NEXT: lw s8, 12(sp) # 4-byte Folded Reload -; RV32-NEXT: lw s9, 8(sp) # 4-byte Folded Reload -; RV32-NEXT: .cfi_restore s0 -; RV32-NEXT: .cfi_restore s1 -; RV32-NEXT: .cfi_restore s2 -; RV32-NEXT: .cfi_restore s3 -; RV32-NEXT: .cfi_restore s4 -; RV32-NEXT: .cfi_restore s5 -; RV32-NEXT: .cfi_restore s6 -; RV32-NEXT: .cfi_restore s7 -; RV32-NEXT: .cfi_restore s8 -; RV32-NEXT: .cfi_restore s9 -; RV32-NEXT: addi sp, sp, 48 -; RV32-NEXT: .cfi_def_cfa_offset 0 +; RV32-NEXT: 
vsetivli zero, 16, e64, m8, ta, ma +; RV32-NEXT: vle64.v v8, (a0) +; RV32-NEXT: lui a1, 12320 +; RV32-NEXT: addi a1, a1, 256 +; RV32-NEXT: vsetivli zero, 4, e32, m1, ta, ma +; RV32-NEXT: vmv.v.x v16, a1 +; RV32-NEXT: vsetivli zero, 16, e16, m2, ta, ma +; RV32-NEXT: vsext.vf2 v18, v16 +; RV32-NEXT: vsetvli zero, zero, e64, m8, ta, ma +; RV32-NEXT: vrgatherei16.vv v24, v8, v18 +; RV32-NEXT: vse64.v v24, (a0) ; RV32-NEXT: ret ; -; RV64-LABEL: shuffle_i256_ldst: +; RV64-LABEL: shuffle_i256_splat: ; RV64: # %bb.0: -; RV64-NEXT: ld a1, 96(a0) -; RV64-NEXT: ld a2, 104(a0) -; RV64-NEXT: ld a3, 112(a0) -; RV64-NEXT: ld a4, 120(a0) -; RV64-NEXT: ld a5, 0(a0) -; RV64-NEXT: ld a6, 8(a0) -; RV64-NEXT: ld a7, 16(a0) -; RV64-NEXT: ld t0, 24(a0) -; RV64-NEXT: ld t1, 64(a0) -; RV64-NEXT: ld t2, 72(a0) -; RV64-NEXT: ld t3, 80(a0) -; RV64-NEXT: ld t4, 88(a0) -; RV64-NEXT: sd t1, 96(a0) -; RV64-NEXT: sd t2, 104(a0) -; RV64-NEXT: sd t3, 112(a0) -; RV64-NEXT: sd t4, 120(a0) -; RV64-NEXT: sd a5, 32(a0) -; RV64-NEXT: sd a6, 40(a0) -; RV64-NEXT: sd a7, 48(a0) -; RV64-NEXT: sd t0, 56(a0) -; RV64-NEXT: sd a1, 64(a0) -; RV64-NEXT: sd a2, 72(a0) -; RV64-NEXT: sd a3, 80(a0) -; RV64-NEXT: sd a4, 88(a0) +; RV64-NEXT: vsetivli zero, 16, e64, m8, ta, ma +; RV64-NEXT: vle64.v v8, (a0) +; RV64-NEXT: lui a1, 98305 +; RV64-NEXT: slli a1, a1, 5 +; RV64-NEXT: addi a1, a1, 1 +; RV64-NEXT: slli a1, a1, 16 +; RV64-NEXT: vsetivli zero, 4, e64, m2, ta, ma +; RV64-NEXT: vmv.v.x v16, a1 +; RV64-NEXT: vsetivli zero, 16, e64, m8, ta, ma +; RV64-NEXT: vrgatherei16.vv v24, v8, v16 +; RV64-NEXT: vse64.v v24, (a0) ; RV64-NEXT: ret %a = load <4 x i256>, ptr %p - %res = shufflevector <4 x i256> %a, <4 x i256> poison, <4 x i32> + %res = shufflevector <4 x i256> %a, <4 x i256> poison, <4 x i32> store <4 x i256> %res, ptr %p ret void } + diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-select-addsub.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-select-addsub.ll new file mode 100644 index 
0000000000000..ee9609992c049 --- /dev/null +++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-select-addsub.ll @@ -0,0 +1,351 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; RUN: llc -mtriple=riscv32 -mattr=+v -verify-machineinstrs < %s \ +; RUN: | FileCheck %s +; RUN: llc -mtriple=riscv64 -mattr=+v -verify-machineinstrs < %s \ +; RUN: | FileCheck %s + + +define <1 x i32> @select_addsub_v1i32(<1 x i1> %cc, <1 x i32> %a, <1 x i32> %b) { +; CHECK-LABEL: select_addsub_v1i32: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetivli zero, 1, e32, mf2, ta, mu +; CHECK-NEXT: vadd.vv v10, v8, v9 +; CHECK-NEXT: vsub.vv v10, v8, v9, v0.t +; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: ret + %sub = sub <1 x i32> %a, %b + %add = add <1 x i32> %a, %b + %res = select <1 x i1> %cc, <1 x i32> %sub, <1 x i32> %add + ret <1 x i32> %res +} + +define <2 x i32> @select_addsub_v2i32(<2 x i1> %cc, <2 x i32> %a, <2 x i32> %b) { +; CHECK-LABEL: select_addsub_v2i32: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetivli zero, 2, e32, mf2, ta, mu +; CHECK-NEXT: vadd.vv v10, v8, v9 +; CHECK-NEXT: vsub.vv v10, v8, v9, v0.t +; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: ret + %sub = sub <2 x i32> %a, %b + %add = add <2 x i32> %a, %b + %res = select <2 x i1> %cc, <2 x i32> %sub, <2 x i32> %add + ret <2 x i32> %res +} + +define <4 x i32> @select_addsub_v4i32(<4 x i1> %cc, <4 x i32> %a, <4 x i32> %b) { +; CHECK-LABEL: select_addsub_v4i32: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetivli zero, 4, e32, m1, ta, mu +; CHECK-NEXT: vadd.vv v10, v8, v9 +; CHECK-NEXT: vsub.vv v10, v8, v9, v0.t +; CHECK-NEXT: vmv.v.v v8, v10 +; CHECK-NEXT: ret + %sub = sub <4 x i32> %a, %b + %add = add <4 x i32> %a, %b + %res = select <4 x i1> %cc, <4 x i32> %sub, <4 x i32> %add + ret <4 x i32> %res +} + +define <4 x i32> @select_addsub_v4i32_select_swapped(<4 x i1> %cc, <4 x i32> %a, <4 x i32> %b) { +; CHECK-LABEL: select_addsub_v4i32_select_swapped: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetivli zero, 4, e32, m1, ta, mu +; 
CHECK-NEXT: vsub.vv v10, v8, v9 +; CHECK-NEXT: vadd.vv v10, v8, v9, v0.t +; CHECK-NEXT: vmv.v.v v8, v10 +; CHECK-NEXT: ret + %sub = sub <4 x i32> %a, %b + %add = add <4 x i32> %a, %b + %res = select <4 x i1> %cc, <4 x i32> %add, <4 x i32> %sub + ret <4 x i32> %res +} + +define <4 x i32> @select_addsub_v4i32_add_swapped(<4 x i1> %cc, <4 x i32> %a, <4 x i32> %b) { +; CHECK-LABEL: select_addsub_v4i32_add_swapped: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetivli zero, 4, e32, m1, ta, mu +; CHECK-NEXT: vadd.vv v10, v9, v8 +; CHECK-NEXT: vsub.vv v10, v8, v9, v0.t +; CHECK-NEXT: vmv.v.v v8, v10 +; CHECK-NEXT: ret + %sub = sub <4 x i32> %a, %b + %add = add <4 x i32> %b, %a + %res = select <4 x i1> %cc, <4 x i32> %sub, <4 x i32> %add + ret <4 x i32> %res +} + +define <4 x i32> @select_addsub_v4i32_both_swapped(<4 x i1> %cc, <4 x i32> %a, <4 x i32> %b) { +; CHECK-LABEL: select_addsub_v4i32_both_swapped: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetivli zero, 4, e32, m1, ta, mu +; CHECK-NEXT: vsub.vv v10, v8, v9 +; CHECK-NEXT: vadd.vv v10, v9, v8, v0.t +; CHECK-NEXT: vmv.v.v v8, v10 +; CHECK-NEXT: ret + %sub = sub <4 x i32> %a, %b + %add = add <4 x i32> %b, %a + %res = select <4 x i1> %cc, <4 x i32> %add, <4 x i32> %sub + ret <4 x i32> %res +} + +define <4 x i32> @select_addsub_v4i32_sub_swapped(<4 x i1> %cc, <4 x i32> %a, <4 x i32> %b) { +; CHECK-LABEL: select_addsub_v4i32_sub_swapped: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetivli zero, 4, e32, m1, ta, mu +; CHECK-NEXT: vadd.vv v10, v9, v8 +; CHECK-NEXT: vsub.vv v10, v8, v9, v0.t +; CHECK-NEXT: vmv.v.v v8, v10 +; CHECK-NEXT: ret + %sub = sub <4 x i32> %a, %b + %add = add <4 x i32> %b, %a + %res = select <4 x i1> %cc, <4 x i32> %sub, <4 x i32> %add + ret <4 x i32> %res +} + +define <8 x i32> @select_addsub_v8i32(<8 x i1> %cc, <8 x i32> %a, <8 x i32> %b) { +; CHECK-LABEL: select_addsub_v8i32: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetivli zero, 8, e32, m2, ta, mu +; CHECK-NEXT: vadd.vv v12, v8, v10 +; CHECK-NEXT: vsub.vv v12, v8, v10, v0.t +; 
CHECK-NEXT: vmv.v.v v8, v12 +; CHECK-NEXT: ret + %sub = sub <8 x i32> %a, %b + %add = add <8 x i32> %a, %b + %res = select <8 x i1> %cc, <8 x i32> %sub, <8 x i32> %add + ret <8 x i32> %res +} + +define <16 x i32> @select_addsub_v16i32(<16 x i1> %cc, <16 x i32> %a, <16 x i32> %b) { +; CHECK-LABEL: select_addsub_v16i32: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetivli zero, 16, e32, m4, ta, mu +; CHECK-NEXT: vadd.vv v16, v8, v12 +; CHECK-NEXT: vsub.vv v16, v8, v12, v0.t +; CHECK-NEXT: vmv.v.v v8, v16 +; CHECK-NEXT: ret + %sub = sub <16 x i32> %a, %b + %add = add <16 x i32> %a, %b + %res = select <16 x i1> %cc, <16 x i32> %sub, <16 x i32> %add + ret <16 x i32> %res +} + +define <32 x i32> @select_addsub_v32i32(<32 x i1> %cc, <32 x i32> %a, <32 x i32> %b) { +; CHECK-LABEL: select_addsub_v32i32: +; CHECK: # %bb.0: +; CHECK-NEXT: li a0, 32 +; CHECK-NEXT: vsetvli zero, a0, e32, m8, ta, mu +; CHECK-NEXT: vadd.vv v24, v8, v16 +; CHECK-NEXT: vsub.vv v24, v8, v16, v0.t +; CHECK-NEXT: vmv.v.v v8, v24 +; CHECK-NEXT: ret + %sub = sub <32 x i32> %a, %b + %add = add <32 x i32> %a, %b + %res = select <32 x i1> %cc, <32 x i32> %sub, <32 x i32> %add + ret <32 x i32> %res +} + +define <64 x i32> @select_addsub_v64i32(<64 x i1> %cc, <64 x i32> %a, <64 x i32> %b) { +; CHECK-LABEL: select_addsub_v64i32: +; CHECK: # %bb.0: +; CHECK-NEXT: addi sp, sp, -16 +; CHECK-NEXT: .cfi_def_cfa_offset 16 +; CHECK-NEXT: csrr a1, vlenb +; CHECK-NEXT: slli a1, a1, 3 +; CHECK-NEXT: mv a2, a1 +; CHECK-NEXT: slli a1, a1, 1 +; CHECK-NEXT: add a1, a1, a2 +; CHECK-NEXT: sub sp, sp, a1 +; CHECK-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x18, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 24 * vlenb +; CHECK-NEXT: csrr a1, vlenb +; CHECK-NEXT: slli a1, a1, 4 +; CHECK-NEXT: add a1, sp, a1 +; CHECK-NEXT: addi a1, a1, 16 +; CHECK-NEXT: vs8r.v v16, (a1) # Unknown-size Folded Spill +; CHECK-NEXT: li a1, 32 +; CHECK-NEXT: vsetvli zero, a1, e32, m8, ta, mu +; CHECK-NEXT: vle32.v v16, (a0) +; 
CHECK-NEXT: addi a0, a0, 128 +; CHECK-NEXT: vle32.v v24, (a0) +; CHECK-NEXT: csrr a0, vlenb +; CHECK-NEXT: slli a0, a0, 3 +; CHECK-NEXT: add a0, sp, a0 +; CHECK-NEXT: addi a0, a0, 16 +; CHECK-NEXT: vs8r.v v24, (a0) # Unknown-size Folded Spill +; CHECK-NEXT: vadd.vv v24, v8, v16 +; CHECK-NEXT: vsub.vv v24, v8, v16, v0.t +; CHECK-NEXT: addi a0, sp, 16 +; CHECK-NEXT: vs8r.v v24, (a0) # Unknown-size Folded Spill +; CHECK-NEXT: vsetivli zero, 4, e8, mf2, ta, ma +; CHECK-NEXT: vslidedown.vi v0, v0, 4 +; CHECK-NEXT: csrr a0, vlenb +; CHECK-NEXT: slli a0, a0, 3 +; CHECK-NEXT: add a0, sp, a0 +; CHECK-NEXT: addi a0, a0, 16 +; CHECK-NEXT: vl8r.v v8, (a0) # Unknown-size Folded Reload +; CHECK-NEXT: csrr a0, vlenb +; CHECK-NEXT: slli a0, a0, 4 +; CHECK-NEXT: add a0, sp, a0 +; CHECK-NEXT: addi a0, a0, 16 +; CHECK-NEXT: vl8r.v v16, (a0) # Unknown-size Folded Reload +; CHECK-NEXT: vsetvli zero, a1, e32, m8, ta, mu +; CHECK-NEXT: vadd.vv v16, v16, v8 +; CHECK-NEXT: csrr a0, vlenb +; CHECK-NEXT: slli a0, a0, 3 +; CHECK-NEXT: add a0, sp, a0 +; CHECK-NEXT: addi a0, a0, 16 +; CHECK-NEXT: vl8r.v v8, (a0) # Unknown-size Folded Reload +; CHECK-NEXT: csrr a0, vlenb +; CHECK-NEXT: slli a0, a0, 4 +; CHECK-NEXT: add a0, sp, a0 +; CHECK-NEXT: addi a0, a0, 16 +; CHECK-NEXT: vl8r.v v24, (a0) # Unknown-size Folded Reload +; CHECK-NEXT: vsub.vv v16, v24, v8, v0.t +; CHECK-NEXT: addi a0, sp, 16 +; CHECK-NEXT: vl8r.v v8, (a0) # Unknown-size Folded Reload +; CHECK-NEXT: csrr a0, vlenb +; CHECK-NEXT: slli a0, a0, 3 +; CHECK-NEXT: mv a1, a0 +; CHECK-NEXT: slli a0, a0, 1 +; CHECK-NEXT: add a0, a0, a1 +; CHECK-NEXT: add sp, sp, a0 +; CHECK-NEXT: .cfi_def_cfa sp, 16 +; CHECK-NEXT: addi sp, sp, 16 +; CHECK-NEXT: .cfi_def_cfa_offset 0 +; CHECK-NEXT: ret + %sub = sub <64 x i32> %a, %b + %add = add <64 x i32> %a, %b + %res = select <64 x i1> %cc, <64 x i32> %sub, <64 x i32> %add + ret <64 x i32> %res +} + +define <8 x i64> @select_addsub_v8i64(<8 x i1> %cc, <8 x i64> %a, <8 x i64> %b) { +; CHECK-LABEL: 
select_addsub_v8i64: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetivli zero, 8, e64, m4, ta, mu +; CHECK-NEXT: vadd.vv v16, v8, v12 +; CHECK-NEXT: vsub.vv v16, v8, v12, v0.t +; CHECK-NEXT: vmv.v.v v8, v16 +; CHECK-NEXT: ret + %sub = sub <8 x i64> %a, %b + %add = add <8 x i64> %a, %b + %res = select <8 x i1> %cc, <8 x i64> %sub, <8 x i64> %add + ret <8 x i64> %res +} + +define <8 x i16> @select_addsub_v8i16(<8 x i1> %cc, <8 x i16> %a, <8 x i16> %b) { +; CHECK-LABEL: select_addsub_v8i16: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetivli zero, 8, e16, m1, ta, mu +; CHECK-NEXT: vadd.vv v10, v8, v9 +; CHECK-NEXT: vsub.vv v10, v8, v9, v0.t +; CHECK-NEXT: vmv.v.v v8, v10 +; CHECK-NEXT: ret + %sub = sub <8 x i16> %a, %b + %add = add <8 x i16> %a, %b + %res = select <8 x i1> %cc, <8 x i16> %sub, <8 x i16> %add + ret <8 x i16> %res +} + +define <8 x i8> @select_addsub_v8i8(<8 x i1> %cc, <8 x i8> %a, <8 x i8> %b) { +; CHECK-LABEL: select_addsub_v8i8: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetivli zero, 8, e8, mf2, ta, mu +; CHECK-NEXT: vadd.vv v10, v8, v9 +; CHECK-NEXT: vsub.vv v10, v8, v9, v0.t +; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: ret + %sub = sub <8 x i8> %a, %b + %add = add <8 x i8> %a, %b + %res = select <8 x i1> %cc, <8 x i8> %sub, <8 x i8> %add + ret <8 x i8> %res +} + +define <8 x i1> @select_addsub_v8i1(<8 x i1> %cc, <8 x i1> %a, <8 x i1> %b) { +; CHECK-LABEL: select_addsub_v8i1: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetivli zero, 8, e8, mf2, ta, ma +; CHECK-NEXT: vmxor.mm v0, v8, v9 +; CHECK-NEXT: ret + %sub = sub <8 x i1> %a, %b + %add = add <8 x i1> %a, %b + %res = select <8 x i1> %cc, <8 x i1> %sub, <8 x i1> %add + ret <8 x i1> %res +} + +define <8 x i2> @select_addsub_v8i2(<8 x i1> %cc, <8 x i2> %a, <8 x i2> %b) { +; CHECK-LABEL: select_addsub_v8i2: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetivli zero, 8, e8, mf2, ta, mu +; CHECK-NEXT: vadd.vv v10, v8, v9 +; CHECK-NEXT: vsub.vv v10, v8, v9, v0.t +; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: ret + %sub = sub <8 x i2> %a, %b + %add 
= add <8 x i2> %a, %b + %res = select <8 x i1> %cc, <8 x i2> %sub, <8 x i2> %add + ret <8 x i2> %res +} + +define <4 x i32> @select_addsub_v4i32_constmask(<4 x i32> %a, <4 x i32> %b) { +; CHECK-LABEL: select_addsub_v4i32_constmask: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetivli zero, 4, e32, m1, ta, mu +; CHECK-NEXT: vmv.v.i v0, 5 +; CHECK-NEXT: vadd.vv v10, v8, v9 +; CHECK-NEXT: vsub.vv v10, v8, v9, v0.t +; CHECK-NEXT: vmv.v.v v8, v10 +; CHECK-NEXT: ret + %sub = sub <4 x i32> %a, %b + %add = add <4 x i32> %a, %b + %res = select <4 x i1> , <4 x i32> %sub, <4 x i32> %add + ret <4 x i32> %res +} + +define <4 x i32> @select_addsub_v4i32_constmask2(<4 x i32> %a, <4 x i32> %b) { +; CHECK-LABEL: select_addsub_v4i32_constmask2: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetivli zero, 4, e32, m1, ta, mu +; CHECK-NEXT: vmv.v.i v0, 5 +; CHECK-NEXT: vadd.vv v10, v9, v8 +; CHECK-NEXT: vsub.vv v10, v8, v9, v0.t +; CHECK-NEXT: vmv.v.v v8, v10 +; CHECK-NEXT: ret + %sub = sub <4 x i32> %a, %b + %add = add <4 x i32> %b, %a + %res = select <4 x i1> , <4 x i32> %sub, <4 x i32> %add + ret <4 x i32> %res +} + +; Same pattern as above, but the select is disguised as a shuffle +define <4 x i32> @select_addsub_v4i32_as_shuffle(<4 x i32> %a, <4 x i32> %b) { +; CHECK-LABEL: select_addsub_v4i32_as_shuffle: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetivli zero, 4, e32, m1, ta, mu +; CHECK-NEXT: vmv.v.i v0, 5 +; CHECK-NEXT: vadd.vv v10, v8, v9 +; CHECK-NEXT: vsub.vv v10, v8, v9, v0.t +; CHECK-NEXT: vmv.v.v v8, v10 +; CHECK-NEXT: ret + %sub = sub <4 x i32> %a, %b + %add = add <4 x i32> %a, %b + %res = shufflevector <4 x i32> %sub, <4 x i32> %add, <4 x i32> + ret <4 x i32> %res +} + +; Same pattern as above, but the select is disguised as a shuffle +define <4 x i32> @select_addsub_v4i32_as_shuffle2(<4 x i32> %a, <4 x i32> %b) { +; CHECK-LABEL: select_addsub_v4i32_as_shuffle2: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetivli zero, 4, e32, m1, ta, mu +; CHECK-NEXT: vmv.v.i v0, 5 +; CHECK-NEXT: vadd.vv v10, v8, v9 +; 
CHECK-NEXT: vsub.vv v10, v9, v8, v0.t +; CHECK-NEXT: vmv.v.v v8, v10 +; CHECK-NEXT: ret + %sub = sub <4 x i32> %b, %a + %add = add <4 x i32> %a, %b + %res = shufflevector <4 x i32> %sub, <4 x i32> %add, <4 x i32> + ret <4 x i32> %res +} diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-shuffle-exact-vlen.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-shuffle-exact-vlen.ll index 54d0acc3ba8b5..afd560fd74d16 100644 --- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-shuffle-exact-vlen.ll +++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-shuffle-exact-vlen.ll @@ -396,49 +396,16 @@ entry: } define void @shuffle_i128_ldst(ptr %p) vscale_range(2,2) { -; RV32-LABEL: shuffle_i128_ldst: -; RV32: # %bb.0: -; RV32-NEXT: lw a1, 48(a0) -; RV32-NEXT: lw a2, 52(a0) -; RV32-NEXT: lw a3, 56(a0) -; RV32-NEXT: lw a4, 60(a0) -; RV32-NEXT: lw a5, 0(a0) -; RV32-NEXT: lw a6, 4(a0) -; RV32-NEXT: lw a7, 8(a0) -; RV32-NEXT: lw t0, 12(a0) -; RV32-NEXT: lw t1, 32(a0) -; RV32-NEXT: lw t2, 36(a0) -; RV32-NEXT: lw t3, 40(a0) -; RV32-NEXT: lw t4, 44(a0) -; RV32-NEXT: sw t1, 48(a0) -; RV32-NEXT: sw t2, 52(a0) -; RV32-NEXT: sw t3, 56(a0) -; RV32-NEXT: sw t4, 60(a0) -; RV32-NEXT: sw a5, 16(a0) -; RV32-NEXT: sw a6, 20(a0) -; RV32-NEXT: sw a7, 24(a0) -; RV32-NEXT: sw t0, 28(a0) -; RV32-NEXT: sw a1, 32(a0) -; RV32-NEXT: sw a2, 36(a0) -; RV32-NEXT: sw a3, 40(a0) -; RV32-NEXT: sw a4, 44(a0) -; RV32-NEXT: ret -; -; RV64-LABEL: shuffle_i128_ldst: -; RV64: # %bb.0: -; RV64-NEXT: ld a1, 0(a0) -; RV64-NEXT: ld a2, 8(a0) -; RV64-NEXT: ld a3, 32(a0) -; RV64-NEXT: ld a4, 40(a0) -; RV64-NEXT: ld a5, 48(a0) -; RV64-NEXT: ld a6, 56(a0) -; RV64-NEXT: sd a3, 48(a0) -; RV64-NEXT: sd a4, 56(a0) -; RV64-NEXT: sd a1, 16(a0) -; RV64-NEXT: sd a2, 24(a0) -; RV64-NEXT: sd a5, 32(a0) -; RV64-NEXT: sd a6, 40(a0) -; RV64-NEXT: ret +; CHECK-LABEL: shuffle_i128_ldst: +; CHECK: # %bb.0: +; CHECK-NEXT: vl4re64.v v8, (a0) +; CHECK-NEXT: vsetivli zero, 1, e8, m1, ta, ma +; CHECK-NEXT: vmv1r.v v9, v8 +; CHECK-NEXT: vmv4r.v v12, v8 +; 
CHECK-NEXT: vmv1r.v v14, v11 +; CHECK-NEXT: vmv1r.v v15, v10 +; CHECK-NEXT: vs4r.v v12, (a0) +; CHECK-NEXT: ret %a = load <4 x i128>, ptr %p %res = shufflevector <4 x i128> %a, <4 x i128> poison, <4 x i32> store <4 x i128> %res, ptr %p @@ -446,129 +413,19 @@ define void @shuffle_i128_ldst(ptr %p) vscale_range(2,2) { } define void @shuffle_i256_ldst(ptr %p) vscale_range(2,2) { -; RV32-LABEL: shuffle_i256_ldst: -; RV32: # %bb.0: -; RV32-NEXT: addi sp, sp, -48 -; RV32-NEXT: .cfi_def_cfa_offset 48 -; RV32-NEXT: sw s0, 44(sp) # 4-byte Folded Spill -; RV32-NEXT: sw s1, 40(sp) # 4-byte Folded Spill -; RV32-NEXT: sw s2, 36(sp) # 4-byte Folded Spill -; RV32-NEXT: sw s3, 32(sp) # 4-byte Folded Spill -; RV32-NEXT: sw s4, 28(sp) # 4-byte Folded Spill -; RV32-NEXT: sw s5, 24(sp) # 4-byte Folded Spill -; RV32-NEXT: sw s6, 20(sp) # 4-byte Folded Spill -; RV32-NEXT: sw s7, 16(sp) # 4-byte Folded Spill -; RV32-NEXT: sw s8, 12(sp) # 4-byte Folded Spill -; RV32-NEXT: sw s9, 8(sp) # 4-byte Folded Spill -; RV32-NEXT: .cfi_offset s0, -4 -; RV32-NEXT: .cfi_offset s1, -8 -; RV32-NEXT: .cfi_offset s2, -12 -; RV32-NEXT: .cfi_offset s3, -16 -; RV32-NEXT: .cfi_offset s4, -20 -; RV32-NEXT: .cfi_offset s5, -24 -; RV32-NEXT: .cfi_offset s6, -28 -; RV32-NEXT: .cfi_offset s7, -32 -; RV32-NEXT: .cfi_offset s8, -36 -; RV32-NEXT: .cfi_offset s9, -40 -; RV32-NEXT: lw a1, 0(a0) -; RV32-NEXT: lw a2, 4(a0) -; RV32-NEXT: lw a3, 8(a0) -; RV32-NEXT: lw a4, 12(a0) -; RV32-NEXT: lw a5, 16(a0) -; RV32-NEXT: lw a6, 20(a0) -; RV32-NEXT: lw a7, 24(a0) -; RV32-NEXT: lw t0, 28(a0) -; RV32-NEXT: lw t1, 96(a0) -; RV32-NEXT: lw t2, 100(a0) -; RV32-NEXT: lw t3, 104(a0) -; RV32-NEXT: lw t4, 108(a0) -; RV32-NEXT: lw t5, 112(a0) -; RV32-NEXT: lw t6, 116(a0) -; RV32-NEXT: lw s0, 120(a0) -; RV32-NEXT: lw s1, 124(a0) -; RV32-NEXT: lw s2, 64(a0) -; RV32-NEXT: lw s3, 68(a0) -; RV32-NEXT: lw s4, 72(a0) -; RV32-NEXT: lw s5, 76(a0) -; RV32-NEXT: lw s6, 80(a0) -; RV32-NEXT: lw s7, 84(a0) -; RV32-NEXT: lw s8, 88(a0) -; RV32-NEXT: 
lw s9, 92(a0) -; RV32-NEXT: sw s6, 112(a0) -; RV32-NEXT: sw s7, 116(a0) -; RV32-NEXT: sw s8, 120(a0) -; RV32-NEXT: sw s9, 124(a0) -; RV32-NEXT: sw s2, 96(a0) -; RV32-NEXT: sw s3, 100(a0) -; RV32-NEXT: sw s4, 104(a0) -; RV32-NEXT: sw s5, 108(a0) -; RV32-NEXT: sw t5, 80(a0) -; RV32-NEXT: sw t6, 84(a0) -; RV32-NEXT: sw s0, 88(a0) -; RV32-NEXT: sw s1, 92(a0) -; RV32-NEXT: sw t1, 64(a0) -; RV32-NEXT: sw t2, 68(a0) -; RV32-NEXT: sw t3, 72(a0) -; RV32-NEXT: sw t4, 76(a0) -; RV32-NEXT: sw a5, 48(a0) -; RV32-NEXT: sw a6, 52(a0) -; RV32-NEXT: sw a7, 56(a0) -; RV32-NEXT: sw t0, 60(a0) -; RV32-NEXT: sw a1, 32(a0) -; RV32-NEXT: sw a2, 36(a0) -; RV32-NEXT: sw a3, 40(a0) -; RV32-NEXT: sw a4, 44(a0) -; RV32-NEXT: lw s0, 44(sp) # 4-byte Folded Reload -; RV32-NEXT: lw s1, 40(sp) # 4-byte Folded Reload -; RV32-NEXT: lw s2, 36(sp) # 4-byte Folded Reload -; RV32-NEXT: lw s3, 32(sp) # 4-byte Folded Reload -; RV32-NEXT: lw s4, 28(sp) # 4-byte Folded Reload -; RV32-NEXT: lw s5, 24(sp) # 4-byte Folded Reload -; RV32-NEXT: lw s6, 20(sp) # 4-byte Folded Reload -; RV32-NEXT: lw s7, 16(sp) # 4-byte Folded Reload -; RV32-NEXT: lw s8, 12(sp) # 4-byte Folded Reload -; RV32-NEXT: lw s9, 8(sp) # 4-byte Folded Reload -; RV32-NEXT: .cfi_restore s0 -; RV32-NEXT: .cfi_restore s1 -; RV32-NEXT: .cfi_restore s2 -; RV32-NEXT: .cfi_restore s3 -; RV32-NEXT: .cfi_restore s4 -; RV32-NEXT: .cfi_restore s5 -; RV32-NEXT: .cfi_restore s6 -; RV32-NEXT: .cfi_restore s7 -; RV32-NEXT: .cfi_restore s8 -; RV32-NEXT: .cfi_restore s9 -; RV32-NEXT: addi sp, sp, 48 -; RV32-NEXT: .cfi_def_cfa_offset 0 -; RV32-NEXT: ret -; -; RV64-LABEL: shuffle_i256_ldst: -; RV64: # %bb.0: -; RV64-NEXT: ld a1, 96(a0) -; RV64-NEXT: ld a2, 104(a0) -; RV64-NEXT: ld a3, 112(a0) -; RV64-NEXT: ld a4, 120(a0) -; RV64-NEXT: ld a5, 0(a0) -; RV64-NEXT: ld a6, 8(a0) -; RV64-NEXT: ld a7, 16(a0) -; RV64-NEXT: ld t0, 24(a0) -; RV64-NEXT: ld t1, 64(a0) -; RV64-NEXT: ld t2, 72(a0) -; RV64-NEXT: ld t3, 80(a0) -; RV64-NEXT: ld t4, 88(a0) -; RV64-NEXT: sd t1, 
96(a0) -; RV64-NEXT: sd t2, 104(a0) -; RV64-NEXT: sd t3, 112(a0) -; RV64-NEXT: sd t4, 120(a0) -; RV64-NEXT: sd a5, 32(a0) -; RV64-NEXT: sd a6, 40(a0) -; RV64-NEXT: sd a7, 48(a0) -; RV64-NEXT: sd t0, 56(a0) -; RV64-NEXT: sd a1, 64(a0) -; RV64-NEXT: sd a2, 72(a0) -; RV64-NEXT: sd a3, 80(a0) -; RV64-NEXT: sd a4, 88(a0) -; RV64-NEXT: ret +; CHECK-LABEL: shuffle_i256_ldst: +; CHECK: # %bb.0: +; CHECK-NEXT: vl8re64.v v8, (a0) +; CHECK-NEXT: vsetivli zero, 1, e8, m1, ta, ma +; CHECK-NEXT: vmv1r.v v10, v8 +; CHECK-NEXT: vmv1r.v v11, v9 +; CHECK-NEXT: vmv8r.v v16, v8 +; CHECK-NEXT: vmv1r.v v20, v14 +; CHECK-NEXT: vmv1r.v v21, v15 +; CHECK-NEXT: vmv1r.v v22, v12 +; CHECK-NEXT: vmv1r.v v23, v13 +; CHECK-NEXT: vs8r.v v16, (a0) +; CHECK-NEXT: ret %a = load <4 x i256>, ptr %p %res = shufflevector <4 x i256> %a, <4 x i256> poison, <4 x i32> store <4 x i256> %res, ptr %p diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-strided-load-store.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-strided-load-store.ll index 83a9b23a387d2..84de566e05dff 100644 --- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-strided-load-store.ll +++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-strided-load-store.ll @@ -320,8 +320,8 @@ for.cond.cleanup: ; preds = %vector.body define void @gather_unknown_pow2(ptr noalias nocapture %A, ptr noalias nocapture readonly %B, i64 %shift) { ; CHECK-LABEL: @gather_unknown_pow2( ; CHECK-NEXT: entry: -; CHECK-NEXT: [[STEP:%.*]] = shl i64 8, [[SHIFT:%.*]] -; CHECK-NEXT: [[STRIDE:%.*]] = shl i64 1, [[SHIFT]] +; CHECK-NEXT: [[STRIDE:%.*]] = shl i64 1, [[SHIFT:%.*]] +; CHECK-NEXT: [[STEP:%.*]] = shl i64 8, [[SHIFT]] ; CHECK-NEXT: [[TMP0:%.*]] = mul i64 [[STRIDE]], 4 ; CHECK-NEXT: br label [[VECTOR_BODY:%.*]] ; CHECK: vector.body: diff --git a/llvm/test/CodeGen/RISCV/rvv/splat-vectors.ll b/llvm/test/CodeGen/RISCV/rvv/splat-vectors.ll index 2e6df11840179..a556c3125c85d 100644 --- a/llvm/test/CodeGen/RISCV/rvv/splat-vectors.ll +++ 
b/llvm/test/CodeGen/RISCV/rvv/splat-vectors.ll @@ -105,3 +105,147 @@ define @splat_idx_nxv4f32( %v, i64 %idx %splat = shufflevector %ins, poison, zeroinitializer ret %splat } + +define @splat_idx_nxv4f32_nxv8f32( %v, i64 %idx) { +; CHECK-LABEL: splat_idx_nxv4f32_nxv8f32: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli a1, zero, e32, m4, ta, ma +; CHECK-NEXT: vrgather.vx v12, v8, a0 +; CHECK-NEXT: vmv.v.v v8, v12 +; CHECK-NEXT: ret + %x = extractelement %v, i64 %idx + %ins = insertelement poison, float %x, i32 0 + %splat = shufflevector %ins, poison, zeroinitializer + ret %splat +} + +define @splat_idx_v4f32_nxv4f32(<4 x float> %v, i64 %idx) { +; CHECK-LABEL: splat_idx_v4f32_nxv4f32: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli a1, zero, e32, m2, ta, ma +; CHECK-NEXT: vrgather.vx v10, v8, a0 +; CHECK-NEXT: vmv.v.v v8, v10 +; CHECK-NEXT: ret + %x = extractelement <4 x float> %v, i64 %idx + %ins = insertelement poison, float %x, i32 0 + %splat = shufflevector %ins, poison, zeroinitializer + ret %splat +} + +; Negative test, scale could have a value > 2 +define <8 x float> @splat_idx_nxv4f32_v8f32( %v, i64 %idx) { +; CHECK-LABEL: splat_idx_nxv4f32_v8f32: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetivli zero, 1, e32, m2, ta, ma +; CHECK-NEXT: vslidedown.vx v8, v8, a0 +; CHECK-NEXT: vfmv.f.s fa5, v8 +; CHECK-NEXT: vsetivli zero, 8, e32, m2, ta, ma +; CHECK-NEXT: vfmv.v.f v8, fa5 +; CHECK-NEXT: ret + %x = extractelement %v, i64 %idx + %ins = insertelement <8 x float> poison, float %x, i32 0 + %splat = shufflevector <8 x float> %ins, <8 x float> poison, <8 x i32> zeroinitializer + ret <8 x float> %splat +} + +define @splat_idx_nxv8f32_nxv4f32_constant_0( %v) { +; CHECK-LABEL: splat_idx_nxv8f32_nxv4f32_constant_0: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli a0, zero, e32, m2, ta, ma +; CHECK-NEXT: vrgather.vi v10, v8, 0 +; CHECK-NEXT: vmv.v.v v8, v10 +; CHECK-NEXT: ret + %x = extractelement %v, i64 0 + %ins = insertelement poison, float %x, i32 0 + %splat = shufflevector %ins, poison, 
zeroinitializer + ret %splat +} + +define @splat_idx_nxv8i8_nxv4i8_constant_0( %v) { +; CHECK-LABEL: splat_idx_nxv8i8_nxv4i8_constant_0: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli a0, zero, e8, mf2, ta, ma +; CHECK-NEXT: vmv.x.s a0, v8 +; CHECK-NEXT: vmv.v.x v8, a0 +; CHECK-NEXT: ret + %x = extractelement %v, i64 0 + %ins = insertelement poison, i8 %x, i32 0 + %splat = shufflevector %ins, poison, zeroinitializer + ret %splat +} + +define @splat_idx_nxv8i8_nxv4i8_constant_3( %v) { +; CHECK-LABEL: splat_idx_nxv8i8_nxv4i8_constant_3: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetivli zero, 1, e8, m1, ta, ma +; CHECK-NEXT: vslidedown.vi v8, v8, 3 +; CHECK-NEXT: vmv.x.s a0, v8 +; CHECK-NEXT: vsetvli a1, zero, e8, mf2, ta, ma +; CHECK-NEXT: vmv.v.x v8, a0 +; CHECK-NEXT: ret + %x = extractelement %v, i64 3 + %ins = insertelement poison, i8 %x, i32 0 + %splat = shufflevector %ins, poison, zeroinitializer + ret %splat +} + + +; Negative test, vscale coule be 2 +define @splat_idx_nxv8i8_nxv4i8_constant_15( %v) { +; CHECK-LABEL: splat_idx_nxv8i8_nxv4i8_constant_15: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetivli zero, 1, e8, m1, ta, ma +; CHECK-NEXT: vslidedown.vi v8, v8, 15 +; CHECK-NEXT: vmv.x.s a0, v8 +; CHECK-NEXT: vsetvli a1, zero, e8, mf2, ta, ma +; CHECK-NEXT: vmv.v.x v8, a0 +; CHECK-NEXT: ret + %x = extractelement %v, i64 15 + %ins = insertelement poison, i8 %x, i32 0 + %splat = shufflevector %ins, poison, zeroinitializer + ret %splat +} + +define <8 x float> @splat_idx_nxv4f32_v8f32_constant_0( %v) { +; CHECK-LABEL: splat_idx_nxv4f32_v8f32_constant_0: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetivli zero, 8, e32, m2, ta, ma +; CHECK-NEXT: vfmv.f.s fa5, v8 +; CHECK-NEXT: vfmv.v.f v8, fa5 +; CHECK-NEXT: ret + %x = extractelement %v, i64 0 + %ins = insertelement <8 x float> poison, float %x, i32 0 + %splat = shufflevector <8 x float> %ins, <8 x float> poison, <8 x i32> zeroinitializer + ret <8 x float> %splat +} + +define <8 x float> @splat_idx_nxv4f32_v8f32_constant_7( %v) { +; 
CHECK-LABEL: splat_idx_nxv4f32_v8f32_constant_7: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetivli zero, 1, e32, m2, ta, ma +; CHECK-NEXT: vslidedown.vi v8, v8, 7 +; CHECK-NEXT: vfmv.f.s fa5, v8 +; CHECK-NEXT: vsetivli zero, 8, e32, m2, ta, ma +; CHECK-NEXT: vfmv.v.f v8, fa5 +; CHECK-NEXT: ret + %x = extractelement %v, i64 7 + %ins = insertelement <8 x float> poison, float %x, i32 0 + %splat = shufflevector <8 x float> %ins, <8 x float> poison, <8 x i32> zeroinitializer + ret <8 x float> %splat +} + +; Negative test, vscale might be 4 +define <8 x float> @splat_idx_nxv4f32_v8f32_constant_8( %v) { +; CHECK-LABEL: splat_idx_nxv4f32_v8f32_constant_8: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetivli zero, 1, e32, m2, ta, ma +; CHECK-NEXT: vslidedown.vi v8, v8, 8 +; CHECK-NEXT: vfmv.f.s fa5, v8 +; CHECK-NEXT: vsetivli zero, 8, e32, m2, ta, ma +; CHECK-NEXT: vfmv.v.f v8, fa5 +; CHECK-NEXT: ret + %x = extractelement %v, i64 8 + %ins = insertelement <8 x float> poison, float %x, i32 0 + %splat = shufflevector <8 x float> %ins, <8 x float> poison, <8 x i32> zeroinitializer + ret <8 x float> %splat +} diff --git a/llvm/test/CodeGen/RISCV/rvv/stack-probing-dynamic.ll b/llvm/test/CodeGen/RISCV/rvv/stack-probing-dynamic.ll new file mode 100644 index 0000000000000..c3c1643e6de01 --- /dev/null +++ b/llvm/test/CodeGen/RISCV/rvv/stack-probing-dynamic.ll @@ -0,0 +1,550 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; RUN: llc -mtriple=riscv64 -mattr=+m,+v -O2 < %s \ +; RUN: | FileCheck %s -check-prefix=RV64I +; RUN: llc -mtriple=riscv32 -mattr=+m,+v -O2 < %s \ +; RUN: | FileCheck %s -check-prefix=RV32I + +; Tests copied from AArch64. + +; Dynamically-sized allocation, needs a loop which can handle any size at +; runtime. The final iteration of the loop will temporarily put SP below the +; target address, but this doesn't break any of the ABI constraints on the +; stack, and also doesn't probe below the target SP value. 
+define void @dynamic(i64 %size, ptr %out) #0 { +; RV64I-LABEL: dynamic: +; RV64I: # %bb.0: +; RV64I-NEXT: addi sp, sp, -16 +; RV64I-NEXT: .cfi_def_cfa_offset 16 +; RV64I-NEXT: sd zero, 0(sp) +; RV64I-NEXT: sd ra, 8(sp) # 8-byte Folded Spill +; RV64I-NEXT: sd s0, 0(sp) # 8-byte Folded Spill +; RV64I-NEXT: .cfi_offset ra, -8 +; RV64I-NEXT: .cfi_offset s0, -16 +; RV64I-NEXT: addi s0, sp, 16 +; RV64I-NEXT: .cfi_def_cfa s0, 0 +; RV64I-NEXT: addi a0, a0, 15 +; RV64I-NEXT: andi a0, a0, -16 +; RV64I-NEXT: sub a0, sp, a0 +; RV64I-NEXT: lui a2, 1 +; RV64I-NEXT: .LBB0_1: # =>This Inner Loop Header: Depth=1 +; RV64I-NEXT: sub sp, sp, a2 +; RV64I-NEXT: sd zero, 0(sp) +; RV64I-NEXT: blt a0, sp, .LBB0_1 +; RV64I-NEXT: # %bb.2: +; RV64I-NEXT: mv sp, a0 +; RV64I-NEXT: sd a0, 0(a1) +; RV64I-NEXT: addi sp, s0, -16 +; RV64I-NEXT: .cfi_def_cfa sp, 16 +; RV64I-NEXT: ld ra, 8(sp) # 8-byte Folded Reload +; RV64I-NEXT: ld s0, 0(sp) # 8-byte Folded Reload +; RV64I-NEXT: .cfi_restore ra +; RV64I-NEXT: .cfi_restore s0 +; RV64I-NEXT: addi sp, sp, 16 +; RV64I-NEXT: .cfi_def_cfa_offset 0 +; RV64I-NEXT: ret +; +; RV32I-LABEL: dynamic: +; RV32I: # %bb.0: +; RV32I-NEXT: addi sp, sp, -16 +; RV32I-NEXT: .cfi_def_cfa_offset 16 +; RV32I-NEXT: sw zero, 0(sp) +; RV32I-NEXT: sw ra, 12(sp) # 4-byte Folded Spill +; RV32I-NEXT: sw s0, 8(sp) # 4-byte Folded Spill +; RV32I-NEXT: .cfi_offset ra, -4 +; RV32I-NEXT: .cfi_offset s0, -8 +; RV32I-NEXT: addi s0, sp, 16 +; RV32I-NEXT: .cfi_def_cfa s0, 0 +; RV32I-NEXT: addi a0, a0, 15 +; RV32I-NEXT: andi a0, a0, -16 +; RV32I-NEXT: sub a0, sp, a0 +; RV32I-NEXT: lui a1, 1 +; RV32I-NEXT: .LBB0_1: # =>This Inner Loop Header: Depth=1 +; RV32I-NEXT: sub sp, sp, a1 +; RV32I-NEXT: sw zero, 0(sp) +; RV32I-NEXT: blt a0, sp, .LBB0_1 +; RV32I-NEXT: # %bb.2: +; RV32I-NEXT: mv sp, a0 +; RV32I-NEXT: sw a0, 0(a2) +; RV32I-NEXT: addi sp, s0, -16 +; RV32I-NEXT: .cfi_def_cfa sp, 16 +; RV32I-NEXT: lw ra, 12(sp) # 4-byte Folded Reload +; RV32I-NEXT: lw s0, 8(sp) # 4-byte Folded Reload +; 
RV32I-NEXT: .cfi_restore ra +; RV32I-NEXT: .cfi_restore s0 +; RV32I-NEXT: addi sp, sp, 16 +; RV32I-NEXT: .cfi_def_cfa_offset 0 +; RV32I-NEXT: ret + %v = alloca i8, i64 %size, align 1 + store ptr %v, ptr %out, align 8 + ret void +} + +; This function has a fixed-size stack slot and a dynamic one. The fixed size +; slot isn't large enough that we would normally probe it, but we need to do so +; here otherwise the gap between the CSR save and the first probe of the +; dynamic allocation could be too far apart when the size of the dynamic +; allocation is close to the guard size. +define void @dynamic_fixed(i64 %size, ptr %out1, ptr %out2) #0 { +; RV64I-LABEL: dynamic_fixed: +; RV64I: # %bb.0: +; RV64I-NEXT: addi sp, sp, -80 +; RV64I-NEXT: .cfi_def_cfa_offset 80 +; RV64I-NEXT: sd zero, 0(sp) +; RV64I-NEXT: sd ra, 72(sp) # 8-byte Folded Spill +; RV64I-NEXT: sd s0, 64(sp) # 8-byte Folded Spill +; RV64I-NEXT: .cfi_offset ra, -8 +; RV64I-NEXT: .cfi_offset s0, -16 +; RV64I-NEXT: addi s0, sp, 80 +; RV64I-NEXT: .cfi_def_cfa s0, 0 +; RV64I-NEXT: addi a3, s0, -80 +; RV64I-NEXT: addi a0, a0, 15 +; RV64I-NEXT: sd a3, 0(a1) +; RV64I-NEXT: andi a0, a0, -16 +; RV64I-NEXT: sub a0, sp, a0 +; RV64I-NEXT: lui a1, 1 +; RV64I-NEXT: .LBB1_1: # =>This Inner Loop Header: Depth=1 +; RV64I-NEXT: sub sp, sp, a1 +; RV64I-NEXT: sd zero, 0(sp) +; RV64I-NEXT: blt a0, sp, .LBB1_1 +; RV64I-NEXT: # %bb.2: +; RV64I-NEXT: mv sp, a0 +; RV64I-NEXT: sd a0, 0(a2) +; RV64I-NEXT: addi sp, s0, -80 +; RV64I-NEXT: .cfi_def_cfa sp, 80 +; RV64I-NEXT: ld ra, 72(sp) # 8-byte Folded Reload +; RV64I-NEXT: ld s0, 64(sp) # 8-byte Folded Reload +; RV64I-NEXT: .cfi_restore ra +; RV64I-NEXT: .cfi_restore s0 +; RV64I-NEXT: addi sp, sp, 80 +; RV64I-NEXT: .cfi_def_cfa_offset 0 +; RV64I-NEXT: ret +; +; RV32I-LABEL: dynamic_fixed: +; RV32I: # %bb.0: +; RV32I-NEXT: addi sp, sp, -80 +; RV32I-NEXT: .cfi_def_cfa_offset 80 +; RV32I-NEXT: sw zero, 0(sp) +; RV32I-NEXT: sw ra, 76(sp) # 4-byte Folded Spill +; RV32I-NEXT: sw s0, 72(sp) # 
4-byte Folded Spill +; RV32I-NEXT: .cfi_offset ra, -4 +; RV32I-NEXT: .cfi_offset s0, -8 +; RV32I-NEXT: addi s0, sp, 80 +; RV32I-NEXT: .cfi_def_cfa s0, 0 +; RV32I-NEXT: addi a1, s0, -72 +; RV32I-NEXT: addi a0, a0, 15 +; RV32I-NEXT: sw a1, 0(a2) +; RV32I-NEXT: andi a0, a0, -16 +; RV32I-NEXT: sub a0, sp, a0 +; RV32I-NEXT: lui a1, 1 +; RV32I-NEXT: .LBB1_1: # =>This Inner Loop Header: Depth=1 +; RV32I-NEXT: sub sp, sp, a1 +; RV32I-NEXT: sw zero, 0(sp) +; RV32I-NEXT: blt a0, sp, .LBB1_1 +; RV32I-NEXT: # %bb.2: +; RV32I-NEXT: mv sp, a0 +; RV32I-NEXT: sw a0, 0(a3) +; RV32I-NEXT: addi sp, s0, -80 +; RV32I-NEXT: .cfi_def_cfa sp, 80 +; RV32I-NEXT: lw ra, 76(sp) # 4-byte Folded Reload +; RV32I-NEXT: lw s0, 72(sp) # 4-byte Folded Reload +; RV32I-NEXT: .cfi_restore ra +; RV32I-NEXT: .cfi_restore s0 +; RV32I-NEXT: addi sp, sp, 80 +; RV32I-NEXT: .cfi_def_cfa_offset 0 +; RV32I-NEXT: ret + %v1 = alloca i8, i64 64, align 1 + store ptr %v1, ptr %out1, align 8 + %v2 = alloca i8, i64 %size, align 1 + store ptr %v2, ptr %out2, align 8 + ret void +} + +; Dynamic allocation, with an alignment requirement greater than the alignment +; of SP. Done by ANDing the target SP with a constant to align it down, then +; doing the loop as normal. Note that we also re-align the stack in the prolog, +; which isn't actually needed because the only aligned allocations are dynamic, +; this is done even without stack probing. 
+define void @dynamic_align_64(i64 %size, ptr %out) #0 { +; RV64I-LABEL: dynamic_align_64: +; RV64I: # %bb.0: +; RV64I-NEXT: addi sp, sp, -64 +; RV64I-NEXT: .cfi_def_cfa_offset 64 +; RV64I-NEXT: sd zero, 0(sp) +; RV64I-NEXT: sd ra, 56(sp) # 8-byte Folded Spill +; RV64I-NEXT: sd s0, 48(sp) # 8-byte Folded Spill +; RV64I-NEXT: sd s1, 40(sp) # 8-byte Folded Spill +; RV64I-NEXT: .cfi_offset ra, -8 +; RV64I-NEXT: .cfi_offset s0, -16 +; RV64I-NEXT: .cfi_offset s1, -24 +; RV64I-NEXT: addi s0, sp, 64 +; RV64I-NEXT: .cfi_def_cfa s0, 0 +; RV64I-NEXT: andi sp, sp, -64 +; RV64I-NEXT: mv s1, sp +; RV64I-NEXT: addi a0, a0, 15 +; RV64I-NEXT: andi a0, a0, -16 +; RV64I-NEXT: sub a0, sp, a0 +; RV64I-NEXT: andi a0, a0, -64 +; RV64I-NEXT: lui a2, 1 +; RV64I-NEXT: .LBB2_1: # =>This Inner Loop Header: Depth=1 +; RV64I-NEXT: sub sp, sp, a2 +; RV64I-NEXT: sd zero, 0(sp) +; RV64I-NEXT: blt a0, sp, .LBB2_1 +; RV64I-NEXT: # %bb.2: +; RV64I-NEXT: mv sp, a0 +; RV64I-NEXT: sd a0, 0(a1) +; RV64I-NEXT: addi sp, s0, -64 +; RV64I-NEXT: .cfi_def_cfa sp, 64 +; RV64I-NEXT: ld ra, 56(sp) # 8-byte Folded Reload +; RV64I-NEXT: ld s0, 48(sp) # 8-byte Folded Reload +; RV64I-NEXT: ld s1, 40(sp) # 8-byte Folded Reload +; RV64I-NEXT: .cfi_restore ra +; RV64I-NEXT: .cfi_restore s0 +; RV64I-NEXT: .cfi_restore s1 +; RV64I-NEXT: addi sp, sp, 64 +; RV64I-NEXT: .cfi_def_cfa_offset 0 +; RV64I-NEXT: ret +; +; RV32I-LABEL: dynamic_align_64: +; RV32I: # %bb.0: +; RV32I-NEXT: addi sp, sp, -64 +; RV32I-NEXT: .cfi_def_cfa_offset 64 +; RV32I-NEXT: sw zero, 0(sp) +; RV32I-NEXT: sw ra, 60(sp) # 4-byte Folded Spill +; RV32I-NEXT: sw s0, 56(sp) # 4-byte Folded Spill +; RV32I-NEXT: sw s1, 52(sp) # 4-byte Folded Spill +; RV32I-NEXT: .cfi_offset ra, -4 +; RV32I-NEXT: .cfi_offset s0, -8 +; RV32I-NEXT: .cfi_offset s1, -12 +; RV32I-NEXT: addi s0, sp, 64 +; RV32I-NEXT: .cfi_def_cfa s0, 0 +; RV32I-NEXT: andi sp, sp, -64 +; RV32I-NEXT: mv s1, sp +; RV32I-NEXT: addi a0, a0, 15 +; RV32I-NEXT: andi a0, a0, -16 +; RV32I-NEXT: sub a0, sp, 
a0 +; RV32I-NEXT: andi a0, a0, -64 +; RV32I-NEXT: lui a1, 1 +; RV32I-NEXT: .LBB2_1: # =>This Inner Loop Header: Depth=1 +; RV32I-NEXT: sub sp, sp, a1 +; RV32I-NEXT: sw zero, 0(sp) +; RV32I-NEXT: blt a0, sp, .LBB2_1 +; RV32I-NEXT: # %bb.2: +; RV32I-NEXT: mv sp, a0 +; RV32I-NEXT: sw a0, 0(a2) +; RV32I-NEXT: addi sp, s0, -64 +; RV32I-NEXT: .cfi_def_cfa sp, 64 +; RV32I-NEXT: lw ra, 60(sp) # 4-byte Folded Reload +; RV32I-NEXT: lw s0, 56(sp) # 4-byte Folded Reload +; RV32I-NEXT: lw s1, 52(sp) # 4-byte Folded Reload +; RV32I-NEXT: .cfi_restore ra +; RV32I-NEXT: .cfi_restore s0 +; RV32I-NEXT: .cfi_restore s1 +; RV32I-NEXT: addi sp, sp, 64 +; RV32I-NEXT: .cfi_def_cfa_offset 0 +; RV32I-NEXT: ret + %v = alloca i8, i64 %size, align 64 + store ptr %v, ptr %out, align 8 + ret void +} + +; Dynamic allocation, with an alignment greater than the stack guard size. The +; only difference to the dynamic allocation is the constant used for aligning +; the target SP, the loop will probe the whole allocation without needing to +; know about the alignment padding. 
+define void @dynamic_align_8192(i64 %size, ptr %out) #0 { +; RV64I-LABEL: dynamic_align_8192: +; RV64I: # %bb.0: +; RV64I-NEXT: addi sp, sp, -2032 +; RV64I-NEXT: .cfi_def_cfa_offset 2032 +; RV64I-NEXT: sd zero, 0(sp) +; RV64I-NEXT: sd ra, 2024(sp) # 8-byte Folded Spill +; RV64I-NEXT: sd s0, 2016(sp) # 8-byte Folded Spill +; RV64I-NEXT: sd s1, 2008(sp) # 8-byte Folded Spill +; RV64I-NEXT: .cfi_offset ra, -8 +; RV64I-NEXT: .cfi_offset s0, -16 +; RV64I-NEXT: .cfi_offset s1, -24 +; RV64I-NEXT: addi s0, sp, 2032 +; RV64I-NEXT: .cfi_def_cfa s0, 0 +; RV64I-NEXT: lui a2, 1 +; RV64I-NEXT: sub sp, sp, a2 +; RV64I-NEXT: sd zero, 0(sp) +; RV64I-NEXT: sub sp, sp, a2 +; RV64I-NEXT: sd zero, 0(sp) +; RV64I-NEXT: sub sp, sp, a2 +; RV64I-NEXT: sd zero, 0(sp) +; RV64I-NEXT: addi sp, sp, -2048 +; RV64I-NEXT: addi sp, sp, -16 +; RV64I-NEXT: sd zero, 0(sp) +; RV64I-NEXT: srli a2, sp, 13 +; RV64I-NEXT: slli sp, a2, 13 +; RV64I-NEXT: mv s1, sp +; RV64I-NEXT: addi a0, a0, 15 +; RV64I-NEXT: lui a2, 1048574 +; RV64I-NEXT: andi a0, a0, -16 +; RV64I-NEXT: sub a0, sp, a0 +; RV64I-NEXT: and a0, a0, a2 +; RV64I-NEXT: lui a2, 1 +; RV64I-NEXT: .LBB3_1: # =>This Inner Loop Header: Depth=1 +; RV64I-NEXT: sub sp, sp, a2 +; RV64I-NEXT: sd zero, 0(sp) +; RV64I-NEXT: blt a0, sp, .LBB3_1 +; RV64I-NEXT: # %bb.2: +; RV64I-NEXT: mv sp, a0 +; RV64I-NEXT: sd a0, 0(a1) +; RV64I-NEXT: addi sp, s0, -2032 +; RV64I-NEXT: .cfi_def_cfa sp, 2032 +; RV64I-NEXT: ld ra, 2024(sp) # 8-byte Folded Reload +; RV64I-NEXT: ld s0, 2016(sp) # 8-byte Folded Reload +; RV64I-NEXT: ld s1, 2008(sp) # 8-byte Folded Reload +; RV64I-NEXT: .cfi_restore ra +; RV64I-NEXT: .cfi_restore s0 +; RV64I-NEXT: .cfi_restore s1 +; RV64I-NEXT: addi sp, sp, 2032 +; RV64I-NEXT: .cfi_def_cfa_offset 0 +; RV64I-NEXT: ret +; +; RV32I-LABEL: dynamic_align_8192: +; RV32I: # %bb.0: +; RV32I-NEXT: addi sp, sp, -2032 +; RV32I-NEXT: .cfi_def_cfa_offset 2032 +; RV32I-NEXT: sw zero, 0(sp) +; RV32I-NEXT: sw ra, 2028(sp) # 4-byte Folded Spill +; RV32I-NEXT: sw s0, 
2024(sp) # 4-byte Folded Spill +; RV32I-NEXT: sw s1, 2020(sp) # 4-byte Folded Spill +; RV32I-NEXT: .cfi_offset ra, -4 +; RV32I-NEXT: .cfi_offset s0, -8 +; RV32I-NEXT: .cfi_offset s1, -12 +; RV32I-NEXT: addi s0, sp, 2032 +; RV32I-NEXT: .cfi_def_cfa s0, 0 +; RV32I-NEXT: lui a1, 1 +; RV32I-NEXT: sub sp, sp, a1 +; RV32I-NEXT: sw zero, 0(sp) +; RV32I-NEXT: sub sp, sp, a1 +; RV32I-NEXT: sw zero, 0(sp) +; RV32I-NEXT: sub sp, sp, a1 +; RV32I-NEXT: sw zero, 0(sp) +; RV32I-NEXT: addi sp, sp, -2048 +; RV32I-NEXT: addi sp, sp, -16 +; RV32I-NEXT: sw zero, 0(sp) +; RV32I-NEXT: srli a1, sp, 13 +; RV32I-NEXT: slli sp, a1, 13 +; RV32I-NEXT: mv s1, sp +; RV32I-NEXT: addi a0, a0, 15 +; RV32I-NEXT: lui a1, 1048574 +; RV32I-NEXT: andi a0, a0, -16 +; RV32I-NEXT: sub a0, sp, a0 +; RV32I-NEXT: and a0, a0, a1 +; RV32I-NEXT: lui a1, 1 +; RV32I-NEXT: .LBB3_1: # =>This Inner Loop Header: Depth=1 +; RV32I-NEXT: sub sp, sp, a1 +; RV32I-NEXT: sw zero, 0(sp) +; RV32I-NEXT: blt a0, sp, .LBB3_1 +; RV32I-NEXT: # %bb.2: +; RV32I-NEXT: mv sp, a0 +; RV32I-NEXT: sw a0, 0(a2) +; RV32I-NEXT: addi sp, s0, -2032 +; RV32I-NEXT: .cfi_def_cfa sp, 2032 +; RV32I-NEXT: lw ra, 2028(sp) # 4-byte Folded Reload +; RV32I-NEXT: lw s0, 2024(sp) # 4-byte Folded Reload +; RV32I-NEXT: lw s1, 2020(sp) # 4-byte Folded Reload +; RV32I-NEXT: .cfi_restore ra +; RV32I-NEXT: .cfi_restore s0 +; RV32I-NEXT: .cfi_restore s1 +; RV32I-NEXT: addi sp, sp, 2032 +; RV32I-NEXT: .cfi_def_cfa_offset 0 +; RV32I-NEXT: ret + %v = alloca i8, i64 %size, align 8192 + store ptr %v, ptr %out, align 8 + ret void +} + +; If a function has variable-sized stack objects, then any function calls which +; need to pass arguments on the stack must allocate the stack space for them +; dynamically, to ensure they are at the bottom of the frame. 
+define void @no_reserved_call_frame(i64 %n, i32 %dummy) #0 { +; RV64I-LABEL: no_reserved_call_frame: +; RV64I: # %bb.0: # %entry +; RV64I-NEXT: addi sp, sp, -16 +; RV64I-NEXT: .cfi_def_cfa_offset 16 +; RV64I-NEXT: sd zero, 0(sp) +; RV64I-NEXT: sd ra, 8(sp) # 8-byte Folded Spill +; RV64I-NEXT: sd s0, 0(sp) # 8-byte Folded Spill +; RV64I-NEXT: .cfi_offset ra, -8 +; RV64I-NEXT: .cfi_offset s0, -16 +; RV64I-NEXT: addi s0, sp, 16 +; RV64I-NEXT: .cfi_def_cfa s0, 0 +; RV64I-NEXT: slli a0, a0, 2 +; RV64I-NEXT: addi a0, a0, 15 +; RV64I-NEXT: andi a0, a0, -16 +; RV64I-NEXT: sub a0, sp, a0 +; RV64I-NEXT: lui a2, 1 +; RV64I-NEXT: .LBB4_1: # %entry +; RV64I-NEXT: # =>This Inner Loop Header: Depth=1 +; RV64I-NEXT: sub sp, sp, a2 +; RV64I-NEXT: sd zero, 0(sp) +; RV64I-NEXT: blt a0, sp, .LBB4_1 +; RV64I-NEXT: # %bb.2: # %entry +; RV64I-NEXT: mv sp, a0 +; RV64I-NEXT: call callee_stack_args +; RV64I-NEXT: addi sp, s0, -16 +; RV64I-NEXT: .cfi_def_cfa sp, 16 +; RV64I-NEXT: ld ra, 8(sp) # 8-byte Folded Reload +; RV64I-NEXT: ld s0, 0(sp) # 8-byte Folded Reload +; RV64I-NEXT: .cfi_restore ra +; RV64I-NEXT: .cfi_restore s0 +; RV64I-NEXT: addi sp, sp, 16 +; RV64I-NEXT: .cfi_def_cfa_offset 0 +; RV64I-NEXT: ret +; +; RV32I-LABEL: no_reserved_call_frame: +; RV32I: # %bb.0: # %entry +; RV32I-NEXT: addi sp, sp, -16 +; RV32I-NEXT: .cfi_def_cfa_offset 16 +; RV32I-NEXT: sw zero, 0(sp) +; RV32I-NEXT: sw ra, 12(sp) # 4-byte Folded Spill +; RV32I-NEXT: sw s0, 8(sp) # 4-byte Folded Spill +; RV32I-NEXT: .cfi_offset ra, -4 +; RV32I-NEXT: .cfi_offset s0, -8 +; RV32I-NEXT: addi s0, sp, 16 +; RV32I-NEXT: .cfi_def_cfa s0, 0 +; RV32I-NEXT: mv a1, a2 +; RV32I-NEXT: slli a0, a0, 2 +; RV32I-NEXT: addi a0, a0, 15 +; RV32I-NEXT: andi a0, a0, -16 +; RV32I-NEXT: sub a0, sp, a0 +; RV32I-NEXT: lui a2, 1 +; RV32I-NEXT: .LBB4_1: # %entry +; RV32I-NEXT: # =>This Inner Loop Header: Depth=1 +; RV32I-NEXT: sub sp, sp, a2 +; RV32I-NEXT: sw zero, 0(sp) +; RV32I-NEXT: blt a0, sp, .LBB4_1 +; RV32I-NEXT: # %bb.2: # %entry +; 
RV32I-NEXT: mv sp, a0 +; RV32I-NEXT: call callee_stack_args +; RV32I-NEXT: addi sp, s0, -16 +; RV32I-NEXT: .cfi_def_cfa sp, 16 +; RV32I-NEXT: lw ra, 12(sp) # 4-byte Folded Reload +; RV32I-NEXT: lw s0, 8(sp) # 4-byte Folded Reload +; RV32I-NEXT: .cfi_restore ra +; RV32I-NEXT: .cfi_restore s0 +; RV32I-NEXT: addi sp, sp, 16 +; RV32I-NEXT: .cfi_def_cfa_offset 0 +; RV32I-NEXT: ret +entry: + %v = alloca i32, i64 %n + call void @callee_stack_args(ptr %v, i32 %dummy) + ret void +} + +; Same as above but without a variable-sized allocation, so the reserved call +; frame can be folded into the fixed-size allocation in the prologue. +define void @reserved_call_frame(i64 %n, i32 %dummy) #0 { +; RV64I-LABEL: reserved_call_frame: +; RV64I: # %bb.0: # %entry +; RV64I-NEXT: addi sp, sp, -416 +; RV64I-NEXT: .cfi_def_cfa_offset 416 +; RV64I-NEXT: sd ra, 408(sp) # 8-byte Folded Spill +; RV64I-NEXT: .cfi_offset ra, -8 +; RV64I-NEXT: addi a0, sp, 8 +; RV64I-NEXT: call callee_stack_args +; RV64I-NEXT: ld ra, 408(sp) # 8-byte Folded Reload +; RV64I-NEXT: .cfi_restore ra +; RV64I-NEXT: addi sp, sp, 416 +; RV64I-NEXT: .cfi_def_cfa_offset 0 +; RV64I-NEXT: ret +; +; RV32I-LABEL: reserved_call_frame: +; RV32I: # %bb.0: # %entry +; RV32I-NEXT: addi sp, sp, -416 +; RV32I-NEXT: .cfi_def_cfa_offset 416 +; RV32I-NEXT: sw ra, 412(sp) # 4-byte Folded Spill +; RV32I-NEXT: .cfi_offset ra, -4 +; RV32I-NEXT: mv a1, a2 +; RV32I-NEXT: addi a0, sp, 12 +; RV32I-NEXT: call callee_stack_args +; RV32I-NEXT: lw ra, 412(sp) # 4-byte Folded Reload +; RV32I-NEXT: .cfi_restore ra +; RV32I-NEXT: addi sp, sp, 416 +; RV32I-NEXT: .cfi_def_cfa_offset 0 +; RV32I-NEXT: ret +entry: + %v = alloca i32, i64 100 + call void @callee_stack_args(ptr %v, i32 %dummy) + ret void +} + +declare void @callee_stack_args(ptr, i32) + +; Dynamic allocation of vectors +define void @dynamic_vector(i64 %size, ptr %out) #0 { +; RV64I-LABEL: dynamic_vector: +; RV64I: # %bb.0: +; RV64I-NEXT: addi sp, sp, -16 +; RV64I-NEXT: .cfi_def_cfa_offset 16 
+; RV64I-NEXT: sd zero, 0(sp) +; RV64I-NEXT: sd ra, 8(sp) # 8-byte Folded Spill +; RV64I-NEXT: sd s0, 0(sp) # 8-byte Folded Spill +; RV64I-NEXT: .cfi_offset ra, -8 +; RV64I-NEXT: .cfi_offset s0, -16 +; RV64I-NEXT: addi s0, sp, 16 +; RV64I-NEXT: .cfi_def_cfa s0, 0 +; RV64I-NEXT: csrr a2, vlenb +; RV64I-NEXT: mul a0, a2, a0 +; RV64I-NEXT: slli a0, a0, 1 +; RV64I-NEXT: sub a0, sp, a0 +; RV64I-NEXT: lui a2, 1 +; RV64I-NEXT: .LBB6_1: # =>This Inner Loop Header: Depth=1 +; RV64I-NEXT: sub sp, sp, a2 +; RV64I-NEXT: sd zero, 0(sp) +; RV64I-NEXT: blt a0, sp, .LBB6_1 +; RV64I-NEXT: # %bb.2: +; RV64I-NEXT: mv sp, a0 +; RV64I-NEXT: sd a0, 0(a1) +; RV64I-NEXT: addi sp, s0, -16 +; RV64I-NEXT: .cfi_def_cfa sp, 16 +; RV64I-NEXT: ld ra, 8(sp) # 8-byte Folded Reload +; RV64I-NEXT: ld s0, 0(sp) # 8-byte Folded Reload +; RV64I-NEXT: .cfi_restore ra +; RV64I-NEXT: .cfi_restore s0 +; RV64I-NEXT: addi sp, sp, 16 +; RV64I-NEXT: .cfi_def_cfa_offset 0 +; RV64I-NEXT: ret +; +; RV32I-LABEL: dynamic_vector: +; RV32I: # %bb.0: +; RV32I-NEXT: addi sp, sp, -16 +; RV32I-NEXT: .cfi_def_cfa_offset 16 +; RV32I-NEXT: sw zero, 0(sp) +; RV32I-NEXT: sw ra, 12(sp) # 4-byte Folded Spill +; RV32I-NEXT: sw s0, 8(sp) # 4-byte Folded Spill +; RV32I-NEXT: .cfi_offset ra, -4 +; RV32I-NEXT: .cfi_offset s0, -8 +; RV32I-NEXT: addi s0, sp, 16 +; RV32I-NEXT: .cfi_def_cfa s0, 0 +; RV32I-NEXT: csrr a1, vlenb +; RV32I-NEXT: mul a0, a1, a0 +; RV32I-NEXT: slli a0, a0, 1 +; RV32I-NEXT: sub a0, sp, a0 +; RV32I-NEXT: lui a1, 1 +; RV32I-NEXT: .LBB6_1: # =>This Inner Loop Header: Depth=1 +; RV32I-NEXT: sub sp, sp, a1 +; RV32I-NEXT: sw zero, 0(sp) +; RV32I-NEXT: blt a0, sp, .LBB6_1 +; RV32I-NEXT: # %bb.2: +; RV32I-NEXT: mv sp, a0 +; RV32I-NEXT: sw a0, 0(a2) +; RV32I-NEXT: addi sp, s0, -16 +; RV32I-NEXT: .cfi_def_cfa sp, 16 +; RV32I-NEXT: lw ra, 12(sp) # 4-byte Folded Reload +; RV32I-NEXT: lw s0, 8(sp) # 4-byte Folded Reload +; RV32I-NEXT: .cfi_restore ra +; RV32I-NEXT: .cfi_restore s0 +; RV32I-NEXT: addi sp, sp, 16 +; 
RV32I-NEXT: .cfi_def_cfa_offset 0 +; RV32I-NEXT: ret + %v = alloca , i64 %size, align 16 + store ptr %v, ptr %out, align 8 + ret void +} + +attributes #0 = { uwtable(async) "probe-stack"="inline-asm" "frame-pointer"="none" } diff --git a/llvm/test/CodeGen/RISCV/rvv/strided-load-store.ll b/llvm/test/CodeGen/RISCV/rvv/strided-load-store.ll index f777c450bc106..45f158f929ca8 100644 --- a/llvm/test/CodeGen/RISCV/rvv/strided-load-store.ll +++ b/llvm/test/CodeGen/RISCV/rvv/strided-load-store.ll @@ -105,6 +105,166 @@ for.cond.cleanup: ; preds = %vector.body ret %accum.next } +define @gather_non_invariant_step(ptr %a, ptr %b, i32 %len) { +; CHECK-LABEL: @gather_non_invariant_step( +; CHECK-NEXT: vector.ph: +; CHECK-NEXT: [[WIDE_TRIP_COUNT:%.*]] = zext i32 [[LEN:%.*]] to i64 +; CHECK-NEXT: br label [[VECTOR_BODY:%.*]] +; CHECK: vector.body: +; CHECK-NEXT: [[VEC_IND_SCALAR:%.*]] = phi i64 [ 0, [[VECTOR_PH:%.*]] ], [ [[VEC_IND_NEXT_SCALAR:%.*]], [[VECTOR_BODY]] ] +; CHECK-NEXT: [[VEC_IND_SCALAR1:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[VEC_IND_NEXT_SCALAR1:%.*]], [[VECTOR_BODY]] ] +; CHECK-NEXT: [[ACCUM:%.*]] = phi [ zeroinitializer, [[VECTOR_PH]] ], [ [[ACCUM_NEXT:%.*]], [[VECTOR_BODY]] ] +; CHECK-NEXT: [[TMP0:%.*]] = getelementptr [[STRUCT_FOO:%.*]], ptr [[A:%.*]], i64 [[VEC_IND_SCALAR1]], i32 3 +; CHECK-NEXT: [[TMP1:%.*]] = call i32 @llvm.vscale.i32() +; CHECK-NEXT: [[TMP3:%.*]] = call @llvm.experimental.vp.strided.load.nxv1i64.p0.i64(ptr [[TMP0]], i64 16, splat (i1 true), i32 [[TMP1]]) +; CHECK-NEXT: [[GATHER:%.*]] = call @llvm.vp.select.nxv1i64( splat (i1 true), [[TMP3]], undef, i32 [[TMP1]]) +; CHECK-NEXT: [[ACCUM_NEXT]] = add [[ACCUM]], [[GATHER]] +; CHECK-NEXT: [[B:%.*]] = getelementptr i64, ptr [[B1:%.*]], i64 [[VEC_IND_SCALAR]] +; CHECK-NEXT: [[STEP:%.*]] = load i64, ptr [[B]], align 8 +; CHECK-NEXT: [[VEC_IND_NEXT_SCALAR]] = add nuw i64 [[VEC_IND_SCALAR]], [[STEP]] +; CHECK-NEXT: [[VEC_IND_NEXT_SCALAR1]] = add i64 [[VEC_IND_SCALAR1]], [[STEP]] +; CHECK-NEXT: 
[[TMP2:%.*]] = icmp ne i64 [[VEC_IND_NEXT_SCALAR]], [[WIDE_TRIP_COUNT]] +; CHECK-NEXT: br i1 [[TMP2]], label [[FOR_COND_CLEANUP:%.*]], label [[VECTOR_BODY]] +; CHECK: for.cond.cleanup: +; CHECK-NEXT: ret [[ACCUM_NEXT]] +; +vector.ph: + %wide.trip.count = zext i32 %len to i64 + %1 = tail call @llvm.stepvector.nxv1i64() + br label %vector.body + +vector.body: ; preds = %vector.body, %vector.ph + %index = phi i64 [ 0, %vector.ph ], [ %index.next, %vector.body ] + %vec.ind = phi [ %1, %vector.ph ], [ %vec.ind.next, %vector.body ] + %accum = phi [ zeroinitializer, %vector.ph ], [ %accum.next, %vector.body ] + %2 = getelementptr inbounds %struct.foo, ptr %a, %vec.ind, i32 3 + %gather = call @llvm.masked.gather.nxv1i64.nxv1p0( %2, i32 8, splat (i1 true), undef) + %accum.next = add %accum, %gather + + %b.gep = getelementptr i64, ptr %b, i64 %index + %step = load i64, ptr %b.gep + %index.next = add nuw i64 %index, %step + %.splatinsert = insertelement poison, i64 %step, i64 0 + %.splat = shufflevector %.splatinsert, poison, zeroinitializer + %vec.ind.next = add %vec.ind, %.splat + %3 = icmp ne i64 %index.next, %wide.trip.count + br i1 %3, label %for.cond.cleanup, label %vector.body + +for.cond.cleanup: ; preds = %vector.body + ret %accum.next +} + +define @gather_non_invariant_step_shl(ptr %a, ptr %b, i32 %len) { +; CHECK-LABEL: @gather_non_invariant_step_shl( +; CHECK-NEXT: vector.ph: +; CHECK-NEXT: [[WIDE_TRIP_COUNT:%.*]] = zext i32 [[LEN:%.*]] to i64 +; CHECK-NEXT: br label [[VECTOR_BODY:%.*]] +; CHECK: vector.body: +; CHECK-NEXT: [[VEC_IND_SCALAR:%.*]] = phi i64 [ 0, [[VECTOR_PH:%.*]] ], [ [[VEC_IND_NEXT_SCALAR:%.*]], [[VECTOR_BODY]] ] +; CHECK-NEXT: [[VEC_IND_SCALAR1:%.*]] = phi i64 [ 168, [[VECTOR_PH]] ], [ [[VEC_IND_NEXT_SCALAR1:%.*]], [[VECTOR_BODY]] ] +; CHECK-NEXT: [[ACCUM:%.*]] = phi [ zeroinitializer, [[VECTOR_PH]] ], [ [[ACCUM_NEXT:%.*]], [[VECTOR_BODY]] ] +; CHECK-NEXT: [[TMP0:%.*]] = getelementptr [[STRUCT_FOO:%.*]], ptr [[A:%.*]], i64 [[VEC_IND_SCALAR1]], 
i32 3 +; CHECK-NEXT: [[TMP1:%.*]] = call i32 @llvm.vscale.i32() +; CHECK-NEXT: [[TMP3:%.*]] = call @llvm.experimental.vp.strided.load.nxv1i64.p0.i64(ptr [[TMP0]], i64 64, splat (i1 true), i32 [[TMP1]]) +; CHECK-NEXT: [[GATHER:%.*]] = call @llvm.vp.select.nxv1i64( splat (i1 true), [[TMP3]], undef, i32 [[TMP1]]) +; CHECK-NEXT: [[ACCUM_NEXT]] = add [[ACCUM]], [[GATHER]] +; CHECK-NEXT: [[B:%.*]] = getelementptr i64, ptr [[B1:%.*]], i64 [[VEC_IND_SCALAR]] +; CHECK-NEXT: [[STEP:%.*]] = load i64, ptr [[B]], align 8 +; CHECK-NEXT: [[STEP1:%.*]] = shl i64 [[STEP]], 2 +; CHECK-NEXT: [[VEC_IND_NEXT_SCALAR]] = add nuw i64 [[VEC_IND_SCALAR]], [[STEP]] +; CHECK-NEXT: [[VEC_IND_NEXT_SCALAR1]] = add i64 [[VEC_IND_SCALAR1]], [[STEP1]] +; CHECK-NEXT: [[TMP2:%.*]] = icmp ne i64 [[VEC_IND_NEXT_SCALAR]], [[WIDE_TRIP_COUNT]] +; CHECK-NEXT: br i1 [[TMP2]], label [[FOR_COND_CLEANUP:%.*]], label [[VECTOR_BODY]] +; CHECK: for.cond.cleanup: +; CHECK-NEXT: ret [[ACCUM_NEXT]] +; +vector.ph: + %wide.trip.count = zext i32 %len to i64 + %1 = tail call @llvm.stepvector.nxv1i64() + br label %vector.body + +vector.body: ; preds = %vector.body, %vector.ph + %index = phi i64 [ 0, %vector.ph ], [ %index.next, %vector.body ] + %vec.ind = phi [ %1, %vector.ph ], [ %vec.ind.next, %vector.body ] + %accum = phi [ zeroinitializer, %vector.ph ], [ %accum.next, %vector.body ] + + %vec.ind.add = add %vec.ind, splat (i64 42) + %vec.ind.shl = shl %vec.ind.add, splat (i64 2) + + %2 = getelementptr inbounds %struct.foo, ptr %a, %vec.ind.shl, i32 3 + %gather = call @llvm.masked.gather.nxv1i64.nxv1p0( %2, i32 8, splat (i1 true), undef) + %accum.next = add %accum, %gather + + %b.gep = getelementptr i64, ptr %b, i64 %index + %step = load i64, ptr %b.gep + %index.next = add nuw i64 %index, %step + %.splatinsert = insertelement poison, i64 %step, i64 0 + %.splat = shufflevector %.splatinsert, poison, zeroinitializer + %vec.ind.next = add %vec.ind, %.splat + %3 = icmp ne i64 %index.next, %wide.trip.count + br i1 %3, label 
%for.cond.cleanup, label %vector.body + +for.cond.cleanup: ; preds = %vector.body + ret %accum.next +} + +; Check that the operand of the binary op (%scale.splat in shl) always dominates +; the existing step value when we're adjusting it. +define @gather_splat_op_after_step(ptr %a, ptr %b, i32 %len) { +; CHECK-LABEL: @gather_splat_op_after_step( +; CHECK-NEXT: vector.ph: +; CHECK-NEXT: [[WIDE_TRIP_COUNT:%.*]] = zext i32 [[LEN:%.*]] to i64 +; CHECK-NEXT: [[TMP0:%.*]] = tail call i64 @llvm.vscale.i64() +; CHECK-NEXT: [[SCALE:%.*]] = load i64, ptr [[B:%.*]], align 8 +; CHECK-NEXT: [[STRIDE:%.*]] = shl i64 1, [[SCALE]] +; CHECK-NEXT: [[STEP:%.*]] = shl i64 [[TMP0]], [[SCALE]] +; CHECK-NEXT: [[TMP1:%.*]] = mul i64 [[STRIDE]], 16 +; CHECK-NEXT: br label [[VECTOR_BODY:%.*]] +; CHECK: vector.body: +; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH:%.*]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] +; CHECK-NEXT: [[VEC_IND_SCALAR:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[VEC_IND_NEXT_SCALAR:%.*]], [[VECTOR_BODY]] ] +; CHECK-NEXT: [[ACCUM:%.*]] = phi [ zeroinitializer, [[VECTOR_PH]] ], [ [[ACCUM_NEXT:%.*]], [[VECTOR_BODY]] ] +; CHECK-NEXT: [[TMP2:%.*]] = getelementptr [[STRUCT_FOO:%.*]], ptr [[A:%.*]], i64 [[VEC_IND_SCALAR]], i32 3 +; CHECK-NEXT: [[TMP3:%.*]] = call i32 @llvm.vscale.i32() +; CHECK-NEXT: [[TMP4:%.*]] = call @llvm.experimental.vp.strided.load.nxv1i64.p0.i64(ptr [[TMP2]], i64 [[TMP1]], splat (i1 true), i32 [[TMP3]]) +; CHECK-NEXT: [[GATHER:%.*]] = call @llvm.vp.select.nxv1i64( splat (i1 true), [[TMP4]], undef, i32 [[TMP3]]) +; CHECK-NEXT: [[ACCUM_NEXT]] = add [[ACCUM]], [[GATHER]] +; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP0]] +; CHECK-NEXT: [[VEC_IND_NEXT_SCALAR]] = add i64 [[VEC_IND_SCALAR]], [[STEP]] +; CHECK-NEXT: [[TMP5:%.*]] = icmp ne i64 [[INDEX_NEXT]], [[WIDE_TRIP_COUNT]] +; CHECK-NEXT: br i1 [[TMP5]], label [[FOR_COND_CLEANUP:%.*]], label [[VECTOR_BODY]] +; CHECK: for.cond.cleanup: +; CHECK-NEXT: ret [[ACCUM_NEXT]] +; +vector.ph: + 
%wide.trip.count = zext i32 %len to i64 + %0 = tail call i64 @llvm.vscale.i64() + %1 = tail call @llvm.stepvector.nxv1i64() + %.splatinsert = insertelement poison, i64 %0, i64 0 + %.splat = shufflevector %.splatinsert, poison, zeroinitializer + + %scale = load i64, ptr %b + %scale.head = insertelement poison, i64 %scale, i64 0 + %scale.splat = shufflevector %scale.head, poison, zeroinitializer + br label %vector.body + +vector.body: ; preds = %vector.body, %vector.ph + %index = phi i64 [ 0, %vector.ph ], [ %index.next, %vector.body ] + %vec.ind = phi [ %1, %vector.ph ], [ %vec.ind.next, %vector.body ] + %accum = phi [ zeroinitializer, %vector.ph ], [ %accum.next, %vector.body ] + %vec.ind.shl = shl %vec.ind, %scale.splat + %2 = getelementptr inbounds %struct.foo, ptr %a, %vec.ind.shl, i32 3 + %gather = call @llvm.masked.gather.nxv1i64.nxv1p0( %2, i32 8, splat (i1 true), undef) + %accum.next = add %accum, %gather + %index.next = add nuw i64 %index, %0 + %vec.ind.next = add %vec.ind, %.splat + %3 = icmp ne i64 %index.next, %wide.trip.count + br i1 %3, label %for.cond.cleanup, label %vector.body + +for.cond.cleanup: ; preds = %vector.body + ret %accum.next +} + define void @scatter(ptr %a, i32 %len) { ; CHECK-LABEL: @scatter( ; CHECK-NEXT: vector.ph: @@ -146,6 +306,99 @@ for.cond.cleanup: ; preds = %vector.body ret void } +define void @scatter_non_invariant_step(ptr %a, ptr %b, i32 %len) { +; CHECK-LABEL: @scatter_non_invariant_step( +; CHECK-NEXT: vector.ph: +; CHECK-NEXT: [[WIDE_TRIP_COUNT:%.*]] = zext i32 [[LEN:%.*]] to i64 +; CHECK-NEXT: br label [[VECTOR_BODY:%.*]] +; CHECK: vector.body: +; CHECK-NEXT: [[VEC_IND_SCALAR:%.*]] = phi i64 [ 0, [[VECTOR_PH:%.*]] ], [ [[VEC_IND_NEXT_SCALAR:%.*]], [[VECTOR_BODY]] ] +; CHECK-NEXT: [[VEC_IND_SCALAR1:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[VEC_IND_NEXT_SCALAR1:%.*]], [[VECTOR_BODY]] ] +; CHECK-NEXT: [[TMP0:%.*]] = getelementptr [[STRUCT_FOO:%.*]], ptr [[A:%.*]], i64 [[VEC_IND_SCALAR1]], i32 3 +; CHECK-NEXT: [[TMP1:%.*]] 
= call i32 @llvm.vscale.i32() +; CHECK-NEXT: call void @llvm.experimental.vp.strided.store.nxv1i64.p0.i64( zeroinitializer, ptr [[TMP0]], i64 16, splat (i1 true), i32 [[TMP1]]) +; CHECK-NEXT: [[B:%.*]] = getelementptr i64, ptr [[B1:%.*]], i64 [[VEC_IND_SCALAR]] +; CHECK-NEXT: [[STEP:%.*]] = load i64, ptr [[B]], align 8 +; CHECK-NEXT: [[VEC_IND_NEXT_SCALAR]] = add nuw i64 [[VEC_IND_SCALAR]], [[STEP]] +; CHECK-NEXT: [[VEC_IND_NEXT_SCALAR1]] = add i64 [[VEC_IND_SCALAR1]], [[STEP]] +; CHECK-NEXT: [[TMP2:%.*]] = icmp ne i64 [[VEC_IND_NEXT_SCALAR]], [[WIDE_TRIP_COUNT]] +; CHECK-NEXT: br i1 [[TMP2]], label [[FOR_COND_CLEANUP:%.*]], label [[VECTOR_BODY]] +; CHECK: for.cond.cleanup: +; CHECK-NEXT: ret void +; +vector.ph: + %wide.trip.count = zext i32 %len to i64 + %1 = tail call @llvm.stepvector.nxv1i64() + br label %vector.body + +vector.body: ; preds = %vector.body, %vector.ph + %index = phi i64 [ 0, %vector.ph ], [ %index.next, %vector.body ] + %vec.ind = phi [ %1, %vector.ph ], [ %vec.ind.next, %vector.body ] + %2 = getelementptr inbounds %struct.foo, ptr %a, %vec.ind, i32 3 + tail call void @llvm.masked.scatter.nxv1i64.nxv1p0( zeroinitializer, %2, i32 8, splat (i1 true)) + + %b.gep = getelementptr i64, ptr %b, i64 %index + %step = load i64, ptr %b.gep + %index.next = add nuw i64 %index, %step + %.splatinsert = insertelement poison, i64 %step, i64 0 + %.splat = shufflevector %.splatinsert, poison, zeroinitializer + %vec.ind.next = add %vec.ind, %.splat + %3 = icmp ne i64 %index.next, %wide.trip.count + br i1 %3, label %for.cond.cleanup, label %vector.body + +for.cond.cleanup: ; preds = %vector.body + ret void +} + +define void @scatter_non_invariant_step_add_shl(ptr %a, ptr %b, i32 %len) { +; CHECK-LABEL: @scatter_non_invariant_step_add_shl( +; CHECK-NEXT: vector.ph: +; CHECK-NEXT: [[WIDE_TRIP_COUNT:%.*]] = zext i32 [[LEN:%.*]] to i64 +; CHECK-NEXT: br label [[VECTOR_BODY:%.*]] +; CHECK: vector.body: +; CHECK-NEXT: [[VEC_IND_SCALAR:%.*]] = phi i64 [ 0, [[VECTOR_PH:%.*]] 
], [ [[VEC_IND_NEXT_SCALAR:%.*]], [[VECTOR_BODY]] ] +; CHECK-NEXT: [[VEC_IND_SCALAR1:%.*]] = phi i64 [ 168, [[VECTOR_PH]] ], [ [[VEC_IND_NEXT_SCALAR1:%.*]], [[VECTOR_BODY]] ] +; CHECK-NEXT: [[TMP0:%.*]] = getelementptr [[STRUCT_FOO:%.*]], ptr [[A:%.*]], i64 [[VEC_IND_SCALAR1]], i32 3 +; CHECK-NEXT: [[TMP1:%.*]] = call i32 @llvm.vscale.i32() +; CHECK-NEXT: call void @llvm.experimental.vp.strided.store.nxv1i64.p0.i64( zeroinitializer, ptr [[TMP0]], i64 64, splat (i1 true), i32 [[TMP1]]) +; CHECK-NEXT: [[B:%.*]] = getelementptr i64, ptr [[B1:%.*]], i64 [[VEC_IND_SCALAR]] +; CHECK-NEXT: [[STEP:%.*]] = load i64, ptr [[B]], align 8 +; CHECK-NEXT: [[STEP1:%.*]] = shl i64 [[STEP]], 2 +; CHECK-NEXT: [[VEC_IND_NEXT_SCALAR]] = add nuw i64 [[VEC_IND_SCALAR]], [[STEP]] +; CHECK-NEXT: [[VEC_IND_NEXT_SCALAR1]] = add i64 [[VEC_IND_SCALAR1]], [[STEP1]] +; CHECK-NEXT: [[TMP2:%.*]] = icmp ne i64 [[VEC_IND_NEXT_SCALAR]], [[WIDE_TRIP_COUNT]] +; CHECK-NEXT: br i1 [[TMP2]], label [[FOR_COND_CLEANUP:%.*]], label [[VECTOR_BODY]] +; CHECK: for.cond.cleanup: +; CHECK-NEXT: ret void +; +vector.ph: + %wide.trip.count = zext i32 %len to i64 + %1 = tail call @llvm.stepvector.nxv1i64() + br label %vector.body + +vector.body: ; preds = %vector.body, %vector.ph + %index = phi i64 [ 0, %vector.ph ], [ %index.next, %vector.body ] + %vec.ind = phi [ %1, %vector.ph ], [ %vec.ind.next, %vector.body ] + + %vec.ind.add = add %vec.ind, splat (i64 42) + %vec.ind.shl = shl %vec.ind.add, splat (i64 2) + + %2 = getelementptr inbounds %struct.foo, ptr %a, %vec.ind.shl, i32 3 + tail call void @llvm.masked.scatter.nxv1i64.nxv1p0( zeroinitializer, %2, i32 8, splat (i1 true)) + + %b.gep = getelementptr i64, ptr %b, i64 %index + %step = load i64, ptr %b.gep + %index.next = add nuw i64 %index, %step + %.splatinsert = insertelement poison, i64 %step, i64 0 + %.splat = shufflevector %.splatinsert, poison, zeroinitializer + %vec.ind.next = add %vec.ind, %.splat + %3 = icmp ne i64 %index.next, %wide.trip.count + br i1 
%3, label %for.cond.cleanup, label %vector.body + +for.cond.cleanup: ; preds = %vector.body + ret void +} + define @gather_loopless(ptr %p, i64 %stride) { ; CHECK-LABEL: @gather_loopless( ; CHECK-NEXT: [[TMP1:%.*]] = mul i64 [[STRIDE:%.*]], 4 @@ -491,23 +744,20 @@ define @evl_gather(ptr %a, i32 %len) { ; CHECK-LABEL: @evl_gather( ; CHECK-NEXT: vector.ph: ; CHECK-NEXT: [[WIDE_TRIP_COUNT:%.*]] = zext i32 [[LEN:%.*]] to i64 -; CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.stepvector.nxv1i64() ; CHECK-NEXT: br label [[VECTOR_BODY:%.*]] ; CHECK: vector.body: -; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH:%.*]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] -; CHECK-NEXT: [[VEC_IND:%.*]] = phi [ [[TMP1]], [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[VECTOR_BODY]] ] +; CHECK-NEXT: [[VEC_IND_SCALAR:%.*]] = phi i64 [ 0, [[VECTOR_PH:%.*]] ], [ [[VEC_IND_NEXT_SCALAR:%.*]], [[VECTOR_BODY]] ] +; CHECK-NEXT: [[VEC_IND_SCALAR1:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[VEC_IND_NEXT_SCALAR1:%.*]], [[VECTOR_BODY]] ] ; CHECK-NEXT: [[ACCUM:%.*]] = phi [ zeroinitializer, [[VECTOR_PH]] ], [ [[ACCUM_NEXT:%.*]], [[VECTOR_BODY]] ] -; CHECK-NEXT: [[ELEMS:%.*]] = sub i64 [[WIDE_TRIP_COUNT]], [[INDEX]] +; CHECK-NEXT: [[ELEMS:%.*]] = sub i64 [[WIDE_TRIP_COUNT]], [[VEC_IND_SCALAR]] ; CHECK-NEXT: [[EVL:%.*]] = call i32 @llvm.experimental.get.vector.length.i64(i64 [[ELEMS]], i32 1, i1 true) -; CHECK-NEXT: [[TMP2:%.*]] = getelementptr inbounds [[STRUCT_FOO:%.*]], ptr [[A:%.*]], [[VEC_IND]], i32 3 -; CHECK-NEXT: [[GATHER:%.*]] = call @llvm.vp.gather.nxv1i64.nxv1p0( [[TMP2]], splat (i1 true), i32 [[EVL]]) +; CHECK-NEXT: [[TMP0:%.*]] = getelementptr [[STRUCT_FOO:%.*]], ptr [[A:%.*]], i64 [[VEC_IND_SCALAR1]], i32 3 +; CHECK-NEXT: [[GATHER:%.*]] = call @llvm.experimental.vp.strided.load.nxv1i64.p0.i64(ptr [[TMP0]], i64 16, splat (i1 true), i32 [[EVL]]) ; CHECK-NEXT: [[ACCUM_NEXT]] = add [[ACCUM]], [[GATHER]] ; CHECK-NEXT: [[EVL_ZEXT:%.*]] = zext i32 [[EVL]] to i64 -; CHECK-NEXT: [[INDEX_NEXT]] = add 
nuw i64 [[INDEX]], [[EVL_ZEXT]] -; CHECK-NEXT: [[EVL_SPLATINSERT:%.*]] = insertelement poison, i64 [[EVL_ZEXT]], i64 0 -; CHECK-NEXT: [[EVL_SPLAT:%.*]] = shufflevector [[EVL_SPLATINSERT]], poison, zeroinitializer -; CHECK-NEXT: [[VEC_IND_NEXT]] = add [[VEC_IND]], [[EVL_SPLAT]] -; CHECK-NEXT: [[TMP3:%.*]] = icmp ne i64 [[INDEX_NEXT]], [[WIDE_TRIP_COUNT]] +; CHECK-NEXT: [[VEC_IND_NEXT_SCALAR]] = add nuw i64 [[VEC_IND_SCALAR]], [[EVL_ZEXT]] +; CHECK-NEXT: [[VEC_IND_NEXT_SCALAR1]] = add i64 [[VEC_IND_SCALAR1]], [[EVL_ZEXT]] +; CHECK-NEXT: [[TMP3:%.*]] = icmp ne i64 [[VEC_IND_NEXT_SCALAR]], [[WIDE_TRIP_COUNT]] ; CHECK-NEXT: br i1 [[TMP3]], label [[FOR_COND_CLEANUP:%.*]], label [[VECTOR_BODY]] ; CHECK: for.cond.cleanup: ; CHECK-NEXT: ret [[ACCUM_NEXT]] @@ -548,21 +798,18 @@ define void @evl_scatter(ptr %a, i32 %len) { ; CHECK-LABEL: @evl_scatter( ; CHECK-NEXT: vector.ph: ; CHECK-NEXT: [[WIDE_TRIP_COUNT:%.*]] = zext i32 [[LEN:%.*]] to i64 -; CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.stepvector.nxv1i64() ; CHECK-NEXT: br label [[VECTOR_BODY:%.*]] ; CHECK: vector.body: -; CHECK-NEXT: [[VEC_IND_SCALAR:%.*]] = phi i64 [ 0, [[VECTOR_PH:%.*]] ], [ [[VEC_IND_NEXT_SCALAR:%.*]], [[VECTOR_BODY]] ] -; CHECK-NEXT: [[VEC_IND:%.*]] = phi [ [[TMP0]], [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[VECTOR_BODY]] ] -; CHECK-NEXT: [[ELEMS:%.*]] = sub i64 [[WIDE_TRIP_COUNT]], [[VEC_IND_SCALAR]] +; CHECK-NEXT: [[VEC_IND_SCALAR1:%.*]] = phi i64 [ 0, [[VECTOR_PH:%.*]] ], [ [[VEC_IND_NEXT_SCALAR1:%.*]], [[VECTOR_BODY]] ] +; CHECK-NEXT: [[VEC_IND_SCALAR:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[VEC_IND_NEXT_SCALAR:%.*]], [[VECTOR_BODY]] ] +; CHECK-NEXT: [[ELEMS:%.*]] = sub i64 [[WIDE_TRIP_COUNT]], [[VEC_IND_SCALAR1]] ; CHECK-NEXT: [[EVL:%.*]] = call i32 @llvm.experimental.get.vector.length.i64(i64 [[ELEMS]], i32 1, i1 true) -; CHECK-NEXT: [[TMP1:%.*]] = getelementptr inbounds [[STRUCT_FOO:%.*]], ptr [[A:%.*]], [[VEC_IND]], i32 3 -; CHECK-NEXT: tail call void @llvm.vp.scatter.nxv1i64.nxv1p0( 
zeroinitializer, [[TMP1]], splat (i1 true), i32 [[EVL]]) +; CHECK-NEXT: [[TMP0:%.*]] = getelementptr [[STRUCT_FOO:%.*]], ptr [[A:%.*]], i64 [[VEC_IND_SCALAR]], i32 3 +; CHECK-NEXT: call void @llvm.experimental.vp.strided.store.nxv1i64.p0.i64( zeroinitializer, ptr [[TMP0]], i64 16, splat (i1 true), i32 [[EVL]]) ; CHECK-NEXT: [[EVL_ZEXT:%.*]] = zext i32 [[EVL]] to i64 -; CHECK-NEXT: [[VEC_IND_NEXT_SCALAR]] = add nuw i64 [[VEC_IND_SCALAR]], [[EVL_ZEXT]] -; CHECK-NEXT: [[EVL_SPLATINSERT:%.*]] = insertelement poison, i64 [[EVL_ZEXT]], i64 0 -; CHECK-NEXT: [[EVL_SPLAT:%.*]] = shufflevector [[EVL_SPLATINSERT]], poison, zeroinitializer -; CHECK-NEXT: [[VEC_IND_NEXT]] = add [[VEC_IND]], [[EVL_SPLAT]] -; CHECK-NEXT: [[TMP3:%.*]] = icmp ne i64 [[VEC_IND_NEXT_SCALAR]], [[WIDE_TRIP_COUNT]] +; CHECK-NEXT: [[VEC_IND_NEXT_SCALAR1]] = add nuw i64 [[VEC_IND_SCALAR1]], [[EVL_ZEXT]] +; CHECK-NEXT: [[VEC_IND_NEXT_SCALAR]] = add i64 [[VEC_IND_SCALAR]], [[EVL_ZEXT]] +; CHECK-NEXT: [[TMP3:%.*]] = icmp ne i64 [[VEC_IND_NEXT_SCALAR1]], [[WIDE_TRIP_COUNT]] ; CHECK-NEXT: br i1 [[TMP3]], label [[FOR_COND_CLEANUP:%.*]], label [[VECTOR_BODY]] ; CHECK: for.cond.cleanup: ; CHECK-NEXT: ret void diff --git a/llvm/test/CodeGen/RISCV/rvv/vl-opt-op-info.mir b/llvm/test/CodeGen/RISCV/rvv/vl-opt-op-info.mir index fe0929a6f8745..edcd32c4098bc 100644 --- a/llvm/test/CodeGen/RISCV/rvv/vl-opt-op-info.mir +++ b/llvm/test/CodeGen/RISCV/rvv/vl-opt-op-info.mir @@ -243,6 +243,36 @@ body: | %y:vrm2 = PseudoVWADD_WV_M1 $noreg, $noreg, %x, 1, 3 /* e8 */, 0 ... --- +name: tied_vwop_wv_vs1 +body: | + bb.0: + ; CHECK-LABEL: name: tied_vwop_wv_vs1 + ; CHECK: %x:vr = PseudoVADD_VV_M1 $noreg, $noreg, $noreg, 1, 3 /* e8 */, 0 /* tu, mu */ + ; CHECK-NEXT: early-clobber %y:vrm2 = PseudoVWADD_WV_M1_TIED $noreg, %x, 1, 3 /* e8 */, 0 /* tu, mu */ + %x:vr = PseudoVADD_VV_M1 $noreg, $noreg, $noreg, -1, 3 /* e8 */, 0 + %y:vrm2 = PseudoVWADD_WV_M1_TIED $noreg, %x, 1, 3 /* e8 */, 0 +... 
+--- +name: tied_vwop_wv_vs1_incompatible_eew +body: | + bb.0: + ; CHECK-LABEL: name: tied_vwop_wv_vs1_incompatible_eew + ; CHECK: %x:vr = PseudoVADD_VV_M1 $noreg, $noreg, $noreg, -1, 4 /* e16 */, 0 /* tu, mu */ + ; CHECK-NEXT: early-clobber %y:vrm2 = PseudoVWADD_WV_M1_TIED $noreg, %x, 1, 3 /* e8 */, 0 /* tu, mu */ + %x:vr = PseudoVADD_VV_M1 $noreg, $noreg, $noreg, -1, 4 /* e16 */, 0 + %y:vrm2 = PseudoVWADD_WV_M1_TIED $noreg, %x, 1, 3 /* e8 */, 0 +... +--- +name: tied_vwop_wv_vs1_incompatible_emul +body: | + bb.0: + ; CHECK-LABEL: name: tied_vwop_wv_vs1_incompatible_emul + ; CHECK: %x:vr = PseudoVADD_VV_MF2 $noreg, $noreg, $noreg, -1, 3 /* e8 */, 0 /* tu, mu */ + ; CHECK-NEXT: early-clobber %y:vrm2 = PseudoVWADD_WV_M1_TIED $noreg, %x, 1, 3 /* e8 */, 0 /* tu, mu */ + %x:vr = PseudoVADD_VV_MF2 $noreg, $noreg, $noreg, -1, 3 /* e8 */, 0 + %y:vrm2 = PseudoVWADD_WV_M1_TIED $noreg, %x, 1, 3 /* e8 */, 0 +... +--- name: vop_vf2_vd body: | bb.0: diff --git a/llvm/test/CodeGen/RISCV/rvv/vl-opt.mir b/llvm/test/CodeGen/RISCV/rvv/vl-opt.mir index 56bfe0fd3eb93..027eb8ca3c17f 100644 --- a/llvm/test/CodeGen/RISCV/rvv/vl-opt.mir +++ b/llvm/test/CodeGen/RISCV/rvv/vl-opt.mir @@ -140,4 +140,12 @@ body: | %x:vr = nofpexcept PseudoVFNCVTBF16_F_F_W_M1_E16 $noreg, $noreg, 7, -1, 4 /* e16 */, 0 /* tu, mu */, implicit $frm %y:vr = PseudoVADD_VV_M1 $noreg, %x, $noreg, 1, 4 /* e16 */, 0 ... 
- +--- +name: vwadd_tied_vs1 +body: | + bb.0: + ; CHECK-LABEL: name: vwadd_tied_vs1 + ; CHECK: %x:vr = PseudoVADD_VV_M1 $noreg, $noreg, $noreg, 1, 3 /* e8 */, 0 /* tu, mu */ + ; CHECK-NEXT: early-clobber %y:vrm2 = PseudoVWADD_WV_M1_TIED $noreg, %x, 1, 3 /* e8 */, 0 /* tu, mu */ + %x:vr = PseudoVADD_VV_M1 $noreg, $noreg, $noreg, -1, 3 /* e8 */, 0 /* tu, mu */ + %y:vrm2 = PseudoVWADD_WV_M1_TIED $noreg, %x, 1, 3 /* e8 */, 0 /* tu, mu */ diff --git a/llvm/test/CodeGen/RISCV/rvv/vp-combine-reverse-load.ll b/llvm/test/CodeGen/RISCV/rvv/vp-combine-reverse-load.ll new file mode 100644 index 0000000000000..50e26bd141070 --- /dev/null +++ b/llvm/test/CodeGen/RISCV/rvv/vp-combine-reverse-load.ll @@ -0,0 +1,79 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; RUN: llc -mtriple=riscv64 -mattr=+f,+v -verify-machineinstrs < %s | FileCheck %s + +define @test_reverse_load_combiner(* %ptr, i32 zeroext %evl) { +; CHECK-LABEL: test_reverse_load_combiner: +; CHECK: # %bb.0: +; CHECK-NEXT: slli a2, a1, 2 +; CHECK-NEXT: add a0, a2, a0 +; CHECK-NEXT: addi a0, a0, -4 +; CHECK-NEXT: li a2, -4 +; CHECK-NEXT: vsetvli zero, a1, e32, m1, ta, ma +; CHECK-NEXT: vlse32.v v8, (a0), a2 +; CHECK-NEXT: ret + %load = call @llvm.vp.load.nxv2f32.p0nxv2f32(* %ptr, splat (i1 true), i32 %evl) + %rev = call @llvm.experimental.vp.reverse.nxv2f32( %load, splat (i1 true), i32 %evl) + ret %rev +} + +define @test_load_mask_is_vp_reverse(* %ptr, %mask, i32 zeroext %evl) { +; CHECK-LABEL: test_load_mask_is_vp_reverse: +; CHECK: # %bb.0: +; CHECK-NEXT: slli a2, a1, 2 +; CHECK-NEXT: add a0, a2, a0 +; CHECK-NEXT: addi a0, a0, -4 +; CHECK-NEXT: li a2, -4 +; CHECK-NEXT: vsetvli zero, a1, e32, m1, ta, ma +; CHECK-NEXT: vlse32.v v8, (a0), a2, v0.t +; CHECK-NEXT: ret + %loadmask = call @llvm.experimental.vp.reverse.nxv2i1( %mask, splat (i1 true), i32 %evl) + %load = call @llvm.vp.load.nxv2f32.p0nxv2f32(* %ptr, %loadmask, i32 %evl) + %rev = call @llvm.experimental.vp.reverse.nxv2f32( 
%load, splat (i1 true), i32 %evl) + ret %rev +} + +define @test_load_mask_not_all_one(* %ptr, %notallones, i32 zeroext %evl) { +; CHECK-LABEL: test_load_mask_not_all_one: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli zero, a1, e32, m1, ta, ma +; CHECK-NEXT: vle32.v v9, (a0), v0.t +; CHECK-NEXT: vid.v v8, v0.t +; CHECK-NEXT: addi a1, a1, -1 +; CHECK-NEXT: vrsub.vx v10, v8, a1, v0.t +; CHECK-NEXT: vrgather.vv v8, v9, v10, v0.t +; CHECK-NEXT: ret + %load = call @llvm.vp.load.nxv2f32.p0nxv2f32(* %ptr, %notallones, i32 %evl) + %rev = call @llvm.experimental.vp.reverse.nxv2f32( %load, %notallones, i32 %evl) + ret %rev +} + +define @test_different_evl(* %ptr, %mask, i32 zeroext %evl1, i32 zeroext %evl2) { +; CHECK-LABEL: test_different_evl: +; CHECK: # %bb.0: +; CHECK-NEXT: addi a3, a1, -1 +; CHECK-NEXT: vsetvli zero, a1, e16, mf2, ta, ma +; CHECK-NEXT: vid.v v8 +; CHECK-NEXT: vsetvli zero, zero, e8, mf4, ta, ma +; CHECK-NEXT: vmv.v.i v9, 0 +; CHECK-NEXT: vsetvli zero, zero, e16, mf2, ta, ma +; CHECK-NEXT: vrsub.vx v8, v8, a3 +; CHECK-NEXT: vsetvli zero, zero, e8, mf4, ta, ma +; CHECK-NEXT: vmerge.vim v9, v9, 1, v0 +; CHECK-NEXT: vrgatherei16.vv v10, v9, v8 +; CHECK-NEXT: vmsne.vi v0, v10, 0 +; CHECK-NEXT: vsetvli zero, a2, e32, m1, ta, ma +; CHECK-NEXT: vle32.v v9, (a0), v0.t +; CHECK-NEXT: addi a2, a2, -1 +; CHECK-NEXT: vid.v v8 +; CHECK-NEXT: vrsub.vx v10, v8, a2 +; CHECK-NEXT: vrgather.vv v8, v9, v10 +; CHECK-NEXT: ret + %loadmask = call @llvm.experimental.vp.reverse.nxv2i1( %mask, splat (i1 true), i32 %evl1) + %load = call @llvm.vp.load.nxv2f32.p0nxv2f32(* %ptr, %loadmask, i32 %evl2) + %rev = call @llvm.experimental.vp.reverse.nxv2f32( %load, splat (i1 true), i32 %evl2) + ret %rev +} + +declare @llvm.vp.load.nxv2f32.p0nxv2f32(* nocapture, , i32) +declare @llvm.experimental.vp.reverse.nxv2f32(, , i32) +declare @llvm.experimental.vp.reverse.nxv2i1(, , i32) diff --git a/llvm/test/CodeGen/RISCV/rvv/vp-combine-store-reverse.ll 
b/llvm/test/CodeGen/RISCV/rvv/vp-combine-store-reverse.ll new file mode 100644 index 0000000000000..4896a1367935a --- /dev/null +++ b/llvm/test/CodeGen/RISCV/rvv/vp-combine-store-reverse.ll @@ -0,0 +1,81 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; RUN: llc -mtriple=riscv64 -mattr=+f,+v -verify-machineinstrs < %s | FileCheck %s + +define void @test_store_reverse_combiner( %val, * %ptr, i32 zeroext %evl) { +; CHECK-LABEL: test_store_reverse_combiner: +; CHECK: # %bb.0: +; CHECK-NEXT: slli a2, a1, 2 +; CHECK-NEXT: add a0, a2, a0 +; CHECK-NEXT: addi a0, a0, -4 +; CHECK-NEXT: li a2, -4 +; CHECK-NEXT: vsetvli zero, a1, e32, m1, ta, ma +; CHECK-NEXT: vsse32.v v8, (a0), a2 +; CHECK-NEXT: ret + %rev = call @llvm.experimental.vp.reverse.nxv2f32( %val, splat (i1 true), i32 %evl) + call void @llvm.vp.store.nxv2f32.p0nxv2f32( %rev, * %ptr, splat (i1 true), i32 %evl) + ret void +} + +define void @test_store_mask_is_vp_reverse( %val, * %ptr, %mask, i32 zeroext %evl) { +; CHECK-LABEL: test_store_mask_is_vp_reverse: +; CHECK: # %bb.0: +; CHECK-NEXT: slli a2, a1, 2 +; CHECK-NEXT: add a0, a2, a0 +; CHECK-NEXT: addi a0, a0, -4 +; CHECK-NEXT: li a2, -4 +; CHECK-NEXT: vsetvli zero, a1, e32, m1, ta, ma +; CHECK-NEXT: vsse32.v v8, (a0), a2, v0.t +; CHECK-NEXT: ret + %storemask = call @llvm.experimental.vp.reverse.nxv2i1( %mask, splat (i1 true), i32 %evl) + %rev = call @llvm.experimental.vp.reverse.nxv2f32( %val, splat (i1 true), i32 %evl) + call void @llvm.vp.store.nxv2f32.p0nxv2f32( %rev, * %ptr, %storemask, i32 %evl) + ret void +} + +define void @test_store_mask_not_all_one( %val, * %ptr, %notallones, i32 zeroext %evl) { +; CHECK-LABEL: test_store_mask_not_all_one: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli zero, a1, e32, m1, ta, ma +; CHECK-NEXT: vid.v v9, v0.t +; CHECK-NEXT: addi a1, a1, -1 +; CHECK-NEXT: vrsub.vx v9, v9, a1, v0.t +; CHECK-NEXT: vrgather.vv v10, v8, v9, v0.t +; CHECK-NEXT: vse32.v v10, (a0), v0.t +; CHECK-NEXT: ret + %rev = call 
@llvm.experimental.vp.reverse.nxv2f32( %val, %notallones, i32 %evl) + call void @llvm.vp.store.nxv2f32.p0nxv2f32( %rev, * %ptr, %notallones, i32 %evl) + ret void +} + +define void @test_different_evl( %val, * %ptr, %mask, i32 zeroext %evl1, i32 zeroext %evl2) { +; CHECK-LABEL: test_different_evl: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli zero, a1, e16, mf2, ta, ma +; CHECK-NEXT: vid.v v9 +; CHECK-NEXT: addi a1, a1, -1 +; CHECK-NEXT: vsetvli zero, zero, e8, mf4, ta, ma +; CHECK-NEXT: vmv.v.i v10, 0 +; CHECK-NEXT: vmerge.vim v10, v10, 1, v0 +; CHECK-NEXT: vsetvli zero, zero, e32, m1, ta, ma +; CHECK-NEXT: vid.v v11 +; CHECK-NEXT: vsetvli zero, zero, e16, mf2, ta, ma +; CHECK-NEXT: vrsub.vx v9, v9, a1 +; CHECK-NEXT: vsetvli zero, zero, e32, m1, ta, ma +; CHECK-NEXT: vrsub.vx v11, v11, a1 +; CHECK-NEXT: vsetvli zero, zero, e8, mf4, ta, ma +; CHECK-NEXT: vrgatherei16.vv v12, v10, v9 +; CHECK-NEXT: vmsne.vi v0, v12, 0 +; CHECK-NEXT: vsetvli zero, zero, e32, m1, ta, ma +; CHECK-NEXT: vrgather.vv v9, v8, v11 +; CHECK-NEXT: vsetvli zero, a2, e32, m1, ta, ma +; CHECK-NEXT: vse32.v v9, (a0), v0.t +; CHECK-NEXT: ret + %storemask = call @llvm.experimental.vp.reverse.nxv2i1( %mask, splat (i1 true), i32 %evl1) + %rev = call @llvm.experimental.vp.reverse.nxv2f32( %val, splat (i1 true), i32 %evl1) + call void @llvm.vp.store.nxv2f32.p0nxv2f32( %rev, * %ptr, %storemask, i32 %evl2) + ret void +} + +declare @llvm.experimental.vp.reverse.nxv2f32(, , i32) +declare @llvm.experimental.vp.reverse.nxv2i1(, , i32) +declare void @llvm.vp.store.nxv2f32.p0nxv2f32(, * nocapture, , i32) diff --git a/llvm/test/CodeGen/RISCV/stack-clash-prologue.ll b/llvm/test/CodeGen/RISCV/stack-clash-prologue.ll index 843e57a42d926..b1c0755c36ec1 100644 --- a/llvm/test/CodeGen/RISCV/stack-clash-prologue.ll +++ b/llvm/test/CodeGen/RISCV/stack-clash-prologue.ll @@ -606,4 +606,129 @@ define i32 @f10(i64 %i) local_unnamed_addr #0 { ret i32 %c } +define void @f11(i32 %vla_size, i64 %i) #0 { +; RV64I-LABEL: f11: +; 
RV64I: # %bb.0: +; RV64I-NEXT: addi sp, sp, -2032 +; RV64I-NEXT: .cfi_def_cfa_offset 2032 +; RV64I-NEXT: sd zero, 0(sp) +; RV64I-NEXT: sd ra, 2024(sp) # 8-byte Folded Spill +; RV64I-NEXT: sd s0, 2016(sp) # 8-byte Folded Spill +; RV64I-NEXT: sd s1, 2008(sp) # 8-byte Folded Spill +; RV64I-NEXT: .cfi_offset ra, -8 +; RV64I-NEXT: .cfi_offset s0, -16 +; RV64I-NEXT: .cfi_offset s1, -24 +; RV64I-NEXT: addi s0, sp, 2032 +; RV64I-NEXT: .cfi_def_cfa s0, 0 +; RV64I-NEXT: lui a2, 15 +; RV64I-NEXT: sub t1, sp, a2 +; RV64I-NEXT: lui t2, 1 +; RV64I-NEXT: .LBB11_1: # =>This Inner Loop Header: Depth=1 +; RV64I-NEXT: sub sp, sp, t2 +; RV64I-NEXT: sd zero, 0(sp) +; RV64I-NEXT: bne sp, t1, .LBB11_1 +; RV64I-NEXT: # %bb.2: +; RV64I-NEXT: addi sp, sp, -2048 +; RV64I-NEXT: addi sp, sp, -16 +; RV64I-NEXT: sd zero, 0(sp) +; RV64I-NEXT: srli a2, sp, 15 +; RV64I-NEXT: slli sp, a2, 15 +; RV64I-NEXT: mv s1, sp +; RV64I-NEXT: slli a1, a1, 2 +; RV64I-NEXT: lui a2, 8 +; RV64I-NEXT: add a2, s1, a2 +; RV64I-NEXT: add a1, a2, a1 +; RV64I-NEXT: li a2, 1 +; RV64I-NEXT: slli a0, a0, 32 +; RV64I-NEXT: srli a0, a0, 32 +; RV64I-NEXT: sw a2, 0(a1) +; RV64I-NEXT: addi a0, a0, 15 +; RV64I-NEXT: andi a0, a0, -16 +; RV64I-NEXT: sub a0, sp, a0 +; RV64I-NEXT: andi a0, a0, -2048 +; RV64I-NEXT: lui a1, 1 +; RV64I-NEXT: .LBB11_3: # =>This Inner Loop Header: Depth=1 +; RV64I-NEXT: sub sp, sp, a1 +; RV64I-NEXT: sd zero, 0(sp) +; RV64I-NEXT: blt a0, sp, .LBB11_3 +; RV64I-NEXT: # %bb.4: +; RV64I-NEXT: mv sp, a0 +; RV64I-NEXT: lbu zero, 0(a0) +; RV64I-NEXT: addi sp, s0, -2032 +; RV64I-NEXT: .cfi_def_cfa sp, 2032 +; RV64I-NEXT: ld ra, 2024(sp) # 8-byte Folded Reload +; RV64I-NEXT: ld s0, 2016(sp) # 8-byte Folded Reload +; RV64I-NEXT: ld s1, 2008(sp) # 8-byte Folded Reload +; RV64I-NEXT: .cfi_restore ra +; RV64I-NEXT: .cfi_restore s0 +; RV64I-NEXT: .cfi_restore s1 +; RV64I-NEXT: addi sp, sp, 2032 +; RV64I-NEXT: .cfi_def_cfa_offset 0 +; RV64I-NEXT: ret +; +; RV32I-LABEL: f11: +; RV32I: # %bb.0: +; RV32I-NEXT: addi sp, sp, 
-2032 +; RV32I-NEXT: .cfi_def_cfa_offset 2032 +; RV32I-NEXT: sw zero, 0(sp) +; RV32I-NEXT: sw ra, 2028(sp) # 4-byte Folded Spill +; RV32I-NEXT: sw s0, 2024(sp) # 4-byte Folded Spill +; RV32I-NEXT: sw s1, 2020(sp) # 4-byte Folded Spill +; RV32I-NEXT: .cfi_offset ra, -4 +; RV32I-NEXT: .cfi_offset s0, -8 +; RV32I-NEXT: .cfi_offset s1, -12 +; RV32I-NEXT: addi s0, sp, 2032 +; RV32I-NEXT: .cfi_def_cfa s0, 0 +; RV32I-NEXT: lui a2, 15 +; RV32I-NEXT: sub t1, sp, a2 +; RV32I-NEXT: lui t2, 1 +; RV32I-NEXT: .LBB11_1: # =>This Inner Loop Header: Depth=1 +; RV32I-NEXT: sub sp, sp, t2 +; RV32I-NEXT: sw zero, 0(sp) +; RV32I-NEXT: bne sp, t1, .LBB11_1 +; RV32I-NEXT: # %bb.2: +; RV32I-NEXT: addi sp, sp, -2048 +; RV32I-NEXT: addi sp, sp, -16 +; RV32I-NEXT: sw zero, 0(sp) +; RV32I-NEXT: srli a2, sp, 15 +; RV32I-NEXT: slli sp, a2, 15 +; RV32I-NEXT: mv s1, sp +; RV32I-NEXT: slli a1, a1, 2 +; RV32I-NEXT: lui a2, 8 +; RV32I-NEXT: add a2, s1, a2 +; RV32I-NEXT: add a1, a2, a1 +; RV32I-NEXT: li a2, 1 +; RV32I-NEXT: addi a0, a0, 15 +; RV32I-NEXT: andi a0, a0, -16 +; RV32I-NEXT: sw a2, 0(a1) +; RV32I-NEXT: sub a0, sp, a0 +; RV32I-NEXT: andi a0, a0, -2048 +; RV32I-NEXT: lui a1, 1 +; RV32I-NEXT: .LBB11_3: # =>This Inner Loop Header: Depth=1 +; RV32I-NEXT: sub sp, sp, a1 +; RV32I-NEXT: sw zero, 0(sp) +; RV32I-NEXT: blt a0, sp, .LBB11_3 +; RV32I-NEXT: # %bb.4: +; RV32I-NEXT: mv sp, a0 +; RV32I-NEXT: lbu zero, 0(a0) +; RV32I-NEXT: addi sp, s0, -2032 +; RV32I-NEXT: .cfi_def_cfa sp, 2032 +; RV32I-NEXT: lw ra, 2028(sp) # 4-byte Folded Reload +; RV32I-NEXT: lw s0, 2024(sp) # 4-byte Folded Reload +; RV32I-NEXT: lw s1, 2020(sp) # 4-byte Folded Reload +; RV32I-NEXT: .cfi_restore ra +; RV32I-NEXT: .cfi_restore s0 +; RV32I-NEXT: .cfi_restore s1 +; RV32I-NEXT: addi sp, sp, 2032 +; RV32I-NEXT: .cfi_def_cfa_offset 0 +; RV32I-NEXT: ret + %a = alloca i32, i32 4096, align 32768 + %b = getelementptr inbounds i32, ptr %a, i64 %i + store volatile i32 1, ptr %b + %1 = zext i32 %vla_size to i64 + %vla = alloca i8, i64 
%1, align 2048 + %2 = load volatile i8, ptr %vla, align 2048 + ret void +} + attributes #0 = { "probe-stack"="inline-asm" } diff --git a/llvm/test/CodeGen/SPIRV/GlobalISel/InstCombine/prelegalizercombiner-length-to-distance.mir b/llvm/test/CodeGen/SPIRV/GlobalISel/InstCombine/prelegalizercombiner-length-to-distance.mir new file mode 100644 index 0000000000000..219b98ecca6f0 --- /dev/null +++ b/llvm/test/CodeGen/SPIRV/GlobalISel/InstCombine/prelegalizercombiner-length-to-distance.mir @@ -0,0 +1,26 @@ +# RUN: llc -verify-machineinstrs -O0 -mtriple spirv-unknown-unknown -run-pass=spirv-prelegalizer-combiner %s -o - | FileCheck %s +# REQUIRES: asserts +--- +name: distance_instcombine_float4 +tracksRegLiveness: true +legalized: true +body: | + bb.1.entry: + ; CHECK-LABEL: name: distance_instcombine_float4 + ; CHECK-NOT: %6:_(<4 x s32>) = G_FSUB %2, %3 + ; CHECK-NOT: %7:id(s32) = G_INTRINSIC intrinsic(@llvm.spv.length), %6(<4 x s32>) + ; CHECK: %7:id(s32) = G_INTRINSIC intrinsic(@llvm.spv.distance), %2(<4 x s32>), %3(<4 x s32>) + %0:type(s64) = OpTypeFloat 32 + %1:type(s64) = OpTypeVector %0(s64), 4 + OpName %2(<4 x s32>), 97 + OpName %3(<4 x s32>), 98 + %4:type(s64) = OpTypeFunction %0(s64), %1(s64), %1(s64) + %5:iid(s64) = OpFunction %0(s64), 0, %4(s64) + %2:vfid(<4 x s32>) = OpFunctionParameter %1(s64) + %3:vfid(<4 x s32>) = OpFunctionParameter %1(s64) + OpName %5(s64), 1953720676, 1701015137, 1936615775, 1836016500, 1701734754, 1869375071, 3437665 + OpDecorate %5(s64), 41, 1953720676, 1701015137, 1936615775, 1836016500, 1701734754, 1869375071, 3437665, 0 + %6:_(<4 x s32>) = G_FSUB %2, %3 + %7:id(s32) = G_INTRINSIC intrinsic(@llvm.spv.length), %6(<4 x s32>) + OpReturnValue %7(s32) + \ No newline at end of file diff --git a/llvm/test/CodeGen/SPIRV/hlsl-intrinsics/WaveActiveSum.ll b/llvm/test/CodeGen/SPIRV/hlsl-intrinsics/WaveActiveSum.ll new file mode 100644 index 0000000000000..739b7bb1d5bd4 --- /dev/null +++ b/llvm/test/CodeGen/SPIRV/hlsl-intrinsics/WaveActiveSum.ll 
@@ -0,0 +1,41 @@ +; RUN: llc -verify-machineinstrs -O0 -mtriple=spirv-vulkan-unknown %s -o - | FileCheck %s +; RUN: %if spirv-tools %{ llc -O0 -mtriple=spirv-vulkan-unknown %s -o - -filetype=obj | spirv-val %} + +; Test lowering to spir-v backend for various types and scalar/vector + +; CHECK-DAG: %[[#f16:]] = OpTypeFloat 16 +; CHECK-DAG: %[[#f32:]] = OpTypeFloat 32 +; CHECK-DAG: %[[#uint:]] = OpTypeInt 32 0 +; CHECK-DAG: %[[#v4_half:]] = OpTypeVector %[[#f16]] 4 +; CHECK-DAG: %[[#scope:]] = OpConstant %[[#uint]] 3 + +; CHECK-LABEL: Begin function test_float +; CHECK: %[[#fexpr:]] = OpFunctionParameter %[[#f32]] +define float @test_float(float %fexpr) { +entry: +; CHECK: %[[#fret:]] = OpGroupNonUniformFAdd %[[#f32]] %[[#scope]] Reduce %[[#fexpr]] + %0 = call float @llvm.spv.wave.reduce.sum.f32(float %fexpr) + ret float %0 +} + +; CHECK-LABEL: Begin function test_int +; CHECK: %[[#iexpr:]] = OpFunctionParameter %[[#uint]] +define i32 @test_int(i32 %iexpr) { +entry: +; CHECK: %[[#iret:]] = OpGroupNonUniformIAdd %[[#uint]] %[[#scope]] Reduce %[[#iexpr]] + %0 = call i32 @llvm.spv.wave.reduce.sum.i32(i32 %iexpr) + ret i32 %0 +} + +; CHECK-LABEL: Begin function test_vhalf +; CHECK: %[[#vbexpr:]] = OpFunctionParameter %[[#v4_half]] +define <4 x half> @test_vhalf(<4 x half> %vbexpr) { +entry: +; CHECK: %[[#vhalfret:]] = OpGroupNonUniformFAdd %[[#v4_half]] %[[#scope]] Reduce %[[#vbexpr]] + %0 = call <4 x half> @llvm.spv.wave.reduce.sum.v4half(<4 x half> %vbexpr) + ret <4 x half> %0 +} + +declare float @llvm.spv.wave.reduce.sum.f32(float) +declare i32 @llvm.spv.wave.reduce.sum.i32(i32) +declare <4 x half> @llvm.spv.wave.reduce.sum.v4half(<4 x half>) diff --git a/llvm/test/CodeGen/SPIRV/hlsl-intrinsics/distance.ll b/llvm/test/CodeGen/SPIRV/hlsl-intrinsics/distance.ll index 85a24a0127ae0..cb92f775eef31 100644 --- a/llvm/test/CodeGen/SPIRV/hlsl-intrinsics/distance.ll +++ b/llvm/test/CodeGen/SPIRV/hlsl-intrinsics/distance.ll @@ -1,33 +1,44 @@ -; RUN: llc -O0 
-mtriple=spirv-unknown-unknown %s -o - | FileCheck %s -; RUN: %if spirv-tools %{ llc -O0 -mtriple=spirv-unknown-unknown %s -o - -filetype=obj | spirv-val %} - -; Make sure SPIRV operation function calls for distance are lowered correctly. - -; CHECK-DAG: %[[#op_ext_glsl:]] = OpExtInstImport "GLSL.std.450" -; CHECK-DAG: %[[#float_16:]] = OpTypeFloat 16 -; CHECK-DAG: %[[#vec4_float_16:]] = OpTypeVector %[[#float_16]] 4 -; CHECK-DAG: %[[#float_32:]] = OpTypeFloat 32 -; CHECK-DAG: %[[#vec4_float_32:]] = OpTypeVector %[[#float_32]] 4 - -define noundef half @distance_half4(<4 x half> noundef %a, <4 x half> noundef %b) { -entry: - ; CHECK: %[[#]] = OpFunction %[[#float_16]] None %[[#]] - ; CHECK: %[[#arg0:]] = OpFunctionParameter %[[#vec4_float_16]] - ; CHECK: %[[#arg1:]] = OpFunctionParameter %[[#vec4_float_16]] - ; CHECK: %[[#]] = OpExtInst %[[#float_16]] %[[#op_ext_glsl]] Distance %[[#arg0]] %[[#arg1]] - %spv.distance = call half @llvm.spv.distance.f16(<4 x half> %a, <4 x half> %b) - ret half %spv.distance -} - -define noundef float @distance_float4(<4 x float> noundef %a, <4 x float> noundef %b) { -entry: - ; CHECK: %[[#]] = OpFunction %[[#float_32]] None %[[#]] - ; CHECK: %[[#arg0:]] = OpFunctionParameter %[[#vec4_float_32]] - ; CHECK: %[[#arg1:]] = OpFunctionParameter %[[#vec4_float_32]] - ; CHECK: %[[#]] = OpExtInst %[[#float_32]] %[[#op_ext_glsl]] Distance %[[#arg0]] %[[#arg1]] - %spv.distance = call float @llvm.spv.distance.f32(<4 x float> %a, <4 x float> %b) - ret float %spv.distance -} - -declare half @llvm.spv.distance.f16(<4 x half>, <4 x half>) -declare float @llvm.spv.distance.f32(<4 x float>, <4 x float>) +; RUN: llc -verify-machineinstrs -O0 -mtriple=spirv-unknown-unknown %s -o - | FileCheck %s +; RUN: %if spirv-tools %{ llc -O0 -mtriple=spirv-unknown-unknown %s -o - -filetype=obj | spirv-val %} + +; Make sure SPIRV operation function calls for distance are lowered correctly. 
+ +; CHECK-DAG: %[[#op_ext_glsl:]] = OpExtInstImport "GLSL.std.450" +; CHECK-DAG: %[[#float_16:]] = OpTypeFloat 16 +; CHECK-DAG: %[[#vec4_float_16:]] = OpTypeVector %[[#float_16]] 4 +; CHECK-DAG: %[[#float_32:]] = OpTypeFloat 32 +; CHECK-DAG: %[[#vec4_float_32:]] = OpTypeVector %[[#float_32]] 4 + +define noundef half @distance_half4(<4 x half> noundef %a, <4 x half> noundef %b) { +entry: + ; CHECK: %[[#]] = OpFunction %[[#float_16]] None %[[#]] + ; CHECK: %[[#arg0:]] = OpFunctionParameter %[[#vec4_float_16]] + ; CHECK: %[[#arg1:]] = OpFunctionParameter %[[#vec4_float_16]] + ; CHECK: %[[#]] = OpExtInst %[[#float_16]] %[[#op_ext_glsl]] Distance %[[#arg0]] %[[#arg1]] + %spv.distance = call half @llvm.spv.distance.f16(<4 x half> %a, <4 x half> %b) + ret half %spv.distance +} + +define noundef float @distance_float4(<4 x float> noundef %a, <4 x float> noundef %b) { +entry: + ; CHECK: %[[#]] = OpFunction %[[#float_32]] None %[[#]] + ; CHECK: %[[#arg0:]] = OpFunctionParameter %[[#vec4_float_32]] + ; CHECK: %[[#arg1:]] = OpFunctionParameter %[[#vec4_float_32]] + ; CHECK: %[[#]] = OpExtInst %[[#float_32]] %[[#op_ext_glsl]] Distance %[[#arg0]] %[[#arg1]] + %spv.distance = call float @llvm.spv.distance.f32(<4 x float> %a, <4 x float> %b) + ret float %spv.distance +} + +define noundef float @distance_instcombine_float4(<4 x float> noundef %a, <4 x float> noundef %b) { +entry: + ; CHECK: %[[#]] = OpFunction %[[#float_32]] None %[[#]] + ; CHECK: %[[#arg0:]] = OpFunctionParameter %[[#vec4_float_32]] + ; CHECK: %[[#arg1:]] = OpFunctionParameter %[[#vec4_float_32]] + ; CHECK: %[[#]] = OpExtInst %[[#float_32]] %[[#op_ext_glsl]] Distance %[[#arg0]] %[[#arg1]] + %delta = fsub <4 x float> %a, %b + %spv.length = call float @llvm.spv.length.f32(<4 x float> %delta) + ret float %spv.length +} + +declare half @llvm.spv.distance.f16(<4 x half>, <4 x half>) +declare float @llvm.spv.distance.f32(<4 x float>, <4 x float>) diff --git a/llvm/test/CodeGen/SPIRV/hlsl-intrinsics/firstbithigh.ll 
b/llvm/test/CodeGen/SPIRV/hlsl-intrinsics/firstbithigh.ll index 3d35e102310f5..a4dd09d84d996 100644 --- a/llvm/test/CodeGen/SPIRV/hlsl-intrinsics/firstbithigh.ll +++ b/llvm/test/CodeGen/SPIRV/hlsl-intrinsics/firstbithigh.ll @@ -1,94 +1,261 @@ ; RUN: llc -verify-machineinstrs -O0 -mtriple=spirv-unknown-unknown %s -o - | FileCheck %s ; RUN: %if spirv-tools %{ llc -O0 -mtriple=spirv-unknown-unknown %s -o - -filetype=obj | spirv-val %} -; CHECK: OpMemoryModel Logical GLSL450 -; CHECK-DAG: [[Z:%.*]] = OpConstant %[[#]] 0 -; CHECK-DAG: [[X:%.*]] = OpConstant %[[#]] 1 +; CHECK-DAG: [[glsl_450_ext:%.+]] = OpExtInstImport "GLSL.std.450" +; CHECK-DAG: OpMemoryModel Logical GLSL450 +; CHECK-DAG: [[u32_t:%.+]] = OpTypeInt 32 0 +; CHECK-DAG: [[u32x2_t:%.+]] = OpTypeVector [[u32_t]] 2 +; CHECK-DAG: [[u32x3_t:%.+]] = OpTypeVector [[u32_t]] 3 +; CHECK-DAG: [[u32x4_t:%.+]] = OpTypeVector [[u32_t]] 4 +; CHECK-DAG: [[const_0:%.*]] = OpConstant [[u32_t]] 0 +; CHECK-DAG: [[const_2:%.*]] = OpConstant [[u32_t]] 2 +; CHECK-DAG: [[const_0x2:%.*]] = OpConstantComposite [[u32x2_t]] [[const_0]] [[const_0]] +; CHECK-DAG: [[const_1:%.*]] = OpConstant [[u32_t]] 1 +; CHECK-DAG: [[const_32:%.*]] = OpConstant [[u32_t]] 32 +; CHECK-DAG: [[const_32x2:%.*]] = OpConstantComposite [[u32x2_t]] [[const_32]] [[const_32]] +; CHECK-DAG: [[const_neg1:%.*]] = OpConstant [[u32_t]] 4294967295 +; CHECK-DAG: [[const_neg1x2:%.*]] = OpConstantComposite [[u32x2_t]] [[const_neg1]] [[const_neg1]] +; CHECK-DAG: [[u16_t:%.+]] = OpTypeInt 16 0 +; CHECK-DAG: [[u16x2_t:%.+]] = OpTypeVector [[u16_t]] 2 +; CHECK-DAG: [[u16x3_t:%.+]] = OpTypeVector [[u16_t]] 3 +; CHECK-DAG: [[u16x4_t:%.+]] = OpTypeVector [[u16_t]] 4 +; CHECK-DAG: [[u64_t:%.+]] = OpTypeInt 64 0 +; CHECK-DAG: [[u64x2_t:%.+]] = OpTypeVector [[u64_t]] 2 +; CHECK-DAG: [[u64x3_t:%.+]] = OpTypeVector [[u64_t]] 3 +; CHECK-DAG: [[u64x4_t:%.+]] = OpTypeVector [[u64_t]] 4 +; CHECK-DAG: [[bool_t:%.+]] = OpTypeBool +; CHECK-DAG: [[boolx2_t:%.+]] = OpTypeVector [[bool_t]] 2 
+; CHECK-LABEL: Begin function firstbituhigh_i32 define noundef i32 @firstbituhigh_i32(i32 noundef %a) { entry: -; CHECK: %[[#]] = OpExtInst %[[#]] %[[#]] FindUMsb %[[#]] +; CHECK: [[a:%.+]] = OpFunctionParameter [[u32_t]] +; CHECK: [[ret:%.+]] = OpExtInst [[u32_t]] [[glsl_450_ext]] FindUMsb [[a]] +; CHECK: OpReturnValue [[ret]] %elt.firstbituhigh = call i32 @llvm.spv.firstbituhigh.i32(i32 %a) ret i32 %elt.firstbituhigh } -define noundef <2 x i32> @firstbituhigh_2xi32(<2 x i32> noundef %a) { +; CHECK-LABEL: Begin function firstbituhigh_v2xi32 +define noundef <2 x i32> @firstbituhigh_v2xi32(<2 x i32> noundef %a) { entry: -; CHECK: %[[#]] = OpExtInst %[[#]] %[[#]] FindUMsb %[[#]] +; CHECK: [[a:%.+]] = OpFunctionParameter [[u32x2_t]] +; CHECK: [[ret:%.+]] = OpExtInst [[u32x2_t]] [[glsl_450_ext]] FindUMsb [[a]] +; CHECK: OpReturnValue [[ret]] %elt.firstbituhigh = call <2 x i32> @llvm.spv.firstbituhigh.v2i32(<2 x i32> %a) ret <2 x i32> %elt.firstbituhigh } +; CHECK-LABEL: Begin function firstbituhigh_v3xi32 +define noundef <3 x i32> @firstbituhigh_v3xi32(<3 x i32> noundef %a) { +entry: +; CHECK: [[a:%.+]] = OpFunctionParameter [[u32x3_t]] +; CHECK: [[ret:%.+]] = OpExtInst [[u32x3_t]] [[glsl_450_ext]] FindUMsb [[a]] +; CHECK: OpReturnValue [[ret]] + %elt.firstbituhigh = call <3 x i32> @llvm.spv.firstbituhigh.v3i32(<3 x i32> %a) + ret <3 x i32> %elt.firstbituhigh +} + +; CHECK-LABEL: Begin function firstbituhigh_v4xi32 +define noundef <4 x i32> @firstbituhigh_v4xi32(<4 x i32> noundef %a) { +entry: +; CHECK: [[a:%.+]] = OpFunctionParameter [[u32x4_t]] +; CHECK: [[ret:%.+]] = OpExtInst [[u32x4_t]] [[glsl_450_ext]] FindUMsb [[a]] +; CHECK: OpReturnValue [[ret]] + %elt.firstbituhigh = call <4 x i32> @llvm.spv.firstbituhigh.v4i32(<4 x i32> %a) + ret <4 x i32> %elt.firstbituhigh +} + +; CHECK-LABEL: Begin function firstbituhigh_i16 define noundef i32 @firstbituhigh_i16(i16 noundef %a) { entry: -; CHECK: [[A:%.*]] = OpUConvert %[[#]] -; CHECK: %[[#]] = OpExtInst %[[#]] %[[#]] 
FindUMsb [[A]] +; CHECK: [[a16:%.+]] = OpFunctionParameter [[u16_t]] +; CHECK: [[a32:%.+]] = OpUConvert [[u32_t]] [[a16]] +; CHECK: [[ret:%.+]] = OpExtInst [[u32_t]] [[glsl_450_ext]] FindUMsb [[a32]] +; CHECK: OpReturnValue [[ret]] %elt.firstbituhigh = call i32 @llvm.spv.firstbituhigh.i16(i16 %a) ret i32 %elt.firstbituhigh } -define noundef <2 x i32> @firstbituhigh_v2i16(<2 x i16> noundef %a) { +; CHECK-LABEL: Begin function firstbituhigh_v2xi16 +define noundef <2 x i32> @firstbituhigh_v2xi16(<2 x i16> noundef %a) { entry: -; CHECK: [[A:%.*]] = OpUConvert %[[#]] -; CHECK: %[[#]] = OpExtInst %[[#]] %[[#]] FindUMsb [[A]] +; CHECK: [[a16:%.+]] = OpFunctionParameter [[u16x2_t]] +; CHECK: [[a32:%.+]] = OpUConvert [[u32x2_t]] [[a16]] +; CHECK: [[ret:%.+]] = OpExtInst [[u32x2_t]] [[glsl_450_ext]] FindUMsb [[a32]] +; CHECK: OpReturnValue [[ret]] %elt.firstbituhigh = call <2 x i32> @llvm.spv.firstbituhigh.v2i16(<2 x i16> %a) ret <2 x i32> %elt.firstbituhigh } +; CHECK-LABEL: Begin function firstbituhigh_v3xi16 +define noundef <3 x i32> @firstbituhigh_v3xi16(<3 x i16> noundef %a) { +entry: +; CHECK: [[a16:%.+]] = OpFunctionParameter [[u16x3_t]] +; CHECK: [[a32:%.+]] = OpUConvert [[u32x3_t]] [[a16]] +; CHECK: [[ret:%.+]] = OpExtInst [[u32x3_t]] [[glsl_450_ext]] FindUMsb [[a32]] +; CHECK: OpReturnValue [[ret]] + %elt.firstbituhigh = call <3 x i32> @llvm.spv.firstbituhigh.v3i16(<3 x i16> %a) + ret <3 x i32> %elt.firstbituhigh +} + +; CHECK-LABEL: Begin function firstbituhigh_v4xi16 +define noundef <4 x i32> @firstbituhigh_v4xi16(<4 x i16> noundef %a) { +entry: +; CHECK: [[a16:%.+]] = OpFunctionParameter [[u16x4_t]] +; CHECK: [[a32:%.+]] = OpUConvert [[u32x4_t]] [[a16]] +; CHECK: [[ret:%.+]] = OpExtInst [[u32x4_t]] [[glsl_450_ext]] FindUMsb [[a32]] +; CHECK: OpReturnValue [[ret]] + %elt.firstbituhigh = call <4 x i32> @llvm.spv.firstbituhigh.v4i16(<4 x i16> %a) + ret <4 x i32> %elt.firstbituhigh +} + +; CHECK-LABEL: Begin function firstbituhigh_i64 define noundef i32 
@firstbituhigh_i64(i64 noundef %a) { entry: -; CHECK: [[O:%.*]] = OpBitcast %[[#]] %[[#]] -; CHECK: [[N:%.*]] = OpExtInst %[[#]] %[[#]] FindUMsb [[O]] -; CHECK: [[M:%.*]] = OpVectorExtractDynamic %[[#]] [[N]] [[Z]] -; CHECK: [[L:%.*]] = OpVectorExtractDynamic %[[#]] [[N]] [[X]] -; CHECK: [[I:%.*]] = OpIEqual %[[#]] [[M]] %[[#]] -; CHECK: [[H:%.*]] = OpSelect %[[#]] [[I]] [[L]] [[M]] -; CHECK: [[C:%.*]] = OpSelect %[[#]] [[I]] %[[#]] %[[#]] -; CHECK: [[B:%.*]] = OpIAdd %[[#]] [[C]] [[H]] +; CHECK: [[a64:%.+]] = OpFunctionParameter [[u64_t]] +; CHECK: [[a32x2:%.+]] = OpBitcast [[u32x2_t]] [[a64]] +; CHECK: [[lsb_bits:%.+]] = OpExtInst [[u32x2_t]] [[glsl_450_ext]] FindUMsb [[a32x2]] +; CHECK: [[high_bits:%.+]] = OpVectorExtractDynamic [[u32_t]] [[lsb_bits]] [[const_0]] +; CHECK: [[low_bits:%.+]] = OpVectorExtractDynamic [[u32_t]] [[lsb_bits]] [[const_1]] +; CHECK: [[should_use_low:%.+]] = OpIEqual [[bool_t]] [[high_bits]] [[const_neg1]] +; CHECK: [[ans_bits:%.+]] = OpSelect [[u32_t]] [[should_use_low]] [[low_bits]] [[high_bits]] +; CHECK: [[ans_offset:%.+]] = OpSelect [[u32_t]] [[should_use_low]] [[const_0]] [[const_32]] +; CHECK: [[ret:%.+]] = OpIAdd [[u32_t]] [[ans_offset]] [[ans_bits]] +; CHECK: OpReturnValue [[ret]] %elt.firstbituhigh = call i32 @llvm.spv.firstbituhigh.i64(i64 %a) ret i32 %elt.firstbituhigh } -define noundef <2 x i32> @firstbituhigh_v2i64(<2 x i64> noundef %a) { +; CHECK-LABEL: Begin function firstbituhigh_v2xi64 +define noundef <2 x i32> @firstbituhigh_v2xi64(<2 x i64> noundef %a) { entry: -; CHECK: [[O:%.*]] = OpBitcast %[[#]] %[[#]] -; CHECK: [[N:%.*]] = OpExtInst %[[#]] %[[#]] FindUMsb [[O]] -; CHECK: [[M:%.*]] = OpVectorShuffle %[[#]] [[N]] [[N]] 0 -; CHECK: [[L:%.*]] = OpVectorShuffle %[[#]] [[N]] [[N]] 1 -; CHECK: [[I:%.*]] = OpIEqual %[[#]] [[M]] %[[#]] -; CHECK: [[H:%.*]] = OpSelect %[[#]] [[I]] [[L]] [[M]] -; CHECK: [[C:%.*]] = OpSelect %[[#]] [[I]] %[[#]] %[[#]] -; CHECK: [[B:%.*]] = OpIAdd %[[#]] [[C]] [[H]] -; CHECK: OpReturnValue 
[[B]] +; CHECK: [[a64x2:%.+]] = OpFunctionParameter [[u64x2_t]] +; CHECK: [[a32x4:%.+]] = OpBitcast [[u32x4_t]] [[a64x2]] +; CHECK: [[lsb_bits:%.+]] = OpExtInst [[u32x4_t]] [[glsl_450_ext]] FindUMsb [[a32x4]] +; CHECK: [[high_bits:%.+]] = OpVectorShuffle [[u32x2_t]] [[lsb_bits]] [[lsb_bits]] 0 2 +; CHECK: [[low_bits:%.+]] = OpVectorShuffle [[u32x2_t]] [[lsb_bits]] [[lsb_bits]] 1 3 +; CHECK: [[should_use_low:%.+]] = OpIEqual [[boolx2_t]] [[high_bits]] [[const_neg1x2]] +; CHECK: [[ans_bits:%.+]] = OpSelect [[u32x2_t]] [[should_use_low]] [[low_bits]] [[high_bits]] +; CHECK: [[ans_offset:%.+]] = OpSelect [[u32x2_t]] [[should_use_low]] [[const_0x2]] [[const_32x2]] +; CHECK: [[ret:%.+]] = OpIAdd [[u32x2_t]] [[ans_offset]] [[ans_bits]] +; CHECK: OpReturnValue [[ret]] %elt.firstbituhigh = call <2 x i32> @llvm.spv.firstbituhigh.v2i64(<2 x i64> %a) ret <2 x i32> %elt.firstbituhigh } +; CHECK-LABEL: Begin function firstbituhigh_v3xi64 +define noundef <3 x i32> @firstbituhigh_v3xi64(<3 x i64> noundef %a) { +entry: +; Preamble +; CHECK: [[a:%.+]] = OpFunctionParameter [[u64x3_t]] + +; Extract first 2 components from %a +; CHECK: [[pt1:%.+]] = OpVectorShuffle [[u64x2_t]] [[a]] [[a]] 0 1 + +; Do firstbituhigh on the first 2 components +; CHECK: [[pt1_cast:%.+]] = OpBitcast [[u32x4_t]] [[pt1]] +; CHECK: [[pt1_lsb_bits:%.+]] = OpExtInst [[u32x4_t]] [[glsl_450_ext]] FindUMsb [[pt1_cast]] +; CHECK: [[pt1_high_bits:%.+]] = OpVectorShuffle [[u32x2_t]] [[pt1_lsb_bits]] [[pt1_lsb_bits]] 0 2 +; CHECK: [[pt1_low_bits:%.+]] = OpVectorShuffle [[u32x2_t]] [[pt1_lsb_bits]] [[pt1_lsb_bits]] 1 3 +; CHECK: [[pt1_should_use_low:%.+]] = OpIEqual [[boolx2_t]] [[pt1_high_bits]] [[const_neg1x2]] +; CHECK: [[pt1_ans_bits:%.+]] = OpSelect [[u32x2_t]] [[pt1_should_use_low]] [[pt1_low_bits]] [[pt1_high_bits]] +; CHECK: [[pt1_ans_offset:%.+]] = OpSelect [[u32x2_t]] [[pt1_should_use_low]] [[const_0x2]] [[const_32x2]] +; CHECK: [[pt1_res:%.+]] = OpIAdd [[u32x2_t]] [[pt1_ans_offset]] [[pt1_ans_bits]] + +; 
Extract the last component from %a +; CHECK: [[pt2:%.+]] = OpVectorExtractDynamic [[u64_t]] [[a]] [[const_2]] + +; Do firstbituhigh on the last component +; CHECK: [[pt2_cast:%.+]] = OpBitcast [[u32x2_t]] [[pt2]] +; CHECK: [[pt2_lsb_bits:%.+]] = OpExtInst [[u32x2_t]] [[glsl_450_ext]] FindUMsb [[pt2_cast]] +; CHECK: [[pt2_high_bits:%.+]] = OpVectorExtractDynamic [[u32_t]] [[pt2_lsb_bits]] [[const_0]] +; CHECK: [[pt2_low_bits:%.+]] = OpVectorExtractDynamic [[u32_t]] [[pt2_lsb_bits]] [[const_1]] +; CHECK: [[pt2_should_use_low:%.+]] = OpIEqual [[bool_t]] [[pt2_high_bits]] [[const_neg1]] +; CHECK: [[pt2_ans_bits:%.+]] = OpSelect [[u32_t]] [[pt2_should_use_low]] [[pt2_low_bits]] [[pt2_high_bits]] +; CHECK: [[pt2_ans_offset:%.+]] = OpSelect [[u32_t]] [[pt2_should_use_low]] [[const_0]] [[const_32]] +; CHECK: [[pt2_res:%.+]] = OpIAdd [[u32_t]] [[pt2_ans_offset]] [[pt2_ans_bits]] + +; Merge the parts into the final i32x3 and return it +; CHECK: [[ret:%.+]] = OpCompositeConstruct [[u32x3_t]] [[pt1_res]] [[pt2_res]] +; CHECK: OpReturnValue [[ret]] + %elt.firstbituhigh = call <3 x i32> @llvm.spv.firstbituhigh.v3i64(<3 x i64> %a) + ret <3 x i32> %elt.firstbituhigh +} + +; CHECK-LABEL: Begin function firstbituhigh_v4xi64 +define noundef <4 x i32> @firstbituhigh_v4xi64(<4 x i64> noundef %a) { +entry: +; Preamble +; CHECK: [[a:%.+]] = OpFunctionParameter [[u64x4_t]] + +; Extract first 2 components from %a +; CHECK: [[pt1:%.+]] = OpVectorShuffle [[u64x2_t]] [[a]] [[a]] 0 1 + +; Do firstbituhigh on the first 2 components +; CHECK: [[pt1_cast:%.+]] = OpBitcast [[u32x4_t]] [[pt1]] +; CHECK: [[pt1_lsb_bits:%.+]] = OpExtInst [[u32x4_t]] [[glsl_450_ext]] FindUMsb [[pt1_cast]] +; CHECK: [[pt1_high_bits:%.+]] = OpVectorShuffle [[u32x2_t]] [[pt1_lsb_bits]] [[pt1_lsb_bits]] 0 2 +; CHECK: [[pt1_low_bits:%.+]] = OpVectorShuffle [[u32x2_t]] [[pt1_lsb_bits]] [[pt1_lsb_bits]] 1 3 +; CHECK: [[pt1_should_use_low:%.+]] = OpIEqual [[boolx2_t]] [[pt1_high_bits]] [[const_neg1x2]] +; CHECK: 
[[pt1_ans_bits:%.+]] = OpSelect [[u32x2_t]] [[pt1_should_use_low]] [[pt1_low_bits]] [[pt1_high_bits]] +; CHECK: [[pt1_ans_offset:%.+]] = OpSelect [[u32x2_t]] [[pt1_should_use_low]] [[const_0x2]] [[const_32x2]] +; CHECK: [[pt1_res:%.+]] = OpIAdd [[u32x2_t]] [[pt1_ans_offset]] [[pt1_ans_bits]] + +; Extract last 2 components from %a +; CHECK: [[pt2:%.+]] = OpVectorShuffle [[u64x2_t]] [[a]] [[a]] 2 3 + +; Do firstbituhigh on the last 2 components +; CHECK: [[pt2_cast:%.+]] = OpBitcast [[u32x4_t]] [[pt2]] +; CHECK: [[pt2_lsb_bits:%.+]] = OpExtInst [[u32x4_t]] [[glsl_450_ext]] FindUMsb [[pt2_cast]] +; CHECK: [[pt2_high_bits:%.+]] = OpVectorShuffle [[u32x2_t]] [[pt2_lsb_bits]] [[pt2_lsb_bits]] 0 2 +; CHECK: [[pt2_low_bits:%.+]] = OpVectorShuffle [[u32x2_t]] [[pt2_lsb_bits]] [[pt2_lsb_bits]] 1 3 +; CHECK: [[pt2_should_use_low:%.+]] = OpIEqual [[boolx2_t]] [[pt2_high_bits]] [[const_neg1x2]] +; CHECK: [[pt2_ans_bits:%.+]] = OpSelect [[u32x2_t]] [[pt2_should_use_low]] [[pt2_low_bits]] [[pt2_high_bits]] +; CHECK: [[pt2_ans_offset:%.+]] = OpSelect [[u32x2_t]] [[pt2_should_use_low]] [[const_0x2]] [[const_32x2]] +; CHECK: [[pt2_res:%.+]] = OpIAdd [[u32x2_t]] [[pt2_ans_offset]] [[pt2_ans_bits]] + +; Merge the parts into the final i32x4 and return it +; CHECK: [[ret:%.+]] = OpCompositeConstruct [[u32x4_t]] [[pt1_res]] [[pt2_res]] +; CHECK: OpReturnValue [[ret]] + %elt.firstbituhigh = call <4 x i32> @llvm.spv.firstbituhigh.v4i64(<4 x i64> %a) + ret <4 x i32> %elt.firstbituhigh +} + +; CHECK-LABEL: Begin function firstbitshigh_i32 define noundef i32 @firstbitshigh_i32(i32 noundef %a) { entry: -; CHECK: %[[#]] = OpExtInst %[[#]] %[[#]] FindSMsb %[[#]] +; CHECK: [[a:%.+]] = OpFunctionParameter [[u32_t]] +; CHECK: [[ret:%.+]] = OpExtInst [[u32_t]] [[glsl_450_ext]] FindSMsb [[a]] +; CHECK: OpReturnValue [[ret]] %elt.firstbitshigh = call i32 @llvm.spv.firstbitshigh.i32(i32 %a) ret i32 %elt.firstbitshigh } +; CHECK-LABEL: Begin function firstbitshigh_i16 define noundef i32 
@firstbitshigh_i16(i16 noundef %a) { entry: -; CHECK: [[A:%.*]] = OpSConvert %[[#]] -; CHECK: %[[#]] = OpExtInst %[[#]] %[[#]] FindSMsb %[[#]] +; CHECK: [[a16:%.+]] = OpFunctionParameter [[u16_t]] +; CHECK: [[a32:%.+]] = OpSConvert [[u32_t]] [[a16]] +; CHECK: [[ret:%.+]] = OpExtInst [[u32_t]] [[glsl_450_ext]] FindSMsb [[a32]] +; CHECK: OpReturnValue [[ret]] %elt.firstbitshigh = call i32 @llvm.spv.firstbitshigh.i16(i16 %a) ret i32 %elt.firstbitshigh } +; CHECK-LABEL: Begin function firstbitshigh_i64 define noundef i32 @firstbitshigh_i64(i64 noundef %a) { entry: -; CHECK: [[O:%.*]] = OpBitcast %[[#]] %[[#]] -; CHECK: [[N:%.*]] = OpExtInst %[[#]] %[[#]] FindSMsb [[O]] -; CHECK: [[M:%.*]] = OpVectorExtractDynamic %[[#]] [[N]] [[Z]] -; CHECK: [[L:%.*]] = OpVectorExtractDynamic %[[#]] [[N]] [[X]] -; CHECK: [[I:%.*]] = OpIEqual %[[#]] [[M]] %[[#]] -; CHECK: [[H:%.*]] = OpSelect %[[#]] [[I]] [[L]] [[M]] -; CHECK: [[C:%.*]] = OpSelect %[[#]] [[I]] %[[#]] %[[#]] -; CHECK: [[B:%.*]] = OpIAdd %[[#]] [[C]] [[H]] +; CHECK: [[a64:%.+]] = OpFunctionParameter [[u64_t]] +; CHECK: [[a32x2:%.+]] = OpBitcast [[u32x2_t]] [[a64]] +; CHECK: [[lsb_bits:%.+]] = OpExtInst [[u32x2_t]] [[glsl_450_ext]] FindSMsb [[a32x2]] +; CHECK: [[high_bits:%.+]] = OpVectorExtractDynamic [[u32_t]] [[lsb_bits]] [[const_0]] +; CHECK: [[low_bits:%.+]] = OpVectorExtractDynamic [[u32_t]] [[lsb_bits]] [[const_1]] +; CHECK: [[should_use_low:%.+]] = OpIEqual [[bool_t]] [[high_bits]] [[const_neg1]] +; CHECK: [[ans_bits:%.+]] = OpSelect [[u32_t]] [[should_use_low]] [[low_bits]] [[high_bits]] +; CHECK: [[ans_offset:%.+]] = OpSelect [[u32_t]] [[should_use_low]] [[const_0]] [[const_32]] +; CHECK: [[ret:%.+]] = OpIAdd [[u32_t]] [[ans_offset]] [[ans_bits]] +; CHECK: OpReturnValue [[ret]] %elt.firstbitshigh = call i32 @llvm.spv.firstbitshigh.i64(i64 %a) ret i32 %elt.firstbitshigh } diff --git a/llvm/test/CodeGen/SPIRV/hlsl-intrinsics/firstbitlow.ll b/llvm/test/CodeGen/SPIRV/hlsl-intrinsics/firstbitlow.ll new file mode 
100644 index 0000000000000..6de6cdc60ea9c --- /dev/null +++ b/llvm/test/CodeGen/SPIRV/hlsl-intrinsics/firstbitlow.ll @@ -0,0 +1,230 @@ +; RUN: llc -verify-machineinstrs -O0 -mtriple=spirv-unknown-unknown %s -o - | FileCheck %s +; RUN: %if spirv-tools %{ llc -O0 -mtriple=spirv-unknown-unknown %s -o - -filetype=obj | spirv-val %} + +; CHECK-DAG: [[glsl_450_ext:%.+]] = OpExtInstImport "GLSL.std.450" +; CHECK-DAG: OpMemoryModel Logical GLSL450 +; CHECK-DAG: [[u32_t:%.+]] = OpTypeInt 32 0 +; CHECK-DAG: [[u32x2_t:%.+]] = OpTypeVector [[u32_t]] 2 +; CHECK-DAG: [[u32x3_t:%.+]] = OpTypeVector [[u32_t]] 3 +; CHECK-DAG: [[u32x4_t:%.+]] = OpTypeVector [[u32_t]] 4 +; CHECK-DAG: [[const_0:%.*]] = OpConstant [[u32_t]] 0 +; CHECK-DAG: [[const_0x2:%.*]] = OpConstantComposite [[u32x2_t]] [[const_0]] [[const_0]] +; CHECK-DAG: [[const_1:%.*]] = OpConstant [[u32_t]] 1 +; CHECK-DAG: [[const_2:%.*]] = OpConstant [[u32_t]] 2 +; CHECK-DAG: [[const_32:%.*]] = OpConstant [[u32_t]] 32 +; CHECK-DAG: [[const_32x2:%.*]] = OpConstantComposite [[u32x2_t]] [[const_32]] [[const_32]] +; CHECK-DAG: [[const_neg1:%.*]] = OpConstant [[u32_t]] 4294967295 +; CHECK-DAG: [[const_neg1x2:%.*]] = OpConstantComposite [[u32x2_t]] [[const_neg1]] [[const_neg1]] +; CHECK-DAG: [[u16_t:%.+]] = OpTypeInt 16 0 +; CHECK-DAG: [[u16x2_t:%.+]] = OpTypeVector [[u16_t]] 2 +; CHECK-DAG: [[u16x3_t:%.+]] = OpTypeVector [[u16_t]] 3 +; CHECK-DAG: [[u16x4_t:%.+]] = OpTypeVector [[u16_t]] 4 +; CHECK-DAG: [[u64_t:%.+]] = OpTypeInt 64 0 +; CHECK-DAG: [[u64x2_t:%.+]] = OpTypeVector [[u64_t]] 2 +; CHECK-DAG: [[u64x3_t:%.+]] = OpTypeVector [[u64_t]] 3 +; CHECK-DAG: [[u64x4_t:%.+]] = OpTypeVector [[u64_t]] 4 +; CHECK-DAG: [[bool_t:%.+]] = OpTypeBool +; CHECK-DAG: [[boolx2_t:%.+]] = OpTypeVector [[bool_t]] 2 + +; CHECK-LABEL: Begin function firstbitlow_i32 +define noundef i32 @firstbitlow_i32(i32 noundef %a) { +entry: +; CHECK: [[a:%.+]] = OpFunctionParameter [[u32_t]] +; CHECK: [[ret:%.+]] = OpExtInst [[u32_t]] [[glsl_450_ext]] FindILsb 
[[a]] +; CHECK: OpReturnValue [[ret]] + %elt.firstbitlow = call i32 @llvm.spv.firstbitlow.i32(i32 %a) + ret i32 %elt.firstbitlow +} + +; CHECK-LABEL: Begin function firstbitlow_v2xi32 +define noundef <2 x i32> @firstbitlow_v2xi32(<2 x i32> noundef %a) { +entry: +; CHECK: [[a:%.+]] = OpFunctionParameter [[u32x2_t]] +; CHECK: [[ret:%.+]] = OpExtInst [[u32x2_t]] [[glsl_450_ext]] FindILsb [[a]] +; CHECK: OpReturnValue [[ret]] + %elt.firstbitlow = call <2 x i32> @llvm.spv.firstbitlow.v2i32(<2 x i32> %a) + ret <2 x i32> %elt.firstbitlow +} + +; CHECK-LABEL: Begin function firstbitlow_v3xi32 +define noundef <3 x i32> @firstbitlow_v3xi32(<3 x i32> noundef %a) { +entry: +; CHECK: [[a:%.+]] = OpFunctionParameter [[u32x3_t]] +; CHECK: [[ret:%.+]] = OpExtInst [[u32x3_t]] [[glsl_450_ext]] FindILsb [[a]] +; CHECK: OpReturnValue [[ret]] + %elt.firstbitlow = call <3 x i32> @llvm.spv.firstbitlow.v3i32(<3 x i32> %a) + ret <3 x i32> %elt.firstbitlow +} + +; CHECK-LABEL: Begin function firstbitlow_v4xi32 +define noundef <4 x i32> @firstbitlow_v4xi32(<4 x i32> noundef %a) { +entry: +; CHECK: [[a:%.+]] = OpFunctionParameter [[u32x4_t]] +; CHECK: [[ret:%.+]] = OpExtInst [[u32x4_t]] [[glsl_450_ext]] FindILsb [[a]] +; CHECK: OpReturnValue [[ret]] + %elt.firstbitlow = call <4 x i32> @llvm.spv.firstbitlow.v4i32(<4 x i32> %a) + ret <4 x i32> %elt.firstbitlow +} + +; CHECK-LABEL: Begin function firstbitlow_i16 +define noundef i32 @firstbitlow_i16(i16 noundef %a) { +entry: +; CHECK: [[a16:%.+]] = OpFunctionParameter [[u16_t]] +; CHECK: [[a32:%.+]] = OpUConvert [[u32_t]] [[a16]] +; CHECK: [[ret:%.+]] = OpExtInst [[u32_t]] [[glsl_450_ext]] FindILsb [[a32]] +; CHECK: OpReturnValue [[ret]] + %elt.firstbitlow = call i32 @llvm.spv.firstbitlow.i16(i16 %a) + ret i32 %elt.firstbitlow +} + +; CHECK-LABEL: Begin function firstbitlow_v2xi16 +define noundef <2 x i32> @firstbitlow_v2xi16(<2 x i16> noundef %a) { +entry: +; CHECK: [[a16:%.+]] = OpFunctionParameter [[u16x2_t]] +; CHECK: [[a32:%.+]] = OpUConvert 
[[u32x2_t]] [[a16]] +; CHECK: [[ret:%.+]] = OpExtInst [[u32x2_t]] [[glsl_450_ext]] FindILsb [[a32]] +; CHECK: OpReturnValue [[ret]] + %elt.firstbitlow = call <2 x i32> @llvm.spv.firstbitlow.v2i16(<2 x i16> %a) + ret <2 x i32> %elt.firstbitlow +} + +; CHECK-LABEL: Begin function firstbitlow_v3xi16 +define noundef <3 x i32> @firstbitlow_v3xi16(<3 x i16> noundef %a) { +entry: +; CHECK: [[a16:%.+]] = OpFunctionParameter [[u16x3_t]] +; CHECK: [[a32:%.+]] = OpUConvert [[u32x3_t]] [[a16]] +; CHECK: [[ret:%.+]] = OpExtInst [[u32x3_t]] [[glsl_450_ext]] FindILsb [[a32]] +; CHECK: OpReturnValue [[ret]] + %elt.firstbitlow = call <3 x i32> @llvm.spv.firstbitlow.v3i16(<3 x i16> %a) + ret <3 x i32> %elt.firstbitlow +} + +; CHECK-LABEL: Begin function firstbitlow_v4xi16 +define noundef <4 x i32> @firstbitlow_v4xi16(<4 x i16> noundef %a) { +entry: +; CHECK: [[a16:%.+]] = OpFunctionParameter [[u16x4_t]] +; CHECK: [[a32:%.+]] = OpUConvert [[u32x4_t]] [[a16]] +; CHECK: [[ret:%.+]] = OpExtInst [[u32x4_t]] [[glsl_450_ext]] FindILsb [[a32]] +; CHECK: OpReturnValue [[ret]] + %elt.firstbitlow = call <4 x i32> @llvm.spv.firstbitlow.v4i16(<4 x i16> %a) + ret <4 x i32> %elt.firstbitlow +} + +; CHECK-LABEL: Begin function firstbitlow_i64 +define noundef i32 @firstbitlow_i64(i64 noundef %a) { +entry: +; CHECK: [[a64:%.+]] = OpFunctionParameter [[u64_t]] +; CHECK: [[a32x2:%.+]] = OpBitcast [[u32x2_t]] [[a64]] +; CHECK: [[lsb_bits:%.+]] = OpExtInst [[u32x2_t]] [[glsl_450_ext]] FindILsb [[a32x2]] +; CHECK: [[high_bits:%.+]] = OpVectorExtractDynamic [[u32_t]] [[lsb_bits]] [[const_0]] +; CHECK: [[low_bits:%.+]] = OpVectorExtractDynamic [[u32_t]] [[lsb_bits]] [[const_1]] +; CHECK: [[should_use_high:%.+]] = OpIEqual [[bool_t]] [[low_bits]] [[const_neg1]] +; CHECK: [[ans_bits:%.+]] = OpSelect [[u32_t]] [[should_use_high]] [[high_bits]] [[low_bits]] +; CHECK: [[ans_offset:%.+]] = OpSelect [[u32_t]] [[should_use_high]] [[const_32]] [[const_0]] +; CHECK: [[ret:%.+]] = OpIAdd [[u32_t]] [[ans_offset]] 
[[ans_bits]] +; CHECK: OpReturnValue [[ret]] + %elt.firstbitlow = call i32 @llvm.spv.firstbitlow.i64(i64 %a) + ret i32 %elt.firstbitlow +} + +; CHECK-LABEL: Begin function firstbitlow_v2xi64 +define noundef <2 x i32> @firstbitlow_v2xi64(<2 x i64> noundef %a) { +entry: +; CHECK: [[a64x2:%.+]] = OpFunctionParameter [[u64x2_t]] +; CHECK: [[a32x4:%.+]] = OpBitcast [[u32x4_t]] [[a64x2]] +; CHECK: [[lsb_bits:%.+]] = OpExtInst [[u32x4_t]] [[glsl_450_ext]] FindILsb [[a32x4]] +; CHECK: [[high_bits:%.+]] = OpVectorShuffle [[u32x2_t]] [[lsb_bits]] [[lsb_bits]] 0 2 +; CHECK: [[low_bits:%.+]] = OpVectorShuffle [[u32x2_t]] [[lsb_bits]] [[lsb_bits]] 1 3 +; CHECK: [[should_use_high:%.+]] = OpIEqual [[boolx2_t]] [[low_bits]] [[const_neg1x2]] +; CHECK: [[ans_bits:%.+]] = OpSelect [[u32x2_t]] [[should_use_high]] [[high_bits]] [[low_bits]] +; CHECK: [[ans_offset:%.+]] = OpSelect [[u32x2_t]] [[should_use_high]] [[const_32x2]] [[const_0x2]] +; CHECK: [[ret:%.+]] = OpIAdd [[u32x2_t]] [[ans_offset]] [[ans_bits]] +; CHECK: OpReturnValue [[ret]] + %elt.firstbitlow = call <2 x i32> @llvm.spv.firstbitlow.v2i64(<2 x i64> %a) + ret <2 x i32> %elt.firstbitlow +} + +; CHECK-LABEL: Begin function firstbitlow_v3xi64 +define noundef <3 x i32> @firstbitlow_v3xi64(<3 x i64> noundef %a) { +entry: +; Preamble +; CHECK: [[a:%.+]] = OpFunctionParameter [[u64x3_t]] + +; Extract first 2 components from %a +; CHECK: [[pt1:%.+]] = OpVectorShuffle [[u64x2_t]] [[a]] [[a]] 0 1 + +; Do firstbitlow on the first 2 components +; CHECK: [[pt1_cast:%.+]] = OpBitcast [[u32x4_t]] [[pt1]] +; CHECK: [[pt1_lsb_bits:%.+]] = OpExtInst [[u32x4_t]] [[glsl_450_ext]] FindILsb [[pt1_cast]] +; CHECK: [[pt1_high_bits:%.+]] = OpVectorShuffle [[u32x2_t]] [[pt1_lsb_bits]] [[pt1_lsb_bits]] 0 2 +; CHECK: [[pt1_low_bits:%.+]] = OpVectorShuffle [[u32x2_t]] [[pt1_lsb_bits]] [[pt1_lsb_bits]] 1 3 +; CHECK: [[pt1_should_use_high:%.+]] = OpIEqual [[boolx2_t]] [[pt1_low_bits]] [[const_neg1x2]] +; CHECK: [[pt1_ans_bits:%.+]] = OpSelect 
[[u32x2_t]] [[pt1_should_use_high]] [[pt1_high_bits]] [[pt1_low_bits]] +; CHECK: [[pt1_ans_offset:%.+]] = OpSelect [[u32x2_t]] [[pt1_should_use_high]] [[const_32x2]] [[const_0x2]] +; CHECK: [[pt1_res:%.+]] = OpIAdd [[u32x2_t]] [[pt1_ans_offset]] [[pt1_ans_bits]] + +; Extract the last component from %a +; CHECK: [[pt2:%.+]] = OpVectorExtractDynamic [[u64_t]] [[a]] [[const_2]] + +; Do firstbitlow on the last component +; CHECK: [[pt2_cast:%.+]] = OpBitcast [[u32x2_t]] [[pt2]] +; CHECK: [[pt2_lsb_bits:%.+]] = OpExtInst [[u32x2_t]] [[glsl_450_ext]] FindILsb [[pt2_cast]] +; CHECK: [[pt2_high_bits:%.+]] = OpVectorExtractDynamic [[u32_t]] [[pt2_lsb_bits]] [[const_0]] +; CHECK: [[pt2_low_bits:%.+]] = OpVectorExtractDynamic [[u32_t]] [[pt2_lsb_bits]] [[const_1]] +; CHECK: [[pt2_should_use_high:%.+]] = OpIEqual [[bool_t]] [[pt2_low_bits]] [[const_neg1]] +; CHECK: [[pt2_ans_bits:%.+]] = OpSelect [[u32_t]] [[pt2_should_use_high]] [[pt2_high_bits]] [[pt2_low_bits]] +; CHECK: [[pt2_ans_offset:%.+]] = OpSelect [[u32_t]] [[pt2_should_use_high]] [[const_32]] [[const_0]] +; CHECK: [[pt2_res:%.+]] = OpIAdd [[u32_t]] [[pt2_ans_offset]] [[pt2_ans_bits]] + +; Merge the parts into the final i32x3 and return it +; CHECK: [[ret:%.+]] = OpCompositeConstruct [[u32x3_t]] [[pt1_res]] [[pt2_res]] +; CHECK: OpReturnValue [[ret]] + %elt.firstbitlow = call <3 x i32> @llvm.spv.firstbitlow.v3i64(<3 x i64> %a) + ret <3 x i32> %elt.firstbitlow +} + +; CHECK-LABEL: Begin function firstbitlow_v4xi64 +define noundef <4 x i32> @firstbitlow_v4xi64(<4 x i64> noundef %a) { +entry: +; Preamble +; CHECK: [[a:%.+]] = OpFunctionParameter [[u64x4_t]] + +; Extract first 2 components from %a +; CHECK: [[pt1:%.+]] = OpVectorShuffle [[u64x2_t]] [[a]] [[a]] 0 1 + +; Do firstbitlow on the first 2 components +; CHECK: [[pt1_cast:%.+]] = OpBitcast [[u32x4_t]] [[pt1]] +; CHECK: [[pt1_lsb_bits:%.+]] = OpExtInst [[u32x4_t]] [[glsl_450_ext]] FindILsb [[pt1_cast]] +; CHECK: [[pt1_high_bits:%.+]] = OpVectorShuffle [[u32x2_t]] 
[[pt1_lsb_bits]] [[pt1_lsb_bits]] 0 2 +; CHECK: [[pt1_low_bits:%.+]] = OpVectorShuffle [[u32x2_t]] [[pt1_lsb_bits]] [[pt1_lsb_bits]] 1 3 +; CHECK: [[pt1_should_use_high:%.+]] = OpIEqual [[boolx2_t]] [[pt1_low_bits]] [[const_neg1x2]] +; CHECK: [[pt1_ans_bits:%.+]] = OpSelect [[u32x2_t]] [[pt1_should_use_high]] [[pt1_high_bits]] [[pt1_low_bits]] +; CHECK: [[pt1_ans_offset:%.+]] = OpSelect [[u32x2_t]] [[pt1_should_use_high]] [[const_32x2]] [[const_0x2]] +; CHECK: [[pt1_res:%.+]] = OpIAdd [[u32x2_t]] [[pt1_ans_offset]] [[pt1_ans_bits]] + +; Extract last 2 components from %a +; CHECK: [[pt2:%.+]] = OpVectorShuffle [[u64x2_t]] [[a]] [[a]] 2 3 + +; Do firstbituhigh on the last 2 components +; CHECK: [[pt2_cast:%.+]] = OpBitcast [[u32x4_t]] [[pt2]] +; CHECK: [[pt2_lsb_bits:%.+]] = OpExtInst [[u32x4_t]] [[glsl_450_ext]] FindILsb [[pt2_cast]] +; CHECK: [[pt2_high_bits:%.+]] = OpVectorShuffle [[u32x2_t]] [[pt2_lsb_bits]] [[pt2_lsb_bits]] 0 2 +; CHECK: [[pt2_low_bits:%.+]] = OpVectorShuffle [[u32x2_t]] [[pt2_lsb_bits]] [[pt2_lsb_bits]] 1 3 +; CHECK: [[pt2_should_use_high:%.+]] = OpIEqual [[boolx2_t]] [[pt2_low_bits]] [[const_neg1x2]] +; CHECK: [[pt2_ans_bits:%.+]] = OpSelect [[u32x2_t]] [[pt2_should_use_high]] [[pt2_high_bits]] [[pt2_low_bits]] +; CHECK: [[pt2_ans_offset:%.+]] = OpSelect [[u32x2_t]] [[pt2_should_use_high]] [[const_32x2]] [[const_0x2]] +; CHECK: [[pt2_res:%.+]] = OpIAdd [[u32x2_t]] [[pt2_ans_offset]] [[pt2_ans_bits]] + +; Merge the parts into the final i32x4 and return it +; CHECK: [[ret:%.+]] = OpCompositeConstruct [[u32x4_t]] [[pt1_res]] [[pt2_res]] +; CHECK: OpReturnValue [[ret]] + %elt.firstbitlow = call <4 x i32> @llvm.spv.firstbitlow.v4i64(<4 x i64> %a) + ret <4 x i32> %elt.firstbitlow +} + +;declare i16 @llvm.spv.firstbitlow.i16(i16) +;declare i32 @llvm.spv.firstbitlow.i32(i32) +;declare i64 @llvm.spv.firstbitlow.i64(i64) +;declare i16 @llvm.spv.firstbitlow.v2i16(<2 x i16>) +;declare i32 @llvm.spv.firstbitlow.v2i32(<2 x i32>) +;declare i64 
@llvm.spv.firstbitlow.v2i64(<2 x i64>) diff --git a/llvm/test/CodeGen/SPIRV/hlsl-resources/BufferLoadStore.ll b/llvm/test/CodeGen/SPIRV/hlsl-resources/BufferLoadStore.ll new file mode 100644 index 0000000000000..25dcc90cb61cd --- /dev/null +++ b/llvm/test/CodeGen/SPIRV/hlsl-resources/BufferLoadStore.ll @@ -0,0 +1,60 @@ +; RUN: llc -O0 -verify-machineinstrs -mtriple=spirv-vulkan-library %s -o - | FileCheck %s +; RUN: %if spirv-tools %{ llc -O0 -mtriple=spirv-vulkan-library %s -o - -filetype=obj | spirv-val %} + +; CHECK-DAG: [[float:%[0-9]+]] = OpTypeFloat 32 +; CHECK-DAG: [[v4float:%[0-9]+]] = OpTypeVector [[float]] 4 +; CHECK-DAG: [[int:%[0-9]+]] = OpTypeInt 32 0 +; CHECK-DAG: [[zero:%[0-9]+]] = OpConstant [[int]] 0 +; CHECK-DAG: [[one:%[0-9]+]] = OpConstant [[int]] 1 +; CHECK-DAG: [[twenty:%[0-9]+]] = OpConstant [[int]] 20 +; CHECK-DAG: [[twenty_three:%[0-9]+]] = OpConstant [[int]] 23 +; CHECK-DAG: [[ImageType:%[0-9]+]] = OpTypeImage [[float]] Buffer 2 0 0 2 Rgba32f +; CHECK-DAG: [[ImagePtr:%[0-9]+]] = OpTypePointer UniformConstant [[ImageType]] +; CHECK: [[Var:%[0-9]+]] = OpVariable [[ImagePtr]] UniformConstant + +; Function Attrs: mustprogress nofree noinline norecurse nosync nounwind willreturn memory(readwrite, inaccessiblemem: none) +define void @main() local_unnamed_addr #0 { +entry: +; CHECK: [[H:%[0-9]+]] = OpLoad [[ImageType]] [[Var]] + %s_h.i = tail call target("spirv.Image", float, 5, 2, 0, 0, 2, 1) @llvm.spv.resource.handlefrombinding.tspirv.Image_f32_5_2_0_0_2_0t(i32 3, i32 5, i32 1, i32 0, i1 false) + +; CHECK: [[R:%[0-9]+]] = OpImageRead [[v4float]] [[H]] [[one]] +; CHECK: [[V:%[0-9]+]] = OpCompositeExtract [[float]] [[R]] 0 + %0 = tail call noundef nonnull align 4 dereferenceable(4) ptr @llvm.spv.resource.getpointer.p0.tspirv.Image_f32_5_2_0_0_2_0t(target("spirv.Image", float, 5, 2, 0, 0, 2, 1) %s_h.i, i32 1) + %1 = load float, ptr %0, align 4 +; CHECK: OpBranch [[bb_store:%[0-9]+]] + br label %bb_store + +; CHECK: [[bb_store]] = OpLabel 
+bb_store: + +; CHECK: [[H:%[0-9]+]] = OpLoad [[ImageType]] [[Var]] +; CHECK: OpImageWrite [[H]] [[zero]] [[V]] + %2 = tail call noundef nonnull align 4 dereferenceable(4) ptr @llvm.spv.resource.getpointer.p0.tspirv.Image_f32_5_2_0_0_2_0t(target("spirv.Image", float, 5, 2, 0, 0, 2, 1) %s_h.i, i32 0) + store float %1, ptr %2, align 4 +; CHECK: OpBranch [[bb_both:%[0-9]+]] + br label %bb_both + +; CHECK: [[bb_both]] = OpLabel +bb_both: +; CHECK: [[H:%[0-9]+]] = OpLoad [[ImageType]] [[Var]] +; CHECK: [[R:%[0-9]+]] = OpImageRead [[v4float]] [[H]] [[twenty_three]] +; CHECK: [[V:%[0-9]+]] = OpCompositeExtract [[float]] [[R]] 0 + %3 = tail call noundef nonnull align 4 dereferenceable(4) ptr @llvm.spv.resource.getpointer.p0.tspirv.Image_f32_5_2_0_0_2_0t(target("spirv.Image", float, 5, 2, 0, 0, 2, 1) %s_h.i, i32 23) + %4 = load float, ptr %3, align 4 + +; CHECK: [[H:%[0-9]+]] = OpLoad [[ImageType]] [[Var]] +; CHECK: OpImageWrite [[H]] [[twenty]] [[V]] + %5 = tail call noundef nonnull align 4 dereferenceable(4) ptr @llvm.spv.resource.getpointer.p0.tspirv.Image_f32_5_2_0_0_2_0t(target("spirv.Image", float, 5, 2, 0, 0, 2, 1) %s_h.i, i32 20) + store float %4, ptr %5, align 4 + ret void +} + +; Function Attrs: mustprogress nocallback nofree nosync nounwind willreturn memory(none) +declare ptr @llvm.spv.resource.getpointer.p0.tspirv.Image_f32_5_2_0_0_2_0t(target("spirv.Image", float, 5, 2, 0, 0, 2, 1), i32) #1 + +; Function Attrs: mustprogress nocallback nofree nosync nounwind willreturn memory(none) +declare target("spirv.Image", float, 5, 2, 0, 0, 2, 1) @llvm.spv.resource.handlefrombinding.tspirv.Image_f32_5_2_0_0_2_0t(i32, i32, i32, i32, i1) #1 + +attributes #0 = { mustprogress nofree noinline norecurse nosync nounwind willreturn memory(readwrite, inaccessiblemem: none) "frame-pointer"="all" "hlsl.numthreads"="1,1,1" "hlsl.shader"="compute" "no-trapping-math"="true" "stack-protector-buffer-size"="8" } +attributes #1 = { mustprogress nocallback nofree nosync nounwind willreturn 
memory(none) } diff --git a/llvm/test/CodeGen/SPIRV/hlsl-resources/BufferStore.ll b/llvm/test/CodeGen/SPIRV/hlsl-resources/BufferStore.ll index b0ffa01ccdd44..812e20e45565b 100644 --- a/llvm/test/CodeGen/SPIRV/hlsl-resources/BufferStore.ll +++ b/llvm/test/CodeGen/SPIRV/hlsl-resources/BufferStore.ll @@ -20,13 +20,14 @@ declare <4 x i32> @get_data() #1 ; CHECK: {{%[0-9]+}} = OpFunction {{%[0-9]+}} DontInline {{%[0-9]+}} ; CHECK-NEXT: OpLabel define void @RWBufferStore_Vec4_I32() #0 { -; CHECK: [[buffer:%[0-9]+]] = OpLoad [[RWBufferTypeInt]] [[IntBufferVar]] %buffer0 = call target("spirv.Image", i32, 5, 2, 0, 0, 2, 24) @llvm.spv.resource.handlefrombinding.tspirv.Image_i32_5_2_0_0_2_24( i32 16, i32 7, i32 1, i32 0, i1 false) ; CHECK: [[data:%[0-9]+]] = OpFunctionCall %data = call <4 x i32> @get_data() + +; CHECK: [[buffer:%[0-9]+]] = OpLoad [[RWBufferTypeInt]] [[IntBufferVar]] ; CHECK: OpImageWrite [[buffer]] [[zero]] [[data]] call void @llvm.spv.resource.store.typedbuffer(target("spirv.Image", i32, 5, 2, 0, 0, 2, 24) %buffer0, i32 0, <4 x i32> %data) diff --git a/llvm/test/CodeGen/SPIRV/hlsl-resources/CombinedSamplerImageDynIdx.ll b/llvm/test/CodeGen/SPIRV/hlsl-resources/CombinedSamplerImageDynIdx.ll deleted file mode 100644 index 97a7252eb067b..0000000000000 --- a/llvm/test/CodeGen/SPIRV/hlsl-resources/CombinedSamplerImageDynIdx.ll +++ /dev/null @@ -1,40 +0,0 @@ -; RUN: llc -O0 -verify-machineinstrs -mtriple=spirv1.5-vulkan-library %s -o - | FileCheck %s -; RUN: %if spirv-tools %{ llc -O0 -mtriple=spirv1.5-vulkan-library %s -o - -filetype=obj | spirv-val %} - -; CHECK: OpCapability Shader -; CHECK-NEXT: OpCapability SampledImageArrayDynamicIndexing -; CHECK-NEXT: OpCapability Sampled1D -; CHECK-NOT: OpCapability - -; CHECK-DAG: OpDecorate [[Var:%[0-9]+]] DescriptorSet 3 -; CHECK-DAG: OpDecorate [[Var]] Binding 4 - -; CHECK-DAG: [[int:%[0-9]+]] = OpTypeInt 32 0 -; CHECK-DAG: [[BufferType:%[0-9]+]] = OpTypeImage [[int]] 1D 2 0 0 1 R32i {{$}} -; CHECK-DAG: 
[[CombindedType:%[0-9]+]] = OpTypeSampledImage [[BufferType]] -; CHECK-DAG: [[BufferPtrType:%[0-9]+]] = OpTypePointer UniformConstant [[CombindedType]] -; CHECK-DAG: [[ArraySize:%[0-9]+]] = OpConstant [[int]] 3 -; CHECK-DAG: [[One:%[0-9]+]] = OpConstant [[int]] 1 -; CHECK-DAG: [[Zero:%[0-9]+]] = OpConstant [[int]] 0 -; CHECK-DAG: [[BufferArrayType:%[0-9]+]] = OpTypeArray [[CombindedType]] [[ArraySize]] -; CHECK-DAG: [[ArrayPtrType:%[0-9]+]] = OpTypePointer UniformConstant [[BufferArrayType]] -; CHECK-DAG: [[Var]] = OpVariable [[ArrayPtrType]] UniformConstant - -; CHECK: {{%[0-9]+}} = OpFunction {{%[0-9]+}} DontInline {{%[0-9]+}} -; CHECK-NEXT: OpLabel -define void @main() #0 { -; CHECK: [[ac:%[0-9]+]] = OpAccessChain [[BufferPtrType]] [[Var]] [[Zero]] -; CHECK: [[buffer:%[0-9]+]] = OpLoad [[CombindedType]] [[ac]] - %buffer0 = call target("spirv.SampledImage", i32, 0, 2, 0, 0, 1, 24) - @llvm.spv.resource.handlefrombinding.tspirv.Image_f32_0_2_0_0_1_24( - i32 3, i32 4, i32 3, i32 0, i1 false) - -; CHECK: [[ac:%[0-9]+]] = OpAccessChain [[BufferPtrType]] [[Var]] [[One]] -; CHECK: [[buffer:%[0-9]+]] = OpLoad [[CombindedType]] [[ac]] - %buffer1 = call target("spirv.SampledImage", i32, 0, 2, 0, 0, 1, 24) - @llvm.spv.resource.handlefrombinding.tspirv.Image_f32_0_2_0_0_1_24( - i32 3, i32 4, i32 3, i32 1, i1 false) - ret void -} - -attributes #0 = { convergent noinline norecurse "frame-pointer"="all" "hlsl.numthreads"="1,1,1" "hlsl.shader"="compute" "no-trapping-math"="true" "stack-protector-buffer-size"="8" } diff --git a/llvm/test/CodeGen/SPIRV/hlsl-resources/CombinedSamplerImageNonUniformIdx.ll b/llvm/test/CodeGen/SPIRV/hlsl-resources/CombinedSamplerImageNonUniformIdx.ll deleted file mode 100644 index 6c5c126e4462b..0000000000000 --- a/llvm/test/CodeGen/SPIRV/hlsl-resources/CombinedSamplerImageNonUniformIdx.ll +++ /dev/null @@ -1,47 +0,0 @@ -; RUN: llc -O0 -verify-machineinstrs -mtriple=spirv1.5-vulkan-library %s -o - | FileCheck %s -; RUN: %if spirv-tools %{ llc -O0 
-mtriple=spirv1.5-vulkan-library %s -o - -filetype=obj | spirv-val %} - -; CHECK: OpCapability Shader -; CHECK: OpCapability ShaderNonUniform -; CHECK-NEXT: OpCapability SampledImageArrayNonUniformIndexing -; CHECK-NEXT: OpCapability Sampled1D -; CHECK-NOT: OpCapability - -; CHECK-DAG: OpDecorate [[Var:%[0-9]+]] DescriptorSet 3 -; CHECK-DAG: OpDecorate [[Var]] Binding 4 -; CHECK: OpDecorate [[Zero:%[0-9]+]] NonUniform -; CHECK: OpDecorate [[ac0:%[0-9]+]] NonUniform -; CHECK: OpDecorate [[ld0:%[0-9]+]] NonUniform -; CHECK: OpDecorate [[One:%[0-9]+]] NonUniform -; CHECK: OpDecorate [[ac1:%[0-9]+]] NonUniform -; CHECK: OpDecorate [[ld1:%[0-9]+]] NonUniform - -; CHECK-DAG: [[int:%[0-9]+]] = OpTypeInt 32 0 -; CHECK-DAG: [[BufferType:%[0-9]+]] = OpTypeImage [[int]] 1D 2 0 0 1 R32i {{$}} -; CHECK-DAG: [[CombindedType:%[0-9]+]] = OpTypeSampledImage [[BufferType]] -; CHECK-DAG: [[BufferPtrType:%[0-9]+]] = OpTypePointer UniformConstant [[CombindedType]] -; CHECK-DAG: [[ArraySize:%[0-9]+]] = OpConstant [[int]] 3 -; CHECK-DAG: [[One]] = OpConstant [[int]] 1 -; CHECK-DAG: [[Zero]] = OpConstant [[int]] 0 -; CHECK-DAG: [[BufferArrayType:%[0-9]+]] = OpTypeArray [[CombindedType]] [[ArraySize]] -; CHECK-DAG: [[ArrayPtrType:%[0-9]+]] = OpTypePointer UniformConstant [[BufferArrayType]] -; CHECK-DAG: [[Var]] = OpVariable [[ArrayPtrType]] UniformConstant - -; CHECK: {{%[0-9]+}} = OpFunction {{%[0-9]+}} DontInline {{%[0-9]+}} -; CHECK-NEXT: OpLabel -define void @main() #0 { -; CHECK: [[ac0]] = OpAccessChain [[BufferPtrType]] [[Var]] [[Zero]] -; CHECK: [[ld0:%[0-9]+]] = OpLoad [[CombindedType]] [[ac0]] - %buffer0 = call target("spirv.SampledImage", i32, 0, 2, 0, 0, 1, 24) - @llvm.spv.resource.handlefrombinding.tspirv.Image_f32_0_2_0_0_1_24( - i32 3, i32 4, i32 3, i32 0, i1 true) - -; CHECK: [[ac1]] = OpAccessChain [[BufferPtrType]] [[Var]] [[One]] -; CHECK: [[ld1]] = OpLoad [[CombindedType]] [[ac1]] - %buffer1 = call target("spirv.SampledImage", i32, 0, 2, 0, 0, 1, 24) - 
@llvm.spv.resource.handlefrombinding.tspirv.Image_f32_0_2_0_0_1_24( - i32 3, i32 4, i32 3, i32 1, i1 true) - ret void -} - -attributes #0 = { convergent noinline norecurse "frame-pointer"="all" "hlsl.numthreads"="1,1,1" "hlsl.shader"="compute" "no-trapping-math"="true" "stack-protector-buffer-size"="8" } diff --git a/llvm/test/CodeGen/SPIRV/hlsl-resources/InputAttachmentImageDynIdx.ll b/llvm/test/CodeGen/SPIRV/hlsl-resources/InputAttachmentImageDynIdx.ll deleted file mode 100644 index 2a52dd1817f0c..0000000000000 --- a/llvm/test/CodeGen/SPIRV/hlsl-resources/InputAttachmentImageDynIdx.ll +++ /dev/null @@ -1,39 +0,0 @@ -; RUN: llc -O0 -verify-machineinstrs -mtriple=spirv1.5-vulkan-library %s -o - | FileCheck %s -; RUN: %if spirv-tools %{ llc -O0 -mtriple=spirv1.5-vulkan-library %s -o - -filetype=obj | spirv-val %} - -; CHECK: OpCapability Shader -; CHECK-NEXT: OpCapability InputAttachmentArrayDynamicIndexing -; SCHECK-NEXT: OpCapability InputAttachment -; CHECK-NOT: OpCapability - -; CHECK-DAG: OpDecorate [[Var:%[0-9]+]] DescriptorSet 3 -; CHECK-DAG: OpDecorate [[Var]] Binding 4 - -; CHECK-DAG: [[int:%[0-9]+]] = OpTypeInt 32 0 -; CHECK-DAG: [[BufferType:%[0-9]+]] = OpTypeImage [[int]] SubpassData 2 0 0 2 Unknown {{$}} -; CHECK-DAG: [[BufferPtrType:%[0-9]+]] = OpTypePointer UniformConstant [[BufferType]] -; CHECK-DAG: [[ArraySize:%[0-9]+]] = OpConstant [[int]] 3 -; CHECK-DAG: [[One:%[0-9]+]] = OpConstant [[int]] 1 -; CHECK-DAG: [[Zero:%[0-9]+]] = OpConstant [[int]] 0 -; CHECK-DAG: [[BufferArrayType:%[0-9]+]] = OpTypeArray [[BufferType]] [[ArraySize]] -; CHECK-DAG: [[ArrayPtrType:%[0-9]+]] = OpTypePointer UniformConstant [[BufferArrayType]] -; CHECK-DAG: [[Var]] = OpVariable [[ArrayPtrType]] UniformConstant - -; CHECK: {{%[0-9]+}} = OpFunction {{%[0-9]+}} DontInline {{%[0-9]+}} -; CHECK-NEXT: OpLabel -define void @main() #0 { -; CHECK: [[ac:%[0-9]+]] = OpAccessChain [[BufferPtrType]] [[Var]] [[Zero]] -; CHECK: [[buffer:%[0-9]+]] = OpLoad [[BufferType]] [[ac]] - 
%buffer0 = call target("spirv.Image", i32, 6, 2, 0, 0, 2, 0) - @llvm.spv.resource.handlefrombinding.tspirv.Image_f32_6_2_0_0_2_0( - i32 3, i32 4, i32 3, i32 0, i1 false) - -; CHECK: [[ac:%[0-9]+]] = OpAccessChain [[BufferPtrType]] [[Var]] [[One]] -; CHECK: [[buffer:%[0-9]+]] = OpLoad [[BufferType]] [[ac]] - %buffer1 = call target("spirv.Image", i32, 6, 2, 0, 0, 2, 0) - @llvm.spv.resource.handlefrombinding.tspirv.Image_f32_6_2_0_0_2_0( - i32 3, i32 4, i32 3, i32 1, i1 false) - ret void -} - -attributes #0 = { convergent noinline norecurse "frame-pointer"="all" "hlsl.numthreads"="1,1,1" "hlsl.shader"="compute" "no-trapping-math"="true" "stack-protector-buffer-size"="8" } diff --git a/llvm/test/CodeGen/SPIRV/hlsl-resources/InputAttachmentImageNonUniformIdx.ll b/llvm/test/CodeGen/SPIRV/hlsl-resources/InputAttachmentImageNonUniformIdx.ll deleted file mode 100644 index 6dae79c5b385d..0000000000000 --- a/llvm/test/CodeGen/SPIRV/hlsl-resources/InputAttachmentImageNonUniformIdx.ll +++ /dev/null @@ -1,46 +0,0 @@ -; RUN: llc -O0 -verify-machineinstrs -mtriple=spirv1.5-vulkan-library %s -o - | FileCheck %s -; RUN: %if spirv-tools %{ llc -O0 -mtriple=spirv1.5-vulkan-library %s -o - -filetype=obj | spirv-val %} - -; CHECK: OpCapability Shader -; CHECK-NEXT: OpCapability ShaderNonUniformEXT -; CHECK-NEXT: OpCapability InputAttachmentArrayNonUniformIndexing -; SCHECK-NEXT: OpCapability InputAttachment -; CHECK-NOT: OpCapability - -; CHECK-DAG: OpDecorate [[Var:%[0-9]+]] DescriptorSet 3 -; CHECK-DAG: OpDecorate [[Var]] Binding 4 -; CHECK: OpDecorate [[Zero:%[0-9]+]] NonUniform -; CHECK: OpDecorate [[ac0:%[0-9]+]] NonUniform -; CHECK: OpDecorate [[ld0:%[0-9]+]] NonUniform -; CHECK: OpDecorate [[One:%[0-9]+]] NonUniform -; CHECK: OpDecorate [[ac1:%[0-9]+]] NonUniform -; CHECK: OpDecorate [[ld1:%[0-9]+]] NonUniform - -; CHECK-DAG: [[int:%[0-9]+]] = OpTypeInt 32 0 -; CHECK-DAG: [[BufferType:%[0-9]+]] = OpTypeImage [[int]] SubpassData 2 0 0 2 Unknown {{$}} -; CHECK-DAG: 
[[BufferPtrType:%[0-9]+]] = OpTypePointer UniformConstant [[BufferType]] -; CHECK-DAG: [[ArraySize:%[0-9]+]] = OpConstant [[int]] 3 -; CHECK-DAG: [[One]] = OpConstant [[int]] 1 -; CHECK-DAG: [[Zero]] = OpConstant [[int]] 0 -; CHECK-DAG: [[BufferArrayType:%[0-9]+]] = OpTypeArray [[BufferType]] [[ArraySize]] -; CHECK-DAG: [[ArrayPtrType:%[0-9]+]] = OpTypePointer UniformConstant [[BufferArrayType]] -; CHECK-DAG: [[Var]] = OpVariable [[ArrayPtrType]] UniformConstant - -; CHECK: {{%[0-9]+}} = OpFunction {{%[0-9]+}} DontInline {{%[0-9]+}} -; CHECK-NEXT: OpLabel -define void @main() #0 { -; CHECK: [[ac0]] = OpAccessChain [[BufferPtrType]] [[Var]] [[Zero]] -; CHECK: [[ld0]] = OpLoad [[BufferType]] [[ac0]] - %buffer0 = call target("spirv.Image", i32, 6, 2, 0, 0, 2, 0) - @llvm.spv.resource.handlefrombinding.tspirv.Image_f32_6_2_0_0_2_0( - i32 3, i32 4, i32 3, i32 0, i1 true) - -; CHECK: [[ac1:%[0-9]+]] = OpAccessChain [[BufferPtrType]] [[Var]] [[One]] -; CHECK: [[ld1]] = OpLoad [[BufferType]] [[ac1]] - %buffer1 = call target("spirv.Image", i32, 6, 2, 0, 0, 2, 0) - @llvm.spv.resource.handlefrombinding.tspirv.Image_f32_6_2_0_0_2_0( - i32 3, i32 4, i32 3, i32 1, i1 true) - ret void -} - -attributes #0 = { convergent noinline norecurse "frame-pointer"="all" "hlsl.numthreads"="1,1,1" "hlsl.shader"="compute" "no-trapping-math"="true" "stack-protector-buffer-size"="8" } diff --git a/llvm/test/CodeGen/SPIRV/hlsl-resources/SampledImageDynIdx.ll b/llvm/test/CodeGen/SPIRV/hlsl-resources/SampledImageDynIdx.ll deleted file mode 100644 index efd89c5977f97..0000000000000 --- a/llvm/test/CodeGen/SPIRV/hlsl-resources/SampledImageDynIdx.ll +++ /dev/null @@ -1,65 +0,0 @@ -; RUN: llc -O0 -verify-machineinstrs -mtriple=spirv1.5-vulkan-library %s -o - | FileCheck %s -; RUN: %if spirv-tools %{ llc -O0 -mtriple=spirv1.5-vulkan-library %s -o - -filetype=obj | spirv-val %} - -; CHECK: OpCapability Shader -; CHECK-NEXT: OpCapability SampledImageArrayDynamicIndexing -; CHECK-NEXT: OpCapability 
Sampled1D -; CHECK-NOT: OpCapability - -; CHECK-DAG: OpDecorate [[Var:%[0-9]+]] DescriptorSet 3 -; CHECK-DAG: OpDecorate [[Var]] Binding 4 -; CHECK-DAG: OpDecorate [[OtherVar:%[0-9]+]] DescriptorSet 3 -; CHECK-DAG: OpDecorate [[OtherVar]] Binding 4 - -; CHECK-DAG: [[int:%[0-9]+]] = OpTypeInt 32 0 -; CHECK-DAG: [[BufferType:%[0-9]+]] = OpTypeImage [[int]] 1D 2 0 0 1 R32i {{$}} -; CHECK-DAG: [[BufferPtrType:%[0-9]+]] = OpTypePointer UniformConstant [[BufferType]] -; CHECK-DAG: [[ArraySize:%[0-9]+]] = OpConstant [[int]] 3 -; CHECK-DAG: [[One:%[0-9]+]] = OpConstant [[int]] 1 -; CHECK-DAG: [[Zero:%[0-9]+]] = OpConstant [[int]] 0 -; CHECK-DAG: [[BufferArrayType:%[0-9]+]] = OpTypeArray [[BufferType]] [[ArraySize]] -; CHECK-DAG: [[ArrayPtrType:%[0-9]+]] = OpTypePointer UniformConstant [[BufferArrayType]] -; CHECK-DAG: [[Var]] = OpVariable [[ArrayPtrType]] UniformConstant - -; CHECK-DAG: [[OtherArraySize:%[0-9]+]] = OpConstant [[int]] 5 -; CHECK-DAG: [[OtherBufferArrayType:%[0-9]+]] = OpTypeArray [[BufferType]] [[OtherArraySize]] -; CHECK-DAG: [[OtherArrayPtrType:%[0-9]+]] = OpTypePointer UniformConstant [[OtherBufferArrayType]] -; CHECK-DAG: [[OtherVar]] = OpVariable [[OtherArrayPtrType]] UniformConstant - -; CHECK: {{%[0-9]+}} = OpFunction {{%[0-9]+}} DontInline {{%[0-9]+}} -; CHECK-NEXT: OpLabel -define void @main() #0 { -; CHECK: [[ac:%[0-9]+]] = OpAccessChain [[BufferPtrType]] [[Var]] [[Zero]] -; CHECK: [[buffer:%[0-9]+]] = OpLoad [[BufferType]] [[ac]] - %buffer0 = call target("spirv.Image", i32, 0, 2, 0, 0, 1, 24) - @llvm.spv.resource.handlefrombinding.tspirv.Image_f32_0_2_0_0_1_24( - i32 3, i32 4, i32 3, i32 0, i1 false) - -; CHECK: [[ac:%[0-9]+]] = OpAccessChain [[BufferPtrType]] [[Var]] [[One]] -; CHECK: [[buffer:%[0-9]+]] = OpLoad [[BufferType]] [[ac]] - %buffer1 = call target("spirv.Image", i32, 0, 2, 0, 0, 1, 24) - @llvm.spv.resource.handlefrombinding.tspirv.Image_f32_0_2_0_0_1_24( - i32 3, i32 4, i32 3, i32 1, i1 false) - ret void -} - -; CHECK: {{%[0-9]+}} = 
OpFunction {{%[0-9]+}} DontInline {{%[0-9]+}} -; CHECK-NEXT: OpLabel -define void @DifferentArraySizesAreDifferentVariables() #0 { -; Make sure we use different variables when the array sizes are different -; same in case one function calls the other. -; CHECK: [[ac:%[0-9]+]] = OpAccessChain [[BufferPtrType]] [[Var]] [[Zero]] -; CHECK: [[buffer:%[0-9]+]] = OpLoad [[BufferType]] [[ac]] - %buffer0 = call target("spirv.Image", i32, 0, 2, 0, 0, 1, 24) - @llvm.spv.resource.handlefrombinding.tspirv.Image_f32_0_2_0_0_1_24( - i32 3, i32 4, i32 3, i32 0, i1 false) - -; CHECK: [[ac:%[0-9]+]] = OpAccessChain [[BufferPtrType]] [[OtherVar]] [[One]] -; CHECK: [[buffer:%[0-9]+]] = OpLoad [[BufferType]] [[ac]] - %buffer1 = call target("spirv.Image", i32, 0, 2, 0, 0, 1, 24) - @llvm.spv.resource.handlefrombinding.tspirv.Image_f32_0_2_0_0_1_24( - i32 3, i32 4, i32 5, i32 1, i1 false) - ret void -} - -attributes #0 = { convergent noinline norecurse "frame-pointer"="all" "hlsl.numthreads"="1,1,1" "hlsl.shader"="compute" "no-trapping-math"="true" "stack-protector-buffer-size"="8" } diff --git a/llvm/test/CodeGen/SPIRV/hlsl-resources/SampledImageNonUniformIdx.ll b/llvm/test/CodeGen/SPIRV/hlsl-resources/SampledImageNonUniformIdx.ll deleted file mode 100644 index 6d93051ce3f0a..0000000000000 --- a/llvm/test/CodeGen/SPIRV/hlsl-resources/SampledImageNonUniformIdx.ll +++ /dev/null @@ -1,46 +0,0 @@ -; RUN: llc -O0 -verify-machineinstrs -mtriple=spirv1.5-vulkan-library %s -o - | FileCheck %s -; RUN: %if spirv-tools %{ llc -O0 -mtriple=spirv1.5-vulkan-library %s -o - -filetype=obj | spirv-val %} - -; CHECK: OpCapability Shader -; CHECK-NEXT: OpCapability ShaderNonUniformEXT -; CHECK-NEXT: OpCapability SampledImageArrayNonUniformIndexing -; CHECK-NEXT: OpCapability Sampled1D -; CHECK-NOT: OpCapability - -; CHECK-DAG: OpDecorate [[Var:%[0-9]+]] DescriptorSet 3 -; CHECK-DAG: OpDecorate [[Var]] Binding 4 -; CHECK: OpDecorate [[Zero:%[0-9]+]] NonUniform -; CHECK: OpDecorate [[ac0:%[0-9]+]] NonUniform 
-; CHECK: OpDecorate [[ld0:%[0-9]+]] NonUniform -; CHECK: OpDecorate [[One:%[0-9]+]] NonUniform -; CHECK: OpDecorate [[ac1:%[0-9]+]] NonUniform -; CHECK: OpDecorate [[ld1:%[0-9]+]] NonUniform - -; CHECK-DAG: [[int:%[0-9]+]] = OpTypeInt 32 0 -; CHECK-DAG: [[BufferType:%[0-9]+]] = OpTypeImage [[int]] 1D 2 0 0 1 R32i {{$}} -; CHECK-DAG: [[BufferPtrType:%[0-9]+]] = OpTypePointer UniformConstant [[BufferType]] -; CHECK-DAG: [[ArraySize:%[0-9]+]] = OpConstant [[int]] 3 -; CHECK-DAG: [[One]] = OpConstant [[int]] 1 -; CHECK-DAG: [[Zero]] = OpConstant [[int]] 0 -; CHECK-DAG: [[BufferArrayType:%[0-9]+]] = OpTypeArray [[BufferType]] [[ArraySize]] -; CHECK-DAG: [[ArrayPtrType:%[0-9]+]] = OpTypePointer UniformConstant [[BufferArrayType]] -; CHECK-DAG: [[Var]] = OpVariable [[ArrayPtrType]] UniformConstant - -; CHECK: {{%[0-9]+}} = OpFunction {{%[0-9]+}} DontInline {{%[0-9]+}} -; CHECK-NEXT: OpLabel -define void @main() #0 { -; CHECK: [[ac0]] = OpAccessChain [[BufferPtrType]] [[Var]] [[Zero]] -; CHECK: [[ld0]] = OpLoad [[BufferType]] [[ac0]] - %buffer0 = call target("spirv.Image", i32, 0, 2, 0, 0, 1, 24) - @llvm.spv.resource.handlefrombinding.tspirv.Image_f32_0_2_0_0_1_24( - i32 3, i32 4, i32 3, i32 0, i1 true) - -; CHECK: [[ac1:%[0-9]+]] = OpAccessChain [[BufferPtrType]] [[Var]] [[One]] -; CHECK: [[ld1]] = OpLoad [[BufferType]] [[ac1]] - %buffer1 = call target("spirv.Image", i32, 0, 2, 0, 0, 1, 24) - @llvm.spv.resource.handlefrombinding.tspirv.Image_f32_0_2_0_0_1_24( - i32 3, i32 4, i32 3, i32 1, i1 true) - ret void -} - -attributes #0 = { convergent noinline norecurse "frame-pointer"="all" "hlsl.numthreads"="1,1,1" "hlsl.shader"="compute" "no-trapping-math"="true" "stack-protector-buffer-size"="8" } diff --git a/llvm/test/CodeGen/SPIRV/hlsl-resources/SamplerArrayDynIdx.ll b/llvm/test/CodeGen/SPIRV/hlsl-resources/SamplerArrayDynIdx.ll deleted file mode 100644 index fd276e9ef4a98..0000000000000 --- a/llvm/test/CodeGen/SPIRV/hlsl-resources/SamplerArrayDynIdx.ll +++ /dev/null @@ 
-1,38 +0,0 @@ -; RUN: llc -O0 -verify-machineinstrs -mtriple=spirv1.5-vulkan-library %s -o - | FileCheck %s -; RUN: %if spirv-tools %{ llc -O0 -mtriple=spirv1.5-vulkan-library %s -o - -filetype=obj | spirv-val %} - -; CHECK: OpCapability Shader -; CHECK-NEXT: OpCapability SampledImageArrayDynamicIndexing -; CHECK-NOT: OpCapability - -; CHECK-DAG: OpDecorate [[Var:%[0-9]+]] DescriptorSet 3 -; CHECK-DAG: OpDecorate [[Var]] Binding 4 - -; CHECK-DAG: [[int:%[0-9]+]] = OpTypeInt 32 0 -; CHECK-DAG: [[SamplerType:%[0-9]+]] = OpTypeSampler -; CHECK-DAG: [[SamplerPtrType:%[0-9]+]] = OpTypePointer UniformConstant [[SamplerType]] -; CHECK-DAG: [[ArraySize:%[0-9]+]] = OpConstant [[int]] 3 -; CHECK-DAG: [[One:%[0-9]+]] = OpConstant [[int]] 1 -; CHECK-DAG: [[Zero:%[0-9]+]] = OpConstant [[int]] 0 -; CHECK-DAG: [[SamplerArrayType:%[0-9]+]] = OpTypeArray [[SamplerType]] [[ArraySize]] -; CHECK-DAG: [[ArrayPtrType:%[0-9]+]] = OpTypePointer UniformConstant [[SamplerArrayType]] -; CHECK-DAG: [[Var]] = OpVariable [[ArrayPtrType]] UniformConstant - -; CHECK: {{%[0-9]+}} = OpFunction {{%[0-9]+}} DontInline {{%[0-9]+}} -; CHECK-NEXT: OpLabel -define void @main() #0 { -; CHECK: [[ac:%[0-9]+]] = OpAccessChain [[SamplerPtrType]] [[Var]] [[Zero]] -; CHECK: [[buffer:%[0-9]+]] = OpLoad [[SamplerType]] [[ac]] - %buffer0 = call target("spirv.Sampler") - @llvm.spv.resource.handlefrombinding.tspirv.Image( - i32 3, i32 4, i32 3, i32 0, i1 false) - -; CHECK: [[ac:%[0-9]+]] = OpAccessChain [[SamplerPtrType]] [[Var]] [[One]] -; CHECK: [[buffer:%[0-9]+]] = OpLoad [[SamplerType]] [[ac]] - %buffer1 = call target("spirv.Sampler") - @llvm.spv.resource.handlefrombinding.tspirv.Image( - i32 3, i32 4, i32 3, i32 1, i1 false) - ret void -} - -attributes #0 = { convergent noinline norecurse "frame-pointer"="all" "hlsl.numthreads"="1,1,1" "hlsl.shader"="compute" "no-trapping-math"="true" "stack-protector-buffer-size"="8" } diff --git a/llvm/test/CodeGen/SPIRV/hlsl-resources/SamplerArrayNonUniformIdx.ll 
b/llvm/test/CodeGen/SPIRV/hlsl-resources/SamplerArrayNonUniformIdx.ll deleted file mode 100644 index 3e59d66febf0b..0000000000000 --- a/llvm/test/CodeGen/SPIRV/hlsl-resources/SamplerArrayNonUniformIdx.ll +++ /dev/null @@ -1,45 +0,0 @@ -; RUN: llc -O0 -verify-machineinstrs -mtriple=spirv1.5-vulkan-library %s -o - | FileCheck %s -; RUN: %if spirv-tools %{ llc -O0 -mtriple=spirv1.5-vulkan-library %s -o - -filetype=obj | spirv-val %} - -; CHECK: OpCapability Shader -; CHECK-NEXT: ShaderNonUniform -; CHECK-NEXT: OpCapability SampledImageArrayNonUniformIndexing -; CHECK-NOT: OpCapability - -; CHECK-DAG: OpDecorate [[Var:%[0-9]+]] DescriptorSet 3 -; CHECK-DAG: OpDecorate [[Var]] Binding 4 -; CHECK: OpDecorate [[Zero:%[0-9]+]] NonUniform -; CHECK: OpDecorate [[ac0:%[0-9]+]] NonUniform -; CHECK: OpDecorate [[ld0:%[0-9]+]] NonUniform -; CHECK: OpDecorate [[One:%[0-9]+]] NonUniform -; CHECK: OpDecorate [[ac1:%[0-9]+]] NonUniform -; CHECK: OpDecorate [[ld1:%[0-9]+]] NonUniform - -; CHECK-DAG: [[int:%[0-9]+]] = OpTypeInt 32 0 -; CHECK-DAG: [[SamplerType:%[0-9]+]] = OpTypeSampler -; CHECK-DAG: [[SamplerPtrType:%[0-9]+]] = OpTypePointer UniformConstant [[SamplerType]] -; CHECK-DAG: [[ArraySize:%[0-9]+]] = OpConstant [[int]] 3 -; CHECK-DAG: [[One]] = OpConstant [[int]] 1 -; CHECK-DAG: [[Zero]] = OpConstant [[int]] 0 -; CHECK-DAG: [[SamplerArrayType:%[0-9]+]] = OpTypeArray [[SamplerType]] [[ArraySize]] -; CHECK-DAG: [[ArrayPtrType:%[0-9]+]] = OpTypePointer UniformConstant [[SamplerArrayType]] -; CHECK-DAG: [[Var]] = OpVariable [[ArrayPtrType]] UniformConstant - -; CHECK: {{%[0-9]+}} = OpFunction {{%[0-9]+}} DontInline {{%[0-9]+}} -; CHECK-NEXT: OpLabel -define void @main() #0 { -; CHECK: [[ac0]] = OpAccessChain [[SamplerPtrType]] [[Var]] [[Zero]] -; CHECK: [[ld0]] = OpLoad [[SamplerType]] [[ac0]] - %buffer0 = call target("spirv.Sampler") - @llvm.spv.resource.handlefrombinding.tspirv.Image( - i32 3, i32 4, i32 3, i32 0, i1 true) - -; CHECK: [[ac1:%[0-9]+]] = OpAccessChain 
[[SamplerPtrType]] [[Var]] [[One]] -; CHECK: [[ld1]] = OpLoad [[SamplerType]] [[ac1]] - %buffer1 = call target("spirv.Sampler") - @llvm.spv.resource.handlefrombinding.tspirv.Image( - i32 3, i32 4, i32 3, i32 1, i1 true) - ret void -} - -attributes #0 = { convergent noinline norecurse "frame-pointer"="all" "hlsl.numthreads"="1,1,1" "hlsl.shader"="compute" "no-trapping-math"="true" "stack-protector-buffer-size"="8" } diff --git a/llvm/test/CodeGen/SPIRV/hlsl-resources/ScalarResourceType.ll b/llvm/test/CodeGen/SPIRV/hlsl-resources/ScalarResourceType.ll index 52cc2275bc7a6..f52fd44bf3801 100644 --- a/llvm/test/CodeGen/SPIRV/hlsl-resources/ScalarResourceType.ll +++ b/llvm/test/CodeGen/SPIRV/hlsl-resources/ScalarResourceType.ll @@ -22,12 +22,16 @@ define void @RWBufferLoad() #0 { %buffer0 = call target("spirv.Image", i32, 5, 2, 0, 0, 2, 24) @llvm.spv.resource.handlefrombinding.tspirv.Image_f32_5_2_0_0_2_24( i32 16, i32 7, i32 1, i32 0, i1 false) + %ptr0 = tail call noundef nonnull align 4 dereferenceable(4) ptr @llvm.spv.resource.getpointer.p0.tspirv.Image_f32_5_2_0_0_2_0t(target("spirv.Image", i32, 5, 2, 0, 0, 2, 24) %buffer0, i32 0) + store i32 0, ptr %ptr0, align 4 ; Make sure we use the same variable with multiple loads. 
; CHECK: [[buffer:%[0-9]+]] = OpLoad [[RWBufferTypeInt]] [[IntBufferVar]] %buffer1 = call target("spirv.Image", i32, 5, 2, 0, 0, 2, 24) @llvm.spv.resource.handlefrombinding.tspirv.Image_f32_5_2_0_0_2_24( i32 16, i32 7, i32 1, i32 0, i1 false) + %ptr1 = tail call noundef nonnull align 4 dereferenceable(4) ptr @llvm.spv.resource.getpointer.p0.tspirv.Image_f32_5_2_0_0_2_0t(target("spirv.Image", i32, 5, 2, 0, 0, 2, 24) %buffer1, i32 0) + store i32 0, ptr %ptr1, align 4 ret void } @@ -40,6 +44,8 @@ define void @UseDifferentGlobalVar() #0 { %buffer0 = call target("spirv.Image", float, 5, 2, 0, 0, 2, 3) @llvm.spv.resource.handlefrombinding.tspirv.Image_f32_5_2_0_0_2_3( i32 16, i32 7, i32 1, i32 0, i1 false) + %ptr0 = tail call noundef nonnull align 4 dereferenceable(4) ptr @llvm.spv.resource.getpointer.p0.tspirv.Image_f32_5_2_0_0_2_0t(target("spirv.Image", float, 5, 2, 0, 0, 2, 3) %buffer0, i32 0) + store float 0.0, ptr %ptr0, align 4 ret void } @@ -52,6 +58,8 @@ define void @ReuseGlobalVarFromFirstFunction() #0 { %buffer1 = call target("spirv.Image", i32, 5, 2, 0, 0, 2, 24) @llvm.spv.resource.handlefrombinding.tspirv.Image_f32_5_2_0_0_2_24( i32 16, i32 7, i32 1, i32 0, i1 false) + %ptr1 = tail call noundef nonnull align 4 dereferenceable(4) ptr @llvm.spv.resource.getpointer.p0.tspirv.Image_f32_5_2_0_0_2_0t(target("spirv.Image", i32, 5, 2, 0, 0, 2, 24) %buffer1, i32 0) + store i32 0, ptr %ptr1, align 4 ret void } diff --git a/llvm/test/CodeGen/SPIRV/hlsl-resources/StorageImageDynIdx.ll b/llvm/test/CodeGen/SPIRV/hlsl-resources/StorageImageDynIdx.ll index 082a5c832f1c4..94ebe74148b95 100644 --- a/llvm/test/CodeGen/SPIRV/hlsl-resources/StorageImageDynIdx.ll +++ b/llvm/test/CodeGen/SPIRV/hlsl-resources/StorageImageDynIdx.ll @@ -27,12 +27,16 @@ define void @main() #0 { %buffer0 = call target("spirv.Image", i32, 0, 2, 0, 0, 2, 24) @llvm.spv.resource.handlefrombinding.tspirv.Image_f32_0_2_0_0_2_24( i32 3, i32 4, i32 3, i32 0, i1 false) + %ptr0 = tail call noundef nonnull align 4 
dereferenceable(4) ptr @llvm.spv.resource.getpointer.p0.tspirv.Image_f32_5_2_0_0_2_0t(target("spirv.Image", i32, 0, 2, 0, 0, 2, 24) %buffer0, i32 0) + store i32 0, ptr %ptr0, align 4 ; CHECK: [[ac:%[0-9]+]] = OpAccessChain [[BufferPtrType]] [[Var]] [[One]] ; CHECK: [[buffer:%[0-9]+]] = OpLoad [[BufferType]] [[ac]] %buffer1 = call target("spirv.Image", i32, 0, 2, 0, 0, 2, 24) @llvm.spv.resource.handlefrombinding.tspirv.Image_f32_0_2_0_0_2_24( i32 3, i32 4, i32 3, i32 1, i1 false) + %ptr1 = tail call noundef nonnull align 4 dereferenceable(4) ptr @llvm.spv.resource.getpointer.p0.tspirv.Image_f32_5_2_0_0_2_0t(target("spirv.Image", i32, 0, 2, 0, 0, 2, 24) %buffer1, i32 0) + store i32 0, ptr %ptr1, align 4 ret void } diff --git a/llvm/test/CodeGen/SPIRV/hlsl-resources/StorageImageNonUniformIdx.ll b/llvm/test/CodeGen/SPIRV/hlsl-resources/StorageImageNonUniformIdx.ll index d6419492bb952..f9466e431c19c 100644 --- a/llvm/test/CodeGen/SPIRV/hlsl-resources/StorageImageNonUniformIdx.ll +++ b/llvm/test/CodeGen/SPIRV/hlsl-resources/StorageImageNonUniformIdx.ll @@ -34,12 +34,16 @@ define void @main() #0 { %buffer0 = call target("spirv.Image", i32, 0, 2, 0, 0, 2, 24) @llvm.spv.resource.handlefrombinding.tspirv.Image_f32_0_2_0_0_2_24( i32 3, i32 4, i32 3, i32 0, i1 true) + %ptr0 = tail call noundef nonnull align 4 dereferenceable(4) ptr @llvm.spv.resource.getpointer.p0.tspirv.Image_f32_5_2_0_0_2_0t(target("spirv.Image", i32, 0, 2, 0, 0, 2, 24) %buffer0, i32 0) + store i32 0, ptr %ptr0, align 4 ; CHECK: [[ac1:%[0-9]+]] = OpAccessChain [[BufferPtrType]] [[Var]] [[One]] ; CHECK: [[ld1]] = OpLoad [[BufferType]] [[ac1]] %buffer1 = call target("spirv.Image", i32, 0, 2, 0, 0, 2, 24) @llvm.spv.resource.handlefrombinding.tspirv.Image_f32_0_2_0_0_2_24( i32 3, i32 4, i32 3, i32 1, i1 true) + %ptr1 = tail call noundef nonnull align 4 dereferenceable(4) ptr @llvm.spv.resource.getpointer.p0.tspirv.Image_f32_5_2_0_0_2_0t(target("spirv.Image", i32, 0, 2, 0, 0, 2, 24) %buffer1, i32 0) + store i32 
0, ptr %ptr1, align 4 ret void } diff --git a/llvm/test/CodeGen/SPIRV/hlsl-resources/StorageTexelBufferDynIdx.ll b/llvm/test/CodeGen/SPIRV/hlsl-resources/StorageTexelBufferDynIdx.ll deleted file mode 100644 index 31fdcb362eb73..0000000000000 --- a/llvm/test/CodeGen/SPIRV/hlsl-resources/StorageTexelBufferDynIdx.ll +++ /dev/null @@ -1,39 +0,0 @@ -; RUN: llc -O0 -verify-machineinstrs -mtriple=spirv1.5-vulkan-library %s -o - | FileCheck %s -; RUN: %if spirv-tools %{ llc -O0 -mtriple=spirv1.5-vulkan-library %s -o - -filetype=obj | spirv-val %} - -; CHECK: OpCapability Shader -; SCHECK-NEXT: OpCapability ImageBuffer -; CHECK-NEXT: OpCapability StorageTexelBufferArrayDynamicIndexing -; CHECK-NOT: OpCapability - -; CHECK-DAG: OpDecorate [[Var:%[0-9]+]] DescriptorSet 3 -; CHECK-DAG: OpDecorate [[Var]] Binding 4 - -; CHECK-DAG: [[int:%[0-9]+]] = OpTypeInt 32 0 -; CHECK-DAG: [[BufferType:%[0-9]+]] = OpTypeImage [[int]] Buffer 2 0 0 2 R32i {{$}} -; CHECK-DAG: [[BufferPtrType:%[0-9]+]] = OpTypePointer UniformConstant [[BufferType]] -; CHECK-DAG: [[ArraySize:%[0-9]+]] = OpConstant [[int]] 3 -; CHECK-DAG: [[One:%[0-9]+]] = OpConstant [[int]] 1 -; CHECK-DAG: [[Zero:%[0-9]+]] = OpConstant [[int]] 0 -; CHECK-DAG: [[BufferArrayType:%[0-9]+]] = OpTypeArray [[BufferType]] [[ArraySize]] -; CHECK-DAG: [[ArrayPtrType:%[0-9]+]] = OpTypePointer UniformConstant [[BufferArrayType]] -; CHECK-DAG: [[Var]] = OpVariable [[ArrayPtrType]] UniformConstant - -; CHECK: {{%[0-9]+}} = OpFunction {{%[0-9]+}} DontInline {{%[0-9]+}} -; CHECK-NEXT: OpLabel -define void @void() #0 { -; CHECK: [[ac:%[0-9]+]] = OpAccessChain [[BufferPtrType]] [[Var]] [[Zero]] -; CHECK: [[buffer:%[0-9]+]] = OpLoad [[BufferType]] [[ac]] - %buffer0 = call target("spirv.Image", i32, 5, 2, 0, 0, 2, 24) - @llvm.spv.resource.handlefrombinding.tspirv.Image_f32_5_2_0_0_2_24( - i32 3, i32 4, i32 3, i32 0, i1 false) - -; CHECK: [[ac:%[0-9]+]] = OpAccessChain [[BufferPtrType]] [[Var]] [[One]] -; CHECK: [[buffer:%[0-9]+]] = OpLoad 
[[BufferType]] [[ac]] - %buffer1 = call target("spirv.Image", i32, 5, 2, 0, 0, 2, 24) - @llvm.spv.resource.handlefrombinding.tspirv.Image_f32_5_2_0_0_2_24( - i32 3, i32 4, i32 3, i32 1, i1 false) - ret void -} - -attributes #0 = { convergent noinline norecurse "frame-pointer"="all" "hlsl.numthreads"="1,1,1" "hlsl.shader"="compute" "no-trapping-math"="true" "stack-protector-buffer-size"="8" } diff --git a/llvm/test/CodeGen/SPIRV/hlsl-resources/StorageTexelBufferNonUniformIdx.ll b/llvm/test/CodeGen/SPIRV/hlsl-resources/StorageTexelBufferNonUniformIdx.ll deleted file mode 100644 index a5608979025fe..0000000000000 --- a/llvm/test/CodeGen/SPIRV/hlsl-resources/StorageTexelBufferNonUniformIdx.ll +++ /dev/null @@ -1,46 +0,0 @@ -; RUN: llc -O0 -verify-machineinstrs -mtriple=spirv1.5-vulkan-library %s -o - | FileCheck %s -; RUN: %if spirv-tools %{ llc -O0 -mtriple=spirv1.5-vulkan-library %s -o - -filetype=obj | spirv-val %} - -; CHECK: OpCapability Shader -; SCHECK-NEXT: OpCapability ImageBuffer -; CHECK-NEXT: OpCapability ShaderNonUniformEXT -; CHECK-NEXT: OpCapability StorageTexelBufferArrayNonUniformIndexingEXT -; CHECK-NOT: OpCapability - -; CHECK-DAG: OpDecorate [[Var:%[0-9]+]] DescriptorSet 3 -; CHECK-DAG: OpDecorate [[Var]] Binding 4 -; CHECK: OpDecorate [[Zero:%[0-9]+]] NonUniform -; CHECK: OpDecorate [[ac0:%[0-9]+]] NonUniform -; CHECK: OpDecorate [[ld0:%[0-9]+]] NonUniform -; CHECK: OpDecorate [[One:%[0-9]+]] NonUniform -; CHECK: OpDecorate [[ac1:%[0-9]+]] NonUniform -; CHECK: OpDecorate [[ld1:%[0-9]+]] NonUniform - -; CHECK-DAG: [[int:%[0-9]+]] = OpTypeInt 32 0 -; CHECK-DAG: [[BufferType:%[0-9]+]] = OpTypeImage [[int]] Buffer 2 0 0 2 R32i {{$}} -; CHECK-DAG: [[BufferPtrType:%[0-9]+]] = OpTypePointer UniformConstant [[BufferType]] -; CHECK-DAG: [[ArraySize:%[0-9]+]] = OpConstant [[int]] 3 -; CHECK-DAG: [[One]] = OpConstant [[int]] 1 -; CHECK-DAG: [[Zero]] = OpConstant [[int]] 0 -; CHECK-DAG: [[BufferArrayType:%[0-9]+]] = OpTypeArray [[BufferType]] [[ArraySize]] -; 
CHECK-DAG: [[ArrayPtrType:%[0-9]+]] = OpTypePointer UniformConstant [[BufferArrayType]] -; CHECK-DAG: [[Var]] = OpVariable [[ArrayPtrType]] UniformConstant - -; CHECK: {{%[0-9]+}} = OpFunction {{%[0-9]+}} DontInline {{%[0-9]+}} -; CHECK-NEXT: OpLabel -define void @main() #0 { -; CHECK: [[ac0]] = OpAccessChain [[BufferPtrType]] [[Var]] [[Zero]] -; CHECK: [[ld0]] = OpLoad [[BufferType]] [[ac0]] - %buffer0 = call target("spirv.Image", i32, 5, 2, 0, 0, 2, 24) - @llvm.spv.resource.handlefrombinding.tspirv.Image_f32_5_2_0_0_2_24( - i32 3, i32 4, i32 3, i32 0, i1 true) - -; CHECK: [[ac1:%[0-9]+]] = OpAccessChain [[BufferPtrType]] [[Var]] [[One]] -; CHECK: [[ld1]] = OpLoad [[BufferType]] [[ac1]] - %buffer1 = call target("spirv.Image", i32, 5, 2, 0, 0, 2, 24) - @llvm.spv.resource.handlefrombinding.tspirv.Image_f32_5_2_0_0_2_24( - i32 3, i32 4, i32 3, i32 1, i1 true) - ret void -} - -attributes #0 = { convergent noinline norecurse "frame-pointer"="all" "hlsl.numthreads"="1,1,1" "hlsl.shader"="compute" "no-trapping-math"="true" "stack-protector-buffer-size"="8" } diff --git a/llvm/test/CodeGen/SPIRV/hlsl-resources/UniformTexelBufferDynIdx.ll b/llvm/test/CodeGen/SPIRV/hlsl-resources/UniformTexelBufferDynIdx.ll deleted file mode 100644 index 131a6b38d393e..0000000000000 --- a/llvm/test/CodeGen/SPIRV/hlsl-resources/UniformTexelBufferDynIdx.ll +++ /dev/null @@ -1,39 +0,0 @@ -; RUN: llc -O0 -verify-machineinstrs -mtriple=spirv1.5-vulkan-library %s -o - | FileCheck %s -; RUN: %if spirv-tools %{ llc -O0 -mtriple=spirv1.5-vulkan-library %s -o - -filetype=obj | spirv-val %} - -; CHECK: OpCapability Shader -; SCHECK-NEXT: OpCapability SampledBuffer -; CHECK-NEXT: OpCapability UniformTexelBufferArrayDynamicIndexing -; CHECK-NOT: OpCapability - -; CHECK-DAG: OpDecorate [[Var:%[0-9]+]] DescriptorSet 3 -; CHECK-DAG: OpDecorate [[Var]] Binding 4 - -; CHECK-DAG: [[int:%[0-9]+]] = OpTypeInt 32 0 -; CHECK-DAG: [[BufferType:%[0-9]+]] = OpTypeImage [[int]] Buffer 2 0 0 1 R32i {{$}} -; CHECK-DAG: 
[[BufferPtrType:%[0-9]+]] = OpTypePointer UniformConstant [[BufferType]] -; CHECK-DAG: [[ArraySize:%[0-9]+]] = OpConstant [[int]] 3 -; CHECK-DAG: [[One:%[0-9]+]] = OpConstant [[int]] 1 -; CHECK-DAG: [[Zero:%[0-9]+]] = OpConstant [[int]] 0 -; CHECK-DAG: [[BufferArrayType:%[0-9]+]] = OpTypeArray [[BufferType]] [[ArraySize]] -; CHECK-DAG: [[ArrayPtrType:%[0-9]+]] = OpTypePointer UniformConstant [[BufferArrayType]] -; CHECK-DAG: [[Var]] = OpVariable [[ArrayPtrType]] UniformConstant - -; CHECK: {{%[0-9]+}} = OpFunction {{%[0-9]+}} DontInline {{%[0-9]+}} -; CHECK-NEXT: OpLabel -define void @main() #0 { -; CHECK: [[ac:%[0-9]+]] = OpAccessChain [[BufferPtrType]] [[Var]] [[Zero]] -; CHECK: [[buffer:%[0-9]+]] = OpLoad [[BufferType]] [[ac]] - %buffer0 = call target("spirv.Image", i32, 5, 2, 0, 0, 1, 24) - @llvm.spv.resource.handlefrombinding.tspirv.Image_f32_5_2_0_0_1_24( - i32 3, i32 4, i32 3, i32 0, i1 false) - -; CHECK: [[ac:%[0-9]+]] = OpAccessChain [[BufferPtrType]] [[Var]] [[One]] -; CHECK: [[buffer:%[0-9]+]] = OpLoad [[BufferType]] [[ac]] - %buffer1 = call target("spirv.Image", i32, 5, 2, 0, 0, 1, 24) - @llvm.spv.resource.handlefrombinding.tspirv.Image_f32_5_2_0_0_1_24( - i32 3, i32 4, i32 3, i32 1, i1 false) - ret void -} - -attributes #0 = { convergent noinline norecurse "frame-pointer"="all" "hlsl.numthreads"="1,1,1" "hlsl.shader"="compute" "no-trapping-math"="true" "stack-protector-buffer-size"="8" } diff --git a/llvm/test/CodeGen/SPIRV/hlsl-resources/UniformTexelBufferNonUniformIdx.ll b/llvm/test/CodeGen/SPIRV/hlsl-resources/UniformTexelBufferNonUniformIdx.ll deleted file mode 100644 index cfb3eb5f52076..0000000000000 --- a/llvm/test/CodeGen/SPIRV/hlsl-resources/UniformTexelBufferNonUniformIdx.ll +++ /dev/null @@ -1,46 +0,0 @@ -; RUN: llc -O0 -verify-machineinstrs -mtriple=spirv1.5-vulkan-library %s -o - | FileCheck %s -; RUN: %if spirv-tools %{ llc -O0 -mtriple=spirv1.5-vulkan-library %s -o - -filetype=obj | spirv-val %} - -; CHECK: OpCapability Shader -; 
SCHECK-NEXT: OpCapability SampledBuffer -; CHECK-NEXT: OpCapability ShaderNonUniformEXT -; CHECK-NEXT: OpCapability UniformTexelBufferArrayNonUniformIndexing -; CHECK-NOT: OpCapability - -; CHECK-DAG: OpDecorate [[Var:%[0-9]+]] DescriptorSet 3 -; CHECK-DAG: OpDecorate [[Var]] Binding 4 -; CHECK: OpDecorate [[Zero:%[0-9]+]] NonUniform -; CHECK: OpDecorate [[ac0:%[0-9]+]] NonUniform -; CHECK: OpDecorate [[ld0:%[0-9]+]] NonUniform -; CHECK: OpDecorate [[One:%[0-9]+]] NonUniform -; CHECK: OpDecorate [[ac1:%[0-9]+]] NonUniform -; CHECK: OpDecorate [[ld1:%[0-9]+]] NonUniform - -; CHECK-DAG: [[int:%[0-9]+]] = OpTypeInt 32 0 -; CHECK-DAG: [[BufferType:%[0-9]+]] = OpTypeImage [[int]] Buffer 2 0 0 1 R32i {{$}} -; CHECK-DAG: [[BufferPtrType:%[0-9]+]] = OpTypePointer UniformConstant [[BufferType]] -; CHECK-DAG: [[ArraySize:%[0-9]+]] = OpConstant [[int]] 3 -; CHECK-DAG: [[One]] = OpConstant [[int]] 1 -; CHECK-DAG: [[Zero]] = OpConstant [[int]] 0 -; CHECK-DAG: [[BufferArrayType:%[0-9]+]] = OpTypeArray [[BufferType]] [[ArraySize]] -; CHECK-DAG: [[ArrayPtrType:%[0-9]+]] = OpTypePointer UniformConstant [[BufferArrayType]] -; CHECK-DAG: [[Var]] = OpVariable [[ArrayPtrType]] UniformConstant - -; CHECK: {{%[0-9]+}} = OpFunction {{%[0-9]+}} DontInline {{%[0-9]+}} -; CHECK-NEXT: OpLabel -define void @main() #0 { -; CHECK: [[ac0]] = OpAccessChain [[BufferPtrType]] [[Var]] [[Zero]] -; CHECK: [[ld0]] = OpLoad [[BufferType]] [[ac0]] - %buffer0 = call target("spirv.Image", i32, 5, 2, 0, 0, 1, 24) - @llvm.spv.resource.handlefrombinding.tspirv.Image_f32_5_2_0_0_1_24( - i32 3, i32 4, i32 3, i32 0, i1 true) - -; CHECK: [[ac1:%[0-9]+]] = OpAccessChain [[BufferPtrType]] [[Var]] [[One]] -; CHECK: [[ld1]] = OpLoad [[BufferType]] [[ac1]] - %buffer1 = call target("spirv.Image", i32, 5, 2, 0, 0, 1, 24) - @llvm.spv.resource.handlefrombinding.tspirv.Image_f32_5_2_0_0_1_24( - i32 3, i32 4, i32 3, i32 1, i1 true) - ret void -} - -attributes #0 = { convergent noinline norecurse "frame-pointer"="all" 
"hlsl.numthreads"="1,1,1" "hlsl.shader"="compute" "no-trapping-math"="true" "stack-protector-buffer-size"="8" } diff --git a/llvm/test/CodeGen/SPIRV/hlsl-resources/UnknownBufferStore.ll b/llvm/test/CodeGen/SPIRV/hlsl-resources/UnknownBufferStore.ll index 4c6f9bfd97ed7..a4123c36a4488 100644 --- a/llvm/test/CodeGen/SPIRV/hlsl-resources/UnknownBufferStore.ll +++ b/llvm/test/CodeGen/SPIRV/hlsl-resources/UnknownBufferStore.ll @@ -18,13 +18,14 @@ declare <4 x i32> @get_data() #1 ; CHECK: {{%[0-9]+}} = OpFunction {{%[0-9]+}} DontInline {{%[0-9]+}} ; CHECK-NEXT: OpLabel define void @RWBufferLoad_Vec4_I32() #0 { -; CHECK: [[buffer:%[0-9]+]] = OpLoad [[RWBufferTypeInt]] [[IntBufferVar]] %buffer0 = call target("spirv.Image", i32, 5, 2, 0, 0, 2, 0) @llvm.spv.resource.handlefrombinding.tspirv.Image_f32_5_2_0_0_2_0( i32 16, i32 7, i32 1, i32 0, i1 false) ; CHECK: [[data:%[0-9]+]] = OpFunctionCall %data = call <4 x i32> @get_data() + +; CHECK: [[buffer:%[0-9]+]] = OpLoad [[RWBufferTypeInt]] [[IntBufferVar]] ; CHECK: OpImageWrite [[buffer]] [[ten]] [[data]] call void @llvm.spv.resource.store.typedbuffer( target("spirv.Image", i32, 5, 2, 0, 0, 2, 0) %buffer0, i32 10, <4 x i32> %data) diff --git a/llvm/test/CodeGen/SPIRV/opencl/distance.ll b/llvm/test/CodeGen/SPIRV/opencl/distance.ll index ac18804c00c9a..ed329175e9c07 100644 --- a/llvm/test/CodeGen/SPIRV/opencl/distance.ll +++ b/llvm/test/CodeGen/SPIRV/opencl/distance.ll @@ -30,5 +30,16 @@ entry: ret float %spv.distance } +define noundef float @distance_instcombine_float4(<4 x float> noundef %a, <4 x float> noundef %b) { +entry: + ; CHECK: %[[#]] = OpFunction %[[#float_32]] None %[[#]] + ; CHECK: %[[#arg0:]] = OpFunctionParameter %[[#vec4_float_32]] + ; CHECK: %[[#arg1:]] = OpFunctionParameter %[[#vec4_float_32]] + ; CHECK: %[[#]] = OpExtInst %[[#float_32]] %[[#op_ext_cl]] distance %[[#arg0]] %[[#arg1]] + %delta = fsub <4 x float> %a, %b + %spv.length = call float @llvm.spv.length.f32(<4 x float> %delta) + ret float %spv.length +} + 
declare half @llvm.spv.distance.f16(<4 x half>, <4 x half>) declare float @llvm.spv.distance.f32(<4 x float>, <4 x float>) diff --git a/llvm/test/CodeGen/SPIRV/validate/sycl-tangle-group-algorithms.ll b/llvm/test/CodeGen/SPIRV/validate/sycl-tangle-group-algorithms.ll index a10341bce4859..e1dfecb8bca82 100644 --- a/llvm/test/CodeGen/SPIRV/validate/sycl-tangle-group-algorithms.ll +++ b/llvm/test/CodeGen/SPIRV/validate/sycl-tangle-group-algorithms.ll @@ -8,7 +8,7 @@ ; The only pass criterion is that spirv-val considers output valid. -; RUN: %if spirv-tools %{ llc -O0 -mtriple=spirv64v1.5-unknown-unknown %s -o - -filetype=obj | spirv-val --target-env spv1.4 %} +; RUN: %if spirv-tools %{ llc -O0 -mtriple=spirv64v1.5-unknown-unknown %s -o - -filetype=obj | spirv-val --target-env spv1.5 %} %"nd_item" = type { i8 } %struct.AssertHappened = type { i32, [257 x i8], [257 x i8], [129 x i8], i32, i64, i64, i64, i64, i64, i64 } diff --git a/llvm/test/CodeGen/X86/amx-avx512-intrinsics.ll b/llvm/test/CodeGen/X86/amx-avx512-intrinsics.ll index da7fedee88821..8f82bd2587ec3 100644 --- a/llvm/test/CodeGen/X86/amx-avx512-intrinsics.ll +++ b/llvm/test/CodeGen/X86/amx-avx512-intrinsics.ll @@ -20,43 +20,43 @@ define <16 x float> @test_tcvtrowd2psi() { } declare <16 x float> @llvm.x86.tcvtrowd2ps(i8 %A, i32 %B) -define <32 x bfloat> @test_tcvtrowps2pbf16h(i32 %A) { -; CHECK-LABEL: test_tcvtrowps2pbf16h: +define <32 x bfloat> @test_tcvtrowps2bf16h(i32 %A) { +; CHECK-LABEL: test_tcvtrowps2bf16h: ; CHECK: # %bb.0: -; CHECK-NEXT: tcvtrowps2pbf16h %edi, %tmm1, %zmm0 # encoding: [0x62,0xf2,0x47,0x48,0x6d,0xc1] +; CHECK-NEXT: tcvtrowps2bf16h %edi, %tmm1, %zmm0 # encoding: [0x62,0xf2,0x47,0x48,0x6d,0xc1] ; CHECK-NEXT: retq # encoding: [0xc3] - %ret = call <32 x bfloat> @llvm.x86.tcvtrowps2pbf16h(i8 1, i32 %A) + %ret = call <32 x bfloat> @llvm.x86.tcvtrowps2bf16h(i8 1, i32 %A) ret <32 x bfloat> %ret } -define <32 x bfloat> @test_tcvtrowps2pbf16hi() { -; CHECK-LABEL: test_tcvtrowps2pbf16hi: +define 
<32 x bfloat> @test_tcvtrowps2bf16hi() { +; CHECK-LABEL: test_tcvtrowps2bf16hi: ; CHECK: # %bb.0: -; CHECK-NEXT: tcvtrowps2pbf16h $127, %tmm1, %zmm0 # encoding: [0x62,0xf3,0x7f,0x48,0x07,0xc1,0x7f] +; CHECK-NEXT: tcvtrowps2bf16h $127, %tmm1, %zmm0 # encoding: [0x62,0xf3,0x7f,0x48,0x07,0xc1,0x7f] ; CHECK-NEXT: retq # encoding: [0xc3] - %ret = call <32 x bfloat> @llvm.x86.tcvtrowps2pbf16h(i8 1, i32 127) + %ret = call <32 x bfloat> @llvm.x86.tcvtrowps2bf16h(i8 1, i32 127) ret <32 x bfloat> %ret } -declare <32 x bfloat> @llvm.x86.tcvtrowps2pbf16h(i8 %A, i32 %B) +declare <32 x bfloat> @llvm.x86.tcvtrowps2bf16h(i8 %A, i32 %B) -define <32 x bfloat> @test_tcvtrowps2pbf16l(i32 %A) { -; CHECK-LABEL: test_tcvtrowps2pbf16l: +define <32 x bfloat> @test_tcvtrowps2bf16l(i32 %A) { +; CHECK-LABEL: test_tcvtrowps2bf16l: ; CHECK: # %bb.0: -; CHECK-NEXT: tcvtrowps2pbf16l %edi, %tmm1, %zmm0 # encoding: [0x62,0xf2,0x46,0x48,0x6d,0xc1] +; CHECK-NEXT: tcvtrowps2bf16l %edi, %tmm1, %zmm0 # encoding: [0x62,0xf2,0x46,0x48,0x6d,0xc1] ; CHECK-NEXT: retq # encoding: [0xc3] - %ret = call <32 x bfloat> @llvm.x86.tcvtrowps2pbf16l(i8 1, i32 %A) + %ret = call <32 x bfloat> @llvm.x86.tcvtrowps2bf16l(i8 1, i32 %A) ret <32 x bfloat> %ret } -define <32 x bfloat> @test_tcvtrowps2pbf16li() { -; CHECK-LABEL: test_tcvtrowps2pbf16li: +define <32 x bfloat> @test_tcvtrowps2bf16li() { +; CHECK-LABEL: test_tcvtrowps2bf16li: ; CHECK: # %bb.0: -; CHECK-NEXT: tcvtrowps2pbf16l $127, %tmm1, %zmm0 # encoding: [0x62,0xf3,0x7e,0x48,0x77,0xc1,0x7f] +; CHECK-NEXT: tcvtrowps2bf16l $127, %tmm1, %zmm0 # encoding: [0x62,0xf3,0x7e,0x48,0x77,0xc1,0x7f] ; CHECK-NEXT: retq # encoding: [0xc3] - %ret = call <32 x bfloat> @llvm.x86.tcvtrowps2pbf16l(i8 1, i32 127) + %ret = call <32 x bfloat> @llvm.x86.tcvtrowps2bf16l(i8 1, i32 127) ret <32 x bfloat> %ret } -declare <32 x bfloat> @llvm.x86.tcvtrowps2pbf16l(i8 %A, i32 %B) +declare <32 x bfloat> @llvm.x86.tcvtrowps2bf16l(i8 %A, i32 %B) define <32 x half> @test_tcvtrowps2phh(i32 %A) { ; 
CHECK-LABEL: test_tcvtrowps2phh: diff --git a/llvm/test/CodeGen/X86/amx-tile-avx512-internals.ll b/llvm/test/CodeGen/X86/amx-tile-avx512-internals.ll index b4a5c90bbea33..fd3925fabc513 100644 --- a/llvm/test/CodeGen/X86/amx-tile-avx512-internals.ll +++ b/llvm/test/CodeGen/X86/amx-tile-avx512-internals.ll @@ -15,10 +15,10 @@ define void @test_amx(i8* %pointer, i8* %base, i32 %index, i64 %stride) { ; CHECK-NEXT: tileloadd (%rsi,%rcx), %tmm0 ; CHECK-NEXT: tcvtrowd2ps %edx, %tmm0, %zmm0 ; CHECK-NEXT: tcvtrowd2ps $16, %tmm0, %zmm0 -; CHECK-NEXT: tcvtrowps2pbf16h %edx, %tmm0, %zmm0 -; CHECK-NEXT: tcvtrowps2pbf16h $16, %tmm0, %zmm0 -; CHECK-NEXT: tcvtrowps2pbf16l %edx, %tmm0, %zmm0 -; CHECK-NEXT: tcvtrowps2pbf16l $16, %tmm0, %zmm0 +; CHECK-NEXT: tcvtrowps2bf16h %edx, %tmm0, %zmm0 +; CHECK-NEXT: tcvtrowps2bf16h $16, %tmm0, %zmm0 +; CHECK-NEXT: tcvtrowps2bf16l %edx, %tmm0, %zmm0 +; CHECK-NEXT: tcvtrowps2bf16l $16, %tmm0, %zmm0 ; CHECK-NEXT: tcvtrowps2phh %edx, %tmm0, %zmm0 ; CHECK-NEXT: tcvtrowps2phh $16, %tmm0, %zmm0 ; CHECK-NEXT: tcvtrowps2phl %edx, %tmm0, %zmm0 @@ -33,10 +33,10 @@ define void @test_amx(i8* %pointer, i8* %base, i32 %index, i64 %stride) { %a = call x86_amx @llvm.x86.tileloadd64.internal(i16 8, i16 8, i8* %base, i64 %stride) call <16 x float> @llvm.x86.tcvtrowd2ps.internal(i16 8, i16 8, x86_amx %a, i32 %index) call <16 x float> @llvm.x86.tcvtrowd2ps.internal(i16 8, i16 8, x86_amx %a, i32 16) - call <32 x bfloat> @llvm.x86.tcvtrowps2pbf16h.internal(i16 8, i16 8, x86_amx %a, i32 %index) - call <32 x bfloat> @llvm.x86.tcvtrowps2pbf16h.internal(i16 8, i16 8, x86_amx %a, i32 16) - call <32 x bfloat> @llvm.x86.tcvtrowps2pbf16l.internal(i16 8, i16 8, x86_amx %a, i32 %index) - call <32 x bfloat> @llvm.x86.tcvtrowps2pbf16l.internal(i16 8, i16 8, x86_amx %a, i32 16) + call <32 x bfloat> @llvm.x86.tcvtrowps2bf16h.internal(i16 8, i16 8, x86_amx %a, i32 %index) + call <32 x bfloat> @llvm.x86.tcvtrowps2bf16h.internal(i16 8, i16 8, x86_amx %a, i32 16) + call <32 x bfloat> 
@llvm.x86.tcvtrowps2bf16l.internal(i16 8, i16 8, x86_amx %a, i32 %index) + call <32 x bfloat> @llvm.x86.tcvtrowps2bf16l.internal(i16 8, i16 8, x86_amx %a, i32 16) call <32 x half> @llvm.x86.tcvtrowps2phh.internal(i16 8, i16 8, x86_amx %a, i32 %index) call <32 x half> @llvm.x86.tcvtrowps2phh.internal(i16 8, i16 8, x86_amx %a, i32 16) call <32 x half> @llvm.x86.tcvtrowps2phl.internal(i16 8, i16 8, x86_amx %a, i32 %index) @@ -54,8 +54,8 @@ declare x86_amx @llvm.x86.tileloaddt164.internal(i16, i16, i8*, i64) declare void @llvm.x86.tilestored64.internal(i16, i16, i8*, i64, x86_amx) declare <16 x float> @llvm.x86.tcvtrowd2ps.internal(i16, i16, x86_amx, i32) -declare <32 x bfloat> @llvm.x86.tcvtrowps2pbf16h.internal(i16, i16, x86_amx, i32) -declare <32 x bfloat> @llvm.x86.tcvtrowps2pbf16l.internal(i16, i16, x86_amx, i32) +declare <32 x bfloat> @llvm.x86.tcvtrowps2bf16h.internal(i16, i16, x86_amx, i32) +declare <32 x bfloat> @llvm.x86.tcvtrowps2bf16l.internal(i16, i16, x86_amx, i32) declare <32 x half> @llvm.x86.tcvtrowps2phh.internal(i16, i16, x86_amx, i32) declare <32 x half> @llvm.x86.tcvtrowps2phl.internal(i16, i16, x86_amx, i32) declare <16 x i32> @llvm.x86.tilemovrow.internal(i16, i16, x86_amx, i32) diff --git a/llvm/test/CodeGen/X86/amx_movrs_intrinsics.ll b/llvm/test/CodeGen/X86/amx_movrs_intrinsics.ll index da212a1850964..1b93ae029f27b 100755 --- a/llvm/test/CodeGen/X86/amx_movrs_intrinsics.ll +++ b/llvm/test/CodeGen/X86/amx_movrs_intrinsics.ll @@ -1,5 +1,6 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py ; RUN: llc < %s -O0 -mtriple=x86_64-unknown-unknown -mattr=+amx-tile,+amx-movrs | FileCheck %s +; RUN: llc < %s -O0 -mtriple=x86_64-unknown-unknown -mattr=+amx-tile,+amx-movrs,+egpr --show-mc-encoding | FileCheck %s --check-prefix=EGPR define void @test_amx_internal(i16 %m, i16 %n, ptr %buf, i64 %s) { ; CHECK-LABEL: test_amx_internal: @@ -35,6 +36,44 @@ define void @test_amx_internal(i16 %m, i16 %n, ptr %buf, i64 %s) { ; 
CHECK-NEXT: .cfi_def_cfa %rsp, 8 ; CHECK-NEXT: tilerelease ; CHECK-NEXT: retq +; +; EGPR-LABEL: test_amx_internal: +; EGPR: # %bb.0: # %entry +; EGPR-NEXT: pushq %rbp # encoding: [0x55] +; EGPR-NEXT: .cfi_def_cfa_offset 16 +; EGPR-NEXT: .cfi_offset %rbp, -16 +; EGPR-NEXT: movq %rsp, %rbp # encoding: [0x48,0x89,0xe5] +; EGPR-NEXT: .cfi_def_cfa_register %rbp +; EGPR-NEXT: andq $-1024, %rsp # encoding: [0x48,0x81,0xe4,0x00,0xfc,0xff,0xff] +; EGPR-NEXT: # imm = 0xFC00 +; EGPR-NEXT: subq $3072, %rsp # encoding: [0x48,0x81,0xec,0x00,0x0c,0x00,0x00] +; EGPR-NEXT: # imm = 0xC00 +; EGPR-NEXT: xorps %xmm0, %xmm0 # encoding: [0x0f,0x57,0xc0] +; EGPR-NEXT: movups %xmm0, {{[0-9]+}}(%rsp) # encoding: [0x0f,0x11,0x84,0x24,0xc0,0x03,0x00,0x00] +; EGPR-NEXT: movups %xmm0, {{[0-9]+}}(%rsp) # encoding: [0x0f,0x11,0x84,0x24,0xd0,0x03,0x00,0x00] +; EGPR-NEXT: movups %xmm0, {{[0-9]+}}(%rsp) # encoding: [0x0f,0x11,0x84,0x24,0xe0,0x03,0x00,0x00] +; EGPR-NEXT: movups %xmm0, {{[0-9]+}}(%rsp) # encoding: [0x0f,0x11,0x84,0x24,0xf0,0x03,0x00,0x00] +; EGPR-NEXT: movb $1, {{[0-9]+}}(%rsp) # encoding: [0xc6,0x84,0x24,0xc0,0x03,0x00,0x00,0x01] +; EGPR-NEXT: movq %rcx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; EGPR-NEXT: # encoding: [0x48,0x89,0x8c,0x24,0xb8,0x03,0x00,0x00] +; EGPR-NEXT: movl %esi, %eax # encoding: [0x89,0xf0] +; EGPR-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rsi # 8-byte Reload +; EGPR-NEXT: # encoding: [0x48,0x8b,0xb4,0x24,0xb8,0x03,0x00,0x00] +; EGPR-NEXT: movw %ax, %cx # encoding: [0x66,0x89,0xc1] +; EGPR-NEXT: movw %di, %ax # encoding: [0x66,0x89,0xf8] +; EGPR-NEXT: # implicit-def: $al +; EGPR-NEXT: movb %al, {{[0-9]+}}(%rsp) # encoding: [0x88,0x84,0x24,0xf0,0x03,0x00,0x00] +; EGPR-NEXT: movw %cx, {{[0-9]+}}(%rsp) # encoding: [0x66,0x89,0x8c,0x24,0xd0,0x03,0x00,0x00] +; EGPR-NEXT: ldtilecfg {{[0-9]+}}(%rsp) # EVEX TO VEX Compression encoding: [0xc4,0xe2,0x78,0x49,0x84,0x24,0xc0,0x03,0x00,0x00] +; EGPR-NEXT: tileloaddrs (%rdx,%rsi), %tmm0 # EVEX TO VEX Compression encoding: 
[0xc4,0xe2,0x7b,0x4a,0x04,0x32] +; EGPR-NEXT: movl $64, %esi # encoding: [0xbe,0x40,0x00,0x00,0x00] +; EGPR-NEXT: leaq {{[0-9]+}}(%rsp), %rdx # encoding: [0x48,0x8d,0x94,0x24,0x00,0x04,0x00,0x00] +; EGPR-NEXT: tilestored %tmm0, (%rdx,%rsi) # EVEX TO VEX Compression encoding: [0xc4,0xe2,0x7a,0x4b,0x04,0x32] +; EGPR-NEXT: movq %rbp, %rsp # encoding: [0x48,0x89,0xec] +; EGPR-NEXT: popq %rbp # encoding: [0x5d] +; EGPR-NEXT: .cfi_def_cfa %rsp, 8 +; EGPR-NEXT: tilerelease # encoding: [0xc4,0xe2,0x78,0x49,0xc0] +; EGPR-NEXT: retq # encoding: [0xc3] entry: %t1 = call x86_amx @llvm.x86.tileloaddrs64.internal(i16 %m, i16 %n, ptr %buf, i64 %s) %t2 = call <256 x i32> @llvm.x86.cast.tile.to.vector.v256i32(x86_amx %t1) @@ -48,6 +87,12 @@ define void @test_amx_old(i16 %m, i16 %n, ptr %buf) { ; CHECK-NEXT: movl $32, %eax ; CHECK-NEXT: tileloaddrs (%rdx,%rax), %tmm2 ; CHECK-NEXT: retq +; +; EGPR-LABEL: test_amx_old: +; EGPR: # %bb.0: # %entry +; EGPR-NEXT: movl $32, %eax # encoding: [0xb8,0x20,0x00,0x00,0x00] +; EGPR-NEXT: tileloaddrs (%rdx,%rax), %tmm2 # EVEX TO VEX Compression encoding: [0xc4,0xe2,0x7b,0x4a,0x14,0x02] +; EGPR-NEXT: retq # encoding: [0xc3] entry: call void @llvm.x86.tileloaddrs64(i8 2, ptr %buf, i64 32) ret void @@ -88,6 +133,44 @@ define void @test_amx_t1_internal(i16 %m, i16 %n, ptr %buf, i64 %s) { ; CHECK-NEXT: .cfi_def_cfa %rsp, 8 ; CHECK-NEXT: tilerelease ; CHECK-NEXT: retq +; +; EGPR-LABEL: test_amx_t1_internal: +; EGPR: # %bb.0: # %entry +; EGPR-NEXT: pushq %rbp # encoding: [0x55] +; EGPR-NEXT: .cfi_def_cfa_offset 16 +; EGPR-NEXT: .cfi_offset %rbp, -16 +; EGPR-NEXT: movq %rsp, %rbp # encoding: [0x48,0x89,0xe5] +; EGPR-NEXT: .cfi_def_cfa_register %rbp +; EGPR-NEXT: andq $-1024, %rsp # encoding: [0x48,0x81,0xe4,0x00,0xfc,0xff,0xff] +; EGPR-NEXT: # imm = 0xFC00 +; EGPR-NEXT: subq $3072, %rsp # encoding: [0x48,0x81,0xec,0x00,0x0c,0x00,0x00] +; EGPR-NEXT: # imm = 0xC00 +; EGPR-NEXT: xorps %xmm0, %xmm0 # encoding: [0x0f,0x57,0xc0] +; EGPR-NEXT: movups %xmm0, 
{{[0-9]+}}(%rsp) # encoding: [0x0f,0x11,0x84,0x24,0xc0,0x03,0x00,0x00] +; EGPR-NEXT: movups %xmm0, {{[0-9]+}}(%rsp) # encoding: [0x0f,0x11,0x84,0x24,0xd0,0x03,0x00,0x00] +; EGPR-NEXT: movups %xmm0, {{[0-9]+}}(%rsp) # encoding: [0x0f,0x11,0x84,0x24,0xe0,0x03,0x00,0x00] +; EGPR-NEXT: movups %xmm0, {{[0-9]+}}(%rsp) # encoding: [0x0f,0x11,0x84,0x24,0xf0,0x03,0x00,0x00] +; EGPR-NEXT: movb $1, {{[0-9]+}}(%rsp) # encoding: [0xc6,0x84,0x24,0xc0,0x03,0x00,0x00,0x01] +; EGPR-NEXT: movq %rcx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; EGPR-NEXT: # encoding: [0x48,0x89,0x8c,0x24,0xb8,0x03,0x00,0x00] +; EGPR-NEXT: movl %esi, %eax # encoding: [0x89,0xf0] +; EGPR-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rsi # 8-byte Reload +; EGPR-NEXT: # encoding: [0x48,0x8b,0xb4,0x24,0xb8,0x03,0x00,0x00] +; EGPR-NEXT: movw %ax, %cx # encoding: [0x66,0x89,0xc1] +; EGPR-NEXT: movw %di, %ax # encoding: [0x66,0x89,0xf8] +; EGPR-NEXT: # implicit-def: $al +; EGPR-NEXT: movb %al, {{[0-9]+}}(%rsp) # encoding: [0x88,0x84,0x24,0xf0,0x03,0x00,0x00] +; EGPR-NEXT: movw %cx, {{[0-9]+}}(%rsp) # encoding: [0x66,0x89,0x8c,0x24,0xd0,0x03,0x00,0x00] +; EGPR-NEXT: ldtilecfg {{[0-9]+}}(%rsp) # EVEX TO VEX Compression encoding: [0xc4,0xe2,0x78,0x49,0x84,0x24,0xc0,0x03,0x00,0x00] +; EGPR-NEXT: tileloaddrst1 (%rdx,%rsi), %tmm0 # EVEX TO VEX Compression encoding: [0xc4,0xe2,0x79,0x4a,0x04,0x32] +; EGPR-NEXT: movl $64, %esi # encoding: [0xbe,0x40,0x00,0x00,0x00] +; EGPR-NEXT: leaq {{[0-9]+}}(%rsp), %rdx # encoding: [0x48,0x8d,0x94,0x24,0x00,0x04,0x00,0x00] +; EGPR-NEXT: tilestored %tmm0, (%rdx,%rsi) # EVEX TO VEX Compression encoding: [0xc4,0xe2,0x7a,0x4b,0x04,0x32] +; EGPR-NEXT: movq %rbp, %rsp # encoding: [0x48,0x89,0xec] +; EGPR-NEXT: popq %rbp # encoding: [0x5d] +; EGPR-NEXT: .cfi_def_cfa %rsp, 8 +; EGPR-NEXT: tilerelease # encoding: [0xc4,0xe2,0x78,0x49,0xc0] +; EGPR-NEXT: retq # encoding: [0xc3] entry: %t1 = call x86_amx @llvm.x86.tileloaddrst164.internal(i16 %m, i16 %n, ptr %buf, i64 %s) %t2 = call <256 x i32> 
@llvm.x86.cast.tile.to.vector.v256i32(x86_amx %t1) @@ -101,6 +184,12 @@ define void @test_amx_t1_old(i16 %m, i16 %n, ptr %buf) { ; CHECK-NEXT: movl $32, %eax ; CHECK-NEXT: tileloaddrst1 (%rdx,%rax), %tmm2 ; CHECK-NEXT: retq +; +; EGPR-LABEL: test_amx_t1_old: +; EGPR: # %bb.0: # %entry +; EGPR-NEXT: movl $32, %eax # encoding: [0xb8,0x20,0x00,0x00,0x00] +; EGPR-NEXT: tileloaddrst1 (%rdx,%rax), %tmm2 # EVEX TO VEX Compression encoding: [0xc4,0xe2,0x79,0x4a,0x14,0x02] +; EGPR-NEXT: retq # encoding: [0xc3] entry: call void @llvm.x86.tileloaddrst164(i8 2, ptr %buf, i64 32) ret void diff --git a/llvm/test/CodeGen/X86/amx_movrs_transpose_intrinsics.ll b/llvm/test/CodeGen/X86/amx_movrs_transpose_intrinsics.ll index 146b69773eb18..1f5758c804b2b 100755 --- a/llvm/test/CodeGen/X86/amx_movrs_transpose_intrinsics.ll +++ b/llvm/test/CodeGen/X86/amx_movrs_transpose_intrinsics.ll @@ -1,6 +1,7 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py ; RUN: llc < %s -O0 -mtriple=x86_64-unknown-unknown -mattr=+amx-transpose,+amx-movrs | FileCheck %s --check-prefixes=CHECK,O0 ; RUN: llc < %s -O2 -mtriple=x86_64-unknown-unknown -mattr=+amx-transpose,+amx-movrs | FileCheck %s --check-prefixes=CHECK,O2 +; RUN: llc < %s -O2 -mtriple=x86_64-unknown-unknown -mattr=+amx-transpose,+amx-movrs,+egpr --show-mc-encoding | FileCheck %s --check-prefix=EGPR define void @test_amx(i64 %stride, i8* %addr1) #0 { ; CHECK-LABEL: test_amx: @@ -10,6 +11,14 @@ define void @test_amx(i64 %stride, i8* %addr1) #0 { ; CHECK-NEXT: t2rpntlvwz1rs (%rsi,%rdi), %tmm0 ; CHECK-NEXT: t2rpntlvwz1rst1 (%rsi,%rdi), %tmm2 ; CHECK-NEXT: retq +; +; EGPR-LABEL: test_amx: +; EGPR: # %bb.0: +; EGPR-NEXT: t2rpntlvwz0rs (%rsi,%rdi), %tmm0 # EVEX TO VEX Compression encoding: [0xc4,0xe5,0x78,0xf8,0x04,0x3e] +; EGPR-NEXT: t2rpntlvwz0rst1 (%rsi,%rdi), %tmm2 # EVEX TO VEX Compression encoding: [0xc4,0xe5,0x78,0xf9,0x14,0x3e] +; EGPR-NEXT: t2rpntlvwz1rs (%rsi,%rdi), %tmm0 # EVEX TO VEX Compression encoding: 
[0xc4,0xe5,0x79,0xf8,0x04,0x3e] +; EGPR-NEXT: t2rpntlvwz1rst1 (%rsi,%rdi), %tmm2 # EVEX TO VEX Compression encoding: [0xc4,0xe5,0x79,0xf9,0x14,0x3e] +; EGPR-NEXT: retq # encoding: [0xc3] call void @llvm.x86.t2rpntlvwz0rs(i8 1, i8* %addr1, i64 %stride) call void @llvm.x86.t2rpntlvwz0rst1(i8 2, i8* %addr1, i64 %stride) call void @llvm.x86.t2rpntlvwz1rs(i8 1, i8* %addr1, i64 %stride) @@ -80,6 +89,27 @@ define void @test_amx2(i8* %base, i64 %stride) #0 { ; O2-NEXT: t2rpntlvwz1rst1 (%rdi,%rsi), %tmm4 ; O2-NEXT: tilerelease ; O2-NEXT: retq +; +; EGPR-LABEL: test_amx2: +; EGPR: # %bb.0: +; EGPR-NEXT: xorps %xmm0, %xmm0 # encoding: [0x0f,0x57,0xc0] +; EGPR-NEXT: movups %xmm0, -{{[0-9]+}}(%rsp) # encoding: [0x0f,0x11,0x44,0x24,0xc0] +; EGPR-NEXT: movups %xmm0, -{{[0-9]+}}(%rsp) # encoding: [0x0f,0x11,0x44,0x24,0xd0] +; EGPR-NEXT: movups %xmm0, -{{[0-9]+}}(%rsp) # encoding: [0x0f,0x11,0x44,0x24,0xe0] +; EGPR-NEXT: movups %xmm0, -{{[0-9]+}}(%rsp) # encoding: [0x0f,0x11,0x44,0x24,0xf0] +; EGPR-NEXT: movb $1, -{{[0-9]+}}(%rsp) # encoding: [0xc6,0x44,0x24,0xc0,0x01] +; EGPR-NEXT: movb $8, -{{[0-9]+}}(%rsp) # encoding: [0xc6,0x44,0x24,0xf4,0x08] +; EGPR-NEXT: movw $8, -{{[0-9]+}}(%rsp) # encoding: [0x66,0xc7,0x44,0x24,0xd8,0x08,0x00] +; EGPR-NEXT: movb $8, -{{[0-9]+}}(%rsp) # encoding: [0xc6,0x44,0x24,0xf5,0x08] +; EGPR-NEXT: movw $8, -{{[0-9]+}}(%rsp) # encoding: [0x66,0xc7,0x44,0x24,0xda,0x08,0x00] +; EGPR-NEXT: ldtilecfg -{{[0-9]+}}(%rsp) # EVEX TO VEX Compression encoding: [0xc4,0xe2,0x78,0x49,0x44,0x24,0xc0] +; EGPR-NEXT: movw $8, %ax # encoding: [0x66,0xb8,0x08,0x00] +; EGPR-NEXT: t2rpntlvwz0rs (%rdi,%rsi), %tmm4 # EVEX TO VEX Compression encoding: [0xc4,0xe5,0x78,0xf8,0x24,0x37] +; EGPR-NEXT: t2rpntlvwz0rst1 (%rdi,%rsi), %tmm4 # EVEX TO VEX Compression encoding: [0xc4,0xe5,0x78,0xf9,0x24,0x37] +; EGPR-NEXT: t2rpntlvwz1rs (%rdi,%rsi), %tmm4 # EVEX TO VEX Compression encoding: [0xc4,0xe5,0x79,0xf8,0x24,0x37] +; EGPR-NEXT: t2rpntlvwz1rst1 (%rdi,%rsi), %tmm4 # EVEX TO VEX 
Compression encoding: [0xc4,0xe5,0x79,0xf9,0x24,0x37] +; EGPR-NEXT: tilerelease # encoding: [0xc4,0xe2,0x78,0x49,0xc0] +; EGPR-NEXT: retq # encoding: [0xc3] call { x86_amx, x86_amx } @llvm.x86.t2rpntlvwz0rs.internal(i16 8, i16 8, i16 8, i8* %base, i64 %stride) call { x86_amx, x86_amx } @llvm.x86.t2rpntlvwz0rst1.internal(i16 8, i16 8, i16 8, i8* %base, i64 %stride) call { x86_amx, x86_amx } @llvm.x86.t2rpntlvwz1rs.internal(i16 8, i16 8, i16 8, i8* %base, i64 %stride) diff --git a/llvm/test/CodeGen/X86/amx_transpose_intrinsics.ll b/llvm/test/CodeGen/X86/amx_transpose_intrinsics.ll index cc4360317db7d..4cfd97afe721b 100644 --- a/llvm/test/CodeGen/X86/amx_transpose_intrinsics.ll +++ b/llvm/test/CodeGen/X86/amx_transpose_intrinsics.ll @@ -1,5 +1,6 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512f,+amx-bf16,+amx-fp16,+amx-complex,+amx-transpose | FileCheck %s +; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512f,+amx-bf16,+amx-fp16,+amx-complex,+amx-transpose,+egpr --show-mc-encoding | FileCheck %s --check-prefix=EGPR define void @test_amx(i32 %rv32, i64 %stride, i64 %rvalue, i8* %addr1, <4 x float> %xmm) #0 { ; CHECK-LABEL: test_amx: @@ -16,6 +17,21 @@ define void @test_amx(i32 %rv32, i64 %stride, i64 %rvalue, i8* %addr1, <4 x floa ; CHECK-NEXT: tconjtcmmimfp16ps %tmm3, %tmm2, %tmm1 ; CHECK-NEXT: tconjtfp16 %tmm2, %tmm1 ; CHECK-NEXT: retq +; +; EGPR-LABEL: test_amx: +; EGPR: # %bb.0: +; EGPR-NEXT: t2rpntlvwz0 (%rcx,%rsi), %tmm0 # EVEX TO VEX Compression encoding: [0xc4,0xe2,0x78,0x6e,0x04,0x31] +; EGPR-NEXT: t2rpntlvwz0t1 (%rcx,%rsi), %tmm2 # EVEX TO VEX Compression encoding: [0xc4,0xe2,0x78,0x6f,0x14,0x31] +; EGPR-NEXT: t2rpntlvwz1 (%rcx,%rsi), %tmm0 # EVEX TO VEX Compression encoding: [0xc4,0xe2,0x79,0x6e,0x04,0x31] +; EGPR-NEXT: t2rpntlvwz1t1 (%rcx,%rsi), %tmm2 # EVEX TO VEX Compression encoding: [0xc4,0xe2,0x79,0x6f,0x14,0x31] +; EGPR-NEXT: ttransposed %tmm3, 
%tmm1 # encoding: [0xc4,0xe2,0x7a,0x5f,0xcb] +; EGPR-NEXT: ttdpbf16ps %tmm3, %tmm2, %tmm1 # encoding: [0xc4,0xe2,0x62,0x6c,0xca] +; EGPR-NEXT: ttdpfp16ps %tmm6, %tmm5, %tmm4 # encoding: [0xc4,0xe2,0x4b,0x6c,0xe5] +; EGPR-NEXT: ttcmmimfp16ps %tmm3, %tmm2, %tmm1 # encoding: [0xc4,0xe2,0x63,0x6b,0xca] +; EGPR-NEXT: ttcmmrlfp16ps %tmm3, %tmm2, %tmm1 # encoding: [0xc4,0xe2,0x62,0x6b,0xca] +; EGPR-NEXT: tconjtcmmimfp16ps %tmm3, %tmm2, %tmm1 # encoding: [0xc4,0xe2,0x60,0x6b,0xca] +; EGPR-NEXT: tconjtfp16 %tmm2, %tmm1 # encoding: [0xc4,0xe2,0x79,0x6b,0xca] +; EGPR-NEXT: retq # encoding: [0xc3] call void @llvm.x86.t2rpntlvwz0(i8 1, i8* %addr1, i64 %stride) call void @llvm.x86.t2rpntlvwz0t1(i8 2, i8* %addr1, i64 %stride) call void @llvm.x86.t2rpntlvwz1(i8 1, i8* %addr1, i64 %stride) @@ -78,6 +94,46 @@ define void @test_amx2(i8* %pointer, i8* %base, i64 %stride) #0 { ; CHECK-NEXT: tilerelease ; CHECK-NEXT: vzeroupper ; CHECK-NEXT: retq +; +; EGPR-LABEL: test_amx2: +; EGPR: # %bb.0: +; EGPR-NEXT: pushq %rbp # encoding: [0x55] +; EGPR-NEXT: subq $2928, %rsp # encoding: [0x48,0x81,0xec,0x70,0x0b,0x00,0x00] +; EGPR-NEXT: # imm = 0xB70 +; EGPR-NEXT: vxorps %xmm0, %xmm0, %xmm0 # encoding: [0xc5,0xf8,0x57,0xc0] +; EGPR-NEXT: vmovups %zmm0, {{[0-9]+}}(%rsp) # encoding: [0x62,0xf1,0x7c,0x48,0x11,0x44,0x24,0x0d] +; EGPR-NEXT: movb $1, {{[0-9]+}}(%rsp) # encoding: [0xc6,0x84,0x24,0x40,0x03,0x00,0x00,0x01] +; EGPR-NEXT: movb $8, {{[0-9]+}}(%rsp) # encoding: [0xc6,0x84,0x24,0x70,0x03,0x00,0x00,0x08] +; EGPR-NEXT: movw $8, {{[0-9]+}}(%rsp) # encoding: [0x66,0xc7,0x84,0x24,0x50,0x03,0x00,0x00,0x08,0x00] +; EGPR-NEXT: movb $8, {{[0-9]+}}(%rsp) # encoding: [0xc6,0x84,0x24,0x71,0x03,0x00,0x00,0x08] +; EGPR-NEXT: movw $8, {{[0-9]+}}(%rsp) # encoding: [0x66,0xc7,0x84,0x24,0x52,0x03,0x00,0x00,0x08,0x00] +; EGPR-NEXT: movb $8, {{[0-9]+}}(%rsp) # encoding: [0xc6,0x84,0x24,0x72,0x03,0x00,0x00,0x08] +; EGPR-NEXT: movw $8, {{[0-9]+}}(%rsp) # encoding: 
[0x66,0xc7,0x84,0x24,0x54,0x03,0x00,0x00,0x08,0x00] +; EGPR-NEXT: movb $8, {{[0-9]+}}(%rsp) # encoding: [0xc6,0x84,0x24,0x73,0x03,0x00,0x00,0x08] +; EGPR-NEXT: movw $8, {{[0-9]+}}(%rsp) # encoding: [0x66,0xc7,0x84,0x24,0x56,0x03,0x00,0x00,0x08,0x00] +; EGPR-NEXT: ldtilecfg {{[0-9]+}}(%rsp) # EVEX TO VEX Compression encoding: [0xc4,0xe2,0x78,0x49,0x84,0x24,0x40,0x03,0x00,0x00] +; EGPR-NEXT: movw $8, %ax # encoding: [0x66,0xb8,0x08,0x00] +; EGPR-NEXT: tileloadd (%rsi,%rdx), %tmm0 # EVEX TO VEX Compression encoding: [0xc4,0xe2,0x7b,0x4b,0x04,0x16] +; EGPR-NEXT: tilezero %tmm1 # encoding: [0xc4,0xe2,0x7b,0x49,0xc8] +; EGPR-NEXT: tilezero %tmm2 # encoding: [0xc4,0xe2,0x7b,0x49,0xd0] +; EGPR-NEXT: ttdpbf16ps %tmm1, %tmm0, %tmm2 # encoding: [0xc4,0xe2,0x72,0x6c,0xd0] +; EGPR-NEXT: ttdpfp16ps %tmm1, %tmm0, %tmm2 # encoding: [0xc4,0xe2,0x73,0x6c,0xd0] +; EGPR-NEXT: ttcmmimfp16ps %tmm1, %tmm0, %tmm2 # encoding: [0xc4,0xe2,0x73,0x6b,0xd0] +; EGPR-NEXT: ttcmmrlfp16ps %tmm1, %tmm0, %tmm2 # encoding: [0xc4,0xe2,0x72,0x6b,0xd0] +; EGPR-NEXT: movabsq $64, %rbp # encoding: [0x48,0xbd,0x40,0x00,0x00,0x00,0x00,0x00,0x00,0x00] +; EGPR-NEXT: tilestored %tmm2, 896(%rsp,%rbp) # 1024-byte Folded Spill +; EGPR-NEXT: # EVEX TO VEX Compression encoding: [0xc4,0xe2,0x7a,0x4b,0x94,0x2c,0x80,0x03,0x00,0x00] +; EGPR-NEXT: tileloadd 896(%rsp,%rbp), %tmm3 # 1024-byte Folded Reload +; EGPR-NEXT: # EVEX TO VEX Compression encoding: [0xc4,0xe2,0x7b,0x4b,0x9c,0x2c,0x80,0x03,0x00,0x00] +; EGPR-NEXT: tconjtcmmimfp16ps %tmm1, %tmm0, %tmm3 # encoding: [0xc4,0xe2,0x70,0x6b,0xd8] +; EGPR-NEXT: tconjtfp16 %tmm3, %tmm0 # encoding: [0xc4,0xe2,0x79,0x6b,0xc3] +; EGPR-NEXT: tilestored %tmm2, (%rdi,%rdx) # EVEX TO VEX Compression encoding: [0xc4,0xe2,0x7a,0x4b,0x14,0x17] +; EGPR-NEXT: addq $2928, %rsp # encoding: [0x48,0x81,0xc4,0x70,0x0b,0x00,0x00] +; EGPR-NEXT: # imm = 0xB70 +; EGPR-NEXT: popq %rbp # encoding: [0x5d] +; EGPR-NEXT: tilerelease # encoding: [0xc4,0xe2,0x78,0x49,0xc0] +; EGPR-NEXT: vzeroupper # 
encoding: [0xc5,0xf8,0x77] +; EGPR-NEXT: retq # encoding: [0xc3] %a = call x86_amx @llvm.x86.tileloadd64.internal(i16 8, i16 8, i8* %base, i64 %stride) %b = call x86_amx @llvm.x86.tilezero.internal(i16 8, i16 8) @@ -117,6 +173,30 @@ define void @test_amx3(i8* %pointer, i8* %base, i64 %stride) #0 { ; CHECK-NEXT: tilerelease ; CHECK-NEXT: vzeroupper ; CHECK-NEXT: retq +; +; EGPR-LABEL: test_amx3: +; EGPR: # %bb.0: +; EGPR-NEXT: vxorps %xmm0, %xmm0, %xmm0 # encoding: [0xc5,0xf8,0x57,0xc0] +; EGPR-NEXT: vmovups %zmm0, -{{[0-9]+}}(%rsp) # encoding: [0x62,0xf1,0x7c,0x48,0x11,0x44,0x24,0xff] +; EGPR-NEXT: movb $1, -{{[0-9]+}}(%rsp) # encoding: [0xc6,0x44,0x24,0xc0,0x01] +; EGPR-NEXT: movb $8, -{{[0-9]+}}(%rsp) # encoding: [0xc6,0x44,0x24,0xf0,0x08] +; EGPR-NEXT: movw $8, -{{[0-9]+}}(%rsp) # encoding: [0x66,0xc7,0x44,0x24,0xd0,0x08,0x00] +; EGPR-NEXT: movb $8, -{{[0-9]+}}(%rsp) # encoding: [0xc6,0x44,0x24,0xf4,0x08] +; EGPR-NEXT: movw $8, -{{[0-9]+}}(%rsp) # encoding: [0x66,0xc7,0x44,0x24,0xd8,0x08,0x00] +; EGPR-NEXT: movb $0, -{{[0-9]+}}(%rsp) # encoding: [0xc6,0x44,0x24,0xf5,0x00] +; EGPR-NEXT: movw $0, -{{[0-9]+}}(%rsp) # encoding: [0x66,0xc7,0x44,0x24,0xda,0x00,0x00] +; EGPR-NEXT: ldtilecfg -{{[0-9]+}}(%rsp) # EVEX TO VEX Compression encoding: [0xc4,0xe2,0x78,0x49,0x44,0x24,0xc0] +; EGPR-NEXT: xorl %eax, %eax # encoding: [0x31,0xc0] +; EGPR-NEXT: movw $8, %cx # encoding: [0x66,0xb9,0x08,0x00] +; EGPR-NEXT: t2rpntlvwz0 (%rsi,%rdx), %tmm4 # EVEX TO VEX Compression encoding: [0xc4,0xe2,0x78,0x6e,0x24,0x16] +; EGPR-NEXT: t2rpntlvwz0t1 (%rsi,%rdx), %tmm4 # EVEX TO VEX Compression encoding: [0xc4,0xe2,0x78,0x6f,0x24,0x16] +; EGPR-NEXT: t2rpntlvwz1 (%rsi,%rdx), %tmm4 # EVEX TO VEX Compression encoding: [0xc4,0xe2,0x79,0x6e,0x24,0x16] +; EGPR-NEXT: t2rpntlvwz1t1 (%rsi,%rdx), %tmm4 # EVEX TO VEX Compression encoding: [0xc4,0xe2,0x79,0x6f,0x24,0x16] +; EGPR-NEXT: ttransposed %tmm4, %tmm0 # encoding: [0xc4,0xe2,0x7a,0x5f,0xc4] +; EGPR-NEXT: tilestored %tmm0, (%rdi,%rdx) # EVEX TO 
VEX Compression encoding: [0xc4,0xe2,0x7a,0x4b,0x04,0x17] +; EGPR-NEXT: tilerelease # encoding: [0xc4,0xe2,0x78,0x49,0xc0] +; EGPR-NEXT: vzeroupper # encoding: [0xc5,0xf8,0x77] +; EGPR-NEXT: retq # encoding: [0xc3] %1 = call { x86_amx, x86_amx } @llvm.x86.t2rpntlvwz0.internal(i16 8, i16 8, i16 0, i8* %base, i64 %stride) %2 = call { x86_amx, x86_amx } @llvm.x86.t2rpntlvwz0t1.internal(i16 8, i16 8, i16 0, i8* %base, i64 %stride) %3 = call { x86_amx, x86_amx } @llvm.x86.t2rpntlvwz1.internal(i16 8, i16 8, i16 0, i8* %base, i64 %stride) @@ -179,6 +259,72 @@ define void @test_amx_spill(i8* %pointer, i8* %base, i64 %stride) #0 { ; CHECK-NEXT: tilerelease ; CHECK-NEXT: vzeroupper ; CHECK-NEXT: retq +; +; EGPR-LABEL: test_amx_spill: +; EGPR: # %bb.0: +; EGPR-NEXT: subq $6088, %rsp # encoding: [0x48,0x81,0xec,0xc8,0x17,0x00,0x00] +; EGPR-NEXT: # imm = 0x17C8 +; EGPR-NEXT: vxorps %xmm0, %xmm0, %xmm0 # encoding: [0xc5,0xf8,0x57,0xc0] +; EGPR-NEXT: vmovups %zmm0, -{{[0-9]+}}(%rsp) # encoding: [0x62,0xf1,0x7c,0x48,0x11,0x44,0x24,0xfe] +; EGPR-NEXT: movb $1, -{{[0-9]+}}(%rsp) # encoding: [0xc6,0x44,0x24,0x80,0x01] +; EGPR-NEXT: movb $8, -{{[0-9]+}}(%rsp) # encoding: [0xc6,0x44,0x24,0xb0,0x08] +; EGPR-NEXT: movw $8, -{{[0-9]+}}(%rsp) # encoding: [0x66,0xc7,0x44,0x24,0x90,0x08,0x00] +; EGPR-NEXT: movb $8, -{{[0-9]+}}(%rsp) # encoding: [0xc6,0x44,0x24,0xb4,0x08] +; EGPR-NEXT: movw $8, -{{[0-9]+}}(%rsp) # encoding: [0x66,0xc7,0x44,0x24,0x98,0x08,0x00] +; EGPR-NEXT: movb $8, -{{[0-9]+}}(%rsp) # encoding: [0xc6,0x44,0x24,0xb5,0x08] +; EGPR-NEXT: movw $8, -{{[0-9]+}}(%rsp) # encoding: [0x66,0xc7,0x44,0x24,0x9a,0x08,0x00] +; EGPR-NEXT: movb $8, -{{[0-9]+}}(%rsp) # encoding: [0xc6,0x44,0x24,0xb6,0x08] +; EGPR-NEXT: movw $8, -{{[0-9]+}}(%rsp) # encoding: [0x66,0xc7,0x44,0x24,0x9c,0x08,0x00] +; EGPR-NEXT: movb $8, -{{[0-9]+}}(%rsp) # encoding: [0xc6,0x44,0x24,0xb7,0x08] +; EGPR-NEXT: movw $8, -{{[0-9]+}}(%rsp) # encoding: [0x66,0xc7,0x44,0x24,0x9e,0x08,0x00] +; EGPR-NEXT: ldtilecfg 
-{{[0-9]+}}(%rsp) # EVEX TO VEX Compression encoding: [0xc4,0xe2,0x78,0x49,0x44,0x24,0x80] +; EGPR-NEXT: movw $8, %ax # encoding: [0x66,0xb8,0x08,0x00] +; EGPR-NEXT: tileloadd (%rsi,%rdx), %tmm0 # EVEX TO VEX Compression encoding: [0xc4,0xe2,0x7b,0x4b,0x04,0x16] +; EGPR-NEXT: t2rpntlvwz0 (%rsi,%rdx), %tmm4 # EVEX TO VEX Compression encoding: [0xc4,0xe2,0x78,0x6e,0x24,0x16] +; EGPR-NEXT: t2rpntlvwz0t1 (%rsi,%rdx), %tmm6 # EVEX TO VEX Compression encoding: [0xc4,0xe2,0x78,0x6f,0x34,0x16] +; EGPR-NEXT: movabsq $64, %rcx # encoding: [0x48,0xb9,0x40,0x00,0x00,0x00,0x00,0x00,0x00,0x00] +; EGPR-NEXT: tilestored %tmm6, 4032(%rsp,%rcx) # 1024-byte Folded Spill +; EGPR-NEXT: # encoding: [0xc4,0xe2,0x7a,0x4b,0xb4,0x0c,0xc0,0x0f,0x00,0x00] +; EGPR-NEXT: tilestored %tmm7, 5056(%rsp,%rcx) # 1024-byte Folded Spill +; EGPR-NEXT: # encoding: [0xc4,0xe2,0x7a,0x4b,0xbc,0x0c,0xc0,0x13,0x00,0x00] +; EGPR-NEXT: t2rpntlvwz1 (%rsi,%rdx), %tmm6 # EVEX TO VEX Compression encoding: [0xc4,0xe2,0x79,0x6e,0x34,0x16] +; EGPR-NEXT: tilestored %tmm6, 1984(%rsp,%rcx) # 1024-byte Folded Spill +; EGPR-NEXT: # encoding: [0xc4,0xe2,0x7a,0x4b,0xb4,0x0c,0xc0,0x07,0x00,0x00] +; EGPR-NEXT: tilestored %tmm7, 3008(%rsp,%rcx) # 1024-byte Folded Spill +; EGPR-NEXT: # encoding: [0xc4,0xe2,0x7a,0x4b,0xbc,0x0c,0xc0,0x0b,0x00,0x00] +; EGPR-NEXT: t2rpntlvwz1t1 (%rsi,%rdx), %tmm6 # EVEX TO VEX Compression encoding: [0xc4,0xe2,0x79,0x6f,0x34,0x16] +; EGPR-NEXT: tilestored %tmm6, -64(%rsp,%rcx) # 1024-byte Folded Spill +; EGPR-NEXT: # encoding: [0xc4,0xe2,0x7a,0x4b,0x74,0x0c,0xc0] +; EGPR-NEXT: tilestored %tmm7, 960(%rsp,%rcx) # 1024-byte Folded Spill +; EGPR-NEXT: # encoding: [0xc4,0xe2,0x7a,0x4b,0xbc,0x0c,0xc0,0x03,0x00,0x00] +; EGPR-NEXT: t2rpntlvwz0 (%rsi,%rdx), %tmm6 # EVEX TO VEX Compression encoding: [0xc4,0xe2,0x78,0x6e,0x34,0x16] +; EGPR-NEXT: tilestored %tmm4, (%rsi,%rdx) # EVEX TO VEX Compression encoding: [0xc4,0xe2,0x7a,0x4b,0x24,0x16] +; EGPR-NEXT: tilestored %tmm5, (%rsi,%rdx) # EVEX TO VEX Compression 
encoding: [0xc4,0xe2,0x7a,0x4b,0x2c,0x16] +; EGPR-NEXT: tileloadd 4032(%rsp,%rcx), %tmm4 # 1024-byte Folded Reload +; EGPR-NEXT: # encoding: [0xc4,0xe2,0x7b,0x4b,0xa4,0x0c,0xc0,0x0f,0x00,0x00] +; EGPR-NEXT: tileloadd 5056(%rsp,%rcx), %tmm5 # 1024-byte Folded Reload +; EGPR-NEXT: # encoding: [0xc4,0xe2,0x7b,0x4b,0xac,0x0c,0xc0,0x13,0x00,0x00] +; EGPR-NEXT: tilestored %tmm4, (%rsi,%rdx) # EVEX TO VEX Compression encoding: [0xc4,0xe2,0x7a,0x4b,0x24,0x16] +; EGPR-NEXT: tilestored %tmm5, (%rsi,%rdx) # EVEX TO VEX Compression encoding: [0xc4,0xe2,0x7a,0x4b,0x2c,0x16] +; EGPR-NEXT: tileloadd 1984(%rsp,%rcx), %tmm4 # 1024-byte Folded Reload +; EGPR-NEXT: # encoding: [0xc4,0xe2,0x7b,0x4b,0xa4,0x0c,0xc0,0x07,0x00,0x00] +; EGPR-NEXT: tileloadd 3008(%rsp,%rcx), %tmm5 # 1024-byte Folded Reload +; EGPR-NEXT: # encoding: [0xc4,0xe2,0x7b,0x4b,0xac,0x0c,0xc0,0x0b,0x00,0x00] +; EGPR-NEXT: tilestored %tmm4, (%rsi,%rdx) # EVEX TO VEX Compression encoding: [0xc4,0xe2,0x7a,0x4b,0x24,0x16] +; EGPR-NEXT: tilestored %tmm5, (%rsi,%rdx) # EVEX TO VEX Compression encoding: [0xc4,0xe2,0x7a,0x4b,0x2c,0x16] +; EGPR-NEXT: tileloadd -64(%rsp,%rcx), %tmm4 # 1024-byte Folded Reload +; EGPR-NEXT: # encoding: [0xc4,0xe2,0x7b,0x4b,0x64,0x0c,0xc0] +; EGPR-NEXT: tileloadd 960(%rsp,%rcx), %tmm5 # 1024-byte Folded Reload +; EGPR-NEXT: # encoding: [0xc4,0xe2,0x7b,0x4b,0xac,0x0c,0xc0,0x03,0x00,0x00] +; EGPR-NEXT: tilestored %tmm4, (%rsi,%rdx) # EVEX TO VEX Compression encoding: [0xc4,0xe2,0x7a,0x4b,0x24,0x16] +; EGPR-NEXT: tilestored %tmm5, (%rsi,%rdx) # EVEX TO VEX Compression encoding: [0xc4,0xe2,0x7a,0x4b,0x2c,0x16] +; EGPR-NEXT: tilestored %tmm6, (%rsi,%rdx) # EVEX TO VEX Compression encoding: [0xc4,0xe2,0x7a,0x4b,0x34,0x16] +; EGPR-NEXT: tilestored %tmm7, (%rsi,%rdx) # EVEX TO VEX Compression encoding: [0xc4,0xe2,0x7a,0x4b,0x3c,0x16] +; EGPR-NEXT: addq $6088, %rsp # encoding: [0x48,0x81,0xc4,0xc8,0x17,0x00,0x00] +; EGPR-NEXT: # imm = 0x17C8 +; EGPR-NEXT: tilerelease # encoding: 
[0xc4,0xe2,0x78,0x49,0xc0] +; EGPR-NEXT: vzeroupper # encoding: [0xc5,0xf8,0x77] +; EGPR-NEXT: retq # encoding: [0xc3] %a = call x86_amx @llvm.x86.tileloadd64.internal(i16 8, i16 8, i8* %base, i64 %stride) %b1 = call { x86_amx, x86_amx } @llvm.x86.t2rpntlvwz0.internal(i16 8, i16 8, i16 8, i8* %base, i64 %stride) %b2 = call { x86_amx, x86_amx } @llvm.x86.t2rpntlvwz0t1.internal(i16 8, i16 8, i16 8, i8* %base, i64 %stride) diff --git a/llvm/test/CodeGen/X86/avx512fp16-arith.ll b/llvm/test/CodeGen/X86/avx512fp16-arith.ll index 8d811d8d29e06..b264f5fc34688 100644 --- a/llvm/test/CodeGen/X86/avx512fp16-arith.ll +++ b/llvm/test/CodeGen/X86/avx512fp16-arith.ll @@ -179,7 +179,7 @@ define half @add_sh(half %i, half %j, ptr %x.ptr) nounwind readnone { define half @sub_sh(half %i, half %j, ptr %x.ptr) nounwind readnone { ; CHECK-LABEL: sub_sh: ; CHECK: ## %bb.0: -; CHECK-NEXT: vmovsh (%rdi), %xmm2 +; CHECK-NEXT: vmovsh {{.*#+}} xmm2 = mem[0],zero,zero,zero,zero,zero,zero,zero ; CHECK-NEXT: vsubsh %xmm1, %xmm0, %xmm0 ; CHECK-NEXT: vsubsh %xmm0, %xmm2, %xmm0 ; CHECK-NEXT: retq @@ -216,7 +216,7 @@ define half @mul_sh(half %i, half %j, ptr %x.ptr) nounwind readnone { define half @div_sh(half %i, half %j, ptr %x.ptr) nounwind readnone { ; CHECK-LABEL: div_sh: ; CHECK: ## %bb.0: -; CHECK-NEXT: vmovsh (%rdi), %xmm2 +; CHECK-NEXT: vmovsh {{.*#+}} xmm2 = mem[0],zero,zero,zero,zero,zero,zero,zero ; CHECK-NEXT: vdivsh %xmm1, %xmm0, %xmm0 ; CHECK-NEXT: vdivsh %xmm0, %xmm2, %xmm0 ; CHECK-NEXT: retq @@ -329,7 +329,7 @@ define half @fcopysign(half %x, half %y) { ; CHECK-LABEL: fcopysign: ; CHECK: ## %bb.0: ; CHECK-NEXT: vpbroadcastw {{.*#+}} xmm2 = [NaN,NaN,NaN,NaN,NaN,NaN,NaN,NaN] -; CHECK-NEXT: vpternlogd $226, %xmm1, %xmm2, %xmm0 +; CHECK-NEXT: vpternlogd {{.*#+}} xmm0 = xmm1 ^ (xmm2 & (xmm0 ^ xmm1)) ; CHECK-NEXT: retq %a = call half @llvm.copysign.f16(half %x, half %y) ret half %a @@ -341,7 +341,7 @@ define half @fround(half %x) { ; CHECK: ## %bb.0: ; CHECK-NEXT: vpbroadcastw {{.*#+}} 
xmm1 = [-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0] ; CHECK-NEXT: vpbroadcastw {{.*#+}} xmm2 = [4.9976E-1,4.9976E-1,4.9976E-1,4.9976E-1,4.9976E-1,4.9976E-1,4.9976E-1,4.9976E-1] -; CHECK-NEXT: vpternlogq $248, %xmm1, %xmm0, %xmm2 +; CHECK-NEXT: vpternlogq {{.*#+}} xmm2 = xmm2 | (xmm0 & xmm1) ; CHECK-NEXT: vaddsh %xmm2, %xmm0, %xmm0 ; CHECK-NEXT: vrndscalesh $11, %xmm0, %xmm0, %xmm0 ; CHECK-NEXT: retq @@ -384,7 +384,7 @@ declare <8 x half> @llvm.fabs.v8f16(<8 x half>) define <8 x half> @fcopysignv8f16(<8 x half> %x, <8 x half> %y) { ; CHECK-LABEL: fcopysignv8f16: ; CHECK: ## %bb.0: -; CHECK-NEXT: vpternlogd $228, {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to4}, %xmm1, %xmm0 +; CHECK-NEXT: vpternlogd {{.*#+}} xmm0 = xmm1 ^ (mem & (xmm0 ^ xmm1)) ; CHECK-NEXT: retq %a = call <8 x half> @llvm.copysign.v8f16(<8 x half> %x, <8 x half> %y) ret <8 x half> %a @@ -396,7 +396,7 @@ define <8 x half> @roundv8f16(<8 x half> %x) { ; CHECK: ## %bb.0: ; CHECK-NEXT: vpbroadcastw {{.*#+}} xmm1 = [-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0] ; CHECK-NEXT: vpbroadcastw {{.*#+}} xmm2 = [4.9976E-1,4.9976E-1,4.9976E-1,4.9976E-1,4.9976E-1,4.9976E-1,4.9976E-1,4.9976E-1] -; CHECK-NEXT: vpternlogq $248, %xmm1, %xmm0, %xmm2 +; CHECK-NEXT: vpternlogq {{.*#+}} xmm2 = xmm2 | (xmm0 & xmm1) ; CHECK-NEXT: vaddph %xmm2, %xmm0, %xmm0 ; CHECK-NEXT: vrndscaleph $11, %xmm0, %xmm0 ; CHECK-NEXT: retq @@ -439,7 +439,7 @@ declare <16 x half> @llvm.fabs.v16f16(<16 x half>) define <16 x half> @fcopysignv16f16(<16 x half> %x, <16 x half> %y) { ; CHECK-LABEL: fcopysignv16f16: ; CHECK: ## %bb.0: -; CHECK-NEXT: vpternlogd $228, {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %ymm1, %ymm0 +; CHECK-NEXT: vpternlogd {{.*#+}} ymm0 = ymm1 ^ (mem & (ymm0 ^ ymm1)) ; CHECK-NEXT: retq %a = call <16 x half> @llvm.copysign.v16f16(<16 x half> %x, <16 x half> %y) ret <16 x half> %a @@ -451,7 +451,7 @@ define <16 x half> @roundv16f16(<16 x half> %x) { ; CHECK: ## %bb.0: ; CHECK-NEXT: vpbroadcastw {{.*#+}} ymm1 
= [-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0] ; CHECK-NEXT: vpbroadcastw {{.*#+}} ymm2 = [4.9976E-1,4.9976E-1,4.9976E-1,4.9976E-1,4.9976E-1,4.9976E-1,4.9976E-1,4.9976E-1,4.9976E-1,4.9976E-1,4.9976E-1,4.9976E-1,4.9976E-1,4.9976E-1,4.9976E-1,4.9976E-1] -; CHECK-NEXT: vpternlogq $248, %ymm1, %ymm0, %ymm2 +; CHECK-NEXT: vpternlogq {{.*#+}} ymm2 = ymm2 | (ymm0 & ymm1) ; CHECK-NEXT: vaddph %ymm2, %ymm0, %ymm0 ; CHECK-NEXT: vrndscaleph $11, %ymm0, %ymm0 ; CHECK-NEXT: retq @@ -494,7 +494,7 @@ declare <32 x half> @llvm.fabs.v32f16(<32 x half>) define <32 x half> @fcopysignv32f16(<32 x half> %x, <32 x half> %y) { ; CHECK-LABEL: fcopysignv32f16: ; CHECK: ## %bb.0: -; CHECK-NEXT: vpternlogd $228, {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to16}, %zmm1, %zmm0 +; CHECK-NEXT: vpternlogd {{.*#+}} zmm0 = zmm1 ^ (mem & (zmm0 ^ zmm1)) ; CHECK-NEXT: retq %a = call <32 x half> @llvm.copysign.v32f16(<32 x half> %x, <32 x half> %y) ret <32 x half> %a @@ -506,7 +506,7 @@ define <32 x half> @roundv32f16(<32 x half> %x) { ; CHECK: ## %bb.0: ; CHECK-NEXT: vpbroadcastw {{.*#+}} zmm1 = [-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0] ; CHECK-NEXT: vpbroadcastw {{.*#+}} zmm2 = [4.9976E-1,4.9976E-1,4.9976E-1,4.9976E-1,4.9976E-1,4.9976E-1,4.9976E-1,4.9976E-1,4.9976E-1,4.9976E-1,4.9976E-1,4.9976E-1,4.9976E-1,4.9976E-1,4.9976E-1,4.9976E-1,4.9976E-1,4.9976E-1,4.9976E-1,4.9976E-1,4.9976E-1,4.9976E-1,4.9976E-1,4.9976E-1,4.9976E-1,4.9976E-1,4.9976E-1,4.9976E-1,4.9976E-1,4.9976E-1,4.9976E-1,4.9976E-1] -; CHECK-NEXT: vpternlogq $248, %zmm1, %zmm0, %zmm2 +; CHECK-NEXT: vpternlogq {{.*#+}} zmm2 = zmm2 | (zmm0 & zmm1) ; CHECK-NEXT: vaddph %zmm2, %zmm0, %zmm0 ; CHECK-NEXT: vrndscaleph $11, %zmm0, %zmm0 ; CHECK-NEXT: retq diff 
--git a/llvm/test/CodeGen/X86/avx512fp16-cvt.ll b/llvm/test/CodeGen/X86/avx512fp16-cvt.ll index e1e013528738a..3040e58b37997 100644 --- a/llvm/test/CodeGen/X86/avx512fp16-cvt.ll +++ b/llvm/test/CodeGen/X86/avx512fp16-cvt.ll @@ -144,7 +144,7 @@ define float @f16tof32(half %b) nounwind { ; X86-LABEL: f16tof32: ; X86: # %bb.0: ; X86-NEXT: pushl %eax -; X86-NEXT: vmovsh {{[0-9]+}}(%esp), %xmm0 +; X86-NEXT: vmovsh {{.*#+}} xmm0 = mem[0],zero,zero,zero,zero,zero,zero,zero ; X86-NEXT: vcvtsh2ss %xmm0, %xmm0, %xmm0 ; X86-NEXT: vmovss %xmm0, (%esp) ; X86-NEXT: flds (%esp) @@ -166,7 +166,7 @@ define double @f16tof64(half %b) nounwind { ; X86-NEXT: movl %esp, %ebp ; X86-NEXT: andl $-8, %esp ; X86-NEXT: subl $8, %esp -; X86-NEXT: vmovsh 8(%ebp), %xmm0 +; X86-NEXT: vmovsh {{.*#+}} xmm0 = mem[0],zero,zero,zero,zero,zero,zero,zero ; X86-NEXT: vcvtsh2sd %xmm0, %xmm0, %xmm0 ; X86-NEXT: vmovsd %xmm0, (%esp) ; X86-NEXT: fldl (%esp) @@ -356,7 +356,7 @@ define <8 x half> @f64to8f16(<8 x double> %b) { define float @extload_f16_f32(ptr %x) { ; X64-LABEL: extload_f16_f32: ; X64: # %bb.0: -; X64-NEXT: vmovsh (%rdi), %xmm0 +; X64-NEXT: vmovsh {{.*#+}} xmm0 = mem[0],zero,zero,zero,zero,zero,zero,zero ; X64-NEXT: vcvtsh2ss %xmm0, %xmm0, %xmm0 ; X64-NEXT: retq ; @@ -365,7 +365,7 @@ define float @extload_f16_f32(ptr %x) { ; X86-NEXT: pushl %eax ; X86-NEXT: .cfi_def_cfa_offset 8 ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-NEXT: vmovsh (%eax), %xmm0 +; X86-NEXT: vmovsh {{.*#+}} xmm0 = mem[0],zero,zero,zero,zero,zero,zero,zero ; X86-NEXT: vcvtsh2ss %xmm0, %xmm0, %xmm0 ; X86-NEXT: vmovss %xmm0, (%esp) ; X86-NEXT: flds (%esp) @@ -380,7 +380,7 @@ define float @extload_f16_f32(ptr %x) { define double @extload_f16_f64(ptr %x) { ; X64-LABEL: extload_f16_f64: ; X64: # %bb.0: -; X64-NEXT: vmovsh (%rdi), %xmm0 +; X64-NEXT: vmovsh {{.*#+}} xmm0 = mem[0],zero,zero,zero,zero,zero,zero,zero ; X64-NEXT: vcvtsh2sd %xmm0, %xmm0, %xmm0 ; X64-NEXT: retq ; @@ -394,7 +394,7 @@ define double @extload_f16_f64(ptr 
%x) { ; X86-NEXT: andl $-8, %esp ; X86-NEXT: subl $8, %esp ; X86-NEXT: movl 8(%ebp), %eax -; X86-NEXT: vmovsh (%eax), %xmm0 +; X86-NEXT: vmovsh {{.*#+}} xmm0 = mem[0],zero,zero,zero,zero,zero,zero,zero ; X86-NEXT: vcvtsh2sd %xmm0, %xmm0, %xmm0 ; X86-NEXT: vmovsd %xmm0, (%esp) ; X86-NEXT: fldl (%esp) @@ -777,7 +777,7 @@ define i64 @half_to_s64(half %x) { ; ; X86-LABEL: half_to_s64: ; X86: # %bb.0: -; X86-NEXT: vmovsh {{[0-9]+}}(%esp), %xmm0 +; X86-NEXT: vmovsh {{.*#+}} xmm0 = mem[0],zero,zero,zero,zero,zero,zero,zero ; X86-NEXT: vcvttph2qq %xmm0, %xmm0 ; X86-NEXT: vmovd %xmm0, %eax ; X86-NEXT: vpextrd $1, %xmm0, %edx @@ -808,7 +808,7 @@ define i128 @half_to_s128(half %x) { ; X86-NEXT: subl $48, %esp ; X86-NEXT: .cfi_offset %esi, -12 ; X86-NEXT: movl 8(%ebp), %esi -; X86-NEXT: vmovsh 12(%ebp), %xmm0 +; X86-NEXT: vmovsh {{.*#+}} xmm0 = mem[0],zero,zero,zero,zero,zero,zero,zero ; X86-NEXT: vmovsh %xmm0, {{[0-9]+}}(%esp) ; X86-NEXT: leal {{[0-9]+}}(%esp), %eax ; X86-NEXT: movl %eax, (%esp) @@ -880,7 +880,7 @@ define i64 @half_to_u64(half %x) { ; ; X86-LABEL: half_to_u64: ; X86: # %bb.0: -; X86-NEXT: vmovsh {{[0-9]+}}(%esp), %xmm0 +; X86-NEXT: vmovsh {{.*#+}} xmm0 = mem[0],zero,zero,zero,zero,zero,zero,zero ; X86-NEXT: vcvttph2uqq %xmm0, %xmm0 ; X86-NEXT: vmovd %xmm0, %eax ; X86-NEXT: vpextrd $1, %xmm0, %edx @@ -911,7 +911,7 @@ define i128 @half_to_u128(half %x) { ; X86-NEXT: subl $48, %esp ; X86-NEXT: .cfi_offset %esi, -12 ; X86-NEXT: movl 8(%ebp), %esi -; X86-NEXT: vmovsh 12(%ebp), %xmm0 +; X86-NEXT: vmovsh {{.*#+}} xmm0 = mem[0],zero,zero,zero,zero,zero,zero,zero ; X86-NEXT: vmovsh %xmm0, {{[0-9]+}}(%esp) ; X86-NEXT: leal {{[0-9]+}}(%esp), %eax ; X86-NEXT: movl %eax, (%esp) @@ -940,7 +940,7 @@ define x86_fp80 @half_to_f80(half %x) nounwind { ; X86-LABEL: half_to_f80: ; X86: # %bb.0: ; X86-NEXT: pushl %eax -; X86-NEXT: vmovsh {{[0-9]+}}(%esp), %xmm0 +; X86-NEXT: vmovsh {{.*#+}} xmm0 = mem[0],zero,zero,zero,zero,zero,zero,zero ; X86-NEXT: vmovsh %xmm0, (%esp) ; 
X86-NEXT: calll __extendhfxf2 ; X86-NEXT: popl %eax @@ -990,7 +990,7 @@ define fp128 @half_to_f128(half %x) nounwind { ; X86-NEXT: andl $-16, %esp ; X86-NEXT: subl $48, %esp ; X86-NEXT: movl 8(%ebp), %esi -; X86-NEXT: vmovsh 12(%ebp), %xmm0 +; X86-NEXT: vmovsh {{.*#+}} xmm0 = mem[0],zero,zero,zero,zero,zero,zero,zero ; X86-NEXT: vcvtsh2ss %xmm0, %xmm0, %xmm0 ; X86-NEXT: vmovss %xmm0, {{[0-9]+}}(%esp) ; X86-NEXT: leal {{[0-9]+}}(%esp), %eax diff --git a/llvm/test/CodeGen/X86/avx512fp16-fminimum-fmaximum.ll b/llvm/test/CodeGen/X86/avx512fp16-fminimum-fmaximum.ll index 9db57fe68bb42..3ea79c856e1ca 100644 --- a/llvm/test/CodeGen/X86/avx512fp16-fminimum-fmaximum.ll +++ b/llvm/test/CodeGen/X86/avx512fp16-fminimum-fmaximum.ll @@ -41,7 +41,7 @@ define <8 x half> @test_fminimum_v8f16(<8 x half> %x, <8 x half> %y) "no-nans-fp define half @test_fminimum_nnan(half %x, half %y) "no-nans-fp-math"="true" { ; CHECK-LABEL: test_fminimum_nnan: ; CHECK: # %bb.0: -; CHECK-NEXT: vfpclasssh $5, %xmm1, %k1 +; CHECK-NEXT: vfpclasssh $5, %xmm1, %k1 # k1 = isQuietNaN(xmm1) | isNegativeZero(xmm1) ; CHECK-NEXT: vmovaps %xmm0, %xmm2 ; CHECK-NEXT: vmovsh %xmm1, %xmm0, %xmm2 {%k1} ; CHECK-NEXT: vmovsh %xmm0, %xmm0, %xmm1 {%k1} @@ -78,7 +78,7 @@ define half @test_fminimum_combine_cmps(half %x, half %y) { ; CHECK-LABEL: test_fminimum_combine_cmps: ; CHECK: # %bb.0: ; CHECK-NEXT: vdivsh %xmm0, %xmm1, %xmm1 -; CHECK-NEXT: vfpclasssh $5, %xmm0, %k1 +; CHECK-NEXT: vfpclasssh $5, %xmm0, %k1 # k1 = isQuietNaN(xmm0) | isNegativeZero(xmm0) ; CHECK-NEXT: vmovaps %xmm1, %xmm2 ; CHECK-NEXT: vmovsh %xmm0, %xmm0, %xmm2 {%k1} ; CHECK-NEXT: vmovsh %xmm1, %xmm0, %xmm0 {%k1} @@ -121,7 +121,7 @@ define half @test_fmaximum_nnan(half %x, half %y) { ; CHECK: # %bb.0: ; CHECK-NEXT: vaddsh %xmm1, %xmm0, %xmm2 ; CHECK-NEXT: vsubsh %xmm1, %xmm0, %xmm0 -; CHECK-NEXT: vfpclasssh $3, %xmm0, %k1 +; CHECK-NEXT: vfpclasssh $3, %xmm0, %k1 # k1 = isQuietNaN(xmm0) | isPositiveZero(xmm0) ; CHECK-NEXT: vmovaps %xmm2, %xmm1 ; 
CHECK-NEXT: vmovsh %xmm0, %xmm0, %xmm1 {%k1} ; CHECK-NEXT: vmovsh %xmm2, %xmm0, %xmm0 {%k1} @@ -161,7 +161,7 @@ define half @test_fmaximum_combine_cmps(half %x, half %y) { ; CHECK-LABEL: test_fmaximum_combine_cmps: ; CHECK: # %bb.0: ; CHECK-NEXT: vdivsh %xmm0, %xmm1, %xmm1 -; CHECK-NEXT: vfpclasssh $3, %xmm0, %k1 +; CHECK-NEXT: vfpclasssh $3, %xmm0, %k1 # k1 = isQuietNaN(xmm0) | isPositiveZero(xmm0) ; CHECK-NEXT: vmovaps %xmm1, %xmm2 ; CHECK-NEXT: vmovsh %xmm0, %xmm0, %xmm2 {%k1} ; CHECK-NEXT: vmovsh %xmm1, %xmm0, %xmm0 {%k1} diff --git a/llvm/test/CodeGen/X86/avx512fp16-intrinsics.ll b/llvm/test/CodeGen/X86/avx512fp16-intrinsics.ll index 40578fe746edb..85e1890c2b79a 100644 --- a/llvm/test/CodeGen/X86/avx512fp16-intrinsics.ll +++ b/llvm/test/CodeGen/X86/avx512fp16-intrinsics.ll @@ -244,8 +244,8 @@ declare <32 x i1> @llvm.x86.avx512fp16.fpclass.ph.512(<32 x half>, i32) define i32 @test_int_x86_avx512_fpclass_ph_512(<32 x half> %x0) { ; CHECK-LABEL: test_int_x86_avx512_fpclass_ph_512: ; CHECK: # %bb.0: -; CHECK-NEXT: vfpclassph $2, %zmm0, %k1 -; CHECK-NEXT: vfpclassph $4, %zmm0, %k0 {%k1} +; CHECK-NEXT: vfpclassph $2, %zmm0, %k1 # k1 = isPositiveZero(zmm0) +; CHECK-NEXT: vfpclassph $4, %zmm0, %k0 {%k1} # k0 {%k1} = isNegativeZero(zmm0) ; CHECK-NEXT: kmovd %k0, %eax ; CHECK-NEXT: vzeroupper ; CHECK-NEXT: retq @@ -261,8 +261,8 @@ declare i8 @llvm.x86.avx512fp16.mask.fpclass.sh(<8 x half>, i32, i8) define i8 @test_int_x86_avx512_mask_fpclass_sh(<8 x half> %x0) { ; CHECK-LABEL: test_int_x86_avx512_mask_fpclass_sh: ; CHECK: # %bb.0: -; CHECK-NEXT: vfpclasssh $4, %xmm0, %k1 -; CHECK-NEXT: vfpclasssh $2, %xmm0, %k0 {%k1} +; CHECK-NEXT: vfpclasssh $4, %xmm0, %k1 # k1 = isNegativeZero(xmm0) +; CHECK-NEXT: vfpclasssh $2, %xmm0, %k0 {%k1} # k0 {%k1} = isPositiveZero(xmm0) ; CHECK-NEXT: kmovd %k0, %eax ; CHECK-NEXT: # kill: def $al killed $al killed $eax ; CHECK-NEXT: retq @@ -274,7 +274,7 @@ define i8 @test_int_x86_avx512_mask_fpclass_sh(<8 x half> %x0) { define i8 
@test_int_x86_avx512_mask_fpclass_sh_load(ptr %x0ptr) { ; CHECK-LABEL: test_int_x86_avx512_mask_fpclass_sh_load: ; CHECK: # %bb.0: -; CHECK-NEXT: vfpclasssh $4, (%rdi), %k0 +; CHECK-NEXT: vfpclasssh $4, (%rdi), %k0 # k0 = isNegativeZero(mem) ; CHECK-NEXT: kmovd %k0, %eax ; CHECK-NEXT: # kill: def $al killed $al killed $eax ; CHECK-NEXT: retq diff --git a/llvm/test/CodeGen/X86/avx512fp16-unsafe-fp-math.ll b/llvm/test/CodeGen/X86/avx512fp16-unsafe-fp-math.ll index c9b45983e09a8..5b92ce76d5736 100644 --- a/llvm/test/CodeGen/X86/avx512fp16-unsafe-fp-math.ll +++ b/llvm/test/CodeGen/X86/avx512fp16-unsafe-fp-math.ll @@ -112,7 +112,7 @@ define half @test_max_f16(half %a, ptr %ptr) { ; ; CHECK-LABEL: test_max_f16: ; CHECK: # %bb.0: # %entry -; CHECK-NEXT: vmovsh (%rdi), %xmm1 +; CHECK-NEXT: vmovsh {{.*#+}} xmm1 = mem[0],zero,zero,zero,zero,zero,zero,zero ; CHECK-NEXT: vmaxsh %xmm0, %xmm1, %xmm0 ; CHECK-NEXT: retq entry: @@ -130,7 +130,7 @@ define half @test_min_f16(half %a, ptr %ptr) { ; ; CHECK-LABEL: test_min_f16: ; CHECK: # %bb.0: # %entry -; CHECK-NEXT: vmovsh (%rdi), %xmm1 +; CHECK-NEXT: vmovsh {{.*#+}} xmm1 = mem[0],zero,zero,zero,zero,zero,zero,zero ; CHECK-NEXT: vminsh %xmm0, %xmm1, %xmm0 ; CHECK-NEXT: retq entry: diff --git a/llvm/test/CodeGen/X86/avx512fp16vl-intrinsics.ll b/llvm/test/CodeGen/X86/avx512fp16vl-intrinsics.ll index a0fc8180e10b9..47bfea91f58dd 100644 --- a/llvm/test/CodeGen/X86/avx512fp16vl-intrinsics.ll +++ b/llvm/test/CodeGen/X86/avx512fp16vl-intrinsics.ll @@ -1131,7 +1131,7 @@ declare <16 x i1> @llvm.x86.avx512fp16.fpclass.ph.256(<16 x half>, i32) define i8 @test_int_x86_avx512_fpclass_ph_128(<8 x half> %x0) { ; CHECK-LABEL: test_int_x86_avx512_fpclass_ph_128: ; CHECK: # %bb.0: -; CHECK-NEXT: vfpclassph $2, %xmm0, %k1 +; CHECK-NEXT: vfpclassph $2, %xmm0, %k1 # k1 = isPositiveZero(xmm0) ; CHECK-NEXT: vfpclassph $4, %xmm0, %k0 {%k1} ; CHECK-NEXT: kmovd %k0, %eax ; CHECK-NEXT: # kill: def $al killed $al killed $eax @@ -1146,7 +1146,7 @@ define i8 
@test_int_x86_avx512_fpclass_ph_128(<8 x half> %x0) { define i16 @test_int_x86_avx512_fpclass_ph_256(<16 x half> %x0) { ; CHECK-LABEL: test_int_x86_avx512_fpclass_ph_256: ; CHECK: # %bb.0: -; CHECK-NEXT: vfpclassph $2, %ymm0, %k1 +; CHECK-NEXT: vfpclassph $2, %ymm0, %k1 # k1 = isPositiveZero(ymm0) ; CHECK-NEXT: vfpclassph $4, %ymm0, %k0 {%k1} ; CHECK-NEXT: kmovd %k0, %eax ; CHECK-NEXT: # kill: def $ax killed $ax killed $eax diff --git a/llvm/test/CodeGen/X86/bfloat.ll b/llvm/test/CodeGen/X86/bfloat.ll index a6b3e3fd1fd16..d67cd6b62c2b9 100644 --- a/llvm/test/CodeGen/X86/bfloat.ll +++ b/llvm/test/CodeGen/X86/bfloat.ll @@ -708,10 +708,8 @@ define <2 x bfloat> @pr62997(bfloat %a, bfloat %b) { ; ; BF16-LABEL: pr62997: ; BF16: # %bb.0: -; BF16-NEXT: vpextrw $0, %xmm0, %eax -; BF16-NEXT: vpextrw $0, %xmm1, %ecx -; BF16-NEXT: vmovd %eax, %xmm0 -; BF16-NEXT: vpinsrw $1, %ecx, %xmm0, %xmm0 +; BF16-NEXT: vpextrw $0, %xmm1, %eax +; BF16-NEXT: vpinsrw $1, %eax, %xmm0, %xmm0 ; BF16-NEXT: retq ; ; FP16-LABEL: pr62997: @@ -1652,66 +1650,63 @@ define <8 x bfloat> @fptrunc_v8f64(<8 x double> %a) nounwind { ; AVXNC-NEXT: pushq %r12 ; AVXNC-NEXT: pushq %rbx ; AVXNC-NEXT: subq $168, %rsp -; AVXNC-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVXNC-NEXT: vmovups %ymm1, (%rsp) # 32-byte Spill ; AVXNC-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVXNC-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0 ; AVXNC-NEXT: vzeroupper ; AVXNC-NEXT: callq __truncdfbf2@PLT ; AVXNC-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVXNC-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVXNC-NEXT: vpermilpd $1, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload +; AVXNC-NEXT: # xmm0 = mem[1,0] +; AVXNC-NEXT: callq __truncdfbf2@PLT +; AVXNC-NEXT: vmovapd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVXNC-NEXT: vmovupd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVXNC-NEXT: vextractf128 $1, %ymm0, %xmm0 -; 
AVXNC-NEXT: vmovaps %xmm0, (%rsp) # 16-byte Spill +; AVXNC-NEXT: vmovapd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVXNC-NEXT: vshufpd {{.*#+}} xmm0 = xmm0[1,0] ; AVXNC-NEXT: vzeroupper ; AVXNC-NEXT: callq __truncdfbf2@PLT -; AVXNC-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVXNC-NEXT: vpermilpd $1, (%rsp), %xmm0 # 16-byte Folded Reload -; AVXNC-NEXT: # xmm0 = mem[1,0] -; AVXNC-NEXT: callq __truncdfbf2@PLT -; AVXNC-NEXT: vmovapd %xmm0, (%rsp) # 16-byte Spill -; AVXNC-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVXNC-NEXT: vmovapd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVXNC-NEXT: vmovups (%rsp), %ymm0 # 32-byte Reload ; AVXNC-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0 ; AVXNC-NEXT: vzeroupper ; AVXNC-NEXT: callq __truncdfbf2@PLT ; AVXNC-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVXNC-NEXT: vpermilpd $1, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload +; AVXNC-NEXT: vpermilpd $1, (%rsp), %xmm0 # 16-byte Folded Reload ; AVXNC-NEXT: # xmm0 = mem[1,0] ; AVXNC-NEXT: callq __truncdfbf2@PLT ; AVXNC-NEXT: vmovapd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVXNC-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVXNC-NEXT: vmovups (%rsp), %ymm0 # 32-byte Reload ; AVXNC-NEXT: vextractf128 $1, %ymm0, %xmm0 ; AVXNC-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVXNC-NEXT: vzeroupper ; AVXNC-NEXT: callq __truncdfbf2@PLT -; AVXNC-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVXNC-NEXT: vmovaps %xmm0, (%rsp) # 16-byte Spill ; AVXNC-NEXT: vpermilpd $1, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload ; AVXNC-NEXT: # xmm0 = mem[1,0] ; AVXNC-NEXT: callq __truncdfbf2@PLT -; AVXNC-NEXT: vpextrw $0, %xmm0, %eax -; AVXNC-NEXT: movl %eax, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill -; AVXNC-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; AVXNC-NEXT: vpextrw $0, %xmm0, %ebx +; AVXNC-NEXT: vmovdqa (%rsp), 
%xmm0 # 16-byte Reload ; AVXNC-NEXT: vpextrw $0, %xmm0, %ebp ; AVXNC-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; AVXNC-NEXT: vpextrw $0, %xmm0, %r14d ; AVXNC-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; AVXNC-NEXT: vpextrw $0, %xmm0, %r15d -; AVXNC-NEXT: vmovdqa (%rsp), %xmm0 # 16-byte Reload +; AVXNC-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; AVXNC-NEXT: vpextrw $0, %xmm0, %r12d ; AVXNC-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; AVXNC-NEXT: vpextrw $0, %xmm0, %r13d ; AVXNC-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVXNC-NEXT: vpextrw $0, %xmm0, %ebx -; AVXNC-NEXT: vpermilpd $1, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload -; AVXNC-NEXT: # xmm0 = mem[1,0] ; AVXNC-NEXT: callq __truncdfbf2@PLT ; AVXNC-NEXT: vpextrw $0, %xmm0, %eax -; AVXNC-NEXT: vmovd %ebx, %xmm0 -; AVXNC-NEXT: vpinsrw $1, %eax, %xmm0, %xmm0 -; AVXNC-NEXT: vpinsrw $2, %r13d, %xmm0, %xmm0 +; AVXNC-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; AVXNC-NEXT: vpinsrw $1, %r13d, %xmm0, %xmm0 +; AVXNC-NEXT: vpinsrw $2, %eax, %xmm0, %xmm0 ; AVXNC-NEXT: vpinsrw $3, %r12d, %xmm0, %xmm0 ; AVXNC-NEXT: vpinsrw $4, %r15d, %xmm0, %xmm0 ; AVXNC-NEXT: vpinsrw $5, %r14d, %xmm0, %xmm0 ; AVXNC-NEXT: vpinsrw $6, %ebp, %xmm0, %xmm0 -; AVXNC-NEXT: vpinsrw $7, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 4-byte Folded Reload +; AVXNC-NEXT: vpinsrw $7, %ebx, %xmm0, %xmm0 ; AVXNC-NEXT: addq $168, %rsp ; AVXNC-NEXT: popq %rbx ; AVXNC-NEXT: popq %r12 diff --git a/llvm/test/CodeGen/X86/canonicalize-vars-f16-type.ll b/llvm/test/CodeGen/X86/canonicalize-vars-f16-type.ll index fdf0bf3f692d6..e911a24d830f7 100644 --- a/llvm/test/CodeGen/X86/canonicalize-vars-f16-type.ll +++ b/llvm/test/CodeGen/X86/canonicalize-vars-f16-type.ll @@ -133,11 +133,7 @@ define half @complex_canonicalize_fmul_half(half %a, half %b) nounwind { ; ; AVX512-LABEL: complex_canonicalize_fmul_half: ; AVX512: # %bb.0: # %entry -; 
AVX512-NEXT: vpextrw $0, %xmm1, %eax -; AVX512-NEXT: vpextrw $0, %xmm0, %ecx -; AVX512-NEXT: vmovd %ecx, %xmm0 ; AVX512-NEXT: vcvtph2ps %xmm0, %xmm0 -; AVX512-NEXT: vmovd %eax, %xmm1 ; AVX512-NEXT: vcvtph2ps %xmm1, %xmm1 ; AVX512-NEXT: vsubss %xmm1, %xmm0, %xmm0 ; AVX512-NEXT: vcvtps2ph $4, %xmm0, %xmm0 diff --git a/llvm/test/CodeGen/X86/cvt16-2.ll b/llvm/test/CodeGen/X86/cvt16-2.ll index bab6768b16322..8dbbc57f10564 100644 --- a/llvm/test/CodeGen/X86/cvt16-2.ll +++ b/llvm/test/CodeGen/X86/cvt16-2.ll @@ -34,7 +34,7 @@ define float @test2(ptr nocapture %src) { ; ; FP16-LABEL: test2: ; FP16: # %bb.0: -; FP16-NEXT: vmovsh (%rdi), %xmm0 +; FP16-NEXT: vmovsh {{.*#+}} xmm0 = mem[0],zero,zero,zero,zero,zero,zero,zero ; FP16-NEXT: vcvtsh2ss %xmm0, %xmm0, %xmm0 ; FP16-NEXT: retq %1 = load i16, ptr %src, align 2 @@ -77,7 +77,7 @@ define double @test4(ptr nocapture %src) { ; ; FP16-LABEL: test4: ; FP16: # %bb.0: -; FP16-NEXT: vmovsh (%rdi), %xmm0 +; FP16-NEXT: vmovsh {{.*#+}} xmm0 = mem[0],zero,zero,zero,zero,zero,zero,zero ; FP16-NEXT: vcvtsh2sd %xmm0, %xmm0, %xmm0 ; FP16-NEXT: retq %1 = load i16, ptr %src, align 2 @@ -123,7 +123,7 @@ define x86_fp80 @test6(ptr nocapture %src) { ; FP16: # %bb.0: ; FP16-NEXT: pushq %rax ; FP16-NEXT: .cfi_def_cfa_offset 16 -; FP16-NEXT: vmovsh (%rdi), %xmm0 +; FP16-NEXT: vmovsh {{.*#+}} xmm0 = mem[0],zero,zero,zero,zero,zero,zero,zero ; FP16-NEXT: callq __extendhfxf2@PLT ; FP16-NEXT: popq %rax ; FP16-NEXT: .cfi_def_cfa_offset 8 diff --git a/llvm/test/CodeGen/X86/cvt16.ll b/llvm/test/CodeGen/X86/cvt16.ll index c7ef353f7f603..efc457e35e7f3 100644 --- a/llvm/test/CodeGen/X86/cvt16.ll +++ b/llvm/test/CodeGen/X86/cvt16.ll @@ -59,8 +59,7 @@ define float @test2(ptr nocapture %src) nounwind { ; ; F16C-LABEL: test2: ; F16C: # %bb.0: -; F16C-NEXT: movzwl (%rdi), %eax -; F16C-NEXT: vmovd %eax, %xmm0 +; F16C-NEXT: vpinsrw $0, (%rdi), %xmm0, %xmm0 ; F16C-NEXT: vcvtph2ps %xmm0, %xmm0 ; F16C-NEXT: retq ; @@ -119,8 +118,7 @@ define double @test4(ptr nocapture 
%src) nounwind { ; ; F16C-LABEL: test4: ; F16C: # %bb.0: -; F16C-NEXT: movzwl (%rdi), %eax -; F16C-NEXT: vmovd %eax, %xmm0 +; F16C-NEXT: vpinsrw $0, (%rdi), %xmm0, %xmm0 ; F16C-NEXT: vcvtph2ps %xmm0, %xmm0 ; F16C-NEXT: vcvtss2sd %xmm0, %xmm0, %xmm0 ; F16C-NEXT: retq diff --git a/llvm/test/CodeGen/X86/dag-large-offset.ll b/llvm/test/CodeGen/X86/dag-large-offset.ll new file mode 100644 index 0000000000000..2774a93993153 --- /dev/null +++ b/llvm/test/CodeGen/X86/dag-large-offset.ll @@ -0,0 +1,47 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5 +; RUN: llc < %s -mtriple=i386 --frame-pointer=all | FileCheck %s + +; ISel will try to fold pointer arithmetic into the address displacement. However, we don't +; want to do that if the offset is very close to the expressible limit because the final frame +; layout may push it over/under the limit. + +define i32 @foo(i1 %b) #0 { +; CHECK-LABEL: foo: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: pushl %ebp +; CHECK-NEXT: .cfi_def_cfa_offset 8 +; CHECK-NEXT: .cfi_offset %ebp, -8 +; CHECK-NEXT: movl %esp, %ebp +; CHECK-NEXT: .cfi_def_cfa_register %ebp +; CHECK-NEXT: subl $8, %esp +; CHECK-NEXT: movl __stack_chk_guard, %eax +; CHECK-NEXT: movl %eax, -4(%ebp) +; CHECK-NEXT: testb $1, 8(%ebp) +; CHECK-NEXT: jne .LBB0_1 +; CHECK-NEXT: # %bb.2: # %entry +; CHECK-NEXT: xorl %eax, %eax +; CHECK-NEXT: jmp .LBB0_3 +; CHECK-NEXT: .LBB0_1: +; CHECK-NEXT: movl $-2147483647, %eax # imm = 0x80000001 +; CHECK-NEXT: leal -5(%ebp,%eax), %eax +; CHECK-NEXT: .LBB0_3: # %entry +; CHECK-NEXT: movl __stack_chk_guard, %ecx +; CHECK-NEXT: cmpl -4(%ebp), %ecx +; CHECK-NEXT: jne .LBB0_5 +; CHECK-NEXT: # %bb.4: # %entry +; CHECK-NEXT: addl $8, %esp +; CHECK-NEXT: popl %ebp +; CHECK-NEXT: .cfi_def_cfa %esp, 4 +; CHECK-NEXT: retl +; CHECK-NEXT: .LBB0_5: # %entry +; CHECK-NEXT: .cfi_def_cfa %ebp, 8 +; CHECK-NEXT: calll __stack_chk_fail +entry: + %a = alloca i8, align 1 + %0 = ptrtoint ptr %a to i32 + %sub = 
add i32 %0, -2147483647 + %retval.0 = select i1 %b, i32 %sub, i32 0 + ret i32 %retval.0 +} + +attributes #0 = { sspreq } diff --git a/llvm/test/CodeGen/X86/expand-vp-int-intrinsics.ll b/llvm/test/CodeGen/X86/expand-vp-int-intrinsics.ll index f26368c02de2b..dbfa69d497698 100644 --- a/llvm/test/CodeGen/X86/expand-vp-int-intrinsics.ll +++ b/llvm/test/CodeGen/X86/expand-vp-int-intrinsics.ll @@ -1347,3 +1347,815 @@ define <4 x i32> @vp_bswap_v4i32(<4 x i32> %va, <4 x i1> %m, i32 zeroext %evl) { ret <4 x i32> %v } declare <4 x i32> @llvm.vp.bswap.v4i32(<4 x i32>, <4 x i1>, i32) + +define <4 x i32> @vp_ctpop_v4i32(<4 x i32> %va, <4 x i1> %m, i32 zeroext %evl) { +; X86-LABEL: vp_ctpop_v4i32: +; X86: # %bb.0: +; X86-NEXT: vbroadcastss {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; X86-NEXT: vpand %xmm1, %xmm0, %xmm2 +; X86-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] +; X86-NEXT: vpshufb %xmm2, %xmm3, %xmm2 +; X86-NEXT: vpsrlw $4, %xmm0, %xmm0 +; X86-NEXT: vpand %xmm1, %xmm0, %xmm0 +; X86-NEXT: vpshufb %xmm0, %xmm3, %xmm0 +; X86-NEXT: vpaddb %xmm2, %xmm0, %xmm0 +; X86-NEXT: vpxor %xmm1, %xmm1, %xmm1 +; X86-NEXT: vpunpckhdq {{.*#+}} xmm2 = xmm0[2],xmm1[2],xmm0[3],xmm1[3] +; X86-NEXT: vpsadbw %xmm1, %xmm2, %xmm2 +; X86-NEXT: vpmovzxdq {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero +; X86-NEXT: vpsadbw %xmm1, %xmm0, %xmm0 +; X86-NEXT: vpackuswb %xmm2, %xmm0, %xmm0 +; X86-NEXT: retl +; +; SSE-LABEL: vp_ctpop_v4i32: +; SSE: # %bb.0: +; SSE-NEXT: movdqa %xmm0, %xmm1 +; SSE-NEXT: psrlw $1, %xmm1 +; SSE-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 +; SSE-NEXT: psubb %xmm1, %xmm0 +; SSE-NEXT: movdqa {{.*#+}} xmm1 = [51,51,51,51,51,51,51,51,51,51,51,51,51,51,51,51] +; SSE-NEXT: movdqa %xmm0, %xmm2 +; SSE-NEXT: pand %xmm1, %xmm2 +; SSE-NEXT: psrlw $2, %xmm0 +; SSE-NEXT: pand %xmm1, %xmm0 +; SSE-NEXT: paddb %xmm2, %xmm0 +; SSE-NEXT: movdqa %xmm0, %xmm1 +; SSE-NEXT: psrlw $4, %xmm1 +; SSE-NEXT: paddb %xmm1, %xmm0 +; SSE-NEXT: pand 
{{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 +; SSE-NEXT: pxor %xmm1, %xmm1 +; SSE-NEXT: movdqa %xmm0, %xmm2 +; SSE-NEXT: punpckhdq {{.*#+}} xmm2 = xmm2[2],xmm1[2],xmm2[3],xmm1[3] +; SSE-NEXT: psadbw %xmm1, %xmm2 +; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] +; SSE-NEXT: psadbw %xmm1, %xmm0 +; SSE-NEXT: packuswb %xmm2, %xmm0 +; SSE-NEXT: retq +; +; AVX1-LABEL: vp_ctpop_v4i32: +; AVX1: # %bb.0: +; AVX1-NEXT: vbroadcastss {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm2 +; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] +; AVX1-NEXT: vpshufb %xmm2, %xmm3, %xmm2 +; AVX1-NEXT: vpsrlw $4, %xmm0, %xmm0 +; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm0 +; AVX1-NEXT: vpshufb %xmm0, %xmm3, %xmm0 +; AVX1-NEXT: vpaddb %xmm2, %xmm0, %xmm0 +; AVX1-NEXT: vpxor %xmm1, %xmm1, %xmm1 +; AVX1-NEXT: vpunpckhdq {{.*#+}} xmm2 = xmm0[2],xmm1[2],xmm0[3],xmm1[3] +; AVX1-NEXT: vpsadbw %xmm1, %xmm2, %xmm2 +; AVX1-NEXT: vpmovzxdq {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero +; AVX1-NEXT: vpsadbw %xmm1, %xmm0, %xmm0 +; AVX1-NEXT: vpackuswb %xmm2, %xmm0, %xmm0 +; AVX1-NEXT: retq +; +; AVX2-LABEL: vp_ctpop_v4i32: +; AVX2: # %bb.0: +; AVX2-NEXT: vpbroadcastb {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm2 +; AVX2-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] +; AVX2-NEXT: vpshufb %xmm2, %xmm3, %xmm2 +; AVX2-NEXT: vpsrlw $4, %xmm0, %xmm0 +; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm0 +; AVX2-NEXT: vpshufb %xmm0, %xmm3, %xmm0 +; AVX2-NEXT: vpaddb %xmm2, %xmm0, %xmm0 +; AVX2-NEXT: vpxor %xmm1, %xmm1, %xmm1 +; AVX2-NEXT: vpunpckhdq {{.*#+}} xmm2 = xmm0[2],xmm1[2],xmm0[3],xmm1[3] +; AVX2-NEXT: vpsadbw %xmm1, %xmm2, %xmm2 +; AVX2-NEXT: vpmovzxdq {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero +; AVX2-NEXT: vpsadbw %xmm1, %xmm0, %xmm0 +; AVX2-NEXT: vpackuswb %xmm2, %xmm0, %xmm0 +; AVX2-NEXT: retq +; +; AVX512-LABEL: vp_ctpop_v4i32: +; AVX512: # %bb.0: +; 
AVX512-NEXT: vpbroadcastd {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX512-NEXT: vpand %xmm1, %xmm0, %xmm2 +; AVX512-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] +; AVX512-NEXT: vpshufb %xmm2, %xmm3, %xmm2 +; AVX512-NEXT: vpsrlw $4, %xmm0, %xmm0 +; AVX512-NEXT: vpand %xmm1, %xmm0, %xmm0 +; AVX512-NEXT: vpshufb %xmm0, %xmm3, %xmm0 +; AVX512-NEXT: vpaddb %xmm2, %xmm0, %xmm0 +; AVX512-NEXT: vpxor %xmm1, %xmm1, %xmm1 +; AVX512-NEXT: vpunpckhdq {{.*#+}} xmm2 = xmm0[2],xmm1[2],xmm0[3],xmm1[3] +; AVX512-NEXT: vpsadbw %xmm1, %xmm2, %xmm2 +; AVX512-NEXT: vpmovzxdq {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero +; AVX512-NEXT: vpsadbw %xmm1, %xmm0, %xmm0 +; AVX512-NEXT: vpackuswb %xmm2, %xmm0, %xmm0 +; AVX512-NEXT: retq + %v = call <4 x i32> @llvm.vp.ctpop.v4i32(<4 x i32> %va, <4 x i1> %m, i32 %evl) + ret <4 x i32> %v +} +declare <4 x i32> @llvm.vp.ctpop.v4i32(<4 x i32>, <4 x i1>, i32) + +define <4 x i32> @vp_ctlz_v4i32(<4 x i32> %va, <4 x i1> %m, i32 zeroext %evl) { +; X86-LABEL: vp_ctlz_v4i32: +; X86: # %bb.0: +; X86-NEXT: vmovq {{.*#+}} xmm1 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0] +; X86-NEXT: vpshufb %xmm0, %xmm1, %xmm2 +; X86-NEXT: vpsrlw $4, %xmm0, %xmm3 +; X86-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}, %xmm3, %xmm3 +; X86-NEXT: vpxor %xmm4, %xmm4, %xmm4 +; X86-NEXT: vpcmpeqb %xmm4, %xmm3, %xmm5 +; X86-NEXT: vpand %xmm5, %xmm2, %xmm2 +; X86-NEXT: vpshufb %xmm3, %xmm1, %xmm1 +; X86-NEXT: vpaddb %xmm1, %xmm2, %xmm1 +; X86-NEXT: vpcmpeqb %xmm4, %xmm0, %xmm2 +; X86-NEXT: vpsrlw $8, %xmm2, %xmm2 +; X86-NEXT: vpand %xmm2, %xmm1, %xmm2 +; X86-NEXT: vpsrlw $8, %xmm1, %xmm1 +; X86-NEXT: vpaddw %xmm2, %xmm1, %xmm1 +; X86-NEXT: vpcmpeqw %xmm4, %xmm0, %xmm0 +; X86-NEXT: vpsrld $16, %xmm0, %xmm0 +; X86-NEXT: vpand %xmm0, %xmm1, %xmm0 +; X86-NEXT: vpsrld $16, %xmm1, %xmm1 +; X86-NEXT: vpaddd %xmm0, %xmm1, %xmm0 +; X86-NEXT: retl +; +; SSE-LABEL: vp_ctlz_v4i32: +; SSE: # %bb.0: +; SSE-NEXT: movdqa %xmm0, %xmm1 +; SSE-NEXT: psrld $1, %xmm1 +; SSE-NEXT: 
por %xmm1, %xmm0 +; SSE-NEXT: movdqa %xmm0, %xmm1 +; SSE-NEXT: psrld $2, %xmm1 +; SSE-NEXT: por %xmm1, %xmm0 +; SSE-NEXT: movdqa %xmm0, %xmm1 +; SSE-NEXT: psrld $4, %xmm1 +; SSE-NEXT: por %xmm1, %xmm0 +; SSE-NEXT: movdqa %xmm0, %xmm1 +; SSE-NEXT: psrld $8, %xmm1 +; SSE-NEXT: por %xmm1, %xmm0 +; SSE-NEXT: movdqa %xmm0, %xmm1 +; SSE-NEXT: psrld $16, %xmm1 +; SSE-NEXT: por %xmm1, %xmm0 +; SSE-NEXT: pcmpeqd %xmm1, %xmm1 +; SSE-NEXT: pxor %xmm1, %xmm0 +; SSE-NEXT: movdqa %xmm0, %xmm1 +; SSE-NEXT: psrlw $1, %xmm1 +; SSE-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 +; SSE-NEXT: psubb %xmm1, %xmm0 +; SSE-NEXT: movdqa {{.*#+}} xmm1 = [51,51,51,51,51,51,51,51,51,51,51,51,51,51,51,51] +; SSE-NEXT: movdqa %xmm0, %xmm2 +; SSE-NEXT: pand %xmm1, %xmm2 +; SSE-NEXT: psrlw $2, %xmm0 +; SSE-NEXT: pand %xmm1, %xmm0 +; SSE-NEXT: paddb %xmm2, %xmm0 +; SSE-NEXT: movdqa %xmm0, %xmm1 +; SSE-NEXT: psrlw $4, %xmm1 +; SSE-NEXT: paddb %xmm1, %xmm0 +; SSE-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 +; SSE-NEXT: pxor %xmm1, %xmm1 +; SSE-NEXT: movdqa %xmm0, %xmm2 +; SSE-NEXT: punpckhdq {{.*#+}} xmm2 = xmm2[2],xmm1[2],xmm2[3],xmm1[3] +; SSE-NEXT: psadbw %xmm1, %xmm2 +; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] +; SSE-NEXT: psadbw %xmm1, %xmm0 +; SSE-NEXT: packuswb %xmm2, %xmm0 +; SSE-NEXT: retq +; +; AVX1-LABEL: vp_ctlz_v4i32: +; AVX1: # %bb.0: +; AVX1-NEXT: vmovq {{.*#+}} xmm1 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0] +; AVX1-NEXT: vpshufb %xmm0, %xmm1, %xmm2 +; AVX1-NEXT: vpsrlw $4, %xmm0, %xmm3 +; AVX1-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm3, %xmm3 +; AVX1-NEXT: vpxor %xmm4, %xmm4, %xmm4 +; AVX1-NEXT: vpcmpeqb %xmm4, %xmm3, %xmm5 +; AVX1-NEXT: vpand %xmm5, %xmm2, %xmm2 +; AVX1-NEXT: vpshufb %xmm3, %xmm1, %xmm1 +; AVX1-NEXT: vpaddb %xmm1, %xmm2, %xmm1 +; AVX1-NEXT: vpcmpeqb %xmm4, %xmm0, %xmm2 +; AVX1-NEXT: vpsrlw $8, %xmm2, %xmm2 +; AVX1-NEXT: vpand %xmm2, %xmm1, %xmm2 +; AVX1-NEXT: vpsrlw $8, %xmm1, %xmm1 +; AVX1-NEXT: vpaddw %xmm2, %xmm1, %xmm1 +; 
AVX1-NEXT: vpcmpeqw %xmm4, %xmm0, %xmm0 +; AVX1-NEXT: vpsrld $16, %xmm0, %xmm0 +; AVX1-NEXT: vpand %xmm0, %xmm1, %xmm0 +; AVX1-NEXT: vpsrld $16, %xmm1, %xmm1 +; AVX1-NEXT: vpaddd %xmm0, %xmm1, %xmm0 +; AVX1-NEXT: retq +; +; AVX2-LABEL: vp_ctlz_v4i32: +; AVX2: # %bb.0: +; AVX2-NEXT: vmovq {{.*#+}} xmm1 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0] +; AVX2-NEXT: vpshufb %xmm0, %xmm1, %xmm2 +; AVX2-NEXT: vpsrlw $4, %xmm0, %xmm3 +; AVX2-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm3, %xmm3 +; AVX2-NEXT: vpxor %xmm4, %xmm4, %xmm4 +; AVX2-NEXT: vpcmpeqb %xmm4, %xmm3, %xmm5 +; AVX2-NEXT: vpand %xmm5, %xmm2, %xmm2 +; AVX2-NEXT: vpshufb %xmm3, %xmm1, %xmm1 +; AVX2-NEXT: vpaddb %xmm1, %xmm2, %xmm1 +; AVX2-NEXT: vpcmpeqb %xmm4, %xmm0, %xmm2 +; AVX2-NEXT: vpsrlw $8, %xmm2, %xmm2 +; AVX2-NEXT: vpand %xmm2, %xmm1, %xmm2 +; AVX2-NEXT: vpsrlw $8, %xmm1, %xmm1 +; AVX2-NEXT: vpaddw %xmm2, %xmm1, %xmm1 +; AVX2-NEXT: vpcmpeqw %xmm4, %xmm0, %xmm0 +; AVX2-NEXT: vpsrld $16, %xmm0, %xmm0 +; AVX2-NEXT: vpand %xmm0, %xmm1, %xmm0 +; AVX2-NEXT: vpsrld $16, %xmm1, %xmm1 +; AVX2-NEXT: vpaddd %xmm0, %xmm1, %xmm0 +; AVX2-NEXT: retq +; +; AVX512-LABEL: vp_ctlz_v4i32: +; AVX512: # %bb.0: +; AVX512-NEXT: vmovq {{.*#+}} xmm1 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0] +; AVX512-NEXT: vpshufb %xmm0, %xmm1, %xmm2 +; AVX512-NEXT: vpsrlw $4, %xmm0, %xmm3 +; AVX512-NEXT: vpandd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to4}, %xmm3, %xmm3 +; AVX512-NEXT: vpxor %xmm4, %xmm4, %xmm4 +; AVX512-NEXT: vpcmpeqb %xmm4, %xmm3, %xmm5 +; AVX512-NEXT: vpand %xmm5, %xmm2, %xmm2 +; AVX512-NEXT: vpshufb %xmm3, %xmm1, %xmm1 +; AVX512-NEXT: vpaddb %xmm1, %xmm2, %xmm1 +; AVX512-NEXT: vpcmpeqb %xmm4, %xmm0, %xmm2 +; AVX512-NEXT: vpsrlw $8, %xmm2, %xmm2 +; AVX512-NEXT: vpand %xmm2, %xmm1, %xmm2 +; AVX512-NEXT: vpsrlw $8, %xmm1, %xmm1 +; AVX512-NEXT: vpaddw %xmm2, %xmm1, %xmm1 +; AVX512-NEXT: vpcmpeqw %xmm4, %xmm0, %xmm0 +; AVX512-NEXT: vpsrld $16, %xmm0, %xmm0 +; AVX512-NEXT: vpand %xmm0, %xmm1, %xmm0 +; AVX512-NEXT: vpsrld $16, %xmm1, %xmm1 +; 
AVX512-NEXT: vpaddd %xmm0, %xmm1, %xmm0 +; AVX512-NEXT: retq + %v = call <4 x i32> @llvm.vp.ctlz.v4i32(<4 x i32> %va, i1 false, <4 x i1> %m, i32 %evl) + ret <4 x i32> %v +} +declare <4 x i32> @llvm.vp.ctlz.v4i32(<4 x i32>, i1, <4 x i1>, i32) + +define <4 x i32> @vp_cttz_v4i32(<4 x i32> %va, <4 x i1> %m, i32 zeroext %evl) { +; X86-LABEL: vp_cttz_v4i32: +; X86: # %bb.0: +; X86-NEXT: vpcmpeqd %xmm1, %xmm1, %xmm1 +; X86-NEXT: vpaddd %xmm1, %xmm0, %xmm1 +; X86-NEXT: vpandn %xmm1, %xmm0, %xmm0 +; X86-NEXT: vbroadcastss {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; X86-NEXT: vpand %xmm1, %xmm0, %xmm2 +; X86-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] +; X86-NEXT: vpshufb %xmm2, %xmm3, %xmm2 +; X86-NEXT: vpsrlw $4, %xmm0, %xmm0 +; X86-NEXT: vpand %xmm1, %xmm0, %xmm0 +; X86-NEXT: vpshufb %xmm0, %xmm3, %xmm0 +; X86-NEXT: vpaddb %xmm2, %xmm0, %xmm0 +; X86-NEXT: vpxor %xmm1, %xmm1, %xmm1 +; X86-NEXT: vpunpckhdq {{.*#+}} xmm2 = xmm0[2],xmm1[2],xmm0[3],xmm1[3] +; X86-NEXT: vpsadbw %xmm1, %xmm2, %xmm2 +; X86-NEXT: vpmovzxdq {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero +; X86-NEXT: vpsadbw %xmm1, %xmm0, %xmm0 +; X86-NEXT: vpackuswb %xmm2, %xmm0, %xmm0 +; X86-NEXT: retl +; +; SSE-LABEL: vp_cttz_v4i32: +; SSE: # %bb.0: +; SSE-NEXT: pcmpeqd %xmm1, %xmm1 +; SSE-NEXT: paddd %xmm0, %xmm1 +; SSE-NEXT: pandn %xmm1, %xmm0 +; SSE-NEXT: movdqa %xmm0, %xmm1 +; SSE-NEXT: psrlw $1, %xmm1 +; SSE-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 +; SSE-NEXT: psubb %xmm1, %xmm0 +; SSE-NEXT: movdqa {{.*#+}} xmm1 = [51,51,51,51,51,51,51,51,51,51,51,51,51,51,51,51] +; SSE-NEXT: movdqa %xmm0, %xmm2 +; SSE-NEXT: pand %xmm1, %xmm2 +; SSE-NEXT: psrlw $2, %xmm0 +; SSE-NEXT: pand %xmm1, %xmm0 +; SSE-NEXT: paddb %xmm2, %xmm0 +; SSE-NEXT: movdqa %xmm0, %xmm1 +; SSE-NEXT: psrlw $4, %xmm1 +; SSE-NEXT: paddb %xmm1, %xmm0 +; SSE-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 +; SSE-NEXT: pxor %xmm1, %xmm1 +; SSE-NEXT: movdqa %xmm0, %xmm2 +; SSE-NEXT: punpckhdq {{.*#+}} 
xmm2 = xmm2[2],xmm1[2],xmm2[3],xmm1[3] +; SSE-NEXT: psadbw %xmm1, %xmm2 +; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] +; SSE-NEXT: psadbw %xmm1, %xmm0 +; SSE-NEXT: packuswb %xmm2, %xmm0 +; SSE-NEXT: retq +; +; AVX1-LABEL: vp_cttz_v4i32: +; AVX1: # %bb.0: +; AVX1-NEXT: vpcmpeqd %xmm1, %xmm1, %xmm1 +; AVX1-NEXT: vpaddd %xmm1, %xmm0, %xmm1 +; AVX1-NEXT: vpandn %xmm1, %xmm0, %xmm0 +; AVX1-NEXT: vbroadcastss {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm2 +; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] +; AVX1-NEXT: vpshufb %xmm2, %xmm3, %xmm2 +; AVX1-NEXT: vpsrlw $4, %xmm0, %xmm0 +; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm0 +; AVX1-NEXT: vpshufb %xmm0, %xmm3, %xmm0 +; AVX1-NEXT: vpaddb %xmm2, %xmm0, %xmm0 +; AVX1-NEXT: vpxor %xmm1, %xmm1, %xmm1 +; AVX1-NEXT: vpunpckhdq {{.*#+}} xmm2 = xmm0[2],xmm1[2],xmm0[3],xmm1[3] +; AVX1-NEXT: vpsadbw %xmm1, %xmm2, %xmm2 +; AVX1-NEXT: vpmovzxdq {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero +; AVX1-NEXT: vpsadbw %xmm1, %xmm0, %xmm0 +; AVX1-NEXT: vpackuswb %xmm2, %xmm0, %xmm0 +; AVX1-NEXT: retq +; +; AVX2-LABEL: vp_cttz_v4i32: +; AVX2: # %bb.0: +; AVX2-NEXT: vpcmpeqd %xmm1, %xmm1, %xmm1 +; AVX2-NEXT: vpaddd %xmm1, %xmm0, %xmm1 +; AVX2-NEXT: vpandn %xmm1, %xmm0, %xmm0 +; AVX2-NEXT: vpbroadcastb {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm2 +; AVX2-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] +; AVX2-NEXT: vpshufb %xmm2, %xmm3, %xmm2 +; AVX2-NEXT: vpsrlw $4, %xmm0, %xmm0 +; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm0 +; AVX2-NEXT: vpshufb %xmm0, %xmm3, %xmm0 +; AVX2-NEXT: vpaddb %xmm2, %xmm0, %xmm0 +; AVX2-NEXT: vpxor %xmm1, %xmm1, %xmm1 +; AVX2-NEXT: vpunpckhdq {{.*#+}} xmm2 = xmm0[2],xmm1[2],xmm0[3],xmm1[3] +; AVX2-NEXT: vpsadbw %xmm1, %xmm2, %xmm2 +; AVX2-NEXT: vpmovzxdq {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero +; AVX2-NEXT: vpsadbw %xmm1, %xmm0, %xmm0 +; AVX2-NEXT: 
vpackuswb %xmm2, %xmm0, %xmm0 +; AVX2-NEXT: retq +; +; AVX512-LABEL: vp_cttz_v4i32: +; AVX512: # %bb.0: +; AVX512-NEXT: vpcmpeqd %xmm1, %xmm1, %xmm1 +; AVX512-NEXT: vpaddd %xmm1, %xmm0, %xmm1 +; AVX512-NEXT: vpandn %xmm1, %xmm0, %xmm0 +; AVX512-NEXT: vpbroadcastd {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX512-NEXT: vpand %xmm1, %xmm0, %xmm2 +; AVX512-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] +; AVX512-NEXT: vpshufb %xmm2, %xmm3, %xmm2 +; AVX512-NEXT: vpsrlw $4, %xmm0, %xmm0 +; AVX512-NEXT: vpand %xmm1, %xmm0, %xmm0 +; AVX512-NEXT: vpshufb %xmm0, %xmm3, %xmm0 +; AVX512-NEXT: vpaddb %xmm2, %xmm0, %xmm0 +; AVX512-NEXT: vpxor %xmm1, %xmm1, %xmm1 +; AVX512-NEXT: vpunpckhdq {{.*#+}} xmm2 = xmm0[2],xmm1[2],xmm0[3],xmm1[3] +; AVX512-NEXT: vpsadbw %xmm1, %xmm2, %xmm2 +; AVX512-NEXT: vpmovzxdq {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero +; AVX512-NEXT: vpsadbw %xmm1, %xmm0, %xmm0 +; AVX512-NEXT: vpackuswb %xmm2, %xmm0, %xmm0 +; AVX512-NEXT: retq + %v = call <4 x i32> @llvm.vp.cttz.v4i32(<4 x i32> %va, i1 false, <4 x i1> %m, i32 %evl) + ret <4 x i32> %v +} +declare <4 x i32> @llvm.vp.cttz.v4i32(<4 x i32>, i1, <4 x i1>, i32) + + +define <4 x i32> @vp_sadd_sat_v4i32(<4 x i32> %va, <4 x i32> %vb, <4 x i1> %m, i32 zeroext %evl) { +; X86-LABEL: vp_sadd_sat_v4i32: +; X86: # %bb.0: +; X86-NEXT: vpaddd %xmm1, %xmm0, %xmm2 +; X86-NEXT: vpcmpgtd %xmm2, %xmm0, %xmm0 +; X86-NEXT: vpxor %xmm0, %xmm1, %xmm0 +; X86-NEXT: vpsrad $31, %xmm2, %xmm1 +; X86-NEXT: vpxor {{\.?LCPI[0-9]+_[0-9]+}}, %xmm1, %xmm1 +; X86-NEXT: vblendvps %xmm0, %xmm1, %xmm2, %xmm0 +; X86-NEXT: retl +; +; SSE-LABEL: vp_sadd_sat_v4i32: +; SSE: # %bb.0: +; SSE-NEXT: pxor %xmm2, %xmm2 +; SSE-NEXT: pcmpgtd %xmm1, %xmm2 +; SSE-NEXT: paddd %xmm0, %xmm1 +; SSE-NEXT: pcmpgtd %xmm1, %xmm0 +; SSE-NEXT: pxor %xmm2, %xmm0 +; SSE-NEXT: movdqa %xmm0, %xmm2 +; SSE-NEXT: pandn %xmm1, %xmm2 +; SSE-NEXT: psrad $31, %xmm1 +; SSE-NEXT: pxor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 +; SSE-NEXT: 
pand %xmm1, %xmm0 +; SSE-NEXT: por %xmm2, %xmm0 +; SSE-NEXT: retq +; +; AVX1-LABEL: vp_sadd_sat_v4i32: +; AVX1: # %bb.0: +; AVX1-NEXT: vpaddd %xmm1, %xmm0, %xmm2 +; AVX1-NEXT: vpcmpgtd %xmm2, %xmm0, %xmm0 +; AVX1-NEXT: vpxor %xmm0, %xmm1, %xmm0 +; AVX1-NEXT: vpsrad $31, %xmm2, %xmm1 +; AVX1-NEXT: vpxor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 +; AVX1-NEXT: vblendvps %xmm0, %xmm1, %xmm2, %xmm0 +; AVX1-NEXT: retq +; +; AVX2-LABEL: vp_sadd_sat_v4i32: +; AVX2: # %bb.0: +; AVX2-NEXT: vpaddd %xmm1, %xmm0, %xmm2 +; AVX2-NEXT: vpcmpgtd %xmm2, %xmm0, %xmm0 +; AVX2-NEXT: vpxor %xmm0, %xmm1, %xmm0 +; AVX2-NEXT: vpsrad $31, %xmm2, %xmm1 +; AVX2-NEXT: vpbroadcastd {{.*#+}} xmm3 = [2147483648,2147483648,2147483648,2147483648] +; AVX2-NEXT: vpxor %xmm3, %xmm1, %xmm1 +; AVX2-NEXT: vblendvps %xmm0, %xmm1, %xmm2, %xmm0 +; AVX2-NEXT: retq +; +; AVX512-LABEL: vp_sadd_sat_v4i32: +; AVX512: # %bb.0: +; AVX512-NEXT: vpxor %xmm2, %xmm2, %xmm2 +; AVX512-NEXT: vpcmpgtd %xmm1, %xmm2, %k0 +; AVX512-NEXT: vpaddd %xmm1, %xmm0, %xmm1 +; AVX512-NEXT: vpcmpgtd %xmm1, %xmm0, %k1 +; AVX512-NEXT: kxorw %k1, %k0, %k1 +; AVX512-NEXT: vpsrad $31, %xmm1, %xmm0 +; AVX512-NEXT: vpxord {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to4}, %xmm0, %xmm1 {%k1} +; AVX512-NEXT: vmovdqa %xmm1, %xmm0 +; AVX512-NEXT: retq + %v = call <4 x i32> @llvm.vp.sadd.sat.v4i32(<4 x i32> %va, <4 x i32> %vb, <4 x i1> %m, i32 %evl) + ret <4 x i32> %v +} +declare <4 x i32> @llvm.vp.sadd.sat.v4i32(<4 x i32>, <4 x i32>, <4 x i1>, i32) + +define <4 x i32> @vp_uadd_sat_v4i32(<4 x i32> %va, <4 x i32> %vb, <4 x i1> %m, i32 zeroext %evl) { +; X86-LABEL: vp_uadd_sat_v4i32: +; X86: # %bb.0: +; X86-NEXT: vpcmpeqd %xmm2, %xmm2, %xmm2 +; X86-NEXT: vpxor %xmm2, %xmm1, %xmm2 +; X86-NEXT: vpminud %xmm2, %xmm0, %xmm0 +; X86-NEXT: vpaddd %xmm1, %xmm0, %xmm0 +; X86-NEXT: retl +; +; SSE-LABEL: vp_uadd_sat_v4i32: +; SSE: # %bb.0: +; SSE-NEXT: movdqa {{.*#+}} xmm2 = [2147483648,2147483648,2147483648,2147483648] +; SSE-NEXT: movdqa %xmm0, %xmm3 +; SSE-NEXT: pxor 
%xmm2, %xmm3 +; SSE-NEXT: paddd %xmm1, %xmm0 +; SSE-NEXT: pxor %xmm0, %xmm2 +; SSE-NEXT: pcmpgtd %xmm2, %xmm3 +; SSE-NEXT: por %xmm3, %xmm0 +; SSE-NEXT: retq +; +; AVX1-LABEL: vp_uadd_sat_v4i32: +; AVX1: # %bb.0: +; AVX1-NEXT: vpcmpeqd %xmm2, %xmm2, %xmm2 +; AVX1-NEXT: vpxor %xmm2, %xmm1, %xmm2 +; AVX1-NEXT: vpminud %xmm2, %xmm0, %xmm0 +; AVX1-NEXT: vpaddd %xmm1, %xmm0, %xmm0 +; AVX1-NEXT: retq +; +; AVX2-LABEL: vp_uadd_sat_v4i32: +; AVX2: # %bb.0: +; AVX2-NEXT: vpcmpeqd %xmm2, %xmm2, %xmm2 +; AVX2-NEXT: vpxor %xmm2, %xmm1, %xmm2 +; AVX2-NEXT: vpminud %xmm2, %xmm0, %xmm0 +; AVX2-NEXT: vpaddd %xmm1, %xmm0, %xmm0 +; AVX2-NEXT: retq +; +; AVX512-LABEL: vp_uadd_sat_v4i32: +; AVX512: # %bb.0: +; AVX512-NEXT: vmovdqa %xmm1, %xmm2 +; AVX512-NEXT: vpternlogq {{.*#+}} xmm2 = ~xmm2 +; AVX512-NEXT: vpminud %xmm2, %xmm0, %xmm0 +; AVX512-NEXT: vpaddd %xmm1, %xmm0, %xmm0 +; AVX512-NEXT: retq + %v = call <4 x i32> @llvm.vp.uadd.sat.v4i32(<4 x i32> %va, <4 x i32> %vb, <4 x i1> %m, i32 %evl) + ret <4 x i32> %v +} +declare <4 x i32> @llvm.vp.uadd.sat.v4i32(<4 x i32>, <4 x i32>, <4 x i1>, i32) + +define <4 x i32> @vp_ssub_sat_v4i32(<4 x i32> %va, <4 x i32> %vb, <4 x i1> %m, i32 zeroext %evl) { +; X86-LABEL: vp_ssub_sat_v4i32: +; X86: # %bb.0: +; X86-NEXT: vpxor %xmm2, %xmm2, %xmm2 +; X86-NEXT: vpcmpgtd %xmm2, %xmm1, %xmm2 +; X86-NEXT: vpsubd %xmm1, %xmm0, %xmm1 +; X86-NEXT: vpcmpgtd %xmm1, %xmm0, %xmm0 +; X86-NEXT: vpxor %xmm0, %xmm2, %xmm0 +; X86-NEXT: vpsrad $31, %xmm1, %xmm2 +; X86-NEXT: vpxor {{\.?LCPI[0-9]+_[0-9]+}}, %xmm2, %xmm2 +; X86-NEXT: vblendvps %xmm0, %xmm2, %xmm1, %xmm0 +; X86-NEXT: retl +; +; SSE-LABEL: vp_ssub_sat_v4i32: +; SSE: # %bb.0: +; SSE-NEXT: pxor %xmm2, %xmm2 +; SSE-NEXT: movdqa %xmm0, %xmm3 +; SSE-NEXT: psubd %xmm1, %xmm3 +; SSE-NEXT: pcmpgtd %xmm2, %xmm1 +; SSE-NEXT: pcmpgtd %xmm3, %xmm0 +; SSE-NEXT: pxor %xmm1, %xmm0 +; SSE-NEXT: movdqa %xmm0, %xmm1 +; SSE-NEXT: pandn %xmm3, %xmm1 +; SSE-NEXT: psrad $31, %xmm3 +; SSE-NEXT: pxor 
{{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm3 +; SSE-NEXT: pand %xmm3, %xmm0 +; SSE-NEXT: por %xmm1, %xmm0 +; SSE-NEXT: retq +; +; AVX1-LABEL: vp_ssub_sat_v4i32: +; AVX1: # %bb.0: +; AVX1-NEXT: vpxor %xmm2, %xmm2, %xmm2 +; AVX1-NEXT: vpcmpgtd %xmm2, %xmm1, %xmm2 +; AVX1-NEXT: vpsubd %xmm1, %xmm0, %xmm1 +; AVX1-NEXT: vpcmpgtd %xmm1, %xmm0, %xmm0 +; AVX1-NEXT: vpxor %xmm0, %xmm2, %xmm0 +; AVX1-NEXT: vpsrad $31, %xmm1, %xmm2 +; AVX1-NEXT: vpxor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2, %xmm2 +; AVX1-NEXT: vblendvps %xmm0, %xmm2, %xmm1, %xmm0 +; AVX1-NEXT: retq +; +; AVX2-LABEL: vp_ssub_sat_v4i32: +; AVX2: # %bb.0: +; AVX2-NEXT: vpxor %xmm2, %xmm2, %xmm2 +; AVX2-NEXT: vpcmpgtd %xmm2, %xmm1, %xmm2 +; AVX2-NEXT: vpsubd %xmm1, %xmm0, %xmm1 +; AVX2-NEXT: vpcmpgtd %xmm1, %xmm0, %xmm0 +; AVX2-NEXT: vpxor %xmm0, %xmm2, %xmm0 +; AVX2-NEXT: vpsrad $31, %xmm1, %xmm2 +; AVX2-NEXT: vpbroadcastd {{.*#+}} xmm3 = [2147483648,2147483648,2147483648,2147483648] +; AVX2-NEXT: vpxor %xmm3, %xmm2, %xmm2 +; AVX2-NEXT: vblendvps %xmm0, %xmm2, %xmm1, %xmm0 +; AVX2-NEXT: retq +; +; AVX512-LABEL: vp_ssub_sat_v4i32: +; AVX512: # %bb.0: +; AVX512-NEXT: vpxor %xmm2, %xmm2, %xmm2 +; AVX512-NEXT: vpcmpgtd %xmm2, %xmm1, %k0 +; AVX512-NEXT: vpsubd %xmm1, %xmm0, %xmm1 +; AVX512-NEXT: vpcmpgtd %xmm1, %xmm0, %k1 +; AVX512-NEXT: kxorw %k1, %k0, %k1 +; AVX512-NEXT: vpsrad $31, %xmm1, %xmm0 +; AVX512-NEXT: vpxord {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to4}, %xmm0, %xmm1 {%k1} +; AVX512-NEXT: vmovdqa %xmm1, %xmm0 +; AVX512-NEXT: retq + %v = call <4 x i32> @llvm.vp.ssub.sat.v4i32(<4 x i32> %va, <4 x i32> %vb, <4 x i1> %m, i32 %evl) + ret <4 x i32> %v +} +declare <4 x i32> @llvm.vp.ssub.sat.v4i32(<4 x i32>, <4 x i32>, <4 x i1>, i32) + +define <4 x i32> @vp_usub_sat_v4i32(<4 x i32> %va, <4 x i32> %vb, <4 x i1> %m, i32 zeroext %evl) { +; X86-LABEL: vp_usub_sat_v4i32: +; X86: # %bb.0: +; X86-NEXT: vpmaxud %xmm1, %xmm0, %xmm0 +; X86-NEXT: vpsubd %xmm1, %xmm0, %xmm0 +; X86-NEXT: retl +; +; SSE-LABEL: vp_usub_sat_v4i32: +; SSE: # 
%bb.0: +; SSE-NEXT: movdqa {{.*#+}} xmm2 = [2147483648,2147483648,2147483648,2147483648] +; SSE-NEXT: movdqa %xmm1, %xmm3 +; SSE-NEXT: pxor %xmm2, %xmm3 +; SSE-NEXT: pxor %xmm0, %xmm2 +; SSE-NEXT: pcmpgtd %xmm3, %xmm2 +; SSE-NEXT: psubd %xmm1, %xmm0 +; SSE-NEXT: pand %xmm2, %xmm0 +; SSE-NEXT: retq +; +; AVX-LABEL: vp_usub_sat_v4i32: +; AVX: # %bb.0: +; AVX-NEXT: vpmaxud %xmm1, %xmm0, %xmm0 +; AVX-NEXT: vpsubd %xmm1, %xmm0, %xmm0 +; AVX-NEXT: retq + %v = call <4 x i32> @llvm.vp.usub.sat.v4i32(<4 x i32> %va, <4 x i32> %vb, <4 x i1> %m, i32 %evl) + ret <4 x i32> %v +} +declare <4 x i32> @llvm.vp.usub.sat.v4i32(<4 x i32>, <4 x i32>, <4 x i1>, i32) + +define <4 x i32> @vp_fshl_v4i32(<4 x i32> %va, <4 x i32> %vb, <4 x i32> %vc, <4 x i1> %m, i32 zeroext %evl) { +; X86-LABEL: vp_fshl_v4i32: +; X86: # %bb.0: +; X86-NEXT: pushl %ebp +; X86-NEXT: .cfi_def_cfa_offset 8 +; X86-NEXT: .cfi_offset %ebp, -8 +; X86-NEXT: movl %esp, %ebp +; X86-NEXT: .cfi_def_cfa_register %ebp +; X86-NEXT: andl $-16, %esp +; X86-NEXT: subl $16, %esp +; X86-NEXT: vbroadcastss {{.*#+}} xmm3 = [31,31,31,31] +; X86-NEXT: vpandn %xmm3, %xmm2, %xmm4 +; X86-NEXT: vpsrldq {{.*#+}} xmm5 = xmm4[12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero +; X86-NEXT: vpsrld $1, %xmm1, %xmm1 +; X86-NEXT: vpsrld %xmm5, %xmm1, %xmm5 +; X86-NEXT: vpsrlq $32, %xmm4, %xmm6 +; X86-NEXT: vpsrld %xmm6, %xmm1, %xmm6 +; X86-NEXT: vpblendw {{.*#+}} xmm5 = xmm6[0,1,2,3],xmm5[4,5,6,7] +; X86-NEXT: vpxor %xmm6, %xmm6, %xmm6 +; X86-NEXT: vpunpckhdq {{.*#+}} xmm6 = xmm4[2],xmm6[2],xmm4[3],xmm6[3] +; X86-NEXT: vpsrld %xmm6, %xmm1, %xmm6 +; X86-NEXT: vpmovzxdq {{.*#+}} xmm4 = xmm4[0],zero,xmm4[1],zero +; X86-NEXT: vpsrld %xmm4, %xmm1, %xmm1 +; X86-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2,3],xmm6[4,5,6,7] +; X86-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1],xmm5[2,3],xmm1[4,5],xmm5[6,7] +; X86-NEXT: vpand %xmm3, %xmm2, %xmm2 +; X86-NEXT: vpslld $23, %xmm2, %xmm2 +; X86-NEXT: vpaddd {{\.?LCPI[0-9]+_[0-9]+}}, %xmm2, %xmm2 
+; X86-NEXT: vcvttps2dq %xmm2, %xmm2 +; X86-NEXT: vpmulld %xmm2, %xmm0, %xmm0 +; X86-NEXT: vpor %xmm1, %xmm0, %xmm0 +; X86-NEXT: movl %ebp, %esp +; X86-NEXT: popl %ebp +; X86-NEXT: .cfi_def_cfa %esp, 4 +; X86-NEXT: retl +; +; SSE-LABEL: vp_fshl_v4i32: +; SSE: # %bb.0: +; SSE-NEXT: movdqa {{.*#+}} xmm4 = [31,31,31,31] +; SSE-NEXT: movdqa %xmm2, %xmm5 +; SSE-NEXT: pandn %xmm4, %xmm5 +; SSE-NEXT: pshuflw {{.*#+}} xmm3 = xmm5[2,3,3,3,4,5,6,7] +; SSE-NEXT: psrld $1, %xmm1 +; SSE-NEXT: movdqa %xmm1, %xmm6 +; SSE-NEXT: psrld %xmm3, %xmm6 +; SSE-NEXT: pshuflw {{.*#+}} xmm7 = xmm5[0,1,1,1,4,5,6,7] +; SSE-NEXT: movdqa %xmm1, %xmm3 +; SSE-NEXT: psrld %xmm7, %xmm3 +; SSE-NEXT: punpcklqdq {{.*#+}} xmm3 = xmm3[0],xmm6[0] +; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm5[2,3,2,3] +; SSE-NEXT: pshuflw {{.*#+}} xmm6 = xmm5[2,3,3,3,4,5,6,7] +; SSE-NEXT: movdqa %xmm1, %xmm7 +; SSE-NEXT: psrld %xmm6, %xmm7 +; SSE-NEXT: pshuflw {{.*#+}} xmm5 = xmm5[0,1,1,1,4,5,6,7] +; SSE-NEXT: psrld %xmm5, %xmm1 +; SSE-NEXT: punpckhqdq {{.*#+}} xmm1 = xmm1[1],xmm7[1] +; SSE-NEXT: shufps {{.*#+}} xmm3 = xmm3[0,3],xmm1[0,3] +; SSE-NEXT: pand %xmm4, %xmm2 +; SSE-NEXT: pslld $23, %xmm2 +; SSE-NEXT: paddd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2 +; SSE-NEXT: cvttps2dq %xmm2, %xmm1 +; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm0[1,1,3,3] +; SSE-NEXT: pmuludq %xmm1, %xmm0 +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3] +; SSE-NEXT: pmuludq %xmm2, %xmm1 +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3] +; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] +; SSE-NEXT: por %xmm3, %xmm0 +; SSE-NEXT: retq +; +; AVX1-LABEL: vp_fshl_v4i32: +; AVX1: # %bb.0: +; AVX1-NEXT: vbroadcastss {{.*#+}} xmm3 = [31,31,31,31] +; AVX1-NEXT: vpandn %xmm3, %xmm2, %xmm4 +; AVX1-NEXT: vpsrldq {{.*#+}} xmm5 = xmm4[12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero +; AVX1-NEXT: vpsrld $1, %xmm1, %xmm1 +; AVX1-NEXT: vpsrld %xmm5, %xmm1, %xmm5 +; AVX1-NEXT: vpsrlq 
$32, %xmm4, %xmm6 +; AVX1-NEXT: vpsrld %xmm6, %xmm1, %xmm6 +; AVX1-NEXT: vpblendw {{.*#+}} xmm5 = xmm6[0,1,2,3],xmm5[4,5,6,7] +; AVX1-NEXT: vpxor %xmm6, %xmm6, %xmm6 +; AVX1-NEXT: vpunpckhdq {{.*#+}} xmm6 = xmm4[2],xmm6[2],xmm4[3],xmm6[3] +; AVX1-NEXT: vpsrld %xmm6, %xmm1, %xmm6 +; AVX1-NEXT: vpmovzxdq {{.*#+}} xmm4 = xmm4[0],zero,xmm4[1],zero +; AVX1-NEXT: vpsrld %xmm4, %xmm1, %xmm1 +; AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2,3],xmm6[4,5,6,7] +; AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1],xmm5[2,3],xmm1[4,5],xmm5[6,7] +; AVX1-NEXT: vpand %xmm3, %xmm2, %xmm2 +; AVX1-NEXT: vpslld $23, %xmm2, %xmm2 +; AVX1-NEXT: vpaddd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2, %xmm2 +; AVX1-NEXT: vcvttps2dq %xmm2, %xmm2 +; AVX1-NEXT: vpmulld %xmm2, %xmm0, %xmm0 +; AVX1-NEXT: vpor %xmm1, %xmm0, %xmm0 +; AVX1-NEXT: retq +; +; AVX2-LABEL: vp_fshl_v4i32: +; AVX2: # %bb.0: +; AVX2-NEXT: vpbroadcastd {{.*#+}} xmm3 = [31,31,31,31] +; AVX2-NEXT: vpandn %xmm3, %xmm2, %xmm4 +; AVX2-NEXT: vpsrld $1, %xmm1, %xmm1 +; AVX2-NEXT: vpsrlvd %xmm4, %xmm1, %xmm1 +; AVX2-NEXT: vpand %xmm3, %xmm2, %xmm2 +; AVX2-NEXT: vpsllvd %xmm2, %xmm0, %xmm0 +; AVX2-NEXT: vpor %xmm1, %xmm0, %xmm0 +; AVX2-NEXT: retq +; +; AVX512-LABEL: vp_fshl_v4i32: +; AVX512: # %bb.0: +; AVX512-NEXT: vpbroadcastd {{.*#+}} xmm3 = [31,31,31,31] +; AVX512-NEXT: vpandn %xmm3, %xmm2, %xmm4 +; AVX512-NEXT: vpsrld $1, %xmm1, %xmm1 +; AVX512-NEXT: vpsrlvd %xmm4, %xmm1, %xmm1 +; AVX512-NEXT: vpand %xmm3, %xmm2, %xmm2 +; AVX512-NEXT: vpsllvd %xmm2, %xmm0, %xmm0 +; AVX512-NEXT: vpor %xmm1, %xmm0, %xmm0 +; AVX512-NEXT: retq + %v = call <4 x i32> @llvm.vp.fshl.v4i32(<4 x i32> %va, <4 x i32> %vb, <4 x i32> %vc, <4 x i1> %m, i32 %evl) + ret <4 x i32> %v +} +declare <4 x i32> @llvm.vp.fshl.v4i32(<4 x i32>, <4 x i32>, <4 x i32>, <4 x i1>, i32) + +define <4 x i32> @vp_fshr_v4i32(<4 x i32> %va, <4 x i32> %vb, <4 x i32> %vc, <4 x i1> %m, i32 zeroext %evl) { +; X86-LABEL: vp_fshr_v4i32: +; X86: # %bb.0: +; X86-NEXT: pushl %ebp +; X86-NEXT: 
.cfi_def_cfa_offset 8 +; X86-NEXT: .cfi_offset %ebp, -8 +; X86-NEXT: movl %esp, %ebp +; X86-NEXT: .cfi_def_cfa_register %ebp +; X86-NEXT: andl $-16, %esp +; X86-NEXT: subl $16, %esp +; X86-NEXT: vbroadcastss {{.*#+}} xmm3 = [31,31,31,31] +; X86-NEXT: vpand %xmm3, %xmm2, %xmm4 +; X86-NEXT: vpsrldq {{.*#+}} xmm5 = xmm4[12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero +; X86-NEXT: vpsrld %xmm5, %xmm1, %xmm5 +; X86-NEXT: vpsrlq $32, %xmm4, %xmm6 +; X86-NEXT: vpsrld %xmm6, %xmm1, %xmm6 +; X86-NEXT: vpblendw {{.*#+}} xmm5 = xmm6[0,1,2,3],xmm5[4,5,6,7] +; X86-NEXT: vpxor %xmm6, %xmm6, %xmm6 +; X86-NEXT: vpunpckhdq {{.*#+}} xmm6 = xmm4[2],xmm6[2],xmm4[3],xmm6[3] +; X86-NEXT: vpsrld %xmm6, %xmm1, %xmm6 +; X86-NEXT: vpmovzxdq {{.*#+}} xmm4 = xmm4[0],zero,xmm4[1],zero +; X86-NEXT: vpsrld %xmm4, %xmm1, %xmm1 +; X86-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2,3],xmm6[4,5,6,7] +; X86-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1],xmm5[2,3],xmm1[4,5],xmm5[6,7] +; X86-NEXT: vpandn %xmm3, %xmm2, %xmm2 +; X86-NEXT: vpslld $23, %xmm2, %xmm2 +; X86-NEXT: vpaddd {{\.?LCPI[0-9]+_[0-9]+}}, %xmm2, %xmm2 +; X86-NEXT: vcvttps2dq %xmm2, %xmm2 +; X86-NEXT: vpaddd %xmm0, %xmm0, %xmm0 +; X86-NEXT: vpmulld %xmm2, %xmm0, %xmm0 +; X86-NEXT: vpor %xmm1, %xmm0, %xmm0 +; X86-NEXT: movl %ebp, %esp +; X86-NEXT: popl %ebp +; X86-NEXT: .cfi_def_cfa %esp, 4 +; X86-NEXT: retl +; +; SSE-LABEL: vp_fshr_v4i32: +; SSE: # %bb.0: +; SSE-NEXT: movdqa {{.*#+}} xmm4 = [31,31,31,31] +; SSE-NEXT: movdqa %xmm2, %xmm5 +; SSE-NEXT: pand %xmm4, %xmm5 +; SSE-NEXT: pshuflw {{.*#+}} xmm3 = xmm5[2,3,3,3,4,5,6,7] +; SSE-NEXT: movdqa %xmm1, %xmm6 +; SSE-NEXT: psrld %xmm3, %xmm6 +; SSE-NEXT: pshuflw {{.*#+}} xmm7 = xmm5[0,1,1,1,4,5,6,7] +; SSE-NEXT: movdqa %xmm1, %xmm3 +; SSE-NEXT: psrld %xmm7, %xmm3 +; SSE-NEXT: punpcklqdq {{.*#+}} xmm3 = xmm3[0],xmm6[0] +; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm5[2,3,2,3] +; SSE-NEXT: pshuflw {{.*#+}} xmm6 = xmm5[2,3,3,3,4,5,6,7] +; SSE-NEXT: movdqa %xmm1, %xmm7 +; SSE-NEXT: 
psrld %xmm6, %xmm7 +; SSE-NEXT: pshuflw {{.*#+}} xmm5 = xmm5[0,1,1,1,4,5,6,7] +; SSE-NEXT: psrld %xmm5, %xmm1 +; SSE-NEXT: punpckhqdq {{.*#+}} xmm1 = xmm1[1],xmm7[1] +; SSE-NEXT: shufps {{.*#+}} xmm3 = xmm3[0,3],xmm1[0,3] +; SSE-NEXT: pandn %xmm4, %xmm2 +; SSE-NEXT: pslld $23, %xmm2 +; SSE-NEXT: paddd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2 +; SSE-NEXT: cvttps2dq %xmm2, %xmm1 +; SSE-NEXT: paddd %xmm0, %xmm0 +; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm0[1,1,3,3] +; SSE-NEXT: pmuludq %xmm1, %xmm0 +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3] +; SSE-NEXT: pmuludq %xmm2, %xmm1 +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3] +; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] +; SSE-NEXT: por %xmm3, %xmm0 +; SSE-NEXT: retq +; +; AVX1-LABEL: vp_fshr_v4i32: +; AVX1: # %bb.0: +; AVX1-NEXT: vbroadcastss {{.*#+}} xmm3 = [31,31,31,31] +; AVX1-NEXT: vpand %xmm3, %xmm2, %xmm4 +; AVX1-NEXT: vpsrldq {{.*#+}} xmm5 = xmm4[12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero +; AVX1-NEXT: vpsrld %xmm5, %xmm1, %xmm5 +; AVX1-NEXT: vpsrlq $32, %xmm4, %xmm6 +; AVX1-NEXT: vpsrld %xmm6, %xmm1, %xmm6 +; AVX1-NEXT: vpblendw {{.*#+}} xmm5 = xmm6[0,1,2,3],xmm5[4,5,6,7] +; AVX1-NEXT: vpxor %xmm6, %xmm6, %xmm6 +; AVX1-NEXT: vpunpckhdq {{.*#+}} xmm6 = xmm4[2],xmm6[2],xmm4[3],xmm6[3] +; AVX1-NEXT: vpsrld %xmm6, %xmm1, %xmm6 +; AVX1-NEXT: vpmovzxdq {{.*#+}} xmm4 = xmm4[0],zero,xmm4[1],zero +; AVX1-NEXT: vpsrld %xmm4, %xmm1, %xmm1 +; AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2,3],xmm6[4,5,6,7] +; AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1],xmm5[2,3],xmm1[4,5],xmm5[6,7] +; AVX1-NEXT: vpandn %xmm3, %xmm2, %xmm2 +; AVX1-NEXT: vpslld $23, %xmm2, %xmm2 +; AVX1-NEXT: vpaddd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2, %xmm2 +; AVX1-NEXT: vcvttps2dq %xmm2, %xmm2 +; AVX1-NEXT: vpaddd %xmm0, %xmm0, %xmm0 +; AVX1-NEXT: vpmulld %xmm2, %xmm0, %xmm0 +; AVX1-NEXT: vpor %xmm1, %xmm0, %xmm0 +; AVX1-NEXT: retq +; +; AVX2-LABEL: 
vp_fshr_v4i32: +; AVX2: # %bb.0: +; AVX2-NEXT: vpbroadcastd {{.*#+}} xmm3 = [31,31,31,31] +; AVX2-NEXT: vpand %xmm3, %xmm2, %xmm4 +; AVX2-NEXT: vpsrlvd %xmm4, %xmm1, %xmm1 +; AVX2-NEXT: vpandn %xmm3, %xmm2, %xmm2 +; AVX2-NEXT: vpaddd %xmm0, %xmm0, %xmm0 +; AVX2-NEXT: vpsllvd %xmm2, %xmm0, %xmm0 +; AVX2-NEXT: vpor %xmm1, %xmm0, %xmm0 +; AVX2-NEXT: retq +; +; AVX512-LABEL: vp_fshr_v4i32: +; AVX512: # %bb.0: +; AVX512-NEXT: vpbroadcastd {{.*#+}} xmm3 = [31,31,31,31] +; AVX512-NEXT: vpand %xmm3, %xmm2, %xmm4 +; AVX512-NEXT: vpsrlvd %xmm4, %xmm1, %xmm1 +; AVX512-NEXT: vpandn %xmm3, %xmm2, %xmm2 +; AVX512-NEXT: vpaddd %xmm0, %xmm0, %xmm0 +; AVX512-NEXT: vpsllvd %xmm2, %xmm0, %xmm0 +; AVX512-NEXT: vpor %xmm1, %xmm0, %xmm0 +; AVX512-NEXT: retq + %v = call <4 x i32> @llvm.vp.fshr.v4i32(<4 x i32> %va, <4 x i32> %vb, <4 x i32> %vc, <4 x i1> %m, i32 %evl) + ret <4 x i32> %v +} +declare <4 x i32> @llvm.vp.fshr.v4i32(<4 x i32>, <4 x i32>, <4 x i32>, <4 x i1>, i32) diff --git a/llvm/test/CodeGen/X86/fp-roundeven.ll b/llvm/test/CodeGen/X86/fp-roundeven.ll index 8037c783dd8e6..7d1c52cd65451 100644 --- a/llvm/test/CodeGen/X86/fp-roundeven.ll +++ b/llvm/test/CodeGen/X86/fp-roundeven.ll @@ -50,8 +50,6 @@ define half @roundeven_f16(half %h) { ; ; AVX512F-LABEL: roundeven_f16: ; AVX512F: ## %bb.0: ## %entry -; AVX512F-NEXT: vpextrw $0, %xmm0, %eax -; AVX512F-NEXT: vmovd %eax, %xmm0 ; AVX512F-NEXT: vcvtph2ps %xmm0, %xmm0 ; AVX512F-NEXT: vroundss $8, %xmm0, %xmm0, %xmm0 ; AVX512F-NEXT: vcvtps2ph $4, %xmm0, %xmm0 diff --git a/llvm/test/CodeGen/X86/fp-strict-scalar-fp16.ll b/llvm/test/CodeGen/X86/fp-strict-scalar-fp16.ll index 3ecddd5279814..bf93c8a1f5b51 100644 --- a/llvm/test/CodeGen/X86/fp-strict-scalar-fp16.ll +++ b/llvm/test/CodeGen/X86/fp-strict-scalar-fp16.ll @@ -52,7 +52,7 @@ define half @fadd_f16(half %a, half %b) nounwind strictfp { ; ; X86-LABEL: fadd_f16: ; X86: # %bb.0: -; X86-NEXT: vmovsh {{[0-9]+}}(%esp), %xmm0 +; X86-NEXT: vmovsh {{.*#+}} xmm0 = 
mem[0],zero,zero,zero,zero,zero,zero,zero ; X86-NEXT: vaddsh {{[0-9]+}}(%esp), %xmm0, %xmm0 ; X86-NEXT: retl ; @@ -102,7 +102,7 @@ define half @fsub_f16(half %a, half %b) nounwind strictfp { ; ; X86-LABEL: fsub_f16: ; X86: # %bb.0: -; X86-NEXT: vmovsh {{[0-9]+}}(%esp), %xmm0 +; X86-NEXT: vmovsh {{.*#+}} xmm0 = mem[0],zero,zero,zero,zero,zero,zero,zero ; X86-NEXT: vsubsh {{[0-9]+}}(%esp), %xmm0, %xmm0 ; X86-NEXT: retl ; @@ -152,7 +152,7 @@ define half @fmul_f16(half %a, half %b) nounwind strictfp { ; ; X86-LABEL: fmul_f16: ; X86: # %bb.0: -; X86-NEXT: vmovsh {{[0-9]+}}(%esp), %xmm0 +; X86-NEXT: vmovsh {{.*#+}} xmm0 = mem[0],zero,zero,zero,zero,zero,zero,zero ; X86-NEXT: vmulsh {{[0-9]+}}(%esp), %xmm0, %xmm0 ; X86-NEXT: retl ; @@ -202,7 +202,7 @@ define half @fdiv_f16(half %a, half %b) nounwind strictfp { ; ; X86-LABEL: fdiv_f16: ; X86: # %bb.0: -; X86-NEXT: vmovsh {{[0-9]+}}(%esp), %xmm0 +; X86-NEXT: vmovsh {{.*#+}} xmm0 = mem[0],zero,zero,zero,zero,zero,zero,zero ; X86-NEXT: vdivsh {{[0-9]+}}(%esp), %xmm0, %xmm0 ; X86-NEXT: retl ; @@ -239,14 +239,14 @@ define void @fpext_f16_to_f32(ptr %val, ptr %ret) nounwind strictfp { ; X86: # %bb.0: ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax ; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx -; X86-NEXT: vmovsh (%ecx), %xmm0 +; X86-NEXT: vmovsh {{.*#+}} xmm0 = mem[0],zero,zero,zero,zero,zero,zero,zero ; X86-NEXT: vcvtsh2ss %xmm0, %xmm0, %xmm0 ; X86-NEXT: vmovss %xmm0, (%eax) ; X86-NEXT: retl ; ; X64-LABEL: fpext_f16_to_f32: ; X64: # %bb.0: -; X64-NEXT: vmovsh (%rdi), %xmm0 +; X64-NEXT: vmovsh {{.*#+}} xmm0 = mem[0],zero,zero,zero,zero,zero,zero,zero ; X64-NEXT: vcvtsh2ss %xmm0, %xmm0, %xmm0 ; X64-NEXT: vmovss %xmm0, (%rsi) ; X64-NEXT: retq @@ -282,14 +282,14 @@ define void @fpext_f16_to_f64(ptr %val, ptr %ret) nounwind strictfp { ; X86: # %bb.0: ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax ; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx -; X86-NEXT: vmovsh (%ecx), %xmm0 +; X86-NEXT: vmovsh {{.*#+}} xmm0 = mem[0],zero,zero,zero,zero,zero,zero,zero ; 
X86-NEXT: vcvtsh2sd %xmm0, %xmm0, %xmm0 ; X86-NEXT: vmovsd %xmm0, (%eax) ; X86-NEXT: retl ; ; X64-LABEL: fpext_f16_to_f64: ; X64: # %bb.0: -; X64-NEXT: vmovsh (%rdi), %xmm0 +; X64-NEXT: vmovsh {{.*#+}} xmm0 = mem[0],zero,zero,zero,zero,zero,zero,zero ; X64-NEXT: vcvtsh2sd %xmm0, %xmm0, %xmm0 ; X64-NEXT: vmovsd %xmm0, (%rsi) ; X64-NEXT: retq @@ -418,14 +418,14 @@ define void @fsqrt_f16(ptr %a) nounwind strictfp { ; X86-LABEL: fsqrt_f16: ; X86: # %bb.0: ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-NEXT: vmovsh (%eax), %xmm0 +; X86-NEXT: vmovsh {{.*#+}} xmm0 = mem[0],zero,zero,zero,zero,zero,zero,zero ; X86-NEXT: vsqrtsh %xmm0, %xmm0, %xmm0 ; X86-NEXT: vmovsh %xmm0, (%eax) ; X86-NEXT: retl ; ; X64-LABEL: fsqrt_f16: ; X64: # %bb.0: -; X64-NEXT: vmovsh (%rdi), %xmm0 +; X64-NEXT: vmovsh {{.*#+}} xmm0 = mem[0],zero,zero,zero,zero,zero,zero,zero ; X64-NEXT: vsqrtsh %xmm0, %xmm0, %xmm0 ; X64-NEXT: vmovsh %xmm0, (%rdi) ; X64-NEXT: retq @@ -510,8 +510,8 @@ define half @fma_f16(half %a, half %b, half %c) nounwind strictfp { ; ; X86-LABEL: fma_f16: ; X86: # %bb.0: -; X86-NEXT: vmovsh {{[0-9]+}}(%esp), %xmm1 -; X86-NEXT: vmovsh {{[0-9]+}}(%esp), %xmm0 +; X86-NEXT: vmovsh {{.*#+}} xmm1 = mem[0],zero,zero,zero,zero,zero,zero,zero +; X86-NEXT: vmovsh {{.*#+}} xmm0 = mem[0],zero,zero,zero,zero,zero,zero,zero ; X86-NEXT: vfmadd213sh {{[0-9]+}}(%esp), %xmm1, %xmm0 ; X86-NEXT: retl ; diff --git a/llvm/test/CodeGen/X86/fp-strict-scalar-round-fp16.ll b/llvm/test/CodeGen/X86/fp-strict-scalar-round-fp16.ll index 3b9798a2af582..6fe5dcd292930 100644 --- a/llvm/test/CodeGen/X86/fp-strict-scalar-round-fp16.ll +++ b/llvm/test/CodeGen/X86/fp-strict-scalar-round-fp16.ll @@ -273,7 +273,7 @@ define half @fround16(half %f) #0 { ; X86-LABEL: fround16: ; X86: # %bb.0: ; X86-NEXT: subl $8, %esp -; X86-NEXT: vmovsh {{[0-9]+}}(%esp), %xmm0 +; X86-NEXT: vmovsh {{.*#+}} xmm0 = mem[0],zero,zero,zero,zero,zero,zero,zero ; X86-NEXT: vcvtsh2ss %xmm0, %xmm0, %xmm0 ; X86-NEXT: vmovss %xmm0, (%esp) ; X86-NEXT: 
calll roundf diff --git a/llvm/test/CodeGen/X86/fp16-libcalls.ll b/llvm/test/CodeGen/X86/fp16-libcalls.ll index 1515cd1366bc6..0d8290b120fa4 100644 --- a/llvm/test/CodeGen/X86/fp16-libcalls.ll +++ b/llvm/test/CodeGen/X86/fp16-libcalls.ll @@ -9,8 +9,6 @@ define void @test_half_ceil(half %a0, ptr %p0) nounwind { ; F16C-LABEL: test_half_ceil: ; F16C: # %bb.0: -; F16C-NEXT: vpextrw $0, %xmm0, %eax -; F16C-NEXT: vmovd %eax, %xmm0 ; F16C-NEXT: vcvtph2ps %xmm0, %xmm0 ; F16C-NEXT: vroundss $10, %xmm0, %xmm0, %xmm0 ; F16C-NEXT: vcvtps2ph $4, %xmm0, %xmm0 @@ -107,8 +105,6 @@ define void @test_half_cos(half %a0, ptr %p0) nounwind { ; F16C: # %bb.0: ; F16C-NEXT: pushq %rbx ; F16C-NEXT: movq %rdi, %rbx -; F16C-NEXT: vpextrw $0, %xmm0, %eax -; F16C-NEXT: vmovd %eax, %xmm0 ; F16C-NEXT: vcvtph2ps %xmm0, %xmm0 ; F16C-NEXT: callq cosf@PLT ; F16C-NEXT: vcvtps2ph $4, %xmm0, %xmm0 @@ -168,8 +164,6 @@ define void @test_half_exp(half %a0, ptr %p0) nounwind { ; F16C: # %bb.0: ; F16C-NEXT: pushq %rbx ; F16C-NEXT: movq %rdi, %rbx -; F16C-NEXT: vpextrw $0, %xmm0, %eax -; F16C-NEXT: vmovd %eax, %xmm0 ; F16C-NEXT: vcvtph2ps %xmm0, %xmm0 ; F16C-NEXT: callq expf@PLT ; F16C-NEXT: vcvtps2ph $4, %xmm0, %xmm0 @@ -229,8 +223,6 @@ define void @test_half_exp2(half %a0, ptr %p0) nounwind { ; F16C: # %bb.0: ; F16C-NEXT: pushq %rbx ; F16C-NEXT: movq %rdi, %rbx -; F16C-NEXT: vpextrw $0, %xmm0, %eax -; F16C-NEXT: vmovd %eax, %xmm0 ; F16C-NEXT: vcvtph2ps %xmm0, %xmm0 ; F16C-NEXT: callq exp2f@PLT ; F16C-NEXT: vcvtps2ph $4, %xmm0, %xmm0 @@ -290,8 +282,6 @@ define void @test_half_exp10(half %a0, ptr %p0) nounwind { ; F16C: # %bb.0: ; F16C-NEXT: pushq %rbx ; F16C-NEXT: movq %rdi, %rbx -; F16C-NEXT: vpextrw $0, %xmm0, %eax -; F16C-NEXT: vmovd %eax, %xmm0 ; F16C-NEXT: vcvtph2ps %xmm0, %xmm0 ; F16C-NEXT: callq exp10f@PLT ; F16C-NEXT: vcvtps2ph $4, %xmm0, %xmm0 @@ -349,8 +339,6 @@ define void @test_half_exp10(half %a0, ptr %p0) nounwind { define void @test_half_fabs(half %a0, ptr %p0) nounwind { ; F16C-LABEL: 
test_half_fabs: ; F16C: # %bb.0: -; F16C-NEXT: vpextrw $0, %xmm0, %eax -; F16C-NEXT: vmovd %eax, %xmm0 ; F16C-NEXT: vcvtph2ps %xmm0, %xmm0 ; F16C-NEXT: vandps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 ; F16C-NEXT: vcvtps2ph $4, %xmm0, %xmm0 @@ -392,8 +380,6 @@ define void @test_half_fabs(half %a0, ptr %p0) nounwind { define void @test_half_floor(half %a0, ptr %p0) nounwind { ; F16C-LABEL: test_half_floor: ; F16C: # %bb.0: -; F16C-NEXT: vpextrw $0, %xmm0, %eax -; F16C-NEXT: vmovd %eax, %xmm0 ; F16C-NEXT: vcvtph2ps %xmm0, %xmm0 ; F16C-NEXT: vroundss $9, %xmm0, %xmm0, %xmm0 ; F16C-NEXT: vcvtps2ph $4, %xmm0, %xmm0 @@ -447,14 +433,8 @@ define void @test_half_fma(half %a0, half %a1, half %a2, ptr %p0) nounwind { ; F16C: # %bb.0: ; F16C-NEXT: pushq %rbx ; F16C-NEXT: movq %rdi, %rbx -; F16C-NEXT: vpextrw $0, %xmm2, %eax -; F16C-NEXT: vpextrw $0, %xmm1, %ecx -; F16C-NEXT: vpextrw $0, %xmm0, %edx -; F16C-NEXT: vmovd %edx, %xmm0 ; F16C-NEXT: vcvtph2ps %xmm0, %xmm0 -; F16C-NEXT: vmovd %ecx, %xmm1 ; F16C-NEXT: vcvtph2ps %xmm1, %xmm1 -; F16C-NEXT: vmovd %eax, %xmm2 ; F16C-NEXT: vcvtph2ps %xmm2, %xmm2 ; F16C-NEXT: callq fmaf@PLT ; F16C-NEXT: vcvtps2ph $4, %xmm0, %xmm0 @@ -542,8 +522,6 @@ define void @test_half_fma(half %a0, half %a1, half %a2, ptr %p0) nounwind { define void @test_half_fneg(half %a0, ptr %p0) nounwind { ; F16C-LABEL: test_half_fneg: ; F16C: # %bb.0: -; F16C-NEXT: vpextrw $0, %xmm0, %eax -; F16C-NEXT: vmovd %eax, %xmm0 ; F16C-NEXT: vcvtph2ps %xmm0, %xmm0 ; F16C-NEXT: vxorps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 ; F16C-NEXT: vcvtps2ph $4, %xmm0, %xmm0 @@ -587,8 +565,6 @@ define void @test_half_log(half %a0, ptr %p0) nounwind { ; F16C: # %bb.0: ; F16C-NEXT: pushq %rbx ; F16C-NEXT: movq %rdi, %rbx -; F16C-NEXT: vpextrw $0, %xmm0, %eax -; F16C-NEXT: vmovd %eax, %xmm0 ; F16C-NEXT: vcvtph2ps %xmm0, %xmm0 ; F16C-NEXT: callq logf@PLT ; F16C-NEXT: vcvtps2ph $4, %xmm0, %xmm0 @@ -648,8 +624,6 @@ define void @test_half_log2(half %a0, ptr %p0) nounwind { ; F16C: # 
%bb.0: ; F16C-NEXT: pushq %rbx ; F16C-NEXT: movq %rdi, %rbx -; F16C-NEXT: vpextrw $0, %xmm0, %eax -; F16C-NEXT: vmovd %eax, %xmm0 ; F16C-NEXT: vcvtph2ps %xmm0, %xmm0 ; F16C-NEXT: callq log2f@PLT ; F16C-NEXT: vcvtps2ph $4, %xmm0, %xmm0 @@ -709,8 +683,6 @@ define void @test_half_log10(half %a0, ptr %p0) nounwind { ; F16C: # %bb.0: ; F16C-NEXT: pushq %rbx ; F16C-NEXT: movq %rdi, %rbx -; F16C-NEXT: vpextrw $0, %xmm0, %eax -; F16C-NEXT: vmovd %eax, %xmm0 ; F16C-NEXT: vcvtph2ps %xmm0, %xmm0 ; F16C-NEXT: callq log10f@PLT ; F16C-NEXT: vcvtps2ph $4, %xmm0, %xmm0 @@ -768,8 +740,6 @@ define void @test_half_log10(half %a0, ptr %p0) nounwind { define void @test_half_nearbyint(half %a0, ptr %p0) nounwind { ; F16C-LABEL: test_half_nearbyint: ; F16C: # %bb.0: -; F16C-NEXT: vpextrw $0, %xmm0, %eax -; F16C-NEXT: vmovd %eax, %xmm0 ; F16C-NEXT: vcvtph2ps %xmm0, %xmm0 ; F16C-NEXT: vroundss $12, %xmm0, %xmm0, %xmm0 ; F16C-NEXT: vcvtps2ph $4, %xmm0, %xmm0 @@ -823,11 +793,7 @@ define void @test_half_pow(half %a0, half %a1, ptr %p0) nounwind { ; F16C: # %bb.0: ; F16C-NEXT: pushq %rbx ; F16C-NEXT: movq %rdi, %rbx -; F16C-NEXT: vpextrw $0, %xmm1, %eax -; F16C-NEXT: vpextrw $0, %xmm0, %ecx -; F16C-NEXT: vmovd %ecx, %xmm0 ; F16C-NEXT: vcvtph2ps %xmm0, %xmm0 -; F16C-NEXT: vmovd %eax, %xmm1 ; F16C-NEXT: vcvtph2ps %xmm1, %xmm1 ; F16C-NEXT: callq powf@PLT ; F16C-NEXT: vcvtps2ph $4, %xmm0, %xmm0 @@ -907,8 +873,6 @@ define void @test_half_powi(half %a0, i32 %a1, ptr %p0) nounwind { ; F16C: # %bb.0: ; F16C-NEXT: pushq %rbx ; F16C-NEXT: movq %rsi, %rbx -; F16C-NEXT: vpextrw $0, %xmm0, %eax -; F16C-NEXT: vmovd %eax, %xmm0 ; F16C-NEXT: vcvtph2ps %xmm0, %xmm0 ; F16C-NEXT: callq __powisf2@PLT ; F16C-NEXT: vcvtps2ph $4, %xmm0, %xmm0 @@ -976,8 +940,6 @@ define void @test_half_powi(half %a0, i32 %a1, ptr %p0) nounwind { define void @test_half_rint(half %a0, ptr %p0) nounwind { ; F16C-LABEL: test_half_rint: ; F16C: # %bb.0: -; F16C-NEXT: vpextrw $0, %xmm0, %eax -; F16C-NEXT: vmovd %eax, %xmm0 ; F16C-NEXT: 
vcvtph2ps %xmm0, %xmm0 ; F16C-NEXT: vroundss $4, %xmm0, %xmm0, %xmm0 ; F16C-NEXT: vcvtps2ph $4, %xmm0, %xmm0 @@ -1031,8 +993,6 @@ define void @test_half_sin(half %a0, ptr %p0) nounwind { ; F16C: # %bb.0: ; F16C-NEXT: pushq %rbx ; F16C-NEXT: movq %rdi, %rbx -; F16C-NEXT: vpextrw $0, %xmm0, %eax -; F16C-NEXT: vmovd %eax, %xmm0 ; F16C-NEXT: vcvtph2ps %xmm0, %xmm0 ; F16C-NEXT: callq sinf@PLT ; F16C-NEXT: vcvtps2ph $4, %xmm0, %xmm0 @@ -1090,8 +1050,6 @@ define void @test_half_sin(half %a0, ptr %p0) nounwind { define void @test_half_sqrt(half %a0, ptr %p0) nounwind { ; F16C-LABEL: test_half_sqrt: ; F16C: # %bb.0: -; F16C-NEXT: vpextrw $0, %xmm0, %eax -; F16C-NEXT: vmovd %eax, %xmm0 ; F16C-NEXT: vcvtph2ps %xmm0, %xmm0 ; F16C-NEXT: vsqrtss %xmm0, %xmm0, %xmm0 ; F16C-NEXT: vcvtps2ph $4, %xmm0, %xmm0 @@ -1146,8 +1104,6 @@ define void @test_half_tan(half %a0, ptr %p0) nounwind { ; F16C: # %bb.0: ; F16C-NEXT: pushq %rbx ; F16C-NEXT: movq %rdi, %rbx -; F16C-NEXT: vpextrw $0, %xmm0, %eax -; F16C-NEXT: vmovd %eax, %xmm0 ; F16C-NEXT: vcvtph2ps %xmm0, %xmm0 ; F16C-NEXT: callq tanf@PLT ; F16C-NEXT: vcvtps2ph $4, %xmm0, %xmm0 @@ -1205,8 +1161,6 @@ define void @test_half_tan(half %a0, ptr %p0) nounwind { define void @test_half_trunc(half %a0, ptr %p0) nounwind { ; F16C-LABEL: test_half_trunc: ; F16C: # %bb.0: -; F16C-NEXT: vpextrw $0, %xmm0, %eax -; F16C-NEXT: vmovd %eax, %xmm0 ; F16C-NEXT: vcvtph2ps %xmm0, %xmm0 ; F16C-NEXT: vroundss $11, %xmm0, %xmm0, %xmm0 ; F16C-NEXT: vcvtps2ph $4, %xmm0, %xmm0 diff --git a/llvm/test/CodeGen/X86/half-darwin.ll b/llvm/test/CodeGen/X86/half-darwin.ll index 1d2f4eb39bbe6..7388429143df5 100644 --- a/llvm/test/CodeGen/X86/half-darwin.ll +++ b/llvm/test/CodeGen/X86/half-darwin.ll @@ -76,14 +76,13 @@ define float @extendhfsf(ptr %ptr) nounwind { ; ; CHECK-F16C-LABEL: extendhfsf: ; CHECK-F16C: ## %bb.0: -; CHECK-F16C-NEXT: movzwl (%rdi), %eax -; CHECK-F16C-NEXT: vmovd %eax, %xmm0 +; CHECK-F16C-NEXT: vpinsrw $0, (%rdi), %xmm0, %xmm0 ; CHECK-F16C-NEXT: 
vcvtph2ps %xmm0, %xmm0 ; CHECK-F16C-NEXT: retq ; ; CHECK-FP16-LABEL: extendhfsf: ; CHECK-FP16: ## %bb.0: -; CHECK-FP16-NEXT: vmovsh (%rdi), %xmm0 +; CHECK-FP16-NEXT: vmovsh {{.*#+}} xmm0 = mem[0],zero,zero,zero,zero,zero,zero,zero ; CHECK-FP16-NEXT: vcvtsh2ss %xmm0, %xmm0, %xmm0 ; CHECK-FP16-NEXT: retq @@ -175,7 +174,7 @@ define float @strict_extendhfsf(ptr %ptr) nounwind strictfp { ; ; CHECK-FP16-LABEL: strict_extendhfsf: ; CHECK-FP16: ## %bb.0: -; CHECK-FP16-NEXT: vmovsh (%rdi), %xmm0 +; CHECK-FP16-NEXT: vmovsh {{.*#+}} xmm0 = mem[0],zero,zero,zero,zero,zero,zero,zero ; CHECK-FP16-NEXT: vcvtsh2ss %xmm0, %xmm0, %xmm0 ; CHECK-FP16-NEXT: retq diff --git a/llvm/test/CodeGen/X86/half-fp80-darwin.ll b/llvm/test/CodeGen/X86/half-fp80-darwin.ll index 0ba734e66c7b2..65a26187c5857 100644 --- a/llvm/test/CodeGen/X86/half-fp80-darwin.ll +++ b/llvm/test/CodeGen/X86/half-fp80-darwin.ll @@ -19,8 +19,7 @@ define void @extendhfxf(ptr %outptr, ptr %inptr) nounwind { ; ; CHECK-F16C-LABEL: extendhfxf: ; CHECK-F16C: ## %bb.0: -; CHECK-F16C-NEXT: movzwl (%rsi), %eax -; CHECK-F16C-NEXT: vmovd %eax, %xmm0 +; CHECK-F16C-NEXT: vpinsrw $0, (%rsi), %xmm0, %xmm0 ; CHECK-F16C-NEXT: vcvtph2ps %xmm0, %xmm0 ; CHECK-F16C-NEXT: vmovss %xmm0, -{{[0-9]+}}(%rsp) ; CHECK-F16C-NEXT: flds -{{[0-9]+}}(%rsp) diff --git a/llvm/test/CodeGen/X86/half.ll b/llvm/test/CodeGen/X86/half.ll index 033cadae6a1e7..7bac075e48680 100644 --- a/llvm/test/CodeGen/X86/half.ll +++ b/llvm/test/CodeGen/X86/half.ll @@ -81,8 +81,7 @@ define float @test_extend32(ptr %addr) #0 { ; ; BWON-F16C-LABEL: test_extend32: ; BWON-F16C: # %bb.0: -; BWON-F16C-NEXT: movzwl (%rdi), %eax -; BWON-F16C-NEXT: vmovd %eax, %xmm0 +; BWON-F16C-NEXT: vpinsrw $0, (%rdi), %xmm0, %xmm0 ; BWON-F16C-NEXT: vcvtph2ps %xmm0, %xmm0 ; BWON-F16C-NEXT: retq ; @@ -113,8 +112,7 @@ define double @test_extend64(ptr %addr) #0 { ; ; BWON-F16C-LABEL: test_extend64: ; BWON-F16C: # %bb.0: -; BWON-F16C-NEXT: movzwl (%rdi), %eax -; BWON-F16C-NEXT: vmovd %eax, %xmm0 +; 
BWON-F16C-NEXT: vpinsrw $0, (%rdi), %xmm0, %xmm0 ; BWON-F16C-NEXT: vcvtph2ps %xmm0, %xmm0 ; BWON-F16C-NEXT: vcvtss2sd %xmm0, %xmm0, %xmm0 ; BWON-F16C-NEXT: retq @@ -220,8 +218,7 @@ define i64 @test_fptosi_i64(ptr %p) #0 { ; ; BWON-F16C-LABEL: test_fptosi_i64: ; BWON-F16C: # %bb.0: -; BWON-F16C-NEXT: movzwl (%rdi), %eax -; BWON-F16C-NEXT: vmovd %eax, %xmm0 +; BWON-F16C-NEXT: vpinsrw $0, (%rdi), %xmm0, %xmm0 ; BWON-F16C-NEXT: vcvtph2ps %xmm0, %xmm0 ; BWON-F16C-NEXT: vcvttss2si %xmm0, %rax ; BWON-F16C-NEXT: retq @@ -312,8 +309,7 @@ define i64 @test_fptoui_i64(ptr %p) #0 { ; ; BWON-F16C-LABEL: test_fptoui_i64: ; BWON-F16C: # %bb.0: -; BWON-F16C-NEXT: movzwl (%rdi), %eax -; BWON-F16C-NEXT: vmovd %eax, %xmm0 +; BWON-F16C-NEXT: vpinsrw $0, (%rdi), %xmm0, %xmm0 ; BWON-F16C-NEXT: vcvtph2ps %xmm0, %xmm0 ; BWON-F16C-NEXT: vcvttss2si %xmm0, %rcx ; BWON-F16C-NEXT: movq %rcx, %rdx @@ -851,13 +847,12 @@ define float @test_sitofp_fadd_i32(i32 %a, ptr %b) #0 { ; ; BWON-F16C-LABEL: test_sitofp_fadd_i32: ; BWON-F16C: # %bb.0: -; BWON-F16C-NEXT: vcvtsi2ss %edi, %xmm0, %xmm0 -; BWON-F16C-NEXT: vcvtps2ph $4, %xmm0, %xmm0 -; BWON-F16C-NEXT: vcvtph2ps %xmm0, %xmm0 -; BWON-F16C-NEXT: movzwl (%rsi), %eax -; BWON-F16C-NEXT: vmovd %eax, %xmm1 +; BWON-F16C-NEXT: vpinsrw $0, (%rsi), %xmm0, %xmm0 +; BWON-F16C-NEXT: vcvtsi2ss %edi, %xmm1, %xmm1 +; BWON-F16C-NEXT: vcvtps2ph $4, %xmm1, %xmm1 ; BWON-F16C-NEXT: vcvtph2ps %xmm1, %xmm1 -; BWON-F16C-NEXT: vaddss %xmm0, %xmm1, %xmm0 +; BWON-F16C-NEXT: vcvtph2ps %xmm0, %xmm0 +; BWON-F16C-NEXT: vaddss %xmm1, %xmm0, %xmm0 ; BWON-F16C-NEXT: vcvtps2ph $4, %xmm0, %xmm0 ; BWON-F16C-NEXT: vcvtph2ps %xmm0, %xmm0 ; BWON-F16C-NEXT: retq @@ -916,8 +911,6 @@ define half @PR40273(half) #0 { ; ; BWON-F16C-LABEL: PR40273: ; BWON-F16C: # %bb.0: -; BWON-F16C-NEXT: vpextrw $0, %xmm0, %eax -; BWON-F16C-NEXT: vmovd %eax, %xmm0 ; BWON-F16C-NEXT: vcvtph2ps %xmm0, %xmm0 ; BWON-F16C-NEXT: xorl %eax, %eax ; BWON-F16C-NEXT: vxorps %xmm1, %xmm1, %xmm1 @@ -969,8 +962,6 @@ define 
void @brcond(half %0) #0 { ; ; BWON-F16C-LABEL: brcond: ; BWON-F16C: # %bb.0: # %entry -; BWON-F16C-NEXT: vpextrw $0, %xmm0, %eax -; BWON-F16C-NEXT: vmovd %eax, %xmm0 ; BWON-F16C-NEXT: vcvtph2ps %xmm0, %xmm0 ; BWON-F16C-NEXT: vxorps %xmm1, %xmm1, %xmm1 ; BWON-F16C-NEXT: vucomiss %xmm1, %xmm0 @@ -1024,8 +1015,6 @@ define half @test_sqrt(half %0) #0 { ; ; BWON-F16C-LABEL: test_sqrt: ; BWON-F16C: # %bb.0: # %entry -; BWON-F16C-NEXT: vpextrw $0, %xmm0, %eax -; BWON-F16C-NEXT: vmovd %eax, %xmm0 ; BWON-F16C-NEXT: vcvtph2ps %xmm0, %xmm0 ; BWON-F16C-NEXT: vsqrtss %xmm0, %xmm0, %xmm0 ; BWON-F16C-NEXT: vcvtps2ph $4, %xmm0, %xmm0 @@ -1136,9 +1125,7 @@ define void @main.45() #0 { ; CHECK-LIBCALL-NEXT: pushq %rbx ; CHECK-LIBCALL-NEXT: pushq %rax ; CHECK-LIBCALL-NEXT: pinsrw $0, (%rax), %xmm0 -; CHECK-LIBCALL-NEXT: pextrw $0, %xmm0, %eax -; CHECK-LIBCALL-NEXT: movd %eax, %xmm1 -; CHECK-LIBCALL-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[0,0,0,0,4,5,6,7] +; CHECK-LIBCALL-NEXT: pshuflw {{.*#+}} xmm1 = xmm0[0,0,0,0,4,5,6,7] ; CHECK-LIBCALL-NEXT: movq %xmm1, %rbx ; CHECK-LIBCALL-NEXT: movq %rbx, %r14 ; CHECK-LIBCALL-NEXT: shrq $48, %r14 @@ -1167,15 +1154,12 @@ define void @main.45() #0 { ; BWON-F16C-LABEL: main.45: ; BWON-F16C: # %bb.0: # %entry ; BWON-F16C-NEXT: vpinsrw $0, (%rax), %xmm0, %xmm0 -; BWON-F16C-NEXT: vpextrw $0, %xmm0, %eax ; BWON-F16C-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,0,0,0,4,5,6,7] -; BWON-F16C-NEXT: vmovd %eax, %xmm1 -; BWON-F16C-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[0,0,0,0,4,5,6,7] -; BWON-F16C-NEXT: vcvtph2ps %xmm0, %xmm0 +; BWON-F16C-NEXT: vcvtph2ps %xmm0, %xmm1 ; BWON-F16C-NEXT: vxorps %xmm2, %xmm2, %xmm2 -; BWON-F16C-NEXT: vcmpunordps %xmm2, %xmm0, %xmm0 -; BWON-F16C-NEXT: vpackssdw %xmm0, %xmm0, %xmm0 -; BWON-F16C-NEXT: vpblendvb %xmm0, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm0 +; BWON-F16C-NEXT: vcmpunordps %xmm2, %xmm1, %xmm1 +; BWON-F16C-NEXT: vpackssdw %xmm1, %xmm1, %xmm1 +; BWON-F16C-NEXT: vpblendvb %xmm1, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 ; 
BWON-F16C-NEXT: vmovq %xmm0, (%rax) ; BWON-F16C-NEXT: retq ; @@ -1185,12 +1169,11 @@ define void @main.45() #0 { ; CHECK-I686-NEXT: pushl %esi ; CHECK-I686-NEXT: subl $20, %esp ; CHECK-I686-NEXT: pinsrw $0, (%eax), %xmm0 -; CHECK-I686-NEXT: pextrw $0, %xmm0, %eax -; CHECK-I686-NEXT: movd %eax, %xmm0 -; CHECK-I686-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,0,0,0,4,5,6,7] -; CHECK-I686-NEXT: movd %xmm0, %esi +; CHECK-I686-NEXT: pshuflw {{.*#+}} xmm1 = xmm0[0,0,0,0,4,5,6,7] +; CHECK-I686-NEXT: movd %xmm1, %esi ; CHECK-I686-NEXT: movl %esi, %edi ; CHECK-I686-NEXT: shrl $16, %edi +; CHECK-I686-NEXT: pextrw $0, %xmm0, %eax ; CHECK-I686-NEXT: movw %ax, (%esp) ; CHECK-I686-NEXT: calll __extendhfsf2 ; CHECK-I686-NEXT: fstps {{[0-9]+}}(%esp) @@ -1336,13 +1319,9 @@ define half @pr61271(half %0, half %1) #0 { ; ; BWON-F16C-LABEL: pr61271: ; BWON-F16C: # %bb.0: -; BWON-F16C-NEXT: vpextrw $0, %xmm0, %eax -; BWON-F16C-NEXT: vpextrw $0, %xmm1, %ecx -; BWON-F16C-NEXT: vmovd %ecx, %xmm0 -; BWON-F16C-NEXT: vcvtph2ps %xmm0, %xmm0 -; BWON-F16C-NEXT: vmovd %eax, %xmm1 ; BWON-F16C-NEXT: vcvtph2ps %xmm1, %xmm1 -; BWON-F16C-NEXT: vminss %xmm0, %xmm1, %xmm0 +; BWON-F16C-NEXT: vcvtph2ps %xmm0, %xmm0 +; BWON-F16C-NEXT: vminss %xmm1, %xmm0, %xmm0 ; BWON-F16C-NEXT: vcvtps2ph $4, %xmm0, %xmm0 ; BWON-F16C-NEXT: vmovd %xmm0, %eax ; BWON-F16C-NEXT: vpinsrw $0, %eax, %xmm0, %xmm0 diff --git a/llvm/test/CodeGen/X86/movrs-builtins.ll b/llvm/test/CodeGen/X86/movrs-builtins.ll index c1722c831c95d..ccf0833e53990 100644 --- a/llvm/test/CodeGen/X86/movrs-builtins.ll +++ b/llvm/test/CodeGen/X86/movrs-builtins.ll @@ -1,11 +1,17 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 3 ; RUN: llc < %s -verify-machineinstrs -mtriple=x86_64-unknown-unknown --show-mc-encoding -mattr=+movrs | FileCheck %s +; RUN: llc < %s -verify-machineinstrs -mtriple=x86_64-unknown-unknown --show-mc-encoding -mattr=+movrs,+egpr | FileCheck %s --check-prefix=EGPR define i8 
@test_movrs_si8(ptr %__A) { ; CHECK-LABEL: test_movrs_si8: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: movrsb (%rdi), %al # encoding: [0x0f,0x38,0x8a,0x07] ; CHECK-NEXT: retq # encoding: [0xc3] +; +; EGPR-LABEL: test_movrs_si8: +; EGPR: # %bb.0: # %entry +; EGPR-NEXT: movrsb (%rdi), %al # EVEX TO LEGACY Compression encoding: [0x0f,0x38,0x8a,0x07] +; EGPR-NEXT: retq # encoding: [0xc3] entry: %0 = call i8 @llvm.x86.movrsqi(ptr %__A) ret i8 %0 @@ -17,6 +23,11 @@ define i16 @test_movrs_si16(ptr %__A) { ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: movrsw (%rdi), %ax # encoding: [0x66,0x0f,0x38,0x8b,0x07] ; CHECK-NEXT: retq # encoding: [0xc3] +; +; EGPR-LABEL: test_movrs_si16: +; EGPR: # %bb.0: # %entry +; EGPR-NEXT: movrsw (%rdi), %ax # EVEX TO LEGACY Compression encoding: [0x66,0x0f,0x38,0x8b,0x07] +; EGPR-NEXT: retq # encoding: [0xc3] entry: %0 = call i16 @llvm.x86.movrshi(ptr %__A) ret i16 %0 @@ -28,6 +39,11 @@ define i32 @test_movrs_si32(ptr %__A) { ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: movrsl (%rdi), %eax # encoding: [0x0f,0x38,0x8b,0x07] ; CHECK-NEXT: retq # encoding: [0xc3] +; +; EGPR-LABEL: test_movrs_si32: +; EGPR: # %bb.0: # %entry +; EGPR-NEXT: movrsl (%rdi), %eax # EVEX TO LEGACY Compression encoding: [0x0f,0x38,0x8b,0x07] +; EGPR-NEXT: retq # encoding: [0xc3] entry: %0 = call i32 @llvm.x86.movrssi(ptr %__A) ret i32 %0 @@ -39,6 +55,11 @@ define i64 @test_movrs_si64(ptr %__A) { ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: movrsq (%rdi), %rax # encoding: [0x48,0x0f,0x38,0x8b,0x07] ; CHECK-NEXT: retq # encoding: [0xc3] +; +; EGPR-LABEL: test_movrs_si64: +; EGPR: # %bb.0: # %entry +; EGPR-NEXT: movrsq (%rdi), %rax # EVEX TO LEGACY Compression encoding: [0x48,0x0f,0x38,0x8b,0x07] +; EGPR-NEXT: retq # encoding: [0xc3] entry: %0 = call i64 @llvm.x86.movrsdi(ptr %__A) ret i64 %0 diff --git a/llvm/test/CodeGen/X86/pr114520.ll b/llvm/test/CodeGen/X86/pr114520.ll index c557da6b3ab8c..9bd1f49ff67c9 100644 --- a/llvm/test/CodeGen/X86/pr114520.ll +++ 
b/llvm/test/CodeGen/X86/pr114520.ll @@ -5,7 +5,6 @@ define half @test1(half %x) { ; CHECK-LABEL: test1: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vpextrw $0, %xmm0, %eax -; CHECK-NEXT: vmovd %eax, %xmm0 ; CHECK-NEXT: vcvtph2ps %xmm0, %xmm0 ; CHECK-NEXT: vucomiss {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 ; CHECK-NEXT: movl $64512, %ecx # imm = 0xFC00 diff --git a/llvm/test/CodeGen/X86/pr31088.ll b/llvm/test/CodeGen/X86/pr31088.ll index ce37622c476db..1c9c8e40c009d 100644 --- a/llvm/test/CodeGen/X86/pr31088.ll +++ b/llvm/test/CodeGen/X86/pr31088.ll @@ -51,17 +51,7 @@ define <1 x half> @ir_fadd_v1f16(<1 x half> %arg0, <1 x half> %arg1) nounwind { ; ; F16C-O0-LABEL: ir_fadd_v1f16: ; F16C-O0: # %bb.0: -; F16C-O0-NEXT: vpextrw $0, %xmm1, %eax -; F16C-O0-NEXT: movw %ax, %cx -; F16C-O0-NEXT: # implicit-def: $eax -; F16C-O0-NEXT: movw %cx, %ax -; F16C-O0-NEXT: vmovd %eax, %xmm1 ; F16C-O0-NEXT: vcvtph2ps %xmm1, %xmm1 -; F16C-O0-NEXT: vpextrw $0, %xmm0, %eax -; F16C-O0-NEXT: movw %ax, %cx -; F16C-O0-NEXT: # implicit-def: $eax -; F16C-O0-NEXT: movw %cx, %ax -; F16C-O0-NEXT: vmovd %eax, %xmm0 ; F16C-O0-NEXT: vcvtph2ps %xmm0, %xmm0 ; F16C-O0-NEXT: vaddss %xmm1, %xmm0, %xmm0 ; F16C-O0-NEXT: vcvtps2ph $4, %xmm0, %xmm0 diff --git a/llvm/test/CodeGen/X86/pr86305.ll b/llvm/test/CodeGen/X86/pr86305.ll index 79b42bb2532ca..0d2e1abe8e5fc 100644 --- a/llvm/test/CodeGen/X86/pr86305.ll +++ b/llvm/test/CodeGen/X86/pr86305.ll @@ -28,17 +28,16 @@ define <4 x bfloat> @fptrunc_v4f32(<4 x float> %a) nounwind { ; CHECK-LABEL: fptrunc_v4f32: ; CHECK: # %bb.0: ; CHECK-NEXT: pushq %rbp -; CHECK-NEXT: pushq %r15 ; CHECK-NEXT: pushq %r14 ; CHECK-NEXT: pushq %rbx -; CHECK-NEXT: subq $72, %rsp +; CHECK-NEXT: subq $64, %rsp ; CHECK-NEXT: vmovaps %xmm0, (%rsp) # 16-byte Spill ; CHECK-NEXT: callq __truncsfbf2@PLT ; CHECK-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; CHECK-NEXT: vpermilpd $1, (%rsp), %xmm0 # 16-byte Folded Reload -; CHECK-NEXT: # xmm0 = mem[1,0] +; CHECK-NEXT: vmovshdup 
(%rsp), %xmm0 # 16-byte Folded Reload +; CHECK-NEXT: # xmm0 = mem[1,1,3,3] ; CHECK-NEXT: callq __truncsfbf2@PLT -; CHECK-NEXT: vmovapd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; CHECK-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; CHECK-NEXT: vpshufd $255, (%rsp), %xmm0 # 16-byte Folded Reload ; CHECK-NEXT: # xmm0 = mem[3,3,3,3] ; CHECK-NEXT: callq __truncsfbf2@PLT @@ -49,24 +48,21 @@ define <4 x bfloat> @fptrunc_v4f32(<4 x float> %a) nounwind { ; CHECK-NEXT: vpextrw $0, %xmm0, %ebp ; CHECK-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; CHECK-NEXT: vpextrw $0, %xmm0, %r14d -; CHECK-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; CHECK-NEXT: vpextrw $0, %xmm0, %r15d -; CHECK-NEXT: vmovshdup (%rsp), %xmm0 # 16-byte Folded Reload -; CHECK-NEXT: # xmm0 = mem[1,1,3,3] +; CHECK-NEXT: vpermilpd $1, (%rsp), %xmm0 # 16-byte Folded Reload +; CHECK-NEXT: # xmm0 = mem[1,0] ; CHECK-NEXT: callq __truncsfbf2@PLT ; CHECK-NEXT: vpextrw $0, %xmm0, %eax -; CHECK-NEXT: vmovd %r15d, %xmm0 -; CHECK-NEXT: vpinsrw $1, %eax, %xmm0, %xmm0 -; CHECK-NEXT: vpinsrw $2, %r14d, %xmm0, %xmm0 +; CHECK-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; CHECK-NEXT: vpinsrw $1, %r14d, %xmm0, %xmm0 +; CHECK-NEXT: vpinsrw $2, %eax, %xmm0, %xmm0 ; CHECK-NEXT: vpinsrw $3, %ebp, %xmm0, %xmm0 ; CHECK-NEXT: vpinsrw $4, %ebx, %xmm0, %xmm0 ; CHECK-NEXT: vpinsrw $5, %ebx, %xmm0, %xmm0 ; CHECK-NEXT: vpinsrw $6, %ebx, %xmm0, %xmm0 ; CHECK-NEXT: vpinsrw $7, %ebx, %xmm0, %xmm0 -; CHECK-NEXT: addq $72, %rsp +; CHECK-NEXT: addq $64, %rsp ; CHECK-NEXT: popq %rbx ; CHECK-NEXT: popq %r14 -; CHECK-NEXT: popq %r15 ; CHECK-NEXT: popq %rbp ; CHECK-NEXT: retq %b = fptrunc <4 x float> %a to <4 x bfloat> diff --git a/llvm/test/CodeGen/X86/vector-compress.ll b/llvm/test/CodeGen/X86/vector-compress.ll index 94a1792cb8985..87a948a4f1f7e 100644 --- a/llvm/test/CodeGen/X86/vector-compress.ll +++ b/llvm/test/CodeGen/X86/vector-compress.ll @@ -3,7 +3,7 @@ ; 
RUN: llc -mtriple=x86_64 -mattr=+avx512f < %s | FileCheck %s --check-prefixes=CHECK,AVX512F ; RUN: llc -mtriple=x86_64 -mattr=+avx512f,+avx512vl,+avx512vbmi2 < %s | FileCheck %s --check-prefixes=CHECK,AVX512VL -define <4 x i32> @test_compress_v4i32(<4 x i32> %vec, <4 x i1> %mask, <4 x i32> %passthru) { +define <4 x i32> @test_compress_v4i32(<4 x i32> %vec, <4 x i1> %mask, <4 x i32> %passthru) nounwind { ; AVX2-LABEL: test_compress_v4i32: ; AVX2: # %bb.0: ; AVX2-NEXT: vpslld $31, %xmm1, %xmm1 @@ -64,7 +64,7 @@ define <4 x i32> @test_compress_v4i32(<4 x i32> %vec, <4 x i1> %mask, <4 x i32> ret <4 x i32> %out } -define <4 x float> @test_compress_v4f32(<4 x float> %vec, <4 x i1> %mask, <4 x float> %passthru) { +define <4 x float> @test_compress_v4f32(<4 x float> %vec, <4 x i1> %mask, <4 x float> %passthru) nounwind { ; AVX2-LABEL: test_compress_v4f32: ; AVX2: # %bb.0: ; AVX2-NEXT: vpslld $31, %xmm1, %xmm1 @@ -129,7 +129,7 @@ define <4 x float> @test_compress_v4f32(<4 x float> %vec, <4 x i1> %mask, <4 x f ret <4 x float> %out } -define <2 x i64> @test_compress_v2i64(<2 x i64> %vec, <2 x i1> %mask, <2 x i64> %passthru) { +define <2 x i64> @test_compress_v2i64(<2 x i64> %vec, <2 x i1> %mask, <2 x i64> %passthru) nounwind { ; AVX2-LABEL: test_compress_v2i64: ; AVX2: # %bb.0: ; AVX2-NEXT: vpsllq $63, %xmm1, %xmm1 @@ -181,7 +181,7 @@ define <2 x i64> @test_compress_v2i64(<2 x i64> %vec, <2 x i1> %mask, <2 x i64> ret <2 x i64> %out } -define <2 x double> @test_compress_v2f64(<2 x double> %vec, <2 x i1> %mask, <2 x double> %passthru) { +define <2 x double> @test_compress_v2f64(<2 x double> %vec, <2 x i1> %mask, <2 x double> %passthru) nounwind { ; AVX2-LABEL: test_compress_v2f64: ; AVX2: # %bb.0: ; AVX2-NEXT: vpsllq $63, %xmm1, %xmm1 @@ -236,18 +236,14 @@ define <2 x double> @test_compress_v2f64(<2 x double> %vec, <2 x i1> %mask, <2 x ret <2 x double> %out } -define <8 x i32> @test_compress_v8i32(<8 x i32> %vec, <8 x i1> %mask, <8 x i32> %passthru) { +define <8 x i32> 
@test_compress_v8i32(<8 x i32> %vec, <8 x i1> %mask, <8 x i32> %passthru) nounwind { ; AVX2-LABEL: test_compress_v8i32: ; AVX2: # %bb.0: ; AVX2-NEXT: pushq %rbp -; AVX2-NEXT: .cfi_def_cfa_offset 16 -; AVX2-NEXT: .cfi_offset %rbp, -16 ; AVX2-NEXT: movq %rsp, %rbp -; AVX2-NEXT: .cfi_def_cfa_register %rbp ; AVX2-NEXT: pushq %rbx ; AVX2-NEXT: andq $-32, %rsp ; AVX2-NEXT: subq $64, %rsp -; AVX2-NEXT: .cfi_offset %rbx, -24 ; AVX2-NEXT: vpmovzxwd {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero ; AVX2-NEXT: vpslld $31, %ymm1, %ymm1 ; AVX2-NEXT: vpsrad $31, %ymm1, %ymm3 @@ -315,7 +311,6 @@ define <8 x i32> @test_compress_v8i32(<8 x i32> %vec, <8 x i1> %mask, <8 x i32> ; AVX2-NEXT: leaq -8(%rbp), %rsp ; AVX2-NEXT: popq %rbx ; AVX2-NEXT: popq %rbp -; AVX2-NEXT: .cfi_def_cfa %rsp, 8 ; AVX2-NEXT: retq ; ; AVX512F-LABEL: test_compress_v8i32: @@ -340,14 +335,11 @@ define <8 x i32> @test_compress_v8i32(<8 x i32> %vec, <8 x i1> %mask, <8 x i32> ret <8 x i32> %out } -define <8 x float> @test_compress_v8f32(<8 x float> %vec, <8 x i1> %mask, <8 x float> %passthru) { +define <8 x float> @test_compress_v8f32(<8 x float> %vec, <8 x i1> %mask, <8 x float> %passthru) nounwind { ; AVX2-LABEL: test_compress_v8f32: ; AVX2: # %bb.0: ; AVX2-NEXT: pushq %rbp -; AVX2-NEXT: .cfi_def_cfa_offset 16 -; AVX2-NEXT: .cfi_offset %rbp, -16 ; AVX2-NEXT: movq %rsp, %rbp -; AVX2-NEXT: .cfi_def_cfa_register %rbp ; AVX2-NEXT: andq $-32, %rsp ; AVX2-NEXT: subq $64, %rsp ; AVX2-NEXT: vpmovzxwd {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero @@ -424,7 +416,6 @@ define <8 x float> @test_compress_v8f32(<8 x float> %vec, <8 x i1> %mask, <8 x f ; AVX2-NEXT: vmovaps (%rsp), %ymm0 ; AVX2-NEXT: movq %rbp, %rsp ; AVX2-NEXT: popq %rbp -; AVX2-NEXT: .cfi_def_cfa %rsp, 8 ; AVX2-NEXT: retq ; ; AVX512F-LABEL: test_compress_v8f32: @@ -449,14 +440,11 @@ define <8 x float> 
@test_compress_v8f32(<8 x float> %vec, <8 x i1> %mask, <8 x f ret <8 x float> %out } -define <4 x i64> @test_compress_v4i64(<4 x i64> %vec, <4 x i1> %mask, <4 x i64> %passthru) { +define <4 x i64> @test_compress_v4i64(<4 x i64> %vec, <4 x i1> %mask, <4 x i64> %passthru) nounwind { ; AVX2-LABEL: test_compress_v4i64: ; AVX2: # %bb.0: ; AVX2-NEXT: pushq %rbp -; AVX2-NEXT: .cfi_def_cfa_offset 16 -; AVX2-NEXT: .cfi_offset %rbp, -16 ; AVX2-NEXT: movq %rsp, %rbp -; AVX2-NEXT: .cfi_def_cfa_register %rbp ; AVX2-NEXT: andq $-32, %rsp ; AVX2-NEXT: subq $64, %rsp ; AVX2-NEXT: vpslld $31, %xmm1, %xmm1 @@ -499,7 +487,6 @@ define <4 x i64> @test_compress_v4i64(<4 x i64> %vec, <4 x i1> %mask, <4 x i64> ; AVX2-NEXT: vmovaps (%rsp), %ymm0 ; AVX2-NEXT: movq %rbp, %rsp ; AVX2-NEXT: popq %rbp -; AVX2-NEXT: .cfi_def_cfa %rsp, 8 ; AVX2-NEXT: retq ; ; AVX512F-LABEL: test_compress_v4i64: @@ -525,7 +512,58 @@ define <4 x i64> @test_compress_v4i64(<4 x i64> %vec, <4 x i1> %mask, <4 x i64> ret <4 x i64> %out } -define <4 x double> @test_compress_v4f64(<4 x double> %vec, <4 x i1> %mask, <4 x double> %passthru) { +define <4 x double> @test_compress_v4f64(<4 x double> %vec, <4 x i1> %mask, <4 x double> %passthru) nounwind { +; AVX2-LABEL: test_compress_v4f64: +; AVX2: # %bb.0: +; AVX2-NEXT: pushq %rbp +; AVX2-NEXT: movq %rsp, %rbp +; AVX2-NEXT: andq $-32, %rsp +; AVX2-NEXT: subq $64, %rsp +; AVX2-NEXT: vpslld $31, %xmm1, %xmm1 +; AVX2-NEXT: vpsrad $31, %xmm1, %xmm1 +; AVX2-NEXT: vpmovsxdq %xmm1, %ymm3 +; AVX2-NEXT: vmovaps %ymm2, (%rsp) +; AVX2-NEXT: vpsrlq $63, %ymm3, %ymm1 +; AVX2-NEXT: vextracti128 $1, %ymm1, %xmm2 +; AVX2-NEXT: vpaddq %xmm2, %xmm1, %xmm1 +; AVX2-NEXT: vpextrq $1, %xmm1, %rax +; AVX2-NEXT: vmovq %xmm1, %rcx +; AVX2-NEXT: addl %eax, %ecx +; AVX2-NEXT: andl $3, %ecx +; AVX2-NEXT: vmovsd {{.*#+}} xmm1 = mem[0],zero +; AVX2-NEXT: vmovlpd %xmm0, (%rsp) +; AVX2-NEXT: vmovq %xmm3, %rax +; AVX2-NEXT: andl $1, %eax +; AVX2-NEXT: movl %eax, %ecx +; AVX2-NEXT: vmovhpd %xmm0, 
(%rsp,%rcx,8) +; AVX2-NEXT: vpextrq $1, %xmm3, %rcx +; AVX2-NEXT: subq %rcx, %rax +; AVX2-NEXT: movl %eax, %ecx +; AVX2-NEXT: vextractf128 $1, %ymm0, %xmm0 +; AVX2-NEXT: vmovlpd %xmm0, (%rsp,%rcx,8) +; AVX2-NEXT: vextracti128 $1, %ymm3, %xmm2 +; AVX2-NEXT: vmovq %xmm2, %rcx +; AVX2-NEXT: subq %rcx, %rax +; AVX2-NEXT: movl %eax, %ecx +; AVX2-NEXT: andl $3, %ecx +; AVX2-NEXT: vmovhpd %xmm0, (%rsp,%rcx,8) +; AVX2-NEXT: vpextrq $1, %xmm2, %rcx +; AVX2-NEXT: subq %rcx, %rax +; AVX2-NEXT: cmpq $4, %rax +; AVX2-NEXT: jb .LBB7_2 +; AVX2-NEXT: # %bb.1: +; AVX2-NEXT: vshufpd {{.*#+}} xmm1 = xmm0[1,0] +; AVX2-NEXT: .LBB7_2: +; AVX2-NEXT: cmpq $3, %rax +; AVX2-NEXT: movl $3, %ecx +; AVX2-NEXT: cmovbq %rax, %rcx +; AVX2-NEXT: movl %ecx, %eax +; AVX2-NEXT: vmovsd %xmm1, (%rsp,%rax,8) +; AVX2-NEXT: vmovaps (%rsp), %ymm0 +; AVX2-NEXT: movq %rbp, %rsp +; AVX2-NEXT: popq %rbp +; AVX2-NEXT: retq +; ; AVX512F-LABEL: test_compress_v4f64: ; AVX512F: # %bb.0: ; AVX512F-NEXT: # kill: def $ymm2 killed $ymm2 def $zmm2 @@ -549,7 +587,141 @@ define <4 x double> @test_compress_v4f64(<4 x double> %vec, <4 x i1> %mask, <4 x ret <4 x double> %out } -define <16 x i32> @test_compress_v16i32(<16 x i32> %vec, <16 x i1> %mask, <16 x i32> %passthru) { +define <16 x i32> @test_compress_v16i32(<16 x i32> %vec, <16 x i1> %mask, <16 x i32> %passthru) nounwind { +; AVX2-LABEL: test_compress_v16i32: +; AVX2: # %bb.0: +; AVX2-NEXT: pushq %rbp +; AVX2-NEXT: movq %rsp, %rbp +; AVX2-NEXT: pushq %r15 +; AVX2-NEXT: pushq %r14 +; AVX2-NEXT: pushq %r13 +; AVX2-NEXT: pushq %r12 +; AVX2-NEXT: pushq %rbx +; AVX2-NEXT: andq $-32, %rsp +; AVX2-NEXT: subq $128, %rsp +; AVX2-NEXT: vmovaps %ymm4, {{[0-9]+}}(%rsp) +; AVX2-NEXT: vmovaps %ymm3, (%rsp) +; AVX2-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2, %xmm3 +; AVX2-NEXT: vpmovzxbd {{.*#+}} ymm4 = 
xmm3[0],zero,zero,zero,xmm3[1],zero,zero,zero,xmm3[2],zero,zero,zero,xmm3[3],zero,zero,zero,xmm3[4],zero,zero,zero,xmm3[5],zero,zero,zero,xmm3[6],zero,zero,zero,xmm3[7],zero,zero,zero +; AVX2-NEXT: vpshufd {{.*#+}} xmm3 = xmm3[2,3,2,3] +; AVX2-NEXT: vpmovzxbd {{.*#+}} ymm3 = xmm3[0],zero,zero,zero,xmm3[1],zero,zero,zero,xmm3[2],zero,zero,zero,xmm3[3],zero,zero,zero,xmm3[4],zero,zero,zero,xmm3[5],zero,zero,zero,xmm3[6],zero,zero,zero,xmm3[7],zero,zero,zero +; AVX2-NEXT: vpaddd %ymm3, %ymm4, %ymm3 +; AVX2-NEXT: vextracti128 $1, %ymm3, %xmm4 +; AVX2-NEXT: vpaddd %xmm4, %xmm3, %xmm3 +; AVX2-NEXT: vpextrd $1, %xmm3, %eax +; AVX2-NEXT: vmovd %xmm3, %ecx +; AVX2-NEXT: addl %eax, %ecx +; AVX2-NEXT: vpextrd $2, %xmm3, %eax +; AVX2-NEXT: vpextrd $3, %xmm3, %edx +; AVX2-NEXT: addl %eax, %edx +; AVX2-NEXT: addl %ecx, %edx +; AVX2-NEXT: andl $15, %edx +; AVX2-NEXT: movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; AVX2-NEXT: vpextrb $1, %xmm2, %eax +; AVX2-NEXT: andl $1, %eax +; AVX2-NEXT: vmovd %xmm2, %ecx +; AVX2-NEXT: andl $1, %ecx +; AVX2-NEXT: movq %rcx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; AVX2-NEXT: addq %rcx, %rax +; AVX2-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; AVX2-NEXT: vpextrb $2, %xmm2, %ecx +; AVX2-NEXT: andl $1, %ecx +; AVX2-NEXT: addq %rax, %rcx +; AVX2-NEXT: movq %rcx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; AVX2-NEXT: vpextrb $3, %xmm2, %eax +; AVX2-NEXT: andl $1, %eax +; AVX2-NEXT: addq %rcx, %rax +; AVX2-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; AVX2-NEXT: vpextrb $4, %xmm2, %r8d +; AVX2-NEXT: andl $1, %r8d +; AVX2-NEXT: addq %rax, %r8 +; AVX2-NEXT: vpextrb $5, %xmm2, %r9d +; AVX2-NEXT: andl $1, %r9d +; AVX2-NEXT: addq %r8, %r9 +; AVX2-NEXT: vpextrb $6, %xmm2, %r10d +; AVX2-NEXT: andl $1, %r10d +; AVX2-NEXT: addq %r9, %r10 +; AVX2-NEXT: vpextrb $7, %xmm2, %r11d +; AVX2-NEXT: andl $1, %r11d +; AVX2-NEXT: addq %r10, %r11 +; AVX2-NEXT: vpextrb $8, %xmm2, %ebx +; AVX2-NEXT: andl $1, %ebx +; AVX2-NEXT: addq %r11, 
%rbx +; AVX2-NEXT: vpextrb $9, %xmm2, %r14d +; AVX2-NEXT: andl $1, %r14d +; AVX2-NEXT: addq %rbx, %r14 +; AVX2-NEXT: vpextrb $10, %xmm2, %r15d +; AVX2-NEXT: andl $1, %r15d +; AVX2-NEXT: addq %r14, %r15 +; AVX2-NEXT: vpextrb $11, %xmm2, %r12d +; AVX2-NEXT: andl $1, %r12d +; AVX2-NEXT: addq %r15, %r12 +; AVX2-NEXT: vpextrb $12, %xmm2, %r13d +; AVX2-NEXT: andl $1, %r13d +; AVX2-NEXT: addq %r12, %r13 +; AVX2-NEXT: vpextrb $13, %xmm2, %ecx +; AVX2-NEXT: andl $1, %ecx +; AVX2-NEXT: addq %r13, %rcx +; AVX2-NEXT: vpextrb $14, %xmm2, %eax +; AVX2-NEXT: andl $1, %eax +; AVX2-NEXT: addq %rcx, %rax +; AVX2-NEXT: vpextrb $15, %xmm2, %edx +; AVX2-NEXT: andl $1, %edx +; AVX2-NEXT: addq %rax, %rdx +; AVX2-NEXT: vextractf128 $1, %ymm1, %xmm2 +; AVX2-NEXT: cmpq $16, %rdx +; AVX2-NEXT: vextractps $3, %xmm2, %esi +; AVX2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rdi # 8-byte Reload +; AVX2-NEXT: cmovbl (%rsp,%rdi,4), %esi +; AVX2-NEXT: movl %esi, %edi +; AVX2-NEXT: vmovss %xmm0, (%rsp) +; AVX2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rsi # 8-byte Reload +; AVX2-NEXT: vextractps $1, %xmm0, (%rsp,%rsi,4) +; AVX2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rsi # 8-byte Reload +; AVX2-NEXT: vextractps $2, %xmm0, (%rsp,%rsi,4) +; AVX2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rsi # 8-byte Reload +; AVX2-NEXT: vextractps $3, %xmm0, (%rsp,%rsi,4) +; AVX2-NEXT: vextractf128 $1, %ymm0, %xmm0 +; AVX2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rsi # 8-byte Reload +; AVX2-NEXT: vmovss %xmm0, (%rsp,%rsi,4) +; AVX2-NEXT: andl $15, %r8d +; AVX2-NEXT: vextractps $1, %xmm0, (%rsp,%r8,4) +; AVX2-NEXT: andl $15, %r9d +; AVX2-NEXT: vextractps $2, %xmm0, (%rsp,%r9,4) +; AVX2-NEXT: andl $15, %r10d +; AVX2-NEXT: vextractps $3, %xmm0, (%rsp,%r10,4) +; AVX2-NEXT: andl $15, %r11d +; AVX2-NEXT: vmovss %xmm1, (%rsp,%r11,4) +; AVX2-NEXT: andl $15, %ebx +; AVX2-NEXT: vextractps $1, %xmm1, (%rsp,%rbx,4) +; AVX2-NEXT: andl $15, %r14d +; AVX2-NEXT: vextractps $2, %xmm1, (%rsp,%r14,4) +; AVX2-NEXT: andl $15, %r15d +; AVX2-NEXT: vextractps 
$3, %xmm1, (%rsp,%r15,4) +; AVX2-NEXT: andl $15, %r12d +; AVX2-NEXT: vmovss %xmm2, (%rsp,%r12,4) +; AVX2-NEXT: andl $15, %r13d +; AVX2-NEXT: vextractps $1, %xmm2, (%rsp,%r13,4) +; AVX2-NEXT: andl $15, %ecx +; AVX2-NEXT: vextractps $2, %xmm2, (%rsp,%rcx,4) +; AVX2-NEXT: andl $15, %eax +; AVX2-NEXT: vextractps $3, %xmm2, (%rsp,%rax,4) +; AVX2-NEXT: cmpq $15, %rdx +; AVX2-NEXT: movl $15, %eax +; AVX2-NEXT: cmovbq %rdx, %rax +; AVX2-NEXT: movl %eax, %eax +; AVX2-NEXT: movl %edi, (%rsp,%rax,4) +; AVX2-NEXT: vmovaps (%rsp), %ymm0 +; AVX2-NEXT: vmovaps {{[0-9]+}}(%rsp), %ymm1 +; AVX2-NEXT: leaq -40(%rbp), %rsp +; AVX2-NEXT: popq %rbx +; AVX2-NEXT: popq %r12 +; AVX2-NEXT: popq %r13 +; AVX2-NEXT: popq %r14 +; AVX2-NEXT: popq %r15 +; AVX2-NEXT: popq %rbp +; AVX2-NEXT: retq +; ; AVX512F-LABEL: test_compress_v16i32: ; AVX512F: # %bb.0: ; AVX512F-NEXT: vpmovsxbd %xmm1, %zmm1 @@ -570,7 +742,135 @@ define <16 x i32> @test_compress_v16i32(<16 x i32> %vec, <16 x i1> %mask, <16 x ret <16 x i32> %out } -define <16 x float> @test_compress_v16f32(<16 x float> %vec, <16 x i1> %mask, <16 x float> %passthru) { +define <16 x float> @test_compress_v16f32(<16 x float> %vec, <16 x i1> %mask, <16 x float> %passthru) nounwind { +; AVX2-LABEL: test_compress_v16f32: +; AVX2: # %bb.0: +; AVX2-NEXT: pushq %rbp +; AVX2-NEXT: movq %rsp, %rbp +; AVX2-NEXT: andq $-32, %rsp +; AVX2-NEXT: subq $96, %rsp +; AVX2-NEXT: vmovaps %ymm4, {{[0-9]+}}(%rsp) +; AVX2-NEXT: vmovaps %ymm3, (%rsp) +; AVX2-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2, %xmm3 +; AVX2-NEXT: vpmovzxbd {{.*#+}} ymm4 = xmm3[0],zero,zero,zero,xmm3[1],zero,zero,zero,xmm3[2],zero,zero,zero,xmm3[3],zero,zero,zero,xmm3[4],zero,zero,zero,xmm3[5],zero,zero,zero,xmm3[6],zero,zero,zero,xmm3[7],zero,zero,zero +; AVX2-NEXT: vpshufd {{.*#+}} xmm3 = xmm3[2,3,2,3] +; AVX2-NEXT: vpmovzxbd {{.*#+}} ymm3 = 
xmm3[0],zero,zero,zero,xmm3[1],zero,zero,zero,xmm3[2],zero,zero,zero,xmm3[3],zero,zero,zero,xmm3[4],zero,zero,zero,xmm3[5],zero,zero,zero,xmm3[6],zero,zero,zero,xmm3[7],zero,zero,zero +; AVX2-NEXT: vpaddd %ymm3, %ymm4, %ymm3 +; AVX2-NEXT: vextracti128 $1, %ymm3, %xmm4 +; AVX2-NEXT: vpaddd %xmm4, %xmm3, %xmm3 +; AVX2-NEXT: vpextrd $1, %xmm3, %eax +; AVX2-NEXT: vmovd %xmm3, %ecx +; AVX2-NEXT: addl %eax, %ecx +; AVX2-NEXT: vpextrd $2, %xmm3, %eax +; AVX2-NEXT: vpextrd $3, %xmm3, %edx +; AVX2-NEXT: addl %eax, %edx +; AVX2-NEXT: addl %ecx, %edx +; AVX2-NEXT: andl $15, %edx +; AVX2-NEXT: vmovss {{.*#+}} xmm3 = mem[0],zero,zero,zero +; AVX2-NEXT: vmovss %xmm0, (%rsp) +; AVX2-NEXT: vmovd %xmm2, %eax +; AVX2-NEXT: andl $1, %eax +; AVX2-NEXT: vextractps $1, %xmm0, (%rsp,%rax,4) +; AVX2-NEXT: vpextrb $1, %xmm2, %ecx +; AVX2-NEXT: andl $1, %ecx +; AVX2-NEXT: addq %rax, %rcx +; AVX2-NEXT: vextractps $2, %xmm0, (%rsp,%rcx,4) +; AVX2-NEXT: vpextrb $2, %xmm2, %eax +; AVX2-NEXT: andl $1, %eax +; AVX2-NEXT: addq %rcx, %rax +; AVX2-NEXT: vextractps $3, %xmm0, (%rsp,%rax,4) +; AVX2-NEXT: vpextrb $3, %xmm2, %ecx +; AVX2-NEXT: andl $1, %ecx +; AVX2-NEXT: addq %rax, %rcx +; AVX2-NEXT: vextractf128 $1, %ymm0, %xmm0 +; AVX2-NEXT: vmovss %xmm0, (%rsp,%rcx,4) +; AVX2-NEXT: vpextrb $4, %xmm2, %eax +; AVX2-NEXT: andl $1, %eax +; AVX2-NEXT: addq %rcx, %rax +; AVX2-NEXT: vpextrb $5, %xmm2, %ecx +; AVX2-NEXT: andl $1, %ecx +; AVX2-NEXT: addq %rax, %rcx +; AVX2-NEXT: # kill: def $eax killed $eax killed $rax def $rax +; AVX2-NEXT: andl $15, %eax +; AVX2-NEXT: vextractps $1, %xmm0, (%rsp,%rax,4) +; AVX2-NEXT: vpextrb $6, %xmm2, %eax +; AVX2-NEXT: andl $1, %eax +; AVX2-NEXT: addq %rcx, %rax +; AVX2-NEXT: # kill: def $ecx killed $ecx killed $rcx def $rcx +; AVX2-NEXT: andl $15, %ecx +; AVX2-NEXT: vextractps $2, %xmm0, (%rsp,%rcx,4) +; AVX2-NEXT: vpextrb $7, %xmm2, %ecx +; AVX2-NEXT: andl $1, %ecx +; AVX2-NEXT: addq %rax, %rcx +; AVX2-NEXT: # kill: def $eax killed $eax killed $rax def $rax +; 
AVX2-NEXT: andl $15, %eax +; AVX2-NEXT: vextractps $3, %xmm0, (%rsp,%rax,4) +; AVX2-NEXT: vpextrb $8, %xmm2, %eax +; AVX2-NEXT: andl $1, %eax +; AVX2-NEXT: addq %rcx, %rax +; AVX2-NEXT: # kill: def $ecx killed $ecx killed $rcx def $rcx +; AVX2-NEXT: andl $15, %ecx +; AVX2-NEXT: vmovss %xmm1, (%rsp,%rcx,4) +; AVX2-NEXT: vpextrb $9, %xmm2, %ecx +; AVX2-NEXT: andl $1, %ecx +; AVX2-NEXT: addq %rax, %rcx +; AVX2-NEXT: # kill: def $eax killed $eax killed $rax def $rax +; AVX2-NEXT: andl $15, %eax +; AVX2-NEXT: vextractps $1, %xmm1, (%rsp,%rax,4) +; AVX2-NEXT: vpextrb $10, %xmm2, %eax +; AVX2-NEXT: andl $1, %eax +; AVX2-NEXT: addq %rcx, %rax +; AVX2-NEXT: # kill: def $ecx killed $ecx killed $rcx def $rcx +; AVX2-NEXT: andl $15, %ecx +; AVX2-NEXT: vextractps $2, %xmm1, (%rsp,%rcx,4) +; AVX2-NEXT: vpextrb $11, %xmm2, %ecx +; AVX2-NEXT: andl $1, %ecx +; AVX2-NEXT: addq %rax, %rcx +; AVX2-NEXT: # kill: def $eax killed $eax killed $rax def $rax +; AVX2-NEXT: andl $15, %eax +; AVX2-NEXT: vextractps $3, %xmm1, (%rsp,%rax,4) +; AVX2-NEXT: vpextrb $12, %xmm2, %eax +; AVX2-NEXT: andl $1, %eax +; AVX2-NEXT: addq %rcx, %rax +; AVX2-NEXT: # kill: def $ecx killed $ecx killed $rcx def $rcx +; AVX2-NEXT: andl $15, %ecx +; AVX2-NEXT: vextractf128 $1, %ymm1, %xmm0 +; AVX2-NEXT: vmovss %xmm0, (%rsp,%rcx,4) +; AVX2-NEXT: vpextrb $13, %xmm2, %ecx +; AVX2-NEXT: andl $1, %ecx +; AVX2-NEXT: addq %rax, %rcx +; AVX2-NEXT: # kill: def $eax killed $eax killed $rax def $rax +; AVX2-NEXT: andl $15, %eax +; AVX2-NEXT: vextractps $1, %xmm0, (%rsp,%rax,4) +; AVX2-NEXT: vpextrb $14, %xmm2, %edx +; AVX2-NEXT: andl $1, %edx +; AVX2-NEXT: addq %rcx, %rdx +; AVX2-NEXT: # kill: def $ecx killed $ecx killed $rcx def $rcx +; AVX2-NEXT: andl $15, %ecx +; AVX2-NEXT: vextractps $2, %xmm0, (%rsp,%rcx,4) +; AVX2-NEXT: vpextrb $15, %xmm2, %eax +; AVX2-NEXT: andl $1, %eax +; AVX2-NEXT: addq %rdx, %rax +; AVX2-NEXT: # kill: def $edx killed $edx killed $rdx def $rdx +; AVX2-NEXT: andl $15, %edx +; AVX2-NEXT: vshufps 
{{.*#+}} xmm0 = xmm0[3,3,3,3] +; AVX2-NEXT: vmovss %xmm0, (%rsp,%rdx,4) +; AVX2-NEXT: cmpq $16, %rax +; AVX2-NEXT: jae .LBB9_2 +; AVX2-NEXT: # %bb.1: +; AVX2-NEXT: vmovaps %xmm3, %xmm0 +; AVX2-NEXT: .LBB9_2: +; AVX2-NEXT: cmpq $15, %rax +; AVX2-NEXT: movl $15, %ecx +; AVX2-NEXT: cmovbq %rax, %rcx +; AVX2-NEXT: movl %ecx, %eax +; AVX2-NEXT: vmovss %xmm0, (%rsp,%rax,4) +; AVX2-NEXT: vmovaps (%rsp), %ymm0 +; AVX2-NEXT: vmovaps {{[0-9]+}}(%rsp), %ymm1 +; AVX2-NEXT: movq %rbp, %rsp +; AVX2-NEXT: popq %rbp +; AVX2-NEXT: retq +; ; AVX512F-LABEL: test_compress_v16f32: ; AVX512F: # %bb.0: ; AVX512F-NEXT: vpmovsxbd %xmm1, %zmm1 @@ -591,7 +891,79 @@ define <16 x float> @test_compress_v16f32(<16 x float> %vec, <16 x i1> %mask, <1 ret <16 x float> %out } -define <8 x i64> @test_compress_v8i64(<8 x i64> %vec, <8 x i1> %mask, <8 x i64> %passthru) { +define <8 x i64> @test_compress_v8i64(<8 x i64> %vec, <8 x i1> %mask, <8 x i64> %passthru) nounwind { +; AVX2-LABEL: test_compress_v8i64: +; AVX2: # %bb.0: +; AVX2-NEXT: pushq %rbp +; AVX2-NEXT: movq %rsp, %rbp +; AVX2-NEXT: pushq %rbx +; AVX2-NEXT: andq $-32, %rsp +; AVX2-NEXT: subq $96, %rsp +; AVX2-NEXT: vmovaps %ymm4, {{[0-9]+}}(%rsp) +; AVX2-NEXT: vmovaps %ymm3, (%rsp) +; AVX2-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2, %xmm3 +; AVX2-NEXT: vpmovzxwq {{.*#+}} ymm4 = xmm3[0],zero,zero,zero,xmm3[1],zero,zero,zero,xmm3[2],zero,zero,zero,xmm3[3],zero,zero,zero +; AVX2-NEXT: vpshufd {{.*#+}} xmm3 = xmm3[2,3,2,3] +; AVX2-NEXT: vpmovzxwq {{.*#+}} ymm3 = xmm3[0],zero,zero,zero,xmm3[1],zero,zero,zero,xmm3[2],zero,zero,zero,xmm3[3],zero,zero,zero +; AVX2-NEXT: vpaddq %ymm3, %ymm4, %ymm3 +; AVX2-NEXT: vextracti128 $1, %ymm3, %xmm4 +; AVX2-NEXT: vpaddq %xmm4, %xmm3, %xmm3 +; AVX2-NEXT: vpextrq $1, %xmm3, %rcx +; AVX2-NEXT: vmovq %xmm3, %rax +; AVX2-NEXT: addl %ecx, %eax +; AVX2-NEXT: andl $7, %eax +; AVX2-NEXT: vpextrw $1, %xmm2, %ecx +; AVX2-NEXT: andl $1, %ecx +; AVX2-NEXT: vmovd %xmm2, %edx +; AVX2-NEXT: andl $1, %edx +; AVX2-NEXT: 
addq %rdx, %rcx +; AVX2-NEXT: vpextrw $2, %xmm2, %esi +; AVX2-NEXT: andl $1, %esi +; AVX2-NEXT: addq %rcx, %rsi +; AVX2-NEXT: vpextrw $3, %xmm2, %edi +; AVX2-NEXT: andl $1, %edi +; AVX2-NEXT: addq %rsi, %rdi +; AVX2-NEXT: vpextrw $4, %xmm2, %r8d +; AVX2-NEXT: andl $1, %r8d +; AVX2-NEXT: addq %rdi, %r8 +; AVX2-NEXT: vpextrw $5, %xmm2, %r9d +; AVX2-NEXT: andl $1, %r9d +; AVX2-NEXT: addq %r8, %r9 +; AVX2-NEXT: vpextrw $6, %xmm2, %r10d +; AVX2-NEXT: andl $1, %r10d +; AVX2-NEXT: addq %r9, %r10 +; AVX2-NEXT: vpextrw $7, %xmm2, %r11d +; AVX2-NEXT: andl $1, %r11d +; AVX2-NEXT: addq %r10, %r11 +; AVX2-NEXT: vextracti128 $1, %ymm1, %xmm2 +; AVX2-NEXT: vpextrq $1, %xmm2, %rbx +; AVX2-NEXT: cmpq $8, %r11 +; AVX2-NEXT: cmovbq (%rsp,%rax,8), %rbx +; AVX2-NEXT: vmovq %xmm0, (%rsp) +; AVX2-NEXT: vpextrq $1, %xmm0, (%rsp,%rdx,8) +; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm0 +; AVX2-NEXT: vmovq %xmm0, (%rsp,%rcx,8) +; AVX2-NEXT: vpextrq $1, %xmm0, (%rsp,%rsi,8) +; AVX2-NEXT: andl $7, %edi +; AVX2-NEXT: vmovq %xmm1, (%rsp,%rdi,8) +; AVX2-NEXT: andl $7, %r8d +; AVX2-NEXT: vpextrq $1, %xmm1, (%rsp,%r8,8) +; AVX2-NEXT: andl $7, %r9d +; AVX2-NEXT: vmovq %xmm2, (%rsp,%r9,8) +; AVX2-NEXT: andl $7, %r10d +; AVX2-NEXT: vpextrq $1, %xmm2, (%rsp,%r10,8) +; AVX2-NEXT: cmpq $7, %r11 +; AVX2-NEXT: movl $7, %eax +; AVX2-NEXT: cmovbq %r11, %rax +; AVX2-NEXT: movl %eax, %eax +; AVX2-NEXT: movq %rbx, (%rsp,%rax,8) +; AVX2-NEXT: vmovaps (%rsp), %ymm0 +; AVX2-NEXT: vmovaps {{[0-9]+}}(%rsp), %ymm1 +; AVX2-NEXT: leaq -8(%rbp), %rsp +; AVX2-NEXT: popq %rbx +; AVX2-NEXT: popq %rbp +; AVX2-NEXT: retq +; ; AVX512F-LABEL: test_compress_v8i64: ; AVX512F: # %bb.0: ; AVX512F-NEXT: vpmovsxwq %xmm1, %zmm1 @@ -612,7 +984,84 @@ define <8 x i64> @test_compress_v8i64(<8 x i64> %vec, <8 x i1> %mask, <8 x i64> ret <8 x i64> %out } -define <8 x double> @test_compress_v8f64(<8 x double> %vec, <8 x i1> %mask, <8 x double> %passthru) { +define <8 x double> @test_compress_v8f64(<8 x double> %vec, <8 x i1> %mask, <8 x double> 
%passthru) nounwind { +; AVX2-LABEL: test_compress_v8f64: +; AVX2: # %bb.0: +; AVX2-NEXT: pushq %rbp +; AVX2-NEXT: movq %rsp, %rbp +; AVX2-NEXT: andq $-32, %rsp +; AVX2-NEXT: subq $96, %rsp +; AVX2-NEXT: vmovaps %ymm4, {{[0-9]+}}(%rsp) +; AVX2-NEXT: vmovaps %ymm3, (%rsp) +; AVX2-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2, %xmm3 +; AVX2-NEXT: vpmovzxwq {{.*#+}} ymm4 = xmm3[0],zero,zero,zero,xmm3[1],zero,zero,zero,xmm3[2],zero,zero,zero,xmm3[3],zero,zero,zero +; AVX2-NEXT: vpshufd {{.*#+}} xmm3 = xmm3[2,3,2,3] +; AVX2-NEXT: vpmovzxwq {{.*#+}} ymm3 = xmm3[0],zero,zero,zero,xmm3[1],zero,zero,zero,xmm3[2],zero,zero,zero,xmm3[3],zero,zero,zero +; AVX2-NEXT: vpaddq %ymm3, %ymm4, %ymm3 +; AVX2-NEXT: vextracti128 $1, %ymm3, %xmm4 +; AVX2-NEXT: vpaddq %xmm4, %xmm3, %xmm3 +; AVX2-NEXT: vpextrq $1, %xmm3, %rax +; AVX2-NEXT: vmovq %xmm3, %rcx +; AVX2-NEXT: addl %eax, %ecx +; AVX2-NEXT: andl $7, %ecx +; AVX2-NEXT: vmovsd {{.*#+}} xmm3 = mem[0],zero +; AVX2-NEXT: vmovlps %xmm0, (%rsp) +; AVX2-NEXT: vmovd %xmm2, %eax +; AVX2-NEXT: andl $1, %eax +; AVX2-NEXT: vmovhps %xmm0, (%rsp,%rax,8) +; AVX2-NEXT: vpextrw $1, %xmm2, %ecx +; AVX2-NEXT: andl $1, %ecx +; AVX2-NEXT: addq %rax, %rcx +; AVX2-NEXT: vextractf128 $1, %ymm0, %xmm0 +; AVX2-NEXT: vmovlps %xmm0, (%rsp,%rcx,8) +; AVX2-NEXT: vpextrw $2, %xmm2, %eax +; AVX2-NEXT: andl $1, %eax +; AVX2-NEXT: addq %rcx, %rax +; AVX2-NEXT: vmovhps %xmm0, (%rsp,%rax,8) +; AVX2-NEXT: vpextrw $3, %xmm2, %ecx +; AVX2-NEXT: andl $1, %ecx +; AVX2-NEXT: addq %rax, %rcx +; AVX2-NEXT: vpextrw $4, %xmm2, %eax +; AVX2-NEXT: andl $1, %eax +; AVX2-NEXT: addq %rcx, %rax +; AVX2-NEXT: # kill: def $ecx killed $ecx killed $rcx def $rcx +; AVX2-NEXT: andl $7, %ecx +; AVX2-NEXT: vmovlpd %xmm1, (%rsp,%rcx,8) +; AVX2-NEXT: vpextrw $5, %xmm2, %ecx +; AVX2-NEXT: andl $1, %ecx +; AVX2-NEXT: addq %rax, %rcx +; AVX2-NEXT: # kill: def $eax killed $eax killed $rax def $rax +; AVX2-NEXT: andl $7, %eax +; AVX2-NEXT: vmovhpd %xmm1, (%rsp,%rax,8) +; AVX2-NEXT: vpextrw 
$6, %xmm2, %edx +; AVX2-NEXT: andl $1, %edx +; AVX2-NEXT: addq %rcx, %rdx +; AVX2-NEXT: # kill: def $ecx killed $ecx killed $rcx def $rcx +; AVX2-NEXT: andl $7, %ecx +; AVX2-NEXT: vextractf128 $1, %ymm1, %xmm0 +; AVX2-NEXT: vmovlpd %xmm0, (%rsp,%rcx,8) +; AVX2-NEXT: vpextrw $7, %xmm2, %eax +; AVX2-NEXT: andl $1, %eax +; AVX2-NEXT: addq %rdx, %rax +; AVX2-NEXT: # kill: def $edx killed $edx killed $rdx def $rdx +; AVX2-NEXT: andl $7, %edx +; AVX2-NEXT: vmovhpd %xmm0, (%rsp,%rdx,8) +; AVX2-NEXT: cmpq $8, %rax +; AVX2-NEXT: jb .LBB11_2 +; AVX2-NEXT: # %bb.1: +; AVX2-NEXT: vshufpd {{.*#+}} xmm3 = xmm0[1,0] +; AVX2-NEXT: .LBB11_2: +; AVX2-NEXT: cmpq $7, %rax +; AVX2-NEXT: movl $7, %ecx +; AVX2-NEXT: cmovbq %rax, %rcx +; AVX2-NEXT: movl %ecx, %eax +; AVX2-NEXT: vmovsd %xmm3, (%rsp,%rax,8) +; AVX2-NEXT: vmovaps (%rsp), %ymm0 +; AVX2-NEXT: vmovaps {{[0-9]+}}(%rsp), %ymm1 +; AVX2-NEXT: movq %rbp, %rsp +; AVX2-NEXT: popq %rbp +; AVX2-NEXT: retq +; ; AVX512F-LABEL: test_compress_v8f64: ; AVX512F: # %bb.0: ; AVX512F-NEXT: vpmovsxwq %xmm1, %zmm1 @@ -633,43 +1082,257 @@ define <8 x double> @test_compress_v8f64(<8 x double> %vec, <8 x i1> %mask, <8 x ret <8 x double> %out } -define <16 x i8> @test_compress_v16i8(<16 x i8> %vec, <16 x i1> %mask, <16 x i8> %passthru) { -; AVX512F-LABEL: test_compress_v16i8: -; AVX512F: # %bb.0: -; AVX512F-NEXT: vpmovsxbd %xmm1, %zmm1 -; AVX512F-NEXT: vpslld $31, %zmm1, %zmm1 -; AVX512F-NEXT: vptestmd %zmm1, %zmm1, %k1 -; AVX512F-NEXT: vpmovzxbd {{.*#+}} zmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero,xmm0[8],zero,zero,zero,xmm0[9],zero,zero,zero,xmm0[10],zero,zero,zero,xmm0[11],zero,zero,zero,xmm0[12],zero,zero,zero,xmm0[13],zero,zero,zero,xmm0[14],zero,zero,zero,xmm0[15],zero,zero,zero -; AVX512F-NEXT: vpmovzxbd {{.*#+}} zmm1 = 
xmm2[0],zero,zero,zero,xmm2[1],zero,zero,zero,xmm2[2],zero,zero,zero,xmm2[3],zero,zero,zero,xmm2[4],zero,zero,zero,xmm2[5],zero,zero,zero,xmm2[6],zero,zero,zero,xmm2[7],zero,zero,zero,xmm2[8],zero,zero,zero,xmm2[9],zero,zero,zero,xmm2[10],zero,zero,zero,xmm2[11],zero,zero,zero,xmm2[12],zero,zero,zero,xmm2[13],zero,zero,zero,xmm2[14],zero,zero,zero,xmm2[15],zero,zero,zero -; AVX512F-NEXT: vpcompressd %zmm0, %zmm1 {%k1} -; AVX512F-NEXT: vpmovdb %zmm1, %xmm0 -; AVX512F-NEXT: vzeroupper -; AVX512F-NEXT: retq -; -; AVX512VL-LABEL: test_compress_v16i8: -; AVX512VL: # %bb.0: -; AVX512VL-NEXT: vpsllw $7, %xmm1, %xmm1 -; AVX512VL-NEXT: vpmovb2m %xmm1, %k1 -; AVX512VL-NEXT: vpcompressb %xmm0, %xmm2 {%k1} -; AVX512VL-NEXT: vmovdqa %xmm2, %xmm0 -; AVX512VL-NEXT: retq - %out = call <16 x i8> @llvm.experimental.vector.compress(<16 x i8> %vec, <16 x i1> %mask, <16 x i8> %passthru) - ret <16 x i8> %out -} - -define <8 x i16> @test_compress_v8i16(<8 x i16> %vec, <8 x i1> %mask, <8 x i16> %passthru) { -; AVX512F-LABEL: test_compress_v8i16: -; AVX512F: # %bb.0: -; AVX512F-NEXT: vpmovsxwq %xmm1, %zmm1 -; AVX512F-NEXT: vpsllq $63, %zmm1, %zmm1 -; AVX512F-NEXT: vptestmq %zmm1, %zmm1, %k1 -; AVX512F-NEXT: vpmovzxwq {{.*#+}} zmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero -; AVX512F-NEXT: vpmovzxwq {{.*#+}} zmm1 = xmm2[0],zero,zero,zero,xmm2[1],zero,zero,zero,xmm2[2],zero,zero,zero,xmm2[3],zero,zero,zero,xmm2[4],zero,zero,zero,xmm2[5],zero,zero,zero,xmm2[6],zero,zero,zero,xmm2[7],zero,zero,zero -; AVX512F-NEXT: vpcompressq %zmm0, %zmm1 {%k1} -; AVX512F-NEXT: vpmovqw %zmm1, %xmm0 -; AVX512F-NEXT: vzeroupper -; AVX512F-NEXT: retq -; +define <16 x i8> @test_compress_v16i8(<16 x i8> %vec, <16 x i1> %mask, <16 x i8> %passthru) nounwind { +; AVX2-LABEL: test_compress_v16i8: +; AVX2: # %bb.0: +; AVX2-NEXT: pushq %rbp +; AVX2-NEXT: pushq %r15 +; AVX2-NEXT: 
pushq %r14 +; AVX2-NEXT: pushq %r13 +; AVX2-NEXT: pushq %r12 +; AVX2-NEXT: pushq %rbx +; AVX2-NEXT: vpsllw $7, %xmm1, %xmm1 +; AVX2-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 +; AVX2-NEXT: vpxor %xmm3, %xmm3, %xmm3 +; AVX2-NEXT: vpcmpgtb %xmm1, %xmm3, %xmm1 +; AVX2-NEXT: vmovaps %xmm2, -{{[0-9]+}}(%rsp) +; AVX2-NEXT: vpextrb $1, %xmm1, %r11d +; AVX2-NEXT: vmovd %xmm1, %eax +; AVX2-NEXT: movzbl %al, %edx +; AVX2-NEXT: # kill: def $al killed $al killed $eax +; AVX2-NEXT: andb $1, %al +; AVX2-NEXT: subb %r11b, %al +; AVX2-NEXT: vpextrb $2, %xmm1, %esi +; AVX2-NEXT: subb %sil, %al +; AVX2-NEXT: vpextrb $3, %xmm1, %r13d +; AVX2-NEXT: subb %r13b, %al +; AVX2-NEXT: vpextrb $4, %xmm1, %r12d +; AVX2-NEXT: subb %r12b, %al +; AVX2-NEXT: vpextrb $5, %xmm1, %r15d +; AVX2-NEXT: subb %r15b, %al +; AVX2-NEXT: vpextrb $6, %xmm1, %r14d +; AVX2-NEXT: subb %r14b, %al +; AVX2-NEXT: vpextrb $7, %xmm1, %ebp +; AVX2-NEXT: subb %bpl, %al +; AVX2-NEXT: vpextrb $8, %xmm1, %ebx +; AVX2-NEXT: subb %bl, %al +; AVX2-NEXT: vpextrb $9, %xmm1, %r10d +; AVX2-NEXT: subb %r10b, %al +; AVX2-NEXT: vpextrb $10, %xmm1, %r9d +; AVX2-NEXT: subb %r9b, %al +; AVX2-NEXT: vpextrb $11, %xmm1, %r8d +; AVX2-NEXT: subb %r8b, %al +; AVX2-NEXT: vpextrb $12, %xmm1, %edi +; AVX2-NEXT: subb %dil, %al +; AVX2-NEXT: vpextrb $13, %xmm1, %ecx +; AVX2-NEXT: movl %ecx, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; AVX2-NEXT: subb %cl, %al +; AVX2-NEXT: vpextrb $14, %xmm1, %ecx +; AVX2-NEXT: movl %ecx, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; AVX2-NEXT: subb %cl, %al +; AVX2-NEXT: vpextrb $15, %xmm1, %ecx +; AVX2-NEXT: movl %ecx, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; AVX2-NEXT: subb %cl, %al +; AVX2-NEXT: movzbl %al, %eax +; AVX2-NEXT: andl $15, %eax +; AVX2-NEXT: movzbl -40(%rsp,%rax), %eax +; AVX2-NEXT: movl %eax, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; AVX2-NEXT: vpextrb $0, %xmm0, -{{[0-9]+}}(%rsp) +; AVX2-NEXT: andl $1, %edx +; AVX2-NEXT: vpextrb $1, %xmm0, -40(%rsp,%rdx) +; AVX2-NEXT: movzbl %r11b, 
%eax +; AVX2-NEXT: andl $1, %eax +; AVX2-NEXT: addq %rdx, %rax +; AVX2-NEXT: vpextrb $2, %xmm0, -40(%rsp,%rax) +; AVX2-NEXT: movzbl %sil, %ecx +; AVX2-NEXT: andl $1, %ecx +; AVX2-NEXT: addq %rax, %rcx +; AVX2-NEXT: vpextrb $3, %xmm0, -40(%rsp,%rcx) +; AVX2-NEXT: movzbl %r13b, %eax +; AVX2-NEXT: andl $1, %eax +; AVX2-NEXT: addq %rcx, %rax +; AVX2-NEXT: vpextrb $4, %xmm0, -40(%rsp,%rax) +; AVX2-NEXT: movzbl %r12b, %ecx +; AVX2-NEXT: andl $1, %ecx +; AVX2-NEXT: addq %rax, %rcx +; AVX2-NEXT: movzbl %r15b, %eax +; AVX2-NEXT: andl $1, %eax +; AVX2-NEXT: addq %rcx, %rax +; AVX2-NEXT: # kill: def $ecx killed $ecx killed $rcx def $rcx +; AVX2-NEXT: andl $15, %ecx +; AVX2-NEXT: vpextrb $5, %xmm0, -40(%rsp,%rcx) +; AVX2-NEXT: movzbl %r14b, %ecx +; AVX2-NEXT: andl $1, %ecx +; AVX2-NEXT: addq %rax, %rcx +; AVX2-NEXT: # kill: def $eax killed $eax killed $rax def $rax +; AVX2-NEXT: andl $15, %eax +; AVX2-NEXT: vpextrb $6, %xmm0, -40(%rsp,%rax) +; AVX2-NEXT: movzbl %bpl, %eax +; AVX2-NEXT: andl $1, %eax +; AVX2-NEXT: addq %rcx, %rax +; AVX2-NEXT: # kill: def $ecx killed $ecx killed $rcx def $rcx +; AVX2-NEXT: andl $15, %ecx +; AVX2-NEXT: vpextrb $7, %xmm0, -40(%rsp,%rcx) +; AVX2-NEXT: movzbl %bl, %ecx +; AVX2-NEXT: andl $1, %ecx +; AVX2-NEXT: addq %rax, %rcx +; AVX2-NEXT: # kill: def $eax killed $eax killed $rax def $rax +; AVX2-NEXT: andl $15, %eax +; AVX2-NEXT: vpextrb $8, %xmm0, -40(%rsp,%rax) +; AVX2-NEXT: movzbl %r10b, %eax +; AVX2-NEXT: andl $1, %eax +; AVX2-NEXT: addq %rcx, %rax +; AVX2-NEXT: # kill: def $ecx killed $ecx killed $rcx def $rcx +; AVX2-NEXT: andl $15, %ecx +; AVX2-NEXT: vpextrb $9, %xmm0, -40(%rsp,%rcx) +; AVX2-NEXT: movzbl %r9b, %ecx +; AVX2-NEXT: andl $1, %ecx +; AVX2-NEXT: addq %rax, %rcx +; AVX2-NEXT: # kill: def $eax killed $eax killed $rax def $rax +; AVX2-NEXT: andl $15, %eax +; AVX2-NEXT: vpextrb $10, %xmm0, -40(%rsp,%rax) +; AVX2-NEXT: movzbl %r8b, %eax +; AVX2-NEXT: andl $1, %eax +; AVX2-NEXT: addq %rcx, %rax +; AVX2-NEXT: # kill: def $ecx killed 
$ecx killed $rcx def $rcx +; AVX2-NEXT: andl $15, %ecx +; AVX2-NEXT: vpextrb $11, %xmm0, -40(%rsp,%rcx) +; AVX2-NEXT: movzbl %dil, %ecx +; AVX2-NEXT: andl $1, %ecx +; AVX2-NEXT: addq %rax, %rcx +; AVX2-NEXT: # kill: def $eax killed $eax killed $rax def $rax +; AVX2-NEXT: andl $15, %eax +; AVX2-NEXT: vpextrb $12, %xmm0, -40(%rsp,%rax) +; AVX2-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload +; AVX2-NEXT: andl $1, %eax +; AVX2-NEXT: addq %rcx, %rax +; AVX2-NEXT: # kill: def $ecx killed $ecx killed $rcx def $rcx +; AVX2-NEXT: andl $15, %ecx +; AVX2-NEXT: vpextrb $13, %xmm0, -40(%rsp,%rcx) +; AVX2-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %ecx # 1-byte Folded Reload +; AVX2-NEXT: andl $1, %ecx +; AVX2-NEXT: addq %rax, %rcx +; AVX2-NEXT: # kill: def $eax killed $eax killed $rax def $rax +; AVX2-NEXT: andl $15, %eax +; AVX2-NEXT: vpextrb $14, %xmm0, -40(%rsp,%rax) +; AVX2-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload +; AVX2-NEXT: andl $1, %eax +; AVX2-NEXT: addq %rcx, %rax +; AVX2-NEXT: # kill: def $ecx killed $ecx killed $rcx def $rcx +; AVX2-NEXT: andl $15, %ecx +; AVX2-NEXT: vpextrb $15, %xmm0, -40(%rsp,%rcx) +; AVX2-NEXT: cmpq $15, %rax +; AVX2-NEXT: movl $15, %ecx +; AVX2-NEXT: cmovbq %rax, %rcx +; AVX2-NEXT: vpextrb $15, %xmm0, %eax +; AVX2-NEXT: cmovbel {{[-0-9]+}}(%r{{[sb]}}p), %eax # 4-byte Folded Reload +; AVX2-NEXT: movb %al, -40(%rsp,%rcx) +; AVX2-NEXT: vmovaps -{{[0-9]+}}(%rsp), %xmm0 +; AVX2-NEXT: popq %rbx +; AVX2-NEXT: popq %r12 +; AVX2-NEXT: popq %r13 +; AVX2-NEXT: popq %r14 +; AVX2-NEXT: popq %r15 +; AVX2-NEXT: popq %rbp +; AVX2-NEXT: retq +; +; AVX512F-LABEL: test_compress_v16i8: +; AVX512F: # %bb.0: +; AVX512F-NEXT: vpmovsxbd %xmm1, %zmm1 +; AVX512F-NEXT: vpslld $31, %zmm1, %zmm1 +; AVX512F-NEXT: vptestmd %zmm1, %zmm1, %k1 +; AVX512F-NEXT: vpmovzxbd {{.*#+}} zmm0 = 
xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero,xmm0[8],zero,zero,zero,xmm0[9],zero,zero,zero,xmm0[10],zero,zero,zero,xmm0[11],zero,zero,zero,xmm0[12],zero,zero,zero,xmm0[13],zero,zero,zero,xmm0[14],zero,zero,zero,xmm0[15],zero,zero,zero +; AVX512F-NEXT: vpmovzxbd {{.*#+}} zmm1 = xmm2[0],zero,zero,zero,xmm2[1],zero,zero,zero,xmm2[2],zero,zero,zero,xmm2[3],zero,zero,zero,xmm2[4],zero,zero,zero,xmm2[5],zero,zero,zero,xmm2[6],zero,zero,zero,xmm2[7],zero,zero,zero,xmm2[8],zero,zero,zero,xmm2[9],zero,zero,zero,xmm2[10],zero,zero,zero,xmm2[11],zero,zero,zero,xmm2[12],zero,zero,zero,xmm2[13],zero,zero,zero,xmm2[14],zero,zero,zero,xmm2[15],zero,zero,zero +; AVX512F-NEXT: vpcompressd %zmm0, %zmm1 {%k1} +; AVX512F-NEXT: vpmovdb %zmm1, %xmm0 +; AVX512F-NEXT: vzeroupper +; AVX512F-NEXT: retq +; +; AVX512VL-LABEL: test_compress_v16i8: +; AVX512VL: # %bb.0: +; AVX512VL-NEXT: vpsllw $7, %xmm1, %xmm1 +; AVX512VL-NEXT: vpmovb2m %xmm1, %k1 +; AVX512VL-NEXT: vpcompressb %xmm0, %xmm2 {%k1} +; AVX512VL-NEXT: vmovdqa %xmm2, %xmm0 +; AVX512VL-NEXT: retq + %out = call <16 x i8> @llvm.experimental.vector.compress(<16 x i8> %vec, <16 x i1> %mask, <16 x i8> %passthru) + ret <16 x i8> %out +} + +define <8 x i16> @test_compress_v8i16(<8 x i16> %vec, <8 x i1> %mask, <8 x i16> %passthru) nounwind { +; AVX2-LABEL: test_compress_v8i16: +; AVX2: # %bb.0: +; AVX2-NEXT: pushq %rbx +; AVX2-NEXT: vpsllw $15, %xmm1, %xmm1 +; AVX2-NEXT: vpsraw $15, %xmm1, %xmm1 +; AVX2-NEXT: vmovaps %xmm2, -{{[0-9]+}}(%rsp) +; AVX2-NEXT: vpextrw $1, %xmm1, %eax +; AVX2-NEXT: andl $1, %eax +; AVX2-NEXT: vmovd %xmm1, %ecx +; AVX2-NEXT: andl $1, %ecx +; AVX2-NEXT: leal (%rcx,%rax), %esi +; AVX2-NEXT: vpextrw $2, %xmm1, %edi +; AVX2-NEXT: andl $1, %edi +; AVX2-NEXT: vpextrw $3, %xmm1, %edx +; AVX2-NEXT: andl $1, %edx +; AVX2-NEXT: leal (%rdi,%rdx), %r10d +; AVX2-NEXT: addl %esi, %r10d +; 
AVX2-NEXT: vpextrw $4, %xmm1, %r9d +; AVX2-NEXT: andl $1, %r9d +; AVX2-NEXT: vpextrw $5, %xmm1, %esi +; AVX2-NEXT: andl $1, %esi +; AVX2-NEXT: leal (%r9,%rsi), %r11d +; AVX2-NEXT: vpextrw $6, %xmm1, %r8d +; AVX2-NEXT: andl $1, %r8d +; AVX2-NEXT: addl %r8d, %r11d +; AVX2-NEXT: addl %r10d, %r11d +; AVX2-NEXT: vpextrw $7, %xmm1, %r10d +; AVX2-NEXT: andl $1, %r10d +; AVX2-NEXT: addl %r10d, %r11d +; AVX2-NEXT: andl $7, %r11d +; AVX2-NEXT: addq %rcx, %rax +; AVX2-NEXT: addq %rax, %rdi +; AVX2-NEXT: addq %rdi, %rdx +; AVX2-NEXT: addq %rdx, %r9 +; AVX2-NEXT: addq %r9, %rsi +; AVX2-NEXT: addq %rsi, %r8 +; AVX2-NEXT: addq %r8, %r10 +; AVX2-NEXT: vpextrw $7, %xmm0, %ebx +; AVX2-NEXT: cmpq $8, %r10 +; AVX2-NEXT: cmovbw -16(%rsp,%r11,2), %bx +; AVX2-NEXT: vpextrw $0, %xmm0, -{{[0-9]+}}(%rsp) +; AVX2-NEXT: vpextrw $1, %xmm0, -16(%rsp,%rcx,2) +; AVX2-NEXT: vpextrw $2, %xmm0, -16(%rsp,%rax,2) +; AVX2-NEXT: vpextrw $3, %xmm0, -16(%rsp,%rdi,2) +; AVX2-NEXT: andl $7, %edx +; AVX2-NEXT: vpextrw $4, %xmm0, -16(%rsp,%rdx,2) +; AVX2-NEXT: andl $7, %r9d +; AVX2-NEXT: vpextrw $5, %xmm0, -16(%rsp,%r9,2) +; AVX2-NEXT: andl $7, %esi +; AVX2-NEXT: vpextrw $6, %xmm0, -16(%rsp,%rsi,2) +; AVX2-NEXT: andl $7, %r8d +; AVX2-NEXT: vpextrw $7, %xmm0, -16(%rsp,%r8,2) +; AVX2-NEXT: cmpq $7, %r10 +; AVX2-NEXT: movl $7, %eax +; AVX2-NEXT: cmovbq %r10, %rax +; AVX2-NEXT: movl %eax, %eax +; AVX2-NEXT: movw %bx, -16(%rsp,%rax,2) +; AVX2-NEXT: vmovaps -{{[0-9]+}}(%rsp), %xmm0 +; AVX2-NEXT: popq %rbx +; AVX2-NEXT: retq +; +; AVX512F-LABEL: test_compress_v8i16: +; AVX512F: # %bb.0: +; AVX512F-NEXT: vpmovsxwq %xmm1, %zmm1 +; AVX512F-NEXT: vpsllq $63, %zmm1, %zmm1 +; AVX512F-NEXT: vptestmq %zmm1, %zmm1, %k1 +; AVX512F-NEXT: vpmovzxwq {{.*#+}} zmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero +; AVX512F-NEXT: vpmovzxwq {{.*#+}} zmm1 = 
xmm2[0],zero,zero,zero,xmm2[1],zero,zero,zero,xmm2[2],zero,zero,zero,xmm2[3],zero,zero,zero,xmm2[4],zero,zero,zero,xmm2[5],zero,zero,zero,xmm2[6],zero,zero,zero,xmm2[7],zero,zero,zero +; AVX512F-NEXT: vpcompressq %zmm0, %zmm1 {%k1} +; AVX512F-NEXT: vpmovqw %zmm1, %xmm0 +; AVX512F-NEXT: vzeroupper +; AVX512F-NEXT: retq +; ; AVX512VL-LABEL: test_compress_v8i16: ; AVX512VL: # %bb.0: ; AVX512VL-NEXT: vpsllw $15, %xmm1, %xmm1 @@ -681,14 +1344,252 @@ define <8 x i16> @test_compress_v8i16(<8 x i16> %vec, <8 x i1> %mask, <8 x i16> ret <8 x i16> %out } -define <32 x i8> @test_compress_v32i8(<32 x i8> %vec, <32 x i1> %mask, <32 x i8> %passthru) { +define <32 x i8> @test_compress_v32i8(<32 x i8> %vec, <32 x i1> %mask, <32 x i8> %passthru) nounwind { +; AVX2-LABEL: test_compress_v32i8: +; AVX2: # %bb.0: +; AVX2-NEXT: pushq %rbp +; AVX2-NEXT: movq %rsp, %rbp +; AVX2-NEXT: andq $-32, %rsp +; AVX2-NEXT: subq $64, %rsp +; AVX2-NEXT: vpsllw $7, %ymm1, %ymm1 +; AVX2-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm1 +; AVX2-NEXT: vpxor %xmm3, %xmm3, %xmm3 +; AVX2-NEXT: vpcmpgtb %ymm1, %ymm3, %ymm3 +; AVX2-NEXT: vmovaps %ymm2, (%rsp) +; AVX2-NEXT: vextracti128 $1, %ymm3, %xmm1 +; AVX2-NEXT: vpbroadcastb {{.*#+}} xmm2 = [1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1] +; AVX2-NEXT: vpand %xmm2, %xmm1, %xmm4 +; AVX2-NEXT: vpand %xmm2, %xmm3, %xmm2 +; AVX2-NEXT: vpaddb %xmm4, %xmm2, %xmm2 +; AVX2-NEXT: vpextrb $1, %xmm2, %eax +; AVX2-NEXT: vmovd %xmm2, %ecx +; AVX2-NEXT: addb %al, %cl +; AVX2-NEXT: vpextrb $2, %xmm2, %eax +; AVX2-NEXT: vpextrb $3, %xmm2, %edx +; AVX2-NEXT: addb %al, %dl +; AVX2-NEXT: addb %cl, %dl +; AVX2-NEXT: vpextrb $4, %xmm2, %eax +; AVX2-NEXT: vpextrb $5, %xmm2, %ecx +; AVX2-NEXT: addb %al, %cl +; AVX2-NEXT: vpextrb $6, %xmm2, %eax +; AVX2-NEXT: addb %cl, %al +; AVX2-NEXT: addb %dl, %al +; AVX2-NEXT: vpextrb $7, %xmm2, %ecx +; AVX2-NEXT: vpextrb $8, %xmm2, %edx +; AVX2-NEXT: addb %cl, %dl +; AVX2-NEXT: vpextrb $9, %xmm2, %ecx +; AVX2-NEXT: addb %dl, %cl +; AVX2-NEXT: 
vpextrb $10, %xmm2, %edx +; AVX2-NEXT: addb %cl, %dl +; AVX2-NEXT: addb %al, %dl +; AVX2-NEXT: vpextrb $11, %xmm2, %eax +; AVX2-NEXT: vpextrb $12, %xmm2, %ecx +; AVX2-NEXT: addb %al, %cl +; AVX2-NEXT: vpextrb $13, %xmm2, %eax +; AVX2-NEXT: addb %cl, %al +; AVX2-NEXT: vpextrb $14, %xmm2, %ecx +; AVX2-NEXT: addb %al, %cl +; AVX2-NEXT: vpextrb $15, %xmm2, %eax +; AVX2-NEXT: addb %cl, %al +; AVX2-NEXT: addb %dl, %al +; AVX2-NEXT: movzbl %al, %eax +; AVX2-NEXT: andl $31, %eax +; AVX2-NEXT: movzbl (%rsp,%rax), %eax +; AVX2-NEXT: vpextrb $0, %xmm0, (%rsp) +; AVX2-NEXT: vmovd %xmm3, %ecx +; AVX2-NEXT: andl $1, %ecx +; AVX2-NEXT: vpextrb $1, %xmm0, (%rsp,%rcx) +; AVX2-NEXT: vpextrb $1, %xmm3, %edx +; AVX2-NEXT: andl $1, %edx +; AVX2-NEXT: addq %rcx, %rdx +; AVX2-NEXT: vpextrb $2, %xmm0, (%rsp,%rdx) +; AVX2-NEXT: vpextrb $2, %xmm3, %ecx +; AVX2-NEXT: andl $1, %ecx +; AVX2-NEXT: addq %rdx, %rcx +; AVX2-NEXT: vpextrb $3, %xmm0, (%rsp,%rcx) +; AVX2-NEXT: vpextrb $3, %xmm3, %edx +; AVX2-NEXT: andl $1, %edx +; AVX2-NEXT: addq %rcx, %rdx +; AVX2-NEXT: vpextrb $4, %xmm0, (%rsp,%rdx) +; AVX2-NEXT: vpextrb $4, %xmm3, %ecx +; AVX2-NEXT: andl $1, %ecx +; AVX2-NEXT: addq %rdx, %rcx +; AVX2-NEXT: vpextrb $5, %xmm0, (%rsp,%rcx) +; AVX2-NEXT: vpextrb $5, %xmm3, %edx +; AVX2-NEXT: andl $1, %edx +; AVX2-NEXT: addq %rcx, %rdx +; AVX2-NEXT: vpextrb $6, %xmm3, %ecx +; AVX2-NEXT: andl $1, %ecx +; AVX2-NEXT: addq %rdx, %rcx +; AVX2-NEXT: # kill: def $edx killed $edx killed $rdx def $rdx +; AVX2-NEXT: andl $31, %edx +; AVX2-NEXT: vpextrb $6, %xmm0, (%rsp,%rdx) +; AVX2-NEXT: vpextrb $7, %xmm3, %edx +; AVX2-NEXT: andl $1, %edx +; AVX2-NEXT: addq %rcx, %rdx +; AVX2-NEXT: # kill: def $ecx killed $ecx killed $rcx def $rcx +; AVX2-NEXT: andl $31, %ecx +; AVX2-NEXT: vpextrb $7, %xmm0, (%rsp,%rcx) +; AVX2-NEXT: vpextrb $8, %xmm3, %ecx +; AVX2-NEXT: andl $1, %ecx +; AVX2-NEXT: addq %rdx, %rcx +; AVX2-NEXT: # kill: def $edx killed $edx killed $rdx def $rdx +; AVX2-NEXT: andl $31, %edx +; AVX2-NEXT: vpextrb 
$8, %xmm0, (%rsp,%rdx) +; AVX2-NEXT: vpextrb $9, %xmm3, %edx +; AVX2-NEXT: andl $1, %edx +; AVX2-NEXT: addq %rcx, %rdx +; AVX2-NEXT: # kill: def $ecx killed $ecx killed $rcx def $rcx +; AVX2-NEXT: andl $31, %ecx +; AVX2-NEXT: vpextrb $9, %xmm0, (%rsp,%rcx) +; AVX2-NEXT: vpextrb $10, %xmm3, %ecx +; AVX2-NEXT: andl $1, %ecx +; AVX2-NEXT: addq %rdx, %rcx +; AVX2-NEXT: # kill: def $edx killed $edx killed $rdx def $rdx +; AVX2-NEXT: andl $31, %edx +; AVX2-NEXT: vpextrb $10, %xmm0, (%rsp,%rdx) +; AVX2-NEXT: vpextrb $11, %xmm3, %edx +; AVX2-NEXT: andl $1, %edx +; AVX2-NEXT: addq %rcx, %rdx +; AVX2-NEXT: # kill: def $ecx killed $ecx killed $rcx def $rcx +; AVX2-NEXT: andl $31, %ecx +; AVX2-NEXT: vpextrb $11, %xmm0, (%rsp,%rcx) +; AVX2-NEXT: vpextrb $12, %xmm3, %ecx +; AVX2-NEXT: andl $1, %ecx +; AVX2-NEXT: addq %rdx, %rcx +; AVX2-NEXT: # kill: def $edx killed $edx killed $rdx def $rdx +; AVX2-NEXT: andl $31, %edx +; AVX2-NEXT: vpextrb $12, %xmm0, (%rsp,%rdx) +; AVX2-NEXT: vpextrb $13, %xmm3, %edx +; AVX2-NEXT: andl $1, %edx +; AVX2-NEXT: addq %rcx, %rdx +; AVX2-NEXT: # kill: def $ecx killed $ecx killed $rcx def $rcx +; AVX2-NEXT: andl $31, %ecx +; AVX2-NEXT: vpextrb $13, %xmm0, (%rsp,%rcx) +; AVX2-NEXT: vpextrb $14, %xmm3, %ecx +; AVX2-NEXT: andl $1, %ecx +; AVX2-NEXT: addq %rdx, %rcx +; AVX2-NEXT: # kill: def $edx killed $edx killed $rdx def $rdx +; AVX2-NEXT: andl $31, %edx +; AVX2-NEXT: vpextrb $14, %xmm0, (%rsp,%rdx) +; AVX2-NEXT: vpextrb $15, %xmm3, %edx +; AVX2-NEXT: andl $1, %edx +; AVX2-NEXT: addq %rcx, %rdx +; AVX2-NEXT: # kill: def $ecx killed $ecx killed $rcx def $rcx +; AVX2-NEXT: andl $31, %ecx +; AVX2-NEXT: vpextrb $15, %xmm0, (%rsp,%rcx) +; AVX2-NEXT: vmovd %xmm1, %ecx +; AVX2-NEXT: andl $1, %ecx +; AVX2-NEXT: addq %rdx, %rcx +; AVX2-NEXT: # kill: def $edx killed $edx killed $rdx def $rdx +; AVX2-NEXT: andl $31, %edx +; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm0 +; AVX2-NEXT: vpextrb $0, %xmm0, (%rsp,%rdx) +; AVX2-NEXT: vpextrb $1, %xmm1, %edx +; AVX2-NEXT: 
andl $1, %edx +; AVX2-NEXT: addq %rcx, %rdx +; AVX2-NEXT: # kill: def $ecx killed $ecx killed $rcx def $rcx +; AVX2-NEXT: andl $31, %ecx +; AVX2-NEXT: vpextrb $1, %xmm0, (%rsp,%rcx) +; AVX2-NEXT: vpextrb $2, %xmm1, %ecx +; AVX2-NEXT: andl $1, %ecx +; AVX2-NEXT: addq %rdx, %rcx +; AVX2-NEXT: # kill: def $edx killed $edx killed $rdx def $rdx +; AVX2-NEXT: andl $31, %edx +; AVX2-NEXT: vpextrb $2, %xmm0, (%rsp,%rdx) +; AVX2-NEXT: vpextrb $3, %xmm1, %edx +; AVX2-NEXT: andl $1, %edx +; AVX2-NEXT: addq %rcx, %rdx +; AVX2-NEXT: # kill: def $ecx killed $ecx killed $rcx def $rcx +; AVX2-NEXT: andl $31, %ecx +; AVX2-NEXT: vpextrb $3, %xmm0, (%rsp,%rcx) +; AVX2-NEXT: vpextrb $4, %xmm1, %ecx +; AVX2-NEXT: andl $1, %ecx +; AVX2-NEXT: addq %rdx, %rcx +; AVX2-NEXT: # kill: def $edx killed $edx killed $rdx def $rdx +; AVX2-NEXT: andl $31, %edx +; AVX2-NEXT: vpextrb $4, %xmm0, (%rsp,%rdx) +; AVX2-NEXT: vpextrb $5, %xmm1, %edx +; AVX2-NEXT: andl $1, %edx +; AVX2-NEXT: addq %rcx, %rdx +; AVX2-NEXT: # kill: def $ecx killed $ecx killed $rcx def $rcx +; AVX2-NEXT: andl $31, %ecx +; AVX2-NEXT: vpextrb $5, %xmm0, (%rsp,%rcx) +; AVX2-NEXT: vpextrb $6, %xmm1, %ecx +; AVX2-NEXT: andl $1, %ecx +; AVX2-NEXT: addq %rdx, %rcx +; AVX2-NEXT: # kill: def $edx killed $edx killed $rdx def $rdx +; AVX2-NEXT: andl $31, %edx +; AVX2-NEXT: vpextrb $6, %xmm0, (%rsp,%rdx) +; AVX2-NEXT: vpextrb $7, %xmm1, %edx +; AVX2-NEXT: andl $1, %edx +; AVX2-NEXT: addq %rcx, %rdx +; AVX2-NEXT: # kill: def $ecx killed $ecx killed $rcx def $rcx +; AVX2-NEXT: andl $31, %ecx +; AVX2-NEXT: vpextrb $7, %xmm0, (%rsp,%rcx) +; AVX2-NEXT: vpextrb $8, %xmm1, %ecx +; AVX2-NEXT: andl $1, %ecx +; AVX2-NEXT: addq %rdx, %rcx +; AVX2-NEXT: # kill: def $edx killed $edx killed $rdx def $rdx +; AVX2-NEXT: andl $31, %edx +; AVX2-NEXT: vpextrb $8, %xmm0, (%rsp,%rdx) +; AVX2-NEXT: vpextrb $9, %xmm1, %edx +; AVX2-NEXT: andl $1, %edx +; AVX2-NEXT: addq %rcx, %rdx +; AVX2-NEXT: # kill: def $ecx killed $ecx killed $rcx def $rcx +; AVX2-NEXT: andl 
$31, %ecx +; AVX2-NEXT: vpextrb $9, %xmm0, (%rsp,%rcx) +; AVX2-NEXT: vpextrb $10, %xmm1, %ecx +; AVX2-NEXT: andl $1, %ecx +; AVX2-NEXT: addq %rdx, %rcx +; AVX2-NEXT: # kill: def $edx killed $edx killed $rdx def $rdx +; AVX2-NEXT: andl $31, %edx +; AVX2-NEXT: vpextrb $10, %xmm0, (%rsp,%rdx) +; AVX2-NEXT: vpextrb $11, %xmm1, %edx +; AVX2-NEXT: andl $1, %edx +; AVX2-NEXT: addq %rcx, %rdx +; AVX2-NEXT: # kill: def $ecx killed $ecx killed $rcx def $rcx +; AVX2-NEXT: andl $31, %ecx +; AVX2-NEXT: vpextrb $11, %xmm0, (%rsp,%rcx) +; AVX2-NEXT: vpextrb $12, %xmm1, %ecx +; AVX2-NEXT: andl $1, %ecx +; AVX2-NEXT: addq %rdx, %rcx +; AVX2-NEXT: # kill: def $edx killed $edx killed $rdx def $rdx +; AVX2-NEXT: andl $31, %edx +; AVX2-NEXT: vpextrb $12, %xmm0, (%rsp,%rdx) +; AVX2-NEXT: vpextrb $13, %xmm1, %edx +; AVX2-NEXT: andl $1, %edx +; AVX2-NEXT: addq %rcx, %rdx +; AVX2-NEXT: # kill: def $ecx killed $ecx killed $rcx def $rcx +; AVX2-NEXT: andl $31, %ecx +; AVX2-NEXT: vpextrb $13, %xmm0, (%rsp,%rcx) +; AVX2-NEXT: vpextrb $14, %xmm1, %ecx +; AVX2-NEXT: andl $1, %ecx +; AVX2-NEXT: addq %rdx, %rcx +; AVX2-NEXT: # kill: def $edx killed $edx killed $rdx def $rdx +; AVX2-NEXT: andl $31, %edx +; AVX2-NEXT: vpextrb $14, %xmm0, (%rsp,%rdx) +; AVX2-NEXT: vpextrb $15, %xmm1, %edx +; AVX2-NEXT: andl $1, %edx +; AVX2-NEXT: addq %rcx, %rdx +; AVX2-NEXT: # kill: def $ecx killed $ecx killed $rcx def $rcx +; AVX2-NEXT: andl $31, %ecx +; AVX2-NEXT: vpextrb $15, %xmm0, (%rsp,%rcx) +; AVX2-NEXT: cmpq $31, %rdx +; AVX2-NEXT: movl $31, %ecx +; AVX2-NEXT: cmovbq %rdx, %rcx +; AVX2-NEXT: vpextrb $15, %xmm0, %edx +; AVX2-NEXT: cmovbel %eax, %edx +; AVX2-NEXT: movb %dl, (%rsp,%rcx) +; AVX2-NEXT: vmovaps (%rsp), %ymm0 +; AVX2-NEXT: movq %rbp, %rsp +; AVX2-NEXT: popq %rbp +; AVX2-NEXT: retq +; ; AVX512F-LABEL: test_compress_v32i8: ; AVX512F: # %bb.0: ; AVX512F-NEXT: pushq %rbp -; AVX512F-NEXT: .cfi_def_cfa_offset 16 -; AVX512F-NEXT: .cfi_offset %rbp, -16 ; AVX512F-NEXT: movq %rsp, %rbp -; AVX512F-NEXT: 
.cfi_def_cfa_register %rbp ; AVX512F-NEXT: andq $-32, %rsp ; AVX512F-NEXT: subq $64, %rsp ; AVX512F-NEXT: vextracti128 $1, %ymm1, %xmm3 @@ -719,7 +1620,6 @@ define <32 x i8> @test_compress_v32i8(<32 x i8> %vec, <32 x i1> %mask, <32 x i8> ; AVX512F-NEXT: vpblendvb %ymm0, (%rsp), %ymm2, %ymm0 ; AVX512F-NEXT: movq %rbp, %rsp ; AVX512F-NEXT: popq %rbp -; AVX512F-NEXT: .cfi_def_cfa %rsp, 8 ; AVX512F-NEXT: retq ; ; AVX512VL-LABEL: test_compress_v32i8: @@ -733,7 +1633,147 @@ define <32 x i8> @test_compress_v32i8(<32 x i8> %vec, <32 x i1> %mask, <32 x i8> ret <32 x i8> %out } -define <16 x i16> @test_compress_v16i16(<16 x i16> %vec, <16 x i1> %mask, <16 x i16> %passthru) { +define <16 x i16> @test_compress_v16i16(<16 x i16> %vec, <16 x i1> %mask, <16 x i16> %passthru) nounwind { +; AVX2-LABEL: test_compress_v16i16: +; AVX2: # %bb.0: +; AVX2-NEXT: pushq %rbp +; AVX2-NEXT: movq %rsp, %rbp +; AVX2-NEXT: pushq %r15 +; AVX2-NEXT: pushq %r14 +; AVX2-NEXT: pushq %r13 +; AVX2-NEXT: pushq %r12 +; AVX2-NEXT: pushq %rbx +; AVX2-NEXT: andq $-32, %rsp +; AVX2-NEXT: subq $96, %rsp +; AVX2-NEXT: vpmovzxbw {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero,xmm1[8],zero,xmm1[9],zero,xmm1[10],zero,xmm1[11],zero,xmm1[12],zero,xmm1[13],zero,xmm1[14],zero,xmm1[15],zero +; AVX2-NEXT: vpsllw $15, %ymm1, %ymm3 +; AVX2-NEXT: vpsraw $15, %ymm3, %ymm1 +; AVX2-NEXT: vmovaps %ymm2, (%rsp) +; AVX2-NEXT: vpsrlw $15, %ymm3, %ymm2 +; AVX2-NEXT: vextracti128 $1, %ymm2, %xmm3 +; AVX2-NEXT: vpaddw %xmm3, %xmm2, %xmm2 +; AVX2-NEXT: vpextrw $1, %xmm2, %eax +; AVX2-NEXT: vmovd %xmm2, %ecx +; AVX2-NEXT: addl %eax, %ecx +; AVX2-NEXT: vpextrw $2, %xmm2, %eax +; AVX2-NEXT: vpextrw $3, %xmm2, %edx +; AVX2-NEXT: addl %eax, %edx +; AVX2-NEXT: addl %ecx, %edx +; AVX2-NEXT: vpextrw $4, %xmm2, %eax +; AVX2-NEXT: vpextrw $5, %xmm2, %ecx +; AVX2-NEXT: addl %eax, %ecx +; AVX2-NEXT: vpextrw $6, %xmm2, %eax +; AVX2-NEXT: addl %ecx, %eax +; AVX2-NEXT: addl 
%edx, %eax +; AVX2-NEXT: vpextrw $7, %xmm2, %ecx +; AVX2-NEXT: addl %eax, %ecx +; AVX2-NEXT: andl $15, %ecx +; AVX2-NEXT: movq %rcx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; AVX2-NEXT: vpextrw $1, %xmm1, %eax +; AVX2-NEXT: andl $1, %eax +; AVX2-NEXT: vmovd %xmm1, %ecx +; AVX2-NEXT: andl $1, %ecx +; AVX2-NEXT: movq %rcx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; AVX2-NEXT: addq %rcx, %rax +; AVX2-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; AVX2-NEXT: vpextrw $2, %xmm1, %ecx +; AVX2-NEXT: andl $1, %ecx +; AVX2-NEXT: addq %rax, %rcx +; AVX2-NEXT: movq %rcx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; AVX2-NEXT: vpextrw $3, %xmm1, %eax +; AVX2-NEXT: andl $1, %eax +; AVX2-NEXT: addq %rcx, %rax +; AVX2-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; AVX2-NEXT: vpextrw $4, %xmm1, %r8d +; AVX2-NEXT: andl $1, %r8d +; AVX2-NEXT: addq %rax, %r8 +; AVX2-NEXT: vpextrw $5, %xmm1, %r9d +; AVX2-NEXT: andl $1, %r9d +; AVX2-NEXT: addq %r8, %r9 +; AVX2-NEXT: vpextrw $6, %xmm1, %r10d +; AVX2-NEXT: andl $1, %r10d +; AVX2-NEXT: addq %r9, %r10 +; AVX2-NEXT: vpextrw $7, %xmm1, %r11d +; AVX2-NEXT: andl $1, %r11d +; AVX2-NEXT: addq %r10, %r11 +; AVX2-NEXT: vextracti128 $1, %ymm1, %xmm1 +; AVX2-NEXT: vmovd %xmm1, %ebx +; AVX2-NEXT: andl $1, %ebx +; AVX2-NEXT: addq %r11, %rbx +; AVX2-NEXT: vpextrw $1, %xmm1, %r14d +; AVX2-NEXT: andl $1, %r14d +; AVX2-NEXT: addq %rbx, %r14 +; AVX2-NEXT: vpextrw $2, %xmm1, %r15d +; AVX2-NEXT: andl $1, %r15d +; AVX2-NEXT: addq %r14, %r15 +; AVX2-NEXT: vpextrw $3, %xmm1, %r12d +; AVX2-NEXT: andl $1, %r12d +; AVX2-NEXT: addq %r15, %r12 +; AVX2-NEXT: vpextrw $4, %xmm1, %r13d +; AVX2-NEXT: andl $1, %r13d +; AVX2-NEXT: addq %r12, %r13 +; AVX2-NEXT: vpextrw $5, %xmm1, %edx +; AVX2-NEXT: andl $1, %edx +; AVX2-NEXT: addq %r13, %rdx +; AVX2-NEXT: vpextrw $6, %xmm1, %ecx +; AVX2-NEXT: andl $1, %ecx +; AVX2-NEXT: addq %rdx, %rcx +; AVX2-NEXT: vpextrw $7, %xmm1, %edi +; AVX2-NEXT: andl $1, %edi +; AVX2-NEXT: addq %rcx, %rdi +; AVX2-NEXT: 
vextracti128 $1, %ymm0, %xmm1 +; AVX2-NEXT: cmpq $16, %rdi +; AVX2-NEXT: vpextrw $7, %xmm1, %eax +; AVX2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rsi # 8-byte Reload +; AVX2-NEXT: cmovbw (%rsp,%rsi,2), %ax +; AVX2-NEXT: movl %eax, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; AVX2-NEXT: vpextrw $0, %xmm0, (%rsp) +; AVX2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rsi # 8-byte Reload +; AVX2-NEXT: vpextrw $1, %xmm0, (%rsp,%rsi,2) +; AVX2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rsi # 8-byte Reload +; AVX2-NEXT: vpextrw $2, %xmm0, (%rsp,%rsi,2) +; AVX2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rsi # 8-byte Reload +; AVX2-NEXT: vpextrw $3, %xmm0, (%rsp,%rsi,2) +; AVX2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload +; AVX2-NEXT: vpextrw $4, %xmm0, (%rsp,%rax,2) +; AVX2-NEXT: andl $15, %r8d +; AVX2-NEXT: vpextrw $5, %xmm0, (%rsp,%r8,2) +; AVX2-NEXT: andl $15, %r9d +; AVX2-NEXT: vpextrw $6, %xmm0, (%rsp,%r9,2) +; AVX2-NEXT: andl $15, %r10d +; AVX2-NEXT: vpextrw $7, %xmm0, (%rsp,%r10,2) +; AVX2-NEXT: andl $15, %r11d +; AVX2-NEXT: vpextrw $0, %xmm1, (%rsp,%r11,2) +; AVX2-NEXT: andl $15, %ebx +; AVX2-NEXT: vpextrw $1, %xmm1, (%rsp,%rbx,2) +; AVX2-NEXT: andl $15, %r14d +; AVX2-NEXT: vpextrw $2, %xmm1, (%rsp,%r14,2) +; AVX2-NEXT: andl $15, %r15d +; AVX2-NEXT: vpextrw $3, %xmm1, (%rsp,%r15,2) +; AVX2-NEXT: andl $15, %r12d +; AVX2-NEXT: vpextrw $4, %xmm1, (%rsp,%r12,2) +; AVX2-NEXT: andl $15, %r13d +; AVX2-NEXT: vpextrw $5, %xmm1, (%rsp,%r13,2) +; AVX2-NEXT: andl $15, %edx +; AVX2-NEXT: vpextrw $6, %xmm1, (%rsp,%rdx,2) +; AVX2-NEXT: andl $15, %ecx +; AVX2-NEXT: vpextrw $7, %xmm1, (%rsp,%rcx,2) +; AVX2-NEXT: cmpq $15, %rdi +; AVX2-NEXT: movl $15, %eax +; AVX2-NEXT: cmovbq %rdi, %rax +; AVX2-NEXT: movl %eax, %eax +; AVX2-NEXT: movl {{[-0-9]+}}(%r{{[sb]}}p), %ecx # 4-byte Reload +; AVX2-NEXT: movw %cx, (%rsp,%rax,2) +; AVX2-NEXT: vmovaps (%rsp), %ymm0 +; AVX2-NEXT: leaq -40(%rbp), %rsp +; AVX2-NEXT: popq %rbx +; AVX2-NEXT: popq %r12 +; AVX2-NEXT: popq %r13 +; AVX2-NEXT: popq %r14 +; 
AVX2-NEXT: popq %r15 +; AVX2-NEXT: popq %rbp +; AVX2-NEXT: retq +; ; AVX512F-LABEL: test_compress_v16i16: ; AVX512F: # %bb.0: ; AVX512F-NEXT: vpmovsxbd %xmm1, %zmm1 @@ -745,97 +1785,2541 @@ define <16 x i16> @test_compress_v16i16(<16 x i16> %vec, <16 x i1> %mask, <16 x ; AVX512F-NEXT: vpmovdw %zmm1, %ymm0 ; AVX512F-NEXT: retq ; -; AVX512VL-LABEL: test_compress_v16i16: -; AVX512VL: # %bb.0: -; AVX512VL-NEXT: vpsllw $7, %xmm1, %xmm1 -; AVX512VL-NEXT: vpmovb2m %xmm1, %k1 -; AVX512VL-NEXT: vpcompressw %ymm0, %ymm2 {%k1} -; AVX512VL-NEXT: vmovdqa %ymm2, %ymm0 -; AVX512VL-NEXT: retq - %out = call <16 x i16> @llvm.experimental.vector.compress(<16 x i16> %vec, <16 x i1> %mask, <16 x i16> %passthru) - ret <16 x i16> %out -} - -define <64 x i8> @test_compress_v64i8(<64 x i8> %vec, <64 x i1> %mask, <64 x i8> %passthru) { -; AVX512VL-LABEL: test_compress_v64i8: -; AVX512VL: # %bb.0: -; AVX512VL-NEXT: vpsllw $7, %zmm1, %zmm1 -; AVX512VL-NEXT: vpmovb2m %zmm1, %k1 -; AVX512VL-NEXT: vpcompressb %zmm0, %zmm2 {%k1} -; AVX512VL-NEXT: vmovdqa64 %zmm2, %zmm0 -; AVX512VL-NEXT: retq - %out = call <64 x i8> @llvm.experimental.vector.compress(<64 x i8> %vec, <64 x i1> %mask, <64 x i8> %passthru) - ret <64 x i8> %out -} - -define <32 x i16> @test_compress_v32i16(<32 x i16> %vec, <32 x i1> %mask, <32 x i16> %passthru) { -; AVX512F-LABEL: test_compress_v32i16: +; AVX512VL-LABEL: test_compress_v16i16: +; AVX512VL: # %bb.0: +; AVX512VL-NEXT: vpsllw $7, %xmm1, %xmm1 +; AVX512VL-NEXT: vpmovb2m %xmm1, %k1 +; AVX512VL-NEXT: vpcompressw %ymm0, %ymm2 {%k1} +; AVX512VL-NEXT: vmovdqa %ymm2, %ymm0 +; AVX512VL-NEXT: retq + %out = call <16 x i16> @llvm.experimental.vector.compress(<16 x i16> %vec, <16 x i1> %mask, <16 x i16> %passthru) + ret <16 x i16> %out +} + +define <64 x i8> @test_compress_v64i8(<64 x i8> %vec, <64 x i1> %mask, <64 x i8> %passthru) nounwind { +; AVX2-LABEL: test_compress_v64i8: +; AVX2: # %bb.0: +; AVX2-NEXT: pushq %rbp +; AVX2-NEXT: movq %rsp, %rbp +; AVX2-NEXT: pushq %r15 +; 
AVX2-NEXT: pushq %r14 +; AVX2-NEXT: pushq %r13 +; AVX2-NEXT: pushq %r12 +; AVX2-NEXT: pushq %rbx +; AVX2-NEXT: andq $-32, %rsp +; AVX2-NEXT: subq $128, %rsp +; AVX2-NEXT: # kill: def $r9d killed $r9d def $r9 +; AVX2-NEXT: movq %r9, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; AVX2-NEXT: # kill: def $r8d killed $r8d def $r8 +; AVX2-NEXT: movq %r8, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; AVX2-NEXT: movl %ecx, %r13d +; AVX2-NEXT: movl %edx, %r15d +; AVX2-NEXT: movl %esi, %ebx +; AVX2-NEXT: # kill: def $edi killed $edi def $rdi +; AVX2-NEXT: movq %rdi, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; AVX2-NEXT: movl 360(%rbp), %eax +; AVX2-NEXT: movl 352(%rbp), %ecx +; AVX2-NEXT: vmovd %ecx, %xmm4 +; AVX2-NEXT: vpinsrb $1, %eax, %xmm4, %xmm4 +; AVX2-NEXT: movl 368(%rbp), %eax +; AVX2-NEXT: vpinsrb $2, %eax, %xmm4, %xmm4 +; AVX2-NEXT: movl 376(%rbp), %eax +; AVX2-NEXT: vpinsrb $3, %eax, %xmm4, %xmm4 +; AVX2-NEXT: movl 384(%rbp), %eax +; AVX2-NEXT: vpinsrb $4, %eax, %xmm4, %xmm4 +; AVX2-NEXT: movl 392(%rbp), %eax +; AVX2-NEXT: vpinsrb $5, %eax, %xmm4, %xmm4 +; AVX2-NEXT: movl 400(%rbp), %eax +; AVX2-NEXT: vpinsrb $6, %eax, %xmm4, %xmm4 +; AVX2-NEXT: movl 408(%rbp), %eax +; AVX2-NEXT: vpinsrb $7, %eax, %xmm4, %xmm4 +; AVX2-NEXT: movl 416(%rbp), %eax +; AVX2-NEXT: vpinsrb $8, %eax, %xmm4, %xmm4 +; AVX2-NEXT: movl 424(%rbp), %eax +; AVX2-NEXT: vpinsrb $9, %eax, %xmm4, %xmm4 +; AVX2-NEXT: movl 432(%rbp), %eax +; AVX2-NEXT: vpinsrb $10, %eax, %xmm4, %xmm4 +; AVX2-NEXT: movl 440(%rbp), %eax +; AVX2-NEXT: vpinsrb $11, %eax, %xmm4, %xmm4 +; AVX2-NEXT: movl 448(%rbp), %eax +; AVX2-NEXT: vpinsrb $12, %eax, %xmm4, %xmm4 +; AVX2-NEXT: movl 456(%rbp), %eax +; AVX2-NEXT: vpinsrb $13, %eax, %xmm4, %xmm4 +; AVX2-NEXT: movl 464(%rbp), %eax +; AVX2-NEXT: vpinsrb $14, %eax, %xmm4, %xmm4 +; AVX2-NEXT: movl 472(%rbp), %eax +; AVX2-NEXT: vpinsrb $15, %eax, %xmm4, %xmm4 +; AVX2-NEXT: movl 224(%rbp), %eax +; AVX2-NEXT: vmovd %eax, %xmm5 +; AVX2-NEXT: movl 232(%rbp), %eax +; AVX2-NEXT: vpinsrb $1, 
%eax, %xmm5, %xmm5 +; AVX2-NEXT: movl 240(%rbp), %eax +; AVX2-NEXT: vpinsrb $2, %eax, %xmm5, %xmm5 +; AVX2-NEXT: movl 248(%rbp), %eax +; AVX2-NEXT: vpinsrb $3, %eax, %xmm5, %xmm5 +; AVX2-NEXT: movl 256(%rbp), %eax +; AVX2-NEXT: vpinsrb $4, %eax, %xmm5, %xmm5 +; AVX2-NEXT: movl 264(%rbp), %eax +; AVX2-NEXT: vpinsrb $5, %eax, %xmm5, %xmm5 +; AVX2-NEXT: movl 272(%rbp), %eax +; AVX2-NEXT: vpinsrb $6, %eax, %xmm5, %xmm5 +; AVX2-NEXT: movl 280(%rbp), %eax +; AVX2-NEXT: vpinsrb $7, %eax, %xmm5, %xmm5 +; AVX2-NEXT: movl 288(%rbp), %eax +; AVX2-NEXT: vpinsrb $8, %eax, %xmm5, %xmm5 +; AVX2-NEXT: movl 296(%rbp), %eax +; AVX2-NEXT: vpinsrb $9, %eax, %xmm5, %xmm5 +; AVX2-NEXT: movl 304(%rbp), %eax +; AVX2-NEXT: vpinsrb $10, %eax, %xmm5, %xmm5 +; AVX2-NEXT: movl 312(%rbp), %eax +; AVX2-NEXT: vpinsrb $11, %eax, %xmm5, %xmm5 +; AVX2-NEXT: movl 320(%rbp), %eax +; AVX2-NEXT: vpinsrb $12, %eax, %xmm5, %xmm5 +; AVX2-NEXT: movl 328(%rbp), %eax +; AVX2-NEXT: vpinsrb $13, %eax, %xmm5, %xmm5 +; AVX2-NEXT: movl 336(%rbp), %eax +; AVX2-NEXT: vpinsrb $14, %eax, %xmm5, %xmm5 +; AVX2-NEXT: movl 344(%rbp), %eax +; AVX2-NEXT: vpinsrb $15, %eax, %xmm5, %xmm5 +; AVX2-NEXT: vinserti128 $1, %xmm4, %ymm5, %ymm4 +; AVX2-NEXT: movl 96(%rbp), %eax +; AVX2-NEXT: vmovd %eax, %xmm5 +; AVX2-NEXT: movl 104(%rbp), %eax +; AVX2-NEXT: vpinsrb $1, %eax, %xmm5, %xmm5 +; AVX2-NEXT: movl 112(%rbp), %eax +; AVX2-NEXT: vpinsrb $2, %eax, %xmm5, %xmm5 +; AVX2-NEXT: movl 120(%rbp), %eax +; AVX2-NEXT: vpinsrb $3, %eax, %xmm5, %xmm5 +; AVX2-NEXT: movl 128(%rbp), %eax +; AVX2-NEXT: vpinsrb $4, %eax, %xmm5, %xmm5 +; AVX2-NEXT: movl 136(%rbp), %eax +; AVX2-NEXT: vpinsrb $5, %eax, %xmm5, %xmm5 +; AVX2-NEXT: movl 144(%rbp), %eax +; AVX2-NEXT: vpinsrb $6, %eax, %xmm5, %xmm5 +; AVX2-NEXT: movl 152(%rbp), %eax +; AVX2-NEXT: vpinsrb $7, %eax, %xmm5, %xmm5 +; AVX2-NEXT: movl 160(%rbp), %eax +; AVX2-NEXT: vpinsrb $8, %eax, %xmm5, %xmm5 +; AVX2-NEXT: movl 168(%rbp), %eax +; AVX2-NEXT: vpinsrb $9, %eax, %xmm5, %xmm5 +; AVX2-NEXT: movl 
176(%rbp), %eax +; AVX2-NEXT: vpinsrb $10, %eax, %xmm5, %xmm5 +; AVX2-NEXT: movl 184(%rbp), %eax +; AVX2-NEXT: vpinsrb $11, %eax, %xmm5, %xmm5 +; AVX2-NEXT: movl 192(%rbp), %eax +; AVX2-NEXT: vpinsrb $12, %eax, %xmm5, %xmm5 +; AVX2-NEXT: movl 200(%rbp), %eax +; AVX2-NEXT: vpinsrb $13, %eax, %xmm5, %xmm5 +; AVX2-NEXT: movl 208(%rbp), %eax +; AVX2-NEXT: vpinsrb $14, %eax, %xmm5, %xmm5 +; AVX2-NEXT: movl 216(%rbp), %eax +; AVX2-NEXT: vpinsrb $15, %eax, %xmm5, %xmm5 +; AVX2-NEXT: vmovd %edi, %xmm6 +; AVX2-NEXT: vpinsrb $1, %esi, %xmm6, %xmm6 +; AVX2-NEXT: vpinsrb $2, %edx, %xmm6, %xmm6 +; AVX2-NEXT: vpinsrb $3, %r13d, %xmm6, %xmm6 +; AVX2-NEXT: vpinsrb $4, %r8d, %xmm6, %xmm6 +; AVX2-NEXT: vpinsrb $5, %r9d, %xmm6, %xmm6 +; AVX2-NEXT: movl 16(%rbp), %esi +; AVX2-NEXT: vpinsrb $6, %esi, %xmm6, %xmm6 +; AVX2-NEXT: movl 24(%rbp), %edi +; AVX2-NEXT: vpinsrb $7, %edi, %xmm6, %xmm6 +; AVX2-NEXT: movl 32(%rbp), %r8d +; AVX2-NEXT: vpinsrb $8, %r8d, %xmm6, %xmm6 +; AVX2-NEXT: movl 40(%rbp), %r9d +; AVX2-NEXT: vpinsrb $9, %r9d, %xmm6, %xmm6 +; AVX2-NEXT: movl 48(%rbp), %r10d +; AVX2-NEXT: vpinsrb $10, %r10d, %xmm6, %xmm6 +; AVX2-NEXT: movl 56(%rbp), %r11d +; AVX2-NEXT: vpinsrb $11, %r11d, %xmm6, %xmm6 +; AVX2-NEXT: movl 64(%rbp), %r14d +; AVX2-NEXT: vpinsrb $12, %r14d, %xmm6, %xmm6 +; AVX2-NEXT: movl 72(%rbp), %r12d +; AVX2-NEXT: vpinsrb $13, %r12d, %xmm6, %xmm6 +; AVX2-NEXT: movl 80(%rbp), %eax +; AVX2-NEXT: vpinsrb $14, %eax, %xmm6, %xmm6 +; AVX2-NEXT: movl 88(%rbp), %eax +; AVX2-NEXT: vpinsrb $15, %eax, %xmm6, %xmm6 +; AVX2-NEXT: vinserti128 $1, %xmm5, %ymm6, %ymm5 +; AVX2-NEXT: vpbroadcastb {{.*#+}} ymm6 = [1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1] +; AVX2-NEXT: vpand %ymm6, %ymm5, %ymm5 +; AVX2-NEXT: vpand %ymm6, %ymm4, %ymm4 +; AVX2-NEXT: vpaddb %ymm4, %ymm5, %ymm4 +; AVX2-NEXT: vextracti128 $1, %ymm4, %xmm5 +; AVX2-NEXT: vpaddb %xmm5, %xmm4, %xmm4 +; AVX2-NEXT: vpextrb $1, %xmm4, %eax +; AVX2-NEXT: vmovd %xmm4, %ecx +; AVX2-NEXT: addb %al, %cl +; 
AVX2-NEXT: vpextrb $2, %xmm4, %edx +; AVX2-NEXT: vpextrb $3, %xmm4, %eax +; AVX2-NEXT: addb %dl, %al +; AVX2-NEXT: addb %cl, %al +; AVX2-NEXT: vpextrb $4, %xmm4, %ecx +; AVX2-NEXT: vpextrb $5, %xmm4, %edx +; AVX2-NEXT: addb %cl, %dl +; AVX2-NEXT: vpextrb $6, %xmm4, %ecx +; AVX2-NEXT: addb %dl, %cl +; AVX2-NEXT: addb %al, %cl +; AVX2-NEXT: vpextrb $7, %xmm4, %eax +; AVX2-NEXT: vpextrb $8, %xmm4, %edx +; AVX2-NEXT: addb %al, %dl +; AVX2-NEXT: vpextrb $9, %xmm4, %eax +; AVX2-NEXT: addb %dl, %al +; AVX2-NEXT: vpextrb $10, %xmm4, %edx +; AVX2-NEXT: addb %al, %dl +; AVX2-NEXT: addb %cl, %dl +; AVX2-NEXT: vpextrb $11, %xmm4, %eax +; AVX2-NEXT: vpextrb $12, %xmm4, %ecx +; AVX2-NEXT: addb %al, %cl +; AVX2-NEXT: vpextrb $13, %xmm4, %eax +; AVX2-NEXT: addb %cl, %al +; AVX2-NEXT: vpextrb $14, %xmm4, %ecx +; AVX2-NEXT: addb %al, %cl +; AVX2-NEXT: vpextrb $15, %xmm4, %eax +; AVX2-NEXT: addb %cl, %al +; AVX2-NEXT: addb %dl, %al +; AVX2-NEXT: vmovaps %ymm3, {{[0-9]+}}(%rsp) +; AVX2-NEXT: vmovaps %ymm2, (%rsp) +; AVX2-NEXT: movzbl %al, %eax +; AVX2-NEXT: andl $63, %eax +; AVX2-NEXT: movzbl (%rsp,%rax), %eax +; AVX2-NEXT: movl %eax, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; AVX2-NEXT: vpextrb $0, %xmm0, (%rsp) +; AVX2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload +; AVX2-NEXT: andl $1, %eax +; AVX2-NEXT: vpextrb $1, %xmm0, (%rsp,%rax) +; AVX2-NEXT: andl $1, %ebx +; AVX2-NEXT: addq %rax, %rbx +; AVX2-NEXT: vpextrb $2, %xmm0, (%rsp,%rbx) +; AVX2-NEXT: andl $1, %r15d +; AVX2-NEXT: addq %rbx, %r15 +; AVX2-NEXT: vpextrb $3, %xmm0, (%rsp,%r15) +; AVX2-NEXT: andl $1, %r13d +; AVX2-NEXT: addq %r15, %r13 +; AVX2-NEXT: vpextrb $4, %xmm0, (%rsp,%r13) +; AVX2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rcx # 8-byte Reload +; AVX2-NEXT: andl $1, %ecx +; AVX2-NEXT: addq %r13, %rcx +; AVX2-NEXT: movl %ecx, %eax +; AVX2-NEXT: vpextrb $5, %xmm0, (%rsp,%rax) +; AVX2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload +; AVX2-NEXT: andl $1, %eax +; AVX2-NEXT: addq %rcx, %rax +; AVX2-NEXT: 
andl $1, %esi +; AVX2-NEXT: addq %rax, %rsi +; AVX2-NEXT: # kill: def $eax killed $eax killed $rax def $rax +; AVX2-NEXT: andl $63, %eax +; AVX2-NEXT: vpextrb $6, %xmm0, (%rsp,%rax) +; AVX2-NEXT: andl $1, %edi +; AVX2-NEXT: addq %rsi, %rdi +; AVX2-NEXT: # kill: def $esi killed $esi killed $rsi def $rsi +; AVX2-NEXT: andl $63, %esi +; AVX2-NEXT: vpextrb $7, %xmm0, (%rsp,%rsi) +; AVX2-NEXT: andl $1, %r8d +; AVX2-NEXT: addq %rdi, %r8 +; AVX2-NEXT: # kill: def $edi killed $edi killed $rdi def $rdi +; AVX2-NEXT: andl $63, %edi +; AVX2-NEXT: vpextrb $8, %xmm0, (%rsp,%rdi) +; AVX2-NEXT: andl $1, %r9d +; AVX2-NEXT: addq %r8, %r9 +; AVX2-NEXT: # kill: def $r8d killed $r8d killed $r8 def $r8 +; AVX2-NEXT: andl $63, %r8d +; AVX2-NEXT: vpextrb $9, %xmm0, (%rsp,%r8) +; AVX2-NEXT: andl $1, %r10d +; AVX2-NEXT: addq %r9, %r10 +; AVX2-NEXT: # kill: def $r9d killed $r9d killed $r9 def $r9 +; AVX2-NEXT: andl $63, %r9d +; AVX2-NEXT: vpextrb $10, %xmm0, (%rsp,%r9) +; AVX2-NEXT: andl $1, %r11d +; AVX2-NEXT: addq %r10, %r11 +; AVX2-NEXT: # kill: def $r10d killed $r10d killed $r10 def $r10 +; AVX2-NEXT: andl $63, %r10d +; AVX2-NEXT: vpextrb $11, %xmm0, (%rsp,%r10) +; AVX2-NEXT: andl $1, %r14d +; AVX2-NEXT: addq %r11, %r14 +; AVX2-NEXT: # kill: def $r11d killed $r11d killed $r11 def $r11 +; AVX2-NEXT: andl $63, %r11d +; AVX2-NEXT: vpextrb $12, %xmm0, (%rsp,%r11) +; AVX2-NEXT: andl $1, %r12d +; AVX2-NEXT: addq %r14, %r12 +; AVX2-NEXT: # kill: def $r14d killed $r14d killed $r14 def $r14 +; AVX2-NEXT: andl $63, %r14d +; AVX2-NEXT: vpextrb $13, %xmm0, (%rsp,%r14) +; AVX2-NEXT: movl 80(%rbp), %eax +; AVX2-NEXT: andl $1, %eax +; AVX2-NEXT: addq %r12, %rax +; AVX2-NEXT: # kill: def $r12d killed $r12d killed $r12 def $r12 +; AVX2-NEXT: andl $63, %r12d +; AVX2-NEXT: vpextrb $14, %xmm0, (%rsp,%r12) +; AVX2-NEXT: movl 88(%rbp), %ecx +; AVX2-NEXT: andl $1, %ecx +; AVX2-NEXT: addq %rax, %rcx +; AVX2-NEXT: # kill: def $eax killed $eax killed $rax def $rax +; AVX2-NEXT: andl $63, %eax +; AVX2-NEXT: 
vpextrb $15, %xmm0, (%rsp,%rax) +; AVX2-NEXT: movl 96(%rbp), %edx +; AVX2-NEXT: andl $1, %edx +; AVX2-NEXT: addq %rcx, %rdx +; AVX2-NEXT: movl %ecx, %eax +; AVX2-NEXT: andl $63, %eax +; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm0 +; AVX2-NEXT: vpextrb $0, %xmm0, (%rsp,%rax) +; AVX2-NEXT: movl 104(%rbp), %ecx +; AVX2-NEXT: andl $1, %ecx +; AVX2-NEXT: addq %rdx, %rcx +; AVX2-NEXT: movl %edx, %eax +; AVX2-NEXT: andl $63, %eax +; AVX2-NEXT: vpextrb $1, %xmm0, (%rsp,%rax) +; AVX2-NEXT: movl 112(%rbp), %edx +; AVX2-NEXT: andl $1, %edx +; AVX2-NEXT: addq %rcx, %rdx +; AVX2-NEXT: movl %ecx, %eax +; AVX2-NEXT: andl $63, %eax +; AVX2-NEXT: vpextrb $2, %xmm0, (%rsp,%rax) +; AVX2-NEXT: movl 120(%rbp), %ecx +; AVX2-NEXT: andl $1, %ecx +; AVX2-NEXT: addq %rdx, %rcx +; AVX2-NEXT: movl %edx, %eax +; AVX2-NEXT: andl $63, %eax +; AVX2-NEXT: vpextrb $3, %xmm0, (%rsp,%rax) +; AVX2-NEXT: movl 128(%rbp), %edx +; AVX2-NEXT: andl $1, %edx +; AVX2-NEXT: addq %rcx, %rdx +; AVX2-NEXT: movl %ecx, %eax +; AVX2-NEXT: andl $63, %eax +; AVX2-NEXT: vpextrb $4, %xmm0, (%rsp,%rax) +; AVX2-NEXT: movl 136(%rbp), %ecx +; AVX2-NEXT: andl $1, %ecx +; AVX2-NEXT: addq %rdx, %rcx +; AVX2-NEXT: movl %edx, %eax +; AVX2-NEXT: andl $63, %eax +; AVX2-NEXT: vpextrb $5, %xmm0, (%rsp,%rax) +; AVX2-NEXT: movl 144(%rbp), %edx +; AVX2-NEXT: andl $1, %edx +; AVX2-NEXT: addq %rcx, %rdx +; AVX2-NEXT: movl %ecx, %eax +; AVX2-NEXT: andl $63, %eax +; AVX2-NEXT: vpextrb $6, %xmm0, (%rsp,%rax) +; AVX2-NEXT: movl 152(%rbp), %ecx +; AVX2-NEXT: andl $1, %ecx +; AVX2-NEXT: addq %rdx, %rcx +; AVX2-NEXT: movl %edx, %eax +; AVX2-NEXT: andl $63, %eax +; AVX2-NEXT: vpextrb $7, %xmm0, (%rsp,%rax) +; AVX2-NEXT: movl 160(%rbp), %edx +; AVX2-NEXT: andl $1, %edx +; AVX2-NEXT: addq %rcx, %rdx +; AVX2-NEXT: movl %ecx, %eax +; AVX2-NEXT: andl $63, %eax +; AVX2-NEXT: vpextrb $8, %xmm0, (%rsp,%rax) +; AVX2-NEXT: movl 168(%rbp), %ecx +; AVX2-NEXT: andl $1, %ecx +; AVX2-NEXT: addq %rdx, %rcx +; AVX2-NEXT: movl %edx, %eax +; AVX2-NEXT: andl $63, %eax 
+; AVX2-NEXT: vpextrb $9, %xmm0, (%rsp,%rax) +; AVX2-NEXT: movl 176(%rbp), %edx +; AVX2-NEXT: andl $1, %edx +; AVX2-NEXT: addq %rcx, %rdx +; AVX2-NEXT: movl %ecx, %eax +; AVX2-NEXT: andl $63, %eax +; AVX2-NEXT: vpextrb $10, %xmm0, (%rsp,%rax) +; AVX2-NEXT: movl 184(%rbp), %ecx +; AVX2-NEXT: andl $1, %ecx +; AVX2-NEXT: addq %rdx, %rcx +; AVX2-NEXT: movl %edx, %eax +; AVX2-NEXT: andl $63, %eax +; AVX2-NEXT: vpextrb $11, %xmm0, (%rsp,%rax) +; AVX2-NEXT: movl 192(%rbp), %edx +; AVX2-NEXT: andl $1, %edx +; AVX2-NEXT: addq %rcx, %rdx +; AVX2-NEXT: movl %ecx, %eax +; AVX2-NEXT: andl $63, %eax +; AVX2-NEXT: vpextrb $12, %xmm0, (%rsp,%rax) +; AVX2-NEXT: movl 200(%rbp), %ecx +; AVX2-NEXT: andl $1, %ecx +; AVX2-NEXT: addq %rdx, %rcx +; AVX2-NEXT: movl %edx, %eax +; AVX2-NEXT: andl $63, %eax +; AVX2-NEXT: vpextrb $13, %xmm0, (%rsp,%rax) +; AVX2-NEXT: movl 208(%rbp), %edx +; AVX2-NEXT: andl $1, %edx +; AVX2-NEXT: addq %rcx, %rdx +; AVX2-NEXT: movl %ecx, %eax +; AVX2-NEXT: andl $63, %eax +; AVX2-NEXT: vpextrb $14, %xmm0, (%rsp,%rax) +; AVX2-NEXT: movl 216(%rbp), %ecx +; AVX2-NEXT: andl $1, %ecx +; AVX2-NEXT: addq %rdx, %rcx +; AVX2-NEXT: movl %edx, %eax +; AVX2-NEXT: andl $63, %eax +; AVX2-NEXT: vpextrb $15, %xmm0, (%rsp,%rax) +; AVX2-NEXT: movl 224(%rbp), %edx +; AVX2-NEXT: andl $1, %edx +; AVX2-NEXT: addq %rcx, %rdx +; AVX2-NEXT: movl %ecx, %eax +; AVX2-NEXT: andl $63, %eax +; AVX2-NEXT: vpextrb $0, %xmm1, (%rsp,%rax) +; AVX2-NEXT: movl 232(%rbp), %ecx +; AVX2-NEXT: andl $1, %ecx +; AVX2-NEXT: addq %rdx, %rcx +; AVX2-NEXT: movl %edx, %eax +; AVX2-NEXT: andl $63, %eax +; AVX2-NEXT: vpextrb $1, %xmm1, (%rsp,%rax) +; AVX2-NEXT: movl 240(%rbp), %edx +; AVX2-NEXT: andl $1, %edx +; AVX2-NEXT: addq %rcx, %rdx +; AVX2-NEXT: movl %ecx, %eax +; AVX2-NEXT: andl $63, %eax +; AVX2-NEXT: vpextrb $2, %xmm1, (%rsp,%rax) +; AVX2-NEXT: movl 248(%rbp), %ecx +; AVX2-NEXT: andl $1, %ecx +; AVX2-NEXT: addq %rdx, %rcx +; AVX2-NEXT: movl %edx, %eax +; AVX2-NEXT: andl $63, %eax +; AVX2-NEXT: vpextrb 
$3, %xmm1, (%rsp,%rax) +; AVX2-NEXT: movl 256(%rbp), %edx +; AVX2-NEXT: andl $1, %edx +; AVX2-NEXT: addq %rcx, %rdx +; AVX2-NEXT: movl %ecx, %eax +; AVX2-NEXT: andl $63, %eax +; AVX2-NEXT: vpextrb $4, %xmm1, (%rsp,%rax) +; AVX2-NEXT: movl 264(%rbp), %ecx +; AVX2-NEXT: andl $1, %ecx +; AVX2-NEXT: addq %rdx, %rcx +; AVX2-NEXT: movl %edx, %eax +; AVX2-NEXT: andl $63, %eax +; AVX2-NEXT: vpextrb $5, %xmm1, (%rsp,%rax) +; AVX2-NEXT: movl 272(%rbp), %edx +; AVX2-NEXT: andl $1, %edx +; AVX2-NEXT: addq %rcx, %rdx +; AVX2-NEXT: movl %ecx, %eax +; AVX2-NEXT: andl $63, %eax +; AVX2-NEXT: vpextrb $6, %xmm1, (%rsp,%rax) +; AVX2-NEXT: movl 280(%rbp), %ecx +; AVX2-NEXT: andl $1, %ecx +; AVX2-NEXT: addq %rdx, %rcx +; AVX2-NEXT: movl %edx, %eax +; AVX2-NEXT: andl $63, %eax +; AVX2-NEXT: vpextrb $7, %xmm1, (%rsp,%rax) +; AVX2-NEXT: movl 288(%rbp), %edx +; AVX2-NEXT: andl $1, %edx +; AVX2-NEXT: addq %rcx, %rdx +; AVX2-NEXT: movl %ecx, %eax +; AVX2-NEXT: andl $63, %eax +; AVX2-NEXT: vpextrb $8, %xmm1, (%rsp,%rax) +; AVX2-NEXT: movl 296(%rbp), %ecx +; AVX2-NEXT: andl $1, %ecx +; AVX2-NEXT: addq %rdx, %rcx +; AVX2-NEXT: movl %edx, %eax +; AVX2-NEXT: andl $63, %eax +; AVX2-NEXT: vpextrb $9, %xmm1, (%rsp,%rax) +; AVX2-NEXT: movl 304(%rbp), %edx +; AVX2-NEXT: andl $1, %edx +; AVX2-NEXT: addq %rcx, %rdx +; AVX2-NEXT: movl %ecx, %eax +; AVX2-NEXT: andl $63, %eax +; AVX2-NEXT: vpextrb $10, %xmm1, (%rsp,%rax) +; AVX2-NEXT: movl 312(%rbp), %ecx +; AVX2-NEXT: andl $1, %ecx +; AVX2-NEXT: addq %rdx, %rcx +; AVX2-NEXT: movl %edx, %eax +; AVX2-NEXT: andl $63, %eax +; AVX2-NEXT: vpextrb $11, %xmm1, (%rsp,%rax) +; AVX2-NEXT: movl 320(%rbp), %edx +; AVX2-NEXT: andl $1, %edx +; AVX2-NEXT: addq %rcx, %rdx +; AVX2-NEXT: movl %ecx, %eax +; AVX2-NEXT: andl $63, %eax +; AVX2-NEXT: vpextrb $12, %xmm1, (%rsp,%rax) +; AVX2-NEXT: movl 328(%rbp), %ecx +; AVX2-NEXT: andl $1, %ecx +; AVX2-NEXT: addq %rdx, %rcx +; AVX2-NEXT: movl %edx, %eax +; AVX2-NEXT: andl $63, %eax +; AVX2-NEXT: vpextrb $13, %xmm1, (%rsp,%rax) +; 
AVX2-NEXT: movl 336(%rbp), %edx +; AVX2-NEXT: andl $1, %edx +; AVX2-NEXT: addq %rcx, %rdx +; AVX2-NEXT: movl %ecx, %eax +; AVX2-NEXT: andl $63, %eax +; AVX2-NEXT: vpextrb $14, %xmm1, (%rsp,%rax) +; AVX2-NEXT: movl 344(%rbp), %ecx +; AVX2-NEXT: andl $1, %ecx +; AVX2-NEXT: addq %rdx, %rcx +; AVX2-NEXT: movl %edx, %eax +; AVX2-NEXT: andl $63, %eax +; AVX2-NEXT: vpextrb $15, %xmm1, (%rsp,%rax) +; AVX2-NEXT: movl 352(%rbp), %edx +; AVX2-NEXT: andl $1, %edx +; AVX2-NEXT: addq %rcx, %rdx +; AVX2-NEXT: movl %ecx, %eax +; AVX2-NEXT: andl $63, %eax +; AVX2-NEXT: vextracti128 $1, %ymm1, %xmm0 +; AVX2-NEXT: vpextrb $0, %xmm0, (%rsp,%rax) +; AVX2-NEXT: movl 360(%rbp), %ecx +; AVX2-NEXT: andl $1, %ecx +; AVX2-NEXT: addq %rdx, %rcx +; AVX2-NEXT: movl %edx, %eax +; AVX2-NEXT: andl $63, %eax +; AVX2-NEXT: vpextrb $1, %xmm0, (%rsp,%rax) +; AVX2-NEXT: movl 368(%rbp), %edx +; AVX2-NEXT: andl $1, %edx +; AVX2-NEXT: addq %rcx, %rdx +; AVX2-NEXT: movl %ecx, %eax +; AVX2-NEXT: andl $63, %eax +; AVX2-NEXT: vpextrb $2, %xmm0, (%rsp,%rax) +; AVX2-NEXT: movl 376(%rbp), %ecx +; AVX2-NEXT: andl $1, %ecx +; AVX2-NEXT: addq %rdx, %rcx +; AVX2-NEXT: movl %edx, %eax +; AVX2-NEXT: andl $63, %eax +; AVX2-NEXT: vpextrb $3, %xmm0, (%rsp,%rax) +; AVX2-NEXT: movl 384(%rbp), %edx +; AVX2-NEXT: andl $1, %edx +; AVX2-NEXT: addq %rcx, %rdx +; AVX2-NEXT: movl %ecx, %eax +; AVX2-NEXT: andl $63, %eax +; AVX2-NEXT: vpextrb $4, %xmm0, (%rsp,%rax) +; AVX2-NEXT: movl 392(%rbp), %ecx +; AVX2-NEXT: andl $1, %ecx +; AVX2-NEXT: addq %rdx, %rcx +; AVX2-NEXT: movl %edx, %eax +; AVX2-NEXT: andl $63, %eax +; AVX2-NEXT: vpextrb $5, %xmm0, (%rsp,%rax) +; AVX2-NEXT: movl 400(%rbp), %edx +; AVX2-NEXT: andl $1, %edx +; AVX2-NEXT: addq %rcx, %rdx +; AVX2-NEXT: movl %ecx, %eax +; AVX2-NEXT: andl $63, %eax +; AVX2-NEXT: vpextrb $6, %xmm0, (%rsp,%rax) +; AVX2-NEXT: movl 408(%rbp), %ecx +; AVX2-NEXT: andl $1, %ecx +; AVX2-NEXT: addq %rdx, %rcx +; AVX2-NEXT: movl %edx, %eax +; AVX2-NEXT: andl $63, %eax +; AVX2-NEXT: vpextrb $7, 
%xmm0, (%rsp,%rax) +; AVX2-NEXT: movl 416(%rbp), %edx +; AVX2-NEXT: andl $1, %edx +; AVX2-NEXT: addq %rcx, %rdx +; AVX2-NEXT: movl %ecx, %eax +; AVX2-NEXT: andl $63, %eax +; AVX2-NEXT: vpextrb $8, %xmm0, (%rsp,%rax) +; AVX2-NEXT: movl 424(%rbp), %ecx +; AVX2-NEXT: andl $1, %ecx +; AVX2-NEXT: addq %rdx, %rcx +; AVX2-NEXT: movl %edx, %eax +; AVX2-NEXT: andl $63, %eax +; AVX2-NEXT: vpextrb $9, %xmm0, (%rsp,%rax) +; AVX2-NEXT: movl 432(%rbp), %edx +; AVX2-NEXT: andl $1, %edx +; AVX2-NEXT: addq %rcx, %rdx +; AVX2-NEXT: movl %ecx, %eax +; AVX2-NEXT: andl $63, %eax +; AVX2-NEXT: vpextrb $10, %xmm0, (%rsp,%rax) +; AVX2-NEXT: movl 440(%rbp), %ecx +; AVX2-NEXT: andl $1, %ecx +; AVX2-NEXT: addq %rdx, %rcx +; AVX2-NEXT: movl %edx, %eax +; AVX2-NEXT: andl $63, %eax +; AVX2-NEXT: vpextrb $11, %xmm0, (%rsp,%rax) +; AVX2-NEXT: movl 448(%rbp), %edx +; AVX2-NEXT: andl $1, %edx +; AVX2-NEXT: addq %rcx, %rdx +; AVX2-NEXT: movl %ecx, %eax +; AVX2-NEXT: andl $63, %eax +; AVX2-NEXT: vpextrb $12, %xmm0, (%rsp,%rax) +; AVX2-NEXT: movl 456(%rbp), %ecx +; AVX2-NEXT: andl $1, %ecx +; AVX2-NEXT: addq %rdx, %rcx +; AVX2-NEXT: movl %edx, %eax +; AVX2-NEXT: andl $63, %eax +; AVX2-NEXT: vpextrb $13, %xmm0, (%rsp,%rax) +; AVX2-NEXT: movl 464(%rbp), %edx +; AVX2-NEXT: andl $1, %edx +; AVX2-NEXT: addq %rcx, %rdx +; AVX2-NEXT: movl %ecx, %eax +; AVX2-NEXT: andl $63, %eax +; AVX2-NEXT: vpextrb $14, %xmm0, (%rsp,%rax) +; AVX2-NEXT: movl 472(%rbp), %ecx +; AVX2-NEXT: andl $1, %ecx +; AVX2-NEXT: addq %rdx, %rcx +; AVX2-NEXT: movl %edx, %eax +; AVX2-NEXT: andl $63, %eax +; AVX2-NEXT: vpextrb $15, %xmm0, (%rsp,%rax) +; AVX2-NEXT: vpextrb $15, %xmm0, %eax +; AVX2-NEXT: cmpq $64, %rcx +; AVX2-NEXT: cmovbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 4-byte Folded Reload +; AVX2-NEXT: cmpq $63, %rcx +; AVX2-NEXT: movq %rcx, %rdx +; AVX2-NEXT: movl $63, %ecx +; AVX2-NEXT: cmovbq %rdx, %rcx +; AVX2-NEXT: movb %al, (%rsp,%rcx) +; AVX2-NEXT: vmovaps (%rsp), %ymm0 +; AVX2-NEXT: vmovaps {{[0-9]+}}(%rsp), %ymm1 +; AVX2-NEXT: 
leaq -40(%rbp), %rsp +; AVX2-NEXT: popq %rbx +; AVX2-NEXT: popq %r12 +; AVX2-NEXT: popq %r13 +; AVX2-NEXT: popq %r14 +; AVX2-NEXT: popq %r15 +; AVX2-NEXT: popq %rbp +; AVX2-NEXT: retq +; +; AVX512F-LABEL: test_compress_v64i8: +; AVX512F: # %bb.0: +; AVX512F-NEXT: pushq %rbp +; AVX512F-NEXT: movq %rsp, %rbp +; AVX512F-NEXT: andq $-64, %rsp +; AVX512F-NEXT: subq $256, %rsp # imm = 0x100 +; AVX512F-NEXT: movzbl 352(%rbp), %eax +; AVX512F-NEXT: andl $1, %eax +; AVX512F-NEXT: kmovw %eax, %k0 +; AVX512F-NEXT: movzbl 360(%rbp), %eax +; AVX512F-NEXT: kmovw %eax, %k1 +; AVX512F-NEXT: kshiftlw $15, %k1, %k1 +; AVX512F-NEXT: kshiftrw $14, %k1, %k1 +; AVX512F-NEXT: korw %k1, %k0, %k0 +; AVX512F-NEXT: movw $-5, %ax +; AVX512F-NEXT: kmovw %eax, %k1 +; AVX512F-NEXT: kandw %k1, %k0, %k0 +; AVX512F-NEXT: kmovw %k1, %k3 +; AVX512F-NEXT: movzbl 368(%rbp), %eax +; AVX512F-NEXT: kmovw %eax, %k1 +; AVX512F-NEXT: kshiftlw $15, %k1, %k1 +; AVX512F-NEXT: kshiftrw $13, %k1, %k1 +; AVX512F-NEXT: korw %k1, %k0, %k0 +; AVX512F-NEXT: movw $-9, %ax +; AVX512F-NEXT: kmovw %eax, %k7 +; AVX512F-NEXT: kandw %k7, %k0, %k0 +; AVX512F-NEXT: kmovw %k7, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill +; AVX512F-NEXT: movzbl 376(%rbp), %eax +; AVX512F-NEXT: kmovw %eax, %k1 +; AVX512F-NEXT: kshiftlw $15, %k1, %k1 +; AVX512F-NEXT: kshiftrw $12, %k1, %k1 +; AVX512F-NEXT: korw %k1, %k0, %k0 +; AVX512F-NEXT: movw $-17, %ax +; AVX512F-NEXT: kmovw %eax, %k5 +; AVX512F-NEXT: kandw %k5, %k0, %k0 +; AVX512F-NEXT: movzbl 384(%rbp), %eax +; AVX512F-NEXT: kmovw %eax, %k1 +; AVX512F-NEXT: kshiftlw $15, %k1, %k1 +; AVX512F-NEXT: kshiftrw $11, %k1, %k1 +; AVX512F-NEXT: korw %k1, %k0, %k0 +; AVX512F-NEXT: movw $-33, %ax +; AVX512F-NEXT: kmovw %eax, %k6 +; AVX512F-NEXT: kandw %k6, %k0, %k0 +; AVX512F-NEXT: movzbl 392(%rbp), %eax +; AVX512F-NEXT: kmovw %eax, %k1 +; AVX512F-NEXT: kshiftlw $15, %k1, %k1 +; AVX512F-NEXT: kshiftrw $10, %k1, %k1 +; AVX512F-NEXT: korw %k1, %k0, %k0 +; AVX512F-NEXT: movw $-65, %ax +; AVX512F-NEXT: kmovw 
%eax, %k1 +; AVX512F-NEXT: kmovw %k1, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill +; AVX512F-NEXT: kandw %k1, %k0, %k0 +; AVX512F-NEXT: movzbl 400(%rbp), %eax +; AVX512F-NEXT: kmovw %eax, %k1 +; AVX512F-NEXT: kshiftlw $15, %k1, %k1 +; AVX512F-NEXT: kshiftrw $9, %k1, %k1 +; AVX512F-NEXT: korw %k1, %k0, %k0 +; AVX512F-NEXT: movw $-129, %ax +; AVX512F-NEXT: kmovw %eax, %k1 +; AVX512F-NEXT: kmovw %k1, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill +; AVX512F-NEXT: kandw %k1, %k0, %k0 +; AVX512F-NEXT: movzbl 408(%rbp), %eax +; AVX512F-NEXT: kmovw %eax, %k1 +; AVX512F-NEXT: kshiftlw $15, %k1, %k1 +; AVX512F-NEXT: kshiftrw $8, %k1, %k1 +; AVX512F-NEXT: korw %k1, %k0, %k0 +; AVX512F-NEXT: movw $-257, %ax # imm = 0xFEFF +; AVX512F-NEXT: kmovw %eax, %k1 +; AVX512F-NEXT: kmovw %k1, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill +; AVX512F-NEXT: kandw %k1, %k0, %k0 +; AVX512F-NEXT: movzbl 416(%rbp), %eax +; AVX512F-NEXT: kmovw %eax, %k1 +; AVX512F-NEXT: kshiftlw $15, %k1, %k1 +; AVX512F-NEXT: kshiftrw $7, %k1, %k1 +; AVX512F-NEXT: korw %k1, %k0, %k0 +; AVX512F-NEXT: movw $-513, %ax # imm = 0xFDFF +; AVX512F-NEXT: kmovw %eax, %k1 +; AVX512F-NEXT: kmovw %k1, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill +; AVX512F-NEXT: kandw %k1, %k0, %k0 +; AVX512F-NEXT: movzbl 424(%rbp), %eax +; AVX512F-NEXT: kmovw %eax, %k1 +; AVX512F-NEXT: kshiftlw $15, %k1, %k1 +; AVX512F-NEXT: kshiftrw $6, %k1, %k1 +; AVX512F-NEXT: korw %k1, %k0, %k0 +; AVX512F-NEXT: movw $-1025, %ax # imm = 0xFBFF +; AVX512F-NEXT: kmovw %eax, %k1 +; AVX512F-NEXT: kmovw %k1, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill +; AVX512F-NEXT: kandw %k1, %k0, %k0 +; AVX512F-NEXT: movzbl 432(%rbp), %eax +; AVX512F-NEXT: kmovw %eax, %k1 +; AVX512F-NEXT: kshiftlw $15, %k1, %k1 +; AVX512F-NEXT: kshiftrw $5, %k1, %k1 +; AVX512F-NEXT: korw %k1, %k0, %k0 +; AVX512F-NEXT: movw $-2049, %ax # imm = 0xF7FF +; AVX512F-NEXT: kmovw %eax, %k1 +; AVX512F-NEXT: kmovw %k1, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill +; AVX512F-NEXT: kandw %k1, %k0, %k0 +; AVX512F-NEXT: movzbl 
440(%rbp), %eax +; AVX512F-NEXT: kmovw %eax, %k1 +; AVX512F-NEXT: kshiftlw $15, %k1, %k1 +; AVX512F-NEXT: kshiftrw $4, %k1, %k1 +; AVX512F-NEXT: korw %k1, %k0, %k0 +; AVX512F-NEXT: movw $-4097, %ax # imm = 0xEFFF +; AVX512F-NEXT: kmovw %eax, %k1 +; AVX512F-NEXT: kmovw %k1, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill +; AVX512F-NEXT: kandw %k1, %k0, %k0 +; AVX512F-NEXT: movzbl 448(%rbp), %eax +; AVX512F-NEXT: kmovw %eax, %k1 +; AVX512F-NEXT: kshiftlw $15, %k1, %k1 +; AVX512F-NEXT: kshiftrw $3, %k1, %k1 +; AVX512F-NEXT: korw %k1, %k0, %k0 +; AVX512F-NEXT: movw $-8193, %ax # imm = 0xDFFF +; AVX512F-NEXT: kmovw %eax, %k1 +; AVX512F-NEXT: kmovw %k1, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill +; AVX512F-NEXT: kandw %k1, %k0, %k0 +; AVX512F-NEXT: movzbl 456(%rbp), %eax +; AVX512F-NEXT: kmovw %eax, %k1 +; AVX512F-NEXT: kshiftlw $15, %k1, %k1 +; AVX512F-NEXT: kshiftrw $2, %k1, %k1 +; AVX512F-NEXT: korw %k1, %k0, %k1 +; AVX512F-NEXT: movw $-16385, %ax # imm = 0xBFFF +; AVX512F-NEXT: kmovw %eax, %k4 +; AVX512F-NEXT: kandw %k4, %k1, %k1 +; AVX512F-NEXT: kmovw %k4, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill +; AVX512F-NEXT: movzbl 464(%rbp), %eax +; AVX512F-NEXT: kmovw %eax, %k2 +; AVX512F-NEXT: kshiftlw $14, %k2, %k2 +; AVX512F-NEXT: korw %k2, %k1, %k1 +; AVX512F-NEXT: kshiftlw $1, %k1, %k1 +; AVX512F-NEXT: kshiftrw $1, %k1, %k1 +; AVX512F-NEXT: movzbl 472(%rbp), %eax +; AVX512F-NEXT: kmovw %eax, %k2 +; AVX512F-NEXT: kshiftlw $15, %k2, %k2 +; AVX512F-NEXT: korw %k2, %k1, %k1 +; AVX512F-NEXT: kmovw %k1, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill +; AVX512F-NEXT: movzbl 224(%rbp), %eax +; AVX512F-NEXT: andl $1, %eax +; AVX512F-NEXT: movzbl 232(%rbp), %r10d +; AVX512F-NEXT: kmovw %r10d, %k1 +; AVX512F-NEXT: kshiftlw $15, %k1, %k1 +; AVX512F-NEXT: kshiftrw $14, %k1, %k1 +; AVX512F-NEXT: kmovw %eax, %k2 +; AVX512F-NEXT: korw %k1, %k2, %k1 +; AVX512F-NEXT: kmovw %k3, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill +; AVX512F-NEXT: kandw %k3, %k1, %k1 +; AVX512F-NEXT: movzbl 240(%rbp), %eax +; 
AVX512F-NEXT: kmovw %eax, %k2 +; AVX512F-NEXT: kshiftlw $15, %k2, %k2 +; AVX512F-NEXT: kshiftrw $13, %k2, %k2 +; AVX512F-NEXT: korw %k2, %k1, %k1 +; AVX512F-NEXT: kandw %k7, %k1, %k1 +; AVX512F-NEXT: movzbl 248(%rbp), %eax +; AVX512F-NEXT: kmovw %eax, %k2 +; AVX512F-NEXT: kshiftlw $15, %k2, %k2 +; AVX512F-NEXT: kshiftrw $12, %k2, %k2 +; AVX512F-NEXT: korw %k2, %k1, %k1 +; AVX512F-NEXT: kandw %k5, %k1, %k1 +; AVX512F-NEXT: movzbl 256(%rbp), %eax +; AVX512F-NEXT: kmovw %eax, %k2 +; AVX512F-NEXT: kshiftlw $15, %k2, %k2 +; AVX512F-NEXT: kshiftrw $11, %k2, %k2 +; AVX512F-NEXT: korw %k2, %k1, %k1 +; AVX512F-NEXT: kandw %k6, %k1, %k1 +; AVX512F-NEXT: movzbl 264(%rbp), %eax +; AVX512F-NEXT: kmovw %eax, %k2 +; AVX512F-NEXT: kshiftlw $15, %k2, %k2 +; AVX512F-NEXT: kshiftrw $10, %k2, %k2 +; AVX512F-NEXT: korw %k2, %k1, %k1 +; AVX512F-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k7 # 2-byte Reload +; AVX512F-NEXT: kandw %k7, %k1, %k1 +; AVX512F-NEXT: movzbl 272(%rbp), %eax +; AVX512F-NEXT: kmovw %eax, %k2 +; AVX512F-NEXT: kshiftlw $15, %k2, %k2 +; AVX512F-NEXT: kshiftrw $9, %k2, %k2 +; AVX512F-NEXT: korw %k2, %k1, %k0 +; AVX512F-NEXT: kmovw %k0, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill +; AVX512F-NEXT: movzbl 280(%rbp), %eax +; AVX512F-NEXT: kmovw %eax, %k1 +; AVX512F-NEXT: kmovw %k1, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill +; AVX512F-NEXT: kshiftlw $15, %k1, %k1 +; AVX512F-NEXT: kshiftrw $8, %k1, %k1 +; AVX512F-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload +; AVX512F-NEXT: kandw %k2, %k0, %k2 +; AVX512F-NEXT: korw %k1, %k2, %k1 +; AVX512F-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k0 # 2-byte Reload +; AVX512F-NEXT: kandw %k0, %k1, %k1 +; AVX512F-NEXT: movzbl 288(%rbp), %eax +; AVX512F-NEXT: kmovw %eax, %k0 +; AVX512F-NEXT: kmovw %k0, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill +; AVX512F-NEXT: kshiftlw $15, %k0, %k2 +; AVX512F-NEXT: kshiftrw $7, %k2, %k2 +; AVX512F-NEXT: korw %k2, %k1, %k1 +; AVX512F-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k0 # 2-byte Reload +; AVX512F-NEXT: 
kandw %k0, %k1, %k1 +; AVX512F-NEXT: movzbl 296(%rbp), %eax +; AVX512F-NEXT: kmovw %eax, %k2 +; AVX512F-NEXT: kshiftlw $15, %k2, %k0 +; AVX512F-NEXT: kmovw %k0, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill +; AVX512F-NEXT: kshiftrw $6, %k0, %k2 +; AVX512F-NEXT: korw %k2, %k1, %k1 +; AVX512F-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k0 # 2-byte Reload +; AVX512F-NEXT: kandw %k0, %k1, %k1 +; AVX512F-NEXT: movzbl 304(%rbp), %eax +; AVX512F-NEXT: kmovw %eax, %k2 +; AVX512F-NEXT: kshiftlw $15, %k2, %k0 +; AVX512F-NEXT: kmovw %k0, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill +; AVX512F-NEXT: kshiftrw $5, %k0, %k2 +; AVX512F-NEXT: korw %k2, %k1, %k1 +; AVX512F-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k0 # 2-byte Reload +; AVX512F-NEXT: kandw %k0, %k1, %k1 +; AVX512F-NEXT: movzbl 312(%rbp), %eax +; AVX512F-NEXT: kmovw %eax, %k2 +; AVX512F-NEXT: kshiftlw $15, %k2, %k0 +; AVX512F-NEXT: kmovw %k0, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill +; AVX512F-NEXT: kshiftrw $4, %k0, %k2 +; AVX512F-NEXT: korw %k2, %k1, %k1 +; AVX512F-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k0 # 2-byte Reload +; AVX512F-NEXT: kandw %k0, %k1, %k1 +; AVX512F-NEXT: movzbl 320(%rbp), %eax +; AVX512F-NEXT: kmovw %eax, %k2 +; AVX512F-NEXT: kshiftlw $15, %k2, %k0 +; AVX512F-NEXT: kmovw %k0, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill +; AVX512F-NEXT: kshiftrw $3, %k0, %k2 +; AVX512F-NEXT: korw %k2, %k1, %k1 +; AVX512F-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k0 # 2-byte Reload +; AVX512F-NEXT: kandw %k0, %k1, %k1 +; AVX512F-NEXT: movzbl 328(%rbp), %eax +; AVX512F-NEXT: kmovw %eax, %k2 +; AVX512F-NEXT: kshiftlw $15, %k2, %k2 +; AVX512F-NEXT: kmovw %k2, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill +; AVX512F-NEXT: kshiftrw $2, %k2, %k2 +; AVX512F-NEXT: korw %k2, %k1, %k1 +; AVX512F-NEXT: kandw %k4, %k1, %k1 +; AVX512F-NEXT: movzbl 336(%rbp), %eax +; AVX512F-NEXT: kmovw %eax, %k2 +; AVX512F-NEXT: kmovw %k2, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill +; AVX512F-NEXT: kshiftlw $14, %k2, %k2 +; AVX512F-NEXT: korw %k2, %k1, %k1 +; AVX512F-NEXT: 
kshiftlw $1, %k1, %k1 +; AVX512F-NEXT: kshiftrw $1, %k1, %k1 +; AVX512F-NEXT: movzbl 344(%rbp), %eax +; AVX512F-NEXT: kmovw %eax, %k2 +; AVX512F-NEXT: kmovw %k2, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill +; AVX512F-NEXT: kshiftlw $15, %k2, %k2 +; AVX512F-NEXT: korw %k2, %k1, %k1 +; AVX512F-NEXT: kmovw %k1, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill +; AVX512F-NEXT: movzbl 96(%rbp), %eax +; AVX512F-NEXT: andl $1, %eax +; AVX512F-NEXT: movzbl 104(%rbp), %r10d +; AVX512F-NEXT: kmovw %r10d, %k1 +; AVX512F-NEXT: kshiftlw $15, %k1, %k1 +; AVX512F-NEXT: kshiftrw $14, %k1, %k1 +; AVX512F-NEXT: kmovw %eax, %k2 +; AVX512F-NEXT: korw %k1, %k2, %k1 +; AVX512F-NEXT: kandw %k3, %k1, %k1 +; AVX512F-NEXT: movzbl 112(%rbp), %eax +; AVX512F-NEXT: kmovw %eax, %k2 +; AVX512F-NEXT: kshiftlw $15, %k2, %k2 +; AVX512F-NEXT: kshiftrw $13, %k2, %k2 +; AVX512F-NEXT: korw %k2, %k1, %k1 +; AVX512F-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload +; AVX512F-NEXT: kandw %k4, %k1, %k1 +; AVX512F-NEXT: movzbl 120(%rbp), %eax +; AVX512F-NEXT: kmovw %eax, %k2 +; AVX512F-NEXT: kshiftlw $15, %k2, %k2 +; AVX512F-NEXT: kshiftrw $12, %k2, %k2 +; AVX512F-NEXT: korw %k2, %k1, %k1 +; AVX512F-NEXT: kmovw %k5, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill +; AVX512F-NEXT: kandw %k5, %k1, %k1 +; AVX512F-NEXT: movzbl 128(%rbp), %eax +; AVX512F-NEXT: kmovw %eax, %k2 +; AVX512F-NEXT: kshiftlw $15, %k2, %k2 +; AVX512F-NEXT: kshiftrw $11, %k2, %k2 +; AVX512F-NEXT: korw %k2, %k1, %k1 +; AVX512F-NEXT: kmovw %k6, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill +; AVX512F-NEXT: kandw %k6, %k1, %k1 +; AVX512F-NEXT: movzbl 136(%rbp), %eax +; AVX512F-NEXT: kmovw %eax, %k2 +; AVX512F-NEXT: kshiftlw $15, %k2, %k2 +; AVX512F-NEXT: kshiftrw $10, %k2, %k2 +; AVX512F-NEXT: korw %k2, %k1, %k1 +; AVX512F-NEXT: kandw %k7, %k1, %k1 +; AVX512F-NEXT: movzbl 144(%rbp), %eax +; AVX512F-NEXT: kmovw %eax, %k2 +; AVX512F-NEXT: kshiftlw $15, %k2, %k2 +; AVX512F-NEXT: kshiftrw $9, %k2, %k2 +; AVX512F-NEXT: korw %k2, %k1, %k1 +; AVX512F-NEXT: kmovw 
{{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload +; AVX512F-NEXT: kandw %k2, %k1, %k1 +; AVX512F-NEXT: movzbl 152(%rbp), %eax +; AVX512F-NEXT: kmovw %eax, %k2 +; AVX512F-NEXT: kshiftlw $15, %k2, %k2 +; AVX512F-NEXT: kshiftrw $8, %k2, %k2 +; AVX512F-NEXT: korw %k2, %k1, %k1 +; AVX512F-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload +; AVX512F-NEXT: kandw %k3, %k1, %k1 +; AVX512F-NEXT: movzbl 160(%rbp), %eax +; AVX512F-NEXT: kmovw %eax, %k2 +; AVX512F-NEXT: kshiftlw $15, %k2, %k2 +; AVX512F-NEXT: kshiftrw $7, %k2, %k2 +; AVX512F-NEXT: korw %k2, %k1, %k1 +; AVX512F-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k7 # 2-byte Reload +; AVX512F-NEXT: kandw %k7, %k1, %k1 +; AVX512F-NEXT: movzbl 168(%rbp), %eax +; AVX512F-NEXT: kmovw %eax, %k2 +; AVX512F-NEXT: kshiftlw $15, %k2, %k2 +; AVX512F-NEXT: kshiftrw $6, %k2, %k2 +; AVX512F-NEXT: korw %k2, %k1, %k1 +; AVX512F-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload +; AVX512F-NEXT: kandw %k2, %k1, %k1 +; AVX512F-NEXT: movzbl 176(%rbp), %eax +; AVX512F-NEXT: kmovw %eax, %k2 +; AVX512F-NEXT: kshiftlw $15, %k2, %k2 +; AVX512F-NEXT: kshiftrw $5, %k2, %k2 +; AVX512F-NEXT: korw %k2, %k1, %k1 +; AVX512F-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload +; AVX512F-NEXT: kandw %k2, %k1, %k1 +; AVX512F-NEXT: movzbl 184(%rbp), %eax +; AVX512F-NEXT: kmovw %eax, %k2 +; AVX512F-NEXT: kshiftlw $15, %k2, %k2 +; AVX512F-NEXT: kshiftrw $4, %k2, %k2 +; AVX512F-NEXT: korw %k2, %k1, %k1 +; AVX512F-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload +; AVX512F-NEXT: kandw %k2, %k1, %k1 +; AVX512F-NEXT: movzbl 192(%rbp), %eax +; AVX512F-NEXT: kmovw %eax, %k2 +; AVX512F-NEXT: kshiftlw $15, %k2, %k2 +; AVX512F-NEXT: kshiftrw $3, %k2, %k2 +; AVX512F-NEXT: korw %k2, %k1, %k1 +; AVX512F-NEXT: kandw %k0, %k1, %k1 +; AVX512F-NEXT: movzbl 200(%rbp), %eax +; AVX512F-NEXT: kmovw %eax, %k2 +; AVX512F-NEXT: kshiftlw $15, %k2, %k2 +; AVX512F-NEXT: kshiftrw $2, %k2, %k2 +; AVX512F-NEXT: korw %k2, %k1, %k1 +; AVX512F-NEXT: kmovw 
{{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload +; AVX512F-NEXT: kandw %k2, %k1, %k1 +; AVX512F-NEXT: movzbl 208(%rbp), %eax +; AVX512F-NEXT: kmovw %eax, %k2 +; AVX512F-NEXT: kshiftlw $14, %k2, %k2 +; AVX512F-NEXT: korw %k2, %k1, %k1 +; AVX512F-NEXT: kshiftlw $1, %k1, %k1 +; AVX512F-NEXT: kshiftrw $1, %k1, %k1 +; AVX512F-NEXT: movzbl 216(%rbp), %eax +; AVX512F-NEXT: kmovw %eax, %k2 +; AVX512F-NEXT: kshiftlw $15, %k2, %k2 +; AVX512F-NEXT: korw %k2, %k1, %k1 +; AVX512F-NEXT: kmovw %k1, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill +; AVX512F-NEXT: andl $1, %edi +; AVX512F-NEXT: kmovw %esi, %k1 +; AVX512F-NEXT: kshiftlw $15, %k1, %k1 +; AVX512F-NEXT: kshiftrw $14, %k1, %k1 +; AVX512F-NEXT: kmovw %edi, %k2 +; AVX512F-NEXT: korw %k1, %k2, %k1 +; AVX512F-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload +; AVX512F-NEXT: kandw %k2, %k1, %k1 +; AVX512F-NEXT: kmovw %edx, %k2 +; AVX512F-NEXT: kshiftlw $15, %k2, %k2 +; AVX512F-NEXT: kshiftrw $13, %k2, %k2 +; AVX512F-NEXT: korw %k2, %k1, %k1 +; AVX512F-NEXT: kandw %k4, %k1, %k1 +; AVX512F-NEXT: kmovw %ecx, %k2 +; AVX512F-NEXT: kshiftlw $15, %k2, %k2 +; AVX512F-NEXT: kshiftrw $12, %k2, %k2 +; AVX512F-NEXT: korw %k2, %k1, %k1 +; AVX512F-NEXT: kandw %k5, %k1, %k1 +; AVX512F-NEXT: kmovw %r8d, %k2 +; AVX512F-NEXT: kshiftlw $15, %k2, %k2 +; AVX512F-NEXT: kshiftrw $11, %k2, %k2 +; AVX512F-NEXT: korw %k2, %k1, %k1 +; AVX512F-NEXT: kandw %k6, %k1, %k1 +; AVX512F-NEXT: kmovw %r9d, %k2 +; AVX512F-NEXT: kshiftlw $15, %k2, %k2 +; AVX512F-NEXT: kshiftrw $10, %k2, %k2 +; AVX512F-NEXT: korw %k2, %k1, %k1 +; AVX512F-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload +; AVX512F-NEXT: kandw %k2, %k1, %k1 +; AVX512F-NEXT: movzbl 16(%rbp), %eax +; AVX512F-NEXT: kmovw %eax, %k2 +; AVX512F-NEXT: kshiftlw $15, %k2, %k2 +; AVX512F-NEXT: kshiftrw $9, %k2, %k2 +; AVX512F-NEXT: korw %k2, %k1, %k2 +; AVX512F-NEXT: kmovw %k2, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill +; AVX512F-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload +; AVX512F-NEXT: 
kandw %k1, %k2, %k1 +; AVX512F-NEXT: movzbl 24(%rbp), %eax +; AVX512F-NEXT: kmovw %eax, %k2 +; AVX512F-NEXT: kmovw %k2, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill +; AVX512F-NEXT: kshiftlw $15, %k2, %k2 +; AVX512F-NEXT: kshiftrw $8, %k2, %k2 +; AVX512F-NEXT: korw %k2, %k1, %k1 +; AVX512F-NEXT: kandw %k3, %k1, %k1 +; AVX512F-NEXT: movzbl 32(%rbp), %eax +; AVX512F-NEXT: kmovw %eax, %k2 +; AVX512F-NEXT: kmovw %k2, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill +; AVX512F-NEXT: kshiftlw $15, %k2, %k2 +; AVX512F-NEXT: kshiftrw $7, %k2, %k2 +; AVX512F-NEXT: korw %k2, %k1, %k1 +; AVX512F-NEXT: kandw %k7, %k1, %k1 +; AVX512F-NEXT: movzbl 40(%rbp), %eax +; AVX512F-NEXT: kmovw %eax, %k2 +; AVX512F-NEXT: kshiftlw $15, %k2, %k2 +; AVX512F-NEXT: kmovw %k2, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill +; AVX512F-NEXT: kshiftrw $6, %k2, %k2 +; AVX512F-NEXT: korw %k2, %k1, %k1 +; AVX512F-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload +; AVX512F-NEXT: kandw %k2, %k1, %k1 +; AVX512F-NEXT: movzbl 48(%rbp), %eax +; AVX512F-NEXT: kmovw %eax, %k2 +; AVX512F-NEXT: kshiftlw $15, %k2, %k5 +; AVX512F-NEXT: kshiftrw $5, %k5, %k2 +; AVX512F-NEXT: korw %k2, %k1, %k1 +; AVX512F-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload +; AVX512F-NEXT: kandw %k2, %k1, %k1 +; AVX512F-NEXT: movzbl 56(%rbp), %eax +; AVX512F-NEXT: kmovw %eax, %k2 +; AVX512F-NEXT: kshiftlw $15, %k2, %k4 +; AVX512F-NEXT: kshiftrw $4, %k4, %k2 +; AVX512F-NEXT: korw %k2, %k1, %k1 +; AVX512F-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload +; AVX512F-NEXT: kandw %k2, %k1, %k1 +; AVX512F-NEXT: movzbl 64(%rbp), %eax +; AVX512F-NEXT: kmovw %eax, %k2 +; AVX512F-NEXT: kshiftlw $15, %k2, %k3 +; AVX512F-NEXT: kshiftrw $3, %k3, %k2 +; AVX512F-NEXT: korw %k2, %k1, %k1 +; AVX512F-NEXT: kandw %k0, %k1, %k1 +; AVX512F-NEXT: movzbl 72(%rbp), %eax +; AVX512F-NEXT: kmovw %eax, %k2 +; AVX512F-NEXT: kshiftlw $15, %k2, %k2 +; AVX512F-NEXT: kshiftrw $2, %k2, %k0 +; AVX512F-NEXT: korw %k0, %k1, %k0 +; AVX512F-NEXT: kmovw 
{{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload +; AVX512F-NEXT: kandw %k1, %k0, %k0 +; AVX512F-NEXT: movzbl 80(%rbp), %eax +; AVX512F-NEXT: kmovw %eax, %k1 +; AVX512F-NEXT: kshiftlw $14, %k1, %k7 +; AVX512F-NEXT: korw %k7, %k0, %k0 +; AVX512F-NEXT: kshiftlw $1, %k0, %k0 +; AVX512F-NEXT: kshiftrw $1, %k0, %k7 +; AVX512F-NEXT: movzbl 88(%rbp), %eax +; AVX512F-NEXT: kmovw %eax, %k0 +; AVX512F-NEXT: kshiftlw $15, %k0, %k6 +; AVX512F-NEXT: korw %k6, %k7, %k6 +; AVX512F-NEXT: kmovw %k6, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill +; AVX512F-NEXT: movw $-3, %ax +; AVX512F-NEXT: kmovw %eax, %k6 +; AVX512F-NEXT: kmovw %k6, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill +; AVX512F-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k7 # 2-byte Reload +; AVX512F-NEXT: kandw %k6, %k7, %k6 +; AVX512F-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k7 # 2-byte Reload +; AVX512F-NEXT: kshiftrw $14, %k7, %k7 +; AVX512F-NEXT: korw %k7, %k6, %k6 +; AVX512F-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k7 # 2-byte Reload +; AVX512F-NEXT: kandw %k7, %k6, %k6 +; AVX512F-NEXT: kshiftrw $13, %k5, %k5 +; AVX512F-NEXT: korw %k5, %k6, %k5 +; AVX512F-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload +; AVX512F-NEXT: kandw %k6, %k5, %k5 +; AVX512F-NEXT: kshiftrw $12, %k4, %k4 +; AVX512F-NEXT: korw %k4, %k5, %k4 +; AVX512F-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k5 # 2-byte Reload +; AVX512F-NEXT: kandw %k5, %k4, %k4 +; AVX512F-NEXT: kshiftrw $11, %k3, %k3 +; AVX512F-NEXT: korw %k3, %k4, %k3 +; AVX512F-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload +; AVX512F-NEXT: kandw %k4, %k3, %k3 +; AVX512F-NEXT: kshiftrw $10, %k2, %k2 +; AVX512F-NEXT: korw %k2, %k3, %k2 +; AVX512F-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload +; AVX512F-NEXT: kandw %k3, %k2, %k2 +; AVX512F-NEXT: kshiftlw $6, %k1, %k1 +; AVX512F-NEXT: korw %k1, %k2, %k1 +; AVX512F-NEXT: kshiftlw $9, %k1, %k1 +; AVX512F-NEXT: kshiftrw $9, %k1, %k1 +; AVX512F-NEXT: kshiftlw $7, %k0, %k0 +; AVX512F-NEXT: korw %k0, %k1, %k0 +; AVX512F-NEXT: kmovw 
{{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload +; AVX512F-NEXT: kshiftlw $9, %k1, %k1 +; AVX512F-NEXT: kshiftrw $9, %k1, %k1 +; AVX512F-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload +; AVX512F-NEXT: kshiftlw $7, %k2, %k2 +; AVX512F-NEXT: korw %k2, %k1, %k1 +; AVX512F-NEXT: kxorw %k0, %k1, %k0 +; AVX512F-NEXT: kshiftrw $4, %k0, %k1 +; AVX512F-NEXT: kxorw %k1, %k0, %k0 +; AVX512F-NEXT: kshiftrw $2, %k0, %k1 +; AVX512F-NEXT: kxorw %k1, %k0, %k0 +; AVX512F-NEXT: kshiftrw $1, %k0, %k1 +; AVX512F-NEXT: kxorw %k1, %k0, %k0 +; AVX512F-NEXT: kmovw %k0, %eax +; AVX512F-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k0 # 2-byte Reload +; AVX512F-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload +; AVX512F-NEXT: kandw %k1, %k0, %k0 +; AVX512F-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload +; AVX512F-NEXT: kshiftrw $14, %k1, %k1 +; AVX512F-NEXT: korw %k1, %k0, %k0 +; AVX512F-NEXT: kandw %k7, %k0, %k0 +; AVX512F-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload +; AVX512F-NEXT: kshiftrw $13, %k1, %k1 +; AVX512F-NEXT: korw %k1, %k0, %k0 +; AVX512F-NEXT: kandw %k6, %k0, %k0 +; AVX512F-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload +; AVX512F-NEXT: kshiftrw $12, %k1, %k1 +; AVX512F-NEXT: korw %k1, %k0, %k0 +; AVX512F-NEXT: kandw %k5, %k0, %k0 +; AVX512F-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload +; AVX512F-NEXT: kshiftrw $11, %k1, %k1 +; AVX512F-NEXT: korw %k1, %k0, %k0 +; AVX512F-NEXT: kandw %k4, %k0, %k0 +; AVX512F-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload +; AVX512F-NEXT: kshiftrw $10, %k1, %k1 +; AVX512F-NEXT: korw %k1, %k0, %k0 +; AVX512F-NEXT: kandw %k3, %k0, %k0 +; AVX512F-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload +; AVX512F-NEXT: kshiftlw $6, %k1, %k1 +; AVX512F-NEXT: korw %k1, %k0, %k0 +; AVX512F-NEXT: kshiftlw $9, %k0, %k0 +; AVX512F-NEXT: kshiftrw $9, %k0, %k0 +; AVX512F-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload +; AVX512F-NEXT: kshiftlw $7, %k1, %k1 +; AVX512F-NEXT: 
korw %k1, %k0, %k0 +; AVX512F-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload +; AVX512F-NEXT: kshiftlw $9, %k1, %k1 +; AVX512F-NEXT: kshiftrw $9, %k1, %k1 +; AVX512F-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload +; AVX512F-NEXT: kshiftlw $7, %k2, %k2 +; AVX512F-NEXT: korw %k2, %k1, %k1 +; AVX512F-NEXT: kxorw %k0, %k1, %k0 +; AVX512F-NEXT: kshiftrw $4, %k0, %k1 +; AVX512F-NEXT: kxorw %k1, %k0, %k0 +; AVX512F-NEXT: kshiftrw $2, %k0, %k1 +; AVX512F-NEXT: kxorw %k1, %k0, %k0 +; AVX512F-NEXT: kshiftrw $1, %k0, %k1 +; AVX512F-NEXT: kxorw %k1, %k0, %k0 +; AVX512F-NEXT: kmovw %k0, %ecx +; AVX512F-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload +; AVX512F-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload +; AVX512F-NEXT: kxorw %k2, %k3, %k0 +; AVX512F-NEXT: kshiftrw $8, %k0, %k1 +; AVX512F-NEXT: kxorw %k1, %k0, %k0 +; AVX512F-NEXT: kshiftrw $4, %k0, %k1 +; AVX512F-NEXT: kxorw %k1, %k0, %k0 +; AVX512F-NEXT: kshiftrw $2, %k0, %k1 +; AVX512F-NEXT: kxorw %k1, %k0, %k0 +; AVX512F-NEXT: kshiftrw $1, %k0, %k1 +; AVX512F-NEXT: kxorw %k1, %k0, %k0 +; AVX512F-NEXT: kmovw %k0, %edx +; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm2 +; AVX512F-NEXT: vpmovzxbd {{.*#+}} zmm3 = xmm2[0],zero,zero,zero,xmm2[1],zero,zero,zero,xmm2[2],zero,zero,zero,xmm2[3],zero,zero,zero,xmm2[4],zero,zero,zero,xmm2[5],zero,zero,zero,xmm2[6],zero,zero,zero,xmm2[7],zero,zero,zero,xmm2[8],zero,zero,zero,xmm2[9],zero,zero,zero,xmm2[10],zero,zero,zero,xmm2[11],zero,zero,zero,xmm2[12],zero,zero,zero,xmm2[13],zero,zero,zero,xmm2[14],zero,zero,zero,xmm2[15],zero,zero,zero +; AVX512F-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload +; AVX512F-NEXT: vpcompressd %zmm3, %zmm3 {%k1} {z} +; AVX512F-NEXT: vpternlogd {{.*#+}} zmm4 {%k1} {z} = -1 +; AVX512F-NEXT: vextracti128 $1, %ymm2, %xmm2 +; AVX512F-NEXT: vpmovzxbd {{.*#+}} zmm2 = 
xmm2[0],zero,zero,zero,xmm2[1],zero,zero,zero,xmm2[2],zero,zero,zero,xmm2[3],zero,zero,zero,xmm2[4],zero,zero,zero,xmm2[5],zero,zero,zero,xmm2[6],zero,zero,zero,xmm2[7],zero,zero,zero,xmm2[8],zero,zero,zero,xmm2[9],zero,zero,zero,xmm2[10],zero,zero,zero,xmm2[11],zero,zero,zero,xmm2[12],zero,zero,zero,xmm2[13],zero,zero,zero,xmm2[14],zero,zero,zero,xmm2[15],zero,zero,zero +; AVX512F-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload +; AVX512F-NEXT: vpcompressd %zmm2, %zmm2 {%k1} {z} +; AVX512F-NEXT: vpternlogd {{.*#+}} zmm5 {%k1} {z} = -1 +; AVX512F-NEXT: vpmovzxbd {{.*#+}} zmm6 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero,xmm0[8],zero,zero,zero,xmm0[9],zero,zero,zero,xmm0[10],zero,zero,zero,xmm0[11],zero,zero,zero,xmm0[12],zero,zero,zero,xmm0[13],zero,zero,zero,xmm0[14],zero,zero,zero,xmm0[15],zero,zero,zero +; AVX512F-NEXT: vpcompressd %zmm6, %zmm6 {%k3} {z} +; AVX512F-NEXT: vpternlogd {{.*#+}} zmm7 {%k3} {z} = -1 +; AVX512F-NEXT: vextracti128 $1, %ymm0, %xmm0 +; AVX512F-NEXT: vpmovzxbd {{.*#+}} zmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero,xmm0[8],zero,zero,zero,xmm0[9],zero,zero,zero,xmm0[10],zero,zero,zero,xmm0[11],zero,zero,zero,xmm0[12],zero,zero,zero,xmm0[13],zero,zero,zero,xmm0[14],zero,zero,zero,xmm0[15],zero,zero,zero +; AVX512F-NEXT: vpcompressd %zmm0, %zmm0 {%k2} {z} +; AVX512F-NEXT: vpternlogd {{.*#+}} zmm8 {%k2} {z} = -1 +; AVX512F-NEXT: vpmovdb %zmm6, {{[0-9]+}}(%rsp) +; AVX512F-NEXT: andl $31, %eax +; AVX512F-NEXT: vpmovdb %zmm0, 64(%rsp,%rax) +; AVX512F-NEXT: vpmovdb %zmm3, {{[0-9]+}}(%rsp) +; AVX512F-NEXT: andl $31, %ecx +; AVX512F-NEXT: vpmovdb %zmm2, 96(%rsp,%rcx) +; AVX512F-NEXT: vmovaps {{[0-9]+}}(%rsp), %ymm0 +; AVX512F-NEXT: vmovaps %ymm0, 
{{[0-9]+}}(%rsp) +; AVX512F-NEXT: andl $63, %edx +; AVX512F-NEXT: vmovaps {{[0-9]+}}(%rsp), %ymm0 +; AVX512F-NEXT: vmovaps %ymm0, 128(%rsp,%rdx) +; AVX512F-NEXT: vpmovdb %zmm4, %xmm0 +; AVX512F-NEXT: vpmovdb %zmm5, %xmm2 +; AVX512F-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm0 +; AVX512F-NEXT: vextracti64x4 $1, %zmm1, %ymm2 +; AVX512F-NEXT: vpblendvb %ymm0, {{[0-9]+}}(%rsp), %ymm2, %ymm0 +; AVX512F-NEXT: vpmovdb %zmm7, %xmm2 +; AVX512F-NEXT: vpmovdb %zmm8, %xmm3 +; AVX512F-NEXT: vinserti128 $1, %xmm3, %ymm2, %ymm2 +; AVX512F-NEXT: vpblendvb %ymm2, {{[0-9]+}}(%rsp), %ymm1, %ymm1 +; AVX512F-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0 +; AVX512F-NEXT: movq %rbp, %rsp +; AVX512F-NEXT: popq %rbp +; AVX512F-NEXT: retq +; +; AVX512VL-LABEL: test_compress_v64i8: +; AVX512VL: # %bb.0: +; AVX512VL-NEXT: vpsllw $7, %zmm1, %zmm1 +; AVX512VL-NEXT: vpmovb2m %zmm1, %k1 +; AVX512VL-NEXT: vpcompressb %zmm0, %zmm2 {%k1} +; AVX512VL-NEXT: vmovdqa64 %zmm2, %zmm0 +; AVX512VL-NEXT: retq + %out = call <64 x i8> @llvm.experimental.vector.compress(<64 x i8> %vec, <64 x i1> %mask, <64 x i8> %passthru) + ret <64 x i8> %out +} + +define <32 x i16> @test_compress_v32i16(<32 x i16> %vec, <32 x i1> %mask, <32 x i16> %passthru) nounwind { +; AVX2-LABEL: test_compress_v32i16: +; AVX2: # %bb.0: +; AVX2-NEXT: pushq %rbp +; AVX2-NEXT: movq %rsp, %rbp +; AVX2-NEXT: pushq %r15 +; AVX2-NEXT: pushq %r14 +; AVX2-NEXT: pushq %r13 +; AVX2-NEXT: pushq %r12 +; AVX2-NEXT: pushq %rbx +; AVX2-NEXT: andq $-32, %rsp +; AVX2-NEXT: subq $256, %rsp # imm = 0x100 +; AVX2-NEXT: vmovaps %ymm4, {{[0-9]+}}(%rsp) +; AVX2-NEXT: vmovaps %ymm3, (%rsp) +; AVX2-NEXT: vextracti128 $1, %ymm2, %xmm3 +; AVX2-NEXT: vpmovzxbw {{.*#+}} ymm4 = xmm3[0],zero,xmm3[1],zero,xmm3[2],zero,xmm3[3],zero,xmm3[4],zero,xmm3[5],zero,xmm3[6],zero,xmm3[7],zero,xmm3[8],zero,xmm3[9],zero,xmm3[10],zero,xmm3[11],zero,xmm3[12],zero,xmm3[13],zero,xmm3[14],zero,xmm3[15],zero +; AVX2-NEXT: vpbroadcastw {{.*#+}} ymm5 = [1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1] +; 
AVX2-NEXT: vpand %ymm5, %ymm4, %ymm4 +; AVX2-NEXT: vpmovzxbw {{.*#+}} ymm6 = xmm2[0],zero,xmm2[1],zero,xmm2[2],zero,xmm2[3],zero,xmm2[4],zero,xmm2[5],zero,xmm2[6],zero,xmm2[7],zero,xmm2[8],zero,xmm2[9],zero,xmm2[10],zero,xmm2[11],zero,xmm2[12],zero,xmm2[13],zero,xmm2[14],zero,xmm2[15],zero +; AVX2-NEXT: vpand %ymm5, %ymm6, %ymm5 +; AVX2-NEXT: vpaddw %ymm4, %ymm5, %ymm4 +; AVX2-NEXT: vextracti128 $1, %ymm4, %xmm5 +; AVX2-NEXT: vpaddw %xmm5, %xmm4, %xmm4 +; AVX2-NEXT: vpextrw $1, %xmm4, %eax +; AVX2-NEXT: vmovd %xmm4, %ecx +; AVX2-NEXT: addl %eax, %ecx +; AVX2-NEXT: vpextrw $2, %xmm4, %eax +; AVX2-NEXT: vpextrw $3, %xmm4, %edx +; AVX2-NEXT: addl %eax, %edx +; AVX2-NEXT: addl %ecx, %edx +; AVX2-NEXT: vpextrw $4, %xmm4, %eax +; AVX2-NEXT: vpextrw $5, %xmm4, %ecx +; AVX2-NEXT: addl %eax, %ecx +; AVX2-NEXT: vpextrw $6, %xmm4, %eax +; AVX2-NEXT: addl %ecx, %eax +; AVX2-NEXT: addl %edx, %eax +; AVX2-NEXT: vpextrw $7, %xmm4, %ecx +; AVX2-NEXT: addl %eax, %ecx +; AVX2-NEXT: andl $31, %ecx +; AVX2-NEXT: movq %rcx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; AVX2-NEXT: vpextrb $1, %xmm2, %eax +; AVX2-NEXT: andl $1, %eax +; AVX2-NEXT: vmovd %xmm2, %ecx +; AVX2-NEXT: andl $1, %ecx +; AVX2-NEXT: movq %rcx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; AVX2-NEXT: addq %rcx, %rax +; AVX2-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; AVX2-NEXT: vpextrb $2, %xmm2, %ecx +; AVX2-NEXT: andl $1, %ecx +; AVX2-NEXT: addq %rax, %rcx +; AVX2-NEXT: movq %rcx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; AVX2-NEXT: vpextrb $3, %xmm2, %eax +; AVX2-NEXT: andl $1, %eax +; AVX2-NEXT: addq %rcx, %rax +; AVX2-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; AVX2-NEXT: vpextrb $4, %xmm2, %ecx +; AVX2-NEXT: andl $1, %ecx +; AVX2-NEXT: addq %rax, %rcx +; AVX2-NEXT: movq %rcx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; AVX2-NEXT: vpextrb $5, %xmm2, %eax +; AVX2-NEXT: andl $1, %eax +; AVX2-NEXT: addq %rcx, %rax +; AVX2-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; 
AVX2-NEXT: vpextrb $6, %xmm2, %ecx +; AVX2-NEXT: andl $1, %ecx +; AVX2-NEXT: addq %rax, %rcx +; AVX2-NEXT: movq %rcx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; AVX2-NEXT: vpextrb $7, %xmm2, %eax +; AVX2-NEXT: andl $1, %eax +; AVX2-NEXT: addq %rcx, %rax +; AVX2-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; AVX2-NEXT: vpextrb $8, %xmm2, %ecx +; AVX2-NEXT: andl $1, %ecx +; AVX2-NEXT: addq %rax, %rcx +; AVX2-NEXT: movq %rcx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; AVX2-NEXT: vpextrb $9, %xmm2, %eax +; AVX2-NEXT: andl $1, %eax +; AVX2-NEXT: addq %rcx, %rax +; AVX2-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; AVX2-NEXT: vpextrb $10, %xmm2, %ecx +; AVX2-NEXT: andl $1, %ecx +; AVX2-NEXT: addq %rax, %rcx +; AVX2-NEXT: movq %rcx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; AVX2-NEXT: vpextrb $11, %xmm2, %eax +; AVX2-NEXT: andl $1, %eax +; AVX2-NEXT: addq %rcx, %rax +; AVX2-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; AVX2-NEXT: vpextrb $12, %xmm2, %ecx +; AVX2-NEXT: andl $1, %ecx +; AVX2-NEXT: addq %rax, %rcx +; AVX2-NEXT: movq %rcx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; AVX2-NEXT: vpextrb $13, %xmm2, %eax +; AVX2-NEXT: andl $1, %eax +; AVX2-NEXT: addq %rcx, %rax +; AVX2-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; AVX2-NEXT: vpextrb $14, %xmm2, %ecx +; AVX2-NEXT: andl $1, %ecx +; AVX2-NEXT: addq %rax, %rcx +; AVX2-NEXT: movq %rcx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; AVX2-NEXT: vpextrb $15, %xmm2, %eax +; AVX2-NEXT: andl $1, %eax +; AVX2-NEXT: addq %rcx, %rax +; AVX2-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; AVX2-NEXT: vmovd %xmm3, %ecx +; AVX2-NEXT: andl $1, %ecx +; AVX2-NEXT: addq %rax, %rcx +; AVX2-NEXT: movq %rcx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; AVX2-NEXT: vpextrb $1, %xmm3, %eax +; AVX2-NEXT: andl $1, %eax +; AVX2-NEXT: addq %rcx, %rax +; AVX2-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; AVX2-NEXT: vpextrb $2, %xmm3, %ecx +; AVX2-NEXT: andl $1, %ecx +; 
AVX2-NEXT: addq %rax, %rcx +; AVX2-NEXT: movq %rcx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; AVX2-NEXT: vpextrb $3, %xmm3, %r12d +; AVX2-NEXT: andl $1, %r12d +; AVX2-NEXT: addq %rcx, %r12 +; AVX2-NEXT: vpextrb $4, %xmm3, %r15d +; AVX2-NEXT: andl $1, %r15d +; AVX2-NEXT: addq %r12, %r15 +; AVX2-NEXT: vpextrb $5, %xmm3, %r14d +; AVX2-NEXT: andl $1, %r14d +; AVX2-NEXT: addq %r15, %r14 +; AVX2-NEXT: vpextrb $6, %xmm3, %ebx +; AVX2-NEXT: andl $1, %ebx +; AVX2-NEXT: addq %r14, %rbx +; AVX2-NEXT: vpextrb $7, %xmm3, %r11d +; AVX2-NEXT: andl $1, %r11d +; AVX2-NEXT: addq %rbx, %r11 +; AVX2-NEXT: vpextrb $8, %xmm3, %r10d +; AVX2-NEXT: andl $1, %r10d +; AVX2-NEXT: addq %r11, %r10 +; AVX2-NEXT: vpextrb $9, %xmm3, %r9d +; AVX2-NEXT: andl $1, %r9d +; AVX2-NEXT: addq %r10, %r9 +; AVX2-NEXT: vpextrb $10, %xmm3, %r8d +; AVX2-NEXT: andl $1, %r8d +; AVX2-NEXT: addq %r9, %r8 +; AVX2-NEXT: vpextrb $11, %xmm3, %edi +; AVX2-NEXT: andl $1, %edi +; AVX2-NEXT: addq %r8, %rdi +; AVX2-NEXT: vpextrb $12, %xmm3, %esi +; AVX2-NEXT: andl $1, %esi +; AVX2-NEXT: addq %rdi, %rsi +; AVX2-NEXT: vpextrb $13, %xmm3, %edx +; AVX2-NEXT: andl $1, %edx +; AVX2-NEXT: addq %rsi, %rdx +; AVX2-NEXT: vpextrb $14, %xmm3, %ecx +; AVX2-NEXT: andl $1, %ecx +; AVX2-NEXT: addq %rdx, %rcx +; AVX2-NEXT: vpextrb $15, %xmm3, %eax +; AVX2-NEXT: andl $1, %eax +; AVX2-NEXT: addq %rcx, %rax +; AVX2-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; AVX2-NEXT: vextracti128 $1, %ymm1, %xmm2 +; AVX2-NEXT: cmpq $32, %rax +; AVX2-NEXT: vpextrw $7, %xmm2, %eax +; AVX2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r13 # 8-byte Reload +; AVX2-NEXT: cmovbw (%rsp,%r13,2), %ax +; AVX2-NEXT: movl %eax, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; AVX2-NEXT: vpextrw $0, %xmm0, (%rsp) +; AVX2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r13 # 8-byte Reload +; AVX2-NEXT: vpextrw $1, %xmm0, (%rsp,%r13,2) +; AVX2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r13 # 8-byte Reload +; AVX2-NEXT: vpextrw $2, %xmm0, (%rsp,%r13,2) +; AVX2-NEXT: movq 
{{[-0-9]+}}(%r{{[sb]}}p), %r13 # 8-byte Reload +; AVX2-NEXT: vpextrw $3, %xmm0, (%rsp,%r13,2) +; AVX2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r13 # 8-byte Reload +; AVX2-NEXT: vpextrw $4, %xmm0, (%rsp,%r13,2) +; AVX2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r13 # 8-byte Reload +; AVX2-NEXT: vpextrw $5, %xmm0, (%rsp,%r13,2) +; AVX2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r13 # 8-byte Reload +; AVX2-NEXT: andl $31, %r13d +; AVX2-NEXT: vpextrw $6, %xmm0, (%rsp,%r13,2) +; AVX2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r13 # 8-byte Reload +; AVX2-NEXT: andl $31, %r13d +; AVX2-NEXT: vpextrw $7, %xmm0, (%rsp,%r13,2) +; AVX2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r13 # 8-byte Reload +; AVX2-NEXT: andl $31, %r13d +; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm0 +; AVX2-NEXT: vpextrw $0, %xmm0, (%rsp,%r13,2) +; AVX2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r13 # 8-byte Reload +; AVX2-NEXT: andl $31, %r13d +; AVX2-NEXT: vpextrw $1, %xmm0, (%rsp,%r13,2) +; AVX2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r13 # 8-byte Reload +; AVX2-NEXT: andl $31, %r13d +; AVX2-NEXT: vpextrw $2, %xmm0, (%rsp,%r13,2) +; AVX2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r13 # 8-byte Reload +; AVX2-NEXT: andl $31, %r13d +; AVX2-NEXT: vpextrw $3, %xmm0, (%rsp,%r13,2) +; AVX2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r13 # 8-byte Reload +; AVX2-NEXT: andl $31, %r13d +; AVX2-NEXT: vpextrw $4, %xmm0, (%rsp,%r13,2) +; AVX2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r13 # 8-byte Reload +; AVX2-NEXT: andl $31, %r13d +; AVX2-NEXT: vpextrw $5, %xmm0, (%rsp,%r13,2) +; AVX2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r13 # 8-byte Reload +; AVX2-NEXT: andl $31, %r13d +; AVX2-NEXT: vpextrw $6, %xmm0, (%rsp,%r13,2) +; AVX2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r13 # 8-byte Reload +; AVX2-NEXT: andl $31, %r13d +; AVX2-NEXT: vpextrw $7, %xmm0, (%rsp,%r13,2) +; AVX2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r13 # 8-byte Reload +; AVX2-NEXT: andl $31, %r13d +; AVX2-NEXT: vpextrw $0, %xmm1, (%rsp,%r13,2) +; AVX2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r13 # 8-byte 
Reload +; AVX2-NEXT: andl $31, %r13d +; AVX2-NEXT: vpextrw $1, %xmm1, (%rsp,%r13,2) +; AVX2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r13 # 8-byte Reload +; AVX2-NEXT: andl $31, %r13d +; AVX2-NEXT: vpextrw $2, %xmm1, (%rsp,%r13,2) +; AVX2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload +; AVX2-NEXT: andl $31, %eax +; AVX2-NEXT: vpextrw $3, %xmm1, (%rsp,%rax,2) +; AVX2-NEXT: andl $31, %r12d +; AVX2-NEXT: vpextrw $4, %xmm1, (%rsp,%r12,2) +; AVX2-NEXT: andl $31, %r15d +; AVX2-NEXT: vpextrw $5, %xmm1, (%rsp,%r15,2) +; AVX2-NEXT: andl $31, %r14d +; AVX2-NEXT: vpextrw $6, %xmm1, (%rsp,%r14,2) +; AVX2-NEXT: andl $31, %ebx +; AVX2-NEXT: vpextrw $7, %xmm1, (%rsp,%rbx,2) +; AVX2-NEXT: andl $31, %r11d +; AVX2-NEXT: vpextrw $0, %xmm2, (%rsp,%r11,2) +; AVX2-NEXT: andl $31, %r10d +; AVX2-NEXT: vpextrw $1, %xmm2, (%rsp,%r10,2) +; AVX2-NEXT: andl $31, %r9d +; AVX2-NEXT: vpextrw $2, %xmm2, (%rsp,%r9,2) +; AVX2-NEXT: andl $31, %r8d +; AVX2-NEXT: vpextrw $3, %xmm2, (%rsp,%r8,2) +; AVX2-NEXT: andl $31, %edi +; AVX2-NEXT: vpextrw $4, %xmm2, (%rsp,%rdi,2) +; AVX2-NEXT: andl $31, %esi +; AVX2-NEXT: vpextrw $5, %xmm2, (%rsp,%rsi,2) +; AVX2-NEXT: andl $31, %edx +; AVX2-NEXT: vpextrw $6, %xmm2, (%rsp,%rdx,2) +; AVX2-NEXT: andl $31, %ecx +; AVX2-NEXT: vpextrw $7, %xmm2, (%rsp,%rcx,2) +; AVX2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rcx # 8-byte Reload +; AVX2-NEXT: cmpq $31, %rcx +; AVX2-NEXT: movl $31, %eax +; AVX2-NEXT: cmovbq %rcx, %rax +; AVX2-NEXT: movl %eax, %eax +; AVX2-NEXT: movl {{[-0-9]+}}(%r{{[sb]}}p), %ecx # 4-byte Reload +; AVX2-NEXT: movw %cx, (%rsp,%rax,2) +; AVX2-NEXT: vmovaps (%rsp), %ymm0 +; AVX2-NEXT: vmovaps {{[0-9]+}}(%rsp), %ymm1 +; AVX2-NEXT: leaq -40(%rbp), %rsp +; AVX2-NEXT: popq %rbx +; AVX2-NEXT: popq %r12 +; AVX2-NEXT: popq %r13 +; AVX2-NEXT: popq %r14 +; AVX2-NEXT: popq %r15 +; AVX2-NEXT: popq %rbp +; AVX2-NEXT: retq +; +; AVX512F-LABEL: test_compress_v32i16: +; AVX512F: # %bb.0: +; AVX512F-NEXT: pushq %rbp +; AVX512F-NEXT: movq %rsp, %rbp +; AVX512F-NEXT: 
andq $-64, %rsp +; AVX512F-NEXT: subq $128, %rsp +; AVX512F-NEXT: vpmovzxbw {{.*#+}} ymm3 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero,xmm1[8],zero,xmm1[9],zero,xmm1[10],zero,xmm1[11],zero,xmm1[12],zero,xmm1[13],zero,xmm1[14],zero,xmm1[15],zero +; AVX512F-NEXT: vextracti128 $1, %ymm1, %xmm5 +; AVX512F-NEXT: vpmovzxbw {{.*#+}} ymm4 = xmm5[0],zero,xmm5[1],zero,xmm5[2],zero,xmm5[3],zero,xmm5[4],zero,xmm5[5],zero,xmm5[6],zero,xmm5[7],zero,xmm5[8],zero,xmm5[9],zero,xmm5[10],zero,xmm5[11],zero,xmm5[12],zero,xmm5[13],zero,xmm5[14],zero,xmm5[15],zero +; AVX512F-NEXT: vpmovsxbd %xmm5, %zmm5 +; AVX512F-NEXT: vpslld $31, %zmm5, %zmm5 +; AVX512F-NEXT: vptestmd %zmm5, %zmm5, %k1 +; AVX512F-NEXT: vpmovsxbd %xmm1, %zmm1 +; AVX512F-NEXT: vpslld $31, %zmm1, %zmm1 +; AVX512F-NEXT: vptestmd %zmm1, %zmm1, %k2 +; AVX512F-NEXT: vpmovzxwd {{.*#+}} zmm1 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero +; AVX512F-NEXT: vpcompressd %zmm1, %zmm1 {%k2} {z} +; AVX512F-NEXT: vpmovdw %zmm1, (%rsp) +; AVX512F-NEXT: kshiftrw $8, %k2, %k0 +; AVX512F-NEXT: kxorw %k0, %k2, %k0 +; AVX512F-NEXT: kshiftrw $4, %k0, %k2 +; AVX512F-NEXT: kxorw %k2, %k0, %k0 +; AVX512F-NEXT: kshiftrw $2, %k0, %k2 +; AVX512F-NEXT: kxorw %k2, %k0, %k0 +; AVX512F-NEXT: kshiftrw $1, %k0, %k2 +; AVX512F-NEXT: kxorw %k2, %k0, %k0 +; AVX512F-NEXT: kmovw %k0, %eax +; AVX512F-NEXT: andl $31, %eax +; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm0 +; AVX512F-NEXT: vpmovzxwd {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero +; AVX512F-NEXT: vpcompressd %zmm0, %zmm0 {%k1} {z} +; AVX512F-NEXT: vpmovdw %zmm0, (%rsp,%rax,2) +; AVX512F-NEXT: 
vextracti64x4 $1, %zmm2, %ymm0 +; AVX512F-NEXT: vpsllw $15, %ymm4, %ymm1 +; AVX512F-NEXT: vpsraw $15, %ymm1, %ymm1 +; AVX512F-NEXT: vpblendvb %ymm1, {{[0-9]+}}(%rsp), %ymm0, %ymm0 +; AVX512F-NEXT: vpsllw $15, %ymm3, %ymm1 +; AVX512F-NEXT: vpsraw $15, %ymm1, %ymm1 +; AVX512F-NEXT: vpblendvb %ymm1, (%rsp), %ymm2, %ymm1 +; AVX512F-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0 +; AVX512F-NEXT: movq %rbp, %rsp +; AVX512F-NEXT: popq %rbp +; AVX512F-NEXT: retq +; +; AVX512VL-LABEL: test_compress_v32i16: +; AVX512VL: # %bb.0: +; AVX512VL-NEXT: vpsllw $7, %ymm1, %ymm1 +; AVX512VL-NEXT: vpmovb2m %ymm1, %k1 +; AVX512VL-NEXT: vpcompressw %zmm0, %zmm2 {%k1} +; AVX512VL-NEXT: vmovdqa64 %zmm2, %zmm0 +; AVX512VL-NEXT: retq + %out = call <32 x i16> @llvm.experimental.vector.compress(<32 x i16> %vec, <32 x i1> %mask, <32 x i16> %passthru) + ret <32 x i16> %out +} + +define <64 x i32> @test_compress_large(<64 x i1> %mask, <64 x i32> %vec, <64 x i32> %passthru) nounwind { +; AVX2-LABEL: test_compress_large: +; AVX2: # %bb.0: +; AVX2-NEXT: pushq %rbp +; AVX2-NEXT: movq %rsp, %rbp +; AVX2-NEXT: andq $-32, %rsp +; AVX2-NEXT: subq $288, %rsp # imm = 0x120 +; AVX2-NEXT: # kill: def $ecx killed $ecx def $rcx +; AVX2-NEXT: # kill: def $edx killed $edx def $rdx +; AVX2-NEXT: # kill: def $r9d killed $r9d def $r9 +; AVX2-NEXT: # kill: def $r8d killed $r8d def $r8 +; AVX2-NEXT: # kill: def $esi killed $esi def $rsi +; AVX2-NEXT: movq %rdi, %rax +; AVX2-NEXT: vmovss %xmm0, (%rsp) +; AVX2-NEXT: andl $1, %esi +; AVX2-NEXT: vextractps $1, %xmm0, (%rsp,%rsi,4) +; AVX2-NEXT: andl $1, %edx +; AVX2-NEXT: addl %esi, %edx +; AVX2-NEXT: vextractps $2, %xmm0, (%rsp,%rdx,4) +; AVX2-NEXT: andl $1, %ecx +; AVX2-NEXT: addl %edx, %ecx +; AVX2-NEXT: vextractps $3, %xmm0, (%rsp,%rcx,4) +; AVX2-NEXT: andl $1, %r8d +; AVX2-NEXT: addl %ecx, %r8d +; AVX2-NEXT: vextractf128 $1, %ymm0, %xmm0 +; AVX2-NEXT: vmovss %xmm0, (%rsp,%r8,4) +; AVX2-NEXT: andl $1, %r9d +; AVX2-NEXT: addl %r8d, %r9d +; AVX2-NEXT: movzbl 16(%rbp), 
%ecx +; AVX2-NEXT: vextractps $1, %xmm0, (%rsp,%r9,4) +; AVX2-NEXT: movzbl %cl, %ecx +; AVX2-NEXT: andl $1, %ecx +; AVX2-NEXT: addl %r9d, %ecx +; AVX2-NEXT: movzbl 24(%rbp), %edx +; AVX2-NEXT: vextractps $2, %xmm0, (%rsp,%rcx,4) +; AVX2-NEXT: movzbl %dl, %edx +; AVX2-NEXT: andl $1, %edx +; AVX2-NEXT: addl %ecx, %edx +; AVX2-NEXT: movzbl 32(%rbp), %ecx +; AVX2-NEXT: movzbl %cl, %ecx +; AVX2-NEXT: andl $1, %ecx +; AVX2-NEXT: addl %edx, %ecx +; AVX2-NEXT: # kill: def $edx killed $edx def $rdx +; AVX2-NEXT: andl $63, %edx +; AVX2-NEXT: vextractps $3, %xmm0, (%rsp,%rdx,4) +; AVX2-NEXT: movzbl 40(%rbp), %edx +; AVX2-NEXT: movzbl %dl, %edx +; AVX2-NEXT: andl $1, %edx +; AVX2-NEXT: addl %ecx, %edx +; AVX2-NEXT: # kill: def $ecx killed $ecx def $rcx +; AVX2-NEXT: andl $63, %ecx +; AVX2-NEXT: vmovss %xmm1, (%rsp,%rcx,4) +; AVX2-NEXT: movzbl 48(%rbp), %ecx +; AVX2-NEXT: movzbl %cl, %ecx +; AVX2-NEXT: andl $1, %ecx +; AVX2-NEXT: addl %edx, %ecx +; AVX2-NEXT: # kill: def $edx killed $edx def $rdx +; AVX2-NEXT: andl $63, %edx +; AVX2-NEXT: vextractps $1, %xmm1, (%rsp,%rdx,4) +; AVX2-NEXT: movzbl 56(%rbp), %edx +; AVX2-NEXT: movzbl %dl, %edx +; AVX2-NEXT: andl $1, %edx +; AVX2-NEXT: addl %ecx, %edx +; AVX2-NEXT: # kill: def $ecx killed $ecx def $rcx +; AVX2-NEXT: andl $63, %ecx +; AVX2-NEXT: vextractps $2, %xmm1, (%rsp,%rcx,4) +; AVX2-NEXT: movzbl 64(%rbp), %ecx +; AVX2-NEXT: movzbl %cl, %ecx +; AVX2-NEXT: andl $1, %ecx +; AVX2-NEXT: addl %edx, %ecx +; AVX2-NEXT: # kill: def $edx killed $edx def $rdx +; AVX2-NEXT: andl $63, %edx +; AVX2-NEXT: vextractps $3, %xmm1, (%rsp,%rdx,4) +; AVX2-NEXT: movzbl 72(%rbp), %edx +; AVX2-NEXT: movzbl %dl, %edx +; AVX2-NEXT: andl $1, %edx +; AVX2-NEXT: addl %ecx, %edx +; AVX2-NEXT: # kill: def $ecx killed $ecx def $rcx +; AVX2-NEXT: andl $63, %ecx +; AVX2-NEXT: vextractf128 $1, %ymm1, %xmm0 +; AVX2-NEXT: vmovss %xmm0, (%rsp,%rcx,4) +; AVX2-NEXT: movzbl 80(%rbp), %ecx +; AVX2-NEXT: movzbl %cl, %ecx +; AVX2-NEXT: andl $1, %ecx +; AVX2-NEXT: addl 
%edx, %ecx +; AVX2-NEXT: # kill: def $edx killed $edx def $rdx +; AVX2-NEXT: andl $63, %edx +; AVX2-NEXT: vextractps $1, %xmm0, (%rsp,%rdx,4) +; AVX2-NEXT: movzbl 88(%rbp), %edx +; AVX2-NEXT: movzbl %dl, %edx +; AVX2-NEXT: andl $1, %edx +; AVX2-NEXT: addl %ecx, %edx +; AVX2-NEXT: # kill: def $ecx killed $ecx def $rcx +; AVX2-NEXT: andl $63, %ecx +; AVX2-NEXT: vextractps $2, %xmm0, (%rsp,%rcx,4) +; AVX2-NEXT: movzbl 96(%rbp), %ecx +; AVX2-NEXT: movzbl %cl, %ecx +; AVX2-NEXT: andl $1, %ecx +; AVX2-NEXT: addl %edx, %ecx +; AVX2-NEXT: # kill: def $edx killed $edx def $rdx +; AVX2-NEXT: andl $63, %edx +; AVX2-NEXT: vextractps $3, %xmm0, (%rsp,%rdx,4) +; AVX2-NEXT: movzbl 104(%rbp), %edx +; AVX2-NEXT: movzbl %dl, %edx +; AVX2-NEXT: andl $1, %edx +; AVX2-NEXT: addl %ecx, %edx +; AVX2-NEXT: # kill: def $ecx killed $ecx def $rcx +; AVX2-NEXT: andl $63, %ecx +; AVX2-NEXT: vmovss %xmm2, (%rsp,%rcx,4) +; AVX2-NEXT: movzbl 112(%rbp), %ecx +; AVX2-NEXT: movzbl %cl, %ecx +; AVX2-NEXT: andl $1, %ecx +; AVX2-NEXT: addl %edx, %ecx +; AVX2-NEXT: # kill: def $edx killed $edx def $rdx +; AVX2-NEXT: andl $63, %edx +; AVX2-NEXT: vextractps $1, %xmm2, (%rsp,%rdx,4) +; AVX2-NEXT: movzbl 120(%rbp), %edx +; AVX2-NEXT: movzbl %dl, %edx +; AVX2-NEXT: andl $1, %edx +; AVX2-NEXT: addl %ecx, %edx +; AVX2-NEXT: # kill: def $ecx killed $ecx def $rcx +; AVX2-NEXT: andl $63, %ecx +; AVX2-NEXT: vextractps $2, %xmm2, (%rsp,%rcx,4) +; AVX2-NEXT: movzbl 128(%rbp), %ecx +; AVX2-NEXT: movzbl %cl, %ecx +; AVX2-NEXT: andl $1, %ecx +; AVX2-NEXT: addl %edx, %ecx +; AVX2-NEXT: # kill: def $edx killed $edx def $rdx +; AVX2-NEXT: andl $63, %edx +; AVX2-NEXT: vextractps $3, %xmm2, (%rsp,%rdx,4) +; AVX2-NEXT: movzbl 136(%rbp), %edx +; AVX2-NEXT: movzbl %dl, %edx +; AVX2-NEXT: andl $1, %edx +; AVX2-NEXT: addl %ecx, %edx +; AVX2-NEXT: # kill: def $ecx killed $ecx def $rcx +; AVX2-NEXT: andl $63, %ecx +; AVX2-NEXT: vextractf128 $1, %ymm2, %xmm0 +; AVX2-NEXT: vmovss %xmm0, (%rsp,%rcx,4) +; AVX2-NEXT: movzbl 144(%rbp), 
%ecx +; AVX2-NEXT: movzbl %cl, %ecx +; AVX2-NEXT: andl $1, %ecx +; AVX2-NEXT: addl %edx, %ecx +; AVX2-NEXT: # kill: def $edx killed $edx def $rdx +; AVX2-NEXT: andl $63, %edx +; AVX2-NEXT: vextractps $1, %xmm0, (%rsp,%rdx,4) +; AVX2-NEXT: movzbl 152(%rbp), %edx +; AVX2-NEXT: movzbl %dl, %edx +; AVX2-NEXT: andl $1, %edx +; AVX2-NEXT: addl %ecx, %edx +; AVX2-NEXT: # kill: def $ecx killed $ecx def $rcx +; AVX2-NEXT: andl $63, %ecx +; AVX2-NEXT: vextractps $2, %xmm0, (%rsp,%rcx,4) +; AVX2-NEXT: movzbl 160(%rbp), %ecx +; AVX2-NEXT: movzbl %cl, %ecx +; AVX2-NEXT: andl $1, %ecx +; AVX2-NEXT: addl %edx, %ecx +; AVX2-NEXT: # kill: def $edx killed $edx def $rdx +; AVX2-NEXT: andl $63, %edx +; AVX2-NEXT: vextractps $3, %xmm0, (%rsp,%rdx,4) +; AVX2-NEXT: movzbl 168(%rbp), %edx +; AVX2-NEXT: movzbl %dl, %edx +; AVX2-NEXT: andl $1, %edx +; AVX2-NEXT: addl %ecx, %edx +; AVX2-NEXT: # kill: def $ecx killed $ecx def $rcx +; AVX2-NEXT: andl $63, %ecx +; AVX2-NEXT: vmovss %xmm3, (%rsp,%rcx,4) +; AVX2-NEXT: movzbl 176(%rbp), %ecx +; AVX2-NEXT: movzbl %cl, %ecx +; AVX2-NEXT: andl $1, %ecx +; AVX2-NEXT: addl %edx, %ecx +; AVX2-NEXT: # kill: def $edx killed $edx def $rdx +; AVX2-NEXT: andl $63, %edx +; AVX2-NEXT: vextractps $1, %xmm3, (%rsp,%rdx,4) +; AVX2-NEXT: movzbl 184(%rbp), %edx +; AVX2-NEXT: movzbl %dl, %edx +; AVX2-NEXT: andl $1, %edx +; AVX2-NEXT: addl %ecx, %edx +; AVX2-NEXT: # kill: def $ecx killed $ecx def $rcx +; AVX2-NEXT: andl $63, %ecx +; AVX2-NEXT: vextractps $2, %xmm3, (%rsp,%rcx,4) +; AVX2-NEXT: movzbl 192(%rbp), %ecx +; AVX2-NEXT: movzbl %cl, %ecx +; AVX2-NEXT: andl $1, %ecx +; AVX2-NEXT: addl %edx, %ecx +; AVX2-NEXT: # kill: def $edx killed $edx def $rdx +; AVX2-NEXT: andl $63, %edx +; AVX2-NEXT: vextractps $3, %xmm3, (%rsp,%rdx,4) +; AVX2-NEXT: movzbl 200(%rbp), %edx +; AVX2-NEXT: movzbl %dl, %edx +; AVX2-NEXT: andl $1, %edx +; AVX2-NEXT: addl %ecx, %edx +; AVX2-NEXT: # kill: def $ecx killed $ecx def $rcx +; AVX2-NEXT: andl $63, %ecx +; AVX2-NEXT: vextractf128 $1, 
%ymm3, %xmm0 +; AVX2-NEXT: vmovss %xmm0, (%rsp,%rcx,4) +; AVX2-NEXT: movzbl 208(%rbp), %ecx +; AVX2-NEXT: movzbl %cl, %ecx +; AVX2-NEXT: andl $1, %ecx +; AVX2-NEXT: addl %edx, %ecx +; AVX2-NEXT: # kill: def $edx killed $edx def $rdx +; AVX2-NEXT: andl $63, %edx +; AVX2-NEXT: vextractps $1, %xmm0, (%rsp,%rdx,4) +; AVX2-NEXT: movzbl 216(%rbp), %edx +; AVX2-NEXT: movzbl %dl, %edx +; AVX2-NEXT: andl $1, %edx +; AVX2-NEXT: addl %ecx, %edx +; AVX2-NEXT: # kill: def $ecx killed $ecx def $rcx +; AVX2-NEXT: andl $63, %ecx +; AVX2-NEXT: vextractps $2, %xmm0, (%rsp,%rcx,4) +; AVX2-NEXT: movzbl 224(%rbp), %ecx +; AVX2-NEXT: movzbl %cl, %ecx +; AVX2-NEXT: andl $1, %ecx +; AVX2-NEXT: addl %edx, %ecx +; AVX2-NEXT: # kill: def $edx killed $edx def $rdx +; AVX2-NEXT: andl $63, %edx +; AVX2-NEXT: vextractps $3, %xmm0, (%rsp,%rdx,4) +; AVX2-NEXT: movzbl 232(%rbp), %edx +; AVX2-NEXT: movzbl %dl, %edx +; AVX2-NEXT: andl $1, %edx +; AVX2-NEXT: addl %ecx, %edx +; AVX2-NEXT: # kill: def $ecx killed $ecx def $rcx +; AVX2-NEXT: andl $63, %ecx +; AVX2-NEXT: vmovss %xmm4, (%rsp,%rcx,4) +; AVX2-NEXT: movzbl 240(%rbp), %ecx +; AVX2-NEXT: movzbl %cl, %ecx +; AVX2-NEXT: andl $1, %ecx +; AVX2-NEXT: addl %edx, %ecx +; AVX2-NEXT: # kill: def $edx killed $edx def $rdx +; AVX2-NEXT: andl $63, %edx +; AVX2-NEXT: vextractps $1, %xmm4, (%rsp,%rdx,4) +; AVX2-NEXT: movzbl 248(%rbp), %edx +; AVX2-NEXT: movzbl %dl, %edx +; AVX2-NEXT: andl $1, %edx +; AVX2-NEXT: addl %ecx, %edx +; AVX2-NEXT: # kill: def $ecx killed $ecx def $rcx +; AVX2-NEXT: andl $63, %ecx +; AVX2-NEXT: vextractps $2, %xmm4, (%rsp,%rcx,4) +; AVX2-NEXT: movzbl 256(%rbp), %ecx +; AVX2-NEXT: movzbl %cl, %ecx +; AVX2-NEXT: andl $1, %ecx +; AVX2-NEXT: addl %edx, %ecx +; AVX2-NEXT: # kill: def $edx killed $edx def $rdx +; AVX2-NEXT: andl $63, %edx +; AVX2-NEXT: vextractps $3, %xmm4, (%rsp,%rdx,4) +; AVX2-NEXT: movzbl 264(%rbp), %edx +; AVX2-NEXT: movzbl %dl, %edx +; AVX2-NEXT: andl $1, %edx +; AVX2-NEXT: addl %ecx, %edx +; AVX2-NEXT: # kill: def 
$ecx killed $ecx def $rcx +; AVX2-NEXT: andl $63, %ecx +; AVX2-NEXT: vextractf128 $1, %ymm4, %xmm0 +; AVX2-NEXT: vmovss %xmm0, (%rsp,%rcx,4) +; AVX2-NEXT: movzbl 272(%rbp), %ecx +; AVX2-NEXT: movzbl %cl, %ecx +; AVX2-NEXT: andl $1, %ecx +; AVX2-NEXT: addl %edx, %ecx +; AVX2-NEXT: # kill: def $edx killed $edx def $rdx +; AVX2-NEXT: andl $63, %edx +; AVX2-NEXT: vextractps $1, %xmm0, (%rsp,%rdx,4) +; AVX2-NEXT: movzbl 280(%rbp), %edx +; AVX2-NEXT: movzbl %dl, %edx +; AVX2-NEXT: andl $1, %edx +; AVX2-NEXT: addl %ecx, %edx +; AVX2-NEXT: # kill: def $ecx killed $ecx def $rcx +; AVX2-NEXT: andl $63, %ecx +; AVX2-NEXT: vextractps $2, %xmm0, (%rsp,%rcx,4) +; AVX2-NEXT: movzbl 288(%rbp), %ecx +; AVX2-NEXT: movzbl %cl, %ecx +; AVX2-NEXT: andl $1, %ecx +; AVX2-NEXT: addl %edx, %ecx +; AVX2-NEXT: # kill: def $edx killed $edx def $rdx +; AVX2-NEXT: andl $63, %edx +; AVX2-NEXT: vextractps $3, %xmm0, (%rsp,%rdx,4) +; AVX2-NEXT: movzbl 296(%rbp), %edx +; AVX2-NEXT: movzbl %dl, %edx +; AVX2-NEXT: andl $1, %edx +; AVX2-NEXT: addl %ecx, %edx +; AVX2-NEXT: # kill: def $ecx killed $ecx def $rcx +; AVX2-NEXT: andl $63, %ecx +; AVX2-NEXT: vmovss %xmm5, (%rsp,%rcx,4) +; AVX2-NEXT: movzbl 304(%rbp), %ecx +; AVX2-NEXT: movzbl %cl, %ecx +; AVX2-NEXT: andl $1, %ecx +; AVX2-NEXT: addl %edx, %ecx +; AVX2-NEXT: # kill: def $edx killed $edx def $rdx +; AVX2-NEXT: andl $63, %edx +; AVX2-NEXT: vextractps $1, %xmm5, (%rsp,%rdx,4) +; AVX2-NEXT: movzbl 312(%rbp), %edx +; AVX2-NEXT: movzbl %dl, %edx +; AVX2-NEXT: andl $1, %edx +; AVX2-NEXT: addl %ecx, %edx +; AVX2-NEXT: # kill: def $ecx killed $ecx def $rcx +; AVX2-NEXT: andl $63, %ecx +; AVX2-NEXT: vextractps $2, %xmm5, (%rsp,%rcx,4) +; AVX2-NEXT: movzbl 320(%rbp), %ecx +; AVX2-NEXT: movzbl %cl, %ecx +; AVX2-NEXT: andl $1, %ecx +; AVX2-NEXT: addl %edx, %ecx +; AVX2-NEXT: # kill: def $edx killed $edx def $rdx +; AVX2-NEXT: andl $63, %edx +; AVX2-NEXT: vextractps $3, %xmm5, (%rsp,%rdx,4) +; AVX2-NEXT: movzbl 328(%rbp), %edx +; AVX2-NEXT: movzbl %dl, %edx 
+; AVX2-NEXT: andl $1, %edx +; AVX2-NEXT: addl %ecx, %edx +; AVX2-NEXT: # kill: def $ecx killed $ecx def $rcx +; AVX2-NEXT: andl $63, %ecx +; AVX2-NEXT: vextractf128 $1, %ymm5, %xmm0 +; AVX2-NEXT: vmovss %xmm0, (%rsp,%rcx,4) +; AVX2-NEXT: movzbl 336(%rbp), %ecx +; AVX2-NEXT: movzbl %cl, %ecx +; AVX2-NEXT: andl $1, %ecx +; AVX2-NEXT: addl %edx, %ecx +; AVX2-NEXT: # kill: def $edx killed $edx def $rdx +; AVX2-NEXT: andl $63, %edx +; AVX2-NEXT: vextractps $1, %xmm0, (%rsp,%rdx,4) +; AVX2-NEXT: movzbl 344(%rbp), %edx +; AVX2-NEXT: movzbl %dl, %edx +; AVX2-NEXT: andl $1, %edx +; AVX2-NEXT: addl %ecx, %edx +; AVX2-NEXT: # kill: def $ecx killed $ecx def $rcx +; AVX2-NEXT: andl $63, %ecx +; AVX2-NEXT: vextractps $2, %xmm0, (%rsp,%rcx,4) +; AVX2-NEXT: movzbl 352(%rbp), %ecx +; AVX2-NEXT: movzbl %cl, %ecx +; AVX2-NEXT: andl $1, %ecx +; AVX2-NEXT: addl %edx, %ecx +; AVX2-NEXT: # kill: def $edx killed $edx def $rdx +; AVX2-NEXT: andl $63, %edx +; AVX2-NEXT: vextractps $3, %xmm0, (%rsp,%rdx,4) +; AVX2-NEXT: movzbl 360(%rbp), %edx +; AVX2-NEXT: movzbl %dl, %edx +; AVX2-NEXT: andl $1, %edx +; AVX2-NEXT: addl %ecx, %edx +; AVX2-NEXT: # kill: def $ecx killed $ecx def $rcx +; AVX2-NEXT: andl $63, %ecx +; AVX2-NEXT: vmovss %xmm6, (%rsp,%rcx,4) +; AVX2-NEXT: movzbl 368(%rbp), %ecx +; AVX2-NEXT: movzbl %cl, %ecx +; AVX2-NEXT: andl $1, %ecx +; AVX2-NEXT: addl %edx, %ecx +; AVX2-NEXT: # kill: def $edx killed $edx def $rdx +; AVX2-NEXT: andl $63, %edx +; AVX2-NEXT: vextractps $1, %xmm6, (%rsp,%rdx,4) +; AVX2-NEXT: movzbl 376(%rbp), %edx +; AVX2-NEXT: movzbl %dl, %edx +; AVX2-NEXT: andl $1, %edx +; AVX2-NEXT: addl %ecx, %edx +; AVX2-NEXT: # kill: def $ecx killed $ecx def $rcx +; AVX2-NEXT: andl $63, %ecx +; AVX2-NEXT: vextractps $2, %xmm6, (%rsp,%rcx,4) +; AVX2-NEXT: movzbl 384(%rbp), %ecx +; AVX2-NEXT: movzbl %cl, %ecx +; AVX2-NEXT: andl $1, %ecx +; AVX2-NEXT: addl %edx, %ecx +; AVX2-NEXT: # kill: def $edx killed $edx def $rdx +; AVX2-NEXT: andl $63, %edx +; AVX2-NEXT: vextractps $3, 
%xmm6, (%rsp,%rdx,4) +; AVX2-NEXT: movzbl 392(%rbp), %edx +; AVX2-NEXT: movzbl %dl, %edx +; AVX2-NEXT: andl $1, %edx +; AVX2-NEXT: addl %ecx, %edx +; AVX2-NEXT: # kill: def $ecx killed $ecx def $rcx +; AVX2-NEXT: andl $63, %ecx +; AVX2-NEXT: vextractf128 $1, %ymm6, %xmm0 +; AVX2-NEXT: vmovss %xmm0, (%rsp,%rcx,4) +; AVX2-NEXT: movzbl 400(%rbp), %ecx +; AVX2-NEXT: movzbl %cl, %ecx +; AVX2-NEXT: andl $1, %ecx +; AVX2-NEXT: addl %edx, %ecx +; AVX2-NEXT: # kill: def $edx killed $edx def $rdx +; AVX2-NEXT: andl $63, %edx +; AVX2-NEXT: vextractps $1, %xmm0, (%rsp,%rdx,4) +; AVX2-NEXT: movzbl 408(%rbp), %edx +; AVX2-NEXT: movzbl %dl, %edx +; AVX2-NEXT: andl $1, %edx +; AVX2-NEXT: addl %ecx, %edx +; AVX2-NEXT: # kill: def $ecx killed $ecx def $rcx +; AVX2-NEXT: andl $63, %ecx +; AVX2-NEXT: vextractps $2, %xmm0, (%rsp,%rcx,4) +; AVX2-NEXT: movzbl 416(%rbp), %ecx +; AVX2-NEXT: movzbl %cl, %ecx +; AVX2-NEXT: andl $1, %ecx +; AVX2-NEXT: addl %edx, %ecx +; AVX2-NEXT: # kill: def $edx killed $edx def $rdx +; AVX2-NEXT: andl $63, %edx +; AVX2-NEXT: vextractps $3, %xmm0, (%rsp,%rdx,4) +; AVX2-NEXT: movzbl 424(%rbp), %edx +; AVX2-NEXT: movzbl %dl, %edx +; AVX2-NEXT: andl $1, %edx +; AVX2-NEXT: addl %ecx, %edx +; AVX2-NEXT: # kill: def $ecx killed $ecx def $rcx +; AVX2-NEXT: andl $63, %ecx +; AVX2-NEXT: vmovss %xmm7, (%rsp,%rcx,4) +; AVX2-NEXT: movzbl 432(%rbp), %ecx +; AVX2-NEXT: movzbl %cl, %ecx +; AVX2-NEXT: andl $1, %ecx +; AVX2-NEXT: addl %edx, %ecx +; AVX2-NEXT: # kill: def $edx killed $edx def $rdx +; AVX2-NEXT: andl $63, %edx +; AVX2-NEXT: vextractps $1, %xmm7, (%rsp,%rdx,4) +; AVX2-NEXT: movzbl 440(%rbp), %edx +; AVX2-NEXT: movzbl %dl, %edx +; AVX2-NEXT: andl $1, %edx +; AVX2-NEXT: addl %ecx, %edx +; AVX2-NEXT: # kill: def $ecx killed $ecx def $rcx +; AVX2-NEXT: andl $63, %ecx +; AVX2-NEXT: vextractps $2, %xmm7, (%rsp,%rcx,4) +; AVX2-NEXT: movzbl 448(%rbp), %ecx +; AVX2-NEXT: movzbl %cl, %ecx +; AVX2-NEXT: andl $1, %ecx +; AVX2-NEXT: addl %edx, %ecx +; AVX2-NEXT: # kill: def 
$edx killed $edx def $rdx +; AVX2-NEXT: andl $63, %edx +; AVX2-NEXT: vextractps $3, %xmm7, (%rsp,%rdx,4) +; AVX2-NEXT: movzbl 456(%rbp), %edx +; AVX2-NEXT: movzbl %dl, %edx +; AVX2-NEXT: andl $1, %edx +; AVX2-NEXT: addl %ecx, %edx +; AVX2-NEXT: # kill: def $ecx killed $ecx def $rcx +; AVX2-NEXT: andl $63, %ecx +; AVX2-NEXT: vextractf128 $1, %ymm7, %xmm0 +; AVX2-NEXT: vmovss %xmm0, (%rsp,%rcx,4) +; AVX2-NEXT: movzbl 464(%rbp), %ecx +; AVX2-NEXT: movzbl %cl, %ecx +; AVX2-NEXT: andl $1, %ecx +; AVX2-NEXT: addl %edx, %ecx +; AVX2-NEXT: # kill: def $edx killed $edx def $rdx +; AVX2-NEXT: andl $63, %edx +; AVX2-NEXT: vextractps $1, %xmm0, (%rsp,%rdx,4) +; AVX2-NEXT: movzbl 472(%rbp), %edx +; AVX2-NEXT: movzbl %dl, %edx +; AVX2-NEXT: andl $1, %edx +; AVX2-NEXT: addl %ecx, %edx +; AVX2-NEXT: # kill: def $ecx killed $ecx def $rcx +; AVX2-NEXT: andl $63, %ecx +; AVX2-NEXT: vextractps $2, %xmm0, (%rsp,%rcx,4) +; AVX2-NEXT: andl $63, %edx +; AVX2-NEXT: vextractps $3, %xmm0, (%rsp,%rdx,4) +; AVX2-NEXT: vmovaps (%rsp), %ymm0 +; AVX2-NEXT: vmovaps {{[0-9]+}}(%rsp), %ymm1 +; AVX2-NEXT: vmovaps {{[0-9]+}}(%rsp), %ymm2 +; AVX2-NEXT: vmovaps {{[0-9]+}}(%rsp), %ymm3 +; AVX2-NEXT: vmovaps {{[0-9]+}}(%rsp), %ymm4 +; AVX2-NEXT: vmovaps {{[0-9]+}}(%rsp), %ymm5 +; AVX2-NEXT: vmovaps {{[0-9]+}}(%rsp), %ymm6 +; AVX2-NEXT: vmovaps {{[0-9]+}}(%rsp), %ymm7 +; AVX2-NEXT: vmovaps %ymm7, 224(%rdi) +; AVX2-NEXT: vmovaps %ymm6, 192(%rdi) +; AVX2-NEXT: vmovaps %ymm5, 160(%rdi) +; AVX2-NEXT: vmovaps %ymm4, 128(%rdi) +; AVX2-NEXT: vmovaps %ymm3, 96(%rdi) +; AVX2-NEXT: vmovaps %ymm2, 64(%rdi) +; AVX2-NEXT: vmovaps %ymm1, 32(%rdi) +; AVX2-NEXT: vmovaps %ymm0, (%rdi) +; AVX2-NEXT: movq %rbp, %rsp +; AVX2-NEXT: popq %rbp +; AVX2-NEXT: vzeroupper +; AVX2-NEXT: retq +; +; AVX512F-LABEL: test_compress_large: ; AVX512F: # %bb.0: ; AVX512F-NEXT: pushq %rbp -; AVX512F-NEXT: .cfi_def_cfa_offset 16 -; AVX512F-NEXT: .cfi_offset %rbp, -16 ; AVX512F-NEXT: movq %rsp, %rbp -; AVX512F-NEXT: .cfi_def_cfa_register %rbp ; 
AVX512F-NEXT: andq $-64, %rsp -; AVX512F-NEXT: subq $128, %rsp -; AVX512F-NEXT: vpmovzxbw {{.*#+}} ymm3 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero,xmm1[8],zero,xmm1[9],zero,xmm1[10],zero,xmm1[11],zero,xmm1[12],zero,xmm1[13],zero,xmm1[14],zero,xmm1[15],zero -; AVX512F-NEXT: vextracti128 $1, %ymm1, %xmm5 -; AVX512F-NEXT: vpmovzxbw {{.*#+}} ymm4 = xmm5[0],zero,xmm5[1],zero,xmm5[2],zero,xmm5[3],zero,xmm5[4],zero,xmm5[5],zero,xmm5[6],zero,xmm5[7],zero,xmm5[8],zero,xmm5[9],zero,xmm5[10],zero,xmm5[11],zero,xmm5[12],zero,xmm5[13],zero,xmm5[14],zero,xmm5[15],zero -; AVX512F-NEXT: vpmovsxbd %xmm5, %zmm5 -; AVX512F-NEXT: vpslld $31, %zmm5, %zmm5 -; AVX512F-NEXT: vptestmd %zmm5, %zmm5, %k1 -; AVX512F-NEXT: vpmovsxbd %xmm1, %zmm1 -; AVX512F-NEXT: vpslld $31, %zmm1, %zmm1 -; AVX512F-NEXT: vptestmd %zmm1, %zmm1, %k2 -; AVX512F-NEXT: vpmovzxwd {{.*#+}} zmm1 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero -; AVX512F-NEXT: vpcompressd %zmm1, %zmm1 {%k2} {z} -; AVX512F-NEXT: vpmovdw %zmm1, (%rsp) -; AVX512F-NEXT: kshiftrw $8, %k2, %k0 -; AVX512F-NEXT: kxorw %k0, %k2, %k0 +; AVX512F-NEXT: subq $640, %rsp # imm = 0x280 +; AVX512F-NEXT: movzbl 352(%rbp), %eax +; AVX512F-NEXT: andl $1, %eax +; AVX512F-NEXT: kmovw %eax, %k0 +; AVX512F-NEXT: movzbl 360(%rbp), %eax +; AVX512F-NEXT: kmovw %eax, %k1 +; AVX512F-NEXT: kshiftlw $15, %k1, %k1 +; AVX512F-NEXT: kshiftrw $14, %k1, %k1 +; AVX512F-NEXT: korw %k1, %k0, %k0 +; AVX512F-NEXT: movw $-5, %ax +; AVX512F-NEXT: kmovw %eax, %k1 +; AVX512F-NEXT: kandw %k1, %k0, %k0 +; AVX512F-NEXT: kmovw %k1, %k3 +; AVX512F-NEXT: movzbl 368(%rbp), %eax +; AVX512F-NEXT: kmovw %eax, %k1 +; AVX512F-NEXT: kshiftlw $15, %k1, %k1 +; AVX512F-NEXT: kshiftrw $13, %k1, %k1 +; AVX512F-NEXT: korw %k1, %k0, %k0 +; AVX512F-NEXT: movw $-9, %ax +; 
AVX512F-NEXT: kmovw %eax, %k7 +; AVX512F-NEXT: kandw %k7, %k0, %k0 +; AVX512F-NEXT: kmovw %k7, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill +; AVX512F-NEXT: movzbl 376(%rbp), %eax +; AVX512F-NEXT: kmovw %eax, %k1 +; AVX512F-NEXT: kshiftlw $15, %k1, %k1 +; AVX512F-NEXT: kshiftrw $12, %k1, %k1 +; AVX512F-NEXT: korw %k1, %k0, %k0 +; AVX512F-NEXT: movw $-17, %ax +; AVX512F-NEXT: kmovw %eax, %k5 +; AVX512F-NEXT: kandw %k5, %k0, %k0 +; AVX512F-NEXT: movzbl 384(%rbp), %eax +; AVX512F-NEXT: kmovw %eax, %k1 +; AVX512F-NEXT: kshiftlw $15, %k1, %k1 +; AVX512F-NEXT: kshiftrw $11, %k1, %k1 +; AVX512F-NEXT: korw %k1, %k0, %k0 +; AVX512F-NEXT: movw $-33, %ax +; AVX512F-NEXT: kmovw %eax, %k6 +; AVX512F-NEXT: kandw %k6, %k0, %k0 +; AVX512F-NEXT: movzbl 392(%rbp), %eax +; AVX512F-NEXT: kmovw %eax, %k1 +; AVX512F-NEXT: kshiftlw $15, %k1, %k1 +; AVX512F-NEXT: kshiftrw $10, %k1, %k1 +; AVX512F-NEXT: korw %k1, %k0, %k0 +; AVX512F-NEXT: movw $-65, %ax +; AVX512F-NEXT: kmovw %eax, %k1 +; AVX512F-NEXT: kmovw %k1, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill +; AVX512F-NEXT: kandw %k1, %k0, %k0 +; AVX512F-NEXT: movzbl 400(%rbp), %eax +; AVX512F-NEXT: kmovw %eax, %k1 +; AVX512F-NEXT: kshiftlw $15, %k1, %k1 +; AVX512F-NEXT: kshiftrw $9, %k1, %k1 +; AVX512F-NEXT: korw %k1, %k0, %k0 +; AVX512F-NEXT: movw $-129, %ax +; AVX512F-NEXT: kmovw %eax, %k1 +; AVX512F-NEXT: kmovw %k1, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill +; AVX512F-NEXT: kandw %k1, %k0, %k0 +; AVX512F-NEXT: movzbl 408(%rbp), %eax +; AVX512F-NEXT: kmovw %eax, %k1 +; AVX512F-NEXT: kshiftlw $15, %k1, %k1 +; AVX512F-NEXT: kshiftrw $8, %k1, %k1 +; AVX512F-NEXT: korw %k1, %k0, %k0 +; AVX512F-NEXT: movw $-257, %ax # imm = 0xFEFF +; AVX512F-NEXT: kmovw %eax, %k1 +; AVX512F-NEXT: kmovw %k1, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill +; AVX512F-NEXT: kandw %k1, %k0, %k0 +; AVX512F-NEXT: movzbl 416(%rbp), %eax +; AVX512F-NEXT: kmovw %eax, %k1 +; AVX512F-NEXT: kshiftlw $15, %k1, %k1 +; AVX512F-NEXT: kshiftrw $7, %k1, %k1 +; AVX512F-NEXT: korw %k1, %k0, %k0 +; 
AVX512F-NEXT: movw $-513, %ax # imm = 0xFDFF +; AVX512F-NEXT: kmovw %eax, %k1 +; AVX512F-NEXT: kmovw %k1, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill +; AVX512F-NEXT: kandw %k1, %k0, %k0 +; AVX512F-NEXT: movzbl 424(%rbp), %eax +; AVX512F-NEXT: kmovw %eax, %k1 +; AVX512F-NEXT: kshiftlw $15, %k1, %k1 +; AVX512F-NEXT: kshiftrw $6, %k1, %k1 +; AVX512F-NEXT: korw %k1, %k0, %k0 +; AVX512F-NEXT: movw $-1025, %ax # imm = 0xFBFF +; AVX512F-NEXT: kmovw %eax, %k1 +; AVX512F-NEXT: kmovw %k1, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill +; AVX512F-NEXT: kandw %k1, %k0, %k0 +; AVX512F-NEXT: movzbl 432(%rbp), %eax +; AVX512F-NEXT: kmovw %eax, %k1 +; AVX512F-NEXT: kshiftlw $15, %k1, %k1 +; AVX512F-NEXT: kshiftrw $5, %k1, %k1 +; AVX512F-NEXT: korw %k1, %k0, %k0 +; AVX512F-NEXT: movw $-2049, %ax # imm = 0xF7FF +; AVX512F-NEXT: kmovw %eax, %k1 +; AVX512F-NEXT: kmovw %k1, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill +; AVX512F-NEXT: kandw %k1, %k0, %k0 +; AVX512F-NEXT: movzbl 440(%rbp), %eax +; AVX512F-NEXT: kmovw %eax, %k1 +; AVX512F-NEXT: kshiftlw $15, %k1, %k1 +; AVX512F-NEXT: kshiftrw $4, %k1, %k1 +; AVX512F-NEXT: korw %k1, %k0, %k0 +; AVX512F-NEXT: movw $-4097, %ax # imm = 0xEFFF +; AVX512F-NEXT: kmovw %eax, %k1 +; AVX512F-NEXT: kmovw %k1, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill +; AVX512F-NEXT: kandw %k1, %k0, %k0 +; AVX512F-NEXT: movzbl 448(%rbp), %eax +; AVX512F-NEXT: kmovw %eax, %k1 +; AVX512F-NEXT: kshiftlw $15, %k1, %k1 +; AVX512F-NEXT: kshiftrw $3, %k1, %k1 +; AVX512F-NEXT: korw %k1, %k0, %k0 +; AVX512F-NEXT: movw $-8193, %ax # imm = 0xDFFF +; AVX512F-NEXT: kmovw %eax, %k1 +; AVX512F-NEXT: kmovw %k1, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill +; AVX512F-NEXT: kandw %k1, %k0, %k0 +; AVX512F-NEXT: movzbl 456(%rbp), %eax +; AVX512F-NEXT: kmovw %eax, %k1 +; AVX512F-NEXT: kshiftlw $15, %k1, %k1 +; AVX512F-NEXT: kshiftrw $2, %k1, %k1 +; AVX512F-NEXT: korw %k1, %k0, %k1 +; AVX512F-NEXT: movw $-16385, %ax # imm = 0xBFFF +; AVX512F-NEXT: kmovw %eax, %k4 +; AVX512F-NEXT: kandw %k4, %k1, %k1 +; 
AVX512F-NEXT: kmovw %k4, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill +; AVX512F-NEXT: movzbl 464(%rbp), %eax +; AVX512F-NEXT: kmovw %eax, %k2 +; AVX512F-NEXT: kshiftlw $14, %k2, %k2 +; AVX512F-NEXT: korw %k2, %k1, %k1 +; AVX512F-NEXT: kshiftlw $1, %k1, %k1 +; AVX512F-NEXT: kshiftrw $1, %k1, %k1 +; AVX512F-NEXT: movzbl 472(%rbp), %eax +; AVX512F-NEXT: kmovw %eax, %k2 +; AVX512F-NEXT: kshiftlw $15, %k2, %k2 +; AVX512F-NEXT: korw %k2, %k1, %k1 +; AVX512F-NEXT: kmovw %k1, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill +; AVX512F-NEXT: movzbl 224(%rbp), %eax +; AVX512F-NEXT: andl $1, %eax +; AVX512F-NEXT: movzbl 232(%rbp), %r10d +; AVX512F-NEXT: kmovw %r10d, %k1 +; AVX512F-NEXT: kshiftlw $15, %k1, %k1 +; AVX512F-NEXT: kshiftrw $14, %k1, %k1 +; AVX512F-NEXT: kmovw %eax, %k2 +; AVX512F-NEXT: korw %k1, %k2, %k1 +; AVX512F-NEXT: kmovw %k3, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill +; AVX512F-NEXT: kandw %k3, %k1, %k1 +; AVX512F-NEXT: movzbl 240(%rbp), %eax +; AVX512F-NEXT: kmovw %eax, %k2 +; AVX512F-NEXT: kshiftlw $15, %k2, %k2 +; AVX512F-NEXT: kshiftrw $13, %k2, %k2 +; AVX512F-NEXT: korw %k2, %k1, %k1 +; AVX512F-NEXT: kandw %k7, %k1, %k1 +; AVX512F-NEXT: movzbl 248(%rbp), %eax +; AVX512F-NEXT: kmovw %eax, %k2 +; AVX512F-NEXT: kshiftlw $15, %k2, %k2 +; AVX512F-NEXT: kshiftrw $12, %k2, %k2 +; AVX512F-NEXT: korw %k2, %k1, %k1 +; AVX512F-NEXT: kandw %k5, %k1, %k1 +; AVX512F-NEXT: movzbl 256(%rbp), %eax +; AVX512F-NEXT: kmovw %eax, %k2 +; AVX512F-NEXT: kshiftlw $15, %k2, %k2 +; AVX512F-NEXT: kshiftrw $11, %k2, %k2 +; AVX512F-NEXT: korw %k2, %k1, %k1 +; AVX512F-NEXT: kandw %k6, %k1, %k1 +; AVX512F-NEXT: movzbl 264(%rbp), %eax +; AVX512F-NEXT: kmovw %eax, %k2 +; AVX512F-NEXT: kshiftlw $15, %k2, %k2 +; AVX512F-NEXT: kshiftrw $10, %k2, %k2 +; AVX512F-NEXT: korw %k2, %k1, %k1 +; AVX512F-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k7 # 2-byte Reload +; AVX512F-NEXT: kandw %k7, %k1, %k1 +; AVX512F-NEXT: movzbl 272(%rbp), %eax +; AVX512F-NEXT: kmovw %eax, %k2 +; AVX512F-NEXT: kshiftlw $15, %k2, %k2 +; 
AVX512F-NEXT: kshiftrw $9, %k2, %k2 +; AVX512F-NEXT: korw %k2, %k1, %k0 +; AVX512F-NEXT: kmovw %k0, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill +; AVX512F-NEXT: movzbl 280(%rbp), %eax +; AVX512F-NEXT: kmovw %eax, %k1 +; AVX512F-NEXT: kmovw %k1, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill +; AVX512F-NEXT: kshiftlw $15, %k1, %k1 +; AVX512F-NEXT: kshiftrw $8, %k1, %k1 +; AVX512F-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload +; AVX512F-NEXT: kandw %k2, %k0, %k2 +; AVX512F-NEXT: korw %k1, %k2, %k1 +; AVX512F-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k0 # 2-byte Reload +; AVX512F-NEXT: kandw %k0, %k1, %k1 +; AVX512F-NEXT: movzbl 288(%rbp), %eax +; AVX512F-NEXT: kmovw %eax, %k0 +; AVX512F-NEXT: kmovw %k0, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill +; AVX512F-NEXT: kshiftlw $15, %k0, %k2 +; AVX512F-NEXT: kshiftrw $7, %k2, %k2 +; AVX512F-NEXT: korw %k2, %k1, %k1 +; AVX512F-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k0 # 2-byte Reload +; AVX512F-NEXT: kandw %k0, %k1, %k1 +; AVX512F-NEXT: movzbl 296(%rbp), %eax +; AVX512F-NEXT: kmovw %eax, %k2 +; AVX512F-NEXT: kshiftlw $15, %k2, %k0 +; AVX512F-NEXT: kmovw %k0, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill +; AVX512F-NEXT: kshiftrw $6, %k0, %k2 +; AVX512F-NEXT: korw %k2, %k1, %k1 +; AVX512F-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k0 # 2-byte Reload +; AVX512F-NEXT: kandw %k0, %k1, %k1 +; AVX512F-NEXT: movzbl 304(%rbp), %eax +; AVX512F-NEXT: kmovw %eax, %k2 +; AVX512F-NEXT: kshiftlw $15, %k2, %k0 +; AVX512F-NEXT: kmovw %k0, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill +; AVX512F-NEXT: kshiftrw $5, %k0, %k2 +; AVX512F-NEXT: korw %k2, %k1, %k1 +; AVX512F-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k0 # 2-byte Reload +; AVX512F-NEXT: kandw %k0, %k1, %k1 +; AVX512F-NEXT: movzbl 312(%rbp), %eax +; AVX512F-NEXT: kmovw %eax, %k2 +; AVX512F-NEXT: kshiftlw $15, %k2, %k0 +; AVX512F-NEXT: kmovw %k0, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill ; AVX512F-NEXT: kshiftrw $4, %k0, %k2 -; AVX512F-NEXT: kxorw %k2, %k0, %k0 -; AVX512F-NEXT: kshiftrw $2, %k0, %k2 -; 
AVX512F-NEXT: kxorw %k2, %k0, %k0 -; AVX512F-NEXT: kshiftrw $1, %k0, %k2 -; AVX512F-NEXT: kxorw %k2, %k0, %k0 +; AVX512F-NEXT: korw %k2, %k1, %k1 +; AVX512F-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k0 # 2-byte Reload +; AVX512F-NEXT: kandw %k0, %k1, %k1 +; AVX512F-NEXT: movzbl 320(%rbp), %eax +; AVX512F-NEXT: kmovw %eax, %k2 +; AVX512F-NEXT: kshiftlw $15, %k2, %k0 +; AVX512F-NEXT: kmovw %k0, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill +; AVX512F-NEXT: kshiftrw $3, %k0, %k2 +; AVX512F-NEXT: korw %k2, %k1, %k1 +; AVX512F-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k0 # 2-byte Reload +; AVX512F-NEXT: kandw %k0, %k1, %k1 +; AVX512F-NEXT: movzbl 328(%rbp), %eax +; AVX512F-NEXT: kmovw %eax, %k2 +; AVX512F-NEXT: kshiftlw $15, %k2, %k2 +; AVX512F-NEXT: kmovw %k2, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill +; AVX512F-NEXT: kshiftrw $2, %k2, %k2 +; AVX512F-NEXT: korw %k2, %k1, %k1 +; AVX512F-NEXT: kandw %k4, %k1, %k1 +; AVX512F-NEXT: movzbl 336(%rbp), %eax +; AVX512F-NEXT: kmovw %eax, %k2 +; AVX512F-NEXT: kmovw %k2, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill +; AVX512F-NEXT: kshiftlw $14, %k2, %k2 +; AVX512F-NEXT: korw %k2, %k1, %k1 +; AVX512F-NEXT: kshiftlw $1, %k1, %k1 +; AVX512F-NEXT: kshiftrw $1, %k1, %k1 +; AVX512F-NEXT: movzbl 344(%rbp), %eax +; AVX512F-NEXT: kmovw %eax, %k2 +; AVX512F-NEXT: kmovw %k2, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill +; AVX512F-NEXT: kshiftlw $15, %k2, %k2 +; AVX512F-NEXT: korw %k2, %k1, %k1 +; AVX512F-NEXT: kmovw %k1, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill +; AVX512F-NEXT: movzbl 96(%rbp), %eax +; AVX512F-NEXT: andl $1, %eax +; AVX512F-NEXT: movzbl 104(%rbp), %r10d +; AVX512F-NEXT: kmovw %r10d, %k1 +; AVX512F-NEXT: kshiftlw $15, %k1, %k1 +; AVX512F-NEXT: kshiftrw $14, %k1, %k1 +; AVX512F-NEXT: kmovw %eax, %k2 +; AVX512F-NEXT: korw %k1, %k2, %k1 +; AVX512F-NEXT: kandw %k3, %k1, %k1 +; AVX512F-NEXT: movzbl 112(%rbp), %eax +; AVX512F-NEXT: kmovw %eax, %k2 +; AVX512F-NEXT: kshiftlw $15, %k2, %k2 +; AVX512F-NEXT: kshiftrw $13, %k2, %k2 +; AVX512F-NEXT: korw %k2, 
%k1, %k1 +; AVX512F-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload +; AVX512F-NEXT: kandw %k4, %k1, %k1 +; AVX512F-NEXT: movzbl 120(%rbp), %eax +; AVX512F-NEXT: kmovw %eax, %k2 +; AVX512F-NEXT: kshiftlw $15, %k2, %k2 +; AVX512F-NEXT: kshiftrw $12, %k2, %k2 +; AVX512F-NEXT: korw %k2, %k1, %k1 +; AVX512F-NEXT: kmovw %k5, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill +; AVX512F-NEXT: kandw %k5, %k1, %k1 +; AVX512F-NEXT: movzbl 128(%rbp), %eax +; AVX512F-NEXT: kmovw %eax, %k2 +; AVX512F-NEXT: kshiftlw $15, %k2, %k2 +; AVX512F-NEXT: kshiftrw $11, %k2, %k2 +; AVX512F-NEXT: korw %k2, %k1, %k1 +; AVX512F-NEXT: kmovw %k6, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill +; AVX512F-NEXT: kandw %k6, %k1, %k1 +; AVX512F-NEXT: movzbl 136(%rbp), %eax +; AVX512F-NEXT: kmovw %eax, %k2 +; AVX512F-NEXT: kshiftlw $15, %k2, %k2 +; AVX512F-NEXT: kshiftrw $10, %k2, %k2 +; AVX512F-NEXT: korw %k2, %k1, %k1 +; AVX512F-NEXT: kandw %k7, %k1, %k1 +; AVX512F-NEXT: movzbl 144(%rbp), %eax +; AVX512F-NEXT: kmovw %eax, %k2 +; AVX512F-NEXT: kshiftlw $15, %k2, %k2 +; AVX512F-NEXT: kshiftrw $9, %k2, %k2 +; AVX512F-NEXT: korw %k2, %k1, %k1 +; AVX512F-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload +; AVX512F-NEXT: kandw %k2, %k1, %k1 +; AVX512F-NEXT: movzbl 152(%rbp), %eax +; AVX512F-NEXT: kmovw %eax, %k2 +; AVX512F-NEXT: kshiftlw $15, %k2, %k2 +; AVX512F-NEXT: kshiftrw $8, %k2, %k2 +; AVX512F-NEXT: korw %k2, %k1, %k1 +; AVX512F-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload +; AVX512F-NEXT: kandw %k3, %k1, %k1 +; AVX512F-NEXT: movzbl 160(%rbp), %eax +; AVX512F-NEXT: kmovw %eax, %k2 +; AVX512F-NEXT: kshiftlw $15, %k2, %k2 +; AVX512F-NEXT: kshiftrw $7, %k2, %k2 +; AVX512F-NEXT: korw %k2, %k1, %k1 +; AVX512F-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k7 # 2-byte Reload +; AVX512F-NEXT: kandw %k7, %k1, %k1 +; AVX512F-NEXT: movzbl 168(%rbp), %eax +; AVX512F-NEXT: kmovw %eax, %k2 +; AVX512F-NEXT: kshiftlw $15, %k2, %k2 +; AVX512F-NEXT: kshiftrw $6, %k2, %k2 +; AVX512F-NEXT: korw %k2, %k1, %k1 
+; AVX512F-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload +; AVX512F-NEXT: kandw %k2, %k1, %k1 +; AVX512F-NEXT: movzbl 176(%rbp), %eax +; AVX512F-NEXT: kmovw %eax, %k2 +; AVX512F-NEXT: kshiftlw $15, %k2, %k2 +; AVX512F-NEXT: kshiftrw $5, %k2, %k2 +; AVX512F-NEXT: korw %k2, %k1, %k1 +; AVX512F-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload +; AVX512F-NEXT: kandw %k2, %k1, %k1 +; AVX512F-NEXT: movzbl 184(%rbp), %eax +; AVX512F-NEXT: kmovw %eax, %k2 +; AVX512F-NEXT: kshiftlw $15, %k2, %k2 +; AVX512F-NEXT: kshiftrw $4, %k2, %k2 +; AVX512F-NEXT: korw %k2, %k1, %k1 +; AVX512F-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload +; AVX512F-NEXT: kandw %k2, %k1, %k1 +; AVX512F-NEXT: movzbl 192(%rbp), %eax +; AVX512F-NEXT: kmovw %eax, %k2 +; AVX512F-NEXT: kshiftlw $15, %k2, %k2 +; AVX512F-NEXT: kshiftrw $3, %k2, %k2 +; AVX512F-NEXT: korw %k2, %k1, %k1 +; AVX512F-NEXT: kandw %k0, %k1, %k1 +; AVX512F-NEXT: movzbl 200(%rbp), %eax +; AVX512F-NEXT: kmovw %eax, %k2 +; AVX512F-NEXT: kshiftlw $15, %k2, %k2 +; AVX512F-NEXT: kshiftrw $2, %k2, %k2 +; AVX512F-NEXT: korw %k2, %k1, %k1 +; AVX512F-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload +; AVX512F-NEXT: kandw %k2, %k1, %k1 +; AVX512F-NEXT: movzbl 208(%rbp), %eax +; AVX512F-NEXT: kmovw %eax, %k2 +; AVX512F-NEXT: kshiftlw $14, %k2, %k2 +; AVX512F-NEXT: korw %k2, %k1, %k1 +; AVX512F-NEXT: kshiftlw $1, %k1, %k1 +; AVX512F-NEXT: kshiftrw $1, %k1, %k1 +; AVX512F-NEXT: movzbl 216(%rbp), %eax +; AVX512F-NEXT: kmovw %eax, %k2 +; AVX512F-NEXT: kshiftlw $15, %k2, %k2 +; AVX512F-NEXT: korw %k2, %k1, %k1 +; AVX512F-NEXT: kmovw %k1, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill +; AVX512F-NEXT: andl $1, %edi +; AVX512F-NEXT: kmovw %esi, %k1 +; AVX512F-NEXT: kshiftlw $15, %k1, %k1 +; AVX512F-NEXT: kshiftrw $14, %k1, %k1 +; AVX512F-NEXT: kmovw %edi, %k2 +; AVX512F-NEXT: korw %k1, %k2, %k1 +; AVX512F-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload +; AVX512F-NEXT: kandw %k2, %k1, %k1 +; AVX512F-NEXT: kmovw 
%edx, %k2 +; AVX512F-NEXT: kshiftlw $15, %k2, %k2 +; AVX512F-NEXT: kshiftrw $13, %k2, %k2 +; AVX512F-NEXT: korw %k2, %k1, %k1 +; AVX512F-NEXT: kandw %k4, %k1, %k1 +; AVX512F-NEXT: kmovw %ecx, %k2 +; AVX512F-NEXT: kshiftlw $15, %k2, %k2 +; AVX512F-NEXT: kshiftrw $12, %k2, %k2 +; AVX512F-NEXT: korw %k2, %k1, %k1 +; AVX512F-NEXT: kandw %k5, %k1, %k1 +; AVX512F-NEXT: kmovw %r8d, %k2 +; AVX512F-NEXT: kshiftlw $15, %k2, %k2 +; AVX512F-NEXT: kshiftrw $11, %k2, %k2 +; AVX512F-NEXT: korw %k2, %k1, %k1 +; AVX512F-NEXT: kandw %k6, %k1, %k1 +; AVX512F-NEXT: kmovw %r9d, %k2 +; AVX512F-NEXT: kshiftlw $15, %k2, %k2 +; AVX512F-NEXT: kshiftrw $10, %k2, %k2 +; AVX512F-NEXT: korw %k2, %k1, %k1 +; AVX512F-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload +; AVX512F-NEXT: kandw %k2, %k1, %k1 +; AVX512F-NEXT: movzbl 16(%rbp), %eax +; AVX512F-NEXT: kmovw %eax, %k2 +; AVX512F-NEXT: kshiftlw $15, %k2, %k2 +; AVX512F-NEXT: kshiftrw $9, %k2, %k2 +; AVX512F-NEXT: korw %k2, %k1, %k2 +; AVX512F-NEXT: kmovw %k2, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill +; AVX512F-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload +; AVX512F-NEXT: kandw %k1, %k2, %k1 +; AVX512F-NEXT: movzbl 24(%rbp), %eax +; AVX512F-NEXT: kmovw %eax, %k2 +; AVX512F-NEXT: kmovw %k2, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill +; AVX512F-NEXT: kshiftlw $15, %k2, %k2 +; AVX512F-NEXT: kshiftrw $8, %k2, %k2 +; AVX512F-NEXT: korw %k2, %k1, %k1 +; AVX512F-NEXT: kandw %k3, %k1, %k1 +; AVX512F-NEXT: movzbl 32(%rbp), %eax +; AVX512F-NEXT: kmovw %eax, %k2 +; AVX512F-NEXT: kmovw %k2, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill +; AVX512F-NEXT: kshiftlw $15, %k2, %k2 +; AVX512F-NEXT: kshiftrw $7, %k2, %k2 +; AVX512F-NEXT: korw %k2, %k1, %k1 +; AVX512F-NEXT: kandw %k7, %k1, %k1 +; AVX512F-NEXT: movzbl 40(%rbp), %eax +; AVX512F-NEXT: kmovw %eax, %k2 +; AVX512F-NEXT: kshiftlw $15, %k2, %k2 +; AVX512F-NEXT: kmovw %k2, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill +; AVX512F-NEXT: kshiftrw $6, %k2, %k2 +; AVX512F-NEXT: korw %k2, %k1, %k1 +; 
AVX512F-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload +; AVX512F-NEXT: kandw %k2, %k1, %k1 +; AVX512F-NEXT: movzbl 48(%rbp), %eax +; AVX512F-NEXT: kmovw %eax, %k2 +; AVX512F-NEXT: kshiftlw $15, %k2, %k5 +; AVX512F-NEXT: kshiftrw $5, %k5, %k2 +; AVX512F-NEXT: korw %k2, %k1, %k1 +; AVX512F-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload +; AVX512F-NEXT: kandw %k2, %k1, %k1 +; AVX512F-NEXT: movzbl 56(%rbp), %eax +; AVX512F-NEXT: kmovw %eax, %k2 +; AVX512F-NEXT: kshiftlw $15, %k2, %k4 +; AVX512F-NEXT: kshiftrw $4, %k4, %k2 +; AVX512F-NEXT: korw %k2, %k1, %k1 +; AVX512F-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload +; AVX512F-NEXT: kandw %k2, %k1, %k1 +; AVX512F-NEXT: movzbl 64(%rbp), %eax +; AVX512F-NEXT: kmovw %eax, %k2 +; AVX512F-NEXT: kshiftlw $15, %k2, %k3 +; AVX512F-NEXT: kshiftrw $3, %k3, %k2 +; AVX512F-NEXT: korw %k2, %k1, %k1 +; AVX512F-NEXT: kandw %k0, %k1, %k1 +; AVX512F-NEXT: movzbl 72(%rbp), %eax +; AVX512F-NEXT: kmovw %eax, %k2 +; AVX512F-NEXT: kshiftlw $15, %k2, %k2 +; AVX512F-NEXT: kshiftrw $2, %k2, %k0 +; AVX512F-NEXT: korw %k0, %k1, %k0 +; AVX512F-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload +; AVX512F-NEXT: kandw %k1, %k0, %k0 +; AVX512F-NEXT: movzbl 80(%rbp), %eax +; AVX512F-NEXT: kmovw %eax, %k1 +; AVX512F-NEXT: kshiftlw $14, %k1, %k7 +; AVX512F-NEXT: korw %k7, %k0, %k0 +; AVX512F-NEXT: kshiftlw $1, %k0, %k0 +; AVX512F-NEXT: kshiftrw $1, %k0, %k7 +; AVX512F-NEXT: movzbl 88(%rbp), %eax +; AVX512F-NEXT: kmovw %eax, %k0 +; AVX512F-NEXT: kshiftlw $15, %k0, %k6 +; AVX512F-NEXT: korw %k6, %k7, %k6 +; AVX512F-NEXT: kmovw %k6, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill +; AVX512F-NEXT: movw $-3, %ax +; AVX512F-NEXT: kmovw %eax, %k6 +; AVX512F-NEXT: kmovw %k6, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill +; AVX512F-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k7 # 2-byte Reload +; AVX512F-NEXT: kandw %k6, %k7, %k6 +; AVX512F-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k7 # 2-byte Reload +; AVX512F-NEXT: kshiftrw $14, %k7, %k7 +; 
AVX512F-NEXT: korw %k7, %k6, %k6 +; AVX512F-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k7 # 2-byte Reload +; AVX512F-NEXT: kandw %k7, %k6, %k6 +; AVX512F-NEXT: kshiftrw $13, %k5, %k5 +; AVX512F-NEXT: korw %k5, %k6, %k5 +; AVX512F-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload +; AVX512F-NEXT: kandw %k6, %k5, %k5 +; AVX512F-NEXT: kshiftrw $12, %k4, %k4 +; AVX512F-NEXT: korw %k4, %k5, %k4 +; AVX512F-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k5 # 2-byte Reload +; AVX512F-NEXT: kandw %k5, %k4, %k4 +; AVX512F-NEXT: kshiftrw $11, %k3, %k3 +; AVX512F-NEXT: korw %k3, %k4, %k3 +; AVX512F-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload +; AVX512F-NEXT: kandw %k4, %k3, %k3 +; AVX512F-NEXT: kshiftrw $10, %k2, %k2 +; AVX512F-NEXT: korw %k2, %k3, %k2 +; AVX512F-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload +; AVX512F-NEXT: kandw %k3, %k2, %k2 +; AVX512F-NEXT: kshiftlw $6, %k1, %k1 +; AVX512F-NEXT: korw %k1, %k2, %k1 +; AVX512F-NEXT: kshiftlw $9, %k1, %k1 +; AVX512F-NEXT: kshiftrw $9, %k1, %k1 +; AVX512F-NEXT: kshiftlw $7, %k0, %k0 +; AVX512F-NEXT: korw %k0, %k1, %k0 +; AVX512F-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload +; AVX512F-NEXT: kshiftlw $9, %k1, %k1 +; AVX512F-NEXT: kshiftrw $9, %k1, %k1 +; AVX512F-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload +; AVX512F-NEXT: kshiftlw $7, %k2, %k2 +; AVX512F-NEXT: korw %k2, %k1, %k1 +; AVX512F-NEXT: kxorw %k0, %k1, %k0 +; AVX512F-NEXT: kshiftrw $4, %k0, %k1 +; AVX512F-NEXT: kxorw %k1, %k0, %k0 +; AVX512F-NEXT: kshiftrw $2, %k0, %k1 +; AVX512F-NEXT: kxorw %k1, %k0, %k0 +; AVX512F-NEXT: kshiftrw $1, %k0, %k1 +; AVX512F-NEXT: kxorw %k1, %k0, %k0 ; AVX512F-NEXT: kmovw %k0, %eax +; AVX512F-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload +; AVX512F-NEXT: vpcompressd %zmm2, %zmm2 {%k1} {z} +; AVX512F-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k0 # 2-byte Reload +; AVX512F-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload +; AVX512F-NEXT: kandw %k1, %k0, %k0 +; AVX512F-NEXT: 
kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload +; AVX512F-NEXT: kshiftrw $14, %k1, %k1 +; AVX512F-NEXT: korw %k1, %k0, %k0 +; AVX512F-NEXT: kandw %k7, %k0, %k0 +; AVX512F-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload +; AVX512F-NEXT: kshiftrw $13, %k1, %k1 +; AVX512F-NEXT: korw %k1, %k0, %k0 +; AVX512F-NEXT: kandw %k6, %k0, %k0 +; AVX512F-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload +; AVX512F-NEXT: kshiftrw $12, %k1, %k1 +; AVX512F-NEXT: korw %k1, %k0, %k0 +; AVX512F-NEXT: kandw %k5, %k0, %k0 +; AVX512F-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload +; AVX512F-NEXT: kshiftrw $11, %k1, %k1 +; AVX512F-NEXT: korw %k1, %k0, %k0 +; AVX512F-NEXT: kandw %k4, %k0, %k0 +; AVX512F-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload +; AVX512F-NEXT: kshiftrw $10, %k1, %k1 +; AVX512F-NEXT: korw %k1, %k0, %k0 +; AVX512F-NEXT: kandw %k3, %k0, %k0 +; AVX512F-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload +; AVX512F-NEXT: kshiftlw $6, %k1, %k1 +; AVX512F-NEXT: korw %k1, %k0, %k0 +; AVX512F-NEXT: kshiftlw $9, %k0, %k0 +; AVX512F-NEXT: kshiftrw $9, %k0, %k0 +; AVX512F-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload +; AVX512F-NEXT: kshiftlw $7, %k1, %k1 +; AVX512F-NEXT: korw %k1, %k0, %k0 +; AVX512F-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload +; AVX512F-NEXT: kshiftlw $9, %k1, %k1 +; AVX512F-NEXT: kshiftrw $9, %k1, %k1 +; AVX512F-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload +; AVX512F-NEXT: kshiftlw $7, %k2, %k2 +; AVX512F-NEXT: korw %k2, %k1, %k1 +; AVX512F-NEXT: kxorw %k0, %k1, %k0 +; AVX512F-NEXT: kshiftrw $4, %k0, %k1 +; AVX512F-NEXT: kxorw %k1, %k0, %k0 +; AVX512F-NEXT: kshiftrw $2, %k0, %k1 +; AVX512F-NEXT: kxorw %k1, %k0, %k0 +; AVX512F-NEXT: kshiftrw $1, %k0, %k1 +; AVX512F-NEXT: kxorw %k1, %k0, %k0 +; AVX512F-NEXT: kmovw %k0, %ecx +; AVX512F-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload +; AVX512F-NEXT: vpcompressd %zmm3, %zmm3 {%k1} {z} +; AVX512F-NEXT: kmovw 
{{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload +; AVX512F-NEXT: vpcompressd %zmm0, %zmm0 {%k2} {z} +; AVX512F-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload +; AVX512F-NEXT: vpcompressd %zmm1, %zmm1 {%k1} {z} +; AVX512F-NEXT: kxorw %k1, %k2, %k0 +; AVX512F-NEXT: kshiftrw $8, %k0, %k1 +; AVX512F-NEXT: kxorw %k1, %k0, %k0 +; AVX512F-NEXT: kshiftrw $4, %k0, %k1 +; AVX512F-NEXT: kxorw %k1, %k0, %k0 +; AVX512F-NEXT: kshiftrw $2, %k0, %k1 +; AVX512F-NEXT: kxorw %k1, %k0, %k0 +; AVX512F-NEXT: kshiftrw $1, %k0, %k1 +; AVX512F-NEXT: kxorw %k1, %k0, %k0 +; AVX512F-NEXT: kmovw %k0, %edx +; AVX512F-NEXT: vmovdqa64 %zmm0, {{[0-9]+}}(%rsp) ; AVX512F-NEXT: andl $31, %eax -; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm0 -; AVX512F-NEXT: vpmovzxwd {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero -; AVX512F-NEXT: vpcompressd %zmm0, %zmm0 {%k1} {z} -; AVX512F-NEXT: vpmovdw %zmm0, (%rsp,%rax,2) -; AVX512F-NEXT: vextracti64x4 $1, %zmm2, %ymm0 -; AVX512F-NEXT: vpsllw $15, %ymm4, %ymm1 -; AVX512F-NEXT: vpsraw $15, %ymm1, %ymm1 -; AVX512F-NEXT: vpblendvb %ymm1, {{[0-9]+}}(%rsp), %ymm0, %ymm0 -; AVX512F-NEXT: vpsllw $15, %ymm3, %ymm1 -; AVX512F-NEXT: vpsraw $15, %ymm1, %ymm1 -; AVX512F-NEXT: vpblendvb %ymm1, (%rsp), %ymm2, %ymm1 -; AVX512F-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0 +; AVX512F-NEXT: vmovdqa64 %zmm1, 64(%rsp,%rax,4) +; AVX512F-NEXT: vmovdqa64 %zmm2, {{[0-9]+}}(%rsp) +; AVX512F-NEXT: andl $31, %ecx +; AVX512F-NEXT: vmovdqa64 %zmm3, 192(%rsp,%rcx,4) +; AVX512F-NEXT: vmovaps {{[0-9]+}}(%rsp), %zmm0 +; AVX512F-NEXT: vmovaps {{[0-9]+}}(%rsp), %zmm1 +; AVX512F-NEXT: vmovaps %zmm0, {{[0-9]+}}(%rsp) +; AVX512F-NEXT: andl $63, %edx +; AVX512F-NEXT: vmovaps {{[0-9]+}}(%rsp), %zmm0 +; AVX512F-NEXT: vmovaps {{[0-9]+}}(%rsp), %zmm2 +; AVX512F-NEXT: vmovaps %zmm0, 320(%rsp,%rdx,4) +; AVX512F-NEXT: 
vmovaps %zmm1, {{[0-9]+}}(%rsp) +; AVX512F-NEXT: vmovaps %zmm2, 384(%rsp,%rdx,4) +; AVX512F-NEXT: vmovaps {{[0-9]+}}(%rsp), %zmm0 +; AVX512F-NEXT: vmovaps {{[0-9]+}}(%rsp), %zmm1 +; AVX512F-NEXT: vmovaps {{[0-9]+}}(%rsp), %zmm2 +; AVX512F-NEXT: vmovaps {{[0-9]+}}(%rsp), %zmm3 ; AVX512F-NEXT: movq %rbp, %rsp ; AVX512F-NEXT: popq %rbp -; AVX512F-NEXT: .cfi_def_cfa %rsp, 8 ; AVX512F-NEXT: retq ; -; AVX512VL-LABEL: test_compress_v32i16: -; AVX512VL: # %bb.0: -; AVX512VL-NEXT: vpsllw $7, %ymm1, %ymm1 -; AVX512VL-NEXT: vpmovb2m %ymm1, %k1 -; AVX512VL-NEXT: vpcompressw %zmm0, %zmm2 {%k1} -; AVX512VL-NEXT: vmovdqa64 %zmm2, %zmm0 -; AVX512VL-NEXT: retq - %out = call <32 x i16> @llvm.experimental.vector.compress(<32 x i16> %vec, <32 x i1> %mask, <32 x i16> %passthru) - ret <32 x i16> %out -} - -define <64 x i32> @test_compress_large(<64 x i1> %mask, <64 x i32> %vec, <64 x i32> %passthru) { ; AVX512VL-LABEL: test_compress_large: ; AVX512VL: # %bb.0: ; AVX512VL-NEXT: pushq %rbp -; AVX512VL-NEXT: .cfi_def_cfa_offset 16 -; AVX512VL-NEXT: .cfi_offset %rbp, -16 ; AVX512VL-NEXT: movq %rsp, %rbp -; AVX512VL-NEXT: .cfi_def_cfa_register %rbp ; AVX512VL-NEXT: andq $-64, %rsp ; AVX512VL-NEXT: subq $576, %rsp # imm = 0x240 ; AVX512VL-NEXT: vpsllw $7, %zmm0, %zmm0 @@ -896,13 +4380,12 @@ define <64 x i32> @test_compress_large(<64 x i1> %mask, <64 x i32> %vec, <64 x i ; AVX512VL-NEXT: vmovaps {{[0-9]+}}(%rsp), %zmm3 ; AVX512VL-NEXT: movq %rbp, %rsp ; AVX512VL-NEXT: popq %rbp -; AVX512VL-NEXT: .cfi_def_cfa %rsp, 8 ; AVX512VL-NEXT: retq %out = call <64 x i32> @llvm.experimental.vector.compress(<64 x i32> %vec, <64 x i1> %mask, <64 x i32> undef) ret <64 x i32> %out } -define <4 x i32> @test_compress_all_const() { +define <4 x i32> @test_compress_all_const() nounwind { ; AVX2-LABEL: test_compress_all_const: ; AVX2: # %bb.0: ; AVX2-NEXT: vmovsd {{.*#+}} xmm0 = [5,9,0,0] @@ -923,7 +4406,7 @@ define <4 x i32> @test_compress_all_const() { ret <4 x i32> %out } -define <4 x i32> 
@test_compress_const_mask(<4 x i32> %vec) { +define <4 x i32> @test_compress_const_mask(<4 x i32> %vec) nounwind { ; CHECK-LABEL: test_compress_const_mask: ; CHECK: # %bb.0: ; CHECK-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,3,2,3] @@ -932,7 +4415,7 @@ define <4 x i32> @test_compress_const_mask(<4 x i32> %vec) { ret <4 x i32> %out } -define <4 x i32> @test_compress_const_mask_passthrough(<4 x i32> %vec, <4 x i32> %passthru) { +define <4 x i32> @test_compress_const_mask_passthrough(<4 x i32> %vec, <4 x i32> %passthru) nounwind { ; CHECK-LABEL: test_compress_const_mask_passthrough: ; CHECK: # %bb.0: ; CHECK-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,3],xmm1[2,3] @@ -941,7 +4424,7 @@ define <4 x i32> @test_compress_const_mask_passthrough(<4 x i32> %vec, <4 x i32> ret <4 x i32> %out } -define <4 x i32> @test_compress_const_mask_const_passthrough(<4 x i32> %vec) { +define <4 x i32> @test_compress_const_mask_const_passthrough(<4 x i32> %vec) nounwind { ; CHECK-LABEL: test_compress_const_mask_const_passthrough: ; CHECK: # %bb.0: ; CHECK-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,3,2,3] @@ -956,7 +4439,7 @@ define <4 x i32> @test_compress_const_mask_const_passthrough(<4 x i32> %vec) { ; We pass a placeholder value for the const_mask* tests to check that they are converted to a no-op by simply copying ; the second vector input register to the return register or doing nothing. 
-define <4 x i32> @test_compress_const_splat1_mask(<4 x i32> %ignore, <4 x i32> %vec) { +define <4 x i32> @test_compress_const_splat1_mask(<4 x i32> %ignore, <4 x i32> %vec) nounwind { ; CHECK-LABEL: test_compress_const_splat1_mask: ; CHECK: # %bb.0: ; CHECK-NEXT: vmovaps %xmm1, %xmm0 @@ -964,21 +4447,21 @@ define <4 x i32> @test_compress_const_splat1_mask(<4 x i32> %ignore, <4 x i32> % %out = call <4 x i32> @llvm.experimental.vector.compress(<4 x i32> %vec, <4 x i1> splat (i1 -1), <4 x i32> undef) ret <4 x i32> %out } -define <4 x i32> @test_compress_const_splat0_mask(<4 x i32> %ignore, <4 x i32> %vec) { +define <4 x i32> @test_compress_const_splat0_mask(<4 x i32> %ignore, <4 x i32> %vec) nounwind { ; CHECK-LABEL: test_compress_const_splat0_mask: ; CHECK: # %bb.0: ; CHECK-NEXT: retq %out = call <4 x i32> @llvm.experimental.vector.compress(<4 x i32> %vec, <4 x i1> splat (i1 0), <4 x i32> undef) ret <4 x i32> %out } -define <4 x i32> @test_compress_undef_mask(<4 x i32> %ignore, <4 x i32> %vec) { +define <4 x i32> @test_compress_undef_mask(<4 x i32> %ignore, <4 x i32> %vec) nounwind { ; CHECK-LABEL: test_compress_undef_mask: ; CHECK: # %bb.0: ; CHECK-NEXT: retq %out = call <4 x i32> @llvm.experimental.vector.compress(<4 x i32> %vec, <4 x i1> undef, <4 x i32> undef) ret <4 x i32> %out } -define <4 x i32> @test_compress_const_splat0_mask_with_passthru(<4 x i32> %ignore, <4 x i32> %vec, <4 x i32> %passthru) { +define <4 x i32> @test_compress_const_splat0_mask_with_passthru(<4 x i32> %ignore, <4 x i32> %vec, <4 x i32> %passthru) nounwind { ; CHECK-LABEL: test_compress_const_splat0_mask_with_passthru: ; CHECK: # %bb.0: ; CHECK-NEXT: vmovaps %xmm2, %xmm0 @@ -986,7 +4469,7 @@ define <4 x i32> @test_compress_const_splat0_mask_with_passthru(<4 x i32> %ignor %out = call <4 x i32> @llvm.experimental.vector.compress(<4 x i32> %vec, <4 x i1> splat (i1 0), <4 x i32> %passthru) ret <4 x i32> %out } -define <4 x i32> @test_compress_const_splat0_mask_without_passthru(<4 x i32> 
%ignore, <4 x i32> %vec) { +define <4 x i32> @test_compress_const_splat0_mask_without_passthru(<4 x i32> %ignore, <4 x i32> %vec) nounwind { ; CHECK-LABEL: test_compress_const_splat0_mask_without_passthru: ; CHECK: # %bb.0: ; CHECK-NEXT: retq @@ -994,7 +4477,98 @@ define <4 x i32> @test_compress_const_splat0_mask_without_passthru(<4 x i32> %ig ret <4 x i32> %out } -define <4 x i8> @test_compress_small(<4 x i8> %vec, <4 x i1> %mask) { +define <4 x i8> @test_compress_small(<4 x i8> %vec, <4 x i1> %mask) nounwind { +; AVX2-LABEL: test_compress_small: +; AVX2: # %bb.0: +; AVX2-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[0,4,8,12],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero +; AVX2-NEXT: vpsllw $7, %xmm1, %xmm1 +; AVX2-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 +; AVX2-NEXT: vpxor %xmm2, %xmm2, %xmm2 +; AVX2-NEXT: vpcmpgtb %xmm1, %xmm2, %xmm1 +; AVX2-NEXT: vpextrb $0, %xmm0, -{{[0-9]+}}(%rsp) +; AVX2-NEXT: vmovd %xmm1, %eax +; AVX2-NEXT: andl $1, %eax +; AVX2-NEXT: vpextrb $1, %xmm0, -24(%rsp,%rax) +; AVX2-NEXT: vpextrb $1, %xmm1, %ecx +; AVX2-NEXT: andl $1, %ecx +; AVX2-NEXT: addq %rax, %rcx +; AVX2-NEXT: vpextrb $2, %xmm0, -24(%rsp,%rcx) +; AVX2-NEXT: vpextrb $2, %xmm1, %eax +; AVX2-NEXT: andl $1, %eax +; AVX2-NEXT: addq %rcx, %rax +; AVX2-NEXT: vpextrb $3, %xmm0, -24(%rsp,%rax) +; AVX2-NEXT: vpextrb $3, %xmm1, %ecx +; AVX2-NEXT: andl $1, %ecx +; AVX2-NEXT: addq %rax, %rcx +; AVX2-NEXT: vpextrb $4, %xmm0, -24(%rsp,%rcx) +; AVX2-NEXT: vpextrb $4, %xmm1, %eax +; AVX2-NEXT: andl $1, %eax +; AVX2-NEXT: addq %rcx, %rax +; AVX2-NEXT: vpextrb $5, %xmm1, %ecx +; AVX2-NEXT: andl $1, %ecx +; AVX2-NEXT: addq %rax, %rcx +; AVX2-NEXT: # kill: def $eax killed $eax killed $rax def $rax +; AVX2-NEXT: andl $15, %eax +; AVX2-NEXT: vpextrb $5, %xmm0, -24(%rsp,%rax) +; AVX2-NEXT: vpextrb $6, %xmm1, %eax +; AVX2-NEXT: andl $1, %eax +; AVX2-NEXT: addq %rcx, %rax +; AVX2-NEXT: # kill: def $ecx killed $ecx killed $rcx def $rcx +; AVX2-NEXT: andl $15, %ecx +; AVX2-NEXT: 
vpextrb $6, %xmm0, -24(%rsp,%rcx) +; AVX2-NEXT: vpextrb $7, %xmm1, %ecx +; AVX2-NEXT: andl $1, %ecx +; AVX2-NEXT: addq %rax, %rcx +; AVX2-NEXT: # kill: def $eax killed $eax killed $rax def $rax +; AVX2-NEXT: andl $15, %eax +; AVX2-NEXT: vpextrb $7, %xmm0, -24(%rsp,%rax) +; AVX2-NEXT: vpextrb $8, %xmm1, %eax +; AVX2-NEXT: andl $1, %eax +; AVX2-NEXT: addq %rcx, %rax +; AVX2-NEXT: # kill: def $ecx killed $ecx killed $rcx def $rcx +; AVX2-NEXT: andl $15, %ecx +; AVX2-NEXT: vpextrb $8, %xmm0, -24(%rsp,%rcx) +; AVX2-NEXT: vpextrb $9, %xmm1, %ecx +; AVX2-NEXT: andl $1, %ecx +; AVX2-NEXT: addq %rax, %rcx +; AVX2-NEXT: # kill: def $eax killed $eax killed $rax def $rax +; AVX2-NEXT: andl $15, %eax +; AVX2-NEXT: vpextrb $9, %xmm0, -24(%rsp,%rax) +; AVX2-NEXT: vpextrb $10, %xmm1, %eax +; AVX2-NEXT: andl $1, %eax +; AVX2-NEXT: addq %rcx, %rax +; AVX2-NEXT: # kill: def $ecx killed $ecx killed $rcx def $rcx +; AVX2-NEXT: andl $15, %ecx +; AVX2-NEXT: vpextrb $10, %xmm0, -24(%rsp,%rcx) +; AVX2-NEXT: vpextrb $11, %xmm1, %ecx +; AVX2-NEXT: andl $1, %ecx +; AVX2-NEXT: addq %rax, %rcx +; AVX2-NEXT: # kill: def $eax killed $eax killed $rax def $rax +; AVX2-NEXT: andl $15, %eax +; AVX2-NEXT: vpextrb $11, %xmm0, -24(%rsp,%rax) +; AVX2-NEXT: vpextrb $12, %xmm1, %eax +; AVX2-NEXT: andl $1, %eax +; AVX2-NEXT: addq %rcx, %rax +; AVX2-NEXT: # kill: def $ecx killed $ecx killed $rcx def $rcx +; AVX2-NEXT: andl $15, %ecx +; AVX2-NEXT: vpextrb $12, %xmm0, -24(%rsp,%rcx) +; AVX2-NEXT: vpextrb $13, %xmm1, %ecx +; AVX2-NEXT: andl $1, %ecx +; AVX2-NEXT: addq %rax, %rcx +; AVX2-NEXT: # kill: def $eax killed $eax killed $rax def $rax +; AVX2-NEXT: andl $15, %eax +; AVX2-NEXT: vpextrb $13, %xmm0, -24(%rsp,%rax) +; AVX2-NEXT: vpextrb $14, %xmm1, %eax +; AVX2-NEXT: andl $1, %eax +; AVX2-NEXT: addl %ecx, %eax +; AVX2-NEXT: # kill: def $ecx killed $ecx killed $rcx def $rcx +; AVX2-NEXT: andl $15, %ecx +; AVX2-NEXT: vpextrb $14, %xmm0, -24(%rsp,%rcx) +; AVX2-NEXT: andl $15, %eax +; AVX2-NEXT: vpextrb $15, 
%xmm0, -24(%rsp,%rax) +; AVX2-NEXT: vmovaps -{{[0-9]+}}(%rsp), %xmm0 +; AVX2-NEXT: retq +; ; AVX512F-LABEL: test_compress_small: ; AVX512F: # %bb.0: ; AVX512F-NEXT: vpslld $31, %xmm1, %xmm1 @@ -1017,7 +4591,7 @@ define <4 x i8> @test_compress_small(<4 x i8> %vec, <4 x i1> %mask) { ret <4 x i8> %out } -define <4 x i4> @test_compress_illegal_element_type(<4 x i4> %vec, <4 x i1> %mask) { +define <4 x i4> @test_compress_illegal_element_type(<4 x i4> %vec, <4 x i1> %mask) nounwind { ; AVX2-LABEL: test_compress_illegal_element_type: ; AVX2: # %bb.0: ; AVX2-NEXT: vpslld $31, %xmm1, %xmm1 @@ -1059,7 +4633,7 @@ define <4 x i4> @test_compress_illegal_element_type(<4 x i4> %vec, <4 x i1> %mas ret <4 x i4> %out } -define <3 x i32> @test_compress_narrow(<3 x i32> %vec, <3 x i1> %mask) { +define <3 x i32> @test_compress_narrow(<3 x i32> %vec, <3 x i1> %mask) nounwind { ; AVX2-LABEL: test_compress_narrow: ; AVX2: # %bb.0: ; AVX2-NEXT: vmovd %edi, %xmm1 @@ -1132,7 +4706,7 @@ define <3 x i32> @test_compress_narrow(<3 x i32> %vec, <3 x i1> %mask) { ret <3 x i32> %out } -define <3 x i3> @test_compress_narrow_illegal_element_type(<3 x i3> %vec, <3 x i1> %mask) { +define <3 x i3> @test_compress_narrow_illegal_element_type(<3 x i3> %vec, <3 x i1> %mask) nounwind { ; AVX2-LABEL: test_compress_narrow_illegal_element_type: ; AVX2: # %bb.0: ; AVX2-NEXT: vmovd %ecx, %xmm0 @@ -1222,7 +4796,7 @@ define <3 x i3> @test_compress_narrow_illegal_element_type(<3 x i3> %vec, <3 x i ret <3 x i3> %out } -define <4 x i32> @test_compress_v4i32_zero_passthru(<4 x i32> %vec, <4 x i1> %mask) { +define <4 x i32> @test_compress_v4i32_zero_passthru(<4 x i32> %vec, <4 x i1> %mask) nounwind { ; AVX2-LABEL: test_compress_v4i32_zero_passthru: ; AVX2: # %bb.0: ; AVX2-NEXT: vpslld $31, %xmm1, %xmm1 diff --git a/llvm/test/CodeGen/X86/vector-half-conversions.ll b/llvm/test/CodeGen/X86/vector-half-conversions.ll index 54acd012d1fe4..ec916060563a7 100644 --- a/llvm/test/CodeGen/X86/vector-half-conversions.ll +++ 
b/llvm/test/CodeGen/X86/vector-half-conversions.ll @@ -814,15 +814,13 @@ define float @load_cvt_i16_to_f32(ptr %a0) nounwind { ; ; F16C-LABEL: load_cvt_i16_to_f32: ; F16C: # %bb.0: -; F16C-NEXT: movzwl (%rdi), %eax -; F16C-NEXT: vmovd %eax, %xmm0 +; F16C-NEXT: vpinsrw $0, (%rdi), %xmm0, %xmm0 ; F16C-NEXT: vcvtph2ps %xmm0, %xmm0 ; F16C-NEXT: retq ; ; AVX512-LABEL: load_cvt_i16_to_f32: ; AVX512: # %bb.0: -; AVX512-NEXT: movzwl (%rdi), %eax -; AVX512-NEXT: vmovd %eax, %xmm0 +; AVX512-NEXT: vpinsrw $0, (%rdi), %xmm0, %xmm0 ; AVX512-NEXT: vcvtph2ps %xmm0, %xmm0 ; AVX512-NEXT: retq %1 = load i16, ptr %a0 @@ -1830,16 +1828,14 @@ define double @load_cvt_i16_to_f64(ptr %a0) nounwind { ; ; F16C-LABEL: load_cvt_i16_to_f64: ; F16C: # %bb.0: -; F16C-NEXT: movzwl (%rdi), %eax -; F16C-NEXT: vmovd %eax, %xmm0 +; F16C-NEXT: vpinsrw $0, (%rdi), %xmm0, %xmm0 ; F16C-NEXT: vcvtph2ps %xmm0, %xmm0 ; F16C-NEXT: vcvtss2sd %xmm0, %xmm0, %xmm0 ; F16C-NEXT: retq ; ; AVX512-LABEL: load_cvt_i16_to_f64: ; AVX512: # %bb.0: -; AVX512-NEXT: movzwl (%rdi), %eax -; AVX512-NEXT: vmovd %eax, %xmm0 +; AVX512-NEXT: vpinsrw $0, (%rdi), %xmm0, %xmm0 ; AVX512-NEXT: vcvtph2ps %xmm0, %xmm0 ; AVX512-NEXT: vcvtss2sd %xmm0, %xmm0, %xmm0 ; AVX512-NEXT: retq diff --git a/llvm/test/CodeGen/X86/xor-lea.ll b/llvm/test/CodeGen/X86/xor-lea.ll index 10e9525a2706a..d50752e48d293 100644 --- a/llvm/test/CodeGen/X86/xor-lea.ll +++ b/llvm/test/CodeGen/X86/xor-lea.ll @@ -327,7 +327,8 @@ define i32 @xor_shl_sminval_i32(i32 %x) { ; X86-LABEL: xor_shl_sminval_i32: ; X86: # %bb.0: ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-NEXT: leal -2147483648(,%eax,8), %eax +; X86-NEXT: movl $-2147483648, %ecx # imm = 0x80000000 +; X86-NEXT: leal (%ecx,%eax,8), %eax ; X86-NEXT: retl ; ; X64-LABEL: xor_shl_sminval_i32: diff --git a/llvm/test/DebugInfo/NVPTX/debug-info.ll b/llvm/test/DebugInfo/NVPTX/debug-info.ll index c926229f96e38..44884381e082e 100644 --- a/llvm/test/DebugInfo/NVPTX/debug-info.ll +++ 
b/llvm/test/DebugInfo/NVPTX/debug-info.ll @@ -199,6 +199,8 @@ if.end: ; preds = %if.then, %entry ; CHECK-NEXT:.b8 19 // DW_FORM_ref4 ; CHECK-NEXT:.b8 60 // DW_AT_declaration ; CHECK-NEXT:.b8 12 // DW_FORM_flag +; CHECK-NEXT:.b8 100 // DW_AT_object_pointer +; CHECK-NEXT:.b8 19 // DW_FORM_ref4 ; CHECK-NEXT:.b8 63 // DW_AT_external ; CHECK-NEXT:.b8 12 // DW_FORM_flag ; CHECK-NEXT:.b8 0 // EOM(1) @@ -223,6 +225,8 @@ if.end: ; preds = %if.then, %entry ; CHECK-NEXT:.b8 11 // DW_FORM_data1 ; CHECK-NEXT:.b8 60 // DW_AT_declaration ; CHECK-NEXT:.b8 12 // DW_FORM_flag +; CHECK-NEXT:.b8 100 // DW_AT_object_pointer +; CHECK-NEXT:.b8 19 // DW_FORM_ref4 ; CHECK-NEXT:.b8 63 // DW_AT_external ; CHECK-NEXT:.b8 12 // DW_FORM_flag ; CHECK-NEXT:.b8 50 // DW_AT_accessibility @@ -250,6 +254,8 @@ if.end: ; preds = %if.then, %entry ; CHECK-NEXT:.b8 11 // DW_FORM_data1 ; CHECK-NEXT:.b8 60 // DW_AT_declaration ; CHECK-NEXT:.b8 12 // DW_FORM_flag +; CHECK-NEXT:.b8 100 // DW_AT_object_pointer +; CHECK-NEXT:.b8 19 // DW_FORM_ref4 ; CHECK-NEXT:.b8 63 // DW_AT_external ; CHECK-NEXT:.b8 12 // DW_FORM_flag ; CHECK-NEXT:.b8 50 // DW_AT_accessibility @@ -272,6 +278,8 @@ if.end: ; preds = %if.then, %entry ; CHECK-NEXT:.b8 19 // DW_FORM_ref4 ; CHECK-NEXT:.b8 60 // DW_AT_declaration ; CHECK-NEXT:.b8 12 // DW_FORM_flag +; CHECK-NEXT:.b8 100 // DW_AT_object_pointer +; CHECK-NEXT:.b8 19 // DW_FORM_ref4 ; CHECK-NEXT:.b8 63 // DW_AT_external ; CHECK-NEXT:.b8 12 // DW_FORM_flag ; CHECK-NEXT:.b8 50 // DW_AT_accessibility @@ -373,6 +381,8 @@ if.end: ; preds = %if.then, %entry ; CHECK-NEXT:.b8 5 // DW_FORM_data2 ; CHECK-NEXT:.b8 60 // DW_AT_declaration ; CHECK-NEXT:.b8 12 // DW_FORM_flag +; CHECK-NEXT:.b8 100 // DW_AT_object_pointer +; CHECK-NEXT:.b8 19 // DW_FORM_ref4 ; CHECK-NEXT:.b8 63 // DW_AT_external ; CHECK-NEXT:.b8 12 // DW_FORM_flag ; CHECK-NEXT:.b8 0 // EOM(1) @@ -393,6 +403,8 @@ if.end: ; preds = %if.then, %entry ; CHECK-NEXT:.b8 19 // DW_FORM_ref4 ; CHECK-NEXT:.b8 60 // DW_AT_declaration ; 
CHECK-NEXT:.b8 12 // DW_FORM_flag +; CHECK-NEXT:.b8 100 // DW_AT_object_pointer +; CHECK-NEXT:.b8 19 // DW_FORM_ref4 ; CHECK-NEXT:.b8 63 // DW_AT_external ; CHECK-NEXT:.b8 12 // DW_FORM_flag ; CHECK-NEXT:.b8 0 // EOM(1) @@ -727,6 +739,21 @@ if.end: ; preds = %if.then, %entry ; CHECK-NEXT:.b8 0 // EOM(2) ; CHECK-NEXT:.b8 45 // Abbreviation Code ; CHECK-NEXT:.b8 46 // DW_TAG_subprogram +; CHECK-NEXT:.b8 1 // DW_CHILDREN_yes +; CHECK-NEXT:.b8 3 // DW_AT_name +; CHECK-NEXT:.b8 8 // DW_FORM_string +; CHECK-NEXT:.b8 58 // DW_AT_decl_file +; CHECK-NEXT:.b8 11 // DW_FORM_data1 +; CHECK-NEXT:.b8 59 // DW_AT_decl_line +; CHECK-NEXT:.b8 5 // DW_FORM_data2 +; CHECK-NEXT:.b8 60 // DW_AT_declaration +; CHECK-NEXT:.b8 12 // DW_FORM_flag +; CHECK-NEXT:.b8 63 // DW_AT_external +; CHECK-NEXT:.b8 12 // DW_FORM_flag +; CHECK-NEXT:.b8 0 // EOM(1) +; CHECK-NEXT:.b8 0 // EOM(2) +; CHECK-NEXT:.b8 46 // Abbreviation Code +; CHECK-NEXT:.b8 46 // DW_TAG_subprogram ; CHECK-NEXT:.b8 0 // DW_CHILDREN_no ; CHECK-NEXT:.b8 3 // DW_AT_name ; CHECK-NEXT:.b8 8 // DW_FORM_string @@ -742,7 +769,27 @@ if.end: ; preds = %if.then, %entry ; CHECK-NEXT:.b8 12 // DW_FORM_flag ; CHECK-NEXT:.b8 0 // EOM(1) ; CHECK-NEXT:.b8 0 // EOM(2) -; CHECK-NEXT:.b8 46 // Abbreviation Code +; CHECK-NEXT:.b8 47 // Abbreviation Code +; CHECK-NEXT:.b8 46 // DW_TAG_subprogram +; CHECK-NEXT:.b8 1 // DW_CHILDREN_yes +; CHECK-NEXT:.b8 135 // DW_AT_MIPS_linkage_name +; CHECK-NEXT:.b8 64 +; CHECK-NEXT:.b8 8 // DW_FORM_string +; CHECK-NEXT:.b8 3 // DW_AT_name +; CHECK-NEXT:.b8 8 // DW_FORM_string +; CHECK-NEXT:.b8 58 // DW_AT_decl_file +; CHECK-NEXT:.b8 11 // DW_FORM_data1 +; CHECK-NEXT:.b8 59 // DW_AT_decl_line +; CHECK-NEXT:.b8 11 // DW_FORM_data1 +; CHECK-NEXT:.b8 73 // DW_AT_type +; CHECK-NEXT:.b8 19 // DW_FORM_ref4 +; CHECK-NEXT:.b8 60 // DW_AT_declaration +; CHECK-NEXT:.b8 12 // DW_FORM_flag +; CHECK-NEXT:.b8 63 // DW_AT_external +; CHECK-NEXT:.b8 12 // DW_FORM_flag +; CHECK-NEXT:.b8 0 // EOM(1) +; CHECK-NEXT:.b8 0 // EOM(2) +; 
CHECK-NEXT:.b8 48 // Abbreviation Code ; CHECK-NEXT:.b8 46 // DW_TAG_subprogram ; CHECK-NEXT:.b8 1 // DW_CHILDREN_yes ; CHECK-NEXT:.b8 135 // DW_AT_MIPS_linkage_name @@ -764,12 +811,12 @@ if.end: ; preds = %if.then, %entry ; CHECK-NEXT: } ; CHECK-NEXT: .section .debug_info ; CHECK-NEXT: { -; CHECK-NEXT:.b32 10035 // Length of Unit +; CHECK-NEXT:.b32 10107 // Length of Unit ; CHECK-NEXT:.b8 2 // DWARF version number ; CHECK-NEXT:.b8 0 ; CHECK-NEXT:.b32 .debug_abbrev // Offset Into Abbrev. Section ; CHECK-NEXT:.b8 8 // Address Size (in bytes) -; CHECK-NEXT:.b8 1 // Abbrev [1] 0xb:0x272c DW_TAG_compile_unit +; CHECK-NEXT:.b8 1 // Abbrev [1] 0xb:0x2774 DW_TAG_compile_unit ; CHECK-NEXT:.b8 0 // DW_AT_producer ; CHECK-NEXT:.b8 4 // DW_AT_language ; CHECK-NEXT:.b8 0 @@ -804,7 +851,7 @@ if.end: ; preds = %if.then, %entry ; CHECK-NEXT:.b8 114 ; CHECK-NEXT:.b8 121 ; CHECK-NEXT:.b8 0 -; CHECK-NEXT:.b8 2 // Abbrev [2] 0x31:0x22a DW_TAG_structure_type +; CHECK-NEXT:.b8 2 // Abbrev [2] 0x31:0x23e DW_TAG_structure_type ; CHECK-NEXT:.b8 95 // DW_AT_name ; CHECK-NEXT:.b8 95 ; CHECK-NEXT:.b8 99 @@ -907,7 +954,7 @@ if.end: ; preds = %if.then, %entry ; CHECK-NEXT:.b8 0 ; CHECK-NEXT:.b8 2 // DW_AT_decl_file ; CHECK-NEXT:.b8 78 // DW_AT_decl_line -; CHECK-NEXT:.b32 603 // DW_AT_type +; CHECK-NEXT:.b32 623 // DW_AT_type ; CHECK-NEXT:.b8 1 // DW_AT_declaration ; CHECK-NEXT:.b8 1 // DW_AT_external ; CHECK-NEXT:.b8 3 // Abbrev [3] 0x9e:0x4f DW_TAG_subprogram @@ -983,7 +1030,7 @@ if.end: ; preds = %if.then, %entry ; CHECK-NEXT:.b8 0 ; CHECK-NEXT:.b8 2 // DW_AT_decl_file ; CHECK-NEXT:.b8 79 // DW_AT_decl_line -; CHECK-NEXT:.b32 603 // DW_AT_type +; CHECK-NEXT:.b32 623 // DW_AT_type ; CHECK-NEXT:.b8 1 // DW_AT_declaration ; CHECK-NEXT:.b8 1 // DW_AT_external ; CHECK-NEXT:.b8 3 // Abbrev [3] 0xed:0x4f DW_TAG_subprogram @@ -1059,10 +1106,10 @@ if.end: ; preds = %if.then, %entry ; CHECK-NEXT:.b8 0 ; CHECK-NEXT:.b8 2 // DW_AT_decl_file ; CHECK-NEXT:.b8 80 // DW_AT_decl_line -; CHECK-NEXT:.b32 603 
// DW_AT_type +; CHECK-NEXT:.b32 623 // DW_AT_type ; CHECK-NEXT:.b8 1 // DW_AT_declaration ; CHECK-NEXT:.b8 1 // DW_AT_external -; CHECK-NEXT:.b8 4 // Abbrev [4] 0x13c:0x49 DW_TAG_subprogram +; CHECK-NEXT:.b8 4 // Abbrev [4] 0x13c:0x4d DW_TAG_subprogram ; CHECK-NEXT:.b8 95 // DW_AT_MIPS_linkage_name ; CHECK-NEXT:.b8 90 ; CHECK-NEXT:.b8 78 @@ -1122,14 +1169,15 @@ if.end: ; preds = %if.then, %entry ; CHECK-NEXT:.b8 0 ; CHECK-NEXT:.b8 2 // DW_AT_decl_file ; CHECK-NEXT:.b8 83 // DW_AT_decl_line -; CHECK-NEXT:.b32 619 // DW_AT_type +; CHECK-NEXT:.b32 639 // DW_AT_type ; CHECK-NEXT:.b8 1 // DW_AT_declaration +; CHECK-NEXT:.b32 386 // DW_AT_object_pointer ; CHECK-NEXT:.b8 1 // DW_AT_external -; CHECK-NEXT:.b8 5 // Abbrev [5] 0x17e:0x6 DW_TAG_formal_parameter -; CHECK-NEXT:.b32 666 // DW_AT_type +; CHECK-NEXT:.b8 5 // Abbrev [5] 0x182:0x6 DW_TAG_formal_parameter +; CHECK-NEXT:.b32 686 // DW_AT_type ; CHECK-NEXT:.b8 1 // DW_AT_artificial ; CHECK-NEXT:.b8 0 // End Of Children Mark -; CHECK-NEXT:.b8 6 // Abbrev [6] 0x185:0x27 DW_TAG_subprogram +; CHECK-NEXT:.b8 6 // Abbrev [6] 0x189:0x2b DW_TAG_subprogram ; CHECK-NEXT:.b8 95 // DW_AT_name ; CHECK-NEXT:.b8 95 ; CHECK-NEXT:.b8 99 @@ -1159,14 +1207,15 @@ if.end: ; preds = %if.then, %entry ; CHECK-NEXT:.b8 2 // DW_AT_decl_file ; CHECK-NEXT:.b8 85 // DW_AT_decl_line ; CHECK-NEXT:.b8 1 // DW_AT_declaration +; CHECK-NEXT:.b32 429 // DW_AT_object_pointer ; CHECK-NEXT:.b8 1 // DW_AT_external ; CHECK-NEXT:.b8 3 // DW_AT_accessibility ; CHECK-NEXT: // DW_ACCESS_private -; CHECK-NEXT:.b8 5 // Abbrev [5] 0x1a5:0x6 DW_TAG_formal_parameter -; CHECK-NEXT:.b32 676 // DW_AT_type +; CHECK-NEXT:.b8 5 // Abbrev [5] 0x1ad:0x6 DW_TAG_formal_parameter +; CHECK-NEXT:.b32 696 // DW_AT_type ; CHECK-NEXT:.b8 1 // DW_AT_artificial ; CHECK-NEXT:.b8 0 // End Of Children Mark -; CHECK-NEXT:.b8 6 // Abbrev [6] 0x1ac:0x2c DW_TAG_subprogram +; CHECK-NEXT:.b8 6 // Abbrev [6] 0x1b4:0x30 DW_TAG_subprogram ; CHECK-NEXT:.b8 95 // DW_AT_name ; CHECK-NEXT:.b8 95 ; 
CHECK-NEXT:.b8 99 @@ -1196,16 +1245,17 @@ if.end: ; preds = %if.then, %entry ; CHECK-NEXT:.b8 2 // DW_AT_decl_file ; CHECK-NEXT:.b8 85 // DW_AT_decl_line ; CHECK-NEXT:.b8 1 // DW_AT_declaration +; CHECK-NEXT:.b32 472 // DW_AT_object_pointer ; CHECK-NEXT:.b8 1 // DW_AT_external ; CHECK-NEXT:.b8 3 // DW_AT_accessibility ; CHECK-NEXT: // DW_ACCESS_private -; CHECK-NEXT:.b8 5 // Abbrev [5] 0x1cc:0x6 DW_TAG_formal_parameter -; CHECK-NEXT:.b32 676 // DW_AT_type +; CHECK-NEXT:.b8 5 // Abbrev [5] 0x1d8:0x6 DW_TAG_formal_parameter +; CHECK-NEXT:.b32 696 // DW_AT_type ; CHECK-NEXT:.b8 1 // DW_AT_artificial -; CHECK-NEXT:.b8 7 // Abbrev [7] 0x1d2:0x5 DW_TAG_formal_parameter -; CHECK-NEXT:.b32 681 // DW_AT_type +; CHECK-NEXT:.b8 7 // Abbrev [7] 0x1de:0x5 DW_TAG_formal_parameter +; CHECK-NEXT:.b32 701 // DW_AT_type ; CHECK-NEXT:.b8 0 // End Of Children Mark -; CHECK-NEXT:.b8 8 // Abbrev [8] 0x1d8:0x43 DW_TAG_subprogram +; CHECK-NEXT:.b8 8 // Abbrev [8] 0x1e4:0x47 DW_TAG_subprogram ; CHECK-NEXT:.b8 95 // DW_AT_MIPS_linkage_name ; CHECK-NEXT:.b8 90 ; CHECK-NEXT:.b8 78 @@ -1258,16 +1308,17 @@ if.end: ; preds = %if.then, %entry ; CHECK-NEXT:.b8 2 // DW_AT_decl_file ; CHECK-NEXT:.b8 85 // DW_AT_decl_line ; CHECK-NEXT:.b8 1 // DW_AT_declaration +; CHECK-NEXT:.b32 543 // DW_AT_object_pointer ; CHECK-NEXT:.b8 1 // DW_AT_external ; CHECK-NEXT:.b8 3 // DW_AT_accessibility ; CHECK-NEXT: // DW_ACCESS_private -; CHECK-NEXT:.b8 5 // Abbrev [5] 0x20f:0x6 DW_TAG_formal_parameter -; CHECK-NEXT:.b32 666 // DW_AT_type +; CHECK-NEXT:.b8 5 // Abbrev [5] 0x21f:0x6 DW_TAG_formal_parameter +; CHECK-NEXT:.b32 686 // DW_AT_type ; CHECK-NEXT:.b8 1 // DW_AT_artificial -; CHECK-NEXT:.b8 7 // Abbrev [7] 0x215:0x5 DW_TAG_formal_parameter -; CHECK-NEXT:.b32 681 // DW_AT_type +; CHECK-NEXT:.b8 7 // Abbrev [7] 0x225:0x5 DW_TAG_formal_parameter +; CHECK-NEXT:.b32 701 // DW_AT_type ; CHECK-NEXT:.b8 0 // End Of Children Mark -; CHECK-NEXT:.b8 9 // Abbrev [9] 0x21b:0x3f DW_TAG_subprogram +; CHECK-NEXT:.b8 9 // 
Abbrev [9] 0x22b:0x43 DW_TAG_subprogram ; CHECK-NEXT:.b8 95 // DW_AT_MIPS_linkage_name ; CHECK-NEXT:.b8 90 ; CHECK-NEXT:.b8 78 @@ -1316,17 +1367,18 @@ if.end: ; preds = %if.then, %entry ; CHECK-NEXT:.b8 0 ; CHECK-NEXT:.b8 2 // DW_AT_decl_file ; CHECK-NEXT:.b8 85 // DW_AT_decl_line -; CHECK-NEXT:.b32 686 // DW_AT_type +; CHECK-NEXT:.b32 706 // DW_AT_type ; CHECK-NEXT:.b8 1 // DW_AT_declaration +; CHECK-NEXT:.b32 615 // DW_AT_object_pointer ; CHECK-NEXT:.b8 1 // DW_AT_external ; CHECK-NEXT:.b8 3 // DW_AT_accessibility ; CHECK-NEXT: // DW_ACCESS_private -; CHECK-NEXT:.b8 5 // Abbrev [5] 0x253:0x6 DW_TAG_formal_parameter -; CHECK-NEXT:.b32 666 // DW_AT_type +; CHECK-NEXT:.b8 5 // Abbrev [5] 0x267:0x6 DW_TAG_formal_parameter +; CHECK-NEXT:.b32 686 // DW_AT_type ; CHECK-NEXT:.b8 1 // DW_AT_artificial ; CHECK-NEXT:.b8 0 // End Of Children Mark ; CHECK-NEXT:.b8 0 // End Of Children Mark -; CHECK-NEXT:.b8 10 // Abbrev [10] 0x25b:0x10 DW_TAG_base_type +; CHECK-NEXT:.b8 10 // Abbrev [10] 0x26f:0x10 DW_TAG_base_type ; CHECK-NEXT:.b8 117 // DW_AT_name ; CHECK-NEXT:.b8 110 ; CHECK-NEXT:.b8 115 @@ -1342,7 +1394,7 @@ if.end: ; preds = %if.then, %entry ; CHECK-NEXT:.b8 0 ; CHECK-NEXT:.b8 7 // DW_AT_encoding ; CHECK-NEXT:.b8 4 // DW_AT_byte_size -; CHECK-NEXT:.b8 2 // Abbrev [2] 0x26b:0x2f DW_TAG_structure_type +; CHECK-NEXT:.b8 2 // Abbrev [2] 0x27f:0x2f DW_TAG_structure_type ; CHECK-NEXT:.b8 117 // DW_AT_name ; CHECK-NEXT:.b8 105 ; CHECK-NEXT:.b8 110 @@ -1352,48 +1404,48 @@ if.end: ; preds = %if.then, %entry ; CHECK-NEXT:.b8 12 // DW_AT_byte_size ; CHECK-NEXT:.b8 3 // DW_AT_decl_file ; CHECK-NEXT:.b8 190 // DW_AT_decl_line -; CHECK-NEXT:.b8 11 // Abbrev [11] 0x275:0xc DW_TAG_member +; CHECK-NEXT:.b8 11 // Abbrev [11] 0x289:0xc DW_TAG_member ; CHECK-NEXT:.b8 120 // DW_AT_name ; CHECK-NEXT:.b8 0 -; CHECK-NEXT:.b32 603 // DW_AT_type +; CHECK-NEXT:.b32 623 // DW_AT_type ; CHECK-NEXT:.b8 3 // DW_AT_decl_file ; CHECK-NEXT:.b8 192 // DW_AT_decl_line ; CHECK-NEXT:.b8 2 // 
DW_AT_data_member_location ; CHECK-NEXT:.b8 35 ; CHECK-NEXT:.b8 0 -; CHECK-NEXT:.b8 11 // Abbrev [11] 0x281:0xc DW_TAG_member +; CHECK-NEXT:.b8 11 // Abbrev [11] 0x295:0xc DW_TAG_member ; CHECK-NEXT:.b8 121 // DW_AT_name ; CHECK-NEXT:.b8 0 -; CHECK-NEXT:.b32 603 // DW_AT_type +; CHECK-NEXT:.b32 623 // DW_AT_type ; CHECK-NEXT:.b8 3 // DW_AT_decl_file ; CHECK-NEXT:.b8 192 // DW_AT_decl_line ; CHECK-NEXT:.b8 2 // DW_AT_data_member_location ; CHECK-NEXT:.b8 35 ; CHECK-NEXT:.b8 4 -; CHECK-NEXT:.b8 11 // Abbrev [11] 0x28d:0xc DW_TAG_member +; CHECK-NEXT:.b8 11 // Abbrev [11] 0x2a1:0xc DW_TAG_member ; CHECK-NEXT:.b8 122 // DW_AT_name ; CHECK-NEXT:.b8 0 -; CHECK-NEXT:.b32 603 // DW_AT_type +; CHECK-NEXT:.b32 623 // DW_AT_type ; CHECK-NEXT:.b8 3 // DW_AT_decl_file ; CHECK-NEXT:.b8 192 // DW_AT_decl_line ; CHECK-NEXT:.b8 2 // DW_AT_data_member_location ; CHECK-NEXT:.b8 35 ; CHECK-NEXT:.b8 8 ; CHECK-NEXT:.b8 0 // End Of Children Mark -; CHECK-NEXT:.b8 12 // Abbrev [12] 0x29a:0x5 DW_TAG_pointer_type -; CHECK-NEXT:.b32 671 // DW_AT_type -; CHECK-NEXT:.b8 13 // Abbrev [13] 0x29f:0x5 DW_TAG_const_type +; CHECK-NEXT:.b8 12 // Abbrev [12] 0x2ae:0x5 DW_TAG_pointer_type +; CHECK-NEXT:.b32 691 // DW_AT_type +; CHECK-NEXT:.b8 13 // Abbrev [13] 0x2b3:0x5 DW_TAG_const_type ; CHECK-NEXT:.b32 49 // DW_AT_type -; CHECK-NEXT:.b8 12 // Abbrev [12] 0x2a4:0x5 DW_TAG_pointer_type +; CHECK-NEXT:.b8 12 // Abbrev [12] 0x2b8:0x5 DW_TAG_pointer_type ; CHECK-NEXT:.b32 49 // DW_AT_type -; CHECK-NEXT:.b8 14 // Abbrev [14] 0x2a9:0x5 DW_TAG_reference_type -; CHECK-NEXT:.b32 671 // DW_AT_type -; CHECK-NEXT:.b8 12 // Abbrev [12] 0x2ae:0x5 DW_TAG_pointer_type +; CHECK-NEXT:.b8 14 // Abbrev [14] 0x2bd:0x5 DW_TAG_reference_type +; CHECK-NEXT:.b32 691 // DW_AT_type +; CHECK-NEXT:.b8 12 // Abbrev [12] 0x2c2:0x5 DW_TAG_pointer_type ; CHECK-NEXT:.b32 49 // DW_AT_type -; CHECK-NEXT:.b8 15 // Abbrev [15] 0x2b3:0x6 DW_TAG_subprogram +; CHECK-NEXT:.b8 15 // Abbrev [15] 0x2c7:0x6 DW_TAG_subprogram ; CHECK-NEXT:.b32 79 
// DW_AT_specification ; CHECK-NEXT:.b8 1 // DW_AT_inline -; CHECK-NEXT:.b8 2 // Abbrev [2] 0x2b9:0x228 DW_TAG_structure_type +; CHECK-NEXT:.b8 2 // Abbrev [2] 0x2cd:0x23c DW_TAG_structure_type ; CHECK-NEXT:.b8 95 // DW_AT_name ; CHECK-NEXT:.b8 95 ; CHECK-NEXT:.b8 99 @@ -1423,7 +1475,7 @@ if.end: ; preds = %if.then, %entry ; CHECK-NEXT:.b8 1 // DW_AT_byte_size ; CHECK-NEXT:.b8 2 // DW_AT_decl_file ; CHECK-NEXT:.b8 88 // DW_AT_decl_line -; CHECK-NEXT:.b8 3 // Abbrev [3] 0x2d7:0x4f DW_TAG_subprogram +; CHECK-NEXT:.b8 3 // Abbrev [3] 0x2eb:0x4f DW_TAG_subprogram ; CHECK-NEXT:.b8 95 // DW_AT_MIPS_linkage_name ; CHECK-NEXT:.b8 90 ; CHECK-NEXT:.b8 78 @@ -1496,10 +1548,10 @@ if.end: ; preds = %if.then, %entry ; CHECK-NEXT:.b8 0 ; CHECK-NEXT:.b8 2 // DW_AT_decl_file ; CHECK-NEXT:.b8 89 // DW_AT_decl_line -; CHECK-NEXT:.b32 603 // DW_AT_type +; CHECK-NEXT:.b32 623 // DW_AT_type ; CHECK-NEXT:.b8 1 // DW_AT_declaration ; CHECK-NEXT:.b8 1 // DW_AT_external -; CHECK-NEXT:.b8 3 // Abbrev [3] 0x326:0x4f DW_TAG_subprogram +; CHECK-NEXT:.b8 3 // Abbrev [3] 0x33a:0x4f DW_TAG_subprogram ; CHECK-NEXT:.b8 95 // DW_AT_MIPS_linkage_name ; CHECK-NEXT:.b8 90 ; CHECK-NEXT:.b8 78 @@ -1572,10 +1624,10 @@ if.end: ; preds = %if.then, %entry ; CHECK-NEXT:.b8 0 ; CHECK-NEXT:.b8 2 // DW_AT_decl_file ; CHECK-NEXT:.b8 90 // DW_AT_decl_line -; CHECK-NEXT:.b32 603 // DW_AT_type +; CHECK-NEXT:.b32 623 // DW_AT_type ; CHECK-NEXT:.b8 1 // DW_AT_declaration ; CHECK-NEXT:.b8 1 // DW_AT_external -; CHECK-NEXT:.b8 3 // Abbrev [3] 0x375:0x4f DW_TAG_subprogram +; CHECK-NEXT:.b8 3 // Abbrev [3] 0x389:0x4f DW_TAG_subprogram ; CHECK-NEXT:.b8 95 // DW_AT_MIPS_linkage_name ; CHECK-NEXT:.b8 90 ; CHECK-NEXT:.b8 78 @@ -1648,10 +1700,10 @@ if.end: ; preds = %if.then, %entry ; CHECK-NEXT:.b8 0 ; CHECK-NEXT:.b8 2 // DW_AT_decl_file ; CHECK-NEXT:.b8 91 // DW_AT_decl_line -; CHECK-NEXT:.b32 603 // DW_AT_type +; CHECK-NEXT:.b32 623 // DW_AT_type ; CHECK-NEXT:.b8 1 // DW_AT_declaration ; CHECK-NEXT:.b8 1 // DW_AT_external -; 
CHECK-NEXT:.b8 4 // Abbrev [4] 0x3c4:0x47 DW_TAG_subprogram +; CHECK-NEXT:.b8 4 // Abbrev [4] 0x3d8:0x4b DW_TAG_subprogram ; CHECK-NEXT:.b8 95 // DW_AT_MIPS_linkage_name ; CHECK-NEXT:.b8 90 ; CHECK-NEXT:.b8 78 @@ -1709,14 +1761,15 @@ if.end: ; preds = %if.then, %entry ; CHECK-NEXT:.b8 0 ; CHECK-NEXT:.b8 2 // DW_AT_decl_file ; CHECK-NEXT:.b8 94 // DW_AT_decl_line -; CHECK-NEXT:.b32 1249 // DW_AT_type +; CHECK-NEXT:.b32 1289 // DW_AT_type ; CHECK-NEXT:.b8 1 // DW_AT_declaration +; CHECK-NEXT:.b32 1052 // DW_AT_object_pointer ; CHECK-NEXT:.b8 1 // DW_AT_external -; CHECK-NEXT:.b8 5 // Abbrev [5] 0x404:0x6 DW_TAG_formal_parameter -; CHECK-NEXT:.b32 1425 // DW_AT_type +; CHECK-NEXT:.b8 5 // Abbrev [5] 0x41c:0x6 DW_TAG_formal_parameter +; CHECK-NEXT:.b32 1477 // DW_AT_type ; CHECK-NEXT:.b8 1 // DW_AT_artificial ; CHECK-NEXT:.b8 0 // End Of Children Mark -; CHECK-NEXT:.b8 6 // Abbrev [6] 0x40b:0x27 DW_TAG_subprogram +; CHECK-NEXT:.b8 6 // Abbrev [6] 0x423:0x2b DW_TAG_subprogram ; CHECK-NEXT:.b8 95 // DW_AT_name ; CHECK-NEXT:.b8 95 ; CHECK-NEXT:.b8 99 @@ -1746,14 +1799,15 @@ if.end: ; preds = %if.then, %entry ; CHECK-NEXT:.b8 2 // DW_AT_decl_file ; CHECK-NEXT:.b8 96 // DW_AT_decl_line ; CHECK-NEXT:.b8 1 // DW_AT_declaration +; CHECK-NEXT:.b32 1095 // DW_AT_object_pointer ; CHECK-NEXT:.b8 1 // DW_AT_external ; CHECK-NEXT:.b8 3 // DW_AT_accessibility ; CHECK-NEXT: // DW_ACCESS_private -; CHECK-NEXT:.b8 5 // Abbrev [5] 0x42b:0x6 DW_TAG_formal_parameter -; CHECK-NEXT:.b32 1435 // DW_AT_type +; CHECK-NEXT:.b8 5 // Abbrev [5] 0x447:0x6 DW_TAG_formal_parameter +; CHECK-NEXT:.b32 1487 // DW_AT_type ; CHECK-NEXT:.b8 1 // DW_AT_artificial ; CHECK-NEXT:.b8 0 // End Of Children Mark -; CHECK-NEXT:.b8 6 // Abbrev [6] 0x432:0x2c DW_TAG_subprogram +; CHECK-NEXT:.b8 6 // Abbrev [6] 0x44e:0x30 DW_TAG_subprogram ; CHECK-NEXT:.b8 95 // DW_AT_name ; CHECK-NEXT:.b8 95 ; CHECK-NEXT:.b8 99 @@ -1783,16 +1837,17 @@ if.end: ; preds = %if.then, %entry ; CHECK-NEXT:.b8 2 // DW_AT_decl_file ; 
CHECK-NEXT:.b8 96 // DW_AT_decl_line ; CHECK-NEXT:.b8 1 // DW_AT_declaration +; CHECK-NEXT:.b32 1138 // DW_AT_object_pointer ; CHECK-NEXT:.b8 1 // DW_AT_external ; CHECK-NEXT:.b8 3 // DW_AT_accessibility ; CHECK-NEXT: // DW_ACCESS_private -; CHECK-NEXT:.b8 5 // Abbrev [5] 0x452:0x6 DW_TAG_formal_parameter -; CHECK-NEXT:.b32 1435 // DW_AT_type +; CHECK-NEXT:.b8 5 // Abbrev [5] 0x472:0x6 DW_TAG_formal_parameter +; CHECK-NEXT:.b32 1487 // DW_AT_type ; CHECK-NEXT:.b8 1 // DW_AT_artificial -; CHECK-NEXT:.b8 7 // Abbrev [7] 0x458:0x5 DW_TAG_formal_parameter -; CHECK-NEXT:.b32 1440 // DW_AT_type +; CHECK-NEXT:.b8 7 // Abbrev [7] 0x478:0x5 DW_TAG_formal_parameter +; CHECK-NEXT:.b32 1492 // DW_AT_type ; CHECK-NEXT:.b8 0 // End Of Children Mark -; CHECK-NEXT:.b8 8 // Abbrev [8] 0x45e:0x43 DW_TAG_subprogram +; CHECK-NEXT:.b8 8 // Abbrev [8] 0x47e:0x47 DW_TAG_subprogram ; CHECK-NEXT:.b8 95 // DW_AT_MIPS_linkage_name ; CHECK-NEXT:.b8 90 ; CHECK-NEXT:.b8 78 @@ -1845,16 +1900,17 @@ if.end: ; preds = %if.then, %entry ; CHECK-NEXT:.b8 2 // DW_AT_decl_file ; CHECK-NEXT:.b8 96 // DW_AT_decl_line ; CHECK-NEXT:.b8 1 // DW_AT_declaration +; CHECK-NEXT:.b32 1209 // DW_AT_object_pointer ; CHECK-NEXT:.b8 1 // DW_AT_external ; CHECK-NEXT:.b8 3 // DW_AT_accessibility ; CHECK-NEXT: // DW_ACCESS_private -; CHECK-NEXT:.b8 5 // Abbrev [5] 0x495:0x6 DW_TAG_formal_parameter -; CHECK-NEXT:.b32 1425 // DW_AT_type +; CHECK-NEXT:.b8 5 // Abbrev [5] 0x4b9:0x6 DW_TAG_formal_parameter +; CHECK-NEXT:.b32 1477 // DW_AT_type ; CHECK-NEXT:.b8 1 // DW_AT_artificial -; CHECK-NEXT:.b8 7 // Abbrev [7] 0x49b:0x5 DW_TAG_formal_parameter -; CHECK-NEXT:.b32 1440 // DW_AT_type +; CHECK-NEXT:.b8 7 // Abbrev [7] 0x4bf:0x5 DW_TAG_formal_parameter +; CHECK-NEXT:.b32 1492 // DW_AT_type ; CHECK-NEXT:.b8 0 // End Of Children Mark -; CHECK-NEXT:.b8 9 // Abbrev [9] 0x4a1:0x3f DW_TAG_subprogram +; CHECK-NEXT:.b8 9 // Abbrev [9] 0x4c5:0x43 DW_TAG_subprogram ; CHECK-NEXT:.b8 95 // DW_AT_MIPS_linkage_name ; CHECK-NEXT:.b8 90 ; 
CHECK-NEXT:.b8 78 @@ -1903,17 +1959,18 @@ if.end: ; preds = %if.then, %entry ; CHECK-NEXT:.b8 0 ; CHECK-NEXT:.b8 2 // DW_AT_decl_file ; CHECK-NEXT:.b8 96 // DW_AT_decl_line -; CHECK-NEXT:.b32 1445 // DW_AT_type +; CHECK-NEXT:.b32 1497 // DW_AT_type ; CHECK-NEXT:.b8 1 // DW_AT_declaration +; CHECK-NEXT:.b32 1281 // DW_AT_object_pointer ; CHECK-NEXT:.b8 1 // DW_AT_external ; CHECK-NEXT:.b8 3 // DW_AT_accessibility ; CHECK-NEXT: // DW_ACCESS_private -; CHECK-NEXT:.b8 5 // Abbrev [5] 0x4d9:0x6 DW_TAG_formal_parameter -; CHECK-NEXT:.b32 1425 // DW_AT_type +; CHECK-NEXT:.b8 5 // Abbrev [5] 0x501:0x6 DW_TAG_formal_parameter +; CHECK-NEXT:.b32 1477 // DW_AT_type ; CHECK-NEXT:.b8 1 // DW_AT_artificial ; CHECK-NEXT:.b8 0 // End Of Children Mark ; CHECK-NEXT:.b8 0 // End Of Children Mark -; CHECK-NEXT:.b8 16 // Abbrev [16] 0x4e1:0x9d DW_TAG_structure_type +; CHECK-NEXT:.b8 16 // Abbrev [16] 0x509:0xa9 DW_TAG_structure_type ; CHECK-NEXT:.b8 100 // DW_AT_name ; CHECK-NEXT:.b8 105 ; CHECK-NEXT:.b8 109 @@ -1923,37 +1980,37 @@ if.end: ; preds = %if.then, %entry ; CHECK-NEXT:.b8 3 // DW_AT_decl_file ; CHECK-NEXT:.b8 161 // DW_AT_decl_line ; CHECK-NEXT:.b8 1 -; CHECK-NEXT:.b8 17 // Abbrev [17] 0x4eb:0xd DW_TAG_member +; CHECK-NEXT:.b8 17 // Abbrev [17] 0x513:0xd DW_TAG_member ; CHECK-NEXT:.b8 120 // DW_AT_name ; CHECK-NEXT:.b8 0 -; CHECK-NEXT:.b32 603 // DW_AT_type +; CHECK-NEXT:.b32 623 // DW_AT_type ; CHECK-NEXT:.b8 3 // DW_AT_decl_file ; CHECK-NEXT:.b8 163 // DW_AT_decl_line ; CHECK-NEXT:.b8 1 ; CHECK-NEXT:.b8 2 // DW_AT_data_member_location ; CHECK-NEXT:.b8 35 ; CHECK-NEXT:.b8 0 -; CHECK-NEXT:.b8 17 // Abbrev [17] 0x4f8:0xd DW_TAG_member +; CHECK-NEXT:.b8 17 // Abbrev [17] 0x520:0xd DW_TAG_member ; CHECK-NEXT:.b8 121 // DW_AT_name ; CHECK-NEXT:.b8 0 -; CHECK-NEXT:.b32 603 // DW_AT_type +; CHECK-NEXT:.b32 623 // DW_AT_type ; CHECK-NEXT:.b8 3 // DW_AT_decl_file ; CHECK-NEXT:.b8 163 // DW_AT_decl_line ; CHECK-NEXT:.b8 1 ; CHECK-NEXT:.b8 2 // DW_AT_data_member_location ; 
CHECK-NEXT:.b8 35 ; CHECK-NEXT:.b8 4 -; CHECK-NEXT:.b8 17 // Abbrev [17] 0x505:0xd DW_TAG_member +; CHECK-NEXT:.b8 17 // Abbrev [17] 0x52d:0xd DW_TAG_member ; CHECK-NEXT:.b8 122 // DW_AT_name ; CHECK-NEXT:.b8 0 -; CHECK-NEXT:.b32 603 // DW_AT_type +; CHECK-NEXT:.b32 623 // DW_AT_type ; CHECK-NEXT:.b8 3 // DW_AT_decl_file ; CHECK-NEXT:.b8 163 // DW_AT_decl_line ; CHECK-NEXT:.b8 1 ; CHECK-NEXT:.b8 2 // DW_AT_data_member_location ; CHECK-NEXT:.b8 35 ; CHECK-NEXT:.b8 8 -; CHECK-NEXT:.b8 18 // Abbrev [18] 0x512:0x21 DW_TAG_subprogram +; CHECK-NEXT:.b8 18 // Abbrev [18] 0x53a:0x25 DW_TAG_subprogram ; CHECK-NEXT:.b8 100 // DW_AT_name ; CHECK-NEXT:.b8 105 ; CHECK-NEXT:.b8 109 @@ -1963,18 +2020,19 @@ if.end: ; preds = %if.then, %entry ; CHECK-NEXT:.b8 165 // DW_AT_decl_line ; CHECK-NEXT:.b8 1 ; CHECK-NEXT:.b8 1 // DW_AT_declaration +; CHECK-NEXT:.b32 1353 // DW_AT_object_pointer ; CHECK-NEXT:.b8 1 // DW_AT_external -; CHECK-NEXT:.b8 5 // Abbrev [5] 0x51d:0x6 DW_TAG_formal_parameter -; CHECK-NEXT:.b32 1406 // DW_AT_type +; CHECK-NEXT:.b8 5 // Abbrev [5] 0x549:0x6 DW_TAG_formal_parameter +; CHECK-NEXT:.b32 1458 // DW_AT_type ; CHECK-NEXT:.b8 1 // DW_AT_artificial -; CHECK-NEXT:.b8 7 // Abbrev [7] 0x523:0x5 DW_TAG_formal_parameter -; CHECK-NEXT:.b32 603 // DW_AT_type -; CHECK-NEXT:.b8 7 // Abbrev [7] 0x528:0x5 DW_TAG_formal_parameter -; CHECK-NEXT:.b32 603 // DW_AT_type -; CHECK-NEXT:.b8 7 // Abbrev [7] 0x52d:0x5 DW_TAG_formal_parameter -; CHECK-NEXT:.b32 603 // DW_AT_type -; CHECK-NEXT:.b8 0 // End Of Children Mark -; CHECK-NEXT:.b8 18 // Abbrev [18] 0x533:0x17 DW_TAG_subprogram +; CHECK-NEXT:.b8 7 // Abbrev [7] 0x54f:0x5 DW_TAG_formal_parameter +; CHECK-NEXT:.b32 623 // DW_AT_type +; CHECK-NEXT:.b8 7 // Abbrev [7] 0x554:0x5 DW_TAG_formal_parameter +; CHECK-NEXT:.b32 623 // DW_AT_type +; CHECK-NEXT:.b8 7 // Abbrev [7] 0x559:0x5 DW_TAG_formal_parameter +; CHECK-NEXT:.b32 623 // DW_AT_type +; CHECK-NEXT:.b8 0 // End Of Children Mark +; CHECK-NEXT:.b8 18 // Abbrev [18] 
0x55f:0x1b DW_TAG_subprogram ; CHECK-NEXT:.b8 100 // DW_AT_name ; CHECK-NEXT:.b8 105 ; CHECK-NEXT:.b8 109 @@ -1984,14 +2042,15 @@ if.end: ; preds = %if.then, %entry ; CHECK-NEXT:.b8 166 // DW_AT_decl_line ; CHECK-NEXT:.b8 1 ; CHECK-NEXT:.b8 1 // DW_AT_declaration +; CHECK-NEXT:.b32 1390 // DW_AT_object_pointer ; CHECK-NEXT:.b8 1 // DW_AT_external -; CHECK-NEXT:.b8 5 // Abbrev [5] 0x53e:0x6 DW_TAG_formal_parameter -; CHECK-NEXT:.b32 1406 // DW_AT_type +; CHECK-NEXT:.b8 5 // Abbrev [5] 0x56e:0x6 DW_TAG_formal_parameter +; CHECK-NEXT:.b32 1458 // DW_AT_type ; CHECK-NEXT:.b8 1 // DW_AT_artificial -; CHECK-NEXT:.b8 7 // Abbrev [7] 0x544:0x5 DW_TAG_formal_parameter -; CHECK-NEXT:.b32 1411 // DW_AT_type +; CHECK-NEXT:.b8 7 // Abbrev [7] 0x574:0x5 DW_TAG_formal_parameter +; CHECK-NEXT:.b32 1463 // DW_AT_type ; CHECK-NEXT:.b8 0 // End Of Children Mark -; CHECK-NEXT:.b8 19 // Abbrev [19] 0x54a:0x33 DW_TAG_subprogram +; CHECK-NEXT:.b8 19 // Abbrev [19] 0x57a:0x37 DW_TAG_subprogram ; CHECK-NEXT:.b8 95 // DW_AT_MIPS_linkage_name ; CHECK-NEXT:.b8 90 ; CHECK-NEXT:.b8 78 @@ -2029,18 +2088,19 @@ if.end: ; preds = %if.then, %entry ; CHECK-NEXT:.b8 3 // DW_AT_decl_file ; CHECK-NEXT:.b8 167 // DW_AT_decl_line ; CHECK-NEXT:.b8 1 -; CHECK-NEXT:.b32 1411 // DW_AT_type +; CHECK-NEXT:.b32 1463 // DW_AT_type ; CHECK-NEXT:.b8 1 // DW_AT_declaration +; CHECK-NEXT:.b32 1450 // DW_AT_object_pointer ; CHECK-NEXT:.b8 1 // DW_AT_external -; CHECK-NEXT:.b8 5 // Abbrev [5] 0x576:0x6 DW_TAG_formal_parameter -; CHECK-NEXT:.b32 1406 // DW_AT_type +; CHECK-NEXT:.b8 5 // Abbrev [5] 0x5aa:0x6 DW_TAG_formal_parameter +; CHECK-NEXT:.b32 1458 // DW_AT_type ; CHECK-NEXT:.b8 1 // DW_AT_artificial ; CHECK-NEXT:.b8 0 // End Of Children Mark ; CHECK-NEXT:.b8 0 // End Of Children Mark -; CHECK-NEXT:.b8 12 // Abbrev [12] 0x57e:0x5 DW_TAG_pointer_type -; CHECK-NEXT:.b32 1249 // DW_AT_type -; CHECK-NEXT:.b8 20 // Abbrev [20] 0x583:0xe DW_TAG_typedef -; CHECK-NEXT:.b32 619 // DW_AT_type +; CHECK-NEXT:.b8 12 // Abbrev 
[12] 0x5b2:0x5 DW_TAG_pointer_type +; CHECK-NEXT:.b32 1289 // DW_AT_type +; CHECK-NEXT:.b8 20 // Abbrev [20] 0x5b7:0xe DW_TAG_typedef +; CHECK-NEXT:.b32 639 // DW_AT_type ; CHECK-NEXT:.b8 117 // DW_AT_name ; CHECK-NEXT:.b8 105 ; CHECK-NEXT:.b8 110 @@ -2050,20 +2110,20 @@ if.end: ; preds = %if.then, %entry ; CHECK-NEXT:.b8 3 // DW_AT_decl_file ; CHECK-NEXT:.b8 127 // DW_AT_decl_line ; CHECK-NEXT:.b8 1 -; CHECK-NEXT:.b8 12 // Abbrev [12] 0x591:0x5 DW_TAG_pointer_type -; CHECK-NEXT:.b32 1430 // DW_AT_type -; CHECK-NEXT:.b8 13 // Abbrev [13] 0x596:0x5 DW_TAG_const_type -; CHECK-NEXT:.b32 697 // DW_AT_type -; CHECK-NEXT:.b8 12 // Abbrev [12] 0x59b:0x5 DW_TAG_pointer_type -; CHECK-NEXT:.b32 697 // DW_AT_type -; CHECK-NEXT:.b8 14 // Abbrev [14] 0x5a0:0x5 DW_TAG_reference_type -; CHECK-NEXT:.b32 1430 // DW_AT_type -; CHECK-NEXT:.b8 12 // Abbrev [12] 0x5a5:0x5 DW_TAG_pointer_type -; CHECK-NEXT:.b32 697 // DW_AT_type -; CHECK-NEXT:.b8 15 // Abbrev [15] 0x5aa:0x6 DW_TAG_subprogram -; CHECK-NEXT:.b32 727 // DW_AT_specification +; CHECK-NEXT:.b8 12 // Abbrev [12] 0x5c5:0x5 DW_TAG_pointer_type +; CHECK-NEXT:.b32 1482 // DW_AT_type +; CHECK-NEXT:.b8 13 // Abbrev [13] 0x5ca:0x5 DW_TAG_const_type +; CHECK-NEXT:.b32 717 // DW_AT_type +; CHECK-NEXT:.b8 12 // Abbrev [12] 0x5cf:0x5 DW_TAG_pointer_type +; CHECK-NEXT:.b32 717 // DW_AT_type +; CHECK-NEXT:.b8 14 // Abbrev [14] 0x5d4:0x5 DW_TAG_reference_type +; CHECK-NEXT:.b32 1482 // DW_AT_type +; CHECK-NEXT:.b8 12 // Abbrev [12] 0x5d9:0x5 DW_TAG_pointer_type +; CHECK-NEXT:.b32 717 // DW_AT_type +; CHECK-NEXT:.b8 15 // Abbrev [15] 0x5de:0x6 DW_TAG_subprogram +; CHECK-NEXT:.b32 747 // DW_AT_specification ; CHECK-NEXT:.b8 1 // DW_AT_inline -; CHECK-NEXT:.b8 2 // Abbrev [2] 0x5b0:0x233 DW_TAG_structure_type +; CHECK-NEXT:.b8 2 // Abbrev [2] 0x5e4:0x247 DW_TAG_structure_type ; CHECK-NEXT:.b8 95 // DW_AT_name ; CHECK-NEXT:.b8 95 ; CHECK-NEXT:.b8 99 @@ -2094,7 +2154,7 @@ if.end: ; preds = %if.then, %entry ; CHECK-NEXT:.b8 1 // DW_AT_byte_size ; 
CHECK-NEXT:.b8 2 // DW_AT_decl_file ; CHECK-NEXT:.b8 66 // DW_AT_decl_line -; CHECK-NEXT:.b8 3 // Abbrev [3] 0x5cf:0x50 DW_TAG_subprogram +; CHECK-NEXT:.b8 3 // Abbrev [3] 0x603:0x50 DW_TAG_subprogram ; CHECK-NEXT:.b8 95 // DW_AT_MIPS_linkage_name ; CHECK-NEXT:.b8 90 ; CHECK-NEXT:.b8 78 @@ -2168,10 +2228,10 @@ if.end: ; preds = %if.then, %entry ; CHECK-NEXT:.b8 0 ; CHECK-NEXT:.b8 2 // DW_AT_decl_file ; CHECK-NEXT:.b8 67 // DW_AT_decl_line -; CHECK-NEXT:.b32 603 // DW_AT_type +; CHECK-NEXT:.b32 623 // DW_AT_type ; CHECK-NEXT:.b8 1 // DW_AT_declaration ; CHECK-NEXT:.b8 1 // DW_AT_external -; CHECK-NEXT:.b8 3 // Abbrev [3] 0x61f:0x50 DW_TAG_subprogram +; CHECK-NEXT:.b8 3 // Abbrev [3] 0x653:0x50 DW_TAG_subprogram ; CHECK-NEXT:.b8 95 // DW_AT_MIPS_linkage_name ; CHECK-NEXT:.b8 90 ; CHECK-NEXT:.b8 78 @@ -2245,10 +2305,10 @@ if.end: ; preds = %if.then, %entry ; CHECK-NEXT:.b8 0 ; CHECK-NEXT:.b8 2 // DW_AT_decl_file ; CHECK-NEXT:.b8 68 // DW_AT_decl_line -; CHECK-NEXT:.b32 603 // DW_AT_type +; CHECK-NEXT:.b32 623 // DW_AT_type ; CHECK-NEXT:.b8 1 // DW_AT_declaration ; CHECK-NEXT:.b8 1 // DW_AT_external -; CHECK-NEXT:.b8 3 // Abbrev [3] 0x66f:0x50 DW_TAG_subprogram +; CHECK-NEXT:.b8 3 // Abbrev [3] 0x6a3:0x50 DW_TAG_subprogram ; CHECK-NEXT:.b8 95 // DW_AT_MIPS_linkage_name ; CHECK-NEXT:.b8 90 ; CHECK-NEXT:.b8 78 @@ -2322,10 +2382,10 @@ if.end: ; preds = %if.then, %entry ; CHECK-NEXT:.b8 0 ; CHECK-NEXT:.b8 2 // DW_AT_decl_file ; CHECK-NEXT:.b8 69 // DW_AT_decl_line -; CHECK-NEXT:.b32 603 // DW_AT_type +; CHECK-NEXT:.b32 623 // DW_AT_type ; CHECK-NEXT:.b8 1 // DW_AT_declaration ; CHECK-NEXT:.b8 1 // DW_AT_external -; CHECK-NEXT:.b8 4 // Abbrev [4] 0x6bf:0x4a DW_TAG_subprogram +; CHECK-NEXT:.b8 4 // Abbrev [4] 0x6f3:0x4e DW_TAG_subprogram ; CHECK-NEXT:.b8 95 // DW_AT_MIPS_linkage_name ; CHECK-NEXT:.b8 90 ; CHECK-NEXT:.b8 78 @@ -2386,14 +2446,15 @@ if.end: ; preds = %if.then, %entry ; CHECK-NEXT:.b8 0 ; CHECK-NEXT:.b8 2 // DW_AT_decl_file ; CHECK-NEXT:.b8 72 // DW_AT_decl_line 
-; CHECK-NEXT:.b32 619 // DW_AT_type +; CHECK-NEXT:.b32 639 // DW_AT_type ; CHECK-NEXT:.b8 1 // DW_AT_declaration +; CHECK-NEXT:.b32 1850 // DW_AT_object_pointer ; CHECK-NEXT:.b8 1 // DW_AT_external -; CHECK-NEXT:.b8 5 // Abbrev [5] 0x702:0x6 DW_TAG_formal_parameter -; CHECK-NEXT:.b32 2019 // DW_AT_type +; CHECK-NEXT:.b8 5 // Abbrev [5] 0x73a:0x6 DW_TAG_formal_parameter +; CHECK-NEXT:.b32 2091 // DW_AT_type ; CHECK-NEXT:.b8 1 // DW_AT_artificial ; CHECK-NEXT:.b8 0 // End Of Children Mark -; CHECK-NEXT:.b8 6 // Abbrev [6] 0x709:0x28 DW_TAG_subprogram +; CHECK-NEXT:.b8 6 // Abbrev [6] 0x741:0x2c DW_TAG_subprogram ; CHECK-NEXT:.b8 95 // DW_AT_name ; CHECK-NEXT:.b8 95 ; CHECK-NEXT:.b8 99 @@ -2424,14 +2485,15 @@ if.end: ; preds = %if.then, %entry ; CHECK-NEXT:.b8 2 // DW_AT_decl_file ; CHECK-NEXT:.b8 74 // DW_AT_decl_line ; CHECK-NEXT:.b8 1 // DW_AT_declaration +; CHECK-NEXT:.b32 1894 // DW_AT_object_pointer ; CHECK-NEXT:.b8 1 // DW_AT_external ; CHECK-NEXT:.b8 3 // DW_AT_accessibility ; CHECK-NEXT: // DW_ACCESS_private -; CHECK-NEXT:.b8 5 // Abbrev [5] 0x72a:0x6 DW_TAG_formal_parameter -; CHECK-NEXT:.b32 2029 // DW_AT_type +; CHECK-NEXT:.b8 5 // Abbrev [5] 0x766:0x6 DW_TAG_formal_parameter +; CHECK-NEXT:.b32 2101 // DW_AT_type ; CHECK-NEXT:.b8 1 // DW_AT_artificial ; CHECK-NEXT:.b8 0 // End Of Children Mark -; CHECK-NEXT:.b8 6 // Abbrev [6] 0x731:0x2d DW_TAG_subprogram +; CHECK-NEXT:.b8 6 // Abbrev [6] 0x76d:0x31 DW_TAG_subprogram ; CHECK-NEXT:.b8 95 // DW_AT_name ; CHECK-NEXT:.b8 95 ; CHECK-NEXT:.b8 99 @@ -2462,16 +2524,17 @@ if.end: ; preds = %if.then, %entry ; CHECK-NEXT:.b8 2 // DW_AT_decl_file ; CHECK-NEXT:.b8 74 // DW_AT_decl_line ; CHECK-NEXT:.b8 1 // DW_AT_declaration +; CHECK-NEXT:.b32 1938 // DW_AT_object_pointer ; CHECK-NEXT:.b8 1 // DW_AT_external ; CHECK-NEXT:.b8 3 // DW_AT_accessibility ; CHECK-NEXT: // DW_ACCESS_private -; CHECK-NEXT:.b8 5 // Abbrev [5] 0x752:0x6 DW_TAG_formal_parameter -; CHECK-NEXT:.b32 2029 // DW_AT_type +; CHECK-NEXT:.b8 5 // Abbrev 
[5] 0x792:0x6 DW_TAG_formal_parameter +; CHECK-NEXT:.b32 2101 // DW_AT_type ; CHECK-NEXT:.b8 1 // DW_AT_artificial -; CHECK-NEXT:.b8 7 // Abbrev [7] 0x758:0x5 DW_TAG_formal_parameter -; CHECK-NEXT:.b32 2034 // DW_AT_type +; CHECK-NEXT:.b8 7 // Abbrev [7] 0x798:0x5 DW_TAG_formal_parameter +; CHECK-NEXT:.b32 2106 // DW_AT_type ; CHECK-NEXT:.b8 0 // End Of Children Mark -; CHECK-NEXT:.b8 8 // Abbrev [8] 0x75e:0x44 DW_TAG_subprogram +; CHECK-NEXT:.b8 8 // Abbrev [8] 0x79e:0x48 DW_TAG_subprogram ; CHECK-NEXT:.b8 95 // DW_AT_MIPS_linkage_name ; CHECK-NEXT:.b8 90 ; CHECK-NEXT:.b8 78 @@ -2525,16 +2588,17 @@ if.end: ; preds = %if.then, %entry ; CHECK-NEXT:.b8 2 // DW_AT_decl_file ; CHECK-NEXT:.b8 74 // DW_AT_decl_line ; CHECK-NEXT:.b8 1 // DW_AT_declaration +; CHECK-NEXT:.b32 2010 // DW_AT_object_pointer ; CHECK-NEXT:.b8 1 // DW_AT_external ; CHECK-NEXT:.b8 3 // DW_AT_accessibility ; CHECK-NEXT: // DW_ACCESS_private -; CHECK-NEXT:.b8 5 // Abbrev [5] 0x796:0x6 DW_TAG_formal_parameter -; CHECK-NEXT:.b32 2019 // DW_AT_type +; CHECK-NEXT:.b8 5 // Abbrev [5] 0x7da:0x6 DW_TAG_formal_parameter +; CHECK-NEXT:.b32 2091 // DW_AT_type ; CHECK-NEXT:.b8 1 // DW_AT_artificial -; CHECK-NEXT:.b8 7 // Abbrev [7] 0x79c:0x5 DW_TAG_formal_parameter -; CHECK-NEXT:.b32 2034 // DW_AT_type +; CHECK-NEXT:.b8 7 // Abbrev [7] 0x7e0:0x5 DW_TAG_formal_parameter +; CHECK-NEXT:.b32 2106 // DW_AT_type ; CHECK-NEXT:.b8 0 // End Of Children Mark -; CHECK-NEXT:.b8 9 // Abbrev [9] 0x7a2:0x40 DW_TAG_subprogram +; CHECK-NEXT:.b8 9 // Abbrev [9] 0x7e6:0x44 DW_TAG_subprogram ; CHECK-NEXT:.b8 95 // DW_AT_MIPS_linkage_name ; CHECK-NEXT:.b8 90 ; CHECK-NEXT:.b8 78 @@ -2584,30 +2648,31 @@ if.end: ; preds = %if.then, %entry ; CHECK-NEXT:.b8 0 ; CHECK-NEXT:.b8 2 // DW_AT_decl_file ; CHECK-NEXT:.b8 74 // DW_AT_decl_line -; CHECK-NEXT:.b32 2039 // DW_AT_type +; CHECK-NEXT:.b32 2111 // DW_AT_type ; CHECK-NEXT:.b8 1 // DW_AT_declaration +; CHECK-NEXT:.b32 2083 // DW_AT_object_pointer ; CHECK-NEXT:.b8 1 // DW_AT_external ; 
CHECK-NEXT:.b8 3 // DW_AT_accessibility ; CHECK-NEXT: // DW_ACCESS_private -; CHECK-NEXT:.b8 5 // Abbrev [5] 0x7db:0x6 DW_TAG_formal_parameter -; CHECK-NEXT:.b32 2019 // DW_AT_type +; CHECK-NEXT:.b8 5 // Abbrev [5] 0x823:0x6 DW_TAG_formal_parameter +; CHECK-NEXT:.b32 2091 // DW_AT_type ; CHECK-NEXT:.b8 1 // DW_AT_artificial ; CHECK-NEXT:.b8 0 // End Of Children Mark ; CHECK-NEXT:.b8 0 // End Of Children Mark -; CHECK-NEXT:.b8 12 // Abbrev [12] 0x7e3:0x5 DW_TAG_pointer_type -; CHECK-NEXT:.b32 2024 // DW_AT_type -; CHECK-NEXT:.b8 13 // Abbrev [13] 0x7e8:0x5 DW_TAG_const_type -; CHECK-NEXT:.b32 1456 // DW_AT_type -; CHECK-NEXT:.b8 12 // Abbrev [12] 0x7ed:0x5 DW_TAG_pointer_type -; CHECK-NEXT:.b32 1456 // DW_AT_type -; CHECK-NEXT:.b8 14 // Abbrev [14] 0x7f2:0x5 DW_TAG_reference_type -; CHECK-NEXT:.b32 2024 // DW_AT_type -; CHECK-NEXT:.b8 12 // Abbrev [12] 0x7f7:0x5 DW_TAG_pointer_type -; CHECK-NEXT:.b32 1456 // DW_AT_type -; CHECK-NEXT:.b8 15 // Abbrev [15] 0x7fc:0x6 DW_TAG_subprogram -; CHECK-NEXT:.b32 1487 // DW_AT_specification +; CHECK-NEXT:.b8 12 // Abbrev [12] 0x82b:0x5 DW_TAG_pointer_type +; CHECK-NEXT:.b32 2096 // DW_AT_type +; CHECK-NEXT:.b8 13 // Abbrev [13] 0x830:0x5 DW_TAG_const_type +; CHECK-NEXT:.b32 1508 // DW_AT_type +; CHECK-NEXT:.b8 12 // Abbrev [12] 0x835:0x5 DW_TAG_pointer_type +; CHECK-NEXT:.b32 1508 // DW_AT_type +; CHECK-NEXT:.b8 14 // Abbrev [14] 0x83a:0x5 DW_TAG_reference_type +; CHECK-NEXT:.b32 2096 // DW_AT_type +; CHECK-NEXT:.b8 12 // Abbrev [12] 0x83f:0x5 DW_TAG_pointer_type +; CHECK-NEXT:.b32 1508 // DW_AT_type +; CHECK-NEXT:.b8 15 // Abbrev [15] 0x844:0x6 DW_TAG_subprogram +; CHECK-NEXT:.b32 1539 // DW_AT_specification ; CHECK-NEXT:.b8 1 // DW_AT_inline -; CHECK-NEXT:.b8 21 // Abbrev [21] 0x802:0x32 DW_TAG_subprogram +; CHECK-NEXT:.b8 21 // Abbrev [21] 0x84a:0x32 DW_TAG_subprogram ; CHECK-NEXT:.b8 95 // DW_AT_MIPS_linkage_name ; CHECK-NEXT:.b8 90 ; CHECK-NEXT:.b8 51 @@ -2627,28 +2692,28 @@ if.end: ; preds = %if.then, %entry ; 
CHECK-NEXT:.b8 3 // DW_AT_decl_line ; CHECK-NEXT:.b8 1 // DW_AT_external ; CHECK-NEXT:.b8 1 // DW_AT_inline -; CHECK-NEXT:.b8 22 // Abbrev [22] 0x816:0x9 DW_TAG_formal_parameter +; CHECK-NEXT:.b8 22 // Abbrev [22] 0x85e:0x9 DW_TAG_formal_parameter ; CHECK-NEXT:.b8 120 // DW_AT_name ; CHECK-NEXT:.b8 0 ; CHECK-NEXT:.b8 1 // DW_AT_decl_file ; CHECK-NEXT:.b8 3 // DW_AT_decl_line -; CHECK-NEXT:.b32 2100 // DW_AT_type -; CHECK-NEXT:.b8 22 // Abbrev [22] 0x81f:0x9 DW_TAG_formal_parameter +; CHECK-NEXT:.b32 2172 // DW_AT_type +; CHECK-NEXT:.b8 22 // Abbrev [22] 0x867:0x9 DW_TAG_formal_parameter ; CHECK-NEXT:.b8 121 // DW_AT_name ; CHECK-NEXT:.b8 0 ; CHECK-NEXT:.b8 1 // DW_AT_decl_file ; CHECK-NEXT:.b8 3 // DW_AT_decl_line -; CHECK-NEXT:.b32 2100 // DW_AT_type -; CHECK-NEXT:.b8 22 // Abbrev [22] 0x828:0xb DW_TAG_formal_parameter +; CHECK-NEXT:.b32 2172 // DW_AT_type +; CHECK-NEXT:.b8 22 // Abbrev [22] 0x870:0xb DW_TAG_formal_parameter ; CHECK-NEXT:.b8 114 // DW_AT_name ; CHECK-NEXT:.b8 101 ; CHECK-NEXT:.b8 115 ; CHECK-NEXT:.b8 0 ; CHECK-NEXT:.b8 1 // DW_AT_decl_file ; CHECK-NEXT:.b8 3 // DW_AT_decl_line -; CHECK-NEXT:.b32 2109 // DW_AT_type +; CHECK-NEXT:.b32 2181 // DW_AT_type ; CHECK-NEXT:.b8 0 // End Of Children Mark -; CHECK-NEXT:.b8 10 // Abbrev [10] 0x834:0x9 DW_TAG_base_type +; CHECK-NEXT:.b8 10 // Abbrev [10] 0x87c:0x9 DW_TAG_base_type ; CHECK-NEXT:.b8 102 // DW_AT_name ; CHECK-NEXT:.b8 108 ; CHECK-NEXT:.b8 111 @@ -2657,9 +2722,9 @@ if.end: ; preds = %if.then, %entry ; CHECK-NEXT:.b8 0 ; CHECK-NEXT:.b8 4 // DW_AT_encoding ; CHECK-NEXT:.b8 4 // DW_AT_byte_size -; CHECK-NEXT:.b8 12 // Abbrev [12] 0x83d:0x5 DW_TAG_pointer_type -; CHECK-NEXT:.b32 2100 // DW_AT_type -; CHECK-NEXT:.b8 23 // Abbrev [23] 0x842:0xd5 DW_TAG_subprogram +; CHECK-NEXT:.b8 12 // Abbrev [12] 0x885:0x5 DW_TAG_pointer_type +; CHECK-NEXT:.b32 2172 // DW_AT_type +; CHECK-NEXT:.b8 23 // Abbrev [23] 0x88a:0xd5 DW_TAG_subprogram ; CHECK-NEXT:.b64 $L__func_begin0 // DW_AT_low_pc ; CHECK-NEXT:.b64 
$L__func_end0 // DW_AT_high_pc ; CHECK-NEXT:.b8 1 // DW_AT_frame_base @@ -2688,7 +2753,7 @@ if.end: ; preds = %if.then, %entry ; CHECK-NEXT:.b8 1 // DW_AT_decl_file ; CHECK-NEXT:.b8 5 // DW_AT_decl_line ; CHECK-NEXT:.b8 1 // DW_AT_external -; CHECK-NEXT:.b8 24 // Abbrev [24] 0x86d:0x10 DW_TAG_formal_parameter +; CHECK-NEXT:.b8 24 // Abbrev [24] 0x8b5:0x10 DW_TAG_formal_parameter ; CHECK-NEXT:.b8 2 // DW_AT_address_class ; CHECK-NEXT:.b8 5 // DW_AT_location ; CHECK-NEXT:.b8 144 @@ -2700,62 +2765,62 @@ if.end: ; preds = %if.then, %entry ; CHECK-NEXT:.b8 0 ; CHECK-NEXT:.b8 1 // DW_AT_decl_file ; CHECK-NEXT:.b8 5 // DW_AT_decl_line -; CHECK-NEXT:.b32 4585 // DW_AT_type -; CHECK-NEXT:.b8 25 // Abbrev [25] 0x87d:0xd DW_TAG_formal_parameter +; CHECK-NEXT:.b32 4657 // DW_AT_type +; CHECK-NEXT:.b8 25 // Abbrev [25] 0x8c5:0xd DW_TAG_formal_parameter ; CHECK-NEXT:.b32 $L__debug_loc0 // DW_AT_location ; CHECK-NEXT:.b8 97 // DW_AT_name ; CHECK-NEXT:.b8 0 ; CHECK-NEXT:.b8 1 // DW_AT_decl_file ; CHECK-NEXT:.b8 5 // DW_AT_decl_line -; CHECK-NEXT:.b32 2100 // DW_AT_type -; CHECK-NEXT:.b8 22 // Abbrev [22] 0x88a:0x9 DW_TAG_formal_parameter +; CHECK-NEXT:.b32 2172 // DW_AT_type +; CHECK-NEXT:.b8 22 // Abbrev [22] 0x8d2:0x9 DW_TAG_formal_parameter ; CHECK-NEXT:.b8 120 // DW_AT_name ; CHECK-NEXT:.b8 0 ; CHECK-NEXT:.b8 1 // DW_AT_decl_file ; CHECK-NEXT:.b8 5 // DW_AT_decl_line -; CHECK-NEXT:.b32 2109 // DW_AT_type -; CHECK-NEXT:.b8 22 // Abbrev [22] 0x893:0x9 DW_TAG_formal_parameter +; CHECK-NEXT:.b32 2181 // DW_AT_type +; CHECK-NEXT:.b8 22 // Abbrev [22] 0x8db:0x9 DW_TAG_formal_parameter ; CHECK-NEXT:.b8 121 // DW_AT_name ; CHECK-NEXT:.b8 0 ; CHECK-NEXT:.b8 1 // DW_AT_decl_file ; CHECK-NEXT:.b8 5 // DW_AT_decl_line -; CHECK-NEXT:.b32 2109 // DW_AT_type -; CHECK-NEXT:.b8 26 // Abbrev [26] 0x89c:0xd DW_TAG_variable +; CHECK-NEXT:.b32 2181 // DW_AT_type +; CHECK-NEXT:.b8 26 // Abbrev [26] 0x8e4:0xd DW_TAG_variable ; CHECK-NEXT:.b32 $L__debug_loc1 // DW_AT_location ; CHECK-NEXT:.b8 105 // 
DW_AT_name ; CHECK-NEXT:.b8 0 ; CHECK-NEXT:.b8 1 // DW_AT_decl_file ; CHECK-NEXT:.b8 6 // DW_AT_decl_line -; CHECK-NEXT:.b32 4585 // DW_AT_type -; CHECK-NEXT:.b8 27 // Abbrev [27] 0x8a9:0x18 DW_TAG_inlined_subroutine -; CHECK-NEXT:.b32 691 // DW_AT_abstract_origin +; CHECK-NEXT:.b32 4657 // DW_AT_type +; CHECK-NEXT:.b8 27 // Abbrev [27] 0x8f1:0x18 DW_TAG_inlined_subroutine +; CHECK-NEXT:.b32 711 // DW_AT_abstract_origin ; CHECK-NEXT:.b64 $L__tmp1 // DW_AT_low_pc ; CHECK-NEXT:.b64 $L__tmp2 // DW_AT_high_pc ; CHECK-NEXT:.b8 1 // DW_AT_call_file ; CHECK-NEXT:.b8 6 // DW_AT_call_line ; CHECK-NEXT:.b8 11 // DW_AT_call_column -; CHECK-NEXT:.b8 27 // Abbrev [27] 0x8c1:0x18 DW_TAG_inlined_subroutine -; CHECK-NEXT:.b32 1450 // DW_AT_abstract_origin +; CHECK-NEXT:.b8 27 // Abbrev [27] 0x909:0x18 DW_TAG_inlined_subroutine +; CHECK-NEXT:.b32 1502 // DW_AT_abstract_origin ; CHECK-NEXT:.b64 $L__tmp2 // DW_AT_low_pc ; CHECK-NEXT:.b64 $L__tmp3 // DW_AT_high_pc ; CHECK-NEXT:.b8 1 // DW_AT_call_file ; CHECK-NEXT:.b8 6 // DW_AT_call_line ; CHECK-NEXT:.b8 24 // DW_AT_call_column -; CHECK-NEXT:.b8 27 // Abbrev [27] 0x8d9:0x18 DW_TAG_inlined_subroutine -; CHECK-NEXT:.b32 2044 // DW_AT_abstract_origin +; CHECK-NEXT:.b8 27 // Abbrev [27] 0x921:0x18 DW_TAG_inlined_subroutine +; CHECK-NEXT:.b32 2116 // DW_AT_abstract_origin ; CHECK-NEXT:.b64 $L__tmp3 // DW_AT_low_pc ; CHECK-NEXT:.b64 $L__tmp4 // DW_AT_high_pc ; CHECK-NEXT:.b8 1 // DW_AT_call_file ; CHECK-NEXT:.b8 6 // DW_AT_call_line ; CHECK-NEXT:.b8 37 // DW_AT_call_column -; CHECK-NEXT:.b8 28 // Abbrev [28] 0x8f1:0x25 DW_TAG_inlined_subroutine -; CHECK-NEXT:.b32 2050 // DW_AT_abstract_origin +; CHECK-NEXT:.b8 28 // Abbrev [28] 0x939:0x25 DW_TAG_inlined_subroutine +; CHECK-NEXT:.b32 2122 // DW_AT_abstract_origin ; CHECK-NEXT:.b64 $L__tmp9 // DW_AT_low_pc ; CHECK-NEXT:.b64 $L__tmp10 // DW_AT_high_pc ; CHECK-NEXT:.b8 1 // DW_AT_call_file ; CHECK-NEXT:.b8 8 // DW_AT_call_line ; CHECK-NEXT:.b8 5 // DW_AT_call_column -; CHECK-NEXT:.b8 29 // 
Abbrev [29] 0x909:0xc DW_TAG_formal_parameter +; CHECK-NEXT:.b8 29 // Abbrev [29] 0x951:0xc DW_TAG_formal_parameter ; CHECK-NEXT:.b8 2 // DW_AT_address_class ; CHECK-NEXT:.b8 5 // DW_AT_location ; CHECK-NEXT:.b8 144 @@ -2763,859 +2828,859 @@ if.end: ; preds = %if.then, %entry ; CHECK-NEXT:.b8 204 ; CHECK-NEXT:.b8 149 ; CHECK-NEXT:.b8 1 -; CHECK-NEXT:.b32 2079 // DW_AT_abstract_origin +; CHECK-NEXT:.b32 2151 // DW_AT_abstract_origin ; CHECK-NEXT:.b8 0 // End Of Children Mark ; CHECK-NEXT:.b8 0 // End Of Children Mark -; CHECK-NEXT:.b8 30 // Abbrev [30] 0x917:0x588 DW_TAG_namespace +; CHECK-NEXT:.b8 30 // Abbrev [30] 0x95f:0x588 DW_TAG_namespace ; CHECK-NEXT:.b8 115 // DW_AT_name ; CHECK-NEXT:.b8 116 ; CHECK-NEXT:.b8 100 ; CHECK-NEXT:.b8 0 -; CHECK-NEXT:.b8 31 // Abbrev [31] 0x91c:0x7 DW_TAG_imported_declaration +; CHECK-NEXT:.b8 31 // Abbrev [31] 0x964:0x7 DW_TAG_imported_declaration ; CHECK-NEXT:.b8 4 // DW_AT_decl_file ; CHECK-NEXT:.b8 202 // DW_AT_decl_line -; CHECK-NEXT:.b32 3743 // DW_AT_import -; CHECK-NEXT:.b8 31 // Abbrev [31] 0x923:0x7 DW_TAG_imported_declaration +; CHECK-NEXT:.b32 3815 // DW_AT_import +; CHECK-NEXT:.b8 31 // Abbrev [31] 0x96b:0x7 DW_TAG_imported_declaration ; CHECK-NEXT:.b8 4 // DW_AT_decl_file ; CHECK-NEXT:.b8 203 // DW_AT_decl_line -; CHECK-NEXT:.b32 3787 // DW_AT_import -; CHECK-NEXT:.b8 31 // Abbrev [31] 0x92a:0x7 DW_TAG_imported_declaration +; CHECK-NEXT:.b32 3859 // DW_AT_import +; CHECK-NEXT:.b8 31 // Abbrev [31] 0x972:0x7 DW_TAG_imported_declaration ; CHECK-NEXT:.b8 4 // DW_AT_decl_file ; CHECK-NEXT:.b8 204 // DW_AT_decl_line -; CHECK-NEXT:.b32 3816 // DW_AT_import -; CHECK-NEXT:.b8 31 // Abbrev [31] 0x931:0x7 DW_TAG_imported_declaration +; CHECK-NEXT:.b32 3888 // DW_AT_import +; CHECK-NEXT:.b8 31 // Abbrev [31] 0x979:0x7 DW_TAG_imported_declaration ; CHECK-NEXT:.b8 4 // DW_AT_decl_file ; CHECK-NEXT:.b8 205 // DW_AT_decl_line -; CHECK-NEXT:.b32 3847 // DW_AT_import -; CHECK-NEXT:.b8 31 // Abbrev [31] 0x938:0x7 
DW_TAG_imported_declaration +; CHECK-NEXT:.b32 3919 // DW_AT_import +; CHECK-NEXT:.b8 31 // Abbrev [31] 0x980:0x7 DW_TAG_imported_declaration ; CHECK-NEXT:.b8 4 // DW_AT_decl_file ; CHECK-NEXT:.b8 206 // DW_AT_decl_line -; CHECK-NEXT:.b32 3876 // DW_AT_import -; CHECK-NEXT:.b8 31 // Abbrev [31] 0x93f:0x7 DW_TAG_imported_declaration +; CHECK-NEXT:.b32 3948 // DW_AT_import +; CHECK-NEXT:.b8 31 // Abbrev [31] 0x987:0x7 DW_TAG_imported_declaration ; CHECK-NEXT:.b8 4 // DW_AT_decl_file ; CHECK-NEXT:.b8 207 // DW_AT_decl_line -; CHECK-NEXT:.b32 3907 // DW_AT_import -; CHECK-NEXT:.b8 31 // Abbrev [31] 0x946:0x7 DW_TAG_imported_declaration +; CHECK-NEXT:.b32 3979 // DW_AT_import +; CHECK-NEXT:.b8 31 // Abbrev [31] 0x98e:0x7 DW_TAG_imported_declaration ; CHECK-NEXT:.b8 4 // DW_AT_decl_file ; CHECK-NEXT:.b8 208 // DW_AT_decl_line -; CHECK-NEXT:.b32 3936 // DW_AT_import -; CHECK-NEXT:.b8 31 // Abbrev [31] 0x94d:0x7 DW_TAG_imported_declaration +; CHECK-NEXT:.b32 4008 // DW_AT_import +; CHECK-NEXT:.b8 31 // Abbrev [31] 0x995:0x7 DW_TAG_imported_declaration ; CHECK-NEXT:.b8 4 // DW_AT_decl_file ; CHECK-NEXT:.b8 209 // DW_AT_decl_line -; CHECK-NEXT:.b32 3973 // DW_AT_import -; CHECK-NEXT:.b8 31 // Abbrev [31] 0x954:0x7 DW_TAG_imported_declaration +; CHECK-NEXT:.b32 4045 // DW_AT_import +; CHECK-NEXT:.b8 31 // Abbrev [31] 0x99c:0x7 DW_TAG_imported_declaration ; CHECK-NEXT:.b8 4 // DW_AT_decl_file ; CHECK-NEXT:.b8 210 // DW_AT_decl_line -; CHECK-NEXT:.b32 4004 // DW_AT_import -; CHECK-NEXT:.b8 31 // Abbrev [31] 0x95b:0x7 DW_TAG_imported_declaration +; CHECK-NEXT:.b32 4076 // DW_AT_import +; CHECK-NEXT:.b8 31 // Abbrev [31] 0x9a3:0x7 DW_TAG_imported_declaration ; CHECK-NEXT:.b8 4 // DW_AT_decl_file ; CHECK-NEXT:.b8 211 // DW_AT_decl_line -; CHECK-NEXT:.b32 4033 // DW_AT_import -; CHECK-NEXT:.b8 31 // Abbrev [31] 0x962:0x7 DW_TAG_imported_declaration +; CHECK-NEXT:.b32 4105 // DW_AT_import +; CHECK-NEXT:.b8 31 // Abbrev [31] 0x9aa:0x7 DW_TAG_imported_declaration ; CHECK-NEXT:.b8 4 // 
DW_AT_decl_file ; CHECK-NEXT:.b8 212 // DW_AT_decl_line -; CHECK-NEXT:.b32 4062 // DW_AT_import -; CHECK-NEXT:.b8 31 // Abbrev [31] 0x969:0x7 DW_TAG_imported_declaration +; CHECK-NEXT:.b32 4134 // DW_AT_import +; CHECK-NEXT:.b8 31 // Abbrev [31] 0x9b1:0x7 DW_TAG_imported_declaration ; CHECK-NEXT:.b8 4 // DW_AT_decl_file ; CHECK-NEXT:.b8 213 // DW_AT_decl_line -; CHECK-NEXT:.b32 4105 // DW_AT_import -; CHECK-NEXT:.b8 31 // Abbrev [31] 0x970:0x7 DW_TAG_imported_declaration +; CHECK-NEXT:.b32 4177 // DW_AT_import +; CHECK-NEXT:.b8 31 // Abbrev [31] 0x9b8:0x7 DW_TAG_imported_declaration ; CHECK-NEXT:.b8 4 // DW_AT_decl_file ; CHECK-NEXT:.b8 214 // DW_AT_decl_line -; CHECK-NEXT:.b32 4132 // DW_AT_import -; CHECK-NEXT:.b8 31 // Abbrev [31] 0x977:0x7 DW_TAG_imported_declaration +; CHECK-NEXT:.b32 4204 // DW_AT_import +; CHECK-NEXT:.b8 31 // Abbrev [31] 0x9bf:0x7 DW_TAG_imported_declaration ; CHECK-NEXT:.b8 4 // DW_AT_decl_file ; CHECK-NEXT:.b8 215 // DW_AT_decl_line -; CHECK-NEXT:.b32 4161 // DW_AT_import -; CHECK-NEXT:.b8 31 // Abbrev [31] 0x97e:0x7 DW_TAG_imported_declaration +; CHECK-NEXT:.b32 4233 // DW_AT_import +; CHECK-NEXT:.b8 31 // Abbrev [31] 0x9c6:0x7 DW_TAG_imported_declaration ; CHECK-NEXT:.b8 4 // DW_AT_decl_file ; CHECK-NEXT:.b8 216 // DW_AT_decl_line -; CHECK-NEXT:.b32 4188 // DW_AT_import -; CHECK-NEXT:.b8 31 // Abbrev [31] 0x985:0x7 DW_TAG_imported_declaration +; CHECK-NEXT:.b32 4260 // DW_AT_import +; CHECK-NEXT:.b8 31 // Abbrev [31] 0x9cd:0x7 DW_TAG_imported_declaration ; CHECK-NEXT:.b8 4 // DW_AT_decl_file ; CHECK-NEXT:.b8 217 // DW_AT_decl_line -; CHECK-NEXT:.b32 4217 // DW_AT_import -; CHECK-NEXT:.b8 31 // Abbrev [31] 0x98c:0x7 DW_TAG_imported_declaration +; CHECK-NEXT:.b32 4289 // DW_AT_import +; CHECK-NEXT:.b8 31 // Abbrev [31] 0x9d4:0x7 DW_TAG_imported_declaration ; CHECK-NEXT:.b8 4 // DW_AT_decl_file ; CHECK-NEXT:.b8 218 // DW_AT_decl_line -; CHECK-NEXT:.b32 4244 // DW_AT_import -; CHECK-NEXT:.b8 31 // Abbrev [31] 0x993:0x7 
DW_TAG_imported_declaration +; CHECK-NEXT:.b32 4316 // DW_AT_import +; CHECK-NEXT:.b8 31 // Abbrev [31] 0x9db:0x7 DW_TAG_imported_declaration ; CHECK-NEXT:.b8 4 // DW_AT_decl_file ; CHECK-NEXT:.b8 219 // DW_AT_decl_line -; CHECK-NEXT:.b32 4273 // DW_AT_import -; CHECK-NEXT:.b8 31 // Abbrev [31] 0x99a:0x7 DW_TAG_imported_declaration +; CHECK-NEXT:.b32 4345 // DW_AT_import +; CHECK-NEXT:.b8 31 // Abbrev [31] 0x9e2:0x7 DW_TAG_imported_declaration ; CHECK-NEXT:.b8 4 // DW_AT_decl_file ; CHECK-NEXT:.b8 220 // DW_AT_decl_line -; CHECK-NEXT:.b32 4304 // DW_AT_import -; CHECK-NEXT:.b8 31 // Abbrev [31] 0x9a1:0x7 DW_TAG_imported_declaration +; CHECK-NEXT:.b32 4376 // DW_AT_import +; CHECK-NEXT:.b8 31 // Abbrev [31] 0x9e9:0x7 DW_TAG_imported_declaration ; CHECK-NEXT:.b8 4 // DW_AT_decl_file ; CHECK-NEXT:.b8 221 // DW_AT_decl_line -; CHECK-NEXT:.b32 4333 // DW_AT_import -; CHECK-NEXT:.b8 31 // Abbrev [31] 0x9a8:0x7 DW_TAG_imported_declaration +; CHECK-NEXT:.b32 4405 // DW_AT_import +; CHECK-NEXT:.b8 31 // Abbrev [31] 0x9f0:0x7 DW_TAG_imported_declaration ; CHECK-NEXT:.b8 4 // DW_AT_decl_file ; CHECK-NEXT:.b8 222 // DW_AT_decl_line -; CHECK-NEXT:.b32 4368 // DW_AT_import -; CHECK-NEXT:.b8 31 // Abbrev [31] 0x9af:0x7 DW_TAG_imported_declaration +; CHECK-NEXT:.b32 4440 // DW_AT_import +; CHECK-NEXT:.b8 31 // Abbrev [31] 0x9f7:0x7 DW_TAG_imported_declaration ; CHECK-NEXT:.b8 4 // DW_AT_decl_file ; CHECK-NEXT:.b8 223 // DW_AT_decl_line -; CHECK-NEXT:.b32 4399 // DW_AT_import -; CHECK-NEXT:.b8 31 // Abbrev [31] 0x9b6:0x7 DW_TAG_imported_declaration +; CHECK-NEXT:.b32 4471 // DW_AT_import +; CHECK-NEXT:.b8 31 // Abbrev [31] 0x9fe:0x7 DW_TAG_imported_declaration ; CHECK-NEXT:.b8 4 // DW_AT_decl_file ; CHECK-NEXT:.b8 224 // DW_AT_decl_line -; CHECK-NEXT:.b32 4438 // DW_AT_import -; CHECK-NEXT:.b8 31 // Abbrev [31] 0x9bd:0x7 DW_TAG_imported_declaration +; CHECK-NEXT:.b32 4510 // DW_AT_import +; CHECK-NEXT:.b8 31 // Abbrev [31] 0xa05:0x7 DW_TAG_imported_declaration ; CHECK-NEXT:.b8 4 // 
DW_AT_decl_file ; CHECK-NEXT:.b8 225 // DW_AT_decl_line -; CHECK-NEXT:.b32 4473 // DW_AT_import -; CHECK-NEXT:.b8 31 // Abbrev [31] 0x9c4:0x7 DW_TAG_imported_declaration +; CHECK-NEXT:.b32 4545 // DW_AT_import +; CHECK-NEXT:.b8 31 // Abbrev [31] 0xa0c:0x7 DW_TAG_imported_declaration ; CHECK-NEXT:.b8 4 // DW_AT_decl_file ; CHECK-NEXT:.b8 226 // DW_AT_decl_line -; CHECK-NEXT:.b32 4508 // DW_AT_import -; CHECK-NEXT:.b8 31 // Abbrev [31] 0x9cb:0x7 DW_TAG_imported_declaration +; CHECK-NEXT:.b32 4580 // DW_AT_import +; CHECK-NEXT:.b8 31 // Abbrev [31] 0xa13:0x7 DW_TAG_imported_declaration ; CHECK-NEXT:.b8 4 // DW_AT_decl_file ; CHECK-NEXT:.b8 227 // DW_AT_decl_line -; CHECK-NEXT:.b32 4543 // DW_AT_import -; CHECK-NEXT:.b8 31 // Abbrev [31] 0x9d2:0x7 DW_TAG_imported_declaration +; CHECK-NEXT:.b32 4615 // DW_AT_import +; CHECK-NEXT:.b8 31 // Abbrev [31] 0xa1a:0x7 DW_TAG_imported_declaration ; CHECK-NEXT:.b8 4 // DW_AT_decl_file ; CHECK-NEXT:.b8 228 // DW_AT_decl_line -; CHECK-NEXT:.b32 4592 // DW_AT_import -; CHECK-NEXT:.b8 31 // Abbrev [31] 0x9d9:0x7 DW_TAG_imported_declaration +; CHECK-NEXT:.b32 4664 // DW_AT_import +; CHECK-NEXT:.b8 31 // Abbrev [31] 0xa21:0x7 DW_TAG_imported_declaration ; CHECK-NEXT:.b8 4 // DW_AT_decl_file ; CHECK-NEXT:.b8 229 // DW_AT_decl_line -; CHECK-NEXT:.b32 4635 // DW_AT_import -; CHECK-NEXT:.b8 31 // Abbrev [31] 0x9e0:0x7 DW_TAG_imported_declaration +; CHECK-NEXT:.b32 4707 // DW_AT_import +; CHECK-NEXT:.b8 31 // Abbrev [31] 0xa28:0x7 DW_TAG_imported_declaration ; CHECK-NEXT:.b8 4 // DW_AT_decl_file ; CHECK-NEXT:.b8 230 // DW_AT_decl_line -; CHECK-NEXT:.b32 4672 // DW_AT_import -; CHECK-NEXT:.b8 31 // Abbrev [31] 0x9e7:0x7 DW_TAG_imported_declaration +; CHECK-NEXT:.b32 4744 // DW_AT_import +; CHECK-NEXT:.b8 31 // Abbrev [31] 0xa2f:0x7 DW_TAG_imported_declaration ; CHECK-NEXT:.b8 4 // DW_AT_decl_file ; CHECK-NEXT:.b8 231 // DW_AT_decl_line -; CHECK-NEXT:.b32 4703 // DW_AT_import -; CHECK-NEXT:.b8 31 // Abbrev [31] 0x9ee:0x7 
DW_TAG_imported_declaration +; CHECK-NEXT:.b32 4775 // DW_AT_import +; CHECK-NEXT:.b8 31 // Abbrev [31] 0xa36:0x7 DW_TAG_imported_declaration ; CHECK-NEXT:.b8 4 // DW_AT_decl_file ; CHECK-NEXT:.b8 232 // DW_AT_decl_line -; CHECK-NEXT:.b32 4748 // DW_AT_import -; CHECK-NEXT:.b8 31 // Abbrev [31] 0x9f5:0x7 DW_TAG_imported_declaration +; CHECK-NEXT:.b32 4820 // DW_AT_import +; CHECK-NEXT:.b8 31 // Abbrev [31] 0xa3d:0x7 DW_TAG_imported_declaration ; CHECK-NEXT:.b8 4 // DW_AT_decl_file ; CHECK-NEXT:.b8 233 // DW_AT_decl_line -; CHECK-NEXT:.b32 4793 // DW_AT_import -; CHECK-NEXT:.b8 31 // Abbrev [31] 0x9fc:0x7 DW_TAG_imported_declaration +; CHECK-NEXT:.b32 4865 // DW_AT_import +; CHECK-NEXT:.b8 31 // Abbrev [31] 0xa44:0x7 DW_TAG_imported_declaration ; CHECK-NEXT:.b8 4 // DW_AT_decl_file ; CHECK-NEXT:.b8 234 // DW_AT_decl_line -; CHECK-NEXT:.b32 4849 // DW_AT_import -; CHECK-NEXT:.b8 31 // Abbrev [31] 0xa03:0x7 DW_TAG_imported_declaration +; CHECK-NEXT:.b32 4921 // DW_AT_import +; CHECK-NEXT:.b8 31 // Abbrev [31] 0xa4b:0x7 DW_TAG_imported_declaration ; CHECK-NEXT:.b8 4 // DW_AT_decl_file ; CHECK-NEXT:.b8 235 // DW_AT_decl_line -; CHECK-NEXT:.b32 4880 // DW_AT_import -; CHECK-NEXT:.b8 31 // Abbrev [31] 0xa0a:0x7 DW_TAG_imported_declaration +; CHECK-NEXT:.b32 4952 // DW_AT_import +; CHECK-NEXT:.b8 31 // Abbrev [31] 0xa52:0x7 DW_TAG_imported_declaration ; CHECK-NEXT:.b8 4 // DW_AT_decl_file ; CHECK-NEXT:.b8 236 // DW_AT_decl_line -; CHECK-NEXT:.b32 4919 // DW_AT_import -; CHECK-NEXT:.b8 31 // Abbrev [31] 0xa11:0x7 DW_TAG_imported_declaration +; CHECK-NEXT:.b32 4991 // DW_AT_import +; CHECK-NEXT:.b8 31 // Abbrev [31] 0xa59:0x7 DW_TAG_imported_declaration ; CHECK-NEXT:.b8 4 // DW_AT_decl_file ; CHECK-NEXT:.b8 237 // DW_AT_decl_line -; CHECK-NEXT:.b32 4969 // DW_AT_import -; CHECK-NEXT:.b8 31 // Abbrev [31] 0xa18:0x7 DW_TAG_imported_declaration +; CHECK-NEXT:.b32 5041 // DW_AT_import +; CHECK-NEXT:.b8 31 // Abbrev [31] 0xa60:0x7 DW_TAG_imported_declaration ; CHECK-NEXT:.b8 4 // 
DW_AT_decl_file ; CHECK-NEXT:.b8 238 // DW_AT_decl_line -; CHECK-NEXT:.b32 5023 // DW_AT_import -; CHECK-NEXT:.b8 31 // Abbrev [31] 0xa1f:0x7 DW_TAG_imported_declaration +; CHECK-NEXT:.b32 5095 // DW_AT_import +; CHECK-NEXT:.b8 31 // Abbrev [31] 0xa67:0x7 DW_TAG_imported_declaration ; CHECK-NEXT:.b8 4 // DW_AT_decl_file ; CHECK-NEXT:.b8 239 // DW_AT_decl_line -; CHECK-NEXT:.b32 5054 // DW_AT_import -; CHECK-NEXT:.b8 31 // Abbrev [31] 0xa26:0x7 DW_TAG_imported_declaration +; CHECK-NEXT:.b32 5126 // DW_AT_import +; CHECK-NEXT:.b8 31 // Abbrev [31] 0xa6e:0x7 DW_TAG_imported_declaration ; CHECK-NEXT:.b8 4 // DW_AT_decl_file ; CHECK-NEXT:.b8 240 // DW_AT_decl_line -; CHECK-NEXT:.b32 5091 // DW_AT_import -; CHECK-NEXT:.b8 31 // Abbrev [31] 0xa2d:0x7 DW_TAG_imported_declaration +; CHECK-NEXT:.b32 5163 // DW_AT_import +; CHECK-NEXT:.b8 31 // Abbrev [31] 0xa75:0x7 DW_TAG_imported_declaration ; CHECK-NEXT:.b8 4 // DW_AT_decl_file ; CHECK-NEXT:.b8 241 // DW_AT_decl_line -; CHECK-NEXT:.b32 5141 // DW_AT_import -; CHECK-NEXT:.b8 31 // Abbrev [31] 0xa34:0x7 DW_TAG_imported_declaration +; CHECK-NEXT:.b32 5213 // DW_AT_import +; CHECK-NEXT:.b8 31 // Abbrev [31] 0xa7c:0x7 DW_TAG_imported_declaration ; CHECK-NEXT:.b8 4 // DW_AT_decl_file ; CHECK-NEXT:.b8 242 // DW_AT_decl_line -; CHECK-NEXT:.b32 5182 // DW_AT_import -; CHECK-NEXT:.b8 31 // Abbrev [31] 0xa3b:0x7 DW_TAG_imported_declaration +; CHECK-NEXT:.b32 5254 // DW_AT_import +; CHECK-NEXT:.b8 31 // Abbrev [31] 0xa83:0x7 DW_TAG_imported_declaration ; CHECK-NEXT:.b8 4 // DW_AT_decl_file ; CHECK-NEXT:.b8 243 // DW_AT_decl_line -; CHECK-NEXT:.b32 5219 // DW_AT_import -; CHECK-NEXT:.b8 31 // Abbrev [31] 0xa42:0x7 DW_TAG_imported_declaration +; CHECK-NEXT:.b32 5291 // DW_AT_import +; CHECK-NEXT:.b8 31 // Abbrev [31] 0xa8a:0x7 DW_TAG_imported_declaration ; CHECK-NEXT:.b8 4 // DW_AT_decl_file ; CHECK-NEXT:.b8 244 // DW_AT_decl_line -; CHECK-NEXT:.b32 5252 // DW_AT_import -; CHECK-NEXT:.b8 31 // Abbrev [31] 0xa49:0x7 
DW_TAG_imported_declaration +; CHECK-NEXT:.b32 5324 // DW_AT_import +; CHECK-NEXT:.b8 31 // Abbrev [31] 0xa91:0x7 DW_TAG_imported_declaration ; CHECK-NEXT:.b8 4 // DW_AT_decl_file ; CHECK-NEXT:.b8 245 // DW_AT_decl_line -; CHECK-NEXT:.b32 5283 // DW_AT_import -; CHECK-NEXT:.b8 31 // Abbrev [31] 0xa50:0x7 DW_TAG_imported_declaration +; CHECK-NEXT:.b32 5355 // DW_AT_import +; CHECK-NEXT:.b8 31 // Abbrev [31] 0xa98:0x7 DW_TAG_imported_declaration ; CHECK-NEXT:.b8 4 // DW_AT_decl_file ; CHECK-NEXT:.b8 246 // DW_AT_decl_line -; CHECK-NEXT:.b32 5316 // DW_AT_import -; CHECK-NEXT:.b8 31 // Abbrev [31] 0xa57:0x7 DW_TAG_imported_declaration +; CHECK-NEXT:.b32 5388 // DW_AT_import +; CHECK-NEXT:.b8 31 // Abbrev [31] 0xa9f:0x7 DW_TAG_imported_declaration ; CHECK-NEXT:.b8 4 // DW_AT_decl_file ; CHECK-NEXT:.b8 247 // DW_AT_decl_line -; CHECK-NEXT:.b32 5343 // DW_AT_import -; CHECK-NEXT:.b8 31 // Abbrev [31] 0xa5e:0x7 DW_TAG_imported_declaration +; CHECK-NEXT:.b32 5415 // DW_AT_import +; CHECK-NEXT:.b8 31 // Abbrev [31] 0xaa6:0x7 DW_TAG_imported_declaration ; CHECK-NEXT:.b8 4 // DW_AT_decl_file ; CHECK-NEXT:.b8 248 // DW_AT_decl_line -; CHECK-NEXT:.b32 5374 // DW_AT_import -; CHECK-NEXT:.b8 31 // Abbrev [31] 0xa65:0x7 DW_TAG_imported_declaration +; CHECK-NEXT:.b32 5446 // DW_AT_import +; CHECK-NEXT:.b8 31 // Abbrev [31] 0xaad:0x7 DW_TAG_imported_declaration ; CHECK-NEXT:.b8 4 // DW_AT_decl_file ; CHECK-NEXT:.b8 249 // DW_AT_decl_line -; CHECK-NEXT:.b32 5405 // DW_AT_import -; CHECK-NEXT:.b8 31 // Abbrev [31] 0xa6c:0x7 DW_TAG_imported_declaration +; CHECK-NEXT:.b32 5477 // DW_AT_import +; CHECK-NEXT:.b8 31 // Abbrev [31] 0xab4:0x7 DW_TAG_imported_declaration ; CHECK-NEXT:.b8 4 // DW_AT_decl_file ; CHECK-NEXT:.b8 250 // DW_AT_decl_line -; CHECK-NEXT:.b32 5434 // DW_AT_import -; CHECK-NEXT:.b8 31 // Abbrev [31] 0xa73:0x7 DW_TAG_imported_declaration +; CHECK-NEXT:.b32 5506 // DW_AT_import +; CHECK-NEXT:.b8 31 // Abbrev [31] 0xabb:0x7 DW_TAG_imported_declaration ; CHECK-NEXT:.b8 4 // 
DW_AT_decl_file ; CHECK-NEXT:.b8 251 // DW_AT_decl_line -; CHECK-NEXT:.b32 5463 // DW_AT_import -; CHECK-NEXT:.b8 31 // Abbrev [31] 0xa7a:0x7 DW_TAG_imported_declaration +; CHECK-NEXT:.b32 5535 // DW_AT_import +; CHECK-NEXT:.b8 31 // Abbrev [31] 0xac2:0x7 DW_TAG_imported_declaration ; CHECK-NEXT:.b8 4 // DW_AT_decl_file ; CHECK-NEXT:.b8 252 // DW_AT_decl_line -; CHECK-NEXT:.b32 5494 // DW_AT_import -; CHECK-NEXT:.b8 31 // Abbrev [31] 0xa81:0x7 DW_TAG_imported_declaration +; CHECK-NEXT:.b32 5566 // DW_AT_import +; CHECK-NEXT:.b8 31 // Abbrev [31] 0xac9:0x7 DW_TAG_imported_declaration ; CHECK-NEXT:.b8 4 // DW_AT_decl_file ; CHECK-NEXT:.b8 253 // DW_AT_decl_line -; CHECK-NEXT:.b32 5527 // DW_AT_import -; CHECK-NEXT:.b8 31 // Abbrev [31] 0xa88:0x7 DW_TAG_imported_declaration +; CHECK-NEXT:.b32 5599 // DW_AT_import +; CHECK-NEXT:.b8 31 // Abbrev [31] 0xad0:0x7 DW_TAG_imported_declaration ; CHECK-NEXT:.b8 4 // DW_AT_decl_file ; CHECK-NEXT:.b8 254 // DW_AT_decl_line -; CHECK-NEXT:.b32 5562 // DW_AT_import -; CHECK-NEXT:.b8 31 // Abbrev [31] 0xa8f:0x7 DW_TAG_imported_declaration +; CHECK-NEXT:.b32 5634 // DW_AT_import +; CHECK-NEXT:.b8 31 // Abbrev [31] 0xad7:0x7 DW_TAG_imported_declaration ; CHECK-NEXT:.b8 4 // DW_AT_decl_file ; CHECK-NEXT:.b8 255 // DW_AT_decl_line -; CHECK-NEXT:.b32 5598 // DW_AT_import -; CHECK-NEXT:.b8 32 // Abbrev [32] 0xa96:0x8 DW_TAG_imported_declaration +; CHECK-NEXT:.b32 5670 // DW_AT_import +; CHECK-NEXT:.b8 32 // Abbrev [32] 0xade:0x8 DW_TAG_imported_declaration ; CHECK-NEXT:.b8 4 // DW_AT_decl_file ; CHECK-NEXT:.b8 0 // DW_AT_decl_line ; CHECK-NEXT:.b8 1 -; CHECK-NEXT:.b32 5655 // DW_AT_import -; CHECK-NEXT:.b8 32 // Abbrev [32] 0xa9e:0x8 DW_TAG_imported_declaration +; CHECK-NEXT:.b32 5727 // DW_AT_import +; CHECK-NEXT:.b8 32 // Abbrev [32] 0xae6:0x8 DW_TAG_imported_declaration ; CHECK-NEXT:.b8 4 // DW_AT_decl_file ; CHECK-NEXT:.b8 1 // DW_AT_decl_line ; CHECK-NEXT:.b8 1 -; CHECK-NEXT:.b32 5686 // DW_AT_import -; CHECK-NEXT:.b8 32 // Abbrev 
[32] 0xaa6:0x8 DW_TAG_imported_declaration +; CHECK-NEXT:.b32 5758 // DW_AT_import +; CHECK-NEXT:.b8 32 // Abbrev [32] 0xaee:0x8 DW_TAG_imported_declaration ; CHECK-NEXT:.b8 4 // DW_AT_decl_file ; CHECK-NEXT:.b8 2 // DW_AT_decl_line ; CHECK-NEXT:.b8 1 -; CHECK-NEXT:.b32 5725 // DW_AT_import -; CHECK-NEXT:.b8 32 // Abbrev [32] 0xaae:0x8 DW_TAG_imported_declaration +; CHECK-NEXT:.b32 5797 // DW_AT_import +; CHECK-NEXT:.b8 32 // Abbrev [32] 0xaf6:0x8 DW_TAG_imported_declaration ; CHECK-NEXT:.b8 4 // DW_AT_decl_file ; CHECK-NEXT:.b8 3 // DW_AT_decl_line ; CHECK-NEXT:.b8 1 -; CHECK-NEXT:.b32 5770 // DW_AT_import -; CHECK-NEXT:.b8 32 // Abbrev [32] 0xab6:0x8 DW_TAG_imported_declaration +; CHECK-NEXT:.b32 5842 // DW_AT_import +; CHECK-NEXT:.b8 32 // Abbrev [32] 0xafe:0x8 DW_TAG_imported_declaration ; CHECK-NEXT:.b8 4 // DW_AT_decl_file ; CHECK-NEXT:.b8 4 // DW_AT_decl_line ; CHECK-NEXT:.b8 1 -; CHECK-NEXT:.b32 5803 // DW_AT_import -; CHECK-NEXT:.b8 32 // Abbrev [32] 0xabe:0x8 DW_TAG_imported_declaration +; CHECK-NEXT:.b32 5875 // DW_AT_import +; CHECK-NEXT:.b8 32 // Abbrev [32] 0xb06:0x8 DW_TAG_imported_declaration ; CHECK-NEXT:.b8 4 // DW_AT_decl_file ; CHECK-NEXT:.b8 5 // DW_AT_decl_line ; CHECK-NEXT:.b8 1 -; CHECK-NEXT:.b32 5848 // DW_AT_import -; CHECK-NEXT:.b8 32 // Abbrev [32] 0xac6:0x8 DW_TAG_imported_declaration +; CHECK-NEXT:.b32 5920 // DW_AT_import +; CHECK-NEXT:.b8 32 // Abbrev [32] 0xb0e:0x8 DW_TAG_imported_declaration ; CHECK-NEXT:.b8 4 // DW_AT_decl_file ; CHECK-NEXT:.b8 6 // DW_AT_decl_line ; CHECK-NEXT:.b8 1 -; CHECK-NEXT:.b32 5894 // DW_AT_import -; CHECK-NEXT:.b8 32 // Abbrev [32] 0xace:0x8 DW_TAG_imported_declaration +; CHECK-NEXT:.b32 5966 // DW_AT_import +; CHECK-NEXT:.b8 32 // Abbrev [32] 0xb16:0x8 DW_TAG_imported_declaration ; CHECK-NEXT:.b8 4 // DW_AT_decl_file ; CHECK-NEXT:.b8 7 // DW_AT_decl_line ; CHECK-NEXT:.b8 1 -; CHECK-NEXT:.b32 5923 // DW_AT_import -; CHECK-NEXT:.b8 32 // Abbrev [32] 0xad6:0x8 DW_TAG_imported_declaration +; CHECK-NEXT:.b32 
5995 // DW_AT_import +; CHECK-NEXT:.b8 32 // Abbrev [32] 0xb1e:0x8 DW_TAG_imported_declaration ; CHECK-NEXT:.b8 4 // DW_AT_decl_file ; CHECK-NEXT:.b8 8 // DW_AT_decl_line ; CHECK-NEXT:.b8 1 -; CHECK-NEXT:.b32 5954 // DW_AT_import -; CHECK-NEXT:.b8 32 // Abbrev [32] 0xade:0x8 DW_TAG_imported_declaration +; CHECK-NEXT:.b32 6026 // DW_AT_import +; CHECK-NEXT:.b8 32 // Abbrev [32] 0xb26:0x8 DW_TAG_imported_declaration ; CHECK-NEXT:.b8 4 // DW_AT_decl_file ; CHECK-NEXT:.b8 9 // DW_AT_decl_line ; CHECK-NEXT:.b8 1 -; CHECK-NEXT:.b32 5995 // DW_AT_import -; CHECK-NEXT:.b8 32 // Abbrev [32] 0xae6:0x8 DW_TAG_imported_declaration +; CHECK-NEXT:.b32 6067 // DW_AT_import +; CHECK-NEXT:.b8 32 // Abbrev [32] 0xb2e:0x8 DW_TAG_imported_declaration ; CHECK-NEXT:.b8 4 // DW_AT_decl_file ; CHECK-NEXT:.b8 10 // DW_AT_decl_line ; CHECK-NEXT:.b8 1 -; CHECK-NEXT:.b32 6034 // DW_AT_import -; CHECK-NEXT:.b8 32 // Abbrev [32] 0xaee:0x8 DW_TAG_imported_declaration +; CHECK-NEXT:.b32 6106 // DW_AT_import +; CHECK-NEXT:.b8 32 // Abbrev [32] 0xb36:0x8 DW_TAG_imported_declaration ; CHECK-NEXT:.b8 4 // DW_AT_decl_file ; CHECK-NEXT:.b8 11 // DW_AT_decl_line ; CHECK-NEXT:.b8 1 -; CHECK-NEXT:.b32 6069 // DW_AT_import -; CHECK-NEXT:.b8 32 // Abbrev [32] 0xaf6:0x8 DW_TAG_imported_declaration +; CHECK-NEXT:.b32 6141 // DW_AT_import +; CHECK-NEXT:.b8 32 // Abbrev [32] 0xb3e:0x8 DW_TAG_imported_declaration ; CHECK-NEXT:.b8 4 // DW_AT_decl_file ; CHECK-NEXT:.b8 12 // DW_AT_decl_line ; CHECK-NEXT:.b8 1 -; CHECK-NEXT:.b32 6096 // DW_AT_import -; CHECK-NEXT:.b8 32 // Abbrev [32] 0xafe:0x8 DW_TAG_imported_declaration +; CHECK-NEXT:.b32 6168 // DW_AT_import +; CHECK-NEXT:.b8 32 // Abbrev [32] 0xb46:0x8 DW_TAG_imported_declaration ; CHECK-NEXT:.b8 4 // DW_AT_decl_file ; CHECK-NEXT:.b8 13 // DW_AT_decl_line ; CHECK-NEXT:.b8 1 -; CHECK-NEXT:.b32 6125 // DW_AT_import -; CHECK-NEXT:.b8 32 // Abbrev [32] 0xb06:0x8 DW_TAG_imported_declaration +; CHECK-NEXT:.b32 6197 // DW_AT_import +; CHECK-NEXT:.b8 32 // Abbrev [32] 
0xb4e:0x8 DW_TAG_imported_declaration ; CHECK-NEXT:.b8 4 // DW_AT_decl_file ; CHECK-NEXT:.b8 14 // DW_AT_decl_line ; CHECK-NEXT:.b8 1 -; CHECK-NEXT:.b32 6154 // DW_AT_import -; CHECK-NEXT:.b8 32 // Abbrev [32] 0xb0e:0x8 DW_TAG_imported_declaration +; CHECK-NEXT:.b32 6226 // DW_AT_import +; CHECK-NEXT:.b8 32 // Abbrev [32] 0xb56:0x8 DW_TAG_imported_declaration ; CHECK-NEXT:.b8 4 // DW_AT_decl_file ; CHECK-NEXT:.b8 15 // DW_AT_decl_line ; CHECK-NEXT:.b8 1 -; CHECK-NEXT:.b32 6181 // DW_AT_import -; CHECK-NEXT:.b8 32 // Abbrev [32] 0xb16:0x8 DW_TAG_imported_declaration +; CHECK-NEXT:.b32 6253 // DW_AT_import +; CHECK-NEXT:.b8 32 // Abbrev [32] 0xb5e:0x8 DW_TAG_imported_declaration ; CHECK-NEXT:.b8 4 // DW_AT_decl_file ; CHECK-NEXT:.b8 16 // DW_AT_decl_line ; CHECK-NEXT:.b8 1 -; CHECK-NEXT:.b32 6210 // DW_AT_import -; CHECK-NEXT:.b8 32 // Abbrev [32] 0xb1e:0x8 DW_TAG_imported_declaration +; CHECK-NEXT:.b32 6282 // DW_AT_import +; CHECK-NEXT:.b8 32 // Abbrev [32] 0xb66:0x8 DW_TAG_imported_declaration ; CHECK-NEXT:.b8 4 // DW_AT_decl_file ; CHECK-NEXT:.b8 17 // DW_AT_decl_line ; CHECK-NEXT:.b8 1 -; CHECK-NEXT:.b32 6243 // DW_AT_import -; CHECK-NEXT:.b8 31 // Abbrev [31] 0xb26:0x7 DW_TAG_imported_declaration +; CHECK-NEXT:.b32 6315 // DW_AT_import +; CHECK-NEXT:.b8 31 // Abbrev [31] 0xb6e:0x7 DW_TAG_imported_declaration ; CHECK-NEXT:.b8 6 // DW_AT_decl_file ; CHECK-NEXT:.b8 102 // DW_AT_decl_line -; CHECK-NEXT:.b32 6274 // DW_AT_import -; CHECK-NEXT:.b8 31 // Abbrev [31] 0xb2d:0x7 DW_TAG_imported_declaration +; CHECK-NEXT:.b32 6346 // DW_AT_import +; CHECK-NEXT:.b8 31 // Abbrev [31] 0xb75:0x7 DW_TAG_imported_declaration ; CHECK-NEXT:.b8 6 // DW_AT_decl_file ; CHECK-NEXT:.b8 121 // DW_AT_decl_line -; CHECK-NEXT:.b32 6294 // DW_AT_import -; CHECK-NEXT:.b8 31 // Abbrev [31] 0xb34:0x7 DW_TAG_imported_declaration +; CHECK-NEXT:.b32 6366 // DW_AT_import +; CHECK-NEXT:.b8 31 // Abbrev [31] 0xb7c:0x7 DW_TAG_imported_declaration ; CHECK-NEXT:.b8 6 // DW_AT_decl_file ; 
CHECK-NEXT:.b8 140 // DW_AT_decl_line -; CHECK-NEXT:.b32 6314 // DW_AT_import -; CHECK-NEXT:.b8 31 // Abbrev [31] 0xb3b:0x7 DW_TAG_imported_declaration +; CHECK-NEXT:.b32 6386 // DW_AT_import +; CHECK-NEXT:.b8 31 // Abbrev [31] 0xb83:0x7 DW_TAG_imported_declaration ; CHECK-NEXT:.b8 6 // DW_AT_decl_file ; CHECK-NEXT:.b8 159 // DW_AT_decl_line -; CHECK-NEXT:.b32 6334 // DW_AT_import -; CHECK-NEXT:.b8 31 // Abbrev [31] 0xb42:0x7 DW_TAG_imported_declaration +; CHECK-NEXT:.b32 6406 // DW_AT_import +; CHECK-NEXT:.b8 31 // Abbrev [31] 0xb8a:0x7 DW_TAG_imported_declaration ; CHECK-NEXT:.b8 6 // DW_AT_decl_file ; CHECK-NEXT:.b8 180 // DW_AT_decl_line -; CHECK-NEXT:.b32 6360 // DW_AT_import -; CHECK-NEXT:.b8 31 // Abbrev [31] 0xb49:0x7 DW_TAG_imported_declaration +; CHECK-NEXT:.b32 6432 // DW_AT_import +; CHECK-NEXT:.b8 31 // Abbrev [31] 0xb91:0x7 DW_TAG_imported_declaration ; CHECK-NEXT:.b8 6 // DW_AT_decl_file ; CHECK-NEXT:.b8 199 // DW_AT_decl_line -; CHECK-NEXT:.b32 6380 // DW_AT_import -; CHECK-NEXT:.b8 31 // Abbrev [31] 0xb50:0x7 DW_TAG_imported_declaration +; CHECK-NEXT:.b32 6452 // DW_AT_import +; CHECK-NEXT:.b8 31 // Abbrev [31] 0xb98:0x7 DW_TAG_imported_declaration ; CHECK-NEXT:.b8 6 // DW_AT_decl_file ; CHECK-NEXT:.b8 218 // DW_AT_decl_line -; CHECK-NEXT:.b32 6399 // DW_AT_import -; CHECK-NEXT:.b8 31 // Abbrev [31] 0xb57:0x7 DW_TAG_imported_declaration +; CHECK-NEXT:.b32 6471 // DW_AT_import +; CHECK-NEXT:.b8 31 // Abbrev [31] 0xb9f:0x7 DW_TAG_imported_declaration ; CHECK-NEXT:.b8 6 // DW_AT_decl_file ; CHECK-NEXT:.b8 237 // DW_AT_decl_line -; CHECK-NEXT:.b32 6419 // DW_AT_import -; CHECK-NEXT:.b8 32 // Abbrev [32] 0xb5e:0x8 DW_TAG_imported_declaration +; CHECK-NEXT:.b32 6491 // DW_AT_import +; CHECK-NEXT:.b8 32 // Abbrev [32] 0xba6:0x8 DW_TAG_imported_declaration ; CHECK-NEXT:.b8 6 // DW_AT_decl_file ; CHECK-NEXT:.b8 0 // DW_AT_decl_line ; CHECK-NEXT:.b8 1 -; CHECK-NEXT:.b32 6438 // DW_AT_import -; CHECK-NEXT:.b8 32 // Abbrev [32] 0xb66:0x8 
DW_TAG_imported_declaration +; CHECK-NEXT:.b32 6510 // DW_AT_import +; CHECK-NEXT:.b8 32 // Abbrev [32] 0xbae:0x8 DW_TAG_imported_declaration ; CHECK-NEXT:.b8 6 // DW_AT_decl_file ; CHECK-NEXT:.b8 19 // DW_AT_decl_line ; CHECK-NEXT:.b8 1 -; CHECK-NEXT:.b32 6458 // DW_AT_import -; CHECK-NEXT:.b8 32 // Abbrev [32] 0xb6e:0x8 DW_TAG_imported_declaration +; CHECK-NEXT:.b32 6530 // DW_AT_import +; CHECK-NEXT:.b8 32 // Abbrev [32] 0xbb6:0x8 DW_TAG_imported_declaration ; CHECK-NEXT:.b8 6 // DW_AT_decl_file ; CHECK-NEXT:.b8 38 // DW_AT_decl_line ; CHECK-NEXT:.b8 1 -; CHECK-NEXT:.b32 6479 // DW_AT_import -; CHECK-NEXT:.b8 32 // Abbrev [32] 0xb76:0x8 DW_TAG_imported_declaration +; CHECK-NEXT:.b32 6551 // DW_AT_import +; CHECK-NEXT:.b8 32 // Abbrev [32] 0xbbe:0x8 DW_TAG_imported_declaration ; CHECK-NEXT:.b8 6 // DW_AT_decl_file ; CHECK-NEXT:.b8 59 // DW_AT_decl_line ; CHECK-NEXT:.b8 1 -; CHECK-NEXT:.b32 6504 // DW_AT_import -; CHECK-NEXT:.b8 32 // Abbrev [32] 0xb7e:0x8 DW_TAG_imported_declaration +; CHECK-NEXT:.b32 6576 // DW_AT_import +; CHECK-NEXT:.b8 32 // Abbrev [32] 0xbc6:0x8 DW_TAG_imported_declaration ; CHECK-NEXT:.b8 6 // DW_AT_decl_file ; CHECK-NEXT:.b8 78 // DW_AT_decl_line ; CHECK-NEXT:.b8 1 -; CHECK-NEXT:.b32 6530 // DW_AT_import -; CHECK-NEXT:.b8 32 // Abbrev [32] 0xb86:0x8 DW_TAG_imported_declaration +; CHECK-NEXT:.b32 6602 // DW_AT_import +; CHECK-NEXT:.b8 32 // Abbrev [32] 0xbce:0x8 DW_TAG_imported_declaration ; CHECK-NEXT:.b8 6 // DW_AT_decl_file ; CHECK-NEXT:.b8 97 // DW_AT_decl_line ; CHECK-NEXT:.b8 1 -; CHECK-NEXT:.b32 6556 // DW_AT_import -; CHECK-NEXT:.b8 32 // Abbrev [32] 0xb8e:0x8 DW_TAG_imported_declaration +; CHECK-NEXT:.b32 6628 // DW_AT_import +; CHECK-NEXT:.b8 32 // Abbrev [32] 0xbd6:0x8 DW_TAG_imported_declaration ; CHECK-NEXT:.b8 6 // DW_AT_decl_file ; CHECK-NEXT:.b8 116 // DW_AT_decl_line ; CHECK-NEXT:.b8 1 -; CHECK-NEXT:.b32 6575 // DW_AT_import -; CHECK-NEXT:.b8 32 // Abbrev [32] 0xb96:0x8 DW_TAG_imported_declaration +; CHECK-NEXT:.b32 6647 // 
DW_AT_import +; CHECK-NEXT:.b8 32 // Abbrev [32] 0xbde:0x8 DW_TAG_imported_declaration ; CHECK-NEXT:.b8 6 // DW_AT_decl_file ; CHECK-NEXT:.b8 135 // DW_AT_decl_line ; CHECK-NEXT:.b8 1 -; CHECK-NEXT:.b32 6596 // DW_AT_import -; CHECK-NEXT:.b8 32 // Abbrev [32] 0xb9e:0x8 DW_TAG_imported_declaration +; CHECK-NEXT:.b32 6668 // DW_AT_import +; CHECK-NEXT:.b8 32 // Abbrev [32] 0xbe6:0x8 DW_TAG_imported_declaration ; CHECK-NEXT:.b8 6 // DW_AT_decl_file ; CHECK-NEXT:.b8 147 // DW_AT_decl_line ; CHECK-NEXT:.b8 1 -; CHECK-NEXT:.b32 6626 // DW_AT_import -; CHECK-NEXT:.b8 32 // Abbrev [32] 0xba6:0x8 DW_TAG_imported_declaration +; CHECK-NEXT:.b32 6698 // DW_AT_import +; CHECK-NEXT:.b8 32 // Abbrev [32] 0xbee:0x8 DW_TAG_imported_declaration ; CHECK-NEXT:.b8 6 // DW_AT_decl_file ; CHECK-NEXT:.b8 184 // DW_AT_decl_line ; CHECK-NEXT:.b8 1 -; CHECK-NEXT:.b32 6650 // DW_AT_import -; CHECK-NEXT:.b8 32 // Abbrev [32] 0xbae:0x8 DW_TAG_imported_declaration +; CHECK-NEXT:.b32 6722 // DW_AT_import +; CHECK-NEXT:.b8 32 // Abbrev [32] 0xbf6:0x8 DW_TAG_imported_declaration ; CHECK-NEXT:.b8 6 // DW_AT_decl_file ; CHECK-NEXT:.b8 203 // DW_AT_decl_line ; CHECK-NEXT:.b8 1 -; CHECK-NEXT:.b32 6669 // DW_AT_import -; CHECK-NEXT:.b8 32 // Abbrev [32] 0xbb6:0x8 DW_TAG_imported_declaration +; CHECK-NEXT:.b32 6741 // DW_AT_import +; CHECK-NEXT:.b8 32 // Abbrev [32] 0xbfe:0x8 DW_TAG_imported_declaration ; CHECK-NEXT:.b8 6 // DW_AT_decl_file ; CHECK-NEXT:.b8 222 // DW_AT_decl_line ; CHECK-NEXT:.b8 1 -; CHECK-NEXT:.b32 6689 // DW_AT_import -; CHECK-NEXT:.b8 32 // Abbrev [32] 0xbbe:0x8 DW_TAG_imported_declaration +; CHECK-NEXT:.b32 6761 // DW_AT_import +; CHECK-NEXT:.b8 32 // Abbrev [32] 0xc06:0x8 DW_TAG_imported_declaration ; CHECK-NEXT:.b8 6 // DW_AT_decl_file ; CHECK-NEXT:.b8 241 // DW_AT_decl_line ; CHECK-NEXT:.b8 1 -; CHECK-NEXT:.b32 6709 // DW_AT_import -; CHECK-NEXT:.b8 32 // Abbrev [32] 0xbc6:0x8 DW_TAG_imported_declaration +; CHECK-NEXT:.b32 6781 // DW_AT_import +; CHECK-NEXT:.b8 32 // Abbrev [32] 
0xc0e:0x8 DW_TAG_imported_declaration ; CHECK-NEXT:.b8 6 // DW_AT_decl_file ; CHECK-NEXT:.b8 4 // DW_AT_decl_line ; CHECK-NEXT:.b8 2 -; CHECK-NEXT:.b32 6728 // DW_AT_import -; CHECK-NEXT:.b8 31 // Abbrev [31] 0xbce:0x7 DW_TAG_imported_declaration +; CHECK-NEXT:.b32 6800 // DW_AT_import +; CHECK-NEXT:.b8 31 // Abbrev [31] 0xc16:0x7 DW_TAG_imported_declaration ; CHECK-NEXT:.b8 8 // DW_AT_decl_file ; CHECK-NEXT:.b8 118 // DW_AT_decl_line -; CHECK-NEXT:.b32 6748 // DW_AT_import -; CHECK-NEXT:.b8 31 // Abbrev [31] 0xbd5:0x7 DW_TAG_imported_declaration +; CHECK-NEXT:.b32 6820 // DW_AT_import +; CHECK-NEXT:.b8 31 // Abbrev [31] 0xc1d:0x7 DW_TAG_imported_declaration ; CHECK-NEXT:.b8 8 // DW_AT_decl_file ; CHECK-NEXT:.b8 119 // DW_AT_decl_line -; CHECK-NEXT:.b32 6763 // DW_AT_import -; CHECK-NEXT:.b8 31 // Abbrev [31] 0xbdc:0x7 DW_TAG_imported_declaration +; CHECK-NEXT:.b32 6835 // DW_AT_import +; CHECK-NEXT:.b8 31 // Abbrev [31] 0xc24:0x7 DW_TAG_imported_declaration ; CHECK-NEXT:.b8 8 // DW_AT_decl_file ; CHECK-NEXT:.b8 121 // DW_AT_decl_line -; CHECK-NEXT:.b32 6811 // DW_AT_import -; CHECK-NEXT:.b8 31 // Abbrev [31] 0xbe3:0x7 DW_TAG_imported_declaration +; CHECK-NEXT:.b32 6883 // DW_AT_import +; CHECK-NEXT:.b8 31 // Abbrev [31] 0xc2b:0x7 DW_TAG_imported_declaration ; CHECK-NEXT:.b8 8 // DW_AT_decl_file ; CHECK-NEXT:.b8 122 // DW_AT_decl_line -; CHECK-NEXT:.b32 6824 // DW_AT_import -; CHECK-NEXT:.b8 31 // Abbrev [31] 0xbea:0x7 DW_TAG_imported_declaration +; CHECK-NEXT:.b32 6896 // DW_AT_import +; CHECK-NEXT:.b8 31 // Abbrev [31] 0xc32:0x7 DW_TAG_imported_declaration ; CHECK-NEXT:.b8 8 // DW_AT_decl_file ; CHECK-NEXT:.b8 123 // DW_AT_decl_line -; CHECK-NEXT:.b32 6844 // DW_AT_import -; CHECK-NEXT:.b8 31 // Abbrev [31] 0xbf1:0x7 DW_TAG_imported_declaration +; CHECK-NEXT:.b32 6916 // DW_AT_import +; CHECK-NEXT:.b8 31 // Abbrev [31] 0xc39:0x7 DW_TAG_imported_declaration ; CHECK-NEXT:.b8 8 // DW_AT_decl_file ; CHECK-NEXT:.b8 129 // DW_AT_decl_line -; CHECK-NEXT:.b32 6873 // 
DW_AT_import -; CHECK-NEXT:.b8 31 // Abbrev [31] 0xbf8:0x7 DW_TAG_imported_declaration +; CHECK-NEXT:.b32 6945 // DW_AT_import +; CHECK-NEXT:.b8 31 // Abbrev [31] 0xc40:0x7 DW_TAG_imported_declaration ; CHECK-NEXT:.b8 8 // DW_AT_decl_file ; CHECK-NEXT:.b8 130 // DW_AT_decl_line -; CHECK-NEXT:.b32 6893 // DW_AT_import -; CHECK-NEXT:.b8 31 // Abbrev [31] 0xbff:0x7 DW_TAG_imported_declaration +; CHECK-NEXT:.b32 6965 // DW_AT_import +; CHECK-NEXT:.b8 31 // Abbrev [31] 0xc47:0x7 DW_TAG_imported_declaration ; CHECK-NEXT:.b8 8 // DW_AT_decl_file ; CHECK-NEXT:.b8 131 // DW_AT_decl_line -; CHECK-NEXT:.b32 6914 // DW_AT_import -; CHECK-NEXT:.b8 31 // Abbrev [31] 0xc06:0x7 DW_TAG_imported_declaration +; CHECK-NEXT:.b32 6986 // DW_AT_import +; CHECK-NEXT:.b8 31 // Abbrev [31] 0xc4e:0x7 DW_TAG_imported_declaration ; CHECK-NEXT:.b8 8 // DW_AT_decl_file ; CHECK-NEXT:.b8 132 // DW_AT_decl_line -; CHECK-NEXT:.b32 6935 // DW_AT_import -; CHECK-NEXT:.b8 31 // Abbrev [31] 0xc0d:0x7 DW_TAG_imported_declaration +; CHECK-NEXT:.b32 7007 // DW_AT_import +; CHECK-NEXT:.b8 31 // Abbrev [31] 0xc55:0x7 DW_TAG_imported_declaration ; CHECK-NEXT:.b8 8 // DW_AT_decl_file ; CHECK-NEXT:.b8 133 // DW_AT_decl_line -; CHECK-NEXT:.b32 7063 // DW_AT_import -; CHECK-NEXT:.b8 31 // Abbrev [31] 0xc14:0x7 DW_TAG_imported_declaration +; CHECK-NEXT:.b32 7135 // DW_AT_import +; CHECK-NEXT:.b8 31 // Abbrev [31] 0xc5c:0x7 DW_TAG_imported_declaration ; CHECK-NEXT:.b8 8 // DW_AT_decl_file ; CHECK-NEXT:.b8 134 // DW_AT_decl_line -; CHECK-NEXT:.b32 7091 // DW_AT_import -; CHECK-NEXT:.b8 31 // Abbrev [31] 0xc1b:0x7 DW_TAG_imported_declaration +; CHECK-NEXT:.b32 7163 // DW_AT_import +; CHECK-NEXT:.b8 31 // Abbrev [31] 0xc63:0x7 DW_TAG_imported_declaration ; CHECK-NEXT:.b8 8 // DW_AT_decl_file ; CHECK-NEXT:.b8 135 // DW_AT_decl_line -; CHECK-NEXT:.b32 7116 // DW_AT_import -; CHECK-NEXT:.b8 31 // Abbrev [31] 0xc22:0x7 DW_TAG_imported_declaration +; CHECK-NEXT:.b32 7188 // DW_AT_import +; CHECK-NEXT:.b8 31 // Abbrev [31] 
0xc6a:0x7 DW_TAG_imported_declaration ; CHECK-NEXT:.b8 8 // DW_AT_decl_file ; CHECK-NEXT:.b8 136 // DW_AT_decl_line -; CHECK-NEXT:.b32 7134 // DW_AT_import -; CHECK-NEXT:.b8 31 // Abbrev [31] 0xc29:0x7 DW_TAG_imported_declaration +; CHECK-NEXT:.b32 7206 // DW_AT_import +; CHECK-NEXT:.b8 31 // Abbrev [31] 0xc71:0x7 DW_TAG_imported_declaration ; CHECK-NEXT:.b8 8 // DW_AT_decl_file ; CHECK-NEXT:.b8 137 // DW_AT_decl_line -; CHECK-NEXT:.b32 7151 // DW_AT_import -; CHECK-NEXT:.b8 31 // Abbrev [31] 0xc30:0x7 DW_TAG_imported_declaration +; CHECK-NEXT:.b32 7223 // DW_AT_import +; CHECK-NEXT:.b8 31 // Abbrev [31] 0xc78:0x7 DW_TAG_imported_declaration ; CHECK-NEXT:.b8 8 // DW_AT_decl_file ; CHECK-NEXT:.b8 138 // DW_AT_decl_line -; CHECK-NEXT:.b32 7179 // DW_AT_import -; CHECK-NEXT:.b8 31 // Abbrev [31] 0xc37:0x7 DW_TAG_imported_declaration +; CHECK-NEXT:.b32 7251 // DW_AT_import +; CHECK-NEXT:.b8 31 // Abbrev [31] 0xc7f:0x7 DW_TAG_imported_declaration ; CHECK-NEXT:.b8 8 // DW_AT_decl_file ; CHECK-NEXT:.b8 139 // DW_AT_decl_line -; CHECK-NEXT:.b32 7200 // DW_AT_import -; CHECK-NEXT:.b8 31 // Abbrev [31] 0xc3e:0x7 DW_TAG_imported_declaration +; CHECK-NEXT:.b32 7272 // DW_AT_import +; CHECK-NEXT:.b8 31 // Abbrev [31] 0xc86:0x7 DW_TAG_imported_declaration ; CHECK-NEXT:.b8 8 // DW_AT_decl_file ; CHECK-NEXT:.b8 140 // DW_AT_decl_line -; CHECK-NEXT:.b32 7226 // DW_AT_import -; CHECK-NEXT:.b8 31 // Abbrev [31] 0xc45:0x7 DW_TAG_imported_declaration +; CHECK-NEXT:.b32 7298 // DW_AT_import +; CHECK-NEXT:.b8 31 // Abbrev [31] 0xc8d:0x7 DW_TAG_imported_declaration ; CHECK-NEXT:.b8 8 // DW_AT_decl_file ; CHECK-NEXT:.b8 142 // DW_AT_decl_line -; CHECK-NEXT:.b32 7249 // DW_AT_import -; CHECK-NEXT:.b8 31 // Abbrev [31] 0xc4c:0x7 DW_TAG_imported_declaration +; CHECK-NEXT:.b32 7321 // DW_AT_import +; CHECK-NEXT:.b8 31 // Abbrev [31] 0xc94:0x7 DW_TAG_imported_declaration ; CHECK-NEXT:.b8 8 // DW_AT_decl_file ; CHECK-NEXT:.b8 143 // DW_AT_decl_line -; CHECK-NEXT:.b32 7276 // DW_AT_import -; 
CHECK-NEXT:.b8 31 // Abbrev [31] 0xc53:0x7 DW_TAG_imported_declaration +; CHECK-NEXT:.b32 7348 // DW_AT_import +; CHECK-NEXT:.b8 31 // Abbrev [31] 0xc9b:0x7 DW_TAG_imported_declaration ; CHECK-NEXT:.b8 8 // DW_AT_decl_file ; CHECK-NEXT:.b8 144 // DW_AT_decl_line -; CHECK-NEXT:.b32 7327 // DW_AT_import -; CHECK-NEXT:.b8 31 // Abbrev [31] 0xc5a:0x7 DW_TAG_imported_declaration +; CHECK-NEXT:.b32 7399 // DW_AT_import +; CHECK-NEXT:.b8 31 // Abbrev [31] 0xca2:0x7 DW_TAG_imported_declaration ; CHECK-NEXT:.b8 8 // DW_AT_decl_file ; CHECK-NEXT:.b8 146 // DW_AT_decl_line -; CHECK-NEXT:.b32 7360 // DW_AT_import -; CHECK-NEXT:.b8 31 // Abbrev [31] 0xc61:0x7 DW_TAG_imported_declaration +; CHECK-NEXT:.b32 7432 // DW_AT_import +; CHECK-NEXT:.b8 31 // Abbrev [31] 0xca9:0x7 DW_TAG_imported_declaration ; CHECK-NEXT:.b8 8 // DW_AT_decl_file ; CHECK-NEXT:.b8 152 // DW_AT_decl_line -; CHECK-NEXT:.b32 7393 // DW_AT_import -; CHECK-NEXT:.b8 31 // Abbrev [31] 0xc68:0x7 DW_TAG_imported_declaration +; CHECK-NEXT:.b32 7465 // DW_AT_import +; CHECK-NEXT:.b8 31 // Abbrev [31] 0xcb0:0x7 DW_TAG_imported_declaration ; CHECK-NEXT:.b8 8 // DW_AT_decl_file ; CHECK-NEXT:.b8 153 // DW_AT_decl_line -; CHECK-NEXT:.b32 7408 // DW_AT_import -; CHECK-NEXT:.b8 31 // Abbrev [31] 0xc6f:0x7 DW_TAG_imported_declaration +; CHECK-NEXT:.b32 7480 // DW_AT_import +; CHECK-NEXT:.b8 31 // Abbrev [31] 0xcb7:0x7 DW_TAG_imported_declaration ; CHECK-NEXT:.b8 8 // DW_AT_decl_file ; CHECK-NEXT:.b8 154 // DW_AT_decl_line -; CHECK-NEXT:.b32 7437 // DW_AT_import -; CHECK-NEXT:.b8 31 // Abbrev [31] 0xc76:0x7 DW_TAG_imported_declaration +; CHECK-NEXT:.b32 7509 // DW_AT_import +; CHECK-NEXT:.b8 31 // Abbrev [31] 0xcbe:0x7 DW_TAG_imported_declaration ; CHECK-NEXT:.b8 8 // DW_AT_decl_file ; CHECK-NEXT:.b8 155 // DW_AT_decl_line -; CHECK-NEXT:.b32 7455 // DW_AT_import -; CHECK-NEXT:.b8 31 // Abbrev [31] 0xc7d:0x7 DW_TAG_imported_declaration +; CHECK-NEXT:.b32 7527 // DW_AT_import +; CHECK-NEXT:.b8 31 // Abbrev [31] 0xcc5:0x7 
DW_TAG_imported_declaration ; CHECK-NEXT:.b8 8 // DW_AT_decl_file ; CHECK-NEXT:.b8 156 // DW_AT_decl_line -; CHECK-NEXT:.b32 7487 // DW_AT_import -; CHECK-NEXT:.b8 31 // Abbrev [31] 0xc84:0x7 DW_TAG_imported_declaration +; CHECK-NEXT:.b32 7559 // DW_AT_import +; CHECK-NEXT:.b8 31 // Abbrev [31] 0xccc:0x7 DW_TAG_imported_declaration ; CHECK-NEXT:.b8 8 // DW_AT_decl_file ; CHECK-NEXT:.b8 157 // DW_AT_decl_line -; CHECK-NEXT:.b32 7519 // DW_AT_import -; CHECK-NEXT:.b8 31 // Abbrev [31] 0xc8b:0x7 DW_TAG_imported_declaration +; CHECK-NEXT:.b32 7591 // DW_AT_import +; CHECK-NEXT:.b8 31 // Abbrev [31] 0xcd3:0x7 DW_TAG_imported_declaration ; CHECK-NEXT:.b8 8 // DW_AT_decl_file ; CHECK-NEXT:.b8 158 // DW_AT_decl_line -; CHECK-NEXT:.b32 7552 // DW_AT_import -; CHECK-NEXT:.b8 31 // Abbrev [31] 0xc92:0x7 DW_TAG_imported_declaration +; CHECK-NEXT:.b32 7624 // DW_AT_import +; CHECK-NEXT:.b8 31 // Abbrev [31] 0xcda:0x7 DW_TAG_imported_declaration ; CHECK-NEXT:.b8 8 // DW_AT_decl_file ; CHECK-NEXT:.b8 160 // DW_AT_decl_line -; CHECK-NEXT:.b32 7575 // DW_AT_import -; CHECK-NEXT:.b8 31 // Abbrev [31] 0xc99:0x7 DW_TAG_imported_declaration +; CHECK-NEXT:.b32 7647 // DW_AT_import +; CHECK-NEXT:.b8 31 // Abbrev [31] 0xce1:0x7 DW_TAG_imported_declaration ; CHECK-NEXT:.b8 8 // DW_AT_decl_file ; CHECK-NEXT:.b8 161 // DW_AT_decl_line -; CHECK-NEXT:.b32 7620 // DW_AT_import -; CHECK-NEXT:.b8 31 // Abbrev [31] 0xca0:0x7 DW_TAG_imported_declaration +; CHECK-NEXT:.b32 7692 // DW_AT_import +; CHECK-NEXT:.b8 31 // Abbrev [31] 0xce8:0x7 DW_TAG_imported_declaration ; CHECK-NEXT:.b8 8 // DW_AT_decl_file ; CHECK-NEXT:.b8 241 // DW_AT_decl_line -; CHECK-NEXT:.b32 7768 // DW_AT_import -; CHECK-NEXT:.b8 31 // Abbrev [31] 0xca7:0x7 DW_TAG_imported_declaration +; CHECK-NEXT:.b32 7840 // DW_AT_import +; CHECK-NEXT:.b8 31 // Abbrev [31] 0xcef:0x7 DW_TAG_imported_declaration ; CHECK-NEXT:.b8 8 // DW_AT_decl_file ; CHECK-NEXT:.b8 243 // DW_AT_decl_line -; CHECK-NEXT:.b32 7817 // DW_AT_import -; CHECK-NEXT:.b8 
31 // Abbrev [31] 0xcae:0x7 DW_TAG_imported_declaration +; CHECK-NEXT:.b32 7889 // DW_AT_import +; CHECK-NEXT:.b8 31 // Abbrev [31] 0xcf6:0x7 DW_TAG_imported_declaration ; CHECK-NEXT:.b8 8 // DW_AT_decl_file ; CHECK-NEXT:.b8 245 // DW_AT_decl_line -; CHECK-NEXT:.b32 7836 // DW_AT_import -; CHECK-NEXT:.b8 31 // Abbrev [31] 0xcb5:0x7 DW_TAG_imported_declaration +; CHECK-NEXT:.b32 7908 // DW_AT_import +; CHECK-NEXT:.b8 31 // Abbrev [31] 0xcfd:0x7 DW_TAG_imported_declaration ; CHECK-NEXT:.b8 8 // DW_AT_decl_file ; CHECK-NEXT:.b8 246 // DW_AT_decl_line -; CHECK-NEXT:.b32 7722 // DW_AT_import -; CHECK-NEXT:.b8 31 // Abbrev [31] 0xcbc:0x7 DW_TAG_imported_declaration +; CHECK-NEXT:.b32 7794 // DW_AT_import +; CHECK-NEXT:.b8 31 // Abbrev [31] 0xd04:0x7 DW_TAG_imported_declaration ; CHECK-NEXT:.b8 8 // DW_AT_decl_file ; CHECK-NEXT:.b8 247 // DW_AT_decl_line -; CHECK-NEXT:.b32 7858 // DW_AT_import -; CHECK-NEXT:.b8 31 // Abbrev [31] 0xcc3:0x7 DW_TAG_imported_declaration +; CHECK-NEXT:.b32 7930 // DW_AT_import +; CHECK-NEXT:.b8 31 // Abbrev [31] 0xd0b:0x7 DW_TAG_imported_declaration ; CHECK-NEXT:.b8 8 // DW_AT_decl_file ; CHECK-NEXT:.b8 249 // DW_AT_decl_line -; CHECK-NEXT:.b32 7885 // DW_AT_import -; CHECK-NEXT:.b8 31 // Abbrev [31] 0xcca:0x7 DW_TAG_imported_declaration +; CHECK-NEXT:.b32 7957 // DW_AT_import +; CHECK-NEXT:.b8 31 // Abbrev [31] 0xd12:0x7 DW_TAG_imported_declaration ; CHECK-NEXT:.b8 8 // DW_AT_decl_file ; CHECK-NEXT:.b8 250 // DW_AT_decl_line -; CHECK-NEXT:.b32 8000 // DW_AT_import -; CHECK-NEXT:.b8 31 // Abbrev [31] 0xcd1:0x7 DW_TAG_imported_declaration +; CHECK-NEXT:.b32 8072 // DW_AT_import +; CHECK-NEXT:.b8 31 // Abbrev [31] 0xd19:0x7 DW_TAG_imported_declaration ; CHECK-NEXT:.b8 8 // DW_AT_decl_file ; CHECK-NEXT:.b8 251 // DW_AT_decl_line -; CHECK-NEXT:.b32 7907 // DW_AT_import -; CHECK-NEXT:.b8 31 // Abbrev [31] 0xcd8:0x7 DW_TAG_imported_declaration +; CHECK-NEXT:.b32 7979 // DW_AT_import +; CHECK-NEXT:.b8 31 // Abbrev [31] 0xd20:0x7 
DW_TAG_imported_declaration ; CHECK-NEXT:.b8 8 // DW_AT_decl_file ; CHECK-NEXT:.b8 252 // DW_AT_decl_line -; CHECK-NEXT:.b32 7940 // DW_AT_import -; CHECK-NEXT:.b8 31 // Abbrev [31] 0xcdf:0x7 DW_TAG_imported_declaration +; CHECK-NEXT:.b32 8012 // DW_AT_import +; CHECK-NEXT:.b8 31 // Abbrev [31] 0xd27:0x7 DW_TAG_imported_declaration ; CHECK-NEXT:.b8 8 // DW_AT_decl_file ; CHECK-NEXT:.b8 253 // DW_AT_decl_line -; CHECK-NEXT:.b32 8027 // DW_AT_import -; CHECK-NEXT:.b8 32 // Abbrev [32] 0xce6:0x8 DW_TAG_imported_declaration +; CHECK-NEXT:.b32 8099 // DW_AT_import +; CHECK-NEXT:.b8 32 // Abbrev [32] 0xd2e:0x8 DW_TAG_imported_declaration ; CHECK-NEXT:.b8 13 // DW_AT_decl_file ; CHECK-NEXT:.b8 149 // DW_AT_decl_line ; CHECK-NEXT:.b8 1 -; CHECK-NEXT:.b32 8070 // DW_AT_import -; CHECK-NEXT:.b8 32 // Abbrev [32] 0xcee:0x8 DW_TAG_imported_declaration +; CHECK-NEXT:.b32 8142 // DW_AT_import +; CHECK-NEXT:.b8 32 // Abbrev [32] 0xd36:0x8 DW_TAG_imported_declaration ; CHECK-NEXT:.b8 13 // DW_AT_decl_file ; CHECK-NEXT:.b8 150 // DW_AT_decl_line ; CHECK-NEXT:.b8 1 -; CHECK-NEXT:.b32 8102 // DW_AT_import -; CHECK-NEXT:.b8 32 // Abbrev [32] 0xcf6:0x8 DW_TAG_imported_declaration +; CHECK-NEXT:.b32 8174 // DW_AT_import +; CHECK-NEXT:.b8 32 // Abbrev [32] 0xd3e:0x8 DW_TAG_imported_declaration ; CHECK-NEXT:.b8 13 // DW_AT_decl_file ; CHECK-NEXT:.b8 151 // DW_AT_decl_line ; CHECK-NEXT:.b8 1 -; CHECK-NEXT:.b32 8136 // DW_AT_import -; CHECK-NEXT:.b8 32 // Abbrev [32] 0xcfe:0x8 DW_TAG_imported_declaration +; CHECK-NEXT:.b32 8208 // DW_AT_import +; CHECK-NEXT:.b8 32 // Abbrev [32] 0xd46:0x8 DW_TAG_imported_declaration ; CHECK-NEXT:.b8 13 // DW_AT_decl_file ; CHECK-NEXT:.b8 152 // DW_AT_decl_line ; CHECK-NEXT:.b8 1 -; CHECK-NEXT:.b32 8168 // DW_AT_import -; CHECK-NEXT:.b8 32 // Abbrev [32] 0xd06:0x8 DW_TAG_imported_declaration +; CHECK-NEXT:.b32 8240 // DW_AT_import +; CHECK-NEXT:.b8 32 // Abbrev [32] 0xd4e:0x8 DW_TAG_imported_declaration ; CHECK-NEXT:.b8 13 // DW_AT_decl_file ; CHECK-NEXT:.b8 
153 // DW_AT_decl_line ; CHECK-NEXT:.b8 1 -; CHECK-NEXT:.b32 8202 // DW_AT_import -; CHECK-NEXT:.b8 32 // Abbrev [32] 0xd0e:0x8 DW_TAG_imported_declaration +; CHECK-NEXT:.b32 8274 // DW_AT_import +; CHECK-NEXT:.b8 32 // Abbrev [32] 0xd56:0x8 DW_TAG_imported_declaration ; CHECK-NEXT:.b8 13 // DW_AT_decl_file ; CHECK-NEXT:.b8 154 // DW_AT_decl_line ; CHECK-NEXT:.b8 1 -; CHECK-NEXT:.b32 8242 // DW_AT_import -; CHECK-NEXT:.b8 32 // Abbrev [32] 0xd16:0x8 DW_TAG_imported_declaration +; CHECK-NEXT:.b32 8314 // DW_AT_import +; CHECK-NEXT:.b8 32 // Abbrev [32] 0xd5e:0x8 DW_TAG_imported_declaration ; CHECK-NEXT:.b8 13 // DW_AT_decl_file ; CHECK-NEXT:.b8 155 // DW_AT_decl_line ; CHECK-NEXT:.b8 1 -; CHECK-NEXT:.b32 8274 // DW_AT_import -; CHECK-NEXT:.b8 32 // Abbrev [32] 0xd1e:0x8 DW_TAG_imported_declaration +; CHECK-NEXT:.b32 8346 // DW_AT_import +; CHECK-NEXT:.b8 32 // Abbrev [32] 0xd66:0x8 DW_TAG_imported_declaration ; CHECK-NEXT:.b8 13 // DW_AT_decl_file ; CHECK-NEXT:.b8 156 // DW_AT_decl_line ; CHECK-NEXT:.b8 1 -; CHECK-NEXT:.b32 8308 // DW_AT_import -; CHECK-NEXT:.b8 32 // Abbrev [32] 0xd26:0x8 DW_TAG_imported_declaration +; CHECK-NEXT:.b32 8380 // DW_AT_import +; CHECK-NEXT:.b8 32 // Abbrev [32] 0xd6e:0x8 DW_TAG_imported_declaration ; CHECK-NEXT:.b8 13 // DW_AT_decl_file ; CHECK-NEXT:.b8 157 // DW_AT_decl_line ; CHECK-NEXT:.b8 1 -; CHECK-NEXT:.b32 8340 // DW_AT_import -; CHECK-NEXT:.b8 32 // Abbrev [32] 0xd2e:0x8 DW_TAG_imported_declaration +; CHECK-NEXT:.b32 8412 // DW_AT_import +; CHECK-NEXT:.b8 32 // Abbrev [32] 0xd76:0x8 DW_TAG_imported_declaration ; CHECK-NEXT:.b8 13 // DW_AT_decl_file ; CHECK-NEXT:.b8 158 // DW_AT_decl_line ; CHECK-NEXT:.b8 1 -; CHECK-NEXT:.b32 8372 // DW_AT_import -; CHECK-NEXT:.b8 32 // Abbrev [32] 0xd36:0x8 DW_TAG_imported_declaration +; CHECK-NEXT:.b32 8444 // DW_AT_import +; CHECK-NEXT:.b8 32 // Abbrev [32] 0xd7e:0x8 DW_TAG_imported_declaration ; CHECK-NEXT:.b8 13 // DW_AT_decl_file ; CHECK-NEXT:.b8 159 // DW_AT_decl_line ; CHECK-NEXT:.b8 1 
-; CHECK-NEXT:.b32 8418 // DW_AT_import -; CHECK-NEXT:.b8 32 // Abbrev [32] 0xd3e:0x8 DW_TAG_imported_declaration +; CHECK-NEXT:.b32 8490 // DW_AT_import +; CHECK-NEXT:.b8 32 // Abbrev [32] 0xd86:0x8 DW_TAG_imported_declaration ; CHECK-NEXT:.b8 13 // DW_AT_decl_file ; CHECK-NEXT:.b8 160 // DW_AT_decl_line ; CHECK-NEXT:.b8 1 -; CHECK-NEXT:.b32 8448 // DW_AT_import -; CHECK-NEXT:.b8 32 // Abbrev [32] 0xd46:0x8 DW_TAG_imported_declaration +; CHECK-NEXT:.b32 8520 // DW_AT_import +; CHECK-NEXT:.b8 32 // Abbrev [32] 0xd8e:0x8 DW_TAG_imported_declaration ; CHECK-NEXT:.b8 13 // DW_AT_decl_file ; CHECK-NEXT:.b8 161 // DW_AT_decl_line ; CHECK-NEXT:.b8 1 -; CHECK-NEXT:.b32 8480 // DW_AT_import -; CHECK-NEXT:.b8 32 // Abbrev [32] 0xd4e:0x8 DW_TAG_imported_declaration +; CHECK-NEXT:.b32 8552 // DW_AT_import +; CHECK-NEXT:.b8 32 // Abbrev [32] 0xd96:0x8 DW_TAG_imported_declaration ; CHECK-NEXT:.b8 13 // DW_AT_decl_file ; CHECK-NEXT:.b8 162 // DW_AT_decl_line ; CHECK-NEXT:.b8 1 -; CHECK-NEXT:.b32 8512 // DW_AT_import -; CHECK-NEXT:.b8 32 // Abbrev [32] 0xd56:0x8 DW_TAG_imported_declaration +; CHECK-NEXT:.b32 8584 // DW_AT_import +; CHECK-NEXT:.b8 32 // Abbrev [32] 0xd9e:0x8 DW_TAG_imported_declaration ; CHECK-NEXT:.b8 13 // DW_AT_decl_file ; CHECK-NEXT:.b8 163 // DW_AT_decl_line ; CHECK-NEXT:.b8 1 -; CHECK-NEXT:.b32 8542 // DW_AT_import -; CHECK-NEXT:.b8 32 // Abbrev [32] 0xd5e:0x8 DW_TAG_imported_declaration +; CHECK-NEXT:.b32 8614 // DW_AT_import +; CHECK-NEXT:.b8 32 // Abbrev [32] 0xda6:0x8 DW_TAG_imported_declaration ; CHECK-NEXT:.b8 13 // DW_AT_decl_file ; CHECK-NEXT:.b8 164 // DW_AT_decl_line ; CHECK-NEXT:.b8 1 -; CHECK-NEXT:.b32 8574 // DW_AT_import -; CHECK-NEXT:.b8 32 // Abbrev [32] 0xd66:0x8 DW_TAG_imported_declaration +; CHECK-NEXT:.b32 8646 // DW_AT_import +; CHECK-NEXT:.b8 32 // Abbrev [32] 0xdae:0x8 DW_TAG_imported_declaration ; CHECK-NEXT:.b8 13 // DW_AT_decl_file ; CHECK-NEXT:.b8 165 // DW_AT_decl_line ; CHECK-NEXT:.b8 1 -; CHECK-NEXT:.b32 8604 // DW_AT_import -; 
CHECK-NEXT:.b8 32 // Abbrev [32] 0xd6e:0x8 DW_TAG_imported_declaration +; CHECK-NEXT:.b32 8676 // DW_AT_import +; CHECK-NEXT:.b8 32 // Abbrev [32] 0xdb6:0x8 DW_TAG_imported_declaration ; CHECK-NEXT:.b8 13 // DW_AT_decl_file ; CHECK-NEXT:.b8 166 // DW_AT_decl_line ; CHECK-NEXT:.b8 1 -; CHECK-NEXT:.b32 8638 // DW_AT_import -; CHECK-NEXT:.b8 32 // Abbrev [32] 0xd76:0x8 DW_TAG_imported_declaration +; CHECK-NEXT:.b32 8710 // DW_AT_import +; CHECK-NEXT:.b8 32 // Abbrev [32] 0xdbe:0x8 DW_TAG_imported_declaration ; CHECK-NEXT:.b8 13 // DW_AT_decl_file ; CHECK-NEXT:.b8 167 // DW_AT_decl_line ; CHECK-NEXT:.b8 1 -; CHECK-NEXT:.b32 8670 // DW_AT_import -; CHECK-NEXT:.b8 32 // Abbrev [32] 0xd7e:0x8 DW_TAG_imported_declaration +; CHECK-NEXT:.b32 8742 // DW_AT_import +; CHECK-NEXT:.b8 32 // Abbrev [32] 0xdc6:0x8 DW_TAG_imported_declaration ; CHECK-NEXT:.b8 13 // DW_AT_decl_file ; CHECK-NEXT:.b8 168 // DW_AT_decl_line ; CHECK-NEXT:.b8 1 -; CHECK-NEXT:.b32 8708 // DW_AT_import -; CHECK-NEXT:.b8 32 // Abbrev [32] 0xd86:0x8 DW_TAG_imported_declaration +; CHECK-NEXT:.b32 8780 // DW_AT_import +; CHECK-NEXT:.b8 32 // Abbrev [32] 0xdce:0x8 DW_TAG_imported_declaration ; CHECK-NEXT:.b8 13 // DW_AT_decl_file ; CHECK-NEXT:.b8 169 // DW_AT_decl_line ; CHECK-NEXT:.b8 1 -; CHECK-NEXT:.b32 8742 // DW_AT_import -; CHECK-NEXT:.b8 32 // Abbrev [32] 0xd8e:0x8 DW_TAG_imported_declaration +; CHECK-NEXT:.b32 8814 // DW_AT_import +; CHECK-NEXT:.b8 32 // Abbrev [32] 0xdd6:0x8 DW_TAG_imported_declaration ; CHECK-NEXT:.b8 13 // DW_AT_decl_file ; CHECK-NEXT:.b8 170 // DW_AT_decl_line ; CHECK-NEXT:.b8 1 -; CHECK-NEXT:.b32 8784 // DW_AT_import -; CHECK-NEXT:.b8 32 // Abbrev [32] 0xd96:0x8 DW_TAG_imported_declaration +; CHECK-NEXT:.b32 8856 // DW_AT_import +; CHECK-NEXT:.b8 32 // Abbrev [32] 0xdde:0x8 DW_TAG_imported_declaration ; CHECK-NEXT:.b8 13 // DW_AT_decl_file ; CHECK-NEXT:.b8 171 // DW_AT_decl_line ; CHECK-NEXT:.b8 1 -; CHECK-NEXT:.b32 8822 // DW_AT_import -; CHECK-NEXT:.b8 32 // Abbrev [32] 0xd9e:0x8 
DW_TAG_imported_declaration +; CHECK-NEXT:.b32 8894 // DW_AT_import +; CHECK-NEXT:.b8 32 // Abbrev [32] 0xde6:0x8 DW_TAG_imported_declaration ; CHECK-NEXT:.b8 13 // DW_AT_decl_file ; CHECK-NEXT:.b8 172 // DW_AT_decl_line ; CHECK-NEXT:.b8 1 -; CHECK-NEXT:.b32 8860 // DW_AT_import -; CHECK-NEXT:.b8 32 // Abbrev [32] 0xda6:0x8 DW_TAG_imported_declaration +; CHECK-NEXT:.b32 8932 // DW_AT_import +; CHECK-NEXT:.b8 32 // Abbrev [32] 0xdee:0x8 DW_TAG_imported_declaration ; CHECK-NEXT:.b8 13 // DW_AT_decl_file ; CHECK-NEXT:.b8 173 // DW_AT_decl_line ; CHECK-NEXT:.b8 1 -; CHECK-NEXT:.b32 8898 // DW_AT_import -; CHECK-NEXT:.b8 32 // Abbrev [32] 0xdae:0x8 DW_TAG_imported_declaration +; CHECK-NEXT:.b32 8970 // DW_AT_import +; CHECK-NEXT:.b8 32 // Abbrev [32] 0xdf6:0x8 DW_TAG_imported_declaration ; CHECK-NEXT:.b8 13 // DW_AT_decl_file ; CHECK-NEXT:.b8 174 // DW_AT_decl_line ; CHECK-NEXT:.b8 1 -; CHECK-NEXT:.b32 8939 // DW_AT_import -; CHECK-NEXT:.b8 32 // Abbrev [32] 0xdb6:0x8 DW_TAG_imported_declaration +; CHECK-NEXT:.b32 9011 // DW_AT_import +; CHECK-NEXT:.b8 32 // Abbrev [32] 0xdfe:0x8 DW_TAG_imported_declaration ; CHECK-NEXT:.b8 13 // DW_AT_decl_file ; CHECK-NEXT:.b8 175 // DW_AT_decl_line ; CHECK-NEXT:.b8 1 -; CHECK-NEXT:.b32 8979 // DW_AT_import -; CHECK-NEXT:.b8 32 // Abbrev [32] 0xdbe:0x8 DW_TAG_imported_declaration +; CHECK-NEXT:.b32 9051 // DW_AT_import +; CHECK-NEXT:.b8 32 // Abbrev [32] 0xe06:0x8 DW_TAG_imported_declaration ; CHECK-NEXT:.b8 13 // DW_AT_decl_file ; CHECK-NEXT:.b8 176 // DW_AT_decl_line ; CHECK-NEXT:.b8 1 -; CHECK-NEXT:.b32 9013 // DW_AT_import -; CHECK-NEXT:.b8 32 // Abbrev [32] 0xdc6:0x8 DW_TAG_imported_declaration +; CHECK-NEXT:.b32 9085 // DW_AT_import +; CHECK-NEXT:.b8 32 // Abbrev [32] 0xe0e:0x8 DW_TAG_imported_declaration ; CHECK-NEXT:.b8 13 // DW_AT_decl_file ; CHECK-NEXT:.b8 177 // DW_AT_decl_line ; CHECK-NEXT:.b8 1 -; CHECK-NEXT:.b32 9053 // DW_AT_import -; CHECK-NEXT:.b8 32 // Abbrev [32] 0xdce:0x8 DW_TAG_imported_declaration +; 
CHECK-NEXT:.b32 9125 // DW_AT_import +; CHECK-NEXT:.b8 32 // Abbrev [32] 0xe16:0x8 DW_TAG_imported_declaration ; CHECK-NEXT:.b8 13 // DW_AT_decl_file ; CHECK-NEXT:.b8 178 // DW_AT_decl_line ; CHECK-NEXT:.b8 1 -; CHECK-NEXT:.b32 9089 // DW_AT_import -; CHECK-NEXT:.b8 32 // Abbrev [32] 0xdd6:0x8 DW_TAG_imported_declaration +; CHECK-NEXT:.b32 9161 // DW_AT_import +; CHECK-NEXT:.b8 32 // Abbrev [32] 0xe1e:0x8 DW_TAG_imported_declaration ; CHECK-NEXT:.b8 13 // DW_AT_decl_file ; CHECK-NEXT:.b8 179 // DW_AT_decl_line ; CHECK-NEXT:.b8 1 -; CHECK-NEXT:.b32 9125 // DW_AT_import -; CHECK-NEXT:.b8 32 // Abbrev [32] 0xdde:0x8 DW_TAG_imported_declaration +; CHECK-NEXT:.b32 9197 // DW_AT_import +; CHECK-NEXT:.b8 32 // Abbrev [32] 0xe26:0x8 DW_TAG_imported_declaration ; CHECK-NEXT:.b8 13 // DW_AT_decl_file ; CHECK-NEXT:.b8 180 // DW_AT_decl_line ; CHECK-NEXT:.b8 1 -; CHECK-NEXT:.b32 9163 // DW_AT_import -; CHECK-NEXT:.b8 32 // Abbrev [32] 0xde6:0x8 DW_TAG_imported_declaration +; CHECK-NEXT:.b32 9235 // DW_AT_import +; CHECK-NEXT:.b8 32 // Abbrev [32] 0xe2e:0x8 DW_TAG_imported_declaration ; CHECK-NEXT:.b8 13 // DW_AT_decl_file ; CHECK-NEXT:.b8 181 // DW_AT_decl_line ; CHECK-NEXT:.b8 1 -; CHECK-NEXT:.b32 9197 // DW_AT_import -; CHECK-NEXT:.b8 32 // Abbrev [32] 0xdee:0x8 DW_TAG_imported_declaration +; CHECK-NEXT:.b32 9269 // DW_AT_import +; CHECK-NEXT:.b8 32 // Abbrev [32] 0xe36:0x8 DW_TAG_imported_declaration ; CHECK-NEXT:.b8 13 // DW_AT_decl_file ; CHECK-NEXT:.b8 182 // DW_AT_decl_line ; CHECK-NEXT:.b8 1 -; CHECK-NEXT:.b32 9231 // DW_AT_import -; CHECK-NEXT:.b8 32 // Abbrev [32] 0xdf6:0x8 DW_TAG_imported_declaration +; CHECK-NEXT:.b32 9303 // DW_AT_import +; CHECK-NEXT:.b8 32 // Abbrev [32] 0xe3e:0x8 DW_TAG_imported_declaration ; CHECK-NEXT:.b8 13 // DW_AT_decl_file ; CHECK-NEXT:.b8 183 // DW_AT_decl_line ; CHECK-NEXT:.b8 1 -; CHECK-NEXT:.b32 9263 // DW_AT_import -; CHECK-NEXT:.b8 32 // Abbrev [32] 0xdfe:0x8 DW_TAG_imported_declaration +; CHECK-NEXT:.b32 9335 // DW_AT_import +; 
CHECK-NEXT:.b8 32 // Abbrev [32] 0xe46:0x8 DW_TAG_imported_declaration ; CHECK-NEXT:.b8 13 // DW_AT_decl_file ; CHECK-NEXT:.b8 184 // DW_AT_decl_line ; CHECK-NEXT:.b8 1 -; CHECK-NEXT:.b32 9295 // DW_AT_import -; CHECK-NEXT:.b8 32 // Abbrev [32] 0xe06:0x8 DW_TAG_imported_declaration +; CHECK-NEXT:.b32 9367 // DW_AT_import +; CHECK-NEXT:.b8 32 // Abbrev [32] 0xe4e:0x8 DW_TAG_imported_declaration ; CHECK-NEXT:.b8 13 // DW_AT_decl_file ; CHECK-NEXT:.b8 185 // DW_AT_decl_line ; CHECK-NEXT:.b8 1 -; CHECK-NEXT:.b32 9325 // DW_AT_import -; CHECK-NEXT:.b8 32 // Abbrev [32] 0xe0e:0x8 DW_TAG_imported_declaration +; CHECK-NEXT:.b32 9397 // DW_AT_import +; CHECK-NEXT:.b8 32 // Abbrev [32] 0xe56:0x8 DW_TAG_imported_declaration ; CHECK-NEXT:.b8 13 // DW_AT_decl_file ; CHECK-NEXT:.b8 186 // DW_AT_decl_line ; CHECK-NEXT:.b8 1 -; CHECK-NEXT:.b32 9359 // DW_AT_import -; CHECK-NEXT:.b8 32 // Abbrev [32] 0xe16:0x8 DW_TAG_imported_declaration +; CHECK-NEXT:.b32 9431 // DW_AT_import +; CHECK-NEXT:.b8 32 // Abbrev [32] 0xe5e:0x8 DW_TAG_imported_declaration ; CHECK-NEXT:.b8 13 // DW_AT_decl_file ; CHECK-NEXT:.b8 187 // DW_AT_decl_line ; CHECK-NEXT:.b8 1 -; CHECK-NEXT:.b32 9395 // DW_AT_import -; CHECK-NEXT:.b8 32 // Abbrev [32] 0xe1e:0x8 DW_TAG_imported_declaration +; CHECK-NEXT:.b32 9467 // DW_AT_import +; CHECK-NEXT:.b8 32 // Abbrev [32] 0xe66:0x8 DW_TAG_imported_declaration ; CHECK-NEXT:.b8 13 // DW_AT_decl_file ; CHECK-NEXT:.b8 188 // DW_AT_decl_line ; CHECK-NEXT:.b8 1 -; CHECK-NEXT:.b32 9434 // DW_AT_import -; CHECK-NEXT:.b8 32 // Abbrev [32] 0xe26:0x8 DW_TAG_imported_declaration +; CHECK-NEXT:.b32 9506 // DW_AT_import +; CHECK-NEXT:.b8 32 // Abbrev [32] 0xe6e:0x8 DW_TAG_imported_declaration ; CHECK-NEXT:.b8 13 // DW_AT_decl_file ; CHECK-NEXT:.b8 189 // DW_AT_decl_line ; CHECK-NEXT:.b8 1 -; CHECK-NEXT:.b32 9477 // DW_AT_import -; CHECK-NEXT:.b8 32 // Abbrev [32] 0xe2e:0x8 DW_TAG_imported_declaration +; CHECK-NEXT:.b32 9549 // DW_AT_import +; CHECK-NEXT:.b8 32 // Abbrev [32] 0xe76:0x8 
DW_TAG_imported_declaration ; CHECK-NEXT:.b8 13 // DW_AT_decl_file ; CHECK-NEXT:.b8 190 // DW_AT_decl_line ; CHECK-NEXT:.b8 1 -; CHECK-NEXT:.b32 9526 // DW_AT_import -; CHECK-NEXT:.b8 32 // Abbrev [32] 0xe36:0x8 DW_TAG_imported_declaration +; CHECK-NEXT:.b32 9598 // DW_AT_import +; CHECK-NEXT:.b8 32 // Abbrev [32] 0xe7e:0x8 DW_TAG_imported_declaration ; CHECK-NEXT:.b8 13 // DW_AT_decl_file ; CHECK-NEXT:.b8 191 // DW_AT_decl_line ; CHECK-NEXT:.b8 1 -; CHECK-NEXT:.b32 9562 // DW_AT_import -; CHECK-NEXT:.b8 32 // Abbrev [32] 0xe3e:0x8 DW_TAG_imported_declaration +; CHECK-NEXT:.b32 9634 // DW_AT_import +; CHECK-NEXT:.b8 32 // Abbrev [32] 0xe86:0x8 DW_TAG_imported_declaration ; CHECK-NEXT:.b8 13 // DW_AT_decl_file ; CHECK-NEXT:.b8 192 // DW_AT_decl_line ; CHECK-NEXT:.b8 1 -; CHECK-NEXT:.b32 9611 // DW_AT_import -; CHECK-NEXT:.b8 32 // Abbrev [32] 0xe46:0x8 DW_TAG_imported_declaration +; CHECK-NEXT:.b32 9683 // DW_AT_import +; CHECK-NEXT:.b8 32 // Abbrev [32] 0xe8e:0x8 DW_TAG_imported_declaration ; CHECK-NEXT:.b8 13 // DW_AT_decl_file ; CHECK-NEXT:.b8 193 // DW_AT_decl_line ; CHECK-NEXT:.b8 1 -; CHECK-NEXT:.b32 9660 // DW_AT_import -; CHECK-NEXT:.b8 32 // Abbrev [32] 0xe4e:0x8 DW_TAG_imported_declaration +; CHECK-NEXT:.b32 9732 // DW_AT_import +; CHECK-NEXT:.b8 32 // Abbrev [32] 0xe96:0x8 DW_TAG_imported_declaration ; CHECK-NEXT:.b8 13 // DW_AT_decl_file ; CHECK-NEXT:.b8 194 // DW_AT_decl_line ; CHECK-NEXT:.b8 1 -; CHECK-NEXT:.b32 9692 // DW_AT_import -; CHECK-NEXT:.b8 32 // Abbrev [32] 0xe56:0x8 DW_TAG_imported_declaration +; CHECK-NEXT:.b32 9764 // DW_AT_import +; CHECK-NEXT:.b8 32 // Abbrev [32] 0xe9e:0x8 DW_TAG_imported_declaration ; CHECK-NEXT:.b8 13 // DW_AT_decl_file ; CHECK-NEXT:.b8 195 // DW_AT_decl_line ; CHECK-NEXT:.b8 1 -; CHECK-NEXT:.b32 9726 // DW_AT_import -; CHECK-NEXT:.b8 32 // Abbrev [32] 0xe5e:0x8 DW_TAG_imported_declaration +; CHECK-NEXT:.b32 9798 // DW_AT_import +; CHECK-NEXT:.b8 32 // Abbrev [32] 0xea6:0x8 DW_TAG_imported_declaration ; 
CHECK-NEXT:.b8 13 // DW_AT_decl_file ; CHECK-NEXT:.b8 196 // DW_AT_decl_line ; CHECK-NEXT:.b8 1 -; CHECK-NEXT:.b32 9770 // DW_AT_import -; CHECK-NEXT:.b8 32 // Abbrev [32] 0xe66:0x8 DW_TAG_imported_declaration +; CHECK-NEXT:.b32 9842 // DW_AT_import +; CHECK-NEXT:.b8 32 // Abbrev [32] 0xeae:0x8 DW_TAG_imported_declaration ; CHECK-NEXT:.b8 13 // DW_AT_decl_file ; CHECK-NEXT:.b8 197 // DW_AT_decl_line ; CHECK-NEXT:.b8 1 -; CHECK-NEXT:.b32 9812 // DW_AT_import -; CHECK-NEXT:.b8 32 // Abbrev [32] 0xe6e:0x8 DW_TAG_imported_declaration +; CHECK-NEXT:.b32 9884 // DW_AT_import +; CHECK-NEXT:.b8 32 // Abbrev [32] 0xeb6:0x8 DW_TAG_imported_declaration ; CHECK-NEXT:.b8 13 // DW_AT_decl_file ; CHECK-NEXT:.b8 198 // DW_AT_decl_line ; CHECK-NEXT:.b8 1 -; CHECK-NEXT:.b32 9842 // DW_AT_import -; CHECK-NEXT:.b8 32 // Abbrev [32] 0xe76:0x8 DW_TAG_imported_declaration +; CHECK-NEXT:.b32 9914 // DW_AT_import +; CHECK-NEXT:.b8 32 // Abbrev [32] 0xebe:0x8 DW_TAG_imported_declaration ; CHECK-NEXT:.b8 13 // DW_AT_decl_file ; CHECK-NEXT:.b8 199 // DW_AT_decl_line ; CHECK-NEXT:.b8 1 -; CHECK-NEXT:.b32 9874 // DW_AT_import -; CHECK-NEXT:.b8 32 // Abbrev [32] 0xe7e:0x8 DW_TAG_imported_declaration +; CHECK-NEXT:.b32 9946 // DW_AT_import +; CHECK-NEXT:.b8 32 // Abbrev [32] 0xec6:0x8 DW_TAG_imported_declaration ; CHECK-NEXT:.b8 13 // DW_AT_decl_file ; CHECK-NEXT:.b8 200 // DW_AT_decl_line ; CHECK-NEXT:.b8 1 -; CHECK-NEXT:.b32 9906 // DW_AT_import -; CHECK-NEXT:.b8 32 // Abbrev [32] 0xe86:0x8 DW_TAG_imported_declaration +; CHECK-NEXT:.b32 9978 // DW_AT_import +; CHECK-NEXT:.b8 32 // Abbrev [32] 0xece:0x8 DW_TAG_imported_declaration ; CHECK-NEXT:.b8 13 // DW_AT_decl_file ; CHECK-NEXT:.b8 201 // DW_AT_decl_line ; CHECK-NEXT:.b8 1 -; CHECK-NEXT:.b32 9936 // DW_AT_import -; CHECK-NEXT:.b8 32 // Abbrev [32] 0xe8e:0x8 DW_TAG_imported_declaration +; CHECK-NEXT:.b32 10008 // DW_AT_import +; CHECK-NEXT:.b8 32 // Abbrev [32] 0xed6:0x8 DW_TAG_imported_declaration ; CHECK-NEXT:.b8 13 // DW_AT_decl_file ; 
CHECK-NEXT:.b8 202 // DW_AT_decl_line ; CHECK-NEXT:.b8 1 -; CHECK-NEXT:.b32 9968 // DW_AT_import -; CHECK-NEXT:.b8 32 // Abbrev [32] 0xe96:0x8 DW_TAG_imported_declaration +; CHECK-NEXT:.b32 10040 // DW_AT_import +; CHECK-NEXT:.b8 32 // Abbrev [32] 0xede:0x8 DW_TAG_imported_declaration ; CHECK-NEXT:.b8 13 // DW_AT_decl_file ; CHECK-NEXT:.b8 203 // DW_AT_decl_line ; CHECK-NEXT:.b8 1 -; CHECK-NEXT:.b32 10004 // DW_AT_import +; CHECK-NEXT:.b32 10076 // DW_AT_import ; CHECK-NEXT:.b8 0 // End Of Children Mark -; CHECK-NEXT:.b8 33 // Abbrev [33] 0xe9f:0x1b DW_TAG_subprogram +; CHECK-NEXT:.b8 33 // Abbrev [33] 0xee7:0x1b DW_TAG_subprogram ; CHECK-NEXT:.b8 95 // DW_AT_MIPS_linkage_name ; CHECK-NEXT:.b8 90 ; CHECK-NEXT:.b8 76 @@ -3631,12 +3696,12 @@ if.end: ; preds = %if.then, %entry ; CHECK-NEXT:.b8 0 ; CHECK-NEXT:.b8 4 // DW_AT_decl_file ; CHECK-NEXT:.b8 44 // DW_AT_decl_line -; CHECK-NEXT:.b32 3770 // DW_AT_type +; CHECK-NEXT:.b32 3842 // DW_AT_type ; CHECK-NEXT:.b8 1 // DW_AT_declaration -; CHECK-NEXT:.b8 7 // Abbrev [7] 0xeb4:0x5 DW_TAG_formal_parameter -; CHECK-NEXT:.b32 3770 // DW_AT_type +; CHECK-NEXT:.b8 7 // Abbrev [7] 0xefc:0x5 DW_TAG_formal_parameter +; CHECK-NEXT:.b32 3842 // DW_AT_type ; CHECK-NEXT:.b8 0 // End Of Children Mark -; CHECK-NEXT:.b8 10 // Abbrev [10] 0xeba:0x11 DW_TAG_base_type +; CHECK-NEXT:.b8 10 // Abbrev [10] 0xf02:0x11 DW_TAG_base_type ; CHECK-NEXT:.b8 108 // DW_AT_name ; CHECK-NEXT:.b8 111 ; CHECK-NEXT:.b8 110 @@ -3653,7 +3718,7 @@ if.end: ; preds = %if.then, %entry ; CHECK-NEXT:.b8 0 ; CHECK-NEXT:.b8 5 // DW_AT_encoding ; CHECK-NEXT:.b8 8 // DW_AT_byte_size -; CHECK-NEXT:.b8 33 // Abbrev [33] 0xecb:0x1d DW_TAG_subprogram +; CHECK-NEXT:.b8 33 // Abbrev [33] 0xf13:0x1d DW_TAG_subprogram ; CHECK-NEXT:.b8 95 // DW_AT_MIPS_linkage_name ; CHECK-NEXT:.b8 90 ; CHECK-NEXT:.b8 76 @@ -3671,12 +3736,12 @@ if.end: ; preds = %if.then, %entry ; CHECK-NEXT:.b8 0 ; CHECK-NEXT:.b8 4 // DW_AT_decl_file ; CHECK-NEXT:.b8 46 // DW_AT_decl_line -; CHECK-NEXT:.b32 
2100 // DW_AT_type +; CHECK-NEXT:.b32 2172 // DW_AT_type ; CHECK-NEXT:.b8 1 // DW_AT_declaration -; CHECK-NEXT:.b8 7 // Abbrev [7] 0xee2:0x5 DW_TAG_formal_parameter -; CHECK-NEXT:.b32 2100 // DW_AT_type +; CHECK-NEXT:.b8 7 // Abbrev [7] 0xf2a:0x5 DW_TAG_formal_parameter +; CHECK-NEXT:.b32 2172 // DW_AT_type ; CHECK-NEXT:.b8 0 // End Of Children Mark -; CHECK-NEXT:.b8 33 // Abbrev [33] 0xee8:0x1f DW_TAG_subprogram +; CHECK-NEXT:.b8 33 // Abbrev [33] 0xf30:0x1f DW_TAG_subprogram ; CHECK-NEXT:.b8 95 // DW_AT_MIPS_linkage_name ; CHECK-NEXT:.b8 90 ; CHECK-NEXT:.b8 76 @@ -3696,12 +3761,12 @@ if.end: ; preds = %if.then, %entry ; CHECK-NEXT:.b8 0 ; CHECK-NEXT:.b8 4 // DW_AT_decl_file ; CHECK-NEXT:.b8 48 // DW_AT_decl_line -; CHECK-NEXT:.b32 2100 // DW_AT_type +; CHECK-NEXT:.b32 2172 // DW_AT_type ; CHECK-NEXT:.b8 1 // DW_AT_declaration -; CHECK-NEXT:.b8 7 // Abbrev [7] 0xf01:0x5 DW_TAG_formal_parameter -; CHECK-NEXT:.b32 2100 // DW_AT_type +; CHECK-NEXT:.b8 7 // Abbrev [7] 0xf49:0x5 DW_TAG_formal_parameter +; CHECK-NEXT:.b32 2172 // DW_AT_type ; CHECK-NEXT:.b8 0 // End Of Children Mark -; CHECK-NEXT:.b8 33 // Abbrev [33] 0xf07:0x1d DW_TAG_subprogram +; CHECK-NEXT:.b8 33 // Abbrev [33] 0xf4f:0x1d DW_TAG_subprogram ; CHECK-NEXT:.b8 95 // DW_AT_MIPS_linkage_name ; CHECK-NEXT:.b8 90 ; CHECK-NEXT:.b8 76 @@ -3719,12 +3784,12 @@ if.end: ; preds = %if.then, %entry ; CHECK-NEXT:.b8 0 ; CHECK-NEXT:.b8 4 // DW_AT_decl_file ; CHECK-NEXT:.b8 50 // DW_AT_decl_line -; CHECK-NEXT:.b32 2100 // DW_AT_type +; CHECK-NEXT:.b32 2172 // DW_AT_type ; CHECK-NEXT:.b8 1 // DW_AT_declaration -; CHECK-NEXT:.b8 7 // Abbrev [7] 0xf1e:0x5 DW_TAG_formal_parameter -; CHECK-NEXT:.b32 2100 // DW_AT_type +; CHECK-NEXT:.b8 7 // Abbrev [7] 0xf66:0x5 DW_TAG_formal_parameter +; CHECK-NEXT:.b32 2172 // DW_AT_type ; CHECK-NEXT:.b8 0 // End Of Children Mark -; CHECK-NEXT:.b8 33 // Abbrev [33] 0xf24:0x1f DW_TAG_subprogram +; CHECK-NEXT:.b8 33 // Abbrev [33] 0xf6c:0x1f DW_TAG_subprogram ; CHECK-NEXT:.b8 95 // 
DW_AT_MIPS_linkage_name ; CHECK-NEXT:.b8 90 ; CHECK-NEXT:.b8 76 @@ -3744,12 +3809,12 @@ if.end: ; preds = %if.then, %entry ; CHECK-NEXT:.b8 0 ; CHECK-NEXT:.b8 4 // DW_AT_decl_file ; CHECK-NEXT:.b8 52 // DW_AT_decl_line -; CHECK-NEXT:.b32 2100 // DW_AT_type +; CHECK-NEXT:.b32 2172 // DW_AT_type ; CHECK-NEXT:.b8 1 // DW_AT_declaration -; CHECK-NEXT:.b8 7 // Abbrev [7] 0xf3d:0x5 DW_TAG_formal_parameter -; CHECK-NEXT:.b32 2100 // DW_AT_type +; CHECK-NEXT:.b8 7 // Abbrev [7] 0xf85:0x5 DW_TAG_formal_parameter +; CHECK-NEXT:.b32 2172 // DW_AT_type ; CHECK-NEXT:.b8 0 // End Of Children Mark -; CHECK-NEXT:.b8 33 // Abbrev [33] 0xf43:0x1d DW_TAG_subprogram +; CHECK-NEXT:.b8 33 // Abbrev [33] 0xf8b:0x1d DW_TAG_subprogram ; CHECK-NEXT:.b8 95 // DW_AT_MIPS_linkage_name ; CHECK-NEXT:.b8 90 ; CHECK-NEXT:.b8 76 @@ -3767,12 +3832,12 @@ if.end: ; preds = %if.then, %entry ; CHECK-NEXT:.b8 0 ; CHECK-NEXT:.b8 4 // DW_AT_decl_file ; CHECK-NEXT:.b8 56 // DW_AT_decl_line -; CHECK-NEXT:.b32 2100 // DW_AT_type +; CHECK-NEXT:.b32 2172 // DW_AT_type ; CHECK-NEXT:.b8 1 // DW_AT_declaration -; CHECK-NEXT:.b8 7 // Abbrev [7] 0xf5a:0x5 DW_TAG_formal_parameter -; CHECK-NEXT:.b32 2100 // DW_AT_type +; CHECK-NEXT:.b8 7 // Abbrev [7] 0xfa2:0x5 DW_TAG_formal_parameter +; CHECK-NEXT:.b32 2172 // DW_AT_type ; CHECK-NEXT:.b8 0 // End Of Children Mark -; CHECK-NEXT:.b8 33 // Abbrev [33] 0xf60:0x25 DW_TAG_subprogram +; CHECK-NEXT:.b8 33 // Abbrev [33] 0xfa8:0x25 DW_TAG_subprogram ; CHECK-NEXT:.b8 95 // DW_AT_MIPS_linkage_name ; CHECK-NEXT:.b8 90 ; CHECK-NEXT:.b8 76 @@ -3793,14 +3858,14 @@ if.end: ; preds = %if.then, %entry ; CHECK-NEXT:.b8 0 ; CHECK-NEXT:.b8 4 // DW_AT_decl_file ; CHECK-NEXT:.b8 54 // DW_AT_decl_line -; CHECK-NEXT:.b32 2100 // DW_AT_type +; CHECK-NEXT:.b32 2172 // DW_AT_type ; CHECK-NEXT:.b8 1 // DW_AT_declaration -; CHECK-NEXT:.b8 7 // Abbrev [7] 0xf7a:0x5 DW_TAG_formal_parameter -; CHECK-NEXT:.b32 2100 // DW_AT_type -; CHECK-NEXT:.b8 7 // Abbrev [7] 0xf7f:0x5 DW_TAG_formal_parameter -; 
CHECK-NEXT:.b32 2100 // DW_AT_type +; CHECK-NEXT:.b8 7 // Abbrev [7] 0xfc2:0x5 DW_TAG_formal_parameter +; CHECK-NEXT:.b32 2172 // DW_AT_type +; CHECK-NEXT:.b8 7 // Abbrev [7] 0xfc7:0x5 DW_TAG_formal_parameter +; CHECK-NEXT:.b32 2172 // DW_AT_type ; CHECK-NEXT:.b8 0 // End Of Children Mark -; CHECK-NEXT:.b8 33 // Abbrev [33] 0xf85:0x1f DW_TAG_subprogram +; CHECK-NEXT:.b8 33 // Abbrev [33] 0xfcd:0x1f DW_TAG_subprogram ; CHECK-NEXT:.b8 95 // DW_AT_MIPS_linkage_name ; CHECK-NEXT:.b8 90 ; CHECK-NEXT:.b8 76 @@ -3820,12 +3885,12 @@ if.end: ; preds = %if.then, %entry ; CHECK-NEXT:.b8 0 ; CHECK-NEXT:.b8 4 // DW_AT_decl_file ; CHECK-NEXT:.b8 58 // DW_AT_decl_line -; CHECK-NEXT:.b32 2100 // DW_AT_type +; CHECK-NEXT:.b32 2172 // DW_AT_type ; CHECK-NEXT:.b8 1 // DW_AT_declaration -; CHECK-NEXT:.b8 7 // Abbrev [7] 0xf9e:0x5 DW_TAG_formal_parameter -; CHECK-NEXT:.b32 2100 // DW_AT_type +; CHECK-NEXT:.b8 7 // Abbrev [7] 0xfe6:0x5 DW_TAG_formal_parameter +; CHECK-NEXT:.b32 2172 // DW_AT_type ; CHECK-NEXT:.b8 0 // End Of Children Mark -; CHECK-NEXT:.b8 33 // Abbrev [33] 0xfa4:0x1d DW_TAG_subprogram +; CHECK-NEXT:.b8 33 // Abbrev [33] 0xfec:0x1d DW_TAG_subprogram ; CHECK-NEXT:.b8 95 // DW_AT_MIPS_linkage_name ; CHECK-NEXT:.b8 90 ; CHECK-NEXT:.b8 76 @@ -3843,12 +3908,12 @@ if.end: ; preds = %if.then, %entry ; CHECK-NEXT:.b8 0 ; CHECK-NEXT:.b8 4 // DW_AT_decl_file ; CHECK-NEXT:.b8 60 // DW_AT_decl_line -; CHECK-NEXT:.b32 2100 // DW_AT_type +; CHECK-NEXT:.b32 2172 // DW_AT_type ; CHECK-NEXT:.b8 1 // DW_AT_declaration -; CHECK-NEXT:.b8 7 // Abbrev [7] 0xfbb:0x5 DW_TAG_formal_parameter -; CHECK-NEXT:.b32 2100 // DW_AT_type +; CHECK-NEXT:.b8 7 // Abbrev [7] 0x1003:0x5 DW_TAG_formal_parameter +; CHECK-NEXT:.b32 2172 // DW_AT_type ; CHECK-NEXT:.b8 0 // End Of Children Mark -; CHECK-NEXT:.b8 33 // Abbrev [33] 0xfc1:0x1d DW_TAG_subprogram +; CHECK-NEXT:.b8 33 // Abbrev [33] 0x1009:0x1d DW_TAG_subprogram ; CHECK-NEXT:.b8 95 // DW_AT_MIPS_linkage_name ; CHECK-NEXT:.b8 90 ; CHECK-NEXT:.b8 76 @@ 
-3866,12 +3931,12 @@ if.end: ; preds = %if.then, %entry ; CHECK-NEXT:.b8 0 ; CHECK-NEXT:.b8 4 // DW_AT_decl_file ; CHECK-NEXT:.b8 62 // DW_AT_decl_line -; CHECK-NEXT:.b32 2100 // DW_AT_type +; CHECK-NEXT:.b32 2172 // DW_AT_type ; CHECK-NEXT:.b8 1 // DW_AT_declaration -; CHECK-NEXT:.b8 7 // Abbrev [7] 0xfd8:0x5 DW_TAG_formal_parameter -; CHECK-NEXT:.b32 2100 // DW_AT_type +; CHECK-NEXT:.b8 7 // Abbrev [7] 0x1020:0x5 DW_TAG_formal_parameter +; CHECK-NEXT:.b32 2172 // DW_AT_type ; CHECK-NEXT:.b8 0 // End Of Children Mark -; CHECK-NEXT:.b8 33 // Abbrev [33] 0xfde:0x2b DW_TAG_subprogram +; CHECK-NEXT:.b8 33 // Abbrev [33] 0x1026:0x2b DW_TAG_subprogram ; CHECK-NEXT:.b8 95 // DW_AT_MIPS_linkage_name ; CHECK-NEXT:.b8 90 ; CHECK-NEXT:.b8 76 @@ -3898,14 +3963,14 @@ if.end: ; preds = %if.then, %entry ; CHECK-NEXT:.b8 0 ; CHECK-NEXT:.b8 4 // DW_AT_decl_file ; CHECK-NEXT:.b8 64 // DW_AT_decl_line -; CHECK-NEXT:.b32 2100 // DW_AT_type +; CHECK-NEXT:.b32 2172 // DW_AT_type ; CHECK-NEXT:.b8 1 // DW_AT_declaration -; CHECK-NEXT:.b8 7 // Abbrev [7] 0xffe:0x5 DW_TAG_formal_parameter -; CHECK-NEXT:.b32 2100 // DW_AT_type -; CHECK-NEXT:.b8 7 // Abbrev [7] 0x1003:0x5 DW_TAG_formal_parameter -; CHECK-NEXT:.b32 2100 // DW_AT_type +; CHECK-NEXT:.b8 7 // Abbrev [7] 0x1046:0x5 DW_TAG_formal_parameter +; CHECK-NEXT:.b32 2172 // DW_AT_type +; CHECK-NEXT:.b8 7 // Abbrev [7] 0x104b:0x5 DW_TAG_formal_parameter +; CHECK-NEXT:.b32 2172 // DW_AT_type ; CHECK-NEXT:.b8 0 // End Of Children Mark -; CHECK-NEXT:.b8 33 // Abbrev [33] 0x1009:0x1b DW_TAG_subprogram +; CHECK-NEXT:.b8 33 // Abbrev [33] 0x1051:0x1b DW_TAG_subprogram ; CHECK-NEXT:.b8 95 // DW_AT_MIPS_linkage_name ; CHECK-NEXT:.b8 90 ; CHECK-NEXT:.b8 76 @@ -3921,12 +3986,12 @@ if.end: ; preds = %if.then, %entry ; CHECK-NEXT:.b8 0 ; CHECK-NEXT:.b8 4 // DW_AT_decl_file ; CHECK-NEXT:.b8 66 // DW_AT_decl_line -; CHECK-NEXT:.b32 2100 // DW_AT_type +; CHECK-NEXT:.b32 2172 // DW_AT_type ; CHECK-NEXT:.b8 1 // DW_AT_declaration -; CHECK-NEXT:.b8 7 // 
Abbrev [7] 0x101e:0x5 DW_TAG_formal_parameter -; CHECK-NEXT:.b32 2100 // DW_AT_type +; CHECK-NEXT:.b8 7 // Abbrev [7] 0x1066:0x5 DW_TAG_formal_parameter +; CHECK-NEXT:.b32 2172 // DW_AT_type ; CHECK-NEXT:.b8 0 // End Of Children Mark -; CHECK-NEXT:.b8 33 // Abbrev [33] 0x1024:0x1d DW_TAG_subprogram +; CHECK-NEXT:.b8 33 // Abbrev [33] 0x106c:0x1d DW_TAG_subprogram ; CHECK-NEXT:.b8 95 // DW_AT_MIPS_linkage_name ; CHECK-NEXT:.b8 90 ; CHECK-NEXT:.b8 76 @@ -3944,12 +4009,12 @@ if.end: ; preds = %if.then, %entry ; CHECK-NEXT:.b8 0 ; CHECK-NEXT:.b8 4 // DW_AT_decl_file ; CHECK-NEXT:.b8 68 // DW_AT_decl_line -; CHECK-NEXT:.b32 2100 // DW_AT_type +; CHECK-NEXT:.b32 2172 // DW_AT_type ; CHECK-NEXT:.b8 1 // DW_AT_declaration -; CHECK-NEXT:.b8 7 // Abbrev [7] 0x103b:0x5 DW_TAG_formal_parameter -; CHECK-NEXT:.b32 2100 // DW_AT_type +; CHECK-NEXT:.b8 7 // Abbrev [7] 0x1083:0x5 DW_TAG_formal_parameter +; CHECK-NEXT:.b32 2172 // DW_AT_type ; CHECK-NEXT:.b8 0 // End Of Children Mark -; CHECK-NEXT:.b8 33 // Abbrev [33] 0x1041:0x1b DW_TAG_subprogram +; CHECK-NEXT:.b8 33 // Abbrev [33] 0x1089:0x1b DW_TAG_subprogram ; CHECK-NEXT:.b8 95 // DW_AT_MIPS_linkage_name ; CHECK-NEXT:.b8 90 ; CHECK-NEXT:.b8 76 @@ -3965,12 +4030,12 @@ if.end: ; preds = %if.then, %entry ; CHECK-NEXT:.b8 0 ; CHECK-NEXT:.b8 4 // DW_AT_decl_file ; CHECK-NEXT:.b8 72 // DW_AT_decl_line -; CHECK-NEXT:.b32 2100 // DW_AT_type +; CHECK-NEXT:.b32 2172 // DW_AT_type ; CHECK-NEXT:.b8 1 // DW_AT_declaration -; CHECK-NEXT:.b8 7 // Abbrev [7] 0x1056:0x5 DW_TAG_formal_parameter -; CHECK-NEXT:.b32 2100 // DW_AT_type +; CHECK-NEXT:.b8 7 // Abbrev [7] 0x109e:0x5 DW_TAG_formal_parameter +; CHECK-NEXT:.b32 2172 // DW_AT_type ; CHECK-NEXT:.b8 0 // End Of Children Mark -; CHECK-NEXT:.b8 33 // Abbrev [33] 0x105c:0x1d DW_TAG_subprogram +; CHECK-NEXT:.b8 33 // Abbrev [33] 0x10a4:0x1d DW_TAG_subprogram ; CHECK-NEXT:.b8 95 // DW_AT_MIPS_linkage_name ; CHECK-NEXT:.b8 90 ; CHECK-NEXT:.b8 76 @@ -3988,12 +4053,12 @@ if.end: ; preds = %if.then, 
%entry ; CHECK-NEXT:.b8 0 ; CHECK-NEXT:.b8 4 // DW_AT_decl_file ; CHECK-NEXT:.b8 70 // DW_AT_decl_line -; CHECK-NEXT:.b32 2100 // DW_AT_type +; CHECK-NEXT:.b32 2172 // DW_AT_type ; CHECK-NEXT:.b8 1 // DW_AT_declaration -; CHECK-NEXT:.b8 7 // Abbrev [7] 0x1073:0x5 DW_TAG_formal_parameter -; CHECK-NEXT:.b32 2100 // DW_AT_type +; CHECK-NEXT:.b8 7 // Abbrev [7] 0x10bb:0x5 DW_TAG_formal_parameter +; CHECK-NEXT:.b32 2172 // DW_AT_type ; CHECK-NEXT:.b8 0 // End Of Children Mark -; CHECK-NEXT:.b8 33 // Abbrev [33] 0x1079:0x1b DW_TAG_subprogram +; CHECK-NEXT:.b8 33 // Abbrev [33] 0x10c1:0x1b DW_TAG_subprogram ; CHECK-NEXT:.b8 95 // DW_AT_MIPS_linkage_name ; CHECK-NEXT:.b8 90 ; CHECK-NEXT:.b8 76 @@ -4009,12 +4074,12 @@ if.end: ; preds = %if.then, %entry ; CHECK-NEXT:.b8 0 ; CHECK-NEXT:.b8 4 // DW_AT_decl_file ; CHECK-NEXT:.b8 76 // DW_AT_decl_line -; CHECK-NEXT:.b32 2100 // DW_AT_type +; CHECK-NEXT:.b32 2172 // DW_AT_type ; CHECK-NEXT:.b8 1 // DW_AT_declaration -; CHECK-NEXT:.b8 7 // Abbrev [7] 0x108e:0x5 DW_TAG_formal_parameter -; CHECK-NEXT:.b32 2100 // DW_AT_type +; CHECK-NEXT:.b8 7 // Abbrev [7] 0x10d6:0x5 DW_TAG_formal_parameter +; CHECK-NEXT:.b32 2172 // DW_AT_type ; CHECK-NEXT:.b8 0 // End Of Children Mark -; CHECK-NEXT:.b8 33 // Abbrev [33] 0x1094:0x1d DW_TAG_subprogram +; CHECK-NEXT:.b8 33 // Abbrev [33] 0x10dc:0x1d DW_TAG_subprogram ; CHECK-NEXT:.b8 95 // DW_AT_MIPS_linkage_name ; CHECK-NEXT:.b8 90 ; CHECK-NEXT:.b8 76 @@ -4032,12 +4097,12 @@ if.end: ; preds = %if.then, %entry ; CHECK-NEXT:.b8 0 ; CHECK-NEXT:.b8 4 // DW_AT_decl_file ; CHECK-NEXT:.b8 74 // DW_AT_decl_line -; CHECK-NEXT:.b32 2100 // DW_AT_type +; CHECK-NEXT:.b32 2172 // DW_AT_type ; CHECK-NEXT:.b8 1 // DW_AT_declaration -; CHECK-NEXT:.b8 7 // Abbrev [7] 0x10ab:0x5 DW_TAG_formal_parameter -; CHECK-NEXT:.b32 2100 // DW_AT_type +; CHECK-NEXT:.b8 7 // Abbrev [7] 0x10f3:0x5 DW_TAG_formal_parameter +; CHECK-NEXT:.b32 2172 // DW_AT_type ; CHECK-NEXT:.b8 0 // End Of Children Mark -; CHECK-NEXT:.b8 33 // 
Abbrev [33] 0x10b1:0x1f DW_TAG_subprogram +; CHECK-NEXT:.b8 33 // Abbrev [33] 0x10f9:0x1f DW_TAG_subprogram ; CHECK-NEXT:.b8 95 // DW_AT_MIPS_linkage_name ; CHECK-NEXT:.b8 90 ; CHECK-NEXT:.b8 76 @@ -4057,12 +4122,12 @@ if.end: ; preds = %if.then, %entry ; CHECK-NEXT:.b8 0 ; CHECK-NEXT:.b8 4 // DW_AT_decl_file ; CHECK-NEXT:.b8 78 // DW_AT_decl_line -; CHECK-NEXT:.b32 2100 // DW_AT_type +; CHECK-NEXT:.b32 2172 // DW_AT_type ; CHECK-NEXT:.b8 1 // DW_AT_declaration -; CHECK-NEXT:.b8 7 // Abbrev [7] 0x10ca:0x5 DW_TAG_formal_parameter -; CHECK-NEXT:.b32 2100 // DW_AT_type +; CHECK-NEXT:.b8 7 // Abbrev [7] 0x1112:0x5 DW_TAG_formal_parameter +; CHECK-NEXT:.b32 2172 // DW_AT_type ; CHECK-NEXT:.b8 0 // End Of Children Mark -; CHECK-NEXT:.b8 33 // Abbrev [33] 0x10d0:0x1d DW_TAG_subprogram +; CHECK-NEXT:.b8 33 // Abbrev [33] 0x1118:0x1d DW_TAG_subprogram ; CHECK-NEXT:.b8 95 // DW_AT_MIPS_linkage_name ; CHECK-NEXT:.b8 90 ; CHECK-NEXT:.b8 76 @@ -4080,12 +4145,12 @@ if.end: ; preds = %if.then, %entry ; CHECK-NEXT:.b8 0 ; CHECK-NEXT:.b8 4 // DW_AT_decl_file ; CHECK-NEXT:.b8 80 // DW_AT_decl_line -; CHECK-NEXT:.b32 2100 // DW_AT_type +; CHECK-NEXT:.b32 2172 // DW_AT_type ; CHECK-NEXT:.b8 1 // DW_AT_declaration -; CHECK-NEXT:.b8 7 // Abbrev [7] 0x10e7:0x5 DW_TAG_formal_parameter -; CHECK-NEXT:.b32 2100 // DW_AT_type +; CHECK-NEXT:.b8 7 // Abbrev [7] 0x112f:0x5 DW_TAG_formal_parameter +; CHECK-NEXT:.b32 2172 // DW_AT_type ; CHECK-NEXT:.b8 0 // End Of Children Mark -; CHECK-NEXT:.b8 33 // Abbrev [33] 0x10ed:0x23 DW_TAG_subprogram +; CHECK-NEXT:.b8 33 // Abbrev [33] 0x1135:0x23 DW_TAG_subprogram ; CHECK-NEXT:.b8 95 // DW_AT_MIPS_linkage_name ; CHECK-NEXT:.b8 90 ; CHECK-NEXT:.b8 76 @@ -4104,14 +4169,14 @@ if.end: ; preds = %if.then, %entry ; CHECK-NEXT:.b8 0 ; CHECK-NEXT:.b8 4 // DW_AT_decl_file ; CHECK-NEXT:.b8 82 // DW_AT_decl_line -; CHECK-NEXT:.b32 2100 // DW_AT_type +; CHECK-NEXT:.b32 2172 // DW_AT_type ; CHECK-NEXT:.b8 1 // DW_AT_declaration -; CHECK-NEXT:.b8 7 // Abbrev [7] 
0x1105:0x5 DW_TAG_formal_parameter -; CHECK-NEXT:.b32 2100 // DW_AT_type -; CHECK-NEXT:.b8 7 // Abbrev [7] 0x110a:0x5 DW_TAG_formal_parameter -; CHECK-NEXT:.b32 2100 // DW_AT_type +; CHECK-NEXT:.b8 7 // Abbrev [7] 0x114d:0x5 DW_TAG_formal_parameter +; CHECK-NEXT:.b32 2172 // DW_AT_type +; CHECK-NEXT:.b8 7 // Abbrev [7] 0x1152:0x5 DW_TAG_formal_parameter +; CHECK-NEXT:.b32 2172 // DW_AT_type ; CHECK-NEXT:.b8 0 // End Of Children Mark -; CHECK-NEXT:.b8 33 // Abbrev [33] 0x1110:0x1f DW_TAG_subprogram +; CHECK-NEXT:.b8 33 // Abbrev [33] 0x1158:0x1f DW_TAG_subprogram ; CHECK-NEXT:.b8 95 // DW_AT_MIPS_linkage_name ; CHECK-NEXT:.b8 90 ; CHECK-NEXT:.b8 76 @@ -4131,12 +4196,12 @@ if.end: ; preds = %if.then, %entry ; CHECK-NEXT:.b8 0 ; CHECK-NEXT:.b8 4 // DW_AT_decl_file ; CHECK-NEXT:.b8 84 // DW_AT_decl_line -; CHECK-NEXT:.b32 2100 // DW_AT_type +; CHECK-NEXT:.b32 2172 // DW_AT_type ; CHECK-NEXT:.b8 1 // DW_AT_declaration -; CHECK-NEXT:.b8 7 // Abbrev [7] 0x1129:0x5 DW_TAG_formal_parameter -; CHECK-NEXT:.b32 2100 // DW_AT_type +; CHECK-NEXT:.b8 7 // Abbrev [7] 0x1171:0x5 DW_TAG_formal_parameter +; CHECK-NEXT:.b32 2172 // DW_AT_type ; CHECK-NEXT:.b8 0 // End Of Children Mark -; CHECK-NEXT:.b8 33 // Abbrev [33] 0x112f:0x27 DW_TAG_subprogram +; CHECK-NEXT:.b8 33 // Abbrev [33] 0x1177:0x27 DW_TAG_subprogram ; CHECK-NEXT:.b8 95 // DW_AT_MIPS_linkage_name ; CHECK-NEXT:.b8 90 ; CHECK-NEXT:.b8 76 @@ -4154,16 +4219,16 @@ if.end: ; preds = %if.then, %entry ; CHECK-NEXT:.b8 0 ; CHECK-NEXT:.b8 4 // DW_AT_decl_file ; CHECK-NEXT:.b8 86 // DW_AT_decl_line -; CHECK-NEXT:.b32 2100 // DW_AT_type +; CHECK-NEXT:.b32 2172 // DW_AT_type ; CHECK-NEXT:.b8 1 // DW_AT_declaration -; CHECK-NEXT:.b8 7 // Abbrev [7] 0x1146:0x5 DW_TAG_formal_parameter -; CHECK-NEXT:.b32 2100 // DW_AT_type -; CHECK-NEXT:.b8 7 // Abbrev [7] 0x114b:0x5 DW_TAG_formal_parameter -; CHECK-NEXT:.b32 2100 // DW_AT_type -; CHECK-NEXT:.b8 7 // Abbrev [7] 0x1150:0x5 DW_TAG_formal_parameter -; CHECK-NEXT:.b32 2100 // DW_AT_type +; 
CHECK-NEXT:.b8 7 // Abbrev [7] 0x118e:0x5 DW_TAG_formal_parameter +; CHECK-NEXT:.b32 2172 // DW_AT_type +; CHECK-NEXT:.b8 7 // Abbrev [7] 0x1193:0x5 DW_TAG_formal_parameter +; CHECK-NEXT:.b32 2172 // DW_AT_type +; CHECK-NEXT:.b8 7 // Abbrev [7] 0x1198:0x5 DW_TAG_formal_parameter +; CHECK-NEXT:.b32 2172 // DW_AT_type ; CHECK-NEXT:.b8 0 // End Of Children Mark -; CHECK-NEXT:.b8 33 // Abbrev [33] 0x1156:0x23 DW_TAG_subprogram +; CHECK-NEXT:.b8 33 // Abbrev [33] 0x119e:0x23 DW_TAG_subprogram ; CHECK-NEXT:.b8 95 // DW_AT_MIPS_linkage_name ; CHECK-NEXT:.b8 90 ; CHECK-NEXT:.b8 76 @@ -4182,14 +4247,14 @@ if.end: ; preds = %if.then, %entry ; CHECK-NEXT:.b8 0 ; CHECK-NEXT:.b8 4 // DW_AT_decl_file ; CHECK-NEXT:.b8 88 // DW_AT_decl_line -; CHECK-NEXT:.b32 2100 // DW_AT_type +; CHECK-NEXT:.b32 2172 // DW_AT_type ; CHECK-NEXT:.b8 1 // DW_AT_declaration -; CHECK-NEXT:.b8 7 // Abbrev [7] 0x116e:0x5 DW_TAG_formal_parameter -; CHECK-NEXT:.b32 2100 // DW_AT_type -; CHECK-NEXT:.b8 7 // Abbrev [7] 0x1173:0x5 DW_TAG_formal_parameter -; CHECK-NEXT:.b32 2100 // DW_AT_type +; CHECK-NEXT:.b8 7 // Abbrev [7] 0x11b6:0x5 DW_TAG_formal_parameter +; CHECK-NEXT:.b32 2172 // DW_AT_type +; CHECK-NEXT:.b8 7 // Abbrev [7] 0x11bb:0x5 DW_TAG_formal_parameter +; CHECK-NEXT:.b32 2172 // DW_AT_type ; CHECK-NEXT:.b8 0 // End Of Children Mark -; CHECK-NEXT:.b8 33 // Abbrev [33] 0x1179:0x23 DW_TAG_subprogram +; CHECK-NEXT:.b8 33 // Abbrev [33] 0x11c1:0x23 DW_TAG_subprogram ; CHECK-NEXT:.b8 95 // DW_AT_MIPS_linkage_name ; CHECK-NEXT:.b8 90 ; CHECK-NEXT:.b8 76 @@ -4208,14 +4273,14 @@ if.end: ; preds = %if.then, %entry ; CHECK-NEXT:.b8 0 ; CHECK-NEXT:.b8 4 // DW_AT_decl_file ; CHECK-NEXT:.b8 90 // DW_AT_decl_line -; CHECK-NEXT:.b32 2100 // DW_AT_type +; CHECK-NEXT:.b32 2172 // DW_AT_type ; CHECK-NEXT:.b8 1 // DW_AT_declaration -; CHECK-NEXT:.b8 7 // Abbrev [7] 0x1191:0x5 DW_TAG_formal_parameter -; CHECK-NEXT:.b32 2100 // DW_AT_type -; CHECK-NEXT:.b8 7 // Abbrev [7] 0x1196:0x5 DW_TAG_formal_parameter -; 
CHECK-NEXT:.b32 2100 // DW_AT_type +; CHECK-NEXT:.b8 7 // Abbrev [7] 0x11d9:0x5 DW_TAG_formal_parameter +; CHECK-NEXT:.b32 2172 // DW_AT_type +; CHECK-NEXT:.b8 7 // Abbrev [7] 0x11de:0x5 DW_TAG_formal_parameter +; CHECK-NEXT:.b32 2172 // DW_AT_type ; CHECK-NEXT:.b8 0 // End Of Children Mark -; CHECK-NEXT:.b8 33 // Abbrev [33] 0x119c:0x23 DW_TAG_subprogram +; CHECK-NEXT:.b8 33 // Abbrev [33] 0x11e4:0x23 DW_TAG_subprogram ; CHECK-NEXT:.b8 95 // DW_AT_MIPS_linkage_name ; CHECK-NEXT:.b8 90 ; CHECK-NEXT:.b8 76 @@ -4234,14 +4299,14 @@ if.end: ; preds = %if.then, %entry ; CHECK-NEXT:.b8 0 ; CHECK-NEXT:.b8 4 // DW_AT_decl_file ; CHECK-NEXT:.b8 92 // DW_AT_decl_line -; CHECK-NEXT:.b32 2100 // DW_AT_type +; CHECK-NEXT:.b32 2172 // DW_AT_type ; CHECK-NEXT:.b8 1 // DW_AT_declaration -; CHECK-NEXT:.b8 7 // Abbrev [7] 0x11b4:0x5 DW_TAG_formal_parameter -; CHECK-NEXT:.b32 2100 // DW_AT_type -; CHECK-NEXT:.b8 7 // Abbrev [7] 0x11b9:0x5 DW_TAG_formal_parameter -; CHECK-NEXT:.b32 2100 // DW_AT_type +; CHECK-NEXT:.b8 7 // Abbrev [7] 0x11fc:0x5 DW_TAG_formal_parameter +; CHECK-NEXT:.b32 2172 // DW_AT_type +; CHECK-NEXT:.b8 7 // Abbrev [7] 0x1201:0x5 DW_TAG_formal_parameter +; CHECK-NEXT:.b32 2172 // DW_AT_type ; CHECK-NEXT:.b8 0 // End Of Children Mark -; CHECK-NEXT:.b8 33 // Abbrev [33] 0x11bf:0x2a DW_TAG_subprogram +; CHECK-NEXT:.b8 33 // Abbrev [33] 0x1207:0x2a DW_TAG_subprogram ; CHECK-NEXT:.b8 95 // DW_AT_MIPS_linkage_name ; CHECK-NEXT:.b8 90 ; CHECK-NEXT:.b8 76 @@ -4272,19 +4337,19 @@ if.end: ; preds = %if.then, %entry ; CHECK-NEXT:.b8 0 ; CHECK-NEXT:.b8 4 // DW_AT_decl_file ; CHECK-NEXT:.b8 94 // DW_AT_decl_line -; CHECK-NEXT:.b32 4585 // DW_AT_type +; CHECK-NEXT:.b32 4657 // DW_AT_type ; CHECK-NEXT:.b8 1 // DW_AT_declaration -; CHECK-NEXT:.b8 7 // Abbrev [7] 0x11e3:0x5 DW_TAG_formal_parameter -; CHECK-NEXT:.b32 2100 // DW_AT_type +; CHECK-NEXT:.b8 7 // Abbrev [7] 0x122b:0x5 DW_TAG_formal_parameter +; CHECK-NEXT:.b32 2172 // DW_AT_type ; CHECK-NEXT:.b8 0 // End Of Children Mark 
-; CHECK-NEXT:.b8 10 // Abbrev [10] 0x11e9:0x7 DW_TAG_base_type +; CHECK-NEXT:.b8 10 // Abbrev [10] 0x1231:0x7 DW_TAG_base_type ; CHECK-NEXT:.b8 105 // DW_AT_name ; CHECK-NEXT:.b8 110 ; CHECK-NEXT:.b8 116 ; CHECK-NEXT:.b8 0 ; CHECK-NEXT:.b8 5 // DW_AT_encoding ; CHECK-NEXT:.b8 4 // DW_AT_byte_size -; CHECK-NEXT:.b8 33 // Abbrev [33] 0x11f0:0x26 DW_TAG_subprogram +; CHECK-NEXT:.b8 33 // Abbrev [33] 0x1238:0x26 DW_TAG_subprogram ; CHECK-NEXT:.b8 95 // DW_AT_MIPS_linkage_name ; CHECK-NEXT:.b8 90 ; CHECK-NEXT:.b8 76 @@ -4306,16 +4371,16 @@ if.end: ; preds = %if.then, %entry ; CHECK-NEXT:.b8 0 ; CHECK-NEXT:.b8 4 // DW_AT_decl_file ; CHECK-NEXT:.b8 96 // DW_AT_decl_line -; CHECK-NEXT:.b32 2100 // DW_AT_type +; CHECK-NEXT:.b32 2172 // DW_AT_type ; CHECK-NEXT:.b8 1 // DW_AT_declaration -; CHECK-NEXT:.b8 7 // Abbrev [7] 0x120b:0x5 DW_TAG_formal_parameter -; CHECK-NEXT:.b32 2100 // DW_AT_type -; CHECK-NEXT:.b8 7 // Abbrev [7] 0x1210:0x5 DW_TAG_formal_parameter -; CHECK-NEXT:.b32 4630 // DW_AT_type +; CHECK-NEXT:.b8 7 // Abbrev [7] 0x1253:0x5 DW_TAG_formal_parameter +; CHECK-NEXT:.b32 2172 // DW_AT_type +; CHECK-NEXT:.b8 7 // Abbrev [7] 0x1258:0x5 DW_TAG_formal_parameter +; CHECK-NEXT:.b32 4702 // DW_AT_type ; CHECK-NEXT:.b8 0 // End Of Children Mark -; CHECK-NEXT:.b8 12 // Abbrev [12] 0x1216:0x5 DW_TAG_pointer_type -; CHECK-NEXT:.b32 4585 // DW_AT_type -; CHECK-NEXT:.b8 33 // Abbrev [33] 0x121b:0x25 DW_TAG_subprogram +; CHECK-NEXT:.b8 12 // Abbrev [12] 0x125e:0x5 DW_TAG_pointer_type +; CHECK-NEXT:.b32 4657 // DW_AT_type +; CHECK-NEXT:.b8 33 // Abbrev [33] 0x1263:0x25 DW_TAG_subprogram ; CHECK-NEXT:.b8 95 // DW_AT_MIPS_linkage_name ; CHECK-NEXT:.b8 90 ; CHECK-NEXT:.b8 76 @@ -4336,14 +4401,14 @@ if.end: ; preds = %if.then, %entry ; CHECK-NEXT:.b8 0 ; CHECK-NEXT:.b8 4 // DW_AT_decl_file ; CHECK-NEXT:.b8 98 // DW_AT_decl_line -; CHECK-NEXT:.b32 2100 // DW_AT_type +; CHECK-NEXT:.b32 2172 // DW_AT_type ; CHECK-NEXT:.b8 1 // DW_AT_declaration -; CHECK-NEXT:.b8 7 // Abbrev [7] 
0x1235:0x5 DW_TAG_formal_parameter -; CHECK-NEXT:.b32 2100 // DW_AT_type -; CHECK-NEXT:.b8 7 // Abbrev [7] 0x123a:0x5 DW_TAG_formal_parameter -; CHECK-NEXT:.b32 2100 // DW_AT_type +; CHECK-NEXT:.b8 7 // Abbrev [7] 0x127d:0x5 DW_TAG_formal_parameter +; CHECK-NEXT:.b32 2172 // DW_AT_type +; CHECK-NEXT:.b8 7 // Abbrev [7] 0x1282:0x5 DW_TAG_formal_parameter +; CHECK-NEXT:.b32 2172 // DW_AT_type ; CHECK-NEXT:.b8 0 // End Of Children Mark -; CHECK-NEXT:.b8 33 // Abbrev [33] 0x1240:0x1f DW_TAG_subprogram +; CHECK-NEXT:.b8 33 // Abbrev [33] 0x1288:0x1f DW_TAG_subprogram ; CHECK-NEXT:.b8 95 // DW_AT_MIPS_linkage_name ; CHECK-NEXT:.b8 90 ; CHECK-NEXT:.b8 76 @@ -4363,12 +4428,12 @@ if.end: ; preds = %if.then, %entry ; CHECK-NEXT:.b8 0 ; CHECK-NEXT:.b8 4 // DW_AT_decl_file ; CHECK-NEXT:.b8 100 // DW_AT_decl_line -; CHECK-NEXT:.b32 4585 // DW_AT_type +; CHECK-NEXT:.b32 4657 // DW_AT_type ; CHECK-NEXT:.b8 1 // DW_AT_declaration -; CHECK-NEXT:.b8 7 // Abbrev [7] 0x1259:0x5 DW_TAG_formal_parameter -; CHECK-NEXT:.b32 2100 // DW_AT_type +; CHECK-NEXT:.b8 7 // Abbrev [7] 0x12a1:0x5 DW_TAG_formal_parameter +; CHECK-NEXT:.b32 2172 // DW_AT_type ; CHECK-NEXT:.b8 0 // End Of Children Mark -; CHECK-NEXT:.b8 33 // Abbrev [33] 0x125f:0x25 DW_TAG_subprogram +; CHECK-NEXT:.b8 33 // Abbrev [33] 0x12a7:0x25 DW_TAG_subprogram ; CHECK-NEXT:.b8 95 // DW_AT_MIPS_linkage_name ; CHECK-NEXT:.b8 90 ; CHECK-NEXT:.b8 76 @@ -4394,12 +4459,12 @@ if.end: ; preds = %if.then, %entry ; CHECK-NEXT:.b8 0 ; CHECK-NEXT:.b8 4 // DW_AT_decl_file ; CHECK-NEXT:.b8 102 // DW_AT_decl_line -; CHECK-NEXT:.b32 4740 // DW_AT_type +; CHECK-NEXT:.b32 4812 // DW_AT_type ; CHECK-NEXT:.b8 1 // DW_AT_declaration -; CHECK-NEXT:.b8 7 // Abbrev [7] 0x127e:0x5 DW_TAG_formal_parameter -; CHECK-NEXT:.b32 2100 // DW_AT_type +; CHECK-NEXT:.b8 7 // Abbrev [7] 0x12c6:0x5 DW_TAG_formal_parameter +; CHECK-NEXT:.b32 2172 // DW_AT_type ; CHECK-NEXT:.b8 0 // End Of Children Mark -; CHECK-NEXT:.b8 10 // Abbrev [10] 0x1284:0x8 DW_TAG_base_type +; 
CHECK-NEXT:.b8 10 // Abbrev [10] 0x12cc:0x8 DW_TAG_base_type ; CHECK-NEXT:.b8 98 // DW_AT_name ; CHECK-NEXT:.b8 111 ; CHECK-NEXT:.b8 111 @@ -4407,7 +4472,7 @@ if.end: ; preds = %if.then, %entry ; CHECK-NEXT:.b8 0 ; CHECK-NEXT:.b8 2 // DW_AT_encoding ; CHECK-NEXT:.b8 1 // DW_AT_byte_size -; CHECK-NEXT:.b8 33 // Abbrev [33] 0x128c:0x2d DW_TAG_subprogram +; CHECK-NEXT:.b8 33 // Abbrev [33] 0x12d4:0x2d DW_TAG_subprogram ; CHECK-NEXT:.b8 95 // DW_AT_MIPS_linkage_name ; CHECK-NEXT:.b8 90 ; CHECK-NEXT:.b8 76 @@ -4436,14 +4501,14 @@ if.end: ; preds = %if.then, %entry ; CHECK-NEXT:.b8 0 ; CHECK-NEXT:.b8 4 // DW_AT_decl_file ; CHECK-NEXT:.b8 106 // DW_AT_decl_line -; CHECK-NEXT:.b32 4740 // DW_AT_type +; CHECK-NEXT:.b32 4812 // DW_AT_type ; CHECK-NEXT:.b8 1 // DW_AT_declaration -; CHECK-NEXT:.b8 7 // Abbrev [7] 0x12ae:0x5 DW_TAG_formal_parameter -; CHECK-NEXT:.b32 2100 // DW_AT_type -; CHECK-NEXT:.b8 7 // Abbrev [7] 0x12b3:0x5 DW_TAG_formal_parameter -; CHECK-NEXT:.b32 2100 // DW_AT_type +; CHECK-NEXT:.b8 7 // Abbrev [7] 0x12f6:0x5 DW_TAG_formal_parameter +; CHECK-NEXT:.b32 2172 // DW_AT_type +; CHECK-NEXT:.b8 7 // Abbrev [7] 0x12fb:0x5 DW_TAG_formal_parameter +; CHECK-NEXT:.b32 2172 // DW_AT_type ; CHECK-NEXT:.b8 0 // End Of Children Mark -; CHECK-NEXT:.b8 33 // Abbrev [33] 0x12b9:0x38 DW_TAG_subprogram +; CHECK-NEXT:.b8 33 // Abbrev [33] 0x1301:0x38 DW_TAG_subprogram ; CHECK-NEXT:.b8 95 // DW_AT_MIPS_linkage_name ; CHECK-NEXT:.b8 90 ; CHECK-NEXT:.b8 76 @@ -4483,14 +4548,14 @@ if.end: ; preds = %if.then, %entry ; CHECK-NEXT:.b8 0 ; CHECK-NEXT:.b8 4 // DW_AT_decl_file ; CHECK-NEXT:.b8 105 // DW_AT_decl_line -; CHECK-NEXT:.b32 4740 // DW_AT_type +; CHECK-NEXT:.b32 4812 // DW_AT_type ; CHECK-NEXT:.b8 1 // DW_AT_declaration -; CHECK-NEXT:.b8 7 // Abbrev [7] 0x12e6:0x5 DW_TAG_formal_parameter -; CHECK-NEXT:.b32 2100 // DW_AT_type -; CHECK-NEXT:.b8 7 // Abbrev [7] 0x12eb:0x5 DW_TAG_formal_parameter -; CHECK-NEXT:.b32 2100 // DW_AT_type +; CHECK-NEXT:.b8 7 // Abbrev [7] 0x132e:0x5 
DW_TAG_formal_parameter +; CHECK-NEXT:.b32 2172 // DW_AT_type +; CHECK-NEXT:.b8 7 // Abbrev [7] 0x1333:0x5 DW_TAG_formal_parameter +; CHECK-NEXT:.b32 2172 // DW_AT_type ; CHECK-NEXT:.b8 0 // End Of Children Mark -; CHECK-NEXT:.b8 33 // Abbrev [33] 0x12f1:0x1f DW_TAG_subprogram +; CHECK-NEXT:.b8 33 // Abbrev [33] 0x1339:0x1f DW_TAG_subprogram ; CHECK-NEXT:.b8 95 // DW_AT_MIPS_linkage_name ; CHECK-NEXT:.b8 90 ; CHECK-NEXT:.b8 76 @@ -4510,12 +4575,12 @@ if.end: ; preds = %if.then, %entry ; CHECK-NEXT:.b8 0 ; CHECK-NEXT:.b8 4 // DW_AT_decl_file ; CHECK-NEXT:.b8 108 // DW_AT_decl_line -; CHECK-NEXT:.b32 4740 // DW_AT_type +; CHECK-NEXT:.b32 4812 // DW_AT_type ; CHECK-NEXT:.b8 1 // DW_AT_declaration -; CHECK-NEXT:.b8 7 // Abbrev [7] 0x130a:0x5 DW_TAG_formal_parameter -; CHECK-NEXT:.b32 2100 // DW_AT_type +; CHECK-NEXT:.b8 7 // Abbrev [7] 0x1352:0x5 DW_TAG_formal_parameter +; CHECK-NEXT:.b32 2172 // DW_AT_type ; CHECK-NEXT:.b8 0 // End Of Children Mark -; CHECK-NEXT:.b8 33 // Abbrev [33] 0x1310:0x27 DW_TAG_subprogram +; CHECK-NEXT:.b8 33 // Abbrev [33] 0x1358:0x27 DW_TAG_subprogram ; CHECK-NEXT:.b8 95 // DW_AT_MIPS_linkage_name ; CHECK-NEXT:.b8 90 ; CHECK-NEXT:.b8 76 @@ -4538,14 +4603,14 @@ if.end: ; preds = %if.then, %entry ; CHECK-NEXT:.b8 0 ; CHECK-NEXT:.b8 4 // DW_AT_decl_file ; CHECK-NEXT:.b8 112 // DW_AT_decl_line -; CHECK-NEXT:.b32 4740 // DW_AT_type +; CHECK-NEXT:.b32 4812 // DW_AT_type ; CHECK-NEXT:.b8 1 // DW_AT_declaration -; CHECK-NEXT:.b8 7 // Abbrev [7] 0x132c:0x5 DW_TAG_formal_parameter -; CHECK-NEXT:.b32 2100 // DW_AT_type -; CHECK-NEXT:.b8 7 // Abbrev [7] 0x1331:0x5 DW_TAG_formal_parameter -; CHECK-NEXT:.b32 2100 // DW_AT_type +; CHECK-NEXT:.b8 7 // Abbrev [7] 0x1374:0x5 DW_TAG_formal_parameter +; CHECK-NEXT:.b32 2172 // DW_AT_type +; CHECK-NEXT:.b8 7 // Abbrev [7] 0x1379:0x5 DW_TAG_formal_parameter +; CHECK-NEXT:.b32 2172 // DW_AT_type ; CHECK-NEXT:.b8 0 // End Of Children Mark -; CHECK-NEXT:.b8 33 // Abbrev [33] 0x1337:0x32 DW_TAG_subprogram +; 
CHECK-NEXT:.b8 33 // Abbrev [33] 0x137f:0x32 DW_TAG_subprogram ; CHECK-NEXT:.b8 95 // DW_AT_MIPS_linkage_name ; CHECK-NEXT:.b8 90 ; CHECK-NEXT:.b8 76 @@ -4579,14 +4644,14 @@ if.end: ; preds = %if.then, %entry ; CHECK-NEXT:.b8 0 ; CHECK-NEXT:.b8 4 // DW_AT_decl_file ; CHECK-NEXT:.b8 111 // DW_AT_decl_line -; CHECK-NEXT:.b32 4740 // DW_AT_type +; CHECK-NEXT:.b32 4812 // DW_AT_type ; CHECK-NEXT:.b8 1 // DW_AT_declaration -; CHECK-NEXT:.b8 7 // Abbrev [7] 0x135e:0x5 DW_TAG_formal_parameter -; CHECK-NEXT:.b32 2100 // DW_AT_type -; CHECK-NEXT:.b8 7 // Abbrev [7] 0x1363:0x5 DW_TAG_formal_parameter -; CHECK-NEXT:.b32 2100 // DW_AT_type +; CHECK-NEXT:.b8 7 // Abbrev [7] 0x13a6:0x5 DW_TAG_formal_parameter +; CHECK-NEXT:.b32 2172 // DW_AT_type +; CHECK-NEXT:.b8 7 // Abbrev [7] 0x13ab:0x5 DW_TAG_formal_parameter +; CHECK-NEXT:.b32 2172 // DW_AT_type ; CHECK-NEXT:.b8 0 // End Of Children Mark -; CHECK-NEXT:.b8 33 // Abbrev [33] 0x1369:0x36 DW_TAG_subprogram +; CHECK-NEXT:.b8 33 // Abbrev [33] 0x13b1:0x36 DW_TAG_subprogram ; CHECK-NEXT:.b8 95 // DW_AT_MIPS_linkage_name ; CHECK-NEXT:.b8 90 ; CHECK-NEXT:.b8 76 @@ -4624,14 +4689,14 @@ if.end: ; preds = %if.then, %entry ; CHECK-NEXT:.b8 0 ; CHECK-NEXT:.b8 4 // DW_AT_decl_file ; CHECK-NEXT:.b8 114 // DW_AT_decl_line -; CHECK-NEXT:.b32 4740 // DW_AT_type +; CHECK-NEXT:.b32 4812 // DW_AT_type ; CHECK-NEXT:.b8 1 // DW_AT_declaration -; CHECK-NEXT:.b8 7 // Abbrev [7] 0x1394:0x5 DW_TAG_formal_parameter -; CHECK-NEXT:.b32 2100 // DW_AT_type -; CHECK-NEXT:.b8 7 // Abbrev [7] 0x1399:0x5 DW_TAG_formal_parameter -; CHECK-NEXT:.b32 2100 // DW_AT_type +; CHECK-NEXT:.b8 7 // Abbrev [7] 0x13dc:0x5 DW_TAG_formal_parameter +; CHECK-NEXT:.b32 2172 // DW_AT_type +; CHECK-NEXT:.b8 7 // Abbrev [7] 0x13e1:0x5 DW_TAG_formal_parameter +; CHECK-NEXT:.b32 2172 // DW_AT_type ; CHECK-NEXT:.b8 0 // End Of Children Mark -; CHECK-NEXT:.b8 33 // Abbrev [33] 0x139f:0x1f DW_TAG_subprogram +; CHECK-NEXT:.b8 33 // Abbrev [33] 0x13e7:0x1f DW_TAG_subprogram ; 
CHECK-NEXT:.b8 95 // DW_AT_MIPS_linkage_name ; CHECK-NEXT:.b8 90 ; CHECK-NEXT:.b8 76 @@ -4651,12 +4716,12 @@ if.end: ; preds = %if.then, %entry ; CHECK-NEXT:.b8 0 ; CHECK-NEXT:.b8 4 // DW_AT_decl_file ; CHECK-NEXT:.b8 116 // DW_AT_decl_line -; CHECK-NEXT:.b32 4740 // DW_AT_type +; CHECK-NEXT:.b32 4812 // DW_AT_type ; CHECK-NEXT:.b8 1 // DW_AT_declaration -; CHECK-NEXT:.b8 7 // Abbrev [7] 0x13b8:0x5 DW_TAG_formal_parameter -; CHECK-NEXT:.b32 2100 // DW_AT_type +; CHECK-NEXT:.b8 7 // Abbrev [7] 0x1400:0x5 DW_TAG_formal_parameter +; CHECK-NEXT:.b32 2172 // DW_AT_type ; CHECK-NEXT:.b8 0 // End Of Children Mark -; CHECK-NEXT:.b8 33 // Abbrev [33] 0x13be:0x25 DW_TAG_subprogram +; CHECK-NEXT:.b8 33 // Abbrev [33] 0x1406:0x25 DW_TAG_subprogram ; CHECK-NEXT:.b8 95 // DW_AT_MIPS_linkage_name ; CHECK-NEXT:.b8 90 ; CHECK-NEXT:.b8 76 @@ -4682,12 +4747,12 @@ if.end: ; preds = %if.then, %entry ; CHECK-NEXT:.b8 0 ; CHECK-NEXT:.b8 4 // DW_AT_decl_file ; CHECK-NEXT:.b8 118 // DW_AT_decl_line -; CHECK-NEXT:.b32 4740 // DW_AT_type +; CHECK-NEXT:.b32 4812 // DW_AT_type ; CHECK-NEXT:.b8 1 // DW_AT_declaration -; CHECK-NEXT:.b8 7 // Abbrev [7] 0x13dd:0x5 DW_TAG_formal_parameter -; CHECK-NEXT:.b32 2100 // DW_AT_type +; CHECK-NEXT:.b8 7 // Abbrev [7] 0x1425:0x5 DW_TAG_formal_parameter +; CHECK-NEXT:.b32 2172 // DW_AT_type ; CHECK-NEXT:.b8 0 // End Of Children Mark -; CHECK-NEXT:.b8 33 // Abbrev [33] 0x13e3:0x32 DW_TAG_subprogram +; CHECK-NEXT:.b8 33 // Abbrev [33] 0x142b:0x32 DW_TAG_subprogram ; CHECK-NEXT:.b8 95 // DW_AT_MIPS_linkage_name ; CHECK-NEXT:.b8 90 ; CHECK-NEXT:.b8 76 @@ -4721,14 +4786,14 @@ if.end: ; preds = %if.then, %entry ; CHECK-NEXT:.b8 0 ; CHECK-NEXT:.b8 4 // DW_AT_decl_file ; CHECK-NEXT:.b8 120 // DW_AT_decl_line -; CHECK-NEXT:.b32 4740 // DW_AT_type +; CHECK-NEXT:.b32 4812 // DW_AT_type ; CHECK-NEXT:.b8 1 // DW_AT_declaration -; CHECK-NEXT:.b8 7 // Abbrev [7] 0x140a:0x5 DW_TAG_formal_parameter -; CHECK-NEXT:.b32 2100 // DW_AT_type -; CHECK-NEXT:.b8 7 // Abbrev [7] 
0x140f:0x5 DW_TAG_formal_parameter -; CHECK-NEXT:.b32 2100 // DW_AT_type +; CHECK-NEXT:.b8 7 // Abbrev [7] 0x1452:0x5 DW_TAG_formal_parameter +; CHECK-NEXT:.b32 2172 // DW_AT_type +; CHECK-NEXT:.b8 7 // Abbrev [7] 0x1457:0x5 DW_TAG_formal_parameter +; CHECK-NEXT:.b32 2172 // DW_AT_type ; CHECK-NEXT:.b8 0 // End Of Children Mark -; CHECK-NEXT:.b8 33 // Abbrev [33] 0x1415:0x1d DW_TAG_subprogram +; CHECK-NEXT:.b8 33 // Abbrev [33] 0x145d:0x1d DW_TAG_subprogram ; CHECK-NEXT:.b8 95 // DW_AT_MIPS_linkage_name ; CHECK-NEXT:.b8 90 ; CHECK-NEXT:.b8 76 @@ -4746,12 +4811,12 @@ if.end: ; preds = %if.then, %entry ; CHECK-NEXT:.b8 0 ; CHECK-NEXT:.b8 4 // DW_AT_decl_file ; CHECK-NEXT:.b8 121 // DW_AT_decl_line -; CHECK-NEXT:.b32 5170 // DW_AT_type +; CHECK-NEXT:.b32 5242 // DW_AT_type ; CHECK-NEXT:.b8 1 // DW_AT_declaration -; CHECK-NEXT:.b8 7 // Abbrev [7] 0x142c:0x5 DW_TAG_formal_parameter -; CHECK-NEXT:.b32 5170 // DW_AT_type +; CHECK-NEXT:.b8 7 // Abbrev [7] 0x1474:0x5 DW_TAG_formal_parameter +; CHECK-NEXT:.b32 5242 // DW_AT_type ; CHECK-NEXT:.b8 0 // End Of Children Mark -; CHECK-NEXT:.b8 10 // Abbrev [10] 0x1432:0xc DW_TAG_base_type +; CHECK-NEXT:.b8 10 // Abbrev [10] 0x147a:0xc DW_TAG_base_type ; CHECK-NEXT:.b8 108 // DW_AT_name ; CHECK-NEXT:.b8 111 ; CHECK-NEXT:.b8 110 @@ -4763,7 +4828,7 @@ if.end: ; preds = %if.then, %entry ; CHECK-NEXT:.b8 0 ; CHECK-NEXT:.b8 5 // DW_AT_encoding ; CHECK-NEXT:.b8 8 // DW_AT_byte_size -; CHECK-NEXT:.b8 33 // Abbrev [33] 0x143e:0x25 DW_TAG_subprogram +; CHECK-NEXT:.b8 33 // Abbrev [33] 0x1486:0x25 DW_TAG_subprogram ; CHECK-NEXT:.b8 95 // DW_AT_MIPS_linkage_name ; CHECK-NEXT:.b8 90 ; CHECK-NEXT:.b8 76 @@ -4784,14 +4849,14 @@ if.end: ; preds = %if.then, %entry ; CHECK-NEXT:.b8 0 ; CHECK-NEXT:.b8 4 // DW_AT_decl_file ; CHECK-NEXT:.b8 123 // DW_AT_decl_line -; CHECK-NEXT:.b32 2100 // DW_AT_type +; CHECK-NEXT:.b32 2172 // DW_AT_type ; CHECK-NEXT:.b8 1 // DW_AT_declaration -; CHECK-NEXT:.b8 7 // Abbrev [7] 0x1458:0x5 DW_TAG_formal_parameter -; 
CHECK-NEXT:.b32 2100 // DW_AT_type -; CHECK-NEXT:.b8 7 // Abbrev [7] 0x145d:0x5 DW_TAG_formal_parameter -; CHECK-NEXT:.b32 4585 // DW_AT_type +; CHECK-NEXT:.b8 7 // Abbrev [7] 0x14a0:0x5 DW_TAG_formal_parameter +; CHECK-NEXT:.b32 2172 // DW_AT_type +; CHECK-NEXT:.b8 7 // Abbrev [7] 0x14a5:0x5 DW_TAG_formal_parameter +; CHECK-NEXT:.b32 4657 // DW_AT_type ; CHECK-NEXT:.b8 0 // End Of Children Mark -; CHECK-NEXT:.b8 33 // Abbrev [33] 0x1463:0x21 DW_TAG_subprogram +; CHECK-NEXT:.b8 33 // Abbrev [33] 0x14ab:0x21 DW_TAG_subprogram ; CHECK-NEXT:.b8 95 // DW_AT_MIPS_linkage_name ; CHECK-NEXT:.b8 90 ; CHECK-NEXT:.b8 76 @@ -4813,12 +4878,12 @@ if.end: ; preds = %if.then, %entry ; CHECK-NEXT:.b8 0 ; CHECK-NEXT:.b8 4 // DW_AT_decl_file ; CHECK-NEXT:.b8 125 // DW_AT_decl_line -; CHECK-NEXT:.b32 2100 // DW_AT_type +; CHECK-NEXT:.b32 2172 // DW_AT_type ; CHECK-NEXT:.b8 1 // DW_AT_declaration -; CHECK-NEXT:.b8 7 // Abbrev [7] 0x147e:0x5 DW_TAG_formal_parameter -; CHECK-NEXT:.b32 2100 // DW_AT_type +; CHECK-NEXT:.b8 7 // Abbrev [7] 0x14c6:0x5 DW_TAG_formal_parameter +; CHECK-NEXT:.b32 2172 // DW_AT_type ; CHECK-NEXT:.b8 0 // End Of Children Mark -; CHECK-NEXT:.b8 33 // Abbrev [33] 0x1484:0x1f DW_TAG_subprogram +; CHECK-NEXT:.b8 33 // Abbrev [33] 0x14cc:0x1f DW_TAG_subprogram ; CHECK-NEXT:.b8 95 // DW_AT_MIPS_linkage_name ; CHECK-NEXT:.b8 90 ; CHECK-NEXT:.b8 76 @@ -4838,12 +4903,12 @@ if.end: ; preds = %if.then, %entry ; CHECK-NEXT:.b8 0 ; CHECK-NEXT:.b8 4 // DW_AT_decl_file ; CHECK-NEXT:.b8 126 // DW_AT_decl_line -; CHECK-NEXT:.b32 3770 // DW_AT_type +; CHECK-NEXT:.b32 3842 // DW_AT_type ; CHECK-NEXT:.b8 1 // DW_AT_declaration -; CHECK-NEXT:.b8 7 // Abbrev [7] 0x149d:0x5 DW_TAG_formal_parameter -; CHECK-NEXT:.b32 3770 // DW_AT_type +; CHECK-NEXT:.b8 7 // Abbrev [7] 0x14e5:0x5 DW_TAG_formal_parameter +; CHECK-NEXT:.b32 3842 // DW_AT_type ; CHECK-NEXT:.b8 0 // End Of Children Mark -; CHECK-NEXT:.b8 33 // Abbrev [33] 0x14a3:0x21 DW_TAG_subprogram +; CHECK-NEXT:.b8 33 // Abbrev [33] 
0x14eb:0x21 DW_TAG_subprogram ; CHECK-NEXT:.b8 95 // DW_AT_MIPS_linkage_name ; CHECK-NEXT:.b8 90 ; CHECK-NEXT:.b8 76 @@ -4865,12 +4930,12 @@ if.end: ; preds = %if.then, %entry ; CHECK-NEXT:.b8 0 ; CHECK-NEXT:.b8 4 // DW_AT_decl_file ; CHECK-NEXT:.b8 128 // DW_AT_decl_line -; CHECK-NEXT:.b32 3770 // DW_AT_type +; CHECK-NEXT:.b32 3842 // DW_AT_type ; CHECK-NEXT:.b8 1 // DW_AT_declaration -; CHECK-NEXT:.b8 7 // Abbrev [7] 0x14be:0x5 DW_TAG_formal_parameter -; CHECK-NEXT:.b32 2100 // DW_AT_type +; CHECK-NEXT:.b8 7 // Abbrev [7] 0x1506:0x5 DW_TAG_formal_parameter +; CHECK-NEXT:.b32 2172 // DW_AT_type ; CHECK-NEXT:.b8 0 // End Of Children Mark -; CHECK-NEXT:.b8 33 // Abbrev [33] 0x14c4:0x1b DW_TAG_subprogram +; CHECK-NEXT:.b8 33 // Abbrev [33] 0x150c:0x1b DW_TAG_subprogram ; CHECK-NEXT:.b8 95 // DW_AT_MIPS_linkage_name ; CHECK-NEXT:.b8 90 ; CHECK-NEXT:.b8 76 @@ -4886,12 +4951,12 @@ if.end: ; preds = %if.then, %entry ; CHECK-NEXT:.b8 0 ; CHECK-NEXT:.b8 4 // DW_AT_decl_file ; CHECK-NEXT:.b8 138 // DW_AT_decl_line -; CHECK-NEXT:.b32 2100 // DW_AT_type +; CHECK-NEXT:.b32 2172 // DW_AT_type ; CHECK-NEXT:.b8 1 // DW_AT_declaration -; CHECK-NEXT:.b8 7 // Abbrev [7] 0x14d9:0x5 DW_TAG_formal_parameter -; CHECK-NEXT:.b32 2100 // DW_AT_type +; CHECK-NEXT:.b8 7 // Abbrev [7] 0x1521:0x5 DW_TAG_formal_parameter +; CHECK-NEXT:.b32 2172 // DW_AT_type ; CHECK-NEXT:.b8 0 // End Of Children Mark -; CHECK-NEXT:.b8 33 // Abbrev [33] 0x14df:0x1f DW_TAG_subprogram +; CHECK-NEXT:.b8 33 // Abbrev [33] 0x1527:0x1f DW_TAG_subprogram ; CHECK-NEXT:.b8 95 // DW_AT_MIPS_linkage_name ; CHECK-NEXT:.b8 90 ; CHECK-NEXT:.b8 76 @@ -4911,12 +4976,12 @@ if.end: ; preds = %if.then, %entry ; CHECK-NEXT:.b8 0 ; CHECK-NEXT:.b8 4 // DW_AT_decl_file ; CHECK-NEXT:.b8 130 // DW_AT_decl_line -; CHECK-NEXT:.b32 2100 // DW_AT_type +; CHECK-NEXT:.b32 2172 // DW_AT_type ; CHECK-NEXT:.b8 1 // DW_AT_declaration -; CHECK-NEXT:.b8 7 // Abbrev [7] 0x14f8:0x5 DW_TAG_formal_parameter -; CHECK-NEXT:.b32 2100 // DW_AT_type +; 
CHECK-NEXT:.b8 7 // Abbrev [7] 0x1540:0x5 DW_TAG_formal_parameter +; CHECK-NEXT:.b32 2172 // DW_AT_type ; CHECK-NEXT:.b8 0 // End Of Children Mark -; CHECK-NEXT:.b8 33 // Abbrev [33] 0x14fe:0x1f DW_TAG_subprogram +; CHECK-NEXT:.b8 33 // Abbrev [33] 0x1546:0x1f DW_TAG_subprogram ; CHECK-NEXT:.b8 95 // DW_AT_MIPS_linkage_name ; CHECK-NEXT:.b8 90 ; CHECK-NEXT:.b8 76 @@ -4936,12 +5001,12 @@ if.end: ; preds = %if.then, %entry ; CHECK-NEXT:.b8 0 ; CHECK-NEXT:.b8 4 // DW_AT_decl_file ; CHECK-NEXT:.b8 132 // DW_AT_decl_line -; CHECK-NEXT:.b32 2100 // DW_AT_type +; CHECK-NEXT:.b32 2172 // DW_AT_type ; CHECK-NEXT:.b8 1 // DW_AT_declaration -; CHECK-NEXT:.b8 7 // Abbrev [7] 0x1517:0x5 DW_TAG_formal_parameter -; CHECK-NEXT:.b32 2100 // DW_AT_type +; CHECK-NEXT:.b8 7 // Abbrev [7] 0x155f:0x5 DW_TAG_formal_parameter +; CHECK-NEXT:.b32 2172 // DW_AT_type ; CHECK-NEXT:.b8 0 // End Of Children Mark -; CHECK-NEXT:.b8 33 // Abbrev [33] 0x151d:0x1d DW_TAG_subprogram +; CHECK-NEXT:.b8 33 // Abbrev [33] 0x1565:0x1d DW_TAG_subprogram ; CHECK-NEXT:.b8 95 // DW_AT_MIPS_linkage_name ; CHECK-NEXT:.b8 90 ; CHECK-NEXT:.b8 76 @@ -4959,12 +5024,12 @@ if.end: ; preds = %if.then, %entry ; CHECK-NEXT:.b8 0 ; CHECK-NEXT:.b8 4 // DW_AT_decl_file ; CHECK-NEXT:.b8 134 // DW_AT_decl_line -; CHECK-NEXT:.b32 2100 // DW_AT_type +; CHECK-NEXT:.b32 2172 // DW_AT_type ; CHECK-NEXT:.b8 1 // DW_AT_declaration -; CHECK-NEXT:.b8 7 // Abbrev [7] 0x1534:0x5 DW_TAG_formal_parameter -; CHECK-NEXT:.b32 2100 // DW_AT_type +; CHECK-NEXT:.b8 7 // Abbrev [7] 0x157c:0x5 DW_TAG_formal_parameter +; CHECK-NEXT:.b32 2172 // DW_AT_type ; CHECK-NEXT:.b8 0 // End Of Children Mark -; CHECK-NEXT:.b8 33 // Abbrev [33] 0x153a:0x1d DW_TAG_subprogram +; CHECK-NEXT:.b8 33 // Abbrev [33] 0x1582:0x1d DW_TAG_subprogram ; CHECK-NEXT:.b8 95 // DW_AT_MIPS_linkage_name ; CHECK-NEXT:.b8 90 ; CHECK-NEXT:.b8 76 @@ -4982,12 +5047,12 @@ if.end: ; preds = %if.then, %entry ; CHECK-NEXT:.b8 0 ; CHECK-NEXT:.b8 4 // DW_AT_decl_file ; CHECK-NEXT:.b8 136 
// DW_AT_decl_line -; CHECK-NEXT:.b32 2100 // DW_AT_type +; CHECK-NEXT:.b32 2172 // DW_AT_type ; CHECK-NEXT:.b8 1 // DW_AT_declaration -; CHECK-NEXT:.b8 7 // Abbrev [7] 0x1551:0x5 DW_TAG_formal_parameter -; CHECK-NEXT:.b32 2100 // DW_AT_type +; CHECK-NEXT:.b8 7 // Abbrev [7] 0x1599:0x5 DW_TAG_formal_parameter +; CHECK-NEXT:.b32 2172 // DW_AT_type ; CHECK-NEXT:.b8 0 // End Of Children Mark -; CHECK-NEXT:.b8 33 // Abbrev [33] 0x1557:0x1f DW_TAG_subprogram +; CHECK-NEXT:.b8 33 // Abbrev [33] 0x159f:0x1f DW_TAG_subprogram ; CHECK-NEXT:.b8 95 // DW_AT_MIPS_linkage_name ; CHECK-NEXT:.b8 90 ; CHECK-NEXT:.b8 76 @@ -5007,12 +5072,12 @@ if.end: ; preds = %if.then, %entry ; CHECK-NEXT:.b8 0 ; CHECK-NEXT:.b8 4 // DW_AT_decl_file ; CHECK-NEXT:.b8 140 // DW_AT_decl_line -; CHECK-NEXT:.b32 5170 // DW_AT_type +; CHECK-NEXT:.b32 5242 // DW_AT_type ; CHECK-NEXT:.b8 1 // DW_AT_declaration -; CHECK-NEXT:.b8 7 // Abbrev [7] 0x1570:0x5 DW_TAG_formal_parameter -; CHECK-NEXT:.b32 2100 // DW_AT_type +; CHECK-NEXT:.b8 7 // Abbrev [7] 0x15b8:0x5 DW_TAG_formal_parameter +; CHECK-NEXT:.b32 2172 // DW_AT_type ; CHECK-NEXT:.b8 0 // End Of Children Mark -; CHECK-NEXT:.b8 33 // Abbrev [33] 0x1576:0x21 DW_TAG_subprogram +; CHECK-NEXT:.b8 33 // Abbrev [33] 0x15be:0x21 DW_TAG_subprogram ; CHECK-NEXT:.b8 95 // DW_AT_MIPS_linkage_name ; CHECK-NEXT:.b8 90 ; CHECK-NEXT:.b8 76 @@ -5034,12 +5099,12 @@ if.end: ; preds = %if.then, %entry ; CHECK-NEXT:.b8 0 ; CHECK-NEXT:.b8 4 // DW_AT_decl_file ; CHECK-NEXT:.b8 142 // DW_AT_decl_line -; CHECK-NEXT:.b32 5170 // DW_AT_type +; CHECK-NEXT:.b32 5242 // DW_AT_type ; CHECK-NEXT:.b8 1 // DW_AT_declaration -; CHECK-NEXT:.b8 7 // Abbrev [7] 0x1591:0x5 DW_TAG_formal_parameter -; CHECK-NEXT:.b32 2100 // DW_AT_type +; CHECK-NEXT:.b8 7 // Abbrev [7] 0x15d9:0x5 DW_TAG_formal_parameter +; CHECK-NEXT:.b32 2172 // DW_AT_type ; CHECK-NEXT:.b8 0 // End Of Children Mark -; CHECK-NEXT:.b8 33 // Abbrev [33] 0x1597:0x23 DW_TAG_subprogram +; CHECK-NEXT:.b8 33 // Abbrev [33] 
0x15df:0x23 DW_TAG_subprogram ; CHECK-NEXT:.b8 95 // DW_AT_MIPS_linkage_name ; CHECK-NEXT:.b8 90 ; CHECK-NEXT:.b8 76 @@ -5063,12 +5128,12 @@ if.end: ; preds = %if.then, %entry ; CHECK-NEXT:.b8 0 ; CHECK-NEXT:.b8 4 // DW_AT_decl_file ; CHECK-NEXT:.b8 143 // DW_AT_decl_line -; CHECK-NEXT:.b32 3770 // DW_AT_type +; CHECK-NEXT:.b32 3842 // DW_AT_type ; CHECK-NEXT:.b8 1 // DW_AT_declaration -; CHECK-NEXT:.b8 7 // Abbrev [7] 0x15b4:0x5 DW_TAG_formal_parameter -; CHECK-NEXT:.b32 2100 // DW_AT_type +; CHECK-NEXT:.b8 7 // Abbrev [7] 0x15fc:0x5 DW_TAG_formal_parameter +; CHECK-NEXT:.b32 2172 // DW_AT_type ; CHECK-NEXT:.b8 0 // End Of Children Mark -; CHECK-NEXT:.b8 33 // Abbrev [33] 0x15ba:0x24 DW_TAG_subprogram +; CHECK-NEXT:.b8 33 // Abbrev [33] 0x1602:0x24 DW_TAG_subprogram ; CHECK-NEXT:.b8 95 // DW_AT_MIPS_linkage_name ; CHECK-NEXT:.b8 90 ; CHECK-NEXT:.b8 76 @@ -5088,14 +5153,14 @@ if.end: ; preds = %if.then, %entry ; CHECK-NEXT:.b8 0 ; CHECK-NEXT:.b8 4 // DW_AT_decl_file ; CHECK-NEXT:.b8 145 // DW_AT_decl_line -; CHECK-NEXT:.b32 2100 // DW_AT_type +; CHECK-NEXT:.b32 2172 // DW_AT_type ; CHECK-NEXT:.b8 1 // DW_AT_declaration -; CHECK-NEXT:.b8 7 // Abbrev [7] 0x15d3:0x5 DW_TAG_formal_parameter -; CHECK-NEXT:.b32 2100 // DW_AT_type -; CHECK-NEXT:.b8 7 // Abbrev [7] 0x15d8:0x5 DW_TAG_formal_parameter -; CHECK-NEXT:.b32 2109 // DW_AT_type +; CHECK-NEXT:.b8 7 // Abbrev [7] 0x161b:0x5 DW_TAG_formal_parameter +; CHECK-NEXT:.b32 2172 // DW_AT_type +; CHECK-NEXT:.b8 7 // Abbrev [7] 0x1620:0x5 DW_TAG_formal_parameter +; CHECK-NEXT:.b32 2181 // DW_AT_type ; CHECK-NEXT:.b8 0 // End Of Children Mark -; CHECK-NEXT:.b8 33 // Abbrev [33] 0x15de:0x1d DW_TAG_subprogram +; CHECK-NEXT:.b8 33 // Abbrev [33] 0x1626:0x1d DW_TAG_subprogram ; CHECK-NEXT:.b8 95 // DW_AT_MIPS_linkage_name ; CHECK-NEXT:.b8 90 ; CHECK-NEXT:.b8 76 @@ -5113,12 +5178,12 @@ if.end: ; preds = %if.then, %entry ; CHECK-NEXT:.b8 0 ; CHECK-NEXT:.b8 4 // DW_AT_decl_file ; CHECK-NEXT:.b8 146 // DW_AT_decl_line -; 
CHECK-NEXT:.b32 5627 // DW_AT_type +; CHECK-NEXT:.b32 5699 // DW_AT_type ; CHECK-NEXT:.b8 1 // DW_AT_declaration -; CHECK-NEXT:.b8 7 // Abbrev [7] 0x15f5:0x5 DW_TAG_formal_parameter -; CHECK-NEXT:.b32 5637 // DW_AT_type +; CHECK-NEXT:.b8 7 // Abbrev [7] 0x163d:0x5 DW_TAG_formal_parameter +; CHECK-NEXT:.b32 5709 // DW_AT_type ; CHECK-NEXT:.b8 0 // End Of Children Mark -; CHECK-NEXT:.b8 10 // Abbrev [10] 0x15fb:0xa DW_TAG_base_type +; CHECK-NEXT:.b8 10 // Abbrev [10] 0x1643:0xa DW_TAG_base_type ; CHECK-NEXT:.b8 100 // DW_AT_name ; CHECK-NEXT:.b8 111 ; CHECK-NEXT:.b8 117 @@ -5128,11 +5193,11 @@ if.end: ; preds = %if.then, %entry ; CHECK-NEXT:.b8 0 ; CHECK-NEXT:.b8 4 // DW_AT_encoding ; CHECK-NEXT:.b8 8 // DW_AT_byte_size -; CHECK-NEXT:.b8 12 // Abbrev [12] 0x1605:0x5 DW_TAG_pointer_type -; CHECK-NEXT:.b32 5642 // DW_AT_type -; CHECK-NEXT:.b8 13 // Abbrev [13] 0x160a:0x5 DW_TAG_const_type -; CHECK-NEXT:.b32 5647 // DW_AT_type -; CHECK-NEXT:.b8 10 // Abbrev [10] 0x160f:0x8 DW_TAG_base_type +; CHECK-NEXT:.b8 12 // Abbrev [12] 0x164d:0x5 DW_TAG_pointer_type +; CHECK-NEXT:.b32 5714 // DW_AT_type +; CHECK-NEXT:.b8 13 // Abbrev [13] 0x1652:0x5 DW_TAG_const_type +; CHECK-NEXT:.b32 5719 // DW_AT_type +; CHECK-NEXT:.b8 10 // Abbrev [10] 0x1657:0x8 DW_TAG_base_type ; CHECK-NEXT:.b8 99 // DW_AT_name ; CHECK-NEXT:.b8 104 ; CHECK-NEXT:.b8 97 @@ -5140,7 +5205,7 @@ if.end: ; preds = %if.then, %entry ; CHECK-NEXT:.b8 0 ; CHECK-NEXT:.b8 8 // DW_AT_encoding ; CHECK-NEXT:.b8 1 // DW_AT_byte_size -; CHECK-NEXT:.b8 33 // Abbrev [33] 0x1617:0x1f DW_TAG_subprogram +; CHECK-NEXT:.b8 33 // Abbrev [33] 0x165f:0x1f DW_TAG_subprogram ; CHECK-NEXT:.b8 95 // DW_AT_MIPS_linkage_name ; CHECK-NEXT:.b8 90 ; CHECK-NEXT:.b8 76 @@ -5160,12 +5225,12 @@ if.end: ; preds = %if.then, %entry ; CHECK-NEXT:.b8 0 ; CHECK-NEXT:.b8 4 // DW_AT_decl_file ; CHECK-NEXT:.b8 147 // DW_AT_decl_line -; CHECK-NEXT:.b32 2100 // DW_AT_type +; CHECK-NEXT:.b32 2172 // DW_AT_type ; CHECK-NEXT:.b8 1 // DW_AT_declaration -; 
CHECK-NEXT:.b8 7 // Abbrev [7] 0x1630:0x5 DW_TAG_formal_parameter -; CHECK-NEXT:.b32 5637 // DW_AT_type +; CHECK-NEXT:.b8 7 // Abbrev [7] 0x1678:0x5 DW_TAG_formal_parameter +; CHECK-NEXT:.b32 5709 // DW_AT_type ; CHECK-NEXT:.b8 0 // End Of Children Mark -; CHECK-NEXT:.b8 33 // Abbrev [33] 0x1636:0x27 DW_TAG_subprogram +; CHECK-NEXT:.b8 33 // Abbrev [33] 0x167e:0x27 DW_TAG_subprogram ; CHECK-NEXT:.b8 95 // DW_AT_MIPS_linkage_name ; CHECK-NEXT:.b8 90 ; CHECK-NEXT:.b8 76 @@ -5193,12 +5258,12 @@ if.end: ; preds = %if.then, %entry ; CHECK-NEXT:.b8 0 ; CHECK-NEXT:.b8 4 // DW_AT_decl_file ; CHECK-NEXT:.b8 149 // DW_AT_decl_line -; CHECK-NEXT:.b32 2100 // DW_AT_type +; CHECK-NEXT:.b32 2172 // DW_AT_type ; CHECK-NEXT:.b8 1 // DW_AT_declaration -; CHECK-NEXT:.b8 7 // Abbrev [7] 0x1657:0x5 DW_TAG_formal_parameter -; CHECK-NEXT:.b32 2100 // DW_AT_type +; CHECK-NEXT:.b8 7 // Abbrev [7] 0x169f:0x5 DW_TAG_formal_parameter +; CHECK-NEXT:.b32 2172 // DW_AT_type ; CHECK-NEXT:.b8 0 // End Of Children Mark -; CHECK-NEXT:.b8 33 // Abbrev [33] 0x165d:0x2d DW_TAG_subprogram +; CHECK-NEXT:.b8 33 // Abbrev [33] 0x16a5:0x2d DW_TAG_subprogram ; CHECK-NEXT:.b8 95 // DW_AT_MIPS_linkage_name ; CHECK-NEXT:.b8 90 ; CHECK-NEXT:.b8 76 @@ -5227,14 +5292,14 @@ if.end: ; preds = %if.then, %entry ; CHECK-NEXT:.b8 0 ; CHECK-NEXT:.b8 4 // DW_AT_decl_file ; CHECK-NEXT:.b8 151 // DW_AT_decl_line -; CHECK-NEXT:.b32 2100 // DW_AT_type +; CHECK-NEXT:.b32 2172 // DW_AT_type ; CHECK-NEXT:.b8 1 // DW_AT_declaration -; CHECK-NEXT:.b8 7 // Abbrev [7] 0x167f:0x5 DW_TAG_formal_parameter -; CHECK-NEXT:.b32 2100 // DW_AT_type -; CHECK-NEXT:.b8 7 // Abbrev [7] 0x1684:0x5 DW_TAG_formal_parameter -; CHECK-NEXT:.b32 2100 // DW_AT_type +; CHECK-NEXT:.b8 7 // Abbrev [7] 0x16c7:0x5 DW_TAG_formal_parameter +; CHECK-NEXT:.b32 2172 // DW_AT_type +; CHECK-NEXT:.b8 7 // Abbrev [7] 0x16cc:0x5 DW_TAG_formal_parameter +; CHECK-NEXT:.b32 2172 // DW_AT_type ; CHECK-NEXT:.b8 0 // End Of Children Mark -; CHECK-NEXT:.b8 33 // Abbrev [33] 
0x168a:0x21 DW_TAG_subprogram +; CHECK-NEXT:.b8 33 // Abbrev [33] 0x16d2:0x21 DW_TAG_subprogram ; CHECK-NEXT:.b8 95 // DW_AT_MIPS_linkage_name ; CHECK-NEXT:.b8 90 ; CHECK-NEXT:.b8 76 @@ -5251,14 +5316,14 @@ if.end: ; preds = %if.then, %entry ; CHECK-NEXT:.b8 0 ; CHECK-NEXT:.b8 4 // DW_AT_decl_file ; CHECK-NEXT:.b8 155 // DW_AT_decl_line -; CHECK-NEXT:.b32 2100 // DW_AT_type +; CHECK-NEXT:.b32 2172 // DW_AT_type ; CHECK-NEXT:.b8 1 // DW_AT_declaration -; CHECK-NEXT:.b8 7 // Abbrev [7] 0x16a0:0x5 DW_TAG_formal_parameter -; CHECK-NEXT:.b32 2100 // DW_AT_type -; CHECK-NEXT:.b8 7 // Abbrev [7] 0x16a5:0x5 DW_TAG_formal_parameter -; CHECK-NEXT:.b32 4585 // DW_AT_type +; CHECK-NEXT:.b8 7 // Abbrev [7] 0x16e8:0x5 DW_TAG_formal_parameter +; CHECK-NEXT:.b32 2172 // DW_AT_type +; CHECK-NEXT:.b8 7 // Abbrev [7] 0x16ed:0x5 DW_TAG_formal_parameter +; CHECK-NEXT:.b32 4657 // DW_AT_type ; CHECK-NEXT:.b8 0 // End Of Children Mark -; CHECK-NEXT:.b8 33 // Abbrev [33] 0x16ab:0x2d DW_TAG_subprogram +; CHECK-NEXT:.b8 33 // Abbrev [33] 0x16f3:0x2d DW_TAG_subprogram ; CHECK-NEXT:.b8 95 // DW_AT_MIPS_linkage_name ; CHECK-NEXT:.b8 90 ; CHECK-NEXT:.b8 76 @@ -5287,14 +5352,14 @@ if.end: ; preds = %if.then, %entry ; CHECK-NEXT:.b8 0 ; CHECK-NEXT:.b8 4 // DW_AT_decl_file ; CHECK-NEXT:.b8 157 // DW_AT_decl_line -; CHECK-NEXT:.b32 2100 // DW_AT_type +; CHECK-NEXT:.b32 2172 // DW_AT_type ; CHECK-NEXT:.b8 1 // DW_AT_declaration -; CHECK-NEXT:.b8 7 // Abbrev [7] 0x16cd:0x5 DW_TAG_formal_parameter -; CHECK-NEXT:.b32 2100 // DW_AT_type -; CHECK-NEXT:.b8 7 // Abbrev [7] 0x16d2:0x5 DW_TAG_formal_parameter -; CHECK-NEXT:.b32 2100 // DW_AT_type +; CHECK-NEXT:.b8 7 // Abbrev [7] 0x1715:0x5 DW_TAG_formal_parameter +; CHECK-NEXT:.b32 2172 // DW_AT_type +; CHECK-NEXT:.b8 7 // Abbrev [7] 0x171a:0x5 DW_TAG_formal_parameter +; CHECK-NEXT:.b32 2172 // DW_AT_type ; CHECK-NEXT:.b8 0 // End Of Children Mark -; CHECK-NEXT:.b8 33 // Abbrev [33] 0x16d8:0x2e DW_TAG_subprogram +; CHECK-NEXT:.b8 33 // Abbrev [33] 
0x1720:0x2e DW_TAG_subprogram ; CHECK-NEXT:.b8 95 // DW_AT_MIPS_linkage_name ; CHECK-NEXT:.b8 90 ; CHECK-NEXT:.b8 76 @@ -5319,16 +5384,16 @@ if.end: ; preds = %if.then, %entry ; CHECK-NEXT:.b8 0 ; CHECK-NEXT:.b8 4 // DW_AT_decl_file ; CHECK-NEXT:.b8 159 // DW_AT_decl_line -; CHECK-NEXT:.b32 2100 // DW_AT_type +; CHECK-NEXT:.b32 2172 // DW_AT_type ; CHECK-NEXT:.b8 1 // DW_AT_declaration -; CHECK-NEXT:.b8 7 // Abbrev [7] 0x16f6:0x5 DW_TAG_formal_parameter -; CHECK-NEXT:.b32 2100 // DW_AT_type -; CHECK-NEXT:.b8 7 // Abbrev [7] 0x16fb:0x5 DW_TAG_formal_parameter -; CHECK-NEXT:.b32 2100 // DW_AT_type -; CHECK-NEXT:.b8 7 // Abbrev [7] 0x1700:0x5 DW_TAG_formal_parameter -; CHECK-NEXT:.b32 4630 // DW_AT_type +; CHECK-NEXT:.b8 7 // Abbrev [7] 0x173e:0x5 DW_TAG_formal_parameter +; CHECK-NEXT:.b32 2172 // DW_AT_type +; CHECK-NEXT:.b8 7 // Abbrev [7] 0x1743:0x5 DW_TAG_formal_parameter +; CHECK-NEXT:.b32 2172 // DW_AT_type +; CHECK-NEXT:.b8 7 // Abbrev [7] 0x1748:0x5 DW_TAG_formal_parameter +; CHECK-NEXT:.b32 4702 // DW_AT_type ; CHECK-NEXT:.b8 0 // End Of Children Mark -; CHECK-NEXT:.b8 33 // Abbrev [33] 0x1706:0x1d DW_TAG_subprogram +; CHECK-NEXT:.b8 33 // Abbrev [33] 0x174e:0x1d DW_TAG_subprogram ; CHECK-NEXT:.b8 95 // DW_AT_MIPS_linkage_name ; CHECK-NEXT:.b8 90 ; CHECK-NEXT:.b8 76 @@ -5346,12 +5411,12 @@ if.end: ; preds = %if.then, %entry ; CHECK-NEXT:.b8 0 ; CHECK-NEXT:.b8 4 // DW_AT_decl_file ; CHECK-NEXT:.b8 161 // DW_AT_decl_line -; CHECK-NEXT:.b32 2100 // DW_AT_type +; CHECK-NEXT:.b32 2172 // DW_AT_type ; CHECK-NEXT:.b8 1 // DW_AT_declaration -; CHECK-NEXT:.b8 7 // Abbrev [7] 0x171d:0x5 DW_TAG_formal_parameter -; CHECK-NEXT:.b32 2100 // DW_AT_type +; CHECK-NEXT:.b8 7 // Abbrev [7] 0x1765:0x5 DW_TAG_formal_parameter +; CHECK-NEXT:.b32 2172 // DW_AT_type ; CHECK-NEXT:.b8 0 // End Of Children Mark -; CHECK-NEXT:.b8 33 // Abbrev [33] 0x1723:0x1f DW_TAG_subprogram +; CHECK-NEXT:.b8 33 // Abbrev [33] 0x176b:0x1f DW_TAG_subprogram ; CHECK-NEXT:.b8 95 // 
DW_AT_MIPS_linkage_name ; CHECK-NEXT:.b8 90 ; CHECK-NEXT:.b8 76 @@ -5371,12 +5436,12 @@ if.end: ; preds = %if.then, %entry ; CHECK-NEXT:.b8 0 ; CHECK-NEXT:.b8 4 // DW_AT_decl_file ; CHECK-NEXT:.b8 163 // DW_AT_decl_line -; CHECK-NEXT:.b32 2100 // DW_AT_type +; CHECK-NEXT:.b32 2172 // DW_AT_type ; CHECK-NEXT:.b8 1 // DW_AT_declaration -; CHECK-NEXT:.b8 7 // Abbrev [7] 0x173c:0x5 DW_TAG_formal_parameter -; CHECK-NEXT:.b32 2100 // DW_AT_type +; CHECK-NEXT:.b8 7 // Abbrev [7] 0x1784:0x5 DW_TAG_formal_parameter +; CHECK-NEXT:.b32 2172 // DW_AT_type ; CHECK-NEXT:.b8 0 // End Of Children Mark -; CHECK-NEXT:.b8 33 // Abbrev [33] 0x1742:0x29 DW_TAG_subprogram +; CHECK-NEXT:.b8 33 // Abbrev [33] 0x178a:0x29 DW_TAG_subprogram ; CHECK-NEXT:.b8 95 // DW_AT_MIPS_linkage_name ; CHECK-NEXT:.b8 90 ; CHECK-NEXT:.b8 76 @@ -5401,14 +5466,14 @@ if.end: ; preds = %if.then, %entry ; CHECK-NEXT:.b8 0 ; CHECK-NEXT:.b8 4 // DW_AT_decl_file ; CHECK-NEXT:.b8 165 // DW_AT_decl_line -; CHECK-NEXT:.b32 2100 // DW_AT_type +; CHECK-NEXT:.b32 2172 // DW_AT_type ; CHECK-NEXT:.b8 1 // DW_AT_declaration -; CHECK-NEXT:.b8 7 // Abbrev [7] 0x1760:0x5 DW_TAG_formal_parameter -; CHECK-NEXT:.b32 2100 // DW_AT_type -; CHECK-NEXT:.b8 7 // Abbrev [7] 0x1765:0x5 DW_TAG_formal_parameter -; CHECK-NEXT:.b32 5170 // DW_AT_type +; CHECK-NEXT:.b8 7 // Abbrev [7] 0x17a8:0x5 DW_TAG_formal_parameter +; CHECK-NEXT:.b32 2172 // DW_AT_type +; CHECK-NEXT:.b8 7 // Abbrev [7] 0x17ad:0x5 DW_TAG_formal_parameter +; CHECK-NEXT:.b32 5242 // DW_AT_type ; CHECK-NEXT:.b8 0 // End Of Children Mark -; CHECK-NEXT:.b8 33 // Abbrev [33] 0x176b:0x27 DW_TAG_subprogram +; CHECK-NEXT:.b8 33 // Abbrev [33] 0x17b3:0x27 DW_TAG_subprogram ; CHECK-NEXT:.b8 95 // DW_AT_MIPS_linkage_name ; CHECK-NEXT:.b8 90 ; CHECK-NEXT:.b8 76 @@ -5431,14 +5496,14 @@ if.end: ; preds = %if.then, %entry ; CHECK-NEXT:.b8 0 ; CHECK-NEXT:.b8 4 // DW_AT_decl_file ; CHECK-NEXT:.b8 167 // DW_AT_decl_line -; CHECK-NEXT:.b32 2100 // DW_AT_type +; CHECK-NEXT:.b32 2172 // 
DW_AT_type ; CHECK-NEXT:.b8 1 // DW_AT_declaration -; CHECK-NEXT:.b8 7 // Abbrev [7] 0x1787:0x5 DW_TAG_formal_parameter -; CHECK-NEXT:.b32 2100 // DW_AT_type -; CHECK-NEXT:.b8 7 // Abbrev [7] 0x178c:0x5 DW_TAG_formal_parameter -; CHECK-NEXT:.b32 4585 // DW_AT_type +; CHECK-NEXT:.b8 7 // Abbrev [7] 0x17cf:0x5 DW_TAG_formal_parameter +; CHECK-NEXT:.b32 2172 // DW_AT_type +; CHECK-NEXT:.b8 7 // Abbrev [7] 0x17d4:0x5 DW_TAG_formal_parameter +; CHECK-NEXT:.b32 4657 // DW_AT_type ; CHECK-NEXT:.b8 0 // End Of Children Mark -; CHECK-NEXT:.b8 33 // Abbrev [33] 0x1792:0x23 DW_TAG_subprogram +; CHECK-NEXT:.b8 33 // Abbrev [33] 0x17da:0x23 DW_TAG_subprogram ; CHECK-NEXT:.b8 95 // DW_AT_MIPS_linkage_name ; CHECK-NEXT:.b8 90 ; CHECK-NEXT:.b8 76 @@ -5462,12 +5527,12 @@ if.end: ; preds = %if.then, %entry ; CHECK-NEXT:.b8 0 ; CHECK-NEXT:.b8 4 // DW_AT_decl_file ; CHECK-NEXT:.b8 169 // DW_AT_decl_line -; CHECK-NEXT:.b32 4740 // DW_AT_type +; CHECK-NEXT:.b32 4812 // DW_AT_type ; CHECK-NEXT:.b8 1 // DW_AT_declaration -; CHECK-NEXT:.b8 7 // Abbrev [7] 0x17af:0x5 DW_TAG_formal_parameter -; CHECK-NEXT:.b32 2100 // DW_AT_type +; CHECK-NEXT:.b8 7 // Abbrev [7] 0x17f7:0x5 DW_TAG_formal_parameter +; CHECK-NEXT:.b32 2172 // DW_AT_type ; CHECK-NEXT:.b8 0 // End Of Children Mark -; CHECK-NEXT:.b8 33 // Abbrev [33] 0x17b5:0x1b DW_TAG_subprogram +; CHECK-NEXT:.b8 33 // Abbrev [33] 0x17fd:0x1b DW_TAG_subprogram ; CHECK-NEXT:.b8 95 // DW_AT_MIPS_linkage_name ; CHECK-NEXT:.b8 90 ; CHECK-NEXT:.b8 76 @@ -5483,12 +5548,12 @@ if.end: ; preds = %if.then, %entry ; CHECK-NEXT:.b8 0 ; CHECK-NEXT:.b8 4 // DW_AT_decl_file ; CHECK-NEXT:.b8 171 // DW_AT_decl_line -; CHECK-NEXT:.b32 2100 // DW_AT_type +; CHECK-NEXT:.b32 2172 // DW_AT_type ; CHECK-NEXT:.b8 1 // DW_AT_declaration -; CHECK-NEXT:.b8 7 // Abbrev [7] 0x17ca:0x5 DW_TAG_formal_parameter -; CHECK-NEXT:.b32 2100 // DW_AT_type +; CHECK-NEXT:.b8 7 // Abbrev [7] 0x1812:0x5 DW_TAG_formal_parameter +; CHECK-NEXT:.b32 2172 // DW_AT_type ; CHECK-NEXT:.b8 0 // 
End Of Children Mark -; CHECK-NEXT:.b8 33 // Abbrev [33] 0x17d0:0x1d DW_TAG_subprogram +; CHECK-NEXT:.b8 33 // Abbrev [33] 0x1818:0x1d DW_TAG_subprogram ; CHECK-NEXT:.b8 95 // DW_AT_MIPS_linkage_name ; CHECK-NEXT:.b8 90 ; CHECK-NEXT:.b8 76 @@ -5506,12 +5571,12 @@ if.end: ; preds = %if.then, %entry ; CHECK-NEXT:.b8 0 ; CHECK-NEXT:.b8 4 // DW_AT_decl_file ; CHECK-NEXT:.b8 173 // DW_AT_decl_line -; CHECK-NEXT:.b32 2100 // DW_AT_type +; CHECK-NEXT:.b32 2172 // DW_AT_type ; CHECK-NEXT:.b8 1 // DW_AT_declaration -; CHECK-NEXT:.b8 7 // Abbrev [7] 0x17e7:0x5 DW_TAG_formal_parameter -; CHECK-NEXT:.b32 2100 // DW_AT_type +; CHECK-NEXT:.b8 7 // Abbrev [7] 0x182f:0x5 DW_TAG_formal_parameter +; CHECK-NEXT:.b32 2172 // DW_AT_type ; CHECK-NEXT:.b8 0 // End Of Children Mark -; CHECK-NEXT:.b8 33 // Abbrev [33] 0x17ed:0x1d DW_TAG_subprogram +; CHECK-NEXT:.b8 33 // Abbrev [33] 0x1835:0x1d DW_TAG_subprogram ; CHECK-NEXT:.b8 95 // DW_AT_MIPS_linkage_name ; CHECK-NEXT:.b8 90 ; CHECK-NEXT:.b8 76 @@ -5529,12 +5594,12 @@ if.end: ; preds = %if.then, %entry ; CHECK-NEXT:.b8 0 ; CHECK-NEXT:.b8 4 // DW_AT_decl_file ; CHECK-NEXT:.b8 175 // DW_AT_decl_line -; CHECK-NEXT:.b32 2100 // DW_AT_type +; CHECK-NEXT:.b32 2172 // DW_AT_type ; CHECK-NEXT:.b8 1 // DW_AT_declaration -; CHECK-NEXT:.b8 7 // Abbrev [7] 0x1804:0x5 DW_TAG_formal_parameter -; CHECK-NEXT:.b32 2100 // DW_AT_type +; CHECK-NEXT:.b8 7 // Abbrev [7] 0x184c:0x5 DW_TAG_formal_parameter +; CHECK-NEXT:.b32 2172 // DW_AT_type ; CHECK-NEXT:.b8 0 // End Of Children Mark -; CHECK-NEXT:.b8 33 // Abbrev [33] 0x180a:0x1b DW_TAG_subprogram +; CHECK-NEXT:.b8 33 // Abbrev [33] 0x1852:0x1b DW_TAG_subprogram ; CHECK-NEXT:.b8 95 // DW_AT_MIPS_linkage_name ; CHECK-NEXT:.b8 90 ; CHECK-NEXT:.b8 76 @@ -5550,12 +5615,12 @@ if.end: ; preds = %if.then, %entry ; CHECK-NEXT:.b8 0 ; CHECK-NEXT:.b8 4 // DW_AT_decl_file ; CHECK-NEXT:.b8 177 // DW_AT_decl_line -; CHECK-NEXT:.b32 2100 // DW_AT_type +; CHECK-NEXT:.b32 2172 // DW_AT_type ; CHECK-NEXT:.b8 1 // 
DW_AT_declaration -; CHECK-NEXT:.b8 7 // Abbrev [7] 0x181f:0x5 DW_TAG_formal_parameter -; CHECK-NEXT:.b32 2100 // DW_AT_type +; CHECK-NEXT:.b8 7 // Abbrev [7] 0x1867:0x5 DW_TAG_formal_parameter +; CHECK-NEXT:.b32 2172 // DW_AT_type ; CHECK-NEXT:.b8 0 // End Of Children Mark -; CHECK-NEXT:.b8 33 // Abbrev [33] 0x1825:0x1d DW_TAG_subprogram +; CHECK-NEXT:.b8 33 // Abbrev [33] 0x186d:0x1d DW_TAG_subprogram ; CHECK-NEXT:.b8 95 // DW_AT_MIPS_linkage_name ; CHECK-NEXT:.b8 90 ; CHECK-NEXT:.b8 76 @@ -5573,12 +5638,12 @@ if.end: ; preds = %if.then, %entry ; CHECK-NEXT:.b8 0 ; CHECK-NEXT:.b8 4 // DW_AT_decl_file ; CHECK-NEXT:.b8 179 // DW_AT_decl_line -; CHECK-NEXT:.b32 2100 // DW_AT_type +; CHECK-NEXT:.b32 2172 // DW_AT_type ; CHECK-NEXT:.b8 1 // DW_AT_declaration -; CHECK-NEXT:.b8 7 // Abbrev [7] 0x183c:0x5 DW_TAG_formal_parameter -; CHECK-NEXT:.b32 2100 // DW_AT_type +; CHECK-NEXT:.b8 7 // Abbrev [7] 0x1884:0x5 DW_TAG_formal_parameter +; CHECK-NEXT:.b32 2172 // DW_AT_type ; CHECK-NEXT:.b8 0 // End Of Children Mark -; CHECK-NEXT:.b8 33 // Abbrev [33] 0x1842:0x21 DW_TAG_subprogram +; CHECK-NEXT:.b8 33 // Abbrev [33] 0x188a:0x21 DW_TAG_subprogram ; CHECK-NEXT:.b8 95 // DW_AT_MIPS_linkage_name ; CHECK-NEXT:.b8 90 ; CHECK-NEXT:.b8 76 @@ -5600,12 +5665,12 @@ if.end: ; preds = %if.then, %entry ; CHECK-NEXT:.b8 0 ; CHECK-NEXT:.b8 4 // DW_AT_decl_file ; CHECK-NEXT:.b8 181 // DW_AT_decl_line -; CHECK-NEXT:.b32 2100 // DW_AT_type +; CHECK-NEXT:.b32 2172 // DW_AT_type ; CHECK-NEXT:.b8 1 // DW_AT_declaration -; CHECK-NEXT:.b8 7 // Abbrev [7] 0x185d:0x5 DW_TAG_formal_parameter -; CHECK-NEXT:.b32 2100 // DW_AT_type +; CHECK-NEXT:.b8 7 // Abbrev [7] 0x18a5:0x5 DW_TAG_formal_parameter +; CHECK-NEXT:.b32 2172 // DW_AT_type ; CHECK-NEXT:.b8 0 // End Of Children Mark -; CHECK-NEXT:.b8 33 // Abbrev [33] 0x1863:0x1f DW_TAG_subprogram +; CHECK-NEXT:.b8 33 // Abbrev [33] 0x18ab:0x1f DW_TAG_subprogram ; CHECK-NEXT:.b8 95 // DW_AT_MIPS_linkage_name ; CHECK-NEXT:.b8 90 ; CHECK-NEXT:.b8 76 @@ 
-5625,12 +5690,12 @@ if.end: ; preds = %if.then, %entry ; CHECK-NEXT:.b8 0 ; CHECK-NEXT:.b8 4 // DW_AT_decl_file ; CHECK-NEXT:.b8 183 // DW_AT_decl_line -; CHECK-NEXT:.b32 2100 // DW_AT_type +; CHECK-NEXT:.b32 2172 // DW_AT_type ; CHECK-NEXT:.b8 1 // DW_AT_declaration -; CHECK-NEXT:.b8 7 // Abbrev [7] 0x187c:0x5 DW_TAG_formal_parameter -; CHECK-NEXT:.b32 2100 // DW_AT_type +; CHECK-NEXT:.b8 7 // Abbrev [7] 0x18c4:0x5 DW_TAG_formal_parameter +; CHECK-NEXT:.b32 2172 // DW_AT_type ; CHECK-NEXT:.b8 0 // End Of Children Mark -; CHECK-NEXT:.b8 34 // Abbrev [34] 0x1882:0x14 DW_TAG_subprogram +; CHECK-NEXT:.b8 34 // Abbrev [34] 0x18ca:0x14 DW_TAG_subprogram ; CHECK-NEXT:.b8 97 // DW_AT_name ; CHECK-NEXT:.b8 99 ; CHECK-NEXT:.b8 111 @@ -5638,13 +5703,13 @@ if.end: ; preds = %if.then, %entry ; CHECK-NEXT:.b8 0 ; CHECK-NEXT:.b8 5 // DW_AT_decl_file ; CHECK-NEXT:.b8 54 // DW_AT_decl_line -; CHECK-NEXT:.b32 5627 // DW_AT_type +; CHECK-NEXT:.b32 5699 // DW_AT_type ; CHECK-NEXT:.b8 1 // DW_AT_declaration ; CHECK-NEXT:.b8 1 // DW_AT_external -; CHECK-NEXT:.b8 7 // Abbrev [7] 0x1890:0x5 DW_TAG_formal_parameter -; CHECK-NEXT:.b32 5627 // DW_AT_type +; CHECK-NEXT:.b8 7 // Abbrev [7] 0x18d8:0x5 DW_TAG_formal_parameter +; CHECK-NEXT:.b32 5699 // DW_AT_type ; CHECK-NEXT:.b8 0 // End Of Children Mark -; CHECK-NEXT:.b8 34 // Abbrev [34] 0x1896:0x14 DW_TAG_subprogram +; CHECK-NEXT:.b8 34 // Abbrev [34] 0x18de:0x14 DW_TAG_subprogram ; CHECK-NEXT:.b8 97 // DW_AT_name ; CHECK-NEXT:.b8 115 ; CHECK-NEXT:.b8 105 @@ -5652,13 +5717,13 @@ if.end: ; preds = %if.then, %entry ; CHECK-NEXT:.b8 0 ; CHECK-NEXT:.b8 5 // DW_AT_decl_file ; CHECK-NEXT:.b8 56 // DW_AT_decl_line -; CHECK-NEXT:.b32 5627 // DW_AT_type +; CHECK-NEXT:.b32 5699 // DW_AT_type ; CHECK-NEXT:.b8 1 // DW_AT_declaration ; CHECK-NEXT:.b8 1 // DW_AT_external -; CHECK-NEXT:.b8 7 // Abbrev [7] 0x18a4:0x5 DW_TAG_formal_parameter -; CHECK-NEXT:.b32 5627 // DW_AT_type +; CHECK-NEXT:.b8 7 // Abbrev [7] 0x18ec:0x5 DW_TAG_formal_parameter +; 
CHECK-NEXT:.b32 5699 // DW_AT_type ; CHECK-NEXT:.b8 0 // End Of Children Mark -; CHECK-NEXT:.b8 34 // Abbrev [34] 0x18aa:0x14 DW_TAG_subprogram +; CHECK-NEXT:.b8 34 // Abbrev [34] 0x18f2:0x14 DW_TAG_subprogram ; CHECK-NEXT:.b8 97 // DW_AT_name ; CHECK-NEXT:.b8 116 ; CHECK-NEXT:.b8 97 @@ -5666,13 +5731,13 @@ if.end: ; preds = %if.then, %entry ; CHECK-NEXT:.b8 0 ; CHECK-NEXT:.b8 5 // DW_AT_decl_file ; CHECK-NEXT:.b8 58 // DW_AT_decl_line -; CHECK-NEXT:.b32 5627 // DW_AT_type +; CHECK-NEXT:.b32 5699 // DW_AT_type ; CHECK-NEXT:.b8 1 // DW_AT_declaration ; CHECK-NEXT:.b8 1 // DW_AT_external -; CHECK-NEXT:.b8 7 // Abbrev [7] 0x18b8:0x5 DW_TAG_formal_parameter -; CHECK-NEXT:.b32 5627 // DW_AT_type +; CHECK-NEXT:.b8 7 // Abbrev [7] 0x1900:0x5 DW_TAG_formal_parameter +; CHECK-NEXT:.b32 5699 // DW_AT_type ; CHECK-NEXT:.b8 0 // End Of Children Mark -; CHECK-NEXT:.b8 34 // Abbrev [34] 0x18be:0x1a DW_TAG_subprogram +; CHECK-NEXT:.b8 34 // Abbrev [34] 0x1906:0x1a DW_TAG_subprogram ; CHECK-NEXT:.b8 97 // DW_AT_name ; CHECK-NEXT:.b8 116 ; CHECK-NEXT:.b8 97 @@ -5681,15 +5746,15 @@ if.end: ; preds = %if.then, %entry ; CHECK-NEXT:.b8 0 ; CHECK-NEXT:.b8 5 // DW_AT_decl_file ; CHECK-NEXT:.b8 60 // DW_AT_decl_line -; CHECK-NEXT:.b32 5627 // DW_AT_type +; CHECK-NEXT:.b32 5699 // DW_AT_type ; CHECK-NEXT:.b8 1 // DW_AT_declaration ; CHECK-NEXT:.b8 1 // DW_AT_external -; CHECK-NEXT:.b8 7 // Abbrev [7] 0x18cd:0x5 DW_TAG_formal_parameter -; CHECK-NEXT:.b32 5627 // DW_AT_type -; CHECK-NEXT:.b8 7 // Abbrev [7] 0x18d2:0x5 DW_TAG_formal_parameter -; CHECK-NEXT:.b32 5627 // DW_AT_type +; CHECK-NEXT:.b8 7 // Abbrev [7] 0x1915:0x5 DW_TAG_formal_parameter +; CHECK-NEXT:.b32 5699 // DW_AT_type +; CHECK-NEXT:.b8 7 // Abbrev [7] 0x191a:0x5 DW_TAG_formal_parameter +; CHECK-NEXT:.b32 5699 // DW_AT_type ; CHECK-NEXT:.b8 0 // End Of Children Mark -; CHECK-NEXT:.b8 34 // Abbrev [34] 0x18d8:0x14 DW_TAG_subprogram +; CHECK-NEXT:.b8 34 // Abbrev [34] 0x1920:0x14 DW_TAG_subprogram ; CHECK-NEXT:.b8 99 // 
DW_AT_name ; CHECK-NEXT:.b8 101 ; CHECK-NEXT:.b8 105 @@ -5697,26 +5762,26 @@ if.end: ; preds = %if.then, %entry ; CHECK-NEXT:.b8 0 ; CHECK-NEXT:.b8 5 // DW_AT_decl_file ; CHECK-NEXT:.b8 178 // DW_AT_decl_line -; CHECK-NEXT:.b32 5627 // DW_AT_type +; CHECK-NEXT:.b32 5699 // DW_AT_type ; CHECK-NEXT:.b8 1 // DW_AT_declaration ; CHECK-NEXT:.b8 1 // DW_AT_external -; CHECK-NEXT:.b8 7 // Abbrev [7] 0x18e6:0x5 DW_TAG_formal_parameter -; CHECK-NEXT:.b32 5627 // DW_AT_type +; CHECK-NEXT:.b8 7 // Abbrev [7] 0x192e:0x5 DW_TAG_formal_parameter +; CHECK-NEXT:.b32 5699 // DW_AT_type ; CHECK-NEXT:.b8 0 // End Of Children Mark -; CHECK-NEXT:.b8 34 // Abbrev [34] 0x18ec:0x13 DW_TAG_subprogram +; CHECK-NEXT:.b8 34 // Abbrev [34] 0x1934:0x13 DW_TAG_subprogram ; CHECK-NEXT:.b8 99 // DW_AT_name ; CHECK-NEXT:.b8 111 ; CHECK-NEXT:.b8 115 ; CHECK-NEXT:.b8 0 ; CHECK-NEXT:.b8 5 // DW_AT_decl_file ; CHECK-NEXT:.b8 63 // DW_AT_decl_line -; CHECK-NEXT:.b32 5627 // DW_AT_type +; CHECK-NEXT:.b32 5699 // DW_AT_type ; CHECK-NEXT:.b8 1 // DW_AT_declaration ; CHECK-NEXT:.b8 1 // DW_AT_external -; CHECK-NEXT:.b8 7 // Abbrev [7] 0x18f9:0x5 DW_TAG_formal_parameter -; CHECK-NEXT:.b32 5627 // DW_AT_type +; CHECK-NEXT:.b8 7 // Abbrev [7] 0x1941:0x5 DW_TAG_formal_parameter +; CHECK-NEXT:.b32 5699 // DW_AT_type ; CHECK-NEXT:.b8 0 // End Of Children Mark -; CHECK-NEXT:.b8 34 // Abbrev [34] 0x18ff:0x14 DW_TAG_subprogram +; CHECK-NEXT:.b8 34 // Abbrev [34] 0x1947:0x14 DW_TAG_subprogram ; CHECK-NEXT:.b8 99 // DW_AT_name ; CHECK-NEXT:.b8 111 ; CHECK-NEXT:.b8 115 @@ -5724,26 +5789,26 @@ if.end: ; preds = %if.then, %entry ; CHECK-NEXT:.b8 0 ; CHECK-NEXT:.b8 5 // DW_AT_decl_file ; CHECK-NEXT:.b8 72 // DW_AT_decl_line -; CHECK-NEXT:.b32 5627 // DW_AT_type +; CHECK-NEXT:.b32 5699 // DW_AT_type ; CHECK-NEXT:.b8 1 // DW_AT_declaration ; CHECK-NEXT:.b8 1 // DW_AT_external -; CHECK-NEXT:.b8 7 // Abbrev [7] 0x190d:0x5 DW_TAG_formal_parameter -; CHECK-NEXT:.b32 5627 // DW_AT_type +; CHECK-NEXT:.b8 7 // Abbrev [7] 
0x1955:0x5 DW_TAG_formal_parameter +; CHECK-NEXT:.b32 5699 // DW_AT_type ; CHECK-NEXT:.b8 0 // End Of Children Mark -; CHECK-NEXT:.b8 34 // Abbrev [34] 0x1913:0x13 DW_TAG_subprogram +; CHECK-NEXT:.b8 34 // Abbrev [34] 0x195b:0x13 DW_TAG_subprogram ; CHECK-NEXT:.b8 101 // DW_AT_name ; CHECK-NEXT:.b8 120 ; CHECK-NEXT:.b8 112 ; CHECK-NEXT:.b8 0 ; CHECK-NEXT:.b8 5 // DW_AT_decl_file ; CHECK-NEXT:.b8 100 // DW_AT_decl_line -; CHECK-NEXT:.b32 5627 // DW_AT_type +; CHECK-NEXT:.b32 5699 // DW_AT_type ; CHECK-NEXT:.b8 1 // DW_AT_declaration ; CHECK-NEXT:.b8 1 // DW_AT_external -; CHECK-NEXT:.b8 7 // Abbrev [7] 0x1920:0x5 DW_TAG_formal_parameter -; CHECK-NEXT:.b32 5627 // DW_AT_type +; CHECK-NEXT:.b8 7 // Abbrev [7] 0x1968:0x5 DW_TAG_formal_parameter +; CHECK-NEXT:.b32 5699 // DW_AT_type ; CHECK-NEXT:.b8 0 // End Of Children Mark -; CHECK-NEXT:.b8 34 // Abbrev [34] 0x1926:0x14 DW_TAG_subprogram +; CHECK-NEXT:.b8 34 // Abbrev [34] 0x196e:0x14 DW_TAG_subprogram ; CHECK-NEXT:.b8 102 // DW_AT_name ; CHECK-NEXT:.b8 97 ; CHECK-NEXT:.b8 98 @@ -5751,13 +5816,13 @@ if.end: ; preds = %if.then, %entry ; CHECK-NEXT:.b8 0 ; CHECK-NEXT:.b8 5 // DW_AT_decl_file ; CHECK-NEXT:.b8 181 // DW_AT_decl_line -; CHECK-NEXT:.b32 5627 // DW_AT_type +; CHECK-NEXT:.b32 5699 // DW_AT_type ; CHECK-NEXT:.b8 1 // DW_AT_declaration ; CHECK-NEXT:.b8 1 // DW_AT_external -; CHECK-NEXT:.b8 7 // Abbrev [7] 0x1934:0x5 DW_TAG_formal_parameter -; CHECK-NEXT:.b32 5627 // DW_AT_type +; CHECK-NEXT:.b8 7 // Abbrev [7] 0x197c:0x5 DW_TAG_formal_parameter +; CHECK-NEXT:.b32 5699 // DW_AT_type ; CHECK-NEXT:.b8 0 // End Of Children Mark -; CHECK-NEXT:.b8 34 // Abbrev [34] 0x193a:0x15 DW_TAG_subprogram +; CHECK-NEXT:.b8 34 // Abbrev [34] 0x1982:0x15 DW_TAG_subprogram ; CHECK-NEXT:.b8 102 // DW_AT_name ; CHECK-NEXT:.b8 108 ; CHECK-NEXT:.b8 111 @@ -5766,13 +5831,13 @@ if.end: ; preds = %if.then, %entry ; CHECK-NEXT:.b8 0 ; CHECK-NEXT:.b8 5 // DW_AT_decl_file ; CHECK-NEXT:.b8 184 // DW_AT_decl_line -; CHECK-NEXT:.b32 5627 // 
DW_AT_type +; CHECK-NEXT:.b32 5699 // DW_AT_type ; CHECK-NEXT:.b8 1 // DW_AT_declaration ; CHECK-NEXT:.b8 1 // DW_AT_external -; CHECK-NEXT:.b8 7 // Abbrev [7] 0x1949:0x5 DW_TAG_formal_parameter -; CHECK-NEXT:.b32 5627 // DW_AT_type +; CHECK-NEXT:.b8 7 // Abbrev [7] 0x1991:0x5 DW_TAG_formal_parameter +; CHECK-NEXT:.b32 5699 // DW_AT_type ; CHECK-NEXT:.b8 0 // End Of Children Mark -; CHECK-NEXT:.b8 34 // Abbrev [34] 0x194f:0x19 DW_TAG_subprogram +; CHECK-NEXT:.b8 34 // Abbrev [34] 0x1997:0x19 DW_TAG_subprogram ; CHECK-NEXT:.b8 102 // DW_AT_name ; CHECK-NEXT:.b8 109 ; CHECK-NEXT:.b8 111 @@ -5780,15 +5845,15 @@ if.end: ; preds = %if.then, %entry ; CHECK-NEXT:.b8 0 ; CHECK-NEXT:.b8 5 // DW_AT_decl_file ; CHECK-NEXT:.b8 187 // DW_AT_decl_line -; CHECK-NEXT:.b32 5627 // DW_AT_type +; CHECK-NEXT:.b32 5699 // DW_AT_type ; CHECK-NEXT:.b8 1 // DW_AT_declaration ; CHECK-NEXT:.b8 1 // DW_AT_external -; CHECK-NEXT:.b8 7 // Abbrev [7] 0x195d:0x5 DW_TAG_formal_parameter -; CHECK-NEXT:.b32 5627 // DW_AT_type -; CHECK-NEXT:.b8 7 // Abbrev [7] 0x1962:0x5 DW_TAG_formal_parameter -; CHECK-NEXT:.b32 5627 // DW_AT_type +; CHECK-NEXT:.b8 7 // Abbrev [7] 0x19a5:0x5 DW_TAG_formal_parameter +; CHECK-NEXT:.b32 5699 // DW_AT_type +; CHECK-NEXT:.b8 7 // Abbrev [7] 0x19aa:0x5 DW_TAG_formal_parameter +; CHECK-NEXT:.b32 5699 // DW_AT_type ; CHECK-NEXT:.b8 0 // End Of Children Mark -; CHECK-NEXT:.b8 34 // Abbrev [34] 0x1968:0x1a DW_TAG_subprogram +; CHECK-NEXT:.b8 34 // Abbrev [34] 0x19b0:0x1a DW_TAG_subprogram ; CHECK-NEXT:.b8 102 // DW_AT_name ; CHECK-NEXT:.b8 114 ; CHECK-NEXT:.b8 101 @@ -5797,15 +5862,15 @@ if.end: ; preds = %if.then, %entry ; CHECK-NEXT:.b8 0 ; CHECK-NEXT:.b8 5 // DW_AT_decl_file ; CHECK-NEXT:.b8 103 // DW_AT_decl_line -; CHECK-NEXT:.b32 5627 // DW_AT_type +; CHECK-NEXT:.b32 5699 // DW_AT_type ; CHECK-NEXT:.b8 1 // DW_AT_declaration ; CHECK-NEXT:.b8 1 // DW_AT_external -; CHECK-NEXT:.b8 7 // Abbrev [7] 0x1977:0x5 DW_TAG_formal_parameter -; CHECK-NEXT:.b32 5627 // DW_AT_type -; 
CHECK-NEXT:.b8 7 // Abbrev [7] 0x197c:0x5 DW_TAG_formal_parameter -; CHECK-NEXT:.b32 4630 // DW_AT_type +; CHECK-NEXT:.b8 7 // Abbrev [7] 0x19bf:0x5 DW_TAG_formal_parameter +; CHECK-NEXT:.b32 5699 // DW_AT_type +; CHECK-NEXT:.b8 7 // Abbrev [7] 0x19c4:0x5 DW_TAG_formal_parameter +; CHECK-NEXT:.b32 4702 // DW_AT_type ; CHECK-NEXT:.b8 0 // End Of Children Mark -; CHECK-NEXT:.b8 34 // Abbrev [34] 0x1982:0x1a DW_TAG_subprogram +; CHECK-NEXT:.b8 34 // Abbrev [34] 0x19ca:0x1a DW_TAG_subprogram ; CHECK-NEXT:.b8 108 // DW_AT_name ; CHECK-NEXT:.b8 100 ; CHECK-NEXT:.b8 101 @@ -5814,28 +5879,28 @@ if.end: ; preds = %if.then, %entry ; CHECK-NEXT:.b8 0 ; CHECK-NEXT:.b8 5 // DW_AT_decl_file ; CHECK-NEXT:.b8 106 // DW_AT_decl_line -; CHECK-NEXT:.b32 5627 // DW_AT_type +; CHECK-NEXT:.b32 5699 // DW_AT_type ; CHECK-NEXT:.b8 1 // DW_AT_declaration ; CHECK-NEXT:.b8 1 // DW_AT_external -; CHECK-NEXT:.b8 7 // Abbrev [7] 0x1991:0x5 DW_TAG_formal_parameter -; CHECK-NEXT:.b32 5627 // DW_AT_type -; CHECK-NEXT:.b8 7 // Abbrev [7] 0x1996:0x5 DW_TAG_formal_parameter -; CHECK-NEXT:.b32 4585 // DW_AT_type +; CHECK-NEXT:.b8 7 // Abbrev [7] 0x19d9:0x5 DW_TAG_formal_parameter +; CHECK-NEXT:.b32 5699 // DW_AT_type +; CHECK-NEXT:.b8 7 // Abbrev [7] 0x19de:0x5 DW_TAG_formal_parameter +; CHECK-NEXT:.b32 4657 // DW_AT_type ; CHECK-NEXT:.b8 0 // End Of Children Mark -; CHECK-NEXT:.b8 34 // Abbrev [34] 0x199c:0x13 DW_TAG_subprogram +; CHECK-NEXT:.b8 34 // Abbrev [34] 0x19e4:0x13 DW_TAG_subprogram ; CHECK-NEXT:.b8 108 // DW_AT_name ; CHECK-NEXT:.b8 111 ; CHECK-NEXT:.b8 103 ; CHECK-NEXT:.b8 0 ; CHECK-NEXT:.b8 5 // DW_AT_decl_file ; CHECK-NEXT:.b8 109 // DW_AT_decl_line -; CHECK-NEXT:.b32 5627 // DW_AT_type +; CHECK-NEXT:.b32 5699 // DW_AT_type ; CHECK-NEXT:.b8 1 // DW_AT_declaration ; CHECK-NEXT:.b8 1 // DW_AT_external -; CHECK-NEXT:.b8 7 // Abbrev [7] 0x19a9:0x5 DW_TAG_formal_parameter -; CHECK-NEXT:.b32 5627 // DW_AT_type +; CHECK-NEXT:.b8 7 // Abbrev [7] 0x19f1:0x5 DW_TAG_formal_parameter +; 
CHECK-NEXT:.b32 5699 // DW_AT_type ; CHECK-NEXT:.b8 0 // End Of Children Mark -; CHECK-NEXT:.b8 34 // Abbrev [34] 0x19af:0x15 DW_TAG_subprogram +; CHECK-NEXT:.b8 34 // Abbrev [34] 0x19f7:0x15 DW_TAG_subprogram ; CHECK-NEXT:.b8 108 // DW_AT_name ; CHECK-NEXT:.b8 111 ; CHECK-NEXT:.b8 103 @@ -5844,13 +5909,13 @@ if.end: ; preds = %if.then, %entry ; CHECK-NEXT:.b8 0 ; CHECK-NEXT:.b8 5 // DW_AT_decl_file ; CHECK-NEXT:.b8 112 // DW_AT_decl_line -; CHECK-NEXT:.b32 5627 // DW_AT_type +; CHECK-NEXT:.b32 5699 // DW_AT_type ; CHECK-NEXT:.b8 1 // DW_AT_declaration ; CHECK-NEXT:.b8 1 // DW_AT_external -; CHECK-NEXT:.b8 7 // Abbrev [7] 0x19be:0x5 DW_TAG_formal_parameter -; CHECK-NEXT:.b32 5627 // DW_AT_type +; CHECK-NEXT:.b8 7 // Abbrev [7] 0x1a06:0x5 DW_TAG_formal_parameter +; CHECK-NEXT:.b32 5699 // DW_AT_type ; CHECK-NEXT:.b8 0 // End Of Children Mark -; CHECK-NEXT:.b8 34 // Abbrev [34] 0x19c4:0x19 DW_TAG_subprogram +; CHECK-NEXT:.b8 34 // Abbrev [34] 0x1a0c:0x19 DW_TAG_subprogram ; CHECK-NEXT:.b8 109 // DW_AT_name ; CHECK-NEXT:.b8 111 ; CHECK-NEXT:.b8 100 @@ -5858,45 +5923,45 @@ if.end: ; preds = %if.then, %entry ; CHECK-NEXT:.b8 0 ; CHECK-NEXT:.b8 5 // DW_AT_decl_file ; CHECK-NEXT:.b8 115 // DW_AT_decl_line -; CHECK-NEXT:.b32 5627 // DW_AT_type +; CHECK-NEXT:.b32 5699 // DW_AT_type ; CHECK-NEXT:.b8 1 // DW_AT_declaration ; CHECK-NEXT:.b8 1 // DW_AT_external -; CHECK-NEXT:.b8 7 // Abbrev [7] 0x19d2:0x5 DW_TAG_formal_parameter -; CHECK-NEXT:.b32 5627 // DW_AT_type -; CHECK-NEXT:.b8 7 // Abbrev [7] 0x19d7:0x5 DW_TAG_formal_parameter -; CHECK-NEXT:.b32 6621 // DW_AT_type -; CHECK-NEXT:.b8 0 // End Of Children Mark -; CHECK-NEXT:.b8 12 // Abbrev [12] 0x19dd:0x5 DW_TAG_pointer_type -; CHECK-NEXT:.b32 5627 // DW_AT_type -; CHECK-NEXT:.b8 34 // Abbrev [34] 0x19e2:0x18 DW_TAG_subprogram +; CHECK-NEXT:.b8 7 // Abbrev [7] 0x1a1a:0x5 DW_TAG_formal_parameter +; CHECK-NEXT:.b32 5699 // DW_AT_type +; CHECK-NEXT:.b8 7 // Abbrev [7] 0x1a1f:0x5 DW_TAG_formal_parameter +; CHECK-NEXT:.b32 6693 
// DW_AT_type +; CHECK-NEXT:.b8 0 // End Of Children Mark +; CHECK-NEXT:.b8 12 // Abbrev [12] 0x1a25:0x5 DW_TAG_pointer_type +; CHECK-NEXT:.b32 5699 // DW_AT_type +; CHECK-NEXT:.b8 34 // Abbrev [34] 0x1a2a:0x18 DW_TAG_subprogram ; CHECK-NEXT:.b8 112 // DW_AT_name ; CHECK-NEXT:.b8 111 ; CHECK-NEXT:.b8 119 ; CHECK-NEXT:.b8 0 ; CHECK-NEXT:.b8 5 // DW_AT_decl_file ; CHECK-NEXT:.b8 153 // DW_AT_decl_line -; CHECK-NEXT:.b32 5627 // DW_AT_type +; CHECK-NEXT:.b32 5699 // DW_AT_type ; CHECK-NEXT:.b8 1 // DW_AT_declaration ; CHECK-NEXT:.b8 1 // DW_AT_external -; CHECK-NEXT:.b8 7 // Abbrev [7] 0x19ef:0x5 DW_TAG_formal_parameter -; CHECK-NEXT:.b32 5627 // DW_AT_type -; CHECK-NEXT:.b8 7 // Abbrev [7] 0x19f4:0x5 DW_TAG_formal_parameter -; CHECK-NEXT:.b32 5627 // DW_AT_type +; CHECK-NEXT:.b8 7 // Abbrev [7] 0x1a37:0x5 DW_TAG_formal_parameter +; CHECK-NEXT:.b32 5699 // DW_AT_type +; CHECK-NEXT:.b8 7 // Abbrev [7] 0x1a3c:0x5 DW_TAG_formal_parameter +; CHECK-NEXT:.b32 5699 // DW_AT_type ; CHECK-NEXT:.b8 0 // End Of Children Mark -; CHECK-NEXT:.b8 34 // Abbrev [34] 0x19fa:0x13 DW_TAG_subprogram +; CHECK-NEXT:.b8 34 // Abbrev [34] 0x1a42:0x13 DW_TAG_subprogram ; CHECK-NEXT:.b8 115 // DW_AT_name ; CHECK-NEXT:.b8 105 ; CHECK-NEXT:.b8 110 ; CHECK-NEXT:.b8 0 ; CHECK-NEXT:.b8 5 // DW_AT_decl_file ; CHECK-NEXT:.b8 65 // DW_AT_decl_line -; CHECK-NEXT:.b32 5627 // DW_AT_type +; CHECK-NEXT:.b32 5699 // DW_AT_type ; CHECK-NEXT:.b8 1 // DW_AT_declaration ; CHECK-NEXT:.b8 1 // DW_AT_external -; CHECK-NEXT:.b8 7 // Abbrev [7] 0x1a07:0x5 DW_TAG_formal_parameter -; CHECK-NEXT:.b32 5627 // DW_AT_type +; CHECK-NEXT:.b8 7 // Abbrev [7] 0x1a4f:0x5 DW_TAG_formal_parameter +; CHECK-NEXT:.b32 5699 // DW_AT_type ; CHECK-NEXT:.b8 0 // End Of Children Mark -; CHECK-NEXT:.b8 34 // Abbrev [34] 0x1a0d:0x14 DW_TAG_subprogram +; CHECK-NEXT:.b8 34 // Abbrev [34] 0x1a55:0x14 DW_TAG_subprogram ; CHECK-NEXT:.b8 115 // DW_AT_name ; CHECK-NEXT:.b8 105 ; CHECK-NEXT:.b8 110 @@ -5904,13 +5969,13 @@ if.end: ; preds = 
%if.then, %entry ; CHECK-NEXT:.b8 0 ; CHECK-NEXT:.b8 5 // DW_AT_decl_file ; CHECK-NEXT:.b8 74 // DW_AT_decl_line -; CHECK-NEXT:.b32 5627 // DW_AT_type +; CHECK-NEXT:.b32 5699 // DW_AT_type ; CHECK-NEXT:.b8 1 // DW_AT_declaration ; CHECK-NEXT:.b8 1 // DW_AT_external -; CHECK-NEXT:.b8 7 // Abbrev [7] 0x1a1b:0x5 DW_TAG_formal_parameter -; CHECK-NEXT:.b32 5627 // DW_AT_type +; CHECK-NEXT:.b8 7 // Abbrev [7] 0x1a63:0x5 DW_TAG_formal_parameter +; CHECK-NEXT:.b32 5699 // DW_AT_type ; CHECK-NEXT:.b8 0 // End Of Children Mark -; CHECK-NEXT:.b8 34 // Abbrev [34] 0x1a21:0x14 DW_TAG_subprogram +; CHECK-NEXT:.b8 34 // Abbrev [34] 0x1a69:0x14 DW_TAG_subprogram ; CHECK-NEXT:.b8 115 // DW_AT_name ; CHECK-NEXT:.b8 113 ; CHECK-NEXT:.b8 114 @@ -5918,26 +5983,26 @@ if.end: ; preds = %if.then, %entry ; CHECK-NEXT:.b8 0 ; CHECK-NEXT:.b8 5 // DW_AT_decl_file ; CHECK-NEXT:.b8 156 // DW_AT_decl_line -; CHECK-NEXT:.b32 5627 // DW_AT_type +; CHECK-NEXT:.b32 5699 // DW_AT_type ; CHECK-NEXT:.b8 1 // DW_AT_declaration ; CHECK-NEXT:.b8 1 // DW_AT_external -; CHECK-NEXT:.b8 7 // Abbrev [7] 0x1a2f:0x5 DW_TAG_formal_parameter -; CHECK-NEXT:.b32 5627 // DW_AT_type +; CHECK-NEXT:.b8 7 // Abbrev [7] 0x1a77:0x5 DW_TAG_formal_parameter +; CHECK-NEXT:.b32 5699 // DW_AT_type ; CHECK-NEXT:.b8 0 // End Of Children Mark -; CHECK-NEXT:.b8 34 // Abbrev [34] 0x1a35:0x13 DW_TAG_subprogram +; CHECK-NEXT:.b8 34 // Abbrev [34] 0x1a7d:0x13 DW_TAG_subprogram ; CHECK-NEXT:.b8 116 // DW_AT_name ; CHECK-NEXT:.b8 97 ; CHECK-NEXT:.b8 110 ; CHECK-NEXT:.b8 0 ; CHECK-NEXT:.b8 5 // DW_AT_decl_file ; CHECK-NEXT:.b8 67 // DW_AT_decl_line -; CHECK-NEXT:.b32 5627 // DW_AT_type +; CHECK-NEXT:.b32 5699 // DW_AT_type ; CHECK-NEXT:.b8 1 // DW_AT_declaration ; CHECK-NEXT:.b8 1 // DW_AT_external -; CHECK-NEXT:.b8 7 // Abbrev [7] 0x1a42:0x5 DW_TAG_formal_parameter -; CHECK-NEXT:.b32 5627 // DW_AT_type +; CHECK-NEXT:.b8 7 // Abbrev [7] 0x1a8a:0x5 DW_TAG_formal_parameter +; CHECK-NEXT:.b32 5699 // DW_AT_type ; CHECK-NEXT:.b8 0 // End Of 
Children Mark -; CHECK-NEXT:.b8 34 // Abbrev [34] 0x1a48:0x14 DW_TAG_subprogram +; CHECK-NEXT:.b8 34 // Abbrev [34] 0x1a90:0x14 DW_TAG_subprogram ; CHECK-NEXT:.b8 116 // DW_AT_name ; CHECK-NEXT:.b8 97 ; CHECK-NEXT:.b8 110 @@ -5945,14 +6010,14 @@ if.end: ; preds = %if.then, %entry ; CHECK-NEXT:.b8 0 ; CHECK-NEXT:.b8 5 // DW_AT_decl_file ; CHECK-NEXT:.b8 76 // DW_AT_decl_line -; CHECK-NEXT:.b32 5627 // DW_AT_type +; CHECK-NEXT:.b32 5699 // DW_AT_type ; CHECK-NEXT:.b8 1 // DW_AT_declaration ; CHECK-NEXT:.b8 1 // DW_AT_external -; CHECK-NEXT:.b8 7 // Abbrev [7] 0x1a56:0x5 DW_TAG_formal_parameter -; CHECK-NEXT:.b32 5627 // DW_AT_type +; CHECK-NEXT:.b8 7 // Abbrev [7] 0x1a9e:0x5 DW_TAG_formal_parameter +; CHECK-NEXT:.b32 5699 // DW_AT_type ; CHECK-NEXT:.b8 0 // End Of Children Mark -; CHECK-NEXT:.b8 35 // Abbrev [35] 0x1a5c:0xd DW_TAG_typedef -; CHECK-NEXT:.b32 6761 // DW_AT_type +; CHECK-NEXT:.b8 35 // Abbrev [35] 0x1aa4:0xd DW_TAG_typedef +; CHECK-NEXT:.b32 6833 // DW_AT_type ; CHECK-NEXT:.b8 100 // DW_AT_name ; CHECK-NEXT:.b8 105 ; CHECK-NEXT:.b8 118 @@ -5961,10 +6026,10 @@ if.end: ; preds = %if.then, %entry ; CHECK-NEXT:.b8 0 ; CHECK-NEXT:.b8 7 // DW_AT_decl_file ; CHECK-NEXT:.b8 101 // DW_AT_decl_line -; CHECK-NEXT:.b8 36 // Abbrev [36] 0x1a69:0x2 DW_TAG_structure_type +; CHECK-NEXT:.b8 36 // Abbrev [36] 0x1ab1:0x2 DW_TAG_structure_type ; CHECK-NEXT:.b8 1 // DW_AT_declaration -; CHECK-NEXT:.b8 35 // Abbrev [35] 0x1a6b:0xe DW_TAG_typedef -; CHECK-NEXT:.b32 6777 // DW_AT_type +; CHECK-NEXT:.b8 35 // Abbrev [35] 0x1ab3:0xe DW_TAG_typedef +; CHECK-NEXT:.b32 6849 // DW_AT_type ; CHECK-NEXT:.b8 108 // DW_AT_name ; CHECK-NEXT:.b8 100 ; CHECK-NEXT:.b8 105 @@ -5974,35 +6039,35 @@ if.end: ; preds = %if.then, %entry ; CHECK-NEXT:.b8 0 ; CHECK-NEXT:.b8 7 // DW_AT_decl_file ; CHECK-NEXT:.b8 109 // DW_AT_decl_line -; CHECK-NEXT:.b8 37 // Abbrev [37] 0x1a79:0x22 DW_TAG_structure_type +; CHECK-NEXT:.b8 37 // Abbrev [37] 0x1ac1:0x22 DW_TAG_structure_type ; CHECK-NEXT:.b8 16 // 
DW_AT_byte_size ; CHECK-NEXT:.b8 7 // DW_AT_decl_file ; CHECK-NEXT:.b8 105 // DW_AT_decl_line -; CHECK-NEXT:.b8 11 // Abbrev [11] 0x1a7d:0xf DW_TAG_member +; CHECK-NEXT:.b8 11 // Abbrev [11] 0x1ac5:0xf DW_TAG_member ; CHECK-NEXT:.b8 113 // DW_AT_name ; CHECK-NEXT:.b8 117 ; CHECK-NEXT:.b8 111 ; CHECK-NEXT:.b8 116 ; CHECK-NEXT:.b8 0 -; CHECK-NEXT:.b32 5170 // DW_AT_type +; CHECK-NEXT:.b32 5242 // DW_AT_type ; CHECK-NEXT:.b8 7 // DW_AT_decl_file ; CHECK-NEXT:.b8 107 // DW_AT_decl_line ; CHECK-NEXT:.b8 2 // DW_AT_data_member_location ; CHECK-NEXT:.b8 35 ; CHECK-NEXT:.b8 0 -; CHECK-NEXT:.b8 11 // Abbrev [11] 0x1a8c:0xe DW_TAG_member +; CHECK-NEXT:.b8 11 // Abbrev [11] 0x1ad4:0xe DW_TAG_member ; CHECK-NEXT:.b8 114 // DW_AT_name ; CHECK-NEXT:.b8 101 ; CHECK-NEXT:.b8 109 ; CHECK-NEXT:.b8 0 -; CHECK-NEXT:.b32 5170 // DW_AT_type +; CHECK-NEXT:.b32 5242 // DW_AT_type ; CHECK-NEXT:.b8 7 // DW_AT_decl_file ; CHECK-NEXT:.b8 108 // DW_AT_decl_line ; CHECK-NEXT:.b8 2 // DW_AT_data_member_location ; CHECK-NEXT:.b8 35 ; CHECK-NEXT:.b8 8 ; CHECK-NEXT:.b8 0 // End Of Children Mark -; CHECK-NEXT:.b8 38 // Abbrev [38] 0x1a9b:0xd DW_TAG_subprogram +; CHECK-NEXT:.b8 38 // Abbrev [38] 0x1ae3:0xd DW_TAG_subprogram ; CHECK-NEXT:.b8 97 // DW_AT_name ; CHECK-NEXT:.b8 98 ; CHECK-NEXT:.b8 111 @@ -6015,7 +6080,7 @@ if.end: ; preds = %if.then, %entry ; CHECK-NEXT:.b8 1 // DW_AT_declaration ; CHECK-NEXT:.b8 1 // DW_AT_external ; CHECK-NEXT:.b8 1 // DW_AT_noreturn -; CHECK-NEXT:.b8 39 // Abbrev [39] 0x1aa8:0x14 DW_TAG_subprogram +; CHECK-NEXT:.b8 39 // Abbrev [39] 0x1af0:0x14 DW_TAG_subprogram ; CHECK-NEXT:.b8 97 // DW_AT_name ; CHECK-NEXT:.b8 98 ; CHECK-NEXT:.b8 115 @@ -6023,13 +6088,13 @@ if.end: ; preds = %if.then, %entry ; CHECK-NEXT:.b8 7 // DW_AT_decl_file ; CHECK-NEXT:.b8 7 // DW_AT_decl_line ; CHECK-NEXT:.b8 3 -; CHECK-NEXT:.b32 4585 // DW_AT_type +; CHECK-NEXT:.b32 4657 // DW_AT_type ; CHECK-NEXT:.b8 1 // DW_AT_declaration ; CHECK-NEXT:.b8 1 // DW_AT_external -; CHECK-NEXT:.b8 7 // Abbrev 
[7] 0x1ab6:0x5 DW_TAG_formal_parameter -; CHECK-NEXT:.b32 4585 // DW_AT_type +; CHECK-NEXT:.b8 7 // Abbrev [7] 0x1afe:0x5 DW_TAG_formal_parameter +; CHECK-NEXT:.b32 4657 // DW_AT_type ; CHECK-NEXT:.b8 0 // End Of Children Mark -; CHECK-NEXT:.b8 39 // Abbrev [39] 0x1abc:0x17 DW_TAG_subprogram +; CHECK-NEXT:.b8 39 // Abbrev [39] 0x1b04:0x17 DW_TAG_subprogram ; CHECK-NEXT:.b8 97 // DW_AT_name ; CHECK-NEXT:.b8 116 ; CHECK-NEXT:.b8 101 @@ -6040,16 +6105,16 @@ if.end: ; preds = %if.then, %entry ; CHECK-NEXT:.b8 7 // DW_AT_decl_file ; CHECK-NEXT:.b8 7 // DW_AT_decl_line ; CHECK-NEXT:.b8 2 -; CHECK-NEXT:.b32 4585 // DW_AT_type +; CHECK-NEXT:.b32 4657 // DW_AT_type ; CHECK-NEXT:.b8 1 // DW_AT_declaration ; CHECK-NEXT:.b8 1 // DW_AT_external -; CHECK-NEXT:.b8 7 // Abbrev [7] 0x1acd:0x5 DW_TAG_formal_parameter -; CHECK-NEXT:.b32 6867 // DW_AT_type +; CHECK-NEXT:.b8 7 // Abbrev [7] 0x1b15:0x5 DW_TAG_formal_parameter +; CHECK-NEXT:.b32 6939 // DW_AT_type ; CHECK-NEXT:.b8 0 // End Of Children Mark -; CHECK-NEXT:.b8 12 // Abbrev [12] 0x1ad3:0x5 DW_TAG_pointer_type -; CHECK-NEXT:.b32 6872 // DW_AT_type -; CHECK-NEXT:.b8 40 // Abbrev [40] 0x1ad8:0x1 DW_TAG_subroutine_type -; CHECK-NEXT:.b8 34 // Abbrev [34] 0x1ad9:0x14 DW_TAG_subprogram +; CHECK-NEXT:.b8 12 // Abbrev [12] 0x1b1b:0x5 DW_TAG_pointer_type +; CHECK-NEXT:.b32 6944 // DW_AT_type +; CHECK-NEXT:.b8 40 // Abbrev [40] 0x1b20:0x1 DW_TAG_subroutine_type +; CHECK-NEXT:.b8 34 // Abbrev [34] 0x1b21:0x14 DW_TAG_subprogram ; CHECK-NEXT:.b8 97 // DW_AT_name ; CHECK-NEXT:.b8 116 ; CHECK-NEXT:.b8 111 @@ -6057,13 +6122,13 @@ if.end: ; preds = %if.then, %entry ; CHECK-NEXT:.b8 0 ; CHECK-NEXT:.b8 9 // DW_AT_decl_file ; CHECK-NEXT:.b8 26 // DW_AT_decl_line -; CHECK-NEXT:.b32 5627 // DW_AT_type +; CHECK-NEXT:.b32 5699 // DW_AT_type ; CHECK-NEXT:.b8 1 // DW_AT_declaration ; CHECK-NEXT:.b8 1 // DW_AT_external -; CHECK-NEXT:.b8 7 // Abbrev [7] 0x1ae7:0x5 DW_TAG_formal_parameter -; CHECK-NEXT:.b32 5637 // DW_AT_type +; CHECK-NEXT:.b8 7 // 
Abbrev [7] 0x1b2f:0x5 DW_TAG_formal_parameter +; CHECK-NEXT:.b32 5709 // DW_AT_type ; CHECK-NEXT:.b8 0 // End Of Children Mark -; CHECK-NEXT:.b8 39 // Abbrev [39] 0x1aed:0x15 DW_TAG_subprogram +; CHECK-NEXT:.b8 39 // Abbrev [39] 0x1b35:0x15 DW_TAG_subprogram ; CHECK-NEXT:.b8 97 // DW_AT_name ; CHECK-NEXT:.b8 116 ; CHECK-NEXT:.b8 111 @@ -6072,13 +6137,13 @@ if.end: ; preds = %if.then, %entry ; CHECK-NEXT:.b8 7 // DW_AT_decl_file ; CHECK-NEXT:.b8 22 // DW_AT_decl_line ; CHECK-NEXT:.b8 1 -; CHECK-NEXT:.b32 4585 // DW_AT_type +; CHECK-NEXT:.b32 4657 // DW_AT_type ; CHECK-NEXT:.b8 1 // DW_AT_declaration ; CHECK-NEXT:.b8 1 // DW_AT_external -; CHECK-NEXT:.b8 7 // Abbrev [7] 0x1afc:0x5 DW_TAG_formal_parameter -; CHECK-NEXT:.b32 5637 // DW_AT_type +; CHECK-NEXT:.b8 7 // Abbrev [7] 0x1b44:0x5 DW_TAG_formal_parameter +; CHECK-NEXT:.b32 5709 // DW_AT_type ; CHECK-NEXT:.b8 0 // End Of Children Mark -; CHECK-NEXT:.b8 39 // Abbrev [39] 0x1b02:0x15 DW_TAG_subprogram +; CHECK-NEXT:.b8 39 // Abbrev [39] 0x1b4a:0x15 DW_TAG_subprogram ; CHECK-NEXT:.b8 97 // DW_AT_name ; CHECK-NEXT:.b8 116 ; CHECK-NEXT:.b8 111 @@ -6087,13 +6152,13 @@ if.end: ; preds = %if.then, %entry ; CHECK-NEXT:.b8 7 // DW_AT_decl_file ; CHECK-NEXT:.b8 27 // DW_AT_decl_line ; CHECK-NEXT:.b8 1 -; CHECK-NEXT:.b32 5170 // DW_AT_type +; CHECK-NEXT:.b32 5242 // DW_AT_type ; CHECK-NEXT:.b8 1 // DW_AT_declaration ; CHECK-NEXT:.b8 1 // DW_AT_external -; CHECK-NEXT:.b8 7 // Abbrev [7] 0x1b11:0x5 DW_TAG_formal_parameter -; CHECK-NEXT:.b32 5637 // DW_AT_type +; CHECK-NEXT:.b8 7 // Abbrev [7] 0x1b59:0x5 DW_TAG_formal_parameter +; CHECK-NEXT:.b32 5709 // DW_AT_type ; CHECK-NEXT:.b8 0 // End Of Children Mark -; CHECK-NEXT:.b8 34 // Abbrev [34] 0x1b17:0x2b DW_TAG_subprogram +; CHECK-NEXT:.b8 34 // Abbrev [34] 0x1b5f:0x2b DW_TAG_subprogram ; CHECK-NEXT:.b8 98 // DW_AT_name ; CHECK-NEXT:.b8 115 ; CHECK-NEXT:.b8 101 @@ -6104,26 +6169,26 @@ if.end: ; preds = %if.then, %entry ; CHECK-NEXT:.b8 0 ; CHECK-NEXT:.b8 10 // DW_AT_decl_file ; 
CHECK-NEXT:.b8 20 // DW_AT_decl_line -; CHECK-NEXT:.b32 6978 // DW_AT_type +; CHECK-NEXT:.b32 7050 // DW_AT_type ; CHECK-NEXT:.b8 1 // DW_AT_declaration ; CHECK-NEXT:.b8 1 // DW_AT_external -; CHECK-NEXT:.b8 7 // Abbrev [7] 0x1b28:0x5 DW_TAG_formal_parameter -; CHECK-NEXT:.b32 6979 // DW_AT_type -; CHECK-NEXT:.b8 7 // Abbrev [7] 0x1b2d:0x5 DW_TAG_formal_parameter -; CHECK-NEXT:.b32 6979 // DW_AT_type -; CHECK-NEXT:.b8 7 // Abbrev [7] 0x1b32:0x5 DW_TAG_formal_parameter -; CHECK-NEXT:.b32 6985 // DW_AT_type -; CHECK-NEXT:.b8 7 // Abbrev [7] 0x1b37:0x5 DW_TAG_formal_parameter -; CHECK-NEXT:.b32 6985 // DW_AT_type -; CHECK-NEXT:.b8 7 // Abbrev [7] 0x1b3c:0x5 DW_TAG_formal_parameter -; CHECK-NEXT:.b32 7020 // DW_AT_type -; CHECK-NEXT:.b8 0 // End Of Children Mark -; CHECK-NEXT:.b8 41 // Abbrev [41] 0x1b42:0x1 DW_TAG_pointer_type -; CHECK-NEXT:.b8 12 // Abbrev [12] 0x1b43:0x5 DW_TAG_pointer_type -; CHECK-NEXT:.b32 6984 // DW_AT_type -; CHECK-NEXT:.b8 42 // Abbrev [42] 0x1b48:0x1 DW_TAG_const_type -; CHECK-NEXT:.b8 35 // Abbrev [35] 0x1b49:0xe DW_TAG_typedef -; CHECK-NEXT:.b32 6999 // DW_AT_type +; CHECK-NEXT:.b8 7 // Abbrev [7] 0x1b70:0x5 DW_TAG_formal_parameter +; CHECK-NEXT:.b32 7051 // DW_AT_type +; CHECK-NEXT:.b8 7 // Abbrev [7] 0x1b75:0x5 DW_TAG_formal_parameter +; CHECK-NEXT:.b32 7051 // DW_AT_type +; CHECK-NEXT:.b8 7 // Abbrev [7] 0x1b7a:0x5 DW_TAG_formal_parameter +; CHECK-NEXT:.b32 7057 // DW_AT_type +; CHECK-NEXT:.b8 7 // Abbrev [7] 0x1b7f:0x5 DW_TAG_formal_parameter +; CHECK-NEXT:.b32 7057 // DW_AT_type +; CHECK-NEXT:.b8 7 // Abbrev [7] 0x1b84:0x5 DW_TAG_formal_parameter +; CHECK-NEXT:.b32 7092 // DW_AT_type +; CHECK-NEXT:.b8 0 // End Of Children Mark +; CHECK-NEXT:.b8 41 // Abbrev [41] 0x1b8a:0x1 DW_TAG_pointer_type +; CHECK-NEXT:.b8 12 // Abbrev [12] 0x1b8b:0x5 DW_TAG_pointer_type +; CHECK-NEXT:.b32 7056 // DW_AT_type +; CHECK-NEXT:.b8 42 // Abbrev [42] 0x1b90:0x1 DW_TAG_const_type +; CHECK-NEXT:.b8 35 // Abbrev [35] 0x1b91:0xe DW_TAG_typedef +; 
CHECK-NEXT:.b32 7071 // DW_AT_type ; CHECK-NEXT:.b8 115 // DW_AT_name ; CHECK-NEXT:.b8 105 ; CHECK-NEXT:.b8 122 @@ -6133,7 +6198,7 @@ if.end: ; preds = %if.then, %entry ; CHECK-NEXT:.b8 0 ; CHECK-NEXT:.b8 11 // DW_AT_decl_file ; CHECK-NEXT:.b8 62 // DW_AT_decl_line -; CHECK-NEXT:.b8 10 // Abbrev [10] 0x1b57:0x15 DW_TAG_base_type +; CHECK-NEXT:.b8 10 // Abbrev [10] 0x1b9f:0x15 DW_TAG_base_type ; CHECK-NEXT:.b8 108 // DW_AT_name ; CHECK-NEXT:.b8 111 ; CHECK-NEXT:.b8 110 @@ -6154,8 +6219,8 @@ if.end: ; preds = %if.then, %entry ; CHECK-NEXT:.b8 0 ; CHECK-NEXT:.b8 7 // DW_AT_encoding ; CHECK-NEXT:.b8 8 // DW_AT_byte_size -; CHECK-NEXT:.b8 20 // Abbrev [20] 0x1b6c:0x16 DW_TAG_typedef -; CHECK-NEXT:.b32 7042 // DW_AT_type +; CHECK-NEXT:.b8 20 // Abbrev [20] 0x1bb4:0x16 DW_TAG_typedef +; CHECK-NEXT:.b32 7114 // DW_AT_type ; CHECK-NEXT:.b8 95 // DW_AT_name ; CHECK-NEXT:.b8 95 ; CHECK-NEXT:.b8 99 @@ -6173,16 +6238,16 @@ if.end: ; preds = %if.then, %entry ; CHECK-NEXT:.b8 7 // DW_AT_decl_file ; CHECK-NEXT:.b8 230 // DW_AT_decl_line ; CHECK-NEXT:.b8 2 -; CHECK-NEXT:.b8 12 // Abbrev [12] 0x1b82:0x5 DW_TAG_pointer_type -; CHECK-NEXT:.b32 7047 // DW_AT_type -; CHECK-NEXT:.b8 43 // Abbrev [43] 0x1b87:0x10 DW_TAG_subroutine_type -; CHECK-NEXT:.b32 4585 // DW_AT_type -; CHECK-NEXT:.b8 7 // Abbrev [7] 0x1b8c:0x5 DW_TAG_formal_parameter -; CHECK-NEXT:.b32 6979 // DW_AT_type -; CHECK-NEXT:.b8 7 // Abbrev [7] 0x1b91:0x5 DW_TAG_formal_parameter -; CHECK-NEXT:.b32 6979 // DW_AT_type -; CHECK-NEXT:.b8 0 // End Of Children Mark -; CHECK-NEXT:.b8 39 // Abbrev [39] 0x1b97:0x1c DW_TAG_subprogram +; CHECK-NEXT:.b8 12 // Abbrev [12] 0x1bca:0x5 DW_TAG_pointer_type +; CHECK-NEXT:.b32 7119 // DW_AT_type +; CHECK-NEXT:.b8 43 // Abbrev [43] 0x1bcf:0x10 DW_TAG_subroutine_type +; CHECK-NEXT:.b32 4657 // DW_AT_type +; CHECK-NEXT:.b8 7 // Abbrev [7] 0x1bd4:0x5 DW_TAG_formal_parameter +; CHECK-NEXT:.b32 7051 // DW_AT_type +; CHECK-NEXT:.b8 7 // Abbrev [7] 0x1bd9:0x5 DW_TAG_formal_parameter +; 
CHECK-NEXT:.b32 7051 // DW_AT_type +; CHECK-NEXT:.b8 0 // End Of Children Mark +; CHECK-NEXT:.b8 39 // Abbrev [39] 0x1bdf:0x1c DW_TAG_subprogram ; CHECK-NEXT:.b8 99 // DW_AT_name ; CHECK-NEXT:.b8 97 ; CHECK-NEXT:.b8 108 @@ -6193,15 +6258,15 @@ if.end: ; preds = %if.then, %entry ; CHECK-NEXT:.b8 7 // DW_AT_decl_file ; CHECK-NEXT:.b8 212 // DW_AT_decl_line ; CHECK-NEXT:.b8 1 -; CHECK-NEXT:.b32 6978 // DW_AT_type +; CHECK-NEXT:.b32 7050 // DW_AT_type ; CHECK-NEXT:.b8 1 // DW_AT_declaration ; CHECK-NEXT:.b8 1 // DW_AT_external -; CHECK-NEXT:.b8 7 // Abbrev [7] 0x1ba8:0x5 DW_TAG_formal_parameter -; CHECK-NEXT:.b32 6985 // DW_AT_type -; CHECK-NEXT:.b8 7 // Abbrev [7] 0x1bad:0x5 DW_TAG_formal_parameter -; CHECK-NEXT:.b32 6985 // DW_AT_type +; CHECK-NEXT:.b8 7 // Abbrev [7] 0x1bf0:0x5 DW_TAG_formal_parameter +; CHECK-NEXT:.b32 7057 // DW_AT_type +; CHECK-NEXT:.b8 7 // Abbrev [7] 0x1bf5:0x5 DW_TAG_formal_parameter +; CHECK-NEXT:.b32 7057 // DW_AT_type ; CHECK-NEXT:.b8 0 // End Of Children Mark -; CHECK-NEXT:.b8 39 // Abbrev [39] 0x1bb3:0x19 DW_TAG_subprogram +; CHECK-NEXT:.b8 39 // Abbrev [39] 0x1bfb:0x19 DW_TAG_subprogram ; CHECK-NEXT:.b8 100 // DW_AT_name ; CHECK-NEXT:.b8 105 ; CHECK-NEXT:.b8 118 @@ -6209,15 +6274,15 @@ if.end: ; preds = %if.then, %entry ; CHECK-NEXT:.b8 7 // DW_AT_decl_file ; CHECK-NEXT:.b8 21 // DW_AT_decl_line ; CHECK-NEXT:.b8 3 -; CHECK-NEXT:.b32 6748 // DW_AT_type +; CHECK-NEXT:.b32 6820 // DW_AT_type ; CHECK-NEXT:.b8 1 // DW_AT_declaration ; CHECK-NEXT:.b8 1 // DW_AT_external -; CHECK-NEXT:.b8 7 // Abbrev [7] 0x1bc1:0x5 DW_TAG_formal_parameter -; CHECK-NEXT:.b32 4585 // DW_AT_type -; CHECK-NEXT:.b8 7 // Abbrev [7] 0x1bc6:0x5 DW_TAG_formal_parameter -; CHECK-NEXT:.b32 4585 // DW_AT_type +; CHECK-NEXT:.b8 7 // Abbrev [7] 0x1c09:0x5 DW_TAG_formal_parameter +; CHECK-NEXT:.b32 4657 // DW_AT_type +; CHECK-NEXT:.b8 7 // Abbrev [7] 0x1c0e:0x5 DW_TAG_formal_parameter +; CHECK-NEXT:.b32 4657 // DW_AT_type ; CHECK-NEXT:.b8 0 // End Of Children Mark -; 
CHECK-NEXT:.b8 44 // Abbrev [44] 0x1bcc:0x12 DW_TAG_subprogram +; CHECK-NEXT:.b8 44 // Abbrev [44] 0x1c14:0x12 DW_TAG_subprogram ; CHECK-NEXT:.b8 101 // DW_AT_name ; CHECK-NEXT:.b8 120 ; CHECK-NEXT:.b8 105 @@ -6229,10 +6294,10 @@ if.end: ; preds = %if.then, %entry ; CHECK-NEXT:.b8 1 // DW_AT_declaration ; CHECK-NEXT:.b8 1 // DW_AT_external ; CHECK-NEXT:.b8 1 // DW_AT_noreturn -; CHECK-NEXT:.b8 7 // Abbrev [7] 0x1bd8:0x5 DW_TAG_formal_parameter -; CHECK-NEXT:.b32 4585 // DW_AT_type +; CHECK-NEXT:.b8 7 // Abbrev [7] 0x1c20:0x5 DW_TAG_formal_parameter +; CHECK-NEXT:.b32 4657 // DW_AT_type ; CHECK-NEXT:.b8 0 // End Of Children Mark -; CHECK-NEXT:.b8 18 // Abbrev [18] 0x1bde:0x11 DW_TAG_subprogram +; CHECK-NEXT:.b8 45 // Abbrev [45] 0x1c26:0x11 DW_TAG_subprogram ; CHECK-NEXT:.b8 102 // DW_AT_name ; CHECK-NEXT:.b8 114 ; CHECK-NEXT:.b8 101 @@ -6243,10 +6308,10 @@ if.end: ; preds = %if.then, %entry ; CHECK-NEXT:.b8 1 ; CHECK-NEXT:.b8 1 // DW_AT_declaration ; CHECK-NEXT:.b8 1 // DW_AT_external -; CHECK-NEXT:.b8 7 // Abbrev [7] 0x1be9:0x5 DW_TAG_formal_parameter -; CHECK-NEXT:.b32 6978 // DW_AT_type +; CHECK-NEXT:.b8 7 // Abbrev [7] 0x1c31:0x5 DW_TAG_formal_parameter +; CHECK-NEXT:.b32 7050 // DW_AT_type ; CHECK-NEXT:.b8 0 // End Of Children Mark -; CHECK-NEXT:.b8 39 // Abbrev [39] 0x1bef:0x17 DW_TAG_subprogram +; CHECK-NEXT:.b8 39 // Abbrev [39] 0x1c37:0x17 DW_TAG_subprogram ; CHECK-NEXT:.b8 103 // DW_AT_name ; CHECK-NEXT:.b8 101 ; CHECK-NEXT:.b8 116 @@ -6257,15 +6322,15 @@ if.end: ; preds = %if.then, %entry ; CHECK-NEXT:.b8 7 // DW_AT_decl_file ; CHECK-NEXT:.b8 52 // DW_AT_decl_line ; CHECK-NEXT:.b8 2 -; CHECK-NEXT:.b32 7174 // DW_AT_type +; CHECK-NEXT:.b32 7246 // DW_AT_type ; CHECK-NEXT:.b8 1 // DW_AT_declaration ; CHECK-NEXT:.b8 1 // DW_AT_external -; CHECK-NEXT:.b8 7 // Abbrev [7] 0x1c00:0x5 DW_TAG_formal_parameter -; CHECK-NEXT:.b32 5637 // DW_AT_type +; CHECK-NEXT:.b8 7 // Abbrev [7] 0x1c48:0x5 DW_TAG_formal_parameter +; CHECK-NEXT:.b32 5709 // DW_AT_type ; 
CHECK-NEXT:.b8 0 // End Of Children Mark -; CHECK-NEXT:.b8 12 // Abbrev [12] 0x1c06:0x5 DW_TAG_pointer_type -; CHECK-NEXT:.b32 5647 // DW_AT_type -; CHECK-NEXT:.b8 39 // Abbrev [39] 0x1c0b:0x15 DW_TAG_subprogram +; CHECK-NEXT:.b8 12 // Abbrev [12] 0x1c4e:0x5 DW_TAG_pointer_type +; CHECK-NEXT:.b32 5719 // DW_AT_type +; CHECK-NEXT:.b8 39 // Abbrev [39] 0x1c53:0x15 DW_TAG_subprogram ; CHECK-NEXT:.b8 108 // DW_AT_name ; CHECK-NEXT:.b8 97 ; CHECK-NEXT:.b8 98 @@ -6274,13 +6339,13 @@ if.end: ; preds = %if.then, %entry ; CHECK-NEXT:.b8 7 // DW_AT_decl_file ; CHECK-NEXT:.b8 8 // DW_AT_decl_line ; CHECK-NEXT:.b8 3 -; CHECK-NEXT:.b32 5170 // DW_AT_type +; CHECK-NEXT:.b32 5242 // DW_AT_type ; CHECK-NEXT:.b8 1 // DW_AT_declaration ; CHECK-NEXT:.b8 1 // DW_AT_external -; CHECK-NEXT:.b8 7 // Abbrev [7] 0x1c1a:0x5 DW_TAG_formal_parameter -; CHECK-NEXT:.b32 5170 // DW_AT_type +; CHECK-NEXT:.b8 7 // Abbrev [7] 0x1c62:0x5 DW_TAG_formal_parameter +; CHECK-NEXT:.b32 5242 // DW_AT_type ; CHECK-NEXT:.b8 0 // End Of Children Mark -; CHECK-NEXT:.b8 39 // Abbrev [39] 0x1c20:0x1a DW_TAG_subprogram +; CHECK-NEXT:.b8 39 // Abbrev [39] 0x1c68:0x1a DW_TAG_subprogram ; CHECK-NEXT:.b8 108 // DW_AT_name ; CHECK-NEXT:.b8 100 ; CHECK-NEXT:.b8 105 @@ -6289,15 +6354,15 @@ if.end: ; preds = %if.then, %entry ; CHECK-NEXT:.b8 7 // DW_AT_decl_file ; CHECK-NEXT:.b8 23 // DW_AT_decl_line ; CHECK-NEXT:.b8 3 -; CHECK-NEXT:.b32 6763 // DW_AT_type +; CHECK-NEXT:.b32 6835 // DW_AT_type ; CHECK-NEXT:.b8 1 // DW_AT_declaration ; CHECK-NEXT:.b8 1 // DW_AT_external -; CHECK-NEXT:.b8 7 // Abbrev [7] 0x1c2f:0x5 DW_TAG_formal_parameter -; CHECK-NEXT:.b32 5170 // DW_AT_type -; CHECK-NEXT:.b8 7 // Abbrev [7] 0x1c34:0x5 DW_TAG_formal_parameter -; CHECK-NEXT:.b32 5170 // DW_AT_type +; CHECK-NEXT:.b8 7 // Abbrev [7] 0x1c77:0x5 DW_TAG_formal_parameter +; CHECK-NEXT:.b32 5242 // DW_AT_type +; CHECK-NEXT:.b8 7 // Abbrev [7] 0x1c7c:0x5 DW_TAG_formal_parameter +; CHECK-NEXT:.b32 5242 // DW_AT_type ; CHECK-NEXT:.b8 0 // End Of 
Children Mark -; CHECK-NEXT:.b8 39 // Abbrev [39] 0x1c3a:0x17 DW_TAG_subprogram +; CHECK-NEXT:.b8 39 // Abbrev [39] 0x1c82:0x17 DW_TAG_subprogram ; CHECK-NEXT:.b8 109 // DW_AT_name ; CHECK-NEXT:.b8 97 ; CHECK-NEXT:.b8 108 @@ -6308,13 +6373,13 @@ if.end: ; preds = %if.then, %entry ; CHECK-NEXT:.b8 7 // DW_AT_decl_file ; CHECK-NEXT:.b8 210 // DW_AT_decl_line ; CHECK-NEXT:.b8 1 -; CHECK-NEXT:.b32 6978 // DW_AT_type +; CHECK-NEXT:.b32 7050 // DW_AT_type ; CHECK-NEXT:.b8 1 // DW_AT_declaration ; CHECK-NEXT:.b8 1 // DW_AT_external -; CHECK-NEXT:.b8 7 // Abbrev [7] 0x1c4b:0x5 DW_TAG_formal_parameter -; CHECK-NEXT:.b32 6985 // DW_AT_type +; CHECK-NEXT:.b8 7 // Abbrev [7] 0x1c93:0x5 DW_TAG_formal_parameter +; CHECK-NEXT:.b32 7057 // DW_AT_type ; CHECK-NEXT:.b8 0 // End Of Children Mark -; CHECK-NEXT:.b8 39 // Abbrev [39] 0x1c51:0x1b DW_TAG_subprogram +; CHECK-NEXT:.b8 39 // Abbrev [39] 0x1c99:0x1b DW_TAG_subprogram ; CHECK-NEXT:.b8 109 // DW_AT_name ; CHECK-NEXT:.b8 98 ; CHECK-NEXT:.b8 108 @@ -6324,15 +6389,15 @@ if.end: ; preds = %if.then, %entry ; CHECK-NEXT:.b8 7 // DW_AT_decl_file ; CHECK-NEXT:.b8 95 // DW_AT_decl_line ; CHECK-NEXT:.b8 3 -; CHECK-NEXT:.b32 4585 // DW_AT_type +; CHECK-NEXT:.b32 4657 // DW_AT_type ; CHECK-NEXT:.b8 1 // DW_AT_declaration ; CHECK-NEXT:.b8 1 // DW_AT_external -; CHECK-NEXT:.b8 7 // Abbrev [7] 0x1c61:0x5 DW_TAG_formal_parameter -; CHECK-NEXT:.b32 5637 // DW_AT_type -; CHECK-NEXT:.b8 7 // Abbrev [7] 0x1c66:0x5 DW_TAG_formal_parameter -; CHECK-NEXT:.b32 6985 // DW_AT_type +; CHECK-NEXT:.b8 7 // Abbrev [7] 0x1ca9:0x5 DW_TAG_formal_parameter +; CHECK-NEXT:.b32 5709 // DW_AT_type +; CHECK-NEXT:.b8 7 // Abbrev [7] 0x1cae:0x5 DW_TAG_formal_parameter +; CHECK-NEXT:.b32 7057 // DW_AT_type ; CHECK-NEXT:.b8 0 // End Of Children Mark -; CHECK-NEXT:.b8 39 // Abbrev [39] 0x1c6c:0x23 DW_TAG_subprogram +; CHECK-NEXT:.b8 39 // Abbrev [39] 0x1cb4:0x23 DW_TAG_subprogram ; CHECK-NEXT:.b8 109 // DW_AT_name ; CHECK-NEXT:.b8 98 ; CHECK-NEXT:.b8 115 @@ -6345,19 
+6410,19 @@ if.end: ; preds = %if.then, %entry ; CHECK-NEXT:.b8 7 // DW_AT_decl_file ; CHECK-NEXT:.b8 106 // DW_AT_decl_line ; CHECK-NEXT:.b8 3 -; CHECK-NEXT:.b32 6985 // DW_AT_type +; CHECK-NEXT:.b32 7057 // DW_AT_type ; CHECK-NEXT:.b8 1 // DW_AT_declaration ; CHECK-NEXT:.b8 1 // DW_AT_external -; CHECK-NEXT:.b8 7 // Abbrev [7] 0x1c7f:0x5 DW_TAG_formal_parameter -; CHECK-NEXT:.b32 7311 // DW_AT_type -; CHECK-NEXT:.b8 7 // Abbrev [7] 0x1c84:0x5 DW_TAG_formal_parameter -; CHECK-NEXT:.b32 5637 // DW_AT_type -; CHECK-NEXT:.b8 7 // Abbrev [7] 0x1c89:0x5 DW_TAG_formal_parameter -; CHECK-NEXT:.b32 6985 // DW_AT_type -; CHECK-NEXT:.b8 0 // End Of Children Mark -; CHECK-NEXT:.b8 12 // Abbrev [12] 0x1c8f:0x5 DW_TAG_pointer_type -; CHECK-NEXT:.b32 7316 // DW_AT_type -; CHECK-NEXT:.b8 10 // Abbrev [10] 0x1c94:0xb DW_TAG_base_type +; CHECK-NEXT:.b8 7 // Abbrev [7] 0x1cc7:0x5 DW_TAG_formal_parameter +; CHECK-NEXT:.b32 7383 // DW_AT_type +; CHECK-NEXT:.b8 7 // Abbrev [7] 0x1ccc:0x5 DW_TAG_formal_parameter +; CHECK-NEXT:.b32 5709 // DW_AT_type +; CHECK-NEXT:.b8 7 // Abbrev [7] 0x1cd1:0x5 DW_TAG_formal_parameter +; CHECK-NEXT:.b32 7057 // DW_AT_type +; CHECK-NEXT:.b8 0 // End Of Children Mark +; CHECK-NEXT:.b8 12 // Abbrev [12] 0x1cd7:0x5 DW_TAG_pointer_type +; CHECK-NEXT:.b32 7388 // DW_AT_type +; CHECK-NEXT:.b8 10 // Abbrev [10] 0x1cdc:0xb DW_TAG_base_type ; CHECK-NEXT:.b8 119 // DW_AT_name ; CHECK-NEXT:.b8 99 ; CHECK-NEXT:.b8 104 @@ -6368,7 +6433,7 @@ if.end: ; preds = %if.then, %entry ; CHECK-NEXT:.b8 0 ; CHECK-NEXT:.b8 5 // DW_AT_encoding ; CHECK-NEXT:.b8 4 // DW_AT_byte_size -; CHECK-NEXT:.b8 39 // Abbrev [39] 0x1c9f:0x21 DW_TAG_subprogram +; CHECK-NEXT:.b8 39 // Abbrev [39] 0x1ce7:0x21 DW_TAG_subprogram ; CHECK-NEXT:.b8 109 // DW_AT_name ; CHECK-NEXT:.b8 98 ; CHECK-NEXT:.b8 116 @@ -6379,17 +6444,17 @@ if.end: ; preds = %if.then, %entry ; CHECK-NEXT:.b8 7 // DW_AT_decl_file ; CHECK-NEXT:.b8 98 // DW_AT_decl_line ; CHECK-NEXT:.b8 3 -; CHECK-NEXT:.b32 4585 // DW_AT_type +; 
CHECK-NEXT:.b32 4657 // DW_AT_type ; CHECK-NEXT:.b8 1 // DW_AT_declaration ; CHECK-NEXT:.b8 1 // DW_AT_external -; CHECK-NEXT:.b8 7 // Abbrev [7] 0x1cb0:0x5 DW_TAG_formal_parameter -; CHECK-NEXT:.b32 7311 // DW_AT_type -; CHECK-NEXT:.b8 7 // Abbrev [7] 0x1cb5:0x5 DW_TAG_formal_parameter -; CHECK-NEXT:.b32 5637 // DW_AT_type -; CHECK-NEXT:.b8 7 // Abbrev [7] 0x1cba:0x5 DW_TAG_formal_parameter -; CHECK-NEXT:.b32 6985 // DW_AT_type -; CHECK-NEXT:.b8 0 // End Of Children Mark -; CHECK-NEXT:.b8 18 // Abbrev [18] 0x1cc0:0x21 DW_TAG_subprogram +; CHECK-NEXT:.b8 7 // Abbrev [7] 0x1cf8:0x5 DW_TAG_formal_parameter +; CHECK-NEXT:.b32 7383 // DW_AT_type +; CHECK-NEXT:.b8 7 // Abbrev [7] 0x1cfd:0x5 DW_TAG_formal_parameter +; CHECK-NEXT:.b32 5709 // DW_AT_type +; CHECK-NEXT:.b8 7 // Abbrev [7] 0x1d02:0x5 DW_TAG_formal_parameter +; CHECK-NEXT:.b32 7057 // DW_AT_type +; CHECK-NEXT:.b8 0 // End Of Children Mark +; CHECK-NEXT:.b8 45 // Abbrev [45] 0x1d08:0x21 DW_TAG_subprogram ; CHECK-NEXT:.b8 113 // DW_AT_name ; CHECK-NEXT:.b8 115 ; CHECK-NEXT:.b8 111 @@ -6401,16 +6466,16 @@ if.end: ; preds = %if.then, %entry ; CHECK-NEXT:.b8 2 ; CHECK-NEXT:.b8 1 // DW_AT_declaration ; CHECK-NEXT:.b8 1 // DW_AT_external -; CHECK-NEXT:.b8 7 // Abbrev [7] 0x1ccc:0x5 DW_TAG_formal_parameter -; CHECK-NEXT:.b32 6978 // DW_AT_type -; CHECK-NEXT:.b8 7 // Abbrev [7] 0x1cd1:0x5 DW_TAG_formal_parameter -; CHECK-NEXT:.b32 6985 // DW_AT_type -; CHECK-NEXT:.b8 7 // Abbrev [7] 0x1cd6:0x5 DW_TAG_formal_parameter -; CHECK-NEXT:.b32 6985 // DW_AT_type -; CHECK-NEXT:.b8 7 // Abbrev [7] 0x1cdb:0x5 DW_TAG_formal_parameter -; CHECK-NEXT:.b32 7020 // DW_AT_type +; CHECK-NEXT:.b8 7 // Abbrev [7] 0x1d14:0x5 DW_TAG_formal_parameter +; CHECK-NEXT:.b32 7050 // DW_AT_type +; CHECK-NEXT:.b8 7 // Abbrev [7] 0x1d19:0x5 DW_TAG_formal_parameter +; CHECK-NEXT:.b32 7057 // DW_AT_type +; CHECK-NEXT:.b8 7 // Abbrev [7] 0x1d1e:0x5 DW_TAG_formal_parameter +; CHECK-NEXT:.b32 7057 // DW_AT_type +; CHECK-NEXT:.b8 7 // Abbrev [7] 0x1d23:0x5 
DW_TAG_formal_parameter +; CHECK-NEXT:.b32 7092 // DW_AT_type ; CHECK-NEXT:.b8 0 // End Of Children Mark -; CHECK-NEXT:.b8 45 // Abbrev [45] 0x1ce1:0xf DW_TAG_subprogram +; CHECK-NEXT:.b8 46 // Abbrev [46] 0x1d29:0xf DW_TAG_subprogram ; CHECK-NEXT:.b8 114 // DW_AT_name ; CHECK-NEXT:.b8 97 ; CHECK-NEXT:.b8 110 @@ -6419,10 +6484,10 @@ if.end: ; preds = %if.then, %entry ; CHECK-NEXT:.b8 7 // DW_AT_decl_file ; CHECK-NEXT:.b8 118 // DW_AT_decl_line ; CHECK-NEXT:.b8 1 -; CHECK-NEXT:.b32 4585 // DW_AT_type +; CHECK-NEXT:.b32 4657 // DW_AT_type ; CHECK-NEXT:.b8 1 // DW_AT_declaration ; CHECK-NEXT:.b8 1 // DW_AT_external -; CHECK-NEXT:.b8 39 // Abbrev [39] 0x1cf0:0x1d DW_TAG_subprogram +; CHECK-NEXT:.b8 39 // Abbrev [39] 0x1d38:0x1d DW_TAG_subprogram ; CHECK-NEXT:.b8 114 // DW_AT_name ; CHECK-NEXT:.b8 101 ; CHECK-NEXT:.b8 97 @@ -6434,15 +6499,15 @@ if.end: ; preds = %if.then, %entry ; CHECK-NEXT:.b8 7 // DW_AT_decl_file ; CHECK-NEXT:.b8 224 // DW_AT_decl_line ; CHECK-NEXT:.b8 1 -; CHECK-NEXT:.b32 6978 // DW_AT_type +; CHECK-NEXT:.b32 7050 // DW_AT_type ; CHECK-NEXT:.b8 1 // DW_AT_declaration ; CHECK-NEXT:.b8 1 // DW_AT_external -; CHECK-NEXT:.b8 7 // Abbrev [7] 0x1d02:0x5 DW_TAG_formal_parameter -; CHECK-NEXT:.b32 6978 // DW_AT_type -; CHECK-NEXT:.b8 7 // Abbrev [7] 0x1d07:0x5 DW_TAG_formal_parameter -; CHECK-NEXT:.b32 6985 // DW_AT_type +; CHECK-NEXT:.b8 7 // Abbrev [7] 0x1d4a:0x5 DW_TAG_formal_parameter +; CHECK-NEXT:.b32 7050 // DW_AT_type +; CHECK-NEXT:.b8 7 // Abbrev [7] 0x1d4f:0x5 DW_TAG_formal_parameter +; CHECK-NEXT:.b32 7057 // DW_AT_type ; CHECK-NEXT:.b8 0 // End Of Children Mark -; CHECK-NEXT:.b8 18 // Abbrev [18] 0x1d0d:0x12 DW_TAG_subprogram +; CHECK-NEXT:.b8 45 // Abbrev [45] 0x1d55:0x12 DW_TAG_subprogram ; CHECK-NEXT:.b8 115 // DW_AT_name ; CHECK-NEXT:.b8 114 ; CHECK-NEXT:.b8 97 @@ -6454,10 +6519,10 @@ if.end: ; preds = %if.then, %entry ; CHECK-NEXT:.b8 1 ; CHECK-NEXT:.b8 1 // DW_AT_declaration ; CHECK-NEXT:.b8 1 // DW_AT_external -; CHECK-NEXT:.b8 7 // 
Abbrev [7] 0x1d19:0x5 DW_TAG_formal_parameter -; CHECK-NEXT:.b32 603 // DW_AT_type +; CHECK-NEXT:.b8 7 // Abbrev [7] 0x1d61:0x5 DW_TAG_formal_parameter +; CHECK-NEXT:.b32 623 // DW_AT_type ; CHECK-NEXT:.b8 0 // End Of Children Mark -; CHECK-NEXT:.b8 34 // Abbrev [34] 0x1d1f:0x1b DW_TAG_subprogram +; CHECK-NEXT:.b8 34 // Abbrev [34] 0x1d67:0x1b DW_TAG_subprogram ; CHECK-NEXT:.b8 115 // DW_AT_name ; CHECK-NEXT:.b8 116 ; CHECK-NEXT:.b8 114 @@ -6467,17 +6532,17 @@ if.end: ; preds = %if.then, %entry ; CHECK-NEXT:.b8 0 ; CHECK-NEXT:.b8 7 // DW_AT_decl_file ; CHECK-NEXT:.b8 164 // DW_AT_decl_line -; CHECK-NEXT:.b32 5627 // DW_AT_type +; CHECK-NEXT:.b32 5699 // DW_AT_type ; CHECK-NEXT:.b8 1 // DW_AT_declaration ; CHECK-NEXT:.b8 1 // DW_AT_external -; CHECK-NEXT:.b8 7 // Abbrev [7] 0x1d2f:0x5 DW_TAG_formal_parameter -; CHECK-NEXT:.b32 5637 // DW_AT_type -; CHECK-NEXT:.b8 7 // Abbrev [7] 0x1d34:0x5 DW_TAG_formal_parameter -; CHECK-NEXT:.b32 7482 // DW_AT_type -; CHECK-NEXT:.b8 0 // End Of Children Mark -; CHECK-NEXT:.b8 12 // Abbrev [12] 0x1d3a:0x5 DW_TAG_pointer_type -; CHECK-NEXT:.b32 7174 // DW_AT_type -; CHECK-NEXT:.b8 34 // Abbrev [34] 0x1d3f:0x20 DW_TAG_subprogram +; CHECK-NEXT:.b8 7 // Abbrev [7] 0x1d77:0x5 DW_TAG_formal_parameter +; CHECK-NEXT:.b32 5709 // DW_AT_type +; CHECK-NEXT:.b8 7 // Abbrev [7] 0x1d7c:0x5 DW_TAG_formal_parameter +; CHECK-NEXT:.b32 7554 // DW_AT_type +; CHECK-NEXT:.b8 0 // End Of Children Mark +; CHECK-NEXT:.b8 12 // Abbrev [12] 0x1d82:0x5 DW_TAG_pointer_type +; CHECK-NEXT:.b32 7246 // DW_AT_type +; CHECK-NEXT:.b8 34 // Abbrev [34] 0x1d87:0x20 DW_TAG_subprogram ; CHECK-NEXT:.b8 115 // DW_AT_name ; CHECK-NEXT:.b8 116 ; CHECK-NEXT:.b8 114 @@ -6487,17 +6552,17 @@ if.end: ; preds = %if.then, %entry ; CHECK-NEXT:.b8 0 ; CHECK-NEXT:.b8 7 // DW_AT_decl_file ; CHECK-NEXT:.b8 183 // DW_AT_decl_line -; CHECK-NEXT:.b32 5170 // DW_AT_type +; CHECK-NEXT:.b32 5242 // DW_AT_type ; CHECK-NEXT:.b8 1 // DW_AT_declaration ; CHECK-NEXT:.b8 1 // DW_AT_external -; 
CHECK-NEXT:.b8 7 // Abbrev [7] 0x1d4f:0x5 DW_TAG_formal_parameter -; CHECK-NEXT:.b32 5637 // DW_AT_type -; CHECK-NEXT:.b8 7 // Abbrev [7] 0x1d54:0x5 DW_TAG_formal_parameter -; CHECK-NEXT:.b32 7482 // DW_AT_type -; CHECK-NEXT:.b8 7 // Abbrev [7] 0x1d59:0x5 DW_TAG_formal_parameter -; CHECK-NEXT:.b32 4585 // DW_AT_type -; CHECK-NEXT:.b8 0 // End Of Children Mark -; CHECK-NEXT:.b8 34 // Abbrev [34] 0x1d5f:0x21 DW_TAG_subprogram +; CHECK-NEXT:.b8 7 // Abbrev [7] 0x1d97:0x5 DW_TAG_formal_parameter +; CHECK-NEXT:.b32 5709 // DW_AT_type +; CHECK-NEXT:.b8 7 // Abbrev [7] 0x1d9c:0x5 DW_TAG_formal_parameter +; CHECK-NEXT:.b32 7554 // DW_AT_type +; CHECK-NEXT:.b8 7 // Abbrev [7] 0x1da1:0x5 DW_TAG_formal_parameter +; CHECK-NEXT:.b32 4657 // DW_AT_type +; CHECK-NEXT:.b8 0 // End Of Children Mark +; CHECK-NEXT:.b8 34 // Abbrev [34] 0x1da7:0x21 DW_TAG_subprogram ; CHECK-NEXT:.b8 115 // DW_AT_name ; CHECK-NEXT:.b8 116 ; CHECK-NEXT:.b8 114 @@ -6508,17 +6573,17 @@ if.end: ; preds = %if.then, %entry ; CHECK-NEXT:.b8 0 ; CHECK-NEXT:.b8 7 // DW_AT_decl_file ; CHECK-NEXT:.b8 187 // DW_AT_decl_line -; CHECK-NEXT:.b32 6999 // DW_AT_type +; CHECK-NEXT:.b32 7071 // DW_AT_type ; CHECK-NEXT:.b8 1 // DW_AT_declaration ; CHECK-NEXT:.b8 1 // DW_AT_external -; CHECK-NEXT:.b8 7 // Abbrev [7] 0x1d70:0x5 DW_TAG_formal_parameter -; CHECK-NEXT:.b32 5637 // DW_AT_type -; CHECK-NEXT:.b8 7 // Abbrev [7] 0x1d75:0x5 DW_TAG_formal_parameter -; CHECK-NEXT:.b32 7482 // DW_AT_type -; CHECK-NEXT:.b8 7 // Abbrev [7] 0x1d7a:0x5 DW_TAG_formal_parameter -; CHECK-NEXT:.b32 4585 // DW_AT_type -; CHECK-NEXT:.b8 0 // End Of Children Mark -; CHECK-NEXT:.b8 39 // Abbrev [39] 0x1d80:0x17 DW_TAG_subprogram +; CHECK-NEXT:.b8 7 // Abbrev [7] 0x1db8:0x5 DW_TAG_formal_parameter +; CHECK-NEXT:.b32 5709 // DW_AT_type +; CHECK-NEXT:.b8 7 // Abbrev [7] 0x1dbd:0x5 DW_TAG_formal_parameter +; CHECK-NEXT:.b32 7554 // DW_AT_type +; CHECK-NEXT:.b8 7 // Abbrev [7] 0x1dc2:0x5 DW_TAG_formal_parameter +; CHECK-NEXT:.b32 4657 // DW_AT_type +; 
CHECK-NEXT:.b8 0 // End Of Children Mark +; CHECK-NEXT:.b8 39 // Abbrev [39] 0x1dc8:0x17 DW_TAG_subprogram ; CHECK-NEXT:.b8 115 // DW_AT_name ; CHECK-NEXT:.b8 121 ; CHECK-NEXT:.b8 115 @@ -6529,13 +6594,13 @@ if.end: ; preds = %if.then, %entry ; CHECK-NEXT:.b8 7 // DW_AT_decl_file ; CHECK-NEXT:.b8 205 // DW_AT_decl_line ; CHECK-NEXT:.b8 2 -; CHECK-NEXT:.b32 4585 // DW_AT_type +; CHECK-NEXT:.b32 4657 // DW_AT_type ; CHECK-NEXT:.b8 1 // DW_AT_declaration ; CHECK-NEXT:.b8 1 // DW_AT_external -; CHECK-NEXT:.b8 7 // Abbrev [7] 0x1d91:0x5 DW_TAG_formal_parameter -; CHECK-NEXT:.b32 5637 // DW_AT_type +; CHECK-NEXT:.b8 7 // Abbrev [7] 0x1dd9:0x5 DW_TAG_formal_parameter +; CHECK-NEXT:.b32 5709 // DW_AT_type ; CHECK-NEXT:.b8 0 // End Of Children Mark -; CHECK-NEXT:.b8 39 // Abbrev [39] 0x1d97:0x23 DW_TAG_subprogram +; CHECK-NEXT:.b8 39 // Abbrev [39] 0x1ddf:0x23 DW_TAG_subprogram ; CHECK-NEXT:.b8 119 // DW_AT_name ; CHECK-NEXT:.b8 99 ; CHECK-NEXT:.b8 115 @@ -6548,21 +6613,21 @@ if.end: ; preds = %if.then, %entry ; CHECK-NEXT:.b8 7 // DW_AT_decl_file ; CHECK-NEXT:.b8 109 // DW_AT_decl_line ; CHECK-NEXT:.b8 3 -; CHECK-NEXT:.b32 6985 // DW_AT_type +; CHECK-NEXT:.b32 7057 // DW_AT_type ; CHECK-NEXT:.b8 1 // DW_AT_declaration ; CHECK-NEXT:.b8 1 // DW_AT_external -; CHECK-NEXT:.b8 7 // Abbrev [7] 0x1daa:0x5 DW_TAG_formal_parameter -; CHECK-NEXT:.b32 7174 // DW_AT_type -; CHECK-NEXT:.b8 7 // Abbrev [7] 0x1daf:0x5 DW_TAG_formal_parameter -; CHECK-NEXT:.b32 7610 // DW_AT_type -; CHECK-NEXT:.b8 7 // Abbrev [7] 0x1db4:0x5 DW_TAG_formal_parameter -; CHECK-NEXT:.b32 6985 // DW_AT_type -; CHECK-NEXT:.b8 0 // End Of Children Mark -; CHECK-NEXT:.b8 12 // Abbrev [12] 0x1dba:0x5 DW_TAG_pointer_type -; CHECK-NEXT:.b32 7615 // DW_AT_type -; CHECK-NEXT:.b8 13 // Abbrev [13] 0x1dbf:0x5 DW_TAG_const_type -; CHECK-NEXT:.b32 7316 // DW_AT_type -; CHECK-NEXT:.b8 39 // Abbrev [39] 0x1dc4:0x1c DW_TAG_subprogram +; CHECK-NEXT:.b8 7 // Abbrev [7] 0x1df2:0x5 DW_TAG_formal_parameter +; CHECK-NEXT:.b32 7246 
// DW_AT_type +; CHECK-NEXT:.b8 7 // Abbrev [7] 0x1df7:0x5 DW_TAG_formal_parameter +; CHECK-NEXT:.b32 7682 // DW_AT_type +; CHECK-NEXT:.b8 7 // Abbrev [7] 0x1dfc:0x5 DW_TAG_formal_parameter +; CHECK-NEXT:.b32 7057 // DW_AT_type +; CHECK-NEXT:.b8 0 // End Of Children Mark +; CHECK-NEXT:.b8 12 // Abbrev [12] 0x1e02:0x5 DW_TAG_pointer_type +; CHECK-NEXT:.b32 7687 // DW_AT_type +; CHECK-NEXT:.b8 13 // Abbrev [13] 0x1e07:0x5 DW_TAG_const_type +; CHECK-NEXT:.b32 7388 // DW_AT_type +; CHECK-NEXT:.b8 39 // Abbrev [39] 0x1e0c:0x1c DW_TAG_subprogram ; CHECK-NEXT:.b8 119 // DW_AT_name ; CHECK-NEXT:.b8 99 ; CHECK-NEXT:.b8 116 @@ -6573,15 +6638,15 @@ if.end: ; preds = %if.then, %entry ; CHECK-NEXT:.b8 7 // DW_AT_decl_file ; CHECK-NEXT:.b8 102 // DW_AT_decl_line ; CHECK-NEXT:.b8 3 -; CHECK-NEXT:.b32 4585 // DW_AT_type +; CHECK-NEXT:.b32 4657 // DW_AT_type ; CHECK-NEXT:.b8 1 // DW_AT_declaration ; CHECK-NEXT:.b8 1 // DW_AT_external -; CHECK-NEXT:.b8 7 // Abbrev [7] 0x1dd5:0x5 DW_TAG_formal_parameter -; CHECK-NEXT:.b32 7174 // DW_AT_type -; CHECK-NEXT:.b8 7 // Abbrev [7] 0x1dda:0x5 DW_TAG_formal_parameter -; CHECK-NEXT:.b32 7316 // DW_AT_type +; CHECK-NEXT:.b8 7 // Abbrev [7] 0x1e1d:0x5 DW_TAG_formal_parameter +; CHECK-NEXT:.b32 7246 // DW_AT_type +; CHECK-NEXT:.b8 7 // Abbrev [7] 0x1e22:0x5 DW_TAG_formal_parameter +; CHECK-NEXT:.b32 7388 // DW_AT_type ; CHECK-NEXT:.b8 0 // End Of Children Mark -; CHECK-NEXT:.b8 30 // Abbrev [30] 0x1de0:0x78 DW_TAG_namespace +; CHECK-NEXT:.b8 30 // Abbrev [30] 0x1e28:0x78 DW_TAG_namespace ; CHECK-NEXT:.b8 95 // DW_AT_name ; CHECK-NEXT:.b8 95 ; CHECK-NEXT:.b8 103 @@ -6592,43 +6657,43 @@ if.end: ; preds = %if.then, %entry ; CHECK-NEXT:.b8 120 ; CHECK-NEXT:.b8 120 ; CHECK-NEXT:.b8 0 -; CHECK-NEXT:.b8 31 // Abbrev [31] 0x1deb:0x7 DW_TAG_imported_declaration +; CHECK-NEXT:.b8 31 // Abbrev [31] 0x1e33:0x7 DW_TAG_imported_declaration ; CHECK-NEXT:.b8 8 // DW_AT_decl_file ; CHECK-NEXT:.b8 201 // DW_AT_decl_line -; CHECK-NEXT:.b32 7768 // DW_AT_import -; 
CHECK-NEXT:.b8 31 // Abbrev [31] 0x1df2:0x7 DW_TAG_imported_declaration +; CHECK-NEXT:.b32 7840 // DW_AT_import +; CHECK-NEXT:.b8 31 // Abbrev [31] 0x1e3a:0x7 DW_TAG_imported_declaration ; CHECK-NEXT:.b8 8 // DW_AT_decl_file ; CHECK-NEXT:.b8 207 // DW_AT_decl_line -; CHECK-NEXT:.b32 7817 // DW_AT_import -; CHECK-NEXT:.b8 31 // Abbrev [31] 0x1df9:0x7 DW_TAG_imported_declaration +; CHECK-NEXT:.b32 7889 // DW_AT_import +; CHECK-NEXT:.b8 31 // Abbrev [31] 0x1e41:0x7 DW_TAG_imported_declaration ; CHECK-NEXT:.b8 8 // DW_AT_decl_file ; CHECK-NEXT:.b8 211 // DW_AT_decl_line -; CHECK-NEXT:.b32 7836 // DW_AT_import -; CHECK-NEXT:.b8 31 // Abbrev [31] 0x1e00:0x7 DW_TAG_imported_declaration +; CHECK-NEXT:.b32 7908 // DW_AT_import +; CHECK-NEXT:.b8 31 // Abbrev [31] 0x1e48:0x7 DW_TAG_imported_declaration ; CHECK-NEXT:.b8 8 // DW_AT_decl_file ; CHECK-NEXT:.b8 217 // DW_AT_decl_line -; CHECK-NEXT:.b32 7858 // DW_AT_import -; CHECK-NEXT:.b8 31 // Abbrev [31] 0x1e07:0x7 DW_TAG_imported_declaration +; CHECK-NEXT:.b32 7930 // DW_AT_import +; CHECK-NEXT:.b8 31 // Abbrev [31] 0x1e4f:0x7 DW_TAG_imported_declaration ; CHECK-NEXT:.b8 8 // DW_AT_decl_file ; CHECK-NEXT:.b8 228 // DW_AT_decl_line -; CHECK-NEXT:.b32 7885 // DW_AT_import -; CHECK-NEXT:.b8 31 // Abbrev [31] 0x1e0e:0x7 DW_TAG_imported_declaration +; CHECK-NEXT:.b32 7957 // DW_AT_import +; CHECK-NEXT:.b8 31 // Abbrev [31] 0x1e56:0x7 DW_TAG_imported_declaration ; CHECK-NEXT:.b8 8 // DW_AT_decl_file ; CHECK-NEXT:.b8 229 // DW_AT_decl_line -; CHECK-NEXT:.b32 7907 // DW_AT_import -; CHECK-NEXT:.b8 31 // Abbrev [31] 0x1e15:0x7 DW_TAG_imported_declaration +; CHECK-NEXT:.b32 7979 // DW_AT_import +; CHECK-NEXT:.b8 31 // Abbrev [31] 0x1e5d:0x7 DW_TAG_imported_declaration ; CHECK-NEXT:.b8 8 // DW_AT_decl_file ; CHECK-NEXT:.b8 230 // DW_AT_decl_line -; CHECK-NEXT:.b32 7940 // DW_AT_import -; CHECK-NEXT:.b8 31 // Abbrev [31] 0x1e1c:0x7 DW_TAG_imported_declaration +; CHECK-NEXT:.b32 8012 // DW_AT_import +; CHECK-NEXT:.b8 31 // Abbrev [31] 
0x1e64:0x7 DW_TAG_imported_declaration ; CHECK-NEXT:.b8 8 // DW_AT_decl_file ; CHECK-NEXT:.b8 232 // DW_AT_decl_line -; CHECK-NEXT:.b32 8000 // DW_AT_import -; CHECK-NEXT:.b8 31 // Abbrev [31] 0x1e23:0x7 DW_TAG_imported_declaration +; CHECK-NEXT:.b32 8072 // DW_AT_import +; CHECK-NEXT:.b8 31 // Abbrev [31] 0x1e6b:0x7 DW_TAG_imported_declaration ; CHECK-NEXT:.b8 8 // DW_AT_decl_file ; CHECK-NEXT:.b8 233 // DW_AT_decl_line -; CHECK-NEXT:.b32 8027 // DW_AT_import -; CHECK-NEXT:.b8 4 // Abbrev [4] 0x1e2a:0x2d DW_TAG_subprogram +; CHECK-NEXT:.b32 8099 // DW_AT_import +; CHECK-NEXT:.b8 47 // Abbrev [47] 0x1e72:0x2d DW_TAG_subprogram ; CHECK-NEXT:.b8 95 // DW_AT_MIPS_linkage_name ; CHECK-NEXT:.b8 90 ; CHECK-NEXT:.b8 78 @@ -6656,17 +6721,17 @@ if.end: ; preds = %if.then, %entry ; CHECK-NEXT:.b8 0 ; CHECK-NEXT:.b8 8 // DW_AT_decl_file ; CHECK-NEXT:.b8 214 // DW_AT_decl_line -; CHECK-NEXT:.b32 7768 // DW_AT_type +; CHECK-NEXT:.b32 7840 // DW_AT_type ; CHECK-NEXT:.b8 1 // DW_AT_declaration ; CHECK-NEXT:.b8 1 // DW_AT_external -; CHECK-NEXT:.b8 7 // Abbrev [7] 0x1e4c:0x5 DW_TAG_formal_parameter -; CHECK-NEXT:.b32 3770 // DW_AT_type -; CHECK-NEXT:.b8 7 // Abbrev [7] 0x1e51:0x5 DW_TAG_formal_parameter -; CHECK-NEXT:.b32 3770 // DW_AT_type +; CHECK-NEXT:.b8 7 // Abbrev [7] 0x1e94:0x5 DW_TAG_formal_parameter +; CHECK-NEXT:.b32 3842 // DW_AT_type +; CHECK-NEXT:.b8 7 // Abbrev [7] 0x1e99:0x5 DW_TAG_formal_parameter +; CHECK-NEXT:.b32 3842 // DW_AT_type ; CHECK-NEXT:.b8 0 // End Of Children Mark ; CHECK-NEXT:.b8 0 // End Of Children Mark -; CHECK-NEXT:.b8 35 // Abbrev [35] 0x1e58:0xf DW_TAG_typedef -; CHECK-NEXT:.b32 7783 // DW_AT_type +; CHECK-NEXT:.b8 35 // Abbrev [35] 0x1ea0:0xf DW_TAG_typedef +; CHECK-NEXT:.b32 7855 // DW_AT_type ; CHECK-NEXT:.b8 108 // DW_AT_name ; CHECK-NEXT:.b8 108 ; CHECK-NEXT:.b8 100 @@ -6677,35 +6742,35 @@ if.end: ; preds = %if.then, %entry ; CHECK-NEXT:.b8 0 ; CHECK-NEXT:.b8 7 // DW_AT_decl_file ; CHECK-NEXT:.b8 121 // DW_AT_decl_line -; CHECK-NEXT:.b8 37 
// Abbrev [37] 0x1e67:0x22 DW_TAG_structure_type +; CHECK-NEXT:.b8 37 // Abbrev [37] 0x1eaf:0x22 DW_TAG_structure_type ; CHECK-NEXT:.b8 16 // DW_AT_byte_size ; CHECK-NEXT:.b8 7 // DW_AT_decl_file ; CHECK-NEXT:.b8 117 // DW_AT_decl_line -; CHECK-NEXT:.b8 11 // Abbrev [11] 0x1e6b:0xf DW_TAG_member +; CHECK-NEXT:.b8 11 // Abbrev [11] 0x1eb3:0xf DW_TAG_member ; CHECK-NEXT:.b8 113 // DW_AT_name ; CHECK-NEXT:.b8 117 ; CHECK-NEXT:.b8 111 ; CHECK-NEXT:.b8 116 ; CHECK-NEXT:.b8 0 -; CHECK-NEXT:.b32 3770 // DW_AT_type +; CHECK-NEXT:.b32 3842 // DW_AT_type ; CHECK-NEXT:.b8 7 // DW_AT_decl_file ; CHECK-NEXT:.b8 119 // DW_AT_decl_line ; CHECK-NEXT:.b8 2 // DW_AT_data_member_location ; CHECK-NEXT:.b8 35 ; CHECK-NEXT:.b8 0 -; CHECK-NEXT:.b8 11 // Abbrev [11] 0x1e7a:0xe DW_TAG_member +; CHECK-NEXT:.b8 11 // Abbrev [11] 0x1ec2:0xe DW_TAG_member ; CHECK-NEXT:.b8 114 // DW_AT_name ; CHECK-NEXT:.b8 101 ; CHECK-NEXT:.b8 109 ; CHECK-NEXT:.b8 0 -; CHECK-NEXT:.b32 3770 // DW_AT_type +; CHECK-NEXT:.b32 3842 // DW_AT_type ; CHECK-NEXT:.b8 7 // DW_AT_decl_file ; CHECK-NEXT:.b8 120 // DW_AT_decl_line ; CHECK-NEXT:.b8 2 // DW_AT_data_member_location ; CHECK-NEXT:.b8 35 ; CHECK-NEXT:.b8 8 ; CHECK-NEXT:.b8 0 // End Of Children Mark -; CHECK-NEXT:.b8 44 // Abbrev [44] 0x1e89:0x13 DW_TAG_subprogram +; CHECK-NEXT:.b8 44 // Abbrev [44] 0x1ed1:0x13 DW_TAG_subprogram ; CHECK-NEXT:.b8 95 // DW_AT_name ; CHECK-NEXT:.b8 69 ; CHECK-NEXT:.b8 120 @@ -6718,10 +6783,10 @@ if.end: ; preds = %if.then, %entry ; CHECK-NEXT:.b8 1 // DW_AT_declaration ; CHECK-NEXT:.b8 1 // DW_AT_external ; CHECK-NEXT:.b8 1 // DW_AT_noreturn -; CHECK-NEXT:.b8 7 // Abbrev [7] 0x1e96:0x5 DW_TAG_formal_parameter -; CHECK-NEXT:.b32 4585 // DW_AT_type +; CHECK-NEXT:.b8 7 // Abbrev [7] 0x1ede:0x5 DW_TAG_formal_parameter +; CHECK-NEXT:.b32 4657 // DW_AT_type ; CHECK-NEXT:.b8 0 // End Of Children Mark -; CHECK-NEXT:.b8 39 // Abbrev [39] 0x1e9c:0x16 DW_TAG_subprogram +; CHECK-NEXT:.b8 39 // Abbrev [39] 0x1ee4:0x16 DW_TAG_subprogram ; 
CHECK-NEXT:.b8 108 // DW_AT_name ; CHECK-NEXT:.b8 108 ; CHECK-NEXT:.b8 97 @@ -6731,13 +6796,13 @@ if.end: ; preds = %if.then, %entry ; CHECK-NEXT:.b8 7 // DW_AT_decl_file ; CHECK-NEXT:.b8 12 // DW_AT_decl_line ; CHECK-NEXT:.b8 3 -; CHECK-NEXT:.b32 3770 // DW_AT_type +; CHECK-NEXT:.b32 3842 // DW_AT_type ; CHECK-NEXT:.b8 1 // DW_AT_declaration ; CHECK-NEXT:.b8 1 // DW_AT_external -; CHECK-NEXT:.b8 7 // Abbrev [7] 0x1eac:0x5 DW_TAG_formal_parameter -; CHECK-NEXT:.b32 3770 // DW_AT_type +; CHECK-NEXT:.b8 7 // Abbrev [7] 0x1ef4:0x5 DW_TAG_formal_parameter +; CHECK-NEXT:.b32 3842 // DW_AT_type ; CHECK-NEXT:.b8 0 // End Of Children Mark -; CHECK-NEXT:.b8 39 // Abbrev [39] 0x1eb2:0x1b DW_TAG_subprogram +; CHECK-NEXT:.b8 39 // Abbrev [39] 0x1efa:0x1b DW_TAG_subprogram ; CHECK-NEXT:.b8 108 // DW_AT_name ; CHECK-NEXT:.b8 108 ; CHECK-NEXT:.b8 100 @@ -6747,15 +6812,15 @@ if.end: ; preds = %if.then, %entry ; CHECK-NEXT:.b8 7 // DW_AT_decl_file ; CHECK-NEXT:.b8 29 // DW_AT_decl_line ; CHECK-NEXT:.b8 3 -; CHECK-NEXT:.b32 7768 // DW_AT_type +; CHECK-NEXT:.b32 7840 // DW_AT_type ; CHECK-NEXT:.b8 1 // DW_AT_declaration ; CHECK-NEXT:.b8 1 // DW_AT_external -; CHECK-NEXT:.b8 7 // Abbrev [7] 0x1ec2:0x5 DW_TAG_formal_parameter -; CHECK-NEXT:.b32 3770 // DW_AT_type -; CHECK-NEXT:.b8 7 // Abbrev [7] 0x1ec7:0x5 DW_TAG_formal_parameter -; CHECK-NEXT:.b32 3770 // DW_AT_type +; CHECK-NEXT:.b8 7 // Abbrev [7] 0x1f0a:0x5 DW_TAG_formal_parameter +; CHECK-NEXT:.b32 3842 // DW_AT_type +; CHECK-NEXT:.b8 7 // Abbrev [7] 0x1f0f:0x5 DW_TAG_formal_parameter +; CHECK-NEXT:.b32 3842 // DW_AT_type ; CHECK-NEXT:.b8 0 // End Of Children Mark -; CHECK-NEXT:.b8 39 // Abbrev [39] 0x1ecd:0x16 DW_TAG_subprogram +; CHECK-NEXT:.b8 39 // Abbrev [39] 0x1f15:0x16 DW_TAG_subprogram ; CHECK-NEXT:.b8 97 // DW_AT_name ; CHECK-NEXT:.b8 116 ; CHECK-NEXT:.b8 111 @@ -6765,13 +6830,13 @@ if.end: ; preds = %if.then, %entry ; CHECK-NEXT:.b8 7 // DW_AT_decl_file ; CHECK-NEXT:.b8 36 // DW_AT_decl_line ; CHECK-NEXT:.b8 1 -; 
CHECK-NEXT:.b32 3770 // DW_AT_type +; CHECK-NEXT:.b32 3842 // DW_AT_type ; CHECK-NEXT:.b8 1 // DW_AT_declaration ; CHECK-NEXT:.b8 1 // DW_AT_external -; CHECK-NEXT:.b8 7 // Abbrev [7] 0x1edd:0x5 DW_TAG_formal_parameter -; CHECK-NEXT:.b32 5637 // DW_AT_type +; CHECK-NEXT:.b8 7 // Abbrev [7] 0x1f25:0x5 DW_TAG_formal_parameter +; CHECK-NEXT:.b32 5709 // DW_AT_type ; CHECK-NEXT:.b8 0 // End Of Children Mark -; CHECK-NEXT:.b8 34 // Abbrev [34] 0x1ee3:0x21 DW_TAG_subprogram +; CHECK-NEXT:.b8 34 // Abbrev [34] 0x1f2b:0x21 DW_TAG_subprogram ; CHECK-NEXT:.b8 115 // DW_AT_name ; CHECK-NEXT:.b8 116 ; CHECK-NEXT:.b8 114 @@ -6782,17 +6847,17 @@ if.end: ; preds = %if.then, %entry ; CHECK-NEXT:.b8 0 ; CHECK-NEXT:.b8 7 // DW_AT_decl_file ; CHECK-NEXT:.b8 209 // DW_AT_decl_line -; CHECK-NEXT:.b32 3770 // DW_AT_type +; CHECK-NEXT:.b32 3842 // DW_AT_type ; CHECK-NEXT:.b8 1 // DW_AT_declaration ; CHECK-NEXT:.b8 1 // DW_AT_external -; CHECK-NEXT:.b8 7 // Abbrev [7] 0x1ef4:0x5 DW_TAG_formal_parameter -; CHECK-NEXT:.b32 5637 // DW_AT_type -; CHECK-NEXT:.b8 7 // Abbrev [7] 0x1ef9:0x5 DW_TAG_formal_parameter -; CHECK-NEXT:.b32 7482 // DW_AT_type -; CHECK-NEXT:.b8 7 // Abbrev [7] 0x1efe:0x5 DW_TAG_formal_parameter -; CHECK-NEXT:.b32 4585 // DW_AT_type -; CHECK-NEXT:.b8 0 // End Of Children Mark -; CHECK-NEXT:.b8 34 // Abbrev [34] 0x1f04:0x22 DW_TAG_subprogram +; CHECK-NEXT:.b8 7 // Abbrev [7] 0x1f3c:0x5 DW_TAG_formal_parameter +; CHECK-NEXT:.b32 5709 // DW_AT_type +; CHECK-NEXT:.b8 7 // Abbrev [7] 0x1f41:0x5 DW_TAG_formal_parameter +; CHECK-NEXT:.b32 7554 // DW_AT_type +; CHECK-NEXT:.b8 7 // Abbrev [7] 0x1f46:0x5 DW_TAG_formal_parameter +; CHECK-NEXT:.b32 4657 // DW_AT_type +; CHECK-NEXT:.b8 0 // End Of Children Mark +; CHECK-NEXT:.b8 34 // Abbrev [34] 0x1f4c:0x22 DW_TAG_subprogram ; CHECK-NEXT:.b8 115 // DW_AT_name ; CHECK-NEXT:.b8 116 ; CHECK-NEXT:.b8 114 @@ -6804,17 +6869,17 @@ if.end: ; preds = %if.then, %entry ; CHECK-NEXT:.b8 0 ; CHECK-NEXT:.b8 7 // DW_AT_decl_file ; CHECK-NEXT:.b8 
214 // DW_AT_decl_line -; CHECK-NEXT:.b32 7974 // DW_AT_type +; CHECK-NEXT:.b32 8046 // DW_AT_type ; CHECK-NEXT:.b8 1 // DW_AT_declaration ; CHECK-NEXT:.b8 1 // DW_AT_external -; CHECK-NEXT:.b8 7 // Abbrev [7] 0x1f16:0x5 DW_TAG_formal_parameter -; CHECK-NEXT:.b32 5637 // DW_AT_type -; CHECK-NEXT:.b8 7 // Abbrev [7] 0x1f1b:0x5 DW_TAG_formal_parameter -; CHECK-NEXT:.b32 7482 // DW_AT_type -; CHECK-NEXT:.b8 7 // Abbrev [7] 0x1f20:0x5 DW_TAG_formal_parameter -; CHECK-NEXT:.b32 4585 // DW_AT_type -; CHECK-NEXT:.b8 0 // End Of Children Mark -; CHECK-NEXT:.b8 10 // Abbrev [10] 0x1f26:0x1a DW_TAG_base_type +; CHECK-NEXT:.b8 7 // Abbrev [7] 0x1f5e:0x5 DW_TAG_formal_parameter +; CHECK-NEXT:.b32 5709 // DW_AT_type +; CHECK-NEXT:.b8 7 // Abbrev [7] 0x1f63:0x5 DW_TAG_formal_parameter +; CHECK-NEXT:.b32 7554 // DW_AT_type +; CHECK-NEXT:.b8 7 // Abbrev [7] 0x1f68:0x5 DW_TAG_formal_parameter +; CHECK-NEXT:.b32 4657 // DW_AT_type +; CHECK-NEXT:.b8 0 // End Of Children Mark +; CHECK-NEXT:.b8 10 // Abbrev [10] 0x1f6e:0x1a DW_TAG_base_type ; CHECK-NEXT:.b8 108 // DW_AT_name ; CHECK-NEXT:.b8 111 ; CHECK-NEXT:.b8 110 @@ -6840,7 +6905,7 @@ if.end: ; preds = %if.then, %entry ; CHECK-NEXT:.b8 0 ; CHECK-NEXT:.b8 7 // DW_AT_encoding ; CHECK-NEXT:.b8 8 // DW_AT_byte_size -; CHECK-NEXT:.b8 34 // Abbrev [34] 0x1f40:0x1b DW_TAG_subprogram +; CHECK-NEXT:.b8 34 // Abbrev [34] 0x1f88:0x1b DW_TAG_subprogram ; CHECK-NEXT:.b8 115 // DW_AT_name ; CHECK-NEXT:.b8 116 ; CHECK-NEXT:.b8 114 @@ -6850,15 +6915,15 @@ if.end: ; preds = %if.then, %entry ; CHECK-NEXT:.b8 0 ; CHECK-NEXT:.b8 7 // DW_AT_decl_file ; CHECK-NEXT:.b8 172 // DW_AT_decl_line -; CHECK-NEXT:.b32 2100 // DW_AT_type +; CHECK-NEXT:.b32 2172 // DW_AT_type ; CHECK-NEXT:.b8 1 // DW_AT_declaration ; CHECK-NEXT:.b8 1 // DW_AT_external -; CHECK-NEXT:.b8 7 // Abbrev [7] 0x1f50:0x5 DW_TAG_formal_parameter -; CHECK-NEXT:.b32 5637 // DW_AT_type -; CHECK-NEXT:.b8 7 // Abbrev [7] 0x1f55:0x5 DW_TAG_formal_parameter -; CHECK-NEXT:.b32 7482 // DW_AT_type +; 
CHECK-NEXT:.b8 7 // Abbrev [7] 0x1f98:0x5 DW_TAG_formal_parameter +; CHECK-NEXT:.b32 5709 // DW_AT_type +; CHECK-NEXT:.b8 7 // Abbrev [7] 0x1f9d:0x5 DW_TAG_formal_parameter +; CHECK-NEXT:.b32 7554 // DW_AT_type ; CHECK-NEXT:.b8 0 // End Of Children Mark -; CHECK-NEXT:.b8 34 // Abbrev [34] 0x1f5b:0x1c DW_TAG_subprogram +; CHECK-NEXT:.b8 34 // Abbrev [34] 0x1fa3:0x1c DW_TAG_subprogram ; CHECK-NEXT:.b8 115 // DW_AT_name ; CHECK-NEXT:.b8 116 ; CHECK-NEXT:.b8 114 @@ -6869,15 +6934,15 @@ if.end: ; preds = %if.then, %entry ; CHECK-NEXT:.b8 0 ; CHECK-NEXT:.b8 7 // DW_AT_decl_file ; CHECK-NEXT:.b8 175 // DW_AT_decl_line -; CHECK-NEXT:.b32 8055 // DW_AT_type +; CHECK-NEXT:.b32 8127 // DW_AT_type ; CHECK-NEXT:.b8 1 // DW_AT_declaration ; CHECK-NEXT:.b8 1 // DW_AT_external -; CHECK-NEXT:.b8 7 // Abbrev [7] 0x1f6c:0x5 DW_TAG_formal_parameter -; CHECK-NEXT:.b32 5637 // DW_AT_type -; CHECK-NEXT:.b8 7 // Abbrev [7] 0x1f71:0x5 DW_TAG_formal_parameter -; CHECK-NEXT:.b32 7482 // DW_AT_type +; CHECK-NEXT:.b8 7 // Abbrev [7] 0x1fb4:0x5 DW_TAG_formal_parameter +; CHECK-NEXT:.b32 5709 // DW_AT_type +; CHECK-NEXT:.b8 7 // Abbrev [7] 0x1fb9:0x5 DW_TAG_formal_parameter +; CHECK-NEXT:.b32 7554 // DW_AT_type ; CHECK-NEXT:.b8 0 // End Of Children Mark -; CHECK-NEXT:.b8 10 // Abbrev [10] 0x1f77:0xf DW_TAG_base_type +; CHECK-NEXT:.b8 10 // Abbrev [10] 0x1fbf:0xf DW_TAG_base_type ; CHECK-NEXT:.b8 108 // DW_AT_name ; CHECK-NEXT:.b8 111 ; CHECK-NEXT:.b8 110 @@ -6892,7 +6957,7 @@ if.end: ; preds = %if.then, %entry ; CHECK-NEXT:.b8 0 ; CHECK-NEXT:.b8 4 // DW_AT_encoding ; CHECK-NEXT:.b8 8 // DW_AT_byte_size -; CHECK-NEXT:.b8 46 // Abbrev [46] 0x1f86:0x20 DW_TAG_subprogram +; CHECK-NEXT:.b8 48 // Abbrev [48] 0x1fce:0x20 DW_TAG_subprogram ; CHECK-NEXT:.b8 95 // DW_AT_MIPS_linkage_name ; CHECK-NEXT:.b8 90 ; CHECK-NEXT:.b8 76 @@ -6913,12 +6978,12 @@ if.end: ; preds = %if.then, %entry ; CHECK-NEXT:.b8 12 // DW_AT_decl_file ; CHECK-NEXT:.b8 62 // DW_AT_decl_line ; CHECK-NEXT:.b8 5 -; CHECK-NEXT:.b32 2100 
// DW_AT_type +; CHECK-NEXT:.b32 2172 // DW_AT_type ; CHECK-NEXT:.b8 1 // DW_AT_declaration -; CHECK-NEXT:.b8 7 // Abbrev [7] 0x1fa0:0x5 DW_TAG_formal_parameter -; CHECK-NEXT:.b32 2100 // DW_AT_type +; CHECK-NEXT:.b8 7 // Abbrev [7] 0x1fe8:0x5 DW_TAG_formal_parameter +; CHECK-NEXT:.b32 2172 // DW_AT_type ; CHECK-NEXT:.b8 0 // End Of Children Mark -; CHECK-NEXT:.b8 46 // Abbrev [46] 0x1fa6:0x22 DW_TAG_subprogram +; CHECK-NEXT:.b8 48 // Abbrev [48] 0x1fee:0x22 DW_TAG_subprogram ; CHECK-NEXT:.b8 95 // DW_AT_MIPS_linkage_name ; CHECK-NEXT:.b8 90 ; CHECK-NEXT:.b8 76 @@ -6941,12 +7006,12 @@ if.end: ; preds = %if.then, %entry ; CHECK-NEXT:.b8 12 // DW_AT_decl_file ; CHECK-NEXT:.b8 90 // DW_AT_decl_line ; CHECK-NEXT:.b8 5 -; CHECK-NEXT:.b32 2100 // DW_AT_type +; CHECK-NEXT:.b32 2172 // DW_AT_type ; CHECK-NEXT:.b8 1 // DW_AT_declaration -; CHECK-NEXT:.b8 7 // Abbrev [7] 0x1fc2:0x5 DW_TAG_formal_parameter -; CHECK-NEXT:.b32 2100 // DW_AT_type +; CHECK-NEXT:.b8 7 // Abbrev [7] 0x200a:0x5 DW_TAG_formal_parameter +; CHECK-NEXT:.b32 2172 // DW_AT_type ; CHECK-NEXT:.b8 0 // End Of Children Mark -; CHECK-NEXT:.b8 46 // Abbrev [46] 0x1fc8:0x20 DW_TAG_subprogram +; CHECK-NEXT:.b8 48 // Abbrev [48] 0x2010:0x20 DW_TAG_subprogram ; CHECK-NEXT:.b8 95 // DW_AT_MIPS_linkage_name ; CHECK-NEXT:.b8 90 ; CHECK-NEXT:.b8 76 @@ -6967,12 +7032,12 @@ if.end: ; preds = %if.then, %entry ; CHECK-NEXT:.b8 12 // DW_AT_decl_file ; CHECK-NEXT:.b8 57 // DW_AT_decl_line ; CHECK-NEXT:.b8 5 -; CHECK-NEXT:.b32 2100 // DW_AT_type +; CHECK-NEXT:.b32 2172 // DW_AT_type ; CHECK-NEXT:.b8 1 // DW_AT_declaration -; CHECK-NEXT:.b8 7 // Abbrev [7] 0x1fe2:0x5 DW_TAG_formal_parameter -; CHECK-NEXT:.b32 2100 // DW_AT_type +; CHECK-NEXT:.b8 7 // Abbrev [7] 0x202a:0x5 DW_TAG_formal_parameter +; CHECK-NEXT:.b32 2172 // DW_AT_type ; CHECK-NEXT:.b8 0 // End Of Children Mark -; CHECK-NEXT:.b8 46 // Abbrev [46] 0x1fe8:0x22 DW_TAG_subprogram +; CHECK-NEXT:.b8 48 // Abbrev [48] 0x2030:0x22 DW_TAG_subprogram ; CHECK-NEXT:.b8 95 // 
DW_AT_MIPS_linkage_name ; CHECK-NEXT:.b8 90 ; CHECK-NEXT:.b8 76 @@ -6995,12 +7060,12 @@ if.end: ; preds = %if.then, %entry ; CHECK-NEXT:.b8 12 // DW_AT_decl_file ; CHECK-NEXT:.b8 95 // DW_AT_decl_line ; CHECK-NEXT:.b8 5 -; CHECK-NEXT:.b32 2100 // DW_AT_type +; CHECK-NEXT:.b32 2172 // DW_AT_type ; CHECK-NEXT:.b8 1 // DW_AT_declaration -; CHECK-NEXT:.b8 7 // Abbrev [7] 0x2004:0x5 DW_TAG_formal_parameter -; CHECK-NEXT:.b32 2100 // DW_AT_type +; CHECK-NEXT:.b8 7 // Abbrev [7] 0x204c:0x5 DW_TAG_formal_parameter +; CHECK-NEXT:.b32 2172 // DW_AT_type ; CHECK-NEXT:.b8 0 // End Of Children Mark -; CHECK-NEXT:.b8 46 // Abbrev [46] 0x200a:0x28 DW_TAG_subprogram +; CHECK-NEXT:.b8 48 // Abbrev [48] 0x2052:0x28 DW_TAG_subprogram ; CHECK-NEXT:.b8 95 // DW_AT_MIPS_linkage_name ; CHECK-NEXT:.b8 90 ; CHECK-NEXT:.b8 76 @@ -7024,14 +7089,14 @@ if.end: ; preds = %if.then, %entry ; CHECK-NEXT:.b8 12 // DW_AT_decl_file ; CHECK-NEXT:.b8 47 // DW_AT_decl_line ; CHECK-NEXT:.b8 5 -; CHECK-NEXT:.b32 2100 // DW_AT_type +; CHECK-NEXT:.b32 2172 // DW_AT_type ; CHECK-NEXT:.b8 1 // DW_AT_declaration -; CHECK-NEXT:.b8 7 // Abbrev [7] 0x2027:0x5 DW_TAG_formal_parameter -; CHECK-NEXT:.b32 2100 // DW_AT_type -; CHECK-NEXT:.b8 7 // Abbrev [7] 0x202c:0x5 DW_TAG_formal_parameter -; CHECK-NEXT:.b32 2100 // DW_AT_type +; CHECK-NEXT:.b8 7 // Abbrev [7] 0x206f:0x5 DW_TAG_formal_parameter +; CHECK-NEXT:.b32 2172 // DW_AT_type +; CHECK-NEXT:.b8 7 // Abbrev [7] 0x2074:0x5 DW_TAG_formal_parameter +; CHECK-NEXT:.b32 2172 // DW_AT_type ; CHECK-NEXT:.b8 0 // End Of Children Mark -; CHECK-NEXT:.b8 46 // Abbrev [46] 0x2032:0x20 DW_TAG_subprogram +; CHECK-NEXT:.b8 48 // Abbrev [48] 0x207a:0x20 DW_TAG_subprogram ; CHECK-NEXT:.b8 95 // DW_AT_MIPS_linkage_name ; CHECK-NEXT:.b8 90 ; CHECK-NEXT:.b8 76 @@ -7052,12 +7117,12 @@ if.end: ; preds = %if.then, %entry ; CHECK-NEXT:.b8 12 // DW_AT_decl_file ; CHECK-NEXT:.b8 52 // DW_AT_decl_line ; CHECK-NEXT:.b8 5 -; CHECK-NEXT:.b32 2100 // DW_AT_type +; CHECK-NEXT:.b32 2172 // 
DW_AT_type ; CHECK-NEXT:.b8 1 // DW_AT_declaration -; CHECK-NEXT:.b8 7 // Abbrev [7] 0x204c:0x5 DW_TAG_formal_parameter -; CHECK-NEXT:.b32 2100 // DW_AT_type +; CHECK-NEXT:.b8 7 // Abbrev [7] 0x2094:0x5 DW_TAG_formal_parameter +; CHECK-NEXT:.b32 2172 // DW_AT_type ; CHECK-NEXT:.b8 0 // End Of Children Mark -; CHECK-NEXT:.b8 46 // Abbrev [46] 0x2052:0x22 DW_TAG_subprogram +; CHECK-NEXT:.b8 48 // Abbrev [48] 0x209a:0x22 DW_TAG_subprogram ; CHECK-NEXT:.b8 95 // DW_AT_MIPS_linkage_name ; CHECK-NEXT:.b8 90 ; CHECK-NEXT:.b8 76 @@ -7080,12 +7145,12 @@ if.end: ; preds = %if.then, %entry ; CHECK-NEXT:.b8 12 // DW_AT_decl_file ; CHECK-NEXT:.b8 100 // DW_AT_decl_line ; CHECK-NEXT:.b8 5 -; CHECK-NEXT:.b32 2100 // DW_AT_type +; CHECK-NEXT:.b32 2172 // DW_AT_type ; CHECK-NEXT:.b8 1 // DW_AT_declaration -; CHECK-NEXT:.b8 7 // Abbrev [7] 0x206e:0x5 DW_TAG_formal_parameter -; CHECK-NEXT:.b32 2100 // DW_AT_type +; CHECK-NEXT:.b8 7 // Abbrev [7] 0x20b6:0x5 DW_TAG_formal_parameter +; CHECK-NEXT:.b32 2172 // DW_AT_type ; CHECK-NEXT:.b8 0 // End Of Children Mark -; CHECK-NEXT:.b8 46 // Abbrev [46] 0x2074:0x20 DW_TAG_subprogram +; CHECK-NEXT:.b8 48 // Abbrev [48] 0x20bc:0x20 DW_TAG_subprogram ; CHECK-NEXT:.b8 95 // DW_AT_MIPS_linkage_name ; CHECK-NEXT:.b8 90 ; CHECK-NEXT:.b8 76 @@ -7106,12 +7171,12 @@ if.end: ; preds = %if.then, %entry ; CHECK-NEXT:.b8 12 // DW_AT_decl_file ; CHECK-NEXT:.b8 150 // DW_AT_decl_line ; CHECK-NEXT:.b8 5 -; CHECK-NEXT:.b32 2100 // DW_AT_type +; CHECK-NEXT:.b32 2172 // DW_AT_type ; CHECK-NEXT:.b8 1 // DW_AT_declaration -; CHECK-NEXT:.b8 7 // Abbrev [7] 0x208e:0x5 DW_TAG_formal_parameter -; CHECK-NEXT:.b32 2100 // DW_AT_type +; CHECK-NEXT:.b8 7 // Abbrev [7] 0x20d6:0x5 DW_TAG_formal_parameter +; CHECK-NEXT:.b32 2172 // DW_AT_type ; CHECK-NEXT:.b8 0 // End Of Children Mark -; CHECK-NEXT:.b8 46 // Abbrev [46] 0x2094:0x20 DW_TAG_subprogram +; CHECK-NEXT:.b8 48 // Abbrev [48] 0x20dc:0x20 DW_TAG_subprogram ; CHECK-NEXT:.b8 95 // DW_AT_MIPS_linkage_name ; 
CHECK-NEXT:.b8 90 ; CHECK-NEXT:.b8 76 @@ -7132,12 +7197,12 @@ if.end: ; preds = %if.then, %entry ; CHECK-NEXT:.b8 14 // DW_AT_decl_file ; CHECK-NEXT:.b8 155 // DW_AT_decl_line ; CHECK-NEXT:.b8 2 -; CHECK-NEXT:.b32 2100 // DW_AT_type +; CHECK-NEXT:.b32 2172 // DW_AT_type ; CHECK-NEXT:.b8 1 // DW_AT_declaration -; CHECK-NEXT:.b8 7 // Abbrev [7] 0x20ae:0x5 DW_TAG_formal_parameter -; CHECK-NEXT:.b32 2100 // DW_AT_type +; CHECK-NEXT:.b8 7 // Abbrev [7] 0x20f6:0x5 DW_TAG_formal_parameter +; CHECK-NEXT:.b32 2172 // DW_AT_type ; CHECK-NEXT:.b8 0 // End Of Children Mark -; CHECK-NEXT:.b8 46 // Abbrev [46] 0x20b4:0x2e DW_TAG_subprogram +; CHECK-NEXT:.b8 48 // Abbrev [48] 0x20fc:0x2e DW_TAG_subprogram ; CHECK-NEXT:.b8 95 // DW_AT_MIPS_linkage_name ; CHECK-NEXT:.b8 90 ; CHECK-NEXT:.b8 76 @@ -7167,14 +7232,14 @@ if.end: ; preds = %if.then, %entry ; CHECK-NEXT:.b8 12 // DW_AT_decl_file ; CHECK-NEXT:.b8 165 // DW_AT_decl_line ; CHECK-NEXT:.b8 4 -; CHECK-NEXT:.b32 2100 // DW_AT_type +; CHECK-NEXT:.b32 2172 // DW_AT_type ; CHECK-NEXT:.b8 1 // DW_AT_declaration -; CHECK-NEXT:.b8 7 // Abbrev [7] 0x20d7:0x5 DW_TAG_formal_parameter -; CHECK-NEXT:.b32 2100 // DW_AT_type -; CHECK-NEXT:.b8 7 // Abbrev [7] 0x20dc:0x5 DW_TAG_formal_parameter -; CHECK-NEXT:.b32 2100 // DW_AT_type +; CHECK-NEXT:.b8 7 // Abbrev [7] 0x211f:0x5 DW_TAG_formal_parameter +; CHECK-NEXT:.b32 2172 // DW_AT_type +; CHECK-NEXT:.b8 7 // Abbrev [7] 0x2124:0x5 DW_TAG_formal_parameter +; CHECK-NEXT:.b32 2172 // DW_AT_type ; CHECK-NEXT:.b8 0 // End Of Children Mark -; CHECK-NEXT:.b8 46 // Abbrev [46] 0x20e2:0x1e DW_TAG_subprogram +; CHECK-NEXT:.b8 48 // Abbrev [48] 0x212a:0x1e DW_TAG_subprogram ; CHECK-NEXT:.b8 95 // DW_AT_MIPS_linkage_name ; CHECK-NEXT:.b8 90 ; CHECK-NEXT:.b8 76 @@ -7193,12 +7258,12 @@ if.end: ; preds = %if.then, %entry ; CHECK-NEXT:.b8 12 // DW_AT_decl_file ; CHECK-NEXT:.b8 219 // DW_AT_decl_line ; CHECK-NEXT:.b8 4 -; CHECK-NEXT:.b32 2100 // DW_AT_type +; CHECK-NEXT:.b32 2172 // DW_AT_type ; CHECK-NEXT:.b8 
1 // DW_AT_declaration -; CHECK-NEXT:.b8 7 // Abbrev [7] 0x20fa:0x5 DW_TAG_formal_parameter -; CHECK-NEXT:.b32 2100 // DW_AT_type +; CHECK-NEXT:.b8 7 // Abbrev [7] 0x2142:0x5 DW_TAG_formal_parameter +; CHECK-NEXT:.b32 2172 // DW_AT_type ; CHECK-NEXT:.b8 0 // End Of Children Mark -; CHECK-NEXT:.b8 46 // Abbrev [46] 0x2100:0x20 DW_TAG_subprogram +; CHECK-NEXT:.b8 48 // Abbrev [48] 0x2148:0x20 DW_TAG_subprogram ; CHECK-NEXT:.b8 95 // DW_AT_MIPS_linkage_name ; CHECK-NEXT:.b8 90 ; CHECK-NEXT:.b8 76 @@ -7219,12 +7284,12 @@ if.end: ; preds = %if.then, %entry ; CHECK-NEXT:.b8 12 // DW_AT_decl_file ; CHECK-NEXT:.b8 32 // DW_AT_decl_line ; CHECK-NEXT:.b8 5 -; CHECK-NEXT:.b32 2100 // DW_AT_type +; CHECK-NEXT:.b32 2172 // DW_AT_type ; CHECK-NEXT:.b8 1 // DW_AT_declaration -; CHECK-NEXT:.b8 7 // Abbrev [7] 0x211a:0x5 DW_TAG_formal_parameter -; CHECK-NEXT:.b32 2100 // DW_AT_type +; CHECK-NEXT:.b8 7 // Abbrev [7] 0x2162:0x5 DW_TAG_formal_parameter +; CHECK-NEXT:.b32 2172 // DW_AT_type ; CHECK-NEXT:.b8 0 // End Of Children Mark -; CHECK-NEXT:.b8 46 // Abbrev [46] 0x2120:0x20 DW_TAG_subprogram +; CHECK-NEXT:.b8 48 // Abbrev [48] 0x2168:0x20 DW_TAG_subprogram ; CHECK-NEXT:.b8 95 // DW_AT_MIPS_linkage_name ; CHECK-NEXT:.b8 90 ; CHECK-NEXT:.b8 76 @@ -7245,12 +7310,12 @@ if.end: ; preds = %if.then, %entry ; CHECK-NEXT:.b8 12 // DW_AT_decl_file ; CHECK-NEXT:.b8 210 // DW_AT_decl_line ; CHECK-NEXT:.b8 5 -; CHECK-NEXT:.b32 2100 // DW_AT_type +; CHECK-NEXT:.b32 2172 // DW_AT_type ; CHECK-NEXT:.b8 1 // DW_AT_declaration -; CHECK-NEXT:.b8 7 // Abbrev [7] 0x213a:0x5 DW_TAG_formal_parameter -; CHECK-NEXT:.b32 2100 // DW_AT_type +; CHECK-NEXT:.b8 7 // Abbrev [7] 0x2182:0x5 DW_TAG_formal_parameter +; CHECK-NEXT:.b32 2172 // DW_AT_type ; CHECK-NEXT:.b8 0 // End Of Children Mark -; CHECK-NEXT:.b8 46 // Abbrev [46] 0x2140:0x1e DW_TAG_subprogram +; CHECK-NEXT:.b8 48 // Abbrev [48] 0x2188:0x1e DW_TAG_subprogram ; CHECK-NEXT:.b8 95 // DW_AT_MIPS_linkage_name ; CHECK-NEXT:.b8 90 ; CHECK-NEXT:.b8 76 @@ 
-7269,12 +7334,12 @@ if.end: ; preds = %if.then, %entry ; CHECK-NEXT:.b8 12 // DW_AT_decl_file ; CHECK-NEXT:.b8 200 // DW_AT_decl_line ; CHECK-NEXT:.b8 5 -; CHECK-NEXT:.b32 2100 // DW_AT_type +; CHECK-NEXT:.b32 2172 // DW_AT_type ; CHECK-NEXT:.b8 1 // DW_AT_declaration -; CHECK-NEXT:.b8 7 // Abbrev [7] 0x2158:0x5 DW_TAG_formal_parameter -; CHECK-NEXT:.b32 2100 // DW_AT_type +; CHECK-NEXT:.b8 7 // Abbrev [7] 0x21a0:0x5 DW_TAG_formal_parameter +; CHECK-NEXT:.b32 2172 // DW_AT_type ; CHECK-NEXT:.b8 0 // End Of Children Mark -; CHECK-NEXT:.b8 46 // Abbrev [46] 0x215e:0x20 DW_TAG_subprogram +; CHECK-NEXT:.b8 48 // Abbrev [48] 0x21a6:0x20 DW_TAG_subprogram ; CHECK-NEXT:.b8 95 // DW_AT_MIPS_linkage_name ; CHECK-NEXT:.b8 90 ; CHECK-NEXT:.b8 76 @@ -7295,12 +7360,12 @@ if.end: ; preds = %if.then, %entry ; CHECK-NEXT:.b8 14 // DW_AT_decl_file ; CHECK-NEXT:.b8 145 // DW_AT_decl_line ; CHECK-NEXT:.b8 2 -; CHECK-NEXT:.b32 2100 // DW_AT_type +; CHECK-NEXT:.b32 2172 // DW_AT_type ; CHECK-NEXT:.b8 1 // DW_AT_declaration -; CHECK-NEXT:.b8 7 // Abbrev [7] 0x2178:0x5 DW_TAG_formal_parameter -; CHECK-NEXT:.b32 2100 // DW_AT_type +; CHECK-NEXT:.b8 7 // Abbrev [7] 0x21c0:0x5 DW_TAG_formal_parameter +; CHECK-NEXT:.b32 2172 // DW_AT_type ; CHECK-NEXT:.b8 0 // End Of Children Mark -; CHECK-NEXT:.b8 46 // Abbrev [46] 0x217e:0x1e DW_TAG_subprogram +; CHECK-NEXT:.b8 48 // Abbrev [48] 0x21c6:0x1e DW_TAG_subprogram ; CHECK-NEXT:.b8 95 // DW_AT_MIPS_linkage_name ; CHECK-NEXT:.b8 90 ; CHECK-NEXT:.b8 76 @@ -7319,12 +7384,12 @@ if.end: ; preds = %if.then, %entry ; CHECK-NEXT:.b8 12 // DW_AT_decl_file ; CHECK-NEXT:.b8 14 // DW_AT_decl_line ; CHECK-NEXT:.b8 5 -; CHECK-NEXT:.b32 2100 // DW_AT_type +; CHECK-NEXT:.b32 2172 // DW_AT_type ; CHECK-NEXT:.b8 1 // DW_AT_declaration -; CHECK-NEXT:.b8 7 // Abbrev [7] 0x2196:0x5 DW_TAG_formal_parameter -; CHECK-NEXT:.b32 2100 // DW_AT_type +; CHECK-NEXT:.b8 7 // Abbrev [7] 0x21de:0x5 DW_TAG_formal_parameter +; CHECK-NEXT:.b32 2172 // DW_AT_type ; CHECK-NEXT:.b8 0 
// End Of Children Mark -; CHECK-NEXT:.b8 46 // Abbrev [46] 0x219c:0x22 DW_TAG_subprogram +; CHECK-NEXT:.b8 48 // Abbrev [48] 0x21e4:0x22 DW_TAG_subprogram ; CHECK-NEXT:.b8 95 // DW_AT_MIPS_linkage_name ; CHECK-NEXT:.b8 90 ; CHECK-NEXT:.b8 76 @@ -7347,12 +7412,12 @@ if.end: ; preds = %if.then, %entry ; CHECK-NEXT:.b8 12 // DW_AT_decl_file ; CHECK-NEXT:.b8 105 // DW_AT_decl_line ; CHECK-NEXT:.b8 5 -; CHECK-NEXT:.b32 2100 // DW_AT_type +; CHECK-NEXT:.b32 2172 // DW_AT_type ; CHECK-NEXT:.b8 1 // DW_AT_declaration -; CHECK-NEXT:.b8 7 // Abbrev [7] 0x21b8:0x5 DW_TAG_formal_parameter -; CHECK-NEXT:.b32 2100 // DW_AT_type +; CHECK-NEXT:.b8 7 // Abbrev [7] 0x2200:0x5 DW_TAG_formal_parameter +; CHECK-NEXT:.b32 2172 // DW_AT_type ; CHECK-NEXT:.b8 0 // End Of Children Mark -; CHECK-NEXT:.b8 46 // Abbrev [46] 0x21be:0x20 DW_TAG_subprogram +; CHECK-NEXT:.b8 48 // Abbrev [48] 0x2206:0x20 DW_TAG_subprogram ; CHECK-NEXT:.b8 95 // DW_AT_MIPS_linkage_name ; CHECK-NEXT:.b8 90 ; CHECK-NEXT:.b8 76 @@ -7373,12 +7438,12 @@ if.end: ; preds = %if.then, %entry ; CHECK-NEXT:.b8 14 // DW_AT_decl_file ; CHECK-NEXT:.b8 95 // DW_AT_decl_line ; CHECK-NEXT:.b8 2 -; CHECK-NEXT:.b32 2100 // DW_AT_type +; CHECK-NEXT:.b32 2172 // DW_AT_type ; CHECK-NEXT:.b8 1 // DW_AT_declaration -; CHECK-NEXT:.b8 7 // Abbrev [7] 0x21d8:0x5 DW_TAG_formal_parameter -; CHECK-NEXT:.b32 2100 // DW_AT_type +; CHECK-NEXT:.b8 7 // Abbrev [7] 0x2220:0x5 DW_TAG_formal_parameter +; CHECK-NEXT:.b32 2172 // DW_AT_type ; CHECK-NEXT:.b8 0 // End Of Children Mark -; CHECK-NEXT:.b8 46 // Abbrev [46] 0x21de:0x26 DW_TAG_subprogram +; CHECK-NEXT:.b8 48 // Abbrev [48] 0x2226:0x26 DW_TAG_subprogram ; CHECK-NEXT:.b8 95 // DW_AT_MIPS_linkage_name ; CHECK-NEXT:.b8 90 ; CHECK-NEXT:.b8 76 @@ -7400,14 +7465,14 @@ if.end: ; preds = %if.then, %entry ; CHECK-NEXT:.b8 12 // DW_AT_decl_file ; CHECK-NEXT:.b8 80 // DW_AT_decl_line ; CHECK-NEXT:.b8 6 -; CHECK-NEXT:.b32 2100 // DW_AT_type +; CHECK-NEXT:.b32 2172 // DW_AT_type ; CHECK-NEXT:.b8 1 // 
DW_AT_declaration -; CHECK-NEXT:.b8 7 // Abbrev [7] 0x21f9:0x5 DW_TAG_formal_parameter -; CHECK-NEXT:.b32 2100 // DW_AT_type -; CHECK-NEXT:.b8 7 // Abbrev [7] 0x21fe:0x5 DW_TAG_formal_parameter -; CHECK-NEXT:.b32 2100 // DW_AT_type +; CHECK-NEXT:.b8 7 // Abbrev [7] 0x2241:0x5 DW_TAG_formal_parameter +; CHECK-NEXT:.b32 2172 // DW_AT_type +; CHECK-NEXT:.b8 7 // Abbrev [7] 0x2246:0x5 DW_TAG_formal_parameter +; CHECK-NEXT:.b32 2172 // DW_AT_type ; CHECK-NEXT:.b8 0 // End Of Children Mark -; CHECK-NEXT:.b8 46 // Abbrev [46] 0x2204:0x22 DW_TAG_subprogram +; CHECK-NEXT:.b8 48 // Abbrev [48] 0x224c:0x22 DW_TAG_subprogram ; CHECK-NEXT:.b8 95 // DW_AT_MIPS_linkage_name ; CHECK-NEXT:.b8 90 ; CHECK-NEXT:.b8 76 @@ -7430,12 +7495,12 @@ if.end: ; preds = %if.then, %entry ; CHECK-NEXT:.b8 14 // DW_AT_decl_file ; CHECK-NEXT:.b8 85 // DW_AT_decl_line ; CHECK-NEXT:.b8 2 -; CHECK-NEXT:.b32 2100 // DW_AT_type +; CHECK-NEXT:.b32 2172 // DW_AT_type ; CHECK-NEXT:.b8 1 // DW_AT_declaration -; CHECK-NEXT:.b8 7 // Abbrev [7] 0x2220:0x5 DW_TAG_formal_parameter -; CHECK-NEXT:.b32 2100 // DW_AT_type +; CHECK-NEXT:.b8 7 // Abbrev [7] 0x2268:0x5 DW_TAG_formal_parameter +; CHECK-NEXT:.b32 2172 // DW_AT_type ; CHECK-NEXT:.b8 0 // End Of Children Mark -; CHECK-NEXT:.b8 46 // Abbrev [46] 0x2226:0x2a DW_TAG_subprogram +; CHECK-NEXT:.b8 48 // Abbrev [48] 0x226e:0x2a DW_TAG_subprogram ; CHECK-NEXT:.b8 95 // DW_AT_MIPS_linkage_name ; CHECK-NEXT:.b8 90 ; CHECK-NEXT:.b8 76 @@ -7456,16 +7521,16 @@ if.end: ; preds = %if.then, %entry ; CHECK-NEXT:.b8 12 // DW_AT_decl_file ; CHECK-NEXT:.b8 32 // DW_AT_decl_line ; CHECK-NEXT:.b8 6 -; CHECK-NEXT:.b32 2100 // DW_AT_type +; CHECK-NEXT:.b32 2172 // DW_AT_type ; CHECK-NEXT:.b8 1 // DW_AT_declaration -; CHECK-NEXT:.b8 7 // Abbrev [7] 0x2240:0x5 DW_TAG_formal_parameter -; CHECK-NEXT:.b32 2100 // DW_AT_type -; CHECK-NEXT:.b8 7 // Abbrev [7] 0x2245:0x5 DW_TAG_formal_parameter -; CHECK-NEXT:.b32 2100 // DW_AT_type -; CHECK-NEXT:.b8 7 // Abbrev [7] 0x224a:0x5 
DW_TAG_formal_parameter -; CHECK-NEXT:.b32 2100 // DW_AT_type +; CHECK-NEXT:.b8 7 // Abbrev [7] 0x2288:0x5 DW_TAG_formal_parameter +; CHECK-NEXT:.b32 2172 // DW_AT_type +; CHECK-NEXT:.b8 7 // Abbrev [7] 0x228d:0x5 DW_TAG_formal_parameter +; CHECK-NEXT:.b32 2172 // DW_AT_type +; CHECK-NEXT:.b8 7 // Abbrev [7] 0x2292:0x5 DW_TAG_formal_parameter +; CHECK-NEXT:.b32 2172 // DW_AT_type ; CHECK-NEXT:.b8 0 // End Of Children Mark -; CHECK-NEXT:.b8 46 // Abbrev [46] 0x2250:0x26 DW_TAG_subprogram +; CHECK-NEXT:.b8 48 // Abbrev [48] 0x2298:0x26 DW_TAG_subprogram ; CHECK-NEXT:.b8 95 // DW_AT_MIPS_linkage_name ; CHECK-NEXT:.b8 90 ; CHECK-NEXT:.b8 76 @@ -7487,14 +7552,14 @@ if.end: ; preds = %if.then, %entry ; CHECK-NEXT:.b8 14 // DW_AT_decl_file ; CHECK-NEXT:.b8 110 // DW_AT_decl_line ; CHECK-NEXT:.b8 2 -; CHECK-NEXT:.b32 2100 // DW_AT_type +; CHECK-NEXT:.b32 2172 // DW_AT_type ; CHECK-NEXT:.b8 1 // DW_AT_declaration -; CHECK-NEXT:.b8 7 // Abbrev [7] 0x226b:0x5 DW_TAG_formal_parameter -; CHECK-NEXT:.b32 2100 // DW_AT_type -; CHECK-NEXT:.b8 7 // Abbrev [7] 0x2270:0x5 DW_TAG_formal_parameter -; CHECK-NEXT:.b32 2100 // DW_AT_type +; CHECK-NEXT:.b8 7 // Abbrev [7] 0x22b3:0x5 DW_TAG_formal_parameter +; CHECK-NEXT:.b32 2172 // DW_AT_type +; CHECK-NEXT:.b8 7 // Abbrev [7] 0x22b8:0x5 DW_TAG_formal_parameter +; CHECK-NEXT:.b32 2172 // DW_AT_type ; CHECK-NEXT:.b8 0 // End Of Children Mark -; CHECK-NEXT:.b8 46 // Abbrev [46] 0x2276:0x26 DW_TAG_subprogram +; CHECK-NEXT:.b8 48 // Abbrev [48] 0x22be:0x26 DW_TAG_subprogram ; CHECK-NEXT:.b8 95 // DW_AT_MIPS_linkage_name ; CHECK-NEXT:.b8 90 ; CHECK-NEXT:.b8 76 @@ -7516,14 +7581,14 @@ if.end: ; preds = %if.then, %entry ; CHECK-NEXT:.b8 14 // DW_AT_decl_file ; CHECK-NEXT:.b8 105 // DW_AT_decl_line ; CHECK-NEXT:.b8 2 -; CHECK-NEXT:.b32 2100 // DW_AT_type +; CHECK-NEXT:.b32 2172 // DW_AT_type ; CHECK-NEXT:.b8 1 // DW_AT_declaration -; CHECK-NEXT:.b8 7 // Abbrev [7] 0x2291:0x5 DW_TAG_formal_parameter -; CHECK-NEXT:.b32 2100 // DW_AT_type -; 
CHECK-NEXT:.b8 7 // Abbrev [7] 0x2296:0x5 DW_TAG_formal_parameter -; CHECK-NEXT:.b32 2100 // DW_AT_type +; CHECK-NEXT:.b8 7 // Abbrev [7] 0x22d9:0x5 DW_TAG_formal_parameter +; CHECK-NEXT:.b32 2172 // DW_AT_type +; CHECK-NEXT:.b8 7 // Abbrev [7] 0x22de:0x5 DW_TAG_formal_parameter +; CHECK-NEXT:.b32 2172 // DW_AT_type ; CHECK-NEXT:.b8 0 // End Of Children Mark -; CHECK-NEXT:.b8 46 // Abbrev [46] 0x229c:0x26 DW_TAG_subprogram +; CHECK-NEXT:.b8 48 // Abbrev [48] 0x22e4:0x26 DW_TAG_subprogram ; CHECK-NEXT:.b8 95 // DW_AT_MIPS_linkage_name ; CHECK-NEXT:.b8 90 ; CHECK-NEXT:.b8 76 @@ -7545,14 +7610,14 @@ if.end: ; preds = %if.then, %entry ; CHECK-NEXT:.b8 12 // DW_AT_decl_file ; CHECK-NEXT:.b8 17 // DW_AT_decl_line ; CHECK-NEXT:.b8 6 -; CHECK-NEXT:.b32 2100 // DW_AT_type +; CHECK-NEXT:.b32 2172 // DW_AT_type ; CHECK-NEXT:.b8 1 // DW_AT_declaration -; CHECK-NEXT:.b8 7 // Abbrev [7] 0x22b7:0x5 DW_TAG_formal_parameter -; CHECK-NEXT:.b32 2100 // DW_AT_type -; CHECK-NEXT:.b8 7 // Abbrev [7] 0x22bc:0x5 DW_TAG_formal_parameter -; CHECK-NEXT:.b32 2100 // DW_AT_type +; CHECK-NEXT:.b8 7 // Abbrev [7] 0x22ff:0x5 DW_TAG_formal_parameter +; CHECK-NEXT:.b32 2172 // DW_AT_type +; CHECK-NEXT:.b8 7 // Abbrev [7] 0x2304:0x5 DW_TAG_formal_parameter +; CHECK-NEXT:.b32 2172 // DW_AT_type ; CHECK-NEXT:.b8 0 // End Of Children Mark -; CHECK-NEXT:.b8 46 // Abbrev [46] 0x22c2:0x29 DW_TAG_subprogram +; CHECK-NEXT:.b8 48 // Abbrev [48] 0x230a:0x29 DW_TAG_subprogram ; CHECK-NEXT:.b8 95 // DW_AT_MIPS_linkage_name ; CHECK-NEXT:.b8 90 ; CHECK-NEXT:.b8 76 @@ -7577,14 +7642,14 @@ if.end: ; preds = %if.then, %entry ; CHECK-NEXT:.b8 12 // DW_AT_decl_file ; CHECK-NEXT:.b8 7 // DW_AT_decl_line ; CHECK-NEXT:.b8 6 -; CHECK-NEXT:.b32 2100 // DW_AT_type +; CHECK-NEXT:.b32 2172 // DW_AT_type ; CHECK-NEXT:.b8 1 // DW_AT_declaration -; CHECK-NEXT:.b8 7 // Abbrev [7] 0x22e0:0x5 DW_TAG_formal_parameter -; CHECK-NEXT:.b32 2100 // DW_AT_type -; CHECK-NEXT:.b8 7 // Abbrev [7] 0x22e5:0x5 DW_TAG_formal_parameter -; 
CHECK-NEXT:.b32 4630 // DW_AT_type +; CHECK-NEXT:.b8 7 // Abbrev [7] 0x2328:0x5 DW_TAG_formal_parameter +; CHECK-NEXT:.b32 2172 // DW_AT_type +; CHECK-NEXT:.b8 7 // Abbrev [7] 0x232d:0x5 DW_TAG_formal_parameter +; CHECK-NEXT:.b32 4702 // DW_AT_type ; CHECK-NEXT:.b8 0 // End Of Children Mark -; CHECK-NEXT:.b8 46 // Abbrev [46] 0x22eb:0x28 DW_TAG_subprogram +; CHECK-NEXT:.b8 48 // Abbrev [48] 0x2333:0x28 DW_TAG_subprogram ; CHECK-NEXT:.b8 95 // DW_AT_MIPS_linkage_name ; CHECK-NEXT:.b8 90 ; CHECK-NEXT:.b8 76 @@ -7608,14 +7673,14 @@ if.end: ; preds = %if.then, %entry ; CHECK-NEXT:.b8 12 // DW_AT_decl_file ; CHECK-NEXT:.b8 110 // DW_AT_decl_line ; CHECK-NEXT:.b8 5 -; CHECK-NEXT:.b32 2100 // DW_AT_type +; CHECK-NEXT:.b32 2172 // DW_AT_type ; CHECK-NEXT:.b8 1 // DW_AT_declaration -; CHECK-NEXT:.b8 7 // Abbrev [7] 0x2308:0x5 DW_TAG_formal_parameter -; CHECK-NEXT:.b32 2100 // DW_AT_type -; CHECK-NEXT:.b8 7 // Abbrev [7] 0x230d:0x5 DW_TAG_formal_parameter -; CHECK-NEXT:.b32 2100 // DW_AT_type +; CHECK-NEXT:.b8 7 // Abbrev [7] 0x2350:0x5 DW_TAG_formal_parameter +; CHECK-NEXT:.b32 2172 // DW_AT_type +; CHECK-NEXT:.b8 7 // Abbrev [7] 0x2355:0x5 DW_TAG_formal_parameter +; CHECK-NEXT:.b32 2172 // DW_AT_type ; CHECK-NEXT:.b8 0 // End Of Children Mark -; CHECK-NEXT:.b8 46 // Abbrev [46] 0x2313:0x22 DW_TAG_subprogram +; CHECK-NEXT:.b8 48 // Abbrev [48] 0x235b:0x22 DW_TAG_subprogram ; CHECK-NEXT:.b8 95 // DW_AT_MIPS_linkage_name ; CHECK-NEXT:.b8 90 ; CHECK-NEXT:.b8 76 @@ -7638,12 +7703,12 @@ if.end: ; preds = %if.then, %entry ; CHECK-NEXT:.b8 12 // DW_AT_decl_file ; CHECK-NEXT:.b8 85 // DW_AT_decl_line ; CHECK-NEXT:.b8 6 -; CHECK-NEXT:.b32 4585 // DW_AT_type +; CHECK-NEXT:.b32 4657 // DW_AT_type ; CHECK-NEXT:.b8 1 // DW_AT_declaration -; CHECK-NEXT:.b8 7 // Abbrev [7] 0x232f:0x5 DW_TAG_formal_parameter -; CHECK-NEXT:.b32 2100 // DW_AT_type +; CHECK-NEXT:.b8 7 // Abbrev [7] 0x2377:0x5 DW_TAG_formal_parameter +; CHECK-NEXT:.b32 2172 // DW_AT_type ; CHECK-NEXT:.b8 0 // End Of Children 
Mark -; CHECK-NEXT:.b8 46 // Abbrev [46] 0x2335:0x28 DW_TAG_subprogram +; CHECK-NEXT:.b8 48 // Abbrev [48] 0x237d:0x28 DW_TAG_subprogram ; CHECK-NEXT:.b8 95 // DW_AT_MIPS_linkage_name ; CHECK-NEXT:.b8 90 ; CHECK-NEXT:.b8 76 @@ -7667,14 +7732,14 @@ if.end: ; preds = %if.then, %entry ; CHECK-NEXT:.b8 12 // DW_AT_decl_file ; CHECK-NEXT:.b8 240 // DW_AT_decl_line ; CHECK-NEXT:.b8 5 -; CHECK-NEXT:.b32 2100 // DW_AT_type +; CHECK-NEXT:.b32 2172 // DW_AT_type ; CHECK-NEXT:.b8 1 // DW_AT_declaration -; CHECK-NEXT:.b8 7 // Abbrev [7] 0x2352:0x5 DW_TAG_formal_parameter -; CHECK-NEXT:.b32 2100 // DW_AT_type -; CHECK-NEXT:.b8 7 // Abbrev [7] 0x2357:0x5 DW_TAG_formal_parameter -; CHECK-NEXT:.b32 4585 // DW_AT_type +; CHECK-NEXT:.b8 7 // Abbrev [7] 0x239a:0x5 DW_TAG_formal_parameter +; CHECK-NEXT:.b32 2172 // DW_AT_type +; CHECK-NEXT:.b8 7 // Abbrev [7] 0x239f:0x5 DW_TAG_formal_parameter +; CHECK-NEXT:.b32 4657 // DW_AT_type ; CHECK-NEXT:.b8 0 // End Of Children Mark -; CHECK-NEXT:.b8 46 // Abbrev [46] 0x235d:0x24 DW_TAG_subprogram +; CHECK-NEXT:.b8 48 // Abbrev [48] 0x23a5:0x24 DW_TAG_subprogram ; CHECK-NEXT:.b8 95 // DW_AT_MIPS_linkage_name ; CHECK-NEXT:.b8 90 ; CHECK-NEXT:.b8 76 @@ -7699,12 +7764,12 @@ if.end: ; preds = %if.then, %entry ; CHECK-NEXT:.b8 12 // DW_AT_decl_file ; CHECK-NEXT:.b8 235 // DW_AT_decl_line ; CHECK-NEXT:.b8 5 -; CHECK-NEXT:.b32 2100 // DW_AT_type +; CHECK-NEXT:.b32 2172 // DW_AT_type ; CHECK-NEXT:.b8 1 // DW_AT_declaration -; CHECK-NEXT:.b8 7 // Abbrev [7] 0x237b:0x5 DW_TAG_formal_parameter -; CHECK-NEXT:.b32 2100 // DW_AT_type +; CHECK-NEXT:.b8 7 // Abbrev [7] 0x23c3:0x5 DW_TAG_formal_parameter +; CHECK-NEXT:.b32 2172 // DW_AT_type ; CHECK-NEXT:.b8 0 // End Of Children Mark -; CHECK-NEXT:.b8 46 // Abbrev [46] 0x2381:0x24 DW_TAG_subprogram +; CHECK-NEXT:.b8 48 // Abbrev [48] 0x23c9:0x24 DW_TAG_subprogram ; CHECK-NEXT:.b8 95 // DW_AT_MIPS_linkage_name ; CHECK-NEXT:.b8 90 ; CHECK-NEXT:.b8 76 @@ -7729,12 +7794,12 @@ if.end: ; preds = %if.then, %entry ; 
CHECK-NEXT:.b8 12 // DW_AT_decl_file ; CHECK-NEXT:.b8 125 // DW_AT_decl_line ; CHECK-NEXT:.b8 4 -; CHECK-NEXT:.b32 3770 // DW_AT_type +; CHECK-NEXT:.b32 3842 // DW_AT_type ; CHECK-NEXT:.b8 1 // DW_AT_declaration -; CHECK-NEXT:.b8 7 // Abbrev [7] 0x239f:0x5 DW_TAG_formal_parameter -; CHECK-NEXT:.b32 2100 // DW_AT_type +; CHECK-NEXT:.b8 7 // Abbrev [7] 0x23e7:0x5 DW_TAG_formal_parameter +; CHECK-NEXT:.b32 2172 // DW_AT_type ; CHECK-NEXT:.b8 0 // End Of Children Mark -; CHECK-NEXT:.b8 46 // Abbrev [46] 0x23a5:0x26 DW_TAG_subprogram +; CHECK-NEXT:.b8 48 // Abbrev [48] 0x23ed:0x26 DW_TAG_subprogram ; CHECK-NEXT:.b8 95 // DW_AT_MIPS_linkage_name ; CHECK-NEXT:.b8 90 ; CHECK-NEXT:.b8 76 @@ -7761,12 +7826,12 @@ if.end: ; preds = %if.then, %entry ; CHECK-NEXT:.b8 12 // DW_AT_decl_file ; CHECK-NEXT:.b8 66 // DW_AT_decl_line ; CHECK-NEXT:.b8 6 -; CHECK-NEXT:.b32 3770 // DW_AT_type +; CHECK-NEXT:.b32 3842 // DW_AT_type ; CHECK-NEXT:.b8 1 // DW_AT_declaration -; CHECK-NEXT:.b8 7 // Abbrev [7] 0x23c5:0x5 DW_TAG_formal_parameter -; CHECK-NEXT:.b32 2100 // DW_AT_type +; CHECK-NEXT:.b8 7 // Abbrev [7] 0x240d:0x5 DW_TAG_formal_parameter +; CHECK-NEXT:.b32 2172 // DW_AT_type ; CHECK-NEXT:.b8 0 // End Of Children Mark -; CHECK-NEXT:.b8 46 // Abbrev [46] 0x23cb:0x22 DW_TAG_subprogram +; CHECK-NEXT:.b8 48 // Abbrev [48] 0x2413:0x22 DW_TAG_subprogram ; CHECK-NEXT:.b8 95 // DW_AT_MIPS_linkage_name ; CHECK-NEXT:.b8 90 ; CHECK-NEXT:.b8 76 @@ -7789,12 +7854,12 @@ if.end: ; preds = %if.then, %entry ; CHECK-NEXT:.b8 12 // DW_AT_decl_file ; CHECK-NEXT:.b8 76 // DW_AT_decl_line ; CHECK-NEXT:.b8 5 -; CHECK-NEXT:.b32 2100 // DW_AT_type +; CHECK-NEXT:.b32 2172 // DW_AT_type ; CHECK-NEXT:.b8 1 // DW_AT_declaration -; CHECK-NEXT:.b8 7 // Abbrev [7] 0x23e7:0x5 DW_TAG_formal_parameter -; CHECK-NEXT:.b32 2100 // DW_AT_type +; CHECK-NEXT:.b8 7 // Abbrev [7] 0x242f:0x5 DW_TAG_formal_parameter +; CHECK-NEXT:.b32 2172 // DW_AT_type ; CHECK-NEXT:.b8 0 // End Of Children Mark -; CHECK-NEXT:.b8 46 // Abbrev 
[46] 0x23ed:0x22 DW_TAG_subprogram +; CHECK-NEXT:.b8 48 // Abbrev [48] 0x2435:0x22 DW_TAG_subprogram ; CHECK-NEXT:.b8 95 // DW_AT_MIPS_linkage_name ; CHECK-NEXT:.b8 90 ; CHECK-NEXT:.b8 76 @@ -7817,12 +7882,12 @@ if.end: ; preds = %if.then, %entry ; CHECK-NEXT:.b8 12 // DW_AT_decl_file ; CHECK-NEXT:.b8 85 // DW_AT_decl_line ; CHECK-NEXT:.b8 5 -; CHECK-NEXT:.b32 2100 // DW_AT_type +; CHECK-NEXT:.b32 2172 // DW_AT_type ; CHECK-NEXT:.b8 1 // DW_AT_declaration -; CHECK-NEXT:.b8 7 // Abbrev [7] 0x2409:0x5 DW_TAG_formal_parameter -; CHECK-NEXT:.b32 2100 // DW_AT_type +; CHECK-NEXT:.b8 7 // Abbrev [7] 0x2451:0x5 DW_TAG_formal_parameter +; CHECK-NEXT:.b32 2172 // DW_AT_type ; CHECK-NEXT:.b8 0 // End Of Children Mark -; CHECK-NEXT:.b8 46 // Abbrev [46] 0x240f:0x20 DW_TAG_subprogram +; CHECK-NEXT:.b8 48 // Abbrev [48] 0x2457:0x20 DW_TAG_subprogram ; CHECK-NEXT:.b8 95 // DW_AT_MIPS_linkage_name ; CHECK-NEXT:.b8 90 ; CHECK-NEXT:.b8 76 @@ -7843,12 +7908,12 @@ if.end: ; preds = %if.then, %entry ; CHECK-NEXT:.b8 12 // DW_AT_decl_file ; CHECK-NEXT:.b8 5 // DW_AT_decl_line ; CHECK-NEXT:.b8 5 -; CHECK-NEXT:.b32 2100 // DW_AT_type +; CHECK-NEXT:.b32 2172 // DW_AT_type ; CHECK-NEXT:.b8 1 // DW_AT_declaration -; CHECK-NEXT:.b8 7 // Abbrev [7] 0x2429:0x5 DW_TAG_formal_parameter -; CHECK-NEXT:.b32 2100 // DW_AT_type +; CHECK-NEXT:.b8 7 // Abbrev [7] 0x2471:0x5 DW_TAG_formal_parameter +; CHECK-NEXT:.b32 2172 // DW_AT_type ; CHECK-NEXT:.b8 0 // End Of Children Mark -; CHECK-NEXT:.b8 46 // Abbrev [46] 0x242f:0x20 DW_TAG_subprogram +; CHECK-NEXT:.b8 48 // Abbrev [48] 0x2477:0x20 DW_TAG_subprogram ; CHECK-NEXT:.b8 95 // DW_AT_MIPS_linkage_name ; CHECK-NEXT:.b8 90 ; CHECK-NEXT:.b8 76 @@ -7869,12 +7934,12 @@ if.end: ; preds = %if.then, %entry ; CHECK-NEXT:.b8 12 // DW_AT_decl_file ; CHECK-NEXT:.b8 90 // DW_AT_decl_line ; CHECK-NEXT:.b8 6 -; CHECK-NEXT:.b32 2100 // DW_AT_type +; CHECK-NEXT:.b32 2172 // DW_AT_type ; CHECK-NEXT:.b8 1 // DW_AT_declaration -; CHECK-NEXT:.b8 7 // Abbrev [7] 0x2449:0x5 
DW_TAG_formal_parameter -; CHECK-NEXT:.b32 2100 // DW_AT_type +; CHECK-NEXT:.b8 7 // Abbrev [7] 0x2491:0x5 DW_TAG_formal_parameter +; CHECK-NEXT:.b32 2172 // DW_AT_type ; CHECK-NEXT:.b8 0 // End Of Children Mark -; CHECK-NEXT:.b8 46 // Abbrev [46] 0x244f:0x1e DW_TAG_subprogram +; CHECK-NEXT:.b8 48 // Abbrev [48] 0x2497:0x1e DW_TAG_subprogram ; CHECK-NEXT:.b8 95 // DW_AT_MIPS_linkage_name ; CHECK-NEXT:.b8 90 ; CHECK-NEXT:.b8 76 @@ -7893,12 +7958,12 @@ if.end: ; preds = %if.then, %entry ; CHECK-NEXT:.b8 12 // DW_AT_decl_file ; CHECK-NEXT:.b8 67 // DW_AT_decl_line ; CHECK-NEXT:.b8 5 -; CHECK-NEXT:.b32 2100 // DW_AT_type +; CHECK-NEXT:.b32 2172 // DW_AT_type ; CHECK-NEXT:.b8 1 // DW_AT_declaration -; CHECK-NEXT:.b8 7 // Abbrev [7] 0x2467:0x5 DW_TAG_formal_parameter -; CHECK-NEXT:.b32 2100 // DW_AT_type +; CHECK-NEXT:.b8 7 // Abbrev [7] 0x24af:0x5 DW_TAG_formal_parameter +; CHECK-NEXT:.b32 2172 // DW_AT_type ; CHECK-NEXT:.b8 0 // End Of Children Mark -; CHECK-NEXT:.b8 46 // Abbrev [46] 0x246d:0x22 DW_TAG_subprogram +; CHECK-NEXT:.b8 48 // Abbrev [48] 0x24b5:0x22 DW_TAG_subprogram ; CHECK-NEXT:.b8 95 // DW_AT_MIPS_linkage_name ; CHECK-NEXT:.b8 90 ; CHECK-NEXT:.b8 76 @@ -7921,12 +7986,12 @@ if.end: ; preds = %if.then, %entry ; CHECK-NEXT:.b8 12 // DW_AT_decl_file ; CHECK-NEXT:.b8 116 // DW_AT_decl_line ; CHECK-NEXT:.b8 4 -; CHECK-NEXT:.b32 5170 // DW_AT_type +; CHECK-NEXT:.b32 5242 // DW_AT_type ; CHECK-NEXT:.b8 1 // DW_AT_declaration -; CHECK-NEXT:.b8 7 // Abbrev [7] 0x2489:0x5 DW_TAG_formal_parameter -; CHECK-NEXT:.b32 2100 // DW_AT_type +; CHECK-NEXT:.b8 7 // Abbrev [7] 0x24d1:0x5 DW_TAG_formal_parameter +; CHECK-NEXT:.b32 2172 // DW_AT_type ; CHECK-NEXT:.b8 0 // End Of Children Mark -; CHECK-NEXT:.b8 46 // Abbrev [46] 0x248f:0x24 DW_TAG_subprogram +; CHECK-NEXT:.b8 48 // Abbrev [48] 0x24d7:0x24 DW_TAG_subprogram ; CHECK-NEXT:.b8 95 // DW_AT_MIPS_linkage_name ; CHECK-NEXT:.b8 90 ; CHECK-NEXT:.b8 76 @@ -7951,12 +8016,12 @@ if.end: ; preds = %if.then, %entry ; 
CHECK-NEXT:.b8 12 // DW_AT_decl_file ; CHECK-NEXT:.b8 71 // DW_AT_decl_line ; CHECK-NEXT:.b8 6 -; CHECK-NEXT:.b32 5170 // DW_AT_type +; CHECK-NEXT:.b32 5242 // DW_AT_type ; CHECK-NEXT:.b8 1 // DW_AT_declaration -; CHECK-NEXT:.b8 7 // Abbrev [7] 0x24ad:0x5 DW_TAG_formal_parameter -; CHECK-NEXT:.b32 2100 // DW_AT_type +; CHECK-NEXT:.b8 7 // Abbrev [7] 0x24f5:0x5 DW_TAG_formal_parameter +; CHECK-NEXT:.b32 2172 // DW_AT_type ; CHECK-NEXT:.b8 0 // End Of Children Mark -; CHECK-NEXT:.b8 46 // Abbrev [46] 0x24b3:0x27 DW_TAG_subprogram +; CHECK-NEXT:.b8 48 // Abbrev [48] 0x24fb:0x27 DW_TAG_subprogram ; CHECK-NEXT:.b8 95 // DW_AT_MIPS_linkage_name ; CHECK-NEXT:.b8 90 ; CHECK-NEXT:.b8 76 @@ -7979,14 +8044,14 @@ if.end: ; preds = %if.then, %entry ; CHECK-NEXT:.b8 12 // DW_AT_decl_file ; CHECK-NEXT:.b8 12 // DW_AT_decl_line ; CHECK-NEXT:.b8 6 -; CHECK-NEXT:.b32 2100 // DW_AT_type +; CHECK-NEXT:.b32 2172 // DW_AT_type ; CHECK-NEXT:.b8 1 // DW_AT_declaration -; CHECK-NEXT:.b8 7 // Abbrev [7] 0x24cf:0x5 DW_TAG_formal_parameter -; CHECK-NEXT:.b32 2100 // DW_AT_type -; CHECK-NEXT:.b8 7 // Abbrev [7] 0x24d4:0x5 DW_TAG_formal_parameter -; CHECK-NEXT:.b32 2109 // DW_AT_type +; CHECK-NEXT:.b8 7 // Abbrev [7] 0x2517:0x5 DW_TAG_formal_parameter +; CHECK-NEXT:.b32 2172 // DW_AT_type +; CHECK-NEXT:.b8 7 // Abbrev [7] 0x251c:0x5 DW_TAG_formal_parameter +; CHECK-NEXT:.b32 2181 // DW_AT_type ; CHECK-NEXT:.b8 0 // End Of Children Mark -; CHECK-NEXT:.b8 46 // Abbrev [46] 0x24da:0x2b DW_TAG_subprogram +; CHECK-NEXT:.b8 48 // Abbrev [48] 0x2522:0x2b DW_TAG_subprogram ; CHECK-NEXT:.b8 95 // DW_AT_MIPS_linkage_name ; CHECK-NEXT:.b8 90 ; CHECK-NEXT:.b8 76 @@ -8018,12 +8083,12 @@ if.end: ; preds = %if.then, %entry ; CHECK-NEXT:.b8 12 // DW_AT_decl_file ; CHECK-NEXT:.b8 130 // DW_AT_decl_line ; CHECK-NEXT:.b8 4 -; CHECK-NEXT:.b32 2100 // DW_AT_type +; CHECK-NEXT:.b32 2172 // DW_AT_type ; CHECK-NEXT:.b8 1 // DW_AT_declaration -; CHECK-NEXT:.b8 7 // Abbrev [7] 0x24ff:0x5 DW_TAG_formal_parameter -; 
CHECK-NEXT:.b32 2100 // DW_AT_type +; CHECK-NEXT:.b8 7 // Abbrev [7] 0x2547:0x5 DW_TAG_formal_parameter +; CHECK-NEXT:.b32 2172 // DW_AT_type ; CHECK-NEXT:.b8 0 // End Of Children Mark -; CHECK-NEXT:.b8 46 // Abbrev [46] 0x2505:0x31 DW_TAG_subprogram +; CHECK-NEXT:.b8 48 // Abbrev [48] 0x254d:0x31 DW_TAG_subprogram ; CHECK-NEXT:.b8 95 // DW_AT_MIPS_linkage_name ; CHECK-NEXT:.b8 90 ; CHECK-NEXT:.b8 76 @@ -8056,14 +8121,14 @@ if.end: ; preds = %if.then, %entry ; CHECK-NEXT:.b8 12 // DW_AT_decl_file ; CHECK-NEXT:.b8 194 // DW_AT_decl_line ; CHECK-NEXT:.b8 4 -; CHECK-NEXT:.b32 2100 // DW_AT_type +; CHECK-NEXT:.b32 2172 // DW_AT_type ; CHECK-NEXT:.b8 1 // DW_AT_declaration -; CHECK-NEXT:.b8 7 // Abbrev [7] 0x252b:0x5 DW_TAG_formal_parameter -; CHECK-NEXT:.b32 2100 // DW_AT_type -; CHECK-NEXT:.b8 7 // Abbrev [7] 0x2530:0x5 DW_TAG_formal_parameter -; CHECK-NEXT:.b32 2100 // DW_AT_type +; CHECK-NEXT:.b8 7 // Abbrev [7] 0x2573:0x5 DW_TAG_formal_parameter +; CHECK-NEXT:.b32 2172 // DW_AT_type +; CHECK-NEXT:.b8 7 // Abbrev [7] 0x2578:0x5 DW_TAG_formal_parameter +; CHECK-NEXT:.b32 2172 // DW_AT_type ; CHECK-NEXT:.b8 0 // End Of Children Mark -; CHECK-NEXT:.b8 46 // Abbrev [46] 0x2536:0x24 DW_TAG_subprogram +; CHECK-NEXT:.b8 48 // Abbrev [48] 0x257e:0x24 DW_TAG_subprogram ; CHECK-NEXT:.b8 95 // DW_AT_MIPS_linkage_name ; CHECK-NEXT:.b8 90 ; CHECK-NEXT:.b8 76 @@ -8083,14 +8148,14 @@ if.end: ; preds = %if.then, %entry ; CHECK-NEXT:.b8 12 // DW_AT_decl_file ; CHECK-NEXT:.b8 47 // DW_AT_decl_line ; CHECK-NEXT:.b8 6 -; CHECK-NEXT:.b32 2100 // DW_AT_type +; CHECK-NEXT:.b32 2172 // DW_AT_type ; CHECK-NEXT:.b8 1 // DW_AT_declaration -; CHECK-NEXT:.b8 7 // Abbrev [7] 0x254f:0x5 DW_TAG_formal_parameter -; CHECK-NEXT:.b32 2100 // DW_AT_type -; CHECK-NEXT:.b8 7 // Abbrev [7] 0x2554:0x5 DW_TAG_formal_parameter -; CHECK-NEXT:.b32 2100 // DW_AT_type +; CHECK-NEXT:.b8 7 // Abbrev [7] 0x2597:0x5 DW_TAG_formal_parameter +; CHECK-NEXT:.b32 2172 // DW_AT_type +; CHECK-NEXT:.b8 7 // Abbrev [7] 
0x259c:0x5 DW_TAG_formal_parameter +; CHECK-NEXT:.b32 2172 // DW_AT_type ; CHECK-NEXT:.b8 0 // End Of Children Mark -; CHECK-NEXT:.b8 46 // Abbrev [46] 0x255a:0x31 DW_TAG_subprogram +; CHECK-NEXT:.b8 48 // Abbrev [48] 0x25a2:0x31 DW_TAG_subprogram ; CHECK-NEXT:.b8 95 // DW_AT_MIPS_linkage_name ; CHECK-NEXT:.b8 90 ; CHECK-NEXT:.b8 76 @@ -8123,14 +8188,14 @@ if.end: ; preds = %if.then, %entry ; CHECK-NEXT:.b8 12 // DW_AT_decl_file ; CHECK-NEXT:.b8 22 // DW_AT_decl_line ; CHECK-NEXT:.b8 6 -; CHECK-NEXT:.b32 2100 // DW_AT_type +; CHECK-NEXT:.b32 2172 // DW_AT_type ; CHECK-NEXT:.b8 1 // DW_AT_declaration -; CHECK-NEXT:.b8 7 // Abbrev [7] 0x2580:0x5 DW_TAG_formal_parameter -; CHECK-NEXT:.b32 2100 // DW_AT_type -; CHECK-NEXT:.b8 7 // Abbrev [7] 0x2585:0x5 DW_TAG_formal_parameter -; CHECK-NEXT:.b32 2100 // DW_AT_type +; CHECK-NEXT:.b8 7 // Abbrev [7] 0x25c8:0x5 DW_TAG_formal_parameter +; CHECK-NEXT:.b32 2172 // DW_AT_type +; CHECK-NEXT:.b8 7 // Abbrev [7] 0x25cd:0x5 DW_TAG_formal_parameter +; CHECK-NEXT:.b32 2172 // DW_AT_type ; CHECK-NEXT:.b8 0 // End Of Children Mark -; CHECK-NEXT:.b8 46 // Abbrev [46] 0x258b:0x31 DW_TAG_subprogram +; CHECK-NEXT:.b8 48 // Abbrev [48] 0x25d3:0x31 DW_TAG_subprogram ; CHECK-NEXT:.b8 95 // DW_AT_MIPS_linkage_name ; CHECK-NEXT:.b8 90 ; CHECK-NEXT:.b8 76 @@ -8158,16 +8223,16 @@ if.end: ; preds = %if.then, %entry ; CHECK-NEXT:.b8 12 // DW_AT_decl_file ; CHECK-NEXT:.b8 27 // DW_AT_decl_line ; CHECK-NEXT:.b8 6 -; CHECK-NEXT:.b32 2100 // DW_AT_type +; CHECK-NEXT:.b32 2172 // DW_AT_type ; CHECK-NEXT:.b8 1 // DW_AT_declaration -; CHECK-NEXT:.b8 7 // Abbrev [7] 0x25ac:0x5 DW_TAG_formal_parameter -; CHECK-NEXT:.b32 2100 // DW_AT_type -; CHECK-NEXT:.b8 7 // Abbrev [7] 0x25b1:0x5 DW_TAG_formal_parameter -; CHECK-NEXT:.b32 2100 // DW_AT_type -; CHECK-NEXT:.b8 7 // Abbrev [7] 0x25b6:0x5 DW_TAG_formal_parameter -; CHECK-NEXT:.b32 4630 // DW_AT_type +; CHECK-NEXT:.b8 7 // Abbrev [7] 0x25f4:0x5 DW_TAG_formal_parameter +; CHECK-NEXT:.b32 2172 // DW_AT_type +; 
CHECK-NEXT:.b8 7 // Abbrev [7] 0x25f9:0x5 DW_TAG_formal_parameter +; CHECK-NEXT:.b32 2172 // DW_AT_type +; CHECK-NEXT:.b8 7 // Abbrev [7] 0x25fe:0x5 DW_TAG_formal_parameter +; CHECK-NEXT:.b32 4702 // DW_AT_type ; CHECK-NEXT:.b8 0 // End Of Children Mark -; CHECK-NEXT:.b8 46 // Abbrev [46] 0x25bc:0x20 DW_TAG_subprogram +; CHECK-NEXT:.b8 48 // Abbrev [48] 0x2604:0x20 DW_TAG_subprogram ; CHECK-NEXT:.b8 95 // DW_AT_MIPS_linkage_name ; CHECK-NEXT:.b8 90 ; CHECK-NEXT:.b8 76 @@ -8188,12 +8253,12 @@ if.end: ; preds = %if.then, %entry ; CHECK-NEXT:.b8 12 // DW_AT_decl_file ; CHECK-NEXT:.b8 111 // DW_AT_decl_line ; CHECK-NEXT:.b8 4 -; CHECK-NEXT:.b32 2100 // DW_AT_type +; CHECK-NEXT:.b32 2172 // DW_AT_type ; CHECK-NEXT:.b8 1 // DW_AT_declaration -; CHECK-NEXT:.b8 7 // Abbrev [7] 0x25d6:0x5 DW_TAG_formal_parameter -; CHECK-NEXT:.b32 2100 // DW_AT_type +; CHECK-NEXT:.b8 7 // Abbrev [7] 0x261e:0x5 DW_TAG_formal_parameter +; CHECK-NEXT:.b32 2172 // DW_AT_type ; CHECK-NEXT:.b8 0 // End Of Children Mark -; CHECK-NEXT:.b8 46 // Abbrev [46] 0x25dc:0x22 DW_TAG_subprogram +; CHECK-NEXT:.b8 48 // Abbrev [48] 0x2624:0x22 DW_TAG_subprogram ; CHECK-NEXT:.b8 95 // DW_AT_MIPS_linkage_name ; CHECK-NEXT:.b8 90 ; CHECK-NEXT:.b8 76 @@ -8216,12 +8281,12 @@ if.end: ; preds = %if.then, %entry ; CHECK-NEXT:.b8 12 // DW_AT_decl_file ; CHECK-NEXT:.b8 61 // DW_AT_decl_line ; CHECK-NEXT:.b8 6 -; CHECK-NEXT:.b32 2100 // DW_AT_type +; CHECK-NEXT:.b32 2172 // DW_AT_type ; CHECK-NEXT:.b8 1 // DW_AT_declaration -; CHECK-NEXT:.b8 7 // Abbrev [7] 0x25f8:0x5 DW_TAG_formal_parameter -; CHECK-NEXT:.b32 2100 // DW_AT_type +; CHECK-NEXT:.b8 7 // Abbrev [7] 0x2640:0x5 DW_TAG_formal_parameter +; CHECK-NEXT:.b32 2172 // DW_AT_type ; CHECK-NEXT:.b8 0 // End Of Children Mark -; CHECK-NEXT:.b8 46 // Abbrev [46] 0x25fe:0x2c DW_TAG_subprogram +; CHECK-NEXT:.b8 48 // Abbrev [48] 0x2646:0x2c DW_TAG_subprogram ; CHECK-NEXT:.b8 95 // DW_AT_MIPS_linkage_name ; CHECK-NEXT:.b8 90 ; CHECK-NEXT:.b8 76 @@ -8249,14 +8314,14 @@ 
if.end: ; preds = %if.then, %entry ; CHECK-NEXT:.b8 12 // DW_AT_decl_file ; CHECK-NEXT:.b8 250 // DW_AT_decl_line ; CHECK-NEXT:.b8 5 -; CHECK-NEXT:.b32 2100 // DW_AT_type +; CHECK-NEXT:.b32 2172 // DW_AT_type ; CHECK-NEXT:.b8 1 // DW_AT_declaration -; CHECK-NEXT:.b8 7 // Abbrev [7] 0x261f:0x5 DW_TAG_formal_parameter -; CHECK-NEXT:.b32 2100 // DW_AT_type -; CHECK-NEXT:.b8 7 // Abbrev [7] 0x2624:0x5 DW_TAG_formal_parameter -; CHECK-NEXT:.b32 5170 // DW_AT_type +; CHECK-NEXT:.b8 7 // Abbrev [7] 0x2667:0x5 DW_TAG_formal_parameter +; CHECK-NEXT:.b32 2172 // DW_AT_type +; CHECK-NEXT:.b8 7 // Abbrev [7] 0x266c:0x5 DW_TAG_formal_parameter +; CHECK-NEXT:.b32 5242 // DW_AT_type ; CHECK-NEXT:.b8 0 // End Of Children Mark -; CHECK-NEXT:.b8 46 // Abbrev [46] 0x262a:0x2a DW_TAG_subprogram +; CHECK-NEXT:.b8 48 // Abbrev [48] 0x2672:0x2a DW_TAG_subprogram ; CHECK-NEXT:.b8 95 // DW_AT_MIPS_linkage_name ; CHECK-NEXT:.b8 90 ; CHECK-NEXT:.b8 76 @@ -8282,14 +8347,14 @@ if.end: ; preds = %if.then, %entry ; CHECK-NEXT:.b8 12 // DW_AT_decl_file ; CHECK-NEXT:.b8 245 // DW_AT_decl_line ; CHECK-NEXT:.b8 5 -; CHECK-NEXT:.b32 2100 // DW_AT_type +; CHECK-NEXT:.b32 2172 // DW_AT_type ; CHECK-NEXT:.b8 1 // DW_AT_declaration -; CHECK-NEXT:.b8 7 // Abbrev [7] 0x2649:0x5 DW_TAG_formal_parameter -; CHECK-NEXT:.b32 2100 // DW_AT_type -; CHECK-NEXT:.b8 7 // Abbrev [7] 0x264e:0x5 DW_TAG_formal_parameter -; CHECK-NEXT:.b32 4585 // DW_AT_type +; CHECK-NEXT:.b8 7 // Abbrev [7] 0x2691:0x5 DW_TAG_formal_parameter +; CHECK-NEXT:.b32 2172 // DW_AT_type +; CHECK-NEXT:.b8 7 // Abbrev [7] 0x2696:0x5 DW_TAG_formal_parameter +; CHECK-NEXT:.b32 4657 // DW_AT_type ; CHECK-NEXT:.b8 0 // End Of Children Mark -; CHECK-NEXT:.b8 46 // Abbrev [46] 0x2654:0x1e DW_TAG_subprogram +; CHECK-NEXT:.b8 48 // Abbrev [48] 0x269c:0x1e DW_TAG_subprogram ; CHECK-NEXT:.b8 95 // DW_AT_MIPS_linkage_name ; CHECK-NEXT:.b8 90 ; CHECK-NEXT:.b8 76 @@ -8308,12 +8373,12 @@ if.end: ; preds = %if.then, %entry ; CHECK-NEXT:.b8 12 // DW_AT_decl_file 
; CHECK-NEXT:.b8 210 // DW_AT_decl_line ; CHECK-NEXT:.b8 4 -; CHECK-NEXT:.b32 2100 // DW_AT_type +; CHECK-NEXT:.b32 2172 // DW_AT_type ; CHECK-NEXT:.b8 1 // DW_AT_declaration -; CHECK-NEXT:.b8 7 // Abbrev [7] 0x266c:0x5 DW_TAG_formal_parameter -; CHECK-NEXT:.b32 2100 // DW_AT_type +; CHECK-NEXT:.b8 7 // Abbrev [7] 0x26b4:0x5 DW_TAG_formal_parameter +; CHECK-NEXT:.b32 2172 // DW_AT_type ; CHECK-NEXT:.b8 0 // End Of Children Mark -; CHECK-NEXT:.b8 46 // Abbrev [46] 0x2672:0x20 DW_TAG_subprogram +; CHECK-NEXT:.b8 48 // Abbrev [48] 0x26ba:0x20 DW_TAG_subprogram ; CHECK-NEXT:.b8 95 // DW_AT_MIPS_linkage_name ; CHECK-NEXT:.b8 90 ; CHECK-NEXT:.b8 76 @@ -8334,12 +8399,12 @@ if.end: ; preds = %if.then, %entry ; CHECK-NEXT:.b8 12 // DW_AT_decl_file ; CHECK-NEXT:.b8 37 // DW_AT_decl_line ; CHECK-NEXT:.b8 5 -; CHECK-NEXT:.b32 2100 // DW_AT_type +; CHECK-NEXT:.b32 2172 // DW_AT_type ; CHECK-NEXT:.b8 1 // DW_AT_declaration -; CHECK-NEXT:.b8 7 // Abbrev [7] 0x268c:0x5 DW_TAG_formal_parameter -; CHECK-NEXT:.b32 2100 // DW_AT_type +; CHECK-NEXT:.b8 7 // Abbrev [7] 0x26d4:0x5 DW_TAG_formal_parameter +; CHECK-NEXT:.b32 2172 // DW_AT_type ; CHECK-NEXT:.b8 0 // End Of Children Mark -; CHECK-NEXT:.b8 46 // Abbrev [46] 0x2692:0x20 DW_TAG_subprogram +; CHECK-NEXT:.b8 48 // Abbrev [48] 0x26da:0x20 DW_TAG_subprogram ; CHECK-NEXT:.b8 95 // DW_AT_MIPS_linkage_name ; CHECK-NEXT:.b8 90 ; CHECK-NEXT:.b8 76 @@ -8360,12 +8425,12 @@ if.end: ; preds = %if.then, %entry ; CHECK-NEXT:.b8 14 // DW_AT_decl_file ; CHECK-NEXT:.b8 139 // DW_AT_decl_line ; CHECK-NEXT:.b8 3 -; CHECK-NEXT:.b32 2100 // DW_AT_type +; CHECK-NEXT:.b32 2172 // DW_AT_type ; CHECK-NEXT:.b8 1 // DW_AT_declaration -; CHECK-NEXT:.b8 7 // Abbrev [7] 0x26ac:0x5 DW_TAG_formal_parameter -; CHECK-NEXT:.b32 2100 // DW_AT_type +; CHECK-NEXT:.b8 7 // Abbrev [7] 0x26f4:0x5 DW_TAG_formal_parameter +; CHECK-NEXT:.b32 2172 // DW_AT_type ; CHECK-NEXT:.b8 0 // End Of Children Mark -; CHECK-NEXT:.b8 46 // Abbrev [46] 0x26b2:0x1e DW_TAG_subprogram +; 
CHECK-NEXT:.b8 48 // Abbrev [48] 0x26fa:0x1e DW_TAG_subprogram ; CHECK-NEXT:.b8 95 // DW_AT_MIPS_linkage_name ; CHECK-NEXT:.b8 90 ; CHECK-NEXT:.b8 76 @@ -8384,12 +8449,12 @@ if.end: ; preds = %if.then, %entry ; CHECK-NEXT:.b8 12 // DW_AT_decl_file ; CHECK-NEXT:.b8 252 // DW_AT_decl_line ; CHECK-NEXT:.b8 4 -; CHECK-NEXT:.b32 2100 // DW_AT_type +; CHECK-NEXT:.b32 2172 // DW_AT_type ; CHECK-NEXT:.b8 1 // DW_AT_declaration -; CHECK-NEXT:.b8 7 // Abbrev [7] 0x26ca:0x5 DW_TAG_formal_parameter -; CHECK-NEXT:.b32 2100 // DW_AT_type +; CHECK-NEXT:.b8 7 // Abbrev [7] 0x2712:0x5 DW_TAG_formal_parameter +; CHECK-NEXT:.b32 2172 // DW_AT_type ; CHECK-NEXT:.b8 0 // End Of Children Mark -; CHECK-NEXT:.b8 46 // Abbrev [46] 0x26d0:0x20 DW_TAG_subprogram +; CHECK-NEXT:.b8 48 // Abbrev [48] 0x2718:0x20 DW_TAG_subprogram ; CHECK-NEXT:.b8 95 // DW_AT_MIPS_linkage_name ; CHECK-NEXT:.b8 90 ; CHECK-NEXT:.b8 76 @@ -8410,12 +8475,12 @@ if.end: ; preds = %if.then, %entry ; CHECK-NEXT:.b8 12 // DW_AT_decl_file ; CHECK-NEXT:.b8 42 // DW_AT_decl_line ; CHECK-NEXT:.b8 5 -; CHECK-NEXT:.b32 2100 // DW_AT_type +; CHECK-NEXT:.b32 2172 // DW_AT_type ; CHECK-NEXT:.b8 1 // DW_AT_declaration -; CHECK-NEXT:.b8 7 // Abbrev [7] 0x26ea:0x5 DW_TAG_formal_parameter -; CHECK-NEXT:.b32 2100 // DW_AT_type +; CHECK-NEXT:.b8 7 // Abbrev [7] 0x2732:0x5 DW_TAG_formal_parameter +; CHECK-NEXT:.b32 2172 // DW_AT_type ; CHECK-NEXT:.b8 0 // End Of Children Mark -; CHECK-NEXT:.b8 46 // Abbrev [46] 0x26f0:0x24 DW_TAG_subprogram +; CHECK-NEXT:.b8 48 // Abbrev [48] 0x2738:0x24 DW_TAG_subprogram ; CHECK-NEXT:.b8 95 // DW_AT_MIPS_linkage_name ; CHECK-NEXT:.b8 90 ; CHECK-NEXT:.b8 76 @@ -8440,12 +8505,12 @@ if.end: ; preds = %if.then, %entry ; CHECK-NEXT:.b8 12 // DW_AT_decl_file ; CHECK-NEXT:.b8 56 // DW_AT_decl_line ; CHECK-NEXT:.b8 6 -; CHECK-NEXT:.b32 2100 // DW_AT_type +; CHECK-NEXT:.b32 2172 // DW_AT_type ; CHECK-NEXT:.b8 1 // DW_AT_declaration -; CHECK-NEXT:.b8 7 // Abbrev [7] 0x270e:0x5 DW_TAG_formal_parameter -; 
CHECK-NEXT:.b32 2100 // DW_AT_type +; CHECK-NEXT:.b8 7 // Abbrev [7] 0x2756:0x5 DW_TAG_formal_parameter +; CHECK-NEXT:.b32 2172 // DW_AT_type ; CHECK-NEXT:.b8 0 // End Of Children Mark -; CHECK-NEXT:.b8 46 // Abbrev [46] 0x2714:0x22 DW_TAG_subprogram +; CHECK-NEXT:.b8 48 // Abbrev [48] 0x275c:0x22 DW_TAG_subprogram ; CHECK-NEXT:.b8 95 // DW_AT_MIPS_linkage_name ; CHECK-NEXT:.b8 90 ; CHECK-NEXT:.b8 76 @@ -8468,10 +8533,10 @@ if.end: ; preds = %if.then, %entry ; CHECK-NEXT:.b8 14 // DW_AT_decl_file ; CHECK-NEXT:.b8 150 // DW_AT_decl_line ; CHECK-NEXT:.b8 2 -; CHECK-NEXT:.b32 2100 // DW_AT_type +; CHECK-NEXT:.b32 2172 // DW_AT_type ; CHECK-NEXT:.b8 1 // DW_AT_declaration -; CHECK-NEXT:.b8 7 // Abbrev [7] 0x2730:0x5 DW_TAG_formal_parameter -; CHECK-NEXT:.b32 2100 // DW_AT_type +; CHECK-NEXT:.b8 7 // Abbrev [7] 0x2778:0x5 DW_TAG_formal_parameter +; CHECK-NEXT:.b32 2172 // DW_AT_type ; CHECK-NEXT:.b8 0 // End Of Children Mark ; CHECK-NEXT:.b8 0 // End Of Children Mark ; CHECK-NEXT: } diff --git a/llvm/test/DebugInfo/X86/DW_AT_object_pointer.ll b/llvm/test/DebugInfo/X86/DW_AT_object_pointer.ll index d9988ac31451e..30d4203466766 100644 --- a/llvm/test/DebugInfo/X86/DW_AT_object_pointer.ll +++ b/llvm/test/DebugInfo/X86/DW_AT_object_pointer.ll @@ -5,7 +5,15 @@ ; CHECK-NOT: "" ; CHECK: DW_TAG ; CHECK: DW_TAG_class_type -; CHECK: DW_AT_object_pointer [DW_FORM_ref4] (cu + 0x{{[0-9a-f]*}} => {[[PARAM:0x[0-9a-f]*]]}) +; CHECK: [[DECL:0x[0-9a-f]+]]: DW_TAG_subprogram +; CHECK: DW_AT_name {{.*}} "A" +; CHECK: DW_AT_object_pointer [DW_FORM_ref4] +; CHECK-SAME: (cu + 0x{{[0-9a-f]*}} => {[[DECL_PARAM:0x[0-9a-f]*]]}) +; CHECK: [[DECL_PARAM]]: DW_TAG_formal_parameter +; +; CHECK: DW_TAG_subprogram +; CHECK: DW_AT_specification [DW_FORM_ref4] (cu + {{.*}} => {[[DECL]]} +; CHECK: DW_AT_object_pointer [DW_FORM_ref4] (cu + 0x{{[0-9a-f]*}} => {[[PARAM:0x[0-9a-f]*]]}) ; CHECK: [[PARAM]]: DW_TAG_formal_parameter ; CHECK-NOT: DW_TAG ; CHECK: DW_AT_name [DW_FORM_strp] ( 
.debug_str[0x{{[0-9a-f]*}}] = "this") diff --git a/llvm/test/DebugInfo/X86/dwarf-public-names.ll b/llvm/test/DebugInfo/X86/dwarf-public-names.ll index c2274511d4191..a484c094892d0 100644 --- a/llvm/test/DebugInfo/X86/dwarf-public-names.ll +++ b/llvm/test/DebugInfo/X86/dwarf-public-names.ll @@ -61,7 +61,7 @@ ; Skip the output to the header of the pubnames section. ; LINUX: debug_pubnames -; LINUX-NEXT: unit_size = 0x00000128 +; LINUX-NEXT: unit_size = ; Check for each name in the output. ; LINUX-DAG: "ns" diff --git a/llvm/test/MC/AMDGPU/gfx11_asm_vop3.s b/llvm/test/MC/AMDGPU/gfx11_asm_vop3.s index 6bc92bc29ea8a..40e3fbda47787 100644 --- a/llvm/test/MC/AMDGPU/gfx11_asm_vop3.s +++ b/llvm/test/MC/AMDGPU/gfx11_asm_vop3.s @@ -899,104 +899,131 @@ v_bfm_b32 v5, src_scc, vcc_lo v_bfm_b32 v255, 0xaf123456, vcc_hi // GFX11: v_bfm_b32 v255, 0xaf123456, vcc_hi ; encoding: [0xff,0x00,0x1d,0xd7,0xff,0xd6,0x00,0x00,0x56,0x34,0x12,0xaf] -v_cndmask_b16 v5, v1, src_scc, s3 -// W32: v_cndmask_b16 v5, v1, src_scc, s3 ; encoding: [0x05,0x00,0x5d,0xd6,0x01,0xfb,0x0d,0x00] -// W64-ERR: :[[@LINE-2]]:32: error: invalid operand for instruction +v_cndmask_b16 v5.l, v1.l, src_scc, s3 +// W32: v_cndmask_b16 v5.l, v1.l, src_scc, s3 ; encoding: [0x05,0x00,0x5d,0xd6,0x01,0xfb,0x0d,0x00] +// W64-ERR: :[[@LINE-2]]:36: error: invalid operand for instruction -v_cndmask_b16 v5, v255, 0.5, s3 -// W32: v_cndmask_b16 v5, v255, 0.5, s3 ; encoding: [0x05,0x00,0x5d,0xd6,0xff,0xe1,0x0d,0x00] -// W64-ERR: :[[@LINE-2]]:30: error: invalid operand for instruction +v_cndmask_b16 v5.l, v255.l, 0.5, s3 +// W32: v_cndmask_b16 v5.l, v255.l, 0.5, s3 ; encoding: [0x05,0x00,0x5d,0xd6,0xff,0xe1,0x0d,0x00] +// W64-ERR: :[[@LINE-2]]:34: error: invalid operand for instruction -v_cndmask_b16 v5, s105, s105, s3 -// W32: v_cndmask_b16 v5, s105, s105, s3 ; encoding: [0x05,0x00,0x5d,0xd6,0x69,0xd2,0x0c,0x00] -// W64-ERR: :[[@LINE-2]]:19: error: invalid operand for instruction - -v_cndmask_b16 v5, vcc_hi, v2, s3 -// W32: 
v_cndmask_b16 v5, vcc_hi, v2, s3 ; encoding: [0x05,0x00,0x5d,0xd6,0x6b,0x04,0x0e,0x00] -// W64-ERR: :[[@LINE-2]]:19: error: invalid operand for instruction +v_cndmask_b16 v5.l, s105, s105, s3 +// W32: v_cndmask_b16 v5.l, s105, s105, s3 ; encoding: [0x05,0x00,0x5d,0xd6,0x69,0xd2,0x0c,0x00] +// W64-ERR: :[[@LINE-2]]:21: error: invalid operand for instruction -v_cndmask_b16 v5, ttmp15, ttmp15, s3 -// W32: v_cndmask_b16 v5, ttmp15, ttmp15, s3 ; encoding: [0x05,0x00,0x5d,0xd6,0x7b,0xf6,0x0c,0x00] -// W64-ERR: :[[@LINE-2]]:19: error: invalid operand for instruction +v_cndmask_b16 v5.l, vcc_hi, v2.l, s3 +// W32: v_cndmask_b16 v5.l, vcc_hi, v2.l, s3 ; encoding: [0x05,0x00,0x5d,0xd6,0x6b,0x04,0x0e,0x00] +// W64-ERR: :[[@LINE-2]]:21: error: invalid operand for instruction -v_cndmask_b16 v5, m0, v255, s3 -// W32: v_cndmask_b16 v5, m0, v255, s3 ; encoding: [0x05,0x00,0x5d,0xd6,0x7d,0xfe,0x0f,0x00] -// W64-ERR: :[[@LINE-2]]:19: error: invalid operand for instruction +v_cndmask_b16 v5.l, ttmp15, ttmp15, s3 +// W32: v_cndmask_b16 v5.l, ttmp15, ttmp15, s3 ; encoding: [0x05,0x00,0x5d,0xd6,0x7b,0xf6,0x0c,0x00] +// W64-ERR: :[[@LINE-2]]:21: error: invalid operand for instruction -v_cndmask_b16 v5, exec_lo, exec_lo, s3 -// W32: v_cndmask_b16 v5, exec_lo, exec_lo, s3 ; encoding: [0x05,0x00,0x5d,0xd6,0x7e,0xfc,0x0c,0x00] -// W64-ERR: :[[@LINE-2]]:19: error: invalid operand for instruction +v_cndmask_b16 v5.l, m0, v255.l, s3 +// W32: v_cndmask_b16 v5.l, m0, v255.l, s3 ; encoding: [0x05,0x00,0x5d,0xd6,0x7d,0xfe,0x0f,0x00] +// W64-ERR: :[[@LINE-2]]:21: error: invalid operand for instruction -v_cndmask_b16 v5, exec_hi, exec_hi, s3 -// W32: v_cndmask_b16 v5, exec_hi, exec_hi, s3 ; encoding: [0x05,0x00,0x5d,0xd6,0x7f,0xfe,0x0c,0x00] -// W64-ERR: :[[@LINE-2]]:19: error: invalid operand for instruction +v_cndmask_b16 v5.l, exec_lo, exec_lo, s3 +// W32: v_cndmask_b16 v5.l, exec_lo, exec_lo, s3 ; encoding: [0x05,0x00,0x5d,0xd6,0x7e,0xfc,0x0c,0x00] +// W64-ERR: :[[@LINE-2]]:21: error: invalid 
operand for instruction -v_cndmask_b16 v5, null, m0, s105 -// W32: v_cndmask_b16 v5, null, m0, s105 ; encoding: [0x05,0x00,0x5d,0xd6,0x7c,0xfa,0xa4,0x01] -// W64-ERR: :[[@LINE-2]]:19: error: invalid operand for instruction +v_cndmask_b16 v5.l, exec_hi, exec_hi, s3 +// W32: v_cndmask_b16 v5.l, exec_hi, exec_hi, s3 ; encoding: [0x05,0x00,0x5d,0xd6,0x7f,0xfe,0x0c,0x00] +// W64-ERR: :[[@LINE-2]]:21: error: invalid operand for instruction -v_cndmask_b16 v5, -1, -|vcc_lo|, vcc_lo -// W32: v_cndmask_b16 v5, -1, -|vcc_lo|, vcc_lo ; encoding: [0x05,0x02,0x5d,0xd6,0xc1,0xd4,0xa8,0x41] -// W64-ERR: :[[@LINE-2]]:19: error: invalid operand for instruction +v_cndmask_b16 v5.l, null, m0, s105 +// W32: v_cndmask_b16 v5.l, null, m0, s105 ; encoding: [0x05,0x00,0x5d,0xd6,0x7c,0xfa,0xa4,0x01] +// W64-ERR: :[[@LINE-2]]:21: error: invalid operand for instruction -v_cndmask_b16 v5, 0.5, -1, vcc_hi -// W32: v_cndmask_b16 v5, 0.5, -1, vcc_hi ; encoding: [0x05,0x00,0x5d,0xd6,0xf0,0x82,0xad,0x01] -// W64-ERR: :[[@LINE-2]]:19: error: invalid operand for instruction +v_cndmask_b16 v5.l, -1, -|vcc_lo|, vcc_lo +// W32: v_cndmask_b16 v5.l, -1, -|vcc_lo|, vcc_lo ; encoding: [0x05,0x02,0x5d,0xd6,0xc1,0xd4,0xa8,0x41] +// W64-ERR: :[[@LINE-2]]:21: error: invalid operand for instruction -v_cndmask_b16 v5, -|src_scc|, null, ttmp15 -// W32: v_cndmask_b16 v5, -|src_scc|, null, ttmp15 ; encoding: [0x05,0x01,0x5d,0xd6,0xfd,0xf8,0xec,0x21] +v_cndmask_b16 v5.l, 0.5, -1, vcc_hi +// W32: v_cndmask_b16 v5.l, 0.5, -1, vcc_hi ; encoding: [0x05,0x00,0x5d,0xd6,0xf0,0x82,0xad,0x01] // W64-ERR: :[[@LINE-2]]:21: error: invalid operand for instruction -v_cndmask_b16 v5, v1, src_scc, s[6:7] -// W64: v_cndmask_b16 v5, v1, src_scc, s[6:7] ; encoding: [0x05,0x00,0x5d,0xd6,0x01,0xfb,0x19,0x00] -// W32-ERR: :[[@LINE-2]]:32: error: invalid operand for instruction +v_cndmask_b16 v5.l, -|src_scc|, null, ttmp15 +// W32: v_cndmask_b16 v5.l, -|src_scc|, null, ttmp15 ; encoding: [0x05,0x01,0x5d,0xd6,0xfd,0xf8,0xec,0x21] +// 
W64-ERR: :[[@LINE-2]]:23: error: invalid operand for instruction -v_cndmask_b16 v5, v255, 0.5, s[6:7] -// W64: v_cndmask_b16 v5, v255, 0.5, s[6:7] ; encoding: [0x05,0x00,0x5d,0xd6,0xff,0xe1,0x19,0x00] -// W32-ERR: :[[@LINE-2]]:30: error: invalid operand for instruction +v_cndmask_b16 v5.l, v1.l, src_scc, s[6:7] +// W64: v_cndmask_b16 v5.l, v1.l, src_scc, s[6:7] ; encoding: [0x05,0x00,0x5d,0xd6,0x01,0xfb,0x19,0x00] +// W32-ERR: :[[@LINE-2]]:36: error: invalid operand for instruction -v_cndmask_b16 v5, s105, s105, s[6:7] -// W64: v_cndmask_b16 v5, s105, s105, s[6:7] ; encoding: [0x05,0x00,0x5d,0xd6,0x69,0xd2,0x18,0x00] -// W32-ERR: :[[@LINE-2]]:19: error: invalid operand for instruction +v_cndmask_b16 v5.l, v255.l, 0.5, s[6:7] +// W64: v_cndmask_b16 v5.l, v255.l, 0.5, s[6:7] ; encoding: [0x05,0x00,0x5d,0xd6,0xff,0xe1,0x19,0x00] +// W32-ERR: :[[@LINE-2]]:34: error: invalid operand for instruction -v_cndmask_b16 v5, vcc_hi, v2, s[6:7] -// W64: v_cndmask_b16 v5, vcc_hi, v2, s[6:7] ; encoding: [0x05,0x00,0x5d,0xd6,0x6b,0x04,0x1a,0x00] -// W32-ERR: :[[@LINE-2]]:19: error: invalid operand for instruction +v_cndmask_b16 v5.l, s105, s105, s[6:7] +// W64: v_cndmask_b16 v5.l, s105, s105, s[6:7] ; encoding: [0x05,0x00,0x5d,0xd6,0x69,0xd2,0x18,0x00] +// W32-ERR: :[[@LINE-2]]:21: error: invalid operand for instruction -v_cndmask_b16 v5, ttmp15, ttmp15, s[6:7] -// W64: v_cndmask_b16 v5, ttmp15, ttmp15, s[6:7] ; encoding: [0x05,0x00,0x5d,0xd6,0x7b,0xf6,0x18,0x00] -// W32-ERR: :[[@LINE-2]]:19: error: invalid operand for instruction +v_cndmask_b16 v5.l, vcc_hi, v2.l, s[6:7] +// W64: v_cndmask_b16 v5.l, vcc_hi, v2.l, s[6:7] ; encoding: [0x05,0x00,0x5d,0xd6,0x6b,0x04,0x1a,0x00] +// W32-ERR: :[[@LINE-2]]:21: error: invalid operand for instruction -v_cndmask_b16 v5, m0, v255, s[6:7] -// W64: v_cndmask_b16 v5, m0, v255, s[6:7] ; encoding: [0x05,0x00,0x5d,0xd6,0x7d,0xfe,0x1b,0x00] -// W32-ERR: :[[@LINE-2]]:19: error: invalid operand for instruction +v_cndmask_b16 v5.l, ttmp15, ttmp15, 
s[6:7] +// W64: v_cndmask_b16 v5.l, ttmp15, ttmp15, s[6:7] ; encoding: [0x05,0x00,0x5d,0xd6,0x7b,0xf6,0x18,0x00] +// W32-ERR: :[[@LINE-2]]:21: error: invalid operand for instruction -v_cndmask_b16 v5, exec_lo, exec_lo, s[6:7] -// W64: v_cndmask_b16 v5, exec_lo, exec_lo, s[6:7] ; encoding: [0x05,0x00,0x5d,0xd6,0x7e,0xfc,0x18,0x00] -// W32-ERR: :[[@LINE-2]]:19: error: invalid operand for instruction +v_cndmask_b16 v5.l, m0, v255.l, s[6:7] +// W64: v_cndmask_b16 v5.l, m0, v255.l, s[6:7] ; encoding: [0x05,0x00,0x5d,0xd6,0x7d,0xfe,0x1b,0x00] +// W32-ERR: :[[@LINE-2]]:21: error: invalid operand for instruction -v_cndmask_b16 v5, exec_hi, exec_hi, s[6:7] -// W64: v_cndmask_b16 v5, exec_hi, exec_hi, s[6:7] ; encoding: [0x05,0x00,0x5d,0xd6,0x7f,0xfe,0x18,0x00] -// W32-ERR: :[[@LINE-2]]:19: error: invalid operand for instruction +v_cndmask_b16 v5.l, exec_lo, exec_lo, s[6:7] +// W64: v_cndmask_b16 v5.l, exec_lo, exec_lo, s[6:7] ; encoding: [0x05,0x00,0x5d,0xd6,0x7e,0xfc,0x18,0x00] +// W32-ERR: :[[@LINE-2]]:21: error: invalid operand for instruction -v_cndmask_b16 v5, null, m0, s[6:7] -// W64: v_cndmask_b16 v5, null, m0, s[6:7] ; encoding: [0x05,0x00,0x5d,0xd6,0x7c,0xfa,0x18,0x00] -// W32-ERR: :[[@LINE-2]]:19: error: invalid operand for instruction +v_cndmask_b16 v5.l, exec_hi, exec_hi, s[6:7] +// W64: v_cndmask_b16 v5.l, exec_hi, exec_hi, s[6:7] ; encoding: [0x05,0x00,0x5d,0xd6,0x7f,0xfe,0x18,0x00] +// W32-ERR: :[[@LINE-2]]:21: error: invalid operand for instruction -v_cndmask_b16 v5, -1, -|vcc_lo|, s[104:105] -// W64: v_cndmask_b16 v5, -1, -|vcc_lo|, s[104:105] ; encoding: [0x05,0x02,0x5d,0xd6,0xc1,0xd4,0xa0,0x41] -// W32-ERR: :[[@LINE-2]]:19: error: invalid operand for instruction +v_cndmask_b16 v5.l, null, m0, s[6:7] +// W64: v_cndmask_b16 v5.l, null, m0, s[6:7] ; encoding: [0x05,0x00,0x5d,0xd6,0x7c,0xfa,0x18,0x00] +// W32-ERR: :[[@LINE-2]]:21: error: invalid operand for instruction + +v_cndmask_b16 v5.l, -1, -|vcc_lo|, s[104:105] +// W64: v_cndmask_b16 v5.l, -1, 
-|vcc_lo|, s[104:105] ; encoding: [0x05,0x02,0x5d,0xd6,0xc1,0xd4,0xa0,0x41] +// W32-ERR: :[[@LINE-2]]:21: error: invalid operand for instruction + +v_cndmask_b16 v5.l, 0.5, -1, vcc +// W64: v_cndmask_b16 v5.l, 0.5, -1, vcc ; encoding: [0x05,0x00,0x5d,0xd6,0xf0,0x82,0xa9,0x01] +// W32-ERR: :[[@LINE-2]]:21: error: invalid operand for instruction + +v_cndmask_b16 v5.l, -|src_scc|, null, ttmp[14:15] +// W64: v_cndmask_b16 v5.l, -|src_scc|, null, ttmp[14:15] ; encoding: [0x05,0x01,0x5d,0xd6,0xfd,0xf8,0xe8,0x21] +// W32-ERR: :[[@LINE-2]]:23: error: invalid operand for instruction + +v_cndmask_b16 v255.l, -|0xfe0b|, -|vcc_hi|, null +// GFX11: v_cndmask_b16 v255.l, -|0xfe0b|, -|vcc_hi|, null ; encoding: [0xff,0x03,0x5d,0xd6,0xff,0xd6,0xf0,0x61,0x0b,0xfe,0x00,0x00] + +v_cndmask_b16 v5.l, 0x3800, -1, vcc_lo +// W32: v_cndmask_b16 v5.l, 0x3800, -1, vcc_lo ; encoding: [0x05,0x00,0x5d,0xd6,0xff,0x82,0xa9,0x01,0x00,0x38,0x00,0x00] +// W64-ERR: :[[@LINE-2]]:21: error: invalid operand for instruction + +v_cndmask_b16 v5.l, 0x3800, -1, vcc +// W64: v_cndmask_b16 v5.l, 0x3800, -1, vcc ; encoding: [0x05,0x00,0x5d,0xd6,0xff,0x82,0xa9,0x01,0x00,0x38,0x00,0x00] +// W32-ERR: :[[@LINE-2]]:21: error: invalid operand for instruction + +v_cndmask_b16 v5.l, v255.h, 0.5, s3 +// W32: v_cndmask_b16 v5.l, v255.h, 0.5, s3 ; encoding: [0x05,0x08,0x5d,0xd6,0xff,0xe1,0x0d,0x00] +// W64-ERR: :[[@LINE-2]]:34: error: invalid operand for instruction + +v_cndmask_b16 v5.l, m0, v255.h, s3 +// W32: v_cndmask_b16 v5.l, m0, v255.h, s3 ; encoding: [0x05,0x10,0x5d,0xd6,0x7d,0xfe,0x0f,0x00] +// W64-ERR: :[[@LINE-2]]:21: error: invalid operand for instruction -v_cndmask_b16 v5, 0.5, -1, vcc -// W64: v_cndmask_b16 v5, 0.5, -1, vcc ; encoding: [0x05,0x00,0x5d,0xd6,0xf0,0x82,0xa9,0x01] -// W32-ERR: :[[@LINE-2]]:19: error: invalid operand for instruction +v_cndmask_b16 v5.l, v255.h, 0.5, s[6:7] +// W64: v_cndmask_b16 v5.l, v255.h, 0.5, s[6:7] ; encoding: [0x05,0x08,0x5d,0xd6,0xff,0xe1,0x19,0x00] +// W32-ERR: 
:[[@LINE-2]]:34: error: invalid operand for instruction -v_cndmask_b16 v5, -|src_scc|, null, ttmp[14:15] -// W64: v_cndmask_b16 v5, -|src_scc|, null, ttmp[14:15] ; encoding: [0x05,0x01,0x5d,0xd6,0xfd,0xf8,0xe8,0x21] +v_cndmask_b16 v5.l, m0, v255.h, s[6:7] +// W64: v_cndmask_b16 v5.l, m0, v255.h, s[6:7] ; encoding: [0x05,0x10,0x5d,0xd6,0x7d,0xfe,0x1b,0x00] // W32-ERR: :[[@LINE-2]]:21: error: invalid operand for instruction -v_cndmask_b16 v255, -|0xfe0b|, -|vcc_hi|, null -// GFX11: v_cndmask_b16 v255, -|0xfe0b|, -|vcc_hi|, null ; encoding: [0xff,0x03,0x5d,0xd6,0xff,0xd6,0xf0,0x61,0x0b,0xfe,0x00,0x00] +v_cndmask_b16 v255.h, -|0xfe0b|, -|vcc_hi|, null +// GFX11: v_cndmask_b16 v255.h, -|0xfe0b|, -|vcc_hi|, null ; encoding: [0xff,0x43,0x5d,0xd6,0xff,0xd6,0xf0,0x61,0x0b,0xfe,0x00,0x00] v_cubeid_f32 v5, v1, v2, s3 // GFX11: v_cubeid_f32 v5, v1, v2, s3 ; encoding: [0x05,0x00,0x0c,0xd6,0x01,0x05,0x0e,0x00] diff --git a/llvm/test/MC/AMDGPU/gfx11_asm_vop3_dpp16.s b/llvm/test/MC/AMDGPU/gfx11_asm_vop3_dpp16.s index 5fa1334aa6e95..2bff644605ff6 100644 --- a/llvm/test/MC/AMDGPU/gfx11_asm_vop3_dpp16.s +++ b/llvm/test/MC/AMDGPU/gfx11_asm_vop3_dpp16.s @@ -765,112 +765,139 @@ v_bfm_b32_e64_dpp v5, v1, v2 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 v_bfm_b32_e64_dpp v255, v255, v255 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:0 fi:1 // GFX11: v_bfm_b32_e64_dpp v255, v255, v255 row_xmask:15 row_mask:0x3 bank_mask:0x0 fi:1 ; encoding: [0xff,0x00,0x1d,0xd7,0xfa,0xfe,0x03,0x00,0xff,0x6f,0x05,0x30] -v_cndmask_b16_e64_dpp v5, v1, v2, s3 quad_perm:[3,2,1,0] -// W32: v_cndmask_b16_e64_dpp v5, v1, v2, s3 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x5d,0xd6,0xfa,0x04,0x0e,0x00,0x01,0x1b,0x00,0xff] -// W64-ERR: :[[@LINE-2]]:35: error: invalid operand for instruction +v_cndmask_b16_e64_dpp v5.l, v1.l, v2.l, s3 quad_perm:[3,2,1,0] +// W32: v_cndmask_b16_e64_dpp v5.l, v1.l, v2.l, s3 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: 
[0x05,0x00,0x5d,0xd6,0xfa,0x04,0x0e,0x00,0x01,0x1b,0x00,0xff] +// W64-ERR: :[[@LINE-2]]:41: error: invalid operand for instruction -v_cndmask_b16_e64_dpp v5, v1, v2, s3 quad_perm:[0,1,2,3] -// W32: v_cndmask_b16_e64_dpp v5, v1, v2, s3 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x5d,0xd6,0xfa,0x04,0x0e,0x00,0x01,0xe4,0x00,0xff] -// W64-ERR: :[[@LINE-2]]:35: error: invalid operand for instruction +v_cndmask_b16_e64_dpp v5.l, v1.l, v2.l, s3 quad_perm:[0,1,2,3] +// W32: v_cndmask_b16_e64_dpp v5.l, v1.l, v2.l, s3 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x5d,0xd6,0xfa,0x04,0x0e,0x00,0x01,0xe4,0x00,0xff] +// W64-ERR: :[[@LINE-2]]:41: error: invalid operand for instruction -v_cndmask_b16_e64_dpp v5, v1, v2, s3 row_mirror -// W32: v_cndmask_b16_e64_dpp v5, v1, v2, s3 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x5d,0xd6,0xfa,0x04,0x0e,0x00,0x01,0x40,0x01,0xff] -// W64-ERR: :[[@LINE-2]]:35: error: invalid operand for instruction +v_cndmask_b16_e64_dpp v5.l, v1.l, v2.l, s3 row_mirror +// W32: v_cndmask_b16_e64_dpp v5.l, v1.l, v2.l, s3 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x5d,0xd6,0xfa,0x04,0x0e,0x00,0x01,0x40,0x01,0xff] +// W64-ERR: :[[@LINE-2]]:41: error: invalid operand for instruction -v_cndmask_b16_e64_dpp v5, v1, v2, s3 row_half_mirror -// W32: v_cndmask_b16_e64_dpp v5, v1, v2, s3 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x5d,0xd6,0xfa,0x04,0x0e,0x00,0x01,0x41,0x01,0xff] -// W64-ERR: :[[@LINE-2]]:35: error: invalid operand for instruction +v_cndmask_b16_e64_dpp v5.l, v1.l, v2.l, s3 row_half_mirror +// W32: v_cndmask_b16_e64_dpp v5.l, v1.l, v2.l, s3 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x5d,0xd6,0xfa,0x04,0x0e,0x00,0x01,0x41,0x01,0xff] +// W64-ERR: :[[@LINE-2]]:41: error: invalid operand for instruction -v_cndmask_b16_e64_dpp v5, v1, v2, s3 row_shl:1 -// W32: v_cndmask_b16_e64_dpp v5, v1, v2, s3 row_shl:1 row_mask:0xf 
bank_mask:0xf ; encoding: [0x05,0x00,0x5d,0xd6,0xfa,0x04,0x0e,0x00,0x01,0x01,0x01,0xff] -// W64-ERR: :[[@LINE-2]]:35: error: invalid operand for instruction +v_cndmask_b16_e64_dpp v5.l, v1.l, v2.l, s3 row_shl:1 +// W32: v_cndmask_b16_e64_dpp v5.l, v1.l, v2.l, s3 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x5d,0xd6,0xfa,0x04,0x0e,0x00,0x01,0x01,0x01,0xff] +// W64-ERR: :[[@LINE-2]]:41: error: invalid operand for instruction -v_cndmask_b16_e64_dpp v5, v1, v2, s3 row_shl:15 -// W32: v_cndmask_b16_e64_dpp v5, v1, v2, s3 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x5d,0xd6,0xfa,0x04,0x0e,0x00,0x01,0x0f,0x01,0xff] -// W64-ERR: :[[@LINE-2]]:35: error: invalid operand for instruction +v_cndmask_b16_e64_dpp v5.l, v1.l, v2.l, s3 row_shl:15 +// W32: v_cndmask_b16_e64_dpp v5.l, v1.l, v2.l, s3 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x5d,0xd6,0xfa,0x04,0x0e,0x00,0x01,0x0f,0x01,0xff] +// W64-ERR: :[[@LINE-2]]:41: error: invalid operand for instruction -v_cndmask_b16_e64_dpp v5, v1, v2, s3 row_shr:1 -// W32: v_cndmask_b16_e64_dpp v5, v1, v2, s3 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x5d,0xd6,0xfa,0x04,0x0e,0x00,0x01,0x11,0x01,0xff] -// W64-ERR: :[[@LINE-2]]:35: error: invalid operand for instruction +v_cndmask_b16_e64_dpp v5.l, v1.l, v2.l, s3 row_shr:1 +// W32: v_cndmask_b16_e64_dpp v5.l, v1.l, v2.l, s3 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x5d,0xd6,0xfa,0x04,0x0e,0x00,0x01,0x11,0x01,0xff] +// W64-ERR: :[[@LINE-2]]:41: error: invalid operand for instruction -v_cndmask_b16_e64_dpp v5, v1, v2, s3 row_shr:15 -// W32: v_cndmask_b16_e64_dpp v5, v1, v2, s3 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x5d,0xd6,0xfa,0x04,0x0e,0x00,0x01,0x1f,0x01,0xff] -// W64-ERR: :[[@LINE-2]]:35: error: invalid operand for instruction +v_cndmask_b16_e64_dpp v5.l, v1.l, v2.l, s3 row_shr:15 +// W32: v_cndmask_b16_e64_dpp v5.l, v1.l, v2.l, s3 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: 
[0x05,0x00,0x5d,0xd6,0xfa,0x04,0x0e,0x00,0x01,0x1f,0x01,0xff] +// W64-ERR: :[[@LINE-2]]:41: error: invalid operand for instruction -v_cndmask_b16_e64_dpp v5, v1, v2, s3 row_ror:1 -// W32: v_cndmask_b16_e64_dpp v5, v1, v2, s3 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x5d,0xd6,0xfa,0x04,0x0e,0x00,0x01,0x21,0x01,0xff] -// W64-ERR: :[[@LINE-2]]:35: error: invalid operand for instruction +v_cndmask_b16_e64_dpp v5.l, v1.l, v2.l, s3 row_ror:1 +// W32: v_cndmask_b16_e64_dpp v5.l, v1.l, v2.l, s3 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x5d,0xd6,0xfa,0x04,0x0e,0x00,0x01,0x21,0x01,0xff] +// W64-ERR: :[[@LINE-2]]:41: error: invalid operand for instruction -v_cndmask_b16_e64_dpp v5, v1, v2, s105 row_ror:15 -// W32: v_cndmask_b16_e64_dpp v5, v1, v2, s105 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x5d,0xd6,0xfa,0x04,0xa6,0x01,0x01,0x2f,0x01,0xff] -// W64-ERR: :[[@LINE-2]]:35: error: invalid operand for instruction +v_cndmask_b16_e64_dpp v5.l, v1.l, v2.l, s105 row_ror:15 +// W32: v_cndmask_b16_e64_dpp v5.l, v1.l, v2.l, s105 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x5d,0xd6,0xfa,0x04,0xa6,0x01,0x01,0x2f,0x01,0xff] +// W64-ERR: :[[@LINE-2]]:41: error: invalid operand for instruction -v_cndmask_b16_e64_dpp v5, v1, v2, vcc_hi row_share:0 row_mask:0xf bank_mask:0xf -// W32: v_cndmask_b16_e64_dpp v5, v1, v2, vcc_hi row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x5d,0xd6,0xfa,0x04,0xae,0x01,0x01,0x50,0x01,0xff] -// W64-ERR: :[[@LINE-2]]:35: error: invalid operand for instruction +v_cndmask_b16_e64_dpp v5.l, v1.l, v2.l, vcc_hi row_share:0 row_mask:0xf bank_mask:0xf +// W32: v_cndmask_b16_e64_dpp v5.l, v1.l, v2.l, vcc_hi row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x5d,0xd6,0xfa,0x04,0xae,0x01,0x01,0x50,0x01,0xff] +// W64-ERR: :[[@LINE-2]]:41: error: invalid operand for instruction -v_cndmask_b16_e64_dpp v5, |v1|, -v2, vcc_lo row_share:15 row_mask:0x0 bank_mask:0x1 -// W32: 
v_cndmask_b16_e64_dpp v5, |v1|, -v2, vcc_lo row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x05,0x01,0x5d,0xd6,0xfa,0x04,0xaa,0x41,0x01,0x5f,0x01,0x01] -// W64-ERR: :[[@LINE-2]]:38: error: invalid operand for instruction +v_cndmask_b16_e64_dpp v5.l, |v1.l|, -v2.l, vcc_lo row_share:15 row_mask:0x0 bank_mask:0x1 +// W32: v_cndmask_b16_e64_dpp v5.l, |v1.l|, -v2.l, vcc_lo row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x05,0x01,0x5d,0xd6,0xfa,0x04,0xaa,0x41,0x01,0x5f,0x01,0x01] +// W64-ERR: :[[@LINE-2]]:44: error: invalid operand for instruction -v_cndmask_b16_e64_dpp v5, -v1, |v2|, ttmp15 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 fi:0 -// W32: v_cndmask_b16_e64_dpp v5, -v1, |v2|, ttmp15 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 ; encoding: [0x05,0x02,0x5d,0xd6,0xfa,0x04,0xee,0x21,0x01,0x60,0x09,0x13] -// W64-ERR: :[[@LINE-2]]:38: error: invalid operand for instruction +v_cndmask_b16_e64_dpp v5.l, -v1.l, |v2.l|, ttmp15 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 fi:0 +// W32: v_cndmask_b16_e64_dpp v5.l, -v1.l, |v2.l|, ttmp15 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 ; encoding: [0x05,0x02,0x5d,0xd6,0xfa,0x04,0xee,0x21,0x01,0x60,0x09,0x13] +// W64-ERR: :[[@LINE-2]]:44: error: invalid operand for instruction -v_cndmask_b16_e64_dpp v5, v1, v2, s[6:7] quad_perm:[3,2,1,0] -// W64: v_cndmask_b16_e64_dpp v5, v1, v2, s[6:7] quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x5d,0xd6,0xfa,0x04,0x1a,0x00,0x01,0x1b,0x00,0xff] -// W32-ERR: :[[@LINE-2]]:35: error: invalid operand for instruction +v_cndmask_b16_e64_dpp v5.l, v1.l, v2.l, s[6:7] quad_perm:[3,2,1,0] +// W64: v_cndmask_b16_e64_dpp v5.l, v1.l, v2.l, s[6:7] quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x5d,0xd6,0xfa,0x04,0x1a,0x00,0x01,0x1b,0x00,0xff] +// W32-ERR: :[[@LINE-2]]:41: error: invalid operand for instruction -v_cndmask_b16_e64_dpp v5, v1, v2, s[6:7] quad_perm:[0,1,2,3] -// W64: v_cndmask_b16_e64_dpp v5, v1, v2, s[6:7] 
quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x5d,0xd6,0xfa,0x04,0x1a,0x00,0x01,0xe4,0x00,0xff] -// W32-ERR: :[[@LINE-2]]:35: error: invalid operand for instruction +v_cndmask_b16_e64_dpp v5.l, v1.l, v2.l, s[6:7] quad_perm:[0,1,2,3] +// W64: v_cndmask_b16_e64_dpp v5.l, v1.l, v2.l, s[6:7] quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x5d,0xd6,0xfa,0x04,0x1a,0x00,0x01,0xe4,0x00,0xff] +// W32-ERR: :[[@LINE-2]]:41: error: invalid operand for instruction -v_cndmask_b16_e64_dpp v5, v1, v2, s[6:7] row_mirror -// W64: v_cndmask_b16_e64_dpp v5, v1, v2, s[6:7] row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x5d,0xd6,0xfa,0x04,0x1a,0x00,0x01,0x40,0x01,0xff] -// W32-ERR: :[[@LINE-2]]:35: error: invalid operand for instruction +v_cndmask_b16_e64_dpp v5.l, v1.l, v2.l, s[6:7] row_mirror +// W64: v_cndmask_b16_e64_dpp v5.l, v1.l, v2.l, s[6:7] row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x5d,0xd6,0xfa,0x04,0x1a,0x00,0x01,0x40,0x01,0xff] +// W32-ERR: :[[@LINE-2]]:41: error: invalid operand for instruction -v_cndmask_b16_e64_dpp v5, v1, v2, s[6:7] row_half_mirror -// W64: v_cndmask_b16_e64_dpp v5, v1, v2, s[6:7] row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x5d,0xd6,0xfa,0x04,0x1a,0x00,0x01,0x41,0x01,0xff] -// W32-ERR: :[[@LINE-2]]:35: error: invalid operand for instruction +v_cndmask_b16_e64_dpp v5.l, v1.l, v2.l, s[6:7] row_half_mirror +// W64: v_cndmask_b16_e64_dpp v5.l, v1.l, v2.l, s[6:7] row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x5d,0xd6,0xfa,0x04,0x1a,0x00,0x01,0x41,0x01,0xff] +// W32-ERR: :[[@LINE-2]]:41: error: invalid operand for instruction -v_cndmask_b16_e64_dpp v5, v1, v2, s[6:7] row_shl:1 -// W64: v_cndmask_b16_e64_dpp v5, v1, v2, s[6:7] row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x5d,0xd6,0xfa,0x04,0x1a,0x00,0x01,0x01,0x01,0xff] -// W32-ERR: :[[@LINE-2]]:35: error: invalid operand for instruction +v_cndmask_b16_e64_dpp v5.l, 
v1.l, v2.l, s[6:7] row_shl:1 +// W64: v_cndmask_b16_e64_dpp v5.l, v1.l, v2.l, s[6:7] row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x5d,0xd6,0xfa,0x04,0x1a,0x00,0x01,0x01,0x01,0xff] +// W32-ERR: :[[@LINE-2]]:41: error: invalid operand for instruction -v_cndmask_b16_e64_dpp v5, v1, v2, s[6:7] row_shl:15 -// W64: v_cndmask_b16_e64_dpp v5, v1, v2, s[6:7] row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x5d,0xd6,0xfa,0x04,0x1a,0x00,0x01,0x0f,0x01,0xff] -// W32-ERR: :[[@LINE-2]]:35: error: invalid operand for instruction +v_cndmask_b16_e64_dpp v5.l, v1.l, v2.l, s[6:7] row_shl:15 +// W64: v_cndmask_b16_e64_dpp v5.l, v1.l, v2.l, s[6:7] row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x5d,0xd6,0xfa,0x04,0x1a,0x00,0x01,0x0f,0x01,0xff] +// W32-ERR: :[[@LINE-2]]:41: error: invalid operand for instruction -v_cndmask_b16_e64_dpp v5, v1, v2, s[6:7] row_shr:1 -// W64: v_cndmask_b16_e64_dpp v5, v1, v2, s[6:7] row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x5d,0xd6,0xfa,0x04,0x1a,0x00,0x01,0x11,0x01,0xff] -// W32-ERR: :[[@LINE-2]]:35: error: invalid operand for instruction +v_cndmask_b16_e64_dpp v5.l, v1.l, v2.l, s[6:7] row_shr:1 +// W64: v_cndmask_b16_e64_dpp v5.l, v1.l, v2.l, s[6:7] row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x5d,0xd6,0xfa,0x04,0x1a,0x00,0x01,0x11,0x01,0xff] +// W32-ERR: :[[@LINE-2]]:41: error: invalid operand for instruction -v_cndmask_b16_e64_dpp v5, v1, v2, s[6:7] row_shr:15 -// W64: v_cndmask_b16_e64_dpp v5, v1, v2, s[6:7] row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x5d,0xd6,0xfa,0x04,0x1a,0x00,0x01,0x1f,0x01,0xff] -// W32-ERR: :[[@LINE-2]]:35: error: invalid operand for instruction +v_cndmask_b16_e64_dpp v5.l, v1.l, v2.l, s[6:7] row_shr:15 +// W64: v_cndmask_b16_e64_dpp v5.l, v1.l, v2.l, s[6:7] row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x5d,0xd6,0xfa,0x04,0x1a,0x00,0x01,0x1f,0x01,0xff] +// W32-ERR: :[[@LINE-2]]:41: error: invalid operand for 
instruction -v_cndmask_b16_e64_dpp v5, v1, v2, s[6:7] row_ror:1 -// W64: v_cndmask_b16_e64_dpp v5, v1, v2, s[6:7] row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x5d,0xd6,0xfa,0x04,0x1a,0x00,0x01,0x21,0x01,0xff] -// W32-ERR: :[[@LINE-2]]:35: error: invalid operand for instruction +v_cndmask_b16_e64_dpp v5.l, v1.l, v2.l, s[6:7] row_ror:1 +// W64: v_cndmask_b16_e64_dpp v5.l, v1.l, v2.l, s[6:7] row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x5d,0xd6,0xfa,0x04,0x1a,0x00,0x01,0x21,0x01,0xff] +// W32-ERR: :[[@LINE-2]]:41: error: invalid operand for instruction -v_cndmask_b16_e64_dpp v5, v1, v2, s[6:7] row_ror:15 -// W64: v_cndmask_b16_e64_dpp v5, v1, v2, s[6:7] row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x5d,0xd6,0xfa,0x04,0x1a,0x00,0x01,0x2f,0x01,0xff] -// W32-ERR: :[[@LINE-2]]:35: error: invalid operand for instruction +v_cndmask_b16_e64_dpp v5.l, v1.l, v2.l, s[6:7] row_ror:15 +// W64: v_cndmask_b16_e64_dpp v5.l, v1.l, v2.l, s[6:7] row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x5d,0xd6,0xfa,0x04,0x1a,0x00,0x01,0x2f,0x01,0xff] +// W32-ERR: :[[@LINE-2]]:41: error: invalid operand for instruction -v_cndmask_b16_e64_dpp v5, v1, v2, s[104:105] row_share:0 row_mask:0xf bank_mask:0xf -// W64: v_cndmask_b16_e64_dpp v5, v1, v2, s[104:105] row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x5d,0xd6,0xfa,0x04,0xa2,0x01,0x01,0x50,0x01,0xff] -// W32-ERR: :[[@LINE-2]]:35: error: invalid operand for instruction +v_cndmask_b16_e64_dpp v5.l, v1.l, v2.l, s[104:105] row_share:0 row_mask:0xf bank_mask:0xf +// W64: v_cndmask_b16_e64_dpp v5.l, v1.l, v2.l, s[104:105] row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x5d,0xd6,0xfa,0x04,0xa2,0x01,0x01,0x50,0x01,0xff] +// W32-ERR: :[[@LINE-2]]:41: error: invalid operand for instruction -v_cndmask_b16_e64_dpp v5, |v1|, -v2, vcc row_share:15 row_mask:0x0 bank_mask:0x1 -// W64: v_cndmask_b16_e64_dpp v5, |v1|, -v2, vcc row_share:15 row_mask:0x0 bank_mask:0x1 ; 
encoding: [0x05,0x01,0x5d,0xd6,0xfa,0x04,0xaa,0x41,0x01,0x5f,0x01,0x01] -// W32-ERR: :[[@LINE-2]]:38: error: invalid operand for instruction +v_cndmask_b16_e64_dpp v5.l, |v1.l|, -v2.l, vcc row_share:15 row_mask:0x0 bank_mask:0x1 +// W64: v_cndmask_b16_e64_dpp v5.l, |v1.l|, -v2.l, vcc row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x05,0x01,0x5d,0xd6,0xfa,0x04,0xaa,0x41,0x01,0x5f,0x01,0x01] +// W32-ERR: :[[@LINE-2]]:44: error: invalid operand for instruction -v_cndmask_b16_e64_dpp v5, -v1, |v2|, ttmp[14:15] row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 fi:0 -// W64: v_cndmask_b16_e64_dpp v5, -v1, |v2|, ttmp[14:15] row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 ; encoding: [0x05,0x02,0x5d,0xd6,0xfa,0x04,0xea,0x21,0x01,0x60,0x09,0x13] -// W32-ERR: :[[@LINE-2]]:38: error: invalid operand for instruction +v_cndmask_b16_e64_dpp v5.l, -v1.l, |v2.l|, ttmp[14:15] row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 fi:0 +// W64: v_cndmask_b16_e64_dpp v5.l, -v1.l, |v2.l|, ttmp[14:15] row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 ; encoding: [0x05,0x02,0x5d,0xd6,0xfa,0x04,0xea,0x21,0x01,0x60,0x09,0x13] +// W32-ERR: :[[@LINE-2]]:44: error: invalid operand for instruction -v_cndmask_b16_e64_dpp v255, -|v255|, -|v255|, null row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:0 fi:1 -// GFX11: v_cndmask_b16_e64_dpp v255, -|v255|, -|v255|, null row_xmask:15 row_mask:0x3 bank_mask:0x0 fi:1 ; encoding: [0xff,0x03,0x5d,0xd6,0xfa,0xfe,0xf3,0x61,0xff,0x6f,0x05,0x30] +v_cndmask_b16_e64_dpp v255.l, -|v255.l|, -|v255.l|, null row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:0 fi:1 +// GFX11: v_cndmask_b16_e64_dpp v255.l, -|v255.l|, -|v255.l|, null row_xmask:15 row_mask:0x3 bank_mask:0x0 fi:1 ; encoding: [0xff,0x03,0x5d,0xd6,0xfa,0xfe,0xf3,0x61,0xff,0x6f,0x05,0x30] + +v_cndmask_b16_e64_dpp v5.l, v1.l, v2.l, vcc_lo row_share:0 row_mask:0xf bank_mask:0xf +// W32: v_cndmask_b16_e64_dpp v5.l, v1.l, v2.l, vcc_lo row_share:0 row_mask:0xf bank_mask:0xf ; encoding: 
[0x05,0x00,0x5d,0xd6,0xfa,0x04,0xaa,0x01,0x01,0x50,0x01,0xff] +// W64-ERR: :[[@LINE-2]]:41: error: invalid operand for instruction + +v_cndmask_b16_e64_dpp v5.l, |v1.l|, -v2.l, vcc_hi row_share:15 row_mask:0x0 bank_mask:0x1 +// W32: v_cndmask_b16_e64_dpp v5.l, |v1.l|, -v2.l, vcc_hi row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x05,0x01,0x5d,0xd6,0xfa,0x04,0xae,0x41,0x01,0x5f,0x01,0x01] +// W64-ERR: :[[@LINE-2]]:44: error: invalid operand for instruction + +v_cndmask_b16_e64_dpp v5.l, |v1.h|, -v2.l, vcc_hi row_share:15 row_mask:0x0 bank_mask:0x1 +// W32: v_cndmask_b16_e64_dpp v5.l, |v1.h|, -v2.l, vcc_hi row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x05,0x09,0x5d,0xd6,0xfa,0x04,0xae,0x41,0x01,0x5f,0x01,0x01] +// W64-ERR: :[[@LINE-2]]:44: error: invalid operand for instruction + +v_cndmask_b16_e64_dpp v5.l, -v1.l, |v2.h|, ttmp15 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 fi:0 +// W32: v_cndmask_b16_e64_dpp v5.l, -v1.l, |v2.h|, ttmp15 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 ; encoding: [0x05,0x12,0x5d,0xd6,0xfa,0x04,0xee,0x21,0x01,0x60,0x09,0x13] +// W64-ERR: :[[@LINE-2]]:44: error: invalid operand for instruction + +v_cndmask_b16_e64_dpp v5.l, |v1.h|, -v2.l, vcc row_share:15 row_mask:0x0 bank_mask:0x1 +// W64: v_cndmask_b16_e64_dpp v5.l, |v1.h|, -v2.l, vcc row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x05,0x09,0x5d,0xd6,0xfa,0x04,0xaa,0x41,0x01,0x5f,0x01,0x01] +// W32-ERR: :[[@LINE-2]]:44: error: invalid operand for instruction + +v_cndmask_b16_e64_dpp v5.l, -v1.l, |v2.h|, ttmp[14:15] row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 fi:0 +// W64: v_cndmask_b16_e64_dpp v5.l, -v1.l, |v2.h|, ttmp[14:15] row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 ; encoding: [0x05,0x12,0x5d,0xd6,0xfa,0x04,0xea,0x21,0x01,0x60,0x09,0x13] +// W32-ERR: :[[@LINE-2]]:44: error: invalid operand for instruction + +v_cndmask_b16_e64_dpp v255.h, -|v255.l|, -|v255.l|, null row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:0 fi:1 +// GFX11: 
v_cndmask_b16_e64_dpp v255.h, -|v255.l|, -|v255.l|, null row_xmask:15 row_mask:0x3 bank_mask:0x0 fi:1 ; encoding: [0xff,0x43,0x5d,0xd6,0xfa,0xfe,0xf3,0x61,0xff,0x6f,0x05,0x30] v_cubeid_f32_e64_dpp v5, v1, v2, v3 quad_perm:[3,2,1,0] // GFX11: v_cubeid_f32_e64_dpp v5, v1, v2, v3 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x0c,0xd6,0xfa,0x04,0x0e,0x04,0x01,0x1b,0x00,0xff] diff --git a/llvm/test/MC/AMDGPU/gfx11_asm_vop3_dpp16_from_vopcx.s b/llvm/test/MC/AMDGPU/gfx11_asm_vop3_dpp16_from_vopcx.s index 462ad7ba6516d..60ec94446235e 100644 --- a/llvm/test/MC/AMDGPU/gfx11_asm_vop3_dpp16_from_vopcx.s +++ b/llvm/test/MC/AMDGPU/gfx11_asm_vop3_dpp16_from_vopcx.s @@ -1347,47 +1347,56 @@ v_cmpx_lg_f32_e64_dpp -v1, |v2| row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctr v_cmpx_lg_f32_e64_dpp -|v255|, -|v255| clamp row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:0 fi:1 // GFX11: v_cmpx_lg_f32_e64_dpp -|v255|, -|v255| clamp row_xmask:15 row_mask:0x3 bank_mask:0x0 fi:1 ; encoding: [0x7e,0x83,0x95,0xd4,0xfa,0xfe,0x03,0x60,0xff,0x6f,0x05,0x30] -v_cmpx_lt_f16_e64_dpp v1, v2 quad_perm:[3,2,1,0] -// GFX11: v_cmpx_lt_f16_e64_dpp v1, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x81,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff] +v_cmpx_lt_f16_e64_dpp v1.l, v2.l quad_perm:[3,2,1,0] +// GFX11: v_cmpx_lt_f16_e64_dpp v1.l, v2.l quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x81,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff] -v_cmpx_lt_f16_e64_dpp v1, v2 quad_perm:[0,1,2,3] -// GFX11: v_cmpx_lt_f16_e64_dpp v1, v2 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x81,0xd4,0xfa,0x04,0x02,0x00,0x01,0xe4,0x00,0xff] +v_cmpx_lt_f16_e64_dpp v1.l, v2.l quad_perm:[0,1,2,3] +// GFX11: v_cmpx_lt_f16_e64_dpp v1.l, v2.l quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x81,0xd4,0xfa,0x04,0x02,0x00,0x01,0xe4,0x00,0xff] -v_cmpx_lt_f16_e64_dpp v1, v2 row_mirror -// GFX11: v_cmpx_lt_f16_e64_dpp 
v1, v2 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x81,0xd4,0xfa,0x04,0x02,0x00,0x01,0x40,0x01,0xff] +v_cmpx_lt_f16_e64_dpp v1.l, v2.l row_mirror +// GFX11: v_cmpx_lt_f16_e64_dpp v1.l, v2.l row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x81,0xd4,0xfa,0x04,0x02,0x00,0x01,0x40,0x01,0xff] -v_cmpx_lt_f16_e64_dpp v1, v2 row_half_mirror -// GFX11: v_cmpx_lt_f16_e64_dpp v1, v2 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x81,0xd4,0xfa,0x04,0x02,0x00,0x01,0x41,0x01,0xff] +v_cmpx_lt_f16_e64_dpp v1.l, v2.l row_half_mirror +// GFX11: v_cmpx_lt_f16_e64_dpp v1.l, v2.l row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x81,0xd4,0xfa,0x04,0x02,0x00,0x01,0x41,0x01,0xff] -v_cmpx_lt_f16_e64_dpp v1, v2 row_shl:1 -// GFX11: v_cmpx_lt_f16_e64_dpp v1, v2 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x81,0xd4,0xfa,0x04,0x02,0x00,0x01,0x01,0x01,0xff] +v_cmpx_lt_f16_e64_dpp v1.l, v2.l row_shl:1 +// GFX11: v_cmpx_lt_f16_e64_dpp v1.l, v2.l row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x81,0xd4,0xfa,0x04,0x02,0x00,0x01,0x01,0x01,0xff] -v_cmpx_lt_f16_e64_dpp v1, v2 row_shl:15 -// GFX11: v_cmpx_lt_f16_e64_dpp v1, v2 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x81,0xd4,0xfa,0x04,0x02,0x00,0x01,0x0f,0x01,0xff] +v_cmpx_lt_f16_e64_dpp v1.l, v2.l row_shl:15 +// GFX11: v_cmpx_lt_f16_e64_dpp v1.l, v2.l row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x81,0xd4,0xfa,0x04,0x02,0x00,0x01,0x0f,0x01,0xff] -v_cmpx_lt_f16_e64_dpp v1, v2 row_shr:1 -// GFX11: v_cmpx_lt_f16_e64_dpp v1, v2 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x81,0xd4,0xfa,0x04,0x02,0x00,0x01,0x11,0x01,0xff] +v_cmpx_lt_f16_e64_dpp v1.l, v2.l row_shr:1 +// GFX11: v_cmpx_lt_f16_e64_dpp v1.l, v2.l row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x81,0xd4,0xfa,0x04,0x02,0x00,0x01,0x11,0x01,0xff] -v_cmpx_lt_f16_e64_dpp v1, v2 row_shr:15 -// GFX11: v_cmpx_lt_f16_e64_dpp v1, v2 row_shr:15 
row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x81,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1f,0x01,0xff] +v_cmpx_lt_f16_e64_dpp v1.l, v2.l row_shr:15 +// GFX11: v_cmpx_lt_f16_e64_dpp v1.l, v2.l row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x81,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1f,0x01,0xff] -v_cmpx_lt_f16_e64_dpp v1, v2 row_ror:1 -// GFX11: v_cmpx_lt_f16_e64_dpp v1, v2 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x81,0xd4,0xfa,0x04,0x02,0x00,0x01,0x21,0x01,0xff] +v_cmpx_lt_f16_e64_dpp v1.l, v2.l row_ror:1 +// GFX11: v_cmpx_lt_f16_e64_dpp v1.l, v2.l row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x81,0xd4,0xfa,0x04,0x02,0x00,0x01,0x21,0x01,0xff] -v_cmpx_lt_f16_e64_dpp v1, v2 row_ror:15 -// GFX11: v_cmpx_lt_f16_e64_dpp v1, v2 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x81,0xd4,0xfa,0x04,0x02,0x00,0x01,0x2f,0x01,0xff] +v_cmpx_lt_f16_e64_dpp v1.l, v2.l row_ror:15 +// GFX11: v_cmpx_lt_f16_e64_dpp v1.l, v2.l row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x81,0xd4,0xfa,0x04,0x02,0x00,0x01,0x2f,0x01,0xff] -v_cmpx_lt_f16_e64_dpp v1, v2 row_share:0 row_mask:0xf bank_mask:0xf -// GFX11: v_cmpx_lt_f16_e64_dpp v1, v2 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x81,0xd4,0xfa,0x04,0x02,0x00,0x01,0x50,0x01,0xff] +v_cmpx_lt_f16_e64_dpp v1.l, v2.l row_share:0 row_mask:0xf bank_mask:0xf +// GFX11: v_cmpx_lt_f16_e64_dpp v1.l, v2.l row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x81,0xd4,0xfa,0x04,0x02,0x00,0x01,0x50,0x01,0xff] -v_cmpx_lt_f16_e64_dpp |v1|, -v2 row_share:15 row_mask:0x0 bank_mask:0x1 -// GFX11: v_cmpx_lt_f16_e64_dpp |v1|, -v2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x7e,0x01,0x81,0xd4,0xfa,0x04,0x02,0x40,0x01,0x5f,0x01,0x01] +v_cmpx_lt_f16_e64_dpp |v1.l|, -v2.l row_share:15 row_mask:0x0 bank_mask:0x1 +// GFX11: v_cmpx_lt_f16_e64_dpp |v1.l|, -v2.l row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: 
[0x7e,0x01,0x81,0xd4,0xfa,0x04,0x02,0x40,0x01,0x5f,0x01,0x01] -v_cmpx_lt_f16_e64_dpp -v1, |v2| row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 fi:0 -// GFX11: v_cmpx_lt_f16_e64_dpp -v1, |v2| row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 ; encoding: [0x7e,0x02,0x81,0xd4,0xfa,0x04,0x02,0x20,0x01,0x60,0x09,0x13] +v_cmpx_lt_f16_e64_dpp -v1.l, |v2.l| row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 fi:0 +// GFX11: v_cmpx_lt_f16_e64_dpp -v1.l, |v2.l| row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 ; encoding: [0x7e,0x02,0x81,0xd4,0xfa,0x04,0x02,0x20,0x01,0x60,0x09,0x13] -v_cmpx_lt_f16_e64_dpp -|v255|, -|v255| clamp row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:0 fi:1 -// GFX11: v_cmpx_lt_f16_e64_dpp -|v255|, -|v255| clamp row_xmask:15 row_mask:0x3 bank_mask:0x0 fi:1 ; encoding: [0x7e,0x83,0x81,0xd4,0xfa,0xfe,0x03,0x60,0xff,0x6f,0x05,0x30] +v_cmpx_lt_f16_e64_dpp -|v255.l|, -|v255.l| clamp row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:0 fi:1 +// GFX11: v_cmpx_lt_f16_e64_dpp -|v255.l|, -|v255.l| clamp row_xmask:15 row_mask:0x3 bank_mask:0x0 fi:1 ; encoding: [0x7e,0x83,0x81,0xd4,0xfa,0xfe,0x03,0x60,0xff,0x6f,0x05,0x30] + +v_cmpx_lt_f16_e64_dpp |v1.h|, -v2.h row_share:15 row_mask:0x0 bank_mask:0x1 +// GFX11: v_cmpx_lt_f16_e64_dpp |v1.h|, -v2.h op_sel:[1,1] row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x7e,0x19,0x81,0xd4,0xfa,0x04,0x02,0x40,0x01,0x5f,0x01,0x01] + +v_cmpx_lt_f16_e64_dpp -v1.h, |v2.l| row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 fi:0 +// GFX11: v_cmpx_lt_f16_e64_dpp -v1.h, |v2.l| op_sel:[1,0] row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 ; encoding: [0x7e,0x0a,0x81,0xd4,0xfa,0x04,0x02,0x20,0x01,0x60,0x09,0x13] + +v_cmpx_lt_f16_e64_dpp -|v255.l|, -|v255.h| clamp row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:0 fi:1 +// GFX11: v_cmpx_lt_f16_e64_dpp -|v255.l|, -|v255.h| op_sel:[0,1] clamp row_xmask:15 row_mask:0x3 bank_mask:0x0 fi:1 ; encoding: [0x7e,0x93,0x81,0xd4,0xfa,0xfe,0x03,0x60,0xff,0x6f,0x05,0x30] 
v_cmpx_lt_f32_e64_dpp v1, v2 quad_perm:[3,2,1,0] // GFX11: v_cmpx_lt_f32_e64_dpp v1, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x91,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff] diff --git a/llvm/test/MC/AMDGPU/gfx11_asm_vop3_dpp8.s b/llvm/test/MC/AMDGPU/gfx11_asm_vop3_dpp8.s index 2fc02061c59de..2f9b5efca9e17 100644 --- a/llvm/test/MC/AMDGPU/gfx11_asm_vop3_dpp8.s +++ b/llvm/test/MC/AMDGPU/gfx11_asm_vop3_dpp8.s @@ -424,44 +424,71 @@ v_bfm_b32_e64_dpp v5, v1, v2 dpp8:[7,6,5,4,3,2,1,0] fi:1 v_bfm_b32_e64_dpp v255, v255, v255 dpp8:[0,0,0,0,0,0,0,0] fi:0 // GFX11: v_bfm_b32_e64_dpp v255, v255, v255 dpp8:[0,0,0,0,0,0,0,0] ; encoding: [0xff,0x00,0x1d,0xd7,0xe9,0xfe,0x03,0x00,0xff,0x00,0x00,0x00] -v_cndmask_b16_e64_dpp v5, v1, v2, s3 dpp8:[7,6,5,4,3,2,1,0] -// W32: v_cndmask_b16_e64_dpp v5, v1, v2, s3 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x5d,0xd6,0xe9,0x04,0x0e,0x00,0x01,0x77,0x39,0x05] -// W64-ERR: :[[@LINE-2]]:35: error: invalid operand for instruction +v_cndmask_b16_e64_dpp v5.l, v1.l, v2.l, s3 dpp8:[7,6,5,4,3,2,1,0] +// W32: v_cndmask_b16_e64_dpp v5.l, v1.l, v2.l, s3 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x5d,0xd6,0xe9,0x04,0x0e,0x00,0x01,0x77,0x39,0x05] +// W64-ERR: :[[@LINE-2]]:41: error: invalid operand for instruction -v_cndmask_b16_e64_dpp v5, v1, v2, s105 dpp8:[7,6,5,4,3,2,1,0] -// W32: v_cndmask_b16_e64_dpp v5, v1, v2, s105 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x5d,0xd6,0xe9,0x04,0xa6,0x01,0x01,0x77,0x39,0x05] -// W64-ERR: :[[@LINE-2]]:35: error: invalid operand for instruction +v_cndmask_b16_e64_dpp v5.l, v1.l, v2.l, s105 dpp8:[7,6,5,4,3,2,1,0] +// W32: v_cndmask_b16_e64_dpp v5.l, v1.l, v2.l, s105 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x5d,0xd6,0xe9,0x04,0xa6,0x01,0x01,0x77,0x39,0x05] +// W64-ERR: :[[@LINE-2]]:41: error: invalid operand for instruction -v_cndmask_b16_e64_dpp v5, v1, v2, vcc_hi dpp8:[7,6,5,4,3,2,1,0] -// W32: v_cndmask_b16_e64_dpp v5, v1, v2, vcc_hi dpp8:[7,6,5,4,3,2,1,0] ; 
encoding: [0x05,0x00,0x5d,0xd6,0xe9,0x04,0xae,0x01,0x01,0x77,0x39,0x05] -// W64-ERR: :[[@LINE-2]]:35: error: invalid operand for instruction +v_cndmask_b16_e64_dpp v5.l, v1.l, v2.l, vcc_hi dpp8:[7,6,5,4,3,2,1,0] +// W32: v_cndmask_b16_e64_dpp v5.l, v1.l, v2.l, vcc_hi dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x5d,0xd6,0xe9,0x04,0xae,0x01,0x01,0x77,0x39,0x05] +// W64-ERR: :[[@LINE-2]]:41: error: invalid operand for instruction -v_cndmask_b16_e64_dpp v5, |v1|, -v2, vcc_lo dpp8:[7,6,5,4,3,2,1,0] -// W32: v_cndmask_b16_e64_dpp v5, |v1|, -v2, vcc_lo dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x01,0x5d,0xd6,0xe9,0x04,0xaa,0x41,0x01,0x77,0x39,0x05] -// W64-ERR: :[[@LINE-2]]:38: error: invalid operand for instruction +v_cndmask_b16_e64_dpp v5.l, |v1.l|, -v2.l, vcc_lo dpp8:[7,6,5,4,3,2,1,0] +// W32: v_cndmask_b16_e64_dpp v5.l, |v1.l|, -v2.l, vcc_lo dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x01,0x5d,0xd6,0xe9,0x04,0xaa,0x41,0x01,0x77,0x39,0x05] +// W64-ERR: :[[@LINE-2]]:44: error: invalid operand for instruction -v_cndmask_b16_e64_dpp v5, -v1, |v2|, ttmp15 dpp8:[7,6,5,4,3,2,1,0] fi:1 -// W32: v_cndmask_b16_e64_dpp v5, -v1, |v2|, ttmp15 dpp8:[7,6,5,4,3,2,1,0] fi:1 ; encoding: [0x05,0x02,0x5d,0xd6,0xea,0x04,0xee,0x21,0x01,0x77,0x39,0x05] -// W64-ERR: :[[@LINE-2]]:38: error: invalid operand for instruction +v_cndmask_b16_e64_dpp v5.l, -v1.l, |v2.l|, ttmp15 dpp8:[7,6,5,4,3,2,1,0] fi:1 +// W32: v_cndmask_b16_e64_dpp v5.l, -v1.l, |v2.l|, ttmp15 dpp8:[7,6,5,4,3,2,1,0] fi:1 ; encoding: [0x05,0x02,0x5d,0xd6,0xea,0x04,0xee,0x21,0x01,0x77,0x39,0x05] +// W64-ERR: :[[@LINE-2]]:44: error: invalid operand for instruction -v_cndmask_b16_e64_dpp v5, v1, v2, s[6:7] dpp8:[7,6,5,4,3,2,1,0] -// W64: v_cndmask_b16_e64_dpp v5, v1, v2, s[6:7] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x5d,0xd6,0xe9,0x04,0x1a,0x00,0x01,0x77,0x39,0x05] -// W32-ERR: :[[@LINE-2]]:35: error: invalid operand for instruction +v_cndmask_b16_e64_dpp v5.l, v1.l, v2.l, s[6:7] dpp8:[7,6,5,4,3,2,1,0] +// W64: 
v_cndmask_b16_e64_dpp v5.l, v1.l, v2.l, s[6:7] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x5d,0xd6,0xe9,0x04,0x1a,0x00,0x01,0x77,0x39,0x05] +// W32-ERR: :[[@LINE-2]]:41: error: invalid operand for instruction -v_cndmask_b16_e64_dpp v5, v1, v2, s[104:105] dpp8:[7,6,5,4,3,2,1,0] -// W64: v_cndmask_b16_e64_dpp v5, v1, v2, s[104:105] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x5d,0xd6,0xe9,0x04,0xa2,0x01,0x01,0x77,0x39,0x05] -// W32-ERR: :[[@LINE-2]]:35: error: invalid operand for instruction +v_cndmask_b16_e64_dpp v5.l, v1.l, v2.l, s[104:105] dpp8:[7,6,5,4,3,2,1,0] +// W64: v_cndmask_b16_e64_dpp v5.l, v1.l, v2.l, s[104:105] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x5d,0xd6,0xe9,0x04,0xa2,0x01,0x01,0x77,0x39,0x05] +// W32-ERR: :[[@LINE-2]]:41: error: invalid operand for instruction -v_cndmask_b16_e64_dpp v5, |v1|, -v2, vcc dpp8:[7,6,5,4,3,2,1,0] -// W64: v_cndmask_b16_e64_dpp v5, |v1|, -v2, vcc dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x01,0x5d,0xd6,0xe9,0x04,0xaa,0x41,0x01,0x77,0x39,0x05] -// W32-ERR: :[[@LINE-2]]:38: error: invalid operand for instruction +v_cndmask_b16_e64_dpp v5.l, |v1.l|, -v2.l, vcc dpp8:[7,6,5,4,3,2,1,0] +// W64: v_cndmask_b16_e64_dpp v5.l, |v1.l|, -v2.l, vcc dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x01,0x5d,0xd6,0xe9,0x04,0xaa,0x41,0x01,0x77,0x39,0x05] +// W32-ERR: :[[@LINE-2]]:44: error: invalid operand for instruction -v_cndmask_b16_e64_dpp v5, -v1, |v2|, ttmp[14:15] dpp8:[7,6,5,4,3,2,1,0] fi:1 -// W64: v_cndmask_b16_e64_dpp v5, -v1, |v2|, ttmp[14:15] dpp8:[7,6,5,4,3,2,1,0] fi:1 ; encoding: [0x05,0x02,0x5d,0xd6,0xea,0x04,0xea,0x21,0x01,0x77,0x39,0x05] -// W32-ERR: :[[@LINE-2]]:38: error: invalid operand for instruction +v_cndmask_b16_e64_dpp v5.l, -v1.l, |v2.l|, ttmp[14:15] dpp8:[7,6,5,4,3,2,1,0] fi:1 +// W64: v_cndmask_b16_e64_dpp v5.l, -v1.l, |v2.l|, ttmp[14:15] dpp8:[7,6,5,4,3,2,1,0] fi:1 ; encoding: [0x05,0x02,0x5d,0xd6,0xea,0x04,0xea,0x21,0x01,0x77,0x39,0x05] +// W32-ERR: :[[@LINE-2]]:44: error: invalid operand for 
instruction -v_cndmask_b16_e64_dpp v255, -|v255|, -|v255|, null dpp8:[0,0,0,0,0,0,0,0] fi:0 -// GFX11: v_cndmask_b16_e64_dpp v255, -|v255|, -|v255|, null dpp8:[0,0,0,0,0,0,0,0] ; encoding: [0xff,0x03,0x5d,0xd6,0xe9,0xfe,0xf3,0x61,0xff,0x00,0x00,0x00] +v_cndmask_b16_e64_dpp v255.l, -|v255.l|, -|v255.l|, null dpp8:[0,0,0,0,0,0,0,0] fi:0 +// GFX11: v_cndmask_b16_e64_dpp v255.l, -|v255.l|, -|v255.l|, null dpp8:[0,0,0,0,0,0,0,0] ; encoding: [0xff,0x03,0x5d,0xd6,0xe9,0xfe,0xf3,0x61,0xff,0x00,0x00,0x00] + +v_cndmask_b16_e64_dpp v5.l, v1.l, v2.l, vcc_lo dpp8:[7,6,5,4,3,2,1,0] +// W32: v_cndmask_b16_e64_dpp v5.l, v1.l, v2.l, vcc_lo dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x5d,0xd6,0xe9,0x04,0xaa,0x01,0x01,0x77,0x39,0x05] +// W64-ERR: :[[@LINE-2]]:41: error: invalid operand for instruction + +v_cndmask_b16_e64_dpp v5.l, |v1.l|, -v2.l, vcc_hi dpp8:[7,6,5,4,3,2,1,0] +// W32: v_cndmask_b16_e64_dpp v5.l, |v1.l|, -v2.l, vcc_hi dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x01,0x5d,0xd6,0xe9,0x04,0xae,0x41,0x01,0x77,0x39,0x05] +// W64-ERR: :[[@LINE-2]]:44: error: invalid operand for instruction + +v_cndmask_b16_e64_dpp v5.l, |v1.h|, -v2.l, vcc_hi dpp8:[7,6,5,4,3,2,1,0] +// W32: v_cndmask_b16_e64_dpp v5.l, |v1.h|, -v2.l, vcc_hi dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x09,0x5d,0xd6,0xe9,0x04,0xae,0x41,0x01,0x77,0x39,0x05] +// W64-ERR: :[[@LINE-2]]:44: error: invalid operand for instruction + +v_cndmask_b16_e64_dpp v5.l, -v1.l, |v2.h|, ttmp15 dpp8:[7,6,5,4,3,2,1,0] fi:1 +// W32: v_cndmask_b16_e64_dpp v5.l, -v1.l, |v2.h|, ttmp15 dpp8:[7,6,5,4,3,2,1,0] fi:1 ; encoding: [0x05,0x12,0x5d,0xd6,0xea,0x04,0xee,0x21,0x01,0x77,0x39,0x05] +// W64-ERR: :[[@LINE-2]]:44: error: invalid operand for instruction + +v_cndmask_b16_e64_dpp v5.l, |v1.h|, -v2.l, vcc dpp8:[7,6,5,4,3,2,1,0] +// W64: v_cndmask_b16_e64_dpp v5.l, |v1.h|, -v2.l, vcc dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x09,0x5d,0xd6,0xe9,0x04,0xaa,0x41,0x01,0x77,0x39,0x05] +// W32-ERR: :[[@LINE-2]]:44: error: invalid operand for 
instruction + +v_cndmask_b16_e64_dpp v5.l, -v1.l, |v2.h|, ttmp[14:15] dpp8:[7,6,5,4,3,2,1,0] fi:1 +// W64: v_cndmask_b16_e64_dpp v5.l, -v1.l, |v2.h|, ttmp[14:15] dpp8:[7,6,5,4,3,2,1,0] fi:1 ; encoding: [0x05,0x12,0x5d,0xd6,0xea,0x04,0xea,0x21,0x01,0x77,0x39,0x05] +// W32-ERR: :[[@LINE-2]]:44: error: invalid operand for instruction + +v_cndmask_b16_e64_dpp v255.h, -|v255.l|, -|v255.l|, null dpp8:[0,0,0,0,0,0,0,0] fi:0 +// GFX11: v_cndmask_b16_e64_dpp v255.h, -|v255.l|, -|v255.l|, null dpp8:[0,0,0,0,0,0,0,0] ; encoding: [0xff,0x43,0x5d,0xd6,0xe9,0xfe,0xf3,0x61,0xff,0x00,0x00,0x00] v_cubeid_f32_e64_dpp v5, v1, v2, v3 dpp8:[7,6,5,4,3,2,1,0] // GFX11: v_cubeid_f32_e64_dpp v5, v1, v2, v3 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x0c,0xd6,0xe9,0x04,0x0e,0x04,0x01,0x77,0x39,0x05] diff --git a/llvm/test/MC/AMDGPU/gfx11_asm_vop3_dpp8_from_vopcx.s b/llvm/test/MC/AMDGPU/gfx11_asm_vop3_dpp8_from_vopcx.s index 46f1db837b0dd..fb2b28874bd04 100644 --- a/llvm/test/MC/AMDGPU/gfx11_asm_vop3_dpp8_from_vopcx.s +++ b/llvm/test/MC/AMDGPU/gfx11_asm_vop3_dpp8_from_vopcx.s @@ -326,17 +326,26 @@ v_cmpx_lg_f32_e64_dpp -v1, |v2| dpp8:[7,6,5,4,3,2,1,0] fi:1 v_cmpx_lg_f32_e64_dpp -|v255|, -|v255| clamp dpp8:[0,0,0,0,0,0,0,0] fi:0 // GFX11: v_cmpx_lg_f32_e64_dpp -|v255|, -|v255| clamp dpp8:[0,0,0,0,0,0,0,0] ; encoding: [0x7e,0x83,0x95,0xd4,0xe9,0xfe,0x03,0x60,0xff,0x00,0x00,0x00] -v_cmpx_lt_f16_e64_dpp v1, v2 dpp8:[7,6,5,4,3,2,1,0] -// GFX11: v_cmpx_lt_f16_e64_dpp v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x00,0x81,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05] +v_cmpx_lt_f16_e64_dpp v1.l, v2.l dpp8:[7,6,5,4,3,2,1,0] +// GFX11: v_cmpx_lt_f16_e64_dpp v1.l, v2.l dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x00,0x81,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05] -v_cmpx_lt_f16_e64_dpp |v1|, -v2 dpp8:[7,6,5,4,3,2,1,0] -// GFX11: v_cmpx_lt_f16_e64_dpp |v1|, -v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x01,0x81,0xd4,0xe9,0x04,0x02,0x40,0x01,0x77,0x39,0x05] +v_cmpx_lt_f16_e64_dpp |v1.l|, -v2.l 
dpp8:[7,6,5,4,3,2,1,0] +// GFX11: v_cmpx_lt_f16_e64_dpp |v1.l|, -v2.l dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x01,0x81,0xd4,0xe9,0x04,0x02,0x40,0x01,0x77,0x39,0x05] -v_cmpx_lt_f16_e64_dpp -v1, |v2| dpp8:[7,6,5,4,3,2,1,0] fi:1 -// GFX11: v_cmpx_lt_f16_e64_dpp -v1, |v2| dpp8:[7,6,5,4,3,2,1,0] fi:1 ; encoding: [0x7e,0x02,0x81,0xd4,0xea,0x04,0x02,0x20,0x01,0x77,0x39,0x05] +v_cmpx_lt_f16_e64_dpp -v1.l, |v2.l| dpp8:[7,6,5,4,3,2,1,0] fi:1 +// GFX11: v_cmpx_lt_f16_e64_dpp -v1.l, |v2.l| dpp8:[7,6,5,4,3,2,1,0] fi:1 ; encoding: [0x7e,0x02,0x81,0xd4,0xea,0x04,0x02,0x20,0x01,0x77,0x39,0x05] -v_cmpx_lt_f16_e64_dpp -|v255|, -|v255| clamp dpp8:[0,0,0,0,0,0,0,0] fi:0 -// GFX11: v_cmpx_lt_f16_e64_dpp -|v255|, -|v255| clamp dpp8:[0,0,0,0,0,0,0,0] ; encoding: [0x7e,0x83,0x81,0xd4,0xe9,0xfe,0x03,0x60,0xff,0x00,0x00,0x00] +v_cmpx_lt_f16_e64_dpp -|v255.l|, -|v255.l| clamp dpp8:[0,0,0,0,0,0,0,0] fi:0 +// GFX11: v_cmpx_lt_f16_e64_dpp -|v255.l|, -|v255.l| clamp dpp8:[0,0,0,0,0,0,0,0] ; encoding: [0x7e,0x83,0x81,0xd4,0xe9,0xfe,0x03,0x60,0xff,0x00,0x00,0x00] + +v_cmpx_lt_f16_e64_dpp |v1.h|, -v2.h dpp8:[7,6,5,4,3,2,1,0] +// GFX11: v_cmpx_lt_f16_e64_dpp |v1.h|, -v2.h op_sel:[1,1] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x19,0x81,0xd4,0xe9,0x04,0x02,0x40,0x01,0x77,0x39,0x05] + +v_cmpx_lt_f16_e64_dpp -v1.h, |v2.l| dpp8:[7,6,5,4,3,2,1,0] fi:1 +// GFX11: v_cmpx_lt_f16_e64_dpp -v1.h, |v2.l| op_sel:[1,0] dpp8:[7,6,5,4,3,2,1,0] fi:1 ; encoding: [0x7e,0x0a,0x81,0xd4,0xea,0x04,0x02,0x20,0x01,0x77,0x39,0x05] + +v_cmpx_lt_f16_e64_dpp -|v255.l|, -|v255.h| clamp dpp8:[0,0,0,0,0,0,0,0] fi:0 +// GFX11: v_cmpx_lt_f16_e64_dpp -|v255.l|, -|v255.h| op_sel:[0,1] clamp dpp8:[0,0,0,0,0,0,0,0] ; encoding: [0x7e,0x93,0x81,0xd4,0xe9,0xfe,0x03,0x60,0xff,0x00,0x00,0x00] v_cmpx_lt_f32_e64_dpp v1, v2 dpp8:[7,6,5,4,3,2,1,0] // GFX11: v_cmpx_lt_f32_e64_dpp v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x00,0x91,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05] diff --git a/llvm/test/MC/AMDGPU/gfx11_asm_vop3_from_vopcx.s 
b/llvm/test/MC/AMDGPU/gfx11_asm_vop3_from_vopcx.s index 371d29f2a2cb6..7a95d8cd53cde 100644 --- a/llvm/test/MC/AMDGPU/gfx11_asm_vop3_from_vopcx.s +++ b/llvm/test/MC/AMDGPU/gfx11_asm_vop3_from_vopcx.s @@ -2066,11 +2066,11 @@ v_cmpx_lg_f64_e64 -|src_scc|, -|exec| v_cmpx_lg_f64_e64 0xaf123456, -|vcc| clamp // GFX11: v_cmpx_lg_f64_e64 0xaf123456, -|vcc| clamp ; encoding: [0x7e,0x82,0xa5,0xd4,0xff,0xd4,0x00,0x40,0x56,0x34,0x12,0xaf] -v_cmpx_lt_f16_e64 v1, v2 -// GFX11: v_cmpx_lt_f16_e64 v1, v2 ; encoding: [0x7e,0x00,0x81,0xd4,0x01,0x05,0x02,0x00] +v_cmpx_lt_f16_e64 v1.l, v2.l +// GFX11: v_cmpx_lt_f16_e64 v1.l, v2.l ; encoding: [0x7e,0x00,0x81,0xd4,0x01,0x05,0x02,0x00] -v_cmpx_lt_f16_e64 v255, v255 -// GFX11: v_cmpx_lt_f16_e64 v255, v255 ; encoding: [0x7e,0x00,0x81,0xd4,0xff,0xff,0x03,0x00] +v_cmpx_lt_f16_e64 v255.l, v255.l +// GFX11: v_cmpx_lt_f16_e64 v255.l, v255.l ; encoding: [0x7e,0x00,0x81,0xd4,0xff,0xff,0x03,0x00] v_cmpx_lt_f16_e64 s1, s2 // GFX11: v_cmpx_lt_f16_e64 s1, s2 ; encoding: [0x7e,0x00,0x81,0xd4,0x01,0x04,0x00,0x00] @@ -2111,6 +2111,12 @@ v_cmpx_lt_f16_e64 -src_scc, |vcc_lo| v_cmpx_lt_f16_e64 -|0xfe0b|, -|vcc_hi| clamp // GFX11: v_cmpx_lt_f16_e64 -|0xfe0b|, -|vcc_hi| clamp ; encoding: [0x7e,0x83,0x81,0xd4,0xff,0xd6,0x00,0x60,0x0b,0xfe,0x00,0x00] +v_cmpx_lt_f16_e64 v1.h, v2.l +// GFX11: v_cmpx_lt_f16_e64 v1.h, v2.l ; encoding: [0x7e,0x08,0x81,0xd4,0x01,0x05,0x02,0x00] + +v_cmpx_lt_f16_e64 v255.l, v255.h +// GFX11: v_cmpx_lt_f16_e64 v255.l, v255.h ; encoding: [0x7e,0x10,0x81,0xd4,0xff,0xff,0x03,0x00] + v_cmpx_lt_f32_e64 v1, v2 // GFX11: v_cmpx_lt_f32_e64 v1, v2 ; encoding: [0x7e,0x00,0x91,0xd4,0x01,0x05,0x02,0x00] diff --git a/llvm/test/MC/AMDGPU/gfx11_asm_vopcx.s b/llvm/test/MC/AMDGPU/gfx11_asm_vopcx.s index 82c43e1a91b6a..42d7c5ea600b4 100644 --- a/llvm/test/MC/AMDGPU/gfx11_asm_vopcx.s +++ b/llvm/test/MC/AMDGPU/gfx11_asm_vopcx.s @@ -2054,50 +2054,65 @@ v_cmpx_lg_f64 src_scc, v[2:3] v_cmpx_lg_f64 0xaf123456, v[254:255] // GFX11: v_cmpx_lg_f64_e32 
0xaf123456, v[254:255] ; encoding: [0xff,0xfc,0x4b,0x7d,0x56,0x34,0x12,0xaf] -v_cmpx_lt_f16 v1, v2 -// GFX11: v_cmpx_lt_f16_e32 v1, v2 ; encoding: [0x01,0x05,0x02,0x7d] +v_cmpx_lt_f16 v1.l, v2.l +// GFX11: v_cmpx_lt_f16_e32 v1.l, v2.l ; encoding: [0x01,0x05,0x02,0x7d] -v_cmpx_lt_f16 v127, v2 -// GFX11: v_cmpx_lt_f16_e32 v127, v2 ; encoding: [0x7f,0x05,0x02,0x7d] +v_cmpx_lt_f16 v127.l, v2.l +// GFX11: v_cmpx_lt_f16_e32 v127.l, v2.l ; encoding: [0x7f,0x05,0x02,0x7d] -v_cmpx_lt_f16 s1, v2 -// GFX11: v_cmpx_lt_f16_e32 s1, v2 ; encoding: [0x01,0x04,0x02,0x7d] +v_cmpx_lt_f16 s1, v2.l +// GFX11: v_cmpx_lt_f16_e32 s1, v2.l ; encoding: [0x01,0x04,0x02,0x7d] -v_cmpx_lt_f16 s105, v2 -// GFX11: v_cmpx_lt_f16_e32 s105, v2 ; encoding: [0x69,0x04,0x02,0x7d] +v_cmpx_lt_f16 s105, v2.l +// GFX11: v_cmpx_lt_f16_e32 s105, v2.l ; encoding: [0x69,0x04,0x02,0x7d] -v_cmpx_lt_f16 vcc_lo, v2 -// GFX11: v_cmpx_lt_f16_e32 vcc_lo, v2 ; encoding: [0x6a,0x04,0x02,0x7d] +v_cmpx_lt_f16 vcc_lo, v2.l +// GFX11: v_cmpx_lt_f16_e32 vcc_lo, v2.l ; encoding: [0x6a,0x04,0x02,0x7d] -v_cmpx_lt_f16 vcc_hi, v2 -// GFX11: v_cmpx_lt_f16_e32 vcc_hi, v2 ; encoding: [0x6b,0x04,0x02,0x7d] +v_cmpx_lt_f16 vcc_hi, v2.l +// GFX11: v_cmpx_lt_f16_e32 vcc_hi, v2.l ; encoding: [0x6b,0x04,0x02,0x7d] -v_cmpx_lt_f16 ttmp15, v2 -// GFX11: v_cmpx_lt_f16_e32 ttmp15, v2 ; encoding: [0x7b,0x04,0x02,0x7d] +v_cmpx_lt_f16 ttmp15, v2.l +// GFX11: v_cmpx_lt_f16_e32 ttmp15, v2.l ; encoding: [0x7b,0x04,0x02,0x7d] -v_cmpx_lt_f16 m0, v2 -// GFX11: v_cmpx_lt_f16_e32 m0, v2 ; encoding: [0x7d,0x04,0x02,0x7d] +v_cmpx_lt_f16 m0, v2.l +// GFX11: v_cmpx_lt_f16_e32 m0, v2.l ; encoding: [0x7d,0x04,0x02,0x7d] -v_cmpx_lt_f16 exec_lo, v2 -// GFX11: v_cmpx_lt_f16_e32 exec_lo, v2 ; encoding: [0x7e,0x04,0x02,0x7d] +v_cmpx_lt_f16 exec_lo, v2.l +// GFX11: v_cmpx_lt_f16_e32 exec_lo, v2.l ; encoding: [0x7e,0x04,0x02,0x7d] -v_cmpx_lt_f16 exec_hi, v2 -// GFX11: v_cmpx_lt_f16_e32 exec_hi, v2 ; encoding: [0x7f,0x04,0x02,0x7d] +v_cmpx_lt_f16 exec_hi, v2.l +// 
GFX11: v_cmpx_lt_f16_e32 exec_hi, v2.l ; encoding: [0x7f,0x04,0x02,0x7d] -v_cmpx_lt_f16 null, v2 -// GFX11: v_cmpx_lt_f16_e32 null, v2 ; encoding: [0x7c,0x04,0x02,0x7d] +v_cmpx_lt_f16 null, v2.l +// GFX11: v_cmpx_lt_f16_e32 null, v2.l ; encoding: [0x7c,0x04,0x02,0x7d] -v_cmpx_lt_f16 -1, v2 -// GFX11: v_cmpx_lt_f16_e32 -1, v2 ; encoding: [0xc1,0x04,0x02,0x7d] +v_cmpx_lt_f16 -1, v2.l +// GFX11: v_cmpx_lt_f16_e32 -1, v2.l ; encoding: [0xc1,0x04,0x02,0x7d] -v_cmpx_lt_f16 0.5, v2 -// GFX11: v_cmpx_lt_f16_e32 0.5, v2 ; encoding: [0xf0,0x04,0x02,0x7d] +v_cmpx_lt_f16 0.5, v2.l +// GFX11: v_cmpx_lt_f16_e32 0.5, v2.l ; encoding: [0xf0,0x04,0x02,0x7d] -v_cmpx_lt_f16 src_scc, v2 -// GFX11: v_cmpx_lt_f16_e32 src_scc, v2 ; encoding: [0xfd,0x04,0x02,0x7d] +v_cmpx_lt_f16 src_scc, v2.l +// GFX11: v_cmpx_lt_f16_e32 src_scc, v2.l ; encoding: [0xfd,0x04,0x02,0x7d] -v_cmpx_lt_f16 0xfe0b, v127 -// GFX11: v_cmpx_lt_f16_e32 0xfe0b, v127 ; encoding: [0xff,0xfe,0x02,0x7d,0x0b,0xfe,0x00,0x00] +v_cmpx_lt_f16 0xfe0b, v127.l +// GFX11: v_cmpx_lt_f16_e32 0xfe0b, v127.l ; encoding: [0xff,0xfe,0x02,0x7d,0x0b,0xfe,0x00,0x00] + +v_cmpx_lt_f16 v1.h, v2.l +// GFX11: v_cmpx_lt_f16_e32 v1.h, v2.l ; encoding: [0x81,0x05,0x02,0x7d] + +v_cmpx_lt_f16 v127.h, v2.l +// GFX11: v_cmpx_lt_f16_e32 v127.h, v2.l ; encoding: [0xff,0x05,0x02,0x7d] + +v_cmpx_lt_f16 0.5, v127.l +// GFX11: v_cmpx_lt_f16_e32 0.5, v127.l ; encoding: [0xf0,0xfe,0x02,0x7d] + +v_cmpx_lt_f16 src_scc, v2.h +// GFX11: v_cmpx_lt_f16_e32 src_scc, v2.h ; encoding: [0xfd,0x04,0x03,0x7d] + +v_cmpx_lt_f16 0xfe0b, v127.h +// GFX11: v_cmpx_lt_f16_e32 0xfe0b, v127.h ; encoding: [0xff,0xfe,0x03,0x7d,0x0b,0xfe,0x00,0x00] v_cmpx_lt_f32 v1, v2 // GFX11: v_cmpx_lt_f32_e32 v1, v2 ; encoding: [0x01,0x05,0x22,0x7d] diff --git a/llvm/test/MC/AMDGPU/gfx11_asm_vopcx_dpp16.s b/llvm/test/MC/AMDGPU/gfx11_asm_vopcx_dpp16.s index b2ea4348f33b8..57185330971e1 100644 --- a/llvm/test/MC/AMDGPU/gfx11_asm_vopcx_dpp16.s +++ b/llvm/test/MC/AMDGPU/gfx11_asm_vopcx_dpp16.s @@ 
-1346,47 +1346,56 @@ v_cmpx_lg_f32 v1, v2 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 fi:0 v_cmpx_lg_f32 -|v255|, -|v255| row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:0 fi:1 // GFX11: v_cmpx_lg_f32 -|v255|, -|v255| row_xmask:15 row_mask:0x3 bank_mask:0x0 fi:1 ; encoding: [0xfa,0xfe,0x2b,0x7d,0xff,0x6f,0xf5,0x30] -v_cmpx_lt_f16 v1, v2 quad_perm:[3,2,1,0] -// GFX11: v_cmpx_lt_f16 v1, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x02,0x7d,0x01,0x1b,0x00,0xff] +v_cmpx_lt_f16 v1.l, v2.l quad_perm:[3,2,1,0] +// GFX11: v_cmpx_lt_f16 v1.l, v2.l quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x02,0x7d,0x01,0x1b,0x00,0xff] -v_cmpx_lt_f16 v1, v2 quad_perm:[0,1,2,3] -// GFX11: v_cmpx_lt_f16 v1, v2 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x02,0x7d,0x01,0xe4,0x00,0xff] +v_cmpx_lt_f16 v1.l, v2.l quad_perm:[0,1,2,3] +// GFX11: v_cmpx_lt_f16 v1.l, v2.l quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x02,0x7d,0x01,0xe4,0x00,0xff] -v_cmpx_lt_f16 v1, v2 row_mirror -// GFX11: v_cmpx_lt_f16 v1, v2 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x02,0x7d,0x01,0x40,0x01,0xff] +v_cmpx_lt_f16 v1.l, v2.l row_mirror +// GFX11: v_cmpx_lt_f16 v1.l, v2.l row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x02,0x7d,0x01,0x40,0x01,0xff] -v_cmpx_lt_f16 v1, v2 row_half_mirror -// GFX11: v_cmpx_lt_f16 v1, v2 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x02,0x7d,0x01,0x41,0x01,0xff] +v_cmpx_lt_f16 v1.l, v2.l row_half_mirror +// GFX11: v_cmpx_lt_f16 v1.l, v2.l row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x02,0x7d,0x01,0x41,0x01,0xff] -v_cmpx_lt_f16 v1, v2 row_shl:1 -// GFX11: v_cmpx_lt_f16 v1, v2 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x02,0x7d,0x01,0x01,0x01,0xff] +v_cmpx_lt_f16 v1.l, v2.l row_shl:1 +// GFX11: v_cmpx_lt_f16 v1.l, v2.l row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: 
[0xfa,0x04,0x02,0x7d,0x01,0x01,0x01,0xff] -v_cmpx_lt_f16 v1, v2 row_shl:15 -// GFX11: v_cmpx_lt_f16 v1, v2 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x02,0x7d,0x01,0x0f,0x01,0xff] +v_cmpx_lt_f16 v1.l, v2.l row_shl:15 +// GFX11: v_cmpx_lt_f16 v1.l, v2.l row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x02,0x7d,0x01,0x0f,0x01,0xff] -v_cmpx_lt_f16 v1, v2 row_shr:1 -// GFX11: v_cmpx_lt_f16 v1, v2 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x02,0x7d,0x01,0x11,0x01,0xff] +v_cmpx_lt_f16 v1.l, v2.l row_shr:1 +// GFX11: v_cmpx_lt_f16 v1.l, v2.l row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x02,0x7d,0x01,0x11,0x01,0xff] -v_cmpx_lt_f16 v1, v2 row_shr:15 -// GFX11: v_cmpx_lt_f16 v1, v2 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x02,0x7d,0x01,0x1f,0x01,0xff] +v_cmpx_lt_f16 v1.l, v2.l row_shr:15 +// GFX11: v_cmpx_lt_f16 v1.l, v2.l row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x02,0x7d,0x01,0x1f,0x01,0xff] -v_cmpx_lt_f16 v1, v2 row_ror:1 -// GFX11: v_cmpx_lt_f16 v1, v2 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x02,0x7d,0x01,0x21,0x01,0xff] +v_cmpx_lt_f16 v1.l, v2.l row_ror:1 +// GFX11: v_cmpx_lt_f16 v1.l, v2.l row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x02,0x7d,0x01,0x21,0x01,0xff] -v_cmpx_lt_f16 v1, v2 row_ror:15 -// GFX11: v_cmpx_lt_f16 v1, v2 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x02,0x7d,0x01,0x2f,0x01,0xff] +v_cmpx_lt_f16 v1.l, v2.l row_ror:15 +// GFX11: v_cmpx_lt_f16 v1.l, v2.l row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x02,0x7d,0x01,0x2f,0x01,0xff] -v_cmpx_lt_f16 v1, v2 row_share:0 row_mask:0xf bank_mask:0xf -// GFX11: v_cmpx_lt_f16 v1, v2 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x02,0x7d,0x01,0x50,0x01,0xff] +v_cmpx_lt_f16 v1.l, v2.l row_share:0 row_mask:0xf bank_mask:0xf +// GFX11: v_cmpx_lt_f16 v1.l, v2.l row_share:0 row_mask:0xf bank_mask:0xf ; encoding: 
[0xfa,0x04,0x02,0x7d,0x01,0x50,0x01,0xff] -v_cmpx_lt_f16 v1, v2 row_share:15 row_mask:0x0 bank_mask:0x1 -// GFX11: v_cmpx_lt_f16 v1, v2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0xfa,0x04,0x02,0x7d,0x01,0x5f,0x01,0x01] +v_cmpx_lt_f16 v1.l, v2.l row_share:15 row_mask:0x0 bank_mask:0x1 +// GFX11: v_cmpx_lt_f16 v1.l, v2.l row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0xfa,0x04,0x02,0x7d,0x01,0x5f,0x01,0x01] -v_cmpx_lt_f16 v1, v2 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 fi:0 -// GFX11: v_cmpx_lt_f16 v1, v2 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 ; encoding: [0xfa,0x04,0x02,0x7d,0x01,0x60,0x09,0x13] +v_cmpx_lt_f16 v1.l, v2.l row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 +// GFX11: v_cmpx_lt_f16 v1.l, v2.l row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 ; encoding: [0xfa,0x04,0x02,0x7d,0x01,0x60,0x09,0x13] -v_cmpx_lt_f16 -|v127|, -|v127| row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:0 fi:1 -// GFX11: v_cmpx_lt_f16 -|v127|, -|v127| row_xmask:15 row_mask:0x3 bank_mask:0x0 fi:1 ; encoding: [0xfa,0xfe,0x02,0x7d,0x7f,0x6f,0xf5,0x30] +v_cmpx_lt_f16 -|v127.l|, -|v127.l| row_xmask:15 row_mask:0x3 bank_mask:0x0 fi:1 +// GFX11: v_cmpx_lt_f16 -|v127.l|, -|v127.l| row_xmask:15 row_mask:0x3 bank_mask:0x0 fi:1 ; encoding: [0xfa,0xfe,0x02,0x7d,0x7f,0x6f,0xf5,0x30] + +v_cmpx_lt_f16 v127.l, v127.l row_share:15 row_mask:0x0 bank_mask:0x1 +// GFX11: v_cmpx_lt_f16 v127.l, v127.l row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0xfa,0xfe,0x02,0x7d,0x7f,0x5f,0x01,0x01] + +v_cmpx_lt_f16 v1.h, v2.h row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 fi:0 +// GFX11: v_cmpx_lt_f16 v1.h, v2.h row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 ; encoding: [0xfa,0x04,0x03,0x7d,0x81,0x60,0x09,0x13] + +v_cmpx_lt_f16 -|v127.h|, -|v127.h| row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:0 fi:1 +// GFX11: v_cmpx_lt_f16 -|v127.h|, -|v127.h| row_xmask:15 row_mask:0x3 bank_mask:0x0 fi:1 ; encoding: [0xfa,0xfe,0x03,0x7d,0xff,0x6f,0xf5,0x30] 
v_cmpx_lt_f32 v1, v2 quad_perm:[3,2,1,0] // GFX11: v_cmpx_lt_f32 v1, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x22,0x7d,0x01,0x1b,0x00,0xff] diff --git a/llvm/test/MC/AMDGPU/gfx11_asm_vopcx_dpp8.s b/llvm/test/MC/AMDGPU/gfx11_asm_vopcx_dpp8.s index b4c556cf0328a..e78840e08c497 100644 --- a/llvm/test/MC/AMDGPU/gfx11_asm_vopcx_dpp8.s +++ b/llvm/test/MC/AMDGPU/gfx11_asm_vopcx_dpp8.s @@ -290,14 +290,23 @@ v_cmpx_lg_f32 v1, v2 dpp8:[7,6,5,4,3,2,1,0] fi:1 v_cmpx_lg_f32 v255, v255 dpp8:[0,0,0,0,0,0,0,0] fi:0 // GFX11: v_cmpx_lg_f32 v255, v255 dpp8:[0,0,0,0,0,0,0,0] ; encoding: [0xe9,0xfe,0x2b,0x7d,0xff,0x00,0x00,0x00] -v_cmpx_lt_f16 v1, v2 dpp8:[7,6,5,4,3,2,1,0] -// GFX11: v_cmpx_lt_f16 v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0x04,0x02,0x7d,0x01,0x77,0x39,0x05] +v_cmpx_lt_f16 v1.l, v2.l dpp8:[7,6,5,4,3,2,1,0] +// GFX11: v_cmpx_lt_f16 v1.l, v2.l dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0x04,0x02,0x7d,0x01,0x77,0x39,0x05] -v_cmpx_lt_f16 v1, v2 dpp8:[7,6,5,4,3,2,1,0] fi:1 -// GFX11: v_cmpx_lt_f16 v1, v2 dpp8:[7,6,5,4,3,2,1,0] fi:1 ; encoding: [0xea,0x04,0x02,0x7d,0x01,0x77,0x39,0x05] +v_cmpx_lt_f16 v1.l, v2.l dpp8:[7,6,5,4,3,2,1,0] fi:1 +// GFX11: v_cmpx_lt_f16 v1.l, v2.l dpp8:[7,6,5,4,3,2,1,0] fi:1 ; encoding: [0xea,0x04,0x02,0x7d,0x01,0x77,0x39,0x05] -v_cmpx_lt_f16 v127, v127 dpp8:[0,0,0,0,0,0,0,0] fi:0 -// GFX11: v_cmpx_lt_f16 v127, v127 dpp8:[0,0,0,0,0,0,0,0] ; encoding: [0xe9,0xfe,0x02,0x7d,0x7f,0x00,0x00,0x00] +v_cmpx_lt_f16 v127.l, v127.l dpp8:[0,0,0,0,0,0,0,0] +// GFX11: v_cmpx_lt_f16 v127.l, v127.l dpp8:[0,0,0,0,0,0,0,0] ; encoding: [0xe9,0xfe,0x02,0x7d,0x7f,0x00,0x00,0x00] + +v_cmpx_lt_f16 v127.l, v127.l dpp8:[7,6,5,4,3,2,1,0] +// GFX11: v_cmpx_lt_f16 v127.l, v127.l dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0xfe,0x02,0x7d,0x7f,0x77,0x39,0x05] + +v_cmpx_lt_f16 v1.h, v2.h dpp8:[7,6,5,4,3,2,1,0] fi:1 +// GFX11: v_cmpx_lt_f16 v1.h, v2.h dpp8:[7,6,5,4,3,2,1,0] fi:1 ; encoding: [0xea,0x04,0x03,0x7d,0x81,0x77,0x39,0x05] + 
+v_cmpx_lt_f16 v127.h, v127.h dpp8:[0,0,0,0,0,0,0,0] fi:0 +// GFX11: v_cmpx_lt_f16 v127.h, v127.h dpp8:[0,0,0,0,0,0,0,0] ; encoding: [0xe9,0xfe,0x03,0x7d,0xff,0x00,0x00,0x00] v_cmpx_lt_f32 v1, v2 dpp8:[7,6,5,4,3,2,1,0] // GFX11: v_cmpx_lt_f32 v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0x04,0x22,0x7d,0x01,0x77,0x39,0x05] diff --git a/llvm/test/MC/AMDGPU/gfx11_asm_vopcx_t16_err.s b/llvm/test/MC/AMDGPU/gfx11_asm_vopcx_t16_err.s index ec628dd94f366..7c9fa7f846d47 100644 --- a/llvm/test/MC/AMDGPU/gfx11_asm_vopcx_t16_err.s +++ b/llvm/test/MC/AMDGPU/gfx11_asm_vopcx_t16_err.s @@ -1,4 +1,4 @@ -; NOTE: Assertions have been autogenerated by utils/update_mc_test_checks.py UTC_ARGS: --sort --version 5 +// NOTE: Assertions have been autogenerated by utils/update_mc_test_checks.py UTC_ARGS: --sort --version 5 // RUN: not llvm-mc -triple=amdgcn -mcpu=gfx1100 -mattr=+wavefrontsize64,+real-true16 -show-encoding %s 2>&1 | FileCheck --check-prefix=GFX11 --implicit-check-not=error: %s v_cmpx_class_f16_e32 v1, v255 @@ -271,23 +271,41 @@ v_cmpx_lg_f16_e32 v255, v2 dpp8:[7,6,5,4,3,2,1,0] v_cmpx_lg_f16_e32 v255, v2 quad_perm:[3,2,1,0] // GFX11: :[[@LINE-1]]:28: error: invalid operand for instruction -v_cmpx_lt_f16_e32 v1, v255 -// GFX11: :[[@LINE-1]]:1: error: operands are not valid for this GPU or mode +v_cmpx_lt_f16_e32 v1.h, v255.h +// GFX11: :[[@LINE-1]]:25: error: invalid operand for instruction -v_cmpx_lt_f16_e32 v1, v255 dpp8:[7,6,5,4,3,2,1,0] -// GFX11: :[[@LINE-1]]:28: error: invalid operand for instruction +v_cmpx_lt_f16_e32 v1.h, v255.h dpp8:[7,6,5,4,3,2,1,0] +// GFX11: :[[@LINE-1]]:25: error: invalid operand for instruction -v_cmpx_lt_f16_e32 v1, v255 quad_perm:[3,2,1,0] -// GFX11: :[[@LINE-1]]:28: error: invalid operand for instruction +v_cmpx_lt_f16_e32 v1.h, v255.h quad_perm:[3,2,1,0] +// GFX11: :[[@LINE-1]]:25: error: invalid operand for instruction -v_cmpx_lt_f16_e32 v255, v2 -// GFX11: :[[@LINE-1]]:1: error: operands are not valid for this GPU or mode 
+v_cmpx_lt_f16_e32 v1.l, v255.l +// GFX11: :[[@LINE-1]]:25: error: invalid operand for instruction -v_cmpx_lt_f16_e32 v255, v2 dpp8:[7,6,5,4,3,2,1,0] -// GFX11: :[[@LINE-1]]:28: error: invalid operand for instruction +v_cmpx_lt_f16_e32 v1.l, v255.l dpp8:[7,6,5,4,3,2,1,0] +// GFX11: :[[@LINE-1]]:25: error: invalid operand for instruction -v_cmpx_lt_f16_e32 v255, v2 quad_perm:[3,2,1,0] -// GFX11: :[[@LINE-1]]:28: error: invalid operand for instruction +v_cmpx_lt_f16_e32 v1.l, v255.l quad_perm:[3,2,1,0] +// GFX11: :[[@LINE-1]]:25: error: invalid operand for instruction + +v_cmpx_lt_f16_e32 v255.h, v2.h +// GFX11: :[[@LINE-1]]:19: error: invalid operand for instruction + +v_cmpx_lt_f16_e32 v255.h, v2.h dpp8:[7,6,5,4,3,2,1,0] +// GFX11: :[[@LINE-1]]:19: error: invalid operand for instruction + +v_cmpx_lt_f16_e32 v255.h, v2.h quad_perm:[3,2,1,0] +// GFX11: :[[@LINE-1]]:19: error: invalid operand for instruction + +v_cmpx_lt_f16_e32 v255.l, v2.l +// GFX11: :[[@LINE-1]]:19: error: invalid operand for instruction + +v_cmpx_lt_f16_e32 v255.l, v2.l dpp8:[7,6,5,4,3,2,1,0] +// GFX11: :[[@LINE-1]]:19: error: invalid operand for instruction + +v_cmpx_lt_f16_e32 v255.l, v2.l quad_perm:[3,2,1,0] +// GFX11: :[[@LINE-1]]:19: error: invalid operand for instruction v_cmpx_lt_i16_e32 v1, v255 // GFX11: :[[@LINE-1]]:1: error: operands are not valid for this GPU or mode diff --git a/llvm/test/MC/AMDGPU/gfx11_asm_vopcx_t16_promote.s b/llvm/test/MC/AMDGPU/gfx11_asm_vopcx_t16_promote.s index 3bbdf3d3a903f..bffe5c7251ddf 100644 --- a/llvm/test/MC/AMDGPU/gfx11_asm_vopcx_t16_promote.s +++ b/llvm/test/MC/AMDGPU/gfx11_asm_vopcx_t16_promote.s @@ -1,4 +1,4 @@ -; NOTE: Assertions have been autogenerated by utils/update_mc_test_checks.py UTC_ARGS: --sort --version 5 +// NOTE: Assertions have been autogenerated by utils/update_mc_test_checks.py UTC_ARGS: --sort --version 5 // RUN: llvm-mc -triple=amdgcn -mcpu=gfx1100 -mattr=+wavefrontsize64,+real-true16 -show-encoding %s 2>&1 | FileCheck 
--check-prefix=GFX11 %s v_cmpx_class_f16 v1, v255 @@ -271,23 +271,41 @@ v_cmpx_lg_f16 v255, v2 dpp8:[7,6,5,4,3,2,1,0] v_cmpx_lg_f16 v255, v2 quad_perm:[3,2,1,0] // GFX11: v_cmpx_lg_f16_e64_dpp v255, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x85,0xd4,0xfa,0x04,0x02,0x00,0xff,0x1b,0x00,0xff] -v_cmpx_lt_f16 v1, v255 -// GFX11: v_cmpx_lt_f16_e64 v1, v255 ; encoding: [0x7e,0x00,0x81,0xd4,0x01,0xff,0x03,0x00] +v_cmpx_lt_f16 v1.h, v255.h +// GFX11: v_cmpx_lt_f16_e64 v1.h, v255.h ; encoding: [0x7e,0x18,0x81,0xd4,0x01,0xff,0x03,0x00] -v_cmpx_lt_f16 v1, v255 dpp8:[7,6,5,4,3,2,1,0] -// GFX11: v_cmpx_lt_f16_e64_dpp v1, v255 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x00,0x81,0xd4,0xe9,0xfe,0x03,0x00,0x01,0x77,0x39,0x05] +v_cmpx_lt_f16 v1.h, v255.h dpp8:[7,6,5,4,3,2,1,0] +// GFX11: v_cmpx_lt_f16_e64_dpp v1.h, v255.h op_sel:[1,1] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x18,0x81,0xd4,0xe9,0xfe,0x03,0x00,0x01,0x77,0x39,0x05] -v_cmpx_lt_f16 v1, v255 quad_perm:[3,2,1,0] -// GFX11: v_cmpx_lt_f16_e64_dpp v1, v255 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x81,0xd4,0xfa,0xfe,0x03,0x00,0x01,0x1b,0x00,0xff] +v_cmpx_lt_f16 v1.h, v255.h quad_perm:[3,2,1,0] +// GFX11: v_cmpx_lt_f16_e64_dpp v1.h, v255.h op_sel:[1,1] quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x18,0x81,0xd4,0xfa,0xfe,0x03,0x00,0x01,0x1b,0x00,0xff] -v_cmpx_lt_f16 v255, v2 -// GFX11: v_cmpx_lt_f16_e64 v255, v2 ; encoding: [0x7e,0x00,0x81,0xd4,0xff,0x05,0x02,0x00] +v_cmpx_lt_f16 v1.l, v255.l +// GFX11: v_cmpx_lt_f16_e64 v1.l, v255.l ; encoding: [0x7e,0x00,0x81,0xd4,0x01,0xff,0x03,0x00] -v_cmpx_lt_f16 v255, v2 dpp8:[7,6,5,4,3,2,1,0] -// GFX11: v_cmpx_lt_f16_e64_dpp v255, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x00,0x81,0xd4,0xe9,0x04,0x02,0x00,0xff,0x77,0x39,0x05] +v_cmpx_lt_f16 v1.l, v255.l dpp8:[7,6,5,4,3,2,1,0] +// GFX11: v_cmpx_lt_f16_e64_dpp v1.l, v255.l dpp8:[7,6,5,4,3,2,1,0] ; encoding: 
[0x7e,0x00,0x81,0xd4,0xe9,0xfe,0x03,0x00,0x01,0x77,0x39,0x05] -v_cmpx_lt_f16 v255, v2 quad_perm:[3,2,1,0] -// GFX11: v_cmpx_lt_f16_e64_dpp v255, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x81,0xd4,0xfa,0x04,0x02,0x00,0xff,0x1b,0x00,0xff] +v_cmpx_lt_f16 v1.l, v255.l quad_perm:[3,2,1,0] +// GFX11: v_cmpx_lt_f16_e64_dpp v1.l, v255.l quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x81,0xd4,0xfa,0xfe,0x03,0x00,0x01,0x1b,0x00,0xff] + +v_cmpx_lt_f16 v255.h, v2.h +// GFX11: v_cmpx_lt_f16_e64 v255.h, v2.h ; encoding: [0x7e,0x18,0x81,0xd4,0xff,0x05,0x02,0x00] + +v_cmpx_lt_f16 v255.h, v2.h dpp8:[7,6,5,4,3,2,1,0] +// GFX11: v_cmpx_lt_f16_e64_dpp v255.h, v2.h op_sel:[1,1] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x18,0x81,0xd4,0xe9,0x04,0x02,0x00,0xff,0x77,0x39,0x05] + +v_cmpx_lt_f16 v255.h, v2.h quad_perm:[3,2,1,0] +// GFX11: v_cmpx_lt_f16_e64_dpp v255.h, v2.h op_sel:[1,1] quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x18,0x81,0xd4,0xfa,0x04,0x02,0x00,0xff,0x1b,0x00,0xff] + +v_cmpx_lt_f16 v255.l, v2.l +// GFX11: v_cmpx_lt_f16_e64 v255.l, v2.l ; encoding: [0x7e,0x00,0x81,0xd4,0xff,0x05,0x02,0x00] + +v_cmpx_lt_f16 v255.l, v2.l dpp8:[7,6,5,4,3,2,1,0] +// GFX11: v_cmpx_lt_f16_e64_dpp v255.l, v2.l dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x00,0x81,0xd4,0xe9,0x04,0x02,0x00,0xff,0x77,0x39,0x05] + +v_cmpx_lt_f16 v255.l, v2.l quad_perm:[3,2,1,0] +// GFX11: v_cmpx_lt_f16_e64_dpp v255.l, v2.l quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x81,0xd4,0xfa,0x04,0x02,0x00,0xff,0x1b,0x00,0xff] v_cmpx_lt_i16 v1, v255 // GFX11: v_cmpx_lt_i16_e64 v1, v255 ; encoding: [0x7e,0x00,0xb1,0xd4,0x01,0xff,0x03,0x00] diff --git a/llvm/test/MC/AMDGPU/gfx12_asm_vop3.s b/llvm/test/MC/AMDGPU/gfx12_asm_vop3.s index 3e7b7d28c2e97..cd4ed2b9458e6 100644 --- a/llvm/test/MC/AMDGPU/gfx12_asm_vop3.s +++ b/llvm/test/MC/AMDGPU/gfx12_asm_vop3.s @@ -857,104 +857,131 @@ v_bfm_b32 v5, src_scc, vcc_lo v_bfm_b32 v255, 
0xaf123456, vcc_hi // GFX12: v_bfm_b32 v255, 0xaf123456, vcc_hi ; encoding: [0xff,0x00,0x1d,0xd7,0xff,0xd6,0x00,0x00,0x56,0x34,0x12,0xaf] -v_cndmask_b16 v5, v1, src_scc, s3 -// W32: v_cndmask_b16 v5, v1, src_scc, s3 ; encoding: [0x05,0x00,0x5d,0xd6,0x01,0xfb,0x0d,0x00] -// W64-ERR: :[[@LINE-2]]:32: error: invalid operand for instruction +v_cndmask_b16 v5.l, v1.l, src_scc, s3 +// W32: v_cndmask_b16 v5.l, v1.l, src_scc, s3 ; encoding: [0x05,0x00,0x5d,0xd6,0x01,0xfb,0x0d,0x00] +// W64-ERR: :[[@LINE-2]]:36: error: invalid operand for instruction -v_cndmask_b16 v5, v255, 0.5, s3 -// W32: v_cndmask_b16 v5, v255, 0.5, s3 ; encoding: [0x05,0x00,0x5d,0xd6,0xff,0xe1,0x0d,0x00] -// W64-ERR: :[[@LINE-2]]:30: error: invalid operand for instruction +v_cndmask_b16 v5.l, v255.l, 0.5, s3 +// W32: v_cndmask_b16 v5.l, v255.l, 0.5, s3 ; encoding: [0x05,0x00,0x5d,0xd6,0xff,0xe1,0x0d,0x00] +// W64-ERR: :[[@LINE-2]]:34: error: invalid operand for instruction -v_cndmask_b16 v5, s105, s105, s3 -// W32: v_cndmask_b16 v5, s105, s105, s3 ; encoding: [0x05,0x00,0x5d,0xd6,0x69,0xd2,0x0c,0x00] -// W64-ERR: :[[@LINE-2]]:19: error: invalid operand for instruction - -v_cndmask_b16 v5, vcc_hi, v2, s3 -// W32: v_cndmask_b16 v5, vcc_hi, v2, s3 ; encoding: [0x05,0x00,0x5d,0xd6,0x6b,0x04,0x0e,0x00] -// W64-ERR: :[[@LINE-2]]:19: error: invalid operand for instruction +v_cndmask_b16 v5.l, s105, s105, s3 +// W32: v_cndmask_b16 v5.l, s105, s105, s3 ; encoding: [0x05,0x00,0x5d,0xd6,0x69,0xd2,0x0c,0x00] +// W64-ERR: :[[@LINE-2]]:21: error: invalid operand for instruction -v_cndmask_b16 v5, ttmp15, ttmp15, s3 -// W32: v_cndmask_b16 v5, ttmp15, ttmp15, s3 ; encoding: [0x05,0x00,0x5d,0xd6,0x7b,0xf6,0x0c,0x00] -// W64-ERR: :[[@LINE-2]]:19: error: invalid operand for instruction +v_cndmask_b16 v5.l, vcc_hi, v2.l, s3 +// W32: v_cndmask_b16 v5.l, vcc_hi, v2.l, s3 ; encoding: [0x05,0x00,0x5d,0xd6,0x6b,0x04,0x0e,0x00] +// W64-ERR: :[[@LINE-2]]:21: error: invalid operand for instruction -v_cndmask_b16 v5, m0, v255, s3 
-// W32: v_cndmask_b16 v5, m0, v255, s3 ; encoding: [0x05,0x00,0x5d,0xd6,0x7d,0xfe,0x0f,0x00] -// W64-ERR: :[[@LINE-2]]:19: error: invalid operand for instruction +v_cndmask_b16 v5.l, ttmp15, ttmp15, s3 +// W32: v_cndmask_b16 v5.l, ttmp15, ttmp15, s3 ; encoding: [0x05,0x00,0x5d,0xd6,0x7b,0xf6,0x0c,0x00] +// W64-ERR: :[[@LINE-2]]:21: error: invalid operand for instruction -v_cndmask_b16 v5, exec_lo, exec_lo, s3 -// W32: v_cndmask_b16 v5, exec_lo, exec_lo, s3 ; encoding: [0x05,0x00,0x5d,0xd6,0x7e,0xfc,0x0c,0x00] -// W64-ERR: :[[@LINE-2]]:19: error: invalid operand for instruction +v_cndmask_b16 v5.l, m0, v255.l, s3 +// W32: v_cndmask_b16 v5.l, m0, v255.l, s3 ; encoding: [0x05,0x00,0x5d,0xd6,0x7d,0xfe,0x0f,0x00] +// W64-ERR: :[[@LINE-2]]:21: error: invalid operand for instruction -v_cndmask_b16 v5, exec_hi, exec_hi, s3 -// W32: v_cndmask_b16 v5, exec_hi, exec_hi, s3 ; encoding: [0x05,0x00,0x5d,0xd6,0x7f,0xfe,0x0c,0x00] -// W64-ERR: :[[@LINE-2]]:19: error: invalid operand for instruction +v_cndmask_b16 v5.l, exec_lo, exec_lo, s3 +// W32: v_cndmask_b16 v5.l, exec_lo, exec_lo, s3 ; encoding: [0x05,0x00,0x5d,0xd6,0x7e,0xfc,0x0c,0x00] +// W64-ERR: :[[@LINE-2]]:21: error: invalid operand for instruction -v_cndmask_b16 v5, null, m0, s105 -// W32: v_cndmask_b16 v5, null, m0, s105 ; encoding: [0x05,0x00,0x5d,0xd6,0x7c,0xfa,0xa4,0x01] -// W64-ERR: :[[@LINE-2]]:19: error: invalid operand for instruction +v_cndmask_b16 v5.l, exec_hi, exec_hi, s3 +// W32: v_cndmask_b16 v5.l, exec_hi, exec_hi, s3 ; encoding: [0x05,0x00,0x5d,0xd6,0x7f,0xfe,0x0c,0x00] +// W64-ERR: :[[@LINE-2]]:21: error: invalid operand for instruction -v_cndmask_b16 v5, -1, -|vcc_lo|, vcc_lo -// W32: v_cndmask_b16 v5, -1, -|vcc_lo|, vcc_lo ; encoding: [0x05,0x02,0x5d,0xd6,0xc1,0xd4,0xa8,0x41] -// W64-ERR: :[[@LINE-2]]:19: error: invalid operand for instruction +v_cndmask_b16 v5.l, null, m0, s105 +// W32: v_cndmask_b16 v5.l, null, m0, s105 ; encoding: [0x05,0x00,0x5d,0xd6,0x7c,0xfa,0xa4,0x01] +// W64-ERR: 
:[[@LINE-2]]:21: error: invalid operand for instruction -v_cndmask_b16 v5, 0.5, -1, vcc_hi -// W32: v_cndmask_b16 v5, 0.5, -1, vcc_hi ; encoding: [0x05,0x00,0x5d,0xd6,0xf0,0x82,0xad,0x01] -// W64-ERR: :[[@LINE-2]]:19: error: invalid operand for instruction +v_cndmask_b16 v5.l, -1, -|vcc_lo|, vcc_lo +// W32: v_cndmask_b16 v5.l, -1, -|vcc_lo|, vcc_lo ; encoding: [0x05,0x02,0x5d,0xd6,0xc1,0xd4,0xa8,0x41] +// W64-ERR: :[[@LINE-2]]:21: error: invalid operand for instruction -v_cndmask_b16 v5, -|src_scc|, null, ttmp15 -// W32: v_cndmask_b16 v5, -|src_scc|, null, ttmp15 ; encoding: [0x05,0x01,0x5d,0xd6,0xfd,0xf8,0xec,0x21] +v_cndmask_b16 v5.l, 0.5, -1, vcc_hi +// W32: v_cndmask_b16 v5.l, 0.5, -1, vcc_hi ; encoding: [0x05,0x00,0x5d,0xd6,0xf0,0x82,0xad,0x01] // W64-ERR: :[[@LINE-2]]:21: error: invalid operand for instruction -v_cndmask_b16 v5, v1, src_scc, s[6:7] -// W64: v_cndmask_b16 v5, v1, src_scc, s[6:7] ; encoding: [0x05,0x00,0x5d,0xd6,0x01,0xfb,0x19,0x00] -// W32-ERR: :[[@LINE-2]]:32: error: invalid operand for instruction +v_cndmask_b16 v5.l, -|src_scc|, null, ttmp15 +// W32: v_cndmask_b16 v5.l, -|src_scc|, null, ttmp15 ; encoding: [0x05,0x01,0x5d,0xd6,0xfd,0xf8,0xec,0x21] +// W64-ERR: :[[@LINE-2]]:23: error: invalid operand for instruction -v_cndmask_b16 v5, v255, 0.5, s[6:7] -// W64: v_cndmask_b16 v5, v255, 0.5, s[6:7] ; encoding: [0x05,0x00,0x5d,0xd6,0xff,0xe1,0x19,0x00] -// W32-ERR: :[[@LINE-2]]:30: error: invalid operand for instruction +v_cndmask_b16 v5.l, v1.l, src_scc, s[6:7] +// W64: v_cndmask_b16 v5.l, v1.l, src_scc, s[6:7] ; encoding: [0x05,0x00,0x5d,0xd6,0x01,0xfb,0x19,0x00] +// W32-ERR: :[[@LINE-2]]:36: error: invalid operand for instruction -v_cndmask_b16 v5, s105, s105, s[6:7] -// W64: v_cndmask_b16 v5, s105, s105, s[6:7] ; encoding: [0x05,0x00,0x5d,0xd6,0x69,0xd2,0x18,0x00] -// W32-ERR: :[[@LINE-2]]:19: error: invalid operand for instruction +v_cndmask_b16 v5.l, v255.l, 0.5, s[6:7] +// W64: v_cndmask_b16 v5.l, v255.l, 0.5, s[6:7] ; encoding: 
[0x05,0x00,0x5d,0xd6,0xff,0xe1,0x19,0x00] +// W32-ERR: :[[@LINE-2]]:34: error: invalid operand for instruction -v_cndmask_b16 v5, vcc_hi, v2, s[6:7] -// W64: v_cndmask_b16 v5, vcc_hi, v2, s[6:7] ; encoding: [0x05,0x00,0x5d,0xd6,0x6b,0x04,0x1a,0x00] -// W32-ERR: :[[@LINE-2]]:19: error: invalid operand for instruction +v_cndmask_b16 v5.l, s105, s105, s[6:7] +// W64: v_cndmask_b16 v5.l, s105, s105, s[6:7] ; encoding: [0x05,0x00,0x5d,0xd6,0x69,0xd2,0x18,0x00] +// W32-ERR: :[[@LINE-2]]:21: error: invalid operand for instruction -v_cndmask_b16 v5, ttmp15, ttmp15, s[6:7] -// W64: v_cndmask_b16 v5, ttmp15, ttmp15, s[6:7] ; encoding: [0x05,0x00,0x5d,0xd6,0x7b,0xf6,0x18,0x00] -// W32-ERR: :[[@LINE-2]]:19: error: invalid operand for instruction +v_cndmask_b16 v5.l, vcc_hi, v2.l, s[6:7] +// W64: v_cndmask_b16 v5.l, vcc_hi, v2.l, s[6:7] ; encoding: [0x05,0x00,0x5d,0xd6,0x6b,0x04,0x1a,0x00] +// W32-ERR: :[[@LINE-2]]:21: error: invalid operand for instruction -v_cndmask_b16 v5, m0, v255, s[6:7] -// W64: v_cndmask_b16 v5, m0, v255, s[6:7] ; encoding: [0x05,0x00,0x5d,0xd6,0x7d,0xfe,0x1b,0x00] -// W32-ERR: :[[@LINE-2]]:19: error: invalid operand for instruction +v_cndmask_b16 v5.l, ttmp15, ttmp15, s[6:7] +// W64: v_cndmask_b16 v5.l, ttmp15, ttmp15, s[6:7] ; encoding: [0x05,0x00,0x5d,0xd6,0x7b,0xf6,0x18,0x00] +// W32-ERR: :[[@LINE-2]]:21: error: invalid operand for instruction -v_cndmask_b16 v5, exec_lo, exec_lo, s[6:7] -// W64: v_cndmask_b16 v5, exec_lo, exec_lo, s[6:7] ; encoding: [0x05,0x00,0x5d,0xd6,0x7e,0xfc,0x18,0x00] -// W32-ERR: :[[@LINE-2]]:19: error: invalid operand for instruction +v_cndmask_b16 v5.l, m0, v255.l, s[6:7] +// W64: v_cndmask_b16 v5.l, m0, v255.l, s[6:7] ; encoding: [0x05,0x00,0x5d,0xd6,0x7d,0xfe,0x1b,0x00] +// W32-ERR: :[[@LINE-2]]:21: error: invalid operand for instruction -v_cndmask_b16 v5, exec_hi, exec_hi, s[6:7] -// W64: v_cndmask_b16 v5, exec_hi, exec_hi, s[6:7] ; encoding: [0x05,0x00,0x5d,0xd6,0x7f,0xfe,0x18,0x00] -// W32-ERR: :[[@LINE-2]]:19: error: 
invalid operand for instruction +v_cndmask_b16 v5.l, exec_lo, exec_lo, s[6:7] +// W64: v_cndmask_b16 v5.l, exec_lo, exec_lo, s[6:7] ; encoding: [0x05,0x00,0x5d,0xd6,0x7e,0xfc,0x18,0x00] +// W32-ERR: :[[@LINE-2]]:21: error: invalid operand for instruction -v_cndmask_b16 v5, null, m0, s[6:7] -// W64: v_cndmask_b16 v5, null, m0, s[6:7] ; encoding: [0x05,0x00,0x5d,0xd6,0x7c,0xfa,0x18,0x00] -// W32-ERR: :[[@LINE-2]]:19: error: invalid operand for instruction +v_cndmask_b16 v5.l, exec_hi, exec_hi, s[6:7] +// W64: v_cndmask_b16 v5.l, exec_hi, exec_hi, s[6:7] ; encoding: [0x05,0x00,0x5d,0xd6,0x7f,0xfe,0x18,0x00] +// W32-ERR: :[[@LINE-2]]:21: error: invalid operand for instruction -v_cndmask_b16 v5, -1, -|vcc_lo|, s[104:105] -// W64: v_cndmask_b16 v5, -1, -|vcc_lo|, s[104:105] ; encoding: [0x05,0x02,0x5d,0xd6,0xc1,0xd4,0xa0,0x41] -// W32-ERR: :[[@LINE-2]]:19: error: invalid operand for instruction +v_cndmask_b16 v5.l, null, m0, s[6:7] +// W64: v_cndmask_b16 v5.l, null, m0, s[6:7] ; encoding: [0x05,0x00,0x5d,0xd6,0x7c,0xfa,0x18,0x00] +// W32-ERR: :[[@LINE-2]]:21: error: invalid operand for instruction + +v_cndmask_b16 v5.l, -1, -|vcc_lo|, s[104:105] +// W64: v_cndmask_b16 v5.l, -1, -|vcc_lo|, s[104:105] ; encoding: [0x05,0x02,0x5d,0xd6,0xc1,0xd4,0xa0,0x41] +// W32-ERR: :[[@LINE-2]]:21: error: invalid operand for instruction + +v_cndmask_b16 v5.l, 0.5, -1, vcc +// W64: v_cndmask_b16 v5.l, 0.5, -1, vcc ; encoding: [0x05,0x00,0x5d,0xd6,0xf0,0x82,0xa9,0x01] +// W32-ERR: :[[@LINE-2]]:21: error: invalid operand for instruction + +v_cndmask_b16 v5.l, -|src_scc|, null, ttmp[14:15] +// W64: v_cndmask_b16 v5.l, -|src_scc|, null, ttmp[14:15] ; encoding: [0x05,0x01,0x5d,0xd6,0xfd,0xf8,0xe8,0x21] +// W32-ERR: :[[@LINE-2]]:23: error: invalid operand for instruction + +v_cndmask_b16 v255.l, -|0xfe0b|, -|vcc_hi|, null +// GFX12: v_cndmask_b16 v255.l, -|0xfe0b|, -|vcc_hi|, null ; encoding: [0xff,0x03,0x5d,0xd6,0xff,0xd6,0xf0,0x61,0x0b,0xfe,0x00,0x00] -v_cndmask_b16 v5, 0.5, -1, vcc -// W64: 
v_cndmask_b16 v5, 0.5, -1, vcc ; encoding: [0x05,0x00,0x5d,0xd6,0xf0,0x82,0xa9,0x01] -// W32-ERR: :[[@LINE-2]]:19: error: invalid operand for instruction +v_cndmask_b16 v5.l, v255.h, 0.5, s3 +// W32: v_cndmask_b16 v5.l, v255.h, 0.5, s3 ; encoding: [0x05,0x08,0x5d,0xd6,0xff,0xe1,0x0d,0x00] +// W64-ERR: :[[@LINE-2]]:34: error: invalid operand for instruction + +v_cndmask_b16 v5.l, m0, v255.h, s3 +// W32: v_cndmask_b16 v5.l, m0, v255.h, s3 ; encoding: [0x05,0x10,0x5d,0xd6,0x7d,0xfe,0x0f,0x00] +// W64-ERR: :[[@LINE-2]]:21: error: invalid operand for instruction + +v_cndmask_b16 v5.l, v255.h, 0.5, s[6:7] +// W64: v_cndmask_b16 v5.l, v255.h, 0.5, s[6:7] ; encoding: [0x05,0x08,0x5d,0xd6,0xff,0xe1,0x19,0x00] +// W32-ERR: :[[@LINE-2]]:34: error: invalid operand for instruction + +v_cndmask_b16 v5.l, m0, v255.h, s[6:7] +// W64: v_cndmask_b16 v5.l, m0, v255.h, s[6:7] ; encoding: [0x05,0x10,0x5d,0xd6,0x7d,0xfe,0x1b,0x00] +// W32-ERR: :[[@LINE-2]]:21: error: invalid operand for instruction + +v_cndmask_b16 v5.l, 0x3800, -1, vcc_lo +// W32: v_cndmask_b16 v5.l, 0x3800, -1, vcc_lo ; encoding: [0x05,0x00,0x5d,0xd6,0xff,0x82,0xa9,0x01,0x00,0x38,0x00,0x00] +// W64-ERR: :[[@LINE-2]]:21: error: invalid operand for instruction -v_cndmask_b16 v5, -|src_scc|, null, ttmp[14:15] -// W64: v_cndmask_b16 v5, -|src_scc|, null, ttmp[14:15] ; encoding: [0x05,0x01,0x5d,0xd6,0xfd,0xf8,0xe8,0x21] +v_cndmask_b16 v5.l, 0x3800, -1, vcc +// W64: v_cndmask_b16 v5.l, 0x3800, -1, vcc ; encoding: [0x05,0x00,0x5d,0xd6,0xff,0x82,0xa9,0x01,0x00,0x38,0x00,0x00] // W32-ERR: :[[@LINE-2]]:21: error: invalid operand for instruction -v_cndmask_b16 v255, -|0xfe0b|, -|vcc_hi|, null -// GFX12: v_cndmask_b16 v255, -|0xfe0b|, -|vcc_hi|, null ; encoding: [0xff,0x03,0x5d,0xd6,0xff,0xd6,0xf0,0x61,0x0b,0xfe,0x00,0x00] +v_cndmask_b16 v255.h, -|0xfe0b|, -|vcc_hi|, null +// GFX12: v_cndmask_b16 v255.h, -|0xfe0b|, -|vcc_hi|, null ; encoding: [0xff,0x43,0x5d,0xd6,0xff,0xd6,0xf0,0x61,0x0b,0xfe,0x00,0x00] v_cubeid_f32 v5, v1, v2, 
s3 // GFX12: v_cubeid_f32 v5, v1, v2, s3 ; encoding: [0x05,0x00,0x0c,0xd6,0x01,0x05,0x0e,0x00] diff --git a/llvm/test/MC/AMDGPU/gfx12_asm_vop3_dpp16.s b/llvm/test/MC/AMDGPU/gfx12_asm_vop3_dpp16.s index aa804cc302bf0..78ce7451c1ba7 100644 --- a/llvm/test/MC/AMDGPU/gfx12_asm_vop3_dpp16.s +++ b/llvm/test/MC/AMDGPU/gfx12_asm_vop3_dpp16.s @@ -869,128 +869,147 @@ v_bfm_b32_e64_dpp v5, v1, v2 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 v_bfm_b32_e64_dpp v255, v255, v255 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:0 fi:1 // GFX12: v_bfm_b32_e64_dpp v255, v255, v255 row_xmask:15 row_mask:0x3 bank_mask:0x0 fi:1 ; encoding: [0xff,0x00,0x1d,0xd7,0xfa,0xfe,0x03,0x00,0xff,0x6f,0x05,0x30] -v_cndmask_b16_e64_dpp v5, v1, v2, s3 quad_perm:[3,2,1,0] -// W32: v_cndmask_b16_e64_dpp v5, v1, v2, s3 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x5d,0xd6,0xfa,0x04,0x0e,0x00,0x01,0x1b,0x00,0xff] -// W64-ERR: :[[@LINE-2]]:35: error: invalid operand for instruction +v_cndmask_b16_e64_dpp v5.l, v1.l, v2.l, s3 quad_perm:[3,2,1,0] +// W32: v_cndmask_b16_e64_dpp v5.l, v1.l, v2.l, s3 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x5d,0xd6,0xfa,0x04,0x0e,0x00,0x01,0x1b,0x00,0xff] +// W64-ERR: :[[@LINE-2]]:41: error: invalid operand for instruction -v_cndmask_b16_e64_dpp v5, v1, v2, s3 quad_perm:[0,1,2,3] -// W32: v_cndmask_b16_e64_dpp v5, v1, v2, s3 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x5d,0xd6,0xfa,0x04,0x0e,0x00,0x01,0xe4,0x00,0xff] -// W64-ERR: :[[@LINE-2]]:35: error: invalid operand for instruction +v_cndmask_b16_e64_dpp v5.l, v1.l, v2.l, s3 quad_perm:[0,1,2,3] +// W32: v_cndmask_b16_e64_dpp v5.l, v1.l, v2.l, s3 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x5d,0xd6,0xfa,0x04,0x0e,0x00,0x01,0xe4,0x00,0xff] +// W64-ERR: :[[@LINE-2]]:41: error: invalid operand for instruction -v_cndmask_b16_e64_dpp v5, v1, v2, s3 row_mirror -// W32: v_cndmask_b16_e64_dpp v5, v1, v2, s3 
row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x5d,0xd6,0xfa,0x04,0x0e,0x00,0x01,0x40,0x01,0xff] -// W64-ERR: :[[@LINE-2]]:35: error: invalid operand for instruction +v_cndmask_b16_e64_dpp v5.l, v1.l, v2.l, s3 row_mirror +// W32: v_cndmask_b16_e64_dpp v5.l, v1.l, v2.l, s3 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x5d,0xd6,0xfa,0x04,0x0e,0x00,0x01,0x40,0x01,0xff] +// W64-ERR: :[[@LINE-2]]:41: error: invalid operand for instruction -v_cndmask_b16_e64_dpp v5, v1, s2, s3 row_mirror -// W32: v_cndmask_b16_e64_dpp v5, v1, s2, s3 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x5d,0xd6,0xfa,0x04,0x0c,0x00,0x01,0x40,0x01,0xff] -// W64-ERR: :[[@LINE-2]]:35: error: invalid operand for instruction +v_cndmask_b16_e64_dpp v5.l, v1.l, s2, s3 row_mirror +// W32: v_cndmask_b16_e64_dpp v5.l, v1.l, s2, s3 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x5d,0xd6,0xfa,0x04,0x0c,0x00,0x01,0x40,0x01,0xff] +// W64-ERR: :[[@LINE-2]]:39: error: invalid operand for instruction -v_cndmask_b16_e64_dpp v5, v1, 10, s3 row_mirror -// W32: v_cndmask_b16_e64_dpp v5, v1, 10, s3 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x5d,0xd6,0xfa,0x14,0x0d,0x00,0x01,0x40,0x01,0xff] -// W64-ERR: :[[@LINE-2]]:35: error: invalid operand for instruction +v_cndmask_b16_e64_dpp v5.l, v1.l, 10, s3 row_mirror +// W32: v_cndmask_b16_e64_dpp v5.l, v1.l, 10, s3 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x5d,0xd6,0xfa,0x14,0x0d,0x00,0x01,0x40,0x01,0xff] +// W64-ERR: :[[@LINE-2]]:39: error: invalid operand for instruction -v_cndmask_b16_e64_dpp v5, v1, v2, s3 row_half_mirror -// W32: v_cndmask_b16_e64_dpp v5, v1, v2, s3 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x5d,0xd6,0xfa,0x04,0x0e,0x00,0x01,0x41,0x01,0xff] -// W64-ERR: :[[@LINE-2]]:35: error: invalid operand for instruction +v_cndmask_b16_e64_dpp v5.l, v1.l, v2.l, s3 row_half_mirror +// W32: v_cndmask_b16_e64_dpp v5.l, v1.l, v2.l, s3 
row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x5d,0xd6,0xfa,0x04,0x0e,0x00,0x01,0x41,0x01,0xff] +// W64-ERR: :[[@LINE-2]]:41: error: invalid operand for instruction -v_cndmask_b16_e64_dpp v5, v1, v2, s3 row_shl:1 -// W32: v_cndmask_b16_e64_dpp v5, v1, v2, s3 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x5d,0xd6,0xfa,0x04,0x0e,0x00,0x01,0x01,0x01,0xff] -// W64-ERR: :[[@LINE-2]]:35: error: invalid operand for instruction +v_cndmask_b16_e64_dpp v5.l, v1.l, v2.l, s3 row_shl:1 +// W32: v_cndmask_b16_e64_dpp v5.l, v1.l, v2.l, s3 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x5d,0xd6,0xfa,0x04,0x0e,0x00,0x01,0x01,0x01,0xff] +// W64-ERR: :[[@LINE-2]]:41: error: invalid operand for instruction -v_cndmask_b16_e64_dpp v5, v1, v2, s3 row_shl:15 -// W32: v_cndmask_b16_e64_dpp v5, v1, v2, s3 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x5d,0xd6,0xfa,0x04,0x0e,0x00,0x01,0x0f,0x01,0xff] -// W64-ERR: :[[@LINE-2]]:35: error: invalid operand for instruction +v_cndmask_b16_e64_dpp v5.l, v1.l, v2.l, s3 row_shl:15 +// W32: v_cndmask_b16_e64_dpp v5.l, v1.l, v2.l, s3 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x5d,0xd6,0xfa,0x04,0x0e,0x00,0x01,0x0f,0x01,0xff] +// W64-ERR: :[[@LINE-2]]:41: error: invalid operand for instruction -v_cndmask_b16_e64_dpp v5, v1, v2, s3 row_shr:1 -// W32: v_cndmask_b16_e64_dpp v5, v1, v2, s3 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x5d,0xd6,0xfa,0x04,0x0e,0x00,0x01,0x11,0x01,0xff] -// W64-ERR: :[[@LINE-2]]:35: error: invalid operand for instruction +v_cndmask_b16_e64_dpp v5.l, v1.l, v2.l, s3 row_shr:1 +// W32: v_cndmask_b16_e64_dpp v5.l, v1.l, v2.l, s3 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x5d,0xd6,0xfa,0x04,0x0e,0x00,0x01,0x11,0x01,0xff] +// W64-ERR: :[[@LINE-2]]:41: error: invalid operand for instruction -v_cndmask_b16_e64_dpp v5, v1, v2, s3 row_shr:15 -// W32: v_cndmask_b16_e64_dpp v5, v1, v2, s3 row_shr:15 row_mask:0xf 
bank_mask:0xf ; encoding: [0x05,0x00,0x5d,0xd6,0xfa,0x04,0x0e,0x00,0x01,0x1f,0x01,0xff] -// W64-ERR: :[[@LINE-2]]:35: error: invalid operand for instruction +v_cndmask_b16_e64_dpp v5.l, v1.l, v2.l, s3 row_shr:15 +// W32: v_cndmask_b16_e64_dpp v5.l, v1.l, v2.l, s3 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x5d,0xd6,0xfa,0x04,0x0e,0x00,0x01,0x1f,0x01,0xff] +// W64-ERR: :[[@LINE-2]]:41: error: invalid operand for instruction -v_cndmask_b16_e64_dpp v5, v1, v2, s3 row_ror:1 -// W32: v_cndmask_b16_e64_dpp v5, v1, v2, s3 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x5d,0xd6,0xfa,0x04,0x0e,0x00,0x01,0x21,0x01,0xff] -// W64-ERR: :[[@LINE-2]]:35: error: invalid operand for instruction +v_cndmask_b16_e64_dpp v5.l, v1.l, v2.l, s3 row_ror:1 +// W32: v_cndmask_b16_e64_dpp v5.l, v1.l, v2.l, s3 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x5d,0xd6,0xfa,0x04,0x0e,0x00,0x01,0x21,0x01,0xff] +// W64-ERR: :[[@LINE-2]]:41: error: invalid operand for instruction -v_cndmask_b16_e64_dpp v5, v1, v2, s105 row_ror:15 -// W32: v_cndmask_b16_e64_dpp v5, v1, v2, s105 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x5d,0xd6,0xfa,0x04,0xa6,0x01,0x01,0x2f,0x01,0xff] -// W64-ERR: :[[@LINE-2]]:35: error: invalid operand for instruction +v_cndmask_b16_e64_dpp v5.l, v1.l, v2.l, s105 row_ror:15 +// W32: v_cndmask_b16_e64_dpp v5.l, v1.l, v2.l, s105 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x5d,0xd6,0xfa,0x04,0xa6,0x01,0x01,0x2f,0x01,0xff] +// W64-ERR: :[[@LINE-2]]:41: error: invalid operand for instruction -v_cndmask_b16_e64_dpp v5, v1, v2, vcc_hi row_share:0 row_mask:0xf bank_mask:0xf -// W32: v_cndmask_b16_e64_dpp v5, v1, v2, vcc_hi row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x5d,0xd6,0xfa,0x04,0xae,0x01,0x01,0x50,0x01,0xff] -// W64-ERR: :[[@LINE-2]]:35: error: invalid operand for instruction +v_cndmask_b16_e64_dpp v5.l, v1.l, v2.l, vcc_hi row_share:0 row_mask:0xf bank_mask:0xf +// W32: 
v_cndmask_b16_e64_dpp v5.l, v1.l, v2.l, vcc_hi row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x5d,0xd6,0xfa,0x04,0xae,0x01,0x01,0x50,0x01,0xff] +// W64-ERR: :[[@LINE-2]]:41: error: invalid operand for instruction -v_cndmask_b16_e64_dpp v5, |v1|, -v2, vcc_lo row_share:15 row_mask:0x0 bank_mask:0x1 -// W32: v_cndmask_b16_e64_dpp v5, |v1|, -v2, vcc_lo row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x05,0x01,0x5d,0xd6,0xfa,0x04,0xaa,0x41,0x01,0x5f,0x01,0x01] -// W64-ERR: :[[@LINE-2]]:38: error: invalid operand for instruction +v_cndmask_b16_e64_dpp v5.l, |v1.l|, -v2.l, vcc_lo row_share:15 row_mask:0x0 bank_mask:0x1 +// W32: v_cndmask_b16_e64_dpp v5.l, |v1.l|, -v2.l, vcc_lo row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x05,0x01,0x5d,0xd6,0xfa,0x04,0xaa,0x41,0x01,0x5f,0x01,0x01] +// W64-ERR: :[[@LINE-2]]:44: error: invalid operand for instruction -v_cndmask_b16_e64_dpp v5, -v1, |v2|, ttmp15 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 fi:0 -// W32: v_cndmask_b16_e64_dpp v5, -v1, |v2|, ttmp15 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 ; encoding: [0x05,0x02,0x5d,0xd6,0xfa,0x04,0xee,0x21,0x01,0x60,0x09,0x13] -// W64-ERR: :[[@LINE-2]]:38: error: invalid operand for instruction +v_cndmask_b16_e64_dpp v5.l, -v1.l, |v2.l|, ttmp15 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 fi:0 +// W32: v_cndmask_b16_e64_dpp v5.l, -v1.l, |v2.l|, ttmp15 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 ; encoding: [0x05,0x02,0x5d,0xd6,0xfa,0x04,0xee,0x21,0x01,0x60,0x09,0x13] +// W64-ERR: :[[@LINE-2]]:44: error: invalid operand for instruction -v_cndmask_b16_e64_dpp v5, v1, v2, s[6:7] quad_perm:[3,2,1,0] -// W64: v_cndmask_b16_e64_dpp v5, v1, v2, s[6:7] quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x5d,0xd6,0xfa,0x04,0x1a,0x00,0x01,0x1b,0x00,0xff] -// W32-ERR: :[[@LINE-2]]:35: error: invalid operand for instruction +v_cndmask_b16_e64_dpp v5.l, v1.l, v2.l, s[6:7] quad_perm:[3,2,1,0] +// W64: v_cndmask_b16_e64_dpp 
v5.l, v1.l, v2.l, s[6:7] quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x5d,0xd6,0xfa,0x04,0x1a,0x00,0x01,0x1b,0x00,0xff] +// W32-ERR: :[[@LINE-2]]:41: error: invalid operand for instruction -v_cndmask_b16_e64_dpp v5, v1, v2, s[6:7] quad_perm:[0,1,2,3] -// W64: v_cndmask_b16_e64_dpp v5, v1, v2, s[6:7] quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x5d,0xd6,0xfa,0x04,0x1a,0x00,0x01,0xe4,0x00,0xff] -// W32-ERR: :[[@LINE-2]]:35: error: invalid operand for instruction +v_cndmask_b16_e64_dpp v5.l, v1.l, v2.l, s[6:7] quad_perm:[0,1,2,3] +// W64: v_cndmask_b16_e64_dpp v5.l, v1.l, v2.l, s[6:7] quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x5d,0xd6,0xfa,0x04,0x1a,0x00,0x01,0xe4,0x00,0xff] +// W32-ERR: :[[@LINE-2]]:41: error: invalid operand for instruction -v_cndmask_b16_e64_dpp v5, v1, v2, s[6:7] row_mirror -// W64: v_cndmask_b16_e64_dpp v5, v1, v2, s[6:7] row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x5d,0xd6,0xfa,0x04,0x1a,0x00,0x01,0x40,0x01,0xff] -// W32-ERR: :[[@LINE-2]]:35: error: invalid operand for instruction +v_cndmask_b16_e64_dpp v5.l, v1.l, v2.l, s[6:7] row_mirror +// W64: v_cndmask_b16_e64_dpp v5.l, v1.l, v2.l, s[6:7] row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x5d,0xd6,0xfa,0x04,0x1a,0x00,0x01,0x40,0x01,0xff] +// W32-ERR: :[[@LINE-2]]:41: error: invalid operand for instruction -v_cndmask_b16_e64_dpp v5, v1, v2, s[6:7] row_half_mirror -// W64: v_cndmask_b16_e64_dpp v5, v1, v2, s[6:7] row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x5d,0xd6,0xfa,0x04,0x1a,0x00,0x01,0x41,0x01,0xff] -// W32-ERR: :[[@LINE-2]]:35: error: invalid operand for instruction +v_cndmask_b16_e64_dpp v5.l, v1.l, v2.l, s[6:7] row_half_mirror +// W64: v_cndmask_b16_e64_dpp v5.l, v1.l, v2.l, s[6:7] row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x5d,0xd6,0xfa,0x04,0x1a,0x00,0x01,0x41,0x01,0xff] +// W32-ERR: :[[@LINE-2]]:41: error: invalid operand for 
instruction -v_cndmask_b16_e64_dpp v5, v1, s2, s[6:7] row_half_mirror -// W64: v_cndmask_b16_e64_dpp v5, v1, s2, s[6:7] row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x5d,0xd6,0xfa,0x04,0x18,0x00,0x01,0x41,0x01,0xff] -// W32-ERR: :[[@LINE-2]]:35: error: invalid operand for instruction +v_cndmask_b16_e64_dpp v5.l, v1.l, s2, s[6:7] row_half_mirror +// W64: v_cndmask_b16_e64_dpp v5.l, v1.l, s2, s[6:7] row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x5d,0xd6,0xfa,0x04,0x18,0x00,0x01,0x41,0x01,0xff] +// W32-ERR: :[[@LINE-2]]:39: error: invalid operand for instruction -v_cndmask_b16_e64_dpp v5, v1, 10, s[6:7] row_half_mirror -// W64: v_cndmask_b16_e64_dpp v5, v1, 10, s[6:7] row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x5d,0xd6,0xfa,0x14,0x19,0x00,0x01,0x41,0x01,0xff] -// W32-ERR: :[[@LINE-2]]:35: error: invalid operand for instruction +v_cndmask_b16_e64_dpp v5.l, v1.l, 10, s[6:7] row_half_mirror +// W64: v_cndmask_b16_e64_dpp v5.l, v1.l, 10, s[6:7] row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x5d,0xd6,0xfa,0x14,0x19,0x00,0x01,0x41,0x01,0xff] +// W32-ERR: :[[@LINE-2]]:39: error: invalid operand for instruction -v_cndmask_b16_e64_dpp v5, v1, v2, s[6:7] row_shl:1 -// W64: v_cndmask_b16_e64_dpp v5, v1, v2, s[6:7] row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x5d,0xd6,0xfa,0x04,0x1a,0x00,0x01,0x01,0x01,0xff] -// W32-ERR: :[[@LINE-2]]:35: error: invalid operand for instruction +v_cndmask_b16_e64_dpp v5.l, v1.l, v2.l, s[6:7] row_shl:1 +// W64: v_cndmask_b16_e64_dpp v5.l, v1.l, v2.l, s[6:7] row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x5d,0xd6,0xfa,0x04,0x1a,0x00,0x01,0x01,0x01,0xff] +// W32-ERR: :[[@LINE-2]]:41: error: invalid operand for instruction -v_cndmask_b16_e64_dpp v5, v1, v2, s[6:7] row_shl:15 -// W64: v_cndmask_b16_e64_dpp v5, v1, v2, s[6:7] row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x5d,0xd6,0xfa,0x04,0x1a,0x00,0x01,0x0f,0x01,0xff] -// 
W32-ERR: :[[@LINE-2]]:35: error: invalid operand for instruction +v_cndmask_b16_e64_dpp v5.l, v1.l, v2.l, s[6:7] row_shl:15 +// W64: v_cndmask_b16_e64_dpp v5.l, v1.l, v2.l, s[6:7] row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x5d,0xd6,0xfa,0x04,0x1a,0x00,0x01,0x0f,0x01,0xff] +// W32-ERR: :[[@LINE-2]]:41: error: invalid operand for instruction -v_cndmask_b16_e64_dpp v5, v1, v2, s[6:7] row_shr:1 -// W64: v_cndmask_b16_e64_dpp v5, v1, v2, s[6:7] row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x5d,0xd6,0xfa,0x04,0x1a,0x00,0x01,0x11,0x01,0xff] -// W32-ERR: :[[@LINE-2]]:35: error: invalid operand for instruction +v_cndmask_b16_e64_dpp v5.l, v1.l, v2.l, s[6:7] row_shr:1 +// W64: v_cndmask_b16_e64_dpp v5.l, v1.l, v2.l, s[6:7] row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x5d,0xd6,0xfa,0x04,0x1a,0x00,0x01,0x11,0x01,0xff] +// W32-ERR: :[[@LINE-2]]:41: error: invalid operand for instruction -v_cndmask_b16_e64_dpp v5, v1, v2, s[6:7] row_shr:15 -// W64: v_cndmask_b16_e64_dpp v5, v1, v2, s[6:7] row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x5d,0xd6,0xfa,0x04,0x1a,0x00,0x01,0x1f,0x01,0xff] -// W32-ERR: :[[@LINE-2]]:35: error: invalid operand for instruction +v_cndmask_b16_e64_dpp v5.l, v1.l, v2.l, s[6:7] row_shr:15 +// W64: v_cndmask_b16_e64_dpp v5.l, v1.l, v2.l, s[6:7] row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x5d,0xd6,0xfa,0x04,0x1a,0x00,0x01,0x1f,0x01,0xff] +// W32-ERR: :[[@LINE-2]]:41: error: invalid operand for instruction -v_cndmask_b16_e64_dpp v5, v1, v2, s[6:7] row_ror:1 -// W64: v_cndmask_b16_e64_dpp v5, v1, v2, s[6:7] row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x5d,0xd6,0xfa,0x04,0x1a,0x00,0x01,0x21,0x01,0xff] -// W32-ERR: :[[@LINE-2]]:35: error: invalid operand for instruction +v_cndmask_b16_e64_dpp v5.l, v1.l, v2.l, s[6:7] row_ror:1 +// W64: v_cndmask_b16_e64_dpp v5.l, v1.l, v2.l, s[6:7] row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: 
[0x05,0x00,0x5d,0xd6,0xfa,0x04,0x1a,0x00,0x01,0x21,0x01,0xff] +// W32-ERR: :[[@LINE-2]]:41: error: invalid operand for instruction -v_cndmask_b16_e64_dpp v5, v1, v2, s[6:7] row_ror:15 -// W64: v_cndmask_b16_e64_dpp v5, v1, v2, s[6:7] row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x5d,0xd6,0xfa,0x04,0x1a,0x00,0x01,0x2f,0x01,0xff] -// W32-ERR: :[[@LINE-2]]:35: error: invalid operand for instruction +v_cndmask_b16_e64_dpp v5.l, v1.l, v2.l, s[6:7] row_ror:15 +// W64: v_cndmask_b16_e64_dpp v5.l, v1.l, v2.l, s[6:7] row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x5d,0xd6,0xfa,0x04,0x1a,0x00,0x01,0x2f,0x01,0xff] +// W32-ERR: :[[@LINE-2]]:41: error: invalid operand for instruction -v_cndmask_b16_e64_dpp v5, v1, v2, s[104:105] row_share:0 row_mask:0xf bank_mask:0xf -// W64: v_cndmask_b16_e64_dpp v5, v1, v2, s[104:105] row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x5d,0xd6,0xfa,0x04,0xa2,0x01,0x01,0x50,0x01,0xff] -// W32-ERR: :[[@LINE-2]]:35: error: invalid operand for instruction +v_cndmask_b16_e64_dpp v5.l, v1.l, v2.l, s[104:105] row_share:0 row_mask:0xf bank_mask:0xf +// W64: v_cndmask_b16_e64_dpp v5.l, v1.l, v2.l, s[104:105] row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x5d,0xd6,0xfa,0x04,0xa2,0x01,0x01,0x50,0x01,0xff] +// W32-ERR: :[[@LINE-2]]:41: error: invalid operand for instruction -v_cndmask_b16_e64_dpp v5, |v1|, -v2, vcc row_share:15 row_mask:0x0 bank_mask:0x1 -// W64: v_cndmask_b16_e64_dpp v5, |v1|, -v2, vcc row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x05,0x01,0x5d,0xd6,0xfa,0x04,0xaa,0x41,0x01,0x5f,0x01,0x01] -// W32-ERR: :[[@LINE-2]]:38: error: invalid operand for instruction +v_cndmask_b16_e64_dpp v5.l, |v1.l|, -v2.l, vcc row_share:15 row_mask:0x0 bank_mask:0x1 +// W64: v_cndmask_b16_e64_dpp v5.l, |v1.l|, -v2.l, vcc row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x05,0x01,0x5d,0xd6,0xfa,0x04,0xaa,0x41,0x01,0x5f,0x01,0x01] +// W32-ERR: :[[@LINE-2]]:44: error: invalid operand for 
instruction -v_cndmask_b16_e64_dpp v5, -v1, |v2|, ttmp[14:15] row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 fi:0 -// W64: v_cndmask_b16_e64_dpp v5, -v1, |v2|, ttmp[14:15] row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 ; encoding: [0x05,0x02,0x5d,0xd6,0xfa,0x04,0xea,0x21,0x01,0x60,0x09,0x13] -// W32-ERR: :[[@LINE-2]]:38: error: invalid operand for instruction +v_cndmask_b16_e64_dpp v5.l, -v1.l, |v2.l|, ttmp[14:15] row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 fi:0 +// W64: v_cndmask_b16_e64_dpp v5.l, -v1.l, |v2.l|, ttmp[14:15] row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 ; encoding: [0x05,0x02,0x5d,0xd6,0xfa,0x04,0xea,0x21,0x01,0x60,0x09,0x13] +// W32-ERR: :[[@LINE-2]]:44: error: invalid operand for instruction -v_cndmask_b16_e64_dpp v255, -|v255|, -|v255|, null row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:0 fi:1 -// GFX12: v_cndmask_b16_e64_dpp v255, -|v255|, -|v255|, null row_xmask:15 row_mask:0x3 bank_mask:0x0 fi:1 ; encoding: [0xff,0x03,0x5d,0xd6,0xfa,0xfe,0xf3,0x61,0xff,0x6f,0x05,0x30] +v_cndmask_b16_e64_dpp v255.l, -|v255.l|, -|v255.l|, null row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:0 fi:1 +// GFX12: v_cndmask_b16_e64_dpp v255.l, -|v255.l|, -|v255.l|, null row_xmask:15 row_mask:0x3 bank_mask:0x0 fi:1 ; encoding: [0xff,0x03,0x5d,0xd6,0xfa,0xfe,0xf3,0x61,0xff,0x6f,0x05,0x30] + +v_cndmask_b16_e64_dpp v5.l, |v1.h|, -v2.l, vcc_hi row_share:15 row_mask:0x0 bank_mask:0x1 +// W32: v_cndmask_b16_e64_dpp v5.l, |v1.h|, -v2.l, vcc_hi row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x05,0x09,0x5d,0xd6,0xfa,0x04,0xae,0x41,0x01,0x5f,0x01,0x01] +// W64-ERR: :[[@LINE-2]]:44: error: invalid operand for instruction + +v_cndmask_b16_e64_dpp v5.l, -v1.l, |v2.h|, ttmp15 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 fi:0 +// W32: v_cndmask_b16_e64_dpp v5.l, -v1.l, |v2.h|, ttmp15 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 ; encoding: [0x05,0x12,0x5d,0xd6,0xfa,0x04,0xee,0x21,0x01,0x60,0x09,0x13] +// W64-ERR: :[[@LINE-2]]:44: 
error: invalid operand for instruction + +v_cndmask_b16_e64_dpp v5.l, |v1.h|, -v2.l, vcc row_share:15 row_mask:0x0 bank_mask:0x1 +// W64: v_cndmask_b16_e64_dpp v5.l, |v1.h|, -v2.l, vcc row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x05,0x09,0x5d,0xd6,0xfa,0x04,0xaa,0x41,0x01,0x5f,0x01,0x01] +// W32-ERR: :[[@LINE-2]]:44: error: invalid operand for instruction + +v_cndmask_b16_e64_dpp v5.l, -v1.l, |v2.h|, ttmp[14:15] row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 fi:0 +// W64: v_cndmask_b16_e64_dpp v5.l, -v1.l, |v2.h|, ttmp[14:15] row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 ; encoding: [0x05,0x12,0x5d,0xd6,0xfa,0x04,0xea,0x21,0x01,0x60,0x09,0x13] +// W32-ERR: :[[@LINE-2]]:44: error: invalid operand for instruction + +v_cndmask_b16_e64_dpp v255.h, -|v255.l|, -|v255.l|, null row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:0 fi:1 +// GFX12: v_cndmask_b16_e64_dpp v255.h, -|v255.l|, -|v255.l|, null row_xmask:15 row_mask:0x3 bank_mask:0x0 fi:1 ; encoding: [0xff,0x43,0x5d,0xd6,0xfa,0xfe,0xf3,0x61,0xff,0x6f,0x05,0x30] v_cubeid_f32_e64_dpp v5, v1, v2, v3 quad_perm:[3,2,1,0] // GFX12: v_cubeid_f32_e64_dpp v5, v1, v2, v3 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x0c,0xd6,0xfa,0x04,0x0e,0x04,0x01,0x1b,0x00,0xff] diff --git a/llvm/test/MC/AMDGPU/gfx12_asm_vop3_dpp8.s b/llvm/test/MC/AMDGPU/gfx12_asm_vop3_dpp8.s index e93a65ec92e73..b41f92b889368 100644 --- a/llvm/test/MC/AMDGPU/gfx12_asm_vop3_dpp8.s +++ b/llvm/test/MC/AMDGPU/gfx12_asm_vop3_dpp8.s @@ -516,56 +516,75 @@ v_bfm_b32_e64_dpp v5, v1, v2 dpp8:[7,6,5,4,3,2,1,0] fi:1 v_bfm_b32_e64_dpp v255, v255, v255 dpp8:[0,0,0,0,0,0,0,0] fi:0 // GFX12: v_bfm_b32_e64_dpp v255, v255, v255 dpp8:[0,0,0,0,0,0,0,0] ; encoding: [0xff,0x00,0x1d,0xd7,0xe9,0xfe,0x03,0x00,0xff,0x00,0x00,0x00] -v_cndmask_b16_e64_dpp v5, v1, v2, s3 dpp8:[7,6,5,4,3,2,1,0] -// W32: v_cndmask_b16_e64_dpp v5, v1, v2, s3 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x5d,0xd6,0xe9,0x04,0x0e,0x00,0x01,0x77,0x39,0x05] -// 
W64-ERR: :[[@LINE-2]]:35: error: invalid operand for instruction +v_cndmask_b16_e64_dpp v5.l, v1.l, v2.l, s3 dpp8:[7,6,5,4,3,2,1,0] +// W32: v_cndmask_b16_e64_dpp v5.l, v1.l, v2.l, s3 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x5d,0xd6,0xe9,0x04,0x0e,0x00,0x01,0x77,0x39,0x05] +// W64-ERR: :[[@LINE-2]]:41: error: invalid operand for instruction -v_cndmask_b16_e64_dpp v5, v1, s2, s3 dpp8:[7,6,5,4,3,2,1,0] -// W32: v_cndmask_b16_e64_dpp v5, v1, s2, s3 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x5d,0xd6,0xe9,0x04,0x0c,0x00,0x01,0x77,0x39,0x05] -// W64-ERR: :[[@LINE-2]]:35: error: invalid operand for instruction +v_cndmask_b16_e64_dpp v5.l, v1.l, s2, s3 dpp8:[7,6,5,4,3,2,1,0] +// W32: v_cndmask_b16_e64_dpp v5.l, v1.l, s2, s3 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x5d,0xd6,0xe9,0x04,0x0c,0x00,0x01,0x77,0x39,0x05] +// W64-ERR: :[[@LINE-2]]:39: error: invalid operand for instruction -v_cndmask_b16_e64_dpp v5, v1, 10, s3 dpp8:[7,6,5,4,3,2,1,0] -// W32: v_cndmask_b16_e64_dpp v5, v1, 10, s3 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x5d,0xd6,0xe9,0x14,0x0d,0x00,0x01,0x77,0x39,0x05] -// W64-ERR: :[[@LINE-2]]:35: error: invalid operand for instruction +v_cndmask_b16_e64_dpp v5.l, v1.l, 10, s3 dpp8:[7,6,5,4,3,2,1,0] +// W32: v_cndmask_b16_e64_dpp v5.l, v1.l, 10, s3 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x5d,0xd6,0xe9,0x14,0x0d,0x00,0x01,0x77,0x39,0x05] +// W64-ERR: :[[@LINE-2]]:39: error: invalid operand for instruction -v_cndmask_b16_e64_dpp v5, v1, v2, s105 dpp8:[7,6,5,4,3,2,1,0] -// W32: v_cndmask_b16_e64_dpp v5, v1, v2, s105 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x5d,0xd6,0xe9,0x04,0xa6,0x01,0x01,0x77,0x39,0x05] -// W64-ERR: :[[@LINE-2]]:35: error: invalid operand for instruction +v_cndmask_b16_e64_dpp v5.l, v1.l, v2.l, s105 dpp8:[7,6,5,4,3,2,1,0] +// W32: v_cndmask_b16_e64_dpp v5.l, v1.l, v2.l, s105 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x5d,0xd6,0xe9,0x04,0xa6,0x01,0x01,0x77,0x39,0x05] +// W64-ERR: :[[@LINE-2]]:41: error: invalid 
operand for instruction -v_cndmask_b16_e64_dpp v5, v1, v2, vcc_hi dpp8:[7,6,5,4,3,2,1,0] -// W32: v_cndmask_b16_e64_dpp v5, v1, v2, vcc_hi dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x5d,0xd6,0xe9,0x04,0xae,0x01,0x01,0x77,0x39,0x05] -// W64-ERR: :[[@LINE-2]]:35: error: invalid operand for instruction +v_cndmask_b16_e64_dpp v5.l, v1.l, v2.l, vcc_hi dpp8:[7,6,5,4,3,2,1,0] +// W32: v_cndmask_b16_e64_dpp v5.l, v1.l, v2.l, vcc_hi dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x5d,0xd6,0xe9,0x04,0xae,0x01,0x01,0x77,0x39,0x05] +// W64-ERR: :[[@LINE-2]]:41: error: invalid operand for instruction -v_cndmask_b16_e64_dpp v5, |v1|, -v2, vcc_lo dpp8:[7,6,5,4,3,2,1,0] -// W32: v_cndmask_b16_e64_dpp v5, |v1|, -v2, vcc_lo dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x01,0x5d,0xd6,0xe9,0x04,0xaa,0x41,0x01,0x77,0x39,0x05] -// W64-ERR: :[[@LINE-2]]:38: error: invalid operand for instruction +v_cndmask_b16_e64_dpp v5.l, |v1.l|, -v2.l, vcc_lo dpp8:[7,6,5,4,3,2,1,0] +// W32: v_cndmask_b16_e64_dpp v5.l, |v1.l|, -v2.l, vcc_lo dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x01,0x5d,0xd6,0xe9,0x04,0xaa,0x41,0x01,0x77,0x39,0x05] +// W64-ERR: :[[@LINE-2]]:44: error: invalid operand for instruction -v_cndmask_b16_e64_dpp v5, -v1, |v2|, ttmp15 dpp8:[7,6,5,4,3,2,1,0] fi:1 -// W32: v_cndmask_b16_e64_dpp v5, -v1, |v2|, ttmp15 dpp8:[7,6,5,4,3,2,1,0] fi:1 ; encoding: [0x05,0x02,0x5d,0xd6,0xea,0x04,0xee,0x21,0x01,0x77,0x39,0x05] -// W64-ERR: :[[@LINE-2]]:38: error: invalid operand for instruction +v_cndmask_b16_e64_dpp v5.l, -v1.l, |v2.l|, ttmp15 dpp8:[7,6,5,4,3,2,1,0] fi:1 +// W32: v_cndmask_b16_e64_dpp v5.l, -v1.l, |v2.l|, ttmp15 dpp8:[7,6,5,4,3,2,1,0] fi:1 ; encoding: [0x05,0x02,0x5d,0xd6,0xea,0x04,0xee,0x21,0x01,0x77,0x39,0x05] +// W64-ERR: :[[@LINE-2]]:44: error: invalid operand for instruction -v_cndmask_b16_e64_dpp v5, v1, v2, s[6:7] dpp8:[7,6,5,4,3,2,1,0] -// W64: v_cndmask_b16_e64_dpp v5, v1, v2, s[6:7] dpp8:[7,6,5,4,3,2,1,0] ; encoding: 
[0x05,0x00,0x5d,0xd6,0xe9,0x04,0x1a,0x00,0x01,0x77,0x39,0x05] -// W32-ERR: :[[@LINE-2]]:35: error: invalid operand for instruction +v_cndmask_b16_e64_dpp v5.l, v1.l, v2.l, s[6:7] dpp8:[7,6,5,4,3,2,1,0] +// W64: v_cndmask_b16_e64_dpp v5.l, v1.l, v2.l, s[6:7] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x5d,0xd6,0xe9,0x04,0x1a,0x00,0x01,0x77,0x39,0x05] +// W32-ERR: :[[@LINE-2]]:41: error: invalid operand for instruction -v_cndmask_b16_e64_dpp v5, v1, v2, s[104:105] dpp8:[7,6,5,4,3,2,1,0] -// W64: v_cndmask_b16_e64_dpp v5, v1, v2, s[104:105] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x5d,0xd6,0xe9,0x04,0xa2,0x01,0x01,0x77,0x39,0x05] -// W32-ERR: :[[@LINE-2]]:35: error: invalid operand for instruction +v_cndmask_b16_e64_dpp v5.l, v1.l, v2.l, s[104:105] dpp8:[7,6,5,4,3,2,1,0] +// W64: v_cndmask_b16_e64_dpp v5.l, v1.l, v2.l, s[104:105] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x5d,0xd6,0xe9,0x04,0xa2,0x01,0x01,0x77,0x39,0x05] +// W32-ERR: :[[@LINE-2]]:41: error: invalid operand for instruction -v_cndmask_b16_e64_dpp v5, |v1|, -v2, vcc dpp8:[7,6,5,4,3,2,1,0] -// W64: v_cndmask_b16_e64_dpp v5, |v1|, -v2, vcc dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x01,0x5d,0xd6,0xe9,0x04,0xaa,0x41,0x01,0x77,0x39,0x05] -// W32-ERR: :[[@LINE-2]]:38: error: invalid operand for instruction +v_cndmask_b16_e64_dpp v5.l, |v1.l|, -v2.l, vcc dpp8:[7,6,5,4,3,2,1,0] +// W64: v_cndmask_b16_e64_dpp v5.l, |v1.l|, -v2.l, vcc dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x01,0x5d,0xd6,0xe9,0x04,0xaa,0x41,0x01,0x77,0x39,0x05] +// W32-ERR: :[[@LINE-2]]:44: error: invalid operand for instruction -v_cndmask_b16_e64_dpp v5, -v1, |v2|, ttmp[14:15] dpp8:[7,6,5,4,3,2,1,0] fi:1 -// W64: v_cndmask_b16_e64_dpp v5, -v1, |v2|, ttmp[14:15] dpp8:[7,6,5,4,3,2,1,0] fi:1 ; encoding: [0x05,0x02,0x5d,0xd6,0xea,0x04,0xea,0x21,0x01,0x77,0x39,0x05] -// W32-ERR: :[[@LINE-2]]:38: error: invalid operand for instruction +v_cndmask_b16_e64_dpp v5.l, -v1.l, |v2.l|, ttmp[14:15] dpp8:[7,6,5,4,3,2,1,0] fi:1 +// W64: 
v_cndmask_b16_e64_dpp v5.l, -v1.l, |v2.l|, ttmp[14:15] dpp8:[7,6,5,4,3,2,1,0] fi:1 ; encoding: [0x05,0x02,0x5d,0xd6,0xea,0x04,0xea,0x21,0x01,0x77,0x39,0x05] +// W32-ERR: :[[@LINE-2]]:44: error: invalid operand for instruction -v_cndmask_b16_e64_dpp v5, -v1, |s2|, ttmp[14:15] dpp8:[7,6,5,4,3,2,1,0] fi:1 -// W64: v_cndmask_b16_e64_dpp v5, -v1, |s2|, ttmp[14:15] dpp8:[7,6,5,4,3,2,1,0] fi:1 ; encoding: [0x05,0x02,0x5d,0xd6,0xea,0x04,0xe8,0x21,0x01,0x77,0x39,0x05] -// W32-ERR: :[[@LINE-2]]:38: error: invalid operand for instruction +v_cndmask_b16_e64_dpp v5.l, -v1.l, |s2|, ttmp[14:15] dpp8:[7,6,5,4,3,2,1,0] fi:1 +// W64: v_cndmask_b16_e64_dpp v5.l, -v1.l, |s2|, ttmp[14:15] dpp8:[7,6,5,4,3,2,1,0] fi:1 ; encoding: [0x05,0x02,0x5d,0xd6,0xea,0x04,0xe8,0x21,0x01,0x77,0x39,0x05] +// W32-ERR: :[[@LINE-2]]:42: error: invalid operand for instruction -v_cndmask_b16_e64_dpp v255, -|v255|, -|v255|, null dpp8:[0,0,0,0,0,0,0,0] fi:0 -// GFX12: v_cndmask_b16_e64_dpp v255, -|v255|, -|v255|, null dpp8:[0,0,0,0,0,0,0,0] ; encoding: [0xff,0x03,0x5d,0xd6,0xe9,0xfe,0xf3,0x61,0xff,0x00,0x00,0x00] +v_cndmask_b16_e64_dpp v255.l, -|v255.l|, -|v255.l|, null dpp8:[0,0,0,0,0,0,0,0] fi:0 +// GFX12: v_cndmask_b16_e64_dpp v255.l, -|v255.l|, -|v255.l|, null dpp8:[0,0,0,0,0,0,0,0] ; encoding: [0xff,0x03,0x5d,0xd6,0xe9,0xfe,0xf3,0x61,0xff,0x00,0x00,0x00] + +v_cndmask_b16_e64_dpp v5.l, |v1.h|, -v2.l, vcc_hi dpp8:[7,6,5,4,3,2,1,0] +// W32: v_cndmask_b16_e64_dpp v5.l, |v1.h|, -v2.l, vcc_hi dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x09,0x5d,0xd6,0xe9,0x04,0xae,0x41,0x01,0x77,0x39,0x05] +// W64-ERR: :[[@LINE-2]]:44: error: invalid operand for instruction + +v_cndmask_b16_e64_dpp v5.l, -v1.l, |v2.h|, ttmp15 dpp8:[7,6,5,4,3,2,1,0] fi:1 +// W32: v_cndmask_b16_e64_dpp v5.l, -v1.l, |v2.h|, ttmp15 dpp8:[7,6,5,4,3,2,1,0] fi:1 ; encoding: [0x05,0x12,0x5d,0xd6,0xea,0x04,0xee,0x21,0x01,0x77,0x39,0x05] +// W64-ERR: :[[@LINE-2]]:44: error: invalid operand for instruction + +v_cndmask_b16_e64_dpp v5.l, |v1.h|, -v2.l, vcc 
dpp8:[7,6,5,4,3,2,1,0] +// W64: v_cndmask_b16_e64_dpp v5.l, |v1.h|, -v2.l, vcc dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x09,0x5d,0xd6,0xe9,0x04,0xaa,0x41,0x01,0x77,0x39,0x05] +// W32-ERR: :[[@LINE-2]]:44: error: invalid operand for instruction + +v_cndmask_b16_e64_dpp v5.l, -v1.l, |v2.h|, ttmp[14:15] dpp8:[7,6,5,4,3,2,1,0] fi:1 +// W64: v_cndmask_b16_e64_dpp v5.l, -v1.l, |v2.h|, ttmp[14:15] dpp8:[7,6,5,4,3,2,1,0] fi:1 ; encoding: [0x05,0x12,0x5d,0xd6,0xea,0x04,0xea,0x21,0x01,0x77,0x39,0x05] +// W32-ERR: :[[@LINE-2]]:44: error: invalid operand for instruction + +v_cndmask_b16_e64_dpp v255.h, -|v255.l|, -|v255.l|, null dpp8:[0,0,0,0,0,0,0,0] fi:0 +// GFX12: v_cndmask_b16_e64_dpp v255.h, -|v255.l|, -|v255.l|, null dpp8:[0,0,0,0,0,0,0,0] ; encoding: [0xff,0x43,0x5d,0xd6,0xe9,0xfe,0xf3,0x61,0xff,0x00,0x00,0x00] v_cubeid_f32_e64_dpp v5, v1, v2, v3 dpp8:[7,6,5,4,3,2,1,0] // GFX12: v_cubeid_f32_e64_dpp v5, v1, v2, v3 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x0c,0xd6,0xe9,0x04,0x0e,0x04,0x01,0x77,0x39,0x05] diff --git a/llvm/test/MC/AMDGPU/gfx12_asm_vop3cx.s b/llvm/test/MC/AMDGPU/gfx12_asm_vop3cx.s index 476ea846f603a..6730482540060 100644 --- a/llvm/test/MC/AMDGPU/gfx12_asm_vop3cx.s +++ b/llvm/test/MC/AMDGPU/gfx12_asm_vop3cx.s @@ -1775,11 +1775,11 @@ v_cmpx_lg_f64_e64 -|src_scc|, -|exec| v_cmpx_lg_f64_e64 0xaf123456, -|vcc| clamp // GFX12: v_cmpx_lg_f64_e64 0xaf123456, -|vcc| clamp ; encoding: [0x7e,0x82,0xa5,0xd4,0xff,0xd4,0x00,0x40,0x56,0x34,0x12,0xaf] -v_cmpx_lt_f16_e64 v1, v2 -// GFX12: v_cmpx_lt_f16_e64 v1, v2 ; encoding: [0x7e,0x00,0x81,0xd4,0x01,0x05,0x02,0x00] +v_cmpx_lt_f16_e64 v1.l, v2.l +// GFX12: v_cmpx_lt_f16_e64 v1.l, v2.l ; encoding: [0x7e,0x00,0x81,0xd4,0x01,0x05,0x02,0x00] -v_cmpx_lt_f16_e64 v255, v255 -// GFX12: v_cmpx_lt_f16_e64 v255, v255 ; encoding: [0x7e,0x00,0x81,0xd4,0xff,0xff,0x03,0x00] +v_cmpx_lt_f16_e64 v255.l, v255.l +// GFX12: v_cmpx_lt_f16_e64 v255.l, v255.l ; encoding: [0x7e,0x00,0x81,0xd4,0xff,0xff,0x03,0x00] v_cmpx_lt_f16_e64 s1, s2 
// GFX12: v_cmpx_lt_f16_e64 s1, s2 ; encoding: [0x7e,0x00,0x81,0xd4,0x01,0x04,0x00,0x00] @@ -1820,6 +1820,12 @@ v_cmpx_lt_f16_e64 -src_scc, |vcc_lo| v_cmpx_lt_f16_e64 -|0xfe0b|, -|vcc_hi| clamp // GFX12: v_cmpx_lt_f16_e64 -|0xfe0b|, -|vcc_hi| clamp ; encoding: [0x7e,0x83,0x81,0xd4,0xff,0xd6,0x00,0x60,0x0b,0xfe,0x00,0x00] +v_cmpx_lt_f16_e64 v1.h, v2.l +// GFX12: v_cmpx_lt_f16_e64 v1.h, v2.l ; encoding: [0x7e,0x08,0x81,0xd4,0x01,0x05,0x02,0x00] + +v_cmpx_lt_f16_e64 v255.l, v255.h +// GFX12: v_cmpx_lt_f16_e64 v255.l, v255.h ; encoding: [0x7e,0x10,0x81,0xd4,0xff,0xff,0x03,0x00] + v_cmpx_lt_f32_e64 v1, v2 // GFX12: v_cmpx_lt_f32_e64 v1, v2 ; encoding: [0x7e,0x00,0x91,0xd4,0x01,0x05,0x02,0x00] diff --git a/llvm/test/MC/AMDGPU/gfx12_asm_vop3cx_dpp16.s b/llvm/test/MC/AMDGPU/gfx12_asm_vop3cx_dpp16.s index a6953ecc1d78a..2ffdf04ff886a 100644 --- a/llvm/test/MC/AMDGPU/gfx12_asm_vop3cx_dpp16.s +++ b/llvm/test/MC/AMDGPU/gfx12_asm_vop3cx_dpp16.s @@ -1346,53 +1346,62 @@ v_cmpx_lg_f32_e64_dpp -v1, |v2| row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctr v_cmpx_lg_f32_e64_dpp -|v255|, -|v255| clamp row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:0 fi:1 // GFX12: v_cmpx_lg_f32_e64_dpp -|v255|, -|v255| clamp row_xmask:15 row_mask:0x3 bank_mask:0x0 fi:1 ; encoding: [0x7e,0x83,0x95,0xd4,0xfa,0xfe,0x03,0x60,0xff,0x6f,0x05,0x30] -v_cmpx_lt_f16_e64_dpp v1, v2 quad_perm:[3,2,1,0] -// GFX12: v_cmpx_lt_f16_e64_dpp v1, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x81,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff] +v_cmpx_lt_f16_e64_dpp v1.l, v2.l quad_perm:[3,2,1,0] +// GFX12: v_cmpx_lt_f16_e64_dpp v1.l, v2.l quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x81,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff] -v_cmpx_lt_f16_e64_dpp v1, s2 quad_perm:[3,2,1,0] -// GFX12: v_cmpx_lt_f16_e64_dpp v1, s2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x81,0xd4,0xfa,0x04,0x00,0x00,0x01,0x1b,0x00,0xff] +v_cmpx_lt_f16_e64_dpp v1.l, 
s2 quad_perm:[3,2,1,0] +// GFX12: v_cmpx_lt_f16_e64_dpp v1.l, s2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x81,0xd4,0xfa,0x04,0x00,0x00,0x01,0x1b,0x00,0xff] -v_cmpx_lt_f16_e64_dpp v1, 2.0 quad_perm:[3,2,1,0] -// GFX12: v_cmpx_lt_f16_e64_dpp v1, 2.0 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x81,0xd4,0xfa,0xe8,0x01,0x00,0x01,0x1b,0x00,0xff] +v_cmpx_lt_f16_e64_dpp v1.l, 2.0 quad_perm:[3,2,1,0] +// GFX12: v_cmpx_lt_f16_e64_dpp v1.l, 2.0 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x81,0xd4,0xfa,0xe8,0x01,0x00,0x01,0x1b,0x00,0xff] -v_cmpx_lt_f16_e64_dpp v1, v2 quad_perm:[0,1,2,3] -// GFX12: v_cmpx_lt_f16_e64_dpp v1, v2 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x81,0xd4,0xfa,0x04,0x02,0x00,0x01,0xe4,0x00,0xff] +v_cmpx_lt_f16_e64_dpp v1.l, v2.l quad_perm:[0,1,2,3] +// GFX12: v_cmpx_lt_f16_e64_dpp v1.l, v2.l quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x81,0xd4,0xfa,0x04,0x02,0x00,0x01,0xe4,0x00,0xff] -v_cmpx_lt_f16_e64_dpp v1, v2 row_mirror -// GFX12: v_cmpx_lt_f16_e64_dpp v1, v2 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x81,0xd4,0xfa,0x04,0x02,0x00,0x01,0x40,0x01,0xff] +v_cmpx_lt_f16_e64_dpp v1.l, v2.l row_mirror +// GFX12: v_cmpx_lt_f16_e64_dpp v1.l, v2.l row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x81,0xd4,0xfa,0x04,0x02,0x00,0x01,0x40,0x01,0xff] -v_cmpx_lt_f16_e64_dpp v1, v2 row_half_mirror -// GFX12: v_cmpx_lt_f16_e64_dpp v1, v2 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x81,0xd4,0xfa,0x04,0x02,0x00,0x01,0x41,0x01,0xff] +v_cmpx_lt_f16_e64_dpp v1.l, v2.l row_half_mirror +// GFX12: v_cmpx_lt_f16_e64_dpp v1.l, v2.l row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x81,0xd4,0xfa,0x04,0x02,0x00,0x01,0x41,0x01,0xff] -v_cmpx_lt_f16_e64_dpp v1, v2 row_shl:1 -// GFX12: v_cmpx_lt_f16_e64_dpp v1, v2 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: 
[0x7e,0x00,0x81,0xd4,0xfa,0x04,0x02,0x00,0x01,0x01,0x01,0xff] +v_cmpx_lt_f16_e64_dpp v1.l, v2.l row_shl:1 +// GFX12: v_cmpx_lt_f16_e64_dpp v1.l, v2.l row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x81,0xd4,0xfa,0x04,0x02,0x00,0x01,0x01,0x01,0xff] -v_cmpx_lt_f16_e64_dpp v1, v2 row_shl:15 -// GFX12: v_cmpx_lt_f16_e64_dpp v1, v2 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x81,0xd4,0xfa,0x04,0x02,0x00,0x01,0x0f,0x01,0xff] +v_cmpx_lt_f16_e64_dpp v1.l, v2.l row_shl:15 +// GFX12: v_cmpx_lt_f16_e64_dpp v1.l, v2.l row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x81,0xd4,0xfa,0x04,0x02,0x00,0x01,0x0f,0x01,0xff] -v_cmpx_lt_f16_e64_dpp v1, v2 row_shr:1 -// GFX12: v_cmpx_lt_f16_e64_dpp v1, v2 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x81,0xd4,0xfa,0x04,0x02,0x00,0x01,0x11,0x01,0xff] +v_cmpx_lt_f16_e64_dpp v1.l, v2.l row_shr:1 +// GFX12: v_cmpx_lt_f16_e64_dpp v1.l, v2.l row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x81,0xd4,0xfa,0x04,0x02,0x00,0x01,0x11,0x01,0xff] -v_cmpx_lt_f16_e64_dpp v1, v2 row_shr:15 -// GFX12: v_cmpx_lt_f16_e64_dpp v1, v2 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x81,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1f,0x01,0xff] +v_cmpx_lt_f16_e64_dpp v1.l, v2.l row_shr:15 +// GFX12: v_cmpx_lt_f16_e64_dpp v1.l, v2.l row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x81,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1f,0x01,0xff] -v_cmpx_lt_f16_e64_dpp v1, v2 row_ror:1 -// GFX12: v_cmpx_lt_f16_e64_dpp v1, v2 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x81,0xd4,0xfa,0x04,0x02,0x00,0x01,0x21,0x01,0xff] +v_cmpx_lt_f16_e64_dpp v1.l, v2.l row_ror:1 +// GFX12: v_cmpx_lt_f16_e64_dpp v1.l, v2.l row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x81,0xd4,0xfa,0x04,0x02,0x00,0x01,0x21,0x01,0xff] -v_cmpx_lt_f16_e64_dpp v1, v2 row_ror:15 -// GFX12: v_cmpx_lt_f16_e64_dpp v1, v2 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: 
[0x7e,0x00,0x81,0xd4,0xfa,0x04,0x02,0x00,0x01,0x2f,0x01,0xff] +v_cmpx_lt_f16_e64_dpp v1.l, v2.l row_ror:15 +// GFX12: v_cmpx_lt_f16_e64_dpp v1.l, v2.l row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x81,0xd4,0xfa,0x04,0x02,0x00,0x01,0x2f,0x01,0xff] -v_cmpx_lt_f16_e64_dpp v1, v2 row_share:0 row_mask:0xf bank_mask:0xf -// GFX12: v_cmpx_lt_f16_e64_dpp v1, v2 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x81,0xd4,0xfa,0x04,0x02,0x00,0x01,0x50,0x01,0xff] +v_cmpx_lt_f16_e64_dpp v1.l, v2.l row_share:0 row_mask:0xf bank_mask:0xf +// GFX12: v_cmpx_lt_f16_e64_dpp v1.l, v2.l row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x81,0xd4,0xfa,0x04,0x02,0x00,0x01,0x50,0x01,0xff] -v_cmpx_lt_f16_e64_dpp |v1|, -v2 row_share:15 row_mask:0x0 bank_mask:0x1 -// GFX12: v_cmpx_lt_f16_e64_dpp |v1|, -v2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x7e,0x01,0x81,0xd4,0xfa,0x04,0x02,0x40,0x01,0x5f,0x01,0x01] +v_cmpx_lt_f16_e64_dpp |v1.l|, -v2.l row_share:15 row_mask:0x0 bank_mask:0x1 +// GFX12: v_cmpx_lt_f16_e64_dpp |v1.l|, -v2.l row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x7e,0x01,0x81,0xd4,0xfa,0x04,0x02,0x40,0x01,0x5f,0x01,0x01] -v_cmpx_lt_f16_e64_dpp -v1, |v2| row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 fi:0 -// GFX12: v_cmpx_lt_f16_e64_dpp -v1, |v2| row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 ; encoding: [0x7e,0x02,0x81,0xd4,0xfa,0x04,0x02,0x20,0x01,0x60,0x09,0x13] +v_cmpx_lt_f16_e64_dpp -v1.l, |v2.l| row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 fi:0 +// GFX12: v_cmpx_lt_f16_e64_dpp -v1.l, |v2.l| row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 ; encoding: [0x7e,0x02,0x81,0xd4,0xfa,0x04,0x02,0x20,0x01,0x60,0x09,0x13] -v_cmpx_lt_f16_e64_dpp -|v255|, -|v255| clamp row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:0 fi:1 -// GFX12: v_cmpx_lt_f16_e64_dpp -|v255|, -|v255| clamp row_xmask:15 row_mask:0x3 bank_mask:0x0 fi:1 ; encoding: [0x7e,0x83,0x81,0xd4,0xfa,0xfe,0x03,0x60,0xff,0x6f,0x05,0x30] 
+v_cmpx_lt_f16_e64_dpp -|v255.l|, -|v255.l| clamp row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:0 fi:1 +// GFX12: v_cmpx_lt_f16_e64_dpp -|v255.l|, -|v255.l| clamp row_xmask:15 row_mask:0x3 bank_mask:0x0 fi:1 ; encoding: [0x7e,0x83,0x81,0xd4,0xfa,0xfe,0x03,0x60,0xff,0x6f,0x05,0x30] + +v_cmpx_lt_f16_e64_dpp |v1.h|, -v2.h row_share:15 row_mask:0x0 bank_mask:0x1 +// GFX12: v_cmpx_lt_f16_e64_dpp |v1.h|, -v2.h op_sel:[1,1] row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x7e,0x19,0x81,0xd4,0xfa,0x04,0x02,0x40,0x01,0x5f,0x01,0x01] + +v_cmpx_lt_f16_e64_dpp -v1.h, |v2.l| row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 fi:0 +// GFX12: v_cmpx_lt_f16_e64_dpp -v1.h, |v2.l| op_sel:[1,0] row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 ; encoding: [0x7e,0x0a,0x81,0xd4,0xfa,0x04,0x02,0x20,0x01,0x60,0x09,0x13] + +v_cmpx_lt_f16_e64_dpp -|v255.l|, -|v255.h| clamp row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:0 fi:1 +// GFX12: v_cmpx_lt_f16_e64_dpp -|v255.l|, -|v255.h| op_sel:[0,1] clamp row_xmask:15 row_mask:0x3 bank_mask:0x0 fi:1 ; encoding: [0x7e,0x93,0x81,0xd4,0xfa,0xfe,0x03,0x60,0xff,0x6f,0x05,0x30] v_cmpx_lt_f32_e64_dpp v1, v2 quad_perm:[3,2,1,0] // GFX12: v_cmpx_lt_f32_e64_dpp v1, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x91,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff] diff --git a/llvm/test/MC/AMDGPU/gfx12_asm_vop3cx_dpp8.s b/llvm/test/MC/AMDGPU/gfx12_asm_vop3cx_dpp8.s index 8e2899086f2bd..05bce2e0e61f2 100644 --- a/llvm/test/MC/AMDGPU/gfx12_asm_vop3cx_dpp8.s +++ b/llvm/test/MC/AMDGPU/gfx12_asm_vop3cx_dpp8.s @@ -452,23 +452,32 @@ v_cmpx_lg_f32_e64_dpp -v1, |2.0| dpp8:[7,6,5,4,3,2,1,0] fi:1 v_cmpx_lg_f32_e64_dpp -|v255|, -|v255| clamp dpp8:[0,0,0,0,0,0,0,0] fi:0 // GFX12: v_cmpx_lg_f32_e64_dpp -|v255|, -|v255| clamp dpp8:[0,0,0,0,0,0,0,0] ; encoding: [0x7e,0x83,0x95,0xd4,0xe9,0xfe,0x03,0x60,0xff,0x00,0x00,0x00] -v_cmpx_lt_f16_e64_dpp v1, v2 dpp8:[7,6,5,4,3,2,1,0] -// GFX12: v_cmpx_lt_f16_e64_dpp v1, v2 
dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x00,0x81,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05] +v_cmpx_lt_f16_e64_dpp v1.l, v2.l dpp8:[7,6,5,4,3,2,1,0] +// GFX12: v_cmpx_lt_f16_e64_dpp v1.l, v2.l dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x00,0x81,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05] -v_cmpx_lt_f16_e64_dpp |v1|, -v2 dpp8:[7,6,5,4,3,2,1,0] -// GFX12: v_cmpx_lt_f16_e64_dpp |v1|, -v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x01,0x81,0xd4,0xe9,0x04,0x02,0x40,0x01,0x77,0x39,0x05] +v_cmpx_lt_f16_e64_dpp |v1.l|, -v2.l dpp8:[7,6,5,4,3,2,1,0] +// GFX12: v_cmpx_lt_f16_e64_dpp |v1.l|, -v2.l dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x01,0x81,0xd4,0xe9,0x04,0x02,0x40,0x01,0x77,0x39,0x05] -v_cmpx_lt_f16_e64_dpp -v1, |v2| dpp8:[7,6,5,4,3,2,1,0] fi:1 -// GFX12: v_cmpx_lt_f16_e64_dpp -v1, |v2| dpp8:[7,6,5,4,3,2,1,0] fi:1 ; encoding: [0x7e,0x02,0x81,0xd4,0xea,0x04,0x02,0x20,0x01,0x77,0x39,0x05] +v_cmpx_lt_f16_e64_dpp -v1.l, |v2.l| dpp8:[7,6,5,4,3,2,1,0] fi:1 +// GFX12: v_cmpx_lt_f16_e64_dpp -v1.l, |v2.l| dpp8:[7,6,5,4,3,2,1,0] fi:1 ; encoding: [0x7e,0x02,0x81,0xd4,0xea,0x04,0x02,0x20,0x01,0x77,0x39,0x05] -v_cmpx_lt_f16_e64_dpp -v1, |s2| dpp8:[7,6,5,4,3,2,1,0] fi:1 -// GFX12: v_cmpx_lt_f16_e64_dpp -v1, |s2| dpp8:[7,6,5,4,3,2,1,0] fi:1 ; encoding: [0x7e,0x02,0x81,0xd4,0xea,0x04,0x00,0x20,0x01,0x77,0x39,0x05] +v_cmpx_lt_f16_e64_dpp -v1.l, |s2| dpp8:[7,6,5,4,3,2,1,0] fi:1 +// GFX12: v_cmpx_lt_f16_e64_dpp -v1.l, |s2| dpp8:[7,6,5,4,3,2,1,0] fi:1 ; encoding: [0x7e,0x02,0x81,0xd4,0xea,0x04,0x00,0x20,0x01,0x77,0x39,0x05] -v_cmpx_lt_f16_e64_dpp -v1, |2.0| dpp8:[7,6,5,4,3,2,1,0] fi:1 -// GFX12: v_cmpx_lt_f16_e64_dpp -v1, |2.0| dpp8:[7,6,5,4,3,2,1,0] fi:1 ; encoding: [0x7e,0x02,0x81,0xd4,0xea,0xe8,0x01,0x20,0x01,0x77,0x39,0x05] +v_cmpx_lt_f16_e64_dpp -v1.l, |2.0| dpp8:[7,6,5,4,3,2,1,0] fi:1 +// GFX12: v_cmpx_lt_f16_e64_dpp -v1.l, |2.0| dpp8:[7,6,5,4,3,2,1,0] fi:1 ; encoding: [0x7e,0x02,0x81,0xd4,0xea,0xe8,0x01,0x20,0x01,0x77,0x39,0x05] -v_cmpx_lt_f16_e64_dpp -|v255|, -|v255| clamp 
dpp8:[0,0,0,0,0,0,0,0] fi:0 -// GFX12: v_cmpx_lt_f16_e64_dpp -|v255|, -|v255| clamp dpp8:[0,0,0,0,0,0,0,0] ; encoding: [0x7e,0x83,0x81,0xd4,0xe9,0xfe,0x03,0x60,0xff,0x00,0x00,0x00] +v_cmpx_lt_f16_e64_dpp -|v255.l|, -|v255.l| clamp dpp8:[0,0,0,0,0,0,0,0] fi:0 +// GFX12: v_cmpx_lt_f16_e64_dpp -|v255.l|, -|v255.l| clamp dpp8:[0,0,0,0,0,0,0,0] ; encoding: [0x7e,0x83,0x81,0xd4,0xe9,0xfe,0x03,0x60,0xff,0x00,0x00,0x00] + +v_cmpx_lt_f16_e64_dpp |v1.h|, -v2.h dpp8:[7,6,5,4,3,2,1,0] +// GFX12: v_cmpx_lt_f16_e64_dpp |v1.h|, -v2.h op_sel:[1,1] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x19,0x81,0xd4,0xe9,0x04,0x02,0x40,0x01,0x77,0x39,0x05] + +v_cmpx_lt_f16_e64_dpp -v1.h, |v2.l| dpp8:[7,6,5,4,3,2,1,0] fi:1 +// GFX12: v_cmpx_lt_f16_e64_dpp -v1.h, |v2.l| op_sel:[1,0] dpp8:[7,6,5,4,3,2,1,0] fi:1 ; encoding: [0x7e,0x0a,0x81,0xd4,0xea,0x04,0x02,0x20,0x01,0x77,0x39,0x05] + +v_cmpx_lt_f16_e64_dpp -|v255.l|, -|v255.h| clamp dpp8:[0,0,0,0,0,0,0,0] fi:0 +// GFX12: v_cmpx_lt_f16_e64_dpp -|v255.l|, -|v255.h| op_sel:[0,1] clamp dpp8:[0,0,0,0,0,0,0,0] ; encoding: [0x7e,0x93,0x81,0xd4,0xe9,0xfe,0x03,0x60,0xff,0x00,0x00,0x00] v_cmpx_lt_f32_e64_dpp v1, v2 dpp8:[7,6,5,4,3,2,1,0] // GFX12: v_cmpx_lt_f32_e64_dpp v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x00,0x91,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05] diff --git a/llvm/test/MC/AMDGPU/gfx12_asm_vopcx.s b/llvm/test/MC/AMDGPU/gfx12_asm_vopcx.s index 300c748145141..1392b9b8112f9 100644 --- a/llvm/test/MC/AMDGPU/gfx12_asm_vopcx.s +++ b/llvm/test/MC/AMDGPU/gfx12_asm_vopcx.s @@ -1766,50 +1766,62 @@ v_cmpx_lg_f64 src_scc, v[2:3] v_cmpx_lg_f64 0xaf123456, v[254:255] // GFX12: v_cmpx_lg_f64_e32 0xaf123456, v[254:255] ; encoding: [0xff,0xfc,0x4b,0x7d,0x56,0x34,0x12,0xaf] -v_cmpx_lt_f16 v1, v2 -// GFX12: v_cmpx_lt_f16_e32 v1, v2 ; encoding: [0x01,0x05,0x02,0x7d] +v_cmpx_lt_f16 v1.l, v2.l +// GFX12: v_cmpx_lt_f16_e32 v1.l, v2.l ; encoding: [0x01,0x05,0x02,0x7d] -v_cmpx_lt_f16 v127, v2 -// GFX12: v_cmpx_lt_f16_e32 v127, v2 ; encoding: 
[0x7f,0x05,0x02,0x7d] +v_cmpx_lt_f16 v127.l, v2.l +// GFX12: v_cmpx_lt_f16_e32 v127.l, v2.l ; encoding: [0x7f,0x05,0x02,0x7d] -v_cmpx_lt_f16 s1, v2 -// GFX12: v_cmpx_lt_f16_e32 s1, v2 ; encoding: [0x01,0x04,0x02,0x7d] +v_cmpx_lt_f16 s1, v2.l +// GFX12: v_cmpx_lt_f16_e32 s1, v2.l ; encoding: [0x01,0x04,0x02,0x7d] -v_cmpx_lt_f16 s105, v2 -// GFX12: v_cmpx_lt_f16_e32 s105, v2 ; encoding: [0x69,0x04,0x02,0x7d] +v_cmpx_lt_f16 s105, v2.l +// GFX12: v_cmpx_lt_f16_e32 s105, v2.l ; encoding: [0x69,0x04,0x02,0x7d] -v_cmpx_lt_f16 vcc_lo, v2 -// GFX12: v_cmpx_lt_f16_e32 vcc_lo, v2 ; encoding: [0x6a,0x04,0x02,0x7d] +v_cmpx_lt_f16 vcc_lo, v2.l +// GFX12: v_cmpx_lt_f16_e32 vcc_lo, v2.l ; encoding: [0x6a,0x04,0x02,0x7d] -v_cmpx_lt_f16 vcc_hi, v2 -// GFX12: v_cmpx_lt_f16_e32 vcc_hi, v2 ; encoding: [0x6b,0x04,0x02,0x7d] +v_cmpx_lt_f16 vcc_hi, v2.l +// GFX12: v_cmpx_lt_f16_e32 vcc_hi, v2.l ; encoding: [0x6b,0x04,0x02,0x7d] -v_cmpx_lt_f16 ttmp15, v2 -// GFX12: v_cmpx_lt_f16_e32 ttmp15, v2 ; encoding: [0x7b,0x04,0x02,0x7d] +v_cmpx_lt_f16 ttmp15, v2.l +// GFX12: v_cmpx_lt_f16_e32 ttmp15, v2.l ; encoding: [0x7b,0x04,0x02,0x7d] -v_cmpx_lt_f16 m0, v2 -// GFX12: v_cmpx_lt_f16_e32 m0, v2 ; encoding: [0x7d,0x04,0x02,0x7d] +v_cmpx_lt_f16 m0, v2.l +// GFX12: v_cmpx_lt_f16_e32 m0, v2.l ; encoding: [0x7d,0x04,0x02,0x7d] -v_cmpx_lt_f16 exec_lo, v2 -// GFX12: v_cmpx_lt_f16_e32 exec_lo, v2 ; encoding: [0x7e,0x04,0x02,0x7d] +v_cmpx_lt_f16 exec_lo, v2.l +// GFX12: v_cmpx_lt_f16_e32 exec_lo, v2.l ; encoding: [0x7e,0x04,0x02,0x7d] -v_cmpx_lt_f16 exec_hi, v2 -// GFX12: v_cmpx_lt_f16_e32 exec_hi, v2 ; encoding: [0x7f,0x04,0x02,0x7d] +v_cmpx_lt_f16 exec_hi, v2.l +// GFX12: v_cmpx_lt_f16_e32 exec_hi, v2.l ; encoding: [0x7f,0x04,0x02,0x7d] -v_cmpx_lt_f16 null, v2 -// GFX12: v_cmpx_lt_f16_e32 null, v2 ; encoding: [0x7c,0x04,0x02,0x7d] +v_cmpx_lt_f16 null, v2.l +// GFX12: v_cmpx_lt_f16_e32 null, v2.l ; encoding: [0x7c,0x04,0x02,0x7d] -v_cmpx_lt_f16 -1, v2 -// GFX12: v_cmpx_lt_f16_e32 -1, v2 ; encoding: 
[0xc1,0x04,0x02,0x7d] +v_cmpx_lt_f16 -1, v2.l +// GFX12: v_cmpx_lt_f16_e32 -1, v2.l ; encoding: [0xc1,0x04,0x02,0x7d] -v_cmpx_lt_f16 0.5, v2 -// GFX12: v_cmpx_lt_f16_e32 0.5, v2 ; encoding: [0xf0,0x04,0x02,0x7d] +v_cmpx_lt_f16 0.5, v2.l +// GFX12: v_cmpx_lt_f16_e32 0.5, v2.l ; encoding: [0xf0,0x04,0x02,0x7d] -v_cmpx_lt_f16 src_scc, v2 -// GFX12: v_cmpx_lt_f16_e32 src_scc, v2 ; encoding: [0xfd,0x04,0x02,0x7d] +v_cmpx_lt_f16 src_scc, v2.l +// GFX12: v_cmpx_lt_f16_e32 src_scc, v2.l ; encoding: [0xfd,0x04,0x02,0x7d] -v_cmpx_lt_f16 0xfe0b, v127 -// GFX12: v_cmpx_lt_f16_e32 0xfe0b, v127 ; encoding: [0xff,0xfe,0x02,0x7d,0x0b,0xfe,0x00,0x00] +v_cmpx_lt_f16 0xfe0b, v127.l +// GFX12: v_cmpx_lt_f16_e32 0xfe0b, v127.l ; encoding: [0xff,0xfe,0x02,0x7d,0x0b,0xfe,0x00,0x00] + +v_cmpx_lt_f16 v1.h, v2.l +// GFX12: v_cmpx_lt_f16_e32 v1.h, v2.l ; encoding: [0x81,0x05,0x02,0x7d] + +v_cmpx_lt_f16 v127.h, v2.l +// GFX12: v_cmpx_lt_f16_e32 v127.h, v2.l ; encoding: [0xff,0x05,0x02,0x7d] + +v_cmpx_lt_f16 src_scc, v2.h +// GFX12: v_cmpx_lt_f16_e32 src_scc, v2.h ; encoding: [0xfd,0x04,0x03,0x7d] + +v_cmpx_lt_f16 0xfe0b, v127.h +// GFX12: v_cmpx_lt_f16_e32 0xfe0b, v127.h ; encoding: [0xff,0xfe,0x03,0x7d,0x0b,0xfe,0x00,0x00] v_cmpx_lt_f32 v1, v2 // GFX12: v_cmpx_lt_f32_e32 v1, v2 ; encoding: [0x01,0x05,0x22,0x7d] diff --git a/llvm/test/MC/AMDGPU/gfx12_asm_vopcx_dpp16.s b/llvm/test/MC/AMDGPU/gfx12_asm_vopcx_dpp16.s index 857d6267a215f..c8f9835181837 100644 --- a/llvm/test/MC/AMDGPU/gfx12_asm_vopcx_dpp16.s +++ b/llvm/test/MC/AMDGPU/gfx12_asm_vopcx_dpp16.s @@ -1178,47 +1178,53 @@ v_cmpx_lg_f32 v1, v2 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 fi:0 v_cmpx_lg_f32 -|v255|, -|v255| row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:0 fi:1 // GFX12: v_cmpx_lg_f32 -|v255|, -|v255| row_xmask:15 row_mask:0x3 bank_mask:0x0 fi:1 ; encoding: [0xfa,0xfe,0x2b,0x7d,0xff,0x6f,0xf5,0x30] -v_cmpx_lt_f16 v1, v2 quad_perm:[3,2,1,0] -// GFX12: v_cmpx_lt_f16 v1, v2 quad_perm:[3,2,1,0] row_mask:0xf 
bank_mask:0xf ; encoding: [0xfa,0x04,0x02,0x7d,0x01,0x1b,0x00,0xff] +v_cmpx_lt_f16 v1.l, v2.l quad_perm:[3,2,1,0] +// GFX12: v_cmpx_lt_f16 v1.l, v2.l quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x02,0x7d,0x01,0x1b,0x00,0xff] -v_cmpx_lt_f16 v1, v2 quad_perm:[0,1,2,3] -// GFX12: v_cmpx_lt_f16 v1, v2 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x02,0x7d,0x01,0xe4,0x00,0xff] +v_cmpx_lt_f16 v1.l, v2.l quad_perm:[0,1,2,3] +// GFX12: v_cmpx_lt_f16 v1.l, v2.l quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x02,0x7d,0x01,0xe4,0x00,0xff] -v_cmpx_lt_f16 v1, v2 row_mirror -// GFX12: v_cmpx_lt_f16 v1, v2 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x02,0x7d,0x01,0x40,0x01,0xff] +v_cmpx_lt_f16 v1.l, v2.l row_mirror +// GFX12: v_cmpx_lt_f16 v1.l, v2.l row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x02,0x7d,0x01,0x40,0x01,0xff] -v_cmpx_lt_f16 v1, v2 row_half_mirror -// GFX12: v_cmpx_lt_f16 v1, v2 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x02,0x7d,0x01,0x41,0x01,0xff] +v_cmpx_lt_f16 v1.l, v2.l row_half_mirror +// GFX12: v_cmpx_lt_f16 v1.l, v2.l row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x02,0x7d,0x01,0x41,0x01,0xff] -v_cmpx_lt_f16 v1, v2 row_shl:1 -// GFX12: v_cmpx_lt_f16 v1, v2 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x02,0x7d,0x01,0x01,0x01,0xff] +v_cmpx_lt_f16 v1.l, v2.l row_shl:1 +// GFX12: v_cmpx_lt_f16 v1.l, v2.l row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x02,0x7d,0x01,0x01,0x01,0xff] -v_cmpx_lt_f16 v1, v2 row_shl:15 -// GFX12: v_cmpx_lt_f16 v1, v2 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x02,0x7d,0x01,0x0f,0x01,0xff] +v_cmpx_lt_f16 v1.l, v2.l row_shl:15 +// GFX12: v_cmpx_lt_f16 v1.l, v2.l row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x02,0x7d,0x01,0x0f,0x01,0xff] -v_cmpx_lt_f16 v1, v2 row_shr:1 -// GFX12: v_cmpx_lt_f16 v1, v2 row_shr:1 row_mask:0xf 
bank_mask:0xf ; encoding: [0xfa,0x04,0x02,0x7d,0x01,0x11,0x01,0xff] +v_cmpx_lt_f16 v1.l, v2.l row_shr:1 +// GFX12: v_cmpx_lt_f16 v1.l, v2.l row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x02,0x7d,0x01,0x11,0x01,0xff] -v_cmpx_lt_f16 v1, v2 row_shr:15 -// GFX12: v_cmpx_lt_f16 v1, v2 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x02,0x7d,0x01,0x1f,0x01,0xff] +v_cmpx_lt_f16 v1.l, v2.l row_shr:15 +// GFX12: v_cmpx_lt_f16 v1.l, v2.l row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x02,0x7d,0x01,0x1f,0x01,0xff] -v_cmpx_lt_f16 v1, v2 row_ror:1 -// GFX12: v_cmpx_lt_f16 v1, v2 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x02,0x7d,0x01,0x21,0x01,0xff] +v_cmpx_lt_f16 v1.l, v2.l row_ror:1 +// GFX12: v_cmpx_lt_f16 v1.l, v2.l row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x02,0x7d,0x01,0x21,0x01,0xff] -v_cmpx_lt_f16 v1, v2 row_ror:15 -// GFX12: v_cmpx_lt_f16 v1, v2 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x02,0x7d,0x01,0x2f,0x01,0xff] +v_cmpx_lt_f16 v1.l, v2.l row_ror:15 +// GFX12: v_cmpx_lt_f16 v1.l, v2.l row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x02,0x7d,0x01,0x2f,0x01,0xff] -v_cmpx_lt_f16 v1, v2 row_share:0 row_mask:0xf bank_mask:0xf -// GFX12: v_cmpx_lt_f16 v1, v2 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x02,0x7d,0x01,0x50,0x01,0xff] +v_cmpx_lt_f16 v1.l, v2.l row_share:0 row_mask:0xf bank_mask:0xf +// GFX12: v_cmpx_lt_f16 v1.l, v2.l row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x02,0x7d,0x01,0x50,0x01,0xff] -v_cmpx_lt_f16 v1, v2 row_share:15 row_mask:0x0 bank_mask:0x1 -// GFX12: v_cmpx_lt_f16 v1, v2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0xfa,0x04,0x02,0x7d,0x01,0x5f,0x01,0x01] +v_cmpx_lt_f16 v1.l, v2.l row_share:15 row_mask:0x0 bank_mask:0x1 +// GFX12: v_cmpx_lt_f16 v1.l, v2.l row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0xfa,0x04,0x02,0x7d,0x01,0x5f,0x01,0x01] -v_cmpx_lt_f16 v1, v2 row_xmask:0 
row_mask:0x1 bank_mask:0x3 bound_ctrl:1 fi:0 -// GFX12: v_cmpx_lt_f16 v1, v2 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 ; encoding: [0xfa,0x04,0x02,0x7d,0x01,0x60,0x09,0x13] +v_cmpx_lt_f16 v1.l, v2.l row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 +// GFX12: v_cmpx_lt_f16 v1.l, v2.l row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 ; encoding: [0xfa,0x04,0x02,0x7d,0x01,0x60,0x09,0x13] -v_cmpx_lt_f16 -|v127|, -|v127| row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:0 fi:1 -// GFX12: v_cmpx_lt_f16 -|v127|, -|v127| row_xmask:15 row_mask:0x3 bank_mask:0x0 fi:1 ; encoding: [0xfa,0xfe,0x02,0x7d,0x7f,0x6f,0xf5,0x30] +v_cmpx_lt_f16 -|v127.l|, -|v127.l| row_xmask:15 row_mask:0x3 bank_mask:0x0 fi:1 +// GFX12: v_cmpx_lt_f16 -|v127.l|, -|v127.l| row_xmask:15 row_mask:0x3 bank_mask:0x0 fi:1 ; encoding: [0xfa,0xfe,0x02,0x7d,0x7f,0x6f,0xf5,0x30] + +v_cmpx_lt_f16 v1.h, v2.h row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 fi:0 +// GFX12: v_cmpx_lt_f16 v1.h, v2.h row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 ; encoding: [0xfa,0x04,0x03,0x7d,0x81,0x60,0x09,0x13] + +v_cmpx_lt_f16 -|v127.h|, -|v127.h| row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:0 fi:1 +// GFX12: v_cmpx_lt_f16 -|v127.h|, -|v127.h| row_xmask:15 row_mask:0x3 bank_mask:0x0 fi:1 ; encoding: [0xfa,0xfe,0x03,0x7d,0xff,0x6f,0xf5,0x30] v_cmpx_lt_f32 v1, v2 quad_perm:[3,2,1,0] // GFX12: v_cmpx_lt_f32 v1, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x22,0x7d,0x01,0x1b,0x00,0xff] diff --git a/llvm/test/MC/AMDGPU/gfx12_asm_vopcx_dpp8.s b/llvm/test/MC/AMDGPU/gfx12_asm_vopcx_dpp8.s index 8ee6b7d488fdf..3e7922d2acbda 100644 --- a/llvm/test/MC/AMDGPU/gfx12_asm_vopcx_dpp8.s +++ b/llvm/test/MC/AMDGPU/gfx12_asm_vopcx_dpp8.s @@ -254,14 +254,20 @@ v_cmpx_lg_f32 v1, v2 dpp8:[7,6,5,4,3,2,1,0] fi:1 v_cmpx_lg_f32 v255, v255 dpp8:[0,0,0,0,0,0,0,0] fi:0 // GFX12: v_cmpx_lg_f32 v255, v255 dpp8:[0,0,0,0,0,0,0,0] ; encoding: [0xe9,0xfe,0x2b,0x7d,0xff,0x00,0x00,0x00] -v_cmpx_lt_f16 v1, v2 
dpp8:[7,6,5,4,3,2,1,0] -// GFX12: v_cmpx_lt_f16 v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0x04,0x02,0x7d,0x01,0x77,0x39,0x05] +v_cmpx_lt_f16 v1.l, v2.l dpp8:[7,6,5,4,3,2,1,0] +// GFX12: v_cmpx_lt_f16 v1.l, v2.l dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0x04,0x02,0x7d,0x01,0x77,0x39,0x05] -v_cmpx_lt_f16 v1, v2 dpp8:[7,6,5,4,3,2,1,0] fi:1 -// GFX12: v_cmpx_lt_f16 v1, v2 dpp8:[7,6,5,4,3,2,1,0] fi:1 ; encoding: [0xea,0x04,0x02,0x7d,0x01,0x77,0x39,0x05] +v_cmpx_lt_f16 v1.l, v2.l dpp8:[7,6,5,4,3,2,1,0] fi:1 +// GFX12: v_cmpx_lt_f16 v1.l, v2.l dpp8:[7,6,5,4,3,2,1,0] fi:1 ; encoding: [0xea,0x04,0x02,0x7d,0x01,0x77,0x39,0x05] -v_cmpx_lt_f16 v127, v127 dpp8:[0,0,0,0,0,0,0,0] fi:0 -// GFX12: v_cmpx_lt_f16 v127, v127 dpp8:[0,0,0,0,0,0,0,0] ; encoding: [0xe9,0xfe,0x02,0x7d,0x7f,0x00,0x00,0x00] +v_cmpx_lt_f16 v127.l, v127.l dpp8:[0,0,0,0,0,0,0,0] +// GFX12: v_cmpx_lt_f16 v127.l, v127.l dpp8:[0,0,0,0,0,0,0,0] ; encoding: [0xe9,0xfe,0x02,0x7d,0x7f,0x00,0x00,0x00] + +v_cmpx_lt_f16 v1.h, v2.h dpp8:[7,6,5,4,3,2,1,0] fi:1 +// GFX12: v_cmpx_lt_f16 v1.h, v2.h dpp8:[7,6,5,4,3,2,1,0] fi:1 ; encoding: [0xea,0x04,0x03,0x7d,0x81,0x77,0x39,0x05] + +v_cmpx_lt_f16 v127.h, v127.h dpp8:[0,0,0,0,0,0,0,0] fi:0 +// GFX12: v_cmpx_lt_f16 v127.h, v127.h dpp8:[0,0,0,0,0,0,0,0] ; encoding: [0xe9,0xfe,0x03,0x7d,0xff,0x00,0x00,0x00] v_cmpx_lt_f32 v1, v2 dpp8:[7,6,5,4,3,2,1,0] // GFX12: v_cmpx_lt_f32 v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0x04,0x22,0x7d,0x01,0x77,0x39,0x05] diff --git a/llvm/test/MC/AMDGPU/gfx12_asm_vopcx_t16_err.s b/llvm/test/MC/AMDGPU/gfx12_asm_vopcx_t16_err.s index 5019324d174b8..cb317443d2828 100644 --- a/llvm/test/MC/AMDGPU/gfx12_asm_vopcx_t16_err.s +++ b/llvm/test/MC/AMDGPU/gfx12_asm_vopcx_t16_err.s @@ -1,4 +1,4 @@ -; NOTE: Assertions have been autogenerated by utils/update_mc_test_checks.py UTC_ARGS: --sort --version 5 +// NOTE: Assertions have been autogenerated by utils/update_mc_test_checks.py UTC_ARGS: --sort --version 5 // RUN: not llvm-mc -triple=amdgcn 
-mcpu=gfx1200 -mattr=+wavefrontsize64,+real-true16 -show-encoding %s 2>&1 | FileCheck --check-prefix=GFX12 --implicit-check-not=error %s v_cmpx_class_f16_e32 v1, v255 @@ -253,23 +253,41 @@ v_cmpx_lg_f16_e32 v255, v2 dpp8:[7,6,5,4,3,2,1,0] v_cmpx_lg_f16_e32 v255, v2 quad_perm:[3,2,1,0] // GFX12: :[[@LINE-1]]:28: error: invalid operand for instruction -v_cmpx_lt_f16_e32 v1, v255 -// GFX12: :[[@LINE-1]]:1: error: operands are not valid for this GPU or mode +v_cmpx_lt_f16_e32 v1.h, v255.h +// GFX12: :[[@LINE-1]]:25: error: invalid operand for instruction -v_cmpx_lt_f16_e32 v1, v255 dpp8:[7,6,5,4,3,2,1,0] -// GFX12: :[[@LINE-1]]:28: error: invalid operand for instruction +v_cmpx_lt_f16_e32 v1.h, v255.h dpp8:[7,6,5,4,3,2,1,0] +// GFX12: :[[@LINE-1]]:25: error: invalid operand for instruction -v_cmpx_lt_f16_e32 v1, v255 quad_perm:[3,2,1,0] -// GFX12: :[[@LINE-1]]:28: error: invalid operand for instruction +v_cmpx_lt_f16_e32 v1.h, v255.h quad_perm:[3,2,1,0] +// GFX12: :[[@LINE-1]]:25: error: invalid operand for instruction -v_cmpx_lt_f16_e32 v255, v2 -// GFX12: :[[@LINE-1]]:1: error: operands are not valid for this GPU or mode +v_cmpx_lt_f16_e32 v1.l, v255.l +// GFX12: :[[@LINE-1]]:25: error: invalid operand for instruction -v_cmpx_lt_f16_e32 v255, v2 dpp8:[7,6,5,4,3,2,1,0] -// GFX12: :[[@LINE-1]]:28: error: invalid operand for instruction +v_cmpx_lt_f16_e32 v1.l, v255.l dpp8:[7,6,5,4,3,2,1,0] +// GFX12: :[[@LINE-1]]:25: error: invalid operand for instruction -v_cmpx_lt_f16_e32 v255, v2 quad_perm:[3,2,1,0] -// GFX12: :[[@LINE-1]]:28: error: invalid operand for instruction +v_cmpx_lt_f16_e32 v1.l, v255.l quad_perm:[3,2,1,0] +// GFX12: :[[@LINE-1]]:25: error: invalid operand for instruction + +v_cmpx_lt_f16_e32 v255.h, v2.h +// GFX12: :[[@LINE-1]]:19: error: invalid operand for instruction + +v_cmpx_lt_f16_e32 v255.h, v2.h dpp8:[7,6,5,4,3,2,1,0] +// GFX12: :[[@LINE-1]]:19: error: invalid operand for instruction + +v_cmpx_lt_f16_e32 v255.h, v2.h quad_perm:[3,2,1,0] +// GFX12: 
:[[@LINE-1]]:19: error: invalid operand for instruction + +v_cmpx_lt_f16_e32 v255.l, v2.l +// GFX12: :[[@LINE-1]]:19: error: invalid operand for instruction + +v_cmpx_lt_f16_e32 v255.l, v2.l dpp8:[7,6,5,4,3,2,1,0] +// GFX12: :[[@LINE-1]]:19: error: invalid operand for instruction + +v_cmpx_lt_f16_e32 v255.l, v2.l quad_perm:[3,2,1,0] +// GFX12: :[[@LINE-1]]:19: error: invalid operand for instruction v_cmpx_lt_i16_e32 v1, v255 // GFX12: :[[@LINE-1]]:1: error: operands are not valid for this GPU or mode diff --git a/llvm/test/MC/AMDGPU/gfx12_asm_vopcx_t16_promote.s b/llvm/test/MC/AMDGPU/gfx12_asm_vopcx_t16_promote.s index 4f462861e3a0b..f3278c826475a 100644 --- a/llvm/test/MC/AMDGPU/gfx12_asm_vopcx_t16_promote.s +++ b/llvm/test/MC/AMDGPU/gfx12_asm_vopcx_t16_promote.s @@ -1,4 +1,4 @@ -; NOTE: Assertions have been autogenerated by utils/update_mc_test_checks.py UTC_ARGS: --sort --version 5 +// NOTE: Assertions have been autogenerated by utils/update_mc_test_checks.py UTC_ARGS: --sort --version 5 // RUN: llvm-mc -triple=amdgcn -mcpu=gfx1200 -mattr=+wavefrontsize64,+real-true16 -show-encoding %s 2>&1 | FileCheck --check-prefix=GFX12 %s v_cmpx_class_f16 v1, v255 @@ -253,23 +253,41 @@ v_cmpx_lg_f16 v255, v2 dpp8:[7,6,5,4,3,2,1,0] v_cmpx_lg_f16 v255, v2 quad_perm:[3,2,1,0] // GFX12: v_cmpx_lg_f16_e64_dpp v255, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x85,0xd4,0xfa,0x04,0x02,0x00,0xff,0x1b,0x00,0xff] -v_cmpx_lt_f16 v1, v255 -// GFX12: v_cmpx_lt_f16_e64 v1, v255 ; encoding: [0x7e,0x00,0x81,0xd4,0x01,0xff,0x03,0x00] +v_cmpx_lt_f16 v1.h, v255.h +// GFX12: v_cmpx_lt_f16_e64 v1.h, v255.h ; encoding: [0x7e,0x18,0x81,0xd4,0x01,0xff,0x03,0x00] -v_cmpx_lt_f16 v1, v255 dpp8:[7,6,5,4,3,2,1,0] -// GFX12: v_cmpx_lt_f16_e64_dpp v1, v255 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x00,0x81,0xd4,0xe9,0xfe,0x03,0x00,0x01,0x77,0x39,0x05] +v_cmpx_lt_f16 v1.h, v255.h dpp8:[7,6,5,4,3,2,1,0] +// GFX12: v_cmpx_lt_f16_e64_dpp v1.h, v255.h op_sel:[1,1] 
dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x18,0x81,0xd4,0xe9,0xfe,0x03,0x00,0x01,0x77,0x39,0x05] -v_cmpx_lt_f16 v1, v255 quad_perm:[3,2,1,0] -// GFX12: v_cmpx_lt_f16_e64_dpp v1, v255 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x81,0xd4,0xfa,0xfe,0x03,0x00,0x01,0x1b,0x00,0xff] +v_cmpx_lt_f16 v1.h, v255.h quad_perm:[3,2,1,0] +// GFX12: v_cmpx_lt_f16_e64_dpp v1.h, v255.h op_sel:[1,1] quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x18,0x81,0xd4,0xfa,0xfe,0x03,0x00,0x01,0x1b,0x00,0xff] -v_cmpx_lt_f16 v255, v2 -// GFX12: v_cmpx_lt_f16_e64 v255, v2 ; encoding: [0x7e,0x00,0x81,0xd4,0xff,0x05,0x02,0x00] +v_cmpx_lt_f16 v1.l, v255.l +// GFX12: v_cmpx_lt_f16_e64 v1.l, v255.l ; encoding: [0x7e,0x00,0x81,0xd4,0x01,0xff,0x03,0x00] -v_cmpx_lt_f16 v255, v2 dpp8:[7,6,5,4,3,2,1,0] -// GFX12: v_cmpx_lt_f16_e64_dpp v255, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x00,0x81,0xd4,0xe9,0x04,0x02,0x00,0xff,0x77,0x39,0x05] +v_cmpx_lt_f16 v1.l, v255.l dpp8:[7,6,5,4,3,2,1,0] +// GFX12: v_cmpx_lt_f16_e64_dpp v1.l, v255.l dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x00,0x81,0xd4,0xe9,0xfe,0x03,0x00,0x01,0x77,0x39,0x05] -v_cmpx_lt_f16 v255, v2 quad_perm:[3,2,1,0] -// GFX12: v_cmpx_lt_f16_e64_dpp v255, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x81,0xd4,0xfa,0x04,0x02,0x00,0xff,0x1b,0x00,0xff] +v_cmpx_lt_f16 v1.l, v255.l quad_perm:[3,2,1,0] +// GFX12: v_cmpx_lt_f16_e64_dpp v1.l, v255.l quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x81,0xd4,0xfa,0xfe,0x03,0x00,0x01,0x1b,0x00,0xff] + +v_cmpx_lt_f16 v255.h, v2.h +// GFX12: v_cmpx_lt_f16_e64 v255.h, v2.h ; encoding: [0x7e,0x18,0x81,0xd4,0xff,0x05,0x02,0x00] + +v_cmpx_lt_f16 v255.h, v2.h dpp8:[7,6,5,4,3,2,1,0] +// GFX12: v_cmpx_lt_f16_e64_dpp v255.h, v2.h op_sel:[1,1] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x18,0x81,0xd4,0xe9,0x04,0x02,0x00,0xff,0x77,0x39,0x05] + +v_cmpx_lt_f16 v255.h, v2.h quad_perm:[3,2,1,0] +// GFX12: v_cmpx_lt_f16_e64_dpp v255.h, 
v2.h op_sel:[1,1] quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x18,0x81,0xd4,0xfa,0x04,0x02,0x00,0xff,0x1b,0x00,0xff] + +v_cmpx_lt_f16 v255.l, v2.l +// GFX12: v_cmpx_lt_f16_e64 v255.l, v2.l ; encoding: [0x7e,0x00,0x81,0xd4,0xff,0x05,0x02,0x00] + +v_cmpx_lt_f16 v255.l, v2.l dpp8:[7,6,5,4,3,2,1,0] +// GFX12: v_cmpx_lt_f16_e64_dpp v255.l, v2.l dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x00,0x81,0xd4,0xe9,0x04,0x02,0x00,0xff,0x77,0x39,0x05] + +v_cmpx_lt_f16 v255.l, v2.l quad_perm:[3,2,1,0] +// GFX12: v_cmpx_lt_f16_e64_dpp v255.l, v2.l quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x81,0xd4,0xfa,0x04,0x02,0x00,0xff,0x1b,0x00,0xff] v_cmpx_lt_i16 v1, v255 // GFX12: v_cmpx_lt_i16_e64 v1, v255 ; encoding: [0x7e,0x00,0xb1,0xd4,0x01,0xff,0x03,0x00] diff --git a/llvm/test/MC/Disassembler/AMDGPU/gfx11_dasm_vop3.txt b/llvm/test/MC/Disassembler/AMDGPU/gfx11_dasm_vop3.txt index adcca58776100..05174e3128919 100644 --- a/llvm/test/MC/Disassembler/AMDGPU/gfx11_dasm_vop3.txt +++ b/llvm/test/MC/Disassembler/AMDGPU/gfx11_dasm_vop3.txt @@ -1054,55 +1054,100 @@ # GFX11: v_bfm_b32 v255, 0xaf123456, vcc_hi ; encoding: [0xff,0x00,0x1d,0xd7,0xff,0xd6,0x00,0x00,0x56,0x34,0x12,0xaf] 0x05,0x00,0x5d,0xd6,0x01,0xfb,0x19,0x00 -# W32: v_cndmask_b16 v5, v1, src_scc, s6 ; encoding: [0x05,0x00,0x5d,0xd6,0x01,0xfb,0x19,0x00] -# W64: v_cndmask_b16 v5, v1, src_scc, s[6:7] ; encoding: [0x05,0x00,0x5d,0xd6,0x01,0xfb,0x19,0x00] +# W32-REAL16: v_cndmask_b16 v5.l, v1.l, src_scc, s6 ; encoding: [0x05,0x00,0x5d,0xd6,0x01,0xfb,0x19,0x00] +# W32-FAKE16: v_cndmask_b16 v5, v1, src_scc, s6 ; encoding: [0x05,0x00,0x5d,0xd6,0x01,0xfb,0x19,0x00] +# W64-REAL16: v_cndmask_b16 v5.l, v1.l, src_scc, s[6:7] ; encoding: [0x05,0x00,0x5d,0xd6,0x01,0xfb,0x19,0x00] +# W64-FAKE16: v_cndmask_b16 v5, v1, src_scc, s[6:7] ; encoding: [0x05,0x00,0x5d,0xd6,0x01,0xfb,0x19,0x00] 0x05,0x00,0x5d,0xd6,0xff,0xe1,0x19,0x00 -# W32: v_cndmask_b16 v5, v255, 0x3800, s6 ; encoding: 
[0x05,0x00,0x5d,0xd6,0xff,0xff,0x19,0x00,0x00,0x38,0x00,0x00] -# W64: v_cndmask_b16 v5, v255, 0x3800, s[6:7] ; encoding: [0x05,0x00,0x5d,0xd6,0xff,0xff,0x19,0x00,0x00,0x38,0x00,0x00] +# W32-REAL16: v_cndmask_b16 v5.l, v255.l, 0x3800, s6 ; encoding: [0x05,0x00,0x5d,0xd6,0xff,0xff,0x19,0x00,0x00,0x38,0x00,0x00] +# W32-FAKE16: v_cndmask_b16 v5, v255, 0x3800, s6 ; encoding: [0x05,0x00,0x5d,0xd6,0xff,0xff,0x19,0x00,0x00,0x38,0x00,0x00] +# W64-REAL16: v_cndmask_b16 v5.l, v255.l, 0x3800, s[6:7] ; encoding: [0x05,0x00,0x5d,0xd6,0xff,0xff,0x19,0x00,0x00,0x38,0x00,0x00] +# W64-FAKE16: v_cndmask_b16 v5, v255, 0x3800, s[6:7] ; encoding: [0x05,0x00,0x5d,0xd6,0xff,0xff,0x19,0x00,0x00,0x38,0x00,0x00] 0x05,0x00,0x5d,0xd6,0x69,0xd2,0x18,0x00 -# W32: v_cndmask_b16 v5, s105, s105, s6 ; encoding: [0x05,0x00,0x5d,0xd6,0x69,0xd2,0x18,0x00] -# W64: v_cndmask_b16 v5, s105, s105, s[6:7] ; encoding: [0x05,0x00,0x5d,0xd6,0x69,0xd2,0x18,0x00] +# W32-REAL16: v_cndmask_b16 v5.l, s105, s105, s6 ; encoding: [0x05,0x00,0x5d,0xd6,0x69,0xd2,0x18,0x00] +# W32-FAKE16: v_cndmask_b16 v5, s105, s105, s6 ; encoding: [0x05,0x00,0x5d,0xd6,0x69,0xd2,0x18,0x00] +# W64-REAL16: v_cndmask_b16 v5.l, s105, s105, s[6:7] ; encoding: [0x05,0x00,0x5d,0xd6,0x69,0xd2,0x18,0x00] +# W64-FAKE16: v_cndmask_b16 v5, s105, s105, s[6:7] ; encoding: [0x05,0x00,0x5d,0xd6,0x69,0xd2,0x18,0x00] 0x05,0x00,0x5d,0xd6,0x6b,0x04,0x1a,0x00 -# W32: v_cndmask_b16 v5, vcc_hi, v2, s6 ; encoding: [0x05,0x00,0x5d,0xd6,0x6b,0x04,0x1a,0x00] -# W64: v_cndmask_b16 v5, vcc_hi, v2, s[6:7] ; encoding: [0x05,0x00,0x5d,0xd6,0x6b,0x04,0x1a,0x00] +# W32-REAL16: v_cndmask_b16 v5.l, vcc_hi, v2.l, s6 ; encoding: [0x05,0x00,0x5d,0xd6,0x6b,0x04,0x1a,0x00] +# W32-FAKE16: v_cndmask_b16 v5, vcc_hi, v2, s6 ; encoding: [0x05,0x00,0x5d,0xd6,0x6b,0x04,0x1a,0x00] +# W64-REAL16: v_cndmask_b16 v5.l, vcc_hi, v2.l, s[6:7] ; encoding: [0x05,0x00,0x5d,0xd6,0x6b,0x04,0x1a,0x00] +# W64-FAKE16: v_cndmask_b16 v5, vcc_hi, v2, s[6:7] ; encoding: 
[0x05,0x00,0x5d,0xd6,0x6b,0x04,0x1a,0x00] 0x05,0x00,0x5d,0xd6,0x7b,0xf6,0x18,0x00 -# W32: v_cndmask_b16 v5, ttmp15, ttmp15, s6 ; encoding: [0x05,0x00,0x5d,0xd6,0x7b,0xf6,0x18,0x00] -# W64: v_cndmask_b16 v5, ttmp15, ttmp15, s[6:7] ; encoding: [0x05,0x00,0x5d,0xd6,0x7b,0xf6,0x18,0x00] +# W32-REAL16: v_cndmask_b16 v5.l, ttmp15, ttmp15, s6 ; encoding: [0x05,0x00,0x5d,0xd6,0x7b,0xf6,0x18,0x00] +# W32-FAKE16: v_cndmask_b16 v5, ttmp15, ttmp15, s6 ; encoding: [0x05,0x00,0x5d,0xd6,0x7b,0xf6,0x18,0x00] +# W64-REAL16: v_cndmask_b16 v5.l, ttmp15, ttmp15, s[6:7] ; encoding: [0x05,0x00,0x5d,0xd6,0x7b,0xf6,0x18,0x00] +# W64-FAKE16: v_cndmask_b16 v5, ttmp15, ttmp15, s[6:7] ; encoding: [0x05,0x00,0x5d,0xd6,0x7b,0xf6,0x18,0x00] 0x05,0x00,0x5d,0xd6,0x7d,0xfe,0x1b,0x00 -# W32: v_cndmask_b16 v5, m0, v255, s6 ; encoding: [0x05,0x00,0x5d,0xd6,0x7d,0xfe,0x1b,0x00] -# W64: v_cndmask_b16 v5, m0, v255, s[6:7] ; encoding: [0x05,0x00,0x5d,0xd6,0x7d,0xfe,0x1b,0x00] +# W32-REAL16: v_cndmask_b16 v5.l, m0, v255.l, s6 ; encoding: [0x05,0x00,0x5d,0xd6,0x7d,0xfe,0x1b,0x00] +# W32-FAKE16: v_cndmask_b16 v5, m0, v255, s6 ; encoding: [0x05,0x00,0x5d,0xd6,0x7d,0xfe,0x1b,0x00] +# W64-REAL16: v_cndmask_b16 v5.l, m0, v255.l, s[6:7] ; encoding: [0x05,0x00,0x5d,0xd6,0x7d,0xfe,0x1b,0x00] +# W64-FAKE16: v_cndmask_b16 v5, m0, v255, s[6:7] ; encoding: [0x05,0x00,0x5d,0xd6,0x7d,0xfe,0x1b,0x00] 0x05,0x00,0x5d,0xd6,0x7e,0xfc,0x18,0x00 -# W32: v_cndmask_b16 v5, exec_lo, exec_lo, s6 ; encoding: [0x05,0x00,0x5d,0xd6,0x7e,0xfc,0x18,0x00] -# W64: v_cndmask_b16 v5, exec_lo, exec_lo, s[6:7] ; encoding: [0x05,0x00,0x5d,0xd6,0x7e,0xfc,0x18,0x00] +# W32-REAL16: v_cndmask_b16 v5.l, exec_lo, exec_lo, s6 ; encoding: [0x05,0x00,0x5d,0xd6,0x7e,0xfc,0x18,0x00] +# W32-FAKE16: v_cndmask_b16 v5, exec_lo, exec_lo, s6 ; encoding: [0x05,0x00,0x5d,0xd6,0x7e,0xfc,0x18,0x00] +# W64-REAL16: v_cndmask_b16 v5.l, exec_lo, exec_lo, s[6:7] ; encoding: [0x05,0x00,0x5d,0xd6,0x7e,0xfc,0x18,0x00] +# W64-FAKE16: v_cndmask_b16 v5, exec_lo, exec_lo, 
s[6:7] ; encoding: [0x05,0x00,0x5d,0xd6,0x7e,0xfc,0x18,0x00] 0x05,0x00,0x5d,0xd6,0x7f,0xfe,0x18,0x00 -# W32: v_cndmask_b16 v5, exec_hi, exec_hi, s6 ; encoding: [0x05,0x00,0x5d,0xd6,0x7f,0xfe,0x18,0x00] -# W64: v_cndmask_b16 v5, exec_hi, exec_hi, s[6:7] ; encoding: [0x05,0x00,0x5d,0xd6,0x7f,0xfe,0x18,0x00] +# W32-REAL16: v_cndmask_b16 v5.l, exec_hi, exec_hi, s6 ; encoding: [0x05,0x00,0x5d,0xd6,0x7f,0xfe,0x18,0x00] +# W32-FAKE16: v_cndmask_b16 v5, exec_hi, exec_hi, s6 ; encoding: [0x05,0x00,0x5d,0xd6,0x7f,0xfe,0x18,0x00] +# W64-REAL16: v_cndmask_b16 v5.l, exec_hi, exec_hi, s[6:7] ; encoding: [0x05,0x00,0x5d,0xd6,0x7f,0xfe,0x18,0x00] +# W64-FAKE16: v_cndmask_b16 v5, exec_hi, exec_hi, s[6:7] ; encoding: [0x05,0x00,0x5d,0xd6,0x7f,0xfe,0x18,0x00] 0x05,0x00,0x5d,0xd6,0x7c,0xfa,0x18,0x00 -# W32: v_cndmask_b16 v5, null, m0, s6 ; encoding: [0x05,0x00,0x5d,0xd6,0x7c,0xfa,0x18,0x00] -# W64: v_cndmask_b16 v5, null, m0, s[6:7] ; encoding: [0x05,0x00,0x5d,0xd6,0x7c,0xfa,0x18,0x00] +# W32-REAL16: v_cndmask_b16 v5.l, null, m0, s6 ; encoding: [0x05,0x00,0x5d,0xd6,0x7c,0xfa,0x18,0x00] +# W32-FAKE16: v_cndmask_b16 v5, null, m0, s6 ; encoding: [0x05,0x00,0x5d,0xd6,0x7c,0xfa,0x18,0x00] +# W64-REAL16: v_cndmask_b16 v5.l, null, m0, s[6:7] ; encoding: [0x05,0x00,0x5d,0xd6,0x7c,0xfa,0x18,0x00] +# W64-FAKE16: v_cndmask_b16 v5, null, m0, s[6:7] ; encoding: [0x05,0x00,0x5d,0xd6,0x7c,0xfa,0x18,0x00] 0x05,0x02,0x5d,0xd6,0xc1,0xd4,0xa0,0x41 -# W32: v_cndmask_b16 v5, -1, -|vcc_lo|, s104 ; encoding: [0x05,0x02,0x5d,0xd6,0xc1,0xd4,0xa0,0x41] -# W64: v_cndmask_b16 v5, -1, -|vcc_lo|, s[104:105] ; encoding: [0x05,0x02,0x5d,0xd6,0xc1,0xd4,0xa0,0x41] +# W32-REAL16: v_cndmask_b16 v5.l, -1, -|vcc_lo|, s104 ; encoding: [0x05,0x02,0x5d,0xd6,0xc1,0xd4,0xa0,0x41] +# W32-FAKE16: v_cndmask_b16 v5, -1, -|vcc_lo|, s104 ; encoding: [0x05,0x02,0x5d,0xd6,0xc1,0xd4,0xa0,0x41] +# W64-REAL16: v_cndmask_b16 v5.l, -1, -|vcc_lo|, s[104:105] ; encoding: [0x05,0x02,0x5d,0xd6,0xc1,0xd4,0xa0,0x41] +# W64-FAKE16: v_cndmask_b16 
v5, -1, -|vcc_lo|, s[104:105] ; encoding: [0x05,0x02,0x5d,0xd6,0xc1,0xd4,0xa0,0x41] 0x05,0x00,0x5d,0xd6,0xf0,0x82,0xa9,0x01 -# W32: v_cndmask_b16 v5, 0x3800, -1, vcc_lo ; encoding: [0x05,0x00,0x5d,0xd6,0xff,0x82,0xa9,0x01,0x00,0x38,0x00,0x00] -# W64: v_cndmask_b16 v5, 0x3800, -1, vcc ; encoding: [0x05,0x00,0x5d,0xd6,0xff,0x82,0xa9,0x01,0x00,0x38,0x00,0x00] +# W32-REAL16: v_cndmask_b16 v5.l, 0x3800, -1, vcc_lo ; encoding: [0x05,0x00,0x5d,0xd6,0xff,0x82,0xa9,0x01,0x00,0x38,0x00,0x00] +# W32-FAKE16: v_cndmask_b16 v5, 0x3800, -1, vcc_lo ; encoding: [0x05,0x00,0x5d,0xd6,0xff,0x82,0xa9,0x01,0x00,0x38,0x00,0x00] +# W64-REAL16: v_cndmask_b16 v5.l, 0x3800, -1, vcc ; encoding: [0x05,0x00,0x5d,0xd6,0xff,0x82,0xa9,0x01,0x00,0x38,0x00,0x00] +# W64-FAKE16: v_cndmask_b16 v5, 0x3800, -1, vcc ; encoding: [0x05,0x00,0x5d,0xd6,0xff,0x82,0xa9,0x01,0x00,0x38,0x00,0x00] 0x05,0x01,0x5d,0xd6,0xfd,0xf8,0xe8,0x21 -# W32: v_cndmask_b16 v5, -|src_scc|, null, ttmp14 ; encoding: [0x05,0x01,0x5d,0xd6,0xfd,0xf8,0xe8,0x21] -# W64: v_cndmask_b16 v5, -|src_scc|, null, ttmp[14:15] ; encoding: [0x05,0x01,0x5d,0xd6,0xfd,0xf8,0xe8,0x21] +# W32-REAL16: v_cndmask_b16 v5.l, -|src_scc|, null, ttmp14 ; encoding: [0x05,0x01,0x5d,0xd6,0xfd,0xf8,0xe8,0x21] +# W32-FAKE16: v_cndmask_b16 v5, -|src_scc|, null, ttmp14 ; encoding: [0x05,0x01,0x5d,0xd6,0xfd,0xf8,0xe8,0x21] +# W64-REAL16: v_cndmask_b16 v5.l, -|src_scc|, null, ttmp[14:15] ; encoding: [0x05,0x01,0x5d,0xd6,0xfd,0xf8,0xe8,0x21] +# W64-FAKE16: v_cndmask_b16 v5, -|src_scc|, null, ttmp[14:15] ; encoding: [0x05,0x01,0x5d,0xd6,0xfd,0xf8,0xe8,0x21] 0xff,0x03,0x5d,0xd6,0xff,0xd6,0xf0,0x61,0x0b,0xfe,0x00,0x00 -# GFX11: v_cndmask_b16 v255, -|0xfe0b|, -|vcc_hi|, null ; encoding: [0xff,0x03,0x5d,0xd6,0xff,0xd6,0xf0,0x61,0x0b,0xfe,0x00,0x00] +# W32-REAL16: v_cndmask_b16 v255.l, -|0xfe0b|, -|vcc_hi|, null ; encoding: [0xff,0x03,0x5d,0xd6,0xff,0xd6,0xf0,0x61,0x0b,0xfe,0x00,0x00] +# W32-FAKE16: v_cndmask_b16 v255, -|0xfe0b|, -|vcc_hi|, null ; encoding: 
[0xff,0x03,0x5d,0xd6,0xff,0xd6,0xf0,0x61,0x0b,0xfe,0x00,0x00] +# W64-REAL16: v_cndmask_b16 v255.l, -|0xfe0b|, -|vcc_hi|, null ; encoding: [0xff,0x03,0x5d,0xd6,0xff,0xd6,0xf0,0x61,0x0b,0xfe,0x00,0x00] +# W64-FAKE16: v_cndmask_b16 v255, -|0xfe0b|, -|vcc_hi|, null ; encoding: [0xff,0x03,0x5d,0xd6,0xff,0xd6,0xf0,0x61,0x0b,0xfe,0x00,0x00] + +0x05,0x08,0x5d,0xd6,0xff,0xe1,0x19,0x00 +# W32-REAL16: v_cndmask_b16 v5.l, v255.h, 0x3800, s6 ; encoding: [0x05,0x08,0x5d,0xd6,0xff,0xff,0x19,0x00,0x00,0x38,0x00,0x00] +# W32-FAKE16: v_cndmask_b16 v5, v255, 0x3800, s6 ; encoding: [0x05,0x00,0x5d,0xd6,0xff,0xff,0x19,0x00,0x00,0x38,0x00,0x00] +# W64-REAL16: v_cndmask_b16 v5.l, v255.h, 0x3800, s[6:7] ; encoding: [0x05,0x08,0x5d,0xd6,0xff,0xff,0x19,0x00,0x00,0x38,0x00,0x00] +# W64-FAKE16: v_cndmask_b16 v5, v255, 0x3800, s[6:7] ; encoding: [0x05,0x00,0x5d,0xd6,0xff,0xff,0x19,0x00,0x00,0x38,0x00,0x00] + +0x05,0x10,0x5d,0xd6,0x7d,0xfe,0x1b,0x00 +# W32-REAL16: v_cndmask_b16 v5.l, m0, v255.h, s6 ; encoding: [0x05,0x10,0x5d,0xd6,0x7d,0xfe,0x1b,0x00] +# W32-FAKE16: v_cndmask_b16 v5, m0, v255, s6 ; encoding: [0x05,0x00,0x5d,0xd6,0x7d,0xfe,0x1b,0x00] +# W64-REAL16: v_cndmask_b16 v5.l, m0, v255.h, s[6:7] ; encoding: [0x05,0x10,0x5d,0xd6,0x7d,0xfe,0x1b,0x00] +# W64-FAKE16: v_cndmask_b16 v5, m0, v255, s[6:7] ; encoding: [0x05,0x00,0x5d,0xd6,0x7d,0xfe,0x1b,0x00] + +0xff,0x43,0x5d,0xd6,0xff,0xd6,0xf0,0x61,0x0b,0xfe,0x00,0x00 +# W32-REAL16: v_cndmask_b16 v255.h, -|0xfe0b|, -|vcc_hi|, null ; encoding: [0xff,0x43,0x5d,0xd6,0xff,0xd6,0xf0,0x61,0x0b,0xfe,0x00,0x00] +# W32-FAKE16: v_cndmask_b16 v255, -|0xfe0b|, -|vcc_hi|, null ; encoding: [0xff,0x03,0x5d,0xd6,0xff,0xd6,0xf0,0x61,0x0b,0xfe,0x00,0x00] +# W64-REAL16: v_cndmask_b16 v255.h, -|0xfe0b|, -|vcc_hi|, null ; encoding: [0xff,0x43,0x5d,0xd6,0xff,0xd6,0xf0,0x61,0x0b,0xfe,0x00,0x00] +# W64-FAKE16: v_cndmask_b16 v255, -|0xfe0b|, -|vcc_hi|, null ; encoding: [0xff,0x03,0x5d,0xd6,0xff,0xd6,0xf0,0x61,0x0b,0xfe,0x00,0x00] 
0x05,0x00,0x0c,0xd6,0x01,0x05,0x0e,0x00 # GFX11: v_cubeid_f32 v5, v1, v2, s3 ; encoding: [0x05,0x00,0x0c,0xd6,0x01,0x05,0x0e,0x00] diff --git a/llvm/test/MC/Disassembler/AMDGPU/gfx11_dasm_vop3_dpp16.txt b/llvm/test/MC/Disassembler/AMDGPU/gfx11_dasm_vop3_dpp16.txt index 2964360a77fd2..c9ef3c714213d 100644 --- a/llvm/test/MC/Disassembler/AMDGPU/gfx11_dasm_vop3_dpp16.txt +++ b/llvm/test/MC/Disassembler/AMDGPU/gfx11_dasm_vop3_dpp16.txt @@ -738,65 +738,118 @@ # GFX11: v_bfm_b32_e64_dpp v255, v255, v255 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xff,0x00,0x1d,0xd7,0xfa,0xfe,0x03,0x00,0xff,0x6f,0x0d,0x30] 0x05,0x00,0x5d,0xd6,0xfa,0x04,0x1a,0x00,0x01,0x1b,0x00,0xff -# W32: v_cndmask_b16_e64_dpp v5, v1, v2, s6 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x5d,0xd6,0xfa,0x04,0x1a,0x00,0x01,0x1b,0x00,0xff] -# W64: v_cndmask_b16_e64_dpp v5, v1, v2, s[6:7] quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x5d,0xd6,0xfa,0x04,0x1a,0x00,0x01,0x1b,0x00,0xff] +# W32-REAL16: v_cndmask_b16_e64_dpp v5.l, v1.l, v2.l, s6 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x5d,0xd6,0xfa,0x04,0x1a,0x00,0x01,0x1b,0x00,0xff] +# W32-FAKE16: v_cndmask_b16_e64_dpp v5, v1, v2, s6 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x5d,0xd6,0xfa,0x04,0x1a,0x00,0x01,0x1b,0x00,0xff] +# W64-REAL16: v_cndmask_b16_e64_dpp v5.l, v1.l, v2.l, s[6:7] quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x5d,0xd6,0xfa,0x04,0x1a,0x00,0x01,0x1b,0x00,0xff] +# W64-FAKE16: v_cndmask_b16_e64_dpp v5, v1, v2, s[6:7] quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x5d,0xd6,0xfa,0x04,0x1a,0x00,0x01,0x1b,0x00,0xff] 0x05,0x00,0x5d,0xd6,0xfa,0x04,0x1a,0x00,0x01,0xe4,0x00,0xff -# W32: v_cndmask_b16_e64_dpp v5, v1, v2, s6 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x5d,0xd6,0xfa,0x04,0x1a,0x00,0x01,0xe4,0x00,0xff] -# W64: v_cndmask_b16_e64_dpp 
v5, v1, v2, s[6:7] quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x5d,0xd6,0xfa,0x04,0x1a,0x00,0x01,0xe4,0x00,0xff] +# W32-REAL16: v_cndmask_b16_e64_dpp v5.l, v1.l, v2.l, s6 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x5d,0xd6,0xfa,0x04,0x1a,0x00,0x01,0xe4,0x00,0xff] +# W32-FAKE16: v_cndmask_b16_e64_dpp v5, v1, v2, s6 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x5d,0xd6,0xfa,0x04,0x1a,0x00,0x01,0xe4,0x00,0xff] +# W64-REAL16: v_cndmask_b16_e64_dpp v5.l, v1.l, v2.l, s[6:7] quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x5d,0xd6,0xfa,0x04,0x1a,0x00,0x01,0xe4,0x00,0xff] +# W64-FAKE16: v_cndmask_b16_e64_dpp v5, v1, v2, s[6:7] quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x5d,0xd6,0xfa,0x04,0x1a,0x00,0x01,0xe4,0x00,0xff] 0x05,0x00,0x5d,0xd6,0xfa,0x04,0x1a,0x00,0x01,0x40,0x01,0xff -# W32: v_cndmask_b16_e64_dpp v5, v1, v2, s6 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x5d,0xd6,0xfa,0x04,0x1a,0x00,0x01,0x40,0x01,0xff] -# W64: v_cndmask_b16_e64_dpp v5, v1, v2, s[6:7] row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x5d,0xd6,0xfa,0x04,0x1a,0x00,0x01,0x40,0x01,0xff] +# W32-REAL16: v_cndmask_b16_e64_dpp v5.l, v1.l, v2.l, s6 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x5d,0xd6,0xfa,0x04,0x1a,0x00,0x01,0x40,0x01,0xff] +# W32-FAKE16: v_cndmask_b16_e64_dpp v5, v1, v2, s6 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x5d,0xd6,0xfa,0x04,0x1a,0x00,0x01,0x40,0x01,0xff] +# W64-REAL16: v_cndmask_b16_e64_dpp v5.l, v1.l, v2.l, s[6:7] row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x5d,0xd6,0xfa,0x04,0x1a,0x00,0x01,0x40,0x01,0xff] +# W64-FAKE16: v_cndmask_b16_e64_dpp v5, v1, v2, s[6:7] row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x5d,0xd6,0xfa,0x04,0x1a,0x00,0x01,0x40,0x01,0xff] 0x05,0x00,0x5d,0xd6,0xfa,0x04,0x1a,0x00,0x01,0x41,0x01,0xff -# W32: v_cndmask_b16_e64_dpp v5, 
v1, v2, s6 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x5d,0xd6,0xfa,0x04,0x1a,0x00,0x01,0x41,0x01,0xff] -# W64: v_cndmask_b16_e64_dpp v5, v1, v2, s[6:7] row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x5d,0xd6,0xfa,0x04,0x1a,0x00,0x01,0x41,0x01,0xff] +# W32-REAL16: v_cndmask_b16_e64_dpp v5.l, v1.l, v2.l, s6 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x5d,0xd6,0xfa,0x04,0x1a,0x00,0x01,0x41,0x01,0xff] +# W32-FAKE16: v_cndmask_b16_e64_dpp v5, v1, v2, s6 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x5d,0xd6,0xfa,0x04,0x1a,0x00,0x01,0x41,0x01,0xff] +# W64-REAL16: v_cndmask_b16_e64_dpp v5.l, v1.l, v2.l, s[6:7] row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x5d,0xd6,0xfa,0x04,0x1a,0x00,0x01,0x41,0x01,0xff] +# W64-FAKE16: v_cndmask_b16_e64_dpp v5, v1, v2, s[6:7] row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x5d,0xd6,0xfa,0x04,0x1a,0x00,0x01,0x41,0x01,0xff] 0x05,0x00,0x5d,0xd6,0xfa,0x04,0x1a,0x00,0x01,0x01,0x01,0xff -# W32: v_cndmask_b16_e64_dpp v5, v1, v2, s6 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x5d,0xd6,0xfa,0x04,0x1a,0x00,0x01,0x01,0x01,0xff] -# W64: v_cndmask_b16_e64_dpp v5, v1, v2, s[6:7] row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x5d,0xd6,0xfa,0x04,0x1a,0x00,0x01,0x01,0x01,0xff] +# W32-REAL16: v_cndmask_b16_e64_dpp v5.l, v1.l, v2.l, s6 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x5d,0xd6,0xfa,0x04,0x1a,0x00,0x01,0x01,0x01,0xff] +# W32-FAKE16: v_cndmask_b16_e64_dpp v5, v1, v2, s6 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x5d,0xd6,0xfa,0x04,0x1a,0x00,0x01,0x01,0x01,0xff] +# W64-REAL16: v_cndmask_b16_e64_dpp v5.l, v1.l, v2.l, s[6:7] row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x5d,0xd6,0xfa,0x04,0x1a,0x00,0x01,0x01,0x01,0xff] +# W64-FAKE16: v_cndmask_b16_e64_dpp v5, v1, v2, s[6:7] row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: 
[0x05,0x00,0x5d,0xd6,0xfa,0x04,0x1a,0x00,0x01,0x01,0x01,0xff] 0x05,0x00,0x5d,0xd6,0xfa,0x04,0x1a,0x00,0x01,0x0f,0x01,0xff -# W32: v_cndmask_b16_e64_dpp v5, v1, v2, s6 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x5d,0xd6,0xfa,0x04,0x1a,0x00,0x01,0x0f,0x01,0xff] -# W64: v_cndmask_b16_e64_dpp v5, v1, v2, s[6:7] row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x5d,0xd6,0xfa,0x04,0x1a,0x00,0x01,0x0f,0x01,0xff] +# W32-REAL16: v_cndmask_b16_e64_dpp v5.l, v1.l, v2.l, s6 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x5d,0xd6,0xfa,0x04,0x1a,0x00,0x01,0x0f,0x01,0xff] +# W32-FAKE16: v_cndmask_b16_e64_dpp v5, v1, v2, s6 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x5d,0xd6,0xfa,0x04,0x1a,0x00,0x01,0x0f,0x01,0xff] +# W64-REAL16: v_cndmask_b16_e64_dpp v5.l, v1.l, v2.l, s[6:7] row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x5d,0xd6,0xfa,0x04,0x1a,0x00,0x01,0x0f,0x01,0xff] +# W64-FAKE16: v_cndmask_b16_e64_dpp v5, v1, v2, s[6:7] row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x5d,0xd6,0xfa,0x04,0x1a,0x00,0x01,0x0f,0x01,0xff] 0x05,0x00,0x5d,0xd6,0xfa,0x04,0x1a,0x00,0x01,0x11,0x01,0xff -# W32: v_cndmask_b16_e64_dpp v5, v1, v2, s6 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x5d,0xd6,0xfa,0x04,0x1a,0x00,0x01,0x11,0x01,0xff] -# W64: v_cndmask_b16_e64_dpp v5, v1, v2, s[6:7] row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x5d,0xd6,0xfa,0x04,0x1a,0x00,0x01,0x11,0x01,0xff] +# W32-REAL16: v_cndmask_b16_e64_dpp v5.l, v1.l, v2.l, s6 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x5d,0xd6,0xfa,0x04,0x1a,0x00,0x01,0x11,0x01,0xff] +# W32-FAKE16: v_cndmask_b16_e64_dpp v5, v1, v2, s6 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x5d,0xd6,0xfa,0x04,0x1a,0x00,0x01,0x11,0x01,0xff] +# W64-REAL16: v_cndmask_b16_e64_dpp v5.l, v1.l, v2.l, s[6:7] row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x5d,0xd6,0xfa,0x04,0x1a,0x00,0x01,0x11,0x01,0xff] +# 
W64-FAKE16: v_cndmask_b16_e64_dpp v5, v1, v2, s[6:7] row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x5d,0xd6,0xfa,0x04,0x1a,0x00,0x01,0x11,0x01,0xff] 0x05,0x00,0x5d,0xd6,0xfa,0x04,0x1a,0x00,0x01,0x1f,0x01,0xff -# W32: v_cndmask_b16_e64_dpp v5, v1, v2, s6 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x5d,0xd6,0xfa,0x04,0x1a,0x00,0x01,0x1f,0x01,0xff] -# W64: v_cndmask_b16_e64_dpp v5, v1, v2, s[6:7] row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x5d,0xd6,0xfa,0x04,0x1a,0x00,0x01,0x1f,0x01,0xff] +# W32-REAL16: v_cndmask_b16_e64_dpp v5.l, v1.l, v2.l, s6 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x5d,0xd6,0xfa,0x04,0x1a,0x00,0x01,0x1f,0x01,0xff] +# W32-FAKE16: v_cndmask_b16_e64_dpp v5, v1, v2, s6 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x5d,0xd6,0xfa,0x04,0x1a,0x00,0x01,0x1f,0x01,0xff] +# W64-REAL16: v_cndmask_b16_e64_dpp v5.l, v1.l, v2.l, s[6:7] row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x5d,0xd6,0xfa,0x04,0x1a,0x00,0x01,0x1f,0x01,0xff] +# W64-FAKE16: v_cndmask_b16_e64_dpp v5, v1, v2, s[6:7] row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x5d,0xd6,0xfa,0x04,0x1a,0x00,0x01,0x1f,0x01,0xff] 0x05,0x00,0x5d,0xd6,0xfa,0x04,0x1a,0x00,0x01,0x21,0x01,0xff -# W32: v_cndmask_b16_e64_dpp v5, v1, v2, s6 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x5d,0xd6,0xfa,0x04,0x1a,0x00,0x01,0x21,0x01,0xff] -# W64: v_cndmask_b16_e64_dpp v5, v1, v2, s[6:7] row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x5d,0xd6,0xfa,0x04,0x1a,0x00,0x01,0x21,0x01,0xff] +# W32-REAL16: v_cndmask_b16_e64_dpp v5.l, v1.l, v2.l, s6 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x5d,0xd6,0xfa,0x04,0x1a,0x00,0x01,0x21,0x01,0xff] +# W32-FAKE16: v_cndmask_b16_e64_dpp v5, v1, v2, s6 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x5d,0xd6,0xfa,0x04,0x1a,0x00,0x01,0x21,0x01,0xff] +# W64-REAL16: v_cndmask_b16_e64_dpp v5.l, v1.l, v2.l, s[6:7] row_ror:1 
row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x5d,0xd6,0xfa,0x04,0x1a,0x00,0x01,0x21,0x01,0xff] +# W64-FAKE16: v_cndmask_b16_e64_dpp v5, v1, v2, s[6:7] row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x5d,0xd6,0xfa,0x04,0x1a,0x00,0x01,0x21,0x01,0xff] 0x05,0x00,0x5d,0xd6,0xfa,0x04,0x1a,0x00,0x01,0x2f,0x01,0xff -# W32: v_cndmask_b16_e64_dpp v5, v1, v2, s6 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x5d,0xd6,0xfa,0x04,0x1a,0x00,0x01,0x2f,0x01,0xff] -# W64: v_cndmask_b16_e64_dpp v5, v1, v2, s[6:7] row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x5d,0xd6,0xfa,0x04,0x1a,0x00,0x01,0x2f,0x01,0xff] +# W32-REAL16: v_cndmask_b16_e64_dpp v5.l, v1.l, v2.l, s6 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x5d,0xd6,0xfa,0x04,0x1a,0x00,0x01,0x2f,0x01,0xff] +# W32-FAKE16: v_cndmask_b16_e64_dpp v5, v1, v2, s6 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x5d,0xd6,0xfa,0x04,0x1a,0x00,0x01,0x2f,0x01,0xff] +# W64-REAL16: v_cndmask_b16_e64_dpp v5.l, v1.l, v2.l, s[6:7] row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x5d,0xd6,0xfa,0x04,0x1a,0x00,0x01,0x2f,0x01,0xff] +# W64-FAKE16: v_cndmask_b16_e64_dpp v5, v1, v2, s[6:7] row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x5d,0xd6,0xfa,0x04,0x1a,0x00,0x01,0x2f,0x01,0xff] 0x05,0x00,0x5d,0xd6,0xfa,0x04,0xa2,0x01,0x01,0x50,0x01,0xff -# W32: v_cndmask_b16_e64_dpp v5, v1, v2, s104 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x5d,0xd6,0xfa,0x04,0xa2,0x01,0x01,0x50,0x01,0xff] -# W64: v_cndmask_b16_e64_dpp v5, v1, v2, s[104:105] row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x5d,0xd6,0xfa,0x04,0xa2,0x01,0x01,0x50,0x01,0xff] +# W32-REAL16: v_cndmask_b16_e64_dpp v5.l, v1.l, v2.l, s104 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x5d,0xd6,0xfa,0x04,0xa2,0x01,0x01,0x50,0x01,0xff] +# W32-FAKE16: v_cndmask_b16_e64_dpp v5, v1, v2, s104 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: 
[0x05,0x00,0x5d,0xd6,0xfa,0x04,0xa2,0x01,0x01,0x50,0x01,0xff] +# W64-REAL16: v_cndmask_b16_e64_dpp v5.l, v1.l, v2.l, s[104:105] row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x5d,0xd6,0xfa,0x04,0xa2,0x01,0x01,0x50,0x01,0xff] +# W64-FAKE16: v_cndmask_b16_e64_dpp v5, v1, v2, s[104:105] row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x5d,0xd6,0xfa,0x04,0xa2,0x01,0x01,0x50,0x01,0xff] 0x05,0x00,0x5d,0xd6,0xfa,0x04,0xaa,0x01,0x01,0x5f,0x01,0x01 -# W32: v_cndmask_b16_e64_dpp v5, v1, v2, vcc_lo row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x05,0x00,0x5d,0xd6,0xfa,0x04,0xaa,0x01,0x01,0x5f,0x01,0x01] -# W64: v_cndmask_b16_e64_dpp v5, v1, v2, vcc row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x05,0x00,0x5d,0xd6,0xfa,0x04,0xaa,0x01,0x01,0x5f,0x01,0x01] +# W32-REAL16: v_cndmask_b16_e64_dpp v5.l, v1.l, v2.l, vcc_lo row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x05,0x00,0x5d,0xd6,0xfa,0x04,0xaa,0x01,0x01,0x5f,0x01,0x01] +# W32-FAKE16: v_cndmask_b16_e64_dpp v5, v1, v2, vcc_lo row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x05,0x00,0x5d,0xd6,0xfa,0x04,0xaa,0x01,0x01,0x5f,0x01,0x01] +# W64-REAL16: v_cndmask_b16_e64_dpp v5.l, v1.l, v2.l, vcc row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x05,0x00,0x5d,0xd6,0xfa,0x04,0xaa,0x01,0x01,0x5f,0x01,0x01] +# W64-FAKE16: v_cndmask_b16_e64_dpp v5, v1, v2, vcc row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x05,0x00,0x5d,0xd6,0xfa,0x04,0xaa,0x01,0x01,0x5f,0x01,0x01] 0x05,0x00,0x5d,0xd6,0xfa,0x04,0xea,0x01,0x01,0x60,0x01,0x13 -# W32: v_cndmask_b16_e64_dpp v5, v1, v2, ttmp14 row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0x05,0x00,0x5d,0xd6,0xfa,0x04,0xea,0x01,0x01,0x60,0x01,0x13] -# W64: v_cndmask_b16_e64_dpp v5, v1, v2, ttmp[14:15] row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0x05,0x00,0x5d,0xd6,0xfa,0x04,0xea,0x01,0x01,0x60,0x01,0x13] +# W32-REAL16: v_cndmask_b16_e64_dpp v5.l, v1.l, v2.l, ttmp14 row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: 
[0x05,0x00,0x5d,0xd6,0xfa,0x04,0xea,0x01,0x01,0x60,0x01,0x13] +# W32-FAKE16: v_cndmask_b16_e64_dpp v5, v1, v2, ttmp14 row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0x05,0x00,0x5d,0xd6,0xfa,0x04,0xea,0x01,0x01,0x60,0x01,0x13] +# W64-REAL16: v_cndmask_b16_e64_dpp v5.l, v1.l, v2.l, ttmp[14:15] row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0x05,0x00,0x5d,0xd6,0xfa,0x04,0xea,0x01,0x01,0x60,0x01,0x13] +# W64-FAKE16: v_cndmask_b16_e64_dpp v5, v1, v2, ttmp[14:15] row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0x05,0x00,0x5d,0xd6,0xfa,0x04,0xea,0x01,0x01,0x60,0x01,0x13] 0x05,0x02,0x5d,0xd6,0xfa,0x04,0xf2,0x21,0x01,0x6f,0x09,0x30 -# GFX11: v_cndmask_b16_e64_dpp v5, -v1, |v2|, null row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 ; encoding: [0x05,0x02,0x5d,0xd6,0xfa,0x04,0xf2,0x21,0x01,0x6f,0x09,0x30] +# W32-REAL16: v_cndmask_b16_e64_dpp v5.l, -v1.l, |v2.l|, null row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 ; encoding: [0x05,0x02,0x5d,0xd6,0xfa,0x04,0xf2,0x21,0x01,0x6f,0x09,0x30] +# W32-FAKE16: v_cndmask_b16_e64_dpp v5, -v1, |v2|, null row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 ; encoding: [0x05,0x02,0x5d,0xd6,0xfa,0x04,0xf2,0x21,0x01,0x6f,0x09,0x30] +# W64-REAL16: v_cndmask_b16_e64_dpp v5.l, -v1.l, |v2.l|, null row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 ; encoding: [0x05,0x02,0x5d,0xd6,0xfa,0x04,0xf2,0x21,0x01,0x6f,0x09,0x30] +# W64-FAKE16: v_cndmask_b16_e64_dpp v5, -v1, |v2|, null row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 ; encoding: [0x05,0x02,0x5d,0xd6,0xfa,0x04,0xf2,0x21,0x01,0x6f,0x09,0x30] 0x05,0x01,0x5d,0xd6,0xfa,0x04,0xf2,0x41,0x01,0x6f,0x09,0x30 -# GFX11: v_cndmask_b16_e64_dpp v5, |v1|, -v2, null row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 ; encoding: [0x05,0x01,0x5d,0xd6,0xfa,0x04,0xf2,0x41,0x01,0x6f,0x09,0x30] +# W32-REAL16: v_cndmask_b16_e64_dpp v5.l, |v1.l|, -v2.l, null row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 ; encoding: 
[0x05,0x01,0x5d,0xd6,0xfa,0x04,0xf2,0x41,0x01,0x6f,0x09,0x30] +# W32-FAKE16: v_cndmask_b16_e64_dpp v5, |v1|, -v2, null row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 ; encoding: [0x05,0x01,0x5d,0xd6,0xfa,0x04,0xf2,0x41,0x01,0x6f,0x09,0x30] +# W64-REAL16: v_cndmask_b16_e64_dpp v5.l, |v1.l|, -v2.l, null row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 ; encoding: [0x05,0x01,0x5d,0xd6,0xfa,0x04,0xf2,0x41,0x01,0x6f,0x09,0x30] +# W64-FAKE16: v_cndmask_b16_e64_dpp v5, |v1|, -v2, null row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 ; encoding: [0x05,0x01,0x5d,0xd6,0xfa,0x04,0xf2,0x41,0x01,0x6f,0x09,0x30] 0xff,0x00,0x5d,0xd6,0xfa,0xfe,0xf3,0x01,0xff,0x6f,0x0d,0x30 -# GFX11: v_cndmask_b16_e64_dpp v255, v255, v255, null row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xff,0x00,0x5d,0xd6,0xfa,0xfe,0xf3,0x01,0xff,0x6f,0x0d,0x30] +# W32-REAL16: v_cndmask_b16_e64_dpp v255.l, v255.l, v255.l, null row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xff,0x00,0x5d,0xd6,0xfa,0xfe,0xf3,0x01,0xff,0x6f,0x0d,0x30] +# W32-FAKE16: v_cndmask_b16_e64_dpp v255, v255, v255, null row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xff,0x00,0x5d,0xd6,0xfa,0xfe,0xf3,0x01,0xff,0x6f,0x0d,0x30] +# W64-REAL16: v_cndmask_b16_e64_dpp v255.l, v255.l, v255.l, null row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xff,0x00,0x5d,0xd6,0xfa,0xfe,0xf3,0x01,0xff,0x6f,0x0d,0x30] +# W64-FAKE16: v_cndmask_b16_e64_dpp v255, v255, v255, null row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xff,0x00,0x5d,0xd6,0xfa,0xfe,0xf3,0x01,0xff,0x6f,0x0d,0x30] + +0x05,0x09,0x5d,0xd6,0xfa,0x04,0xaa,0x41,0x01,0x5f,0x01,0x01 +# W32-REAL16: v_cndmask_b16_e64_dpp v5.l, |v1.h|, -v2.l, vcc_lo row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x05,0x09,0x5d,0xd6,0xfa,0x04,0xaa,0x41,0x01,0x5f,0x01,0x01] +# W32-FAKE16: v_cndmask_b16_e64_dpp v5, |v1|, -v2, vcc_lo row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: 
[0x05,0x01,0x5d,0xd6,0xfa,0x04,0xaa,0x41,0x01,0x5f,0x01,0x01] +# W64-REAL16: v_cndmask_b16_e64_dpp v5.l, |v1.h|, -v2.l, vcc row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x05,0x09,0x5d,0xd6,0xfa,0x04,0xaa,0x41,0x01,0x5f,0x01,0x01] +# W64-FAKE16: v_cndmask_b16_e64_dpp v5, |v1|, -v2, vcc row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x05,0x01,0x5d,0xd6,0xfa,0x04,0xaa,0x41,0x01,0x5f,0x01,0x01] + +0x05,0x12,0x5d,0xd6,0xfa,0x04,0xea,0x21,0x01,0x60,0x01,0x13 +# W32-REAL16: v_cndmask_b16_e64_dpp v5.l, -v1.l, |v2.h|, ttmp14 row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0x05,0x12,0x5d,0xd6,0xfa,0x04,0xea,0x21,0x01,0x60,0x01,0x13] +# W32-FAKE16: v_cndmask_b16_e64_dpp v5, -v1, |v2|, ttmp14 row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0x05,0x02,0x5d,0xd6,0xfa,0x04,0xea,0x21,0x01,0x60,0x01,0x13] +# W64-REAL16: v_cndmask_b16_e64_dpp v5.l, -v1.l, |v2.h|, ttmp[14:15] row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0x05,0x12,0x5d,0xd6,0xfa,0x04,0xea,0x21,0x01,0x60,0x01,0x13] +# W64-FAKE16: v_cndmask_b16_e64_dpp v5, -v1, |v2|, ttmp[14:15] row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0x05,0x02,0x5d,0xd6,0xfa,0x04,0xea,0x21,0x01,0x60,0x01,0x13] + +0xff,0x43,0x5d,0xd6,0xfa,0xfe,0xf3,0x61,0xff,0x6f,0x0d,0x30 +# W32-REAL16: v_cndmask_b16_e64_dpp v255.h, -|v255.l|, -|v255.l|, null row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xff,0x43,0x5d,0xd6,0xfa,0xfe,0xf3,0x61,0xff,0x6f,0x0d,0x30] +# W32-FAKE16: v_cndmask_b16_e64_dpp v255, -|v255|, -|v255|, null row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xff,0x03,0x5d,0xd6,0xfa,0xfe,0xf3,0x61,0xff,0x6f,0x0d,0x30] +# W64-REAL16: v_cndmask_b16_e64_dpp v255.h, -|v255.l|, -|v255.l|, null row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xff,0x43,0x5d,0xd6,0xfa,0xfe,0xf3,0x61,0xff,0x6f,0x0d,0x30] +# W64-FAKE16: v_cndmask_b16_e64_dpp v255, -|v255|, -|v255|, null row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: 
[0xff,0x03,0x5d,0xd6,0xfa,0xfe,0xf3,0x61,0xff,0x6f,0x0d,0x30] 0x05,0x00,0x0c,0xd6,0xfa,0x04,0x0e,0x04,0x01,0x1b,0x00,0xff # GFX11: v_cubeid_f32_e64_dpp v5, v1, v2, v3 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x0c,0xd6,0xfa,0x04,0x0e,0x04,0x01,0x1b,0x00,0xff] diff --git a/llvm/test/MC/Disassembler/AMDGPU/gfx11_dasm_vop3_dpp16_from_vopcx.txt b/llvm/test/MC/Disassembler/AMDGPU/gfx11_dasm_vop3_dpp16_from_vopcx.txt index 4ce26199bcc08..ab5f0af5f6629 100644 --- a/llvm/test/MC/Disassembler/AMDGPU/gfx11_dasm_vop3_dpp16_from_vopcx.txt +++ b/llvm/test/MC/Disassembler/AMDGPU/gfx11_dasm_vop3_dpp16_from_vopcx.txt @@ -1,8 +1,8 @@ # NOTE: Assertions have been autogenerated by utils/update_mc_test_checks.py UTC_ARGS: --version 5 -# RUN: llvm-mc -triple=amdgcn -mcpu=gfx1100 -mattr=+real-true16 -disassemble -show-encoding < %s | FileCheck -check-prefixes=GFX11 %s -# RUN: llvm-mc -triple=amdgcn -mcpu=gfx1100 -mattr=-real-true16 -disassemble -show-encoding < %s | FileCheck -check-prefixes=GFX11 %s -# RUN: llvm-mc -triple=amdgcn -mcpu=gfx1100 -mattr=+wavefrontsize64,+real-true16 -disassemble -show-encoding < %s | FileCheck -check-prefixes=GFX11 %s -# RUN: llvm-mc -triple=amdgcn -mcpu=gfx1100 -mattr=+wavefrontsize64,-real-true16 -disassemble -show-encoding < %s | FileCheck -check-prefixes=GFX11 %s +# RUN: llvm-mc -triple=amdgcn -mcpu=gfx1100 -mattr=+real-true16 -disassemble -show-encoding < %s | FileCheck -check-prefixes=GFX11,GFX11-REAL16 %s +# RUN: llvm-mc -triple=amdgcn -mcpu=gfx1100 -mattr=-real-true16 -disassemble -show-encoding < %s | FileCheck -check-prefixes=GFX11,GFX11-FAKE16 %s +# RUN: llvm-mc -triple=amdgcn -mcpu=gfx1100 -mattr=+wavefrontsize64,+real-true16 -disassemble -show-encoding < %s | FileCheck -check-prefixes=GFX11,GFX11-REAL16 %s +# RUN: llvm-mc -triple=amdgcn -mcpu=gfx1100 -mattr=+wavefrontsize64,-real-true16 -disassemble -show-encoding < %s | FileCheck -check-prefixes=GFX11,GFX11-FAKE16 %s 
0x7e,0x00,0xfd,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff # GFX11: v_cmpx_class_f16_e64_dpp v1, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xfd,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff] @@ -1349,46 +1349,72 @@ # GFX11: v_cmpx_lg_f32_e64_dpp -|v255|, -|v255| clamp row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0x7e,0x83,0x95,0xd4,0xfa,0xfe,0x03,0x60,0xff,0x6f,0x0d,0x30] 0x7e,0x00,0x81,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff -# GFX11: v_cmpx_lt_f16_e64_dpp v1, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x81,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff] +# GFX11-REAL16: v_cmpx_lt_f16_e64_dpp v1.l, v2.l quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x81,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff] +# GFX11-FAKE16: v_cmpx_lt_f16_e64_dpp v1, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x81,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff] 0x7e,0x00,0x81,0xd4,0xfa,0x04,0x02,0x00,0x01,0xe4,0x00,0xff -# GFX11: v_cmpx_lt_f16_e64_dpp v1, v2 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x81,0xd4,0xfa,0x04,0x02,0x00,0x01,0xe4,0x00,0xff] +# GFX11-REAL16: v_cmpx_lt_f16_e64_dpp v1.l, v2.l quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x81,0xd4,0xfa,0x04,0x02,0x00,0x01,0xe4,0x00,0xff] +# GFX11-FAKE16: v_cmpx_lt_f16_e64_dpp v1, v2 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x81,0xd4,0xfa,0x04,0x02,0x00,0x01,0xe4,0x00,0xff] 0x7e,0x00,0x81,0xd4,0xfa,0x04,0x02,0x00,0x01,0x40,0x01,0xff -# GFX11: v_cmpx_lt_f16_e64_dpp v1, v2 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x81,0xd4,0xfa,0x04,0x02,0x00,0x01,0x40,0x01,0xff] +# GFX11-REAL16: v_cmpx_lt_f16_e64_dpp v1.l, v2.l row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x81,0xd4,0xfa,0x04,0x02,0x00,0x01,0x40,0x01,0xff] +# GFX11-FAKE16: v_cmpx_lt_f16_e64_dpp v1, v2 row_mirror row_mask:0xf 
bank_mask:0xf ; encoding: [0x7e,0x00,0x81,0xd4,0xfa,0x04,0x02,0x00,0x01,0x40,0x01,0xff] 0x7e,0x00,0x81,0xd4,0xfa,0x04,0x02,0x00,0x01,0x41,0x01,0xff -# GFX11: v_cmpx_lt_f16_e64_dpp v1, v2 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x81,0xd4,0xfa,0x04,0x02,0x00,0x01,0x41,0x01,0xff] +# GFX11-REAL16: v_cmpx_lt_f16_e64_dpp v1.l, v2.l row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x81,0xd4,0xfa,0x04,0x02,0x00,0x01,0x41,0x01,0xff] +# GFX11-FAKE16: v_cmpx_lt_f16_e64_dpp v1, v2 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x81,0xd4,0xfa,0x04,0x02,0x00,0x01,0x41,0x01,0xff] 0x7e,0x00,0x81,0xd4,0xfa,0x04,0x02,0x00,0x01,0x01,0x01,0xff -# GFX11: v_cmpx_lt_f16_e64_dpp v1, v2 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x81,0xd4,0xfa,0x04,0x02,0x00,0x01,0x01,0x01,0xff] +# GFX11-REAL16: v_cmpx_lt_f16_e64_dpp v1.l, v2.l row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x81,0xd4,0xfa,0x04,0x02,0x00,0x01,0x01,0x01,0xff] +# GFX11-FAKE16: v_cmpx_lt_f16_e64_dpp v1, v2 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x81,0xd4,0xfa,0x04,0x02,0x00,0x01,0x01,0x01,0xff] 0x7e,0x00,0x81,0xd4,0xfa,0x04,0x02,0x00,0x01,0x0f,0x01,0xff -# GFX11: v_cmpx_lt_f16_e64_dpp v1, v2 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x81,0xd4,0xfa,0x04,0x02,0x00,0x01,0x0f,0x01,0xff] +# GFX11-REAL16: v_cmpx_lt_f16_e64_dpp v1.l, v2.l row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x81,0xd4,0xfa,0x04,0x02,0x00,0x01,0x0f,0x01,0xff] +# GFX11-FAKE16: v_cmpx_lt_f16_e64_dpp v1, v2 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x81,0xd4,0xfa,0x04,0x02,0x00,0x01,0x0f,0x01,0xff] 0x7e,0x00,0x81,0xd4,0xfa,0x04,0x02,0x00,0x01,0x11,0x01,0xff -# GFX11: v_cmpx_lt_f16_e64_dpp v1, v2 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x81,0xd4,0xfa,0x04,0x02,0x00,0x01,0x11,0x01,0xff] +# GFX11-REAL16: v_cmpx_lt_f16_e64_dpp v1.l, v2.l row_shr:1 row_mask:0xf bank_mask:0xf ; 
encoding: [0x7e,0x00,0x81,0xd4,0xfa,0x04,0x02,0x00,0x01,0x11,0x01,0xff] +# GFX11-FAKE16: v_cmpx_lt_f16_e64_dpp v1, v2 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x81,0xd4,0xfa,0x04,0x02,0x00,0x01,0x11,0x01,0xff] 0x7e,0x00,0x81,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1f,0x01,0xff -# GFX11: v_cmpx_lt_f16_e64_dpp v1, v2 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x81,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1f,0x01,0xff] +# GFX11-REAL16: v_cmpx_lt_f16_e64_dpp v1.l, v2.l row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x81,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1f,0x01,0xff] +# GFX11-FAKE16: v_cmpx_lt_f16_e64_dpp v1, v2 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x81,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1f,0x01,0xff] 0x7e,0x00,0x81,0xd4,0xfa,0x04,0x02,0x00,0x01,0x21,0x01,0xff -# GFX11: v_cmpx_lt_f16_e64_dpp v1, v2 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x81,0xd4,0xfa,0x04,0x02,0x00,0x01,0x21,0x01,0xff] +# GFX11-REAL16: v_cmpx_lt_f16_e64_dpp v1.l, v2.l row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x81,0xd4,0xfa,0x04,0x02,0x00,0x01,0x21,0x01,0xff] +# GFX11-FAKE16: v_cmpx_lt_f16_e64_dpp v1, v2 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x81,0xd4,0xfa,0x04,0x02,0x00,0x01,0x21,0x01,0xff] 0x7e,0x00,0x81,0xd4,0xfa,0x04,0x02,0x00,0x01,0x2f,0x01,0xff -# GFX11: v_cmpx_lt_f16_e64_dpp v1, v2 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x81,0xd4,0xfa,0x04,0x02,0x00,0x01,0x2f,0x01,0xff] +# GFX11-REAL16: v_cmpx_lt_f16_e64_dpp v1.l, v2.l row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x81,0xd4,0xfa,0x04,0x02,0x00,0x01,0x2f,0x01,0xff] +# GFX11-FAKE16: v_cmpx_lt_f16_e64_dpp v1, v2 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x81,0xd4,0xfa,0x04,0x02,0x00,0x01,0x2f,0x01,0xff] 0x7e,0x00,0x81,0xd4,0xfa,0x04,0x02,0x00,0x01,0x50,0x01,0xff -# GFX11: v_cmpx_lt_f16_e64_dpp v1, v2 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: 
[0x7e,0x00,0x81,0xd4,0xfa,0x04,0x02,0x00,0x01,0x50,0x01,0xff] +# GFX11-REAL16: v_cmpx_lt_f16_e64_dpp v1.l, v2.l row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x81,0xd4,0xfa,0x04,0x02,0x00,0x01,0x50,0x01,0xff] +# GFX11-FAKE16: v_cmpx_lt_f16_e64_dpp v1, v2 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x81,0xd4,0xfa,0x04,0x02,0x00,0x01,0x50,0x01,0xff] 0x7e,0x01,0x81,0xd4,0xfa,0x04,0x02,0x40,0x01,0x5f,0x01,0x01 -# GFX11: v_cmpx_lt_f16_e64_dpp |v1|, -v2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x7e,0x01,0x81,0xd4,0xfa,0x04,0x02,0x40,0x01,0x5f,0x01,0x01] +# GFX11-REAL16: v_cmpx_lt_f16_e64_dpp |v1.l|, -v2.l row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x7e,0x01,0x81,0xd4,0xfa,0x04,0x02,0x40,0x01,0x5f,0x01,0x01] +# GFX11-FAKE16: v_cmpx_lt_f16_e64_dpp |v1|, -v2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x7e,0x01,0x81,0xd4,0xfa,0x04,0x02,0x40,0x01,0x5f,0x01,0x01] 0x7e,0x02,0x81,0xd4,0xfa,0x04,0x02,0x20,0x01,0x60,0x01,0x13 -# GFX11: v_cmpx_lt_f16_e64_dpp -v1, |v2| row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0x7e,0x02,0x81,0xd4,0xfa,0x04,0x02,0x20,0x01,0x60,0x01,0x13] +# GFX11-REAL16: v_cmpx_lt_f16_e64_dpp -v1.l, |v2.l| row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0x7e,0x02,0x81,0xd4,0xfa,0x04,0x02,0x20,0x01,0x60,0x01,0x13] +# GFX11-FAKE16: v_cmpx_lt_f16_e64_dpp -v1, |v2| row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0x7e,0x02,0x81,0xd4,0xfa,0x04,0x02,0x20,0x01,0x60,0x01,0x13] 0x7e,0x83,0x81,0xd4,0xfa,0xfe,0x03,0x60,0xff,0x6f,0x0d,0x30 -# GFX11: v_cmpx_lt_f16_e64_dpp -|v255|, -|v255| clamp row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0x7e,0x83,0x81,0xd4,0xfa,0xfe,0x03,0x60,0xff,0x6f,0x0d,0x30] +# GFX11-REAL16: v_cmpx_lt_f16_e64_dpp -|v255.l|, -|v255.l| clamp row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0x7e,0x83,0x81,0xd4,0xfa,0xfe,0x03,0x60,0xff,0x6f,0x0d,0x30] +# GFX11-FAKE16: v_cmpx_lt_f16_e64_dpp -|v255|, -|v255| clamp row_xmask:15 
row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0x7e,0x83,0x81,0xd4,0xfa,0xfe,0x03,0x60,0xff,0x6f,0x0d,0x30] + +0x7e,0x19,0x81,0xd4,0xfa,0x04,0x02,0x40,0x01,0x5f,0x01,0x01 +# GFX11-REAL16: v_cmpx_lt_f16_e64_dpp |v1.h|, -v2.h op_sel:[1,1] row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x7e,0x19,0x81,0xd4,0xfa,0x04,0x02,0x40,0x01,0x5f,0x01,0x01] +# GFX11-FAKE16: v_cmpx_lt_f16_e64_dpp |v1|, -v2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x7e,0x01,0x81,0xd4,0xfa,0x04,0x02,0x40,0x01,0x5f,0x01,0x01] + +0x7e,0x0a,0x81,0xd4,0xfa,0x04,0x02,0x20,0x01,0x60,0x01,0x13 +# GFX11-REAL16: v_cmpx_lt_f16_e64_dpp -v1.h, |v2.l| op_sel:[1,0] row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0x7e,0x0a,0x81,0xd4,0xfa,0x04,0x02,0x20,0x01,0x60,0x01,0x13] +# GFX11-FAKE16: v_cmpx_lt_f16_e64_dpp -v1, |v2| row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0x7e,0x02,0x81,0xd4,0xfa,0x04,0x02,0x20,0x01,0x60,0x01,0x13] + +0x7e,0x93,0x81,0xd4,0xfa,0xfe,0x03,0x60,0xff,0x6f,0x0d,0x30 +# GFX11-REAL16: v_cmpx_lt_f16_e64_dpp -|v255.l|, -|v255.h| op_sel:[0,1] clamp row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0x7e,0x93,0x81,0xd4,0xfa,0xfe,0x03,0x60,0xff,0x6f,0x0d,0x30] +# GFX11-FAKE16: v_cmpx_lt_f16_e64_dpp -|v255|, -|v255| clamp row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0x7e,0x83,0x81,0xd4,0xfa,0xfe,0x03,0x60,0xff,0x6f,0x0d,0x30] 0x7e,0x00,0x91,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff # GFX11: v_cmpx_lt_f32_e64_dpp v1, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x91,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff] diff --git a/llvm/test/MC/Disassembler/AMDGPU/gfx11_dasm_vop3_dpp8.txt b/llvm/test/MC/Disassembler/AMDGPU/gfx11_dasm_vop3_dpp8.txt index 7a81ba23afa35..1e74b5aec0cf3 100644 --- a/llvm/test/MC/Disassembler/AMDGPU/gfx11_dasm_vop3_dpp8.txt +++ b/llvm/test/MC/Disassembler/AMDGPU/gfx11_dasm_vop3_dpp8.txt @@ -396,29 +396,64 @@ # GFX11: v_bfm_b32_e64_dpp v255, v255, v255 
dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xff,0x00,0x1d,0xd7,0xea,0xfe,0x03,0x00,0xff,0x00,0x00,0x00] 0x05,0x00,0x5d,0xd6,0xe9,0x04,0x1a,0x00,0x01,0x77,0x39,0x05 -# W32: v_cndmask_b16_e64_dpp v5, v1, v2, s6 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x5d,0xd6,0xe9,0x04,0x1a,0x00,0x01,0x77,0x39,0x05] -# W64: v_cndmask_b16_e64_dpp v5, v1, v2, s[6:7] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x5d,0xd6,0xe9,0x04,0x1a,0x00,0x01,0x77,0x39,0x05] +# W32-REAL16: v_cndmask_b16_e64_dpp v5.l, v1.l, v2.l, s6 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x5d,0xd6,0xe9,0x04,0x1a,0x00,0x01,0x77,0x39,0x05] +# W32-FAKE16: v_cndmask_b16_e64_dpp v5, v1, v2, s6 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x5d,0xd6,0xe9,0x04,0x1a,0x00,0x01,0x77,0x39,0x05] +# W64-REAL16: v_cndmask_b16_e64_dpp v5.l, v1.l, v2.l, s[6:7] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x5d,0xd6,0xe9,0x04,0x1a,0x00,0x01,0x77,0x39,0x05] +# W64-FAKE16: v_cndmask_b16_e64_dpp v5, v1, v2, s[6:7] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x5d,0xd6,0xe9,0x04,0x1a,0x00,0x01,0x77,0x39,0x05] 0x05,0x00,0x5d,0xd6,0xe9,0x04,0xa2,0x01,0x01,0x77,0x39,0x05 -# W32: v_cndmask_b16_e64_dpp v5, v1, v2, s104 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x5d,0xd6,0xe9,0x04,0xa2,0x01,0x01,0x77,0x39,0x05] -# W64: v_cndmask_b16_e64_dpp v5, v1, v2, s[104:105] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x5d,0xd6,0xe9,0x04,0xa2,0x01,0x01,0x77,0x39,0x05] +# W32-REAL16: v_cndmask_b16_e64_dpp v5.l, v1.l, v2.l, s104 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x5d,0xd6,0xe9,0x04,0xa2,0x01,0x01,0x77,0x39,0x05] +# W32-FAKE16: v_cndmask_b16_e64_dpp v5, v1, v2, s104 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x5d,0xd6,0xe9,0x04,0xa2,0x01,0x01,0x77,0x39,0x05] +# W64-REAL16: v_cndmask_b16_e64_dpp v5.l, v1.l, v2.l, s[104:105] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x5d,0xd6,0xe9,0x04,0xa2,0x01,0x01,0x77,0x39,0x05] +# W64-FAKE16: v_cndmask_b16_e64_dpp v5, v1, v2, s[104:105] dpp8:[7,6,5,4,3,2,1,0] ; encoding: 
[0x05,0x00,0x5d,0xd6,0xe9,0x04,0xa2,0x01,0x01,0x77,0x39,0x05] 0x05,0x00,0x5d,0xd6,0xe9,0x04,0xaa,0x01,0x01,0x77,0x39,0x05 -# W32: v_cndmask_b16_e64_dpp v5, v1, v2, vcc_lo dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x5d,0xd6,0xe9,0x04,0xaa,0x01,0x01,0x77,0x39,0x05] -# W64: v_cndmask_b16_e64_dpp v5, v1, v2, vcc dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x5d,0xd6,0xe9,0x04,0xaa,0x01,0x01,0x77,0x39,0x05] +# W32-REAL16: v_cndmask_b16_e64_dpp v5.l, v1.l, v2.l, vcc_lo dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x5d,0xd6,0xe9,0x04,0xaa,0x01,0x01,0x77,0x39,0x05] +# W32-FAKE16: v_cndmask_b16_e64_dpp v5, v1, v2, vcc_lo dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x5d,0xd6,0xe9,0x04,0xaa,0x01,0x01,0x77,0x39,0x05] +# W64-REAL16: v_cndmask_b16_e64_dpp v5.l, v1.l, v2.l, vcc dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x5d,0xd6,0xe9,0x04,0xaa,0x01,0x01,0x77,0x39,0x05] +# W64-FAKE16: v_cndmask_b16_e64_dpp v5, v1, v2, vcc dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x5d,0xd6,0xe9,0x04,0xaa,0x01,0x01,0x77,0x39,0x05] 0x05,0x00,0x5d,0xd6,0xe9,0x04,0xea,0x01,0x01,0x77,0x39,0x05 -# W32: v_cndmask_b16_e64_dpp v5, v1, v2, ttmp14 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x5d,0xd6,0xe9,0x04,0xea,0x01,0x01,0x77,0x39,0x05] -# W64: v_cndmask_b16_e64_dpp v5, v1, v2, ttmp[14:15] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x5d,0xd6,0xe9,0x04,0xea,0x01,0x01,0x77,0x39,0x05] +# W32-REAL16: v_cndmask_b16_e64_dpp v5.l, v1.l, v2.l, ttmp14 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x5d,0xd6,0xe9,0x04,0xea,0x01,0x01,0x77,0x39,0x05] +# W32-FAKE16: v_cndmask_b16_e64_dpp v5, v1, v2, ttmp14 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x5d,0xd6,0xe9,0x04,0xea,0x01,0x01,0x77,0x39,0x05] +# W64-REAL16: v_cndmask_b16_e64_dpp v5.l, v1.l, v2.l, ttmp[14:15] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x5d,0xd6,0xe9,0x04,0xea,0x01,0x01,0x77,0x39,0x05] +# W64-FAKE16: v_cndmask_b16_e64_dpp v5, v1, v2, ttmp[14:15] dpp8:[7,6,5,4,3,2,1,0] ; encoding: 
[0x05,0x00,0x5d,0xd6,0xe9,0x04,0xea,0x01,0x01,0x77,0x39,0x05] 0x05,0x02,0x5d,0xd6,0xe9,0x04,0xf2,0x21,0x01,0x00,0x00,0x00 -# GFX11: v_cndmask_b16_e64_dpp v5, -v1, |v2|, null dpp8:[0,0,0,0,0,0,0,0] ; encoding: [0x05,0x02,0x5d,0xd6,0xe9,0x04,0xf2,0x21,0x01,0x00,0x00,0x00] +# W32-REAL16: v_cndmask_b16_e64_dpp v5.l, -v1.l, |v2.l|, null dpp8:[0,0,0,0,0,0,0,0] ; encoding: [0x05,0x02,0x5d,0xd6,0xe9,0x04,0xf2,0x21,0x01,0x00,0x00,0x00] +# W32-FAKE16: v_cndmask_b16_e64_dpp v5, -v1, |v2|, null dpp8:[0,0,0,0,0,0,0,0] ; encoding: [0x05,0x02,0x5d,0xd6,0xe9,0x04,0xf2,0x21,0x01,0x00,0x00,0x00] +# W64-REAL16: v_cndmask_b16_e64_dpp v5.l, -v1.l, |v2.l|, null dpp8:[0,0,0,0,0,0,0,0] ; encoding: [0x05,0x02,0x5d,0xd6,0xe9,0x04,0xf2,0x21,0x01,0x00,0x00,0x00] +# W64-FAKE16: v_cndmask_b16_e64_dpp v5, -v1, |v2|, null dpp8:[0,0,0,0,0,0,0,0] ; encoding: [0x05,0x02,0x5d,0xd6,0xe9,0x04,0xf2,0x21,0x01,0x00,0x00,0x00] 0x05,0x01,0x5d,0xd6,0xe9,0x04,0xf2,0x41,0x01,0x00,0x00,0x00 -# GFX11: v_cndmask_b16_e64_dpp v5, |v1|, -v2, null dpp8:[0,0,0,0,0,0,0,0] ; encoding: [0x05,0x01,0x5d,0xd6,0xe9,0x04,0xf2,0x41,0x01,0x00,0x00,0x00] +# W32-REAL16: v_cndmask_b16_e64_dpp v5.l, |v1.l|, -v2.l, null dpp8:[0,0,0,0,0,0,0,0] ; encoding: [0x05,0x01,0x5d,0xd6,0xe9,0x04,0xf2,0x41,0x01,0x00,0x00,0x00] +# W32-FAKE16: v_cndmask_b16_e64_dpp v5, |v1|, -v2, null dpp8:[0,0,0,0,0,0,0,0] ; encoding: [0x05,0x01,0x5d,0xd6,0xe9,0x04,0xf2,0x41,0x01,0x00,0x00,0x00] +# W64-REAL16: v_cndmask_b16_e64_dpp v5.l, |v1.l|, -v2.l, null dpp8:[0,0,0,0,0,0,0,0] ; encoding: [0x05,0x01,0x5d,0xd6,0xe9,0x04,0xf2,0x41,0x01,0x00,0x00,0x00] +# W64-FAKE16: v_cndmask_b16_e64_dpp v5, |v1|, -v2, null dpp8:[0,0,0,0,0,0,0,0] ; encoding: [0x05,0x01,0x5d,0xd6,0xe9,0x04,0xf2,0x41,0x01,0x00,0x00,0x00] 0xff,0x00,0x5d,0xd6,0xea,0xfe,0xf3,0x01,0xff,0x00,0x00,0x00 -# GFX11: v_cndmask_b16_e64_dpp v255, v255, v255, null dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xff,0x00,0x5d,0xd6,0xea,0xfe,0xf3,0x01,0xff,0x00,0x00,0x00] +# W32-REAL16: v_cndmask_b16_e64_dpp v255.l, 
v255.l, v255.l, null dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xff,0x00,0x5d,0xd6,0xea,0xfe,0xf3,0x01,0xff,0x00,0x00,0x00] +# W32-FAKE16: v_cndmask_b16_e64_dpp v255, v255, v255, null dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xff,0x00,0x5d,0xd6,0xea,0xfe,0xf3,0x01,0xff,0x00,0x00,0x00] +# W64-REAL16: v_cndmask_b16_e64_dpp v255.l, v255.l, v255.l, null dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xff,0x00,0x5d,0xd6,0xea,0xfe,0xf3,0x01,0xff,0x00,0x00,0x00] +# W64-FAKE16: v_cndmask_b16_e64_dpp v255, v255, v255, null dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xff,0x00,0x5d,0xd6,0xea,0xfe,0xf3,0x01,0xff,0x00,0x00,0x00] + +0x05,0x09,0x5d,0xd6,0xe9,0x04,0xaa,0x41,0x01,0x77,0x39,0x05 +# W32-REAL16: v_cndmask_b16_e64_dpp v5.l, |v1.h|, -v2.l, vcc_lo dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x09,0x5d,0xd6,0xe9,0x04,0xaa,0x41,0x01,0x77,0x39,0x05] +# W32-FAKE16: v_cndmask_b16_e64_dpp v5, |v1|, -v2, vcc_lo dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x01,0x5d,0xd6,0xe9,0x04,0xaa,0x41,0x01,0x77,0x39,0x05] +# W64-REAL16: v_cndmask_b16_e64_dpp v5.l, |v1.h|, -v2.l, vcc dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x09,0x5d,0xd6,0xe9,0x04,0xaa,0x41,0x01,0x77,0x39,0x05] +# W64-FAKE16: v_cndmask_b16_e64_dpp v5, |v1|, -v2, vcc dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x01,0x5d,0xd6,0xe9,0x04,0xaa,0x41,0x01,0x77,0x39,0x05] + +0x05,0x12,0x5d,0xd6,0xe9,0x04,0xea,0x21,0x01,0x77,0x39,0x05 +# W32-REAL16: v_cndmask_b16_e64_dpp v5.l, -v1.l, |v2.h|, ttmp14 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x12,0x5d,0xd6,0xe9,0x04,0xea,0x21,0x01,0x77,0x39,0x05] +# W32-FAKE16: v_cndmask_b16_e64_dpp v5, -v1, |v2|, ttmp14 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x02,0x5d,0xd6,0xe9,0x04,0xea,0x21,0x01,0x77,0x39,0x05] +# W64-REAL16: v_cndmask_b16_e64_dpp v5.l, -v1.l, |v2.h|, ttmp[14:15] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x12,0x5d,0xd6,0xe9,0x04,0xea,0x21,0x01,0x77,0x39,0x05] +# W64-FAKE16: v_cndmask_b16_e64_dpp v5, -v1, |v2|, ttmp[14:15] dpp8:[7,6,5,4,3,2,1,0] ; encoding: 
[0x05,0x02,0x5d,0xd6,0xe9,0x04,0xea,0x21,0x01,0x77,0x39,0x05] + +0xff,0x43,0x5d,0xd6,0xea,0xfe,0xf3,0x61,0xff,0x00,0x00,0x00 +# W32-REAL16: v_cndmask_b16_e64_dpp v255.h, -|v255.l|, -|v255.l|, null dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xff,0x43,0x5d,0xd6,0xea,0xfe,0xf3,0x61,0xff,0x00,0x00,0x00] +# W32-FAKE16: v_cndmask_b16_e64_dpp v255, -|v255|, -|v255|, null dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xff,0x03,0x5d,0xd6,0xea,0xfe,0xf3,0x61,0xff,0x00,0x00,0x00] +# W64-REAL16: v_cndmask_b16_e64_dpp v255.h, -|v255.l|, -|v255.l|, null dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xff,0x43,0x5d,0xd6,0xea,0xfe,0xf3,0x61,0xff,0x00,0x00,0x00] +# W64-FAKE16: v_cndmask_b16_e64_dpp v255, -|v255|, -|v255|, null dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xff,0x03,0x5d,0xd6,0xea,0xfe,0xf3,0x61,0xff,0x00,0x00,0x00] 0x05,0x00,0x0c,0xd6,0xe9,0x04,0x0e,0x04,0x01,0x77,0x39,0x05 # GFX11: v_cubeid_f32_e64_dpp v5, v1, v2, v3 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x0c,0xd6,0xe9,0x04,0x0e,0x04,0x01,0x77,0x39,0x05] diff --git a/llvm/test/MC/Disassembler/AMDGPU/gfx11_dasm_vop3_dpp8_from_vopcx.txt b/llvm/test/MC/Disassembler/AMDGPU/gfx11_dasm_vop3_dpp8_from_vopcx.txt index a703568f5c6f2..6867126e9c70e 100644 --- a/llvm/test/MC/Disassembler/AMDGPU/gfx11_dasm_vop3_dpp8_from_vopcx.txt +++ b/llvm/test/MC/Disassembler/AMDGPU/gfx11_dasm_vop3_dpp8_from_vopcx.txt @@ -1,8 +1,8 @@ # NOTE: Assertions have been autogenerated by utils/update_mc_test_checks.py UTC_ARGS: --version 5 -# RUN: llvm-mc -triple=amdgcn -mcpu=gfx1100 -mattr=+real-true16 -disassemble -show-encoding < %s | FileCheck -check-prefixes=GFX11 %s -# RUN: llvm-mc -triple=amdgcn -mcpu=gfx1100 -mattr=-real-true16 -disassemble -show-encoding < %s | FileCheck -check-prefixes=GFX11 %s -# RUN: llvm-mc -triple=amdgcn -mcpu=gfx1100 -mattr=+wavefrontsize64,+real-true16 -disassemble -show-encoding < %s | FileCheck -check-prefixes=GFX11 %s -# RUN: llvm-mc -triple=amdgcn -mcpu=gfx1100 -mattr=+wavefrontsize64,-real-true16 -disassemble 
-show-encoding < %s | FileCheck -check-prefixes=GFX11 %s +# RUN: llvm-mc -triple=amdgcn -mcpu=gfx1100 -mattr=+real-true16 -disassemble -show-encoding < %s | FileCheck -check-prefixes=GFX11,GFX11-REAL16 %s +# RUN: llvm-mc -triple=amdgcn -mcpu=gfx1100 -mattr=-real-true16 -disassemble -show-encoding < %s | FileCheck -check-prefixes=GFX11,GFX11-FAKE16 %s +# RUN: llvm-mc -triple=amdgcn -mcpu=gfx1100 -mattr=+wavefrontsize64,+real-true16 -disassemble -show-encoding < %s | FileCheck -check-prefixes=GFX11,GFX11-REAL16 %s +# RUN: llvm-mc -triple=amdgcn -mcpu=gfx1100 -mattr=+wavefrontsize64,-real-true16 -disassemble -show-encoding < %s | FileCheck -check-prefixes=GFX11,GFX11-FAKE16 %s 0x7e,0x00,0xfd,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05 # GFX11: v_cmpx_class_f16_e64_dpp v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x00,0xfd,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05] @@ -269,16 +269,32 @@ # GFX11: v_cmpx_lg_f32_e64_dpp -|v255|, -|v255| clamp dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0x7e,0x83,0x95,0xd4,0xea,0xfe,0x03,0x60,0xff,0x00,0x00,0x00] 0x7e,0x00,0x81,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05 -# GFX11: v_cmpx_lt_f16_e64_dpp v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x00,0x81,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05] +# GFX11-REAL16: v_cmpx_lt_f16_e64_dpp v1.l, v2.l dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x00,0x81,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05] +# GFX11-FAKE16: v_cmpx_lt_f16_e64_dpp v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x00,0x81,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05] 0x7e,0x01,0x81,0xd4,0xe9,0x04,0x02,0x40,0x01,0x77,0x39,0x05 -# GFX11: v_cmpx_lt_f16_e64_dpp |v1|, -v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x01,0x81,0xd4,0xe9,0x04,0x02,0x40,0x01,0x77,0x39,0x05] +# GFX11-REAL16: v_cmpx_lt_f16_e64_dpp |v1.l|, -v2.l dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x01,0x81,0xd4,0xe9,0x04,0x02,0x40,0x01,0x77,0x39,0x05] +# GFX11-FAKE16: v_cmpx_lt_f16_e64_dpp |v1|, -v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: 
[0x7e,0x01,0x81,0xd4,0xe9,0x04,0x02,0x40,0x01,0x77,0x39,0x05] 0x7e,0x02,0x81,0xd4,0xe9,0x04,0x02,0x20,0x01,0x77,0x39,0x05 -# GFX11: v_cmpx_lt_f16_e64_dpp -v1, |v2| dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x02,0x81,0xd4,0xe9,0x04,0x02,0x20,0x01,0x77,0x39,0x05] +# GFX11-REAL16: v_cmpx_lt_f16_e64_dpp -v1.l, |v2.l| dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x02,0x81,0xd4,0xe9,0x04,0x02,0x20,0x01,0x77,0x39,0x05] +# GFX11-FAKE16: v_cmpx_lt_f16_e64_dpp -v1, |v2| dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x02,0x81,0xd4,0xe9,0x04,0x02,0x20,0x01,0x77,0x39,0x05] 0x7e,0x83,0x81,0xd4,0xea,0xfe,0x03,0x60,0xff,0x00,0x00,0x00 -# GFX11: v_cmpx_lt_f16_e64_dpp -|v255|, -|v255| clamp dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0x7e,0x83,0x81,0xd4,0xea,0xfe,0x03,0x60,0xff,0x00,0x00,0x00] +# GFX11-REAL16: v_cmpx_lt_f16_e64_dpp -|v255.l|, -|v255.l| clamp dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0x7e,0x83,0x81,0xd4,0xea,0xfe,0x03,0x60,0xff,0x00,0x00,0x00] +# GFX11-FAKE16: v_cmpx_lt_f16_e64_dpp -|v255|, -|v255| clamp dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0x7e,0x83,0x81,0xd4,0xea,0xfe,0x03,0x60,0xff,0x00,0x00,0x00] + +0x7e,0x19,0x81,0xd4,0xe9,0x04,0x02,0x40,0x01,0x77,0x39,0x05 +# GFX11-REAL16: v_cmpx_lt_f16_e64_dpp |v1.h|, -v2.h op_sel:[1,1] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x19,0x81,0xd4,0xe9,0x04,0x02,0x40,0x01,0x77,0x39,0x05] +# GFX11-FAKE16: v_cmpx_lt_f16_e64_dpp |v1|, -v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x01,0x81,0xd4,0xe9,0x04,0x02,0x40,0x01,0x77,0x39,0x05] + +0x7e,0x0a,0x81,0xd4,0xe9,0x04,0x02,0x20,0x01,0x77,0x39,0x05 +# GFX11-REAL16: v_cmpx_lt_f16_e64_dpp -v1.h, |v2.l| op_sel:[1,0] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x0a,0x81,0xd4,0xe9,0x04,0x02,0x20,0x01,0x77,0x39,0x05] +# GFX11-FAKE16: v_cmpx_lt_f16_e64_dpp -v1, |v2| dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x02,0x81,0xd4,0xe9,0x04,0x02,0x20,0x01,0x77,0x39,0x05] + +0x7e,0x93,0x81,0xd4,0xea,0xfe,0x03,0x60,0xff,0x00,0x00,0x00 +# GFX11-REAL16: v_cmpx_lt_f16_e64_dpp -|v255.l|, -|v255.h| op_sel:[0,1] clamp 
dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0x7e,0x93,0x81,0xd4,0xea,0xfe,0x03,0x60,0xff,0x00,0x00,0x00] +# GFX11-FAKE16: v_cmpx_lt_f16_e64_dpp -|v255|, -|v255| clamp dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0x7e,0x83,0x81,0xd4,0xea,0xfe,0x03,0x60,0xff,0x00,0x00,0x00] 0x7e,0x00,0x91,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05 # GFX11: v_cmpx_lt_f32_e64_dpp v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x00,0x91,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05] diff --git a/llvm/test/MC/Disassembler/AMDGPU/gfx11_dasm_vop3_from_vopcx.txt b/llvm/test/MC/Disassembler/AMDGPU/gfx11_dasm_vop3_from_vopcx.txt index d519c0ffa66c6..b9d7a5296cc5e 100644 --- a/llvm/test/MC/Disassembler/AMDGPU/gfx11_dasm_vop3_from_vopcx.txt +++ b/llvm/test/MC/Disassembler/AMDGPU/gfx11_dasm_vop3_from_vopcx.txt @@ -1,8 +1,8 @@ # NOTE: Assertions have been autogenerated by utils/update_mc_test_checks.py UTC_ARGS: --version 5 -# RUN: llvm-mc -triple=amdgcn -mcpu=gfx1100 -mattr=+real-true16 -disassemble -show-encoding < %s | FileCheck -check-prefixes=GFX11 %s -# RUN: llvm-mc -triple=amdgcn -mcpu=gfx1100 -mattr=-real-true16 -disassemble -show-encoding < %s | FileCheck -check-prefixes=GFX11 %s -# RUN: llvm-mc -triple=amdgcn -mcpu=gfx1100 -mattr=+wavefrontsize64,+real-true16 -disassemble -show-encoding < %s | FileCheck -check-prefixes=GFX11 %s -# RUN: llvm-mc -triple=amdgcn -mcpu=gfx1100 -mattr=+wavefrontsize64,-real-true16 -disassemble -show-encoding < %s | FileCheck -check-prefixes=GFX11 %s +# RUN: llvm-mc -triple=amdgcn -mcpu=gfx1100 -mattr=+real-true16 -disassemble -show-encoding < %s | FileCheck -check-prefixes=GFX11,GFX11-REAL16 %s +# RUN: llvm-mc -triple=amdgcn -mcpu=gfx1100 -mattr=-real-true16 -disassemble -show-encoding < %s | FileCheck -check-prefixes=GFX11,GFX11-FAKE16 %s +# RUN: llvm-mc -triple=amdgcn -mcpu=gfx1100 -mattr=+wavefrontsize64,+real-true16 -disassemble -show-encoding < %s | FileCheck -check-prefixes=GFX11,GFX11-REAL16 %s +# RUN: llvm-mc -triple=amdgcn -mcpu=gfx1100 
-mattr=+wavefrontsize64,-real-true16 -disassemble -show-encoding < %s | FileCheck -check-prefixes=GFX11,GFX11-FAKE16 %s 0x7e,0x00,0xfd,0xd4,0x01,0x05,0x02,0x00 # GFX11: v_cmpx_class_f16_e64 v1, v2 ; encoding: [0x7e,0x00,0xfd,0xd4,0x01,0x05,0x02,0x00] @@ -2069,10 +2069,12 @@ # GFX11: v_cmpx_lg_f64_e64 0xaf123456, -|vcc| clamp ; encoding: [0x7e,0x82,0xa5,0xd4,0xff,0xd4,0x00,0x40,0x56,0x34,0x12,0xaf] 0x7e,0x00,0x81,0xd4,0x01,0x05,0x02,0x00 -# GFX11: v_cmpx_lt_f16_e64 v1, v2 ; encoding: [0x7e,0x00,0x81,0xd4,0x01,0x05,0x02,0x00] +# GFX11-REAL16: v_cmpx_lt_f16_e64 v1.l, v2.l ; encoding: [0x7e,0x00,0x81,0xd4,0x01,0x05,0x02,0x00] +# GFX11-FAKE16: v_cmpx_lt_f16_e64 v1, v2 ; encoding: [0x7e,0x00,0x81,0xd4,0x01,0x05,0x02,0x00] 0x7e,0x00,0x81,0xd4,0xff,0xff,0x03,0x00 -# GFX11: v_cmpx_lt_f16_e64 v255, v255 ; encoding: [0x7e,0x00,0x81,0xd4,0xff,0xff,0x03,0x00] +# GFX11-REAL16: v_cmpx_lt_f16_e64 v255.l, v255.l ; encoding: [0x7e,0x00,0x81,0xd4,0xff,0xff,0x03,0x00] +# GFX11-FAKE16: v_cmpx_lt_f16_e64 v255, v255 ; encoding: [0x7e,0x00,0x81,0xd4,0xff,0xff,0x03,0x00] 0x7e,0x00,0x81,0xd4,0x01,0x04,0x00,0x00 # GFX11: v_cmpx_lt_f16_e64 s1, s2 ; encoding: [0x7e,0x00,0x81,0xd4,0x01,0x04,0x00,0x00] @@ -2113,6 +2115,14 @@ 0x7e,0x83,0x81,0xd4,0xff,0xd6,0x00,0x60,0x0b,0xfe,0x00,0x00 # GFX11: v_cmpx_lt_f16_e64 -|0xfe0b|, -|vcc_hi| clamp ; encoding: [0x7e,0x83,0x81,0xd4,0xff,0xd6,0x00,0x60,0x0b,0xfe,0x00,0x00] +0x7e,0x08,0x81,0xd4,0x01,0x05,0x02,0x00 +# GFX11-REAL16: v_cmpx_lt_f16_e64 v1.h, v2.l op_sel:[1,0,0] ; encoding: [0x7e,0x08,0x81,0xd4,0x01,0x05,0x02,0x00] +# GFX11-FAKE16: v_cmpx_lt_f16_e64 v1, v2 ; encoding: [0x7e,0x00,0x81,0xd4,0x01,0x05,0x02,0x00] + +0x7e,0x10,0x81,0xd4,0xff,0xff,0x03,0x00 +# GFX11-REAL16: v_cmpx_lt_f16_e64 v255.l, v255.h op_sel:[0,1,0] ; encoding: [0x7e,0x10,0x81,0xd4,0xff,0xff,0x03,0x00] +# GFX11-FAKE16: v_cmpx_lt_f16_e64 v255, v255 ; encoding: [0x7e,0x00,0x81,0xd4,0xff,0xff,0x03,0x00] + 0x7e,0x00,0x91,0xd4,0x01,0x05,0x02,0x00 # GFX11: v_cmpx_lt_f32_e64 v1, v2 ; 
encoding: [0x7e,0x00,0x91,0xd4,0x01,0x05,0x02,0x00] diff --git a/llvm/test/MC/Disassembler/AMDGPU/gfx11_dasm_vopcx.txt b/llvm/test/MC/Disassembler/AMDGPU/gfx11_dasm_vopcx.txt index e0b5c16c27d2f..913e753627581 100644 --- a/llvm/test/MC/Disassembler/AMDGPU/gfx11_dasm_vopcx.txt +++ b/llvm/test/MC/Disassembler/AMDGPU/gfx11_dasm_vopcx.txt @@ -1,8 +1,8 @@ # NOTE: Assertions have been autogenerated by utils/update_mc_test_checks.py UTC_ARGS: --version 5 -# RUN: llvm-mc -triple=amdgcn -mcpu=gfx1100 -mattr=+wavefrontsize32,+real-true16 -disassemble -show-encoding < %s | FileCheck %s --check-prefix=GFX11 -# RUN: llvm-mc -triple=amdgcn -mcpu=gfx1100 -mattr=+wavefrontsize64,+real-true16 -disassemble -show-encoding < %s | FileCheck %s --check-prefix=GFX11 -# RUN: llvm-mc -triple=amdgcn -mcpu=gfx1100 -mattr=+wavefrontsize32,-real-true16 -disassemble -show-encoding < %s | FileCheck %s --check-prefix=GFX11 -# RUN: llvm-mc -triple=amdgcn -mcpu=gfx1100 -mattr=+wavefrontsize64,-real-true16 -disassemble -show-encoding < %s | FileCheck %s --check-prefix=GFX11 +# RUN: llvm-mc -triple=amdgcn -mcpu=gfx1100 -mattr=+wavefrontsize32,+real-true16 -disassemble -show-encoding < %s | FileCheck %s --check-prefixes=GFX11,GFX11-REAL16 +# RUN: llvm-mc -triple=amdgcn -mcpu=gfx1100 -mattr=+wavefrontsize64,+real-true16 -disassemble -show-encoding < %s | FileCheck %s --check-prefixes=GFX11,GFX11-REAL16 +# RUN: llvm-mc -triple=amdgcn -mcpu=gfx1100 -mattr=+wavefrontsize32,-real-true16 -disassemble -show-encoding < %s | FileCheck %s --check-prefixes=GFX11,GFX11-FAKE16 +# RUN: llvm-mc -triple=amdgcn -mcpu=gfx1100 -mattr=+wavefrontsize64,-real-true16 -disassemble -show-encoding < %s | FileCheck %s --check-prefixes=GFX11,GFX11-FAKE16 0x01,0x05,0xfa,0x7d # GFX11: v_cmpx_class_f16_e32 v1, v2 ; encoding: [0x01,0x05,0xfa,0x7d] @@ -2057,49 +2057,84 @@ # GFX11: v_cmpx_lg_f64_e32 0xaf123456, v[254:255] ; encoding: [0xff,0xfc,0x4b,0x7d,0x56,0x34,0x12,0xaf] 0x01,0x05,0x02,0x7d -# GFX11: v_cmpx_lt_f16_e32 v1, v2 ; 
encoding: [0x01,0x05,0x02,0x7d] +# GFX11-REAL16: v_cmpx_lt_f16_e32 v1.l, v2.l ; encoding: [0x01,0x05,0x02,0x7d] +# GFX11-FAKE16: v_cmpx_lt_f16_e32 v1, v2 ; encoding: [0x01,0x05,0x02,0x7d] 0x7f,0x05,0x02,0x7d -# GFX11: v_cmpx_lt_f16_e32 v127, v2 ; encoding: [0x7f,0x05,0x02,0x7d] +# GFX11-REAL16: v_cmpx_lt_f16_e32 v127.l, v2.l ; encoding: [0x7f,0x05,0x02,0x7d] +# GFX11-FAKE16: v_cmpx_lt_f16_e32 v127, v2 ; encoding: [0x7f,0x05,0x02,0x7d] 0x01,0x04,0x02,0x7d -# GFX11: v_cmpx_lt_f16_e32 s1, v2 ; encoding: [0x01,0x04,0x02,0x7d] +# GFX11-REAL16: v_cmpx_lt_f16_e32 s1, v2.l ; encoding: [0x01,0x04,0x02,0x7d] +# GFX11-FAKE16: v_cmpx_lt_f16_e32 s1, v2 ; encoding: [0x01,0x04,0x02,0x7d] 0x69,0x04,0x02,0x7d -# GFX11: v_cmpx_lt_f16_e32 s105, v2 ; encoding: [0x69,0x04,0x02,0x7d] +# GFX11-REAL16: v_cmpx_lt_f16_e32 s105, v2.l ; encoding: [0x69,0x04,0x02,0x7d] +# GFX11-FAKE16: v_cmpx_lt_f16_e32 s105, v2 ; encoding: [0x69,0x04,0x02,0x7d] 0x6a,0x04,0x02,0x7d -# GFX11: v_cmpx_lt_f16_e32 vcc_lo, v2 ; encoding: [0x6a,0x04,0x02,0x7d] +# GFX11-REAL16: v_cmpx_lt_f16_e32 vcc_lo, v2.l ; encoding: [0x6a,0x04,0x02,0x7d] +# GFX11-FAKE16: v_cmpx_lt_f16_e32 vcc_lo, v2 ; encoding: [0x6a,0x04,0x02,0x7d] 0x6b,0x04,0x02,0x7d -# GFX11: v_cmpx_lt_f16_e32 vcc_hi, v2 ; encoding: [0x6b,0x04,0x02,0x7d] +# GFX11-REAL16: v_cmpx_lt_f16_e32 vcc_hi, v2.l ; encoding: [0x6b,0x04,0x02,0x7d] +# GFX11-FAKE16: v_cmpx_lt_f16_e32 vcc_hi, v2 ; encoding: [0x6b,0x04,0x02,0x7d] 0x7b,0x04,0x02,0x7d -# GFX11: v_cmpx_lt_f16_e32 ttmp15, v2 ; encoding: [0x7b,0x04,0x02,0x7d] +# GFX11-REAL16: v_cmpx_lt_f16_e32 ttmp15, v2.l ; encoding: [0x7b,0x04,0x02,0x7d] +# GFX11-FAKE16: v_cmpx_lt_f16_e32 ttmp15, v2 ; encoding: [0x7b,0x04,0x02,0x7d] 0x7d,0x04,0x02,0x7d -# GFX11: v_cmpx_lt_f16_e32 m0, v2 ; encoding: [0x7d,0x04,0x02,0x7d] +# GFX11-REAL16: v_cmpx_lt_f16_e32 m0, v2.l ; encoding: [0x7d,0x04,0x02,0x7d] +# GFX11-FAKE16: v_cmpx_lt_f16_e32 m0, v2 ; encoding: [0x7d,0x04,0x02,0x7d] 0x7e,0x04,0x02,0x7d -# GFX11: v_cmpx_lt_f16_e32 exec_lo, v2 
; encoding: [0x7e,0x04,0x02,0x7d] +# GFX11-REAL16: v_cmpx_lt_f16_e32 exec_lo, v2.l ; encoding: [0x7e,0x04,0x02,0x7d] +# GFX11-FAKE16: v_cmpx_lt_f16_e32 exec_lo, v2 ; encoding: [0x7e,0x04,0x02,0x7d] 0x7f,0x04,0x02,0x7d -# GFX11: v_cmpx_lt_f16_e32 exec_hi, v2 ; encoding: [0x7f,0x04,0x02,0x7d] +# GFX11-REAL16: v_cmpx_lt_f16_e32 exec_hi, v2.l ; encoding: [0x7f,0x04,0x02,0x7d] +# GFX11-FAKE16: v_cmpx_lt_f16_e32 exec_hi, v2 ; encoding: [0x7f,0x04,0x02,0x7d] 0x7c,0x04,0x02,0x7d -# GFX11: v_cmpx_lt_f16_e32 null, v2 ; encoding: [0x7c,0x04,0x02,0x7d] +# GFX11-REAL16: v_cmpx_lt_f16_e32 null, v2.l ; encoding: [0x7c,0x04,0x02,0x7d] +# GFX11-FAKE16: v_cmpx_lt_f16_e32 null, v2 ; encoding: [0x7c,0x04,0x02,0x7d] 0xc1,0x04,0x02,0x7d -# GFX11: v_cmpx_lt_f16_e32 -1, v2 ; encoding: [0xc1,0x04,0x02,0x7d] +# GFX11-REAL16: v_cmpx_lt_f16_e32 -1, v2.l ; encoding: [0xc1,0x04,0x02,0x7d] +# GFX11-FAKE16: v_cmpx_lt_f16_e32 -1, v2 ; encoding: [0xc1,0x04,0x02,0x7d] 0xf0,0x04,0x02,0x7d -# GFX11: v_cmpx_lt_f16_e32 0.5, v2 ; encoding: [0xf0,0x04,0x02,0x7d] +# GFX11-REAL16: v_cmpx_lt_f16_e32 0.5, v2.l ; encoding: [0xf0,0x04,0x02,0x7d] +# GFX11-FAKE16: v_cmpx_lt_f16_e32 0.5, v2 ; encoding: [0xf0,0x04,0x02,0x7d] 0xfd,0x04,0x02,0x7d -# GFX11: v_cmpx_lt_f16_e32 src_scc, v2 ; encoding: [0xfd,0x04,0x02,0x7d] +# GFX11-REAL16: v_cmpx_lt_f16_e32 src_scc, v2.l ; encoding: [0xfd,0x04,0x02,0x7d] +# GFX11-FAKE16: v_cmpx_lt_f16_e32 src_scc, v2 ; encoding: [0xfd,0x04,0x02,0x7d] 0xff,0xfe,0x02,0x7d,0x0b,0xfe,0x00,0x00 -# GFX11: v_cmpx_lt_f16_e32 0xfe0b, v127 ; encoding: [0xff,0xfe,0x02,0x7d,0x0b,0xfe,0x00,0x00] +# GFX11-REAL16: v_cmpx_lt_f16_e32 0xfe0b, v127.l ; encoding: [0xff,0xfe,0x02,0x7d,0x0b,0xfe,0x00,0x00] +# GFX11-FAKE16: v_cmpx_lt_f16_e32 0xfe0b, v127 ; encoding: [0xff,0xfe,0x02,0x7d,0x0b,0xfe,0x00,0x00] + +0x81,0x05,0x02,0x7d +# GFX11-REAL16: v_cmpx_lt_f16_e32 v1.h, v2.l ; encoding: [0x81,0x05,0x02,0x7d] +# GFX11-FAKE16: v_cmpx_lt_f16_e32 v129/*Invalid register, operand has 'VS_32_Lo128' register class*/, 
v2 ; encoding: [0x81,0x05,0x02,0x7d] + +0xff,0x05,0x02,0x7d +# GFX11-REAL16: v_cmpx_lt_f16_e32 v127.h, v2.l ; encoding: [0xff,0x05,0x02,0x7d] +# GFX11-FAKE16: v_cmpx_lt_f16_e32 v255/*Invalid register, operand has 'VS_32_Lo128' register class*/, v2 ; encoding: [0xff,0x05,0x02,0x7d] + +0xf0,0xfe,0x02,0x7d +# GFX11-REAL16: v_cmpx_lt_f16_e32 0.5, v127.l ; encoding: [0xf0,0xfe,0x02,0x7d] +# GFX11-FAKE16: v_cmpx_lt_f16_e32 0.5, v127 ; encoding: [0xf0,0xfe,0x02,0x7d] + +0xfd,0x04,0x03,0x7d +# GFX11-REAL16: v_cmpx_lt_f16_e32 src_scc, v2.h ; encoding: [0xfd,0x04,0x03,0x7d] +# GFX11-FAKE16: v_cmpx_lt_f16_e32 src_scc, v130/*Invalid register, operand has 'VGPR_32_Lo128' register class*/ ; encoding: [0xfd,0x04,0x03,0x7d] + +0xff,0xfe,0x03,0x7d,0x0b,0xfe,0x00,0x00 +# GFX11-REAL16: v_cmpx_lt_f16_e32 0xfe0b, v127.h ; encoding: [0xff,0xfe,0x03,0x7d,0x0b,0xfe,0x00,0x00] +# GFX11-FAKE16: v_cmpx_lt_f16_e32 0xfe0b, v255/*Invalid register, operand has 'VGPR_32_Lo128' register class*/ ; encoding: [0xff,0xfe,0x03,0x7d,0x0b,0xfe,0x00,0x00] 0x01,0x05,0x22,0x7d # GFX11: v_cmpx_lt_f32_e32 v1, v2 ; encoding: [0x01,0x05,0x22,0x7d] diff --git a/llvm/test/MC/Disassembler/AMDGPU/gfx11_dasm_vopcx_dpp16.txt b/llvm/test/MC/Disassembler/AMDGPU/gfx11_dasm_vopcx_dpp16.txt index 1d7e82c8bf96f..8919d86071f4d 100644 --- a/llvm/test/MC/Disassembler/AMDGPU/gfx11_dasm_vopcx_dpp16.txt +++ b/llvm/test/MC/Disassembler/AMDGPU/gfx11_dasm_vopcx_dpp16.txt @@ -1,8 +1,8 @@ # NOTE: Assertions have been autogenerated by utils/update_mc_test_checks.py UTC_ARGS: --version 5 -# RUN: llvm-mc -triple=amdgcn -mcpu=gfx1100 -mattr=+wavefrontsize32,+real-true16 -disassemble -show-encoding < %s | FileCheck %s --check-prefix=GFX11 -# RUN: llvm-mc -triple=amdgcn -mcpu=gfx1100 -mattr=+wavefrontsize64,+real-true16 -disassemble -show-encoding < %s | FileCheck %s --check-prefix=GFX11 -# RUN: llvm-mc -triple=amdgcn -mcpu=gfx1100 -mattr=+wavefrontsize32,-real-true16 -disassemble -show-encoding < %s | FileCheck %s --check-prefix=GFX11 -# 
RUN: llvm-mc -triple=amdgcn -mcpu=gfx1100 -mattr=+wavefrontsize64,-real-true16 -disassemble -show-encoding < %s | FileCheck %s --check-prefix=GFX11 +# RUN: llvm-mc -triple=amdgcn -mcpu=gfx1100 -mattr=+wavefrontsize32,+real-true16 -disassemble -show-encoding < %s | FileCheck %s --check-prefixes=GFX11,GFX11-REAL16 +# RUN: llvm-mc -triple=amdgcn -mcpu=gfx1100 -mattr=+wavefrontsize64,+real-true16 -disassemble -show-encoding < %s | FileCheck %s --check-prefixes=GFX11,GFX11-REAL16 +# RUN: llvm-mc -triple=amdgcn -mcpu=gfx1100 -mattr=+wavefrontsize32,-real-true16 -disassemble -show-encoding < %s | FileCheck %s --check-prefixes=GFX11,GFX11-FAKE16 +# RUN: llvm-mc -triple=amdgcn -mcpu=gfx1100 -mattr=+wavefrontsize64,-real-true16 -disassemble -show-encoding < %s | FileCheck %s --check-prefixes=GFX11,GFX11-FAKE16 0xfa,0x04,0xfa,0x7d,0x01,0x1b,0x00,0xff # GFX11: v_cmpx_class_f16 v1, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0xfa,0x7d,0x01,0x1b,0x00,0xff] @@ -1349,46 +1349,72 @@ # GFX11: v_cmpx_lg_f32 -|v255|, -|v255| row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xfa,0xfe,0x2b,0x7d,0xff,0x6f,0xfd,0x30] 0xfa,0x04,0x02,0x7d,0x01,0x1b,0x00,0xff -# GFX11: v_cmpx_lt_f16 v1, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x02,0x7d,0x01,0x1b,0x00,0xff] +# GFX11-REAL16: v_cmpx_lt_f16 v1.l, v2.l quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x02,0x7d,0x01,0x1b,0x00,0xff] +# GFX11-FAKE16: v_cmpx_lt_f16 v1, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x02,0x7d,0x01,0x1b,0x00,0xff] 0xfa,0x04,0x02,0x7d,0x01,0xe4,0x00,0xff -# GFX11: v_cmpx_lt_f16 v1, v2 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x02,0x7d,0x01,0xe4,0x00,0xff] +# GFX11-REAL16: v_cmpx_lt_f16 v1.l, v2.l quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x02,0x7d,0x01,0xe4,0x00,0xff] +# GFX11-FAKE16: v_cmpx_lt_f16 v1, v2 quad_perm:[0,1,2,3] 
row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x02,0x7d,0x01,0xe4,0x00,0xff] 0xfa,0x04,0x02,0x7d,0x01,0x40,0x01,0xff -# GFX11: v_cmpx_lt_f16 v1, v2 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x02,0x7d,0x01,0x40,0x01,0xff] +# GFX11-REAL16: v_cmpx_lt_f16 v1.l, v2.l row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x02,0x7d,0x01,0x40,0x01,0xff] +# GFX11-FAKE16: v_cmpx_lt_f16 v1, v2 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x02,0x7d,0x01,0x40,0x01,0xff] 0xfa,0x04,0x02,0x7d,0x01,0x41,0x01,0xff -# GFX11: v_cmpx_lt_f16 v1, v2 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x02,0x7d,0x01,0x41,0x01,0xff] +# GFX11-REAL16: v_cmpx_lt_f16 v1.l, v2.l row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x02,0x7d,0x01,0x41,0x01,0xff] +# GFX11-FAKE16: v_cmpx_lt_f16 v1, v2 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x02,0x7d,0x01,0x41,0x01,0xff] 0xfa,0x04,0x02,0x7d,0x01,0x01,0x01,0xff -# GFX11: v_cmpx_lt_f16 v1, v2 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x02,0x7d,0x01,0x01,0x01,0xff] +# GFX11-REAL16: v_cmpx_lt_f16 v1.l, v2.l row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x02,0x7d,0x01,0x01,0x01,0xff] +# GFX11-FAKE16: v_cmpx_lt_f16 v1, v2 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x02,0x7d,0x01,0x01,0x01,0xff] 0xfa,0x04,0x02,0x7d,0x01,0x0f,0x01,0xff -# GFX11: v_cmpx_lt_f16 v1, v2 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x02,0x7d,0x01,0x0f,0x01,0xff] +# GFX11-REAL16: v_cmpx_lt_f16 v1.l, v2.l row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x02,0x7d,0x01,0x0f,0x01,0xff] +# GFX11-FAKE16: v_cmpx_lt_f16 v1, v2 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x02,0x7d,0x01,0x0f,0x01,0xff] 0xfa,0x04,0x02,0x7d,0x01,0x11,0x01,0xff -# GFX11: v_cmpx_lt_f16 v1, v2 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x02,0x7d,0x01,0x11,0x01,0xff] +# GFX11-REAL16: v_cmpx_lt_f16 v1.l, 
v2.l row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x02,0x7d,0x01,0x11,0x01,0xff] +# GFX11-FAKE16: v_cmpx_lt_f16 v1, v2 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x02,0x7d,0x01,0x11,0x01,0xff] 0xfa,0x04,0x02,0x7d,0x01,0x1f,0x01,0xff -# GFX11: v_cmpx_lt_f16 v1, v2 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x02,0x7d,0x01,0x1f,0x01,0xff] +# GFX11-REAL16: v_cmpx_lt_f16 v1.l, v2.l row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x02,0x7d,0x01,0x1f,0x01,0xff] +# GFX11-FAKE16: v_cmpx_lt_f16 v1, v2 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x02,0x7d,0x01,0x1f,0x01,0xff] 0xfa,0x04,0x02,0x7d,0x01,0x21,0x01,0xff -# GFX11: v_cmpx_lt_f16 v1, v2 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x02,0x7d,0x01,0x21,0x01,0xff] +# GFX11-REAL16: v_cmpx_lt_f16 v1.l, v2.l row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x02,0x7d,0x01,0x21,0x01,0xff] +# GFX11-FAKE16: v_cmpx_lt_f16 v1, v2 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x02,0x7d,0x01,0x21,0x01,0xff] 0xfa,0x04,0x02,0x7d,0x01,0x2f,0x01,0xff -# GFX11: v_cmpx_lt_f16 v1, v2 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x02,0x7d,0x01,0x2f,0x01,0xff] +# GFX11-REAL16: v_cmpx_lt_f16 v1.l, v2.l row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x02,0x7d,0x01,0x2f,0x01,0xff] +# GFX11-FAKE16: v_cmpx_lt_f16 v1, v2 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x02,0x7d,0x01,0x2f,0x01,0xff] 0xfa,0x04,0x02,0x7d,0x01,0x50,0x01,0xff -# GFX11: v_cmpx_lt_f16 v1, v2 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x02,0x7d,0x01,0x50,0x01,0xff] +# GFX11-REAL16: v_cmpx_lt_f16 v1.l, v2.l row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x02,0x7d,0x01,0x50,0x01,0xff] +# GFX11-FAKE16: v_cmpx_lt_f16 v1, v2 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x02,0x7d,0x01,0x50,0x01,0xff] 0xfa,0x04,0x02,0x7d,0x01,0x5f,0x01,0x01 -# GFX11: v_cmpx_lt_f16 
v1, v2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0xfa,0x04,0x02,0x7d,0x01,0x5f,0x01,0x01] +# GFX11-REAL16: v_cmpx_lt_f16 v1.l, v2.l row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0xfa,0x04,0x02,0x7d,0x01,0x5f,0x01,0x01] +# GFX11-FAKE16: v_cmpx_lt_f16 v1, v2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0xfa,0x04,0x02,0x7d,0x01,0x5f,0x01,0x01] 0xfa,0x04,0x02,0x7d,0x01,0x60,0x01,0x13 -# GFX11: v_cmpx_lt_f16 v1, v2 row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0xfa,0x04,0x02,0x7d,0x01,0x60,0x01,0x13] +# GFX11-REAL16: v_cmpx_lt_f16 v1.l, v2.l row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0xfa,0x04,0x02,0x7d,0x01,0x60,0x01,0x13] +# GFX11-FAKE16: v_cmpx_lt_f16 v1, v2 row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0xfa,0x04,0x02,0x7d,0x01,0x60,0x01,0x13] 0xfa,0xfe,0x02,0x7d,0x7f,0x6f,0xfd,0x30 -# GFX11: v_cmpx_lt_f16 -|v127|, -|v127| row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xfa,0xfe,0x02,0x7d,0x7f,0x6f,0xfd,0x30] +# GFX11-REAL16: v_cmpx_lt_f16 -|v127.l|, -|v127.l| row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xfa,0xfe,0x02,0x7d,0x7f,0x6f,0xfd,0x30] +# GFX11-FAKE16: v_cmpx_lt_f16 -|v127|, -|v127| row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xfa,0xfe,0x02,0x7d,0x7f,0x6f,0xfd,0x30] + +0xfa,0xfe,0x02,0x7d,0x7f,0x5f,0x01,0x01 +# GFX11-REAL16: v_cmpx_lt_f16 v127.l, v127.l row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0xfa,0xfe,0x02,0x7d,0x7f,0x5f,0x01,0x01] +# GFX11-FAKE16: v_cmpx_lt_f16 v127, v127 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0xfa,0xfe,0x02,0x7d,0x7f,0x5f,0x01,0x01] + +0xfa,0x04,0x03,0x7d,0x81,0x60,0x01,0x13 +# GFX11-REAL16: v_cmpx_lt_f16 v1.h, v2.h row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0xfa,0x04,0x03,0x7d,0x81,0x60,0x01,0x13] +# GFX11-FAKE16: v_cmpx_lt_f16 v129/*Invalid register, operand has 'VGPR_32_Lo128' register class*/, v130/*Invalid register, operand has 'VGPR_32_Lo128' register class*/ row_xmask:0 
row_mask:0x1 bank_mask:0x3 ; encoding: [0xfa,0x04,0x03,0x7d,0x81,0x60,0x01,0x13] + +0xfa,0xfe,0x03,0x7d,0xff,0x6f,0xfd,0x30 +# GFX11-REAL16: v_cmpx_lt_f16 -|v127.h|, -|v127.h| row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xfa,0xfe,0x03,0x7d,0xff,0x6f,0xfd,0x30] +# GFX11-FAKE16: v_cmpx_lt_f16 -|v255/*Invalid register, operand has 'VGPR_32_Lo128' register class*/|, -|v255/*Invalid register, operand has 'VGPR_32_Lo128' register class*/| row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xfa,0xfe,0x03,0x7d,0xff,0x6f,0xfd,0x30] 0xfa,0x04,0x22,0x7d,0x01,0x1b,0x00,0xff # GFX11: v_cmpx_lt_f32 v1, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x22,0x7d,0x01,0x1b,0x00,0xff] diff --git a/llvm/test/MC/Disassembler/AMDGPU/gfx11_dasm_vopcx_dpp8.txt b/llvm/test/MC/Disassembler/AMDGPU/gfx11_dasm_vopcx_dpp8.txt index a6d8ec95d6d63..867fd7374b788 100644 --- a/llvm/test/MC/Disassembler/AMDGPU/gfx11_dasm_vopcx_dpp8.txt +++ b/llvm/test/MC/Disassembler/AMDGPU/gfx11_dasm_vopcx_dpp8.txt @@ -1,8 +1,8 @@ # NOTE: Assertions have been autogenerated by utils/update_mc_test_checks.py UTC_ARGS: --version 5 -# RUN: llvm-mc -triple=amdgcn -mcpu=gfx1100 -mattr=+wavefrontsize32,+real-true16 -disassemble -show-encoding < %s | FileCheck %s --check-prefix=GFX11 -# RUN: llvm-mc -triple=amdgcn -mcpu=gfx1100 -mattr=+wavefrontsize64,+real-true16 -disassemble -show-encoding < %s | FileCheck %s --check-prefix=GFX11 -# RUN: llvm-mc -triple=amdgcn -mcpu=gfx1100 -mattr=+wavefrontsize32,-real-true16 -disassemble -show-encoding < %s | FileCheck %s --check-prefix=GFX11 -# RUN: llvm-mc -triple=amdgcn -mcpu=gfx1100 -mattr=+wavefrontsize64,-real-true16 -disassemble -show-encoding < %s | FileCheck %s --check-prefix=GFX11 +# RUN: llvm-mc -triple=amdgcn -mcpu=gfx1100 -mattr=+wavefrontsize32,+real-true16 -disassemble -show-encoding < %s | FileCheck %s --check-prefixes=GFX11,GFX11-REAL16 +# RUN: llvm-mc -triple=amdgcn -mcpu=gfx1100 
-mattr=+wavefrontsize64,+real-true16 -disassemble -show-encoding < %s | FileCheck %s --check-prefixes=GFX11,GFX11-REAL16 +# RUN: llvm-mc -triple=amdgcn -mcpu=gfx1100 -mattr=+wavefrontsize32,-real-true16 -disassemble -show-encoding < %s | FileCheck %s --check-prefixes=GFX11,GFX11-FAKE16 +# RUN: llvm-mc -triple=amdgcn -mcpu=gfx1100 -mattr=+wavefrontsize64,-real-true16 -disassemble -show-encoding < %s | FileCheck %s --check-prefixes=GFX11,GFX11-FAKE16 0xe9,0x04,0xfa,0x7d,0x01,0x77,0x39,0x05 # GFX11: v_cmpx_class_f16 v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0x04,0xfa,0x7d,0x01,0x77,0x39,0x05] @@ -197,10 +197,24 @@ # GFX11: v_cmpx_lg_f32 v255, v255 dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xea,0xfe,0x2b,0x7d,0xff,0x00,0x00,0x00] 0xe9,0x04,0x02,0x7d,0x01,0x77,0x39,0x05 -# GFX11: v_cmpx_lt_f16 v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0x04,0x02,0x7d,0x01,0x77,0x39,0x05] +# GFX11-REAL16: v_cmpx_lt_f16 v1.l, v2.l dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0x04,0x02,0x7d,0x01,0x77,0x39,0x05] +# GFX11-FAKE16: v_cmpx_lt_f16 v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0x04,0x02,0x7d,0x01,0x77,0x39,0x05] 0xea,0xfe,0x02,0x7d,0x7f,0x00,0x00,0x00 -# GFX11: v_cmpx_lt_f16 v127, v127 dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xea,0xfe,0x02,0x7d,0x7f,0x00,0x00,0x00] +# GFX11-REAL16: v_cmpx_lt_f16 v127.l, v127.l dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xea,0xfe,0x02,0x7d,0x7f,0x00,0x00,0x00] +# GFX11-FAKE16: v_cmpx_lt_f16 v127, v127 dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xea,0xfe,0x02,0x7d,0x7f,0x00,0x00,0x00] + +0xe9,0xfe,0x02,0x7d,0x7f,0x77,0x39,0x05 +# GFX11-REAL16: v_cmpx_lt_f16 v127.l, v127.l dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0xfe,0x02,0x7d,0x7f,0x77,0x39,0x05] +# GFX11-FAKE16: v_cmpx_lt_f16 v127, v127 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0xfe,0x02,0x7d,0x7f,0x77,0x39,0x05] + +0xe9,0x04,0x03,0x7d,0x81,0x77,0x39,0x05 +# GFX11-REAL16: v_cmpx_lt_f16 v1.h, v2.h dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0x04,0x03,0x7d,0x81,0x77,0x39,0x05] +# GFX11-FAKE16: 
v_cmpx_lt_f16 v129/*Invalid register, operand has 'VGPR_32_Lo128' register class*/, v130/*Invalid register, operand has 'VGPR_32_Lo128' register class*/ dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0x04,0x03,0x7d,0x81,0x77,0x39,0x05] + +0xea,0xfe,0x03,0x7d,0xff,0x00,0x00,0x00 +# GFX11-REAL16: v_cmpx_lt_f16 v127.h, v127.h dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xea,0xfe,0x03,0x7d,0xff,0x00,0x00,0x00] +# GFX11-FAKE16: v_cmpx_lt_f16 v255/*Invalid register, operand has 'VGPR_32_Lo128' register class*/, v255/*Invalid register, operand has 'VGPR_32_Lo128' register class*/ dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xea,0xfe,0x03,0x7d,0xff,0x00,0x00,0x00] 0xe9,0x04,0x22,0x7d,0x01,0x77,0x39,0x05 # GFX11: v_cmpx_lt_f32 v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0x04,0x22,0x7d,0x01,0x77,0x39,0x05] diff --git a/llvm/test/MC/Disassembler/AMDGPU/gfx12_dasm_vop3.txt b/llvm/test/MC/Disassembler/AMDGPU/gfx12_dasm_vop3.txt index 633d3a48634fa..4108fd9c8be62 100644 --- a/llvm/test/MC/Disassembler/AMDGPU/gfx12_dasm_vop3.txt +++ b/llvm/test/MC/Disassembler/AMDGPU/gfx12_dasm_vop3.txt @@ -1018,55 +1018,100 @@ # GFX12: v_bfm_b32 v255, 0xaf123456, vcc_hi ; encoding: [0xff,0x00,0x1d,0xd7,0xff,0xd6,0x00,0x00,0x56,0x34,0x12,0xaf] 0x05,0x00,0x5d,0xd6,0x01,0xfb,0x19,0x00 -# W32: v_cndmask_b16 v5, v1, src_scc, s6 ; encoding: [0x05,0x00,0x5d,0xd6,0x01,0xfb,0x19,0x00] -# W64: v_cndmask_b16 v5, v1, src_scc, s[6:7] ; encoding: [0x05,0x00,0x5d,0xd6,0x01,0xfb,0x19,0x00] +# W32-REAL16: v_cndmask_b16 v5.l, v1.l, src_scc, s6 ; encoding: [0x05,0x00,0x5d,0xd6,0x01,0xfb,0x19,0x00] +# W32-FAKE16: v_cndmask_b16 v5, v1, src_scc, s6 ; encoding: [0x05,0x00,0x5d,0xd6,0x01,0xfb,0x19,0x00] +# W64-REAL16: v_cndmask_b16 v5.l, v1.l, src_scc, s[6:7] ; encoding: [0x05,0x00,0x5d,0xd6,0x01,0xfb,0x19,0x00] +# W64-FAKE16: v_cndmask_b16 v5, v1, src_scc, s[6:7] ; encoding: [0x05,0x00,0x5d,0xd6,0x01,0xfb,0x19,0x00] 0x05,0x00,0x5d,0xd6,0xff,0xe1,0x19,0x00 -# W32: v_cndmask_b16 v5, v255, 0x3800, s6 ; encoding: 
[0x05,0x00,0x5d,0xd6,0xff,0xff,0x19,0x00,0x00,0x38,0x00,0x00] -# W64: v_cndmask_b16 v5, v255, 0x3800, s[6:7] ; encoding: [0x05,0x00,0x5d,0xd6,0xff,0xff,0x19,0x00,0x00,0x38,0x00,0x00] +# W32-REAL16: v_cndmask_b16 v5.l, v255.l, 0x3800, s6 ; encoding: [0x05,0x00,0x5d,0xd6,0xff,0xff,0x19,0x00,0x00,0x38,0x00,0x00] +# W32-FAKE16: v_cndmask_b16 v5, v255, 0x3800, s6 ; encoding: [0x05,0x00,0x5d,0xd6,0xff,0xff,0x19,0x00,0x00,0x38,0x00,0x00] +# W64-REAL16: v_cndmask_b16 v5.l, v255.l, 0x3800, s[6:7] ; encoding: [0x05,0x00,0x5d,0xd6,0xff,0xff,0x19,0x00,0x00,0x38,0x00,0x00] +# W64-FAKE16: v_cndmask_b16 v5, v255, 0x3800, s[6:7] ; encoding: [0x05,0x00,0x5d,0xd6,0xff,0xff,0x19,0x00,0x00,0x38,0x00,0x00] 0x05,0x00,0x5d,0xd6,0x69,0xd2,0x18,0x00 -# W32: v_cndmask_b16 v5, s105, s105, s6 ; encoding: [0x05,0x00,0x5d,0xd6,0x69,0xd2,0x18,0x00] -# W64: v_cndmask_b16 v5, s105, s105, s[6:7] ; encoding: [0x05,0x00,0x5d,0xd6,0x69,0xd2,0x18,0x00] +# W32-REAL16: v_cndmask_b16 v5.l, s105, s105, s6 ; encoding: [0x05,0x00,0x5d,0xd6,0x69,0xd2,0x18,0x00] +# W32-FAKE16: v_cndmask_b16 v5, s105, s105, s6 ; encoding: [0x05,0x00,0x5d,0xd6,0x69,0xd2,0x18,0x00] +# W64-REAL16: v_cndmask_b16 v5.l, s105, s105, s[6:7] ; encoding: [0x05,0x00,0x5d,0xd6,0x69,0xd2,0x18,0x00] +# W64-FAKE16: v_cndmask_b16 v5, s105, s105, s[6:7] ; encoding: [0x05,0x00,0x5d,0xd6,0x69,0xd2,0x18,0x00] 0x05,0x00,0x5d,0xd6,0x6b,0x04,0x1a,0x00 -# W32: v_cndmask_b16 v5, vcc_hi, v2, s6 ; encoding: [0x05,0x00,0x5d,0xd6,0x6b,0x04,0x1a,0x00] -# W64: v_cndmask_b16 v5, vcc_hi, v2, s[6:7] ; encoding: [0x05,0x00,0x5d,0xd6,0x6b,0x04,0x1a,0x00] +# W32-REAL16: v_cndmask_b16 v5.l, vcc_hi, v2.l, s6 ; encoding: [0x05,0x00,0x5d,0xd6,0x6b,0x04,0x1a,0x00] +# W32-FAKE16: v_cndmask_b16 v5, vcc_hi, v2, s6 ; encoding: [0x05,0x00,0x5d,0xd6,0x6b,0x04,0x1a,0x00] +# W64-REAL16: v_cndmask_b16 v5.l, vcc_hi, v2.l, s[6:7] ; encoding: [0x05,0x00,0x5d,0xd6,0x6b,0x04,0x1a,0x00] +# W64-FAKE16: v_cndmask_b16 v5, vcc_hi, v2, s[6:7] ; encoding: 
[0x05,0x00,0x5d,0xd6,0x6b,0x04,0x1a,0x00] 0x05,0x00,0x5d,0xd6,0x7b,0xf6,0x18,0x00 -# W32: v_cndmask_b16 v5, ttmp15, ttmp15, s6 ; encoding: [0x05,0x00,0x5d,0xd6,0x7b,0xf6,0x18,0x00] -# W64: v_cndmask_b16 v5, ttmp15, ttmp15, s[6:7] ; encoding: [0x05,0x00,0x5d,0xd6,0x7b,0xf6,0x18,0x00] +# W32-REAL16: v_cndmask_b16 v5.l, ttmp15, ttmp15, s6 ; encoding: [0x05,0x00,0x5d,0xd6,0x7b,0xf6,0x18,0x00] +# W32-FAKE16: v_cndmask_b16 v5, ttmp15, ttmp15, s6 ; encoding: [0x05,0x00,0x5d,0xd6,0x7b,0xf6,0x18,0x00] +# W64-REAL16: v_cndmask_b16 v5.l, ttmp15, ttmp15, s[6:7] ; encoding: [0x05,0x00,0x5d,0xd6,0x7b,0xf6,0x18,0x00] +# W64-FAKE16: v_cndmask_b16 v5, ttmp15, ttmp15, s[6:7] ; encoding: [0x05,0x00,0x5d,0xd6,0x7b,0xf6,0x18,0x00] 0x05,0x00,0x5d,0xd6,0x7d,0xfe,0x1b,0x00 -# W32: v_cndmask_b16 v5, m0, v255, s6 ; encoding: [0x05,0x00,0x5d,0xd6,0x7d,0xfe,0x1b,0x00] -# W64: v_cndmask_b16 v5, m0, v255, s[6:7] ; encoding: [0x05,0x00,0x5d,0xd6,0x7d,0xfe,0x1b,0x00] +# W32-REAL16: v_cndmask_b16 v5.l, m0, v255.l, s6 ; encoding: [0x05,0x00,0x5d,0xd6,0x7d,0xfe,0x1b,0x00] +# W32-FAKE16: v_cndmask_b16 v5, m0, v255, s6 ; encoding: [0x05,0x00,0x5d,0xd6,0x7d,0xfe,0x1b,0x00] +# W64-REAL16: v_cndmask_b16 v5.l, m0, v255.l, s[6:7] ; encoding: [0x05,0x00,0x5d,0xd6,0x7d,0xfe,0x1b,0x00] +# W64-FAKE16: v_cndmask_b16 v5, m0, v255, s[6:7] ; encoding: [0x05,0x00,0x5d,0xd6,0x7d,0xfe,0x1b,0x00] 0x05,0x00,0x5d,0xd6,0x7e,0xfc,0x18,0x00 -# W32: v_cndmask_b16 v5, exec_lo, exec_lo, s6 ; encoding: [0x05,0x00,0x5d,0xd6,0x7e,0xfc,0x18,0x00] -# W64: v_cndmask_b16 v5, exec_lo, exec_lo, s[6:7] ; encoding: [0x05,0x00,0x5d,0xd6,0x7e,0xfc,0x18,0x00] +# W32-REAL16: v_cndmask_b16 v5.l, exec_lo, exec_lo, s6 ; encoding: [0x05,0x00,0x5d,0xd6,0x7e,0xfc,0x18,0x00] +# W32-FAKE16: v_cndmask_b16 v5, exec_lo, exec_lo, s6 ; encoding: [0x05,0x00,0x5d,0xd6,0x7e,0xfc,0x18,0x00] +# W64-REAL16: v_cndmask_b16 v5.l, exec_lo, exec_lo, s[6:7] ; encoding: [0x05,0x00,0x5d,0xd6,0x7e,0xfc,0x18,0x00] +# W64-FAKE16: v_cndmask_b16 v5, exec_lo, exec_lo, 
s[6:7] ; encoding: [0x05,0x00,0x5d,0xd6,0x7e,0xfc,0x18,0x00] 0x05,0x00,0x5d,0xd6,0x7f,0xfe,0x18,0x00 -# W32: v_cndmask_b16 v5, exec_hi, exec_hi, s6 ; encoding: [0x05,0x00,0x5d,0xd6,0x7f,0xfe,0x18,0x00] -# W64: v_cndmask_b16 v5, exec_hi, exec_hi, s[6:7] ; encoding: [0x05,0x00,0x5d,0xd6,0x7f,0xfe,0x18,0x00] +# W32-REAL16: v_cndmask_b16 v5.l, exec_hi, exec_hi, s6 ; encoding: [0x05,0x00,0x5d,0xd6,0x7f,0xfe,0x18,0x00] +# W32-FAKE16: v_cndmask_b16 v5, exec_hi, exec_hi, s6 ; encoding: [0x05,0x00,0x5d,0xd6,0x7f,0xfe,0x18,0x00] +# W64-REAL16: v_cndmask_b16 v5.l, exec_hi, exec_hi, s[6:7] ; encoding: [0x05,0x00,0x5d,0xd6,0x7f,0xfe,0x18,0x00] +# W64-FAKE16: v_cndmask_b16 v5, exec_hi, exec_hi, s[6:7] ; encoding: [0x05,0x00,0x5d,0xd6,0x7f,0xfe,0x18,0x00] 0x05,0x00,0x5d,0xd6,0x7c,0xfa,0x18,0x00 -# W32: v_cndmask_b16 v5, null, m0, s6 ; encoding: [0x05,0x00,0x5d,0xd6,0x7c,0xfa,0x18,0x00] -# W64: v_cndmask_b16 v5, null, m0, s[6:7] ; encoding: [0x05,0x00,0x5d,0xd6,0x7c,0xfa,0x18,0x00] +# W32-REAL16: v_cndmask_b16 v5.l, null, m0, s6 ; encoding: [0x05,0x00,0x5d,0xd6,0x7c,0xfa,0x18,0x00] +# W32-FAKE16: v_cndmask_b16 v5, null, m0, s6 ; encoding: [0x05,0x00,0x5d,0xd6,0x7c,0xfa,0x18,0x00] +# W64-REAL16: v_cndmask_b16 v5.l, null, m0, s[6:7] ; encoding: [0x05,0x00,0x5d,0xd6,0x7c,0xfa,0x18,0x00] +# W64-FAKE16: v_cndmask_b16 v5, null, m0, s[6:7] ; encoding: [0x05,0x00,0x5d,0xd6,0x7c,0xfa,0x18,0x00] 0x05,0x02,0x5d,0xd6,0xc1,0xd4,0xa0,0x41 -# W32: v_cndmask_b16 v5, -1, -|vcc_lo|, s104 ; encoding: [0x05,0x02,0x5d,0xd6,0xc1,0xd4,0xa0,0x41] -# W64: v_cndmask_b16 v5, -1, -|vcc_lo|, s[104:105] ; encoding: [0x05,0x02,0x5d,0xd6,0xc1,0xd4,0xa0,0x41] +# W32-REAL16: v_cndmask_b16 v5.l, -1, -|vcc_lo|, s104 ; encoding: [0x05,0x02,0x5d,0xd6,0xc1,0xd4,0xa0,0x41] +# W32-FAKE16: v_cndmask_b16 v5, -1, -|vcc_lo|, s104 ; encoding: [0x05,0x02,0x5d,0xd6,0xc1,0xd4,0xa0,0x41] +# W64-REAL16: v_cndmask_b16 v5.l, -1, -|vcc_lo|, s[104:105] ; encoding: [0x05,0x02,0x5d,0xd6,0xc1,0xd4,0xa0,0x41] +# W64-FAKE16: v_cndmask_b16 
v5, -1, -|vcc_lo|, s[104:105] ; encoding: [0x05,0x02,0x5d,0xd6,0xc1,0xd4,0xa0,0x41] 0x05,0x00,0x5d,0xd6,0xf0,0x82,0xa9,0x01 -# W32: v_cndmask_b16 v5, 0x3800, -1, vcc_lo ; encoding: [0x05,0x00,0x5d,0xd6,0xff,0x82,0xa9,0x01,0x00,0x38,0x00,0x00] -# W64: v_cndmask_b16 v5, 0x3800, -1, vcc ; encoding: [0x05,0x00,0x5d,0xd6,0xff,0x82,0xa9,0x01,0x00,0x38,0x00,0x00] +# W32-REAL16: v_cndmask_b16 v5.l, 0x3800, -1, vcc_lo ; encoding: [0x05,0x00,0x5d,0xd6,0xff,0x82,0xa9,0x01,0x00,0x38,0x00,0x00] +# W32-FAKE16: v_cndmask_b16 v5, 0x3800, -1, vcc_lo ; encoding: [0x05,0x00,0x5d,0xd6,0xff,0x82,0xa9,0x01,0x00,0x38,0x00,0x00] +# W64-REAL16: v_cndmask_b16 v5.l, 0x3800, -1, vcc ; encoding: [0x05,0x00,0x5d,0xd6,0xff,0x82,0xa9,0x01,0x00,0x38,0x00,0x00] +# W64-FAKE16: v_cndmask_b16 v5, 0x3800, -1, vcc ; encoding: [0x05,0x00,0x5d,0xd6,0xff,0x82,0xa9,0x01,0x00,0x38,0x00,0x00] 0x05,0x01,0x5d,0xd6,0xfd,0xf8,0xe8,0x21 -# W32: v_cndmask_b16 v5, -|src_scc|, null, ttmp14 ; encoding: [0x05,0x01,0x5d,0xd6,0xfd,0xf8,0xe8,0x21] -# W64: v_cndmask_b16 v5, -|src_scc|, null, ttmp[14:15] ; encoding: [0x05,0x01,0x5d,0xd6,0xfd,0xf8,0xe8,0x21] +# W32-REAL16: v_cndmask_b16 v5.l, -|src_scc|, null, ttmp14 ; encoding: [0x05,0x01,0x5d,0xd6,0xfd,0xf8,0xe8,0x21] +# W32-FAKE16: v_cndmask_b16 v5, -|src_scc|, null, ttmp14 ; encoding: [0x05,0x01,0x5d,0xd6,0xfd,0xf8,0xe8,0x21] +# W64-REAL16: v_cndmask_b16 v5.l, -|src_scc|, null, ttmp[14:15] ; encoding: [0x05,0x01,0x5d,0xd6,0xfd,0xf8,0xe8,0x21] +# W64-FAKE16: v_cndmask_b16 v5, -|src_scc|, null, ttmp[14:15] ; encoding: [0x05,0x01,0x5d,0xd6,0xfd,0xf8,0xe8,0x21] 0xff,0x03,0x5d,0xd6,0xff,0xd6,0xf0,0x61,0x0b,0xfe,0x00,0x00 -# GFX12: v_cndmask_b16 v255, -|0xfe0b|, -|vcc_hi|, null ; encoding: [0xff,0x03,0x5d,0xd6,0xff,0xd6,0xf0,0x61,0x0b,0xfe,0x00,0x00] +# W32-REAL16: v_cndmask_b16 v255.l, -|0xfe0b|, -|vcc_hi|, null ; encoding: [0xff,0x03,0x5d,0xd6,0xff,0xd6,0xf0,0x61,0x0b,0xfe,0x00,0x00] +# W32-FAKE16: v_cndmask_b16 v255, -|0xfe0b|, -|vcc_hi|, null ; encoding: 
[0xff,0x03,0x5d,0xd6,0xff,0xd6,0xf0,0x61,0x0b,0xfe,0x00,0x00] +# W64-REAL16: v_cndmask_b16 v255.l, -|0xfe0b|, -|vcc_hi|, null ; encoding: [0xff,0x03,0x5d,0xd6,0xff,0xd6,0xf0,0x61,0x0b,0xfe,0x00,0x00] +# W64-FAKE16: v_cndmask_b16 v255, -|0xfe0b|, -|vcc_hi|, null ; encoding: [0xff,0x03,0x5d,0xd6,0xff,0xd6,0xf0,0x61,0x0b,0xfe,0x00,0x00] + +0x05,0x08,0x5d,0xd6,0xff,0xe1,0x19,0x00 +# W32-REAL16: v_cndmask_b16 v5.l, v255.h, 0x3800, s6 ; encoding: [0x05,0x08,0x5d,0xd6,0xff,0xff,0x19,0x00,0x00,0x38,0x00,0x00] +# W32-FAKE16: v_cndmask_b16 v5, v255, 0x3800, s6 ; encoding: [0x05,0x00,0x5d,0xd6,0xff,0xff,0x19,0x00,0x00,0x38,0x00,0x00] +# W64-REAL16: v_cndmask_b16 v5.l, v255.h, 0x3800, s[6:7] ; encoding: [0x05,0x08,0x5d,0xd6,0xff,0xff,0x19,0x00,0x00,0x38,0x00,0x00] +# W64-FAKE16: v_cndmask_b16 v5, v255, 0x3800, s[6:7] ; encoding: [0x05,0x00,0x5d,0xd6,0xff,0xff,0x19,0x00,0x00,0x38,0x00,0x00] + +0x05,0x10,0x5d,0xd6,0x7d,0xfe,0x1b,0x00 +# W32-REAL16: v_cndmask_b16 v5.l, m0, v255.h, s6 ; encoding: [0x05,0x10,0x5d,0xd6,0x7d,0xfe,0x1b,0x00] +# W32-FAKE16: v_cndmask_b16 v5, m0, v255, s6 ; encoding: [0x05,0x00,0x5d,0xd6,0x7d,0xfe,0x1b,0x00] +# W64-REAL16: v_cndmask_b16 v5.l, m0, v255.h, s[6:7] ; encoding: [0x05,0x10,0x5d,0xd6,0x7d,0xfe,0x1b,0x00] +# W64-FAKE16: v_cndmask_b16 v5, m0, v255, s[6:7] ; encoding: [0x05,0x00,0x5d,0xd6,0x7d,0xfe,0x1b,0x00] + +0xff,0x43,0x5d,0xd6,0xff,0xd6,0xf0,0x61,0x0b,0xfe,0x00,0x00 +# W32-REAL16: v_cndmask_b16 v255.h, -|0xfe0b|, -|vcc_hi|, null ; encoding: [0xff,0x43,0x5d,0xd6,0xff,0xd6,0xf0,0x61,0x0b,0xfe,0x00,0x00] +# W32-FAKE16: v_cndmask_b16 v255, -|0xfe0b|, -|vcc_hi|, null ; encoding: [0xff,0x03,0x5d,0xd6,0xff,0xd6,0xf0,0x61,0x0b,0xfe,0x00,0x00] +# W64-REAL16: v_cndmask_b16 v255.h, -|0xfe0b|, -|vcc_hi|, null ; encoding: [0xff,0x43,0x5d,0xd6,0xff,0xd6,0xf0,0x61,0x0b,0xfe,0x00,0x00] +# W64-FAKE16: v_cndmask_b16 v255, -|0xfe0b|, -|vcc_hi|, null ; encoding: [0xff,0x03,0x5d,0xd6,0xff,0xd6,0xf0,0x61,0x0b,0xfe,0x00,0x00] 
0x05,0x00,0x0c,0xd6,0x01,0x05,0x0e,0x00 # GFX12: v_cubeid_f32 v5, v1, v2, s3 ; encoding: [0x05,0x00,0x0c,0xd6,0x01,0x05,0x0e,0x00] diff --git a/llvm/test/MC/Disassembler/AMDGPU/gfx12_dasm_vop3_dpp16.txt b/llvm/test/MC/Disassembler/AMDGPU/gfx12_dasm_vop3_dpp16.txt index 7e30a4a2096b1..0be540da8287b 100644 --- a/llvm/test/MC/Disassembler/AMDGPU/gfx12_dasm_vop3_dpp16.txt +++ b/llvm/test/MC/Disassembler/AMDGPU/gfx12_dasm_vop3_dpp16.txt @@ -789,59 +789,106 @@ # GFX12: v_bfm_b32_e64_dpp v255, v255, v255 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xff,0x00,0x1d,0xd7,0xfa,0xfe,0x03,0x00,0xff,0x6f,0x0d,0x30] 0x05,0x00,0x5d,0xd6,0xfa,0x04,0x1a,0x00,0x01,0x1b,0x00,0xff -# W32: v_cndmask_b16_e64_dpp v5, v1, v2, s6 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x5d,0xd6,0xfa,0x04,0x1a,0x00,0x01,0x1b,0x00,0xff] -# W64: v_cndmask_b16_e64_dpp v5, v1, v2, s[6:7] quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x5d,0xd6,0xfa,0x04,0x1a,0x00,0x01,0x1b,0x00,0xff] +# W32-REAL16: v_cndmask_b16_e64_dpp v5.l, v1.l, v2.l, s6 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x5d,0xd6,0xfa,0x04,0x1a,0x00,0x01,0x1b,0x00,0xff] +# W32-FAKE16: v_cndmask_b16_e64_dpp v5, v1, v2, s6 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x5d,0xd6,0xfa,0x04,0x1a,0x00,0x01,0x1b,0x00,0xff] +# W64-REAL16: v_cndmask_b16_e64_dpp v5.l, v1.l, v2.l, s[6:7] quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x5d,0xd6,0xfa,0x04,0x1a,0x00,0x01,0x1b,0x00,0xff] +# W64-FAKE16: v_cndmask_b16_e64_dpp v5, v1, v2, s[6:7] quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x5d,0xd6,0xfa,0x04,0x1a,0x00,0x01,0x1b,0x00,0xff] 0x05,0x00,0x5d,0xd6,0xfa,0x04,0x1a,0x00,0x01,0xe4,0x00,0xff -# W32: v_cndmask_b16_e64_dpp v5, v1, v2, s6 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x5d,0xd6,0xfa,0x04,0x1a,0x00,0x01,0xe4,0x00,0xff] -# W64: v_cndmask_b16_e64_dpp 
v5, v1, v2, s[6:7] quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x5d,0xd6,0xfa,0x04,0x1a,0x00,0x01,0xe4,0x00,0xff] +# W32-REAL16: v_cndmask_b16_e64_dpp v5.l, v1.l, v2.l, s6 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x5d,0xd6,0xfa,0x04,0x1a,0x00,0x01,0xe4,0x00,0xff] +# W32-FAKE16: v_cndmask_b16_e64_dpp v5, v1, v2, s6 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x5d,0xd6,0xfa,0x04,0x1a,0x00,0x01,0xe4,0x00,0xff] +# W64-REAL16: v_cndmask_b16_e64_dpp v5.l, v1.l, v2.l, s[6:7] quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x5d,0xd6,0xfa,0x04,0x1a,0x00,0x01,0xe4,0x00,0xff] +# W64-FAKE16: v_cndmask_b16_e64_dpp v5, v1, v2, s[6:7] quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x5d,0xd6,0xfa,0x04,0x1a,0x00,0x01,0xe4,0x00,0xff] 0x05,0x00,0x5d,0xd6,0xfa,0x04,0x1a,0x00,0x01,0x40,0x01,0xff -# W32: v_cndmask_b16_e64_dpp v5, v1, v2, s6 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x5d,0xd6,0xfa,0x04,0x1a,0x00,0x01,0x40,0x01,0xff] -# W64: v_cndmask_b16_e64_dpp v5, v1, v2, s[6:7] row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x5d,0xd6,0xfa,0x04,0x1a,0x00,0x01,0x40,0x01,0xff] +# W32-REAL16: v_cndmask_b16_e64_dpp v5.l, v1.l, v2.l, s6 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x5d,0xd6,0xfa,0x04,0x1a,0x00,0x01,0x40,0x01,0xff] +# W32-FAKE16: v_cndmask_b16_e64_dpp v5, v1, v2, s6 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x5d,0xd6,0xfa,0x04,0x1a,0x00,0x01,0x40,0x01,0xff] +# W64-REAL16: v_cndmask_b16_e64_dpp v5.l, v1.l, v2.l, s[6:7] row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x5d,0xd6,0xfa,0x04,0x1a,0x00,0x01,0x40,0x01,0xff] +# W64-FAKE16: v_cndmask_b16_e64_dpp v5, v1, v2, s[6:7] row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x5d,0xd6,0xfa,0x04,0x1a,0x00,0x01,0x40,0x01,0xff] 0x05,0x00,0x5d,0xd6,0xfa,0x04,0x1a,0x00,0x01,0x41,0x01,0xff -# W32: v_cndmask_b16_e64_dpp v5, 
v1, v2, s6 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x5d,0xd6,0xfa,0x04,0x1a,0x00,0x01,0x41,0x01,0xff] -# W64: v_cndmask_b16_e64_dpp v5, v1, v2, s[6:7] row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x5d,0xd6,0xfa,0x04,0x1a,0x00,0x01,0x41,0x01,0xff] +# W32-REAL16: v_cndmask_b16_e64_dpp v5.l, v1.l, v2.l, s6 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x5d,0xd6,0xfa,0x04,0x1a,0x00,0x01,0x41,0x01,0xff] +# W32-FAKE16: v_cndmask_b16_e64_dpp v5, v1, v2, s6 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x5d,0xd6,0xfa,0x04,0x1a,0x00,0x01,0x41,0x01,0xff] +# W64-REAL16: v_cndmask_b16_e64_dpp v5.l, v1.l, v2.l, s[6:7] row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x5d,0xd6,0xfa,0x04,0x1a,0x00,0x01,0x41,0x01,0xff] +# W64-FAKE16: v_cndmask_b16_e64_dpp v5, v1, v2, s[6:7] row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x5d,0xd6,0xfa,0x04,0x1a,0x00,0x01,0x41,0x01,0xff] 0x05,0x00,0x5d,0xd6,0xfa,0x04,0x1a,0x00,0x01,0x01,0x01,0xff -# W32: v_cndmask_b16_e64_dpp v5, v1, v2, s6 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x5d,0xd6,0xfa,0x04,0x1a,0x00,0x01,0x01,0x01,0xff] -# W64: v_cndmask_b16_e64_dpp v5, v1, v2, s[6:7] row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x5d,0xd6,0xfa,0x04,0x1a,0x00,0x01,0x01,0x01,0xff] +# W32-REAL16: v_cndmask_b16_e64_dpp v5.l, v1.l, v2.l, s6 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x5d,0xd6,0xfa,0x04,0x1a,0x00,0x01,0x01,0x01,0xff] +# W32-FAKE16: v_cndmask_b16_e64_dpp v5, v1, v2, s6 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x5d,0xd6,0xfa,0x04,0x1a,0x00,0x01,0x01,0x01,0xff] +# W64-REAL16: v_cndmask_b16_e64_dpp v5.l, v1.l, v2.l, s[6:7] row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x5d,0xd6,0xfa,0x04,0x1a,0x00,0x01,0x01,0x01,0xff] +# W64-FAKE16: v_cndmask_b16_e64_dpp v5, v1, v2, s[6:7] row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: 
[0x05,0x00,0x5d,0xd6,0xfa,0x04,0x1a,0x00,0x01,0x01,0x01,0xff] 0x05,0x00,0x5d,0xd6,0xfa,0x04,0x1a,0x00,0x01,0x0f,0x01,0xff -# W32: v_cndmask_b16_e64_dpp v5, v1, v2, s6 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x5d,0xd6,0xfa,0x04,0x1a,0x00,0x01,0x0f,0x01,0xff] -# W64: v_cndmask_b16_e64_dpp v5, v1, v2, s[6:7] row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x5d,0xd6,0xfa,0x04,0x1a,0x00,0x01,0x0f,0x01,0xff] +# W32-REAL16: v_cndmask_b16_e64_dpp v5.l, v1.l, v2.l, s6 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x5d,0xd6,0xfa,0x04,0x1a,0x00,0x01,0x0f,0x01,0xff] +# W32-FAKE16: v_cndmask_b16_e64_dpp v5, v1, v2, s6 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x5d,0xd6,0xfa,0x04,0x1a,0x00,0x01,0x0f,0x01,0xff] +# W64-REAL16: v_cndmask_b16_e64_dpp v5.l, v1.l, v2.l, s[6:7] row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x5d,0xd6,0xfa,0x04,0x1a,0x00,0x01,0x0f,0x01,0xff] +# W64-FAKE16: v_cndmask_b16_e64_dpp v5, v1, v2, s[6:7] row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x5d,0xd6,0xfa,0x04,0x1a,0x00,0x01,0x0f,0x01,0xff] 0x05,0x00,0x5d,0xd6,0xfa,0x04,0x1a,0x00,0x01,0x11,0x01,0xff -# W32: v_cndmask_b16_e64_dpp v5, v1, v2, s6 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x5d,0xd6,0xfa,0x04,0x1a,0x00,0x01,0x11,0x01,0xff] -# W64: v_cndmask_b16_e64_dpp v5, v1, v2, s[6:7] row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x5d,0xd6,0xfa,0x04,0x1a,0x00,0x01,0x11,0x01,0xff] +# W32-REAL16: v_cndmask_b16_e64_dpp v5.l, v1.l, v2.l, s6 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x5d,0xd6,0xfa,0x04,0x1a,0x00,0x01,0x11,0x01,0xff] +# W32-FAKE16: v_cndmask_b16_e64_dpp v5, v1, v2, s6 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x5d,0xd6,0xfa,0x04,0x1a,0x00,0x01,0x11,0x01,0xff] +# W64-REAL16: v_cndmask_b16_e64_dpp v5.l, v1.l, v2.l, s[6:7] row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x5d,0xd6,0xfa,0x04,0x1a,0x00,0x01,0x11,0x01,0xff] +# 
W64-FAKE16: v_cndmask_b16_e64_dpp v5, v1, v2, s[6:7] row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x5d,0xd6,0xfa,0x04,0x1a,0x00,0x01,0x11,0x01,0xff] 0x05,0x00,0x5d,0xd6,0xfa,0x04,0x1a,0x00,0x01,0x1f,0x01,0xff -# W32: v_cndmask_b16_e64_dpp v5, v1, v2, s6 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x5d,0xd6,0xfa,0x04,0x1a,0x00,0x01,0x1f,0x01,0xff] -# W64: v_cndmask_b16_e64_dpp v5, v1, v2, s[6:7] row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x5d,0xd6,0xfa,0x04,0x1a,0x00,0x01,0x1f,0x01,0xff] +# W32-REAL16: v_cndmask_b16_e64_dpp v5.l, v1.l, v2.l, s6 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x5d,0xd6,0xfa,0x04,0x1a,0x00,0x01,0x1f,0x01,0xff] +# W32-FAKE16: v_cndmask_b16_e64_dpp v5, v1, v2, s6 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x5d,0xd6,0xfa,0x04,0x1a,0x00,0x01,0x1f,0x01,0xff] +# W64-REAL16: v_cndmask_b16_e64_dpp v5.l, v1.l, v2.l, s[6:7] row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x5d,0xd6,0xfa,0x04,0x1a,0x00,0x01,0x1f,0x01,0xff] +# W64-FAKE16: v_cndmask_b16_e64_dpp v5, v1, v2, s[6:7] row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x5d,0xd6,0xfa,0x04,0x1a,0x00,0x01,0x1f,0x01,0xff] 0x05,0x00,0x5d,0xd6,0xfa,0x04,0x1a,0x00,0x01,0x21,0x01,0xff -# W32: v_cndmask_b16_e64_dpp v5, v1, v2, s6 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x5d,0xd6,0xfa,0x04,0x1a,0x00,0x01,0x21,0x01,0xff] -# W64: v_cndmask_b16_e64_dpp v5, v1, v2, s[6:7] row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x5d,0xd6,0xfa,0x04,0x1a,0x00,0x01,0x21,0x01,0xff] +# W32-REAL16: v_cndmask_b16_e64_dpp v5.l, v1.l, v2.l, s6 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x5d,0xd6,0xfa,0x04,0x1a,0x00,0x01,0x21,0x01,0xff] +# W32-FAKE16: v_cndmask_b16_e64_dpp v5, v1, v2, s6 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x5d,0xd6,0xfa,0x04,0x1a,0x00,0x01,0x21,0x01,0xff] +# W64-REAL16: v_cndmask_b16_e64_dpp v5.l, v1.l, v2.l, s[6:7] row_ror:1 
row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x5d,0xd6,0xfa,0x04,0x1a,0x00,0x01,0x21,0x01,0xff] +# W64-FAKE16: v_cndmask_b16_e64_dpp v5, v1, v2, s[6:7] row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x5d,0xd6,0xfa,0x04,0x1a,0x00,0x01,0x21,0x01,0xff] 0x05,0x00,0x5d,0xd6,0xfa,0x04,0x1a,0x00,0x01,0x2f,0x01,0xff -# W32: v_cndmask_b16_e64_dpp v5, v1, v2, s6 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x5d,0xd6,0xfa,0x04,0x1a,0x00,0x01,0x2f,0x01,0xff] -# W64: v_cndmask_b16_e64_dpp v5, v1, v2, s[6:7] row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x5d,0xd6,0xfa,0x04,0x1a,0x00,0x01,0x2f,0x01,0xff] +# W32-REAL16: v_cndmask_b16_e64_dpp v5.l, v1.l, v2.l, s6 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x5d,0xd6,0xfa,0x04,0x1a,0x00,0x01,0x2f,0x01,0xff] +# W32-FAKE16: v_cndmask_b16_e64_dpp v5, v1, v2, s6 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x5d,0xd6,0xfa,0x04,0x1a,0x00,0x01,0x2f,0x01,0xff] +# W64-REAL16: v_cndmask_b16_e64_dpp v5.l, v1.l, v2.l, s[6:7] row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x5d,0xd6,0xfa,0x04,0x1a,0x00,0x01,0x2f,0x01,0xff] +# W64-FAKE16: v_cndmask_b16_e64_dpp v5, v1, v2, s[6:7] row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x5d,0xd6,0xfa,0x04,0x1a,0x00,0x01,0x2f,0x01,0xff] 0x05,0x00,0x5d,0xd6,0xfa,0x04,0xa2,0x01,0x01,0x50,0x01,0xff -# W32: v_cndmask_b16_e64_dpp v5, v1, v2, s104 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x5d,0xd6,0xfa,0x04,0xa2,0x01,0x01,0x50,0x01,0xff] -# W64: v_cndmask_b16_e64_dpp v5, v1, v2, s[104:105] row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x5d,0xd6,0xfa,0x04,0xa2,0x01,0x01,0x50,0x01,0xff] +# W32-REAL16: v_cndmask_b16_e64_dpp v5.l, v1.l, v2.l, s104 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x5d,0xd6,0xfa,0x04,0xa2,0x01,0x01,0x50,0x01,0xff] +# W32-FAKE16: v_cndmask_b16_e64_dpp v5, v1, v2, s104 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: 
[0x05,0x00,0x5d,0xd6,0xfa,0x04,0xa2,0x01,0x01,0x50,0x01,0xff] +# W64-REAL16: v_cndmask_b16_e64_dpp v5.l, v1.l, v2.l, s[104:105] row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x5d,0xd6,0xfa,0x04,0xa2,0x01,0x01,0x50,0x01,0xff] +# W64-FAKE16: v_cndmask_b16_e64_dpp v5, v1, v2, s[104:105] row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x5d,0xd6,0xfa,0x04,0xa2,0x01,0x01,0x50,0x01,0xff] 0x05,0x00,0x5d,0xd6,0xfa,0x04,0xaa,0x01,0x01,0x5f,0x01,0x01 -# W32: v_cndmask_b16_e64_dpp v5, v1, v2, vcc_lo row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x05,0x00,0x5d,0xd6,0xfa,0x04,0xaa,0x01,0x01,0x5f,0x01,0x01] -# W64: v_cndmask_b16_e64_dpp v5, v1, v2, vcc row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x05,0x00,0x5d,0xd6,0xfa,0x04,0xaa,0x01,0x01,0x5f,0x01,0x01] +# W32-REAL16: v_cndmask_b16_e64_dpp v5.l, v1.l, v2.l, vcc_lo row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x05,0x00,0x5d,0xd6,0xfa,0x04,0xaa,0x01,0x01,0x5f,0x01,0x01] +# W32-FAKE16: v_cndmask_b16_e64_dpp v5, v1, v2, vcc_lo row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x05,0x00,0x5d,0xd6,0xfa,0x04,0xaa,0x01,0x01,0x5f,0x01,0x01] +# W64-REAL16: v_cndmask_b16_e64_dpp v5.l, v1.l, v2.l, vcc row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x05,0x00,0x5d,0xd6,0xfa,0x04,0xaa,0x01,0x01,0x5f,0x01,0x01] +# W64-FAKE16: v_cndmask_b16_e64_dpp v5, v1, v2, vcc row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x05,0x00,0x5d,0xd6,0xfa,0x04,0xaa,0x01,0x01,0x5f,0x01,0x01] 0x05,0x00,0x5d,0xd6,0xfa,0x04,0xea,0x01,0x01,0x60,0x01,0x13 -# W32: v_cndmask_b16_e64_dpp v5, v1, v2, ttmp14 row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0x05,0x00,0x5d,0xd6,0xfa,0x04,0xea,0x01,0x01,0x60,0x01,0x13] -# W64: v_cndmask_b16_e64_dpp v5, v1, v2, ttmp[14:15] row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0x05,0x00,0x5d,0xd6,0xfa,0x04,0xea,0x01,0x01,0x60,0x01,0x13] +# W32-REAL16: v_cndmask_b16_e64_dpp v5.l, v1.l, v2.l, ttmp14 row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: 
[0x05,0x00,0x5d,0xd6,0xfa,0x04,0xea,0x01,0x01,0x60,0x01,0x13] +# W32-FAKE16: v_cndmask_b16_e64_dpp v5, v1, v2, ttmp14 row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0x05,0x00,0x5d,0xd6,0xfa,0x04,0xea,0x01,0x01,0x60,0x01,0x13] +# W64-REAL16: v_cndmask_b16_e64_dpp v5.l, v1.l, v2.l, ttmp[14:15] row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0x05,0x00,0x5d,0xd6,0xfa,0x04,0xea,0x01,0x01,0x60,0x01,0x13] +# W64-FAKE16: v_cndmask_b16_e64_dpp v5, v1, v2, ttmp[14:15] row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0x05,0x00,0x5d,0xd6,0xfa,0x04,0xea,0x01,0x01,0x60,0x01,0x13] 0xff,0x00,0x5d,0xd6,0xfa,0xfe,0xf3,0x01,0xff,0x6f,0x0d,0x30 -# GFX12: v_cndmask_b16_e64_dpp v255, v255, v255, null row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xff,0x00,0x5d,0xd6,0xfa,0xfe,0xf3,0x01,0xff,0x6f,0x0d,0x30] +# W32-REAL16: v_cndmask_b16_e64_dpp v255.l, v255.l, v255.l, null row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xff,0x00,0x5d,0xd6,0xfa,0xfe,0xf3,0x01,0xff,0x6f,0x0d,0x30] +# W32-FAKE16: v_cndmask_b16_e64_dpp v255, v255, v255, null row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xff,0x00,0x5d,0xd6,0xfa,0xfe,0xf3,0x01,0xff,0x6f,0x0d,0x30] +# W64-REAL16: v_cndmask_b16_e64_dpp v255.l, v255.l, v255.l, null row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xff,0x00,0x5d,0xd6,0xfa,0xfe,0xf3,0x01,0xff,0x6f,0x0d,0x30] +# W64-FAKE16: v_cndmask_b16_e64_dpp v255, v255, v255, null row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xff,0x00,0x5d,0xd6,0xfa,0xfe,0xf3,0x01,0xff,0x6f,0x0d,0x30] + +0x05,0x09,0x5d,0xd6,0xfa,0x04,0xaa,0x41,0x01,0x5f,0x01,0x01 +# W32-REAL16: v_cndmask_b16_e64_dpp v5.l, |v1.h|, -v2.l, vcc_lo row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x05,0x09,0x5d,0xd6,0xfa,0x04,0xaa,0x41,0x01,0x5f,0x01,0x01] +# W32-FAKE16: v_cndmask_b16_e64_dpp v5, |v1|, -v2, vcc_lo row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: 
[0x05,0x01,0x5d,0xd6,0xfa,0x04,0xaa,0x41,0x01,0x5f,0x01,0x01] +# W64-REAL16: v_cndmask_b16_e64_dpp v5.l, |v1.h|, -v2.l, vcc row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x05,0x09,0x5d,0xd6,0xfa,0x04,0xaa,0x41,0x01,0x5f,0x01,0x01] +# W64-FAKE16: v_cndmask_b16_e64_dpp v5, |v1|, -v2, vcc row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x05,0x01,0x5d,0xd6,0xfa,0x04,0xaa,0x41,0x01,0x5f,0x01,0x01] + +0x05,0x12,0x5d,0xd6,0xfa,0x04,0xea,0x21,0x01,0x60,0x01,0x13 +# W32-REAL16: v_cndmask_b16_e64_dpp v5.l, -v1.l, |v2.h|, ttmp14 row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0x05,0x12,0x5d,0xd6,0xfa,0x04,0xea,0x21,0x01,0x60,0x01,0x13] +# W32-FAKE16: v_cndmask_b16_e64_dpp v5, -v1, |v2|, ttmp14 row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0x05,0x02,0x5d,0xd6,0xfa,0x04,0xea,0x21,0x01,0x60,0x01,0x13] +# W64-REAL16: v_cndmask_b16_e64_dpp v5.l, -v1.l, |v2.h|, ttmp[14:15] row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0x05,0x12,0x5d,0xd6,0xfa,0x04,0xea,0x21,0x01,0x60,0x01,0x13] +# W64-FAKE16: v_cndmask_b16_e64_dpp v5, -v1, |v2|, ttmp[14:15] row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0x05,0x02,0x5d,0xd6,0xfa,0x04,0xea,0x21,0x01,0x60,0x01,0x13] + +0xff,0x43,0x5d,0xd6,0xfa,0xfe,0xf3,0x61,0xff,0x6f,0x0d,0x30 +# W32-REAL16: v_cndmask_b16_e64_dpp v255.h, -|v255.l|, -|v255.l|, null row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xff,0x43,0x5d,0xd6,0xfa,0xfe,0xf3,0x61,0xff,0x6f,0x0d,0x30] +# W32-FAKE16: v_cndmask_b16_e64_dpp v255, -|v255|, -|v255|, null row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xff,0x03,0x5d,0xd6,0xfa,0xfe,0xf3,0x61,0xff,0x6f,0x0d,0x30] +# W64-REAL16: v_cndmask_b16_e64_dpp v255.h, -|v255.l|, -|v255.l|, null row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xff,0x43,0x5d,0xd6,0xfa,0xfe,0xf3,0x61,0xff,0x6f,0x0d,0x30] +# W64-FAKE16: v_cndmask_b16_e64_dpp v255, -|v255|, -|v255|, null row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: 
[0xff,0x03,0x5d,0xd6,0xfa,0xfe,0xf3,0x61,0xff,0x6f,0x0d,0x30] 0x05,0x00,0x0c,0xd6,0xfa,0x04,0x0e,0x04,0x01,0x1b,0x00,0xff # GFX12: v_cubeid_f32_e64_dpp v5, v1, v2, v3 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x0c,0xd6,0xfa,0x04,0x0e,0x04,0x01,0x1b,0x00,0xff] diff --git a/llvm/test/MC/Disassembler/AMDGPU/gfx12_dasm_vop3_dpp8.txt b/llvm/test/MC/Disassembler/AMDGPU/gfx12_dasm_vop3_dpp8.txt index 2aaba2a17fae6..343a71abb27d0 100644 --- a/llvm/test/MC/Disassembler/AMDGPU/gfx12_dasm_vop3_dpp8.txt +++ b/llvm/test/MC/Disassembler/AMDGPU/gfx12_dasm_vop3_dpp8.txt @@ -447,23 +447,52 @@ # GFX12: v_bfm_b32_e64_dpp v255, v255, v255 dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xff,0x00,0x1d,0xd7,0xea,0xfe,0x03,0x00,0xff,0x00,0x00,0x00] 0x05,0x00,0x5d,0xd6,0xe9,0x04,0x1a,0x00,0x01,0x77,0x39,0x05 -# W32: v_cndmask_b16_e64_dpp v5, v1, v2, s6 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x5d,0xd6,0xe9,0x04,0x1a,0x00,0x01,0x77,0x39,0x05] -# W64: v_cndmask_b16_e64_dpp v5, v1, v2, s[6:7] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x5d,0xd6,0xe9,0x04,0x1a,0x00,0x01,0x77,0x39,0x05] +# W32-REAL16: v_cndmask_b16_e64_dpp v5.l, v1.l, v2.l, s6 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x5d,0xd6,0xe9,0x04,0x1a,0x00,0x01,0x77,0x39,0x05] +# W32-FAKE16: v_cndmask_b16_e64_dpp v5, v1, v2, s6 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x5d,0xd6,0xe9,0x04,0x1a,0x00,0x01,0x77,0x39,0x05] +# W64-REAL16: v_cndmask_b16_e64_dpp v5.l, v1.l, v2.l, s[6:7] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x5d,0xd6,0xe9,0x04,0x1a,0x00,0x01,0x77,0x39,0x05] +# W64-FAKE16: v_cndmask_b16_e64_dpp v5, v1, v2, s[6:7] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x5d,0xd6,0xe9,0x04,0x1a,0x00,0x01,0x77,0x39,0x05] 0x05,0x00,0x5d,0xd6,0xe9,0x04,0xa2,0x01,0x01,0x77,0x39,0x05 -# W32: v_cndmask_b16_e64_dpp v5, v1, v2, s104 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x5d,0xd6,0xe9,0x04,0xa2,0x01,0x01,0x77,0x39,0x05] -# W64: v_cndmask_b16_e64_dpp v5, v1, v2, s[104:105] 
dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x5d,0xd6,0xe9,0x04,0xa2,0x01,0x01,0x77,0x39,0x05] +# W32-REAL16: v_cndmask_b16_e64_dpp v5.l, v1.l, v2.l, s104 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x5d,0xd6,0xe9,0x04,0xa2,0x01,0x01,0x77,0x39,0x05] +# W32-FAKE16: v_cndmask_b16_e64_dpp v5, v1, v2, s104 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x5d,0xd6,0xe9,0x04,0xa2,0x01,0x01,0x77,0x39,0x05] +# W64-REAL16: v_cndmask_b16_e64_dpp v5.l, v1.l, v2.l, s[104:105] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x5d,0xd6,0xe9,0x04,0xa2,0x01,0x01,0x77,0x39,0x05] +# W64-FAKE16: v_cndmask_b16_e64_dpp v5, v1, v2, s[104:105] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x5d,0xd6,0xe9,0x04,0xa2,0x01,0x01,0x77,0x39,0x05] 0x05,0x00,0x5d,0xd6,0xe9,0x04,0xaa,0x01,0x01,0x77,0x39,0x05 -# W32: v_cndmask_b16_e64_dpp v5, v1, v2, vcc_lo dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x5d,0xd6,0xe9,0x04,0xaa,0x01,0x01,0x77,0x39,0x05] -# W64: v_cndmask_b16_e64_dpp v5, v1, v2, vcc dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x5d,0xd6,0xe9,0x04,0xaa,0x01,0x01,0x77,0x39,0x05] +# W32-REAL16: v_cndmask_b16_e64_dpp v5.l, v1.l, v2.l, vcc_lo dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x5d,0xd6,0xe9,0x04,0xaa,0x01,0x01,0x77,0x39,0x05] +# W32-FAKE16: v_cndmask_b16_e64_dpp v5, v1, v2, vcc_lo dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x5d,0xd6,0xe9,0x04,0xaa,0x01,0x01,0x77,0x39,0x05] +# W64-REAL16: v_cndmask_b16_e64_dpp v5.l, v1.l, v2.l, vcc dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x5d,0xd6,0xe9,0x04,0xaa,0x01,0x01,0x77,0x39,0x05] +# W64-FAKE16: v_cndmask_b16_e64_dpp v5, v1, v2, vcc dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x5d,0xd6,0xe9,0x04,0xaa,0x01,0x01,0x77,0x39,0x05] 0x05,0x00,0x5d,0xd6,0xe9,0x04,0xea,0x01,0x01,0x77,0x39,0x05 -# W32: v_cndmask_b16_e64_dpp v5, v1, v2, ttmp14 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x5d,0xd6,0xe9,0x04,0xea,0x01,0x01,0x77,0x39,0x05] -# W64: v_cndmask_b16_e64_dpp v5, v1, v2, ttmp[14:15] dpp8:[7,6,5,4,3,2,1,0] ; encoding: 
[0x05,0x00,0x5d,0xd6,0xe9,0x04,0xea,0x01,0x01,0x77,0x39,0x05] +# W32-REAL16: v_cndmask_b16_e64_dpp v5.l, v1.l, v2.l, ttmp14 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x5d,0xd6,0xe9,0x04,0xea,0x01,0x01,0x77,0x39,0x05] +# W32-FAKE16: v_cndmask_b16_e64_dpp v5, v1, v2, ttmp14 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x5d,0xd6,0xe9,0x04,0xea,0x01,0x01,0x77,0x39,0x05] +# W64-REAL16: v_cndmask_b16_e64_dpp v5.l, v1.l, v2.l, ttmp[14:15] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x5d,0xd6,0xe9,0x04,0xea,0x01,0x01,0x77,0x39,0x05] +# W64-FAKE16: v_cndmask_b16_e64_dpp v5, v1, v2, ttmp[14:15] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x5d,0xd6,0xe9,0x04,0xea,0x01,0x01,0x77,0x39,0x05] 0xff,0x00,0x5d,0xd6,0xea,0xfe,0xf3,0x01,0xff,0x00,0x00,0x00 -# GFX12: v_cndmask_b16_e64_dpp v255, v255, v255, null dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xff,0x00,0x5d,0xd6,0xea,0xfe,0xf3,0x01,0xff,0x00,0x00,0x00] +# W32-REAL16: v_cndmask_b16_e64_dpp v255.l, v255.l, v255.l, null dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xff,0x00,0x5d,0xd6,0xea,0xfe,0xf3,0x01,0xff,0x00,0x00,0x00] +# W32-FAKE16: v_cndmask_b16_e64_dpp v255, v255, v255, null dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xff,0x00,0x5d,0xd6,0xea,0xfe,0xf3,0x01,0xff,0x00,0x00,0x00] +# W64-REAL16: v_cndmask_b16_e64_dpp v255.l, v255.l, v255.l, null dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xff,0x00,0x5d,0xd6,0xea,0xfe,0xf3,0x01,0xff,0x00,0x00,0x00] +# W64-FAKE16: v_cndmask_b16_e64_dpp v255, v255, v255, null dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xff,0x00,0x5d,0xd6,0xea,0xfe,0xf3,0x01,0xff,0x00,0x00,0x00] + +0x05,0x09,0x5d,0xd6,0xe9,0x04,0xaa,0x41,0x01,0x77,0x39,0x05 +# W32-REAL16: v_cndmask_b16_e64_dpp v5.l, |v1.h|, -v2.l, vcc_lo dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x09,0x5d,0xd6,0xe9,0x04,0xaa,0x41,0x01,0x77,0x39,0x05] +# W32-FAKE16: v_cndmask_b16_e64_dpp v5, |v1|, -v2, vcc_lo dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x01,0x5d,0xd6,0xe9,0x04,0xaa,0x41,0x01,0x77,0x39,0x05] +# W64-REAL16: v_cndmask_b16_e64_dpp v5.l, 
|v1.h|, -v2.l, vcc dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x09,0x5d,0xd6,0xe9,0x04,0xaa,0x41,0x01,0x77,0x39,0x05] +# W64-FAKE16: v_cndmask_b16_e64_dpp v5, |v1|, -v2, vcc dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x01,0x5d,0xd6,0xe9,0x04,0xaa,0x41,0x01,0x77,0x39,0x05] + +0x05,0x12,0x5d,0xd6,0xe9,0x04,0xea,0x21,0x01,0x77,0x39,0x05 +# W32-REAL16: v_cndmask_b16_e64_dpp v5.l, -v1.l, |v2.h|, ttmp14 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x12,0x5d,0xd6,0xe9,0x04,0xea,0x21,0x01,0x77,0x39,0x05] +# W32-FAKE16: v_cndmask_b16_e64_dpp v5, -v1, |v2|, ttmp14 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x02,0x5d,0xd6,0xe9,0x04,0xea,0x21,0x01,0x77,0x39,0x05] +# W64-REAL16: v_cndmask_b16_e64_dpp v5.l, -v1.l, |v2.h|, ttmp[14:15] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x12,0x5d,0xd6,0xe9,0x04,0xea,0x21,0x01,0x77,0x39,0x05] +# W64-FAKE16: v_cndmask_b16_e64_dpp v5, -v1, |v2|, ttmp[14:15] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x02,0x5d,0xd6,0xe9,0x04,0xea,0x21,0x01,0x77,0x39,0x05] + +0xff,0x43,0x5d,0xd6,0xea,0xfe,0xf3,0x61,0xff,0x00,0x00,0x00 +# W32-REAL16: v_cndmask_b16_e64_dpp v255.h, -|v255.l|, -|v255.l|, null dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xff,0x43,0x5d,0xd6,0xea,0xfe,0xf3,0x61,0xff,0x00,0x00,0x00] +# W32-FAKE16: v_cndmask_b16_e64_dpp v255, -|v255|, -|v255|, null dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xff,0x03,0x5d,0xd6,0xea,0xfe,0xf3,0x61,0xff,0x00,0x00,0x00] +# W64-REAL16: v_cndmask_b16_e64_dpp v255.h, -|v255.l|, -|v255.l|, null dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xff,0x43,0x5d,0xd6,0xea,0xfe,0xf3,0x61,0xff,0x00,0x00,0x00] +# W64-FAKE16: v_cndmask_b16_e64_dpp v255, -|v255|, -|v255|, null dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xff,0x03,0x5d,0xd6,0xea,0xfe,0xf3,0x61,0xff,0x00,0x00,0x00] 0x05,0x00,0x0c,0xd6,0xe9,0x04,0x0e,0x04,0x01,0x77,0x39,0x05 # GFX12: v_cubeid_f32_e64_dpp v5, v1, v2, v3 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x0c,0xd6,0xe9,0x04,0x0e,0x04,0x01,0x77,0x39,0x05] diff --git 
a/llvm/test/MC/Disassembler/AMDGPU/gfx12_dasm_vop3_from_vop1.txt b/llvm/test/MC/Disassembler/AMDGPU/gfx12_dasm_vop3_from_vop1.txt index 25c4e4ad43b1b..3ccf6feac4cca 100644 --- a/llvm/test/MC/Disassembler/AMDGPU/gfx12_dasm_vop3_from_vop1.txt +++ b/llvm/test/MC/Disassembler/AMDGPU/gfx12_dasm_vop3_from_vop1.txt @@ -2199,7 +2199,6 @@ # GFX12-REAL16: v_fract_f16_e64 v255.h, -|0xfe0b| op_sel:[0,1] clamp div:2 ; encoding: [0xff,0xc1,0xdf,0xd5,0xff,0x00,0x00,0x38,0x0b,0xfe,0x00,0x00] # GFX12-FAKE16: v_fract_f16_e64 v255, -|0xfe0b| clamp div:2 ; encoding: [0xff,0x81,0xdf,0xd5,0xff,0x00,0x00,0x38,0x0b,0xfe,0x00,0x00] -# GFX11: v_fract_f16_e64 v5.h, v1.h op_sel:[1,1] ; encoding: [0x05,0x48,0xdf,0xd5,0x01,0x01,0x00,0x00] 0x05,0x00,0xa0,0xd5,0x01,0x01,0x00,0x00 # GFX12: v_fract_f32_e64 v5, v1 ; encoding: [0x05,0x00,0xa0,0xd5,0x01,0x01,0x00,0x00] @@ -2504,7 +2503,6 @@ # GFX12-REAL16: v_frexp_mant_f16_e64 v255.h, -|0xfe0b| op_sel:[0,1] clamp div:2 ; encoding: [0xff,0xc1,0xd9,0xd5,0xff,0x00,0x00,0x38,0x0b,0xfe,0x00,0x00] # GFX12-FAKE16: v_frexp_mant_f16_e64 v255, -|0xfe0b| clamp div:2 ; encoding: [0xff,0x81,0xd9,0xd5,0xff,0x00,0x00,0x38,0x0b,0xfe,0x00,0x00] -# GFX11: v_frexp_mant_f16_e64 v5.h, v1.h op_sel:[1,1] ; encoding: [0x05,0x48,0xd9,0xd5,0x01,0x01,0x00,0x00] 0x05,0x00,0xc0,0xd5,0x01,0x01,0x00,0x00 # GFX12: v_frexp_mant_f32_e64 v5, v1 ; encoding: [0x05,0x00,0xc0,0xd5,0x01,0x01,0x00,0x00] diff --git a/llvm/test/MC/Disassembler/AMDGPU/gfx12_dasm_vop3_from_vop1_dpp16.txt b/llvm/test/MC/Disassembler/AMDGPU/gfx12_dasm_vop3_from_vop1_dpp16.txt index f447fb42afc7b..a020b0ae46a37 100644 --- a/llvm/test/MC/Disassembler/AMDGPU/gfx12_dasm_vop3_from_vop1_dpp16.txt +++ b/llvm/test/MC/Disassembler/AMDGPU/gfx12_dasm_vop3_from_vop1_dpp16.txt @@ -1794,7 +1794,6 @@ # GFX12-REAL16: v_fract_f16_e64_dpp v255.h, -|v255.l| op_sel:[0,1] clamp div:2 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xff,0xc1,0xdf,0xd5,0xfa,0x00,0x00,0x38,0xff,0x6f,0x0d,0x30] # GFX12-FAKE16: 
v_fract_f16_e64_dpp v255, -|v255| clamp div:2 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xff,0x81,0xdf,0xd5,0xfa,0x00,0x00,0x38,0xff,0x6f,0x0d,0x30] -# GFX11: v_fract_f16_e64_dpp v5.h, v1.h op_sel:[1,1] mul:2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x05,0x48,0xdf,0xd5,0xfa,0x00,0x00,0x08,0x01,0x5f,0x01,0x01] 0x05,0x00,0xa0,0xd5,0xfa,0x00,0x00,0x00,0x01,0x1b,0x00,0xff # GFX12: v_fract_f32_e64_dpp v5, v1 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xa0,0xd5,0xfa,0x00,0x00,0x00,0x01,0x1b,0x00,0xff] @@ -2013,7 +2012,6 @@ # GFX12-REAL16: v_frexp_mant_f16_e64_dpp v255.h, -|v255.l| op_sel:[0,1] clamp div:2 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xff,0xc1,0xd9,0xd5,0xfa,0x00,0x00,0x38,0xff,0x6f,0x0d,0x30] # GFX12-FAKE16: v_frexp_mant_f16_e64_dpp v255, -|v255| clamp div:2 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xff,0x81,0xd9,0xd5,0xfa,0x00,0x00,0x38,0xff,0x6f,0x0d,0x30] -# GFX11: v_frexp_mant_f16_e64_dpp v5.h, v1.h op_sel:[1,1] mul:2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x05,0x48,0xd9,0xd5,0xfa,0x00,0x00,0x08,0x01,0x5f,0x01,0x01] 0x05,0x00,0xc0,0xd5,0xfa,0x00,0x00,0x00,0x01,0x1b,0x00,0xff # GFX12: v_frexp_mant_f32_e64_dpp v5, v1 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xc0,0xd5,0xfa,0x00,0x00,0x00,0x01,0x1b,0x00,0xff] diff --git a/llvm/test/MC/Disassembler/AMDGPU/gfx12_dasm_vop3_from_vop1_dpp8.txt b/llvm/test/MC/Disassembler/AMDGPU/gfx12_dasm_vop3_from_vop1_dpp8.txt index 7cf415aad5a19..ad3c673b4e390 100644 --- a/llvm/test/MC/Disassembler/AMDGPU/gfx12_dasm_vop3_from_vop1_dpp8.txt +++ b/llvm/test/MC/Disassembler/AMDGPU/gfx12_dasm_vop3_from_vop1_dpp8.txt @@ -528,7 +528,6 @@ # GFX12-REAL16: v_fract_f16_e64_dpp v255.h, -|v255.l| op_sel:[0,1] clamp div:2 dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xff,0xc1,0xdf,0xd5,0xea,0x00,0x00,0x38,0xff,0x00,0x00,0x00] # GFX12-FAKE16: v_fract_f16_e64_dpp v255, -|v255| 
clamp div:2 dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xff,0x81,0xdf,0xd5,0xea,0x00,0x00,0x38,0xff,0x00,0x00,0x00] -# GFX11: v_fract_f16_e64_dpp v5.h, v1.h op_sel:[1,1] mul:2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x48,0xdf,0xd5,0xe9,0x00,0x00,0x08,0x01,0x77,0x39,0x05] 0x05,0x00,0xa0,0xd5,0xe9,0x00,0x00,0x00,0x01,0x77,0x39,0x05 # GFX12: v_fract_f32_e64_dpp v5, v1 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0xa0,0xd5,0xe9,0x00,0x00,0x00,0x01,0x77,0x39,0x05] @@ -593,7 +592,6 @@ # GFX12-REAL16: v_frexp_mant_f16_e64_dpp v255.h, -|v255.l| op_sel:[0,1] clamp div:2 dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xff,0xc1,0xd9,0xd5,0xea,0x00,0x00,0x38,0xff,0x00,0x00,0x00] # GFX12-FAKE16: v_frexp_mant_f16_e64_dpp v255, -|v255| clamp div:2 dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xff,0x81,0xd9,0xd5,0xea,0x00,0x00,0x38,0xff,0x00,0x00,0x00] -# GFX11: v_frexp_mant_f16_e64_dpp v5.h, v1.h op_sel:[1,1] mul:2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x48,0xd9,0xd5,0xe9,0x00,0x00,0x08,0x01,0x77,0x39,0x05] 0x05,0x00,0xc0,0xd5,0xe9,0x00,0x00,0x00,0x01,0x77,0x39,0x05 # GFX12: v_frexp_mant_f32_e64_dpp v5, v1 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0xc0,0xd5,0xe9,0x00,0x00,0x00,0x01,0x77,0x39,0x05] diff --git a/llvm/test/MC/Disassembler/AMDGPU/gfx12_dasm_vop3cx.txt b/llvm/test/MC/Disassembler/AMDGPU/gfx12_dasm_vop3cx.txt index 46f255c2f484f..55e0bf6c525ec 100644 --- a/llvm/test/MC/Disassembler/AMDGPU/gfx12_dasm_vop3cx.txt +++ b/llvm/test/MC/Disassembler/AMDGPU/gfx12_dasm_vop3cx.txt @@ -1,8 +1,8 @@ # NOTE: Assertions have been autogenerated by utils/update_mc_test_checks.py UTC_ARGS: --version 5 -# RUN: llvm-mc -triple=amdgcn -mcpu=gfx1200 -mattr=+real-true16 -disassemble -show-encoding < %s | FileCheck -check-prefixes=GFX12 %s -# RUN: llvm-mc -triple=amdgcn -mcpu=gfx1200 -mattr=-real-true16 -disassemble -show-encoding < %s | FileCheck -check-prefixes=GFX12 %s -# RUN: llvm-mc -triple=amdgcn -mcpu=gfx1200 -mattr=+wavefrontsize64,+real-true16 -disassemble -show-encoding < %s | FileCheck 
-check-prefixes=GFX12 %s -# RUN: llvm-mc -triple=amdgcn -mcpu=gfx1200 -mattr=+wavefrontsize64,-real-true16 -disassemble -show-encoding < %s | FileCheck -check-prefixes=GFX12 %s +# RUN: llvm-mc -triple=amdgcn -mcpu=gfx1200 -mattr=+real-true16 -disassemble -show-encoding < %s | FileCheck -check-prefixes=GFX12,GFX12-REAL16 %s +# RUN: llvm-mc -triple=amdgcn -mcpu=gfx1200 -mattr=-real-true16 -disassemble -show-encoding < %s | FileCheck -check-prefixes=GFX12,GFX12-FAKE16 %s +# RUN: llvm-mc -triple=amdgcn -mcpu=gfx1200 -mattr=+wavefrontsize64,+real-true16 -disassemble -show-encoding < %s | FileCheck -check-prefixes=GFX12,GFX12-REAL16 %s +# RUN: llvm-mc -triple=amdgcn -mcpu=gfx1200 -mattr=+wavefrontsize64,-real-true16 -disassemble -show-encoding < %s | FileCheck -check-prefixes=GFX12,GFX12-FAKE16 %s 0x7e,0x00,0xfd,0xd4,0x01,0x05,0x02,0x00 # GFX12: v_cmpx_class_f16_e64 v1, v2 ; encoding: [0x7e,0x00,0xfd,0xd4,0x01,0x05,0x02,0x00] @@ -1778,10 +1778,12 @@ # GFX12: v_cmpx_lg_f64_e64 0xaf123456, -|vcc| clamp ; encoding: [0x7e,0x82,0xa5,0xd4,0xff,0xd4,0x00,0x40,0x56,0x34,0x12,0xaf] 0x7e,0x00,0x81,0xd4,0x01,0x05,0x02,0x00 -# GFX12: v_cmpx_lt_f16_e64 v1, v2 ; encoding: [0x7e,0x00,0x81,0xd4,0x01,0x05,0x02,0x00] +# GFX12-REAL16: v_cmpx_lt_f16_e64 v1.l, v2.l ; encoding: [0x7e,0x00,0x81,0xd4,0x01,0x05,0x02,0x00] +# GFX12-FAKE16: v_cmpx_lt_f16_e64 v1, v2 ; encoding: [0x7e,0x00,0x81,0xd4,0x01,0x05,0x02,0x00] 0x7e,0x00,0x81,0xd4,0xff,0xff,0x03,0x00 -# GFX12: v_cmpx_lt_f16_e64 v255, v255 ; encoding: [0x7e,0x00,0x81,0xd4,0xff,0xff,0x03,0x00] +# GFX12-REAL16: v_cmpx_lt_f16_e64 v255.l, v255.l ; encoding: [0x7e,0x00,0x81,0xd4,0xff,0xff,0x03,0x00] +# GFX12-FAKE16: v_cmpx_lt_f16_e64 v255, v255 ; encoding: [0x7e,0x00,0x81,0xd4,0xff,0xff,0x03,0x00] 0x7e,0x00,0x81,0xd4,0x01,0x04,0x00,0x00 # GFX12: v_cmpx_lt_f16_e64 s1, s2 ; encoding: [0x7e,0x00,0x81,0xd4,0x01,0x04,0x00,0x00] @@ -1822,6 +1824,14 @@ 0x7e,0x83,0x81,0xd4,0xff,0xd6,0x00,0x60,0x0b,0xfe,0x00,0x00 # GFX12: v_cmpx_lt_f16_e64 -|0xfe0b|, 
-|vcc_hi| clamp ; encoding: [0x7e,0x83,0x81,0xd4,0xff,0xd6,0x00,0x60,0x0b,0xfe,0x00,0x00] +0x7e,0x08,0x81,0xd4,0x01,0x05,0x02,0x00 +# GFX12-REAL16: v_cmpx_lt_f16_e64 v1.h, v2.l op_sel:[1,0,0] ; encoding: [0x7e,0x08,0x81,0xd4,0x01,0x05,0x02,0x00] +# GFX12-FAKE16: v_cmpx_lt_f16_e64 v1, v2 ; encoding: [0x7e,0x00,0x81,0xd4,0x01,0x05,0x02,0x00] + +0x7e,0x10,0x81,0xd4,0xff,0xff,0x03,0x00 +# GFX12-REAL16: v_cmpx_lt_f16_e64 v255.l, v255.h op_sel:[0,1,0] ; encoding: [0x7e,0x10,0x81,0xd4,0xff,0xff,0x03,0x00] +# GFX12-FAKE16: v_cmpx_lt_f16_e64 v255, v255 ; encoding: [0x7e,0x00,0x81,0xd4,0xff,0xff,0x03,0x00] + 0x7e,0x00,0x91,0xd4,0x01,0x05,0x02,0x00 # GFX12: v_cmpx_lt_f32_e64 v1, v2 ; encoding: [0x7e,0x00,0x91,0xd4,0x01,0x05,0x02,0x00] diff --git a/llvm/test/MC/Disassembler/AMDGPU/gfx12_dasm_vop3cx_dpp16.txt b/llvm/test/MC/Disassembler/AMDGPU/gfx12_dasm_vop3cx_dpp16.txt index 3550b6fc5e95d..041e43f4d05e5 100644 --- a/llvm/test/MC/Disassembler/AMDGPU/gfx12_dasm_vop3cx_dpp16.txt +++ b/llvm/test/MC/Disassembler/AMDGPU/gfx12_dasm_vop3cx_dpp16.txt @@ -1,8 +1,8 @@ # NOTE: Assertions have been autogenerated by utils/update_mc_test_checks.py UTC_ARGS: --version 5 -# RUN: llvm-mc -triple=amdgcn -mcpu=gfx1200 -mattr=+real-true16 -disassemble -show-encoding < %s | FileCheck -check-prefixes=GFX12 %s -# RUN: llvm-mc -triple=amdgcn -mcpu=gfx1200 -mattr=-real-true16 -disassemble -show-encoding < %s | FileCheck -check-prefixes=GFX12 %s -# RUN: llvm-mc -triple=amdgcn -mcpu=gfx1200 -mattr=+wavefrontsize64,+real-true16 -disassemble -show-encoding < %s | FileCheck -check-prefixes=GFX12 %s -# RUN: llvm-mc -triple=amdgcn -mcpu=gfx1200 -mattr=+wavefrontsize64,-real-true16 -disassemble -show-encoding < %s | FileCheck -check-prefixes=GFX12 %s +# RUN: llvm-mc -triple=amdgcn -mcpu=gfx1200 -mattr=+real-true16 -disassemble -show-encoding < %s | FileCheck -check-prefixes=GFX12,GFX12-REAL16 %s +# RUN: llvm-mc -triple=amdgcn -mcpu=gfx1200 -mattr=-real-true16 -disassemble -show-encoding < %s | FileCheck 
-check-prefixes=GFX12,GFX12-FAKE16 %s +# RUN: llvm-mc -triple=amdgcn -mcpu=gfx1200 -mattr=+wavefrontsize64,+real-true16 -disassemble -show-encoding < %s | FileCheck -check-prefixes=GFX12,GFX12-REAL16 %s +# RUN: llvm-mc -triple=amdgcn -mcpu=gfx1200 -mattr=+wavefrontsize64,-real-true16 -disassemble -show-encoding < %s | FileCheck -check-prefixes=GFX12,GFX12-FAKE16 %s 0x7e,0x00,0xfd,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff # GFX12: v_cmpx_class_f16_e64_dpp v1, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xfd,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff] @@ -1268,49 +1268,123 @@ # GFX12: v_cmpx_lg_f32_e64_dpp -|v255|, -|v255| clamp row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0x7e,0x83,0x95,0xd4,0xfa,0xfe,0x03,0x60,0xff,0x6f,0x0d,0x30] 0x7e,0x00,0x81,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff -# GFX12: v_cmpx_lt_f16_e64_dpp v1, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x81,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff] +# GFX12-REAL16: v_cmpx_lt_f16_e64_dpp v1.l, v2.l quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x81,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff] +# GFX12-FAKE16: v_cmpx_lt_f16_e64_dpp v1, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x81,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff] 0x7e,0x00,0x81,0xd4,0xfa,0x04,0x02,0x00,0x01,0xe4,0x00,0xff -# GFX12: v_cmpx_lt_f16_e64_dpp v1, v2 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x81,0xd4,0xfa,0x04,0x02,0x00,0x01,0xe4,0x00,0xff] +# GFX12-REAL16: v_cmpx_lt_f16_e64_dpp v1.l, v2.l quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x81,0xd4,0xfa,0x04,0x02,0x00,0x01,0xe4,0x00,0xff] +# GFX12-FAKE16: v_cmpx_lt_f16_e64_dpp v1, v2 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x81,0xd4,0xfa,0x04,0x02,0x00,0x01,0xe4,0x00,0xff] 0x7e,0x00,0x81,0xd4,0xfa,0x04,0x02,0x00,0x01,0x40,0x01,0xff -# GFX12: 
v_cmpx_lt_f16_e64_dpp v1, v2 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x81,0xd4,0xfa,0x04,0x02,0x00,0x01,0x40,0x01,0xff] +# GFX12-REAL16: v_cmpx_lt_f16_e64_dpp v1.l, v2.l row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x81,0xd4,0xfa,0x04,0x02,0x00,0x01,0x40,0x01,0xff] +# GFX12-FAKE16: v_cmpx_lt_f16_e64_dpp v1, v2 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x81,0xd4,0xfa,0x04,0x02,0x00,0x01,0x40,0x01,0xff] 0x7e,0x00,0x81,0xd4,0xfa,0x04,0x02,0x00,0x01,0x41,0x01,0xff -# GFX12: v_cmpx_lt_f16_e64_dpp v1, v2 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x81,0xd4,0xfa,0x04,0x02,0x00,0x01,0x41,0x01,0xff] +# GFX12-REAL16: v_cmpx_lt_f16_e64_dpp v1.l, v2.l row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x81,0xd4,0xfa,0x04,0x02,0x00,0x01,0x41,0x01,0xff] +# GFX12-FAKE16: v_cmpx_lt_f16_e64_dpp v1, v2 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x81,0xd4,0xfa,0x04,0x02,0x00,0x01,0x41,0x01,0xff] 0x7e,0x00,0x81,0xd4,0xfa,0x04,0x02,0x00,0x01,0x01,0x01,0xff -# GFX12: v_cmpx_lt_f16_e64_dpp v1, v2 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x81,0xd4,0xfa,0x04,0x02,0x00,0x01,0x01,0x01,0xff] +# GFX12-REAL16: v_cmpx_lt_f16_e64_dpp v1.l, v2.l row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x81,0xd4,0xfa,0x04,0x02,0x00,0x01,0x01,0x01,0xff] +# GFX12-FAKE16: v_cmpx_lt_f16_e64_dpp v1, v2 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x81,0xd4,0xfa,0x04,0x02,0x00,0x01,0x01,0x01,0xff] 0x7e,0x00,0x81,0xd4,0xfa,0x04,0x02,0x00,0x01,0x0f,0x01,0xff -# GFX12: v_cmpx_lt_f16_e64_dpp v1, v2 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x81,0xd4,0xfa,0x04,0x02,0x00,0x01,0x0f,0x01,0xff] +# GFX12-REAL16: v_cmpx_lt_f16_e64_dpp v1.l, v2.l row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x81,0xd4,0xfa,0x04,0x02,0x00,0x01,0x0f,0x01,0xff] +# GFX12-FAKE16: v_cmpx_lt_f16_e64_dpp v1, v2 row_shl:15 row_mask:0xf bank_mask:0xf ; 
encoding: [0x7e,0x00,0x81,0xd4,0xfa,0x04,0x02,0x00,0x01,0x0f,0x01,0xff] 0x7e,0x00,0x81,0xd4,0xfa,0x04,0x02,0x00,0x01,0x11,0x01,0xff -# GFX12: v_cmpx_lt_f16_e64_dpp v1, v2 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x81,0xd4,0xfa,0x04,0x02,0x00,0x01,0x11,0x01,0xff] +# GFX12-REAL16: v_cmpx_lt_f16_e64_dpp v1.l, v2.l row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x81,0xd4,0xfa,0x04,0x02,0x00,0x01,0x11,0x01,0xff] +# GFX12-FAKE16: v_cmpx_lt_f16_e64_dpp v1, v2 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x81,0xd4,0xfa,0x04,0x02,0x00,0x01,0x11,0x01,0xff] 0x7e,0x00,0x81,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1f,0x01,0xff -# GFX12: v_cmpx_lt_f16_e64_dpp v1, v2 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x81,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1f,0x01,0xff] +# GFX12-REAL16: v_cmpx_lt_f16_e64_dpp v1.l, v2.l row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x81,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1f,0x01,0xff] +# GFX12-FAKE16: v_cmpx_lt_f16_e64_dpp v1, v2 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x81,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1f,0x01,0xff] 0x7e,0x00,0x81,0xd4,0xfa,0x04,0x02,0x00,0x01,0x21,0x01,0xff -# GFX12: v_cmpx_lt_f16_e64_dpp v1, v2 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x81,0xd4,0xfa,0x04,0x02,0x00,0x01,0x21,0x01,0xff] +# GFX12-REAL16: v_cmpx_lt_f16_e64_dpp v1.l, v2.l row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x81,0xd4,0xfa,0x04,0x02,0x00,0x01,0x21,0x01,0xff] +# GFX12-FAKE16: v_cmpx_lt_f16_e64_dpp v1, v2 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x81,0xd4,0xfa,0x04,0x02,0x00,0x01,0x21,0x01,0xff] 0x7e,0x00,0x81,0xd4,0xfa,0x04,0x02,0x00,0x01,0x2f,0x01,0xff -# GFX12: v_cmpx_lt_f16_e64_dpp v1, v2 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x81,0xd4,0xfa,0x04,0x02,0x00,0x01,0x2f,0x01,0xff] +# GFX12-REAL16: v_cmpx_lt_f16_e64_dpp v1.l, v2.l row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: 
[0x7e,0x00,0x81,0xd4,0xfa,0x04,0x02,0x00,0x01,0x2f,0x01,0xff] +# GFX12-FAKE16: v_cmpx_lt_f16_e64_dpp v1, v2 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x81,0xd4,0xfa,0x04,0x02,0x00,0x01,0x2f,0x01,0xff] 0x7e,0x00,0x81,0xd4,0xfa,0x06,0x00,0x00,0x01,0x2f,0x01,0xff -# GFX12: v_cmpx_lt_f16_e64_dpp v1, s3 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x81,0xd4,0xfa,0x06,0x00,0x00,0x01,0x2f,0x01,0xff] +# GFX12-REAL16: v_cmpx_lt_f16_e64_dpp v1.l, s3 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x81,0xd4,0xfa,0x06,0x00,0x00,0x01,0x2f,0x01,0xff] +# GFX12-FAKE16: v_cmpx_lt_f16_e64_dpp v1, s3 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x81,0xd4,0xfa,0x06,0x00,0x00,0x01,0x2f,0x01,0xff] 0x7e,0x00,0x81,0xd4,0xfa,0x04,0x02,0x00,0x01,0x50,0x01,0xff -# GFX12: v_cmpx_lt_f16_e64_dpp v1, v2 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x81,0xd4,0xfa,0x04,0x02,0x00,0x01,0x50,0x01,0xff] +# GFX12-REAL16: v_cmpx_lt_f16_e64_dpp v1.l, v2.l row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x81,0xd4,0xfa,0x04,0x02,0x00,0x01,0x50,0x01,0xff] +# GFX12-FAKE16: v_cmpx_lt_f16_e64_dpp v1, v2 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x81,0xd4,0xfa,0x04,0x02,0x00,0x01,0x50,0x01,0xff] 0x7e,0x01,0x81,0xd4,0xfa,0x04,0x02,0x40,0x01,0x5f,0x01,0x01 -# GFX12: v_cmpx_lt_f16_e64_dpp |v1|, -v2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x7e,0x01,0x81,0xd4,0xfa,0x04,0x02,0x40,0x01,0x5f,0x01,0x01] +# GFX12-REAL16: v_cmpx_lt_f16_e64_dpp |v1.l|, -v2.l row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x7e,0x01,0x81,0xd4,0xfa,0x04,0x02,0x40,0x01,0x5f,0x01,0x01] +# GFX12-FAKE16: v_cmpx_lt_f16_e64_dpp |v1|, -v2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x7e,0x01,0x81,0xd4,0xfa,0x04,0x02,0x40,0x01,0x5f,0x01,0x01] 0x7e,0x02,0x81,0xd4,0xfa,0x04,0x02,0x20,0x01,0x60,0x01,0x13 -# GFX12: v_cmpx_lt_f16_e64_dpp -v1, |v2| row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: 
[0x7e,0x02,0x81,0xd4,0xfa,0x04,0x02,0x20,0x01,0x60,0x01,0x13] +# GFX12-REAL16: v_cmpx_lt_f16_e64_dpp -v1.l, |v2.l| row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0x7e,0x02,0x81,0xd4,0xfa,0x04,0x02,0x20,0x01,0x60,0x01,0x13] +# GFX12-FAKE16: v_cmpx_lt_f16_e64_dpp -v1, |v2| row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0x7e,0x02,0x81,0xd4,0xfa,0x04,0x02,0x20,0x01,0x60,0x01,0x13] 0x7e,0x83,0x81,0xd4,0xfa,0xfe,0x03,0x60,0xff,0x6f,0x0d,0x30 -# GFX12: v_cmpx_lt_f16_e64_dpp -|v255|, -|v255| clamp row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0x7e,0x83,0x81,0xd4,0xfa,0xfe,0x03,0x60,0xff,0x6f,0x0d,0x30] +# GFX12-REAL16: v_cmpx_lt_f16_e64_dpp -|v255.l|, -|v255.l| clamp row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0x7e,0x83,0x81,0xd4,0xfa,0xfe,0x03,0x60,0xff,0x6f,0x0d,0x30] +# GFX12-FAKE16: v_cmpx_lt_f16_e64_dpp -|v255|, -|v255| clamp row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0x7e,0x83,0x81,0xd4,0xfa,0xfe,0x03,0x60,0xff,0x6f,0x0d,0x30] + +0x7e,0x08,0x81,0xd4,0x01,0x05,0x02,0x00 +# GFX12-REAL16: v_cmpx_lt_f16_e64 v1.h, v2.l op_sel:[1,0,0] ; encoding: [0x7e,0x08,0x81,0xd4,0x01,0x05,0x02,0x00] +# GFX12-FAKE16: v_cmpx_lt_f16_e64 v1, v2 ; encoding: [0x7e,0x00,0x81,0xd4,0x01,0x05,0x02,0x00] + +0x7e,0x10,0x81,0xd4,0xff,0xff,0x03,0x00 +# GFX12-REAL16: v_cmpx_lt_f16_e64 v255.l, v255.h op_sel:[0,1,0] ; encoding: [0x7e,0x10,0x81,0xd4,0xff,0xff,0x03,0x00] +# GFX12-FAKE16: v_cmpx_lt_f16_e64 v255, v255 ; encoding: [0x7e,0x00,0x81,0xd4,0xff,0xff,0x03,0x00] + +0x7e,0x00,0x81,0xd4,0x01,0x04,0x00,0x00 +# GFX12: v_cmpx_lt_f16_e64 s1, s2 ; encoding: [0x7e,0x00,0x81,0xd4,0x01,0x04,0x00,0x00] + +0x7e,0x00,0x81,0xd4,0x69,0xd2,0x00,0x00 +# GFX12: v_cmpx_lt_f16_e64 s105, s105 ; encoding: [0x7e,0x00,0x81,0xd4,0x69,0xd2,0x00,0x00] + +0x7e,0x00,0x81,0xd4,0x6a,0xf6,0x00,0x00 +# GFX12: v_cmpx_lt_f16_e64 vcc_lo, ttmp15 ; encoding: [0x7e,0x00,0x81,0xd4,0x6a,0xf6,0x00,0x00] + 
+0x7e,0x00,0x81,0xd4,0x6b,0xfe,0x01,0x00,0x0b,0xfe,0x00,0x00 +# GFX12: v_cmpx_lt_f16_e64 vcc_hi, 0xfe0b ; encoding: [0x7e,0x00,0x81,0xd4,0x6b,0xfe,0x01,0x00,0x0b,0xfe,0x00,0x00] + +0x7e,0x00,0x81,0xd4,0x7b,0xfa,0x01,0x00 +# GFX12: v_cmpx_lt_f16_e64 ttmp15, src_scc ; encoding: [0x7e,0x00,0x81,0xd4,0x7b,0xfa,0x01,0x00] + +0x7e,0x00,0x81,0xd4,0x7d,0xe0,0x01,0x00 +# GFX12: v_cmpx_lt_f16_e64 m0, 0.5 ; encoding: [0x7e,0x00,0x81,0xd4,0x7d,0xe0,0x01,0x00] + +0x7e,0x00,0x81,0xd4,0x7e,0x82,0x01,0x00 +# GFX12: v_cmpx_lt_f16_e64 exec_lo, -1 ; encoding: [0x7e,0x00,0x81,0xd4,0x7e,0x82,0x01,0x00] + +0x7e,0x01,0x81,0xd4,0x7f,0xf8,0x00,0x00 +# GFX12: v_cmpx_lt_f16_e64 |exec_hi|, null ; encoding: [0x7e,0x01,0x81,0xd4,0x7f,0xf8,0x00,0x00] + +0x7e,0x00,0x81,0xd4,0x7c,0xfc,0x00,0x00 +# GFX12: v_cmpx_lt_f16_e64 null, exec_lo ; encoding: [0x7e,0x00,0x81,0xd4,0x7c,0xfc,0x00,0x00] + +0x7e,0x00,0x81,0xd4,0xc1,0xfe,0x00,0x00 +# GFX12: v_cmpx_lt_f16_e64 -1, exec_hi ; encoding: [0x7e,0x00,0x81,0xd4,0xc1,0xfe,0x00,0x00] + +0x7e,0x00,0x81,0xd4,0xf0,0xfa,0x00,0x40 +# GFX12: v_cmpx_lt_f16_e64 0.5, -m0 ; encoding: [0x7e,0x00,0x81,0xd4,0xf0,0xfa,0x00,0x40] + +0x7e,0x02,0x81,0xd4,0xfd,0xd4,0x00,0x20 +# GFX12: v_cmpx_lt_f16_e64 -src_scc, |vcc_lo| ; encoding: [0x7e,0x02,0x81,0xd4,0xfd,0xd4,0x00,0x20] + +0x7e,0x83,0x81,0xd4,0xff,0xd6,0x00,0x60,0x0b,0xfe,0x00,0x00 +# GFX12: v_cmpx_lt_f16_e64 -|0xfe0b|, -|vcc_hi| clamp ; encoding: [0x7e,0x83,0x81,0xd4,0xff,0xd6,0x00,0x60,0x0b,0xfe,0x00,0x00] + +0x7e,0x19,0x81,0xd4,0xfa,0x04,0x02,0x40,0x01,0x5f,0x01,0x01 +# GFX12-REAL16: v_cmpx_lt_f16_e64_dpp |v1.h|, -v2.h op_sel:[1,1] row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x7e,0x19,0x81,0xd4,0xfa,0x04,0x02,0x40,0x01,0x5f,0x01,0x01] +# GFX12-FAKE16: v_cmpx_lt_f16_e64_dpp |v1|, -v2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x7e,0x01,0x81,0xd4,0xfa,0x04,0x02,0x40,0x01,0x5f,0x01,0x01] + +0x7e,0x0a,0x81,0xd4,0xfa,0x04,0x02,0x20,0x01,0x60,0x01,0x13 +# GFX12-REAL16: v_cmpx_lt_f16_e64_dpp -v1.h, |v2.l| 
op_sel:[1,0] row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0x7e,0x0a,0x81,0xd4,0xfa,0x04,0x02,0x20,0x01,0x60,0x01,0x13] +# GFX12-FAKE16: v_cmpx_lt_f16_e64_dpp -v1, |v2| row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0x7e,0x02,0x81,0xd4,0xfa,0x04,0x02,0x20,0x01,0x60,0x01,0x13] + +0x7e,0x93,0x81,0xd4,0xfa,0xfe,0x03,0x60,0xff,0x6f,0x0d,0x30 +# GFX12-REAL16: v_cmpx_lt_f16_e64_dpp -|v255.l|, -|v255.h| op_sel:[0,1] clamp row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0x7e,0x93,0x81,0xd4,0xfa,0xfe,0x03,0x60,0xff,0x6f,0x0d,0x30] +# GFX12-FAKE16: v_cmpx_lt_f16_e64_dpp -|v255|, -|v255| clamp row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0x7e,0x83,0x81,0xd4,0xfa,0xfe,0x03,0x60,0xff,0x6f,0x0d,0x30] 0x7e,0x00,0x91,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff # GFX12: v_cmpx_lt_f32_e64_dpp v1, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x91,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff] diff --git a/llvm/test/MC/Disassembler/AMDGPU/gfx12_dasm_vop3cx_dpp8.txt b/llvm/test/MC/Disassembler/AMDGPU/gfx12_dasm_vop3cx_dpp8.txt index 9442dcc4fb1d5..35e7a45a7b162 100644 --- a/llvm/test/MC/Disassembler/AMDGPU/gfx12_dasm_vop3cx_dpp8.txt +++ b/llvm/test/MC/Disassembler/AMDGPU/gfx12_dasm_vop3cx_dpp8.txt @@ -1,8 +1,8 @@ # NOTE: Assertions have been autogenerated by utils/update_mc_test_checks.py UTC_ARGS: --version 5 -# RUN: llvm-mc -triple=amdgcn -mcpu=gfx1200 -mattr=+real-true16 -disassemble -show-encoding < %s | FileCheck -check-prefixes=GFX12 %s -# RUN: llvm-mc -triple=amdgcn -mcpu=gfx1200 -mattr=-real-true16 -disassemble -show-encoding < %s | FileCheck -check-prefixes=GFX12 %s -# RUN: llvm-mc -triple=amdgcn -mcpu=gfx1200 -mattr=+wavefrontsize64,+real-true16 -disassemble -show-encoding < %s | FileCheck -check-prefixes=GFX12 %s -# RUN: llvm-mc -triple=amdgcn -mcpu=gfx1200 -mattr=+wavefrontsize64,-real-true16 -disassemble -show-encoding < %s | FileCheck -check-prefixes=GFX12 %s +# RUN: llvm-mc 
-triple=amdgcn -mcpu=gfx1200 -mattr=+real-true16 -disassemble -show-encoding < %s | FileCheck -check-prefixes=GFX12,GFX12-REAL16 %s +# RUN: llvm-mc -triple=amdgcn -mcpu=gfx1200 -mattr=-real-true16 -disassemble -show-encoding < %s | FileCheck -check-prefixes=GFX12,GFX12-FAKE16 %s +# RUN: llvm-mc -triple=amdgcn -mcpu=gfx1200 -mattr=+wavefrontsize64,+real-true16 -disassemble -show-encoding < %s | FileCheck -check-prefixes=GFX12,GFX12-REAL16 %s +# RUN: llvm-mc -triple=amdgcn -mcpu=gfx1200 -mattr=+wavefrontsize64,-real-true16 -disassemble -show-encoding < %s | FileCheck -check-prefixes=GFX12,GFX12-FAKE16 %s 0x7e,0x00,0xfd,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05 # GFX12: v_cmpx_class_f16_e64_dpp v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x00,0xfd,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05] @@ -323,19 +323,36 @@ # GFX12: v_cmpx_lg_f32_e64_dpp -|v255|, -|v255| clamp dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0x7e,0x83,0x95,0xd4,0xea,0xfe,0x03,0x60,0xff,0x00,0x00,0x00] 0x7e,0x00,0x81,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05 -# GFX12: v_cmpx_lt_f16_e64_dpp v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x00,0x81,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05] +# GFX12-REAL16: v_cmpx_lt_f16_e64_dpp v1.l, v2.l dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x00,0x81,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05] +# GFX12-FAKE16: v_cmpx_lt_f16_e64_dpp v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x00,0x81,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05] 0x7e,0x00,0x81,0xd4,0xe9,0x06,0x00,0x00,0x01,0x77,0x39,0x05 -# GFX12: v_cmpx_lt_f16_e64_dpp v1, s3 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x00,0x81,0xd4,0xe9,0x06,0x00,0x00,0x01,0x77,0x39,0x05] +# GFX12-REAL16: v_cmpx_lt_f16_e64_dpp v1.l, s3 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x00,0x81,0xd4,0xe9,0x06,0x00,0x00,0x01,0x77,0x39,0x05] +# GFX12-FAKE16: v_cmpx_lt_f16_e64_dpp v1, s3 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x00,0x81,0xd4,0xe9,0x06,0x00,0x00,0x01,0x77,0x39,0x05] 
0x7e,0x01,0x81,0xd4,0xe9,0x04,0x02,0x40,0x01,0x77,0x39,0x05 -# GFX12: v_cmpx_lt_f16_e64_dpp |v1|, -v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x01,0x81,0xd4,0xe9,0x04,0x02,0x40,0x01,0x77,0x39,0x05] +# GFX12-REAL16: v_cmpx_lt_f16_e64_dpp |v1.l|, -v2.l dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x01,0x81,0xd4,0xe9,0x04,0x02,0x40,0x01,0x77,0x39,0x05] +# GFX12-FAKE16: v_cmpx_lt_f16_e64_dpp |v1|, -v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x01,0x81,0xd4,0xe9,0x04,0x02,0x40,0x01,0x77,0x39,0x05] 0x7e,0x02,0x81,0xd4,0xe9,0x04,0x02,0x20,0x01,0x77,0x39,0x05 -# GFX12: v_cmpx_lt_f16_e64_dpp -v1, |v2| dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x02,0x81,0xd4,0xe9,0x04,0x02,0x20,0x01,0x77,0x39,0x05] +# GFX12-REAL16: v_cmpx_lt_f16_e64_dpp -v1.l, |v2.l| dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x02,0x81,0xd4,0xe9,0x04,0x02,0x20,0x01,0x77,0x39,0x05] +# GFX12-FAKE16: v_cmpx_lt_f16_e64_dpp -v1, |v2| dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x02,0x81,0xd4,0xe9,0x04,0x02,0x20,0x01,0x77,0x39,0x05] 0x7e,0x83,0x81,0xd4,0xea,0xfe,0x03,0x60,0xff,0x00,0x00,0x00 -# GFX12: v_cmpx_lt_f16_e64_dpp -|v255|, -|v255| clamp dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0x7e,0x83,0x81,0xd4,0xea,0xfe,0x03,0x60,0xff,0x00,0x00,0x00] +# GFX12-REAL16: v_cmpx_lt_f16_e64_dpp -|v255.l|, -|v255.l| clamp dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0x7e,0x83,0x81,0xd4,0xea,0xfe,0x03,0x60,0xff,0x00,0x00,0x00] +# GFX12-FAKE16: v_cmpx_lt_f16_e64_dpp -|v255|, -|v255| clamp dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0x7e,0x83,0x81,0xd4,0xea,0xfe,0x03,0x60,0xff,0x00,0x00,0x00] + +0x7e,0x19,0x81,0xd4,0xe9,0x04,0x02,0x40,0x01,0x77,0x39,0x05 +# GFX12-REAL16: v_cmpx_lt_f16_e64_dpp |v1.h|, -v2.h op_sel:[1,1] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x19,0x81,0xd4,0xe9,0x04,0x02,0x40,0x01,0x77,0x39,0x05] +# GFX12-FAKE16: v_cmpx_lt_f16_e64_dpp |v1|, -v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x01,0x81,0xd4,0xe9,0x04,0x02,0x40,0x01,0x77,0x39,0x05] + +0x7e,0x0a,0x81,0xd4,0xe9,0x04,0x02,0x20,0x01,0x77,0x39,0x05 +# GFX12-REAL16: 
v_cmpx_lt_f16_e64_dpp -v1.h, |v2.l| op_sel:[1,0] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x0a,0x81,0xd4,0xe9,0x04,0x02,0x20,0x01,0x77,0x39,0x05] +# GFX12-FAKE16: v_cmpx_lt_f16_e64_dpp -v1, |v2| dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x02,0x81,0xd4,0xe9,0x04,0x02,0x20,0x01,0x77,0x39,0x05] + +0x7e,0x93,0x81,0xd4,0xea,0xfe,0x03,0x60,0xff,0x00,0x00,0x00 +# GFX12-REAL16: v_cmpx_lt_f16_e64_dpp -|v255.l|, -|v255.h| op_sel:[0,1] clamp dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0x7e,0x93,0x81,0xd4,0xea,0xfe,0x03,0x60,0xff,0x00,0x00,0x00] +# GFX12-FAKE16: v_cmpx_lt_f16_e64_dpp -|v255|, -|v255| clamp dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0x7e,0x83,0x81,0xd4,0xea,0xfe,0x03,0x60,0xff,0x00,0x00,0x00] 0x7e,0x00,0x91,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05 # GFX12: v_cmpx_lt_f32_e64_dpp v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x00,0x91,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05] diff --git a/llvm/test/MC/Disassembler/AMDGPU/gfx12_dasm_vopcx.txt b/llvm/test/MC/Disassembler/AMDGPU/gfx12_dasm_vopcx.txt index 180ec987280d1..9e5959ca4a77e 100644 --- a/llvm/test/MC/Disassembler/AMDGPU/gfx12_dasm_vopcx.txt +++ b/llvm/test/MC/Disassembler/AMDGPU/gfx12_dasm_vopcx.txt @@ -1,8 +1,8 @@ # NOTE: Assertions have been autogenerated by utils/update_mc_test_checks.py UTC_ARGS: --version 5 -# RUN: llvm-mc -triple=amdgcn -mcpu=gfx1200 -mattr=+wavefrontsize32,+real-true16 -disassemble -show-encoding < %s | FileCheck %s --check-prefix=GFX12 -# RUN: llvm-mc -triple=amdgcn -mcpu=gfx1200 -mattr=+wavefrontsize64,+real-true16 -disassemble -show-encoding < %s | FileCheck %s --check-prefix=GFX12 -# RUN: llvm-mc -triple=amdgcn -mcpu=gfx1200 -mattr=+wavefrontsize32,-real-true16 -disassemble -show-encoding < %s | FileCheck %s --check-prefix=GFX12 -# RUN: llvm-mc -triple=amdgcn -mcpu=gfx1200 -mattr=+wavefrontsize64,-real-true16 -disassemble -show-encoding < %s | FileCheck %s --check-prefix=GFX12 +# RUN: llvm-mc -triple=amdgcn -mcpu=gfx1200 -mattr=+wavefrontsize32,+real-true16 -disassemble 
-show-encoding < %s | FileCheck %s --check-prefixes=GFX12,GFX12-REAL16 +# RUN: llvm-mc -triple=amdgcn -mcpu=gfx1200 -mattr=+wavefrontsize64,+real-true16 -disassemble -show-encoding < %s | FileCheck %s --check-prefixes=GFX12,GFX12-REAL16 +# RUN: llvm-mc -triple=amdgcn -mcpu=gfx1200 -mattr=+wavefrontsize32,-real-true16 -disassemble -show-encoding < %s | FileCheck %s --check-prefixes=GFX12,GFX12-FAKE16 +# RUN: llvm-mc -triple=amdgcn -mcpu=gfx1200 -mattr=+wavefrontsize64,-real-true16 -disassemble -show-encoding < %s | FileCheck %s --check-prefixes=GFX12,GFX12-FAKE16 0x01,0x05,0xfa,0x7d # GFX12: v_cmpx_class_f16_e32 v1, v2 ; encoding: [0x01,0x05,0xfa,0x7d] @@ -1769,49 +1769,80 @@ # GFX12: v_cmpx_lg_f64_e32 0xaf123456, v[254:255] ; encoding: [0xff,0xfc,0x4b,0x7d,0x56,0x34,0x12,0xaf] 0x01,0x05,0x02,0x7d -# GFX12: v_cmpx_lt_f16_e32 v1, v2 ; encoding: [0x01,0x05,0x02,0x7d] +# GFX12-REAL16: v_cmpx_lt_f16_e32 v1.l, v2.l ; encoding: [0x01,0x05,0x02,0x7d] +# GFX12-FAKE16: v_cmpx_lt_f16_e32 v1, v2 ; encoding: [0x01,0x05,0x02,0x7d] 0x7f,0x05,0x02,0x7d -# GFX12: v_cmpx_lt_f16_e32 v127, v2 ; encoding: [0x7f,0x05,0x02,0x7d] +# GFX12-REAL16: v_cmpx_lt_f16_e32 v127.l, v2.l ; encoding: [0x7f,0x05,0x02,0x7d] +# GFX12-FAKE16: v_cmpx_lt_f16_e32 v127, v2 ; encoding: [0x7f,0x05,0x02,0x7d] 0x01,0x04,0x02,0x7d -# GFX12: v_cmpx_lt_f16_e32 s1, v2 ; encoding: [0x01,0x04,0x02,0x7d] +# GFX12-REAL16: v_cmpx_lt_f16_e32 s1, v2.l ; encoding: [0x01,0x04,0x02,0x7d] +# GFX12-FAKE16: v_cmpx_lt_f16_e32 s1, v2 ; encoding: [0x01,0x04,0x02,0x7d] 0x69,0x04,0x02,0x7d -# GFX12: v_cmpx_lt_f16_e32 s105, v2 ; encoding: [0x69,0x04,0x02,0x7d] +# GFX12-REAL16: v_cmpx_lt_f16_e32 s105, v2.l ; encoding: [0x69,0x04,0x02,0x7d] +# GFX12-FAKE16: v_cmpx_lt_f16_e32 s105, v2 ; encoding: [0x69,0x04,0x02,0x7d] 0x6a,0x04,0x02,0x7d -# GFX12: v_cmpx_lt_f16_e32 vcc_lo, v2 ; encoding: [0x6a,0x04,0x02,0x7d] +# GFX12-REAL16: v_cmpx_lt_f16_e32 vcc_lo, v2.l ; encoding: [0x6a,0x04,0x02,0x7d] +# GFX12-FAKE16: v_cmpx_lt_f16_e32 vcc_lo, v2 ; 
encoding: [0x6a,0x04,0x02,0x7d] 0x6b,0x04,0x02,0x7d -# GFX12: v_cmpx_lt_f16_e32 vcc_hi, v2 ; encoding: [0x6b,0x04,0x02,0x7d] +# GFX12-REAL16: v_cmpx_lt_f16_e32 vcc_hi, v2.l ; encoding: [0x6b,0x04,0x02,0x7d] +# GFX12-FAKE16: v_cmpx_lt_f16_e32 vcc_hi, v2 ; encoding: [0x6b,0x04,0x02,0x7d] 0x7b,0x04,0x02,0x7d -# GFX12: v_cmpx_lt_f16_e32 ttmp15, v2 ; encoding: [0x7b,0x04,0x02,0x7d] +# GFX12-REAL16: v_cmpx_lt_f16_e32 ttmp15, v2.l ; encoding: [0x7b,0x04,0x02,0x7d] +# GFX12-FAKE16: v_cmpx_lt_f16_e32 ttmp15, v2 ; encoding: [0x7b,0x04,0x02,0x7d] 0x7d,0x04,0x02,0x7d -# GFX12: v_cmpx_lt_f16_e32 m0, v2 ; encoding: [0x7d,0x04,0x02,0x7d] +# GFX12-REAL16: v_cmpx_lt_f16_e32 m0, v2.l ; encoding: [0x7d,0x04,0x02,0x7d] +# GFX12-FAKE16: v_cmpx_lt_f16_e32 m0, v2 ; encoding: [0x7d,0x04,0x02,0x7d] 0x7e,0x04,0x02,0x7d -# GFX12: v_cmpx_lt_f16_e32 exec_lo, v2 ; encoding: [0x7e,0x04,0x02,0x7d] +# GFX12-REAL16: v_cmpx_lt_f16_e32 exec_lo, v2.l ; encoding: [0x7e,0x04,0x02,0x7d] +# GFX12-FAKE16: v_cmpx_lt_f16_e32 exec_lo, v2 ; encoding: [0x7e,0x04,0x02,0x7d] 0x7f,0x04,0x02,0x7d -# GFX12: v_cmpx_lt_f16_e32 exec_hi, v2 ; encoding: [0x7f,0x04,0x02,0x7d] +# GFX12-REAL16: v_cmpx_lt_f16_e32 exec_hi, v2.l ; encoding: [0x7f,0x04,0x02,0x7d] +# GFX12-FAKE16: v_cmpx_lt_f16_e32 exec_hi, v2 ; encoding: [0x7f,0x04,0x02,0x7d] 0x7c,0x04,0x02,0x7d -# GFX12: v_cmpx_lt_f16_e32 null, v2 ; encoding: [0x7c,0x04,0x02,0x7d] +# GFX12-REAL16: v_cmpx_lt_f16_e32 null, v2.l ; encoding: [0x7c,0x04,0x02,0x7d] +# GFX12-FAKE16: v_cmpx_lt_f16_e32 null, v2 ; encoding: [0x7c,0x04,0x02,0x7d] 0xc1,0x04,0x02,0x7d -# GFX12: v_cmpx_lt_f16_e32 -1, v2 ; encoding: [0xc1,0x04,0x02,0x7d] +# GFX12-REAL16: v_cmpx_lt_f16_e32 -1, v2.l ; encoding: [0xc1,0x04,0x02,0x7d] +# GFX12-FAKE16: v_cmpx_lt_f16_e32 -1, v2 ; encoding: [0xc1,0x04,0x02,0x7d] 0xf0,0x04,0x02,0x7d -# GFX12: v_cmpx_lt_f16_e32 0.5, v2 ; encoding: [0xf0,0x04,0x02,0x7d] +# GFX12-REAL16: v_cmpx_lt_f16_e32 0.5, v2.l ; encoding: [0xf0,0x04,0x02,0x7d] +# GFX12-FAKE16: v_cmpx_lt_f16_e32 
0.5, v2 ; encoding: [0xf0,0x04,0x02,0x7d] 0xfd,0x04,0x02,0x7d -# GFX12: v_cmpx_lt_f16_e32 src_scc, v2 ; encoding: [0xfd,0x04,0x02,0x7d] +# GFX12-REAL16: v_cmpx_lt_f16_e32 src_scc, v2.l ; encoding: [0xfd,0x04,0x02,0x7d] +# GFX12-FAKE16: v_cmpx_lt_f16_e32 src_scc, v2 ; encoding: [0xfd,0x04,0x02,0x7d] 0xff,0xfe,0x02,0x7d,0x0b,0xfe,0x00,0x00 -# GFX12: v_cmpx_lt_f16_e32 0xfe0b, v127 ; encoding: [0xff,0xfe,0x02,0x7d,0x0b,0xfe,0x00,0x00] +# GFX12-REAL16: v_cmpx_lt_f16_e32 0xfe0b, v127.l ; encoding: [0xff,0xfe,0x02,0x7d,0x0b,0xfe,0x00,0x00] +# GFX12-FAKE16: v_cmpx_lt_f16_e32 0xfe0b, v127 ; encoding: [0xff,0xfe,0x02,0x7d,0x0b,0xfe,0x00,0x00] + +0x81,0x05,0x02,0x7d +# GFX12-REAL16: v_cmpx_lt_f16_e32 v1.h, v2.l ; encoding: [0x81,0x05,0x02,0x7d] +# GFX12-FAKE16: v_cmpx_lt_f16_e32 v129/*Invalid register, operand has 'VS_32_Lo128' register class*/, v2 ; encoding: [0x81,0x05,0x02,0x7d] + +0xff,0x05,0x02,0x7d +# GFX12-REAL16: v_cmpx_lt_f16_e32 v127.h, v2.l ; encoding: [0xff,0x05,0x02,0x7d] +# GFX12-FAKE16: v_cmpx_lt_f16_e32 v255/*Invalid register, operand has 'VS_32_Lo128' register class*/, v2 ; encoding: [0xff,0x05,0x02,0x7d] + +0xfd,0x04,0x03,0x7d +# GFX12-REAL16: v_cmpx_lt_f16_e32 src_scc, v2.h ; encoding: [0xfd,0x04,0x03,0x7d] +# GFX12-FAKE16: v_cmpx_lt_f16_e32 src_scc, v130/*Invalid register, operand has 'VGPR_32_Lo128' register class*/ ; encoding: [0xfd,0x04,0x03,0x7d] + +0xff,0xfe,0x03,0x7d,0x0b,0xfe,0x00,0x00 +# GFX12-REAL16: v_cmpx_lt_f16_e32 0xfe0b, v127.h ; encoding: [0xff,0xfe,0x03,0x7d,0x0b,0xfe,0x00,0x00] +# GFX12-FAKE16: v_cmpx_lt_f16_e32 0xfe0b, v255/*Invalid register, operand has 'VGPR_32_Lo128' register class*/ ; encoding: [0xff,0xfe,0x03,0x7d,0x0b,0xfe,0x00,0x00] 0x01,0x05,0x22,0x7d # GFX12: v_cmpx_lt_f32_e32 v1, v2 ; encoding: [0x01,0x05,0x22,0x7d] diff --git a/llvm/test/MC/Disassembler/AMDGPU/gfx12_dasm_vopcx_dpp16.txt b/llvm/test/MC/Disassembler/AMDGPU/gfx12_dasm_vopcx_dpp16.txt index e65d451116d29..8ecef5536ad79 100644 --- 
a/llvm/test/MC/Disassembler/AMDGPU/gfx12_dasm_vopcx_dpp16.txt +++ b/llvm/test/MC/Disassembler/AMDGPU/gfx12_dasm_vopcx_dpp16.txt @@ -1,8 +1,8 @@ # NOTE: Assertions have been autogenerated by utils/update_mc_test_checks.py UTC_ARGS: --version 5 -# RUN: llvm-mc -triple=amdgcn -mcpu=gfx1200 -mattr=+wavefrontsize32,+real-true16 -disassemble -show-encoding < %s | FileCheck %s --check-prefix=GFX12 -# RUN: llvm-mc -triple=amdgcn -mcpu=gfx1200 -mattr=+wavefrontsize64,+real-true16 -disassemble -show-encoding < %s | FileCheck %s --check-prefix=GFX12 -# RUN: llvm-mc -triple=amdgcn -mcpu=gfx1200 -mattr=+wavefrontsize32,-real-true16 -disassemble -show-encoding < %s | FileCheck %s --check-prefix=GFX12 -# RUN: llvm-mc -triple=amdgcn -mcpu=gfx1200 -mattr=+wavefrontsize64,-real-true16 -disassemble -show-encoding < %s | FileCheck %s --check-prefix=GFX12 +# RUN: llvm-mc -triple=amdgcn -mcpu=gfx1200 -mattr=+wavefrontsize32,+real-true16 -disassemble -show-encoding < %s | FileCheck %s --check-prefixes=GFX12,GFX12-REAL16 +# RUN: llvm-mc -triple=amdgcn -mcpu=gfx1200 -mattr=+wavefrontsize64,+real-true16 -disassemble -show-encoding < %s | FileCheck %s --check-prefixes=GFX12,GFX12-REAL16 +# RUN: llvm-mc -triple=amdgcn -mcpu=gfx1200 -mattr=+wavefrontsize32,-real-true16 -disassemble -show-encoding < %s | FileCheck %s --check-prefixes=GFX12,GFX12-FAKE16 +# RUN: llvm-mc -triple=amdgcn -mcpu=gfx1200 -mattr=+wavefrontsize64,-real-true16 -disassemble -show-encoding < %s | FileCheck %s --check-prefixes=GFX12,GFX12-FAKE16 0xfa,0x04,0xfa,0x7d,0x01,0x1b,0x00,0xff # GFX12: v_cmpx_class_f16 v1, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0xfa,0x7d,0x01,0x1b,0x00,0xff] @@ -1181,46 +1181,68 @@ # GFX12: v_cmpx_lg_f32 -|v255|, -|v255| row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xfa,0xfe,0x2b,0x7d,0xff,0x6f,0xfd,0x30] 0xfa,0x04,0x02,0x7d,0x01,0x1b,0x00,0xff -# GFX12: v_cmpx_lt_f16 v1, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: 
[0xfa,0x04,0x02,0x7d,0x01,0x1b,0x00,0xff] +# GFX12-REAL16: v_cmpx_lt_f16 v1.l, v2.l quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x02,0x7d,0x01,0x1b,0x00,0xff] +# GFX12-FAKE16: v_cmpx_lt_f16 v1, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x02,0x7d,0x01,0x1b,0x00,0xff] 0xfa,0x04,0x02,0x7d,0x01,0xe4,0x00,0xff -# GFX12: v_cmpx_lt_f16 v1, v2 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x02,0x7d,0x01,0xe4,0x00,0xff] +# GFX12-REAL16: v_cmpx_lt_f16 v1.l, v2.l quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x02,0x7d,0x01,0xe4,0x00,0xff] +# GFX12-FAKE16: v_cmpx_lt_f16 v1, v2 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x02,0x7d,0x01,0xe4,0x00,0xff] 0xfa,0x04,0x02,0x7d,0x01,0x40,0x01,0xff -# GFX12: v_cmpx_lt_f16 v1, v2 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x02,0x7d,0x01,0x40,0x01,0xff] +# GFX12-REAL16: v_cmpx_lt_f16 v1.l, v2.l row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x02,0x7d,0x01,0x40,0x01,0xff] +# GFX12-FAKE16: v_cmpx_lt_f16 v1, v2 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x02,0x7d,0x01,0x40,0x01,0xff] 0xfa,0x04,0x02,0x7d,0x01,0x41,0x01,0xff -# GFX12: v_cmpx_lt_f16 v1, v2 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x02,0x7d,0x01,0x41,0x01,0xff] +# GFX12-REAL16: v_cmpx_lt_f16 v1.l, v2.l row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x02,0x7d,0x01,0x41,0x01,0xff] +# GFX12-FAKE16: v_cmpx_lt_f16 v1, v2 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x02,0x7d,0x01,0x41,0x01,0xff] 0xfa,0x04,0x02,0x7d,0x01,0x01,0x01,0xff -# GFX12: v_cmpx_lt_f16 v1, v2 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x02,0x7d,0x01,0x01,0x01,0xff] +# GFX12-REAL16: v_cmpx_lt_f16 v1.l, v2.l row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x02,0x7d,0x01,0x01,0x01,0xff] +# GFX12-FAKE16: v_cmpx_lt_f16 v1, v2 row_shl:1 
row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x02,0x7d,0x01,0x01,0x01,0xff] 0xfa,0x04,0x02,0x7d,0x01,0x0f,0x01,0xff -# GFX12: v_cmpx_lt_f16 v1, v2 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x02,0x7d,0x01,0x0f,0x01,0xff] +# GFX12-REAL16: v_cmpx_lt_f16 v1.l, v2.l row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x02,0x7d,0x01,0x0f,0x01,0xff] +# GFX12-FAKE16: v_cmpx_lt_f16 v1, v2 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x02,0x7d,0x01,0x0f,0x01,0xff] 0xfa,0x04,0x02,0x7d,0x01,0x11,0x01,0xff -# GFX12: v_cmpx_lt_f16 v1, v2 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x02,0x7d,0x01,0x11,0x01,0xff] +# GFX12-REAL16: v_cmpx_lt_f16 v1.l, v2.l row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x02,0x7d,0x01,0x11,0x01,0xff] +# GFX12-FAKE16: v_cmpx_lt_f16 v1, v2 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x02,0x7d,0x01,0x11,0x01,0xff] 0xfa,0x04,0x02,0x7d,0x01,0x1f,0x01,0xff -# GFX12: v_cmpx_lt_f16 v1, v2 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x02,0x7d,0x01,0x1f,0x01,0xff] +# GFX12-REAL16: v_cmpx_lt_f16 v1.l, v2.l row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x02,0x7d,0x01,0x1f,0x01,0xff] +# GFX12-FAKE16: v_cmpx_lt_f16 v1, v2 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x02,0x7d,0x01,0x1f,0x01,0xff] 0xfa,0x04,0x02,0x7d,0x01,0x21,0x01,0xff -# GFX12: v_cmpx_lt_f16 v1, v2 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x02,0x7d,0x01,0x21,0x01,0xff] +# GFX12-REAL16: v_cmpx_lt_f16 v1.l, v2.l row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x02,0x7d,0x01,0x21,0x01,0xff] +# GFX12-FAKE16: v_cmpx_lt_f16 v1, v2 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x02,0x7d,0x01,0x21,0x01,0xff] 0xfa,0x04,0x02,0x7d,0x01,0x2f,0x01,0xff -# GFX12: v_cmpx_lt_f16 v1, v2 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x02,0x7d,0x01,0x2f,0x01,0xff] +# GFX12-REAL16: v_cmpx_lt_f16 v1.l, v2.l row_ror:15 
row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x02,0x7d,0x01,0x2f,0x01,0xff] +# GFX12-FAKE16: v_cmpx_lt_f16 v1, v2 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x02,0x7d,0x01,0x2f,0x01,0xff] 0xfa,0x04,0x02,0x7d,0x01,0x50,0x01,0xff -# GFX12: v_cmpx_lt_f16 v1, v2 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x02,0x7d,0x01,0x50,0x01,0xff] +# GFX12-REAL16: v_cmpx_lt_f16 v1.l, v2.l row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x02,0x7d,0x01,0x50,0x01,0xff] +# GFX12-FAKE16: v_cmpx_lt_f16 v1, v2 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x02,0x7d,0x01,0x50,0x01,0xff] 0xfa,0x04,0x02,0x7d,0x01,0x5f,0x01,0x01 -# GFX12: v_cmpx_lt_f16 v1, v2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0xfa,0x04,0x02,0x7d,0x01,0x5f,0x01,0x01] +# GFX12-REAL16: v_cmpx_lt_f16 v1.l, v2.l row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0xfa,0x04,0x02,0x7d,0x01,0x5f,0x01,0x01] +# GFX12-FAKE16: v_cmpx_lt_f16 v1, v2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0xfa,0x04,0x02,0x7d,0x01,0x5f,0x01,0x01] 0xfa,0x04,0x02,0x7d,0x01,0x60,0x01,0x13 -# GFX12: v_cmpx_lt_f16 v1, v2 row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0xfa,0x04,0x02,0x7d,0x01,0x60,0x01,0x13] +# GFX12-REAL16: v_cmpx_lt_f16 v1.l, v2.l row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0xfa,0x04,0x02,0x7d,0x01,0x60,0x01,0x13] +# GFX12-FAKE16: v_cmpx_lt_f16 v1, v2 row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0xfa,0x04,0x02,0x7d,0x01,0x60,0x01,0x13] 0xfa,0xfe,0x02,0x7d,0x7f,0x6f,0xfd,0x30 -# GFX12: v_cmpx_lt_f16 -|v127|, -|v127| row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xfa,0xfe,0x02,0x7d,0x7f,0x6f,0xfd,0x30] +# GFX12-REAL16: v_cmpx_lt_f16 -|v127.l|, -|v127.l| row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xfa,0xfe,0x02,0x7d,0x7f,0x6f,0xfd,0x30] +# GFX12-FAKE16: v_cmpx_lt_f16 -|v127|, -|v127| row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: 
[0xfa,0xfe,0x02,0x7d,0x7f,0x6f,0xfd,0x30] + +0xfa,0x04,0x03,0x7d,0x81,0x60,0x01,0x13 +# GFX12-REAL16: v_cmpx_lt_f16 v1.h, v2.h row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0xfa,0x04,0x03,0x7d,0x81,0x60,0x01,0x13] +# GFX12-FAKE16: v_cmpx_lt_f16 v129/*Invalid register, operand has 'VGPR_32_Lo128' register class*/, v130/*Invalid register, operand has 'VGPR_32_Lo128' register class*/ row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0xfa,0x04,0x03,0x7d,0x81,0x60,0x01,0x13] + +0xfa,0xfe,0x03,0x7d,0xff,0x6f,0xfd,0x30 +# GFX12-REAL16: v_cmpx_lt_f16 -|v127.h|, -|v127.h| row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xfa,0xfe,0x03,0x7d,0xff,0x6f,0xfd,0x30] +# GFX12-FAKE16: v_cmpx_lt_f16 -|v255/*Invalid register, operand has 'VGPR_32_Lo128' register class*/|, -|v255/*Invalid register, operand has 'VGPR_32_Lo128' register class*/| row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xfa,0xfe,0x03,0x7d,0xff,0x6f,0xfd,0x30] 0xfa,0x04,0x22,0x7d,0x01,0x1b,0x00,0xff # GFX12: v_cmpx_lt_f32 v1, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x22,0x7d,0x01,0x1b,0x00,0xff] diff --git a/llvm/test/MC/Disassembler/AMDGPU/gfx12_dasm_vopcx_dpp8.txt b/llvm/test/MC/Disassembler/AMDGPU/gfx12_dasm_vopcx_dpp8.txt index 4449cbcfb3608..147084df5384f 100644 --- a/llvm/test/MC/Disassembler/AMDGPU/gfx12_dasm_vopcx_dpp8.txt +++ b/llvm/test/MC/Disassembler/AMDGPU/gfx12_dasm_vopcx_dpp8.txt @@ -1,8 +1,8 @@ # NOTE: Assertions have been autogenerated by utils/update_mc_test_checks.py UTC_ARGS: --version 5 -# RUN: llvm-mc -triple=amdgcn -mcpu=gfx1200 -mattr=+wavefrontsize32,+real-true16 -disassemble -show-encoding < %s | FileCheck %s --check-prefix=GFX12 -# RUN: llvm-mc -triple=amdgcn -mcpu=gfx1200 -mattr=+wavefrontsize64,+real-true16 -disassemble -show-encoding < %s | FileCheck %s --check-prefix=GFX12 -# RUN: llvm-mc -triple=amdgcn -mcpu=gfx1200 -mattr=+wavefrontsize32,-real-true16 -disassemble -show-encoding < %s | FileCheck 
%s --check-prefix=GFX12 -# RUN: llvm-mc -triple=amdgcn -mcpu=gfx1200 -mattr=+wavefrontsize64,-real-true16 -disassemble -show-encoding < %s | FileCheck %s --check-prefix=GFX12 +# RUN: llvm-mc -triple=amdgcn -mcpu=gfx1200 -mattr=+wavefrontsize32,+real-true16 -disassemble -show-encoding < %s | FileCheck %s --check-prefixes=GFX12,GFX12-REAL16 +# RUN: llvm-mc -triple=amdgcn -mcpu=gfx1200 -mattr=+wavefrontsize64,+real-true16 -disassemble -show-encoding < %s | FileCheck %s --check-prefixes=GFX12,GFX12-REAL16 +# RUN: llvm-mc -triple=amdgcn -mcpu=gfx1200 -mattr=+wavefrontsize32,-real-true16 -disassemble -show-encoding < %s | FileCheck %s --check-prefixes=GFX12,GFX12-FAKE16 +# RUN: llvm-mc -triple=amdgcn -mcpu=gfx1200 -mattr=+wavefrontsize64,-real-true16 -disassemble -show-encoding < %s | FileCheck %s --check-prefixes=GFX12,GFX12-FAKE16 0xe9,0x04,0xfa,0x7d,0x01,0x77,0x39,0x05 # GFX12: v_cmpx_class_f16 v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0x04,0xfa,0x7d,0x01,0x77,0x39,0x05] @@ -173,10 +173,20 @@ # GFX12: v_cmpx_lg_f32 v255, v255 dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xea,0xfe,0x2b,0x7d,0xff,0x00,0x00,0x00] 0xe9,0x04,0x02,0x7d,0x01,0x77,0x39,0x05 -# GFX12: v_cmpx_lt_f16 v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0x04,0x02,0x7d,0x01,0x77,0x39,0x05] +# GFX12-REAL16: v_cmpx_lt_f16 v1.l, v2.l dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0x04,0x02,0x7d,0x01,0x77,0x39,0x05] +# GFX12-FAKE16: v_cmpx_lt_f16 v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0x04,0x02,0x7d,0x01,0x77,0x39,0x05] 0xea,0xfe,0x02,0x7d,0x7f,0x00,0x00,0x00 -# GFX12: v_cmpx_lt_f16 v127, v127 dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xea,0xfe,0x02,0x7d,0x7f,0x00,0x00,0x00] +# GFX12-REAL16: v_cmpx_lt_f16 v127.l, v127.l dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xea,0xfe,0x02,0x7d,0x7f,0x00,0x00,0x00] +# GFX12-FAKE16: v_cmpx_lt_f16 v127, v127 dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xea,0xfe,0x02,0x7d,0x7f,0x00,0x00,0x00] + +0xe9,0x04,0x03,0x7d,0x81,0x77,0x39,0x05 +# GFX12-REAL16: v_cmpx_lt_f16 v1.h, 
v2.h dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0x04,0x03,0x7d,0x81,0x77,0x39,0x05] +# GFX12-FAKE16: v_cmpx_lt_f16 v129/*Invalid register, operand has 'VGPR_32_Lo128' register class*/, v130/*Invalid register, operand has 'VGPR_32_Lo128' register class*/ dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0x04,0x03,0x7d,0x81,0x77,0x39,0x05] + +0xea,0xfe,0x03,0x7d,0xff,0x00,0x00,0x00 +# GFX12-REAL16: v_cmpx_lt_f16 v127.h, v127.h dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xea,0xfe,0x03,0x7d,0xff,0x00,0x00,0x00] +# GFX12-FAKE16: v_cmpx_lt_f16 v255/*Invalid register, operand has 'VGPR_32_Lo128' register class*/, v255/*Invalid register, operand has 'VGPR_32_Lo128' register class*/ dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xea,0xfe,0x03,0x7d,0xff,0x00,0x00,0x00] 0xe9,0x04,0x22,0x7d,0x01,0x77,0x39,0x05 # GFX12: v_cmpx_lt_f32 v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0x04,0x22,0x7d,0x01,0x77,0x39,0x05] diff --git a/llvm/test/MC/Disassembler/X86/AMX/x86-64-amx-movrs.txt b/llvm/test/MC/Disassembler/X86/AMX/x86-64-amx-movrs.txt index 6df44c87d2332..57e3153da401b 100755 --- a/llvm/test/MC/Disassembler/X86/AMX/x86-64-amx-movrs.txt +++ b/llvm/test/MC/Disassembler/X86/AMX/x86-64-amx-movrs.txt @@ -96,3 +96,99 @@ # ATT: tileloaddrst1 -32(,%rbp,2), %tmm3 # INTEL: tileloaddrst1 tmm3, [2*rbp - 32] 0xc4,0xe2,0x79,0x4a,0x1c,0x6d,0xe0,0xff,0xff,0xff + +# ATT: t2rpntlvwz0rs 268435456(%r16,%r14,8), %tmm6 +# INTEL: t2rpntlvwz0rs tmm6, [r16 + 8*r14 + 268435456] +0x62,0xbd,0x7c,0x08,0xf8,0xb4,0xf0,0x00,0x00,0x00,0x10 + +# ATT: t2rpntlvwz0rs 291(%r8,%r17,4), %tmm2 +# INTEL: t2rpntlvwz0rs tmm2, [r8 + 4*r17 + 291] +0x62,0xd5,0x78,0x08,0xf8,0x94,0x88,0x23,0x01,0x00,0x00 + +# ATT: t2rpntlvwz0rs 64(%r18), %tmm6 +# INTEL: t2rpntlvwz0rs tmm6, [r18 + 64] +0x62,0xfd,0x7c,0x08,0xf8,0x74,0x22,0x40 + +# ATT: t2rpntlvwz0rs -32(,%rbp,2), %tmm2 +# INTEL: t2rpntlvwz0rs tmm2, [2*rbp - 32] +0x62,0xf5,0x7c,0x08,0xf8,0x14,0x6d,0xe0,0xff,0xff,0xff + +# ATT: t2rpntlvwz0rst1 268435456(%r16,%r14,8), %tmm6 +# INTEL: 
t2rpntlvwz0rst1 tmm6, [r16 + 8*r14 + 268435456] +0x62,0xbd,0x7c,0x08,0xf9,0xb4,0xf0,0x00,0x00,0x00,0x10 + +# ATT: t2rpntlvwz0rst1 291(%r8,%r17,4), %tmm2 +# INTEL: t2rpntlvwz0rst1 tmm2, [r8 + 4*r17 + 291] +0x62,0xd5,0x78,0x08,0xf9,0x94,0x88,0x23,0x01,0x00,0x00 + +# ATT: t2rpntlvwz0rst1 64(%r18), %tmm6 +# INTEL: t2rpntlvwz0rst1 tmm6, [r18 + 64] +0x62,0xfd,0x7c,0x08,0xf9,0x74,0x22,0x40 + +# ATT: t2rpntlvwz0rst1 -32(,%rbp,2), %tmm2 +# INTEL: t2rpntlvwz0rst1 tmm2, [2*rbp - 32] +0x62,0xf5,0x7c,0x08,0xf9,0x14,0x6d,0xe0,0xff,0xff,0xff + +# ATT: t2rpntlvwz1rs 268435456(%r16,%r14,8), %tmm6 +# INTEL: t2rpntlvwz1rs tmm6, [r16 + 8*r14 + 268435456] +0x62,0xbd,0x7d,0x08,0xf8,0xb4,0xf0,0x00,0x00,0x00,0x10 + +# ATT: t2rpntlvwz1rs 291(%r8,%r17,4), %tmm2 +# INTEL: t2rpntlvwz1rs tmm2, [r8 + 4*r17 + 291] +0x62,0xd5,0x79,0x08,0xf8,0x94,0x88,0x23,0x01,0x00,0x00 + +# ATT: t2rpntlvwz1rs 64(%r18), %tmm6 +# INTEL: t2rpntlvwz1rs tmm6, [r18 + 64] +0x62,0xfd,0x7d,0x08,0xf8,0x74,0x22,0x40 + +# ATT: t2rpntlvwz1rs -32(,%rbp,2), %tmm2 +# INTEL: t2rpntlvwz1rs tmm2, [2*rbp - 32] +0x62,0xf5,0x7d,0x08,0xf8,0x14,0x6d,0xe0,0xff,0xff,0xff + +# ATT: t2rpntlvwz1rst1 268435456(%r16,%r14,8), %tmm6 +# INTEL: t2rpntlvwz1rst1 tmm6, [r16 + 8*r14 + 268435456] +0x62,0xbd,0x7d,0x08,0xf9,0xb4,0xf0,0x00,0x00,0x00,0x10 + +# ATT: t2rpntlvwz1rst1 291(%r8,%r17,4), %tmm2 +# INTEL: t2rpntlvwz1rst1 tmm2, [r8 + 4*r17 + 291] +0x62,0xd5,0x79,0x08,0xf9,0x94,0x88,0x23,0x01,0x00,0x00 + +# ATT: t2rpntlvwz1rst1 64(%r18), %tmm6 +# INTEL: t2rpntlvwz1rst1 tmm6, [r18 + 64] +0x62,0xfd,0x7d,0x08,0xf9,0x74,0x22,0x40 + +# ATT: t2rpntlvwz1rst1 -32(,%rbp,2), %tmm2 +# INTEL: t2rpntlvwz1rst1 tmm2, [2*rbp - 32] +0x62,0xf5,0x7d,0x08,0xf9,0x14,0x6d,0xe0,0xff,0xff,0xff + +# ATT: tileloaddrs 268435456(%r16,%r14,8), %tmm6 +# INTEL: tileloaddrs tmm6, [r16 + 8*r14 + 268435456] +0x62,0xba,0x7f,0x08,0x4a,0xb4,0xf0,0x00,0x00,0x00,0x10 + +# ATT: tileloaddrs 291(%r8,%r17,4), %tmm3 +# INTEL: tileloaddrs tmm3, [r8 + 4*r17 + 291] 
+0x62,0xd2,0x7b,0x08,0x4a,0x9c,0x88,0x23,0x01,0x00,0x00 + +# ATT: tileloaddrs 64(%r18), %tmm6 +# INTEL: tileloaddrs tmm6, [r18 + 64] +0x62,0xfa,0x7f,0x08,0x4a,0x74,0x22,0x40 + +# ATT: tileloaddrs -32(,%rbp,2), %tmm3 +# INTEL: tileloaddrs tmm3, [2*rbp - 32] +0x62,0xf2,0x7f,0x08,0x4a,0x1c,0x6d,0xe0,0xff,0xff,0xff + +# ATT: tileloaddrst1 268435456(%r16,%r14,8), %tmm6 +# INTEL: tileloaddrst1 tmm6, [r16 + 8*r14 + 268435456] +0x62,0xba,0x7d,0x08,0x4a,0xb4,0xf0,0x00,0x00,0x00,0x10 + +# ATT: tileloaddrst1 291(%r8,%r17,4), %tmm3 +# INTEL: tileloaddrst1 tmm3, [r8 + 4*r17 + 291] +0x62,0xd2,0x79,0x08,0x4a,0x9c,0x88,0x23,0x01,0x00,0x00 + +# ATT: tileloaddrst1 64(%r18), %tmm6 +# INTEL: tileloaddrst1 tmm6, [r18 + 64] +0x62,0xfa,0x7d,0x08,0x4a,0x74,0x22,0x40 + +# ATT: tileloaddrst1 -32(,%rbp,2), %tmm3 +# INTEL: tileloaddrst1 tmm3, [2*rbp - 32] +0x62,0xf2,0x7d,0x08,0x4a,0x1c,0x6d,0xe0,0xff,0xff,0xff diff --git a/llvm/test/MC/Disassembler/X86/amx-avx512.txt b/llvm/test/MC/Disassembler/X86/amx-avx512.txt index 0a162af1b4bc0..17858f333e632 100644 --- a/llvm/test/MC/Disassembler/X86/amx-avx512.txt +++ b/llvm/test/MC/Disassembler/X86/amx-avx512.txt @@ -17,36 +17,36 @@ # INTEL: tcvtrowd2ps zmm22, tmm2, 123 0x62,0xe3,0x7e,0x48,0x07,0xf2,0x7b -# ATT: tcvtrowps2pbf16h %ecx, %tmm5, %zmm22 -# INTEL: tcvtrowps2pbf16h zmm22, tmm5, ecx +# ATT: tcvtrowps2bf16h %ecx, %tmm5, %zmm22 +# INTEL: tcvtrowps2bf16h zmm22, tmm5, ecx 0x62,0xe2,0x77,0x48,0x6d,0xf5 -# ATT: tcvtrowps2pbf16h %ecx, %tmm2, %zmm22 -# INTEL: tcvtrowps2pbf16h zmm22, tmm2, ecx +# ATT: tcvtrowps2bf16h %ecx, %tmm2, %zmm22 +# INTEL: tcvtrowps2bf16h zmm22, tmm2, ecx 0x62,0xe2,0x77,0x48,0x6d,0xf2 -# ATT: tcvtrowps2pbf16h $123, %tmm5, %zmm22 -# INTEL: tcvtrowps2pbf16h zmm22, tmm5, 123 +# ATT: tcvtrowps2bf16h $123, %tmm5, %zmm22 +# INTEL: tcvtrowps2bf16h zmm22, tmm5, 123 0x62,0xe3,0x7f,0x48,0x07,0xf5,0x7b -# ATT: tcvtrowps2pbf16h $123, %tmm2, %zmm22 -# INTEL: tcvtrowps2pbf16h zmm22, tmm2, 123 +# ATT: tcvtrowps2bf16h $123, %tmm2, %zmm22 +# 
INTEL: tcvtrowps2bf16h zmm22, tmm2, 123 0x62,0xe3,0x7f,0x48,0x07,0xf2,0x7b -# ATT: tcvtrowps2pbf16l %ecx, %tmm5, %zmm22 -# INTEL: tcvtrowps2pbf16l zmm22, tmm5, ecx +# ATT: tcvtrowps2bf16l %ecx, %tmm5, %zmm22 +# INTEL: tcvtrowps2bf16l zmm22, tmm5, ecx 0x62,0xe2,0x76,0x48,0x6d,0xf5 -# ATT: tcvtrowps2pbf16l %ecx, %tmm2, %zmm22 -# INTEL: tcvtrowps2pbf16l zmm22, tmm2, ecx +# ATT: tcvtrowps2bf16l %ecx, %tmm2, %zmm22 +# INTEL: tcvtrowps2bf16l zmm22, tmm2, ecx 0x62,0xe2,0x76,0x48,0x6d,0xf2 -# ATT: tcvtrowps2pbf16l $123, %tmm5, %zmm22 -# INTEL: tcvtrowps2pbf16l zmm22, tmm5, 123 +# ATT: tcvtrowps2bf16l $123, %tmm5, %zmm22 +# INTEL: tcvtrowps2bf16l zmm22, tmm5, 123 0x62,0xe3,0x7e,0x48,0x77,0xf5,0x7b -# ATT: tcvtrowps2pbf16l $123, %tmm2, %zmm22 -# INTEL: tcvtrowps2pbf16l zmm22, tmm2, 123 +# ATT: tcvtrowps2bf16l $123, %tmm2, %zmm22 +# INTEL: tcvtrowps2bf16l zmm22, tmm2, 123 0x62,0xe3,0x7e,0x48,0x77,0xf2,0x7b # ATT: tcvtrowps2phh %ecx, %tmm5, %zmm22 diff --git a/llvm/test/MC/Disassembler/X86/amx-transpose-att.txt b/llvm/test/MC/Disassembler/X86/amx-transpose-att.txt index 8c6f1be80ba2d..d768630ac1475 100644 --- a/llvm/test/MC/Disassembler/X86/amx-transpose-att.txt +++ b/llvm/test/MC/Disassembler/X86/amx-transpose-att.txt @@ -49,6 +49,54 @@ # INTEL: t2rpntlvwz1t1 tmm2, [2*rbp - 32] 0xc4,0xe2,0x79,0x6f,0x14,0x6d,0xe0,0xff,0xff,0xff +# ATT: t2rpntlvwz0 268435456(%r16,%r14,8), %tmm4 +# INTEL: t2rpntlvwz0 tmm4, [r16 + 8*r14 + 268435456] +0x62,0xba,0x7c,0x08,0x6e,0xa4,0xf0,0x00,0x00,0x00,0x10 + +# ATT: t2rpntlvwz0 291(%r8,%r17,4), %tmm2 +# INTEL: t2rpntlvwz0 tmm2, [r8 + 4*r17 + 291] +0x62,0xd2,0x78,0x08,0x6e,0x94,0x88,0x23,0x01,0x00,0x00 + +# ATT: t2rpntlvwz0 -32(,%rbp,2), %tmm2 +# INTEL: t2rpntlvwz0 tmm2, [2*rbp - 32] +0x62,0xf2,0x7c,0x08,0x6e,0x14,0x6d,0xe0,0xff,0xff,0xff + +# ATT: t2rpntlvwz0t1 268435456(%r16,%r14,8), %tmm4 +# INTEL: t2rpntlvwz0t1 tmm4, [r16 + 8*r14 + 268435456] +0x62,0xba,0x7c,0x08,0x6f,0xa4,0xf0,0x00,0x00,0x00,0x10 + +# ATT: t2rpntlvwz0t1 291(%r8,%r17,4), %tmm2 
+# INTEL: t2rpntlvwz0t1 tmm2, [r8 + 4*r17 + 291] +0x62,0xd2,0x78,0x08,0x6f,0x94,0x88,0x23,0x01,0x00,0x00 + +# ATT: t2rpntlvwz0t1 -32(,%rbp,2), %tmm2 +# INTEL: t2rpntlvwz0t1 tmm2, [2*rbp - 32] +0x62,0xf2,0x7c,0x08,0x6f,0x14,0x6d,0xe0,0xff,0xff,0xff + +# ATT: t2rpntlvwz1 268435456(%r16,%r14,8), %tmm4 +# INTEL: t2rpntlvwz1 tmm4, [r16 + 8*r14 + 268435456] +0x62,0xba,0x7d,0x08,0x6e,0xa4,0xf0,0x00,0x00,0x00,0x10 + +# ATT: t2rpntlvwz1 291(%r8,%r17,4), %tmm2 +# INTEL: t2rpntlvwz1 tmm2, [r8 + 4*r17 + 291] +0x62,0xd2,0x79,0x08,0x6e,0x94,0x88,0x23,0x01,0x00,0x00 + +# ATT: t2rpntlvwz1 -32(,%rbp,2), %tmm2 +# INTEL: t2rpntlvwz1 tmm2, [2*rbp - 32] +0x62,0xf2,0x7d,0x08,0x6e,0x14,0x6d,0xe0,0xff,0xff,0xff + +# ATT: t2rpntlvwz1t1 268435456(%r16,%r14,8), %tmm4 +# INTEL: t2rpntlvwz1t1 tmm4, [r16 + 8*r14 + 268435456] +0x62,0xba,0x7d,0x08,0x6f,0xa4,0xf0,0x00,0x00,0x00,0x10 + +# ATT: t2rpntlvwz1t1 291(%r8,%r17,4), %tmm2 +# INTEL: t2rpntlvwz1t1 tmm2, [r8 + 4*r17 + 291] +0x62,0xd2,0x79,0x08,0x6f,0x94,0x88,0x23,0x01,0x00,0x00 + +# ATT: t2rpntlvwz1t1 -32(,%rbp,2), %tmm2 +# INTEL: t2rpntlvwz1t1 tmm2, [2*rbp - 32] +0x62,0xf2,0x7d,0x08,0x6f,0x14,0x6d,0xe0,0xff,0xff,0xff + # ATT: ttransposed %tmm1, %tmm2 # INTEL: ttransposed tmm2, tmm1 0xc4,0xe2,0x7a,0x5f,0xd1 diff --git a/llvm/test/MC/Disassembler/X86/movrs.txt b/llvm/test/MC/Disassembler/X86/movrs.txt index fa91b542d3f73..caac8bc8b7b30 100644 --- a/llvm/test/MC/Disassembler/X86/movrs.txt +++ b/llvm/test/MC/Disassembler/X86/movrs.txt @@ -95,4 +95,100 @@ # ATT: movrsq -128(%rdx), %rbx # INTEL: movrs rbx, qword ptr [rdx - 128] -0x48,0x0f,0x38,0x8b,0x5a,0x80 \ No newline at end of file +0x48,0x0f,0x38,0x8b,0x5a,0x80 + +# ATT: movrsb 268435456(%rbp,%r14,8), %r16b +# INTEL: movrs r16b, byte ptr [rbp + 8*r14 + 268435456] +0x62,0xa4,0x7c,0x08,0x8a,0x84,0xf5,0x00,0x00,0x00,0x10 + +# ATT: movrsb 291(%r17,%rax,4), %bl +# INTEL: movrs bl, byte ptr [r17 + 4*rax + 291] +0x62,0xfc,0x7c,0x08,0x8a,0x9c,0x81,0x23,0x01,0x00,0x00 + +# ATT: movrsb (%rip), %bl +# 
INTEL: movrs bl, byte ptr [rip] +0x62,0xf4,0x7c,0x08,0x8a,0x1d,0x00,0x00,0x00,0x00 + +# ATT: movrsb -32(,%rbp,2), %r18b +# INTEL: movrs r18b, byte ptr [2*rbp - 32] +0x62,0xe4,0x7c,0x08,0x8a,0x14,0x6d,0xe0,0xff,0xff,0xff + +# ATT: movrsb 127(%r19), %bl +# INTEL: movrs bl, byte ptr [r19 + 127] +0x62,0xfc,0x7c,0x08,0x8a,0x5b,0x7f + +# ATT: movrsb -128(%r20,%riz), %bl +# INTEL: movrs bl, byte ptr [r20 + riz - 128] +0x62,0xfc,0x7c,0x08,0x8a,0x5c,0x24,0x80 + +# ATT: movrsw 268435456(%rbp,%r14,8), %r16w +# INTEL: movrs r16w, word ptr [rbp + 8*r14 + 268435456] +0x62,0xa4,0x7d,0x08,0x8b,0x84,0xf5,0x00,0x00,0x00,0x10 + +# ATT: movrsw 291(%r17,%rax,4), %bx +# INTEL: movrs bx, word ptr [r17 + 4*rax + 291] +0x62,0xfc,0x7d,0x08,0x8b,0x9c,0x81,0x23,0x01,0x00,0x00 + +# ATT: movrsw (%rip), %bx +# INTEL: movrs bx, word ptr [rip] +0x62,0xf4,0x7d,0x08,0x8b,0x1d,0x00,0x00,0x00,0x00 + +# ATT: movrsw -32(,%rbp,2), %r18w +# INTEL: movrs r18w, word ptr [2*rbp - 32] +0x62,0xe4,0x7d,0x08,0x8b,0x14,0x6d,0xe0,0xff,0xff,0xff + +# ATT: movrsw 127(%r19), %bx +# INTEL: movrs bx, word ptr [r19 + 127] +0x62,0xfc,0x7d,0x08,0x8b,0x5b,0x7f + +# ATT: movrsw -128(%r20,%riz), %bx +# INTEL: movrs bx, word ptr [r20 + riz - 128] +0x62,0xfc,0x7d,0x08,0x8b,0x5c,0x24,0x80 + +# ATT: movrsl 268435456(%rbp,%r14,8), %r16d +# INTEL: movrs r16d, dword ptr [rbp + 8*r14 + 268435456] +0x62,0xa4,0x7c,0x08,0x8b,0x84,0xf5,0x00,0x00,0x00,0x10 + +# ATT: movrsl 291(%r17,%rax,4), %ebx +# INTEL: movrs ebx, dword ptr [r17 + 4*rax + 291] +0x62,0xfc,0x7c,0x08,0x8b,0x9c,0x81,0x23,0x01,0x00,0x00 + +# ATT: movrsl (%rip), %ebx +# INTEL: movrs ebx, dword ptr [rip] +0x62,0xf4,0x7c,0x08,0x8b,0x1d,0x00,0x00,0x00,0x00 + +# ATT: movrsl -32(,%rbp,2), %r18d +# INTEL: movrs r18d, dword ptr [2*rbp - 32] +0x62,0xe4,0x7c,0x08,0x8b,0x14,0x6d,0xe0,0xff,0xff,0xff + +# ATT: movrsl 127(%r19), %ebx +# INTEL: movrs ebx, dword ptr [r19 + 127] +0x62,0xfc,0x7c,0x08,0x8b,0x5b,0x7f + +# ATT: movrsl -128(%r20,%riz), %ebx +# INTEL: movrs ebx, dword ptr [r20 + 
riz - 128] +0x62,0xfc,0x7c,0x08,0x8b,0x5c,0x24,0x80 + +# ATT: movrsq 268435456(%rbp,%r14,8), %r16 +# INTEL: movrs r16, qword ptr [rbp + 8*r14 + 268435456] +0x62,0xa4,0xfc,0x08,0x8b,0x84,0xf5,0x00,0x00,0x00,0x10 + +# ATT: movrsq 291(%r17,%rax,4), %rbx +# INTEL: movrs rbx, qword ptr [r17 + 4*rax + 291] +0x62,0xfc,0xfc,0x08,0x8b,0x9c,0x81,0x23,0x01,0x00,0x00 + +# ATT: movrsq (%rip), %rbx +# INTEL: movrs rbx, qword ptr [rip] +0x62,0xf4,0xfc,0x08,0x8b,0x1d,0x00,0x00,0x00,0x00 + +# ATT: movrsq -32(,%rbp,2), %r18 +# INTEL: movrs r18, qword ptr [2*rbp - 32] +0x62,0xe4,0xfc,0x08,0x8b,0x14,0x6d,0xe0,0xff,0xff,0xff + +# ATT: movrsq 127(%r19), %rbx +# INTEL: movrs rbx, qword ptr [r19 + 127] +0x62,0xfc,0xfc,0x08,0x8b,0x5b,0x7f + +# ATT: movrsq -128(%r20,%riz), %rbx +# INTEL: movrs rbx, qword ptr [r20 + riz - 128] +0x62,0xfc,0xfc,0x08,0x8b,0x5c,0x24,0x80 diff --git a/llvm/test/MC/LoongArch/Macros/aliases-la.s b/llvm/test/MC/LoongArch/Macros/aliases-la.s index dd5a4d474e001..1b5b818f4348f 100644 --- a/llvm/test/MC/LoongArch/Macros/aliases-la.s +++ b/llvm/test/MC/LoongArch/Macros/aliases-la.s @@ -3,13 +3,26 @@ # RUN: llvm-mc --triple=loongarch64 %s \ # RUN: | FileCheck %s --check-prefix=NORMAL +# RUN: llvm-mc --filetype=obj --triple=loongarch64 --mattr=-relax %s -o %t +# RUN: llvm-readobj -r %t | FileCheck %s --check-prefix=RELOC +# RUN: llvm-mc --filetype=obj --triple=loongarch64 --mattr=+relax %s -o %t.relax +# RUN: llvm-readobj -r %t.relax | FileCheck %s --check-prefixes=RELOC,RELAX # RUN: llvm-mc --triple=loongarch64 --mattr=+la-global-with-pcrel < %s \ # RUN: | FileCheck %s --check-prefix=GTOPCR +# RUN: llvm-mc --filetype=obj --triple=loongarch64 --mattr=+la-global-with-pcrel \ +# RUN: --mattr=-relax %s -o %t +# RUN: llvm-readobj -r %t | FileCheck %s --check-prefix=GTOPCR-RELOC +# RUN: llvm-mc --filetype=obj --triple=loongarch64 --mattr=+la-global-with-pcrel \ +# RUN: --mattr=+relax %s -o %t.relax +# RUN: llvm-readobj -r %t.relax | FileCheck %s 
--check-prefixes=GTOPCR-RELOC,GTOPCR-RELAX # RUN: llvm-mc --triple=loongarch64 --mattr=+la-global-with-abs < %s \ # RUN: | FileCheck %s --check-prefix=GTOABS # RUN: llvm-mc --triple=loongarch64 --mattr=+la-local-with-abs < %s \ # RUN: | FileCheck %s --check-prefix=LTOABS +# RELOC: Relocations [ +# RELOC-NEXT: Section ({{.*}}) .rela.text { + la $a0, sym # NORMAL: pcalau12i $a0, %got_pc_hi20(sym) # NORMAL-NEXT: ld.d $a0, $a0, %got_pc_lo12(sym) @@ -22,6 +35,16 @@ la $a0, sym # GTOABS-NEXT: lu32i.d $a0, %abs64_lo20(sym) # GTOABS-NEXT: lu52i.d $a0, $a0, %abs64_hi12(sym) +# RELOC-NEXT: R_LARCH_GOT_PC_HI20 sym 0x0 +# RELAX-NEXT: R_LARCH_RELAX - 0x0 +# RELOC-NEXT: R_LARCH_GOT_PC_LO12 sym 0x0 +# RELAX-NEXT: R_LARCH_RELAX - 0x0 + +# GTOPCR-RELOC: R_LARCH_PCALA_HI20 sym 0x0 +# GTOPCR-RELAX: R_LARCH_RELAX - 0x0 +# GTOPCR-RELOC-NEXT: R_LARCH_PCALA_LO12 sym 0x0 +# GTOPCR-RELAX-NEXT: R_LARCH_RELAX - 0x0 + la.global $a0, sym_global # NORMAL: pcalau12i $a0, %got_pc_hi20(sym_global) # NORMAL-NEXT: ld.d $a0, $a0, %got_pc_lo12(sym_global) @@ -34,6 +57,16 @@ la.global $a0, sym_global # GTOABS-NEXT: lu32i.d $a0, %abs64_lo20(sym_global) # GTOABS-NEXT: lu52i.d $a0, $a0, %abs64_hi12(sym_global) +# RELOC-NEXT: R_LARCH_GOT_PC_HI20 sym_global 0x0 +# RELAX-NEXT: R_LARCH_RELAX - 0x0 +# RELOC-NEXT: R_LARCH_GOT_PC_LO12 sym_global 0x0 +# RELAX-NEXT: R_LARCH_RELAX - 0x0 + +# GTOPCR-RELOC-NEXT: R_LARCH_PCALA_HI20 sym_global 0x0 +# GTOPCR-RELAX-NEXT: R_LARCH_RELAX - 0x0 +# GTOPCR-RELOC-NEXT: R_LARCH_PCALA_LO12 sym_global 0x0 +# GTOPCR-RELAX-NEXT: R_LARCH_RELAX - 0x0 + la.global $a0, $a1, sym_global_large # NORMAL: pcalau12i $a0, %got_pc_hi20(sym_global_large) # NORMAL-NEXT: addi.d $a1, $zero, %got_pc_lo12(sym_global_large) @@ -52,6 +85,11 @@ la.global $a0, $a1, sym_global_large # GTOABS-NEXT: lu32i.d $a0, %abs64_lo20(sym_global_large) # GTOABS-NEXT: lu52i.d $a0, $a0, %abs64_hi12(sym_global_large) +# RELOC-NEXT: R_LARCH_GOT_PC_HI20 sym_global_large 0x0 +# RELOC-NEXT: R_LARCH_GOT_PC_LO12 
sym_global_large 0x0 +# RELOC-NEXT: R_LARCH_GOT64_PC_LO20 sym_global_large 0x0 +# RELOC-NEXT: R_LARCH_GOT64_PC_HI12 sym_global_large 0x0 + la.local $a0, sym_local # NORMAL: pcalau12i $a0, %pc_hi20(sym_local) # NORMAL-NEXT: addi.d $a0, $a0, %pc_lo12(sym_local) @@ -61,6 +99,11 @@ la.local $a0, sym_local # LTOABS-NEXT: lu32i.d $a0, %abs64_lo20(sym_local) # LTOABS-NEXT: lu52i.d $a0, $a0, %abs64_hi12(sym_local) +# RELOC-NEXT: R_LARCH_PCALA_HI20 sym_local 0x0 +# RELAX-NEXT: R_LARCH_RELAX - 0x0 +# RELOC-NEXT: R_LARCH_PCALA_LO12 sym_local 0x0 +# RELAX-NEXT: R_LARCH_RELAX - 0x0 + la.local $a0, $a1, sym_local_large # NORMAL: pcalau12i $a0, %pc_hi20(sym_local_large) # NORMAL-NEXT: addi.d $a1, $zero, %pc_lo12(sym_local_large) @@ -72,3 +115,12 @@ la.local $a0, $a1, sym_local_large # LTOABS-NEXT: ori $a0, $a0, %abs_lo12(sym_local_large) # LTOABS-NEXT: lu32i.d $a0, %abs64_lo20(sym_local_large) # LTOABS-NEXT: lu52i.d $a0, $a0, %abs64_hi12(sym_local_large) + +# RELOC-NEXT: R_LARCH_PCALA_HI20 sym_local_large 0x0 +# RELOC-NEXT: R_LARCH_PCALA_LO12 sym_local_large 0x0 +# RELOC-NEXT: R_LARCH_PCALA64_LO20 sym_local_large 0x0 +# RELOC-NEXT: R_LARCH_PCALA64_HI12 sym_local_large 0x0 + + +# RELOC-NEXT: } +# RELOC-NEXT: ] diff --git a/llvm/test/MC/LoongArch/Macros/macros-call.s b/llvm/test/MC/LoongArch/Macros/macros-call.s index a648a39780381..df7715050a0f9 100644 --- a/llvm/test/MC/LoongArch/Macros/macros-call.s +++ b/llvm/test/MC/LoongArch/Macros/macros-call.s @@ -1,9 +1,26 @@ # RUN: llvm-mc --triple=loongarch64 %s | FileCheck %s +# RUN: llvm-mc --filetype=obj --triple=loongarch64 --mattr=-relax %s -o %t +# RUN: llvm-readobj -r %t | FileCheck %s --check-prefix=RELOC +# RUN: llvm-mc --filetype=obj --triple=loongarch64 --mattr=+relax %s -o %t.relax +# RUN: llvm-readobj -r %t.relax | FileCheck %s --check-prefixes=RELOC,RELAX + +# RELOC: Relocations [ +# RELOC-NEXT: Section ({{.*}}) .rela.text { call36 sym_call # CHECK: pcaddu18i $ra, %call36(sym_call) # CHECK-NEXT: jirl $ra, $ra, 0 +# 
RELOC-NEXT: R_LARCH_CALL36 sym_call 0x0 +# RELAX-NEXT: R_LARCH_RELAX - 0x0 + tail36 $t0, sym_tail # CHECK: pcaddu18i $t0, %call36(sym_tail) # CHECK-NEXT: jr $t0 + +# RELOC-NEXT: R_LARCH_CALL36 sym_tail 0x0 +# RELAX-NEXT: R_LARCH_RELAX - 0x0 + + +# RELOC-NEXT: } +# RELOC-NEXT: ] diff --git a/llvm/test/MC/LoongArch/Macros/macros-la.s b/llvm/test/MC/LoongArch/Macros/macros-la.s index d4272b93ba54d..a732988ef1f1a 100644 --- a/llvm/test/MC/LoongArch/Macros/macros-la.s +++ b/llvm/test/MC/LoongArch/Macros/macros-la.s @@ -5,6 +5,12 @@ # RUN: llvm-readobj -r %t.relax | FileCheck %s --check-prefixes=RELOC,RELAX # RUN: llvm-mc --triple=loongarch64 --mattr=+la-global-with-abs \ # RUN: %s | FileCheck %s --check-prefix=ABS +# RUN: llvm-mc --filetype=obj --triple=loongarch64 --mattr=+la-global-with-abs \ +# RUN: --mattr=-relax %s -o %t +# RUN: llvm-readobj -r %t | FileCheck %s --check-prefix=GTOABS-RELOC +# RUN: llvm-mc --filetype=obj --triple=loongarch64 --mattr=+la-global-with-abs \ +# RUN: --mattr=+relax %s -o %t.relax +# RUN: llvm-readobj -r %t.relax | FileCheck %s --check-prefixes=GTOABS-RELOC,GTOABS-RELAX # RELOC: Relocations [ # RELOC-NEXT: Section ({{.*}}) .rela.text { @@ -36,6 +42,10 @@ la.pcrel $a0, sym_pcrel # RELAX-NEXT: R_LARCH_RELAX - 0x0 # RELOC-NEXT: R_LARCH_PCALA_LO12 sym_pcrel 0x0 # RELAX-NEXT: R_LARCH_RELAX - 0x0 +# GTOABS-RELOC: R_LARCH_PCALA_HI20 sym_pcrel 0x0 +# GTOABS-RELAX-NEXT: R_LARCH_RELAX - 0x0 +# GTOABS-RELOC-NEXT: R_LARCH_PCALA_LO12 sym_pcrel 0x0 +# GTOABS-RELAX-NEXT: R_LARCH_RELAX - 0x0 la.got $a0, sym_got # CHECK-NEXT: pcalau12i $a0, %got_pc_hi20(sym_got) @@ -73,7 +83,9 @@ la.tls.ie $a0, sym_ie # ABS-NEXT: ld.d $a0, $a0, 0 # ABS-EMPTY: # RELOC-NEXT: R_LARCH_TLS_IE_PC_HI20 sym_ie 0x0 +# RELAX-NEXT: R_LARCH_RELAX - 0x0 # RELOC-NEXT: R_LARCH_TLS_IE_PC_LO12 sym_ie 0x0 +# RELAX-NEXT: R_LARCH_RELAX - 0x0 la.tls.ld $a0, sym_ld # CHECK-NEXT: pcalau12i $a0, %ld_pc_hi20(sym_ld) @@ -85,7 +97,9 @@ la.tls.ld $a0, sym_ld # ABS-NEXT: lu52i.d $a0, $a0, 
%got64_hi12(sym_ld) # ABS-EMPTY: # RELOC-NEXT: R_LARCH_TLS_LD_PC_HI20 sym_ld 0x0 +# RELAX-NEXT: R_LARCH_RELAX - 0x0 # RELOC-NEXT: R_LARCH_GOT_PC_LO12 sym_ld 0x0 +# RELAX-NEXT: R_LARCH_RELAX - 0x0 la.tls.gd $a0, sym_gd # CHECK-NEXT: pcalau12i $a0, %gd_pc_hi20(sym_gd) @@ -97,7 +111,9 @@ la.tls.gd $a0, sym_gd # ABS-NEXT: lu52i.d $a0, $a0, %got64_hi12(sym_gd) # ABS-EMPTY: # RELOC-NEXT: R_LARCH_TLS_GD_PC_HI20 sym_gd 0x0 +# RELAX-NEXT: R_LARCH_RELAX - 0x0 # RELOC-NEXT: R_LARCH_GOT_PC_LO12 sym_gd 0x0 +# RELAX-NEXT: R_LARCH_RELAX - 0x0 la.tls.desc $a0, sym_desc # CHECK-NEXT: pcalau12i $a0, %desc_pc_hi20(sym_desc) @@ -113,9 +129,13 @@ la.tls.desc $a0, sym_desc # ABS-NEXT: jirl $ra, $ra, %desc_call(sym_desc) # ABS-EMPTY: # RELOC-NEXT: R_LARCH_TLS_DESC_PC_HI20 sym_desc 0x0 +# RELAX-NEXT: R_LARCH_RELAX - 0x0 # RELOC-NEXT: R_LARCH_TLS_DESC_PC_LO12 sym_desc 0x0 +# RELAX-NEXT: R_LARCH_RELAX - 0x0 # RELOC-NEXT: R_LARCH_TLS_DESC_LD sym_desc 0x0 +# RELAX-NEXT: R_LARCH_RELAX - 0x0 # RELOC-NEXT: R_LARCH_TLS_DESC_CALL sym_desc 0x0 +# RELAX-NEXT: R_LARCH_RELAX - 0x0 ############################################################# ## with a temporary register. 
diff --git a/llvm/test/MC/LoongArch/Relocations/relax-tls-le.s b/llvm/test/MC/LoongArch/Relocations/relax-tls-le.s new file mode 100644 index 0000000000000..899f12f85654d --- /dev/null +++ b/llvm/test/MC/LoongArch/Relocations/relax-tls-le.s @@ -0,0 +1,70 @@ +# RUN: llvm-mc --filetype=obj --triple=loongarch32 --mattr=+relax < %s \ +# RUN: | llvm-readobj -r - | FileCheck --check-prefix=LA32-RELAX-RELOC %s +# RUN: llvm-mc --filetype=obj --triple=loongarch32 --mattr=-relax < %s \ +# RUN: | llvm-readobj -r - | FileCheck --check-prefix=LA32-NORELAX-RELOC %s +# RUN: llvm-mc --triple=loongarch32 --mattr=+relax < %s --show-encoding \ +# RUN: | FileCheck --check-prefix=LA32-RELAX-FIXUP %s + +# RUN: llvm-mc --filetype=obj --triple=loongarch64 --mattr=+relax --defsym=LA64=1 < %s \ +# RUN: | llvm-readobj -r - | FileCheck --check-prefix=LA64-RELAX-RELOC %s +# RUN: llvm-mc --filetype=obj --triple=loongarch64 --mattr=-relax --defsym=LA64=1 < %s \ +# RUN: | llvm-readobj -r - | FileCheck --check-prefix=LA64-NORELAX-RELOC %s +# RUN: llvm-mc --triple=loongarch64 --mattr=+relax --defsym=LA64=1 < %s --show-encoding \ +# RUN: | FileCheck --check-prefix=LA64-RELAX-FIXUP %s + +.long foo + +.ifndef LA64 + +lu12i.w $a0, %le_hi20_r(foo) +# LA32-NORELAX-RELOC: R_LARCH_TLS_LE_HI20_R foo 0x0 +# LA32-NORELAX-RELOC-NOT: R_LARCH_RELAX - 0x0 +# LA32-RELAX-RELOC: R_LARCH_TLS_LE_HI20_R foo 0x0 +# LA32-RELAX-RELOC: R_LARCH_RELAX - 0x0 +# LA32-RELAX-FIXUP: fixup A - offset: 0, value: %le_hi20_r(foo), kind: FK_NONE +# LA32-RELAX-FIXUP: fixup B - offset: 0, value: 0, kind: FK_NONE + +add.w $a0, $a0, $tp, %le_add_r(foo) +# LA32-NORELAX-RELOC: R_LARCH_TLS_LE_ADD_R foo 0x0 +# LA32-NORELAX-RELOC-NOT: R_LARCH_RELAX - 0x0 +# LA32-RELAX-RELOC: R_LARCH_TLS_LE_ADD_R foo 0x0 +# LA32-RELAX-RELOC: R_LARCH_RELAX - 0x0 +# LA32-RELAX-FIXUP: fixup A - offset: 0, value: %le_add_r(foo), kind: FK_NONE +# LA32-RELAX-FIXUP: fixup B - offset: 0, value: 0, kind: FK_NONE + +addi.w $a0, $a0, %le_lo12_r(foo) +# LA32-NORELAX-RELOC: 
R_LARCH_TLS_LE_LO12_R foo 0x0 +# LA32-NORELAX-RELOC-NOT: R_LARCH_RELAX - 0x0 +# LA32-RELAX-RELOC: R_LARCH_TLS_LE_LO12_R foo 0x0 +# LA32-RELAX-RELOC: R_LARCH_RELAX - 0x0 +# LA32-RELAX-FIXUP: fixup A - offset: 0, value: %le_lo12_r(foo), kind: FK_NONE +# LA32-RELAX-FIXUP: fixup B - offset: 0, value: 0, kind: FK_NONE + +.else + +lu12i.w $a0, %le_hi20_r(foo) +# LA64-NORELAX-RELOC: R_LARCH_TLS_LE_HI20_R foo 0x0 +# LA64-NORELAX-RELOC-NOT: R_LARCH_RELAX - 0x0 +# LA64-RELAX-RELOC: R_LARCH_TLS_LE_HI20_R foo 0x0 +# LA64-RELAX-RELOC: R_LARCH_RELAX - 0x0 +# LA64-RELAX-FIXUP: fixup A - offset: 0, value: %le_hi20_r(foo), kind: FK_NONE +# LA64-RELAX-FIXUP: fixup B - offset: 0, value: 0, kind: FK_NONE + +add.d $a0, $a0, $tp, %le_add_r(foo) +# LA64-NORELAX-RELOC: R_LARCH_TLS_LE_ADD_R foo 0x0 +# LA64-NORELAX-RELOC-NOT: R_LARCH_RELAX - 0x0 +# LA64-RELAX-RELOC: R_LARCH_TLS_LE_ADD_R foo 0x0 +# LA64-RELAX-RELOC: R_LARCH_RELAX - 0x0 +# LA64-RELAX-FIXUP: fixup A - offset: 0, value: %le_add_r(foo), kind: FK_NONE +# LA64-RELAX-FIXUP: fixup B - offset: 0, value: 0, kind: FK_NONE + +addi.d $a0, $a0, %le_lo12_r(foo) +# LA64-NORELAX-RELOC: R_LARCH_TLS_LE_LO12_R foo 0x0 +# LA64-NORELAX-RELOC-NOT: R_LARCH_RELAX - 0x0 +# LA64-RELAX-RELOC: R_LARCH_TLS_LE_LO12_R foo 0x0 +# LA64-RELAX-RELOC: R_LARCH_RELAX - 0x0 +# LA64-RELAX-FIXUP: fixup A - offset: 0, value: %le_lo12_r(foo), kind: FK_NONE +# LA64-RELAX-FIXUP: fixup B - offset: 0, value: 0, kind: FK_NONE + +.endif + diff --git a/llvm/test/MC/X86/AMX/x86-64-amx-movrs-att.s b/llvm/test/MC/X86/AMX/x86-64-amx-movrs-att.s index d780ad4f0e369..92db672e1c82d 100755 --- a/llvm/test/MC/X86/AMX/x86-64-amx-movrs-att.s +++ b/llvm/test/MC/X86/AMX/x86-64-amx-movrs-att.s @@ -86,4 +86,92 @@ // CHECK: tileloaddrst1 -32(,%rbp,2), %tmm3 // CHECK: encoding: [0xc4,0xe2,0x79,0x4a,0x1c,0x6d,0xe0,0xff,0xff,0xff] - tileloaddrst1 -32(,%rbp,2), %tmm3 \ No newline at end of file + tileloaddrst1 -32(,%rbp,2), %tmm3 + +// CHECK: t2rpntlvwz0rs 268435456(%r16,%r14,8), %tmm6 +// 
CHECK: encoding: [0x62,0xbd,0x7c,0x08,0xf8,0xb4,0xf0,0x00,0x00,0x00,0x10] + t2rpntlvwz0rs 268435456(%r16,%r14,8), %tmm6 + +// CHECK: t2rpntlvwz0rs 291(%r8,%r17,4), %tmm2 +// CHECK: encoding: [0x62,0xd5,0x78,0x08,0xf8,0x94,0x88,0x23,0x01,0x00,0x00] + t2rpntlvwz0rs 291(%r8,%r17,4), %tmm2 + +// CHECK: t2rpntlvwz0rs 64(%r18), %tmm6 +// CHECK: encoding: [0x62,0xfd,0x7c,0x08,0xf8,0x74,0x22,0x40] + t2rpntlvwz0rs 64(%r18), %tmm6 + +// CHECK: {evex} t2rpntlvwz0rs -32(,%rbp,2), %tmm2 +// CHECK: encoding: [0x62,0xf5,0x7c,0x08,0xf8,0x14,0x6d,0xe0,0xff,0xff,0xff] + {evex} t2rpntlvwz0rs -32(,%rbp,2), %tmm2 + +// CHECK: t2rpntlvwz0rst1 268435456(%r16,%r14,8), %tmm6 +// CHECK: encoding: [0x62,0xbd,0x7c,0x08,0xf9,0xb4,0xf0,0x00,0x00,0x00,0x10] + t2rpntlvwz0rst1 268435456(%r16,%r14,8), %tmm6 + +// CHECK: t2rpntlvwz0rst1 291(%r8,%r17,4), %tmm2 +// CHECK: encoding: [0x62,0xd5,0x78,0x08,0xf9,0x94,0x88,0x23,0x01,0x00,0x00] + t2rpntlvwz0rst1 291(%r8,%r17,4), %tmm2 + +// CHECK: t2rpntlvwz0rst1 64(%r18), %tmm6 +// CHECK: encoding: [0x62,0xfd,0x7c,0x08,0xf9,0x74,0x22,0x40] + t2rpntlvwz0rst1 64(%r18), %tmm6 + +// CHECK: {evex} t2rpntlvwz0rst1 -32(,%rbp,2), %tmm2 +// CHECK: encoding: [0x62,0xf5,0x7c,0x08,0xf9,0x14,0x6d,0xe0,0xff,0xff,0xff] + {evex} t2rpntlvwz0rst1 -32(,%rbp,2), %tmm2 + +// CHECK: t2rpntlvwz1rs 268435456(%r16,%r14,8), %tmm6 +// CHECK: encoding: [0x62,0xbd,0x7d,0x08,0xf8,0xb4,0xf0,0x00,0x00,0x00,0x10] + t2rpntlvwz1rs 268435456(%r16,%r14,8), %tmm6 + +// CHECK: t2rpntlvwz1rs 291(%r8,%r17,4), %tmm2 +// CHECK: encoding: [0x62,0xd5,0x79,0x08,0xf8,0x94,0x88,0x23,0x01,0x00,0x00] + t2rpntlvwz1rs 291(%r8,%r17,4), %tmm2 + +// CHECK: t2rpntlvwz1rs 64(%r18), %tmm6 +// CHECK: encoding: [0x62,0xfd,0x7d,0x08,0xf8,0x74,0x22,0x40] + t2rpntlvwz1rs 64(%r18), %tmm6 + +// CHECK: {evex} t2rpntlvwz1rs -32(,%rbp,2), %tmm2 +// CHECK: encoding: [0x62,0xf5,0x7d,0x08,0xf8,0x14,0x6d,0xe0,0xff,0xff,0xff] + {evex} t2rpntlvwz1rs -32(,%rbp,2), %tmm2 + +// CHECK: t2rpntlvwz1rst1 268435456(%r16,%r14,8), %tmm6 
+// CHECK: encoding: [0x62,0xbd,0x7d,0x08,0xf9,0xb4,0xf0,0x00,0x00,0x00,0x10] + t2rpntlvwz1rst1 268435456(%r16,%r14,8), %tmm6 + +// CHECK: t2rpntlvwz1rst1 291(%r8,%r17,4), %tmm2 +// CHECK: encoding: [0x62,0xd5,0x79,0x08,0xf9,0x94,0x88,0x23,0x01,0x00,0x00] + t2rpntlvwz1rst1 291(%r8,%r17,4), %tmm2 + +// CHECK: t2rpntlvwz1rst1 64(%r18), %tmm6 +// CHECK: encoding: [0x62,0xfd,0x7d,0x08,0xf9,0x74,0x22,0x40] + t2rpntlvwz1rst1 64(%r18), %tmm6 + +// CHECK: {evex} t2rpntlvwz1rst1 -32(,%rbp,2), %tmm2 +// CHECK: encoding: [0x62,0xf5,0x7d,0x08,0xf9,0x14,0x6d,0xe0,0xff,0xff,0xff] + {evex} t2rpntlvwz1rst1 -32(,%rbp,2), %tmm2 + +// CHECK: tileloaddrs 291(%r16,%rax,4), %tmm3 +// CHECK: encoding: [0x62,0xfa,0x7f,0x08,0x4a,0x9c,0x80,0x23,0x01,0x00,0x00] + tileloaddrs 291(%r16,%rax,4), %tmm3 + +// CHECK: tileloaddrs 291(%r8,%r17,4), %tmm3 +// CHECK: encoding: [0x62,0xd2,0x7b,0x08,0x4a,0x9c,0x88,0x23,0x01,0x00,0x00] + tileloaddrs 291(%r8,%r17,4), %tmm3 + +// CHECK: {evex} tileloaddrs -32(,%rbp,2), %tmm3 +// CHECK: encoding: [0x62,0xf2,0x7f,0x08,0x4a,0x1c,0x6d,0xe0,0xff,0xff,0xff] + {evex} tileloaddrs -32(,%rbp,2), %tmm3 + +// CHECK: tileloaddrst1 291(%r16,%rax,4), %tmm3 +// CHECK: encoding: [0x62,0xfa,0x7d,0x08,0x4a,0x9c,0x80,0x23,0x01,0x00,0x00] + tileloaddrst1 291(%r16,%rax,4), %tmm3 + +// CHECK: tileloaddrst1 291(%r8,%r17,4), %tmm3 +// CHECK: encoding: [0x62,0xd2,0x79,0x08,0x4a,0x9c,0x88,0x23,0x01,0x00,0x00] + tileloaddrst1 291(%r8,%r17,4), %tmm3 + +// CHECK: {evex} tileloaddrst1 -32(,%rbp,2), %tmm3 +// CHECK: encoding: [0x62,0xf2,0x7d,0x08,0x4a,0x1c,0x6d,0xe0,0xff,0xff,0xff] + {evex} tileloaddrst1 -32(,%rbp,2), %tmm3 diff --git a/llvm/test/MC/X86/AMX/x86-64-amx-movrs-intel.s b/llvm/test/MC/X86/AMX/x86-64-amx-movrs-intel.s index ccc7ac51a98a4..140d1aa6b198e 100755 --- a/llvm/test/MC/X86/AMX/x86-64-amx-movrs-intel.s +++ b/llvm/test/MC/X86/AMX/x86-64-amx-movrs-intel.s @@ -95,3 +95,99 @@ // CHECK: tileloaddrst1 tmm3, [2*rbp - 32] // CHECK: encoding: 
[0xc4,0xe2,0x79,0x4a,0x1c,0x6d,0xe0,0xff,0xff,0xff] tileloaddrst1 tmm3, [2*rbp - 32] + +// CHECK: t2rpntlvwz0rs tmm6, [r16 + 8*r14 + 268435456] +// CHECK: encoding: [0x62,0xbd,0x7c,0x08,0xf8,0xb4,0xf0,0x00,0x00,0x00,0x10] + t2rpntlvwz0rs tmm6, [r16 + 8*r14 + 268435456] + +// CHECK: t2rpntlvwz0rs tmm2, [r8 + 4*r17 + 291] +// CHECK: encoding: [0x62,0xd5,0x78,0x08,0xf8,0x94,0x88,0x23,0x01,0x00,0x00] + t2rpntlvwz0rs tmm2, [r8 + 4*r17 + 291] + +// CHECK: t2rpntlvwz0rs tmm6, [r18 + 64] +// CHECK: encoding: [0x62,0xfd,0x7c,0x08,0xf8,0x74,0x22,0x40] + t2rpntlvwz0rs tmm6, [r18 + 64] + +// CHECK: {evex} t2rpntlvwz0rs tmm2, [2*rbp - 32] +// CHECK: encoding: [0x62,0xf5,0x7c,0x08,0xf8,0x14,0x6d,0xe0,0xff,0xff,0xff] + {evex} t2rpntlvwz0rs tmm2, [2*rbp - 32] + +// CHECK: t2rpntlvwz0rst1 tmm6, [r16 + 8*r14 + 268435456] +// CHECK: encoding: [0x62,0xbd,0x7c,0x08,0xf9,0xb4,0xf0,0x00,0x00,0x00,0x10] + t2rpntlvwz0rst1 tmm6, [r16 + 8*r14 + 268435456] + +// CHECK: t2rpntlvwz0rst1 tmm2, [r8 + 4*r17 + 291] +// CHECK: encoding: [0x62,0xd5,0x78,0x08,0xf9,0x94,0x88,0x23,0x01,0x00,0x00] + t2rpntlvwz0rst1 tmm2, [r8 + 4*r17 + 291] + +// CHECK: t2rpntlvwz0rst1 tmm6, [r18 + 64] +// CHECK: encoding: [0x62,0xfd,0x7c,0x08,0xf9,0x74,0x22,0x40] + t2rpntlvwz0rst1 tmm6, [r18 + 64] + +// CHECK: {evex} t2rpntlvwz0rst1 tmm2, [2*rbp - 32] +// CHECK: encoding: [0x62,0xf5,0x7c,0x08,0xf9,0x14,0x6d,0xe0,0xff,0xff,0xff] + {evex} t2rpntlvwz0rst1 tmm2, [2*rbp - 32] + +// CHECK: t2rpntlvwz1rs tmm6, [r16 + 8*r14 + 268435456] +// CHECK: encoding: [0x62,0xbd,0x7d,0x08,0xf8,0xb4,0xf0,0x00,0x00,0x00,0x10] + t2rpntlvwz1rs tmm6, [r16 + 8*r14 + 268435456] + +// CHECK: t2rpntlvwz1rs tmm2, [r8 + 4*r17 + 291] +// CHECK: encoding: [0x62,0xd5,0x79,0x08,0xf8,0x94,0x88,0x23,0x01,0x00,0x00] + t2rpntlvwz1rs tmm2, [r8 + 4*r17 + 291] + +// CHECK: t2rpntlvwz1rs tmm6, [r18 + 64] +// CHECK: encoding: [0x62,0xfd,0x7d,0x08,0xf8,0x74,0x22,0x40] + t2rpntlvwz1rs tmm6, [r18 + 64] + +// CHECK: {evex} t2rpntlvwz1rs tmm2, [2*rbp - 32] +// CHECK: 
encoding: [0x62,0xf5,0x7d,0x08,0xf8,0x14,0x6d,0xe0,0xff,0xff,0xff] + {evex} t2rpntlvwz1rs tmm2, [2*rbp - 32] + +// CHECK: t2rpntlvwz1rst1 tmm6, [r16 + 8*r14 + 268435456] +// CHECK: encoding: [0x62,0xbd,0x7d,0x08,0xf9,0xb4,0xf0,0x00,0x00,0x00,0x10] + t2rpntlvwz1rst1 tmm6, [r16 + 8*r14 + 268435456] + +// CHECK: t2rpntlvwz1rst1 tmm2, [r8 + 4*r17 + 291] +// CHECK: encoding: [0x62,0xd5,0x79,0x08,0xf9,0x94,0x88,0x23,0x01,0x00,0x00] + t2rpntlvwz1rst1 tmm2, [r8 + 4*r17 + 291] + +// CHECK: t2rpntlvwz1rst1 tmm6, [r18 + 64] +// CHECK: encoding: [0x62,0xfd,0x7d,0x08,0xf9,0x74,0x22,0x40] + t2rpntlvwz1rst1 tmm6, [r18 + 64] + +// CHECK: {evex} t2rpntlvwz1rst1 tmm2, [2*rbp - 32] +// CHECK: encoding: [0x62,0xf5,0x7d,0x08,0xf9,0x14,0x6d,0xe0,0xff,0xff,0xff] + {evex} t2rpntlvwz1rst1 tmm2, [2*rbp - 32] + +// CHECK: tileloaddrs tmm6, [r16 + 8*r14 + 268435456] +// CHECK: encoding: [0x62,0xba,0x7f,0x08,0x4a,0xb4,0xf0,0x00,0x00,0x00,0x10] + tileloaddrs tmm6, [r16 + 8*r14 + 268435456] + +// CHECK: tileloaddrs tmm3, [r8 + 4*r17 + 291] +// CHECK: encoding: [0x62,0xd2,0x7b,0x08,0x4a,0x9c,0x88,0x23,0x01,0x00,0x00] + tileloaddrs tmm3, [r8 + 4*r17 + 291] + +// CHECK: tileloaddrs tmm6, [r18 + 64] +// CHECK: encoding: [0x62,0xfa,0x7f,0x08,0x4a,0x74,0x22,0x40] + tileloaddrs tmm6, [r18 + 64] + +// CHECK: {evex} tileloaddrs tmm3, [2*rbp - 32] +// CHECK: encoding: [0x62,0xf2,0x7f,0x08,0x4a,0x1c,0x6d,0xe0,0xff,0xff,0xff] + {evex} tileloaddrs tmm3, [2*rbp - 32] + +// CHECK: tileloaddrst1 tmm6, [r16 + 8*r14 + 268435456] +// CHECK: encoding: [0x62,0xba,0x7d,0x08,0x4a,0xb4,0xf0,0x00,0x00,0x00,0x10] + tileloaddrst1 tmm6, [r16 + 8*r14 + 268435456] + +// CHECK: tileloaddrst1 tmm3, [r8 + 4*r17 + 291] +// CHECK: encoding: [0x62,0xd2,0x79,0x08,0x4a,0x9c,0x88,0x23,0x01,0x00,0x00] + tileloaddrst1 tmm3, [r8 + 4*r17 + 291] + +// CHECK: tileloaddrst1 tmm6, [r18 + 64] +// CHECK: encoding: [0x62,0xfa,0x7d,0x08,0x4a,0x74,0x22,0x40] + tileloaddrst1 tmm6, [r18 + 64] + +// CHECK: {evex} tileloaddrst1 tmm3, [2*rbp - 32] +// 
CHECK: encoding: [0x62,0xf2,0x7d,0x08,0x4a,0x1c,0x6d,0xe0,0xff,0xff,0xff] + {evex} tileloaddrst1 tmm3, [2*rbp - 32] diff --git a/llvm/test/MC/X86/amx-avx512-att.s b/llvm/test/MC/X86/amx-avx512-att.s index 6da4ede82c621..ddab9225199a9 100644 --- a/llvm/test/MC/X86/amx-avx512-att.s +++ b/llvm/test/MC/X86/amx-avx512-att.s @@ -16,37 +16,37 @@ // CHECK: encoding: [0x62,0xe3,0x7e,0x48,0x07,0xf2,0x7b] tcvtrowd2ps $123, %tmm2, %zmm22 -// CHECK: tcvtrowps2pbf16h %ecx, %tmm5, %zmm22 +// CHECK: tcvtrowps2bf16h %ecx, %tmm5, %zmm22 // CHECK: encoding: [0x62,0xe2,0x77,0x48,0x6d,0xf5] - tcvtrowps2pbf16h %ecx, %tmm5, %zmm22 + tcvtrowps2bf16h %ecx, %tmm5, %zmm22 -// CHECK: tcvtrowps2pbf16h %ecx, %tmm2, %zmm22 +// CHECK: tcvtrowps2bf16h %ecx, %tmm2, %zmm22 // CHECK: encoding: [0x62,0xe2,0x77,0x48,0x6d,0xf2] - tcvtrowps2pbf16h %ecx, %tmm2, %zmm22 + tcvtrowps2bf16h %ecx, %tmm2, %zmm22 -// CHECK: tcvtrowps2pbf16h $123, %tmm5, %zmm22 +// CHECK: tcvtrowps2bf16h $123, %tmm5, %zmm22 // CHECK: encoding: [0x62,0xe3,0x7f,0x48,0x07,0xf5,0x7b] - tcvtrowps2pbf16h $123, %tmm5, %zmm22 + tcvtrowps2bf16h $123, %tmm5, %zmm22 -// CHECK: tcvtrowps2pbf16h $123, %tmm2, %zmm22 +// CHECK: tcvtrowps2bf16h $123, %tmm2, %zmm22 // CHECK: encoding: [0x62,0xe3,0x7f,0x48,0x07,0xf2,0x7b] - tcvtrowps2pbf16h $123, %tmm2, %zmm22 + tcvtrowps2bf16h $123, %tmm2, %zmm22 -// CHECK: tcvtrowps2pbf16l %ecx, %tmm5, %zmm22 +// CHECK: tcvtrowps2bf16l %ecx, %tmm5, %zmm22 // CHECK: encoding: [0x62,0xe2,0x76,0x48,0x6d,0xf5] - tcvtrowps2pbf16l %ecx, %tmm5, %zmm22 + tcvtrowps2bf16l %ecx, %tmm5, %zmm22 -// CHECK: tcvtrowps2pbf16l %ecx, %tmm2, %zmm22 +// CHECK: tcvtrowps2bf16l %ecx, %tmm2, %zmm22 // CHECK: encoding: [0x62,0xe2,0x76,0x48,0x6d,0xf2] - tcvtrowps2pbf16l %ecx, %tmm2, %zmm22 + tcvtrowps2bf16l %ecx, %tmm2, %zmm22 -// CHECK: tcvtrowps2pbf16l $123, %tmm5, %zmm22 +// CHECK: tcvtrowps2bf16l $123, %tmm5, %zmm22 // CHECK: encoding: [0x62,0xe3,0x7e,0x48,0x77,0xf5,0x7b] - tcvtrowps2pbf16l $123, %tmm5, %zmm22 + tcvtrowps2bf16l $123, 
%tmm5, %zmm22 -// CHECK: tcvtrowps2pbf16l $123, %tmm2, %zmm22 +// CHECK: tcvtrowps2bf16l $123, %tmm2, %zmm22 // CHECK: encoding: [0x62,0xe3,0x7e,0x48,0x77,0xf2,0x7b] - tcvtrowps2pbf16l $123, %tmm2, %zmm22 + tcvtrowps2bf16l $123, %tmm2, %zmm22 // CHECK: tcvtrowps2phh %ecx, %tmm5, %zmm22 // CHECK: encoding: [0x62,0xe2,0x74,0x48,0x6d,0xf5] diff --git a/llvm/test/MC/X86/amx-avx512-intel.s b/llvm/test/MC/X86/amx-avx512-intel.s index 3a517a6cd1aab..918e9126d35f9 100644 --- a/llvm/test/MC/X86/amx-avx512-intel.s +++ b/llvm/test/MC/X86/amx-avx512-intel.s @@ -16,37 +16,37 @@ // CHECK: encoding: [0x62,0xe3,0x7e,0x48,0x07,0xf2,0x7b] tcvtrowd2ps zmm22, tmm2, 123 -// CHECK: tcvtrowps2pbf16h zmm22, tmm5, ecx +// CHECK: tcvtrowps2bf16h zmm22, tmm5, ecx // CHECK: encoding: [0x62,0xe2,0x77,0x48,0x6d,0xf5] - tcvtrowps2pbf16h zmm22, tmm5, ecx + tcvtrowps2bf16h zmm22, tmm5, ecx -// CHECK: tcvtrowps2pbf16h zmm22, tmm2, ecx +// CHECK: tcvtrowps2bf16h zmm22, tmm2, ecx // CHECK: encoding: [0x62,0xe2,0x77,0x48,0x6d,0xf2] - tcvtrowps2pbf16h zmm22, tmm2, ecx + tcvtrowps2bf16h zmm22, tmm2, ecx -// CHECK: tcvtrowps2pbf16h zmm22, tmm5, 123 +// CHECK: tcvtrowps2bf16h zmm22, tmm5, 123 // CHECK: encoding: [0x62,0xe3,0x7f,0x48,0x07,0xf5,0x7b] - tcvtrowps2pbf16h zmm22, tmm5, 123 + tcvtrowps2bf16h zmm22, tmm5, 123 -// CHECK: tcvtrowps2pbf16h zmm22, tmm2, 123 +// CHECK: tcvtrowps2bf16h zmm22, tmm2, 123 // CHECK: encoding: [0x62,0xe3,0x7f,0x48,0x07,0xf2,0x7b] - tcvtrowps2pbf16h zmm22, tmm2, 123 + tcvtrowps2bf16h zmm22, tmm2, 123 -// CHECK: tcvtrowps2pbf16l zmm22, tmm5, ecx +// CHECK: tcvtrowps2bf16l zmm22, tmm5, ecx // CHECK: encoding: [0x62,0xe2,0x76,0x48,0x6d,0xf5] - tcvtrowps2pbf16l zmm22, tmm5, ecx + tcvtrowps2bf16l zmm22, tmm5, ecx -// CHECK: tcvtrowps2pbf16l zmm22, tmm2, ecx +// CHECK: tcvtrowps2bf16l zmm22, tmm2, ecx // CHECK: encoding: [0x62,0xe2,0x76,0x48,0x6d,0xf2] - tcvtrowps2pbf16l zmm22, tmm2, ecx + tcvtrowps2bf16l zmm22, tmm2, ecx -// CHECK: tcvtrowps2pbf16l zmm22, tmm5, 123 +// CHECK: 
tcvtrowps2bf16l zmm22, tmm5, 123 // CHECK: encoding: [0x62,0xe3,0x7e,0x48,0x77,0xf5,0x7b] - tcvtrowps2pbf16l zmm22, tmm5, 123 + tcvtrowps2bf16l zmm22, tmm5, 123 -// CHECK: tcvtrowps2pbf16l zmm22, tmm2, 123 +// CHECK: tcvtrowps2bf16l zmm22, tmm2, 123 // CHECK: encoding: [0x62,0xe3,0x7e,0x48,0x77,0xf2,0x7b] - tcvtrowps2pbf16l zmm22, tmm2, 123 + tcvtrowps2bf16l zmm22, tmm2, 123 // CHECK: tcvtrowps2phh zmm22, tmm5, ecx // CHECK: encoding: [0x62,0xe2,0x74,0x48,0x6d,0xf5] diff --git a/llvm/test/MC/X86/amx-transpose-att.s b/llvm/test/MC/X86/amx-transpose-att.s index 21bbf258ac6ef..5158470f8c905 100644 --- a/llvm/test/MC/X86/amx-transpose-att.s +++ b/llvm/test/MC/X86/amx-transpose-att.s @@ -48,6 +48,54 @@ // CHECK: encoding: [0xc4,0xe2,0x79,0x6f,0x14,0x6d,0xe0,0xff,0xff,0xff] t2rpntlvwz1t1 -32(,%rbp,2), %tmm2 +// CHECK: t2rpntlvwz0 268435456(%r16,%r14,8), %tmm4 +// CHECK: encoding: [0x62,0xba,0x7c,0x08,0x6e,0xa4,0xf0,0x00,0x00,0x00,0x10] + t2rpntlvwz0 268435456(%r16,%r14,8), %tmm4 + +// CHECK: t2rpntlvwz0 291(%r8,%r17,4), %tmm2 +// CHECK: encoding: [0x62,0xd2,0x78,0x08,0x6e,0x94,0x88,0x23,0x01,0x00,0x00] + t2rpntlvwz0 291(%r8,%r17,4), %tmm2 + +// CHECK: {evex} t2rpntlvwz0 -32(,%rbp,2), %tmm2 +// CHECK: encoding: [0x62,0xf2,0x7c,0x08,0x6e,0x14,0x6d,0xe0,0xff,0xff,0xff] + {evex} t2rpntlvwz0 -32(,%rbp,2), %tmm2 + +// CHECK: t2rpntlvwz0t1 268435456(%r16,%r14,8), %tmm4 +// CHECK: encoding: [0x62,0xba,0x7c,0x08,0x6f,0xa4,0xf0,0x00,0x00,0x00,0x10] + t2rpntlvwz0t1 268435456(%r16,%r14,8), %tmm4 + +// CHECK: t2rpntlvwz0t1 291(%r8,%r17,4), %tmm2 +// CHECK: encoding: [0x62,0xd2,0x78,0x08,0x6f,0x94,0x88,0x23,0x01,0x00,0x00] + t2rpntlvwz0t1 291(%r8,%r17,4), %tmm2 + +// CHECK: {evex} t2rpntlvwz0t1 -32(,%rbp,2), %tmm2 +// CHECK: encoding: [0x62,0xf2,0x7c,0x08,0x6f,0x14,0x6d,0xe0,0xff,0xff,0xff] + {evex} t2rpntlvwz0t1 -32(,%rbp,2), %tmm2 + +// CHECK: t2rpntlvwz1 268435456(%r16,%r14,8), %tmm4 +// CHECK: encoding: [0x62,0xba,0x7d,0x08,0x6e,0xa4,0xf0,0x00,0x00,0x00,0x10] + t2rpntlvwz1 
268435456(%r16,%r14,8), %tmm4 + +// CHECK: t2rpntlvwz1 291(%r8,%r17,4), %tmm2 +// CHECK: encoding: [0x62,0xd2,0x79,0x08,0x6e,0x94,0x88,0x23,0x01,0x00,0x00] + t2rpntlvwz1 291(%r8,%r17,4), %tmm2 + +// CHECK: {evex} t2rpntlvwz1 -32(,%rbp,2), %tmm2 +// CHECK: encoding: [0x62,0xf2,0x7d,0x08,0x6e,0x14,0x6d,0xe0,0xff,0xff,0xff] + {evex} t2rpntlvwz1 -32(,%rbp,2), %tmm2 + +// CHECK: t2rpntlvwz1t1 268435456(%r16,%r14,8), %tmm4 +// CHECK: encoding: [0x62,0xba,0x7d,0x08,0x6f,0xa4,0xf0,0x00,0x00,0x00,0x10] + t2rpntlvwz1t1 268435456(%r16,%r14,8), %tmm4 + +// CHECK: t2rpntlvwz1t1 291(%r8,%r17,4), %tmm2 +// CHECK: encoding: [0x62,0xd2,0x79,0x08,0x6f,0x94,0x88,0x23,0x01,0x00,0x00] + t2rpntlvwz1t1 291(%r8,%r17,4), %tmm2 + +// CHECK: {evex} t2rpntlvwz1t1 -32(,%rbp,2), %tmm2 +// CHECK: encoding: [0x62,0xf2,0x7d,0x08,0x6f,0x14,0x6d,0xe0,0xff,0xff,0xff] + {evex} t2rpntlvwz1t1 -32(,%rbp,2), %tmm2 + // CHECK: ttransposed %tmm1, %tmm5 // CHECK: encoding: [0xc4,0xe2,0x7a,0x5f,0xe9] ttransposed %tmm1, %tmm5 diff --git a/llvm/test/MC/X86/amx-transpose-intel.s b/llvm/test/MC/X86/amx-transpose-intel.s index a772232ddbbf2..0d2c22f67a173 100644 --- a/llvm/test/MC/X86/amx-transpose-intel.s +++ b/llvm/test/MC/X86/amx-transpose-intel.s @@ -48,6 +48,54 @@ // CHECK: encoding: [0xc4,0xe2,0x79,0x6f,0x14,0x6d,0xe0,0xff,0xff,0xff] t2rpntlvwz1t1 tmm2, [2*rbp - 32] +// CHECK: t2rpntlvwz0 tmm4, [r16 + 8*r14 + 268435456] +// CHECK: encoding: [0x62,0xba,0x7c,0x08,0x6e,0xa4,0xf0,0x00,0x00,0x00,0x10] + t2rpntlvwz0 tmm4, [r16 + 8*r14 + 268435456] + +// CHECK: t2rpntlvwz0 tmm2, [r8 + 4*r17 + 291] +// CHECK: encoding: [0x62,0xd2,0x78,0x08,0x6e,0x94,0x88,0x23,0x01,0x00,0x00] + t2rpntlvwz0 tmm2, [r8 + 4*r17 + 291] + +// CHECK: {evex} t2rpntlvwz0 tmm2, [2*rbp - 32] +// CHECK: encoding: [0x62,0xf2,0x7c,0x08,0x6e,0x14,0x6d,0xe0,0xff,0xff,0xff] + {evex} t2rpntlvwz0 tmm2, [2*rbp - 32] + +// CHECK: t2rpntlvwz0t1 tmm4, [r16 + 8*r14 + 268435456] +// CHECK: encoding: [0x62,0xba,0x7c,0x08,0x6f,0xa4,0xf0,0x00,0x00,0x00,0x10] + 
t2rpntlvwz0t1 tmm4, [r16 + 8*r14 + 268435456] + +// CHECK: t2rpntlvwz0t1 tmm2, [r8 + 4*r17 + 291] +// CHECK: encoding: [0x62,0xd2,0x78,0x08,0x6f,0x94,0x88,0x23,0x01,0x00,0x00] + t2rpntlvwz0t1 tmm2, [r8 + 4*r17 + 291] + +// CHECK: {evex} t2rpntlvwz0t1 tmm2, [2*rbp - 32] +// CHECK: encoding: [0x62,0xf2,0x7c,0x08,0x6f,0x14,0x6d,0xe0,0xff,0xff,0xff] + {evex} t2rpntlvwz0t1 tmm2, [2*rbp - 32] + +// CHECK: t2rpntlvwz1 tmm4, [r16 + 8*r14 + 268435456] +// CHECK: encoding: [0x62,0xba,0x7d,0x08,0x6e,0xa4,0xf0,0x00,0x00,0x00,0x10] + t2rpntlvwz1 tmm4, [r16 + 8*r14 + 268435456] + +// CHECK: t2rpntlvwz1 tmm2, [r8 + 4*r17 + 291] +// CHECK: encoding: [0x62,0xd2,0x79,0x08,0x6e,0x94,0x88,0x23,0x01,0x00,0x00] + t2rpntlvwz1 tmm2, [r8 + 4*r17 + 291] + +// CHECK: {evex} t2rpntlvwz1 tmm2, [2*rbp - 32] +// CHECK: encoding: [0x62,0xf2,0x7d,0x08,0x6e,0x14,0x6d,0xe0,0xff,0xff,0xff] + {evex} t2rpntlvwz1 tmm2, [2*rbp - 32] + +// CHECK: t2rpntlvwz1t1 tmm4, [r16 + 8*r14 + 268435456] +// CHECK: encoding: [0x62,0xba,0x7d,0x08,0x6f,0xa4,0xf0,0x00,0x00,0x00,0x10] + t2rpntlvwz1t1 tmm4, [r16 + 8*r14 + 268435456] + +// CHECK: t2rpntlvwz1t1 tmm2, [r8 + 4*r17 + 291] +// CHECK: encoding: [0x62,0xd2,0x79,0x08,0x6f,0x94,0x88,0x23,0x01,0x00,0x00] + t2rpntlvwz1t1 tmm2, [r8 + 4*r17 + 291] + +// CHECK: {evex} t2rpntlvwz1t1 tmm2, [2*rbp - 32] +// CHECK: encoding: [0x62,0xf2,0x7d,0x08,0x6f,0x14,0x6d,0xe0,0xff,0xff,0xff] + {evex} t2rpntlvwz1t1 tmm2, [2*rbp - 32] + // CHECK: ttransposed tmm5, tmm1 // CHECK: encoding: [0xc4,0xe2,0x7a,0x5f,0xe9] ttransposed tmm5, tmm1 diff --git a/llvm/test/MC/X86/movrs-att-64.s b/llvm/test/MC/X86/movrs-att-64.s index 59a2fdb6d10b2..e951b30369d46 100644 --- a/llvm/test/MC/X86/movrs-att-64.s +++ b/llvm/test/MC/X86/movrs-att-64.s @@ -94,4 +94,100 @@ // CHECK: movrsq -128(%rdx), %rbx // CHECK: encoding: [0x48,0x0f,0x38,0x8b,0x5a,0x80] - movrs -128(%rdx), %rbx \ No newline at end of file + movrs -128(%rdx), %rbx + +// CHECK: movrsb 268435456(%rbp,%r14,8), %r16b +// CHECK: encoding: 
[0x62,0xa4,0x7c,0x08,0x8a,0x84,0xf5,0x00,0x00,0x00,0x10] + movrs 268435456(%rbp,%r14,8), %r16b + +// CHECK: movrsb 291(%r17,%rax,4), %bl +// CHECK: encoding: [0x62,0xfc,0x7c,0x08,0x8a,0x9c,0x81,0x23,0x01,0x00,0x00] + movrs 291(%r17,%rax,4), %bl + +// CHECK: {evex} movrsb (%rip), %bl +// CHECK: encoding: [0x62,0xf4,0x7c,0x08,0x8a,0x1d,0x00,0x00,0x00,0x00] + {evex} movrs (%rip), %bl + +// CHECK: movrsb -32(,%rbp,2), %r18b +// CHECK: encoding: [0x62,0xe4,0x7c,0x08,0x8a,0x14,0x6d,0xe0,0xff,0xff,0xff] + movrs -32(,%rbp,2), %r18b + +// CHECK: movrsb 127(%r19), %bl +// CHECK: encoding: [0x62,0xfc,0x7c,0x08,0x8a,0x5b,0x7f] + movrs 127(%r19), %bl + +// CHECK: movrsb -128(%r20), %bl +// CHECK: encoding: [0x62,0xfc,0x7c,0x08,0x8a,0x5c,0x24,0x80] + movrs -128(%r20), %bl + +// CHECK: movrsw 268435456(%rbp,%r14,8), %r16w +// CHECK: encoding: [0x62,0xa4,0x7d,0x08,0x8b,0x84,0xf5,0x00,0x00,0x00,0x10] + movrs 268435456(%rbp,%r14,8), %r16w + +// CHECK: movrsw 291(%r17,%rax,4), %bx +// CHECK: encoding: [0x62,0xfc,0x7d,0x08,0x8b,0x9c,0x81,0x23,0x01,0x00,0x00] + movrs 291(%r17,%rax,4), %bx + +// CHECK: {evex} movrsw (%rip), %bx +// CHECK: encoding: [0x62,0xf4,0x7d,0x08,0x8b,0x1d,0x00,0x00,0x00,0x00] + {evex} movrs (%rip), %bx + +// CHECK: movrsw -32(,%rbp,2), %r18w +// CHECK: encoding: [0x62,0xe4,0x7d,0x08,0x8b,0x14,0x6d,0xe0,0xff,0xff,0xff] + movrs -32(,%rbp,2), %r18w + +// CHECK: movrsw 127(%r19), %bx +// CHECK: encoding: [0x62,0xfc,0x7d,0x08,0x8b,0x5b,0x7f] + movrs 127(%r19), %bx + +// CHECK: movrsw -128(%r20), %bx +// CHECK: encoding: [0x62,0xfc,0x7d,0x08,0x8b,0x5c,0x24,0x80] + movrs -128(%r20), %bx + +// CHECK: movrsl 268435456(%rbp,%r14,8), %r16d +// CHECK: encoding: [0x62,0xa4,0x7c,0x08,0x8b,0x84,0xf5,0x00,0x00,0x00,0x10] + movrs 268435456(%rbp,%r14,8), %r16d + +// CHECK: movrsl 291(%r17,%rax,4), %ebx +// CHECK: encoding: [0x62,0xfc,0x7c,0x08,0x8b,0x9c,0x81,0x23,0x01,0x00,0x00] + movrs 291(%r17,%rax,4), %ebx + +// CHECK: {evex} movrsl (%rip), %ebx +// CHECK: encoding: 
[0x62,0xf4,0x7c,0x08,0x8b,0x1d,0x00,0x00,0x00,0x00] + {evex} movrs (%rip), %ebx + +// CHECK: movrsl -32(,%rbp,2), %r18d +// CHECK: encoding: [0x62,0xe4,0x7c,0x08,0x8b,0x14,0x6d,0xe0,0xff,0xff,0xff] + movrs -32(,%rbp,2), %r18d + +// CHECK: movrsl 127(%r19), %ebx +// CHECK: encoding: [0x62,0xfc,0x7c,0x08,0x8b,0x5b,0x7f] + movrs 127(%r19), %ebx + +// CHECK: movrsl -128(%r20), %ebx +// CHECK: encoding: [0x62,0xfc,0x7c,0x08,0x8b,0x5c,0x24,0x80] + movrs -128(%r20), %ebx + +// CHECK: movrsq 268435456(%rbp,%r14,8), %r16 +// CHECK: encoding: [0x62,0xa4,0xfc,0x08,0x8b,0x84,0xf5,0x00,0x00,0x00,0x10] + movrs 268435456(%rbp,%r14,8), %r16 + +// CHECK: movrsq 291(%r17,%rax,4), %rbx +// CHECK: encoding: [0x62,0xfc,0xfc,0x08,0x8b,0x9c,0x81,0x23,0x01,0x00,0x00] + movrs 291(%r17,%rax,4), %rbx + +// CHECK: {evex} movrsq (%rip), %rbx +// CHECK: encoding: [0x62,0xf4,0xfc,0x08,0x8b,0x1d,0x00,0x00,0x00,0x00] + {evex} movrs (%rip), %rbx + +// CHECK: movrsq -32(,%rbp,2), %r18 +// CHECK: encoding: [0x62,0xe4,0xfc,0x08,0x8b,0x14,0x6d,0xe0,0xff,0xff,0xff] + movrs -32(,%rbp,2), %r18 + +// CHECK: movrsq 127(%r19), %rbx +// CHECK: encoding: [0x62,0xfc,0xfc,0x08,0x8b,0x5b,0x7f] + movrs 127(%r19), %rbx + +// CHECK: movrsq -128(%r20), %rbx +// CHECK: encoding: [0x62,0xfc,0xfc,0x08,0x8b,0x5c,0x24,0x80] + movrs -128(%r20), %rbx diff --git a/llvm/test/MC/X86/movrs-intel-64.s b/llvm/test/MC/X86/movrs-intel-64.s index f41075a21b3e8..f698f1c440442 100644 --- a/llvm/test/MC/X86/movrs-intel-64.s +++ b/llvm/test/MC/X86/movrs-intel-64.s @@ -94,4 +94,100 @@ // CHECK: movrs rbx, qword ptr [rdx - 128] // CHECK: encoding: [0x48,0x0f,0x38,0x8b,0x5a,0x80] - movrs rbx, qword ptr [rdx - 128] \ No newline at end of file + movrs rbx, qword ptr [rdx - 128] + +// CHECK: movrs r16b, byte ptr [rbp + 8*r14 + 268435456] +// CHECK: encoding: [0x62,0xa4,0x7c,0x08,0x8a,0x84,0xf5,0x00,0x00,0x00,0x10] + movrs r16b, byte ptr [rbp + 8*r14 + 268435456] + +// CHECK: movrs bl, byte ptr [r17 + 4*rax + 291] +// CHECK: encoding: 
[0x62,0xfc,0x7c,0x08,0x8a,0x9c,0x81,0x23,0x01,0x00,0x00] + movrs bl, byte ptr [r17 + 4*rax + 291] + +// CHECK: {evex} movrs bl, byte ptr [rip] +// CHECK: encoding: [0x62,0xf4,0x7c,0x08,0x8a,0x1d,0x00,0x00,0x00,0x00] + {evex} movrs bl, byte ptr [rip] + +// CHECK: movrs r18b, byte ptr [2*rbp - 32] +// CHECK: encoding: [0x62,0xe4,0x7c,0x08,0x8a,0x14,0x6d,0xe0,0xff,0xff,0xff] + movrs r18b, byte ptr [2*rbp - 32] + +// CHECK: movrs bl, byte ptr [r19 + 127] +// CHECK: encoding: [0x62,0xfc,0x7c,0x08,0x8a,0x5b,0x7f] + movrs bl, byte ptr [r19 + 127] + +// CHECK: movrs bl, byte ptr [r20 - 128] +// CHECK: encoding: [0x62,0xfc,0x7c,0x08,0x8a,0x5c,0x24,0x80] + movrs bl, byte ptr [r20 - 128] + +// CHECK: movrs r16w, word ptr [rbp + 8*r14 + 268435456] +// CHECK: encoding: [0x62,0xa4,0x7d,0x08,0x8b,0x84,0xf5,0x00,0x00,0x00,0x10] + movrs r16w, word ptr [rbp + 8*r14 + 268435456] + +// CHECK: movrs bx, word ptr [r17 + 4*rax + 291] +// CHECK: encoding: [0x62,0xfc,0x7d,0x08,0x8b,0x9c,0x81,0x23,0x01,0x00,0x00] + movrs bx, word ptr [r17 + 4*rax + 291] + +// CHECK: {evex} movrs bx, word ptr [rip] +// CHECK: encoding: [0x62,0xf4,0x7d,0x08,0x8b,0x1d,0x00,0x00,0x00,0x00] + {evex} movrs bx, word ptr [rip] + +// CHECK: movrs r18w, word ptr [2*rbp - 32] +// CHECK: encoding: [0x62,0xe4,0x7d,0x08,0x8b,0x14,0x6d,0xe0,0xff,0xff,0xff] + movrs r18w, word ptr [2*rbp - 32] + +// CHECK: movrs bx, word ptr [r19 + 127] +// CHECK: encoding: [0x62,0xfc,0x7d,0x08,0x8b,0x5b,0x7f] + movrs bx, word ptr [r19 + 127] + +// CHECK: movrs bx, word ptr [r20 - 128] +// CHECK: encoding: [0x62,0xfc,0x7d,0x08,0x8b,0x5c,0x24,0x80] + movrs bx, word ptr [r20 - 128] + +// CHECK: movrs r16d, dword ptr [rbp + 8*r14 + 268435456] +// CHECK: encoding: [0x62,0xa4,0x7c,0x08,0x8b,0x84,0xf5,0x00,0x00,0x00,0x10] + movrs r16d, dword ptr [rbp + 8*r14 + 268435456] + +// CHECK: movrs ebx, dword ptr [r17 + 4*rax + 291] +// CHECK: encoding: [0x62,0xfc,0x7c,0x08,0x8b,0x9c,0x81,0x23,0x01,0x00,0x00] + movrs ebx, dword ptr [r17 + 4*rax + 291] + 
+// CHECK: {evex} movrs ebx, dword ptr [rip] +// CHECK: encoding: [0x62,0xf4,0x7c,0x08,0x8b,0x1d,0x00,0x00,0x00,0x00] + {evex} movrs ebx, dword ptr [rip] + +// CHECK: movrs r18d, dword ptr [2*rbp - 32] +// CHECK: encoding: [0x62,0xe4,0x7c,0x08,0x8b,0x14,0x6d,0xe0,0xff,0xff,0xff] + movrs r18d, dword ptr [2*rbp - 32] + +// CHECK: movrs ebx, dword ptr [r19 + 127] +// CHECK: encoding: [0x62,0xfc,0x7c,0x08,0x8b,0x5b,0x7f] + movrs ebx, dword ptr [r19 + 127] + +// CHECK: movrs ebx, dword ptr [r20 - 128] +// CHECK: encoding: [0x62,0xfc,0x7c,0x08,0x8b,0x5c,0x24,0x80] + movrs ebx, dword ptr [r20 - 128] + +// CHECK: movrs r16, qword ptr [rbp + 8*r14 + 268435456] +// CHECK: encoding: [0x62,0xa4,0xfc,0x08,0x8b,0x84,0xf5,0x00,0x00,0x00,0x10] + movrs r16, qword ptr [rbp + 8*r14 + 268435456] + +// CHECK: movrs rbx, qword ptr [r17 + 4*rax + 291] +// CHECK: encoding: [0x62,0xfc,0xfc,0x08,0x8b,0x9c,0x81,0x23,0x01,0x00,0x00] + movrs rbx, qword ptr [r17 + 4*rax + 291] + +// CHECK: {evex} movrs rbx, qword ptr [rip] +// CHECK: encoding: [0x62,0xf4,0xfc,0x08,0x8b,0x1d,0x00,0x00,0x00,0x00] + {evex} movrs rbx, qword ptr [rip] + +// CHECK: movrs r18, qword ptr [2*rbp - 32] +// CHECK: encoding: [0x62,0xe4,0xfc,0x08,0x8b,0x14,0x6d,0xe0,0xff,0xff,0xff] + movrs r18, qword ptr [2*rbp - 32] + +// CHECK: movrs rbx, qword ptr [r19 + 127] +// CHECK: encoding: [0x62,0xfc,0xfc,0x08,0x8b,0x5b,0x7f] + movrs rbx, qword ptr [r19 + 127] + +// CHECK: movrs rbx, qword ptr [r20 - 128] +// CHECK: encoding: [0x62,0xfc,0xfc,0x08,0x8b,0x5c,0x24,0x80] + movrs rbx, qword ptr [r20 - 128] diff --git a/llvm/test/Object/Inputs/WASM/multi-table.wasm b/llvm/test/Object/Inputs/WASM/multi-table.wasm index 47f5d8311cb74..81e52a2d3e286 100644 Binary files a/llvm/test/Object/Inputs/WASM/multi-table.wasm and b/llvm/test/Object/Inputs/WASM/multi-table.wasm differ diff --git a/llvm/test/Other/print-loop-func-scope.ll b/llvm/test/Other/print-loop-func-scope.ll new file mode 100644 index 0000000000000..507ff70a5fd96 --- /dev/null 
+++ b/llvm/test/Other/print-loop-func-scope.ll @@ -0,0 +1,75 @@ +; This test documents how the IR dumped for loop passes differs with -print-loop-func-scope +; and -print-module-scope +; - Without -print-loop-func-scope, dumps only the loop, with 3 sections- preheader, +; loop, and exit blocks +; - With -print-loop-func-scope, dumps only the function which contains the loop +; - With -print-module-scope, dumps the entire module containing the loop, and disregards +; the -print-loop-func-scope flag. + +; RUN: opt < %s 2>&1 -disable-output \ +; RUN: -passes=licm -print-after=licm \ +; RUN: | FileCheck %s -check-prefix=VANILLA +; RUN: opt < %s 2>&1 -disable-output \ +; RUN: -passes=licm -print-after=licm -print-loop-func-scope \ +; RUN: | FileCheck %s -check-prefix=LOOPFUNC +; RUN: opt < %s 2>&1 -disable-output \ +; RUN: -passes=licm -print-after=licm -print-module-scope \ +; RUN: | FileCheck %s -check-prefix=MODULE +; RUN: opt < %s 2>&1 -disable-output \ +; RUN: -passes=licm -print-after=licm -print-module-scope -print-loop-func-scope\ +; RUN: | FileCheck %s -check-prefix=MODULEWITHLOOP + +; VANILLA: IR Dump After LICMPass +; VANILLA-NOT: define void @foo +; VANILLA: Preheader: +; VANILLA: Loop: +; VANILLA: Exit blocks + +; LOOPFUNC: IR Dump After LICMPass +; LOOPFUNC: (loop: +; LOOPFUNC: define void @foo +; LOOPFUNC-NOT: Preheader: +; LOOPFUNC-NOT: Loop: +; LOOPFUNC-NOT: Exit blocks + +; MODULE: IR Dump After LICMPass +; MODULE: ModuleID = +; MODULE: define void @foo +; MODULE-NOT: Preheader: +; MODULE-NOT: Loop: +; MODULE-NOT: Exit blocks +; MODULE: define void @bar +; MODULE: declare void @baz(i32) + +; MODULEWITHLOOP: IR Dump After LICMPass +; MODULEWITHLOOP: ModuleID = +; MODULEWITHLOOP: define void @foo +; MODULEWITHLOOP-NOT: Preheader: +; MODULEWITHLOOP-NOT: Loop: +; MODULEWITHLOOP-NOT: Exit blocks +; MODULEWITHLOOP: define void @bar +; MODULEWITHLOOP: declare void @baz(i32) + +define void @foo(i32 %n) { +entry: + br label %loop_cond + +loop_cond: + %i = phi 
i32 [ 0, %entry ], [ %i_next, %loop_body ] + %cmp = icmp slt i32 %i, %n + br i1 %cmp, label %loop_body, label %loop_end + +loop_body: + call void @baz(i32 %i) + %i_next = add i32 %i, 1 + br label %loop_cond + +loop_end: + ret void +} + +define void @bar() { + ret void +} + +declare void @baz(i32) diff --git a/llvm/test/TableGen/GlobalISelEmitter/GlobalISelEmitter.td b/llvm/test/TableGen/GlobalISelEmitter/GlobalISelEmitter.td index 7c8181410d400..b7132bf2bcd8c 100644 --- a/llvm/test/TableGen/GlobalISelEmitter/GlobalISelEmitter.td +++ b/llvm/test/TableGen/GlobalISelEmitter/GlobalISelEmitter.td @@ -950,8 +950,8 @@ def MOVcimm8 : I<(outs GPR32:$dst), (ins i32imm:$imm), [(set GPR32:$dst, cimm8:$ // NOOPT-NEXT: GIM_Try, /*On fail goto*//*Label [[LABEL_NUM:[0-9]+]]*/ GIMT_Encode4([[LABEL:[0-9]+]]), // NOOPT-NEXT: GIM_CheckNumOperands, /*MI*/0, /*Expected*/2, // NOOPT-NEXT: GIM_CheckOpcode, /*MI*/0, GIMT_Encode2(TargetOpcode::G_LOAD), -// NOOPT-NEXT: GIM_CheckMemorySizeEqualToLLT, /*MI*/0, /*MMO*/0, /*OpIdx*/0, // NOOPT-NEXT: GIM_CheckAtomicOrdering, /*MI*/0, /*Order*/(uint8_t)AtomicOrdering::NotAtomic, +// NOOPT-NEXT: GIM_CheckMemorySizeEqualToLLT, /*MI*/0, /*MMO*/0, /*OpIdx*/0, // NOOPT-NEXT: // MIs[0] DstI[dst] // NOOPT-NEXT: GIM_RootCheckType, /*Op*/0, /*Type*/GILLT_s32, // NOOPT-NEXT: GIM_RootCheckRegBankForClass, /*Op*/0, /*RC*/GIMT_Encode2(MyTarget::GPR32RegClassID), @@ -973,8 +973,8 @@ def LOAD : I<(outs GPR32:$dst), (ins GPR32:$src1), // NOOPT-NEXT: GIM_Try, /*On fail goto*//*Label [[LABEL_NUM:[0-9]+]]*/ GIMT_Encode4([[LABEL:[0-9]+]]), // NOOPT-NEXT: GIM_CheckNumOperands, /*MI*/0, /*Expected*/2, // NOOPT-NEXT: GIM_CheckOpcode, /*MI*/0, GIMT_Encode2(TargetOpcode::G_LOAD), -// NOOPT-NEXT: GIM_CheckMemorySizeEqualToLLT, /*MI*/0, /*MMO*/0, /*OpIdx*/0, // NOOPT-NEXT: GIM_CheckAtomicOrdering, /*MI*/0, /*Order*/(uint8_t)AtomicOrdering::NotAtomic, +// NOOPT-NEXT: GIM_CheckMemorySizeEqualToLLT, /*MI*/0, /*MMO*/0, /*OpIdx*/0, // NOOPT-NEXT: // MIs[0] DstI[dst] // 
NOOPT-NEXT: GIM_RootCheckType, /*Op*/0, /*Type*/GILLT_p0s32, // NOOPT-NEXT: GIM_RootCheckRegBankForClass, /*Op*/0, /*RC*/GIMT_Encode2(MyTarget::GPR32RegClassID), @@ -996,8 +996,8 @@ def : Pat<(load GPR32:$src), // NOOPT-NEXT: GIM_Try, /*On fail goto*//*Label [[LABEL_NUM:[0-9]+]]*/ GIMT_Encode4([[LABEL:[0-9]+]]), // NOOPT-NEXT: GIM_CheckNumOperands, /*MI*/0, /*Expected*/2, // NOOPT-NEXT: GIM_CheckOpcode, /*MI*/0, GIMT_Encode2(TargetOpcode::G_SEXTLOAD), -// NOOPT-NEXT: GIM_CheckMemorySizeEqualTo, /*MI*/0, /*MMO*/0, /*Size*/GIMT_Encode4(2), // NOOPT-NEXT: GIM_CheckAtomicOrdering, /*MI*/0, /*Order*/(uint8_t)AtomicOrdering::NotAtomic, +// NOOPT-NEXT: GIM_CheckMemorySizeEqualTo, /*MI*/0, /*MMO*/0, /*Size*/GIMT_Encode4(2), // NOOPT-NEXT: // MIs[0] DstI[dst] // NOOPT-NEXT: GIM_RootCheckType, /*Op*/0, /*Type*/GILLT_s32, // NOOPT-NEXT: GIM_RootCheckRegBankForClass, /*Op*/0, /*RC*/GIMT_Encode2(MyTarget::GPR32RegClassID), diff --git a/llvm/test/TableGen/GlobalISelEmitter/HwModes.td b/llvm/test/TableGen/GlobalISelEmitter/HwModes.td index 3588ba3979411..510368516739d 100644 --- a/llvm/test/TableGen/GlobalISelEmitter/HwModes.td +++ b/llvm/test/TableGen/GlobalISelEmitter/HwModes.td @@ -131,8 +131,8 @@ class I Pat> // CHECK-NEXT: GIM_CheckFeatures, GIMT_Encode2(GIFBS_HwMode0), // CHECK-NEXT: GIM_CheckNumOperands, /*MI*/0, /*Expected*/2, // CHECK-NEXT: GIM_CheckOpcode, /*MI*/0, GIMT_Encode2(TargetOpcode::G_LOAD), -// CHECK-NEXT: GIM_CheckMemorySizeEqualToLLT, /*MI*/0, /*MMO*/0, /*OpIdx*/0, // CHECK-NEXT: GIM_CheckAtomicOrdering, /*MI*/0, /*Order*/(uint8_t)AtomicOrdering::NotAtomic, +// CHECK-NEXT: GIM_CheckMemorySizeEqualToLLT, /*MI*/0, /*MMO*/0, /*OpIdx*/0, // CHECK-NEXT: // MIs[0] DstI[dst] // CHECK-NEXT: GIM_RootCheckType, /*Op*/0, /*Type*/GILLT_s64, // CHECK-NEXT: GIM_RootCheckRegBankForClass, /*Op*/0, /*RC*/GIMT_Encode2(MyTarget::GPRRegClassID), @@ -149,8 +149,8 @@ class I Pat> // CHECK-NEXT: GIM_CheckFeatures, GIMT_Encode2(GIFBS_HwMode1), // CHECK-NEXT: GIM_CheckNumOperands, 
/*MI*/0, /*Expected*/2, // CHECK-NEXT: GIM_CheckOpcode, /*MI*/0, GIMT_Encode2(TargetOpcode::G_LOAD), -// CHECK-NEXT: GIM_CheckMemorySizeEqualToLLT, /*MI*/0, /*MMO*/0, /*OpIdx*/0, // CHECK-NEXT: GIM_CheckAtomicOrdering, /*MI*/0, /*Order*/(uint8_t)AtomicOrdering::NotAtomic, +// CHECK-NEXT: GIM_CheckMemorySizeEqualToLLT, /*MI*/0, /*MMO*/0, /*OpIdx*/0, // CHECK-NEXT: // MIs[0] DstI[dst] // CHECK-NEXT: GIM_RootCheckType, /*Op*/0, /*Type*/GILLT_s32, // CHECK-NEXT: GIM_RootCheckRegBankForClass, /*Op*/0, /*RC*/GIMT_Encode2(MyTarget::GPRRegClassID), @@ -173,8 +173,8 @@ def LOAD : I<(outs GPR:$dst), (ins GPR:$src1), // CHECK-NEXT: GIM_CheckFeatures, GIMT_Encode2(GIFBS_HwMode0), // CHECK-NEXT: GIM_CheckNumOperands, /*MI*/0, /*Expected*/2, // CHECK-NEXT: GIM_CheckOpcode, /*MI*/0, GIMT_Encode2(TargetOpcode::G_LOAD), -// CHECK-NEXT: GIM_CheckMemorySizeEqualToLLT, /*MI*/0, /*MMO*/0, /*OpIdx*/0, // CHECK-NEXT: GIM_CheckAtomicOrdering, /*MI*/0, /*Order*/(uint8_t)AtomicOrdering::NotAtomic, +// CHECK-NEXT: GIM_CheckMemorySizeEqualToLLT, /*MI*/0, /*MMO*/0, /*OpIdx*/0, // CHECK-NEXT: // MIs[0] DstI[dst] // CHECK-NEXT: GIM_RootCheckType, /*Op*/0, /*Type*/GILLT_p0s64, // CHECK-NEXT: GIM_RootCheckRegBankForClass, /*Op*/0, /*RC*/GIMT_Encode2(MyTarget::GPRRegClassID), @@ -191,8 +191,8 @@ def LOAD : I<(outs GPR:$dst), (ins GPR:$src1), // CHECK-NEXT: GIM_CheckFeatures, GIMT_Encode2(GIFBS_HwMode1), // CHECK-NEXT: GIM_CheckNumOperands, /*MI*/0, /*Expected*/2, // CHECK-NEXT: GIM_CheckOpcode, /*MI*/0, GIMT_Encode2(TargetOpcode::G_LOAD), -// CHECK-NEXT: GIM_CheckMemorySizeEqualToLLT, /*MI*/0, /*MMO*/0, /*OpIdx*/0, // CHECK-NEXT: GIM_CheckAtomicOrdering, /*MI*/0, /*Order*/(uint8_t)AtomicOrdering::NotAtomic, +// CHECK-NEXT: GIM_CheckMemorySizeEqualToLLT, /*MI*/0, /*MMO*/0, /*OpIdx*/0, // CHECK-NEXT: // MIs[0] DstI[dst] // CHECK-NEXT: GIM_RootCheckType, /*Op*/0, /*Type*/GILLT_p0s32, // CHECK-NEXT: GIM_RootCheckRegBankForClass, /*Op*/0, /*RC*/GIMT_Encode2(MyTarget::GPRRegClassID), diff --git 
a/llvm/test/TableGen/GlobalISelEmitter/MatchTableOptimizer.td b/llvm/test/TableGen/GlobalISelEmitter/MatchTableOptimizer.td index c4307258aae9a..6ac6703991c2d 100644 --- a/llvm/test/TableGen/GlobalISelEmitter/MatchTableOptimizer.td +++ b/llvm/test/TableGen/GlobalISelEmitter/MatchTableOptimizer.td @@ -9,8 +9,8 @@ def LOAD8 : I<(outs GPR8:$dst), (ins GPR8:$src), []>; def LOAD32 : I<(outs GPR8:$dst), (ins GPR32:$src), []>; // CHECK: Label 1: @{{[0-9]+}} // CHECK-NEXT: GIM_Try, /*On fail goto*//*Label [[L1_ID:[0-9]+]]*/ GIMT_Encode4([[L1_AT:[0-9]+]]), +// CHECK-NEXT: GIM_CheckAtomicOrdering, /*MI*/0, /*Order*/(uint8_t)AtomicOrdering::NotAtomic, // CHECK-NEXT: GIM_CheckMemorySizeEqualToLLT, /*MI*/0, /*MMO*/0, /*OpIdx*/0, -// CHECK-NEXT: GIM_CheckAtomicOrdering, /*MI*/0, /*Order*/(uint8_t)AtomicOrdering::NotAtomic, // CHECK-NEXT: GIM_RootCheckRegBankForClass, /*Op*/0, /*RC*/GIMT_Encode2(MyTarget::GPR8RegClassID), // CHECK-NEXT: GIM_Try, /*On fail goto*//*Label [[L2_ID:[0-9]+]]*/ GIMT_Encode4([[L2_AT:[0-9]+]]), // CHECK-NEXT: // MIs[0] src @@ -47,8 +47,8 @@ def LOAD16 : I<(outs GPR16:$dst), (ins GPR16:$src), []>; def LOAD16Imm : I<(outs GPR16:$dst), (ins GPR16:$src), []>; // CHECK: // Label 2: @{{[0-9]+}} // CHECK-NEXT: GIM_Try, /*On fail goto*//*Label [[L1_ID:[0-9]+]]*/ GIMT_Encode4([[L1_AT:[0-9]+]]), -// CHECK-NEXT: GIM_CheckMemorySizeEqualToLLT, /*MI*/0, /*MMO*/0, /*OpIdx*/0, // CHECK-NEXT: GIM_CheckAtomicOrdering, /*MI*/0, /*Order*/(uint8_t)AtomicOrdering::NotAtomic, +// CHECK-NEXT: GIM_CheckMemorySizeEqualToLLT, /*MI*/0, /*MMO*/0, /*OpIdx*/0, // CHECK-NEXT: GIM_RootCheckRegBankForClass, /*Op*/0, /*RC*/GIMT_Encode2(MyTarget::GPR16RegClassID), // CHECK-NEXT: GIM_CheckPointerToAny, /*MI*/0, /*Op*/1, /*SizeInBits*/16, // CHECK-NEXT: GIM_Try, /*On fail goto*//*Label [[L2_ID:[0-9]+]]*/ GIMT_Encode4([[L2_AT:[0-9]+]]), diff --git a/llvm/test/TableGen/GlobalISelEmitter/OverloadedPtr.td b/llvm/test/TableGen/GlobalISelEmitter/OverloadedPtr.td index 31accba8b1847..43a121f94bd6c 
100644 --- a/llvm/test/TableGen/GlobalISelEmitter/OverloadedPtr.td +++ b/llvm/test/TableGen/GlobalISelEmitter/OverloadedPtr.td @@ -13,8 +13,8 @@ let TargetPrefix = "mytarget" in { // Check that iPTR in the destination DAG doesn't prevent the pattern from being imported. // CHECK: GIM_RootCheckType, /*Op*/0, /*Type*/GILLT_s32, -// CHECK-NEXT: GIM_CheckMemorySizeEqualToLLT, /*MI*/0, /*MMO*/0, /*OpIdx*/0, // CHECK-NEXT: GIM_CheckAtomicOrdering, /*MI*/0, /*Order*/(uint8_t)AtomicOrdering::NotAtomic, +// CHECK-NEXT: GIM_CheckMemorySizeEqualToLLT, /*MI*/0, /*MMO*/0, /*OpIdx*/0, // CHECK-NEXT: GIM_RootCheckRegBankForClass, /*Op*/0, /*RC*/GIMT_Encode2(MyTarget::GPR32RegClassID), // CHECK-NEXT: // MIs[0] src1 // CHECK-NEXT: GIM_CheckPointerToAny, /*MI*/0, /*Op*/1, /*SizeInBits*/0, diff --git a/llvm/test/TableGen/GlobalISelEmitter/atomic-store.td b/llvm/test/TableGen/GlobalISelEmitter/atomic-store.td index 53b8670f47e63..99869cc4e8ef0 100644 --- a/llvm/test/TableGen/GlobalISelEmitter/atomic-store.td +++ b/llvm/test/TableGen/GlobalISelEmitter/atomic-store.td @@ -6,8 +6,8 @@ include "GlobalISelEmitterCommon.td" def ST_ATOM_B32 : I<(outs), (ins GPR32Op:$val, GPR32Op:$ptr), []>; // GISEL: GIM_CheckOpcode, /*MI*/0, GIMT_Encode2(TargetOpcode::G_STORE), -// GISEL-NEXT: GIM_CheckMemorySizeEqualTo, /*MI*/0, /*MMO*/0, /*Size*/GIMT_Encode4(1), // GISEL-NEXT: GIM_CheckAtomicOrderingOrStrongerThan, /*MI*/0, /*Order*/(uint8_t)AtomicOrdering::Unordered, +// GISEL-NEXT: GIM_CheckMemorySizeEqualTo, /*MI*/0, /*MMO*/0, /*Size*/GIMT_Encode4(1), // GISEL-NEXT: // MIs[0] val // GISEL-NEXT: GIM_RootCheckType, /*Op*/0, /*Type*/GILLT_s32, // GISEL-NEXT: // MIs[0] ptr diff --git a/llvm/test/TableGen/GlobalISelEmitter/predicated-pattern-order.td b/llvm/test/TableGen/GlobalISelEmitter/predicated-pattern-order.td new file mode 100644 index 0000000000000..ce420dbe01a27 --- /dev/null +++ b/llvm/test/TableGen/GlobalISelEmitter/predicated-pattern-order.td @@ -0,0 +1,82 @@ +// RUN: llvm-tblgen 
-gen-global-isel -optimize-match-table=false -I %p/../../../include -I %p/../Common %s | FileCheck %s +// RUN: llvm-tblgen -gen-global-isel -optimize-match-table=true -I %p/../../../include -I %p/../Common %s | FileCheck -check-prefix=OPT %s + +include "llvm/Target/Target.td" +include "GlobalISelEmitterCommon.td" + +// Check that IPM_GenericPredicate doesn't influence the final order of patterns. +// https://github.com/llvm/llvm-project/issues/121446 + +def aligned_store: PatFrag<(ops node:$v, node:$a), (store $v, $a), [{ + return true; +}]>{ + let GISelPredicateCode = [{ return true; }]; +} + +// CHECK: GIM_Try +// CHECK-NEXT: GIM_CheckNumOperands, /*MI*/0, /*Expected*/2, +// CHECK-NEXT: GIM_CheckOpcode, /*MI*/0, GIMT_Encode2(TargetOpcode::G_STORE), +// CHECK-NEXT: GIM_CheckAtomicOrdering, /*MI*/0, /*Order*/(uint8_t)AtomicOrdering::NotAtomic, +// CHECK-NEXT: GIM_CheckMemorySizeEqualToLLT, /*MI*/0, /*MMO*/0, /*OpIdx*/0, +// CHECK-NEXT: // MIs[0] src0 +// CHECK-NEXT: GIM_RootCheckType, /*Op*/0, /*Type*/GILLT_s32, +// CHECK-NEXT: GIM_RootCheckRegBankForClass, /*Op*/0, /*RC*/GIMT_Encode2(MyTarget::GPR32RegClassID), +// CHECK-NEXT: // MIs[0] src1 +// CHECK-NEXT: GIM_CheckPointerToAny, /*MI*/0, /*Op*/1, /*SizeInBits*/32, +// CHECK-NEXT: GIM_RootCheckRegBankForClass, /*Op*/1, /*RC*/GIMT_Encode2(MyTarget::GPR32RegClassID), +// CHECK-NEXT: GIM_CheckCxxInsnPredicate, /*MI*/0, /*FnId*/GIMT_Encode2(GICXXPred_MI_Predicate_aligned_store), +// CHECK-NEXT: // (st GPR32:{ *:[i32] }:$src0, GPR32:{ *:[i32] }:$src1)<><><> => (MOVALIGNED GPR32:{ *:[i32] }:$src0, GPR32:{ *:[i32] }:$src1) +// CHECK-NEXT: GIR_MutateOpcode, /*InsnID*/0, /*RecycleInsnID*/0, /*Opcode*/GIMT_Encode2(MyTarget::MOVALIGNED), +// CHECK-NEXT: GIR_RootConstrainSelectedInstOperands, +// CHECK-NEXT: // GIR_Coverage + +// CHECK: GIM_Try +// CHECK-NEXT: GIM_CheckNumOperands, /*MI*/0, /*Expected*/2, +// CHECK-NEXT: GIM_CheckOpcode, /*MI*/0, GIMT_Encode2(TargetOpcode::G_STORE), +// CHECK-NEXT: GIM_CheckAtomicOrdering, 
/*MI*/0, /*Order*/(uint8_t)AtomicOrdering::NotAtomic, +// CHECK-NEXT: GIM_CheckMemorySizeEqualToLLT, /*MI*/0, /*MMO*/0, /*OpIdx*/0, +// CHECK-NEXT: // MIs[0] src0 +// CHECK-NEXT: GIM_RootCheckType, /*Op*/0, /*Type*/GILLT_s32, +// CHECK-NEXT: GIM_RootCheckRegBankForClass, /*Op*/0, /*RC*/GIMT_Encode2(MyTarget::GPR32RegClassID), +// CHECK-NEXT: // MIs[0] src1 +// CHECK-NEXT: GIM_CheckPointerToAny, /*MI*/0, /*Op*/1, /*SizeInBits*/32, +// CHECK-NEXT: GIM_RootCheckRegBankForClass, /*Op*/1, /*RC*/GIMT_Encode2(MyTarget::GPR32RegClassID), +// CHECK-NEXT: // (st GPR32:{ *:[i32] }:$src0, GPR32:{ *:[i32] }:$src1)<><> => (MOVUNALIGNED GPR32:{ *:[i32] }:$src0, GPR32:{ *:[i32] }:$src1) +// CHECK-NEXT: GIR_MutateOpcode, /*InsnID*/0, /*RecycleInsnID*/0, /*Opcode*/GIMT_Encode2(MyTarget::MOVUNALIGNED), +// CHECK-NEXT: GIR_RootConstrainSelectedInstOperands, +// CHECK-NEXT: // GIR_Coverage + +// OPT: GIM_Try +// OPT-NEXT: GIM_CheckOpcode, /*MI*/0, GIMT_Encode2(TargetOpcode::G_STORE), +// OPT-NEXT: GIM_RootCheckType, /*Op*/0, /*Type*/GILLT_s32, +// OPT-NEXT: GIM_CheckAtomicOrdering, /*MI*/0, /*Order*/(uint8_t)AtomicOrdering::NotAtomic, +// OPT-NEXT: GIM_CheckMemorySizeEqualToLLT, /*MI*/0, /*MMO*/0, /*OpIdx*/0, + +// OPT-NEXT: GIM_Try +// OPT-NEXT: GIM_RootCheckRegBankForClass, /*Op*/0, /*RC*/GIMT_Encode2(MyTarget::GPR32RegClassID), +// OPT-NEXT: // MIs[0] src1 +// OPT-NEXT: GIM_CheckPointerToAny, /*MI*/0, /*Op*/1, /*SizeInBits*/32, +// OPT-NEXT: GIM_RootCheckRegBankForClass, /*Op*/1, /*RC*/GIMT_Encode2(MyTarget::GPR32RegClassID), +// OPT-NEXT: GIM_CheckCxxInsnPredicate, /*MI*/0, /*FnId*/GIMT_Encode2(GICXXPred_MI_Predicate_aligned_store), +// OPT-NEXT: // (st GPR32:{ *:[i32] }:$src0, GPR32:{ *:[i32] }:$src1)<><><> => (MOVALIGNED GPR32:{ *:[i32] }:$src0, GPR32:{ *:[i32] }:$src1) +// OPT-NEXT: GIR_MutateOpcode, /*InsnID*/0, /*RecycleInsnID*/0, /*Opcode*/GIMT_Encode2(MyTarget::MOVALIGNED), +// OPT-NEXT: GIR_RootConstrainSelectedInstOperands, +// OPT-NEXT: // GIR_Coverage + +// OPT: GIM_Try 
+// OPT-NEXT: GIM_RootCheckRegBankForClass, /*Op*/0, /*RC*/GIMT_Encode2(MyTarget::GPR32RegClassID), +// OPT-NEXT: // MIs[0] src1 +// OPT-NEXT: GIM_CheckPointerToAny, /*MI*/0, /*Op*/1, /*SizeInBits*/32, +// OPT-NEXT: GIM_RootCheckRegBankForClass, /*Op*/1, /*RC*/GIMT_Encode2(MyTarget::GPR32RegClassID), +// OPT-NEXT: // (st GPR32:{ *:[i32] }:$src0, GPR32:{ *:[i32] }:$src1)<><> => (MOVUNALIGNED GPR32:{ *:[i32] }:$src0, GPR32:{ *:[i32] }:$src1) +// OPT-NEXT: GIR_MutateOpcode, /*InsnID*/0, /*RecycleInsnID*/0, /*Opcode*/GIMT_Encode2(MyTarget::MOVUNALIGNED), +// OPT-NEXT: GIR_RootConstrainSelectedInstOperands, +// OPT-NEXT: // GIR_Coverage + +def MOVALIGNED : I<(outs), (ins GPR32:$src0, GPR32:$src1), + [(aligned_store GPR32:$src0, GPR32:$src1)]>; + + +def MOVUNALIGNED : I<(outs), (ins GPR32:$src0, GPR32:$src1), + [(store GPR32:$src0, GPR32:$src1)]>; + diff --git a/llvm/test/TableGen/GlobalISelEmitter/zero-reg.td b/llvm/test/TableGen/GlobalISelEmitter/zero-reg.td index 87e5432093377..dfbe7f902c011 100644 --- a/llvm/test/TableGen/GlobalISelEmitter/zero-reg.td +++ b/llvm/test/TableGen/GlobalISelEmitter/zero-reg.td @@ -22,8 +22,8 @@ def INST : PredI<(outs GPR32:$dst), (ins GPR32:$src), []>; // CHECK: GIM_CheckNumOperands, /*MI*/0, /*Expected*/2, // CHECK-NEXT: GIM_CheckOpcode, /*MI*/0, GIMT_Encode2(TargetOpcode::G_LOAD), -// CHECK-NEXT: GIM_CheckMemorySizeEqualToLLT, /*MI*/0, /*MMO*/0, /*OpIdx*/0, // CHECK-NEXT: GIM_CheckAtomicOrdering, /*MI*/0, /*Order*/(uint8_t)AtomicOrdering::NotAtomic, +// CHECK-NEXT: GIM_CheckMemorySizeEqualToLLT, /*MI*/0, /*MMO*/0, /*OpIdx*/0, // CHECK-NEXT: // MIs[0] DstI[dst] // CHECK-NEXT: GIM_RootCheckType, /*Op*/0, /*Type*/GILLT_s32, // CHECK-NEXT: GIM_RootCheckRegBankForClass, /*Op*/0, /*RC*/GIMT_Encode2(MyTarget::GPR32RegClassID), diff --git a/llvm/test/TableGen/address-space-patfrags.td b/llvm/test/TableGen/address-space-patfrags.td index 582b97d55a518..a2611df048b06 100644 --- a/llvm/test/TableGen/address-space-patfrags.td +++ 
b/llvm/test/TableGen/address-space-patfrags.td @@ -60,9 +60,9 @@ def inst_d : Instruction { // GISEL: GIM_Try, /*On fail goto*//*Label 0*/ GIMT_Encode4({{[0-9]+}}), // Rule ID 0 // // GISEL-NEXT: GIM_CheckNumOperands, /*MI*/0, /*Expected*/2, // GISEL-NEXT: GIM_CheckOpcode, /*MI*/0, GIMT_Encode2(TargetOpcode::G_LOAD), +// GISEL-NEXT: GIM_CheckAtomicOrdering, /*MI*/0, /*Order*/(uint8_t)AtomicOrdering::NotAtomic, // GISEL-NEXT: GIM_CheckMemorySizeEqualToLLT, /*MI*/0, /*MMO*/0, /*OpIdx*/0, // GISEL-NEXT: GIM_CheckMemoryAddressSpace, /*MI*/0, /*MMO*/0, /*NumAddrSpace*/2, /*AddrSpace*/123, /*AddrSpace*//* 455(*/0xC7, 0x03/*)*/, -// GISEL-NEXT: GIM_CheckAtomicOrdering, /*MI*/0, /*Order*/(uint8_t)AtomicOrdering::NotAtomic, def : Pat < (pat_frag_b GPR32:$src), (inst_b GPR32:$src) @@ -80,9 +80,9 @@ def : Pat < // GISEL: GIM_Try, /*On fail goto*//*Label 1*/ GIMT_Encode4({{[0-9]+}}), // Rule ID 1 // // GISEL-NEXT: GIM_CheckNumOperands, /*MI*/0, /*Expected*/2, // GISEL-NEXT: GIM_CheckOpcode, /*MI*/0, GIMT_Encode2(TargetOpcode::G_LOAD), +// GISEL-NEXT: GIM_CheckAtomicOrdering, /*MI*/0, /*Order*/(uint8_t)AtomicOrdering::NotAtomic, // GISEL-NEXT: GIM_CheckMemorySizeEqualToLLT, /*MI*/0, /*MMO*/0, /*OpIdx*/0, // GISEL-NEXT: GIM_CheckMemoryAlignment, /*MI*/0, /*MMO*/0, /*MinAlign*/2, -// GISEL-NEXT: GIM_CheckAtomicOrdering, /*MI*/0, /*Order*/(uint8_t)AtomicOrdering::NotAtomic, def : Pat < (pat_frag_a GPR32:$src), (inst_a GPR32:$src) @@ -99,8 +99,8 @@ def truncstorei16_addrspace : PatFrag<(ops node:$val, node:$ptr), // GISEL: GIM_Try, /*On fail goto*//*Label 2*/ GIMT_Encode4({{[0-9]+}}), // Rule ID 2 // // GISEL-NEXT: GIM_CheckNumOperands, /*MI*/0, /*Expected*/2, // GISEL-NEXT: GIM_CheckOpcode, /*MI*/0, GIMT_Encode2(TargetOpcode::G_STORE), -// GISEL-NEXT: GIM_CheckMemorySizeLessThanLLT, /*MI*/0, /*MMO*/0, /*OpIdx*/0, // GISEL-NEXT: GIM_CheckAtomicOrdering, /*MI*/0, /*Order*/(uint8_t)AtomicOrdering::NotAtomic, +// GISEL-NEXT: GIM_CheckMemorySizeLessThanLLT, /*MI*/0, /*MMO*/0, 
/*OpIdx*/0, // GISEL-NEXT: // MIs[0] src0 // GISEL-NEXT: GIM_RootCheckType, /*Op*/0, /*Type*/GILLT_s32, def : Pat < @@ -112,6 +112,7 @@ def : Pat < // GISEL: GIM_Try, /*On fail goto*//*Label 3*/ GIMT_Encode4({{[0-9]+}}), // Rule ID 3 // // GISEL-NEXT: GIM_CheckNumOperands, /*MI*/0, /*Expected*/2, // GISEL-NEXT: GIM_CheckOpcode, /*MI*/0, GIMT_Encode2(TargetOpcode::G_STORE), +// GISEL-NEXT: GIM_CheckAtomicOrdering, /*MI*/0, /*Order*/(uint8_t)AtomicOrdering::NotAtomic, // GISEL-NEXT: GIM_CheckMemorySizeEqualToLLT, /*MI*/0, /*MMO*/0, /*OpIdx*/0, def : Pat < (store GPR32:$src0, GPR32:$src1), @@ -122,6 +123,7 @@ def : Pat < // GISEL: GIM_Try, /*On fail goto*//*Label 4*/ GIMT_Encode4({{[0-9]+}}), // Rule ID 4 // // GISEL-NEXT: GIM_CheckNumOperands, /*MI*/0, /*Expected*/2, // GISEL-NEXT: GIM_CheckOpcode, /*MI*/0, GIMT_Encode2(TargetOpcode::G_STORE), +// GISEL-NEXT: GIM_CheckAtomicOrdering, /*MI*/0, /*Order*/(uint8_t)AtomicOrdering::NotAtomic, // GISEL-NEXT: GIM_CheckMemorySizeLessThanLLT, /*MI*/0, /*MMO*/0, /*OpIdx*/0, // GISEL-NEXT: GIM_CheckMemorySizeEqualTo, /*MI*/0, /*MMO*/0, /*Size*/GIMT_Encode4(2), // GISEL-NEXT: GIM_CheckMemoryAddressSpace, /*MI*/0, /*MMO*/0, /*NumAddrSpace*/2, /*AddrSpace*/123, /*AddrSpace*//* 455(*/0xC7, 0x03/*)*/, diff --git a/llvm/test/TableGen/x86-instr-mapping.inc b/llvm/test/TableGen/x86-instr-mapping.inc index ed43684db2dfc..4f64d4b8d93d0 100644 --- a/llvm/test/TableGen/x86-instr-mapping.inc +++ b/llvm/test/TableGen/x86-instr-mapping.inc @@ -133,6 +133,10 @@ static const X86TableEntry X86CompressEVEXTable[] = { { X86::MOVDIR64B64_EVEX, X86::MOVDIR64B64 }, { X86::MOVDIRI32_EVEX, X86::MOVDIRI32 }, { X86::MOVDIRI64_EVEX, X86::MOVDIRI64 }, + { X86::MOVRS16rm_EVEX, X86::MOVRS16rm }, + { X86::MOVRS32rm_EVEX, X86::MOVRS32rm }, + { X86::MOVRS64rm_EVEX, X86::MOVRS64rm }, + { X86::MOVRS8rm_EVEX, X86::MOVRS8rm }, { X86::MULX32rm_EVEX, X86::MULX32rm }, { X86::MULX32rr_EVEX, X86::MULX32rr }, { X86::MULX64rm_EVEX, X86::MULX64rm }, @@ -163,6 +167,16 @@ 
static const X86TableEntry X86CompressEVEXTable[] = { { X86::SHRX64rm_EVEX, X86::SHRX64rm }, { X86::SHRX64rr_EVEX, X86::SHRX64rr }, { X86::STTILECFG_EVEX, X86::STTILECFG }, + { X86::T2RPNTLVWZ0RST1_EVEX, X86::T2RPNTLVWZ0RST1 }, + { X86::T2RPNTLVWZ0RS_EVEX, X86::T2RPNTLVWZ0RS }, + { X86::T2RPNTLVWZ0T1_EVEX, X86::T2RPNTLVWZ0T1 }, + { X86::T2RPNTLVWZ0_EVEX, X86::T2RPNTLVWZ0 }, + { X86::T2RPNTLVWZ1RST1_EVEX, X86::T2RPNTLVWZ1RST1 }, + { X86::T2RPNTLVWZ1RS_EVEX, X86::T2RPNTLVWZ1RS }, + { X86::T2RPNTLVWZ1T1_EVEX, X86::T2RPNTLVWZ1T1 }, + { X86::T2RPNTLVWZ1_EVEX, X86::T2RPNTLVWZ1 }, + { X86::TILELOADDRST1_EVEX, X86::TILELOADDRST1 }, + { X86::TILELOADDRS_EVEX, X86::TILELOADDRS }, { X86::TILELOADDT1_EVEX, X86::TILELOADDT1 }, { X86::TILELOADD_EVEX, X86::TILELOADD }, { X86::TILESTORED_EVEX, X86::TILESTORED }, diff --git a/llvm/test/Transforms/FunctionAttrs/phi_cycle.ll b/llvm/test/Transforms/FunctionAttrs/phi_cycle.ll new file mode 100644 index 0000000000000..137becd76588e --- /dev/null +++ b/llvm/test/Transforms/FunctionAttrs/phi_cycle.ll @@ -0,0 +1,52 @@ +; RUN: opt -passes=function-attrs -S < %s + +; Regression test for a null-returning bug of getUnderlyingObjectAggressive(). +; This should not crash. 
+define void @phi_cycle() { +bb: + unreachable + +bb1: ; preds = %bb17 + br label %bb2 + +bb2: ; preds = %bb5, %bb1 + %phi = phi ptr [ %phi6, %bb1 ], [ %phi6, %bb5 ] + br i1 poison, label %bb4, label %bb3 + +bb3: ; preds = %bb2 + %getelementptr = getelementptr inbounds i8, ptr %phi, i32 poison + br label %bb5 + +bb4: ; preds = %bb2 + br label %bb7 + +bb5: ; preds = %bb15, %bb3 + %phi6 = phi ptr [ %getelementptr, %bb3 ], [ %phi16, %bb15 ] + br i1 poison, label %bb17, label %bb2 + +bb7: ; preds = %bb15, %bb4 + %phi8 = phi ptr [ %phi, %bb4 ], [ %phi16, %bb15 ] + br i1 poison, label %bb11, label %bb9 + +bb9: ; preds = %bb7 + %getelementptr10 = getelementptr inbounds i8, ptr %phi8, i32 1 + store i8 poison, ptr %phi8, align 1 + br label %bb15 + +bb11: ; preds = %bb7 + br i1 poison, label %bb13, label %bb12 + +bb12: ; preds = %bb11 + br label %bb13 + +bb13: ; preds = %bb12, %bb11 + %getelementptr14 = getelementptr inbounds i8, ptr %phi8, i32 poison + br label %bb15 + +bb15: ; preds = %bb13, %bb9 + %phi16 = phi ptr [ %getelementptr14, %bb13 ], [ %getelementptr10, %bb9 ] + br i1 poison, label %bb5, label %bb7 + +bb17: ; preds = %bb5 + br label %bb1 +} diff --git a/llvm/test/Transforms/GlobalOpt/resolve-fmv-ifunc.ll b/llvm/test/Transforms/GlobalOpt/resolve-fmv-ifunc.ll new file mode 100644 index 0000000000000..fa817a8cbf417 --- /dev/null +++ b/llvm/test/Transforms/GlobalOpt/resolve-fmv-ifunc.ll @@ -0,0 +1,368 @@ +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --filter "call i32 @(test_single_bb_resolver|test_multi_bb_resolver|test_caller_feats_not_implied|test_non_fmv_caller|test_priority|test_alternative_names)" --version 4 + +; REQUIRES: aarch64-registered-target + +; RUN: opt --passes=globalopt -o - -S < %s | FileCheck %s + +target datalayout = "e-m:e-i8:8:32-i16:16:32-i64:64-i128:128-n32:64-S128" +target triple = "aarch64-unknown-linux-gnu" + +$test_single_bb_resolver.resolver = comdat any +$test_multi_bb_resolver.resolver = comdat 
any +$test_caller_feats_not_implied.resolver = comdat any +$test_non_fmv_caller.resolver = comdat any +$test_priority.resolver = comdat any +$test_alternative_names.resolver = comdat any + +@__aarch64_cpu_features = external local_unnamed_addr global { i64 } + +@test_single_bb_resolver = weak_odr ifunc i32 (), ptr @test_single_bb_resolver.resolver +@test_multi_bb_resolver = weak_odr ifunc i32 (), ptr @test_multi_bb_resolver.resolver +@test_caller_feats_not_implied = weak_odr ifunc i32 (), ptr @test_caller_feats_not_implied.resolver +@test_non_fmv_caller = weak_odr ifunc i32 (), ptr @test_non_fmv_caller.resolver +@test_priority = weak_odr ifunc i32 (), ptr @test_priority.resolver +@test_alternative_names = weak_odr ifunc i32 (), ptr @test_alternative_names.resolver + +declare void @__init_cpu_features_resolver() local_unnamed_addr + +declare i32 @test_single_bb_resolver.default() #0 +declare i32 @test_single_bb_resolver._Msve() #1 +declare i32 @test_single_bb_resolver._Msve2() #2 + +define weak_odr ptr @test_single_bb_resolver.resolver() comdat { +; CHECK-LABEL: define weak_odr ptr @test_single_bb_resolver.resolver() comdat { +resolver_entry: + tail call void @__init_cpu_features_resolver() + %0 = load i64, ptr @__aarch64_cpu_features, align 8 + %1 = and i64 %0, 68719476736 + %.not = icmp eq i64 %1, 0 + %2 = and i64 %0, 1073741824 + %.not3 = icmp eq i64 %2, 0 + %test_single_bb_resolver._Msve.test_single_bb_resolver.default = select i1 %.not3, ptr @test_single_bb_resolver.default, ptr @test_single_bb_resolver._Msve + %common.ret.op = select i1 %.not, ptr %test_single_bb_resolver._Msve.test_single_bb_resolver.default, ptr @test_single_bb_resolver._Msve2 + ret ptr %common.ret.op +} + +define i32 @caller1._Msve() #1 { +; CHECK-LABEL: define i32 @caller1._Msve( +; CHECK-SAME: ) local_unnamed_addr #[[ATTR1:[0-9]+]] { +; CHECK: [[CALL:%.*]] = tail call i32 @test_single_bb_resolver._Msve() +; +entry: + %call = tail call i32 @test_single_bb_resolver() + ret i32 %call +} + 
+define i32 @caller1._Msve2() #2 { +; CHECK-LABEL: define i32 @caller1._Msve2( +; CHECK-SAME: ) local_unnamed_addr #[[ATTR2:[0-9]+]] { +; CHECK: [[CALL:%.*]] = tail call i32 @test_single_bb_resolver._Msve2() +; +entry: + %call = tail call i32 @test_single_bb_resolver() + ret i32 %call +} + +define i32 @caller1.default() #0 { +; CHECK-LABEL: define i32 @caller1.default( +; CHECK-SAME: ) local_unnamed_addr #[[ATTR0:[0-9]+]] { +; CHECK: [[CALL:%.*]] = tail call i32 @test_single_bb_resolver.default() +; +entry: + %call = tail call i32 @test_single_bb_resolver() + ret i32 %call +} + +declare i32 @test_multi_bb_resolver._Mmops() #3 +declare i32 @test_multi_bb_resolver._Msve2() #2 +declare i32 @test_multi_bb_resolver._Msve() #1 +declare i32 @test_multi_bb_resolver.default() #0 + +define weak_odr ptr @test_multi_bb_resolver.resolver() comdat { +; CHECK-LABEL: define weak_odr ptr @test_multi_bb_resolver.resolver() comdat { +resolver_entry: + tail call void @__init_cpu_features_resolver() + %0 = load i64, ptr @__aarch64_cpu_features, align 8 + %1 = and i64 %0, 576460752303423488 + %.not = icmp eq i64 %1, 0 + br i1 %.not, label %resolver_else, label %common.ret + +common.ret: ; preds = %resolver_else2, %resolver_else, %resolver_entry + %common.ret.op = phi ptr [ @test_multi_bb_resolver._Mmops, %resolver_entry ], [ @test_multi_bb_resolver._Msve2, %resolver_else ], [ %test_multi_bb_resolver._Msve.test_multi_bb_resolver.default, %resolver_else2 ] + ret ptr %common.ret.op + +resolver_else: ; preds = %resolver_entry + %2 = and i64 %0, 68719476736 + %.not5 = icmp eq i64 %2, 0 + br i1 %.not5, label %resolver_else2, label %common.ret + +resolver_else2: ; preds = %resolver_else + %3 = and i64 %0, 1073741824 + %.not6 = icmp eq i64 %3, 0 + %test_multi_bb_resolver._Msve.test_multi_bb_resolver.default = select i1 %.not6, ptr @test_multi_bb_resolver.default, ptr @test_multi_bb_resolver._Msve + br label %common.ret +} + +define i32 @caller2._MmopsMsve2() #4 { +; CHECK-LABEL: define i32 
@caller2._MmopsMsve2( +; CHECK-SAME: ) local_unnamed_addr #[[ATTR4:[0-9]+]] { +; CHECK: [[CALL:%.*]] = tail call i32 @test_multi_bb_resolver._Mmops() +; +entry: + %call = tail call i32 @test_multi_bb_resolver() + ret i32 %call +} + +define i32 @caller2._Mmops() #3 { +; CHECK-LABEL: define i32 @caller2._Mmops( +; CHECK-SAME: ) local_unnamed_addr #[[ATTR3:[0-9]+]] { +; CHECK: [[CALL:%.*]] = tail call i32 @test_multi_bb_resolver._Mmops() +; +entry: + %call = tail call i32 @test_multi_bb_resolver() + ret i32 %call +} + +define i32 @caller2._Msve() #1 { +; CHECK-LABEL: define i32 @caller2._Msve( +; CHECK-SAME: ) local_unnamed_addr #[[ATTR1]] { +; CHECK: [[CALL:%.*]] = tail call i32 @test_multi_bb_resolver() +; +entry: + %call = tail call i32 @test_multi_bb_resolver() + ret i32 %call +} + +define i32 @caller2.default() #0 { +; CHECK-LABEL: define i32 @caller2.default( +; CHECK-SAME: ) local_unnamed_addr #[[ATTR0]] { +; CHECK: [[CALL:%.*]] = tail call i32 @test_multi_bb_resolver.default() +; +entry: + %call = tail call i32 @test_multi_bb_resolver() + ret i32 %call +} + +declare i32 @test_caller_feats_not_implied._Mmops() #3 +declare i32 @test_caller_feats_not_implied._Msme() #5 +declare i32 @test_caller_feats_not_implied._Msve() #1 +declare i32 @test_caller_feats_not_implied.default() #0 + +define weak_odr ptr @test_caller_feats_not_implied.resolver() comdat { +; CHECK-LABEL: define weak_odr ptr @test_caller_feats_not_implied.resolver() comdat { +resolver_entry: + tail call void @__init_cpu_features_resolver() + %0 = load i64, ptr @__aarch64_cpu_features, align 8 + %1 = and i64 %0, 576460752303423488 + %.not = icmp eq i64 %1, 0 + br i1 %.not, label %resolver_else, label %common.ret + +common.ret: ; preds = %resolver_else2, %resolver_else, %resolver_entry + %common.ret.op = phi ptr [ @test_caller_feats_not_implied._Mmops, %resolver_entry ], [ @test_caller_feats_not_implied._Msme, %resolver_else ], [ 
%test_caller_feats_not_implied._Msve.test_caller_feats_not_implied.default, %resolver_else2 ] + ret ptr %common.ret.op + +resolver_else: ; preds = %resolver_entry + %2 = and i64 %0, 4398046511104 + %.not5 = icmp eq i64 %2, 0 + br i1 %.not5, label %resolver_else2, label %common.ret + +resolver_else2: ; preds = %resolver_else + %3 = and i64 %0, 1073741824 + %.not6 = icmp eq i64 %3, 0 + %test_caller_feats_not_implied._Msve.test_caller_feats_not_implied.default = select i1 %.not6, ptr @test_caller_feats_not_implied.default, ptr @test_caller_feats_not_implied._Msve + br label %common.ret +} + +define i32 @caller3._Mmops() #3 { +; CHECK-LABEL: define i32 @caller3._Mmops( +; CHECK-SAME: ) local_unnamed_addr #[[ATTR3]] { +; CHECK: [[CALL:%.*]] = tail call i32 @test_caller_feats_not_implied._Mmops() +; +entry: + %call = tail call i32 @test_caller_feats_not_implied() + ret i32 %call +} + +define i32 @caller3._Msve() #1 { +; CHECK-LABEL: define i32 @caller3._Msve( +; CHECK-SAME: ) local_unnamed_addr #[[ATTR1]] { +; CHECK: [[CALL:%.*]] = tail call i32 @test_caller_feats_not_implied() +; +entry: + %call = tail call i32 @test_caller_feats_not_implied() + ret i32 %call +} + +define i32 @caller3.default() #0 { +; CHECK-LABEL: define i32 @caller3.default( +; CHECK-SAME: ) local_unnamed_addr #[[ATTR0]] { +; CHECK: [[CALL:%.*]] = tail call i32 @test_caller_feats_not_implied() +; +entry: + %call = tail call i32 @test_caller_feats_not_implied() + ret i32 %call +} + +declare i32 @test_non_fmv_caller._Maes() #6 +declare i32 @test_non_fmv_caller._Msm4() #7 +declare i32 @test_non_fmv_caller.default() #0 + +define weak_odr ptr @test_non_fmv_caller.resolver() comdat { +; CHECK-LABEL: define weak_odr ptr @test_non_fmv_caller.resolver() comdat { +resolver_entry: + tail call void @__init_cpu_features_resolver() + %0 = load i64, ptr @__aarch64_cpu_features, align 8 + %1 = and i64 %0, 32768 + %.not = icmp eq i64 %1, 0 + %test_non_fmv_caller._Maes.test_non_fmv_caller.default = select i1 %.not, ptr 
@test_non_fmv_caller.default, ptr @test_non_fmv_caller._Maes + ret ptr %test_non_fmv_caller._Maes.test_non_fmv_caller.default +} + +define i32 @caller4() #8 { +; CHECK-LABEL: define i32 @caller4( +; CHECK-SAME: ) local_unnamed_addr #[[ATTR7:[0-9]+]] { +; CHECK: [[CALL:%.*]] = tail call i32 @test_non_fmv_caller() +; +entry: + %call = tail call i32 @test_non_fmv_caller() + ret i32 %call +} + +define i32 @caller5() #9 { +; CHECK-LABEL: define i32 @caller5( +; CHECK-SAME: ) local_unnamed_addr #[[ATTR8:[0-9]+]] { +; CHECK: [[CALL:%.*]] = tail call i32 @test_non_fmv_caller() +; +entry: + %call = tail call i32 @test_non_fmv_caller() + ret i32 %call +} + +declare i32 @test_priority._Msve2-sha3() #10 +declare i32 @test_priority._Mls64Mssbs() #11 +declare i32 @test_priority._MflagmMlseMrng() #12 +declare i32 @test_priority.default() #0 + +define weak_odr ptr @test_priority.resolver() comdat { +; CHECK-LABEL: define weak_odr ptr @test_priority.resolver() comdat { +resolver_entry: + tail call void @__init_cpu_features_resolver() + %0 = load i64, ptr @__aarch64_cpu_features, align 8 + %1 = and i64 %0, 131 + %2 = icmp eq i64 %1, 131 + br i1 %2, label %common.ret, label %resolver_else + +common.ret: ; preds = %resolver_else2, %resolver_else, %resolver_entry + %common.ret.op = phi ptr [ @test_priority._MflagmMlseMrng, %resolver_entry ], [ @test_priority._Mls64Mssbs, %resolver_else ], [ %test_priority._Msve2-sha3.test_priority.default, %resolver_else2 ] + ret ptr %common.ret.op + +resolver_else: ; preds = %resolver_entry + %3 = and i64 %0, 9570149208162304 + %4 = icmp eq i64 %3, 9570149208162304 + br i1 %4, label %common.ret, label %resolver_else2 + +resolver_else2: ; preds = %resolver_else + %5 = and i64 %0, 1099511627776 + %.not = icmp eq i64 %5, 0 + %test_priority._Msve2-sha3.test_priority.default = select i1 %.not, ptr @test_priority.default, ptr @test_priority._Msve2-sha3 + br label %common.ret +} + +define i32 @caller6._MflagmMls64MlseMrngMssbsMsve2-sha3() #13 { +; 
CHECK-LABEL: define i32 @caller6._MflagmMls64MlseMrngMssbsMsve2-sha3( +; CHECK-SAME: ) local_unnamed_addr #[[ATTR12:[0-9]+]] { +; CHECK: [[CALL:%.*]] = tail call i32 @test_priority._Mls64Mssbs() +; +entry: + %call = tail call i32 @test_priority() + ret i32 %call +} + +declare i32 @test_alternative_names._Mdpb2Mfrintts() #14 +declare i32 @test_alternative_names._Mflagm2Mfrintts() #15 +declare i32 @test_alternative_names._Mrcpc2() #16 +declare i32 @test_alternative_names.default() #0 + +define weak_odr ptr @test_alternative_names.resolver() comdat { +; CHECK-LABEL: define weak_odr ptr @test_alternative_names.resolver() comdat { +resolver_entry: + tail call void @__init_cpu_features_resolver() + %0 = load i64, ptr @__aarch64_cpu_features, align 8 + %1 = and i64 %0, 17563904 + %2 = icmp eq i64 %1, 17563904 + br i1 %2, label %common.ret, label %resolver_else + +common.ret: ; preds = %resolver_else2, %resolver_else, %resolver_entry + %common.ret.op = phi ptr [ @test_alternative_names._Mdpb2Mfrintts, %resolver_entry ], [ @test_alternative_names._Mflagm2Mfrintts, %resolver_else ], [ %test_alternative_names._Mrcpc2.test_alternative_names.default, %resolver_else2 ] + ret ptr %common.ret.op + +resolver_else: ; preds = %resolver_entry + %3 = and i64 %0, 16777478 + %4 = icmp eq i64 %3, 16777478 + br i1 %4, label %common.ret, label %resolver_else2 + +resolver_else2: ; preds = %resolver_else + %5 = and i64 %0, 12582912 + %6 = icmp eq i64 %5, 12582912 + %test_alternative_names._Mrcpc2.test_alternative_names.default = select i1 %6, ptr @test_alternative_names._Mrcpc2, ptr @test_alternative_names.default + br label %common.ret +} + +define i32 @caller7._Mdpb2Mfrintts() #14 { +; CHECK-LABEL: define i32 @caller7._Mdpb2Mfrintts( +; CHECK-SAME: ) local_unnamed_addr #[[ATTR13:[0-9]+]] { +; CHECK: [[CALL:%.*]] = tail call i32 @test_alternative_names._Mdpb2Mfrintts() +; +entry: + %call = tail call i32 @test_alternative_names() + ret i32 %call +} + +define i32 @caller7._Mfrintts() #17 { +; 
CHECK-LABEL: define i32 @caller7._Mfrintts( +; CHECK-SAME: ) local_unnamed_addr #[[ATTR16:[0-9]+]] { +; CHECK: [[CALL:%.*]] = tail call i32 @test_alternative_names() +; +entry: + %call = tail call i32 @test_alternative_names() + ret i32 %call +} + +define i32 @caller7._Mrcpc2() #16 { +; CHECK-LABEL: define i32 @caller7._Mrcpc2( +; CHECK-SAME: ) local_unnamed_addr #[[ATTR15:[0-9]+]] { +; CHECK: [[CALL:%.*]] = tail call i32 @test_alternative_names._Mrcpc2() +; +entry: + %call = tail call i32 @test_alternative_names() + ret i32 %call +} + +define i32 @caller7.default() #0 { +; CHECK-LABEL: define i32 @caller7.default( +; CHECK-SAME: ) local_unnamed_addr #[[ATTR0]] { +; CHECK: [[CALL:%.*]] = tail call i32 @test_alternative_names.default() +; +entry: + %call = tail call i32 @test_alternative_names() + ret i32 %call +} + +attributes #0 = { "fmv-features" } +attributes #1 = { "fmv-features"="sve" } +attributes #2 = { "fmv-features"="sve2" } +attributes #3 = { "fmv-features"="mops" } +attributes #4 = { "fmv-features"="mops,sve2" } +attributes #5 = { "fmv-features"="sme" } +attributes #6 = { "fmv-features"="aes" } +attributes #7 = { "fmv-features"="sm4" } +attributes #8 = { "target-features"="+aes,+fp-armv8,+neon,+outline-atomics,+v8a" } +attributes #9 = { "target-features"="+fp-armv8,+neon,+outline-atomics,+v8a,+sm4" } +attributes #10 = { "fmv-features"="sve2-sha3" } +attributes #11 = { "fmv-features"="ls64,ssbs" } +attributes #12 = { "fmv-features"="flagm,lse,rng" } +attributes #13 = { "fmv-features"="flagm,ls64,lse,rng,ssbs,sve2-sha3" } +attributes #14 = { "fmv-features"="dpb2,frintts" } +attributes #15 = { "fmv-features"="flagm2,frintts" } +attributes #16 = { "fmv-features"="rcpc2" } +attributes #17 = { "fmv-features"="frintts" } diff --git a/llvm/test/Transforms/InstCombine/assume-align.ll b/llvm/test/Transforms/InstCombine/assume-align.ll index 47659ff8c8490..f0e0257433086 100644 --- a/llvm/test/Transforms/InstCombine/assume-align.ll +++ 
b/llvm/test/Transforms/InstCombine/assume-align.ll @@ -171,3 +171,85 @@ define ptr @dont_fold_assume_align_zero_of_loaded_pointer_into_align_metadata(pt call void @llvm.assume(i1 true) [ "align"(ptr %p2, i64 0) ] ret ptr %p2 } + +define ptr @redundant_assume_align_1(ptr %p) { +; CHECK-LABEL: @redundant_assume_align_1( +; CHECK-NEXT: [[P2:%.*]] = load ptr, ptr [[P:%.*]], align 8 +; CHECK-NEXT: call void @llvm.assume(i1 true) [ "align"(ptr [[P2]], i32 1) ] +; CHECK-NEXT: call void @foo(ptr [[P2]]) +; CHECK-NEXT: ret ptr [[P2]] +; + %p2 = load ptr, ptr %p + call void @llvm.assume(i1 true) [ "align"(ptr %p2, i32 1) ] + call void @foo(ptr %p2) + ret ptr %p2 +} + + +define ptr @redundant_assume_align_8_via_align_metadata(ptr %p) { +; CHECK-LABEL: @redundant_assume_align_8_via_align_metadata( +; CHECK-NEXT: [[P2:%.*]] = load ptr, ptr [[P:%.*]], align 8, !align [[META0:![0-9]+]] +; CHECK-NEXT: call void @llvm.assume(i1 true) [ "align"(ptr [[P2]], i32 8) ] +; CHECK-NEXT: call void @foo(ptr [[P2]]) +; CHECK-NEXT: ret ptr [[P2]] +; + %p2 = load ptr, ptr %p, !align !{i64 8} + call void @llvm.assume(i1 true) [ "align"(ptr %p2, i32 8) ] + call void @foo(ptr %p2) + ret ptr %p2 +} + +define ptr @assume_align_16_via_align_metadata(ptr %p) { +; CHECK-LABEL: @assume_align_16_via_align_metadata( +; CHECK-NEXT: [[P2:%.*]] = load ptr, ptr [[P:%.*]], align 8, !align [[META0]] +; CHECK-NEXT: call void @llvm.assume(i1 true) [ "align"(ptr [[P2]], i32 16) ] +; CHECK-NEXT: call void @foo(ptr [[P2]]) +; CHECK-NEXT: ret ptr [[P2]] +; + %p2 = load ptr, ptr %p, !align !{i64 8} + call void @llvm.assume(i1 true) [ "align"(ptr %p2, i32 16) ] + call void @foo(ptr %p2) + ret ptr %p2 +} + +define ptr @redundant_assume_align_8_via_align_attribute(ptr align 8 %p) { +; CHECK-LABEL: @redundant_assume_align_8_via_align_attribute( +; CHECK-NEXT: call void @llvm.assume(i1 true) [ "align"(ptr [[P:%.*]], i32 8) ] +; CHECK-NEXT: call void @foo(ptr [[P]]) +; CHECK-NEXT: ret ptr [[P]] +; + call void 
@llvm.assume(i1 true) [ "align"(ptr %p, i32 8) ] + call void @foo(ptr %p) + ret ptr %p +} + +define ptr @assume_align_16_via_align_attribute(ptr align 8 %p) { +; CHECK-LABEL: @assume_align_16_via_align_attribute( +; CHECK-NEXT: call void @llvm.assume(i1 true) [ "align"(ptr [[P:%.*]], i32 16) ] +; CHECK-NEXT: call void @foo(ptr [[P]]) +; CHECK-NEXT: ret ptr [[P]] +; + call void @llvm.assume(i1 true) [ "align"(ptr %p, i32 16) ] + call void @foo(ptr %p) + ret ptr %p +} + +define ptr @redundant_assume_align_8_via_asume(ptr %p) { +; CHECK-LABEL: @redundant_assume_align_8_via_asume( +; CHECK-NEXT: call void @llvm.assume(i1 true) [ "align"(ptr [[P:%.*]], i32 16) ] +; CHECK-NEXT: call void @foo(ptr [[P]]) +; CHECK-NEXT: call void @llvm.assume(i1 true) [ "align"(ptr [[P]], i32 8) ] +; CHECK-NEXT: call void @foo(ptr [[P]]) +; CHECK-NEXT: ret ptr [[P]] +; + call void @llvm.assume(i1 true) [ "align"(ptr %p, i32 16) ] + call void @foo(ptr %p) + call void @llvm.assume(i1 true) [ "align"(ptr %p, i32 8) ] + call void @foo(ptr %p) + ret ptr %p +} + +declare void @foo(ptr) +;. +; CHECK: [[META0]] = !{i64 8} +;. 
diff --git a/llvm/test/Transforms/InstCombine/assume.ll b/llvm/test/Transforms/InstCombine/assume.ll index 2d7bc49b6dcae..c21f8457e82d1 100644 --- a/llvm/test/Transforms/InstCombine/assume.ll +++ b/llvm/test/Transforms/InstCombine/assume.ll @@ -35,13 +35,18 @@ define i32 @foo1(ptr %a) #0 { } define i32 @align_assume_trunc_cond(ptr %a) #0 { -; CHECK-LABEL: @align_assume_trunc_cond( -; CHECK-NEXT: [[T0:%.*]] = load i32, ptr [[A:%.*]], align 4 -; CHECK-NEXT: [[PTRINT:%.*]] = ptrtoint ptr [[A]] to i64 -; CHECK-NEXT: [[TRUNC:%.*]] = trunc i64 [[PTRINT]] to i1 -; CHECK-NEXT: [[MASKCOND:%.*]] = xor i1 [[TRUNC]], true -; CHECK-NEXT: tail call void @llvm.assume(i1 [[MASKCOND]]) -; CHECK-NEXT: ret i32 [[T0]] +; DEFAULT-LABEL: @align_assume_trunc_cond( +; DEFAULT-NEXT: [[T0:%.*]] = load i32, ptr [[A:%.*]], align 4 +; DEFAULT-NEXT: [[PTRINT:%.*]] = ptrtoint ptr [[A]] to i64 +; DEFAULT-NEXT: [[TRUNC:%.*]] = trunc i64 [[PTRINT]] to i1 +; DEFAULT-NEXT: [[MASKCOND:%.*]] = xor i1 [[TRUNC]], true +; DEFAULT-NEXT: tail call void @llvm.assume(i1 [[MASKCOND]]) +; DEFAULT-NEXT: ret i32 [[T0]] +; +; BUNDLES-LABEL: @align_assume_trunc_cond( +; BUNDLES-NEXT: [[T0:%.*]] = load i32, ptr [[A:%.*]], align 4 +; BUNDLES-NEXT: call void @llvm.assume(i1 true) [ "align"(ptr [[A]], i64 2) ] +; BUNDLES-NEXT: ret i32 [[T0]] ; %t0 = load i32, ptr %a, align 4 %ptrint = ptrtoint ptr %a to i64 diff --git a/llvm/test/Transforms/InstCombine/fsqrtdiv-transform.ll b/llvm/test/Transforms/InstCombine/fsqrtdiv-transform.ll new file mode 100644 index 0000000000000..6296954333e8a --- /dev/null +++ b/llvm/test/Transforms/InstCombine/fsqrtdiv-transform.ll @@ -0,0 +1,631 @@ +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 4 +; RUN: opt -S -passes='instcombine' < %s | FileCheck %s + +@x = global double 0.000000e+00 +@r1 = global double 0.000000e+00 +@r2 = global double 0.000000e+00 +@r3 = global double 0.000000e+00 +@v = global [2 x double] zeroinitializer +@v1 = global [2 
x double] zeroinitializer +@v2 = global [2 x double] zeroinitializer + +; div/mul/div1 in the same block. +define void @bb_constraint_case1(double %a) { +; CHECK-LABEL: define void @bb_constraint_case1( +; CHECK-SAME: double [[A:%.*]]) { +; CHECK-NEXT: entry: +; CHECK-NEXT: [[SQRT1:%.*]] = call reassoc double @llvm.sqrt.f64(double [[A]]) +; CHECK-NEXT: [[TMP0:%.*]] = fdiv reassoc double 1.000000e+00, [[A]] +; CHECK-NEXT: [[DIV:%.*]] = fmul reassoc double [[TMP0]], [[SQRT1]] +; CHECK-NEXT: store double [[DIV]], ptr @x, align 8 +; CHECK-NEXT: store double [[TMP0]], ptr @r1, align 8 +; CHECK-NEXT: store double [[SQRT1]], ptr @r2, align 8 +; CHECK-NEXT: ret void +; +entry: + %sqrt = call reassoc nnan nsz ninf double @llvm.sqrt.f64(double %a) + %div = fdiv reassoc arcp ninf double 1.000000e+00, %sqrt + store double %div, ptr @x + %mul = fmul reassoc double %div, %div + store double %mul, ptr @r1 + %div1 = fdiv reassoc double %a, %sqrt + store double %div1, ptr @r2 + ret void +} + +; div/mul in one block and div1 in other block with conditional guard. 
+define void @bb_constraint_case2(double %a, i32 %d) { +; CHECK-LABEL: define void @bb_constraint_case2( +; CHECK-SAME: double [[A:%.*]], i32 [[D:%.*]]) { +; CHECK-NEXT: entry: +; CHECK-NEXT: [[SQRT1:%.*]] = call reassoc double @llvm.sqrt.f64(double [[A]]) +; CHECK-NEXT: [[TMP0:%.*]] = fdiv reassoc double 1.000000e+00, [[A]] +; CHECK-NEXT: [[DIV:%.*]] = fmul reassoc double [[TMP0]], [[SQRT1]] +; CHECK-NEXT: store double [[DIV]], ptr @x, align 8 +; CHECK-NEXT: store double [[TMP0]], ptr @r1, align 8 +; CHECK-NEXT: [[D_NOT:%.*]] = icmp eq i32 [[D]], 0 +; CHECK-NEXT: br i1 [[D_NOT]], label [[IF_END:%.*]], label [[IF_THEN:%.*]] +; CHECK: if.then: +; CHECK-NEXT: store double [[SQRT1]], ptr @r2, align 8 +; CHECK-NEXT: br label [[IF_END]] +; CHECK: if.end: +; CHECK-NEXT: ret void +; +entry: + %sqrt = call reassoc nnan nsz ninf double @llvm.sqrt.f64(double %a) + %div = fdiv reassoc arcp ninf double 1.000000e+00, %sqrt + store double %div, ptr @x + %mul = fmul reassoc double %div, %div + store double %mul, ptr @r1 + %d.not = icmp eq i32 %d, 0 + br i1 %d.not, label %if.end, label %if.then + +if.then: ; preds = %entry + %div1 = fdiv reassoc double %a, %sqrt + store double %div1, ptr @r2 + br label %if.end + +if.end: ; preds = %if.then, %entry + ret void +} + +; div in one block. mul/div1 in other block and conditionally guarded. Don't optimize. 
+define void @bb_constraint_case3(double %a, i32 %d) { +; CHECK-LABEL: define void @bb_constraint_case3( +; CHECK-SAME: double [[A:%.*]], i32 [[D:%.*]]) { +; CHECK-NEXT: entry: +; CHECK-NEXT: [[SQRT:%.*]] = call reassoc nnan ninf nsz double @llvm.sqrt.f64(double [[A]]) +; CHECK-NEXT: [[DIV:%.*]] = fdiv reassoc ninf arcp double 1.000000e+00, [[SQRT]] +; CHECK-NEXT: store double [[DIV]], ptr @x, align 8 +; CHECK-NEXT: [[D_NOT:%.*]] = icmp eq i32 [[D]], 0 +; CHECK-NEXT: br i1 [[D_NOT]], label [[IF_END:%.*]], label [[IF_THEN:%.*]] +; CHECK: if.then: +; CHECK-NEXT: [[MUL:%.*]] = fmul reassoc double [[DIV]], [[DIV]] +; CHECK-NEXT: store double [[MUL]], ptr @r1, align 8 +; CHECK-NEXT: [[DIV1:%.*]] = fdiv reassoc double [[A]], [[SQRT]] +; CHECK-NEXT: store double [[DIV1]], ptr @r2, align 8 +; CHECK-NEXT: br label [[IF_END]] +; CHECK: if.end: +; CHECK-NEXT: ret void +; +entry: + %sqrt = call reassoc nnan nsz ninf double @llvm.sqrt.f64(double %a) + %div = fdiv reassoc arcp ninf double 1.000000e+00, %sqrt + store double %div, ptr @x + %d.not = icmp eq i32 %d, 0 + br i1 %d.not, label %if.end, label %if.then + +if.then: ; preds = %entry + %mul = fmul reassoc double %div, %div + store double %mul, ptr @r1 + %div1 = fdiv reassoc double %a, %sqrt + store double %div1, ptr @r2 + br label %if.end + +if.end: ; preds = %if.then, %entry + ret void +} + +; div in one block. mul/div1 each in different block and conditionally guarded. Don't optimize. 
+define void @bb_constraint_case4(double %a, i32 %c, i32 %d) { +; CHECK-LABEL: define void @bb_constraint_case4( +; CHECK-SAME: double [[A:%.*]], i32 [[C:%.*]], i32 [[D:%.*]]) { +; CHECK-NEXT: entry: +; CHECK-NEXT: [[SQRT:%.*]] = call reassoc nnan ninf nsz double @llvm.sqrt.f64(double [[A]]) +; CHECK-NEXT: [[DIV:%.*]] = fdiv reassoc ninf arcp double 1.000000e+00, [[SQRT]] +; CHECK-NEXT: store double [[DIV]], ptr @x, align 8 +; CHECK-NEXT: [[C_NOT:%.*]] = icmp eq i32 [[C]], 0 +; CHECK-NEXT: br i1 [[C_NOT]], label [[IF_END:%.*]], label [[IF_THEN:%.*]] +; CHECK: if.then: +; CHECK-NEXT: [[MUL:%.*]] = fmul reassoc double [[DIV]], [[DIV]] +; CHECK-NEXT: store double [[MUL]], ptr @r1, align 8 +; CHECK-NEXT: br label [[IF_END]] +; CHECK: if.end: +; CHECK-NEXT: [[D_NOT:%.*]] = icmp eq i32 [[D]], 0 +; CHECK-NEXT: br i1 [[D_NOT]], label [[IF_END1:%.*]], label [[IF_THEN1:%.*]] +; CHECK: if.then1: +; CHECK-NEXT: [[DIV1:%.*]] = fdiv reassoc double [[A]], [[SQRT]] +; CHECK-NEXT: store double [[DIV1]], ptr @r2, align 8 +; CHECK-NEXT: br label [[IF_END1]] +; CHECK: if.end1: +; CHECK-NEXT: ret void +; +entry: + %sqrt = call reassoc nnan nsz ninf double @llvm.sqrt.f64(double %a) + %div = fdiv reassoc arcp ninf double 1.000000e+00, %sqrt + store double %div, ptr @x + %c.not = icmp eq i32 %c, 0 + br i1 %c.not, label %if.end, label %if.then + +if.then: ; preds = %entry + %mul = fmul reassoc double %div, %div + store double %mul, ptr @r1 + br label %if.end + +if.end: ; preds = %if.then, %entry + %d.not = icmp eq i32 %d, 0 + br i1 %d.not, label %if.end1, label %if.then1 + +if.then1: ; preds = %if.end + %div1 = fdiv reassoc double %a, %sqrt + store double %div1, ptr @r2 + br label %if.end1 + +if.end1: ; preds = %if.then1, %if.end + ret void +} + +; sqrt value comes from different blocks. Don't optimize. 
+define void @bb_constraint_case5(double %a, i32 %c) { +; CHECK-LABEL: define void @bb_constraint_case5( +; CHECK-SAME: double [[A:%.*]], i32 [[C:%.*]]) { +; CHECK-NEXT: entry: +; CHECK-NEXT: [[C_NOT:%.*]] = icmp eq i32 [[C]], 0 +; CHECK-NEXT: br i1 [[C_NOT]], label [[IF_ELSE:%.*]], label [[IF_THEN:%.*]] +; CHECK: if.then: +; CHECK-NEXT: [[TMP0:%.*]] = call reassoc nnan ninf nsz double @llvm.sqrt.f64(double [[A]]) +; CHECK-NEXT: br label [[IF_END:%.*]] +; CHECK: if.else: +; CHECK-NEXT: [[ADD:%.*]] = fadd double [[A]], 1.000000e+01 +; CHECK-NEXT: [[TMP1:%.*]] = call reassoc nnan ninf nsz double @llvm.sqrt.f64(double [[ADD]]) +; CHECK-NEXT: br label [[IF_END]] +; CHECK: if.end: +; CHECK-NEXT: [[SQRT:%.*]] = phi double [ [[TMP0]], [[IF_THEN]] ], [ [[TMP1]], [[IF_ELSE]] ] +; CHECK-NEXT: [[DIV:%.*]] = fdiv reassoc ninf arcp double 1.000000e+00, [[SQRT]] +; CHECK-NEXT: [[MUL:%.*]] = fmul reassoc double [[DIV]], [[DIV]] +; CHECK-NEXT: store double [[MUL]], ptr @r1, align 8 +; CHECK-NEXT: [[DIV1:%.*]] = fdiv reassoc double [[A]], [[SQRT]] +; CHECK-NEXT: store double [[DIV1]], ptr @r2, align 8 +; CHECK-NEXT: ret void +; +entry: + %c.not = icmp eq i32 %c, 0 + br i1 %c.not, label %if.else, label %if.then + +if.then: ; preds = %entry + %0 = call reassoc nnan nsz ninf double @llvm.sqrt.f64(double %a) + br label %if.end + +if.else: ; preds = %entry + %add = fadd double %a, 1.000000e+01 + %1 = call reassoc nnan nsz ninf double @llvm.sqrt.f64(double %add) + br label %if.end + +if.end: ; preds = %if.else, %if.then + %sqrt = phi double[ %0, %if.then], [ %1, %if.else] + %div = fdiv reassoc arcp ninf double 1.000000e+00, %sqrt + %mul = fmul reassoc double %div, %div + store double %mul, ptr @r1 + %div1 = fdiv reassoc double %a, %sqrt + store double %div1, ptr @r2 + ret void +} + +; div in one block and conditionally guarded. mul/div1 in other block. Don't optimize. 
+define void @bb_constraint_case6(double %a, i32 %d) { +; CHECK-LABEL: define void @bb_constraint_case6( +; CHECK-SAME: double [[A:%.*]], i32 [[D:%.*]]) { +; CHECK-NEXT: entry: +; CHECK-NEXT: [[SQRT:%.*]] = call reassoc nnan ninf nsz double @llvm.sqrt.f64(double [[A]]) +; CHECK-NEXT: [[D_NOT:%.*]] = icmp eq i32 [[D]], 0 +; CHECK-NEXT: br i1 [[D_NOT]], label [[IF_ELSE:%.*]], label [[IF_THEN:%.*]] +; CHECK: if.else: +; CHECK-NEXT: [[TMP0:%.*]] = load double, ptr @x, align 8 +; CHECK-NEXT: br label [[IF_END:%.*]] +; CHECK: if.then: +; CHECK-NEXT: [[TMP1:%.*]] = fdiv reassoc ninf arcp double 1.000000e+00, [[SQRT]] +; CHECK-NEXT: store double [[TMP1]], ptr @x, align 8 +; CHECK-NEXT: br label [[IF_END]] +; CHECK: if.end: +; CHECK-NEXT: [[DIV:%.*]] = phi double [ [[TMP0]], [[IF_ELSE]] ], [ [[TMP1]], [[IF_THEN]] ] +; CHECK-NEXT: [[MUL:%.*]] = fmul reassoc double [[DIV]], [[DIV]] +; CHECK-NEXT: store double [[MUL]], ptr @r1, align 8 +; CHECK-NEXT: [[DIV1:%.*]] = fdiv reassoc double [[A]], [[SQRT]] +; CHECK-NEXT: store double [[DIV1]], ptr @r2, align 8 +; CHECK-NEXT: ret void +; +entry: + %sqrt = call reassoc nnan nsz ninf double @llvm.sqrt.f64(double %a) + %d.not = icmp eq i32 %d, 0 + br i1 %d.not, label %if.else, label %if.then + +if.else: ; preds = %entry + %1 = load double, ptr @x + br label %if.end + +if.then: ; preds = %entry + %2 = fdiv reassoc arcp ninf double 1.000000e+00, %sqrt + store double %2, ptr @x + br label %if.end + +if.end: ; preds = %if.else, %if.then + %div = phi double [ %1, %if.else ], [ %2, %if.then ] + %mul = fmul reassoc double %div, %div + store double %mul, ptr @r1 + %div1 = fdiv reassoc double %a, %sqrt + store double %div1, ptr @r2 + ret void +} + +; value for mul comes from different blocks. Don't optimize. 
+define void @bb_constraint_case7(double %a, i32 %c, i32 %d) { +; CHECK-LABEL: define void @bb_constraint_case7( +; CHECK-SAME: double [[A:%.*]], i32 [[C:%.*]], i32 [[D:%.*]]) { +; CHECK-NEXT: entry: +; CHECK-NEXT: [[SQRT:%.*]] = call reassoc nnan ninf nsz double @llvm.sqrt.f64(double [[A]]) +; CHECK-NEXT: [[DIV:%.*]] = fdiv reassoc ninf arcp double 1.000000e+00, [[SQRT]] +; CHECK-NEXT: store double [[DIV]], ptr @x, align 8 +; CHECK-NEXT: [[C_NOT:%.*]] = icmp eq i32 [[C]], 0 +; CHECK-NEXT: br i1 [[C_NOT]], label [[IF_ELSE:%.*]], label [[IF_THEN:%.*]] +; CHECK: if.then: +; CHECK-NEXT: [[TMP0:%.*]] = fdiv double 3.000000e+00, [[A]] +; CHECK-NEXT: br label [[IF_END:%.*]] +; CHECK: if.else: +; CHECK-NEXT: [[D_NOT:%.*]] = icmp eq i32 [[D]], 0 +; CHECK-NEXT: br i1 [[D_NOT]], label [[IF_ELSE1:%.*]], label [[IF_THEN1:%.*]] +; CHECK: if.then1: +; CHECK-NEXT: [[TMP1:%.*]] = fdiv double 2.000000e+00, [[A]] +; CHECK-NEXT: br label [[IF_END]] +; CHECK: if.else1: +; CHECK-NEXT: [[TMP2:%.*]] = fmul reassoc double [[DIV]], [[DIV]] +; CHECK-NEXT: br label [[IF_END]] +; CHECK: if.end: +; CHECK-NEXT: [[MUL:%.*]] = phi double [ [[TMP1]], [[IF_THEN1]] ], [ [[TMP2]], [[IF_ELSE1]] ], [ [[TMP0]], [[IF_THEN]] ] +; CHECK-NEXT: store double [[MUL]], ptr @r1, align 8 +; CHECK-NEXT: [[DIV1:%.*]] = fdiv reassoc double [[A]], [[SQRT]] +; CHECK-NEXT: store double [[DIV1]], ptr @r2, align 8 +; CHECK-NEXT: ret void +; +entry: + %sqrt = call reassoc nnan nsz ninf double @llvm.sqrt.f64(double %a) + %div = fdiv reassoc arcp ninf double 1.000000e+00, %sqrt + store double %div, ptr @x + %c.not = icmp eq i32 %c, 0 + br i1 %c.not, label %if.else, label %if.then + +if.then: ; preds = %entry + %1 = fdiv double 3.000000e+00, %a + br label %if.end + +if.else: ; preds = %entry + %d.not = icmp eq i32 %d, 0 + br i1 %d.not, label %if.else1, label %if.then1 + +if.then1: ; preds = %if.else + %2 = fdiv double 2.000000e+00, %a + br label %if.end + +if.else1: ; preds = %if.else + %3 = fmul reassoc double %div, %div + 
br label %if.end + +if.end: ; preds = %if.then1, %if.else1, %if.then + %mul = phi double [ %2, %if.then1 ], [ %3, %if.else1 ], [ %1, %if.then ] + store double %mul, ptr @r1 + %div1 = fdiv reassoc double %a, %sqrt + store double %div1, ptr @r2 + ret void +} + +; value of mul comes from two different blocks(as shown by select ins). +define void @bb_constraint_case8(double %a, i32 %c) { +; CHECK-LABEL: define void @bb_constraint_case8( +; CHECK-SAME: double [[A:%.*]], i32 [[C:%.*]]) { +; CHECK-NEXT: entry: +; CHECK-NEXT: [[SQRT1:%.*]] = call reassoc double @llvm.sqrt.f64(double [[A]]) +; CHECK-NEXT: [[TMP0:%.*]] = fdiv reassoc double 1.000000e+00, [[A]] +; CHECK-NEXT: [[DIV:%.*]] = fmul reassoc double [[TMP0]], [[SQRT1]] +; CHECK-NEXT: store double [[DIV]], ptr @x, align 8 +; CHECK-NEXT: [[C_NOT:%.*]] = icmp eq i32 [[C]], 0 +; CHECK-NEXT: [[TMP1:%.*]] = fmul double [[A]], [[A]] +; CHECK-NEXT: [[MUL:%.*]] = select i1 [[C_NOT]], double [[TMP1]], double [[TMP0]] +; CHECK-NEXT: store double [[MUL]], ptr @r1, align 8 +; CHECK-NEXT: store double [[SQRT1]], ptr @r2, align 8 +; CHECK-NEXT: ret void +; +entry: + %sqrt = call reassoc nnan nsz ninf double @llvm.sqrt.f64(double %a) + %div = fdiv reassoc arcp ninf double 1.000000e+00, %sqrt + store double %div, ptr @x + %c.not = icmp eq i32 %c, 0 + %1 = fmul double %a, %a + %2 = fmul reassoc double %div, %div + %mul = select i1 %c.not, double %1, double %2 + store double %mul, ptr @r1 + %div1 = fdiv reassoc double %a, %sqrt + store double %div1, ptr @r2 + ret void +} + +; multiple instances of multiply ops to optimize. Optimize all. 
+define void @mutiple_multiply_instances(double %a, i32 %c) { +; CHECK-LABEL: define void @mutiple_multiply_instances( +; CHECK-SAME: double [[A:%.*]], i32 [[C:%.*]]) { +; CHECK-NEXT: entry: +; CHECK-NEXT: [[SQRT1:%.*]] = call reassoc double @llvm.sqrt.f64(double [[A]]) +; CHECK-NEXT: [[TMP1:%.*]] = fdiv reassoc double 1.000000e+00, [[A]] +; CHECK-NEXT: [[DIV:%.*]] = fmul reassoc double [[TMP1]], [[SQRT1]] +; CHECK-NEXT: store double [[DIV]], ptr @x, align 8 +; CHECK-NEXT: [[C_NOT:%.*]] = icmp eq i32 [[C]], 0 +; CHECK-NEXT: [[TMP2:%.*]] = fmul double [[A]], [[A]] +; CHECK-NEXT: [[TMP3:%.*]] = fmul double [[A]], [[A]] +; CHECK-NEXT: [[MUL1:%.*]] = select i1 [[C_NOT]], double [[TMP2]], double [[TMP1]] +; CHECK-NEXT: [[MUL2:%.*]] = select i1 [[C_NOT]], double [[TMP1]], double [[TMP3]] +; CHECK-NEXT: store double [[MUL1]], ptr @r1, align 8 +; CHECK-NEXT: store double [[MUL2]], ptr @r3, align 8 +; CHECK-NEXT: store double [[SQRT1]], ptr @r2, align 8 +; CHECK-NEXT: ret void +; +entry: + %sqrt = call reassoc nnan nsz ninf double @llvm.sqrt.f64(double %a) + %div = fdiv reassoc arcp ninf double 1.000000e+00, %sqrt + store double %div, ptr @x + %c.not = icmp eq i32 %c, 0 + %1 = fmul double %a, %a + %2 = fmul double %a, %a + %3 = fmul reassoc double %div, %div + %4 = fmul reassoc double %div, %div + %mul1 = select i1 %c.not, double %1, double %3 + %mul2 = select i1 %c.not, double %4, double %2 + store double %mul1, ptr @r1 + store double %mul2, ptr @r3 + %div1 = fdiv reassoc double %a, %sqrt + store double %div1, ptr @r2 + ret void +} + +; missing flags for optimization. 
+define void @missing_arcp_flag_on_div(double %a) { +; CHECK-LABEL: define void @missing_arcp_flag_on_div( +; CHECK-SAME: double [[A:%.*]]) { +; CHECK-NEXT: entry: +; CHECK-NEXT: [[SQRT:%.*]] = call reassoc nnan ninf nsz double @llvm.sqrt.f64(double [[A]]) +; CHECK-NEXT: [[DIV:%.*]] = fdiv reassoc ninf double 1.000000e+00, [[SQRT]] +; CHECK-NEXT: store double [[DIV]], ptr @x, align 8 +; CHECK-NEXT: [[MUL:%.*]] = fmul reassoc double [[DIV]], [[DIV]] +; CHECK-NEXT: store double [[MUL]], ptr @r1, align 8 +; CHECK-NEXT: [[DIV1:%.*]] = fdiv reassoc double [[A]], [[SQRT]] +; CHECK-NEXT: store double [[DIV1]], ptr @r2, align 8 +; CHECK-NEXT: ret void +; +entry: + %sqrt = call reassoc nnan nsz ninf double @llvm.sqrt.f64(double %a) + %div = fdiv reassoc ninf double 1.000000e+00, %sqrt + store double %div, ptr @x + %mul = fmul reassoc double %div, %div + store double %mul, ptr @r1 + %div1 = fdiv reassoc double %a, %sqrt + store double %div1, ptr @r2 + ret void +} + +; missing flags for optimization. 
+define void @missing_reassoc_flag_on_mul(double %a) { +; CHECK-LABEL: define void @missing_reassoc_flag_on_mul( +; CHECK-SAME: double [[A:%.*]]) { +; CHECK-NEXT: entry: +; CHECK-NEXT: [[SQRT:%.*]] = call reassoc nnan ninf nsz double @llvm.sqrt.f64(double [[A]]) +; CHECK-NEXT: [[DIV:%.*]] = fdiv reassoc ninf arcp double 1.000000e+00, [[SQRT]] +; CHECK-NEXT: store double [[DIV]], ptr @x, align 8 +; CHECK-NEXT: [[MUL:%.*]] = fmul double [[DIV]], [[DIV]] +; CHECK-NEXT: store double [[MUL]], ptr @r1, align 8 +; CHECK-NEXT: [[DIV1:%.*]] = fdiv reassoc double [[A]], [[SQRT]] +; CHECK-NEXT: store double [[DIV1]], ptr @r2, align 8 +; CHECK-NEXT: ret void +; +entry: + %sqrt = call reassoc nnan nsz ninf double @llvm.sqrt.f64(double %a) + %div = fdiv reassoc arcp ninf double 1.000000e+00, %sqrt + store double %div, ptr @x + %mul = fmul double %div, %div + store double %mul, ptr @r1 + %div1 = fdiv reassoc double %a, %sqrt + store double %div1, ptr @r2 + ret void +} + +; missing flags for optimization. 
+define void @missing_reassoc_flag_on_div1(double %a) { +; CHECK-LABEL: define void @missing_reassoc_flag_on_div1( +; CHECK-SAME: double [[A:%.*]]) { +; CHECK-NEXT: entry: +; CHECK-NEXT: [[SQRT:%.*]] = call reassoc nnan ninf nsz double @llvm.sqrt.f64(double [[A]]) +; CHECK-NEXT: [[DIV:%.*]] = fdiv reassoc ninf arcp double 1.000000e+00, [[SQRT]] +; CHECK-NEXT: store double [[DIV]], ptr @x, align 8 +; CHECK-NEXT: [[MUL:%.*]] = fmul reassoc double [[DIV]], [[DIV]] +; CHECK-NEXT: store double [[MUL]], ptr @r1, align 8 +; CHECK-NEXT: [[DIV1:%.*]] = fdiv double [[A]], [[SQRT]] +; CHECK-NEXT: store double [[DIV1]], ptr @r2, align 8 +; CHECK-NEXT: ret void +; +entry: + %sqrt = call reassoc nnan nsz ninf double @llvm.sqrt.f64(double %a) + %div = fdiv reassoc arcp ninf double 1.000000e+00, %sqrt + store double %div, ptr @x + %mul = fmul reassoc double %div, %div + store double %mul, ptr @r1 + %div1 = fdiv double %a, %sqrt + store double %div1, ptr @r2 + ret void +} + +; div = -1/sqrt(a) +define void @negative_fdiv_val(double %a) { +; CHECK-LABEL: define void @negative_fdiv_val( +; CHECK-SAME: double [[A:%.*]]) { +; CHECK-NEXT: entry: +; CHECK-NEXT: [[SQRT1:%.*]] = call reassoc double @llvm.sqrt.f64(double [[A]]) +; CHECK-NEXT: [[TMP0:%.*]] = fdiv reassoc double 1.000000e+00, [[A]] +; CHECK-NEXT: [[TMP1:%.*]] = fneg reassoc double [[SQRT1]] +; CHECK-NEXT: [[DIV:%.*]] = fmul reassoc double [[TMP0]], [[TMP1]] +; CHECK-NEXT: store double [[DIV]], ptr @x, align 8 +; CHECK-NEXT: store double [[TMP0]], ptr @r1, align 8 +; CHECK-NEXT: store double [[SQRT1]], ptr @r2, align 8 +; CHECK-NEXT: ret void +; +entry: + %sqrt = call reassoc nnan nsz ninf double @llvm.sqrt.f64(double %a) + %div = fdiv reassoc arcp ninf double -1.000000e+00, %sqrt + store double %div, ptr @x + %mul = fmul reassoc double %div, %div + store double %mul, ptr @r1 + %div1 = fdiv reassoc double %a, %sqrt + store double %div1, ptr @r2 + ret void +} + +define void @fpmath_metadata_on_div1(double %a) { +; CHECK-LABEL: 
define void @fpmath_metadata_on_div1( +; CHECK-SAME: double [[A:%.*]]) { +; CHECK-NEXT: entry: +; CHECK-NEXT: [[SQRT1:%.*]] = call reassoc double @llvm.sqrt.f64(double [[A]]), !fpmath [[META0:![0-9]+]] +; CHECK-NEXT: [[TMP0:%.*]] = fdiv reassoc double 1.000000e+00, [[A]] +; CHECK-NEXT: [[DIV:%.*]] = fmul reassoc double [[TMP0]], [[SQRT1]] +; CHECK-NEXT: store double [[DIV]], ptr @x, align 8 +; CHECK-NEXT: store double [[TMP0]], ptr @r1, align 8 +; CHECK-NEXT: store double [[SQRT1]], ptr @r2, align 8 +; CHECK-NEXT: ret void +; +entry: + %sqrt = call reassoc nnan nsz ninf double @llvm.sqrt.f64(double %a) + %div = fdiv reassoc arcp ninf double 1.000000e+00, %sqrt + store double %div, ptr @x + %mul = fmul reassoc double %div, %div + store double %mul, ptr @r1 + %div1 = fdiv reassoc double %a, %sqrt, !fpmath !3 + store double %div1, ptr @r2 + ret void +} + +define void @fpmath_metadata_on_mul(double %a) { +; CHECK-LABEL: define void @fpmath_metadata_on_mul( +; CHECK-SAME: double [[A:%.*]]) { +; CHECK-NEXT: entry: +; CHECK-NEXT: [[SQRT1:%.*]] = call reassoc double @llvm.sqrt.f64(double [[A]]) +; CHECK-NEXT: [[TMP0:%.*]] = fdiv reassoc double 1.000000e+00, [[A]], !fpmath [[META1:![0-9]+]] +; CHECK-NEXT: [[DIV:%.*]] = fmul reassoc double [[TMP0]], [[SQRT1]] +; CHECK-NEXT: store double [[DIV]], ptr @x, align 8 +; CHECK-NEXT: store double [[TMP0]], ptr @r1, align 8 +; CHECK-NEXT: store double [[SQRT1]], ptr @r2, align 8 +; CHECK-NEXT: ret void +; +entry: + %sqrt = call reassoc nnan nsz ninf double @llvm.sqrt.f64(double %a) + %div = fdiv reassoc arcp ninf double 1.000000e+00, %sqrt + store double %div, ptr @x + %mul = fmul reassoc double %div, %div, !fpmath !2 + store double %mul, ptr @r1 + %div1 = fdiv reassoc double %a, %sqrt + store double %div1, ptr @r2 + ret void +} + +; FIXME: DIV in the result should get the fpmath metadata from %div. 
+define void @fpmath_metadata_on_div(double %a) { +; CHECK-LABEL: define void @fpmath_metadata_on_div( +; CHECK-SAME: double [[A:%.*]]) { +; CHECK-NEXT: entry: +; CHECK-NEXT: [[SQRT1:%.*]] = call reassoc double @llvm.sqrt.f64(double [[A]]) +; CHECK-NEXT: [[TMP0:%.*]] = fdiv reassoc double 1.000000e+00, [[A]] +; CHECK-NEXT: [[DIV:%.*]] = fmul reassoc double [[TMP0]], [[SQRT1]], !fpmath [[META2:![0-9]+]] +; CHECK-NEXT: store double [[DIV]], ptr @x, align 8 +; CHECK-NEXT: store double [[TMP0]], ptr @r1, align 8 +; CHECK-NEXT: store double [[SQRT1]], ptr @r2, align 8 +; CHECK-NEXT: ret void +; +entry: + %sqrt = call reassoc nnan nsz ninf double @llvm.sqrt.f64(double %a) + %div = fdiv reassoc arcp ninf double 1.000000e+00, %sqrt, !fpmath !1 + store double %div, ptr @x + %mul = fmul reassoc double %div, %div + store double %mul, ptr @r1 + %div1 = fdiv reassoc double %a, %sqrt + store double %div1, ptr @r2 + ret void +} + +define void @fpmath_metadata_on_all(double %a) { +; CHECK-LABEL: define void @fpmath_metadata_on_all( +; CHECK-SAME: double [[A:%.*]]) { +; CHECK-NEXT: entry: +; CHECK-NEXT: [[SQRT1:%.*]] = call reassoc double @llvm.sqrt.f64(double [[A]]), !fpmath [[META0]] +; CHECK-NEXT: [[TMP0:%.*]] = fdiv reassoc double 1.000000e+00, [[A]], !fpmath [[META1]] +; CHECK-NEXT: [[DIV:%.*]] = fmul reassoc double [[TMP0]], [[SQRT1]], !fpmath [[META2]] +; CHECK-NEXT: store double [[DIV]], ptr @x, align 8 +; CHECK-NEXT: store double [[TMP0]], ptr @r1, align 8 +; CHECK-NEXT: store double [[SQRT1]], ptr @r2, align 8 +; CHECK-NEXT: ret void +; +entry: + %sqrt = call reassoc nnan nsz ninf double @llvm.sqrt.f64(double %a), !fpmath !0 + %div = fdiv reassoc arcp ninf double 1.000000e+00, %sqrt, !fpmath !1 + store double %div, ptr @x + %mul = fmul reassoc double %div, %div, !fpmath !2 + store double %mul, ptr @r1 + %div1 = fdiv reassoc double %a, %sqrt, !fpmath !3 + store double %div1, ptr @r2 + ret void +} + +define void @vector_input(<2 x double> %a) { +; CHECK-LABEL: define void 
@vector_input( +; CHECK-SAME: <2 x double> [[A:%.*]]) { +; CHECK-NEXT: entry: +; CHECK-NEXT: [[SQRT1:%.*]] = call reassoc <2 x double> @llvm.sqrt.v2f64(<2 x double> [[A]]) +; CHECK-NEXT: [[TMP0:%.*]] = fdiv reassoc <2 x double> splat (double 1.000000e+00), [[A]] +; CHECK-NEXT: [[DIV:%.*]] = fmul reassoc <2 x double> [[TMP0]], [[SQRT1]] +; CHECK-NEXT: store <2 x double> [[DIV]], ptr @v, align 16 +; CHECK-NEXT: store <2 x double> [[TMP0]], ptr @v1, align 16 +; CHECK-NEXT: store <2 x double> [[SQRT1]], ptr @v2, align 16 +; CHECK-NEXT: ret void +; +entry: + %sqrt = call reassoc nnan nsz ninf <2 x double> @llvm.sqrt.v2f64(<2 x double> %a) + %div = fdiv reassoc arcp ninf <2 x double> splat (double 1.000000e+00), %sqrt + store <2 x double> %div, ptr @v + %mul = fmul reassoc <2 x double> %div, %div + store <2 x double> %mul, ptr @v1 + %div1 = fdiv reassoc <2 x double> %a, %sqrt + store <2 x double> %div1, ptr @v2 + ret void +} + +define void @strict_fp_metadata(double %a) { +; CHECK-LABEL: define void @strict_fp_metadata( +; CHECK-SAME: double [[A:%.*]]) { +; CHECK-NEXT: entry: +; CHECK-NEXT: [[CONV:%.*]] = call double @llvm.experimental.constrained.sitofp.f64.i32(i32 1, metadata !"round.dynamic", metadata !"fpexcept.strict") +; CHECK-NEXT: [[CALL:%.*]] = call double @llvm.sqrt.f64(double noundef [[A]]) +; CHECK-NEXT: [[DIV:%.*]] = call double @llvm.experimental.constrained.fdiv.f64(double [[CONV]], double [[CALL]], metadata !"round.dynamic", metadata !"fpexcept.strict") +; CHECK-NEXT: store double [[DIV]], ptr @x, align 8 +; CHECK-NEXT: [[MUL:%.*]] = call double @llvm.experimental.constrained.fmul.f64(double [[DIV]], double [[DIV]], metadata !"round.dynamic", metadata !"fpexcept.strict") +; CHECK-NEXT: store double [[MUL]], ptr @r1, align 8 +; CHECK-NEXT: [[DIV2:%.*]] = call double @llvm.experimental.constrained.fdiv.f64(double [[A]], double [[CALL]], metadata !"round.dynamic", metadata !"fpexcept.strict") +; CHECK-NEXT: store double [[DIV2]], ptr @r2, align 8 +; CHECK-NEXT: ret void +; +entry: + 
%conv = call double @llvm.experimental.constrained.sitofp.f64.i32(i32 1, metadata !"round.dynamic", metadata !"fpexcept.strict") + %call = call double @llvm.sqrt.f64(double noundef %a) + %div = call double @llvm.experimental.constrained.fdiv.f64(double %conv, double %call, metadata !"round.dynamic", metadata !"fpexcept.strict") + store double %div, ptr @x + %mul = call double @llvm.experimental.constrained.fmul.f64(double %div, double %div, metadata !"round.dynamic", metadata !"fpexcept.strict") + store double %mul, ptr @r1 + %div2 = call double @llvm.experimental.constrained.fdiv.f64(double %a, double %call, metadata !"round.dynamic", metadata !"fpexcept.strict") + store double %div2, ptr @r2 + ret void +} + +declare double @llvm.experimental.constrained.sitofp.f64.i32(i32, metadata, metadata) +declare double @llvm.experimental.constrained.fdiv.f64(double, double, metadata, metadata) +declare double @llvm.experimental.constrained.fmul.f64(double, double, metadata, metadata) +declare double @llvm.sqrt.f64(double) +declare <2 x double> @llvm.sqrt.v2f64(<2 x double>) + +!0 = !{float 2.5} +!1 = !{float 3.5} +!2 = !{float 4.5} +!3 = !{float 5.5} +; CHECK: [[META0]] = !{float 5.500000e+00} +; CHECK: [[META1]] = !{float 4.500000e+00} +; CHECK: [[META2]] = !{float 3.500000e+00} diff --git a/llvm/test/Transforms/InstCombine/loadstore-metadata.ll b/llvm/test/Transforms/InstCombine/loadstore-metadata.ll index 54649251e4cb1..5a977882504ce 100644 --- a/llvm/test/Transforms/InstCombine/loadstore-metadata.ll +++ b/llvm/test/Transforms/InstCombine/loadstore-metadata.ll @@ -186,21 +186,36 @@ entry: ret i32 %c } -; FIXME: Should preserve metadata on loads, except !noundef and !invariant.load. +; Preserve none-UB metadata on loads. 
define ptr @preserve_load_metadata_after_select_transform1(i1 %c, ptr dereferenceable(8) %a, ptr dereferenceable(8) %b) { ; CHECK-LABEL: @preserve_load_metadata_after_select_transform1( ; CHECK-NEXT: entry: -; CHECK-NEXT: [[B_VAL:%.*]] = load ptr, ptr [[B:%.*]], align 1 -; CHECK-NEXT: [[A_VAL:%.*]] = load ptr, ptr [[A:%.*]], align 1 +; CHECK-NEXT: [[B_VAL:%.*]] = load ptr, ptr [[B:%.*]], align 1, !nonnull [[META6]], !align [[META8]] +; CHECK-NEXT: [[A_VAL:%.*]] = load ptr, ptr [[A:%.*]], align 1, !nonnull [[META6]], !align [[META8]] ; CHECK-NEXT: [[L_SEL:%.*]] = select i1 [[C:%.*]], ptr [[B_VAL]], ptr [[A_VAL]] ; CHECK-NEXT: ret ptr [[L_SEL]] ; entry: %ptr.sel = select i1 %c, ptr %b, ptr %a - %l.sel = load ptr, ptr %ptr.sel, align 1, !tbaa !0, !llvm.access.group !7, !dereferenceable !9, !noundef !{}, !invariant.load !7 + %l.sel = load ptr, ptr %ptr.sel, align 1, !tbaa !0, !llvm.access.group !7, !dereferenceable !9, !noundef !{}, !invariant.load !7, !align !9, !nonnull !{} ret ptr %l.sel } +; Preserve none-UB metadata on loads. 
+define i32 @preserve_load_metadata_after_select_transform_range(i1 %c, ptr dereferenceable(8) %a, ptr dereferenceable(8) %b) { +; CHECK-LABEL: @preserve_load_metadata_after_select_transform_range( +; CHECK-NEXT: entry: +; CHECK-NEXT: [[B_VAL:%.*]] = load i32, ptr [[B:%.*]], align 1, !range [[RNG10:![0-9]+]] +; CHECK-NEXT: [[A_VAL:%.*]] = load i32, ptr [[A:%.*]], align 1, !range [[RNG10]] +; CHECK-NEXT: [[L_SEL:%.*]] = select i1 [[C:%.*]], i32 [[B_VAL]], i32 [[A_VAL]] +; CHECK-NEXT: ret i32 [[L_SEL]] +; +entry: + %ptr.sel = select i1 %c, ptr %b, ptr %a + %l.sel = load i32, ptr %ptr.sel, align 1, !tbaa !0, !llvm.access.group !7, !invariant.load !7, !noundef !{}, !range !6 + ret i32 %l.sel +} + define double @preserve_load_metadata_after_select_transform2(ptr %a, ptr %b) { ; CHECK-LABEL: @preserve_load_metadata_after_select_transform2( ; CHECK-NEXT: entry: @@ -279,7 +294,7 @@ define double @preserve_load_metadata_after_select_transform_metadata_missing_4( ; CHECK-LABEL: @preserve_load_metadata_after_select_transform_metadata_missing_4( ; CHECK-NEXT: entry: ; CHECK-NEXT: [[L_A:%.*]] = load double, ptr [[A:%.*]], align 8, !tbaa [[TBAA0]], !alias.scope [[META3]], !noalias [[META3]], !llvm.access.group [[META6]] -; CHECK-NEXT: [[L_B:%.*]] = load double, ptr [[B:%.*]], align 8, !tbaa [[TBAA0]], !alias.scope [[META10:![0-9]+]], !noalias [[META10]], !llvm.access.group [[ACC_GRP13:![0-9]+]] +; CHECK-NEXT: [[L_B:%.*]] = load double, ptr [[B:%.*]], align 8, !tbaa [[TBAA0]], !alias.scope [[META11:![0-9]+]], !noalias [[META11]], !llvm.access.group [[ACC_GRP14:![0-9]+]] ; CHECK-NEXT: [[CMP_I:%.*]] = fcmp fast olt double [[L_A]], [[L_B]] ; CHECK-NEXT: [[L_SEL:%.*]] = select i1 [[CMP_I]], double [[L_B]], double [[L_A]] ; CHECK-NEXT: ret double [[L_SEL]] @@ -322,8 +337,9 @@ entry: ; CHECK: [[META7]] = !{i32 1} ; CHECK: [[META8]] = !{i64 8} ; CHECK: [[ACC_GRP9]] = distinct !{} -; CHECK: [[META10]] = !{[[META11:![0-9]+]]} -; CHECK: [[META11]] = distinct !{[[META11]], 
[[META12:![0-9]+]]} -; CHECK: [[META12]] = distinct !{[[META12]]} -; CHECK: [[ACC_GRP13]] = distinct !{} +; CHECK: [[RNG10]] = !{i32 0, i32 42} +; CHECK: [[META11]] = !{[[META12:![0-9]+]]} +; CHECK: [[META12]] = distinct !{[[META12]], [[META13:![0-9]+]]} +; CHECK: [[META13]] = distinct !{[[META13]]} +; CHECK: [[ACC_GRP14]] = distinct !{} ;. diff --git a/llvm/test/Transforms/InstCombine/nsw.ll b/llvm/test/Transforms/InstCombine/nsw.ll index 329a47324f862..b00f2e58add78 100644 --- a/llvm/test/Transforms/InstCombine/nsw.ll +++ b/llvm/test/Transforms/InstCombine/nsw.ll @@ -415,3 +415,63 @@ define i8 @neg_nsw_mul_missing_nsw_on_mul(i8 %a1, i8 %a2, i8 %b) { %neg = sub nsw i8 0, %shl ret i8 %neg } + +; This could propagate nsw. + +define i16 @mul_nsw_reassoc_prop(i16 %x) { +; CHECK-LABEL: @mul_nsw_reassoc_prop( +; CHECK-NEXT: [[B:%.*]] = mul nsw i16 [[X:%.*]], 6 +; CHECK-NEXT: ret i16 [[B]] +; + %a = mul nsw i16 %x, 3 + %b = mul nsw i16 %a, 2 + ret i16 %b +} + +; This could propagate nsw. + +define i16 @mul_nsw_reassoc_prop_neg(i16 %x) { +; CHECK-LABEL: @mul_nsw_reassoc_prop_neg( +; CHECK-NEXT: [[B:%.*]] = mul nsw i16 [[X:%.*]], -2201 +; CHECK-NEXT: ret i16 [[B]] +; + %a = mul nsw i16 %x, -71 + %b = mul nsw i16 %a, 31 + ret i16 %b +} + +; Must not propagate nsw. + +define i16 @mul_nsw_reassoc_prop_no_nsw1(i16 %x) { +; CHECK-LABEL: @mul_nsw_reassoc_prop_no_nsw1( +; CHECK-NEXT: [[B:%.*]] = mul i16 [[X:%.*]], 6 +; CHECK-NEXT: ret i16 [[B]] +; + %a = mul i16 %x, 3 + %b = mul nsw i16 %a, 2 + ret i16 %b +} + +; Must not propagate nsw. + +define i16 @mul_nsw_reassoc_prop_no_nsw2(i16 %x) { +; CHECK-LABEL: @mul_nsw_reassoc_prop_no_nsw2( +; CHECK-NEXT: [[B:%.*]] = mul i16 [[X:%.*]], 6 +; CHECK-NEXT: ret i16 [[B]] +; + %a = mul nsw i16 %x, 3 + %b = mul i16 %a, 2 + ret i16 %b +} + +; Must not propagate nsw. 
+ +define i16 @mul_nsw_reassoc_prop_overflow(i16 %x) { +; CHECK-LABEL: @mul_nsw_reassoc_prop_overflow( +; CHECK-NEXT: [[B:%.*]] = mul i16 [[X:%.*]], -31777 +; CHECK-NEXT: ret i16 [[B]] +; + %a = mul nsw i16 %x, 1023 + %b = mul nsw i16 %a, 33 + ret i16 %b +} diff --git a/llvm/test/Transforms/InstSimplify/const-fold-nvvm-fmin-fmax.ll b/llvm/test/Transforms/InstSimplify/const-fold-nvvm-fmin-fmax.ll new file mode 100644 index 0000000000000..4ab6b3cf295bf --- /dev/null +++ b/llvm/test/Transforms/InstSimplify/const-fold-nvvm-fmin-fmax.ll @@ -0,0 +1,918 @@ +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 5 +; RUN: opt < %s -passes=instsimplify -march=nvptx64 --mcpu=sm_86 --mattr=+ptx72 -S | FileCheck %s + +; Check constant-folding for NVVM fmin fmax intrinsics + +;############################################################### +;# FMax(1.25, -2.0) # +;############################################################### + +define double @test_fmax_1_25_neg_2_d() { +; CHECK-LABEL: define double @test_fmax_1_25_neg_2_d() { +; CHECK-NEXT: ret double 1.250000e+00 +; + %res = call double @llvm.nvvm.fmax.d(double 1.25, double -2.0) + ret double %res +} + +define float @test_fmax_1_25_neg_2_f() { +; CHECK-LABEL: define float @test_fmax_1_25_neg_2_f() { +; CHECK-NEXT: ret float 1.250000e+00 +; + %res = call float @llvm.nvvm.fmax.f(float 1.25, float -2.0) + ret float %res +} + +define float @test_fmax_1_25_neg_2_ftz_f() { +; CHECK-LABEL: define float @test_fmax_1_25_neg_2_ftz_f() { +; CHECK-NEXT: ret float 1.250000e+00 +; + %res = call float @llvm.nvvm.fmax.ftz.f(float 1.25, float -2.0) + ret float %res +} + +define float @test_fmax_1_25_neg_2_ftz_nan_f() { +; CHECK-LABEL: define float @test_fmax_1_25_neg_2_ftz_nan_f() { +; CHECK-NEXT: ret float 1.250000e+00 +; + %res = call float @llvm.nvvm.fmax.ftz.nan.f(float 1.25, float -2.0) + ret float %res +} + +define float @test_fmax_1_25_neg_2_ftz_nan_xorsign_abs_f() { +; CHECK-LABEL: define float 
@test_fmax_1_25_neg_2_ftz_nan_xorsign_abs_f() { +; CHECK-NEXT: ret float -2.000000e+00 +; + %res = call float @llvm.nvvm.fmax.ftz.nan.xorsign.abs.f(float 1.25, float -2.0) + ret float %res +} + +define float @test_fmax_1_25_neg_2_ftz_xorsign_abs_f() { +; CHECK-LABEL: define float @test_fmax_1_25_neg_2_ftz_xorsign_abs_f() { +; CHECK-NEXT: ret float -2.000000e+00 +; + %res = call float @llvm.nvvm.fmax.ftz.xorsign.abs.f(float 1.25, float -2.0) + ret float %res +} + +define float @test_fmax_1_25_neg_2_nan_f() { +; CHECK-LABEL: define float @test_fmax_1_25_neg_2_nan_f() { +; CHECK-NEXT: ret float 1.250000e+00 +; + %res = call float @llvm.nvvm.fmax.nan.f(float 1.25, float -2.0) + ret float %res +} + +define float @test_fmax_1_25_neg_2_nan_xorsign_abs_f() { +; CHECK-LABEL: define float @test_fmax_1_25_neg_2_nan_xorsign_abs_f() { +; CHECK-NEXT: ret float -2.000000e+00 +; + %res = call float @llvm.nvvm.fmax.nan.xorsign.abs.f(float 1.25, float -2.0) + ret float %res +} + +define float @test_fmax_1_25_neg_2_xorsign_abs_f() { +; CHECK-LABEL: define float @test_fmax_1_25_neg_2_xorsign_abs_f() { +; CHECK-NEXT: ret float -2.000000e+00 +; + %res = call float @llvm.nvvm.fmax.xorsign.abs.f(float 1.25, float -2.0) + ret float %res +} + +;############################################################### +;# FMax(+Subnormal, 0.0) # +;############################################################### + +define double @test_fmax_pos_subnorm_zero_d() { +; CHECK-LABEL: define double @test_fmax_pos_subnorm_zero_d() { +; CHECK-NEXT: ret double 0x380FFFFFC0000000 +; + %res = call double @llvm.nvvm.fmax.d(double 0x380FFFFFC0000000, double 0.0) + ret double %res +} + +define float @test_fmax_pos_subnorm_zero_f() { +; CHECK-LABEL: define float @test_fmax_pos_subnorm_zero_f() { +; CHECK-NEXT: ret float 0x380FFFFFC0000000 +; + %res = call float @llvm.nvvm.fmax.f(float 0x380FFFFFC0000000, float 0.0) + ret float %res +} + +define float @test_fmax_pos_subnorm_zero_ftz_f() { +; CHECK-LABEL: define float 
@test_fmax_pos_subnorm_zero_ftz_f() { +; CHECK-NEXT: ret float 0.000000e+00 +; + %res = call float @llvm.nvvm.fmax.ftz.f(float 0x380FFFFFC0000000, float 0.0) + ret float %res +} + +define float @test_fmax_pos_subnorm_zero_ftz_nan_f() { +; CHECK-LABEL: define float @test_fmax_pos_subnorm_zero_ftz_nan_f() { +; CHECK-NEXT: ret float 0.000000e+00 +; + %res = call float @llvm.nvvm.fmax.ftz.nan.f(float 0x380FFFFFC0000000, float 0.0) + ret float %res +} + +define float @test_fmax_pos_subnorm_zero_ftz_nan_xorsign_abs_f() { +; CHECK-LABEL: define float @test_fmax_pos_subnorm_zero_ftz_nan_xorsign_abs_f() { +; CHECK-NEXT: ret float 0.000000e+00 +; + %res = call float @llvm.nvvm.fmax.ftz.nan.xorsign.abs.f(float 0x380FFFFFC0000000, float 0.0) + ret float %res +} + +define float @test_fmax_pos_subnorm_zero_ftz_xorsign_abs_f() { +; CHECK-LABEL: define float @test_fmax_pos_subnorm_zero_ftz_xorsign_abs_f() { +; CHECK-NEXT: ret float 0.000000e+00 +; + %res = call float @llvm.nvvm.fmax.ftz.xorsign.abs.f(float 0x380FFFFFC0000000, float 0.0) + ret float %res +} + +define float @test_fmax_pos_subnorm_zero_nan_f() { +; CHECK-LABEL: define float @test_fmax_pos_subnorm_zero_nan_f() { +; CHECK-NEXT: ret float 0x380FFFFFC0000000 +; + %res = call float @llvm.nvvm.fmax.nan.f(float 0x380FFFFFC0000000, float 0.0) + ret float %res +} + +define float @test_fmax_pos_subnorm_zero_nan_xorsign_abs_f() { +; CHECK-LABEL: define float @test_fmax_pos_subnorm_zero_nan_xorsign_abs_f() { +; CHECK-NEXT: ret float 0x380FFFFFC0000000 +; + %res = call float @llvm.nvvm.fmax.nan.xorsign.abs.f(float 0x380FFFFFC0000000, float 0.0) + ret float %res +} + +define float @test_fmax_pos_subnorm_zero_xorsign_abs_f() { +; CHECK-LABEL: define float @test_fmax_pos_subnorm_zero_xorsign_abs_f() { +; CHECK-NEXT: ret float 0x380FFFFFC0000000 +; + %res = call float @llvm.nvvm.fmax.xorsign.abs.f(float 0x380FFFFFC0000000, float 0.0) + ret float %res +} + +;############################################################### +;# 
FMax(+Subnormal, -Subnormal) # +;############################################################### + +define double @test_fmax_pos_subnorm_neg_subnorm_d() { +; CHECK-LABEL: define double @test_fmax_pos_subnorm_neg_subnorm_d() { +; CHECK-NEXT: ret double 0x380FFFFFC0000000 +; + %res = call double @llvm.nvvm.fmax.d(double 0x380FFFFFC0000000, double 0xB80FFFFFC0000000) + ret double %res +} + +define float @test_fmax_pos_subnorm_neg_subnorm_f() { +; CHECK-LABEL: define float @test_fmax_pos_subnorm_neg_subnorm_f() { +; CHECK-NEXT: ret float 0x380FFFFFC0000000 +; + %res = call float @llvm.nvvm.fmax.f(float 0x380FFFFFC0000000, float 0xB80FFFFFC0000000) + ret float %res +} + +define float @test_fmax_pos_subnorm_neg_subnorm_ftz_f() { +; CHECK-LABEL: define float @test_fmax_pos_subnorm_neg_subnorm_ftz_f() { +; CHECK-NEXT: ret float 0.000000e+00 +; + %res = call float @llvm.nvvm.fmax.ftz.f(float 0x380FFFFFC0000000, float 0xB80FFFFFC0000000) + ret float %res +} + +define float @test_fmax_pos_subnorm_neg_subnorm_ftz_nan_f() { +; CHECK-LABEL: define float @test_fmax_pos_subnorm_neg_subnorm_ftz_nan_f() { +; CHECK-NEXT: ret float 0.000000e+00 +; + %res = call float @llvm.nvvm.fmax.ftz.nan.f(float 0x380FFFFFC0000000, float 0xB80FFFFFC0000000) + ret float %res +} + +define float @test_fmax_pos_subnorm_neg_subnorm_ftz_nan_xorsign_abs_f() { +; CHECK-LABEL: define float @test_fmax_pos_subnorm_neg_subnorm_ftz_nan_xorsign_abs_f() { +; CHECK-NEXT: ret float -0.000000e+00 +; + %res = call float @llvm.nvvm.fmax.ftz.nan.xorsign.abs.f(float 0x380FFFFFC0000000, float 0xB80FFFFFC0000000) + ret float %res +} + +define float @test_fmax_pos_subnorm_neg_subnorm_ftz_xorsign_abs_f() { +; CHECK-LABEL: define float @test_fmax_pos_subnorm_neg_subnorm_ftz_xorsign_abs_f() { +; CHECK-NEXT: ret float -0.000000e+00 +; + %res = call float @llvm.nvvm.fmax.ftz.xorsign.abs.f(float 0x380FFFFFC0000000, float 0xB80FFFFFC0000000) + ret float %res +} + +define float @test_fmax_pos_subnorm_neg_subnorm_nan_f() { +; 
CHECK-LABEL: define float @test_fmax_pos_subnorm_neg_subnorm_nan_f() { +; CHECK-NEXT: ret float 0x380FFFFFC0000000 +; + %res = call float @llvm.nvvm.fmax.nan.f(float 0x380FFFFFC0000000, float 0xB80FFFFFC0000000) + ret float %res +} + +define float @test_fmax_pos_subnorm_neg_subnorm_nan_xorsign_abs_f() { +; CHECK-LABEL: define float @test_fmax_pos_subnorm_neg_subnorm_nan_xorsign_abs_f() { +; CHECK-NEXT: ret float 0xB80FFFFFC0000000 +; + %res = call float @llvm.nvvm.fmax.nan.xorsign.abs.f(float 0x380FFFFFC0000000, float 0xB80FFFFFC0000000) + ret float %res +} + +define float @test_fmax_pos_subnorm_neg_subnorm_xorsign_abs_f() { +; CHECK-LABEL: define float @test_fmax_pos_subnorm_neg_subnorm_xorsign_abs_f() { +; CHECK-NEXT: ret float 0xB80FFFFFC0000000 +; + %res = call float @llvm.nvvm.fmax.xorsign.abs.f(float 0x380FFFFFC0000000, float 0xB80FFFFFC0000000) + ret float %res +} + +;############################################################### +;# FMax(+Subnormal, NaN) # +;############################################################### + +define double @test_fmax_pos_subnorm_nan_d() { +; CHECK-LABEL: define double @test_fmax_pos_subnorm_nan_d() { +; CHECK-NEXT: ret double 0x380FFFFFC0000000 +; + %res = call double @llvm.nvvm.fmax.d(double 0x380FFFFFC0000000, double 0x7fff444400000000) + ret double %res +} + +define float @test_fmax_pos_subnorm_nan_f() { +; CHECK-LABEL: define float @test_fmax_pos_subnorm_nan_f() { +; CHECK-NEXT: ret float 0x380FFFFFC0000000 +; + %res = call float @llvm.nvvm.fmax.f(float 0x380FFFFFC0000000, float 0x7fff444400000000) + ret float %res +} + +define float @test_fmax_pos_subnorm_nan_ftz_f() { +; CHECK-LABEL: define float @test_fmax_pos_subnorm_nan_ftz_f() { +; CHECK-NEXT: ret float 0.000000e+00 +; + %res = call float @llvm.nvvm.fmax.ftz.f(float 0x380FFFFFC0000000, float 0x7fff444400000000) + ret float %res +} + +define float @test_fmax_pos_subnorm_nan_ftz_nan_f() { +; CHECK-LABEL: define float @test_fmax_pos_subnorm_nan_ftz_nan_f() { +; 
CHECK-NEXT: ret float 0x7FFFFFFFE0000000 +; + %res = call float @llvm.nvvm.fmax.ftz.nan.f(float 0x380FFFFFC0000000, float 0x7fff444400000000) + ret float %res +} + +define float @test_fmax_pos_subnorm_nan_ftz_nan_xorsign_abs_f() { +; CHECK-LABEL: define float @test_fmax_pos_subnorm_nan_ftz_nan_xorsign_abs_f() { +; CHECK-NEXT: ret float 0x7FFFFFFFE0000000 +; + %res = call float @llvm.nvvm.fmax.ftz.nan.xorsign.abs.f(float 0x380FFFFFC0000000, float 0x7fff444400000000) + ret float %res +} + +define float @test_fmax_pos_subnorm_nan_ftz_xorsign_abs_f() { +; CHECK-LABEL: define float @test_fmax_pos_subnorm_nan_ftz_xorsign_abs_f() { +; CHECK-NEXT: ret float 0.000000e+00 +; + %res = call float @llvm.nvvm.fmax.ftz.xorsign.abs.f(float 0x380FFFFFC0000000, float 0x7fff444400000000) + ret float %res +} + +define float @test_fmax_pos_subnorm_nan_nan_f() { +; CHECK-LABEL: define float @test_fmax_pos_subnorm_nan_nan_f() { +; CHECK-NEXT: ret float 0x7FFFFFFFE0000000 +; + %res = call float @llvm.nvvm.fmax.nan.f(float 0x380FFFFFC0000000, float 0x7fff444400000000) + ret float %res +} + +define float @test_fmax_pos_subnorm_nan_nan_xorsign_abs_f() { +; CHECK-LABEL: define float @test_fmax_pos_subnorm_nan_nan_xorsign_abs_f() { +; CHECK-NEXT: ret float 0x7FFFFFFFE0000000 +; + %res = call float @llvm.nvvm.fmax.nan.xorsign.abs.f(float 0x380FFFFFC0000000, float 0x7fff444400000000) + ret float %res +} + +define float @test_fmax_pos_subnorm_nan_xorsign_abs_f() { +; CHECK-LABEL: define float @test_fmax_pos_subnorm_nan_xorsign_abs_f() { +; CHECK-NEXT: ret float 0x380FFFFFC0000000 +; + %res = call float @llvm.nvvm.fmax.xorsign.abs.f(float 0x380FFFFFC0000000, float 0x7fff444400000000) + ret float %res +} + +;############################################################### +;# FMax(+Subnormal, undef) # +;############################################################### + +define double @test_fmax_subnorm_undef_d() { +; CHECK-LABEL: define double @test_fmax_subnorm_undef_d() { +; CHECK-NEXT: ret double 
0x380FFFFFC0000000 +; + %res = call double @llvm.nvvm.fmax.d(double 0x380FFFFFC0000000, double undef) + ret double %res +} + +define float @test_fmax_subnorm_undef_f() { +; CHECK-LABEL: define float @test_fmax_subnorm_undef_f() { +; CHECK-NEXT: ret float 0x380FFFFFC0000000 +; + %res = call float @llvm.nvvm.fmax.f(float 0x380FFFFFC0000000, float undef) + ret float %res +} + +define float @test_fmax_subnorm_undef_ftz_f() { +; CHECK-LABEL: define float @test_fmax_subnorm_undef_ftz_f() { +; CHECK-NEXT: ret float 0.000000e+00 +; + %res = call float @llvm.nvvm.fmax.ftz.f(float 0x380FFFFFC0000000, float undef) + ret float %res +} + +define float @test_fmax_subnorm_undef_ftz_nan_f() { +; CHECK-LABEL: define float @test_fmax_subnorm_undef_ftz_nan_f() { +; CHECK-NEXT: ret float 0.000000e+00 +; + %res = call float @llvm.nvvm.fmax.ftz.nan.f(float 0x380FFFFFC0000000, float undef) + ret float %res +} + +define float @test_fmax_subnorm_undef_ftz_nan_xorsign_abs_f() { +; CHECK-LABEL: define float @test_fmax_subnorm_undef_ftz_nan_xorsign_abs_f() { +; CHECK-NEXT: ret float 0.000000e+00 +; + %res = call float @llvm.nvvm.fmax.ftz.nan.xorsign.abs.f(float 0x380FFFFFC0000000, float undef) + ret float %res +} + +define float @test_fmax_subnorm_undef_ftz_xorsign_abs_f() { +; CHECK-LABEL: define float @test_fmax_subnorm_undef_ftz_xorsign_abs_f() { +; CHECK-NEXT: ret float 0.000000e+00 +; + %res = call float @llvm.nvvm.fmax.ftz.xorsign.abs.f(float 0x380FFFFFC0000000, float undef) + ret float %res +} + +define float @test_fmax_subnorm_undef_nan_f() { +; CHECK-LABEL: define float @test_fmax_subnorm_undef_nan_f() { +; CHECK-NEXT: ret float 0x380FFFFFC0000000 +; + %res = call float @llvm.nvvm.fmax.nan.f(float 0x380FFFFFC0000000, float undef) + ret float %res +} + +define float @test_fmax_subnorm_undef_nan_xorsign_abs_f() { +; CHECK-LABEL: define float @test_fmax_subnorm_undef_nan_xorsign_abs_f() { +; CHECK-NEXT: ret float 0x380FFFFFC0000000 +; + %res = call float 
@llvm.nvvm.fmax.nan.xorsign.abs.f(float 0x380FFFFFC0000000, float undef) + ret float %res +} + +define float @test_fmax_subnorm_undef_xorsign_abs_f() { +; CHECK-LABEL: define float @test_fmax_subnorm_undef_xorsign_abs_f() { +; CHECK-NEXT: ret float 0x380FFFFFC0000000 +; + %res = call float @llvm.nvvm.fmax.xorsign.abs.f(float 0x380FFFFFC0000000, float undef) + ret float %res +} + +;############################################################### +;# FMax(NaN, undef) # +;############################################################### +; Ensure we canonicalize the NaNs for f32 + +define double @test_fmax_nan_undef_d() { +; CHECK-LABEL: define double @test_fmax_nan_undef_d() { +; CHECK-NEXT: ret double 0x7FF4444400000000 +; + %res = call double @llvm.nvvm.fmax.d(double 0x7ff4444400000000, double undef) + ret double %res +} + +define float @test_fmax_nan_undef_f() { +; CHECK-LABEL: define float @test_fmax_nan_undef_f() { +; CHECK-NEXT: ret float 0x7FFFFFFFE0000000 +; + %res = call float @llvm.nvvm.fmax.f(float 0x7fff444400000000, float undef) + ret float %res +} + +define float @test_fmax_nan_undef_ftz_f() { +; CHECK-LABEL: define float @test_fmax_nan_undef_ftz_f() { +; CHECK-NEXT: ret float 0x7FFFFFFFE0000000 +; + %res = call float @llvm.nvvm.fmax.ftz.f(float 0x7fff444400000000, float undef) + ret float %res +} + +define float @test_fmax_nan_undef_ftz_nan_f() { +; CHECK-LABEL: define float @test_fmax_nan_undef_ftz_nan_f() { +; CHECK-NEXT: ret float 0x7FFFFFFFE0000000 +; + %res = call float @llvm.nvvm.fmax.ftz.nan.f(float 0x7fff444400000000, float undef) + ret float %res +} + +define float @test_fmax_nan_undef_ftz_nan_xorsign_abs_f() { +; CHECK-LABEL: define float @test_fmax_nan_undef_ftz_nan_xorsign_abs_f() { +; CHECK-NEXT: ret float 0x7FFFFFFFE0000000 +; + %res = call float @llvm.nvvm.fmax.ftz.nan.xorsign.abs.f(float 0x7fff444400000000, float undef) + ret float %res +} + +define float @test_fmax_nan_undef_ftz_xorsign_abs_f() { +; CHECK-LABEL: define float 
@test_fmax_nan_undef_ftz_xorsign_abs_f() { +; CHECK-NEXT: ret float 0x7FFFFFFFE0000000 +; + %res = call float @llvm.nvvm.fmax.ftz.xorsign.abs.f(float 0x7ffff4ff00000000, float undef) + ret float %res +} + +define float @test_fmax_nan_undef_nan_f() { +; CHECK-LABEL: define float @test_fmax_nan_undef_nan_f() { +; CHECK-NEXT: ret float 0x7FFFFFFFE0000000 +; + %res = call float @llvm.nvvm.fmax.nan.f(float 0x7fff444400000000, float undef) + ret float %res +} + +define float @test_fmax_nan_undef_nan_xorsign_abs_f() { +; CHECK-LABEL: define float @test_fmax_nan_undef_nan_xorsign_abs_f() { +; CHECK-NEXT: ret float 0x7FFFFFFFE0000000 +; + %res = call float @llvm.nvvm.fmax.nan.xorsign.abs.f(float 0x7fff444400000000, float undef) + ret float %res +} + +define float @test_fmax_nan_undef_xorsign_abs_f() { +; CHECK-LABEL: define float @test_fmax_nan_undef_xorsign_abs_f() { +; CHECK-NEXT: ret float 0x7FFFFFFFE0000000 +; + %res = call float @llvm.nvvm.fmax.xorsign.abs.f(float 0x7fff444400000000, float undef) + ret float %res +} + +;############################################################### +;# FMin(1.25, -2.0) # +;############################################################### + +define double @test_fmin_1_25_neg_2_d() { +; CHECK-LABEL: define double @test_fmin_1_25_neg_2_d() { +; CHECK-NEXT: ret double -2.000000e+00 +; + %res = call double @llvm.nvvm.fmin.d(double 1.25, double -2.0) + ret double %res +} + +define float @test_fmin_1_25_neg_2_f() { +; CHECK-LABEL: define float @test_fmin_1_25_neg_2_f() { +; CHECK-NEXT: ret float -2.000000e+00 +; + %res = call float @llvm.nvvm.fmin.f(float 1.25, float -2.0) + ret float %res +} + +define float @test_fmin_1_25_neg_2_ftz_f() { +; CHECK-LABEL: define float @test_fmin_1_25_neg_2_ftz_f() { +; CHECK-NEXT: ret float -2.000000e+00 +; + %res = call float @llvm.nvvm.fmin.ftz.f(float 1.25, float -2.0) + ret float %res +} + +define float @test_fmin_1_25_neg_2_ftz_nan_f() { +; CHECK-LABEL: define float @test_fmin_1_25_neg_2_ftz_nan_f() { +; 
CHECK-NEXT: ret float -2.000000e+00 +; + %res = call float @llvm.nvvm.fmin.ftz.nan.f(float 1.25, float -2.0) + ret float %res +} + +define float @test_fmin_1_25_neg_2_ftz_nan_xorsign_abs_f() { +; CHECK-LABEL: define float @test_fmin_1_25_neg_2_ftz_nan_xorsign_abs_f() { +; CHECK-NEXT: ret float -1.250000e+00 +; + %res = call float @llvm.nvvm.fmin.ftz.nan.xorsign.abs.f(float 1.25, float -2.0) + ret float %res +} + +define float @test_fmin_1_25_neg_2_ftz_xorsign_abs_f() { +; CHECK-LABEL: define float @test_fmin_1_25_neg_2_ftz_xorsign_abs_f() { +; CHECK-NEXT: ret float -1.250000e+00 +; + %res = call float @llvm.nvvm.fmin.ftz.xorsign.abs.f(float 1.25, float -2.0) + ret float %res +} + +define float @test_fmin_1_25_neg_2_nan_f() { +; CHECK-LABEL: define float @test_fmin_1_25_neg_2_nan_f() { +; CHECK-NEXT: ret float -2.000000e+00 +; + %res = call float @llvm.nvvm.fmin.nan.f(float 1.25, float -2.0) + ret float %res +} + +define float @test_fmin_1_25_neg_2_nan_xorsign_abs_f() { +; CHECK-LABEL: define float @test_fmin_1_25_neg_2_nan_xorsign_abs_f() { +; CHECK-NEXT: ret float -1.250000e+00 +; + %res = call float @llvm.nvvm.fmin.nan.xorsign.abs.f(float 1.25, float -2.0) + ret float %res +} + +define float @test_fmin_1_25_neg_2_xorsign_abs_f() { +; CHECK-LABEL: define float @test_fmin_1_25_neg_2_xorsign_abs_f() { +; CHECK-NEXT: ret float -1.250000e+00 +; + %res = call float @llvm.nvvm.fmin.xorsign.abs.f(float 1.25, float -2.0) + ret float %res +} + +;############################################################### +;# FMin(-Subnormal, 0.0) # +;############################################################### + +define double @test_fmin_neg_subnorm_zero_d() { +; CHECK-LABEL: define double @test_fmin_neg_subnorm_zero_d() { +; CHECK-NEXT: ret double 0xB80FFFFFC0000000 +; + %res = call double @llvm.nvvm.fmin.d(double 0xB80FFFFFC0000000, double 0.0) + ret double %res +} + +define float @test_fmin_neg_subnorm_zero_f() { +; CHECK-LABEL: define float @test_fmin_neg_subnorm_zero_f() { +; 
CHECK-NEXT: ret float 0xB80FFFFFC0000000 +; + %res = call float @llvm.nvvm.fmin.f(float 0xB80FFFFFC0000000, float 0.0) + ret float %res +} + +define float @test_fmin_neg_subnorm_zero_ftz_f() { +; CHECK-LABEL: define float @test_fmin_neg_subnorm_zero_ftz_f() { +; CHECK-NEXT: ret float -0.000000e+00 +; + %res = call float @llvm.nvvm.fmin.ftz.f(float 0xB80FFFFFC0000000, float 0.0) + ret float %res +} + +define float @test_fmin_neg_subnorm_zero_ftz_nan_f() { +; CHECK-LABEL: define float @test_fmin_neg_subnorm_zero_ftz_nan_f() { +; CHECK-NEXT: ret float -0.000000e+00 +; + %res = call float @llvm.nvvm.fmin.ftz.nan.f(float 0xB80FFFFFC0000000, float 0.0) + ret float %res +} + +define float @test_fmin_neg_subnorm_zero_ftz_nan_xorsign_abs_f() { +; CHECK-LABEL: define float @test_fmin_neg_subnorm_zero_ftz_nan_xorsign_abs_f() { +; CHECK-NEXT: ret float -0.000000e+00 +; + %res = call float @llvm.nvvm.fmin.ftz.nan.xorsign.abs.f(float 0xB80FFFFFC0000000, float 0.0) + ret float %res +} + +define float @test_fmin_neg_subnorm_zero_ftz_xorsign_abs_f() { +; CHECK-LABEL: define float @test_fmin_neg_subnorm_zero_ftz_xorsign_abs_f() { +; CHECK-NEXT: ret float -0.000000e+00 +; + %res = call float @llvm.nvvm.fmin.ftz.xorsign.abs.f(float 0xB80FFFFFC0000000, float 0.0) + ret float %res +} + +define float @test_fmin_neg_subnorm_zero_nan_f() { +; CHECK-LABEL: define float @test_fmin_neg_subnorm_zero_nan_f() { +; CHECK-NEXT: ret float 0xB80FFFFFC0000000 +; + %res = call float @llvm.nvvm.fmin.nan.f(float 0xB80FFFFFC0000000, float 0.0) + ret float %res +} + +define float @test_fmin_neg_subnorm_zero_nan_xorsign_abs_f() { +; CHECK-LABEL: define float @test_fmin_neg_subnorm_zero_nan_xorsign_abs_f() { +; CHECK-NEXT: ret float -0.000000e+00 +; + %res = call float @llvm.nvvm.fmin.nan.xorsign.abs.f(float 0xB80FFFFFC0000000, float 0.0) + ret float %res +} + +define float @test_fmin_neg_subnorm_zero_xorsign_abs_f() { +; CHECK-LABEL: define float @test_fmin_neg_subnorm_zero_xorsign_abs_f() { +; CHECK-NEXT: 
ret float -0.000000e+00 +; + %res = call float @llvm.nvvm.fmin.xorsign.abs.f(float 0xB80FFFFFC0000000, float 0.0) + ret float %res +} + +;############################################################### +;# FMin(+Subnormal, -Subnormal) # +;############################################################### + +define double @test_fmin_pos_subnorm_neg_subnorm_d() { +; CHECK-LABEL: define double @test_fmin_pos_subnorm_neg_subnorm_d() { +; CHECK-NEXT: ret double 0xB80FFFFFC0000000 +; + %res = call double @llvm.nvvm.fmin.d(double 0x380FFFFFC0000000, double 0xB80FFFFFC0000000) + ret double %res +} + +define float @test_fmin_pos_subnorm_neg_subnorm_f() { +; CHECK-LABEL: define float @test_fmin_pos_subnorm_neg_subnorm_f() { +; CHECK-NEXT: ret float 0xB80FFFFFC0000000 +; + %res = call float @llvm.nvvm.fmin.f(float 0x380FFFFFC0000000, float 0xB80FFFFFC0000000) + ret float %res +} + +define float @test_fmin_pos_subnorm_neg_subnorm_ftz_f() { +; CHECK-LABEL: define float @test_fmin_pos_subnorm_neg_subnorm_ftz_f() { +; CHECK-NEXT: ret float -0.000000e+00 +; + %res = call float @llvm.nvvm.fmin.ftz.f(float 0x380FFFFFC0000000, float 0xB80FFFFFC0000000) + ret float %res +} + +define float @test_fmin_pos_subnorm_neg_subnorm_ftz_nan_f() { +; CHECK-LABEL: define float @test_fmin_pos_subnorm_neg_subnorm_ftz_nan_f() { +; CHECK-NEXT: ret float -0.000000e+00 +; + %res = call float @llvm.nvvm.fmin.ftz.nan.f(float 0x380FFFFFC0000000, float 0xB80FFFFFC0000000) + ret float %res +} + +define float @test_fmin_pos_subnorm_neg_subnorm_ftz_nan_xorsign_abs_f() { +; CHECK-LABEL: define float @test_fmin_pos_subnorm_neg_subnorm_ftz_nan_xorsign_abs_f() { +; CHECK-NEXT: ret float -0.000000e+00 +; + %res = call float @llvm.nvvm.fmin.ftz.nan.xorsign.abs.f(float 0x380FFFFFC0000000, float 0xB80FFFFFC0000000) + ret float %res +} + +define float @test_fmin_pos_subnorm_neg_subnorm_ftz_xorsign_abs_f() { +; CHECK-LABEL: define float @test_fmin_pos_subnorm_neg_subnorm_ftz_xorsign_abs_f() { +; CHECK-NEXT: ret float 
-0.000000e+00 +; + %res = call float @llvm.nvvm.fmin.ftz.xorsign.abs.f(float 0x380FFFFFC0000000, float 0xB80FFFFFC0000000) + ret float %res +} + +define float @test_fmin_pos_subnorm_neg_subnorm_nan_f() { +; CHECK-LABEL: define float @test_fmin_pos_subnorm_neg_subnorm_nan_f() { +; CHECK-NEXT: ret float 0xB80FFFFFC0000000 +; + %res = call float @llvm.nvvm.fmin.nan.f(float 0x380FFFFFC0000000, float 0xB80FFFFFC0000000) + ret float %res +} + +define float @test_fmin_pos_subnorm_neg_subnorm_nan_xorsign_abs_f() { +; CHECK-LABEL: define float @test_fmin_pos_subnorm_neg_subnorm_nan_xorsign_abs_f() { +; CHECK-NEXT: ret float 0xB80FFFFFC0000000 +; + %res = call float @llvm.nvvm.fmin.nan.xorsign.abs.f(float 0x380FFFFFC0000000, float 0xB80FFFFFC0000000) + ret float %res +} + +define float @test_fmin_pos_subnorm_neg_subnorm_xorsign_abs_f() { +; CHECK-LABEL: define float @test_fmin_pos_subnorm_neg_subnorm_xorsign_abs_f() { +; CHECK-NEXT: ret float 0xB80FFFFFC0000000 +; + %res = call float @llvm.nvvm.fmin.xorsign.abs.f(float 0x380FFFFFC0000000, float 0xB80FFFFFC0000000) + ret float %res +} + +;############################################################### +;# FMin(+Subnormal, NaN) # +;############################################################### + +define double @test_fmin_pos_subnorm_nan_d() { +; CHECK-LABEL: define double @test_fmin_pos_subnorm_nan_d() { +; CHECK-NEXT: ret double 0x380FFFFFC0000000 +; + %res = call double @llvm.nvvm.fmin.d(double 0x380FFFFFC0000000, double 0x7fff444400000000) + ret double %res +} + +define float @test_fmin_pos_subnorm_nan_f() { +; CHECK-LABEL: define float @test_fmin_pos_subnorm_nan_f() { +; CHECK-NEXT: ret float 0x380FFFFFC0000000 +; + %res = call float @llvm.nvvm.fmin.f(float 0x380FFFFFC0000000, float 0x7fff444400000000) + ret float %res +} + +define float @test_fmin_pos_subnorm_nan_ftz_f() { +; CHECK-LABEL: define float @test_fmin_pos_subnorm_nan_ftz_f() { +; CHECK-NEXT: ret float 0.000000e+00 +; + %res = call float 
@llvm.nvvm.fmin.ftz.f(float 0x380FFFFFC0000000, float 0x7fff444400000000) + ret float %res +} + +define float @test_fmin_pos_subnorm_nan_ftz_nan_f() { +; CHECK-LABEL: define float @test_fmin_pos_subnorm_nan_ftz_nan_f() { +; CHECK-NEXT: ret float 0x7FFFFFFFE0000000 +; + %res = call float @llvm.nvvm.fmin.ftz.nan.f(float 0x380FFFFFC0000000, float 0x7fff444400000000) + ret float %res +} + +define float @test_fmin_pos_subnorm_nan_ftz_nan_xorsign_abs_f() { +; CHECK-LABEL: define float @test_fmin_pos_subnorm_nan_ftz_nan_xorsign_abs_f() { +; CHECK-NEXT: ret float 0x7FFFFFFFE0000000 +; + %res = call float @llvm.nvvm.fmin.ftz.nan.xorsign.abs.f(float 0x380FFFFFC0000000, float 0x7fff444400000000) + ret float %res +} + +define float @test_fmin_pos_subnorm_nan_ftz_xorsign_abs_f() { +; CHECK-LABEL: define float @test_fmin_pos_subnorm_nan_ftz_xorsign_abs_f() { +; CHECK-NEXT: ret float 0.000000e+00 +; + %res = call float @llvm.nvvm.fmin.ftz.xorsign.abs.f(float 0x380FFFFFC0000000, float 0x7fff444400000000) + ret float %res +} + +define float @test_fmin_pos_subnorm_nan_nan_f() { +; CHECK-LABEL: define float @test_fmin_pos_subnorm_nan_nan_f() { +; CHECK-NEXT: ret float 0x7FFFFFFFE0000000 +; + %res = call float @llvm.nvvm.fmin.nan.f(float 0x380FFFFFC0000000, float 0x7fff444400000000) + ret float %res +} + +define float @test_fmin_pos_subnorm_nan_nan_xorsign_abs_f() { +; CHECK-LABEL: define float @test_fmin_pos_subnorm_nan_nan_xorsign_abs_f() { +; CHECK-NEXT: ret float 0x7FFFFFFFE0000000 +; + %res = call float @llvm.nvvm.fmin.nan.xorsign.abs.f(float 0x380FFFFFC0000000, float 0x7fff444400000000) + ret float %res +} + +define float @test_fmin_pos_subnorm_nan_xorsign_abs_f() { +; CHECK-LABEL: define float @test_fmin_pos_subnorm_nan_xorsign_abs_f() { +; CHECK-NEXT: ret float 0x380FFFFFC0000000 +; + %res = call float @llvm.nvvm.fmin.xorsign.abs.f(float 0x380FFFFFC0000000, float 0x7fff444400000000) + ret float %res +} + +;############################################################### +;# 
FMin(+Subnormal, undef) # +;############################################################### + +define double @test_fmin_subnorm_undef_d() { +; CHECK-LABEL: define double @test_fmin_subnorm_undef_d() { +; CHECK-NEXT: ret double 0x380FFFFFC0000000 +; + %res = call double @llvm.nvvm.fmin.d(double 0x380FFFFFC0000000, double undef) + ret double %res +} + +define float @test_fmin_subnorm_undef_f() { +; CHECK-LABEL: define float @test_fmin_subnorm_undef_f() { +; CHECK-NEXT: ret float 0x380FFFFFC0000000 +; + %res = call float @llvm.nvvm.fmin.f(float 0x380FFFFFC0000000, float undef) + ret float %res +} + +define float @test_fmin_subnorm_undef_ftz_f() { +; CHECK-LABEL: define float @test_fmin_subnorm_undef_ftz_f() { +; CHECK-NEXT: ret float 0.000000e+00 +; + %res = call float @llvm.nvvm.fmin.ftz.f(float 0x380FFFFFC0000000, float undef) + ret float %res +} + +define float @test_fmin_subnorm_undef_ftz_nan_f() { +; CHECK-LABEL: define float @test_fmin_subnorm_undef_ftz_nan_f() { +; CHECK-NEXT: ret float 0.000000e+00 +; + %res = call float @llvm.nvvm.fmin.ftz.nan.f(float 0x380FFFFFC0000000, float undef) + ret float %res +} + +define float @test_fmin_subnorm_undef_ftz_nan_xorsign_abs_f() { +; CHECK-LABEL: define float @test_fmin_subnorm_undef_ftz_nan_xorsign_abs_f() { +; CHECK-NEXT: ret float 0.000000e+00 +; + %res = call float @llvm.nvvm.fmin.ftz.nan.xorsign.abs.f(float 0x380FFFFFC0000000, float undef) + ret float %res +} + +define float @test_fmin_subnorm_undef_ftz_xorsign_abs_f() { +; CHECK-LABEL: define float @test_fmin_subnorm_undef_ftz_xorsign_abs_f() { +; CHECK-NEXT: ret float 0.000000e+00 +; + %res = call float @llvm.nvvm.fmin.ftz.xorsign.abs.f(float 0x380FFFFFC0000000, float undef) + ret float %res +} + +define float @test_fmin_subnorm_undef_nan_f() { +; CHECK-LABEL: define float @test_fmin_subnorm_undef_nan_f() { +; CHECK-NEXT: ret float 0x380FFFFFC0000000 +; + %res = call float @llvm.nvvm.fmin.nan.f(float 0x380FFFFFC0000000, float undef) + ret float %res +} + +define 
float @test_fmin_subnorm_undef_nan_xorsign_abs_f() { +; CHECK-LABEL: define float @test_fmin_subnorm_undef_nan_xorsign_abs_f() { +; CHECK-NEXT: ret float 0x380FFFFFC0000000 +; + %res = call float @llvm.nvvm.fmin.nan.xorsign.abs.f(float 0x380FFFFFC0000000, float undef) + ret float %res +} + +define float @test_fmin_subnorm_undef_xorsign_abs_f() { +; CHECK-LABEL: define float @test_fmin_subnorm_undef_xorsign_abs_f() { +; CHECK-NEXT: ret float 0x380FFFFFC0000000 +; + %res = call float @llvm.nvvm.fmin.xorsign.abs.f(float 0x380FFFFFC0000000, float undef) + ret float %res +} + +;############################################################### +;# FMin(NaN, undef) # +;############################################################### +; Ensure we canonicalize the NaNs for f32 + +define double @test_fmin_nan_undef_d() { +; CHECK-LABEL: define double @test_fmin_nan_undef_d() { +; CHECK-NEXT: ret double 0x7FF4444400000000 +; + %res = call double @llvm.nvvm.fmin.d(double 0x7ff4444400000000, double undef) + ret double %res +} + +define float @test_fmin_nan_undef_f() { +; CHECK-LABEL: define float @test_fmin_nan_undef_f() { +; CHECK-NEXT: ret float 0x7FFFFFFFE0000000 +; + %res = call float @llvm.nvvm.fmin.f(float 0x7fff444400000000, float undef) + ret float %res +} + +define float @test_fmin_nan_undef_ftz_f() { +; CHECK-LABEL: define float @test_fmin_nan_undef_ftz_f() { +; CHECK-NEXT: ret float 0x7FFFFFFFE0000000 +; + %res = call float @llvm.nvvm.fmin.ftz.f(float 0x7fff444400000000, float undef) + ret float %res +} + +define float @test_fmin_nan_undef_ftz_nan_f() { +; CHECK-LABEL: define float @test_fmin_nan_undef_ftz_nan_f() { +; CHECK-NEXT: ret float 0x7FFFFFFFE0000000 +; + %res = call float @llvm.nvvm.fmin.ftz.nan.f(float 0x7fff444400000000, float undef) + ret float %res +} + +define float @test_fmin_nan_undef_ftz_nan_xorsign_abs_f() { +; CHECK-LABEL: define float @test_fmin_nan_undef_ftz_nan_xorsign_abs_f() { +; CHECK-NEXT: ret float 0x7FFFFFFFE0000000 +; + %res = call float 
@llvm.nvvm.fmin.ftz.nan.xorsign.abs.f(float 0x7fff444400000000, float undef) + ret float %res +} + +define float @test_fmin_nan_undef_ftz_xorsign_abs_f() { +; CHECK-LABEL: define float @test_fmin_nan_undef_ftz_xorsign_abs_f() { +; CHECK-NEXT: ret float 0x7FFFFFFFE0000000 +; + %res = call float @llvm.nvvm.fmin.ftz.xorsign.abs.f(float 0x7ffff4ff00000000, float undef) + ret float %res +} + +define float @test_fmin_nan_undef_nan_f() { +; CHECK-LABEL: define float @test_fmin_nan_undef_nan_f() { +; CHECK-NEXT: ret float 0x7FFFFFFFE0000000 +; + %res = call float @llvm.nvvm.fmin.nan.f(float 0x7fff444400000000, float undef) + ret float %res +} + +define float @test_fmin_nan_undef_nan_xorsign_abs_f() { +; CHECK-LABEL: define float @test_fmin_nan_undef_nan_xorsign_abs_f() { +; CHECK-NEXT: ret float 0x7FFFFFFFE0000000 +; + %res = call float @llvm.nvvm.fmin.nan.xorsign.abs.f(float 0x7fff444400000000, float undef) + ret float %res +} + +define float @test_fmin_nan_undef_xorsign_abs_f() { +; CHECK-LABEL: define float @test_fmin_nan_undef_xorsign_abs_f() { +; CHECK-NEXT: ret float 0x7FFFFFFFE0000000 +; + %res = call float @llvm.nvvm.fmin.xorsign.abs.f(float 0x7fff444400000000, float undef) + ret float %res +} diff --git a/llvm/test/Transforms/LICM/hoist-alloc.ll b/llvm/test/Transforms/LICM/hoist-alloc.ll index 76047ec8c2438..fe4f03713926f 100644 --- a/llvm/test/Transforms/LICM/hoist-alloc.ll +++ b/llvm/test/Transforms/LICM/hoist-alloc.ll @@ -1,6 +1,6 @@ ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py -; RUN: opt -S -passes=licm -use-dereferenceable-at-point-semantics=0 < %s | FileCheck %s -; RUN: opt -S -passes=licm -use-dereferenceable-at-point-semantics=1 < %s | FileCheck %s +; RUN: opt -S -passes=licm -use-dereferenceable-at-point-semantics=false < %s | FileCheck %s +; RUN: opt -S -passes=licm -use-dereferenceable-at-point-semantics < %s | FileCheck %s target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128" target triple = 
"x86_64-unknown-linux-gnu" diff --git a/llvm/test/Transforms/LoopInterchange/gh54176-scalar-deps.ll b/llvm/test/Transforms/LoopInterchange/gh54176-scalar-deps.ll index b338365566898..bc9f16fbe58d6 100644 --- a/llvm/test/Transforms/LoopInterchange/gh54176-scalar-deps.ll +++ b/llvm/test/Transforms/LoopInterchange/gh54176-scalar-deps.ll @@ -1,5 +1,5 @@ -; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 5 -; RUN: opt < %s -passes=loop-interchange -S | FileCheck %s +; RUN: opt < %s -passes=loop-interchange -pass-remarks-output=%t -disable-output +; RUN: FileCheck -input-file %t %s @f = dso_local local_unnamed_addr global [4 x [9 x i32]] [[9 x i32] [i32 5, i32 3, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0], [9 x i32] zeroinitializer, [9 x i32] zeroinitializer, [9 x i32] zeroinitializer], align 4 @g = common dso_local local_unnamed_addr global i32 0, align 4 @@ -23,54 +23,18 @@ ; return g; ; } ; +define dso_local i32 @test1(i1 %cond) { +; ; FIXME: if there's an output dependency inside the loop and Src doesn't ; dominate Dst, we should not interchange. Thus, this currently miscompiles. 
; -define dso_local i32 @test1(i1 %cond) { -; CHECK-LABEL: define dso_local i32 @test1( -; CHECK-SAME: i1 [[COND:%.*]]) { -; CHECK-NEXT: [[FOR_PREHEADER:.*:]] -; CHECK-NEXT: br label %[[INNERLOOP_PREHEADER:.*]] -; CHECK: [[OUTERLOOP_PREHEADER:.*]]: -; CHECK-NEXT: br label %[[OUTERLOOP:.*]] -; CHECK: [[OUTERLOOP]]: -; CHECK-NEXT: [[I:%.*]] = phi i64 [ [[INDVARS_IV_NEXT21_I:%.*]], %[[FOR_LATCH:.*]] ], [ 0, %[[OUTERLOOP_PREHEADER]] ] -; CHECK-NEXT: br label %[[INNERLOOP_SPLIT:.*]] -; CHECK: [[INNERLOOP_PREHEADER]]: -; CHECK-NEXT: br label %[[INNERLOOP:.*]] -; CHECK: [[INNERLOOP]]: -; CHECK-NEXT: [[J:%.*]] = phi i64 [ [[TMP0:%.*]], %[[IF_END_SPLIT:.*]] ], [ 0, %[[INNERLOOP_PREHEADER]] ] -; CHECK-NEXT: br label %[[OUTERLOOP_PREHEADER]] -; CHECK: [[INNERLOOP_SPLIT]]: -; CHECK-NEXT: [[ARRAYIDX6_I:%.*]] = getelementptr inbounds [4 x [9 x i32]], ptr @f, i64 0, i64 [[J]], i64 [[I]] -; CHECK-NEXT: [[I1:%.*]] = load i32, ptr [[ARRAYIDX6_I]], align 4 -; CHECK-NEXT: [[TOBOOL_I:%.*]] = icmp eq i32 [[I1]], 0 -; CHECK-NEXT: br i1 [[TOBOOL_I]], label %[[LAND_END:.*]], label %[[LAND_RHS:.*]] -; CHECK: [[LAND_RHS]]: -; CHECK-NEXT: store i32 3, ptr @g, align 4 -; CHECK-NEXT: br label %[[LAND_END]] -; CHECK: [[LAND_END]]: -; CHECK-NEXT: br i1 [[COND]], label %[[IF_END:.*]], label %[[IF_THEN:.*]] -; CHECK: [[IF_THEN]]: -; CHECK-NEXT: [[I2:%.*]] = load i32, ptr @g, align 4 -; CHECK-NEXT: [[INC_I:%.*]] = add i32 [[I2]], 1 -; CHECK-NEXT: store i32 [[INC_I]], ptr @g, align 4 -; CHECK-NEXT: br label %[[IF_END]] -; CHECK: [[IF_END]]: -; CHECK-NEXT: [[J_NEXT:%.*]] = add nuw nsw i64 [[J]], 1 -; CHECK-NEXT: [[EXITCOND_I:%.*]] = icmp eq i64 [[J_NEXT]], 3 -; CHECK-NEXT: br label %[[FOR_LATCH]] -; CHECK: [[IF_END_SPLIT]]: -; CHECK-NEXT: [[TMP0]] = add nuw nsw i64 [[J]], 1 -; CHECK-NEXT: [[TMP1:%.*]] = icmp eq i64 [[TMP0]], 3 -; CHECK-NEXT: br i1 [[TMP1]], label %[[EXIT:.*]], label %[[INNERLOOP]] -; CHECK: [[FOR_LATCH]]: -; CHECK-NEXT: [[INDVARS_IV_NEXT21_I]] = add nsw i64 [[I]], 1 -; CHECK-NEXT: 
[[CMP_I:%.*]] = icmp slt i64 [[I]], 2 -; CHECK-NEXT: br i1 [[CMP_I]], label %[[OUTERLOOP]], label %[[IF_END_SPLIT]] -; CHECK: [[EXIT]]: -; CHECK-NEXT: [[I3:%.*]] = load i32, ptr @g, align 4 -; CHECK-NEXT: ret i32 [[I3]] +; CHECK: --- !Passed +; CHECK-NEXT: Pass: loop-interchange +; CHECK-NEXT: Name: Interchanged +; CHECK-NEXT: Function: test1 +; CHECK-NEXT: Args: +; CHECK-NEXT: - String: Loop interchanged with enclosing loop. +; CHECK-NEXT: ... ; for.preheader: br label %outerloop @@ -133,54 +97,18 @@ exit: ; return g; ; } ; +define dso_local i32 @test2(i1 %cond) { +; ; FIXME: if there's an output dependency inside the loop and Src doesn't ; dominate Dst, we should not interchange. Thus, this currently miscompiles. ; -define dso_local i32 @test2(i1 %cond) { -; CHECK-LABEL: define dso_local i32 @test2( -; CHECK-SAME: i1 [[COND:%.*]]) { -; CHECK-NEXT: [[FOR_PREHEADER:.*:]] -; CHECK-NEXT: br label %[[INNERLOOP_PREHEADER:.*]] -; CHECK: [[OUTERLOOP_PREHEADER:.*]]: -; CHECK-NEXT: br label %[[OUTERLOOP:.*]] -; CHECK: [[OUTERLOOP]]: -; CHECK-NEXT: [[I:%.*]] = phi i64 [ [[INDVARS_IV_NEXT21_I:%.*]], %[[FOR_LATCH:.*]] ], [ 0, %[[OUTERLOOP_PREHEADER]] ] -; CHECK-NEXT: br label %[[INNERLOOP_SPLIT:.*]] -; CHECK: [[INNERLOOP_PREHEADER]]: -; CHECK-NEXT: br label %[[INNERLOOP:.*]] -; CHECK: [[INNERLOOP]]: -; CHECK-NEXT: [[J:%.*]] = phi i64 [ [[TMP0:%.*]], %[[IF_END_SPLIT:.*]] ], [ 0, %[[INNERLOOP_PREHEADER]] ] -; CHECK-NEXT: br label %[[OUTERLOOP_PREHEADER]] -; CHECK: [[INNERLOOP_SPLIT]]: -; CHECK-NEXT: [[ARRAYIDX6_I:%.*]] = getelementptr inbounds [4 x [9 x i32]], ptr @f, i64 0, i64 [[J]], i64 [[I]] -; CHECK-NEXT: [[I1:%.*]] = load i32, ptr [[ARRAYIDX6_I]], align 4 -; CHECK-NEXT: [[TOBOOL_I:%.*]] = icmp eq i32 [[I1]], 0 -; CHECK-NEXT: store i32 3, ptr @g, align 4 -; CHECK-NEXT: br i1 [[TOBOOL_I]], label %[[LAND_END:.*]], label %[[LAND_RHS:.*]] -; CHECK: [[LAND_RHS]]: -; CHECK-NEXT: br label %[[LAND_END]] -; CHECK: [[LAND_END]]: -; CHECK-NEXT: br i1 [[COND]], label %[[IF_END:.*]], 
label %[[IF_THEN:.*]] -; CHECK: [[IF_THEN]]: -; CHECK-NEXT: [[I2:%.*]] = load i32, ptr @g, align 4 -; CHECK-NEXT: [[INC_I:%.*]] = add i32 [[I2]], 1 -; CHECK-NEXT: store i32 [[INC_I]], ptr @g, align 4 -; CHECK-NEXT: br label %[[IF_END]] -; CHECK: [[IF_END]]: -; CHECK-NEXT: [[J_NEXT:%.*]] = add nuw nsw i64 [[J]], 1 -; CHECK-NEXT: [[EXITCOND_I:%.*]] = icmp eq i64 [[J_NEXT]], 3 -; CHECK-NEXT: br label %[[FOR_LATCH]] -; CHECK: [[IF_END_SPLIT]]: -; CHECK-NEXT: [[TMP0]] = add nuw nsw i64 [[J]], 1 -; CHECK-NEXT: [[TMP1:%.*]] = icmp eq i64 [[TMP0]], 3 -; CHECK-NEXT: br i1 [[TMP1]], label %[[EXIT:.*]], label %[[INNERLOOP]] -; CHECK: [[FOR_LATCH]]: -; CHECK-NEXT: [[INDVARS_IV_NEXT21_I]] = add nsw i64 [[I]], 1 -; CHECK-NEXT: [[CMP_I:%.*]] = icmp slt i64 [[I]], 2 -; CHECK-NEXT: br i1 [[CMP_I]], label %[[OUTERLOOP]], label %[[IF_END_SPLIT]] -; CHECK: [[EXIT]]: -; CHECK-NEXT: [[I3:%.*]] = load i32, ptr @g, align 4 -; CHECK-NEXT: ret i32 [[I3]] +; CHECK: --- !Passed +; CHECK-NEXT: Pass: loop-interchange +; CHECK-NEXT: Name: Interchanged +; CHECK-NEXT: Function: test2 +; CHECK-NEXT: Args: +; CHECK-NEXT: - String: Loop interchanged with enclosing loop. +; CHECK-NEXT: ... 
; for.preheader: br label %outerloop diff --git a/llvm/test/Transforms/LoopInterchange/innermost-latch-uses-values-in-middle-header.ll b/llvm/test/Transforms/LoopInterchange/innermost-latch-uses-values-in-middle-header.ll index bad84224d445a..230f7dc2bcfad 100644 --- a/llvm/test/Transforms/LoopInterchange/innermost-latch-uses-values-in-middle-header.ll +++ b/llvm/test/Transforms/LoopInterchange/innermost-latch-uses-values-in-middle-header.ll @@ -1,5 +1,5 @@ -; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 5 -; RUN: opt < %s -passes=loop-interchange -verify-dom-info -verify-loop-info -S 2>&1 | FileCheck %s +; RUN: opt < %s -passes=loop-interchange -verify-dom-info -verify-loop-info -pass-remarks-output=%t -disable-output +; RUN: FileCheck -input-file %t %s @a = common global i32 0, align 4 @d = common dso_local local_unnamed_addr global [1 x [6 x i32]] zeroinitializer, align 4 @@ -9,53 +9,29 @@ ; values defined in the new innermost loop not available in the exiting block of ; the entire loop nest. 
; -define void @innermost_latch_uses_values_in_middle_header() { -; CHECK-LABEL: define void @innermost_latch_uses_values_in_middle_header() { -; CHECK-NEXT: [[ENTRY:.*]]: -; CHECK-NEXT: [[TMP0:%.*]] = load i32, ptr @a, align 4 -; CHECK-NEXT: [[B:%.*]] = add i32 80, 1 -; CHECK-NEXT: br label %[[OUTERMOST_HEADER:.*]] -; CHECK: [[OUTERMOST_HEADER]]: -; CHECK-NEXT: [[INDVAR_OUTERMOST:%.*]] = phi i32 [ 10, %[[ENTRY]] ], [ [[INDVAR_OUTERMOST_NEXT:%.*]], %[[OUTERMOST_LATCH:.*]] ] -; CHECK-NEXT: [[TOBOOL71_I:%.*]] = icmp eq i32 [[TMP0]], 0 -; CHECK-NEXT: br i1 [[TOBOOL71_I]], label %[[INNERMOST_HEADER_PREHEADER:.*]], label %[[OUTERMOST_LATCH]] -; CHECK: [[MIDDLE_HEADER_PREHEADER:.*]]: -; CHECK-NEXT: br label %[[MIDDLE_HEADER:.*]] -; CHECK: [[MIDDLE_HEADER]]: -; CHECK-NEXT: [[INDVAR_MIDDLE:%.*]] = phi i64 [ [[INDVAR_MIDDLE_NEXT:%.*]], %[[MIDDLE_LATCH:.*]] ], [ 4, %[[MIDDLE_HEADER_PREHEADER]] ] -; CHECK-NEXT: [[INDVAR_MIDDLE_WIDE:%.*]] = zext i32 [[B]] to i64 -; CHECK-NEXT: br label %[[INNERMOST_BODY:.*]] -; CHECK: [[INNERMOST_HEADER_PREHEADER]]: -; CHECK-NEXT: br label %[[INNERMOST_HEADER:.*]] -; CHECK: [[INNERMOST_HEADER]]: -; CHECK-NEXT: [[INDVAR_INNERMOST:%.*]] = phi i64 [ [[TMP1:%.*]], %[[INNERMOST_LATCH_SPLIT:.*]] ], [ 4, %[[INNERMOST_HEADER_PREHEADER]] ] -; CHECK-NEXT: br label %[[MIDDLE_HEADER_PREHEADER]] -; CHECK: [[INNERMOST_BODY]]: -; CHECK-NEXT: [[ARRAYIDX9_I:%.*]] = getelementptr inbounds [1 x [6 x i32]], ptr @d, i64 0, i64 [[INDVAR_INNERMOST]], i64 [[INDVAR_MIDDLE]] -; CHECK-NEXT: store i32 0, ptr [[ARRAYIDX9_I]], align 4 -; CHECK-NEXT: br label %[[INNERMOST_LATCH:.*]] -; CHECK: [[INNERMOST_LATCH]]: -; CHECK-NEXT: [[INDVAR_INNERMOST_NEXT:%.*]] = add nsw i64 [[INDVAR_INNERMOST]], 1 -; CHECK-NEXT: [[TOBOOL5_I:%.*]] = icmp eq i64 [[INDVAR_INNERMOST_NEXT]], [[INDVAR_MIDDLE_WIDE]] -; CHECK-NEXT: br label %[[MIDDLE_LATCH]] -; CHECK: [[INNERMOST_LATCH_SPLIT]]: -; CHECK-NEXT: [[INDVAR_MIDDLE_WIDE_LCSSA:%.*]] = phi i64 [ [[INDVAR_MIDDLE_WIDE]], %[[MIDDLE_LATCH]] ] -; 
CHECK-NEXT: [[TMP1]] = add nsw i64 [[INDVAR_INNERMOST]], 1 -; CHECK-NEXT: [[TMP2:%.*]] = icmp eq i64 [[TMP1]], [[INDVAR_MIDDLE_WIDE_LCSSA]] -; CHECK-NEXT: br i1 [[TMP2]], label %[[OUTERMOST_LATCH_LOOPEXIT:.*]], label %[[INNERMOST_HEADER]] -; CHECK: [[MIDDLE_LATCH]]: -; CHECK-NEXT: [[INDVAR_MIDDLE_NEXT]] = add nsw i64 [[INDVAR_MIDDLE]], -1 -; CHECK-NEXT: [[TOBOOL2_I:%.*]] = icmp eq i64 [[INDVAR_MIDDLE_NEXT]], 0 -; CHECK-NEXT: br i1 [[TOBOOL2_I]], label %[[INNERMOST_LATCH_SPLIT]], label %[[MIDDLE_HEADER]] -; CHECK: [[OUTERMOST_LATCH_LOOPEXIT]]: -; CHECK-NEXT: br label %[[OUTERMOST_LATCH]] -; CHECK: [[OUTERMOST_LATCH]]: -; CHECK-NEXT: [[INDVAR_OUTERMOST_NEXT]] = add nsw i32 [[INDVAR_OUTERMOST]], -5 -; CHECK-NEXT: [[TOBOOL_I:%.*]] = icmp eq i32 [[INDVAR_OUTERMOST_NEXT]], 0 -; CHECK-NEXT: br i1 [[TOBOOL_I]], label %[[OUTERMOST_EXIT:.*]], label %[[OUTERMOST_HEADER]] -; CHECK: [[OUTERMOST_EXIT]]: -; CHECK-NEXT: ret void +; CHECK: --- !Passed +; CHECK: Pass: loop-interchange +; CHECK: Name: Interchanged +; CHECK: Function: innermost_latch_uses_values_in_middle_header +; CHECK: Args: +; CHECK: - String: Loop interchanged with enclosing loop. +; CHECK: ... +; CHECK: --- !Missed +; CHECK: Pass: loop-interchange +; CHECK: Name: UnsupportedInnerLatchPHI +; CHECK: Function: innermost_latch_uses_values_in_middle_header +; CHECK: Args: +; CHECK: - String: Cannot interchange loops because unsupported PHI nodes found in inner loop latch. +; CHECK: ... +; CHECK: --- !Missed +; CHECK: Pass: loop-interchange +; CHECK: Name: UnsupportedExitPHI +; CHECK: Function: innermost_latch_uses_values_in_middle_header +; CHECK: Args: +; CHECK: - String: Found unsupported PHI node in loop exit. +; CHECK: ... 
; +define void @innermost_latch_uses_values_in_middle_header() { entry: %0 = load i32, ptr @a, align 4 %b = add i32 80, 1 diff --git a/llvm/test/Transforms/LoopInterchange/interchange-flow-dep-outer.ll b/llvm/test/Transforms/LoopInterchange/interchange-flow-dep-outer.ll index 6daf61a4ec007..a208c1f46a705 100644 --- a/llvm/test/Transforms/LoopInterchange/interchange-flow-dep-outer.ll +++ b/llvm/test/Transforms/LoopInterchange/interchange-flow-dep-outer.ll @@ -1,5 +1,5 @@ -; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 5 -; RUN: opt < %s -passes=loop-interchange -cache-line-size=64 -verify-dom-info -verify-loop-info -S 2>&1 | FileCheck %s +; RUN: opt < %s -passes=loop-interchange -cache-line-size=64 -verify-dom-info -verify-loop-info -pass-remarks-output=%t -disable-output +; RUN: FileCheck -input-file %t %s target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128" @@ -24,53 +24,29 @@ target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128" ; So, loops InnerLoopId = 2 and OuterLoopId = 1 should be interchanged, ; but not InnerLoopId = 1 and OuterLoopId = 0. ; +; CHECK: --- !Passed +; CHECK-NEXT: Pass: loop-interchange +; CHECK-NEXT: Name: Interchanged +; CHECK-NEXT: Function: interchange_09 +; CHECK-NEXT: Args: +; CHECK-NEXT: - String: Loop interchanged with enclosing loop. +; CHECK-NEXT: ... +; CHECK-NEXT: --- !Missed +; CHECK-NEXT: Pass: loop-interchange +; CHECK-NEXT: Name: NotTightlyNested +; CHECK-NEXT: Function: interchange_09 +; CHECK-NEXT: Args: +; CHECK-NEXT: - String: Cannot interchange loops because they are not tightly nested. +; CHECK-NEXT: ... +; CHECK-NEXT: --- !Missed +; CHECK-NEXT: Pass: loop-interchange +; CHECK-NEXT: Name: InterchangeNotProfitable +; CHECK-NEXT: Function: interchange_09 +; CHECK-NEXT: Args: +; CHECK-NEXT: - String: Interchanging loops is not considered to improve cache locality nor vectorization. +; CHECK-NEXT: ... 
+ define void @interchange_09(i32 %k) { -; CHECK-LABEL: define void @interchange_09( -; CHECK-SAME: i32 [[K:%.*]]) { -; CHECK-NEXT: [[ENTRY:.*]]: -; CHECK-NEXT: br label %[[FOR_BODY:.*]] -; CHECK: [[FOR_COND_CLEANUP:.*]]: -; CHECK-NEXT: ret void -; CHECK: [[FOR_BODY]]: -; CHECK-NEXT: [[INDVARS_IV45:%.*]] = phi i64 [ 0, %[[ENTRY]] ], [ [[INDVARS_IV_NEXT46:%.*]], %[[FOR_COND_CLEANUP4:.*]] ] -; CHECK-NEXT: [[CALL:%.*]] = call double @fn1() -; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds [100 x double], ptr @T, i64 0, i64 [[INDVARS_IV45]] -; CHECK-NEXT: store double [[CALL]], ptr [[ARRAYIDX]], align 8 -; CHECK-NEXT: br label %[[FOR_BODY9_PREHEADER:.*]] -; CHECK: [[FOR_COND6_PREHEADER_PREHEADER:.*]]: -; CHECK-NEXT: br label %[[FOR_COND6_PREHEADER:.*]] -; CHECK: [[FOR_COND6_PREHEADER]]: -; CHECK-NEXT: [[INDVARS_IV42:%.*]] = phi i64 [ [[INDVARS_IV_NEXT43:%.*]], %[[FOR_COND_CLEANUP8:.*]] ], [ 0, %[[FOR_COND6_PREHEADER_PREHEADER]] ] -; CHECK-NEXT: br label %[[FOR_BODY9_SPLIT1:.*]] -; CHECK: [[FOR_BODY9_PREHEADER]]: -; CHECK-NEXT: br label %[[FOR_BODY9:.*]] -; CHECK: [[FOR_COND_CLEANUP4]]: -; CHECK-NEXT: [[TMP:%.*]] = load double, ptr [[ARRAYIDX]], align 8 -; CHECK-NEXT: call void @fn2(double [[TMP]]) -; CHECK-NEXT: [[INDVARS_IV_NEXT46]] = add nuw nsw i64 [[INDVARS_IV45]], 1 -; CHECK-NEXT: [[EXITCOND47:%.*]] = icmp ne i64 [[INDVARS_IV_NEXT46]], 100 -; CHECK-NEXT: br i1 [[EXITCOND47]], label %[[FOR_BODY]], label %[[FOR_COND_CLEANUP]] -; CHECK: [[FOR_COND_CLEANUP8]]: -; CHECK-NEXT: [[INDVARS_IV_NEXT43]] = add nuw nsw i64 [[INDVARS_IV42]], 1 -; CHECK-NEXT: [[EXITCOND44:%.*]] = icmp ne i64 [[INDVARS_IV_NEXT43]], 1000 -; CHECK-NEXT: br i1 [[EXITCOND44]], label %[[FOR_COND6_PREHEADER]], label %[[FOR_BODY9_SPLIT:.*]] -; CHECK: [[FOR_BODY9]]: -; CHECK-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ [[TMP0:%.*]], %[[FOR_BODY9_SPLIT]] ], [ 1, %[[FOR_BODY9_PREHEADER]] ] -; CHECK-NEXT: br label %[[FOR_COND6_PREHEADER_PREHEADER]] -; CHECK: [[FOR_BODY9_SPLIT1]]: -; CHECK-NEXT: 
[[ARRAYIDX13:%.*]] = getelementptr inbounds [1000 x [1000 x i32]], ptr @Arr, i64 0, i64 [[INDVARS_IV]], i64 [[INDVARS_IV42]] -; CHECK-NEXT: [[T1:%.*]] = load i32, ptr [[ARRAYIDX13]], align 4 -; CHECK-NEXT: [[T2:%.*]] = trunc i64 [[INDVARS_IV45]] to i32 -; CHECK-NEXT: [[ADD:%.*]] = add nsw i32 [[T1]], [[T2]] -; CHECK-NEXT: store i32 [[ADD]], ptr [[ARRAYIDX13]], align 4 -; CHECK-NEXT: [[INDVARS_IV_NEXT:%.*]] = add nuw nsw i64 [[INDVARS_IV]], 1 -; CHECK-NEXT: [[EXITCOND:%.*]] = icmp ne i64 [[INDVARS_IV_NEXT]], 1000 -; CHECK-NEXT: br label %[[FOR_COND_CLEANUP8]] -; CHECK: [[FOR_BODY9_SPLIT]]: -; CHECK-NEXT: [[TMP0]] = add nuw nsw i64 [[INDVARS_IV]], 1 -; CHECK-NEXT: [[TMP1:%.*]] = icmp ne i64 [[TMP0]], 1000 -; CHECK-NEXT: br i1 [[TMP1]], label %[[FOR_BODY9]], label %[[FOR_COND_CLEANUP4]] -; entry: br label %for.body diff --git a/llvm/test/Transforms/LoopInterchange/outer-header-jump-to-inner-latch.ll b/llvm/test/Transforms/LoopInterchange/outer-header-jump-to-inner-latch.ll index 6db95c09b175f..aaf8b1daf0414 100644 --- a/llvm/test/Transforms/LoopInterchange/outer-header-jump-to-inner-latch.ll +++ b/llvm/test/Transforms/LoopInterchange/outer-header-jump-to-inner-latch.ll @@ -1,4 +1,5 @@ -; RUN: opt -passes=loop-interchange -cache-line-size=64 -verify-dom-info -verify-loop-info -verify-loop-lcssa -S %s | FileCheck %s +; RUN: opt -passes=loop-interchange -cache-line-size=64 -verify-dom-info -verify-loop-info -verify-loop-lcssa %s -pass-remarks-output=%t -disable-output +; RUN: FileCheck -input-file %t %s @b = global [3 x [5 x [8 x i16]]] [[5 x [8 x i16]] zeroinitializer, [5 x [8 x i16]] [[8 x i16] zeroinitializer, [8 x i16] [i16 0, i16 0, i16 0, i16 6, i16 1, i16 6, i16 0, i16 0], [8 x i16] zeroinitializer, [8 x i16] zeroinitializer, [8 x i16] zeroinitializer], [5 x [8 x i16]] zeroinitializer], align 2 @a = common global i32 0, align 4 @@ -19,47 +20,16 @@ ;; a |= b[d][d][c + 5]; ;; } ;; } - +; +; CHECK: --- !Passed +; CHECK-NEXT: Pass: loop-interchange +; CHECK-NEXT: 
Name: Interchanged +; CHECK-NEXT: Function: test1 +; CHECK-NEXT: Args: +; CHECK-NEXT: - String: Loop interchanged with enclosing loop. +; CHECK-NEXT: ... +; define void @test1() { -;CHECK-LABEL: @test1( -;CHECK: entry: -;CHECK-NEXT: br label [[FOR_COND1_PREHEADER:%.*]] -;CHECK: for.body.preheader: -;CHECK-NEXT: br label [[FOR_BODY:%.*]] -;CHECK: for.body: -;CHECK-NEXT: [[INDVARS_IV22:%.*]] = phi i64 [ [[INDVARS_IV_NEXT23:%.*]], [[FOR_INC8:%.*]] ], [ 0, [[FOR_BODY_PREHEADER:%.*]] ] -;CHECK-NEXT: [[TOBOOL:%.*]] = icmp eq i64 [[INDVARS_IV22:%.*]], 0 -;CHECK-NEXT: br i1 [[TOBOOL]], label [[FOR_BODY3_SPLIT1:%.*]], label [[FOR_BODY3_SPLIT:%.*]] -;CHECK: for.cond1.preheader: -;CHECK-NEXT: br label [[FOR_BODY3:%.*]] -;CHECK: for.body3: -;CHECK-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ 0, [[FOR_COND1_PREHEADER]] ], [ %3, [[FOR_BODY3_SPLIT]] ] -;CHECK-NEXT: br label [[FOR_BODY_PREHEADER]] -;CHECK: for.body3.split1: -;CHECK-NEXT: [[TMP0:%.*]] = add nuw nsw i64 [[INDVARS_IV22]], 5 -;CHECK-NEXT: [[ARRAYIDX7:%.*]] = getelementptr inbounds [3 x [5 x [8 x i16]]], ptr @b, i64 0, i64 [[INDVARS_IV]], i64 [[INDVARS_IV]], i64 [[TMP0]] -;CHECK-NEXT: [[TMP1:%.*]] = load i16, ptr [[ARRAYIDX7]] -;CHECK-NEXT: [[CONV:%.*]] = sext i16 [[TMP1]] to i32 -;CHECK-NEXT: [[TMP2:%.*]] = load i32, ptr @a -;CHECK-NEXT: [[TMP_OR:%.*]] = or i32 [[TMP2]], [[CONV]] -;CHECK-NEXT: store i32 [[TMP_OR]], ptr @a -;CHECK-NEXT: [[INDVARS_IV_NEXT:%.*]] = add nuw nsw i64 [[INDVARS_IV]], 1 -;CHECK-NEXT: [[EXITCOND:%.*]] = icmp ne i64 [[INDVARS_IV_NEXT]], 3 -;CHECK-NEXT: br label [[FOR_INC8_LOOPEXIT:%.*]] -;CHECK: for.body3.split: -;CHECK-NEXT: [[TMP3:%.*]] = add nuw nsw i64 [[INDVARS_IV]], 1 -;CHECK-NEXT: [[TMP4:%.*]] = icmp ne i64 [[TMP3]], 3 -;CHECK-NEXT: br i1 %4, label [[FOR_BODY3]], label [[FOR_END10:%.*]] -;CHECK: for.inc8.loopexit: -;CHECK-NEXT: br label [[FOR_INC8]] -;CHECK: for.inc8: -;CHECK-NEXT: [[INDVARS_IV_NEXT23]] = add nuw nsw i64 [[INDVARS_IV22]], 1 -;CHECK-NEXT: [[EXITCOND25:%.*]] = icmp ne i64 
[[INDVARS_IV_NEXT23]], 3 -;CHECK-NEXT: br i1 [[EXITCOND25]], label [[FOR_BODY]], label [[FOR_BODY3_SPLIT]] -;CHECK: for.end10: -;CHECK-NEXT: [[TMP5:%.*]] = load i32, ptr @a -;CHECK-NEXT: ret void - entry: br label %for.body @@ -100,6 +70,7 @@ for.end10: ; preds = %for.inc8 ; Triply nested loop ; The innermost and the middle loop are interchanged. ; C test case: +; ;; a; ;; d[][6]; ;; void test2() { @@ -116,50 +87,16 @@ for.end10: ; preds = %for.inc8 ;; } ;; } ;; } - -define void @test2() { -; CHECK-LABEL: @test2( -; CHECK-NEXT: entry: -; CHECK-NEXT: br label [[OUTERMOST_HEADER:%.*]] -; CHECK: outermost.header: -; CHECK-NEXT: [[INDVAR_OUTERMOST:%.*]] = phi i32 [ 10, [[ENTRY:%.*]] ], [ [[INDVAR_OUTERMOST_NEXT:%.*]], [[OUTERMOST_LATCH:%.*]] ] -; CHECK-NEXT: [[TMP0:%.*]] = load i32, ptr @a, align 4 -; CHECK-NEXT: [[TOBOOL71_I:%.*]] = icmp eq i32 [[TMP0]], 0 -; CHECK-NEXT: br label [[INNERMOST_PREHEADER:%.*]] -; CHECK: middle.header.preheader: -; CHECK-NEXT: br label [[MIDDLE_HEADER:%.*]] -; CHECK: middle.header: -; CHECK-NEXT: [[INDVAR_MIDDLE:%.*]] = phi i64 [ [[INDVAR_MIDDLE_NEXT:%.*]], [[MIDDLE_LATCH:%.*]] ], [ 4, [[MIDDLE_HEADER_PREHEADER:%.*]] ] -; CHECK-NEXT: br i1 [[TOBOOL71_I]], label [[INNERMOST_BODY_SPLIT1:%.*]], label [[INNERMOST_BODY_SPLIT:%.*]] -; CHECK: innermost.preheader: -; CHECK-NEXT: br label [[INNERMOST_BODY:%.*]] -; CHECK: innermost.body: -; CHECK-NEXT: [[INDVAR_INNERMOST:%.*]] = phi i64 [ [[TMP1:%.*]], [[INNERMOST_BODY_SPLIT]] ], [ 4, [[INNERMOST_PREHEADER]] ] -; CHECK-NEXT: br label [[MIDDLE_HEADER_PREHEADER]] -; CHECK: innermost.body.split1: -; CHECK-NEXT: [[ARRAYIDX9_I:%.*]] = getelementptr inbounds [1 x [6 x i32]], ptr @d, i64 0, i64 [[INDVAR_INNERMOST]], i64 [[INDVAR_MIDDLE]] -; CHECK-NEXT: store i32 0, ptr [[ARRAYIDX9_I]], align 4 -; CHECK-NEXT: [[INDVAR_INNERMOST_NEXT:%.*]] = add nsw i64 [[INDVAR_INNERMOST]], -1 -; CHECK-NEXT: [[TOBOOL5_I:%.*]] = icmp eq i64 [[INDVAR_INNERMOST_NEXT]], 0 -; CHECK-NEXT: br label [[MIDDLE_LATCH_LOOPEXIT:%.*]] 
-; CHECK: innermost.body.split: -; CHECK-NEXT: [[TMP1]] = add nsw i64 [[INDVAR_INNERMOST]], -1 -; CHECK-NEXT: [[TMP2:%.*]] = icmp eq i64 [[TMP1]], 0 -; CHECK-NEXT: br i1 [[TMP2]], label [[OUTERMOST_LATCH]], label [[INNERMOST_BODY]] -; CHECK: innermost.loopexit: -; CHECK-NEXT: br label [[MIDDLE_LATCH]] -; CHECK: middle.latch: -; CHECK-NEXT: [[INDVAR_MIDDLE_NEXT]] = add nsw i64 [[INDVAR_MIDDLE]], -1 -; CHECK-NEXT: [[TOBOOL2_I:%.*]] = icmp eq i64 [[INDVAR_MIDDLE_NEXT]], 0 -; CHECK-NEXT: br i1 [[TOBOOL2_I]], label [[INNERMOST_BODY_SPLIT]], label [[MIDDLE_HEADER]] -; CHECK: outermost.latch: -; CHECK-NEXT: [[INDVAR_OUTERMOST_NEXT]] = add nsw i32 [[INDVAR_OUTERMOST]], -5 -; CHECK-NEXT: [[TOBOOL_I:%.*]] = icmp eq i32 [[INDVAR_OUTERMOST_NEXT]], 0 -; CHECK-NEXT: br i1 [[TOBOOL_I]], label [[OUTERMOST_EXIT:%.*]], label [[OUTERMOST_HEADER]] -; CHECK: outermost.exit: -; CHECK-NEXT: ret void ; - +; CHECK: --- !Passed +; CHECK-NEXT: Pass: loop-interchange +; CHECK-NEXT: Name: Interchanged +; CHECK-NEXT: Function: test2 +; CHECK-NEXT: Args: +; CHECK-NEXT: - String: Loop interchanged with enclosing loop. +; CHECK-NEXT: ... +; +define void @test2() { entry: br label %outermost.header diff --git a/llvm/test/Transforms/LoopInterchange/pr43797-lcssa-for-multiple-outer-loop-blocks.ll b/llvm/test/Transforms/LoopInterchange/pr43797-lcssa-for-multiple-outer-loop-blocks.ll index a0d0543075ffc..38970354c3d1c 100644 --- a/llvm/test/Transforms/LoopInterchange/pr43797-lcssa-for-multiple-outer-loop-blocks.ll +++ b/llvm/test/Transforms/LoopInterchange/pr43797-lcssa-for-multiple-outer-loop-blocks.ll @@ -1,48 +1,19 @@ -; NOTE: Assertions have been autogenerated by utils/update_test_checks.py -; RUN: opt -passes=loop-interchange -cache-line-size=64 -verify-loop-lcssa -S %s | FileCheck %s +; RUN: opt -passes=loop-interchange -cache-line-size=64 -verify-loop-lcssa %s -pass-remarks-output=%t -disable-output +; RUN: FileCheck -input-file %t %s ; Tests for PR43797. 
@wdtdr = external dso_local global [5 x [5 x double]], align 16 +; CHECK: --- !Passed +; CHECK-NEXT: Pass: loop-interchange +; CHECK-NEXT: Name: Interchanged +; CHECK-NEXT: Function: test1 +; CHECK-NEXT: Args: +; CHECK-NEXT: - String: Loop interchanged with enclosing loop. +; CHECK-NEXT: ... + define void @test1() { -; CHECK-LABEL: @test1( -; CHECK-NEXT: entry: -; CHECK-NEXT: br label [[INNER_HEADER_PREHEADER:%.*]] -; CHECK: outer.header.preheader: -; CHECK-NEXT: br label [[OUTER_HEADER:%.*]] -; CHECK: outer.header: -; CHECK-NEXT: [[OUTER_IDX:%.*]] = phi i64 [ [[OUTER_IDX_INC:%.*]], [[OUTER_LATCH:%.*]] ], [ 0, [[OUTER_HEADER_PREHEADER:%.*]] ] -; CHECK-NEXT: [[ARRAYIDX8:%.*]] = getelementptr inbounds [5 x [5 x double]], ptr @wdtdr, i64 0, i64 0, i64 [[OUTER_IDX]] -; CHECK-NEXT: br label [[INNER_HEADER_SPLIT:%.*]] -; CHECK: inner.header.preheader: -; CHECK-NEXT: br label [[INNER_HEADER:%.*]] -; CHECK: inner.header: -; CHECK-NEXT: [[INNER_IDX:%.*]] = phi i64 [ [[TMP3:%.*]], [[INNER_LATCH_SPLIT:%.*]] ], [ 0, [[INNER_HEADER_PREHEADER]] ] -; CHECK-NEXT: br label [[OUTER_HEADER_PREHEADER]] -; CHECK: inner.header.split: -; CHECK-NEXT: [[TMP0:%.*]] = load double, ptr [[ARRAYIDX8]], align 8 -; CHECK-NEXT: store double undef, ptr [[ARRAYIDX8]], align 8 -; CHECK-NEXT: br label [[INNER_LATCH:%.*]] -; CHECK: inner.latch: -; CHECK-NEXT: [[INNER_IDX_INC:%.*]] = add nsw i64 [[INNER_IDX]], 1 -; CHECK-NEXT: br label [[INNER_EXIT:%.*]] -; CHECK: inner.latch.split: -; CHECK-NEXT: [[TMP1:%.*]] = phi i64 [ [[OUTER_V:%.*]], [[OUTER_LATCH]] ] -; CHECK-NEXT: [[TMP2:%.*]] = phi i64 [ [[OUTER_IDX_INC]], [[OUTER_LATCH]] ] -; CHECK-NEXT: [[TMP3]] = add nsw i64 [[INNER_IDX]], 1 -; CHECK-NEXT: br i1 false, label [[INNER_HEADER]], label [[OUTER_EXIT:%.*]] -; CHECK: inner.exit: -; CHECK-NEXT: [[OUTER_V]] = add nsw i64 [[OUTER_IDX]], 1 -; CHECK-NEXT: br label [[OUTER_LATCH]] -; CHECK: outer.latch: -; CHECK-NEXT: [[OUTER_IDX_INC]] = add nsw i64 [[OUTER_IDX]], 1 -; CHECK-NEXT: br i1 false, label 
[[OUTER_HEADER]], label [[INNER_LATCH_SPLIT]] -; CHECK: outer.exit: -; CHECK-NEXT: [[EXIT1_LCSSA:%.*]] = phi i64 [ [[TMP1]], [[INNER_LATCH_SPLIT]] ] -; CHECK-NEXT: [[EXIT2_LCSSA:%.*]] = phi i64 [ [[TMP2]], [[INNER_LATCH_SPLIT]] ] -; CHECK-NEXT: ret void -; entry: br label %outer.header @@ -75,48 +46,15 @@ outer.exit: ; preds = %for.inc27 ret void } +; CHECK: --- !Passed +; CHECK-NEXT: Pass: loop-interchange +; CHECK-NEXT: Name: Interchanged +; CHECK-NEXT: Function: test2 +; CHECK-NEXT: Args: +; CHECK-NEXT: - String: Loop interchanged with enclosing loop. +; CHECK-NEXT: ... + define void @test2(i1 %cond) { -; CHECK-LABEL: @test2( -; CHECK-NEXT: entry: -; CHECK-NEXT: br i1 [[COND:%.*]], label [[INNER_HEADER_PREHEADER:%.*]], label [[OUTER_EXIT:%.*]] -; CHECK: outer.header.preheader: -; CHECK-NEXT: br label [[OUTER_HEADER:%.*]] -; CHECK: outer.header: -; CHECK-NEXT: [[OUTER_IDX:%.*]] = phi i64 [ [[OUTER_IDX_INC:%.*]], [[OUTER_LATCH:%.*]] ], [ 0, [[OUTER_HEADER_PREHEADER:%.*]] ] -; CHECK-NEXT: [[ARRAYIDX8:%.*]] = getelementptr inbounds [5 x [5 x double]], ptr @wdtdr, i64 0, i64 0, i64 [[OUTER_IDX]] -; CHECK-NEXT: br label [[INNER_HEADER_SPLIT:%.*]] -; CHECK: inner.header.preheader: -; CHECK-NEXT: br label [[INNER_HEADER:%.*]] -; CHECK: inner.header: -; CHECK-NEXT: [[INNER_IDX:%.*]] = phi i64 [ [[TMP3:%.*]], [[INNER_LATCH_SPLIT:%.*]] ], [ 0, [[INNER_HEADER_PREHEADER]] ] -; CHECK-NEXT: br label [[OUTER_HEADER_PREHEADER]] -; CHECK: inner.header.split: -; CHECK-NEXT: [[TMP0:%.*]] = load double, ptr [[ARRAYIDX8]], align 8 -; CHECK-NEXT: store double undef, ptr [[ARRAYIDX8]], align 8 -; CHECK-NEXT: br label [[INNER_LATCH:%.*]] -; CHECK: inner.latch: -; CHECK-NEXT: [[INNER_IDX_INC:%.*]] = add nsw i64 [[INNER_IDX]], 1 -; CHECK-NEXT: br label [[INNER_EXIT:%.*]] -; CHECK: inner.latch.split: -; CHECK-NEXT: [[TMP1:%.*]] = phi i64 [ [[OUTER_IDX_INC]], [[OUTER_LATCH]] ] -; CHECK-NEXT: [[TMP2:%.*]] = phi i64 [ [[OUTER_V:%.*]], [[OUTER_LATCH]] ] -; CHECK-NEXT: [[TMP3]] = add nsw i64 
[[INNER_IDX]], 1 -; CHECK-NEXT: br i1 false, label [[INNER_HEADER]], label [[OUTER_EXIT_LOOPEXIT:%.*]] -; CHECK: inner.exit: -; CHECK-NEXT: [[OUTER_V]] = add nsw i64 [[OUTER_IDX]], 1 -; CHECK-NEXT: br label [[OUTER_LATCH]] -; CHECK: outer.latch: -; CHECK-NEXT: [[OUTER_IDX_INC]] = add nsw i64 [[OUTER_IDX]], 1 -; CHECK-NEXT: br i1 false, label [[OUTER_HEADER]], label [[INNER_LATCH_SPLIT]] -; CHECK: outer.exit.loopexit: -; CHECK-NEXT: [[OUTER_IDX_INC_LCSSA:%.*]] = phi i64 [ [[TMP1]], [[INNER_LATCH_SPLIT]] ] -; CHECK-NEXT: [[OUTER_V_LCSSA:%.*]] = phi i64 [ [[TMP2]], [[INNER_LATCH_SPLIT]] ] -; CHECK-NEXT: br label [[OUTER_EXIT]] -; CHECK: outer.exit: -; CHECK-NEXT: [[EXIT1_LCSSA:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ], [ [[OUTER_V_LCSSA]], [[OUTER_EXIT_LOOPEXIT]] ] -; CHECK-NEXT: [[EXIT2_LCSSA:%.*]] = phi i64 [ 0, [[ENTRY]] ], [ [[OUTER_IDX_INC_LCSSA]], [[OUTER_EXIT_LOOPEXIT]] ] -; CHECK-NEXT: ret void -; entry: br i1 %cond, label %outer.header, label %outer.exit diff --git a/llvm/test/Transforms/LoopInterchange/vector-gep-operand.ll b/llvm/test/Transforms/LoopInterchange/vector-gep-operand.ll index 03e3b4b7408b5..022cdd44b7f50 100644 --- a/llvm/test/Transforms/LoopInterchange/vector-gep-operand.ll +++ b/llvm/test/Transforms/LoopInterchange/vector-gep-operand.ll @@ -1,43 +1,17 @@ -; NOTE: Assertions have been autogenerated by utils/update_test_checks.py -; RUN: opt -passes=loop-interchange -cache-line-size=64 -loop-interchange-threshold=-10 -S %s | FileCheck %s +; RUN: opt -passes=loop-interchange -cache-line-size=64 -loop-interchange-threshold=-10 %s -pass-remarks-output=%t -disable-output +; RUN: FileCheck -input-file %t %s ; The test contains a GEP with an operand that is not SCEV-able. Make sure ; loop-interchange does not crash. 
-define void @test(ptr noalias %src, ptr %dst) { -; CHECK-LABEL: @test( -; CHECK-NEXT: entry: -; CHECK-NEXT: br label [[INNER_PREHEADER:%.*]] -; CHECK: outer.header.preheader: -; CHECK-NEXT: br label [[OUTER_HEADER:%.*]] -; CHECK: outer.header: -; CHECK-NEXT: [[I:%.*]] = phi i32 [ [[I_NEXT:%.*]], [[OUTER_LATCH:%.*]] ], [ 0, [[OUTER_HEADER_PREHEADER:%.*]] ] -; CHECK-NEXT: br label [[INNER_SPLIT1:%.*]] -; CHECK: inner.preheader: -; CHECK-NEXT: br label [[INNER:%.*]] -; CHECK: inner: -; CHECK-NEXT: [[J:%.*]] = phi i64 [ [[TMP0:%.*]], [[INNER_SPLIT:%.*]] ], [ 0, [[INNER_PREHEADER]] ] -; CHECK-NEXT: br label [[OUTER_HEADER_PREHEADER]] -; CHECK: inner.split1: -; CHECK-NEXT: [[SRC_GEP:%.*]] = getelementptr inbounds [256 x float], ptr [[SRC:%.*]], <2 x i64> , i64 [[J]] -; CHECK-NEXT: [[SRC_0:%.*]] = extractelement <2 x ptr> [[SRC_GEP]], i32 0 -; CHECK-NEXT: [[LV_0:%.*]] = load float, ptr [[SRC_0]], align 4 -; CHECK-NEXT: [[ADD_0:%.*]] = fadd float [[LV_0]], 1.000000e+00 -; CHECK-NEXT: [[DST_GEP:%.*]] = getelementptr inbounds float, ptr [[DST:%.*]], i64 [[J]] -; CHECK-NEXT: store float [[ADD_0]], ptr [[DST_GEP]], align 4 -; CHECK-NEXT: [[J_NEXT:%.*]] = add nuw nsw i64 [[J]], 1 -; CHECK-NEXT: [[INNER_EXITCOND:%.*]] = icmp eq i64 [[J_NEXT]], 100 -; CHECK-NEXT: br label [[OUTER_LATCH]] -; CHECK: inner.split: -; CHECK-NEXT: [[TMP0]] = add nuw nsw i64 [[J]], 1 -; CHECK-NEXT: [[TMP1:%.*]] = icmp eq i64 [[TMP0]], 100 -; CHECK-NEXT: br i1 [[TMP1]], label [[EXIT:%.*]], label [[INNER]] -; CHECK: outer.latch: -; CHECK-NEXT: [[I_NEXT]] = add nuw nsw i32 [[I]], 1 -; CHECK-NEXT: [[OUTER_EXITCOND:%.*]] = icmp eq i32 [[I_NEXT]], 100 -; CHECK-NEXT: br i1 [[OUTER_EXITCOND]], label [[INNER_SPLIT]], label [[OUTER_HEADER]] -; CHECK: exit: -; CHECK-NEXT: ret void ; +; CHECK: --- !Passed +; CHECK-NEXT: Pass: loop-interchange +; CHECK-NEXT: Name: Interchanged +; CHECK-NEXT: Function: test +; CHECK-NEXT: Args: +; CHECK-NEXT: - String: Loop interchanged with enclosing loop. 
+ +define void @test(ptr noalias %src, ptr %dst) { entry: br label %outer.header diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/mul-simplification.ll b/llvm/test/Transforms/LoopVectorize/AArch64/mul-simplification.ll index 0ff98d2abe776..6b55f5291efd8 100644 --- a/llvm/test/Transforms/LoopVectorize/AArch64/mul-simplification.ll +++ b/llvm/test/Transforms/LoopVectorize/AArch64/mul-simplification.ll @@ -56,9 +56,59 @@ exit: %res = phi i64 [ %red.next, %loop ] ret i64 %res } + +define i32 @add_reduction_select_operand_constant_but_non_uniform() { +; CHECK-LABEL: define i32 @add_reduction_select_operand_constant_but_non_uniform() { +; CHECK-NEXT: [[ENTRY:.*]]: +; CHECK-NEXT: br i1 false, label %[[SCALAR_PH:.*]], label %[[VECTOR_PH:.*]] +; CHECK: [[VECTOR_PH]]: +; CHECK-NEXT: br label %[[VECTOR_BODY:.*]] +; CHECK: [[VECTOR_BODY]]: +; CHECK-NEXT: [[INDEX:%.*]] = phi i32 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ] +; CHECK-NEXT: [[VEC_PHI:%.*]] = phi <4 x i32> [ , %[[VECTOR_PH]] ], [ splat (i32 42), %[[VECTOR_BODY]] ] +; CHECK-NEXT: [[VEC_PHI1:%.*]] = phi <4 x i32> [ zeroinitializer, %[[VECTOR_PH]] ], [ splat (i32 42), %[[VECTOR_BODY]] ] +; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i32 [[INDEX]], 8 +; CHECK-NEXT: [[TMP0:%.*]] = icmp eq i32 [[INDEX_NEXT]], 64 +; CHECK-NEXT: br i1 [[TMP0]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]] +; CHECK: [[MIDDLE_BLOCK]]: +; CHECK-NEXT: [[TMP1:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> splat (i32 84)) +; CHECK-NEXT: br i1 true, label %[[EXIT:.*]], label %[[SCALAR_PH]] +; CHECK: [[SCALAR_PH]]: +; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i32 [ 64, %[[MIDDLE_BLOCK]] ], [ 0, %[[ENTRY]] ] +; CHECK-NEXT: [[BC_MERGE_RDX:%.*]] = phi i32 [ [[TMP1]], %[[MIDDLE_BLOCK]] ], [ 42, %[[ENTRY]] ] +; CHECK-NEXT: br label %[[LOOP:.*]] +; CHECK: [[LOOP]]: +; CHECK-NEXT: [[IV:%.*]] = phi i32 [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ], [ [[ADD2_REASS:%.*]], %[[LOOP]] ] +; CHECK-NEXT: 
[[RDX:%.*]] = phi i32 [ [[BC_MERGE_RDX]], %[[SCALAR_PH]] ], [ [[RDX_NEXT:%.*]], %[[LOOP]] ] +; CHECK-NEXT: [[ADD2_REASS]] = add i32 [[IV]], 1 +; CHECK-NEXT: [[RDX_NEXT]] = add i32 0, [[RDX]] +; CHECK-NEXT: [[CMP:%.*]] = icmp ult i32 [[ADD2_REASS]], 64 +; CHECK-NEXT: br i1 [[CMP]], label %[[LOOP]], label %[[EXIT]], !llvm.loop [[LOOP5:![0-9]+]] +; CHECK: [[EXIT]]: +; CHECK-NEXT: [[ADD_LCSSA:%.*]] = phi i32 [ [[RDX_NEXT]], %[[LOOP]] ], [ [[TMP1]], %[[MIDDLE_BLOCK]] ] +; CHECK-NEXT: ret i32 [[ADD_LCSSA]] +; +entry: + br label %loop + +loop: + %iv = phi i32 [ 0, %entry ], [ %iv.next, %loop ] + %rdx = phi i32 [ 42, %entry ], [ %rdx.next, %loop ] + + %iv.next = add i32 %iv, 1 + %rdx.next = add i32 0, %rdx + + %cmp = icmp ult i32 %iv.next, 64 + br i1 %cmp, label %loop, label %exit + +exit: + ret i32 %rdx.next +} ;. ; CHECK: [[LOOP0]] = distinct !{[[LOOP0]], [[META1:![0-9]+]], [[META2:![0-9]+]]} ; CHECK: [[META1]] = !{!"llvm.loop.isvectorized", i32 1} ; CHECK: [[META2]] = !{!"llvm.loop.unroll.runtime.disable"} ; CHECK: [[LOOP3]] = distinct !{[[LOOP3]], [[META2]], [[META1]]} +; CHECK: [[LOOP4]] = distinct !{[[LOOP4]], [[META1]], [[META2]]} +; CHECK: [[LOOP5]] = distinct !{[[LOOP5]], [[META2]], [[META1]]} ;. 
diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/sve-interleaved-accesses.ll b/llvm/test/Transforms/LoopVectorize/AArch64/sve-interleaved-accesses.ll index bf95622733461..05c0bc0761ea4 100644 --- a/llvm/test/Transforms/LoopVectorize/AArch64/sve-interleaved-accesses.ll +++ b/llvm/test/Transforms/LoopVectorize/AArch64/sve-interleaved-accesses.ll @@ -396,8 +396,8 @@ define void @test_reversed_load2_store2(ptr noalias nocapture readonly %A, ptr n ; CHECK-NEXT: [[WIDE_VEC:%.*]] = load , ptr [[TMP9]], align 4 ; CHECK-NEXT: [[STRIDED_VEC:%.*]] = call { , } @llvm.vector.deinterleave2.nxv8i32( [[WIDE_VEC]]) ; CHECK-NEXT: [[TMP10:%.*]] = extractvalue { , } [[STRIDED_VEC]], 0 -; CHECK-NEXT: [[REVERSE:%.*]] = call @llvm.vector.reverse.nxv4i32( [[TMP10]]) ; CHECK-NEXT: [[TMP11:%.*]] = extractvalue { , } [[STRIDED_VEC]], 1 +; CHECK-NEXT: [[REVERSE:%.*]] = call @llvm.vector.reverse.nxv4i32( [[TMP10]]) ; CHECK-NEXT: [[REVERSE1:%.*]] = call @llvm.vector.reverse.nxv4i32( [[TMP11]]) ; CHECK-NEXT: [[TMP12:%.*]] = add nsw [[REVERSE]], [[VEC_IND]] ; CHECK-NEXT: [[TMP13:%.*]] = sub nsw [[REVERSE1]], [[VEC_IND]] @@ -1548,5 +1548,263 @@ end: ret void } +; Check vectorization on an interleaved load/store groups of factor 4 + +; for (int i = 0; i < 1024; ++i) { +; dst[i].x = a[i].x + b[i].x; +; dst[i].y = a[i].y - b[i].y; +; dst[i].z = a[i].z << b[i].z; +; dst[i].t = a[i].t >> b[i].t; +; } +%struct.xyzt = type { i32, i32, i32, i32 } + +define void @interleave_deinterleave(ptr writeonly noalias %dst, ptr readonly %a, ptr readonly %b) { +; CHECK-LABEL: @interleave_deinterleave( +; CHECK-NEXT: entry: +; CHECK-NEXT: [[TMP0:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-NEXT: [[TMP1:%.*]] = shl i64 [[TMP0]], 2 +; CHECK-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ugt i64 [[TMP1]], 1024 +; CHECK-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] +; CHECK: vector.ph: +; CHECK-NEXT: [[TMP2:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-NEXT: [[TMP3:%.*]] = shl i64 [[TMP2]], 
2 +; CHECK-NEXT: [[N_MOD_VF:%.*]] = urem i64 1024, [[TMP3]] +; CHECK-NEXT: [[N_VEC:%.*]] = sub nuw nsw i64 1024, [[N_MOD_VF]] +; CHECK-NEXT: [[TMP4:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-NEXT: [[TMP5:%.*]] = shl i64 [[TMP4]], 2 +; CHECK-NEXT: br label [[VECTOR_BODY:%.*]] +; CHECK: vector.body: +; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] +; CHECK-NEXT: [[TMP6:%.*]] = getelementptr inbounds [[STRUCT_XYZT:%.*]], ptr [[A:%.*]], i64 [[INDEX]] +; CHECK-NEXT: [[WIDE_VEC:%.*]] = load , ptr [[TMP6]], align 4 +; CHECK-NEXT: [[STRIDED_VEC:%.*]] = call { , } @llvm.vector.deinterleave2.nxv16i32( [[WIDE_VEC]]) +; CHECK-NEXT: [[TMP7:%.*]] = extractvalue { , } [[STRIDED_VEC]], 0 +; CHECK-NEXT: [[TMP8:%.*]] = extractvalue { , } [[STRIDED_VEC]], 1 +; CHECK-NEXT: [[STRIDED_VEC6:%.*]] = call { , } @llvm.vector.deinterleave2.nxv8i32( [[TMP7]]) +; CHECK-NEXT: [[STRIDED_VEC7:%.*]] = call { , } @llvm.vector.deinterleave2.nxv8i32( [[TMP8]]) +; CHECK-NEXT: [[TMP9:%.*]] = extractvalue { , } [[STRIDED_VEC6]], 0 +; CHECK-NEXT: [[TMP10:%.*]] = extractvalue { , } [[STRIDED_VEC7]], 0 +; CHECK-NEXT: [[TMP11:%.*]] = extractvalue { , } [[STRIDED_VEC6]], 1 +; CHECK-NEXT: [[TMP12:%.*]] = extractvalue { , } [[STRIDED_VEC7]], 1 +; CHECK-NEXT: [[TMP13:%.*]] = getelementptr inbounds [[STRUCT_XYZT]], ptr [[B:%.*]], i64 [[INDEX]] +; CHECK-NEXT: [[WIDE_VEC8:%.*]] = load , ptr [[TMP13]], align 4 +; CHECK-NEXT: [[STRIDED_VEC9:%.*]] = call { , } @llvm.vector.deinterleave2.nxv16i32( [[WIDE_VEC8]]) +; CHECK-NEXT: [[TMP14:%.*]] = extractvalue { , } [[STRIDED_VEC9]], 0 +; CHECK-NEXT: [[TMP15:%.*]] = extractvalue { , } [[STRIDED_VEC9]], 1 +; CHECK-NEXT: [[STRIDED_VEC10:%.*]] = call { , } @llvm.vector.deinterleave2.nxv8i32( [[TMP14]]) +; CHECK-NEXT: [[STRIDED_VEC11:%.*]] = call { , } @llvm.vector.deinterleave2.nxv8i32( [[TMP15]]) +; CHECK-NEXT: [[TMP16:%.*]] = extractvalue { , } [[STRIDED_VEC10]], 0 +; CHECK-NEXT: [[TMP17:%.*]] = extractvalue { , } 
[[STRIDED_VEC11]], 0 +; CHECK-NEXT: [[TMP18:%.*]] = extractvalue { , } [[STRIDED_VEC10]], 1 +; CHECK-NEXT: [[TMP19:%.*]] = extractvalue { , } [[STRIDED_VEC11]], 1 +; CHECK-NEXT: [[TMP20:%.*]] = add nsw [[TMP16]], [[TMP9]] +; CHECK-NEXT: [[TMP21:%.*]] = getelementptr inbounds [[STRUCT_XYZT]], ptr [[DST:%.*]], i64 [[INDEX]] +; CHECK-NEXT: [[TMP22:%.*]] = sub nsw [[TMP10]], [[TMP17]] +; CHECK-NEXT: [[TMP23:%.*]] = shl [[TMP11]], [[TMP18]] +; CHECK-NEXT: [[TMP24:%.*]] = ashr [[TMP12]], [[TMP19]] +; CHECK-NEXT: [[INTERLEAVED_VEC:%.*]] = call @llvm.vector.interleave2.nxv8i32( [[TMP20]], [[TMP23]]) +; CHECK-NEXT: [[INTERLEAVED_VEC12:%.*]] = call @llvm.vector.interleave2.nxv8i32( [[TMP22]], [[TMP24]]) +; CHECK-NEXT: [[INTERLEAVED_VEC13:%.*]] = call @llvm.vector.interleave2.nxv16i32( [[INTERLEAVED_VEC]], [[INTERLEAVED_VEC12]]) +; CHECK-NEXT: store [[INTERLEAVED_VEC13]], ptr [[TMP21]], align 4 +; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP5]] +; CHECK-NEXT: [[TMP25:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] +; CHECK-NEXT: br i1 [[TMP25]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP41:![0-9]+]] +; CHECK: middle.block: +; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[N_MOD_VF]], 0 +; CHECK-NEXT: br i1 [[CMP_N]], label [[FOR_COND_CLEANUP:%.*]], label [[SCALAR_PH]] +; CHECK: scalar.ph: +; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ] +; CHECK-NEXT: br label [[FOR_BODY:%.*]] +; CHECK: for.body: +; CHECK-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[INDVARS_IV_NEXT:%.*]], [[FOR_BODY]] ] +; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds nuw [[STRUCT_XYZT]], ptr [[A]], i64 [[INDVARS_IV]] +; CHECK-NEXT: [[TMP26:%.*]] = load i32, ptr [[ARRAYIDX]], align 4 +; CHECK-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds nuw [[STRUCT_XYZT]], ptr [[B]], i64 [[INDVARS_IV]] +; CHECK-NEXT: [[TMP27:%.*]] = load i32, ptr [[ARRAYIDX2]], align 4 +; CHECK-NEXT: [[ADD:%.*]] = 
add nsw i32 [[TMP27]], [[TMP26]] +; CHECK-NEXT: [[ARRAYIDX5:%.*]] = getelementptr inbounds nuw [[STRUCT_XYZT]], ptr [[DST]], i64 [[INDVARS_IV]] +; CHECK-NEXT: store i32 [[ADD]], ptr [[ARRAYIDX5]], align 4 +; CHECK-NEXT: [[Y:%.*]] = getelementptr inbounds nuw i8, ptr [[ARRAYIDX]], i64 4 +; CHECK-NEXT: [[TMP28:%.*]] = load i32, ptr [[Y]], align 4 +; CHECK-NEXT: [[Y11:%.*]] = getelementptr inbounds nuw i8, ptr [[ARRAYIDX2]], i64 4 +; CHECK-NEXT: [[TMP29:%.*]] = load i32, ptr [[Y11]], align 4 +; CHECK-NEXT: [[SUB:%.*]] = sub nsw i32 [[TMP28]], [[TMP29]] +; CHECK-NEXT: [[Y14:%.*]] = getelementptr inbounds nuw i8, ptr [[ARRAYIDX5]], i64 4 +; CHECK-NEXT: store i32 [[SUB]], ptr [[Y14]], align 4 +; CHECK-NEXT: [[Z:%.*]] = getelementptr inbounds nuw i8, ptr [[ARRAYIDX]], i64 8 +; CHECK-NEXT: [[TMP30:%.*]] = load i32, ptr [[Z]], align 4 +; CHECK-NEXT: [[Z19:%.*]] = getelementptr inbounds nuw i8, ptr [[ARRAYIDX2]], i64 8 +; CHECK-NEXT: [[TMP31:%.*]] = load i32, ptr [[Z19]], align 4 +; CHECK-NEXT: [[SHL:%.*]] = shl i32 [[TMP30]], [[TMP31]] +; CHECK-NEXT: [[Z22:%.*]] = getelementptr inbounds nuw i8, ptr [[ARRAYIDX5]], i64 8 +; CHECK-NEXT: store i32 [[SHL]], ptr [[Z22]], align 4 +; CHECK-NEXT: [[T:%.*]] = getelementptr inbounds nuw i8, ptr [[ARRAYIDX]], i64 12 +; CHECK-NEXT: [[TMP32:%.*]] = load i32, ptr [[T]], align 4 +; CHECK-NEXT: [[T27:%.*]] = getelementptr inbounds nuw i8, ptr [[ARRAYIDX2]], i64 12 +; CHECK-NEXT: [[TMP33:%.*]] = load i32, ptr [[T27]], align 4 +; CHECK-NEXT: [[SHR:%.*]] = ashr i32 [[TMP32]], [[TMP33]] +; CHECK-NEXT: [[T30:%.*]] = getelementptr inbounds nuw i8, ptr [[ARRAYIDX5]], i64 12 +; CHECK-NEXT: store i32 [[SHR]], ptr [[T30]], align 4 +; CHECK-NEXT: [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1 +; CHECK-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT]], 1024 +; CHECK-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_COND_CLEANUP]], label [[FOR_BODY]], !llvm.loop [[LOOP42:![0-9]+]] +; CHECK: for.end: +; CHECK-NEXT: ret void +; +entry: + br 
label %for.body + +for.body: + %indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %for.body ] + %arrayidx = getelementptr inbounds %struct.xyzt, ptr %a, i64 %indvars.iv + %0 = load i32, ptr %arrayidx, align 4 + %arrayidx2 = getelementptr inbounds %struct.xyzt, ptr %b, i64 %indvars.iv + %1 = load i32, ptr %arrayidx2, align 4 + %add = add nsw i32 %1, %0 + %arrayidx5 = getelementptr inbounds %struct.xyzt, ptr %dst, i64 %indvars.iv + store i32 %add, ptr %arrayidx5, align 4 + %y = getelementptr inbounds nuw i8, ptr %arrayidx, i64 4 + %2 = load i32, ptr %y, align 4 + %y11 = getelementptr inbounds nuw i8, ptr %arrayidx2, i64 4 + %3 = load i32, ptr %y11, align 4 + %sub = sub nsw i32 %2, %3 + %y14 = getelementptr inbounds nuw i8, ptr %arrayidx5, i64 4 + store i32 %sub, ptr %y14, align 4 + %z = getelementptr inbounds nuw i8, ptr %arrayidx, i64 8 + %4 = load i32, ptr %z, align 4 + %z19 = getelementptr inbounds nuw i8, ptr %arrayidx2, i64 8 + %5 = load i32, ptr %z19, align 4 + %shl = shl i32 %4, %5 + %z22 = getelementptr inbounds nuw i8, ptr %arrayidx5, i64 8 + store i32 %shl, ptr %z22, align 4 + %t = getelementptr inbounds nuw i8, ptr %arrayidx, i64 12 + %6 = load i32, ptr %t, align 4 + %t27 = getelementptr inbounds nuw i8, ptr %arrayidx2, i64 12 + %7 = load i32, ptr %t27, align 4 + %shr = ashr i32 %6, %7 + %t30 = getelementptr inbounds nuw i8, ptr %arrayidx5, i64 12 + store i32 %shr, ptr %t30, align 4 + %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1 + %exitcond.not = icmp eq i64 %indvars.iv.next, 1024 + br i1 %exitcond.not, label %for.end, label %for.body + +for.end: + ret void +} + +; Check vectorization on a reverse interleaved load/store groups of factor 4 + +; for (int i = 1023; i >= 0; i--) { +; int a = A[i].x + i; +; int b = A[i].y - i; +; int c = A[i].z * i; +; int d = A[i].t << i; +; B[i].x = a; +; B[i].y = b; +; B[i].z = c; +; B[i].t = d; +; } + +define void @interleave_deinterleave_reverse(ptr noalias nocapture readonly %A, ptr noalias nocapture %B) #1{ +; 
CHECK-LABEL: @interleave_deinterleave_reverse( +; CHECK-NEXT: entry: +; CHECK-NEXT: br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] +; CHECK: vector.ph: +; CHECK-NEXT: [[TMP0:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-NEXT: [[TMP1:%.*]] = shl nuw nsw i64 [[TMP0]], 2 +; CHECK-NEXT: [[TMP2:%.*]] = call @llvm.stepvector.nxv4i32() +; CHECK-NEXT: [[INDUCTION:%.*]] = sub splat (i32 1023), [[TMP2]] +; CHECK-NEXT: [[TMP3:%.*]] = trunc nuw nsw i64 [[TMP1]] to i32 +; CHECK-NEXT: [[TMP4:%.*]] = sub nsw i32 0, [[TMP3]] +; CHECK-NEXT: [[DOTSPLATINSERT:%.*]] = insertelement poison, i32 [[TMP4]], i64 0 +; CHECK-NEXT: [[DOTSPLAT:%.*]] = shufflevector [[DOTSPLATINSERT]], poison, zeroinitializer +; CHECK-NEXT: br label [[VECTOR_BODY:%.*]] +; CHECK: vector.body: +; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] +; CHECK-NEXT: [[VEC_IND:%.*]] = phi [ [[INDUCTION]], [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[VECTOR_BODY]] ] +; CHECK-NEXT: [[OFFSET_IDX:%.*]] = sub i64 1023, [[INDEX]] +; CHECK-NEXT: [[TMP5:%.*]] = getelementptr inbounds [[STRUCT_XYZT:%.*]], ptr [[A:%.*]], i64 [[OFFSET_IDX]], i32 0 +; CHECK-NEXT: [[TMP6:%.*]] = call i32 @llvm.vscale.i32() +; CHECK-NEXT: [[TMP7:%.*]] = shl nuw nsw i32 [[TMP6]], 4 +; CHECK-NEXT: [[TMP8:%.*]] = sub nsw i32 4, [[TMP7]] +; CHECK-NEXT: [[TMP9:%.*]] = sext i32 [[TMP8]] to i64 +; CHECK-NEXT: [[TMP10:%.*]] = getelementptr inbounds i32, ptr [[TMP5]], i64 [[TMP9]] +; CHECK-NEXT: [[WIDE_VEC:%.*]] = load , ptr [[TMP10]], align 4 +; CHECK-NEXT: [[STRIDED_VEC:%.*]] = call { , } @llvm.vector.deinterleave2.nxv16i32( [[WIDE_VEC]]) +; CHECK-NEXT: [[TMP11:%.*]] = extractvalue { , } [[STRIDED_VEC]], 0 +; CHECK-NEXT: [[TMP12:%.*]] = extractvalue { , } [[STRIDED_VEC]], 1 +; CHECK-NEXT: [[STRIDED_VEC1:%.*]] = call { , } @llvm.vector.deinterleave2.nxv8i32( [[TMP11]]) +; CHECK-NEXT: [[STRIDED_VEC2:%.*]] = call { , } @llvm.vector.deinterleave2.nxv8i32( [[TMP12]]) +; CHECK-NEXT: [[TMP13:%.*]] = 
extractvalue { , } [[STRIDED_VEC1]], 0 +; CHECK-NEXT: [[TMP14:%.*]] = extractvalue { , } [[STRIDED_VEC2]], 0 +; CHECK-NEXT: [[TMP15:%.*]] = extractvalue { , } [[STRIDED_VEC1]], 1 +; CHECK-NEXT: [[TMP16:%.*]] = extractvalue { , } [[STRIDED_VEC2]], 1 +; CHECK-NEXT: [[REVERSE:%.*]] = call @llvm.vector.reverse.nxv4i32( [[TMP13]]) +; CHECK-NEXT: [[REVERSE3:%.*]] = call @llvm.vector.reverse.nxv4i32( [[TMP14]]) +; CHECK-NEXT: [[REVERSE4:%.*]] = call @llvm.vector.reverse.nxv4i32( [[TMP15]]) +; CHECK-NEXT: [[REVERSE5:%.*]] = call @llvm.vector.reverse.nxv4i32( [[TMP16]]) +; CHECK-NEXT: [[TMP17:%.*]] = add nsw [[REVERSE]], [[VEC_IND]] +; CHECK-NEXT: [[TMP18:%.*]] = sub nsw [[REVERSE3]], [[VEC_IND]] +; CHECK-NEXT: [[TMP19:%.*]] = mul nsw [[REVERSE4]], [[VEC_IND]] +; CHECK-NEXT: [[TMP20:%.*]] = shl nuw nsw [[REVERSE5]], [[VEC_IND]] +; CHECK-NEXT: [[TMP21:%.*]] = getelementptr inbounds [[STRUCT_XYZT]], ptr [[B:%.*]], i64 [[OFFSET_IDX]], i32 0 +; CHECK-NEXT: [[TMP22:%.*]] = call i32 @llvm.vscale.i32() +; CHECK-NEXT: [[TMP23:%.*]] = shl nuw nsw i32 [[TMP22]], 4 +; CHECK-NEXT: [[TMP24:%.*]] = sub nsw i32 4, [[TMP23]] +; CHECK-NEXT: [[TMP25:%.*]] = sext i32 [[TMP24]] to i64 +; CHECK-NEXT: [[TMP26:%.*]] = getelementptr inbounds i32, ptr [[TMP21]], i64 [[TMP25]] +; CHECK-NEXT: [[REVERSE6:%.*]] = call @llvm.vector.reverse.nxv4i32( [[TMP17]]) +; CHECK-NEXT: [[REVERSE7:%.*]] = call @llvm.vector.reverse.nxv4i32( [[TMP18]]) +; CHECK-NEXT: [[REVERSE8:%.*]] = call @llvm.vector.reverse.nxv4i32( [[TMP19]]) +; CHECK-NEXT: [[REVERSE9:%.*]] = call @llvm.vector.reverse.nxv4i32( [[TMP20]]) +; CHECK-NEXT: [[INTERLEAVED_VEC:%.*]] = call @llvm.vector.interleave2.nxv8i32( [[REVERSE6]], [[REVERSE8]]) +; CHECK-NEXT: [[INTERLEAVED_VEC10:%.*]] = call @llvm.vector.interleave2.nxv8i32( [[REVERSE7]], [[REVERSE9]]) +; CHECK-NEXT: [[INTERLEAVED_VEC11:%.*]] = call @llvm.vector.interleave2.nxv16i32( [[INTERLEAVED_VEC]], [[INTERLEAVED_VEC10]]) +; CHECK-NEXT: store [[INTERLEAVED_VEC11]], ptr [[TMP26]], align 4 +; 
CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP1]] +; CHECK-NEXT: [[VEC_IND_NEXT]] = add [[VEC_IND]], [[DOTSPLAT]] +; CHECK-NEXT: [[TMP27:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1024 +; CHECK-NEXT: br i1 [[TMP27]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP43:![0-9]+]] +; CHECK: middle.block: +; CHECK-NEXT: br i1 true, label [[FOR_COND_CLEANUP:%.*]], label [[SCALAR_PH]] +; CHECK: scalar.ph: +; CHECK-NEXT: br label [[FOR_BODY:%.*]] +; CHECK: for.cond.cleanup: +; CHECK-NEXT: ret void +; CHECK: for.body: +; CHECK-NEXT: br i1 poison, label [[FOR_BODY]], label [[FOR_COND_CLEANUP]], !llvm.loop [[LOOP44:![0-9]+]] +; +entry: + br label %for.body +for.cond.cleanup: ; preds = %for.body + ret void +for.body: ; preds = %for.body, %entry + %indvars.iv = phi i64 [ 1023, %entry ], [ %indvars.iv.next, %for.body ] + %x = getelementptr inbounds %struct.xyzt, ptr %A, i64 %indvars.iv, i32 0 + %load1 = load i32, ptr %x, align 4 + %trunc = trunc i64 %indvars.iv to i32 + %add = add nsw i32 %load1, %trunc + %y = getelementptr inbounds %struct.xyzt, ptr %A, i64 %indvars.iv, i32 1 + %load2 = load i32, ptr %y, align 4 + %sub = sub nsw i32 %load2, %trunc + %z = getelementptr inbounds %struct.xyzt, ptr %A, i64 %indvars.iv, i32 2 + %load3 = load i32, ptr %z, align 4 + %mul = mul nsw i32 %load3, %trunc + %t = getelementptr inbounds %struct.xyzt, ptr %A, i64 %indvars.iv, i32 3 + %load4 = load i32, ptr %t, align 4 + %shl = shl nuw nsw i32 %load4, %trunc + %x5 = getelementptr inbounds %struct.xyzt, ptr %B, i64 %indvars.iv, i32 0 + store i32 %add, ptr %x5, align 4 + %y8 = getelementptr inbounds %struct.xyzt, ptr %B, i64 %indvars.iv, i32 1 + store i32 %sub, ptr %y8, align 4 + %z5 = getelementptr inbounds %struct.xyzt, ptr %B, i64 %indvars.iv, i32 2 + store i32 %mul, ptr %z5, align 4 + %t8 = getelementptr inbounds %struct.xyzt, ptr %B, i64 %indvars.iv, i32 3 + store i32 %shl, ptr %t8, align 4 + %indvars.iv.next = add nsw i64 %indvars.iv, -1 + %cmp = icmp sgt i64 
%indvars.iv, 0 + br i1 %cmp, label %for.body, label %for.cond.cleanup + +} attributes #1 = { "target-features"="+sve" vscale_range(1, 16) } attributes #0 = { "unsafe-fp-math"="true" "target-features"="+sve" vscale_range(1, 16) } diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/sve-interleaved-masked-accesses.ll b/llvm/test/Transforms/LoopVectorize/AArch64/sve-interleaved-masked-accesses.ll index 1a281fe7c6f7f..d4392bebdf37b 100644 --- a/llvm/test/Transforms/LoopVectorize/AArch64/sve-interleaved-masked-accesses.ll +++ b/llvm/test/Transforms/LoopVectorize/AArch64/sve-interleaved-masked-accesses.ll @@ -529,3 +529,255 @@ for.inc: for.end: ret void } + +; Expected to contain interleave2/deinterleave2 instructions +; +; void masked_strided_factor4(const unsigned char* restrict p, +; unsigned char* restrict q, +; unsigned char guard) { +; for(ix=0; ix < 1024; ++ix) { +; if (ix > guard) { +; char left1 = p[4*ix]; +; char right1 = p[4*ix + 1]; +; char left2 = p[4*ix + 2]; +; char right2 = p[4*ix + 3]; +; char max1 = max(left1, right1); +; char max2 = max(left2, right2); +; q[4*ix] = max1; +; q[4*ix + 1] = 0 - max1; +; q[4*ix + 2] = max2; +; q[4*ix + 3] = 0 - max2; +; } +; } +;} +define dso_local void @masked_strided_factor4(ptr noalias nocapture readonly %p, ptr noalias nocapture %q, i8 zeroext %guard) local_unnamed_addr #0 { +; SCALAR_TAIL_FOLDING-LABEL: define dso_local void @masked_strided_factor4 +; SCALAR_TAIL_FOLDING-SAME: (ptr noalias nocapture readonly [[P:%.*]], ptr noalias nocapture [[Q:%.*]], i8 zeroext [[GUARD:%.*]]) local_unnamed_addr #[[ATTR0]] { +; SCALAR_TAIL_FOLDING-NEXT: entry: +; SCALAR_TAIL_FOLDING-NEXT: [[CONV:%.*]] = zext i8 [[GUARD]] to i32 +; SCALAR_TAIL_FOLDING-NEXT: [[TMP0:%.*]] = call i32 @llvm.vscale.i32() +; SCALAR_TAIL_FOLDING-NEXT: [[TMP1:%.*]] = shl i32 [[TMP0]], 4 +; SCALAR_TAIL_FOLDING-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ugt i32 [[TMP1]], 1024 +; SCALAR_TAIL_FOLDING-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label 
[[VECTOR_PH:%.*]] +; SCALAR_TAIL_FOLDING: vector.ph: +; SCALAR_TAIL_FOLDING-NEXT: [[TMP2:%.*]] = call i32 @llvm.vscale.i32() +; SCALAR_TAIL_FOLDING-NEXT: [[TMP3:%.*]] = shl i32 [[TMP2]], 4 +; SCALAR_TAIL_FOLDING-NEXT: [[N_MOD_VF:%.*]] = urem i32 1024, [[TMP3]] +; SCALAR_TAIL_FOLDING-NEXT: [[N_VEC:%.*]] = sub nuw nsw i32 1024, [[N_MOD_VF]] +; SCALAR_TAIL_FOLDING-NEXT: [[TMP4:%.*]] = call i32 @llvm.vscale.i32() +; SCALAR_TAIL_FOLDING-NEXT: [[TMP5:%.*]] = shl i32 [[TMP4]], 4 +; SCALAR_TAIL_FOLDING-NEXT: [[TMP6:%.*]] = call @llvm.stepvector.nxv16i32() +; SCALAR_TAIL_FOLDING-NEXT: [[DOTSPLATINSERT:%.*]] = insertelement poison, i32 [[TMP5]], i64 0 +; SCALAR_TAIL_FOLDING-NEXT: [[DOTSPLAT:%.*]] = shufflevector [[DOTSPLATINSERT]], poison, zeroinitializer +; SCALAR_TAIL_FOLDING-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement poison, i32 [[CONV]], i64 0 +; SCALAR_TAIL_FOLDING-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector [[BROADCAST_SPLATINSERT]], poison, zeroinitializer +; SCALAR_TAIL_FOLDING-NEXT: br label [[VECTOR_BODY:%.*]] +; SCALAR_TAIL_FOLDING: vector.body: +; SCALAR_TAIL_FOLDING-NEXT: [[INDEX:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] +; SCALAR_TAIL_FOLDING-NEXT: [[VEC_IND:%.*]] = phi [ [[TMP6]], [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[VECTOR_BODY]] ] +; SCALAR_TAIL_FOLDING-NEXT: [[TMP7:%.*]] = icmp ugt [[VEC_IND]], [[BROADCAST_SPLAT]] +; SCALAR_TAIL_FOLDING-NEXT: [[TMP8:%.*]] = shl i32 [[INDEX]], 2 +; SCALAR_TAIL_FOLDING-NEXT: [[TMP9:%.*]] = sext i32 [[TMP8]] to i64 +; SCALAR_TAIL_FOLDING-NEXT: [[TMP10:%.*]] = getelementptr i8, ptr [[P]], i64 [[TMP9]] +; SCALAR_TAIL_FOLDING-NEXT: [[INTERLEAVED_MASK:%.*]] = call @llvm.vector.interleave2.nxv32i1( [[TMP7]], [[TMP7]]) +; SCALAR_TAIL_FOLDING-NEXT: [[INTERLEAVED_MASK1:%.*]] = call @llvm.vector.interleave2.nxv32i1( [[TMP7]], [[TMP7]]) +; SCALAR_TAIL_FOLDING-NEXT: [[INTERLEAVED_MASK2:%.*]] = call @llvm.vector.interleave2.nxv64i1( [[INTERLEAVED_MASK]], [[INTERLEAVED_MASK1]]) +; 
SCALAR_TAIL_FOLDING-NEXT: [[WIDE_MASKED_VEC:%.*]] = call @llvm.masked.load.nxv64i8.p0(ptr [[TMP10]], i32 1, [[INTERLEAVED_MASK2]], poison) +; SCALAR_TAIL_FOLDING-NEXT: [[STRIDED_VEC:%.*]] = call { , } @llvm.vector.deinterleave2.nxv64i8( [[WIDE_MASKED_VEC]]) +; SCALAR_TAIL_FOLDING-NEXT: [[TMP11:%.*]] = extractvalue { , } [[STRIDED_VEC]], 0 +; SCALAR_TAIL_FOLDING-NEXT: [[TMP12:%.*]] = extractvalue { , } [[STRIDED_VEC]], 1 +; SCALAR_TAIL_FOLDING-NEXT: [[STRIDED_VEC3:%.*]] = call { , } @llvm.vector.deinterleave2.nxv32i8( [[TMP11]]) +; SCALAR_TAIL_FOLDING-NEXT: [[STRIDED_VEC4:%.*]] = call { , } @llvm.vector.deinterleave2.nxv32i8( [[TMP12]]) +; SCALAR_TAIL_FOLDING-NEXT: [[TMP13:%.*]] = extractvalue { , } [[STRIDED_VEC3]], 0 +; SCALAR_TAIL_FOLDING-NEXT: [[TMP14:%.*]] = extractvalue { , } [[STRIDED_VEC4]], 0 +; SCALAR_TAIL_FOLDING-NEXT: [[TMP15:%.*]] = extractvalue { , } [[STRIDED_VEC3]], 1 +; SCALAR_TAIL_FOLDING-NEXT: [[TMP16:%.*]] = extractvalue { , } [[STRIDED_VEC4]], 1 +; SCALAR_TAIL_FOLDING-NEXT: [[TMP17:%.*]] = call @llvm.smax.nxv16i8( [[TMP13]], [[TMP14]]) +; SCALAR_TAIL_FOLDING-NEXT: [[TMP18:%.*]] = sub zeroinitializer, [[TMP17]] +; SCALAR_TAIL_FOLDING-NEXT: [[TMP19:%.*]] = call @llvm.smax.nxv16i8( [[TMP15]], [[TMP16]]) +; SCALAR_TAIL_FOLDING-NEXT: [[TMP20:%.*]] = sub zeroinitializer, [[TMP19]] +; SCALAR_TAIL_FOLDING-NEXT: [[TMP21:%.*]] = sext i32 [[TMP8]] to i64 +; SCALAR_TAIL_FOLDING-NEXT: [[TMP22:%.*]] = getelementptr i8, ptr [[Q]], i64 [[TMP21]] +; SCALAR_TAIL_FOLDING-NEXT: [[INTERLEAVED_VEC:%.*]] = call @llvm.vector.interleave2.nxv32i8( [[TMP17]], [[TMP19]]) +; SCALAR_TAIL_FOLDING-NEXT: [[INTERLEAVED_VEC5:%.*]] = call @llvm.vector.interleave2.nxv32i8( [[TMP18]], [[TMP20]]) +; SCALAR_TAIL_FOLDING-NEXT: [[INTERLEAVED_VEC6:%.*]] = call @llvm.vector.interleave2.nxv64i8( [[INTERLEAVED_VEC]], [[INTERLEAVED_VEC5]]) +; SCALAR_TAIL_FOLDING-NEXT: [[INTERLEAVED_MASK7:%.*]] = call @llvm.vector.interleave2.nxv32i1( [[TMP7]], [[TMP7]]) +; SCALAR_TAIL_FOLDING-NEXT: 
[[INTERLEAVED_MASK8:%.*]] = call @llvm.vector.interleave2.nxv32i1( [[TMP7]], [[TMP7]]) +; SCALAR_TAIL_FOLDING-NEXT: [[INTERLEAVED_MASK9:%.*]] = call @llvm.vector.interleave2.nxv64i1( [[INTERLEAVED_MASK7]], [[INTERLEAVED_MASK8]]) +; SCALAR_TAIL_FOLDING-NEXT: call void @llvm.masked.store.nxv64i8.p0( [[INTERLEAVED_VEC6]], ptr [[TMP22]], i32 1, [[INTERLEAVED_MASK9]]) +; SCALAR_TAIL_FOLDING-NEXT: [[INDEX_NEXT]] = add nuw i32 [[INDEX]], [[TMP5]] +; SCALAR_TAIL_FOLDING-NEXT: [[VEC_IND_NEXT]] = add [[VEC_IND]], [[DOTSPLAT]] +; SCALAR_TAIL_FOLDING-NEXT: [[TMP23:%.*]] = icmp eq i32 [[INDEX_NEXT]], [[N_VEC]] +; SCALAR_TAIL_FOLDING-NEXT: br i1 [[TMP23]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP8:![0-9]+]] +; SCALAR_TAIL_FOLDING: middle.block: +; SCALAR_TAIL_FOLDING-NEXT: [[CMP_N:%.*]] = icmp eq i32 [[N_MOD_VF]], 0 +; SCALAR_TAIL_FOLDING-NEXT: br i1 [[CMP_N]], label [[FOR_END:%.*]], label [[SCALAR_PH]] +; SCALAR_TAIL_FOLDING: scalar.ph: +; SCALAR_TAIL_FOLDING-NEXT: [[BC_RESUME_VAL:%.*]] = phi i32 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ] +; SCALAR_TAIL_FOLDING-NEXT: br label [[FOR_BODY:%.*]] +; SCALAR_TAIL_FOLDING: for.body: +; SCALAR_TAIL_FOLDING-NEXT: [[IX_024:%.*]] = phi i32 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[INC:%.*]], [[FOR_INC:%.*]] ] +; SCALAR_TAIL_FOLDING-NEXT: [[CMP1:%.*]] = icmp samesign ugt i32 [[IX_024]], [[CONV]] +; SCALAR_TAIL_FOLDING-NEXT: br i1 [[CMP1]], label [[IF_THEN:%.*]], label [[FOR_INC]] +; SCALAR_TAIL_FOLDING: if.then: +; SCALAR_TAIL_FOLDING-NEXT: [[IDX0:%.*]] = shl nuw nsw i32 [[IX_024]], 2 +; SCALAR_TAIL_FOLDING-NEXT: [[IDX1:%.*]] = or disjoint i32 [[IDX0]], 1 +; SCALAR_TAIL_FOLDING-NEXT: [[IDX2:%.*]] = or disjoint i32 [[IDX0]], 2 +; SCALAR_TAIL_FOLDING-NEXT: [[IDX3:%.*]] = or disjoint i32 [[IDX0]], 3 +; SCALAR_TAIL_FOLDING-NEXT: [[TMP24:%.*]] = zext nneg i32 [[IDX0]] to i64 +; SCALAR_TAIL_FOLDING-NEXT: [[ARRAY1IDX0:%.*]] = getelementptr inbounds nuw i8, ptr [[P]], i64 [[TMP24]] +; 
SCALAR_TAIL_FOLDING-NEXT: [[TMP25:%.*]] = load i8, ptr [[ARRAY1IDX0]], align 1 +; SCALAR_TAIL_FOLDING-NEXT: [[TMP26:%.*]] = zext nneg i32 [[IDX1]] to i64 +; SCALAR_TAIL_FOLDING-NEXT: [[ARRAY1IDX1:%.*]] = getelementptr inbounds nuw i8, ptr [[P]], i64 [[TMP26]] +; SCALAR_TAIL_FOLDING-NEXT: [[TMP27:%.*]] = load i8, ptr [[ARRAY1IDX1]], align 1 +; SCALAR_TAIL_FOLDING-NEXT: [[TMP28:%.*]] = zext nneg i32 [[IDX2]] to i64 +; SCALAR_TAIL_FOLDING-NEXT: [[ARRAY1IDX2:%.*]] = getelementptr inbounds nuw i8, ptr [[P]], i64 [[TMP28]] +; SCALAR_TAIL_FOLDING-NEXT: [[TMP29:%.*]] = load i8, ptr [[ARRAY1IDX2]], align 1 +; SCALAR_TAIL_FOLDING-NEXT: [[TMP30:%.*]] = zext nneg i32 [[IDX3]] to i64 +; SCALAR_TAIL_FOLDING-NEXT: [[ARRAY1IDX3:%.*]] = getelementptr inbounds nuw i8, ptr [[P]], i64 [[TMP30]] +; SCALAR_TAIL_FOLDING-NEXT: [[TMP31:%.*]] = load i8, ptr [[ARRAY1IDX3]], align 1 +; SCALAR_TAIL_FOLDING-NEXT: [[SPEC_SELECT_I1:%.*]] = call i8 @llvm.smax.i8(i8 [[TMP25]], i8 [[TMP27]]) +; SCALAR_TAIL_FOLDING-NEXT: [[SUB1:%.*]] = sub i8 0, [[SPEC_SELECT_I1]] +; SCALAR_TAIL_FOLDING-NEXT: [[SPEC_SELECT_I2:%.*]] = call i8 @llvm.smax.i8(i8 [[TMP29]], i8 [[TMP31]]) +; SCALAR_TAIL_FOLDING-NEXT: [[SUB2:%.*]] = sub i8 0, [[SPEC_SELECT_I2]] +; SCALAR_TAIL_FOLDING-NEXT: [[TMP32:%.*]] = zext nneg i32 [[IDX0]] to i64 +; SCALAR_TAIL_FOLDING-NEXT: [[ARRAY3IDX0:%.*]] = getelementptr inbounds nuw i8, ptr [[Q]], i64 [[TMP32]] +; SCALAR_TAIL_FOLDING-NEXT: store i8 [[SPEC_SELECT_I1]], ptr [[ARRAY3IDX0]], align 1 +; SCALAR_TAIL_FOLDING-NEXT: [[TMP33:%.*]] = zext nneg i32 [[IDX1]] to i64 +; SCALAR_TAIL_FOLDING-NEXT: [[ARRAY3IDX1:%.*]] = getelementptr inbounds nuw i8, ptr [[Q]], i64 [[TMP33]] +; SCALAR_TAIL_FOLDING-NEXT: store i8 [[SUB1]], ptr [[ARRAY3IDX1]], align 1 +; SCALAR_TAIL_FOLDING-NEXT: [[TMP34:%.*]] = zext nneg i32 [[IDX2]] to i64 +; SCALAR_TAIL_FOLDING-NEXT: [[ARRAY3IDX2:%.*]] = getelementptr inbounds nuw i8, ptr [[Q]], i64 [[TMP34]] +; SCALAR_TAIL_FOLDING-NEXT: store i8 [[SPEC_SELECT_I2]], ptr 
[[ARRAY3IDX2]], align 1 +; SCALAR_TAIL_FOLDING-NEXT: [[TMP35:%.*]] = zext nneg i32 [[IDX3]] to i64 +; SCALAR_TAIL_FOLDING-NEXT: [[ARRAY3IDX3:%.*]] = getelementptr inbounds nuw i8, ptr [[Q]], i64 [[TMP35]] +; SCALAR_TAIL_FOLDING-NEXT: store i8 [[SUB2]], ptr [[ARRAY3IDX3]], align 1 +; SCALAR_TAIL_FOLDING-NEXT: br label [[FOR_INC]] +; SCALAR_TAIL_FOLDING: for.inc: +; SCALAR_TAIL_FOLDING-NEXT: [[INC]] = add nuw nsw i32 [[IX_024]], 1 +; SCALAR_TAIL_FOLDING-NEXT: [[EXITCOND:%.*]] = icmp eq i32 [[INC]], 1024 +; SCALAR_TAIL_FOLDING-NEXT: br i1 [[EXITCOND]], label [[FOR_END]], label [[FOR_BODY]], !llvm.loop [[LOOP9:![0-9]+]] +; SCALAR_TAIL_FOLDING: for.end: +; SCALAR_TAIL_FOLDING-NEXT: ret void +; +; PREDICATED_TAIL_FOLDING-LABEL: define dso_local void @masked_strided_factor4 +; PREDICATED_TAIL_FOLDING-SAME: (ptr noalias nocapture readonly [[P:%.*]], ptr noalias nocapture [[Q:%.*]], i8 zeroext [[GUARD:%.*]]) local_unnamed_addr #[[ATTR0]] { +; PREDICATED_TAIL_FOLDING-NEXT: entry: +; PREDICATED_TAIL_FOLDING-NEXT: br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] +; PREDICATED_TAIL_FOLDING: vector.ph: +; PREDICATED_TAIL_FOLDING-NEXT: [[CONV:%.*]] = zext i8 [[GUARD]] to i32 +; PREDICATED_TAIL_FOLDING-NEXT: [[TMP0:%.*]] = call i32 @llvm.vscale.i32() +; PREDICATED_TAIL_FOLDING-NEXT: [[TMP1:%.*]] = shl i32 [[TMP0]], 4 +; PREDICATED_TAIL_FOLDING-NEXT: [[TMP2:%.*]] = call i32 @llvm.vscale.i32() +; PREDICATED_TAIL_FOLDING-NEXT: [[TMP3:%.*]] = shl i32 [[TMP2]], 4 +; PREDICATED_TAIL_FOLDING-NEXT: [[TMP4:%.*]] = call i32 @llvm.usub.sat.i32(i32 1024, i32 [[TMP3]]) +; PREDICATED_TAIL_FOLDING-NEXT: [[ACTIVE_LANE_MASK_ENTRY:%.*]] = call @llvm.get.active.lane.mask.nxv16i1.i32(i32 0, i32 1024) +; PREDICATED_TAIL_FOLDING-NEXT: [[TMP5:%.*]] = call @llvm.stepvector.nxv16i32() +; PREDICATED_TAIL_FOLDING-NEXT: [[DOTSPLATINSERT:%.*]] = insertelement poison, i32 [[TMP1]], i64 0 +; PREDICATED_TAIL_FOLDING-NEXT: [[DOTSPLAT:%.*]] = shufflevector [[DOTSPLATINSERT]], poison, zeroinitializer 
+; PREDICATED_TAIL_FOLDING-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement poison, i32 [[CONV]], i64 0 +; PREDICATED_TAIL_FOLDING-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector [[BROADCAST_SPLATINSERT]], poison, zeroinitializer +; PREDICATED_TAIL_FOLDING-NEXT: br label [[VECTOR_BODY:%.*]] +; PREDICATED_TAIL_FOLDING: vector.body: +; PREDICATED_TAIL_FOLDING-NEXT: [[INDEX:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] +; PREDICATED_TAIL_FOLDING-NEXT: [[ACTIVE_LANE_MASK:%.*]] = phi [ [[ACTIVE_LANE_MASK_ENTRY]], [[VECTOR_PH]] ], [ [[ACTIVE_LANE_MASK_NEXT:%.*]], [[VECTOR_BODY]] ] +; PREDICATED_TAIL_FOLDING-NEXT: [[VEC_IND:%.*]] = phi [ [[TMP5]], [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[VECTOR_BODY]] ] +; PREDICATED_TAIL_FOLDING-NEXT: [[TMP6:%.*]] = icmp ugt [[VEC_IND]], [[BROADCAST_SPLAT]] +; PREDICATED_TAIL_FOLDING-NEXT: [[TMP7:%.*]] = select [[ACTIVE_LANE_MASK]], [[TMP6]], zeroinitializer +; PREDICATED_TAIL_FOLDING-NEXT: [[TMP8:%.*]] = shl i32 [[INDEX]], 2 +; PREDICATED_TAIL_FOLDING-NEXT: [[TMP9:%.*]] = sext i32 [[TMP8]] to i64 +; PREDICATED_TAIL_FOLDING-NEXT: [[TMP10:%.*]] = getelementptr i8, ptr [[P]], i64 [[TMP9]] +; PREDICATED_TAIL_FOLDING-NEXT: [[INTERLEAVED_MASK:%.*]] = call @llvm.vector.interleave2.nxv32i1( [[TMP7]], [[TMP7]]) +; PREDICATED_TAIL_FOLDING-NEXT: [[INTERLEAVED_MASK1:%.*]] = call @llvm.vector.interleave2.nxv32i1( [[TMP7]], [[TMP7]]) +; PREDICATED_TAIL_FOLDING-NEXT: [[INTERLEAVED_MASK2:%.*]] = call @llvm.vector.interleave2.nxv64i1( [[INTERLEAVED_MASK]], [[INTERLEAVED_MASK1]]) +; PREDICATED_TAIL_FOLDING-NEXT: [[WIDE_MASKED_VEC:%.*]] = call @llvm.masked.load.nxv64i8.p0(ptr [[TMP10]], i32 1, [[INTERLEAVED_MASK2]], poison) +; PREDICATED_TAIL_FOLDING-NEXT: [[STRIDED_VEC:%.*]] = call { , } @llvm.vector.deinterleave2.nxv64i8( [[WIDE_MASKED_VEC]]) +; PREDICATED_TAIL_FOLDING-NEXT: [[TMP11:%.*]] = extractvalue { , } [[STRIDED_VEC]], 0 +; PREDICATED_TAIL_FOLDING-NEXT: [[TMP12:%.*]] = extractvalue { , } [[STRIDED_VEC]], 1 +; 
PREDICATED_TAIL_FOLDING-NEXT: [[STRIDED_VEC3:%.*]] = call { , } @llvm.vector.deinterleave2.nxv32i8( [[TMP11]]) +; PREDICATED_TAIL_FOLDING-NEXT: [[STRIDED_VEC4:%.*]] = call { , } @llvm.vector.deinterleave2.nxv32i8( [[TMP12]]) +; PREDICATED_TAIL_FOLDING-NEXT: [[TMP13:%.*]] = extractvalue { , } [[STRIDED_VEC3]], 0 +; PREDICATED_TAIL_FOLDING-NEXT: [[TMP14:%.*]] = extractvalue { , } [[STRIDED_VEC4]], 0 +; PREDICATED_TAIL_FOLDING-NEXT: [[TMP15:%.*]] = extractvalue { , } [[STRIDED_VEC3]], 1 +; PREDICATED_TAIL_FOLDING-NEXT: [[TMP16:%.*]] = extractvalue { , } [[STRIDED_VEC4]], 1 +; PREDICATED_TAIL_FOLDING-NEXT: [[TMP17:%.*]] = call @llvm.smax.nxv16i8( [[TMP13]], [[TMP14]]) +; PREDICATED_TAIL_FOLDING-NEXT: [[TMP18:%.*]] = sub zeroinitializer, [[TMP17]] +; PREDICATED_TAIL_FOLDING-NEXT: [[TMP19:%.*]] = call @llvm.smax.nxv16i8( [[TMP15]], [[TMP16]]) +; PREDICATED_TAIL_FOLDING-NEXT: [[TMP20:%.*]] = sub zeroinitializer, [[TMP19]] +; PREDICATED_TAIL_FOLDING-NEXT: [[TMP21:%.*]] = sext i32 [[TMP8]] to i64 +; PREDICATED_TAIL_FOLDING-NEXT: [[TMP22:%.*]] = getelementptr i8, ptr [[Q]], i64 [[TMP21]] +; PREDICATED_TAIL_FOLDING-NEXT: [[INTERLEAVED_VEC:%.*]] = call @llvm.vector.interleave2.nxv32i8( [[TMP17]], [[TMP19]]) +; PREDICATED_TAIL_FOLDING-NEXT: [[INTERLEAVED_VEC5:%.*]] = call @llvm.vector.interleave2.nxv32i8( [[TMP18]], [[TMP20]]) +; PREDICATED_TAIL_FOLDING-NEXT: [[INTERLEAVED_VEC6:%.*]] = call @llvm.vector.interleave2.nxv64i8( [[INTERLEAVED_VEC]], [[INTERLEAVED_VEC5]]) +; PREDICATED_TAIL_FOLDING-NEXT: [[INTERLEAVED_MASK7:%.*]] = call @llvm.vector.interleave2.nxv32i1( [[TMP7]], [[TMP7]]) +; PREDICATED_TAIL_FOLDING-NEXT: [[INTERLEAVED_MASK8:%.*]] = call @llvm.vector.interleave2.nxv32i1( [[TMP7]], [[TMP7]]) +; PREDICATED_TAIL_FOLDING-NEXT: [[INTERLEAVED_MASK9:%.*]] = call @llvm.vector.interleave2.nxv64i1( [[INTERLEAVED_MASK7]], [[INTERLEAVED_MASK8]]) +; PREDICATED_TAIL_FOLDING-NEXT: call void @llvm.masked.store.nxv64i8.p0( [[INTERLEAVED_VEC6]], ptr [[TMP22]], i32 1, 
[[INTERLEAVED_MASK9]]) +; PREDICATED_TAIL_FOLDING-NEXT: [[INDEX_NEXT]] = add i32 [[INDEX]], [[TMP1]] +; PREDICATED_TAIL_FOLDING-NEXT: [[ACTIVE_LANE_MASK_NEXT]] = call @llvm.get.active.lane.mask.nxv16i1.i32(i32 [[INDEX]], i32 [[TMP4]]) +; PREDICATED_TAIL_FOLDING-NEXT: [[VEC_IND_NEXT]] = add [[VEC_IND]], [[DOTSPLAT]] +; PREDICATED_TAIL_FOLDING-NEXT: [[TMP23:%.*]] = extractelement [[ACTIVE_LANE_MASK_NEXT]], i64 0 +; PREDICATED_TAIL_FOLDING-NEXT: br i1 [[TMP23]], label [[VECTOR_BODY]], label [[MIDDLE_BLOCK:%.*]], !llvm.loop [[LOOP8:![0-9]+]] +; PREDICATED_TAIL_FOLDING: middle.block: +; PREDICATED_TAIL_FOLDING-NEXT: br i1 true, label [[FOR_END:%.*]], label [[SCALAR_PH]] +; PREDICATED_TAIL_FOLDING: scalar.ph: +; PREDICATED_TAIL_FOLDING-NEXT: br label [[FOR_BODY:%.*]] +; PREDICATED_TAIL_FOLDING: for.body: +; PREDICATED_TAIL_FOLDING-NEXT: br i1 poison, label [[IF_THEN:%.*]], label [[FOR_INC:%.*]] +; PREDICATED_TAIL_FOLDING: if.then: +; PREDICATED_TAIL_FOLDING-NEXT: br label [[FOR_INC]] +; PREDICATED_TAIL_FOLDING: for.inc: +; PREDICATED_TAIL_FOLDING-NEXT: br i1 poison, label [[FOR_END]], label [[FOR_BODY]], !llvm.loop [[LOOP9:![0-9]+]] +; PREDICATED_TAIL_FOLDING: for.end: +; PREDICATED_TAIL_FOLDING-NEXT: ret void +; +entry: + %conv = zext i8 %guard to i32 + br label %for.body + +for.body: + %ix.024 = phi i32 [ 0, %entry ], [ %inc, %for.inc ] + %cmp1 = icmp ugt i32 %ix.024, %conv + br i1 %cmp1, label %if.then, label %for.inc + +if.then: + %idx0 = shl nuw nsw i32 %ix.024, 2 + %idx1 = add i32 %idx0, 1 + %idx2 = add i32 %idx0, 2 + %idx3 = add i32 %idx0, 3 + + %array1idx0 = getelementptr inbounds i8, ptr %p, i32 %idx0 + %0 = load i8, ptr %array1idx0, align 1 + %array1idx1 = getelementptr inbounds i8, ptr %p, i32 %idx1 + %1 = load i8, ptr %array1idx1, align 1 + %array1idx2 = getelementptr inbounds i8, ptr %p, i32 %idx2 + %2 = load i8, ptr %array1idx2, align 1 + %array1idx3 = getelementptr inbounds i8, ptr %p, i32 %idx3 + %3 = load i8, ptr %array1idx3, align 1 + + %cmp.i1 = icmp 
slt i8 %0, %1 + %spec.select.i1 = select i1 %cmp.i1, i8 %1, i8 %0 + %sub1 = sub i8 0, %spec.select.i1 + %cmp.i2 = icmp slt i8 %2, %3 + %spec.select.i2 = select i1 %cmp.i2, i8 %3, i8 %2 + %sub2 = sub i8 0, %spec.select.i2 + + %array3idx0 = getelementptr inbounds i8, ptr %q, i32 %idx0 + store i8 %spec.select.i1, ptr %array3idx0, align 1 + %array3idx1 = getelementptr inbounds i8, ptr %q, i32 %idx1 + store i8 %sub1, ptr %array3idx1, align 1 + %array3idx2 = getelementptr inbounds i8, ptr %q, i32 %idx2 + store i8 %spec.select.i2, ptr %array3idx2, align 1 + %array3idx3 = getelementptr inbounds i8, ptr %q, i32 %idx3 + store i8 %sub2, ptr %array3idx3, align 1 + + br label %for.inc + +for.inc: + %inc = add nuw nsw i32 %ix.024, 1 + %exitcond = icmp eq i32 %inc, 1024 + br i1 %exitcond, label %for.end, label %for.body + +for.end: + ret void +} diff --git a/llvm/test/Transforms/LoopVectorize/RISCV/interleaved-accesses.ll b/llvm/test/Transforms/LoopVectorize/RISCV/interleaved-accesses.ll index bda4839dead51..b1ff589fe51bf 100644 --- a/llvm/test/Transforms/LoopVectorize/RISCV/interleaved-accesses.ll +++ b/llvm/test/Transforms/LoopVectorize/RISCV/interleaved-accesses.ll @@ -9,7 +9,7 @@ define void @load_store_factor2_i32(ptr %p) { ; CHECK-NEXT: [[TMP0:%.*]] = call i64 @llvm.vscale.i64() ; CHECK-NEXT: [[TMP1:%.*]] = mul i64 [[TMP0]], 4 ; CHECK-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 1024, [[TMP1]] -; CHECK-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] +; CHECK-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[ENTRY:%.*]] ; CHECK: vector.ph: ; CHECK-NEXT: [[TMP2:%.*]] = call i64 @llvm.vscale.i64() ; CHECK-NEXT: [[TMP3:%.*]] = mul i64 [[TMP2]], 4 @@ -17,88 +17,88 @@ define void @load_store_factor2_i32(ptr %p) { ; CHECK-NEXT: [[N_VEC:%.*]] = sub i64 1024, [[N_MOD_VF]] ; CHECK-NEXT: [[TMP4:%.*]] = call i64 @llvm.vscale.i64() ; CHECK-NEXT: [[TMP5:%.*]] = mul i64 [[TMP4]], 4 -; CHECK-NEXT: br label [[VECTOR_BODY:%.*]] +; CHECK-NEXT: 
br label [[LOOP:%.*]] ; CHECK: vector.body: -; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] -; CHECK-NEXT: [[TMP6:%.*]] = add i64 [[INDEX]], 0 -; CHECK-NEXT: [[TMP7:%.*]] = shl i64 [[TMP6]], 1 -; CHECK-NEXT: [[TMP8:%.*]] = getelementptr i32, ptr [[P:%.*]], i64 [[TMP7]] -; CHECK-NEXT: [[WIDE_VEC:%.*]] = load , ptr [[TMP8]], align 4 +; CHECK-NEXT: [[I:%.*]] = phi i64 [ 0, [[ENTRY]] ], [ [[NEXTI:%.*]], [[LOOP]] ] +; CHECK-NEXT: [[TMP6:%.*]] = add i64 [[I]], 0 +; CHECK-NEXT: [[OFFSET0:%.*]] = shl i64 [[TMP6]], 1 +; CHECK-NEXT: [[Q0:%.*]] = getelementptr i32, ptr [[P:%.*]], i64 [[OFFSET0]] +; CHECK-NEXT: [[WIDE_VEC:%.*]] = load , ptr [[Q0]], align 4 ; CHECK-NEXT: [[STRIDED_VEC:%.*]] = call { , } @llvm.vector.deinterleave2.nxv8i32( [[WIDE_VEC]]) -; CHECK-NEXT: [[TMP10:%.*]] = extractvalue { , } [[STRIDED_VEC]], 0 -; CHECK-NEXT: [[TMP11:%.*]] = extractvalue { , } [[STRIDED_VEC]], 1 -; CHECK-NEXT: [[TMP12:%.*]] = add [[TMP10]], splat (i32 1) -; CHECK-NEXT: [[TMP15:%.*]] = add [[TMP11]], splat (i32 2) -; CHECK-NEXT: [[INTERLEAVED_VEC:%.*]] = call @llvm.vector.interleave2.nxv8i32( [[TMP12]], [[TMP15]]) -; CHECK-NEXT: store [[INTERLEAVED_VEC]], ptr [[TMP8]], align 4 -; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP5]] -; CHECK-NEXT: [[TMP17:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] -; CHECK-NEXT: br i1 [[TMP17]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]] +; CHECK-NEXT: [[TMP9:%.*]] = extractvalue { , } [[STRIDED_VEC]], 0 +; CHECK-NEXT: [[TMP10:%.*]] = extractvalue { , } [[STRIDED_VEC]], 1 +; CHECK-NEXT: [[TMP11:%.*]] = add [[TMP9]], splat (i32 1) +; CHECK-NEXT: [[TMP12:%.*]] = add [[TMP10]], splat (i32 2) +; CHECK-NEXT: [[INTERLEAVED_VEC:%.*]] = call @llvm.vector.interleave2.nxv8i32( [[TMP11]], [[TMP12]]) +; CHECK-NEXT: store [[INTERLEAVED_VEC]], ptr [[Q0]], align 4 +; CHECK-NEXT: [[NEXTI]] = add nuw i64 [[I]], [[TMP5]] +; CHECK-NEXT: [[TMP13:%.*]] = icmp eq i64 
[[NEXTI]], [[N_VEC]] +; CHECK-NEXT: br i1 [[TMP13]], label [[MIDDLE_BLOCK:%.*]], label [[LOOP]], !llvm.loop [[LOOP0:![0-9]+]] ; CHECK: middle.block: ; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 1024, [[N_VEC]] ; CHECK-NEXT: br i1 [[CMP_N]], label [[EXIT:%.*]], label [[SCALAR_PH]] ; CHECK: scalar.ph: -; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ] -; CHECK-NEXT: br label [[LOOP:%.*]] +; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY1:%.*]] ] +; CHECK-NEXT: br label [[LOOP1:%.*]] ; CHECK: loop: -; CHECK-NEXT: [[I:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[NEXTI:%.*]], [[LOOP]] ] -; CHECK-NEXT: [[OFFSET0:%.*]] = shl i64 [[I]], 1 -; CHECK-NEXT: [[Q0:%.*]] = getelementptr i32, ptr [[P]], i64 [[OFFSET0]] -; CHECK-NEXT: [[X0:%.*]] = load i32, ptr [[Q0]], align 4 +; CHECK-NEXT: [[I1:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[NEXTI1:%.*]], [[LOOP1]] ] +; CHECK-NEXT: [[OFFSET2:%.*]] = shl i64 [[I1]], 1 +; CHECK-NEXT: [[Q2:%.*]] = getelementptr i32, ptr [[P]], i64 [[OFFSET2]] +; CHECK-NEXT: [[X0:%.*]] = load i32, ptr [[Q2]], align 4 ; CHECK-NEXT: [[Y0:%.*]] = add i32 [[X0]], 1 -; CHECK-NEXT: store i32 [[Y0]], ptr [[Q0]], align 4 -; CHECK-NEXT: [[OFFSET1:%.*]] = add i64 [[OFFSET0]], 1 +; CHECK-NEXT: store i32 [[Y0]], ptr [[Q2]], align 4 +; CHECK-NEXT: [[OFFSET1:%.*]] = add i64 [[OFFSET2]], 1 ; CHECK-NEXT: [[Q1:%.*]] = getelementptr i32, ptr [[P]], i64 [[OFFSET1]] ; CHECK-NEXT: [[X1:%.*]] = load i32, ptr [[Q1]], align 4 ; CHECK-NEXT: [[Y1:%.*]] = add i32 [[X1]], 2 ; CHECK-NEXT: store i32 [[Y1]], ptr [[Q1]], align 4 -; CHECK-NEXT: [[NEXTI]] = add i64 [[I]], 1 -; CHECK-NEXT: [[DONE:%.*]] = icmp eq i64 [[NEXTI]], 1024 -; CHECK-NEXT: br i1 [[DONE]], label [[EXIT]], label [[LOOP]], !llvm.loop [[LOOP3:![0-9]+]] +; CHECK-NEXT: [[NEXTI1]] = add i64 [[I1]], 1 +; CHECK-NEXT: [[DONE:%.*]] = icmp eq i64 [[NEXTI1]], 1024 +; CHECK-NEXT: br i1 [[DONE]], label [[EXIT]], 
label [[LOOP1]], !llvm.loop [[LOOP3:![0-9]+]] ; CHECK: exit: ; CHECK-NEXT: ret void ; ; FIXED-LABEL: @load_store_factor2_i32( ; FIXED-NEXT: entry: -; FIXED-NEXT: br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] +; FIXED-NEXT: br i1 false, label [[SCALAR_PH:%.*]], label [[ENTRY:%.*]] ; FIXED: vector.ph: -; FIXED-NEXT: br label [[VECTOR_BODY:%.*]] +; FIXED-NEXT: br label [[LOOP:%.*]] ; FIXED: vector.body: -; FIXED-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] -; FIXED-NEXT: [[TMP0:%.*]] = add i64 [[INDEX]], 0 -; FIXED-NEXT: [[TMP1:%.*]] = shl i64 [[TMP0]], 1 -; FIXED-NEXT: [[TMP2:%.*]] = getelementptr i32, ptr [[P:%.*]], i64 [[TMP1]] -; FIXED-NEXT: [[WIDE_VEC:%.*]] = load <16 x i32>, ptr [[TMP2]], align 4 +; FIXED-NEXT: [[I:%.*]] = phi i64 [ 0, [[ENTRY]] ], [ [[NEXTI:%.*]], [[LOOP]] ] +; FIXED-NEXT: [[TMP0:%.*]] = add i64 [[I]], 0 +; FIXED-NEXT: [[OFFSET0:%.*]] = shl i64 [[TMP0]], 1 +; FIXED-NEXT: [[Q0:%.*]] = getelementptr i32, ptr [[P:%.*]], i64 [[OFFSET0]] +; FIXED-NEXT: [[WIDE_VEC:%.*]] = load <16 x i32>, ptr [[Q0]], align 4 ; FIXED-NEXT: [[STRIDED_VEC:%.*]] = shufflevector <16 x i32> [[WIDE_VEC]], <16 x i32> poison, <8 x i32> ; FIXED-NEXT: [[STRIDED_VEC1:%.*]] = shufflevector <16 x i32> [[WIDE_VEC]], <16 x i32> poison, <8 x i32> -; FIXED-NEXT: [[TMP4:%.*]] = add <8 x i32> [[STRIDED_VEC]], splat (i32 1) -; FIXED-NEXT: [[TMP7:%.*]] = add <8 x i32> [[STRIDED_VEC1]], splat (i32 2) -; FIXED-NEXT: [[TMP9:%.*]] = shufflevector <8 x i32> [[TMP4]], <8 x i32> [[TMP7]], <16 x i32> -; FIXED-NEXT: [[INTERLEAVED_VEC:%.*]] = shufflevector <16 x i32> [[TMP9]], <16 x i32> poison, <16 x i32> -; FIXED-NEXT: store <16 x i32> [[INTERLEAVED_VEC]], ptr [[TMP2]], align 4 -; FIXED-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 8 -; FIXED-NEXT: [[TMP10:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1024 -; FIXED-NEXT: br i1 [[TMP10]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]] +; FIXED-NEXT: [[TMP3:%.*]] 
= add <8 x i32> [[STRIDED_VEC]], splat (i32 1) +; FIXED-NEXT: [[TMP4:%.*]] = add <8 x i32> [[STRIDED_VEC1]], splat (i32 2) +; FIXED-NEXT: [[TMP5:%.*]] = shufflevector <8 x i32> [[TMP3]], <8 x i32> [[TMP4]], <16 x i32> +; FIXED-NEXT: [[INTERLEAVED_VEC:%.*]] = shufflevector <16 x i32> [[TMP5]], <16 x i32> poison, <16 x i32> +; FIXED-NEXT: store <16 x i32> [[INTERLEAVED_VEC]], ptr [[Q0]], align 4 +; FIXED-NEXT: [[NEXTI]] = add nuw i64 [[I]], 8 +; FIXED-NEXT: [[TMP6:%.*]] = icmp eq i64 [[NEXTI]], 1024 +; FIXED-NEXT: br i1 [[TMP6]], label [[MIDDLE_BLOCK:%.*]], label [[LOOP]], !llvm.loop [[LOOP0:![0-9]+]] ; FIXED: middle.block: ; FIXED-NEXT: br i1 true, label [[EXIT:%.*]], label [[SCALAR_PH]] ; FIXED: scalar.ph: -; FIXED-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 1024, [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ] -; FIXED-NEXT: br label [[LOOP:%.*]] +; FIXED-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 1024, [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY1:%.*]] ] +; FIXED-NEXT: br label [[LOOP1:%.*]] ; FIXED: loop: -; FIXED-NEXT: [[I:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[NEXTI:%.*]], [[LOOP]] ] -; FIXED-NEXT: [[OFFSET0:%.*]] = shl i64 [[I]], 1 -; FIXED-NEXT: [[Q0:%.*]] = getelementptr i32, ptr [[P]], i64 [[OFFSET0]] -; FIXED-NEXT: [[X0:%.*]] = load i32, ptr [[Q0]], align 4 +; FIXED-NEXT: [[I1:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[NEXTI1:%.*]], [[LOOP1]] ] +; FIXED-NEXT: [[OFFSET2:%.*]] = shl i64 [[I1]], 1 +; FIXED-NEXT: [[Q2:%.*]] = getelementptr i32, ptr [[P]], i64 [[OFFSET2]] +; FIXED-NEXT: [[X0:%.*]] = load i32, ptr [[Q2]], align 4 ; FIXED-NEXT: [[Y0:%.*]] = add i32 [[X0]], 1 -; FIXED-NEXT: store i32 [[Y0]], ptr [[Q0]], align 4 -; FIXED-NEXT: [[OFFSET1:%.*]] = add i64 [[OFFSET0]], 1 +; FIXED-NEXT: store i32 [[Y0]], ptr [[Q2]], align 4 +; FIXED-NEXT: [[OFFSET1:%.*]] = add i64 [[OFFSET2]], 1 ; FIXED-NEXT: [[Q1:%.*]] = getelementptr i32, ptr [[P]], i64 [[OFFSET1]] ; FIXED-NEXT: [[X1:%.*]] = load i32, ptr [[Q1]], align 4 ; FIXED-NEXT: [[Y1:%.*]] = add i32 
[[X1]], 2 ; FIXED-NEXT: store i32 [[Y1]], ptr [[Q1]], align 4 -; FIXED-NEXT: [[NEXTI]] = add i64 [[I]], 1 -; FIXED-NEXT: [[DONE:%.*]] = icmp eq i64 [[NEXTI]], 1024 -; FIXED-NEXT: br i1 [[DONE]], label [[EXIT]], label [[LOOP]], !llvm.loop [[LOOP3:![0-9]+]] +; FIXED-NEXT: [[NEXTI1]] = add i64 [[I1]], 1 +; FIXED-NEXT: [[DONE:%.*]] = icmp eq i64 [[NEXTI1]], 1024 +; FIXED-NEXT: br i1 [[DONE]], label [[EXIT]], label [[LOOP1]], !llvm.loop [[LOOP3:![0-9]+]] ; FIXED: exit: ; FIXED-NEXT: ret void ; @@ -107,7 +107,7 @@ define void @load_store_factor2_i32(ptr %p) { ; SCALABLE-NEXT: [[TMP0:%.*]] = call i64 @llvm.vscale.i64() ; SCALABLE-NEXT: [[TMP1:%.*]] = mul i64 [[TMP0]], 4 ; SCALABLE-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 1024, [[TMP1]] -; SCALABLE-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] +; SCALABLE-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[ENTRY:%.*]] ; SCALABLE: vector.ph: ; SCALABLE-NEXT: [[TMP2:%.*]] = call i64 @llvm.vscale.i64() ; SCALABLE-NEXT: [[TMP3:%.*]] = mul i64 [[TMP2]], 4 @@ -115,44 +115,44 @@ define void @load_store_factor2_i32(ptr %p) { ; SCALABLE-NEXT: [[N_VEC:%.*]] = sub i64 1024, [[N_MOD_VF]] ; SCALABLE-NEXT: [[TMP4:%.*]] = call i64 @llvm.vscale.i64() ; SCALABLE-NEXT: [[TMP5:%.*]] = mul i64 [[TMP4]], 4 -; SCALABLE-NEXT: br label [[VECTOR_BODY:%.*]] +; SCALABLE-NEXT: br label [[LOOP:%.*]] ; SCALABLE: vector.body: -; SCALABLE-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] -; SCALABLE-NEXT: [[TMP6:%.*]] = add i64 [[INDEX]], 0 -; SCALABLE-NEXT: [[TMP7:%.*]] = shl i64 [[TMP6]], 1 -; SCALABLE-NEXT: [[TMP8:%.*]] = getelementptr i32, ptr [[P:%.*]], i64 [[TMP7]] -; SCALABLE-NEXT: [[WIDE_VEC:%.*]] = load , ptr [[TMP8]], align 4 +; SCALABLE-NEXT: [[I:%.*]] = phi i64 [ 0, [[ENTRY]] ], [ [[NEXTI:%.*]], [[LOOP]] ] +; SCALABLE-NEXT: [[TMP6:%.*]] = add i64 [[I]], 0 +; SCALABLE-NEXT: [[OFFSET0:%.*]] = shl i64 [[TMP6]], 1 +; SCALABLE-NEXT: [[Q0:%.*]] = 
getelementptr i32, ptr [[P:%.*]], i64 [[OFFSET0]] +; SCALABLE-NEXT: [[WIDE_VEC:%.*]] = load , ptr [[Q0]], align 4 ; SCALABLE-NEXT: [[STRIDED_VEC:%.*]] = call { , } @llvm.vector.deinterleave2.nxv8i32( [[WIDE_VEC]]) -; SCALABLE-NEXT: [[TMP10:%.*]] = extractvalue { , } [[STRIDED_VEC]], 0 -; SCALABLE-NEXT: [[TMP11:%.*]] = extractvalue { , } [[STRIDED_VEC]], 1 -; SCALABLE-NEXT: [[TMP12:%.*]] = add [[TMP10]], splat (i32 1) -; SCALABLE-NEXT: [[TMP15:%.*]] = add [[TMP11]], splat (i32 2) -; SCALABLE-NEXT: [[INTERLEAVED_VEC:%.*]] = call @llvm.vector.interleave2.nxv8i32( [[TMP12]], [[TMP15]]) -; SCALABLE-NEXT: store [[INTERLEAVED_VEC]], ptr [[TMP8]], align 4 -; SCALABLE-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP5]] -; SCALABLE-NEXT: [[TMP17:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] -; SCALABLE-NEXT: br i1 [[TMP17]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]] +; SCALABLE-NEXT: [[TMP9:%.*]] = extractvalue { , } [[STRIDED_VEC]], 0 +; SCALABLE-NEXT: [[TMP10:%.*]] = extractvalue { , } [[STRIDED_VEC]], 1 +; SCALABLE-NEXT: [[TMP11:%.*]] = add [[TMP9]], splat (i32 1) +; SCALABLE-NEXT: [[TMP12:%.*]] = add [[TMP10]], splat (i32 2) +; SCALABLE-NEXT: [[INTERLEAVED_VEC:%.*]] = call @llvm.vector.interleave2.nxv8i32( [[TMP11]], [[TMP12]]) +; SCALABLE-NEXT: store [[INTERLEAVED_VEC]], ptr [[Q0]], align 4 +; SCALABLE-NEXT: [[NEXTI]] = add nuw i64 [[I]], [[TMP5]] +; SCALABLE-NEXT: [[TMP13:%.*]] = icmp eq i64 [[NEXTI]], [[N_VEC]] +; SCALABLE-NEXT: br i1 [[TMP13]], label [[MIDDLE_BLOCK:%.*]], label [[LOOP]], !llvm.loop [[LOOP0:![0-9]+]] ; SCALABLE: middle.block: ; SCALABLE-NEXT: [[CMP_N:%.*]] = icmp eq i64 1024, [[N_VEC]] ; SCALABLE-NEXT: br i1 [[CMP_N]], label [[EXIT:%.*]], label [[SCALAR_PH]] ; SCALABLE: scalar.ph: -; SCALABLE-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ] -; SCALABLE-NEXT: br label [[LOOP:%.*]] +; SCALABLE-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, 
[[ENTRY1:%.*]] ] +; SCALABLE-NEXT: br label [[LOOP1:%.*]] ; SCALABLE: loop: -; SCALABLE-NEXT: [[I:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[NEXTI:%.*]], [[LOOP]] ] -; SCALABLE-NEXT: [[OFFSET0:%.*]] = shl i64 [[I]], 1 -; SCALABLE-NEXT: [[Q0:%.*]] = getelementptr i32, ptr [[P]], i64 [[OFFSET0]] -; SCALABLE-NEXT: [[X0:%.*]] = load i32, ptr [[Q0]], align 4 +; SCALABLE-NEXT: [[I1:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[NEXTI1:%.*]], [[LOOP1]] ] +; SCALABLE-NEXT: [[OFFSET2:%.*]] = shl i64 [[I1]], 1 +; SCALABLE-NEXT: [[Q2:%.*]] = getelementptr i32, ptr [[P]], i64 [[OFFSET2]] +; SCALABLE-NEXT: [[X0:%.*]] = load i32, ptr [[Q2]], align 4 ; SCALABLE-NEXT: [[Y0:%.*]] = add i32 [[X0]], 1 -; SCALABLE-NEXT: store i32 [[Y0]], ptr [[Q0]], align 4 -; SCALABLE-NEXT: [[OFFSET1:%.*]] = add i64 [[OFFSET0]], 1 +; SCALABLE-NEXT: store i32 [[Y0]], ptr [[Q2]], align 4 +; SCALABLE-NEXT: [[OFFSET1:%.*]] = add i64 [[OFFSET2]], 1 ; SCALABLE-NEXT: [[Q1:%.*]] = getelementptr i32, ptr [[P]], i64 [[OFFSET1]] ; SCALABLE-NEXT: [[X1:%.*]] = load i32, ptr [[Q1]], align 4 ; SCALABLE-NEXT: [[Y1:%.*]] = add i32 [[X1]], 2 ; SCALABLE-NEXT: store i32 [[Y1]], ptr [[Q1]], align 4 -; SCALABLE-NEXT: [[NEXTI]] = add i64 [[I]], 1 -; SCALABLE-NEXT: [[DONE:%.*]] = icmp eq i64 [[NEXTI]], 1024 -; SCALABLE-NEXT: br i1 [[DONE]], label [[EXIT]], label [[LOOP]], !llvm.loop [[LOOP3:![0-9]+]] +; SCALABLE-NEXT: [[NEXTI1]] = add i64 [[I1]], 1 +; SCALABLE-NEXT: [[DONE:%.*]] = icmp eq i64 [[NEXTI1]], 1024 +; SCALABLE-NEXT: br i1 [[DONE]], label [[EXIT]], label [[LOOP1]], !llvm.loop [[LOOP3:![0-9]+]] ; SCALABLE: exit: ; SCALABLE-NEXT: ret void ; @@ -186,7 +186,7 @@ define void @load_store_factor2_i64(ptr %p) { ; CHECK-NEXT: [[TMP0:%.*]] = call i64 @llvm.vscale.i64() ; CHECK-NEXT: [[TMP1:%.*]] = mul i64 [[TMP0]], 2 ; CHECK-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 1024, [[TMP1]] -; CHECK-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] +; CHECK-NEXT: br i1 
[[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[ENTRY:%.*]] ; CHECK: vector.ph: ; CHECK-NEXT: [[TMP2:%.*]] = call i64 @llvm.vscale.i64() ; CHECK-NEXT: [[TMP3:%.*]] = mul i64 [[TMP2]], 2 @@ -194,88 +194,88 @@ define void @load_store_factor2_i64(ptr %p) { ; CHECK-NEXT: [[N_VEC:%.*]] = sub i64 1024, [[N_MOD_VF]] ; CHECK-NEXT: [[TMP4:%.*]] = call i64 @llvm.vscale.i64() ; CHECK-NEXT: [[TMP5:%.*]] = mul i64 [[TMP4]], 2 -; CHECK-NEXT: br label [[VECTOR_BODY:%.*]] +; CHECK-NEXT: br label [[LOOP:%.*]] ; CHECK: vector.body: -; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] -; CHECK-NEXT: [[TMP6:%.*]] = add i64 [[INDEX]], 0 -; CHECK-NEXT: [[TMP7:%.*]] = shl i64 [[TMP6]], 1 -; CHECK-NEXT: [[TMP8:%.*]] = getelementptr i64, ptr [[P:%.*]], i64 [[TMP7]] -; CHECK-NEXT: [[WIDE_VEC:%.*]] = load , ptr [[TMP8]], align 8 +; CHECK-NEXT: [[I:%.*]] = phi i64 [ 0, [[ENTRY]] ], [ [[NEXTI:%.*]], [[LOOP]] ] +; CHECK-NEXT: [[TMP6:%.*]] = add i64 [[I]], 0 +; CHECK-NEXT: [[OFFSET0:%.*]] = shl i64 [[TMP6]], 1 +; CHECK-NEXT: [[Q0:%.*]] = getelementptr i64, ptr [[P:%.*]], i64 [[OFFSET0]] +; CHECK-NEXT: [[WIDE_VEC:%.*]] = load , ptr [[Q0]], align 8 ; CHECK-NEXT: [[STRIDED_VEC:%.*]] = call { , } @llvm.vector.deinterleave2.nxv4i64( [[WIDE_VEC]]) -; CHECK-NEXT: [[TMP10:%.*]] = extractvalue { , } [[STRIDED_VEC]], 0 -; CHECK-NEXT: [[TMP11:%.*]] = extractvalue { , } [[STRIDED_VEC]], 1 -; CHECK-NEXT: [[TMP12:%.*]] = add [[TMP10]], splat (i64 1) -; CHECK-NEXT: [[TMP15:%.*]] = add [[TMP11]], splat (i64 2) -; CHECK-NEXT: [[INTERLEAVED_VEC:%.*]] = call @llvm.vector.interleave2.nxv4i64( [[TMP12]], [[TMP15]]) -; CHECK-NEXT: store [[INTERLEAVED_VEC]], ptr [[TMP8]], align 8 -; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP5]] -; CHECK-NEXT: [[TMP17:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] -; CHECK-NEXT: br i1 [[TMP17]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]] +; CHECK-NEXT: [[TMP9:%.*]] = extractvalue { 
, } [[STRIDED_VEC]], 0 +; CHECK-NEXT: [[TMP10:%.*]] = extractvalue { , } [[STRIDED_VEC]], 1 +; CHECK-NEXT: [[TMP11:%.*]] = add [[TMP9]], splat (i64 1) +; CHECK-NEXT: [[TMP12:%.*]] = add [[TMP10]], splat (i64 2) +; CHECK-NEXT: [[INTERLEAVED_VEC:%.*]] = call @llvm.vector.interleave2.nxv4i64( [[TMP11]], [[TMP12]]) +; CHECK-NEXT: store [[INTERLEAVED_VEC]], ptr [[Q0]], align 8 +; CHECK-NEXT: [[NEXTI]] = add nuw i64 [[I]], [[TMP5]] +; CHECK-NEXT: [[TMP13:%.*]] = icmp eq i64 [[NEXTI]], [[N_VEC]] +; CHECK-NEXT: br i1 [[TMP13]], label [[MIDDLE_BLOCK:%.*]], label [[LOOP]], !llvm.loop [[LOOP4:![0-9]+]] ; CHECK: middle.block: ; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 1024, [[N_VEC]] ; CHECK-NEXT: br i1 [[CMP_N]], label [[EXIT:%.*]], label [[SCALAR_PH]] ; CHECK: scalar.ph: -; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ] -; CHECK-NEXT: br label [[LOOP:%.*]] +; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY1:%.*]] ] +; CHECK-NEXT: br label [[LOOP1:%.*]] ; CHECK: loop: -; CHECK-NEXT: [[I:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[NEXTI:%.*]], [[LOOP]] ] -; CHECK-NEXT: [[OFFSET0:%.*]] = shl i64 [[I]], 1 -; CHECK-NEXT: [[Q0:%.*]] = getelementptr i64, ptr [[P]], i64 [[OFFSET0]] -; CHECK-NEXT: [[X0:%.*]] = load i64, ptr [[Q0]], align 8 +; CHECK-NEXT: [[I1:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[NEXTI1:%.*]], [[LOOP1]] ] +; CHECK-NEXT: [[OFFSET2:%.*]] = shl i64 [[I1]], 1 +; CHECK-NEXT: [[Q2:%.*]] = getelementptr i64, ptr [[P]], i64 [[OFFSET2]] +; CHECK-NEXT: [[X0:%.*]] = load i64, ptr [[Q2]], align 8 ; CHECK-NEXT: [[Y0:%.*]] = add i64 [[X0]], 1 -; CHECK-NEXT: store i64 [[Y0]], ptr [[Q0]], align 8 -; CHECK-NEXT: [[OFFSET1:%.*]] = add i64 [[OFFSET0]], 1 +; CHECK-NEXT: store i64 [[Y0]], ptr [[Q2]], align 8 +; CHECK-NEXT: [[OFFSET1:%.*]] = add i64 [[OFFSET2]], 1 ; CHECK-NEXT: [[Q1:%.*]] = getelementptr i64, ptr [[P]], i64 [[OFFSET1]] ; CHECK-NEXT: [[X1:%.*]] = 
load i64, ptr [[Q1]], align 8 ; CHECK-NEXT: [[Y1:%.*]] = add i64 [[X1]], 2 ; CHECK-NEXT: store i64 [[Y1]], ptr [[Q1]], align 8 -; CHECK-NEXT: [[NEXTI]] = add i64 [[I]], 1 -; CHECK-NEXT: [[DONE:%.*]] = icmp eq i64 [[NEXTI]], 1024 -; CHECK-NEXT: br i1 [[DONE]], label [[EXIT]], label [[LOOP]], !llvm.loop [[LOOP5:![0-9]+]] +; CHECK-NEXT: [[NEXTI1]] = add i64 [[I1]], 1 +; CHECK-NEXT: [[DONE:%.*]] = icmp eq i64 [[NEXTI1]], 1024 +; CHECK-NEXT: br i1 [[DONE]], label [[EXIT]], label [[LOOP1]], !llvm.loop [[LOOP5:![0-9]+]] ; CHECK: exit: ; CHECK-NEXT: ret void ; ; FIXED-LABEL: @load_store_factor2_i64( ; FIXED-NEXT: entry: -; FIXED-NEXT: br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] +; FIXED-NEXT: br i1 false, label [[SCALAR_PH:%.*]], label [[ENTRY:%.*]] ; FIXED: vector.ph: -; FIXED-NEXT: br label [[VECTOR_BODY:%.*]] +; FIXED-NEXT: br label [[LOOP:%.*]] ; FIXED: vector.body: -; FIXED-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] -; FIXED-NEXT: [[TMP0:%.*]] = add i64 [[INDEX]], 0 -; FIXED-NEXT: [[TMP1:%.*]] = shl i64 [[TMP0]], 1 -; FIXED-NEXT: [[TMP2:%.*]] = getelementptr i64, ptr [[P:%.*]], i64 [[TMP1]] -; FIXED-NEXT: [[WIDE_VEC:%.*]] = load <8 x i64>, ptr [[TMP2]], align 8 +; FIXED-NEXT: [[I:%.*]] = phi i64 [ 0, [[ENTRY]] ], [ [[NEXTI:%.*]], [[LOOP]] ] +; FIXED-NEXT: [[TMP0:%.*]] = add i64 [[I]], 0 +; FIXED-NEXT: [[OFFSET0:%.*]] = shl i64 [[TMP0]], 1 +; FIXED-NEXT: [[Q0:%.*]] = getelementptr i64, ptr [[P:%.*]], i64 [[OFFSET0]] +; FIXED-NEXT: [[WIDE_VEC:%.*]] = load <8 x i64>, ptr [[Q0]], align 8 ; FIXED-NEXT: [[STRIDED_VEC:%.*]] = shufflevector <8 x i64> [[WIDE_VEC]], <8 x i64> poison, <4 x i32> ; FIXED-NEXT: [[STRIDED_VEC1:%.*]] = shufflevector <8 x i64> [[WIDE_VEC]], <8 x i64> poison, <4 x i32> -; FIXED-NEXT: [[TMP4:%.*]] = add <4 x i64> [[STRIDED_VEC]], splat (i64 1) -; FIXED-NEXT: [[TMP7:%.*]] = add <4 x i64> [[STRIDED_VEC1]], splat (i64 2) -; FIXED-NEXT: [[TMP9:%.*]] = shufflevector <4 x i64> [[TMP4]], <4 x 
i64> [[TMP7]], <8 x i32> -; FIXED-NEXT: [[INTERLEAVED_VEC:%.*]] = shufflevector <8 x i64> [[TMP9]], <8 x i64> poison, <8 x i32> -; FIXED-NEXT: store <8 x i64> [[INTERLEAVED_VEC]], ptr [[TMP2]], align 8 -; FIXED-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4 -; FIXED-NEXT: [[TMP10:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1024 -; FIXED-NEXT: br i1 [[TMP10]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]] +; FIXED-NEXT: [[TMP3:%.*]] = add <4 x i64> [[STRIDED_VEC]], splat (i64 1) +; FIXED-NEXT: [[TMP4:%.*]] = add <4 x i64> [[STRIDED_VEC1]], splat (i64 2) +; FIXED-NEXT: [[TMP5:%.*]] = shufflevector <4 x i64> [[TMP3]], <4 x i64> [[TMP4]], <8 x i32> +; FIXED-NEXT: [[INTERLEAVED_VEC:%.*]] = shufflevector <8 x i64> [[TMP5]], <8 x i64> poison, <8 x i32> +; FIXED-NEXT: store <8 x i64> [[INTERLEAVED_VEC]], ptr [[Q0]], align 8 +; FIXED-NEXT: [[NEXTI]] = add nuw i64 [[I]], 4 +; FIXED-NEXT: [[TMP6:%.*]] = icmp eq i64 [[NEXTI]], 1024 +; FIXED-NEXT: br i1 [[TMP6]], label [[MIDDLE_BLOCK:%.*]], label [[LOOP]], !llvm.loop [[LOOP4:![0-9]+]] ; FIXED: middle.block: ; FIXED-NEXT: br i1 true, label [[EXIT:%.*]], label [[SCALAR_PH]] ; FIXED: scalar.ph: -; FIXED-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 1024, [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ] -; FIXED-NEXT: br label [[LOOP:%.*]] +; FIXED-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 1024, [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY1:%.*]] ] +; FIXED-NEXT: br label [[LOOP1:%.*]] ; FIXED: loop: -; FIXED-NEXT: [[I:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[NEXTI:%.*]], [[LOOP]] ] -; FIXED-NEXT: [[OFFSET0:%.*]] = shl i64 [[I]], 1 -; FIXED-NEXT: [[Q0:%.*]] = getelementptr i64, ptr [[P]], i64 [[OFFSET0]] -; FIXED-NEXT: [[X0:%.*]] = load i64, ptr [[Q0]], align 8 +; FIXED-NEXT: [[I1:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[NEXTI1:%.*]], [[LOOP1]] ] +; FIXED-NEXT: [[OFFSET2:%.*]] = shl i64 [[I1]], 1 +; FIXED-NEXT: [[Q2:%.*]] = getelementptr i64, ptr [[P]], i64 [[OFFSET2]] +; FIXED-NEXT: [[X0:%.*]] = 
load i64, ptr [[Q2]], align 8 ; FIXED-NEXT: [[Y0:%.*]] = add i64 [[X0]], 1 -; FIXED-NEXT: store i64 [[Y0]], ptr [[Q0]], align 8 -; FIXED-NEXT: [[OFFSET1:%.*]] = add i64 [[OFFSET0]], 1 +; FIXED-NEXT: store i64 [[Y0]], ptr [[Q2]], align 8 +; FIXED-NEXT: [[OFFSET1:%.*]] = add i64 [[OFFSET2]], 1 ; FIXED-NEXT: [[Q1:%.*]] = getelementptr i64, ptr [[P]], i64 [[OFFSET1]] ; FIXED-NEXT: [[X1:%.*]] = load i64, ptr [[Q1]], align 8 ; FIXED-NEXT: [[Y1:%.*]] = add i64 [[X1]], 2 ; FIXED-NEXT: store i64 [[Y1]], ptr [[Q1]], align 8 -; FIXED-NEXT: [[NEXTI]] = add i64 [[I]], 1 -; FIXED-NEXT: [[DONE:%.*]] = icmp eq i64 [[NEXTI]], 1024 -; FIXED-NEXT: br i1 [[DONE]], label [[EXIT]], label [[LOOP]], !llvm.loop [[LOOP5:![0-9]+]] +; FIXED-NEXT: [[NEXTI1]] = add i64 [[I1]], 1 +; FIXED-NEXT: [[DONE:%.*]] = icmp eq i64 [[NEXTI1]], 1024 +; FIXED-NEXT: br i1 [[DONE]], label [[EXIT]], label [[LOOP1]], !llvm.loop [[LOOP5:![0-9]+]] ; FIXED: exit: ; FIXED-NEXT: ret void ; @@ -284,7 +284,7 @@ define void @load_store_factor2_i64(ptr %p) { ; SCALABLE-NEXT: [[TMP0:%.*]] = call i64 @llvm.vscale.i64() ; SCALABLE-NEXT: [[TMP1:%.*]] = mul i64 [[TMP0]], 2 ; SCALABLE-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 1024, [[TMP1]] -; SCALABLE-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] +; SCALABLE-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[ENTRY:%.*]] ; SCALABLE: vector.ph: ; SCALABLE-NEXT: [[TMP2:%.*]] = call i64 @llvm.vscale.i64() ; SCALABLE-NEXT: [[TMP3:%.*]] = mul i64 [[TMP2]], 2 @@ -292,44 +292,44 @@ define void @load_store_factor2_i64(ptr %p) { ; SCALABLE-NEXT: [[N_VEC:%.*]] = sub i64 1024, [[N_MOD_VF]] ; SCALABLE-NEXT: [[TMP4:%.*]] = call i64 @llvm.vscale.i64() ; SCALABLE-NEXT: [[TMP5:%.*]] = mul i64 [[TMP4]], 2 -; SCALABLE-NEXT: br label [[VECTOR_BODY:%.*]] +; SCALABLE-NEXT: br label [[LOOP:%.*]] ; SCALABLE: vector.body: -; SCALABLE-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] -; SCALABLE-NEXT: 
[[TMP6:%.*]] = add i64 [[INDEX]], 0 -; SCALABLE-NEXT: [[TMP7:%.*]] = shl i64 [[TMP6]], 1 -; SCALABLE-NEXT: [[TMP8:%.*]] = getelementptr i64, ptr [[P:%.*]], i64 [[TMP7]] -; SCALABLE-NEXT: [[WIDE_VEC:%.*]] = load , ptr [[TMP8]], align 8 +; SCALABLE-NEXT: [[I:%.*]] = phi i64 [ 0, [[ENTRY]] ], [ [[NEXTI:%.*]], [[LOOP]] ] +; SCALABLE-NEXT: [[TMP6:%.*]] = add i64 [[I]], 0 +; SCALABLE-NEXT: [[OFFSET0:%.*]] = shl i64 [[TMP6]], 1 +; SCALABLE-NEXT: [[Q0:%.*]] = getelementptr i64, ptr [[P:%.*]], i64 [[OFFSET0]] +; SCALABLE-NEXT: [[WIDE_VEC:%.*]] = load , ptr [[Q0]], align 8 ; SCALABLE-NEXT: [[STRIDED_VEC:%.*]] = call { , } @llvm.vector.deinterleave2.nxv4i64( [[WIDE_VEC]]) -; SCALABLE-NEXT: [[TMP10:%.*]] = extractvalue { , } [[STRIDED_VEC]], 0 -; SCALABLE-NEXT: [[TMP11:%.*]] = extractvalue { , } [[STRIDED_VEC]], 1 -; SCALABLE-NEXT: [[TMP12:%.*]] = add [[TMP10]], splat (i64 1) -; SCALABLE-NEXT: [[TMP15:%.*]] = add [[TMP11]], splat (i64 2) -; SCALABLE-NEXT: [[INTERLEAVED_VEC:%.*]] = call @llvm.vector.interleave2.nxv4i64( [[TMP12]], [[TMP15]]) -; SCALABLE-NEXT: store [[INTERLEAVED_VEC]], ptr [[TMP8]], align 8 -; SCALABLE-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP5]] -; SCALABLE-NEXT: [[TMP17:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] -; SCALABLE-NEXT: br i1 [[TMP17]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]] +; SCALABLE-NEXT: [[TMP9:%.*]] = extractvalue { , } [[STRIDED_VEC]], 0 +; SCALABLE-NEXT: [[TMP10:%.*]] = extractvalue { , } [[STRIDED_VEC]], 1 +; SCALABLE-NEXT: [[TMP11:%.*]] = add [[TMP9]], splat (i64 1) +; SCALABLE-NEXT: [[TMP12:%.*]] = add [[TMP10]], splat (i64 2) +; SCALABLE-NEXT: [[INTERLEAVED_VEC:%.*]] = call @llvm.vector.interleave2.nxv4i64( [[TMP11]], [[TMP12]]) +; SCALABLE-NEXT: store [[INTERLEAVED_VEC]], ptr [[Q0]], align 8 +; SCALABLE-NEXT: [[NEXTI]] = add nuw i64 [[I]], [[TMP5]] +; SCALABLE-NEXT: [[TMP13:%.*]] = icmp eq i64 [[NEXTI]], [[N_VEC]] +; SCALABLE-NEXT: br i1 [[TMP13]], label [[MIDDLE_BLOCK:%.*]], label 
[[LOOP]], !llvm.loop [[LOOP4:![0-9]+]] ; SCALABLE: middle.block: ; SCALABLE-NEXT: [[CMP_N:%.*]] = icmp eq i64 1024, [[N_VEC]] ; SCALABLE-NEXT: br i1 [[CMP_N]], label [[EXIT:%.*]], label [[SCALAR_PH]] ; SCALABLE: scalar.ph: -; SCALABLE-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ] -; SCALABLE-NEXT: br label [[LOOP:%.*]] +; SCALABLE-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY1:%.*]] ] +; SCALABLE-NEXT: br label [[LOOP1:%.*]] ; SCALABLE: loop: -; SCALABLE-NEXT: [[I:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[NEXTI:%.*]], [[LOOP]] ] -; SCALABLE-NEXT: [[OFFSET0:%.*]] = shl i64 [[I]], 1 -; SCALABLE-NEXT: [[Q0:%.*]] = getelementptr i64, ptr [[P]], i64 [[OFFSET0]] -; SCALABLE-NEXT: [[X0:%.*]] = load i64, ptr [[Q0]], align 8 +; SCALABLE-NEXT: [[I1:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[NEXTI1:%.*]], [[LOOP1]] ] +; SCALABLE-NEXT: [[OFFSET2:%.*]] = shl i64 [[I1]], 1 +; SCALABLE-NEXT: [[Q2:%.*]] = getelementptr i64, ptr [[P]], i64 [[OFFSET2]] +; SCALABLE-NEXT: [[X0:%.*]] = load i64, ptr [[Q2]], align 8 ; SCALABLE-NEXT: [[Y0:%.*]] = add i64 [[X0]], 1 -; SCALABLE-NEXT: store i64 [[Y0]], ptr [[Q0]], align 8 -; SCALABLE-NEXT: [[OFFSET1:%.*]] = add i64 [[OFFSET0]], 1 +; SCALABLE-NEXT: store i64 [[Y0]], ptr [[Q2]], align 8 +; SCALABLE-NEXT: [[OFFSET1:%.*]] = add i64 [[OFFSET2]], 1 ; SCALABLE-NEXT: [[Q1:%.*]] = getelementptr i64, ptr [[P]], i64 [[OFFSET1]] ; SCALABLE-NEXT: [[X1:%.*]] = load i64, ptr [[Q1]], align 8 ; SCALABLE-NEXT: [[Y1:%.*]] = add i64 [[X1]], 2 ; SCALABLE-NEXT: store i64 [[Y1]], ptr [[Q1]], align 8 -; SCALABLE-NEXT: [[NEXTI]] = add i64 [[I]], 1 -; SCALABLE-NEXT: [[DONE:%.*]] = icmp eq i64 [[NEXTI]], 1024 -; SCALABLE-NEXT: br i1 [[DONE]], label [[EXIT]], label [[LOOP]], !llvm.loop [[LOOP5:![0-9]+]] +; SCALABLE-NEXT: [[NEXTI1]] = add i64 [[I1]], 1 +; SCALABLE-NEXT: [[DONE:%.*]] = icmp eq i64 [[NEXTI1]], 1024 +; SCALABLE-NEXT: br i1 [[DONE]], label 
[[EXIT]], label [[LOOP1]], !llvm.loop [[LOOP5:![0-9]+]] ; SCALABLE: exit: ; SCALABLE-NEXT: ret void ; @@ -360,42 +360,42 @@ exit: define void @load_store_factor3_i32(ptr %p) { ; CHECK-LABEL: @load_store_factor3_i32( ; CHECK-NEXT: entry: -; CHECK-NEXT: br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] +; CHECK-NEXT: br i1 false, label [[SCALAR_PH:%.*]], label [[ENTRY:%.*]] ; CHECK: vector.ph: -; CHECK-NEXT: br label [[VECTOR_BODY:%.*]] +; CHECK-NEXT: br label [[LOOP:%.*]] ; CHECK: vector.body: -; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] -; CHECK-NEXT: [[TMP0:%.*]] = add i64 [[INDEX]], 0 -; CHECK-NEXT: [[TMP1:%.*]] = mul i64 [[TMP0]], 3 -; CHECK-NEXT: [[TMP2:%.*]] = getelementptr i32, ptr [[P:%.*]], i64 [[TMP1]] -; CHECK-NEXT: [[WIDE_VEC:%.*]] = load <24 x i32>, ptr [[TMP2]], align 4 +; CHECK-NEXT: [[I:%.*]] = phi i64 [ 0, [[ENTRY]] ], [ [[NEXTI:%.*]], [[LOOP]] ] +; CHECK-NEXT: [[TMP0:%.*]] = add i64 [[I]], 0 +; CHECK-NEXT: [[OFFSET0:%.*]] = mul i64 [[TMP0]], 3 +; CHECK-NEXT: [[Q0:%.*]] = getelementptr i32, ptr [[P:%.*]], i64 [[OFFSET0]] +; CHECK-NEXT: [[WIDE_VEC:%.*]] = load <24 x i32>, ptr [[Q0]], align 4 ; CHECK-NEXT: [[STRIDED_VEC:%.*]] = shufflevector <24 x i32> [[WIDE_VEC]], <24 x i32> poison, <8 x i32> ; CHECK-NEXT: [[STRIDED_VEC1:%.*]] = shufflevector <24 x i32> [[WIDE_VEC]], <24 x i32> poison, <8 x i32> ; CHECK-NEXT: [[STRIDED_VEC2:%.*]] = shufflevector <24 x i32> [[WIDE_VEC]], <24 x i32> poison, <8 x i32> -; CHECK-NEXT: [[TMP4:%.*]] = add <8 x i32> [[STRIDED_VEC]], splat (i32 1) -; CHECK-NEXT: [[TMP6:%.*]] = add <8 x i32> [[STRIDED_VEC1]], splat (i32 2) -; CHECK-NEXT: [[TMP9:%.*]] = add <8 x i32> [[STRIDED_VEC2]], splat (i32 3) -; CHECK-NEXT: [[TMP11:%.*]] = shufflevector <8 x i32> [[TMP4]], <8 x i32> [[TMP6]], <16 x i32> -; CHECK-NEXT: [[TMP12:%.*]] = shufflevector <8 x i32> [[TMP9]], <8 x i32> poison, <16 x i32> -; CHECK-NEXT: [[TMP13:%.*]] = shufflevector <16 x i32> [[TMP11]], <16 x 
i32> [[TMP12]], <24 x i32> -; CHECK-NEXT: [[INTERLEAVED_VEC:%.*]] = shufflevector <24 x i32> [[TMP13]], <24 x i32> poison, <24 x i32> -; CHECK-NEXT: store <24 x i32> [[INTERLEAVED_VEC]], ptr [[TMP2]], align 4 -; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 8 -; CHECK-NEXT: [[TMP14:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1024 -; CHECK-NEXT: br i1 [[TMP14]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP6:![0-9]+]] +; CHECK-NEXT: [[TMP3:%.*]] = add <8 x i32> [[STRIDED_VEC]], splat (i32 1) +; CHECK-NEXT: [[TMP4:%.*]] = add <8 x i32> [[STRIDED_VEC1]], splat (i32 2) +; CHECK-NEXT: [[TMP5:%.*]] = add <8 x i32> [[STRIDED_VEC2]], splat (i32 3) +; CHECK-NEXT: [[TMP6:%.*]] = shufflevector <8 x i32> [[TMP3]], <8 x i32> [[TMP4]], <16 x i32> +; CHECK-NEXT: [[TMP7:%.*]] = shufflevector <8 x i32> [[TMP5]], <8 x i32> poison, <16 x i32> +; CHECK-NEXT: [[TMP8:%.*]] = shufflevector <16 x i32> [[TMP6]], <16 x i32> [[TMP7]], <24 x i32> +; CHECK-NEXT: [[INTERLEAVED_VEC:%.*]] = shufflevector <24 x i32> [[TMP8]], <24 x i32> poison, <24 x i32> +; CHECK-NEXT: store <24 x i32> [[INTERLEAVED_VEC]], ptr [[Q0]], align 4 +; CHECK-NEXT: [[NEXTI]] = add nuw i64 [[I]], 8 +; CHECK-NEXT: [[TMP9:%.*]] = icmp eq i64 [[NEXTI]], 1024 +; CHECK-NEXT: br i1 [[TMP9]], label [[MIDDLE_BLOCK:%.*]], label [[LOOP]], !llvm.loop [[LOOP6:![0-9]+]] ; CHECK: middle.block: ; CHECK-NEXT: br i1 true, label [[EXIT:%.*]], label [[SCALAR_PH]] ; CHECK: scalar.ph: -; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 1024, [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ] -; CHECK-NEXT: br label [[LOOP:%.*]] +; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 1024, [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY1:%.*]] ] +; CHECK-NEXT: br label [[LOOP1:%.*]] ; CHECK: loop: -; CHECK-NEXT: [[I:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[NEXTI:%.*]], [[LOOP]] ] -; CHECK-NEXT: [[OFFSET0:%.*]] = mul i64 [[I]], 3 -; CHECK-NEXT: [[Q0:%.*]] = getelementptr i32, ptr [[P]], i64 [[OFFSET0]] -; CHECK-NEXT: [[X0:%.*]] = load i32, 
ptr [[Q0]], align 4 +; CHECK-NEXT: [[I1:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[NEXTI1:%.*]], [[LOOP1]] ] +; CHECK-NEXT: [[OFFSET3:%.*]] = mul i64 [[I1]], 3 +; CHECK-NEXT: [[Q3:%.*]] = getelementptr i32, ptr [[P]], i64 [[OFFSET3]] +; CHECK-NEXT: [[X0:%.*]] = load i32, ptr [[Q3]], align 4 ; CHECK-NEXT: [[Y0:%.*]] = add i32 [[X0]], 1 -; CHECK-NEXT: store i32 [[Y0]], ptr [[Q0]], align 4 -; CHECK-NEXT: [[OFFSET1:%.*]] = add i64 [[OFFSET0]], 1 +; CHECK-NEXT: store i32 [[Y0]], ptr [[Q3]], align 4 +; CHECK-NEXT: [[OFFSET1:%.*]] = add i64 [[OFFSET3]], 1 ; CHECK-NEXT: [[Q1:%.*]] = getelementptr i32, ptr [[P]], i64 [[OFFSET1]] ; CHECK-NEXT: [[X1:%.*]] = load i32, ptr [[Q1]], align 4 ; CHECK-NEXT: [[Y1:%.*]] = add i32 [[X1]], 2 @@ -405,50 +405,50 @@ define void @load_store_factor3_i32(ptr %p) { ; CHECK-NEXT: [[X2:%.*]] = load i32, ptr [[Q2]], align 4 ; CHECK-NEXT: [[Y2:%.*]] = add i32 [[X2]], 3 ; CHECK-NEXT: store i32 [[Y2]], ptr [[Q2]], align 4 -; CHECK-NEXT: [[NEXTI]] = add i64 [[I]], 1 -; CHECK-NEXT: [[DONE:%.*]] = icmp eq i64 [[NEXTI]], 1024 -; CHECK-NEXT: br i1 [[DONE]], label [[EXIT]], label [[LOOP]], !llvm.loop [[LOOP7:![0-9]+]] +; CHECK-NEXT: [[NEXTI1]] = add i64 [[I1]], 1 +; CHECK-NEXT: [[DONE:%.*]] = icmp eq i64 [[NEXTI1]], 1024 +; CHECK-NEXT: br i1 [[DONE]], label [[EXIT]], label [[LOOP1]], !llvm.loop [[LOOP7:![0-9]+]] ; CHECK: exit: ; CHECK-NEXT: ret void ; ; FIXED-LABEL: @load_store_factor3_i32( ; FIXED-NEXT: entry: -; FIXED-NEXT: br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] +; FIXED-NEXT: br i1 false, label [[SCALAR_PH:%.*]], label [[ENTRY:%.*]] ; FIXED: vector.ph: -; FIXED-NEXT: br label [[VECTOR_BODY:%.*]] +; FIXED-NEXT: br label [[LOOP:%.*]] ; FIXED: vector.body: -; FIXED-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] -; FIXED-NEXT: [[TMP0:%.*]] = add i64 [[INDEX]], 0 -; FIXED-NEXT: [[TMP1:%.*]] = mul i64 [[TMP0]], 3 -; FIXED-NEXT: [[TMP2:%.*]] = getelementptr i32, ptr [[P:%.*]], 
i64 [[TMP1]] -; FIXED-NEXT: [[WIDE_VEC:%.*]] = load <24 x i32>, ptr [[TMP2]], align 4 +; FIXED-NEXT: [[I:%.*]] = phi i64 [ 0, [[ENTRY]] ], [ [[NEXTI:%.*]], [[LOOP]] ] +; FIXED-NEXT: [[TMP0:%.*]] = add i64 [[I]], 0 +; FIXED-NEXT: [[OFFSET0:%.*]] = mul i64 [[TMP0]], 3 +; FIXED-NEXT: [[Q0:%.*]] = getelementptr i32, ptr [[P:%.*]], i64 [[OFFSET0]] +; FIXED-NEXT: [[WIDE_VEC:%.*]] = load <24 x i32>, ptr [[Q0]], align 4 ; FIXED-NEXT: [[STRIDED_VEC:%.*]] = shufflevector <24 x i32> [[WIDE_VEC]], <24 x i32> poison, <8 x i32> ; FIXED-NEXT: [[STRIDED_VEC1:%.*]] = shufflevector <24 x i32> [[WIDE_VEC]], <24 x i32> poison, <8 x i32> ; FIXED-NEXT: [[STRIDED_VEC2:%.*]] = shufflevector <24 x i32> [[WIDE_VEC]], <24 x i32> poison, <8 x i32> -; FIXED-NEXT: [[TMP4:%.*]] = add <8 x i32> [[STRIDED_VEC]], splat (i32 1) -; FIXED-NEXT: [[TMP6:%.*]] = add <8 x i32> [[STRIDED_VEC1]], splat (i32 2) -; FIXED-NEXT: [[TMP9:%.*]] = add <8 x i32> [[STRIDED_VEC2]], splat (i32 3) -; FIXED-NEXT: [[TMP11:%.*]] = shufflevector <8 x i32> [[TMP4]], <8 x i32> [[TMP6]], <16 x i32> -; FIXED-NEXT: [[TMP12:%.*]] = shufflevector <8 x i32> [[TMP9]], <8 x i32> poison, <16 x i32> -; FIXED-NEXT: [[TMP13:%.*]] = shufflevector <16 x i32> [[TMP11]], <16 x i32> [[TMP12]], <24 x i32> -; FIXED-NEXT: [[INTERLEAVED_VEC:%.*]] = shufflevector <24 x i32> [[TMP13]], <24 x i32> poison, <24 x i32> -; FIXED-NEXT: store <24 x i32> [[INTERLEAVED_VEC]], ptr [[TMP2]], align 4 -; FIXED-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 8 -; FIXED-NEXT: [[TMP14:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1024 -; FIXED-NEXT: br i1 [[TMP14]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP6:![0-9]+]] +; FIXED-NEXT: [[TMP3:%.*]] = add <8 x i32> [[STRIDED_VEC]], splat (i32 1) +; FIXED-NEXT: [[TMP4:%.*]] = add <8 x i32> [[STRIDED_VEC1]], splat (i32 2) +; FIXED-NEXT: [[TMP5:%.*]] = add <8 x i32> [[STRIDED_VEC2]], splat (i32 3) +; FIXED-NEXT: [[TMP6:%.*]] = shufflevector <8 x i32> [[TMP3]], <8 x i32> [[TMP4]], <16 x i32> +; FIXED-NEXT: 
[[TMP7:%.*]] = shufflevector <8 x i32> [[TMP5]], <8 x i32> poison, <16 x i32> +; FIXED-NEXT: [[TMP8:%.*]] = shufflevector <16 x i32> [[TMP6]], <16 x i32> [[TMP7]], <24 x i32> +; FIXED-NEXT: [[INTERLEAVED_VEC:%.*]] = shufflevector <24 x i32> [[TMP8]], <24 x i32> poison, <24 x i32> +; FIXED-NEXT: store <24 x i32> [[INTERLEAVED_VEC]], ptr [[Q0]], align 4 +; FIXED-NEXT: [[NEXTI]] = add nuw i64 [[I]], 8 +; FIXED-NEXT: [[TMP9:%.*]] = icmp eq i64 [[NEXTI]], 1024 +; FIXED-NEXT: br i1 [[TMP9]], label [[MIDDLE_BLOCK:%.*]], label [[LOOP]], !llvm.loop [[LOOP6:![0-9]+]] ; FIXED: middle.block: ; FIXED-NEXT: br i1 true, label [[EXIT:%.*]], label [[SCALAR_PH]] ; FIXED: scalar.ph: -; FIXED-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 1024, [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ] -; FIXED-NEXT: br label [[LOOP:%.*]] +; FIXED-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 1024, [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY1:%.*]] ] +; FIXED-NEXT: br label [[LOOP1:%.*]] ; FIXED: loop: -; FIXED-NEXT: [[I:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[NEXTI:%.*]], [[LOOP]] ] -; FIXED-NEXT: [[OFFSET0:%.*]] = mul i64 [[I]], 3 -; FIXED-NEXT: [[Q0:%.*]] = getelementptr i32, ptr [[P]], i64 [[OFFSET0]] -; FIXED-NEXT: [[X0:%.*]] = load i32, ptr [[Q0]], align 4 +; FIXED-NEXT: [[I1:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[NEXTI1:%.*]], [[LOOP1]] ] +; FIXED-NEXT: [[OFFSET3:%.*]] = mul i64 [[I1]], 3 +; FIXED-NEXT: [[Q3:%.*]] = getelementptr i32, ptr [[P]], i64 [[OFFSET3]] +; FIXED-NEXT: [[X0:%.*]] = load i32, ptr [[Q3]], align 4 ; FIXED-NEXT: [[Y0:%.*]] = add i32 [[X0]], 1 -; FIXED-NEXT: store i32 [[Y0]], ptr [[Q0]], align 4 -; FIXED-NEXT: [[OFFSET1:%.*]] = add i64 [[OFFSET0]], 1 +; FIXED-NEXT: store i32 [[Y0]], ptr [[Q3]], align 4 +; FIXED-NEXT: [[OFFSET1:%.*]] = add i64 [[OFFSET3]], 1 ; FIXED-NEXT: [[Q1:%.*]] = getelementptr i32, ptr [[P]], i64 [[OFFSET1]] ; FIXED-NEXT: [[X1:%.*]] = load i32, ptr [[Q1]], align 4 ; FIXED-NEXT: [[Y1:%.*]] = add i32 [[X1]], 2 @@ -458,50 +458,50 @@ define void 
@load_store_factor3_i32(ptr %p) { ; FIXED-NEXT: [[X2:%.*]] = load i32, ptr [[Q2]], align 4 ; FIXED-NEXT: [[Y2:%.*]] = add i32 [[X2]], 3 ; FIXED-NEXT: store i32 [[Y2]], ptr [[Q2]], align 4 -; FIXED-NEXT: [[NEXTI]] = add i64 [[I]], 1 -; FIXED-NEXT: [[DONE:%.*]] = icmp eq i64 [[NEXTI]], 1024 -; FIXED-NEXT: br i1 [[DONE]], label [[EXIT]], label [[LOOP]], !llvm.loop [[LOOP7:![0-9]+]] +; FIXED-NEXT: [[NEXTI1]] = add i64 [[I1]], 1 +; FIXED-NEXT: [[DONE:%.*]] = icmp eq i64 [[NEXTI1]], 1024 +; FIXED-NEXT: br i1 [[DONE]], label [[EXIT]], label [[LOOP1]], !llvm.loop [[LOOP7:![0-9]+]] ; FIXED: exit: ; FIXED-NEXT: ret void ; ; SCALABLE-LABEL: @load_store_factor3_i32( ; SCALABLE-NEXT: entry: -; SCALABLE-NEXT: br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] +; SCALABLE-NEXT: br i1 false, label [[SCALAR_PH:%.*]], label [[ENTRY:%.*]] ; SCALABLE: vector.ph: -; SCALABLE-NEXT: br label [[VECTOR_BODY:%.*]] +; SCALABLE-NEXT: br label [[LOOP:%.*]] ; SCALABLE: vector.body: -; SCALABLE-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] -; SCALABLE-NEXT: [[TMP0:%.*]] = add i64 [[INDEX]], 0 -; SCALABLE-NEXT: [[TMP1:%.*]] = mul i64 [[TMP0]], 3 -; SCALABLE-NEXT: [[TMP2:%.*]] = getelementptr i32, ptr [[P:%.*]], i64 [[TMP1]] -; SCALABLE-NEXT: [[WIDE_VEC:%.*]] = load <24 x i32>, ptr [[TMP2]], align 4 +; SCALABLE-NEXT: [[I:%.*]] = phi i64 [ 0, [[ENTRY]] ], [ [[NEXTI:%.*]], [[LOOP]] ] +; SCALABLE-NEXT: [[TMP0:%.*]] = add i64 [[I]], 0 +; SCALABLE-NEXT: [[OFFSET0:%.*]] = mul i64 [[TMP0]], 3 +; SCALABLE-NEXT: [[Q0:%.*]] = getelementptr i32, ptr [[P:%.*]], i64 [[OFFSET0]] +; SCALABLE-NEXT: [[WIDE_VEC:%.*]] = load <24 x i32>, ptr [[Q0]], align 4 ; SCALABLE-NEXT: [[STRIDED_VEC:%.*]] = shufflevector <24 x i32> [[WIDE_VEC]], <24 x i32> poison, <8 x i32> ; SCALABLE-NEXT: [[STRIDED_VEC1:%.*]] = shufflevector <24 x i32> [[WIDE_VEC]], <24 x i32> poison, <8 x i32> ; SCALABLE-NEXT: [[STRIDED_VEC2:%.*]] = shufflevector <24 x i32> [[WIDE_VEC]], <24 x i32> 
poison, <8 x i32> -; SCALABLE-NEXT: [[TMP4:%.*]] = add <8 x i32> [[STRIDED_VEC]], splat (i32 1) -; SCALABLE-NEXT: [[TMP6:%.*]] = add <8 x i32> [[STRIDED_VEC1]], splat (i32 2) -; SCALABLE-NEXT: [[TMP9:%.*]] = add <8 x i32> [[STRIDED_VEC2]], splat (i32 3) -; SCALABLE-NEXT: [[TMP11:%.*]] = shufflevector <8 x i32> [[TMP4]], <8 x i32> [[TMP6]], <16 x i32> -; SCALABLE-NEXT: [[TMP12:%.*]] = shufflevector <8 x i32> [[TMP9]], <8 x i32> poison, <16 x i32> -; SCALABLE-NEXT: [[TMP13:%.*]] = shufflevector <16 x i32> [[TMP11]], <16 x i32> [[TMP12]], <24 x i32> -; SCALABLE-NEXT: [[INTERLEAVED_VEC:%.*]] = shufflevector <24 x i32> [[TMP13]], <24 x i32> poison, <24 x i32> -; SCALABLE-NEXT: store <24 x i32> [[INTERLEAVED_VEC]], ptr [[TMP2]], align 4 -; SCALABLE-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 8 -; SCALABLE-NEXT: [[TMP14:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1024 -; SCALABLE-NEXT: br i1 [[TMP14]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP6:![0-9]+]] +; SCALABLE-NEXT: [[TMP3:%.*]] = add <8 x i32> [[STRIDED_VEC]], splat (i32 1) +; SCALABLE-NEXT: [[TMP4:%.*]] = add <8 x i32> [[STRIDED_VEC1]], splat (i32 2) +; SCALABLE-NEXT: [[TMP5:%.*]] = add <8 x i32> [[STRIDED_VEC2]], splat (i32 3) +; SCALABLE-NEXT: [[TMP6:%.*]] = shufflevector <8 x i32> [[TMP3]], <8 x i32> [[TMP4]], <16 x i32> +; SCALABLE-NEXT: [[TMP7:%.*]] = shufflevector <8 x i32> [[TMP5]], <8 x i32> poison, <16 x i32> +; SCALABLE-NEXT: [[TMP8:%.*]] = shufflevector <16 x i32> [[TMP6]], <16 x i32> [[TMP7]], <24 x i32> +; SCALABLE-NEXT: [[INTERLEAVED_VEC:%.*]] = shufflevector <24 x i32> [[TMP8]], <24 x i32> poison, <24 x i32> +; SCALABLE-NEXT: store <24 x i32> [[INTERLEAVED_VEC]], ptr [[Q0]], align 4 +; SCALABLE-NEXT: [[NEXTI]] = add nuw i64 [[I]], 8 +; SCALABLE-NEXT: [[TMP9:%.*]] = icmp eq i64 [[NEXTI]], 1024 +; SCALABLE-NEXT: br i1 [[TMP9]], label [[MIDDLE_BLOCK:%.*]], label [[LOOP]], !llvm.loop [[LOOP6:![0-9]+]] ; SCALABLE: middle.block: ; SCALABLE-NEXT: br i1 true, label [[EXIT:%.*]], label 
[[SCALAR_PH]] ; SCALABLE: scalar.ph: -; SCALABLE-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 1024, [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ] -; SCALABLE-NEXT: br label [[LOOP:%.*]] +; SCALABLE-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 1024, [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY1:%.*]] ] +; SCALABLE-NEXT: br label [[LOOP1:%.*]] ; SCALABLE: loop: -; SCALABLE-NEXT: [[I:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[NEXTI:%.*]], [[LOOP]] ] -; SCALABLE-NEXT: [[OFFSET0:%.*]] = mul i64 [[I]], 3 -; SCALABLE-NEXT: [[Q0:%.*]] = getelementptr i32, ptr [[P]], i64 [[OFFSET0]] -; SCALABLE-NEXT: [[X0:%.*]] = load i32, ptr [[Q0]], align 4 +; SCALABLE-NEXT: [[I1:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[NEXTI1:%.*]], [[LOOP1]] ] +; SCALABLE-NEXT: [[OFFSET3:%.*]] = mul i64 [[I1]], 3 +; SCALABLE-NEXT: [[Q3:%.*]] = getelementptr i32, ptr [[P]], i64 [[OFFSET3]] +; SCALABLE-NEXT: [[X0:%.*]] = load i32, ptr [[Q3]], align 4 ; SCALABLE-NEXT: [[Y0:%.*]] = add i32 [[X0]], 1 -; SCALABLE-NEXT: store i32 [[Y0]], ptr [[Q0]], align 4 -; SCALABLE-NEXT: [[OFFSET1:%.*]] = add i64 [[OFFSET0]], 1 +; SCALABLE-NEXT: store i32 [[Y0]], ptr [[Q3]], align 4 +; SCALABLE-NEXT: [[OFFSET1:%.*]] = add i64 [[OFFSET3]], 1 ; SCALABLE-NEXT: [[Q1:%.*]] = getelementptr i32, ptr [[P]], i64 [[OFFSET1]] ; SCALABLE-NEXT: [[X1:%.*]] = load i32, ptr [[Q1]], align 4 ; SCALABLE-NEXT: [[Y1:%.*]] = add i32 [[X1]], 2 @@ -511,9 +511,9 @@ define void @load_store_factor3_i32(ptr %p) { ; SCALABLE-NEXT: [[X2:%.*]] = load i32, ptr [[Q2]], align 4 ; SCALABLE-NEXT: [[Y2:%.*]] = add i32 [[X2]], 3 ; SCALABLE-NEXT: store i32 [[Y2]], ptr [[Q2]], align 4 -; SCALABLE-NEXT: [[NEXTI]] = add i64 [[I]], 1 -; SCALABLE-NEXT: [[DONE:%.*]] = icmp eq i64 [[NEXTI]], 1024 -; SCALABLE-NEXT: br i1 [[DONE]], label [[EXIT]], label [[LOOP]], !llvm.loop [[LOOP7:![0-9]+]] +; SCALABLE-NEXT: [[NEXTI1]] = add i64 [[I1]], 1 +; SCALABLE-NEXT: [[DONE:%.*]] = icmp eq i64 [[NEXTI1]], 1024 +; SCALABLE-NEXT: br i1 [[DONE]], label [[EXIT]], label 
[[LOOP1]], !llvm.loop [[LOOP7:![0-9]+]] ; SCALABLE: exit: ; SCALABLE-NEXT: ret void ; @@ -550,42 +550,42 @@ exit: define void @load_store_factor3_i64(ptr %p) { ; CHECK-LABEL: @load_store_factor3_i64( ; CHECK-NEXT: entry: -; CHECK-NEXT: br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] +; CHECK-NEXT: br i1 false, label [[SCALAR_PH:%.*]], label [[ENTRY:%.*]] ; CHECK: vector.ph: -; CHECK-NEXT: br label [[VECTOR_BODY:%.*]] +; CHECK-NEXT: br label [[LOOP:%.*]] ; CHECK: vector.body: -; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] -; CHECK-NEXT: [[TMP0:%.*]] = add i64 [[INDEX]], 0 -; CHECK-NEXT: [[TMP1:%.*]] = mul i64 [[TMP0]], 3 -; CHECK-NEXT: [[TMP2:%.*]] = getelementptr i64, ptr [[P:%.*]], i64 [[TMP1]] -; CHECK-NEXT: [[WIDE_VEC:%.*]] = load <12 x i64>, ptr [[TMP2]], align 8 +; CHECK-NEXT: [[I:%.*]] = phi i64 [ 0, [[ENTRY]] ], [ [[NEXTI:%.*]], [[LOOP]] ] +; CHECK-NEXT: [[TMP0:%.*]] = add i64 [[I]], 0 +; CHECK-NEXT: [[OFFSET0:%.*]] = mul i64 [[TMP0]], 3 +; CHECK-NEXT: [[Q0:%.*]] = getelementptr i64, ptr [[P:%.*]], i64 [[OFFSET0]] +; CHECK-NEXT: [[WIDE_VEC:%.*]] = load <12 x i64>, ptr [[Q0]], align 8 ; CHECK-NEXT: [[STRIDED_VEC:%.*]] = shufflevector <12 x i64> [[WIDE_VEC]], <12 x i64> poison, <4 x i32> ; CHECK-NEXT: [[STRIDED_VEC1:%.*]] = shufflevector <12 x i64> [[WIDE_VEC]], <12 x i64> poison, <4 x i32> ; CHECK-NEXT: [[STRIDED_VEC2:%.*]] = shufflevector <12 x i64> [[WIDE_VEC]], <12 x i64> poison, <4 x i32> -; CHECK-NEXT: [[TMP4:%.*]] = add <4 x i64> [[STRIDED_VEC]], splat (i64 1) -; CHECK-NEXT: [[TMP6:%.*]] = add <4 x i64> [[STRIDED_VEC1]], splat (i64 2) -; CHECK-NEXT: [[TMP9:%.*]] = add <4 x i64> [[STRIDED_VEC2]], splat (i64 3) -; CHECK-NEXT: [[TMP11:%.*]] = shufflevector <4 x i64> [[TMP4]], <4 x i64> [[TMP6]], <8 x i32> -; CHECK-NEXT: [[TMP12:%.*]] = shufflevector <4 x i64> [[TMP9]], <4 x i64> poison, <8 x i32> -; CHECK-NEXT: [[TMP13:%.*]] = shufflevector <8 x i64> [[TMP11]], <8 x i64> [[TMP12]], <12 x 
i32> -; CHECK-NEXT: [[INTERLEAVED_VEC:%.*]] = shufflevector <12 x i64> [[TMP13]], <12 x i64> poison, <12 x i32> -; CHECK-NEXT: store <12 x i64> [[INTERLEAVED_VEC]], ptr [[TMP2]], align 8 -; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4 -; CHECK-NEXT: [[TMP14:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1024 -; CHECK-NEXT: br i1 [[TMP14]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP8:![0-9]+]] +; CHECK-NEXT: [[TMP3:%.*]] = add <4 x i64> [[STRIDED_VEC]], splat (i64 1) +; CHECK-NEXT: [[TMP4:%.*]] = add <4 x i64> [[STRIDED_VEC1]], splat (i64 2) +; CHECK-NEXT: [[TMP5:%.*]] = add <4 x i64> [[STRIDED_VEC2]], splat (i64 3) +; CHECK-NEXT: [[TMP6:%.*]] = shufflevector <4 x i64> [[TMP3]], <4 x i64> [[TMP4]], <8 x i32> +; CHECK-NEXT: [[TMP7:%.*]] = shufflevector <4 x i64> [[TMP5]], <4 x i64> poison, <8 x i32> +; CHECK-NEXT: [[TMP8:%.*]] = shufflevector <8 x i64> [[TMP6]], <8 x i64> [[TMP7]], <12 x i32> +; CHECK-NEXT: [[INTERLEAVED_VEC:%.*]] = shufflevector <12 x i64> [[TMP8]], <12 x i64> poison, <12 x i32> +; CHECK-NEXT: store <12 x i64> [[INTERLEAVED_VEC]], ptr [[Q0]], align 8 +; CHECK-NEXT: [[NEXTI]] = add nuw i64 [[I]], 4 +; CHECK-NEXT: [[TMP9:%.*]] = icmp eq i64 [[NEXTI]], 1024 +; CHECK-NEXT: br i1 [[TMP9]], label [[MIDDLE_BLOCK:%.*]], label [[LOOP]], !llvm.loop [[LOOP8:![0-9]+]] ; CHECK: middle.block: ; CHECK-NEXT: br i1 true, label [[EXIT:%.*]], label [[SCALAR_PH]] ; CHECK: scalar.ph: -; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 1024, [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ] -; CHECK-NEXT: br label [[LOOP:%.*]] +; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 1024, [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY1:%.*]] ] +; CHECK-NEXT: br label [[LOOP1:%.*]] ; CHECK: loop: -; CHECK-NEXT: [[I:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[NEXTI:%.*]], [[LOOP]] ] -; CHECK-NEXT: [[OFFSET0:%.*]] = mul i64 [[I]], 3 -; CHECK-NEXT: [[Q0:%.*]] = getelementptr i64, ptr [[P]], i64 [[OFFSET0]] -; CHECK-NEXT: [[X0:%.*]] = load i64, ptr [[Q0]], align 8 +; 
CHECK-NEXT: [[I1:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[NEXTI1:%.*]], [[LOOP1]] ] +; CHECK-NEXT: [[OFFSET3:%.*]] = mul i64 [[I1]], 3 +; CHECK-NEXT: [[Q3:%.*]] = getelementptr i64, ptr [[P]], i64 [[OFFSET3]] +; CHECK-NEXT: [[X0:%.*]] = load i64, ptr [[Q3]], align 8 ; CHECK-NEXT: [[Y0:%.*]] = add i64 [[X0]], 1 -; CHECK-NEXT: store i64 [[Y0]], ptr [[Q0]], align 8 -; CHECK-NEXT: [[OFFSET1:%.*]] = add i64 [[OFFSET0]], 1 +; CHECK-NEXT: store i64 [[Y0]], ptr [[Q3]], align 8 +; CHECK-NEXT: [[OFFSET1:%.*]] = add i64 [[OFFSET3]], 1 ; CHECK-NEXT: [[Q1:%.*]] = getelementptr i64, ptr [[P]], i64 [[OFFSET1]] ; CHECK-NEXT: [[X1:%.*]] = load i64, ptr [[Q1]], align 8 ; CHECK-NEXT: [[Y1:%.*]] = add i64 [[X1]], 2 @@ -595,50 +595,50 @@ define void @load_store_factor3_i64(ptr %p) { ; CHECK-NEXT: [[X2:%.*]] = load i64, ptr [[Q2]], align 8 ; CHECK-NEXT: [[Y2:%.*]] = add i64 [[X2]], 3 ; CHECK-NEXT: store i64 [[Y2]], ptr [[Q2]], align 8 -; CHECK-NEXT: [[NEXTI]] = add i64 [[I]], 1 -; CHECK-NEXT: [[DONE:%.*]] = icmp eq i64 [[NEXTI]], 1024 -; CHECK-NEXT: br i1 [[DONE]], label [[EXIT]], label [[LOOP]], !llvm.loop [[LOOP9:![0-9]+]] +; CHECK-NEXT: [[NEXTI1]] = add i64 [[I1]], 1 +; CHECK-NEXT: [[DONE:%.*]] = icmp eq i64 [[NEXTI1]], 1024 +; CHECK-NEXT: br i1 [[DONE]], label [[EXIT]], label [[LOOP1]], !llvm.loop [[LOOP9:![0-9]+]] ; CHECK: exit: ; CHECK-NEXT: ret void ; ; FIXED-LABEL: @load_store_factor3_i64( ; FIXED-NEXT: entry: -; FIXED-NEXT: br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] +; FIXED-NEXT: br i1 false, label [[SCALAR_PH:%.*]], label [[ENTRY:%.*]] ; FIXED: vector.ph: -; FIXED-NEXT: br label [[VECTOR_BODY:%.*]] +; FIXED-NEXT: br label [[LOOP:%.*]] ; FIXED: vector.body: -; FIXED-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] -; FIXED-NEXT: [[TMP0:%.*]] = add i64 [[INDEX]], 0 -; FIXED-NEXT: [[TMP1:%.*]] = mul i64 [[TMP0]], 3 -; FIXED-NEXT: [[TMP2:%.*]] = getelementptr i64, ptr [[P:%.*]], i64 [[TMP1]] -; 
FIXED-NEXT: [[WIDE_VEC:%.*]] = load <12 x i64>, ptr [[TMP2]], align 8 +; FIXED-NEXT: [[I:%.*]] = phi i64 [ 0, [[ENTRY]] ], [ [[NEXTI:%.*]], [[LOOP]] ] +; FIXED-NEXT: [[TMP0:%.*]] = add i64 [[I]], 0 +; FIXED-NEXT: [[OFFSET0:%.*]] = mul i64 [[TMP0]], 3 +; FIXED-NEXT: [[Q0:%.*]] = getelementptr i64, ptr [[P:%.*]], i64 [[OFFSET0]] +; FIXED-NEXT: [[WIDE_VEC:%.*]] = load <12 x i64>, ptr [[Q0]], align 8 ; FIXED-NEXT: [[STRIDED_VEC:%.*]] = shufflevector <12 x i64> [[WIDE_VEC]], <12 x i64> poison, <4 x i32> ; FIXED-NEXT: [[STRIDED_VEC1:%.*]] = shufflevector <12 x i64> [[WIDE_VEC]], <12 x i64> poison, <4 x i32> ; FIXED-NEXT: [[STRIDED_VEC2:%.*]] = shufflevector <12 x i64> [[WIDE_VEC]], <12 x i64> poison, <4 x i32> -; FIXED-NEXT: [[TMP4:%.*]] = add <4 x i64> [[STRIDED_VEC]], splat (i64 1) -; FIXED-NEXT: [[TMP6:%.*]] = add <4 x i64> [[STRIDED_VEC1]], splat (i64 2) -; FIXED-NEXT: [[TMP9:%.*]] = add <4 x i64> [[STRIDED_VEC2]], splat (i64 3) -; FIXED-NEXT: [[TMP11:%.*]] = shufflevector <4 x i64> [[TMP4]], <4 x i64> [[TMP6]], <8 x i32> -; FIXED-NEXT: [[TMP12:%.*]] = shufflevector <4 x i64> [[TMP9]], <4 x i64> poison, <8 x i32> -; FIXED-NEXT: [[TMP13:%.*]] = shufflevector <8 x i64> [[TMP11]], <8 x i64> [[TMP12]], <12 x i32> -; FIXED-NEXT: [[INTERLEAVED_VEC:%.*]] = shufflevector <12 x i64> [[TMP13]], <12 x i64> poison, <12 x i32> -; FIXED-NEXT: store <12 x i64> [[INTERLEAVED_VEC]], ptr [[TMP2]], align 8 -; FIXED-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4 -; FIXED-NEXT: [[TMP14:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1024 -; FIXED-NEXT: br i1 [[TMP14]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP8:![0-9]+]] +; FIXED-NEXT: [[TMP3:%.*]] = add <4 x i64> [[STRIDED_VEC]], splat (i64 1) +; FIXED-NEXT: [[TMP4:%.*]] = add <4 x i64> [[STRIDED_VEC1]], splat (i64 2) +; FIXED-NEXT: [[TMP5:%.*]] = add <4 x i64> [[STRIDED_VEC2]], splat (i64 3) +; FIXED-NEXT: [[TMP6:%.*]] = shufflevector <4 x i64> [[TMP3]], <4 x i64> [[TMP4]], <8 x i32> +; FIXED-NEXT: [[TMP7:%.*]] = 
shufflevector <4 x i64> [[TMP5]], <4 x i64> poison, <8 x i32> +; FIXED-NEXT: [[TMP8:%.*]] = shufflevector <8 x i64> [[TMP6]], <8 x i64> [[TMP7]], <12 x i32> +; FIXED-NEXT: [[INTERLEAVED_VEC:%.*]] = shufflevector <12 x i64> [[TMP8]], <12 x i64> poison, <12 x i32> +; FIXED-NEXT: store <12 x i64> [[INTERLEAVED_VEC]], ptr [[Q0]], align 8 +; FIXED-NEXT: [[NEXTI]] = add nuw i64 [[I]], 4 +; FIXED-NEXT: [[TMP9:%.*]] = icmp eq i64 [[NEXTI]], 1024 +; FIXED-NEXT: br i1 [[TMP9]], label [[MIDDLE_BLOCK:%.*]], label [[LOOP]], !llvm.loop [[LOOP8:![0-9]+]] ; FIXED: middle.block: ; FIXED-NEXT: br i1 true, label [[EXIT:%.*]], label [[SCALAR_PH]] ; FIXED: scalar.ph: -; FIXED-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 1024, [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ] -; FIXED-NEXT: br label [[LOOP:%.*]] +; FIXED-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 1024, [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY1:%.*]] ] +; FIXED-NEXT: br label [[LOOP1:%.*]] ; FIXED: loop: -; FIXED-NEXT: [[I:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[NEXTI:%.*]], [[LOOP]] ] -; FIXED-NEXT: [[OFFSET0:%.*]] = mul i64 [[I]], 3 -; FIXED-NEXT: [[Q0:%.*]] = getelementptr i64, ptr [[P]], i64 [[OFFSET0]] -; FIXED-NEXT: [[X0:%.*]] = load i64, ptr [[Q0]], align 8 +; FIXED-NEXT: [[I1:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[NEXTI1:%.*]], [[LOOP1]] ] +; FIXED-NEXT: [[OFFSET3:%.*]] = mul i64 [[I1]], 3 +; FIXED-NEXT: [[Q3:%.*]] = getelementptr i64, ptr [[P]], i64 [[OFFSET3]] +; FIXED-NEXT: [[X0:%.*]] = load i64, ptr [[Q3]], align 8 ; FIXED-NEXT: [[Y0:%.*]] = add i64 [[X0]], 1 -; FIXED-NEXT: store i64 [[Y0]], ptr [[Q0]], align 8 -; FIXED-NEXT: [[OFFSET1:%.*]] = add i64 [[OFFSET0]], 1 +; FIXED-NEXT: store i64 [[Y0]], ptr [[Q3]], align 8 +; FIXED-NEXT: [[OFFSET1:%.*]] = add i64 [[OFFSET3]], 1 ; FIXED-NEXT: [[Q1:%.*]] = getelementptr i64, ptr [[P]], i64 [[OFFSET1]] ; FIXED-NEXT: [[X1:%.*]] = load i64, ptr [[Q1]], align 8 ; FIXED-NEXT: [[Y1:%.*]] = add i64 [[X1]], 2 @@ -648,50 +648,50 @@ define void 
@load_store_factor3_i64(ptr %p) { ; FIXED-NEXT: [[X2:%.*]] = load i64, ptr [[Q2]], align 8 ; FIXED-NEXT: [[Y2:%.*]] = add i64 [[X2]], 3 ; FIXED-NEXT: store i64 [[Y2]], ptr [[Q2]], align 8 -; FIXED-NEXT: [[NEXTI]] = add i64 [[I]], 1 -; FIXED-NEXT: [[DONE:%.*]] = icmp eq i64 [[NEXTI]], 1024 -; FIXED-NEXT: br i1 [[DONE]], label [[EXIT]], label [[LOOP]], !llvm.loop [[LOOP9:![0-9]+]] +; FIXED-NEXT: [[NEXTI1]] = add i64 [[I1]], 1 +; FIXED-NEXT: [[DONE:%.*]] = icmp eq i64 [[NEXTI1]], 1024 +; FIXED-NEXT: br i1 [[DONE]], label [[EXIT]], label [[LOOP1]], !llvm.loop [[LOOP9:![0-9]+]] ; FIXED: exit: ; FIXED-NEXT: ret void ; ; SCALABLE-LABEL: @load_store_factor3_i64( ; SCALABLE-NEXT: entry: -; SCALABLE-NEXT: br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] +; SCALABLE-NEXT: br i1 false, label [[SCALAR_PH:%.*]], label [[ENTRY:%.*]] ; SCALABLE: vector.ph: -; SCALABLE-NEXT: br label [[VECTOR_BODY:%.*]] +; SCALABLE-NEXT: br label [[LOOP:%.*]] ; SCALABLE: vector.body: -; SCALABLE-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] -; SCALABLE-NEXT: [[TMP0:%.*]] = add i64 [[INDEX]], 0 -; SCALABLE-NEXT: [[TMP1:%.*]] = mul i64 [[TMP0]], 3 -; SCALABLE-NEXT: [[TMP2:%.*]] = getelementptr i64, ptr [[P:%.*]], i64 [[TMP1]] -; SCALABLE-NEXT: [[WIDE_VEC:%.*]] = load <12 x i64>, ptr [[TMP2]], align 8 +; SCALABLE-NEXT: [[I:%.*]] = phi i64 [ 0, [[ENTRY]] ], [ [[NEXTI:%.*]], [[LOOP]] ] +; SCALABLE-NEXT: [[TMP0:%.*]] = add i64 [[I]], 0 +; SCALABLE-NEXT: [[OFFSET0:%.*]] = mul i64 [[TMP0]], 3 +; SCALABLE-NEXT: [[Q0:%.*]] = getelementptr i64, ptr [[P:%.*]], i64 [[OFFSET0]] +; SCALABLE-NEXT: [[WIDE_VEC:%.*]] = load <12 x i64>, ptr [[Q0]], align 8 ; SCALABLE-NEXT: [[STRIDED_VEC:%.*]] = shufflevector <12 x i64> [[WIDE_VEC]], <12 x i64> poison, <4 x i32> ; SCALABLE-NEXT: [[STRIDED_VEC1:%.*]] = shufflevector <12 x i64> [[WIDE_VEC]], <12 x i64> poison, <4 x i32> ; SCALABLE-NEXT: [[STRIDED_VEC2:%.*]] = shufflevector <12 x i64> [[WIDE_VEC]], <12 x i64> 
poison, <4 x i32> -; SCALABLE-NEXT: [[TMP4:%.*]] = add <4 x i64> [[STRIDED_VEC]], splat (i64 1) -; SCALABLE-NEXT: [[TMP6:%.*]] = add <4 x i64> [[STRIDED_VEC1]], splat (i64 2) -; SCALABLE-NEXT: [[TMP9:%.*]] = add <4 x i64> [[STRIDED_VEC2]], splat (i64 3) -; SCALABLE-NEXT: [[TMP11:%.*]] = shufflevector <4 x i64> [[TMP4]], <4 x i64> [[TMP6]], <8 x i32> -; SCALABLE-NEXT: [[TMP12:%.*]] = shufflevector <4 x i64> [[TMP9]], <4 x i64> poison, <8 x i32> -; SCALABLE-NEXT: [[TMP13:%.*]] = shufflevector <8 x i64> [[TMP11]], <8 x i64> [[TMP12]], <12 x i32> -; SCALABLE-NEXT: [[INTERLEAVED_VEC:%.*]] = shufflevector <12 x i64> [[TMP13]], <12 x i64> poison, <12 x i32> -; SCALABLE-NEXT: store <12 x i64> [[INTERLEAVED_VEC]], ptr [[TMP2]], align 8 -; SCALABLE-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4 -; SCALABLE-NEXT: [[TMP14:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1024 -; SCALABLE-NEXT: br i1 [[TMP14]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP8:![0-9]+]] +; SCALABLE-NEXT: [[TMP3:%.*]] = add <4 x i64> [[STRIDED_VEC]], splat (i64 1) +; SCALABLE-NEXT: [[TMP4:%.*]] = add <4 x i64> [[STRIDED_VEC1]], splat (i64 2) +; SCALABLE-NEXT: [[TMP5:%.*]] = add <4 x i64> [[STRIDED_VEC2]], splat (i64 3) +; SCALABLE-NEXT: [[TMP6:%.*]] = shufflevector <4 x i64> [[TMP3]], <4 x i64> [[TMP4]], <8 x i32> +; SCALABLE-NEXT: [[TMP7:%.*]] = shufflevector <4 x i64> [[TMP5]], <4 x i64> poison, <8 x i32> +; SCALABLE-NEXT: [[TMP8:%.*]] = shufflevector <8 x i64> [[TMP6]], <8 x i64> [[TMP7]], <12 x i32> +; SCALABLE-NEXT: [[INTERLEAVED_VEC:%.*]] = shufflevector <12 x i64> [[TMP8]], <12 x i64> poison, <12 x i32> +; SCALABLE-NEXT: store <12 x i64> [[INTERLEAVED_VEC]], ptr [[Q0]], align 8 +; SCALABLE-NEXT: [[NEXTI]] = add nuw i64 [[I]], 4 +; SCALABLE-NEXT: [[TMP9:%.*]] = icmp eq i64 [[NEXTI]], 1024 +; SCALABLE-NEXT: br i1 [[TMP9]], label [[MIDDLE_BLOCK:%.*]], label [[LOOP]], !llvm.loop [[LOOP8:![0-9]+]] ; SCALABLE: middle.block: ; SCALABLE-NEXT: br i1 true, label [[EXIT:%.*]], label 
[[SCALAR_PH]] ; SCALABLE: scalar.ph: -; SCALABLE-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 1024, [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ] -; SCALABLE-NEXT: br label [[LOOP:%.*]] +; SCALABLE-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 1024, [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY1:%.*]] ] +; SCALABLE-NEXT: br label [[LOOP1:%.*]] ; SCALABLE: loop: -; SCALABLE-NEXT: [[I:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[NEXTI:%.*]], [[LOOP]] ] -; SCALABLE-NEXT: [[OFFSET0:%.*]] = mul i64 [[I]], 3 -; SCALABLE-NEXT: [[Q0:%.*]] = getelementptr i64, ptr [[P]], i64 [[OFFSET0]] -; SCALABLE-NEXT: [[X0:%.*]] = load i64, ptr [[Q0]], align 8 +; SCALABLE-NEXT: [[I1:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[NEXTI1:%.*]], [[LOOP1]] ] +; SCALABLE-NEXT: [[OFFSET3:%.*]] = mul i64 [[I1]], 3 +; SCALABLE-NEXT: [[Q3:%.*]] = getelementptr i64, ptr [[P]], i64 [[OFFSET3]] +; SCALABLE-NEXT: [[X0:%.*]] = load i64, ptr [[Q3]], align 8 ; SCALABLE-NEXT: [[Y0:%.*]] = add i64 [[X0]], 1 -; SCALABLE-NEXT: store i64 [[Y0]], ptr [[Q0]], align 8 -; SCALABLE-NEXT: [[OFFSET1:%.*]] = add i64 [[OFFSET0]], 1 +; SCALABLE-NEXT: store i64 [[Y0]], ptr [[Q3]], align 8 +; SCALABLE-NEXT: [[OFFSET1:%.*]] = add i64 [[OFFSET3]], 1 ; SCALABLE-NEXT: [[Q1:%.*]] = getelementptr i64, ptr [[P]], i64 [[OFFSET1]] ; SCALABLE-NEXT: [[X1:%.*]] = load i64, ptr [[Q1]], align 8 ; SCALABLE-NEXT: [[Y1:%.*]] = add i64 [[X1]], 2 @@ -701,9 +701,9 @@ define void @load_store_factor3_i64(ptr %p) { ; SCALABLE-NEXT: [[X2:%.*]] = load i64, ptr [[Q2]], align 8 ; SCALABLE-NEXT: [[Y2:%.*]] = add i64 [[X2]], 3 ; SCALABLE-NEXT: store i64 [[Y2]], ptr [[Q2]], align 8 -; SCALABLE-NEXT: [[NEXTI]] = add i64 [[I]], 1 -; SCALABLE-NEXT: [[DONE:%.*]] = icmp eq i64 [[NEXTI]], 1024 -; SCALABLE-NEXT: br i1 [[DONE]], label [[EXIT]], label [[LOOP]], !llvm.loop [[LOOP9:![0-9]+]] +; SCALABLE-NEXT: [[NEXTI1]] = add i64 [[I1]], 1 +; SCALABLE-NEXT: [[DONE:%.*]] = icmp eq i64 [[NEXTI1]], 1024 +; SCALABLE-NEXT: br i1 [[DONE]], label [[EXIT]], label 
[[LOOP1]], !llvm.loop [[LOOP9:![0-9]+]] ; SCALABLE: exit: ; SCALABLE-NEXT: ret void ; @@ -740,56 +740,75 @@ exit: define void @load_store_factor8(ptr %p) { ; CHECK-LABEL: @load_store_factor8( ; CHECK-NEXT: entry: -; CHECK-NEXT: br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] +; CHECK-NEXT: [[TMP0:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 1024, [[TMP0]] +; CHECK-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[ENTRY:%.*]] ; CHECK: vector.ph: -; CHECK-NEXT: br label [[VECTOR_BODY:%.*]] +; CHECK-NEXT: [[TMP1:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-NEXT: [[N_MOD_VF:%.*]] = urem i64 1024, [[TMP1]] +; CHECK-NEXT: [[N_VEC:%.*]] = sub i64 1024, [[N_MOD_VF]] +; CHECK-NEXT: [[TMP2:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-NEXT: br label [[LOOP:%.*]] ; CHECK: vector.body: -; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] -; CHECK-NEXT: [[TMP0:%.*]] = add i64 [[INDEX]], 0 -; CHECK-NEXT: [[TMP1:%.*]] = shl i64 [[TMP0]], 3 -; CHECK-NEXT: [[TMP2:%.*]] = getelementptr i64, ptr [[P:%.*]], i64 [[TMP1]] -; CHECK-NEXT: [[WIDE_VEC:%.*]] = load <16 x i64>, ptr [[TMP2]], align 8 -; CHECK-NEXT: [[STRIDED_VEC:%.*]] = shufflevector <16 x i64> [[WIDE_VEC]], <16 x i64> poison, <2 x i32> -; CHECK-NEXT: [[STRIDED_VEC1:%.*]] = shufflevector <16 x i64> [[WIDE_VEC]], <16 x i64> poison, <2 x i32> -; CHECK-NEXT: [[STRIDED_VEC2:%.*]] = shufflevector <16 x i64> [[WIDE_VEC]], <16 x i64> poison, <2 x i32> -; CHECK-NEXT: [[STRIDED_VEC3:%.*]] = shufflevector <16 x i64> [[WIDE_VEC]], <16 x i64> poison, <2 x i32> -; CHECK-NEXT: [[STRIDED_VEC4:%.*]] = shufflevector <16 x i64> [[WIDE_VEC]], <16 x i64> poison, <2 x i32> -; CHECK-NEXT: [[STRIDED_VEC5:%.*]] = shufflevector <16 x i64> [[WIDE_VEC]], <16 x i64> poison, <2 x i32> -; CHECK-NEXT: [[STRIDED_VEC6:%.*]] = shufflevector <16 x i64> [[WIDE_VEC]], <16 x i64> poison, <2 x i32> -; CHECK-NEXT: [[STRIDED_VEC7:%.*]] = 
shufflevector <16 x i64> [[WIDE_VEC]], <16 x i64> poison, <2 x i32> -; CHECK-NEXT: [[TMP4:%.*]] = add <2 x i64> [[STRIDED_VEC]], splat (i64 1) -; CHECK-NEXT: [[TMP6:%.*]] = add <2 x i64> [[STRIDED_VEC1]], splat (i64 2) -; CHECK-NEXT: [[TMP8:%.*]] = add <2 x i64> [[STRIDED_VEC2]], splat (i64 3) -; CHECK-NEXT: [[TMP10:%.*]] = add <2 x i64> [[STRIDED_VEC3]], splat (i64 4) -; CHECK-NEXT: [[TMP12:%.*]] = add <2 x i64> [[STRIDED_VEC4]], splat (i64 5) -; CHECK-NEXT: [[TMP14:%.*]] = add <2 x i64> [[STRIDED_VEC5]], splat (i64 6) -; CHECK-NEXT: [[TMP16:%.*]] = add <2 x i64> [[STRIDED_VEC6]], splat (i64 7) -; CHECK-NEXT: [[TMP19:%.*]] = add <2 x i64> [[STRIDED_VEC7]], splat (i64 8) -; CHECK-NEXT: [[TMP21:%.*]] = shufflevector <2 x i64> [[TMP4]], <2 x i64> [[TMP6]], <4 x i32> -; CHECK-NEXT: [[TMP22:%.*]] = shufflevector <2 x i64> [[TMP8]], <2 x i64> [[TMP10]], <4 x i32> -; CHECK-NEXT: [[TMP23:%.*]] = shufflevector <2 x i64> [[TMP12]], <2 x i64> [[TMP14]], <4 x i32> -; CHECK-NEXT: [[TMP24:%.*]] = shufflevector <2 x i64> [[TMP16]], <2 x i64> [[TMP19]], <4 x i32> -; CHECK-NEXT: [[TMP25:%.*]] = shufflevector <4 x i64> [[TMP21]], <4 x i64> [[TMP22]], <8 x i32> -; CHECK-NEXT: [[TMP26:%.*]] = shufflevector <4 x i64> [[TMP23]], <4 x i64> [[TMP24]], <8 x i32> -; CHECK-NEXT: [[TMP27:%.*]] = shufflevector <8 x i64> [[TMP25]], <8 x i64> [[TMP26]], <16 x i32> -; CHECK-NEXT: [[INTERLEAVED_VEC:%.*]] = shufflevector <16 x i64> [[TMP27]], <16 x i64> poison, <16 x i32> -; CHECK-NEXT: store <16 x i64> [[INTERLEAVED_VEC]], ptr [[TMP2]], align 8 -; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 2 -; CHECK-NEXT: [[TMP28:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1024 -; CHECK-NEXT: br i1 [[TMP28]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP10:![0-9]+]] +; CHECK-NEXT: [[I:%.*]] = phi i64 [ 0, [[ENTRY]] ], [ [[NEXTI:%.*]], [[LOOP]] ] +; CHECK-NEXT: [[TMP3:%.*]] = add i64 [[I]], 0 +; CHECK-NEXT: [[OFFSET0:%.*]] = shl i64 [[TMP3]], 3 +; CHECK-NEXT: [[Q0:%.*]] = getelementptr 
i64, ptr [[P:%.*]], i64 [[OFFSET0]] +; CHECK-NEXT: [[WIDE_VEC:%.*]] = load , ptr [[Q0]], align 8 +; CHECK-NEXT: [[STRIDED_VEC:%.*]] = call { , } @llvm.vector.deinterleave2.nxv8i64( [[WIDE_VEC]]) +; CHECK-NEXT: [[TMP6:%.*]] = extractvalue { , } [[STRIDED_VEC]], 0 +; CHECK-NEXT: [[TMP7:%.*]] = extractvalue { , } [[STRIDED_VEC]], 1 +; CHECK-NEXT: [[STRIDED_VEC1:%.*]] = call { , } @llvm.vector.deinterleave2.nxv4i64( [[TMP6]]) +; CHECK-NEXT: [[STRIDED_VEC2:%.*]] = call { , } @llvm.vector.deinterleave2.nxv4i64( [[TMP7]]) +; CHECK-NEXT: [[TMP8:%.*]] = extractvalue { , } [[STRIDED_VEC1]], 0 +; CHECK-NEXT: [[TMP9:%.*]] = extractvalue { , } [[STRIDED_VEC2]], 0 +; CHECK-NEXT: [[TMP10:%.*]] = extractvalue { , } [[STRIDED_VEC1]], 1 +; CHECK-NEXT: [[TMP11:%.*]] = extractvalue { , } [[STRIDED_VEC2]], 1 +; CHECK-NEXT: [[STRIDED_VEC3:%.*]] = call { , } @llvm.vector.deinterleave2.nxv2i64( [[TMP8]]) +; CHECK-NEXT: [[STRIDED_VEC4:%.*]] = call { , } @llvm.vector.deinterleave2.nxv2i64( [[TMP9]]) +; CHECK-NEXT: [[STRIDED_VEC5:%.*]] = call { , } @llvm.vector.deinterleave2.nxv2i64( [[TMP10]]) +; CHECK-NEXT: [[STRIDED_VEC6:%.*]] = call { , } @llvm.vector.deinterleave2.nxv2i64( [[TMP11]]) +; CHECK-NEXT: [[TMP12:%.*]] = extractvalue { , } [[STRIDED_VEC3]], 0 +; CHECK-NEXT: [[TMP13:%.*]] = extractvalue { , } [[STRIDED_VEC4]], 0 +; CHECK-NEXT: [[TMP14:%.*]] = extractvalue { , } [[STRIDED_VEC5]], 0 +; CHECK-NEXT: [[TMP15:%.*]] = extractvalue { , } [[STRIDED_VEC6]], 0 +; CHECK-NEXT: [[TMP16:%.*]] = extractvalue { , } [[STRIDED_VEC3]], 1 +; CHECK-NEXT: [[TMP17:%.*]] = extractvalue { , } [[STRIDED_VEC4]], 1 +; CHECK-NEXT: [[TMP18:%.*]] = extractvalue { , } [[STRIDED_VEC5]], 1 +; CHECK-NEXT: [[TMP19:%.*]] = extractvalue { , } [[STRIDED_VEC6]], 1 +; CHECK-NEXT: [[TMP20:%.*]] = add [[TMP12]], splat (i64 1) +; CHECK-NEXT: [[TMP21:%.*]] = add [[TMP13]], splat (i64 2) +; CHECK-NEXT: [[TMP22:%.*]] = add [[TMP14]], splat (i64 3) +; CHECK-NEXT: [[TMP23:%.*]] = add [[TMP15]], splat (i64 4) +; CHECK-NEXT: 
[[TMP24:%.*]] = add [[TMP16]], splat (i64 5) +; CHECK-NEXT: [[TMP25:%.*]] = add [[TMP17]], splat (i64 6) +; CHECK-NEXT: [[TMP26:%.*]] = add [[TMP18]], splat (i64 7) +; CHECK-NEXT: [[TMP27:%.*]] = add [[TMP19]], splat (i64 8) +; CHECK-NEXT: [[INTERLEAVED_VEC:%.*]] = call @llvm.vector.interleave2.nxv2i64( [[TMP20]], [[TMP24]]) +; CHECK-NEXT: [[INTERLEAVED_VEC7:%.*]] = call @llvm.vector.interleave2.nxv2i64( [[TMP21]], [[TMP25]]) +; CHECK-NEXT: [[INTERLEAVED_VEC8:%.*]] = call @llvm.vector.interleave2.nxv2i64( [[TMP22]], [[TMP26]]) +; CHECK-NEXT: [[INTERLEAVED_VEC9:%.*]] = call @llvm.vector.interleave2.nxv2i64( [[TMP23]], [[TMP27]]) +; CHECK-NEXT: [[INTERLEAVED_VEC10:%.*]] = call @llvm.vector.interleave2.nxv4i64( [[INTERLEAVED_VEC]], [[INTERLEAVED_VEC8]]) +; CHECK-NEXT: [[INTERLEAVED_VEC11:%.*]] = call @llvm.vector.interleave2.nxv4i64( [[INTERLEAVED_VEC7]], [[INTERLEAVED_VEC9]]) +; CHECK-NEXT: [[INTERLEAVED_VEC12:%.*]] = call @llvm.vector.interleave2.nxv8i64( [[INTERLEAVED_VEC10]], [[INTERLEAVED_VEC11]]) +; CHECK-NEXT: store [[INTERLEAVED_VEC12]], ptr [[Q0]], align 8 +; CHECK-NEXT: [[NEXTI]] = add nuw i64 [[I]], [[TMP2]] +; CHECK-NEXT: [[TMP28:%.*]] = icmp eq i64 [[NEXTI]], [[N_VEC]] +; CHECK-NEXT: br i1 [[TMP28]], label [[MIDDLE_BLOCK:%.*]], label [[LOOP]], !llvm.loop [[LOOP10:![0-9]+]] ; CHECK: middle.block: -; CHECK-NEXT: br i1 true, label [[EXIT:%.*]], label [[SCALAR_PH]] +; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 1024, [[N_VEC]] +; CHECK-NEXT: br i1 [[CMP_N]], label [[EXIT:%.*]], label [[SCALAR_PH]] ; CHECK: scalar.ph: -; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 1024, [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ] -; CHECK-NEXT: br label [[LOOP:%.*]] +; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY1:%.*]] ] +; CHECK-NEXT: br label [[LOOP1:%.*]] ; CHECK: loop: -; CHECK-NEXT: [[I:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[NEXTI:%.*]], [[LOOP]] ] -; CHECK-NEXT: [[OFFSET0:%.*]] = shl i64 [[I]], 3 -; CHECK-NEXT: 
[[Q0:%.*]] = getelementptr i64, ptr [[P]], i64 [[OFFSET0]] -; CHECK-NEXT: [[X0:%.*]] = load i64, ptr [[Q0]], align 8 +; CHECK-NEXT: [[I1:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[NEXTI1:%.*]], [[LOOP1]] ] +; CHECK-NEXT: [[OFFSET8:%.*]] = shl i64 [[I1]], 3 +; CHECK-NEXT: [[Q8:%.*]] = getelementptr i64, ptr [[P]], i64 [[OFFSET8]] +; CHECK-NEXT: [[X0:%.*]] = load i64, ptr [[Q8]], align 8 ; CHECK-NEXT: [[Y0:%.*]] = add i64 [[X0]], 1 -; CHECK-NEXT: store i64 [[Y0]], ptr [[Q0]], align 8 -; CHECK-NEXT: [[OFFSET1:%.*]] = add i64 [[OFFSET0]], 1 +; CHECK-NEXT: store i64 [[Y0]], ptr [[Q8]], align 8 +; CHECK-NEXT: [[OFFSET1:%.*]] = add i64 [[OFFSET8]], 1 ; CHECK-NEXT: [[Q1:%.*]] = getelementptr i64, ptr [[P]], i64 [[OFFSET1]] ; CHECK-NEXT: [[X1:%.*]] = load i64, ptr [[Q1]], align 8 ; CHECK-NEXT: [[Y1:%.*]] = add i64 [[X1]], 2 @@ -824,23 +843,23 @@ define void @load_store_factor8(ptr %p) { ; CHECK-NEXT: [[X7:%.*]] = load i64, ptr [[Q7]], align 8 ; CHECK-NEXT: [[Y7:%.*]] = add i64 [[X7]], 8 ; CHECK-NEXT: store i64 [[Y7]], ptr [[Q7]], align 8 -; CHECK-NEXT: [[NEXTI]] = add i64 [[I]], 1 -; CHECK-NEXT: [[DONE:%.*]] = icmp eq i64 [[NEXTI]], 1024 -; CHECK-NEXT: br i1 [[DONE]], label [[EXIT]], label [[LOOP]], !llvm.loop [[LOOP11:![0-9]+]] +; CHECK-NEXT: [[NEXTI1]] = add i64 [[I1]], 1 +; CHECK-NEXT: [[DONE:%.*]] = icmp eq i64 [[NEXTI1]], 1024 +; CHECK-NEXT: br i1 [[DONE]], label [[EXIT]], label [[LOOP1]], !llvm.loop [[LOOP11:![0-9]+]] ; CHECK: exit: ; CHECK-NEXT: ret void ; ; FIXED-LABEL: @load_store_factor8( ; FIXED-NEXT: entry: -; FIXED-NEXT: br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] +; FIXED-NEXT: br i1 false, label [[SCALAR_PH:%.*]], label [[ENTRY:%.*]] ; FIXED: vector.ph: -; FIXED-NEXT: br label [[VECTOR_BODY:%.*]] +; FIXED-NEXT: br label [[LOOP:%.*]] ; FIXED: vector.body: -; FIXED-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] -; FIXED-NEXT: [[TMP0:%.*]] = add i64 [[INDEX]], 0 -; FIXED-NEXT: 
[[TMP1:%.*]] = shl i64 [[TMP0]], 3 -; FIXED-NEXT: [[TMP2:%.*]] = getelementptr i64, ptr [[P:%.*]], i64 [[TMP1]] -; FIXED-NEXT: [[WIDE_VEC:%.*]] = load <16 x i64>, ptr [[TMP2]], align 8 +; FIXED-NEXT: [[I:%.*]] = phi i64 [ 0, [[ENTRY]] ], [ [[NEXTI:%.*]], [[LOOP]] ] +; FIXED-NEXT: [[TMP0:%.*]] = add i64 [[I]], 0 +; FIXED-NEXT: [[OFFSET0:%.*]] = shl i64 [[TMP0]], 3 +; FIXED-NEXT: [[Q0:%.*]] = getelementptr i64, ptr [[P:%.*]], i64 [[OFFSET0]] +; FIXED-NEXT: [[WIDE_VEC:%.*]] = load <16 x i64>, ptr [[Q0]], align 8 ; FIXED-NEXT: [[STRIDED_VEC:%.*]] = shufflevector <16 x i64> [[WIDE_VEC]], <16 x i64> poison, <2 x i32> ; FIXED-NEXT: [[STRIDED_VEC1:%.*]] = shufflevector <16 x i64> [[WIDE_VEC]], <16 x i64> poison, <2 x i32> ; FIXED-NEXT: [[STRIDED_VEC2:%.*]] = shufflevector <16 x i64> [[WIDE_VEC]], <16 x i64> poison, <2 x i32> @@ -849,39 +868,39 @@ define void @load_store_factor8(ptr %p) { ; FIXED-NEXT: [[STRIDED_VEC5:%.*]] = shufflevector <16 x i64> [[WIDE_VEC]], <16 x i64> poison, <2 x i32> ; FIXED-NEXT: [[STRIDED_VEC6:%.*]] = shufflevector <16 x i64> [[WIDE_VEC]], <16 x i64> poison, <2 x i32> ; FIXED-NEXT: [[STRIDED_VEC7:%.*]] = shufflevector <16 x i64> [[WIDE_VEC]], <16 x i64> poison, <2 x i32> -; FIXED-NEXT: [[TMP4:%.*]] = add <2 x i64> [[STRIDED_VEC]], splat (i64 1) -; FIXED-NEXT: [[TMP6:%.*]] = add <2 x i64> [[STRIDED_VEC1]], splat (i64 2) -; FIXED-NEXT: [[TMP8:%.*]] = add <2 x i64> [[STRIDED_VEC2]], splat (i64 3) -; FIXED-NEXT: [[TMP10:%.*]] = add <2 x i64> [[STRIDED_VEC3]], splat (i64 4) -; FIXED-NEXT: [[TMP12:%.*]] = add <2 x i64> [[STRIDED_VEC4]], splat (i64 5) -; FIXED-NEXT: [[TMP14:%.*]] = add <2 x i64> [[STRIDED_VEC5]], splat (i64 6) -; FIXED-NEXT: [[TMP16:%.*]] = add <2 x i64> [[STRIDED_VEC6]], splat (i64 7) -; FIXED-NEXT: [[TMP19:%.*]] = add <2 x i64> [[STRIDED_VEC7]], splat (i64 8) -; FIXED-NEXT: [[TMP21:%.*]] = shufflevector <2 x i64> [[TMP4]], <2 x i64> [[TMP6]], <4 x i32> -; FIXED-NEXT: [[TMP22:%.*]] = shufflevector <2 x i64> [[TMP8]], <2 x i64> 
[[TMP10]], <4 x i32> -; FIXED-NEXT: [[TMP23:%.*]] = shufflevector <2 x i64> [[TMP12]], <2 x i64> [[TMP14]], <4 x i32> -; FIXED-NEXT: [[TMP24:%.*]] = shufflevector <2 x i64> [[TMP16]], <2 x i64> [[TMP19]], <4 x i32> -; FIXED-NEXT: [[TMP25:%.*]] = shufflevector <4 x i64> [[TMP21]], <4 x i64> [[TMP22]], <8 x i32> -; FIXED-NEXT: [[TMP26:%.*]] = shufflevector <4 x i64> [[TMP23]], <4 x i64> [[TMP24]], <8 x i32> -; FIXED-NEXT: [[TMP27:%.*]] = shufflevector <8 x i64> [[TMP25]], <8 x i64> [[TMP26]], <16 x i32> -; FIXED-NEXT: [[INTERLEAVED_VEC:%.*]] = shufflevector <16 x i64> [[TMP27]], <16 x i64> poison, <16 x i32> -; FIXED-NEXT: store <16 x i64> [[INTERLEAVED_VEC]], ptr [[TMP2]], align 8 -; FIXED-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 2 -; FIXED-NEXT: [[TMP28:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1024 -; FIXED-NEXT: br i1 [[TMP28]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP10:![0-9]+]] +; FIXED-NEXT: [[TMP3:%.*]] = add <2 x i64> [[STRIDED_VEC]], splat (i64 1) +; FIXED-NEXT: [[TMP4:%.*]] = add <2 x i64> [[STRIDED_VEC1]], splat (i64 2) +; FIXED-NEXT: [[TMP5:%.*]] = add <2 x i64> [[STRIDED_VEC2]], splat (i64 3) +; FIXED-NEXT: [[TMP6:%.*]] = add <2 x i64> [[STRIDED_VEC3]], splat (i64 4) +; FIXED-NEXT: [[TMP7:%.*]] = add <2 x i64> [[STRIDED_VEC4]], splat (i64 5) +; FIXED-NEXT: [[TMP8:%.*]] = add <2 x i64> [[STRIDED_VEC5]], splat (i64 6) +; FIXED-NEXT: [[TMP9:%.*]] = add <2 x i64> [[STRIDED_VEC6]], splat (i64 7) +; FIXED-NEXT: [[TMP10:%.*]] = add <2 x i64> [[STRIDED_VEC7]], splat (i64 8) +; FIXED-NEXT: [[TMP11:%.*]] = shufflevector <2 x i64> [[TMP3]], <2 x i64> [[TMP4]], <4 x i32> +; FIXED-NEXT: [[TMP12:%.*]] = shufflevector <2 x i64> [[TMP5]], <2 x i64> [[TMP6]], <4 x i32> +; FIXED-NEXT: [[TMP13:%.*]] = shufflevector <2 x i64> [[TMP7]], <2 x i64> [[TMP8]], <4 x i32> +; FIXED-NEXT: [[TMP14:%.*]] = shufflevector <2 x i64> [[TMP9]], <2 x i64> [[TMP10]], <4 x i32> +; FIXED-NEXT: [[TMP15:%.*]] = shufflevector <4 x i64> [[TMP11]], <4 x i64> [[TMP12]], 
<8 x i32> +; FIXED-NEXT: [[TMP16:%.*]] = shufflevector <4 x i64> [[TMP13]], <4 x i64> [[TMP14]], <8 x i32> +; FIXED-NEXT: [[TMP17:%.*]] = shufflevector <8 x i64> [[TMP15]], <8 x i64> [[TMP16]], <16 x i32> +; FIXED-NEXT: [[INTERLEAVED_VEC:%.*]] = shufflevector <16 x i64> [[TMP17]], <16 x i64> poison, <16 x i32> +; FIXED-NEXT: store <16 x i64> [[INTERLEAVED_VEC]], ptr [[Q0]], align 8 +; FIXED-NEXT: [[NEXTI]] = add nuw i64 [[I]], 2 +; FIXED-NEXT: [[TMP18:%.*]] = icmp eq i64 [[NEXTI]], 1024 +; FIXED-NEXT: br i1 [[TMP18]], label [[MIDDLE_BLOCK:%.*]], label [[LOOP]], !llvm.loop [[LOOP10:![0-9]+]] ; FIXED: middle.block: ; FIXED-NEXT: br i1 true, label [[EXIT:%.*]], label [[SCALAR_PH]] ; FIXED: scalar.ph: -; FIXED-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 1024, [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ] -; FIXED-NEXT: br label [[LOOP:%.*]] +; FIXED-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 1024, [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY1:%.*]] ] +; FIXED-NEXT: br label [[LOOP1:%.*]] ; FIXED: loop: -; FIXED-NEXT: [[I:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[NEXTI:%.*]], [[LOOP]] ] -; FIXED-NEXT: [[OFFSET0:%.*]] = shl i64 [[I]], 3 -; FIXED-NEXT: [[Q0:%.*]] = getelementptr i64, ptr [[P]], i64 [[OFFSET0]] -; FIXED-NEXT: [[X0:%.*]] = load i64, ptr [[Q0]], align 8 +; FIXED-NEXT: [[I1:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[NEXTI1:%.*]], [[LOOP1]] ] +; FIXED-NEXT: [[OFFSET8:%.*]] = shl i64 [[I1]], 3 +; FIXED-NEXT: [[Q8:%.*]] = getelementptr i64, ptr [[P]], i64 [[OFFSET8]] +; FIXED-NEXT: [[X0:%.*]] = load i64, ptr [[Q8]], align 8 ; FIXED-NEXT: [[Y0:%.*]] = add i64 [[X0]], 1 -; FIXED-NEXT: store i64 [[Y0]], ptr [[Q0]], align 8 -; FIXED-NEXT: [[OFFSET1:%.*]] = add i64 [[OFFSET0]], 1 +; FIXED-NEXT: store i64 [[Y0]], ptr [[Q8]], align 8 +; FIXED-NEXT: [[OFFSET1:%.*]] = add i64 [[OFFSET8]], 1 ; FIXED-NEXT: [[Q1:%.*]] = getelementptr i64, ptr [[P]], i64 [[OFFSET1]] ; FIXED-NEXT: [[X1:%.*]] = load i64, ptr [[Q1]], align 8 ; FIXED-NEXT: [[Y1:%.*]] = add i64 [[X1]], 2 
@@ -916,64 +935,83 @@ define void @load_store_factor8(ptr %p) { ; FIXED-NEXT: [[X7:%.*]] = load i64, ptr [[Q7]], align 8 ; FIXED-NEXT: [[Y7:%.*]] = add i64 [[X7]], 8 ; FIXED-NEXT: store i64 [[Y7]], ptr [[Q7]], align 8 -; FIXED-NEXT: [[NEXTI]] = add i64 [[I]], 1 -; FIXED-NEXT: [[DONE:%.*]] = icmp eq i64 [[NEXTI]], 1024 -; FIXED-NEXT: br i1 [[DONE]], label [[EXIT]], label [[LOOP]], !llvm.loop [[LOOP11:![0-9]+]] +; FIXED-NEXT: [[NEXTI1]] = add i64 [[I1]], 1 +; FIXED-NEXT: [[DONE:%.*]] = icmp eq i64 [[NEXTI1]], 1024 +; FIXED-NEXT: br i1 [[DONE]], label [[EXIT]], label [[LOOP1]], !llvm.loop [[LOOP11:![0-9]+]] ; FIXED: exit: ; FIXED-NEXT: ret void ; ; SCALABLE-LABEL: @load_store_factor8( ; SCALABLE-NEXT: entry: -; SCALABLE-NEXT: br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] +; SCALABLE-NEXT: [[TMP0:%.*]] = call i64 @llvm.vscale.i64() +; SCALABLE-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 1024, [[TMP0]] +; SCALABLE-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[ENTRY:%.*]] ; SCALABLE: vector.ph: -; SCALABLE-NEXT: br label [[VECTOR_BODY:%.*]] +; SCALABLE-NEXT: [[TMP1:%.*]] = call i64 @llvm.vscale.i64() +; SCALABLE-NEXT: [[N_MOD_VF:%.*]] = urem i64 1024, [[TMP1]] +; SCALABLE-NEXT: [[N_VEC:%.*]] = sub i64 1024, [[N_MOD_VF]] +; SCALABLE-NEXT: [[TMP2:%.*]] = call i64 @llvm.vscale.i64() +; SCALABLE-NEXT: br label [[LOOP:%.*]] ; SCALABLE: vector.body: -; SCALABLE-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] -; SCALABLE-NEXT: [[TMP0:%.*]] = add i64 [[INDEX]], 0 -; SCALABLE-NEXT: [[TMP1:%.*]] = shl i64 [[TMP0]], 3 -; SCALABLE-NEXT: [[TMP2:%.*]] = getelementptr i64, ptr [[P:%.*]], i64 [[TMP1]] -; SCALABLE-NEXT: [[WIDE_VEC:%.*]] = load <16 x i64>, ptr [[TMP2]], align 8 -; SCALABLE-NEXT: [[STRIDED_VEC:%.*]] = shufflevector <16 x i64> [[WIDE_VEC]], <16 x i64> poison, <2 x i32> -; SCALABLE-NEXT: [[STRIDED_VEC1:%.*]] = shufflevector <16 x i64> [[WIDE_VEC]], <16 x i64> poison, <2 x i32> -; SCALABLE-NEXT: 
[[STRIDED_VEC2:%.*]] = shufflevector <16 x i64> [[WIDE_VEC]], <16 x i64> poison, <2 x i32> -; SCALABLE-NEXT: [[STRIDED_VEC3:%.*]] = shufflevector <16 x i64> [[WIDE_VEC]], <16 x i64> poison, <2 x i32> -; SCALABLE-NEXT: [[STRIDED_VEC4:%.*]] = shufflevector <16 x i64> [[WIDE_VEC]], <16 x i64> poison, <2 x i32> -; SCALABLE-NEXT: [[STRIDED_VEC5:%.*]] = shufflevector <16 x i64> [[WIDE_VEC]], <16 x i64> poison, <2 x i32> -; SCALABLE-NEXT: [[STRIDED_VEC6:%.*]] = shufflevector <16 x i64> [[WIDE_VEC]], <16 x i64> poison, <2 x i32> -; SCALABLE-NEXT: [[STRIDED_VEC7:%.*]] = shufflevector <16 x i64> [[WIDE_VEC]], <16 x i64> poison, <2 x i32> -; SCALABLE-NEXT: [[TMP4:%.*]] = add <2 x i64> [[STRIDED_VEC]], splat (i64 1) -; SCALABLE-NEXT: [[TMP6:%.*]] = add <2 x i64> [[STRIDED_VEC1]], splat (i64 2) -; SCALABLE-NEXT: [[TMP8:%.*]] = add <2 x i64> [[STRIDED_VEC2]], splat (i64 3) -; SCALABLE-NEXT: [[TMP10:%.*]] = add <2 x i64> [[STRIDED_VEC3]], splat (i64 4) -; SCALABLE-NEXT: [[TMP12:%.*]] = add <2 x i64> [[STRIDED_VEC4]], splat (i64 5) -; SCALABLE-NEXT: [[TMP14:%.*]] = add <2 x i64> [[STRIDED_VEC5]], splat (i64 6) -; SCALABLE-NEXT: [[TMP16:%.*]] = add <2 x i64> [[STRIDED_VEC6]], splat (i64 7) -; SCALABLE-NEXT: [[TMP19:%.*]] = add <2 x i64> [[STRIDED_VEC7]], splat (i64 8) -; SCALABLE-NEXT: [[TMP21:%.*]] = shufflevector <2 x i64> [[TMP4]], <2 x i64> [[TMP6]], <4 x i32> -; SCALABLE-NEXT: [[TMP22:%.*]] = shufflevector <2 x i64> [[TMP8]], <2 x i64> [[TMP10]], <4 x i32> -; SCALABLE-NEXT: [[TMP23:%.*]] = shufflevector <2 x i64> [[TMP12]], <2 x i64> [[TMP14]], <4 x i32> -; SCALABLE-NEXT: [[TMP24:%.*]] = shufflevector <2 x i64> [[TMP16]], <2 x i64> [[TMP19]], <4 x i32> -; SCALABLE-NEXT: [[TMP25:%.*]] = shufflevector <4 x i64> [[TMP21]], <4 x i64> [[TMP22]], <8 x i32> -; SCALABLE-NEXT: [[TMP26:%.*]] = shufflevector <4 x i64> [[TMP23]], <4 x i64> [[TMP24]], <8 x i32> -; SCALABLE-NEXT: [[TMP27:%.*]] = shufflevector <8 x i64> [[TMP25]], <8 x i64> [[TMP26]], <16 x i32> -; SCALABLE-NEXT: 
[[INTERLEAVED_VEC:%.*]] = shufflevector <16 x i64> [[TMP27]], <16 x i64> poison, <16 x i32> -; SCALABLE-NEXT: store <16 x i64> [[INTERLEAVED_VEC]], ptr [[TMP2]], align 8 -; SCALABLE-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 2 -; SCALABLE-NEXT: [[TMP28:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1024 -; SCALABLE-NEXT: br i1 [[TMP28]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP10:![0-9]+]] +; SCALABLE-NEXT: [[I:%.*]] = phi i64 [ 0, [[ENTRY]] ], [ [[NEXTI:%.*]], [[LOOP]] ] +; SCALABLE-NEXT: [[TMP3:%.*]] = add i64 [[I]], 0 +; SCALABLE-NEXT: [[OFFSET0:%.*]] = shl i64 [[TMP3]], 3 +; SCALABLE-NEXT: [[Q0:%.*]] = getelementptr i64, ptr [[P:%.*]], i64 [[OFFSET0]] +; SCALABLE-NEXT: [[WIDE_VEC:%.*]] = load , ptr [[Q0]], align 8 +; SCALABLE-NEXT: [[STRIDED_VEC:%.*]] = call { , } @llvm.vector.deinterleave2.nxv8i64( [[WIDE_VEC]]) +; SCALABLE-NEXT: [[TMP6:%.*]] = extractvalue { , } [[STRIDED_VEC]], 0 +; SCALABLE-NEXT: [[TMP7:%.*]] = extractvalue { , } [[STRIDED_VEC]], 1 +; SCALABLE-NEXT: [[STRIDED_VEC1:%.*]] = call { , } @llvm.vector.deinterleave2.nxv4i64( [[TMP6]]) +; SCALABLE-NEXT: [[STRIDED_VEC2:%.*]] = call { , } @llvm.vector.deinterleave2.nxv4i64( [[TMP7]]) +; SCALABLE-NEXT: [[TMP8:%.*]] = extractvalue { , } [[STRIDED_VEC1]], 0 +; SCALABLE-NEXT: [[TMP9:%.*]] = extractvalue { , } [[STRIDED_VEC2]], 0 +; SCALABLE-NEXT: [[TMP10:%.*]] = extractvalue { , } [[STRIDED_VEC1]], 1 +; SCALABLE-NEXT: [[TMP11:%.*]] = extractvalue { , } [[STRIDED_VEC2]], 1 +; SCALABLE-NEXT: [[STRIDED_VEC3:%.*]] = call { , } @llvm.vector.deinterleave2.nxv2i64( [[TMP8]]) +; SCALABLE-NEXT: [[STRIDED_VEC4:%.*]] = call { , } @llvm.vector.deinterleave2.nxv2i64( [[TMP9]]) +; SCALABLE-NEXT: [[STRIDED_VEC5:%.*]] = call { , } @llvm.vector.deinterleave2.nxv2i64( [[TMP10]]) +; SCALABLE-NEXT: [[STRIDED_VEC6:%.*]] = call { , } @llvm.vector.deinterleave2.nxv2i64( [[TMP11]]) +; SCALABLE-NEXT: [[TMP12:%.*]] = extractvalue { , } [[STRIDED_VEC3]], 0 +; SCALABLE-NEXT: [[TMP13:%.*]] = extractvalue { , } 
[[STRIDED_VEC4]], 0 +; SCALABLE-NEXT: [[TMP14:%.*]] = extractvalue { , } [[STRIDED_VEC5]], 0 +; SCALABLE-NEXT: [[TMP15:%.*]] = extractvalue { , } [[STRIDED_VEC6]], 0 +; SCALABLE-NEXT: [[TMP16:%.*]] = extractvalue { , } [[STRIDED_VEC3]], 1 +; SCALABLE-NEXT: [[TMP17:%.*]] = extractvalue { , } [[STRIDED_VEC4]], 1 +; SCALABLE-NEXT: [[TMP18:%.*]] = extractvalue { , } [[STRIDED_VEC5]], 1 +; SCALABLE-NEXT: [[TMP19:%.*]] = extractvalue { , } [[STRIDED_VEC6]], 1 +; SCALABLE-NEXT: [[TMP20:%.*]] = add [[TMP12]], splat (i64 1) +; SCALABLE-NEXT: [[TMP21:%.*]] = add [[TMP13]], splat (i64 2) +; SCALABLE-NEXT: [[TMP22:%.*]] = add [[TMP14]], splat (i64 3) +; SCALABLE-NEXT: [[TMP23:%.*]] = add [[TMP15]], splat (i64 4) +; SCALABLE-NEXT: [[TMP24:%.*]] = add [[TMP16]], splat (i64 5) +; SCALABLE-NEXT: [[TMP25:%.*]] = add [[TMP17]], splat (i64 6) +; SCALABLE-NEXT: [[TMP26:%.*]] = add [[TMP18]], splat (i64 7) +; SCALABLE-NEXT: [[TMP27:%.*]] = add [[TMP19]], splat (i64 8) +; SCALABLE-NEXT: [[INTERLEAVED_VEC:%.*]] = call @llvm.vector.interleave2.nxv2i64( [[TMP20]], [[TMP24]]) +; SCALABLE-NEXT: [[INTERLEAVED_VEC7:%.*]] = call @llvm.vector.interleave2.nxv2i64( [[TMP21]], [[TMP25]]) +; SCALABLE-NEXT: [[INTERLEAVED_VEC8:%.*]] = call @llvm.vector.interleave2.nxv2i64( [[TMP22]], [[TMP26]]) +; SCALABLE-NEXT: [[INTERLEAVED_VEC9:%.*]] = call @llvm.vector.interleave2.nxv2i64( [[TMP23]], [[TMP27]]) +; SCALABLE-NEXT: [[INTERLEAVED_VEC10:%.*]] = call @llvm.vector.interleave2.nxv4i64( [[INTERLEAVED_VEC]], [[INTERLEAVED_VEC8]]) +; SCALABLE-NEXT: [[INTERLEAVED_VEC11:%.*]] = call @llvm.vector.interleave2.nxv4i64( [[INTERLEAVED_VEC7]], [[INTERLEAVED_VEC9]]) +; SCALABLE-NEXT: [[INTERLEAVED_VEC12:%.*]] = call @llvm.vector.interleave2.nxv8i64( [[INTERLEAVED_VEC10]], [[INTERLEAVED_VEC11]]) +; SCALABLE-NEXT: store [[INTERLEAVED_VEC12]], ptr [[Q0]], align 8 +; SCALABLE-NEXT: [[NEXTI]] = add nuw i64 [[I]], [[TMP2]] +; SCALABLE-NEXT: [[TMP28:%.*]] = icmp eq i64 [[NEXTI]], [[N_VEC]] +; SCALABLE-NEXT: br i1 [[TMP28]], 
label [[MIDDLE_BLOCK:%.*]], label [[LOOP]], !llvm.loop [[LOOP10:![0-9]+]] ; SCALABLE: middle.block: -; SCALABLE-NEXT: br i1 true, label [[EXIT:%.*]], label [[SCALAR_PH]] +; SCALABLE-NEXT: [[CMP_N:%.*]] = icmp eq i64 1024, [[N_VEC]] +; SCALABLE-NEXT: br i1 [[CMP_N]], label [[EXIT:%.*]], label [[SCALAR_PH]] ; SCALABLE: scalar.ph: -; SCALABLE-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 1024, [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ] -; SCALABLE-NEXT: br label [[LOOP:%.*]] +; SCALABLE-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY1:%.*]] ] +; SCALABLE-NEXT: br label [[LOOP1:%.*]] ; SCALABLE: loop: -; SCALABLE-NEXT: [[I:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[NEXTI:%.*]], [[LOOP]] ] -; SCALABLE-NEXT: [[OFFSET0:%.*]] = shl i64 [[I]], 3 -; SCALABLE-NEXT: [[Q0:%.*]] = getelementptr i64, ptr [[P]], i64 [[OFFSET0]] -; SCALABLE-NEXT: [[X0:%.*]] = load i64, ptr [[Q0]], align 8 +; SCALABLE-NEXT: [[I1:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[NEXTI1:%.*]], [[LOOP1]] ] +; SCALABLE-NEXT: [[OFFSET8:%.*]] = shl i64 [[I1]], 3 +; SCALABLE-NEXT: [[Q8:%.*]] = getelementptr i64, ptr [[P]], i64 [[OFFSET8]] +; SCALABLE-NEXT: [[X0:%.*]] = load i64, ptr [[Q8]], align 8 ; SCALABLE-NEXT: [[Y0:%.*]] = add i64 [[X0]], 1 -; SCALABLE-NEXT: store i64 [[Y0]], ptr [[Q0]], align 8 -; SCALABLE-NEXT: [[OFFSET1:%.*]] = add i64 [[OFFSET0]], 1 +; SCALABLE-NEXT: store i64 [[Y0]], ptr [[Q8]], align 8 +; SCALABLE-NEXT: [[OFFSET1:%.*]] = add i64 [[OFFSET8]], 1 ; SCALABLE-NEXT: [[Q1:%.*]] = getelementptr i64, ptr [[P]], i64 [[OFFSET1]] ; SCALABLE-NEXT: [[X1:%.*]] = load i64, ptr [[Q1]], align 8 ; SCALABLE-NEXT: [[Y1:%.*]] = add i64 [[X1]], 2 @@ -1008,9 +1046,9 @@ define void @load_store_factor8(ptr %p) { ; SCALABLE-NEXT: [[X7:%.*]] = load i64, ptr [[Q7]], align 8 ; SCALABLE-NEXT: [[Y7:%.*]] = add i64 [[X7]], 8 ; SCALABLE-NEXT: store i64 [[Y7]], ptr [[Q7]], align 8 -; SCALABLE-NEXT: [[NEXTI]] = add i64 [[I]], 1 -; SCALABLE-NEXT: [[DONE:%.*]] = 
icmp eq i64 [[NEXTI]], 1024 -; SCALABLE-NEXT: br i1 [[DONE]], label [[EXIT]], label [[LOOP]], !llvm.loop [[LOOP11:![0-9]+]] +; SCALABLE-NEXT: [[NEXTI1]] = add i64 [[I1]], 1 +; SCALABLE-NEXT: [[DONE:%.*]] = icmp eq i64 [[NEXTI1]], 1024 +; SCALABLE-NEXT: br i1 [[DONE]], label [[EXIT]], label [[LOOP1]], !llvm.loop [[LOOP11:![0-9]+]] ; SCALABLE: exit: ; SCALABLE-NEXT: ret void ; @@ -1080,7 +1118,7 @@ define void @combine_load_factor2_i32(ptr noalias %p, ptr noalias %q) { ; CHECK-NEXT: [[TMP0:%.*]] = call i64 @llvm.vscale.i64() ; CHECK-NEXT: [[TMP1:%.*]] = mul i64 [[TMP0]], 4 ; CHECK-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 1024, [[TMP1]] -; CHECK-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] +; CHECK-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[ENTRY:%.*]] ; CHECK: vector.ph: ; CHECK-NEXT: [[TMP2:%.*]] = call i64 @llvm.vscale.i64() ; CHECK-NEXT: [[TMP3:%.*]] = mul i64 [[TMP2]], 4 @@ -1088,94 +1126,94 @@ define void @combine_load_factor2_i32(ptr noalias %p, ptr noalias %q) { ; CHECK-NEXT: [[N_VEC:%.*]] = sub i64 1024, [[N_MOD_VF]] ; CHECK-NEXT: [[TMP4:%.*]] = call i64 @llvm.vscale.i64() ; CHECK-NEXT: [[TMP5:%.*]] = mul i64 [[TMP4]], 4 -; CHECK-NEXT: br label [[VECTOR_BODY:%.*]] +; CHECK-NEXT: br label [[LOOP:%.*]] ; CHECK: vector.body: -; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] -; CHECK-NEXT: [[TMP6:%.*]] = add i64 [[INDEX]], 0 -; CHECK-NEXT: [[TMP7:%.*]] = shl i64 [[TMP6]], 1 -; CHECK-NEXT: [[TMP8:%.*]] = getelementptr i32, ptr [[P:%.*]], i64 [[TMP7]] -; CHECK-NEXT: [[WIDE_VEC:%.*]] = load , ptr [[TMP8]], align 4 +; CHECK-NEXT: [[I:%.*]] = phi i64 [ 0, [[ENTRY]] ], [ [[NEXTI:%.*]], [[LOOP]] ] +; CHECK-NEXT: [[TMP6:%.*]] = add i64 [[I]], 0 +; CHECK-NEXT: [[OFFSET0:%.*]] = shl i64 [[TMP6]], 1 +; CHECK-NEXT: [[Q0:%.*]] = getelementptr i32, ptr [[P:%.*]], i64 [[OFFSET0]] +; CHECK-NEXT: [[WIDE_VEC:%.*]] = load , ptr [[Q0]], align 4 ; CHECK-NEXT: 
[[STRIDED_VEC:%.*]] = call { , } @llvm.vector.deinterleave2.nxv8i32( [[WIDE_VEC]]) -; CHECK-NEXT: [[TMP10:%.*]] = extractvalue { , } [[STRIDED_VEC]], 0 -; CHECK-NEXT: [[TMP11:%.*]] = extractvalue { , } [[STRIDED_VEC]], 1 -; CHECK-NEXT: [[TMP12:%.*]] = add [[TMP10]], [[TMP11]] -; CHECK-NEXT: [[TMP13:%.*]] = getelementptr i32, ptr [[Q:%.*]], i64 [[TMP6]] -; CHECK-NEXT: [[TMP14:%.*]] = getelementptr i32, ptr [[TMP13]], i32 0 -; CHECK-NEXT: store [[TMP12]], ptr [[TMP14]], align 4 -; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP5]] -; CHECK-NEXT: [[TMP15:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] -; CHECK-NEXT: br i1 [[TMP15]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP12:![0-9]+]] +; CHECK-NEXT: [[TMP9:%.*]] = extractvalue { , } [[STRIDED_VEC]], 0 +; CHECK-NEXT: [[TMP10:%.*]] = extractvalue { , } [[STRIDED_VEC]], 1 +; CHECK-NEXT: [[TMP11:%.*]] = add [[TMP9]], [[TMP10]] +; CHECK-NEXT: [[TMP12:%.*]] = getelementptr i32, ptr [[Q:%.*]], i64 [[TMP6]] +; CHECK-NEXT: [[TMP13:%.*]] = getelementptr i32, ptr [[TMP12]], i32 0 +; CHECK-NEXT: store [[TMP11]], ptr [[TMP13]], align 4 +; CHECK-NEXT: [[NEXTI]] = add nuw i64 [[I]], [[TMP5]] +; CHECK-NEXT: [[TMP14:%.*]] = icmp eq i64 [[NEXTI]], [[N_VEC]] +; CHECK-NEXT: br i1 [[TMP14]], label [[MIDDLE_BLOCK:%.*]], label [[LOOP]], !llvm.loop [[LOOP12:![0-9]+]] ; CHECK: middle.block: ; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 1024, [[N_VEC]] ; CHECK-NEXT: br i1 [[CMP_N]], label [[EXIT:%.*]], label [[SCALAR_PH]] ; CHECK: scalar.ph: -; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ] -; CHECK-NEXT: br label [[LOOP:%.*]] +; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY1:%.*]] ] +; CHECK-NEXT: br label [[LOOP1:%.*]] ; CHECK: loop: -; CHECK-NEXT: [[I:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[NEXTI:%.*]], [[LOOP]] ] -; CHECK-NEXT: [[OFFSET0:%.*]] = shl i64 [[I]], 1 -; CHECK-NEXT: [[Q0:%.*]] = 
getelementptr i32, ptr [[P]], i64 [[OFFSET0]] -; CHECK-NEXT: [[X0:%.*]] = load i32, ptr [[Q0]], align 4 -; CHECK-NEXT: [[OFFSET1:%.*]] = add i64 [[OFFSET0]], 1 +; CHECK-NEXT: [[I1:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[NEXTI1:%.*]], [[LOOP1]] ] +; CHECK-NEXT: [[OFFSET2:%.*]] = shl i64 [[I1]], 1 +; CHECK-NEXT: [[Q2:%.*]] = getelementptr i32, ptr [[P]], i64 [[OFFSET2]] +; CHECK-NEXT: [[X0:%.*]] = load i32, ptr [[Q2]], align 4 +; CHECK-NEXT: [[OFFSET1:%.*]] = add i64 [[OFFSET2]], 1 ; CHECK-NEXT: [[Q1:%.*]] = getelementptr i32, ptr [[P]], i64 [[OFFSET1]] ; CHECK-NEXT: [[X1:%.*]] = load i32, ptr [[Q1]], align 4 ; CHECK-NEXT: [[RES:%.*]] = add i32 [[X0]], [[X1]] -; CHECK-NEXT: [[DST:%.*]] = getelementptr i32, ptr [[Q]], i64 [[I]] +; CHECK-NEXT: [[DST:%.*]] = getelementptr i32, ptr [[Q]], i64 [[I1]] ; CHECK-NEXT: store i32 [[RES]], ptr [[DST]], align 4 -; CHECK-NEXT: [[NEXTI]] = add i64 [[I]], 1 -; CHECK-NEXT: [[DONE:%.*]] = icmp eq i64 [[NEXTI]], 1024 -; CHECK-NEXT: br i1 [[DONE]], label [[EXIT]], label [[LOOP]], !llvm.loop [[LOOP13:![0-9]+]] +; CHECK-NEXT: [[NEXTI1]] = add i64 [[I1]], 1 +; CHECK-NEXT: [[DONE:%.*]] = icmp eq i64 [[NEXTI1]], 1024 +; CHECK-NEXT: br i1 [[DONE]], label [[EXIT]], label [[LOOP1]], !llvm.loop [[LOOP13:![0-9]+]] ; CHECK: exit: ; CHECK-NEXT: ret void ; ; FIXED-LABEL: @combine_load_factor2_i32( ; FIXED-NEXT: entry: -; FIXED-NEXT: br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] +; FIXED-NEXT: br i1 false, label [[SCALAR_PH:%.*]], label [[ENTRY:%.*]] ; FIXED: vector.ph: -; FIXED-NEXT: br label [[VECTOR_BODY:%.*]] +; FIXED-NEXT: br label [[LOOP:%.*]] ; FIXED: vector.body: -; FIXED-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] -; FIXED-NEXT: [[TMP0:%.*]] = add i64 [[INDEX]], 0 -; FIXED-NEXT: [[TMP1:%.*]] = add i64 [[INDEX]], 8 -; FIXED-NEXT: [[TMP2:%.*]] = shl i64 [[TMP0]], 1 +; FIXED-NEXT: [[I:%.*]] = phi i64 [ 0, [[ENTRY]] ], [ [[NEXTI:%.*]], [[LOOP]] ] +; FIXED-NEXT: 
[[TMP0:%.*]] = add i64 [[I]], 0 +; FIXED-NEXT: [[TMP1:%.*]] = add i64 [[I]], 8 +; FIXED-NEXT: [[OFFSET0:%.*]] = shl i64 [[TMP0]], 1 ; FIXED-NEXT: [[TMP3:%.*]] = shl i64 [[TMP1]], 1 -; FIXED-NEXT: [[TMP4:%.*]] = getelementptr i32, ptr [[P:%.*]], i64 [[TMP2]] +; FIXED-NEXT: [[Q0:%.*]] = getelementptr i32, ptr [[P:%.*]], i64 [[OFFSET0]] ; FIXED-NEXT: [[TMP5:%.*]] = getelementptr i32, ptr [[P]], i64 [[TMP3]] -; FIXED-NEXT: [[WIDE_VEC:%.*]] = load <16 x i32>, ptr [[TMP4]], align 4 +; FIXED-NEXT: [[WIDE_VEC:%.*]] = load <16 x i32>, ptr [[Q0]], align 4 ; FIXED-NEXT: [[STRIDED_VEC:%.*]] = shufflevector <16 x i32> [[WIDE_VEC]], <16 x i32> poison, <8 x i32> -; FIXED-NEXT: [[STRIDED_VEC3:%.*]] = shufflevector <16 x i32> [[WIDE_VEC]], <16 x i32> poison, <8 x i32> -; FIXED-NEXT: [[WIDE_VEC1:%.*]] = load <16 x i32>, ptr [[TMP5]], align 4 -; FIXED-NEXT: [[STRIDED_VEC2:%.*]] = shufflevector <16 x i32> [[WIDE_VEC1]], <16 x i32> poison, <8 x i32> -; FIXED-NEXT: [[STRIDED_VEC4:%.*]] = shufflevector <16 x i32> [[WIDE_VEC1]], <16 x i32> poison, <8 x i32> -; FIXED-NEXT: [[TMP8:%.*]] = add <8 x i32> [[STRIDED_VEC]], [[STRIDED_VEC3]] -; FIXED-NEXT: [[TMP9:%.*]] = add <8 x i32> [[STRIDED_VEC2]], [[STRIDED_VEC4]] -; FIXED-NEXT: [[TMP10:%.*]] = getelementptr i32, ptr [[Q:%.*]], i64 [[TMP0]] -; FIXED-NEXT: [[TMP12:%.*]] = getelementptr i32, ptr [[TMP10]], i32 0 -; FIXED-NEXT: [[TMP13:%.*]] = getelementptr i32, ptr [[TMP10]], i32 8 -; FIXED-NEXT: store <8 x i32> [[TMP8]], ptr [[TMP12]], align 4 -; FIXED-NEXT: store <8 x i32> [[TMP9]], ptr [[TMP13]], align 4 -; FIXED-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 16 -; FIXED-NEXT: [[TMP14:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1024 -; FIXED-NEXT: br i1 [[TMP14]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP12:![0-9]+]] +; FIXED-NEXT: [[STRIDED_VEC1:%.*]] = shufflevector <16 x i32> [[WIDE_VEC]], <16 x i32> poison, <8 x i32> +; FIXED-NEXT: [[WIDE_VEC2:%.*]] = load <16 x i32>, ptr [[TMP5]], align 4 +; FIXED-NEXT: 
[[STRIDED_VEC3:%.*]] = shufflevector <16 x i32> [[WIDE_VEC2]], <16 x i32> poison, <8 x i32> +; FIXED-NEXT: [[STRIDED_VEC4:%.*]] = shufflevector <16 x i32> [[WIDE_VEC2]], <16 x i32> poison, <8 x i32> +; FIXED-NEXT: [[TMP6:%.*]] = add <8 x i32> [[STRIDED_VEC]], [[STRIDED_VEC1]] +; FIXED-NEXT: [[TMP7:%.*]] = add <8 x i32> [[STRIDED_VEC3]], [[STRIDED_VEC4]] +; FIXED-NEXT: [[TMP8:%.*]] = getelementptr i32, ptr [[Q:%.*]], i64 [[TMP0]] +; FIXED-NEXT: [[TMP9:%.*]] = getelementptr i32, ptr [[TMP8]], i32 0 +; FIXED-NEXT: [[TMP10:%.*]] = getelementptr i32, ptr [[TMP8]], i32 8 +; FIXED-NEXT: store <8 x i32> [[TMP6]], ptr [[TMP9]], align 4 +; FIXED-NEXT: store <8 x i32> [[TMP7]], ptr [[TMP10]], align 4 +; FIXED-NEXT: [[NEXTI]] = add nuw i64 [[I]], 16 +; FIXED-NEXT: [[TMP11:%.*]] = icmp eq i64 [[NEXTI]], 1024 +; FIXED-NEXT: br i1 [[TMP11]], label [[MIDDLE_BLOCK:%.*]], label [[LOOP]], !llvm.loop [[LOOP12:![0-9]+]] ; FIXED: middle.block: ; FIXED-NEXT: br i1 true, label [[EXIT:%.*]], label [[SCALAR_PH]] ; FIXED: scalar.ph: -; FIXED-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 1024, [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ] -; FIXED-NEXT: br label [[LOOP:%.*]] +; FIXED-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 1024, [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY1:%.*]] ] +; FIXED-NEXT: br label [[LOOP1:%.*]] ; FIXED: loop: -; FIXED-NEXT: [[I:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[NEXTI:%.*]], [[LOOP]] ] -; FIXED-NEXT: [[OFFSET0:%.*]] = shl i64 [[I]], 1 -; FIXED-NEXT: [[Q0:%.*]] = getelementptr i32, ptr [[P]], i64 [[OFFSET0]] -; FIXED-NEXT: [[X0:%.*]] = load i32, ptr [[Q0]], align 4 -; FIXED-NEXT: [[OFFSET1:%.*]] = add i64 [[OFFSET0]], 1 +; FIXED-NEXT: [[I1:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[NEXTI1:%.*]], [[LOOP1]] ] +; FIXED-NEXT: [[OFFSET2:%.*]] = shl i64 [[I1]], 1 +; FIXED-NEXT: [[Q2:%.*]] = getelementptr i32, ptr [[P]], i64 [[OFFSET2]] +; FIXED-NEXT: [[X0:%.*]] = load i32, ptr [[Q2]], align 4 +; FIXED-NEXT: [[OFFSET1:%.*]] = add i64 [[OFFSET2]], 1 ; 
FIXED-NEXT: [[Q1:%.*]] = getelementptr i32, ptr [[P]], i64 [[OFFSET1]] ; FIXED-NEXT: [[X1:%.*]] = load i32, ptr [[Q1]], align 4 ; FIXED-NEXT: [[RES:%.*]] = add i32 [[X0]], [[X1]] -; FIXED-NEXT: [[DST:%.*]] = getelementptr i32, ptr [[Q]], i64 [[I]] +; FIXED-NEXT: [[DST:%.*]] = getelementptr i32, ptr [[Q]], i64 [[I1]] ; FIXED-NEXT: store i32 [[RES]], ptr [[DST]], align 4 -; FIXED-NEXT: [[NEXTI]] = add i64 [[I]], 1 -; FIXED-NEXT: [[DONE:%.*]] = icmp eq i64 [[NEXTI]], 1024 -; FIXED-NEXT: br i1 [[DONE]], label [[EXIT]], label [[LOOP]], !llvm.loop [[LOOP13:![0-9]+]] +; FIXED-NEXT: [[NEXTI1]] = add i64 [[I1]], 1 +; FIXED-NEXT: [[DONE:%.*]] = icmp eq i64 [[NEXTI1]], 1024 +; FIXED-NEXT: br i1 [[DONE]], label [[EXIT]], label [[LOOP1]], !llvm.loop [[LOOP13:![0-9]+]] ; FIXED: exit: ; FIXED-NEXT: ret void ; @@ -1184,7 +1222,7 @@ define void @combine_load_factor2_i32(ptr noalias %p, ptr noalias %q) { ; SCALABLE-NEXT: [[TMP0:%.*]] = call i64 @llvm.vscale.i64() ; SCALABLE-NEXT: [[TMP1:%.*]] = mul i64 [[TMP0]], 4 ; SCALABLE-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 1024, [[TMP1]] -; SCALABLE-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] +; SCALABLE-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[ENTRY:%.*]] ; SCALABLE: vector.ph: ; SCALABLE-NEXT: [[TMP2:%.*]] = call i64 @llvm.vscale.i64() ; SCALABLE-NEXT: [[TMP3:%.*]] = mul i64 [[TMP2]], 4 @@ -1192,43 +1230,43 @@ define void @combine_load_factor2_i32(ptr noalias %p, ptr noalias %q) { ; SCALABLE-NEXT: [[N_VEC:%.*]] = sub i64 1024, [[N_MOD_VF]] ; SCALABLE-NEXT: [[TMP4:%.*]] = call i64 @llvm.vscale.i64() ; SCALABLE-NEXT: [[TMP5:%.*]] = mul i64 [[TMP4]], 4 -; SCALABLE-NEXT: br label [[VECTOR_BODY:%.*]] +; SCALABLE-NEXT: br label [[LOOP:%.*]] ; SCALABLE: vector.body: -; SCALABLE-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] -; SCALABLE-NEXT: [[TMP6:%.*]] = add i64 [[INDEX]], 0 -; SCALABLE-NEXT: [[TMP7:%.*]] = shl i64 [[TMP6]], 1 -; 
SCALABLE-NEXT: [[TMP8:%.*]] = getelementptr i32, ptr [[P:%.*]], i64 [[TMP7]] -; SCALABLE-NEXT: [[WIDE_VEC:%.*]] = load , ptr [[TMP8]], align 4 +; SCALABLE-NEXT: [[I:%.*]] = phi i64 [ 0, [[ENTRY]] ], [ [[NEXTI:%.*]], [[LOOP]] ] +; SCALABLE-NEXT: [[TMP6:%.*]] = add i64 [[I]], 0 +; SCALABLE-NEXT: [[OFFSET0:%.*]] = shl i64 [[TMP6]], 1 +; SCALABLE-NEXT: [[Q0:%.*]] = getelementptr i32, ptr [[P:%.*]], i64 [[OFFSET0]] +; SCALABLE-NEXT: [[WIDE_VEC:%.*]] = load , ptr [[Q0]], align 4 ; SCALABLE-NEXT: [[STRIDED_VEC:%.*]] = call { , } @llvm.vector.deinterleave2.nxv8i32( [[WIDE_VEC]]) -; SCALABLE-NEXT: [[TMP10:%.*]] = extractvalue { , } [[STRIDED_VEC]], 0 -; SCALABLE-NEXT: [[TMP11:%.*]] = extractvalue { , } [[STRIDED_VEC]], 1 -; SCALABLE-NEXT: [[TMP12:%.*]] = add [[TMP10]], [[TMP11]] -; SCALABLE-NEXT: [[TMP13:%.*]] = getelementptr i32, ptr [[Q:%.*]], i64 [[TMP6]] -; SCALABLE-NEXT: [[TMP14:%.*]] = getelementptr i32, ptr [[TMP13]], i32 0 -; SCALABLE-NEXT: store [[TMP12]], ptr [[TMP14]], align 4 -; SCALABLE-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP5]] -; SCALABLE-NEXT: [[TMP15:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] -; SCALABLE-NEXT: br i1 [[TMP15]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP12:![0-9]+]] +; SCALABLE-NEXT: [[TMP9:%.*]] = extractvalue { , } [[STRIDED_VEC]], 0 +; SCALABLE-NEXT: [[TMP10:%.*]] = extractvalue { , } [[STRIDED_VEC]], 1 +; SCALABLE-NEXT: [[TMP11:%.*]] = add [[TMP9]], [[TMP10]] +; SCALABLE-NEXT: [[TMP12:%.*]] = getelementptr i32, ptr [[Q:%.*]], i64 [[TMP6]] +; SCALABLE-NEXT: [[TMP13:%.*]] = getelementptr i32, ptr [[TMP12]], i32 0 +; SCALABLE-NEXT: store [[TMP11]], ptr [[TMP13]], align 4 +; SCALABLE-NEXT: [[NEXTI]] = add nuw i64 [[I]], [[TMP5]] +; SCALABLE-NEXT: [[TMP14:%.*]] = icmp eq i64 [[NEXTI]], [[N_VEC]] +; SCALABLE-NEXT: br i1 [[TMP14]], label [[MIDDLE_BLOCK:%.*]], label [[LOOP]], !llvm.loop [[LOOP12:![0-9]+]] ; SCALABLE: middle.block: ; SCALABLE-NEXT: [[CMP_N:%.*]] = icmp eq i64 1024, [[N_VEC]] ; 
SCALABLE-NEXT: br i1 [[CMP_N]], label [[EXIT:%.*]], label [[SCALAR_PH]] ; SCALABLE: scalar.ph: -; SCALABLE-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ] -; SCALABLE-NEXT: br label [[LOOP:%.*]] +; SCALABLE-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY1:%.*]] ] +; SCALABLE-NEXT: br label [[LOOP1:%.*]] ; SCALABLE: loop: -; SCALABLE-NEXT: [[I:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[NEXTI:%.*]], [[LOOP]] ] -; SCALABLE-NEXT: [[OFFSET0:%.*]] = shl i64 [[I]], 1 -; SCALABLE-NEXT: [[Q0:%.*]] = getelementptr i32, ptr [[P]], i64 [[OFFSET0]] -; SCALABLE-NEXT: [[X0:%.*]] = load i32, ptr [[Q0]], align 4 -; SCALABLE-NEXT: [[OFFSET1:%.*]] = add i64 [[OFFSET0]], 1 +; SCALABLE-NEXT: [[I1:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[NEXTI1:%.*]], [[LOOP1]] ] +; SCALABLE-NEXT: [[OFFSET2:%.*]] = shl i64 [[I1]], 1 +; SCALABLE-NEXT: [[Q2:%.*]] = getelementptr i32, ptr [[P]], i64 [[OFFSET2]] +; SCALABLE-NEXT: [[X0:%.*]] = load i32, ptr [[Q2]], align 4 +; SCALABLE-NEXT: [[OFFSET1:%.*]] = add i64 [[OFFSET2]], 1 ; SCALABLE-NEXT: [[Q1:%.*]] = getelementptr i32, ptr [[P]], i64 [[OFFSET1]] ; SCALABLE-NEXT: [[X1:%.*]] = load i32, ptr [[Q1]], align 4 ; SCALABLE-NEXT: [[RES:%.*]] = add i32 [[X0]], [[X1]] -; SCALABLE-NEXT: [[DST:%.*]] = getelementptr i32, ptr [[Q]], i64 [[I]] +; SCALABLE-NEXT: [[DST:%.*]] = getelementptr i32, ptr [[Q]], i64 [[I1]] ; SCALABLE-NEXT: store i32 [[RES]], ptr [[DST]], align 4 -; SCALABLE-NEXT: [[NEXTI]] = add i64 [[I]], 1 -; SCALABLE-NEXT: [[DONE:%.*]] = icmp eq i64 [[NEXTI]], 1024 -; SCALABLE-NEXT: br i1 [[DONE]], label [[EXIT]], label [[LOOP]], !llvm.loop [[LOOP13:![0-9]+]] +; SCALABLE-NEXT: [[NEXTI1]] = add i64 [[I1]], 1 +; SCALABLE-NEXT: [[DONE:%.*]] = icmp eq i64 [[NEXTI1]], 1024 +; SCALABLE-NEXT: br i1 [[DONE]], label [[EXIT]], label [[LOOP1]], !llvm.loop [[LOOP13:![0-9]+]] ; SCALABLE: exit: ; SCALABLE-NEXT: ret void ; @@ -1263,7 +1301,7 @@ define void 
@combine_load_factor2_i64(ptr noalias %p, ptr noalias %q) { ; CHECK-NEXT: [[TMP0:%.*]] = call i64 @llvm.vscale.i64() ; CHECK-NEXT: [[TMP1:%.*]] = mul i64 [[TMP0]], 2 ; CHECK-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 1024, [[TMP1]] -; CHECK-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] +; CHECK-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[ENTRY:%.*]] ; CHECK: vector.ph: ; CHECK-NEXT: [[TMP2:%.*]] = call i64 @llvm.vscale.i64() ; CHECK-NEXT: [[TMP3:%.*]] = mul i64 [[TMP2]], 2 @@ -1271,94 +1309,94 @@ define void @combine_load_factor2_i64(ptr noalias %p, ptr noalias %q) { ; CHECK-NEXT: [[N_VEC:%.*]] = sub i64 1024, [[N_MOD_VF]] ; CHECK-NEXT: [[TMP4:%.*]] = call i64 @llvm.vscale.i64() ; CHECK-NEXT: [[TMP5:%.*]] = mul i64 [[TMP4]], 2 -; CHECK-NEXT: br label [[VECTOR_BODY:%.*]] +; CHECK-NEXT: br label [[LOOP:%.*]] ; CHECK: vector.body: -; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] -; CHECK-NEXT: [[TMP6:%.*]] = add i64 [[INDEX]], 0 -; CHECK-NEXT: [[TMP7:%.*]] = shl i64 [[TMP6]], 1 -; CHECK-NEXT: [[TMP8:%.*]] = getelementptr i64, ptr [[P:%.*]], i64 [[TMP7]] -; CHECK-NEXT: [[WIDE_VEC:%.*]] = load , ptr [[TMP8]], align 8 +; CHECK-NEXT: [[I:%.*]] = phi i64 [ 0, [[ENTRY]] ], [ [[NEXTI:%.*]], [[LOOP]] ] +; CHECK-NEXT: [[TMP6:%.*]] = add i64 [[I]], 0 +; CHECK-NEXT: [[OFFSET0:%.*]] = shl i64 [[TMP6]], 1 +; CHECK-NEXT: [[Q0:%.*]] = getelementptr i64, ptr [[P:%.*]], i64 [[OFFSET0]] +; CHECK-NEXT: [[WIDE_VEC:%.*]] = load , ptr [[Q0]], align 8 ; CHECK-NEXT: [[STRIDED_VEC:%.*]] = call { , } @llvm.vector.deinterleave2.nxv4i64( [[WIDE_VEC]]) -; CHECK-NEXT: [[TMP10:%.*]] = extractvalue { , } [[STRIDED_VEC]], 0 -; CHECK-NEXT: [[TMP11:%.*]] = extractvalue { , } [[STRIDED_VEC]], 1 -; CHECK-NEXT: [[TMP12:%.*]] = add [[TMP10]], [[TMP11]] -; CHECK-NEXT: [[TMP13:%.*]] = getelementptr i64, ptr [[Q:%.*]], i64 [[TMP6]] -; CHECK-NEXT: [[TMP14:%.*]] = getelementptr i64, ptr [[TMP13]], i32 
0 -; CHECK-NEXT: store [[TMP12]], ptr [[TMP14]], align 8 -; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP5]] -; CHECK-NEXT: [[TMP15:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] -; CHECK-NEXT: br i1 [[TMP15]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP14:![0-9]+]] +; CHECK-NEXT: [[TMP9:%.*]] = extractvalue { , } [[STRIDED_VEC]], 0 +; CHECK-NEXT: [[TMP10:%.*]] = extractvalue { , } [[STRIDED_VEC]], 1 +; CHECK-NEXT: [[TMP11:%.*]] = add [[TMP9]], [[TMP10]] +; CHECK-NEXT: [[TMP12:%.*]] = getelementptr i64, ptr [[Q:%.*]], i64 [[TMP6]] +; CHECK-NEXT: [[TMP13:%.*]] = getelementptr i64, ptr [[TMP12]], i32 0 +; CHECK-NEXT: store [[TMP11]], ptr [[TMP13]], align 8 +; CHECK-NEXT: [[NEXTI]] = add nuw i64 [[I]], [[TMP5]] +; CHECK-NEXT: [[TMP14:%.*]] = icmp eq i64 [[NEXTI]], [[N_VEC]] +; CHECK-NEXT: br i1 [[TMP14]], label [[MIDDLE_BLOCK:%.*]], label [[LOOP]], !llvm.loop [[LOOP14:![0-9]+]] ; CHECK: middle.block: ; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 1024, [[N_VEC]] ; CHECK-NEXT: br i1 [[CMP_N]], label [[EXIT:%.*]], label [[SCALAR_PH]] ; CHECK: scalar.ph: -; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ] -; CHECK-NEXT: br label [[LOOP:%.*]] +; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY1:%.*]] ] +; CHECK-NEXT: br label [[LOOP1:%.*]] ; CHECK: loop: -; CHECK-NEXT: [[I:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[NEXTI:%.*]], [[LOOP]] ] -; CHECK-NEXT: [[OFFSET0:%.*]] = shl i64 [[I]], 1 -; CHECK-NEXT: [[Q0:%.*]] = getelementptr i64, ptr [[P]], i64 [[OFFSET0]] -; CHECK-NEXT: [[X0:%.*]] = load i64, ptr [[Q0]], align 8 -; CHECK-NEXT: [[OFFSET1:%.*]] = add i64 [[OFFSET0]], 1 +; CHECK-NEXT: [[I1:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[NEXTI1:%.*]], [[LOOP1]] ] +; CHECK-NEXT: [[OFFSET2:%.*]] = shl i64 [[I1]], 1 +; CHECK-NEXT: [[Q2:%.*]] = getelementptr i64, ptr [[P]], i64 [[OFFSET2]] +; CHECK-NEXT: [[X0:%.*]] = load i64, ptr 
[[Q2]], align 8 +; CHECK-NEXT: [[OFFSET1:%.*]] = add i64 [[OFFSET2]], 1 ; CHECK-NEXT: [[Q1:%.*]] = getelementptr i64, ptr [[P]], i64 [[OFFSET1]] ; CHECK-NEXT: [[X1:%.*]] = load i64, ptr [[Q1]], align 8 ; CHECK-NEXT: [[RES:%.*]] = add i64 [[X0]], [[X1]] -; CHECK-NEXT: [[DST:%.*]] = getelementptr i64, ptr [[Q]], i64 [[I]] +; CHECK-NEXT: [[DST:%.*]] = getelementptr i64, ptr [[Q]], i64 [[I1]] ; CHECK-NEXT: store i64 [[RES]], ptr [[DST]], align 8 -; CHECK-NEXT: [[NEXTI]] = add i64 [[I]], 1 -; CHECK-NEXT: [[DONE:%.*]] = icmp eq i64 [[NEXTI]], 1024 -; CHECK-NEXT: br i1 [[DONE]], label [[EXIT]], label [[LOOP]], !llvm.loop [[LOOP15:![0-9]+]] +; CHECK-NEXT: [[NEXTI1]] = add i64 [[I1]], 1 +; CHECK-NEXT: [[DONE:%.*]] = icmp eq i64 [[NEXTI1]], 1024 +; CHECK-NEXT: br i1 [[DONE]], label [[EXIT]], label [[LOOP1]], !llvm.loop [[LOOP15:![0-9]+]] ; CHECK: exit: ; CHECK-NEXT: ret void ; ; FIXED-LABEL: @combine_load_factor2_i64( ; FIXED-NEXT: entry: -; FIXED-NEXT: br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] +; FIXED-NEXT: br i1 false, label [[SCALAR_PH:%.*]], label [[ENTRY:%.*]] ; FIXED: vector.ph: -; FIXED-NEXT: br label [[VECTOR_BODY:%.*]] +; FIXED-NEXT: br label [[LOOP:%.*]] ; FIXED: vector.body: -; FIXED-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] -; FIXED-NEXT: [[TMP0:%.*]] = add i64 [[INDEX]], 0 -; FIXED-NEXT: [[TMP1:%.*]] = add i64 [[INDEX]], 4 -; FIXED-NEXT: [[TMP2:%.*]] = shl i64 [[TMP0]], 1 +; FIXED-NEXT: [[I:%.*]] = phi i64 [ 0, [[ENTRY]] ], [ [[NEXTI:%.*]], [[LOOP]] ] +; FIXED-NEXT: [[TMP0:%.*]] = add i64 [[I]], 0 +; FIXED-NEXT: [[TMP1:%.*]] = add i64 [[I]], 4 +; FIXED-NEXT: [[OFFSET0:%.*]] = shl i64 [[TMP0]], 1 ; FIXED-NEXT: [[TMP3:%.*]] = shl i64 [[TMP1]], 1 -; FIXED-NEXT: [[TMP4:%.*]] = getelementptr i64, ptr [[P:%.*]], i64 [[TMP2]] +; FIXED-NEXT: [[Q0:%.*]] = getelementptr i64, ptr [[P:%.*]], i64 [[OFFSET0]] ; FIXED-NEXT: [[TMP5:%.*]] = getelementptr i64, ptr [[P]], i64 [[TMP3]] -; FIXED-NEXT: 
[[WIDE_VEC:%.*]] = load <8 x i64>, ptr [[TMP4]], align 8 +; FIXED-NEXT: [[WIDE_VEC:%.*]] = load <8 x i64>, ptr [[Q0]], align 8 ; FIXED-NEXT: [[STRIDED_VEC:%.*]] = shufflevector <8 x i64> [[WIDE_VEC]], <8 x i64> poison, <4 x i32> -; FIXED-NEXT: [[STRIDED_VEC3:%.*]] = shufflevector <8 x i64> [[WIDE_VEC]], <8 x i64> poison, <4 x i32> -; FIXED-NEXT: [[WIDE_VEC1:%.*]] = load <8 x i64>, ptr [[TMP5]], align 8 -; FIXED-NEXT: [[STRIDED_VEC2:%.*]] = shufflevector <8 x i64> [[WIDE_VEC1]], <8 x i64> poison, <4 x i32> -; FIXED-NEXT: [[STRIDED_VEC4:%.*]] = shufflevector <8 x i64> [[WIDE_VEC1]], <8 x i64> poison, <4 x i32> -; FIXED-NEXT: [[TMP8:%.*]] = add <4 x i64> [[STRIDED_VEC]], [[STRIDED_VEC3]] -; FIXED-NEXT: [[TMP9:%.*]] = add <4 x i64> [[STRIDED_VEC2]], [[STRIDED_VEC4]] -; FIXED-NEXT: [[TMP10:%.*]] = getelementptr i64, ptr [[Q:%.*]], i64 [[TMP0]] -; FIXED-NEXT: [[TMP12:%.*]] = getelementptr i64, ptr [[TMP10]], i32 0 -; FIXED-NEXT: [[TMP13:%.*]] = getelementptr i64, ptr [[TMP10]], i32 4 -; FIXED-NEXT: store <4 x i64> [[TMP8]], ptr [[TMP12]], align 8 -; FIXED-NEXT: store <4 x i64> [[TMP9]], ptr [[TMP13]], align 8 -; FIXED-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 8 -; FIXED-NEXT: [[TMP14:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1024 -; FIXED-NEXT: br i1 [[TMP14]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP14:![0-9]+]] +; FIXED-NEXT: [[STRIDED_VEC1:%.*]] = shufflevector <8 x i64> [[WIDE_VEC]], <8 x i64> poison, <4 x i32> +; FIXED-NEXT: [[WIDE_VEC2:%.*]] = load <8 x i64>, ptr [[TMP5]], align 8 +; FIXED-NEXT: [[STRIDED_VEC3:%.*]] = shufflevector <8 x i64> [[WIDE_VEC2]], <8 x i64> poison, <4 x i32> +; FIXED-NEXT: [[STRIDED_VEC4:%.*]] = shufflevector <8 x i64> [[WIDE_VEC2]], <8 x i64> poison, <4 x i32> +; FIXED-NEXT: [[TMP6:%.*]] = add <4 x i64> [[STRIDED_VEC]], [[STRIDED_VEC1]] +; FIXED-NEXT: [[TMP7:%.*]] = add <4 x i64> [[STRIDED_VEC3]], [[STRIDED_VEC4]] +; FIXED-NEXT: [[TMP8:%.*]] = getelementptr i64, ptr [[Q:%.*]], i64 [[TMP0]] +; FIXED-NEXT: 
[[TMP9:%.*]] = getelementptr i64, ptr [[TMP8]], i32 0 +; FIXED-NEXT: [[TMP10:%.*]] = getelementptr i64, ptr [[TMP8]], i32 4 +; FIXED-NEXT: store <4 x i64> [[TMP6]], ptr [[TMP9]], align 8 +; FIXED-NEXT: store <4 x i64> [[TMP7]], ptr [[TMP10]], align 8 +; FIXED-NEXT: [[NEXTI]] = add nuw i64 [[I]], 8 +; FIXED-NEXT: [[TMP11:%.*]] = icmp eq i64 [[NEXTI]], 1024 +; FIXED-NEXT: br i1 [[TMP11]], label [[MIDDLE_BLOCK:%.*]], label [[LOOP]], !llvm.loop [[LOOP14:![0-9]+]] ; FIXED: middle.block: ; FIXED-NEXT: br i1 true, label [[EXIT:%.*]], label [[SCALAR_PH]] ; FIXED: scalar.ph: -; FIXED-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 1024, [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ] -; FIXED-NEXT: br label [[LOOP:%.*]] +; FIXED-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 1024, [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY1:%.*]] ] +; FIXED-NEXT: br label [[LOOP1:%.*]] ; FIXED: loop: -; FIXED-NEXT: [[I:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[NEXTI:%.*]], [[LOOP]] ] -; FIXED-NEXT: [[OFFSET0:%.*]] = shl i64 [[I]], 1 -; FIXED-NEXT: [[Q0:%.*]] = getelementptr i64, ptr [[P]], i64 [[OFFSET0]] -; FIXED-NEXT: [[X0:%.*]] = load i64, ptr [[Q0]], align 8 -; FIXED-NEXT: [[OFFSET1:%.*]] = add i64 [[OFFSET0]], 1 +; FIXED-NEXT: [[I1:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[NEXTI1:%.*]], [[LOOP1]] ] +; FIXED-NEXT: [[OFFSET2:%.*]] = shl i64 [[I1]], 1 +; FIXED-NEXT: [[Q2:%.*]] = getelementptr i64, ptr [[P]], i64 [[OFFSET2]] +; FIXED-NEXT: [[X0:%.*]] = load i64, ptr [[Q2]], align 8 +; FIXED-NEXT: [[OFFSET1:%.*]] = add i64 [[OFFSET2]], 1 ; FIXED-NEXT: [[Q1:%.*]] = getelementptr i64, ptr [[P]], i64 [[OFFSET1]] ; FIXED-NEXT: [[X1:%.*]] = load i64, ptr [[Q1]], align 8 ; FIXED-NEXT: [[RES:%.*]] = add i64 [[X0]], [[X1]] -; FIXED-NEXT: [[DST:%.*]] = getelementptr i64, ptr [[Q]], i64 [[I]] +; FIXED-NEXT: [[DST:%.*]] = getelementptr i64, ptr [[Q]], i64 [[I1]] ; FIXED-NEXT: store i64 [[RES]], ptr [[DST]], align 8 -; FIXED-NEXT: [[NEXTI]] = add i64 [[I]], 1 -; FIXED-NEXT: [[DONE:%.*]] = icmp eq i64 
[[NEXTI]], 1024 -; FIXED-NEXT: br i1 [[DONE]], label [[EXIT]], label [[LOOP]], !llvm.loop [[LOOP15:![0-9]+]] +; FIXED-NEXT: [[NEXTI1]] = add i64 [[I1]], 1 +; FIXED-NEXT: [[DONE:%.*]] = icmp eq i64 [[NEXTI1]], 1024 +; FIXED-NEXT: br i1 [[DONE]], label [[EXIT]], label [[LOOP1]], !llvm.loop [[LOOP15:![0-9]+]] ; FIXED: exit: ; FIXED-NEXT: ret void ; @@ -1367,7 +1405,7 @@ define void @combine_load_factor2_i64(ptr noalias %p, ptr noalias %q) { ; SCALABLE-NEXT: [[TMP0:%.*]] = call i64 @llvm.vscale.i64() ; SCALABLE-NEXT: [[TMP1:%.*]] = mul i64 [[TMP0]], 2 ; SCALABLE-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 1024, [[TMP1]] -; SCALABLE-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] +; SCALABLE-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[ENTRY:%.*]] ; SCALABLE: vector.ph: ; SCALABLE-NEXT: [[TMP2:%.*]] = call i64 @llvm.vscale.i64() ; SCALABLE-NEXT: [[TMP3:%.*]] = mul i64 [[TMP2]], 2 @@ -1375,43 +1413,43 @@ define void @combine_load_factor2_i64(ptr noalias %p, ptr noalias %q) { ; SCALABLE-NEXT: [[N_VEC:%.*]] = sub i64 1024, [[N_MOD_VF]] ; SCALABLE-NEXT: [[TMP4:%.*]] = call i64 @llvm.vscale.i64() ; SCALABLE-NEXT: [[TMP5:%.*]] = mul i64 [[TMP4]], 2 -; SCALABLE-NEXT: br label [[VECTOR_BODY:%.*]] +; SCALABLE-NEXT: br label [[LOOP:%.*]] ; SCALABLE: vector.body: -; SCALABLE-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] -; SCALABLE-NEXT: [[TMP6:%.*]] = add i64 [[INDEX]], 0 -; SCALABLE-NEXT: [[TMP7:%.*]] = shl i64 [[TMP6]], 1 -; SCALABLE-NEXT: [[TMP8:%.*]] = getelementptr i64, ptr [[P:%.*]], i64 [[TMP7]] -; SCALABLE-NEXT: [[WIDE_VEC:%.*]] = load , ptr [[TMP8]], align 8 +; SCALABLE-NEXT: [[I:%.*]] = phi i64 [ 0, [[ENTRY]] ], [ [[NEXTI:%.*]], [[LOOP]] ] +; SCALABLE-NEXT: [[TMP6:%.*]] = add i64 [[I]], 0 +; SCALABLE-NEXT: [[OFFSET0:%.*]] = shl i64 [[TMP6]], 1 +; SCALABLE-NEXT: [[Q0:%.*]] = getelementptr i64, ptr [[P:%.*]], i64 [[OFFSET0]] +; SCALABLE-NEXT: [[WIDE_VEC:%.*]] = load , ptr 
[[Q0]], align 8 ; SCALABLE-NEXT: [[STRIDED_VEC:%.*]] = call { , } @llvm.vector.deinterleave2.nxv4i64( [[WIDE_VEC]]) -; SCALABLE-NEXT: [[TMP10:%.*]] = extractvalue { , } [[STRIDED_VEC]], 0 -; SCALABLE-NEXT: [[TMP11:%.*]] = extractvalue { , } [[STRIDED_VEC]], 1 -; SCALABLE-NEXT: [[TMP12:%.*]] = add [[TMP10]], [[TMP11]] -; SCALABLE-NEXT: [[TMP13:%.*]] = getelementptr i64, ptr [[Q:%.*]], i64 [[TMP6]] -; SCALABLE-NEXT: [[TMP14:%.*]] = getelementptr i64, ptr [[TMP13]], i32 0 -; SCALABLE-NEXT: store [[TMP12]], ptr [[TMP14]], align 8 -; SCALABLE-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP5]] -; SCALABLE-NEXT: [[TMP15:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] -; SCALABLE-NEXT: br i1 [[TMP15]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP14:![0-9]+]] +; SCALABLE-NEXT: [[TMP9:%.*]] = extractvalue { , } [[STRIDED_VEC]], 0 +; SCALABLE-NEXT: [[TMP10:%.*]] = extractvalue { , } [[STRIDED_VEC]], 1 +; SCALABLE-NEXT: [[TMP11:%.*]] = add [[TMP9]], [[TMP10]] +; SCALABLE-NEXT: [[TMP12:%.*]] = getelementptr i64, ptr [[Q:%.*]], i64 [[TMP6]] +; SCALABLE-NEXT: [[TMP13:%.*]] = getelementptr i64, ptr [[TMP12]], i32 0 +; SCALABLE-NEXT: store [[TMP11]], ptr [[TMP13]], align 8 +; SCALABLE-NEXT: [[NEXTI]] = add nuw i64 [[I]], [[TMP5]] +; SCALABLE-NEXT: [[TMP14:%.*]] = icmp eq i64 [[NEXTI]], [[N_VEC]] +; SCALABLE-NEXT: br i1 [[TMP14]], label [[MIDDLE_BLOCK:%.*]], label [[LOOP]], !llvm.loop [[LOOP14:![0-9]+]] ; SCALABLE: middle.block: ; SCALABLE-NEXT: [[CMP_N:%.*]] = icmp eq i64 1024, [[N_VEC]] ; SCALABLE-NEXT: br i1 [[CMP_N]], label [[EXIT:%.*]], label [[SCALAR_PH]] ; SCALABLE: scalar.ph: -; SCALABLE-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ] -; SCALABLE-NEXT: br label [[LOOP:%.*]] +; SCALABLE-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY1:%.*]] ] +; SCALABLE-NEXT: br label [[LOOP1:%.*]] ; SCALABLE: loop: -; SCALABLE-NEXT: [[I:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] 
], [ [[NEXTI:%.*]], [[LOOP]] ] -; SCALABLE-NEXT: [[OFFSET0:%.*]] = shl i64 [[I]], 1 -; SCALABLE-NEXT: [[Q0:%.*]] = getelementptr i64, ptr [[P]], i64 [[OFFSET0]] -; SCALABLE-NEXT: [[X0:%.*]] = load i64, ptr [[Q0]], align 8 -; SCALABLE-NEXT: [[OFFSET1:%.*]] = add i64 [[OFFSET0]], 1 +; SCALABLE-NEXT: [[I1:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[NEXTI1:%.*]], [[LOOP1]] ] +; SCALABLE-NEXT: [[OFFSET2:%.*]] = shl i64 [[I1]], 1 +; SCALABLE-NEXT: [[Q2:%.*]] = getelementptr i64, ptr [[P]], i64 [[OFFSET2]] +; SCALABLE-NEXT: [[X0:%.*]] = load i64, ptr [[Q2]], align 8 +; SCALABLE-NEXT: [[OFFSET1:%.*]] = add i64 [[OFFSET2]], 1 ; SCALABLE-NEXT: [[Q1:%.*]] = getelementptr i64, ptr [[P]], i64 [[OFFSET1]] ; SCALABLE-NEXT: [[X1:%.*]] = load i64, ptr [[Q1]], align 8 ; SCALABLE-NEXT: [[RES:%.*]] = add i64 [[X0]], [[X1]] -; SCALABLE-NEXT: [[DST:%.*]] = getelementptr i64, ptr [[Q]], i64 [[I]] +; SCALABLE-NEXT: [[DST:%.*]] = getelementptr i64, ptr [[Q]], i64 [[I1]] ; SCALABLE-NEXT: store i64 [[RES]], ptr [[DST]], align 8 -; SCALABLE-NEXT: [[NEXTI]] = add i64 [[I]], 1 -; SCALABLE-NEXT: [[DONE:%.*]] = icmp eq i64 [[NEXTI]], 1024 -; SCALABLE-NEXT: br i1 [[DONE]], label [[EXIT]], label [[LOOP]], !llvm.loop [[LOOP15:![0-9]+]] +; SCALABLE-NEXT: [[NEXTI1]] = add i64 [[I1]], 1 +; SCALABLE-NEXT: [[DONE:%.*]] = icmp eq i64 [[NEXTI1]], 1024 +; SCALABLE-NEXT: br i1 [[DONE]], label [[EXIT]], label [[LOOP1]], !llvm.loop [[LOOP15:![0-9]+]] ; SCALABLE: exit: ; SCALABLE-NEXT: ret void ; diff --git a/llvm/test/Transforms/LoopVectorize/RISCV/vectorize-force-tail-with-evl-fixed-order-recurrence.ll b/llvm/test/Transforms/LoopVectorize/RISCV/vectorize-force-tail-with-evl-fixed-order-recurrence.ll index 9f8cf169c0593..809b69900731a 100644 --- a/llvm/test/Transforms/LoopVectorize/RISCV/vectorize-force-tail-with-evl-fixed-order-recurrence.ll +++ b/llvm/test/Transforms/LoopVectorize/RISCV/vectorize-force-tail-with-evl-fixed-order-recurrence.ll @@ -11,6 +11,10 @@ ; RUN: 
-prefer-predicate-over-epilogue=predicate-else-scalar-epilogue \ ; RUN: -mtriple=riscv64 -mattr=+v,+f -S < %s| FileCheck %s --check-prefix=NO-VP +; FIXME: Fixed-order recurrence is not supported yet with EVL tail folding. +; The llvm.splice may cause unexpected behavior if the EVL of the +; second-to-last iteration is not VF*UF. + define void @first_order_recurrence(ptr noalias %A, ptr noalias %B, i64 %TC) { ; IF-EVL-LABEL: define void @first_order_recurrence( ; IF-EVL-SAME: ptr noalias [[A:%.*]], ptr noalias [[B:%.*]], i64 [[TC:%.*]]) #[[ATTR0:[0-9]+]] { @@ -27,31 +31,35 @@ define void @first_order_recurrence(ptr noalias %A, ptr noalias %B, i64 %TC) { ; IF-EVL-NEXT: [[N_RND_UP:%.*]] = add i64 [[TC]], [[TMP6]] ; IF-EVL-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N_RND_UP]], [[TMP5]] ; IF-EVL-NEXT: [[N_VEC:%.*]] = sub i64 [[N_RND_UP]], [[N_MOD_VF]] +; IF-EVL-NEXT: [[TRIP_COUNT_MINUS_1:%.*]] = sub i64 [[TC]], 1 ; IF-EVL-NEXT: [[TMP7:%.*]] = call i64 @llvm.vscale.i64() ; IF-EVL-NEXT: [[TMP8:%.*]] = mul i64 [[TMP7]], 4 ; IF-EVL-NEXT: [[TMP9:%.*]] = call i32 @llvm.vscale.i32() ; IF-EVL-NEXT: [[TMP10:%.*]] = mul i32 [[TMP9]], 4 ; IF-EVL-NEXT: [[TMP11:%.*]] = sub i32 [[TMP10]], 1 ; IF-EVL-NEXT: [[VECTOR_RECUR_INIT:%.*]] = insertelement poison, i32 33, i32 [[TMP11]] +; IF-EVL-NEXT: [[BROADCAST_SPLATINSERT1:%.*]] = insertelement poison, i64 [[TRIP_COUNT_MINUS_1]], i64 0 +; IF-EVL-NEXT: [[BROADCAST_SPLAT2:%.*]] = shufflevector [[BROADCAST_SPLATINSERT1]], poison, zeroinitializer ; IF-EVL-NEXT: br label %[[VECTOR_BODY:.*]] ; IF-EVL: [[VECTOR_BODY]]: -; IF-EVL-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ] -; IF-EVL-NEXT: [[EVL_BASED_IV:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_EVL_NEXT:%.*]], %[[VECTOR_BODY]] ] +; IF-EVL-NEXT: [[EVL_BASED_IV:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ] ; IF-EVL-NEXT: [[VECTOR_RECUR:%.*]] = phi [ [[VECTOR_RECUR_INIT]], %[[VECTOR_PH]] ], [ [[VP_OP_LOAD:%.*]], 
%[[VECTOR_BODY]] ] -; IF-EVL-NEXT: [[AVL:%.*]] = sub i64 [[TC]], [[EVL_BASED_IV]] -; IF-EVL-NEXT: [[TMP12:%.*]] = call i32 @llvm.experimental.get.vector.length.i64(i64 [[AVL]], i32 4, i1 true) ; IF-EVL-NEXT: [[TMP13:%.*]] = add i64 [[EVL_BASED_IV]], 0 +; IF-EVL-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement poison, i64 [[EVL_BASED_IV]], i64 0 +; IF-EVL-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector [[BROADCAST_SPLATINSERT]], poison, zeroinitializer +; IF-EVL-NEXT: [[TMP25:%.*]] = call @llvm.stepvector.nxv4i64() +; IF-EVL-NEXT: [[TMP26:%.*]] = add zeroinitializer, [[TMP25]] +; IF-EVL-NEXT: [[VEC_IV:%.*]] = add [[BROADCAST_SPLAT]], [[TMP26]] +; IF-EVL-NEXT: [[TMP27:%.*]] = icmp ule [[VEC_IV]], [[BROADCAST_SPLAT2]] ; IF-EVL-NEXT: [[TMP14:%.*]] = getelementptr inbounds nuw i32, ptr [[A]], i64 [[TMP13]] ; IF-EVL-NEXT: [[TMP15:%.*]] = getelementptr inbounds nuw i32, ptr [[TMP14]], i32 0 -; IF-EVL-NEXT: [[VP_OP_LOAD]] = call @llvm.vp.load.nxv4i32.p0(ptr align 4 [[TMP15]], splat (i1 true), i32 [[TMP12]]) +; IF-EVL-NEXT: [[VP_OP_LOAD]] = call @llvm.masked.load.nxv4i32.p0(ptr [[TMP15]], i32 4, [[TMP27]], poison) ; IF-EVL-NEXT: [[TMP16:%.*]] = call @llvm.vector.splice.nxv4i32( [[VECTOR_RECUR]], [[VP_OP_LOAD]], i32 -1) -; IF-EVL-NEXT: [[VP_OP:%.*]] = call @llvm.vp.add.nxv4i32( [[TMP16]], [[VP_OP_LOAD]], splat (i1 true), i32 [[TMP12]]) +; IF-EVL-NEXT: [[TMP19:%.*]] = add nsw [[TMP16]], [[VP_OP_LOAD]] ; IF-EVL-NEXT: [[TMP17:%.*]] = getelementptr inbounds nuw i32, ptr [[B]], i64 [[TMP13]] ; IF-EVL-NEXT: [[TMP18:%.*]] = getelementptr inbounds nuw i32, ptr [[TMP17]], i32 0 -; IF-EVL-NEXT: call void @llvm.vp.store.nxv4i32.p0( [[VP_OP]], ptr align 4 [[TMP18]], splat (i1 true), i32 [[TMP12]]) -; IF-EVL-NEXT: [[TMP19:%.*]] = zext i32 [[TMP12]] to i64 -; IF-EVL-NEXT: [[INDEX_EVL_NEXT]] = add i64 [[TMP19]], [[EVL_BASED_IV]] -; IF-EVL-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], [[TMP8]] +; IF-EVL-NEXT: call void @llvm.masked.store.nxv4i32.p0( [[TMP19]], ptr [[TMP18]], i32 4, [[TMP27]]) 
+; IF-EVL-NEXT: [[INDEX_NEXT]] = add i64 [[EVL_BASED_IV]], [[TMP8]] ; IF-EVL-NEXT: [[TMP20:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] ; IF-EVL-NEXT: br i1 [[TMP20]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]] ; IF-EVL: [[MIDDLE_BLOCK]]: @@ -172,6 +180,7 @@ define void @second_order_recurrence(ptr noalias %A, ptr noalias %B, i64 %TC) { ; IF-EVL-NEXT: [[N_RND_UP:%.*]] = add i64 [[TC]], [[TMP6]] ; IF-EVL-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N_RND_UP]], [[TMP5]] ; IF-EVL-NEXT: [[N_VEC:%.*]] = sub i64 [[N_RND_UP]], [[N_MOD_VF]] +; IF-EVL-NEXT: [[TRIP_COUNT_MINUS_1:%.*]] = sub i64 [[TC]], 1 ; IF-EVL-NEXT: [[TMP7:%.*]] = call i64 @llvm.vscale.i64() ; IF-EVL-NEXT: [[TMP8:%.*]] = mul i64 [[TMP7]], 4 ; IF-EVL-NEXT: [[TMP9:%.*]] = call i32 @llvm.vscale.i32() @@ -182,27 +191,30 @@ define void @second_order_recurrence(ptr noalias %A, ptr noalias %B, i64 %TC) { ; IF-EVL-NEXT: [[TMP13:%.*]] = mul i32 [[TMP12]], 4 ; IF-EVL-NEXT: [[TMP14:%.*]] = sub i32 [[TMP13]], 1 ; IF-EVL-NEXT: [[VECTOR_RECUR_INIT1:%.*]] = insertelement poison, i32 22, i32 [[TMP14]] +; IF-EVL-NEXT: [[BROADCAST_SPLATINSERT3:%.*]] = insertelement poison, i64 [[TRIP_COUNT_MINUS_1]], i64 0 +; IF-EVL-NEXT: [[BROADCAST_SPLAT4:%.*]] = shufflevector [[BROADCAST_SPLATINSERT3]], poison, zeroinitializer ; IF-EVL-NEXT: br label %[[VECTOR_BODY:.*]] ; IF-EVL: [[VECTOR_BODY]]: -; IF-EVL-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ] -; IF-EVL-NEXT: [[EVL_BASED_IV:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_EVL_NEXT:%.*]], %[[VECTOR_BODY]] ] +; IF-EVL-NEXT: [[EVL_BASED_IV:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ] ; IF-EVL-NEXT: [[VECTOR_RECUR:%.*]] = phi [ [[VECTOR_RECUR_INIT]], %[[VECTOR_PH]] ], [ [[VP_OP_LOAD:%.*]], %[[VECTOR_BODY]] ] ; IF-EVL-NEXT: [[VECTOR_RECUR2:%.*]] = phi [ [[VECTOR_RECUR_INIT1]], %[[VECTOR_PH]] ], [ [[TMP19:%.*]], %[[VECTOR_BODY]] ] -; IF-EVL-NEXT: [[AVL:%.*]] = sub i64 
[[TC]], [[EVL_BASED_IV]] -; IF-EVL-NEXT: [[TMP15:%.*]] = call i32 @llvm.experimental.get.vector.length.i64(i64 [[AVL]], i32 4, i1 true) ; IF-EVL-NEXT: [[TMP16:%.*]] = add i64 [[EVL_BASED_IV]], 0 +; IF-EVL-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement poison, i64 [[EVL_BASED_IV]], i64 0 +; IF-EVL-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector [[BROADCAST_SPLATINSERT]], poison, zeroinitializer +; IF-EVL-NEXT: [[TMP32:%.*]] = call @llvm.stepvector.nxv4i64() +; IF-EVL-NEXT: [[TMP33:%.*]] = add zeroinitializer, [[TMP32]] +; IF-EVL-NEXT: [[VEC_IV:%.*]] = add [[BROADCAST_SPLAT]], [[TMP33]] +; IF-EVL-NEXT: [[TMP34:%.*]] = icmp ule [[VEC_IV]], [[BROADCAST_SPLAT4]] ; IF-EVL-NEXT: [[TMP17:%.*]] = getelementptr inbounds nuw i32, ptr [[A]], i64 [[TMP16]] ; IF-EVL-NEXT: [[TMP18:%.*]] = getelementptr inbounds nuw i32, ptr [[TMP17]], i32 0 -; IF-EVL-NEXT: [[VP_OP_LOAD]] = call @llvm.vp.load.nxv4i32.p0(ptr align 4 [[TMP18]], splat (i1 true), i32 [[TMP15]]) +; IF-EVL-NEXT: [[VP_OP_LOAD]] = call @llvm.masked.load.nxv4i32.p0(ptr [[TMP18]], i32 4, [[TMP34]], poison) ; IF-EVL-NEXT: [[TMP19]] = call @llvm.vector.splice.nxv4i32( [[VECTOR_RECUR]], [[VP_OP_LOAD]], i32 -1) ; IF-EVL-NEXT: [[TMP20:%.*]] = call @llvm.vector.splice.nxv4i32( [[VECTOR_RECUR2]], [[TMP19]], i32 -1) -; IF-EVL-NEXT: [[VP_OP:%.*]] = call @llvm.vp.add.nxv4i32( [[TMP19]], [[TMP20]], splat (i1 true), i32 [[TMP15]]) +; IF-EVL-NEXT: [[TMP23:%.*]] = add nsw [[TMP19]], [[TMP20]] ; IF-EVL-NEXT: [[TMP21:%.*]] = getelementptr inbounds nuw i32, ptr [[B]], i64 [[TMP16]] ; IF-EVL-NEXT: [[TMP22:%.*]] = getelementptr inbounds nuw i32, ptr [[TMP21]], i32 0 -; IF-EVL-NEXT: call void @llvm.vp.store.nxv4i32.p0( [[VP_OP]], ptr align 4 [[TMP22]], splat (i1 true), i32 [[TMP15]]) -; IF-EVL-NEXT: [[TMP23:%.*]] = zext i32 [[TMP15]] to i64 -; IF-EVL-NEXT: [[INDEX_EVL_NEXT]] = add i64 [[TMP23]], [[EVL_BASED_IV]] -; IF-EVL-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], [[TMP8]] +; IF-EVL-NEXT: call void @llvm.masked.store.nxv4i32.p0( [[TMP23]], 
ptr [[TMP22]], i32 4, [[TMP34]]) +; IF-EVL-NEXT: [[INDEX_NEXT]] = add i64 [[EVL_BASED_IV]], [[TMP8]] ; IF-EVL-NEXT: [[TMP24:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] ; IF-EVL-NEXT: br i1 [[TMP24]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]] ; IF-EVL: [[MIDDLE_BLOCK]]: @@ -218,12 +230,12 @@ define void @second_order_recurrence(ptr noalias %A, ptr noalias %B, i64 %TC) { ; IF-EVL: [[SCALAR_PH]]: ; IF-EVL-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], %[[MIDDLE_BLOCK]] ], [ 0, %[[ENTRY]] ] ; IF-EVL-NEXT: [[SCALAR_RECUR_INIT:%.*]] = phi i32 [ [[VECTOR_RECUR_EXTRACT]], %[[MIDDLE_BLOCK]] ], [ 33, %[[ENTRY]] ] -; IF-EVL-NEXT: [[SCALAR_RECUR_INIT4:%.*]] = phi i32 [ [[VECTOR_RECUR_EXTRACT3]], %[[MIDDLE_BLOCK]] ], [ 22, %[[ENTRY]] ] +; IF-EVL-NEXT: [[SCALAR_RECUR_INIT6:%.*]] = phi i32 [ [[VECTOR_RECUR_EXTRACT3]], %[[MIDDLE_BLOCK]] ], [ 22, %[[ENTRY]] ] ; IF-EVL-NEXT: br label %[[FOR_BODY:.*]] ; IF-EVL: [[FOR_BODY]]: ; IF-EVL-NEXT: [[INDVARS:%.*]] = phi i64 [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ], [ [[INDVARS_NEXT:%.*]], %[[FOR_BODY]] ] ; IF-EVL-NEXT: [[FOR1:%.*]] = phi i32 [ [[SCALAR_RECUR_INIT]], %[[SCALAR_PH]] ], [ [[TMP31:%.*]], %[[FOR_BODY]] ] -; IF-EVL-NEXT: [[FOR2:%.*]] = phi i32 [ [[SCALAR_RECUR_INIT4]], %[[SCALAR_PH]] ], [ [[FOR1]], %[[FOR_BODY]] ] +; IF-EVL-NEXT: [[FOR2:%.*]] = phi i32 [ [[SCALAR_RECUR_INIT6]], %[[SCALAR_PH]] ], [ [[FOR1]], %[[FOR_BODY]] ] ; IF-EVL-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds nuw i32, ptr [[A]], i64 [[INDVARS]] ; IF-EVL-NEXT: [[TMP31]] = load i32, ptr [[ARRAYIDX]], align 4 ; IF-EVL-NEXT: [[ADD:%.*]] = add nsw i32 [[FOR1]], [[FOR2]] @@ -342,6 +354,7 @@ define void @third_order_recurrence(ptr noalias %A, ptr noalias %B, i64 %TC) { ; IF-EVL-NEXT: [[N_RND_UP:%.*]] = add i64 [[TC]], [[TMP6]] ; IF-EVL-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N_RND_UP]], [[TMP5]] ; IF-EVL-NEXT: [[N_VEC:%.*]] = sub i64 [[N_RND_UP]], [[N_MOD_VF]] +; IF-EVL-NEXT: [[TRIP_COUNT_MINUS_1:%.*]] = sub i64 [[TC]], 1 ; 
IF-EVL-NEXT: [[TMP7:%.*]] = call i64 @llvm.vscale.i64() ; IF-EVL-NEXT: [[TMP8:%.*]] = mul i64 [[TMP7]], 4 ; IF-EVL-NEXT: [[TMP9:%.*]] = call i32 @llvm.vscale.i32() @@ -356,30 +369,33 @@ define void @third_order_recurrence(ptr noalias %A, ptr noalias %B, i64 %TC) { ; IF-EVL-NEXT: [[TMP16:%.*]] = mul i32 [[TMP15]], 4 ; IF-EVL-NEXT: [[TMP17:%.*]] = sub i32 [[TMP16]], 1 ; IF-EVL-NEXT: [[VECTOR_RECUR_INIT3:%.*]] = insertelement poison, i32 11, i32 [[TMP17]] +; IF-EVL-NEXT: [[BROADCAST_SPLATINSERT5:%.*]] = insertelement poison, i64 [[TRIP_COUNT_MINUS_1]], i64 0 +; IF-EVL-NEXT: [[BROADCAST_SPLAT6:%.*]] = shufflevector [[BROADCAST_SPLATINSERT5]], poison, zeroinitializer ; IF-EVL-NEXT: br label %[[VECTOR_BODY:.*]] ; IF-EVL: [[VECTOR_BODY]]: -; IF-EVL-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ] -; IF-EVL-NEXT: [[EVL_BASED_IV:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_EVL_NEXT:%.*]], %[[VECTOR_BODY]] ] +; IF-EVL-NEXT: [[EVL_BASED_IV:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ] ; IF-EVL-NEXT: [[VECTOR_RECUR:%.*]] = phi [ [[VECTOR_RECUR_INIT]], %[[VECTOR_PH]] ], [ [[VP_OP_LOAD:%.*]], %[[VECTOR_BODY]] ] ; IF-EVL-NEXT: [[VECTOR_RECUR2:%.*]] = phi [ [[VECTOR_RECUR_INIT1]], %[[VECTOR_PH]] ], [ [[TMP22:%.*]], %[[VECTOR_BODY]] ] ; IF-EVL-NEXT: [[VECTOR_RECUR4:%.*]] = phi [ [[VECTOR_RECUR_INIT3]], %[[VECTOR_PH]] ], [ [[TMP23:%.*]], %[[VECTOR_BODY]] ] -; IF-EVL-NEXT: [[AVL:%.*]] = sub i64 [[TC]], [[EVL_BASED_IV]] -; IF-EVL-NEXT: [[TMP18:%.*]] = call i32 @llvm.experimental.get.vector.length.i64(i64 [[AVL]], i32 4, i1 true) ; IF-EVL-NEXT: [[TMP19:%.*]] = add i64 [[EVL_BASED_IV]], 0 +; IF-EVL-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement poison, i64 [[EVL_BASED_IV]], i64 0 +; IF-EVL-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector [[BROADCAST_SPLATINSERT]], poison, zeroinitializer +; IF-EVL-NEXT: [[TMP39:%.*]] = call @llvm.stepvector.nxv4i64() +; IF-EVL-NEXT: [[TMP40:%.*]] = add zeroinitializer, 
[[TMP39]] +; IF-EVL-NEXT: [[VEC_IV:%.*]] = add [[BROADCAST_SPLAT]], [[TMP40]] +; IF-EVL-NEXT: [[TMP41:%.*]] = icmp ule [[VEC_IV]], [[BROADCAST_SPLAT6]] ; IF-EVL-NEXT: [[TMP20:%.*]] = getelementptr inbounds nuw i32, ptr [[A]], i64 [[TMP19]] ; IF-EVL-NEXT: [[TMP21:%.*]] = getelementptr inbounds nuw i32, ptr [[TMP20]], i32 0 -; IF-EVL-NEXT: [[VP_OP_LOAD]] = call @llvm.vp.load.nxv4i32.p0(ptr align 4 [[TMP21]], splat (i1 true), i32 [[TMP18]]) +; IF-EVL-NEXT: [[VP_OP_LOAD]] = call @llvm.masked.load.nxv4i32.p0(ptr [[TMP21]], i32 4, [[TMP41]], poison) ; IF-EVL-NEXT: [[TMP22]] = call @llvm.vector.splice.nxv4i32( [[VECTOR_RECUR]], [[VP_OP_LOAD]], i32 -1) ; IF-EVL-NEXT: [[TMP23]] = call @llvm.vector.splice.nxv4i32( [[VECTOR_RECUR2]], [[TMP22]], i32 -1) ; IF-EVL-NEXT: [[TMP24:%.*]] = call @llvm.vector.splice.nxv4i32( [[VECTOR_RECUR4]], [[TMP23]], i32 -1) -; IF-EVL-NEXT: [[VP_OP:%.*]] = call @llvm.vp.add.nxv4i32( [[TMP23]], [[TMP24]], splat (i1 true), i32 [[TMP18]]) -; IF-EVL-NEXT: [[VP_OP5:%.*]] = call @llvm.vp.add.nxv4i32( [[VP_OP]], [[TMP22]], splat (i1 true), i32 [[TMP18]]) +; IF-EVL-NEXT: [[TMP27:%.*]] = add nsw [[TMP23]], [[TMP24]] +; IF-EVL-NEXT: [[TMP42:%.*]] = add [[TMP27]], [[TMP22]] ; IF-EVL-NEXT: [[TMP25:%.*]] = getelementptr inbounds nuw i32, ptr [[B]], i64 [[TMP19]] ; IF-EVL-NEXT: [[TMP26:%.*]] = getelementptr inbounds nuw i32, ptr [[TMP25]], i32 0 -; IF-EVL-NEXT: call void @llvm.vp.store.nxv4i32.p0( [[VP_OP5]], ptr align 4 [[TMP26]], splat (i1 true), i32 [[TMP18]]) -; IF-EVL-NEXT: [[TMP27:%.*]] = zext i32 [[TMP18]] to i64 -; IF-EVL-NEXT: [[INDEX_EVL_NEXT]] = add i64 [[TMP27]], [[EVL_BASED_IV]] -; IF-EVL-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], [[TMP8]] +; IF-EVL-NEXT: call void @llvm.masked.store.nxv4i32.p0( [[TMP42]], ptr [[TMP26]], i32 4, [[TMP41]]) +; IF-EVL-NEXT: [[INDEX_NEXT]] = add i64 [[EVL_BASED_IV]], [[TMP8]] ; IF-EVL-NEXT: [[TMP28:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] ; IF-EVL-NEXT: br i1 [[TMP28]], label %[[MIDDLE_BLOCK:.*]], label 
%[[VECTOR_BODY]], !llvm.loop [[LOOP6:![0-9]+]] ; IF-EVL: [[MIDDLE_BLOCK]]: @@ -399,14 +415,14 @@ define void @third_order_recurrence(ptr noalias %A, ptr noalias %B, i64 %TC) { ; IF-EVL: [[SCALAR_PH]]: ; IF-EVL-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], %[[MIDDLE_BLOCK]] ], [ 0, %[[ENTRY]] ] ; IF-EVL-NEXT: [[SCALAR_RECUR_INIT:%.*]] = phi i32 [ [[VECTOR_RECUR_EXTRACT]], %[[MIDDLE_BLOCK]] ], [ 33, %[[ENTRY]] ] -; IF-EVL-NEXT: [[SCALAR_RECUR_INIT8:%.*]] = phi i32 [ [[VECTOR_RECUR_EXTRACT6]], %[[MIDDLE_BLOCK]] ], [ 22, %[[ENTRY]] ] -; IF-EVL-NEXT: [[SCALAR_RECUR_INIT9:%.*]] = phi i32 [ [[VECTOR_RECUR_EXTRACT7]], %[[MIDDLE_BLOCK]] ], [ 11, %[[ENTRY]] ] +; IF-EVL-NEXT: [[SCALAR_RECUR_INIT9:%.*]] = phi i32 [ [[VECTOR_RECUR_EXTRACT6]], %[[MIDDLE_BLOCK]] ], [ 22, %[[ENTRY]] ] +; IF-EVL-NEXT: [[SCALAR_RECUR_INIT10:%.*]] = phi i32 [ [[VECTOR_RECUR_EXTRACT7]], %[[MIDDLE_BLOCK]] ], [ 11, %[[ENTRY]] ] ; IF-EVL-NEXT: br label %[[FOR_BODY:.*]] ; IF-EVL: [[FOR_BODY]]: ; IF-EVL-NEXT: [[INDVARS:%.*]] = phi i64 [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ], [ [[INDVARS_NEXT:%.*]], %[[FOR_BODY]] ] ; IF-EVL-NEXT: [[FOR1:%.*]] = phi i32 [ [[SCALAR_RECUR_INIT]], %[[SCALAR_PH]] ], [ [[TMP38:%.*]], %[[FOR_BODY]] ] -; IF-EVL-NEXT: [[FOR2:%.*]] = phi i32 [ [[SCALAR_RECUR_INIT8]], %[[SCALAR_PH]] ], [ [[FOR1]], %[[FOR_BODY]] ] -; IF-EVL-NEXT: [[FOR3:%.*]] = phi i32 [ [[SCALAR_RECUR_INIT9]], %[[SCALAR_PH]] ], [ [[FOR2]], %[[FOR_BODY]] ] +; IF-EVL-NEXT: [[FOR2:%.*]] = phi i32 [ [[SCALAR_RECUR_INIT9]], %[[SCALAR_PH]] ], [ [[FOR1]], %[[FOR_BODY]] ] +; IF-EVL-NEXT: [[FOR3:%.*]] = phi i32 [ [[SCALAR_RECUR_INIT10]], %[[SCALAR_PH]] ], [ [[FOR2]], %[[FOR_BODY]] ] ; IF-EVL-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds nuw i32, ptr [[A]], i64 [[INDVARS]] ; IF-EVL-NEXT: [[TMP38]] = load i32, ptr [[ARRAYIDX]], align 4 ; IF-EVL-NEXT: [[ADD:%.*]] = add nsw i32 [[FOR2]], [[FOR3]] diff --git a/llvm/test/Transforms/LoopVectorize/dereferenceable-info-from-assumption-constant-size.ll 
b/llvm/test/Transforms/LoopVectorize/dereferenceable-info-from-assumption-constant-size.ll index 815e6bce52c0a..90671689f1dce 100644 --- a/llvm/test/Transforms/LoopVectorize/dereferenceable-info-from-assumption-constant-size.ll +++ b/llvm/test/Transforms/LoopVectorize/dereferenceable-info-from-assumption-constant-size.ll @@ -1,11 +1,11 @@ ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 5 -; RUN: opt -p loop-vectorize -force-vector-width=2 -S %s | FileCheck %s +; RUN: opt -p loop-vectorize -force-vector-width=2 -use-dereferenceable-at-point-semantics -S %s | FileCheck %s declare void @llvm.assume(i1) -define void @deref_assumption_in_header_constant_trip_count(ptr noalias %a, ptr noalias %b, ptr noalias %c) { +define void @deref_assumption_in_header_constant_trip_count(ptr noalias %a, ptr noalias %b, ptr noalias %c) nofree nosync{ ; CHECK-LABEL: define void @deref_assumption_in_header_constant_trip_count( -; CHECK-SAME: ptr noalias [[A:%.*]], ptr noalias [[B:%.*]], ptr noalias [[C:%.*]]) { +; CHECK-SAME: ptr noalias [[A:%.*]], ptr noalias [[B:%.*]], ptr noalias [[C:%.*]]) #[[ATTR1:[0-9]+]] { ; CHECK-NEXT: [[ENTRY:.*]]: ; CHECK-NEXT: br i1 false, label %[[SCALAR_PH:.*]], label %[[VECTOR_PH:.*]] ; CHECK: [[VECTOR_PH]]: @@ -104,9 +104,9 @@ exit: ret void } -define void @align_deref_assumption_in_header_constant_trip_count_loop_invariant_ptr(ptr noalias %a, ptr noalias %b, ptr noalias %c) { +define void @align_deref_assumption_in_header_constant_trip_count_loop_invariant_ptr(ptr noalias %a, ptr noalias %b, ptr noalias %c) nofree nosync{ ; CHECK-LABEL: define void @align_deref_assumption_in_header_constant_trip_count_loop_invariant_ptr( -; CHECK-SAME: ptr noalias [[A:%.*]], ptr noalias [[B:%.*]], ptr noalias [[C:%.*]]) { +; CHECK-SAME: ptr noalias [[A:%.*]], ptr noalias [[B:%.*]], ptr noalias [[C:%.*]]) #[[ATTR1]] { ; CHECK-NEXT: [[ENTRY:.*]]: ; CHECK-NEXT: call void @llvm.assume(i1 true) [ "align"(ptr [[A]], i64 4), 
"dereferenceable"(ptr [[A]], i64 4) ] ; CHECK-NEXT: br i1 false, label %[[SCALAR_PH:.*]], label %[[VECTOR_PH:.*]] @@ -181,9 +181,9 @@ exit: ret void } -define void @deref_assumption_too_small_in_header_constant_trip_count(ptr noalias %a, ptr noalias %b, ptr noalias %c) { +define void @deref_assumption_too_small_in_header_constant_trip_count(ptr noalias %a, ptr noalias %b, ptr noalias %c) nofree nosync{ ; CHECK-LABEL: define void @deref_assumption_too_small_in_header_constant_trip_count( -; CHECK-SAME: ptr noalias [[A:%.*]], ptr noalias [[B:%.*]], ptr noalias [[C:%.*]]) { +; CHECK-SAME: ptr noalias [[A:%.*]], ptr noalias [[B:%.*]], ptr noalias [[C:%.*]]) #[[ATTR1]] { ; CHECK-NEXT: [[ENTRY:.*]]: ; CHECK-NEXT: br i1 false, label %[[SCALAR_PH:.*]], label %[[VECTOR_PH:.*]] ; CHECK: [[VECTOR_PH]]: @@ -282,9 +282,9 @@ exit: ret void } -define void @deref_assumption_in_header_constant_trip_count_align_1(ptr noalias %a, ptr noalias %b, ptr noalias %c) { +define void @deref_assumption_in_header_constant_trip_count_align_1(ptr noalias %a, ptr noalias %b, ptr noalias %c) nofree nosync{ ; CHECK-LABEL: define void @deref_assumption_in_header_constant_trip_count_align_1( -; CHECK-SAME: ptr noalias [[A:%.*]], ptr noalias [[B:%.*]], ptr noalias [[C:%.*]]) { +; CHECK-SAME: ptr noalias [[A:%.*]], ptr noalias [[B:%.*]], ptr noalias [[C:%.*]]) #[[ATTR1]] { ; CHECK-NEXT: [[ENTRY:.*]]: ; CHECK-NEXT: br i1 false, label %[[SCALAR_PH:.*]], label %[[VECTOR_PH:.*]] ; CHECK: [[VECTOR_PH]]: @@ -383,9 +383,9 @@ exit: ret void } -define void @deref_assumption_in_header_constant_trip_count_align_via_arg_attribute(ptr noalias align 4 %a, ptr noalias %b, ptr noalias %c) { +define void @deref_assumption_in_header_constant_trip_count_align_via_arg_attribute(ptr noalias align 4 %a, ptr noalias %b, ptr noalias %c) nofree nosync{ ; CHECK-LABEL: define void @deref_assumption_in_header_constant_trip_count_align_via_arg_attribute( -; CHECK-SAME: ptr noalias align 4 [[A:%.*]], ptr noalias [[B:%.*]], ptr 
noalias [[C:%.*]]) { +; CHECK-SAME: ptr noalias align 4 [[A:%.*]], ptr noalias [[B:%.*]], ptr noalias [[C:%.*]]) #[[ATTR1]] { ; CHECK-NEXT: [[ENTRY:.*]]: ; CHECK-NEXT: br i1 false, label %[[SCALAR_PH:.*]], label %[[VECTOR_PH:.*]] ; CHECK: [[VECTOR_PH]]: @@ -484,9 +484,9 @@ exit: ret void } -define void @deref_assumption_in_header_constant_trip_count_align_not_known(ptr noalias %a, ptr noalias %b, ptr noalias %c) { +define void @deref_assumption_in_header_constant_trip_count_align_not_known(ptr noalias %a, ptr noalias %b, ptr noalias %c) nofree nosync{ ; CHECK-LABEL: define void @deref_assumption_in_header_constant_trip_count_align_not_known( -; CHECK-SAME: ptr noalias [[A:%.*]], ptr noalias [[B:%.*]], ptr noalias [[C:%.*]]) { +; CHECK-SAME: ptr noalias [[A:%.*]], ptr noalias [[B:%.*]], ptr noalias [[C:%.*]]) #[[ATTR1]] { ; CHECK-NEXT: [[ENTRY:.*]]: ; CHECK-NEXT: br i1 false, label %[[SCALAR_PH:.*]], label %[[VECTOR_PH:.*]] ; CHECK: [[VECTOR_PH]]: @@ -585,9 +585,9 @@ exit: ret void } -define void @deref_assumption_in_then_constant_trip_count(ptr noalias %a, ptr noalias %b, ptr noalias %c) { +define void @deref_assumption_in_then_constant_trip_count(ptr noalias %a, ptr noalias %b, ptr noalias %c) nofree nosync{ ; CHECK-LABEL: define void @deref_assumption_in_then_constant_trip_count( -; CHECK-SAME: ptr noalias [[A:%.*]], ptr noalias [[B:%.*]], ptr noalias [[C:%.*]]) { +; CHECK-SAME: ptr noalias [[A:%.*]], ptr noalias [[B:%.*]], ptr noalias [[C:%.*]]) #[[ATTR1]] { ; CHECK-NEXT: [[ENTRY:.*]]: ; CHECK-NEXT: br i1 false, label %[[SCALAR_PH:.*]], label %[[VECTOR_PH:.*]] ; CHECK: [[VECTOR_PH]]: @@ -682,9 +682,9 @@ exit: ret void } -define void @deref_assumption_in_latch_constant_trip_count(ptr noalias %a, ptr noalias %b, ptr noalias %c) { +define void @deref_assumption_in_latch_constant_trip_count(ptr noalias %a, ptr noalias %b, ptr noalias %c) nofree nosync{ ; CHECK-LABEL: define void @deref_assumption_in_latch_constant_trip_count( -; CHECK-SAME: ptr noalias [[A:%.*]], 
ptr noalias [[B:%.*]], ptr noalias [[C:%.*]]) { +; CHECK-SAME: ptr noalias [[A:%.*]], ptr noalias [[B:%.*]], ptr noalias [[C:%.*]]) #[[ATTR1]] { ; CHECK-NEXT: [[ENTRY:.*]]: ; CHECK-NEXT: br i1 false, label %[[SCALAR_PH:.*]], label %[[VECTOR_PH:.*]] ; CHECK: [[VECTOR_PH]]: @@ -785,9 +785,9 @@ exit: ret void } -define void @deref_assumption_in_header_variable_trip_count(ptr noalias %a, ptr noalias %b, ptr noalias %c, i64 %N) { +define void @deref_assumption_in_header_variable_trip_count(ptr noalias %a, ptr noalias %b, ptr noalias %c, i64 %N) nofree nosync{ ; CHECK-LABEL: define void @deref_assumption_in_header_variable_trip_count( -; CHECK-SAME: ptr noalias [[A:%.*]], ptr noalias [[B:%.*]], ptr noalias [[C:%.*]], i64 [[N:%.*]]) { +; CHECK-SAME: ptr noalias [[A:%.*]], ptr noalias [[B:%.*]], ptr noalias [[C:%.*]], i64 [[N:%.*]]) #[[ATTR1]] { ; CHECK-NEXT: [[ENTRY:.*]]: ; CHECK-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[N]], 2 ; CHECK-NEXT: br i1 [[MIN_ITERS_CHECK]], label %[[SCALAR_PH:.*]], label %[[VECTOR_PH:.*]] @@ -890,9 +890,9 @@ exit: ret void } -define void @deref_assumption_in_preheader_constant_trip_count_align_1(ptr noalias %a, ptr noalias %b, ptr noalias %c) { +define void @deref_assumption_in_preheader_constant_trip_count_align_1(ptr noalias %a, ptr noalias %b, ptr noalias %c) nofree nosync{ ; CHECK-LABEL: define void @deref_assumption_in_preheader_constant_trip_count_align_1( -; CHECK-SAME: ptr noalias [[A:%.*]], ptr noalias [[B:%.*]], ptr noalias [[C:%.*]]) { +; CHECK-SAME: ptr noalias [[A:%.*]], ptr noalias [[B:%.*]], ptr noalias [[C:%.*]]) #[[ATTR1]] { ; CHECK-NEXT: [[ENTRY:.*]]: ; CHECK-NEXT: call void @llvm.assume(i1 true) [ "dereferenceable"(ptr [[A]], i64 4000) ] ; CHECK-NEXT: br i1 false, label %[[SCALAR_PH:.*]], label %[[VECTOR_PH:.*]] @@ -968,9 +968,9 @@ exit: ret void } -define void @deref_assumption_too_small_in_preheader_constant_trip_count_align_1(ptr noalias %a, ptr noalias %b, ptr noalias %c) { +define void 
@deref_assumption_too_small_in_preheader_constant_trip_count_align_1(ptr noalias %a, ptr noalias %b, ptr noalias %c) nofree nosync{ ; CHECK-LABEL: define void @deref_assumption_too_small_in_preheader_constant_trip_count_align_1( -; CHECK-SAME: ptr noalias [[A:%.*]], ptr noalias [[B:%.*]], ptr noalias [[C:%.*]]) { +; CHECK-SAME: ptr noalias [[A:%.*]], ptr noalias [[B:%.*]], ptr noalias [[C:%.*]]) #[[ATTR1]] { ; CHECK-NEXT: [[ENTRY:.*]]: ; CHECK-NEXT: call void @llvm.assume(i1 true) [ "dereferenceable"(ptr [[A]], i64 3999) ] ; CHECK-NEXT: br i1 false, label %[[SCALAR_PH:.*]], label %[[VECTOR_PH:.*]] @@ -1063,9 +1063,9 @@ exit: ret void } -define void @align_and_deref_assumption_in_preheader_constant_trip_count_align_4(ptr noalias %a, ptr noalias %b, ptr noalias %c) { +define void @align_and_deref_assumption_in_preheader_constant_trip_count_align_4(ptr noalias %a, ptr noalias %b, ptr noalias %c) nofree nosync{ ; CHECK-LABEL: define void @align_and_deref_assumption_in_preheader_constant_trip_count_align_4( -; CHECK-SAME: ptr noalias [[A:%.*]], ptr noalias [[B:%.*]], ptr noalias [[C:%.*]]) { +; CHECK-SAME: ptr noalias [[A:%.*]], ptr noalias [[B:%.*]], ptr noalias [[C:%.*]]) #[[ATTR1]] { ; CHECK-NEXT: [[ENTRY:.*]]: ; CHECK-NEXT: call void @llvm.assume(i1 true) [ "align"(ptr [[A]], i64 4), "dereferenceable"(ptr [[A]], i64 4000) ] ; CHECK-NEXT: br i1 false, label %[[SCALAR_PH:.*]], label %[[VECTOR_PH:.*]] @@ -1142,9 +1142,9 @@ exit: } -define void @deref_assumption_in_preheader_constant_trip_count_align_4_known_via_argument_attr(ptr noalias align 4 %a, ptr noalias %b, ptr noalias %c) { +define void @deref_assumption_in_preheader_constant_trip_count_align_4_known_via_argument_attr(ptr noalias align 4 %a, ptr noalias %b, ptr noalias %c) nofree nosync{ ; CHECK-LABEL: define void @deref_assumption_in_preheader_constant_trip_count_align_4_known_via_argument_attr( -; CHECK-SAME: ptr noalias align 4 [[A:%.*]], ptr noalias [[B:%.*]], ptr noalias [[C:%.*]]) { +; CHECK-SAME: ptr 
noalias align 4 [[A:%.*]], ptr noalias [[B:%.*]], ptr noalias [[C:%.*]]) #[[ATTR1]] { ; CHECK-NEXT: [[ENTRY:.*]]: ; CHECK-NEXT: call void @llvm.assume(i1 true) [ "dereferenceable"(ptr [[A]], i64 4000) ] ; CHECK-NEXT: br i1 false, label %[[SCALAR_PH:.*]], label %[[VECTOR_PH:.*]] @@ -1220,9 +1220,9 @@ exit: ret void } -define void @deref_assumption_in_preheader_constant_trip_count_align_4_not_known(ptr noalias %a, ptr noalias %b, ptr noalias %c) { +define void @deref_assumption_in_preheader_constant_trip_count_align_4_not_known(ptr noalias %a, ptr noalias %b, ptr noalias %c) nofree nosync{ ; CHECK-LABEL: define void @deref_assumption_in_preheader_constant_trip_count_align_4_not_known( -; CHECK-SAME: ptr noalias [[A:%.*]], ptr noalias [[B:%.*]], ptr noalias [[C:%.*]]) { +; CHECK-SAME: ptr noalias [[A:%.*]], ptr noalias [[B:%.*]], ptr noalias [[C:%.*]]) #[[ATTR1]] { ; CHECK-NEXT: [[ENTRY:.*]]: ; CHECK-NEXT: call void @llvm.assume(i1 true) [ "dereferenceable"(ptr [[A]], i64 4000) ] ; CHECK-NEXT: br i1 false, label %[[SCALAR_PH:.*]], label %[[VECTOR_PH:.*]] @@ -1315,9 +1315,9 @@ exit: ret void } -define void @deref_assumption_too_small_in_preheader_constant_trip_count_align_4(ptr noalias %a, ptr noalias %b, ptr noalias %c) { +define void @deref_assumption_too_small_in_preheader_constant_trip_count_align_4(ptr noalias %a, ptr noalias %b, ptr noalias %c) nofree nosync{ ; CHECK-LABEL: define void @deref_assumption_too_small_in_preheader_constant_trip_count_align_4( -; CHECK-SAME: ptr noalias [[A:%.*]], ptr noalias [[B:%.*]], ptr noalias [[C:%.*]]) { +; CHECK-SAME: ptr noalias [[A:%.*]], ptr noalias [[B:%.*]], ptr noalias [[C:%.*]]) #[[ATTR1]] { ; CHECK-NEXT: [[ENTRY:.*]]: ; CHECK-NEXT: call void @llvm.assume(i1 true) [ "dereferenceable"(ptr [[A]], i64 3999) ] ; CHECK-NEXT: br i1 false, label %[[SCALAR_PH:.*]], label %[[VECTOR_PH:.*]] @@ -1409,6 +1409,200 @@ loop.latch: exit: ret void } + +; %a may be freed between the dereferenceable assumption and accesses. 
+; It is not safe to use with -use-dereferenceable-at-point-semantics. +define void @may_free_align_deref_assumption_in_header_constant_trip_count_loop_invariant_ptr(ptr noalias %a, ptr noalias %b, ptr noalias %c) { +; CHECK-LABEL: define void @may_free_align_deref_assumption_in_header_constant_trip_count_loop_invariant_ptr( +; CHECK-SAME: ptr noalias [[A:%.*]], ptr noalias [[B:%.*]], ptr noalias [[C:%.*]]) { +; CHECK-NEXT: [[ENTRY:.*]]: +; CHECK-NEXT: call void @llvm.assume(i1 true) [ "align"(ptr [[A]], i64 4), "dereferenceable"(ptr [[A]], i64 4) ] +; CHECK-NEXT: call void @may_free() +; CHECK-NEXT: br i1 false, label %[[SCALAR_PH:.*]], label %[[VECTOR_PH:.*]] +; CHECK: [[VECTOR_PH]]: +; CHECK-NEXT: br label %[[VECTOR_BODY:.*]] +; CHECK: [[VECTOR_BODY]]: +; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[PRED_LOAD_CONTINUE2:.*]] ] +; CHECK-NEXT: [[TMP0:%.*]] = add i64 [[INDEX]], 0 +; CHECK-NEXT: [[TMP1:%.*]] = getelementptr inbounds i32, ptr [[B]], i64 [[TMP0]] +; CHECK-NEXT: [[TMP2:%.*]] = getelementptr inbounds i32, ptr [[TMP1]], i32 0 +; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <2 x i32>, ptr [[TMP2]], align 4 +; CHECK-NEXT: [[TMP3:%.*]] = icmp sge <2 x i32> [[WIDE_LOAD]], zeroinitializer +; CHECK-NEXT: [[TMP4:%.*]] = xor <2 x i1> [[TMP3]], splat (i1 true) +; CHECK-NEXT: [[TMP5:%.*]] = extractelement <2 x i1> [[TMP4]], i32 0 +; CHECK-NEXT: br i1 [[TMP5]], label %[[PRED_LOAD_IF:.*]], label %[[PRED_LOAD_CONTINUE:.*]] +; CHECK: [[PRED_LOAD_IF]]: +; CHECK-NEXT: [[TMP15:%.*]] = load i32, ptr [[A]], align 4 +; CHECK-NEXT: [[TMP7:%.*]] = insertelement <2 x i32> poison, i32 [[TMP15]], i32 0 +; CHECK-NEXT: br label %[[PRED_LOAD_CONTINUE]] +; CHECK: [[PRED_LOAD_CONTINUE]]: +; CHECK-NEXT: [[TMP12:%.*]] = phi <2 x i32> [ poison, %[[VECTOR_BODY]] ], [ [[TMP7]], %[[PRED_LOAD_IF]] ] +; CHECK-NEXT: [[TMP13:%.*]] = extractelement <2 x i1> [[TMP4]], i32 1 +; CHECK-NEXT: br i1 [[TMP13]], label %[[PRED_LOAD_IF1:.*]], label %[[PRED_LOAD_CONTINUE2]] 
+; CHECK: [[PRED_LOAD_IF1]]: +; CHECK-NEXT: [[TMP14:%.*]] = load i32, ptr [[A]], align 4 +; CHECK-NEXT: [[TMP16:%.*]] = insertelement <2 x i32> [[TMP12]], i32 [[TMP14]], i32 1 +; CHECK-NEXT: br label %[[PRED_LOAD_CONTINUE2]] +; CHECK: [[PRED_LOAD_CONTINUE2]]: +; CHECK-NEXT: [[TMP11:%.*]] = phi <2 x i32> [ [[TMP12]], %[[PRED_LOAD_CONTINUE]] ], [ [[TMP16]], %[[PRED_LOAD_IF1]] ] +; CHECK-NEXT: [[PREDPHI:%.*]] = select <2 x i1> [[TMP3]], <2 x i32> [[WIDE_LOAD]], <2 x i32> [[TMP11]] +; CHECK-NEXT: [[TMP8:%.*]] = getelementptr inbounds i32, ptr [[C]], i64 [[TMP0]] +; CHECK-NEXT: [[TMP9:%.*]] = getelementptr inbounds i32, ptr [[TMP8]], i32 0 +; CHECK-NEXT: store <2 x i32> [[PREDPHI]], ptr [[TMP9]], align 4 +; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 2 +; CHECK-NEXT: [[TMP10:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1000 +; CHECK-NEXT: br i1 [[TMP10]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP32:![0-9]+]] +; CHECK: [[MIDDLE_BLOCK]]: +; CHECK-NEXT: br i1 true, label %[[EXIT:.*]], label %[[SCALAR_PH]] +; CHECK: [[SCALAR_PH]]: +; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 1000, %[[MIDDLE_BLOCK]] ], [ 0, %[[ENTRY]] ] +; CHECK-NEXT: br label %[[LOOP_HEADER:.*]] +; CHECK: [[LOOP_HEADER]]: +; CHECK-NEXT: [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], %[[LOOP_LATCH:.*]] ] +; CHECK-NEXT: [[GEP_B:%.*]] = getelementptr inbounds i32, ptr [[B]], i64 [[IV]] +; CHECK-NEXT: [[L_B:%.*]] = load i32, ptr [[GEP_B]], align 4 +; CHECK-NEXT: [[C_1:%.*]] = icmp sge i32 [[L_B]], 0 +; CHECK-NEXT: br i1 [[C_1]], label %[[LOOP_LATCH]], label %[[LOOP_THEN:.*]] +; CHECK: [[LOOP_THEN]]: +; CHECK-NEXT: [[L_A:%.*]] = load i32, ptr [[A]], align 4 +; CHECK-NEXT: br label %[[LOOP_LATCH]] +; CHECK: [[LOOP_LATCH]]: +; CHECK-NEXT: [[MERGE:%.*]] = phi i32 [ [[L_A]], %[[LOOP_THEN]] ], [ [[L_B]], %[[LOOP_HEADER]] ] +; CHECK-NEXT: [[GEP_C:%.*]] = getelementptr inbounds i32, ptr [[C]], i64 [[IV]] +; CHECK-NEXT: store i32 [[MERGE]], ptr [[GEP_C]], 
align 4 +; CHECK-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1 +; CHECK-NEXT: [[EC:%.*]] = icmp eq i64 [[IV_NEXT]], 1000 +; CHECK-NEXT: br i1 [[EC]], label %[[EXIT]], label %[[LOOP_HEADER]], !llvm.loop [[LOOP33:![0-9]+]] +; CHECK: [[EXIT]]: +; CHECK-NEXT: ret void +; +entry: + call void @llvm.assume(i1 true) [ "align"(ptr %a, i64 4), "dereferenceable"(ptr %a, i64 4) ] + call void @may_free() + br label %loop.header + +loop.header: + %iv = phi i64 [ 0, %entry ], [ %iv.next, %loop.latch ] + %gep.b = getelementptr inbounds i32, ptr %b, i64 %iv + %l.b = load i32, ptr %gep.b, align 4 + %c.1 = icmp sge i32 %l.b, 0 + br i1 %c.1, label %loop.latch, label %loop.then + +loop.then: + %l.a = load i32, ptr %a, align 4 + br label %loop.latch + +loop.latch: + %merge = phi i32 [ %l.a, %loop.then ], [ %l.b, %loop.header ] + %gep.c = getelementptr inbounds i32, ptr %c, i64 %iv + store i32 %merge, ptr %gep.c, align 4 + %iv.next = add nuw nsw i64 %iv, 1 + %ec = icmp eq i64 %iv.next, 1000 + br i1 %ec, label %exit, label %loop.header + +exit: + ret void +} + +; %a may be freed between the dereferenceable assumption and accesses. +; It is not safe to use with -use-dereferenceable-at-point-semantics. 
+define void @may_free_local_ptr_align_deref_assumption_in_header_constant_trip_count_loop_invariant_ptr(ptr noalias %b, ptr noalias %c) nofree nosync { +; CHECK-LABEL: define void @may_free_local_ptr_align_deref_assumption_in_header_constant_trip_count_loop_invariant_ptr( +; CHECK-SAME: ptr noalias [[B:%.*]], ptr noalias [[C:%.*]]) #[[ATTR1]] { +; CHECK-NEXT: [[ENTRY:.*]]: +; CHECK-NEXT: [[A:%.*]] = call ptr @get_ptr() +; CHECK-NEXT: call void @llvm.assume(i1 true) [ "align"(ptr [[A]], i64 4), "dereferenceable"(ptr [[A]], i64 4) ] +; CHECK-NEXT: call void @may_free() +; CHECK-NEXT: br i1 false, label %[[SCALAR_PH:.*]], label %[[VECTOR_PH:.*]] +; CHECK: [[VECTOR_PH]]: +; CHECK-NEXT: br label %[[VECTOR_BODY:.*]] +; CHECK: [[VECTOR_BODY]]: +; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[PRED_LOAD_CONTINUE2:.*]] ] +; CHECK-NEXT: [[TMP0:%.*]] = add i64 [[INDEX]], 0 +; CHECK-NEXT: [[TMP1:%.*]] = getelementptr inbounds i32, ptr [[B]], i64 [[TMP0]] +; CHECK-NEXT: [[TMP2:%.*]] = getelementptr inbounds i32, ptr [[TMP1]], i32 0 +; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <2 x i32>, ptr [[TMP2]], align 4 +; CHECK-NEXT: [[TMP3:%.*]] = icmp sge <2 x i32> [[WIDE_LOAD]], zeroinitializer +; CHECK-NEXT: [[TMP4:%.*]] = xor <2 x i1> [[TMP3]], splat (i1 true) +; CHECK-NEXT: [[TMP5:%.*]] = extractelement <2 x i1> [[TMP4]], i32 0 +; CHECK-NEXT: br i1 [[TMP5]], label %[[PRED_LOAD_IF:.*]], label %[[PRED_LOAD_CONTINUE:.*]] +; CHECK: [[PRED_LOAD_IF]]: +; CHECK-NEXT: [[TMP6:%.*]] = load i32, ptr [[A]], align 4 +; CHECK-NEXT: [[TMP7:%.*]] = insertelement <2 x i32> poison, i32 [[TMP6]], i32 0 +; CHECK-NEXT: br label %[[PRED_LOAD_CONTINUE]] +; CHECK: [[PRED_LOAD_CONTINUE]]: +; CHECK-NEXT: [[TMP8:%.*]] = phi <2 x i32> [ poison, %[[VECTOR_BODY]] ], [ [[TMP7]], %[[PRED_LOAD_IF]] ] +; CHECK-NEXT: [[TMP9:%.*]] = extractelement <2 x i1> [[TMP4]], i32 1 +; CHECK-NEXT: br i1 [[TMP9]], label %[[PRED_LOAD_IF1:.*]], label %[[PRED_LOAD_CONTINUE2]] +; CHECK: 
[[PRED_LOAD_IF1]]: +; CHECK-NEXT: [[TMP10:%.*]] = load i32, ptr [[A]], align 4 +; CHECK-NEXT: [[TMP11:%.*]] = insertelement <2 x i32> [[TMP8]], i32 [[TMP10]], i32 1 +; CHECK-NEXT: br label %[[PRED_LOAD_CONTINUE2]] +; CHECK: [[PRED_LOAD_CONTINUE2]]: +; CHECK-NEXT: [[TMP12:%.*]] = phi <2 x i32> [ [[TMP8]], %[[PRED_LOAD_CONTINUE]] ], [ [[TMP11]], %[[PRED_LOAD_IF1]] ] +; CHECK-NEXT: [[PREDPHI:%.*]] = select <2 x i1> [[TMP3]], <2 x i32> [[WIDE_LOAD]], <2 x i32> [[TMP12]] +; CHECK-NEXT: [[TMP13:%.*]] = getelementptr inbounds i32, ptr [[C]], i64 [[TMP0]] +; CHECK-NEXT: [[TMP14:%.*]] = getelementptr inbounds i32, ptr [[TMP13]], i32 0 +; CHECK-NEXT: store <2 x i32> [[PREDPHI]], ptr [[TMP14]], align 4 +; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 2 +; CHECK-NEXT: [[TMP15:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1000 +; CHECK-NEXT: br i1 [[TMP15]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP34:![0-9]+]] +; CHECK: [[MIDDLE_BLOCK]]: +; CHECK-NEXT: br i1 true, label %[[EXIT:.*]], label %[[SCALAR_PH]] +; CHECK: [[SCALAR_PH]]: +; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 1000, %[[MIDDLE_BLOCK]] ], [ 0, %[[ENTRY]] ] +; CHECK-NEXT: br label %[[LOOP_HEADER:.*]] +; CHECK: [[LOOP_HEADER]]: +; CHECK-NEXT: [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], %[[LOOP_LATCH:.*]] ] +; CHECK-NEXT: [[GEP_B:%.*]] = getelementptr inbounds i32, ptr [[B]], i64 [[IV]] +; CHECK-NEXT: [[L_B:%.*]] = load i32, ptr [[GEP_B]], align 4 +; CHECK-NEXT: [[C_1:%.*]] = icmp sge i32 [[L_B]], 0 +; CHECK-NEXT: br i1 [[C_1]], label %[[LOOP_LATCH]], label %[[LOOP_THEN:.*]] +; CHECK: [[LOOP_THEN]]: +; CHECK-NEXT: [[L_A:%.*]] = load i32, ptr [[A]], align 4 +; CHECK-NEXT: br label %[[LOOP_LATCH]] +; CHECK: [[LOOP_LATCH]]: +; CHECK-NEXT: [[MERGE:%.*]] = phi i32 [ [[L_A]], %[[LOOP_THEN]] ], [ [[L_B]], %[[LOOP_HEADER]] ] +; CHECK-NEXT: [[GEP_C:%.*]] = getelementptr inbounds i32, ptr [[C]], i64 [[IV]] +; CHECK-NEXT: store i32 [[MERGE]], ptr [[GEP_C]], align 4 +; 
CHECK-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1 +; CHECK-NEXT: [[EC:%.*]] = icmp eq i64 [[IV_NEXT]], 1000 +; CHECK-NEXT: br i1 [[EC]], label %[[EXIT]], label %[[LOOP_HEADER]], !llvm.loop [[LOOP35:![0-9]+]] +; CHECK: [[EXIT]]: +; CHECK-NEXT: ret void +; +entry: + %a = call ptr @get_ptr() + call void @llvm.assume(i1 true) [ "align"(ptr %a, i64 4), "dereferenceable"(ptr %a, i64 4) ] + call void @may_free() + br label %loop.header + +loop.header: + %iv = phi i64 [ 0, %entry ], [ %iv.next, %loop.latch ] + %gep.b = getelementptr inbounds i32, ptr %b, i64 %iv + %l.b = load i32, ptr %gep.b, align 4 + %c.1 = icmp sge i32 %l.b, 0 + br i1 %c.1, label %loop.latch, label %loop.then + +loop.then: + %l.a = load i32, ptr %a, align 4 + br label %loop.latch + +loop.latch: + %merge = phi i32 [ %l.a, %loop.then ], [ %l.b, %loop.header ] + %gep.c = getelementptr inbounds i32, ptr %c, i64 %iv + store i32 %merge, ptr %gep.c, align 4 + %iv.next = add nuw nsw i64 %iv, 1 + %ec = icmp eq i64 %iv.next, 1000 + br i1 %ec, label %exit, label %loop.header + +exit: + ret void +} + +declare ptr @get_ptr() +declare void @may_free() + ;. ; CHECK: [[LOOP0]] = distinct !{[[LOOP0]], [[META1:![0-9]+]], [[META2:![0-9]+]]} ; CHECK: [[META1]] = !{!"llvm.loop.isvectorized", i32 1} @@ -1442,4 +1636,8 @@ exit: ; CHECK: [[LOOP29]] = distinct !{[[LOOP29]], [[META2]], [[META1]]} ; CHECK: [[LOOP30]] = distinct !{[[LOOP30]], [[META1]], [[META2]]} ; CHECK: [[LOOP31]] = distinct !{[[LOOP31]], [[META2]], [[META1]]} +; CHECK: [[LOOP32]] = distinct !{[[LOOP32]], [[META1]], [[META2]]} +; CHECK: [[LOOP33]] = distinct !{[[LOOP33]], [[META2]], [[META1]]} +; CHECK: [[LOOP34]] = distinct !{[[LOOP34]], [[META1]], [[META2]]} +; CHECK: [[LOOP35]] = distinct !{[[LOOP35]], [[META2]], [[META1]]} ;. 
diff --git a/llvm/test/Transforms/LoopVectorize/no_outside_user.ll b/llvm/test/Transforms/LoopVectorize/no_outside_user.ll index 713b8f8d97951..81912b078b3b7 100644 --- a/llvm/test/Transforms/LoopVectorize/no_outside_user.ll +++ b/llvm/test/Transforms/LoopVectorize/no_outside_user.ll @@ -1,4 +1,4 @@ -; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 5 +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --prefix-filecheck-ir-name unnamed --version 5 ; RUN: opt -S -passes=loop-vectorize -force-vector-interleave=1 -force-vector-width=2 < %s 2>&1 | FileCheck %s target datalayout = "e-p:32:32:32-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:32:64-f32:32:32-f64:32:64-v64:64:64-v128:128:128-a0:0:64-f80:32:32-n8:16:32-S128" @@ -23,11 +23,11 @@ define i32 @test1() { ; CHECK-NEXT: [[SMAX:%.*]] = call i32 @llvm.smax.i32(i32 [[TMP0]], i32 4) ; CHECK-NEXT: [[TMP1:%.*]] = sub i32 [[SMAX]], [[B_PROMOTED]] ; CHECK-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i32 [[TMP1]], 2 -; CHECK-NEXT: br i1 [[MIN_ITERS_CHECK]], label %[[_LR_PH_I1:.*]], label %[[VECTOR_PH:.*]] +; CHECK-NEXT: br i1 [[MIN_ITERS_CHECK]], label %[[SCALAR_PH:.*]], label %[[VECTOR_PH:.*]] ; CHECK: [[VECTOR_PH]]: ; CHECK-NEXT: [[N_MOD_VF:%.*]] = urem i32 [[TMP1]], 2 ; CHECK-NEXT: [[N_VEC:%.*]] = sub i32 [[TMP1]], [[N_MOD_VF]] -; CHECK-NEXT: [[OFFSET_IDX:%.*]] = add i32 [[B_PROMOTED]], [[N_VEC]] +; CHECK-NEXT: [[TMP2:%.*]] = add i32 [[B_PROMOTED]], [[N_VEC]] ; CHECK-NEXT: [[DOTSPLATINSERT:%.*]] = insertelement <2 x i32> poison, i32 [[B_PROMOTED]], i64 0 ; CHECK-NEXT: [[DOTSPLAT:%.*]] = shufflevector <2 x i32> [[DOTSPLATINSERT]], <2 x i32> poison, <2 x i32> zeroinitializer ; CHECK-NEXT: [[INDUCTION:%.*]] = add <2 x i32> [[DOTSPLAT]], @@ -35,32 +35,32 @@ define i32 @test1() { ; CHECK: [[VECTOR_BODY]]: ; CHECK-NEXT: [[INDEX:%.*]] = phi i32 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ] ; CHECK-NEXT: [[VEC_IND:%.*]] = phi <2 x i32> [ 
[[INDUCTION]], %[[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], %[[VECTOR_BODY]] ] -; CHECK-NEXT: [[TMP2:%.*]] = icmp sgt <2 x i32> [[VEC_IND]], splat (i32 10) -; CHECK-NEXT: [[PREDPHI:%.*]] = select <2 x i1> [[TMP2]], <2 x i32> splat (i32 1), <2 x i32> zeroinitializer +; CHECK-NEXT: [[TMP3:%.*]] = icmp sgt <2 x i32> [[VEC_IND]], splat (i32 10) +; CHECK-NEXT: [[PREDPHI:%.*]] = select <2 x i1> [[TMP3]], <2 x i32> splat (i32 1), <2 x i32> zeroinitializer ; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i32 [[INDEX]], 2 ; CHECK-NEXT: [[VEC_IND_NEXT]] = add <2 x i32> [[VEC_IND]], splat (i32 2) -; CHECK-NEXT: [[TMP3:%.*]] = icmp eq i32 [[INDEX_NEXT]], [[N_VEC]] -; CHECK-NEXT: br i1 [[TMP3]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]] +; CHECK-NEXT: [[TMP4:%.*]] = icmp eq i32 [[INDEX_NEXT]], [[N_VEC]] +; CHECK-NEXT: br i1 [[TMP4]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]] ; CHECK: [[MIDDLE_BLOCK]]: -; CHECK-NEXT: [[TMP4:%.*]] = extractelement <2 x i32> [[PREDPHI]], i32 1 +; CHECK-NEXT: [[TMP5:%.*]] = extractelement <2 x i32> [[PREDPHI]], i32 1 ; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i32 [[TMP1]], [[N_VEC]] -; CHECK-NEXT: br i1 [[CMP_N]], label %[[F1_EXIT_LOOPEXIT:.*]], label %[[_LR_PH_I1]] -; CHECK: [[_LR_PH_I1]]: -; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i32 [ [[OFFSET_IDX]], %[[MIDDLE_BLOCK]] ], [ [[B_PROMOTED]], %[[BB]] ] +; CHECK-NEXT: br i1 [[CMP_N]], label %[[F1_EXIT_LOOPEXIT:.*]], label %[[SCALAR_PH]] +; CHECK: [[SCALAR_PH]]: +; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i32 [ [[TMP2]], %[[MIDDLE_BLOCK]] ], [ [[B_PROMOTED]], %[[BB]] ] ; CHECK-NEXT: br label %[[DOTLR_PH_I:.*]] ; CHECK: [[_LR_PH_I:.*:]] -; CHECK-NEXT: [[TMP8:%.*]] = phi i32 [ [[TMP18:%.*]], %[[BB16:.*]] ], [ [[BC_RESUME_VAL]], %[[_LR_PH_I1]] ] -; CHECK-NEXT: [[TMP2:%.*]] = icmp sgt i32 [[TMP8]], 10 -; CHECK-NEXT: br i1 [[TMP2]], label %[[BB16]], label %[[BB10:.*]] -; CHECK: [[BB10]]: +; CHECK-NEXT: [[UNNAMEDTMP8:%.*]] = phi i32 [ 
[[UNNAMEDTMP18:%.*]], %[[BB16:.*]] ], [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ] +; CHECK-NEXT: [[UNNAMEDTMP2:%.*]] = icmp sgt i32 [[UNNAMEDTMP8]], 10 +; CHECK-NEXT: br i1 [[UNNAMEDTMP2]], label %[[BB16]], label %[[UNNAMEDBB10:.*]] +; CHECK: [[UNNAMEDBB10]]: ; CHECK-NEXT: br label %[[BB16]] ; CHECK: [[BB16]]: -; CHECK-NEXT: [[TMP17:%.*]] = phi i32 [ 0, %[[BB10]] ], [ 1, %[[DOTLR_PH_I]] ] -; CHECK-NEXT: [[TMP18]] = add nsw i32 [[TMP8]], 1 -; CHECK-NEXT: [[TMP19:%.*]] = icmp slt i32 [[TMP18]], 4 -; CHECK-NEXT: br i1 [[TMP19]], label %[[DOTLR_PH_I]], label %[[F1_EXIT_LOOPEXIT]], !llvm.loop [[LOOP3:![0-9]+]] +; CHECK-NEXT: [[UNNAMEDTMP17:%.*]] = phi i32 [ 0, %[[UNNAMEDBB10]] ], [ 1, %[[DOTLR_PH_I]] ] +; CHECK-NEXT: [[UNNAMEDTMP18]] = add nsw i32 [[UNNAMEDTMP8]], 1 +; CHECK-NEXT: [[UNNAMEDTMP19:%.*]] = icmp slt i32 [[UNNAMEDTMP18]], 4 +; CHECK-NEXT: br i1 [[UNNAMEDTMP19]], label %[[DOTLR_PH_I]], label %[[F1_EXIT_LOOPEXIT]], !llvm.loop [[LOOP3:![0-9]+]] ; CHECK: [[F1_EXIT_LOOPEXIT]]: -; CHECK-NEXT: [[DOTLCSSA:%.*]] = phi i32 [ [[TMP17]], %[[BB16]] ], [ [[TMP4]], %[[MIDDLE_BLOCK]] ] +; CHECK-NEXT: [[DOTLCSSA:%.*]] = phi i32 [ [[UNNAMEDTMP17]], %[[BB16]] ], [ [[TMP5]], %[[MIDDLE_BLOCK]] ] ; CHECK-NEXT: ret i32 [[DOTLCSSA]] ; bb: @@ -96,11 +96,11 @@ define i32 @test2() { ; CHECK-NEXT: [[SMAX:%.*]] = call i32 @llvm.smax.i32(i32 [[TMP0]], i32 4) ; CHECK-NEXT: [[TMP1:%.*]] = sub i32 [[SMAX]], [[B_PROMOTED]] ; CHECK-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i32 [[TMP1]], 2 -; CHECK-NEXT: br i1 [[MIN_ITERS_CHECK]], label %[[_LR_PH_I1:.*]], label %[[VECTOR_PH:.*]] +; CHECK-NEXT: br i1 [[MIN_ITERS_CHECK]], label %[[SCALAR_PH:.*]], label %[[VECTOR_PH:.*]] ; CHECK: [[VECTOR_PH]]: ; CHECK-NEXT: [[N_MOD_VF:%.*]] = urem i32 [[TMP1]], 2 ; CHECK-NEXT: [[N_VEC:%.*]] = sub i32 [[TMP1]], [[N_MOD_VF]] -; CHECK-NEXT: [[OFFSET_IDX:%.*]] = add i32 [[B_PROMOTED]], [[N_VEC]] +; CHECK-NEXT: [[TMP2:%.*]] = add i32 [[B_PROMOTED]], [[N_VEC]] ; CHECK-NEXT: [[DOTSPLATINSERT:%.*]] = insertelement <2 x i32> 
poison, i32 [[B_PROMOTED]], i64 0 ; CHECK-NEXT: [[DOTSPLAT:%.*]] = shufflevector <2 x i32> [[DOTSPLATINSERT]], <2 x i32> poison, <2 x i32> zeroinitializer ; CHECK-NEXT: [[INDUCTION:%.*]] = add <2 x i32> [[DOTSPLAT]], @@ -108,32 +108,32 @@ define i32 @test2() { ; CHECK: [[VECTOR_BODY]]: ; CHECK-NEXT: [[INDEX:%.*]] = phi i32 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ] ; CHECK-NEXT: [[VEC_IND:%.*]] = phi <2 x i32> [ [[INDUCTION]], %[[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], %[[VECTOR_BODY]] ] -; CHECK-NEXT: [[TMP2:%.*]] = icmp sgt <2 x i32> [[VEC_IND]], splat (i32 10) -; CHECK-NEXT: [[PREDPHI:%.*]] = select <2 x i1> [[TMP2]], <2 x i32> splat (i32 1), <2 x i32> [[VEC_IND]] +; CHECK-NEXT: [[TMP3:%.*]] = icmp sgt <2 x i32> [[VEC_IND]], splat (i32 10) +; CHECK-NEXT: [[PREDPHI:%.*]] = select <2 x i1> [[TMP3]], <2 x i32> splat (i32 1), <2 x i32> [[VEC_IND]] ; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i32 [[INDEX]], 2 ; CHECK-NEXT: [[VEC_IND_NEXT]] = add <2 x i32> [[VEC_IND]], splat (i32 2) -; CHECK-NEXT: [[TMP3:%.*]] = icmp eq i32 [[INDEX_NEXT]], [[N_VEC]] -; CHECK-NEXT: br i1 [[TMP3]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]] +; CHECK-NEXT: [[TMP4:%.*]] = icmp eq i32 [[INDEX_NEXT]], [[N_VEC]] +; CHECK-NEXT: br i1 [[TMP4]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]] ; CHECK: [[MIDDLE_BLOCK]]: -; CHECK-NEXT: [[TMP4:%.*]] = extractelement <2 x i32> [[PREDPHI]], i32 1 +; CHECK-NEXT: [[TMP5:%.*]] = extractelement <2 x i32> [[PREDPHI]], i32 1 ; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i32 [[TMP1]], [[N_VEC]] -; CHECK-NEXT: br i1 [[CMP_N]], label %[[F1_EXIT_LOOPEXIT:.*]], label %[[_LR_PH_I1]] -; CHECK: [[_LR_PH_I1]]: -; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i32 [ [[OFFSET_IDX]], %[[MIDDLE_BLOCK]] ], [ [[B_PROMOTED]], %[[BB]] ] +; CHECK-NEXT: br i1 [[CMP_N]], label %[[F1_EXIT_LOOPEXIT:.*]], label %[[SCALAR_PH]] +; CHECK: [[SCALAR_PH]]: +; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i32 [ [[TMP2]], 
%[[MIDDLE_BLOCK]] ], [ [[B_PROMOTED]], %[[BB]] ] ; CHECK-NEXT: br label %[[DOTLR_PH_I:.*]] ; CHECK: [[_LR_PH_I:.*:]] -; CHECK-NEXT: [[TMP8:%.*]] = phi i32 [ [[TMP18:%.*]], %[[BB16:.*]] ], [ [[BC_RESUME_VAL]], %[[_LR_PH_I1]] ] -; CHECK-NEXT: [[TMP2:%.*]] = icmp sgt i32 [[TMP8]], 10 -; CHECK-NEXT: br i1 [[TMP2]], label %[[BB16]], label %[[BB10:.*]] -; CHECK: [[BB10]]: +; CHECK-NEXT: [[UNNAMEDTMP8:%.*]] = phi i32 [ [[UNNAMEDTMP18:%.*]], %[[BB16:.*]] ], [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ] +; CHECK-NEXT: [[UNNAMEDTMP2:%.*]] = icmp sgt i32 [[UNNAMEDTMP8]], 10 +; CHECK-NEXT: br i1 [[UNNAMEDTMP2]], label %[[BB16]], label %[[UNNAMEDBB10:.*]] +; CHECK: [[UNNAMEDBB10]]: ; CHECK-NEXT: br label %[[BB16]] ; CHECK: [[BB16]]: -; CHECK-NEXT: [[TMP17:%.*]] = phi i32 [ [[TMP8]], %[[BB10]] ], [ 1, %[[DOTLR_PH_I]] ] -; CHECK-NEXT: [[TMP18]] = add nsw i32 [[TMP8]], 1 -; CHECK-NEXT: [[TMP19:%.*]] = icmp slt i32 [[TMP18]], 4 -; CHECK-NEXT: br i1 [[TMP19]], label %[[DOTLR_PH_I]], label %[[F1_EXIT_LOOPEXIT]], !llvm.loop [[LOOP5:![0-9]+]] +; CHECK-NEXT: [[UNNAMEDTMP17:%.*]] = phi i32 [ [[UNNAMEDTMP8]], %[[UNNAMEDBB10]] ], [ 1, %[[DOTLR_PH_I]] ] +; CHECK-NEXT: [[UNNAMEDTMP18]] = add nsw i32 [[UNNAMEDTMP8]], 1 +; CHECK-NEXT: [[UNNAMEDTMP19:%.*]] = icmp slt i32 [[UNNAMEDTMP18]], 4 +; CHECK-NEXT: br i1 [[UNNAMEDTMP19]], label %[[DOTLR_PH_I]], label %[[F1_EXIT_LOOPEXIT]], !llvm.loop [[LOOP5:![0-9]+]] ; CHECK: [[F1_EXIT_LOOPEXIT]]: -; CHECK-NEXT: [[DOTLCSSA:%.*]] = phi i32 [ [[TMP17]], %[[BB16]] ], [ [[TMP4]], %[[MIDDLE_BLOCK]] ] +; CHECK-NEXT: [[DOTLCSSA:%.*]] = phi i32 [ [[UNNAMEDTMP17]], %[[BB16]] ], [ [[TMP5]], %[[MIDDLE_BLOCK]] ] ; CHECK-NEXT: ret i32 [[DOTLCSSA]] ; bb: @@ -169,11 +169,11 @@ define i32 @test3(i32 %N) { ; CHECK-NEXT: [[SMAX:%.*]] = call i32 @llvm.smax.i32(i32 [[TMP0]], i32 4) ; CHECK-NEXT: [[TMP1:%.*]] = sub i32 [[SMAX]], [[B_PROMOTED]] ; CHECK-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i32 [[TMP1]], 2 -; CHECK-NEXT: br i1 [[MIN_ITERS_CHECK]], label %[[_LR_PH_I1:.*]], label 
%[[VECTOR_PH:.*]] +; CHECK-NEXT: br i1 [[MIN_ITERS_CHECK]], label %[[SCALAR_PH:.*]], label %[[VECTOR_PH:.*]] ; CHECK: [[VECTOR_PH]]: ; CHECK-NEXT: [[N_MOD_VF:%.*]] = urem i32 [[TMP1]], 2 ; CHECK-NEXT: [[N_VEC:%.*]] = sub i32 [[TMP1]], [[N_MOD_VF]] -; CHECK-NEXT: [[OFFSET_IDX:%.*]] = add i32 [[B_PROMOTED]], [[N_VEC]] +; CHECK-NEXT: [[TMP2:%.*]] = add i32 [[B_PROMOTED]], [[N_VEC]] ; CHECK-NEXT: [[DOTSPLATINSERT:%.*]] = insertelement <2 x i32> poison, i32 [[B_PROMOTED]], i64 0 ; CHECK-NEXT: [[DOTSPLAT:%.*]] = shufflevector <2 x i32> [[DOTSPLATINSERT]], <2 x i32> poison, <2 x i32> zeroinitializer ; CHECK-NEXT: [[INDUCTION:%.*]] = add <2 x i32> [[DOTSPLAT]], @@ -183,39 +183,39 @@ define i32 @test3(i32 %N) { ; CHECK: [[VECTOR_BODY]]: ; CHECK-NEXT: [[INDEX:%.*]] = phi i32 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ] ; CHECK-NEXT: [[VEC_IND:%.*]] = phi <2 x i32> [ [[INDUCTION]], %[[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], %[[VECTOR_BODY]] ] -; CHECK-NEXT: [[TMP2:%.*]] = icmp sgt <2 x i32> [[VEC_IND]], splat (i32 10) -; CHECK-NEXT: [[TMP3:%.*]] = xor <2 x i1> [[TMP2]], splat (i1 true) -; CHECK-NEXT: [[TMP4:%.*]] = icmp sgt <2 x i32> [[VEC_IND]], [[BROADCAST_SPLAT]] -; CHECK-NEXT: [[TMP5:%.*]] = select <2 x i1> [[TMP3]], <2 x i1> [[TMP4]], <2 x i1> zeroinitializer -; CHECK-NEXT: [[PREDPHI:%.*]] = select <2 x i1> [[TMP2]], <2 x i32> splat (i32 1), <2 x i32> zeroinitializer -; CHECK-NEXT: [[PREDPHI1:%.*]] = select <2 x i1> [[TMP5]], <2 x i32> splat (i32 2), <2 x i32> [[PREDPHI]] +; CHECK-NEXT: [[TMP3:%.*]] = icmp sgt <2 x i32> [[VEC_IND]], splat (i32 10) +; CHECK-NEXT: [[TMP4:%.*]] = xor <2 x i1> [[TMP3]], splat (i1 true) +; CHECK-NEXT: [[TMP5:%.*]] = icmp sgt <2 x i32> [[VEC_IND]], [[BROADCAST_SPLAT]] +; CHECK-NEXT: [[TMP6:%.*]] = select <2 x i1> [[TMP4]], <2 x i1> [[TMP5]], <2 x i1> zeroinitializer +; CHECK-NEXT: [[PREDPHI:%.*]] = select <2 x i1> [[TMP3]], <2 x i32> splat (i32 1), <2 x i32> zeroinitializer +; CHECK-NEXT: [[PREDPHI1:%.*]] = select <2 x i1> 
[[TMP6]], <2 x i32> splat (i32 2), <2 x i32> [[PREDPHI]] ; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i32 [[INDEX]], 2 ; CHECK-NEXT: [[VEC_IND_NEXT]] = add <2 x i32> [[VEC_IND]], splat (i32 2) -; CHECK-NEXT: [[TMP6:%.*]] = icmp eq i32 [[INDEX_NEXT]], [[N_VEC]] -; CHECK-NEXT: br i1 [[TMP6]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP6:![0-9]+]] +; CHECK-NEXT: [[TMP7:%.*]] = icmp eq i32 [[INDEX_NEXT]], [[N_VEC]] +; CHECK-NEXT: br i1 [[TMP7]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP6:![0-9]+]] ; CHECK: [[MIDDLE_BLOCK]]: -; CHECK-NEXT: [[TMP7:%.*]] = extractelement <2 x i32> [[PREDPHI1]], i32 1 +; CHECK-NEXT: [[TMP8:%.*]] = extractelement <2 x i32> [[PREDPHI1]], i32 1 ; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i32 [[TMP1]], [[N_VEC]] -; CHECK-NEXT: br i1 [[CMP_N]], label %[[F1_EXIT_LOOPEXIT:.*]], label %[[_LR_PH_I1]] -; CHECK: [[_LR_PH_I1]]: -; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i32 [ [[OFFSET_IDX]], %[[MIDDLE_BLOCK]] ], [ [[B_PROMOTED]], %[[BB]] ] +; CHECK-NEXT: br i1 [[CMP_N]], label %[[F1_EXIT_LOOPEXIT:.*]], label %[[SCALAR_PH]] +; CHECK: [[SCALAR_PH]]: +; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i32 [ [[TMP2]], %[[MIDDLE_BLOCK]] ], [ [[B_PROMOTED]], %[[BB]] ] ; CHECK-NEXT: br label %[[DOTLR_PH_I:.*]] ; CHECK: [[_LR_PH_I:.*:]] -; CHECK-NEXT: [[TMP8:%.*]] = phi i32 [ [[TMP18:%.*]], %[[BB16:.*]] ], [ [[BC_RESUME_VAL]], %[[_LR_PH_I1]] ] -; CHECK-NEXT: [[TMP2:%.*]] = icmp sgt i32 [[TMP8]], 10 -; CHECK-NEXT: br i1 [[TMP2]], label %[[BB16]], label %[[BB10:.*]] -; CHECK: [[BB10]]: -; CHECK-NEXT: [[CMP:%.*]] = icmp sgt i32 [[TMP8]], [[N]] -; CHECK-NEXT: br i1 [[CMP]], label %[[BB12:.*]], label %[[BB16]] -; CHECK: [[BB12]]: +; CHECK-NEXT: [[UNNAMEDTMP8:%.*]] = phi i32 [ [[UNNAMEDTMP18:%.*]], %[[BB16:.*]] ], [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ] +; CHECK-NEXT: [[UNNAMEDTMP2:%.*]] = icmp sgt i32 [[UNNAMEDTMP8]], 10 +; CHECK-NEXT: br i1 [[UNNAMEDTMP2]], label %[[BB16]], label %[[UNNAMEDBB10:.*]] +; CHECK: [[UNNAMEDBB10]]: +; 
CHECK-NEXT: [[CMP:%.*]] = icmp sgt i32 [[UNNAMEDTMP8]], [[N]] +; CHECK-NEXT: br i1 [[CMP]], label %[[UNNAMEDBB12:.*]], label %[[BB16]] +; CHECK: [[UNNAMEDBB12]]: ; CHECK-NEXT: br label %[[BB16]] ; CHECK: [[BB16]]: -; CHECK-NEXT: [[TMP17:%.*]] = phi i32 [ 0, %[[BB10]] ], [ 1, %[[DOTLR_PH_I]] ], [ 2, %[[BB12]] ] -; CHECK-NEXT: [[TMP18]] = add nsw i32 [[TMP8]], 1 -; CHECK-NEXT: [[TMP19:%.*]] = icmp slt i32 [[TMP18]], 4 -; CHECK-NEXT: br i1 [[TMP19]], label %[[DOTLR_PH_I]], label %[[F1_EXIT_LOOPEXIT]], !llvm.loop [[LOOP7:![0-9]+]] +; CHECK-NEXT: [[UNNAMEDTMP17:%.*]] = phi i32 [ 0, %[[UNNAMEDBB10]] ], [ 1, %[[DOTLR_PH_I]] ], [ 2, %[[UNNAMEDBB12]] ] +; CHECK-NEXT: [[UNNAMEDTMP18]] = add nsw i32 [[UNNAMEDTMP8]], 1 +; CHECK-NEXT: [[UNNAMEDTMP19:%.*]] = icmp slt i32 [[UNNAMEDTMP18]], 4 +; CHECK-NEXT: br i1 [[UNNAMEDTMP19]], label %[[DOTLR_PH_I]], label %[[F1_EXIT_LOOPEXIT]], !llvm.loop [[LOOP7:![0-9]+]] ; CHECK: [[F1_EXIT_LOOPEXIT]]: -; CHECK-NEXT: [[DOTLCSSA:%.*]] = phi i32 [ [[TMP17]], %[[BB16]] ], [ [[TMP7]], %[[MIDDLE_BLOCK]] ] +; CHECK-NEXT: [[DOTLCSSA:%.*]] = phi i32 [ [[UNNAMEDTMP17]], %[[BB16]] ], [ [[TMP8]], %[[MIDDLE_BLOCK]] ] ; CHECK-NEXT: ret i32 [[DOTLCSSA]] ; bb: @@ -258,11 +258,11 @@ define i32 @test4(i32 %N) { ; CHECK-NEXT: [[SMAX:%.*]] = call i32 @llvm.smax.i32(i32 [[TMP0]], i32 4) ; CHECK-NEXT: [[TMP1:%.*]] = sub i32 [[SMAX]], [[B_PROMOTED]] ; CHECK-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i32 [[TMP1]], 2 -; CHECK-NEXT: br i1 [[MIN_ITERS_CHECK]], label %[[_LR_PH_I1:.*]], label %[[VECTOR_PH:.*]] +; CHECK-NEXT: br i1 [[MIN_ITERS_CHECK]], label %[[SCALAR_PH:.*]], label %[[VECTOR_PH:.*]] ; CHECK: [[VECTOR_PH]]: ; CHECK-NEXT: [[N_MOD_VF:%.*]] = urem i32 [[TMP1]], 2 ; CHECK-NEXT: [[N_VEC:%.*]] = sub i32 [[TMP1]], [[N_MOD_VF]] -; CHECK-NEXT: [[OFFSET_IDX:%.*]] = add i32 [[B_PROMOTED]], [[N_VEC]] +; CHECK-NEXT: [[TMP2:%.*]] = add i32 [[B_PROMOTED]], [[N_VEC]] ; CHECK-NEXT: [[DOTSPLATINSERT:%.*]] = insertelement <2 x i32> poison, i32 [[B_PROMOTED]], i64 0 ; 
CHECK-NEXT: [[DOTSPLAT:%.*]] = shufflevector <2 x i32> [[DOTSPLATINSERT]], <2 x i32> poison, <2 x i32> zeroinitializer ; CHECK-NEXT: [[INDUCTION:%.*]] = add <2 x i32> [[DOTSPLAT]], @@ -270,32 +270,32 @@ define i32 @test4(i32 %N) { ; CHECK: [[VECTOR_BODY]]: ; CHECK-NEXT: [[INDEX:%.*]] = phi i32 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ] ; CHECK-NEXT: [[VEC_IND:%.*]] = phi <2 x i32> [ [[INDUCTION]], %[[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], %[[VECTOR_BODY]] ] -; CHECK-NEXT: [[TMP2:%.*]] = icmp sgt <2 x i32> [[VEC_IND]], splat (i32 10) -; CHECK-NEXT: [[PREDPHI:%.*]] = select <2 x i1> [[TMP2]], <2 x i32> splat (i32 1), <2 x i32> zeroinitializer +; CHECK-NEXT: [[TMP3:%.*]] = icmp sgt <2 x i32> [[VEC_IND]], splat (i32 10) +; CHECK-NEXT: [[PREDPHI:%.*]] = select <2 x i1> [[TMP3]], <2 x i32> splat (i32 1), <2 x i32> zeroinitializer ; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i32 [[INDEX]], 2 ; CHECK-NEXT: [[VEC_IND_NEXT]] = add <2 x i32> [[VEC_IND]], splat (i32 2) -; CHECK-NEXT: [[TMP3:%.*]] = icmp eq i32 [[INDEX_NEXT]], [[N_VEC]] -; CHECK-NEXT: br i1 [[TMP3]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP8:![0-9]+]] +; CHECK-NEXT: [[TMP4:%.*]] = icmp eq i32 [[INDEX_NEXT]], [[N_VEC]] +; CHECK-NEXT: br i1 [[TMP4]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP8:![0-9]+]] ; CHECK: [[MIDDLE_BLOCK]]: -; CHECK-NEXT: [[TMP4:%.*]] = extractelement <2 x i32> [[PREDPHI]], i32 1 +; CHECK-NEXT: [[TMP5:%.*]] = extractelement <2 x i32> [[PREDPHI]], i32 1 ; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i32 [[TMP1]], [[N_VEC]] -; CHECK-NEXT: br i1 [[CMP_N]], label %[[F1_EXIT_LOOPEXIT_LOOPEXIT:.*]], label %[[_LR_PH_I1]] -; CHECK: [[_LR_PH_I1]]: -; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i32 [ [[OFFSET_IDX]], %[[MIDDLE_BLOCK]] ], [ [[B_PROMOTED]], %[[DOTLR_PH_I_PREHEADER]] ] +; CHECK-NEXT: br i1 [[CMP_N]], label %[[F1_EXIT_LOOPEXIT_LOOPEXIT:.*]], label %[[SCALAR_PH]] +; CHECK: [[SCALAR_PH]]: +; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi 
i32 [ [[TMP2]], %[[MIDDLE_BLOCK]] ], [ [[B_PROMOTED]], %[[DOTLR_PH_I_PREHEADER]] ] ; CHECK-NEXT: br label %[[DOTLR_PH_I:.*]] ; CHECK: [[_LR_PH_I:.*:]] -; CHECK-NEXT: [[TMP8:%.*]] = phi i32 [ [[TMP18:%.*]], %[[BB16:.*]] ], [ [[BC_RESUME_VAL]], %[[_LR_PH_I1]] ] -; CHECK-NEXT: [[TMP2:%.*]] = icmp sgt i32 [[TMP8]], 10 -; CHECK-NEXT: br i1 [[TMP2]], label %[[BB16]], label %[[BB10:.*]] -; CHECK: [[BB10]]: +; CHECK-NEXT: [[UNNAMEDTMP8:%.*]] = phi i32 [ [[UNNAMEDTMP18:%.*]], %[[BB16:.*]] ], [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ] +; CHECK-NEXT: [[UNNAMEDTMP2:%.*]] = icmp sgt i32 [[UNNAMEDTMP8]], 10 +; CHECK-NEXT: br i1 [[UNNAMEDTMP2]], label %[[BB16]], label %[[UNNAMEDBB10:.*]] +; CHECK: [[UNNAMEDBB10]]: ; CHECK-NEXT: br label %[[BB16]] ; CHECK: [[BB16]]: -; CHECK-NEXT: [[TMP17:%.*]] = phi i32 [ 0, %[[BB10]] ], [ 1, %[[DOTLR_PH_I]] ] -; CHECK-NEXT: [[TMP18]] = add nsw i32 [[TMP8]], 1 -; CHECK-NEXT: [[TMP19:%.*]] = icmp slt i32 [[TMP18]], 4 -; CHECK-NEXT: br i1 [[TMP19]], label %[[DOTLR_PH_I]], label %[[F1_EXIT_LOOPEXIT_LOOPEXIT]], !llvm.loop [[LOOP9:![0-9]+]] +; CHECK-NEXT: [[UNNAMEDTMP17:%.*]] = phi i32 [ 0, %[[UNNAMEDBB10]] ], [ 1, %[[DOTLR_PH_I]] ] +; CHECK-NEXT: [[UNNAMEDTMP18]] = add nsw i32 [[UNNAMEDTMP8]], 1 +; CHECK-NEXT: [[UNNAMEDTMP19:%.*]] = icmp slt i32 [[UNNAMEDTMP18]], 4 +; CHECK-NEXT: br i1 [[UNNAMEDTMP19]], label %[[DOTLR_PH_I]], label %[[F1_EXIT_LOOPEXIT_LOOPEXIT]], !llvm.loop [[LOOP9:![0-9]+]] ; CHECK: [[F1_EXIT_LOOPEXIT_LOOPEXIT]]: -; CHECK-NEXT: [[TMP17_LCSSA:%.*]] = phi i32 [ [[TMP17]], %[[BB16]] ], [ [[TMP4]], %[[MIDDLE_BLOCK]] ] +; CHECK-NEXT: [[TMP17_LCSSA:%.*]] = phi i32 [ [[UNNAMEDTMP17]], %[[BB16]] ], [ [[TMP5]], %[[MIDDLE_BLOCK]] ] ; CHECK-NEXT: br label %[[F1_EXIT_LOOPEXIT]] ; CHECK: [[F1_EXIT_LOOPEXIT]]: ; CHECK-NEXT: [[DOTLCSSA:%.*]] = phi i32 [ 2, %[[BB]] ], [ [[TMP17_LCSSA]], %[[F1_EXIT_LOOPEXIT_LOOPEXIT]] ] @@ -343,12 +343,12 @@ define i32 @reduction_sum(i32 %n, ptr noalias nocapture %A, ptr noalias nocaptur ; CHECK-NEXT: [[C3:%.*]] = load 
i32, ptr [[C2]], align 4 ; CHECK-NEXT: [[C4:%.*]] = getelementptr inbounds i32, ptr [[B]], i64 [[INDVARS_IV]] ; CHECK-NEXT: [[C5:%.*]] = load i32, ptr [[C4]], align 4 -; CHECK-NEXT: [[TMP2:%.*]] = icmp sgt i32 [[SUM_02]], 10 -; CHECK-NEXT: br i1 [[TMP2]], label %[[BB16]], label %[[BB10:.*]] -; CHECK: [[BB10]]: +; CHECK-NEXT: [[UNNAMEDTMP2:%.*]] = icmp sgt i32 [[SUM_02]], 10 +; CHECK-NEXT: br i1 [[UNNAMEDTMP2]], label %[[BB16]], label %[[UNNAMEDBB10:.*]] +; CHECK: [[UNNAMEDBB10]]: ; CHECK-NEXT: br label %[[BB16]] ; CHECK: [[BB16]]: -; CHECK-NEXT: [[TMP17:%.*]] = phi i32 [ [[SUM_02]], %[[BB10]] ], [ 1, %[[HEADER]] ] +; CHECK-NEXT: [[UNNAMEDTMP17:%.*]] = phi i32 [ [[SUM_02]], %[[UNNAMEDBB10]] ], [ 1, %[[HEADER]] ] ; CHECK-NEXT: [[C6:%.*]] = trunc i64 [[INDVARS_IV]] to i32 ; CHECK-NEXT: [[C7:%.*]] = add i32 [[SUM_02]], [[C6]] ; CHECK-NEXT: [[C8:%.*]] = add i32 [[C7]], [[C3]] @@ -358,7 +358,7 @@ define i32 @reduction_sum(i32 %n, ptr noalias nocapture %A, ptr noalias nocaptur ; CHECK-NEXT: [[EXITCOND:%.*]] = icmp eq i32 [[LFTR_WIDEIV]], [[N]] ; CHECK-NEXT: br i1 [[EXITCOND]], label %[[DOT_CRIT_EDGE_LOOPEXIT:.*]], label %[[HEADER]] ; CHECK: [[__CRIT_EDGE_LOOPEXIT:.*:]] -; CHECK-NEXT: [[TMP17_LCSSA:%.*]] = phi i32 [ [[TMP17]], %[[BB16]] ] +; CHECK-NEXT: [[TMP17_LCSSA:%.*]] = phi i32 [ [[UNNAMEDTMP17]], %[[BB16]] ] ; CHECK-NEXT: [[C9_LCSSA:%.*]] = phi i32 [ [[C9]], %[[BB16]] ] ; CHECK-NEXT: br [[DOT_CRIT_EDGE]] ; CHECK: [[__CRIT_EDGE:.*:]] @@ -410,17 +410,17 @@ define i32 @cyclic_dep_with_indvar() { ; CHECK-NEXT: br label %[[DOTLR_PH_I:.*]] ; CHECK: [[_LR_PH_I:.*:]] ; CHECK-NEXT: [[IV:%.*]] = phi i32 [ [[IVNEXT:%.*]], %[[BB16:.*]] ], [ [[B_PROMOTED]], %[[BB]] ] -; CHECK-NEXT: [[TMP2:%.*]] = icmp sgt i32 [[IV]], 10 -; CHECK-NEXT: br i1 [[TMP2]], label %[[BB16]], label %[[BB10:.*]] -; CHECK: [[BB10]]: +; CHECK-NEXT: [[UNNAMEDTMP2:%.*]] = icmp sgt i32 [[IV]], 10 +; CHECK-NEXT: br i1 [[UNNAMEDTMP2]], label %[[BB16]], label %[[UNNAMEDBB10:.*]] +; CHECK: [[UNNAMEDBB10]]: ; 
CHECK-NEXT: br label %[[BB16]] ; CHECK: [[BB16]]: -; CHECK-NEXT: [[TMP17:%.*]] = phi i32 [ 0, %[[BB10]] ], [ [[IV]], %[[DOTLR_PH_I]] ] -; CHECK-NEXT: [[IVNEXT]] = add nsw i32 [[TMP17]], 1 -; CHECK-NEXT: [[TMP19:%.*]] = icmp slt i32 [[IVNEXT]], 4 -; CHECK-NEXT: br i1 [[TMP19]], label %[[DOTLR_PH_I]], label %[[F1_EXIT_LOOPEXIT:.*]] +; CHECK-NEXT: [[UNNAMEDTMP17:%.*]] = phi i32 [ 0, %[[UNNAMEDBB10]] ], [ [[IV]], %[[DOTLR_PH_I]] ] +; CHECK-NEXT: [[IVNEXT]] = add nsw i32 [[UNNAMEDTMP17]], 1 +; CHECK-NEXT: [[UNNAMEDTMP19:%.*]] = icmp slt i32 [[IVNEXT]], 4 +; CHECK-NEXT: br i1 [[UNNAMEDTMP19]], label %[[DOTLR_PH_I]], label %[[F1_EXIT_LOOPEXIT:.*]] ; CHECK: [[F1_EXIT_LOOPEXIT]]: -; CHECK-NEXT: [[DOTLCSSA:%.*]] = phi i32 [ [[TMP17]], %[[BB16]] ] +; CHECK-NEXT: [[DOTLCSSA:%.*]] = phi i32 [ [[UNNAMEDTMP17]], %[[BB16]] ] ; CHECK-NEXT: ret i32 [[DOTLCSSA]] ; bb: @@ -458,16 +458,16 @@ define i32 @not_valid_reduction(i32 %n, ptr noalias nocapture %A) nounwind uwtab ; CHECK-NEXT: br label %[[FOR_BODY:.*]] ; CHECK: [[FOR_BODY]]: ; CHECK-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ [[INDVARS_IV_NEXT:%.*]], %[[LATCH:.*]] ], [ 0, %[[FOR_BODY_PREHEADER]] ] -; CHECK-NEXT: [[X_05:%.*]] = phi i32 [ [[TMP17:%.*]], %[[LATCH]] ], [ 0, %[[FOR_BODY_PREHEADER]] ] +; CHECK-NEXT: [[X_05:%.*]] = phi i32 [ [[UNNAMEDTMP17:%.*]], %[[LATCH]] ], [ 0, %[[FOR_BODY_PREHEADER]] ] ; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 [[INDVARS_IV]] -; CHECK-NEXT: [[TMP0:%.*]] = load i32, ptr [[ARRAYIDX]], align 4 -; CHECK-NEXT: [[TMP2:%.*]] = icmp sgt i64 [[INDVARS_IV]], 10 -; CHECK-NEXT: [[SUB:%.*]] = sub nsw i32 [[X_05]], [[TMP0]] -; CHECK-NEXT: br i1 [[TMP2]], label %[[BB16:.*]], label %[[BB10:.*]] -; CHECK: [[BB10]]: -; CHECK-NEXT: br label %[[BB16]] -; CHECK: [[BB16]]: -; CHECK-NEXT: [[TMP17]] = phi i32 [ 1, %[[BB10]] ], [ [[SUB]], %[[FOR_BODY]] ] +; CHECK-NEXT: [[UNNAMEDTMP0:%.*]] = load i32, ptr [[ARRAYIDX]], align 4 +; CHECK-NEXT: [[UNNAMEDTMP2:%.*]] = icmp sgt i64 [[INDVARS_IV]], 10 
+; CHECK-NEXT: [[SUB:%.*]] = sub nsw i32 [[X_05]], [[UNNAMEDTMP0]] +; CHECK-NEXT: br i1 [[UNNAMEDTMP2]], label %[[UNNAMEDBB16:.*]], label %[[UNNAMEDBB10:.*]] +; CHECK: [[UNNAMEDBB10]]: +; CHECK-NEXT: br label %[[UNNAMEDBB16]] +; CHECK: [[UNNAMEDBB16]]: +; CHECK-NEXT: [[UNNAMEDTMP17]] = phi i32 [ 1, %[[UNNAMEDBB10]] ], [ [[SUB]], %[[FOR_BODY]] ] ; CHECK-NEXT: br label %[[LATCH]] ; CHECK: [[LATCH]]: ; CHECK-NEXT: [[INDVARS_IV_NEXT]] = add i64 [[INDVARS_IV]], 1 @@ -475,7 +475,7 @@ define i32 @not_valid_reduction(i32 %n, ptr noalias nocapture %A) nounwind uwtab ; CHECK-NEXT: [[EXITCOND:%.*]] = icmp eq i32 [[LFTR_WIDEIV]], [[N]] ; CHECK-NEXT: br i1 [[EXITCOND]], label %[[FOR_END_LOOPEXIT:.*]], label %[[FOR_BODY]] ; CHECK: [[FOR_END_LOOPEXIT]]: -; CHECK-NEXT: [[TMP17_LCSSA:%.*]] = phi i32 [ [[TMP17]], %[[LATCH]] ] +; CHECK-NEXT: [[TMP17_LCSSA:%.*]] = phi i32 [ [[UNNAMEDTMP17]], %[[LATCH]] ] ; CHECK-NEXT: br label %[[FOR_END]] ; CHECK: [[FOR_END]]: ; CHECK-NEXT: [[X_0_LCSSA:%.*]] = phi i32 [ 0, %[[ENTRY]] ], [ [[TMP17_LCSSA]], %[[FOR_END_LOOPEXIT]] ] @@ -520,11 +520,11 @@ define i8 @outside_user_non_phi() { ; CHECK-NEXT: [[SMAX:%.*]] = call i32 @llvm.smax.i32(i32 [[TMP0]], i32 4) ; CHECK-NEXT: [[TMP1:%.*]] = sub i32 [[SMAX]], [[B_PROMOTED]] ; CHECK-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i32 [[TMP1]], 2 -; CHECK-NEXT: br i1 [[MIN_ITERS_CHECK]], label %[[_LR_PH_I1:.*]], label %[[VECTOR_PH:.*]] +; CHECK-NEXT: br i1 [[MIN_ITERS_CHECK]], label %[[SCALAR_PH:.*]], label %[[VECTOR_PH:.*]] ; CHECK: [[VECTOR_PH]]: ; CHECK-NEXT: [[N_MOD_VF:%.*]] = urem i32 [[TMP1]], 2 ; CHECK-NEXT: [[N_VEC:%.*]] = sub i32 [[TMP1]], [[N_MOD_VF]] -; CHECK-NEXT: [[OFFSET_IDX:%.*]] = add i32 [[B_PROMOTED]], [[N_VEC]] +; CHECK-NEXT: [[TMP2:%.*]] = add i32 [[B_PROMOTED]], [[N_VEC]] ; CHECK-NEXT: [[DOTSPLATINSERT:%.*]] = insertelement <2 x i32> poison, i32 [[B_PROMOTED]], i64 0 ; CHECK-NEXT: [[DOTSPLAT:%.*]] = shufflevector <2 x i32> [[DOTSPLATINSERT]], <2 x i32> poison, <2 x i32> zeroinitializer ; 
CHECK-NEXT: [[INDUCTION:%.*]] = add <2 x i32> [[DOTSPLAT]], @@ -532,34 +532,34 @@ define i8 @outside_user_non_phi() { ; CHECK: [[VECTOR_BODY]]: ; CHECK-NEXT: [[INDEX:%.*]] = phi i32 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ] ; CHECK-NEXT: [[VEC_IND:%.*]] = phi <2 x i32> [ [[INDUCTION]], %[[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], %[[VECTOR_BODY]] ] -; CHECK-NEXT: [[TMP2:%.*]] = icmp sgt <2 x i32> [[VEC_IND]], splat (i32 10) -; CHECK-NEXT: [[PREDPHI:%.*]] = select <2 x i1> [[TMP2]], <2 x i32> splat (i32 1), <2 x i32> zeroinitializer -; CHECK-NEXT: [[TMP3:%.*]] = trunc <2 x i32> [[PREDPHI]] to <2 x i8> +; CHECK-NEXT: [[TMP3:%.*]] = icmp sgt <2 x i32> [[VEC_IND]], splat (i32 10) +; CHECK-NEXT: [[PREDPHI:%.*]] = select <2 x i1> [[TMP3]], <2 x i32> splat (i32 1), <2 x i32> zeroinitializer +; CHECK-NEXT: [[TMP4:%.*]] = trunc <2 x i32> [[PREDPHI]] to <2 x i8> ; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i32 [[INDEX]], 2 ; CHECK-NEXT: [[VEC_IND_NEXT]] = add <2 x i32> [[VEC_IND]], splat (i32 2) -; CHECK-NEXT: [[TMP4:%.*]] = icmp eq i32 [[INDEX_NEXT]], [[N_VEC]] -; CHECK-NEXT: br i1 [[TMP4]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP10:![0-9]+]] +; CHECK-NEXT: [[TMP5:%.*]] = icmp eq i32 [[INDEX_NEXT]], [[N_VEC]] +; CHECK-NEXT: br i1 [[TMP5]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP10:![0-9]+]] ; CHECK: [[MIDDLE_BLOCK]]: -; CHECK-NEXT: [[TMP5:%.*]] = extractelement <2 x i8> [[TMP3]], i32 1 +; CHECK-NEXT: [[TMP6:%.*]] = extractelement <2 x i8> [[TMP4]], i32 1 ; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i32 [[TMP1]], [[N_VEC]] -; CHECK-NEXT: br i1 [[CMP_N]], label %[[F1_EXIT_LOOPEXIT:.*]], label %[[_LR_PH_I1]] -; CHECK: [[_LR_PH_I1]]: -; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i32 [ [[OFFSET_IDX]], %[[MIDDLE_BLOCK]] ], [ [[B_PROMOTED]], %[[BB]] ] +; CHECK-NEXT: br i1 [[CMP_N]], label %[[F1_EXIT_LOOPEXIT:.*]], label %[[SCALAR_PH]] +; CHECK: [[SCALAR_PH]]: +; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i32 [ [[TMP2]], 
%[[MIDDLE_BLOCK]] ], [ [[B_PROMOTED]], %[[BB]] ] ; CHECK-NEXT: br label %[[DOTLR_PH_I:.*]] ; CHECK: [[_LR_PH_I:.*:]] -; CHECK-NEXT: [[TMP8:%.*]] = phi i32 [ [[TMP18:%.*]], %[[BB16:.*]] ], [ [[BC_RESUME_VAL]], %[[_LR_PH_I1]] ] -; CHECK-NEXT: [[TMP2:%.*]] = icmp sgt i32 [[TMP8]], 10 -; CHECK-NEXT: br i1 [[TMP2]], label %[[BB16]], label %[[BB10:.*]] -; CHECK: [[BB10]]: +; CHECK-NEXT: [[UNNAMEDTMP8:%.*]] = phi i32 [ [[UNNAMEDTMP18:%.*]], %[[BB16:.*]] ], [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ] +; CHECK-NEXT: [[UNNAMEDTMP2:%.*]] = icmp sgt i32 [[UNNAMEDTMP8]], 10 +; CHECK-NEXT: br i1 [[UNNAMEDTMP2]], label %[[BB16]], label %[[UNNAMEDBB10:.*]] +; CHECK: [[UNNAMEDBB10]]: ; CHECK-NEXT: br label %[[BB16]] ; CHECK: [[BB16]]: -; CHECK-NEXT: [[TMP17:%.*]] = phi i32 [ 0, %[[BB10]] ], [ 1, %[[DOTLR_PH_I]] ] -; CHECK-NEXT: [[TMP17_TRUNC:%.*]] = trunc i32 [[TMP17]] to i8 -; CHECK-NEXT: [[TMP18]] = add nsw i32 [[TMP8]], 1 -; CHECK-NEXT: [[TMP19:%.*]] = icmp slt i32 [[TMP18]], 4 -; CHECK-NEXT: br i1 [[TMP19]], label %[[DOTLR_PH_I]], label %[[F1_EXIT_LOOPEXIT]], !llvm.loop [[LOOP11:![0-9]+]] +; CHECK-NEXT: [[UNNAMEDTMP17:%.*]] = phi i32 [ 0, %[[UNNAMEDBB10]] ], [ 1, %[[DOTLR_PH_I]] ] +; CHECK-NEXT: [[TMP17_TRUNC:%.*]] = trunc i32 [[UNNAMEDTMP17]] to i8 +; CHECK-NEXT: [[UNNAMEDTMP18]] = add nsw i32 [[UNNAMEDTMP8]], 1 +; CHECK-NEXT: [[UNNAMEDTMP19:%.*]] = icmp slt i32 [[UNNAMEDTMP18]], 4 +; CHECK-NEXT: br i1 [[UNNAMEDTMP19]], label %[[DOTLR_PH_I]], label %[[F1_EXIT_LOOPEXIT]], !llvm.loop [[LOOP11:![0-9]+]] ; CHECK: [[F1_EXIT_LOOPEXIT]]: -; CHECK-NEXT: [[DOTLCSSA:%.*]] = phi i8 [ [[TMP17_TRUNC]], %[[BB16]] ], [ [[TMP5]], %[[MIDDLE_BLOCK]] ] +; CHECK-NEXT: [[DOTLCSSA:%.*]] = phi i8 [ [[TMP17_TRUNC]], %[[BB16]] ], [ [[TMP6]], %[[MIDDLE_BLOCK]] ] ; CHECK-NEXT: ret i8 [[DOTLCSSA]] ; bb: @@ -651,46 +651,46 @@ define i32 @sum_arrays_outside_use(ptr %B, ptr %A, ptr %C, i32 %N) { ; CHECK-NEXT: [[SMAX:%.*]] = call i32 @llvm.smax.i32(i32 [[N]], i32 [[TMP0]]) ; CHECK-NEXT: [[TMP1:%.*]] = sub i32 
[[SMAX]], [[B_PROMOTED]] ; CHECK-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i32 [[TMP1]], 2 -; CHECK-NEXT: br i1 [[MIN_ITERS_CHECK]], label %[[_LR_PH_I:.*]], label %[[VECTOR_MEMCHECK:.*]] +; CHECK-NEXT: br i1 [[MIN_ITERS_CHECK]], label %[[SCALAR_PH:.*]], label %[[VECTOR_MEMCHECK:.*]] ; CHECK: [[VECTOR_MEMCHECK]]: ; CHECK-NEXT: [[TMP2:%.*]] = sub i32 [[C1]], [[B2]] ; CHECK-NEXT: [[DIFF_CHECK:%.*]] = icmp ult i32 [[TMP2]], 8 ; CHECK-NEXT: [[TMP3:%.*]] = sub i32 [[C1]], [[A3]] ; CHECK-NEXT: [[DIFF_CHECK4:%.*]] = icmp ult i32 [[TMP3]], 8 ; CHECK-NEXT: [[CONFLICT_RDX:%.*]] = or i1 [[DIFF_CHECK]], [[DIFF_CHECK4]] -; CHECK-NEXT: br i1 [[CONFLICT_RDX]], label %[[_LR_PH_I]], label %[[VECTOR_PH:.*]] +; CHECK-NEXT: br i1 [[CONFLICT_RDX]], label %[[SCALAR_PH]], label %[[VECTOR_PH:.*]] ; CHECK: [[VECTOR_PH]]: ; CHECK-NEXT: [[N_MOD_VF:%.*]] = urem i32 [[TMP1]], 2 ; CHECK-NEXT: [[N_VEC:%.*]] = sub i32 [[TMP1]], [[N_MOD_VF]] -; CHECK-NEXT: [[OFFSET_IDX:%.*]] = add i32 [[B_PROMOTED]], [[N_VEC]] +; CHECK-NEXT: [[TMP4:%.*]] = add i32 [[B_PROMOTED]], [[N_VEC]] ; CHECK-NEXT: br label %[[VECTOR_BODY:.*]] ; CHECK: [[VECTOR_BODY]]: ; CHECK-NEXT: [[INDEX:%.*]] = phi i32 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ] -; CHECK-NEXT: [[OFFSET_IDX5:%.*]] = add i32 [[B_PROMOTED]], [[INDEX]] -; CHECK-NEXT: [[TMP4:%.*]] = add i32 [[OFFSET_IDX5]], 0 -; CHECK-NEXT: [[TMP5:%.*]] = sext i32 [[TMP4]] to i64 -; CHECK-NEXT: [[TMP6:%.*]] = getelementptr inbounds i32, ptr [[B]], i64 [[TMP5]] -; CHECK-NEXT: [[TMP7:%.*]] = getelementptr inbounds i32, ptr [[TMP6]], i32 0 -; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <2 x i32>, ptr [[TMP7]], align 4 -; CHECK-NEXT: [[TMP8:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 [[TMP5]] -; CHECK-NEXT: [[TMP9:%.*]] = getelementptr inbounds i32, ptr [[TMP8]], i32 0 -; CHECK-NEXT: [[WIDE_LOAD6:%.*]] = load <2 x i32>, ptr [[TMP9]], align 4 -; CHECK-NEXT: [[TMP10:%.*]] = add nsw <2 x i32> [[WIDE_LOAD]], [[WIDE_LOAD6]] -; CHECK-NEXT: [[TMP11:%.*]] = getelementptr 
inbounds i32, ptr [[C]], i64 [[TMP5]] -; CHECK-NEXT: [[TMP12:%.*]] = getelementptr inbounds i32, ptr [[TMP11]], i32 0 -; CHECK-NEXT: store <2 x i32> [[TMP10]], ptr [[TMP12]], align 4 +; CHECK-NEXT: [[OFFSET_IDX:%.*]] = add i32 [[B_PROMOTED]], [[INDEX]] +; CHECK-NEXT: [[TMP5:%.*]] = add i32 [[OFFSET_IDX]], 0 +; CHECK-NEXT: [[TMP6:%.*]] = sext i32 [[TMP5]] to i64 +; CHECK-NEXT: [[TMP7:%.*]] = getelementptr inbounds i32, ptr [[B]], i64 [[TMP6]] +; CHECK-NEXT: [[TMP8:%.*]] = getelementptr inbounds i32, ptr [[TMP7]], i32 0 +; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <2 x i32>, ptr [[TMP8]], align 4 +; CHECK-NEXT: [[TMP9:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 [[TMP6]] +; CHECK-NEXT: [[TMP10:%.*]] = getelementptr inbounds i32, ptr [[TMP9]], i32 0 +; CHECK-NEXT: [[WIDE_LOAD5:%.*]] = load <2 x i32>, ptr [[TMP10]], align 4 +; CHECK-NEXT: [[TMP11:%.*]] = add nsw <2 x i32> [[WIDE_LOAD]], [[WIDE_LOAD5]] +; CHECK-NEXT: [[TMP12:%.*]] = getelementptr inbounds i32, ptr [[C]], i64 [[TMP6]] +; CHECK-NEXT: [[TMP13:%.*]] = getelementptr inbounds i32, ptr [[TMP12]], i32 0 +; CHECK-NEXT: store <2 x i32> [[TMP11]], ptr [[TMP13]], align 4 ; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i32 [[INDEX]], 2 -; CHECK-NEXT: [[TMP13:%.*]] = icmp eq i32 [[INDEX_NEXT]], [[N_VEC]] -; CHECK-NEXT: br i1 [[TMP13]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP12:![0-9]+]] +; CHECK-NEXT: [[TMP14:%.*]] = icmp eq i32 [[INDEX_NEXT]], [[N_VEC]] +; CHECK-NEXT: br i1 [[TMP14]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP12:![0-9]+]] ; CHECK: [[MIDDLE_BLOCK]]: -; CHECK-NEXT: [[TMP14:%.*]] = extractelement <2 x i32> [[TMP10]], i32 1 +; CHECK-NEXT: [[TMP15:%.*]] = extractelement <2 x i32> [[TMP11]], i32 1 ; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i32 [[TMP1]], [[N_VEC]] -; CHECK-NEXT: br i1 [[CMP_N]], label %[[F1_EXIT_LOOPEXIT:.*]], label %[[_LR_PH_I]] -; CHECK: [[_LR_PH_I]]: -; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i32 [ [[OFFSET_IDX]], %[[MIDDLE_BLOCK]] ], [ 
[[B_PROMOTED]], %[[VECTOR_MEMCHECK]] ], [ [[B_PROMOTED]], %[[BB]] ] +; CHECK-NEXT: br i1 [[CMP_N]], label %[[F1_EXIT_LOOPEXIT:.*]], label %[[SCALAR_PH]] +; CHECK: [[SCALAR_PH]]: +; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i32 [ [[TMP4]], %[[MIDDLE_BLOCK]] ], [ [[B_PROMOTED]], %[[VECTOR_MEMCHECK]] ], [ [[B_PROMOTED]], %[[BB]] ] ; CHECK-NEXT: br label %[[DOTLR_PH_I:.*]] -; CHECK: [[_LR_PH_I1:.*:]] -; CHECK-NEXT: [[IV:%.*]] = phi i32 [ [[IVNEXT:%.*]], %[[DOTLR_PH_I]] ], [ [[BC_RESUME_VAL]], %[[_LR_PH_I]] ] +; CHECK: [[_LR_PH_I:.*:]] +; CHECK-NEXT: [[IV:%.*]] = phi i32 [ [[IVNEXT:%.*]], %[[DOTLR_PH_I]] ], [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ] ; CHECK-NEXT: [[INDVARS_IV:%.*]] = sext i32 [[IV]] to i64 ; CHECK-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds i32, ptr [[B]], i64 [[INDVARS_IV]] ; CHECK-NEXT: [[BLOAD:%.*]] = load i32, ptr [[ARRAYIDX2]], align 4 @@ -700,10 +700,10 @@ define i32 @sum_arrays_outside_use(ptr %B, ptr %A, ptr %C, i32 %N) { ; CHECK-NEXT: [[ARRAYIDX3:%.*]] = getelementptr inbounds i32, ptr [[C]], i64 [[INDVARS_IV]] ; CHECK-NEXT: store i32 [[SUM]], ptr [[ARRAYIDX3]], align 4 ; CHECK-NEXT: [[IVNEXT]] = add nsw i32 [[IV]], 1 -; CHECK-NEXT: [[TMP19:%.*]] = icmp slt i32 [[IVNEXT]], [[N]] -; CHECK-NEXT: br i1 [[TMP19]], label %[[DOTLR_PH_I]], label %[[F1_EXIT_LOOPEXIT]], !llvm.loop [[LOOP13:![0-9]+]] +; CHECK-NEXT: [[UNNAMEDTMP19:%.*]] = icmp slt i32 [[IVNEXT]], [[N]] +; CHECK-NEXT: br i1 [[UNNAMEDTMP19]], label %[[DOTLR_PH_I]], label %[[F1_EXIT_LOOPEXIT]], !llvm.loop [[LOOP13:![0-9]+]] ; CHECK: [[F1_EXIT_LOOPEXIT]]: -; CHECK-NEXT: [[DOTLCSSA:%.*]] = phi i32 [ [[SUM]], %[[DOTLR_PH_I]] ], [ [[TMP14]], %[[MIDDLE_BLOCK]] ] +; CHECK-NEXT: [[DOTLCSSA:%.*]] = phi i32 [ [[SUM]], %[[DOTLR_PH_I]] ], [ [[TMP15]], %[[MIDDLE_BLOCK]] ] ; CHECK-NEXT: ret i32 [[DOTLCSSA]] ; bb: diff --git a/llvm/test/Transforms/MemCpyOpt/store-to-memset-is-nonzero-type.ll b/llvm/test/Transforms/MemCpyOpt/store-to-memset-is-nonzero-type.ll index 0455d65fe7521..6b53138342ebf 100644 --- 
a/llvm/test/Transforms/MemCpyOpt/store-to-memset-is-nonzero-type.ll +++ b/llvm/test/Transforms/MemCpyOpt/store-to-memset-is-nonzero-type.ll @@ -5,7 +5,7 @@ define void @array_zero(ptr %p) { ; CHECK-LABEL: @array_zero( -; CHECK-NEXT: call void @llvm.memset.p0.i64(ptr align 1 [[P:%.*]], i8 undef, i64 0, i1 false) +; CHECK-NEXT: call void @llvm.memset.p0.i64(ptr align 1 [[P:%.*]], i8 poison, i64 0, i1 false) ; CHECK-NEXT: ret void ; store [0 x i8] zeroinitializer, ptr %p @@ -25,7 +25,7 @@ define void @array_nonzero(ptr %p) { define void @struct_zero(ptr %p) { ; CHECK-LABEL: @struct_zero( -; CHECK-NEXT: call void @llvm.memset.p0.i64(ptr align 1 [[P:%.*]], i8 undef, i64 0, i1 false) +; CHECK-NEXT: call void @llvm.memset.p0.i64(ptr align 1 [[P:%.*]], i8 poison, i64 0, i1 false) ; CHECK-NEXT: ret void ; store { } zeroinitializer, ptr %p diff --git a/llvm/test/Transforms/PhaseOrdering/AArch64/infer-align-from-assumption.ll b/llvm/test/Transforms/PhaseOrdering/AArch64/infer-align-from-assumption.ll new file mode 100644 index 0000000000000..632e3a56aacac --- /dev/null +++ b/llvm/test/Transforms/PhaseOrdering/AArch64/infer-align-from-assumption.ll @@ -0,0 +1,120 @@ +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 5 +; RUN: opt -passes='default' -S %s | FileCheck %s + +target triple = "arm64-apple-macosx" + +declare void @llvm.assume(i1 noundef) + +define i32 @earlycse_entry(ptr %p) { +; CHECK-LABEL: define i32 @earlycse_entry( +; CHECK-SAME: ptr nocapture [[P:%.*]]) local_unnamed_addr { +; CHECK-NEXT: [[L_I:%.*]] = load ptr, ptr [[P]], align 8 +; CHECK-NEXT: call void @llvm.assume(i1 true) [ "align"(ptr [[L_I]], i64 4) ] +; CHECK-NEXT: [[L_ASSUME_ALIGNED_I_I:%.*]] = load i32, ptr [[L_I]], align 4 +; CHECK-NEXT: [[R_I_I:%.*]] = tail call i32 @swap(i32 [[L_ASSUME_ALIGNED_I_I]]) +; CHECK-NEXT: [[L_2_I:%.*]] = load ptr, ptr [[P]], align 8 +; CHECK-NEXT: [[GEP_I:%.*]] = getelementptr i8, ptr [[L_2_I]], i64 4 +; CHECK-NEXT: store ptr 
[[GEP_I]], ptr [[P]], align 8 +; CHECK-NEXT: call void @llvm.assume(i1 true) [ "align"(ptr [[GEP_I]], i64 4) ] +; CHECK-NEXT: [[L_ASSUME_ALIGNED_I_I2:%.*]] = load i32, ptr [[GEP_I]], align 4 +; CHECK-NEXT: [[R_I_I3:%.*]] = tail call i32 @swap(i32 [[L_ASSUME_ALIGNED_I_I2]]) +; CHECK-NEXT: [[L_2_I4:%.*]] = load ptr, ptr [[P]], align 8 +; CHECK-NEXT: [[GEP_I5:%.*]] = getelementptr i8, ptr [[L_2_I4]], i64 4 +; CHECK-NEXT: store ptr [[GEP_I5]], ptr [[P]], align 8 +; CHECK-NEXT: ret i32 [[R_I_I3]] +; + %r.1 = call i32 @earlycse_fn1(ptr %p) + %r.2 = call i32 @earlycse_fn1(ptr %p) + ret i32 %r.2 +} + +define i32 @earlycse_fn1(ptr %p) { +; CHECK-LABEL: define i32 @earlycse_fn1( +; CHECK-SAME: ptr nocapture [[P:%.*]]) local_unnamed_addr { +; CHECK-NEXT: [[L:%.*]] = load ptr, ptr [[P]], align 8 +; CHECK-NEXT: call void @llvm.assume(i1 true) [ "align"(ptr [[L]], i64 4) ] +; CHECK-NEXT: [[L_ASSUME_ALIGNED_I:%.*]] = load i32, ptr [[L]], align 4 +; CHECK-NEXT: [[R_I:%.*]] = tail call i32 @swap(i32 [[L_ASSUME_ALIGNED_I]]) +; CHECK-NEXT: [[L_2:%.*]] = load ptr, ptr [[P]], align 8 +; CHECK-NEXT: [[GEP:%.*]] = getelementptr i8, ptr [[L_2]], i64 4 +; CHECK-NEXT: store ptr [[GEP]], ptr [[P]], align 8 +; CHECK-NEXT: ret i32 [[R_I]] +; + %l = load ptr, ptr %p, align 8 + %r = call i32 @load_assume_aligned(ptr %l) + %l.2 = load ptr, ptr %p, align 8 + %gep = getelementptr i8, ptr %l.2, i64 4 + store ptr %gep, ptr %p, align 8 + ret i32 %r +} + +define i32 @load_assume_aligned(ptr %p) { +; CHECK-LABEL: define i32 @load_assume_aligned( +; CHECK-SAME: ptr [[P:%.*]]) local_unnamed_addr { +; CHECK-NEXT: call void @llvm.assume(i1 true) [ "align"(ptr [[P]], i64 4) ] +; CHECK-NEXT: [[DOT0_COPYLOAD:%.*]] = load i32, ptr [[P]], align 4 +; CHECK-NEXT: [[TMP2:%.*]] = tail call i32 @swap(i32 [[DOT0_COPYLOAD]]) +; CHECK-NEXT: ret i32 [[TMP2]] +; + call void @llvm.assume(i1 true) [ "align"(ptr %p, i64 4) ] + %l.assume_aligned = load i32, ptr %p, align 1 + %r = call i32 @swap(i32 %l.assume_aligned) + ret 
i32 %r +} + +declare i32 @swap(i32) + +define void @sroa_align_entry(ptr %p) { +; CHECK-LABEL: define void @sroa_align_entry( +; CHECK-SAME: ptr [[P:%.*]]) local_unnamed_addr #[[ATTR1:[0-9]+]] { +; CHECK-NEXT: call void @llvm.assume(i1 true) [ "align"(ptr [[P]], i64 8) ] +; CHECK-NEXT: [[DOT0_COPYLOAD_I_I_I:%.*]] = load i64, ptr [[P]], align 8 +; CHECK-NEXT: [[TMP2:%.*]] = inttoptr i64 [[DOT0_COPYLOAD_I_I_I]] to ptr +; CHECK-NEXT: store i32 0, ptr [[TMP2]], align 4 +; CHECK-NEXT: ret void +; + %a = alloca ptr, align 8 + store ptr %p, ptr %a, align 8 + %r = call ptr @sroa_fn1(ptr %a) + store i32 0, ptr %r, align 4 + ret void +} + +define ptr @sroa_fn1(ptr %p) { +; CHECK-LABEL: define ptr @sroa_fn1( +; CHECK-SAME: ptr nocapture readonly [[P:%.*]]) local_unnamed_addr #[[ATTR2:[0-9]+]] { +; CHECK-NEXT: [[L:%.*]] = load ptr, ptr [[P]], align 8 +; CHECK-NEXT: call void @llvm.assume(i1 true) [ "align"(ptr [[L]], i64 8) ] +; CHECK-NEXT: [[L_FN3_I_I:%.*]] = load i64, ptr [[L]], align 8 +; CHECK-NEXT: [[I_I:%.*]] = inttoptr i64 [[L_FN3_I_I]] to ptr +; CHECK-NEXT: ret ptr [[I_I]] +; + %l = load ptr, ptr %p, align 8 + %r = call ptr @sroa_fn2(ptr %l) + ret ptr %r +} + +define ptr @sroa_fn2(ptr %p) { +; CHECK-LABEL: define ptr @sroa_fn2( +; CHECK-SAME: ptr [[P:%.*]]) local_unnamed_addr #[[ATTR3:[0-9]+]] { +; CHECK-NEXT: call void @llvm.assume(i1 true) [ "align"(ptr [[P]], i64 8) ] +; CHECK-NEXT: [[DOT0_COPYLOAD_I_I:%.*]] = load i64, ptr [[P]], align 8 +; CHECK-NEXT: [[TMP3:%.*]] = inttoptr i64 [[DOT0_COPYLOAD_I_I]] to ptr +; CHECK-NEXT: ret ptr [[TMP3]] +; + %r = call i64 @sroa_fn3(ptr %p) + %i = inttoptr i64 %r to ptr + ret ptr %i +} + +define i64 @sroa_fn3(ptr %0) { +; CHECK-LABEL: define i64 @sroa_fn3( +; CHECK-SAME: ptr [[TMP0:%.*]]) local_unnamed_addr #[[ATTR3]] { +; CHECK-NEXT: call void @llvm.assume(i1 true) [ "align"(ptr [[TMP0]], i64 8) ] +; CHECK-NEXT: [[DOT0_COPYLOAD_I:%.*]] = load i64, ptr [[TMP0]], align 8 +; CHECK-NEXT: ret i64 [[DOT0_COPYLOAD_I]] +; + call void 
@llvm.assume(i1 true) [ "align"(ptr %0, i64 8) ] + %l.fn3 = load i64, ptr %0, align 1 + ret i64 %l.fn3 +} diff --git a/llvm/test/Transforms/PhaseOrdering/AArch64/sve-interleave-vectorization.ll b/llvm/test/Transforms/PhaseOrdering/AArch64/sve-interleave-vectorization.ll new file mode 100644 index 0000000000000..362ec22600f92 --- /dev/null +++ b/llvm/test/Transforms/PhaseOrdering/AArch64/sve-interleave-vectorization.ll @@ -0,0 +1,135 @@ +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py +; RUN: opt -passes=loop-vectorize,interleaved-access -mattr=+sve -S -o - %s | FileCheck %s + +target datalayout = "e-m:o-i64:64-i128:128-n32:64-S128" +target triple = "aarch64" + +%struct.xyzt = type { i32, i32, i32, i32 } +; for (int i = 0; i < 1024; ++i) { +; dst[i].x = a[i].x + b[i].x; +; dst[i].y = a[i].y - b[i].y; +; dst[i].z = a[i].z << b[i].z; +; dst[i].t = a[i].t >> b[i].t; +; } + +define void @interleave_deinterleave(ptr noalias %dst, ptr %a, ptr %b) { +; CHECK-LABEL: @interleave_deinterleave( +; CHECK-NEXT: entry: +; CHECK-NEXT: [[TMP0:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-NEXT: [[TMP1:%.*]] = mul i64 [[TMP0]], 4 +; CHECK-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 1024, [[TMP1]] +; CHECK-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[FOR_BODY_PREHEADER:%.*]], label [[VECTOR_PH:%.*]] +; CHECK: vector.ph: +; CHECK-NEXT: [[TMP2:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-NEXT: [[TMP3:%.*]] = mul i64 [[TMP2]], 4 +; CHECK-NEXT: [[N_MOD_VF:%.*]] = urem i64 1024, [[TMP3]] +; CHECK-NEXT: [[N_VEC:%.*]] = sub i64 1024, [[N_MOD_VF]] +; CHECK-NEXT: [[TMP4:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-NEXT: [[TMP5:%.*]] = mul i64 [[TMP4]], 4 +; CHECK-NEXT: br label [[VECTOR_BODY:%.*]] +; CHECK: vector.body: +; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] +; CHECK-NEXT: [[TMP6:%.*]] = add i64 [[INDEX]], 0 +; CHECK-NEXT: [[TMP7:%.*]] = getelementptr inbounds [[STRUCT_XYZT:%.*]], ptr [[A:%.*]], i64 [[TMP6]] +; 
CHECK-NEXT: [[LDN:%.*]] = call { , , , } @llvm.aarch64.sve.ld4.sret.nxv4i32( splat (i1 true), ptr [[TMP7]]) +; CHECK-NEXT: [[TMP9:%.*]] = extractvalue { , , , } [[LDN]], 0 +; CHECK-NEXT: [[TMP10:%.*]] = extractvalue { , , , } [[LDN]], 1 +; CHECK-NEXT: [[TMP11:%.*]] = extractvalue { , , , } [[LDN]], 2 +; CHECK-NEXT: [[TMP12:%.*]] = extractvalue { , , , } [[LDN]], 3 +; CHECK-NEXT: [[TMP13:%.*]] = getelementptr inbounds [[STRUCT_XYZT]], ptr [[B:%.*]], i64 [[TMP6]] +; CHECK-NEXT: [[LDN9:%.*]] = call { , , , } @llvm.aarch64.sve.ld4.sret.nxv4i32( splat (i1 true), ptr [[TMP13]]) +; CHECK-NEXT: [[TMP16:%.*]] = extractvalue { , , , } [[LDN9]], 0 +; CHECK-NEXT: [[TMP17:%.*]] = extractvalue { , , , } [[LDN9]], 1 +; CHECK-NEXT: [[TMP18:%.*]] = extractvalue { , , , } [[LDN9]], 2 +; CHECK-NEXT: [[TMP19:%.*]] = extractvalue { , , , } [[LDN9]], 3 +; CHECK-NEXT: [[TMP20:%.*]] = add nsw [[TMP16]], [[TMP9]] +; CHECK-NEXT: [[TMP21:%.*]] = getelementptr inbounds [[STRUCT_XYZT]], ptr [[DST:%.*]], i64 [[TMP6]] +; CHECK-NEXT: [[TMP22:%.*]] = sub nsw [[TMP10]], [[TMP17]] +; CHECK-NEXT: [[TMP23:%.*]] = shl [[TMP11]], [[TMP18]] +; CHECK-NEXT: [[TMP24:%.*]] = ashr [[TMP12]], [[TMP19]] +; CHECK-NEXT: call void @llvm.aarch64.sve.st4.nxv4i32( [[TMP20]], [[TMP22]], [[TMP23]], [[TMP24]], splat (i1 true), ptr [[TMP21]]) +; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP5]] +; CHECK-NEXT: [[TMP25:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] +; CHECK-NEXT: br i1 [[TMP25]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]] +; CHECK: middle.block: +; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 1024, [[N_VEC]] +; CHECK-NEXT: br i1 [[CMP_N]], label [[FOR_END:%.*]], label [[FOR_BODY_PREHEADER]] +; CHECK: scalar.ph: +; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ] +; CHECK-NEXT: br label [[FOR_BODY:%.*]] +; CHECK: for.body: +; CHECK-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[FOR_BODY_PREHEADER]] ], [ 
[[INDVARS_IV_NEXT:%.*]], [[FOR_BODY]] ] +; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds [[STRUCT_XYZT]], ptr [[A]], i64 [[INDVARS_IV]] +; CHECK-NEXT: [[TMP31:%.*]] = load i32, ptr [[ARRAYIDX]], align 4 +; CHECK-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds [[STRUCT_XYZT]], ptr [[B]], i64 [[INDVARS_IV]] +; CHECK-NEXT: [[TMP32:%.*]] = load i32, ptr [[ARRAYIDX2]], align 4 +; CHECK-NEXT: [[ADD:%.*]] = add nsw i32 [[TMP32]], [[TMP31]] +; CHECK-NEXT: [[ARRAYIDX5:%.*]] = getelementptr inbounds [[STRUCT_XYZT]], ptr [[DST]], i64 [[INDVARS_IV]] +; CHECK-NEXT: store i32 [[ADD]], ptr [[ARRAYIDX5]], align 4 +; CHECK-NEXT: [[Y:%.*]] = getelementptr inbounds nuw i8, ptr [[ARRAYIDX]], i64 4 +; CHECK-NEXT: [[TMP33:%.*]] = load i32, ptr [[Y]], align 4 +; CHECK-NEXT: [[Y11:%.*]] = getelementptr inbounds nuw i8, ptr [[ARRAYIDX2]], i64 4 +; CHECK-NEXT: [[TMP26:%.*]] = load i32, ptr [[Y11]], align 4 +; CHECK-NEXT: [[SUB:%.*]] = sub nsw i32 [[TMP33]], [[TMP26]] +; CHECK-NEXT: [[Y14:%.*]] = getelementptr inbounds nuw i8, ptr [[ARRAYIDX5]], i64 4 +; CHECK-NEXT: store i32 [[SUB]], ptr [[Y14]], align 4 +; CHECK-NEXT: [[Z:%.*]] = getelementptr inbounds nuw i8, ptr [[ARRAYIDX]], i64 8 +; CHECK-NEXT: [[TMP27:%.*]] = load i32, ptr [[Z]], align 4 +; CHECK-NEXT: [[Z19:%.*]] = getelementptr inbounds nuw i8, ptr [[ARRAYIDX2]], i64 8 +; CHECK-NEXT: [[TMP28:%.*]] = load i32, ptr [[Z19]], align 4 +; CHECK-NEXT: [[SHL:%.*]] = shl i32 [[TMP27]], [[TMP28]] +; CHECK-NEXT: [[Z22:%.*]] = getelementptr inbounds nuw i8, ptr [[ARRAYIDX5]], i64 8 +; CHECK-NEXT: store i32 [[SHL]], ptr [[Z22]], align 4 +; CHECK-NEXT: [[T:%.*]] = getelementptr inbounds nuw i8, ptr [[ARRAYIDX]], i64 12 +; CHECK-NEXT: [[TMP29:%.*]] = load i32, ptr [[T]], align 4 +; CHECK-NEXT: [[T27:%.*]] = getelementptr inbounds nuw i8, ptr [[ARRAYIDX2]], i64 12 +; CHECK-NEXT: [[TMP30:%.*]] = load i32, ptr [[T27]], align 4 +; CHECK-NEXT: [[SHR:%.*]] = ashr i32 [[TMP29]], [[TMP30]] +; CHECK-NEXT: [[T30:%.*]] = getelementptr inbounds nuw 
i8, ptr [[ARRAYIDX5]], i64 12 +; CHECK-NEXT: store i32 [[SHR]], ptr [[T30]], align 4 +; CHECK-NEXT: [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1 +; CHECK-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT]], 1024 +; CHECK-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_END]], label [[FOR_BODY]], !llvm.loop [[LOOP3:![0-9]+]] +; CHECK: for.end: +; CHECK-NEXT: ret void +; +entry: + br label %for.body + +for.body: + %iv = phi i64 [ 0, %entry ], [ %iv.next, %for.body ] + %gep.a = getelementptr inbounds %struct.xyzt, ptr %a, i64 %iv + %a.0 = load i32, ptr %gep.a, align 4 + %gep.b = getelementptr inbounds %struct.xyzt, ptr %b, i64 %iv + %b.0 = load i32, ptr %gep.b, align 4 + %add = add nsw i32 %b.0, %a.0 + %gep.dst = getelementptr inbounds %struct.xyzt, ptr %dst, i64 %iv + store i32 %add, ptr %gep.dst, align 4 + %gep.a.1 = getelementptr inbounds nuw i8, ptr %gep.a, i64 4 + %a.1 = load i32, ptr %gep.a.1, align 4 + %gep.b.1 = getelementptr inbounds nuw i8, ptr %gep.b, i64 4 + %b.1 = load i32, ptr %gep.b.1, align 4 + %sub = sub nsw i32 %a.1, %b.1 + %gep.dst.1 = getelementptr inbounds nuw i8, ptr %gep.dst, i64 4 + store i32 %sub, ptr %gep.dst.1, align 4 + %gep.a.2 = getelementptr inbounds nuw i8, ptr %gep.a, i64 8 + %a.2 = load i32, ptr %gep.a.2, align 4 + %gep.b.2 = getelementptr inbounds nuw i8, ptr %gep.b, i64 8 + %b.2 = load i32, ptr %gep.b.2, align 4 + %shl = shl i32 %a.2, %b.2 + %gep.dst.2 = getelementptr inbounds nuw i8, ptr %gep.dst, i64 8 + store i32 %shl, ptr %gep.dst.2, align 4 + %gep.a.3 = getelementptr inbounds nuw i8, ptr %gep.a, i64 12 + %a.3 = load i32, ptr %gep.a.3, align 4 + %gep.b.3 = getelementptr inbounds nuw i8, ptr %gep.b, i64 12 + %b.3 = load i32, ptr %gep.b.3, align 4 + %shr = ashr i32 %a.3, %b.3 + %gep.dst.3 = getelementptr inbounds nuw i8, ptr %gep.dst, i64 12 + store i32 %shr, ptr %gep.dst.3, align 4 + %iv.next = add nuw nsw i64 %iv, 1 + %exitcond.not = icmp eq i64 %iv.next, 1024 + br i1 %exitcond.not, label %for.end, label %for.body + 
+for.end: + ret void +} diff --git a/llvm/test/Transforms/SLPVectorizer/AArch64/InstructionsState-is-invalid-0.ll b/llvm/test/Transforms/SLPVectorizer/AArch64/InstructionsState-is-invalid-0.ll new file mode 100644 index 0000000000000..ffb8f44363249 --- /dev/null +++ b/llvm/test/Transforms/SLPVectorizer/AArch64/InstructionsState-is-invalid-0.ll @@ -0,0 +1,64 @@ +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py +; RUN: opt -passes=slp-vectorizer -S %s | FileCheck %s + +target datalayout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i8:8:32-i16:16:32-i64:64-i128:128-n32:64-S128-Fn32" +target triple = "aarch64-unknown-linux-gnu" + +define void @foo(ptr %0) { +; CHECK-LABEL: @foo( +; CHECK-NEXT: vector.scevcheck: +; CHECK-NEXT: [[SCEVGEP:%.*]] = getelementptr i8, ptr [[TMP0:%.*]], i64 4 +; CHECK-NEXT: [[SCEVGEP3:%.*]] = getelementptr i8, ptr null, i64 4 +; CHECK-NEXT: [[TMP1:%.*]] = insertelement <4 x ptr> poison, ptr [[TMP0]], i32 1 +; CHECK-NEXT: [[TMP2:%.*]] = insertelement <4 x ptr> [[TMP1]], ptr [[SCEVGEP]], i32 0 +; CHECK-NEXT: [[TMP3:%.*]] = shufflevector <4 x ptr> [[TMP2]], <4 x ptr> poison, <4 x i32> +; CHECK-NEXT: [[TMP4:%.*]] = icmp ult <4 x ptr> [[TMP3]], zeroinitializer +; CHECK-NEXT: [[TMP5:%.*]] = and <4 x i1> [[TMP4]], zeroinitializer +; CHECK-NEXT: [[TMP6:%.*]] = insertelement <4 x ptr> poison, ptr [[TMP0]], i32 0 +; CHECK-NEXT: [[TMP7:%.*]] = insertelement <4 x ptr> [[TMP6]], ptr [[SCEVGEP3]], i32 1 +; CHECK-NEXT: [[TMP8:%.*]] = shufflevector <4 x ptr> [[TMP7]], <4 x ptr> poison, <4 x i32> +; CHECK-NEXT: [[TMP9:%.*]] = icmp ult <4 x ptr> [[TMP8]], zeroinitializer +; CHECK-NEXT: [[TMP10:%.*]] = and <4 x i1> [[TMP9]], zeroinitializer +; CHECK-NEXT: [[TMP11:%.*]] = call i1 @llvm.vector.reduce.or.v4i1(<4 x i1> [[TMP5]]) +; CHECK-NEXT: [[TMP12:%.*]] = call i1 @llvm.vector.reduce.or.v4i1(<4 x i1> [[TMP10]]) +; CHECK-NEXT: [[OP_RDX:%.*]] = or i1 [[TMP11]], [[TMP12]] +; CHECK-NEXT: br i1 [[OP_RDX]], label [[DOTLR_PH:%.*]], label 
[[VECTOR_PH:%.*]] +; CHECK: vector.ph: +; CHECK-NEXT: ret void +; CHECK: .lr.ph: +; CHECK-NEXT: ret void +; +vector.scevcheck: + %scevgep = getelementptr i8, ptr %0, i64 4 + %scevgep3 = getelementptr i8, ptr null, i64 4 + %bound011 = icmp ult ptr %scevgep, null + %found.conflict13 = and i1 %bound011, false + %bound014 = icmp ult ptr %scevgep, null + %found.conflict16 = and i1 %bound014, false + %conflict.rdx17 = or i1 %found.conflict13, %found.conflict16 + %bound018 = icmp ult ptr %scevgep, null + %found.conflict20 = and i1 %bound018, false + %conflict.rdx21 = or i1 %conflict.rdx17, %found.conflict20 + %bound022 = icmp ult ptr %0, null + %found.conflict24 = and i1 %bound022, false + %conflict.rdx25 = or i1 %conflict.rdx21, %found.conflict24 + %bound026 = icmp ult ptr %0, null + %found.conflict28 = and i1 %bound026, false + %conflict.rdx29 = or i1 %conflict.rdx25, %found.conflict28 + %bound030 = icmp ult ptr %0, null + %found.conflict32 = and i1 %bound030, false + %conflict.rdx33 = or i1 %conflict.rdx29, %found.conflict32 + %bound034 = icmp ult ptr %0, null + %found.conflict36 = and i1 %bound034, false + %conflict.rdx37 = or i1 %conflict.rdx33, %found.conflict36 + %bound038 = icmp ult ptr %scevgep3, null + %found.conflict40 = and i1 %bound038, false + %conflict.rdx41 = or i1 %conflict.rdx37, %found.conflict40 + br i1 %conflict.rdx41, label %.lr.ph, label %vector.ph + +vector.ph: ; preds = %vector.scevcheck + ret void + +.lr.ph: ; preds = %vector.scevcheck + ret void +} diff --git a/llvm/test/Transforms/SLPVectorizer/AArch64/vectorize-free-extracts-inserts.ll b/llvm/test/Transforms/SLPVectorizer/AArch64/vectorize-free-extracts-inserts.ll index 4755c690c0711..4b6f0438b8915 100644 --- a/llvm/test/Transforms/SLPVectorizer/AArch64/vectorize-free-extracts-inserts.ll +++ b/llvm/test/Transforms/SLPVectorizer/AArch64/vectorize-free-extracts-inserts.ll @@ -34,7 +34,7 @@ bb: %a.lane.0 = fmul double %v1.lane.0, %v2.lane.2 %a.lane.1 = fmul double %v1.lane.1, %v2.lane.3 - 
%a.ins.0 = insertelement <2 x double> undef, double %a.lane.0, i32 0 + %a.ins.0 = insertelement <2 x double> zeroinitializer, double %a.lane.0, i32 0 %a.ins.1 = insertelement <2 x double> %a.ins.0, double %a.lane.1, i32 1 call void @use(double %v1.lane.0) @@ -73,7 +73,7 @@ bb: %a.lane.0 = fmul double %v1.lane.0, %v2.lane.2 %a.lane.1 = fmul double %v3.lane.1, %v2.lane.2 - %a.ins.0 = insertelement <2 x double> undef, double %a.lane.0, i32 0 + %a.ins.0 = insertelement <2 x double> zeroinitializer, double %a.lane.0, i32 0 %a.ins.1 = insertelement <2 x double> %a.ins.0, double %a.lane.1, i32 1 call void @use(double %v1.lane.0) @@ -95,7 +95,8 @@ define void @noop_extract_second_2_lanes(ptr %ptr.1, ptr %ptr.2) { ; CHECK-NEXT: [[TMP0:%.*]] = shufflevector <4 x double> [[V_1]], <4 x double> poison, <2 x i32> ; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <4 x double> [[V_2]], <4 x double> poison, <2 x i32> ; CHECK-NEXT: [[TMP2:%.*]] = fmul <2 x double> [[TMP0]], [[TMP1]] -; CHECK-NEXT: [[TMP3:%.*]] = shufflevector <2 x double> [[TMP2]], <2 x double> undef, <4 x i32> +; CHECK-NEXT: [[TMP4:%.*]] = shufflevector <2 x double> [[TMP2]], <2 x double> poison, <4 x i32> +; CHECK-NEXT: [[TMP3:%.*]] = shufflevector <4 x double> zeroinitializer, <4 x double> [[TMP4]], <4 x i32> ; CHECK-NEXT: call void @use(double [[V1_LANE_2]]) ; CHECK-NEXT: call void @use(double [[V1_LANE_3]]) ; CHECK-NEXT: store <4 x double> [[TMP3]], ptr [[PTR_1]], align 8 @@ -112,7 +113,7 @@ bb: %a.lane.0 = fmul double %v1.lane.2, %v2.lane.2 %a.lane.1 = fmul double %v1.lane.3, %v2.lane.2 - %a.ins.0 = insertelement <4 x double> undef, double %a.lane.0, i32 0 + %a.ins.0 = insertelement <4 x double> zeroinitializer, double %a.lane.0, i32 0 %a.ins.1 = insertelement <4 x double> %a.ins.0, double %a.lane.1, i32 1 call void @use(double %v1.lane.2) @@ -149,7 +150,7 @@ bb: %a.lane.0 = fmul double %v1.lane.1, %v2.lane.2 %a.lane.1 = fmul double %v1.lane.0, %v2.lane.2 - %a.ins.0 = insertelement <2 x double> undef, double 
%a.lane.0, i32 0 + %a.ins.0 = insertelement <2 x double> zeroinitializer, double %a.lane.0, i32 0 %a.ins.1 = insertelement <2 x double> %a.ins.0, double %a.lane.1, i32 1 call void @use(double %v1.lane.0) @@ -170,7 +171,8 @@ define void @extract_lanes_1_and_2(ptr %ptr.1, ptr %ptr.2) { ; CHECK-NEXT: [[TMP0:%.*]] = shufflevector <4 x double> [[V_1]], <4 x double> poison, <2 x i32> ; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <4 x double> [[V_2]], <4 x double> poison, <2 x i32> ; CHECK-NEXT: [[TMP2:%.*]] = fmul <2 x double> [[TMP0]], [[TMP1]] -; CHECK-NEXT: [[TMP3:%.*]] = shufflevector <2 x double> [[TMP2]], <2 x double> undef, <4 x i32> +; CHECK-NEXT: [[TMP4:%.*]] = shufflevector <2 x double> [[TMP2]], <2 x double> poison, <4 x i32> +; CHECK-NEXT: [[TMP3:%.*]] = shufflevector <4 x double> zeroinitializer, <4 x double> [[TMP4]], <4 x i32> ; CHECK-NEXT: call void @use(double [[V1_LANE_1]]) ; CHECK-NEXT: call void @use(double [[V1_LANE_2]]) ; CHECK-NEXT: store <4 x double> [[TMP3]], ptr [[PTR_1]], align 8 @@ -187,7 +189,7 @@ bb: %a.lane.0 = fmul double %v1.lane.1, %v2.lane.2 %a.lane.1 = fmul double %v1.lane.2, %v2.lane.2 - %a.ins.0 = insertelement <4 x double> undef, double %a.lane.0, i32 0 + %a.ins.0 = insertelement <4 x double> zeroinitializer, double %a.lane.0, i32 0 %a.ins.1 = insertelement <4 x double> %a.ins.0, double %a.lane.1, i32 1 call void @use(double %v1.lane.1) @@ -213,7 +215,8 @@ define void @noop_extracts_existing_vector_4_lanes(ptr %ptr.1, ptr %ptr.2) { ; CHECK-NEXT: [[TMP0:%.*]] = shufflevector <9 x double> [[V_1]], <9 x double> poison, <4 x i32> ; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <4 x double> [[V_2]], <4 x double> poison, <4 x i32> ; CHECK-NEXT: [[TMP2:%.*]] = fmul <4 x double> [[TMP0]], [[TMP1]] -; CHECK-NEXT: [[TMP3:%.*]] = shufflevector <4 x double> [[TMP2]], <4 x double> undef, <9 x i32> +; CHECK-NEXT: [[TMP4:%.*]] = shufflevector <4 x double> [[TMP2]], <4 x double> poison, <9 x i32> +; CHECK-NEXT: [[TMP3:%.*]] = shufflevector <9 x double> 
zeroinitializer, <9 x double> [[TMP4]], <9 x i32> ; CHECK-NEXT: call void @use(double [[V1_LANE_0]]) ; CHECK-NEXT: call void @use(double [[V1_LANE_1]]) ; CHECK-NEXT: call void @use(double [[V1_LANE_2]]) @@ -235,7 +238,7 @@ bb: %a.lane.1 = fmul double %v1.lane.3, %v2.lane.2 %a.lane.2 = fmul double %v1.lane.0, %v2.lane.2 %a.lane.3 = fmul double %v1.lane.1, %v2.lane.0 - %a.ins.0 = insertelement <9 x double> undef, double %a.lane.0, i32 0 + %a.ins.0 = insertelement <9 x double> zeroinitializer, double %a.lane.0, i32 0 %a.ins.1 = insertelement <9 x double> %a.ins.0, double %a.lane.1, i32 1 %a.ins.2 = insertelement <9 x double> %a.ins.1, double %a.lane.2, i32 2 %a.ins.3 = insertelement <9 x double> %a.ins.2, double %a.lane.3, i32 3 @@ -261,7 +264,8 @@ define void @extracts_jumbled_4_lanes(ptr %ptr.1, ptr %ptr.2) { ; CHECK-NEXT: [[TMP0:%.*]] = shufflevector <9 x double> [[V_1]], <9 x double> poison, <4 x i32> ; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <4 x double> [[V_2]], <4 x double> poison, <4 x i32> ; CHECK-NEXT: [[TMP2:%.*]] = fmul <4 x double> [[TMP0]], [[TMP1]] -; CHECK-NEXT: [[TMP3:%.*]] = shufflevector <4 x double> [[TMP2]], <4 x double> undef, <9 x i32> +; CHECK-NEXT: [[TMP4:%.*]] = shufflevector <4 x double> [[TMP2]], <4 x double> poison, <9 x i32> +; CHECK-NEXT: [[TMP3:%.*]] = shufflevector <9 x double> zeroinitializer, <9 x double> [[TMP4]], <9 x i32> ; CHECK-NEXT: call void @use(double [[V1_LANE_0]]) ; CHECK-NEXT: call void @use(double [[V1_LANE_1]]) ; CHECK-NEXT: call void @use(double [[V1_LANE_2]]) @@ -283,7 +287,7 @@ bb: %a.lane.1 = fmul double %v1.lane.2, %v2.lane.1 %a.lane.2 = fmul double %v1.lane.1, %v2.lane.2 %a.lane.3 = fmul double %v1.lane.3, %v2.lane.0 - %a.ins.0 = insertelement <9 x double> undef, double %a.lane.0, i32 0 + %a.ins.0 = insertelement <9 x double> zeroinitializer, double %a.lane.0, i32 0 %a.ins.1 = insertelement <9 x double> %a.ins.0, double %a.lane.1, i32 1 %a.ins.2 = insertelement <9 x double> %a.ins.1, double %a.lane.2, i32 2 
%a.ins.3 = insertelement <9 x double> %a.ins.2, double %a.lane.3, i32 3 @@ -313,12 +317,14 @@ define void @noop_extracts_9_lanes(ptr %ptr.1, ptr %ptr.2) { ; CHECK-NEXT: [[TMP2:%.*]] = fmul <8 x double> [[TMP0]], [[TMP1]] ; CHECK-NEXT: [[A_LANE_8:%.*]] = fmul double [[V1_LANE_2]], [[V2_LANE_0]] ; CHECK-NEXT: [[TMP3:%.*]] = shufflevector <8 x double> [[TMP2]], <8 x double> poison, <9 x i32> -; CHECK-NEXT: [[A_INS_8:%.*]] = insertelement <9 x double> [[TMP3]], double [[A_LANE_8]], i32 8 +; CHECK-NEXT: [[A_INS_72:%.*]] = shufflevector <9 x double> zeroinitializer, <9 x double> [[TMP3]], <9 x i32> +; CHECK-NEXT: [[A_INS_8:%.*]] = insertelement <9 x double> [[A_INS_72]], double [[A_LANE_8]], i32 8 ; CHECK-NEXT: [[TMP4:%.*]] = shufflevector <9 x double> [[V_1]], <9 x double> poison, <8 x i32> ; CHECK-NEXT: [[TMP6:%.*]] = fmul <8 x double> [[TMP4]], [[TMP5]] ; CHECK-NEXT: [[B_LANE_8:%.*]] = fmul double [[V1_LANE_5]], [[V2_LANE_0]] ; CHECK-NEXT: [[TMP7:%.*]] = shufflevector <8 x double> [[TMP6]], <8 x double> poison, <9 x i32> -; CHECK-NEXT: [[B_INS_8:%.*]] = insertelement <9 x double> [[TMP7]], double [[B_LANE_8]], i32 8 +; CHECK-NEXT: [[B_INS_71:%.*]] = shufflevector <9 x double> zeroinitializer, <9 x double> [[TMP7]], <9 x i32> +; CHECK-NEXT: [[B_INS_8:%.*]] = insertelement <9 x double> [[B_INS_71]], double [[B_LANE_8]], i32 8 ; CHECK-NEXT: [[RES:%.*]] = fsub <9 x double> [[A_INS_8]], [[B_INS_8]] ; CHECK-NEXT: store <9 x double> [[RES]], ptr [[PTR_1]], align 8 ; CHECK-NEXT: ret void @@ -350,7 +356,7 @@ bb: %a.lane.7 = fmul double %v1.lane.1, %v2.lane.1 %a.lane.8 = fmul double %v1.lane.2, %v2.lane.0 - %a.ins.0 = insertelement <9 x double> undef, double %a.lane.0, i32 0 + %a.ins.0 = insertelement <9 x double> zeroinitializer, double %a.lane.0, i32 0 %a.ins.1 = insertelement <9 x double> %a.ins.0, double %a.lane.1, i32 1 %a.ins.2 = insertelement <9 x double> %a.ins.1, double %a.lane.2, i32 2 %a.ins.3 = insertelement <9 x double> %a.ins.2, double %a.lane.3, i32 3 @@ -370,7 
+376,7 @@ bb: %b.lane.7 = fmul double %v1.lane.4, %v2.lane.1 %b.lane.8 = fmul double %v1.lane.5, %v2.lane.0 - %b.ins.0 = insertelement <9 x double> undef, double %b.lane.0, i32 0 + %b.ins.0 = insertelement <9 x double> zeroinitializer, double %b.lane.0, i32 0 %b.ins.1 = insertelement <9 x double> %b.ins.0, double %b.lane.1, i32 1 %b.ins.2 = insertelement <9 x double> %b.ins.1, double %b.lane.2, i32 2 %b.ins.3 = insertelement <9 x double> %b.ins.2, double %b.lane.3, i32 3 @@ -401,12 +407,14 @@ define void @first_mul_chain_jumbled(ptr %ptr.1, ptr %ptr.2) { ; CHECK-NEXT: [[V2_LANE_1:%.*]] = extractelement <4 x double> [[V_2]], i32 1 ; CHECK-NEXT: [[A_LANE_8:%.*]] = fmul double [[V1_LANE_2]], [[V2_LANE_1]] ; CHECK-NEXT: [[TMP3:%.*]] = shufflevector <8 x double> [[TMP2]], <8 x double> poison, <9 x i32> -; CHECK-NEXT: [[A_INS_8:%.*]] = insertelement <9 x double> [[TMP3]], double [[A_LANE_8]], i32 8 +; CHECK-NEXT: [[A_INS_72:%.*]] = shufflevector <9 x double> zeroinitializer, <9 x double> [[TMP3]], <9 x i32> +; CHECK-NEXT: [[A_INS_8:%.*]] = insertelement <9 x double> [[A_INS_72]], double [[A_LANE_8]], i32 8 ; CHECK-NEXT: [[TMP4:%.*]] = shufflevector <9 x double> [[V_1]], <9 x double> poison, <8 x i32> ; CHECK-NEXT: [[TMP5:%.*]] = fmul <8 x double> [[TMP4]], [[TMP1]] ; CHECK-NEXT: [[B_LANE_8:%.*]] = fmul double [[V1_LANE_5]], [[V2_LANE_0]] ; CHECK-NEXT: [[TMP6:%.*]] = shufflevector <8 x double> [[TMP5]], <8 x double> poison, <9 x i32> -; CHECK-NEXT: [[B_INS_8:%.*]] = insertelement <9 x double> [[TMP6]], double [[B_LANE_8]], i32 8 +; CHECK-NEXT: [[B_INS_71:%.*]] = shufflevector <9 x double> zeroinitializer, <9 x double> [[TMP6]], <9 x i32> +; CHECK-NEXT: [[B_INS_8:%.*]] = insertelement <9 x double> [[B_INS_71]], double [[B_LANE_8]], i32 8 ; CHECK-NEXT: [[RES:%.*]] = fsub <9 x double> [[A_INS_8]], [[B_INS_8]] ; CHECK-NEXT: store <9 x double> [[RES]], ptr [[PTR_1]], align 8 ; CHECK-NEXT: ret void @@ -438,7 +446,7 @@ bb: %a.lane.7 = fmul double %v1.lane.0, %v2.lane.2 %a.lane.8 
= fmul double %v1.lane.2, %v2.lane.1 - %a.ins.0 = insertelement <9 x double> undef, double %a.lane.0, i32 0 + %a.ins.0 = insertelement <9 x double> zeroinitializer, double %a.lane.0, i32 0 %a.ins.1 = insertelement <9 x double> %a.ins.0, double %a.lane.1, i32 1 %a.ins.2 = insertelement <9 x double> %a.ins.1, double %a.lane.2, i32 2 %a.ins.3 = insertelement <9 x double> %a.ins.2, double %a.lane.3, i32 3 @@ -458,7 +466,7 @@ bb: %b.lane.7 = fmul double %v1.lane.4, %v2.lane.2 %b.lane.8 = fmul double %v1.lane.5, %v2.lane.0 - %b.ins.0 = insertelement <9 x double> undef, double %b.lane.0, i32 0 + %b.ins.0 = insertelement <9 x double> zeroinitializer, double %b.lane.0, i32 0 %b.ins.1 = insertelement <9 x double> %b.ins.0, double %b.lane.1, i32 1 %b.ins.2 = insertelement <9 x double> %b.ins.1, double %b.lane.2, i32 2 %b.ins.3 = insertelement <9 x double> %b.ins.2, double %b.lane.3, i32 3 @@ -490,12 +498,14 @@ define void @first_and_second_mul_chain_jumbled(ptr %ptr.1, ptr %ptr.2) { ; CHECK-NEXT: [[TMP2:%.*]] = fmul <8 x double> [[TMP0]], [[TMP1]] ; CHECK-NEXT: [[A_LANE_8:%.*]] = fmul double [[V1_LANE_2]], [[V2_LANE_0]] ; CHECK-NEXT: [[TMP3:%.*]] = shufflevector <8 x double> [[TMP2]], <8 x double> poison, <9 x i32> -; CHECK-NEXT: [[A_INS_8:%.*]] = insertelement <9 x double> [[TMP3]], double [[A_LANE_8]], i32 8 +; CHECK-NEXT: [[A_INS_72:%.*]] = shufflevector <9 x double> zeroinitializer, <9 x double> [[TMP3]], <9 x i32> +; CHECK-NEXT: [[A_INS_8:%.*]] = insertelement <9 x double> [[A_INS_72]], double [[A_LANE_8]], i32 8 ; CHECK-NEXT: [[TMP4:%.*]] = shufflevector <9 x double> [[V_1]], <9 x double> poison, <8 x i32> ; CHECK-NEXT: [[TMP6:%.*]] = fmul <8 x double> [[TMP4]], [[TMP5]] ; CHECK-NEXT: [[B_LANE_8:%.*]] = fmul double [[V1_LANE_4]], [[V2_LANE_2]] ; CHECK-NEXT: [[TMP7:%.*]] = shufflevector <8 x double> [[TMP6]], <8 x double> poison, <9 x i32> -; CHECK-NEXT: [[B_INS_8:%.*]] = insertelement <9 x double> [[TMP7]], double [[B_LANE_8]], i32 8 +; CHECK-NEXT: [[B_INS_71:%.*]] = 
shufflevector <9 x double> zeroinitializer, <9 x double> [[TMP7]], <9 x i32> +; CHECK-NEXT: [[B_INS_8:%.*]] = insertelement <9 x double> [[B_INS_71]], double [[B_LANE_8]], i32 8 ; CHECK-NEXT: [[RES:%.*]] = fsub <9 x double> [[A_INS_8]], [[B_INS_8]] ; CHECK-NEXT: store <9 x double> [[RES]], ptr [[PTR_1]], align 8 ; CHECK-NEXT: ret void @@ -527,7 +537,7 @@ bb: %a.lane.7 = fmul double %v1.lane.0, %v2.lane.1 %a.lane.8 = fmul double %v1.lane.2, %v2.lane.0 - %a.ins.0 = insertelement <9 x double> undef, double %a.lane.0, i32 0 + %a.ins.0 = insertelement <9 x double> zeroinitializer, double %a.lane.0, i32 0 %a.ins.1 = insertelement <9 x double> %a.ins.0, double %a.lane.1, i32 1 %a.ins.2 = insertelement <9 x double> %a.ins.1, double %a.lane.2, i32 2 %a.ins.3 = insertelement <9 x double> %a.ins.2, double %a.lane.3, i32 3 @@ -547,7 +557,7 @@ bb: %b.lane.7 = fmul double %v1.lane.5, %v2.lane.0 %b.lane.8 = fmul double %v1.lane.4, %v2.lane.2 - %b.ins.0 = insertelement <9 x double> undef, double %b.lane.0, i32 0 + %b.ins.0 = insertelement <9 x double> zeroinitializer, double %b.lane.0, i32 0 %b.ins.1 = insertelement <9 x double> %b.ins.0, double %b.lane.1, i32 1 %b.ins.2 = insertelement <9 x double> %b.ins.1, double %b.lane.2, i32 2 %b.ins.3 = insertelement <9 x double> %b.ins.2, double %b.lane.3, i32 3 diff --git a/llvm/test/Transforms/SLPVectorizer/InstructionsState-is-invalid-2.ll b/llvm/test/Transforms/SLPVectorizer/InstructionsState-is-invalid-2.ll new file mode 100644 index 0000000000000..445fd81bb234d --- /dev/null +++ b/llvm/test/Transforms/SLPVectorizer/InstructionsState-is-invalid-2.ll @@ -0,0 +1,17 @@ +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py +; RUN: opt -passes=slp-vectorizer -S %s | FileCheck %s + +define i32 @test(i32 %minlib) { +; CHECK-LABEL: @test( +; CHECK-NEXT: entry: +; CHECK-NEXT: [[MUL2_I306:%.*]] = mul i32 [[MINLIB:%.*]], [[MINLIB]] +; CHECK-NEXT: [[MUL3_I307:%.*]] = mul i32 [[MUL2_I306]], [[MINLIB]] +; CHECK-NEXT: 
[[CMP183:%.*]] = icmp sgt i32 [[MUL3_I307]], 0 +; CHECK-NEXT: ret i32 0 +; +entry: + %mul2.i306 = mul i32 %minlib, %minlib + %mul3.i307 = mul i32 %mul2.i306, %minlib + %cmp183 = icmp sgt i32 %mul3.i307, 0 + ret i32 0 +} diff --git a/llvm/test/Transforms/SLPVectorizer/X86/InstructionsState-is-invalid-1.ll b/llvm/test/Transforms/SLPVectorizer/X86/InstructionsState-is-invalid-1.ll new file mode 100644 index 0000000000000..cade023300063 --- /dev/null +++ b/llvm/test/Transforms/SLPVectorizer/X86/InstructionsState-is-invalid-1.ll @@ -0,0 +1,25 @@ +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py +; RUN: opt -passes=slp-vectorizer -S %s | FileCheck %s + +target datalayout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-i128:128-f80:128-n8:16:32:64-S128" +target triple = "x86_64-unknown-linux-gnu" + +define void @_Z4blurN6Halide5Tools5ImageItEE(i1 %0, i1 %1, i1 %ident.check, i1 %ident.check56) { +; CHECK-LABEL: @_Z4blurN6Halide5Tools5ImageItEE( +; CHECK-NEXT: entry: +; CHECK-NEXT: [[TMP2:%.*]] = or i1 [[TMP0:%.*]], [[TMP1:%.*]] +; CHECK-NEXT: [[TMP3:%.*]] = or i1 [[IDENT_CHECK:%.*]], [[IDENT_CHECK56:%.*]] +; CHECK-NEXT: [[TMP4:%.*]] = or i1 [[TMP3]], [[TMP2]] +; CHECK-NEXT: br i1 [[TMP4]], label [[FOR_BODY6_US_I_I:%.*]], label [[FOR_BODY6_US_I_I]] +; CHECK: for.body6.us.i.i: +; CHECK-NEXT: ret void +; +entry: + %2 = or i1 %0, %1 + %3 = or i1 %ident.check, %ident.check56 + %4 = or i1 %3, %2 + br i1 %4, label %for.body6.us.i.i, label %for.body6.us.i.i + +for.body6.us.i.i: ; preds = %entry, %entry + ret void +} diff --git a/llvm/test/Transforms/SLPVectorizer/X86/hadd.ll b/llvm/test/Transforms/SLPVectorizer/X86/hadd.ll index e30f84e4f17b6..b83d35541bbae 100644 --- a/llvm/test/Transforms/SLPVectorizer/X86/hadd.ll +++ b/llvm/test/Transforms/SLPVectorizer/X86/hadd.ll @@ -23,7 +23,7 @@ define <2 x double> @test_v2f64(<2 x double> %a, <2 x double> %b) { %b1 = extractelement <2 x double> %b, i32 1 %r0 = fadd double %a0, %a1 %r1 = fadd double %b0, %b1 - %r00 
= insertelement <2 x double> undef, double %r0, i32 0 + %r00 = insertelement <2 x double> zeroinitializer, double %r0, i32 0 %r01 = insertelement <2 x double> %r00, double %r1, i32 1 ret <2 x double> %r01 } @@ -47,7 +47,7 @@ define <4 x float> @test_v4f32(<4 x float> %a, <4 x float> %b) { %r1 = fadd float %a2, %a3 %r2 = fadd float %b0, %b1 %r3 = fadd float %b2, %b3 - %r00 = insertelement <4 x float> undef, float %r0, i32 0 + %r00 = insertelement <4 x float> zeroinitializer, float %r0, i32 0 %r01 = insertelement <4 x float> %r00, float %r1, i32 1 %r02 = insertelement <4 x float> %r01, float %r2, i32 2 %r03 = insertelement <4 x float> %r02, float %r3, i32 3 @@ -67,7 +67,7 @@ define <2 x i64> @test_v2i64(<2 x i64> %a, <2 x i64> %b) { %b1 = extractelement <2 x i64> %b, i32 1 %r0 = add i64 %a0, %a1 %r1 = add i64 %b0, %b1 - %r00 = insertelement <2 x i64> undef, i64 %r0, i32 0 + %r00 = insertelement <2 x i64> zeroinitializer, i64 %r0, i32 0 %r01 = insertelement <2 x i64> %r00, i64 %r1, i32 1 ret <2 x i64> %r01 } @@ -91,7 +91,7 @@ define <4 x i32> @test_v4i32(<4 x i32> %a, <4 x i32> %b) { %r1 = add i32 %a2, %a3 %r2 = add i32 %b0, %b1 %r3 = add i32 %b2, %b3 - %r00 = insertelement <4 x i32> undef, i32 %r0, i32 0 + %r00 = insertelement <4 x i32> zeroinitializer, i32 %r0, i32 0 %r01 = insertelement <4 x i32> %r00, i32 %r1, i32 1 %r02 = insertelement <4 x i32> %r01, i32 %r2, i32 2 %r03 = insertelement <4 x i32> %r02, i32 %r3, i32 3 @@ -129,7 +129,7 @@ define <8 x i16> @test_v8i16(<8 x i16> %a, <8 x i16> %b) { %r5 = add i16 %b2, %b3 %r6 = add i16 %b4, %b5 %r7 = add i16 %b6, %b7 - %r00 = insertelement <8 x i16> undef, i16 %r0, i32 0 + %r00 = insertelement <8 x i16> zeroinitializer, i16 %r0, i32 0 %r01 = insertelement <8 x i16> %r00, i16 %r1, i32 1 %r02 = insertelement <8 x i16> %r01, i16 %r2, i32 2 %r03 = insertelement <8 x i16> %r02, i16 %r3, i32 3 @@ -204,7 +204,7 @@ define <4 x double> @test_v4f64(<4 x double> %a, <4 x double> %b) { %r1 = fadd double %b0, %b1 %r2 = fadd double 
%a2, %a3 %r3 = fadd double %b2, %b3 - %r00 = insertelement <4 x double> undef, double %r0, i32 0 + %r00 = insertelement <4 x double> zeroinitializer, double %r0, i32 0 %r01 = insertelement <4 x double> %r00, double %r1, i32 1 %r02 = insertelement <4 x double> %r01, double %r2, i32 2 %r03 = insertelement <4 x double> %r02, double %r3, i32 3 @@ -213,16 +213,41 @@ define <4 x double> @test_v4f64(<4 x double> %a, <4 x double> %b) { ; PR50392 define <4 x double> @test_v4f64_partial_swizzle(<4 x double> %a, <4 x double> %b) { -; CHECK-LABEL: @test_v4f64_partial_swizzle( -; CHECK-NEXT: [[B2:%.*]] = extractelement <4 x double> [[B:%.*]], i64 2 -; CHECK-NEXT: [[B3:%.*]] = extractelement <4 x double> [[B]], i64 3 -; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <4 x double> [[A:%.*]], <4 x double> [[B]], <2 x i32> -; CHECK-NEXT: [[TMP2:%.*]] = shufflevector <4 x double> [[A]], <4 x double> [[B]], <2 x i32> -; CHECK-NEXT: [[TMP3:%.*]] = fadd <2 x double> [[TMP1]], [[TMP2]] -; CHECK-NEXT: [[R3:%.*]] = fadd double [[B2]], [[B3]] -; CHECK-NEXT: [[TMP4:%.*]] = shufflevector <2 x double> [[TMP3]], <2 x double> , <4 x i32> -; CHECK-NEXT: [[R03:%.*]] = insertelement <4 x double> [[TMP4]], double [[R3]], i64 3 -; CHECK-NEXT: ret <4 x double> [[R03]] +; SSE-LABEL: @test_v4f64_partial_swizzle( +; SSE-NEXT: [[B2:%.*]] = extractelement <4 x double> [[B:%.*]], i64 2 +; SSE-NEXT: [[B3:%.*]] = extractelement <4 x double> [[B]], i64 3 +; SSE-NEXT: [[TMP1:%.*]] = shufflevector <4 x double> [[A:%.*]], <4 x double> [[B]], <2 x i32> +; SSE-NEXT: [[TMP2:%.*]] = shufflevector <4 x double> [[A]], <4 x double> [[B]], <2 x i32> +; SSE-NEXT: [[TMP3:%.*]] = fadd <2 x double> [[TMP1]], [[TMP2]] +; SSE-NEXT: [[R3:%.*]] = fadd double [[B2]], [[B3]] +; SSE-NEXT: [[TMP4:%.*]] = shufflevector <2 x double> [[TMP3]], <2 x double> poison, <4 x i32> +; SSE-NEXT: [[R0212:%.*]] = insertelement <4 x double> [[TMP4]], double 0.000000e+00, i64 1 +; SSE-NEXT: [[R03:%.*]] = insertelement <4 x double> [[R0212]], double 
[[R3]], i64 3 +; SSE-NEXT: ret <4 x double> [[R03]] +; +; SLM-LABEL: @test_v4f64_partial_swizzle( +; SLM-NEXT: [[A0:%.*]] = extractelement <4 x double> [[A:%.*]], i64 0 +; SLM-NEXT: [[A1:%.*]] = extractelement <4 x double> [[A]], i64 1 +; SLM-NEXT: [[R0:%.*]] = fadd double [[A0]], [[A1]] +; SLM-NEXT: [[TMP1:%.*]] = shufflevector <4 x double> [[B:%.*]], <4 x double> poison, <2 x i32> +; SLM-NEXT: [[TMP2:%.*]] = shufflevector <4 x double> [[B]], <4 x double> poison, <2 x i32> +; SLM-NEXT: [[TMP3:%.*]] = fadd <2 x double> [[TMP1]], [[TMP2]] +; SLM-NEXT: [[R00:%.*]] = insertelement <4 x double> , double [[R0]], i64 0 +; SLM-NEXT: [[TMP4:%.*]] = shufflevector <2 x double> [[TMP3]], <2 x double> poison, <4 x i32> +; SLM-NEXT: [[R031:%.*]] = shufflevector <4 x double> [[R00]], <4 x double> [[TMP4]], <4 x i32> +; SLM-NEXT: ret <4 x double> [[R031]] +; +; AVX-LABEL: @test_v4f64_partial_swizzle( +; AVX-NEXT: [[B2:%.*]] = extractelement <4 x double> [[B:%.*]], i64 2 +; AVX-NEXT: [[B3:%.*]] = extractelement <4 x double> [[B]], i64 3 +; AVX-NEXT: [[TMP1:%.*]] = shufflevector <4 x double> [[A:%.*]], <4 x double> [[B]], <2 x i32> +; AVX-NEXT: [[TMP2:%.*]] = shufflevector <4 x double> [[A]], <4 x double> [[B]], <2 x i32> +; AVX-NEXT: [[TMP3:%.*]] = fadd <2 x double> [[TMP1]], [[TMP2]] +; AVX-NEXT: [[R3:%.*]] = fadd double [[B2]], [[B3]] +; AVX-NEXT: [[TMP4:%.*]] = shufflevector <2 x double> [[TMP3]], <2 x double> poison, <4 x i32> +; AVX-NEXT: [[R0212:%.*]] = insertelement <4 x double> [[TMP4]], double 0.000000e+00, i64 1 +; AVX-NEXT: [[R03:%.*]] = insertelement <4 x double> [[R0212]], double [[R3]], i64 3 +; AVX-NEXT: ret <4 x double> [[R03]] ; %a0 = extractelement <4 x double> %a, i64 0 %a1 = extractelement <4 x double> %a, i64 1 @@ -233,7 +258,7 @@ define <4 x double> @test_v4f64_partial_swizzle(<4 x double> %a, <4 x double> %b %r0 = fadd double %a0, %a1 %r2 = fadd double %b0, %b1 %r3 = fadd double %b2, %b3 - %r00 = insertelement <4 x double> undef, double %r0, i32 0 + %r00 = 
insertelement <4 x double> zeroinitializer, double %r0, i32 0 %r02 = insertelement <4 x double> %r00, double %r2, i32 2 %r03 = insertelement <4 x double> %r02, double %r3, i32 3 ret <4 x double> %r03 @@ -290,7 +315,7 @@ define <8 x float> @test_v8f32(<8 x float> %a, <8 x float> %b) { %r5 = fadd float %a6, %a7 %r6 = fadd float %b4, %b5 %r7 = fadd float %b6, %b7 - %r00 = insertelement <8 x float> undef, float %r0, i32 0 + %r00 = insertelement <8 x float> zeroinitializer, float %r0, i32 0 %r01 = insertelement <8 x float> %r00, float %r1, i32 1 %r02 = insertelement <8 x float> %r01, float %r2, i32 2 %r03 = insertelement <8 x float> %r02, float %r3, i32 3 @@ -340,7 +365,7 @@ define <4 x i64> @test_v4i64(<4 x i64> %a, <4 x i64> %b) { %r1 = add i64 %b0, %b1 %r2 = add i64 %a2, %a3 %r3 = add i64 %b2, %b3 - %r00 = insertelement <4 x i64> undef, i64 %r0, i32 0 + %r00 = insertelement <4 x i64> zeroinitializer, i64 %r0, i32 0 %r01 = insertelement <4 x i64> %r00, i64 %r1, i32 1 %r02 = insertelement <4 x i64> %r01, i64 %r2, i32 2 %r03 = insertelement <4 x i64> %r02, i64 %r3, i32 3 @@ -398,7 +423,7 @@ define <8 x i32> @test_v8i32(<8 x i32> %a, <8 x i32> %b) { %r5 = add i32 %a6, %a7 %r6 = add i32 %b4, %b5 %r7 = add i32 %b6, %b7 - %r00 = insertelement <8 x i32> undef, i32 %r0, i32 0 + %r00 = insertelement <8 x i32> zeroinitializer, i32 %r0, i32 0 %r01 = insertelement <8 x i32> %r00, i32 %r1, i32 1 %r02 = insertelement <8 x i32> %r01, i32 %r2, i32 2 %r03 = insertelement <8 x i32> %r02, i32 %r3, i32 3 @@ -484,7 +509,7 @@ define <16 x i16> @test_v16i16(<16 x i16> %a, <16 x i16> %b) { %r13 = add i16 %b10, %b11 %r14 = add i16 %b12, %b13 %r15 = add i16 %b14, %b15 - %rv0 = insertelement <16 x i16> undef, i16 %r0 , i32 0 + %rv0 = insertelement <16 x i16> zeroinitializer, i16 %r0 , i32 0 %rv1 = insertelement <16 x i16> %rv0 , i16 %r1 , i32 1 %rv2 = insertelement <16 x i16> %rv1 , i16 %r2 , i32 2 %rv3 = insertelement <16 x i16> %rv2 , i16 %r3 , i32 3 diff --git 
a/llvm/test/Transforms/SLPVectorizer/X86/sitofp.ll b/llvm/test/Transforms/SLPVectorizer/X86/sitofp.ll index 1c56eb2f2ce36..ad0027330868c 100644 --- a/llvm/test/Transforms/SLPVectorizer/X86/sitofp.ll +++ b/llvm/test/Transforms/SLPVectorizer/X86/sitofp.ll @@ -1093,8 +1093,9 @@ define <4 x double> @sitofp_4xi32_4f64(i32 %a0, i32 %a1, i32 %a2, i32 %a3) #0 { ; SSE-NEXT: [[TMP5:%.*]] = insertelement <2 x i32> [[TMP4]], i32 [[A3:%.*]], i32 1 ; SSE-NEXT: [[TMP6:%.*]] = sitofp <2 x i32> [[TMP5]] to <2 x double> ; SSE-NEXT: [[TMP7:%.*]] = shufflevector <2 x double> [[TMP3]], <2 x double> poison, <4 x i32> +; SSE-NEXT: [[RES11:%.*]] = shufflevector <4 x double> zeroinitializer, <4 x double> [[TMP7]], <4 x i32> ; SSE-NEXT: [[TMP8:%.*]] = shufflevector <2 x double> [[TMP6]], <2 x double> poison, <4 x i32> -; SSE-NEXT: [[RES31:%.*]] = shufflevector <4 x double> [[TMP7]], <4 x double> [[TMP8]], <4 x i32> +; SSE-NEXT: [[RES31:%.*]] = shufflevector <4 x double> [[RES11]], <4 x double> [[TMP8]], <4 x i32> ; SSE-NEXT: ret <4 x double> [[RES31]] ; ; AVX-LABEL: @sitofp_4xi32_4f64( @@ -1109,7 +1110,7 @@ define <4 x double> @sitofp_4xi32_4f64(i32 %a0, i32 %a1, i32 %a2, i32 %a3) #0 { %cvt1 = sitofp i32 %a1 to double %cvt2 = sitofp i32 %a2 to double %cvt3 = sitofp i32 %a3 to double - %res0 = insertelement <4 x double> undef, double %cvt0, i32 0 + %res0 = insertelement <4 x double> zeroinitializer, double %cvt0, i32 0 %res1 = insertelement <4 x double> %res0, double %cvt1, i32 1 %res2 = insertelement <4 x double> %res1, double %cvt2, i32 2 %res3 = insertelement <4 x double> %res2, double %cvt3, i32 3 @@ -1121,30 +1122,31 @@ define <4 x double> @sitofp_with_const_4xi32_4f64(i32 %a2, i32 %a3) #0 { ; CHECK-NEXT: [[TMP1:%.*]] = insertelement <2 x i32> poison, i32 [[A2:%.*]], i32 0 ; CHECK-NEXT: [[TMP2:%.*]] = insertelement <2 x i32> [[TMP1]], i32 [[A3:%.*]], i32 1 ; CHECK-NEXT: [[TMP3:%.*]] = sitofp <2 x i32> [[TMP2]] to <2 x double> -; CHECK-NEXT: [[RES0:%.*]] = insertelement <4 x double> 
undef, double 1.000000e+00, i32 3 -; CHECK-NEXT: [[TMP4:%.*]] = shufflevector <2 x double> [[TMP3]], <2 x double> undef, <4 x i32> +; CHECK-NEXT: [[RES0:%.*]] = insertelement <4 x double> zeroinitializer, double 1.000000e+00, i32 3 +; CHECK-NEXT: [[TMP4:%.*]] = shufflevector <2 x double> [[TMP3]], <2 x double> poison, <4 x i32> ; CHECK-NEXT: [[RES31:%.*]] = shufflevector <4 x double> [[RES0]], <4 x double> [[TMP4]], <4 x i32> ; CHECK-NEXT: ret <4 x double> [[RES31]] ; %cvt2 = sitofp i32 %a2 to double %cvt3 = sitofp i32 %a3 to double - %res0 = insertelement <4 x double> undef, double 1.0, i32 3 + %res0 = insertelement <4 x double> zeroinitializer, double 1.0, i32 3 %res2 = insertelement <4 x double> %res0, double %cvt2, i32 0 %res3 = insertelement <4 x double> %res2, double %cvt3, i32 1 ret <4 x double> %res3 } -define <4 x double> @sitofp_with_undef_4xi32_4f64(i32 %a2, i32 %a3) #0 { -; CHECK-LABEL: @sitofp_with_undef_4xi32_4f64( +define <4 x double> @sitofp_with_zeroinitializer_4xi32_4f64(i32 %a2, i32 %a3) #0 { +; CHECK-LABEL: @sitofp_with_zeroinitializer_4xi32_4f64( ; CHECK-NEXT: [[TMP1:%.*]] = insertelement <2 x i32> poison, i32 [[A2:%.*]], i32 0 ; CHECK-NEXT: [[TMP2:%.*]] = insertelement <2 x i32> [[TMP1]], i32 [[A3:%.*]], i32 1 ; CHECK-NEXT: [[TMP3:%.*]] = sitofp <2 x i32> [[TMP2]] to <2 x double> -; CHECK-NEXT: [[TMP4:%.*]] = shufflevector <2 x double> [[TMP3]], <2 x double> undef, <4 x i32> +; CHECK-NEXT: [[TMP5:%.*]] = shufflevector <2 x double> [[TMP3]], <2 x double> poison, <4 x i32> +; CHECK-NEXT: [[TMP4:%.*]] = shufflevector <4 x double> zeroinitializer, <4 x double> [[TMP5]], <4 x i32> ; CHECK-NEXT: ret <4 x double> [[TMP4]] ; %cvt2 = sitofp i32 %a2 to double %cvt3 = sitofp i32 %a3 to double - %res2 = insertelement <4 x double> undef, double %cvt2, i32 0 + %res2 = insertelement <4 x double> zeroinitializer, double %cvt2, i32 0 %res3 = insertelement <4 x double> %res2, double %cvt3, i32 1 ret <4 x double> %res3 } @@ -1162,7 +1164,7 @@ define <4 x float> 
@sitofp_4xi32_4f32(i32 %a0, i32 %a1, i32 %a2, i32 %a3) #0 { %cvt1 = sitofp i32 %a1 to float %cvt2 = sitofp i32 %a2 to float %cvt3 = sitofp i32 %a3 to float - %res0 = insertelement <4 x float> undef, float %cvt0, i32 0 + %res0 = insertelement <4 x float> zeroinitializer, float %cvt0, i32 0 %res1 = insertelement <4 x float> %res0, float %cvt1, i32 1 %res2 = insertelement <4 x float> %res1, float %cvt2, i32 2 %res3 = insertelement <4 x float> %res2, float %cvt3, i32 3 diff --git a/llvm/test/Transforms/SLPVectorizer/X86/value-bug.ll b/llvm/test/Transforms/SLPVectorizer/X86/value-bug.ll index c5cdcdc1eb1a5..3c34abcdd36a6 100644 --- a/llvm/test/Transforms/SLPVectorizer/X86/value-bug.ll +++ b/llvm/test/Transforms/SLPVectorizer/X86/value-bug.ll @@ -13,27 +13,27 @@ define void @test(i1 %arg) { ; CHECK-NEXT: bb279: ; CHECK-NEXT: br label [[BB283:%.*]] ; CHECK: bb283: -; CHECK-NEXT: [[TMP0:%.*]] = phi <2 x float> [ undef, [[BB279:%.*]] ], [ [[TMP11:%.*]], [[EXIT:%.*]] ] -; CHECK-NEXT: [[TMP1:%.*]] = phi <2 x float> [ undef, [[BB279]] ], [ undef, [[EXIT]] ] +; CHECK-NEXT: [[TMP0:%.*]] = phi <2 x float> [ zeroinitializer, [[BB279:%.*]] ], [ [[TMP11:%.*]], [[EXIT:%.*]] ] +; CHECK-NEXT: [[TMP1:%.*]] = phi <2 x float> [ zeroinitializer, [[BB279]] ], [ zeroinitializer, [[EXIT]] ] ; CHECK-NEXT: br label [[BB284:%.*]] ; CHECK: bb284: ; CHECK-NEXT: [[TMP2:%.*]] = fpext <2 x float> [[TMP0]] to <2 x double> -; CHECK-NEXT: [[TMP3:%.*]] = fsub <2 x double> [[TMP2]], undef -; CHECK-NEXT: [[TMP4:%.*]] = fsub <2 x double> [[TMP3]], undef +; CHECK-NEXT: [[TMP3:%.*]] = fsub <2 x double> [[TMP2]], zeroinitializer +; CHECK-NEXT: [[TMP4:%.*]] = fsub <2 x double> [[TMP3]], zeroinitializer ; CHECK-NEXT: br label [[BB21_I:%.*]] ; CHECK: bb21.i: -; CHECK-NEXT: br i1 %arg, label [[BB22_I:%.*]], label [[EXIT]] +; CHECK-NEXT: br i1 [[ARG:%.*]], label [[BB22_I:%.*]], label [[EXIT]] ; CHECK: bb22.i: -; CHECK-NEXT: [[TMP5:%.*]] = fadd <2 x double> undef, [[TMP4]] +; CHECK-NEXT: [[TMP5:%.*]] = fadd <2 x 
double> zeroinitializer, [[TMP4]] ; CHECK-NEXT: br label [[BB32_I:%.*]] ; CHECK: bb32.i: ; CHECK-NEXT: [[TMP6:%.*]] = phi <2 x double> [ [[TMP5]], [[BB22_I]] ], [ zeroinitializer, [[BB32_I]] ] -; CHECK-NEXT: br i1 %arg, label [[BB32_I]], label [[BB21_I]] +; CHECK-NEXT: br i1 [[ARG]], label [[BB32_I]], label [[BB21_I]] ; CHECK: exit: ; CHECK-NEXT: [[TMP7:%.*]] = fpext <2 x float> [[TMP1]] to <2 x double> -; CHECK-NEXT: [[TMP8:%.*]] = fmul <2 x double> [[TMP7]], -; CHECK-NEXT: [[TMP9:%.*]] = fadd <2 x double> undef, [[TMP8]] -; CHECK-NEXT: [[TMP10:%.*]] = fadd <2 x double> [[TMP9]], undef +; CHECK-NEXT: [[TMP8:%.*]] = fmul <2 x double> [[TMP7]], zeroinitializer +; CHECK-NEXT: [[TMP9:%.*]] = fadd <2 x double> zeroinitializer, [[TMP8]] +; CHECK-NEXT: [[TMP10:%.*]] = fadd <2 x double> [[TMP9]], zeroinitializer ; CHECK-NEXT: [[TMP11]] = fptrunc <2 x double> [[TMP10]] to <2 x float> ; CHECK-NEXT: br label [[BB283]] ; @@ -41,27 +41,27 @@ bb279: br label %bb283 bb283: - %Av.sroa.8.0 = phi float [ undef, %bb279 ], [ %tmp315, %exit ] - %Av.sroa.5.0 = phi float [ undef, %bb279 ], [ %tmp319, %exit ] - %Av.sroa.3.0 = phi float [ undef, %bb279 ], [ %tmp307, %exit ] - %Av.sroa.0.0 = phi float [ undef, %bb279 ], [ %tmp317, %exit ] + %Av.sroa.8.0 = phi float [ zeroinitializer, %bb279 ], [ %tmp315, %exit ] + %Av.sroa.5.0 = phi float [ zeroinitializer, %bb279 ], [ %tmp319, %exit ] + %Av.sroa.3.0 = phi float [ zeroinitializer, %bb279 ], [ %tmp307, %exit ] + %Av.sroa.0.0 = phi float [ zeroinitializer, %bb279 ], [ %tmp317, %exit ] br label %bb284 bb284: %tmp7.i = fpext float %Av.sroa.3.0 to double - %tmp8.i = fsub double %tmp7.i, undef - %tmp9.i = fsub double %tmp8.i, undef + %tmp8.i = fsub double %tmp7.i, zeroinitializer + %tmp9.i = fsub double %tmp8.i, zeroinitializer %tmp17.i = fpext float %Av.sroa.8.0 to double - %tmp19.i = fsub double %tmp17.i, undef - %tmp20.i = fsub double %tmp19.i, undef + %tmp19.i = fsub double %tmp17.i, zeroinitializer + %tmp20.i = fsub double %tmp19.i, 
zeroinitializer br label %bb21.i bb21.i: br i1 %arg, label %bb22.i, label %exit bb22.i: - %tmp24.i = fadd double undef, %tmp9.i - %tmp26.i = fadd double undef, %tmp20.i + %tmp24.i = fadd double zeroinitializer, %tmp9.i + %tmp26.i = fadd double zeroinitializer, %tmp20.i br label %bb32.i bb32.i: @@ -71,17 +71,17 @@ bb32.i: exit: %tmp303 = fpext float %Av.sroa.0.0 to double - %tmp304 = fmul double %tmp303, undef - %tmp305 = fadd double undef, %tmp304 - %tmp306 = fadd double %tmp305, undef + %tmp304 = fmul double %tmp303, zeroinitializer + %tmp305 = fadd double zeroinitializer, %tmp304 + %tmp306 = fadd double %tmp305, zeroinitializer %tmp307 = fptrunc double %tmp306 to float %tmp311 = fpext float %Av.sroa.5.0 to double %tmp312 = fmul double %tmp311, 0.000000e+00 - %tmp313 = fadd double undef, %tmp312 - %tmp314 = fadd double %tmp313, undef + %tmp313 = fadd double zeroinitializer, %tmp312 + %tmp314 = fadd double %tmp313, zeroinitializer %tmp315 = fptrunc double %tmp314 to float - %tmp317 = fptrunc double undef to float - %tmp319 = fptrunc double undef to float + %tmp317 = fptrunc double zeroinitializer to float + %tmp319 = fptrunc double zeroinitializer to float br label %bb283 } @@ -91,13 +91,13 @@ exit: define <4 x double> @constant_folding() { ; CHECK-LABEL: @constant_folding( ; CHECK-NEXT: entry: -; CHECK-NEXT: ret <4 x double> +; CHECK-NEXT: ret <4 x double> ; entry: %t0 = fadd double 1.000000e+00 , 0.000000e+00 %t1 = fadd double 1.000000e+00 , 1.000000e+00 %t2 = fmul double %t0, 1.000000e+00 - %i1 = insertelement <4 x double> undef, double %t2, i32 1 + %i1 = insertelement <4 x double> zeroinitializer, double %t2, i32 1 %t3 = fmul double %t1, 1.000000e+00 %i2 = insertelement <4 x double> %i1, double %t3, i32 0 ret <4 x double> %i2 diff --git a/llvm/test/Transforms/SLPVectorizer/X86/vectorize-widest-phis.ll b/llvm/test/Transforms/SLPVectorizer/X86/vectorize-widest-phis.ll index a69849fabcef6..6a479174777b0 100644 --- 
a/llvm/test/Transforms/SLPVectorizer/X86/vectorize-widest-phis.ll +++ b/llvm/test/Transforms/SLPVectorizer/X86/vectorize-widest-phis.ll @@ -4,26 +4,27 @@ define void @foo(i1 %arg) { ; CHECK-LABEL: @foo( ; CHECK-NEXT: entry: -; CHECK-NEXT: [[CONV:%.*]] = uitofp i16 undef to float -; CHECK-NEXT: [[SUB:%.*]] = fsub float 6.553500e+04, undef +; CHECK-NEXT: [[CONV:%.*]] = uitofp i16 0 to float +; CHECK-NEXT: [[SUB:%.*]] = fsub float 6.553500e+04, 0.000000e+00 ; CHECK-NEXT: br label [[BB1:%.*]] ; CHECK: bb1: -; CHECK-NEXT: [[TMP0:%.*]] = insertelement <4 x float> , float [[SUB]], i32 0 +; CHECK-NEXT: [[TMP0:%.*]] = insertelement <4 x float> , float [[SUB]], i32 0 ; CHECK-NEXT: [[TMP1:%.*]] = insertelement <4 x float> [[TMP0]], float [[CONV]], i32 1 ; CHECK-NEXT: br label [[BB2:%.*]] ; CHECK: bb2: ; CHECK-NEXT: [[TMP2:%.*]] = phi <4 x float> [ [[TMP1]], [[BB1]] ], [ [[TMP14:%.*]], [[BB3:%.*]] ] -; CHECK-NEXT: [[TMP3:%.*]] = load double, ptr undef, align 8 -; CHECK-NEXT: br i1 %arg, label [[BB3]], label [[BB4:%.*]] +; CHECK-NEXT: [[TMP3:%.*]] = load double, ptr null, align 8 +; CHECK-NEXT: br i1 [[ARG:%.*]], label [[BB3]], label [[BB4:%.*]] ; CHECK: bb4: ; CHECK-NEXT: [[TMP4:%.*]] = fpext <4 x float> [[TMP2]] to <4 x double> -; CHECK-NEXT: [[CONV2:%.*]] = uitofp i16 undef to double -; CHECK-NEXT: [[TMP5:%.*]] = insertelement <2 x double> , double [[TMP3]], i32 1 -; CHECK-NEXT: [[TMP6:%.*]] = insertelement <2 x double> , double [[CONV2]], i32 1 +; CHECK-NEXT: [[CONV2:%.*]] = uitofp i16 0 to double +; CHECK-NEXT: [[TMP5:%.*]] = insertelement <2 x double> , double [[TMP3]], i32 1 +; CHECK-NEXT: [[TMP6:%.*]] = insertelement <2 x double> , double [[CONV2]], i32 1 ; CHECK-NEXT: [[TMP7:%.*]] = fsub <2 x double> [[TMP5]], [[TMP6]] ; CHECK-NEXT: [[TMP8:%.*]] = fadd <2 x double> [[TMP5]], [[TMP6]] ; CHECK-NEXT: [[TMP9:%.*]] = shufflevector <2 x double> [[TMP7]], <2 x double> [[TMP8]], <2 x i32> -; CHECK-NEXT: [[TMP10:%.*]] = shufflevector <2 x double> [[TMP9]], <2 x double> undef, 
<4 x i32> +; CHECK-NEXT: [[TMP15:%.*]] = shufflevector <2 x double> [[TMP9]], <2 x double> poison, <4 x i32> +; CHECK-NEXT: [[TMP10:%.*]] = shufflevector <4 x double> , <4 x double> [[TMP15]], <4 x i32> ; CHECK-NEXT: [[TMP11:%.*]] = fcmp ogt <4 x double> [[TMP10]], [[TMP4]] ; CHECK-NEXT: [[TMP12:%.*]] = fptrunc <4 x double> [[TMP10]] to <4 x float> ; CHECK-NEXT: [[TMP13:%.*]] = select <4 x i1> [[TMP11]], <4 x float> [[TMP2]], <4 x float> [[TMP12]] @@ -33,8 +34,8 @@ define void @foo(i1 %arg) { ; CHECK-NEXT: br label [[BB2]] ; entry: - %conv = uitofp i16 undef to float - %sub = fsub float 6.553500e+04, undef + %conv = uitofp i16 zeroinitializer to float + %sub = fsub float 6.553500e+04, zeroinitializer br label %bb1 bb1: @@ -43,28 +44,28 @@ bb1: bb2: %0 = phi float [ %sub, %bb1 ], [ %9, %bb3 ] %1 = phi float [ %conv, %bb1 ], [ %10, %bb3 ] - %2 = phi float [ undef, %bb1 ], [ %11, %bb3 ] - %3 = phi float [ undef, %bb1 ], [ %12, %bb3 ] - %4 = load double, ptr undef, align 8 + %2 = phi float [ zeroinitializer, %bb1 ], [ %11, %bb3 ] + %3 = phi float [ zeroinitializer, %bb1 ], [ %12, %bb3 ] + %4 = load double, ptr zeroinitializer, align 8 br i1 %arg, label %bb3, label %bb4 bb4: %ext = fpext float %3 to double - %cmp1 = fcmp ogt double undef, %ext - %5 = fptrunc double undef to float + %cmp1 = fcmp ogt double zeroinitializer, %ext + %5 = fptrunc double zeroinitializer to float %sel1 = select i1 %cmp1, float %3, float %5 %ext2 = fpext float %2 to double - %cmp2 = fcmp ogt double undef, %ext2 - %6 = fptrunc double undef to float + %cmp2 = fcmp ogt double zeroinitializer, %ext2 + %6 = fptrunc double zeroinitializer to float %sel2 = select i1 %cmp2, float %2, float %6 %ext3 = fpext float %1 to double - %conv2 = uitofp i16 undef to double + %conv2 = uitofp i16 zeroinitializer to double %add1 = fadd double %4, %conv2 %cmp3 = fcmp ogt double %add1, %ext3 %7 = fptrunc double %add1 to float %sel3 = select i1 %cmp3, float %1, float %7 %ext4 = fpext float %0 to double - %sub1 = fsub 
double undef, undef + %sub1 = fsub double zeroinitializer, zeroinitializer %cmp4 = fcmp ogt double %sub1, %ext4 %8 = fptrunc double %sub1 to float %sel4 = select i1 %cmp4, float %0, float %8 diff --git a/llvm/test/Transforms/SLPVectorizer/insert-element-build-vector-const-undef.ll b/llvm/test/Transforms/SLPVectorizer/insert-element-build-vector-const.ll similarity index 81% rename from llvm/test/Transforms/SLPVectorizer/insert-element-build-vector-const-undef.ll rename to llvm/test/Transforms/SLPVectorizer/insert-element-build-vector-const.ll index 48b5145622bdf..a0e3950e49117 100644 --- a/llvm/test/Transforms/SLPVectorizer/insert-element-build-vector-const-undef.ll +++ b/llvm/test/Transforms/SLPVectorizer/insert-element-build-vector-const.ll @@ -9,7 +9,8 @@ define <4 x float> @simple_select(<4 x float> %a, <4 x float> %b, <4 x i32> %c) ; CHECK-NEXT: [[TMP3:%.*]] = shufflevector <4 x float> [[A:%.*]], <4 x float> poison, <2 x i32> ; CHECK-NEXT: [[TMP4:%.*]] = shufflevector <4 x float> [[B:%.*]], <4 x float> poison, <2 x i32> ; CHECK-NEXT: [[TMP5:%.*]] = select <2 x i1> [[TMP2]], <2 x float> [[TMP3]], <2 x float> [[TMP4]] -; CHECK-NEXT: [[TMP6:%.*]] = shufflevector <2 x float> [[TMP5]], <2 x float> undef, <4 x i32> +; CHECK-NEXT: [[TMP7:%.*]] = shufflevector <2 x float> [[TMP5]], <2 x float> poison, <4 x i32> +; CHECK-NEXT: [[TMP6:%.*]] = shufflevector <4 x float> , <4 x float> [[TMP7]], <4 x i32> ; CHECK-NEXT: ret <4 x float> [[TMP6]] ; %c0 = extractelement <4 x i32> %c, i32 0 @@ -22,7 +23,7 @@ define <4 x float> @simple_select(<4 x float> %a, <4 x float> %b, <4 x i32> %c) %cmp1 = icmp ne i32 %c1, 0 %s0 = select i1 %cmp0, float %a0, float %b0 %s1 = select i1 %cmp1, float %a1, float %b1 - %ra = insertelement <4 x float> , float %s0, i32 0 + %ra = insertelement <4 x float> , float %s0, i32 0 %rb = insertelement <4 x float> %ra, float %s1, i32 1 ret <4 x float> %rb } diff --git a/llvm/test/Transforms/SLPVectorizer/insert-element-build-vector.ll 
b/llvm/test/Transforms/SLPVectorizer/insert-element-build-vector.ll index 8e3a941932c97..0b896f4b3a36a 100644 --- a/llvm/test/Transforms/SLPVectorizer/insert-element-build-vector.ll +++ b/llvm/test/Transforms/SLPVectorizer/insert-element-build-vector.ll @@ -32,7 +32,7 @@ define <4 x float> @simple_select(<4 x float> %a, <4 x float> %b, <4 x i32> %c) %s1 = select i1 %cmp1, float %a1, float %b1 %s2 = select i1 %cmp2, float %a2, float %b2 %s3 = select i1 %cmp3, float %a3, float %b3 - %ra = insertelement <4 x float> undef, float %s0, i32 0 + %ra = insertelement <4 x float> zeroinitializer, float %s0, i32 0 %rb = insertelement <4 x float> %ra, float %s1, i32 1 %rc = insertelement <4 x float> %rb, float %s2, i32 2 %rd = insertelement <4 x float> %rc, float %s3, i32 3 @@ -43,7 +43,8 @@ define <8 x float> @simple_select2(<4 x float> %a, <4 x float> %b, <4 x i32> %c) ; CHECK-LABEL: @simple_select2( ; CHECK-NEXT: [[TMP1:%.*]] = icmp ne <4 x i32> [[C:%.*]], zeroinitializer ; CHECK-NEXT: [[TMP2:%.*]] = select <4 x i1> [[TMP1]], <4 x float> [[A:%.*]], <4 x float> [[B:%.*]] -; CHECK-NEXT: [[TMP3:%.*]] = shufflevector <4 x float> [[TMP2]], <4 x float> undef, <8 x i32> +; CHECK-NEXT: [[TMP4:%.*]] = shufflevector <4 x float> [[TMP2]], <4 x float> poison, <8 x i32> +; CHECK-NEXT: [[TMP3:%.*]] = shufflevector <8 x float> zeroinitializer, <8 x float> [[TMP4]], <8 x i32> ; CHECK-NEXT: ret <8 x float> [[TMP3]] ; %c0 = extractelement <4 x i32> %c, i32 0 @@ -66,7 +67,7 @@ define <8 x float> @simple_select2(<4 x float> %a, <4 x float> %b, <4 x i32> %c) %s1 = select i1 %cmp1, float %a1, float %b1 %s2 = select i1 %cmp2, float %a2, float %b2 %s3 = select i1 %cmp3, float %a3, float %b3 - %ra = insertelement <8 x float> undef, float %s0, i32 0 + %ra = insertelement <8 x float> zeroinitializer, float %s0, i32 0 %rb = insertelement <8 x float> %ra, float %s1, i32 2 %rc = insertelement <8 x float> %rb, float %s2, i32 4 %rd = insertelement <8 x float> %rc, float %s3, i32 7 @@ -98,7 +99,7 @@ define 
<4 x float> @simple_select_eph(<4 x float> %a, <4 x float> %b, <4 x i32> ; THRESHOLD-NEXT: [[S1:%.*]] = select i1 [[CMP1]], float [[A1]], float [[B1]] ; THRESHOLD-NEXT: [[S2:%.*]] = select i1 [[CMP2]], float [[A2]], float [[B2]] ; THRESHOLD-NEXT: [[S3:%.*]] = select i1 [[CMP3]], float [[A3]], float [[B3]] -; THRESHOLD-NEXT: [[RA:%.*]] = insertelement <4 x float> undef, float [[S0]], i32 0 +; THRESHOLD-NEXT: [[RA:%.*]] = insertelement <4 x float> zeroinitializer, float [[S0]], i32 0 ; THRESHOLD-NEXT: [[RB:%.*]] = insertelement <4 x float> [[RA]], float [[S1]], i32 1 ; THRESHOLD-NEXT: [[RC:%.*]] = insertelement <4 x float> [[RB]], float [[S2]], i32 2 ; THRESHOLD-NEXT: [[RD:%.*]] = insertelement <4 x float> [[RC]], float [[S3]], i32 3 @@ -113,7 +114,7 @@ define <4 x float> @simple_select_eph(<4 x float> %a, <4 x float> %b, <4 x i32> ; THRESHOLD-NEXT: [[Q6:%.*]] = fadd float [[Q4]], [[Q5]] ; THRESHOLD-NEXT: [[QI:%.*]] = fcmp olt float [[Q6]], [[Q5]] ; THRESHOLD-NEXT: call void @llvm.assume(i1 [[QI]]) -; THRESHOLD-NEXT: ret <4 x float> undef +; THRESHOLD-NEXT: ret <4 x float> zeroinitializer ; ; NOTHRESHOLD-LABEL: @simple_select_eph( ; NOTHRESHOLD-NEXT: [[C0:%.*]] = extractelement <4 x i32> [[C:%.*]], i32 0 @@ -136,7 +137,7 @@ define <4 x float> @simple_select_eph(<4 x float> %a, <4 x float> %b, <4 x i32> ; NOTHRESHOLD-NEXT: [[S1:%.*]] = select i1 [[CMP1]], float [[A1]], float [[B1]] ; NOTHRESHOLD-NEXT: [[S2:%.*]] = select i1 [[CMP2]], float [[A2]], float [[B2]] ; NOTHRESHOLD-NEXT: [[S3:%.*]] = select i1 [[CMP3]], float [[A3]], float [[B3]] -; NOTHRESHOLD-NEXT: [[RA:%.*]] = insertelement <4 x float> undef, float [[S0]], i32 0 +; NOTHRESHOLD-NEXT: [[RA:%.*]] = insertelement <4 x float> zeroinitializer, float [[S0]], i32 0 ; NOTHRESHOLD-NEXT: [[RB:%.*]] = insertelement <4 x float> [[RA]], float [[S1]], i32 1 ; NOTHRESHOLD-NEXT: [[RC:%.*]] = insertelement <4 x float> [[RB]], float [[S2]], i32 2 ; NOTHRESHOLD-NEXT: [[RD:%.*]] = insertelement <4 x float> [[RC]], float 
[[S3]], i32 3 @@ -149,7 +150,7 @@ define <4 x float> @simple_select_eph(<4 x float> %a, <4 x float> %b, <4 x i32> ; NOTHRESHOLD-NEXT: [[Q6:%.*]] = fadd float [[Q4]], [[Q5]] ; NOTHRESHOLD-NEXT: [[QI:%.*]] = fcmp olt float [[Q6]], [[Q5]] ; NOTHRESHOLD-NEXT: call void @llvm.assume(i1 [[QI]]) -; NOTHRESHOLD-NEXT: ret <4 x float> undef +; NOTHRESHOLD-NEXT: ret <4 x float> zeroinitializer ; ; MINTREESIZE-LABEL: @simple_select_eph( ; MINTREESIZE-NEXT: [[C0:%.*]] = extractelement <4 x i32> [[C:%.*]], i32 0 @@ -176,7 +177,7 @@ define <4 x float> @simple_select_eph(<4 x float> %a, <4 x float> %b, <4 x i32> ; MINTREESIZE-NEXT: [[S1:%.*]] = select i1 [[CMP1]], float [[A1]], float [[B1]] ; MINTREESIZE-NEXT: [[S2:%.*]] = select i1 [[CMP2]], float [[A2]], float [[B2]] ; MINTREESIZE-NEXT: [[S3:%.*]] = select i1 [[CMP3]], float [[A3]], float [[B3]] -; MINTREESIZE-NEXT: [[RA:%.*]] = insertelement <4 x float> undef, float [[S0]], i32 0 +; MINTREESIZE-NEXT: [[RA:%.*]] = insertelement <4 x float> zeroinitializer, float [[S0]], i32 0 ; MINTREESIZE-NEXT: [[RB:%.*]] = insertelement <4 x float> [[RA]], float [[S1]], i32 1 ; MINTREESIZE-NEXT: [[RC:%.*]] = insertelement <4 x float> [[RB]], float [[S2]], i32 2 ; MINTREESIZE-NEXT: [[RD:%.*]] = insertelement <4 x float> [[RC]], float [[S3]], i32 3 @@ -193,7 +194,7 @@ define <4 x float> @simple_select_eph(<4 x float> %a, <4 x float> %b, <4 x i32> ; MINTREESIZE-NEXT: [[Q6:%.*]] = fadd float [[Q4]], [[Q5]] ; MINTREESIZE-NEXT: [[QI:%.*]] = fcmp olt float [[Q6]], [[Q5]] ; MINTREESIZE-NEXT: call void @llvm.assume(i1 [[QI]]) -; MINTREESIZE-NEXT: ret <4 x float> undef +; MINTREESIZE-NEXT: ret <4 x float> zeroinitializer ; %c0 = extractelement <4 x i32> %c, i32 0 %c1 = extractelement <4 x i32> %c, i32 1 @@ -215,7 +216,7 @@ define <4 x float> @simple_select_eph(<4 x float> %a, <4 x float> %b, <4 x i32> %s1 = select i1 %cmp1, float %a1, float %b1 %s2 = select i1 %cmp2, float %a2, float %b2 %s3 = select i1 %cmp3, float %a3, float %b3 - %ra = insertelement 
<4 x float> undef, float %s0, i32 0 + %ra = insertelement <4 x float> zeroinitializer, float %s0, i32 0 %rb = insertelement <4 x float> %ra, float %s1, i32 1 %rc = insertelement <4 x float> %rb, float %s2, i32 2 %rd = insertelement <4 x float> %rc, float %s3, i32 3 @@ -228,7 +229,7 @@ define <4 x float> @simple_select_eph(<4 x float> %a, <4 x float> %b, <4 x i32> %q6 = fadd float %q4, %q5 %qi = fcmp olt float %q6, %q5 call void @llvm.assume(i1 %qi) - ret <4 x float> undef + ret <4 x float> zeroinitializer } ; Insert in an order different from the vector indices to make sure it @@ -260,7 +261,7 @@ define <4 x float> @simple_select_insert_out_of_order(<4 x float> %a, <4 x float %s1 = select i1 %cmp1, float %a1, float %b1 %s2 = select i1 %cmp2, float %a2, float %b2 %s3 = select i1 %cmp3, float %a3, float %b3 - %ra = insertelement <4 x float> undef, float %s0, i32 2 + %ra = insertelement <4 x float> zeroinitializer, float %s0, i32 2 %rb = insertelement <4 x float> %ra, float %s1, i32 1 %rc = insertelement <4 x float> %rb, float %s2, i32 0 %rd = insertelement <4 x float> %rc, float %s3, i32 3 @@ -298,7 +299,7 @@ define <4 x float> @simple_select_users(<4 x float> %a, <4 x float> %b, <4 x i32 %s1 = select i1 %cmp1, float %a1, float %b1 %s2 = select i1 %cmp2, float %a2, float %b2 %s3 = select i1 %cmp3, float %a3, float %b3 - %ra = insertelement <4 x float> undef, float %s0, i32 0 + %ra = insertelement <4 x float> zeroinitializer, float %s0, i32 0 %rb = insertelement <4 x float> %ra, float %s1, i32 1 %rc = insertelement <4 x float> %rb, float %s2, i32 2 %rd = insertelement <4 x float> %rc, float %s3, i32 3 @@ -319,9 +320,10 @@ define <4 x float> @simple_select_no_users(<4 x float> %a, <4 x float> %b, <4 x ; CHECK-NEXT: [[TMP8:%.*]] = shufflevector <4 x float> [[A]], <4 x float> poison, <2 x i32> ; CHECK-NEXT: [[TMP9:%.*]] = shufflevector <4 x float> [[B]], <4 x float> poison, <2 x i32> ; CHECK-NEXT: [[TMP10:%.*]] = select <2 x i1> [[TMP7]], <2 x float> [[TMP8]], <2 x 
float> [[TMP9]] -; CHECK-NEXT: [[TMP11:%.*]] = shufflevector <2 x float> [[TMP5]], <2 x float> undef, <4 x i32> +; CHECK-NEXT: [[TMP11:%.*]] = shufflevector <2 x float> [[TMP5]], <2 x float> poison, <4 x i32> +; CHECK-NEXT: [[RB2:%.*]] = shufflevector <4 x float> zeroinitializer, <4 x float> [[TMP11]], <4 x i32> ; CHECK-NEXT: [[TMP12:%.*]] = shufflevector <2 x float> [[TMP10]], <2 x float> poison, <4 x i32> -; CHECK-NEXT: [[RD1:%.*]] = shufflevector <4 x float> [[TMP12]], <4 x float> undef, <4 x i32> +; CHECK-NEXT: [[RD1:%.*]] = shufflevector <4 x float> zeroinitializer, <4 x float> [[TMP12]], <4 x i32> ; CHECK-NEXT: ret <4 x float> [[RD1]] ; %c0 = extractelement <4 x i32> %c, i32 0 @@ -344,9 +346,9 @@ define <4 x float> @simple_select_no_users(<4 x float> %a, <4 x float> %b, <4 x %s1 = select i1 %cmp1, float %a1, float %b1 %s2 = select i1 %cmp2, float %a2, float %b2 %s3 = select i1 %cmp3, float %a3, float %b3 - %ra = insertelement <4 x float> undef, float %s0, i32 0 + %ra = insertelement <4 x float> zeroinitializer, float %s0, i32 0 %rb = insertelement <4 x float> %ra, float %s1, i32 1 - %rc = insertelement <4 x float> undef, float %s2, i32 2 + %rc = insertelement <4 x float> zeroinitializer, float %s2, i32 2 %rd = insertelement <4 x float> %rc, float %s3, i32 3 ret <4 x float> %rd } @@ -359,7 +361,7 @@ define <4 x i32> @reconstruct(<4 x i32> %c) #0 { ; CHECK-NEXT: [[C1:%.*]] = extractelement <4 x i32> [[C]], i32 1 ; CHECK-NEXT: [[C2:%.*]] = extractelement <4 x i32> [[C]], i32 2 ; CHECK-NEXT: [[C3:%.*]] = extractelement <4 x i32> [[C]], i32 3 -; CHECK-NEXT: [[RA:%.*]] = insertelement <4 x i32> undef, i32 [[C0]], i32 0 +; CHECK-NEXT: [[RA:%.*]] = insertelement <4 x i32> zeroinitializer, i32 [[C0]], i32 0 ; CHECK-NEXT: [[RB:%.*]] = insertelement <4 x i32> [[RA]], i32 [[C1]], i32 1 ; CHECK-NEXT: [[RC:%.*]] = insertelement <4 x i32> [[RB]], i32 [[C2]], i32 2 ; CHECK-NEXT: [[RD:%.*]] = insertelement <4 x i32> [[RC]], i32 [[C3]], i32 3 @@ -369,7 +371,7 @@ define <4 x 
i32> @reconstruct(<4 x i32> %c) #0 { %c1 = extractelement <4 x i32> %c, i32 1 %c2 = extractelement <4 x i32> %c, i32 2 %c3 = extractelement <4 x i32> %c, i32 3 - %ra = insertelement <4 x i32> undef, i32 %c0, i32 0 + %ra = insertelement <4 x i32> zeroinitializer, i32 %c0, i32 0 %rb = insertelement <4 x i32> %ra, i32 %c1, i32 1 %rc = insertelement <4 x i32> %rb, i32 %c2, i32 2 %rd = insertelement <4 x i32> %rc, i32 %c3, i32 3 @@ -392,13 +394,13 @@ define <2 x float> @simple_select_v2(<2 x float> %a, <2 x float> %b, <2 x i32> % %cmp1 = icmp ne i32 %c1, 0 %s0 = select i1 %cmp0, float %a0, float %b0 %s1 = select i1 %cmp1, float %a1, float %b1 - %ra = insertelement <2 x float> undef, float %s0, i32 0 + %ra = insertelement <2 x float> zeroinitializer, float %s0, i32 0 %rb = insertelement <2 x float> %ra, float %s1, i32 1 ret <2 x float> %rb } ; Make sure when we construct partial vectors, we don't keep -; re-visiting the insertelement chains starting with undef +; re-visiting the insertelement chains starting with zeroinitializer ; (low cost threshold needed to force this to happen) define <4 x float> @simple_select_partial_vector(<4 x float> %a, <4 x float> %b, <4 x i32> %c) #0 { ; CHECK-LABEL: @simple_select_partial_vector( @@ -408,16 +410,16 @@ define <4 x float> @simple_select_partial_vector(<4 x float> %a, <4 x float> %b, ; CHECK-NEXT: [[A1:%.*]] = extractelement <4 x float> [[A]], i32 1 ; CHECK-NEXT: [[B0:%.*]] = extractelement <4 x float> [[B:%.*]], i32 0 ; CHECK-NEXT: [[B1:%.*]] = extractelement <4 x float> [[B]], i32 1 -; CHECK-NEXT: [[TMP1:%.*]] = insertelement <2 x i32> undef, i32 [[C0]], i32 0 +; CHECK-NEXT: [[TMP1:%.*]] = insertelement <2 x i32> zeroinitializer, i32 [[C0]], i32 0 ; CHECK-NEXT: [[TMP2:%.*]] = insertelement <2 x i32> [[TMP1]], i32 [[C1]], i32 1 ; CHECK-NEXT: [[TMP3:%.*]] = icmp ne <2 x i32> [[TMP2]], zeroinitializer -; CHECK-NEXT: [[TMP4:%.*]] = insertelement <2 x float> undef, float [[A0]], i32 0 +; CHECK-NEXT: [[TMP4:%.*]] = insertelement <2 
x float> zeroinitializer, float [[A0]], i32 0 ; CHECK-NEXT: [[TMP5:%.*]] = insertelement <2 x float> [[TMP4]], float [[A1]], i32 1 -; CHECK-NEXT: [[TMP6:%.*]] = insertelement <2 x float> undef, float [[B0]], i32 0 +; CHECK-NEXT: [[TMP6:%.*]] = insertelement <2 x float> zeroinitializer, float [[B0]], i32 0 ; CHECK-NEXT: [[TMP7:%.*]] = insertelement <2 x float> [[TMP6]], float [[B1]], i32 1 ; CHECK-NEXT: [[TMP8:%.*]] = select <2 x i1> [[TMP3]], <2 x float> [[TMP5]], <2 x float> [[TMP7]] ; CHECK-NEXT: [[TMP9:%.*]] = extractelement <2 x float> [[TMP8]], i32 0 -; CHECK-NEXT: [[RA:%.*]] = insertelement <4 x float> undef, float [[TMP9]], i32 0 +; CHECK-NEXT: [[RA:%.*]] = insertelement <4 x float> zeroinitializer, float [[TMP9]], i32 0 ; CHECK-NEXT: [[TMP10:%.*]] = extractelement <2 x float> [[TMP8]], i32 1 ; CHECK-NEXT: [[RB:%.*]] = insertelement <4 x float> [[RA]], float [[TMP10]], i32 1 ; CHECK-NEXT: ret <4 x float> [[RB]] @@ -428,16 +430,16 @@ define <4 x float> @simple_select_partial_vector(<4 x float> %a, <4 x float> %b, %a1 = extractelement <4 x float> %a, i32 1 %b0 = extractelement <4 x float> %b, i32 0 %b1 = extractelement <4 x float> %b, i32 1 - %1 = insertelement <2 x i32> undef, i32 %c0, i32 0 + %1 = insertelement <2 x i32> zeroinitializer, i32 %c0, i32 0 %2 = insertelement <2 x i32> %1, i32 %c1, i32 1 %3 = icmp ne <2 x i32> %2, zeroinitializer - %4 = insertelement <2 x float> undef, float %a0, i32 0 + %4 = insertelement <2 x float> zeroinitializer, float %a0, i32 0 %5 = insertelement <2 x float> %4, float %a1, i32 1 - %6 = insertelement <2 x float> undef, float %b0, i32 0 + %6 = insertelement <2 x float> zeroinitializer, float %b0, i32 0 %7 = insertelement <2 x float> %6, float %b1, i32 1 %8 = select <2 x i1> %3, <2 x float> %5, <2 x float> %7 %9 = extractelement <2 x float> %8, i32 0 - %ra = insertelement <4 x float> undef, float %9, i32 0 + %ra = insertelement <4 x float> zeroinitializer, float %9, i32 0 %10 = extractelement <2 x float> %8, i32 1 %rb = 
insertelement <4 x float> %ra, float %10, i32 1 ret <4 x float> %rb @@ -453,7 +455,7 @@ define <4 x float> @reschedule_extract(<4 x float> %a, <4 x float> %b) { %a0 = extractelement <4 x float> %a, i32 0 %b0 = extractelement <4 x float> %b, i32 0 %c0 = fadd float %a0, %b0 - %v0 = insertelement <4 x float> undef, float %c0, i32 0 + %v0 = insertelement <4 x float> zeroinitializer, float %c0, i32 0 %a1 = extractelement <4 x float> %a, i32 1 %b1 = extractelement <4 x float> %b, i32 1 %c1 = fadd float %a1, %b1 @@ -488,7 +490,7 @@ define <4 x float> @take_credit(<4 x float> %a, <4 x float> %b) { %a3 = extractelement <4 x float> %a, i32 3 %b3 = extractelement <4 x float> %b, i32 3 %c3 = fadd float %a3, %b3 - %v0 = insertelement <4 x float> undef, float %c0, i32 0 + %v0 = insertelement <4 x float> zeroinitializer, float %c0, i32 0 %v1 = insertelement <4 x float> %v0, float %c1, i32 1 %v2 = insertelement <4 x float> %v1, float %c2, i32 2 %v3 = insertelement <4 x float> %v2, float %c3, i32 3 @@ -511,7 +513,7 @@ define <4 x double> @multi_tree(double %w, double %x, double %y, double %z) { %t2 = fadd double %y , 2.000000e+00 %t3 = fadd double %z , 3.000000e+00 %t4 = fmul double %t0, 1.000000e+00 - %i1 = insertelement <4 x double> undef, double %t4, i32 3 + %i1 = insertelement <4 x double> zeroinitializer, double %t4, i32 3 %t5 = fmul double %t1, 1.000000e+00 %i2 = insertelement <4 x double> %i1, double %t5, i32 2 %t6 = fmul double %t2, 1.000000e+00 @@ -550,7 +552,7 @@ define <8 x float> @_vadd256(<8 x float> %a, <8 x float> %b) local_unnamed_addr %vecext20 = extractelement <8 x float> %a, i32 7 %vecext21 = extractelement <8 x float> %b, i32 7 %add22 = fadd float %vecext20, %vecext21 - %vecinit.i = insertelement <8 x float> undef, float %add, i32 0 + %vecinit.i = insertelement <8 x float> zeroinitializer, float %add, i32 0 %vecinit1.i = insertelement <8 x float> %vecinit.i, float %add4, i32 1 %vecinit2.i = insertelement <8 x float> %vecinit1.i, float %add7, i32 2 %vecinit3.i = 
insertelement <8 x float> %vecinit2.i, float %add10, i32 3 diff --git a/llvm/test/Transforms/SROA/readonlynocapture.ll b/llvm/test/Transforms/SROA/readonlynocapture.ll index 2c21624f3ea51..611c90ac32b5a 100644 --- a/llvm/test/Transforms/SROA/readonlynocapture.ll +++ b/llvm/test/Transforms/SROA/readonlynocapture.ll @@ -390,4 +390,20 @@ define i32 @testcallalloca() { ret i32 %l1 } +declare void @callee_byval(ptr byval(i32) %p) + +define i32 @simple_byval() { +; CHECK-LABEL: @simple_byval( +; CHECK-NEXT: [[A:%.*]] = alloca i32, align 4 +; CHECK-NEXT: store i32 0, ptr [[A]], align 4 +; CHECK-NEXT: call void @callee_byval(ptr [[A]]) +; CHECK-NEXT: ret i32 0 +; + %a = alloca i32 + store i32 0, ptr %a + call void @callee_byval(ptr %a) + %l1 = load i32, ptr %a + ret i32 %l1 +} + declare void @llvm.memcpy.p0.p0.i64(ptr, ptr, i64, i1) diff --git a/llvm/test/Transforms/SandboxVectorizer/bottomup_basic.ll b/llvm/test/Transforms/SandboxVectorizer/bottomup_basic.ll index d34c8f88e4b3c..a3798af839908 100644 --- a/llvm/test/Transforms/SandboxVectorizer/bottomup_basic.ll +++ b/llvm/test/Transforms/SandboxVectorizer/bottomup_basic.ll @@ -201,3 +201,44 @@ define void @pack_vectors(ptr %ptr, ptr %ptr2) { store float %ld1, ptr %ptr1 ret void } + +define void @diamond(ptr %ptr) { +; CHECK-LABEL: define void @diamond( +; CHECK-SAME: ptr [[PTR:%.*]]) { +; CHECK-NEXT: [[PTR0:%.*]] = getelementptr float, ptr [[PTR]], i32 0 +; CHECK-NEXT: [[VECL:%.*]] = load <2 x float>, ptr [[PTR0]], align 4 +; CHECK-NEXT: [[VEC:%.*]] = fsub <2 x float> [[VECL]], [[VECL]] +; CHECK-NEXT: store <2 x float> [[VEC]], ptr [[PTR0]], align 4 +; CHECK-NEXT: ret void +; + %ptr0 = getelementptr float, ptr %ptr, i32 0 + %ptr1 = getelementptr float, ptr %ptr, i32 1 + %ld0 = load float, ptr %ptr0 + %ld1 = load float, ptr %ptr1 + %sub0 = fsub float %ld0, %ld0 + %sub1 = fsub float %ld1, %ld1 + store float %sub0, ptr %ptr0 + store float %sub1, ptr %ptr1 + ret void +} + +define void @diamondWithShuffle(ptr %ptr) { +; 
CHECK-LABEL: define void @diamondWithShuffle( +; CHECK-SAME: ptr [[PTR:%.*]]) { +; CHECK-NEXT: [[PTR0:%.*]] = getelementptr float, ptr [[PTR]], i32 0 +; CHECK-NEXT: [[VECL:%.*]] = load <2 x float>, ptr [[PTR0]], align 4 +; CHECK-NEXT: [[VSHUF:%.*]] = shufflevector <2 x float> [[VECL]], <2 x float> [[VECL]], <2 x i32> +; CHECK-NEXT: [[VEC:%.*]] = fsub <2 x float> [[VECL]], [[VSHUF]] +; CHECK-NEXT: store <2 x float> [[VEC]], ptr [[PTR0]], align 4 +; CHECK-NEXT: ret void +; + %ptr0 = getelementptr float, ptr %ptr, i32 0 + %ptr1 = getelementptr float, ptr %ptr, i32 1 + %ld0 = load float, ptr %ptr0 + %ld1 = load float, ptr %ptr1 + %sub0 = fsub float %ld0, %ld1 + %sub1 = fsub float %ld1, %ld0 + store float %sub0, ptr %ptr0 + store float %sub1, ptr %ptr1 + ret void +} diff --git a/llvm/test/tools/llvm-ctxprof-util/Inputs/valid.yaml b/llvm/test/tools/llvm-ctxprof-util/Inputs/valid.yaml index 6080c2414d64a..9bbf82d59c913 100644 --- a/llvm/test/tools/llvm-ctxprof-util/Inputs/valid.yaml +++ b/llvm/test/tools/llvm-ctxprof-util/Inputs/valid.yaml @@ -1,13 +1,13 @@ -- Guid: 1000 - Counters: [1, 2, 3] - Callsites: - [] - - - - Guid: 2000 - Counters: [4, 5] - - Guid: 18446744073709551613 - Counters: [6, 7, 8] - - - - Guid: 3000 - Counters: [40, 50] -- Guid: 18446744073709551612 - Counters: [5, 9, 10] + +- Guid: 1000 + Counters: [ 1, 2, 3 ] + Callsites: + - [ ] + - - Guid: 2000 + Counters: [ 4, 5 ] + - Guid: 18446744073709551613 + Counters: [ 6, 7, 8 ] + - - Guid: 3000 + Counters: [ 40, 50 ] +- Guid: 18446744073709551612 + Counters: [ 5, 9, 10 ] diff --git a/llvm/test/tools/llvm-ctxprof-util/llvm-ctxprof-util.test b/llvm/test/tools/llvm-ctxprof-util/llvm-ctxprof-util.test index 91ebd1de59bb5..30bc8bce05410 100644 --- a/llvm/test/tools/llvm-ctxprof-util/llvm-ctxprof-util.test +++ b/llvm/test/tools/llvm-ctxprof-util/llvm-ctxprof-util.test @@ -4,7 +4,9 @@ ; RUN: llvm-ctxprof-util fromYAML --input %S/Inputs/empty.yaml -output %t/empty.bitstream ; RUN: llvm-bcanalyzer --dump 
%t/empty.bitstream | FileCheck %s --check-prefix=EMPTY -; RUN: llvm-ctxprof-util fromYAML --input %S/Inputs/valid.yaml -output %t/valid.bitstream +; RUN: llvm-ctxprof-util fromYAML -input %S/Inputs/valid.yaml -output %t/valid.bitstream +; RUN: llvm-ctxprof-util toYAML -input %t/valid.bitstream -output %t/valid2.yaml +; RUN: diff %t/valid2.yaml %S/Inputs/valid.yaml ; For the valid case, check against a reference output. ; Note that uint64_t are printed as signed values by llvm-bcanalyzer: diff --git a/llvm/test/tools/llvm-dwarfdump/X86/statistics.ll b/llvm/test/tools/llvm-dwarfdump/X86/statistics.ll index a454bf14c3353..9f3a00df2ffe7 100644 --- a/llvm/test/tools/llvm-dwarfdump/X86/statistics.ll +++ b/llvm/test/tools/llvm-dwarfdump/X86/statistics.ll @@ -55,8 +55,8 @@ ; CHECK: "#bytes within functions": [[FUNCSIZE:[0-9]+]] ; CHECK: "#bytes within inlined functions": [[INLINESIZE:[0-9]+]] ; CHECK: "#bytes in __debug_loc": 35, -; CHECK-NEXT: "#bytes in __debug_abbrev": 384, -; CHECK-NEXT: "#bytes in __debug_info": 459, +; CHECK-NEXT: "#bytes in __debug_abbrev": 386, +; CHECK-NEXT: "#bytes in __debug_info": 463, ; CHECK-NEXT: "#bytes in __debug_str": 231, ; CHECK-NEXT: "#bytes in __apple_names": 348, ; CHECK-NEXT: "#bytes in __apple_objc": 36, diff --git a/llvm/tools/dxil-dis/CMakeLists.txt b/llvm/tools/dxil-dis/CMakeLists.txt index d0541fcf802e9..1e77530106420 100644 --- a/llvm/tools/dxil-dis/CMakeLists.txt +++ b/llvm/tools/dxil-dis/CMakeLists.txt @@ -38,7 +38,7 @@ ExternalProject_Add(DXC ${GIT_SETTINGS} SOURCE_DIR ${SOURCE_DIR} BINARY_DIR ${BINARY_DIR} - CMAKE_ARGS -C ${SOURCE_DIR}/cmake/caches/PredefinedParams.cmake -DLLVM_INCLUDE_TESTS=On + CMAKE_ARGS -C ${SOURCE_DIR}/cmake/caches/PredefinedParams.cmake -DLLVM_INCLUDE_TESTS=Off -DCLANG_INCLUDE_TESTS=Off -DHLSL_INCLUDE_TESTS=Off BUILD_COMMAND ${CMAKE_COMMAND} --build ${BINARY_DIR} --target llvm-dis BUILD_BYPRODUCTS ${BINARY_DIR}/bin/llvm-dis INSTALL_COMMAND "" diff --git 
a/llvm/tools/llvm-ctxprof-util/llvm-ctxprof-util.cpp b/llvm/tools/llvm-ctxprof-util/llvm-ctxprof-util.cpp index cfa14b22c1469..314144ac6624c 100644 --- a/llvm/tools/llvm-ctxprof-util/llvm-ctxprof-util.cpp +++ b/llvm/tools/llvm-ctxprof-util/llvm-ctxprof-util.cpp @@ -13,6 +13,7 @@ //===----------------------------------------------------------------------===// #include "llvm/IR/GlobalValue.h" +#include "llvm/ProfileData/PGOCtxProfReader.h" #include "llvm/ProfileData/PGOCtxProfWriter.h" #include "llvm/Support/CommandLine.h" #include "llvm/Support/Error.h" @@ -23,6 +24,7 @@ using namespace llvm; static cl::SubCommand FromYAML("fromYAML", "Convert from yaml"); +static cl::SubCommand ToYAML("toYAML", "Convert to yaml"); static cl::opt InputFilename( "input", cl::value_desc("input"), cl::init("-"), @@ -35,15 +37,16 @@ static cl::opt InputFilename( "'Contexts', optional. An array containing arrays of contexts. The " "context array at a position 'i' is the set of callees at that " "callsite index. Use an empty array to indicate no callees."), - cl::sub(FromYAML)); + cl::sub(FromYAML), cl::sub(ToYAML)); static cl::opt OutputFilename("output", cl::value_desc("output"), cl::init("-"), cl::desc("Output file"), - cl::sub(FromYAML)); + cl::sub(FromYAML), cl::sub(ToYAML)); +namespace { // Save the bitstream profile from the JSON representation. 
-Error convertFromYAML() { +Error convertFromYaml() { auto BufOrError = MemoryBuffer::getFileOrSTDIN(InputFilename, /*IsText=*/true); if (!BufOrError) @@ -61,11 +64,30 @@ Error convertFromYAML() { return llvm::createCtxProfFromYAML(BufOrError.get()->getBuffer(), Out); } +Error convertToYaml() { + auto BufOrError = MemoryBuffer::getFileOrSTDIN(InputFilename); + if (!BufOrError) + return createFileError(InputFilename, BufOrError.getError()); + + std::error_code EC; + raw_fd_ostream Out(OutputFilename, EC); + if (EC) + return createStringError(EC, "failed to open output"); + PGOCtxProfileReader Reader(BufOrError.get()->getBuffer()); + auto Prof = Reader.loadContexts(); + if (!Prof) + return Prof.takeError(); + llvm::convertCtxProfToYaml(Out, *Prof); + Out << "\n"; + return Error::success(); +} +} // namespace + int main(int argc, const char **argv) { cl::ParseCommandLineOptions(argc, argv, "LLVM Contextual Profile Utils\n"); ExitOnError ExitOnErr("llvm-ctxprof-util: "); - if (FromYAML) { - if (auto E = convertFromYAML()) { + auto HandleErr = [&](Error E) -> int { + if (E) { handleAllErrors(std::move(E), [&](const ErrorInfoBase &E) { E.log(errs()); errs() << "\n"; @@ -73,7 +95,14 @@ int main(int argc, const char **argv) { return 1; } return 0; - } + }; + + if (FromYAML) + return HandleErr(convertFromYaml()); + + if (ToYAML) + return HandleErr(convertToYaml()); + cl::PrintHelpMessage(); return 1; } diff --git a/llvm/tools/llvm-exegesis/lib/AArch64/Target.cpp b/llvm/tools/llvm-exegesis/lib/AArch64/Target.cpp index 51846862f0a73..5a7cc6f5e30d3 100644 --- a/llvm/tools/llvm-exegesis/lib/AArch64/Target.cpp +++ b/llvm/tools/llvm-exegesis/lib/AArch64/Target.cpp @@ -26,7 +26,7 @@ static unsigned getLoadImmediateOpcode(unsigned RegBitWidth) { } // Generates instruction to load an immediate value into a register. 
-static MCInst loadImmediate(unsigned Reg, unsigned RegBitWidth, +static MCInst loadImmediate(MCRegister Reg, unsigned RegBitWidth, const APInt &Value) { if (Value.getBitWidth() > RegBitWidth) llvm_unreachable("Value must fit in the Register"); @@ -45,7 +45,7 @@ class ExegesisAArch64Target : public ExegesisTarget { : ExegesisTarget(AArch64CpuPfmCounters, AArch64_MC::isOpcodeAvailable) {} private: - std::vector setRegTo(const MCSubtargetInfo &STI, unsigned Reg, + std::vector setRegTo(const MCSubtargetInfo &STI, MCRegister Reg, const APInt &Value) const override { if (AArch64::GPR32RegClass.contains(Reg)) return {loadImmediate(Reg, 32, Value)}; diff --git a/llvm/tools/llvm-exegesis/lib/Assembler.cpp b/llvm/tools/llvm-exegesis/lib/Assembler.cpp index 13c8c2048a5c0..7a53b626c177c 100644 --- a/llvm/tools/llvm-exegesis/lib/Assembler.cpp +++ b/llvm/tools/llvm-exegesis/lib/Assembler.cpp @@ -81,7 +81,7 @@ static bool generateSnippetSetupCode(const ExegesisTarget &ET, // If we're generating memory instructions, don't load in the value for // the register with the stack pointer as it will be used later to finish // the setup. - if (RV.Register == StackPointerRegister) + if (Register(RV.Register) == StackPointerRegister) continue; } // Load a constant in the register. @@ -98,7 +98,7 @@ static bool generateSnippetSetupCode(const ExegesisTarget &ET, // Load in the stack register now as we're done using it elsewhere // and need to set the value in preparation for executing the // snippet. 
- if (RV.Register != StackPointerRegister) + if (Register(RV.Register) != StackPointerRegister) continue; const auto SetRegisterCode = ET.setRegTo(*MSI, RV.Register, RV.Value); if (SetRegisterCode.empty()) @@ -208,7 +208,7 @@ void BasicBlockFiller::addReturn(const ExegesisTarget &ET, } FunctionFiller::FunctionFiller(MachineFunction &MF, - std::vector RegistersSetUp) + std::vector RegistersSetUp) : MF(MF), MCII(MF.getTarget().getMCInstrInfo()), Entry(addBasicBlock()), RegistersSetUp(std::move(RegistersSetUp)) {} @@ -218,7 +218,7 @@ BasicBlockFiller FunctionFiller::addBasicBlock() { return BasicBlockFiller(MF, MBB, MCII); } -ArrayRef FunctionFiller::getRegistersSetUp() const { +ArrayRef FunctionFiller::getRegistersSetUp() const { return RegistersSetUp; } @@ -241,7 +241,7 @@ BitVector getFunctionReservedRegs(const TargetMachine &TM) { Error assembleToStream(const ExegesisTarget &ET, std::unique_ptr TM, - ArrayRef LiveIns, const FillFunction &Fill, + ArrayRef LiveIns, const FillFunction &Fill, raw_pwrite_stream &AsmStream, const BenchmarkKey &Key, bool GenerateMemoryInstructions) { auto Context = std::make_unique(); @@ -259,19 +259,19 @@ Error assembleToStream(const ExegesisTarget &ET, Properties.reset(MachineFunctionProperties::Property::IsSSA); Properties.set(MachineFunctionProperties::Property::NoPHIs); - for (const unsigned Reg : LiveIns) + for (const MCRegister Reg : LiveIns) MF.getRegInfo().addLiveIn(Reg); if (GenerateMemoryInstructions) { - for (const unsigned Reg : ET.getArgumentRegisters()) + for (const MCRegister Reg : ET.getArgumentRegisters()) MF.getRegInfo().addLiveIn(Reg); // Add a live in for registers that need saving so that the machine verifier // doesn't fail if the register is never defined. 
- for (const unsigned Reg : ET.getRegistersNeedSaving()) + for (const MCRegister Reg : ET.getRegistersNeedSaving()) MF.getRegInfo().addLiveIn(Reg); } - std::vector RegistersSetUp; + std::vector RegistersSetUp; RegistersSetUp.reserve(Key.RegisterInitialValues.size()); for (const auto &InitValue : Key.RegisterInitialValues) { RegistersSetUp.push_back(InitValue.Register); @@ -279,15 +279,15 @@ Error assembleToStream(const ExegesisTarget &ET, FunctionFiller Sink(MF, std::move(RegistersSetUp)); auto Entry = Sink.getEntry(); - for (const unsigned Reg : LiveIns) + for (const MCRegister Reg : LiveIns) Entry.MBB->addLiveIn(Reg); if (GenerateMemoryInstructions) { - for (const unsigned Reg : ET.getArgumentRegisters()) + for (const MCRegister Reg : ET.getArgumentRegisters()) Entry.MBB->addLiveIn(Reg); // Add a live in for registers that need saving so that the machine verifier // doesn't fail if the register is never defined. - for (const unsigned Reg : ET.getRegistersNeedSaving()) + for (const MCRegister Reg : ET.getRegistersNeedSaving()) Entry.MBB->addLiveIn(Reg); } diff --git a/llvm/tools/llvm-exegesis/lib/Assembler.h b/llvm/tools/llvm-exegesis/lib/Assembler.h index 4d241e0281b5a..1c8854c21b9a7 100644 --- a/llvm/tools/llvm-exegesis/lib/Assembler.h +++ b/llvm/tools/llvm-exegesis/lib/Assembler.h @@ -61,7 +61,7 @@ class BasicBlockFiller { // Helper to fill in a function. class FunctionFiller { public: - FunctionFiller(MachineFunction &MF, std::vector RegistersSetUp); + FunctionFiller(MachineFunction &MF, std::vector RegistersSetUp); // Adds a basic block to the function. BasicBlockFiller addBasicBlock(); @@ -73,12 +73,12 @@ class FunctionFiller { const MCInstrInfo *const MCII; // Returns the set of registers in the snippet setup code. - ArrayRef getRegistersSetUp() const; + ArrayRef getRegistersSetUp() const; private: BasicBlockFiller Entry; // The set of registers that are set up in the basic block. 
- std::vector RegistersSetUp; + std::vector RegistersSetUp; }; // A callback that fills a function. @@ -90,7 +90,7 @@ using FillFunction = std::function; // AsmStream, the temporary function is eventually discarded. Error assembleToStream(const ExegesisTarget &ET, std::unique_ptr TM, - ArrayRef LiveIns, const FillFunction &Fill, + ArrayRef LiveIns, const FillFunction &Fill, raw_pwrite_stream &AsmStreamm, const BenchmarkKey &Key, bool GenerateMemoryInstructions); diff --git a/llvm/tools/llvm-exegesis/lib/BenchmarkCode.h b/llvm/tools/llvm-exegesis/lib/BenchmarkCode.h index 1db8472e99f7c..5e3c10decf723 100644 --- a/llvm/tools/llvm-exegesis/lib/BenchmarkCode.h +++ b/llvm/tools/llvm-exegesis/lib/BenchmarkCode.h @@ -23,7 +23,7 @@ struct BenchmarkCode { // We also need to provide the registers that are live on entry for the // assembler to generate proper prologue/epilogue. - std::vector LiveIns; + std::vector LiveIns; // Informations about how this configuration was built. std::string Info; diff --git a/llvm/tools/llvm-exegesis/lib/BenchmarkResult.cpp b/llvm/tools/llvm-exegesis/lib/BenchmarkResult.cpp index 84dc23b343c6c..1823a534a301a 100644 --- a/llvm/tools/llvm-exegesis/lib/BenchmarkResult.cpp +++ b/llvm/tools/llvm-exegesis/lib/BenchmarkResult.cpp @@ -65,17 +65,17 @@ struct YamlContext { raw_string_ostream &getErrorStream() { return ErrorStream; } - StringRef getRegName(unsigned RegNo) { - // Special case: RegNo 0 is NoRegister. We have to deal with it explicitly. - if (RegNo == 0) + StringRef getRegName(MCRegister Reg) { + // Special case: Reg may be invalid. We have to deal with it explicitly. 
+ if (!Reg.isValid()) return kNoRegister; - const StringRef RegName = State->getRegInfo().getName(RegNo); + const StringRef RegName = State->getRegInfo().getName(Reg); if (RegName.empty()) - ErrorStream << "No register with enum value '" << RegNo << "'\n"; + ErrorStream << "No register with enum value '" << Reg.id() << "'\n"; return RegName; } - std::optional getRegNo(StringRef RegName) { + std::optional getRegNo(StringRef RegName) { std::optional RegisterNumber = State->getRegisterNumberFromName(RegName); if (!RegisterNumber.has_value()) @@ -261,7 +261,7 @@ template <> struct ScalarTraits { String.split(Pieces, "=0x", /* MaxSplit */ -1, /* KeepEmpty */ false); YamlContext &Context = getTypedContext(Ctx); - std::optional RegNo; + std::optional RegNo; if (Pieces.size() == 2 && (RegNo = Context.getRegNo(Pieces[0]))) { RV.Register = *RegNo; const unsigned BitsNeeded = APInt::getBitsNeeded(Pieces[1], kRadix); diff --git a/llvm/tools/llvm-exegesis/lib/BenchmarkResult.h b/llvm/tools/llvm-exegesis/lib/BenchmarkResult.h index 3c09a8380146e..7984c8805cadc 100644 --- a/llvm/tools/llvm-exegesis/lib/BenchmarkResult.h +++ b/llvm/tools/llvm-exegesis/lib/BenchmarkResult.h @@ -75,7 +75,7 @@ struct BenchmarkKey { // being used supports it. uintptr_t SnippetAddress = 0; // The register that should be used to hold the loop counter. - unsigned LoopRegister; + MCRegister LoopRegister; }; struct BenchmarkMeasure { diff --git a/llvm/tools/llvm-exegesis/lib/CodeTemplate.h b/llvm/tools/llvm-exegesis/lib/CodeTemplate.h index 7aca224302a1f..a65015b45b786 100644 --- a/llvm/tools/llvm-exegesis/lib/CodeTemplate.h +++ b/llvm/tools/llvm-exegesis/lib/CodeTemplate.h @@ -131,7 +131,7 @@ struct CodeTemplate { std::vector Instructions; // If the template uses the provided scratch memory, the register in which // the pointer to this memory is passed in to the function. 
- unsigned ScratchSpacePointerInReg = 0; + MCRegister ScratchSpacePointerInReg; #if defined(__GNUC__) && (defined(__clang__) || LLVM_GNUC_PREREQ(8, 0, 0)) // FIXME: GCC7 bug workaround. Drop #if after GCC7 no longer supported. diff --git a/llvm/tools/llvm-exegesis/lib/LlvmState.cpp b/llvm/tools/llvm-exegesis/lib/LlvmState.cpp index 4c44c59286ccf..00d0d2cfd1cd3 100644 --- a/llvm/tools/llvm-exegesis/lib/LlvmState.cpp +++ b/llvm/tools/llvm-exegesis/lib/LlvmState.cpp @@ -83,7 +83,7 @@ LLVMState::LLVMState(std::unique_ptr TM, OpcodeNameToOpcodeIdxMapping(createOpcodeNameToOpcodeIdxMapping()), RegNameToRegNoMapping(createRegNameToRegNoMapping()) { BitVector ReservedRegs = getFunctionReservedRegs(getTargetMachine()); - for (const unsigned Reg : TheExegesisTarget->getUnavailableRegisters()) + for (const MCPhysReg Reg : TheExegesisTarget->getUnavailableRegisters()) ReservedRegs.set(Reg); RATC.reset( new RegisterAliasingTrackerCache(getRegInfo(), std::move(ReservedRegs))); diff --git a/llvm/tools/llvm-exegesis/lib/MCInstrDescView.cpp b/llvm/tools/llvm-exegesis/lib/MCInstrDescView.cpp index c9225e51213e5..c002f68b427f7 100644 --- a/llvm/tools/llvm-exegesis/lib/MCInstrDescView.cpp +++ b/llvm/tools/llvm-exegesis/lib/MCInstrDescView.cpp @@ -38,7 +38,7 @@ bool Operand::isExplicit() const { return Info; } bool Operand::isImplicit() const { return !Info; } -bool Operand::isImplicitReg() const { return ImplicitReg; } +bool Operand::isImplicitReg() const { return ImplicitReg.isValid(); } bool Operand::isDef() const { return IsDef; } @@ -64,7 +64,7 @@ unsigned Operand::getTiedToIndex() const { return *TiedToIndex; } unsigned Operand::getVariableIndex() const { return *VariableIndex; } -unsigned Operand::getImplicitReg() const { +MCRegister Operand::getImplicitReg() const { assert(ImplicitReg); return ImplicitReg; } diff --git a/llvm/tools/llvm-exegesis/lib/MCInstrDescView.h b/llvm/tools/llvm-exegesis/lib/MCInstrDescView.h index d7712e21c32c1..c1af10fa460a3 100644 --- 
a/llvm/tools/llvm-exegesis/lib/MCInstrDescView.h +++ b/llvm/tools/llvm-exegesis/lib/MCInstrDescView.h @@ -75,7 +75,7 @@ struct Operand { unsigned getIndex() const; unsigned getTiedToIndex() const; unsigned getVariableIndex() const; - unsigned getImplicitReg() const; + MCRegister getImplicitReg() const; const RegisterAliasingTracker &getRegisterAliasing() const; const MCOperandInfo &getExplicitOperandInfo() const; @@ -85,7 +85,7 @@ struct Operand { const RegisterAliasingTracker *Tracker = nullptr; // Set for Register Op. const MCOperandInfo *Info = nullptr; // Set for Explicit Op. std::optional TiedToIndex; // Set for Reg&Explicit Op. - MCPhysReg ImplicitReg = 0; // Non-0 for Implicit Op. + MCRegister ImplicitReg; // Non-0 for Implicit Op. std::optional VariableIndex; // Set for Explicit Op. }; diff --git a/llvm/tools/llvm-exegesis/lib/Mips/Target.cpp b/llvm/tools/llvm-exegesis/lib/Mips/Target.cpp index 731e037c240df..f9666d98e1e81 100644 --- a/llvm/tools/llvm-exegesis/lib/Mips/Target.cpp +++ b/llvm/tools/llvm-exegesis/lib/Mips/Target.cpp @@ -58,12 +58,12 @@ class ExegesisMipsTarget : public ExegesisTarget { : ExegesisTarget(MipsCpuPfmCounters, Mips_MC::isOpcodeAvailable) {} private: - unsigned getScratchMemoryRegister(const Triple &TT) const override; + MCRegister getScratchMemoryRegister(const Triple &TT) const override; unsigned getMaxMemoryAccessSize() const override { return 64; } - void fillMemoryOperands(InstructionTemplate &IT, unsigned Reg, + void fillMemoryOperands(InstructionTemplate &IT, MCRegister Reg, unsigned Offset) const override; - std::vector setRegTo(const MCSubtargetInfo &STI, unsigned Reg, + std::vector setRegTo(const MCSubtargetInfo &STI, MCRegister Reg, const APInt &Value) const override; bool matchesArch(Triple::ArchType Arch) const override { return Arch == Triple::mips || Arch == Triple::mipsel || @@ -73,7 +73,7 @@ class ExegesisMipsTarget : public ExegesisTarget { } // end anonymous namespace // Generates instructions to load an immediate 
value into a register. -static std::vector loadImmediate(unsigned Reg, bool IsGPR32, +static std::vector loadImmediate(MCRegister Reg, bool IsGPR32, const APInt &Value) { unsigned ZeroReg; unsigned ORi, LUi, SLL; @@ -134,12 +134,13 @@ static std::vector loadImmediate(unsigned Reg, bool IsGPR32, llvm_unreachable("Not implemented for values wider than 32 bits"); } -unsigned ExegesisMipsTarget::getScratchMemoryRegister(const Triple &TT) const { +MCRegister +ExegesisMipsTarget::getScratchMemoryRegister(const Triple &TT) const { return TT.isArch64Bit() ? Mips::A0_64 : Mips::A0; } void ExegesisMipsTarget::fillMemoryOperands(InstructionTemplate &IT, - unsigned Reg, + MCRegister Reg, unsigned Offset) const { assert(!isInvalidMemoryInstr(IT.getInstr()) && "fillMemoryOperands requires a valid memory instruction"); @@ -149,7 +150,7 @@ void ExegesisMipsTarget::fillMemoryOperands(InstructionTemplate &IT, } std::vector ExegesisMipsTarget::setRegTo(const MCSubtargetInfo &STI, - unsigned Reg, + MCRegister Reg, const APInt &Value) const { if (Mips::GPR32RegClass.contains(Reg)) return loadImmediate(Reg, true, Value); diff --git a/llvm/tools/llvm-exegesis/lib/ParallelSnippetGenerator.cpp b/llvm/tools/llvm-exegesis/lib/ParallelSnippetGenerator.cpp index 114e274845e53..03506a2dd757c 100644 --- a/llvm/tools/llvm-exegesis/lib/ParallelSnippetGenerator.cpp +++ b/llvm/tools/llvm-exegesis/lib/ParallelSnippetGenerator.cpp @@ -90,9 +90,9 @@ static bool hasVariablesWithTiedOperands(const Instruction &Instr) { ParallelSnippetGenerator::~ParallelSnippetGenerator() = default; void ParallelSnippetGenerator::instantiateMemoryOperands( - const unsigned ScratchSpacePointerInReg, + const MCRegister ScratchSpacePointerInReg, std::vector &Instructions) const { - if (ScratchSpacePointerInReg == 0) + if (!ScratchSpacePointerInReg) return; // no memory operands. 
const auto &ET = State.getExegesisTarget(); const unsigned MemStep = ET.getMaxMemoryAccessSize(); @@ -261,10 +261,10 @@ generateSnippetForInstrAvoidingDefUseOverlap( if (Op.isReg() && Op.isImplicit() && !Op.isMemory()) { assert(Op.isImplicitReg() && "Not an implicit register operand?"); if (Op.isUse()) - ImplicitUses.set(Op.getImplicitReg()); + ImplicitUses.set(Op.getImplicitReg().id()); else { assert(Op.isDef() && "Not a use and not a def?"); - ImplicitDefs.set(Op.getImplicitReg()); + ImplicitDefs.set(Op.getImplicitReg().id()); } } } @@ -300,7 +300,7 @@ ParallelSnippetGenerator::generateCodeTemplates( Instr.hasMemoryOperands() ? State.getExegesisTarget().getScratchMemoryRegister( State.getTargetMachine().getTargetTriple()) - : 0; + : MCRegister(); const AliasingConfigurations SelfAliasing(Instr, Instr, ForbiddenRegisters); if (SelfAliasing.empty()) { CT.Info = "instruction is parallel, repeating a random one."; diff --git a/llvm/tools/llvm-exegesis/lib/ParallelSnippetGenerator.h b/llvm/tools/llvm-exegesis/lib/ParallelSnippetGenerator.h index 94eb4e26eb588..8a6b8569c5d4c 100644 --- a/llvm/tools/llvm-exegesis/lib/ParallelSnippetGenerator.h +++ b/llvm/tools/llvm-exegesis/lib/ParallelSnippetGenerator.h @@ -55,7 +55,7 @@ class ParallelSnippetGenerator : public SnippetGenerator { // add eax, [rdi + 192] // mov eax, [rdi + 256] void instantiateMemoryOperands( - unsigned ScratchSpaceReg, + MCRegister ScratchSpaceReg, std::vector &SnippetTemplate) const; }; diff --git a/llvm/tools/llvm-exegesis/lib/PowerPC/Target.cpp b/llvm/tools/llvm-exegesis/lib/PowerPC/Target.cpp index 5c944c90384e3..0e576fa593fb4 100644 --- a/llvm/tools/llvm-exegesis/lib/PowerPC/Target.cpp +++ b/llvm/tools/llvm-exegesis/lib/PowerPC/Target.cpp @@ -33,13 +33,13 @@ class ExegesisPowerPCTarget : public ExegesisTarget { : ExegesisTarget(PPCCpuPfmCounters, PPC_MC::isOpcodeAvailable) {} private: - std::vector setRegTo(const MCSubtargetInfo &STI, unsigned Reg, + std::vector setRegTo(const MCSubtargetInfo &STI, 
MCRegister Reg, const APInt &Value) const override; bool matchesArch(Triple::ArchType Arch) const override { return Arch == Triple::ppc64le; } - unsigned getScratchMemoryRegister(const Triple &) const override; - void fillMemoryOperands(InstructionTemplate &IT, unsigned Reg, + MCRegister getScratchMemoryRegister(const Triple &) const override; + void fillMemoryOperands(InstructionTemplate &IT, MCRegister Reg, unsigned Offset) const override; }; } // end anonymous namespace @@ -55,7 +55,7 @@ static unsigned getLoadImmediateOpcode(unsigned RegBitWidth) { } // Generates instruction to load an immediate value into a register. -static MCInst loadImmediate(unsigned Reg, unsigned RegBitWidth, +static MCInst loadImmediate(MCRegister Reg, unsigned RegBitWidth, const APInt &Value) { if (Value.getBitWidth() > RegBitWidth) llvm_unreachable("Value must fit in the Register"); @@ -67,7 +67,7 @@ static MCInst loadImmediate(unsigned Reg, unsigned RegBitWidth, .addImm(Value.getZExtValue()); } -unsigned +MCRegister ExegesisPowerPCTarget::getScratchMemoryRegister(const Triple &TT) const { // R13 is reserved as Thread Pointer, we won't use threading in benchmark, so // use it as scratch memory register @@ -75,7 +75,7 @@ ExegesisPowerPCTarget::getScratchMemoryRegister(const Triple &TT) const { } void ExegesisPowerPCTarget::fillMemoryOperands(InstructionTemplate &IT, - unsigned Reg, + MCRegister Reg, unsigned Offset) const { int MemOpIdx = 0; if (IT.getInstr().hasTiedRegisters()) @@ -93,7 +93,7 @@ void ExegesisPowerPCTarget::fillMemoryOperands(InstructionTemplate &IT, } std::vector ExegesisPowerPCTarget::setRegTo(const MCSubtargetInfo &STI, - unsigned Reg, + MCRegister Reg, const APInt &Value) const { // X11 is optional use in function linkage, should be the least used one // Use it as scratch reg to load immediate. 
diff --git a/llvm/tools/llvm-exegesis/lib/RISCV/Target.cpp b/llvm/tools/llvm-exegesis/lib/RISCV/Target.cpp index 217b423d7b3f3..d70f609c5e080 100644 --- a/llvm/tools/llvm-exegesis/lib/RISCV/Target.cpp +++ b/llvm/tools/llvm-exegesis/lib/RISCV/Target.cpp @@ -29,8 +29,8 @@ namespace exegesis { namespace { // Stores constant value to a general-purpose (integer) register. -static std::vector loadIntReg(const MCSubtargetInfo &STI, unsigned Reg, - const APInt &Value) { +static std::vector loadIntReg(const MCSubtargetInfo &STI, + MCRegister Reg, const APInt &Value) { SmallVector MCInstSeq; MCRegister DestReg = Reg; @@ -40,11 +40,11 @@ static std::vector loadIntReg(const MCSubtargetInfo &STI, unsigned Reg, return MatIntInstrs; } -const unsigned ScratchIntReg = RISCV::X30; // t5 +const MCPhysReg ScratchIntReg = RISCV::X30; // t5 // Stores constant bits to a floating-point register. static std::vector loadFPRegBits(const MCSubtargetInfo &STI, - unsigned Reg, const APInt &Bits, + MCRegister Reg, const APInt &Bits, unsigned FmvOpcode) { std::vector Instrs = loadIntReg(STI, ScratchIntReg, Bits); Instrs.push_back(MCInstBuilder(FmvOpcode).addReg(Reg).addReg(ScratchIntReg)); @@ -57,7 +57,8 @@ static std::vector loadFPRegBits(const MCSubtargetInfo &STI, // and then do FCVT this is only reliable thing in 32-bit mode, otherwise we // need to use __floatsidf static std::vector loadFP64RegBits32(const MCSubtargetInfo &STI, - unsigned Reg, const APInt &Bits) { + MCRegister Reg, + const APInt &Bits) { double D = Bits.bitsToDouble(); double IPart; double FPart = std::modf(D, &IPart); @@ -82,7 +83,7 @@ static MCInst nop() { .addImm(0); } -static bool isVectorRegList(unsigned Reg) { +static bool isVectorRegList(MCRegister Reg) { return RISCV::VRM2RegClass.contains(Reg) || RISCV::VRM4RegClass.contains(Reg) || RISCV::VRM8RegClass.contains(Reg) || @@ -105,22 +106,22 @@ class ExegesisRISCVTarget : public ExegesisTarget { bool matchesArch(Triple::ArchType Arch) const override; - std::vector 
setRegTo(const MCSubtargetInfo &STI, unsigned Reg, + std::vector setRegTo(const MCSubtargetInfo &STI, MCRegister Reg, const APInt &Value) const override; - unsigned getDefaultLoopCounterRegister(const Triple &) const override; + MCRegister getDefaultLoopCounterRegister(const Triple &) const override; void decrementLoopCounterAndJump(MachineBasicBlock &MBB, MachineBasicBlock &TargetMBB, const MCInstrInfo &MII, - unsigned LoopRegister) const override; + MCRegister LoopRegister) const override; - unsigned getScratchMemoryRegister(const Triple &TT) const override; + MCRegister getScratchMemoryRegister(const Triple &TT) const override; - void fillMemoryOperands(InstructionTemplate &IT, unsigned Reg, + void fillMemoryOperands(InstructionTemplate &IT, MCRegister Reg, unsigned Offset) const override; - ArrayRef getUnavailableRegisters() const override; + ArrayRef getUnavailableRegisters() const override; bool allowAsBackToBack(const Instruction &Instr) const override { return !Instr.Description.isPseudo(); @@ -143,7 +144,7 @@ bool ExegesisRISCVTarget::matchesArch(Triple::ArchType Arch) const { } std::vector ExegesisRISCVTarget::setRegTo(const MCSubtargetInfo &STI, - unsigned Reg, + MCRegister Reg, const APInt &Value) const { if (RISCV::GPRRegClass.contains(Reg)) return loadIntReg(STI, Reg, Value); @@ -173,17 +174,17 @@ std::vector ExegesisRISCVTarget::setRegTo(const MCSubtargetInfo &STI, return {}; } -const unsigned DefaultLoopCounterReg = RISCV::X31; // t6 -const unsigned ScratchMemoryReg = RISCV::X10; // a0 +const MCPhysReg DefaultLoopCounterReg = RISCV::X31; // t6 +const MCPhysReg ScratchMemoryReg = RISCV::X10; // a0 -unsigned +MCRegister ExegesisRISCVTarget::getDefaultLoopCounterRegister(const Triple &) const { return DefaultLoopCounterReg; } void ExegesisRISCVTarget::decrementLoopCounterAndJump( MachineBasicBlock &MBB, MachineBasicBlock &TargetMBB, - const MCInstrInfo &MII, unsigned LoopRegister) const { + const MCInstrInfo &MII, MCRegister LoopRegister) const { 
BuildMI(&MBB, DebugLoc(), MII.get(RISCV::ADDI)) .addDef(LoopRegister) .addUse(LoopRegister) @@ -194,12 +195,13 @@ void ExegesisRISCVTarget::decrementLoopCounterAndJump( .addMBB(&TargetMBB); } -unsigned ExegesisRISCVTarget::getScratchMemoryRegister(const Triple &TT) const { +MCRegister +ExegesisRISCVTarget::getScratchMemoryRegister(const Triple &TT) const { return ScratchMemoryReg; // a0 } void ExegesisRISCVTarget::fillMemoryOperands(InstructionTemplate &IT, - unsigned Reg, + MCRegister Reg, unsigned Offset) const { // TODO: for now we ignore Offset because have no way // to detect it in instruction. @@ -217,10 +219,10 @@ void ExegesisRISCVTarget::fillMemoryOperands(InstructionTemplate &IT, IT.getValueFor(MemOp) = MCOperand::createReg(Reg); } -const unsigned UnavailableRegisters[4] = {RISCV::X0, DefaultLoopCounterReg, - ScratchIntReg, ScratchMemoryReg}; +const MCPhysReg UnavailableRegisters[4] = {RISCV::X0, DefaultLoopCounterReg, + ScratchIntReg, ScratchMemoryReg}; -ArrayRef ExegesisRISCVTarget::getUnavailableRegisters() const { +ArrayRef ExegesisRISCVTarget::getUnavailableRegisters() const { return UnavailableRegisters; } diff --git a/llvm/tools/llvm-exegesis/lib/RegisterAliasing.cpp b/llvm/tools/llvm-exegesis/lib/RegisterAliasing.cpp index ee612fb0dd6af..96040bbf588e5 100644 --- a/llvm/tools/llvm-exegesis/lib/RegisterAliasing.cpp +++ b/llvm/tools/llvm-exegesis/lib/RegisterAliasing.cpp @@ -39,9 +39,9 @@ RegisterAliasingTracker::RegisterAliasingTracker( } RegisterAliasingTracker::RegisterAliasingTracker(const MCRegisterInfo &RegInfo, - const MCPhysReg PhysReg) + const MCRegister PhysReg) : RegisterAliasingTracker(RegInfo) { - SourceBits.set(PhysReg); + SourceBits.set(PhysReg.id()); FillOriginAndAliasedBits(RegInfo, SourceBits); } @@ -63,8 +63,8 @@ RegisterAliasingTrackerCache::RegisterAliasingTrackerCache( EmptyRegisters(RegInfo.getNumRegs()) {} const RegisterAliasingTracker & -RegisterAliasingTrackerCache::getRegister(MCPhysReg PhysReg) const { - auto &Found = 
Registers[PhysReg]; +RegisterAliasingTrackerCache::getRegister(MCRegister PhysReg) const { + auto &Found = Registers[PhysReg.id()]; if (!Found) Found.reset(new RegisterAliasingTracker(RegInfo, PhysReg)); return *Found; diff --git a/llvm/tools/llvm-exegesis/lib/RegisterAliasing.h b/llvm/tools/llvm-exegesis/lib/RegisterAliasing.h index b2980854ba2d1..00e699d4c69b9 100644 --- a/llvm/tools/llvm-exegesis/lib/RegisterAliasing.h +++ b/llvm/tools/llvm-exegesis/lib/RegisterAliasing.h @@ -44,9 +44,9 @@ struct RegisterAliasingTracker { const BitVector &ReservedReg, const MCRegisterClass &RegClass); - // Construct a tracker from an MCPhysReg. + // Construct a tracker from an MCRegister. RegisterAliasingTracker(const MCRegisterInfo &RegInfo, - const MCPhysReg Register); + const MCRegister Register); const BitVector &sourceBits() const { return SourceBits; } @@ -88,7 +88,7 @@ struct RegisterAliasingTrackerCache { const MCRegisterInfo ®Info() const { return RegInfo; } // Retrieves the RegisterAliasingTracker for this particular register. - const RegisterAliasingTracker &getRegister(MCPhysReg Reg) const; + const RegisterAliasingTracker &getRegister(MCRegister Reg) const; // Retrieves the RegisterAliasingTracker for this particular register class. const RegisterAliasingTracker &getRegisterClass(unsigned RegClassIndex) const; diff --git a/llvm/tools/llvm-exegesis/lib/RegisterValue.h b/llvm/tools/llvm-exegesis/lib/RegisterValue.h index 3429783a48a30..d0f111b9e40e3 100644 --- a/llvm/tools/llvm-exegesis/lib/RegisterValue.h +++ b/llvm/tools/llvm-exegesis/lib/RegisterValue.h @@ -18,14 +18,15 @@ #include #include +#include namespace llvm { namespace exegesis { // A simple object storing the value for a particular register. 
struct RegisterValue { - static RegisterValue zero(unsigned Reg) { return {Reg, APInt()}; } - unsigned Register; + static RegisterValue zero(MCRegister Reg) { return {Reg, APInt()}; } + MCRegister Register; APInt Value; }; diff --git a/llvm/tools/llvm-exegesis/lib/SnippetFile.cpp b/llvm/tools/llvm-exegesis/lib/SnippetFile.cpp index b37999ab017f5..01a6e94e76147 100644 --- a/llvm/tools/llvm-exegesis/lib/SnippetFile.cpp +++ b/llvm/tools/llvm-exegesis/lib/SnippetFile.cpp @@ -80,7 +80,7 @@ class BenchmarkCodeStreamer : public MCStreamer, public AsmCommentConsumer { if (CommentText.consume_front("LIVEIN")) { // LLVM-EXEGESIS-LIVEIN const auto RegName = CommentText.ltrim(); - if (unsigned Reg = findRegisterByName(RegName)) + if (MCRegister Reg = findRegisterByName(RegName)) Result->LiveIns.push_back(Reg); else { errs() << "unknown register '" << RegName @@ -179,7 +179,7 @@ class BenchmarkCodeStreamer : public MCStreamer, public AsmCommentConsumer { } if (CommentText.consume_front("LOOP-REGISTER")) { // LLVM-EXEGESIS-LOOP-REGISTER - unsigned LoopRegister; + MCRegister LoopRegister; if (!(LoopRegister = findRegisterByName(CommentText.trim()))) { errs() << "unknown register '" << CommentText @@ -207,13 +207,13 @@ class BenchmarkCodeStreamer : public MCStreamer, public AsmCommentConsumer { void emitZerofill(MCSection *Section, MCSymbol *Symbol, uint64_t Size, Align ByteAlignment, SMLoc Loc) override {} - unsigned findRegisterByName(const StringRef RegName) const { + MCRegister findRegisterByName(const StringRef RegName) const { std::optional RegisterNumber = State.getRegisterNumberFromName(RegName); if (!RegisterNumber.has_value()) { errs() << "'" << RegName << "' is not a valid register name for the target\n"; - return MCRegister::NoRegister; + return MCRegister(); } return *RegisterNumber; } diff --git a/llvm/tools/llvm-exegesis/lib/SnippetGenerator.cpp b/llvm/tools/llvm-exegesis/lib/SnippetGenerator.cpp index 48357d443f713..04064ae1d8441 100644 --- 
a/llvm/tools/llvm-exegesis/lib/SnippetGenerator.cpp +++ b/llvm/tools/llvm-exegesis/lib/SnippetGenerator.cpp @@ -47,9 +47,9 @@ Error SnippetGenerator::generateConfigurations( // using the scratch register and its aliasing registers. if (Variant.getInstr().hasMemoryOperands()) { const auto &ET = State.getExegesisTarget(); - unsigned ScratchSpacePointerInReg = + MCRegister ScratchSpacePointerInReg = ET.getScratchMemoryRegister(State.getTargetMachine().getTargetTriple()); - if (ScratchSpacePointerInReg == 0) + if (!ScratchSpacePointerInReg.isValid()) return make_error( "Infeasible : target does not support memory instructions"); const auto &ScratchRegAliases = @@ -58,7 +58,7 @@ Error SnippetGenerator::generateConfigurations( // FIXME: We could make a copy of the scratch register. for (const auto &Op : Variant.getInstr().Operands) { if (Op.isDef() && Op.isImplicitReg() && - ScratchRegAliases.test(Op.getImplicitReg())) + ScratchRegAliases.test(Op.getImplicitReg().id())) return make_error( "Infeasible : memory instruction uses scratch memory register"); } @@ -114,38 +114,38 @@ std::vector SnippetGenerator::computeRegisterInitialValues( // If target always expects a scratch memory register as live input, // mark it as defined. const ExegesisTarget &Target = State.getExegesisTarget(); - unsigned ScratchMemoryReg = Target.getScratchMemoryRegister( + MCRegister ScratchMemoryReg = Target.getScratchMemoryRegister( State.getTargetMachine().getTargetTriple()); - DefinedRegs.set(ScratchMemoryReg); + DefinedRegs.set(ScratchMemoryReg.id()); std::vector RIV; for (const InstructionTemplate &IT : Instructions) { // Returns the register that this Operand sets or uses, or 0 if this is not // a register. 
- const auto GetOpReg = [&IT](const Operand &Op) -> unsigned { + const auto GetOpReg = [&IT](const Operand &Op) -> MCRegister { if (Op.isMemory()) - return 0; + return MCRegister(); if (Op.isImplicitReg()) return Op.getImplicitReg(); if (Op.isExplicit() && IT.getValueFor(Op).isReg()) return IT.getValueFor(Op).getReg(); - return 0; + return MCRegister(); }; // Collect used registers that have never been def'ed. for (const Operand &Op : IT.getInstr().Operands) { if (Op.isUse()) { - const unsigned Reg = GetOpReg(Op); - if (Reg > 0 && !DefinedRegs.test(Reg)) { + const MCRegister Reg = GetOpReg(Op); + if (Reg && !DefinedRegs.test(Reg.id())) { RIV.push_back(RegisterValue::zero(Reg)); - DefinedRegs.set(Reg); + DefinedRegs.set(Reg.id()); } } } // Mark defs as having been def'ed. for (const Operand &Op : IT.getInstr().Operands) { if (Op.isDef()) { - const unsigned Reg = GetOpReg(Op); - if (Reg > 0) - DefinedRegs.set(Reg); + const MCRegister Reg = GetOpReg(Op); + if (Reg) + DefinedRegs.set(Reg.id()); } } } diff --git a/llvm/tools/llvm-exegesis/lib/SnippetRepetitor.cpp b/llvm/tools/llvm-exegesis/lib/SnippetRepetitor.cpp index 0bab30d158200..e4fe27f010c2f 100644 --- a/llvm/tools/llvm-exegesis/lib/SnippetRepetitor.cpp +++ b/llvm/tools/llvm-exegesis/lib/SnippetRepetitor.cpp @@ -48,7 +48,7 @@ class DuplicateSnippetRepetitor : public SnippetRepetitor { class LoopSnippetRepetitor : public SnippetRepetitor { public: - explicit LoopSnippetRepetitor(const LLVMState &State, unsigned LoopRegister) + explicit LoopSnippetRepetitor(const LLVMState &State, MCRegister LoopRegister) : SnippetRepetitor(State), LoopCounter(LoopRegister) {} // Loop over the snippet ceil(MinInstructions / Instructions.Size()) times. @@ -102,7 +102,7 @@ class LoopSnippetRepetitor : public SnippetRepetitor { // The live ins are: the loop counter, the registers that were setup by // the entry block, and entry block live ins. 
Loop.MBB->addLiveIn(LoopCounter); - for (unsigned Reg : Filler.getRegistersSetUp()) + for (MCRegister Reg : Filler.getRegistersSetUp()) Loop.MBB->addLiveIn(Reg); for (const auto &LiveIn : Entry.MBB->liveins()) Loop.MBB->addLiveIn(LiveIn); @@ -127,7 +127,7 @@ class LoopSnippetRepetitor : public SnippetRepetitor { } private: - const unsigned LoopCounter; + const MCRegister LoopCounter; }; } // namespace @@ -136,7 +136,7 @@ SnippetRepetitor::~SnippetRepetitor() {} std::unique_ptr SnippetRepetitor::Create(Benchmark::RepetitionModeE Mode, - const LLVMState &State, unsigned LoopRegister) { + const LLVMState &State, MCRegister LoopRegister) { switch (Mode) { case Benchmark::Duplicate: case Benchmark::MiddleHalfDuplicate: diff --git a/llvm/tools/llvm-exegesis/lib/SnippetRepetitor.h b/llvm/tools/llvm-exegesis/lib/SnippetRepetitor.h index c62e80f161f12..88dd0f3cb2dbd 100644 --- a/llvm/tools/llvm-exegesis/lib/SnippetRepetitor.h +++ b/llvm/tools/llvm-exegesis/lib/SnippetRepetitor.h @@ -30,7 +30,7 @@ class SnippetRepetitor { public: static std::unique_ptr Create(Benchmark::RepetitionModeE Mode, const LLVMState &State, - unsigned LoopRegister); + MCRegister LoopRegister); virtual ~SnippetRepetitor(); diff --git a/llvm/tools/llvm-exegesis/lib/Target.cpp b/llvm/tools/llvm-exegesis/lib/Target.cpp index 29e58692f0e92..5ea5b4c2c002f 100644 --- a/llvm/tools/llvm-exegesis/lib/Target.cpp +++ b/llvm/tools/llvm-exegesis/lib/Target.cpp @@ -212,7 +212,7 @@ class ExegesisDefaultTarget : public ExegesisTarget { ExegesisDefaultTarget() : ExegesisTarget({}, opcodeIsNotAvailable) {} private: - std::vector setRegTo(const MCSubtargetInfo &STI, unsigned Reg, + std::vector setRegTo(const MCSubtargetInfo &STI, MCRegister Reg, const APInt &Value) const override { llvm_unreachable("Not yet implemented"); } diff --git a/llvm/tools/llvm-exegesis/lib/Target.h b/llvm/tools/llvm-exegesis/lib/Target.h index 92cc1cb248a1c..f3fbe3780616f 100644 --- a/llvm/tools/llvm-exegesis/lib/Target.h +++ 
b/llvm/tools/llvm-exegesis/lib/Target.h @@ -91,7 +91,8 @@ class ExegesisTarget { // Generates code to move a constant into a the given register. // Precondition: Value must fit into Reg. - virtual std::vector setRegTo(const MCSubtargetInfo &STI, unsigned Reg, + virtual std::vector setRegTo(const MCSubtargetInfo &STI, + MCRegister Reg, const APInt &Value) const = 0; // Generates the code for the lower munmap call. The code generated by this @@ -177,14 +178,14 @@ class ExegesisTarget { // Gets the ABI dependent registers that are used to pass arguments in a // function call. - virtual std::vector getArgumentRegisters() const { + virtual std::vector getArgumentRegisters() const { report_fatal_error( "getArgumentRegisters is not implemented on the current architecture"); }; // Gets the registers that might potentially need to be saved by while // the setup in the test harness executes. - virtual std::vector getRegistersNeedSaving() const { + virtual std::vector getRegistersNeedSaving() const { report_fatal_error("getRegistersNeedSaving is not implemented on the " "current architecture"); }; @@ -192,25 +193,27 @@ class ExegesisTarget { // Returns the register pointing to scratch memory, or 0 if this target // does not support memory operands. The benchmark function uses the // default calling convention. - virtual unsigned getScratchMemoryRegister(const Triple &) const { return 0; } + virtual MCRegister getScratchMemoryRegister(const Triple &) const { + return MCRegister(); + } // Fills memory operands with references to the address at [Reg] + Offset. - virtual void fillMemoryOperands(InstructionTemplate &IT, unsigned Reg, + virtual void fillMemoryOperands(InstructionTemplate &IT, MCRegister Reg, unsigned Offset) const { llvm_unreachable( "fillMemoryOperands() requires getScratchMemoryRegister() > 0"); } // Returns a counter usable as a loop counter. 
- virtual unsigned getDefaultLoopCounterRegister(const Triple &) const { - return 0; + virtual MCRegister getDefaultLoopCounterRegister(const Triple &) const { + return MCRegister(); } // Adds the code to decrement the loop counter and virtual void decrementLoopCounterAndJump(MachineBasicBlock &MBB, MachineBasicBlock &TargetMBB, const MCInstrInfo &MII, - unsigned LoopRegister) const { + MCRegister LoopRegister) const { llvm_unreachable("decrementLoopCounterAndBranch() requires " "getLoopCounterRegister() > 0"); } @@ -218,7 +221,7 @@ class ExegesisTarget { // Returns a list of unavailable registers. // Targets can use this to prevent some registers to be automatically selected // for use in snippets. - virtual ArrayRef getUnavailableRegisters() const { return {}; } + virtual ArrayRef getUnavailableRegisters() const { return {}; } // Returns the maximum number of bytes a load/store instruction can access at // once. This is typically the size of the largest register available on the diff --git a/llvm/tools/llvm-exegesis/lib/X86/Target.cpp b/llvm/tools/llvm-exegesis/lib/X86/Target.cpp index 3c3bff76fb681..1659cfb31f117 100644 --- a/llvm/tools/llvm-exegesis/lib/X86/Target.cpp +++ b/llvm/tools/llvm-exegesis/lib/X86/Target.cpp @@ -468,7 +468,7 @@ static unsigned getLoadImmediateOpcode(unsigned RegBitWidth) { } // Generates instruction to load an immediate value into a register. -static MCInst loadImmediate(unsigned Reg, unsigned RegBitWidth, +static MCInst loadImmediate(MCRegister Reg, unsigned RegBitWidth, const APInt &Value) { if (Value.getBitWidth() > RegBitWidth) llvm_unreachable("Value must fit in the Register"); @@ -500,7 +500,7 @@ static MCInst fillStackSpace(unsigned MovOpcode, unsigned OffsetBytes, } // Loads scratch memory into register `Reg` using opcode `RMOpcode`. 
-static MCInst loadToReg(unsigned Reg, unsigned RMOpcode) { +static MCInst loadToReg(MCRegister Reg, unsigned RMOpcode) { return MCInstBuilder(RMOpcode) .addReg(Reg) // Address = ESP @@ -525,12 +525,12 @@ namespace { struct ConstantInliner { explicit ConstantInliner(const APInt &Constant) : Constant_(Constant) {} - std::vector loadAndFinalize(unsigned Reg, unsigned RegBitWidth, + std::vector loadAndFinalize(MCRegister Reg, unsigned RegBitWidth, unsigned Opcode); - std::vector loadX87STAndFinalize(unsigned Reg); + std::vector loadX87STAndFinalize(MCRegister Reg); - std::vector loadX87FPAndFinalize(unsigned Reg); + std::vector loadX87FPAndFinalize(MCRegister Reg); std::vector popFlagAndFinalize(); @@ -554,7 +554,7 @@ struct ConstantInliner { }; } // namespace -std::vector ConstantInliner::loadAndFinalize(unsigned Reg, +std::vector ConstantInliner::loadAndFinalize(MCRegister Reg, unsigned RegBitWidth, unsigned Opcode) { assert((RegBitWidth & 7) == 0 && "RegBitWidth must be a multiple of 8 bits"); @@ -564,7 +564,7 @@ std::vector ConstantInliner::loadAndFinalize(unsigned Reg, return std::move(Instructions); } -std::vector ConstantInliner::loadX87STAndFinalize(unsigned Reg) { +std::vector ConstantInliner::loadX87STAndFinalize(MCRegister Reg) { initStack(kF80Bytes); add(MCInstBuilder(X86::LD_F80m) // Address = ESP @@ -579,7 +579,7 @@ std::vector ConstantInliner::loadX87STAndFinalize(unsigned Reg) { return std::move(Instructions); } -std::vector ConstantInliner::loadX87FPAndFinalize(unsigned Reg) { +std::vector ConstantInliner::loadX87FPAndFinalize(MCRegister Reg) { initStack(kF80Bytes); add(MCInstBuilder(X86::LD_Fp80m) .addReg(Reg) @@ -729,9 +729,9 @@ class ExegesisX86Target : public ExegesisTarget { private: void addTargetSpecificPasses(PassManagerBase &PM) const override; - unsigned getScratchMemoryRegister(const Triple &TT) const override; + MCRegister getScratchMemoryRegister(const Triple &TT) const override; - unsigned getDefaultLoopCounterRegister(const Triple &) 
const override; + MCRegister getDefaultLoopCounterRegister(const Triple &) const override; unsigned getMaxMemoryAccessSize() const override { return 64; } @@ -739,15 +739,15 @@ class ExegesisX86Target : public ExegesisTarget { MCOperand &AssignedValue, const BitVector &ForbiddenRegs) const override; - void fillMemoryOperands(InstructionTemplate &IT, unsigned Reg, + void fillMemoryOperands(InstructionTemplate &IT, MCRegister Reg, unsigned Offset) const override; void decrementLoopCounterAndJump(MachineBasicBlock &MBB, MachineBasicBlock &TargetMBB, const MCInstrInfo &MII, - unsigned LoopRegister) const override; + MCRegister LoopRegister) const override; - std::vector setRegTo(const MCSubtargetInfo &STI, unsigned Reg, + std::vector setRegTo(const MCSubtargetInfo &STI, MCRegister Reg, const APInt &Value) const override; #ifdef __linux__ @@ -773,12 +773,12 @@ class ExegesisX86Target : public ExegesisTarget { std::vector configurePerfCounter(long Request, bool SaveRegisters) const override; - std::vector getArgumentRegisters() const override; + std::vector getArgumentRegisters() const override; - std::vector getRegistersNeedSaving() const override; + std::vector getRegistersNeedSaving() const override; #endif // __linux__ - ArrayRef getUnavailableRegisters() const override { + ArrayRef getUnavailableRegisters() const override { if (DisableUpperSSERegisters) return ArrayRef(kUnavailableRegistersSSE); @@ -844,25 +844,25 @@ class ExegesisX86Target : public ExegesisTarget { return std::make_unique(); } - static const unsigned kUnavailableRegisters[4]; - static const unsigned kUnavailableRegistersSSE[12]; + static const MCPhysReg kUnavailableRegisters[4]; + static const MCPhysReg kUnavailableRegistersSSE[12]; }; // We disable a few registers that cannot be encoded on instructions with a REX // prefix. 
-const unsigned ExegesisX86Target::kUnavailableRegisters[4] = {X86::AH, X86::BH, - X86::CH, X86::DH}; +const MCPhysReg ExegesisX86Target::kUnavailableRegisters[4] = { + X86::AH, X86::BH, X86::CH, X86::DH}; // Optionally, also disable the upper (x86_64) SSE registers to reduce frontend // decoder load. -const unsigned ExegesisX86Target::kUnavailableRegistersSSE[12] = { +const MCPhysReg ExegesisX86Target::kUnavailableRegistersSSE[12] = { X86::AH, X86::BH, X86::CH, X86::DH, X86::XMM8, X86::XMM9, X86::XMM10, X86::XMM11, X86::XMM12, X86::XMM13, X86::XMM14, X86::XMM15}; // We're using one of R8-R15 because these registers are never hardcoded in // instructions (e.g. MOVS writes to EDI, ESI, EDX), so they have less // conflicts. -constexpr const unsigned kDefaultLoopCounterReg = X86::R8; +constexpr const MCPhysReg kDefaultLoopCounterReg = X86::R8; } // namespace @@ -871,19 +871,19 @@ void ExegesisX86Target::addTargetSpecificPasses(PassManagerBase &PM) const { PM.add(createX86FloatingPointStackifierPass()); } -unsigned ExegesisX86Target::getScratchMemoryRegister(const Triple &TT) const { +MCRegister ExegesisX86Target::getScratchMemoryRegister(const Triple &TT) const { if (!TT.isArch64Bit()) { // FIXME: This would require popping from the stack, so we would have to // add some additional setup code. - return 0; + return MCRegister(); } return TT.isOSWindows() ? 
X86::RCX : X86::RDI; } -unsigned +MCRegister ExegesisX86Target::getDefaultLoopCounterRegister(const Triple &TT) const { if (!TT.isArch64Bit()) { - return 0; + return MCRegister(); } return kDefaultLoopCounterReg; } @@ -910,7 +910,7 @@ Error ExegesisX86Target::randomizeTargetMCOperand( } void ExegesisX86Target::fillMemoryOperands(InstructionTemplate &IT, - unsigned Reg, + MCRegister Reg, unsigned Offset) const { assert(!isInvalidMemoryInstr(IT.getInstr()) && "fillMemoryOperands requires a valid memory instruction"); @@ -927,7 +927,7 @@ void ExegesisX86Target::fillMemoryOperands(InstructionTemplate &IT, void ExegesisX86Target::decrementLoopCounterAndJump( MachineBasicBlock &MBB, MachineBasicBlock &TargetMBB, - const MCInstrInfo &MII, unsigned LoopRegister) const { + const MCInstrInfo &MII, MCRegister LoopRegister) const { BuildMI(&MBB, DebugLoc(), MII.get(X86::ADD64ri8)) .addDef(LoopRegister) .addUse(LoopRegister) @@ -988,7 +988,7 @@ static void restoreSyscallRegisters(std::vector &GeneratedCode, } #endif // __linux__ -static std::vector loadImmediateSegmentRegister(unsigned Reg, +static std::vector loadImmediateSegmentRegister(MCRegister Reg, const APInt &Value) { #if defined(__x86_64__) && defined(__linux__) assert(Value.getBitWidth() <= 64 && "Value must fit in the register."); @@ -1021,7 +1021,7 @@ static std::vector loadImmediateSegmentRegister(unsigned Reg, } std::vector ExegesisX86Target::setRegTo(const MCSubtargetInfo &STI, - unsigned Reg, + MCRegister Reg, const APInt &Value) const { if (X86::SEGMENT_REGRegClass.contains(Reg)) return loadImmediateSegmentRegister(Reg, Value); @@ -1298,11 +1298,11 @@ ExegesisX86Target::configurePerfCounter(long Request, bool SaveRegisters) const return ConfigurePerfCounterCode; } -std::vector ExegesisX86Target::getArgumentRegisters() const { +std::vector ExegesisX86Target::getArgumentRegisters() const { return {X86::RDI, X86::RSI}; } -std::vector ExegesisX86Target::getRegistersNeedSaving() const { +std::vector 
ExegesisX86Target::getRegistersNeedSaving() const { return {X86::RAX, X86::RDI, X86::RSI, X86::RCX, X86::R11}; } diff --git a/llvm/tools/llvm-exegesis/llvm-exegesis.cpp b/llvm/tools/llvm-exegesis/llvm-exegesis.cpp index fa37e05956be8..b9938a92855a4 100644 --- a/llvm/tools/llvm-exegesis/llvm-exegesis.cpp +++ b/llvm/tools/llvm-exegesis/llvm-exegesis.cpp @@ -520,7 +520,7 @@ void benchmarkMain() { const auto Opcodes = getOpcodesOrDie(State); std::vector Configurations; - unsigned LoopRegister = + MCRegister LoopRegister = State.getExegesisTarget().getDefaultLoopCounterRegister( State.getTargetMachine().getTargetTriple()); diff --git a/llvm/unittests/ADT/APIntTest.cpp b/llvm/unittests/ADT/APIntTest.cpp index 4d5553fcbd1e3..b14366eac2185 100644 --- a/llvm/unittests/ADT/APIntTest.cpp +++ b/llvm/unittests/ADT/APIntTest.cpp @@ -29,6 +29,73 @@ TEST(APIntTest, ValueInit) { EXPECT_TRUE(!Zero.sext(64)); } +// Test that 0^5 == 0 +TEST(APIntTest, PowZeroTo5) { + APInt Zero = APInt::getZero(32); + EXPECT_TRUE(!Zero); + APInt ZeroTo5 = APIntOps::pow(Zero, 5); + EXPECT_TRUE(!ZeroTo5); +} + +// Test that 1^16 == 1 +TEST(APIntTest, PowOneTo16) { + APInt One(32, 1); + APInt OneTo16 = APIntOps::pow(One, 16); + EXPECT_EQ(One, OneTo16); +} + +// Test that 2^10 == 1024 +TEST(APIntTest, PowerTwoTo10) { + APInt Two(32, 2); + APInt TwoTo20 = APIntOps::pow(Two, 10); + APInt V_1024(32, 1024); + EXPECT_EQ(TwoTo20, V_1024); +} + +// Test that 3^3 == 27 +TEST(APIntTest, PowerThreeTo3) { + APInt Three(32, 3); + APInt ThreeTo3 = APIntOps::pow(Three, 3); + APInt V_27(32, 27); + EXPECT_EQ(ThreeTo3, V_27); +} + +// Test that SignedMaxValue^3 == SignedMaxValue +TEST(APIntTest, PowerSignedMaxValue) { + APInt SignedMaxValue = APInt::getSignedMaxValue(32); + APInt MaxTo3 = APIntOps::pow(SignedMaxValue, 3); + EXPECT_EQ(MaxTo3, SignedMaxValue); +} + +// Test that MaxValue^3 == MaxValue +TEST(APIntTest, PowerMaxValue) { + APInt MaxValue = APInt::getMaxValue(32); + APInt MaxTo3 = APIntOps::pow(MaxValue, 3); + 
EXPECT_EQ(MaxValue, MaxTo3); +} + +// Test that SignedMinValue^3 == 0 +TEST(APIntTest, PowerSignedMinValueTo3) { + APInt SignedMinValue = APInt::getSignedMinValue(32); + APInt MinTo3 = APIntOps::pow(SignedMinValue, 3); + EXPECT_TRUE(MinTo3.isZero()); +} + +// Test that SignedMinValue^1 == SignedMinValue +TEST(APIntTest, PowerSignedMinValueTo1) { + APInt SignedMinValue = APInt::getSignedMinValue(32); + APInt MinTo1 = APIntOps::pow(SignedMinValue, 1); + EXPECT_EQ(SignedMinValue, MinTo1); +} + +// Test that MaxValue^3 == MaxValue +TEST(APIntTest, ZeroToZero) { + APInt Zero = APInt::getZero(32); + APInt One(32, 1); + APInt ZeroToZero = APIntOps::pow(Zero, 0); + EXPECT_EQ(ZeroToZero, One); +} + // Test that APInt shift left works when bitwidth > 64 and shiftamt == 0 TEST(APIntTest, ShiftLeftByZero) { APInt One = APInt::getZero(65) + 1; diff --git a/llvm/unittests/Analysis/ValueTrackingTest.cpp b/llvm/unittests/Analysis/ValueTrackingTest.cpp index 0145ee70a14c1..ee44aac45594d 100644 --- a/llvm/unittests/Analysis/ValueTrackingTest.cpp +++ b/llvm/unittests/Analysis/ValueTrackingTest.cpp @@ -2869,7 +2869,7 @@ const std::pair IsBytewiseValueTests[] = { "ptr inttoptr (i96 -1 to ptr)", }, { - "i8 undef", + "i8 poison", "[0 x i8] zeroinitializer", }, { @@ -2877,7 +2877,7 @@ const std::pair IsBytewiseValueTests[] = { "[0 x i8] undef", }, { - "i8 undef", + "i8 poison", "[5 x [0 x i8]] zeroinitializer", }, { @@ -2959,7 +2959,7 @@ const std::pair IsBytewiseValueTests[] = { "[2 x i16] [i16 -21836, i16 -21846]]", }, { - "i8 undef", + "i8 poison", "{ } zeroinitializer", }, { @@ -2967,7 +2967,7 @@ const std::pair IsBytewiseValueTests[] = { "{ } undef", }, { - "i8 undef", + "i8 poison", "{ {}, {} } zeroinitializer", }, { diff --git a/llvm/unittests/CodeGen/AsmPrinterDwarfTest.cpp b/llvm/unittests/CodeGen/AsmPrinterDwarfTest.cpp index dc738d85547bb..6c08173f78622 100644 --- a/llvm/unittests/CodeGen/AsmPrinterDwarfTest.cpp +++ b/llvm/unittests/CodeGen/AsmPrinterDwarfTest.cpp @@ -384,10 
+384,13 @@ class AsmPrinterHandlerTest : public AsmPrinterFixtureBase { public: TestHandler(AsmPrinterHandlerTest &Test) : Test(Test) {} virtual ~TestHandler() {} + virtual void setSymbolSize(const MCSymbol *Sym, uint64_t Size) override {} virtual void beginModule(Module *M) override { Test.BeginCount++; } virtual void endModule() override { Test.EndCount++; } virtual void beginFunction(const MachineFunction *MF) override {} virtual void endFunction(const MachineFunction *MF) override {} + virtual void beginInstruction(const MachineInstr *MI) override {} + virtual void endInstruction() override {} }; protected: @@ -424,54 +427,4 @@ TEST_F(AsmPrinterHandlerTest, Basic) { ASSERT_EQ(EndCount, 3); } -class AsmPrinterDebugHandlerTest : public AsmPrinterFixtureBase { - class TestDebugHandler : public DebugHandlerBase { - AsmPrinterDebugHandlerTest &Test; - - public: - TestDebugHandler(AsmPrinterDebugHandlerTest &Test, AsmPrinter *AP) - : DebugHandlerBase(AP), Test(Test) {} - virtual ~TestDebugHandler() {} - virtual void beginModule(Module *M) override { Test.BeginCount++; } - virtual void endModule() override { Test.EndCount++; } - virtual void beginFunctionImpl(const MachineFunction *MF) override {} - virtual void endFunctionImpl(const MachineFunction *MF) override {} - virtual void beginInstruction(const MachineInstr *MI) override {} - virtual void endInstruction() override {} - }; - -protected: - bool init(const std::string &TripleStr, unsigned DwarfVersion, - dwarf::DwarfFormat DwarfFormat) { - if (!AsmPrinterFixtureBase::init(TripleStr, DwarfVersion, DwarfFormat)) - return false; - - auto *AP = TestPrinter->getAP(); - AP->addDebugHandler(std::make_unique(*this, AP)); - TargetMachine *TM = &AP->TM; - legacy::PassManager PM; - PM.add(new MachineModuleInfoWrapperPass(TM)); - PM.add(TestPrinter->releaseAP()); // Takes ownership of destroying AP - LLVMContext Context; - std::unique_ptr M(new Module("TestModule", Context)); - M->setDataLayout(TM->createDataLayout()); - 
PM.run(*M); - // Now check that we can run it twice. - AP->addDebugHandler(std::make_unique(*this, AP)); - PM.run(*M); - return true; - } - - int BeginCount = 0; - int EndCount = 0; -}; - -TEST_F(AsmPrinterDebugHandlerTest, Basic) { - if (!init("x86_64-pc-linux", /*DwarfVersion=*/4, dwarf::DWARF32)) - GTEST_SKIP(); - - ASSERT_EQ(BeginCount, 3); - ASSERT_EQ(EndCount, 3); -} - } // end namespace diff --git a/llvm/unittests/CodeGen/SelectionDAGPatternMatchTest.cpp b/llvm/unittests/CodeGen/SelectionDAGPatternMatchTest.cpp index bf9c597d8ac5e..736a36da97f57 100644 --- a/llvm/unittests/CodeGen/SelectionDAGPatternMatchTest.cpp +++ b/llvm/unittests/CodeGen/SelectionDAGPatternMatchTest.cpp @@ -392,6 +392,7 @@ TEST_F(SelectionDAGPatternMatchTest, matchUnaryOp) { SDValue FPToSI = DAG->getNode(ISD::FP_TO_SINT, DL, FloatVT, Op2); SDValue FPToUI = DAG->getNode(ISD::FP_TO_UINT, DL, FloatVT, Op2); + SDValue Bcast = DAG->getNode(ISD::BITCAST, DL, FloatVT, Op0); SDValue Brev = DAG->getNode(ISD::BITREVERSE, DL, Int32VT, Op0); SDValue Bswap = DAG->getNode(ISD::BSWAP, DL, Int32VT, Op0); @@ -423,8 +424,12 @@ TEST_F(SelectionDAGPatternMatchTest, matchUnaryOp) { EXPECT_FALSE(sd_match(FPToUI, m_FPToSI(m_Value()))); EXPECT_FALSE(sd_match(FPToSI, m_FPToUI(m_Value()))); + EXPECT_TRUE(sd_match(Bcast, m_BitCast(m_Value()))); + EXPECT_TRUE(sd_match(Bcast, m_BitCast(m_SpecificVT(MVT::i32)))); EXPECT_TRUE(sd_match(Brev, m_BitReverse(m_Value()))); EXPECT_TRUE(sd_match(Bswap, m_BSwap(m_Value()))); + EXPECT_FALSE(sd_match(Bcast, m_BitReverse(m_Value()))); + EXPECT_FALSE(sd_match(Bcast, m_BitCast(m_SpecificVT(MVT::f32)))); EXPECT_FALSE(sd_match(Brev, m_BSwap(m_Value()))); EXPECT_FALSE(sd_match(Bswap, m_BitReverse(m_Value()))); diff --git a/llvm/unittests/ExecutionEngine/JITLink/CMakeLists.txt b/llvm/unittests/ExecutionEngine/JITLink/CMakeLists.txt index bbf6b1bf1e0ed..a1882ea73c35c 100644 --- a/llvm/unittests/ExecutionEngine/JITLink/CMakeLists.txt +++ 
b/llvm/unittests/ExecutionEngine/JITLink/CMakeLists.txt @@ -17,6 +17,7 @@ add_llvm_unittest(JITLinkTests MachOLinkGraphTests.cpp MemoryManagerErrorTests.cpp StubsTests.cpp + X86_64Tests.cpp ) target_link_libraries(JITLinkTests PRIVATE LLVMTestingSupport) diff --git a/llvm/unittests/ExecutionEngine/JITLink/LinkGraphTests.cpp b/llvm/unittests/ExecutionEngine/JITLink/LinkGraphTests.cpp index ff6cf49bb9758..fb60acddf7821 100644 --- a/llvm/unittests/ExecutionEngine/JITLink/LinkGraphTests.cpp +++ b/llvm/unittests/ExecutionEngine/JITLink/LinkGraphTests.cpp @@ -127,6 +127,36 @@ TEST(LinkGraphTest, BlockAndSymbolIteration) { EXPECT_TRUE(llvm::count(G.defined_symbols(), &S4)); } +TEST(LinkGraphTest, EdgeIteration) { + // Check that we can iterate over blocks within Sections and across sections. + LinkGraph G("foo", std::make_shared(), + Triple("x86_64-apple-darwin"), SubtargetFeatures(), + getGenericEdgeKindName); + auto &Sec1 = + G.createSection("__data.1", orc::MemProt::Read | orc::MemProt::Write); + auto &B = + G.createContentBlock(Sec1, BlockContent, orc::ExecutorAddr(0x1000), 8, 0); + auto &S = G.addExternalSymbol("S1", 0, false); + + constexpr size_t NumEdges = 6; + Edge::OffsetT Offsets[NumEdges] = {0, 1, 2, 2, 3, 7}; + + for (auto O : Offsets) + B.addEdge(Edge::KeepAlive, O, S, 0); + + EXPECT_EQ(llvm::range_size(B.edges()), NumEdges); + EXPECT_EQ(llvm::range_size(B.edges_at(0)), 1U); + EXPECT_EQ(llvm::range_size(B.edges_at(2)), 2U); + EXPECT_EQ(llvm::range_size(B.edges_at(4)), 0U); + + { + // Check that offsets and iteration order are as expected. + size_t Idx = 0; + for (auto &E : B.edges()) + EXPECT_EQ(E.getOffset(), Offsets[Idx++]); + } +} + TEST(LinkGraphTest, ContentAccessAndUpdate) { // Check that we can make a defined symbol external. 
LinkGraph G("foo", std::make_shared(), diff --git a/llvm/unittests/ExecutionEngine/JITLink/X86_64Tests.cpp b/llvm/unittests/ExecutionEngine/JITLink/X86_64Tests.cpp new file mode 100644 index 0000000000000..8c79f0a8a9ee1 --- /dev/null +++ b/llvm/unittests/ExecutionEngine/JITLink/X86_64Tests.cpp @@ -0,0 +1,90 @@ +//===-------- X86_64Tests.cpp - Unit tests for the AArch64 backend --------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#include +#include + +#include "gtest/gtest.h" + +using namespace llvm; +using namespace llvm::jitlink; +using namespace llvm::jitlink::x86_64; + +TEST(X86_64, EmptyLinkGraph) { + LinkGraph G("foo", std::make_shared(), + Triple("x86_64-apple-darwin"), SubtargetFeatures(), + getEdgeKindName); + EXPECT_EQ(G.getName(), "foo"); + EXPECT_EQ(G.getTargetTriple().str(), "x86_64-apple-darwin"); + EXPECT_EQ(G.getPointerSize(), 8U); + EXPECT_EQ(G.getEndianness(), llvm::endianness::little); + EXPECT_TRUE(G.external_symbols().empty()); + EXPECT_TRUE(G.absolute_symbols().empty()); + EXPECT_TRUE(G.defined_symbols().empty()); + EXPECT_TRUE(G.blocks().empty()); +} + +TEST(X86_64, GOTAndStubs) { + LinkGraph G("foo", std::make_shared(), + Triple("x86_64-apple-darwin"), SubtargetFeatures(), + getEdgeKindName); + + auto &External = G.addExternalSymbol("external", 0, false); + + // First table accesses. We expect the graph to be empty: + EXPECT_EQ(G.findSectionByName(GOTTableManager::getSectionName()), nullptr); + EXPECT_EQ(G.findSectionByName(PLTTableManager::getSectionName()), nullptr); + + { + // Create first GOT and PLT table managers and request a PLT stub. This + // should force creation of both a PLT stub and GOT entry. 
+ GOTTableManager GOT(G); + PLTTableManager PLT(G, GOT); + + PLT.getEntryForTarget(G, External); + } + + auto *GOTSec = G.findSectionByName(GOTTableManager::getSectionName()); + EXPECT_NE(GOTSec, nullptr); + if (GOTSec) { + // Expect one entry in the GOT now. + EXPECT_EQ(GOTSec->symbols_size(), 1U); + EXPECT_EQ(GOTSec->blocks_size(), 1U); + } + + auto *PLTSec = G.findSectionByName(PLTTableManager::getSectionName()); + EXPECT_NE(PLTSec, nullptr); + if (PLTSec) { + // Expect one entry in the PLT. + EXPECT_EQ(PLTSec->symbols_size(), 1U); + EXPECT_EQ(PLTSec->blocks_size(), 1U); + } + + { + // Create second GOT and PLT table managers and request a PLT stub. This + // should force creation of both a PLT stub and GOT entry. + GOTTableManager GOT(G); + PLTTableManager PLT(G, GOT); + + PLT.getEntryForTarget(G, External); + } + + EXPECT_EQ(G.findSectionByName(GOTTableManager::getSectionName()), GOTSec); + if (GOTSec) { + // Expect the same one entry in the GOT. + EXPECT_EQ(GOTSec->symbols_size(), 1U); + EXPECT_EQ(GOTSec->blocks_size(), 1U); + } + + EXPECT_EQ(G.findSectionByName(PLTTableManager::getSectionName()), PLTSec); + if (PLTSec) { + // Expect the same one entry in the GOT. 
+ EXPECT_EQ(PLTSec->symbols_size(), 1U); + EXPECT_EQ(PLTSec->blocks_size(), 1U); + } +} diff --git a/llvm/unittests/SandboxIR/SandboxIRTest.cpp b/llvm/unittests/SandboxIR/SandboxIRTest.cpp index 874c32c2d4398..73e8ef283fc2a 100644 --- a/llvm/unittests/SandboxIR/SandboxIRTest.cpp +++ b/llvm/unittests/SandboxIR/SandboxIRTest.cpp @@ -5841,9 +5841,9 @@ define void @foo(i32 %i0, i32 %i1) { EXPECT_EQ(ICmp->getSignedPredicate(), LLVMICmp->getSignedPredicate()); EXPECT_EQ(ICmp->getUnsignedPredicate(), LLVMICmp->getUnsignedPredicate()); } - auto *NewCmp = + auto *NewCmp = cast( sandboxir::CmpInst::create(llvm::CmpInst::ICMP_ULE, F.getArg(0), - F.getArg(1), BB->begin(), Ctx, "NewCmp"); + F.getArg(1), BB->begin(), Ctx, "NewCmp")); EXPECT_EQ(NewCmp, &*BB->begin()); EXPECT_EQ(NewCmp->getPredicate(), llvm::CmpInst::ICMP_ULE); EXPECT_EQ(NewCmp->getOperand(0), F.getArg(0)); @@ -5856,6 +5856,16 @@ define void @foo(i32 %i0, i32 %i1) { sandboxir::Type *RT = sandboxir::CmpInst::makeCmpResultType(F.getArg(0)->getType()); EXPECT_TRUE(RT->isIntegerTy(1)); // Only one bit in a single comparison + + { + // Check create() when operands are constant. 
+ auto *Const42 = + sandboxir::ConstantInt::get(sandboxir::Type::getInt32Ty(Ctx), 42); + auto *NewConstCmp = + sandboxir::CmpInst::create(llvm::CmpInst::ICMP_ULE, Const42, Const42, + BB->begin(), Ctx, "NewConstCmp"); + EXPECT_TRUE(isa(NewConstCmp)); + } } TEST_F(SandboxIRTest, FCmpInst) { @@ -5906,8 +5916,8 @@ define void @foo(float %f0, float %f1) { CopyFrom->setFastMathFlags(FastMathFlags::getFast()); // create with default flags - auto *NewFCmp = sandboxir::CmpInst::create( - llvm::CmpInst::FCMP_ONE, F.getArg(0), F.getArg(1), It1, Ctx, "NewFCmp"); + auto *NewFCmp = cast(sandboxir::CmpInst::create( + llvm::CmpInst::FCMP_ONE, F.getArg(0), F.getArg(1), It1, Ctx, "NewFCmp")); EXPECT_EQ(NewFCmp->getPredicate(), llvm::CmpInst::FCMP_ONE); EXPECT_EQ(NewFCmp->getOperand(0), F.getArg(0)); EXPECT_EQ(NewFCmp->getOperand(1), F.getArg(1)); @@ -5917,9 +5927,10 @@ define void @foo(float %f0, float %f1) { FastMathFlags DefaultFMF = NewFCmp->getFastMathFlags(); EXPECT_TRUE(CopyFrom->getFastMathFlags() != DefaultFMF); // create with copied flags - auto *NewFCmpFlags = sandboxir::CmpInst::createWithCopiedFlags( - llvm::CmpInst::FCMP_ONE, F.getArg(0), F.getArg(1), CopyFrom, It1, Ctx, - "NewFCmpFlags"); + auto *NewFCmpFlags = + cast(sandboxir::CmpInst::createWithCopiedFlags( + llvm::CmpInst::FCMP_ONE, F.getArg(0), F.getArg(1), CopyFrom, It1, Ctx, + "NewFCmpFlags")); EXPECT_FALSE(NewFCmpFlags->getFastMathFlags() != CopyFrom->getFastMathFlags()); EXPECT_EQ(NewFCmpFlags->getPredicate(), llvm::CmpInst::FCMP_ONE); @@ -5928,6 +5939,16 @@ define void @foo(float %f0, float %f1) { #ifndef NDEBUG EXPECT_EQ(NewFCmpFlags->getName(), "NewFCmpFlags"); #endif // NDEBUG + + { + // Check create() when operands are constant. 
+ auto *Const42 = + sandboxir::ConstantFP::get(sandboxir::Type::getFloatTy(Ctx), 42.0); + auto *NewConstCmp = + sandboxir::CmpInst::create(llvm::CmpInst::FCMP_ULE, Const42, Const42, + BB->begin(), Ctx, "NewConstCmp"); + EXPECT_TRUE(isa(NewConstCmp)); + } } TEST_F(SandboxIRTest, UnreachableInst) { diff --git a/llvm/unittests/Support/CrashRecoveryTest.cpp b/llvm/unittests/Support/CrashRecoveryTest.cpp index a22e532ec4c83..ceafba5b36f11 100644 --- a/llvm/unittests/Support/CrashRecoveryTest.cpp +++ b/llvm/unittests/Support/CrashRecoveryTest.cpp @@ -26,10 +26,8 @@ #endif #ifdef LLVM_ON_UNIX -#ifdef HAVE_SIGNAL_H #include #endif -#endif using namespace llvm; using namespace llvm::sys; diff --git a/llvm/unittests/Transforms/Utils/CallPromotionUtilsTest.cpp b/llvm/unittests/Transforms/Utils/CallPromotionUtilsTest.cpp index 4420a6d065499..57a8f75a3a31a 100644 --- a/llvm/unittests/Transforms/Utils/CallPromotionUtilsTest.cpp +++ b/llvm/unittests/Transforms/Utils/CallPromotionUtilsTest.cpp @@ -18,7 +18,6 @@ #include "llvm/IR/PassInstrumentation.h" #include "llvm/ProfileData/PGOCtxProfReader.h" #include "llvm/ProfileData/PGOCtxProfWriter.h" -#include "llvm/Support/JSON.h" #include "llvm/Support/SourceMgr.h" #include "llvm/Support/raw_ostream.h" #include "llvm/Testing/Support/SupportHelpers.h" @@ -572,43 +571,34 @@ define i32 @f4() !guid !3 { raw_string_ostream OS(Str); CtxProfAnalysisPrinterPass Printer(OS); Printer.run(*M, MAM); - const char *Expected = R"json( - [ - { - "Guid": 1000, - "Counters": [1, 11, 22], - "Callsites": [ - [{ "Guid": 1001, - "Counters": [10]}, - { "Guid": 1003, - "Counters": [12] - }], - [{ "Guid": 1002, - "Counters": [11], - "Callsites": [ - [{ "Guid": 1004, - "Counters": [13] }]]}]] - }, - { - "Guid": 1005, - "Counters": [2], - "Callsites": [ - [{ "Guid": 1000, - "Counters": [1, 102, 204], - "Callsites": [ - [{ "Guid": 1001, - "Counters": [101]}, - { "Guid": 1003, - "Counters": [103]}], - [{ "Guid": 1002, - "Counters": [102], - "Callsites": [ - [{ 
"Guid": 1004, - "Counters": [104]}]]}]]}]]} -])json"; - auto ExpectedJSON = json::parse(Expected); - ASSERT_TRUE(!!ExpectedJSON); - auto ProducedJSON = json::parse(Str); - ASSERT_TRUE(!!ProducedJSON); - EXPECT_EQ(*ProducedJSON, *ExpectedJSON); + const char *Expected = R"yaml( +- Guid: 1000 + Counters: [ 1, 11, 22 ] + Callsites: + - - Guid: 1001 + Counters: [ 10 ] + - Guid: 1003 + Counters: [ 12 ] + - - Guid: 1002 + Counters: [ 11 ] + Callsites: + - - Guid: 1004 + Counters: [ 13 ] +- Guid: 1005 + Counters: [ 2 ] + Callsites: + - - Guid: 1000 + Counters: [ 1, 102, 204 ] + Callsites: + - - Guid: 1001 + Counters: [ 101 ] + - Guid: 1003 + Counters: [ 103 ] + - - Guid: 1002 + Counters: [ 102 ] + Callsites: + - - Guid: 1004 + Counters: [ 104 ] +)yaml"; + EXPECT_EQ(Expected, Str); } diff --git a/llvm/unittests/Transforms/Vectorize/SandboxVectorizer/CMakeLists.txt b/llvm/unittests/Transforms/Vectorize/SandboxVectorizer/CMakeLists.txt index df689767b7724..bbfbcc730a4cb 100644 --- a/llvm/unittests/Transforms/Vectorize/SandboxVectorizer/CMakeLists.txt +++ b/llvm/unittests/Transforms/Vectorize/SandboxVectorizer/CMakeLists.txt @@ -9,6 +9,7 @@ set(LLVM_LINK_COMPONENTS add_llvm_unittest(SandboxVectorizerTests DependencyGraphTest.cpp + InstrMapsTest.cpp IntervalTest.cpp LegalityTest.cpp SchedulerTest.cpp diff --git a/llvm/unittests/Transforms/Vectorize/SandboxVectorizer/InstrMapsTest.cpp b/llvm/unittests/Transforms/Vectorize/SandboxVectorizer/InstrMapsTest.cpp new file mode 100644 index 0000000000000..1d7c8f9cdde04 --- /dev/null +++ b/llvm/unittests/Transforms/Vectorize/SandboxVectorizer/InstrMapsTest.cpp @@ -0,0 +1,87 @@ +//===- InstrMapsTest.cpp --------------------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#include "llvm/Transforms/Vectorize/SandboxVectorizer/InstrMaps.h" +#include "llvm/ADT/SmallSet.h" +#include "llvm/AsmParser/Parser.h" +#include "llvm/SandboxIR/Function.h" +#include "llvm/SandboxIR/Instruction.h" +#include "llvm/Support/SourceMgr.h" +#include "gmock/gmock.h" +#include "gtest/gtest.h" + +using namespace llvm; + +struct InstrMapsTest : public testing::Test { + LLVMContext C; + std::unique_ptr M; + + void parseIR(LLVMContext &C, const char *IR) { + SMDiagnostic Err; + M = parseAssemblyString(IR, Err, C); + if (!M) + Err.print("InstrMapsTest", errs()); + } +}; + +TEST_F(InstrMapsTest, Basic) { + parseIR(C, R"IR( +define void @foo(i8 %v0, i8 %v1, i8 %v2, i8 %v3, <2 x i8> %vec) { + %add0 = add i8 %v0, %v0 + %add1 = add i8 %v1, %v1 + %add2 = add i8 %v2, %v2 + %add3 = add i8 %v3, %v3 + %vadd0 = add <2 x i8> %vec, %vec + ret void +} +)IR"); + llvm::Function *LLVMF = &*M->getFunction("foo"); + sandboxir::Context Ctx(C); + auto *F = Ctx.createFunction(LLVMF); + auto *BB = &*F->begin(); + auto It = BB->begin(); + + auto *Add0 = cast(&*It++); + auto *Add1 = cast(&*It++); + auto *Add2 = cast(&*It++); + auto *Add3 = cast(&*It++); + auto *VAdd0 = cast(&*It++); + [[maybe_unused]] auto *Ret = cast(&*It++); + + sandboxir::InstrMaps IMaps(Ctx); + // Check with empty IMaps. + EXPECT_EQ(IMaps.getVectorForOrig(Add0), nullptr); + EXPECT_EQ(IMaps.getVectorForOrig(Add1), nullptr); + EXPECT_FALSE(IMaps.getOrigLane(Add0, Add0)); + // Check with 1 match. 
+ IMaps.registerVector({Add0, Add1}, VAdd0); + EXPECT_EQ(IMaps.getVectorForOrig(Add0), VAdd0); + EXPECT_EQ(IMaps.getVectorForOrig(Add1), VAdd0); + EXPECT_FALSE(IMaps.getOrigLane(VAdd0, VAdd0)); // Bad Orig value + EXPECT_FALSE(IMaps.getOrigLane(Add0, Add0)); // Bad Vector value + EXPECT_EQ(*IMaps.getOrigLane(VAdd0, Add0), 0U); + EXPECT_EQ(*IMaps.getOrigLane(VAdd0, Add1), 1U); + // Check when the same vector maps to different original values (which is + // common for vector constants). + IMaps.registerVector({Add2, Add3}, VAdd0); + EXPECT_EQ(*IMaps.getOrigLane(VAdd0, Add2), 0U); + EXPECT_EQ(*IMaps.getOrigLane(VAdd0, Add3), 1U); + // Check when we register for a second time. +#ifndef NDEBUG + EXPECT_DEATH(IMaps.registerVector({Add1, Add0}, VAdd0), ".*exists.*"); +#endif // NDEBUG + // Check callbacks: erase original instr. + Add0->eraseFromParent(); + EXPECT_FALSE(IMaps.getOrigLane(VAdd0, Add0)); + EXPECT_EQ(*IMaps.getOrigLane(VAdd0, Add1), 1U); + EXPECT_EQ(IMaps.getVectorForOrig(Add0), nullptr); + // Check callbacks: erase vector instr. 
+ VAdd0->eraseFromParent(); + EXPECT_FALSE(IMaps.getOrigLane(VAdd0, Add1)); + EXPECT_EQ(IMaps.getVectorForOrig(Add1), nullptr); +} diff --git a/llvm/unittests/Transforms/Vectorize/SandboxVectorizer/LegalityTest.cpp b/llvm/unittests/Transforms/Vectorize/SandboxVectorizer/LegalityTest.cpp index b5e2c302f5901..b421d08bc6b02 100644 --- a/llvm/unittests/Transforms/Vectorize/SandboxVectorizer/LegalityTest.cpp +++ b/llvm/unittests/Transforms/Vectorize/SandboxVectorizer/LegalityTest.cpp @@ -18,6 +18,8 @@ #include "llvm/SandboxIR/Function.h" #include "llvm/SandboxIR/Instruction.h" #include "llvm/Support/SourceMgr.h" +#include "llvm/Transforms/Vectorize/SandboxVectorizer/InstrMaps.h" +#include "gmock/gmock.h" #include "gtest/gtest.h" using namespace llvm; @@ -110,7 +112,8 @@ define void @foo(ptr %ptr, <2 x float> %vec2, <3 x float> %vec3, i8 %arg, float auto *CmpSLT = cast(&*It++); auto *CmpSGT = cast(&*It++); - sandboxir::LegalityAnalysis Legality(*AA, *SE, DL, Ctx); + llvm::sandboxir::InstrMaps IMaps(Ctx); + sandboxir::LegalityAnalysis Legality(*AA, *SE, DL, Ctx, IMaps); const auto &Result = Legality.canVectorize({St0, St1}, /*SkipScheduling=*/true); EXPECT_TRUE(isa(Result)); @@ -228,7 +231,8 @@ define void @foo(ptr %ptr) { auto *St0 = cast(&*It++); auto *St1 = cast(&*It++); - sandboxir::LegalityAnalysis Legality(*AA, *SE, DL, Ctx); + llvm::sandboxir::InstrMaps IMaps(Ctx); + sandboxir::LegalityAnalysis Legality(*AA, *SE, DL, Ctx, IMaps); { // Can vectorize St0,St1. 
const auto &Result = Legality.canVectorize({St0, St1}); @@ -263,7 +267,8 @@ define void @foo() { }; sandboxir::Context Ctx(C); - sandboxir::LegalityAnalysis Legality(*AA, *SE, DL, Ctx); + llvm::sandboxir::InstrMaps IMaps(Ctx); + sandboxir::LegalityAnalysis Legality(*AA, *SE, DL, Ctx, IMaps); EXPECT_TRUE( Matches(Legality.createLegalityResult(), "Widen")); EXPECT_TRUE(Matches(Legality.createLegalityResult( @@ -283,3 +288,160 @@ define void @foo() { "Pack Reason: DiffWrapFlags")); } #endif // NDEBUG + +TEST_F(LegalityTest, CollectDescr) { + parseIR(C, R"IR( +define void @foo(ptr %ptr) { + %gep0 = getelementptr float, ptr %ptr, i32 0 + %gep1 = getelementptr float, ptr %ptr, i32 1 + %ld0 = load float, ptr %gep0 + %ld1 = load float, ptr %gep1 + %vld = load <4 x float>, ptr %ptr + ret void +} +)IR"); + llvm::Function *LLVMF = &*M->getFunction("foo"); + getAnalyses(*LLVMF); + sandboxir::Context Ctx(C); + auto *F = Ctx.createFunction(LLVMF); + auto *BB = &*F->begin(); + auto It = BB->begin(); + [[maybe_unused]] auto *Gep0 = cast(&*It++); + [[maybe_unused]] auto *Gep1 = cast(&*It++); + auto *Ld0 = cast(&*It++); + [[maybe_unused]] auto *Ld1 = cast(&*It++); + auto *VLd = cast(&*It++); + + sandboxir::CollectDescr::DescrVecT Descrs; + using EEDescr = sandboxir::CollectDescr::ExtractElementDescr; + + { + // Check single input, no shuffle. + Descrs.push_back(EEDescr(VLd, 0)); + Descrs.push_back(EEDescr(VLd, 1)); + sandboxir::CollectDescr CD(std::move(Descrs)); + EXPECT_TRUE(CD.getSingleInput()); + EXPECT_EQ(CD.getSingleInput()->first, VLd); + EXPECT_THAT(CD.getSingleInput()->second, testing::ElementsAre(0, 1)); + EXPECT_TRUE(CD.hasVectorInputs()); + } + { + // Check single input, shuffle. 
+ Descrs.push_back(EEDescr(VLd, 1)); + Descrs.push_back(EEDescr(VLd, 0)); + sandboxir::CollectDescr CD(std::move(Descrs)); + EXPECT_TRUE(CD.getSingleInput()); + EXPECT_EQ(CD.getSingleInput()->first, VLd); + EXPECT_THAT(CD.getSingleInput()->second, testing::ElementsAre(1, 0)); + EXPECT_TRUE(CD.hasVectorInputs()); + } + { + // Check multiple inputs. + Descrs.push_back(EEDescr(Ld0)); + Descrs.push_back(EEDescr(VLd, 0)); + Descrs.push_back(EEDescr(VLd, 1)); + sandboxir::CollectDescr CD(std::move(Descrs)); + EXPECT_FALSE(CD.getSingleInput()); + EXPECT_TRUE(CD.hasVectorInputs()); + } + { + // Check multiple inputs only scalars. + Descrs.push_back(EEDescr(Ld0)); + Descrs.push_back(EEDescr(Ld1)); + sandboxir::CollectDescr CD(std::move(Descrs)); + EXPECT_FALSE(CD.getSingleInput()); + EXPECT_FALSE(CD.hasVectorInputs()); + } +} + +TEST_F(LegalityTest, ShuffleMask) { + { + // Check SmallVector constructor. + SmallVector Indices({0, 1, 2, 3}); + sandboxir::ShuffleMask Mask(std::move(Indices)); + EXPECT_THAT(Mask, testing::ElementsAre(0, 1, 2, 3)); + } + { + // Check initializer_list constructor. + sandboxir::ShuffleMask Mask({0, 1, 2, 3}); + EXPECT_THAT(Mask, testing::ElementsAre(0, 1, 2, 3)); + } + { + // Check ArrayRef constructor. + sandboxir::ShuffleMask Mask(ArrayRef({0, 1, 2, 3})); + EXPECT_THAT(Mask, testing::ElementsAre(0, 1, 2, 3)); + } + { + // Check operator ArrayRef(). + sandboxir::ShuffleMask Mask({0, 1, 2, 3}); + ArrayRef Array = Mask; + EXPECT_THAT(Array, testing::ElementsAre(0, 1, 2, 3)); + } + { + // Check getIdentity(). + auto IdentityMask = sandboxir::ShuffleMask::getIdentity(4); + EXPECT_THAT(IdentityMask, testing::ElementsAre(0, 1, 2, 3)); + EXPECT_TRUE(IdentityMask.isIdentity()); + } + { + // Check isIdentity(). + sandboxir::ShuffleMask Mask1({0, 1, 2, 3}); + EXPECT_TRUE(Mask1.isIdentity()); + sandboxir::ShuffleMask Mask2({1, 2, 3, 4}); + EXPECT_FALSE(Mask2.isIdentity()); + } + { + // Check operator==(). 
+ sandboxir::ShuffleMask Mask1({0, 1, 2, 3}); + sandboxir::ShuffleMask Mask2({0, 1, 2, 3}); + EXPECT_TRUE(Mask1 == Mask2); + EXPECT_FALSE(Mask1 != Mask2); + } + { + // Check operator!=(). + sandboxir::ShuffleMask Mask1({0, 1, 2, 3}); + sandboxir::ShuffleMask Mask2({0, 1, 2, 4}); + EXPECT_TRUE(Mask1 != Mask2); + EXPECT_FALSE(Mask1 == Mask2); + } + { + // Check size(). + sandboxir::ShuffleMask Mask({0, 1, 2, 3}); + EXPECT_EQ(Mask.size(), 4u); + } + { + // Check operator[]. + sandboxir::ShuffleMask Mask({0, 1, 2, 3}); + for (auto [Idx, Elm] : enumerate(Mask)) { + EXPECT_EQ(Elm, Mask[Idx]); + } + } + { + // Check begin(), end(). + sandboxir::ShuffleMask Mask({0, 1, 2, 3}); + sandboxir::ShuffleMask::const_iterator Begin = Mask.begin(); + sandboxir::ShuffleMask::const_iterator End = Mask.begin(); + int Idx = 0; + for (auto It = Begin; It != End; ++It) { + EXPECT_EQ(*It, Mask[Idx++]); + } + } +#ifndef NDEBUG + { + // Check print(OS). + sandboxir::ShuffleMask Mask({0, 1, 2, 3}); + std::string Str; + raw_string_ostream OS(Str); + Mask.print(OS); + EXPECT_EQ(Str, "0,1,2,3"); + } + { + // Check operator<<(). + sandboxir::ShuffleMask Mask({0, 1, 2, 3}); + std::string Str; + raw_string_ostream OS(Str); + OS << Mask; + EXPECT_EQ(Str, "0,1,2,3"); + } +#endif // NDEBUG +} diff --git a/llvm/unittests/Transforms/Vectorize/VPlanTest.cpp b/llvm/unittests/Transforms/Vectorize/VPlanTest.cpp index 1ac499fba4175..73dde0af8afdd 100644 --- a/llvm/unittests/Transforms/Vectorize/VPlanTest.cpp +++ b/llvm/unittests/Transforms/Vectorize/VPlanTest.cpp @@ -19,6 +19,7 @@ #include namespace llvm { + namespace { #define CHECK_ITERATOR(Range1, ...) 
\ @@ -30,12 +31,14 @@ namespace { EXPECT_EQ(&std::get<0>(Pair), std::get<1>(Pair)); \ } while (0) -TEST(VPInstructionTest, insertBefore) { +using VPInstructionTest = VPlanTestBase; + +TEST_F(VPInstructionTest, insertBefore) { VPInstruction *I1 = new VPInstruction(0, {}); VPInstruction *I2 = new VPInstruction(1, {}); VPInstruction *I3 = new VPInstruction(2, {}); - VPBasicBlock VPBB1; + VPBasicBlock &VPBB1 = *getPlan().createVPBasicBlock(""); VPBB1.appendRecipe(I1); I2->insertBefore(I1); @@ -45,12 +48,12 @@ TEST(VPInstructionTest, insertBefore) { CHECK_ITERATOR(VPBB1, I3, I2, I1); } -TEST(VPInstructionTest, eraseFromParent) { +TEST_F(VPInstructionTest, eraseFromParent) { VPInstruction *I1 = new VPInstruction(0, {}); VPInstruction *I2 = new VPInstruction(1, {}); VPInstruction *I3 = new VPInstruction(2, {}); - VPBasicBlock VPBB1; + VPBasicBlock &VPBB1 = *getPlan().createVPBasicBlock(""); VPBB1.appendRecipe(I1); VPBB1.appendRecipe(I2); VPBB1.appendRecipe(I3); @@ -65,12 +68,12 @@ TEST(VPInstructionTest, eraseFromParent) { EXPECT_TRUE(VPBB1.empty()); } -TEST(VPInstructionTest, moveAfter) { +TEST_F(VPInstructionTest, moveAfter) { VPInstruction *I1 = new VPInstruction(0, {}); VPInstruction *I2 = new VPInstruction(1, {}); VPInstruction *I3 = new VPInstruction(2, {}); - VPBasicBlock VPBB1; + VPBasicBlock &VPBB1 = *getPlan().createVPBasicBlock(""); VPBB1.appendRecipe(I1); VPBB1.appendRecipe(I2); VPBB1.appendRecipe(I3); @@ -81,7 +84,7 @@ TEST(VPInstructionTest, moveAfter) { VPInstruction *I4 = new VPInstruction(4, {}); VPInstruction *I5 = new VPInstruction(5, {}); - VPBasicBlock VPBB2; + VPBasicBlock &VPBB2 = *getPlan().createVPBasicBlock(""); VPBB2.appendRecipe(I4); VPBB2.appendRecipe(I5); @@ -92,12 +95,12 @@ TEST(VPInstructionTest, moveAfter) { EXPECT_EQ(I3->getParent(), I4->getParent()); } -TEST(VPInstructionTest, moveBefore) { +TEST_F(VPInstructionTest, moveBefore) { VPInstruction *I1 = new VPInstruction(0, {}); VPInstruction *I2 = new VPInstruction(1, {}); VPInstruction 
*I3 = new VPInstruction(2, {}); - VPBasicBlock VPBB1; + VPBasicBlock &VPBB1 = *getPlan().createVPBasicBlock(""); VPBB1.appendRecipe(I1); VPBB1.appendRecipe(I2); VPBB1.appendRecipe(I3); @@ -108,7 +111,7 @@ TEST(VPInstructionTest, moveBefore) { VPInstruction *I4 = new VPInstruction(4, {}); VPInstruction *I5 = new VPInstruction(5, {}); - VPBasicBlock VPBB2; + VPBasicBlock &VPBB2 = *getPlan().createVPBasicBlock(""); VPBB2.appendRecipe(I4); VPBB2.appendRecipe(I5); @@ -118,7 +121,7 @@ TEST(VPInstructionTest, moveBefore) { CHECK_ITERATOR(VPBB2, I3, I4, I5); EXPECT_EQ(I3->getParent(), I4->getParent()); - VPBasicBlock VPBB3; + VPBasicBlock &VPBB3 = *getPlan().createVPBasicBlock(""); I4->moveBefore(VPBB3, VPBB3.end()); @@ -128,9 +131,10 @@ TEST(VPInstructionTest, moveBefore) { EXPECT_EQ(&VPBB3, I4->getParent()); } -TEST(VPInstructionTest, setOperand) { - VPValue *VPV1 = new VPValue(); - VPValue *VPV2 = new VPValue(); +TEST_F(VPInstructionTest, setOperand) { + IntegerType *Int32 = IntegerType::get(C, 32); + VPValue *VPV1 = getPlan().getOrAddLiveIn(ConstantInt::get(Int32, 1)); + VPValue *VPV2 = getPlan().getOrAddLiveIn(ConstantInt::get(Int32, 2)); VPInstruction *I1 = new VPInstruction(0, {VPV1, VPV2}); EXPECT_EQ(1u, VPV1->getNumUsers()); EXPECT_EQ(I1, *VPV1->user_begin()); @@ -138,7 +142,7 @@ TEST(VPInstructionTest, setOperand) { EXPECT_EQ(I1, *VPV2->user_begin()); // Replace operand 0 (VPV1) with VPV3. - VPValue *VPV3 = new VPValue(); + VPValue *VPV3 = getPlan().getOrAddLiveIn(ConstantInt::get(Int32, 3)); I1->setOperand(0, VPV3); EXPECT_EQ(0u, VPV1->getNumUsers()); EXPECT_EQ(1u, VPV2->getNumUsers()); @@ -155,7 +159,7 @@ TEST(VPInstructionTest, setOperand) { EXPECT_EQ(I1, *std::next(VPV3->user_begin())); // Replace operand 0 (VPV3) with VPV4. 
- VPValue *VPV4 = new VPValue(); + VPValue *VPV4 = getPlan().getOrAddLiveIn(ConstantInt::get(Int32, 4)); I1->setOperand(0, VPV4); EXPECT_EQ(1u, VPV3->getNumUsers()); EXPECT_EQ(I1, *VPV3->user_begin()); @@ -168,19 +172,16 @@ TEST(VPInstructionTest, setOperand) { EXPECT_EQ(I1, *std::next(VPV4->user_begin())); delete I1; - delete VPV1; - delete VPV2; - delete VPV3; - delete VPV4; } -TEST(VPInstructionTest, replaceAllUsesWith) { - VPValue *VPV1 = new VPValue(); - VPValue *VPV2 = new VPValue(); +TEST_F(VPInstructionTest, replaceAllUsesWith) { + IntegerType *Int32 = IntegerType::get(C, 32); + VPValue *VPV1 = getPlan().getOrAddLiveIn(ConstantInt::get(Int32, 1)); + VPValue *VPV2 = getPlan().getOrAddLiveIn(ConstantInt::get(Int32, 2)); VPInstruction *I1 = new VPInstruction(0, {VPV1, VPV2}); // Replace all uses of VPV1 with VPV3. - VPValue *VPV3 = new VPValue(); + VPValue *VPV3 = getPlan().getOrAddLiveIn(ConstantInt::get(Int32, 3)); VPV1->replaceAllUsesWith(VPV3); EXPECT_EQ(VPV3, I1->getOperand(0)); EXPECT_EQ(VPV2, I1->getOperand(1)); @@ -215,14 +216,12 @@ TEST(VPInstructionTest, replaceAllUsesWith) { delete I1; delete I2; - delete VPV1; - delete VPV2; - delete VPV3; } -TEST(VPInstructionTest, releaseOperandsAtDeletion) { - VPValue *VPV1 = new VPValue(); - VPValue *VPV2 = new VPValue(); +TEST_F(VPInstructionTest, releaseOperandsAtDeletion) { + IntegerType *Int32 = IntegerType::get(C, 32); + VPValue *VPV1 = getPlan().getOrAddLiveIn(ConstantInt::get(Int32, 1)); + VPValue *VPV2 = getPlan().getOrAddLiveIn(ConstantInt::get(Int32, 1)); VPInstruction *I1 = new VPInstruction(0, {VPV1, VPV2}); EXPECT_EQ(1u, VPV1->getNumUsers()); @@ -234,9 +233,6 @@ TEST(VPInstructionTest, releaseOperandsAtDeletion) { EXPECT_EQ(0u, VPV1->getNumUsers()); EXPECT_EQ(0u, VPV2->getNumUsers()); - - delete VPV1; - delete VPV2; } using VPBasicBlockTest = VPlanTestBase; @@ -867,9 +863,11 @@ No successors using VPRecipeTest = VPlanTestBase; TEST_F(VPRecipeTest, CastVPInstructionToVPUser) { - VPValue Op1; - 
VPValue Op2; - VPInstruction Recipe(Instruction::Add, {&Op1, &Op2}); + IntegerType *Int32 = IntegerType::get(C, 32); + VPlan &Plan = getPlan(); + VPValue *Op1 = Plan.getOrAddLiveIn(ConstantInt::get(Int32, 1)); + VPValue *Op2 = Plan.getOrAddLiveIn(ConstantInt::get(Int32, 2)); + VPInstruction Recipe(Instruction::Add, {Op1, Op2}); EXPECT_TRUE(isa(&Recipe)); VPRecipeBase *BaseR = &Recipe; EXPECT_TRUE(isa(BaseR)); @@ -877,14 +875,15 @@ TEST_F(VPRecipeTest, CastVPInstructionToVPUser) { } TEST_F(VPRecipeTest, CastVPWidenRecipeToVPUser) { + VPlan &Plan = getPlan(); IntegerType *Int32 = IntegerType::get(C, 32); auto *AI = BinaryOperator::CreateAdd(PoisonValue::get(Int32), PoisonValue::get(Int32)); - VPValue Op1; - VPValue Op2; + VPValue *Op1 = Plan.getOrAddLiveIn(ConstantInt::get(Int32, 1)); + VPValue *Op2 = Plan.getOrAddLiveIn(ConstantInt::get(Int32, 2)); SmallVector Args; - Args.push_back(&Op1); - Args.push_back(&Op1); + Args.push_back(Op1); + Args.push_back(Op2); VPWidenRecipe WidenR(*AI, make_range(Args.begin(), Args.end())); EXPECT_TRUE(isa(&WidenR)); VPRecipeBase *WidenRBase = &WidenR; @@ -894,17 +893,18 @@ TEST_F(VPRecipeTest, CastVPWidenRecipeToVPUser) { } TEST_F(VPRecipeTest, CastVPWidenCallRecipeToVPUserAndVPDef) { + VPlan &Plan = getPlan(); IntegerType *Int32 = IntegerType::get(C, 32); FunctionType *FTy = FunctionType::get(Int32, false); Function *Fn = Function::Create(FTy, GlobalValue::ExternalLinkage, 0); auto *Call = CallInst::Create(FTy, Fn); - VPValue Op1; - VPValue Op2; - VPValue CalledFn(Call->getCalledFunction()); + VPValue *Op1 = Plan.getOrAddLiveIn(ConstantInt::get(Int32, 1)); + VPValue *Op2 = Plan.getOrAddLiveIn(ConstantInt::get(Int32, 2)); + VPValue *CalledFn = Plan.getOrAddLiveIn(Call->getCalledFunction()); SmallVector Args; - Args.push_back(&Op1); - Args.push_back(&Op2); - Args.push_back(&CalledFn); + Args.push_back(Op1); + Args.push_back(Op2); + Args.push_back(CalledFn); VPWidenCallRecipe Recipe(Call, Fn, Args); EXPECT_TRUE(isa(&Recipe)); 
VPRecipeBase *BaseR = &Recipe; @@ -920,17 +920,18 @@ TEST_F(VPRecipeTest, CastVPWidenCallRecipeToVPUserAndVPDef) { } TEST_F(VPRecipeTest, CastVPWidenSelectRecipeToVPUserAndVPDef) { + VPlan &Plan = getPlan(); IntegerType *Int1 = IntegerType::get(C, 1); IntegerType *Int32 = IntegerType::get(C, 32); auto *SelectI = SelectInst::Create( PoisonValue::get(Int1), PoisonValue::get(Int32), PoisonValue::get(Int32)); - VPValue Op1; - VPValue Op2; - VPValue Op3; + VPValue *Op1 = Plan.getOrAddLiveIn(ConstantInt::get(Int32, 1)); + VPValue *Op2 = Plan.getOrAddLiveIn(ConstantInt::get(Int32, 2)); + VPValue *Op3 = Plan.getOrAddLiveIn(ConstantInt::get(Int32, 3)); SmallVector Args; - Args.push_back(&Op1); - Args.push_back(&Op2); - Args.push_back(&Op3); + Args.push_back(Op1); + Args.push_back(Op2); + Args.push_back(Op3); VPWidenSelectRecipe WidenSelectR(*SelectI, make_range(Args.begin(), Args.end())); EXPECT_TRUE(isa(&WidenSelectR)); @@ -946,15 +947,16 @@ TEST_F(VPRecipeTest, CastVPWidenSelectRecipeToVPUserAndVPDef) { } TEST_F(VPRecipeTest, CastVPWidenGEPRecipeToVPUserAndVPDef) { + VPlan &Plan = getPlan(); IntegerType *Int32 = IntegerType::get(C, 32); PointerType *Int32Ptr = PointerType::get(Int32, 0); auto *GEP = GetElementPtrInst::Create(Int32, PoisonValue::get(Int32Ptr), PoisonValue::get(Int32)); - VPValue Op1; - VPValue Op2; + VPValue *Op1 = Plan.getOrAddLiveIn(ConstantInt::get(Int32, 1)); + VPValue *Op2 = Plan.getOrAddLiveIn(ConstantInt::get(Int32, 2)); SmallVector Args; - Args.push_back(&Op1); - Args.push_back(&Op2); + Args.push_back(Op1); + Args.push_back(Op2); VPWidenGEPRecipe Recipe(GEP, make_range(Args.begin(), Args.end())); EXPECT_TRUE(isa(&Recipe)); VPRecipeBase *BaseR = &Recipe; @@ -969,15 +971,17 @@ TEST_F(VPRecipeTest, CastVPWidenGEPRecipeToVPUserAndVPDef) { } TEST_F(VPRecipeTest, CastVPBlendRecipeToVPUser) { + VPlan &Plan = getPlan(); IntegerType *Int32 = IntegerType::get(C, 32); auto *Phi = PHINode::Create(Int32, 1); - VPValue I1; - VPValue I2; - VPValue M2; + + VPValue 
*I1 = Plan.getOrAddLiveIn(ConstantInt::get(Int32, 1)); + VPValue *I2 = Plan.getOrAddLiveIn(ConstantInt::get(Int32, 2)); + VPValue *M2 = Plan.getOrAddLiveIn(ConstantInt::get(Int32, 3)); SmallVector Args; - Args.push_back(&I1); - Args.push_back(&I2); - Args.push_back(&M2); + Args.push_back(I1); + Args.push_back(I2); + Args.push_back(M2); VPBlendRecipe Recipe(Phi, Args); EXPECT_TRUE(isa(&Recipe)); VPRecipeBase *BaseR = &Recipe; @@ -986,10 +990,12 @@ TEST_F(VPRecipeTest, CastVPBlendRecipeToVPUser) { } TEST_F(VPRecipeTest, CastVPInterleaveRecipeToVPUser) { - VPValue Addr; - VPValue Mask; + VPlan &Plan = getPlan(); + IntegerType *Int32 = IntegerType::get(C, 32); + VPValue *Addr = Plan.getOrAddLiveIn(ConstantInt::get(Int32, 1)); + VPValue *Mask = Plan.getOrAddLiveIn(ConstantInt::get(Int32, 2)); InterleaveGroup IG(4, false, Align(4)); - VPInterleaveRecipe Recipe(&IG, &Addr, {}, &Mask, false); + VPInterleaveRecipe Recipe(&IG, Addr, {}, Mask, false); EXPECT_TRUE(isa(&Recipe)); VPRecipeBase *BaseR = &Recipe; EXPECT_TRUE(isa(BaseR)); @@ -997,13 +1003,14 @@ TEST_F(VPRecipeTest, CastVPInterleaveRecipeToVPUser) { } TEST_F(VPRecipeTest, CastVPReplicateRecipeToVPUser) { - VPValue Op1; - VPValue Op2; + VPlan &Plan = getPlan(); + IntegerType *Int32 = IntegerType::get(C, 32); + VPValue *Op1 = Plan.getOrAddLiveIn(ConstantInt::get(Int32, 1)); + VPValue *Op2 = Plan.getOrAddLiveIn(ConstantInt::get(Int32, 2)); SmallVector Args; - Args.push_back(&Op1); - Args.push_back(&Op2); + Args.push_back(Op1); + Args.push_back(Op2); - IntegerType *Int32 = IntegerType::get(C, 32); FunctionType *FTy = FunctionType::get(Int32, false); auto *Call = CallInst::Create(FTy, PoisonValue::get(FTy)); VPReplicateRecipe Recipe(Call, make_range(Args.begin(), Args.end()), true); @@ -1014,8 +1021,10 @@ TEST_F(VPRecipeTest, CastVPReplicateRecipeToVPUser) { } TEST_F(VPRecipeTest, CastVPBranchOnMaskRecipeToVPUser) { - VPValue Mask; - VPBranchOnMaskRecipe Recipe(&Mask); + VPlan &Plan = getPlan(); + IntegerType *Int32 = 
IntegerType::get(C, 32); + VPValue *Mask = Plan.getOrAddLiveIn(ConstantInt::get(Int32, 1)); + VPBranchOnMaskRecipe Recipe(Mask); EXPECT_TRUE(isa(&Recipe)); VPRecipeBase *BaseR = &Recipe; EXPECT_TRUE(isa(BaseR)); @@ -1023,13 +1032,14 @@ TEST_F(VPRecipeTest, CastVPBranchOnMaskRecipeToVPUser) { } TEST_F(VPRecipeTest, CastVPWidenMemoryRecipeToVPUserAndVPDef) { + VPlan &Plan = getPlan(); IntegerType *Int32 = IntegerType::get(C, 32); PointerType *Int32Ptr = PointerType::get(Int32, 0); auto *Load = new LoadInst(Int32, PoisonValue::get(Int32Ptr), "", false, Align(1)); - VPValue Addr; - VPValue Mask; - VPWidenLoadRecipe Recipe(*Load, &Addr, &Mask, true, false, {}); + VPValue *Addr = Plan.getOrAddLiveIn(ConstantInt::get(Int32, 1)); + VPValue *Mask = Plan.getOrAddLiveIn(ConstantInt::get(Int32, 2)); + VPWidenLoadRecipe Recipe(*Load, Addr, Mask, true, false, {}); EXPECT_TRUE(isa(&Recipe)); VPRecipeBase *BaseR = &Recipe; EXPECT_TRUE(isa(BaseR)); @@ -1046,15 +1056,16 @@ TEST_F(VPRecipeTest, MayHaveSideEffectsAndMayReadWriteMemory) { IntegerType *Int1 = IntegerType::get(C, 1); IntegerType *Int32 = IntegerType::get(C, 32); PointerType *Int32Ptr = PointerType::get(Int32, 0); + VPlan &Plan = getPlan(); { auto *AI = BinaryOperator::CreateAdd(PoisonValue::get(Int32), PoisonValue::get(Int32)); - VPValue Op1; - VPValue Op2; + VPValue *Op1 = Plan.getOrAddLiveIn(ConstantInt::get(Int32, 1)); + VPValue *Op2 = Plan.getOrAddLiveIn(ConstantInt::get(Int32, 2)); SmallVector Args; - Args.push_back(&Op1); - Args.push_back(&Op1); + Args.push_back(Op1); + Args.push_back(Op2); VPWidenRecipe Recipe(*AI, make_range(Args.begin(), Args.end())); EXPECT_FALSE(Recipe.mayHaveSideEffects()); EXPECT_FALSE(Recipe.mayReadFromMemory()); @@ -1067,13 +1078,13 @@ TEST_F(VPRecipeTest, MayHaveSideEffectsAndMayReadWriteMemory) { auto *SelectI = SelectInst::Create(PoisonValue::get(Int1), PoisonValue::get(Int32), PoisonValue::get(Int32)); - VPValue Op1; - VPValue Op2; - VPValue Op3; + VPValue *Op1 = 
Plan.getOrAddLiveIn(ConstantInt::get(Int32, 1)); + VPValue *Op2 = Plan.getOrAddLiveIn(ConstantInt::get(Int32, 2)); + VPValue *Op3 = Plan.getOrAddLiveIn(ConstantInt::get(Int32, 3)); SmallVector Args; - Args.push_back(&Op1); - Args.push_back(&Op2); - Args.push_back(&Op3); + Args.push_back(Op1); + Args.push_back(Op2); + Args.push_back(Op3); VPWidenSelectRecipe Recipe(*SelectI, make_range(Args.begin(), Args.end())); EXPECT_FALSE(Recipe.mayHaveSideEffects()); EXPECT_FALSE(Recipe.mayReadFromMemory()); @@ -1085,11 +1096,11 @@ TEST_F(VPRecipeTest, MayHaveSideEffectsAndMayReadWriteMemory) { { auto *GEP = GetElementPtrInst::Create(Int32, PoisonValue::get(Int32Ptr), PoisonValue::get(Int32)); - VPValue Op1; - VPValue Op2; + VPValue *Op1 = Plan.getOrAddLiveIn(ConstantInt::get(Int32, 1)); + VPValue *Op2 = Plan.getOrAddLiveIn(ConstantInt::get(Int32, 2)); SmallVector Args; - Args.push_back(&Op1); - Args.push_back(&Op2); + Args.push_back(Op1); + Args.push_back(Op2); VPWidenGEPRecipe Recipe(GEP, make_range(Args.begin(), Args.end())); EXPECT_FALSE(Recipe.mayHaveSideEffects()); EXPECT_FALSE(Recipe.mayReadFromMemory()); @@ -1099,8 +1110,9 @@ TEST_F(VPRecipeTest, MayHaveSideEffectsAndMayReadWriteMemory) { } { - VPValue Mask; - VPBranchOnMaskRecipe Recipe(&Mask); + VPValue *Mask = Plan.getOrAddLiveIn(ConstantInt::get(Int32, 1)); + + VPBranchOnMaskRecipe Recipe(Mask); EXPECT_TRUE(Recipe.mayHaveSideEffects()); EXPECT_FALSE(Recipe.mayReadFromMemory()); EXPECT_FALSE(Recipe.mayWriteToMemory()); @@ -1108,11 +1120,11 @@ TEST_F(VPRecipeTest, MayHaveSideEffectsAndMayReadWriteMemory) { } { - VPValue ChainOp; - VPValue VecOp; - VPValue CondOp; - VPReductionRecipe Recipe(RecurrenceDescriptor(), nullptr, &ChainOp, &CondOp, - &VecOp, false); + VPValue *ChainOp = Plan.getOrAddLiveIn(ConstantInt::get(Int32, 1)); + VPValue *VecOp = Plan.getOrAddLiveIn(ConstantInt::get(Int32, 2)); + VPValue *CondOp = Plan.getOrAddLiveIn(ConstantInt::get(Int32, 3)); + VPReductionRecipe Recipe(RecurrenceDescriptor(), 
nullptr, ChainOp, CondOp, + VecOp, false); EXPECT_FALSE(Recipe.mayHaveSideEffects()); EXPECT_FALSE(Recipe.mayReadFromMemory()); EXPECT_FALSE(Recipe.mayWriteToMemory()); @@ -1120,13 +1132,13 @@ TEST_F(VPRecipeTest, MayHaveSideEffectsAndMayReadWriteMemory) { } { - VPValue ChainOp; - VPValue VecOp; - VPValue CondOp; - VPReductionRecipe Recipe(RecurrenceDescriptor(), nullptr, &ChainOp, &CondOp, - &VecOp, false); - VPValue EVL; - VPReductionEVLRecipe EVLRecipe(Recipe, EVL, &CondOp); + VPValue *ChainOp = Plan.getOrAddLiveIn(ConstantInt::get(Int32, 1)); + VPValue *VecOp = Plan.getOrAddLiveIn(ConstantInt::get(Int32, 2)); + VPValue *CondOp = Plan.getOrAddLiveIn(ConstantInt::get(Int32, 3)); + VPReductionRecipe Recipe(RecurrenceDescriptor(), nullptr, ChainOp, CondOp, + VecOp, false); + VPValue *EVL = Plan.getOrAddLiveIn(ConstantInt::get(Int32, 4)); + VPReductionEVLRecipe EVLRecipe(Recipe, *EVL, CondOp); EXPECT_FALSE(EVLRecipe.mayHaveSideEffects()); EXPECT_FALSE(EVLRecipe.mayReadFromMemory()); EXPECT_FALSE(EVLRecipe.mayWriteToMemory()); @@ -1136,9 +1148,9 @@ TEST_F(VPRecipeTest, MayHaveSideEffectsAndMayReadWriteMemory) { { auto *Load = new LoadInst(Int32, PoisonValue::get(Int32Ptr), "", false, Align(1)); - VPValue Addr; - VPValue Mask; - VPWidenLoadRecipe Recipe(*Load, &Addr, &Mask, true, false, {}); + VPValue *Mask = Plan.getOrAddLiveIn(ConstantInt::get(Int32, 1)); + VPValue *Addr = Plan.getOrAddLiveIn(ConstantInt::get(Int32, 2)); + VPWidenLoadRecipe Recipe(*Load, Addr, Mask, true, false, {}); EXPECT_FALSE(Recipe.mayHaveSideEffects()); EXPECT_TRUE(Recipe.mayReadFromMemory()); EXPECT_FALSE(Recipe.mayWriteToMemory()); @@ -1149,10 +1161,10 @@ TEST_F(VPRecipeTest, MayHaveSideEffectsAndMayReadWriteMemory) { { auto *Store = new StoreInst(PoisonValue::get(Int32), PoisonValue::get(Int32Ptr), false, Align(1)); - VPValue Addr; - VPValue Mask; - VPValue StoredV; - VPWidenStoreRecipe Recipe(*Store, &Addr, &StoredV, &Mask, false, false, {}); + VPValue *Mask = 
Plan.getOrAddLiveIn(ConstantInt::get(Int32, 1)); + VPValue *Addr = Plan.getOrAddLiveIn(ConstantInt::get(Int32, 2)); + VPValue *StoredV = Plan.getOrAddLiveIn(ConstantInt::get(Int32, 3)); + VPWidenStoreRecipe Recipe(*Store, Addr, StoredV, Mask, false, false, {}); EXPECT_TRUE(Recipe.mayHaveSideEffects()); EXPECT_FALSE(Recipe.mayReadFromMemory()); EXPECT_TRUE(Recipe.mayWriteToMemory()); @@ -1164,13 +1176,13 @@ TEST_F(VPRecipeTest, MayHaveSideEffectsAndMayReadWriteMemory) { FunctionType *FTy = FunctionType::get(Int32, false); Function *Fn = Function::Create(FTy, GlobalValue::ExternalLinkage, 0); auto *Call = CallInst::Create(FTy, Fn); - VPValue Op1; - VPValue Op2; - VPValue CalledFn(Call->getCalledFunction()); + VPValue *Op1 = Plan.getOrAddLiveIn(ConstantInt::get(Int32, 1)); + VPValue *Op2 = Plan.getOrAddLiveIn(ConstantInt::get(Int32, 2)); + VPValue *CalledFn = Plan.getOrAddLiveIn(Call->getCalledFunction()); SmallVector Args; - Args.push_back(&Op1); - Args.push_back(&Op2); - Args.push_back(&CalledFn); + Args.push_back(Op1); + Args.push_back(Op2); + Args.push_back(CalledFn); VPWidenCallRecipe Recipe(Call, Fn, Args); EXPECT_TRUE(Recipe.mayHaveSideEffects()); EXPECT_TRUE(Recipe.mayReadFromMemory()); @@ -1187,13 +1199,13 @@ TEST_F(VPRecipeTest, MayHaveSideEffectsAndMayReadWriteMemory) { Intrinsic::getOrInsertDeclaration(&M, Intrinsic::thread_pointer); auto *Call = CallInst::Create(TheFn->getFunctionType(), TheFn); - VPValue Op1; - VPValue Op2; - VPValue CalledFn(TheFn); + VPValue *Op1 = Plan.getOrAddLiveIn(ConstantInt::get(Int32, 1)); + VPValue *Op2 = Plan.getOrAddLiveIn(ConstantInt::get(Int32, 2)); + VPValue *CalledFn = Plan.getOrAddLiveIn(Call->getCalledFunction()); SmallVector Args; - Args.push_back(&Op1); - Args.push_back(&Op2); - Args.push_back(&CalledFn); + Args.push_back(Op1); + Args.push_back(Op2); + Args.push_back(CalledFn); VPWidenCallRecipe Recipe(Call, TheFn, Args); EXPECT_FALSE(Recipe.mayHaveSideEffects()); EXPECT_FALSE(Recipe.mayReadFromMemory()); @@ -1203,21 
+1215,20 @@ TEST_F(VPRecipeTest, MayHaveSideEffectsAndMayReadWriteMemory) { } { - VPValue Op1; - VPValue Op2; + VPValue *Op1 = Plan.getOrAddLiveIn(ConstantInt::get(Int32, 1)); + VPValue *Op2 = Plan.getOrAddLiveIn(ConstantInt::get(Int32, 2)); InductionDescriptor IndDesc; - VPScalarIVStepsRecipe Recipe(IndDesc, &Op1, &Op2); + VPScalarIVStepsRecipe Recipe(IndDesc, Op1, Op2); EXPECT_FALSE(Recipe.mayHaveSideEffects()); EXPECT_FALSE(Recipe.mayReadFromMemory()); EXPECT_FALSE(Recipe.mayWriteToMemory()); EXPECT_FALSE(Recipe.mayReadOrWriteMemory()); } - // The initial implementation is conservative with respect to VPInstructions. { - VPValue Op1; - VPValue Op2; - VPInstruction VPInst(Instruction::Add, {&Op1, &Op2}); + VPValue *Op1 = Plan.getOrAddLiveIn(ConstantInt::get(Int32, 1)); + VPValue *Op2 = Plan.getOrAddLiveIn(ConstantInt::get(Int32, 2)); + VPInstruction VPInst(Instruction::Add, {Op1, Op2}); VPRecipeBase &Recipe = VPInst; EXPECT_FALSE(Recipe.mayHaveSideEffects()); EXPECT_FALSE(Recipe.mayReadFromMemory()); @@ -1225,8 +1236,8 @@ TEST_F(VPRecipeTest, MayHaveSideEffectsAndMayReadWriteMemory) { EXPECT_FALSE(Recipe.mayReadOrWriteMemory()); } { - VPValue Op1; - VPPredInstPHIRecipe Recipe(&Op1, {}); + VPValue *Op1 = Plan.getOrAddLiveIn(ConstantInt::get(Int32, 1)); + VPPredInstPHIRecipe Recipe(Op1, {}); EXPECT_FALSE(Recipe.mayHaveSideEffects()); EXPECT_FALSE(Recipe.mayReadFromMemory()); EXPECT_FALSE(Recipe.mayWriteToMemory()); @@ -1395,8 +1406,8 @@ TEST_F(VPRecipeTest, dumpRecipeUnnamedVPValuesNotInPlanOrBlock) { auto *AI = BinaryOperator::CreateAdd(PoisonValue::get(Int32), PoisonValue::get(Int32)); AI->setName("a"); - VPValue *ExtVPV1 = new VPValue(ConstantInt::get(Int32, 1)); - VPValue *ExtVPV2 = new VPValue(AI); + VPValue *ExtVPV1 = getPlan().getOrAddLiveIn(ConstantInt::get(Int32, 1)); + VPValue *ExtVPV2 = getPlan().getOrAddLiveIn(AI); VPInstruction *I1 = new VPInstruction(Instruction::Add, {ExtVPV1, ExtVPV2}); VPInstruction *I2 = new VPInstruction(Instruction::Mul, {I1, 
I1}); @@ -1466,36 +1477,37 @@ TEST_F(VPRecipeTest, dumpRecipeUnnamedVPValuesNotInPlanOrBlock) { delete I2; delete I1; - delete ExtVPV2; - delete ExtVPV1; delete AI; } #endif TEST_F(VPRecipeTest, CastVPReductionRecipeToVPUser) { - VPValue ChainOp; - VPValue VecOp; - VPValue CondOp; - VPReductionRecipe Recipe(RecurrenceDescriptor(), nullptr, &ChainOp, &CondOp, - &VecOp, false); + IntegerType *Int32 = IntegerType::get(C, 32); + VPValue *ChainOp = getPlan().getOrAddLiveIn(ConstantInt::get(Int32, 1)); + VPValue *VecOp = getPlan().getOrAddLiveIn(ConstantInt::get(Int32, 2)); + VPValue *CondOp = getPlan().getOrAddLiveIn(ConstantInt::get(Int32, 3)); + VPReductionRecipe Recipe(RecurrenceDescriptor(), nullptr, ChainOp, CondOp, + VecOp, false); EXPECT_TRUE(isa(&Recipe)); VPRecipeBase *BaseR = &Recipe; EXPECT_TRUE(isa(BaseR)); } TEST_F(VPRecipeTest, CastVPReductionEVLRecipeToVPUser) { - VPValue ChainOp; - VPValue VecOp; - VPValue CondOp; - VPReductionRecipe Recipe(RecurrenceDescriptor(), nullptr, &ChainOp, &CondOp, - &VecOp, false); - VPValue EVL; - VPReductionEVLRecipe EVLRecipe(Recipe, EVL, &CondOp); + IntegerType *Int32 = IntegerType::get(C, 32); + VPValue *ChainOp = getPlan().getOrAddLiveIn(ConstantInt::get(Int32, 1)); + VPValue *VecOp = getPlan().getOrAddLiveIn(ConstantInt::get(Int32, 2)); + VPValue *CondOp = getPlan().getOrAddLiveIn(ConstantInt::get(Int32, 3)); + VPReductionRecipe Recipe(RecurrenceDescriptor(), nullptr, ChainOp, CondOp, + VecOp, false); + VPValue *EVL = getPlan().getOrAddLiveIn(ConstantInt::get(Int32, 0)); + VPReductionEVLRecipe EVLRecipe(Recipe, *EVL, CondOp); EXPECT_TRUE(isa(&EVLRecipe)); VPRecipeBase *BaseR = &EVLRecipe; EXPECT_TRUE(isa(BaseR)); } +} // namespace struct VPDoubleValueDef : public VPRecipeBase { VPDoubleValueDef(ArrayRef Operands) : VPRecipeBase(99, Operands) { @@ -1512,6 +1524,8 @@ struct VPDoubleValueDef : public VPRecipeBase { #endif }; +namespace { + TEST(VPDoubleValueDefTest, traverseUseLists) { // Check that the def-use chains of a 
multi-def can be traversed in both // directions. @@ -1557,8 +1571,9 @@ TEST(VPDoubleValueDefTest, traverseUseLists) { } TEST_F(VPRecipeTest, CastToVPSingleDefRecipe) { - VPValue Start; - VPEVLBasedIVPHIRecipe R(&Start, {}); + IntegerType *Int32 = IntegerType::get(C, 32); + VPValue *Start = getPlan().getOrAddLiveIn(ConstantInt::get(Int32, 0)); + VPEVLBasedIVPHIRecipe R(Start, {}); VPRecipeBase *B = &R; EXPECT_TRUE(isa(B)); // TODO: check other VPSingleDefRecipes. diff --git a/llvm/unittests/Transforms/Vectorize/VPlanVerifierTest.cpp b/llvm/unittests/Transforms/Vectorize/VPlanVerifierTest.cpp index f098ba0bce497..f818b49fdbe7f 100644 --- a/llvm/unittests/Transforms/Vectorize/VPlanVerifierTest.cpp +++ b/llvm/unittests/Transforms/Vectorize/VPlanVerifierTest.cpp @@ -20,14 +20,17 @@ using VPVerifierTest = VPlanTestBase; namespace { TEST_F(VPVerifierTest, VPInstructionUseBeforeDefSameBB) { VPlan &Plan = getPlan(); - VPInstruction *DefI = new VPInstruction(Instruction::Add, {}); + VPValue *Zero = Plan.getOrAddLiveIn(ConstantInt::get(Type::getInt32Ty(C), 0)); + VPInstruction *DefI = new VPInstruction(Instruction::Add, {Zero}); VPInstruction *UseI = new VPInstruction(Instruction::Sub, {DefI}); + auto *CanIV = new VPCanonicalIVPHIRecipe(Zero, {}); VPBasicBlock *VPBB1 = Plan.getEntry(); VPBB1->appendRecipe(UseI); VPBB1->appendRecipe(DefI); VPBasicBlock *VPBB2 = Plan.createVPBasicBlock(""); + VPBB2->appendRecipe(CanIV); VPRegionBlock *R1 = Plan.createVPRegionBlock(VPBB2, VPBB2, "R1"); VPBlockUtils::connectBlocks(VPBB1, R1); VPBlockUtils::connectBlocks(R1, Plan.getScalarHeader()); @@ -44,9 +47,10 @@ TEST_F(VPVerifierTest, VPInstructionUseBeforeDefSameBB) { TEST_F(VPVerifierTest, VPInstructionUseBeforeDefDifferentBB) { VPlan &Plan = getPlan(); - VPInstruction *DefI = new VPInstruction(Instruction::Add, {}); + VPValue *Zero = Plan.getOrAddLiveIn(ConstantInt::get(Type::getInt32Ty(C), 0)); + VPInstruction *DefI = new VPInstruction(Instruction::Add, {Zero}); VPInstruction *UseI = 
new VPInstruction(Instruction::Sub, {DefI}); - auto *CanIV = new VPCanonicalIVPHIRecipe(UseI, {}); + auto *CanIV = new VPCanonicalIVPHIRecipe(Zero, {}); VPInstruction *BranchOnCond = new VPInstruction(VPInstruction::BranchOnCond, {CanIV}); @@ -73,23 +77,22 @@ TEST_F(VPVerifierTest, VPInstructionUseBeforeDefDifferentBB) { } TEST_F(VPVerifierTest, VPBlendUseBeforeDefDifferentBB) { + VPlan &Plan = getPlan(); IntegerType *Int32 = IntegerType::get(C, 32); auto *Phi = PHINode::Create(Int32, 1); + VPValue *Zero = Plan.getOrAddLiveIn(ConstantInt::get(Int32, 0)); - VPInstruction *I1 = new VPInstruction(Instruction::Add, {}); - VPInstruction *DefI = new VPInstruction(Instruction::Add, {}); - auto *CanIV = new VPCanonicalIVPHIRecipe(I1, {}); + VPInstruction *DefI = new VPInstruction(Instruction::Add, {Zero}); + auto *CanIV = new VPCanonicalIVPHIRecipe(Zero, {}); VPInstruction *BranchOnCond = new VPInstruction(VPInstruction::BranchOnCond, {CanIV}); auto *Blend = new VPBlendRecipe(Phi, {DefI}); - VPlan &Plan = getPlan(); VPBasicBlock *VPBB1 = Plan.getEntry(); VPBasicBlock *VPBB2 = Plan.createVPBasicBlock(""); VPBasicBlock *VPBB3 = Plan.createVPBasicBlock(""); VPBasicBlock *VPBB4 = Plan.createVPBasicBlock(""); - VPBB1->appendRecipe(I1); VPBB2->appendRecipe(CanIV); VPBB3->appendRecipe(Blend); VPBB4->appendRecipe(DefI); @@ -116,14 +119,15 @@ TEST_F(VPVerifierTest, VPBlendUseBeforeDefDifferentBB) { } TEST_F(VPVerifierTest, DuplicateSuccessorsOutsideRegion) { - VPInstruction *I1 = new VPInstruction(Instruction::Add, {}); - auto *CanIV = new VPCanonicalIVPHIRecipe(I1, {}); + VPlan &Plan = getPlan(); + VPValue *Zero = Plan.getOrAddLiveIn(ConstantInt::get(Type::getInt32Ty(C), 0)); + VPInstruction *I1 = new VPInstruction(Instruction::Add, {Zero}); + auto *CanIV = new VPCanonicalIVPHIRecipe(Zero, {}); VPInstruction *BranchOnCond = new VPInstruction(VPInstruction::BranchOnCond, {CanIV}); VPInstruction *BranchOnCond2 = new VPInstruction(VPInstruction::BranchOnCond, {I1}); - VPlan &Plan = 
getPlan(); VPBasicBlock *VPBB1 = Plan.getEntry(); VPBasicBlock *VPBB2 = Plan.createVPBasicBlock(""); @@ -149,14 +153,15 @@ TEST_F(VPVerifierTest, DuplicateSuccessorsOutsideRegion) { } TEST_F(VPVerifierTest, DuplicateSuccessorsInsideRegion) { - VPInstruction *I1 = new VPInstruction(Instruction::Add, {}); - auto *CanIV = new VPCanonicalIVPHIRecipe(I1, {}); + VPlan &Plan = getPlan(); + VPValue *Zero = Plan.getOrAddLiveIn(ConstantInt::get(Type::getInt32Ty(C), 0)); + VPInstruction *I1 = new VPInstruction(Instruction::Add, {Zero}); + auto *CanIV = new VPCanonicalIVPHIRecipe(Zero, {}); VPInstruction *BranchOnCond = new VPInstruction(VPInstruction::BranchOnCond, {CanIV}); VPInstruction *BranchOnCond2 = new VPInstruction(VPInstruction::BranchOnCond, {I1}); - VPlan &Plan = getPlan(); VPBasicBlock *VPBB1 = Plan.getEntry(); VPBasicBlock *VPBB2 = Plan.createVPBasicBlock(""); VPBasicBlock *VPBB3 = Plan.createVPBasicBlock(""); @@ -186,10 +191,15 @@ TEST_F(VPVerifierTest, DuplicateSuccessorsInsideRegion) { TEST_F(VPVerifierTest, BlockOutsideRegionWithParent) { VPlan &Plan = getPlan(); + VPBasicBlock *VPBB1 = Plan.getEntry(); VPBasicBlock *VPBB2 = Plan.createVPBasicBlock(""); - VPInstruction *DefI = new VPInstruction(Instruction::Add, {}); + VPValue *Zero = Plan.getOrAddLiveIn(ConstantInt::get(Type::getInt32Ty(C), 0)); + auto *CanIV = new VPCanonicalIVPHIRecipe(Zero, {}); + VPBB2->appendRecipe(CanIV); + + VPInstruction *DefI = new VPInstruction(Instruction::Add, {Zero}); VPInstruction *BranchOnCond = new VPInstruction(VPInstruction::BranchOnCond, {DefI}); diff --git a/llvm/unittests/tools/llvm-exegesis/X86/SnippetGeneratorTest.cpp b/llvm/unittests/tools/llvm-exegesis/X86/SnippetGeneratorTest.cpp index bf110c8ad76bb..60c726212062d 100644 --- a/llvm/unittests/tools/llvm-exegesis/X86/SnippetGeneratorTest.cpp +++ b/llvm/unittests/tools/llvm-exegesis/X86/SnippetGeneratorTest.cpp @@ -183,7 +183,7 @@ TEST_F(X86SerialSnippetGeneratorTest, ASSERT_THAT(IT.getVariableValues(), SizeIs(3)); for 
(const auto &Var : IT.getVariableValues()) { if (Var.isReg()) { - EXPECT_FALSE(ForbiddenRegisters[Var.getReg()]); + EXPECT_FALSE(ForbiddenRegisters[Var.getReg().id()]); } } } @@ -288,8 +288,8 @@ TEST_F(X86ParallelSnippetGeneratorTest, ReadAfterWrite_CMOV32rr) { EXPECT_THAT(CT.Info, HasSubstr("avoiding Read-After-Write issue")); EXPECT_THAT(CT.Execution, ExecutionMode::UNKNOWN); ASSERT_GT(CT.Instructions.size(), 1U); - std::unordered_set AllDefRegisters; - std::unordered_set AllUseRegisters; + std::set AllDefRegisters; + std::set AllUseRegisters; for (const auto &IT : CT.Instructions) { ASSERT_THAT(IT.getVariableValues(), SizeIs(3)); AllDefRegisters.insert(IT.getVariableValues()[0].getReg()); @@ -328,8 +328,8 @@ TEST_F(X86ParallelSnippetGeneratorTest, ReadAfterWrite_VFMADD132PDr) { EXPECT_THAT(CT.Info, HasSubstr("avoiding Read-After-Write issue")); EXPECT_THAT(CT.Execution, ExecutionMode::UNKNOWN); ASSERT_GT(CT.Instructions.size(), 1U); - std::unordered_set AllDefRegisters; - std::unordered_set AllUseRegisters; + std::set AllDefRegisters; + std::set AllUseRegisters; for (const auto &IT : CT.Instructions) { ASSERT_THAT(IT.getVariableValues(), SizeIs(3)); AllDefRegisters.insert(IT.getVariableValues()[0].getReg()); @@ -412,9 +412,9 @@ TEST_F(X86ParallelSnippetGeneratorTest, MemoryUse) { EXPECT_THAT(IT.getOpcode(), Opcode); ASSERT_THAT(IT.getVariableValues(), SizeIs(6)); EXPECT_EQ(IT.getVariableValues()[2].getImm(), 1); - EXPECT_EQ(IT.getVariableValues()[3].getReg(), 0u); + EXPECT_FALSE(IT.getVariableValues()[3].getReg().isValid()); EXPECT_EQ(IT.getVariableValues()[4].getImm(), 0); - EXPECT_EQ(IT.getVariableValues()[5].getReg(), 0u); + EXPECT_FALSE(IT.getVariableValues()[5].getReg().isValid()); } } diff --git a/llvm/utils/TableGen/AsmMatcherEmitter.cpp b/llvm/utils/TableGen/AsmMatcherEmitter.cpp index f2f6066538e1a..139b1f9d897fa 100644 --- a/llvm/utils/TableGen/AsmMatcherEmitter.cpp +++ b/llvm/utils/TableGen/AsmMatcherEmitter.cpp @@ -1299,7 +1299,7 @@ void 
AsmMatcherInfo::buildRegisterClasses( if (!ContainingSet.empty()) { RegisterSets.insert(ContainingSet); - RegisterMap.insert(std::pair(CGR.TheDef, ContainingSet)); + RegisterMap.try_emplace(CGR.TheDef, ContainingSet); } } @@ -1320,7 +1320,7 @@ void AsmMatcherInfo::buildRegisterClasses( CI->DiagnosticType = ""; CI->IsOptional = false; CI->DefaultMethod = ""; // unused - RegisterSetClasses.insert(std::pair(RS, CI)); + RegisterSetClasses.try_emplace(RS, CI); ++Index; } @@ -1362,7 +1362,7 @@ void AsmMatcherInfo::buildRegisterClasses( if (!CI->DiagnosticString.empty() && CI->DiagnosticType.empty()) CI->DiagnosticType = RC.getName(); - RegisterClassClasses.insert(std::pair(Def, CI)); + RegisterClassClasses.try_emplace(Def, CI); } // Populate the map for individual registers. @@ -2823,7 +2823,7 @@ emitMnemonicAliasVariant(raw_ostream &OS, const AsmMatcherInfo &Info, MatchCode += "return;"; - Cases.push_back(std::pair(AliasEntry.first, MatchCode)); + Cases.emplace_back(AliasEntry.first, MatchCode); } StringMatcher("Mnemonic", Cases, OS).Emit(Indent); } diff --git a/llvm/utils/TableGen/AsmWriterEmitter.cpp b/llvm/utils/TableGen/AsmWriterEmitter.cpp index 9880214a37368..e7606b9df4626 100644 --- a/llvm/utils/TableGen/AsmWriterEmitter.cpp +++ b/llvm/utils/TableGen/AsmWriterEmitter.cpp @@ -144,14 +144,14 @@ static void EmitInstructions(std::vector &Insts, raw_ostream &O, O << " switch (MI->getOpcode()) {\n"; O << " default: llvm_unreachable(\"Unexpected opcode.\");\n"; std::vector> OpsToPrint; - OpsToPrint.push_back(std::pair(FirstInst.CGI->Namespace.str() + "::" + - FirstInst.CGI->TheDef->getName().str(), - FirstInst.Operands[i])); + OpsToPrint.emplace_back(FirstInst.CGI->Namespace.str() + + "::" + FirstInst.CGI->TheDef->getName().str(), + FirstInst.Operands[i]); for (const AsmWriterInst &AWI : SimilarInsts) { - OpsToPrint.push_back(std::pair( - AWI.CGI->Namespace.str() + "::" + AWI.CGI->TheDef->getName().str(), - AWI.Operands[i])); + 
OpsToPrint.emplace_back(AWI.CGI->Namespace.str() + + "::" + AWI.CGI->TheDef->getName().str(), + AWI.Operands[i]); } std::reverse(OpsToPrint.begin(), OpsToPrint.end()); while (!OpsToPrint.empty()) @@ -722,7 +722,7 @@ class IAPrinter { void addOperand(StringRef Op, int OpIdx, int PrintMethodIdx = -1) { assert(OpIdx >= 0 && OpIdx < 0xFE && "Idx out of range"); assert(PrintMethodIdx >= -1 && PrintMethodIdx < 0xFF && "Idx out of range"); - OpMap[Op] = std::pair(OpIdx, PrintMethodIdx); + OpMap[Op] = {OpIdx, PrintMethodIdx}; } unsigned getNumMIOps() { return NumMIOps; } @@ -753,7 +753,7 @@ class IAPrinter { Next = I; } - return std::pair(StringRef(Start, I - Start), Next); + return {StringRef(Start, I - Start), Next}; } std::string formatAliasString(uint32_t &UnescapedSize) { @@ -854,8 +854,8 @@ void AsmWriterEmitter::EmitPrintAliasInstruction(raw_ostream &O) { continue; // Aliases with priority 0 are never emitted. const DagInit *DI = R->getValueAsDag("ResultInst"); - AliasMap[getQualifiedName(DI->getOperatorAsDef(R->getLoc()))].insert( - std::pair(CodeGenInstAlias(R, Target), Priority)); + AliasMap[getQualifiedName(DI->getOperatorAsDef(R->getLoc()))].emplace( + CodeGenInstAlias(R, Target), Priority); } // A map of which conditions need to be met for each instruction operand diff --git a/llvm/utils/TableGen/Basic/DirectiveEmitter.cpp b/llvm/utils/TableGen/Basic/DirectiveEmitter.cpp index fd815f4a31dad..ab68e028f1e96 100644 --- a/llvm/utils/TableGen/Basic/DirectiveEmitter.cpp +++ b/llvm/utils/TableGen/Basic/DirectiveEmitter.cpp @@ -495,7 +495,7 @@ static void emitLeafTable(const DirectiveLanguage &DirLang, raw_ostream &OS, DenseMap DirId; // Record * -> llvm::omp::Directive for (auto [Idx, Rec] : enumerate(Directives)) - DirId.insert(std::make_pair(Rec, Idx)); + DirId.try_emplace(Rec, Idx); using LeafList = std::vector; int MaxLeafCount = getMaxLeafCount(DirLang); @@ -675,7 +675,7 @@ static void generateGetDirectiveAssociation(const DirectiveLanguage &DirLang, 
D.getAssociation()->getName() + "'"); } if (AS != Association::FromLeaves) { - AsMap.insert(std::make_pair(R, AS)); + AsMap.try_emplace(R, AS); return AS; } // Compute the association from leaf constructs. @@ -701,7 +701,7 @@ static void generateGetDirectiveAssociation(const DirectiveLanguage &DirLang, assert(Result != Association::Invalid); assert(Result != Association::FromLeaves); - AsMap.insert(std::make_pair(R, Result)); + AsMap.try_emplace(R, Result); return Result; }; diff --git a/llvm/utils/TableGen/Basic/RISCVTargetDefEmitter.cpp b/llvm/utils/TableGen/Basic/RISCVTargetDefEmitter.cpp index 723f1d72b5159..8e4acf96c3da9 100644 --- a/llvm/utils/TableGen/Basic/RISCVTargetDefEmitter.cpp +++ b/llvm/utils/TableGen/Basic/RISCVTargetDefEmitter.cpp @@ -241,8 +241,7 @@ static void emitRISCVExtensionBitmask(const RecordKeeper &RK, raw_ostream &OS) { ExtName.consume_front("experimental-"); #ifndef NDEBUG - assert(Seen.insert(std::make_pair(GroupIDVal, BitPosVal)).second && - "duplicated bitmask"); + assert(Seen.insert({GroupIDVal, BitPosVal}).second && "duplicated bitmask"); #endif OS.indent(4) << "{" diff --git a/llvm/utils/TableGen/Basic/SequenceToOffsetTable.h b/llvm/utils/TableGen/Basic/SequenceToOffsetTable.h index c918365b2289b..35a9abdc37c82 100644 --- a/llvm/utils/TableGen/Basic/SequenceToOffsetTable.h +++ b/llvm/utils/TableGen/Basic/SequenceToOffsetTable.h @@ -93,7 +93,7 @@ class SequenceToOffsetTable { if (I != Seqs.end() && isSuffix(Seq, I->first)) return; - I = Seqs.insert(I, std::pair(Seq, 0u)); + I = Seqs.insert(I, {Seq, 0u}); // The entry before I may be a suffix of Seq that can now be erased. 
if (I != Seqs.begin() && isSuffix((--I)->first, Seq)) diff --git a/llvm/utils/TableGen/CodeEmitterGen.cpp b/llvm/utils/TableGen/CodeEmitterGen.cpp index 407ee81b7e0b6..475699ae3e78e 100644 --- a/llvm/utils/TableGen/CodeEmitterGen.cpp +++ b/llvm/utils/TableGen/CodeEmitterGen.cpp @@ -338,11 +338,11 @@ CodeEmitterGen::getInstructionCases(const Record *R, Append(" }\n"); } Append(" }\n"); - return std::pair(std::move(Case), std::move(BitOffsetCase)); + return {std::move(Case), std::move(BitOffsetCase)}; } } addInstructionCasesForEncoding(R, R, Target, Case, BitOffsetCase); - return std::pair(std::move(Case), std::move(BitOffsetCase)); + return {std::move(Case), std::move(BitOffsetCase)}; } void CodeEmitterGen::addInstructionCasesForEncoding( diff --git a/llvm/utils/TableGen/Common/CodeGenDAGPatterns.cpp b/llvm/utils/TableGen/Common/CodeGenDAGPatterns.cpp index 1a61d32b4869a..013135a9def1f 100644 --- a/llvm/utils/TableGen/Common/CodeGenDAGPatterns.cpp +++ b/llvm/utils/TableGen/Common/CodeGenDAGPatterns.cpp @@ -3006,7 +3006,7 @@ TreePatternNodePtr TreePattern::ParseTreePattern(const Init *TheInit, // Check that the ComplexPattern uses are consistent: "(MY_PAT $a, $b)" // and "(MY_PAT $b, $a)" should not be allowed in the same pattern; // neither should "(MY_PAT_1 $a, $b)" and "(MY_PAT_2 $a, $b)". - auto OperandId = std::make_pair(Operator, i); + auto OperandId = std::pair(Operator, i); auto [PrevOp, Inserted] = ComplexPatternOperands.try_emplace(Child->getName(), OperandId); if (!Inserted && PrevOp->getValue() != OperandId) { @@ -3218,7 +3218,7 @@ void CodeGenDAGPatterns::ParseNodeInfo() { const CodeGenHwModes &CGH = getTargetInfo().getHwModes(); for (const Record *R : reverse(Records.getAllDerivedDefinitions("SDNode"))) - SDNodes.insert(std::pair(R, SDNodeInfo(R, CGH))); + SDNodes.try_emplace(R, SDNodeInfo(R, CGH)); // Get the builtin intrinsic nodes. 
intrinsic_void_sdnode = getSDNodeNamed("intrinsic_void"); @@ -3348,8 +3348,7 @@ void CodeGenDAGPatterns::ParseDefaultOperands() { // SomeSDnode so that we can parse this. std::vector> Ops; for (unsigned op = 0, e = DefaultInfo->getNumArgs(); op != e; ++op) - Ops.push_back( - std::pair(DefaultInfo->getArg(op), DefaultInfo->getArgName(op))); + Ops.emplace_back(DefaultInfo->getArg(op), DefaultInfo->getArgName(op)); const DagInit *DI = DagInit::get(SomeSDNode, nullptr, Ops); // Create a TreePattern to parse this. diff --git a/llvm/utils/TableGen/Common/CodeGenHwModes.cpp b/llvm/utils/TableGen/Common/CodeGenHwModes.cpp index f5b5d3feed7c3..c744691ae9e08 100644 --- a/llvm/utils/TableGen/Common/CodeGenHwModes.cpp +++ b/llvm/utils/TableGen/Common/CodeGenHwModes.cpp @@ -51,7 +51,7 @@ HwModeSelect::HwModeSelect(const Record *R, CodeGenHwModes &CGH) { } for (auto [Mode, Object] : zip_equal(Modes, Objects)) { unsigned ModeId = CGH.getHwModeId(Mode); - Items.push_back(std::pair(ModeId, Object)); + Items.emplace_back(ModeId, Object); } } @@ -70,13 +70,13 @@ CodeGenHwModes::CodeGenHwModes(const RecordKeeper &RK) : Records(RK) { if (R->getName() == DefaultModeName) continue; Modes.emplace_back(R); - ModeIds.insert(std::pair(R, Modes.size())); + ModeIds.try_emplace(R, Modes.size()); } assert(Modes.size() <= 32 && "number of HwModes exceeds maximum of 32"); for (const Record *R : Records.getAllDerivedDefinitions("HwModeSelect")) { - auto P = ModeSelects.emplace(std::pair(R, HwModeSelect(R, *this))); + auto P = ModeSelects.emplace(R, HwModeSelect(R, *this)); assert(P.second); (void)P; } diff --git a/llvm/utils/TableGen/Common/CodeGenInstAlias.cpp b/llvm/utils/TableGen/Common/CodeGenInstAlias.cpp index 30694ac2bb213..5537a2fa8b980 100644 --- a/llvm/utils/TableGen/Common/CodeGenInstAlias.cpp +++ b/llvm/utils/TableGen/Common/CodeGenInstAlias.cpp @@ -229,7 +229,7 @@ CodeGenInstAlias::CodeGenInstAlias(const Record *R, const CodeGenTarget &T) InstOpRec->getValueAsDef("ParserMatchClass") 
->getValueAsString("Name") != "Imm")) { ResultOperands.push_back(std::move(ResOp)); - ResultInstOperandIndex.push_back(std::pair(i, -1)); + ResultInstOperandIndex.emplace_back(i, -1); ++AliasOpNo; // Otherwise, we need to match each of the suboperands individually. @@ -244,7 +244,7 @@ CodeGenInstAlias::CodeGenInstAlias(const Record *R, const CodeGenTarget &T) Result->getArgName(AliasOpNo)->getAsUnquotedString() + "." + MIOI->getArgName(SubOp)->getAsUnquotedString(), SubRec); - ResultInstOperandIndex.push_back(std::pair(i, SubOp)); + ResultInstOperandIndex.emplace_back(i, SubOp); } ++AliasOpNo; } @@ -262,7 +262,7 @@ CodeGenInstAlias::CodeGenInstAlias(const Record *R, const CodeGenTarget &T) if (tryAliasOpMatch(Result, AliasOpNo, SubRec, false, R->getLoc(), T, ResOp)) { ResultOperands.push_back(ResOp); - ResultInstOperandIndex.push_back(std::pair(i, SubOp)); + ResultInstOperandIndex.emplace_back(i, SubOp); ++AliasOpNo; } else { PrintFatalError( diff --git a/llvm/utils/TableGen/Common/CodeGenInstruction.cpp b/llvm/utils/TableGen/Common/CodeGenInstruction.cpp index 344c4c15e2ebd..ecef9caa9c3d8 100644 --- a/llvm/utils/TableGen/Common/CodeGenInstruction.cpp +++ b/llvm/utils/TableGen/Common/CodeGenInstruction.cpp @@ -175,7 +175,7 @@ CGIOperandList::CGIOperandList(const Record *R) : TheDef(R) { } OpInfo.SubOpNames[j] = SubArgName; - SubOpAliases[SubArgName] = std::pair(i, j); + SubOpAliases[SubArgName] = {i, j}; } } else if (!EncoderMethod.empty()) { // If we have no explicit sub-op dag, but have an top-level encoder @@ -276,7 +276,7 @@ CGIOperandList::ParseOperandName(StringRef Op, bool AllowWholeOp) { Op + "'"); // Otherwise, return the operand. - return std::pair(OpIdx, 0U); + return {OpIdx, 0U}; } // Find the suboperand number involved. @@ -289,13 +289,13 @@ CGIOperandList::ParseOperandName(StringRef Op, bool AllowWholeOp) { // Find the operand with the right name. 
for (unsigned i = 0, e = MIOpInfo->getNumArgs(); i != e; ++i) if (MIOpInfo->getArgNameStr(i) == SubOpName) - return std::pair(OpIdx, i); + return {OpIdx, i}; // Otherwise, didn't find it! PrintFatalError(TheDef->getLoc(), TheDef->getName() + ": unknown suboperand name in '" + Op + "'"); - return std::pair(0U, 0U); + return {0U, 0U}; } static void ParseConstraint(StringRef CStr, CGIOperandList &Ops, diff --git a/llvm/utils/TableGen/Common/CodeGenInstruction.h b/llvm/utils/TableGen/Common/CodeGenInstruction.h index a799d023b1af4..44c0ab70dc615 100644 --- a/llvm/utils/TableGen/Common/CodeGenInstruction.h +++ b/llvm/utils/TableGen/Common/CodeGenInstruction.h @@ -204,7 +204,7 @@ class CGIOperandList { for (unsigned i = 0;; ++i) { assert(i < OperandList.size() && "Invalid flat operand #"); if (OperandList[i].MIOperandNo + OperandList[i].MINumOperands > Op) - return std::pair(i, Op - OperandList[i].MIOperandNo); + return {i, Op - OperandList[i].MIOperandNo}; } } diff --git a/llvm/utils/TableGen/Common/CodeGenRegisters.cpp b/llvm/utils/TableGen/Common/CodeGenRegisters.cpp index 2dbee94d7e540..973c86c6e5a55 100644 --- a/llvm/utils/TableGen/Common/CodeGenRegisters.cpp +++ b/llvm/utils/TableGen/Common/CodeGenRegisters.cpp @@ -287,13 +287,13 @@ CodeGenRegister::computeSubRegs(CodeGenRegBank &RegBank) { CodeGenSubRegIndex *Idx = ExplicitSubRegIndices[i]; if (!SR->Artificial) Idx->Artificial = false; - if (!SubRegs.insert(std::pair(Idx, SR)).second) + if (!SubRegs.try_emplace(Idx, SR).second) PrintFatalError(TheDef->getLoc(), "SubRegIndex " + Idx->getName() + " appears twice in Register " + getName()); // Map explicit sub-registers first, so the names take precedence. // The inherited sub-registers are mapped below. - SubReg2Idx.insert(std::pair(SR, Idx)); + SubReg2Idx.try_emplace(SR, Idx); } // Keep track of inherited subregs and how they can be reached. 
@@ -333,7 +333,7 @@ CodeGenRegister::computeSubRegs(CodeGenRegBank &RegBank) { if (SubRegs.count(Comp.second) || !Orphans.erase(SRI->second)) continue; // We found a new name for the orphaned sub-register. - SubRegs.insert(std::pair(Comp.second, SRI->second)); + SubRegs.try_emplace(Comp.second, SRI->second); Indices.push_back(Comp.second); } } @@ -380,7 +380,7 @@ CodeGenRegister::computeSubRegs(CodeGenRegBank &RegBank) { // Ensure that every sub-register has a unique name. DenseMap::iterator Ins = - SubReg2Idx.insert(std::pair(SubReg.second, SubReg.first)).first; + SubReg2Idx.try_emplace(SubReg.second, SubReg.first).first; if (Ins->second == SubReg.first) continue; // Trouble: Two different names for SubReg.second. @@ -532,8 +532,8 @@ void CodeGenRegister::computeSecondarySubRegs(CodeGenRegBank &RegBank) { // a sub-register with a concatenated sub-register index. CodeGenSubRegIndex *Concat = RegBank.getConcatSubRegIndex(Parts, RegBank.getHwModes()); - std::pair NewSubReg = - std::pair(Concat, Cand); + std::pair NewSubReg = {Concat, + Cand}; if (!SubRegs.insert(NewSubReg).second) continue; @@ -541,7 +541,7 @@ void CodeGenRegister::computeSecondarySubRegs(CodeGenRegBank &RegBank) { // We inserted a new subregister. NewSubRegs.push_back(NewSubReg); SubRegQueue.push(NewSubReg); - SubReg2Idx.insert(std::pair(Cand, Concat)); + SubReg2Idx.try_emplace(Cand, Concat); } } @@ -1098,7 +1098,7 @@ CodeGenRegisterClass::getMatchingSubClassWithSubRegs( BitVector SuperRegClassesBV(RegClasses.size()); RC.getSuperRegClasses(SubIdx, SuperRegClassesBV); if (SuperRegClassesBV.any()) - SuperRegClasses.push_back(std::pair(&RC, SuperRegClassesBV)); + SuperRegClasses.emplace_back(&RC, SuperRegClassesBV); } llvm::stable_sort(SuperRegClasses, [&](const std::pair &A, @@ -1247,8 +1247,7 @@ CodeGenRegBank::CodeGenRegBank(const RecordKeeper &Records, // causes some failures in MIPS - perhaps they have duplicate register name // entries? 
(or maybe there's a reason for it - I don't know much about this // code, just drive-by refactoring) - RegistersByName.insert( - std::pair(Reg.TheDef->getValueAsString("AsmName"), &Reg)); + RegistersByName.try_emplace(Reg.TheDef->getValueAsString("AsmName"), &Reg); // Precompute all sub-register maps. // This will create Composite entries for all inferred sub-register indices. @@ -1260,10 +1259,10 @@ CodeGenRegBank::CodeGenRegBank(const RecordKeeper &Records, for (CodeGenSubRegIndex &SRI : SubRegIndices) { SRI.computeConcatTransitiveClosure(); if (!SRI.ConcatenationOf.empty()) - ConcatIdx.insert( - std::pair(SmallVector( - SRI.ConcatenationOf.begin(), SRI.ConcatenationOf.end()), - &SRI)); + ConcatIdx.try_emplace( + SmallVector(SRI.ConcatenationOf.begin(), + SRI.ConcatenationOf.end()), + &SRI); } // Infer even more sub-registers by combining leading super-registers. @@ -1353,12 +1352,12 @@ CodeGenRegister *CodeGenRegBank::getReg(const Record *Def) { void CodeGenRegBank::addToMaps(CodeGenRegisterClass *RC) { if (const Record *Def = RC->getDef()) - Def2RC.insert(std::pair(Def, RC)); + Def2RC.try_emplace(Def, RC); // Duplicate classes are rejected by insert(). // That's OK, we only care about the properties handled by CGRC::Key. CodeGenRegisterClass::Key K(*RC); - Key2RC.insert(std::pair(K, RC)); + Key2RC.try_emplace(K, RC); } // Create a synthetic sub-class if it is missing. @@ -1509,7 +1508,7 @@ void CodeGenRegBank::computeComposites() { SmallSet UserDefined; for (const CodeGenSubRegIndex &Idx : SubRegIndices) for (auto P : Idx.getComposites()) - UserDefined.insert(std::pair(&Idx, P.first)); + UserDefined.insert({&Idx, P.first}); // Keep track of TopoSigs visited. We only need to visit each TopoSig once, // and many registers will share TopoSigs on regular architectures. 
diff --git a/llvm/utils/TableGen/Common/CodeGenRegisters.h b/llvm/utils/TableGen/Common/CodeGenRegisters.h index 2fa6cab2afb89..5e2d1977545c1 100644 --- a/llvm/utils/TableGen/Common/CodeGenRegisters.h +++ b/llvm/utils/TableGen/Common/CodeGenRegisters.h @@ -110,7 +110,7 @@ class CodeGenSubRegIndex { CodeGenSubRegIndex *addComposite(CodeGenSubRegIndex *A, CodeGenSubRegIndex *B, const CodeGenHwModes &CGH) { assert(A && B); - std::pair Ins = Composed.insert(std::pair(A, B)); + std::pair Ins = Composed.try_emplace(A, B); // Synthetic subreg indices that aren't contiguous (for instance ARM // register tuples) don't have a bit range, so it's OK to let @@ -729,7 +729,7 @@ class CodeGenRegBank { // This function is only for use by CodeGenRegister::computeSuperRegs(). // Others should simply use Reg->getTopoSig(). unsigned getTopoSig(const TopoSigId &Id) { - return TopoSigs.insert(std::pair(Id, TopoSigs.size())).first->second; + return TopoSigs.try_emplace(Id, TopoSigs.size()).first->second; } // Create a native register unit that is associated with one or two root diff --git a/llvm/utils/TableGen/Common/CodeGenSchedule.cpp b/llvm/utils/TableGen/Common/CodeGenSchedule.cpp index 1fe322c88bb0f..a5ca060533bce 100644 --- a/llvm/utils/TableGen/Common/CodeGenSchedule.cpp +++ b/llvm/utils/TableGen/Common/CodeGenSchedule.cpp @@ -334,7 +334,7 @@ static void processSTIPredicate(STIPredicateFunction &Fn, APInt DefaultProcMask(ProcModelMap.size(), 0); APInt DefaultPredMask(NumUniquePredicates, 0); for (std::pair &MaskPair : OpcodeMasks) - MaskPair = std::pair(DefaultProcMask, DefaultPredMask); + MaskPair = {DefaultProcMask, DefaultPredMask}; // Construct a OpcodeInfo object for every unique opcode declared by an // InstructionEquivalenceClass definition. 
@@ -384,7 +384,7 @@ static void processSTIPredicate(STIPredicateFunction &Fn, auto PopulationCountAndLeftBit = [](const APInt &Other) -> std::pair { - return std::pair(Other.popcount(), -Other.countl_zero()); + return {Other.popcount(), -Other.countl_zero()}; }; auto lhsmask_first = PopulationCountAndLeftBit(LhsMasks.first); auto rhsmask_first = PopulationCountAndLeftBit(RhsMasks.first); @@ -545,7 +545,7 @@ void CodeGenSchedModels::collectProcModels() { /// ProcessorItineraries. void CodeGenSchedModels::addProcModel(const Record *ProcDef) { const Record *ModelKey = getModelOrItinDef(ProcDef); - if (!ProcModelMap.insert(std::pair(ModelKey, ProcModels.size())).second) + if (!ProcModelMap.try_emplace(ModelKey, ProcModels.size()).second) return; std::string Name = std::string(ModelKey->getName()); diff --git a/llvm/utils/TableGen/Common/GlobalISel/GlobalISelMatchTable.cpp b/llvm/utils/TableGen/Common/GlobalISel/GlobalISelMatchTable.cpp index a81f2b53f2846..f0cd98dd2dee0 100644 --- a/llvm/utils/TableGen/Common/GlobalISel/GlobalISelMatchTable.cpp +++ b/llvm/utils/TableGen/Common/GlobalISel/GlobalISelMatchTable.cpp @@ -840,8 +840,7 @@ Error RuleMatcher::defineComplexSubOperand(StringRef SymbolicName, return Error::success(); } - ComplexSubOperands[SymbolicName] = - std::tuple(ComplexPattern, RendererID, SubOperandID); + ComplexSubOperands[SymbolicName] = {ComplexPattern, RendererID, SubOperandID}; ComplexSubOperandsParentName[SymbolicName] = std::move(ParentName); return Error::success(); @@ -875,10 +874,8 @@ unsigned RuleMatcher::getInsnVarID(InstructionMatcher &InsnMatcher) const { } void RuleMatcher::defineOperand(StringRef SymbolicName, OperandMatcher &OM) { - if (!DefinedOperands.contains(SymbolicName)) { - DefinedOperands[SymbolicName] = &OM; + if (DefinedOperands.try_emplace(SymbolicName, &OM).second) return; - } // If the operand is already defined, then we must ensure both references in // the matcher have the exact same node. 
@@ -889,8 +886,7 @@ void RuleMatcher::defineOperand(StringRef SymbolicName, OperandMatcher &OM) { } void RuleMatcher::definePhysRegOperand(const Record *Reg, OperandMatcher &OM) { - if (!PhysRegOperands.contains(Reg)) - PhysRegOperands[Reg] = &OM; + PhysRegOperands.try_emplace(Reg, &OM); } InstructionMatcher & diff --git a/llvm/utils/TableGen/Common/GlobalISel/GlobalISelMatchTable.h b/llvm/utils/TableGen/Common/GlobalISel/GlobalISelMatchTable.h index 8e6de80d6083c..e7914a613973b 100644 --- a/llvm/utils/TableGen/Common/GlobalISel/GlobalISelMatchTable.h +++ b/llvm/utils/TableGen/Common/GlobalISel/GlobalISelMatchTable.h @@ -233,7 +233,7 @@ class MatchTable { unsigned allocateLabelID() { return CurrentLabelID++; } void defineLabel(unsigned LabelID) { - LabelMap.insert(std::pair(LabelID, CurrentSize)); + LabelMap.try_emplace(LabelID, CurrentSize); } unsigned getLabelIndex(unsigned LabelID) const { diff --git a/llvm/utils/TableGen/Common/InfoByHwMode.cpp b/llvm/utils/TableGen/Common/InfoByHwMode.cpp index e5e8225518b58..c2368cb31dbbf 100644 --- a/llvm/utils/TableGen/Common/InfoByHwMode.cpp +++ b/llvm/utils/TableGen/Common/InfoByHwMode.cpp @@ -71,9 +71,9 @@ MVT &ValueTypeByHwMode::getOrCreateTypeForMode(unsigned Mode, MVT Type) { // make a copy of it for Mode and return it. auto D = Map.begin(); if (D != Map.end() && D->first == DefaultMode) - return Map.insert(std::pair(Mode, D->second)).first->second; + return Map.try_emplace(Mode, D->second).first->second; // If default mode is not present either, use provided Type. 
- return Map.insert(std::pair(Mode, Type)).first->second; + return Map.try_emplace(Mode, Type).first->second; } StringRef ValueTypeByHwMode::getMVTName(MVT T) { diff --git a/llvm/utils/TableGen/Common/InfoByHwMode.h b/llvm/utils/TableGen/Common/InfoByHwMode.h index 4f11e8ecc7fcb..bff164c6a6aa7 100644 --- a/llvm/utils/TableGen/Common/InfoByHwMode.h +++ b/llvm/utils/TableGen/Common/InfoByHwMode.h @@ -144,7 +144,7 @@ template struct InfoByHwMode { assert(hasMode(Mode) || hasDefault()); InfoT I = get(Mode); Map.clear(); - Map.insert(std::pair(DefaultMode, I)); + Map.try_emplace(DefaultMode, I); } protected: @@ -212,7 +212,7 @@ struct RegSizeInfoByHwMode : public InfoByHwMode { void writeToStream(raw_ostream &OS) const; void insertRegSizeForMode(unsigned Mode, RegSizeInfo Info) { - Map.insert(std::pair(Mode, Info)); + Map.try_emplace(Mode, Info); } }; @@ -233,7 +233,7 @@ struct SubRegRangeByHwMode : public InfoByHwMode { SubRegRangeByHwMode() = default; void insertSubRegRangeForMode(unsigned Mode, SubRegRange Info) { - Map.insert(std::pair(Mode, Info)); + Map.try_emplace(Mode, Info); } }; diff --git a/llvm/utils/TableGen/DAGISelMatcherGen.cpp b/llvm/utils/TableGen/DAGISelMatcherGen.cpp index dd05f4df0d723..e1c25075e384d 100644 --- a/llvm/utils/TableGen/DAGISelMatcherGen.cpp +++ b/llvm/utils/TableGen/DAGISelMatcherGen.cpp @@ -252,7 +252,7 @@ void MatcherGen::EmitLeafMatchCode(const TreePatternNode &N) { if (LeafRec->isSubClassOf("Register")) { AddMatcher(new RecordMatcher("physreg input " + LeafRec->getName().str(), NextRecordedOperandNo)); - PhysRegInputs.push_back(std::pair(LeafRec, NextRecordedOperandNo++)); + PhysRegInputs.emplace_back(LeafRec, NextRecordedOperandNo++); return; } @@ -272,7 +272,7 @@ void MatcherGen::EmitLeafMatchCode(const TreePatternNode &N) { // Remember this ComplexPattern so that we can emit it after all the other // structural matches are done. 
unsigned InputOperand = VariableMap[N.getName()] - 1; - MatchedComplexPatterns.push_back(std::pair(&N, InputOperand)); + MatchedComplexPatterns.emplace_back(&N, InputOperand); return; } diff --git a/llvm/utils/TableGen/DAGISelMatcherOpt.cpp b/llvm/utils/TableGen/DAGISelMatcherOpt.cpp index 590786bb7fced..f747944543cfd 100644 --- a/llvm/utils/TableGen/DAGISelMatcherOpt.cpp +++ b/llvm/utils/TableGen/DAGISelMatcherOpt.cpp @@ -426,7 +426,7 @@ static void FactorNodes(std::unique_ptr &InputMatcherPtr) { CheckOpcodeMatcher *COM = cast(OptionsToMatch[i]); assert(Opcodes.insert(COM->getOpcode().getEnumName()).second && "Duplicate opcodes not factored?"); - Cases.push_back(std::pair(&COM->getOpcode(), COM->takeNext())); + Cases.emplace_back(&COM->getOpcode(), COM->takeNext()); delete COM; } @@ -463,7 +463,7 @@ static void FactorNodes(std::unique_ptr &InputMatcherPtr) { } Entry = Cases.size() + 1; - Cases.push_back(std::pair(CTMTy, MatcherWithoutCTM)); + Cases.emplace_back(CTMTy, MatcherWithoutCTM); } // Make sure we recursively factor any scopes we may have created. 
diff --git a/llvm/utils/TableGen/DFAEmitter.cpp b/llvm/utils/TableGen/DFAEmitter.cpp index c150620b74175..a77397dd7d260 100644 --- a/llvm/utils/TableGen/DFAEmitter.cpp +++ b/llvm/utils/TableGen/DFAEmitter.cpp @@ -349,7 +349,7 @@ void CustomDfaEmitter::printActionType(raw_ostream &OS) { OS << TypeName; } void CustomDfaEmitter::printActionValue(action_type A, raw_ostream &OS) { const ActionTuple &AT = Actions[A]; if (AT.size() > 1) - OS << "std::tuple("; + OS << "{"; ListSeparator LS; for (const auto &SingleAction : AT) { OS << LS; @@ -361,7 +361,7 @@ void CustomDfaEmitter::printActionValue(action_type A, raw_ostream &OS) { OS << std::get(SingleAction); } if (AT.size() > 1) - OS << ")"; + OS << "}"; } static TableGen::Emitter::OptClass diff --git a/llvm/utils/TableGen/DecoderEmitter.cpp b/llvm/utils/TableGen/DecoderEmitter.cpp index 90a6d0ee8acb5..b847031fdc00a 100644 --- a/llvm/utils/TableGen/DecoderEmitter.cpp +++ b/llvm/utils/TableGen/DecoderEmitter.cpp @@ -640,11 +640,11 @@ void Filter::recurse() { // Delegates to an inferior filter chooser for further processing on this // group of instructions whose segment values are variable. - FilterChooserMap.insert(std::pair( + FilterChooserMap.try_emplace( NO_FIXED_SEGMENTS_SENTINEL, std::make_unique(Owner->AllInstructions, VariableInstructions, Owner->Operands, - BitValueArray, *Owner))); + BitValueArray, *Owner)); } // No need to recurse for a singleton filtered instruction. @@ -667,10 +667,10 @@ void Filter::recurse() { // Delegates to an inferior filter chooser for further processing on this // category of instructions. 
- FilterChooserMap.insert( - std::pair(Inst.first, std::make_unique( - Owner->AllInstructions, Inst.second, - Owner->Operands, BitValueArray, *Owner))); + FilterChooserMap.try_emplace(Inst.first, + std::make_unique( + Owner->AllInstructions, Inst.second, + Owner->Operands, BitValueArray, *Owner)); } } @@ -1943,7 +1943,7 @@ static void parseVarLenInstOperand(const Record &Def, int TiedReg = TiedTo[OpSubOpPair.first]; if (TiedReg != -1) { unsigned OpIdx = CGI.Operands.getFlattenedOperandNumber( - std::pair(TiedReg, OpSubOpPair.second)); + {TiedReg, OpSubOpPair.second}); Operands[OpIdx].addField(CurrBitPos, EncodingSegment.BitWidth, Offset); } } @@ -2039,9 +2039,9 @@ populateInstruction(const CodeGenTarget &Target, const Record &EncodingDef, const DagInit *Out = Def.getValueAsDag("OutOperandList"); const DagInit *In = Def.getValueAsDag("InOperandList"); for (const auto &[Idx, Arg] : enumerate(Out->getArgs())) - InOutOperands.push_back(std::pair(Arg, Out->getArgNameStr(Idx))); + InOutOperands.emplace_back(Arg, Out->getArgNameStr(Idx)); for (const auto &[Idx, Arg] : enumerate(In->getArgs())) - InOutOperands.push_back(std::pair(Arg, In->getArgNameStr(Idx))); + InOutOperands.emplace_back(Arg, In->getArgNameStr(Idx)); // Search for tied operands, so that we can correctly instantiate // operands that are not explicitly represented in the encoding. 
@@ -2146,7 +2146,7 @@ populateInstruction(const CodeGenTarget &Target, const Record &EncodingDef, InsnOperands.push_back(std::move(OpInfo)); } } - Operands[Opc] = InsnOperands; + Operands[Opc] = std::move(InsnOperands); #if 0 LLVM_DEBUG({ @@ -2587,7 +2587,7 @@ namespace llvm { if (!NumberedEncoding.HwModeName.empty()) DecoderNamespace += std::string("_") + NumberedEncoding.HwModeName.str(); - OpcMap[std::pair(DecoderNamespace, Size)].emplace_back( + OpcMap[{DecoderNamespace, Size}].emplace_back( NEI, Target.getInstrIntValue(Def)); } else { NumEncodingsOmitted++; diff --git a/llvm/utils/TableGen/FastISelEmitter.cpp b/llvm/utils/TableGen/FastISelEmitter.cpp index f60c63c212d61..6963bb239e9d8 100644 --- a/llvm/utils/TableGen/FastISelEmitter.cpp +++ b/llvm/utils/TableGen/FastISelEmitter.cpp @@ -593,7 +593,7 @@ void FastISelMap::collectPatterns(const CodeGenDAGPatterns &CGP) { int complexity = Pattern.getPatternComplexity(CGP); auto inserted_simple_pattern = SimplePatternsCheck.insert( - std::tuple(Operands, OpcodeName, VT, RetVT, PredicateCheck)); + {Operands, OpcodeName, VT, RetVT, PredicateCheck}); if (!inserted_simple_pattern.second) { PrintFatalError(Pattern.getSrcRecord()->getLoc(), "Duplicate predicate in FastISel table!"); diff --git a/llvm/utils/TableGen/GlobalISelCombinerEmitter.cpp b/llvm/utils/TableGen/GlobalISelCombinerEmitter.cpp index bc300c3461100..770494405810d 100644 --- a/llvm/utils/TableGen/GlobalISelCombinerEmitter.cpp +++ b/llvm/utils/TableGen/GlobalISelCombinerEmitter.cpp @@ -2653,10 +2653,10 @@ GICombinerEmitter::buildMatchTable(MutableArrayRef Rules) { const Matcher *B) { auto *L = static_cast(A); auto *R = static_cast(B); - return std::make_tuple(OpcodeOrder[L->getOpcode()], - L->insnmatchers_front().getNumOperandMatchers()) < - std::make_tuple(OpcodeOrder[R->getOpcode()], - R->insnmatchers_front().getNumOperandMatchers()); + return std::tuple(OpcodeOrder[L->getOpcode()], + L->insnmatchers_front().getNumOperandMatchers()) < + 
std::tuple(OpcodeOrder[R->getOpcode()], + R->insnmatchers_front().getNumOperandMatchers()); }); for (Matcher *Rule : InputRules) diff --git a/llvm/utils/TableGen/GlobalISelEmitter.cpp b/llvm/utils/TableGen/GlobalISelEmitter.cpp index 04ebdbb0ffc90..5466d315c05a4 100644 --- a/llvm/utils/TableGen/GlobalISelEmitter.cpp +++ b/llvm/utils/TableGen/GlobalISelEmitter.cpp @@ -765,6 +765,18 @@ Expected GlobalISelEmitter::createAndImportSelDAGMatcher( InsnMatcher.addPredicate(SrcGIOrNull); } + // Since there are no opcodes for atomic loads and stores comparing to + // SelectionDAG, we add CheckMMOIsNonAtomic predicate immediately after the + // opcode predicate to make a logical combination of them. + if (SrcGIEquivOrNull && + SrcGIEquivOrNull->getValueAsBit("CheckMMOIsNonAtomic")) + InsnMatcher.addPredicate("NotAtomic"); + else if (SrcGIEquivOrNull && + SrcGIEquivOrNull->getValueAsBit("CheckMMOIsAtomic")) { + InsnMatcher.addPredicate( + "Unordered", AtomicOrderingMMOPredicateMatcher::AO_OrStronger); + } + unsigned OpIdx = 0; for (const TypeSetByHwMode &VTy : Src.getExtTypes()) { // Results don't have a name unless they are the root node. 
The caller will @@ -827,15 +839,6 @@ Expected GlobalISelEmitter::createAndImportSelDAGMatcher( } } - if (SrcGIEquivOrNull && - SrcGIEquivOrNull->getValueAsBit("CheckMMOIsNonAtomic")) - InsnMatcher.addPredicate("NotAtomic"); - else if (SrcGIEquivOrNull && - SrcGIEquivOrNull->getValueAsBit("CheckMMOIsAtomic")) { - InsnMatcher.addPredicate( - "Unordered", AtomicOrderingMMOPredicateMatcher::AO_OrStronger); - } - if (Src.isLeaf()) { const Init *SrcInit = Src.getLeafValue(); if (const IntInit *SrcIntInit = dyn_cast(SrcInit)) { diff --git a/llvm/utils/TableGen/InstrInfoEmitter.cpp b/llvm/utils/TableGen/InstrInfoEmitter.cpp index 8c0e27215a736..7811734d5fdac 100644 --- a/llvm/utils/TableGen/InstrInfoEmitter.cpp +++ b/llvm/utils/TableGen/InstrInfoEmitter.cpp @@ -259,8 +259,7 @@ void InstrInfoEmitter::initOperandMapData( StrUintMapIter I = Operands.find(Info.Name); if (I == Operands.end()) { - I = Operands.insert(Operands.begin(), std::pair( - Info.Name, NumOperands++)); + I = Operands.insert(Operands.begin(), {Info.Name, NumOperands++}); } OpList[I->second] = Info.MIOperandNo; } diff --git a/llvm/utils/TableGen/OptionParserEmitter.cpp b/llvm/utils/TableGen/OptionParserEmitter.cpp index eca828cad5f4d..8b92d25239219 100644 --- a/llvm/utils/TableGen/OptionParserEmitter.cpp +++ b/llvm/utils/TableGen/OptionParserEmitter.cpp @@ -232,8 +232,7 @@ static void emitHelpTextsForVariants( assert(Visibilities.size() <= MaxVisibilityPerHelp && "Too many visibilities to store in an " "OptTable::HelpTextsForVariants entry"); - OS << "std::make_pair(std::array{{"; + OS << "{std::array{{"; auto VisibilityEnd = Visibilities.cend(); for (auto Visibility = Visibilities.cbegin(); Visibility != VisibilityEnd; @@ -249,7 +248,7 @@ static void emitHelpTextsForVariants( writeCstring(OS, Help); else OS << "nullptr"; - OS << ")"; + OS << "}"; if (std::next(VisibilityHelp) != VisibilityHelpEnd) OS << ", "; @@ -516,8 +515,8 @@ static void emitOptionParser(const RecordKeeper &Records, raw_ostream &OS) { 
for (const Init *Visibility : Visibilities) VisibilityNames.push_back(Visibility->getAsUnquotedString()); - HelpTextsForVariants.push_back(std::make_pair( - VisibilityNames, VisibilityHelp->getValueAsString("Text"))); + HelpTextsForVariants.emplace_back( + VisibilityNames, VisibilityHelp->getValueAsString("Text")); } emitHelpTextsForVariants(OS, std::move(HelpTextsForVariants)); diff --git a/llvm/utils/TableGen/RegisterInfoEmitter.cpp b/llvm/utils/TableGen/RegisterInfoEmitter.cpp index 0c1f5d205ca0f..8247b2d8f5a40 100644 --- a/llvm/utils/TableGen/RegisterInfoEmitter.cpp +++ b/llvm/utils/TableGen/RegisterInfoEmitter.cpp @@ -1882,9 +1882,8 @@ void RegisterInfoEmitter::debugDump(raw_ostream &OS) { OS << '\n'; OS << "\tCoveredBySubregs: " << R.CoveredBySubRegs << '\n'; OS << "\tHasDisjunctSubRegs: " << R.HasDisjunctSubRegs << '\n'; - for (std::pair P : - R.getSubRegs()) { - OS << "\tSubReg " << P.first->getName() << " = " << P.second->getName() + for (auto &[SubIdx, SubReg] : R.getSubRegs()) { + OS << "\tSubReg " << SubIdx->getName() << " = " << SubReg->getName() << '\n'; } } diff --git a/llvm/utils/TableGen/SearchableTableEmitter.cpp b/llvm/utils/TableGen/SearchableTableEmitter.cpp index 91fde0c663057..38b6f2b395137 100644 --- a/llvm/utils/TableGen/SearchableTableEmitter.cpp +++ b/llvm/utils/TableGen/SearchableTableEmitter.cpp @@ -642,7 +642,7 @@ void SearchableTableEmitter::collectEnumEntries( Value = getInt(EntryRec, ValueField); Enum.Entries.push_back(std::make_unique(Name, Value)); - Enum.EntryMap.insert(std::pair(EntryRec, Enum.Entries.back().get())); + Enum.EntryMap.try_emplace(EntryRec, Enum.Entries.back().get()); } if (ValueField.empty()) { @@ -745,7 +745,7 @@ void SearchableTableEmitter::run(raw_ostream &OS) { collectEnumEntries(*Enum, NameField, ValueField, Records.getAllDerivedDefinitions(FilterClass)); - EnumMap.insert(std::pair(EnumRec, Enum.get())); + EnumMap.try_emplace(EnumRec, Enum.get()); Enums.emplace_back(std::move(Enum)); } @@ -814,7 +814,7 @@ 
void SearchableTableEmitter::run(raw_ostream &OS) { }); } - TableMap.insert(std::pair(TableRec, Table.get())); + TableMap.try_emplace(TableRec, Table.get()); Tables.emplace_back(std::move(Table)); } diff --git a/llvm/utils/TableGen/WebAssemblyDisassemblerEmitter.cpp b/llvm/utils/TableGen/WebAssemblyDisassemblerEmitter.cpp index 7373494e8b12f..5aa573ac857dc 100644 --- a/llvm/utils/TableGen/WebAssemblyDisassemblerEmitter.cpp +++ b/llvm/utils/TableGen/WebAssemblyDisassemblerEmitter.cpp @@ -75,7 +75,7 @@ void llvm::emitWebAssemblyDisassemblerTables( } } // Set this instruction as the one to use. - CGIP = std::pair(I, &CGI); + CGIP = {I, &CGI}; } OS << "#include \"MCTargetDesc/WebAssemblyMCTargetDesc.h\"\n"; OS << "\n"; diff --git a/llvm/utils/TableGen/X86DisassemblerTables.cpp b/llvm/utils/TableGen/X86DisassemblerTables.cpp index 294923b250eea..5e7983a101e0b 100644 --- a/llvm/utils/TableGen/X86DisassemblerTables.cpp +++ b/llvm/utils/TableGen/X86DisassemblerTables.cpp @@ -874,7 +874,7 @@ void DisassemblerTables::emitInstructionInfo(raw_ostream &o, for (auto Operand : InstructionSpecifiers[Index].operands) { OperandEncoding Encoding = (OperandEncoding)Operand.encoding; OperandType Type = (OperandType)Operand.type; - OperandList.push_back(std::pair(Encoding, Type)); + OperandList.emplace_back(Encoding, Type); } unsigned &N = OperandSets[OperandList]; if (N != 0) @@ -906,7 +906,7 @@ void DisassemblerTables::emitInstructionInfo(raw_ostream &o, for (auto Operand : InstructionSpecifiers[index].operands) { OperandEncoding Encoding = (OperandEncoding)Operand.encoding; OperandType Type = (OperandType)Operand.type; - OperandList.push_back(std::pair(Encoding, Type)); + OperandList.emplace_back(Encoding, Type); } o.indent(i * 2) << (OperandSets[OperandList] - 1) << ",\n"; diff --git a/llvm/utils/TableGen/X86InstrMappingEmitter.cpp b/llvm/utils/TableGen/X86InstrMappingEmitter.cpp index 10fab469a0803..1ee79aa27fa98 100644 --- a/llvm/utils/TableGen/X86InstrMappingEmitter.cpp +++ 
b/llvm/utils/TableGen/X86InstrMappingEmitter.cpp @@ -236,7 +236,7 @@ void X86InstrMappingEmitter::emitCompressEVEXTable( if (!NewInst) continue; - Table.push_back(std::pair(Inst, NewInst)); + Table.emplace_back(Inst, NewInst); auto Predicates = NewInst->TheDef->getValueAsListOfDefs("Predicates"); auto It = llvm::find_if(Predicates, [](const Record *R) { StringRef Name = R->getName(); @@ -293,7 +293,7 @@ void X86InstrMappingEmitter::emitNFTransformTable( report_fatal_error("EFLAGS should be clobbered by " + NewRec->getName()); #endif - Table.push_back(std::pair(&Target.getInstruction(NewRec), Inst)); + Table.emplace_back(&Target.getInstruction(NewRec), Inst); } } printTable(Table, "X86NFTransformTable", "GET_X86_NF_TRANSFORM_TABLE", OS); @@ -321,7 +321,7 @@ void X86InstrMappingEmitter::emitND2NonNDTable( const auto *NewRec = Records.getDef(ManualMap.at(Rec->getName())); assert(NewRec && "Instruction not found!"); auto &NewInst = Target.getInstruction(NewRec); - Table.push_back(std::pair(Inst, &NewInst)); + Table.emplace_back(Inst, &NewInst); continue; } @@ -332,7 +332,7 @@ void X86InstrMappingEmitter::emitND2NonNDTable( continue; const auto &NewInst = Target.getInstruction(NewRec); if (isRegisterOperand(NewInst.Operands[0].Rec)) - Table.push_back(std::pair(Inst, &NewInst)); + Table.emplace_back(Inst, &NewInst); } printTable(Table, "X86ND2NonNDTable", "GET_X86_ND2NONND_TABLE", OS); } @@ -355,7 +355,7 @@ void X86InstrMappingEmitter::emitSSE2AVXTable( const auto *NewRec = Records.getDef(ManualMap.at(Rec->getName())); assert(NewRec && "Instruction not found!"); const auto &NewInst = Target.getInstruction(NewRec); - Table.push_back(std::pair(Inst, &NewInst)); + Table.emplace_back(Inst, &NewInst); continue; } @@ -364,7 +364,7 @@ void X86InstrMappingEmitter::emitSSE2AVXTable( if (!AVXRec) continue; auto &AVXInst = Target.getInstruction(AVXRec); - Table.push_back(std::pair(Inst, &AVXInst)); + Table.emplace_back(Inst, &AVXInst); } printTable(Table, "X86SSE2AVXTable", 
"GET_X86_SSE2AVX_TABLE", OS); } diff --git a/llvm/utils/gn/secondary/clang-tools-extra/clangd/BUILD.gn b/llvm/utils/gn/secondary/clang-tools-extra/clangd/BUILD.gn index c6b45efef2990..b609d4a7462fb 100644 --- a/llvm/utils/gn/secondary/clang-tools-extra/clangd/BUILD.gn +++ b/llvm/utils/gn/secondary/clang-tools-extra/clangd/BUILD.gn @@ -105,7 +105,6 @@ static_library("clangd") { "GlobalCompilationDatabase.cpp", "HeaderSourceSwitch.cpp", "Headers.cpp", - "HeuristicResolver.cpp", "Hover.cpp", "IncludeCleaner.cpp", "IncludeFixer.cpp", diff --git a/llvm/utils/gn/secondary/clang-tools-extra/clangd/unittests/BUILD.gn b/llvm/utils/gn/secondary/clang-tools-extra/clangd/unittests/BUILD.gn index c79d5ad662b7f..7deefe9dc0613 100644 --- a/llvm/utils/gn/secondary/clang-tools-extra/clangd/unittests/BUILD.gn +++ b/llvm/utils/gn/secondary/clang-tools-extra/clangd/unittests/BUILD.gn @@ -80,7 +80,6 @@ unittest("ClangdTests") { "GlobalCompilationDatabaseTests.cpp", "HeaderSourceSwitchTests.cpp", "HeadersTests.cpp", - "HeuristicResolverTests.cpp", "HoverTests.cpp", "IncludeCleanerTests.cpp", "IndexActionTests.cpp", diff --git a/llvm/utils/gn/secondary/clang/include/clang/Basic/BUILD.gn b/llvm/utils/gn/secondary/clang/include/clang/Basic/BUILD.gn index d8c4d8abdfd11..70af9760a858d 100644 --- a/llvm/utils/gn/secondary/clang/include/clang/Basic/BUILD.gn +++ b/llvm/utils/gn/secondary/clang/include/clang/Basic/BUILD.gn @@ -40,6 +40,14 @@ foreach(diag_group, diag_groups) { ] td_file = "Diagnostic.td" } + + clang_tablegen("Diagnostic${diag_group}Enums") { + args = [ + "-gen-clang-diags-enums", + "-clang-component=${diag_group}", + ] + td_file = "Diagnostic.td" + } } group("diags_tablegen") { # DiagnosticGroups and DiagnosticIndexName are intentionally not part of this @@ -47,7 +55,10 @@ group("diags_tablegen") { # but almost nothing needs DiagnosticGroups.inc or DiagnosticIndexName.inc. 
public_deps = [] foreach(diag_group, diag_groups) { - public_deps += [ ":Diagnostic${diag_group}Kinds" ] + public_deps += [ + ":Diagnostic${diag_group}Kinds", + ":Diagnostic${diag_group}Enums", + ] } } diff --git a/llvm/utils/gn/secondary/clang/lib/Sema/BUILD.gn b/llvm/utils/gn/secondary/clang/lib/Sema/BUILD.gn index fd2ac58714664..3b72177ee5d7c 100644 --- a/llvm/utils/gn/secondary/clang/lib/Sema/BUILD.gn +++ b/llvm/utils/gn/secondary/clang/lib/Sema/BUILD.gn @@ -44,6 +44,7 @@ static_library("Sema") { "DeclSpec.cpp", "DelayedDiagnostic.cpp", "HLSLExternalSemaSource.cpp", + "HeuristicResolver.cpp", "IdentifierResolver.cpp", "JumpDiagnostics.cpp", "MultiplexExternalSemaSource.cpp", @@ -94,6 +95,7 @@ static_library("Sema") { "SemaObjC.cpp", "SemaObjCProperty.cpp", "SemaOpenACC.cpp", + "SemaOpenACCClause.cpp", "SemaOpenCL.cpp", "SemaOpenMP.cpp", "SemaOverload.cpp", diff --git a/llvm/utils/gn/secondary/clang/unittests/Sema/BUILD.gn b/llvm/utils/gn/secondary/clang/unittests/Sema/BUILD.gn index 943840796a6a3..8a10db4bcc089 100644 --- a/llvm/utils/gn/secondary/clang/unittests/Sema/BUILD.gn +++ b/llvm/utils/gn/secondary/clang/unittests/Sema/BUILD.gn @@ -18,6 +18,7 @@ unittest("SemaTests") { "CodeCompleteTest.cpp", "ExternalSemaSourceTest.cpp", "GslOwnerPointerInference.cpp", + "HeuristicResolverTest.cpp", "SemaLookupTest.cpp", "SemaNoloadLookupTest.cpp", ] diff --git a/llvm/utils/gn/secondary/lldb/source/API/BUILD.gn b/llvm/utils/gn/secondary/lldb/source/API/BUILD.gn index fae0c22710b06..5e833cae6a4b1 100644 --- a/llvm/utils/gn/secondary/lldb/source/API/BUILD.gn +++ b/llvm/utils/gn/secondary/lldb/source/API/BUILD.gn @@ -84,6 +84,7 @@ target(liblldb_type, "liblldb") { "SBProcess.cpp", "SBProcessInfo.cpp", "SBProcessInfoList.cpp", + "SBProgress.cpp", "SBQueue.cpp", "SBQueueItem.cpp", "SBReproducer.cpp", diff --git a/llvm/utils/gn/secondary/lldb/source/Plugins/Process/Utility/BUILD.gn b/llvm/utils/gn/secondary/lldb/source/Plugins/Process/Utility/BUILD.gn index 
42402565d6fa3..2cc6a0baaae29 100644 --- a/llvm/utils/gn/secondary/lldb/source/Plugins/Process/Utility/BUILD.gn +++ b/llvm/utils/gn/secondary/lldb/source/Plugins/Process/Utility/BUILD.gn @@ -35,6 +35,7 @@ static_library("Utility") { "NativeRegisterContextDBReg_x86.cpp", "NativeRegisterContextRegisterInfo.cpp", "NetBSDSignals.cpp", + "OpenBSDSignals.cpp", "RegisterContextDarwin_arm.cpp", "RegisterContextDarwin_arm64.cpp", "RegisterContextDarwin_i386.cpp", diff --git a/llvm/utils/gn/secondary/llvm/include/llvm/Config/BUILD.gn b/llvm/utils/gn/secondary/llvm/include/llvm/Config/BUILD.gn index 75a370a3b7e8e..9b8990b5a6bcf 100644 --- a/llvm/utils/gn/secondary/llvm/include/llvm/Config/BUILD.gn +++ b/llvm/utils/gn/secondary/llvm/include/llvm/Config/BUILD.gn @@ -89,9 +89,6 @@ write_cmake_config("config") { "HAVE_DECL_FE_ALL_EXCEPT=1", "HAVE_DECL_FE_INEXACT=1", "LLVM_ENABLE_CRASH_DUMPS=", - "HAVE_ERRNO_H=1", - "HAVE_FCNTL_H=1", - "HAVE_FENV_H=1", "HAVE_FFI_CALL=", "HAVE_FFI_FFI_H=", "HAVE_FFI_H=", @@ -100,8 +97,6 @@ write_cmake_config("config") { "HAVE_MALLCTL=", "HAVE_PTHREAD_GET_NAME_NP=", "HAVE_PTHREAD_SET_NAME_NP=", - "HAVE_SIGNAL_H=1", - "HAVE_SYS_STAT_H=1", "HAVE_VALGRIND_VALGRIND_H=", "HAVE__ALLOCA=", "HAVE___ALLOCA=", @@ -211,7 +206,6 @@ write_cmake_config("config") { "HAVE_DLOPEN=", "HAVE_FUTIMES=", "HAVE_GETPAGESIZE=", - "HAVE_GETRLIMIT=", "HAVE_GETRUSAGE=", "HAVE_ISATTY=", "HAVE_LIBPTHREAD=", @@ -222,15 +216,10 @@ write_cmake_config("config") { "HAVE_PTHREAD_RWLOCK_INIT=", "HAVE_SBRK=", "HAVE_SETENV=", - "HAVE_SETRLIMIT=", "HAVE_SIGALTSTACK=", "HAVE_STRERROR_R=", "HAVE_SYSCONF=", - "HAVE_SYS_IOCTL_H=", "HAVE_SYS_MMAN_H=", - "HAVE_SYS_RESOURCE_H=", - "HAVE_SYS_TIME_H=", - "HAVE_TERMIOS_H=", "HAVE_UNISTD_H=", "HAVE__CHSIZE_S=1", "HAVE__UNWIND_BACKTRACE=", @@ -246,7 +235,6 @@ write_cmake_config("config") { "HAVE_DLOPEN=1", "HAVE_FUTIMES=1", "HAVE_GETPAGESIZE=1", - "HAVE_GETRLIMIT=1", "HAVE_GETRUSAGE=1", "HAVE_ISATTY=1", "HAVE_LIBPTHREAD=1", @@ -257,15 +245,10 @@ 
write_cmake_config("config") { "HAVE_PTHREAD_RWLOCK_INIT=1", "HAVE_SBRK=1", "HAVE_SETENV=1", - "HAVE_SETRLIMIT=1", "HAVE_SIGALTSTACK=1", "HAVE_STRERROR_R=1", "HAVE_SYSCONF=1", - "HAVE_SYS_IOCTL_H=1", "HAVE_SYS_MMAN_H=1", - "HAVE_SYS_RESOURCE_H=1", - "HAVE_SYS_TIME_H=1", - "HAVE_TERMIOS_H=1", "HAVE_UNISTD_H=1", "HAVE__CHSIZE_S=", "HAVE__UNWIND_BACKTRACE=1", diff --git a/llvm/utils/gn/secondary/llvm/lib/Transforms/Vectorize/BUILD.gn b/llvm/utils/gn/secondary/llvm/lib/Transforms/Vectorize/BUILD.gn index ea0f9b8723082..f59b6446f0dea 100644 --- a/llvm/utils/gn/secondary/llvm/lib/Transforms/Vectorize/BUILD.gn +++ b/llvm/utils/gn/secondary/llvm/lib/Transforms/Vectorize/BUILD.gn @@ -14,6 +14,7 @@ static_library("Vectorize") { "LoopVectorize.cpp", "SLPVectorizer.cpp", "SandboxVectorizer/DependencyGraph.cpp", + "SandboxVectorizer/InstrMaps.cpp", "SandboxVectorizer/Interval.cpp", "SandboxVectorizer/Legality.cpp", "SandboxVectorizer/Passes/BottomUpVec.cpp", @@ -22,6 +23,7 @@ static_library("Vectorize") { "SandboxVectorizer/SandboxVectorizerPassBuilder.cpp", "SandboxVectorizer/Scheduler.cpp", "SandboxVectorizer/SeedCollector.cpp", + "SandboxVectorizer/VecUtils.cpp", "VPlan.cpp", "VPlanAnalysis.cpp", "VPlanHCFGBuilder.cpp", diff --git a/llvm/utils/gn/secondary/llvm/unittests/ExecutionEngine/JITLink/BUILD.gn b/llvm/utils/gn/secondary/llvm/unittests/ExecutionEngine/JITLink/BUILD.gn index 78802e5cc2368..a83e9f5102668 100644 --- a/llvm/utils/gn/secondary/llvm/unittests/ExecutionEngine/JITLink/BUILD.gn +++ b/llvm/utils/gn/secondary/llvm/unittests/ExecutionEngine/JITLink/BUILD.gn @@ -23,5 +23,6 @@ unittest("JITLinkTests") { "MachOLinkGraphTests.cpp", "MemoryManagerErrorTests.cpp", "StubsTests.cpp", + "X86_64Tests.cpp", ] } diff --git a/llvm/utils/gn/secondary/llvm/unittests/Transforms/Vectorize/SandboxVectorizer/BUILD.gn b/llvm/utils/gn/secondary/llvm/unittests/Transforms/Vectorize/SandboxVectorizer/BUILD.gn index 97df71c6279ef..919512919cacc 100644 --- 
a/llvm/utils/gn/secondary/llvm/unittests/Transforms/Vectorize/SandboxVectorizer/BUILD.gn +++ b/llvm/utils/gn/secondary/llvm/unittests/Transforms/Vectorize/SandboxVectorizer/BUILD.gn @@ -11,6 +11,7 @@ unittest("SandboxVectorizerTests") { ] sources = [ "DependencyGraphTest.cpp", + "InstrMapsTest.cpp", "IntervalTest.cpp", "LegalityTest.cpp", "SchedulerTest.cpp", diff --git a/mlir/CMakeLists.txt b/mlir/CMakeLists.txt index a888ac243b044..9e786154a2b40 100644 --- a/mlir/CMakeLists.txt +++ b/mlir/CMakeLists.txt @@ -174,6 +174,8 @@ configure_file( # disable all package setup and control it themselves. #------------------------------------------------------------------------------- +set(MLIR_BINDINGS_PYTHON_NB_DOMAIN "mlir" + CACHE STRING "nanobind domain for MLIR python bindings.") set(MLIR_ENABLE_BINDINGS_PYTHON 0 CACHE BOOL "Enables building of Python bindings.") set(MLIR_DETECT_PYTHON_ENV_PRIME_SEARCH 1 CACHE BOOL diff --git a/mlir/cmake/modules/AddMLIRPython.cmake b/mlir/cmake/modules/AddMLIRPython.cmake index 0679db9cf93e1..815f65b106d94 100644 --- a/mlir/cmake/modules/AddMLIRPython.cmake +++ b/mlir/cmake/modules/AddMLIRPython.cmake @@ -667,7 +667,7 @@ function(add_mlir_python_extension libname extname) ) elseif(ARG_PYTHON_BINDINGS_LIBRARY STREQUAL "nanobind") nanobind_add_module(${libname} - NB_DOMAIN mlir + NB_DOMAIN ${MLIR_BINDINGS_PYTHON_NB_DOMAIN} FREE_THREADED ${ARG_SOURCES} ) diff --git a/mlir/docs/Dialects/Affine.md b/mlir/docs/Dialects/Affine.md index bfcbbf5bb3b13..0b6d7747e8a6f 100644 --- a/mlir/docs/Dialects/Affine.md +++ b/mlir/docs/Dialects/Affine.md @@ -69,9 +69,7 @@ immediately enclosed by the latter), 3. a value that dominates the `AffineScope` op enclosing the value's use, 4. the result of a constant operation, -5. the result of an -[`affine.apply` operation](#affineapply-mliraffineapplyop) that recursively takes as -arguments any valid symbolic identifiers, or +5. the result of a `Pure` operation whose operands are valid symbolic identifiers. 6. 
the result of a [`dim` operation](MemRef.md/#memrefdim-mlirmemrefdimop) on either a memref that is an argument to a `AffineScope` op or a memref where the corresponding diff --git a/mlir/docs/Dialects/emitc.md b/mlir/docs/Dialects/emitc.md index 54b00f76bb5d5..2b83f73b25b4e 100644 --- a/mlir/docs/Dialects/emitc.md +++ b/mlir/docs/Dialects/emitc.md @@ -20,6 +20,8 @@ The following convention is followed: GCC or Clang. * If `ub.posion` values should be initialized and have an opaque type, C++ is generated. +* If `emitc.array` with a dimension of size zero is used, then the code + requires [a GCC extension](https://gcc.gnu.org/onlinedocs/gcc/Zero-Length.html). * Else the generated code is compatible with C99. These restrictions are neither inherent to the EmitC dialect itself nor to the diff --git a/mlir/include/mlir/Conversion/GPUCommon/GPUCommonPass.h b/mlir/include/mlir/Conversion/GPUCommon/GPUCommonPass.h index 094360e75ab61..cf0c96f0eba00 100644 --- a/mlir/include/mlir/Conversion/GPUCommon/GPUCommonPass.h +++ b/mlir/include/mlir/Conversion/GPUCommon/GPUCommonPass.h @@ -64,7 +64,8 @@ struct FunctionCallBuilder { /// populate converter for gpu types. void populateGpuToLLVMConversionPatterns(LLVMTypeConverter &converter, RewritePatternSet &patterns, - bool kernelBarePtrCallConv = false); + bool kernelBarePtrCallConv = false, + bool typeCheckKernelArgs = false); /// A function that maps a MemorySpace enum to a target-specific integer value. using MemorySpaceMapping = std::function; diff --git a/mlir/include/mlir/Conversion/Passes.td b/mlir/include/mlir/Conversion/Passes.td index 71d8284d3d373..61f754d67dc13 100644 --- a/mlir/include/mlir/Conversion/Passes.td +++ b/mlir/include/mlir/Conversion/Passes.td @@ -517,6 +517,12 @@ def GpuToLLVMConversionPass : Pass<"gpu-to-llvm", "ModuleOp"> { /*default=*/"false", "Use bare pointers to pass memref arguments to kernels. " "The kernel must use the same setting for this option." 
+ >, + Option<"typeCheckKernelArgs", "type-check-kernel-args", "bool", + /*default=*/"false", + "Require all kernel arguments to be memrefs of rank 1 and with a " + "32-bit element size. This is a temporary option that will be " + "removed; TODO(https://github.com/llvm/llvm-project/issues/73457)." > ]; diff --git a/mlir/include/mlir/Dialect/LLVMIR/NVVMOps.td b/mlir/include/mlir/Dialect/LLVMIR/NVVMOps.td index 04042903e343e..bf3131932a56b 100644 --- a/mlir/include/mlir/Dialect/LLVMIR/NVVMOps.td +++ b/mlir/include/mlir/Dialect/LLVMIR/NVVMOps.td @@ -970,6 +970,77 @@ def NVVM_CpAsyncMBarrierArriveSharedOp : NVVM_Op<"cp.async.mbarrier.arrive.share }]; } +//===----------------------------------------------------------------------===// +// NVVM Conversion Ops (for "cvt.*" family of PTX instructions) +//===----------------------------------------------------------------------===// + +// Attributes for the floating point rounding modes supported by PTX +def FPRoundingModeNone : I32EnumAttrCase<"NONE", 0, "none">; +def FPRoundingModeRN : I32EnumAttrCase<"RN", 1, "rn">; +def FPRoundingModeRM : I32EnumAttrCase<"RM", 2, "rm">; +def FPRoundingModeRP : I32EnumAttrCase<"RP", 3, "rp">; +def FPRoundingModeRZ : I32EnumAttrCase<"RZ", 4, "rz">; +def FPRoundingModeRNA : I32EnumAttrCase<"RNA", 5, "rna">; + +def FPRoundingMode : I32EnumAttr<"FPRoundingMode", "NVVM FPRoundingMode kind", + [FPRoundingModeNone, FPRoundingModeRN, FPRoundingModeRM, + FPRoundingModeRP, FPRoundingModeRZ, FPRoundingModeRNA]> { + let genSpecializedAttr = 0; + let cppNamespace = "::mlir::NVVM"; +} +def FPRoundingModeAttr : EnumAttr { + let assemblyFormat = "`<` $value `>`"; +} + +def SaturationModeNone : I32EnumAttrCase<"NONE", 0, "none">; +def SaturationModeFinite : I32EnumAttrCase<"SATFINITE", 1, "satfinite">; + +def SaturationMode : I32EnumAttr<"SaturationMode", "NVVM SaturationMode kind", + [SaturationModeNone, SaturationModeFinite]> { + let genSpecializedAttr = 0; + let cppNamespace = "::mlir::NVVM"; +} +def 
SaturationModeAttr : EnumAttr { + let assemblyFormat = "`<` $value `>`"; +} + +def NVVM_CvtFloatToTF32Op : NVVM_Op<"cvt.float.to.tf32"> { + let summary = "Convert the given float input to TF32"; + let description = [{ + This Op converts the given f32 input to tf32. + The result `res` is represented as an i32 type. + The `relu` attribute, when set, lowers to the '.relu' variant of + the cvt instruction. The `rnd` and `sat` attributes specify + the rounding and saturation modes respectively. + [For more information, see PTX ISA] + (https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#data-movement-and-conversion-instructions-cvt) + }]; + + let hasVerifier = 1; + let results = (outs I32:$res); + let arguments = (ins + F32:$src, + DefaultValuedAttr:$rnd, + DefaultValuedAttr:$sat, + DefaultValuedAttr:$relu); + + let assemblyFormat = "$src attr-dict"; + + let extraClassDeclaration = [{ + static llvm::Intrinsic::ID getIntrinsicID(NVVM::FPRoundingMode, + NVVM::SaturationMode, + bool hasRelu); + }]; + + string llvmBuilder = [{ + auto intId = NVVM::CvtFloatToTF32Op::getIntrinsicID($rnd, $sat, $relu); + $res = createIntrinsicCall(builder, intId, {$src}); + }]; +} + +//===----------------------------------------------------------------------===// +// NVVM MMA Ops +//===----------------------------------------------------------------------===// /// Helpers to instantiate different version of wmma intrinsics. /// This matches the hierarchy used in IntrinsicsNVVM.td to define all the /// combinations of the intrinsics. 
diff --git a/mlir/include/mlir/Dialect/SPIRV/IR/SPIRVBase.td b/mlir/include/mlir/Dialect/SPIRV/IR/SPIRVBase.td index a4c01c0bc3418..469a9a0ef01dd 100644 --- a/mlir/include/mlir/Dialect/SPIRV/IR/SPIRVBase.td +++ b/mlir/include/mlir/Dialect/SPIRV/IR/SPIRVBase.td @@ -4171,6 +4171,7 @@ def SPIRV_IsArrayType : CPred<"::llvm::isa<::mlir::spirv::ArrayType>($_self)">; def SPIRV_IsCooperativeMatrixType : CPred<"::llvm::isa<::mlir::spirv::CooperativeMatrixType>($_self)">; def SPIRV_IsImageType : CPred<"::llvm::isa<::mlir::spirv::ImageType>($_self)">; +def SPIRV_IsVectorType : CPred<"::llvm::isa<::mlir::VectorType>($_self)">; def SPIRV_IsMatrixType : CPred<"::llvm::isa<::mlir::spirv::MatrixType>($_self)">; def SPIRV_IsPtrType : CPred<"::llvm::isa<::mlir::spirv::PointerType>($_self)">; def SPIRV_IsRTArrayType : CPred<"::llvm::isa<::mlir::spirv::RuntimeArrayType>($_self)">; @@ -4202,6 +4203,8 @@ def SPIRV_AnyCooperativeMatrix : DialectType; def SPIRV_AnyImage : DialectType; +def SPIRV_AnyVector : DialectType; def SPIRV_AnyMatrix : DialectType; def SPIRV_AnyRTArray : DialectType; def SPIRV_OC_OpFMod : I32EnumAttrCase<"OpFMod", 141>; def SPIRV_OC_OpVectorTimesScalar : I32EnumAttrCase<"OpVectorTimesScalar", 142>; def SPIRV_OC_OpMatrixTimesScalar : I32EnumAttrCase<"OpMatrixTimesScalar", 143>; +def SPIRV_OC_OpMatrixTimesVector : I32EnumAttrCase<"OpMatrixTimesVector", 145>; def SPIRV_OC_OpMatrixTimesMatrix : I32EnumAttrCase<"OpMatrixTimesMatrix", 146>; def SPIRV_OC_OpDot : I32EnumAttrCase<"OpDot", 148>; def SPIRV_OC_OpIAddCarry : I32EnumAttrCase<"OpIAddCarry", 149>; @@ -4553,7 +4557,7 @@ def SPIRV_OpcodeAttr : SPIRV_OC_OpFSub, SPIRV_OC_OpIMul, SPIRV_OC_OpFMul, SPIRV_OC_OpUDiv, SPIRV_OC_OpSDiv, SPIRV_OC_OpFDiv, SPIRV_OC_OpUMod, SPIRV_OC_OpSRem, SPIRV_OC_OpSMod, SPIRV_OC_OpFRem, SPIRV_OC_OpFMod, - SPIRV_OC_OpVectorTimesScalar, SPIRV_OC_OpMatrixTimesScalar, + SPIRV_OC_OpVectorTimesScalar, SPIRV_OC_OpMatrixTimesScalar, SPIRV_OC_OpMatrixTimesVector, SPIRV_OC_OpMatrixTimesMatrix, 
SPIRV_OC_OpDot, SPIRV_OC_OpIAddCarry, SPIRV_OC_OpISubBorrow, SPIRV_OC_OpUMulExtended, SPIRV_OC_OpSMulExtended, SPIRV_OC_OpIsNan, SPIRV_OC_OpIsInf, SPIRV_OC_OpOrdered, SPIRV_OC_OpUnordered, diff --git a/mlir/include/mlir/Dialect/SPIRV/IR/SPIRVGLOps.td b/mlir/include/mlir/Dialect/SPIRV/IR/SPIRVGLOps.td index 3fcfb086f9662..1cdfa02f81787 100644 --- a/mlir/include/mlir/Dialect/SPIRV/IR/SPIRVGLOps.td +++ b/mlir/include/mlir/Dialect/SPIRV/IR/SPIRVGLOps.td @@ -1029,6 +1029,122 @@ def SPIRV_GLFMixOp : let hasVerifier = 0; } +// ----- + +def SPIRV_GLDistanceOp : SPIRV_GLOp<"Distance", 67, [ + Pure, + AllTypesMatch<["p0", "p1"]>, + TypesMatchWith<"result type must match operand element type", + "p0", "result", + "::mlir::getElementTypeOrSelf($_self)"> + ]> { + let summary = "Return distance between two points"; + + let description = [{ + Result is the distance between p0 and p1, i.e., length(p0 - p1). + + The operands must all be a scalar or vector whose component type is floating-point. + + Result Type must be a scalar of the same type as the component type of the operands. + + #### Example: + + ```mlir + %2 = spirv.GL.Distance %0, %1 : vector<3xf32>, vector<3xf32> -> f32 + ``` + }]; + + let arguments = (ins + SPIRV_ScalarOrVectorOf:$p0, + SPIRV_ScalarOrVectorOf:$p1 + ); + + let results = (outs + SPIRV_Float:$result + ); + + let assemblyFormat = [{ + operands attr-dict `:` type($p0) `,` type($p1) `->` type($result) + }]; + + let hasVerifier = 0; +} + +// ----- + +def SPIRV_GLCrossOp : SPIRV_GLBinaryArithmeticOp<"Cross", 68, SPIRV_Float> { + let summary = "Return the cross product of two 3-component vectors"; + + let description = [{ + Result is the cross product of x and y, i.e., the resulting components are, in order: + + x[1] * y[2] - y[1] * x[2] + + x[2] * y[0] - y[2] * x[0] + + x[0] * y[1] - y[0] * x[1] + + All the operands must be vectors of 3 components of a floating-point type. + + Result Type and the type of all operands must be the same type. 
+ + #### Example: + + ```mlir + %2 = spirv.GL.Cross %0, %1 : vector<3xf32> + %3 = spirv.GL.Cross %0, %1 : vector<3xf16> + ``` + }]; +} + +// ----- + +def SPIRV_GLNormalizeOp : SPIRV_GLUnaryArithmeticOp<"Normalize", 69, SPIRV_Float> { + let summary = "Normalizes a vector operand"; + + let description = [{ + Result is the vector in the same direction as x but with a length of 1. + + The operand x must be a scalar or vector whose component type is floating-point. + + Result Type and the type of x must be the same type. + + #### Example: + + ```mlir + %2 = spirv.GL.Normalize %0 : vector<3xf32> + %3 = spirv.GL.Normalize %1 : vector<4xf16> + ``` + }]; +} + +// ----- + +def SPIRV_GLReflectOp : SPIRV_GLBinaryArithmeticOp<"Reflect", 71, SPIRV_Float> { + let summary = "Calculate reflection direction vector"; + + let description = [{ + For the incident vector I and surface orientation N, the result is the reflection direction: + + I - 2 * dot(N, I) * N + + N must already be normalized in order to achieve the desired result. + + The operands must all be a scalar or vector whose component type is floating-point. + + Result Type and the type of all operands must be the same type. 
+ + #### Example: + + ```mlir + %2 = spirv.GL.Reflect %0, %1 : f32 + %3 = spirv.GL.Reflect %0, %1 : vector<3xf32> + ``` + }]; +} + +// ----- + def SPIRV_GLFindUMsbOp : SPIRV_GLUnaryArithmeticOp<"FindUMsb", 75, SPIRV_Int32> { let summary = "Unsigned-integer most-significant bit"; diff --git a/mlir/include/mlir/Dialect/SPIRV/IR/SPIRVMatrixOps.td b/mlir/include/mlir/Dialect/SPIRV/IR/SPIRVMatrixOps.td index a6f0f41429bcb..5bd99386e0085 100644 --- a/mlir/include/mlir/Dialect/SPIRV/IR/SPIRVMatrixOps.td +++ b/mlir/include/mlir/Dialect/SPIRV/IR/SPIRVMatrixOps.td @@ -114,6 +114,47 @@ def SPIRV_MatrixTimesScalarOp : SPIRV_Op< // ----- +def SPIRV_MatrixTimesVectorOp : SPIRV_Op<"MatrixTimesVector", [Pure]> { + let summary = "Linear-algebraic multiply of matrix X vector."; + + let description = [{ + Result Type must be a vector of floating-point type. + + Matrix must be an OpTypeMatrix whose Column Type is Result Type. + + Vector must be a vector with the same Component Type as the Component Type in Result Type. Its number of components must equal the number of columns in Matrix. 
+ + #### Example: + + ```mlir + %0 = spirv.MatrixTimesVector %matrix, %vector : + !spirv.matrix<3 x vector<2xf32>>, vector<3xf32> -> vector<2xf32> + ``` + }]; + + let availability = [ + MinVersion, + MaxVersion, + Extension<[]>, + Capability<[SPIRV_C_Matrix]> + ]; + + let arguments = (ins + SPIRV_AnyMatrix:$matrix, + SPIRV_AnyVector:$vector + ); + + let results = (outs + SPIRV_AnyVector:$result + ); + + let assemblyFormat = [{ + operands attr-dict `:` type($matrix) `,` type($vector) `->` type($result) + }]; +} + +// ----- + def SPIRV_TransposeOp : SPIRV_Op<"Transpose", [Pure]> { let summary = "Transpose a matrix."; diff --git a/mlir/include/mlir/IR/BuiltinTypeInterfaces.td b/mlir/include/mlir/IR/BuiltinTypeInterfaces.td index c36b738e38f42..8aa2c55570153 100644 --- a/mlir/include/mlir/IR/BuiltinTypeInterfaces.td +++ b/mlir/include/mlir/IR/BuiltinTypeInterfaces.td @@ -46,26 +46,6 @@ def FloatTypeInterface : TypeInterface<"FloatType"> { ]; let extraClassDeclaration = [{ - // Convenience factories. - static FloatType getBF16(MLIRContext *ctx); - static FloatType getF16(MLIRContext *ctx); - static FloatType getF32(MLIRContext *ctx); - static FloatType getTF32(MLIRContext *ctx); - static FloatType getF64(MLIRContext *ctx); - static FloatType getF80(MLIRContext *ctx); - static FloatType getF128(MLIRContext *ctx); - static FloatType getFloat8E5M2(MLIRContext *ctx); - static FloatType getFloat8E4M3(MLIRContext *ctx); - static FloatType getFloat8E4M3FN(MLIRContext *ctx); - static FloatType getFloat8E5M2FNUZ(MLIRContext *ctx); - static FloatType getFloat8E4M3FNUZ(MLIRContext *ctx); - static FloatType getFloat8E4M3B11FNUZ(MLIRContext *ctx); - static FloatType getFloat8E3M4(MLIRContext *ctx); - static FloatType getFloat4E2M1FN(MLIRContext *ctx); - static FloatType getFloat6E2M3FN(MLIRContext *ctx); - static FloatType getFloat6E3M2FN(MLIRContext *ctx); - static FloatType getFloat8E8M0FNU(MLIRContext *ctx); - /// Return the bitwidth of this float type. 
unsigned getWidth(); diff --git a/mlir/include/mlir/IR/BuiltinTypes.h b/mlir/include/mlir/IR/BuiltinTypes.h index 2b3c2b6d1753d..19c5361124aac 100644 --- a/mlir/include/mlir/IR/BuiltinTypes.h +++ b/mlir/include/mlir/IR/BuiltinTypes.h @@ -401,78 +401,6 @@ inline bool BaseMemRefType::isValidElementType(Type type) { llvm::isa(type); } -inline FloatType FloatType::getFloat4E2M1FN(MLIRContext *ctx) { - return Float4E2M1FNType::get(ctx); -} - -inline FloatType FloatType::getFloat6E2M3FN(MLIRContext *ctx) { - return Float6E2M3FNType::get(ctx); -} - -inline FloatType FloatType::getFloat6E3M2FN(MLIRContext *ctx) { - return Float6E3M2FNType::get(ctx); -} - -inline FloatType FloatType::getFloat8E5M2(MLIRContext *ctx) { - return Float8E5M2Type::get(ctx); -} - -inline FloatType FloatType::getFloat8E4M3(MLIRContext *ctx) { - return Float8E4M3Type::get(ctx); -} - -inline FloatType FloatType::getFloat8E4M3FN(MLIRContext *ctx) { - return Float8E4M3FNType::get(ctx); -} - -inline FloatType FloatType::getFloat8E5M2FNUZ(MLIRContext *ctx) { - return Float8E5M2FNUZType::get(ctx); -} - -inline FloatType FloatType::getFloat8E4M3FNUZ(MLIRContext *ctx) { - return Float8E4M3FNUZType::get(ctx); -} - -inline FloatType FloatType::getFloat8E4M3B11FNUZ(MLIRContext *ctx) { - return Float8E4M3B11FNUZType::get(ctx); -} - -inline FloatType FloatType::getFloat8E3M4(MLIRContext *ctx) { - return Float8E3M4Type::get(ctx); -} - -inline FloatType FloatType::getFloat8E8M0FNU(MLIRContext *ctx) { - return Float8E8M0FNUType::get(ctx); -} - -inline FloatType FloatType::getBF16(MLIRContext *ctx) { - return BFloat16Type::get(ctx); -} - -inline FloatType FloatType::getF16(MLIRContext *ctx) { - return Float16Type::get(ctx); -} - -inline FloatType FloatType::getTF32(MLIRContext *ctx) { - return FloatTF32Type::get(ctx); -} - -inline FloatType FloatType::getF32(MLIRContext *ctx) { - return Float32Type::get(ctx); -} - -inline FloatType FloatType::getF64(MLIRContext *ctx) { - return Float64Type::get(ctx); -} - -inline 
FloatType FloatType::getF80(MLIRContext *ctx) { - return Float80Type::get(ctx); -} - -inline FloatType FloatType::getF128(MLIRContext *ctx) { - return Float128Type::get(ctx); -} - inline bool TensorType::classof(Type type) { return llvm::isa(type); } diff --git a/mlir/include/mlir/IR/Diagnostics.h b/mlir/include/mlir/IR/Diagnostics.h index 8429325412dc9..36c433c63b26d 100644 --- a/mlir/include/mlir/IR/Diagnostics.h +++ b/mlir/include/mlir/IR/Diagnostics.h @@ -578,6 +578,9 @@ class SourceMgrDiagnosticHandler : public ScopedDiagnosticHandler { void emitDiagnostic(Location loc, Twine message, DiagnosticSeverity kind, bool displaySourceLine = true); + /// Set the maximum depth that a call stack will be printed. Defaults to 10. + void setCallStackLimit(unsigned limit); + protected: /// Emit the given diagnostic with the held source manager. void emitDiagnostic(Diagnostic &diag); @@ -605,7 +608,6 @@ class SourceMgrDiagnosticHandler : public ScopedDiagnosticHandler { std::optional findLocToShow(Location loc); /// The maximum depth that a call stack will be printed. - /// TODO: This should be a tunable flag. 
unsigned callStackLimit = 10; std::unique_ptr impl; diff --git a/mlir/include/mlir/Target/SPIRV/Deserialization.h b/mlir/include/mlir/Target/SPIRV/Deserialization.h index e39258beeaac8..a346a7fd1e5f7 100644 --- a/mlir/include/mlir/Target/SPIRV/Deserialization.h +++ b/mlir/include/mlir/Target/SPIRV/Deserialization.h @@ -15,6 +15,7 @@ #include "mlir/IR/OwningOpRef.h" #include "mlir/Support/LLVM.h" +#include namespace mlir { class MLIRContext; diff --git a/mlir/lib/CAPI/IR/BuiltinTypes.cpp b/mlir/lib/CAPI/IR/BuiltinTypes.cpp index 252ff54afe0c5..250e4a6bbf8df 100644 --- a/mlir/lib/CAPI/IR/BuiltinTypes.cpp +++ b/mlir/lib/CAPI/IR/BuiltinTypes.cpp @@ -94,7 +94,7 @@ bool mlirTypeIsAFloat4E2M1FN(MlirType type) { } MlirType mlirFloat4E2M1FNTypeGet(MlirContext ctx) { - return wrap(FloatType::getFloat4E2M1FN(unwrap(ctx))); + return wrap(Float4E2M1FNType::get(unwrap(ctx))); } MlirTypeID mlirFloat6E2M3FNTypeGetTypeID() { @@ -106,7 +106,7 @@ bool mlirTypeIsAFloat6E2M3FN(MlirType type) { } MlirType mlirFloat6E2M3FNTypeGet(MlirContext ctx) { - return wrap(FloatType::getFloat6E2M3FN(unwrap(ctx))); + return wrap(Float6E2M3FNType::get(unwrap(ctx))); } MlirTypeID mlirFloat6E3M2FNTypeGetTypeID() { @@ -118,7 +118,7 @@ bool mlirTypeIsAFloat6E3M2FN(MlirType type) { } MlirType mlirFloat6E3M2FNTypeGet(MlirContext ctx) { - return wrap(FloatType::getFloat6E3M2FN(unwrap(ctx))); + return wrap(Float6E3M2FNType::get(unwrap(ctx))); } MlirTypeID mlirFloat8E5M2TypeGetTypeID() { @@ -130,7 +130,7 @@ bool mlirTypeIsAFloat8E5M2(MlirType type) { } MlirType mlirFloat8E5M2TypeGet(MlirContext ctx) { - return wrap(FloatType::getFloat8E5M2(unwrap(ctx))); + return wrap(Float8E5M2Type::get(unwrap(ctx))); } MlirTypeID mlirFloat8E4M3TypeGetTypeID() { @@ -142,7 +142,7 @@ bool mlirTypeIsAFloat8E4M3(MlirType type) { } MlirType mlirFloat8E4M3TypeGet(MlirContext ctx) { - return wrap(FloatType::getFloat8E4M3(unwrap(ctx))); + return wrap(Float8E4M3Type::get(unwrap(ctx))); } MlirTypeID mlirFloat8E4M3FNTypeGetTypeID() { 
@@ -154,7 +154,7 @@ bool mlirTypeIsAFloat8E4M3FN(MlirType type) { } MlirType mlirFloat8E4M3FNTypeGet(MlirContext ctx) { - return wrap(FloatType::getFloat8E4M3FN(unwrap(ctx))); + return wrap(Float8E4M3FNType::get(unwrap(ctx))); } MlirTypeID mlirFloat8E5M2FNUZTypeGetTypeID() { @@ -166,7 +166,7 @@ bool mlirTypeIsAFloat8E5M2FNUZ(MlirType type) { } MlirType mlirFloat8E5M2FNUZTypeGet(MlirContext ctx) { - return wrap(FloatType::getFloat8E5M2FNUZ(unwrap(ctx))); + return wrap(Float8E5M2FNUZType::get(unwrap(ctx))); } MlirTypeID mlirFloat8E4M3FNUZTypeGetTypeID() { @@ -178,7 +178,7 @@ bool mlirTypeIsAFloat8E4M3FNUZ(MlirType type) { } MlirType mlirFloat8E4M3FNUZTypeGet(MlirContext ctx) { - return wrap(FloatType::getFloat8E4M3FNUZ(unwrap(ctx))); + return wrap(Float8E4M3FNUZType::get(unwrap(ctx))); } MlirTypeID mlirFloat8E4M3B11FNUZTypeGetTypeID() { @@ -190,7 +190,7 @@ bool mlirTypeIsAFloat8E4M3B11FNUZ(MlirType type) { } MlirType mlirFloat8E4M3B11FNUZTypeGet(MlirContext ctx) { - return wrap(FloatType::getFloat8E4M3B11FNUZ(unwrap(ctx))); + return wrap(Float8E4M3B11FNUZType::get(unwrap(ctx))); } MlirTypeID mlirFloat8E3M4TypeGetTypeID() { @@ -202,7 +202,7 @@ bool mlirTypeIsAFloat8E3M4(MlirType type) { } MlirType mlirFloat8E3M4TypeGet(MlirContext ctx) { - return wrap(FloatType::getFloat8E3M4(unwrap(ctx))); + return wrap(Float8E3M4Type::get(unwrap(ctx))); } MlirTypeID mlirFloat8E8M0FNUTypeGetTypeID() { @@ -214,7 +214,7 @@ bool mlirTypeIsAFloat8E8M0FNU(MlirType type) { } MlirType mlirFloat8E8M0FNUTypeGet(MlirContext ctx) { - return wrap(FloatType::getFloat8E8M0FNU(unwrap(ctx))); + return wrap(Float8E8M0FNUType::get(unwrap(ctx))); } MlirTypeID mlirBFloat16TypeGetTypeID() { @@ -224,7 +224,7 @@ MlirTypeID mlirBFloat16TypeGetTypeID() { bool mlirTypeIsABF16(MlirType type) { return unwrap(type).isBF16(); } MlirType mlirBF16TypeGet(MlirContext ctx) { - return wrap(FloatType::getBF16(unwrap(ctx))); + return wrap(BFloat16Type::get(unwrap(ctx))); } MlirTypeID mlirFloat16TypeGetTypeID() { return 
wrap(Float16Type::getTypeID()); } @@ -232,7 +232,7 @@ MlirTypeID mlirFloat16TypeGetTypeID() { return wrap(Float16Type::getTypeID()); } bool mlirTypeIsAF16(MlirType type) { return unwrap(type).isF16(); } MlirType mlirF16TypeGet(MlirContext ctx) { - return wrap(FloatType::getF16(unwrap(ctx))); + return wrap(Float16Type::get(unwrap(ctx))); } MlirTypeID mlirFloatTF32TypeGetTypeID() { @@ -242,7 +242,7 @@ MlirTypeID mlirFloatTF32TypeGetTypeID() { bool mlirTypeIsATF32(MlirType type) { return unwrap(type).isTF32(); } MlirType mlirTF32TypeGet(MlirContext ctx) { - return wrap(FloatType::getTF32(unwrap(ctx))); + return wrap(FloatTF32Type::get(unwrap(ctx))); } MlirTypeID mlirFloat32TypeGetTypeID() { return wrap(Float32Type::getTypeID()); } @@ -250,7 +250,7 @@ MlirTypeID mlirFloat32TypeGetTypeID() { return wrap(Float32Type::getTypeID()); } bool mlirTypeIsAF32(MlirType type) { return unwrap(type).isF32(); } MlirType mlirF32TypeGet(MlirContext ctx) { - return wrap(FloatType::getF32(unwrap(ctx))); + return wrap(Float32Type::get(unwrap(ctx))); } MlirTypeID mlirFloat64TypeGetTypeID() { return wrap(Float64Type::getTypeID()); } @@ -258,7 +258,7 @@ MlirTypeID mlirFloat64TypeGetTypeID() { return wrap(Float64Type::getTypeID()); } bool mlirTypeIsAF64(MlirType type) { return unwrap(type).isF64(); } MlirType mlirF64TypeGet(MlirContext ctx) { - return wrap(FloatType::getF64(unwrap(ctx))); + return wrap(Float64Type::get(unwrap(ctx))); } //===----------------------------------------------------------------------===// diff --git a/mlir/lib/Conversion/ArithToEmitC/ArithToEmitC.cpp b/mlir/lib/Conversion/ArithToEmitC/ArithToEmitC.cpp index 211f00807d72b..d347ae916784b 100644 --- a/mlir/lib/Conversion/ArithToEmitC/ArithToEmitC.cpp +++ b/mlir/lib/Conversion/ArithToEmitC/ArithToEmitC.cpp @@ -739,65 +739,38 @@ class ItoFCastOpConversion : public OpConversionPattern { } }; -class TruncFConversion : public OpConversionPattern { -public: - using OpConversionPattern::OpConversionPattern; - - LogicalResult 
- matchAndRewrite(arith::TruncFOp castOp, - typename arith::TruncFOp::Adaptor adaptor, - ConversionPatternRewriter &rewriter) const override { - // FIXME Upstream LLVM (commit 77cbc9bf60) brings in a rounding mode - // attribute that we need to check. For now, the behavior is the default, - // i.e. truncate. - Type operandType = adaptor.getIn().getType(); - if (!emitc::isFloatOrOpaqueType(operandType)) - return rewriter.notifyMatchFailure(castOp, - "unsupported cast source type"); - - Type dstType = this->getTypeConverter()->convertType(castOp.getType()); - if (!dstType) - return rewriter.notifyMatchFailure(castOp, "type conversion failed"); - - if (!emitc::isFloatOrOpaqueType(dstType)) - return rewriter.notifyMatchFailure(castOp, - "unsupported cast destination type"); - - if (!emitc::CastOp::areCastCompatible(operandType, dstType)) - return rewriter.notifyMatchFailure(castOp, "cast-incompatible types"); - - rewriter.replaceOpWithNewOp(castOp, dstType, - adaptor.getIn()); - - return success(); - } -}; - -class ExtFConversion : public OpConversionPattern { +// Floating-point to floating-point conversions. +template +class FpCastOpConversion : public OpConversionPattern { public: - using OpConversionPattern::OpConversionPattern; + FpCastOpConversion(const TypeConverter &typeConverter, MLIRContext *context) + : OpConversionPattern(typeConverter, context) {} LogicalResult - matchAndRewrite(arith::ExtFOp castOp, typename arith::ExtFOp::Adaptor adaptor, + matchAndRewrite(CastOp castOp, typename CastOp::Adaptor adaptor, ConversionPatternRewriter &rewriter) const override { + // Vectors in particular are not supported. Type operandType = adaptor.getIn().getType(); - if (!emitc::isFloatOrOpaqueType(operandType)) + if (!emitc::isSupportedFloatType(operandType)) return rewriter.notifyMatchFailure(castOp, "unsupported cast source type"); + if (auto roundingModeOp = + dyn_cast(*castOp)) { + // Only supporting default rounding mode as of now. 
+ if (roundingModeOp.getRoundingModeAttr()) + return rewriter.notifyMatchFailure(castOp, "unsupported rounding mode"); + } Type dstType = this->getTypeConverter()->convertType(castOp.getType()); if (!dstType) return rewriter.notifyMatchFailure(castOp, "type conversion failed"); - if (!emitc::isFloatOrOpaqueType(dstType)) + if (!emitc::isSupportedFloatType(dstType)) return rewriter.notifyMatchFailure(castOp, "unsupported cast destination type"); - if (!emitc::CastOp::areCastCompatible(operandType, dstType)) - return rewriter.notifyMatchFailure(castOp, "cast-incompatible types"); - - rewriter.replaceOpWithNewOp(castOp, dstType, - adaptor.getIn()); + Value fpCastOperand = adaptor.getIn(); + rewriter.replaceOpWithNewOp(castOp, dstType, fpCastOperand); return success(); } @@ -849,8 +822,8 @@ void mlir::populateArithToEmitCPatterns(TypeConverter &typeConverter, ItoFCastOpConversion, FtoICastOpConversion, FtoICastOpConversion, - TruncFConversion, - ExtFConversion + FpCastOpConversion, + FpCastOpConversion >(typeConverter, ctx); // clang-format on } diff --git a/mlir/lib/Conversion/GPUCommon/GPUToLLVMConversion.cpp b/mlir/lib/Conversion/GPUCommon/GPUToLLVMConversion.cpp index 83208e0c42da2..ca9883a79dc16 100644 --- a/mlir/lib/Conversion/GPUCommon/GPUToLLVMConversion.cpp +++ b/mlir/lib/Conversion/GPUCommon/GPUToLLVMConversion.cpp @@ -427,9 +427,11 @@ class LegalizeLaunchFuncOpPattern : public ConvertOpToGpuRuntimeCallPattern { public: LegalizeLaunchFuncOpPattern(const LLVMTypeConverter &typeConverter, - bool kernelBarePtrCallConv) + bool kernelBarePtrCallConv, + bool typeCheckKernelArgs) : ConvertOpToGpuRuntimeCallPattern(typeConverter), - kernelBarePtrCallConv(kernelBarePtrCallConv) {} + kernelBarePtrCallConv(kernelBarePtrCallConv), + typeCheckKernelArgs(typeCheckKernelArgs) {} private: LogicalResult @@ -437,6 +439,7 @@ class LegalizeLaunchFuncOpPattern ConversionPatternRewriter &rewriter) const override; bool kernelBarePtrCallConv; + bool typeCheckKernelArgs; }; /// A 
rewrite pattern to convert gpu.memcpy operations into a GPU runtime @@ -563,8 +566,8 @@ void GpuToLLVMConversionPass::runOnOperation() { populateFinalizeMemRefToLLVMConversionPatterns(converter, patterns); populateAsyncStructuralTypeConversionsAndLegality(converter, patterns, target); - populateGpuToLLVMConversionPatterns(converter, patterns, - kernelBarePtrCallConv); + populateGpuToLLVMConversionPatterns( + converter, patterns, kernelBarePtrCallConv, typeCheckKernelArgs); if (failed( applyPartialConversion(getOperation(), target, std::move(patterns)))) @@ -966,6 +969,28 @@ LogicalResult LegalizeLaunchFuncOpPattern::matchAndRewrite( // stream must be created to pass to subsequent operations. else if (launchOp.getAsyncToken()) stream = streamCreateCallBuilder.create(loc, rewriter, {}).getResult(); + + if (typeCheckKernelArgs) { + // The current non-bare-pointer ABI is a bad fit for `mgpuLaunchKernel`, + // which takes an untyped list of arguments. The type check here prevents + // accidentally violating the assumption made in vulkan-runtime-wrappers.cpp + // and creating an unchecked runtime ABI mismatch. + // TODO(https://github.com/llvm/llvm-project/issues/73457): Change the ABI + // here to remove the need for this type check. + for (Value arg : launchOp.getKernelOperands()) { + if (auto memrefTy = dyn_cast(arg.getType())) { + if (memrefTy.getRank() != 1 || + memrefTy.getElementTypeBitWidth() != 32) { + return rewriter.notifyMatchFailure( + launchOp, "Operand to launch op is not a rank-1 memref with " + "32-bit element type."); + } + } else { + return rewriter.notifyMatchFailure( + launchOp, "Operand to launch op is not a memref."); + } + } + } // Lower the kernel operands to match kernel parameters. // Note: If `useBarePtrCallConv` is set in the type converter's options, // the value of `kernelBarePtrCallConv` will be ignored. 
@@ -1737,7 +1762,8 @@ LogicalResult ConvertCreateBsrOpToGpuRuntimeCallPattern::matchAndRewrite( void mlir::populateGpuToLLVMConversionPatterns(LLVMTypeConverter &converter, RewritePatternSet &patterns, - bool kernelBarePtrCallConv) { + bool kernelBarePtrCallConv, + bool typeCheckKernelArgs) { addOpaquePointerConversion(converter); addOpaquePointerConversion(converter); addOpaquePointerConversion(converter); @@ -1774,7 +1800,8 @@ void mlir::populateGpuToLLVMConversionPatterns(LLVMTypeConverter &converter, ConvertSpGEMMCopyOpToGpuRuntimeCallPattern, ConvertSpMatGetSizeOpToGpuRuntimeCallPattern, ConvertSetCsrPointersOpToGpuRuntimeCallPattern>(converter); - patterns.add(converter, kernelBarePtrCallConv); + patterns.add(converter, kernelBarePtrCallConv, + typeCheckKernelArgs); } //===----------------------------------------------------------------------===// diff --git a/mlir/lib/Dialect/Affine/IR/AffineOps.cpp b/mlir/lib/Dialect/Affine/IR/AffineOps.cpp index ba1d4a2f56c11..c95899d817eac 100644 --- a/mlir/lib/Dialect/Affine/IR/AffineOps.cpp +++ b/mlir/lib/Dialect/Affine/IR/AffineOps.cpp @@ -410,7 +410,8 @@ bool mlir::affine::isValidSymbol(Value value) { /// A value can be used as a symbol for `region` iff it meets one of the /// following conditions: /// *) It is a constant. -/// *) It is the result of an affine apply operation with symbol arguments. +/// *) It is a result of a `Pure` operation whose operands are valid symbolic +/// identifiers. /// *) It is a result of the dim op on a memref whose corresponding size is /// a valid symbol. /// *) It is defined at the top level of 'region' or is its argument. @@ -443,9 +444,12 @@ bool mlir::affine::isValidSymbol(Value value, Region *region) { if (matchPattern(defOp, m_Constant(&operandCst))) return true; - // Affine apply operation is ok if all of its operands are ok. - if (auto applyOp = dyn_cast(defOp)) - return applyOp.isValidSymbol(region); + // `Pure` operation whose operands are valid symbolic identifiers. 
+ if (isPure(defOp) && llvm::all_of(defOp->getOperands(), [&](Value operand) { + return affine::isValidSymbol(operand, region); + })) { + return true; + } // Dim op results could be valid symbols at any level. if (auto dimOp = dyn_cast(defOp)) diff --git a/mlir/lib/Dialect/LLVMIR/IR/LLVMAttrs.cpp b/mlir/lib/Dialect/LLVMIR/IR/LLVMAttrs.cpp index ff1636bc121b6..e4f9d6f987401 100644 --- a/mlir/lib/Dialect/LLVMIR/IR/LLVMAttrs.cpp +++ b/mlir/lib/Dialect/LLVMIR/IR/LLVMAttrs.cpp @@ -270,11 +270,9 @@ Attribute ConstantRangeAttr::parse(AsmParser &parser, Type odsType) { if (parser.parseInteger(lower) || parser.parseComma() || parser.parseInteger(upper) || parser.parseGreater()) return Attribute{}; - // For some reason, 0 is always parsed as 64-bits, fix that if needed. - if (lower.isZero()) - lower = lower.sextOrTrunc(bitWidth); - if (upper.isZero()) - upper = upper.sextOrTrunc(bitWidth); + // Non-positive numbers may use more bits than `bitWidth` + lower = lower.sextOrTrunc(bitWidth); + upper = upper.sextOrTrunc(bitWidth); return parser.getChecked(loc, parser.getContext(), lower, upper); } diff --git a/mlir/lib/Dialect/LLVMIR/IR/NVVMDialect.cpp b/mlir/lib/Dialect/LLVMIR/IR/NVVMDialect.cpp index d8fde3e765ac4..ccb5ad05f0bf7 100644 --- a/mlir/lib/Dialect/LLVMIR/IR/NVVMDialect.cpp +++ b/mlir/lib/Dialect/LLVMIR/IR/NVVMDialect.cpp @@ -138,6 +138,26 @@ LogicalResult CpAsyncBulkTensorReduceOp::verify() { getLoc()); } +LogicalResult CvtFloatToTF32Op::verify() { + using RndMode = NVVM::FPRoundingMode; + switch (getRnd()) { + case RndMode::RNA: + if (getRelu()) + return emitError("Relu not supported with rna rounding mode."); + break; + case RndMode::RN: + case RndMode::RZ: + if (getSat() != NVVM::SaturationMode::NONE) + return emitError( + "Saturation mode not supported with rn/rz rounding modes."); + break; + default: + return emitError( + "Only {rn,rz,rna} rounding modes supported for CvtFloatToTF32Op."); + } + return success(); +} + // Given the element type of an operand and 
whether or not it is an accumulator, // this function returns the PTX type (`NVVM::MMATypes`) that corresponds to the // operand's element type. @@ -1163,6 +1183,26 @@ llvm::Intrinsic::ID CpAsyncBulkTensorReduceOp::getIntrinsicID( llvm_unreachable("Invalid Reduction Op for CpAsyncBulkTensorReduceOp"); } +llvm::Intrinsic::ID CvtFloatToTF32Op::getIntrinsicID(NVVM::FPRoundingMode rnd, + NVVM::SaturationMode sat, + bool hasRelu) { + using RndMode = NVVM::FPRoundingMode; + switch (rnd) { + case RndMode::RN: + return hasRelu ? llvm::Intrinsic::nvvm_f2tf32_rn_relu + : llvm::Intrinsic::nvvm_f2tf32_rn; + case RndMode::RZ: + return hasRelu ? llvm::Intrinsic::nvvm_f2tf32_rz_relu + : llvm::Intrinsic::nvvm_f2tf32_rz; + case RndMode::RNA: + return (sat == NVVM::SaturationMode::SATFINITE) + ? llvm::Intrinsic::nvvm_f2tf32_rna_satfinite + : llvm::Intrinsic::nvvm_f2tf32_rna; + default: + llvm_unreachable("Invalid RoundingMode for CvtFloatToTF32Op"); + } +} + /// Infer the result ranges for the NVVM SpecialRangeableRegisterOp that might /// have ConstantRangeAttr. static void nvvmInferResultRanges(Operation *op, Value result, diff --git a/mlir/lib/Dialect/Linalg/Transforms/Split.cpp b/mlir/lib/Dialect/Linalg/Transforms/Split.cpp index 47b5fcd4014a0..671dea8bb415f 100644 --- a/mlir/lib/Dialect/Linalg/Transforms/Split.cpp +++ b/mlir/lib/Dialect/Linalg/Transforms/Split.cpp @@ -40,7 +40,7 @@ createSplitPart(RewriterBase &b, Location loc, TilingInterface op, sizesCopy[dimension] = size; offsetsCopy[dimension] = offset; - // Create the part as it it were a single tile. + // Create the part as if it were a single tile. 
FailureOr tilingResult = op.getTiledImplementation(b, offsetsCopy, sizesCopy); diff --git a/mlir/lib/Dialect/SPIRV/IR/SPIRVOps.cpp b/mlir/lib/Dialect/SPIRV/IR/SPIRVOps.cpp index 26559c1321db5..040bf6a34cea7 100644 --- a/mlir/lib/Dialect/SPIRV/IR/SPIRVOps.cpp +++ b/mlir/lib/Dialect/SPIRV/IR/SPIRVOps.cpp @@ -1698,6 +1698,33 @@ LogicalResult spirv::TransposeOp::verify() { return success(); } +//===----------------------------------------------------------------------===// +// spirv.MatrixTimesVector +//===----------------------------------------------------------------------===// + +LogicalResult spirv::MatrixTimesVectorOp::verify() { + auto matrixType = llvm::cast(getMatrix().getType()); + auto vectorType = llvm::cast(getVector().getType()); + auto resultType = llvm::cast(getType()); + + if (matrixType.getNumColumns() != vectorType.getNumElements()) + return emitOpError("matrix columns (") + << matrixType.getNumColumns() << ") must match vector operand size (" + << vectorType.getNumElements() << ")"; + + if (resultType.getNumElements() != matrixType.getNumRows()) + return emitOpError("result size (") + << resultType.getNumElements() << ") must match the matrix rows (" + << matrixType.getNumRows() << ")"; + + auto matrixElementType = matrixType.getElementType(); + if (matrixElementType != vectorType.getElementType() || + matrixElementType != resultType.getElementType()) + return emitOpError("matrix, vector, and result element types must match"); + + return success(); +} + //===----------------------------------------------------------------------===// // spirv.MatrixTimesMatrix //===----------------------------------------------------------------------===// diff --git a/mlir/lib/IR/Builders.cpp b/mlir/lib/IR/Builders.cpp index edc4709ec3c6e..8439b063f2634 100644 --- a/mlir/lib/IR/Builders.cpp +++ b/mlir/lib/IR/Builders.cpp @@ -35,62 +35,56 @@ Location Builder::getFusedLoc(ArrayRef locs, Attribute metadata) { 
//===----------------------------------------------------------------------===// FloatType Builder::getFloat4E2M1FNType() { - return FloatType::getFloat4E2M1FN(context); + return Float4E2M1FNType::get(context); } FloatType Builder::getFloat6E2M3FNType() { - return FloatType::getFloat6E2M3FN(context); + return Float6E2M3FNType::get(context); } FloatType Builder::getFloat6E3M2FNType() { - return FloatType::getFloat6E3M2FN(context); + return Float6E3M2FNType::get(context); } -FloatType Builder::getFloat8E5M2Type() { - return FloatType::getFloat8E5M2(context); -} +FloatType Builder::getFloat8E5M2Type() { return Float8E5M2Type::get(context); } -FloatType Builder::getFloat8E4M3Type() { - return FloatType::getFloat8E4M3(context); -} +FloatType Builder::getFloat8E4M3Type() { return Float8E4M3Type::get(context); } FloatType Builder::getFloat8E4M3FNType() { - return FloatType::getFloat8E4M3FN(context); + return Float8E4M3FNType::get(context); } FloatType Builder::getFloat8E5M2FNUZType() { - return FloatType::getFloat8E5M2FNUZ(context); + return Float8E5M2FNUZType::get(context); } FloatType Builder::getFloat8E4M3FNUZType() { - return FloatType::getFloat8E4M3FNUZ(context); + return Float8E4M3FNUZType::get(context); } FloatType Builder::getFloat8E4M3B11FNUZType() { - return FloatType::getFloat8E4M3B11FNUZ(context); + return Float8E4M3B11FNUZType::get(context); } -FloatType Builder::getFloat8E3M4Type() { - return FloatType::getFloat8E3M4(context); -} +FloatType Builder::getFloat8E3M4Type() { return Float8E3M4Type::get(context); } FloatType Builder::getFloat8E8M0FNUType() { - return FloatType::getFloat8E8M0FNU(context); + return Float8E8M0FNUType::get(context); } -FloatType Builder::getBF16Type() { return FloatType::getBF16(context); } +FloatType Builder::getBF16Type() { return BFloat16Type::get(context); } -FloatType Builder::getF16Type() { return FloatType::getF16(context); } +FloatType Builder::getF16Type() { return Float16Type::get(context); } -FloatType 
Builder::getTF32Type() { return FloatType::getTF32(context); } +FloatType Builder::getTF32Type() { return FloatTF32Type::get(context); } -FloatType Builder::getF32Type() { return FloatType::getF32(context); } +FloatType Builder::getF32Type() { return Float32Type::get(context); } -FloatType Builder::getF64Type() { return FloatType::getF64(context); } +FloatType Builder::getF64Type() { return Float64Type::get(context); } -FloatType Builder::getF80Type() { return FloatType::getF80(context); } +FloatType Builder::getF80Type() { return Float80Type::get(context); } -FloatType Builder::getF128Type() { return FloatType::getF128(context); } +FloatType Builder::getF128Type() { return Float128Type::get(context); } IndexType Builder::getIndexType() { return IndexType::get(context); } diff --git a/mlir/lib/IR/BuiltinTypes.cpp b/mlir/lib/IR/BuiltinTypes.cpp index 41b794bc0aec5..bd1163bddf7ee 100644 --- a/mlir/lib/IR/BuiltinTypes.cpp +++ b/mlir/lib/IR/BuiltinTypes.cpp @@ -117,23 +117,23 @@ FLOAT_TYPE_SEMANTICS(Float128Type, IEEEquad) FloatType Float16Type::scaleElementBitwidth(unsigned scale) const { if (scale == 2) - return FloatType::getF32(getContext()); + return Float32Type::get(getContext()); if (scale == 4) - return FloatType::getF64(getContext()); + return Float64Type::get(getContext()); return FloatType(); } FloatType BFloat16Type::scaleElementBitwidth(unsigned scale) const { if (scale == 2) - return FloatType::getF32(getContext()); + return Float32Type::get(getContext()); if (scale == 4) - return FloatType::getF64(getContext()); + return Float64Type::get(getContext()); return FloatType(); } FloatType Float32Type::scaleElementBitwidth(unsigned scale) const { if (scale == 2) - return FloatType::getF64(getContext()); + return Float64Type::get(getContext()); return FloatType(); } diff --git a/mlir/lib/IR/Diagnostics.cpp b/mlir/lib/IR/Diagnostics.cpp index 7eb3d5bcd07f1..19b32120f5890 100644 --- a/mlir/lib/IR/Diagnostics.cpp +++ b/mlir/lib/IR/Diagnostics.cpp @@ -519,6 +519,10 
@@ void SourceMgrDiagnosticHandler::emitDiagnostic(Diagnostic &diag) { } } +void SourceMgrDiagnosticHandler::setCallStackLimit(unsigned limit) { + callStackLimit = limit; +} + /// Get a memory buffer for the given file, or nullptr if one is not found. const llvm::MemoryBuffer * SourceMgrDiagnosticHandler::getBufferForFile(StringRef filename) { diff --git a/mlir/lib/Target/LLVMIR/DataLayoutImporter.cpp b/mlir/lib/Target/LLVMIR/DataLayoutImporter.cpp index 35001757f214e..35fdbc0be22c3 100644 --- a/mlir/lib/Target/LLVMIR/DataLayoutImporter.cpp +++ b/mlir/lib/Target/LLVMIR/DataLayoutImporter.cpp @@ -29,15 +29,15 @@ FloatType mlir::LLVM::detail::getFloatType(MLIRContext *context, unsigned width) { switch (width) { case 16: - return FloatType::getF16(context); + return Float16Type::get(context); case 32: - return FloatType::getF32(context); + return Float32Type::get(context); case 64: - return FloatType::getF64(context); + return Float64Type::get(context); case 80: - return FloatType::getF80(context); + return Float80Type::get(context); case 128: - return FloatType::getF128(context); + return Float128Type::get(context); default: return {}; } diff --git a/mlir/lib/Target/LLVMIR/Dialect/OpenMP/OpenMPToLLVMIRTranslation.cpp b/mlir/lib/Target/LLVMIR/Dialect/OpenMP/OpenMPToLLVMIRTranslation.cpp index c7dce5d6c6556..de3c1ab1a3f5e 100644 --- a/mlir/lib/Target/LLVMIR/Dialect/OpenMP/OpenMPToLLVMIRTranslation.cpp +++ b/mlir/lib/Target/LLVMIR/Dialect/OpenMP/OpenMPToLLVMIRTranslation.cpp @@ -270,7 +270,6 @@ static LogicalResult checkImplementationStatus(Operation &op) { .Case([&](omp::TaskOp op) { checkAllocate(op, result); checkInReduction(op, result); - checkPriority(op, result); checkUntied(op, result); }) .Case([&](omp::TaskgroupOp op) { @@ -281,6 +280,10 @@ static LogicalResult checkImplementationStatus(Operation &op) { checkDepend(op, result); checkNowait(op, result); }) + .Case([&](omp::TaskloopOp op) { + // TODO: Add other clauses check + checkPriority(op, result); + }) 
.Case([&](omp::WsloopOp op) { checkAllocate(op, result); checkLinear(op, result); @@ -1347,11 +1350,9 @@ allocatePrivateVars(llvm::IRBuilderBase &builder, // Allocate private vars llvm::BranchInst *allocaTerminator = llvm::cast(allocaIP.getBlock()->getTerminator()); - if (allocaTerminator->getNumSuccessors() != 1) { - splitBB(llvm::OpenMPIRBuilder::InsertPointTy( - allocaIP.getBlock(), allocaTerminator->getIterator()), - true, "omp.region.after_alloca"); - } + splitBB(llvm::OpenMPIRBuilder::InsertPointTy(allocaIP.getBlock(), + allocaTerminator->getIterator()), + true, "omp.region.after_alloca"); llvm::IRBuilderBase::InsertPointGuard guard(builder); // Update the allocaTerminator in case the alloca block was split above. @@ -1797,7 +1798,8 @@ convertOmpTaskOp(omp::TaskOp taskOp, llvm::IRBuilderBase &builder, moduleTranslation.lookupValue(taskOp.getFinal()), moduleTranslation.lookupValue(taskOp.getIfExpr()), dds, taskOp.getMergeable(), - moduleTranslation.lookupValue(taskOp.getEventHandle())); + moduleTranslation.lookupValue(taskOp.getEventHandle()), + moduleTranslation.lookupValue(taskOp.getPriority())); if (failed(handleError(afterIP, *taskOp))) return failure(); @@ -1887,6 +1889,59 @@ convertOmpWsloop(Operation &opInst, llvm::IRBuilderBase &builder, llvm::OpenMPIRBuilder::InsertPointTy allocaIP = findAllocaInsertPoint(builder, moduleTranslation); + // The following loop is workaround until we private ops' alloca regions to be + // "pure". See + // https://discourse.llvm.org/t/rfc-openmp-supporting-delayed-task-execution-with-firstprivate-variables/83084/7 + // and https://discourse.llvm.org/t/delayed-privatization-for-omp-wsloop/83989 + // for more info. 
+ for (auto [privateVar, privateDeclOp] : + llvm::zip_equal(mlirPrivateVars, privateDecls)) { + llvm::Value *llvmValue = moduleTranslation.lookupValue(privateVar); + bool isAllocArgUsed = + !privateDeclOp.getAllocRegion().args_begin()->use_empty(); + + // If the alloc region argument is not used, we can skip the workaround. + if (!isAllocArgUsed) + continue; + + llvm::Instruction *definingInst = + llvm::dyn_cast(llvmValue); + + // If the alloc region argument is not defined by an op, it has to dominate + // the current alloc IP. So we skip the workaround. + if (!definingInst) + continue; + + llvm::BasicBlock *definingBlock = definingInst->getParent(); + llvm::Function *definingFun = definingBlock->getParent(); + llvm::Function *allocaFun = allocaIP.getBlock()->getParent(); + + // If the alloc region argument is defined in a different function that + // current one where allocs are being inserted (for example, we are building + // the outlined function of a target region), we skip the workaround. + if (definingFun != allocaFun) + continue; + + llvm::DominatorTree dt(*definingFun); + // If the defining instruction of the alloc region argument dominates the + // alloca insertion point already, we can skip the workaround. + if (dt.dominates(definingInst, allocaIP.getPoint())) + continue; + + // If all the above conditions are violated, then we have to move the alloca + // insertion point below the defining instruction. 
+ + if (definingBlock->getTerminator() == nullptr) { + assert(builder.GetInsertBlock() == definingBlock); + builder.SetInsertPoint(splitBB(llvm::OpenMPIRBuilder::InsertPointTy( + definingBlock, definingBlock->end()), + true, "omp.region.after_defining_block")); + } + + allocaIP = llvm::OpenMPIRBuilder::InsertPointTy( + definingBlock, definingBlock->getTerminator()->getIterator()); + } + SmallVector privateReductionVariables( wsloopOp.getNumReductionVars()); diff --git a/mlir/lib/Target/LLVMIR/ModuleImport.cpp b/mlir/lib/Target/LLVMIR/ModuleImport.cpp index eba86f06d0905..f6826a2362bfd 100644 --- a/mlir/lib/Target/LLVMIR/ModuleImport.cpp +++ b/mlir/lib/Target/LLVMIR/ModuleImport.cpp @@ -816,7 +816,7 @@ static TypedAttr getScalarConstantAsAttr(OpBuilder &builder, llvm::Type *type = constFloat->getType(); FloatType floatType = type->isBFloatTy() - ? FloatType::getBF16(context) + ? BFloat16Type::get(context) : LLVM::detail::getFloatType(context, type->getScalarSizeInBits()); if (!floatType) { emitError(UnknownLoc::get(builder.getContext())) diff --git a/mlir/test/Conversion/ArithToEmitC/arith-to-emitc-unsupported.mlir b/mlir/test/Conversion/ArithToEmitC/arith-to-emitc-unsupported.mlir index 5feffd88e4fb0..c082d02a06835 100644 --- a/mlir/test/Conversion/ArithToEmitC/arith-to-emitc-unsupported.mlir +++ b/mlir/test/Conversion/ArithToEmitC/arith-to-emitc-unsupported.mlir @@ -261,3 +261,43 @@ func.func @arith_remui_vector(%arg0: vector<5xi32>, %arg1: vector<5xi32>) -> vec %divui = arith.remui %arg0, %arg1 : vector<5xi32> return %divui: vector<5xi32> } + +// ----- + +func.func @arith_truncf(%arg0: f64) -> f32 { + // expected-error @+1 {{failed to legalize operation 'arith.truncf'}} + %truncd = arith.truncf %arg0 to_nearest_away : f64 to f32 + return %truncd : f32 +} + +// ----- + +func.func @arith_extf_f128(%arg0: f32) -> f128 { + // expected-error @+1 {{failed to legalize operation 'arith.extf'}} + %extd = arith.extf %arg0 : f32 to f128 + return %extd : f128 +} + +// ----- 
+ +func.func @arith_truncf_f128(%arg0: f128) -> f32 { + // expected-error @+1 {{failed to legalize operation 'arith.truncf'}} + %truncd = arith.truncf %arg0 : f128 to f32 + return %truncd : f32 +} + +// ----- + +func.func @arith_extf_vector(%arg0: vector<4xf32>) -> vector<4xf64> { + // expected-error @+1 {{failed to legalize operation 'arith.extf'}} + %extd = arith.extf %arg0 : vector<4xf32> to vector<4xf64> + return %extd : vector<4xf64> +} + +// ----- + +func.func @arith_truncf_vector(%arg0: vector<4xf64>) -> vector<4xf32> { + // expected-error @+1 {{failed to legalize operation 'arith.truncf'}} + %truncd = arith.truncf %arg0 : vector<4xf64> to vector<4xf32> + return %truncd : vector<4xf32> +} diff --git a/mlir/test/Conversion/ArithToEmitC/arith-to-emitc.mlir b/mlir/test/Conversion/ArithToEmitC/arith-to-emitc.mlir index d2ecbaf0baa58..8ec31f96b86c4 100644 --- a/mlir/test/Conversion/ArithToEmitC/arith-to-emitc.mlir +++ b/mlir/test/Conversion/ArithToEmitC/arith-to-emitc.mlir @@ -754,3 +754,29 @@ func.func @arith_extf_truncf(%arg0: f32, %arg1: f64) { return } + +// ----- + +func.func @arith_extf(%arg0: f16) -> f64 { + // CHECK-LABEL: arith_extf + // CHECK-SAME: (%[[Arg0:[^ ]*]]: f16) + // CHECK: %[[Extd0:.*]] = emitc.cast %[[Arg0]] : f16 to f32 + %extd0 = arith.extf %arg0 : f16 to f32 + // CHECK: %[[Extd1:.*]] = emitc.cast %[[Extd0]] : f32 to f64 + %extd1 = arith.extf %extd0 : f32 to f64 + + return %extd1 : f64 +} + +// ----- + +func.func @arith_truncf(%arg0: f64) -> f16 { + // CHECK-LABEL: arith_truncf + // CHECK-SAME: (%[[Arg0:[^ ]*]]: f64) + // CHECK: %[[Truncd0:.*]] = emitc.cast %[[Arg0]] : f64 to f32 + %truncd0 = arith.truncf %arg0 : f64 to f32 + // CHECK: %[[Truncd1:.*]] = emitc.cast %[[Truncd0]] : f32 to f16 + %truncd1 = arith.truncf %truncd0 : f32 to f16 + + return %truncd1 : f16 +} diff --git a/mlir/test/Dialect/Affine/SuperVectorize/vectorize_reduction.mlir b/mlir/test/Dialect/Affine/SuperVectorize/vectorize_reduction.mlir index 
29c42fcd50bd7..b616632a6fe24 100644 --- a/mlir/test/Dialect/Affine/SuperVectorize/vectorize_reduction.mlir +++ b/mlir/test/Dialect/Affine/SuperVectorize/vectorize_reduction.mlir @@ -638,13 +638,13 @@ func.func @vecdim_reduction_complex_ub(%in: memref<256x512xf32>, %out: memref<25 return } -// CHECK: #[[$map3:.*]] = affine_map<([[d0:.*]], [[d1:.*]]) -> ([[d0]], [[d1]] * 2)> -// CHECK: #[[$map3_sub:.*]] = affine_map<([[d0:.*]], [[d1:.*]]) -> ([[d0]] - [[d1]])> +// CHECK: #[[$map3:.*]] = affine_map<(d0, d1) -> (d0, d1 * 2)> +// CHECK: #[[$map3_sub:.*]] = affine_map<(d0)[s0] -> (-d0 + s0)> // CHECK-LABEL: @vecdim_reduction_complex_ub // CHECK: %[[vzero:.*]] = arith.constant dense<0.000000e+00> : vector<128xf32> // CHECK: %{{.*}} = affine.for %[[iv:.*]] = 0 to min #[[$map3]](%[[M:.*]], %[[N:.*]]) step 128 iter_args(%[[red_iter:.*]] = {{.*}}) -> (vector<128xf32>) { // CHECK: %[[ub:.*]] = affine.min #[[$map3]](%[[M]], %[[N]]) -// CHECK: %[[elems_left:.*]] = affine.apply #[[$map3_sub]](%[[ub]], %[[iv]]) +// CHECK: %[[elems_left:.*]] = affine.apply #[[$map3_sub]](%[[iv]])[%[[ub]]] // CHECK: %[[mask:.*]] = vector.create_mask %[[elems_left]] : vector<128xi1> // CHECK: %[[ld:.*]] = vector.transfer_read %{{.*}} : memref<256x512xf32>, vector<128xf32> // CHECK: %[[select:.*]] = arith.select %[[mask]], %[[ld]], %[[vzero]] : vector<128xi1>, vector<128xf32> diff --git a/mlir/test/Dialect/Affine/invalid.mlir b/mlir/test/Dialect/Affine/invalid.mlir index 2694649576464..44e484b9ba598 100644 --- a/mlir/test/Dialect/Affine/invalid.mlir +++ b/mlir/test/Dialect/Affine/invalid.mlir @@ -20,36 +20,6 @@ func.func @affine_apply_resul_non_index(%arg0 : index) { return } -// ----- - -#map = affine_map<(d0)[s0] -> (d0 + s0)> - -func.func @affine_for_lower_bound_invalid_dim(%arg : index) { - affine.for %n0 = 0 to 7 { - %dim = arith.addi %arg, %arg : index - - // expected-error@+1 {{operand cannot be used as a dimension id}} - affine.for %n1 = 0 to #map(%dim)[%arg] { - } - } - return -} - -// ----- 
- -#map = affine_map<(d0)[s0] -> (d0 + s0)> - -func.func @affine_for_upper_bound_invalid_dim(%arg : index) { - affine.for %n0 = 0 to 7 { - %dim = arith.addi %arg, %arg : index - - // expected-error@+1 {{operand cannot be used as a dimension id}} - affine.for %n1 = #map(%dim)[%arg] to 7 { - } - } - return -} - // ----- func.func @affine_load_invalid_dim(%M : memref<10xi32>) { "unknown"() ({ @@ -93,20 +63,6 @@ func.func @affine_for_upper_bound_invalid_sym() { #set0 = affine_set<(i)[N] : (i >= 0, N - i >= 0)> -func.func @affine_if_invalid_dim(%arg : index) { - affine.for %n0 = 0 to 7 { - %dim = arith.addi %arg, %arg : index - // Non-affine operand %dim has been made legal as input to affine.if. - // expected-error@+1 {{operand cannot be used as a symbol}} - affine.if #set0(%dim)[%n0] {} - } - return -} - -// ----- - -#set0 = affine_set<(i)[N] : (i >= 0, N - i >= 0)> - func.func @affine_if_invalid_sym() { affine.for %i0 = 0 to 7 { // expected-error@+1 {{operand cannot be used as a symbol}} diff --git a/mlir/test/Dialect/Affine/ops.mlir b/mlir/test/Dialect/Affine/ops.mlir index c6bfb688db1c1..e3721806989bb 100644 --- a/mlir/test/Dialect/Affine/ops.mlir +++ b/mlir/test/Dialect/Affine/ops.mlir @@ -324,3 +324,88 @@ module attributes {gpu.container_module} { // CHECK: affine.for %[[VAL_4:.*]] = %[[VAL_3]] to %[[VAL_2]] step 32 { // CHECK: } // CHECK: gpu.return + +// ----- + +#map = affine_map<()[s0] -> (s0 mod 32)> + +// CHECK: #[[$ATTR_0:.+]] = affine_map<()[s0] -> (s0 mod 32)> + +// CHECK-LABEL: gpu.func @affine_thread_id + +module { + gpu.module @gpu { + gpu.func @affine_thread_id(%arg0: memref) kernel { + %c3 = arith.constant 3 : index + %dim = memref.dim %arg0, %c3 : memref + %c0 = arith.constant 0 : index + affine.for %arg3 = %c0 to %dim step 32 { + %thread_id_x = gpu.thread_id x + %0 = affine.apply #map()[%thread_id_x] + %c128 = arith.constant 128 : index + affine.for %arg4 = %0 to %c128 step 8 { + %c32 = arith.constant 32 : index + } + } + gpu.return + } + } +} + 
+// CHECK-SAME: (%[[VAL_0:.*]]: memref) kernel { +// CHECK: %[[VAL_1:.*]] = arith.constant 3 : index +// CHECK: %[[VAL_2:.*]] = memref.dim %[[VAL_0]], %[[VAL_1]] : memref +// CHECK: %[[VAL_3:.*]] = arith.constant 0 : index +// CHECK: affine.for %[[VAL_4:.*]] = %[[VAL_3]] to %[[VAL_2]] step 32 { +// CHECK: %[[VAL_5:.*]] = gpu.thread_id x +// CHECK: %[[VAL_6:.*]] = affine.apply #[[$ATTR_0]](){{\[}}%[[VAL_5]]] +// CHECK: %[[VAL_7:.*]] = arith.constant 128 : index +// CHECK: affine.for %{{.*}} = %[[VAL_6]] to %[[VAL_7]] step 8 { + +// ----- + +#map = affine_map<(d0)[s0] -> (d0 + s0)> + +// CHECK: #[[$ATTR_0:.+]] = affine_map<(d0)[s0] -> (d0 + s0)> + +// CHECK-LABEL: func @arith_add_vaild_symbol_upper_bound + +func.func @arith_add_vaild_symbol_upper_bound(%arg : index) { + affine.for %n0 = 0 to 7 { + %dim = arith.addi %arg, %arg : index + affine.for %n1 = 0 to #map(%dim)[%arg] { + } + } + return +} + +// CHECK-SAME: %[[VAL_0:.*]]: index) { +// CHECK: affine.for %[[VAL_1:.*]] = 0 to 7 { +// CHECK: %[[VAL_2:.*]] = arith.addi %[[VAL_0]], %[[VAL_0]] : index +// CHECK: affine.for %[[VAL_3:.*]] = 0 to #[[$ATTR_0]](%[[VAL_2]]){{\[}}%[[VAL_0]]] { +// CHECK: } +// CHECK: } + +// ----- + +#map = affine_map<(d0)[s0] -> (d0 + s0)> + +// CHECK: #[[$ATTR_0:.+]] = affine_map<(d0)[s0] -> (d0 + s0)> + +// CHECK-LABEL: func @arith_add_vaild_symbol_lower_bound + +func.func @arith_add_vaild_symbol_lower_bound(%arg : index) { + affine.for %n0 = 0 to 7 { + %dim = arith.addi %arg, %arg : index + affine.for %n1 = #map(%dim)[%arg] to 7 { + } + } + return +} + +// CHECK-SAME: %[[VAL_0:.*]]: index) { +// CHECK: affine.for %[[VAL_1:.*]] = 0 to 7 { +// CHECK: %[[VAL_2:.*]] = arith.addi %[[VAL_0]], %[[VAL_0]] : index +// CHECK: affine.for %[[VAL_3:.*]] = #[[$ATTR_0]](%[[VAL_2]]){{\[}}%[[VAL_0]]] to 7 { +// CHECK: } +// CHECK: } diff --git a/mlir/test/Dialect/GPU/transform-gpu.mlir b/mlir/test/Dialect/GPU/transform-gpu.mlir index 72572c6a38de1..0a5c85336831a 100644 --- 
a/mlir/test/Dialect/GPU/transform-gpu.mlir +++ b/mlir/test/Dialect/GPU/transform-gpu.mlir @@ -43,7 +43,7 @@ module attributes {transform.with_named_sequence} { !type = memref<2 x 32 x f32> !type1d = memref<32 x f32> -// CHECK-DAG: #[[$MAP:.*]] = affine_map<(d0) -> (d0 floordiv 128)> +// CHECK-DAG: #[[$MAP:.*]] = affine_map<()[s0] -> (s0 floordiv 128)> // CHECK-LABEL: func.func @warpgroup_3d( // CHECK-SAME: %[[ARGX:[0-9a-z]+]]: memref<2x32xf32> @@ -61,7 +61,7 @@ func.func @warpgroup_3d(%x: !type, %y: !type, %t: !type1d, %alpha : f32, %stream // CHECK: gpu.launch // CHECK: %[[TIDX:.*]] = gpu.thread_id x // CHECK: %[[TIDY:.*]] = gpu.thread_id y -// CHECK-DAG: %[[WG:.*]] = affine.apply #[[$MAP]](%[[TIDX]]) +// CHECK-DAG: %[[WG:.*]] = affine.apply #[[$MAP]]()[%[[TIDX]]] // CHECK-DAG: %[[CMPX:.*]] = arith.cmpi ult, %[[TIDX]], %[[C384]] : index // CHECK-DAG: %[[CMPY:.*]] = arith.cmpi ult, %[[TIDY]], %[[C1]] : index // CHECK: %[[COND:.*]] = arith.andi %[[CMPX]], %[[CMPY]] : i1 @@ -95,7 +95,7 @@ module attributes {transform.with_named_sequence} { !type = memref<2 x 32 x f32> !type1d = memref<32 x f32> -// CHECK-DAG: #[[$MAP:.*]] = affine_map<(d0) -> (d0 floordiv 16)> +// CHECK-DAG: #map = affine_map<()[s0] -> (s0 floordiv 16)> // CHECK-LABEL: func.func @warp_3d( // CHECK-SAME: %[[ARGX:[0-9a-z]+]]: memref<2x32xf32> @@ -114,7 +114,7 @@ func.func @warp_3d(%x: !type, %y: !type, %t: !type1d, %alpha : f32, %stream : !g // CHECK: gpu.launch // CHECK: %[[TIDX:.*]] = gpu.thread_id x // CHECK: %[[TIDY:.*]] = gpu.thread_id y -// CHECK-DAG: %[[W:.*]] = affine.apply #[[$MAP]](%[[TIDX]]) +// CHECK-DAG: %[[W:.*]] = affine.apply #[[$MAP]]()[%[[TIDX]]] // CHECK-DAG: %[[CMPX:.*]] = arith.cmpi ult, %[[TIDX]], %[[C32]] : index // CHECK-DAG: %[[CMPY:.*]] = arith.cmpi ult, %[[TIDY]], %[[C3]] : index // CHECK: %[[COND:.*]] = arith.andi %[[CMPX]], %[[CMPY]] : i1 @@ -354,9 +354,9 @@ module attributes {transform.with_named_sequence} { !type = memref<2 x 32 x f32> !type1d = memref<32 x f32> -// 
CHECK-DAG: #[[$MAPWGLIN:.*]] = affine_map<(d0, d1, d2) -> (d0 + d1 * 32 + d2 * 256)> -// CHECK-DAG: #[[$MAPWGX:.*]] = affine_map<(d0, d1) -> (((d0 + d1 * 32) floordiv 128) mod 2)> -// CHECK-DAG: #[[$MAPWGY:.*]] = affine_map<(d0, d1, d2) -> (d2 + ((d0 + d1 * 32) floordiv 128) floordiv 2)> +// CHECK-DAG: #[[$MAPWGLIN:.*]] = affine_map<()[s0, s1, s2] -> (s0 + s1 * 32 + s2 * 256)> +// CHECK-DAG: #[[$MAPWGX:.*]] = affine_map<()[s0, s1] -> (((s0 + s1 * 32) floordiv 128) mod 2)> +// CHECK-DAG: #[[$MAPWGY:.*]] = affine_map<()[s0, s1, s2] -> (s2 + ((s0 + s1 * 32) floordiv 128) floordiv 2)> // CHECK-LABEL: func.func @warpgroup_linear( // CHECK-SAME: %[[ARGX:[0-9a-z]+]]: memref<2x32xf32> @@ -376,9 +376,9 @@ func.func @warpgroup_linear(%x: !type, %y: !type, %t: !type1d, %alpha : f32, %st // CHECK-DAG: %[[TIDX:.*]] = gpu.thread_id x // CHECK-DAG: %[[TIDY:.*]] = gpu.thread_id y // CHECK-DAG: %[[TIDZ:.*]] = gpu.thread_id z -// CHECK-DAG: %[[WIDLIN:.*]] = affine.apply #[[$MAPWGLIN]](%[[TIDX]], %[[TIDY]], %[[TIDZ]]) -// CHECK-DAG: %[[WIDX:.*]] = affine.apply #[[$MAPWGX]](%[[TIDX]], %[[TIDY]]) -// CHECK-DAG: %[[WIDY:.*]] = affine.apply #[[$MAPWGY]](%[[TIDX]], %[[TIDY]], %[[TIDZ]]) +// CHECK-DAG: %[[WIDLIN:.*]] = affine.apply #[[$MAPWGLIN]]()[%[[TIDX]], %[[TIDY]], %[[TIDZ]]] +// CHECK-DAG: %[[WIDX:.*]] = affine.apply #[[$MAPWGX]]()[%[[TIDX]], %[[TIDY]]] +// CHECK-DAG: %[[WIDY:.*]] = affine.apply #[[$MAPWGY]]()[%[[TIDX]], %[[TIDY]], %[[TIDZ]]] // CHECK-DAG: %[[CMPLIN:.*]] = arith.cmpi ult, %[[WIDLIN]], %[[C768]] : index // CHECK: scf.if %[[CMPLIN]] // CHECK: memref.load %[[ARGX]][%[[WIDX]], %[[WIDY]]] @@ -410,9 +410,9 @@ module attributes {transform.with_named_sequence} { !type = memref<2 x 32 x f32> !type1d = memref<32 x f32> -// CHECK-DAG: #[[$MAPWLIN:.*]] = affine_map<(d0, d1, d2) -> (d0 + d1 * 32 + d2 * 256)> -// CHECK-DAG: #[[$MAPWX:.*]] = affine_map<(d0, d1, d2) -> ((d1 + d2 * 8 + d0 floordiv 32) mod 2)> -// CHECK-DAG: #[[$MAPWY:.*]] = affine_map<(d0, d1, d2) -> ((d1 + d2 * 8 + 
d0 floordiv 32) floordiv 2)> +// CHECK-DAG: #[[$MAPWLIN:.*]] = affine_map<()[s0, s1, s2] -> (s0 + s1 * 32 + s2 * 256)> +// CHECK-DAG: #[[$MAPWX:.*]] = affine_map<()[s0, s1, s2] -> ((s1 + s2 * 8 + s0 floordiv 32) mod 2)> +// CHECK-DAG: #[[$MAPWY:.*]] = affine_map<()[s0, s1, s2] -> ((s1 + s2 * 8 + s0 floordiv 32) floordiv 2)> // CHECK-LABEL: func.func @warp_linear( // CHECK-SAME: %[[ARGX:[0-9a-z]+]]: memref<2x32xf32> @@ -432,9 +432,9 @@ func.func @warp_linear(%x: !type, %y: !type, %t: !type1d, %alpha : f32, %stream // CHECK-DAG: %[[TIDX:.*]] = gpu.thread_id x // CHECK-DAG: %[[TIDY:.*]] = gpu.thread_id y // CHECK-DAG: %[[TIDZ:.*]] = gpu.thread_id z -// CHECK-DAG: %[[WIDLIN:.*]] = affine.apply #[[$MAPWLIN]](%[[TIDX]], %[[TIDY]], %[[TIDZ]]) -// CHECK-DAG: %[[WIDX:.*]] = affine.apply #[[$MAPWX]](%[[TIDX]], %[[TIDY]], %[[TIDZ]]) -// CHECK-DAG: %[[WIDY:.*]] = affine.apply #[[$MAPWY]](%[[TIDX]], %[[TIDY]], %[[TIDZ]]) +// CHECK-DAG: %[[WIDLIN:.*]] = affine.apply #[[$MAPWLIN]]()[%[[TIDX]], %[[TIDY]], %[[TIDZ]]] +// CHECK-DAG: %[[WIDX:.*]] = affine.apply #[[$MAPWX]]()[%[[TIDX]], %[[TIDY]], %[[TIDZ]]] +// CHECK-DAG: %[[WIDY:.*]] = affine.apply #[[$MAPWY]]()[%[[TIDX]], %[[TIDY]], %[[TIDZ]]] // CHECK-DAG: %[[CMPLIN:.*]] = arith.cmpi ult, %[[WIDLIN]], %[[C192]] : index // CHECK: scf.if %[[CMPLIN]] // CHECK: memref.load %[[ARGX]][%[[WIDX]], %[[WIDY]]] @@ -466,12 +466,12 @@ module attributes {transform.with_named_sequence} { !type = memref<2 x 32 x f32> !type1d = memref<32 x f32> -// CHECK-DAG: #[[$MAPWX:.*]] = affine_map<(d0, d1) -> (((d0 + d1 * 18) floordiv 32) mod 3)> -// CHECK-DAG: #[[$MAPWY:.*]] = affine_map<(d0, d1) -> ((((d0 + d1 * 18) floordiv 32) mod 6) floordiv 3)> +// CHECK-DAG: #[[$MAPWX:.*]] = affine_map<()[s0, s1] -> (((s0 + s1 * 18) floordiv 32) mod 3)> +// CHECK-DAG: #[[$MAPWY:.*]] = affine_map<()[s0, s1] -> ((((s0 + s1 * 18) floordiv 32) mod 6) floordiv 3)> -// CHECK-DAG: #[[$MAPLIN:.*]] = affine_map<(d0, d1) -> (d0 + d1 * 18)> -// CHECK-DAG: #[[$MAPLX:.*]] = 
affine_map<(d0, d1) -> ((d0 + d1 * 18) mod 10)> -// CHECK-DAG: #[[$MAPLY:.*]] = affine_map<(d0, d1) -> ((d0 + d1 * 18) floordiv 10)> +// CHECK-DAG: #[[$MAPLIN:.*]] = affine_map<()[s0, s1] -> (s0 + s1 * 18)> +// CHECK-DAG: #[[$MAPLX:.*]] = affine_map<()[s0, s1] -> ((s0 + s1 * 18) mod 10)> +// CHECK-DAG: #[[$MAPLY:.*]] = affine_map<()[s0, s1] -> ((s0 + s1 * 18) floordiv 10)> // CHECK-LABEL: func.func @map_multi_level_linear( func.func @map_multi_level_linear(%x: !type, %y: !type, %t: !type1d, %alpha : f32, %stream : !gpu.async.token) -> !type { @@ -504,9 +504,9 @@ func.func @map_multi_level_linear(%x: !type, %y: !type, %t: !type1d, %alpha : f3 memref.store %6, %y[%i, %j] : !type } { mapping = [#gpu.thread, #gpu.thread]} - // CHECK-DAG: %[[LIN:.*]] = affine.apply #[[$MAPLIN]](%[[TIDX]], %[[TIDY]]) - // CHECK-DAG: %[[WIDX:.*]] = affine.apply #[[$MAPWX]](%[[TIDX]], %[[TIDY]]) - // CHECK-DAG: %[[WIDY:.*]] = affine.apply #[[$MAPWY]](%[[TIDX]], %[[TIDY]]) + // CHECK-DAG: %[[LIN:.*]] = affine.apply #[[$MAPLIN]]()[%[[TIDX]], %[[TIDY]]] + // CHECK-DAG: %[[WIDX:.*]] = affine.apply #[[$MAPWX]]()[%[[TIDX]], %[[TIDY]]] + // CHECK-DAG: %[[WIDY:.*]] = affine.apply #[[$MAPWY]]()[%[[TIDX]], %[[TIDY]]] // CHECK-DAG: %[[CMPLIN:.*]] = arith.cmpi ult, %[[LIN]], %[[C192]] : index // CHECK: scf.if %[[CMPLIN]] scf.forall (%i, %j, %k) in (%c3, %c2, %c1) { @@ -515,8 +515,8 @@ func.func @map_multi_level_linear(%x: !type, %y: !type, %t: !type1d, %alpha : f3 memref.store %8, %y[%i, %j] : !type } {mapping = [#gpu.warp, #gpu.warp, #gpu.warp] } - // CHECK-DAG: %[[LIDX:.*]] = affine.apply #[[$MAPLX]](%[[TIDX]], %[[TIDY]]) - // CHECK-DAG: %[[LIDY:.*]] = affine.apply #[[$MAPLY]](%[[TIDX]], %[[TIDY]]) + // CHECK-DAG: %[[LIDX:.*]] = affine.apply #[[$MAPLX]]()[%[[TIDX]], %[[TIDY]]] + // CHECK-DAG: %[[LIDY:.*]] = affine.apply #[[$MAPLY]]()[%[[TIDX]], %[[TIDY]]] // CHECK-DAG: %[[COND:.*]] = arith.cmpi ult, %[[LIN]], %[[C20]] : index // CHECK: scf.if %[[COND]] // CHECK: memref.load %{{.*}}[%[[LIDX]]] : 
memref<32xf32> @@ -545,9 +545,9 @@ module attributes {transform.with_named_sequence} { !type = memref<2 x 32 x f32> !type1d = memref<32 x f32> -// CHECK-DAG: #[[$MAPBLIN:.*]] = affine_map<(d0, d1, d2) -> (d0 + d1 * 12 + d2 * 108)> -// CHECK-DAG: #[[$MAPBX:.*]] = affine_map<(d0, d1, d2) -> ((d0 + d1 * 12 + d2 * 108) mod 7)> -// CHECK-DAG: #[[$MAPBY:.*]] = affine_map<(d0, d1, d2) -> ((d0 + d1 * 12 + d2 * 108) floordiv 7)> +// CHECK-DAG: #[[$MAPBLIN:.*]] = affine_map<()[s0, s1, s2] -> (s0 + s1 * 12 + s2 * 108)> +// CHECK-DAG: #[[$MAPBX:.*]] = affine_map<()[s0, s1, s2] -> ((s0 + s1 * 12 + s2 * 108) mod 7)> +// CHECK-DAG: #[[$MAPBY:.*]] = affine_map<()[s0, s1, s2] -> ((s0 + s1 * 12 + s2 * 108) floordiv 7)> // CHECK-LABEL: func.func @block_linear_existing_launch( // CHECK-SAME: %[[ARGX:[0-9a-z]+]]: memref<2x32xf32> @@ -566,9 +566,9 @@ func.func @block_linear_existing_launch( // CHECK-DAG: %[[BIDX:.*]] = gpu.block_id x // CHECK-DAG: %[[BIDY:.*]] = gpu.block_id y // CHECK-DAG: %[[BIDZ:.*]] = gpu.block_id z -// CHECK-DAG: %[[BIDLIN:.*]] = affine.apply #[[$MAPBLIN]](%[[BIDX]], %[[BIDY]], %[[BIDZ]]) -// CHECK-DAG: %[[BLX:.*]] = affine.apply #[[$MAPBX]](%[[BIDX]], %[[BIDY]], %[[BIDZ]]) -// CHECK-DAG: %[[BLY:.*]] = affine.apply #[[$MAPBY]](%[[BIDX]], %[[BIDY]], %[[BIDZ]]) +// CHECK-DAG: %[[BIDLIN:.*]] = affine.apply #[[$MAPBLIN]]()[%[[BIDX]], %[[BIDY]], %[[BIDZ]]] +// CHECK-DAG: %[[BLX:.*]] = affine.apply #[[$MAPBX]]()[%[[BIDX]], %[[BIDY]], %[[BIDZ]]] +// CHECK-DAG: %[[BLY:.*]] = affine.apply #[[$MAPBY]]()[%[[BIDX]], %[[BIDY]], %[[BIDZ]]] // CHECK-DAG: %[[CMPLIN:.*]] = arith.cmpi ult, %[[BIDLIN]], %[[C63]] : index // CHECK: scf.if %[[CMPLIN]] // CHECK: memref.load %[[ARGX]][%[[BLX]], %[[BLY]]] @@ -600,8 +600,8 @@ module attributes {transform.with_named_sequence} { !type = memref<2 x 32 x f32> !type1d = memref<32 x f32> -// CHECK-DAG: #[[$MAPBX:.*]] = affine_map<(d0) -> (d0 mod 7)> -// CHECK-DAG: #[[$MAPBY:.*]] = affine_map<(d0, d1, d2) -> (d1 + d2 * 9 + d0 floordiv 7)> +// 
CHECK-DAG: #[[$MAPBX:.*]] = affine_map<()[s0] -> (s0 mod 7)> +// CHECK-DAG: #[[$MAPBY:.*]] = affine_map<()[s0, s1, s2] -> (s1 + s2 * 9 + s0 floordiv 7)> // CHECK-LABEL: func.func @block_linear_generate_launch( // CHECK-SAME: %[[ARGX:[0-9a-z]+]]: memref<2x32xf32> @@ -620,8 +620,8 @@ func.func @block_linear_generate_launch( // CHECK-DAG: %[[BIDX:.*]] = gpu.block_id x // CHECK-DAG: %[[BIDY:.*]] = gpu.block_id y // CHECK-DAG: %[[BIDZ:.*]] = gpu.block_id z -// CHECK-DAG: %[[BLX:.*]] = affine.apply #[[$MAPBX]](%[[BIDX]]) -// CHECK-DAG: %[[BLY:.*]] = affine.apply #[[$MAPBY]](%[[BIDX]], %[[BIDY]], %[[BIDZ]]) +// CHECK-DAG: %[[BLX:.*]] = affine.apply #[[$MAPBX]]()[%[[BIDX]]] +// CHECK-DAG: %[[BLY:.*]] = affine.apply #[[$MAPBY]]()[%[[BIDX]], %[[BIDY]], %[[BIDZ]]] // CHECK: memref.load %[[ARGX]][%[[BLX]], %[[BLY]]] // CHECK: memref.load %[[ARGY]][%[[BLX]], %[[BLY]]] scf.forall (%i, %j) in (%c7, %c9) { @@ -647,8 +647,8 @@ module attributes {transform.with_named_sequence} { #map = affine_map<(d0) -> (d0 * 128)> #map1 = affine_map<(d0) -> (d0 * 32)> -// CHECK-DAG: #[[$MAPB:.*]] = affine_map<(d0) -> (d0 * 128)> -// CHECK-DAG: #[[$MAPW:.*]] = affine_map<(d0, d1, d2) -> (d2 * 32 + ((d0 + d1 * 4) floordiv 32) * 32)> +// CHECK-DAG: #[[$MAPB:.*]] = affine_map<()[s0] -> (s0 * 128)> +// CHECK-DAG: #[[$MAPW:.*]] = affine_map<()[s0, s1, s2] -> (s2 * 32 + ((s0 + s1 * 4) floordiv 32) * 32)> // CHECK-LABEL: func.func @simple_fill( func.func @simple_fill(%arg0: memref<128xf32>) -> memref<128xf32> { @@ -660,14 +660,14 @@ func.func @simple_fill(%arg0: memref<128xf32>) -> memref<128xf32> { // CHECK: gpu.launch scf.forall (%arg1) in (1) { // CHECK: %[[BIDX:.*]] = gpu.block_id x -// CHECK: %[[BLX:.*]] = affine.apply #[[$MAPB]](%[[BIDX]]) +// CHECK: %[[BLX:.*]] = affine.apply #[[$MAPB]]()[%[[BIDX]]] %0 = affine.apply #map(%arg1) %subview = memref.subview %arg0[%0] [128] [1] : memref<128xf32> to memref<128xf32, strided<[1], offset: ?>> scf.forall (%arg2) in (4) { // CHECK: %[[TIDX:.*]] = 
gpu.thread_id x // CHECK: %[[TIDY:.*]] = gpu.thread_id y // CHECK: %[[TIDZ:.*]] = gpu.thread_id z -// CHECK: %[[THX:.*]] = affine.apply #[[$MAPW]](%[[TIDX]], %[[TIDY]], %[[TIDZ]]) +// CHECK: %[[THX:.*]] = affine.apply #[[$MAPW]]()[%[[TIDX]], %[[TIDY]], %[[TIDZ]]] // CHECK-NOT: scf.if // CHECK: memref.subview %{{.*}}[%[[THX]]] %1 = affine.apply #map1(%arg2) diff --git a/mlir/test/Dialect/LLVMIR/func.mlir b/mlir/test/Dialect/LLVMIR/func.mlir index e2a444c1faaba..74dd862ce8fb2 100644 --- a/mlir/test/Dialect/LLVMIR/func.mlir +++ b/mlir/test/Dialect/LLVMIR/func.mlir @@ -479,3 +479,9 @@ llvm.func @intel_reqd_sub_group_size_hint() attributes {llvm.intel_reqd_sub_grou // CHECK-SAME: llvm.workgroup_attribution = #llvm.mlir.workgroup_attribution<512 : i64, i32> // CHECK-SAME: llvm.workgroup_attribution = #llvm.mlir.workgroup_attribution<128 : i64, !llvm.struct<(i32, i64, f32)> llvm.func @workgroup_attribution(%arg0: !llvm.ptr {llvm.workgroup_attribution = #llvm.mlir.workgroup_attribution<512 : i64, i32>}, %arg1: !llvm.ptr {llvm.workgroup_attribution = #llvm.mlir.workgroup_attribution<128 : i64, !llvm.struct<(i32, i64, f32)>>}) + +// ----- + +// CHECK: @constant_range_negative +// CHECK-SAME: llvm.range = #llvm.constant_range +llvm.func @constant_range_negative() -> (i32 {llvm.range = #llvm.constant_range}) diff --git a/mlir/test/Dialect/Linalg/convert-conv2d-to-img2col.mlir b/mlir/test/Dialect/Linalg/convert-conv2d-to-img2col.mlir index c7c846d7ecc9c..c17f20b2d03ab 100644 --- a/mlir/test/Dialect/Linalg/convert-conv2d-to-img2col.mlir +++ b/mlir/test/Dialect/Linalg/convert-conv2d-to-img2col.mlir @@ -40,9 +40,9 @@ module attributes {transform.with_named_sequence} { // CHECK: %[[KINDEX:.+]] = linalg.index 2 : index // Compute input channel/convolved indices. 
-// CHECK: %[[ICINDEX:.+]] = affine.apply affine_map<(d0) -> (d0 mod 4)>(%[[KINDEX]]) -// CHECK: %[[CONVH:.+]] = affine.apply affine_map<(d0, d1) -> (d0 floordiv 14 + d1 floordiv 12)>(%[[MINDEX]], %[[KINDEX]]) -// CHECK: %[[CONVW:.+]] = affine.apply affine_map<(d0, d1) -> (d0 mod 14 + (d1 mod 12) floordiv 4)>(%[[MINDEX]], %[[KINDEX]]) +// CHECK: %[[ICINDEX:.+]] = affine.apply affine_map<()[s0] -> (s0 mod 4)>()[%[[KINDEX]]] +// CHECK: %[[CONVH:.+]] = affine.apply affine_map<()[s0, s1] -> (s0 floordiv 14 + s1 floordiv 12)>()[%[[MINDEX]], %[[KINDEX]]] +// CHECK: %[[CONVW:.+]] = affine.apply affine_map<()[s0, s1] -> (s0 mod 14 + (s1 mod 12) floordiv 4)>()[%[[MINDEX]], %[[KINDEX]]] // Extract from the input tensor. // CHECK: %[[EXTRACTED_INPUT:.+]] = tensor.extract @@ -227,9 +227,9 @@ module attributes {transform.with_named_sequence} { // CHECK-DAG: #[[MAP:.+]] = affine_map<(d0, d1, d2) -> (d0, d1, d2)> // Im2col maps -// CHECK-DAG: #[[MAP1:.+]] = affine_map<(d0) -> (d0 floordiv 9)> -// CHECK-DAG: #[[MAP7:.+]] = affine_map<(d0, d1) -> (d0 floordiv 14 + (d1 mod 9) floordiv 3)> -// CHECK-DAG: #[[MAP8:.+]] = affine_map<(d0, d1) -> (d0 + d1 - (d0 floordiv 14) * 14 - (d1 floordiv 3) * 3)> +// CHECK-DAG: #[[MAP1:.+]] = affine_map<()[s0] -> (s0 floordiv 9)> +// CHECK-DAG: #[[MAP7:.+]] = affine_map<()[s0, s1] -> (s0 floordiv 14 + (s1 mod 9) floordiv 3)> +// CHECK-DAG: #[[MAP8:.+]] = affine_map<()[s0, s1] -> (s0 + s1 - (s0 floordiv 14) * 14 - (s1 floordiv 3) * 3)> // CHECK-DAG: #[[LHSMAP:.+]] = affine_map<(d0, d1, d2, d3) -> (d1, d3)> @@ -251,9 +251,9 @@ module attributes {transform.with_named_sequence} { // CHECK: %[[NINDEX:.+]] = linalg.index 2 : index // Compute input channel/convolved indices. 
-// CHECK: %[[ICINDEX:.+]] = affine.apply #[[MAP1]](%[[KINDEX]]) -// CHECK: %[[CONVH:.+]] = affine.apply #[[MAP7]](%[[NINDEX]], %[[KINDEX]]) -// CHECK: %[[CONVW:.+]] = affine.apply #[[MAP8]](%[[NINDEX]], %[[KINDEX]]) +// CHECK: %[[ICINDEX:.+]] = affine.apply #[[MAP1]]()[%[[KINDEX]]] +// CHECK: %[[CONVH:.+]] = affine.apply #[[MAP7]]()[%[[NINDEX]], %[[KINDEX]]] +// CHECK: %[[CONVW:.+]] = affine.apply #[[MAP8]]()[%[[NINDEX]], %[[KINDEX]]] // Extract from the input tensor. // CHECK: %[[EXTRACTED_INPUT:.+]] = tensor.extract @@ -300,9 +300,9 @@ module attributes {transform.with_named_sequence} { // CHECK: %[[KINDEX:.+]] = linalg.index 2 : index // Compute input channel/convolved indices. -// CHECK: %[[ICINDEX:.+]] = affine.apply affine_map<(d0) -> (d0 mod 4)>(%[[KINDEX]]) -// CHECK: %[[CONVH:.+]] = affine.apply affine_map<(d0, d1) -> (d0 floordiv 14 + d1 floordiv 12)>(%[[MINDEX]], %[[KINDEX]]) -// CHECK: %[[CONVW:.+]] = affine.apply affine_map<(d0, d1) -> (d0 mod 14 + (d1 mod 12) floordiv 4)>(%[[MINDEX]], %[[KINDEX]]) +// CHECK: %[[ICINDEX:.+]] = affine.apply affine_map<()[s0] -> (s0 mod 4)>()[%[[KINDEX]]] +// CHECK: %[[CONVH:.+]] = affine.apply affine_map<()[s0, s1] -> (s0 floordiv 14 + s1 floordiv 12)>()[%[[MINDEX]], %[[KINDEX]]] +// CHECK: %[[CONVW:.+]] = affine.apply affine_map<()[s0, s1] -> (s0 mod 14 + (s1 mod 12) floordiv 4)>()[%[[MINDEX]], %[[KINDEX]]] // Extract from the input tensor. 
// CHECK: %[[EXTRACTED_INPUT:.+]] = tensor.extract diff --git a/mlir/test/Dialect/Linalg/fusion-elementwise-ops.mlir b/mlir/test/Dialect/Linalg/fusion-elementwise-ops.mlir index a0405d9450edd..613c23f35f6b1 100644 --- a/mlir/test/Dialect/Linalg/fusion-elementwise-ops.mlir +++ b/mlir/test/Dialect/Linalg/fusion-elementwise-ops.mlir @@ -986,13 +986,13 @@ module { #map3 = affine_map<(d0, d1) -> (d0, d1)> // CHECK-DAG: [[$MAP0:#[a-zA-Z0-9_]*]] = affine_map<(d0, d1) -> (d0, d1)> -// CHECK-DAG: [[$MAP1:#[a-zA-Z0-9_]*]] = affine_map<(d0) -> (d0 floordiv 4)> +// CHECK-DAG: [[$MAP1:#[a-zA-Z0-9_]*]] = affine_map<()[s0] -> (s0 floordiv 4)> func.func @fuse_and_collapse(%arg0: tensor<3x4xindex>) -> tensor<2x12xindex> { %1 = tensor.empty() : tensor<2x3x4xindex> // CHECK: linalg.generic { // CHECK: %[[INDEX1:[a-zA-Z0-9_]+]] = linalg.index 1 : index - // CHECK-NEXT: %[[MAP:[a-zA-Z0-9_]+]] = affine.apply #map1(%[[INDEX1]]) + // CHECK-NEXT: %[[MAP:[a-zA-Z0-9_]+]] = affine.apply #map1()[%[[INDEX1]]] // CHECK-NEXT: linalg.yield %[[MAP]] : index %2 = linalg.generic {indexing_maps = [#map, #map1], iterator_types = ["parallel", "parallel", "parallel"]} ins(%arg0: tensor<3x4xindex>) outs(%1 : tensor<2x3x4xindex>) { ^bb0(%in: index, %out: index): diff --git a/mlir/test/Dialect/Linalg/tile-indexed.mlir b/mlir/test/Dialect/Linalg/tile-indexed.mlir index b4aa0a33bc592..d96a251b01ccb 100644 --- a/mlir/test/Dialect/Linalg/tile-indexed.mlir +++ b/mlir/test/Dialect/Linalg/tile-indexed.mlir @@ -19,13 +19,13 @@ module attributes {transform.with_named_sequence} { } } -// TILE-10n25-DAG: [[$MAP:#[a-zA-Z0-9_]*]] = affine_map<(d0, d1) -> (d0 + d1)> +// TILE-10n25-DAG: [[$MAP:#[a-zA-Z0-9_]*]] = affine_map<(d0)[s0] -> (d0 + s0)> // TILE-10n25-LABEL: func @indexed_vector // TILE-10n25: %[[C10:.*]] = arith.constant 10 : index // TILE-10n25: scf.for %[[J:.*]] = {{.*}} step %[[C10]] // TILE-10n25: linalg.generic // TILE-10n25: %[[I:.*]] = linalg.index 0 : index -// TILE-10n25: %[[NEW_I:.*]] = affine.apply 
[[$MAP]](%[[I]], %[[J]]) +// TILE-10n25: %[[NEW_I:.*]] = affine.apply [[$MAP]](%[[J]])[%[[I]]] // TILE-10n25: linalg.yield %[[NEW_I]] : index // ----- @@ -51,7 +51,7 @@ module attributes {transform.with_named_sequence} { } } -// TILE-10n25-DAG: [[$MAP:#[a-zA-Z0-9_]*]] = affine_map<(d0, d1) -> (d0 + d1)> +// TILE-10n25-DAG: [[$MAP:#[a-zA-Z0-9_]*]] = affine_map<(d0)[s0] -> (d0 + s0)> // TILE-10n25-LABEL: func @indexed_matrix // TILE-10n25-DAG: %[[C25:.*]] = arith.constant 25 : index // TILE-10n25-DAG: %[[C10:.*]] = arith.constant 10 : index @@ -59,8 +59,8 @@ module attributes {transform.with_named_sequence} { // TILE-10n25: scf.for %[[L:.*]] = {{.*}} step %[[C25]] // TILE-10n25: linalg.generic // TILE-10n25: %[[I:.*]] = linalg.index 0 : index -// TILE-10n25: %[[NEW_I:.*]] = affine.apply [[$MAP]](%[[I]], %[[K]]) +// TILE-10n25: %[[NEW_I:.*]] = affine.apply [[$MAP]](%[[K]])[%[[I]]] // TILE-10n25: %[[J:.*]] = linalg.index 1 : index -// TILE-10n25: %[[NEW_J:.*]] = affine.apply [[$MAP]](%[[J]], %[[L]]) +// TILE-10n25: %[[NEW_J:.*]] = affine.apply [[$MAP]](%[[L]])[%[[J]]] // TILE-10n25: %[[SUM:.*]] = arith.addi %[[NEW_I]], %[[NEW_J]] : index // TILE-10n25: linalg.yield %[[SUM]] : index diff --git a/mlir/test/Dialect/Linalg/transform-op-split.mlir b/mlir/test/Dialect/Linalg/transform-op-split.mlir index 68c849385ba6b..7f0ef401c8422 100644 --- a/mlir/test/Dialect/Linalg/transform-op-split.mlir +++ b/mlir/test/Dialect/Linalg/transform-op-split.mlir @@ -10,7 +10,7 @@ module attributes {transform.with_named_sequence} { func.func private @elem(%arg0: f32, %arg1: index, %arg2: index) -> f32 -// CHECK: #[[$ADD_42_MAP:.+]] = affine_map<(d0) -> (d0 + 42)> +// CHECK: #[[$ADD_42_MAP:.+]] = affine_map<()[s0] -> (s0 + 42)> // CHECK-LABEL: @one_d_static // CHECK-SAME: %[[IN:.+]]: tensor<100xf32>, %[[OUT:.+]]: tensor<100xf32> @@ -30,7 +30,7 @@ func.func @one_d_static(%arg0: tensor<100xf32>, %arg1: tensor<100xf32>) -> tenso // CHECK: ins(%[[IN_SLICE_HIGH]] // CHECK: 
outs(%[[OUT_SLICE_HIGH]] // CHECK: %[[IDX:.+]] = linalg.index 0 - // CHECK: affine.apply #[[$ADD_42_MAP]](%[[IDX]]) + // CHECK: affine.apply #[[$ADD_42_MAP]]()[%[[IDX]]] // CHECK: func.call @elem // CHECK: %[[RES:.+]] = tensor.insert_slice %[[RES_SLICE_HIGH]] into %[[RES_PARTIAL]][42] [58] [1] %0 = linalg.generic { diff --git a/mlir/test/Dialect/SPIRV/IR/gl-ops.mlir b/mlir/test/Dialect/SPIRV/IR/gl-ops.mlir index 3683e5b469b17..beda3872bc8d2 100644 --- a/mlir/test/Dialect/SPIRV/IR/gl-ops.mlir +++ b/mlir/test/Dialect/SPIRV/IR/gl-ops.mlir @@ -541,3 +541,125 @@ func.func @findumsb(%arg0 : i64) -> () { %2 = spirv.GL.FindUMsb %arg0 : i64 return } + +// ----- + +//===----------------------------------------------------------------------===// +// spirv.GL.Distance +//===----------------------------------------------------------------------===// + +func.func @distance_scalar(%arg0 : f32, %arg1 : f32) { + // CHECK: spirv.GL.Distance {{%.*}}, {{%.*}} : f32, f32 -> f32 + %0 = spirv.GL.Distance %arg0, %arg1 : f32, f32 -> f32 + return +} + +func.func @distance_vector(%arg0 : vector<3xf32>, %arg1 : vector<3xf32>) { + // CHECK: spirv.GL.Distance {{%.*}}, {{%.*}} : vector<3xf32>, vector<3xf32> -> f32 + %0 = spirv.GL.Distance %arg0, %arg1 : vector<3xf32>, vector<3xf32> -> f32 + return +} + +// ----- + +func.func @distance_invalid_type(%arg0 : i32, %arg1 : i32) { + // expected-error @+1 {{'spirv.GL.Distance' op operand #0 must be 16/32/64-bit float or vector of 16/32/64-bit float values of length 2/3/4/8/16}} + %0 = spirv.GL.Distance %arg0, %arg1 : i32, i32 -> f32 + return +} + +// ----- + +func.func @distance_arg_mismatch(%arg0 : vector<3xf32>, %arg1 : vector<4xf32>) { + // expected-error @+1 {{'spirv.GL.Distance' op failed to verify that all of {p0, p1} have same type}} + %0 = spirv.GL.Distance %arg0, %arg1 : vector<3xf32>, vector<4xf32> -> f32 + return +} + +// ----- + +func.func @distance_invalid_vector_size(%arg0 : vector<5xf32>, %arg1 : vector<5xf32>) { + // expected-error @+1 
{{'spirv.GL.Distance' op operand #0 must be 16/32/64-bit float or vector of 16/32/64-bit float values of length 2/3/4/8/16}} + %0 = spirv.GL.Distance %arg0, %arg1 : vector<5xf32>, vector<5xf32> -> f32 + return +} + +// ----- + +func.func @distance_invalid_result(%arg0 : f32, %arg1 : f32) { + // expected-error @+1 {{'spirv.GL.Distance' op result #0 must be 16/32/64-bit float}} + %0 = spirv.GL.Distance %arg0, %arg1 : f32, f32 -> i32 + return +} + +// ----- + +//===----------------------------------------------------------------------===// +// spirv.GL.Cross +//===----------------------------------------------------------------------===// + +func.func @cross(%arg0 : vector<3xf32>, %arg1 : vector<3xf32>) { + %2 = spirv.GL.Cross %arg0, %arg1 : vector<3xf32> + // CHECK: %{{.+}} = spirv.GL.Cross %{{.+}}, %{{.+}} : vector<3xf32> + return +} + +// ----- + +func.func @cross_invalid_type(%arg0 : vector<3xi32>, %arg1 : vector<3xi32>) { + // expected-error @+1 {{'spirv.GL.Cross' op operand #0 must be 16/32/64-bit float or vector of 16/32/64-bit float values of length 2/3/4/8/16, but got 'vector<3xi32>'}} + %0 = spirv.GL.Cross %arg0, %arg1 : vector<3xi32> + return +} + +// ----- + +//===----------------------------------------------------------------------===// +// spirv.GL.Normalize +//===----------------------------------------------------------------------===// + +func.func @normalize_scalar(%arg0 : f32) { + %2 = spirv.GL.Normalize %arg0 : f32 + // CHECK: %{{.+}} = spirv.GL.Normalize %{{.+}} : f32 + return +} + +func.func @normalize_vector(%arg0 : vector<3xf32>) { + %2 = spirv.GL.Normalize %arg0 : vector<3xf32> + // CHECK: %{{.+}} = spirv.GL.Normalize %{{.+}} : vector<3xf32> + return +} + +// ----- + +func.func @normalize_invalid_type(%arg0 : i32) { + // expected-error @+1 {{'spirv.GL.Normalize' op operand #0 must be 16/32/64-bit float or vector of 16/32/64-bit float values}} + %0 = spirv.GL.Normalize %arg0 : i32 + return +} + +// ----- + 
+//===----------------------------------------------------------------------===// +// spirv.GL.Reflect +//===----------------------------------------------------------------------===// + +func.func @reflect_scalar(%arg0 : f32, %arg1 : f32) { + %2 = spirv.GL.Reflect %arg0, %arg1 : f32 + // CHECK: %{{.+}} = spirv.GL.Reflect %{{.+}}, %{{.+}} : f32 + return +} + +func.func @reflect_vector(%arg0 : vector<3xf32>, %arg1 : vector<3xf32>) { + %2 = spirv.GL.Reflect %arg0, %arg1 : vector<3xf32> + // CHECK: %{{.+}} = spirv.GL.Reflect %{{.+}}, %{{.+}} : vector<3xf32> + return +} + +// ----- + +func.func @reflect_invalid_type(%arg0 : i32, %arg1 : i32) { + // expected-error @+1 {{'spirv.GL.Reflect' op operand #0 must be 16/32/64-bit float or vector of 16/32/64-bit float values}} + %0 = spirv.GL.Reflect %arg0, %arg1 : i32 + return +} diff --git a/mlir/test/Dialect/SPIRV/IR/matrix-ops.mlir b/mlir/test/Dialect/SPIRV/IR/matrix-ops.mlir index 372fcc6e514b9..37e7514d664ef 100644 --- a/mlir/test/Dialect/SPIRV/IR/matrix-ops.mlir +++ b/mlir/test/Dialect/SPIRV/IR/matrix-ops.mlir @@ -29,6 +29,13 @@ spirv.module Logical GLSL450 requires #spirv.vce { spirv.ReturnValue %result : !spirv.matrix<3 x vector<3xf32>> } + // CHECK-LABEL: @matrix_times_vector_1 + spirv.func @matrix_times_vector_1(%arg0: !spirv.matrix<3 x vector<4xf32>>, %arg1: vector<3xf32>) -> vector<4xf32> "None" { + // CHECK: {{%.*}} = spirv.MatrixTimesVector {{%.*}}, {{%.*}} : !spirv.matrix<3 x vector<4xf32>>, vector<3xf32> -> vector<4xf32> + %result = spirv.MatrixTimesVector %arg0, %arg1 : !spirv.matrix<3 x vector<4xf32>>, vector<3xf32> -> vector<4xf32> + spirv.ReturnValue %result : vector<4xf32> + } + // CHECK-LABEL: @matrix_times_matrix_1 spirv.func @matrix_times_matrix_1(%arg0: !spirv.matrix<3 x vector<3xf32>>, %arg1: !spirv.matrix<3 x vector<3xf32>>) -> !spirv.matrix<3 x vector<3xf32>> "None"{ // CHECK: {{%.*}} = spirv.MatrixTimesMatrix {{%.*}}, {{%.*}} : !spirv.matrix<3 x vector<3xf32>>, !spirv.matrix<3 x vector<3xf32>> -> 
!spirv.matrix<3 x vector<3xf32>> @@ -124,3 +131,27 @@ func.func @matrix_times_matrix_component_type_mismatch_2(%arg0 : !spirv.matrix<3 %result = spirv.MatrixTimesMatrix %arg0, %arg1 : !spirv.matrix<3 x vector<3xf64>>, !spirv.matrix<3 x vector<3xf32>> -> !spirv.matrix<3 x vector<3xf32>> return } + +// ----- + +func.func @matrix_times_vector_element_type_mismatch(%arg0: !spirv.matrix<4 x vector<3xf32>>, %arg1: vector<4xf16>) { + // expected-error @+1 {{matrix, vector, and result element types must match}} + %result = spirv.MatrixTimesVector %arg0, %arg1 : !spirv.matrix<4 x vector<3xf32>>, vector<4xf16> -> vector<3xf32> + return +} + +// ----- + +func.func @matrix_times_vector_row_mismatch(%arg0: !spirv.matrix<4 x vector<3xf32>>, %arg1: vector<4xf32>) { + // expected-error @+1 {{spirv.MatrixTimesVector' op result size (4) must match the matrix rows (3)}} + %result = spirv.MatrixTimesVector %arg0, %arg1 : !spirv.matrix<4 x vector<3xf32>>, vector<4xf32> -> vector<4xf32> + return +} + +// ----- + +func.func @matrix_times_vector_column_mismatch(%arg0: !spirv.matrix<4 x vector<3xf32>>, %arg1: vector<3xf32>) { + // expected-error @+1 {{spirv.MatrixTimesVector' op matrix columns (4) must match vector operand size (3)}} + %result = spirv.MatrixTimesVector %arg0, %arg1 : !spirv.matrix<4 x vector<3xf32>>, vector<3xf32> -> vector<3xf32> + return +} diff --git a/mlir/test/Interfaces/TilingInterface/tile-using-interface.mlir b/mlir/test/Interfaces/TilingInterface/tile-using-interface.mlir index 8eb1311170c66..2d9d7e432d875 100644 --- a/mlir/test/Interfaces/TilingInterface/tile-using-interface.mlir +++ b/mlir/test/Interfaces/TilingInterface/tile-using-interface.mlir @@ -259,14 +259,14 @@ module attributes {transform.with_named_sequence} { transform.yield } } -// CHECK: #[[$MAP_ADD:.+]] = affine_map<(d0, d1) -> (d0 + d1)> +// CHECK: #[[$MAP_ADD:.+]] = affine_map<(d0)[s0] -> (d0 + s0)> // CHECK-LABEL: @indexed_semantics // CHECK: scf.for %[[I0:.+]] = %{{.*}} to %{{.*}} step %{{.*}} 
// CHECK: scf.for %[[I1:.+]] = %{{.*}} to %{{.*}} step %{{.*}} // CHECK: %[[INDEX0:.+]] = linalg.index 0 -// CHECK: %[[INDEX0_AMENDED:.+]] = affine.apply #[[$MAP_ADD]](%[[INDEX0]], %[[I0]]) +// CHECK: %[[INDEX0_AMENDED:.+]] = affine.apply #[[$MAP_ADD]](%[[I0]])[%[[INDEX0]]] // CHECK: %[[INDEX1:.+]] = linalg.index 1 -// CHECK: %[[INDEX1_AMENDED:.+]] = affine.apply #[[$MAP_ADD]](%[[INDEX1]], %[[I1]]) +// CHECK: %[[INDEX1_AMENDED:.+]] = affine.apply #[[$MAP_ADD]](%[[I1]])[%[[INDEX1]]] // CHECK: arith.addi %[[INDEX0_AMENDED]], %[[INDEX1_AMENDED]] // ----- diff --git a/mlir/test/Interfaces/TilingInterface/tile-using-scfforall.mlir b/mlir/test/Interfaces/TilingInterface/tile-using-scfforall.mlir index 53dd0c6a2425c..745a82fc0da75 100644 --- a/mlir/test/Interfaces/TilingInterface/tile-using-scfforall.mlir +++ b/mlir/test/Interfaces/TilingInterface/tile-using-scfforall.mlir @@ -205,7 +205,7 @@ module attributes {transform.with_named_sequence} { // ----- -// CHECK: #[[$MAP_ADD:.+]] = affine_map<(d0, d1) -> (d0 + d1)> +// CHECK: #[[$MAP_ADD:.+]] = affine_map<(d0)[s0] -> (d0 + s0)> func.func @indexed_semantics(%arg0: tensor, %arg1: tensor) -> tensor { // Check that we correctly amend "linalg.index" results. 
@@ -241,9 +241,9 @@ module attributes {transform.with_named_sequence} { // CHECK-LABEL: @indexed_semantics // CHECK: scf.forall (%[[I0:.+]], %[[I1:.+]]) = // CHECK: %[[INDEX0:.+]] = linalg.index 0 -// CHECK: %[[INDEX0_AMENDED:.+]] = affine.apply #[[$MAP_ADD]](%[[INDEX0]], %[[I0]]) +// CHECK: %[[INDEX0_AMENDED:.+]] = affine.apply #[[$MAP_ADD]](%[[I0]])[%[[INDEX0]]] // CHECK: %[[INDEX1:.+]] = linalg.index 1 -// CHECK: %[[INDEX1_AMENDED:.+]] = affine.apply #[[$MAP_ADD]](%[[INDEX1]], %[[I1]]) +// CHECK: %[[INDEX1_AMENDED:.+]] = affine.apply #[[$MAP_ADD]](%[[I1]])[%[[INDEX1]]] // CHECK: arith.addi %[[INDEX0_AMENDED]], %[[INDEX1_AMENDED]] // ----- diff --git a/mlir/test/Target/LLVMIR/nvvm/cvt_tf32.mlir b/mlir/test/Target/LLVMIR/nvvm/cvt_tf32.mlir new file mode 100644 index 0000000000000..90a232e4baac6 --- /dev/null +++ b/mlir/test/Target/LLVMIR/nvvm/cvt_tf32.mlir @@ -0,0 +1,43 @@ +// RUN: mlir-translate -mlir-to-llvmir %s -split-input-file --verify-diagnostics | FileCheck %s + +// CHECK-LABEL: @convert_float_to_tf32_rna +llvm.func @convert_float_to_tf32_rna(%src : f32) -> i32 { + // CHECK: %{{.*}} = call i32 @llvm.nvvm.f2tf32.rna(float %{{.*}}) + %res = nvvm.cvt.float.to.tf32 %src {rnd = #nvvm.fp_rnd_mode} + llvm.return %res : i32 +} + +// CHECK-LABEL: @convert_float_to_tf32_rna_sf +llvm.func @convert_float_to_tf32_rna_sf(%src : f32) -> i32 { + // CHECK: %{{.*}} = call i32 @llvm.nvvm.f2tf32.rna.satfinite(float %{{.*}}) + %res = nvvm.cvt.float.to.tf32 %src {rnd = #nvvm.fp_rnd_mode, sat = #nvvm.sat_mode} + llvm.return %res : i32 +} + +// CHECK-LABEL: @convert_float_to_tf32_rn +llvm.func @convert_float_to_tf32_rn(%src : f32) -> i32 { + // CHECK: %{{.*}} = call i32 @llvm.nvvm.f2tf32.rn(float %{{.*}}) + %res = nvvm.cvt.float.to.tf32 %src {rnd = #nvvm.fp_rnd_mode} + llvm.return %res : i32 +} + +// CHECK-LABEL: @convert_float_to_tf32_rn_relu +llvm.func @convert_float_to_tf32_rn_relu(%src : f32) -> i32 { + // CHECK: %{{.*}} = call i32 @llvm.nvvm.f2tf32.rn.relu(float %{{.*}}) + 
%res = nvvm.cvt.float.to.tf32 %src {rnd = #nvvm.fp_rnd_mode, relu=true} + llvm.return %res : i32 +} + +// CHECK-LABEL: @convert_float_to_tf32_rz +llvm.func @convert_float_to_tf32_rz(%src : f32) -> i32 { + // CHECK: %{{.*}} = call i32 @llvm.nvvm.f2tf32.rz(float %{{.*}}) + %res = nvvm.cvt.float.to.tf32 %src {rnd = #nvvm.fp_rnd_mode} + llvm.return %res : i32 +} + +// CHECK-LABEL: @convert_float_to_tf32_rz_relu +llvm.func @convert_float_to_tf32_rz_relu(%src : f32) -> i32 { + // CHECK: %{{.*}} = call i32 @llvm.nvvm.f2tf32.rz.relu(float %{{.*}}) + %res = nvvm.cvt.float.to.tf32 %src {rnd = #nvvm.fp_rnd_mode, relu=true} + llvm.return %res : i32 +} diff --git a/mlir/test/Target/LLVMIR/nvvmir-invalid.mlir b/mlir/test/Target/LLVMIR/nvvmir-invalid.mlir index 44c7126255dc4..cb08064590bc3 100644 --- a/mlir/test/Target/LLVMIR/nvvmir-invalid.mlir +++ b/mlir/test/Target/LLVMIR/nvvmir-invalid.mlir @@ -106,3 +106,35 @@ llvm.func @tma_reduce_2d_im2col(%src : !llvm.ptr<3>, %tma_desc : !llvm.ptr, %d0 nvvm.cp.async.bulk.tensor.reduce %tma_desc, %src, box[%d0, %d1] {redKind = #nvvm.tma_redux_kind, mode = #nvvm.tma_store_mode}: !llvm.ptr, !llvm.ptr<3> llvm.return } + +// ----- + +llvm.func @convert_float_to_tf32_rna_relu(%src : f32) -> i32 { + // expected-error @below {{Relu not supported with rna rounding mode.}} + %res = nvvm.cvt.float.to.tf32 %src {rnd = #nvvm.fp_rnd_mode, relu=true} + llvm.return %res : i32 +} + +// ----- + +llvm.func @convert_float_to_tf32_rn_sf(%src : f32) -> i32 { + // expected-error @below {{Saturation mode not supported with rn/rz rounding modes.}} + %res = nvvm.cvt.float.to.tf32 %src {rnd = #nvvm.fp_rnd_mode, sat = #nvvm.sat_mode} + llvm.return %res : i32 +} + +// ----- + +llvm.func @convert_float_to_tf32_rz_sf(%src : f32) -> i32 { + // expected-error @below {{Saturation mode not supported with rn/rz rounding modes.}} + %res = nvvm.cvt.float.to.tf32 %src {rnd = #nvvm.fp_rnd_mode, sat = #nvvm.sat_mode} + llvm.return %res : i32 +} + +// ----- + +llvm.func 
@convert_float_to_tf32_no_rnd_mode(%src : f32) -> i32 { + // expected-error @below {{Only {rn,rz,rna} rounding modes supported for CvtFloatToTF32Op.}} + %res = nvvm.cvt.float.to.tf32 %src + llvm.return %res : i32 +} diff --git a/mlir/test/Target/LLVMIR/openmp-llvm.mlir b/mlir/test/Target/LLVMIR/openmp-llvm.mlir index 44e32c3f35f9b..4e4b9e5698fe9 100644 --- a/mlir/test/Target/LLVMIR/openmp-llvm.mlir +++ b/mlir/test/Target/LLVMIR/openmp-llvm.mlir @@ -1368,6 +1368,77 @@ llvm.func @omp_atomic_read(%arg0 : !llvm.ptr, %arg1 : !llvm.ptr) -> () { // ----- +// CHECK-LABEL: @omp_atomic_read_implicit_cast +llvm.func @omp_atomic_read_implicit_cast () { +//CHECK: %[[Z:.*]] = alloca float, i64 1, align 4 +//CHECK: %[[Y:.*]] = alloca double, i64 1, align 8 +//CHECK: %[[X:.*]] = alloca [2 x { float, float }], i64 1, align 8 +//CHECK: %[[W:.*]] = alloca i32, i64 1, align 4 +//CHECK: %[[X_ELEMENT:.*]] = getelementptr { float, float }, ptr %3, i64 0 + %0 = llvm.mlir.constant(1 : i64) : i64 + %1 = llvm.alloca %0 x f32 {bindc_name = "z"} : (i64) -> !llvm.ptr + %2 = llvm.mlir.constant(1 : i64) : i64 + %3 = llvm.alloca %2 x f64 {bindc_name = "y"} : (i64) -> !llvm.ptr + %4 = llvm.mlir.constant(1 : i64) : i64 + %5 = llvm.alloca %4 x !llvm.array<2 x struct<(f32, f32)>> {bindc_name = "x"} : (i64) -> !llvm.ptr + %6 = llvm.mlir.constant(1 : i64) : i64 + %7 = llvm.alloca %6 x i32 {bindc_name = "w"} : (i64) -> !llvm.ptr + %8 = llvm.mlir.constant(1 : index) : i64 + %9 = llvm.mlir.constant(2 : index) : i64 + %10 = llvm.mlir.constant(1 : i64) : i64 + %11 = llvm.mlir.constant(0 : i64) : i64 + %12 = llvm.sub %8, %10 overflow : i64 + %13 = llvm.mul %12, %10 overflow : i64 + %14 = llvm.mul %13, %10 overflow : i64 + %15 = llvm.add %14, %11 overflow : i64 + %16 = llvm.mul %10, %9 overflow : i64 + %17 = llvm.getelementptr %5[%15] : (!llvm.ptr, i64) -> !llvm.ptr, !llvm.struct<(f32, f32)> + +//CHECK: %[[ATOMIC_LOAD_TEMP:.*]] = alloca { float, float }, align 8 +//CHECK: call void @__atomic_load(i64 8, ptr 
%[[X_ELEMENT]], ptr %[[ATOMIC_LOAD_TEMP]], i32 0) +//CHECK: %[[LOAD:.*]] = load { float, float }, ptr %[[ATOMIC_LOAD_TEMP]], align 8 +//CHECK: %[[EXT:.*]] = extractvalue { float, float } %[[LOAD]], 0 +//CHECK: store float %[[EXT]], ptr %[[Y]], align 4 + omp.atomic.read %3 = %17 : !llvm.ptr, !llvm.ptr, !llvm.struct<(f32, f32)> + +//CHECK: %[[ATOMIC_LOAD_TEMP:.*]] = load atomic i32, ptr %[[Z]] monotonic, align 4 +//CHECK: %[[CAST:.*]] = bitcast i32 %[[ATOMIC_LOAD_TEMP]] to float +//CHECK: %[[LOAD:.*]] = fpext float %[[CAST]] to double +//CHECK: store double %[[LOAD]], ptr %[[Y]], align 8 + omp.atomic.read %3 = %1 : !llvm.ptr, !llvm.ptr, f32 + +//CHECK: %[[ATOMIC_LOAD_TEMP:.*]] = load atomic i32, ptr %[[W]] monotonic, align 4 +//CHECK: %[[LOAD:.*]] = sitofp i32 %[[ATOMIC_LOAD_TEMP]] to double +//CHECK: store double %[[LOAD]], ptr %[[Y]], align 8 + omp.atomic.read %3 = %7 : !llvm.ptr, !llvm.ptr, i32 + +//CHECK: %[[ATOMIC_LOAD_TEMP:.*]] = load atomic i64, ptr %[[Y]] monotonic, align 4 +//CHECK: %[[CAST:.*]] = bitcast i64 %[[ATOMIC_LOAD_TEMP]] to double +//CHECK: %[[LOAD:.*]] = fptrunc double %[[CAST]] to float +//CHECK: store float %[[LOAD]], ptr %[[Z]], align 4 + omp.atomic.read %1 = %3 : !llvm.ptr, !llvm.ptr, f64 + +//CHECK: %[[ATOMIC_LOAD_TEMP:.*]] = load atomic i32, ptr %[[W]] monotonic, align 4 +//CHECK: %[[LOAD:.*]] = sitofp i32 %[[ATOMIC_LOAD_TEMP]] to float +//CHECK: store float %[[LOAD]], ptr %[[Z]], align 4 + omp.atomic.read %1 = %7 : !llvm.ptr, !llvm.ptr, i32 + +//CHECK: %[[ATOMIC_LOAD_TEMP:.*]] = load atomic i64, ptr %[[Y]] monotonic, align 4 +//CHECK: %[[CAST:.*]] = bitcast i64 %[[ATOMIC_LOAD_TEMP]] to double +//CHECK: %[[LOAD:.*]] = fptosi double %[[CAST]] to i32 +//CHECK: store i32 %[[LOAD]], ptr %[[W]], align 4 + omp.atomic.read %7 = %3 : !llvm.ptr, !llvm.ptr, f64 + +//CHECK: %[[ATOMIC_LOAD_TEMP:.*]] = load atomic i32, ptr %[[Z]] monotonic, align 4 +//CHECK: %[[CAST:.*]] = bitcast i32 %[[ATOMIC_LOAD_TEMP]] to float +//CHECK: %[[LOAD:.*]] = fptosi float 
%[[CAST]] to i32 +//CHECK: store i32 %[[LOAD]], ptr %[[W]], align 4 + omp.atomic.read %7 = %1 : !llvm.ptr, !llvm.ptr, f32 + llvm.return +} + +// ----- + // CHECK-LABEL: @omp_atomic_write // CHECK-SAME: (ptr %[[x:.*]], i32 %[[expr:.*]]) llvm.func @omp_atomic_write(%x: !llvm.ptr, %expr: i32) -> () { @@ -2766,7 +2837,9 @@ llvm.func @task(%arg0 : !llvm.ptr) { // CHECK: %[[VAL_19:.*]] = load i32, ptr %[[VAL_14]], align 4 // CHECK: store i32 %[[VAL_19]], ptr %[[VAL_15]], align 4 // CHECK: br label %[[VAL_20:.*]] -// CHECK: task.body: ; preds = %omp.private.copy +// CHECK: [[VAL_20]]: +// CHECK: br label %task.body +// CHECK: task.body: ; preds = %[[VAL_20]] // CHECK: br label %omp.task.region // CHECK: omp.task.region: ; preds = %task.body // CHECK: call void @foo(ptr %[[VAL_15]]) @@ -2874,6 +2947,44 @@ llvm.func @omp_taskgroup_task(%x: i32, %y: i32, %zaddr: !llvm.ptr) { // ----- +llvm.func @test_01() attributes {sym_visibility = "private"} +llvm.func @test_02() attributes {sym_visibility = "private"} +// CHECK-LABEL: define void @_QPomp_task_priority() { +llvm.func @_QPomp_task_priority() { + %0 = llvm.mlir.constant(1 : i64) : i64 + %1 = llvm.alloca %0 x i32 {bindc_name = "x"} : (i64) -> !llvm.ptr + %2 = llvm.mlir.constant(4 : i32) : i32 + %3 = llvm.mlir.constant(true) : i1 + %4 = llvm.load %1 : !llvm.ptr -> i32 +// CHECK: %[[GID_01:.*]] = call i32 @__kmpc_global_thread_num(ptr {{.*}}) +// CHECK: %[[I_01:.*]] = call ptr @__kmpc_omp_task_alloc(ptr {{.*}}, i32 %[[GID_01]], i32 33, i64 40, i64 0, ptr @{{.*}}) +// CHECK: %[[I_02:.*]] = getelementptr inbounds { ptr }, ptr %[[I_01]], i32 0, i32 0 +// CHECK: %[[I_03:.*]] = getelementptr inbounds { ptr, ptr, i32, ptr, ptr }, ptr %[[I_02]], i32 0, i32 4 +// CHECK: %[[I_04:.*]] = getelementptr inbounds { ptr, ptr }, ptr %[[I_03]], i32 0, i32 0 +// CHECK: store i32 {{.*}}, ptr %[[I_04]], align 4 +// CHECK: %{{.*}} = call i32 @__kmpc_omp_task(ptr {{.*}}, i32 %[[GID_01]], ptr %[[I_01]]) + omp.task priority(%4 : i32) { + llvm.call 
@test_01() : () -> () + omp.terminator + } +// CHECK: %[[GID_02:.*]] = call i32 @__kmpc_global_thread_num(ptr {{.*}}) +// CHECK: %[[I_05:.*]] = call ptr @__kmpc_omp_task_alloc(ptr {{.*}}, i32 %[[GID_02]], i32 35, i64 40, i64 0, ptr @{{.*}}) +// CHECK: %[[I_06:.*]] = getelementptr inbounds { ptr }, ptr %[[I_05]], i32 0, i32 0 +// CHECK: %[[I_07:.*]] = getelementptr inbounds { ptr, ptr, i32, ptr, ptr }, ptr %[[I_06]], i32 0, i32 4 +// CHECK: %[[I_08:.*]] = getelementptr inbounds { ptr, ptr }, ptr %[[I_07]], i32 0, i32 0 +// CHECK: store i32 4, ptr %[[I_08]], align 4 +// CHECK: %{{.*}} = call i32 @__kmpc_omp_task(ptr {{.*}}, i32 %[[GID_02]], ptr %[[I_05]]) + omp.task final(%3) priority(%2 : i32) { + llvm.call @test_02() : () -> () + omp.terminator + } + llvm.return +// CHECK: ret void +// CHECK: } +} + +// ----- + // CHECK-LABEL: @omp_opaque_pointers // CHECK-SAME: (ptr %[[ARG0:.*]], ptr %[[ARG1:.*]], i32 %[[EXPR:.*]]) llvm.func @omp_opaque_pointers(%arg0 : !llvm.ptr, %arg1: !llvm.ptr, %expr: i32) -> () { diff --git a/mlir/test/Target/LLVMIR/openmp-parallel-reduction-multiblock.mlir b/mlir/test/Target/LLVMIR/openmp-parallel-reduction-multiblock.mlir index 75161bac2faf4..d2e394b2cf6a8 100644 --- a/mlir/test/Target/LLVMIR/openmp-parallel-reduction-multiblock.mlir +++ b/mlir/test/Target/LLVMIR/openmp-parallel-reduction-multiblock.mlir @@ -56,8 +56,10 @@ llvm.func @missordered_blocks_(%arg0: !llvm.ptr {fir.bindc_name = "x"}, %arg1: ! 
// CHECK: %[[VAL_20:.*]] = alloca ptr, align 8 // CHECK: %[[VAL_21:.*]] = alloca ptr, align 8 // CHECK: %[[VAL_22:.*]] = alloca [2 x ptr], align 8 +// CHECK: br label %[[AFTER_ALLOC:omp.region.after_alloca]] +// CHECK: [[AFTER_ALLOC]]: ; preds = %[[PAR_ENTRY]] // CHECK: br label %[[VAL_23:omp.par.region]] -// CHECK: [[VAL_23]]: ; preds = %[[PAR_ENTRY]] +// CHECK: [[VAL_23]]: ; preds = %[[AFTER_ALLOC]] // CHECK: br label %[[VAL_42:.*]] // CHECK: [[RED_INIT:omp.reduction.init]]: // CHECK: br label %[[VAL_25:omp.reduction.neutral]] diff --git a/mlir/test/Target/LLVMIR/openmp-reduction-array-sections.mlir b/mlir/test/Target/LLVMIR/openmp-reduction-array-sections.mlir index 912d5568c5f26..d6ed3086969fb 100644 --- a/mlir/test/Target/LLVMIR/openmp-reduction-array-sections.mlir +++ b/mlir/test/Target/LLVMIR/openmp-reduction-array-sections.mlir @@ -91,9 +91,12 @@ llvm.func @sectionsreduction_(%arg0: !llvm.ptr {fir.bindc_name = "x"}) attribute // CHECK: %[[VAL_14:.*]] = alloca [1 x ptr], align 8 // CHECK: br label %[[VAL_15:.*]] -// CHECK: omp.par.region: ; preds = %[[PAR_ENTRY]] +// CHECK: [[VAL_15]]: +// CHECK: br label %[[PAR_REG:omp.par.region]] + +// CHECK: [[PAR_REG]]: ; preds = %[[VAL_15]] // CHECK: br label %[[VAL_18:.*]] -// CHECK: omp.par.region1: ; preds = %[[VAL_15]] +// CHECK: omp.par.region1: ; preds = %[[PAR_REG]] // CHECK: %[[VAL_19:.*]] = alloca { ptr, i64, i32, i8, i8, i8, i8, [1 x [3 x i64]] }, i64 1, align 8 // CHECK: br label %[[VAL_22:.*]] diff --git a/mlir/test/Target/LLVMIR/openmp-reduction-init-arg.mlir b/mlir/test/Target/LLVMIR/openmp-reduction-init-arg.mlir index 7f2424381e846..8d329bd8ff817 100644 --- a/mlir/test/Target/LLVMIR/openmp-reduction-init-arg.mlir +++ b/mlir/test/Target/LLVMIR/openmp-reduction-init-arg.mlir @@ -63,7 +63,11 @@ module { // CHECK: %[[VAL_23:.*]] = alloca ptr, align 8 // CHECK: %[[VAL_24:.*]] = alloca [2 x ptr], align 8 // CHECK: br label %[[VAL_25:.*]] -// CHECK: omp.par.region: ; preds = %[[PAR_ENTRY]] + +// CHECK: 
[[VAL_25]]: +// CHECK: br label %[[PAR_REG:omp.par.region]] + +// CHECK: [[PAR_REG]]: ; preds = %[[VAL_25]] // CHECK: br label %[[INIT_LABEL:.*]] // CHECK: [[INIT_LABEL]]: // CHECK: %[[VAL_20:.*]] = load { ptr, i64, i32, i8, i8, i8, i8, [1 x [3 x i64]] }, ptr %[[VAL_13]], align 8 diff --git a/mlir/test/Target/LLVMIR/openmp-reduction-sections.mlir b/mlir/test/Target/LLVMIR/openmp-reduction-sections.mlir index 05af32622246a..de3b997feb674 100644 --- a/mlir/test/Target/LLVMIR/openmp-reduction-sections.mlir +++ b/mlir/test/Target/LLVMIR/openmp-reduction-sections.mlir @@ -50,9 +50,13 @@ llvm.func @sections_(%arg0: !llvm.ptr {fir.bindc_name = "x"}) attributes {fir.in // CHECK: %[[VAL_20:.*]] = alloca float, align 4 // CHECK: %[[VAL_21:.*]] = alloca [1 x ptr], align 8 // CHECK: br label %[[VAL_22:.*]] -// CHECK: omp.par.region: ; preds = %[[PAR_ENTRY]] + +// CHECK: [[VAL_22]]: +// CHECK: br label %[[PAR_REG:omp.par.region]] + +// CHECK: [[PAR_REG]]: ; preds = %[[VAL_22]] // CHECK: br label %[[VAL_25:.*]] -// CHECK: omp.par.region1: ; preds = %[[VAL_22]] +// CHECK: omp.par.region1: ; preds = %[[PAR_REG]] // CHECK: br label %[[VAL_26:.*]] // CHECK: [[RED_INIT:omp.reduction.init]]: diff --git a/mlir/test/Target/LLVMIR/openmp-simd-private.mlir b/mlir/test/Target/LLVMIR/openmp-simd-private.mlir index 09d76f8edd007..61542aa1aa4d7 100644 --- a/mlir/test/Target/LLVMIR/openmp-simd-private.mlir +++ b/mlir/test/Target/LLVMIR/openmp-simd-private.mlir @@ -12,6 +12,9 @@ omp.private {type = private} @i_privatizer : !llvm.ptr alloc { // CHECK: %{{.*}} = alloca i32, i64 1, align 4 // CHECK: %[[DUMMY:.*]] = alloca float, i64 1, align 4 // CHECK: %[[PRIV_I:.*]] = alloca i32, i64 1, align 4 +// CHECK: br label %[[LATE_ALLOC:.*]] + +// CHECK: [[LATE_ALLOC]]: // CHECK: br label %[[AFTER_ALLOC:.*]] // CHECK: [[AFTER_ALLOC]]: diff --git a/mlir/test/Target/LLVMIR/openmp-target-use-device-nested.mlir b/mlir/test/Target/LLVMIR/openmp-target-use-device-nested.mlir index 3872d908e7a20..ff580e5fea634 
100644 --- a/mlir/test/Target/LLVMIR/openmp-target-use-device-nested.mlir +++ b/mlir/test/Target/LLVMIR/openmp-target-use-device-nested.mlir @@ -12,6 +12,9 @@ // CHECK-NEXT: br i1 %[[VAL_7]], label %[[VAL_8:.*]], label %[[VAL_9:.*]] // CHECK: user_code.entry: ; preds = %[[VAL_10:.*]] // CHECK-NEXT: %[[VAL_11:.*]] = load ptr, ptr %[[VAL_3]], align 8 +// CHECK-NEXT: br label %[[AFTER_ALLOC:.*]] + +// CHECK: [[AFTER_ALLOC]]: // CHECK-NEXT: br label %[[VAL_12:.*]] // CHECK: [[VAL_12]]: diff --git a/mlir/test/Target/LLVMIR/openmp-todo.mlir b/mlir/test/Target/LLVMIR/openmp-todo.mlir index c1e30964b2507..ea3d3b4bd9df8 100644 --- a/mlir/test/Target/LLVMIR/openmp-todo.mlir +++ b/mlir/test/Target/LLVMIR/openmp-todo.mlir @@ -400,17 +400,6 @@ llvm.func @task_in_reduction(%x : !llvm.ptr) { // ----- -llvm.func @task_priority(%x : i32) { - // expected-error@below {{not yet implemented: Unhandled clause priority in omp.task operation}} - // expected-error@below {{LLVM Translation failed for operation: omp.task}} - omp.task priority(%x : i32) { - omp.terminator - } - llvm.return -} - -// ----- - llvm.func @task_untied() { // expected-error@below {{not yet implemented: Unhandled clause untied in omp.task operation}} // expected-error@below {{LLVM Translation failed for operation: omp.task}} diff --git a/mlir/test/Target/LLVMIR/openmp-wsloop-private-late-alloca-workaround.mlir b/mlir/test/Target/LLVMIR/openmp-wsloop-private-late-alloca-workaround.mlir new file mode 100644 index 0000000000000..4d732bbb4e3b6 --- /dev/null +++ b/mlir/test/Target/LLVMIR/openmp-wsloop-private-late-alloca-workaround.mlir @@ -0,0 +1,47 @@ +// RUN: mlir-translate -mlir-to-llvmir %s | FileCheck %s + +// Tests the "impure" alloc region workaround until `omp.private` is updated. +// See +// https://discourse.llvm.org/t/rfc-openmp-supporting-delayed-task-execution-with-firstprivate-variables/83084/7 +// and https://discourse.llvm.org/t/delayed-privatization-for-omp-wsloop/83989 +// for more info. 
+ +omp.private {type = private} @impure_alloca_privatizer : !llvm.ptr alloc { +^bb0(%arg0: !llvm.ptr): + %0 = llvm.mlir.constant(1 : i64) : i64 + %1 = llvm.alloca %0 x i32 {bindc_name = "i", pinned} : (i64) -> !llvm.ptr + %3 = llvm.getelementptr %arg0[0] : (!llvm.ptr) -> !llvm.ptr, !llvm.struct<(ptr)> + omp.yield(%1 : !llvm.ptr) +} + +llvm.func @test_alloca_ip_workaround() { + omp.target { + %65 = llvm.mlir.constant(1 : i32) : i32 + %66 = llvm.alloca %65 x !llvm.struct<(ptr, i64, i32, i8, i8, i8, i8, array<1 x array<3 x i64>>)> {alignment = 8 : i64} : (i32) -> !llvm.ptr + %67 = llvm.mlir.constant(0 : index) : i64 + %68 = llvm.mlir.constant(10 : i32) : i32 + %69 = llvm.mlir.constant(1 : i32) : i32 + omp.wsloop private(@impure_alloca_privatizer %66 -> %arg6 : !llvm.ptr) { + omp.loop_nest (%arg8) : i32 = (%69) to (%68) inclusive step (%69) { + omp.yield + } + } + omp.terminator + } + llvm.return +} + +// CHECK-LABEL: define {{.*}} @__omp_offloading_{{.*}}_test_alloca_ip_workaround + +// CHECK: omp.target: +// CHECK: %[[ALLOC_REG_ARG:.*]] = alloca { ptr, i64, i32, i8, i8, i8, i8, [1 x [3 x i64]] }, align 8 +// CHECK: br label %omp.private.latealloc + +// CHECK: omp.private.latealloc: +// CHECK: %[[PRIV_ALLOC:.*]] = alloca i32, i64 1, align 4 +// The usage of `ALLOC_REG_ARG` in the inlined alloc region is the reason for +// introducing the workaround. +// CHECK: %{{.*}} = getelementptr { ptr }, ptr %[[ALLOC_REG_ARG]], i32 0 +// CHECK: br label %omp.region.after_defining_block + + diff --git a/mlir/test/Target/LLVMIR/openmp-wsloop-test-block-structure.mlir b/mlir/test/Target/LLVMIR/openmp-wsloop-test-block-structure.mlir new file mode 100644 index 0000000000000..19ae425e20403 --- /dev/null +++ b/mlir/test/Target/LLVMIR/openmp-wsloop-test-block-structure.mlir @@ -0,0 +1,54 @@ +// RUN: mlir-translate -mlir-to-llvmir -split-input-file %s | FileCheck %s + +// Tests regression uncovered by "1009/1009_0029.f90" (from the Fujitsu test +// suite). 
This test replicates a simplified version of the block structure +// produced by the Fujitsu test. + +llvm.func @test_block_structure() { + %i1 = llvm.mlir.constant(1 : index) : i1 + %i64 = llvm.mlir.constant(1 : index) : i64 + llvm.br ^bb1(%i64, %i64 : i64, i64) + +^bb1(%20: i64, %21: i64): // 2 preds: ^bb0, ^bb5 + llvm.cond_br %i1, ^bb2, ^bb6 + +^bb2: // pred: ^bb1 + llvm.br ^bb3(%i64, %i64 : i64, i64) + +^bb3(%25: i64, %26: i64): // 2 preds: ^bb2, ^bb4 + llvm.cond_br %i1, ^bb4, ^bb5 + +^bb4: // pred: ^bb3 + omp.wsloop { + omp.loop_nest (%arg0) : i64 = (%i64) to (%i64) inclusive step (%i64) { + omp.yield + } + } + llvm.br ^bb1(%i64, %i64 : i64, i64) + +^bb5: // pred: ^bb3 + llvm.br ^bb1(%i64, %i64 : i64, i64) + +^bb6: // pred: ^bb1 + llvm.return +} + +// CHECK: define void @test_block_structure +// CHECK: br label %[[AFTER_ALLOCA:.*]] + +// CHECK: [[AFTER_ALLOCA:]]: +// CHECK: br label %[[BB1:.*]] + +// CHECK: [[BB1:]]: +// CHECK: %{{.*}} = phi i64 +// CHECK: br i1 true, label %[[BB2:.*]], label %{{.*}} + +// CHECK: [[BB2]]: +// CHECK: br label %[[BB3:.*]] + +// CHECK: [[BB3]]: +// CHECK: %{{.*}} = phi i64 +// CHECK: br i1 true, label %[[BB4:.*]], label %{{.*}} + +// CHECK: [[BB4]]: +// CHECK: br label %omp_loop.preheader diff --git a/mlir/test/Target/SPIRV/gl-ops.mlir b/mlir/test/Target/SPIRV/gl-ops.mlir index fff1adf0ae12c..119304cea7d4a 100644 --- a/mlir/test/Target/SPIRV/gl-ops.mlir +++ b/mlir/test/Target/SPIRV/gl-ops.mlir @@ -81,4 +81,24 @@ spirv.module Logical GLSL450 requires #spirv.vce { %2 = spirv.GL.FindUMsb %arg0 : i32 spirv.Return } + +spirv.func @vector(%arg0 : f32, %arg1 : vector<3xf32>, %arg2 : vector<3xf32>) "None" { + // CHECK: {{%.*}} = spirv.GL.Cross {{%.*}}, {{%.*}} : vector<3xf32> + %0 = spirv.GL.Cross %arg1, %arg2 : vector<3xf32> + // CHECK: {{%.*}} = spirv.GL.Normalize {{%.*}} : f32 + %1 = spirv.GL.Normalize %arg0 : f32 + // CHECK: {{%.*}} = spirv.GL.Normalize {{%.*}} : vector<3xf32> + %2 = spirv.GL.Normalize %arg1 : vector<3xf32> + // 
CHECK: {{%.*}} = spirv.GL.Reflect {{%.*}}, {{%.*}} : f32 + %3 = spirv.GL.Reflect %arg0, %arg0 : f32 + // CHECK: {{%.*}} = spirv.GL.Reflect {{%.*}}, {{%.*}} : vector<3xf32> + %4 = spirv.GL.Reflect %arg1, %arg2 : vector<3xf32> + // CHECK: {{%.*}} = spirv.GL.Distance {{%.*}}, {{%.*}} : f32, f32 -> f32 + %5 = spirv.GL.Distance %arg0, %arg0 : f32, f32 -> f32 + // CHECK: {{%.*}} = spirv.GL.Distance {{%.*}}, {{%.*}} : vector<3xf32>, vector<3xf32> -> f32 + %6 = spirv.GL.Distance %arg1, %arg2 : vector<3xf32>, vector<3xf32> -> f32 + spirv.Return + } + + } diff --git a/mlir/test/Target/SPIRV/matrix.mlir b/mlir/test/Target/SPIRV/matrix.mlir index 2a391df4bff39..0ec1dc27e4e93 100644 --- a/mlir/test/Target/SPIRV/matrix.mlir +++ b/mlir/test/Target/SPIRV/matrix.mlir @@ -36,6 +36,13 @@ spirv.module Logical GLSL450 requires #spirv.vce { spirv.ReturnValue %result : !spirv.matrix<2 x vector<3xf32>> } + // CHECK-LABEL: @matrix_times_vector_1 + spirv.func @matrix_times_vector_1(%arg0: !spirv.matrix<3 x vector<4xf32>>, %arg1: vector<3xf32>) -> vector<4xf32> "None" { + // CHECK: {{%.*}} = spirv.MatrixTimesVector {{%.*}}, {{%.*}} : !spirv.matrix<3 x vector<4xf32>>, vector<3xf32> -> vector<4xf32> + %result = spirv.MatrixTimesVector %arg0, %arg1 : !spirv.matrix<3 x vector<4xf32>>, vector<3xf32> -> vector<4xf32> + spirv.ReturnValue %result : vector<4xf32> + } + // CHECK-LABEL: @matrix_times_matrix_1 spirv.func @matrix_times_matrix_1(%arg0: !spirv.matrix<3 x vector<3xf32>>, %arg1: !spirv.matrix<3 x vector<3xf32>>) -> !spirv.matrix<3 x vector<3xf32>> "None"{ // CHECK: {{%.*}} = spirv.MatrixTimesMatrix {{%.*}}, {{%.*}} : !spirv.matrix<3 x vector<3xf32>>, !spirv.matrix<3 x vector<3xf32>> -> !spirv.matrix<3 x vector<3xf32>> diff --git a/mlir/test/lib/Dialect/Affine/TestVectorizationUtils.cpp b/mlir/test/lib/Dialect/Affine/TestVectorizationUtils.cpp index 598678f64cb46..c32bd24014215 100644 --- a/mlir/test/lib/Dialect/Affine/TestVectorizationUtils.cpp +++ 
b/mlir/test/lib/Dialect/Affine/TestVectorizationUtils.cpp @@ -101,8 +101,7 @@ void VectorizerTestPass::testVectorShapeRatio(llvm::raw_ostream &outs) { using affine::matcher::Op; SmallVector shape(clTestVectorShapeRatio.begin(), clTestVectorShapeRatio.end()); - auto subVectorType = - VectorType::get(shape, FloatType::getF32(f.getContext())); + auto subVectorType = VectorType::get(shape, Float32Type::get(f.getContext())); // Only filter operations that operate on a strict super-vector and have one // return. This makes testing easier. auto filter = [&](Operation &op) { diff --git a/mlir/test/lib/Dialect/Test/TestPatterns.cpp b/mlir/test/lib/Dialect/Test/TestPatterns.cpp index 5b7c36c9b97bf..b20e0816bd17c 100644 --- a/mlir/test/lib/Dialect/Test/TestPatterns.cpp +++ b/mlir/test/lib/Dialect/Test/TestPatterns.cpp @@ -1286,7 +1286,7 @@ struct TestTypeConverter : public TypeConverter { // Convert I64 to F64. if (t.isSignlessInteger(64)) { - results.push_back(FloatType::getF64(t.getContext())); + results.push_back(Float64Type::get(t.getContext())); return success(); } @@ -1298,7 +1298,7 @@ struct TestTypeConverter : public TypeConverter { // Split F32 into F16,F16. if (t.isF32()) { - results.assign(2, FloatType::getF16(t.getContext())); + results.assign(2, Float16Type::get(t.getContext())); return success(); } @@ -1826,7 +1826,7 @@ struct TestTypeConversionDriver return type; // Allow converting BF16/F16/F32 to F64. if (type.isBF16() || type.isF16() || type.isF32()) - return FloatType::getF64(type.getContext()); + return Float64Type::get(type.getContext()); // Otherwise, the type is illegal. 
return nullptr; }); diff --git a/mlir/test/lib/Pass/TestVulkanRunnerPipeline.cpp b/mlir/test/lib/Pass/TestVulkanRunnerPipeline.cpp index 789c4d76cee0d..a3624eb31e26e 100644 --- a/mlir/test/lib/Pass/TestVulkanRunnerPipeline.cpp +++ b/mlir/test/lib/Pass/TestVulkanRunnerPipeline.cpp @@ -11,9 +11,13 @@ //===----------------------------------------------------------------------===// #include "mlir/Conversion/ConvertToSPIRV/ConvertToSPIRVPass.h" +#include "mlir/Conversion/GPUCommon/GPUCommonPass.h" #include "mlir/Conversion/GPUToSPIRV/GPUToSPIRVPass.h" +#include "mlir/Conversion/MemRefToLLVM/MemRefToLLVM.h" +#include "mlir/Dialect/Func/IR/FuncOps.h" #include "mlir/Dialect/GPU/IR/GPUDialect.h" #include "mlir/Dialect/GPU/Transforms/Passes.h" +#include "mlir/Dialect/LLVMIR/Transforms/RequestCWrappers.h" #include "mlir/Dialect/MemRef/Transforms/Passes.h" #include "mlir/Dialect/SPIRV/IR/SPIRVOps.h" #include "mlir/Dialect/SPIRV/Transforms/Passes.h" @@ -29,6 +33,9 @@ struct VulkanRunnerPipelineOptions Option spirvWebGPUPrepare{ *this, "spirv-webgpu-prepare", llvm::cl::desc("Run MLIR transforms used when targetting WebGPU")}; + Option toLlvm{*this, "to-llvm", + llvm::cl::desc("Run MLIR transforms to lower host code " + "to LLVM, intended for mlir-cpu-runner")}; }; void buildTestVulkanRunnerPipeline(OpPassManager &passManager, @@ -56,6 +63,19 @@ void buildTestVulkanRunnerPipeline(OpPassManager &passManager, spirvModulePM.addPass(spirv::createSPIRVWebGPUPreparePass()); passManager.addPass(createGpuModuleToBinaryPass()); + + if (options.toLlvm) { + passManager.addPass(createFinalizeMemRefToLLVMConversionPass()); + passManager.nest().addPass( + LLVM::createRequestCWrappersPass()); + // vulkan-runtime-wrappers.cpp uses the non-bare-pointer calling convention, + // and the type check is needed to prevent accidental ABI mismatches. 
+ GpuToLLVMConversionPassOptions opt; + opt.hostBarePtrCallConv = false; + opt.kernelBarePtrCallConv = false; + opt.typeCheckKernelArgs = true; + passManager.addPass(createGpuToLLVMConversionPass(opt)); + } } } // namespace @@ -65,7 +85,7 @@ void registerTestVulkanRunnerPipeline() { PassPipelineRegistration( "test-vulkan-runner-pipeline", "Runs a series of passes for lowering GPU-dialect MLIR to " - "SPIR-V-dialect MLIR intended for mlir-vulkan-runner.", + "SPIR-V-dialect MLIR intended for mlir-vulkan-runner or mlir-cpu-runner.", buildTestVulkanRunnerPipeline); } } // namespace mlir::test diff --git a/mlir/test/lib/Transforms/TestDialectConversion.cpp b/mlir/test/lib/Transforms/TestDialectConversion.cpp index a03bf0a1023d5..8278937a1014c 100644 --- a/mlir/test/lib/Transforms/TestDialectConversion.cpp +++ b/mlir/test/lib/Transforms/TestDialectConversion.cpp @@ -34,7 +34,7 @@ struct PDLLTypeConverter : public TypeConverter { static LogicalResult convertType(Type t, SmallVectorImpl &results) { // Convert I64 to F64. if (t.isSignlessInteger(64)) { - results.push_back(FloatType::getF64(t.getContext())); + results.push_back(Float64Type::get(t.getContext())); return success(); } diff --git a/mlir/test/lit.cfg.py b/mlir/test/lit.cfg.py index b2fdf5875f922..16862bd22cf97 100644 --- a/mlir/test/lit.cfg.py +++ b/mlir/test/lit.cfg.py @@ -123,7 +123,7 @@ def find_real_python_interpreter(): if sys.prefix != sys.base_prefix: copied_python = os.path.join(sys.prefix, "bin", "copied-python") else: - copied_python = os.path.join(config.lldb_build_directory, "copied-python") + copied_python = os.path.join(config.mlir_obj_root, "copied-python") # Avoid doing any work if we already copied the binary. 
if os.path.isfile(copied_python): diff --git a/mlir/test/mlir-vulkan-runner/addf.mlir b/mlir/test/mlir-vulkan-runner/addf.mlir index d435f75a28805..71f87a8b0d5c8 100644 --- a/mlir/test/mlir-vulkan-runner/addf.mlir +++ b/mlir/test/mlir-vulkan-runner/addf.mlir @@ -1,5 +1,5 @@ -// RUN: mlir-opt %s -test-vulkan-runner-pipeline \ -// RUN: | mlir-vulkan-runner - --shared-libs=%vulkan-runtime-wrappers,%mlir_runner_utils --entry-point-result=void | FileCheck %s +// RUN: mlir-opt %s -test-vulkan-runner-pipeline=to-llvm \ +// RUN: | mlir-cpu-runner - --shared-libs=%vulkan-runtime-wrappers,%mlir_runner_utils --entry-point-result=void | FileCheck %s // CHECK: [3.3, 3.3, 3.3, 3.3, 3.3, 3.3, 3.3, 3.3] module attributes { diff --git a/mlir/test/mlir-vulkan-runner/addf_if.mlir b/mlir/test/mlir-vulkan-runner/addf_if.mlir index 8ae995c65e7e8..6fe51a83482dc 100644 --- a/mlir/test/mlir-vulkan-runner/addf_if.mlir +++ b/mlir/test/mlir-vulkan-runner/addf_if.mlir @@ -1,5 +1,5 @@ -// RUN: mlir-opt %s -test-vulkan-runner-pipeline \ -// RUN: | mlir-vulkan-runner - --shared-libs=%vulkan-runtime-wrappers,%mlir_runner_utils --entry-point-result=void | FileCheck %s +// RUN: mlir-opt %s -test-vulkan-runner-pipeline=to-llvm \ +// RUN: | mlir-cpu-runner - --shared-libs=%vulkan-runtime-wrappers,%mlir_runner_utils --entry-point-result=void | FileCheck %s // CHECK: [3.3, 3.3, 3.3, 3.3, 0, 0, 0, 0] module attributes { diff --git a/mlir/test/mlir-vulkan-runner/addui_extended.mlir b/mlir/test/mlir-vulkan-runner/addui_extended.mlir index b8db451421459..0894bc301f2e3 100644 --- a/mlir/test/mlir-vulkan-runner/addui_extended.mlir +++ b/mlir/test/mlir-vulkan-runner/addui_extended.mlir @@ -1,13 +1,13 @@ // Make sure that addition with carry produces expected results // with and without expansion to primitive add/cmp ops for WebGPU. 
-// RUN: mlir-opt %s -test-vulkan-runner-pipeline \ -// RUN: | mlir-vulkan-runner - \ +// RUN: mlir-opt %s -test-vulkan-runner-pipeline=to-llvm \ +// RUN: | mlir-cpu-runner - \ // RUN: --shared-libs=%vulkan-runtime-wrappers,%mlir_runner_utils \ // RUN: --entry-point-result=void | FileCheck %s -// RUN: mlir-opt %s -test-vulkan-runner-pipeline=spirv-webgpu-prepare \ -// RUN: | mlir-vulkan-runner - \ +// RUN: mlir-opt %s -test-vulkan-runner-pipeline="spirv-webgpu-prepare to-llvm" \ +// RUN: | mlir-cpu-runner - \ // RUN: --shared-libs=%vulkan-runtime-wrappers,%mlir_runner_utils \ // RUN: --entry-point-result=void | FileCheck %s diff --git a/mlir/test/mlir-vulkan-runner/smul_extended.mlir b/mlir/test/mlir-vulkan-runner/smul_extended.mlir index 334aec843e197..0ef86f46562e8 100644 --- a/mlir/test/mlir-vulkan-runner/smul_extended.mlir +++ b/mlir/test/mlir-vulkan-runner/smul_extended.mlir @@ -1,13 +1,13 @@ // Make sure that signed extended multiplication produces expected results // with and without expansion to primitive mul/add ops for WebGPU. 
-// RUN: mlir-opt %s -test-vulkan-runner-pipeline \ -// RUN: | mlir-vulkan-runner - \ +// RUN: mlir-opt %s -test-vulkan-runner-pipeline=to-llvm \ +// RUN: | mlir-cpu-runner - \ // RUN: --shared-libs=%vulkan-runtime-wrappers,%mlir_runner_utils \ // RUN: --entry-point-result=void | FileCheck %s -// RUN: mlir-opt %s -test-vulkan-runner-pipeline=spirv-webgpu-prepare \ -// RUN: | mlir-vulkan-runner - \ +// RUN: mlir-opt %s -test-vulkan-runner-pipeline="spirv-webgpu-prepare to-llvm" \ +// RUN: | mlir-cpu-runner - \ // RUN: --shared-libs=%vulkan-runtime-wrappers,%mlir_runner_utils \ // RUN: --entry-point-result=void | FileCheck %s diff --git a/mlir/test/mlir-vulkan-runner/time.mlir b/mlir/test/mlir-vulkan-runner/time.mlir index 6a0bfef36793b..f628447874238 100644 --- a/mlir/test/mlir-vulkan-runner/time.mlir +++ b/mlir/test/mlir-vulkan-runner/time.mlir @@ -1,5 +1,5 @@ -// RUN: mlir-opt %s -test-vulkan-runner-pipeline \ -// RUN: | mlir-vulkan-runner - --shared-libs=%vulkan-runtime-wrappers,%mlir_runner_utils --entry-point-result=void | FileCheck %s +// RUN: mlir-opt %s -test-vulkan-runner-pipeline=to-llvm \ +// RUN: | mlir-cpu-runner - --shared-libs=%vulkan-runtime-wrappers,%mlir_runner_utils --entry-point-result=void | FileCheck %s // CHECK: Compute shader execution time // CHECK: Command buffer submit time diff --git a/mlir/test/mlir-vulkan-runner/umul_extended.mlir b/mlir/test/mlir-vulkan-runner/umul_extended.mlir index 803b8c3d336d3..5936c808435c1 100644 --- a/mlir/test/mlir-vulkan-runner/umul_extended.mlir +++ b/mlir/test/mlir-vulkan-runner/umul_extended.mlir @@ -1,13 +1,13 @@ // Make sure that unsigned extended multiplication produces expected results // with and without expansion to primitive mul/add ops for WebGPU. 
-// RUN: mlir-opt %s -test-vulkan-runner-pipeline \ -// RUN: | mlir-vulkan-runner - \ +// RUN: mlir-opt %s -test-vulkan-runner-pipeline=to-llvm \ +// RUN: | mlir-cpu-runner - \ // RUN: --shared-libs=%vulkan-runtime-wrappers,%mlir_runner_utils \ // RUN: --entry-point-result=void | FileCheck %s -// RUN: mlir-opt %s -test-vulkan-runner-pipeline=spirv-webgpu-prepare \ -// RUN: | mlir-vulkan-runner - \ +// RUN: mlir-opt %s -test-vulkan-runner-pipeline="spirv-webgpu-prepare to-llvm" \ +// RUN: | mlir-cpu-runner - \ // RUN: --shared-libs=%vulkan-runtime-wrappers,%mlir_runner_utils \ // RUN: --entry-point-result=void | FileCheck %s diff --git a/mlir/test/mlir-vulkan-runner/vector-deinterleave.mlir b/mlir/test/mlir-vulkan-runner/vector-deinterleave.mlir index 097f3905949d8..ebeb19cd6bcc5 100644 --- a/mlir/test/mlir-vulkan-runner/vector-deinterleave.mlir +++ b/mlir/test/mlir-vulkan-runner/vector-deinterleave.mlir @@ -1,5 +1,5 @@ -// RUN: mlir-opt %s -test-vulkan-runner-pipeline \ -// RUN: | mlir-vulkan-runner - \ +// RUN: mlir-opt %s -test-vulkan-runner-pipeline=to-llvm \ +// RUN: | mlir-cpu-runner - \ // RUN: --shared-libs=%vulkan-runtime-wrappers,%mlir_runner_utils \ // RUN: --entry-point-result=void | FileCheck %s diff --git a/mlir/test/mlir-vulkan-runner/vector-interleave.mlir b/mlir/test/mlir-vulkan-runner/vector-interleave.mlir index 5dd4abbd1fb19..9314baf9b39c7 100644 --- a/mlir/test/mlir-vulkan-runner/vector-interleave.mlir +++ b/mlir/test/mlir-vulkan-runner/vector-interleave.mlir @@ -1,5 +1,5 @@ -// RUN: mlir-opt %s -test-vulkan-runner-pipeline \ -// RUN: | mlir-vulkan-runner - \ +// RUN: mlir-opt %s -test-vulkan-runner-pipeline=to-llvm \ +// RUN: | mlir-cpu-runner - \ // RUN: --shared-libs=%vulkan-runtime-wrappers,%mlir_runner_utils \ // RUN: --entry-point-result=void | FileCheck %s diff --git a/mlir/test/mlir-vulkan-runner/vector-shuffle.mlir b/mlir/test/mlir-vulkan-runner/vector-shuffle.mlir index be97b48b1812e..cf3e2c569426b 100644 --- 
a/mlir/test/mlir-vulkan-runner/vector-shuffle.mlir +++ b/mlir/test/mlir-vulkan-runner/vector-shuffle.mlir @@ -1,5 +1,5 @@ -// RUN: mlir-opt %s -test-vulkan-runner-pipeline \ -// RUN: | mlir-vulkan-runner - \ +// RUN: mlir-opt %s -test-vulkan-runner-pipeline=to-llvm \ +// RUN: | mlir-cpu-runner - \ // RUN: --shared-libs=%vulkan-runtime-wrappers,%mlir_runner_utils \ // RUN: --entry-point-result=void | FileCheck %s diff --git a/mlir/test/python/execution_engine.py b/mlir/test/python/execution_engine.py index 0d12c35d96bee..d569fcef32bfd 100644 --- a/mlir/test/python/execution_engine.py +++ b/mlir/test/python/execution_engine.py @@ -5,7 +5,14 @@ from mlir.passmanager import * from mlir.execution_engine import * from mlir.runtime import * -from ml_dtypes import bfloat16, float8_e5m2 + +try: + from ml_dtypes import bfloat16, float8_e5m2 + + HAS_ML_DTYPES = True +except ModuleNotFoundError: + HAS_ML_DTYPES = False + MLIR_RUNNER_UTILS = os.getenv( "MLIR_RUNNER_UTILS", "../../../../lib/libmlir_runner_utils.so" @@ -559,12 +566,15 @@ def testBF16Memref(): execution_engine.invoke("main", arg1_memref_ptr, arg2_memref_ptr) # test to-numpy utility - # CHECK: [0.5] - npout = ranked_memref_to_numpy(arg2_memref_ptr[0]) - log(npout) + x = ranked_memref_to_numpy(arg2_memref_ptr[0]) + assert len(x) == 1 + assert x[0] == 0.5 -run(testBF16Memref) +if HAS_ML_DTYPES: + run(testBF16Memref) +else: + log("TEST: testBF16Memref") # Test f8E5M2 memrefs @@ -598,12 +608,15 @@ def testF8E5M2Memref(): execution_engine.invoke("main", arg1_memref_ptr, arg2_memref_ptr) # test to-numpy utility - # CHECK: [0.5] - npout = ranked_memref_to_numpy(arg2_memref_ptr[0]) - log(npout) + x = ranked_memref_to_numpy(arg2_memref_ptr[0]) + assert len(x) == 1 + assert x[0] == 0.5 -run(testF8E5M2Memref) +if HAS_ML_DTYPES: + run(testF8E5M2Memref) +else: + log("TEST: testF8E5M2Memref") # Test addition of two 2d_memref diff --git a/mlir/tools/mlir-vulkan-runner/vulkan-runtime-wrappers.cpp 
b/mlir/tools/mlir-vulkan-runner/vulkan-runtime-wrappers.cpp index f1ed571734459..ffd1114cec6aa 100644 --- a/mlir/tools/mlir-vulkan-runner/vulkan-runtime-wrappers.cpp +++ b/mlir/tools/mlir-vulkan-runner/vulkan-runtime-wrappers.cpp @@ -13,6 +13,8 @@ #include #include #include +#include +#include #include "VulkanRuntime.h" @@ -26,6 +28,38 @@ namespace { +class VulkanModule; + +// Class to be a thing that can be returned from `mgpuModuleGetFunction`. +struct VulkanFunction { + VulkanModule *module; + std::string name; + + VulkanFunction(VulkanModule *module, const char *name) + : module(module), name(name) {} +}; + +// Class to own a copy of the SPIR-V provided to `mgpuModuleLoad` and to manage +// allocation of pointers returned from `mgpuModuleGetFunction`. +class VulkanModule { +public: + VulkanModule(const uint8_t *ptr, size_t sizeInBytes) + : blob(ptr, ptr + sizeInBytes) {} + ~VulkanModule() = default; + + VulkanFunction *getFunction(const char *name) { + return functions.emplace_back(std::make_unique(this, name)) + .get(); + } + + uint8_t *blobData() { return blob.data(); } + size_t blobSizeInBytes() const { return blob.size(); } + +private: + std::vector blob; + std::vector> functions; +}; + class VulkanRuntimeManager { public: VulkanRuntimeManager() = default; @@ -91,6 +125,94 @@ void bindMemRef(void *vkRuntimeManager, DescriptorSetIndex setIndex, } extern "C" { + +//===----------------------------------------------------------------------===// +// +// New wrappers, intended for mlir-cpu-runner. Calls to these are generated by +// GPUToLLVMConversionPass. 
+// +//===----------------------------------------------------------------------===// + +VULKAN_WRAPPER_SYMBOL_EXPORT void *mgpuStreamCreate() { + return new VulkanRuntimeManager(); +} + +VULKAN_WRAPPER_SYMBOL_EXPORT void mgpuStreamDestroy(void *vkRuntimeManager) { + delete static_cast(vkRuntimeManager); +} + +VULKAN_WRAPPER_SYMBOL_EXPORT void mgpuStreamSynchronize(void *) { + // Currently a no-op as the other operations are synchronous. +} + +VULKAN_WRAPPER_SYMBOL_EXPORT void *mgpuModuleLoad(const void *data, + size_t gpuBlobSize) { + // gpuBlobSize is the size of the data in bytes. + return new VulkanModule(static_cast(data), gpuBlobSize); +} + +VULKAN_WRAPPER_SYMBOL_EXPORT void mgpuModuleUnload(void *vkModule) { + delete static_cast(vkModule); +} + +VULKAN_WRAPPER_SYMBOL_EXPORT void *mgpuModuleGetFunction(void *vkModule, + const char *name) { + if (!vkModule) + abort(); + return static_cast(vkModule)->getFunction(name); +} + +VULKAN_WRAPPER_SYMBOL_EXPORT void +mgpuLaunchKernel(void *vkKernel, size_t gridX, size_t gridY, size_t gridZ, + size_t /*blockX*/, size_t /*blockY*/, size_t /*blockZ*/, + size_t /*smem*/, void *vkRuntimeManager, void **params, + void ** /*extra*/, size_t paramsCount) { + auto manager = static_cast(vkRuntimeManager); + + // The non-bare-pointer memref ABI interacts badly with mgpuLaunchKernel's + // signature: + // - The memref descriptor struct gets split into several elements, each + // passed as their own "param". + // - No metadata is provided as to the rank or element type/size of a memref. + // Here we assume that all MemRefs have rank 1 and an element size of + // 4 bytes. This means each descriptor struct will have five members. + // TODO(https://github.com/llvm/llvm-project/issues/73457): Refactor the + // ABI/API of mgpuLaunchKernel to use a different ABI for memrefs, so + // that other memref types can also be used. This will allow migrating + // the remaining tests and removal of mlir-vulkan-runner. 
+ const size_t paramsPerMemRef = 5; + if (paramsCount % paramsPerMemRef != 0) { + abort(); + } + const DescriptorSetIndex setIndex = 0; + BindingIndex bindIndex = 0; + for (size_t i = 0; i < paramsCount; i += paramsPerMemRef) { + auto memref = static_cast *>(params[i]); + bindMemRef(manager, setIndex, bindIndex, memref); + ++bindIndex; + } + + manager->setNumWorkGroups(NumWorkGroups{static_cast(gridX), + static_cast(gridY), + static_cast(gridZ)}); + + auto function = static_cast(vkKernel); + // Expected size should be in bytes. + manager->setShaderModule( + function->module->blobData(), + static_cast(function->module->blobSizeInBytes())); + manager->setEntryPoint(function->name.c_str()); + + manager->runOnVulkan(); +} + +//===----------------------------------------------------------------------===// +// +// Old wrappers, intended for mlir-vulkan-runner. Calls to these are generated +// by LaunchFuncToVulkanCallsPass. +// +//===----------------------------------------------------------------------===// + /// Initializes `VulkanRuntimeManager` and returns a pointer to it. 
VULKAN_WRAPPER_SYMBOL_EXPORT void *initVulkan() { return new VulkanRuntimeManager(); diff --git a/mlir/tools/tblgen-to-irdl/OpDefinitionsGen.cpp b/mlir/tools/tblgen-to-irdl/OpDefinitionsGen.cpp index a763105fa0fd6..4e354e535dd3a 100644 --- a/mlir/tools/tblgen-to-irdl/OpDefinitionsGen.cpp +++ b/mlir/tools/tblgen-to-irdl/OpDefinitionsGen.cpp @@ -107,15 +107,15 @@ std::optional recordToType(MLIRContext *ctx, const Record &predRec) { auto width = predRec.getValueAsInt("bitwidth"); switch (width) { case 16: - return FloatType::getF16(ctx); + return Float16Type::get(ctx); case 32: - return FloatType::getF32(ctx); + return Float32Type::get(ctx); case 64: - return FloatType::getF64(ctx); + return Float64Type::get(ctx); case 80: - return FloatType::getF80(ctx); + return Float80Type::get(ctx); case 128: - return FloatType::getF128(ctx); + return Float128Type::get(ctx); } } @@ -124,39 +124,39 @@ std::optional recordToType(MLIRContext *ctx, const Record &predRec) { } if (predRec.getName() == "BF16") { - return FloatType::getBF16(ctx); + return BFloat16Type::get(ctx); } if (predRec.getName() == "TF32") { - return FloatType::getTF32(ctx); + return FloatTF32Type::get(ctx); } if (predRec.getName() == "F8E4M3FN") { - return FloatType::getFloat8E4M3FN(ctx); + return Float8E4M3FNType::get(ctx); } if (predRec.getName() == "F8E5M2") { - return FloatType::getFloat8E5M2(ctx); + return Float8E5M2Type::get(ctx); } if (predRec.getName() == "F8E4M3") { - return FloatType::getFloat8E4M3(ctx); + return Float8E4M3Type::get(ctx); } if (predRec.getName() == "F8E4M3FNUZ") { - return FloatType::getFloat8E4M3FNUZ(ctx); + return Float8E4M3FNUZType::get(ctx); } if (predRec.getName() == "F8E4M3B11FNUZ") { - return FloatType::getFloat8E4M3B11FNUZ(ctx); + return Float8E4M3B11FNUZType::get(ctx); } if (predRec.getName() == "F8E5M2FNUZ") { - return FloatType::getFloat8E5M2FNUZ(ctx); + return Float8E5M2FNUZType::get(ctx); } if (predRec.getName() == "F8E3M4") { - return FloatType::getFloat8E3M4(ctx); + return 
Float8E3M4Type::get(ctx); } if (predRec.isSubClassOf("Complex")) { diff --git a/mlir/unittests/IR/AttributeTest.cpp b/mlir/unittests/IR/AttributeTest.cpp index b6033a38d0d4f..9203248a83baf 100644 --- a/mlir/unittests/IR/AttributeTest.cpp +++ b/mlir/unittests/IR/AttributeTest.cpp @@ -154,7 +154,7 @@ TEST(DenseSplatTest, IntAttrSplat) { TEST(DenseSplatTest, F32Splat) { MLIRContext context; - FloatType floatTy = FloatType::getF32(&context); + FloatType floatTy = Float32Type::get(&context); float value = 10.0; testSplat(floatTy, value); @@ -162,7 +162,7 @@ TEST(DenseSplatTest, F32Splat) { TEST(DenseSplatTest, F64Splat) { MLIRContext context; - FloatType floatTy = FloatType::getF64(&context); + FloatType floatTy = Float64Type::get(&context); double value = 10.0; testSplat(floatTy, APFloat(value)); @@ -170,7 +170,7 @@ TEST(DenseSplatTest, F64Splat) { TEST(DenseSplatTest, FloatAttrSplat) { MLIRContext context; - FloatType floatTy = FloatType::getF32(&context); + FloatType floatTy = Float32Type::get(&context); Attribute value = FloatAttr::get(floatTy, 10.0); testSplat(floatTy, value); @@ -178,7 +178,7 @@ TEST(DenseSplatTest, FloatAttrSplat) { TEST(DenseSplatTest, BF16Splat) { MLIRContext context; - FloatType floatTy = FloatType::getBF16(&context); + FloatType floatTy = BFloat16Type::get(&context); Attribute value = FloatAttr::get(floatTy, 10.0); testSplat(floatTy, value); @@ -204,7 +204,7 @@ TEST(DenseSplatTest, StringAttrSplat) { TEST(DenseComplexTest, ComplexFloatSplat) { MLIRContext context; - ComplexType complexType = ComplexType::get(FloatType::getF32(&context)); + ComplexType complexType = ComplexType::get(Float32Type::get(&context)); std::complex value(10.0, 15.0); testSplat(complexType, value); } @@ -218,7 +218,7 @@ TEST(DenseComplexTest, ComplexIntSplat) { TEST(DenseComplexTest, ComplexAPFloatSplat) { MLIRContext context; - ComplexType complexType = ComplexType::get(FloatType::getF32(&context)); + ComplexType complexType = 
ComplexType::get(Float32Type::get(&context)); std::complex value(APFloat(10.0f), APFloat(15.0f)); testSplat(complexType, value); } @@ -409,7 +409,7 @@ TEST(SparseElementsAttrTest, GetZero) { context.allowUnregisteredDialects(); IntegerType intTy = IntegerType::get(&context, 32); - FloatType floatTy = FloatType::getF32(&context); + FloatType floatTy = Float32Type::get(&context); Type stringTy = OpaqueType::get(StringAttr::get(&context, "test"), "string"); ShapedType tensorI32 = RankedTensorType::get({2, 2}, intTy); diff --git a/mlir/unittests/IR/ShapedTypeTest.cpp b/mlir/unittests/IR/ShapedTypeTest.cpp index 7a5b0722a03ba..c2900b5aaeeeb 100644 --- a/mlir/unittests/IR/ShapedTypeTest.cpp +++ b/mlir/unittests/IR/ShapedTypeTest.cpp @@ -24,7 +24,7 @@ TEST(ShapedTypeTest, CloneMemref) { MLIRContext context; Type i32 = IntegerType::get(&context, 32); - Type f32 = FloatType::getF32(&context); + Type f32 = Float32Type::get(&context); Attribute memSpace = IntegerAttr::get(IntegerType::get(&context, 64), 7); Type memrefOriginalType = i32; llvm::SmallVector memrefOriginalShape({10, 20}); @@ -71,7 +71,7 @@ TEST(ShapedTypeTest, CloneTensor) { MLIRContext context; Type i32 = IntegerType::get(&context, 32); - Type f32 = FloatType::getF32(&context); + Type f32 = Float32Type::get(&context); Type tensorOriginalType = i32; llvm::SmallVector tensorOriginalShape({10, 20}); @@ -111,7 +111,7 @@ TEST(ShapedTypeTest, CloneVector) { MLIRContext context; Type i32 = IntegerType::get(&context, 32); - Type f32 = FloatType::getF32(&context); + Type f32 = Float32Type::get(&context); Type vectorOriginalType = i32; llvm::SmallVector vectorOriginalShape({10, 20}); @@ -134,7 +134,7 @@ TEST(ShapedTypeTest, CloneVector) { TEST(ShapedTypeTest, VectorTypeBuilder) { MLIRContext context; - Type f32 = FloatType::getF32(&context); + Type f32 = Float32Type::get(&context); SmallVector shape{2, 4, 8, 9, 1}; SmallVector scalableDims{true, false, true, false, false}; @@ -192,7 +192,7 @@ TEST(ShapedTypeTest, 
VectorTypeBuilder) { TEST(ShapedTypeTest, RankedTensorTypeBuilder) { MLIRContext context; - Type f32 = FloatType::getF32(&context); + Type f32 = Float32Type::get(&context); SmallVector shape{2, 4, 8, 16, 32}; RankedTensorType tensorType = RankedTensorType::get(shape, f32); @@ -254,7 +254,7 @@ class TensorWithString : public RankedTensorType { TEST(ShapedTypeTest, RankedTensorTypeView) { MLIRContext context; - Type f32 = FloatType::getF32(&context); + Type f32 = Float32Type::get(&context); Type noEncodingRankedTensorType = RankedTensorType::get({10, 20}, f32); diff --git a/offload/DeviceRTL/include/Synchronization.h b/offload/DeviceRTL/include/Synchronization.h index e1968675550d4..a4d4fc08837b2 100644 --- a/offload/DeviceRTL/include/Synchronization.h +++ b/offload/DeviceRTL/include/Synchronization.h @@ -66,7 +66,7 @@ V add(Ty *Address, V Val, atomic::OrderingTy Ordering) { template > V load(Ty *Address, atomic::OrderingTy Ordering) { - return add(Address, Ty(0), Ordering); + return __scoped_atomic_load_n(Address, Ordering, __MEMORY_SCOPE_DEVICE); } template > diff --git a/offload/cmake/caches/AMDGPUBot.cmake b/offload/cmake/caches/AMDGPUBot.cmake index d72b620ae3080..69bef91b2ce49 100644 --- a/offload/cmake/caches/AMDGPUBot.cmake +++ b/offload/cmake/caches/AMDGPUBot.cmake @@ -1,17 +1,19 @@ -# This file is meant for test builds on one basic AMDGPU buildbot only. 
+# This file is used across all AMDGPU-cmake builders # Install directory set to /tmp as this is a bot config set(CMAKE_INSTALL_PREFIX /tmp/llvm.install.test CACHE STRING "") +# General settings +set(CMAKE_BUILD_TYPE Release CACHE STRING "") +set(CMAKE_C_COMPILER_LAUNCHER ccache CACHE STRING "") +set(CMAKE_CXX_COMPILER_LAUNCHER ccache CACHE STRING "") + set(LLVM_ENABLE_PROJECTS "clang;lld" CACHE STRING "") set(LLVM_ENABLE_RUNTIMES "compiler-rt;openmp;offload" CACHE STRING "") + set(LLVM_ENABLE_PER_TARGET_RUNTIME_DIR ON CACHE BOOL "") set(LLVM_ENABLE_ASSERTIONS ON CACHE BOOL "") -set(LLVM_LIT_ARGS "-v --show-unsupported --timeout 100 --show-xfail -j 32" CACHE STRING "") set(LLVM_TARGETS_TO_BUILD "host;AMDGPU" CACHE STRING "") +set(LLVM_LIT_ARGS "-v --show-unsupported --timeout 100 --show-xfail -j 32" CACHE STRING "") set(CLANG_DEFAULT_LINKER "lld" CACHE STRING "") - -set(CMAKE_BUILD_TYPE Release CACHE STRING "") -set(CMAKE_C_COMPILER_LAUNCHER ccache CACHE STRING "") -set(CMAKE_CXX_COMPILER_LAUNCHER ccache CACHE STRING "") diff --git a/openmp/runtime/src/z_Linux_asm.S b/openmp/runtime/src/z_Linux_asm.S index cc5344cdd124a..0bf9f07a13f14 100644 --- a/openmp/runtime/src/z_Linux_asm.S +++ b/openmp/runtime/src/z_Linux_asm.S @@ -19,6 +19,16 @@ #if KMP_ARCH_X86 || KMP_ARCH_X86_64 +# if defined(__ELF__) && defined(__CET__) && defined(__has_include) +# if __has_include() +# include +# endif +# endif + +# if !defined(_CET_ENDBR) +# define _CET_ENDBR +# endif + # if KMP_MIC // the 'delay r16/r32/r64' should be used instead of the 'pause'. 
// The delay operation has the effect of removing the current thread from @@ -66,6 +76,7 @@ ALIGN 4 .globl KMP_PREFIX_UNDERSCORE($0) KMP_PREFIX_UNDERSCORE($0): + _CET_ENDBR .endmacro # else // KMP_OS_DARWIN # define KMP_PREFIX_UNDERSCORE(x) x //no extra underscore for Linux* OS symbols @@ -92,6 +103,7 @@ KMP_PREFIX_UNDERSCORE($0): .globl KMP_PREFIX_UNDERSCORE(\proc) KMP_PREFIX_UNDERSCORE(\proc): .cfi_startproc + _CET_ENDBR .endm .macro KMP_CFI_DEF_OFFSET sz .cfi_def_cfa_offset \sz diff --git a/utils/bazel/WORKSPACE b/utils/bazel/WORKSPACE index 69373ed574e2a..eeb1c692ac871 100644 --- a/utils/bazel/WORKSPACE +++ b/utils/bazel/WORKSPACE @@ -5,11 +5,11 @@ load("@bazel_tools//tools/build_defs/repo:http.bzl", "http_archive") load("@bazel_tools//tools/build_defs/repo:utils.bzl", "maybe") -SKYLIB_VERSION = "1.3.0" +SKYLIB_VERSION = "1.7.1" http_archive( name = "bazel_skylib", - sha256 = "74d544d96f4a5bb630d465ca8bbcfe231e3594e5aae57e1edbf17a6eb3ca2506", + sha256 = "bc283cdfcd526a52c3201279cda4bc298652efa898b10b4db0837dc51652756f", urls = [ "https://mirror.bazel.build/github.com/bazelbuild/bazel-skylib/releases/download/{version}/bazel-skylib-{version}.tar.gz".format(version = SKYLIB_VERSION), "https://github.com/bazelbuild/bazel-skylib/releases/download/{version}/bazel-skylib-{version}.tar.gz".format(version = SKYLIB_VERSION), @@ -18,9 +18,9 @@ http_archive( http_archive( name = "rules_cc", - urls = ["https://github.com/bazelbuild/rules_cc/releases/download/0.0.17/rules_cc-0.0.17.tar.gz"], sha256 = "abc605dd850f813bb37004b77db20106a19311a96b2da1c92b789da529d28fe1", strip_prefix = "rules_cc-0.0.17", + urls = ["https://github.com/bazelbuild/rules_cc/releases/download/0.0.17/rules_cc-0.0.17.tar.gz"], ) new_local_repository( @@ -158,9 +158,9 @@ maybe( maybe( http_archive, name = "robin_map", - strip_prefix = "robin-map-1.3.0", - sha256 = "a8424ad3b0affd4c57ed26f0f3d8a29604f0e1f2ef2089f497f614b1c94c7236", build_file = "@llvm-raw//utils/bazel/third_party_build:robin_map.BUILD", 
+ sha256 = "a8424ad3b0affd4c57ed26f0f3d8a29604f0e1f2ef2089f497f614b1c94c7236", + strip_prefix = "robin-map-1.3.0", url = "https://github.com/Tessil/robin-map/archive/refs/tags/v1.3.0.tar.gz", ) diff --git a/utils/bazel/llvm-project-overlay/clang/BUILD.bazel b/utils/bazel/llvm-project-overlay/clang/BUILD.bazel index 47e632098a41b..e3f4fab2c3fdb 100644 --- a/utils/bazel/llvm-project-overlay/clang/BUILD.bazel +++ b/utils/bazel/llvm-project-overlay/clang/BUILD.bazel @@ -43,10 +43,7 @@ cc_binary( gentbl( name = "diagnostic_defs_gen", - tbl_outs = [( - "-gen-clang-diags-defs -clang-component=%s" % c, - "include/clang/Basic/Diagnostic%sKinds.inc" % c, - ) for c in [ + tbl_outs = [out for c in [ "AST", "Analysis", "Comment", @@ -60,6 +57,15 @@ gentbl( "Refactoring", "Sema", "Serialization", + ] for out in [ + ( + "-gen-clang-diags-defs -clang-component=%s" % c, + "include/clang/Basic/Diagnostic%sKinds.inc" % c, + ), + ( + "-gen-clang-diags-enums -clang-component=%s" % c, + "include/clang/Basic/Diagnostic%sEnums.inc" % c, + ), ]] + [ ( "-gen-clang-diag-groups", @@ -660,19 +666,6 @@ py_binary( main = "utils/bundle_resources.py", ) -# A hacky library to expose some internal headers of the `basic` library to its -# own implementation source files using a stripped include prefix rather than -# file-relative-inclusion. This is inherently non-modular as these headers will -# be repeated in the sources below for file-relative-inclusion. 
-cc_library( - name = "basic_internal_headers", - hdrs = glob([ - "lib/Basic/*.h", - ]), - features = ["-header_modules"], - strip_include_prefix = "lib/Basic", -) - cc_library( name = "basic", srcs = [ @@ -690,10 +683,12 @@ cc_library( copts = [ "-DHAVE_VCS_VERSION_INC", "$(STACK_FRAME_UNLIMITED)", + "-I$(WORKSPACE_ROOT)/clang/lib/Basic", ], includes = ["include"], textual_hdrs = [ # keep sorted + "include/clang/Basic/AllDiagnosticKinds.inc", "include/clang/Basic/AttrHasAttributeImpl.inc", "include/clang/Basic/AttrList.inc", "include/clang/Basic/AttrSubMatchRulesList.inc", @@ -706,6 +701,7 @@ cc_library( "include/clang/Basic/DiagnosticFrontendKinds.inc", "include/clang/Basic/DiagnosticGroups.inc", "include/clang/Basic/DiagnosticIndexName.inc", + "include/clang/Basic/DiagnosticInstallAPIKinds.inc", "include/clang/Basic/DiagnosticLexKinds.inc", "include/clang/Basic/DiagnosticParseKinds.inc", "include/clang/Basic/DiagnosticRefactoringKinds.inc", @@ -723,6 +719,7 @@ cc_library( ] + glob([ "include/clang/Basic/*.def", ]), + toolchains = [":workspace_root"], deps = [ ":basic_arm_cde_gen", ":basic_arm_fp16_inc_gen", @@ -742,7 +739,6 @@ cc_library( ":basic_builtins_spirv_gen", ":basic_builtins_x86_64_gen", ":basic_builtins_x86_gen", - ":basic_internal_headers", ":basic_riscv_sifive_vector_builtins_gen", ":basic_riscv_vector_builtin_cg_gen", ":basic_riscv_vector_builtins_gen", @@ -1453,7 +1449,6 @@ cc_library( hdrs = glob(["include/clang/Tooling/DependencyScanning/**/*.h"]), deps = [ ":basic", - ":codegen", ":driver", ":frontend", ":lex", @@ -1515,13 +1510,7 @@ cc_library( "lib/Format/*.h", ], ), - hdrs = [ - "lib/Format/FormatTokenLexer.h", - "lib/Format/FormatTokenSource.h", - "lib/Format/Macros.h", - "lib/Format/QualifierAlignmentFixer.h", - "lib/Format/UnwrappedLineParser.h", - ] + glob([ + hdrs = glob([ "include/clang/Format/*.h", ]), includes = ["include"], @@ -1951,14 +1940,12 @@ cc_library( "lib/Interpreter/*.cpp", "lib/Interpreter/*.h", ], - exclude = 
["lib/Interpreter/Wasm.cpp"], - ), - hdrs = glob( - [ - "include/clang/Interpreter/*.h", + exclude = [ + "lib/Interpreter/Wasm.cpp", + "lib/Interpreter/Wasm.h", ], - exclude = ["lib/Interpreter/Wasm.cpp"], ), + hdrs = glob(["include/clang/Interpreter/*.h"]), includes = ["include"], deps = [ ":analysis", @@ -2025,7 +2012,6 @@ cc_library( "//llvm:BitWriter", "//llvm:BitstreamReader", "//llvm:CodeGen", - "//llvm:CodeGenTypes", "//llvm:Core", "//llvm:Coroutines", "//llvm:Coverage", @@ -2148,7 +2134,6 @@ cc_library( "include/clang/Serialization/AttrPCHRead.inc", "include/clang/Serialization/AttrPCHWrite.inc", ] + glob([ - "include/clang/Frontend/*.h", "lib/Serialization/*.cpp", "lib/Serialization/*.h", ]), @@ -2160,15 +2145,11 @@ cc_library( "include/clang/Serialization/*.def", ]), deps = [ - ":apinotes", ":ast", ":basic", - ":driver", ":lex", ":sema", ":serialization_attr_gen", - ":static_analyzer_core_options", - ":support", ":type_nodes_gen", "//llvm:BitReader", "//llvm:BitWriter", @@ -2763,6 +2744,30 @@ cc_library( ], ) +proto_library( + name = "cxx-proto", + srcs = ["tools/clang-fuzzer/cxx_proto.proto"], +) + +cc_proto_library( + name = "cxx_cc_proto", + deps = [":cxx-proto"], +) + +cc_library( + name = "proto-to-cxx-lib", + srcs = ["tools/clang-fuzzer/proto-to-cxx/proto_to_cxx.cpp"], + hdrs = ["tools/clang-fuzzer/proto-to-cxx/proto_to_cxx.h"], + includes = ["tools/clang-fuzzer"], + deps = [":cxx_cc_proto"], +) + +cc_binary( + name = "clang-proto-to-cxx", + srcs = ["tools/clang-fuzzer/proto-to-cxx/proto_to_cxx_main.cpp"], + deps = [":proto-to-cxx-lib"], +) + cc_library( name = "clang-fuzzer-initialize", srcs = ["tools/clang-fuzzer/fuzzer-initialize/fuzzer_initialize.cpp"], diff --git a/utils/bazel/llvm-project-overlay/llvm/include/llvm/Config/config.h b/utils/bazel/llvm-project-overlay/llvm/include/llvm/Config/config.h index 74b4eca0889a7..7a8e14e06ddc5 100644 --- a/utils/bazel/llvm-project-overlay/llvm/include/llvm/Config/config.h +++ 
b/utils/bazel/llvm-project-overlay/llvm/include/llvm/Config/config.h @@ -84,15 +84,6 @@ /* Define if __unw_add_dynamic_fde() is available on this platform. */ /* HAVE_UNW_ADD_DYNAMIC_FDE defined in Bazel */ -/* Define to 1 if you have the header file. */ -#define HAVE_ERRNO_H 1 - -/* Define to 1 if you have the header file. */ -#define HAVE_FCNTL_H 1 - -/* Define to 1 if you have the header file. */ -#define HAVE_FENV_H 1 - /* Define if libffi is available on this platform. */ /* #undef HAVE_FFI_CALL */ @@ -111,9 +102,6 @@ /* Define to 1 if you have the `getpagesize' function. */ #define HAVE_GETPAGESIZE 1 -/* Define to 1 if you have the `getrlimit' function. */ -#define HAVE_GETRLIMIT 1 - /* Define to 1 if you have the `getrusage' function. */ #define HAVE_GETRUSAGE 1 @@ -177,45 +165,24 @@ /* Define to 1 if you have the `setenv' function. */ /* HAVE_SETENV defined in Bazel */ -/* Define to 1 if you have the `setrlimit' function. */ -#define HAVE_SETRLIMIT 1 - /* Define to 1 if you have the `sigaltstack' function. */ #define HAVE_SIGALTSTACK 1 -/* Define to 1 if you have the header file. */ -#define HAVE_SIGNAL_H 1 - /* Define to 1 if you have the `strerror_r' function. */ /* HAVE_STRERROR_R defined in Bazel */ /* Define to 1 if you have the `sysconf' function. */ #define HAVE_SYSCONF 1 -/* Define to 1 if you have the header file. */ -#define HAVE_SYS_IOCTL_H 1 - /* Define to 1 if you have the header file. */ #define HAVE_SYS_MMAN_H 1 -/* Define to 1 if you have the header file. */ -#define HAVE_SYS_RESOURCE_H 1 - -/* Define to 1 if you have the header file. */ -#define HAVE_SYS_STAT_H 1 - -/* Define to 1 if you have the header file. */ -#define HAVE_SYS_TIME_H 1 - /* Define to 1 if stat struct has st_mtimespec member .*/ /* #undef HAVE_STRUCT_STAT_ST_MTIMESPEC_TV_NSEC */ /* Define to 1 if stat struct has st_mtim member. */ /* HAVE_STRUCT_STAT_ST_MTIM_TV_NSEC defined in Bazel */ -/* Define to 1 if you have the header file. 
*/ -#define HAVE_TERMIOS_H 1 - /* Define to 1 if you have the header file. */ /* HAVE_UNISTD_H defined in Bazel */ diff --git a/utils/bazel/llvm-project-overlay/mlir/test/BUILD.bazel b/utils/bazel/llvm-project-overlay/mlir/test/BUILD.bazel index 873fb2d18bfb2..f2e32b1b8c6db 100644 --- a/utils/bazel/llvm-project-overlay/mlir/test/BUILD.bazel +++ b/utils/bazel/llvm-project-overlay/mlir/test/BUILD.bazel @@ -549,9 +549,12 @@ cc_library( "//mlir:ConvertToSPIRV", "//mlir:FuncDialect", "//mlir:GPUDialect", + "//mlir:GPUToGPURuntimeTransforms", "//mlir:GPUToSPIRV", "//mlir:GPUTransforms", "//mlir:IR", + "//mlir:LLVMIRTransforms", + "//mlir:MemRefToLLVM", "//mlir:MemRefTransforms", "//mlir:Pass", "//mlir:SPIRVDialect", diff --git a/utils/bazel/llvm_configs/config.h.cmake b/utils/bazel/llvm_configs/config.h.cmake index 3e6b94dfbe545..1d2d00a3b758b 100644 --- a/utils/bazel/llvm_configs/config.h.cmake +++ b/utils/bazel/llvm_configs/config.h.cmake @@ -72,15 +72,6 @@ /* Define if __unw_add_dynamic_fde() is available on this platform. */ #cmakedefine HAVE_UNW_ADD_DYNAMIC_FDE ${HAVE_UNW_ADD_DYNAMIC_FDE} -/* Define to 1 if you have the header file. */ -#cmakedefine HAVE_ERRNO_H ${HAVE_ERRNO_H} - -/* Define to 1 if you have the header file. */ -#cmakedefine HAVE_FCNTL_H ${HAVE_FCNTL_H} - -/* Define to 1 if you have the header file. */ -#cmakedefine HAVE_FENV_H ${HAVE_FENV_H} - /* Define if libffi is available on this platform. */ #cmakedefine HAVE_FFI_CALL ${HAVE_FFI_CALL} @@ -99,9 +90,6 @@ /* Define to 1 if you have the `getpagesize' function. */ #cmakedefine HAVE_GETPAGESIZE ${HAVE_GETPAGESIZE} -/* Define to 1 if you have the `getrlimit' function. */ -#cmakedefine HAVE_GETRLIMIT ${HAVE_GETRLIMIT} - /* Define to 1 if you have the `getrusage' function. */ #cmakedefine HAVE_GETRUSAGE ${HAVE_GETRUSAGE} @@ -174,45 +162,24 @@ /* Define to 1 if you have the `setenv' function. */ #cmakedefine HAVE_SETENV ${HAVE_SETENV} -/* Define to 1 if you have the `setrlimit' function. 
*/ -#cmakedefine HAVE_SETRLIMIT ${HAVE_SETRLIMIT} - /* Define to 1 if you have the `sigaltstack' function. */ #cmakedefine HAVE_SIGALTSTACK ${HAVE_SIGALTSTACK} -/* Define to 1 if you have the header file. */ -#cmakedefine HAVE_SIGNAL_H ${HAVE_SIGNAL_H} - /* Define to 1 if you have the `strerror_r' function. */ #cmakedefine HAVE_STRERROR_R ${HAVE_STRERROR_R} /* Define to 1 if you have the `sysconf' function. */ #cmakedefine HAVE_SYSCONF ${HAVE_SYSCONF} -/* Define to 1 if you have the header file. */ -#cmakedefine HAVE_SYS_IOCTL_H ${HAVE_SYS_IOCTL_H} - /* Define to 1 if you have the header file. */ #cmakedefine HAVE_SYS_MMAN_H ${HAVE_SYS_MMAN_H} -/* Define to 1 if you have the header file. */ -#cmakedefine HAVE_SYS_RESOURCE_H ${HAVE_SYS_RESOURCE_H} - -/* Define to 1 if you have the header file. */ -#cmakedefine HAVE_SYS_STAT_H ${HAVE_SYS_STAT_H} - -/* Define to 1 if you have the header file. */ -#cmakedefine HAVE_SYS_TIME_H ${HAVE_SYS_TIME_H} - /* Define to 1 if stat struct has st_mtimespec member .*/ #cmakedefine HAVE_STRUCT_STAT_ST_MTIMESPEC_TV_NSEC ${HAVE_STRUCT_STAT_ST_MTIMESPEC_TV_NSEC} /* Define to 1 if stat struct has st_mtim member. */ #cmakedefine HAVE_STRUCT_STAT_ST_MTIM_TV_NSEC ${HAVE_STRUCT_STAT_ST_MTIM_TV_NSEC} -/* Define to 1 if you have the header file. */ -#cmakedefine HAVE_TERMIOS_H ${HAVE_TERMIOS_H} - /* Define to 1 if you have the header file. */ #cmakedefine HAVE_UNISTD_H ${HAVE_UNISTD_H}