diff --git a/bolt/CMakeLists.txt b/bolt/CMakeLists.txt index 9ac196ad0e821..04db160b64b05 100644 --- a/bolt/CMakeLists.txt +++ b/bolt/CMakeLists.txt @@ -163,8 +163,8 @@ if (BOLT_ENABLE_RUNTIME) add_llvm_install_targets(install-bolt_rt DEPENDS bolt_rt bolt COMPONENT bolt) - set(LIBBOLT_RT_INSTR "${CMAKE_CURRENT_BINARY_DIR}/bolt_rt-bins/lib/libbolt_rt_instr.a") - set(LIBBOLT_RT_HUGIFY "${CMAKE_CURRENT_BINARY_DIR}/bolt_rt-bins/lib/libbolt_rt_hugify.a") + set(LIBBOLT_RT_INSTR "${CMAKE_CURRENT_BINARY_DIR}/bolt_rt-bins/lib${LLVM_LIBDIR_SUFFIX}/libbolt_rt_instr.a") + set(LIBBOLT_RT_HUGIFY "${CMAKE_CURRENT_BINARY_DIR}/bolt_rt-bins/lib${LLVM_LIBDIR_SUFFIX}/libbolt_rt_hugify.a") endif() find_program(GNU_LD_EXECUTABLE NAMES ${LLVM_DEFAULT_TARGET_TRIPLE}-ld.bfd ld.bfd DOC "GNU ld") diff --git a/bolt/lib/Passes/Inliner.cpp b/bolt/lib/Passes/Inliner.cpp index f004a8eeea185..1793f4ff1f148 100644 --- a/bolt/lib/Passes/Inliner.cpp +++ b/bolt/lib/Passes/Inliner.cpp @@ -310,13 +310,13 @@ Inliner::inlineCall(BinaryBasicBlock &CallerBB, if (MIB.isPseudo(Inst)) continue; - MIB.stripAnnotations(Inst, /*KeepTC=*/BC.isX86()); + MIB.stripAnnotations(Inst, /*KeepTC=*/BC.isX86() || BC.isAArch64()); // Fix branch target. Strictly speaking, we don't have to do this as // targets of direct branches will be fixed later and don't matter // in the CFG state. However, disassembly may look misleading, and // hence we do the fixing. - if (MIB.isBranch(Inst)) { + if (MIB.isBranch(Inst) && !MIB.isTailCall(Inst)) { assert(!MIB.isIndirectBranch(Inst) && "unexpected indirect branch in callee"); const BinaryBasicBlock *TargetBB = diff --git a/bolt/lib/Target/AArch64/AArch64MCPlusBuilder.cpp b/bolt/lib/Target/AArch64/AArch64MCPlusBuilder.cpp index d752751c17932..d84da10b5bbe6 100644 --- a/bolt/lib/Target/AArch64/AArch64MCPlusBuilder.cpp +++ b/bolt/lib/Target/AArch64/AArch64MCPlusBuilder.cpp @@ -133,6 +133,36 @@ class AArch64MCPlusBuilder : public MCPlusBuilder { public: using MCPlusBuilder::MCPlusBuilder; + MCPhysReg getStackPointer() const override { return AArch64::SP; } + + bool isPush(const MCInst &Inst) const override { return false; } + + bool isPop(const MCInst &Inst) const override { return false; } + + void createCall(MCInst &Inst, const MCSymbol *Target, + MCContext *Ctx) override { + createDirectCall(Inst, Target, Ctx, false); + } + + bool convertTailCallToCall(MCInst &Inst) override { + int NewOpcode; + switch (Inst.getOpcode()) { + default: + return false; + case AArch64::B: + NewOpcode = AArch64::BL; + break; + case AArch64::BR: + NewOpcode = AArch64::BLR; + break; + } + + Inst.setOpcode(NewOpcode); + removeAnnotation(Inst, MCPlus::MCAnnotation::kTailCall); + clearOffset(Inst); + return true; + } + bool equals(const MCTargetExpr &A, const MCTargetExpr &B, CompFuncTy Comp) const override { const auto &AArch64ExprA = cast(A); diff --git a/bolt/runtime/CMakeLists.txt b/bolt/runtime/CMakeLists.txt index 40f4fbc9f30d5..0deb69a27d435 100644 --- a/bolt/runtime/CMakeLists.txt +++ b/bolt/runtime/CMakeLists.txt @@ -16,18 +16,18 @@ add_library(bolt_rt_instr STATIC instr.cpp ${CMAKE_CURRENT_BINARY_DIR}/config.h ) -set_target_properties(bolt_rt_instr PROPERTIES ARCHIVE_OUTPUT_DIRECTORY "${CMAKE_INSTALL_LIBDIR}${LLVM_LIBDIR_SUFFIX}") +set_target_properties(bolt_rt_instr PROPERTIES ARCHIVE_OUTPUT_DIRECTORY "lib${LLVM_LIBDIR_SUFFIX}") add_library(bolt_rt_hugify STATIC hugify.cpp ${CMAKE_CURRENT_BINARY_DIR}/config.h ) -set_target_properties(bolt_rt_hugify PROPERTIES ARCHIVE_OUTPUT_DIRECTORY 
"${CMAKE_INSTALL_LIBDIR}${LLVM_LIBDIR_SUFFIX}") +set_target_properties(bolt_rt_hugify PROPERTIES ARCHIVE_OUTPUT_DIRECTORY "lib${LLVM_LIBDIR_SUFFIX}") if(NOT BOLT_BUILT_STANDALONE) add_custom_command(TARGET bolt_rt_instr POST_BUILD - COMMAND ${CMAKE_COMMAND} -E copy "${CMAKE_CURRENT_BINARY_DIR}/lib/libbolt_rt_instr.a" "${LLVM_LIBRARY_DIR}") + COMMAND ${CMAKE_COMMAND} -E copy "${CMAKE_CURRENT_BINARY_DIR}/lib${LLVM_LIBDIR_SUFFIX}/libbolt_rt_instr.a" "${LLVM_LIBRARY_DIR}") add_custom_command(TARGET bolt_rt_hugify POST_BUILD - COMMAND ${CMAKE_COMMAND} -E copy "${CMAKE_CURRENT_BINARY_DIR}/lib/libbolt_rt_hugify.a" "${LLVM_LIBRARY_DIR}") + COMMAND ${CMAKE_COMMAND} -E copy "${CMAKE_CURRENT_BINARY_DIR}/lib${LLVM_LIBDIR_SUFFIX}/libbolt_rt_hugify.a" "${LLVM_LIBRARY_DIR}") endif() set(BOLT_RT_FLAGS @@ -53,23 +53,23 @@ target_include_directories(bolt_rt_instr PRIVATE ${CMAKE_CURRENT_BINARY_DIR}) target_compile_options(bolt_rt_hugify PRIVATE ${BOLT_RT_FLAGS}) target_include_directories(bolt_rt_hugify PRIVATE ${CMAKE_CURRENT_BINARY_DIR}) -install(TARGETS bolt_rt_instr DESTINATION "${CMAKE_INSTALL_LIBDIR}${LLVM_LIBDIR_SUFFIX}") -install(TARGETS bolt_rt_hugify DESTINATION "${CMAKE_INSTALL_LIBDIR}${LLVM_LIBDIR_SUFFIX}") +install(TARGETS bolt_rt_instr DESTINATION "lib${LLVM_LIBDIR_SUFFIX}") +install(TARGETS bolt_rt_hugify DESTINATION "lib${LLVM_LIBDIR_SUFFIX}") if (CMAKE_CXX_COMPILER_ID MATCHES ".*Clang.*" AND CMAKE_SYSTEM_NAME STREQUAL "Darwin") add_library(bolt_rt_instr_osx STATIC instr.cpp ${CMAKE_CURRENT_BINARY_DIR}/config.h ) - set_target_properties(bolt_rt_instr_osx PROPERTIES ARCHIVE_OUTPUT_DIRECTORY "${CMAKE_INSTALL_LIBDIR}${LLVM_LIBDIR_SUFFIX}") + set_target_properties(bolt_rt_instr_osx PROPERTIES ARCHIVE_OUTPUT_DIRECTORY "lib${LLVM_LIBDIR_SUFFIX}") target_include_directories(bolt_rt_instr_osx PRIVATE ${CMAKE_CURRENT_BINARY_DIR}) target_compile_options(bolt_rt_instr_osx PRIVATE -target x86_64-apple-darwin19.6.0 ${BOLT_RT_FLAGS}) - install(TARGETS bolt_rt_instr_osx DESTINATION "${CMAKE_INSTALL_LIBDIR}${LLVM_LIBDIR_SUFFIX}") + install(TARGETS bolt_rt_instr_osx DESTINATION "lib${LLVM_LIBDIR_SUFFIX}") if(NOT BOLT_BUILT_STANDALONE) add_custom_command(TARGET bolt_rt_instr_osx POST_BUILD - COMMAND ${CMAKE_COMMAND} -E copy "${CMAKE_CURRENT_BINARY_DIR}/lib/libbolt_rt_instr_osx.a" "${LLVM_LIBRARY_DIR}") + COMMAND ${CMAKE_COMMAND} -E copy "${CMAKE_CURRENT_BINARY_DIR}/lib${LLVM_LIBDIR_SUFFIX}/libbolt_rt_instr_osx.a" "${LLVM_LIBRARY_DIR}") endif() endif() diff --git a/bolt/test/AArch64/inline-small-function-1.s b/bolt/test/AArch64/inline-small-function-1.s new file mode 100644 index 0000000000000..3ea22a9915fb4 --- /dev/null +++ b/bolt/test/AArch64/inline-small-function-1.s @@ -0,0 +1,42 @@ +## This test checks that inline is properly handled by BOLT on aarch64. + +# REQUIRES: system-linux + +# RUN: llvm-mc -filetype=obj -triple aarch64-unknown-unknown %s -o %t.o +# RUN: %clang %cflags -O0 %t.o -o %t.exe -Wl,-q +# RUN: llvm-bolt --inline-small-functions --print-inline --print-only=_Z3barP1A \ +# RUN: %t.exe -o %t.bolt | FileCheck %s + +# CHECK: BOLT-INFO: inlined 0 calls at 1 call sites in 2 iteration(s). Change in binary size: 4 bytes. +# CHECK: Binary Function "_Z3barP1A" after inlining { +# CHECK-NOT: bl _Z3fooP1A +# CHECK: ldr x8, [x0] +# CHECK-NEXT: ldr w0, [x8] + + .text + .globl _Z3fooP1A + .type _Z3fooP1A,@function +_Z3fooP1A: + ldr x8, [x0] + ldr w0, [x8] + ret + .size _Z3fooP1A, .-_Z3fooP1A + + .globl _Z3barP1A + .type _Z3barP1A,@function +_Z3barP1A: + stp x29, x30, [sp, #-16]! 
+ mov x29, sp + bl _Z3fooP1A + mul w0, w0, w0 + ldp x29, x30, [sp], #16 + ret + .size _Z3barP1A, .-_Z3barP1A + + .globl main + .p2align 2 + .type main,@function +main: + mov w0, wzr + ret + .size main, .-main diff --git a/bolt/test/AArch64/inline-small-function-2.s b/bolt/test/AArch64/inline-small-function-2.s new file mode 100644 index 0000000000000..5eb7d391fd157 --- /dev/null +++ b/bolt/test/AArch64/inline-small-function-2.s @@ -0,0 +1,48 @@ +## This test checks that inline is properly handled by BOLT on aarch64. + +# REQUIRES: system-linux + +# RUN: llvm-mc -filetype=obj -triple aarch64-unknown-unknown %s -o %t.o +# RUN: %clang %cflags -O0 %t.o -o %t.exe -Wl,-q +# RUN: llvm-bolt --inline-small-functions --print-inline --print-only=test \ +# RUN: %t.exe -o %t.bolt | FileCheck %s + +#CHECK: BOLT-INFO: inlined 0 calls at 1 call sites in 2 iteration(s). Change in binary size: 4 bytes. +#CHECK: Binary Function "test" after inlining { +#CHECK-NOT: bl indirect +#CHECK: add w0, w1, w0 +#CHECK-NEXT: blr x2 + + .text + .globl indirect + .type indirect,@function +indirect: + add w0, w1, w0 + br x2 + .size indirect, .-indirect + + .globl test + .type test,@function +test: + stp x29, x30, [sp, #-32]! + stp x20, x19, [sp, #16] + mov x29, sp + mov w19, w1 + mov w20, w0 + bl indirect + add w8, w19, w20 + cmp w0, #0 + csinc w0, w8, wzr, eq + ldp x20, x19, [sp, #16] + ldp x29, x30, [sp], #32 + ret + .size test, .-test + + .globl main + .type main,@function +main: + mov w0, wzr + ret + .size main, .-main + + \ No newline at end of file diff --git a/bolt/test/lit.local.cfg b/bolt/test/lit.local.cfg index e2fa0a4a2210f..d5a6849b27a77 100644 --- a/bolt/test/lit.local.cfg +++ b/bolt/test/lit.local.cfg @@ -1,5 +1,5 @@ host_linux_triple = config.target_triple.split("-")[0] + "-unknown-linux-gnu" -common_linker_flags = "-fuse-ld=lld -Wl,--unresolved-symbols=ignore-all -pie" +common_linker_flags = "-fuse-ld=lld -Wl,--unresolved-symbols=ignore-all -Wl,--build-id=none -pie" flags = f"--target={host_linux_triple} -fPIE {common_linker_flags}" config.substitutions.insert(0, ("%cflags", f"%cflags {flags}")) diff --git a/clang/include/clang/AST/Attr.h b/clang/include/clang/AST/Attr.h index 3365ebe4d9012..bed532a84a1bd 100644 --- a/clang/include/clang/AST/Attr.h +++ b/clang/include/clang/AST/Attr.h @@ -60,6 +60,8 @@ class Attr : public AttributeCommonInfo { unsigned IsLateParsed : 1; LLVM_PREFERRED_TYPE(bool) unsigned InheritEvenIfAlreadyPresent : 1; + LLVM_PREFERRED_TYPE(bool) + unsigned DeferDeserialization : 1; void *operator new(size_t bytes) noexcept { llvm_unreachable("Attrs cannot be allocated with regular 'new'."); @@ -80,10 +82,11 @@ class Attr : public AttributeCommonInfo { protected: Attr(ASTContext &Context, const AttributeCommonInfo &CommonInfo, - attr::Kind AK, bool IsLateParsed) + attr::Kind AK, bool IsLateParsed, bool DeferDeserialization = false) : AttributeCommonInfo(CommonInfo), AttrKind(AK), Inherited(false), IsPackExpansion(false), Implicit(false), IsLateParsed(IsLateParsed), - InheritEvenIfAlreadyPresent(false) {} + InheritEvenIfAlreadyPresent(false), + DeferDeserialization(DeferDeserialization) {} public: attr::Kind getKind() const { return static_cast(AttrKind); } @@ -105,6 +108,8 @@ class Attr : public AttributeCommonInfo { void setPackExpansion(bool PE) { IsPackExpansion = PE; } bool isPackExpansion() const { return IsPackExpansion; } + bool shouldDeferDeserialization() const { return DeferDeserialization; } + // Clone this attribute. 
Attr *clone(ASTContext &C) const; @@ -146,8 +151,9 @@ class InheritableAttr : public Attr { protected: InheritableAttr(ASTContext &Context, const AttributeCommonInfo &CommonInfo, attr::Kind AK, bool IsLateParsed, - bool InheritEvenIfAlreadyPresent) - : Attr(Context, CommonInfo, AK, IsLateParsed) { + bool InheritEvenIfAlreadyPresent, + bool DeferDeserialization = false) + : Attr(Context, CommonInfo, AK, IsLateParsed, DeferDeserialization) { this->InheritEvenIfAlreadyPresent = InheritEvenIfAlreadyPresent; } diff --git a/clang/include/clang/Basic/AllDiagnosticKinds.inc b/clang/include/clang/Basic/AllDiagnosticKinds.inc new file mode 100644 index 0000000000000..a946b4a640ac6 --- /dev/null +++ b/clang/include/clang/Basic/AllDiagnosticKinds.inc @@ -0,0 +1,33 @@ +//===--- AllDiagnosticKinds.inc----------------------------------*- C++ -*-===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +/// +/// \file +/// Defines the Diagnostic IDs in ID sorted order. The order is dictated by +/// the enum in DiagnosticIDs.h#L49-L65. +/// +//===----------------------------------------------------------------------===// + +// Turn off clang-format, as the order of the includes are important to make +// sure tables based on Diagnostic IDs are partitioned/sorted based on +// DiagID. + +// clang-format off +#include "clang/Basic/DiagnosticCommonKinds.inc" +#include "clang/Basic/DiagnosticDriverKinds.inc" +#include "clang/Basic/DiagnosticFrontendKinds.inc" +#include "clang/Basic/DiagnosticSerializationKinds.inc" +#include "clang/Basic/DiagnosticLexKinds.inc" +#include "clang/Basic/DiagnosticParseKinds.inc" +#include "clang/Basic/DiagnosticASTKinds.inc" +#include "clang/Basic/DiagnosticCommentKinds.inc" +#include "clang/Basic/DiagnosticCrossTUKinds.inc" +#include "clang/Basic/DiagnosticSemaKinds.inc" +#include "clang/Basic/DiagnosticAnalysisKinds.inc" +#include "clang/Basic/DiagnosticRefactoringKinds.inc" +#include "clang/Basic/DiagnosticInstallAPIKinds.inc" +// clang-format on diff --git a/clang/include/clang/Basic/Attr.td b/clang/include/clang/Basic/Attr.td index 408d3adf370c8..3969dd8af5dfa 100644 --- a/clang/include/clang/Basic/Attr.td +++ b/clang/include/clang/Basic/Attr.td @@ -713,6 +713,12 @@ class Attr { // attribute may be documented under multiple categories, more than one // Documentation entry may be listed. list Documentation; + // Set to true if deserialization of this attribute must be deferred until + // the parent Decl is fully deserialized (during header module file + // deserialization). E.g., this is the case for the preferred_name attribute, + // since its type deserialization depends on its target Decl type. + // (See https://github.com/llvm/llvm-project/issues/56490 for details). + bit DeferDeserialization = 0; } /// Used to define a set of mutually exclusive attributes. @@ -3254,6 +3260,11 @@ def PreferredName : InheritableAttr { let InheritEvenIfAlreadyPresent = 1; let MeaningfulToClassTemplateDefinition = 1; let TemplateDependent = 1; + // Type of this attribute depends on the target Decl type. + // Therefore, its deserialization must be deferred until + // deserialization of the target Decl is complete + // (for header modules). 
+ let DeferDeserialization = 1; } def PreserveMost : DeclOrTypeAttr { diff --git a/clang/include/clang/Serialization/ASTBitCodes.h b/clang/include/clang/Serialization/ASTBitCodes.h index 40dae25f7b54b..d568d2fd7aa30 100644 --- a/clang/include/clang/Serialization/ASTBitCodes.h +++ b/clang/include/clang/Serialization/ASTBitCodes.h @@ -740,6 +740,8 @@ enum ASTRecordTypes { CXX_ADDED_TEMPLATE_PARTIAL_SPECIALIZATION = 75, UPDATE_MODULE_LOCAL_VISIBLE = 76, + + UPDATE_TU_LOCAL_VISIBLE = 77, }; /// Record types used within a source manager block. @@ -1340,6 +1342,10 @@ enum DeclCode { /// only visible from DeclContext in the same module. DECL_CONTEXT_MODULE_LOCAL_VISIBLE, + /// A record that stores the set of declarations that are only visible + /// to the TU. + DECL_CONTEXT_TU_LOCAL_VISIBLE, + /// A LabelDecl record. DECL_LABEL, diff --git a/clang/include/clang/Serialization/ASTReader.h b/clang/include/clang/Serialization/ASTReader.h index d77bb01c5aa59..82564fe664acb 100644 --- a/clang/include/clang/Serialization/ASTReader.h +++ b/clang/include/clang/Serialization/ASTReader.h @@ -528,6 +528,7 @@ class ASTReader uint64_t LexicalOffset; uint64_t VisibleOffset; uint64_t ModuleLocalOffset; + uint64_t TULocalOffset; }; using DelayedNamespaceOffsetMapTy = @@ -640,6 +641,9 @@ class ASTReader llvm::DenseMap ModuleLocalLookups; + llvm::DenseMap + TULocalLookups; using SpecLookupTableTy = llvm::DenseMap PendingVisibleUpdates; llvm::DenseMap PendingModuleLocalVisibleUpdates; + llvm::DenseMap TULocalUpdates; using SpecializationsUpdate = SmallVector; using SpecializationsUpdateMap = @@ -704,11 +709,17 @@ class ASTReader llvm::BitstreamCursor &Cursor, uint64_t Offset, DeclContext *DC); + enum class VisibleDeclContextStorageKind { + GenerallyVisible, + ModuleLocalVisible, + TULocalVisible, + }; + /// Read the record that describes the visible contents of a DC. bool ReadVisibleDeclContextStorage(ModuleFile &M, llvm::BitstreamCursor &Cursor, uint64_t Offset, GlobalDeclID ID, - bool IsModuleLocal); + VisibleDeclContextStorageKind VisibleKind); bool ReadSpecializations(ModuleFile &M, llvm::BitstreamCursor &Cursor, uint64_t Offset, Decl *D, bool IsPartial); @@ -1148,6 +1159,10 @@ class ASTReader unsigned NumModuleLocalVisibleDeclContexts = 0, TotalModuleLocalVisibleDeclContexts = 0; + /// Number of TU Local decl contexts read/total + unsigned NumTULocalVisibleDeclContexts = 0, + TotalTULocalVisibleDeclContexts = 0; + /// Total size of modules, in bits, currently loaded uint64_t TotalModulesSizeInBits = 0; @@ -1221,6 +1236,24 @@ class ASTReader /// been completed. std::deque PendingDeclContextInfos; + /// Deserialization of some attributes must be deferred since they refer + /// to themselves in their type (e.g., preferred_name attribute refers to the + /// typedef that refers back to the template specialization of the template + /// that the attribute is attached to). + /// More attributes that store TypeSourceInfo might be potentially affected, + /// see https://github.com/llvm/llvm-project/issues/56490 for details. + struct DeferredAttribute { + // Index of the deferred attribute in the Record of the TargetedDecl. + uint64_t RecordIdx; + // Decl to attach a deferred attribute to. + Decl *TargetedDecl; + }; + + /// The collection of Decls that have been loaded but some of their attributes + /// have been deferred, paired with the index inside the record pointing + /// at the skipped attribute. 
+ SmallVector PendingDeferredAttributes; + template using DuplicateObjCDecls = std::pair; @@ -1463,6 +1496,9 @@ class ASTReader const serialization::reader::ModuleLocalLookupTable * getModuleLocalLookupTables(DeclContext *Primary) const; + const serialization::reader::DeclContextLookupTable * + getTULocalLookupTables(DeclContext *Primary) const; + /// Get the loaded specializations lookup tables for \p D, /// if any. serialization::reader::LazySpecializationInfoLookupTable * @@ -1570,6 +1606,7 @@ class ASTReader void loadPendingDeclChain(Decl *D, uint64_t LocalOffset); void loadObjCCategories(GlobalDeclID ID, ObjCInterfaceDecl *D, unsigned PreviousGeneration = 0); + void loadDeferredAttribute(const DeferredAttribute &DA); RecordLocation getLocalBitOffset(uint64_t GlobalOffset); uint64_t getGlobalBitOffset(ModuleFile &M, uint64_t LocalOffset); diff --git a/clang/include/clang/Serialization/ASTRecordReader.h b/clang/include/clang/Serialization/ASTRecordReader.h index 2561418b78ca7..a29972fcf73a8 100644 --- a/clang/include/clang/Serialization/ASTRecordReader.h +++ b/clang/include/clang/Serialization/ASTRecordReader.h @@ -83,6 +83,12 @@ class ASTRecordReader /// Returns the current value in this record, without advancing. uint64_t peekInt() { return Record[Idx]; } + /// Returns the next N values in this record, without advancing. + uint64_t peekInts(unsigned N) { return Record[Idx + N]; } + + /// Skips the current value. + void skipInt() { Idx += 1; } + /// Skips the specified number of values. void skipInts(unsigned N) { Idx += N; } @@ -335,7 +341,12 @@ class ASTRecordReader Attr *readAttr(); /// Reads attributes from the current stream position, advancing Idx. - void readAttributes(AttrVec &Attrs); + /// For some attributes (where type depends on itself recursively), defer + /// reading the attribute until the type has been read. + void readAttributes(AttrVec &Attrs, Decl *D = nullptr); + + /// Reads one attribute from the current stream position, advancing Idx. + Attr *readOrDeferAttrFor(Decl *D); /// Read an BTFTypeTagAttr object. BTFTypeTagAttr *readBTFTypeTagAttr() { diff --git a/clang/include/clang/Serialization/ASTWriter.h b/clang/include/clang/Serialization/ASTWriter.h index 53b09cc914392..079e39a9fb678 100644 --- a/clang/include/clang/Serialization/ASTWriter.h +++ b/clang/include/clang/Serialization/ASTWriter.h @@ -496,6 +496,9 @@ class ASTWriter : public ASTDeserializationListener, /// file. unsigned NumModuleLocalDeclContexts = 0; + /// The number of TULocal declcontexts written to the AST file. + unsigned NumTULocalDeclContexts = 0; + /// A mapping from each known submodule to its ID number, which will /// be a positive integer. 
llvm::DenseMap SubmoduleIDs; @@ -594,12 +597,14 @@ class ASTWriter : public ASTDeserializationListener, void GenerateNameLookupTable(ASTContext &Context, const DeclContext *DC, llvm::SmallVectorImpl &LookupTable, - llvm::SmallVectorImpl &ModuleLocalLookupTable); + llvm::SmallVectorImpl &ModuleLocalLookupTable, + llvm::SmallVectorImpl &TULocalLookupTable); uint64_t WriteDeclContextLexicalBlock(ASTContext &Context, const DeclContext *DC); void WriteDeclContextVisibleBlock(ASTContext &Context, DeclContext *DC, uint64_t &VisibleBlockOffset, - uint64_t &ModuleLocalBlockOffset); + uint64_t &ModuleLocalBlockOffset, + uint64_t &TULocalBlockOffset); void WriteTypeDeclOffsets(); void WriteFileDeclIDsMap(); void WriteComments(ASTContext &Context); @@ -633,8 +638,10 @@ class ASTWriter : public ASTDeserializationListener, unsigned DeclContextLexicalAbbrev = 0; unsigned DeclContextVisibleLookupAbbrev = 0; unsigned DeclModuleLocalVisibleLookupAbbrev = 0; + unsigned DeclTULocalLookupAbbrev = 0; unsigned UpdateVisibleAbbrev = 0; unsigned ModuleLocalUpdateVisibleAbbrev = 0; + unsigned TULocalUpdateVisibleAbbrev = 0; unsigned DeclRecordAbbrev = 0; unsigned DeclTypedefAbbrev = 0; unsigned DeclVarAbbrev = 0; diff --git a/clang/lib/AST/ByteCode/Compiler.cpp b/clang/lib/AST/ByteCode/Compiler.cpp index 4bfb80589620c..6677119d09211 100644 --- a/clang/lib/AST/ByteCode/Compiler.cpp +++ b/clang/lib/AST/ByteCode/Compiler.cpp @@ -4247,6 +4247,7 @@ bool Compiler::visitExpr(const Expr *E, bool DestroyToplevelScope) { // For us, that means everything we don't // have a PrimType for. if (std::optional LocalOffset = this->allocateLocal(E)) { + InitLinkScope ILS(this, InitLink::Temp(*LocalOffset)); if (!this->emitGetPtrLocal(*LocalOffset, E)) return false; diff --git a/clang/lib/Basic/DiagnosticIDs.cpp b/clang/lib/Basic/DiagnosticIDs.cpp index d77f28c80b2eb..81194bbf2538e 100644 --- a/clang/lib/Basic/DiagnosticIDs.cpp +++ b/clang/lib/Basic/DiagnosticIDs.cpp @@ -37,21 +37,7 @@ struct StaticDiagInfoDescriptionStringTable { #define DIAG(ENUM, CLASS, DEFAULT_SEVERITY, DESC, GROUP, SFINAE, NOWERROR, \ SHOWINSYSHEADER, SHOWINSYSMACRO, DEFERRABLE, CATEGORY) \ char ENUM##_desc[sizeof(DESC)]; - // clang-format off -#include "clang/Basic/DiagnosticCommonKinds.inc" -#include "clang/Basic/DiagnosticDriverKinds.inc" -#include "clang/Basic/DiagnosticFrontendKinds.inc" -#include "clang/Basic/DiagnosticSerializationKinds.inc" -#include "clang/Basic/DiagnosticLexKinds.inc" -#include "clang/Basic/DiagnosticParseKinds.inc" -#include "clang/Basic/DiagnosticASTKinds.inc" -#include "clang/Basic/DiagnosticCommentKinds.inc" -#include "clang/Basic/DiagnosticCrossTUKinds.inc" -#include "clang/Basic/DiagnosticSemaKinds.inc" -#include "clang/Basic/DiagnosticAnalysisKinds.inc" -#include "clang/Basic/DiagnosticRefactoringKinds.inc" -#include "clang/Basic/DiagnosticInstallAPIKinds.inc" - // clang-format on +#include "clang/Basic/AllDiagnosticKinds.inc" #undef DIAG }; @@ -59,21 +45,7 @@ const StaticDiagInfoDescriptionStringTable StaticDiagInfoDescriptions = { #define DIAG(ENUM, CLASS, DEFAULT_SEVERITY, DESC, GROUP, SFINAE, NOWERROR, \ SHOWINSYSHEADER, SHOWINSYSMACRO, DEFERRABLE, CATEGORY) \ DESC, -// clang-format off -#include "clang/Basic/DiagnosticCommonKinds.inc" -#include "clang/Basic/DiagnosticDriverKinds.inc" -#include "clang/Basic/DiagnosticFrontendKinds.inc" -#include "clang/Basic/DiagnosticSerializationKinds.inc" -#include "clang/Basic/DiagnosticLexKinds.inc" -#include "clang/Basic/DiagnosticParseKinds.inc" -#include 
"clang/Basic/DiagnosticASTKinds.inc" -#include "clang/Basic/DiagnosticCommentKinds.inc" -#include "clang/Basic/DiagnosticCrossTUKinds.inc" -#include "clang/Basic/DiagnosticSemaKinds.inc" -#include "clang/Basic/DiagnosticAnalysisKinds.inc" -#include "clang/Basic/DiagnosticRefactoringKinds.inc" -#include "clang/Basic/DiagnosticInstallAPIKinds.inc" -// clang-format on +#include "clang/Basic/AllDiagnosticKinds.inc" #undef DIAG }; @@ -85,21 +57,7 @@ const uint32_t StaticDiagInfoDescriptionOffsets[] = { #define DIAG(ENUM, CLASS, DEFAULT_SEVERITY, DESC, GROUP, SFINAE, NOWERROR, \ SHOWINSYSHEADER, SHOWINSYSMACRO, DEFERRABLE, CATEGORY) \ offsetof(StaticDiagInfoDescriptionStringTable, ENUM##_desc), -// clang-format off -#include "clang/Basic/DiagnosticCommonKinds.inc" -#include "clang/Basic/DiagnosticDriverKinds.inc" -#include "clang/Basic/DiagnosticFrontendKinds.inc" -#include "clang/Basic/DiagnosticSerializationKinds.inc" -#include "clang/Basic/DiagnosticLexKinds.inc" -#include "clang/Basic/DiagnosticParseKinds.inc" -#include "clang/Basic/DiagnosticASTKinds.inc" -#include "clang/Basic/DiagnosticCommentKinds.inc" -#include "clang/Basic/DiagnosticCrossTUKinds.inc" -#include "clang/Basic/DiagnosticSemaKinds.inc" -#include "clang/Basic/DiagnosticAnalysisKinds.inc" -#include "clang/Basic/DiagnosticRefactoringKinds.inc" -#include "clang/Basic/DiagnosticInstallAPIKinds.inc" -// clang-format on +#include "clang/Basic/AllDiagnosticKinds.inc" #undef DIAG }; diff --git a/clang/lib/CodeGen/ObjectFilePCHContainerWriter.cpp b/clang/lib/CodeGen/ObjectFilePCHContainerWriter.cpp index 5447b98d7105e..02635ce235a12 100644 --- a/clang/lib/CodeGen/ObjectFilePCHContainerWriter.cpp +++ b/clang/lib/CodeGen/ObjectFilePCHContainerWriter.cpp @@ -81,6 +81,9 @@ class PCHContainerGenerator : public ASTConsumer { if (!TD->isCompleteDefinition()) return true; + if (D->hasAttr()) + return true; + QualType QualTy = Ctx.getTypeDeclType(D); if (!QualTy.isNull() && CanRepresent(QualTy.getTypePtr())) DI.getOrCreateStandaloneType(QualTy, D->getLocation()); diff --git a/clang/lib/Serialization/ASTReader.cpp b/clang/lib/Serialization/ASTReader.cpp index 202227b195585..a72ff766685bb 100644 --- a/clang/lib/Serialization/ASTReader.cpp +++ b/clang/lib/Serialization/ASTReader.cpp @@ -1425,10 +1425,9 @@ bool ASTReader::ReadLexicalDeclContextStorage(ModuleFile &M, return false; } -bool ASTReader::ReadVisibleDeclContextStorage(ModuleFile &M, - BitstreamCursor &Cursor, - uint64_t Offset, GlobalDeclID ID, - bool IsModuleLocal) { +bool ASTReader::ReadVisibleDeclContextStorage( + ModuleFile &M, BitstreamCursor &Cursor, uint64_t Offset, GlobalDeclID ID, + ASTReader::VisibleDeclContextStorageKind VisibleKind) { assert(Offset != 0); SavedStreamPosition SavedPosition(Cursor); @@ -1452,22 +1451,42 @@ bool ASTReader::ReadVisibleDeclContextStorage(ModuleFile &M, return true; } unsigned RecCode = MaybeRecCode.get(); - if (!IsModuleLocal && RecCode != DECL_CONTEXT_VISIBLE) { - Error("Expected visible lookup table block"); - return true; - } - if (IsModuleLocal && RecCode != DECL_CONTEXT_MODULE_LOCAL_VISIBLE) { - Error("Expected module local visible lookup table block"); - return true; + switch (VisibleKind) { + case VisibleDeclContextStorageKind::GenerallyVisible: + if (RecCode != DECL_CONTEXT_VISIBLE) { + Error("Expected visible lookup table block"); + return true; + } + break; + case VisibleDeclContextStorageKind::ModuleLocalVisible: + if (RecCode != DECL_CONTEXT_MODULE_LOCAL_VISIBLE) { + Error("Expected module local visible lookup table block"); + return 
true; + } + break; + case VisibleDeclContextStorageKind::TULocalVisible: + if (RecCode != DECL_CONTEXT_TU_LOCAL_VISIBLE) { + Error("Expected TU local lookup table block"); + return true; + } + break; } // We can't safely determine the primary context yet, so delay attaching the // lookup table until we're done with recursive deserialization. auto *Data = (const unsigned char*)Blob.data(); - if (!IsModuleLocal) + switch (VisibleKind) { + case VisibleDeclContextStorageKind::GenerallyVisible: PendingVisibleUpdates[ID].push_back(UpdateData{&M, Data}); - else + break; + case VisibleDeclContextStorageKind::ModuleLocalVisible: PendingModuleLocalVisibleUpdates[ID].push_back(UpdateData{&M, Data}); + break; + case VisibleDeclContextStorageKind::TULocalVisible: + if (M.Kind == MK_MainFile) + TULocalUpdates[ID].push_back(UpdateData{&M, Data}); + break; + } return false; } @@ -3613,6 +3632,21 @@ llvm::Error ASTReader::ReadASTBlock(ModuleFile &F, break; } + case UPDATE_TU_LOCAL_VISIBLE: { + if (F.Kind != MK_MainFile) + break; + unsigned Idx = 0; + GlobalDeclID ID = ReadDeclID(F, Record, Idx); + auto *Data = (const unsigned char *)Blob.data(); + TULocalUpdates[ID].push_back(UpdateData{&F, Data}); + // If we've already loaded the decl, perform the updates when we finish + // loading this block. + if (Decl *D = GetExistingDecl(ID)) + PendingUpdateRecords.push_back( + PendingUpdateRecord(ID, D, /*JustLoaded=*/false)); + break; + } + case CXX_ADDED_TEMPLATE_SPECIALIZATION: { unsigned Idx = 0; GlobalDeclID ID = ReadDeclID(F, Record, Idx); @@ -3717,6 +3751,7 @@ llvm::Error ASTReader::ReadASTBlock(ModuleFile &F, TotalLexicalDeclContexts += Record[2]; TotalVisibleDeclContexts += Record[3]; TotalModuleLocalVisibleDeclContexts += Record[4]; + TotalTULocalVisibleDeclContexts += Record[5]; break; case UNUSED_FILESCOPED_DECLS: @@ -4002,7 +4037,7 @@ llvm::Error ASTReader::ReadASTBlock(ModuleFile &F, break; case DELAYED_NAMESPACE_LEXICAL_VISIBLE_RECORD: { - if (Record.size() % 4 != 0) + if (Record.size() % 5 != 0) return llvm::createStringError( std::errc::illegal_byte_sequence, "invalid DELAYED_NAMESPACE_LEXICAL_VISIBLE_RECORD block in AST " @@ -4021,9 +4056,12 @@ llvm::Error ASTReader::ReadASTBlock(ModuleFile &F, uint64_t LocalModuleLocalOffset = Record[I++]; uint64_t ModuleLocalOffset = LocalModuleLocalOffset ? BaseOffset + LocalModuleLocalOffset : 0; + uint64_t TULocalLocalOffset = Record[I++]; + uint64_t TULocalOffset = + TULocalLocalOffset ? 
BaseOffset + TULocalLocalOffset : 0; DelayedNamespaceOffsetMap[ID] = {LexicalOffset, VisibleOffset, - ModuleLocalOffset}; + ModuleLocalOffset, TULocalOffset}; assert(!GetExistingDecl(ID) && "We shouldn't load the namespace in the front of delayed " @@ -8473,6 +8511,15 @@ bool ASTReader::FindExternalVisibleDeclsByName(const DeclContext *DC, } } + if (auto It = TULocalLookups.find(DC); It != TULocalLookups.end()) { + ++NumTULocalVisibleDeclContexts; + for (GlobalDeclID ID : It->second.Table.find(Name)) { + NamedDecl *ND = cast(GetDecl(ID)); + if (ND->getDeclName() == Name && Found.insert(ND).second) + Decls.push_back(ND); + } + } + SetExternalVisibleDeclsForName(DC, Name, Decls); return !Decls.empty(); } @@ -8500,6 +8547,7 @@ void ASTReader::completeVisibleDeclsMap(const DeclContext *DC) { findAll(Lookups, NumVisibleDeclContextsRead); findAll(ModuleLocalLookups, NumModuleLocalVisibleDeclContexts); + findAll(TULocalLookups, NumTULocalVisibleDeclContexts); for (DeclsMap::iterator I = Decls.begin(), E = Decls.end(); I != E; ++I) { SetExternalVisibleDeclsForName(DC, I->first, I->second); @@ -8519,6 +8567,12 @@ ASTReader::getModuleLocalLookupTables(DeclContext *Primary) const { return I == ModuleLocalLookups.end() ? nullptr : &I->second; } +const serialization::reader::DeclContextLookupTable * +ASTReader::getTULocalLookupTables(DeclContext *Primary) const { + auto I = TULocalLookups.find(Primary); + return I == TULocalLookups.end() ? nullptr : &I->second; +} + serialization::reader::LazySpecializationInfoLookupTable * ASTReader::getLoadedSpecializationsLookupTables(const Decl *D, bool IsPartial) { assert(D->isCanonicalDecl()); @@ -8634,6 +8688,11 @@ void ASTReader::PrintStats() { NumModuleLocalVisibleDeclContexts, TotalModuleLocalVisibleDeclContexts, ((float)NumModuleLocalVisibleDeclContexts / TotalModuleLocalVisibleDeclContexts * 100)); + if (TotalTULocalVisibleDeclContexts) + std::fprintf(stderr, " %u/%u visible declcontexts in GMF read (%f%%)\n", + NumTULocalVisibleDeclContexts, TotalTULocalVisibleDeclContexts, + ((float)NumTULocalVisibleDeclContexts / + TotalTULocalVisibleDeclContexts * 100)); if (TotalNumMethodPoolEntries) std::fprintf(stderr, " %u/%u method pool entries read (%f%%)\n", NumMethodPoolEntriesRead, TotalNumMethodPoolEntries, @@ -10180,6 +10239,11 @@ void ASTReader::finishPendingActions() { } PendingDeducedVarTypes.clear(); + // Load the delayed preferred name attributes. + for (unsigned I = 0; I != PendingDeferredAttributes.size(); ++I) + loadDeferredAttribute(PendingDeferredAttributes[I]); + PendingDeferredAttributes.clear(); + // For each decl chain that we wanted to complete while deserializing, mark // it as "still needs to be completed". 
for (unsigned I = 0; I != PendingIncompleteDeclChains.size(); ++I) { diff --git a/clang/lib/Serialization/ASTReaderDecl.cpp b/clang/lib/Serialization/ASTReaderDecl.cpp index 1c51a7b5e460f..de834285fa76b 100644 --- a/clang/lib/Serialization/ASTReaderDecl.cpp +++ b/clang/lib/Serialization/ASTReaderDecl.cpp @@ -414,7 +414,8 @@ class ASTDeclReader : public DeclVisitor { void VisitLifetimeExtendedTemporaryDecl(LifetimeExtendedTemporaryDecl *D); void VisitDeclContext(DeclContext *DC, uint64_t &LexicalOffset, - uint64_t &VisibleOffset, uint64_t &ModuleLocalOffset); + uint64_t &VisibleOffset, uint64_t &ModuleLocalOffset, + uint64_t &TULocalOffset); template RedeclarableResult VisitRedeclarable(Redeclarable *D); @@ -612,7 +613,7 @@ void ASTDeclReader::VisitDecl(Decl *D) { if (HasAttrs) { AttrVec Attrs; - Record.readAttributes(Attrs); + Record.readAttributes(Attrs, D); // Avoid calling setAttrs() directly because it uses Decl::getASTContext() // internally which is unsafe during derialization. D->setAttrsImpl(Attrs, Reader.getContext()); @@ -1859,7 +1860,9 @@ void ASTDeclReader::VisitHLSLBufferDecl(HLSLBufferDecl *D) { uint64_t LexicalOffset = 0; uint64_t VisibleOffset = 0; uint64_t ModuleLocalOffset = 0; - VisitDeclContext(D, LexicalOffset, VisibleOffset, ModuleLocalOffset); + uint64_t TULocalOffset = 0; + VisitDeclContext(D, LexicalOffset, VisibleOffset, ModuleLocalOffset, + TULocalOffset); D->IsCBuffer = Record.readBool(); D->KwLoc = readSourceLocation(); D->LBraceLoc = readSourceLocation(); @@ -2770,10 +2773,12 @@ void ASTDeclReader::VisitLifetimeExtendedTemporaryDecl( void ASTDeclReader::VisitDeclContext(DeclContext *DC, uint64_t &LexicalOffset, uint64_t &VisibleOffset, - uint64_t &ModuleLocalOffset) { + uint64_t &ModuleLocalOffset, + uint64_t &TULocalOffset) { LexicalOffset = ReadLocalOffset(); VisibleOffset = ReadLocalOffset(); ModuleLocalOffset = ReadLocalOffset(); + TULocalOffset = ReadLocalOffset(); } template @@ -3093,6 +3098,8 @@ class AttrReader { return Reader.readInt(); } + uint64_t peekInts(unsigned N) { return Reader.peekInts(N); } + bool readBool() { return Reader.readBool(); } SourceRange readSourceRange() { @@ -3123,18 +3130,29 @@ class AttrReader { return Reader.readVersionTuple(); } + void skipInt() { Reader.skipInts(1); } + + void skipInts(unsigned N) { Reader.skipInts(N); } + + unsigned getCurrentIdx() { return Reader.getIdx(); } + OMPTraitInfo *readOMPTraitInfo() { return Reader.readOMPTraitInfo(); } template T *readDeclAs() { return Reader.readDeclAs(); } }; } +/// Reads one attribute from the current stream position, advancing Idx. Attr *ASTRecordReader::readAttr() { AttrReader Record(*this); auto V = Record.readInt(); if (!V) return nullptr; + // Read and ignore the skip count, since attribute deserialization is not + // deferred on this pass. + Record.skipInt(); + Attr *New = nullptr; // Kind is stored as a 1-based integer because 0 is used to indicate a null // Attr pointer. @@ -3164,13 +3182,28 @@ Attr *ASTRecordReader::readAttr() { return New; } -/// Reads attributes from the current stream position. -void ASTRecordReader::readAttributes(AttrVec &Attrs) { +/// Reads attributes from the current stream position, advancing Idx. +/// For some attributes (where type depends on itself recursively), defer +/// reading the attribute until the type has been read. 
+void ASTRecordReader::readAttributes(AttrVec &Attrs, Decl *D) { for (unsigned I = 0, E = readInt(); I != E; ++I) - if (auto *A = readAttr()) + if (auto *A = readOrDeferAttrFor(D)) Attrs.push_back(A); } +/// Reads one attribute from the current stream position, advancing Idx. +/// For some attributes (where type depends on itself recursively), defer +/// reading the attribute until the type has been read. +Attr *ASTRecordReader::readOrDeferAttrFor(Decl *D) { + AttrReader Record(*this); + unsigned SkipCount = Record.peekInts(1); + if (!SkipCount) + return readAttr(); + Reader->PendingDeferredAttributes.push_back({Record.getCurrentIdx(), D}); + Record.skipInts(SkipCount); + return nullptr; +} + //===----------------------------------------------------------------------===// // ASTReader Implementation //===----------------------------------------------------------------------===// @@ -3875,6 +3908,7 @@ Decl *ASTReader::ReadDeclRecord(GlobalDeclID ID) { case DECL_CONTEXT_LEXICAL: case DECL_CONTEXT_VISIBLE: case DECL_CONTEXT_MODULE_LOCAL_VISIBLE: + case DECL_CONTEXT_TU_LOCAL_VISIBLE: case DECL_SPECIALIZATIONS: case DECL_PARTIAL_SPECIALIZATIONS: llvm_unreachable("Record cannot be de-serialized with readDeclRecord"); @@ -4185,9 +4219,10 @@ Decl *ASTReader::ReadDeclRecord(GlobalDeclID ID) { uint64_t LexicalOffset = 0; uint64_t VisibleOffset = 0; uint64_t ModuleLocalOffset = 0; + uint64_t TULocalOffset = 0; - Reader.VisitDeclContext(DC, LexicalOffset, VisibleOffset, - ModuleLocalOffset); + Reader.VisitDeclContext(DC, LexicalOffset, VisibleOffset, ModuleLocalOffset, + TULocalOffset); // Get the lexical and visible block for the delayed namespace. // It is sufficient to judge if ID is in DelayedNamespaceOffsetMap. @@ -4199,18 +4234,24 @@ Decl *ASTReader::ReadDeclRecord(GlobalDeclID ID) { LexicalOffset = Iter->second.LexicalOffset; VisibleOffset = Iter->second.VisibleOffset; ModuleLocalOffset = Iter->second.ModuleLocalOffset; + TULocalOffset = Iter->second.TULocalOffset; } if (LexicalOffset && ReadLexicalDeclContextStorage(*Loc.F, DeclsCursor, LexicalOffset, DC)) return nullptr; - if (VisibleOffset && - ReadVisibleDeclContextStorage(*Loc.F, DeclsCursor, VisibleOffset, ID, - /*IsModuleLocal=*/false)) + if (VisibleOffset && ReadVisibleDeclContextStorage( + *Loc.F, DeclsCursor, VisibleOffset, ID, + VisibleDeclContextStorageKind::GenerallyVisible)) return nullptr; if (ModuleLocalOffset && - ReadVisibleDeclContextStorage(*Loc.F, DeclsCursor, ModuleLocalOffset, - ID, /*IsModuleLocal=*/true)) + ReadVisibleDeclContextStorage( + *Loc.F, DeclsCursor, ModuleLocalOffset, ID, + VisibleDeclContextStorageKind::ModuleLocalVisible)) + return nullptr; + if (TULocalOffset && ReadVisibleDeclContextStorage( + *Loc.F, DeclsCursor, TULocalOffset, ID, + VisibleDeclContextStorageKind::TULocalVisible)) return nullptr; } assert(Record.getIdx() == Record.size()); @@ -4376,6 +4417,18 @@ void ASTReader::loadDeclUpdateRecords(PendingUpdateRecord &Record) { DC->setHasExternalVisibleStorage(true); } + if (auto I = TULocalUpdates.find(ID); I != TULocalUpdates.end()) { + auto Updates = std::move(I->second); + TULocalUpdates.erase(I); + + auto *DC = cast(D)->getPrimaryContext(); + for (const auto &Update : Updates) + TULocalLookups[DC].Table.add( + Update.Mod, Update.Data, + reader::ASTDeclContextNameLookupTrait(*this, *Update.Mod)); + DC->setHasExternalVisibleStorage(true); + } + // Load any pending related decls. 
if (D->isCanonicalDecl()) { if (auto IT = RelatedDeclsMap.find(ID); IT != RelatedDeclsMap.end()) { @@ -4459,6 +4512,49 @@ void ASTReader::loadPendingDeclChain(Decl *FirstLocal, uint64_t LocalOffset) { ASTDeclReader::attachLatestDecl(CanonDecl, MostRecent); } +void ASTReader::loadDeferredAttribute(const DeferredAttribute &DA) { + Decl *D = DA.TargetedDecl; + ModuleFile *M = getOwningModuleFile(D); + + unsigned LocalDeclIndex = D->getGlobalID().getLocalDeclIndex(); + const DeclOffset &DOffs = M->DeclOffsets[LocalDeclIndex]; + RecordLocation Loc(M, DOffs.getBitOffset(M->DeclsBlockStartOffset)); + + llvm::BitstreamCursor &Cursor = Loc.F->DeclsCursor; + SavedStreamPosition SavedPosition(Cursor); + if (llvm::Error Err = Cursor.JumpToBit(Loc.Offset)) { + Error(std::move(Err)); + } + + Expected MaybeCode = Cursor.ReadCode(); + if (!MaybeCode) { + llvm::report_fatal_error( + Twine("ASTReader::loadPreferredNameAttribute failed reading code: ") + + toString(MaybeCode.takeError())); + } + unsigned Code = MaybeCode.get(); + + ASTRecordReader Record(*this, *Loc.F); + Expected MaybeRecCode = Record.readRecord(Cursor, Code); + if (!MaybeRecCode) { + llvm::report_fatal_error( + Twine( + "ASTReader::loadPreferredNameAttribute failed reading rec code: ") + + toString(MaybeCode.takeError())); + } + unsigned RecCode = MaybeRecCode.get(); + if (RecCode < DECL_TYPEDEF || RecCode > DECL_LAST) { + llvm::report_fatal_error( + Twine("ASTReader::loadPreferredNameAttribute failed reading rec code: " + "expected valid DeclCode") + + toString(MaybeCode.takeError())); + } + + Record.skipInts(DA.RecordIdx); + Attr *A = Record.readAttr(); + getContext().getDeclAttrs(D).push_back(A); +} + namespace { /// Given an ObjC interface, goes through the modules and links to the diff --git a/clang/lib/Serialization/ASTWriter.cpp b/clang/lib/Serialization/ASTWriter.cpp index 55d3c2bb56f2c..c7c17e09a30e0 100644 --- a/clang/lib/Serialization/ASTWriter.cpp +++ b/clang/lib/Serialization/ASTWriter.cpp @@ -37,6 +37,7 @@ #include "clang/AST/Type.h" #include "clang/AST/TypeLoc.h" #include "clang/AST/TypeLocVisitor.h" +#include "clang/Basic/AttrKinds.h" #include "clang/Basic/Diagnostic.h" #include "clang/Basic/DiagnosticOptions.h" #include "clang/Basic/FileEntry.h" @@ -4046,6 +4047,13 @@ class ASTDeclContextNameLookupTraitBase { : Writer(Writer) {} public: + data_type getData(const DeclIDsTy &LocalIDs) { + unsigned Start = DeclIDs.size(); + for (auto ID : LocalIDs) + DeclIDs.push_back(ID); + return std::make_pair(Start, DeclIDs.size()); + } + data_type ImportData(const reader::ASTDeclContextNameLookupTrait::data_type &FromReader) { unsigned Start = DeclIDs.size(); DeclIDs.insert( @@ -4138,23 +4146,16 @@ class ASTDeclContextNameLookupTraitBase { } }; -class ModuleLocalNameLookupTrait : public ASTDeclContextNameLookupTraitBase { +class ModuleLevelNameLookupTrait : public ASTDeclContextNameLookupTraitBase { public: using primary_module_hash_type = unsigned; using key_type = std::pair; using key_type_ref = key_type; - explicit ModuleLocalNameLookupTrait(ASTWriter &Writer) + explicit ModuleLevelNameLookupTrait(ASTWriter &Writer) : ASTDeclContextNameLookupTraitBase(Writer) {} - data_type getData(const DeclIDsTy &LocalIDs) { - unsigned Start = DeclIDs.size(); - for (auto ID : LocalIDs) - DeclIDs.push_back(ID); - return std::make_pair(Start, DeclIDs.size()); - } - static bool EqualKey(key_type_ref a, key_type_ref b) { return a == b; } hash_value_type ComputeHash(key_type Key) { @@ -4202,19 +4203,46 @@ static bool isModuleLocalDecl(NamedDecl *D) { 
return false; } +static bool isTULocalInNamedModules(NamedDecl *D) { + Module *NamedModule = D->getTopLevelOwningNamedModule(); + if (!NamedModule) + return false; + + // For non-top-level decls, we choose to move them to the general visible + // lookup table, since the consumer may get their parent somehow and perform + // a lookup in it (consider looking up the operator function in a lambda). + // The difference between the module local lookup table and the TU local + // lookup table is that consumers still have a chance to look up in the + // module local lookup table, but a consumer that is not the original TU + // won't read the TU local lookup table at all. + // + // FIXME: It seems to be an optimization opportunity (and also more correct + // semantics) to retain the TULocal lookup table and perform a lookup similar + // to the one in the module local lookup table, except that we only allow + // lookups from the same module unit. + if (!D->getNonTransparentDeclContext()->isFileContext()) + return false; + + return D->getLinkageInternal() == Linkage::Internal; +} + // Trait used for the on-disk hash table used in the method pool. +template class ASTDeclContextNameLookupTrait : public ASTDeclContextNameLookupTraitBase { public: - using ModuleLocalDeclsMapTy = - llvm::DenseMap; - -private: - ModuleLocalDeclsMapTy ModuleLocalDeclsMap; + using ModuleLevelDeclsMapTy = + llvm::DenseMap; -public: using key_type = DeclarationNameKey; using key_type_ref = key_type; + using TULocalDeclsMapTy = llvm::DenseMap; + +private: + ModuleLevelDeclsMapTy ModuleLocalDeclsMap; + TULocalDeclsMapTy TULocalDeclsMap; + +public: explicit ASTDeclContextNameLookupTrait(ASTWriter &Writer) : ASTDeclContextNameLookupTraitBase(Writer) {} @@ -4250,15 +4278,30 @@ class ASTDeclContextNameLookupTrait : public ASTDeclContextNameLookupTraitBase { } } + if constexpr (CollectingTULocalDecls) { + if (isTULocalInNamedModules(D)) { + auto Iter = TULocalDeclsMap.find(D->getDeclName()); + if (Iter == TULocalDeclsMap.end()) + TULocalDeclsMap.insert({D->getDeclName(), DeclIDsTy{ID}}); + else + Iter->second.push_back(ID); + continue; + } + } + DeclIDs.push_back(ID); } return std::make_pair(Start, DeclIDs.size()); } - const ModuleLocalDeclsMapTy &getModuleLocalDecls() { + using ASTDeclContextNameLookupTraitBase::getData; + + const ModuleLevelDeclsMapTy &getModuleLocalDecls() { return ModuleLocalDeclsMap; } + const TULocalDeclsMapTy &getTULocalDecls() { return TULocalDeclsMap; } + static bool EqualKey(key_type_ref a, key_type_ref b) { return a == b; } hash_value_type ComputeHash(key_type Name) { return Name.getHash(); } @@ -4486,7 +4529,8 @@ static bool isLookupResultNotInteresting(ASTWriter &Writer, void ASTWriter::GenerateNameLookupTable( ASTContext &Context, const DeclContext *ConstDC, llvm::SmallVectorImpl &LookupTable, - llvm::SmallVectorImpl &ModuleLocalLookupTable) { + llvm::SmallVectorImpl &ModuleLocalLookupTable, + llvm::SmallVectorImpl &TULookupTable) { assert(!ConstDC->hasLazyLocalLexicalLookups() && !ConstDC->hasLazyExternalLexicalLookups() && "must call buildLookups first"); @@ -4496,9 +4540,11 @@ void ASTWriter::GenerateNameLookupTable( assert(DC == DC->getPrimaryContext() && "only primary DC has lookup table"); // Create the on-disk hash table representation.
- MultiOnDiskHashTableGenerator Generator; - ASTDeclContextNameLookupTrait Trait(*this); + MultiOnDiskHashTableGenerator< + reader::ASTDeclContextNameLookupTrait, + ASTDeclContextNameLookupTrait> + Generator; + ASTDeclContextNameLookupTrait Trait(*this); // The first step is to collect the declaration names which we need to // serialize into the name lookup table, and to collect them in a stable @@ -4670,26 +4716,45 @@ void ASTWriter::GenerateNameLookupTable( Generator.emit(LookupTable, Trait, Lookups ? &Lookups->Table : nullptr); const auto &ModuleLocalDecls = Trait.getModuleLocalDecls(); - if (ModuleLocalDecls.empty()) - return; + if (!ModuleLocalDecls.empty()) { + MultiOnDiskHashTableGenerator + ModuleLocalLookupGenerator; + ModuleLevelNameLookupTrait ModuleLocalTrait(*this); + + for (const auto &ModuleLocalIter : ModuleLocalDecls) { + const auto &Key = ModuleLocalIter.first; + const auto &IDs = ModuleLocalIter.second; + ModuleLocalLookupGenerator.insert(Key, ModuleLocalTrait.getData(IDs), + ModuleLocalTrait); + } - MultiOnDiskHashTableGenerator - ModuleLocalLookupGenerator; - ModuleLocalNameLookupTrait ModuleLocalTrait(*this); + auto *ModuleLocalLookups = + Chain ? Chain->getModuleLocalLookupTables(DC) : nullptr; + ModuleLocalLookupGenerator.emit( + ModuleLocalLookupTable, ModuleLocalTrait, + ModuleLocalLookups ? &ModuleLocalLookups->Table : nullptr); + } + + const auto &TULocalDecls = Trait.getTULocalDecls(); + if (!TULocalDecls.empty() && !isGeneratingReducedBMI()) { + MultiOnDiskHashTableGenerator< + reader::ASTDeclContextNameLookupTrait, + ASTDeclContextNameLookupTrait> + TULookupGenerator; + ASTDeclContextNameLookupTrait TULocalTrait( + *this); + + for (const auto &TULocalIter : TULocalDecls) { + const auto &Key = TULocalIter.first; + const auto &IDs = TULocalIter.second; + TULookupGenerator.insert(Key, TULocalTrait.getData(IDs), TULocalTrait); + } - for (const auto &ModuleLocalIter : ModuleLocalDecls) { - const auto &Key = ModuleLocalIter.first; - const auto &IDs = ModuleLocalIter.second; - ModuleLocalLookupGenerator.insert(Key, ModuleLocalTrait.getData(IDs), - ModuleLocalTrait); + auto *TULocalLookups = Chain ? Chain->getTULocalLookupTables(DC) : nullptr; + TULookupGenerator.emit(TULookupTable, TULocalTrait, + TULocalLookups ? &TULocalLookups->Table : nullptr); } - - auto *ModuleLocalLookups = - Chain ? Chain->getModuleLocalLookupTables(DC) : nullptr; - ModuleLocalLookupGenerator.emit( - ModuleLocalLookupTable, ModuleLocalTrait, - ModuleLocalLookups ? &ModuleLocalLookups->Table : nullptr); } /// Write the block containing all of the declaration IDs @@ -4700,7 +4765,12 @@ void ASTWriter::GenerateNameLookupTable( void ASTWriter::WriteDeclContextVisibleBlock(ASTContext &Context, DeclContext *DC, uint64_t &VisibleBlockOffset, - uint64_t &ModuleLocalBlockOffset) { + uint64_t &ModuleLocalBlockOffset, + uint64_t &TULocalBlockOffset) { + assert(VisibleBlockOffset == 0); + assert(ModuleLocalBlockOffset == 0); + assert(TULocalBlockOffset == 0); + // If we imported a key declaration of this namespace, write the visible // lookup results as an update record for it rather than including them // on this declaration. We will only look at key declarations on reload. @@ -4787,7 +4857,9 @@ void ASTWriter::WriteDeclContextVisibleBlock(ASTContext &Context, // Create the on-disk hash table in a buffer. 
SmallString<4096> LookupTable; SmallString<4096> ModuleLocalLookupTable; - GenerateNameLookupTable(Context, DC, LookupTable, ModuleLocalLookupTable); + SmallString<4096> TULookupTable; + GenerateNameLookupTable(Context, DC, LookupTable, ModuleLocalLookupTable, + TULookupTable); // Write the lookup table RecordData::value_type Record[] = {DECL_CONTEXT_VISIBLE}; @@ -4795,17 +4867,26 @@ void ASTWriter::WriteDeclContextVisibleBlock(ASTContext &Context, LookupTable); ++NumVisibleDeclContexts; - if (ModuleLocalLookupTable.empty()) - return; + if (!ModuleLocalLookupTable.empty()) { + ModuleLocalBlockOffset = Stream.GetCurrentBitNo(); + assert(ModuleLocalBlockOffset > VisibleBlockOffset); + // Write the lookup table + RecordData::value_type ModuleLocalRecord[] = { + DECL_CONTEXT_MODULE_LOCAL_VISIBLE}; + Stream.EmitRecordWithBlob(DeclModuleLocalVisibleLookupAbbrev, + ModuleLocalRecord, ModuleLocalLookupTable); + ++NumModuleLocalDeclContexts; + } - ModuleLocalBlockOffset = Stream.GetCurrentBitNo(); - assert(ModuleLocalBlockOffset > VisibleBlockOffset); - // Write the lookup table - RecordData::value_type ModuleLocalRecord[] = { - DECL_CONTEXT_MODULE_LOCAL_VISIBLE}; - Stream.EmitRecordWithBlob(DeclModuleLocalVisibleLookupAbbrev, - ModuleLocalRecord, ModuleLocalLookupTable); - ++NumModuleLocalDeclContexts; + if (!TULookupTable.empty()) { + TULocalBlockOffset = Stream.GetCurrentBitNo(); + // Write the lookup table + RecordData::value_type TULocalDeclsRecord[] = { + DECL_CONTEXT_TU_LOCAL_VISIBLE}; + Stream.EmitRecordWithBlob(DeclTULocalLookupAbbrev, TULocalDeclsRecord, + TULookupTable); + ++NumTULocalDeclContexts; + } } /// Write an UPDATE_VISIBLE block for the given context. @@ -4823,7 +4904,9 @@ void ASTWriter::WriteDeclContextVisibleUpdate(ASTContext &Context, // Create the on-disk hash table in a buffer. SmallString<4096> LookupTable; SmallString<4096> ModuleLocalLookupTable; - GenerateNameLookupTable(Context, DC, LookupTable, ModuleLocalLookupTable); + SmallString<4096> TULookupTable; + GenerateNameLookupTable(Context, DC, LookupTable, ModuleLocalLookupTable, + TULookupTable); // If we're updating a namespace, select a key declaration as the key for the // update record; those are the only ones that will be checked on reload. @@ -4835,14 +4918,20 @@ void ASTWriter::WriteDeclContextVisibleUpdate(ASTContext &Context, getDeclID(cast(DC)).getRawValue()}; Stream.EmitRecordWithBlob(UpdateVisibleAbbrev, Record, LookupTable); - if (ModuleLocalLookupTable.empty()) - return; + if (!ModuleLocalLookupTable.empty()) { + // Write the module local lookup table + RecordData::value_type ModuleLocalRecord[] = { + UPDATE_MODULE_LOCAL_VISIBLE, getDeclID(cast(DC)).getRawValue()}; + Stream.EmitRecordWithBlob(ModuleLocalUpdateVisibleAbbrev, ModuleLocalRecord, + ModuleLocalLookupTable); + } - // Write the module local lookup table - RecordData::value_type ModuleLocalRecord[] = { - UPDATE_MODULE_LOCAL_VISIBLE, getDeclID(cast(DC)).getRawValue()}; - Stream.EmitRecordWithBlob(ModuleLocalUpdateVisibleAbbrev, ModuleLocalRecord, - ModuleLocalLookupTable); + if (!TULookupTable.empty()) { + RecordData::value_type GMFRecord[] = { + UPDATE_TU_LOCAL_VISIBLE, getDeclID(cast(DC)).getRawValue()}; + Stream.EmitRecordWithBlob(TULocalUpdateVisibleAbbrev, GMFRecord, + TULookupTable); + } } /// Write an FP_PRAGMA_OPTIONS block for the given FPOptions. 
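For context, a minimal named-module sketch (illustrative only; the names mirror the basic.link/p2.cppm test updated later in this patch) of which lookup table each file-scope declaration now lands in, and why internal-linkage names stop being visible outside the interface unit itself:

// M.cppm -- module interface unit
export module M;
export int external_linkage_var = 0; // general visible lookup table
int module_linkage_var = 0;          // module-local table: found only from units of module M
static int internal_linkage_var = 0; // new TU-local table: not read by importers

// impl.cpp -- implementation unit of module M
module M;
int use_module() { return module_linkage_var; }     // OK: module-local lookup succeeds
int use_internal() { return internal_linkage_var; } // error: use of undeclared identifier

// user.cpp -- unrelated translation unit
import M;
int use_module_again() { return module_linkage_var; }     // error: use of undeclared identifier
int use_internal_again() { return internal_linkage_var; } // error: use of undeclared identifier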
@@ -5067,15 +5156,14 @@ void ASTWriter::WriteModuleFileExtension(Sema &SemaRef, void ASTRecordWriter::AddAttr(const Attr *A) { auto &Record = *this; - // FIXME: Clang can't handle the serialization/deserialization of - // preferred_name properly now. See - // https://github.com/llvm/llvm-project/issues/56490 for example. - if (!A || (isa(A) && - Writer->isWritingStdCXXNamedModules())) + if (!A) return Record.push_back(0); Record.push_back(A->getKind() + 1); // FIXME: stable encoding, target attrs + auto SkipIdx = Record.size(); + // Add placeholder for the size of deferred attribute. + Record.push_back(0); Record.AddIdentifierRef(A->getAttrName()); Record.AddIdentifierRef(A->getScopeName()); Record.AddSourceRange(A->getRange()); @@ -5086,6 +5174,12 @@ void ASTRecordWriter::AddAttr(const Attr *A) { Record.push_back(A->isRegularKeywordAttribute()); #include "clang/Serialization/AttrPCHWrite.inc" + + if (A->shouldDeferDeserialization()) { + // Record the actual size of deferred attribute (+ 1 to count the attribute + // kind). + Record[SkipIdx] = Record.size() - SkipIdx + 1; + } } /// Emit the list of attributes to the specified record. @@ -6025,9 +6119,12 @@ ASTFileSignature ASTWriter::WriteASTCore(Sema *SemaPtr, StringRef isysroot, } // Some simple statistics - RecordData::value_type Record[] = { - NumStatements, NumMacros, NumLexicalDeclContexts, NumVisibleDeclContexts, - NumModuleLocalDeclContexts}; + RecordData::value_type Record[] = {NumStatements, + NumMacros, + NumLexicalDeclContexts, + NumVisibleDeclContexts, + NumModuleLocalDeclContexts, + NumTULocalDeclContexts}; Stream.EmitRecord(STATISTICS, Record); Stream.ExitBlock(); Stream.FlushToWord(); @@ -6106,7 +6203,9 @@ void ASTWriter::WriteDeclAndTypes(ASTContext &Context) { uint64_t LexicalOffset = WriteDeclContextLexicalBlock(Context, NS); uint64_t VisibleOffset = 0; uint64_t ModuleLocalOffset = 0; - WriteDeclContextVisibleBlock(Context, NS, VisibleOffset, ModuleLocalOffset); + uint64_t TULocalOffset = 0; + WriteDeclContextVisibleBlock(Context, NS, VisibleOffset, ModuleLocalOffset, + TULocalOffset); // Write the offset relative to current block. if (LexicalOffset) @@ -6118,10 +6217,14 @@ void ASTWriter::WriteDeclAndTypes(ASTContext &Context) { if (ModuleLocalOffset) ModuleLocalOffset -= DeclTypesBlockStartOffset; + if (TULocalOffset) + TULocalOffset -= DeclTypesBlockStartOffset; + AddDeclRef(NS, DelayedNamespaceRecord); DelayedNamespaceRecord.push_back(LexicalOffset); DelayedNamespaceRecord.push_back(VisibleOffset); DelayedNamespaceRecord.push_back(ModuleLocalOffset); + DelayedNamespaceRecord.push_back(TULocalOffset); } // The process of writing lexical and visible block for delayed namespace @@ -6207,6 +6310,12 @@ void ASTWriter::WriteDeclAndTypes(ASTContext &Context) { Abv->Add(llvm::BitCodeAbbrevOp(llvm::BitCodeAbbrevOp::Blob)); ModuleLocalUpdateVisibleAbbrev = Stream.EmitAbbrev(std::move(Abv)); + Abv = std::make_shared(); + Abv->Add(llvm::BitCodeAbbrevOp(UPDATE_TU_LOCAL_VISIBLE)); + Abv->Add(llvm::BitCodeAbbrevOp(llvm::BitCodeAbbrevOp::VBR, 6)); + Abv->Add(llvm::BitCodeAbbrevOp(llvm::BitCodeAbbrevOp::Blob)); + TULocalUpdateVisibleAbbrev = Stream.EmitAbbrev(std::move(Abv)); + // And a visible updates block for the translation unit. 
WriteDeclContextVisibleUpdate(Context, TU); diff --git a/clang/lib/Serialization/ASTWriterDecl.cpp b/clang/lib/Serialization/ASTWriterDecl.cpp index 7a494cfe1ac64..30b28057f4c10 100644 --- a/clang/lib/Serialization/ASTWriterDecl.cpp +++ b/clang/lib/Serialization/ASTWriterDecl.cpp @@ -2069,6 +2069,7 @@ void ASTDeclWriter::VisitDeclContext(DeclContext *DC) { uint64_t LexicalOffset = 0; uint64_t VisibleOffset = 0; uint64_t ModuleLocalOffset = 0; + uint64_t TULocalOffset = 0; if (Writer.isGeneratingReducedBMI() && isa(DC) && cast(DC)->isFromExplicitGlobalModule()) { @@ -2080,12 +2081,14 @@ void ASTDeclWriter::VisitDeclContext(DeclContext *DC) { LexicalOffset = Writer.WriteDeclContextLexicalBlock(Record.getASTContext(), DC); Writer.WriteDeclContextVisibleBlock(Record.getASTContext(), DC, - VisibleOffset, ModuleLocalOffset); + VisibleOffset, ModuleLocalOffset, + TULocalOffset); } Record.AddOffset(LexicalOffset); Record.AddOffset(VisibleOffset); Record.AddOffset(ModuleLocalOffset); + Record.AddOffset(TULocalOffset); } const Decl *ASTWriter::getFirstLocalDecl(const Decl *D) { @@ -2441,6 +2444,7 @@ void ASTWriter::WriteDeclAbbrevs() { Abv->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::VBR, 6)); // LexicalOffset Abv->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::VBR, 6)); // VisibleOffset Abv->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::VBR, 6)); // ModuleLocalOffset + Abv->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::VBR, 6)); // TULocalOffset DeclEnumAbbrev = Stream.EmitAbbrev(std::move(Abv)); // Abbreviation for DECL_RECORD @@ -2494,6 +2498,7 @@ void ASTWriter::WriteDeclAbbrevs() { Abv->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::VBR, 6)); // LexicalOffset Abv->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::VBR, 6)); // VisibleOffset Abv->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::VBR, 6)); // ModuleLocalOffset + Abv->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::VBR, 6)); // TULocalOffset DeclRecordAbbrev = Stream.EmitAbbrev(std::move(Abv)); // Abbreviation for DECL_PARM_VAR @@ -2836,6 +2841,11 @@ void ASTWriter::WriteDeclAbbrevs() { Abv->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::Blob)); DeclModuleLocalVisibleLookupAbbrev = Stream.EmitAbbrev(std::move(Abv)); + Abv = std::make_shared(); + Abv->Add(BitCodeAbbrevOp(serialization::DECL_CONTEXT_TU_LOCAL_VISIBLE)); + Abv->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::Blob)); + DeclTULocalLookupAbbrev = Stream.EmitAbbrev(std::move(Abv)); + Abv = std::make_shared(); Abv->Add(BitCodeAbbrevOp(serialization::DECL_SPECIALIZATIONS)); Abv->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::Blob)); diff --git a/clang/test/AST/ByteCode/cxx20.cpp b/clang/test/AST/ByteCode/cxx20.cpp index 268362ceff635..268226a7c143e 100644 --- a/clang/test/AST/ByteCode/cxx20.cpp +++ b/clang/test/AST/ByteCode/cxx20.cpp @@ -893,3 +893,18 @@ namespace VirtDtor { static_assert(test('C', 'B')); } + +namespace TemporaryInNTTP { + template struct B { /* ... */ }; + struct J1 { + J1 *self=this; + }; + /// FIXME: The bytecode interpreter emits a different diagnostic here. + /// The current interpreter creates a fake MaterializeTemporaryExpr (see EvaluateAsConstantExpr) + /// which is later used as the LValueBase of the created APValue. + B j1; // ref-error {{pointer to temporary object is not allowed in a template argument}} \ + // expected-error {{non-type template argument is not a constant expression}} \ + // expected-note {{pointer to temporary is not a constant expression}} \ + // expected-note {{created here}} + B<2> j2; /// Ok. 
+} diff --git a/clang/test/CXX/basic/basic.lookup/basic.lookup.argdep/p5-ex2.cpp b/clang/test/CXX/basic/basic.lookup/basic.lookup.argdep/p5-ex2.cpp index a27946bd90a46..c200abafc0af8 100644 --- a/clang/test/CXX/basic/basic.lookup/basic.lookup.argdep/p5-ex2.cpp +++ b/clang/test/CXX/basic/basic.lookup/basic.lookup.argdep/p5-ex2.cpp @@ -61,6 +61,6 @@ void test() { // error: S::f is visible in instantiation context, but R::g has internal // linkage and cannot be used outside N.cpp - apply(x, S::Z()); // expected-error@N.cpp:10 {{no matching function for call to 'g'}} - // expected-note@-1 {{in instantiation of function template specialization 'apply' requested here}} + apply(x, S::Z()); // expected-error@N.cpp:10 {{use of undeclared identifier 'g'}} + // expected-note@-1 {{requested here}} } diff --git a/clang/test/CXX/basic/basic.scope/basic.scope.namespace/p2.cpp b/clang/test/CXX/basic/basic.scope/basic.scope.namespace/p2.cpp index 54ec6aa61ec37..d70eb7de22c6a 100644 --- a/clang/test/CXX/basic/basic.scope/basic.scope.namespace/p2.cpp +++ b/clang/test/CXX/basic/basic.scope/basic.scope.namespace/p2.cpp @@ -66,11 +66,7 @@ void test_late() { // expected-note@p2.cpp:18 {{'exported' declared here}} #endif - internal = 1; -#ifndef IMPLEMENTATION - // expected-error@-2 {{declaration of 'internal' must be imported from module 'A' before it is required}} - // expected-note@p2.cpp:20 {{declaration here is not visible}} -#endif + internal = 1; // expected-error {{use of undeclared identifier 'internal'}} not_exported_private = 1; #ifndef IMPLEMENTATION @@ -78,11 +74,7 @@ void test_late() { // expected-error@-3 {{undeclared identifier}} #endif - internal_private = 1; -#ifndef IMPLEMENTATION - // FIXME: should not be visible here - // expected-error@-3 {{undeclared identifier}} -#endif + internal_private = 1; // expected-error {{use of undeclared identifier 'internal_private'}} } #endif diff --git a/clang/test/CXX/module/basic/basic.def.odr/p4.cppm b/clang/test/CXX/module/basic/basic.def.odr/p4.cppm index 487dbdef283ee..7e88cbe78b4e3 100644 --- a/clang/test/CXX/module/basic/basic.def.odr/p4.cppm +++ b/clang/test/CXX/module/basic/basic.def.odr/p4.cppm @@ -128,7 +128,6 @@ void f(a::b, a::c) {} // // CHECK-DAG: @_ZW6Module25extern_var_module_linkage = external {{(dso_local )?}}global // CHECK-DAG: @_ZW6Module25inline_var_module_linkage = linkonce_odr {{(dso_local )?}}global -// CHECK-DAG: @_ZL25static_var_module_linkage = internal {{(dso_local )?}}global i32 0, // CHECK-DAG: @_ZW6Module24const_var_module_linkage = available_externally {{(dso_local )?}}constant i32 3, module Module; @@ -152,10 +151,6 @@ void use() { (void)&extern_var_module_linkage; (void)&inline_var_module_linkage; - // FIXME: Issue #61427 Internal-linkage declarations in the interface TU - // should not be not visible here. - (void)&static_var_module_linkage; // FIXME: Should not be visible here. 
- (void)&const_var_module_linkage; // FIXME: will be visible after P2788R0 } diff --git a/clang/test/CXX/module/basic/basic.link/p2.cppm b/clang/test/CXX/module/basic/basic.link/p2.cppm index 5a497304201dc..d7d2b5992a235 100644 --- a/clang/test/CXX/module/basic/basic.link/p2.cppm +++ b/clang/test/CXX/module/basic/basic.link/p2.cppm @@ -45,16 +45,14 @@ module M; void use_from_module_impl() { external_linkage_fn(); module_linkage_fn(); - internal_linkage_fn(); // expected-error {{no matching function for call to 'internal_linkage_fn'}} + internal_linkage_fn(); // expected-error {{use of undeclared identifier 'internal_linkage_fn'}} // expected-note@* {{}} (void)external_linkage_class{}; (void)module_linkage_class{}; (void)external_linkage_var; (void)module_linkage_var; - // FIXME: Issue #61427 Internal-linkage declarations in the interface TU - // should not be not visible here. - (void)internal_linkage_class{}; - (void)internal_linkage_var; + (void)internal_linkage_class{}; // expected-error {{use of undeclared identifier 'internal_linkage_class'}} //expected-error{{}} + (void)internal_linkage_var; // expected-error {{use of undeclared identifier 'internal_linkage_var'}} } //--- user.cpp @@ -63,11 +61,10 @@ import M; void use_from_module_impl() { external_linkage_fn(); module_linkage_fn(); // expected-error {{use of undeclared identifier 'module_linkage_fn'}} - internal_linkage_fn(); // expected-error {{declaration of 'internal_linkage_fn' must be imported}} + internal_linkage_fn(); // expected-error {{use of undeclared identifier 'internal_linkage_fn'}} (void)external_linkage_class{}; - (void)module_linkage_class{}; // expected-error {{undeclared identifier}} expected-error 0+{{}} + (void)module_linkage_class{}; // expected-error {{undeclared identifier}} expected-error 0+{{}} // expected-note@* {{}} (void)internal_linkage_class{}; // expected-error {{undeclared identifier}} expected-error 0+{{}} - // expected-note@M.cppm:10 {{declaration here is not visible}} (void)external_linkage_var; (void)module_linkage_var; // expected-error {{undeclared identifier}} (void)internal_linkage_var; // expected-error {{undeclared identifier}} diff --git a/clang/test/Modules/gmodules-nodebug.cpp b/clang/test/Modules/gmodules-nodebug.cpp new file mode 100644 index 0000000000000..d83103768e838 --- /dev/null +++ b/clang/test/Modules/gmodules-nodebug.cpp @@ -0,0 +1,14 @@ +// REQUIRES: asserts + +// RUN: %clang_cc1 -std=c++23 -x c++-header -emit-pch -fmodule-format=obj \ +// RUN: -o %t.pch %s \ +// RUN: -mllvm -debug-only=pchcontainer &>%t-pch.ll +// RUN: cat %t-pch.ll | FileCheck %s + +template +using __void_t [[gnu::nodebug]] = void; + +__void_t<> func() {} + +// CHECK: !DICompileUnit +// CHECK-NOT: __void_t diff --git a/clang/test/Modules/preferred_name.cppm b/clang/test/Modules/preferred_name.cppm index 806781a81c5ca..86ba6ae96db99 100644 --- a/clang/test/Modules/preferred_name.cppm +++ b/clang/test/Modules/preferred_name.cppm @@ -53,10 +53,16 @@ import A; export using ::foo_templ; //--- Use1.cpp -import A; // expected-warning@foo.h:8 {{attribute declaration must precede definition}} -#include "foo.h" // expected-note@foo.h:9 {{previous definition is here}} - +// expected-no-diagnostics +import A; +#include "foo.h" //--- Use2.cpp // expected-no-diagnostics #include "foo.h" import A; + +//--- Use3.cpp +#include "foo.h" +import A; +foo test; +int size = test.size(); // expected-error {{no member named 'size' in 'foo'}} diff --git a/clang/tools/diagtool/DiagnosticNames.cpp 
b/clang/tools/diagtool/DiagnosticNames.cpp index eb90f082437b3..c3a3002889c73 100644 --- a/clang/tools/diagtool/DiagnosticNames.cpp +++ b/clang/tools/diagtool/DiagnosticNames.cpp @@ -23,26 +23,13 @@ llvm::ArrayRef diagtool::getBuiltinDiagnosticsByName() { return llvm::ArrayRef(BuiltinDiagnosticsByName); } - // FIXME: Is it worth having two tables, especially when this one can get // out of sync easily? static const DiagnosticRecord BuiltinDiagnosticsByID[] = { #define DIAG(ENUM, CLASS, DEFAULT_MAPPING, DESC, GROUP, SFINAE, NOWERROR, \ SHOWINSYSHEADER, SHOWINSYSMACRO, DEFER, CATEGORY) \ {#ENUM, diag::ENUM, STR_SIZE(#ENUM, uint8_t)}, -#include "clang/Basic/DiagnosticCommonKinds.inc" -#include "clang/Basic/DiagnosticCrossTUKinds.inc" -#include "clang/Basic/DiagnosticDriverKinds.inc" -#include "clang/Basic/DiagnosticFrontendKinds.inc" -#include "clang/Basic/DiagnosticSerializationKinds.inc" -#include "clang/Basic/DiagnosticLexKinds.inc" -#include "clang/Basic/DiagnosticParseKinds.inc" -#include "clang/Basic/DiagnosticASTKinds.inc" -#include "clang/Basic/DiagnosticCommentKinds.inc" -#include "clang/Basic/DiagnosticSemaKinds.inc" -#include "clang/Basic/DiagnosticAnalysisKinds.inc" -#include "clang/Basic/DiagnosticRefactoringKinds.inc" -#include "clang/Basic/DiagnosticInstallAPIKinds.inc" +#include "clang/Basic/AllDiagnosticKinds.inc" #undef DIAG }; @@ -54,6 +41,13 @@ static bool orderByID(const DiagnosticRecord &Left, const DiagnosticRecord &diagtool::getDiagnosticForID(short DiagID) { DiagnosticRecord Key = {nullptr, DiagID, 0}; +  // For lower_bound to produce a valid result it is enough that +  // BuiltinDiagnosticsByID is partitioned (by DiagID), but since we want this +  // function to work for every possible DiagID value passed in, it is better +  // to check up front that BuiltinDiagnosticsByID is actually sorted. +  assert(llvm::is_sorted(BuiltinDiagnosticsByID, orderByID) && +         "IDs in BuiltinDiagnosticsByID must be sorted."); const DiagnosticRecord *Result = llvm::lower_bound(BuiltinDiagnosticsByID, Key, orderByID); assert(Result && "diagnostic not found; table may be out of date"); diff --git a/clang/utils/TableGen/ClangAttrEmitter.cpp b/clang/utils/TableGen/ClangAttrEmitter.cpp index cc6a8eaebd44e..41730eba32ce2 100644 --- a/clang/utils/TableGen/ClangAttrEmitter.cpp +++ b/clang/utils/TableGen/ClangAttrEmitter.cpp @@ -3043,6 +3043,10 @@ static void emitAttributes(const RecordKeeper &Records, raw_ostream &OS, << (R.getValueAsBit("InheritEvenIfAlreadyPresent") ? "true" : "false"); } +  if (R.getValueAsBit("DeferDeserialization")) { +    OS << ", " +       << "/*DeferDeserialization=*/true"; +  } OS << ")\n"; for (auto const &ai : Args) { diff --git a/flang/test/Lower/module_use.f90 b/flang/test/Lower/module_use.f90 index ad43865470b68..b976663239ef5 100644 --- a/flang/test/Lower/module_use.f90 +++ b/flang/test/Lower/module_use.f90 @@ -1,5 +1,6 @@ -! RUN: bbc -emit-fir %S/module_definition.f90 -! RUN: bbc -emit-fir %s -o - | FileCheck %s +! RUN: rm -fr %t && mkdir -p %t +! RUN: bbc -emit-fir -module %t %S/module_definition.f90 +! RUN: bbc -emit-fir -J %t %s -o - | FileCheck %s ! Test use of module data not defined in this file. !
The modules are defined in module_definition.f90 diff --git a/libclc/clc/include/clc/common/clc_degrees.h b/libclc/clc/include/clc/common/clc_degrees.h new file mode 100644 index 0000000000000..e8bb684fcd4d7 --- /dev/null +++ b/libclc/clc/include/clc/common/clc_degrees.h @@ -0,0 +1,12 @@ +#ifndef __CLC_MATH_CLC_DEGREES_H__ +#define __CLC_MATH_CLC_DEGREES_H__ + +#define __CLC_BODY +#define __CLC_FUNCTION __clc_degrees + +#include + +#undef __CLC_BODY +#undef __CLC_FUNCTION + +#endif // __CLC_MATH_CLC_DEGREES_H__ diff --git a/libclc/clc/include/clc/common/clc_radians.h b/libclc/clc/include/clc/common/clc_radians.h new file mode 100644 index 0000000000000..80d481e8de723 --- /dev/null +++ b/libclc/clc/include/clc/common/clc_radians.h @@ -0,0 +1,12 @@ +#ifndef __CLC_MATH_CLC_RADIANS_H__ +#define __CLC_MATH_CLC_RADIANS_H__ + +#define __CLC_BODY +#define __CLC_FUNCTION __clc_radians + +#include + +#undef __CLC_BODY +#undef __CLC_FUNCTION + +#endif // __CLC_MATH_CLC_RADIANS_H__ diff --git a/libclc/clc/lib/generic/SOURCES b/libclc/clc/lib/generic/SOURCES index f3097de694422..d74bff20ba87b 100644 --- a/libclc/clc/lib/generic/SOURCES +++ b/libclc/clc/lib/generic/SOURCES @@ -1,3 +1,5 @@ +common/clc_degrees.cl +common/clc_radians.cl common/clc_smoothstep.cl geometric/clc_dot.cl integer/clc_abs.cl diff --git a/libclc/clc/lib/generic/common/clc_degrees.cl b/libclc/clc/lib/generic/common/clc_degrees.cl new file mode 100644 index 0000000000000..ce705982072e8 --- /dev/null +++ b/libclc/clc/lib/generic/common/clc_degrees.cl @@ -0,0 +1,56 @@ +/* + * Copyright (c) 2014,2015 Advanced Micro Devices, Inc. + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN + * THE SOFTWARE. 
+ */ + +#include +#include + +#define DEGREES_SINGLE_DEF(TYPE, LITERAL) \ + _CLC_OVERLOAD _CLC_DEF TYPE __clc_degrees(TYPE radians) { \ + return (TYPE)LITERAL * radians; \ + } + +#define DEGREES_DEF(TYPE, LITERAL) \ + DEGREES_SINGLE_DEF(TYPE, LITERAL) \ + DEGREES_SINGLE_DEF(TYPE##2, LITERAL) \ + DEGREES_SINGLE_DEF(TYPE##3, LITERAL) \ + DEGREES_SINGLE_DEF(TYPE##4, LITERAL) \ + DEGREES_SINGLE_DEF(TYPE##8, LITERAL) \ + DEGREES_SINGLE_DEF(TYPE##16, LITERAL) + +// 180/pi = ~57.29577951308232087685 or 0x1.ca5dc1a63c1f8p+5 or 0x1.ca5dc2p+5F +DEGREES_DEF(float, 0x1.ca5dc2p+5F) + +#ifdef cl_khr_fp64 +#pragma OPENCL EXTENSION cl_khr_fp64 : enable + +// 180/pi = ~57.29577951308232087685 or 0x1.ca5dc1a63c1f8p+5 or 0x1.ca5dc2p+5F +DEGREES_DEF(double, 0x1.ca5dc1a63c1f8p+5) + +#endif + +#ifdef cl_khr_fp16 +#pragma OPENCL EXTENSION cl_khr_fp16 : enable + +// 180/pi = ~57.29577951308232087685 or 0x1.ca5dc1a63c1f8p+5 or 0x1.ca5dc2p+5F +DEGREES_DEF(half, (half)0x1.ca5dc1a63c1f8p+5) + +#endif diff --git a/libclc/clc/lib/generic/common/clc_radians.cl b/libclc/clc/lib/generic/common/clc_radians.cl new file mode 100644 index 0000000000000..850b8eb84f9da --- /dev/null +++ b/libclc/clc/lib/generic/common/clc_radians.cl @@ -0,0 +1,56 @@ +/* + * Copyright (c) 2014,2015 Advanced Micro Devices, Inc. + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN + * THE SOFTWARE. 
+ */ + +#include +#include + +#define RADIANS_SINGLE_DEF(TYPE, LITERAL) \ + _CLC_OVERLOAD _CLC_DEF TYPE __clc_radians(TYPE radians) { \ + return (TYPE)LITERAL * radians; \ + } + +#define RADIANS_DEF(TYPE, LITERAL) \ + RADIANS_SINGLE_DEF(TYPE, LITERAL) \ + RADIANS_SINGLE_DEF(TYPE##2, LITERAL) \ + RADIANS_SINGLE_DEF(TYPE##3, LITERAL) \ + RADIANS_SINGLE_DEF(TYPE##4, LITERAL) \ + RADIANS_SINGLE_DEF(TYPE##8, LITERAL) \ + RADIANS_SINGLE_DEF(TYPE##16, LITERAL) + +// pi/180 = ~0.01745329251994329577 or 0x1.1df46a2529d39p-6 or 0x1.1df46ap-6F +RADIANS_DEF(float, 0x1.1df46ap-6F) + +#ifdef cl_khr_fp64 +#pragma OPENCL EXTENSION cl_khr_fp64 : enable + +// pi/180 = ~0.01745329251994329577 or 0x1.1df46a2529d39p-6 or 0x1.1df46ap-6F +RADIANS_DEF(double, 0x1.1df46a2529d39p-6) + +#endif + +#ifdef cl_khr_fp16 +#pragma OPENCL EXTENSION cl_khr_fp16 : enable + +// pi/180 = ~0.01745329251994329577 or 0x1.1df46a2529d39p-6 or 0x1.1df46ap-6F +RADIANS_DEF(half, (half)0x1.1df46a2529d39p-6) + +#endif diff --git a/libclc/clc/lib/spirv/SOURCES b/libclc/clc/lib/spirv/SOURCES index 02784b8def682..ac855ea5184ed 100644 --- a/libclc/clc/lib/spirv/SOURCES +++ b/libclc/clc/lib/spirv/SOURCES @@ -1,3 +1,5 @@ +../generic/common/clc_degrees.cl +../generic/common/clc_radians.cl ../generic/common/clc_smoothstep.cl ../generic/geometric/clc_dot.cl ../generic/math/clc_ceil.cl diff --git a/libclc/clc/lib/spirv64/SOURCES b/libclc/clc/lib/spirv64/SOURCES index 02784b8def682..ac855ea5184ed 100644 --- a/libclc/clc/lib/spirv64/SOURCES +++ b/libclc/clc/lib/spirv64/SOURCES @@ -1,3 +1,5 @@ +../generic/common/clc_degrees.cl +../generic/common/clc_radians.cl ../generic/common/clc_smoothstep.cl ../generic/geometric/clc_dot.cl ../generic/math/clc_ceil.cl diff --git a/libclc/generic/lib/common/degrees.cl b/libclc/generic/lib/common/degrees.cl index cf49b190c76b3..a9715d64f507a 100644 --- a/libclc/generic/lib/common/degrees.cl +++ b/libclc/generic/lib/common/degrees.cl @@ -22,23 +22,20 @@ #include #include +#include -_CLC_OVERLOAD _CLC_DEF float degrees(float radians) { - // 180/pi = ~57.29577951308232087685 or 0x1.ca5dc1a63c1f8p+5 or 0x1.ca5dc2p+5F - return 0x1.ca5dc2p+5F * radians; -} - -_CLC_UNARY_VECTORIZE(_CLC_OVERLOAD _CLC_DEF, float, degrees, float); - +_CLC_DEFINE_UNARY_BUILTIN(float, degrees, __clc_degrees, float) #ifdef cl_khr_fp64 #pragma OPENCL EXTENSION cl_khr_fp64 : enable -_CLC_OVERLOAD _CLC_DEF double degrees(double radians) { - // 180/pi = ~57.29577951308232087685 or 0x1.ca5dc1a63c1f8p+5 or 0x1.ca5dc2p+5F - return 0x1.ca5dc1a63c1f8p+5 * radians; -} +_CLC_DEFINE_UNARY_BUILTIN(double, degrees, __clc_degrees, double) + +#endif + +#ifdef cl_khr_fp16 +#pragma OPENCL EXTENSION cl_khr_fp16 : enable -_CLC_UNARY_VECTORIZE(_CLC_OVERLOAD _CLC_DEF, double, degrees, double); +_CLC_DEFINE_UNARY_BUILTIN(half, degrees, __clc_degrees, half) #endif diff --git a/libclc/generic/lib/common/radians.cl b/libclc/generic/lib/common/radians.cl index 645a30549afed..b5dcbfe6e3fd2 100644 --- a/libclc/generic/lib/common/radians.cl +++ b/libclc/generic/lib/common/radians.cl @@ -22,23 +22,20 @@ #include #include +#include -_CLC_OVERLOAD _CLC_DEF float radians(float degrees) { - // pi/180 = ~0.01745329251994329577 or 0x1.1df46a2529d39p-6 or 0x1.1df46ap-6F - return 0x1.1df46ap-6F * degrees; -} - -_CLC_UNARY_VECTORIZE(_CLC_OVERLOAD _CLC_DEF, float, radians, float); - +_CLC_DEFINE_UNARY_BUILTIN(float, radians, __clc_radians, float) #ifdef cl_khr_fp64 #pragma OPENCL EXTENSION cl_khr_fp64 : enable -_CLC_OVERLOAD _CLC_DEF double radians(double degrees) { - // pi/180 = 
~0.01745329251994329577 or 0x1.1df46a2529d39p-6 or 0x1.1df46ap-6F - return 0x1.1df46a2529d39p-6 * degrees; -} +_CLC_DEFINE_UNARY_BUILTIN(double, radians, __clc_radians, double) + +#endif + +#ifdef cl_khr_fp16 +#pragma OPENCL EXTENSION cl_khr_fp16 : enable -_CLC_UNARY_VECTORIZE(_CLC_OVERLOAD _CLC_DEF, double, radians, double); +_CLC_DEFINE_UNARY_BUILTIN(half, radians, __clc_radians, half) #endif diff --git a/lld/COFF/COFFLinkerContext.h b/lld/COFF/COFFLinkerContext.h index bdd625b8c3916..8322f829d4055 100644 --- a/lld/COFF/COFFLinkerContext.h +++ b/lld/COFF/COFFLinkerContext.h @@ -56,7 +56,6 @@ class COFFLinkerContext : public CommonLinkerContext { std::vector objFileInstances; std::map pdbInputFileInstances; std::vector importFileInstances; - std::vector bitcodeFileInstances; MergeChunk *mergeChunkInstances[Log2MaxSectionAlignment + 1] = {}; diff --git a/lld/COFF/Driver.cpp b/lld/COFF/Driver.cpp index 8b1a8dc3e5af7..898c6c17d2062 100644 --- a/lld/COFF/Driver.cpp +++ b/lld/COFF/Driver.cpp @@ -218,7 +218,7 @@ void LinkerDriver::addFile(InputFile *file) { << " linked in after " "doing LTO compilation."; } - ctx.bitcodeFileInstances.push_back(f); + f->symtab.bitcodeFileInstances.push_back(f); } else if (auto *f = dyn_cast(file)) { ctx.importFileInstances.push_back(f); } @@ -285,7 +285,7 @@ void LinkerDriver::addBuffer(std::unique_ptr mb, addFile(make(ctx, mbref)); break; case file_magic::bitcode: - addFile(make(ctx, mbref, "", 0, lazy)); + addFile(BitcodeFile::create(ctx, mbref, "", 0, lazy)); break; case file_magic::coff_object: case file_magic::coff_import_library: @@ -374,8 +374,8 @@ void LinkerDriver::addArchiveBuffer(MemoryBufferRef mb, StringRef symName, if (magic == file_magic::coff_object) { obj = ObjFile::create(ctx, mb); } else if (magic == file_magic::bitcode) { - obj = - make(ctx, mb, parentName, offsetInArchive, /*lazy=*/false); + obj = BitcodeFile::create(ctx, mb, parentName, offsetInArchive, + /*lazy=*/false); } else if (magic == file_magic::coff_cl_gl_object) { Err(ctx) << mb.getBufferIdentifier() << ": is not a native COFF file. Recompile without /GL?"; @@ -2571,19 +2571,19 @@ void LinkerDriver::linkerMain(ArrayRef argsArr) { } } - // If any inputs are bitcode files, the LTO code generator may create - // references to library functions that are not explicit in the bitcode - // file's symbol table. If any of those library functions are defined in a - // bitcode file in an archive member, we need to arrange to use LTO to - // compile those archive members by adding them to the link beforehand. - if (!ctx.bitcodeFileInstances.empty()) { - llvm::Triple TT( - ctx.bitcodeFileInstances.front()->obj->getTargetTriple()); - for (auto *s : lto::LTO::getRuntimeLibcallSymbols(TT)) - ctx.symtab.addLibcall(s); - } - ctx.forEachSymtab([&](SymbolTable &symtab) { + // If any inputs are bitcode files, the LTO code generator may create + // references to library functions that are not explicit in the bitcode + // file's symbol table. If any of those library functions are defined in + // a bitcode file in an archive member, we need to arrange to use LTO to + // compile those archive members by adding them to the link beforehand. + if (!symtab.bitcodeFileInstances.empty()) { + llvm::Triple TT( + symtab.bitcodeFileInstances.front()->obj->getTargetTriple()); + for (auto *s : lto::LTO::getRuntimeLibcallSymbols(TT)) + symtab.addLibcall(s); + } + // Windows specific -- if __load_config_used can be resolved, resolve // it. 
if (symtab.findUnderscore("_load_config_used")) @@ -2639,8 +2639,11 @@ void LinkerDriver::linkerMain(ArrayRef argsArr) { // If we are going to do codegen for link-time optimization, check for // unresolvable symbols first, so we don't spend time generating code that // will fail to link anyway. - if (!ctx.bitcodeFileInstances.empty() && !config->forceUnresolved) - ctx.symtab.reportUnresolvable(); + if (!config->forceUnresolved) + ctx.forEachSymtab([](SymbolTable &symtab) { + if (!symtab.bitcodeFileInstances.empty()) + symtab.reportUnresolvable(); + }); if (errorCount()) return; @@ -2655,7 +2658,7 @@ void LinkerDriver::linkerMain(ArrayRef argsArr) { // link those files (unless -thinlto-index-only was given, in which case we // resolve symbols and write indices, but don't generate native code or link). ltoCompilationDone = true; - ctx.symtab.compileBitcodeFiles(); + ctx.forEachSymtab([](SymbolTable &symtab) { symtab.compileBitcodeFiles(); }); if (Defined *d = dyn_cast_or_null(ctx.symtab.findUnderscore("_tls_used"))) diff --git a/lld/COFF/InputFiles.cpp b/lld/COFF/InputFiles.cpp index 66641ff9dcc1f..5ee73d4dc4f8b 100644 --- a/lld/COFF/InputFiles.cpp +++ b/lld/COFF/InputFiles.cpp @@ -1229,10 +1229,15 @@ void ImportFile::parse() { } } -BitcodeFile::BitcodeFile(COFFLinkerContext &ctx, MemoryBufferRef mb, - StringRef archiveName, uint64_t offsetInArchive, - bool lazy) - : InputFile(ctx.symtab, BitcodeKind, mb, lazy) { +BitcodeFile::BitcodeFile(SymbolTable &symtab, MemoryBufferRef mb, + std::unique_ptr &o, bool lazy) + : InputFile(symtab, BitcodeKind, mb, lazy) { + obj.swap(o); +} + +BitcodeFile *BitcodeFile::create(COFFLinkerContext &ctx, MemoryBufferRef mb, + StringRef archiveName, + uint64_t offsetInArchive, bool lazy) { std::string path = mb.getBufferIdentifier().str(); if (ctx.config.thinLTOIndexOnly) path = replaceThinLTOSuffix(mb.getBufferIdentifier(), @@ -1252,7 +1257,9 @@ BitcodeFile::BitcodeFile(COFFLinkerContext &ctx, MemoryBufferRef mb, sys::path::filename(path) + utostr(offsetInArchive))); - obj = check(lto::InputFile::create(mbref)); + std::unique_ptr obj = check(lto::InputFile::create(mbref)); + return make(ctx.getSymtab(getMachineType(obj.get())), mb, obj, + lazy); } BitcodeFile::~BitcodeFile() = default; @@ -1329,7 +1336,7 @@ void BitcodeFile::parseLazy() { } } -MachineTypes BitcodeFile::getMachineType() const { +MachineTypes BitcodeFile::getMachineType(const llvm::lto::InputFile *obj) { Triple t(obj->getTargetTriple()); switch (t.getArch()) { case Triple::x86_64: diff --git a/lld/COFF/InputFiles.h b/lld/COFF/InputFiles.h index d3075c5e0a338..823561cda247a 100644 --- a/lld/COFF/InputFiles.h +++ b/lld/COFF/InputFiles.h @@ -386,13 +386,19 @@ class ImportFile : public InputFile { // Used for LTO. 
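The BitcodeFile changes above replace a parsing constructor with a static create() factory: the owning per-machine SymbolTable can only be picked after lto::InputFile::create has been run on the buffer, so parsing is hoisted out of the constructor and its result is handed in. A rough, self-contained sketch of that parse-then-construct shape; Input, ParsedInput and MachineSymtab below are hypothetical stand-ins, not the lld classes:

    #include <iostream>
    #include <memory>
    #include <string>

    struct ParsedInput { std::string triple; };
    struct MachineSymtab { std::string name; };

    MachineSymtab X64{"x86_64"};
    MachineSymtab A64{"aarch64"};

    class Input {
      MachineSymtab &symtab;
      std::unique_ptr<ParsedInput> parsed;
      Input(MachineSymtab &s, std::unique_ptr<ParsedInput> p)
          : symtab(s), parsed(std::move(p)) {}

    public:
      // Parse first, choose the owner from the parse result, then construct.
      static std::unique_ptr<Input> create(const std::string &buffer) {
        auto parsed = std::make_unique<ParsedInput>();
        parsed->triple =
            buffer.find("aarch64") != std::string::npos ? "aarch64" : "x86_64";
        MachineSymtab &owner = parsed->triple == "aarch64" ? A64 : X64;
        return std::unique_ptr<Input>(new Input(owner, std::move(parsed)));
      }
      const std::string &owner() const { return symtab.name; }
    };

    int main() {
      std::cout << Input::create("target triple = aarch64-...")->owner() << "\n";
    }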
class BitcodeFile : public InputFile { public: - explicit BitcodeFile(COFFLinkerContext &ctx, MemoryBufferRef mb, - StringRef archiveName, uint64_t offsetInArchive, - bool lazy); + explicit BitcodeFile(SymbolTable &symtab, MemoryBufferRef mb, + std::unique_ptr &obj, bool lazy); ~BitcodeFile(); + + static BitcodeFile *create(COFFLinkerContext &ctx, MemoryBufferRef mb, + StringRef archiveName, uint64_t offsetInArchive, + bool lazy); static bool classof(const InputFile *f) { return f->kind() == BitcodeKind; } ArrayRef getSymbols() { return symbols; } - MachineTypes getMachineType() const override; + MachineTypes getMachineType() const override { + return getMachineType(obj.get()); + } + static MachineTypes getMachineType(const llvm::lto::InputFile *obj); void parseLazy(); std::unique_ptr obj; diff --git a/lld/COFF/SymbolTable.cpp b/lld/COFF/SymbolTable.cpp index 36dcd0dfe1389..bf965e8a2332d 100644 --- a/lld/COFF/SymbolTable.cpp +++ b/lld/COFF/SymbolTable.cpp @@ -347,8 +347,8 @@ bool SymbolTable::handleMinGWAutomaticImport(Symbol *sym, StringRef name) { /// defined symbol imported" diagnostic for symbols in localImports. /// objFiles and bitcodeFiles (if not nullptr) are used to report where /// undefined symbols are referenced. -static void reportProblemSymbols( - COFFLinkerContext &ctx, const SmallPtrSetImpl &undefs, +void SymbolTable::reportProblemSymbols( + const SmallPtrSetImpl &undefs, const DenseMap *localImports, bool needBitcodeFiles) { // Return early if there is nothing to report (which should be // the common case). @@ -392,7 +392,7 @@ static void reportProblemSymbols( processFile(file, file->getSymbols()); if (needBitcodeFiles) - for (BitcodeFile *file : ctx.bitcodeFileInstances) + for (BitcodeFile *file : bitcodeFileInstances) processFile(file, file->getSymbols()); for (const UndefinedDiag &undefDiag : undefDiags) @@ -423,8 +423,7 @@ void SymbolTable::reportUnresolvable() { undefs.insert(sym); } - reportProblemSymbols(ctx, undefs, - /* localImports */ nullptr, true); + reportProblemSymbols(undefs, /*localImports=*/nullptr, true); } bool SymbolTable::resolveRemainingUndefines() { @@ -506,8 +505,8 @@ bool SymbolTable::resolveRemainingUndefines() { } reportProblemSymbols( - ctx, undefs, - ctx.config.warnLocallyDefinedImported ? &localImports : nullptr, false); + undefs, ctx.config.warnLocallyDefinedImported ? 
&localImports : nullptr, + false); return foundLazy; } @@ -1124,13 +1123,13 @@ Symbol *SymbolTable::addUndefined(StringRef name) { } void SymbolTable::compileBitcodeFiles() { - if (ctx.bitcodeFileInstances.empty()) + if (bitcodeFileInstances.empty()) return; llvm::TimeTraceScope timeScope("Compile bitcode"); ScopedTimer t(ctx.ltoTimer); lto.reset(new BitcodeCompiler(ctx)); - for (BitcodeFile *f : ctx.bitcodeFileInstances) + for (BitcodeFile *f : bitcodeFileInstances) lto->add(*f); for (InputFile *newObj : lto->compile()) { ObjFile *obj = cast(newObj); diff --git a/lld/COFF/SymbolTable.h b/lld/COFF/SymbolTable.h index 9e316fcdbe630..66bca0d63e5ff 100644 --- a/lld/COFF/SymbolTable.h +++ b/lld/COFF/SymbolTable.h @@ -14,6 +14,7 @@ #include "llvm/ADT/CachedHashString.h" #include "llvm/ADT/DenseMap.h" #include "llvm/ADT/DenseMapInfo.h" +#include "llvm/ADT/SmallPtrSet.h" #include "llvm/Support/raw_ostream.h" namespace llvm { @@ -155,6 +156,8 @@ class SymbolTable { callback(pair.second); } + std::vector bitcodeFileInstances; + DefinedRegular *loadConfigSym = nullptr; uint32_t loadConfigSize = 0; void initializeLoadConfig(); @@ -175,6 +178,11 @@ class SymbolTable { std::unique_ptr lto; std::vector> entryThunks; llvm::DenseMap exitThunks; + + void + reportProblemSymbols(const llvm::SmallPtrSetImpl &undefs, + const llvm::DenseMap *localImports, + bool needBitcodeFiles); }; std::vector getSymbolLocations(ObjFile *file, uint32_t symIndex); diff --git a/lld/test/COFF/lto-arm64x.ll b/lld/test/COFF/lto-arm64x.ll new file mode 100644 index 0000000000000..bbfc6b64c6fce --- /dev/null +++ b/lld/test/COFF/lto-arm64x.ll @@ -0,0 +1,47 @@ +; REQUIRES: aarch64, x86 +; RUN: split-file %s %t.dir && cd %t.dir + +; RUN: llvm-as arm64ec.ll -o arm64ec.obj +; RUN: llvm-as aarch64.ll -o aarch64.obj +; RUN: llvm-mc -filetype=obj -triple=aarch64-windows %S/Inputs/loadconfig-arm64.s -o loadconfig-arm64.obj +; RUN: llvm-mc -filetype=obj -triple=arm64ec-windows %S/Inputs/loadconfig-arm64ec.s -o loadconfig-arm64ec.obj + +; RUN: lld-link -machine:arm64x aarch64.obj arm64ec.obj loadconfig-arm64.obj loadconfig-arm64ec.obj -out:out.exe -subsystem:console +; RUN: llvm-objdump -d out.exe | FileCheck %s + +; CHECK: 0000000140001000 <.text>: +; CHECK-NEXT: 140001000: 52800020 mov w0, #0x1 // =1 +; CHECK-NEXT: 140001004: d65f03c0 ret +; CHECK-NEXT: ... 
+; CHECK-NEXT: 140002000: 00000009 udf #0x9 +; CHECK-NEXT: 140002004: 52800040 mov w0, #0x2 // =2 +; CHECK-NEXT: 140002008: d65f03c0 ret + +; CHECK: 0000000140003000 <.hexpthk>: +; CHECK-NEXT: 140003000: 48 8b c4 movq %rsp, %rax +; CHECK-NEXT: 140003003: 48 89 58 20 movq %rbx, 0x20(%rax) +; CHECK-NEXT: 140003007: 55 pushq %rbp +; CHECK-NEXT: 140003008: 5d popq %rbp +; CHECK-NEXT: 140003009: e9 f6 ef ff ff jmp 0x140002004 <.text+0x1004> +; CHECK-NEXT: 14000300e: cc int3 +; CHECK-NEXT: 14000300f: cc int3 + +#--- arm64ec.ll + +target datalayout = "e-m:w-p270:32:32-p271:32:32-p272:64:64-p:64:64-i32:32-i64:64-i128:128-n32:64-S128-Fn32" +target triple = "arm64ec-unknown-windows-msvc" + +define dso_local i32 @mainCRTStartup() { +entry: + ret i32 2 +} + +#--- aarch64.ll + +target datalayout = "e-m:w-p270:32:32-p271:32:32-p272:64:64-p:64:64-i32:32-i64:64-i128:128-n32:64-S128-Fn32" +target triple = "aarch64-unknown-windows-msvc" + +define dso_local i32 @mainCRTStartup() { +entry: + ret i32 1 +} diff --git a/lldb/include/lldb/Symbol/Function.h b/lldb/include/lldb/Symbol/Function.h index 157c007bdf0e8..d0b27269568b0 100644 --- a/lldb/include/lldb/Symbol/Function.h +++ b/lldb/include/lldb/Symbol/Function.h @@ -454,6 +454,11 @@ class Function : public UserID, public SymbolContextScope { /// and variables). const Address &GetAddress() const { return m_address; } + bool GetRangeContainingLoadAddress(lldb::addr_t load_addr, Target &target, + AddressRange &range) { + return m_block.GetRangeContainingLoadAddress(load_addr, target, range); + } + lldb::LanguageType GetLanguage() const; /// Find the file and line number of the source location of the start of the /// function. This will use the declaration if present and fall back on the diff --git a/lldb/source/API/SBThread.cpp b/lldb/source/API/SBThread.cpp index 4e61c83889b0b..cc848076dab5f 100644 --- a/lldb/source/API/SBThread.cpp +++ b/lldb/source/API/SBThread.cpp @@ -842,7 +842,6 @@ SBError SBThread::StepOverUntil(lldb::SBFrame &sb_frame, // appropriate error message. bool all_in_function = true; - AddressRange fun_range = frame_sc.function->GetAddressRange(); std::vector step_over_until_addrs; const bool abort_other_plans = false; @@ -859,7 +858,9 @@ SBError SBThread::StepOverUntil(lldb::SBFrame &sb_frame, addr_t step_addr = sc.line_entry.range.GetBaseAddress().GetLoadAddress(target); if (step_addr != LLDB_INVALID_ADDRESS) { - if (fun_range.ContainsLoadAddress(step_addr, target)) + AddressRange unused_range; + if (frame_sc.function->GetRangeContainingLoadAddress(step_addr, *target, + unused_range)) step_over_until_addrs.push_back(step_addr); else all_in_function = false; diff --git a/lldb/source/Plugins/SymbolFile/DWARF/DWARFDebugInfoEntry.cpp b/lldb/source/Plugins/SymbolFile/DWARF/DWARFDebugInfoEntry.cpp index 6d073411de876..c2edc52aa964f 100644 --- a/lldb/source/Plugins/SymbolFile/DWARF/DWARFDebugInfoEntry.cpp +++ b/lldb/source/Plugins/SymbolFile/DWARF/DWARFDebugInfoEntry.cpp @@ -281,22 +281,34 @@ bool DWARFDebugInfoEntry::GetDIENamesAndRanges( return !ranges.empty(); } -// Get all attribute values for a given DIE, including following any -// specification or abstract origin attributes and including those in the -// results. Any duplicate attributes will have the first instance take -// precedence (this can happen for declaration attributes). 
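The StepOverUntil change above stops assuming the current function occupies one contiguous AddressRange and instead asks, via Function::GetRangeContainingLoadAddress, whether any of the function's ranges contains the candidate step address; that is what the discontinuous-function tests added later in this patch rely on. A small sketch of that containment check over several half-open ranges, with simplified types rather than the lldb Function/Target API:

    #include <cstdint>
    #include <utility>
    #include <vector>

    // A function split into several [start, end) chunks, e.g. by
    // -ffunction-sections/-fbasic-block-sections as in the new test.
    struct DiscontinuousFunction {
      std::vector<std::pair<uint64_t, uint64_t>> ranges; // half-open load ranges

      bool containsLoadAddress(uint64_t addr) const {
        for (const auto &[start, end] : ranges)
          if (addr >= start && addr < end)
            return true;
        return false;
      }
    };

    int main() {
      DiscontinuousFunction f{{{0x1000, 0x1020}, {0x2000, 0x2010}}};
      // An address in the second chunk is still inside the function; a check
      // against a single contiguous range starting at 0x1000 would miss it.
      return f.containsLoadAddress(0x2008) ? 0 : 1;
    }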
-void DWARFDebugInfoEntry::GetAttributes(DWARFUnit *cu, -                                         DWARFAttributes &attributes, -                                         Recurse recurse, -                                         uint32_t curr_depth) const { -  const auto *abbrevDecl = GetAbbreviationDeclarationPtr(cu); -  if (!abbrevDecl) { -    attributes.Clear(); -    return; -  } +/// Helper for the public \ref DWARFDebugInfoEntry::GetAttributes API. +/// Adds all attributes of the DIE at the top of the \c worklist to the +/// \c attributes list. Specifications and abstract origins are added +/// to the \c worklist if the referenced DIE has not been seen before. +static bool GetAttributes(llvm::SmallVector &worklist, +                          llvm::SmallSet &seen, +                          DWARFAttributes &attributes) { +  assert(!worklist.empty() && "Need at least one DIE to visit."); +  assert(seen.size() >= 1 && +         "Need to have seen at least the currently visited entry."); + +  DWARFDIE current = worklist.pop_back_val(); + +  const auto *cu = current.GetCU(); +  assert(cu); + +  const auto *entry = current.GetDIE(); +  assert(entry); + +  const auto *abbrevDecl = +      entry->GetAbbreviationDeclarationPtr(current.GetCU()); +  if (!abbrevDecl) +    return false;    const DWARFDataExtractor &data = cu->GetData(); -  lldb::offset_t offset = GetFirstAttributeOffset(); +  lldb::offset_t offset = current.GetDIE()->GetFirstAttributeOffset(); + +  const bool is_first_die = seen.size() == 1;    for (const auto &attribute : abbrevDecl->attributes()) {     DWARFFormValue form_value(cu); @@ -309,10 +321,10 @@ void DWARFDebugInfoEntry::GetAttributes(DWARFUnit *cu,     switch (attr) {     case DW_AT_sibling:     case DW_AT_declaration: -      if (curr_depth > 0) { +      if (!is_first_die) {         // This attribute doesn't make sense when combined with the DIE that         // references this DIE. We know a DIE is referencing this DIE because -        // curr_depth is not zero +        // we've visited more than one DIE already.         break;       }       [[fallthrough]]; @@ -321,13 +333,12 @@ void DWARFDebugInfoEntry::GetAttributes(DWARFUnit *cu,       break;     }  -    if (recurse == Recurse::yes && -        ((attr == DW_AT_specification) || (attr == DW_AT_abstract_origin))) { +    if (attr == DW_AT_specification || attr == DW_AT_abstract_origin) {       if (form_value.ExtractValue(data, &offset)) { -        DWARFDIE spec_die = form_value.Reference(); -        if (spec_die) -          spec_die.GetDIE()->GetAttributes(spec_die.GetCU(), attributes, -                                           recurse, curr_depth + 1); +        if (DWARFDIE spec_die = form_value.Reference()) { +          if (seen.insert(spec_die.GetDIE()).second) +            worklist.push_back(spec_die); +        }       }     } else {       const dw_form_t form = form_value.Form(); @@ -339,6 +350,34 @@ void DWARFDebugInfoEntry::GetAttributes(DWARFUnit *cu,         DWARFFormValue::SkipValue(form, data, &offset, cu);       }     } +  } + +  return true; +} + +DWARFAttributes DWARFDebugInfoEntry::GetAttributes(const DWARFUnit *cu, +                                                   Recurse recurse) const { +  // FIXME: use ElaboratingDIEIterator to follow specifications/abstract origins +  // instead of maintaining our own worklist/seen list. + +  DWARFAttributes attributes; + +  llvm::SmallVector worklist; +  worklist.emplace_back(cu, this); + +  // Keep track of DIEs already seen to prevent infinite recursion. +  // The value of '3' was picked for the same reason as in +  // DWARFDie::findRecursively.
+ llvm::SmallSet seen; + seen.insert(this); + + do { + if (!::GetAttributes(worklist, seen, attributes)) { + attributes.Clear(); + break; + } + } while (!worklist.empty() && recurse == Recurse::yes); + + return attributes; } // GetAttributeValue diff --git a/lldb/source/Plugins/SymbolFile/DWARF/DWARFDebugInfoEntry.h b/lldb/source/Plugins/SymbolFile/DWARF/DWARFDebugInfoEntry.h index de6bbf1d52789..72aeb2743b1e2 100644 --- a/lldb/source/Plugins/SymbolFile/DWARF/DWARFDebugInfoEntry.h +++ b/lldb/source/Plugins/SymbolFile/DWARF/DWARFDebugInfoEntry.h @@ -52,12 +52,28 @@ class DWARFDebugInfoEntry { lldb::offset_t *offset_ptr); using Recurse = DWARFBaseDIE::Recurse; - DWARFAttributes GetAttributes(DWARFUnit *cu, - Recurse recurse = Recurse::yes) const { - DWARFAttributes attrs; - GetAttributes(cu, attrs, recurse, 0 /* curr_depth */); - return attrs; - } + + /// Get all attribute values for a given DIE, optionally following any + /// specifications and abstract origins and including their attributes + /// in the result too. + /// + /// When following specifications/abstract origins, the attributes + /// on the referring DIE are guaranteed to be visited before the attributes of + /// the referenced DIE. + /// + /// \param[in] cu DWARFUnit that this entry belongs to. + /// + /// \param[in] recurse If set to \c Recurse::yes, will include attributes + /// on DIEs referenced via \c DW_AT_specification and \c DW_AT_abstract_origin + /// (including across multiple levels of indirection). + /// + /// \returns DWARFAttributes that include all attributes found on this DIE + /// (and possibly referenced DIEs). Attributes may appear multiple times + /// (e.g., if a declaration and definition both specify the same attribute). + /// On failure, the returned DWARFAttributes will be empty. + /// + DWARFAttributes GetAttributes(const DWARFUnit *cu, + Recurse recurse = Recurse::yes) const; dw_offset_t GetAttributeValue(const DWARFUnit *cu, const dw_attr_t attr, DWARFFormValue &formValue, @@ -178,10 +194,6 @@ class DWARFDebugInfoEntry { /// A copy of the DW_TAG value so we don't have to go through the compile /// unit abbrev table dw_tag_t m_tag = llvm::dwarf::DW_TAG_null; - -private: - void GetAttributes(DWARFUnit *cu, DWARFAttributes &attrs, Recurse recurse, - uint32_t curr_depth) const; }; } // namespace dwarf } // namespace lldb_private::plugin diff --git a/lldb/source/Plugins/SymbolFile/DWARF/SymbolFileDWARF.cpp b/lldb/source/Plugins/SymbolFile/DWARF/SymbolFileDWARF.cpp index 2f451d173c4dd..ad5005b660c64 100644 --- a/lldb/source/Plugins/SymbolFile/DWARF/SymbolFileDWARF.cpp +++ b/lldb/source/Plugins/SymbolFile/DWARF/SymbolFileDWARF.cpp @@ -3414,7 +3414,10 @@ VariableSP SymbolFileDWARF::ParseVariableDIE(const SymbolContext &sc, mangled = form_value.AsCString(); break; case DW_AT_type: - type_die_form = form_value; + // DW_AT_type on declaration may be less accurate than + // that of definition, so don't overwrite it. 
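The GetAttributes rewrite above trades recursion through DW_AT_specification / DW_AT_abstract_origin for an explicit worklist plus a seen set, so reference cycles terminate and a referring DIE's attributes are always collected before those of the DIE it references. The same traversal shape in a generic, self-contained form (Node and collect are illustrative types, not the lldb ones):

    #include <cstdio>
    #include <unordered_set>
    #include <vector>

    struct Node {
      int value;
      const Node *reference = nullptr; // plays the role of DW_AT_specification
    };

    // Visit a node and everything reachable through `reference`, collecting
    // values. The seen set guarantees termination even if references form a
    // cycle; the worklist preserves referring-before-referenced order.
    std::vector<int> collect(const Node &start) {
      std::vector<int> values;
      std::vector<const Node *> worklist{&start};
      std::unordered_set<const Node *> seen{&start};
      while (!worklist.empty()) {
        const Node *current = worklist.back();
        worklist.pop_back();
        values.push_back(current->value);
        if (current->reference && seen.insert(current->reference).second)
          worklist.push_back(current->reference);
      }
      return values;
    }

    int main() {
      Node a{1}, b{2}, c{3};
      a.reference = &b; b.reference = &c; c.reference = &a; // a cycle, as in the unit tests below
      for (int v : collect(a))
        std::printf("%d ", v); // prints "1 2 3" and stops
    }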
+ if (!type_die_form.IsValid()) + type_die_form = form_value; break; case DW_AT_external: is_external = form_value.Boolean(); diff --git a/lldb/test/API/functionalities/thread/step_until/TestStepUntilAPI.py b/lldb/test/API/functionalities/thread/step_until/TestStepUntilAPI.py new file mode 100644 index 0000000000000..59e028acf014c --- /dev/null +++ b/lldb/test/API/functionalities/thread/step_until/TestStepUntilAPI.py @@ -0,0 +1,136 @@ +import lldb +from lldbsuite.test.decorators import * +from lldbsuite.test.lldbtest import * +from lldbsuite.test import lldbutil + + +class TestStepUntilAPI(TestBase): + NO_DEBUG_INFO_TESTCASE = True + + def setUp(self): + super().setUp() + + self.main_source = "main.c" + self.main_spec = lldb.SBFileSpec(self.main_source) + self.less_than_two = line_number("main.c", "Less than 2") + self.greater_than_two = line_number("main.c", "Greater than or equal to 2.") + self.back_out_in_main = line_number("main.c", "Back out in main") + self.in_foo = line_number("main.c", "In foo") + + def _build_dict_for_discontinuity(self): + return dict( + CFLAGS_EXTRAS="-funique-basic-block-section-names " + + "-ffunction-sections -fbasic-block-sections=list=" + + self.getSourcePath("function.list"), + LD_EXTRAS="-Wl,--script=" + self.getSourcePath("symbol.order"), + ) + + def _do_until(self, build_dict, args, until_line, expected_line): + self.build(dictionary=build_dict) + launch_info = lldb.SBLaunchInfo(args) + _, _, thread, _ = lldbutil.run_to_source_breakpoint( + self, "At the start", self.main_spec, launch_info + ) + + self.assertSuccess( + thread.StepOverUntil(self.frame(), self.main_spec, until_line) + ) + + self.runCmd("process status") + + line = self.frame().GetLineEntry().GetLine() + self.assertEqual( + line, expected_line, "Did not get the expected stop line number" + ) + + def _assertDiscontinuity(self): + target = self.target() + foo = target.FindFunctions("foo") + self.assertEqual(len(foo), 1) + foo = foo[0] + + call_me = self.target().FindFunctions("call_me") + self.assertEqual(len(call_me), 1) + call_me = call_me[0] + + foo_addr = foo.function.GetStartAddress().GetLoadAddress(target) + found_before = False + found_after = False + for range in call_me.function.GetRanges(): + addr = range.GetBaseAddress().GetLoadAddress(target) + if addr < foo_addr: + found_before = True + if addr > foo_addr: + found_after = True + + self.assertTrue( + found_before and found_after, + "'foo' is not between 'call_me'" + str(foo) + str(call_me), + ) + + def test_hitting(self): + """Test SBThread.StepOverUntil - targeting a line and hitting it.""" + self._do_until(None, None, self.less_than_two, self.less_than_two) + + @skipIf(oslist=lldbplatformutil.getDarwinOSTriples() + ["windows"]) + @skipIf(archs=no_match(["x86_64", "aarch64"])) + def test_hitting_discontinuous(self): + """Test SBThread.StepOverUntil - targeting a line and hitting it -- with + discontinuous functions""" + self._do_until( + self._build_dict_for_discontinuity(), + None, + self.less_than_two, + self.less_than_two, + ) + self._assertDiscontinuity() + + def test_missing(self): + """Test SBThread.StepOverUntil - targeting a line and missing it by stepping out to call site""" + self._do_until( + None, ["foo", "bar", "baz"], self.less_than_two, self.back_out_in_main + ) + + @skipIf(oslist=lldbplatformutil.getDarwinOSTriples() + ["windows"]) + @skipIf(archs=no_match(["x86_64", "aarch64"])) + def test_missing_discontinuous(self): + """Test SBThread.StepOverUntil - targeting a line and missing it by + stepping out to call 
site -- with discontinuous functions""" + self._do_until( + self._build_dict_for_discontinuity(), + ["foo", "bar", "baz"], + self.less_than_two, + self.back_out_in_main, + ) + self._assertDiscontinuity() + + def test_bad_line(self): + """Test that we get an error if attempting to step outside the current + function""" + self.build() + _, _, thread, _ = lldbutil.run_to_source_breakpoint( + self, "At the start", self.main_spec + ) + self.assertIn( + "step until target not in current function", + thread.StepOverUntil( + self.frame(), self.main_spec, self.in_foo + ).GetCString(), + ) + + @skipIf(oslist=lldbplatformutil.getDarwinOSTriples() + ["windows"]) + @skipIf(archs=no_match(["x86_64", "aarch64"])) + def test_bad_line_discontinuous(self): + """Test that we get an error if attempting to step outside the current + function -- and the function is discontinuous""" + self.build(dictionary=self._build_dict_for_discontinuity()) + _, _, thread, _ = lldbutil.run_to_source_breakpoint( + self, "At the start", self.main_spec + ) + self.assertIn( + "step until target not in current function", + thread.StepOverUntil( + self.frame(), self.main_spec, self.in_foo + ).GetCString(), + ) + self._assertDiscontinuity() diff --git a/lldb/test/API/functionalities/thread/step_until/function.list b/lldb/test/API/functionalities/thread/step_until/function.list new file mode 100644 index 0000000000000..5900fe8c35069 --- /dev/null +++ b/lldb/test/API/functionalities/thread/step_until/function.list @@ -0,0 +1 @@ +!call_me diff --git a/lldb/test/API/functionalities/thread/step_until/main.c b/lldb/test/API/functionalities/thread/step_until/main.c index bb866079cf5f5..4c52308f030e9 100644 --- a/lldb/test/API/functionalities/thread/step_until/main.c +++ b/lldb/test/API/functionalities/thread/step_until/main.c @@ -4,6 +4,9 @@ * unrelated to the program, just to achieve consistent * debug line tables, across platforms, that are not * dependent on compiler optimzations. */ + +int foo(int x) { return x; /* In foo */ } + int call_me(int argc) { printf ("At the start, argc: %d.\n", argc); diff --git a/lldb/test/API/functionalities/thread/step_until/symbol.order b/lldb/test/API/functionalities/thread/step_until/symbol.order new file mode 100644 index 0000000000000..dcc9607a4188f --- /dev/null +++ b/lldb/test/API/functionalities/thread/step_until/symbol.order @@ -0,0 +1,9 @@ +SECTIONS { + .text.ordered : { + *(.text.call_me) + *(.text.foo) + *(.text.call_me.call_me.__part.1) + *(.text.call_me.call_me.__part.2) + *(.text.call_me.call_me.__part.3) + } +} INSERT BEFORE .text; diff --git a/lldb/unittests/SymbolFile/DWARF/DWARFDIETest.cpp b/lldb/unittests/SymbolFile/DWARF/DWARFDIETest.cpp index 1e4c8f3ba0778..3f61d1607073c 100644 --- a/lldb/unittests/SymbolFile/DWARF/DWARFDIETest.cpp +++ b/lldb/unittests/SymbolFile/DWARF/DWARFDIETest.cpp @@ -394,3 +394,643 @@ TEST(DWARFDIETest, GetContextInFunction) { EXPECT_THAT(foo_struct_die.GetTypeLookupContext(), testing::ElementsAre(make_struct("struct_t"))); } + +struct GetAttributesTestFixture : public testing::TestWithParam {}; + +TEST_P(GetAttributesTestFixture, TestGetAttributes_IterationOrder) { + // Tests that we accumulate all current DIE's attributes first + // before checking the attributes of the specification. 
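GetAttributesTestFixture above is a value-parameterized GoogleTest fixture: each TEST_P body runs once per value passed to INSTANTIATE_TEST_SUITE_P (here DW_AT_specification and DW_AT_abstract_origin, spliced into the DWARF YAML through llvm::formatv). A minimal standalone sketch of that GoogleTest pattern, assuming GoogleTest is available; the fixture name and values below are illustrative:

    #include <gtest/gtest.h>
    #include <string>

    // GetParam() yields one of the values supplied by INSTANTIATE_TEST_SUITE_P,
    // and every TEST_P in the fixture runs once per value.
    struct AttrKindFixture : public testing::TestWithParam<std::string> {};

    TEST_P(AttrKindFixture, NameIsNonEmpty) { EXPECT_FALSE(GetParam().empty()); }

    INSTANTIATE_TEST_SUITE_P(AttrKinds, AttrKindFixture,
                             testing::Values("DW_AT_specification",
                                             "DW_AT_abstract_origin"));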
+ + const char *yamldata = R"( +--- !ELF +FileHeader: + Class: ELFCLASS64 + Data: ELFDATA2LSB + Type: ET_EXEC + Machine: EM_AARCH64 +DWARF: + debug_str: + - func + debug_abbrev: + - ID: 0 + Table: + - Code: 0x1 + Tag: DW_TAG_compile_unit + Children: DW_CHILDREN_yes + Attributes: + - Attribute: DW_AT_language + Form: DW_FORM_data2 + - Code: 0x2 + Tag: DW_TAG_subprogram + Children: DW_CHILDREN_no + Attributes: + - Attribute: DW_AT_high_pc + Form: DW_FORM_data4 + - Attribute: DW_AT_name + Form: DW_FORM_strp + - Attribute: DW_AT_declaration + Form: DW_FORM_flag_present + - Attribute: DW_AT_external + Form: DW_FORM_flag_present + - Attribute: DW_AT_low_pc + Form: DW_FORM_data4 + - Code: 0x3 + Tag: DW_TAG_subprogram + Children: DW_CHILDREN_no + Attributes: + - Attribute: DW_AT_high_pc + Form: DW_FORM_data4 + - Attribute: {0} + Form: DW_FORM_ref4 + - Attribute: DW_AT_low_pc + Form: DW_FORM_data4 + debug_info: + - Version: 5 + UnitType: DW_UT_compile + AddrSize: 8 + Entries: + +# DW_TAG_compile_unit +# DW_AT_language [DW_FORM_data2] (DW_LANG_C_plus_plus) + + - AbbrCode: 0x1 + Values: + - Value: 0x04 + +# DW_TAG_subprogram +# DW_AT_high_pc [DW_FORM_data4] +# DW_AT_name [DW_FORM_strp] ("func") +# DW_AT_low_pc [DW_FORM_data4] + - AbbrCode: 0x2 + Values: + - Value: 0xdeadbeef + - Value: 0x0 + - Value: 0x1 + - Value: 0x1 + - Value: 0xdeadbeef + +# DW_TAG_subprogram +# DW_AT_high_pc [DW_FORM_data4] +# DW_AT_specification [DW_FORM_ref4] ("func") +# DW_AT_low_pc [DW_FORM_data4] + - AbbrCode: 0x3 + Values: + - Value: 0xf00dcafe + - Value: 0xf + - Value: 0xf00dcafe + + - AbbrCode: 0x0 +... +)"; + YAMLModuleTester t(llvm::formatv(yamldata, GetParam()).str()); + + DWARFUnit *unit = t.GetDwarfUnit(); + ASSERT_NE(unit, nullptr); + const DWARFDebugInfoEntry *cu_entry = unit->DIE().GetDIE(); + ASSERT_EQ(cu_entry->Tag(), DW_TAG_compile_unit); + ASSERT_EQ(unit->GetDWARFLanguageType(), DW_LANG_C_plus_plus); + DWARFDIE cu_die(unit, cu_entry); + + auto declaration = cu_die.GetFirstChild(); + ASSERT_TRUE(declaration.IsValid()); + ASSERT_EQ(declaration.Tag(), DW_TAG_subprogram); + + auto definition = declaration.GetSibling(); + ASSERT_TRUE(definition.IsValid()); + ASSERT_EQ(definition.Tag(), DW_TAG_subprogram); + ASSERT_FALSE(definition.GetAttributeValueAsOptionalUnsigned(DW_AT_external)); + + auto attrs = definition.GetAttributes(DWARFDebugInfoEntry::Recurse::yes); + EXPECT_EQ(attrs.Size(), 7U); + + // Check that the attributes on the definition (that are also present + // on the declaration) take precedence. + for (auto attr : {DW_AT_low_pc, DW_AT_high_pc}) { + auto idx = attrs.FindAttributeIndex(attr); + EXPECT_NE(idx, UINT32_MAX); + + DWARFFormValue form_value; + auto success = attrs.ExtractFormValueAtIndex(idx, form_value); + EXPECT_TRUE(success); + + EXPECT_EQ(form_value.Unsigned(), 0xf00dcafe); + } +} + +TEST_P(GetAttributesTestFixture, TestGetAttributes_Cycle) { + // Tests that GetAttributes can deal with cycles in + // specifications/abstract origins. 
+ // + // Contrived example: + // + // func1 -> func3 + // ^ | + // | v + // +------func2 + + const char *yamldata = R"( +--- !ELF +FileHeader: + Class: ELFCLASS64 + Data: ELFDATA2LSB + Type: ET_EXEC + Machine: EM_AARCH64 +DWARF: + debug_abbrev: + - ID: 0 + Table: + - Code: 0x1 + Tag: DW_TAG_compile_unit + Children: DW_CHILDREN_yes + Attributes: + - Attribute: DW_AT_language + Form: DW_FORM_data2 + - Code: 0x2 + Tag: DW_TAG_subprogram + Children: DW_CHILDREN_no + Attributes: + - Attribute: {0} + Form: DW_FORM_ref4 + debug_info: + - Version: 5 + UnitType: DW_UT_compile + AddrSize: 8 + Entries: + + - AbbrCode: 0x1 + Values: + - Value: 0x04 + + - AbbrCode: 0x2 + Values: + - Value: 0x19 + + - AbbrCode: 0x2 + Values: + - Value: 0xf + + - AbbrCode: 0x2 + Values: + - Value: 0x14 + + - AbbrCode: 0x0 +... +)"; + YAMLModuleTester t(llvm::formatv(yamldata, GetParam()).str()); + + DWARFUnit *unit = t.GetDwarfUnit(); + ASSERT_NE(unit, nullptr); + const DWARFDebugInfoEntry *cu_entry = unit->DIE().GetDIE(); + ASSERT_EQ(cu_entry->Tag(), DW_TAG_compile_unit); + ASSERT_EQ(unit->GetDWARFLanguageType(), DW_LANG_C_plus_plus); + DWARFDIE cu_die(unit, cu_entry); + + auto func1 = cu_die.GetFirstChild(); + ASSERT_TRUE(func1.IsValid()); + ASSERT_EQ(func1.Tag(), DW_TAG_subprogram); + + auto func2 = func1.GetSibling(); + ASSERT_TRUE(func2.IsValid()); + ASSERT_EQ(func2.Tag(), DW_TAG_subprogram); + + auto func3 = func2.GetSibling(); + ASSERT_TRUE(func3.IsValid()); + ASSERT_EQ(func3.Tag(), DW_TAG_subprogram); + + auto attrs = func1.GetAttributes(DWARFDebugInfoEntry::Recurse::yes); + EXPECT_EQ(attrs.Size(), 3U); + + // Confirm that the specifications do form a cycle. + { + DWARFFormValue form_value; + auto success = attrs.ExtractFormValueAtIndex(0, form_value); + ASSERT_TRUE(success); + + EXPECT_EQ(form_value.Reference(), func3); + } + + { + DWARFFormValue form_value; + auto success = attrs.ExtractFormValueAtIndex(1, form_value); + ASSERT_TRUE(success); + + EXPECT_EQ(form_value.Reference(), func2); + } + + { + DWARFFormValue form_value; + auto success = attrs.ExtractFormValueAtIndex(2, form_value); + ASSERT_TRUE(success); + + EXPECT_EQ(form_value.Reference(), func1); + } +} + +TEST_P(GetAttributesTestFixture, + TestGetAttributes_SkipNonApplicableAttributes) { + // Tests that GetAttributes will omit attributes found through + // specifications/abstract origins which are not applicable. 
+ + const char *yamldata = R"( +--- !ELF +FileHeader: + Class: ELFCLASS64 + Data: ELFDATA2LSB + Type: ET_EXEC + Machine: EM_AARCH64 +DWARF: + debug_str: + - func + debug_abbrev: + - ID: 0 + Table: + - Code: 0x1 + Tag: DW_TAG_compile_unit + Children: DW_CHILDREN_yes + Attributes: + - Attribute: DW_AT_language + Form: DW_FORM_data2 + - Code: 0x2 + Tag: DW_TAG_subprogram + Children: DW_CHILDREN_no + Attributes: + - Attribute: DW_AT_declaration + Form: DW_FORM_flag_present + - Attribute: DW_AT_name + Form: DW_FORM_strp + - Attribute: DW_AT_sibling + Form: DW_FORM_ref4 + - Code: 0x3 + Tag: DW_TAG_subprogram + Children: DW_CHILDREN_no + Attributes: + - Attribute: DW_AT_declaration + Form: DW_FORM_flag_present + - Attribute: {0} + Form: DW_FORM_ref4 + - Attribute: DW_AT_sibling + Form: DW_FORM_ref4 + debug_info: + - Version: 5 + UnitType: DW_UT_compile + AddrSize: 8 + Entries: + +# DW_TAG_compile_unit +# DW_AT_language [DW_FORM_data2] (DW_LANG_C_plus_plus) + + - AbbrCode: 0x1 + Values: + - Value: 0x04 + +# DW_TAG_subprogram +# DW_AT_declaration +# DW_AT_name [DW_FORM_strp] ("func") +# DW_AT_sibling + - AbbrCode: 0x2 + Values: + - Value: 0x1 + - Value: 0x0 + - Value: 0x18 + +# DW_TAG_subprogram +# DW_AT_declaration +# DW_AT_specification [DW_FORM_ref4] ("func") +# DW_AT_sibling + - AbbrCode: 0x3 + Values: + - Value: 0x1 + - Value: 0xf + - Value: 0xdeadbeef + + - AbbrCode: 0x0 +... +)"; + YAMLModuleTester t(llvm::formatv(yamldata, GetParam()).str()); + + DWARFUnit *unit = t.GetDwarfUnit(); + ASSERT_NE(unit, nullptr); + const DWARFDebugInfoEntry *cu_entry = unit->DIE().GetDIE(); + ASSERT_EQ(cu_entry->Tag(), DW_TAG_compile_unit); + ASSERT_EQ(unit->GetDWARFLanguageType(), DW_LANG_C_plus_plus); + DWARFDIE cu_die(unit, cu_entry); + + auto declaration = cu_die.GetFirstChild(); + ASSERT_TRUE(declaration.IsValid()); + ASSERT_EQ(declaration.Tag(), DW_TAG_subprogram); + + auto definition = declaration.GetSibling(); + ASSERT_TRUE(definition.IsValid()); + ASSERT_EQ(definition.Tag(), DW_TAG_subprogram); + + auto attrs = definition.GetAttributes(DWARFDebugInfoEntry::Recurse::yes); + EXPECT_EQ(attrs.Size(), 4U); + EXPECT_NE(attrs.FindAttributeIndex(DW_AT_name), UINT32_MAX); + EXPECT_NE(attrs.FindAttributeIndex(GetParam()), UINT32_MAX); + + auto sibling_idx = attrs.FindAttributeIndex(DW_AT_sibling); + EXPECT_NE(sibling_idx, UINT32_MAX); + + DWARFFormValue form_value; + auto success = attrs.ExtractFormValueAtIndex(sibling_idx, form_value); + ASSERT_TRUE(success); + + EXPECT_EQ(form_value.Unsigned(), 0xdeadbeef); +} + +TEST_P(GetAttributesTestFixture, TestGetAttributes_NoRecurse) { + // Tests that GetAttributes will not recurse if Recurse::No is passed to it. 
+ + const char *yamldata = R"( +--- !ELF +FileHeader: + Class: ELFCLASS64 + Data: ELFDATA2LSB + Type: ET_EXEC + Machine: EM_AARCH64 +DWARF: + debug_str: + - func + debug_abbrev: + - ID: 0 + Table: + - Code: 0x1 + Tag: DW_TAG_compile_unit + Children: DW_CHILDREN_yes + Attributes: + - Attribute: DW_AT_language + Form: DW_FORM_data2 + - Code: 0x2 + Tag: DW_TAG_subprogram + Children: DW_CHILDREN_no + Attributes: + - Attribute: DW_AT_name + Form: DW_FORM_strp + - Code: 0x3 + Tag: DW_TAG_subprogram + Children: DW_CHILDREN_no + Attributes: + - Attribute: DW_AT_low_pc + Form: DW_FORM_data4 + - Attribute: {0} + Form: DW_FORM_ref4 + debug_info: + - Version: 5 + UnitType: DW_UT_compile + AddrSize: 8 + Entries: + +# DW_TAG_compile_unit +# DW_AT_language [DW_FORM_data2] (DW_LANG_C_plus_plus) + + - AbbrCode: 0x1 + Values: + - Value: 0x04 + +# DW_TAG_subprogram +# DW_AT_name [DW_FORM_strp] ("func") + - AbbrCode: 0x2 + Values: + - Value: 0x0 + +# DW_TAG_subprogram +# DW_AT_low_pc [DW_FORM_data4] +# DW_AT_specification [DW_FORM_ref4] + - AbbrCode: 0x3 + Values: + - Value: 0xdeadbeef + - Value: 0xf + + - AbbrCode: 0x0 +... +)"; + YAMLModuleTester t(llvm::formatv(yamldata, GetParam()).str()); + + DWARFUnit *unit = t.GetDwarfUnit(); + ASSERT_NE(unit, nullptr); + const DWARFDebugInfoEntry *cu_entry = unit->DIE().GetDIE(); + ASSERT_EQ(cu_entry->Tag(), DW_TAG_compile_unit); + ASSERT_EQ(unit->GetDWARFLanguageType(), DW_LANG_C_plus_plus); + DWARFDIE cu_die(unit, cu_entry); + + auto declaration = cu_die.GetFirstChild(); + ASSERT_TRUE(declaration.IsValid()); + ASSERT_EQ(declaration.Tag(), DW_TAG_subprogram); + + auto definition = declaration.GetSibling(); + ASSERT_TRUE(definition.IsValid()); + ASSERT_EQ(definition.Tag(), DW_TAG_subprogram); + + auto attrs = definition.GetAttributes(DWARFDebugInfoEntry::Recurse::no); + EXPECT_EQ(attrs.Size(), 2U); + EXPECT_EQ(attrs.FindAttributeIndex(DW_AT_name), UINT32_MAX); + EXPECT_NE(attrs.FindAttributeIndex(GetParam()), UINT32_MAX); + EXPECT_NE(attrs.FindAttributeIndex(DW_AT_low_pc), UINT32_MAX); +} + +TEST_P(GetAttributesTestFixture, TestGetAttributes_InvalidSpec) { + // Test that GetAttributes doesn't try following invalid + // specifications (but still add it to the list of attributes). + + const char *yamldata = R"( +--- !ELF +FileHeader: + Class: ELFCLASS64 + Data: ELFDATA2LSB + Type: ET_EXEC + Machine: EM_AARCH64 +DWARF: + debug_str: + - func + debug_abbrev: + - ID: 0 + Table: + - Code: 0x1 + Tag: DW_TAG_compile_unit + Children: DW_CHILDREN_yes + Attributes: + - Attribute: DW_AT_language + Form: DW_FORM_data2 + - Code: 0x2 + Tag: DW_TAG_subprogram + Children: DW_CHILDREN_no + Attributes: + - Attribute: DW_AT_name + Form: DW_FORM_strp + - Code: 0x3 + Tag: DW_TAG_subprogram + Children: DW_CHILDREN_no + Attributes: + - Attribute: {0} + Form: DW_FORM_ref4 + debug_info: + - Version: 5 + UnitType: DW_UT_compile + AddrSize: 8 + Entries: + +# DW_TAG_compile_unit +# DW_AT_language [DW_FORM_data2] (DW_LANG_C_plus_plus) + + - AbbrCode: 0x1 + Values: + - Value: 0x04 + +# DW_TAG_subprogram +# DW_AT_name [DW_FORM_strp] ("func") + - AbbrCode: 0x2 + Values: + - Value: 0x0 + +# DW_TAG_subprogram +# DW_AT_specification [DW_FORM_ref4] + - AbbrCode: 0x3 + Values: + - Value: 0xdeadbeef + + - AbbrCode: 0x0 +... 
+)"; + YAMLModuleTester t(llvm::formatv(yamldata, GetParam()).str()); + + DWARFUnit *unit = t.GetDwarfUnit(); + ASSERT_NE(unit, nullptr); + const DWARFDebugInfoEntry *cu_entry = unit->DIE().GetDIE(); + ASSERT_EQ(cu_entry->Tag(), DW_TAG_compile_unit); + ASSERT_EQ(unit->GetDWARFLanguageType(), DW_LANG_C_plus_plus); + DWARFDIE cu_die(unit, cu_entry); + + auto declaration = cu_die.GetFirstChild(); + ASSERT_TRUE(declaration.IsValid()); + ASSERT_EQ(declaration.Tag(), DW_TAG_subprogram); + + auto definition = declaration.GetSibling(); + ASSERT_TRUE(definition.IsValid()); + ASSERT_EQ(definition.Tag(), DW_TAG_subprogram); + + auto attrs = definition.GetAttributes(DWARFDebugInfoEntry::Recurse::yes); + EXPECT_EQ(attrs.Size(), 1U); + EXPECT_EQ(attrs.FindAttributeIndex(DW_AT_name), UINT32_MAX); + EXPECT_NE(attrs.FindAttributeIndex(GetParam()), UINT32_MAX); +} + +TEST(DWARFDIETest, TestGetAttributes_Worklist) { + // Test that GetAttributes will follow both the abstract origin + // and specification on a single DIE correctly (omitting non-applicable + // attributes in the process). + + // Contrived example where + // f1---> f2 --> f4 + // `-> f3 `-> f5 + // + const char *yamldata = R"( +--- !ELF +FileHeader: + Class: ELFCLASS64 + Data: ELFDATA2LSB + Type: ET_EXEC + Machine: EM_AARCH64 +DWARF: + debug_str: + - foo + - bar + debug_abbrev: + - ID: 0 + Table: + - Code: 0x1 + Tag: DW_TAG_compile_unit + Children: DW_CHILDREN_yes + Attributes: + - Attribute: DW_AT_language + Form: DW_FORM_data2 + - Code: 0x2 + Tag: DW_TAG_subprogram + Children: DW_CHILDREN_no + Attributes: + - Attribute: DW_AT_specification + Form: DW_FORM_ref4 + - Attribute: DW_AT_abstract_origin + Form: DW_FORM_ref4 + - Code: 0x3 + Tag: DW_TAG_subprogram + Children: DW_CHILDREN_no + Attributes: + - Attribute: DW_AT_declaration + Form: DW_FORM_flag_present + - Attribute: DW_AT_artificial + Form: DW_FORM_flag_present + + debug_info: + - Version: 5 + UnitType: DW_UT_compile + AddrSize: 8 + Entries: + + - AbbrCode: 0x1 + Values: + - Value: 0x04 + +# DW_TAG_subprogram ("f1") +# DW_AT_specification [DW_FORM_ref4] ("f2") +# DW_AT_abstract_origin [DW_FORM_ref4] ("f3") + - AbbrCode: 0x2 + Values: + - Value: 0x18 + - Value: 0x21 + +# DW_TAG_subprogram ("f2") +# DW_AT_specification [DW_FORM_ref4] ("f4") +# DW_AT_abstract_origin [DW_FORM_ref4] ("f5") + - AbbrCode: 0x2 + Values: + - Value: 0x22 + - Value: 0x23 + +# DW_TAG_subprogram ("f3") +# DW_AT_declaration [DW_FORM_flag_present] +# DW_AT_artificial [DW_FORM_flag_present] + - AbbrCode: 0x3 + Values: + - Value: 0x1 + - Value: 0x1 + +# DW_TAG_subprogram ("f4") +# DW_AT_declaration [DW_FORM_flag_present] +# DW_AT_artificial [DW_FORM_flag_present] + - AbbrCode: 0x3 + Values: + - Value: 0x1 + - Value: 0x1 + +# DW_TAG_subprogram ("f5") +# DW_AT_declaration [DW_FORM_flag_present] +# DW_AT_artificial [DW_FORM_flag_present] + - AbbrCode: 0x3 + Values: + - Value: 0x1 + - Value: 0x1 + + - AbbrCode: 0x0 +... 
+)"; + YAMLModuleTester t(yamldata); + + DWARFUnit *unit = t.GetDwarfUnit(); + ASSERT_NE(unit, nullptr); + const DWARFDebugInfoEntry *cu_entry = unit->DIE().GetDIE(); + ASSERT_EQ(cu_entry->Tag(), DW_TAG_compile_unit); + ASSERT_EQ(unit->GetDWARFLanguageType(), DW_LANG_C_plus_plus); + DWARFDIE cu_die(unit, cu_entry); + + auto f1 = cu_die.GetFirstChild(); + ASSERT_TRUE(f1.IsValid()); + ASSERT_EQ(f1.Tag(), DW_TAG_subprogram); + + auto attrs = f1.GetAttributes(DWARFDebugInfoEntry::Recurse::yes); + EXPECT_EQ(attrs.Size(), 7U); + EXPECT_EQ(attrs.FindAttributeIndex(DW_AT_declaration), UINT32_MAX); +} + +INSTANTIATE_TEST_SUITE_P(GetAttributeTests, GetAttributesTestFixture, + testing::Values(DW_AT_specification, + DW_AT_abstract_origin)); diff --git a/llvm/Maintainers.md b/llvm/Maintainers.md index e2af991ed37b1..10714b508ca68 100644 --- a/llvm/Maintainers.md +++ b/llvm/Maintainers.md @@ -169,10 +169,26 @@ rnk@google.com (email), [rnk](https://github.com/rnk) (GitHub) ### Backends / Targets -#### AArch64 backend +#### ARM and AArch64 backends -Tim Northover \ -t.p.northover@gmail.com (email), [TNorthover](https://github.com/TNorthover) (GitHub) +David Green \ +david.green@arm.com (email), [davemgreen](https://github.com/davemgreen) (GitHub) \ +Amara Emerson (esp. AArch64 GlobalISel) \ +amara@apple.com (email), [aemerson](https://github.com/aemerson) (GitHub) \ +Eli Friedman (esp. ARM64EC) \ +efriedma@quicinc.com (email), [efriedma-quic](https://github.com/efriedma-quic) (GitHub) \ +Sjoerd Meijer \ +smeijer@nvidia.com (email), [sjoerdmeijer](https://github.com/sjoerdmeijer) (GitHub) \ +Nashe Mncube \ +nashe.mncube@arm.com (email), [nasherm](https://github.com/nasherm) (GitHub) \ +Sander de Smalen (esp. scalable vectorization/SVE/SME) \ +sander.desmalen@arm.com (email), [sdesmalen-arm](https://github.com/sdesmalen-arm) (GitHub) \ +Peter Smith (Anything ABI) \ +peter.smith@arm.com (email), [smithp35](https://github.com/smithp35) (GitHub) \ +Oliver Stannard (esp. 
assembly/dissassembly) \ +oliver.stannard@arm.com (email), [ostannard](https://github.com/ostannard) (GitHub) \ +Ties Stuij (Arm GlobalISel and early arch support) \ +ties.stuij@arm.com (email), [stuij](https://github.com/stuij) (GitHub) #### AMDGPU backend @@ -184,19 +200,6 @@ Matthew.Arsenault@amd.com, arsenm2@gmail.com (email), [arsenm](https://github.co Mark Schimmel \ marksl@synopsys.com (email), [markschimmel](https://github.com/markschimmel) (GitHub) -#### ARM backend - -David Green \ -david.green@arm.com (email), [davemgreen](https://github.com/davemgreen) (GitHub) \ -Oliver Stannard (Especially assembly/dissassembly) \ -oliver.stannard@arm.com (email), [ostannard](https://github.com/ostannard) (GitHub) \ -Nashe Mncube \ -nashe.mncube@arm.com (email), [nasherm](https://github.com/nasherm) (GitHub) \ -Peter Smith (Anything ABI) \ -peter.smith@arm.com (email), [smithp35](https://github.com/smithp35) (GitHub) \ -Ties Stuij (GlobalISel and early arch support) \ -ties.stuij@arm.com (email), [stuij](https://github.com/stuij) (GitHub) - #### AVR backend Ben Shi \ @@ -480,6 +483,7 @@ James Grosbach (grosbach@apple.com) -- MC layer \ Anton Korobeynikov (anton@korobeynikov.info, [asl](https://github.com/asl)) -- ARM EABI, Windows codegen \ Benjamin Kramer (benny.kra@gmail.com, [d0k](https://github.com/d0k)) -- DWARF Parser \ David Majnemer (david.majnemer@gmail.com, [majnemer](https://github.com/majnemer)) -- InstCombine, ConstantFold \ +Tim Northover (t.p.northover@gmail.com, [TNorthover](https://github.com/TNorthover)) -- AArch64 backend \ Chad Rosier (mcrosier@codeaurora.org) -- FastISel \ Hans Wennborg (hans@chromium.org, [zmodem](https://github.com/zmodem)) -- Release management \ Kostya Serebryany ([kcc](https://github.com/kcc)) -- Sanitizers \ diff --git a/llvm/include/llvm/Analysis/TargetTransformInfo.h b/llvm/include/llvm/Analysis/TargetTransformInfo.h index fe13fc676e303..71b204f9c3fec 100644 --- a/llvm/include/llvm/Analysis/TargetTransformInfo.h +++ b/llvm/include/llvm/Analysis/TargetTransformInfo.h @@ -1870,6 +1870,13 @@ class TargetTransformInfo { /// false, but it shouldn't matter what it returns anyway. bool hasArmWideBranch(bool Thumb) const; + /// Returns a bitmask constructed from the target-features or fmv-features + /// metadata of a function. + uint64_t getFeatureMask(const Function &F) const; + + /// Returns true if this is an instance of a function with multiple versions. + bool isMultiversionedFunction(const Function &F) const; + /// \return The maximum number of function arguments the target supports. 
unsigned getMaxNumArgs() const; @@ -2312,6 +2319,8 @@ class TargetTransformInfo::Concept { virtual VPLegalization getVPLegalizationStrategy(const VPIntrinsic &PI) const = 0; virtual bool hasArmWideBranch(bool Thumb) const = 0; + virtual uint64_t getFeatureMask(const Function &F) const = 0; + virtual bool isMultiversionedFunction(const Function &F) const = 0; virtual unsigned getMaxNumArgs() const = 0; virtual unsigned getNumBytesToPadGlobalArray(unsigned Size, Type *ArrayType) const = 0; @@ -3144,6 +3153,14 @@ class TargetTransformInfo::Model final : public TargetTransformInfo::Concept { return Impl.hasArmWideBranch(Thumb); } + uint64_t getFeatureMask(const Function &F) const override { + return Impl.getFeatureMask(F); + } + + bool isMultiversionedFunction(const Function &F) const override { + return Impl.isMultiversionedFunction(F); + } + unsigned getMaxNumArgs() const override { return Impl.getMaxNumArgs(); } diff --git a/llvm/include/llvm/Analysis/TargetTransformInfoImpl.h b/llvm/include/llvm/Analysis/TargetTransformInfoImpl.h index 7ac3063ca9a37..dcef4a1abcfa3 100644 --- a/llvm/include/llvm/Analysis/TargetTransformInfoImpl.h +++ b/llvm/include/llvm/Analysis/TargetTransformInfoImpl.h @@ -1039,6 +1039,10 @@ class TargetTransformInfoImplBase { bool hasArmWideBranch(bool) const { return false; } + uint64_t getFeatureMask(const Function &F) const { return 0; } + + bool isMultiversionedFunction(const Function &F) const { return false; } + unsigned getMaxNumArgs() const { return UINT_MAX; } unsigned getNumBytesToPadGlobalArray(unsigned Size, Type *ArrayType) const { diff --git a/llvm/include/llvm/CodeGen/SDPatternMatch.h b/llvm/include/llvm/CodeGen/SDPatternMatch.h index 4faa090901a6a..4488a6152117c 100644 --- a/llvm/include/llvm/CodeGen/SDPatternMatch.h +++ b/llvm/include/llvm/CodeGen/SDPatternMatch.h @@ -896,6 +896,10 @@ inline UnaryOpc_match m_ChainedUnaryOp(unsigned Opc, return UnaryOpc_match(Opc, Op); } +template inline UnaryOpc_match m_BitCast(const Opnd &Op) { + return UnaryOpc_match(ISD::BITCAST, Op); +} + template inline UnaryOpc_match m_BSwap(const Opnd &Op) { return UnaryOpc_match(ISD::BSWAP, Op); diff --git a/llvm/include/llvm/TargetParser/AArch64TargetParser.h b/llvm/include/llvm/TargetParser/AArch64TargetParser.h index 63f06a3a69298..0338770593bc4 100644 --- a/llvm/include/llvm/TargetParser/AArch64TargetParser.h +++ b/llvm/include/llvm/TargetParser/AArch64TargetParser.h @@ -270,13 +270,16 @@ void fillValidCPUArchList(SmallVectorImpl &Values); bool isX18ReservedByDefault(const Triple &TT); -// Return the priority for a given set of FMV features. +// For a given set of feature names, which can be either target-features, or +// fmv-features metadata, expand their dependencies and then return a bitmask +// corresponding to the entries of AArch64::FeatPriorities. uint64_t getFMVPriority(ArrayRef Features); -// For given feature names, return a bitmask corresponding to the entries of -// AArch64::CPUFeatures. The values in CPUFeatures are not bitmasks themselves, -// they are sequential (0, 1, 2, 3, ...). The resulting bitmask is used at -// runtime to test whether a certain FMV feature is available on the host. +// For a given set of FMV feature names, expand their dependencies and then +// return a bitmask corresponding to the entries of AArch64::CPUFeatures. +// The values in CPUFeatures are not bitmasks themselves, they are sequential +// (0, 1, 2, 3, ...). The resulting bitmask is used at runtime to test whether +// a certain FMV feature is available on the host. 
uint64_t getCpuSupportsMask(ArrayRef Features); void PrintSupportedExtensions(); diff --git a/llvm/lib/Analysis/Loads.cpp b/llvm/lib/Analysis/Loads.cpp index 7bbd469bd035d..11ccfa33821ca 100644 --- a/llvm/lib/Analysis/Loads.cpp +++ b/llvm/lib/Analysis/Loads.cpp @@ -25,6 +25,8 @@ using namespace llvm; +extern cl::opt UseDerefAtPointSemantics; + static bool isAligned(const Value *Base, Align Alignment, const DataLayout &DL) { return Base->getPointerAlignment(DL) >= Alignment; @@ -168,7 +170,7 @@ static bool isDereferenceableAndAlignedPointer( Size, DL, CtxI, AC, DT, TLI, Visited, MaxDepth); - if (CtxI) { + if (CtxI && (!UseDerefAtPointSemantics || !V->canBeFreed())) { /// Look through assumes to see if both dereferencability and alignment can /// be proven by an assume if needed. RetainedKnowledge AlignRK; diff --git a/llvm/lib/Analysis/TargetTransformInfo.cpp b/llvm/lib/Analysis/TargetTransformInfo.cpp index df42dc2746daf..8b9722d047edc 100644 --- a/llvm/lib/Analysis/TargetTransformInfo.cpp +++ b/llvm/lib/Analysis/TargetTransformInfo.cpp @@ -1383,6 +1383,14 @@ bool TargetTransformInfo::hasArmWideBranch(bool Thumb) const { return TTIImpl->hasArmWideBranch(Thumb); } +uint64_t TargetTransformInfo::getFeatureMask(const Function &F) const { + return TTIImpl->getFeatureMask(F); +} + +bool TargetTransformInfo::isMultiversionedFunction(const Function &F) const { + return TTIImpl->isMultiversionedFunction(F); +} + unsigned TargetTransformInfo::getMaxNumArgs() const { return TTIImpl->getMaxNumArgs(); } diff --git a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp index de7fb21f5903e..49e5b7d9ef014 100644 --- a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp +++ b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp @@ -15770,7 +15770,7 @@ SDValue DAGCombiner::foldBitcastedFPLogic(SDNode *N, SelectionDAG &DAG, // FIXME: I don't think looking for bitcast intrinsically makes sense, but // removing this would require more changes. auto IsBitCastOrFree = [&TLI, FPOpcode](SDValue Op, EVT VT) { - if (Op.getOpcode() == ISD::BITCAST && Op.getOperand(0).getValueType() == VT) + if (sd_match(Op, m_BitCast(m_SpecificVT(VT)))) return true; return FPOpcode == ISD::FABS ? TLI.isFAbsFree(VT) : TLI.isFNegFree(VT); diff --git a/llvm/lib/Frontend/OpenMP/OMPIRBuilder.cpp b/llvm/lib/Frontend/OpenMP/OMPIRBuilder.cpp index 188a450d12fde..7dbf65fbf055b 100644 --- a/llvm/lib/Frontend/OpenMP/OMPIRBuilder.cpp +++ b/llvm/lib/Frontend/OpenMP/OMPIRBuilder.cpp @@ -264,6 +264,33 @@ computeOpenMPScheduleType(ScheduleKind ClauseKind, bool HasChunks, return Result; } +/// Emit an implicit cast to convert \p XRead to type of variable \p V +static llvm::Value *emitImplicitCast(IRBuilder<> &Builder, llvm::Value *XRead, + llvm::Value *V) { + // TODO: Add this functionality to the `AtomicInfo` interface + llvm::Type *XReadType = XRead->getType(); + llvm::Type *VType = V->getType(); + if (llvm::AllocaInst *vAlloca = dyn_cast(V)) + VType = vAlloca->getAllocatedType(); + + if (XReadType->isStructTy() && VType->isStructTy()) + // No need to extract or convert. A direct + // `store` will suffice. 
+ return XRead; + + if (XReadType->isStructTy()) + XRead = Builder.CreateExtractValue(XRead, /*Idxs=*/0); + if (VType->isIntegerTy() && XReadType->isFloatingPointTy()) + XRead = Builder.CreateFPToSI(XRead, VType); + else if (VType->isFloatingPointTy() && XReadType->isIntegerTy()) + XRead = Builder.CreateSIToFP(XRead, VType); + else if (VType->isIntegerTy() && XReadType->isIntegerTy()) + XRead = Builder.CreateIntCast(XRead, VType, true); + else if (VType->isFloatingPointTy() && XReadType->isFloatingPointTy()) + XRead = Builder.CreateFPCast(XRead, VType); + return XRead; +} + /// Make \p Source branch to \p Target. /// /// Handles two situations: @@ -8501,6 +8528,8 @@ OpenMPIRBuilder::createAtomicRead(const LocationDescription &Loc, } } checkAndEmitFlushAfterAtomic(Loc, AO, AtomicKind::Read); + if (XRead->getType() != V.Var->getType()) + XRead = emitImplicitCast(Builder, XRead, V.Var); Builder.CreateStore(XRead, V.Var, V.IsVolatile); return Builder.saveIP(); } @@ -8785,6 +8814,8 @@ OpenMPIRBuilder::InsertPointOrErrorTy OpenMPIRBuilder::createAtomicCapture( return AtomicResult.takeError(); Value *CapturedVal = (IsPostfixUpdate ? AtomicResult->first : AtomicResult->second); + if (CapturedVal->getType() != V.Var->getType()) + CapturedVal = emitImplicitCast(Builder, CapturedVal, V.Var); Builder.CreateStore(CapturedVal, V.Var, V.IsVolatile); checkAndEmitFlushAfterAtomic(Loc, AO, AtomicKind::Capture); diff --git a/llvm/lib/IR/Value.cpp b/llvm/lib/IR/Value.cpp index 65b63955b6f6d..eddb67282fca4 100644 --- a/llvm/lib/IR/Value.cpp +++ b/llvm/lib/IR/Value.cpp @@ -36,7 +36,7 @@ using namespace llvm; -static cl::opt UseDerefAtPointSemantics( +cl::opt UseDerefAtPointSemantics( "use-dereferenceable-at-point-semantics", cl::Hidden, cl::init(false), cl::desc("Deref attributes and metadata infer facts at definition only")); diff --git a/llvm/lib/Target/AArch64/AArch64FrameLowering.cpp b/llvm/lib/Target/AArch64/AArch64FrameLowering.cpp index 206e410047db5..1582d1999ca1d 100644 --- a/llvm/lib/Target/AArch64/AArch64FrameLowering.cpp +++ b/llvm/lib/Target/AArch64/AArch64FrameLowering.cpp @@ -2926,26 +2926,12 @@ struct RegPairInfo { int FrameIdx; int Offset; enum RegType { GPR, FPR64, FPR128, PPR, ZPR, VG } Type; + const TargetRegisterClass *RC; RegPairInfo() = default; bool isPaired() const { return Reg2 != AArch64::NoRegister; } - unsigned getScale() const { - switch (Type) { - case PPR: - return 2; - case GPR: - case FPR64: - case VG: - return 8; - case ZPR: - case FPR128: - return 16; - } - llvm_unreachable("Unsupported type"); - } - bool isScalable() const { return Type == PPR || Type == ZPR; } }; @@ -3023,20 +3009,27 @@ static void computeCalleeSaveRegisterPairs( RegPairInfo RPI; RPI.Reg1 = CSI[i].getReg(); - if (AArch64::GPR64RegClass.contains(RPI.Reg1)) + if (AArch64::GPR64RegClass.contains(RPI.Reg1)) { RPI.Type = RegPairInfo::GPR; - else if (AArch64::FPR64RegClass.contains(RPI.Reg1)) + RPI.RC = &AArch64::GPR64RegClass; + } else if (AArch64::FPR64RegClass.contains(RPI.Reg1)) { RPI.Type = RegPairInfo::FPR64; - else if (AArch64::FPR128RegClass.contains(RPI.Reg1)) + RPI.RC = &AArch64::FPR64RegClass; + } else if (AArch64::FPR128RegClass.contains(RPI.Reg1)) { RPI.Type = RegPairInfo::FPR128; - else if (AArch64::ZPRRegClass.contains(RPI.Reg1)) + RPI.RC = &AArch64::FPR128RegClass; + } else if (AArch64::ZPRRegClass.contains(RPI.Reg1)) { RPI.Type = RegPairInfo::ZPR; - else if (AArch64::PPRRegClass.contains(RPI.Reg1)) + RPI.RC = &AArch64::ZPRRegClass; + } else if (AArch64::PPRRegClass.contains(RPI.Reg1)) { RPI.Type 
= RegPairInfo::PPR; - else if (RPI.Reg1 == AArch64::VG) + RPI.RC = &AArch64::PPRRegClass; + } else if (RPI.Reg1 == AArch64::VG) { RPI.Type = RegPairInfo::VG; - else + RPI.RC = &AArch64::FIXED_REGSRegClass; + } else { llvm_unreachable("Unsupported register class."); + } // Add the stack hazard size as we transition from GPR->FPR CSRs. if (AFI->hasStackHazardSlotIndex() && @@ -3045,7 +3038,7 @@ static void computeCalleeSaveRegisterPairs( ByteOffset += StackFillDir * StackHazardSize; LastReg = RPI.Reg1; - int Scale = RPI.getScale(); + int Scale = TRI->getSpillSize(*RPI.RC); // Add the next reg to the pair if it is in the same register class. if (unsigned(i + RegInc) < Count && !AFI->hasStackHazardSlotIndex()) { Register NextReg = CSI[i + RegInc].getReg(); @@ -3254,38 +3247,26 @@ bool AArch64FrameLowering::spillCalleeSavedRegisters( // Rationale: This sequence saves uop updates compared to a sequence of // pre-increment spills like stp xi,xj,[sp,#-16]! // Note: Similar rationale and sequence for restores in epilog. - unsigned Size; - Align Alignment; + unsigned Size = TRI->getSpillSize(*RPI.RC); + Align Alignment = TRI->getSpillAlign(*RPI.RC); switch (RPI.Type) { case RegPairInfo::GPR: StrOpc = RPI.isPaired() ? AArch64::STPXi : AArch64::STRXui; - Size = 8; - Alignment = Align(8); break; case RegPairInfo::FPR64: StrOpc = RPI.isPaired() ? AArch64::STPDi : AArch64::STRDui; - Size = 8; - Alignment = Align(8); break; case RegPairInfo::FPR128: StrOpc = RPI.isPaired() ? AArch64::STPQi : AArch64::STRQui; - Size = 16; - Alignment = Align(16); break; case RegPairInfo::ZPR: StrOpc = RPI.isPaired() ? AArch64::ST1B_2Z_IMM : AArch64::STR_ZXI; - Size = 16; - Alignment = Align(16); break; case RegPairInfo::PPR: StrOpc = AArch64::STR_PXI; - Size = 2; - Alignment = Align(2); break; case RegPairInfo::VG: StrOpc = AArch64::STRXui; - Size = 8; - Alignment = Align(8); break; } @@ -3495,33 +3476,23 @@ bool AArch64FrameLowering::restoreCalleeSavedRegisters( // ldp x22, x21, [sp, #0] // addImm(+0) // Note: see comment in spillCalleeSavedRegisters() unsigned LdrOpc; - unsigned Size; - Align Alignment; + unsigned Size = TRI->getSpillSize(*RPI.RC); + Align Alignment = TRI->getSpillAlign(*RPI.RC); switch (RPI.Type) { case RegPairInfo::GPR: LdrOpc = RPI.isPaired() ? AArch64::LDPXi : AArch64::LDRXui; - Size = 8; - Alignment = Align(8); break; case RegPairInfo::FPR64: LdrOpc = RPI.isPaired() ? AArch64::LDPDi : AArch64::LDRDui; - Size = 8; - Alignment = Align(8); break; case RegPairInfo::FPR128: LdrOpc = RPI.isPaired() ? AArch64::LDPQi : AArch64::LDRQui; - Size = 16; - Alignment = Align(16); break; case RegPairInfo::ZPR: LdrOpc = RPI.isPaired() ? 
AArch64::LD1B_2Z_IMM : AArch64::LDR_ZXI; - Size = 16; - Alignment = Align(16); break; case RegPairInfo::PPR: LdrOpc = AArch64::LDR_PXI; - Size = 2; - Alignment = Align(2); break; case RegPairInfo::VG: continue; @@ -3795,14 +3766,15 @@ void AArch64FrameLowering::determineCalleeSaves(MachineFunction &MF, unsigned CSStackSize = 0; unsigned SVECSStackSize = 0; const TargetRegisterInfo *TRI = MF.getSubtarget().getRegisterInfo(); - const MachineRegisterInfo &MRI = MF.getRegInfo(); for (unsigned Reg : SavedRegs.set_bits()) { - auto RegSize = TRI->getRegSizeInBits(Reg, MRI) / 8; + auto *RC = TRI->getMinimalPhysRegClass(Reg); + assert(RC && "expected register class!"); + auto SpillSize = TRI->getSpillSize(*RC); if (AArch64::PPRRegClass.contains(Reg) || AArch64::ZPRRegClass.contains(Reg)) - SVECSStackSize += RegSize; + SVECSStackSize += SpillSize; else - CSStackSize += RegSize; + CSStackSize += SpillSize; } // Increase the callee-saved stack size if the function has streaming mode diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp index d4a114c275fb7..7d3ca46204b67 100644 --- a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp +++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp @@ -2373,8 +2373,9 @@ bool AArch64TargetLowering::targetShrinkDemandedConstant( return false; unsigned Size = VT.getSizeInBits(); - assert((Size == 32 || Size == 64) && - "i32 or i64 is expected after legalization."); + + if (Size != 32 && Size != 64) + return false; // Exit early if we demand all bits. if (DemandedBits.popcount() == Size) diff --git a/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp b/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp index 932a6f9ce23fd..7f10bfed739b4 100644 --- a/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp +++ b/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp @@ -23,6 +23,7 @@ #include "llvm/IR/IntrinsicsAArch64.h" #include "llvm/IR/PatternMatch.h" #include "llvm/Support/Debug.h" +#include "llvm/TargetParser/AArch64TargetParser.h" #include "llvm/Transforms/InstCombine/InstCombiner.h" #include "llvm/Transforms/Vectorize/LoopVectorizationLegality.h" #include @@ -248,6 +249,19 @@ static bool hasPossibleIncompatibleOps(const Function *F) { return false; } +uint64_t AArch64TTIImpl::getFeatureMask(const Function &F) const { + StringRef AttributeStr = + isMultiversionedFunction(F) ? 
"fmv-features" : "target-features"; + StringRef FeatureStr = F.getFnAttribute(AttributeStr).getValueAsString(); + SmallVector Features; + FeatureStr.split(Features, ","); + return AArch64::getFMVPriority(Features); +} + +bool AArch64TTIImpl::isMultiversionedFunction(const Function &F) const { + return F.hasFnAttribute("fmv-features"); +} + bool AArch64TTIImpl::areInlineCompatible(const Function *Caller, const Function *Callee) const { SMEAttrs CallerAttrs(*Caller), CalleeAttrs(*Callee); diff --git a/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.h b/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.h index 8e7e590c173ff..1eb805ae00b1b 100644 --- a/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.h +++ b/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.h @@ -89,6 +89,10 @@ class AArch64TTIImpl : public BasicTTIImplBase { unsigned getInlineCallPenalty(const Function *F, const CallBase &Call, unsigned DefaultCallPenalty) const; + uint64_t getFeatureMask(const Function &F) const; + + bool isMultiversionedFunction(const Function &F) const; + /// \name Scalar TTI Implementations /// @{ diff --git a/llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.cpp b/llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.cpp index 169f1369fb543..7de64bddf7884 100644 --- a/llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.cpp +++ b/llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.cpp @@ -715,7 +715,7 @@ yaml::SIMachineFunctionInfo::SIMachineFunctionInfo( ArgInfo(convertArgumentInfo(MFI.getArgInfo(), TRI)), PSInputAddr(MFI.getPSInputAddr()), PSInputEnable(MFI.getPSInputEnable()), MaxMemoryClusterDWords(MFI.getMaxMemoryClusterDWords()), - Mode(MFI.getMode()) { + Mode(MFI.getMode()), HasInitWholeWave(MFI.hasInitWholeWave()) { for (Register Reg : MFI.getSGPRSpillPhysVGPRs()) SpillPhysVGPRS.push_back(regToString(Reg, TRI)); diff --git a/llvm/lib/Target/LoongArch/AsmParser/LoongArchAsmParser.cpp b/llvm/lib/Target/LoongArch/AsmParser/LoongArchAsmParser.cpp index efc8b77f8d8fa..420b98b8a9c1f 100644 --- a/llvm/lib/Target/LoongArch/AsmParser/LoongArchAsmParser.cpp +++ b/llvm/lib/Target/LoongArch/AsmParser/LoongArchAsmParser.cpp @@ -1009,7 +1009,8 @@ void LoongArchAsmParser::emitLoadAddressPcrel(MCInst &Inst, SMLoc IDLoc, Insts.push_back( LoongArchAsmParser::Inst(ADDI, LoongArchMCExpr::VK_LoongArch_PCALA_LO12)); - emitLAInstSeq(DestReg, DestReg, Symbol, Insts, IDLoc, Out, true); + emitLAInstSeq(DestReg, DestReg, Symbol, Insts, IDLoc, Out, + /*RelaxHint=*/true); } void LoongArchAsmParser::emitLoadAddressPcrelLarge(MCInst &Inst, SMLoc IDLoc, @@ -1083,7 +1084,8 @@ void LoongArchAsmParser::emitLoadAddressGot(MCInst &Inst, SMLoc IDLoc, Insts.push_back( LoongArchAsmParser::Inst(LD, LoongArchMCExpr::VK_LoongArch_GOT_PC_LO12)); - emitLAInstSeq(DestReg, DestReg, Symbol, Insts, IDLoc, Out, true); + emitLAInstSeq(DestReg, DestReg, Symbol, Insts, IDLoc, Out, + /*RelaxHint=*/true); } void LoongArchAsmParser::emitLoadAddressGotLarge(MCInst &Inst, SMLoc IDLoc, @@ -1176,7 +1178,8 @@ void LoongArchAsmParser::emitLoadAddressTLSIE(MCInst &Inst, SMLoc IDLoc, Insts.push_back(LoongArchAsmParser::Inst( LD, LoongArchMCExpr::VK_LoongArch_TLS_IE_PC_LO12)); - emitLAInstSeq(DestReg, DestReg, Symbol, Insts, IDLoc, Out); + emitLAInstSeq(DestReg, DestReg, Symbol, Insts, IDLoc, Out, + /*RelaxHint=*/true); } void LoongArchAsmParser::emitLoadAddressTLSIELarge(MCInst &Inst, SMLoc IDLoc, @@ -1248,7 +1251,8 @@ void LoongArchAsmParser::emitLoadAddressTLSLD(MCInst &Inst, SMLoc IDLoc, Insts.push_back(LoongArchAsmParser::Inst( ADDI, 
LoongArchMCExpr::VK_LoongArch_GOT_PC_LO12)); - emitLAInstSeq(DestReg, DestReg, Symbol, Insts, IDLoc, Out); + emitLAInstSeq(DestReg, DestReg, Symbol, Insts, IDLoc, Out, + /*RelaxHint=*/true); } void LoongArchAsmParser::emitLoadAddressTLSLDLarge(MCInst &Inst, SMLoc IDLoc, @@ -1320,7 +1324,8 @@ void LoongArchAsmParser::emitLoadAddressTLSGD(MCInst &Inst, SMLoc IDLoc, Insts.push_back(LoongArchAsmParser::Inst( ADDI, LoongArchMCExpr::VK_LoongArch_GOT_PC_LO12)); - emitLAInstSeq(DestReg, DestReg, Symbol, Insts, IDLoc, Out); + emitLAInstSeq(DestReg, DestReg, Symbol, Insts, IDLoc, Out, + /*RelaxHint=*/true); } void LoongArchAsmParser::emitLoadAddressTLSGDLarge(MCInst &Inst, SMLoc IDLoc, @@ -1409,7 +1414,8 @@ void LoongArchAsmParser::emitLoadAddressTLSDesc(MCInst &Inst, SMLoc IDLoc, Insts.push_back(LoongArchAsmParser::Inst( LoongArch::JIRL, LoongArchMCExpr::VK_LoongArch_TLS_DESC_CALL)); - emitLAInstSeq(DestReg, DestReg, Symbol, Insts, IDLoc, Out); + emitLAInstSeq(DestReg, DestReg, Symbol, Insts, IDLoc, Out, + /*RelaxHint=*/true); } void LoongArchAsmParser::emitLoadAddressTLSDescLarge(MCInst &Inst, SMLoc IDLoc, @@ -1500,8 +1506,9 @@ void LoongArchAsmParser::emitFuncCall36(MCInst &Inst, SMLoc IDLoc, IsTailCall ? Inst.getOperand(0).getReg() : MCRegister(LoongArch::R1); const MCExpr *Sym = IsTailCall ? Inst.getOperand(1).getExpr() : Inst.getOperand(0).getExpr(); - const LoongArchMCExpr *LE = LoongArchMCExpr::create( - Sym, llvm::LoongArchMCExpr::VK_LoongArch_CALL36, getContext()); + const LoongArchMCExpr *LE = + LoongArchMCExpr::create(Sym, llvm::LoongArchMCExpr::VK_LoongArch_CALL36, + getContext(), /*RelaxHint=*/true); Out.emitInstruction( MCInstBuilder(LoongArch::PCADDU18I).addReg(ScratchReg).addExpr(LE), diff --git a/llvm/lib/Target/LoongArch/MCTargetDesc/LoongArchMCCodeEmitter.cpp b/llvm/lib/Target/LoongArch/MCTargetDesc/LoongArchMCCodeEmitter.cpp index 359bde1244429..04d57f0fe7457 100644 --- a/llvm/lib/Target/LoongArch/MCTargetDesc/LoongArchMCCodeEmitter.cpp +++ b/llvm/lib/Target/LoongArch/MCTargetDesc/LoongArchMCCodeEmitter.cpp @@ -282,9 +282,11 @@ LoongArchMCCodeEmitter::getExprOpValue(const MCInst &MI, const MCOperand &MO, break; case LoongArchMCExpr::VK_LoongArch_TLS_LE_HI20_R: FixupKind = LoongArch::fixup_loongarch_tls_le_hi20_r; + RelaxCandidate = true; break; case LoongArchMCExpr::VK_LoongArch_TLS_LE_LO12_R: FixupKind = LoongArch::fixup_loongarch_tls_le_lo12_r; + RelaxCandidate = true; break; case LoongArchMCExpr::VK_LoongArch_PCREL20_S2: FixupKind = LoongArch::fixup_loongarch_pcrel20_s2; @@ -387,11 +389,17 @@ void LoongArchMCCodeEmitter::expandAddTPRel(const MCInst &MI, "Expected %le_add_r relocation on TP-relative symbol"); // Emit the correct %le_add_r relocation for the symbol. - // TODO: Emit R_LARCH_RELAX for %le_add_r where the relax feature is enabled. Fixups.push_back(MCFixup::create( 0, Expr, MCFixupKind(LoongArch::fixup_loongarch_tls_le_add_r), MI.getLoc())); + // Emit R_LARCH_RELAX for %le_add_r when the relax feature is enabled. + if (STI.hasFeature(LoongArch::FeatureRelax)) { + const MCConstantExpr *Dummy = MCConstantExpr::create(0, Ctx); + Fixups.push_back(MCFixup::create( + 0, Dummy, MCFixupKind(LoongArch::fixup_loongarch_relax), MI.getLoc())); + } + // Emit a normal ADD instruction with the given operands. unsigned ADD = MI.getOpcode() == LoongArch::PseudoAddTPRel_D ? 
LoongArch::ADD_D diff --git a/llvm/lib/Target/X86/X86ExpandPseudo.cpp b/llvm/lib/Target/X86/X86ExpandPseudo.cpp index fc8a0eaed140d..7fbba7f05e0a5 100644 --- a/llvm/lib/Target/X86/X86ExpandPseudo.cpp +++ b/llvm/lib/Target/X86/X86ExpandPseudo.cpp @@ -578,10 +578,10 @@ bool X86ExpandPseudo::expandMI(MachineBasicBlock &MBB, unsigned Opc; switch (Opcode) { case X86::PTILELOADDRSV: - Opc = X86::TILELOADDRS; + Opc = GET_EGPR_IF_ENABLED(X86::TILELOADDRS); break; case X86::PTILELOADDRST1V: - Opc = X86::TILELOADDRST1; + Opc = GET_EGPR_IF_ENABLED(X86::TILELOADDRST1); break; case X86::PTILELOADDV: Opc = GET_EGPR_IF_ENABLED(X86::TILELOADD); @@ -737,28 +737,28 @@ bool X86ExpandPseudo::expandMI(MachineBasicBlock &MBB, unsigned Opc; switch (Opcode) { case X86::PT2RPNTLVWZ0V: - Opc = X86::T2RPNTLVWZ0; + Opc = GET_EGPR_IF_ENABLED(X86::T2RPNTLVWZ0); break; case X86::PT2RPNTLVWZ0T1V: - Opc = X86::T2RPNTLVWZ0T1; + Opc = GET_EGPR_IF_ENABLED(X86::T2RPNTLVWZ0T1); break; case X86::PT2RPNTLVWZ1V: - Opc = X86::T2RPNTLVWZ1; + Opc = GET_EGPR_IF_ENABLED(X86::T2RPNTLVWZ1); break; case X86::PT2RPNTLVWZ1T1V: - Opc = X86::T2RPNTLVWZ1T1; + Opc = GET_EGPR_IF_ENABLED(X86::T2RPNTLVWZ1T1); break; case X86::PT2RPNTLVWZ0RSV: - Opc = X86::T2RPNTLVWZ0RS; + Opc = GET_EGPR_IF_ENABLED(X86::T2RPNTLVWZ0RS); break; case X86::PT2RPNTLVWZ0RST1V: - Opc = X86::T2RPNTLVWZ0RST1; + Opc = GET_EGPR_IF_ENABLED(X86::T2RPNTLVWZ0RST1); break; case X86::PT2RPNTLVWZ1RSV: - Opc = X86::T2RPNTLVWZ1RS; + Opc = GET_EGPR_IF_ENABLED(X86::T2RPNTLVWZ1RS); break; case X86::PT2RPNTLVWZ1RST1V: - Opc = X86::T2RPNTLVWZ1RST1; + Opc = GET_EGPR_IF_ENABLED(X86::T2RPNTLVWZ1RST1); break; default: llvm_unreachable("Impossible Opcode!"); diff --git a/llvm/lib/Target/X86/X86ISelDAGToDAG.cpp b/llvm/lib/Target/X86/X86ISelDAGToDAG.cpp index 9b340a778b36a..84bcdae520885 100644 --- a/llvm/lib/Target/X86/X86ISelDAGToDAG.cpp +++ b/llvm/lib/Target/X86/X86ISelDAGToDAG.cpp @@ -1800,10 +1800,10 @@ void X86DAGToDAGISel::emitFunctionEntryCode() { emitSpecialCodeForMain(); } -static bool isDispSafeForFrameIndex(int64_t Val) { - // On 64-bit platforms, we can run into an issue where a frame index +static bool isDispSafeForFrameIndexOrRegBase(int64_t Val) { + // We can run into an issue where a frame index or a register base // includes a displacement that, when added to the explicit displacement, - // will overflow the displacement field. Assuming that the frame index + // will overflow the displacement field. Assuming that the // displacement fits into a 31-bit integer (which is only slightly more // aggressive than the current fundamental assumption that it fits into // a 32-bit integer), a 31-bit disp should always be safe. @@ -1831,7 +1831,7 @@ bool X86DAGToDAGISel::foldOffsetIntoAddress(uint64_t Offset, // In addition to the checks required for a register base, check that // we do not try to use an unsafe Disp with a frame index. if (AM.BaseType == X86ISelAddressMode::FrameIndexBase && - !isDispSafeForFrameIndex(Val)) + !isDispSafeForFrameIndexOrRegBase(Val)) return true; // In ILP32 (x32) mode, pointers are 32 bits and need to be zero-extended to // 64 bits. Instructions with 32-bit register addresses perform this zero @@ -1849,10 +1849,14 @@ bool X86DAGToDAGISel::foldOffsetIntoAddress(uint64_t Offset, // to get an address size override to be emitted. However, this // pseudo-register is not part of any register class and therefore causes // MIR verification to fail. 
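
The 31-bit bound referenced in the comment above is worth spelling out: two values that each fit in a signed 31-bit integer always sum to something representable in the signed 32-bit displacement field, so bounding each contribution at 31 bits is sufficient. A minimal sketch of the check the renamed helper is assumed to perform (its body is outside this hunk, so treat this as an illustration rather than the patch's code):

    // Sketch only: a displacement is considered safe for a frame-index or
    // register base when it fits in a signed 31-bit integer; adding one more
    // 31-bit-safe contribution cannot overflow the signed 32-bit field.
    static bool isDispSafeForFrameIndexOrRegBase(int64_t Val) {
      return isInt<31>(Val); // isInt<N> from llvm/Support/MathExtras.h
    }
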
- if (Subtarget->isTarget64BitILP32() && !isUInt<31>(Val) && + if (Subtarget->isTarget64BitILP32() && + !isDispSafeForFrameIndexOrRegBase((uint32_t)Val) && !AM.hasBaseOrIndexReg()) return true; - } + } else if (AM.hasBaseOrIndexReg() && !isDispSafeForFrameIndexOrRegBase(Val)) + // For 32-bit X86, make sure the displacement still isn't close to the + // expressible limit. + return true; AM.Disp = Val; return false; } @@ -2553,7 +2557,7 @@ bool X86DAGToDAGISel::matchAddressRecursively(SDValue N, X86ISelAddressMode &AM, case ISD::FrameIndex: if (AM.BaseType == X86ISelAddressMode::RegBase && AM.Base_Reg.getNode() == nullptr && - (!Subtarget->is64Bit() || isDispSafeForFrameIndex(AM.Disp))) { + (!Subtarget->is64Bit() || isDispSafeForFrameIndexOrRegBase(AM.Disp))) { AM.BaseType = X86ISelAddressMode::FrameIndexBase; AM.Base_FrameIndex = cast(N)->getIndex(); return false; diff --git a/llvm/lib/Target/X86/X86ISelLowering.cpp b/llvm/lib/Target/X86/X86ISelLowering.cpp index 90e3e15b1fb46..dba38f3e1a0bc 100644 --- a/llvm/lib/Target/X86/X86ISelLowering.cpp +++ b/llvm/lib/Target/X86/X86ISelLowering.cpp @@ -26438,7 +26438,7 @@ SDValue X86TargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op, switch (CC) { case ISD::SETEQ: { SetCC = getSETCC(X86::COND_E, Comi, dl, DAG); - if (HasAVX10_2_COMX & HasAVX10_2_COMX_Ty) // ZF == 1 + if (HasAVX10_2_COMX && HasAVX10_2_COMX_Ty) // ZF == 1 break; // (ZF = 1 and PF = 0) SDValue SetNP = getSETCC(X86::COND_NP, Comi, dl, DAG); @@ -26447,7 +26447,7 @@ SDValue X86TargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op, } case ISD::SETNE: { SetCC = getSETCC(X86::COND_NE, Comi, dl, DAG); - if (HasAVX10_2_COMX & HasAVX10_2_COMX_Ty) // ZF == 0 + if (HasAVX10_2_COMX && HasAVX10_2_COMX_Ty) // ZF == 0 break; // (ZF = 0 or PF = 1) SDValue SetP = getSETCC(X86::COND_P, Comi, dl, DAG); @@ -37800,14 +37800,14 @@ X86TargetLowering::EmitInstrWithCustomInserter(MachineInstr &MI, case X86::PTILESTORED: Opc = GET_EGPR_IF_ENABLED(X86::TILESTORED); break; -#undef GET_EGPR_IF_ENABLED case X86::PTILELOADDRS: - Opc = X86::TILELOADDRS; + Opc = GET_EGPR_IF_ENABLED(X86::TILELOADDRS); break; case X86::PTILELOADDRST1: - Opc = X86::TILELOADDRST1; + Opc = GET_EGPR_IF_ENABLED(X86::TILELOADDRST1); break; } +#undef GET_EGPR_IF_ENABLED MachineInstrBuilder MIB = BuildMI(*BB, MI, MIMD, TII->get(Opc)); unsigned CurOp = 0; @@ -37838,34 +37838,36 @@ X86TargetLowering::EmitInstrWithCustomInserter(MachineInstr &MI, case X86::PT2RPNTLVWZ1RST1: { const DebugLoc &DL = MI.getDebugLoc(); unsigned Opc; +#define GET_EGPR_IF_ENABLED(OPC) (Subtarget.hasEGPR() ? 
OPC##_EVEX : OPC) switch (MI.getOpcode()) { default: llvm_unreachable("Unexpected instruction!"); case X86::PT2RPNTLVWZ0: - Opc = X86::T2RPNTLVWZ0; + Opc = GET_EGPR_IF_ENABLED(X86::T2RPNTLVWZ0); break; case X86::PT2RPNTLVWZ0T1: - Opc = X86::T2RPNTLVWZ0T1; + Opc = GET_EGPR_IF_ENABLED(X86::T2RPNTLVWZ0T1); break; case X86::PT2RPNTLVWZ1: - Opc = X86::T2RPNTLVWZ1; + Opc = GET_EGPR_IF_ENABLED(X86::T2RPNTLVWZ1); break; case X86::PT2RPNTLVWZ1T1: - Opc = X86::T2RPNTLVWZ1T1; + Opc = GET_EGPR_IF_ENABLED(X86::T2RPNTLVWZ1T1); break; case X86::PT2RPNTLVWZ0RS: - Opc = X86::T2RPNTLVWZ0RS; + Opc = GET_EGPR_IF_ENABLED(X86::T2RPNTLVWZ0RS); break; case X86::PT2RPNTLVWZ0RST1: - Opc = X86::T2RPNTLVWZ0RST1; + Opc = GET_EGPR_IF_ENABLED(X86::T2RPNTLVWZ0RST1); break; case X86::PT2RPNTLVWZ1RS: - Opc = X86::T2RPNTLVWZ1RS; + Opc = GET_EGPR_IF_ENABLED(X86::T2RPNTLVWZ1RS); break; case X86::PT2RPNTLVWZ1RST1: - Opc = X86::T2RPNTLVWZ1RST1; + Opc = GET_EGPR_IF_ENABLED(X86::T2RPNTLVWZ1RST1); break; } +#undef GET_EGPR_IF_ENABLED MachineInstrBuilder MIB = BuildMI(*BB, MI, DL, TII->get(Opc)); MIB.addReg(TMMImmToTMMPair(MI.getOperand(0).getImm()), RegState::Define); @@ -58570,8 +58572,8 @@ static SDValue combineEXTRACT_SUBVECTOR(SDNode *N, SelectionDAG &DAG, return SDValue(); } -static SDValue combineScalarToVector(SDNode *N, SelectionDAG &DAG, - const X86Subtarget &Subtarget) { +static SDValue combineSCALAR_TO_VECTOR(SDNode *N, SelectionDAG &DAG, + const X86Subtarget &Subtarget) { EVT VT = N->getValueType(0); SDValue Src = N->getOperand(0); SDLoc DL(N); @@ -59264,7 +59266,7 @@ SDValue X86TargetLowering::PerformDAGCombine(SDNode *N, // clang-format off default: break; case ISD::SCALAR_TO_VECTOR: - return combineScalarToVector(N, DAG, Subtarget); + return combineSCALAR_TO_VECTOR(N, DAG, Subtarget); case ISD::EXTRACT_VECTOR_ELT: case X86ISD::PEXTRW: case X86ISD::PEXTRB: diff --git a/llvm/lib/Target/X86/X86InstrAMX.td b/llvm/lib/Target/X86/X86InstrAMX.td index a055ba91d3e17..85046228bc8c5 100644 --- a/llvm/lib/Target/X86/X86InstrAMX.td +++ b/llvm/lib/Target/X86/X86InstrAMX.td @@ -345,26 +345,33 @@ let Predicates = [HasAMXTILE, In64BitMode], isPseudo = true, SchedRW = [WriteSys def PTILEPAIRLOAD : PseudoI<(outs TILEPair:$dst), (ins opaquemem:$src), []>; } -let Predicates = [HasAMXTRANSPOSE, In64BitMode] in { - let SchedRW = [WriteSystem] in { - def T2RPNTLVWZ0 : I<0x6e, MRMSrcMemFSIB, (outs TILEPair:$dst), - (ins sibmem:$src), "t2rpntlvwz0\t{$src, $dst|$dst, $src}", - []>, VEX, WIG, T8,PS; +multiclass T2RPNTLVW_Base op1, bits<8> op2, string rs, string suffix> { + def Z0#rs#suffix : I, PS; + def Z0#rs#T1#suffix : I, PS; + def Z1#rs#suffix : I, PD; + def Z1#rs#T1#suffix : I, PD; +} - def T2RPNTLVWZ0T1 : I<0x6f, MRMSrcMemFSIB, (outs TILEPair:$dst), - (ins sibmem:$src), "t2rpntlvwz0t1\t{$src, $dst|$dst, $src}", - []>, VEX, T8,PS; +let Predicates = [HasAMXTRANSPOSE, In64BitMode], SchedRW = [WriteSystem] in + defm T2RPNTLVW : T2RPNTLVW_Base<0x6e, 0x6f, "", "">, T8, VEX; - def T2RPNTLVWZ1 : I<0x6e, MRMSrcMemFSIB, (outs TILEPair:$dst), - (ins sibmem:$src), "t2rpntlvwz1\t{$src, $dst|$dst, $src}", - []>, VEX, T8,PD; +let Predicates = [HasAMXTRANSPOSE, HasEGPR, In64BitMode], SchedRW = [WriteSystem] in + defm T2RPNTLVW : T2RPNTLVW_Base<0x6e, 0x6f, "", "_EVEX">, T8, EVEX, NoCD8; - def T2RPNTLVWZ1T1 : I<0x6f, MRMSrcMemFSIB, (outs TILEPair:$dst), - (ins sibmem:$src), "t2rpntlvwz1t1\t{$src, $dst|$dst, $src}", - []>, VEX, T8,PD; +let Predicates = [HasAMXMOVRS, HasAMXTRANSPOSE, In64BitMode], SchedRW = [WriteSystem] in + defm T2RPNTLVW : 
T2RPNTLVW_Base<0xf8, 0xf9, "RS", "">, T_MAP5, VEX; +let Predicates = [HasAMXMOVRS, HasAMXTRANSPOSE, HasEGPR, In64BitMode], SchedRW = [WriteSystem] in + defm T2RPNTLVW : T2RPNTLVW_Base<0xf8, 0xf9, "RS", "_EVEX">, T_MAP5, EVEX, NoCD8; + +let Predicates = [HasAMXTRANSPOSE, In64BitMode] in { + let SchedRW = [WriteSystem] in { def TTRANSPOSED : I<0x5f, MRMSrcReg, (outs TILE:$dst), (ins TILE:$src), - "ttransposed\t{$src, $dst|$dst, $src}", []>, VEX, T8,XS; + "ttransposed\t{$src, $dst|$dst, $src}", []>, VEX, T8, XS; let isPseudo = true in { def PT2RPNTLVWZ0V : PseudoI<(outs TILEPair:$dst), (ins GR16:$src1, GR16:$src2, GR16:$src3, opaquemem:$src4), @@ -491,22 +498,6 @@ let Predicates = [HasAMXCOMPLEX, HasAMXTRANSPOSE, In64BitMode], SchedRW = [Write } let Predicates = [HasAMXMOVRS, HasAMXTRANSPOSE, In64BitMode], SchedRW = [WriteSystem] in { - def T2RPNTLVWZ0RS : I<0xf8, MRMSrcMemFSIB, (outs TILEPair:$dst), - (ins sibmem:$src1), - "t2rpntlvwz0rs\t{$src1, $dst|$dst, $src1}", - []>, VEX, T_MAP5; - def T2RPNTLVWZ0RST1 : I<0xf9, MRMSrcMemFSIB, (outs TILEPair:$dst), - (ins sibmem:$src1), - "t2rpntlvwz0rst1\t{$src1, $dst|$dst, $src1}", - []>, VEX, T_MAP5; - def T2RPNTLVWZ1RS : I<0xf8, MRMSrcMemFSIB, (outs TILEPair:$dst), - (ins sibmem:$src1), - "t2rpntlvwz1rs\t{$src1, $dst|$dst, $src1}", - []>, VEX, T_MAP5, PD; - def T2RPNTLVWZ1RST1 : I<0xf9, MRMSrcMemFSIB, (outs TILEPair:$dst), - (ins sibmem:$src1), - "t2rpntlvwz1rst1\t{$src1, $dst|$dst, $src1}", - []>, VEX, T_MAP5, PD; let isPseudo = true in { def PT2RPNTLVWZ0RSV : PseudoI<(outs TILEPair:$dst), (ins GR16:$src1, GR16:$src2, GR16:$src3, opaquemem:$src4), @@ -529,16 +520,20 @@ let Predicates = [HasAMXMOVRS, HasAMXTRANSPOSE, In64BitMode], SchedRW = [WriteSy } } // HasAMXMOVRS, HasAMXTRANSPOSE -let Predicates = [HasAMXMOVRS, In64BitMode], SchedRW = [WriteSystem] in { - def TILELOADDRS : I<0x4a, MRMSrcMemFSIB, (outs TILE:$dst), - (ins sibmem:$src1), - "tileloaddrs\t{$src1, $dst|$dst, $src1}", - []>, VEX, T8, XD; - def TILELOADDRST1 : I<0x4a, MRMSrcMemFSIB, (outs TILE:$dst), - (ins sibmem:$src1), - "tileloaddrst1\t{$src1, $dst|$dst, $src1}", - []>, VEX, T8, PD; +multiclass TILELOADDRS_Base { + def suffix : I<0x4a, MRMSrcMemFSIB, (outs TILE:$dst), (ins sibmem:$src1), + "tileloaddrs\t{$src1, $dst|$dst, $src1}", []>, T8, XD; + def T1#suffix : I<0x4a, MRMSrcMemFSIB, (outs TILE:$dst), (ins sibmem:$src1), + "tileloaddrst1\t{$src1, $dst|$dst, $src1}", []>, T8, PD; +} + +let Predicates = [HasAMXMOVRS, In64BitMode], SchedRW = [WriteSystem] in + defm TILELOADDRS : TILELOADDRS_Base<"">, VEX; +let Predicates = [HasAMXMOVRS, HasEGPR, In64BitMode], SchedRW = [WriteSystem] in + defm TILELOADDRS : TILELOADDRS_Base<"_EVEX">, EVEX, NoCD8; + +let Predicates = [HasAMXMOVRS, In64BitMode], SchedRW = [WriteSystem] in { let isPseudo = true, mayLoad = 1 in { def PTILELOADDRSV : PseudoI<(outs TILE:$dst), (ins GR16:$src1, GR16:$src2, diff --git a/llvm/lib/Target/X86/X86InstrAVX10.td b/llvm/lib/Target/X86/X86InstrAVX10.td index 127016184bc17..edbcb17297603 100644 --- a/llvm/lib/Target/X86/X86InstrAVX10.td +++ b/llvm/lib/Target/X86/X86InstrAVX10.td @@ -1767,9 +1767,9 @@ multiclass vmovrs_p opc, string OpStr, X86VectorVTInfo _> { } multiclass vmovrs_p_vl opc, string OpStr, AVX512VLVectorVTInfo _Vec> { - let Predicates = [HasMOVRS, HasAVX10_2_512] in + let Predicates = [HasMOVRS, HasAVX10_2_512, In64BitMode] in defm Z : vmovrs_p, EVEX_V512; - let Predicates = [HasMOVRS, HasAVX10_2] in { + let Predicates = [HasMOVRS, HasAVX10_2, In64BitMode] in { defm Z128 : vmovrs_p, EVEX_V128; defm Z256 : 
vmovrs_p, EVEX_V256; } diff --git a/llvm/lib/Target/X86/X86InstrMisc.td b/llvm/lib/Target/X86/X86InstrMisc.td index 9fabe2acf0019..43c02c4f85844 100644 --- a/llvm/lib/Target/X86/X86InstrMisc.td +++ b/llvm/lib/Target/X86/X86InstrMisc.td @@ -1733,7 +1733,7 @@ def CLDEMOTE : I<0x1C, MRM0m, (outs), (ins i8mem:$src), "cldemote\t$src", // let SchedRW = [WriteLoad] in { -let Predicates = [HasMOVRS, NoEGPR] in { +let Predicates = [HasMOVRS, NoEGPR, In64BitMode] in { def MOVRS8rm : I<0x8A, MRMSrcMem, (outs GR8 :$dst), (ins i8mem :$src), "movrs{b}\t{$src, $dst|$dst, $src}", [(set GR8:$dst, (int_x86_movrsqi addr:$src))]>, T8; @@ -1746,8 +1746,25 @@ def MOVRS32rm : I<0x8B, MRMSrcMem, (outs GR32:$dst), (ins i32mem:$src), def MOVRS64rm : RI<0x8B, MRMSrcMem, (outs GR64:$dst), (ins i64mem:$src), "movrs{q}\t{$src, $dst|$dst, $src}", [(set GR64:$dst, (int_x86_movrsdi addr:$src))]>, T8; +} + +let Predicates = [HasMOVRS] in def PREFETCHRST2 : I<0x18, MRM4m, (outs), (ins i8mem:$src), "prefetchrst2\t$src", [(int_x86_prefetchrs addr:$src)]>, TB; + +let Predicates = [HasMOVRS, HasEGPR, In64BitMode] in { +def MOVRS8rm_EVEX : I<0x8A, MRMSrcMem, (outs GR8 :$dst), (ins i8mem :$src), + "movrs{b}\t{$src, $dst|$dst, $src}", + [(set GR8:$dst, (int_x86_movrsqi addr:$src))]>, EVEX, NoCD8, T_MAP4; +def MOVRS16rm_EVEX : I<0x8B, MRMSrcMem, (outs GR16:$dst), (ins i16mem:$src), + "movrs{w}\t{$src, $dst|$dst, $src}", + [(set GR16:$dst, (int_x86_movrshi addr:$src))]>, EVEX, NoCD8, PD, T_MAP4; +def MOVRS32rm_EVEX : I<0x8B, MRMSrcMem, (outs GR32:$dst), (ins i32mem:$src), + "movrs{l}\t{$src, $dst|$dst, $src}", + [(set GR32:$dst, (int_x86_movrssi addr:$src))]>, EVEX, NoCD8, T_MAP4; +def MOVRS64rm_EVEX : I<0x8B, MRMSrcMem, (outs GR64:$dst), (ins i64mem:$src), + "movrs{q}\t{$src, $dst|$dst, $src}", + [(set GR64:$dst, (int_x86_movrsdi addr:$src))]>, EVEX, NoCD8, T_MAP4, REX_W; +} } -} \ No newline at end of file diff --git a/llvm/lib/TargetParser/AArch64TargetParser.cpp b/llvm/lib/TargetParser/AArch64TargetParser.cpp index 34ca03a47e0a4..e13c6e6d28c2b 100644 --- a/llvm/lib/TargetParser/AArch64TargetParser.cpp +++ b/llvm/lib/TargetParser/AArch64TargetParser.cpp @@ -48,12 +48,33 @@ std::optional AArch64::ArchInfo::findBySubArch(StringRef SubA return {}; } +std::optional lookupFMVByID(AArch64::ArchExtKind ExtID) { + for (const AArch64::FMVInfo &Info : AArch64::getFMVInfo()) + if (Info.ID && *Info.ID == ExtID) + return Info; + return {}; +} + uint64_t AArch64::getFMVPriority(ArrayRef Features) { - uint64_t Priority = 0; - for (StringRef Feature : Features) - if (std::optional Info = parseFMVExtension(Feature)) - Priority |= (1ULL << Info->PriorityBit); - return Priority; + // Transitively enable the Arch Extensions which correspond to each feature. + ExtensionSet FeatureBits; + for (const StringRef Feature : Features) { + std::optional FMV = parseFMVExtension(Feature); + if (!FMV) { + if (std::optional Info = targetFeatureToExtension(Feature)) + FMV = lookupFMVByID(Info->ID); + } + if (FMV && FMV->ID) + FeatureBits.enable(*FMV->ID); + } + + // Construct a bitmask for all the transitively enabled Arch Extensions. 
+ uint64_t PriorityMask = 0; + for (const FMVInfo &Info : getFMVInfo()) + if (Info.ID && FeatureBits.Enabled.test(*Info.ID)) + PriorityMask |= (1ULL << Info.PriorityBit); + + return PriorityMask; } uint64_t AArch64::getCpuSupportsMask(ArrayRef Features) { diff --git a/llvm/lib/Transforms/IPO/GlobalOpt.cpp b/llvm/lib/Transforms/IPO/GlobalOpt.cpp index 78cd249c9c16a..bf0cacc6224be 100644 --- a/llvm/lib/Transforms/IPO/GlobalOpt.cpp +++ b/llvm/lib/Transforms/IPO/GlobalOpt.cpp @@ -2641,6 +2641,165 @@ DeleteDeadIFuncs(Module &M, return Changed; } +// Follows the use-def chain of \p V backwards until it finds a Function, +// in which case it collects in \p Versions. Return true on successful +// use-def chain traversal, false otherwise. +static bool collectVersions(TargetTransformInfo &TTI, Value *V, + SmallVectorImpl &Versions) { + if (auto *F = dyn_cast(V)) { + if (!TTI.isMultiversionedFunction(*F)) + return false; + Versions.push_back(F); + } else if (auto *Sel = dyn_cast(V)) { + if (!collectVersions(TTI, Sel->getTrueValue(), Versions)) + return false; + if (!collectVersions(TTI, Sel->getFalseValue(), Versions)) + return false; + } else if (auto *Phi = dyn_cast(V)) { + for (unsigned I = 0, E = Phi->getNumIncomingValues(); I != E; ++I) + if (!collectVersions(TTI, Phi->getIncomingValue(I), Versions)) + return false; + } else { + // Unknown instruction type. Bail. + return false; + } + return true; +} + +// Bypass the IFunc Resolver of MultiVersioned functions when possible. To +// deduce whether the optimization is legal we need to compare the target +// features between caller and callee versions. The criteria for bypassing +// the resolver are the following: +// +// * If the callee's feature set is a subset of the caller's feature set, +// then the callee is a candidate for direct call. +// +// * Among such candidates the one of highest priority is the best match +// and it shall be picked, unless there is a version of the callee with +// higher priority than the best match which cannot be picked from a +// higher priority caller (directly or through the resolver). +// +// * For every higher priority callee version than the best match, there +// is a higher priority caller version whose feature set availability +// is implied by the callee's feature set. +// +static bool OptimizeNonTrivialIFuncs( + Module &M, function_ref GetTTI) { + bool Changed = false; + + // Cache containing the mask constructed from a function's target features. + DenseMap FeatureMask; + + for (GlobalIFunc &IF : M.ifuncs()) { + if (IF.isInterposable()) + continue; + + Function *Resolver = IF.getResolverFunction(); + if (!Resolver) + continue; + + if (Resolver->isInterposable()) + continue; + + TargetTransformInfo &TTI = GetTTI(*Resolver); + + // Discover the callee versions. + SmallVector Callees; + if (any_of(*Resolver, [&TTI, &Callees](BasicBlock &BB) { + if (auto *Ret = dyn_cast_or_null(BB.getTerminator())) + if (!collectVersions(TTI, Ret->getReturnValue(), Callees)) + return true; + return false; + })) + continue; + + assert(!Callees.empty() && "Expecting successful collection of versions"); + + // Cache the feature mask for each callee. + for (Function *Callee : Callees) { + auto [It, Inserted] = FeatureMask.try_emplace(Callee); + if (Inserted) + It->second = TTI.getFeatureMask(*Callee); + } + + // Sort the callee versions in decreasing priority order. + sort(Callees, [&](auto *LHS, auto *RHS) { + return FeatureMask[LHS] > FeatureMask[RHS]; + }); + + // Find the callsites and cache the feature mask for each caller. 
+ SmallVector Callers; + DenseMap> CallSites; + for (User *U : IF.users()) { + if (auto *CB = dyn_cast(U)) { + if (CB->getCalledOperand() == &IF) { + Function *Caller = CB->getFunction(); + auto [FeatIt, FeatInserted] = FeatureMask.try_emplace(Caller); + if (FeatInserted) + FeatIt->second = TTI.getFeatureMask(*Caller); + auto [CallIt, CallInserted] = CallSites.try_emplace(Caller); + if (CallInserted) + Callers.push_back(Caller); + CallIt->second.push_back(CB); + } + } + } + + // Sort the caller versions in decreasing priority order. + sort(Callers, [&](auto *LHS, auto *RHS) { + return FeatureMask[LHS] > FeatureMask[RHS]; + }); + + auto implies = [](uint64_t A, uint64_t B) { return (A & B) == B; }; + + // Index to the highest priority candidate. + unsigned I = 0; + // Now try to redirect calls starting from higher priority callers. + for (Function *Caller : Callers) { + assert(I < Callees.size() && "Found callers of equal priority"); + + Function *Callee = Callees[I]; + uint64_t CallerBits = FeatureMask[Caller]; + uint64_t CalleeBits = FeatureMask[Callee]; + + // In the case of FMV callers, we know that all higher priority callers + // than the current one did not get selected at runtime, which helps + // reason about the callees (if they have versions that mandate presence + // of the features which we already know are unavailable on this target). + if (TTI.isMultiversionedFunction(*Caller)) { + // If the feature set of the caller implies the feature set of the + // highest priority candidate then it shall be picked. In case of + // identical sets advance the candidate index one position. + if (CallerBits == CalleeBits) + ++I; + else if (!implies(CallerBits, CalleeBits)) { + // Keep advancing the candidate index as long as the caller's + // features are a subset of the current candidate's. + while (implies(CalleeBits, CallerBits)) { + if (++I == Callees.size()) + break; + CalleeBits = FeatureMask[Callees[I]]; + } + continue; + } + } else { + // We can't reason much about non-FMV callers. Just pick the highest + // priority callee if it matches, otherwise bail. + if (I > 0 || !implies(CallerBits, CalleeBits)) + continue; + } + auto &Calls = CallSites[Caller]; + for (CallBase *CS : Calls) + CS->setCalledOperand(Callee); + Changed = true; + } + if (IF.use_empty() || + all_of(IF.users(), [](User *U) { return isa(U); })) + NumIFuncsResolved++; + } + return Changed; +} + static bool optimizeGlobalsInModule(Module &M, const DataLayout &DL, function_ref GetTLI, @@ -2707,6 +2866,9 @@ optimizeGlobalsInModule(Module &M, const DataLayout &DL, // Optimize IFuncs whose callee's are statically known. LocalChange |= OptimizeStaticIFuncs(M); + // Optimize IFuncs based on the target features of the caller. + LocalChange |= OptimizeNonTrivialIFuncs(M, GetTTI); + // Remove any IFuncs that are now dead. 
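
To make the resolver-bypass criteria above concrete, here is a hypothetical source-level example (names and attributes are illustrative, not taken from the patch). With callee versions foo._Msve2, foo._Msve and foo.default, a caller that is itself the sve2 version of a multiversioned function may call foo._Msve2 directly, because any runtime where that caller was selected also satisfies the highest-priority callee; the default caller keeps going through the ifunc resolver, since sve might still be available even when sve2 is not.

    // Hypothetical AArch64 FMV example of what OptimizeNonTrivialIFuncs enables.
    __attribute__((target_clones("sve2", "sve", "default")))
    int foo(void) { return 1; }

    __attribute__((target_version("sve2"))) int bar(void) {
      return foo(); // may be redirected to call foo._Msve2 directly
    }

    __attribute__((target_version("default"))) int bar(void) {
      return foo(); // still dispatched through the ifunc resolver
    }
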
LocalChange |= DeleteDeadIFuncs(M, NotDiscardableComdats); diff --git a/llvm/lib/Transforms/InstCombine/InstCombineMulDivRem.cpp b/llvm/lib/Transforms/InstCombine/InstCombineMulDivRem.cpp index d0b2ded127ff7..df5f9833a2ff9 100644 --- a/llvm/lib/Transforms/InstCombine/InstCombineMulDivRem.cpp +++ b/llvm/lib/Transforms/InstCombine/InstCombineMulDivRem.cpp @@ -13,6 +13,7 @@ #include "InstCombineInternal.h" #include "llvm/ADT/APInt.h" +#include "llvm/ADT/SmallPtrSet.h" #include "llvm/ADT/SmallVector.h" #include "llvm/Analysis/InstructionSimplify.h" #include "llvm/Analysis/ValueTracking.h" @@ -657,6 +658,94 @@ Instruction *InstCombinerImpl::foldPowiReassoc(BinaryOperator &I) { return nullptr; } +// If we have the following pattern, +// X = 1.0/sqrt(a) +// R1 = X * X +// R2 = a/sqrt(a) +// then this method collects all the instructions that match R1 and R2. +static bool getFSqrtDivOptPattern(Instruction *Div, + SmallPtrSetImpl &R1, + SmallPtrSetImpl &R2) { + Value *A; + if (match(Div, m_FDiv(m_FPOne(), m_Sqrt(m_Value(A)))) || + match(Div, m_FDiv(m_SpecificFP(-1.0), m_Sqrt(m_Value(A))))) { + for (User *U : Div->users()) { + Instruction *I = cast(U); + if (match(I, m_FMul(m_Specific(Div), m_Specific(Div)))) + R1.insert(I); + } + + CallInst *CI = cast(Div->getOperand(1)); + for (User *U : CI->users()) { + Instruction *I = cast(U); + if (match(I, m_FDiv(m_Specific(A), m_Sqrt(m_Specific(A))))) + R2.insert(I); + } + } + return !R1.empty() && !R2.empty(); +} + +// Check legality for transforming +// x = 1.0/sqrt(a) +// r1 = x * x; +// r2 = a/sqrt(a); +// +// TO +// +// r1 = 1/a +// r2 = sqrt(a) +// x = r1 * r2 +// This transform works only when 'a' is known positive. +static bool isFSqrtDivToFMulLegal(Instruction *X, + SmallPtrSetImpl &R1, + SmallPtrSetImpl &R2) { + // Check if the required pattern for the transformation exists. + if (!getFSqrtDivOptPattern(X, R1, R2)) + return false; + + BasicBlock *BBx = X->getParent(); + BasicBlock *BBr1 = (*R1.begin())->getParent(); + BasicBlock *BBr2 = (*R2.begin())->getParent(); + + CallInst *FSqrt = cast(X->getOperand(1)); + if (!FSqrt->hasAllowReassoc() || !FSqrt->hasNoNaNs() || + !FSqrt->hasNoSignedZeros() || !FSqrt->hasNoInfs()) + return false; + + // We change x = 1/sqrt(a) to x = sqrt(a) * 1/a . This change isn't allowed + // by recip fp as it is strictly meant to transform ops of type a/b to + // a * 1/b. So, this can be considered as algebraic rewrite and reassoc flag + // has been used(rather abused)in the past for algebraic rewrites. + if (!X->hasAllowReassoc() || !X->hasAllowReciprocal() || !X->hasNoInfs()) + return false; + + // Check the constraints on X, R1 and R2 combined. + // fdiv instruction and one of the multiplications must reside in the same + // block. If not, the optimized code may execute more ops than before and + // this may hamper the performance. + if (BBx != BBr1 && BBx != BBr2) + return false; + + // Check the constraints on instructions in R1. + if (any_of(R1, [BBr1](Instruction *I) { + // When you have multiple instructions residing in R1 and R2 + // respectively, it's difficult to generate combinations of (R1,R2) and + // then check if we have the required pattern. So, for now, just be + // conservative. + return (I->getParent() != BBr1 || !I->hasAllowReassoc()); + })) + return false; + + // Check the constraints on instructions in R2. 
+ return all_of(R2, [BBr2](Instruction *I) { + // When you have multiple instructions residing in R1 and R2 + // respectively, it's difficult to generate combination of (R1,R2) and + // then check if we have the required pattern. So, for now, just be + // conservative. + return (I->getParent() == BBr2 && I->hasAllowReassoc()); + }); +} + Instruction *InstCombinerImpl::foldFMulReassoc(BinaryOperator &I) { Value *Op0 = I.getOperand(0); Value *Op1 = I.getOperand(1); @@ -1913,6 +2002,74 @@ static Instruction *foldFDivSqrtDivisor(BinaryOperator &I, return BinaryOperator::CreateFMulFMF(Op0, NewSqrt, &I); } +// Change +// X = 1/sqrt(a) +// R1 = X * X +// R2 = a * X +// +// TO +// +// FDiv = 1/a +// FSqrt = sqrt(a) +// FMul = FDiv * FSqrt +// Replace Uses Of R1 With FDiv +// Replace Uses Of R2 With FSqrt +// Replace Uses Of X With FMul +static Instruction * +convertFSqrtDivIntoFMul(CallInst *CI, Instruction *X, + const SmallPtrSetImpl &R1, + const SmallPtrSetImpl &R2, + InstCombiner::BuilderTy &B, InstCombinerImpl *IC) { + + B.SetInsertPoint(X); + + // Have an instruction that is representative of all of instructions in R1 and + // get the most common fpmath metadata and fast-math flags on it. + Value *SqrtOp = CI->getArgOperand(0); + auto *FDiv = cast( + B.CreateFDiv(ConstantFP::get(X->getType(), 1.0), SqrtOp)); + auto *R1FPMathMDNode = (*R1.begin())->getMetadata(LLVMContext::MD_fpmath); + FastMathFlags R1FMF = (*R1.begin())->getFastMathFlags(); // Common FMF + for (Instruction *I : R1) { + R1FPMathMDNode = MDNode::getMostGenericFPMath( + R1FPMathMDNode, I->getMetadata(LLVMContext::MD_fpmath)); + R1FMF &= I->getFastMathFlags(); + IC->replaceInstUsesWith(*I, FDiv); + IC->eraseInstFromFunction(*I); + } + FDiv->setMetadata(LLVMContext::MD_fpmath, R1FPMathMDNode); + FDiv->copyFastMathFlags(R1FMF); + + // Have a single sqrt call instruction that is representative of all of + // instructions in R2 and get the most common fpmath metadata and fast-math + // flags on it. 
+ auto *FSqrt = cast(CI->clone()); + FSqrt->insertBefore(CI); + auto *R2FPMathMDNode = (*R2.begin())->getMetadata(LLVMContext::MD_fpmath); + FastMathFlags R2FMF = (*R2.begin())->getFastMathFlags(); // Common FMF + for (Instruction *I : R2) { + R2FPMathMDNode = MDNode::getMostGenericFPMath( + R2FPMathMDNode, I->getMetadata(LLVMContext::MD_fpmath)); + R2FMF &= I->getFastMathFlags(); + IC->replaceInstUsesWith(*I, FSqrt); + IC->eraseInstFromFunction(*I); + } + FSqrt->setMetadata(LLVMContext::MD_fpmath, R2FPMathMDNode); + FSqrt->copyFastMathFlags(R2FMF); + + Instruction *FMul; + // If X = -1/sqrt(a) initially,then FMul = -(FDiv * FSqrt) + if (match(X, m_FDiv(m_SpecificFP(-1.0), m_Specific(CI)))) { + Value *Mul = B.CreateFMul(FDiv, FSqrt); + FMul = cast(B.CreateFNeg(Mul)); + } else + FMul = cast(B.CreateFMul(FDiv, FSqrt)); + FMul->copyMetadata(*X); + FMul->copyFastMathFlags(FastMathFlags::intersectRewrite(R1FMF, R2FMF) | + FastMathFlags::unionValue(R1FMF, R2FMF)); + return IC->replaceInstUsesWith(*X, FMul); +} + Instruction *InstCombinerImpl::visitFDiv(BinaryOperator &I) { Module *M = I.getModule(); @@ -1937,6 +2094,24 @@ Instruction *InstCombinerImpl::visitFDiv(BinaryOperator &I) { return R; Value *Op0 = I.getOperand(0), *Op1 = I.getOperand(1); + + // Convert + // x = 1.0/sqrt(a) + // r1 = x * x; + // r2 = a/sqrt(a); + // + // TO + // + // r1 = 1/a + // r2 = sqrt(a) + // x = r1 * r2 + SmallPtrSet R1, R2; + if (isFSqrtDivToFMulLegal(&I, R1, R2)) { + CallInst *CI = cast(I.getOperand(1)); + if (Instruction *D = convertFSqrtDivIntoFMul(CI, &I, R1, R2, Builder, this)) + return D; + } + if (isa(Op0)) if (SelectInst *SI = dyn_cast(Op1)) if (Instruction *R = FoldOpIntoSelect(I, SI)) diff --git a/llvm/lib/Transforms/InstCombine/InstructionCombining.cpp b/llvm/lib/Transforms/InstCombine/InstructionCombining.cpp index 2fb60ef11499c..fb21576722461 100644 --- a/llvm/lib/Transforms/InstCombine/InstructionCombining.cpp +++ b/llvm/lib/Transforms/InstCombine/InstructionCombining.cpp @@ -281,28 +281,33 @@ bool InstCombinerImpl::shouldChangeType(Type *From, Type *To) const { // Return true, if No Signed Wrap should be maintained for I. // The No Signed Wrap flag can be kept if the operation "B (I.getOpcode) C", // where both B and C should be ConstantInts, results in a constant that does -// not overflow. This function only handles the Add and Sub opcodes. For +// not overflow. This function only handles the Add/Sub/Mul opcodes. For // all other opcodes, the function conservatively returns false. static bool maintainNoSignedWrap(BinaryOperator &I, Value *B, Value *C) { auto *OBO = dyn_cast(&I); if (!OBO || !OBO->hasNoSignedWrap()) return false; - // We reason about Add and Sub Only. - Instruction::BinaryOps Opcode = I.getOpcode(); - if (Opcode != Instruction::Add && Opcode != Instruction::Sub) - return false; - const APInt *BVal, *CVal; if (!match(B, m_APInt(BVal)) || !match(C, m_APInt(CVal))) return false; + // We reason about Add/Sub/Mul Only. bool Overflow = false; - if (Opcode == Instruction::Add) + switch (I.getOpcode()) { + case Instruction::Add: (void)BVal->sadd_ov(*CVal, Overflow); - else + break; + case Instruction::Sub: (void)BVal->ssub_ov(*CVal, Overflow); - + break; + case Instruction::Mul: + (void)BVal->smul_ov(*CVal, Overflow); + break; + default: + // Conservatively return false for other opcodes. 
+ return false; + } return !Overflow; } diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp index 99f6a8860f0f4..6df11abda9e98 100644 --- a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp +++ b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp @@ -1447,9 +1447,11 @@ class LoopVectorizationCostModel { // Override forced styles if needed. // FIXME: use actual opcode/data type for analysis here. // FIXME: Investigate opportunity for fixed vector factor. - bool EVLIsLegal = UserIC <= 1 && - TTI.hasActiveVectorLength(0, nullptr, Align()) && - !EnableVPlanNativePath; + bool EVLIsLegal = + UserIC <= 1 && TTI.hasActiveVectorLength(0, nullptr, Align()) && + !EnableVPlanNativePath && + // FIXME: remove this once fixed-ordered recurrence is supported. + Legal->getFixedOrderRecurrences().empty(); if (!EVLIsLegal) { // If for some reason EVL mode is unsupported, fallback to // DataWithoutLaneMask to try to vectorize the loop with folded tail @@ -3503,10 +3505,10 @@ bool LoopVectorizationCostModel::interleavedAccessCanBeWidened( if (hasIrregularType(ScalarTy, DL)) return false; - // We currently only know how to emit interleave/deinterleave with - // Factor=2 for scalable vectors. This is purely an implementation - // limit. - if (VF.isScalable() && InterleaveFactor != 2) + // For scalable vectors, the only interleave factor currently supported + // must be power of 2 since we require the (de)interleave2 intrinsics + // instead of shufflevectors. + if (VF.isScalable() && !isPowerOf2_32(InterleaveFactor)) return false; // If the group involves a non-integral pointer, we may not be able to @@ -9433,9 +9435,9 @@ LoopVectorizationPlanner::tryToBuildVPlanWithVPRecipes(VFRange &Range) { CM.getWideningDecision(IG->getInsertPos(), VF) == LoopVectorizationCostModel::CM_Interleave); // For scalable vectors, the only interleave factor currently supported - // is 2 since we require the (de)interleave2 intrinsics instead of - // shufflevectors. - assert((!Result || !VF.isScalable() || IG->getFactor() == 2) && + // must be power of 2 since we require the (de)interleave2 intrinsics + // instead of shufflevectors. + assert((!Result || !VF.isScalable() || isPowerOf2_32(IG->getFactor())) && "Unsupported interleave factor for scalable vectors"); return Result; }; diff --git a/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp b/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp index 979a8e0768a99..5ae2f43e4950c 100644 --- a/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp +++ b/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp @@ -2863,10 +2863,21 @@ static Value *interleaveVectors(IRBuilderBase &Builder, ArrayRef Vals, // Scalable vectors cannot use arbitrary shufflevectors (only splats), so // must use intrinsics to interleave. if (VecTy->isScalableTy()) { - VectorType *WideVecTy = VectorType::getDoubleElementsVectorType(VecTy); - return Builder.CreateIntrinsic(WideVecTy, Intrinsic::vector_interleave2, - Vals, - /*FMFSource=*/nullptr, Name); + assert(isPowerOf2_32(Factor) && "Unsupported interleave factor for " + "scalable vectors, must be power of 2"); + SmallVector InterleavingValues(Vals); + // When interleaving, the number of values will be shrunk until we have the + // single final interleaved value. 
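Editor's sketch, not part of the patch: the halving loop that follows pairs value I with value Midpoint + I and feeds each pair to llvm.vector.interleave2, so a factor-4 group is built as interleave2(interleave2(V0, V2), interleave2(V1, V3)), which yields the lane order V0[0], V1[0], V2[0], V3[0], V0[1], ... The stand-alone C++ model below (the toy interleave2 helper and the sample lane values are mine, not LLVM API) reproduces that pairing order and checks the resulting layout.

// Toy model of the recursive interleave built by the loop that follows.
#include <cassert>
#include <vector>
using Vec = std::vector<int>;

// Stands in for the llvm.vector.interleave2 intrinsic on small toy vectors.
static Vec interleave2(const Vec &A, const Vec &B) {
  Vec R;
  for (size_t I = 0; I < A.size(); ++I) {
    R.push_back(A[I]);
    R.push_back(B[I]);
  }
  return R;
}

int main() {
  // Member J of the group holds values 10*J + lane, e.g. member 2 = {20, 21, 22}.
  const unsigned Factor = 4;
  std::vector<Vec> Values;
  for (unsigned J = 0; J < Factor; ++J)
    Values.push_back({int(10 * J), int(10 * J + 1), int(10 * J + 2)});

  // Same halving loop as interleaveVectors: pair entry I with entry Midpoint + I.
  for (unsigned Midpoint = Factor / 2; Midpoint > 0; Midpoint /= 2)
    for (unsigned I = 0; I < Midpoint; ++I)
      Values[I] = interleave2(Values[I], Values[Midpoint + I]);

  // The final value cycles through member0..member3 for each lane in turn.
  assert((Values[0] == Vec{0, 10, 20, 30, 1, 11, 21, 31, 2, 12, 22, 32}));
  return 0;
}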
+ auto *InterleaveTy = cast(InterleavingValues[0]->getType()); + for (unsigned Midpoint = Factor / 2; Midpoint > 0; Midpoint /= 2) { + InterleaveTy = VectorType::getDoubleElementsVectorType(InterleaveTy); + for (unsigned I = 0; I < Midpoint; ++I) + InterleavingValues[I] = Builder.CreateIntrinsic( + InterleaveTy, Intrinsic::vector_interleave2, + {InterleavingValues[I], InterleavingValues[Midpoint + I]}, + /*FMFSource=*/nullptr, Name); + } + return InterleavingValues[0]; } // Fixed length. Start by concatenating all vectors into a wide vector. @@ -2952,15 +2963,11 @@ void VPInterleaveRecipe::execute(VPTransformState &State) { &InterleaveFactor](Value *MaskForGaps) -> Value * { if (State.VF.isScalable()) { assert(!MaskForGaps && "Interleaved groups with gaps are not supported."); - assert(InterleaveFactor == 2 && + assert(isPowerOf2_32(InterleaveFactor) && "Unsupported deinterleave factor for scalable vectors"); auto *ResBlockInMask = State.get(BlockInMask); - SmallVector Ops = {ResBlockInMask, ResBlockInMask}; - auto *MaskTy = VectorType::get(State.Builder.getInt1Ty(), - State.VF.getKnownMinValue() * 2, true); - return State.Builder.CreateIntrinsic( - MaskTy, Intrinsic::vector_interleave2, Ops, - /*FMFSource=*/nullptr, "interleaved.mask"); + SmallVector Ops(InterleaveFactor, ResBlockInMask); + return interleaveVectors(State.Builder, Ops, "interleaved.mask"); } if (!BlockInMask) @@ -3000,22 +3007,48 @@ void VPInterleaveRecipe::execute(VPTransformState &State) { ArrayRef VPDefs = definedValues(); const DataLayout &DL = State.CFG.PrevBB->getDataLayout(); if (VecTy->isScalableTy()) { - assert(InterleaveFactor == 2 && + assert(isPowerOf2_32(InterleaveFactor) && "Unsupported deinterleave factor for scalable vectors"); - // Scalable vectors cannot use arbitrary shufflevectors (only splats), - // so must use intrinsics to deinterleave. - Value *DI = State.Builder.CreateIntrinsic( - Intrinsic::vector_deinterleave2, VecTy, NewLoad, - /*FMFSource=*/nullptr, "strided.vec"); - unsigned J = 0; - for (unsigned I = 0; I < InterleaveFactor; ++I) { - Instruction *Member = Group->getMember(I); + // Scalable vectors cannot use arbitrary shufflevectors (only splats), + // so must use intrinsics to deinterleave. + SmallVector DeinterleavedValues(InterleaveFactor); + DeinterleavedValues[0] = NewLoad; + // For the case of InterleaveFactor > 2, we will have to do recursive + // deinterleaving, because the current available deinterleave intrinsic + // supports only Factor of 2, otherwise it will bailout after first + // iteration. + // When deinterleaving, the number of values will double until we + // have "InterleaveFactor". 
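Editor's sketch, not part of the patch: the doubling loop that follows repeatedly applies llvm.vector.deinterleave2 and stores the two extracted halves at DeinterleavedValues[NumVectors * I + J], which after log2(Factor) rounds leaves member J of the interleave group in slot J. The stand-alone C++ model below (the toy deinterleave2 helper and the sample wide vector are mine, not LLVM API) traces that bookkeeping for a factor-4 group.

// Toy model of the recursive deinterleave performed by the loop that follows.
#include <cassert>
#include <utility>
#include <vector>
using Vec = std::vector<int>;

// Stands in for llvm.vector.deinterleave2: splits a vector into even and odd lanes.
static std::pair<Vec, Vec> deinterleave2(const Vec &V) {
  Vec Even, Odd;
  for (size_t I = 0; I < V.size(); ++I)
    (I % 2 ? Odd : Even).push_back(V[I]);
  return {Even, Odd};
}

int main() {
  const unsigned Factor = 4;
  // Wide load of a factor-4 group with three lanes: member J contributes
  // values 10*J + lane, members laid out consecutively within each lane.
  Vec Wide = {0, 10, 20, 30, 1, 11, 21, 31, 2, 12, 22, 32};

  std::vector<Vec> Deinterleaved(Factor);
  Deinterleaved[0] = Wide;
  // Same doubling loop and index bookkeeping as in the recipe.
  for (unsigned NumVectors = 1; NumVectors < Factor; NumVectors *= 2) {
    std::vector<std::pair<Vec, Vec>> Temp(NumVectors);
    for (unsigned I = 0; I < NumVectors; ++I)
      Temp[I] = deinterleave2(Deinterleaved[I]);
    for (unsigned I = 0; I < 2; ++I)
      for (unsigned J = 0; J < NumVectors; ++J)
        Deinterleaved[NumVectors * I + J] = I ? Temp[J].second : Temp[J].first;
  }

  // Slot J now holds exactly member J of the original group.
  for (unsigned J = 0; J < Factor; ++J)
    assert((Deinterleaved[J] ==
            Vec{int(10 * J), int(10 * J + 1), int(10 * J + 2)}));
  return 0;
}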
+ for (unsigned NumVectors = 1; NumVectors < InterleaveFactor; + NumVectors *= 2) { + // Deinterleave the elements within the vector + SmallVector TempDeinterleavedValues(NumVectors); + for (unsigned I = 0; I < NumVectors; ++I) { + auto *DiTy = DeinterleavedValues[I]->getType(); + TempDeinterleavedValues[I] = State.Builder.CreateIntrinsic( + Intrinsic::vector_deinterleave2, DiTy, DeinterleavedValues[I], + /*FMFSource=*/nullptr, "strided.vec"); + } + // Extract the deinterleaved values: + for (unsigned I = 0; I < 2; ++I) + for (unsigned J = 0; J < NumVectors; ++J) + DeinterleavedValues[NumVectors * I + J] = + State.Builder.CreateExtractValue(TempDeinterleavedValues[J], I); + } - if (!Member) +#ifndef NDEBUG + for (Value *Val : DeinterleavedValues) + assert(Val && "NULL Deinterleaved Value"); +#endif + for (unsigned I = 0, J = 0; I < InterleaveFactor; ++I) { + Instruction *Member = Group->getMember(I); + Value *StridedVec = DeinterleavedValues[I]; + if (!Member) { + // This value is not needed as it's not used + cast(StridedVec)->eraseFromParent(); continue; - - Value *StridedVec = State.Builder.CreateExtractValue(DI, I); + } // If this member has different type, cast the result type. if (Member->getType() != ScalarTy) { VectorType *OtherVTy = VectorType::get(Member->getType(), State.VF); diff --git a/llvm/test/Analysis/ScalarEvolution/implied-via-addition.ll b/llvm/test/Analysis/ScalarEvolution/implied-via-addition.ll new file mode 100644 index 0000000000000..7ab6221d0da53 --- /dev/null +++ b/llvm/test/Analysis/ScalarEvolution/implied-via-addition.ll @@ -0,0 +1,111 @@ +; NOTE: Assertions have been autogenerated by utils/update_analyze_test_checks.py UTC_ARGS: --version 5 +; RUN: opt < %s -disable-output -passes="print" \ +; RUN: -scalar-evolution-classify-expressions=0 2>&1 | FileCheck %s + +define void @implied1(i32 %n) { +; Prove that (n s> 1) ===> (n - 1 s> 0). +; CHECK-LABEL: 'implied1' +; CHECK-NEXT: Determining loop execution counts for: @implied1 +; CHECK-NEXT: Loop %header: backedge-taken count is (-2 + %n) +; CHECK-NEXT: Loop %header: constant max backedge-taken count is i32 2147483645 +; CHECK-NEXT: Loop %header: symbolic max backedge-taken count is (-2 + %n) +; CHECK-NEXT: Loop %header: Trip multiple is 1 +; +entry: + %cmp1 = icmp sgt i32 %n, 1 + %n.minus.1 = sub nsw i32 %n, 1 + call void @llvm.assume(i1 %cmp1) + br label %header + +header: + %indvar = phi i32 [ %indvar.next, %header ], [ 0, %entry ] + %indvar.next = add i32 %indvar, 1 + %exitcond = icmp sgt i32 %n.minus.1, %indvar.next + br i1 %exitcond, label %header, label %exit + +exit: + ret void +} + +define void @implied1_neg(i32 %n) { +; Prove that (n s> 0) =\=> (n - 1 s> 0). +; CHECK-LABEL: 'implied1_neg' +; CHECK-NEXT: Determining loop execution counts for: @implied1_neg +; CHECK-NEXT: Loop %header: backedge-taken count is (-1 + (1 smax (-1 + %n))) +; CHECK-NEXT: Loop %header: constant max backedge-taken count is i32 2147483645 +; CHECK-NEXT: Loop %header: symbolic max backedge-taken count is (-1 + (1 smax (-1 + %n))) +; CHECK-NEXT: Loop %header: Trip multiple is 1 +; +entry: + %cmp1 = icmp sgt i32 %n, 0 + %n.minus.1 = sub nsw i32 %n, 1 + call void @llvm.assume(i1 %cmp1) + br label %header + +header: + %indvar = phi i32 [ %indvar.next, %header ], [ 0, %entry ] + %indvar.next = add i32 %indvar, 1 + %exitcond = icmp sgt i32 %n.minus.1, %indvar.next + br i1 %exitcond, label %header, label %exit + +exit: + ret void +} + +define void @implied2(i32 %n) { +; Prove that (n u>= -1) ===> (n + 1 u>= 0). 
+; CHECK-LABEL: 'implied2' +; CHECK-NEXT: Determining loop execution counts for: @implied2 +; CHECK-NEXT: Loop %header: Unpredictable backedge-taken count. +; CHECK-NEXT: Loop %header: Unpredictable constant max backedge-taken count. +; CHECK-NEXT: Loop %header: Unpredictable symbolic max backedge-taken count. +; CHECK-NEXT: Loop %header: Predicated backedge-taken count is (1 + (zext i32 %n to i64)) +; CHECK-NEXT: Predicates: +; CHECK-NEXT: {1,+,1}<%header> Added Flags: +; CHECK-NEXT: Loop %header: Predicated constant max backedge-taken count is i64 4294967296 +; CHECK-NEXT: Predicates: +; CHECK-NEXT: {1,+,1}<%header> Added Flags: +; CHECK-NEXT: Loop %header: Predicated symbolic max backedge-taken count is (1 + (zext i32 %n to i64)) +; CHECK-NEXT: Predicates: +; CHECK-NEXT: {1,+,1}<%header> Added Flags: +; +entry: + %cmp1 = icmp uge i32 %n, -1 + %n.1 = add nuw i32 %n, 1 + call void @llvm.assume(i1 %cmp1) + br label %header + +header: + %indvar = phi i32 [ %indvar.next, %header ], [ 0, %entry ] + %indvar.next = add i32 %indvar, 1 + %exitcond = icmp uge i32 %n.1, %indvar.next + br i1 %exitcond, label %header, label %exit + +exit: + ret void +} + +define void @implied2_neg(i32 %n) { +; Prove that (n u>= -1) =\=> (n - 1 s>= 0). +; CHECK-LABEL: 'implied2_neg' +; CHECK-NEXT: Determining loop execution counts for: @implied2_neg +; CHECK-NEXT: Loop %header: backedge-taken count is (-1 + (1 smax %n)) +; CHECK-NEXT: Loop %header: constant max backedge-taken count is i32 2147483646 +; CHECK-NEXT: Loop %header: symbolic max backedge-taken count is (-1 + (1 smax %n)) +; CHECK-NEXT: Loop %header: Trip multiple is 1 +; +entry: + %cmp1 = icmp uge i32 %n, -1 + %n.minus.1 = sub nuw nsw i32 %n, 1 + call void @llvm.assume(i1 %cmp1) + br label %header + +header: + %indvar = phi i32 [ %indvar.next, %header ], [ 0, %entry ] + %indvar.next = add i32 %indvar, 1 + %exitcond = icmp sge i32 %n.minus.1, %indvar.next + br i1 %exitcond, label %header, label %exit + +exit: + ret void +} diff --git a/llvm/test/CodeGen/AArch64/GlobalISel/legalize-reduce-add.mir b/llvm/test/CodeGen/AArch64/GlobalISel/legalize-reduce-add.mir index 253e6ebe793ce..76fdfd0c301f6 100644 --- a/llvm/test/CodeGen/AArch64/GlobalISel/legalize-reduce-add.mir +++ b/llvm/test/CodeGen/AArch64/GlobalISel/legalize-reduce-add.mir @@ -6,15 +6,15 @@ tracksRegLiveness: true body: | bb.1: liveins: $x0 - ; CHECK-LABEL: name: add_v16s8 ; CHECK: liveins: $x0 - ; CHECK: [[COPY:%[0-9]+]]:_(p0) = COPY $x0 - ; CHECK: [[LOAD:%[0-9]+]]:_(<16 x s8>) = G_LOAD [[COPY]](p0) :: (load (<16 x s8>)) - ; CHECK: [[VECREDUCE_ADD:%[0-9]+]]:_(s8) = G_VECREDUCE_ADD [[LOAD]](<16 x s8>) - ; CHECK: [[ANYEXT:%[0-9]+]]:_(s32) = G_ANYEXT [[VECREDUCE_ADD]](s8) - ; CHECK: $w0 = COPY [[ANYEXT]](s32) - ; CHECK: RET_ReallyLR implicit $w0 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(p0) = COPY $x0 + ; CHECK-NEXT: [[LOAD:%[0-9]+]]:_(<16 x s8>) = G_LOAD [[COPY]](p0) :: (load (<16 x s8>)) + ; CHECK-NEXT: [[VECREDUCE_ADD:%[0-9]+]]:_(s8) = G_VECREDUCE_ADD [[LOAD]](<16 x s8>) + ; CHECK-NEXT: [[ANYEXT:%[0-9]+]]:_(s32) = G_ANYEXT [[VECREDUCE_ADD]](s8) + ; CHECK-NEXT: $w0 = COPY [[ANYEXT]](s32) + ; CHECK-NEXT: RET_ReallyLR implicit $w0 %0:_(p0) = COPY $x0 %1:_(<16 x s8>) = G_LOAD %0(p0) :: (load (<16 x s8>)) %2:_(s8) = G_VECREDUCE_ADD %1(<16 x s8>) @@ -29,15 +29,15 @@ tracksRegLiveness: true body: | bb.1: liveins: $x0 - ; CHECK-LABEL: name: add_v8s16 ; CHECK: liveins: $x0 - ; CHECK: [[COPY:%[0-9]+]]:_(p0) = COPY $x0 - ; CHECK: [[LOAD:%[0-9]+]]:_(<8 x s16>) = G_LOAD [[COPY]](p0) :: (load (<8 
x s16>)) - ; CHECK: [[VECREDUCE_ADD:%[0-9]+]]:_(s16) = G_VECREDUCE_ADD [[LOAD]](<8 x s16>) - ; CHECK: [[ANYEXT:%[0-9]+]]:_(s32) = G_ANYEXT [[VECREDUCE_ADD]](s16) - ; CHECK: $w0 = COPY [[ANYEXT]](s32) - ; CHECK: RET_ReallyLR implicit $w0 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(p0) = COPY $x0 + ; CHECK-NEXT: [[LOAD:%[0-9]+]]:_(<8 x s16>) = G_LOAD [[COPY]](p0) :: (load (<8 x s16>)) + ; CHECK-NEXT: [[VECREDUCE_ADD:%[0-9]+]]:_(s16) = G_VECREDUCE_ADD [[LOAD]](<8 x s16>) + ; CHECK-NEXT: [[ANYEXT:%[0-9]+]]:_(s32) = G_ANYEXT [[VECREDUCE_ADD]](s16) + ; CHECK-NEXT: $w0 = COPY [[ANYEXT]](s32) + ; CHECK-NEXT: RET_ReallyLR implicit $w0 %0:_(p0) = COPY $x0 %1:_(<8 x s16>) = G_LOAD %0(p0) :: (load (<8 x s16>)) %2:_(s16) = G_VECREDUCE_ADD %1(<8 x s16>) @@ -52,14 +52,14 @@ tracksRegLiveness: true body: | bb.1: liveins: $x0 - ; CHECK-LABEL: name: add_v4s32 ; CHECK: liveins: $x0 - ; CHECK: [[COPY:%[0-9]+]]:_(p0) = COPY $x0 - ; CHECK: [[LOAD:%[0-9]+]]:_(<4 x s32>) = G_LOAD [[COPY]](p0) :: (load (<4 x s32>)) - ; CHECK: [[VECREDUCE_ADD:%[0-9]+]]:_(s32) = G_VECREDUCE_ADD [[LOAD]](<4 x s32>) - ; CHECK: $w0 = COPY [[VECREDUCE_ADD]](s32) - ; CHECK: RET_ReallyLR implicit $w0 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(p0) = COPY $x0 + ; CHECK-NEXT: [[LOAD:%[0-9]+]]:_(<4 x s32>) = G_LOAD [[COPY]](p0) :: (load (<4 x s32>)) + ; CHECK-NEXT: [[VECREDUCE_ADD:%[0-9]+]]:_(s32) = G_VECREDUCE_ADD [[LOAD]](<4 x s32>) + ; CHECK-NEXT: $w0 = COPY [[VECREDUCE_ADD]](s32) + ; CHECK-NEXT: RET_ReallyLR implicit $w0 %0:_(p0) = COPY $x0 %1:_(<4 x s32>) = G_LOAD %0(p0) :: (load (<4 x s32>)) %2:_(s32) = G_VECREDUCE_ADD %1(<4 x s32>) @@ -73,14 +73,14 @@ tracksRegLiveness: true body: | bb.1: liveins: $x0 - ; CHECK-LABEL: name: add_v2s64 ; CHECK: liveins: $x0 - ; CHECK: [[COPY:%[0-9]+]]:_(p0) = COPY $x0 - ; CHECK: [[LOAD:%[0-9]+]]:_(<2 x s64>) = G_LOAD [[COPY]](p0) :: (load (<2 x s64>)) - ; CHECK: [[VECREDUCE_ADD:%[0-9]+]]:_(s64) = G_VECREDUCE_ADD [[LOAD]](<2 x s64>) - ; CHECK: $x0 = COPY [[VECREDUCE_ADD]](s64) - ; CHECK: RET_ReallyLR implicit $x0 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(p0) = COPY $x0 + ; CHECK-NEXT: [[LOAD:%[0-9]+]]:_(<2 x s64>) = G_LOAD [[COPY]](p0) :: (load (<2 x s64>)) + ; CHECK-NEXT: [[VECREDUCE_ADD:%[0-9]+]]:_(s64) = G_VECREDUCE_ADD [[LOAD]](<2 x s64>) + ; CHECK-NEXT: $x0 = COPY [[VECREDUCE_ADD]](s64) + ; CHECK-NEXT: RET_ReallyLR implicit $x0 %0:_(p0) = COPY $x0 %1:_(<2 x s64>) = G_LOAD %0(p0) :: (load (<2 x s64>)) %2:_(s64) = G_VECREDUCE_ADD %1(<2 x s64>) @@ -94,14 +94,14 @@ tracksRegLiveness: true body: | bb.1: liveins: $x0 - ; CHECK-LABEL: name: add_v2s32 ; CHECK: liveins: $x0 - ; CHECK: [[COPY:%[0-9]+]]:_(p0) = COPY $x0 - ; CHECK: [[LOAD:%[0-9]+]]:_(<2 x s32>) = G_LOAD [[COPY]](p0) :: (load (<2 x s32>)) - ; CHECK: [[VECREDUCE_ADD:%[0-9]+]]:_(s32) = G_VECREDUCE_ADD [[LOAD]](<2 x s32>) - ; CHECK: $w0 = COPY [[VECREDUCE_ADD]](s32) - ; CHECK: RET_ReallyLR implicit $w0 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(p0) = COPY $x0 + ; CHECK-NEXT: [[LOAD:%[0-9]+]]:_(<2 x s32>) = G_LOAD [[COPY]](p0) :: (load (<2 x s32>)) + ; CHECK-NEXT: [[VECREDUCE_ADD:%[0-9]+]]:_(s32) = G_VECREDUCE_ADD [[LOAD]](<2 x s32>) + ; CHECK-NEXT: $w0 = COPY [[VECREDUCE_ADD]](s32) + ; CHECK-NEXT: RET_ReallyLR implicit $w0 %0:_(p0) = COPY $x0 %1:_(<2 x s32>) = G_LOAD %0(p0) :: (load (<2 x s32>)) %2:_(s32) = G_VECREDUCE_ADD %1(<2 x s32>) @@ -111,24 +111,25 @@ body: | ... --- name: test_v8i64 +# This is a power-of-2 legalization, so use a tree reduction. 
alignment: 4 tracksRegLiveness: true body: | bb.1: liveins: $q0, $q1, $q2, $q3 - ; This is a power-of-2 legalization, so use a tree reduction. ; CHECK-LABEL: name: test_v8i64 ; CHECK: liveins: $q0, $q1, $q2, $q3 - ; CHECK: [[COPY:%[0-9]+]]:_(<2 x s64>) = COPY $q0 - ; CHECK: [[COPY1:%[0-9]+]]:_(<2 x s64>) = COPY $q1 - ; CHECK: [[COPY2:%[0-9]+]]:_(<2 x s64>) = COPY $q2 - ; CHECK: [[COPY3:%[0-9]+]]:_(<2 x s64>) = COPY $q3 - ; CHECK: [[ADD:%[0-9]+]]:_(<2 x s64>) = G_ADD [[COPY]], [[COPY1]] - ; CHECK: [[ADD1:%[0-9]+]]:_(<2 x s64>) = G_ADD [[COPY2]], [[COPY3]] - ; CHECK: [[ADD2:%[0-9]+]]:_(<2 x s64>) = G_ADD [[ADD]], [[ADD1]] - ; CHECK: [[VECREDUCE_ADD:%[0-9]+]]:_(s64) = G_VECREDUCE_ADD [[ADD2]](<2 x s64>) - ; CHECK: $x0 = COPY [[VECREDUCE_ADD]](s64) - ; CHECK: RET_ReallyLR implicit $x0 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(<2 x s64>) = COPY $q0 + ; CHECK-NEXT: [[COPY1:%[0-9]+]]:_(<2 x s64>) = COPY $q1 + ; CHECK-NEXT: [[COPY2:%[0-9]+]]:_(<2 x s64>) = COPY $q2 + ; CHECK-NEXT: [[COPY3:%[0-9]+]]:_(<2 x s64>) = COPY $q3 + ; CHECK-NEXT: [[ADD:%[0-9]+]]:_(<2 x s64>) = G_ADD [[COPY]], [[COPY1]] + ; CHECK-NEXT: [[ADD1:%[0-9]+]]:_(<2 x s64>) = G_ADD [[COPY2]], [[COPY3]] + ; CHECK-NEXT: [[ADD2:%[0-9]+]]:_(<2 x s64>) = G_ADD [[ADD]], [[ADD1]] + ; CHECK-NEXT: [[VECREDUCE_ADD:%[0-9]+]]:_(s64) = G_VECREDUCE_ADD [[ADD2]](<2 x s64>) + ; CHECK-NEXT: $x0 = COPY [[VECREDUCE_ADD]](s64) + ; CHECK-NEXT: RET_ReallyLR implicit $x0 %0:_(<2 x s64>) = COPY $q0 %1:_(<2 x s64>) = COPY $q1 %2:_(<2 x s64>) = COPY $q2 @@ -143,25 +144,26 @@ body: | ... --- name: test_v6i64 +# This is a non-power-of-2 legalization, generate multiple vector reductions +# and combine them with scalar ops. alignment: 4 tracksRegLiveness: true body: | bb.1: liveins: $q0, $q1, $q2, $q3 - ; This is a non-power-of-2 legalization, generate multiple vector reductions - ; and combine them with scalar ops. 
; CHECK-LABEL: name: test_v6i64 ; CHECK: liveins: $q0, $q1, $q2, $q3 - ; CHECK: [[COPY:%[0-9]+]]:_(<2 x s64>) = COPY $q0 - ; CHECK: [[COPY1:%[0-9]+]]:_(<2 x s64>) = COPY $q1 - ; CHECK: [[COPY2:%[0-9]+]]:_(<2 x s64>) = COPY $q2 - ; CHECK: [[VECREDUCE_ADD:%[0-9]+]]:_(s64) = G_VECREDUCE_ADD [[COPY]](<2 x s64>) - ; CHECK: [[VECREDUCE_ADD1:%[0-9]+]]:_(s64) = G_VECREDUCE_ADD [[COPY1]](<2 x s64>) - ; CHECK: [[VECREDUCE_ADD2:%[0-9]+]]:_(s64) = G_VECREDUCE_ADD [[COPY2]](<2 x s64>) - ; CHECK: [[ADD:%[0-9]+]]:_(s64) = G_ADD [[VECREDUCE_ADD]], [[VECREDUCE_ADD1]] - ; CHECK: [[ADD1:%[0-9]+]]:_(s64) = G_ADD [[ADD]], [[VECREDUCE_ADD2]] - ; CHECK: $x0 = COPY [[ADD1]](s64) - ; CHECK: RET_ReallyLR implicit $x0 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(<2 x s64>) = COPY $q0 + ; CHECK-NEXT: [[COPY1:%[0-9]+]]:_(<2 x s64>) = COPY $q1 + ; CHECK-NEXT: [[COPY2:%[0-9]+]]:_(<2 x s64>) = COPY $q2 + ; CHECK-NEXT: [[VECREDUCE_ADD:%[0-9]+]]:_(s64) = G_VECREDUCE_ADD [[COPY]](<2 x s64>) + ; CHECK-NEXT: [[VECREDUCE_ADD1:%[0-9]+]]:_(s64) = G_VECREDUCE_ADD [[COPY1]](<2 x s64>) + ; CHECK-NEXT: [[VECREDUCE_ADD2:%[0-9]+]]:_(s64) = G_VECREDUCE_ADD [[COPY2]](<2 x s64>) + ; CHECK-NEXT: [[ADD:%[0-9]+]]:_(s64) = G_ADD [[VECREDUCE_ADD]], [[VECREDUCE_ADD1]] + ; CHECK-NEXT: [[ADD1:%[0-9]+]]:_(s64) = G_ADD [[ADD]], [[VECREDUCE_ADD2]] + ; CHECK-NEXT: $x0 = COPY [[ADD1]](s64) + ; CHECK-NEXT: RET_ReallyLR implicit $x0 %0:_(<2 x s64>) = COPY $q0 %1:_(<2 x s64>) = COPY $q1 %2:_(<2 x s64>) = COPY $q2 diff --git a/llvm/test/CodeGen/AArch64/aarch64-addv.ll b/llvm/test/CodeGen/AArch64/aarch64-addv.ll index def4192b0e005..aba284b4e0d29 100644 --- a/llvm/test/CodeGen/AArch64/aarch64-addv.ll +++ b/llvm/test/CodeGen/AArch64/aarch64-addv.ll @@ -1,8 +1,7 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py -; RUN: llc < %s -mtriple=aarch64 -aarch64-neon-syntax=generic | FileCheck %s -check-prefixes=CHECK,SDAG -; RUN: llc < %s -global-isel=1 -global-isel-abort=2 -mtriple=aarch64 -aarch64-neon-syntax=generic 2>&1 | FileCheck %s --check-prefixes=CHECK,GISEL +; RUN: llc < %s -mtriple=aarch64 -aarch64-neon-syntax=generic | FileCheck %s -check-prefixes=CHECK,CHECK-SD +; RUN: llc < %s -mtriple=aarch64 -global-isel=1 -global-isel-abort=2 -aarch64-neon-syntax=generic 2>&1 | FileCheck %s --check-prefixes=CHECK,CHECK-GI -; Function Attrs: nounwind readnone declare i8 @llvm.vector.reduce.add.v2i8(<2 x i8>) declare i8 @llvm.vector.reduce.add.v3i8(<3 x i8>) declare i8 @llvm.vector.reduce.add.v4i8(<4 x i8>) @@ -23,14 +22,14 @@ declare i64 @llvm.vector.reduce.add.v3i64(<3 x i64>) declare i64 @llvm.vector.reduce.add.v4i64(<4 x i64>) declare i128 @llvm.vector.reduce.add.v2i128(<2 x i128>) -; GISEL: warning: Instruction selection used fallback path for addv_v2i8 -; GISEL-NEXT: warning: Instruction selection used fallback path for addv_v3i8 -; GISEL-NEXT: warning: Instruction selection used fallback path for addv_v4i8 -; GISEL-NEXT: warning: Instruction selection used fallback path for addv_v2i16 -; GISEL-NEXT: warning: Instruction selection used fallback path for addv_v3i16 -; GISEL-NEXT: warning: Instruction selection used fallback path for addv_v3i32 -; GISEL-NEXT: warning: Instruction selection used fallback path for addv_v3i64 -; GISEL-NEXT: warning: Instruction selection used fallback path for addv_v2i128 +; CHECK-GI: warning: Instruction selection used fallback path for addv_v2i8 +; CHECK-GI-NEXT: warning: Instruction selection used fallback path for addv_v3i8 +; CHECK-GI-NEXT: warning: Instruction selection used fallback 
path for addv_v4i8 +; CHECK-GI-NEXT: warning: Instruction selection used fallback path for addv_v2i16 +; CHECK-GI-NEXT: warning: Instruction selection used fallback path for addv_v3i16 +; CHECK-GI-NEXT: warning: Instruction selection used fallback path for addv_v3i32 +; CHECK-GI-NEXT: warning: Instruction selection used fallback path for addv_v3i64 +; CHECK-GI-NEXT: warning: Instruction selection used fallback path for addv_v2i128 define i8 @add_B(ptr %arr) { @@ -83,34 +82,34 @@ define i64 @add_D(ptr %arr) { define i32 @oversized_ADDV_256(ptr noalias nocapture readonly %arg1, ptr noalias nocapture readonly %arg2) { -; SDAG-LABEL: oversized_ADDV_256: -; SDAG: // %bb.0: // %entry -; SDAG-NEXT: ldr d0, [x0] -; SDAG-NEXT: ldr d1, [x1] -; SDAG-NEXT: uabdl v0.8h, v0.8b, v1.8b -; SDAG-NEXT: uaddlv s0, v0.8h -; SDAG-NEXT: fmov w0, s0 -; SDAG-NEXT: ret +; CHECK-SD-LABEL: oversized_ADDV_256: +; CHECK-SD: // %bb.0: // %entry +; CHECK-SD-NEXT: ldr d0, [x0] +; CHECK-SD-NEXT: ldr d1, [x1] +; CHECK-SD-NEXT: uabdl v0.8h, v0.8b, v1.8b +; CHECK-SD-NEXT: uaddlv s0, v0.8h +; CHECK-SD-NEXT: fmov w0, s0 +; CHECK-SD-NEXT: ret ; -; GISEL-LABEL: oversized_ADDV_256: -; GISEL: // %bb.0: // %entry -; GISEL-NEXT: ldr d1, [x0] -; GISEL-NEXT: ldr d2, [x1] -; GISEL-NEXT: movi v0.2d, #0000000000000000 -; GISEL-NEXT: usubl v1.8h, v1.8b, v2.8b -; GISEL-NEXT: sshll v2.4s, v1.4h, #0 -; GISEL-NEXT: sshll2 v3.4s, v1.8h, #0 -; GISEL-NEXT: ssubw2 v0.4s, v0.4s, v1.8h -; GISEL-NEXT: cmlt v4.4s, v2.4s, #0 -; GISEL-NEXT: cmlt v5.4s, v3.4s, #0 -; GISEL-NEXT: neg v6.4s, v2.4s -; GISEL-NEXT: mov v1.16b, v4.16b -; GISEL-NEXT: bif v0.16b, v3.16b, v5.16b -; GISEL-NEXT: bsl v1.16b, v6.16b, v2.16b -; GISEL-NEXT: add v0.4s, v1.4s, v0.4s -; GISEL-NEXT: addv s0, v0.4s -; GISEL-NEXT: fmov w0, s0 -; GISEL-NEXT: ret +; CHECK-GI-LABEL: oversized_ADDV_256: +; CHECK-GI: // %bb.0: // %entry +; CHECK-GI-NEXT: ldr d1, [x0] +; CHECK-GI-NEXT: ldr d2, [x1] +; CHECK-GI-NEXT: movi v0.2d, #0000000000000000 +; CHECK-GI-NEXT: usubl v1.8h, v1.8b, v2.8b +; CHECK-GI-NEXT: sshll v2.4s, v1.4h, #0 +; CHECK-GI-NEXT: sshll2 v3.4s, v1.8h, #0 +; CHECK-GI-NEXT: ssubw2 v0.4s, v0.4s, v1.8h +; CHECK-GI-NEXT: cmlt v4.4s, v2.4s, #0 +; CHECK-GI-NEXT: cmlt v5.4s, v3.4s, #0 +; CHECK-GI-NEXT: neg v6.4s, v2.4s +; CHECK-GI-NEXT: mov v1.16b, v4.16b +; CHECK-GI-NEXT: bif v0.16b, v3.16b, v5.16b +; CHECK-GI-NEXT: bsl v1.16b, v6.16b, v2.16b +; CHECK-GI-NEXT: add v0.4s, v1.4s, v0.4s +; CHECK-GI-NEXT: addv s0, v0.4s +; CHECK-GI-NEXT: fmov w0, s0 +; CHECK-GI-NEXT: ret entry: %0 = load <8 x i8>, ptr %arg1, align 1 %1 = zext <8 x i8> %0 to <8 x i32> @@ -127,48 +126,48 @@ entry: declare i32 @llvm.vector.reduce.add.v16i32(<16 x i32>) define i32 @oversized_ADDV_512(ptr %arr) { -; SDAG-LABEL: oversized_ADDV_512: -; SDAG: // %bb.0: -; SDAG-NEXT: ldp q0, q1, [x0, #32] -; SDAG-NEXT: ldp q2, q3, [x0] -; SDAG-NEXT: add v1.4s, v3.4s, v1.4s -; SDAG-NEXT: add v0.4s, v2.4s, v0.4s -; SDAG-NEXT: add v0.4s, v0.4s, v1.4s -; SDAG-NEXT: addv s0, v0.4s -; SDAG-NEXT: fmov w0, s0 -; SDAG-NEXT: ret +; CHECK-SD-LABEL: oversized_ADDV_512: +; CHECK-SD: // %bb.0: +; CHECK-SD-NEXT: ldp q0, q1, [x0, #32] +; CHECK-SD-NEXT: ldp q2, q3, [x0] +; CHECK-SD-NEXT: add v1.4s, v3.4s, v1.4s +; CHECK-SD-NEXT: add v0.4s, v2.4s, v0.4s +; CHECK-SD-NEXT: add v0.4s, v0.4s, v1.4s +; CHECK-SD-NEXT: addv s0, v0.4s +; CHECK-SD-NEXT: fmov w0, s0 +; CHECK-SD-NEXT: ret ; -; GISEL-LABEL: oversized_ADDV_512: -; GISEL: // %bb.0: -; GISEL-NEXT: ldp q0, q1, [x0] -; GISEL-NEXT: ldp q2, q3, [x0, #32] -; GISEL-NEXT: add v0.4s, v0.4s, v1.4s -; 
GISEL-NEXT: add v1.4s, v2.4s, v3.4s -; GISEL-NEXT: add v0.4s, v0.4s, v1.4s -; GISEL-NEXT: addv s0, v0.4s -; GISEL-NEXT: fmov w0, s0 -; GISEL-NEXT: ret +; CHECK-GI-LABEL: oversized_ADDV_512: +; CHECK-GI: // %bb.0: +; CHECK-GI-NEXT: ldp q0, q1, [x0] +; CHECK-GI-NEXT: ldp q2, q3, [x0, #32] +; CHECK-GI-NEXT: add v0.4s, v0.4s, v1.4s +; CHECK-GI-NEXT: add v1.4s, v2.4s, v3.4s +; CHECK-GI-NEXT: add v0.4s, v0.4s, v1.4s +; CHECK-GI-NEXT: addv s0, v0.4s +; CHECK-GI-NEXT: fmov w0, s0 +; CHECK-GI-NEXT: ret %bin.rdx = load <16 x i32>, ptr %arr %r = call i32 @llvm.vector.reduce.add.v16i32(<16 x i32> %bin.rdx) ret i32 %r } define i8 @addv_combine_i8(<8 x i8> %a1, <8 x i8> %a2) { -; SDAG-LABEL: addv_combine_i8: -; SDAG: // %bb.0: // %entry -; SDAG-NEXT: add v0.8b, v0.8b, v1.8b -; SDAG-NEXT: addv b0, v0.8b -; SDAG-NEXT: fmov w0, s0 -; SDAG-NEXT: ret +; CHECK-SD-LABEL: addv_combine_i8: +; CHECK-SD: // %bb.0: // %entry +; CHECK-SD-NEXT: add v0.8b, v0.8b, v1.8b +; CHECK-SD-NEXT: addv b0, v0.8b +; CHECK-SD-NEXT: fmov w0, s0 +; CHECK-SD-NEXT: ret ; -; GISEL-LABEL: addv_combine_i8: -; GISEL: // %bb.0: // %entry -; GISEL-NEXT: addv b0, v0.8b -; GISEL-NEXT: addv b1, v1.8b -; GISEL-NEXT: fmov w8, s0 -; GISEL-NEXT: fmov w9, s1 -; GISEL-NEXT: add w0, w9, w8, uxtb -; GISEL-NEXT: ret +; CHECK-GI-LABEL: addv_combine_i8: +; CHECK-GI: // %bb.0: // %entry +; CHECK-GI-NEXT: addv b0, v0.8b +; CHECK-GI-NEXT: addv b1, v1.8b +; CHECK-GI-NEXT: fmov w8, s0 +; CHECK-GI-NEXT: fmov w9, s1 +; CHECK-GI-NEXT: add w0, w9, w8, uxtb +; CHECK-GI-NEXT: ret entry: %rdx.1 = call i8 @llvm.vector.reduce.add.v8i8(<8 x i8> %a1) %rdx.2 = call i8 @llvm.vector.reduce.add.v8i8(<8 x i8> %a2) @@ -177,21 +176,21 @@ entry: } define i16 @addv_combine_i16(<4 x i16> %a1, <4 x i16> %a2) { -; SDAG-LABEL: addv_combine_i16: -; SDAG: // %bb.0: // %entry -; SDAG-NEXT: add v0.4h, v0.4h, v1.4h -; SDAG-NEXT: addv h0, v0.4h -; SDAG-NEXT: fmov w0, s0 -; SDAG-NEXT: ret +; CHECK-SD-LABEL: addv_combine_i16: +; CHECK-SD: // %bb.0: // %entry +; CHECK-SD-NEXT: add v0.4h, v0.4h, v1.4h +; CHECK-SD-NEXT: addv h0, v0.4h +; CHECK-SD-NEXT: fmov w0, s0 +; CHECK-SD-NEXT: ret ; -; GISEL-LABEL: addv_combine_i16: -; GISEL: // %bb.0: // %entry -; GISEL-NEXT: addv h0, v0.4h -; GISEL-NEXT: addv h1, v1.4h -; GISEL-NEXT: fmov w8, s0 -; GISEL-NEXT: fmov w9, s1 -; GISEL-NEXT: add w0, w9, w8, uxth -; GISEL-NEXT: ret +; CHECK-GI-LABEL: addv_combine_i16: +; CHECK-GI: // %bb.0: // %entry +; CHECK-GI-NEXT: addv h0, v0.4h +; CHECK-GI-NEXT: addv h1, v1.4h +; CHECK-GI-NEXT: fmov w8, s0 +; CHECK-GI-NEXT: fmov w9, s1 +; CHECK-GI-NEXT: add w0, w9, w8, uxth +; CHECK-GI-NEXT: ret entry: %rdx.1 = call i16 @llvm.vector.reduce.add.v4i16(<4 x i16> %a1) %rdx.2 = call i16 @llvm.vector.reduce.add.v4i16(<4 x i16> %a2) @@ -200,21 +199,21 @@ entry: } define i32 @addv_combine_i32(<4 x i32> %a1, <4 x i32> %a2) { -; SDAG-LABEL: addv_combine_i32: -; SDAG: // %bb.0: // %entry -; SDAG-NEXT: add v0.4s, v0.4s, v1.4s -; SDAG-NEXT: addv s0, v0.4s -; SDAG-NEXT: fmov w0, s0 -; SDAG-NEXT: ret +; CHECK-SD-LABEL: addv_combine_i32: +; CHECK-SD: // %bb.0: // %entry +; CHECK-SD-NEXT: add v0.4s, v0.4s, v1.4s +; CHECK-SD-NEXT: addv s0, v0.4s +; CHECK-SD-NEXT: fmov w0, s0 +; CHECK-SD-NEXT: ret ; -; GISEL-LABEL: addv_combine_i32: -; GISEL: // %bb.0: // %entry -; GISEL-NEXT: addv s0, v0.4s -; GISEL-NEXT: addv s1, v1.4s -; GISEL-NEXT: fmov w8, s0 -; GISEL-NEXT: fmov w9, s1 -; GISEL-NEXT: add w0, w8, w9 -; GISEL-NEXT: ret +; CHECK-GI-LABEL: addv_combine_i32: +; CHECK-GI: // %bb.0: // %entry +; CHECK-GI-NEXT: addv s0, v0.4s +; 
CHECK-GI-NEXT: addv s1, v1.4s +; CHECK-GI-NEXT: fmov w8, s0 +; CHECK-GI-NEXT: fmov w9, s1 +; CHECK-GI-NEXT: add w0, w8, w9 +; CHECK-GI-NEXT: ret entry: %rdx.1 = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %a1) %rdx.2 = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %a2) @@ -223,21 +222,21 @@ entry: } define i64 @addv_combine_i64(<2 x i64> %a1, <2 x i64> %a2) { -; SDAG-LABEL: addv_combine_i64: -; SDAG: // %bb.0: // %entry -; SDAG-NEXT: add v0.2d, v0.2d, v1.2d -; SDAG-NEXT: addp d0, v0.2d -; SDAG-NEXT: fmov x0, d0 -; SDAG-NEXT: ret +; CHECK-SD-LABEL: addv_combine_i64: +; CHECK-SD: // %bb.0: // %entry +; CHECK-SD-NEXT: add v0.2d, v0.2d, v1.2d +; CHECK-SD-NEXT: addp d0, v0.2d +; CHECK-SD-NEXT: fmov x0, d0 +; CHECK-SD-NEXT: ret ; -; GISEL-LABEL: addv_combine_i64: -; GISEL: // %bb.0: // %entry -; GISEL-NEXT: addp d0, v0.2d -; GISEL-NEXT: addp d1, v1.2d -; GISEL-NEXT: fmov x8, d0 -; GISEL-NEXT: fmov x9, d1 -; GISEL-NEXT: add x0, x8, x9 -; GISEL-NEXT: ret +; CHECK-GI-LABEL: addv_combine_i64: +; CHECK-GI: // %bb.0: // %entry +; CHECK-GI-NEXT: addp d0, v0.2d +; CHECK-GI-NEXT: addp d1, v1.2d +; CHECK-GI-NEXT: fmov x8, d0 +; CHECK-GI-NEXT: fmov x9, d1 +; CHECK-GI-NEXT: add x0, x8, x9 +; CHECK-GI-NEXT: ret entry: %rdx.1 = call i64 @llvm.vector.reduce.add.v2i64(<2 x i64> %a1) %rdx.2 = call i64 @llvm.vector.reduce.add.v2i64(<2 x i64> %a2) @@ -471,3 +470,6 @@ entry: ret i128 %arg1 } +;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line: +; GISEL: {{.*}} +; SDAG: {{.*}} diff --git a/llvm/test/CodeGen/AArch64/arm64-ldxr-stxr.ll b/llvm/test/CodeGen/AArch64/arm64-ldxr-stxr.ll index b498611242d46..d69d1b6eb4a2a 100644 --- a/llvm/test/CodeGen/AArch64/arm64-ldxr-stxr.ll +++ b/llvm/test/CodeGen/AArch64/arm64-ldxr-stxr.ll @@ -1,6 +1,6 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5 -; RUN: llc < %s -mtriple=arm64-linux-gnu | FileCheck %s --check-prefixes=CHECK,SDAG -; RUN: llc < %s -global-isel -global-isel-abort=1 -pass-remarks-missed=gisel* -mtriple=arm64-linux-gnu 2>&1 | FileCheck %s --check-prefixes=CHECK,GISEL,FALLBACK +; RUN: llc < %s -mtriple=arm64-linux-gnu | FileCheck %s --check-prefixes=CHECK,CHECK-SD +; RUN: llc < %s -mtriple=arm64-linux-gnu -global-isel | FileCheck %s --check-prefixes=CHECK,CHECK-GI %0 = type { i64, i64 } @@ -39,22 +39,21 @@ declare i32 @llvm.aarch64.stxp(i64, i64, ptr) nounwind @var = dso_local global i64 0, align 8 -; FALLBACK-NOT: remark:{{.*}}test_load_i8 define dso_local void @test_load_i8(ptr %addr) { -; SDAG-LABEL: test_load_i8: -; SDAG: // %bb.0: -; SDAG-NEXT: ldxrb w8, [x0] -; SDAG-NEXT: adrp x9, var -; SDAG-NEXT: str x8, [x9, :lo12:var] -; SDAG-NEXT: ret +; CHECK-SD-LABEL: test_load_i8: +; CHECK-SD: // %bb.0: +; CHECK-SD-NEXT: ldxrb w8, [x0] +; CHECK-SD-NEXT: adrp x9, var +; CHECK-SD-NEXT: str x8, [x9, :lo12:var] +; CHECK-SD-NEXT: ret ; -; GISEL-LABEL: test_load_i8: -; GISEL: // %bb.0: -; GISEL-NEXT: ldxrb w9, [x0] -; GISEL-NEXT: adrp x8, var -; GISEL-NEXT: and x9, x9, #0xff -; GISEL-NEXT: str x9, [x8, :lo12:var] -; GISEL-NEXT: ret +; CHECK-GI-LABEL: test_load_i8: +; CHECK-GI: // %bb.0: +; CHECK-GI-NEXT: ldxrb w9, [x0] +; CHECK-GI-NEXT: adrp x8, var +; CHECK-GI-NEXT: and x9, x9, #0xff +; CHECK-GI-NEXT: str x9, [x8, :lo12:var] +; CHECK-GI-NEXT: ret %val = call i64 @llvm.aarch64.ldxr.p0(ptr elementtype(i8) %addr) %shortval = trunc i64 %val to i8 @@ -63,22 +62,21 @@ define dso_local void @test_load_i8(ptr %addr) { ret void } -; FALLBACK-NOT: remark:{{.*}}test_load_i16 
define dso_local void @test_load_i16(ptr %addr) { -; SDAG-LABEL: test_load_i16: -; SDAG: // %bb.0: -; SDAG-NEXT: ldxrh w8, [x0] -; SDAG-NEXT: adrp x9, var -; SDAG-NEXT: str x8, [x9, :lo12:var] -; SDAG-NEXT: ret +; CHECK-SD-LABEL: test_load_i16: +; CHECK-SD: // %bb.0: +; CHECK-SD-NEXT: ldxrh w8, [x0] +; CHECK-SD-NEXT: adrp x9, var +; CHECK-SD-NEXT: str x8, [x9, :lo12:var] +; CHECK-SD-NEXT: ret ; -; GISEL-LABEL: test_load_i16: -; GISEL: // %bb.0: -; GISEL-NEXT: ldxrh w9, [x0] -; GISEL-NEXT: adrp x8, var -; GISEL-NEXT: and x9, x9, #0xffff -; GISEL-NEXT: str x9, [x8, :lo12:var] -; GISEL-NEXT: ret +; CHECK-GI-LABEL: test_load_i16: +; CHECK-GI: // %bb.0: +; CHECK-GI-NEXT: ldxrh w9, [x0] +; CHECK-GI-NEXT: adrp x8, var +; CHECK-GI-NEXT: and x9, x9, #0xffff +; CHECK-GI-NEXT: str x9, [x8, :lo12:var] +; CHECK-GI-NEXT: ret %val = call i64 @llvm.aarch64.ldxr.p0(ptr elementtype(i16) %addr) %shortval = trunc i64 %val to i16 @@ -87,22 +85,21 @@ define dso_local void @test_load_i16(ptr %addr) { ret void } -; FALLBACK-NOT: remark:{{.*}}test_load_i32 define dso_local void @test_load_i32(ptr %addr) { -; SDAG-LABEL: test_load_i32: -; SDAG: // %bb.0: -; SDAG-NEXT: ldxr w8, [x0] -; SDAG-NEXT: adrp x9, var -; SDAG-NEXT: str x8, [x9, :lo12:var] -; SDAG-NEXT: ret +; CHECK-SD-LABEL: test_load_i32: +; CHECK-SD: // %bb.0: +; CHECK-SD-NEXT: ldxr w8, [x0] +; CHECK-SD-NEXT: adrp x9, var +; CHECK-SD-NEXT: str x8, [x9, :lo12:var] +; CHECK-SD-NEXT: ret ; -; GISEL-LABEL: test_load_i32: -; GISEL: // %bb.0: -; GISEL-NEXT: ldxr w9, [x0] -; GISEL-NEXT: adrp x8, var -; GISEL-NEXT: mov w9, w9 -; GISEL-NEXT: str x9, [x8, :lo12:var] -; GISEL-NEXT: ret +; CHECK-GI-LABEL: test_load_i32: +; CHECK-GI: // %bb.0: +; CHECK-GI-NEXT: ldxr w9, [x0] +; CHECK-GI-NEXT: adrp x8, var +; CHECK-GI-NEXT: mov w9, w9 +; CHECK-GI-NEXT: str x9, [x8, :lo12:var] +; CHECK-GI-NEXT: ret %val = call i64 @llvm.aarch64.ldxr.p0(ptr elementtype(i32) %addr) %shortval = trunc i64 %val to i32 @@ -111,7 +108,6 @@ define dso_local void @test_load_i32(ptr %addr) { ret void } -; FALLBACK-NOT: remark:{{.*}}test_load_i64 define dso_local void @test_load_i64(ptr %addr) { ; CHECK-LABEL: test_load_i64: ; CHECK: // %bb.0: @@ -128,7 +124,6 @@ define dso_local void @test_load_i64(ptr %addr) { declare i64 @llvm.aarch64.ldxr.p0(ptr) nounwind -; FALLBACK-NOT: remark:{{.*}}test_store_i8 define dso_local i32 @test_store_i8(i32, i8 %val, ptr %addr) { ; CHECK-LABEL: test_store_i8: ; CHECK: // %bb.0: @@ -140,7 +135,6 @@ define dso_local i32 @test_store_i8(i32, i8 %val, ptr %addr) { ret i32 %res } -; FALLBACK-NOT: remark:{{.*}}test_store_i16 define dso_local i32 @test_store_i16(i32, i16 %val, ptr %addr) { ; CHECK-LABEL: test_store_i16: ; CHECK: // %bb.0: @@ -152,7 +146,6 @@ define dso_local i32 @test_store_i16(i32, i16 %val, ptr %addr) { ret i32 %res } -; FALLBACK-NOT: remark:{{.*}}test_store_i32 define dso_local i32 @test_store_i32(i32, i32 %val, ptr %addr) { ; CHECK-LABEL: test_store_i32: ; CHECK: // %bb.0: @@ -163,7 +156,6 @@ define dso_local i32 @test_store_i32(i32, i32 %val, ptr %addr) { ret i32 %res } -; FALLBACK-NOT: remark:{{.*}}test_store_i64 define dso_local i32 @test_store_i64(i32, i64 %val, ptr %addr) { ; CHECK-LABEL: test_store_i64: ; CHECK: // %bb.0: @@ -219,22 +211,21 @@ entry: declare %0 @llvm.aarch64.ldaxp(ptr) nounwind declare i32 @llvm.aarch64.stlxp(i64, i64, ptr) nounwind -; FALLBACK-NOT: remark:{{.*}}test_load_acquire_i8 define dso_local void @test_load_acquire_i8(ptr %addr) { -; SDAG-LABEL: test_load_acquire_i8: -; SDAG: // %bb.0: -; SDAG-NEXT: ldaxrb w8, [x0] -; 
SDAG-NEXT: adrp x9, var -; SDAG-NEXT: str x8, [x9, :lo12:var] -; SDAG-NEXT: ret +; CHECK-SD-LABEL: test_load_acquire_i8: +; CHECK-SD: // %bb.0: +; CHECK-SD-NEXT: ldaxrb w8, [x0] +; CHECK-SD-NEXT: adrp x9, var +; CHECK-SD-NEXT: str x8, [x9, :lo12:var] +; CHECK-SD-NEXT: ret ; -; GISEL-LABEL: test_load_acquire_i8: -; GISEL: // %bb.0: -; GISEL-NEXT: ldaxrb w9, [x0] -; GISEL-NEXT: adrp x8, var -; GISEL-NEXT: and x9, x9, #0xff -; GISEL-NEXT: str x9, [x8, :lo12:var] -; GISEL-NEXT: ret +; CHECK-GI-LABEL: test_load_acquire_i8: +; CHECK-GI: // %bb.0: +; CHECK-GI-NEXT: ldaxrb w9, [x0] +; CHECK-GI-NEXT: adrp x8, var +; CHECK-GI-NEXT: and x9, x9, #0xff +; CHECK-GI-NEXT: str x9, [x8, :lo12:var] +; CHECK-GI-NEXT: ret %val = call i64 @llvm.aarch64.ldaxr.p0(ptr elementtype(i8) %addr) %shortval = trunc i64 %val to i8 @@ -243,22 +234,21 @@ define dso_local void @test_load_acquire_i8(ptr %addr) { ret void } -; FALLBACK-NOT: remark:{{.*}}test_load_acquire_i16 define dso_local void @test_load_acquire_i16(ptr %addr) { -; SDAG-LABEL: test_load_acquire_i16: -; SDAG: // %bb.0: -; SDAG-NEXT: ldaxrh w8, [x0] -; SDAG-NEXT: adrp x9, var -; SDAG-NEXT: str x8, [x9, :lo12:var] -; SDAG-NEXT: ret +; CHECK-SD-LABEL: test_load_acquire_i16: +; CHECK-SD: // %bb.0: +; CHECK-SD-NEXT: ldaxrh w8, [x0] +; CHECK-SD-NEXT: adrp x9, var +; CHECK-SD-NEXT: str x8, [x9, :lo12:var] +; CHECK-SD-NEXT: ret ; -; GISEL-LABEL: test_load_acquire_i16: -; GISEL: // %bb.0: -; GISEL-NEXT: ldaxrh w9, [x0] -; GISEL-NEXT: adrp x8, var -; GISEL-NEXT: and x9, x9, #0xffff -; GISEL-NEXT: str x9, [x8, :lo12:var] -; GISEL-NEXT: ret +; CHECK-GI-LABEL: test_load_acquire_i16: +; CHECK-GI: // %bb.0: +; CHECK-GI-NEXT: ldaxrh w9, [x0] +; CHECK-GI-NEXT: adrp x8, var +; CHECK-GI-NEXT: and x9, x9, #0xffff +; CHECK-GI-NEXT: str x9, [x8, :lo12:var] +; CHECK-GI-NEXT: ret %val = call i64 @llvm.aarch64.ldaxr.p0(ptr elementtype(i16) %addr) %shortval = trunc i64 %val to i16 @@ -267,22 +257,21 @@ define dso_local void @test_load_acquire_i16(ptr %addr) { ret void } -; FALLBACK-NOT: remark:{{.*}}test_load_acquire_i32 define dso_local void @test_load_acquire_i32(ptr %addr) { -; SDAG-LABEL: test_load_acquire_i32: -; SDAG: // %bb.0: -; SDAG-NEXT: ldaxr w8, [x0] -; SDAG-NEXT: adrp x9, var -; SDAG-NEXT: str x8, [x9, :lo12:var] -; SDAG-NEXT: ret +; CHECK-SD-LABEL: test_load_acquire_i32: +; CHECK-SD: // %bb.0: +; CHECK-SD-NEXT: ldaxr w8, [x0] +; CHECK-SD-NEXT: adrp x9, var +; CHECK-SD-NEXT: str x8, [x9, :lo12:var] +; CHECK-SD-NEXT: ret ; -; GISEL-LABEL: test_load_acquire_i32: -; GISEL: // %bb.0: -; GISEL-NEXT: ldaxr w9, [x0] -; GISEL-NEXT: adrp x8, var -; GISEL-NEXT: mov w9, w9 -; GISEL-NEXT: str x9, [x8, :lo12:var] -; GISEL-NEXT: ret +; CHECK-GI-LABEL: test_load_acquire_i32: +; CHECK-GI: // %bb.0: +; CHECK-GI-NEXT: ldaxr w9, [x0] +; CHECK-GI-NEXT: adrp x8, var +; CHECK-GI-NEXT: mov w9, w9 +; CHECK-GI-NEXT: str x9, [x8, :lo12:var] +; CHECK-GI-NEXT: ret %val = call i64 @llvm.aarch64.ldaxr.p0(ptr elementtype(i32) %addr) %shortval = trunc i64 %val to i32 @@ -291,7 +280,6 @@ define dso_local void @test_load_acquire_i32(ptr %addr) { ret void } -; FALLBACK-NOT: remark:{{.*}}test_load_acquire_i64 define dso_local void @test_load_acquire_i64(ptr %addr) { ; CHECK-LABEL: test_load_acquire_i64: ; CHECK: // %bb.0: @@ -308,7 +296,6 @@ define dso_local void @test_load_acquire_i64(ptr %addr) { declare i64 @llvm.aarch64.ldaxr.p0(ptr) nounwind -; FALLBACK-NOT: remark:{{.*}}test_store_release_i8 define dso_local i32 @test_store_release_i8(i32, i8 %val, ptr %addr) { ; CHECK-LABEL: test_store_release_i8: 
; CHECK: // %bb.0: @@ -320,7 +307,6 @@ define dso_local i32 @test_store_release_i8(i32, i8 %val, ptr %addr) { ret i32 %res } -; FALLBACK-NOT: remark:{{.*}}test_store_release_i16 define dso_local i32 @test_store_release_i16(i32, i16 %val, ptr %addr) { ; CHECK-LABEL: test_store_release_i16: ; CHECK: // %bb.0: @@ -332,7 +318,6 @@ define dso_local i32 @test_store_release_i16(i32, i16 %val, ptr %addr) { ret i32 %res } -; FALLBACK-NOT: remark:{{.*}}test_store_release_i32 define dso_local i32 @test_store_release_i32(i32, i32 %val, ptr %addr) { ; CHECK-LABEL: test_store_release_i32: ; CHECK: // %bb.0: @@ -343,7 +328,6 @@ define dso_local i32 @test_store_release_i32(i32, i32 %val, ptr %addr) { ret i32 %res } -; FALLBACK-NOT: remark:{{.*}}test_store_release_i64 define dso_local i32 @test_store_release_i64(i32, i64 %val, ptr %addr) { ; CHECK-LABEL: test_store_release_i64: ; CHECK: // %bb.0: @@ -378,5 +362,3 @@ define dso_local i32 @test_stxp_undef_inline_asm(ptr %p, i64 %x) nounwind { } declare i32 @llvm.aarch64.stlxr.p0(i64, ptr) nounwind -;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line: -; FALLBACK: {{.*}} diff --git a/llvm/test/CodeGen/AArch64/half-precision-signof-no-assert.ll b/llvm/test/CodeGen/AArch64/half-precision-signof-no-assert.ll new file mode 100644 index 0000000000000..92e15e78d8c41 --- /dev/null +++ b/llvm/test/CodeGen/AArch64/half-precision-signof-no-assert.ll @@ -0,0 +1,48 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5 +; RUN: llc --mtriple=aarch64-unknown-linux-gnu < %s | FileCheck %s + +; Check that the following does not crash +; See https://github.com/llvm/llvm-project/issues/123029 for details + +define ptr @fn(ptr %in, ptr %out) { +; CHECK-LABEL: fn: +; CHECK: // %bb.0: // %fn +; CHECK-NEXT: ldr d1, [x0] +; CHECK-NEXT: movi v0.4h, #60, lsl #8 +; CHECK-NEXT: adrp x8, .LCPI0_0 +; CHECK-NEXT: fcvtl v1.4s, v1.4h +; CHECK-NEXT: fcmgt v2.4s, v1.4s, #0.0 +; CHECK-NEXT: fcmlt v1.4s, v1.4s, #0.0 +; CHECK-NEXT: orr v1.16b, v1.16b, v2.16b +; CHECK-NEXT: ldr h2, [x8, :lo12:.LCPI0_0] +; CHECK-NEXT: xtn v1.4h, v1.4s +; CHECK-NEXT: and v0.8b, v1.8b, v0.8b +; CHECK-NEXT: movi d1, #0000000000000000 +; CHECK-NEXT: str d0, [x1] +; CHECK-NEXT: ldr h0, [x0, #8] +; CHECK-NEXT: mov x0, xzr +; CHECK-NEXT: fcvt s0, h0 +; CHECK-NEXT: fcmp s0, #0.0 +; CHECK-NEXT: fcsel s1, s2, s1, mi +; CHECK-NEXT: fcsel s1, s2, s1, gt +; CHECK-NEXT: mvni v2.4s, #128, lsl #24 +; CHECK-NEXT: fcvt s1, h1 +; CHECK-NEXT: bit v0.16b, v1.16b, v2.16b +; CHECK-NEXT: fcvt h0, s0 +; CHECK-NEXT: str h0, [x1, #8] +; CHECK-NEXT: ret +fn: + %1 = load <4 x half>, ptr %in + %2 = fcmp one <4 x half> %1, zeroinitializer + %3 = uitofp <4 x i1> %2 to <4 x half> + store <4 x half> %3, ptr %out + + %4 = getelementptr inbounds nuw i8, ptr %in, i64 8 + %5 = load half, ptr %4 + %6 = fcmp one half %5, 0xH0000 + %7 = uitofp i1 %6 to half + %8 = call half @llvm.copysign.f16(half %7, half %5) + %9 = getelementptr inbounds nuw i8, ptr %out, i64 8 + store half %8, ptr %9 + ret ptr null +} diff --git a/llvm/test/CodeGen/AArch64/vec-combine-compare-to-bitmask.ll b/llvm/test/CodeGen/AArch64/vec-combine-compare-to-bitmask.ll index 7f3c1fdc93380..c9fe258f11556 100644 --- a/llvm/test/CodeGen/AArch64/vec-combine-compare-to-bitmask.ll +++ b/llvm/test/CodeGen/AArch64/vec-combine-compare-to-bitmask.ll @@ -1,86 +1,87 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 2 -; RUN: llc 
-mtriple=aarch64-apple-darwin -mattr=+neon -aarch64-enable-collect-loh=false -verify-machineinstrs < %s | FileCheck %s --check-prefixes=CHECK,SDAG -; RUN: llc -mtriple=aarch64-apple-darwin -mattr=+neon -aarch64-enable-collect-loh=false -global-isel -global-isel-abort=2 -verify-machineinstrs < %s 2>&1 | FileCheck %s --check-prefixes=CHECK,GISEL +; RUN: llc -mtriple=aarch64-apple-darwin -mattr=+neon -aarch64-enable-collect-loh=false -verify-machineinstrs < %s | FileCheck %s --check-prefixes=CHECK,CHECK-SD +; RUN: llc -mtriple=aarch64-apple-darwin -mattr=+neon -aarch64-enable-collect-loh=false -global-isel -global-isel-abort=2 -verify-machineinstrs < %s 2>&1 | FileCheck %s --check-prefixes=CHECK,CHECK-GI ; Basic tests from input vector to bitmask ; IR generated from clang for: ; __builtin_convertvector + reinterpret_cast -; GISEL: warning: Instruction selection used fallback path for clang_builtins_undef_concat_convert_to_bitmask4 -; GISEL-NEXT: warning: Instruction selection used fallback path for convert_to_bitmask_2xi32 -; GISEL-NEXT: warning: Instruction selection used fallback path for convert_to_bitmask_8xi2 -; GISEL-NEXT: warning: Instruction selection used fallback path for no_direct_convert_for_bad_concat +; CHECK-GI: warning: Instruction selection used fallback path for convert_to_bitmask2 +; CHECK-GI-NEXT: warning: Instruction selection used fallback path for clang_builtins_undef_concat_convert_to_bitmask4 +; CHECK-GI-NEXT: warning: Instruction selection used fallback path for convert_to_bitmask_2xi32 +; CHECK-GI-NEXT: warning: Instruction selection used fallback path for convert_to_bitmask_8xi2 +; CHECK-GI-NEXT: warning: Instruction selection used fallback path for no_direct_convert_for_bad_concat define i16 @convert_to_bitmask16(<16 x i8> %vec) { ; Bits used in mask -; SDAG-LABEL: convert_to_bitmask16: -; SDAG: ; %bb.0: -; SDAG-NEXT: adrp x8, lCPI0_0@PAGE -; SDAG-NEXT: cmeq.16b v0, v0, #0 -; SDAG-NEXT: ldr q1, [x8, lCPI0_0@PAGEOFF] -; SDAG-NEXT: bic.16b v0, v1, v0 -; SDAG-NEXT: ext.16b v1, v0, v0, #8 -; SDAG-NEXT: zip1.16b v0, v0, v1 -; SDAG-NEXT: addv.8h h0, v0 -; SDAG-NEXT: fmov w0, s0 -; SDAG-NEXT: ret +; CHECK-SD-LABEL: convert_to_bitmask16: +; CHECK-SD: ; %bb.0: +; CHECK-SD-NEXT: adrp x8, lCPI0_0@PAGE +; CHECK-SD-NEXT: cmeq.16b v0, v0, #0 +; CHECK-SD-NEXT: ldr q1, [x8, lCPI0_0@PAGEOFF] +; CHECK-SD-NEXT: bic.16b v0, v1, v0 +; CHECK-SD-NEXT: ext.16b v1, v0, v0, #8 +; CHECK-SD-NEXT: zip1.16b v0, v0, v1 +; CHECK-SD-NEXT: addv.8h h0, v0 +; CHECK-SD-NEXT: fmov w0, s0 +; CHECK-SD-NEXT: ret ; -; GISEL-LABEL: convert_to_bitmask16: -; GISEL: ; %bb.0: -; GISEL-NEXT: sub sp, sp, #16 -; GISEL-NEXT: .cfi_def_cfa_offset 16 -; GISEL-NEXT: cmeq.16b v0, v0, #0 -; GISEL-NEXT: mvn.16b v0, v0 -; GISEL-NEXT: umov.b w8, v0[1] -; GISEL-NEXT: umov.b w9, v0[0] -; GISEL-NEXT: umov.b w10, v0[2] -; GISEL-NEXT: umov.b w11, v0[3] -; GISEL-NEXT: and w8, w8, #0x1 -; GISEL-NEXT: bfi w9, w8, #1, #31 -; GISEL-NEXT: and w8, w10, #0x1 -; GISEL-NEXT: umov.b w10, v0[4] -; GISEL-NEXT: orr w8, w9, w8, lsl #2 -; GISEL-NEXT: and w9, w11, #0x1 -; GISEL-NEXT: umov.b w11, v0[5] -; GISEL-NEXT: orr w8, w8, w9, lsl #3 -; GISEL-NEXT: and w9, w10, #0x1 -; GISEL-NEXT: umov.b w10, v0[6] -; GISEL-NEXT: orr w8, w8, w9, lsl #4 -; GISEL-NEXT: and w9, w11, #0x1 -; GISEL-NEXT: umov.b w11, v0[7] -; GISEL-NEXT: orr w8, w8, w9, lsl #5 -; GISEL-NEXT: and w9, w10, #0x1 -; GISEL-NEXT: umov.b w10, v0[8] -; GISEL-NEXT: orr w8, w8, w9, lsl #6 -; GISEL-NEXT: and w9, w11, #0x1 -; GISEL-NEXT: umov.b w11, v0[9] -; GISEL-NEXT: orr w8, w8, w9, lsl 
#7 -; GISEL-NEXT: and w9, w10, #0x1 -; GISEL-NEXT: umov.b w10, v0[10] -; GISEL-NEXT: orr w8, w8, w9, lsl #8 -; GISEL-NEXT: and w9, w11, #0x1 -; GISEL-NEXT: umov.b w11, v0[11] -; GISEL-NEXT: orr w8, w8, w9, lsl #9 -; GISEL-NEXT: and w9, w10, #0x1 -; GISEL-NEXT: umov.b w10, v0[12] -; GISEL-NEXT: orr w8, w8, w9, lsl #10 -; GISEL-NEXT: and w9, w11, #0x1 -; GISEL-NEXT: umov.b w11, v0[13] -; GISEL-NEXT: orr w8, w8, w9, lsl #11 -; GISEL-NEXT: and w9, w10, #0x1 -; GISEL-NEXT: umov.b w10, v0[14] -; GISEL-NEXT: orr w8, w8, w9, lsl #12 -; GISEL-NEXT: and w9, w11, #0x1 -; GISEL-NEXT: umov.b w11, v0[15] -; GISEL-NEXT: orr w8, w8, w9, lsl #13 -; GISEL-NEXT: and w9, w10, #0x1 -; GISEL-NEXT: orr w8, w8, w9, lsl #14 -; GISEL-NEXT: and w9, w11, #0x1 -; GISEL-NEXT: orr w8, w8, w9, lsl #15 -; GISEL-NEXT: strh w8, [sp, #14] -; GISEL-NEXT: and w0, w8, #0xffff -; GISEL-NEXT: add sp, sp, #16 -; GISEL-NEXT: ret +; CHECK-GI-LABEL: convert_to_bitmask16: +; CHECK-GI: ; %bb.0: +; CHECK-GI-NEXT: sub sp, sp, #16 +; CHECK-GI-NEXT: .cfi_def_cfa_offset 16 +; CHECK-GI-NEXT: cmeq.16b v0, v0, #0 +; CHECK-GI-NEXT: mvn.16b v0, v0 +; CHECK-GI-NEXT: umov.b w8, v0[1] +; CHECK-GI-NEXT: umov.b w9, v0[0] +; CHECK-GI-NEXT: umov.b w10, v0[2] +; CHECK-GI-NEXT: umov.b w11, v0[3] +; CHECK-GI-NEXT: and w8, w8, #0x1 +; CHECK-GI-NEXT: bfi w9, w8, #1, #31 +; CHECK-GI-NEXT: and w8, w10, #0x1 +; CHECK-GI-NEXT: umov.b w10, v0[4] +; CHECK-GI-NEXT: orr w8, w9, w8, lsl #2 +; CHECK-GI-NEXT: and w9, w11, #0x1 +; CHECK-GI-NEXT: umov.b w11, v0[5] +; CHECK-GI-NEXT: orr w8, w8, w9, lsl #3 +; CHECK-GI-NEXT: and w9, w10, #0x1 +; CHECK-GI-NEXT: umov.b w10, v0[6] +; CHECK-GI-NEXT: orr w8, w8, w9, lsl #4 +; CHECK-GI-NEXT: and w9, w11, #0x1 +; CHECK-GI-NEXT: umov.b w11, v0[7] +; CHECK-GI-NEXT: orr w8, w8, w9, lsl #5 +; CHECK-GI-NEXT: and w9, w10, #0x1 +; CHECK-GI-NEXT: umov.b w10, v0[8] +; CHECK-GI-NEXT: orr w8, w8, w9, lsl #6 +; CHECK-GI-NEXT: and w9, w11, #0x1 +; CHECK-GI-NEXT: umov.b w11, v0[9] +; CHECK-GI-NEXT: orr w8, w8, w9, lsl #7 +; CHECK-GI-NEXT: and w9, w10, #0x1 +; CHECK-GI-NEXT: umov.b w10, v0[10] +; CHECK-GI-NEXT: orr w8, w8, w9, lsl #8 +; CHECK-GI-NEXT: and w9, w11, #0x1 +; CHECK-GI-NEXT: umov.b w11, v0[11] +; CHECK-GI-NEXT: orr w8, w8, w9, lsl #9 +; CHECK-GI-NEXT: and w9, w10, #0x1 +; CHECK-GI-NEXT: umov.b w10, v0[12] +; CHECK-GI-NEXT: orr w8, w8, w9, lsl #10 +; CHECK-GI-NEXT: and w9, w11, #0x1 +; CHECK-GI-NEXT: umov.b w11, v0[13] +; CHECK-GI-NEXT: orr w8, w8, w9, lsl #11 +; CHECK-GI-NEXT: and w9, w10, #0x1 +; CHECK-GI-NEXT: umov.b w10, v0[14] +; CHECK-GI-NEXT: orr w8, w8, w9, lsl #12 +; CHECK-GI-NEXT: and w9, w11, #0x1 +; CHECK-GI-NEXT: umov.b w11, v0[15] +; CHECK-GI-NEXT: orr w8, w8, w9, lsl #13 +; CHECK-GI-NEXT: and w9, w10, #0x1 +; CHECK-GI-NEXT: orr w8, w8, w9, lsl #14 +; CHECK-GI-NEXT: and w9, w11, #0x1 +; CHECK-GI-NEXT: orr w8, w8, w9, lsl #15 +; CHECK-GI-NEXT: strh w8, [sp, #14] +; CHECK-GI-NEXT: and w0, w8, #0xffff +; CHECK-GI-NEXT: add sp, sp, #16 +; CHECK-GI-NEXT: ret ; Actual conversion @@ -90,50 +91,50 @@ define i16 @convert_to_bitmask16(<16 x i8> %vec) { } define i16 @convert_to_bitmask8(<8 x i16> %vec) { -; SDAG-LABEL: convert_to_bitmask8: -; SDAG: ; %bb.0: -; SDAG-NEXT: adrp x8, lCPI1_0@PAGE -; SDAG-NEXT: cmeq.8h v0, v0, #0 -; SDAG-NEXT: ldr q1, [x8, lCPI1_0@PAGEOFF] -; SDAG-NEXT: bic.16b v0, v1, v0 -; SDAG-NEXT: addv.8h h0, v0 -; SDAG-NEXT: fmov w8, s0 -; SDAG-NEXT: and w0, w8, #0xff -; SDAG-NEXT: ret +; CHECK-SD-LABEL: convert_to_bitmask8: +; CHECK-SD: ; %bb.0: +; CHECK-SD-NEXT: adrp x8, lCPI1_0@PAGE +; CHECK-SD-NEXT: cmeq.8h 
v0, v0, #0 +; CHECK-SD-NEXT: ldr q1, [x8, lCPI1_0@PAGEOFF] +; CHECK-SD-NEXT: bic.16b v0, v1, v0 +; CHECK-SD-NEXT: addv.8h h0, v0 +; CHECK-SD-NEXT: fmov w8, s0 +; CHECK-SD-NEXT: and w0, w8, #0xff +; CHECK-SD-NEXT: ret ; -; GISEL-LABEL: convert_to_bitmask8: -; GISEL: ; %bb.0: -; GISEL-NEXT: sub sp, sp, #16 -; GISEL-NEXT: .cfi_def_cfa_offset 16 -; GISEL-NEXT: cmeq.8h v0, v0, #0 -; GISEL-NEXT: mvn.16b v0, v0 -; GISEL-NEXT: xtn.8b v0, v0 -; GISEL-NEXT: umov.b w8, v0[1] -; GISEL-NEXT: umov.b w9, v0[0] -; GISEL-NEXT: umov.b w10, v0[2] -; GISEL-NEXT: umov.b w11, v0[3] -; GISEL-NEXT: and w8, w8, #0x1 -; GISEL-NEXT: bfi w9, w8, #1, #31 -; GISEL-NEXT: and w8, w10, #0x1 -; GISEL-NEXT: umov.b w10, v0[4] -; GISEL-NEXT: orr w8, w9, w8, lsl #2 -; GISEL-NEXT: and w9, w11, #0x1 -; GISEL-NEXT: umov.b w11, v0[5] -; GISEL-NEXT: orr w8, w8, w9, lsl #3 -; GISEL-NEXT: and w9, w10, #0x1 -; GISEL-NEXT: umov.b w10, v0[6] -; GISEL-NEXT: orr w8, w8, w9, lsl #4 -; GISEL-NEXT: and w9, w11, #0x1 -; GISEL-NEXT: umov.b w11, v0[7] -; GISEL-NEXT: orr w8, w8, w9, lsl #5 -; GISEL-NEXT: and w9, w10, #0x1 -; GISEL-NEXT: orr w8, w8, w9, lsl #6 -; GISEL-NEXT: and w9, w11, #0x1 -; GISEL-NEXT: orr w8, w8, w9, lsl #7 -; GISEL-NEXT: strb w8, [sp, #15] -; GISEL-NEXT: and w0, w8, #0xff -; GISEL-NEXT: add sp, sp, #16 -; GISEL-NEXT: ret +; CHECK-GI-LABEL: convert_to_bitmask8: +; CHECK-GI: ; %bb.0: +; CHECK-GI-NEXT: sub sp, sp, #16 +; CHECK-GI-NEXT: .cfi_def_cfa_offset 16 +; CHECK-GI-NEXT: cmeq.8h v0, v0, #0 +; CHECK-GI-NEXT: mvn.16b v0, v0 +; CHECK-GI-NEXT: xtn.8b v0, v0 +; CHECK-GI-NEXT: umov.b w8, v0[1] +; CHECK-GI-NEXT: umov.b w9, v0[0] +; CHECK-GI-NEXT: umov.b w10, v0[2] +; CHECK-GI-NEXT: umov.b w11, v0[3] +; CHECK-GI-NEXT: and w8, w8, #0x1 +; CHECK-GI-NEXT: bfi w9, w8, #1, #31 +; CHECK-GI-NEXT: and w8, w10, #0x1 +; CHECK-GI-NEXT: umov.b w10, v0[4] +; CHECK-GI-NEXT: orr w8, w9, w8, lsl #2 +; CHECK-GI-NEXT: and w9, w11, #0x1 +; CHECK-GI-NEXT: umov.b w11, v0[5] +; CHECK-GI-NEXT: orr w8, w8, w9, lsl #3 +; CHECK-GI-NEXT: and w9, w10, #0x1 +; CHECK-GI-NEXT: umov.b w10, v0[6] +; CHECK-GI-NEXT: orr w8, w8, w9, lsl #4 +; CHECK-GI-NEXT: and w9, w11, #0x1 +; CHECK-GI-NEXT: umov.b w11, v0[7] +; CHECK-GI-NEXT: orr w8, w8, w9, lsl #5 +; CHECK-GI-NEXT: and w9, w10, #0x1 +; CHECK-GI-NEXT: orr w8, w8, w9, lsl #6 +; CHECK-GI-NEXT: and w9, w11, #0x1 +; CHECK-GI-NEXT: orr w8, w8, w9, lsl #7 +; CHECK-GI-NEXT: strb w8, [sp, #15] +; CHECK-GI-NEXT: and w0, w8, #0xff +; CHECK-GI-NEXT: add sp, sp, #16 +; CHECK-GI-NEXT: ret %cmp_result = icmp ne <8 x i16> %vec, zeroinitializer @@ -143,36 +144,36 @@ define i16 @convert_to_bitmask8(<8 x i16> %vec) { } define i4 @convert_to_bitmask4(<4 x i32> %vec) { -; SDAG-LABEL: convert_to_bitmask4: -; SDAG: ; %bb.0: -; SDAG-NEXT: adrp x8, lCPI2_0@PAGE -; SDAG-NEXT: cmeq.4s v0, v0, #0 -; SDAG-NEXT: ldr q1, [x8, lCPI2_0@PAGEOFF] -; SDAG-NEXT: bic.16b v0, v1, v0 -; SDAG-NEXT: addv.4s s0, v0 -; SDAG-NEXT: fmov w0, s0 -; SDAG-NEXT: ret +; CHECK-SD-LABEL: convert_to_bitmask4: +; CHECK-SD: ; %bb.0: +; CHECK-SD-NEXT: adrp x8, lCPI2_0@PAGE +; CHECK-SD-NEXT: cmeq.4s v0, v0, #0 +; CHECK-SD-NEXT: ldr q1, [x8, lCPI2_0@PAGEOFF] +; CHECK-SD-NEXT: bic.16b v0, v1, v0 +; CHECK-SD-NEXT: addv.4s s0, v0 +; CHECK-SD-NEXT: fmov w0, s0 +; CHECK-SD-NEXT: ret ; -; GISEL-LABEL: convert_to_bitmask4: -; GISEL: ; %bb.0: -; GISEL-NEXT: sub sp, sp, #16 -; GISEL-NEXT: .cfi_def_cfa_offset 16 -; GISEL-NEXT: cmeq.4s v0, v0, #0 -; GISEL-NEXT: mvn.16b v0, v0 -; GISEL-NEXT: mov.s w8, v0[1] -; GISEL-NEXT: mov.s w9, v0[2] -; GISEL-NEXT: fmov w11, s0 -; GISEL-NEXT: 
mov.s w10, v0[3] -; GISEL-NEXT: and w8, w8, #0x1 -; GISEL-NEXT: bfi w11, w8, #1, #31 -; GISEL-NEXT: and w8, w9, #0x1 -; GISEL-NEXT: and w9, w10, #0x1 -; GISEL-NEXT: orr w8, w11, w8, lsl #2 -; GISEL-NEXT: orr w8, w8, w9, lsl #3 -; GISEL-NEXT: strb w8, [sp, #15] -; GISEL-NEXT: and w0, w8, #0xff -; GISEL-NEXT: add sp, sp, #16 -; GISEL-NEXT: ret +; CHECK-GI-LABEL: convert_to_bitmask4: +; CHECK-GI: ; %bb.0: +; CHECK-GI-NEXT: sub sp, sp, #16 +; CHECK-GI-NEXT: .cfi_def_cfa_offset 16 +; CHECK-GI-NEXT: cmeq.4s v0, v0, #0 +; CHECK-GI-NEXT: mvn.16b v0, v0 +; CHECK-GI-NEXT: mov.s w8, v0[1] +; CHECK-GI-NEXT: mov.s w9, v0[2] +; CHECK-GI-NEXT: fmov w11, s0 +; CHECK-GI-NEXT: mov.s w10, v0[3] +; CHECK-GI-NEXT: and w8, w8, #0x1 +; CHECK-GI-NEXT: bfi w11, w8, #1, #31 +; CHECK-GI-NEXT: and w8, w9, #0x1 +; CHECK-GI-NEXT: and w9, w10, #0x1 +; CHECK-GI-NEXT: orr w8, w11, w8, lsl #2 +; CHECK-GI-NEXT: orr w8, w8, w9, lsl #3 +; CHECK-GI-NEXT: strb w8, [sp, #15] +; CHECK-GI-NEXT: and w0, w8, #0xff +; CHECK-GI-NEXT: add sp, sp, #16 +; CHECK-GI-NEXT: ret %cmp_result = icmp ne <4 x i32> %vec, zeroinitializer @@ -220,37 +221,37 @@ define i8 @clang_builtins_undef_concat_convert_to_bitmask4(<4 x i32> %vec) { define i4 @convert_to_bitmask_no_compare(<4 x i32> %vec1, <4 x i32> %vec2) { -; SDAG-LABEL: convert_to_bitmask_no_compare: -; SDAG: ; %bb.0: -; SDAG-NEXT: and.16b v0, v0, v1 -; SDAG-NEXT: adrp x8, lCPI5_0@PAGE -; SDAG-NEXT: ldr q1, [x8, lCPI5_0@PAGEOFF] -; SDAG-NEXT: shl.4s v0, v0, #31 -; SDAG-NEXT: cmlt.4s v0, v0, #0 -; SDAG-NEXT: and.16b v0, v0, v1 -; SDAG-NEXT: addv.4s s0, v0 -; SDAG-NEXT: fmov w0, s0 -; SDAG-NEXT: ret +; CHECK-SD-LABEL: convert_to_bitmask_no_compare: +; CHECK-SD: ; %bb.0: +; CHECK-SD-NEXT: and.16b v0, v0, v1 +; CHECK-SD-NEXT: adrp x8, lCPI5_0@PAGE +; CHECK-SD-NEXT: ldr q1, [x8, lCPI5_0@PAGEOFF] +; CHECK-SD-NEXT: shl.4s v0, v0, #31 +; CHECK-SD-NEXT: cmlt.4s v0, v0, #0 +; CHECK-SD-NEXT: and.16b v0, v0, v1 +; CHECK-SD-NEXT: addv.4s s0, v0 +; CHECK-SD-NEXT: fmov w0, s0 +; CHECK-SD-NEXT: ret ; -; GISEL-LABEL: convert_to_bitmask_no_compare: -; GISEL: ; %bb.0: -; GISEL-NEXT: sub sp, sp, #16 -; GISEL-NEXT: .cfi_def_cfa_offset 16 -; GISEL-NEXT: and.16b v0, v0, v1 -; GISEL-NEXT: mov.s w8, v0[1] -; GISEL-NEXT: mov.s w9, v0[2] -; GISEL-NEXT: fmov w11, s0 -; GISEL-NEXT: mov.s w10, v0[3] -; GISEL-NEXT: and w8, w8, #0x1 -; GISEL-NEXT: bfi w11, w8, #1, #31 -; GISEL-NEXT: and w8, w9, #0x1 -; GISEL-NEXT: and w9, w10, #0x1 -; GISEL-NEXT: orr w8, w11, w8, lsl #2 -; GISEL-NEXT: orr w8, w8, w9, lsl #3 -; GISEL-NEXT: strb w8, [sp, #15] -; GISEL-NEXT: and w0, w8, #0xff -; GISEL-NEXT: add sp, sp, #16 -; GISEL-NEXT: ret +; CHECK-GI-LABEL: convert_to_bitmask_no_compare: +; CHECK-GI: ; %bb.0: +; CHECK-GI-NEXT: sub sp, sp, #16 +; CHECK-GI-NEXT: .cfi_def_cfa_offset 16 +; CHECK-GI-NEXT: and.16b v0, v0, v1 +; CHECK-GI-NEXT: mov.s w8, v0[1] +; CHECK-GI-NEXT: mov.s w9, v0[2] +; CHECK-GI-NEXT: fmov w11, s0 +; CHECK-GI-NEXT: mov.s w10, v0[3] +; CHECK-GI-NEXT: and w8, w8, #0x1 +; CHECK-GI-NEXT: bfi w11, w8, #1, #31 +; CHECK-GI-NEXT: and w8, w9, #0x1 +; CHECK-GI-NEXT: and w9, w10, #0x1 +; CHECK-GI-NEXT: orr w8, w11, w8, lsl #2 +; CHECK-GI-NEXT: orr w8, w8, w9, lsl #3 +; CHECK-GI-NEXT: strb w8, [sp, #15] +; CHECK-GI-NEXT: and w0, w8, #0xff +; CHECK-GI-NEXT: add sp, sp, #16 +; CHECK-GI-NEXT: ret %cmp = and <4 x i32> %vec1, %vec2 @@ -260,39 +261,39 @@ define i4 @convert_to_bitmask_no_compare(<4 x i32> %vec1, <4 x i32> %vec2) { } define i4 @convert_to_bitmask_with_compare_chain(<4 x i32> %vec1, <4 x i32> %vec2) { -; SDAG-LABEL: 
convert_to_bitmask_with_compare_chain: -; SDAG: ; %bb.0: -; SDAG-NEXT: cmeq.4s v2, v0, #0 -; SDAG-NEXT: cmeq.4s v0, v0, v1 -; SDAG-NEXT: adrp x8, lCPI6_0@PAGE -; SDAG-NEXT: ldr q1, [x8, lCPI6_0@PAGEOFF] -; SDAG-NEXT: bic.16b v0, v0, v2 -; SDAG-NEXT: and.16b v0, v0, v1 -; SDAG-NEXT: addv.4s s0, v0 -; SDAG-NEXT: fmov w0, s0 -; SDAG-NEXT: ret +; CHECK-SD-LABEL: convert_to_bitmask_with_compare_chain: +; CHECK-SD: ; %bb.0: +; CHECK-SD-NEXT: cmeq.4s v2, v0, #0 +; CHECK-SD-NEXT: cmeq.4s v0, v0, v1 +; CHECK-SD-NEXT: adrp x8, lCPI6_0@PAGE +; CHECK-SD-NEXT: ldr q1, [x8, lCPI6_0@PAGEOFF] +; CHECK-SD-NEXT: bic.16b v0, v0, v2 +; CHECK-SD-NEXT: and.16b v0, v0, v1 +; CHECK-SD-NEXT: addv.4s s0, v0 +; CHECK-SD-NEXT: fmov w0, s0 +; CHECK-SD-NEXT: ret ; -; GISEL-LABEL: convert_to_bitmask_with_compare_chain: -; GISEL: ; %bb.0: -; GISEL-NEXT: sub sp, sp, #16 -; GISEL-NEXT: .cfi_def_cfa_offset 16 -; GISEL-NEXT: cmeq.4s v2, v0, #0 -; GISEL-NEXT: cmeq.4s v0, v0, v1 -; GISEL-NEXT: bic.16b v0, v0, v2 -; GISEL-NEXT: mov.s w8, v0[1] -; GISEL-NEXT: mov.s w9, v0[2] -; GISEL-NEXT: fmov w11, s0 -; GISEL-NEXT: mov.s w10, v0[3] -; GISEL-NEXT: and w8, w8, #0x1 -; GISEL-NEXT: bfi w11, w8, #1, #31 -; GISEL-NEXT: and w8, w9, #0x1 -; GISEL-NEXT: and w9, w10, #0x1 -; GISEL-NEXT: orr w8, w11, w8, lsl #2 -; GISEL-NEXT: orr w8, w8, w9, lsl #3 -; GISEL-NEXT: strb w8, [sp, #15] -; GISEL-NEXT: and w0, w8, #0xff -; GISEL-NEXT: add sp, sp, #16 -; GISEL-NEXT: ret +; CHECK-GI-LABEL: convert_to_bitmask_with_compare_chain: +; CHECK-GI: ; %bb.0: +; CHECK-GI-NEXT: sub sp, sp, #16 +; CHECK-GI-NEXT: .cfi_def_cfa_offset 16 +; CHECK-GI-NEXT: cmeq.4s v2, v0, #0 +; CHECK-GI-NEXT: cmeq.4s v0, v0, v1 +; CHECK-GI-NEXT: bic.16b v0, v0, v2 +; CHECK-GI-NEXT: mov.s w8, v0[1] +; CHECK-GI-NEXT: mov.s w9, v0[2] +; CHECK-GI-NEXT: fmov w11, s0 +; CHECK-GI-NEXT: mov.s w10, v0[3] +; CHECK-GI-NEXT: and w8, w8, #0x1 +; CHECK-GI-NEXT: bfi w11, w8, #1, #31 +; CHECK-GI-NEXT: and w8, w9, #0x1 +; CHECK-GI-NEXT: and w9, w10, #0x1 +; CHECK-GI-NEXT: orr w8, w11, w8, lsl #2 +; CHECK-GI-NEXT: orr w8, w8, w9, lsl #3 +; CHECK-GI-NEXT: strb w8, [sp, #15] +; CHECK-GI-NEXT: and w0, w8, #0xff +; CHECK-GI-NEXT: add sp, sp, #16 +; CHECK-GI-NEXT: ret %cmp1 = icmp ne <4 x i32> %vec1, zeroinitializer @@ -303,39 +304,39 @@ define i4 @convert_to_bitmask_with_compare_chain(<4 x i32> %vec1, <4 x i32> %vec } define i4 @convert_to_bitmask_with_trunc_in_chain(<4 x i32> %vec1, <4 x i32> %vec2) { -; SDAG-LABEL: convert_to_bitmask_with_trunc_in_chain: -; SDAG: ; %bb.0: -; SDAG-NEXT: cmeq.4s v0, v0, #0 -; SDAG-NEXT: adrp x8, lCPI7_0@PAGE -; SDAG-NEXT: bic.16b v0, v1, v0 -; SDAG-NEXT: ldr q1, [x8, lCPI7_0@PAGEOFF] -; SDAG-NEXT: shl.4s v0, v0, #31 -; SDAG-NEXT: cmlt.4s v0, v0, #0 -; SDAG-NEXT: and.16b v0, v0, v1 -; SDAG-NEXT: addv.4s s0, v0 -; SDAG-NEXT: fmov w0, s0 -; SDAG-NEXT: ret +; CHECK-SD-LABEL: convert_to_bitmask_with_trunc_in_chain: +; CHECK-SD: ; %bb.0: +; CHECK-SD-NEXT: cmeq.4s v0, v0, #0 +; CHECK-SD-NEXT: adrp x8, lCPI7_0@PAGE +; CHECK-SD-NEXT: bic.16b v0, v1, v0 +; CHECK-SD-NEXT: ldr q1, [x8, lCPI7_0@PAGEOFF] +; CHECK-SD-NEXT: shl.4s v0, v0, #31 +; CHECK-SD-NEXT: cmlt.4s v0, v0, #0 +; CHECK-SD-NEXT: and.16b v0, v0, v1 +; CHECK-SD-NEXT: addv.4s s0, v0 +; CHECK-SD-NEXT: fmov w0, s0 +; CHECK-SD-NEXT: ret ; -; GISEL-LABEL: convert_to_bitmask_with_trunc_in_chain: -; GISEL: ; %bb.0: -; GISEL-NEXT: sub sp, sp, #16 -; GISEL-NEXT: .cfi_def_cfa_offset 16 -; GISEL-NEXT: cmeq.4s v0, v0, #0 -; GISEL-NEXT: bic.16b v0, v1, v0 -; GISEL-NEXT: mov.s w8, v0[1] -; GISEL-NEXT: mov.s w9, v0[2] -; 
GISEL-NEXT: fmov w11, s0 -; GISEL-NEXT: mov.s w10, v0[3] -; GISEL-NEXT: and w8, w8, #0x1 -; GISEL-NEXT: bfi w11, w8, #1, #31 -; GISEL-NEXT: and w8, w9, #0x1 -; GISEL-NEXT: and w9, w10, #0x1 -; GISEL-NEXT: orr w8, w11, w8, lsl #2 -; GISEL-NEXT: orr w8, w8, w9, lsl #3 -; GISEL-NEXT: strb w8, [sp, #15] -; GISEL-NEXT: and w0, w8, #0xff -; GISEL-NEXT: add sp, sp, #16 -; GISEL-NEXT: ret +; CHECK-GI-LABEL: convert_to_bitmask_with_trunc_in_chain: +; CHECK-GI: ; %bb.0: +; CHECK-GI-NEXT: sub sp, sp, #16 +; CHECK-GI-NEXT: .cfi_def_cfa_offset 16 +; CHECK-GI-NEXT: cmeq.4s v0, v0, #0 +; CHECK-GI-NEXT: bic.16b v0, v1, v0 +; CHECK-GI-NEXT: mov.s w8, v0[1] +; CHECK-GI-NEXT: mov.s w9, v0[2] +; CHECK-GI-NEXT: fmov w11, s0 +; CHECK-GI-NEXT: mov.s w10, v0[3] +; CHECK-GI-NEXT: and w8, w8, #0x1 +; CHECK-GI-NEXT: bfi w11, w8, #1, #31 +; CHECK-GI-NEXT: and w8, w9, #0x1 +; CHECK-GI-NEXT: and w9, w10, #0x1 +; CHECK-GI-NEXT: orr w8, w11, w8, lsl #2 +; CHECK-GI-NEXT: orr w8, w8, w9, lsl #3 +; CHECK-GI-NEXT: strb w8, [sp, #15] +; CHECK-GI-NEXT: and w0, w8, #0xff +; CHECK-GI-NEXT: add sp, sp, #16 +; CHECK-GI-NEXT: ret %cmp1 = icmp ne <4 x i32> %vec1, zeroinitializer @@ -346,82 +347,82 @@ define i4 @convert_to_bitmask_with_trunc_in_chain(<4 x i32> %vec1, <4 x i32> %ve } define i4 @convert_to_bitmask_with_unknown_type_in_long_chain(<4 x i32> %vec1, <4 x i32> %vec2) { -; SDAG-LABEL: convert_to_bitmask_with_unknown_type_in_long_chain: -; SDAG: ; %bb.0: -; SDAG-NEXT: cmeq.4s v0, v0, #0 -; SDAG-NEXT: cmeq.4s v1, v1, #0 -; SDAG-NEXT: adrp x8, lCPI8_0@PAGE -; SDAG-NEXT: movi d2, #0x000000ffffffff -; SDAG-NEXT: movi d3, #0x00ffffffffffff -; SDAG-NEXT: bic.16b v0, v1, v0 -; SDAG-NEXT: movi d1, #0xffff0000ffff0000 -; SDAG-NEXT: xtn.4h v0, v0 -; SDAG-NEXT: orr.8b v0, v0, v2 -; SDAG-NEXT: movi d2, #0x00ffffffff0000 -; SDAG-NEXT: eor.8b v1, v0, v1 -; SDAG-NEXT: eor.8b v0, v0, v2 -; SDAG-NEXT: mov.h v1[2], wzr -; SDAG-NEXT: orr.8b v0, v0, v3 -; SDAG-NEXT: orr.8b v0, v1, v0 -; SDAG-NEXT: ldr d1, [x8, lCPI8_0@PAGEOFF] -; SDAG-NEXT: shl.4h v0, v0, #15 -; SDAG-NEXT: cmlt.4h v0, v0, #0 -; SDAG-NEXT: and.8b v0, v0, v1 -; SDAG-NEXT: addv.4h h0, v0 -; SDAG-NEXT: fmov w0, s0 -; SDAG-NEXT: ret +; CHECK-SD-LABEL: convert_to_bitmask_with_unknown_type_in_long_chain: +; CHECK-SD: ; %bb.0: +; CHECK-SD-NEXT: cmeq.4s v0, v0, #0 +; CHECK-SD-NEXT: cmeq.4s v1, v1, #0 +; CHECK-SD-NEXT: adrp x8, lCPI8_0@PAGE +; CHECK-SD-NEXT: movi d2, #0x000000ffffffff +; CHECK-SD-NEXT: movi d3, #0x00ffffffffffff +; CHECK-SD-NEXT: bic.16b v0, v1, v0 +; CHECK-SD-NEXT: movi d1, #0xffff0000ffff0000 +; CHECK-SD-NEXT: xtn.4h v0, v0 +; CHECK-SD-NEXT: orr.8b v0, v0, v2 +; CHECK-SD-NEXT: movi d2, #0x00ffffffff0000 +; CHECK-SD-NEXT: eor.8b v1, v0, v1 +; CHECK-SD-NEXT: eor.8b v0, v0, v2 +; CHECK-SD-NEXT: mov.h v1[2], wzr +; CHECK-SD-NEXT: orr.8b v0, v0, v3 +; CHECK-SD-NEXT: orr.8b v0, v1, v0 +; CHECK-SD-NEXT: ldr d1, [x8, lCPI8_0@PAGEOFF] +; CHECK-SD-NEXT: shl.4h v0, v0, #15 +; CHECK-SD-NEXT: cmlt.4h v0, v0, #0 +; CHECK-SD-NEXT: and.8b v0, v0, v1 +; CHECK-SD-NEXT: addv.4h h0, v0 +; CHECK-SD-NEXT: fmov w0, s0 +; CHECK-SD-NEXT: ret ; -; GISEL-LABEL: convert_to_bitmask_with_unknown_type_in_long_chain: -; GISEL: ; %bb.0: -; GISEL-NEXT: sub sp, sp, #16 -; GISEL-NEXT: .cfi_def_cfa_offset 16 -; GISEL-NEXT: mov w8, #1 ; =0x1 -; GISEL-NEXT: mov w9, #0 ; =0x0 -; GISEL-NEXT: cmeq.4s v5, v0, #0 -; GISEL-NEXT: fmov s2, w8 -; GISEL-NEXT: fmov s4, w9 -; GISEL-NEXT: cmeq.4s v1, v1, #0 -; GISEL-NEXT: mov.16b v3, v2 -; GISEL-NEXT: mov.16b v0, v4 -; GISEL-NEXT: mov.h v4[1], w8 -; GISEL-NEXT: bic.16b 
v1, v1, v5 -; GISEL-NEXT: mov.16b v5, v2 -; GISEL-NEXT: mov.h v2[1], w8 -; GISEL-NEXT: mov.h v3[1], w8 -; GISEL-NEXT: mov.h v0[1], w8 -; GISEL-NEXT: mov.h v5[1], w8 -; GISEL-NEXT: mov.h v4[2], w8 -; GISEL-NEXT: xtn.4h v1, v1 -; GISEL-NEXT: mov.h v2[2], w8 -; GISEL-NEXT: mov.h v3[2], w9 -; GISEL-NEXT: mov.h v0[2], w9 -; GISEL-NEXT: mov.h v5[2], w9 -; GISEL-NEXT: mov.h v4[3], w9 -; GISEL-NEXT: mov.h v2[3], w9 -; GISEL-NEXT: mov.h v3[3], w9 -; GISEL-NEXT: mov.h v0[3], w8 -; GISEL-NEXT: mov.h v5[3], w8 -; GISEL-NEXT: orr.8b v1, v1, v3 -; GISEL-NEXT: eor.8b v0, v1, v0 -; GISEL-NEXT: eor.8b v1, v4, v1 -; GISEL-NEXT: and.8b v0, v0, v5 -; GISEL-NEXT: orr.8b v1, v2, v1 -; GISEL-NEXT: orr.8b v0, v0, v1 -; GISEL-NEXT: ushll.4s v0, v0, #0 -; GISEL-NEXT: mov.s w8, v0[1] -; GISEL-NEXT: mov.s w9, v0[2] -; GISEL-NEXT: fmov w11, s0 -; GISEL-NEXT: mov.s w10, v0[3] -; GISEL-NEXT: and w8, w8, #0x1 -; GISEL-NEXT: bfi w11, w8, #1, #31 -; GISEL-NEXT: and w8, w9, #0x1 -; GISEL-NEXT: and w9, w10, #0x1 -; GISEL-NEXT: orr w8, w11, w8, lsl #2 -; GISEL-NEXT: orr w8, w8, w9, lsl #3 -; GISEL-NEXT: strb w8, [sp, #15] -; GISEL-NEXT: and w0, w8, #0xff -; GISEL-NEXT: add sp, sp, #16 -; GISEL-NEXT: ret +; CHECK-GI-LABEL: convert_to_bitmask_with_unknown_type_in_long_chain: +; CHECK-GI: ; %bb.0: +; CHECK-GI-NEXT: sub sp, sp, #16 +; CHECK-GI-NEXT: .cfi_def_cfa_offset 16 +; CHECK-GI-NEXT: mov w8, #1 ; =0x1 +; CHECK-GI-NEXT: mov w9, #0 ; =0x0 +; CHECK-GI-NEXT: cmeq.4s v5, v0, #0 +; CHECK-GI-NEXT: fmov s2, w8 +; CHECK-GI-NEXT: fmov s4, w9 +; CHECK-GI-NEXT: cmeq.4s v1, v1, #0 +; CHECK-GI-NEXT: mov.16b v3, v2 +; CHECK-GI-NEXT: mov.16b v0, v4 +; CHECK-GI-NEXT: mov.h v4[1], w8 +; CHECK-GI-NEXT: bic.16b v1, v1, v5 +; CHECK-GI-NEXT: mov.16b v5, v2 +; CHECK-GI-NEXT: mov.h v2[1], w8 +; CHECK-GI-NEXT: mov.h v3[1], w8 +; CHECK-GI-NEXT: mov.h v0[1], w8 +; CHECK-GI-NEXT: mov.h v5[1], w8 +; CHECK-GI-NEXT: mov.h v4[2], w8 +; CHECK-GI-NEXT: xtn.4h v1, v1 +; CHECK-GI-NEXT: mov.h v2[2], w8 +; CHECK-GI-NEXT: mov.h v3[2], w9 +; CHECK-GI-NEXT: mov.h v0[2], w9 +; CHECK-GI-NEXT: mov.h v5[2], w9 +; CHECK-GI-NEXT: mov.h v4[3], w9 +; CHECK-GI-NEXT: mov.h v2[3], w9 +; CHECK-GI-NEXT: mov.h v3[3], w9 +; CHECK-GI-NEXT: mov.h v0[3], w8 +; CHECK-GI-NEXT: mov.h v5[3], w8 +; CHECK-GI-NEXT: orr.8b v1, v1, v3 +; CHECK-GI-NEXT: eor.8b v0, v1, v0 +; CHECK-GI-NEXT: eor.8b v1, v4, v1 +; CHECK-GI-NEXT: and.8b v0, v0, v5 +; CHECK-GI-NEXT: orr.8b v1, v2, v1 +; CHECK-GI-NEXT: orr.8b v0, v0, v1 +; CHECK-GI-NEXT: ushll.4s v0, v0, #0 +; CHECK-GI-NEXT: mov.s w8, v0[1] +; CHECK-GI-NEXT: mov.s w9, v0[2] +; CHECK-GI-NEXT: fmov w11, s0 +; CHECK-GI-NEXT: mov.s w10, v0[3] +; CHECK-GI-NEXT: and w8, w8, #0x1 +; CHECK-GI-NEXT: bfi w11, w8, #1, #31 +; CHECK-GI-NEXT: and w8, w9, #0x1 +; CHECK-GI-NEXT: and w9, w10, #0x1 +; CHECK-GI-NEXT: orr w8, w11, w8, lsl #2 +; CHECK-GI-NEXT: orr w8, w8, w9, lsl #3 +; CHECK-GI-NEXT: strb w8, [sp, #15] +; CHECK-GI-NEXT: and w0, w8, #0xff +; CHECK-GI-NEXT: add sp, sp, #16 +; CHECK-GI-NEXT: ret %cmp1 = icmp ne <4 x i32> %vec1, zeroinitializer @@ -440,42 +441,42 @@ define i4 @convert_to_bitmask_with_unknown_type_in_long_chain(<4 x i32> %vec1, < } define i4 @convert_to_bitmask_with_different_types_in_chain(<4 x i16> %vec1, <4 x i32> %vec2) { -; SDAG-LABEL: convert_to_bitmask_with_different_types_in_chain: -; SDAG: ; %bb.0: -; SDAG-NEXT: cmeq.4s v1, v1, #0 -; SDAG-NEXT: cmeq.4h v0, v0, #0 -; SDAG-NEXT: adrp x8, lCPI9_0@PAGE -; SDAG-NEXT: xtn.4h v1, v1 -; SDAG-NEXT: orn.8b v0, v1, v0 -; SDAG-NEXT: ldr d1, [x8, lCPI9_0@PAGEOFF] -; SDAG-NEXT: and.8b v0, v0, v1 
-; SDAG-NEXT: addv.4h h0, v0 -; SDAG-NEXT: fmov w0, s0 -; SDAG-NEXT: ret +; CHECK-SD-LABEL: convert_to_bitmask_with_different_types_in_chain: +; CHECK-SD: ; %bb.0: +; CHECK-SD-NEXT: cmeq.4s v1, v1, #0 +; CHECK-SD-NEXT: cmeq.4h v0, v0, #0 +; CHECK-SD-NEXT: adrp x8, lCPI9_0@PAGE +; CHECK-SD-NEXT: xtn.4h v1, v1 +; CHECK-SD-NEXT: orn.8b v0, v1, v0 +; CHECK-SD-NEXT: ldr d1, [x8, lCPI9_0@PAGEOFF] +; CHECK-SD-NEXT: and.8b v0, v0, v1 +; CHECK-SD-NEXT: addv.4h h0, v0 +; CHECK-SD-NEXT: fmov w0, s0 +; CHECK-SD-NEXT: ret ; -; GISEL-LABEL: convert_to_bitmask_with_different_types_in_chain: -; GISEL: ; %bb.0: -; GISEL-NEXT: sub sp, sp, #16 -; GISEL-NEXT: .cfi_def_cfa_offset 16 -; GISEL-NEXT: cmeq.4s v1, v1, #0 -; GISEL-NEXT: cmeq.4h v0, v0, #0 -; GISEL-NEXT: xtn.4h v1, v1 -; GISEL-NEXT: orn.8b v0, v1, v0 -; GISEL-NEXT: ushll.4s v0, v0, #0 -; GISEL-NEXT: mov.s w8, v0[1] -; GISEL-NEXT: mov.s w9, v0[2] -; GISEL-NEXT: fmov w11, s0 -; GISEL-NEXT: mov.s w10, v0[3] -; GISEL-NEXT: and w8, w8, #0x1 -; GISEL-NEXT: bfi w11, w8, #1, #31 -; GISEL-NEXT: and w8, w9, #0x1 -; GISEL-NEXT: and w9, w10, #0x1 -; GISEL-NEXT: orr w8, w11, w8, lsl #2 -; GISEL-NEXT: orr w8, w8, w9, lsl #3 -; GISEL-NEXT: strb w8, [sp, #15] -; GISEL-NEXT: and w0, w8, #0xff -; GISEL-NEXT: add sp, sp, #16 -; GISEL-NEXT: ret +; CHECK-GI-LABEL: convert_to_bitmask_with_different_types_in_chain: +; CHECK-GI: ; %bb.0: +; CHECK-GI-NEXT: sub sp, sp, #16 +; CHECK-GI-NEXT: .cfi_def_cfa_offset 16 +; CHECK-GI-NEXT: cmeq.4s v1, v1, #0 +; CHECK-GI-NEXT: cmeq.4h v0, v0, #0 +; CHECK-GI-NEXT: xtn.4h v1, v1 +; CHECK-GI-NEXT: orn.8b v0, v1, v0 +; CHECK-GI-NEXT: ushll.4s v0, v0, #0 +; CHECK-GI-NEXT: mov.s w8, v0[1] +; CHECK-GI-NEXT: mov.s w9, v0[2] +; CHECK-GI-NEXT: fmov w11, s0 +; CHECK-GI-NEXT: mov.s w10, v0[3] +; CHECK-GI-NEXT: and w8, w8, #0x1 +; CHECK-GI-NEXT: bfi w11, w8, #1, #31 +; CHECK-GI-NEXT: and w8, w9, #0x1 +; CHECK-GI-NEXT: and w9, w10, #0x1 +; CHECK-GI-NEXT: orr w8, w11, w8, lsl #2 +; CHECK-GI-NEXT: orr w8, w8, w9, lsl #3 +; CHECK-GI-NEXT: strb w8, [sp, #15] +; CHECK-GI-NEXT: and w0, w8, #0xff +; CHECK-GI-NEXT: add sp, sp, #16 +; CHECK-GI-NEXT: ret %cmp1 = icmp ne <4 x i16> %vec1, zeroinitializer @@ -486,73 +487,73 @@ define i4 @convert_to_bitmask_with_different_types_in_chain(<4 x i16> %vec1, <4 } define i16 @convert_to_bitmask_without_knowing_type(<16 x i1> %vec) { -; SDAG-LABEL: convert_to_bitmask_without_knowing_type: -; SDAG: ; %bb.0: -; SDAG-NEXT: shl.16b v0, v0, #7 -; SDAG-NEXT: adrp x8, lCPI10_0@PAGE -; SDAG-NEXT: ldr q1, [x8, lCPI10_0@PAGEOFF] -; SDAG-NEXT: cmlt.16b v0, v0, #0 -; SDAG-NEXT: and.16b v0, v0, v1 -; SDAG-NEXT: ext.16b v1, v0, v0, #8 -; SDAG-NEXT: zip1.16b v0, v0, v1 -; SDAG-NEXT: addv.8h h0, v0 -; SDAG-NEXT: fmov w0, s0 -; SDAG-NEXT: ret +; CHECK-SD-LABEL: convert_to_bitmask_without_knowing_type: +; CHECK-SD: ; %bb.0: +; CHECK-SD-NEXT: shl.16b v0, v0, #7 +; CHECK-SD-NEXT: adrp x8, lCPI10_0@PAGE +; CHECK-SD-NEXT: ldr q1, [x8, lCPI10_0@PAGEOFF] +; CHECK-SD-NEXT: cmlt.16b v0, v0, #0 +; CHECK-SD-NEXT: and.16b v0, v0, v1 +; CHECK-SD-NEXT: ext.16b v1, v0, v0, #8 +; CHECK-SD-NEXT: zip1.16b v0, v0, v1 +; CHECK-SD-NEXT: addv.8h h0, v0 +; CHECK-SD-NEXT: fmov w0, s0 +; CHECK-SD-NEXT: ret ; -; GISEL-LABEL: convert_to_bitmask_without_knowing_type: -; GISEL: ; %bb.0: -; GISEL-NEXT: sub sp, sp, #16 -; GISEL-NEXT: .cfi_def_cfa_offset 16 -; GISEL-NEXT: umov.b w8, v0[1] -; GISEL-NEXT: umov.b w9, v0[0] -; GISEL-NEXT: umov.b w10, v0[2] -; GISEL-NEXT: umov.b w11, v0[3] -; GISEL-NEXT: and w8, w8, #0x1 -; GISEL-NEXT: bfi w9, w8, #1, #31 -; GISEL-NEXT: 
and w8, w10, #0x1 -; GISEL-NEXT: umov.b w10, v0[4] -; GISEL-NEXT: orr w8, w9, w8, lsl #2 -; GISEL-NEXT: and w9, w11, #0x1 -; GISEL-NEXT: umov.b w11, v0[5] -; GISEL-NEXT: orr w8, w8, w9, lsl #3 -; GISEL-NEXT: and w9, w10, #0x1 -; GISEL-NEXT: umov.b w10, v0[6] -; GISEL-NEXT: orr w8, w8, w9, lsl #4 -; GISEL-NEXT: and w9, w11, #0x1 -; GISEL-NEXT: umov.b w11, v0[7] -; GISEL-NEXT: orr w8, w8, w9, lsl #5 -; GISEL-NEXT: and w9, w10, #0x1 -; GISEL-NEXT: umov.b w10, v0[8] -; GISEL-NEXT: orr w8, w8, w9, lsl #6 -; GISEL-NEXT: and w9, w11, #0x1 -; GISEL-NEXT: umov.b w11, v0[9] -; GISEL-NEXT: orr w8, w8, w9, lsl #7 -; GISEL-NEXT: and w9, w10, #0x1 -; GISEL-NEXT: umov.b w10, v0[10] -; GISEL-NEXT: orr w8, w8, w9, lsl #8 -; GISEL-NEXT: and w9, w11, #0x1 -; GISEL-NEXT: umov.b w11, v0[11] -; GISEL-NEXT: orr w8, w8, w9, lsl #9 -; GISEL-NEXT: and w9, w10, #0x1 -; GISEL-NEXT: umov.b w10, v0[12] -; GISEL-NEXT: orr w8, w8, w9, lsl #10 -; GISEL-NEXT: and w9, w11, #0x1 -; GISEL-NEXT: umov.b w11, v0[13] -; GISEL-NEXT: orr w8, w8, w9, lsl #11 -; GISEL-NEXT: and w9, w10, #0x1 -; GISEL-NEXT: umov.b w10, v0[14] -; GISEL-NEXT: orr w8, w8, w9, lsl #12 -; GISEL-NEXT: and w9, w11, #0x1 -; GISEL-NEXT: umov.b w11, v0[15] -; GISEL-NEXT: orr w8, w8, w9, lsl #13 -; GISEL-NEXT: and w9, w10, #0x1 -; GISEL-NEXT: orr w8, w8, w9, lsl #14 -; GISEL-NEXT: and w9, w11, #0x1 -; GISEL-NEXT: orr w8, w8, w9, lsl #15 -; GISEL-NEXT: strh w8, [sp, #14] -; GISEL-NEXT: and w0, w8, #0xffff -; GISEL-NEXT: add sp, sp, #16 -; GISEL-NEXT: ret +; CHECK-GI-LABEL: convert_to_bitmask_without_knowing_type: +; CHECK-GI: ; %bb.0: +; CHECK-GI-NEXT: sub sp, sp, #16 +; CHECK-GI-NEXT: .cfi_def_cfa_offset 16 +; CHECK-GI-NEXT: umov.b w8, v0[1] +; CHECK-GI-NEXT: umov.b w9, v0[0] +; CHECK-GI-NEXT: umov.b w10, v0[2] +; CHECK-GI-NEXT: umov.b w11, v0[3] +; CHECK-GI-NEXT: and w8, w8, #0x1 +; CHECK-GI-NEXT: bfi w9, w8, #1, #31 +; CHECK-GI-NEXT: and w8, w10, #0x1 +; CHECK-GI-NEXT: umov.b w10, v0[4] +; CHECK-GI-NEXT: orr w8, w9, w8, lsl #2 +; CHECK-GI-NEXT: and w9, w11, #0x1 +; CHECK-GI-NEXT: umov.b w11, v0[5] +; CHECK-GI-NEXT: orr w8, w8, w9, lsl #3 +; CHECK-GI-NEXT: and w9, w10, #0x1 +; CHECK-GI-NEXT: umov.b w10, v0[6] +; CHECK-GI-NEXT: orr w8, w8, w9, lsl #4 +; CHECK-GI-NEXT: and w9, w11, #0x1 +; CHECK-GI-NEXT: umov.b w11, v0[7] +; CHECK-GI-NEXT: orr w8, w8, w9, lsl #5 +; CHECK-GI-NEXT: and w9, w10, #0x1 +; CHECK-GI-NEXT: umov.b w10, v0[8] +; CHECK-GI-NEXT: orr w8, w8, w9, lsl #6 +; CHECK-GI-NEXT: and w9, w11, #0x1 +; CHECK-GI-NEXT: umov.b w11, v0[9] +; CHECK-GI-NEXT: orr w8, w8, w9, lsl #7 +; CHECK-GI-NEXT: and w9, w10, #0x1 +; CHECK-GI-NEXT: umov.b w10, v0[10] +; CHECK-GI-NEXT: orr w8, w8, w9, lsl #8 +; CHECK-GI-NEXT: and w9, w11, #0x1 +; CHECK-GI-NEXT: umov.b w11, v0[11] +; CHECK-GI-NEXT: orr w8, w8, w9, lsl #9 +; CHECK-GI-NEXT: and w9, w10, #0x1 +; CHECK-GI-NEXT: umov.b w10, v0[12] +; CHECK-GI-NEXT: orr w8, w8, w9, lsl #10 +; CHECK-GI-NEXT: and w9, w11, #0x1 +; CHECK-GI-NEXT: umov.b w11, v0[13] +; CHECK-GI-NEXT: orr w8, w8, w9, lsl #11 +; CHECK-GI-NEXT: and w9, w10, #0x1 +; CHECK-GI-NEXT: umov.b w10, v0[14] +; CHECK-GI-NEXT: orr w8, w8, w9, lsl #12 +; CHECK-GI-NEXT: and w9, w11, #0x1 +; CHECK-GI-NEXT: umov.b w11, v0[15] +; CHECK-GI-NEXT: orr w8, w8, w9, lsl #13 +; CHECK-GI-NEXT: and w9, w10, #0x1 +; CHECK-GI-NEXT: orr w8, w8, w9, lsl #14 +; CHECK-GI-NEXT: and w9, w11, #0x1 +; CHECK-GI-NEXT: orr w8, w8, w9, lsl #15 +; CHECK-GI-NEXT: strh w8, [sp, #14] +; CHECK-GI-NEXT: and w0, w8, #0xffff +; CHECK-GI-NEXT: add sp, sp, #16 +; CHECK-GI-NEXT: ret %bitmask = bitcast <16 
x i1> %vec to i16 ret i16 %bitmask @@ -575,51 +576,51 @@ define i2 @convert_to_bitmask_2xi32(<2 x i32> %vec) { } define i4 @convert_to_bitmask_4xi8(<4 x i8> %vec) { -; SDAG-LABEL: convert_to_bitmask_4xi8: -; SDAG: ; %bb.0: -; SDAG-NEXT: bic.4h v0, #255, lsl #8 -; SDAG-NEXT: adrp x8, lCPI12_0@PAGE -; SDAG-NEXT: ldr d1, [x8, lCPI12_0@PAGEOFF] -; SDAG-NEXT: cmeq.4h v0, v0, #0 -; SDAG-NEXT: bic.8b v0, v1, v0 -; SDAG-NEXT: addv.4h h0, v0 -; SDAG-NEXT: fmov w0, s0 -; SDAG-NEXT: ret +; CHECK-SD-LABEL: convert_to_bitmask_4xi8: +; CHECK-SD: ; %bb.0: +; CHECK-SD-NEXT: bic.4h v0, #255, lsl #8 +; CHECK-SD-NEXT: adrp x8, lCPI12_0@PAGE +; CHECK-SD-NEXT: ldr d1, [x8, lCPI12_0@PAGEOFF] +; CHECK-SD-NEXT: cmeq.4h v0, v0, #0 +; CHECK-SD-NEXT: bic.8b v0, v1, v0 +; CHECK-SD-NEXT: addv.4h h0, v0 +; CHECK-SD-NEXT: fmov w0, s0 +; CHECK-SD-NEXT: ret ; -; GISEL-LABEL: convert_to_bitmask_4xi8: -; GISEL: ; %bb.0: -; GISEL-NEXT: sub sp, sp, #16 -; GISEL-NEXT: .cfi_def_cfa_offset 16 -; GISEL-NEXT: mov w8, #0 ; =0x0 -; GISEL-NEXT: uzp1.8b v0, v0, v0 -; GISEL-NEXT: fmov s1, w8 -; GISEL-NEXT: mov.b v1[1], w8 -; GISEL-NEXT: mov.b v1[2], w8 -; GISEL-NEXT: mov.b v1[3], w8 -; GISEL-NEXT: cmeq.8b v0, v0, v1 -; GISEL-NEXT: mvn.8b v0, v0 -; GISEL-NEXT: umov.b w8, v0[0] -; GISEL-NEXT: umov.b w9, v0[1] -; GISEL-NEXT: mov.s v1[0], w8 -; GISEL-NEXT: umov.b w8, v0[2] -; GISEL-NEXT: mov.s v1[1], w9 -; GISEL-NEXT: umov.b w9, v0[3] -; GISEL-NEXT: mov.s v1[2], w8 -; GISEL-NEXT: mov.s v1[3], w9 -; GISEL-NEXT: mov.s w8, v1[1] -; GISEL-NEXT: mov.s w9, v1[2] -; GISEL-NEXT: fmov w11, s1 -; GISEL-NEXT: mov.s w10, v1[3] -; GISEL-NEXT: and w8, w8, #0x1 -; GISEL-NEXT: bfi w11, w8, #1, #31 -; GISEL-NEXT: and w8, w9, #0x1 -; GISEL-NEXT: and w9, w10, #0x1 -; GISEL-NEXT: orr w8, w11, w8, lsl #2 -; GISEL-NEXT: orr w8, w8, w9, lsl #3 -; GISEL-NEXT: strb w8, [sp, #15] -; GISEL-NEXT: and w0, w8, #0xff -; GISEL-NEXT: add sp, sp, #16 -; GISEL-NEXT: ret +; CHECK-GI-LABEL: convert_to_bitmask_4xi8: +; CHECK-GI: ; %bb.0: +; CHECK-GI-NEXT: sub sp, sp, #16 +; CHECK-GI-NEXT: .cfi_def_cfa_offset 16 +; CHECK-GI-NEXT: mov w8, #0 ; =0x0 +; CHECK-GI-NEXT: uzp1.8b v0, v0, v0 +; CHECK-GI-NEXT: fmov s1, w8 +; CHECK-GI-NEXT: mov.b v1[1], w8 +; CHECK-GI-NEXT: mov.b v1[2], w8 +; CHECK-GI-NEXT: mov.b v1[3], w8 +; CHECK-GI-NEXT: cmeq.8b v0, v0, v1 +; CHECK-GI-NEXT: mvn.8b v0, v0 +; CHECK-GI-NEXT: umov.b w8, v0[0] +; CHECK-GI-NEXT: umov.b w9, v0[1] +; CHECK-GI-NEXT: mov.s v1[0], w8 +; CHECK-GI-NEXT: umov.b w8, v0[2] +; CHECK-GI-NEXT: mov.s v1[1], w9 +; CHECK-GI-NEXT: umov.b w9, v0[3] +; CHECK-GI-NEXT: mov.s v1[2], w8 +; CHECK-GI-NEXT: mov.s v1[3], w9 +; CHECK-GI-NEXT: mov.s w8, v1[1] +; CHECK-GI-NEXT: mov.s w9, v1[2] +; CHECK-GI-NEXT: fmov w11, s1 +; CHECK-GI-NEXT: mov.s w10, v1[3] +; CHECK-GI-NEXT: and w8, w8, #0x1 +; CHECK-GI-NEXT: bfi w11, w8, #1, #31 +; CHECK-GI-NEXT: and w8, w9, #0x1 +; CHECK-GI-NEXT: and w9, w10, #0x1 +; CHECK-GI-NEXT: orr w8, w11, w8, lsl #2 +; CHECK-GI-NEXT: orr w8, w8, w9, lsl #3 +; CHECK-GI-NEXT: strb w8, [sp, #15] +; CHECK-GI-NEXT: and w0, w8, #0xff +; CHECK-GI-NEXT: add sp, sp, #16 +; CHECK-GI-NEXT: ret %cmp_result = icmp ne <4 x i8> %vec, zeroinitializer %bitmask = bitcast <4 x i1> %cmp_result to i4 @@ -645,39 +646,39 @@ define i8 @convert_to_bitmask_8xi2(<8 x i2> %vec) { } define i4 @convert_to_bitmask_float(<4 x float> %vec) { -; SDAG-LABEL: convert_to_bitmask_float: -; SDAG: ; %bb.0: -; SDAG-NEXT: fcmgt.4s v1, v0, #0.0 -; SDAG-NEXT: fcmlt.4s v0, v0, #0.0 -; SDAG-NEXT: adrp x8, lCPI14_0@PAGE -; SDAG-NEXT: orr.16b v0, v0, v1 -; SDAG-NEXT: ldr q1, 
[x8, lCPI14_0@PAGEOFF] -; SDAG-NEXT: and.16b v0, v0, v1 -; SDAG-NEXT: addv.4s s0, v0 -; SDAG-NEXT: fmov w0, s0 -; SDAG-NEXT: ret +; CHECK-SD-LABEL: convert_to_bitmask_float: +; CHECK-SD: ; %bb.0: +; CHECK-SD-NEXT: fcmgt.4s v1, v0, #0.0 +; CHECK-SD-NEXT: fcmlt.4s v0, v0, #0.0 +; CHECK-SD-NEXT: adrp x8, lCPI14_0@PAGE +; CHECK-SD-NEXT: orr.16b v0, v0, v1 +; CHECK-SD-NEXT: ldr q1, [x8, lCPI14_0@PAGEOFF] +; CHECK-SD-NEXT: and.16b v0, v0, v1 +; CHECK-SD-NEXT: addv.4s s0, v0 +; CHECK-SD-NEXT: fmov w0, s0 +; CHECK-SD-NEXT: ret ; -; GISEL-LABEL: convert_to_bitmask_float: -; GISEL: ; %bb.0: -; GISEL-NEXT: sub sp, sp, #16 -; GISEL-NEXT: .cfi_def_cfa_offset 16 -; GISEL-NEXT: fcmgt.4s v1, v0, #0.0 -; GISEL-NEXT: fcmlt.4s v0, v0, #0.0 -; GISEL-NEXT: orr.16b v0, v0, v1 -; GISEL-NEXT: mov.s w8, v0[1] -; GISEL-NEXT: mov.s w9, v0[2] -; GISEL-NEXT: fmov w11, s0 -; GISEL-NEXT: mov.s w10, v0[3] -; GISEL-NEXT: and w8, w8, #0x1 -; GISEL-NEXT: bfi w11, w8, #1, #31 -; GISEL-NEXT: and w8, w9, #0x1 -; GISEL-NEXT: and w9, w10, #0x1 -; GISEL-NEXT: orr w8, w11, w8, lsl #2 -; GISEL-NEXT: orr w8, w8, w9, lsl #3 -; GISEL-NEXT: strb w8, [sp, #15] -; GISEL-NEXT: and w0, w8, #0xff -; GISEL-NEXT: add sp, sp, #16 -; GISEL-NEXT: ret +; CHECK-GI-LABEL: convert_to_bitmask_float: +; CHECK-GI: ; %bb.0: +; CHECK-GI-NEXT: sub sp, sp, #16 +; CHECK-GI-NEXT: .cfi_def_cfa_offset 16 +; CHECK-GI-NEXT: fcmgt.4s v1, v0, #0.0 +; CHECK-GI-NEXT: fcmlt.4s v0, v0, #0.0 +; CHECK-GI-NEXT: orr.16b v0, v0, v1 +; CHECK-GI-NEXT: mov.s w8, v0[1] +; CHECK-GI-NEXT: mov.s w9, v0[2] +; CHECK-GI-NEXT: fmov w11, s0 +; CHECK-GI-NEXT: mov.s w10, v0[3] +; CHECK-GI-NEXT: and w8, w8, #0x1 +; CHECK-GI-NEXT: bfi w11, w8, #1, #31 +; CHECK-GI-NEXT: and w8, w9, #0x1 +; CHECK-GI-NEXT: and w9, w10, #0x1 +; CHECK-GI-NEXT: orr w8, w11, w8, lsl #2 +; CHECK-GI-NEXT: orr w8, w8, w9, lsl #3 +; CHECK-GI-NEXT: strb w8, [sp, #15] +; CHECK-GI-NEXT: and w0, w8, #0xff +; CHECK-GI-NEXT: add sp, sp, #16 +; CHECK-GI-NEXT: ret %cmp_result = fcmp one <4 x float> %vec, zeroinitializer @@ -688,58 +689,58 @@ define i4 @convert_to_bitmask_float(<4 x float> %vec) { ; Larger vector types don't map directly, but they can be split/truncated and then converted. ; After the comparison against 0, this is truncated to <8 x i16>, which is valid again.
define i8 @convert_large_vector(<8 x i32> %vec) { -; SDAG-LABEL: convert_large_vector: -; SDAG: ; %bb.0: -; SDAG-NEXT: sub sp, sp, #16 -; SDAG-NEXT: .cfi_def_cfa_offset 16 -; SDAG-NEXT: cmeq.4s v1, v1, #0 -; SDAG-NEXT: cmeq.4s v0, v0, #0 -; SDAG-NEXT: adrp x8, lCPI15_0@PAGE -; SDAG-NEXT: uzp1.8h v0, v0, v1 -; SDAG-NEXT: ldr q1, [x8, lCPI15_0@PAGEOFF] -; SDAG-NEXT: bic.16b v0, v1, v0 -; SDAG-NEXT: addv.8h h0, v0 -; SDAG-NEXT: fmov w8, s0 -; SDAG-NEXT: and w0, w8, #0xff -; SDAG-NEXT: add sp, sp, #16 -; SDAG-NEXT: ret +; CHECK-SD-LABEL: convert_large_vector: +; CHECK-SD: ; %bb.0: +; CHECK-SD-NEXT: sub sp, sp, #16 +; CHECK-SD-NEXT: .cfi_def_cfa_offset 16 +; CHECK-SD-NEXT: cmeq.4s v1, v1, #0 +; CHECK-SD-NEXT: cmeq.4s v0, v0, #0 +; CHECK-SD-NEXT: adrp x8, lCPI15_0@PAGE +; CHECK-SD-NEXT: uzp1.8h v0, v0, v1 +; CHECK-SD-NEXT: ldr q1, [x8, lCPI15_0@PAGEOFF] +; CHECK-SD-NEXT: bic.16b v0, v1, v0 +; CHECK-SD-NEXT: addv.8h h0, v0 +; CHECK-SD-NEXT: fmov w8, s0 +; CHECK-SD-NEXT: and w0, w8, #0xff +; CHECK-SD-NEXT: add sp, sp, #16 +; CHECK-SD-NEXT: ret ; -; GISEL-LABEL: convert_large_vector: -; GISEL: ; %bb.0: -; GISEL-NEXT: sub sp, sp, #16 -; GISEL-NEXT: .cfi_def_cfa_offset 16 -; GISEL-NEXT: cmeq.4s v0, v0, #0 -; GISEL-NEXT: cmeq.4s v1, v1, #0 -; GISEL-NEXT: mvn.16b v0, v0 -; GISEL-NEXT: mvn.16b v1, v1 -; GISEL-NEXT: uzp1.8h v0, v0, v1 -; GISEL-NEXT: xtn.8b v0, v0 -; GISEL-NEXT: umov.b w8, v0[1] -; GISEL-NEXT: umov.b w9, v0[0] -; GISEL-NEXT: umov.b w10, v0[2] -; GISEL-NEXT: umov.b w11, v0[3] -; GISEL-NEXT: and w8, w8, #0x1 -; GISEL-NEXT: bfi w9, w8, #1, #31 -; GISEL-NEXT: and w8, w10, #0x1 -; GISEL-NEXT: umov.b w10, v0[4] -; GISEL-NEXT: orr w8, w9, w8, lsl #2 -; GISEL-NEXT: and w9, w11, #0x1 -; GISEL-NEXT: umov.b w11, v0[5] -; GISEL-NEXT: orr w8, w8, w9, lsl #3 -; GISEL-NEXT: and w9, w10, #0x1 -; GISEL-NEXT: umov.b w10, v0[6] -; GISEL-NEXT: orr w8, w8, w9, lsl #4 -; GISEL-NEXT: and w9, w11, #0x1 -; GISEL-NEXT: umov.b w11, v0[7] -; GISEL-NEXT: orr w8, w8, w9, lsl #5 -; GISEL-NEXT: and w9, w10, #0x1 -; GISEL-NEXT: orr w8, w8, w9, lsl #6 -; GISEL-NEXT: and w9, w11, #0x1 -; GISEL-NEXT: orr w8, w8, w9, lsl #7 -; GISEL-NEXT: strb w8, [sp, #15] -; GISEL-NEXT: and w0, w8, #0xff -; GISEL-NEXT: add sp, sp, #16 -; GISEL-NEXT: ret +; CHECK-GI-LABEL: convert_large_vector: +; CHECK-GI: ; %bb.0: +; CHECK-GI-NEXT: sub sp, sp, #16 +; CHECK-GI-NEXT: .cfi_def_cfa_offset 16 +; CHECK-GI-NEXT: cmeq.4s v0, v0, #0 +; CHECK-GI-NEXT: cmeq.4s v1, v1, #0 +; CHECK-GI-NEXT: mvn.16b v0, v0 +; CHECK-GI-NEXT: mvn.16b v1, v1 +; CHECK-GI-NEXT: uzp1.8h v0, v0, v1 +; CHECK-GI-NEXT: xtn.8b v0, v0 +; CHECK-GI-NEXT: umov.b w8, v0[1] +; CHECK-GI-NEXT: umov.b w9, v0[0] +; CHECK-GI-NEXT: umov.b w10, v0[2] +; CHECK-GI-NEXT: umov.b w11, v0[3] +; CHECK-GI-NEXT: and w8, w8, #0x1 +; CHECK-GI-NEXT: bfi w9, w8, #1, #31 +; CHECK-GI-NEXT: and w8, w10, #0x1 +; CHECK-GI-NEXT: umov.b w10, v0[4] +; CHECK-GI-NEXT: orr w8, w9, w8, lsl #2 +; CHECK-GI-NEXT: and w9, w11, #0x1 +; CHECK-GI-NEXT: umov.b w11, v0[5] +; CHECK-GI-NEXT: orr w8, w8, w9, lsl #3 +; CHECK-GI-NEXT: and w9, w10, #0x1 +; CHECK-GI-NEXT: umov.b w10, v0[6] +; CHECK-GI-NEXT: orr w8, w8, w9, lsl #4 +; CHECK-GI-NEXT: and w9, w11, #0x1 +; CHECK-GI-NEXT: umov.b w11, v0[7] +; CHECK-GI-NEXT: orr w8, w8, w9, lsl #5 +; CHECK-GI-NEXT: and w9, w10, #0x1 +; CHECK-GI-NEXT: orr w8, w8, w9, lsl #6 +; CHECK-GI-NEXT: and w9, w11, #0x1 +; CHECK-GI-NEXT: orr w8, w8, w9, lsl #7 +; CHECK-GI-NEXT: strb w8, [sp, #15] +; CHECK-GI-NEXT: and w0, w8, #0xff +; CHECK-GI-NEXT: add sp, sp, #16 +; CHECK-GI-NEXT: ret %cmp_result = 
icmp ne <8 x i32> %vec, zeroinitializer @@ -748,40 +749,40 @@ define i8 @convert_large_vector(<8 x i32> %vec) { } define i4 @convert_legalized_illegal_element_size(<4 x i22> %vec) { -; SDAG-LABEL: convert_legalized_illegal_element_size: -; SDAG: ; %bb.0: -; SDAG-NEXT: movi.4s v1, #63, msl #16 -; SDAG-NEXT: adrp x8, lCPI16_0@PAGE -; SDAG-NEXT: cmtst.4s v0, v0, v1 -; SDAG-NEXT: ldr d1, [x8, lCPI16_0@PAGEOFF] -; SDAG-NEXT: xtn.4h v0, v0 -; SDAG-NEXT: and.8b v0, v0, v1 -; SDAG-NEXT: addv.4h h0, v0 -; SDAG-NEXT: fmov w0, s0 -; SDAG-NEXT: ret +; CHECK-SD-LABEL: convert_legalized_illegal_element_size: +; CHECK-SD: ; %bb.0: +; CHECK-SD-NEXT: movi.4s v1, #63, msl #16 +; CHECK-SD-NEXT: adrp x8, lCPI16_0@PAGE +; CHECK-SD-NEXT: cmtst.4s v0, v0, v1 +; CHECK-SD-NEXT: ldr d1, [x8, lCPI16_0@PAGEOFF] +; CHECK-SD-NEXT: xtn.4h v0, v0 +; CHECK-SD-NEXT: and.8b v0, v0, v1 +; CHECK-SD-NEXT: addv.4h h0, v0 +; CHECK-SD-NEXT: fmov w0, s0 +; CHECK-SD-NEXT: ret ; -; GISEL-LABEL: convert_legalized_illegal_element_size: -; GISEL: ; %bb.0: -; GISEL-NEXT: sub sp, sp, #16 -; GISEL-NEXT: .cfi_def_cfa_offset 16 -; GISEL-NEXT: movi.4s v1, #63, msl #16 -; GISEL-NEXT: and.16b v0, v0, v1 -; GISEL-NEXT: cmeq.4s v0, v0, #0 -; GISEL-NEXT: mvn.16b v0, v0 -; GISEL-NEXT: mov.s w8, v0[1] -; GISEL-NEXT: mov.s w9, v0[2] -; GISEL-NEXT: fmov w11, s0 -; GISEL-NEXT: mov.s w10, v0[3] -; GISEL-NEXT: and w8, w8, #0x1 -; GISEL-NEXT: bfi w11, w8, #1, #31 -; GISEL-NEXT: and w8, w9, #0x1 -; GISEL-NEXT: and w9, w10, #0x1 -; GISEL-NEXT: orr w8, w11, w8, lsl #2 -; GISEL-NEXT: orr w8, w8, w9, lsl #3 -; GISEL-NEXT: strb w8, [sp, #15] -; GISEL-NEXT: and w0, w8, #0xff -; GISEL-NEXT: add sp, sp, #16 -; GISEL-NEXT: ret +; CHECK-GI-LABEL: convert_legalized_illegal_element_size: +; CHECK-GI: ; %bb.0: +; CHECK-GI-NEXT: sub sp, sp, #16 +; CHECK-GI-NEXT: .cfi_def_cfa_offset 16 +; CHECK-GI-NEXT: movi.4s v1, #63, msl #16 +; CHECK-GI-NEXT: and.16b v0, v0, v1 +; CHECK-GI-NEXT: cmeq.4s v0, v0, #0 +; CHECK-GI-NEXT: mvn.16b v0, v0 +; CHECK-GI-NEXT: mov.s w8, v0[1] +; CHECK-GI-NEXT: mov.s w9, v0[2] +; CHECK-GI-NEXT: fmov w11, s0 +; CHECK-GI-NEXT: mov.s w10, v0[3] +; CHECK-GI-NEXT: and w8, w8, #0x1 +; CHECK-GI-NEXT: bfi w11, w8, #1, #31 +; CHECK-GI-NEXT: and w8, w9, #0x1 +; CHECK-GI-NEXT: and w9, w10, #0x1 +; CHECK-GI-NEXT: orr w8, w11, w8, lsl #2 +; CHECK-GI-NEXT: orr w8, w8, w9, lsl #3 +; CHECK-GI-NEXT: strb w8, [sp, #15] +; CHECK-GI-NEXT: and w0, w8, #0xff +; CHECK-GI-NEXT: add sp, sp, #16 +; CHECK-GI-NEXT: ret %cmp_result = icmp ne <4 x i22> %vec, zeroinitializer %bitmask = bitcast <4 x i1> %cmp_result to i4 @@ -818,101 +819,101 @@ define i8 @no_direct_convert_for_bad_concat(<4 x i32> %vec) { } define <8 x i1> @no_convert_without_direct_bitcast(<8 x i16> %vec) { -; SDAG-LABEL: no_convert_without_direct_bitcast: -; SDAG: ; %bb.0: -; SDAG-NEXT: cmtst.8h v0, v0, v0 -; SDAG-NEXT: xtn.8b v0, v0 -; SDAG-NEXT: ret +; CHECK-SD-LABEL: no_convert_without_direct_bitcast: +; CHECK-SD: ; %bb.0: +; CHECK-SD-NEXT: cmtst.8h v0, v0, v0 +; CHECK-SD-NEXT: xtn.8b v0, v0 +; CHECK-SD-NEXT: ret ; -; GISEL-LABEL: no_convert_without_direct_bitcast: -; GISEL: ; %bb.0: -; GISEL-NEXT: cmeq.8h v0, v0, #0 -; GISEL-NEXT: mvn.16b v0, v0 -; GISEL-NEXT: xtn.8b v0, v0 -; GISEL-NEXT: ret +; CHECK-GI-LABEL: no_convert_without_direct_bitcast: +; CHECK-GI: ; %bb.0: +; CHECK-GI-NEXT: cmeq.8h v0, v0, #0 +; CHECK-GI-NEXT: mvn.16b v0, v0 +; CHECK-GI-NEXT: xtn.8b v0, v0 +; CHECK-GI-NEXT: ret %cmp_result = icmp ne <8 x i16> %vec, zeroinitializer ret <8 x i1> %cmp_result } define i6 
@no_combine_illegal_num_elements(<6 x i32> %vec) { -; SDAG-LABEL: no_combine_illegal_num_elements: -; SDAG: ; %bb.0: -; SDAG-NEXT: sub sp, sp, #16 -; SDAG-NEXT: .cfi_def_cfa_offset 16 -; SDAG-NEXT: fmov s0, w0 -; SDAG-NEXT: fmov s1, w4 -; SDAG-NEXT: mov.s v0[1], w1 -; SDAG-NEXT: mov.s v1[1], w5 -; SDAG-NEXT: mov.s v0[2], w2 -; SDAG-NEXT: cmeq.4s v1, v1, #0 -; SDAG-NEXT: mov.s v0[3], w3 -; SDAG-NEXT: cmeq.4s v0, v0, #0 -; SDAG-NEXT: uzp1.8h v0, v0, v1 -; SDAG-NEXT: mvn.16b v0, v0 -; SDAG-NEXT: xtn.8b v0, v0 -; SDAG-NEXT: umov.b w8, v0[0] -; SDAG-NEXT: umov.b w9, v0[1] -; SDAG-NEXT: umov.b w10, v0[2] -; SDAG-NEXT: and w8, w8, #0x1 -; SDAG-NEXT: bfi w8, w9, #1, #1 -; SDAG-NEXT: umov.b w9, v0[3] -; SDAG-NEXT: bfi w8, w10, #2, #1 -; SDAG-NEXT: umov.b w10, v0[4] -; SDAG-NEXT: bfi w8, w9, #3, #1 -; SDAG-NEXT: umov.b w9, v0[5] -; SDAG-NEXT: bfi w8, w10, #4, #1 -; SDAG-NEXT: orr w8, w8, w9, lsl #5 -; SDAG-NEXT: and w0, w8, #0x3f -; SDAG-NEXT: add sp, sp, #16 -; SDAG-NEXT: ret +; CHECK-SD-LABEL: no_combine_illegal_num_elements: +; CHECK-SD: ; %bb.0: +; CHECK-SD-NEXT: sub sp, sp, #16 +; CHECK-SD-NEXT: .cfi_def_cfa_offset 16 +; CHECK-SD-NEXT: fmov s0, w0 +; CHECK-SD-NEXT: fmov s1, w4 +; CHECK-SD-NEXT: mov.s v0[1], w1 +; CHECK-SD-NEXT: mov.s v1[1], w5 +; CHECK-SD-NEXT: mov.s v0[2], w2 +; CHECK-SD-NEXT: cmeq.4s v1, v1, #0 +; CHECK-SD-NEXT: mov.s v0[3], w3 +; CHECK-SD-NEXT: cmeq.4s v0, v0, #0 +; CHECK-SD-NEXT: uzp1.8h v0, v0, v1 +; CHECK-SD-NEXT: mvn.16b v0, v0 +; CHECK-SD-NEXT: xtn.8b v0, v0 +; CHECK-SD-NEXT: umov.b w8, v0[0] +; CHECK-SD-NEXT: umov.b w9, v0[1] +; CHECK-SD-NEXT: umov.b w10, v0[2] +; CHECK-SD-NEXT: and w8, w8, #0x1 +; CHECK-SD-NEXT: bfi w8, w9, #1, #1 +; CHECK-SD-NEXT: umov.b w9, v0[3] +; CHECK-SD-NEXT: bfi w8, w10, #2, #1 +; CHECK-SD-NEXT: umov.b w10, v0[4] +; CHECK-SD-NEXT: bfi w8, w9, #3, #1 +; CHECK-SD-NEXT: umov.b w9, v0[5] +; CHECK-SD-NEXT: bfi w8, w10, #4, #1 +; CHECK-SD-NEXT: orr w8, w8, w9, lsl #5 +; CHECK-SD-NEXT: and w0, w8, #0x3f +; CHECK-SD-NEXT: add sp, sp, #16 +; CHECK-SD-NEXT: ret ; -; GISEL-LABEL: no_combine_illegal_num_elements: -; GISEL: ; %bb.0: -; GISEL-NEXT: sub sp, sp, #16 -; GISEL-NEXT: .cfi_def_cfa_offset 16 -; GISEL-NEXT: mov.s v0[0], w0 -; GISEL-NEXT: mov.s v1[0], w4 -; GISEL-NEXT: mov.s v2[0], wzr -; GISEL-NEXT: mov.s v0[1], w1 -; GISEL-NEXT: mov.s v1[1], w5 -; GISEL-NEXT: mov.s v2[1], wzr -; GISEL-NEXT: mov.s v0[2], w2 -; GISEL-NEXT: cmeq.4s v1, v1, v2 -; GISEL-NEXT: mvn.16b v1, v1 -; GISEL-NEXT: mov.s v0[3], w3 -; GISEL-NEXT: cmeq.4s v0, v0, #0 -; GISEL-NEXT: mvn.16b v0, v0 -; GISEL-NEXT: mov.s w8, v0[1] -; GISEL-NEXT: mov.s w9, v0[2] -; GISEL-NEXT: mov.s w10, v0[3] -; GISEL-NEXT: mov.h v0[1], w8 -; GISEL-NEXT: mov.s w8, v1[1] -; GISEL-NEXT: mov.h v0[2], w9 -; GISEL-NEXT: mov.h v0[3], w10 -; GISEL-NEXT: mov.h v0[4], v1[0] -; GISEL-NEXT: mov.h v0[5], w8 -; GISEL-NEXT: umov.h w8, v0[1] -; GISEL-NEXT: umov.h w9, v0[0] -; GISEL-NEXT: umov.h w10, v0[2] -; GISEL-NEXT: umov.h w11, v0[3] -; GISEL-NEXT: and w8, w8, #0x1 -; GISEL-NEXT: bfi w9, w8, #1, #31 -; GISEL-NEXT: and w8, w10, #0x1 -; GISEL-NEXT: umov.h w10, v0[4] -; GISEL-NEXT: orr w8, w9, w8, lsl #2 -; GISEL-NEXT: and w9, w11, #0x1 -; GISEL-NEXT: umov.h w11, v0[5] -; GISEL-NEXT: orr w8, w8, w9, lsl #3 -; GISEL-NEXT: and w9, w10, #0x1 -; GISEL-NEXT: orr w8, w8, w9, lsl #4 -; GISEL-NEXT: and w9, w11, #0x1 -; GISEL-NEXT: orr w8, w8, w9, lsl #5 -; GISEL-NEXT: and w8, w8, #0x3f -; GISEL-NEXT: strb w8, [sp, #15] -; GISEL-NEXT: and w0, w8, #0xff -; GISEL-NEXT: add sp, sp, #16 -; GISEL-NEXT: ret +; CHECK-GI-LABEL: 
no_combine_illegal_num_elements: +; CHECK-GI: ; %bb.0: +; CHECK-GI-NEXT: sub sp, sp, #16 +; CHECK-GI-NEXT: .cfi_def_cfa_offset 16 +; CHECK-GI-NEXT: mov.s v0[0], w0 +; CHECK-GI-NEXT: mov.s v1[0], w4 +; CHECK-GI-NEXT: mov.s v2[0], wzr +; CHECK-GI-NEXT: mov.s v0[1], w1 +; CHECK-GI-NEXT: mov.s v1[1], w5 +; CHECK-GI-NEXT: mov.s v2[1], wzr +; CHECK-GI-NEXT: mov.s v0[2], w2 +; CHECK-GI-NEXT: cmeq.4s v1, v1, v2 +; CHECK-GI-NEXT: mvn.16b v1, v1 +; CHECK-GI-NEXT: mov.s v0[3], w3 +; CHECK-GI-NEXT: cmeq.4s v0, v0, #0 +; CHECK-GI-NEXT: mvn.16b v0, v0 +; CHECK-GI-NEXT: mov.s w8, v0[1] +; CHECK-GI-NEXT: mov.s w9, v0[2] +; CHECK-GI-NEXT: mov.s w10, v0[3] +; CHECK-GI-NEXT: mov.h v0[1], w8 +; CHECK-GI-NEXT: mov.s w8, v1[1] +; CHECK-GI-NEXT: mov.h v0[2], w9 +; CHECK-GI-NEXT: mov.h v0[3], w10 +; CHECK-GI-NEXT: mov.h v0[4], v1[0] +; CHECK-GI-NEXT: mov.h v0[5], w8 +; CHECK-GI-NEXT: umov.h w8, v0[1] +; CHECK-GI-NEXT: umov.h w9, v0[0] +; CHECK-GI-NEXT: umov.h w10, v0[2] +; CHECK-GI-NEXT: umov.h w11, v0[3] +; CHECK-GI-NEXT: and w8, w8, #0x1 +; CHECK-GI-NEXT: bfi w9, w8, #1, #31 +; CHECK-GI-NEXT: and w8, w10, #0x1 +; CHECK-GI-NEXT: umov.h w10, v0[4] +; CHECK-GI-NEXT: orr w8, w9, w8, lsl #2 +; CHECK-GI-NEXT: and w9, w11, #0x1 +; CHECK-GI-NEXT: umov.h w11, v0[5] +; CHECK-GI-NEXT: orr w8, w8, w9, lsl #3 +; CHECK-GI-NEXT: and w9, w10, #0x1 +; CHECK-GI-NEXT: orr w8, w8, w9, lsl #4 +; CHECK-GI-NEXT: and w9, w11, #0x1 +; CHECK-GI-NEXT: orr w8, w8, w9, lsl #5 +; CHECK-GI-NEXT: and w8, w8, #0x3f +; CHECK-GI-NEXT: strb w8, [sp, #15] +; CHECK-GI-NEXT: and w0, w8, #0xff +; CHECK-GI-NEXT: add sp, sp, #16 +; CHECK-GI-NEXT: ret %cmp_result = icmp ne <6 x i32> %vec, zeroinitializer %bitmask = bitcast <6 x i1> %cmp_result to i6 @@ -921,220 +922,220 @@ define i6 @no_combine_illegal_num_elements(<6 x i32> %vec) { ; Only apply the combine when casting a vector to a scalar. 
define <2 x i8> @vector_to_vector_cast(<16 x i1> %arg) nounwind { -; SDAG-LABEL: vector_to_vector_cast: -; SDAG: ; %bb.0: -; SDAG-NEXT: sub sp, sp, #16 -; SDAG-NEXT: shl.16b v0, v0, #7 -; SDAG-NEXT: adrp x8, lCPI20_0@PAGE -; SDAG-NEXT: ldr q1, [x8, lCPI20_0@PAGEOFF] -; SDAG-NEXT: add x8, sp, #14 -; SDAG-NEXT: cmlt.16b v0, v0, #0 -; SDAG-NEXT: and.16b v0, v0, v1 -; SDAG-NEXT: ext.16b v1, v0, v0, #8 -; SDAG-NEXT: zip1.16b v0, v0, v1 -; SDAG-NEXT: addv.8h h0, v0 -; SDAG-NEXT: str h0, [sp, #14] -; SDAG-NEXT: ld1.b { v0 }[0], [x8] -; SDAG-NEXT: orr x8, x8, #0x1 -; SDAG-NEXT: ld1.b { v0 }[4], [x8] -; SDAG-NEXT: ; kill: def $d0 killed $d0 killed $q0 -; SDAG-NEXT: add sp, sp, #16 -; SDAG-NEXT: ret +; CHECK-SD-LABEL: vector_to_vector_cast: +; CHECK-SD: ; %bb.0: +; CHECK-SD-NEXT: sub sp, sp, #16 +; CHECK-SD-NEXT: shl.16b v0, v0, #7 +; CHECK-SD-NEXT: adrp x8, lCPI20_0@PAGE +; CHECK-SD-NEXT: ldr q1, [x8, lCPI20_0@PAGEOFF] +; CHECK-SD-NEXT: add x8, sp, #14 +; CHECK-SD-NEXT: cmlt.16b v0, v0, #0 +; CHECK-SD-NEXT: and.16b v0, v0, v1 +; CHECK-SD-NEXT: ext.16b v1, v0, v0, #8 +; CHECK-SD-NEXT: zip1.16b v0, v0, v1 +; CHECK-SD-NEXT: addv.8h h0, v0 +; CHECK-SD-NEXT: str h0, [sp, #14] +; CHECK-SD-NEXT: ld1.b { v0 }[0], [x8] +; CHECK-SD-NEXT: orr x8, x8, #0x1 +; CHECK-SD-NEXT: ld1.b { v0 }[4], [x8] +; CHECK-SD-NEXT: ; kill: def $d0 killed $d0 killed $q0 +; CHECK-SD-NEXT: add sp, sp, #16 +; CHECK-SD-NEXT: ret ; -; GISEL-LABEL: vector_to_vector_cast: -; GISEL: ; %bb.0: -; GISEL-NEXT: sub sp, sp, #16 -; GISEL-NEXT: umov.b w8, v0[1] -; GISEL-NEXT: mov d1, v0[1] -; GISEL-NEXT: umov.b w10, v0[1] -; GISEL-NEXT: umov.b w9, v0[0] -; GISEL-NEXT: umov.b w13, v0[0] -; GISEL-NEXT: umov.b w14, v0[2] -; GISEL-NEXT: umov.b w15, v0[3] -; GISEL-NEXT: umov.b w11, v0[2] -; GISEL-NEXT: umov.b w16, v0[4] -; GISEL-NEXT: umov.b w17, v0[5] -; GISEL-NEXT: umov.b w12, v0[3] -; GISEL-NEXT: and w8, w8, #0x1 -; GISEL-NEXT: and w10, w10, #0x1 -; GISEL-NEXT: umov.b w0, v1[1] -; GISEL-NEXT: bfi w9, w8, #1, #31 -; GISEL-NEXT: bfi w13, w10, #1, #31 -; GISEL-NEXT: and w14, w14, #0x1 -; GISEL-NEXT: umov.b w8, v1[0] -; GISEL-NEXT: umov.b w10, v1[2] -; GISEL-NEXT: and w15, w15, #0x1 -; GISEL-NEXT: orr w13, w13, w14, lsl #2 -; GISEL-NEXT: umov.b w14, v1[3] -; GISEL-NEXT: and w11, w11, #0x1 -; GISEL-NEXT: and w0, w0, #0x1 -; GISEL-NEXT: and w16, w16, #0x1 -; GISEL-NEXT: orr w9, w9, w11, lsl #2 -; GISEL-NEXT: orr w13, w13, w15, lsl #3 -; GISEL-NEXT: umov.b w15, v1[4] -; GISEL-NEXT: umov.b w11, v0[6] -; GISEL-NEXT: bfi w8, w0, #1, #31 -; GISEL-NEXT: and w10, w10, #0x1 -; GISEL-NEXT: and w17, w17, #0x1 -; GISEL-NEXT: orr w13, w13, w16, lsl #4 -; GISEL-NEXT: and w14, w14, #0x1 -; GISEL-NEXT: umov.b w0, v0[7] -; GISEL-NEXT: orr w8, w8, w10, lsl #2 -; GISEL-NEXT: umov.b w10, v1[5] -; GISEL-NEXT: umov.b w16, v1[6] -; GISEL-NEXT: orr w13, w13, w17, lsl #5 -; GISEL-NEXT: umov.b w17, v0[4] -; GISEL-NEXT: and w15, w15, #0x1 -; GISEL-NEXT: orr w8, w8, w14, lsl #3 -; GISEL-NEXT: and w12, w12, #0x1 -; GISEL-NEXT: and w11, w11, #0x1 -; GISEL-NEXT: umov.b w14, v1[7] -; GISEL-NEXT: orr w9, w9, w12, lsl #3 -; GISEL-NEXT: orr w11, w13, w11, lsl #6 -; GISEL-NEXT: orr w8, w8, w15, lsl #4 -; GISEL-NEXT: umov.b w15, v0[5] -; GISEL-NEXT: and w10, w10, #0x1 -; GISEL-NEXT: and w0, w0, #0x1 -; GISEL-NEXT: and w12, w17, #0x1 -; GISEL-NEXT: umov.b w13, v0[1] -; GISEL-NEXT: orr w8, w8, w10, lsl #5 -; GISEL-NEXT: and w16, w16, #0x1 -; GISEL-NEXT: orr w9, w9, w12, lsl #4 -; GISEL-NEXT: umov.b w10, v0[0] -; GISEL-NEXT: orr w11, w11, w0, lsl #7 -; GISEL-NEXT: and w14, w14, #0x1 -; 
GISEL-NEXT: and w12, w15, #0x1 -; GISEL-NEXT: umov.b w15, v0[2] -; GISEL-NEXT: orr w8, w8, w16, lsl #6 -; GISEL-NEXT: orr w9, w9, w12, lsl #5 -; GISEL-NEXT: umov.b w12, v0[6] -; GISEL-NEXT: strb w11, [sp, #8] -; GISEL-NEXT: and w11, w13, #0x1 -; GISEL-NEXT: umov.b w13, v0[3] -; GISEL-NEXT: orr w8, w8, w14, lsl #7 -; GISEL-NEXT: umov.b w14, v0[7] -; GISEL-NEXT: ldr b0, [sp, #8] -; GISEL-NEXT: bfi w10, w11, #1, #31 -; GISEL-NEXT: and w11, w15, #0x1 -; GISEL-NEXT: strb w8, [sp, #9] -; GISEL-NEXT: umov.b w15, v0[4] -; GISEL-NEXT: and w8, w12, #0x1 -; GISEL-NEXT: orr w10, w10, w11, lsl #2 -; GISEL-NEXT: orr w8, w9, w8, lsl #6 -; GISEL-NEXT: and w9, w13, #0x1 -; GISEL-NEXT: umov.b w11, v0[1] -; GISEL-NEXT: orr w9, w10, w9, lsl #3 -; GISEL-NEXT: umov.b w10, v0[5] -; GISEL-NEXT: umov.b w12, v0[0] -; GISEL-NEXT: and w13, w14, #0x1 -; GISEL-NEXT: umov.b w16, v0[2] -; GISEL-NEXT: umov.b w17, v0[3] -; GISEL-NEXT: and w14, w15, #0x1 -; GISEL-NEXT: umov.b w15, v0[2] -; GISEL-NEXT: orr w8, w8, w13, lsl #7 -; GISEL-NEXT: orr w9, w9, w14, lsl #4 -; GISEL-NEXT: umov.b w13, v0[6] -; GISEL-NEXT: and w11, w11, #0x1 -; GISEL-NEXT: umov.b w14, v0[3] -; GISEL-NEXT: strb w8, [sp, #10] -; GISEL-NEXT: and w8, w10, #0x1 -; GISEL-NEXT: bfi w12, w11, #1, #31 -; GISEL-NEXT: orr w8, w9, w8, lsl #5 -; GISEL-NEXT: umov.b w10, v0[4] -; GISEL-NEXT: and w9, w15, #0x1 -; GISEL-NEXT: umov.b w11, v0[7] -; GISEL-NEXT: umov.b w15, v0[1] -; GISEL-NEXT: orr w9, w12, w9, lsl #2 -; GISEL-NEXT: umov.b w12, v0[5] -; GISEL-NEXT: and w13, w13, #0x1 -; GISEL-NEXT: and w14, w14, #0x1 -; GISEL-NEXT: orr w8, w8, w13, lsl #6 -; GISEL-NEXT: umov.b w13, v0[0] -; GISEL-NEXT: orr w9, w9, w14, lsl #3 -; GISEL-NEXT: and w10, w10, #0x1 -; GISEL-NEXT: umov.b w14, v0[6] -; GISEL-NEXT: and w11, w11, #0x1 -; GISEL-NEXT: and w15, w15, #0x1 -; GISEL-NEXT: umov.b w0, v0[3] -; GISEL-NEXT: orr w9, w9, w10, lsl #4 -; GISEL-NEXT: and w10, w12, #0x1 -; GISEL-NEXT: umov.b w12, v0[7] -; GISEL-NEXT: orr w8, w8, w11, lsl #7 -; GISEL-NEXT: bfi w13, w15, #1, #31 -; GISEL-NEXT: and w11, w16, #0x1 -; GISEL-NEXT: orr w9, w9, w10, lsl #5 -; GISEL-NEXT: and w10, w14, #0x1 -; GISEL-NEXT: umov.b w14, v0[4] -; GISEL-NEXT: strb w8, [sp, #11] -; GISEL-NEXT: umov.b w15, v0[1] -; GISEL-NEXT: umov.b w16, v0[3] -; GISEL-NEXT: orr w8, w9, w10, lsl #6 -; GISEL-NEXT: orr w9, w13, w11, lsl #2 -; GISEL-NEXT: and w10, w12, #0x1 -; GISEL-NEXT: and w11, w17, #0x1 -; GISEL-NEXT: umov.b w12, v0[5] -; GISEL-NEXT: umov.b w17, v0[0] -; GISEL-NEXT: orr w8, w8, w10, lsl #7 -; GISEL-NEXT: orr w9, w9, w11, lsl #3 -; GISEL-NEXT: umov.b w10, v0[1] -; GISEL-NEXT: and w11, w14, #0x1 -; GISEL-NEXT: umov.b w14, v0[0] -; GISEL-NEXT: and w15, w15, #0x1 -; GISEL-NEXT: orr w9, w9, w11, lsl #4 -; GISEL-NEXT: umov.b w11, v0[2] -; GISEL-NEXT: umov.b w13, v0[6] -; GISEL-NEXT: and w12, w12, #0x1 -; GISEL-NEXT: bfi w17, w15, #1, #31 -; GISEL-NEXT: umov.b w15, v0[5] -; GISEL-NEXT: orr w9, w9, w12, lsl #5 -; GISEL-NEXT: and w10, w10, #0x1 -; GISEL-NEXT: umov.b w12, v0[2] -; GISEL-NEXT: bfi w14, w10, #1, #31 -; GISEL-NEXT: umov.b w10, v0[4] -; GISEL-NEXT: ldr b1, [sp, #9] -; GISEL-NEXT: and w11, w11, #0x1 -; GISEL-NEXT: and w13, w13, #0x1 -; GISEL-NEXT: strb w8, [sp, #12] -; GISEL-NEXT: orr w11, w14, w11, lsl #2 -; GISEL-NEXT: and w14, w16, #0x1 -; GISEL-NEXT: umov.b w16, v0[4] -; GISEL-NEXT: and w12, w12, #0x1 -; GISEL-NEXT: and w15, w15, #0x1 -; GISEL-NEXT: orr w9, w9, w13, lsl #6 -; GISEL-NEXT: orr w11, w11, w14, lsl #3 -; GISEL-NEXT: orr w12, w17, w12, lsl #2 -; GISEL-NEXT: and w10, w10, #0x1 -; GISEL-NEXT: and 
w17, w0, #0x1 -; GISEL-NEXT: umov.b w0, v0[5] -; GISEL-NEXT: umov.b w14, v0[6] -; GISEL-NEXT: orr w10, w11, w10, lsl #4 -; GISEL-NEXT: orr w12, w12, w17, lsl #3 -; GISEL-NEXT: umov.b w11, v0[7] -; GISEL-NEXT: and w16, w16, #0x1 -; GISEL-NEXT: umov.b w17, v0[6] -; GISEL-NEXT: orr w10, w10, w15, lsl #5 -; GISEL-NEXT: umov.b w15, v0[7] -; GISEL-NEXT: orr w12, w12, w16, lsl #4 -; GISEL-NEXT: and w16, w0, #0x1 -; GISEL-NEXT: umov.b w0, v0[7] -; GISEL-NEXT: and w14, w14, #0x1 -; GISEL-NEXT: orr w12, w12, w16, lsl #5 -; GISEL-NEXT: orr w10, w10, w14, lsl #6 -; GISEL-NEXT: and w11, w11, #0x1 -; GISEL-NEXT: and w13, w17, #0x1 -; GISEL-NEXT: orr w9, w9, w11, lsl #7 -; GISEL-NEXT: mov.s v0[1], v1[0] -; GISEL-NEXT: orr w11, w12, w13, lsl #6 -; GISEL-NEXT: and w12, w15, #0x1 -; GISEL-NEXT: ; kill: def $d0 killed $d0 killed $q0 -; GISEL-NEXT: orr w8, w10, w12, lsl #7 -; GISEL-NEXT: and w10, w0, #0x1 -; GISEL-NEXT: strb w9, [sp, #13] -; GISEL-NEXT: orr w9, w11, w10, lsl #7 -; GISEL-NEXT: strb w8, [sp, #14] -; GISEL-NEXT: strb w9, [sp, #15] -; GISEL-NEXT: add sp, sp, #16 -; GISEL-NEXT: ret +; CHECK-GI-LABEL: vector_to_vector_cast: +; CHECK-GI: ; %bb.0: +; CHECK-GI-NEXT: sub sp, sp, #16 +; CHECK-GI-NEXT: umov.b w8, v0[1] +; CHECK-GI-NEXT: mov d1, v0[1] +; CHECK-GI-NEXT: umov.b w10, v0[1] +; CHECK-GI-NEXT: umov.b w9, v0[0] +; CHECK-GI-NEXT: umov.b w13, v0[0] +; CHECK-GI-NEXT: umov.b w14, v0[2] +; CHECK-GI-NEXT: umov.b w15, v0[3] +; CHECK-GI-NEXT: umov.b w11, v0[2] +; CHECK-GI-NEXT: umov.b w16, v0[4] +; CHECK-GI-NEXT: umov.b w17, v0[5] +; CHECK-GI-NEXT: umov.b w12, v0[3] +; CHECK-GI-NEXT: and w8, w8, #0x1 +; CHECK-GI-NEXT: and w10, w10, #0x1 +; CHECK-GI-NEXT: umov.b w0, v1[1] +; CHECK-GI-NEXT: bfi w9, w8, #1, #31 +; CHECK-GI-NEXT: bfi w13, w10, #1, #31 +; CHECK-GI-NEXT: and w14, w14, #0x1 +; CHECK-GI-NEXT: umov.b w8, v1[0] +; CHECK-GI-NEXT: umov.b w10, v1[2] +; CHECK-GI-NEXT: and w15, w15, #0x1 +; CHECK-GI-NEXT: orr w13, w13, w14, lsl #2 +; CHECK-GI-NEXT: umov.b w14, v1[3] +; CHECK-GI-NEXT: and w11, w11, #0x1 +; CHECK-GI-NEXT: and w0, w0, #0x1 +; CHECK-GI-NEXT: and w16, w16, #0x1 +; CHECK-GI-NEXT: orr w9, w9, w11, lsl #2 +; CHECK-GI-NEXT: orr w13, w13, w15, lsl #3 +; CHECK-GI-NEXT: umov.b w15, v1[4] +; CHECK-GI-NEXT: umov.b w11, v0[6] +; CHECK-GI-NEXT: bfi w8, w0, #1, #31 +; CHECK-GI-NEXT: and w10, w10, #0x1 +; CHECK-GI-NEXT: and w17, w17, #0x1 +; CHECK-GI-NEXT: orr w13, w13, w16, lsl #4 +; CHECK-GI-NEXT: and w14, w14, #0x1 +; CHECK-GI-NEXT: umov.b w0, v0[7] +; CHECK-GI-NEXT: orr w8, w8, w10, lsl #2 +; CHECK-GI-NEXT: umov.b w10, v1[5] +; CHECK-GI-NEXT: umov.b w16, v1[6] +; CHECK-GI-NEXT: orr w13, w13, w17, lsl #5 +; CHECK-GI-NEXT: umov.b w17, v0[4] +; CHECK-GI-NEXT: and w15, w15, #0x1 +; CHECK-GI-NEXT: orr w8, w8, w14, lsl #3 +; CHECK-GI-NEXT: and w12, w12, #0x1 +; CHECK-GI-NEXT: and w11, w11, #0x1 +; CHECK-GI-NEXT: umov.b w14, v1[7] +; CHECK-GI-NEXT: orr w9, w9, w12, lsl #3 +; CHECK-GI-NEXT: orr w11, w13, w11, lsl #6 +; CHECK-GI-NEXT: orr w8, w8, w15, lsl #4 +; CHECK-GI-NEXT: umov.b w15, v0[5] +; CHECK-GI-NEXT: and w10, w10, #0x1 +; CHECK-GI-NEXT: and w0, w0, #0x1 +; CHECK-GI-NEXT: and w12, w17, #0x1 +; CHECK-GI-NEXT: umov.b w13, v0[1] +; CHECK-GI-NEXT: orr w8, w8, w10, lsl #5 +; CHECK-GI-NEXT: and w16, w16, #0x1 +; CHECK-GI-NEXT: orr w9, w9, w12, lsl #4 +; CHECK-GI-NEXT: umov.b w10, v0[0] +; CHECK-GI-NEXT: orr w11, w11, w0, lsl #7 +; CHECK-GI-NEXT: and w14, w14, #0x1 +; CHECK-GI-NEXT: and w12, w15, #0x1 +; CHECK-GI-NEXT: umov.b w15, v0[2] +; CHECK-GI-NEXT: orr w8, w8, w16, lsl #6 +; CHECK-GI-NEXT: orr w9, 
w9, w12, lsl #5 +; CHECK-GI-NEXT: umov.b w12, v0[6] +; CHECK-GI-NEXT: strb w11, [sp, #8] +; CHECK-GI-NEXT: and w11, w13, #0x1 +; CHECK-GI-NEXT: umov.b w13, v0[3] +; CHECK-GI-NEXT: orr w8, w8, w14, lsl #7 +; CHECK-GI-NEXT: umov.b w14, v0[7] +; CHECK-GI-NEXT: ldr b0, [sp, #8] +; CHECK-GI-NEXT: bfi w10, w11, #1, #31 +; CHECK-GI-NEXT: and w11, w15, #0x1 +; CHECK-GI-NEXT: strb w8, [sp, #9] +; CHECK-GI-NEXT: umov.b w15, v0[4] +; CHECK-GI-NEXT: and w8, w12, #0x1 +; CHECK-GI-NEXT: orr w10, w10, w11, lsl #2 +; CHECK-GI-NEXT: orr w8, w9, w8, lsl #6 +; CHECK-GI-NEXT: and w9, w13, #0x1 +; CHECK-GI-NEXT: umov.b w11, v0[1] +; CHECK-GI-NEXT: orr w9, w10, w9, lsl #3 +; CHECK-GI-NEXT: umov.b w10, v0[5] +; CHECK-GI-NEXT: umov.b w12, v0[0] +; CHECK-GI-NEXT: and w13, w14, #0x1 +; CHECK-GI-NEXT: umov.b w16, v0[2] +; CHECK-GI-NEXT: umov.b w17, v0[3] +; CHECK-GI-NEXT: and w14, w15, #0x1 +; CHECK-GI-NEXT: umov.b w15, v0[2] +; CHECK-GI-NEXT: orr w8, w8, w13, lsl #7 +; CHECK-GI-NEXT: orr w9, w9, w14, lsl #4 +; CHECK-GI-NEXT: umov.b w13, v0[6] +; CHECK-GI-NEXT: and w11, w11, #0x1 +; CHECK-GI-NEXT: umov.b w14, v0[3] +; CHECK-GI-NEXT: strb w8, [sp, #10] +; CHECK-GI-NEXT: and w8, w10, #0x1 +; CHECK-GI-NEXT: bfi w12, w11, #1, #31 +; CHECK-GI-NEXT: orr w8, w9, w8, lsl #5 +; CHECK-GI-NEXT: umov.b w10, v0[4] +; CHECK-GI-NEXT: and w9, w15, #0x1 +; CHECK-GI-NEXT: umov.b w11, v0[7] +; CHECK-GI-NEXT: umov.b w15, v0[1] +; CHECK-GI-NEXT: orr w9, w12, w9, lsl #2 +; CHECK-GI-NEXT: umov.b w12, v0[5] +; CHECK-GI-NEXT: and w13, w13, #0x1 +; CHECK-GI-NEXT: and w14, w14, #0x1 +; CHECK-GI-NEXT: orr w8, w8, w13, lsl #6 +; CHECK-GI-NEXT: umov.b w13, v0[0] +; CHECK-GI-NEXT: orr w9, w9, w14, lsl #3 +; CHECK-GI-NEXT: and w10, w10, #0x1 +; CHECK-GI-NEXT: umov.b w14, v0[6] +; CHECK-GI-NEXT: and w11, w11, #0x1 +; CHECK-GI-NEXT: and w15, w15, #0x1 +; CHECK-GI-NEXT: umov.b w0, v0[3] +; CHECK-GI-NEXT: orr w9, w9, w10, lsl #4 +; CHECK-GI-NEXT: and w10, w12, #0x1 +; CHECK-GI-NEXT: umov.b w12, v0[7] +; CHECK-GI-NEXT: orr w8, w8, w11, lsl #7 +; CHECK-GI-NEXT: bfi w13, w15, #1, #31 +; CHECK-GI-NEXT: and w11, w16, #0x1 +; CHECK-GI-NEXT: orr w9, w9, w10, lsl #5 +; CHECK-GI-NEXT: and w10, w14, #0x1 +; CHECK-GI-NEXT: umov.b w14, v0[4] +; CHECK-GI-NEXT: strb w8, [sp, #11] +; CHECK-GI-NEXT: umov.b w15, v0[1] +; CHECK-GI-NEXT: umov.b w16, v0[3] +; CHECK-GI-NEXT: orr w8, w9, w10, lsl #6 +; CHECK-GI-NEXT: orr w9, w13, w11, lsl #2 +; CHECK-GI-NEXT: and w10, w12, #0x1 +; CHECK-GI-NEXT: and w11, w17, #0x1 +; CHECK-GI-NEXT: umov.b w12, v0[5] +; CHECK-GI-NEXT: umov.b w17, v0[0] +; CHECK-GI-NEXT: orr w8, w8, w10, lsl #7 +; CHECK-GI-NEXT: orr w9, w9, w11, lsl #3 +; CHECK-GI-NEXT: umov.b w10, v0[1] +; CHECK-GI-NEXT: and w11, w14, #0x1 +; CHECK-GI-NEXT: umov.b w14, v0[0] +; CHECK-GI-NEXT: and w15, w15, #0x1 +; CHECK-GI-NEXT: orr w9, w9, w11, lsl #4 +; CHECK-GI-NEXT: umov.b w11, v0[2] +; CHECK-GI-NEXT: umov.b w13, v0[6] +; CHECK-GI-NEXT: and w12, w12, #0x1 +; CHECK-GI-NEXT: bfi w17, w15, #1, #31 +; CHECK-GI-NEXT: umov.b w15, v0[5] +; CHECK-GI-NEXT: orr w9, w9, w12, lsl #5 +; CHECK-GI-NEXT: and w10, w10, #0x1 +; CHECK-GI-NEXT: umov.b w12, v0[2] +; CHECK-GI-NEXT: bfi w14, w10, #1, #31 +; CHECK-GI-NEXT: umov.b w10, v0[4] +; CHECK-GI-NEXT: ldr b1, [sp, #9] +; CHECK-GI-NEXT: and w11, w11, #0x1 +; CHECK-GI-NEXT: and w13, w13, #0x1 +; CHECK-GI-NEXT: strb w8, [sp, #12] +; CHECK-GI-NEXT: orr w11, w14, w11, lsl #2 +; CHECK-GI-NEXT: and w14, w16, #0x1 +; CHECK-GI-NEXT: umov.b w16, v0[4] +; CHECK-GI-NEXT: and w12, w12, #0x1 +; CHECK-GI-NEXT: and w15, w15, #0x1 +; CHECK-GI-NEXT: orr 
w9, w9, w13, lsl #6 +; CHECK-GI-NEXT: orr w11, w11, w14, lsl #3 +; CHECK-GI-NEXT: orr w12, w17, w12, lsl #2 +; CHECK-GI-NEXT: and w10, w10, #0x1 +; CHECK-GI-NEXT: and w17, w0, #0x1 +; CHECK-GI-NEXT: umov.b w0, v0[5] +; CHECK-GI-NEXT: umov.b w14, v0[6] +; CHECK-GI-NEXT: orr w10, w11, w10, lsl #4 +; CHECK-GI-NEXT: orr w12, w12, w17, lsl #3 +; CHECK-GI-NEXT: umov.b w11, v0[7] +; CHECK-GI-NEXT: and w16, w16, #0x1 +; CHECK-GI-NEXT: umov.b w17, v0[6] +; CHECK-GI-NEXT: orr w10, w10, w15, lsl #5 +; CHECK-GI-NEXT: umov.b w15, v0[7] +; CHECK-GI-NEXT: orr w12, w12, w16, lsl #4 +; CHECK-GI-NEXT: and w16, w0, #0x1 +; CHECK-GI-NEXT: umov.b w0, v0[7] +; CHECK-GI-NEXT: and w14, w14, #0x1 +; CHECK-GI-NEXT: orr w12, w12, w16, lsl #5 +; CHECK-GI-NEXT: orr w10, w10, w14, lsl #6 +; CHECK-GI-NEXT: and w11, w11, #0x1 +; CHECK-GI-NEXT: and w13, w17, #0x1 +; CHECK-GI-NEXT: orr w9, w9, w11, lsl #7 +; CHECK-GI-NEXT: mov.s v0[1], v1[0] +; CHECK-GI-NEXT: orr w11, w12, w13, lsl #6 +; CHECK-GI-NEXT: and w12, w15, #0x1 +; CHECK-GI-NEXT: ; kill: def $d0 killed $d0 killed $q0 +; CHECK-GI-NEXT: orr w8, w10, w12, lsl #7 +; CHECK-GI-NEXT: and w10, w0, #0x1 +; CHECK-GI-NEXT: strb w9, [sp, #13] +; CHECK-GI-NEXT: orr w9, w11, w10, lsl #7 +; CHECK-GI-NEXT: strb w8, [sp, #14] +; CHECK-GI-NEXT: strb w9, [sp, #15] +; CHECK-GI-NEXT: add sp, sp, #16 +; CHECK-GI-NEXT: ret %bc = bitcast <16 x i1> %arg to <2 x i8> ret <2 x i8> %bc } diff --git a/llvm/test/CodeGen/MIR/AMDGPU/init-whole.wave.ll b/llvm/test/CodeGen/MIR/AMDGPU/init-whole.wave.ll new file mode 100644 index 0000000000000..f3b8deff61918 --- /dev/null +++ b/llvm/test/CodeGen/MIR/AMDGPU/init-whole.wave.ll @@ -0,0 +1,17 @@ +; RUN: llc -global-isel=0 -march=amdgcn -mcpu=gfx1100 -stop-after=finalize-isel < %s | FileCheck --check-prefix=GCN %s +; RUN: llc -global-isel=1 -march=amdgcn -mcpu=gfx1100 -stop-after=finalize-isel < %s | FileCheck --check-prefix=GCN %s + +; GCN-LABEL: name: init_wwm +; GCN: hasInitWholeWave: true +define void @init_wwm(ptr addrspace(1) inreg %p) { +entry: + %entry_exec = call i1 @llvm.amdgcn.init.whole.wave() + br i1 %entry_exec, label %bb.1, label %bb.2 + +bb.1: + store i32 1, ptr addrspace(1) %p + br label %bb.2 + +bb.2: + ret void +} diff --git a/llvm/test/CodeGen/X86/amx_movrs_intrinsics.ll b/llvm/test/CodeGen/X86/amx_movrs_intrinsics.ll index da212a1850964..1b93ae029f27b 100755 --- a/llvm/test/CodeGen/X86/amx_movrs_intrinsics.ll +++ b/llvm/test/CodeGen/X86/amx_movrs_intrinsics.ll @@ -1,5 +1,6 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py ; RUN: llc < %s -O0 -mtriple=x86_64-unknown-unknown -mattr=+amx-tile,+amx-movrs | FileCheck %s +; RUN: llc < %s -O0 -mtriple=x86_64-unknown-unknown -mattr=+amx-tile,+amx-movrs,+egpr --show-mc-encoding | FileCheck %s --check-prefix=EGPR define void @test_amx_internal(i16 %m, i16 %n, ptr %buf, i64 %s) { ; CHECK-LABEL: test_amx_internal: @@ -35,6 +36,44 @@ define void @test_amx_internal(i16 %m, i16 %n, ptr %buf, i64 %s) { ; CHECK-NEXT: .cfi_def_cfa %rsp, 8 ; CHECK-NEXT: tilerelease ; CHECK-NEXT: retq +; +; EGPR-LABEL: test_amx_internal: +; EGPR: # %bb.0: # %entry +; EGPR-NEXT: pushq %rbp # encoding: [0x55] +; EGPR-NEXT: .cfi_def_cfa_offset 16 +; EGPR-NEXT: .cfi_offset %rbp, -16 +; EGPR-NEXT: movq %rsp, %rbp # encoding: [0x48,0x89,0xe5] +; EGPR-NEXT: .cfi_def_cfa_register %rbp +; EGPR-NEXT: andq $-1024, %rsp # encoding: [0x48,0x81,0xe4,0x00,0xfc,0xff,0xff] +; EGPR-NEXT: # imm = 0xFC00 +; EGPR-NEXT: subq $3072, %rsp # encoding: [0x48,0x81,0xec,0x00,0x0c,0x00,0x00] +; EGPR-NEXT: # imm = 
0xC00 +; EGPR-NEXT: xorps %xmm0, %xmm0 # encoding: [0x0f,0x57,0xc0] +; EGPR-NEXT: movups %xmm0, {{[0-9]+}}(%rsp) # encoding: [0x0f,0x11,0x84,0x24,0xc0,0x03,0x00,0x00] +; EGPR-NEXT: movups %xmm0, {{[0-9]+}}(%rsp) # encoding: [0x0f,0x11,0x84,0x24,0xd0,0x03,0x00,0x00] +; EGPR-NEXT: movups %xmm0, {{[0-9]+}}(%rsp) # encoding: [0x0f,0x11,0x84,0x24,0xe0,0x03,0x00,0x00] +; EGPR-NEXT: movups %xmm0, {{[0-9]+}}(%rsp) # encoding: [0x0f,0x11,0x84,0x24,0xf0,0x03,0x00,0x00] +; EGPR-NEXT: movb $1, {{[0-9]+}}(%rsp) # encoding: [0xc6,0x84,0x24,0xc0,0x03,0x00,0x00,0x01] +; EGPR-NEXT: movq %rcx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; EGPR-NEXT: # encoding: [0x48,0x89,0x8c,0x24,0xb8,0x03,0x00,0x00] +; EGPR-NEXT: movl %esi, %eax # encoding: [0x89,0xf0] +; EGPR-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rsi # 8-byte Reload +; EGPR-NEXT: # encoding: [0x48,0x8b,0xb4,0x24,0xb8,0x03,0x00,0x00] +; EGPR-NEXT: movw %ax, %cx # encoding: [0x66,0x89,0xc1] +; EGPR-NEXT: movw %di, %ax # encoding: [0x66,0x89,0xf8] +; EGPR-NEXT: # implicit-def: $al +; EGPR-NEXT: movb %al, {{[0-9]+}}(%rsp) # encoding: [0x88,0x84,0x24,0xf0,0x03,0x00,0x00] +; EGPR-NEXT: movw %cx, {{[0-9]+}}(%rsp) # encoding: [0x66,0x89,0x8c,0x24,0xd0,0x03,0x00,0x00] +; EGPR-NEXT: ldtilecfg {{[0-9]+}}(%rsp) # EVEX TO VEX Compression encoding: [0xc4,0xe2,0x78,0x49,0x84,0x24,0xc0,0x03,0x00,0x00] +; EGPR-NEXT: tileloaddrs (%rdx,%rsi), %tmm0 # EVEX TO VEX Compression encoding: [0xc4,0xe2,0x7b,0x4a,0x04,0x32] +; EGPR-NEXT: movl $64, %esi # encoding: [0xbe,0x40,0x00,0x00,0x00] +; EGPR-NEXT: leaq {{[0-9]+}}(%rsp), %rdx # encoding: [0x48,0x8d,0x94,0x24,0x00,0x04,0x00,0x00] +; EGPR-NEXT: tilestored %tmm0, (%rdx,%rsi) # EVEX TO VEX Compression encoding: [0xc4,0xe2,0x7a,0x4b,0x04,0x32] +; EGPR-NEXT: movq %rbp, %rsp # encoding: [0x48,0x89,0xec] +; EGPR-NEXT: popq %rbp # encoding: [0x5d] +; EGPR-NEXT: .cfi_def_cfa %rsp, 8 +; EGPR-NEXT: tilerelease # encoding: [0xc4,0xe2,0x78,0x49,0xc0] +; EGPR-NEXT: retq # encoding: [0xc3] entry: %t1 = call x86_amx @llvm.x86.tileloaddrs64.internal(i16 %m, i16 %n, ptr %buf, i64 %s) %t2 = call <256 x i32> @llvm.x86.cast.tile.to.vector.v256i32(x86_amx %t1) @@ -48,6 +87,12 @@ define void @test_amx_old(i16 %m, i16 %n, ptr %buf) { ; CHECK-NEXT: movl $32, %eax ; CHECK-NEXT: tileloaddrs (%rdx,%rax), %tmm2 ; CHECK-NEXT: retq +; +; EGPR-LABEL: test_amx_old: +; EGPR: # %bb.0: # %entry +; EGPR-NEXT: movl $32, %eax # encoding: [0xb8,0x20,0x00,0x00,0x00] +; EGPR-NEXT: tileloaddrs (%rdx,%rax), %tmm2 # EVEX TO VEX Compression encoding: [0xc4,0xe2,0x7b,0x4a,0x14,0x02] +; EGPR-NEXT: retq # encoding: [0xc3] entry: call void @llvm.x86.tileloaddrs64(i8 2, ptr %buf, i64 32) ret void @@ -88,6 +133,44 @@ define void @test_amx_t1_internal(i16 %m, i16 %n, ptr %buf, i64 %s) { ; CHECK-NEXT: .cfi_def_cfa %rsp, 8 ; CHECK-NEXT: tilerelease ; CHECK-NEXT: retq +; +; EGPR-LABEL: test_amx_t1_internal: +; EGPR: # %bb.0: # %entry +; EGPR-NEXT: pushq %rbp # encoding: [0x55] +; EGPR-NEXT: .cfi_def_cfa_offset 16 +; EGPR-NEXT: .cfi_offset %rbp, -16 +; EGPR-NEXT: movq %rsp, %rbp # encoding: [0x48,0x89,0xe5] +; EGPR-NEXT: .cfi_def_cfa_register %rbp +; EGPR-NEXT: andq $-1024, %rsp # encoding: [0x48,0x81,0xe4,0x00,0xfc,0xff,0xff] +; EGPR-NEXT: # imm = 0xFC00 +; EGPR-NEXT: subq $3072, %rsp # encoding: [0x48,0x81,0xec,0x00,0x0c,0x00,0x00] +; EGPR-NEXT: # imm = 0xC00 +; EGPR-NEXT: xorps %xmm0, %xmm0 # encoding: [0x0f,0x57,0xc0] +; EGPR-NEXT: movups %xmm0, {{[0-9]+}}(%rsp) # encoding: [0x0f,0x11,0x84,0x24,0xc0,0x03,0x00,0x00] +; EGPR-NEXT: movups %xmm0, {{[0-9]+}}(%rsp) # encoding: 
[0x0f,0x11,0x84,0x24,0xd0,0x03,0x00,0x00] +; EGPR-NEXT: movups %xmm0, {{[0-9]+}}(%rsp) # encoding: [0x0f,0x11,0x84,0x24,0xe0,0x03,0x00,0x00] +; EGPR-NEXT: movups %xmm0, {{[0-9]+}}(%rsp) # encoding: [0x0f,0x11,0x84,0x24,0xf0,0x03,0x00,0x00] +; EGPR-NEXT: movb $1, {{[0-9]+}}(%rsp) # encoding: [0xc6,0x84,0x24,0xc0,0x03,0x00,0x00,0x01] +; EGPR-NEXT: movq %rcx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; EGPR-NEXT: # encoding: [0x48,0x89,0x8c,0x24,0xb8,0x03,0x00,0x00] +; EGPR-NEXT: movl %esi, %eax # encoding: [0x89,0xf0] +; EGPR-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rsi # 8-byte Reload +; EGPR-NEXT: # encoding: [0x48,0x8b,0xb4,0x24,0xb8,0x03,0x00,0x00] +; EGPR-NEXT: movw %ax, %cx # encoding: [0x66,0x89,0xc1] +; EGPR-NEXT: movw %di, %ax # encoding: [0x66,0x89,0xf8] +; EGPR-NEXT: # implicit-def: $al +; EGPR-NEXT: movb %al, {{[0-9]+}}(%rsp) # encoding: [0x88,0x84,0x24,0xf0,0x03,0x00,0x00] +; EGPR-NEXT: movw %cx, {{[0-9]+}}(%rsp) # encoding: [0x66,0x89,0x8c,0x24,0xd0,0x03,0x00,0x00] +; EGPR-NEXT: ldtilecfg {{[0-9]+}}(%rsp) # EVEX TO VEX Compression encoding: [0xc4,0xe2,0x78,0x49,0x84,0x24,0xc0,0x03,0x00,0x00] +; EGPR-NEXT: tileloaddrst1 (%rdx,%rsi), %tmm0 # EVEX TO VEX Compression encoding: [0xc4,0xe2,0x79,0x4a,0x04,0x32] +; EGPR-NEXT: movl $64, %esi # encoding: [0xbe,0x40,0x00,0x00,0x00] +; EGPR-NEXT: leaq {{[0-9]+}}(%rsp), %rdx # encoding: [0x48,0x8d,0x94,0x24,0x00,0x04,0x00,0x00] +; EGPR-NEXT: tilestored %tmm0, (%rdx,%rsi) # EVEX TO VEX Compression encoding: [0xc4,0xe2,0x7a,0x4b,0x04,0x32] +; EGPR-NEXT: movq %rbp, %rsp # encoding: [0x48,0x89,0xec] +; EGPR-NEXT: popq %rbp # encoding: [0x5d] +; EGPR-NEXT: .cfi_def_cfa %rsp, 8 +; EGPR-NEXT: tilerelease # encoding: [0xc4,0xe2,0x78,0x49,0xc0] +; EGPR-NEXT: retq # encoding: [0xc3] entry: %t1 = call x86_amx @llvm.x86.tileloaddrst164.internal(i16 %m, i16 %n, ptr %buf, i64 %s) %t2 = call <256 x i32> @llvm.x86.cast.tile.to.vector.v256i32(x86_amx %t1) @@ -101,6 +184,12 @@ define void @test_amx_t1_old(i16 %m, i16 %n, ptr %buf) { ; CHECK-NEXT: movl $32, %eax ; CHECK-NEXT: tileloaddrst1 (%rdx,%rax), %tmm2 ; CHECK-NEXT: retq +; +; EGPR-LABEL: test_amx_t1_old: +; EGPR: # %bb.0: # %entry +; EGPR-NEXT: movl $32, %eax # encoding: [0xb8,0x20,0x00,0x00,0x00] +; EGPR-NEXT: tileloaddrst1 (%rdx,%rax), %tmm2 # EVEX TO VEX Compression encoding: [0xc4,0xe2,0x79,0x4a,0x14,0x02] +; EGPR-NEXT: retq # encoding: [0xc3] entry: call void @llvm.x86.tileloaddrst164(i8 2, ptr %buf, i64 32) ret void diff --git a/llvm/test/CodeGen/X86/amx_movrs_transpose_intrinsics.ll b/llvm/test/CodeGen/X86/amx_movrs_transpose_intrinsics.ll index 146b69773eb18..1f5758c804b2b 100755 --- a/llvm/test/CodeGen/X86/amx_movrs_transpose_intrinsics.ll +++ b/llvm/test/CodeGen/X86/amx_movrs_transpose_intrinsics.ll @@ -1,6 +1,7 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py ; RUN: llc < %s -O0 -mtriple=x86_64-unknown-unknown -mattr=+amx-transpose,+amx-movrs | FileCheck %s --check-prefixes=CHECK,O0 ; RUN: llc < %s -O2 -mtriple=x86_64-unknown-unknown -mattr=+amx-transpose,+amx-movrs | FileCheck %s --check-prefixes=CHECK,O2 +; RUN: llc < %s -O2 -mtriple=x86_64-unknown-unknown -mattr=+amx-transpose,+amx-movrs,+egpr --show-mc-encoding | FileCheck %s --check-prefix=EGPR define void @test_amx(i64 %stride, i8* %addr1) #0 { ; CHECK-LABEL: test_amx: @@ -10,6 +11,14 @@ define void @test_amx(i64 %stride, i8* %addr1) #0 { ; CHECK-NEXT: t2rpntlvwz1rs (%rsi,%rdi), %tmm0 ; CHECK-NEXT: t2rpntlvwz1rst1 (%rsi,%rdi), %tmm2 ; CHECK-NEXT: retq +; +; EGPR-LABEL: test_amx: +; EGPR: # %bb.0: +; 
EGPR-NEXT: t2rpntlvwz0rs (%rsi,%rdi), %tmm0 # EVEX TO VEX Compression encoding: [0xc4,0xe5,0x78,0xf8,0x04,0x3e] +; EGPR-NEXT: t2rpntlvwz0rst1 (%rsi,%rdi), %tmm2 # EVEX TO VEX Compression encoding: [0xc4,0xe5,0x78,0xf9,0x14,0x3e] +; EGPR-NEXT: t2rpntlvwz1rs (%rsi,%rdi), %tmm0 # EVEX TO VEX Compression encoding: [0xc4,0xe5,0x79,0xf8,0x04,0x3e] +; EGPR-NEXT: t2rpntlvwz1rst1 (%rsi,%rdi), %tmm2 # EVEX TO VEX Compression encoding: [0xc4,0xe5,0x79,0xf9,0x14,0x3e] +; EGPR-NEXT: retq # encoding: [0xc3] call void @llvm.x86.t2rpntlvwz0rs(i8 1, i8* %addr1, i64 %stride) call void @llvm.x86.t2rpntlvwz0rst1(i8 2, i8* %addr1, i64 %stride) call void @llvm.x86.t2rpntlvwz1rs(i8 1, i8* %addr1, i64 %stride) @@ -80,6 +89,27 @@ define void @test_amx2(i8* %base, i64 %stride) #0 { ; O2-NEXT: t2rpntlvwz1rst1 (%rdi,%rsi), %tmm4 ; O2-NEXT: tilerelease ; O2-NEXT: retq +; +; EGPR-LABEL: test_amx2: +; EGPR: # %bb.0: +; EGPR-NEXT: xorps %xmm0, %xmm0 # encoding: [0x0f,0x57,0xc0] +; EGPR-NEXT: movups %xmm0, -{{[0-9]+}}(%rsp) # encoding: [0x0f,0x11,0x44,0x24,0xc0] +; EGPR-NEXT: movups %xmm0, -{{[0-9]+}}(%rsp) # encoding: [0x0f,0x11,0x44,0x24,0xd0] +; EGPR-NEXT: movups %xmm0, -{{[0-9]+}}(%rsp) # encoding: [0x0f,0x11,0x44,0x24,0xe0] +; EGPR-NEXT: movups %xmm0, -{{[0-9]+}}(%rsp) # encoding: [0x0f,0x11,0x44,0x24,0xf0] +; EGPR-NEXT: movb $1, -{{[0-9]+}}(%rsp) # encoding: [0xc6,0x44,0x24,0xc0,0x01] +; EGPR-NEXT: movb $8, -{{[0-9]+}}(%rsp) # encoding: [0xc6,0x44,0x24,0xf4,0x08] +; EGPR-NEXT: movw $8, -{{[0-9]+}}(%rsp) # encoding: [0x66,0xc7,0x44,0x24,0xd8,0x08,0x00] +; EGPR-NEXT: movb $8, -{{[0-9]+}}(%rsp) # encoding: [0xc6,0x44,0x24,0xf5,0x08] +; EGPR-NEXT: movw $8, -{{[0-9]+}}(%rsp) # encoding: [0x66,0xc7,0x44,0x24,0xda,0x08,0x00] +; EGPR-NEXT: ldtilecfg -{{[0-9]+}}(%rsp) # EVEX TO VEX Compression encoding: [0xc4,0xe2,0x78,0x49,0x44,0x24,0xc0] +; EGPR-NEXT: movw $8, %ax # encoding: [0x66,0xb8,0x08,0x00] +; EGPR-NEXT: t2rpntlvwz0rs (%rdi,%rsi), %tmm4 # EVEX TO VEX Compression encoding: [0xc4,0xe5,0x78,0xf8,0x24,0x37] +; EGPR-NEXT: t2rpntlvwz0rst1 (%rdi,%rsi), %tmm4 # EVEX TO VEX Compression encoding: [0xc4,0xe5,0x78,0xf9,0x24,0x37] +; EGPR-NEXT: t2rpntlvwz1rs (%rdi,%rsi), %tmm4 # EVEX TO VEX Compression encoding: [0xc4,0xe5,0x79,0xf8,0x24,0x37] +; EGPR-NEXT: t2rpntlvwz1rst1 (%rdi,%rsi), %tmm4 # EVEX TO VEX Compression encoding: [0xc4,0xe5,0x79,0xf9,0x24,0x37] +; EGPR-NEXT: tilerelease # encoding: [0xc4,0xe2,0x78,0x49,0xc0] +; EGPR-NEXT: retq # encoding: [0xc3] call { x86_amx, x86_amx } @llvm.x86.t2rpntlvwz0rs.internal(i16 8, i16 8, i16 8, i8* %base, i64 %stride) call { x86_amx, x86_amx } @llvm.x86.t2rpntlvwz0rst1.internal(i16 8, i16 8, i16 8, i8* %base, i64 %stride) call { x86_amx, x86_amx } @llvm.x86.t2rpntlvwz1rs.internal(i16 8, i16 8, i16 8, i8* %base, i64 %stride) diff --git a/llvm/test/CodeGen/X86/amx_transpose_intrinsics.ll b/llvm/test/CodeGen/X86/amx_transpose_intrinsics.ll index cc4360317db7d..4cfd97afe721b 100644 --- a/llvm/test/CodeGen/X86/amx_transpose_intrinsics.ll +++ b/llvm/test/CodeGen/X86/amx_transpose_intrinsics.ll @@ -1,5 +1,6 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512f,+amx-bf16,+amx-fp16,+amx-complex,+amx-transpose | FileCheck %s +; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512f,+amx-bf16,+amx-fp16,+amx-complex,+amx-transpose,+egpr --show-mc-encoding | FileCheck %s --check-prefix=EGPR define void @test_amx(i32 %rv32, i64 %stride, i64 %rvalue, i8* %addr1, <4 x float> %xmm) #0 { ; CHECK-LABEL: 
test_amx: @@ -16,6 +17,21 @@ define void @test_amx(i32 %rv32, i64 %stride, i64 %rvalue, i8* %addr1, <4 x floa ; CHECK-NEXT: tconjtcmmimfp16ps %tmm3, %tmm2, %tmm1 ; CHECK-NEXT: tconjtfp16 %tmm2, %tmm1 ; CHECK-NEXT: retq +; +; EGPR-LABEL: test_amx: +; EGPR: # %bb.0: +; EGPR-NEXT: t2rpntlvwz0 (%rcx,%rsi), %tmm0 # EVEX TO VEX Compression encoding: [0xc4,0xe2,0x78,0x6e,0x04,0x31] +; EGPR-NEXT: t2rpntlvwz0t1 (%rcx,%rsi), %tmm2 # EVEX TO VEX Compression encoding: [0xc4,0xe2,0x78,0x6f,0x14,0x31] +; EGPR-NEXT: t2rpntlvwz1 (%rcx,%rsi), %tmm0 # EVEX TO VEX Compression encoding: [0xc4,0xe2,0x79,0x6e,0x04,0x31] +; EGPR-NEXT: t2rpntlvwz1t1 (%rcx,%rsi), %tmm2 # EVEX TO VEX Compression encoding: [0xc4,0xe2,0x79,0x6f,0x14,0x31] +; EGPR-NEXT: ttransposed %tmm3, %tmm1 # encoding: [0xc4,0xe2,0x7a,0x5f,0xcb] +; EGPR-NEXT: ttdpbf16ps %tmm3, %tmm2, %tmm1 # encoding: [0xc4,0xe2,0x62,0x6c,0xca] +; EGPR-NEXT: ttdpfp16ps %tmm6, %tmm5, %tmm4 # encoding: [0xc4,0xe2,0x4b,0x6c,0xe5] +; EGPR-NEXT: ttcmmimfp16ps %tmm3, %tmm2, %tmm1 # encoding: [0xc4,0xe2,0x63,0x6b,0xca] +; EGPR-NEXT: ttcmmrlfp16ps %tmm3, %tmm2, %tmm1 # encoding: [0xc4,0xe2,0x62,0x6b,0xca] +; EGPR-NEXT: tconjtcmmimfp16ps %tmm3, %tmm2, %tmm1 # encoding: [0xc4,0xe2,0x60,0x6b,0xca] +; EGPR-NEXT: tconjtfp16 %tmm2, %tmm1 # encoding: [0xc4,0xe2,0x79,0x6b,0xca] +; EGPR-NEXT: retq # encoding: [0xc3] call void @llvm.x86.t2rpntlvwz0(i8 1, i8* %addr1, i64 %stride) call void @llvm.x86.t2rpntlvwz0t1(i8 2, i8* %addr1, i64 %stride) call void @llvm.x86.t2rpntlvwz1(i8 1, i8* %addr1, i64 %stride) @@ -78,6 +94,46 @@ define void @test_amx2(i8* %pointer, i8* %base, i64 %stride) #0 { ; CHECK-NEXT: tilerelease ; CHECK-NEXT: vzeroupper ; CHECK-NEXT: retq +; +; EGPR-LABEL: test_amx2: +; EGPR: # %bb.0: +; EGPR-NEXT: pushq %rbp # encoding: [0x55] +; EGPR-NEXT: subq $2928, %rsp # encoding: [0x48,0x81,0xec,0x70,0x0b,0x00,0x00] +; EGPR-NEXT: # imm = 0xB70 +; EGPR-NEXT: vxorps %xmm0, %xmm0, %xmm0 # encoding: [0xc5,0xf8,0x57,0xc0] +; EGPR-NEXT: vmovups %zmm0, {{[0-9]+}}(%rsp) # encoding: [0x62,0xf1,0x7c,0x48,0x11,0x44,0x24,0x0d] +; EGPR-NEXT: movb $1, {{[0-9]+}}(%rsp) # encoding: [0xc6,0x84,0x24,0x40,0x03,0x00,0x00,0x01] +; EGPR-NEXT: movb $8, {{[0-9]+}}(%rsp) # encoding: [0xc6,0x84,0x24,0x70,0x03,0x00,0x00,0x08] +; EGPR-NEXT: movw $8, {{[0-9]+}}(%rsp) # encoding: [0x66,0xc7,0x84,0x24,0x50,0x03,0x00,0x00,0x08,0x00] +; EGPR-NEXT: movb $8, {{[0-9]+}}(%rsp) # encoding: [0xc6,0x84,0x24,0x71,0x03,0x00,0x00,0x08] +; EGPR-NEXT: movw $8, {{[0-9]+}}(%rsp) # encoding: [0x66,0xc7,0x84,0x24,0x52,0x03,0x00,0x00,0x08,0x00] +; EGPR-NEXT: movb $8, {{[0-9]+}}(%rsp) # encoding: [0xc6,0x84,0x24,0x72,0x03,0x00,0x00,0x08] +; EGPR-NEXT: movw $8, {{[0-9]+}}(%rsp) # encoding: [0x66,0xc7,0x84,0x24,0x54,0x03,0x00,0x00,0x08,0x00] +; EGPR-NEXT: movb $8, {{[0-9]+}}(%rsp) # encoding: [0xc6,0x84,0x24,0x73,0x03,0x00,0x00,0x08] +; EGPR-NEXT: movw $8, {{[0-9]+}}(%rsp) # encoding: [0x66,0xc7,0x84,0x24,0x56,0x03,0x00,0x00,0x08,0x00] +; EGPR-NEXT: ldtilecfg {{[0-9]+}}(%rsp) # EVEX TO VEX Compression encoding: [0xc4,0xe2,0x78,0x49,0x84,0x24,0x40,0x03,0x00,0x00] +; EGPR-NEXT: movw $8, %ax # encoding: [0x66,0xb8,0x08,0x00] +; EGPR-NEXT: tileloadd (%rsi,%rdx), %tmm0 # EVEX TO VEX Compression encoding: [0xc4,0xe2,0x7b,0x4b,0x04,0x16] +; EGPR-NEXT: tilezero %tmm1 # encoding: [0xc4,0xe2,0x7b,0x49,0xc8] +; EGPR-NEXT: tilezero %tmm2 # encoding: [0xc4,0xe2,0x7b,0x49,0xd0] +; EGPR-NEXT: ttdpbf16ps %tmm1, %tmm0, %tmm2 # encoding: [0xc4,0xe2,0x72,0x6c,0xd0] +; EGPR-NEXT: ttdpfp16ps %tmm1, %tmm0, %tmm2 # encoding: 
[0xc4,0xe2,0x73,0x6c,0xd0] +; EGPR-NEXT: ttcmmimfp16ps %tmm1, %tmm0, %tmm2 # encoding: [0xc4,0xe2,0x73,0x6b,0xd0] +; EGPR-NEXT: ttcmmrlfp16ps %tmm1, %tmm0, %tmm2 # encoding: [0xc4,0xe2,0x72,0x6b,0xd0] +; EGPR-NEXT: movabsq $64, %rbp # encoding: [0x48,0xbd,0x40,0x00,0x00,0x00,0x00,0x00,0x00,0x00] +; EGPR-NEXT: tilestored %tmm2, 896(%rsp,%rbp) # 1024-byte Folded Spill +; EGPR-NEXT: # EVEX TO VEX Compression encoding: [0xc4,0xe2,0x7a,0x4b,0x94,0x2c,0x80,0x03,0x00,0x00] +; EGPR-NEXT: tileloadd 896(%rsp,%rbp), %tmm3 # 1024-byte Folded Reload +; EGPR-NEXT: # EVEX TO VEX Compression encoding: [0xc4,0xe2,0x7b,0x4b,0x9c,0x2c,0x80,0x03,0x00,0x00] +; EGPR-NEXT: tconjtcmmimfp16ps %tmm1, %tmm0, %tmm3 # encoding: [0xc4,0xe2,0x70,0x6b,0xd8] +; EGPR-NEXT: tconjtfp16 %tmm3, %tmm0 # encoding: [0xc4,0xe2,0x79,0x6b,0xc3] +; EGPR-NEXT: tilestored %tmm2, (%rdi,%rdx) # EVEX TO VEX Compression encoding: [0xc4,0xe2,0x7a,0x4b,0x14,0x17] +; EGPR-NEXT: addq $2928, %rsp # encoding: [0x48,0x81,0xc4,0x70,0x0b,0x00,0x00] +; EGPR-NEXT: # imm = 0xB70 +; EGPR-NEXT: popq %rbp # encoding: [0x5d] +; EGPR-NEXT: tilerelease # encoding: [0xc4,0xe2,0x78,0x49,0xc0] +; EGPR-NEXT: vzeroupper # encoding: [0xc5,0xf8,0x77] +; EGPR-NEXT: retq # encoding: [0xc3] %a = call x86_amx @llvm.x86.tileloadd64.internal(i16 8, i16 8, i8* %base, i64 %stride) %b = call x86_amx @llvm.x86.tilezero.internal(i16 8, i16 8) @@ -117,6 +173,30 @@ define void @test_amx3(i8* %pointer, i8* %base, i64 %stride) #0 { ; CHECK-NEXT: tilerelease ; CHECK-NEXT: vzeroupper ; CHECK-NEXT: retq +; +; EGPR-LABEL: test_amx3: +; EGPR: # %bb.0: +; EGPR-NEXT: vxorps %xmm0, %xmm0, %xmm0 # encoding: [0xc5,0xf8,0x57,0xc0] +; EGPR-NEXT: vmovups %zmm0, -{{[0-9]+}}(%rsp) # encoding: [0x62,0xf1,0x7c,0x48,0x11,0x44,0x24,0xff] +; EGPR-NEXT: movb $1, -{{[0-9]+}}(%rsp) # encoding: [0xc6,0x44,0x24,0xc0,0x01] +; EGPR-NEXT: movb $8, -{{[0-9]+}}(%rsp) # encoding: [0xc6,0x44,0x24,0xf0,0x08] +; EGPR-NEXT: movw $8, -{{[0-9]+}}(%rsp) # encoding: [0x66,0xc7,0x44,0x24,0xd0,0x08,0x00] +; EGPR-NEXT: movb $8, -{{[0-9]+}}(%rsp) # encoding: [0xc6,0x44,0x24,0xf4,0x08] +; EGPR-NEXT: movw $8, -{{[0-9]+}}(%rsp) # encoding: [0x66,0xc7,0x44,0x24,0xd8,0x08,0x00] +; EGPR-NEXT: movb $0, -{{[0-9]+}}(%rsp) # encoding: [0xc6,0x44,0x24,0xf5,0x00] +; EGPR-NEXT: movw $0, -{{[0-9]+}}(%rsp) # encoding: [0x66,0xc7,0x44,0x24,0xda,0x00,0x00] +; EGPR-NEXT: ldtilecfg -{{[0-9]+}}(%rsp) # EVEX TO VEX Compression encoding: [0xc4,0xe2,0x78,0x49,0x44,0x24,0xc0] +; EGPR-NEXT: xorl %eax, %eax # encoding: [0x31,0xc0] +; EGPR-NEXT: movw $8, %cx # encoding: [0x66,0xb9,0x08,0x00] +; EGPR-NEXT: t2rpntlvwz0 (%rsi,%rdx), %tmm4 # EVEX TO VEX Compression encoding: [0xc4,0xe2,0x78,0x6e,0x24,0x16] +; EGPR-NEXT: t2rpntlvwz0t1 (%rsi,%rdx), %tmm4 # EVEX TO VEX Compression encoding: [0xc4,0xe2,0x78,0x6f,0x24,0x16] +; EGPR-NEXT: t2rpntlvwz1 (%rsi,%rdx), %tmm4 # EVEX TO VEX Compression encoding: [0xc4,0xe2,0x79,0x6e,0x24,0x16] +; EGPR-NEXT: t2rpntlvwz1t1 (%rsi,%rdx), %tmm4 # EVEX TO VEX Compression encoding: [0xc4,0xe2,0x79,0x6f,0x24,0x16] +; EGPR-NEXT: ttransposed %tmm4, %tmm0 # encoding: [0xc4,0xe2,0x7a,0x5f,0xc4] +; EGPR-NEXT: tilestored %tmm0, (%rdi,%rdx) # EVEX TO VEX Compression encoding: [0xc4,0xe2,0x7a,0x4b,0x04,0x17] +; EGPR-NEXT: tilerelease # encoding: [0xc4,0xe2,0x78,0x49,0xc0] +; EGPR-NEXT: vzeroupper # encoding: [0xc5,0xf8,0x77] +; EGPR-NEXT: retq # encoding: [0xc3] %1 = call { x86_amx, x86_amx } @llvm.x86.t2rpntlvwz0.internal(i16 8, i16 8, i16 0, i8* %base, i64 %stride) %2 = call { x86_amx, x86_amx } 
@llvm.x86.t2rpntlvwz0t1.internal(i16 8, i16 8, i16 0, i8* %base, i64 %stride) %3 = call { x86_amx, x86_amx } @llvm.x86.t2rpntlvwz1.internal(i16 8, i16 8, i16 0, i8* %base, i64 %stride) @@ -179,6 +259,72 @@ define void @test_amx_spill(i8* %pointer, i8* %base, i64 %stride) #0 { ; CHECK-NEXT: tilerelease ; CHECK-NEXT: vzeroupper ; CHECK-NEXT: retq +; +; EGPR-LABEL: test_amx_spill: +; EGPR: # %bb.0: +; EGPR-NEXT: subq $6088, %rsp # encoding: [0x48,0x81,0xec,0xc8,0x17,0x00,0x00] +; EGPR-NEXT: # imm = 0x17C8 +; EGPR-NEXT: vxorps %xmm0, %xmm0, %xmm0 # encoding: [0xc5,0xf8,0x57,0xc0] +; EGPR-NEXT: vmovups %zmm0, -{{[0-9]+}}(%rsp) # encoding: [0x62,0xf1,0x7c,0x48,0x11,0x44,0x24,0xfe] +; EGPR-NEXT: movb $1, -{{[0-9]+}}(%rsp) # encoding: [0xc6,0x44,0x24,0x80,0x01] +; EGPR-NEXT: movb $8, -{{[0-9]+}}(%rsp) # encoding: [0xc6,0x44,0x24,0xb0,0x08] +; EGPR-NEXT: movw $8, -{{[0-9]+}}(%rsp) # encoding: [0x66,0xc7,0x44,0x24,0x90,0x08,0x00] +; EGPR-NEXT: movb $8, -{{[0-9]+}}(%rsp) # encoding: [0xc6,0x44,0x24,0xb4,0x08] +; EGPR-NEXT: movw $8, -{{[0-9]+}}(%rsp) # encoding: [0x66,0xc7,0x44,0x24,0x98,0x08,0x00] +; EGPR-NEXT: movb $8, -{{[0-9]+}}(%rsp) # encoding: [0xc6,0x44,0x24,0xb5,0x08] +; EGPR-NEXT: movw $8, -{{[0-9]+}}(%rsp) # encoding: [0x66,0xc7,0x44,0x24,0x9a,0x08,0x00] +; EGPR-NEXT: movb $8, -{{[0-9]+}}(%rsp) # encoding: [0xc6,0x44,0x24,0xb6,0x08] +; EGPR-NEXT: movw $8, -{{[0-9]+}}(%rsp) # encoding: [0x66,0xc7,0x44,0x24,0x9c,0x08,0x00] +; EGPR-NEXT: movb $8, -{{[0-9]+}}(%rsp) # encoding: [0xc6,0x44,0x24,0xb7,0x08] +; EGPR-NEXT: movw $8, -{{[0-9]+}}(%rsp) # encoding: [0x66,0xc7,0x44,0x24,0x9e,0x08,0x00] +; EGPR-NEXT: ldtilecfg -{{[0-9]+}}(%rsp) # EVEX TO VEX Compression encoding: [0xc4,0xe2,0x78,0x49,0x44,0x24,0x80] +; EGPR-NEXT: movw $8, %ax # encoding: [0x66,0xb8,0x08,0x00] +; EGPR-NEXT: tileloadd (%rsi,%rdx), %tmm0 # EVEX TO VEX Compression encoding: [0xc4,0xe2,0x7b,0x4b,0x04,0x16] +; EGPR-NEXT: t2rpntlvwz0 (%rsi,%rdx), %tmm4 # EVEX TO VEX Compression encoding: [0xc4,0xe2,0x78,0x6e,0x24,0x16] +; EGPR-NEXT: t2rpntlvwz0t1 (%rsi,%rdx), %tmm6 # EVEX TO VEX Compression encoding: [0xc4,0xe2,0x78,0x6f,0x34,0x16] +; EGPR-NEXT: movabsq $64, %rcx # encoding: [0x48,0xb9,0x40,0x00,0x00,0x00,0x00,0x00,0x00,0x00] +; EGPR-NEXT: tilestored %tmm6, 4032(%rsp,%rcx) # 1024-byte Folded Spill +; EGPR-NEXT: # encoding: [0xc4,0xe2,0x7a,0x4b,0xb4,0x0c,0xc0,0x0f,0x00,0x00] +; EGPR-NEXT: tilestored %tmm7, 5056(%rsp,%rcx) # 1024-byte Folded Spill +; EGPR-NEXT: # encoding: [0xc4,0xe2,0x7a,0x4b,0xbc,0x0c,0xc0,0x13,0x00,0x00] +; EGPR-NEXT: t2rpntlvwz1 (%rsi,%rdx), %tmm6 # EVEX TO VEX Compression encoding: [0xc4,0xe2,0x79,0x6e,0x34,0x16] +; EGPR-NEXT: tilestored %tmm6, 1984(%rsp,%rcx) # 1024-byte Folded Spill +; EGPR-NEXT: # encoding: [0xc4,0xe2,0x7a,0x4b,0xb4,0x0c,0xc0,0x07,0x00,0x00] +; EGPR-NEXT: tilestored %tmm7, 3008(%rsp,%rcx) # 1024-byte Folded Spill +; EGPR-NEXT: # encoding: [0xc4,0xe2,0x7a,0x4b,0xbc,0x0c,0xc0,0x0b,0x00,0x00] +; EGPR-NEXT: t2rpntlvwz1t1 (%rsi,%rdx), %tmm6 # EVEX TO VEX Compression encoding: [0xc4,0xe2,0x79,0x6f,0x34,0x16] +; EGPR-NEXT: tilestored %tmm6, -64(%rsp,%rcx) # 1024-byte Folded Spill +; EGPR-NEXT: # encoding: [0xc4,0xe2,0x7a,0x4b,0x74,0x0c,0xc0] +; EGPR-NEXT: tilestored %tmm7, 960(%rsp,%rcx) # 1024-byte Folded Spill +; EGPR-NEXT: # encoding: [0xc4,0xe2,0x7a,0x4b,0xbc,0x0c,0xc0,0x03,0x00,0x00] +; EGPR-NEXT: t2rpntlvwz0 (%rsi,%rdx), %tmm6 # EVEX TO VEX Compression encoding: [0xc4,0xe2,0x78,0x6e,0x34,0x16] +; EGPR-NEXT: tilestored %tmm4, (%rsi,%rdx) # EVEX TO VEX Compression encoding: 
[0xc4,0xe2,0x7a,0x4b,0x24,0x16] +; EGPR-NEXT: tilestored %tmm5, (%rsi,%rdx) # EVEX TO VEX Compression encoding: [0xc4,0xe2,0x7a,0x4b,0x2c,0x16] +; EGPR-NEXT: tileloadd 4032(%rsp,%rcx), %tmm4 # 1024-byte Folded Reload +; EGPR-NEXT: # encoding: [0xc4,0xe2,0x7b,0x4b,0xa4,0x0c,0xc0,0x0f,0x00,0x00] +; EGPR-NEXT: tileloadd 5056(%rsp,%rcx), %tmm5 # 1024-byte Folded Reload +; EGPR-NEXT: # encoding: [0xc4,0xe2,0x7b,0x4b,0xac,0x0c,0xc0,0x13,0x00,0x00] +; EGPR-NEXT: tilestored %tmm4, (%rsi,%rdx) # EVEX TO VEX Compression encoding: [0xc4,0xe2,0x7a,0x4b,0x24,0x16] +; EGPR-NEXT: tilestored %tmm5, (%rsi,%rdx) # EVEX TO VEX Compression encoding: [0xc4,0xe2,0x7a,0x4b,0x2c,0x16] +; EGPR-NEXT: tileloadd 1984(%rsp,%rcx), %tmm4 # 1024-byte Folded Reload +; EGPR-NEXT: # encoding: [0xc4,0xe2,0x7b,0x4b,0xa4,0x0c,0xc0,0x07,0x00,0x00] +; EGPR-NEXT: tileloadd 3008(%rsp,%rcx), %tmm5 # 1024-byte Folded Reload +; EGPR-NEXT: # encoding: [0xc4,0xe2,0x7b,0x4b,0xac,0x0c,0xc0,0x0b,0x00,0x00] +; EGPR-NEXT: tilestored %tmm4, (%rsi,%rdx) # EVEX TO VEX Compression encoding: [0xc4,0xe2,0x7a,0x4b,0x24,0x16] +; EGPR-NEXT: tilestored %tmm5, (%rsi,%rdx) # EVEX TO VEX Compression encoding: [0xc4,0xe2,0x7a,0x4b,0x2c,0x16] +; EGPR-NEXT: tileloadd -64(%rsp,%rcx), %tmm4 # 1024-byte Folded Reload +; EGPR-NEXT: # encoding: [0xc4,0xe2,0x7b,0x4b,0x64,0x0c,0xc0] +; EGPR-NEXT: tileloadd 960(%rsp,%rcx), %tmm5 # 1024-byte Folded Reload +; EGPR-NEXT: # encoding: [0xc4,0xe2,0x7b,0x4b,0xac,0x0c,0xc0,0x03,0x00,0x00] +; EGPR-NEXT: tilestored %tmm4, (%rsi,%rdx) # EVEX TO VEX Compression encoding: [0xc4,0xe2,0x7a,0x4b,0x24,0x16] +; EGPR-NEXT: tilestored %tmm5, (%rsi,%rdx) # EVEX TO VEX Compression encoding: [0xc4,0xe2,0x7a,0x4b,0x2c,0x16] +; EGPR-NEXT: tilestored %tmm6, (%rsi,%rdx) # EVEX TO VEX Compression encoding: [0xc4,0xe2,0x7a,0x4b,0x34,0x16] +; EGPR-NEXT: tilestored %tmm7, (%rsi,%rdx) # EVEX TO VEX Compression encoding: [0xc4,0xe2,0x7a,0x4b,0x3c,0x16] +; EGPR-NEXT: addq $6088, %rsp # encoding: [0x48,0x81,0xc4,0xc8,0x17,0x00,0x00] +; EGPR-NEXT: # imm = 0x17C8 +; EGPR-NEXT: tilerelease # encoding: [0xc4,0xe2,0x78,0x49,0xc0] +; EGPR-NEXT: vzeroupper # encoding: [0xc5,0xf8,0x77] +; EGPR-NEXT: retq # encoding: [0xc3] %a = call x86_amx @llvm.x86.tileloadd64.internal(i16 8, i16 8, i8* %base, i64 %stride) %b1 = call { x86_amx, x86_amx } @llvm.x86.t2rpntlvwz0.internal(i16 8, i16 8, i16 8, i8* %base, i64 %stride) %b2 = call { x86_amx, x86_amx } @llvm.x86.t2rpntlvwz0t1.internal(i16 8, i16 8, i16 8, i8* %base, i64 %stride) diff --git a/llvm/test/CodeGen/X86/dag-large-offset.ll b/llvm/test/CodeGen/X86/dag-large-offset.ll new file mode 100644 index 0000000000000..2774a93993153 --- /dev/null +++ b/llvm/test/CodeGen/X86/dag-large-offset.ll @@ -0,0 +1,47 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5 +; RUN: llc < %s -mtriple=i386 --frame-pointer=all | FileCheck %s + +; ISel will try to fold pointer arithmetic into the address displacement. However, we don't +; want to do that if the offset is very close to the expressible limit because the final frame +; layout may push it over/under the limit. 
+ +define i32 @foo(i1 %b) #0 { +; CHECK-LABEL: foo: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: pushl %ebp +; CHECK-NEXT: .cfi_def_cfa_offset 8 +; CHECK-NEXT: .cfi_offset %ebp, -8 +; CHECK-NEXT: movl %esp, %ebp +; CHECK-NEXT: .cfi_def_cfa_register %ebp +; CHECK-NEXT: subl $8, %esp +; CHECK-NEXT: movl __stack_chk_guard, %eax +; CHECK-NEXT: movl %eax, -4(%ebp) +; CHECK-NEXT: testb $1, 8(%ebp) +; CHECK-NEXT: jne .LBB0_1 +; CHECK-NEXT: # %bb.2: # %entry +; CHECK-NEXT: xorl %eax, %eax +; CHECK-NEXT: jmp .LBB0_3 +; CHECK-NEXT: .LBB0_1: +; CHECK-NEXT: movl $-2147483647, %eax # imm = 0x80000001 +; CHECK-NEXT: leal -5(%ebp,%eax), %eax +; CHECK-NEXT: .LBB0_3: # %entry +; CHECK-NEXT: movl __stack_chk_guard, %ecx +; CHECK-NEXT: cmpl -4(%ebp), %ecx +; CHECK-NEXT: jne .LBB0_5 +; CHECK-NEXT: # %bb.4: # %entry +; CHECK-NEXT: addl $8, %esp +; CHECK-NEXT: popl %ebp +; CHECK-NEXT: .cfi_def_cfa %esp, 4 +; CHECK-NEXT: retl +; CHECK-NEXT: .LBB0_5: # %entry +; CHECK-NEXT: .cfi_def_cfa %ebp, 8 +; CHECK-NEXT: calll __stack_chk_fail +entry: + %a = alloca i8, align 1 + %0 = ptrtoint ptr %a to i32 + %sub = add i32 %0, -2147483647 + %retval.0 = select i1 %b, i32 %sub, i32 0 + ret i32 %retval.0 +} + +attributes #0 = { sspreq } diff --git a/llvm/test/CodeGen/X86/movrs-builtins.ll b/llvm/test/CodeGen/X86/movrs-builtins.ll index c1722c831c95d..ccf0833e53990 100644 --- a/llvm/test/CodeGen/X86/movrs-builtins.ll +++ b/llvm/test/CodeGen/X86/movrs-builtins.ll @@ -1,11 +1,17 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 3 ; RUN: llc < %s -verify-machineinstrs -mtriple=x86_64-unknown-unknown --show-mc-encoding -mattr=+movrs | FileCheck %s +; RUN: llc < %s -verify-machineinstrs -mtriple=x86_64-unknown-unknown --show-mc-encoding -mattr=+movrs,+egpr | FileCheck %s --check-prefix=EGPR define i8 @test_movrs_si8(ptr %__A) { ; CHECK-LABEL: test_movrs_si8: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: movrsb (%rdi), %al # encoding: [0x0f,0x38,0x8a,0x07] ; CHECK-NEXT: retq # encoding: [0xc3] +; +; EGPR-LABEL: test_movrs_si8: +; EGPR: # %bb.0: # %entry +; EGPR-NEXT: movrsb (%rdi), %al # EVEX TO LEGACY Compression encoding: [0x0f,0x38,0x8a,0x07] +; EGPR-NEXT: retq # encoding: [0xc3] entry: %0 = call i8 @llvm.x86.movrsqi(ptr %__A) ret i8 %0 @@ -17,6 +23,11 @@ define i16 @test_movrs_si16(ptr %__A) { ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: movrsw (%rdi), %ax # encoding: [0x66,0x0f,0x38,0x8b,0x07] ; CHECK-NEXT: retq # encoding: [0xc3] +; +; EGPR-LABEL: test_movrs_si16: +; EGPR: # %bb.0: # %entry +; EGPR-NEXT: movrsw (%rdi), %ax # EVEX TO LEGACY Compression encoding: [0x66,0x0f,0x38,0x8b,0x07] +; EGPR-NEXT: retq # encoding: [0xc3] entry: %0 = call i16 @llvm.x86.movrshi(ptr %__A) ret i16 %0 @@ -28,6 +39,11 @@ define i32 @test_movrs_si32(ptr %__A) { ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: movrsl (%rdi), %eax # encoding: [0x0f,0x38,0x8b,0x07] ; CHECK-NEXT: retq # encoding: [0xc3] +; +; EGPR-LABEL: test_movrs_si32: +; EGPR: # %bb.0: # %entry +; EGPR-NEXT: movrsl (%rdi), %eax # EVEX TO LEGACY Compression encoding: [0x0f,0x38,0x8b,0x07] +; EGPR-NEXT: retq # encoding: [0xc3] entry: %0 = call i32 @llvm.x86.movrssi(ptr %__A) ret i32 %0 @@ -39,6 +55,11 @@ define i64 @test_movrs_si64(ptr %__A) { ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: movrsq (%rdi), %rax # encoding: [0x48,0x0f,0x38,0x8b,0x07] ; CHECK-NEXT: retq # encoding: [0xc3] +; +; EGPR-LABEL: test_movrs_si64: +; EGPR: # %bb.0: # %entry +; EGPR-NEXT: movrsq (%rdi), %rax # EVEX TO LEGACY Compression encoding: [0x48,0x0f,0x38,0x8b,0x07] +; EGPR-NEXT: 
retq # encoding: [0xc3] entry: %0 = call i64 @llvm.x86.movrsdi(ptr %__A) ret i64 %0 diff --git a/llvm/test/CodeGen/X86/xor-lea.ll b/llvm/test/CodeGen/X86/xor-lea.ll index 10e9525a2706a..d50752e48d293 100644 --- a/llvm/test/CodeGen/X86/xor-lea.ll +++ b/llvm/test/CodeGen/X86/xor-lea.ll @@ -327,7 +327,8 @@ define i32 @xor_shl_sminval_i32(i32 %x) { ; X86-LABEL: xor_shl_sminval_i32: ; X86: # %bb.0: ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-NEXT: leal -2147483648(,%eax,8), %eax +; X86-NEXT: movl $-2147483648, %ecx # imm = 0x80000000 +; X86-NEXT: leal (%ecx,%eax,8), %eax ; X86-NEXT: retl ; ; X64-LABEL: xor_shl_sminval_i32: diff --git a/llvm/test/MC/Disassembler/X86/AMX/x86-64-amx-movrs.txt b/llvm/test/MC/Disassembler/X86/AMX/x86-64-amx-movrs.txt index 6df44c87d2332..57e3153da401b 100755 --- a/llvm/test/MC/Disassembler/X86/AMX/x86-64-amx-movrs.txt +++ b/llvm/test/MC/Disassembler/X86/AMX/x86-64-amx-movrs.txt @@ -96,3 +96,99 @@ # ATT: tileloaddrst1 -32(,%rbp,2), %tmm3 # INTEL: tileloaddrst1 tmm3, [2*rbp - 32] 0xc4,0xe2,0x79,0x4a,0x1c,0x6d,0xe0,0xff,0xff,0xff + +# ATT: t2rpntlvwz0rs 268435456(%r16,%r14,8), %tmm6 +# INTEL: t2rpntlvwz0rs tmm6, [r16 + 8*r14 + 268435456] +0x62,0xbd,0x7c,0x08,0xf8,0xb4,0xf0,0x00,0x00,0x00,0x10 + +# ATT: t2rpntlvwz0rs 291(%r8,%r17,4), %tmm2 +# INTEL: t2rpntlvwz0rs tmm2, [r8 + 4*r17 + 291] +0x62,0xd5,0x78,0x08,0xf8,0x94,0x88,0x23,0x01,0x00,0x00 + +# ATT: t2rpntlvwz0rs 64(%r18), %tmm6 +# INTEL: t2rpntlvwz0rs tmm6, [r18 + 64] +0x62,0xfd,0x7c,0x08,0xf8,0x74,0x22,0x40 + +# ATT: t2rpntlvwz0rs -32(,%rbp,2), %tmm2 +# INTEL: t2rpntlvwz0rs tmm2, [2*rbp - 32] +0x62,0xf5,0x7c,0x08,0xf8,0x14,0x6d,0xe0,0xff,0xff,0xff + +# ATT: t2rpntlvwz0rst1 268435456(%r16,%r14,8), %tmm6 +# INTEL: t2rpntlvwz0rst1 tmm6, [r16 + 8*r14 + 268435456] +0x62,0xbd,0x7c,0x08,0xf9,0xb4,0xf0,0x00,0x00,0x00,0x10 + +# ATT: t2rpntlvwz0rst1 291(%r8,%r17,4), %tmm2 +# INTEL: t2rpntlvwz0rst1 tmm2, [r8 + 4*r17 + 291] +0x62,0xd5,0x78,0x08,0xf9,0x94,0x88,0x23,0x01,0x00,0x00 + +# ATT: t2rpntlvwz0rst1 64(%r18), %tmm6 +# INTEL: t2rpntlvwz0rst1 tmm6, [r18 + 64] +0x62,0xfd,0x7c,0x08,0xf9,0x74,0x22,0x40 + +# ATT: t2rpntlvwz0rst1 -32(,%rbp,2), %tmm2 +# INTEL: t2rpntlvwz0rst1 tmm2, [2*rbp - 32] +0x62,0xf5,0x7c,0x08,0xf9,0x14,0x6d,0xe0,0xff,0xff,0xff + +# ATT: t2rpntlvwz1rs 268435456(%r16,%r14,8), %tmm6 +# INTEL: t2rpntlvwz1rs tmm6, [r16 + 8*r14 + 268435456] +0x62,0xbd,0x7d,0x08,0xf8,0xb4,0xf0,0x00,0x00,0x00,0x10 + +# ATT: t2rpntlvwz1rs 291(%r8,%r17,4), %tmm2 +# INTEL: t2rpntlvwz1rs tmm2, [r8 + 4*r17 + 291] +0x62,0xd5,0x79,0x08,0xf8,0x94,0x88,0x23,0x01,0x00,0x00 + +# ATT: t2rpntlvwz1rs 64(%r18), %tmm6 +# INTEL: t2rpntlvwz1rs tmm6, [r18 + 64] +0x62,0xfd,0x7d,0x08,0xf8,0x74,0x22,0x40 + +# ATT: t2rpntlvwz1rs -32(,%rbp,2), %tmm2 +# INTEL: t2rpntlvwz1rs tmm2, [2*rbp - 32] +0x62,0xf5,0x7d,0x08,0xf8,0x14,0x6d,0xe0,0xff,0xff,0xff + +# ATT: t2rpntlvwz1rst1 268435456(%r16,%r14,8), %tmm6 +# INTEL: t2rpntlvwz1rst1 tmm6, [r16 + 8*r14 + 268435456] +0x62,0xbd,0x7d,0x08,0xf9,0xb4,0xf0,0x00,0x00,0x00,0x10 + +# ATT: t2rpntlvwz1rst1 291(%r8,%r17,4), %tmm2 +# INTEL: t2rpntlvwz1rst1 tmm2, [r8 + 4*r17 + 291] +0x62,0xd5,0x79,0x08,0xf9,0x94,0x88,0x23,0x01,0x00,0x00 + +# ATT: t2rpntlvwz1rst1 64(%r18), %tmm6 +# INTEL: t2rpntlvwz1rst1 tmm6, [r18 + 64] +0x62,0xfd,0x7d,0x08,0xf9,0x74,0x22,0x40 + +# ATT: t2rpntlvwz1rst1 -32(,%rbp,2), %tmm2 +# INTEL: t2rpntlvwz1rst1 tmm2, [2*rbp - 32] +0x62,0xf5,0x7d,0x08,0xf9,0x14,0x6d,0xe0,0xff,0xff,0xff + +# ATT: tileloaddrs 268435456(%r16,%r14,8), %tmm6 +# INTEL: tileloaddrs tmm6, [r16 + 8*r14 + 268435456] 
+0x62,0xba,0x7f,0x08,0x4a,0xb4,0xf0,0x00,0x00,0x00,0x10 + +# ATT: tileloaddrs 291(%r8,%r17,4), %tmm3 +# INTEL: tileloaddrs tmm3, [r8 + 4*r17 + 291] +0x62,0xd2,0x7b,0x08,0x4a,0x9c,0x88,0x23,0x01,0x00,0x00 + +# ATT: tileloaddrs 64(%r18), %tmm6 +# INTEL: tileloaddrs tmm6, [r18 + 64] +0x62,0xfa,0x7f,0x08,0x4a,0x74,0x22,0x40 + +# ATT: tileloaddrs -32(,%rbp,2), %tmm3 +# INTEL: tileloaddrs tmm3, [2*rbp - 32] +0x62,0xf2,0x7f,0x08,0x4a,0x1c,0x6d,0xe0,0xff,0xff,0xff + +# ATT: tileloaddrst1 268435456(%r16,%r14,8), %tmm6 +# INTEL: tileloaddrst1 tmm6, [r16 + 8*r14 + 268435456] +0x62,0xba,0x7d,0x08,0x4a,0xb4,0xf0,0x00,0x00,0x00,0x10 + +# ATT: tileloaddrst1 291(%r8,%r17,4), %tmm3 +# INTEL: tileloaddrst1 tmm3, [r8 + 4*r17 + 291] +0x62,0xd2,0x79,0x08,0x4a,0x9c,0x88,0x23,0x01,0x00,0x00 + +# ATT: tileloaddrst1 64(%r18), %tmm6 +# INTEL: tileloaddrst1 tmm6, [r18 + 64] +0x62,0xfa,0x7d,0x08,0x4a,0x74,0x22,0x40 + +# ATT: tileloaddrst1 -32(,%rbp,2), %tmm3 +# INTEL: tileloaddrst1 tmm3, [2*rbp - 32] +0x62,0xf2,0x7d,0x08,0x4a,0x1c,0x6d,0xe0,0xff,0xff,0xff diff --git a/llvm/test/MC/Disassembler/X86/amx-transpose-att.txt b/llvm/test/MC/Disassembler/X86/amx-transpose-att.txt index 8c6f1be80ba2d..d768630ac1475 100644 --- a/llvm/test/MC/Disassembler/X86/amx-transpose-att.txt +++ b/llvm/test/MC/Disassembler/X86/amx-transpose-att.txt @@ -49,6 +49,54 @@ # INTEL: t2rpntlvwz1t1 tmm2, [2*rbp - 32] 0xc4,0xe2,0x79,0x6f,0x14,0x6d,0xe0,0xff,0xff,0xff +# ATT: t2rpntlvwz0 268435456(%r16,%r14,8), %tmm4 +# INTEL: t2rpntlvwz0 tmm4, [r16 + 8*r14 + 268435456] +0x62,0xba,0x7c,0x08,0x6e,0xa4,0xf0,0x00,0x00,0x00,0x10 + +# ATT: t2rpntlvwz0 291(%r8,%r17,4), %tmm2 +# INTEL: t2rpntlvwz0 tmm2, [r8 + 4*r17 + 291] +0x62,0xd2,0x78,0x08,0x6e,0x94,0x88,0x23,0x01,0x00,0x00 + +# ATT: t2rpntlvwz0 -32(,%rbp,2), %tmm2 +# INTEL: t2rpntlvwz0 tmm2, [2*rbp - 32] +0x62,0xf2,0x7c,0x08,0x6e,0x14,0x6d,0xe0,0xff,0xff,0xff + +# ATT: t2rpntlvwz0t1 268435456(%r16,%r14,8), %tmm4 +# INTEL: t2rpntlvwz0t1 tmm4, [r16 + 8*r14 + 268435456] +0x62,0xba,0x7c,0x08,0x6f,0xa4,0xf0,0x00,0x00,0x00,0x10 + +# ATT: t2rpntlvwz0t1 291(%r8,%r17,4), %tmm2 +# INTEL: t2rpntlvwz0t1 tmm2, [r8 + 4*r17 + 291] +0x62,0xd2,0x78,0x08,0x6f,0x94,0x88,0x23,0x01,0x00,0x00 + +# ATT: t2rpntlvwz0t1 -32(,%rbp,2), %tmm2 +# INTEL: t2rpntlvwz0t1 tmm2, [2*rbp - 32] +0x62,0xf2,0x7c,0x08,0x6f,0x14,0x6d,0xe0,0xff,0xff,0xff + +# ATT: t2rpntlvwz1 268435456(%r16,%r14,8), %tmm4 +# INTEL: t2rpntlvwz1 tmm4, [r16 + 8*r14 + 268435456] +0x62,0xba,0x7d,0x08,0x6e,0xa4,0xf0,0x00,0x00,0x00,0x10 + +# ATT: t2rpntlvwz1 291(%r8,%r17,4), %tmm2 +# INTEL: t2rpntlvwz1 tmm2, [r8 + 4*r17 + 291] +0x62,0xd2,0x79,0x08,0x6e,0x94,0x88,0x23,0x01,0x00,0x00 + +# ATT: t2rpntlvwz1 -32(,%rbp,2), %tmm2 +# INTEL: t2rpntlvwz1 tmm2, [2*rbp - 32] +0x62,0xf2,0x7d,0x08,0x6e,0x14,0x6d,0xe0,0xff,0xff,0xff + +# ATT: t2rpntlvwz1t1 268435456(%r16,%r14,8), %tmm4 +# INTEL: t2rpntlvwz1t1 tmm4, [r16 + 8*r14 + 268435456] +0x62,0xba,0x7d,0x08,0x6f,0xa4,0xf0,0x00,0x00,0x00,0x10 + +# ATT: t2rpntlvwz1t1 291(%r8,%r17,4), %tmm2 +# INTEL: t2rpntlvwz1t1 tmm2, [r8 + 4*r17 + 291] +0x62,0xd2,0x79,0x08,0x6f,0x94,0x88,0x23,0x01,0x00,0x00 + +# ATT: t2rpntlvwz1t1 -32(,%rbp,2), %tmm2 +# INTEL: t2rpntlvwz1t1 tmm2, [2*rbp - 32] +0x62,0xf2,0x7d,0x08,0x6f,0x14,0x6d,0xe0,0xff,0xff,0xff + # ATT: ttransposed %tmm1, %tmm2 # INTEL: ttransposed tmm2, tmm1 0xc4,0xe2,0x7a,0x5f,0xd1 diff --git a/llvm/test/MC/Disassembler/X86/movrs.txt b/llvm/test/MC/Disassembler/X86/movrs.txt index fa91b542d3f73..caac8bc8b7b30 100644 --- a/llvm/test/MC/Disassembler/X86/movrs.txt +++ 
b/llvm/test/MC/Disassembler/X86/movrs.txt @@ -95,4 +95,100 @@ # ATT: movrsq -128(%rdx), %rbx # INTEL: movrs rbx, qword ptr [rdx - 128] -0x48,0x0f,0x38,0x8b,0x5a,0x80 \ No newline at end of file +0x48,0x0f,0x38,0x8b,0x5a,0x80 + +# ATT: movrsb 268435456(%rbp,%r14,8), %r16b +# INTEL: movrs r16b, byte ptr [rbp + 8*r14 + 268435456] +0x62,0xa4,0x7c,0x08,0x8a,0x84,0xf5,0x00,0x00,0x00,0x10 + +# ATT: movrsb 291(%r17,%rax,4), %bl +# INTEL: movrs bl, byte ptr [r17 + 4*rax + 291] +0x62,0xfc,0x7c,0x08,0x8a,0x9c,0x81,0x23,0x01,0x00,0x00 + +# ATT: movrsb (%rip), %bl +# INTEL: movrs bl, byte ptr [rip] +0x62,0xf4,0x7c,0x08,0x8a,0x1d,0x00,0x00,0x00,0x00 + +# ATT: movrsb -32(,%rbp,2), %r18b +# INTEL: movrs r18b, byte ptr [2*rbp - 32] +0x62,0xe4,0x7c,0x08,0x8a,0x14,0x6d,0xe0,0xff,0xff,0xff + +# ATT: movrsb 127(%r19), %bl +# INTEL: movrs bl, byte ptr [r19 + 127] +0x62,0xfc,0x7c,0x08,0x8a,0x5b,0x7f + +# ATT: movrsb -128(%r20,%riz), %bl +# INTEL: movrs bl, byte ptr [r20 + riz - 128] +0x62,0xfc,0x7c,0x08,0x8a,0x5c,0x24,0x80 + +# ATT: movrsw 268435456(%rbp,%r14,8), %r16w +# INTEL: movrs r16w, word ptr [rbp + 8*r14 + 268435456] +0x62,0xa4,0x7d,0x08,0x8b,0x84,0xf5,0x00,0x00,0x00,0x10 + +# ATT: movrsw 291(%r17,%rax,4), %bx +# INTEL: movrs bx, word ptr [r17 + 4*rax + 291] +0x62,0xfc,0x7d,0x08,0x8b,0x9c,0x81,0x23,0x01,0x00,0x00 + +# ATT: movrsw (%rip), %bx +# INTEL: movrs bx, word ptr [rip] +0x62,0xf4,0x7d,0x08,0x8b,0x1d,0x00,0x00,0x00,0x00 + +# ATT: movrsw -32(,%rbp,2), %r18w +# INTEL: movrs r18w, word ptr [2*rbp - 32] +0x62,0xe4,0x7d,0x08,0x8b,0x14,0x6d,0xe0,0xff,0xff,0xff + +# ATT: movrsw 127(%r19), %bx +# INTEL: movrs bx, word ptr [r19 + 127] +0x62,0xfc,0x7d,0x08,0x8b,0x5b,0x7f + +# ATT: movrsw -128(%r20,%riz), %bx +# INTEL: movrs bx, word ptr [r20 + riz - 128] +0x62,0xfc,0x7d,0x08,0x8b,0x5c,0x24,0x80 + +# ATT: movrsl 268435456(%rbp,%r14,8), %r16d +# INTEL: movrs r16d, dword ptr [rbp + 8*r14 + 268435456] +0x62,0xa4,0x7c,0x08,0x8b,0x84,0xf5,0x00,0x00,0x00,0x10 + +# ATT: movrsl 291(%r17,%rax,4), %ebx +# INTEL: movrs ebx, dword ptr [r17 + 4*rax + 291] +0x62,0xfc,0x7c,0x08,0x8b,0x9c,0x81,0x23,0x01,0x00,0x00 + +# ATT: movrsl (%rip), %ebx +# INTEL: movrs ebx, dword ptr [rip] +0x62,0xf4,0x7c,0x08,0x8b,0x1d,0x00,0x00,0x00,0x00 + +# ATT: movrsl -32(,%rbp,2), %r18d +# INTEL: movrs r18d, dword ptr [2*rbp - 32] +0x62,0xe4,0x7c,0x08,0x8b,0x14,0x6d,0xe0,0xff,0xff,0xff + +# ATT: movrsl 127(%r19), %ebx +# INTEL: movrs ebx, dword ptr [r19 + 127] +0x62,0xfc,0x7c,0x08,0x8b,0x5b,0x7f + +# ATT: movrsl -128(%r20,%riz), %ebx +# INTEL: movrs ebx, dword ptr [r20 + riz - 128] +0x62,0xfc,0x7c,0x08,0x8b,0x5c,0x24,0x80 + +# ATT: movrsq 268435456(%rbp,%r14,8), %r16 +# INTEL: movrs r16, qword ptr [rbp + 8*r14 + 268435456] +0x62,0xa4,0xfc,0x08,0x8b,0x84,0xf5,0x00,0x00,0x00,0x10 + +# ATT: movrsq 291(%r17,%rax,4), %rbx +# INTEL: movrs rbx, qword ptr [r17 + 4*rax + 291] +0x62,0xfc,0xfc,0x08,0x8b,0x9c,0x81,0x23,0x01,0x00,0x00 + +# ATT: movrsq (%rip), %rbx +# INTEL: movrs rbx, qword ptr [rip] +0x62,0xf4,0xfc,0x08,0x8b,0x1d,0x00,0x00,0x00,0x00 + +# ATT: movrsq -32(,%rbp,2), %r18 +# INTEL: movrs r18, qword ptr [2*rbp - 32] +0x62,0xe4,0xfc,0x08,0x8b,0x14,0x6d,0xe0,0xff,0xff,0xff + +# ATT: movrsq 127(%r19), %rbx +# INTEL: movrs rbx, qword ptr [r19 + 127] +0x62,0xfc,0xfc,0x08,0x8b,0x5b,0x7f + +# ATT: movrsq -128(%r20,%riz), %rbx +# INTEL: movrs rbx, qword ptr [r20 + riz - 128] +0x62,0xfc,0xfc,0x08,0x8b,0x5c,0x24,0x80 diff --git a/llvm/test/MC/LoongArch/Macros/aliases-la.s b/llvm/test/MC/LoongArch/Macros/aliases-la.s index dd5a4d474e001..1b5b818f4348f 
100644 --- a/llvm/test/MC/LoongArch/Macros/aliases-la.s +++ b/llvm/test/MC/LoongArch/Macros/aliases-la.s @@ -3,13 +3,26 @@ # RUN: llvm-mc --triple=loongarch64 %s \ # RUN: | FileCheck %s --check-prefix=NORMAL +# RUN: llvm-mc --filetype=obj --triple=loongarch64 --mattr=-relax %s -o %t +# RUN: llvm-readobj -r %t | FileCheck %s --check-prefix=RELOC +# RUN: llvm-mc --filetype=obj --triple=loongarch64 --mattr=+relax %s -o %t.relax +# RUN: llvm-readobj -r %t.relax | FileCheck %s --check-prefixes=RELOC,RELAX # RUN: llvm-mc --triple=loongarch64 --mattr=+la-global-with-pcrel < %s \ # RUN: | FileCheck %s --check-prefix=GTOPCR +# RUN: llvm-mc --filetype=obj --triple=loongarch64 --mattr=+la-global-with-pcrel \ +# RUN: --mattr=-relax %s -o %t +# RUN: llvm-readobj -r %t | FileCheck %s --check-prefix=GTOPCR-RELOC +# RUN: llvm-mc --filetype=obj --triple=loongarch64 --mattr=+la-global-with-pcrel \ +# RUN: --mattr=+relax %s -o %t.relax +# RUN: llvm-readobj -r %t.relax | FileCheck %s --check-prefixes=GTOPCR-RELOC,GTOPCR-RELAX # RUN: llvm-mc --triple=loongarch64 --mattr=+la-global-with-abs < %s \ # RUN: | FileCheck %s --check-prefix=GTOABS # RUN: llvm-mc --triple=loongarch64 --mattr=+la-local-with-abs < %s \ # RUN: | FileCheck %s --check-prefix=LTOABS +# RELOC: Relocations [ +# RELOC-NEXT: Section ({{.*}}) .rela.text { + la $a0, sym # NORMAL: pcalau12i $a0, %got_pc_hi20(sym) # NORMAL-NEXT: ld.d $a0, $a0, %got_pc_lo12(sym) @@ -22,6 +35,16 @@ la $a0, sym # GTOABS-NEXT: lu32i.d $a0, %abs64_lo20(sym) # GTOABS-NEXT: lu52i.d $a0, $a0, %abs64_hi12(sym) +# RELOC-NEXT: R_LARCH_GOT_PC_HI20 sym 0x0 +# RELAX-NEXT: R_LARCH_RELAX - 0x0 +# RELOC-NEXT: R_LARCH_GOT_PC_LO12 sym 0x0 +# RELAX-NEXT: R_LARCH_RELAX - 0x0 + +# GTOPCR-RELOC: R_LARCH_PCALA_HI20 sym 0x0 +# GTOPCR-RELAX: R_LARCH_RELAX - 0x0 +# GTOPCR-RELOC-NEXT: R_LARCH_PCALA_LO12 sym 0x0 +# GTOPCR-RELAX-NEXT: R_LARCH_RELAX - 0x0 + la.global $a0, sym_global # NORMAL: pcalau12i $a0, %got_pc_hi20(sym_global) # NORMAL-NEXT: ld.d $a0, $a0, %got_pc_lo12(sym_global) @@ -34,6 +57,16 @@ la.global $a0, sym_global # GTOABS-NEXT: lu32i.d $a0, %abs64_lo20(sym_global) # GTOABS-NEXT: lu52i.d $a0, $a0, %abs64_hi12(sym_global) +# RELOC-NEXT: R_LARCH_GOT_PC_HI20 sym_global 0x0 +# RELAX-NEXT: R_LARCH_RELAX - 0x0 +# RELOC-NEXT: R_LARCH_GOT_PC_LO12 sym_global 0x0 +# RELAX-NEXT: R_LARCH_RELAX - 0x0 + +# GTOPCR-RELOC-NEXT: R_LARCH_PCALA_HI20 sym_global 0x0 +# GTOPCR-RELAX-NEXT: R_LARCH_RELAX - 0x0 +# GTOPCR-RELOC-NEXT: R_LARCH_PCALA_LO12 sym_global 0x0 +# GTOPCR-RELAX-NEXT: R_LARCH_RELAX - 0x0 + la.global $a0, $a1, sym_global_large # NORMAL: pcalau12i $a0, %got_pc_hi20(sym_global_large) # NORMAL-NEXT: addi.d $a1, $zero, %got_pc_lo12(sym_global_large) @@ -52,6 +85,11 @@ la.global $a0, $a1, sym_global_large # GTOABS-NEXT: lu32i.d $a0, %abs64_lo20(sym_global_large) # GTOABS-NEXT: lu52i.d $a0, $a0, %abs64_hi12(sym_global_large) +# RELOC-NEXT: R_LARCH_GOT_PC_HI20 sym_global_large 0x0 +# RELOC-NEXT: R_LARCH_GOT_PC_LO12 sym_global_large 0x0 +# RELOC-NEXT: R_LARCH_GOT64_PC_LO20 sym_global_large 0x0 +# RELOC-NEXT: R_LARCH_GOT64_PC_HI12 sym_global_large 0x0 + la.local $a0, sym_local # NORMAL: pcalau12i $a0, %pc_hi20(sym_local) # NORMAL-NEXT: addi.d $a0, $a0, %pc_lo12(sym_local) @@ -61,6 +99,11 @@ la.local $a0, sym_local # LTOABS-NEXT: lu32i.d $a0, %abs64_lo20(sym_local) # LTOABS-NEXT: lu52i.d $a0, $a0, %abs64_hi12(sym_local) +# RELOC-NEXT: R_LARCH_PCALA_HI20 sym_local 0x0 +# RELAX-NEXT: R_LARCH_RELAX - 0x0 +# RELOC-NEXT: R_LARCH_PCALA_LO12 sym_local 0x0 +# RELAX-NEXT: R_LARCH_RELAX - 0x0 + la.local 
$a0, $a1, sym_local_large # NORMAL: pcalau12i $a0, %pc_hi20(sym_local_large) # NORMAL-NEXT: addi.d $a1, $zero, %pc_lo12(sym_local_large) @@ -72,3 +115,12 @@ la.local $a0, $a1, sym_local_large # LTOABS-NEXT: ori $a0, $a0, %abs_lo12(sym_local_large) # LTOABS-NEXT: lu32i.d $a0, %abs64_lo20(sym_local_large) # LTOABS-NEXT: lu52i.d $a0, $a0, %abs64_hi12(sym_local_large) + +# RELOC-NEXT: R_LARCH_PCALA_HI20 sym_local_large 0x0 +# RELOC-NEXT: R_LARCH_PCALA_LO12 sym_local_large 0x0 +# RELOC-NEXT: R_LARCH_PCALA64_LO20 sym_local_large 0x0 +# RELOC-NEXT: R_LARCH_PCALA64_HI12 sym_local_large 0x0 + + +# RELOC-NEXT: } +# RELOC-NEXT: ] diff --git a/llvm/test/MC/LoongArch/Macros/macros-call.s b/llvm/test/MC/LoongArch/Macros/macros-call.s index a648a39780381..df7715050a0f9 100644 --- a/llvm/test/MC/LoongArch/Macros/macros-call.s +++ b/llvm/test/MC/LoongArch/Macros/macros-call.s @@ -1,9 +1,26 @@ # RUN: llvm-mc --triple=loongarch64 %s | FileCheck %s +# RUN: llvm-mc --filetype=obj --triple=loongarch64 --mattr=-relax %s -o %t +# RUN: llvm-readobj -r %t | FileCheck %s --check-prefix=RELOC +# RUN: llvm-mc --filetype=obj --triple=loongarch64 --mattr=+relax %s -o %t.relax +# RUN: llvm-readobj -r %t.relax | FileCheck %s --check-prefixes=RELOC,RELAX + +# RELOC: Relocations [ +# RELOC-NEXT: Section ({{.*}}) .rela.text { call36 sym_call # CHECK: pcaddu18i $ra, %call36(sym_call) # CHECK-NEXT: jirl $ra, $ra, 0 +# RELOC-NEXT: R_LARCH_CALL36 sym_call 0x0 +# RELAX-NEXT: R_LARCH_RELAX - 0x0 + tail36 $t0, sym_tail # CHECK: pcaddu18i $t0, %call36(sym_tail) # CHECK-NEXT: jr $t0 + +# RELOC-NEXT: R_LARCH_CALL36 sym_tail 0x0 +# RELAX-NEXT: R_LARCH_RELAX - 0x0 + + +# RELOC-NEXT: } +# RELOC-NEXT: ] diff --git a/llvm/test/MC/LoongArch/Macros/macros-la.s b/llvm/test/MC/LoongArch/Macros/macros-la.s index d4272b93ba54d..a732988ef1f1a 100644 --- a/llvm/test/MC/LoongArch/Macros/macros-la.s +++ b/llvm/test/MC/LoongArch/Macros/macros-la.s @@ -5,6 +5,12 @@ # RUN: llvm-readobj -r %t.relax | FileCheck %s --check-prefixes=RELOC,RELAX # RUN: llvm-mc --triple=loongarch64 --mattr=+la-global-with-abs \ # RUN: %s | FileCheck %s --check-prefix=ABS +# RUN: llvm-mc --filetype=obj --triple=loongarch64 --mattr=+la-global-with-abs \ +# RUN: --mattr=-relax %s -o %t +# RUN: llvm-readobj -r %t | FileCheck %s --check-prefix=GTOABS-RELOC +# RUN: llvm-mc --filetype=obj --triple=loongarch64 --mattr=+la-global-with-abs \ +# RUN: --mattr=+relax %s -o %t.relax +# RUN: llvm-readobj -r %t.relax | FileCheck %s --check-prefixes=GTOABS-RELOC,GTOABS-RELAX # RELOC: Relocations [ # RELOC-NEXT: Section ({{.*}}) .rela.text { @@ -36,6 +42,10 @@ la.pcrel $a0, sym_pcrel # RELAX-NEXT: R_LARCH_RELAX - 0x0 # RELOC-NEXT: R_LARCH_PCALA_LO12 sym_pcrel 0x0 # RELAX-NEXT: R_LARCH_RELAX - 0x0 +# GTOABS-RELOC: R_LARCH_PCALA_HI20 sym_pcrel 0x0 +# GTOABS-RELAX-NEXT: R_LARCH_RELAX - 0x0 +# GTOABS-RELOC-NEXT: R_LARCH_PCALA_LO12 sym_pcrel 0x0 +# GTOABS-RELAX-NEXT: R_LARCH_RELAX - 0x0 la.got $a0, sym_got # CHECK-NEXT: pcalau12i $a0, %got_pc_hi20(sym_got) @@ -73,7 +83,9 @@ la.tls.ie $a0, sym_ie # ABS-NEXT: ld.d $a0, $a0, 0 # ABS-EMPTY: # RELOC-NEXT: R_LARCH_TLS_IE_PC_HI20 sym_ie 0x0 +# RELAX-NEXT: R_LARCH_RELAX - 0x0 # RELOC-NEXT: R_LARCH_TLS_IE_PC_LO12 sym_ie 0x0 +# RELAX-NEXT: R_LARCH_RELAX - 0x0 la.tls.ld $a0, sym_ld # CHECK-NEXT: pcalau12i $a0, %ld_pc_hi20(sym_ld) @@ -85,7 +97,9 @@ la.tls.ld $a0, sym_ld # ABS-NEXT: lu52i.d $a0, $a0, %got64_hi12(sym_ld) # ABS-EMPTY: # RELOC-NEXT: R_LARCH_TLS_LD_PC_HI20 sym_ld 0x0 +# RELAX-NEXT: R_LARCH_RELAX - 0x0 # RELOC-NEXT: R_LARCH_GOT_PC_LO12 sym_ld 0x0 
+# RELAX-NEXT: R_LARCH_RELAX - 0x0 la.tls.gd $a0, sym_gd # CHECK-NEXT: pcalau12i $a0, %gd_pc_hi20(sym_gd) @@ -97,7 +111,9 @@ la.tls.gd $a0, sym_gd # ABS-NEXT: lu52i.d $a0, $a0, %got64_hi12(sym_gd) # ABS-EMPTY: # RELOC-NEXT: R_LARCH_TLS_GD_PC_HI20 sym_gd 0x0 +# RELAX-NEXT: R_LARCH_RELAX - 0x0 # RELOC-NEXT: R_LARCH_GOT_PC_LO12 sym_gd 0x0 +# RELAX-NEXT: R_LARCH_RELAX - 0x0 la.tls.desc $a0, sym_desc # CHECK-NEXT: pcalau12i $a0, %desc_pc_hi20(sym_desc) @@ -113,9 +129,13 @@ la.tls.desc $a0, sym_desc # ABS-NEXT: jirl $ra, $ra, %desc_call(sym_desc) # ABS-EMPTY: # RELOC-NEXT: R_LARCH_TLS_DESC_PC_HI20 sym_desc 0x0 +# RELAX-NEXT: R_LARCH_RELAX - 0x0 # RELOC-NEXT: R_LARCH_TLS_DESC_PC_LO12 sym_desc 0x0 +# RELAX-NEXT: R_LARCH_RELAX - 0x0 # RELOC-NEXT: R_LARCH_TLS_DESC_LD sym_desc 0x0 +# RELAX-NEXT: R_LARCH_RELAX - 0x0 # RELOC-NEXT: R_LARCH_TLS_DESC_CALL sym_desc 0x0 +# RELAX-NEXT: R_LARCH_RELAX - 0x0 ############################################################# ## with a temporary register. diff --git a/llvm/test/MC/LoongArch/Relocations/relax-tls-le.s b/llvm/test/MC/LoongArch/Relocations/relax-tls-le.s new file mode 100644 index 0000000000000..899f12f85654d --- /dev/null +++ b/llvm/test/MC/LoongArch/Relocations/relax-tls-le.s @@ -0,0 +1,70 @@ +# RUN: llvm-mc --filetype=obj --triple=loongarch32 --mattr=+relax < %s \ +# RUN: | llvm-readobj -r - | FileCheck --check-prefix=LA32-RELAX-RELOC %s +# RUN: llvm-mc --filetype=obj --triple=loongarch32 --mattr=-relax < %s \ +# RUN: | llvm-readobj -r - | FileCheck --check-prefix=LA32-NORELAX-RELOC %s +# RUN: llvm-mc --triple=loongarch32 --mattr=+relax < %s --show-encoding \ +# RUN: | FileCheck --check-prefix=LA32-RELAX-FIXUP %s + +# RUN: llvm-mc --filetype=obj --triple=loongarch64 --mattr=+relax --defsym=LA64=1 < %s \ +# RUN: | llvm-readobj -r - | FileCheck --check-prefix=LA64-RELAX-RELOC %s +# RUN: llvm-mc --filetype=obj --triple=loongarch64 --mattr=-relax --defsym=LA64=1 < %s \ +# RUN: | llvm-readobj -r - | FileCheck --check-prefix=LA64-NORELAX-RELOC %s +# RUN: llvm-mc --triple=loongarch64 --mattr=+relax --defsym=LA64=1 < %s --show-encoding \ +# RUN: | FileCheck --check-prefix=LA64-RELAX-FIXUP %s + +.long foo + +.ifndef LA64 + +lu12i.w $a0, %le_hi20_r(foo) +# LA32-NORELAX-RELOC: R_LARCH_TLS_LE_HI20_R foo 0x0 +# LA32-NORELAX-RELOC-NOT: R_LARCH_RELAX - 0x0 +# LA32-RELAX-RELOC: R_LARCH_TLS_LE_HI20_R foo 0x0 +# LA32-RELAX-RELOC: R_LARCH_RELAX - 0x0 +# LA32-RELAX-FIXUP: fixup A - offset: 0, value: %le_hi20_r(foo), kind: FK_NONE +# LA32-RELAX-FIXUP: fixup B - offset: 0, value: 0, kind: FK_NONE + +add.w $a0, $a0, $tp, %le_add_r(foo) +# LA32-NORELAX-RELOC: R_LARCH_TLS_LE_ADD_R foo 0x0 +# LA32-NORELAX-RELOC-NOT: R_LARCH_RELAX - 0x0 +# LA32-RELAX-RELOC: R_LARCH_TLS_LE_ADD_R foo 0x0 +# LA32-RELAX-RELOC: R_LARCH_RELAX - 0x0 +# LA32-RELAX-FIXUP: fixup A - offset: 0, value: %le_add_r(foo), kind: FK_NONE +# LA32-RELAX-FIXUP: fixup B - offset: 0, value: 0, kind: FK_NONE + +addi.w $a0, $a0, %le_lo12_r(foo) +# LA32-NORELAX-RELOC: R_LARCH_TLS_LE_LO12_R foo 0x0 +# LA32-NORELAX-RELOC-NOT: R_LARCH_RELAX - 0x0 +# LA32-RELAX-RELOC: R_LARCH_TLS_LE_LO12_R foo 0x0 +# LA32-RELAX-RELOC: R_LARCH_RELAX - 0x0 +# LA32-RELAX-FIXUP: fixup A - offset: 0, value: %le_lo12_r(foo), kind: FK_NONE +# LA32-RELAX-FIXUP: fixup B - offset: 0, value: 0, kind: FK_NONE + +.else + +lu12i.w $a0, %le_hi20_r(foo) +# LA64-NORELAX-RELOC: R_LARCH_TLS_LE_HI20_R foo 0x0 +# LA64-NORELAX-RELOC-NOT: R_LARCH_RELAX - 0x0 +# LA64-RELAX-RELOC: R_LARCH_TLS_LE_HI20_R foo 0x0 +# LA64-RELAX-RELOC: R_LARCH_RELAX - 0x0 +# 
LA64-RELAX-FIXUP: fixup A - offset: 0, value: %le_hi20_r(foo), kind: FK_NONE +# LA64-RELAX-FIXUP: fixup B - offset: 0, value: 0, kind: FK_NONE + +add.d $a0, $a0, $tp, %le_add_r(foo) +# LA64-NORELAX-RELOC: R_LARCH_TLS_LE_ADD_R foo 0x0 +# LA64-NORELAX-RELOC-NOT: R_LARCH_RELAX - 0x0 +# LA64-RELAX-RELOC: R_LARCH_TLS_LE_ADD_R foo 0x0 +# LA64-RELAX-RELOC: R_LARCH_RELAX - 0x0 +# LA64-RELAX-FIXUP: fixup A - offset: 0, value: %le_add_r(foo), kind: FK_NONE +# LA64-RELAX-FIXUP: fixup B - offset: 0, value: 0, kind: FK_NONE + +addi.d $a0, $a0, %le_lo12_r(foo) +# LA64-NORELAX-RELOC: R_LARCH_TLS_LE_LO12_R foo 0x0 +# LA64-NORELAX-RELOC-NOT: R_LARCH_RELAX - 0x0 +# LA64-RELAX-RELOC: R_LARCH_TLS_LE_LO12_R foo 0x0 +# LA64-RELAX-RELOC: R_LARCH_RELAX - 0x0 +# LA64-RELAX-FIXUP: fixup A - offset: 0, value: %le_lo12_r(foo), kind: FK_NONE +# LA64-RELAX-FIXUP: fixup B - offset: 0, value: 0, kind: FK_NONE + +.endif + diff --git a/llvm/test/MC/X86/AMX/x86-64-amx-movrs-att.s b/llvm/test/MC/X86/AMX/x86-64-amx-movrs-att.s index d780ad4f0e369..92db672e1c82d 100755 --- a/llvm/test/MC/X86/AMX/x86-64-amx-movrs-att.s +++ b/llvm/test/MC/X86/AMX/x86-64-amx-movrs-att.s @@ -86,4 +86,92 @@ // CHECK: tileloaddrst1 -32(,%rbp,2), %tmm3 // CHECK: encoding: [0xc4,0xe2,0x79,0x4a,0x1c,0x6d,0xe0,0xff,0xff,0xff] - tileloaddrst1 -32(,%rbp,2), %tmm3 \ No newline at end of file + tileloaddrst1 -32(,%rbp,2), %tmm3 + +// CHECK: t2rpntlvwz0rs 268435456(%r16,%r14,8), %tmm6 +// CHECK: encoding: [0x62,0xbd,0x7c,0x08,0xf8,0xb4,0xf0,0x00,0x00,0x00,0x10] + t2rpntlvwz0rs 268435456(%r16,%r14,8), %tmm6 + +// CHECK: t2rpntlvwz0rs 291(%r8,%r17,4), %tmm2 +// CHECK: encoding: [0x62,0xd5,0x78,0x08,0xf8,0x94,0x88,0x23,0x01,0x00,0x00] + t2rpntlvwz0rs 291(%r8,%r17,4), %tmm2 + +// CHECK: t2rpntlvwz0rs 64(%r18), %tmm6 +// CHECK: encoding: [0x62,0xfd,0x7c,0x08,0xf8,0x74,0x22,0x40] + t2rpntlvwz0rs 64(%r18), %tmm6 + +// CHECK: {evex} t2rpntlvwz0rs -32(,%rbp,2), %tmm2 +// CHECK: encoding: [0x62,0xf5,0x7c,0x08,0xf8,0x14,0x6d,0xe0,0xff,0xff,0xff] + {evex} t2rpntlvwz0rs -32(,%rbp,2), %tmm2 + +// CHECK: t2rpntlvwz0rst1 268435456(%r16,%r14,8), %tmm6 +// CHECK: encoding: [0x62,0xbd,0x7c,0x08,0xf9,0xb4,0xf0,0x00,0x00,0x00,0x10] + t2rpntlvwz0rst1 268435456(%r16,%r14,8), %tmm6 + +// CHECK: t2rpntlvwz0rst1 291(%r8,%r17,4), %tmm2 +// CHECK: encoding: [0x62,0xd5,0x78,0x08,0xf9,0x94,0x88,0x23,0x01,0x00,0x00] + t2rpntlvwz0rst1 291(%r8,%r17,4), %tmm2 + +// CHECK: t2rpntlvwz0rst1 64(%r18), %tmm6 +// CHECK: encoding: [0x62,0xfd,0x7c,0x08,0xf9,0x74,0x22,0x40] + t2rpntlvwz0rst1 64(%r18), %tmm6 + +// CHECK: {evex} t2rpntlvwz0rst1 -32(,%rbp,2), %tmm2 +// CHECK: encoding: [0x62,0xf5,0x7c,0x08,0xf9,0x14,0x6d,0xe0,0xff,0xff,0xff] + {evex} t2rpntlvwz0rst1 -32(,%rbp,2), %tmm2 + +// CHECK: t2rpntlvwz1rs 268435456(%r16,%r14,8), %tmm6 +// CHECK: encoding: [0x62,0xbd,0x7d,0x08,0xf8,0xb4,0xf0,0x00,0x00,0x00,0x10] + t2rpntlvwz1rs 268435456(%r16,%r14,8), %tmm6 + +// CHECK: t2rpntlvwz1rs 291(%r8,%r17,4), %tmm2 +// CHECK: encoding: [0x62,0xd5,0x79,0x08,0xf8,0x94,0x88,0x23,0x01,0x00,0x00] + t2rpntlvwz1rs 291(%r8,%r17,4), %tmm2 + +// CHECK: t2rpntlvwz1rs 64(%r18), %tmm6 +// CHECK: encoding: [0x62,0xfd,0x7d,0x08,0xf8,0x74,0x22,0x40] + t2rpntlvwz1rs 64(%r18), %tmm6 + +// CHECK: {evex} t2rpntlvwz1rs -32(,%rbp,2), %tmm2 +// CHECK: encoding: [0x62,0xf5,0x7d,0x08,0xf8,0x14,0x6d,0xe0,0xff,0xff,0xff] + {evex} t2rpntlvwz1rs -32(,%rbp,2), %tmm2 + +// CHECK: t2rpntlvwz1rst1 268435456(%r16,%r14,8), %tmm6 +// CHECK: encoding: [0x62,0xbd,0x7d,0x08,0xf9,0xb4,0xf0,0x00,0x00,0x00,0x10] + t2rpntlvwz1rst1 
268435456(%r16,%r14,8), %tmm6 + +// CHECK: t2rpntlvwz1rst1 291(%r8,%r17,4), %tmm2 +// CHECK: encoding: [0x62,0xd5,0x79,0x08,0xf9,0x94,0x88,0x23,0x01,0x00,0x00] + t2rpntlvwz1rst1 291(%r8,%r17,4), %tmm2 + +// CHECK: t2rpntlvwz1rst1 64(%r18), %tmm6 +// CHECK: encoding: [0x62,0xfd,0x7d,0x08,0xf9,0x74,0x22,0x40] + t2rpntlvwz1rst1 64(%r18), %tmm6 + +// CHECK: {evex} t2rpntlvwz1rst1 -32(,%rbp,2), %tmm2 +// CHECK: encoding: [0x62,0xf5,0x7d,0x08,0xf9,0x14,0x6d,0xe0,0xff,0xff,0xff] + {evex} t2rpntlvwz1rst1 -32(,%rbp,2), %tmm2 + +// CHECK: tileloaddrs 291(%r16,%rax,4), %tmm3 +// CHECK: encoding: [0x62,0xfa,0x7f,0x08,0x4a,0x9c,0x80,0x23,0x01,0x00,0x00] + tileloaddrs 291(%r16,%rax,4), %tmm3 + +// CHECK: tileloaddrs 291(%r8,%r17,4), %tmm3 +// CHECK: encoding: [0x62,0xd2,0x7b,0x08,0x4a,0x9c,0x88,0x23,0x01,0x00,0x00] + tileloaddrs 291(%r8,%r17,4), %tmm3 + +// CHECK: {evex} tileloaddrs -32(,%rbp,2), %tmm3 +// CHECK: encoding: [0x62,0xf2,0x7f,0x08,0x4a,0x1c,0x6d,0xe0,0xff,0xff,0xff] + {evex} tileloaddrs -32(,%rbp,2), %tmm3 + +// CHECK: tileloaddrst1 291(%r16,%rax,4), %tmm3 +// CHECK: encoding: [0x62,0xfa,0x7d,0x08,0x4a,0x9c,0x80,0x23,0x01,0x00,0x00] + tileloaddrst1 291(%r16,%rax,4), %tmm3 + +// CHECK: tileloaddrst1 291(%r8,%r17,4), %tmm3 +// CHECK: encoding: [0x62,0xd2,0x79,0x08,0x4a,0x9c,0x88,0x23,0x01,0x00,0x00] + tileloaddrst1 291(%r8,%r17,4), %tmm3 + +// CHECK: {evex} tileloaddrst1 -32(,%rbp,2), %tmm3 +// CHECK: encoding: [0x62,0xf2,0x7d,0x08,0x4a,0x1c,0x6d,0xe0,0xff,0xff,0xff] + {evex} tileloaddrst1 -32(,%rbp,2), %tmm3 diff --git a/llvm/test/MC/X86/AMX/x86-64-amx-movrs-intel.s b/llvm/test/MC/X86/AMX/x86-64-amx-movrs-intel.s index ccc7ac51a98a4..140d1aa6b198e 100755 --- a/llvm/test/MC/X86/AMX/x86-64-amx-movrs-intel.s +++ b/llvm/test/MC/X86/AMX/x86-64-amx-movrs-intel.s @@ -95,3 +95,99 @@ // CHECK: tileloaddrst1 tmm3, [2*rbp - 32] // CHECK: encoding: [0xc4,0xe2,0x79,0x4a,0x1c,0x6d,0xe0,0xff,0xff,0xff] tileloaddrst1 tmm3, [2*rbp - 32] + +// CHECK: t2rpntlvwz0rs tmm6, [r16 + 8*r14 + 268435456] +// CHECK: encoding: [0x62,0xbd,0x7c,0x08,0xf8,0xb4,0xf0,0x00,0x00,0x00,0x10] + t2rpntlvwz0rs tmm6, [r16 + 8*r14 + 268435456] + +// CHECK: t2rpntlvwz0rs tmm2, [r8 + 4*r17 + 291] +// CHECK: encoding: [0x62,0xd5,0x78,0x08,0xf8,0x94,0x88,0x23,0x01,0x00,0x00] + t2rpntlvwz0rs tmm2, [r8 + 4*r17 + 291] + +// CHECK: t2rpntlvwz0rs tmm6, [r18 + 64] +// CHECK: encoding: [0x62,0xfd,0x7c,0x08,0xf8,0x74,0x22,0x40] + t2rpntlvwz0rs tmm6, [r18 + 64] + +// CHECK: {evex} t2rpntlvwz0rs tmm2, [2*rbp - 32] +// CHECK: encoding: [0x62,0xf5,0x7c,0x08,0xf8,0x14,0x6d,0xe0,0xff,0xff,0xff] + {evex} t2rpntlvwz0rs tmm2, [2*rbp - 32] + +// CHECK: t2rpntlvwz0rst1 tmm6, [r16 + 8*r14 + 268435456] +// CHECK: encoding: [0x62,0xbd,0x7c,0x08,0xf9,0xb4,0xf0,0x00,0x00,0x00,0x10] + t2rpntlvwz0rst1 tmm6, [r16 + 8*r14 + 268435456] + +// CHECK: t2rpntlvwz0rst1 tmm2, [r8 + 4*r17 + 291] +// CHECK: encoding: [0x62,0xd5,0x78,0x08,0xf9,0x94,0x88,0x23,0x01,0x00,0x00] + t2rpntlvwz0rst1 tmm2, [r8 + 4*r17 + 291] + +// CHECK: t2rpntlvwz0rst1 tmm6, [r18 + 64] +// CHECK: encoding: [0x62,0xfd,0x7c,0x08,0xf9,0x74,0x22,0x40] + t2rpntlvwz0rst1 tmm6, [r18 + 64] + +// CHECK: {evex} t2rpntlvwz0rst1 tmm2, [2*rbp - 32] +// CHECK: encoding: [0x62,0xf5,0x7c,0x08,0xf9,0x14,0x6d,0xe0,0xff,0xff,0xff] + {evex} t2rpntlvwz0rst1 tmm2, [2*rbp - 32] + +// CHECK: t2rpntlvwz1rs tmm6, [r16 + 8*r14 + 268435456] +// CHECK: encoding: [0x62,0xbd,0x7d,0x08,0xf8,0xb4,0xf0,0x00,0x00,0x00,0x10] + t2rpntlvwz1rs tmm6, [r16 + 8*r14 + 268435456] + +// CHECK: t2rpntlvwz1rs tmm2, [r8 + 4*r17 + 291] +// CHECK: 
encoding: [0x62,0xd5,0x79,0x08,0xf8,0x94,0x88,0x23,0x01,0x00,0x00] + t2rpntlvwz1rs tmm2, [r8 + 4*r17 + 291] + +// CHECK: t2rpntlvwz1rs tmm6, [r18 + 64] +// CHECK: encoding: [0x62,0xfd,0x7d,0x08,0xf8,0x74,0x22,0x40] + t2rpntlvwz1rs tmm6, [r18 + 64] + +// CHECK: {evex} t2rpntlvwz1rs tmm2, [2*rbp - 32] +// CHECK: encoding: [0x62,0xf5,0x7d,0x08,0xf8,0x14,0x6d,0xe0,0xff,0xff,0xff] + {evex} t2rpntlvwz1rs tmm2, [2*rbp - 32] + +// CHECK: t2rpntlvwz1rst1 tmm6, [r16 + 8*r14 + 268435456] +// CHECK: encoding: [0x62,0xbd,0x7d,0x08,0xf9,0xb4,0xf0,0x00,0x00,0x00,0x10] + t2rpntlvwz1rst1 tmm6, [r16 + 8*r14 + 268435456] + +// CHECK: t2rpntlvwz1rst1 tmm2, [r8 + 4*r17 + 291] +// CHECK: encoding: [0x62,0xd5,0x79,0x08,0xf9,0x94,0x88,0x23,0x01,0x00,0x00] + t2rpntlvwz1rst1 tmm2, [r8 + 4*r17 + 291] + +// CHECK: t2rpntlvwz1rst1 tmm6, [r18 + 64] +// CHECK: encoding: [0x62,0xfd,0x7d,0x08,0xf9,0x74,0x22,0x40] + t2rpntlvwz1rst1 tmm6, [r18 + 64] + +// CHECK: {evex} t2rpntlvwz1rst1 tmm2, [2*rbp - 32] +// CHECK: encoding: [0x62,0xf5,0x7d,0x08,0xf9,0x14,0x6d,0xe0,0xff,0xff,0xff] + {evex} t2rpntlvwz1rst1 tmm2, [2*rbp - 32] + +// CHECK: tileloaddrs tmm6, [r16 + 8*r14 + 268435456] +// CHECK: encoding: [0x62,0xba,0x7f,0x08,0x4a,0xb4,0xf0,0x00,0x00,0x00,0x10] + tileloaddrs tmm6, [r16 + 8*r14 + 268435456] + +// CHECK: tileloaddrs tmm3, [r8 + 4*r17 + 291] +// CHECK: encoding: [0x62,0xd2,0x7b,0x08,0x4a,0x9c,0x88,0x23,0x01,0x00,0x00] + tileloaddrs tmm3, [r8 + 4*r17 + 291] + +// CHECK: tileloaddrs tmm6, [r18 + 64] +// CHECK: encoding: [0x62,0xfa,0x7f,0x08,0x4a,0x74,0x22,0x40] + tileloaddrs tmm6, [r18 + 64] + +// CHECK: {evex} tileloaddrs tmm3, [2*rbp - 32] +// CHECK: encoding: [0x62,0xf2,0x7f,0x08,0x4a,0x1c,0x6d,0xe0,0xff,0xff,0xff] + {evex} tileloaddrs tmm3, [2*rbp - 32] + +// CHECK: tileloaddrst1 tmm6, [r16 + 8*r14 + 268435456] +// CHECK: encoding: [0x62,0xba,0x7d,0x08,0x4a,0xb4,0xf0,0x00,0x00,0x00,0x10] + tileloaddrst1 tmm6, [r16 + 8*r14 + 268435456] + +// CHECK: tileloaddrst1 tmm3, [r8 + 4*r17 + 291] +// CHECK: encoding: [0x62,0xd2,0x79,0x08,0x4a,0x9c,0x88,0x23,0x01,0x00,0x00] + tileloaddrst1 tmm3, [r8 + 4*r17 + 291] + +// CHECK: tileloaddrst1 tmm6, [r18 + 64] +// CHECK: encoding: [0x62,0xfa,0x7d,0x08,0x4a,0x74,0x22,0x40] + tileloaddrst1 tmm6, [r18 + 64] + +// CHECK: {evex} tileloaddrst1 tmm3, [2*rbp - 32] +// CHECK: encoding: [0x62,0xf2,0x7d,0x08,0x4a,0x1c,0x6d,0xe0,0xff,0xff,0xff] + {evex} tileloaddrst1 tmm3, [2*rbp - 32] diff --git a/llvm/test/MC/X86/amx-transpose-att.s b/llvm/test/MC/X86/amx-transpose-att.s index 21bbf258ac6ef..5158470f8c905 100644 --- a/llvm/test/MC/X86/amx-transpose-att.s +++ b/llvm/test/MC/X86/amx-transpose-att.s @@ -48,6 +48,54 @@ // CHECK: encoding: [0xc4,0xe2,0x79,0x6f,0x14,0x6d,0xe0,0xff,0xff,0xff] t2rpntlvwz1t1 -32(,%rbp,2), %tmm2 +// CHECK: t2rpntlvwz0 268435456(%r16,%r14,8), %tmm4 +// CHECK: encoding: [0x62,0xba,0x7c,0x08,0x6e,0xa4,0xf0,0x00,0x00,0x00,0x10] + t2rpntlvwz0 268435456(%r16,%r14,8), %tmm4 + +// CHECK: t2rpntlvwz0 291(%r8,%r17,4), %tmm2 +// CHECK: encoding: [0x62,0xd2,0x78,0x08,0x6e,0x94,0x88,0x23,0x01,0x00,0x00] + t2rpntlvwz0 291(%r8,%r17,4), %tmm2 + +// CHECK: {evex} t2rpntlvwz0 -32(,%rbp,2), %tmm2 +// CHECK: encoding: [0x62,0xf2,0x7c,0x08,0x6e,0x14,0x6d,0xe0,0xff,0xff,0xff] + {evex} t2rpntlvwz0 -32(,%rbp,2), %tmm2 + +// CHECK: t2rpntlvwz0t1 268435456(%r16,%r14,8), %tmm4 +// CHECK: encoding: [0x62,0xba,0x7c,0x08,0x6f,0xa4,0xf0,0x00,0x00,0x00,0x10] + t2rpntlvwz0t1 268435456(%r16,%r14,8), %tmm4 + +// CHECK: t2rpntlvwz0t1 291(%r8,%r17,4), %tmm2 +// CHECK: encoding: 
[0x62,0xd2,0x78,0x08,0x6f,0x94,0x88,0x23,0x01,0x00,0x00] + t2rpntlvwz0t1 291(%r8,%r17,4), %tmm2 + +// CHECK: {evex} t2rpntlvwz0t1 -32(,%rbp,2), %tmm2 +// CHECK: encoding: [0x62,0xf2,0x7c,0x08,0x6f,0x14,0x6d,0xe0,0xff,0xff,0xff] + {evex} t2rpntlvwz0t1 -32(,%rbp,2), %tmm2 + +// CHECK: t2rpntlvwz1 268435456(%r16,%r14,8), %tmm4 +// CHECK: encoding: [0x62,0xba,0x7d,0x08,0x6e,0xa4,0xf0,0x00,0x00,0x00,0x10] + t2rpntlvwz1 268435456(%r16,%r14,8), %tmm4 + +// CHECK: t2rpntlvwz1 291(%r8,%r17,4), %tmm2 +// CHECK: encoding: [0x62,0xd2,0x79,0x08,0x6e,0x94,0x88,0x23,0x01,0x00,0x00] + t2rpntlvwz1 291(%r8,%r17,4), %tmm2 + +// CHECK: {evex} t2rpntlvwz1 -32(,%rbp,2), %tmm2 +// CHECK: encoding: [0x62,0xf2,0x7d,0x08,0x6e,0x14,0x6d,0xe0,0xff,0xff,0xff] + {evex} t2rpntlvwz1 -32(,%rbp,2), %tmm2 + +// CHECK: t2rpntlvwz1t1 268435456(%r16,%r14,8), %tmm4 +// CHECK: encoding: [0x62,0xba,0x7d,0x08,0x6f,0xa4,0xf0,0x00,0x00,0x00,0x10] + t2rpntlvwz1t1 268435456(%r16,%r14,8), %tmm4 + +// CHECK: t2rpntlvwz1t1 291(%r8,%r17,4), %tmm2 +// CHECK: encoding: [0x62,0xd2,0x79,0x08,0x6f,0x94,0x88,0x23,0x01,0x00,0x00] + t2rpntlvwz1t1 291(%r8,%r17,4), %tmm2 + +// CHECK: {evex} t2rpntlvwz1t1 -32(,%rbp,2), %tmm2 +// CHECK: encoding: [0x62,0xf2,0x7d,0x08,0x6f,0x14,0x6d,0xe0,0xff,0xff,0xff] + {evex} t2rpntlvwz1t1 -32(,%rbp,2), %tmm2 + // CHECK: ttransposed %tmm1, %tmm5 // CHECK: encoding: [0xc4,0xe2,0x7a,0x5f,0xe9] ttransposed %tmm1, %tmm5 diff --git a/llvm/test/MC/X86/amx-transpose-intel.s b/llvm/test/MC/X86/amx-transpose-intel.s index a772232ddbbf2..0d2c22f67a173 100644 --- a/llvm/test/MC/X86/amx-transpose-intel.s +++ b/llvm/test/MC/X86/amx-transpose-intel.s @@ -48,6 +48,54 @@ // CHECK: encoding: [0xc4,0xe2,0x79,0x6f,0x14,0x6d,0xe0,0xff,0xff,0xff] t2rpntlvwz1t1 tmm2, [2*rbp - 32] +// CHECK: t2rpntlvwz0 tmm4, [r16 + 8*r14 + 268435456] +// CHECK: encoding: [0x62,0xba,0x7c,0x08,0x6e,0xa4,0xf0,0x00,0x00,0x00,0x10] + t2rpntlvwz0 tmm4, [r16 + 8*r14 + 268435456] + +// CHECK: t2rpntlvwz0 tmm2, [r8 + 4*r17 + 291] +// CHECK: encoding: [0x62,0xd2,0x78,0x08,0x6e,0x94,0x88,0x23,0x01,0x00,0x00] + t2rpntlvwz0 tmm2, [r8 + 4*r17 + 291] + +// CHECK: {evex} t2rpntlvwz0 tmm2, [2*rbp - 32] +// CHECK: encoding: [0x62,0xf2,0x7c,0x08,0x6e,0x14,0x6d,0xe0,0xff,0xff,0xff] + {evex} t2rpntlvwz0 tmm2, [2*rbp - 32] + +// CHECK: t2rpntlvwz0t1 tmm4, [r16 + 8*r14 + 268435456] +// CHECK: encoding: [0x62,0xba,0x7c,0x08,0x6f,0xa4,0xf0,0x00,0x00,0x00,0x10] + t2rpntlvwz0t1 tmm4, [r16 + 8*r14 + 268435456] + +// CHECK: t2rpntlvwz0t1 tmm2, [r8 + 4*r17 + 291] +// CHECK: encoding: [0x62,0xd2,0x78,0x08,0x6f,0x94,0x88,0x23,0x01,0x00,0x00] + t2rpntlvwz0t1 tmm2, [r8 + 4*r17 + 291] + +// CHECK: {evex} t2rpntlvwz0t1 tmm2, [2*rbp - 32] +// CHECK: encoding: [0x62,0xf2,0x7c,0x08,0x6f,0x14,0x6d,0xe0,0xff,0xff,0xff] + {evex} t2rpntlvwz0t1 tmm2, [2*rbp - 32] + +// CHECK: t2rpntlvwz1 tmm4, [r16 + 8*r14 + 268435456] +// CHECK: encoding: [0x62,0xba,0x7d,0x08,0x6e,0xa4,0xf0,0x00,0x00,0x00,0x10] + t2rpntlvwz1 tmm4, [r16 + 8*r14 + 268435456] + +// CHECK: t2rpntlvwz1 tmm2, [r8 + 4*r17 + 291] +// CHECK: encoding: [0x62,0xd2,0x79,0x08,0x6e,0x94,0x88,0x23,0x01,0x00,0x00] + t2rpntlvwz1 tmm2, [r8 + 4*r17 + 291] + +// CHECK: {evex} t2rpntlvwz1 tmm2, [2*rbp - 32] +// CHECK: encoding: [0x62,0xf2,0x7d,0x08,0x6e,0x14,0x6d,0xe0,0xff,0xff,0xff] + {evex} t2rpntlvwz1 tmm2, [2*rbp - 32] + +// CHECK: t2rpntlvwz1t1 tmm4, [r16 + 8*r14 + 268435456] +// CHECK: encoding: [0x62,0xba,0x7d,0x08,0x6f,0xa4,0xf0,0x00,0x00,0x00,0x10] + t2rpntlvwz1t1 tmm4, [r16 + 8*r14 + 268435456] + +// CHECK: t2rpntlvwz1t1 tmm2, [r8 + 4*r17 
+ 291] +// CHECK: encoding: [0x62,0xd2,0x79,0x08,0x6f,0x94,0x88,0x23,0x01,0x00,0x00] + t2rpntlvwz1t1 tmm2, [r8 + 4*r17 + 291] + +// CHECK: {evex} t2rpntlvwz1t1 tmm2, [2*rbp - 32] +// CHECK: encoding: [0x62,0xf2,0x7d,0x08,0x6f,0x14,0x6d,0xe0,0xff,0xff,0xff] + {evex} t2rpntlvwz1t1 tmm2, [2*rbp - 32] + // CHECK: ttransposed tmm5, tmm1 // CHECK: encoding: [0xc4,0xe2,0x7a,0x5f,0xe9] ttransposed tmm5, tmm1 diff --git a/llvm/test/MC/X86/movrs-att-64.s b/llvm/test/MC/X86/movrs-att-64.s index 59a2fdb6d10b2..e951b30369d46 100644 --- a/llvm/test/MC/X86/movrs-att-64.s +++ b/llvm/test/MC/X86/movrs-att-64.s @@ -94,4 +94,100 @@ // CHECK: movrsq -128(%rdx), %rbx // CHECK: encoding: [0x48,0x0f,0x38,0x8b,0x5a,0x80] - movrs -128(%rdx), %rbx \ No newline at end of file + movrs -128(%rdx), %rbx + +// CHECK: movrsb 268435456(%rbp,%r14,8), %r16b +// CHECK: encoding: [0x62,0xa4,0x7c,0x08,0x8a,0x84,0xf5,0x00,0x00,0x00,0x10] + movrs 268435456(%rbp,%r14,8), %r16b + +// CHECK: movrsb 291(%r17,%rax,4), %bl +// CHECK: encoding: [0x62,0xfc,0x7c,0x08,0x8a,0x9c,0x81,0x23,0x01,0x00,0x00] + movrs 291(%r17,%rax,4), %bl + +// CHECK: {evex} movrsb (%rip), %bl +// CHECK: encoding: [0x62,0xf4,0x7c,0x08,0x8a,0x1d,0x00,0x00,0x00,0x00] + {evex} movrs (%rip), %bl + +// CHECK: movrsb -32(,%rbp,2), %r18b +// CHECK: encoding: [0x62,0xe4,0x7c,0x08,0x8a,0x14,0x6d,0xe0,0xff,0xff,0xff] + movrs -32(,%rbp,2), %r18b + +// CHECK: movrsb 127(%r19), %bl +// CHECK: encoding: [0x62,0xfc,0x7c,0x08,0x8a,0x5b,0x7f] + movrs 127(%r19), %bl + +// CHECK: movrsb -128(%r20), %bl +// CHECK: encoding: [0x62,0xfc,0x7c,0x08,0x8a,0x5c,0x24,0x80] + movrs -128(%r20), %bl + +// CHECK: movrsw 268435456(%rbp,%r14,8), %r16w +// CHECK: encoding: [0x62,0xa4,0x7d,0x08,0x8b,0x84,0xf5,0x00,0x00,0x00,0x10] + movrs 268435456(%rbp,%r14,8), %r16w + +// CHECK: movrsw 291(%r17,%rax,4), %bx +// CHECK: encoding: [0x62,0xfc,0x7d,0x08,0x8b,0x9c,0x81,0x23,0x01,0x00,0x00] + movrs 291(%r17,%rax,4), %bx + +// CHECK: {evex} movrsw (%rip), %bx +// CHECK: encoding: [0x62,0xf4,0x7d,0x08,0x8b,0x1d,0x00,0x00,0x00,0x00] + {evex} movrs (%rip), %bx + +// CHECK: movrsw -32(,%rbp,2), %r18w +// CHECK: encoding: [0x62,0xe4,0x7d,0x08,0x8b,0x14,0x6d,0xe0,0xff,0xff,0xff] + movrs -32(,%rbp,2), %r18w + +// CHECK: movrsw 127(%r19), %bx +// CHECK: encoding: [0x62,0xfc,0x7d,0x08,0x8b,0x5b,0x7f] + movrs 127(%r19), %bx + +// CHECK: movrsw -128(%r20), %bx +// CHECK: encoding: [0x62,0xfc,0x7d,0x08,0x8b,0x5c,0x24,0x80] + movrs -128(%r20), %bx + +// CHECK: movrsl 268435456(%rbp,%r14,8), %r16d +// CHECK: encoding: [0x62,0xa4,0x7c,0x08,0x8b,0x84,0xf5,0x00,0x00,0x00,0x10] + movrs 268435456(%rbp,%r14,8), %r16d + +// CHECK: movrsl 291(%r17,%rax,4), %ebx +// CHECK: encoding: [0x62,0xfc,0x7c,0x08,0x8b,0x9c,0x81,0x23,0x01,0x00,0x00] + movrs 291(%r17,%rax,4), %ebx + +// CHECK: {evex} movrsl (%rip), %ebx +// CHECK: encoding: [0x62,0xf4,0x7c,0x08,0x8b,0x1d,0x00,0x00,0x00,0x00] + {evex} movrs (%rip), %ebx + +// CHECK: movrsl -32(,%rbp,2), %r18d +// CHECK: encoding: [0x62,0xe4,0x7c,0x08,0x8b,0x14,0x6d,0xe0,0xff,0xff,0xff] + movrs -32(,%rbp,2), %r18d + +// CHECK: movrsl 127(%r19), %ebx +// CHECK: encoding: [0x62,0xfc,0x7c,0x08,0x8b,0x5b,0x7f] + movrs 127(%r19), %ebx + +// CHECK: movrsl -128(%r20), %ebx +// CHECK: encoding: [0x62,0xfc,0x7c,0x08,0x8b,0x5c,0x24,0x80] + movrs -128(%r20), %ebx + +// CHECK: movrsq 268435456(%rbp,%r14,8), %r16 +// CHECK: encoding: [0x62,0xa4,0xfc,0x08,0x8b,0x84,0xf5,0x00,0x00,0x00,0x10] + movrs 268435456(%rbp,%r14,8), %r16 + +// CHECK: movrsq 291(%r17,%rax,4), %rbx +// CHECK: encoding: 
[0x62,0xfc,0xfc,0x08,0x8b,0x9c,0x81,0x23,0x01,0x00,0x00] + movrs 291(%r17,%rax,4), %rbx + +// CHECK: {evex} movrsq (%rip), %rbx +// CHECK: encoding: [0x62,0xf4,0xfc,0x08,0x8b,0x1d,0x00,0x00,0x00,0x00] + {evex} movrs (%rip), %rbx + +// CHECK: movrsq -32(,%rbp,2), %r18 +// CHECK: encoding: [0x62,0xe4,0xfc,0x08,0x8b,0x14,0x6d,0xe0,0xff,0xff,0xff] + movrs -32(,%rbp,2), %r18 + +// CHECK: movrsq 127(%r19), %rbx +// CHECK: encoding: [0x62,0xfc,0xfc,0x08,0x8b,0x5b,0x7f] + movrs 127(%r19), %rbx + +// CHECK: movrsq -128(%r20), %rbx +// CHECK: encoding: [0x62,0xfc,0xfc,0x08,0x8b,0x5c,0x24,0x80] + movrs -128(%r20), %rbx diff --git a/llvm/test/MC/X86/movrs-intel-64.s b/llvm/test/MC/X86/movrs-intel-64.s index f41075a21b3e8..f698f1c440442 100644 --- a/llvm/test/MC/X86/movrs-intel-64.s +++ b/llvm/test/MC/X86/movrs-intel-64.s @@ -94,4 +94,100 @@ // CHECK: movrs rbx, qword ptr [rdx - 128] // CHECK: encoding: [0x48,0x0f,0x38,0x8b,0x5a,0x80] - movrs rbx, qword ptr [rdx - 128] \ No newline at end of file + movrs rbx, qword ptr [rdx - 128] + +// CHECK: movrs r16b, byte ptr [rbp + 8*r14 + 268435456] +// CHECK: encoding: [0x62,0xa4,0x7c,0x08,0x8a,0x84,0xf5,0x00,0x00,0x00,0x10] + movrs r16b, byte ptr [rbp + 8*r14 + 268435456] + +// CHECK: movrs bl, byte ptr [r17 + 4*rax + 291] +// CHECK: encoding: [0x62,0xfc,0x7c,0x08,0x8a,0x9c,0x81,0x23,0x01,0x00,0x00] + movrs bl, byte ptr [r17 + 4*rax + 291] + +// CHECK: {evex} movrs bl, byte ptr [rip] +// CHECK: encoding: [0x62,0xf4,0x7c,0x08,0x8a,0x1d,0x00,0x00,0x00,0x00] + {evex} movrs bl, byte ptr [rip] + +// CHECK: movrs r18b, byte ptr [2*rbp - 32] +// CHECK: encoding: [0x62,0xe4,0x7c,0x08,0x8a,0x14,0x6d,0xe0,0xff,0xff,0xff] + movrs r18b, byte ptr [2*rbp - 32] + +// CHECK: movrs bl, byte ptr [r19 + 127] +// CHECK: encoding: [0x62,0xfc,0x7c,0x08,0x8a,0x5b,0x7f] + movrs bl, byte ptr [r19 + 127] + +// CHECK: movrs bl, byte ptr [r20 - 128] +// CHECK: encoding: [0x62,0xfc,0x7c,0x08,0x8a,0x5c,0x24,0x80] + movrs bl, byte ptr [r20 - 128] + +// CHECK: movrs r16w, word ptr [rbp + 8*r14 + 268435456] +// CHECK: encoding: [0x62,0xa4,0x7d,0x08,0x8b,0x84,0xf5,0x00,0x00,0x00,0x10] + movrs r16w, word ptr [rbp + 8*r14 + 268435456] + +// CHECK: movrs bx, word ptr [r17 + 4*rax + 291] +// CHECK: encoding: [0x62,0xfc,0x7d,0x08,0x8b,0x9c,0x81,0x23,0x01,0x00,0x00] + movrs bx, word ptr [r17 + 4*rax + 291] + +// CHECK: {evex} movrs bx, word ptr [rip] +// CHECK: encoding: [0x62,0xf4,0x7d,0x08,0x8b,0x1d,0x00,0x00,0x00,0x00] + {evex} movrs bx, word ptr [rip] + +// CHECK: movrs r18w, word ptr [2*rbp - 32] +// CHECK: encoding: [0x62,0xe4,0x7d,0x08,0x8b,0x14,0x6d,0xe0,0xff,0xff,0xff] + movrs r18w, word ptr [2*rbp - 32] + +// CHECK: movrs bx, word ptr [r19 + 127] +// CHECK: encoding: [0x62,0xfc,0x7d,0x08,0x8b,0x5b,0x7f] + movrs bx, word ptr [r19 + 127] + +// CHECK: movrs bx, word ptr [r20 - 128] +// CHECK: encoding: [0x62,0xfc,0x7d,0x08,0x8b,0x5c,0x24,0x80] + movrs bx, word ptr [r20 - 128] + +// CHECK: movrs r16d, dword ptr [rbp + 8*r14 + 268435456] +// CHECK: encoding: [0x62,0xa4,0x7c,0x08,0x8b,0x84,0xf5,0x00,0x00,0x00,0x10] + movrs r16d, dword ptr [rbp + 8*r14 + 268435456] + +// CHECK: movrs ebx, dword ptr [r17 + 4*rax + 291] +// CHECK: encoding: [0x62,0xfc,0x7c,0x08,0x8b,0x9c,0x81,0x23,0x01,0x00,0x00] + movrs ebx, dword ptr [r17 + 4*rax + 291] + +// CHECK: {evex} movrs ebx, dword ptr [rip] +// CHECK: encoding: [0x62,0xf4,0x7c,0x08,0x8b,0x1d,0x00,0x00,0x00,0x00] + {evex} movrs ebx, dword ptr [rip] + +// CHECK: movrs r18d, dword ptr [2*rbp - 32] +// CHECK: encoding: 
[0x62,0xe4,0x7c,0x08,0x8b,0x14,0x6d,0xe0,0xff,0xff,0xff] + movrs r18d, dword ptr [2*rbp - 32] + +// CHECK: movrs ebx, dword ptr [r19 + 127] +// CHECK: encoding: [0x62,0xfc,0x7c,0x08,0x8b,0x5b,0x7f] + movrs ebx, dword ptr [r19 + 127] + +// CHECK: movrs ebx, dword ptr [r20 - 128] +// CHECK: encoding: [0x62,0xfc,0x7c,0x08,0x8b,0x5c,0x24,0x80] + movrs ebx, dword ptr [r20 - 128] + +// CHECK: movrs r16, qword ptr [rbp + 8*r14 + 268435456] +// CHECK: encoding: [0x62,0xa4,0xfc,0x08,0x8b,0x84,0xf5,0x00,0x00,0x00,0x10] + movrs r16, qword ptr [rbp + 8*r14 + 268435456] + +// CHECK: movrs rbx, qword ptr [r17 + 4*rax + 291] +// CHECK: encoding: [0x62,0xfc,0xfc,0x08,0x8b,0x9c,0x81,0x23,0x01,0x00,0x00] + movrs rbx, qword ptr [r17 + 4*rax + 291] + +// CHECK: {evex} movrs rbx, qword ptr [rip] +// CHECK: encoding: [0x62,0xf4,0xfc,0x08,0x8b,0x1d,0x00,0x00,0x00,0x00] + {evex} movrs rbx, qword ptr [rip] + +// CHECK: movrs r18, qword ptr [2*rbp - 32] +// CHECK: encoding: [0x62,0xe4,0xfc,0x08,0x8b,0x14,0x6d,0xe0,0xff,0xff,0xff] + movrs r18, qword ptr [2*rbp - 32] + +// CHECK: movrs rbx, qword ptr [r19 + 127] +// CHECK: encoding: [0x62,0xfc,0xfc,0x08,0x8b,0x5b,0x7f] + movrs rbx, qword ptr [r19 + 127] + +// CHECK: movrs rbx, qword ptr [r20 - 128] +// CHECK: encoding: [0x62,0xfc,0xfc,0x08,0x8b,0x5c,0x24,0x80] + movrs rbx, qword ptr [r20 - 128] diff --git a/llvm/test/TableGen/x86-instr-mapping.inc b/llvm/test/TableGen/x86-instr-mapping.inc index ed43684db2dfc..4f64d4b8d93d0 100644 --- a/llvm/test/TableGen/x86-instr-mapping.inc +++ b/llvm/test/TableGen/x86-instr-mapping.inc @@ -133,6 +133,10 @@ static const X86TableEntry X86CompressEVEXTable[] = { { X86::MOVDIR64B64_EVEX, X86::MOVDIR64B64 }, { X86::MOVDIRI32_EVEX, X86::MOVDIRI32 }, { X86::MOVDIRI64_EVEX, X86::MOVDIRI64 }, + { X86::MOVRS16rm_EVEX, X86::MOVRS16rm }, + { X86::MOVRS32rm_EVEX, X86::MOVRS32rm }, + { X86::MOVRS64rm_EVEX, X86::MOVRS64rm }, + { X86::MOVRS8rm_EVEX, X86::MOVRS8rm }, { X86::MULX32rm_EVEX, X86::MULX32rm }, { X86::MULX32rr_EVEX, X86::MULX32rr }, { X86::MULX64rm_EVEX, X86::MULX64rm }, @@ -163,6 +167,16 @@ static const X86TableEntry X86CompressEVEXTable[] = { { X86::SHRX64rm_EVEX, X86::SHRX64rm }, { X86::SHRX64rr_EVEX, X86::SHRX64rr }, { X86::STTILECFG_EVEX, X86::STTILECFG }, + { X86::T2RPNTLVWZ0RST1_EVEX, X86::T2RPNTLVWZ0RST1 }, + { X86::T2RPNTLVWZ0RS_EVEX, X86::T2RPNTLVWZ0RS }, + { X86::T2RPNTLVWZ0T1_EVEX, X86::T2RPNTLVWZ0T1 }, + { X86::T2RPNTLVWZ0_EVEX, X86::T2RPNTLVWZ0 }, + { X86::T2RPNTLVWZ1RST1_EVEX, X86::T2RPNTLVWZ1RST1 }, + { X86::T2RPNTLVWZ1RS_EVEX, X86::T2RPNTLVWZ1RS }, + { X86::T2RPNTLVWZ1T1_EVEX, X86::T2RPNTLVWZ1T1 }, + { X86::T2RPNTLVWZ1_EVEX, X86::T2RPNTLVWZ1 }, + { X86::TILELOADDRST1_EVEX, X86::TILELOADDRST1 }, + { X86::TILELOADDRS_EVEX, X86::TILELOADDRS }, { X86::TILELOADDT1_EVEX, X86::TILELOADDT1 }, { X86::TILELOADD_EVEX, X86::TILELOADD }, { X86::TILESTORED_EVEX, X86::TILESTORED }, diff --git a/llvm/test/Transforms/GlobalOpt/resolve-fmv-ifunc.ll b/llvm/test/Transforms/GlobalOpt/resolve-fmv-ifunc.ll new file mode 100644 index 0000000000000..4b6a19d3f05cf --- /dev/null +++ b/llvm/test/Transforms/GlobalOpt/resolve-fmv-ifunc.ll @@ -0,0 +1,368 @@ +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --filter "call i32 @(test_single_bb_resolver|test_multi_bb_resolver|test_caller_feats_not_implied|test_non_fmv_caller|test_priority|test_alternative_names)" --version 4 + +; REQUIRES: aarch64-registered-target + +; RUN: opt --passes=globalopt -o - -S < %s | FileCheck %s + +target datalayout = 
"e-m:e-i8:8:32-i16:16:32-i64:64-i128:128-n32:64-S128" +target triple = "aarch64-unknown-linux-gnu" + +$test_single_bb_resolver.resolver = comdat any +$test_multi_bb_resolver.resolver = comdat any +$test_caller_feats_not_implied.resolver = comdat any +$test_non_fmv_caller.resolver = comdat any +$test_priority.resolver = comdat any +$test_alternative_names.resolver = comdat any + +@__aarch64_cpu_features = external local_unnamed_addr global { i64 } + +@test_single_bb_resolver = weak_odr ifunc i32 (), ptr @test_single_bb_resolver.resolver +@test_multi_bb_resolver = weak_odr ifunc i32 (), ptr @test_multi_bb_resolver.resolver +@test_caller_feats_not_implied = weak_odr ifunc i32 (), ptr @test_caller_feats_not_implied.resolver +@test_non_fmv_caller = weak_odr ifunc i32 (), ptr @test_non_fmv_caller.resolver +@test_priority = weak_odr ifunc i32 (), ptr @test_priority.resolver +@test_alternative_names = weak_odr ifunc i32 (), ptr @test_alternative_names.resolver + +declare void @__init_cpu_features_resolver() local_unnamed_addr + +declare i32 @test_single_bb_resolver.default() #0 +declare i32 @test_single_bb_resolver._Msve() #1 +declare i32 @test_single_bb_resolver._Msve2() #2 + +define weak_odr ptr @test_single_bb_resolver.resolver() comdat { +; CHECK-LABEL: define weak_odr ptr @test_single_bb_resolver.resolver() comdat { +resolver_entry: + tail call void @__init_cpu_features_resolver() + %0 = load i64, ptr @__aarch64_cpu_features, align 8 + %1 = and i64 %0, 68719476736 + %.not = icmp eq i64 %1, 0 + %2 = and i64 %0, 1073741824 + %.not3 = icmp eq i64 %2, 0 + %test_single_bb_resolver._Msve.test_single_bb_resolver.default = select i1 %.not3, ptr @test_single_bb_resolver.default, ptr @test_single_bb_resolver._Msve + %common.ret.op = select i1 %.not, ptr %test_single_bb_resolver._Msve.test_single_bb_resolver.default, ptr @test_single_bb_resolver._Msve2 + ret ptr %common.ret.op +} + +define i32 @caller1._Msve() #1 { +; CHECK-LABEL: define i32 @caller1._Msve( +; CHECK-SAME: ) local_unnamed_addr #[[ATTR1:[0-9]+]] { +; CHECK: [[CALL:%.*]] = tail call i32 @test_single_bb_resolver._Msve() +; +entry: + %call = tail call i32 @test_single_bb_resolver() + ret i32 %call +} + +define i32 @caller1._Msve2() #2 { +; CHECK-LABEL: define i32 @caller1._Msve2( +; CHECK-SAME: ) local_unnamed_addr #[[ATTR2:[0-9]+]] { +; CHECK: [[CALL:%.*]] = tail call i32 @test_single_bb_resolver._Msve2() +; +entry: + %call = tail call i32 @test_single_bb_resolver() + ret i32 %call +} + +define i32 @caller1.default() #0 { +; CHECK-LABEL: define i32 @caller1.default( +; CHECK-SAME: ) local_unnamed_addr #[[ATTR0:[0-9]+]] { +; CHECK: [[CALL:%.*]] = tail call i32 @test_single_bb_resolver.default() +; +entry: + %call = tail call i32 @test_single_bb_resolver() + ret i32 %call +} + +declare i32 @test_multi_bb_resolver._Mmops() #3 +declare i32 @test_multi_bb_resolver._Msve2() #2 +declare i32 @test_multi_bb_resolver._Msve() #1 +declare i32 @test_multi_bb_resolver.default() #0 + +define weak_odr ptr @test_multi_bb_resolver.resolver() comdat { +; CHECK-LABEL: define weak_odr ptr @test_multi_bb_resolver.resolver() comdat { +resolver_entry: + tail call void @__init_cpu_features_resolver() + %0 = load i64, ptr @__aarch64_cpu_features, align 8 + %1 = and i64 %0, 576460752303423488 + %.not = icmp eq i64 %1, 0 + br i1 %.not, label %resolver_else, label %common.ret + +common.ret: ; preds = %resolver_else2, %resolver_else, %resolver_entry + %common.ret.op = phi ptr [ @test_multi_bb_resolver._Mmops, %resolver_entry ], [ @test_multi_bb_resolver._Msve2, 
%resolver_else ], [ %test_multi_bb_resolver._Msve.test_multi_bb_resolver.default, %resolver_else2 ] + ret ptr %common.ret.op + +resolver_else: ; preds = %resolver_entry + %2 = and i64 %0, 68719476736 + %.not5 = icmp eq i64 %2, 0 + br i1 %.not5, label %resolver_else2, label %common.ret + +resolver_else2: ; preds = %resolver_else + %3 = and i64 %0, 1073741824 + %.not6 = icmp eq i64 %3, 0 + %test_multi_bb_resolver._Msve.test_multi_bb_resolver.default = select i1 %.not6, ptr @test_multi_bb_resolver.default, ptr @test_multi_bb_resolver._Msve + br label %common.ret +} + +define i32 @caller2._MmopsMsve2() #4 { +; CHECK-LABEL: define i32 @caller2._MmopsMsve2( +; CHECK-SAME: ) local_unnamed_addr #[[ATTR4:[0-9]+]] { +; CHECK: [[CALL:%.*]] = tail call i32 @test_multi_bb_resolver._Mmops() +; +entry: + %call = tail call i32 @test_multi_bb_resolver() + ret i32 %call +} + +define i32 @caller2._Mmops() #3 { +; CHECK-LABEL: define i32 @caller2._Mmops( +; CHECK-SAME: ) local_unnamed_addr #[[ATTR3:[0-9]+]] { +; CHECK: [[CALL:%.*]] = tail call i32 @test_multi_bb_resolver._Mmops() +; +entry: + %call = tail call i32 @test_multi_bb_resolver() + ret i32 %call +} + +define i32 @caller2._Msve() #1 { +; CHECK-LABEL: define i32 @caller2._Msve( +; CHECK-SAME: ) local_unnamed_addr #[[ATTR1]] { +; CHECK: [[CALL:%.*]] = tail call i32 @test_multi_bb_resolver() +; +entry: + %call = tail call i32 @test_multi_bb_resolver() + ret i32 %call +} + +define i32 @caller2.default() #0 { +; CHECK-LABEL: define i32 @caller2.default( +; CHECK-SAME: ) local_unnamed_addr #[[ATTR0]] { +; CHECK: [[CALL:%.*]] = tail call i32 @test_multi_bb_resolver.default() +; +entry: + %call = tail call i32 @test_multi_bb_resolver() + ret i32 %call +} + +declare i32 @test_caller_feats_not_implied._Mmops() #3 +declare i32 @test_caller_feats_not_implied._Msme() #5 +declare i32 @test_caller_feats_not_implied._Msve() #1 +declare i32 @test_caller_feats_not_implied.default() #0 + +define weak_odr ptr @test_caller_feats_not_implied.resolver() comdat { +; CHECK-LABEL: define weak_odr ptr @test_caller_feats_not_implied.resolver() comdat { +resolver_entry: + tail call void @__init_cpu_features_resolver() + %0 = load i64, ptr @__aarch64_cpu_features, align 8 + %1 = and i64 %0, 576460752303423488 + %.not = icmp eq i64 %1, 0 + br i1 %.not, label %resolver_else, label %common.ret + +common.ret: ; preds = %resolver_else2, %resolver_else, %resolver_entry + %common.ret.op = phi ptr [ @test_caller_feats_not_implied._Mmops, %resolver_entry ], [ @test_caller_feats_not_implied._Msme, %resolver_else ], [ %test_caller_feats_not_implied._Msve.test_caller_feats_not_implied.default, %resolver_else2 ] + ret ptr %common.ret.op + +resolver_else: ; preds = %resolver_entry + %2 = and i64 %0, 4398046511104 + %.not5 = icmp eq i64 %2, 0 + br i1 %.not5, label %resolver_else2, label %common.ret + +resolver_else2: ; preds = %resolver_else + %3 = and i64 %0, 1073741824 + %.not6 = icmp eq i64 %3, 0 + %test_caller_feats_not_implied._Msve.test_caller_feats_not_implied.default = select i1 %.not6, ptr @test_caller_feats_not_implied.default, ptr @test_caller_feats_not_implied._Msve + br label %common.ret +} + +define i32 @caller3._Mmops() #3 { +; CHECK-LABEL: define i32 @caller3._Mmops( +; CHECK-SAME: ) local_unnamed_addr #[[ATTR3]] { +; CHECK: [[CALL:%.*]] = tail call i32 @test_caller_feats_not_implied._Mmops() +; +entry: + %call = tail call i32 @test_caller_feats_not_implied() + ret i32 %call +} + +define i32 @caller3._Msve() #1 { +; CHECK-LABEL: define i32 @caller3._Msve( +; CHECK-SAME: ) 
local_unnamed_addr #[[ATTR1]] { +; CHECK: [[CALL:%.*]] = tail call i32 @test_caller_feats_not_implied() +; +entry: + %call = tail call i32 @test_caller_feats_not_implied() + ret i32 %call +} + +define i32 @caller3.default() #0 { +; CHECK-LABEL: define i32 @caller3.default( +; CHECK-SAME: ) local_unnamed_addr #[[ATTR0]] { +; CHECK: [[CALL:%.*]] = tail call i32 @test_caller_feats_not_implied() +; +entry: + %call = tail call i32 @test_caller_feats_not_implied() + ret i32 %call +} + +declare i32 @test_non_fmv_caller._Maes() #6 +declare i32 @test_non_fmv_caller._Msm4() #7 +declare i32 @test_non_fmv_caller.default() #0 + +define weak_odr ptr @test_non_fmv_caller.resolver() comdat { +; CHECK-LABEL: define weak_odr ptr @test_non_fmv_caller.resolver() comdat { +resolver_entry: + tail call void @__init_cpu_features_resolver() + %0 = load i64, ptr @__aarch64_cpu_features, align 8 + %1 = and i64 %0, 32768 + %.not = icmp eq i64 %1, 0 + %test_non_fmv_caller._Maes.test_non_fmv_caller.default = select i1 %.not, ptr @test_non_fmv_caller.default, ptr @test_non_fmv_caller._Maes + ret ptr %test_non_fmv_caller._Maes.test_non_fmv_caller.default +} + +define i32 @caller4() #8 { +; CHECK-LABEL: define i32 @caller4( +; CHECK-SAME: ) local_unnamed_addr #[[ATTR7:[0-9]+]] { +; CHECK: [[CALL:%.*]] = tail call i32 @test_non_fmv_caller._Maes() +; +entry: + %call = tail call i32 @test_non_fmv_caller() + ret i32 %call +} + +define i32 @caller5() #9 { +; CHECK-LABEL: define i32 @caller5( +; CHECK-SAME: ) local_unnamed_addr #[[ATTR8:[0-9]+]] { +; CHECK: [[CALL:%.*]] = tail call i32 @test_non_fmv_caller() +; +entry: + %call = tail call i32 @test_non_fmv_caller() + ret i32 %call +} + +declare i32 @test_priority._Msve2-sha3() #10 +declare i32 @test_priority._Mls64Mssbs() #11 +declare i32 @test_priority._MflagmMlseMrng() #12 +declare i32 @test_priority.default() #0 + +define weak_odr ptr @test_priority.resolver() comdat { +; CHECK-LABEL: define weak_odr ptr @test_priority.resolver() comdat { +resolver_entry: + tail call void @__init_cpu_features_resolver() + %0 = load i64, ptr @__aarch64_cpu_features, align 8 + %1 = and i64 %0, 131 + %2 = icmp eq i64 %1, 131 + br i1 %2, label %common.ret, label %resolver_else + +common.ret: ; preds = %resolver_else2, %resolver_else, %resolver_entry + %common.ret.op = phi ptr [ @test_priority._MflagmMlseMrng, %resolver_entry ], [ @test_priority._Mls64Mssbs, %resolver_else ], [ %test_priority._Msve2-sha3.test_priority.default, %resolver_else2 ] + ret ptr %common.ret.op + +resolver_else: ; preds = %resolver_entry + %3 = and i64 %0, 9570149208162304 + %4 = icmp eq i64 %3, 9570149208162304 + br i1 %4, label %common.ret, label %resolver_else2 + +resolver_else2: ; preds = %resolver_else + %5 = and i64 %0, 1099511627776 + %.not = icmp eq i64 %5, 0 + %test_priority._Msve2-sha3.test_priority.default = select i1 %.not, ptr @test_priority.default, ptr @test_priority._Msve2-sha3 + br label %common.ret +} + +define i32 @caller6._MflagmMls64MlseMrngMssbsMsve2-sha3() #13 { +; CHECK-LABEL: define i32 @caller6._MflagmMls64MlseMrngMssbsMsve2-sha3( +; CHECK-SAME: ) local_unnamed_addr #[[ATTR12:[0-9]+]] { +; CHECK: [[CALL:%.*]] = tail call i32 @test_priority._Mls64Mssbs() +; +entry: + %call = tail call i32 @test_priority() + ret i32 %call +} + +declare i32 @test_alternative_names._Mdpb2Mfrintts() #14 +declare i32 @test_alternative_names._Mflagm2Mfrintts() #15 +declare i32 @test_alternative_names._Mrcpc2() #16 +declare i32 @test_alternative_names.default() #0 + +define weak_odr ptr @test_alternative_names.resolver() 
comdat { +; CHECK-LABEL: define weak_odr ptr @test_alternative_names.resolver() comdat { +resolver_entry: + tail call void @__init_cpu_features_resolver() + %0 = load i64, ptr @__aarch64_cpu_features, align 8 + %1 = and i64 %0, 17563904 + %2 = icmp eq i64 %1, 17563904 + br i1 %2, label %common.ret, label %resolver_else + +common.ret: ; preds = %resolver_else2, %resolver_else, %resolver_entry + %common.ret.op = phi ptr [ @test_alternative_names._Mdpb2Mfrintts, %resolver_entry ], [ @test_alternative_names._Mflagm2Mfrintts, %resolver_else ], [ %test_alternative_names._Mrcpc2.test_alternative_names.default, %resolver_else2 ] + ret ptr %common.ret.op + +resolver_else: ; preds = %resolver_entry + %3 = and i64 %0, 16777478 + %4 = icmp eq i64 %3, 16777478 + br i1 %4, label %common.ret, label %resolver_else2 + +resolver_else2: ; preds = %resolver_else + %5 = and i64 %0, 12582912 + %6 = icmp eq i64 %5, 12582912 + %test_alternative_names._Mrcpc2.test_alternative_names.default = select i1 %6, ptr @test_alternative_names._Mrcpc2, ptr @test_alternative_names.default + br label %common.ret +} + +define i32 @caller7._Mdpb2Mfrintts() #14 { +; CHECK-LABEL: define i32 @caller7._Mdpb2Mfrintts( +; CHECK-SAME: ) local_unnamed_addr #[[ATTR13:[0-9]+]] { +; CHECK: [[CALL:%.*]] = tail call i32 @test_alternative_names._Mdpb2Mfrintts() +; +entry: + %call = tail call i32 @test_alternative_names() + ret i32 %call +} + +define i32 @caller7._Mfrintts() #17 { +; CHECK-LABEL: define i32 @caller7._Mfrintts( +; CHECK-SAME: ) local_unnamed_addr #[[ATTR16:[0-9]+]] { +; CHECK: [[CALL:%.*]] = tail call i32 @test_alternative_names() +; +entry: + %call = tail call i32 @test_alternative_names() + ret i32 %call +} + +define i32 @caller7._Mrcpc2() #16 { +; CHECK-LABEL: define i32 @caller7._Mrcpc2( +; CHECK-SAME: ) local_unnamed_addr #[[ATTR15:[0-9]+]] { +; CHECK: [[CALL:%.*]] = tail call i32 @test_alternative_names._Mrcpc2() +; +entry: + %call = tail call i32 @test_alternative_names() + ret i32 %call +} + +define i32 @caller7.default() #0 { +; CHECK-LABEL: define i32 @caller7.default( +; CHECK-SAME: ) local_unnamed_addr #[[ATTR0]] { +; CHECK: [[CALL:%.*]] = tail call i32 @test_alternative_names.default() +; +entry: + %call = tail call i32 @test_alternative_names() + ret i32 %call +} + +attributes #0 = { "fmv-features" } +attributes #1 = { "fmv-features"="sve" } +attributes #2 = { "fmv-features"="sve2" } +attributes #3 = { "fmv-features"="mops" } +attributes #4 = { "fmv-features"="mops,sve2" } +attributes #5 = { "fmv-features"="sme" } +attributes #6 = { "fmv-features"="aes" } +attributes #7 = { "fmv-features"="sm4" } +attributes #8 = { "target-features"="+aes,+fp-armv8,+neon,+outline-atomics,+v8a" } +attributes #9 = { "target-features"="+fp-armv8,+neon,+outline-atomics,+v8a,+sm4" } +attributes #10 = { "fmv-features"="sve2-sha3" } +attributes #11 = { "fmv-features"="ls64,ssbs" } +attributes #12 = { "fmv-features"="flagm,lse,rng" } +attributes #13 = { "fmv-features"="flagm,ls64,lse,rng,ssbs,sve2-sha3" } +attributes #14 = { "fmv-features"="dpb2,frintts" } +attributes #15 = { "fmv-features"="flagm2,frintts" } +attributes #16 = { "fmv-features"="rcpc2" } +attributes #17 = { "fmv-features"="frintts" } diff --git a/llvm/test/Transforms/InstCombine/fsqrtdiv-transform.ll b/llvm/test/Transforms/InstCombine/fsqrtdiv-transform.ll new file mode 100644 index 0000000000000..6296954333e8a --- /dev/null +++ b/llvm/test/Transforms/InstCombine/fsqrtdiv-transform.ll @@ -0,0 +1,631 @@ +; NOTE: Assertions have been autogenerated by 
utils/update_test_checks.py UTC_ARGS: --version 4 +; RUN: opt -S -passes='instcombine' < %s | FileCheck %s + +@x = global double 0.000000e+00 +@r1 = global double 0.000000e+00 +@r2 = global double 0.000000e+00 +@r3 = global double 0.000000e+00 +@v = global [2 x double] zeroinitializer +@v1 = global [2 x double] zeroinitializer +@v2 = global [2 x double] zeroinitializer + +; div/mul/div1 in the same block. +define void @bb_constraint_case1(double %a) { +; CHECK-LABEL: define void @bb_constraint_case1( +; CHECK-SAME: double [[A:%.*]]) { +; CHECK-NEXT: entry: +; CHECK-NEXT: [[SQRT1:%.*]] = call reassoc double @llvm.sqrt.f64(double [[A]]) +; CHECK-NEXT: [[TMP0:%.*]] = fdiv reassoc double 1.000000e+00, [[A]] +; CHECK-NEXT: [[DIV:%.*]] = fmul reassoc double [[TMP0]], [[SQRT1]] +; CHECK-NEXT: store double [[DIV]], ptr @x, align 8 +; CHECK-NEXT: store double [[TMP0]], ptr @r1, align 8 +; CHECK-NEXT: store double [[SQRT1]], ptr @r2, align 8 +; CHECK-NEXT: ret void +; +entry: + %sqrt = call reassoc nnan nsz ninf double @llvm.sqrt.f64(double %a) + %div = fdiv reassoc arcp ninf double 1.000000e+00, %sqrt + store double %div, ptr @x + %mul = fmul reassoc double %div, %div + store double %mul, ptr @r1 + %div1 = fdiv reassoc double %a, %sqrt + store double %div1, ptr @r2 + ret void +} + +; div/mul in one block and div1 in other block with conditional guard. +define void @bb_constraint_case2(double %a, i32 %d) { +; CHECK-LABEL: define void @bb_constraint_case2( +; CHECK-SAME: double [[A:%.*]], i32 [[D:%.*]]) { +; CHECK-NEXT: entry: +; CHECK-NEXT: [[SQRT1:%.*]] = call reassoc double @llvm.sqrt.f64(double [[A]]) +; CHECK-NEXT: [[TMP0:%.*]] = fdiv reassoc double 1.000000e+00, [[A]] +; CHECK-NEXT: [[DIV:%.*]] = fmul reassoc double [[TMP0]], [[SQRT1]] +; CHECK-NEXT: store double [[DIV]], ptr @x, align 8 +; CHECK-NEXT: store double [[TMP0]], ptr @r1, align 8 +; CHECK-NEXT: [[D_NOT:%.*]] = icmp eq i32 [[D]], 0 +; CHECK-NEXT: br i1 [[D_NOT]], label [[IF_END:%.*]], label [[IF_THEN:%.*]] +; CHECK: if.then: +; CHECK-NEXT: store double [[SQRT1]], ptr @r2, align 8 +; CHECK-NEXT: br label [[IF_END]] +; CHECK: if.end: +; CHECK-NEXT: ret void +; +entry: + %sqrt = call reassoc nnan nsz ninf double @llvm.sqrt.f64(double %a) + %div = fdiv reassoc arcp ninf double 1.000000e+00, %sqrt + store double %div, ptr @x + %mul = fmul reassoc double %div, %div + store double %mul, ptr @r1 + %d.not = icmp eq i32 %d, 0 + br i1 %d.not, label %if.end, label %if.then + +if.then: ; preds = %entry + %div1 = fdiv reassoc double %a, %sqrt + store double %div1, ptr @r2 + br label %if.end + +if.end: ; preds = %if.then, %entry + ret void +} + +; div in one block. mul/div1 in other block and conditionally guarded. Don't optimize. 
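+; (For intuition: with r = 1/sqrt(a) the fold uses r*r = 1/a and a/sqrt(a) =
+; sqrt(a), rebuilding r as (1/a)*sqrt(a), as the checks in case1/case2 above
+; show. Here the fmul no longer shares the fdiv's block, and the checks below
+; expect the original sequence to be kept.)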
+define void @bb_constraint_case3(double %a, i32 %d) { +; CHECK-LABEL: define void @bb_constraint_case3( +; CHECK-SAME: double [[A:%.*]], i32 [[D:%.*]]) { +; CHECK-NEXT: entry: +; CHECK-NEXT: [[SQRT:%.*]] = call reassoc nnan ninf nsz double @llvm.sqrt.f64(double [[A]]) +; CHECK-NEXT: [[DIV:%.*]] = fdiv reassoc ninf arcp double 1.000000e+00, [[SQRT]] +; CHECK-NEXT: store double [[DIV]], ptr @x, align 8 +; CHECK-NEXT: [[D_NOT:%.*]] = icmp eq i32 [[D]], 0 +; CHECK-NEXT: br i1 [[D_NOT]], label [[IF_END:%.*]], label [[IF_THEN:%.*]] +; CHECK: if.then: +; CHECK-NEXT: [[MUL:%.*]] = fmul reassoc double [[DIV]], [[DIV]] +; CHECK-NEXT: store double [[MUL]], ptr @r1, align 8 +; CHECK-NEXT: [[DIV1:%.*]] = fdiv reassoc double [[A]], [[SQRT]] +; CHECK-NEXT: store double [[DIV1]], ptr @r2, align 8 +; CHECK-NEXT: br label [[IF_END]] +; CHECK: if.end: +; CHECK-NEXT: ret void +; +entry: + %sqrt = call reassoc nnan nsz ninf double @llvm.sqrt.f64(double %a) + %div = fdiv reassoc arcp ninf double 1.000000e+00, %sqrt + store double %div, ptr @x + %d.not = icmp eq i32 %d, 0 + br i1 %d.not, label %if.end, label %if.then + +if.then: ; preds = %entry + %mul = fmul reassoc double %div, %div + store double %mul, ptr @r1 + %div1 = fdiv reassoc double %a, %sqrt + store double %div1, ptr @r2 + br label %if.end + +if.end: ; preds = %if.then, %entry + ret void +} + +; div in one block. mul/div1 each in different block and conditionally guarded. Don't optimize. +define void @bb_constraint_case4(double %a, i32 %c, i32 %d) { +; CHECK-LABEL: define void @bb_constraint_case4( +; CHECK-SAME: double [[A:%.*]], i32 [[C:%.*]], i32 [[D:%.*]]) { +; CHECK-NEXT: entry: +; CHECK-NEXT: [[SQRT:%.*]] = call reassoc nnan ninf nsz double @llvm.sqrt.f64(double [[A]]) +; CHECK-NEXT: [[DIV:%.*]] = fdiv reassoc ninf arcp double 1.000000e+00, [[SQRT]] +; CHECK-NEXT: store double [[DIV]], ptr @x, align 8 +; CHECK-NEXT: [[C_NOT:%.*]] = icmp eq i32 [[C]], 0 +; CHECK-NEXT: br i1 [[C_NOT]], label [[IF_END:%.*]], label [[IF_THEN:%.*]] +; CHECK: if.then: +; CHECK-NEXT: [[MUL:%.*]] = fmul reassoc double [[DIV]], [[DIV]] +; CHECK-NEXT: store double [[MUL]], ptr @r1, align 8 +; CHECK-NEXT: br label [[IF_END]] +; CHECK: if.end: +; CHECK-NEXT: [[D_NOT:%.*]] = icmp eq i32 [[D]], 0 +; CHECK-NEXT: br i1 [[D_NOT]], label [[IF_END1:%.*]], label [[IF_THEN1:%.*]] +; CHECK: if.then1: +; CHECK-NEXT: [[DIV1:%.*]] = fdiv reassoc double [[A]], [[SQRT]] +; CHECK-NEXT: store double [[DIV1]], ptr @r2, align 8 +; CHECK-NEXT: br label [[IF_END1]] +; CHECK: if.end1: +; CHECK-NEXT: ret void +; +entry: + %sqrt = call reassoc nnan nsz ninf double @llvm.sqrt.f64(double %a) + %div = fdiv reassoc arcp ninf double 1.000000e+00, %sqrt + store double %div, ptr @x + %c.not = icmp eq i32 %c, 0 + br i1 %c.not, label %if.end, label %if.then + +if.then: ; preds = %entry + %mul = fmul reassoc double %div, %div + store double %mul, ptr @r1 + br label %if.end + +if.end: ; preds = %if.then, %entry + %d.not = icmp eq i32 %d, 0 + br i1 %d.not, label %if.end1, label %if.then1 + +if.then1: ; preds = %if.end + %div1 = fdiv reassoc double %a, %sqrt + store double %div1, ptr @r2 + br label %if.end1 + +if.end1: ; preds = %if.then1, %if.end + ret void +} + +; sqrt value comes from different blocks. Don't optimize. 
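+; (The %sqrt reaching the uses is a phi of two sqrt calls with different
+; arguments, %a and %a + 10, so there is no single operand from which to form
+; the 1/a reciprocal; the checks below expect the IR to be unchanged.)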
+define void @bb_constraint_case5(double %a, i32 %c) { +; CHECK-LABEL: define void @bb_constraint_case5( +; CHECK-SAME: double [[A:%.*]], i32 [[C:%.*]]) { +; CHECK-NEXT: entry: +; CHECK-NEXT: [[C_NOT:%.*]] = icmp eq i32 [[C]], 0 +; CHECK-NEXT: br i1 [[C_NOT]], label [[IF_ELSE:%.*]], label [[IF_THEN:%.*]] +; CHECK: if.then: +; CHECK-NEXT: [[TMP0:%.*]] = call reassoc nnan ninf nsz double @llvm.sqrt.f64(double [[A]]) +; CHECK-NEXT: br label [[IF_END:%.*]] +; CHECK: if.else: +; CHECK-NEXT: [[ADD:%.*]] = fadd double [[A]], 1.000000e+01 +; CHECK-NEXT: [[TMP1:%.*]] = call reassoc nnan ninf nsz double @llvm.sqrt.f64(double [[ADD]]) +; CHECK-NEXT: br label [[IF_END]] +; CHECK: if.end: +; CHECK-NEXT: [[SQRT:%.*]] = phi double [ [[TMP0]], [[IF_THEN]] ], [ [[TMP1]], [[IF_ELSE]] ] +; CHECK-NEXT: [[DIV:%.*]] = fdiv reassoc ninf arcp double 1.000000e+00, [[SQRT]] +; CHECK-NEXT: [[MUL:%.*]] = fmul reassoc double [[DIV]], [[DIV]] +; CHECK-NEXT: store double [[MUL]], ptr @r1, align 8 +; CHECK-NEXT: [[DIV1:%.*]] = fdiv reassoc double [[A]], [[SQRT]] +; CHECK-NEXT: store double [[DIV1]], ptr @r2, align 8 +; CHECK-NEXT: ret void +; +entry: + %c.not = icmp eq i32 %c, 0 + br i1 %c.not, label %if.else, label %if.then + +if.then: ; preds = %entry + %0 = call reassoc nnan nsz ninf double @llvm.sqrt.f64(double %a) + br label %if.end + +if.else: ; preds = %entry + %add = fadd double %a, 1.000000e+01 + %1 = call reassoc nnan nsz ninf double @llvm.sqrt.f64(double %add) + br label %if.end + +if.end: ; preds = %if.else, %if.then + %sqrt = phi double[ %0, %if.then], [ %1, %if.else] + %div = fdiv reassoc arcp ninf double 1.000000e+00, %sqrt + %mul = fmul reassoc double %div, %div + store double %mul, ptr @r1 + %div1 = fdiv reassoc double %a, %sqrt + store double %div1, ptr @r2 + ret void +} + +; div in one block and conditionally guarded. mul/div1 in other block. Don't optimize. 
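+; (Here 1/sqrt(a) is only computed on the if.then path and %div is a phi whose
+; other incoming value is a plain load, so %mul cannot simply be rewritten to
+; 1/a; the checks below keep the original fdiv/fmul.)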
+define void @bb_constraint_case6(double %a, i32 %d) { +; CHECK-LABEL: define void @bb_constraint_case6( +; CHECK-SAME: double [[A:%.*]], i32 [[D:%.*]]) { +; CHECK-NEXT: entry: +; CHECK-NEXT: [[SQRT:%.*]] = call reassoc nnan ninf nsz double @llvm.sqrt.f64(double [[A]]) +; CHECK-NEXT: [[D_NOT:%.*]] = icmp eq i32 [[D]], 0 +; CHECK-NEXT: br i1 [[D_NOT]], label [[IF_ELSE:%.*]], label [[IF_THEN:%.*]] +; CHECK: if.else: +; CHECK-NEXT: [[TMP0:%.*]] = load double, ptr @x, align 8 +; CHECK-NEXT: br label [[IF_END:%.*]] +; CHECK: if.then: +; CHECK-NEXT: [[TMP1:%.*]] = fdiv reassoc ninf arcp double 1.000000e+00, [[SQRT]] +; CHECK-NEXT: store double [[TMP1]], ptr @x, align 8 +; CHECK-NEXT: br label [[IF_END]] +; CHECK: if.end: +; CHECK-NEXT: [[DIV:%.*]] = phi double [ [[TMP0]], [[IF_ELSE]] ], [ [[TMP1]], [[IF_THEN]] ] +; CHECK-NEXT: [[MUL:%.*]] = fmul reassoc double [[DIV]], [[DIV]] +; CHECK-NEXT: store double [[MUL]], ptr @r1, align 8 +; CHECK-NEXT: [[DIV1:%.*]] = fdiv reassoc double [[A]], [[SQRT]] +; CHECK-NEXT: store double [[DIV1]], ptr @r2, align 8 +; CHECK-NEXT: ret void +; +entry: + %sqrt = call reassoc nnan nsz ninf double @llvm.sqrt.f64(double %a) + %d.not = icmp eq i32 %d, 0 + br i1 %d.not, label %if.else, label %if.then + +if.else: ; preds = %entry + %1 = load double, ptr @x + br label %if.end + +if.then: ; preds = %entry + %2 = fdiv reassoc arcp ninf double 1.000000e+00, %sqrt + store double %2, ptr @x + br label %if.end + +if.end: ; preds = %if.else, %if.then + %div = phi double [ %1, %if.else ], [ %2, %if.then ] + %mul = fmul reassoc double %div, %div + store double %mul, ptr @r1 + %div1 = fdiv reassoc double %a, %sqrt + store double %div1, ptr @r2 + ret void +} + +; value for mul comes from different blocks. Don't optimize. +define void @bb_constraint_case7(double %a, i32 %c, i32 %d) { +; CHECK-LABEL: define void @bb_constraint_case7( +; CHECK-SAME: double [[A:%.*]], i32 [[C:%.*]], i32 [[D:%.*]]) { +; CHECK-NEXT: entry: +; CHECK-NEXT: [[SQRT:%.*]] = call reassoc nnan ninf nsz double @llvm.sqrt.f64(double [[A]]) +; CHECK-NEXT: [[DIV:%.*]] = fdiv reassoc ninf arcp double 1.000000e+00, [[SQRT]] +; CHECK-NEXT: store double [[DIV]], ptr @x, align 8 +; CHECK-NEXT: [[C_NOT:%.*]] = icmp eq i32 [[C]], 0 +; CHECK-NEXT: br i1 [[C_NOT]], label [[IF_ELSE:%.*]], label [[IF_THEN:%.*]] +; CHECK: if.then: +; CHECK-NEXT: [[TMP0:%.*]] = fdiv double 3.000000e+00, [[A]] +; CHECK-NEXT: br label [[IF_END:%.*]] +; CHECK: if.else: +; CHECK-NEXT: [[D_NOT:%.*]] = icmp eq i32 [[D]], 0 +; CHECK-NEXT: br i1 [[D_NOT]], label [[IF_ELSE1:%.*]], label [[IF_THEN1:%.*]] +; CHECK: if.then1: +; CHECK-NEXT: [[TMP1:%.*]] = fdiv double 2.000000e+00, [[A]] +; CHECK-NEXT: br label [[IF_END]] +; CHECK: if.else1: +; CHECK-NEXT: [[TMP2:%.*]] = fmul reassoc double [[DIV]], [[DIV]] +; CHECK-NEXT: br label [[IF_END]] +; CHECK: if.end: +; CHECK-NEXT: [[MUL:%.*]] = phi double [ [[TMP1]], [[IF_THEN1]] ], [ [[TMP2]], [[IF_ELSE1]] ], [ [[TMP0]], [[IF_THEN]] ] +; CHECK-NEXT: store double [[MUL]], ptr @r1, align 8 +; CHECK-NEXT: [[DIV1:%.*]] = fdiv reassoc double [[A]], [[SQRT]] +; CHECK-NEXT: store double [[DIV1]], ptr @r2, align 8 +; CHECK-NEXT: ret void +; +entry: + %sqrt = call reassoc nnan nsz ninf double @llvm.sqrt.f64(double %a) + %div = fdiv reassoc arcp ninf double 1.000000e+00, %sqrt + store double %div, ptr @x + %c.not = icmp eq i32 %c, 0 + br i1 %c.not, label %if.else, label %if.then + +if.then: ; preds = %entry + %1 = fdiv double 3.000000e+00, %a + br label %if.end + +if.else: ; preds = %entry + %d.not = icmp eq i32 %d, 0 + br 
i1 %d.not, label %if.else1, label %if.then1 + +if.then1: ; preds = %if.else + %2 = fdiv double 2.000000e+00, %a + br label %if.end + +if.else1: ; preds = %if.else + %3 = fmul reassoc double %div, %div + br label %if.end + +if.end: ; preds = %if.then1, %if.else1, %if.then + %mul = phi double [ %2, %if.then1 ], [ %3, %if.else1 ], [ %1, %if.then ] + store double %mul, ptr @r1 + %div1 = fdiv reassoc double %a, %sqrt + store double %div1, ptr @r2 + ret void +} + +; value of mul comes from two different blocks(as shown by select ins). +define void @bb_constraint_case8(double %a, i32 %c) { +; CHECK-LABEL: define void @bb_constraint_case8( +; CHECK-SAME: double [[A:%.*]], i32 [[C:%.*]]) { +; CHECK-NEXT: entry: +; CHECK-NEXT: [[SQRT1:%.*]] = call reassoc double @llvm.sqrt.f64(double [[A]]) +; CHECK-NEXT: [[TMP0:%.*]] = fdiv reassoc double 1.000000e+00, [[A]] +; CHECK-NEXT: [[DIV:%.*]] = fmul reassoc double [[TMP0]], [[SQRT1]] +; CHECK-NEXT: store double [[DIV]], ptr @x, align 8 +; CHECK-NEXT: [[C_NOT:%.*]] = icmp eq i32 [[C]], 0 +; CHECK-NEXT: [[TMP1:%.*]] = fmul double [[A]], [[A]] +; CHECK-NEXT: [[MUL:%.*]] = select i1 [[C_NOT]], double [[TMP1]], double [[TMP0]] +; CHECK-NEXT: store double [[MUL]], ptr @r1, align 8 +; CHECK-NEXT: store double [[SQRT1]], ptr @r2, align 8 +; CHECK-NEXT: ret void +; +entry: + %sqrt = call reassoc nnan nsz ninf double @llvm.sqrt.f64(double %a) + %div = fdiv reassoc arcp ninf double 1.000000e+00, %sqrt + store double %div, ptr @x + %c.not = icmp eq i32 %c, 0 + %1 = fmul double %a, %a + %2 = fmul reassoc double %div, %div + %mul = select i1 %c.not, double %1, double %2 + store double %mul, ptr @r1 + %div1 = fdiv reassoc double %a, %sqrt + store double %div1, ptr @r2 + ret void +} + +; multiple instances of multiply ops to optimize. Optimize all. +define void @mutiple_multiply_instances(double %a, i32 %c) { +; CHECK-LABEL: define void @mutiple_multiply_instances( +; CHECK-SAME: double [[A:%.*]], i32 [[C:%.*]]) { +; CHECK-NEXT: entry: +; CHECK-NEXT: [[SQRT1:%.*]] = call reassoc double @llvm.sqrt.f64(double [[A]]) +; CHECK-NEXT: [[TMP1:%.*]] = fdiv reassoc double 1.000000e+00, [[A]] +; CHECK-NEXT: [[DIV:%.*]] = fmul reassoc double [[TMP1]], [[SQRT1]] +; CHECK-NEXT: store double [[DIV]], ptr @x, align 8 +; CHECK-NEXT: [[C_NOT:%.*]] = icmp eq i32 [[C]], 0 +; CHECK-NEXT: [[TMP2:%.*]] = fmul double [[A]], [[A]] +; CHECK-NEXT: [[TMP3:%.*]] = fmul double [[A]], [[A]] +; CHECK-NEXT: [[MUL1:%.*]] = select i1 [[C_NOT]], double [[TMP2]], double [[TMP1]] +; CHECK-NEXT: [[MUL2:%.*]] = select i1 [[C_NOT]], double [[TMP1]], double [[TMP3]] +; CHECK-NEXT: store double [[MUL1]], ptr @r1, align 8 +; CHECK-NEXT: store double [[MUL2]], ptr @r3, align 8 +; CHECK-NEXT: store double [[SQRT1]], ptr @r2, align 8 +; CHECK-NEXT: ret void +; +entry: + %sqrt = call reassoc nnan nsz ninf double @llvm.sqrt.f64(double %a) + %div = fdiv reassoc arcp ninf double 1.000000e+00, %sqrt + store double %div, ptr @x + %c.not = icmp eq i32 %c, 0 + %1 = fmul double %a, %a + %2 = fmul double %a, %a + %3 = fmul reassoc double %div, %div + %4 = fmul reassoc double %div, %div + %mul1 = select i1 %c.not, double %1, double %3 + %mul2 = select i1 %c.not, double %4, double %2 + store double %mul1, ptr @r1 + store double %mul2, ptr @r3 + %div1 = fdiv reassoc double %a, %sqrt + store double %div1, ptr @r2 + ret void +} + +; missing flags for optimization. 
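+; (Below, the 1.0/sqrt(a) fdiv carries reassoc and ninf but not arcp, so
+; replacing the division with a multiply by a reciprocal is not permitted and
+; the checks expect the original instructions.)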
+define void @missing_arcp_flag_on_div(double %a) { +; CHECK-LABEL: define void @missing_arcp_flag_on_div( +; CHECK-SAME: double [[A:%.*]]) { +; CHECK-NEXT: entry: +; CHECK-NEXT: [[SQRT:%.*]] = call reassoc nnan ninf nsz double @llvm.sqrt.f64(double [[A]]) +; CHECK-NEXT: [[DIV:%.*]] = fdiv reassoc ninf double 1.000000e+00, [[SQRT]] +; CHECK-NEXT: store double [[DIV]], ptr @x, align 8 +; CHECK-NEXT: [[MUL:%.*]] = fmul reassoc double [[DIV]], [[DIV]] +; CHECK-NEXT: store double [[MUL]], ptr @r1, align 8 +; CHECK-NEXT: [[DIV1:%.*]] = fdiv reassoc double [[A]], [[SQRT]] +; CHECK-NEXT: store double [[DIV1]], ptr @r2, align 8 +; CHECK-NEXT: ret void +; +entry: + %sqrt = call reassoc nnan nsz ninf double @llvm.sqrt.f64(double %a) + %div = fdiv reassoc ninf double 1.000000e+00, %sqrt + store double %div, ptr @x + %mul = fmul reassoc double %div, %div + store double %mul, ptr @r1 + %div1 = fdiv reassoc double %a, %sqrt + store double %div1, ptr @r2 + ret void +} + +; missing flags for optimization. +define void @missing_reassoc_flag_on_mul(double %a) { +; CHECK-LABEL: define void @missing_reassoc_flag_on_mul( +; CHECK-SAME: double [[A:%.*]]) { +; CHECK-NEXT: entry: +; CHECK-NEXT: [[SQRT:%.*]] = call reassoc nnan ninf nsz double @llvm.sqrt.f64(double [[A]]) +; CHECK-NEXT: [[DIV:%.*]] = fdiv reassoc ninf arcp double 1.000000e+00, [[SQRT]] +; CHECK-NEXT: store double [[DIV]], ptr @x, align 8 +; CHECK-NEXT: [[MUL:%.*]] = fmul double [[DIV]], [[DIV]] +; CHECK-NEXT: store double [[MUL]], ptr @r1, align 8 +; CHECK-NEXT: [[DIV1:%.*]] = fdiv reassoc double [[A]], [[SQRT]] +; CHECK-NEXT: store double [[DIV1]], ptr @r2, align 8 +; CHECK-NEXT: ret void +; +entry: + %sqrt = call reassoc nnan nsz ninf double @llvm.sqrt.f64(double %a) + %div = fdiv reassoc arcp ninf double 1.000000e+00, %sqrt + store double %div, ptr @x + %mul = fmul double %div, %div + store double %mul, ptr @r1 + %div1 = fdiv reassoc double %a, %sqrt + store double %div1, ptr @r2 + ret void +} + +; missing flags for optimization. 
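+; (Below it is the a/sqrt(a) division that lacks reassoc, so it cannot be
+; reassociated into sqrt(a) and the whole pattern is left as is.)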
+define void @missing_reassoc_flag_on_div1(double %a) { +; CHECK-LABEL: define void @missing_reassoc_flag_on_div1( +; CHECK-SAME: double [[A:%.*]]) { +; CHECK-NEXT: entry: +; CHECK-NEXT: [[SQRT:%.*]] = call reassoc nnan ninf nsz double @llvm.sqrt.f64(double [[A]]) +; CHECK-NEXT: [[DIV:%.*]] = fdiv reassoc ninf arcp double 1.000000e+00, [[SQRT]] +; CHECK-NEXT: store double [[DIV]], ptr @x, align 8 +; CHECK-NEXT: [[MUL:%.*]] = fmul reassoc double [[DIV]], [[DIV]] +; CHECK-NEXT: store double [[MUL]], ptr @r1, align 8 +; CHECK-NEXT: [[DIV1:%.*]] = fdiv double [[A]], [[SQRT]] +; CHECK-NEXT: store double [[DIV1]], ptr @r2, align 8 +; CHECK-NEXT: ret void +; +entry: + %sqrt = call reassoc nnan nsz ninf double @llvm.sqrt.f64(double %a) + %div = fdiv reassoc arcp ninf double 1.000000e+00, %sqrt + store double %div, ptr @x + %mul = fmul reassoc double %div, %div + store double %mul, ptr @r1 + %div1 = fdiv double %a, %sqrt + store double %div1, ptr @r2 + ret void +} + +; div = -1/sqrt(a) +define void @negative_fdiv_val(double %a) { +; CHECK-LABEL: define void @negative_fdiv_val( +; CHECK-SAME: double [[A:%.*]]) { +; CHECK-NEXT: entry: +; CHECK-NEXT: [[SQRT1:%.*]] = call reassoc double @llvm.sqrt.f64(double [[A]]) +; CHECK-NEXT: [[TMP0:%.*]] = fdiv reassoc double 1.000000e+00, [[A]] +; CHECK-NEXT: [[TMP1:%.*]] = fneg reassoc double [[SQRT1]] +; CHECK-NEXT: [[DIV:%.*]] = fmul reassoc double [[TMP0]], [[TMP1]] +; CHECK-NEXT: store double [[DIV]], ptr @x, align 8 +; CHECK-NEXT: store double [[TMP0]], ptr @r1, align 8 +; CHECK-NEXT: store double [[SQRT1]], ptr @r2, align 8 +; CHECK-NEXT: ret void +; +entry: + %sqrt = call reassoc nnan nsz ninf double @llvm.sqrt.f64(double %a) + %div = fdiv reassoc arcp ninf double -1.000000e+00, %sqrt + store double %div, ptr @x + %mul = fmul reassoc double %div, %div + store double %mul, ptr @r1 + %div1 = fdiv reassoc double %a, %sqrt + store double %div1, ptr @r2 + ret void +} + +define void @fpmath_metadata_on_div1(double %a) { +; CHECK-LABEL: define void @fpmath_metadata_on_div1( +; CHECK-SAME: double [[A:%.*]]) { +; CHECK-NEXT: entry: +; CHECK-NEXT: [[SQRT1:%.*]] = call reassoc double @llvm.sqrt.f64(double [[A]]), !fpmath [[META0:![0-9]+]] +; CHECK-NEXT: [[TMP0:%.*]] = fdiv reassoc double 1.000000e+00, [[A]] +; CHECK-NEXT: [[DIV:%.*]] = fmul reassoc double [[TMP0]], [[SQRT1]] +; CHECK-NEXT: store double [[DIV]], ptr @x, align 8 +; CHECK-NEXT: store double [[TMP0]], ptr @r1, align 8 +; CHECK-NEXT: store double [[SQRT1]], ptr @r2, align 8 +; CHECK-NEXT: ret void +; +entry: + %sqrt = call reassoc nnan nsz ninf double @llvm.sqrt.f64(double %a) + %div = fdiv reassoc arcp ninf double 1.000000e+00, %sqrt + store double %div, ptr @x + %mul = fmul reassoc double %div, %div + store double %mul, ptr @r1 + %div1 = fdiv reassoc double %a, %sqrt, !fpmath !3 + store double %div1, ptr @r2 + ret void +} + +define void @fpmath_metadata_on_mul(double %a) { +; CHECK-LABEL: define void @fpmath_metadata_on_mul( +; CHECK-SAME: double [[A:%.*]]) { +; CHECK-NEXT: entry: +; CHECK-NEXT: [[SQRT1:%.*]] = call reassoc double @llvm.sqrt.f64(double [[A]]) +; CHECK-NEXT: [[TMP0:%.*]] = fdiv reassoc double 1.000000e+00, [[A]], !fpmath [[META1:![0-9]+]] +; CHECK-NEXT: [[DIV:%.*]] = fmul reassoc double [[TMP0]], [[SQRT1]] +; CHECK-NEXT: store double [[DIV]], ptr @x, align 8 +; CHECK-NEXT: store double [[TMP0]], ptr @r1, align 8 +; CHECK-NEXT: store double [[SQRT1]], ptr @r2, align 8 +; CHECK-NEXT: ret void +; +entry: + %sqrt = call reassoc nnan nsz ninf double @llvm.sqrt.f64(double %a) + %div = fdiv 
reassoc arcp ninf double 1.000000e+00, %sqrt + store double %div, ptr @x + %mul = fmul reassoc double %div, %div, !fpmath !2 + store double %mul, ptr @r1 + %div1 = fdiv reassoc double %a, %sqrt + store double %div1, ptr @r2 + ret void +} + +; FIXME: DIV in the result should get the fpmath metadata from %div. +define void @fpmath_metadata_on_div(double %a) { +; CHECK-LABEL: define void @fpmath_metadata_on_div( +; CHECK-SAME: double [[A:%.*]]) { +; CHECK-NEXT: entry: +; CHECK-NEXT: [[SQRT1:%.*]] = call reassoc double @llvm.sqrt.f64(double [[A]]) +; CHECK-NEXT: [[TMP0:%.*]] = fdiv reassoc double 1.000000e+00, [[A]] +; CHECK-NEXT: [[DIV:%.*]] = fmul reassoc double [[TMP0]], [[SQRT1]], !fpmath [[META2:![0-9]+]] +; CHECK-NEXT: store double [[DIV]], ptr @x, align 8 +; CHECK-NEXT: store double [[TMP0]], ptr @r1, align 8 +; CHECK-NEXT: store double [[SQRT1]], ptr @r2, align 8 +; CHECK-NEXT: ret void +; +entry: + %sqrt = call reassoc nnan nsz ninf double @llvm.sqrt.f64(double %a) + %div = fdiv reassoc arcp ninf double 1.000000e+00, %sqrt, !fpmath !1 + store double %div, ptr @x + %mul = fmul reassoc double %div, %div + store double %mul, ptr @r1 + %div1 = fdiv reassoc double %a, %sqrt + store double %div1, ptr @r2 + ret void +} + +define void @fpmath_metadata_on_all(double %a) { +; CHECK-LABEL: define void @fpmath_metadata_on_all( +; CHECK-SAME: double [[A:%.*]]) { +; CHECK-NEXT: entry: +; CHECK-NEXT: [[SQRT1:%.*]] = call reassoc double @llvm.sqrt.f64(double [[A]]), !fpmath [[META0]] +; CHECK-NEXT: [[TMP0:%.*]] = fdiv reassoc double 1.000000e+00, [[A]], !fpmath [[META1]] +; CHECK-NEXT: [[DIV:%.*]] = fmul reassoc double [[TMP0]], [[SQRT1]], !fpmath [[META2]] +; CHECK-NEXT: store double [[DIV]], ptr @x, align 8 +; CHECK-NEXT: store double [[TMP0]], ptr @r1, align 8 +; CHECK-NEXT: store double [[SQRT1]], ptr @r2, align 8 +; CHECK-NEXT: ret void +; +entry: + %sqrt = call reassoc nnan nsz ninf double @llvm.sqrt.f64(double %a), !fpmath !0 + %div = fdiv reassoc arcp ninf double 1.000000e+00, %sqrt, !fpmath !1 + store double %div, ptr @x + %mul = fmul reassoc double %div, %div, !fpmath !2 + store double %mul, ptr @r1 + %div1 = fdiv reassoc double %a, %sqrt, !fpmath !3 + store double %div1, ptr @r2 + ret void +} + +define void @vector_input(<2 x double> %a) { +; CHECK-LABEL: define void @vector_input( +; CHECK-SAME: <2 x double> [[A:%.*]]) { +; CHECK-NEXT: entry: +; CHECK-NEXT: [[SQRT1:%.*]] = call reassoc <2 x double> @llvm.sqrt.v2f64(<2 x double> [[A]]) +; CHECK-NEXT: [[TMP0:%.*]] = fdiv reassoc <2 x double> splat (double 1.000000e+00), [[A]] +; CHECK-NEXT: [[DIV:%.*]] = fmul reassoc <2 x double> [[TMP0]], [[SQRT1]] +; CHECK-NEXT: store <2 x double> [[DIV]], ptr @v, align 16 +; CHECK-NEXT: store <2 x double> [[TMP0]], ptr @v1, align 16 +; CHECK-NEXT: store <2 x double> [[SQRT1]], ptr @v2, align 16 +; CHECK-NEXT: ret void +; +entry: + %sqrt = call reassoc nnan nsz ninf <2 x double> @llvm.sqrt.v2f64(<2 x double> %a) + %div = fdiv reassoc arcp ninf <2 x double>, %sqrt + store <2 x double> %div, ptr @v + %mul = fmul reassoc <2 x double> %div, %div + store <2 x double> %mul, ptr @v1 + %div1 = fdiv reassoc <2 x double> %a, %sqrt + store <2 x double> %div1, ptr @v2 + ret void +} + +define void @strict_fp_metadata(double %a) { +; CHECK-LABEL: define void @strict_fp_metadata( +; CHECK-SAME: double [[A:%.*]]) { +; CHECK-NEXT: entry: +; CHECK-NEXT: [[CONV:%.*]] = call double @llvm.experimental.constrained.sitofp.f64.i32(i32 1, metadata !"round.dynamic", metadata !"fpexcept.strict") +; CHECK-NEXT: [[CALL:%.*]] = call 
double @llvm.sqrt.f64(double noundef [[A]]) +; CHECK-NEXT: [[DIV:%.*]] = call double @llvm.experimental.constrained.fdiv.f64(double [[CONV]], double [[CALL]], metadata !"round.dynamic", metadata !"fpexcept.strict") +; CHECK-NEXT: store double [[DIV]], ptr @x, align 8 +; CHECK-NEXT: [[MUL:%.*]] = call double @llvm.experimental.constrained.fmul.f64(double [[DIV]], double [[DIV]], metadata !"round.dynamic", metadata !"fpexcept.strict") +; CHECK-NEXT: store double [[MUL]], ptr @r1, align 8 +; CHECK-NEXT: [[DIV2:%.*]] = call double @llvm.experimental.constrained.fdiv.f64(double [[A]], double [[CALL]], metadata !"round.dynamic", metadata !"fpexcept.strict") +; CHECK-NEXT: store double [[DIV2]], ptr @r2, align 8 +; CHECK-NEXT: ret void +; +entry: + %conv = call double @llvm.experimental.constrained.sitofp.f64.i32(i32 1, metadata !"round.dynamic", metadata !"fpexcept.strict") + %call = call double @llvm.sqrt.f64(double noundef %a) + %div = call double @llvm.experimental.constrained.fdiv.f64(double %conv, double %call, metadata !"round.dynamic", metadata !"fpexcept.strict") + store double %div, ptr @x + %mul = call double @llvm.experimental.constrained.fmul.f64(double %div, double %div, metadata !"round.dynamic", metadata !"fpexcept.strict") + store double %mul, ptr @r1 + %div2 = call double @llvm.experimental.constrained.fdiv.f64(double %a, double %call, metadata !"round.dynamic", metadata !"fpexcept.strict") + store double %div2, ptr @r2 + ret void +} + +declare double @llvm.experimental.constrained.sitofp.f64.i32(i32, metadata, metadata) +declare double @llvm.experimental.constrained.fdiv.f64(double, double, metadata, metadata) +declare double @llvm.experimental.constrained.fmul.f64(double, double, metadata, metadata) +declare double @llvm.sqrt.f64(double) +declare <2 x double> @llvm.sqrt.v2f64(<2 x double>) + +!0 = !{float 2.5} +!1 = !{float 3.5} +!2 = !{float 4.5} +!3 = !{float 5.5} +; CHECK: [[META0]] = !{float 5.500000e+00} +; CHECK: [[META1]] = !{float 4.500000e+00} +; CHECK: [[META2]] = !{float 3.500000e+00} diff --git a/llvm/test/Transforms/InstCombine/nsw.ll b/llvm/test/Transforms/InstCombine/nsw.ll index 329a47324f862..b00f2e58add78 100644 --- a/llvm/test/Transforms/InstCombine/nsw.ll +++ b/llvm/test/Transforms/InstCombine/nsw.ll @@ -415,3 +415,63 @@ define i8 @neg_nsw_mul_missing_nsw_on_mul(i8 %a1, i8 %a2, i8 %b) { %neg = sub nsw i8 0, %shl ret i8 %neg } + +; This could propagate nsw. + +define i16 @mul_nsw_reassoc_prop(i16 %x) { +; CHECK-LABEL: @mul_nsw_reassoc_prop( +; CHECK-NEXT: [[B:%.*]] = mul nsw i16 [[X:%.*]], 6 +; CHECK-NEXT: ret i16 [[B]] +; + %a = mul nsw i16 %x, 3 + %b = mul nsw i16 %a, 2 + ret i16 %b +} + +; This could propagate nsw. + +define i16 @mul_nsw_reassoc_prop_neg(i16 %x) { +; CHECK-LABEL: @mul_nsw_reassoc_prop_neg( +; CHECK-NEXT: [[B:%.*]] = mul nsw i16 [[X:%.*]], -2201 +; CHECK-NEXT: ret i16 [[B]] +; + %a = mul nsw i16 %x, -71 + %b = mul nsw i16 %a, 31 + ret i16 %b +} + +; Must not propagate nsw. + +define i16 @mul_nsw_reassoc_prop_no_nsw1(i16 %x) { +; CHECK-LABEL: @mul_nsw_reassoc_prop_no_nsw1( +; CHECK-NEXT: [[B:%.*]] = mul i16 [[X:%.*]], 6 +; CHECK-NEXT: ret i16 [[B]] +; + %a = mul i16 %x, 3 + %b = mul nsw i16 %a, 2 + ret i16 %b +} + +; Must not propagate nsw. + +define i16 @mul_nsw_reassoc_prop_no_nsw2(i16 %x) { +; CHECK-LABEL: @mul_nsw_reassoc_prop_no_nsw2( +; CHECK-NEXT: [[B:%.*]] = mul i16 [[X:%.*]], 6 +; CHECK-NEXT: ret i16 [[B]] +; + %a = mul nsw i16 %x, 3 + %b = mul i16 %a, 2 + ret i16 %b +} + +; Must not propagate nsw. 
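+; (The folded constant itself wraps in i16: 1023 * 33 = 33759, which exceeds
+; the signed i16 maximum of 32767 and wraps to -31777, matching the constant in
+; the check below, so nsw cannot be carried over to the combined multiply.)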
+ +define i16 @mul_nsw_reassoc_prop_overflow(i16 %x) { +; CHECK-LABEL: @mul_nsw_reassoc_prop_overflow( +; CHECK-NEXT: [[B:%.*]] = mul i16 [[X:%.*]], -31777 +; CHECK-NEXT: ret i16 [[B]] +; + %a = mul nsw i16 %x, 1023 + %b = mul nsw i16 %a, 33 + ret i16 %b +} diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/sve-interleaved-accesses.ll b/llvm/test/Transforms/LoopVectorize/AArch64/sve-interleaved-accesses.ll index bf95622733461..05c0bc0761ea4 100644 --- a/llvm/test/Transforms/LoopVectorize/AArch64/sve-interleaved-accesses.ll +++ b/llvm/test/Transforms/LoopVectorize/AArch64/sve-interleaved-accesses.ll @@ -396,8 +396,8 @@ define void @test_reversed_load2_store2(ptr noalias nocapture readonly %A, ptr n ; CHECK-NEXT: [[WIDE_VEC:%.*]] = load , ptr [[TMP9]], align 4 ; CHECK-NEXT: [[STRIDED_VEC:%.*]] = call { , } @llvm.vector.deinterleave2.nxv8i32( [[WIDE_VEC]]) ; CHECK-NEXT: [[TMP10:%.*]] = extractvalue { , } [[STRIDED_VEC]], 0 -; CHECK-NEXT: [[REVERSE:%.*]] = call @llvm.vector.reverse.nxv4i32( [[TMP10]]) ; CHECK-NEXT: [[TMP11:%.*]] = extractvalue { , } [[STRIDED_VEC]], 1 +; CHECK-NEXT: [[REVERSE:%.*]] = call @llvm.vector.reverse.nxv4i32( [[TMP10]]) ; CHECK-NEXT: [[REVERSE1:%.*]] = call @llvm.vector.reverse.nxv4i32( [[TMP11]]) ; CHECK-NEXT: [[TMP12:%.*]] = add nsw [[REVERSE]], [[VEC_IND]] ; CHECK-NEXT: [[TMP13:%.*]] = sub nsw [[REVERSE1]], [[VEC_IND]] @@ -1548,5 +1548,263 @@ end: ret void } +; Check vectorization on an interleaved load/store groups of factor 4 + +; for (int i = 0; i < 1024; ++i) { +; dst[i].x = a[i].x + b[i].x; +; dst[i].y = a[i].y - b[i].y; +; dst[i].z = a[i].z << b[i].z; +; dst[i].t = a[i].t >> b[i].t; +; } +%struct.xyzt = type { i32, i32, i32, i32 } + +define void @interleave_deinterleave(ptr writeonly noalias %dst, ptr readonly %a, ptr readonly %b) { +; CHECK-LABEL: @interleave_deinterleave( +; CHECK-NEXT: entry: +; CHECK-NEXT: [[TMP0:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-NEXT: [[TMP1:%.*]] = shl i64 [[TMP0]], 2 +; CHECK-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ugt i64 [[TMP1]], 1024 +; CHECK-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] +; CHECK: vector.ph: +; CHECK-NEXT: [[TMP2:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-NEXT: [[TMP3:%.*]] = shl i64 [[TMP2]], 2 +; CHECK-NEXT: [[N_MOD_VF:%.*]] = urem i64 1024, [[TMP3]] +; CHECK-NEXT: [[N_VEC:%.*]] = sub nuw nsw i64 1024, [[N_MOD_VF]] +; CHECK-NEXT: [[TMP4:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-NEXT: [[TMP5:%.*]] = shl i64 [[TMP4]], 2 +; CHECK-NEXT: br label [[VECTOR_BODY:%.*]] +; CHECK: vector.body: +; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] +; CHECK-NEXT: [[TMP6:%.*]] = getelementptr inbounds [[STRUCT_XYZT:%.*]], ptr [[A:%.*]], i64 [[INDEX]] +; CHECK-NEXT: [[WIDE_VEC:%.*]] = load , ptr [[TMP6]], align 4 +; CHECK-NEXT: [[STRIDED_VEC:%.*]] = call { , } @llvm.vector.deinterleave2.nxv16i32( [[WIDE_VEC]]) +; CHECK-NEXT: [[TMP7:%.*]] = extractvalue { , } [[STRIDED_VEC]], 0 +; CHECK-NEXT: [[TMP8:%.*]] = extractvalue { , } [[STRIDED_VEC]], 1 +; CHECK-NEXT: [[STRIDED_VEC6:%.*]] = call { , } @llvm.vector.deinterleave2.nxv8i32( [[TMP7]]) +; CHECK-NEXT: [[STRIDED_VEC7:%.*]] = call { , } @llvm.vector.deinterleave2.nxv8i32( [[TMP8]]) +; CHECK-NEXT: [[TMP9:%.*]] = extractvalue { , } [[STRIDED_VEC6]], 0 +; CHECK-NEXT: [[TMP10:%.*]] = extractvalue { , } [[STRIDED_VEC7]], 0 +; CHECK-NEXT: [[TMP11:%.*]] = extractvalue { , } [[STRIDED_VEC6]], 1 +; CHECK-NEXT: [[TMP12:%.*]] = extractvalue { , } [[STRIDED_VEC7]], 1 +; CHECK-NEXT: 
[[TMP13:%.*]] = getelementptr inbounds [[STRUCT_XYZT]], ptr [[B:%.*]], i64 [[INDEX]] +; CHECK-NEXT: [[WIDE_VEC8:%.*]] = load , ptr [[TMP13]], align 4 +; CHECK-NEXT: [[STRIDED_VEC9:%.*]] = call { , } @llvm.vector.deinterleave2.nxv16i32( [[WIDE_VEC8]]) +; CHECK-NEXT: [[TMP14:%.*]] = extractvalue { , } [[STRIDED_VEC9]], 0 +; CHECK-NEXT: [[TMP15:%.*]] = extractvalue { , } [[STRIDED_VEC9]], 1 +; CHECK-NEXT: [[STRIDED_VEC10:%.*]] = call { , } @llvm.vector.deinterleave2.nxv8i32( [[TMP14]]) +; CHECK-NEXT: [[STRIDED_VEC11:%.*]] = call { , } @llvm.vector.deinterleave2.nxv8i32( [[TMP15]]) +; CHECK-NEXT: [[TMP16:%.*]] = extractvalue { , } [[STRIDED_VEC10]], 0 +; CHECK-NEXT: [[TMP17:%.*]] = extractvalue { , } [[STRIDED_VEC11]], 0 +; CHECK-NEXT: [[TMP18:%.*]] = extractvalue { , } [[STRIDED_VEC10]], 1 +; CHECK-NEXT: [[TMP19:%.*]] = extractvalue { , } [[STRIDED_VEC11]], 1 +; CHECK-NEXT: [[TMP20:%.*]] = add nsw [[TMP16]], [[TMP9]] +; CHECK-NEXT: [[TMP21:%.*]] = getelementptr inbounds [[STRUCT_XYZT]], ptr [[DST:%.*]], i64 [[INDEX]] +; CHECK-NEXT: [[TMP22:%.*]] = sub nsw [[TMP10]], [[TMP17]] +; CHECK-NEXT: [[TMP23:%.*]] = shl [[TMP11]], [[TMP18]] +; CHECK-NEXT: [[TMP24:%.*]] = ashr [[TMP12]], [[TMP19]] +; CHECK-NEXT: [[INTERLEAVED_VEC:%.*]] = call @llvm.vector.interleave2.nxv8i32( [[TMP20]], [[TMP23]]) +; CHECK-NEXT: [[INTERLEAVED_VEC12:%.*]] = call @llvm.vector.interleave2.nxv8i32( [[TMP22]], [[TMP24]]) +; CHECK-NEXT: [[INTERLEAVED_VEC13:%.*]] = call @llvm.vector.interleave2.nxv16i32( [[INTERLEAVED_VEC]], [[INTERLEAVED_VEC12]]) +; CHECK-NEXT: store [[INTERLEAVED_VEC13]], ptr [[TMP21]], align 4 +; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP5]] +; CHECK-NEXT: [[TMP25:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] +; CHECK-NEXT: br i1 [[TMP25]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP41:![0-9]+]] +; CHECK: middle.block: +; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[N_MOD_VF]], 0 +; CHECK-NEXT: br i1 [[CMP_N]], label [[FOR_COND_CLEANUP:%.*]], label [[SCALAR_PH]] +; CHECK: scalar.ph: +; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ] +; CHECK-NEXT: br label [[FOR_BODY:%.*]] +; CHECK: for.body: +; CHECK-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[INDVARS_IV_NEXT:%.*]], [[FOR_BODY]] ] +; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds nuw [[STRUCT_XYZT]], ptr [[A]], i64 [[INDVARS_IV]] +; CHECK-NEXT: [[TMP26:%.*]] = load i32, ptr [[ARRAYIDX]], align 4 +; CHECK-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds nuw [[STRUCT_XYZT]], ptr [[B]], i64 [[INDVARS_IV]] +; CHECK-NEXT: [[TMP27:%.*]] = load i32, ptr [[ARRAYIDX2]], align 4 +; CHECK-NEXT: [[ADD:%.*]] = add nsw i32 [[TMP27]], [[TMP26]] +; CHECK-NEXT: [[ARRAYIDX5:%.*]] = getelementptr inbounds nuw [[STRUCT_XYZT]], ptr [[DST]], i64 [[INDVARS_IV]] +; CHECK-NEXT: store i32 [[ADD]], ptr [[ARRAYIDX5]], align 4 +; CHECK-NEXT: [[Y:%.*]] = getelementptr inbounds nuw i8, ptr [[ARRAYIDX]], i64 4 +; CHECK-NEXT: [[TMP28:%.*]] = load i32, ptr [[Y]], align 4 +; CHECK-NEXT: [[Y11:%.*]] = getelementptr inbounds nuw i8, ptr [[ARRAYIDX2]], i64 4 +; CHECK-NEXT: [[TMP29:%.*]] = load i32, ptr [[Y11]], align 4 +; CHECK-NEXT: [[SUB:%.*]] = sub nsw i32 [[TMP28]], [[TMP29]] +; CHECK-NEXT: [[Y14:%.*]] = getelementptr inbounds nuw i8, ptr [[ARRAYIDX5]], i64 4 +; CHECK-NEXT: store i32 [[SUB]], ptr [[Y14]], align 4 +; CHECK-NEXT: [[Z:%.*]] = getelementptr inbounds nuw i8, ptr [[ARRAYIDX]], i64 8 +; CHECK-NEXT: [[TMP30:%.*]] = load i32, ptr [[Z]], align 4 +; 
CHECK-NEXT: [[Z19:%.*]] = getelementptr inbounds nuw i8, ptr [[ARRAYIDX2]], i64 8 +; CHECK-NEXT: [[TMP31:%.*]] = load i32, ptr [[Z19]], align 4 +; CHECK-NEXT: [[SHL:%.*]] = shl i32 [[TMP30]], [[TMP31]] +; CHECK-NEXT: [[Z22:%.*]] = getelementptr inbounds nuw i8, ptr [[ARRAYIDX5]], i64 8 +; CHECK-NEXT: store i32 [[SHL]], ptr [[Z22]], align 4 +; CHECK-NEXT: [[T:%.*]] = getelementptr inbounds nuw i8, ptr [[ARRAYIDX]], i64 12 +; CHECK-NEXT: [[TMP32:%.*]] = load i32, ptr [[T]], align 4 +; CHECK-NEXT: [[T27:%.*]] = getelementptr inbounds nuw i8, ptr [[ARRAYIDX2]], i64 12 +; CHECK-NEXT: [[TMP33:%.*]] = load i32, ptr [[T27]], align 4 +; CHECK-NEXT: [[SHR:%.*]] = ashr i32 [[TMP32]], [[TMP33]] +; CHECK-NEXT: [[T30:%.*]] = getelementptr inbounds nuw i8, ptr [[ARRAYIDX5]], i64 12 +; CHECK-NEXT: store i32 [[SHR]], ptr [[T30]], align 4 +; CHECK-NEXT: [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1 +; CHECK-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT]], 1024 +; CHECK-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_COND_CLEANUP]], label [[FOR_BODY]], !llvm.loop [[LOOP42:![0-9]+]] +; CHECK: for.end: +; CHECK-NEXT: ret void +; +entry: + br label %for.body + +for.body: + %indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %for.body ] + %arrayidx = getelementptr inbounds %struct.xyzt, ptr %a, i64 %indvars.iv + %0 = load i32, ptr %arrayidx, align 4 + %arrayidx2 = getelementptr inbounds %struct.xyzt, ptr %b, i64 %indvars.iv + %1 = load i32, ptr %arrayidx2, align 4 + %add = add nsw i32 %1, %0 + %arrayidx5 = getelementptr inbounds %struct.xyzt, ptr %dst, i64 %indvars.iv + store i32 %add, ptr %arrayidx5, align 4 + %y = getelementptr inbounds nuw i8, ptr %arrayidx, i64 4 + %2 = load i32, ptr %y, align 4 + %y11 = getelementptr inbounds nuw i8, ptr %arrayidx2, i64 4 + %3 = load i32, ptr %y11, align 4 + %sub = sub nsw i32 %2, %3 + %y14 = getelementptr inbounds nuw i8, ptr %arrayidx5, i64 4 + store i32 %sub, ptr %y14, align 4 + %z = getelementptr inbounds nuw i8, ptr %arrayidx, i64 8 + %4 = load i32, ptr %z, align 4 + %z19 = getelementptr inbounds nuw i8, ptr %arrayidx2, i64 8 + %5 = load i32, ptr %z19, align 4 + %shl = shl i32 %4, %5 + %z22 = getelementptr inbounds nuw i8, ptr %arrayidx5, i64 8 + store i32 %shl, ptr %z22, align 4 + %t = getelementptr inbounds nuw i8, ptr %arrayidx, i64 12 + %6 = load i32, ptr %t, align 4 + %t27 = getelementptr inbounds nuw i8, ptr %arrayidx2, i64 12 + %7 = load i32, ptr %t27, align 4 + %shr = ashr i32 %6, %7 + %t30 = getelementptr inbounds nuw i8, ptr %arrayidx5, i64 12 + store i32 %shr, ptr %t30, align 4 + %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1 + %exitcond.not = icmp eq i64 %indvars.iv.next, 1024 + br i1 %exitcond.not, label %for.end, label %for.body + +for.end: + ret void +} + +; Check vectorization on a reverse interleaved load/store groups of factor 4 + +; for (int i = 1023; i >= 0; i--) { +; int a = A[i].x + i; +; int b = A[i].y - i; +; int c = A[i].z * i; +; int d = A[i].t << i; +; B[i].x = a; +; B[i].y = b; +; B[i].z = c; +; B[i].t = d; +; } + +define void @interleave_deinterleave_reverse(ptr noalias nocapture readonly %A, ptr noalias nocapture %B) #1{ +; CHECK-LABEL: @interleave_deinterleave_reverse( +; CHECK-NEXT: entry: +; CHECK-NEXT: br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] +; CHECK: vector.ph: +; CHECK-NEXT: [[TMP0:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-NEXT: [[TMP1:%.*]] = shl nuw nsw i64 [[TMP0]], 2 +; CHECK-NEXT: [[TMP2:%.*]] = call @llvm.stepvector.nxv4i32() +; CHECK-NEXT: [[INDUCTION:%.*]] = sub splat (i32 
1023), [[TMP2]] +; CHECK-NEXT: [[TMP3:%.*]] = trunc nuw nsw i64 [[TMP1]] to i32 +; CHECK-NEXT: [[TMP4:%.*]] = sub nsw i32 0, [[TMP3]] +; CHECK-NEXT: [[DOTSPLATINSERT:%.*]] = insertelement poison, i32 [[TMP4]], i64 0 +; CHECK-NEXT: [[DOTSPLAT:%.*]] = shufflevector [[DOTSPLATINSERT]], poison, zeroinitializer +; CHECK-NEXT: br label [[VECTOR_BODY:%.*]] +; CHECK: vector.body: +; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] +; CHECK-NEXT: [[VEC_IND:%.*]] = phi [ [[INDUCTION]], [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[VECTOR_BODY]] ] +; CHECK-NEXT: [[OFFSET_IDX:%.*]] = sub i64 1023, [[INDEX]] +; CHECK-NEXT: [[TMP5:%.*]] = getelementptr inbounds [[STRUCT_XYZT:%.*]], ptr [[A:%.*]], i64 [[OFFSET_IDX]], i32 0 +; CHECK-NEXT: [[TMP6:%.*]] = call i32 @llvm.vscale.i32() +; CHECK-NEXT: [[TMP7:%.*]] = shl nuw nsw i32 [[TMP6]], 4 +; CHECK-NEXT: [[TMP8:%.*]] = sub nsw i32 4, [[TMP7]] +; CHECK-NEXT: [[TMP9:%.*]] = sext i32 [[TMP8]] to i64 +; CHECK-NEXT: [[TMP10:%.*]] = getelementptr inbounds i32, ptr [[TMP5]], i64 [[TMP9]] +; CHECK-NEXT: [[WIDE_VEC:%.*]] = load , ptr [[TMP10]], align 4 +; CHECK-NEXT: [[STRIDED_VEC:%.*]] = call { , } @llvm.vector.deinterleave2.nxv16i32( [[WIDE_VEC]]) +; CHECK-NEXT: [[TMP11:%.*]] = extractvalue { , } [[STRIDED_VEC]], 0 +; CHECK-NEXT: [[TMP12:%.*]] = extractvalue { , } [[STRIDED_VEC]], 1 +; CHECK-NEXT: [[STRIDED_VEC1:%.*]] = call { , } @llvm.vector.deinterleave2.nxv8i32( [[TMP11]]) +; CHECK-NEXT: [[STRIDED_VEC2:%.*]] = call { , } @llvm.vector.deinterleave2.nxv8i32( [[TMP12]]) +; CHECK-NEXT: [[TMP13:%.*]] = extractvalue { , } [[STRIDED_VEC1]], 0 +; CHECK-NEXT: [[TMP14:%.*]] = extractvalue { , } [[STRIDED_VEC2]], 0 +; CHECK-NEXT: [[TMP15:%.*]] = extractvalue { , } [[STRIDED_VEC1]], 1 +; CHECK-NEXT: [[TMP16:%.*]] = extractvalue { , } [[STRIDED_VEC2]], 1 +; CHECK-NEXT: [[REVERSE:%.*]] = call @llvm.vector.reverse.nxv4i32( [[TMP13]]) +; CHECK-NEXT: [[REVERSE3:%.*]] = call @llvm.vector.reverse.nxv4i32( [[TMP14]]) +; CHECK-NEXT: [[REVERSE4:%.*]] = call @llvm.vector.reverse.nxv4i32( [[TMP15]]) +; CHECK-NEXT: [[REVERSE5:%.*]] = call @llvm.vector.reverse.nxv4i32( [[TMP16]]) +; CHECK-NEXT: [[TMP17:%.*]] = add nsw [[REVERSE]], [[VEC_IND]] +; CHECK-NEXT: [[TMP18:%.*]] = sub nsw [[REVERSE3]], [[VEC_IND]] +; CHECK-NEXT: [[TMP19:%.*]] = mul nsw [[REVERSE4]], [[VEC_IND]] +; CHECK-NEXT: [[TMP20:%.*]] = shl nuw nsw [[REVERSE5]], [[VEC_IND]] +; CHECK-NEXT: [[TMP21:%.*]] = getelementptr inbounds [[STRUCT_XYZT]], ptr [[B:%.*]], i64 [[OFFSET_IDX]], i32 0 +; CHECK-NEXT: [[TMP22:%.*]] = call i32 @llvm.vscale.i32() +; CHECK-NEXT: [[TMP23:%.*]] = shl nuw nsw i32 [[TMP22]], 4 +; CHECK-NEXT: [[TMP24:%.*]] = sub nsw i32 4, [[TMP23]] +; CHECK-NEXT: [[TMP25:%.*]] = sext i32 [[TMP24]] to i64 +; CHECK-NEXT: [[TMP26:%.*]] = getelementptr inbounds i32, ptr [[TMP21]], i64 [[TMP25]] +; CHECK-NEXT: [[REVERSE6:%.*]] = call @llvm.vector.reverse.nxv4i32( [[TMP17]]) +; CHECK-NEXT: [[REVERSE7:%.*]] = call @llvm.vector.reverse.nxv4i32( [[TMP18]]) +; CHECK-NEXT: [[REVERSE8:%.*]] = call @llvm.vector.reverse.nxv4i32( [[TMP19]]) +; CHECK-NEXT: [[REVERSE9:%.*]] = call @llvm.vector.reverse.nxv4i32( [[TMP20]]) +; CHECK-NEXT: [[INTERLEAVED_VEC:%.*]] = call @llvm.vector.interleave2.nxv8i32( [[REVERSE6]], [[REVERSE8]]) +; CHECK-NEXT: [[INTERLEAVED_VEC10:%.*]] = call @llvm.vector.interleave2.nxv8i32( [[REVERSE7]], [[REVERSE9]]) +; CHECK-NEXT: [[INTERLEAVED_VEC11:%.*]] = call @llvm.vector.interleave2.nxv16i32( [[INTERLEAVED_VEC]], [[INTERLEAVED_VEC10]]) +; CHECK-NEXT: store 
[[INTERLEAVED_VEC11]], ptr [[TMP26]], align 4 +; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP1]] +; CHECK-NEXT: [[VEC_IND_NEXT]] = add [[VEC_IND]], [[DOTSPLAT]] +; CHECK-NEXT: [[TMP27:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1024 +; CHECK-NEXT: br i1 [[TMP27]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP43:![0-9]+]] +; CHECK: middle.block: +; CHECK-NEXT: br i1 true, label [[FOR_COND_CLEANUP:%.*]], label [[SCALAR_PH]] +; CHECK: scalar.ph: +; CHECK-NEXT: br label [[FOR_BODY:%.*]] +; CHECK: for.cond.cleanup: +; CHECK-NEXT: ret void +; CHECK: for.body: +; CHECK-NEXT: br i1 poison, label [[FOR_BODY]], label [[FOR_COND_CLEANUP]], !llvm.loop [[LOOP44:![0-9]+]] +; +entry: + br label %for.body +for.cond.cleanup: ; preds = %for.body + ret void +for.body: ; preds = %for.body, %entry + %indvars.iv = phi i64 [ 1023, %entry ], [ %indvars.iv.next, %for.body ] + %x = getelementptr inbounds %struct.xyzt, ptr %A, i64 %indvars.iv, i32 0 + %load1 = load i32, ptr %x, align 4 + %trunc = trunc i64 %indvars.iv to i32 + %add = add nsw i32 %load1, %trunc + %y = getelementptr inbounds %struct.xyzt, ptr %A, i64 %indvars.iv, i32 1 + %load2 = load i32, ptr %y, align 4 + %sub = sub nsw i32 %load2, %trunc + %z = getelementptr inbounds %struct.xyzt, ptr %A, i64 %indvars.iv, i32 2 + %load3 = load i32, ptr %z, align 4 + %mul = mul nsw i32 %load3, %trunc + %t = getelementptr inbounds %struct.xyzt, ptr %A, i64 %indvars.iv, i32 3 + %load4 = load i32, ptr %t, align 4 + %shl = shl nuw nsw i32 %load4, %trunc + %x5 = getelementptr inbounds %struct.xyzt, ptr %B, i64 %indvars.iv, i32 0 + store i32 %add, ptr %x5, align 4 + %y8 = getelementptr inbounds %struct.xyzt, ptr %B, i64 %indvars.iv, i32 1 + store i32 %sub, ptr %y8, align 4 + %z5 = getelementptr inbounds %struct.xyzt, ptr %B, i64 %indvars.iv, i32 2 + store i32 %mul, ptr %z5, align 4 + %t8 = getelementptr inbounds %struct.xyzt, ptr %B, i64 %indvars.iv, i32 3 + store i32 %shl, ptr %t8, align 4 + %indvars.iv.next = add nsw i64 %indvars.iv, -1 + %cmp = icmp sgt i64 %indvars.iv, 0 + br i1 %cmp, label %for.body, label %for.cond.cleanup + +} attributes #1 = { "target-features"="+sve" vscale_range(1, 16) } attributes #0 = { "unsafe-fp-math"="true" "target-features"="+sve" vscale_range(1, 16) } diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/sve-interleaved-masked-accesses.ll b/llvm/test/Transforms/LoopVectorize/AArch64/sve-interleaved-masked-accesses.ll index 1a281fe7c6f7f..d4392bebdf37b 100644 --- a/llvm/test/Transforms/LoopVectorize/AArch64/sve-interleaved-masked-accesses.ll +++ b/llvm/test/Transforms/LoopVectorize/AArch64/sve-interleaved-masked-accesses.ll @@ -529,3 +529,255 @@ for.inc: for.end: ret void } + +; Expected to contain interleave2/deinterleave2 instructions +; +; void masked_strided_factor4(const unsigned char* restrict p, +; unsigned char* restrict q, +; unsigned char guard) { +; for(ix=0; ix < 1024; ++ix) { +; if (ix > guard) { +; char left1 = p[4*ix]; +; char right1 = p[4*ix + 1]; +; char left2 = p[4*ix + 2]; +; char right2 = p[4*ix + 3]; +; char max1 = max(left1, right1); +; char max2 = max(left2, right2); +; q[4*ix] = max1; +; q[4*ix + 1] = 0 - max1; +; q[4*ix + 2] = max2; +; q[4*ix + 3] = 0 - max2; +; } +; } +;} +define dso_local void @masked_strided_factor4(ptr noalias nocapture readonly %p, ptr noalias nocapture %q, i8 zeroext %guard) local_unnamed_addr #0 { +; SCALAR_TAIL_FOLDING-LABEL: define dso_local void @masked_strided_factor4 +; SCALAR_TAIL_FOLDING-SAME: (ptr noalias nocapture readonly [[P:%.*]], ptr noalias 
nocapture [[Q:%.*]], i8 zeroext [[GUARD:%.*]]) local_unnamed_addr #[[ATTR0]] { +; SCALAR_TAIL_FOLDING-NEXT: entry: +; SCALAR_TAIL_FOLDING-NEXT: [[CONV:%.*]] = zext i8 [[GUARD]] to i32 +; SCALAR_TAIL_FOLDING-NEXT: [[TMP0:%.*]] = call i32 @llvm.vscale.i32() +; SCALAR_TAIL_FOLDING-NEXT: [[TMP1:%.*]] = shl i32 [[TMP0]], 4 +; SCALAR_TAIL_FOLDING-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ugt i32 [[TMP1]], 1024 +; SCALAR_TAIL_FOLDING-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] +; SCALAR_TAIL_FOLDING: vector.ph: +; SCALAR_TAIL_FOLDING-NEXT: [[TMP2:%.*]] = call i32 @llvm.vscale.i32() +; SCALAR_TAIL_FOLDING-NEXT: [[TMP3:%.*]] = shl i32 [[TMP2]], 4 +; SCALAR_TAIL_FOLDING-NEXT: [[N_MOD_VF:%.*]] = urem i32 1024, [[TMP3]] +; SCALAR_TAIL_FOLDING-NEXT: [[N_VEC:%.*]] = sub nuw nsw i32 1024, [[N_MOD_VF]] +; SCALAR_TAIL_FOLDING-NEXT: [[TMP4:%.*]] = call i32 @llvm.vscale.i32() +; SCALAR_TAIL_FOLDING-NEXT: [[TMP5:%.*]] = shl i32 [[TMP4]], 4 +; SCALAR_TAIL_FOLDING-NEXT: [[TMP6:%.*]] = call @llvm.stepvector.nxv16i32() +; SCALAR_TAIL_FOLDING-NEXT: [[DOTSPLATINSERT:%.*]] = insertelement poison, i32 [[TMP5]], i64 0 +; SCALAR_TAIL_FOLDING-NEXT: [[DOTSPLAT:%.*]] = shufflevector [[DOTSPLATINSERT]], poison, zeroinitializer +; SCALAR_TAIL_FOLDING-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement poison, i32 [[CONV]], i64 0 +; SCALAR_TAIL_FOLDING-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector [[BROADCAST_SPLATINSERT]], poison, zeroinitializer +; SCALAR_TAIL_FOLDING-NEXT: br label [[VECTOR_BODY:%.*]] +; SCALAR_TAIL_FOLDING: vector.body: +; SCALAR_TAIL_FOLDING-NEXT: [[INDEX:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] +; SCALAR_TAIL_FOLDING-NEXT: [[VEC_IND:%.*]] = phi [ [[TMP6]], [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[VECTOR_BODY]] ] +; SCALAR_TAIL_FOLDING-NEXT: [[TMP7:%.*]] = icmp ugt [[VEC_IND]], [[BROADCAST_SPLAT]] +; SCALAR_TAIL_FOLDING-NEXT: [[TMP8:%.*]] = shl i32 [[INDEX]], 2 +; SCALAR_TAIL_FOLDING-NEXT: [[TMP9:%.*]] = sext i32 [[TMP8]] to i64 +; SCALAR_TAIL_FOLDING-NEXT: [[TMP10:%.*]] = getelementptr i8, ptr [[P]], i64 [[TMP9]] +; SCALAR_TAIL_FOLDING-NEXT: [[INTERLEAVED_MASK:%.*]] = call @llvm.vector.interleave2.nxv32i1( [[TMP7]], [[TMP7]]) +; SCALAR_TAIL_FOLDING-NEXT: [[INTERLEAVED_MASK1:%.*]] = call @llvm.vector.interleave2.nxv32i1( [[TMP7]], [[TMP7]]) +; SCALAR_TAIL_FOLDING-NEXT: [[INTERLEAVED_MASK2:%.*]] = call @llvm.vector.interleave2.nxv64i1( [[INTERLEAVED_MASK]], [[INTERLEAVED_MASK1]]) +; SCALAR_TAIL_FOLDING-NEXT: [[WIDE_MASKED_VEC:%.*]] = call @llvm.masked.load.nxv64i8.p0(ptr [[TMP10]], i32 1, [[INTERLEAVED_MASK2]], poison) +; SCALAR_TAIL_FOLDING-NEXT: [[STRIDED_VEC:%.*]] = call { , } @llvm.vector.deinterleave2.nxv64i8( [[WIDE_MASKED_VEC]]) +; SCALAR_TAIL_FOLDING-NEXT: [[TMP11:%.*]] = extractvalue { , } [[STRIDED_VEC]], 0 +; SCALAR_TAIL_FOLDING-NEXT: [[TMP12:%.*]] = extractvalue { , } [[STRIDED_VEC]], 1 +; SCALAR_TAIL_FOLDING-NEXT: [[STRIDED_VEC3:%.*]] = call { , } @llvm.vector.deinterleave2.nxv32i8( [[TMP11]]) +; SCALAR_TAIL_FOLDING-NEXT: [[STRIDED_VEC4:%.*]] = call { , } @llvm.vector.deinterleave2.nxv32i8( [[TMP12]]) +; SCALAR_TAIL_FOLDING-NEXT: [[TMP13:%.*]] = extractvalue { , } [[STRIDED_VEC3]], 0 +; SCALAR_TAIL_FOLDING-NEXT: [[TMP14:%.*]] = extractvalue { , } [[STRIDED_VEC4]], 0 +; SCALAR_TAIL_FOLDING-NEXT: [[TMP15:%.*]] = extractvalue { , } [[STRIDED_VEC3]], 1 +; SCALAR_TAIL_FOLDING-NEXT: [[TMP16:%.*]] = extractvalue { , } [[STRIDED_VEC4]], 1 +; SCALAR_TAIL_FOLDING-NEXT: [[TMP17:%.*]] = call @llvm.smax.nxv16i8( [[TMP13]], [[TMP14]]) 
+; SCALAR_TAIL_FOLDING-NEXT: [[TMP18:%.*]] = sub zeroinitializer, [[TMP17]] +; SCALAR_TAIL_FOLDING-NEXT: [[TMP19:%.*]] = call @llvm.smax.nxv16i8( [[TMP15]], [[TMP16]]) +; SCALAR_TAIL_FOLDING-NEXT: [[TMP20:%.*]] = sub zeroinitializer, [[TMP19]] +; SCALAR_TAIL_FOLDING-NEXT: [[TMP21:%.*]] = sext i32 [[TMP8]] to i64 +; SCALAR_TAIL_FOLDING-NEXT: [[TMP22:%.*]] = getelementptr i8, ptr [[Q]], i64 [[TMP21]] +; SCALAR_TAIL_FOLDING-NEXT: [[INTERLEAVED_VEC:%.*]] = call @llvm.vector.interleave2.nxv32i8( [[TMP17]], [[TMP19]]) +; SCALAR_TAIL_FOLDING-NEXT: [[INTERLEAVED_VEC5:%.*]] = call @llvm.vector.interleave2.nxv32i8( [[TMP18]], [[TMP20]]) +; SCALAR_TAIL_FOLDING-NEXT: [[INTERLEAVED_VEC6:%.*]] = call @llvm.vector.interleave2.nxv64i8( [[INTERLEAVED_VEC]], [[INTERLEAVED_VEC5]]) +; SCALAR_TAIL_FOLDING-NEXT: [[INTERLEAVED_MASK7:%.*]] = call @llvm.vector.interleave2.nxv32i1( [[TMP7]], [[TMP7]]) +; SCALAR_TAIL_FOLDING-NEXT: [[INTERLEAVED_MASK8:%.*]] = call @llvm.vector.interleave2.nxv32i1( [[TMP7]], [[TMP7]]) +; SCALAR_TAIL_FOLDING-NEXT: [[INTERLEAVED_MASK9:%.*]] = call @llvm.vector.interleave2.nxv64i1( [[INTERLEAVED_MASK7]], [[INTERLEAVED_MASK8]]) +; SCALAR_TAIL_FOLDING-NEXT: call void @llvm.masked.store.nxv64i8.p0( [[INTERLEAVED_VEC6]], ptr [[TMP22]], i32 1, [[INTERLEAVED_MASK9]]) +; SCALAR_TAIL_FOLDING-NEXT: [[INDEX_NEXT]] = add nuw i32 [[INDEX]], [[TMP5]] +; SCALAR_TAIL_FOLDING-NEXT: [[VEC_IND_NEXT]] = add [[VEC_IND]], [[DOTSPLAT]] +; SCALAR_TAIL_FOLDING-NEXT: [[TMP23:%.*]] = icmp eq i32 [[INDEX_NEXT]], [[N_VEC]] +; SCALAR_TAIL_FOLDING-NEXT: br i1 [[TMP23]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP8:![0-9]+]] +; SCALAR_TAIL_FOLDING: middle.block: +; SCALAR_TAIL_FOLDING-NEXT: [[CMP_N:%.*]] = icmp eq i32 [[N_MOD_VF]], 0 +; SCALAR_TAIL_FOLDING-NEXT: br i1 [[CMP_N]], label [[FOR_END:%.*]], label [[SCALAR_PH]] +; SCALAR_TAIL_FOLDING: scalar.ph: +; SCALAR_TAIL_FOLDING-NEXT: [[BC_RESUME_VAL:%.*]] = phi i32 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ] +; SCALAR_TAIL_FOLDING-NEXT: br label [[FOR_BODY:%.*]] +; SCALAR_TAIL_FOLDING: for.body: +; SCALAR_TAIL_FOLDING-NEXT: [[IX_024:%.*]] = phi i32 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[INC:%.*]], [[FOR_INC:%.*]] ] +; SCALAR_TAIL_FOLDING-NEXT: [[CMP1:%.*]] = icmp samesign ugt i32 [[IX_024]], [[CONV]] +; SCALAR_TAIL_FOLDING-NEXT: br i1 [[CMP1]], label [[IF_THEN:%.*]], label [[FOR_INC]] +; SCALAR_TAIL_FOLDING: if.then: +; SCALAR_TAIL_FOLDING-NEXT: [[IDX0:%.*]] = shl nuw nsw i32 [[IX_024]], 2 +; SCALAR_TAIL_FOLDING-NEXT: [[IDX1:%.*]] = or disjoint i32 [[IDX0]], 1 +; SCALAR_TAIL_FOLDING-NEXT: [[IDX2:%.*]] = or disjoint i32 [[IDX0]], 2 +; SCALAR_TAIL_FOLDING-NEXT: [[IDX3:%.*]] = or disjoint i32 [[IDX0]], 3 +; SCALAR_TAIL_FOLDING-NEXT: [[TMP24:%.*]] = zext nneg i32 [[IDX0]] to i64 +; SCALAR_TAIL_FOLDING-NEXT: [[ARRAY1IDX0:%.*]] = getelementptr inbounds nuw i8, ptr [[P]], i64 [[TMP24]] +; SCALAR_TAIL_FOLDING-NEXT: [[TMP25:%.*]] = load i8, ptr [[ARRAY1IDX0]], align 1 +; SCALAR_TAIL_FOLDING-NEXT: [[TMP26:%.*]] = zext nneg i32 [[IDX1]] to i64 +; SCALAR_TAIL_FOLDING-NEXT: [[ARRAY1IDX1:%.*]] = getelementptr inbounds nuw i8, ptr [[P]], i64 [[TMP26]] +; SCALAR_TAIL_FOLDING-NEXT: [[TMP27:%.*]] = load i8, ptr [[ARRAY1IDX1]], align 1 +; SCALAR_TAIL_FOLDING-NEXT: [[TMP28:%.*]] = zext nneg i32 [[IDX2]] to i64 +; SCALAR_TAIL_FOLDING-NEXT: [[ARRAY1IDX2:%.*]] = getelementptr inbounds nuw i8, ptr [[P]], i64 [[TMP28]] +; SCALAR_TAIL_FOLDING-NEXT: [[TMP29:%.*]] = load i8, ptr [[ARRAY1IDX2]], align 1 +; SCALAR_TAIL_FOLDING-NEXT: [[TMP30:%.*]] = zext nneg 
i32 [[IDX3]] to i64 +; SCALAR_TAIL_FOLDING-NEXT: [[ARRAY1IDX3:%.*]] = getelementptr inbounds nuw i8, ptr [[P]], i64 [[TMP30]] +; SCALAR_TAIL_FOLDING-NEXT: [[TMP31:%.*]] = load i8, ptr [[ARRAY1IDX3]], align 1 +; SCALAR_TAIL_FOLDING-NEXT: [[SPEC_SELECT_I1:%.*]] = call i8 @llvm.smax.i8(i8 [[TMP25]], i8 [[TMP27]]) +; SCALAR_TAIL_FOLDING-NEXT: [[SUB1:%.*]] = sub i8 0, [[SPEC_SELECT_I1]] +; SCALAR_TAIL_FOLDING-NEXT: [[SPEC_SELECT_I2:%.*]] = call i8 @llvm.smax.i8(i8 [[TMP29]], i8 [[TMP31]]) +; SCALAR_TAIL_FOLDING-NEXT: [[SUB2:%.*]] = sub i8 0, [[SPEC_SELECT_I2]] +; SCALAR_TAIL_FOLDING-NEXT: [[TMP32:%.*]] = zext nneg i32 [[IDX0]] to i64 +; SCALAR_TAIL_FOLDING-NEXT: [[ARRAY3IDX0:%.*]] = getelementptr inbounds nuw i8, ptr [[Q]], i64 [[TMP32]] +; SCALAR_TAIL_FOLDING-NEXT: store i8 [[SPEC_SELECT_I1]], ptr [[ARRAY3IDX0]], align 1 +; SCALAR_TAIL_FOLDING-NEXT: [[TMP33:%.*]] = zext nneg i32 [[IDX1]] to i64 +; SCALAR_TAIL_FOLDING-NEXT: [[ARRAY3IDX1:%.*]] = getelementptr inbounds nuw i8, ptr [[Q]], i64 [[TMP33]] +; SCALAR_TAIL_FOLDING-NEXT: store i8 [[SUB1]], ptr [[ARRAY3IDX1]], align 1 +; SCALAR_TAIL_FOLDING-NEXT: [[TMP34:%.*]] = zext nneg i32 [[IDX2]] to i64 +; SCALAR_TAIL_FOLDING-NEXT: [[ARRAY3IDX2:%.*]] = getelementptr inbounds nuw i8, ptr [[Q]], i64 [[TMP34]] +; SCALAR_TAIL_FOLDING-NEXT: store i8 [[SPEC_SELECT_I2]], ptr [[ARRAY3IDX2]], align 1 +; SCALAR_TAIL_FOLDING-NEXT: [[TMP35:%.*]] = zext nneg i32 [[IDX3]] to i64 +; SCALAR_TAIL_FOLDING-NEXT: [[ARRAY3IDX3:%.*]] = getelementptr inbounds nuw i8, ptr [[Q]], i64 [[TMP35]] +; SCALAR_TAIL_FOLDING-NEXT: store i8 [[SUB2]], ptr [[ARRAY3IDX3]], align 1 +; SCALAR_TAIL_FOLDING-NEXT: br label [[FOR_INC]] +; SCALAR_TAIL_FOLDING: for.inc: +; SCALAR_TAIL_FOLDING-NEXT: [[INC]] = add nuw nsw i32 [[IX_024]], 1 +; SCALAR_TAIL_FOLDING-NEXT: [[EXITCOND:%.*]] = icmp eq i32 [[INC]], 1024 +; SCALAR_TAIL_FOLDING-NEXT: br i1 [[EXITCOND]], label [[FOR_END]], label [[FOR_BODY]], !llvm.loop [[LOOP9:![0-9]+]] +; SCALAR_TAIL_FOLDING: for.end: +; SCALAR_TAIL_FOLDING-NEXT: ret void +; +; PREDICATED_TAIL_FOLDING-LABEL: define dso_local void @masked_strided_factor4 +; PREDICATED_TAIL_FOLDING-SAME: (ptr noalias nocapture readonly [[P:%.*]], ptr noalias nocapture [[Q:%.*]], i8 zeroext [[GUARD:%.*]]) local_unnamed_addr #[[ATTR0]] { +; PREDICATED_TAIL_FOLDING-NEXT: entry: +; PREDICATED_TAIL_FOLDING-NEXT: br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] +; PREDICATED_TAIL_FOLDING: vector.ph: +; PREDICATED_TAIL_FOLDING-NEXT: [[CONV:%.*]] = zext i8 [[GUARD]] to i32 +; PREDICATED_TAIL_FOLDING-NEXT: [[TMP0:%.*]] = call i32 @llvm.vscale.i32() +; PREDICATED_TAIL_FOLDING-NEXT: [[TMP1:%.*]] = shl i32 [[TMP0]], 4 +; PREDICATED_TAIL_FOLDING-NEXT: [[TMP2:%.*]] = call i32 @llvm.vscale.i32() +; PREDICATED_TAIL_FOLDING-NEXT: [[TMP3:%.*]] = shl i32 [[TMP2]], 4 +; PREDICATED_TAIL_FOLDING-NEXT: [[TMP4:%.*]] = call i32 @llvm.usub.sat.i32(i32 1024, i32 [[TMP3]]) +; PREDICATED_TAIL_FOLDING-NEXT: [[ACTIVE_LANE_MASK_ENTRY:%.*]] = call @llvm.get.active.lane.mask.nxv16i1.i32(i32 0, i32 1024) +; PREDICATED_TAIL_FOLDING-NEXT: [[TMP5:%.*]] = call @llvm.stepvector.nxv16i32() +; PREDICATED_TAIL_FOLDING-NEXT: [[DOTSPLATINSERT:%.*]] = insertelement poison, i32 [[TMP1]], i64 0 +; PREDICATED_TAIL_FOLDING-NEXT: [[DOTSPLAT:%.*]] = shufflevector [[DOTSPLATINSERT]], poison, zeroinitializer +; PREDICATED_TAIL_FOLDING-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement poison, i32 [[CONV]], i64 0 +; PREDICATED_TAIL_FOLDING-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector [[BROADCAST_SPLATINSERT]], poison, 
zeroinitializer +; PREDICATED_TAIL_FOLDING-NEXT: br label [[VECTOR_BODY:%.*]] +; PREDICATED_TAIL_FOLDING: vector.body: +; PREDICATED_TAIL_FOLDING-NEXT: [[INDEX:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] +; PREDICATED_TAIL_FOLDING-NEXT: [[ACTIVE_LANE_MASK:%.*]] = phi [ [[ACTIVE_LANE_MASK_ENTRY]], [[VECTOR_PH]] ], [ [[ACTIVE_LANE_MASK_NEXT:%.*]], [[VECTOR_BODY]] ] +; PREDICATED_TAIL_FOLDING-NEXT: [[VEC_IND:%.*]] = phi [ [[TMP5]], [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[VECTOR_BODY]] ] +; PREDICATED_TAIL_FOLDING-NEXT: [[TMP6:%.*]] = icmp ugt [[VEC_IND]], [[BROADCAST_SPLAT]] +; PREDICATED_TAIL_FOLDING-NEXT: [[TMP7:%.*]] = select [[ACTIVE_LANE_MASK]], [[TMP6]], zeroinitializer +; PREDICATED_TAIL_FOLDING-NEXT: [[TMP8:%.*]] = shl i32 [[INDEX]], 2 +; PREDICATED_TAIL_FOLDING-NEXT: [[TMP9:%.*]] = sext i32 [[TMP8]] to i64 +; PREDICATED_TAIL_FOLDING-NEXT: [[TMP10:%.*]] = getelementptr i8, ptr [[P]], i64 [[TMP9]] +; PREDICATED_TAIL_FOLDING-NEXT: [[INTERLEAVED_MASK:%.*]] = call @llvm.vector.interleave2.nxv32i1( [[TMP7]], [[TMP7]]) +; PREDICATED_TAIL_FOLDING-NEXT: [[INTERLEAVED_MASK1:%.*]] = call @llvm.vector.interleave2.nxv32i1( [[TMP7]], [[TMP7]]) +; PREDICATED_TAIL_FOLDING-NEXT: [[INTERLEAVED_MASK2:%.*]] = call @llvm.vector.interleave2.nxv64i1( [[INTERLEAVED_MASK]], [[INTERLEAVED_MASK1]]) +; PREDICATED_TAIL_FOLDING-NEXT: [[WIDE_MASKED_VEC:%.*]] = call @llvm.masked.load.nxv64i8.p0(ptr [[TMP10]], i32 1, [[INTERLEAVED_MASK2]], poison) +; PREDICATED_TAIL_FOLDING-NEXT: [[STRIDED_VEC:%.*]] = call { , } @llvm.vector.deinterleave2.nxv64i8( [[WIDE_MASKED_VEC]]) +; PREDICATED_TAIL_FOLDING-NEXT: [[TMP11:%.*]] = extractvalue { , } [[STRIDED_VEC]], 0 +; PREDICATED_TAIL_FOLDING-NEXT: [[TMP12:%.*]] = extractvalue { , } [[STRIDED_VEC]], 1 +; PREDICATED_TAIL_FOLDING-NEXT: [[STRIDED_VEC3:%.*]] = call { , } @llvm.vector.deinterleave2.nxv32i8( [[TMP11]]) +; PREDICATED_TAIL_FOLDING-NEXT: [[STRIDED_VEC4:%.*]] = call { , } @llvm.vector.deinterleave2.nxv32i8( [[TMP12]]) +; PREDICATED_TAIL_FOLDING-NEXT: [[TMP13:%.*]] = extractvalue { , } [[STRIDED_VEC3]], 0 +; PREDICATED_TAIL_FOLDING-NEXT: [[TMP14:%.*]] = extractvalue { , } [[STRIDED_VEC4]], 0 +; PREDICATED_TAIL_FOLDING-NEXT: [[TMP15:%.*]] = extractvalue { , } [[STRIDED_VEC3]], 1 +; PREDICATED_TAIL_FOLDING-NEXT: [[TMP16:%.*]] = extractvalue { , } [[STRIDED_VEC4]], 1 +; PREDICATED_TAIL_FOLDING-NEXT: [[TMP17:%.*]] = call @llvm.smax.nxv16i8( [[TMP13]], [[TMP14]]) +; PREDICATED_TAIL_FOLDING-NEXT: [[TMP18:%.*]] = sub zeroinitializer, [[TMP17]] +; PREDICATED_TAIL_FOLDING-NEXT: [[TMP19:%.*]] = call @llvm.smax.nxv16i8( [[TMP15]], [[TMP16]]) +; PREDICATED_TAIL_FOLDING-NEXT: [[TMP20:%.*]] = sub zeroinitializer, [[TMP19]] +; PREDICATED_TAIL_FOLDING-NEXT: [[TMP21:%.*]] = sext i32 [[TMP8]] to i64 +; PREDICATED_TAIL_FOLDING-NEXT: [[TMP22:%.*]] = getelementptr i8, ptr [[Q]], i64 [[TMP21]] +; PREDICATED_TAIL_FOLDING-NEXT: [[INTERLEAVED_VEC:%.*]] = call @llvm.vector.interleave2.nxv32i8( [[TMP17]], [[TMP19]]) +; PREDICATED_TAIL_FOLDING-NEXT: [[INTERLEAVED_VEC5:%.*]] = call @llvm.vector.interleave2.nxv32i8( [[TMP18]], [[TMP20]]) +; PREDICATED_TAIL_FOLDING-NEXT: [[INTERLEAVED_VEC6:%.*]] = call @llvm.vector.interleave2.nxv64i8( [[INTERLEAVED_VEC]], [[INTERLEAVED_VEC5]]) +; PREDICATED_TAIL_FOLDING-NEXT: [[INTERLEAVED_MASK7:%.*]] = call @llvm.vector.interleave2.nxv32i1( [[TMP7]], [[TMP7]]) +; PREDICATED_TAIL_FOLDING-NEXT: [[INTERLEAVED_MASK8:%.*]] = call @llvm.vector.interleave2.nxv32i1( [[TMP7]], [[TMP7]]) +; PREDICATED_TAIL_FOLDING-NEXT: 
[[INTERLEAVED_MASK9:%.*]] = call @llvm.vector.interleave2.nxv64i1( [[INTERLEAVED_MASK7]], [[INTERLEAVED_MASK8]]) +; PREDICATED_TAIL_FOLDING-NEXT: call void @llvm.masked.store.nxv64i8.p0( [[INTERLEAVED_VEC6]], ptr [[TMP22]], i32 1, [[INTERLEAVED_MASK9]]) +; PREDICATED_TAIL_FOLDING-NEXT: [[INDEX_NEXT]] = add i32 [[INDEX]], [[TMP1]] +; PREDICATED_TAIL_FOLDING-NEXT: [[ACTIVE_LANE_MASK_NEXT]] = call @llvm.get.active.lane.mask.nxv16i1.i32(i32 [[INDEX]], i32 [[TMP4]]) +; PREDICATED_TAIL_FOLDING-NEXT: [[VEC_IND_NEXT]] = add [[VEC_IND]], [[DOTSPLAT]] +; PREDICATED_TAIL_FOLDING-NEXT: [[TMP23:%.*]] = extractelement [[ACTIVE_LANE_MASK_NEXT]], i64 0 +; PREDICATED_TAIL_FOLDING-NEXT: br i1 [[TMP23]], label [[VECTOR_BODY]], label [[MIDDLE_BLOCK:%.*]], !llvm.loop [[LOOP8:![0-9]+]] +; PREDICATED_TAIL_FOLDING: middle.block: +; PREDICATED_TAIL_FOLDING-NEXT: br i1 true, label [[FOR_END:%.*]], label [[SCALAR_PH]] +; PREDICATED_TAIL_FOLDING: scalar.ph: +; PREDICATED_TAIL_FOLDING-NEXT: br label [[FOR_BODY:%.*]] +; PREDICATED_TAIL_FOLDING: for.body: +; PREDICATED_TAIL_FOLDING-NEXT: br i1 poison, label [[IF_THEN:%.*]], label [[FOR_INC:%.*]] +; PREDICATED_TAIL_FOLDING: if.then: +; PREDICATED_TAIL_FOLDING-NEXT: br label [[FOR_INC]] +; PREDICATED_TAIL_FOLDING: for.inc: +; PREDICATED_TAIL_FOLDING-NEXT: br i1 poison, label [[FOR_END]], label [[FOR_BODY]], !llvm.loop [[LOOP9:![0-9]+]] +; PREDICATED_TAIL_FOLDING: for.end: +; PREDICATED_TAIL_FOLDING-NEXT: ret void +; +entry: + %conv = zext i8 %guard to i32 + br label %for.body + +for.body: + %ix.024 = phi i32 [ 0, %entry ], [ %inc, %for.inc ] + %cmp1 = icmp ugt i32 %ix.024, %conv + br i1 %cmp1, label %if.then, label %for.inc + +if.then: + %idx0 = shl nuw nsw i32 %ix.024, 2 + %idx1 = add i32 %idx0, 1 + %idx2 = add i32 %idx0, 2 + %idx3 = add i32 %idx0, 3 + + %array1idx0 = getelementptr inbounds i8, ptr %p, i32 %idx0 + %0 = load i8, ptr %array1idx0, align 1 + %array1idx1 = getelementptr inbounds i8, ptr %p, i32 %idx1 + %1 = load i8, ptr %array1idx1, align 1 + %array1idx2 = getelementptr inbounds i8, ptr %p, i32 %idx2 + %2 = load i8, ptr %array1idx2, align 1 + %array1idx3 = getelementptr inbounds i8, ptr %p, i32 %idx3 + %3 = load i8, ptr %array1idx3, align 1 + + %cmp.i1 = icmp slt i8 %0, %1 + %spec.select.i1 = select i1 %cmp.i1, i8 %1, i8 %0 + %sub1 = sub i8 0, %spec.select.i1 + %cmp.i2 = icmp slt i8 %2, %3 + %spec.select.i2 = select i1 %cmp.i2, i8 %3, i8 %2 + %sub2 = sub i8 0, %spec.select.i2 + + %array3idx0 = getelementptr inbounds i8, ptr %q, i32 %idx0 + store i8 %spec.select.i1, ptr %array3idx0, align 1 + %array3idx1 = getelementptr inbounds i8, ptr %q, i32 %idx1 + store i8 %sub1, ptr %array3idx1, align 1 + %array3idx2 = getelementptr inbounds i8, ptr %q, i32 %idx2 + store i8 %spec.select.i2, ptr %array3idx2, align 1 + %array3idx3 = getelementptr inbounds i8, ptr %q, i32 %idx3 + store i8 %sub2, ptr %array3idx3, align 1 + + br label %for.inc + +for.inc: + %inc = add nuw nsw i32 %ix.024, 1 + %exitcond = icmp eq i32 %inc, 1024 + br i1 %exitcond, label %for.end, label %for.body + +for.end: + ret void +} diff --git a/llvm/test/Transforms/LoopVectorize/RISCV/interleaved-accesses.ll b/llvm/test/Transforms/LoopVectorize/RISCV/interleaved-accesses.ll index bda4839dead51..b1ff589fe51bf 100644 --- a/llvm/test/Transforms/LoopVectorize/RISCV/interleaved-accesses.ll +++ b/llvm/test/Transforms/LoopVectorize/RISCV/interleaved-accesses.ll @@ -9,7 +9,7 @@ define void @load_store_factor2_i32(ptr %p) { ; CHECK-NEXT: [[TMP0:%.*]] = call i64 @llvm.vscale.i64() ; CHECK-NEXT: [[TMP1:%.*]] = mul 
i64 [[TMP0]], 4 ; CHECK-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 1024, [[TMP1]] -; CHECK-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] +; CHECK-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[ENTRY:%.*]] ; CHECK: vector.ph: ; CHECK-NEXT: [[TMP2:%.*]] = call i64 @llvm.vscale.i64() ; CHECK-NEXT: [[TMP3:%.*]] = mul i64 [[TMP2]], 4 @@ -17,88 +17,88 @@ define void @load_store_factor2_i32(ptr %p) { ; CHECK-NEXT: [[N_VEC:%.*]] = sub i64 1024, [[N_MOD_VF]] ; CHECK-NEXT: [[TMP4:%.*]] = call i64 @llvm.vscale.i64() ; CHECK-NEXT: [[TMP5:%.*]] = mul i64 [[TMP4]], 4 -; CHECK-NEXT: br label [[VECTOR_BODY:%.*]] +; CHECK-NEXT: br label [[LOOP:%.*]] ; CHECK: vector.body: -; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] -; CHECK-NEXT: [[TMP6:%.*]] = add i64 [[INDEX]], 0 -; CHECK-NEXT: [[TMP7:%.*]] = shl i64 [[TMP6]], 1 -; CHECK-NEXT: [[TMP8:%.*]] = getelementptr i32, ptr [[P:%.*]], i64 [[TMP7]] -; CHECK-NEXT: [[WIDE_VEC:%.*]] = load , ptr [[TMP8]], align 4 +; CHECK-NEXT: [[I:%.*]] = phi i64 [ 0, [[ENTRY]] ], [ [[NEXTI:%.*]], [[LOOP]] ] +; CHECK-NEXT: [[TMP6:%.*]] = add i64 [[I]], 0 +; CHECK-NEXT: [[OFFSET0:%.*]] = shl i64 [[TMP6]], 1 +; CHECK-NEXT: [[Q0:%.*]] = getelementptr i32, ptr [[P:%.*]], i64 [[OFFSET0]] +; CHECK-NEXT: [[WIDE_VEC:%.*]] = load , ptr [[Q0]], align 4 ; CHECK-NEXT: [[STRIDED_VEC:%.*]] = call { , } @llvm.vector.deinterleave2.nxv8i32( [[WIDE_VEC]]) -; CHECK-NEXT: [[TMP10:%.*]] = extractvalue { , } [[STRIDED_VEC]], 0 -; CHECK-NEXT: [[TMP11:%.*]] = extractvalue { , } [[STRIDED_VEC]], 1 -; CHECK-NEXT: [[TMP12:%.*]] = add [[TMP10]], splat (i32 1) -; CHECK-NEXT: [[TMP15:%.*]] = add [[TMP11]], splat (i32 2) -; CHECK-NEXT: [[INTERLEAVED_VEC:%.*]] = call @llvm.vector.interleave2.nxv8i32( [[TMP12]], [[TMP15]]) -; CHECK-NEXT: store [[INTERLEAVED_VEC]], ptr [[TMP8]], align 4 -; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP5]] -; CHECK-NEXT: [[TMP17:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] -; CHECK-NEXT: br i1 [[TMP17]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]] +; CHECK-NEXT: [[TMP9:%.*]] = extractvalue { , } [[STRIDED_VEC]], 0 +; CHECK-NEXT: [[TMP10:%.*]] = extractvalue { , } [[STRIDED_VEC]], 1 +; CHECK-NEXT: [[TMP11:%.*]] = add [[TMP9]], splat (i32 1) +; CHECK-NEXT: [[TMP12:%.*]] = add [[TMP10]], splat (i32 2) +; CHECK-NEXT: [[INTERLEAVED_VEC:%.*]] = call @llvm.vector.interleave2.nxv8i32( [[TMP11]], [[TMP12]]) +; CHECK-NEXT: store [[INTERLEAVED_VEC]], ptr [[Q0]], align 4 +; CHECK-NEXT: [[NEXTI]] = add nuw i64 [[I]], [[TMP5]] +; CHECK-NEXT: [[TMP13:%.*]] = icmp eq i64 [[NEXTI]], [[N_VEC]] +; CHECK-NEXT: br i1 [[TMP13]], label [[MIDDLE_BLOCK:%.*]], label [[LOOP]], !llvm.loop [[LOOP0:![0-9]+]] ; CHECK: middle.block: ; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 1024, [[N_VEC]] ; CHECK-NEXT: br i1 [[CMP_N]], label [[EXIT:%.*]], label [[SCALAR_PH]] ; CHECK: scalar.ph: -; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ] -; CHECK-NEXT: br label [[LOOP:%.*]] +; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY1:%.*]] ] +; CHECK-NEXT: br label [[LOOP1:%.*]] ; CHECK: loop: -; CHECK-NEXT: [[I:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[NEXTI:%.*]], [[LOOP]] ] -; CHECK-NEXT: [[OFFSET0:%.*]] = shl i64 [[I]], 1 -; CHECK-NEXT: [[Q0:%.*]] = getelementptr i32, ptr [[P]], i64 [[OFFSET0]] -; CHECK-NEXT: [[X0:%.*]] = load i32, ptr [[Q0]], align 4 +; CHECK-NEXT: 
[[I1:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[NEXTI1:%.*]], [[LOOP1]] ] +; CHECK-NEXT: [[OFFSET2:%.*]] = shl i64 [[I1]], 1 +; CHECK-NEXT: [[Q2:%.*]] = getelementptr i32, ptr [[P]], i64 [[OFFSET2]] +; CHECK-NEXT: [[X0:%.*]] = load i32, ptr [[Q2]], align 4 ; CHECK-NEXT: [[Y0:%.*]] = add i32 [[X0]], 1 -; CHECK-NEXT: store i32 [[Y0]], ptr [[Q0]], align 4 -; CHECK-NEXT: [[OFFSET1:%.*]] = add i64 [[OFFSET0]], 1 +; CHECK-NEXT: store i32 [[Y0]], ptr [[Q2]], align 4 +; CHECK-NEXT: [[OFFSET1:%.*]] = add i64 [[OFFSET2]], 1 ; CHECK-NEXT: [[Q1:%.*]] = getelementptr i32, ptr [[P]], i64 [[OFFSET1]] ; CHECK-NEXT: [[X1:%.*]] = load i32, ptr [[Q1]], align 4 ; CHECK-NEXT: [[Y1:%.*]] = add i32 [[X1]], 2 ; CHECK-NEXT: store i32 [[Y1]], ptr [[Q1]], align 4 -; CHECK-NEXT: [[NEXTI]] = add i64 [[I]], 1 -; CHECK-NEXT: [[DONE:%.*]] = icmp eq i64 [[NEXTI]], 1024 -; CHECK-NEXT: br i1 [[DONE]], label [[EXIT]], label [[LOOP]], !llvm.loop [[LOOP3:![0-9]+]] +; CHECK-NEXT: [[NEXTI1]] = add i64 [[I1]], 1 +; CHECK-NEXT: [[DONE:%.*]] = icmp eq i64 [[NEXTI1]], 1024 +; CHECK-NEXT: br i1 [[DONE]], label [[EXIT]], label [[LOOP1]], !llvm.loop [[LOOP3:![0-9]+]] ; CHECK: exit: ; CHECK-NEXT: ret void ; ; FIXED-LABEL: @load_store_factor2_i32( ; FIXED-NEXT: entry: -; FIXED-NEXT: br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] +; FIXED-NEXT: br i1 false, label [[SCALAR_PH:%.*]], label [[ENTRY:%.*]] ; FIXED: vector.ph: -; FIXED-NEXT: br label [[VECTOR_BODY:%.*]] +; FIXED-NEXT: br label [[LOOP:%.*]] ; FIXED: vector.body: -; FIXED-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] -; FIXED-NEXT: [[TMP0:%.*]] = add i64 [[INDEX]], 0 -; FIXED-NEXT: [[TMP1:%.*]] = shl i64 [[TMP0]], 1 -; FIXED-NEXT: [[TMP2:%.*]] = getelementptr i32, ptr [[P:%.*]], i64 [[TMP1]] -; FIXED-NEXT: [[WIDE_VEC:%.*]] = load <16 x i32>, ptr [[TMP2]], align 4 +; FIXED-NEXT: [[I:%.*]] = phi i64 [ 0, [[ENTRY]] ], [ [[NEXTI:%.*]], [[LOOP]] ] +; FIXED-NEXT: [[TMP0:%.*]] = add i64 [[I]], 0 +; FIXED-NEXT: [[OFFSET0:%.*]] = shl i64 [[TMP0]], 1 +; FIXED-NEXT: [[Q0:%.*]] = getelementptr i32, ptr [[P:%.*]], i64 [[OFFSET0]] +; FIXED-NEXT: [[WIDE_VEC:%.*]] = load <16 x i32>, ptr [[Q0]], align 4 ; FIXED-NEXT: [[STRIDED_VEC:%.*]] = shufflevector <16 x i32> [[WIDE_VEC]], <16 x i32> poison, <8 x i32> ; FIXED-NEXT: [[STRIDED_VEC1:%.*]] = shufflevector <16 x i32> [[WIDE_VEC]], <16 x i32> poison, <8 x i32> -; FIXED-NEXT: [[TMP4:%.*]] = add <8 x i32> [[STRIDED_VEC]], splat (i32 1) -; FIXED-NEXT: [[TMP7:%.*]] = add <8 x i32> [[STRIDED_VEC1]], splat (i32 2) -; FIXED-NEXT: [[TMP9:%.*]] = shufflevector <8 x i32> [[TMP4]], <8 x i32> [[TMP7]], <16 x i32> -; FIXED-NEXT: [[INTERLEAVED_VEC:%.*]] = shufflevector <16 x i32> [[TMP9]], <16 x i32> poison, <16 x i32> -; FIXED-NEXT: store <16 x i32> [[INTERLEAVED_VEC]], ptr [[TMP2]], align 4 -; FIXED-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 8 -; FIXED-NEXT: [[TMP10:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1024 -; FIXED-NEXT: br i1 [[TMP10]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]] +; FIXED-NEXT: [[TMP3:%.*]] = add <8 x i32> [[STRIDED_VEC]], splat (i32 1) +; FIXED-NEXT: [[TMP4:%.*]] = add <8 x i32> [[STRIDED_VEC1]], splat (i32 2) +; FIXED-NEXT: [[TMP5:%.*]] = shufflevector <8 x i32> [[TMP3]], <8 x i32> [[TMP4]], <16 x i32> +; FIXED-NEXT: [[INTERLEAVED_VEC:%.*]] = shufflevector <16 x i32> [[TMP5]], <16 x i32> poison, <16 x i32> +; FIXED-NEXT: store <16 x i32> [[INTERLEAVED_VEC]], ptr [[Q0]], align 4 +; FIXED-NEXT: [[NEXTI]] = add nuw i64 [[I]], 8 +; 
FIXED-NEXT: [[TMP6:%.*]] = icmp eq i64 [[NEXTI]], 1024 +; FIXED-NEXT: br i1 [[TMP6]], label [[MIDDLE_BLOCK:%.*]], label [[LOOP]], !llvm.loop [[LOOP0:![0-9]+]] ; FIXED: middle.block: ; FIXED-NEXT: br i1 true, label [[EXIT:%.*]], label [[SCALAR_PH]] ; FIXED: scalar.ph: -; FIXED-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 1024, [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ] -; FIXED-NEXT: br label [[LOOP:%.*]] +; FIXED-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 1024, [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY1:%.*]] ] +; FIXED-NEXT: br label [[LOOP1:%.*]] ; FIXED: loop: -; FIXED-NEXT: [[I:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[NEXTI:%.*]], [[LOOP]] ] -; FIXED-NEXT: [[OFFSET0:%.*]] = shl i64 [[I]], 1 -; FIXED-NEXT: [[Q0:%.*]] = getelementptr i32, ptr [[P]], i64 [[OFFSET0]] -; FIXED-NEXT: [[X0:%.*]] = load i32, ptr [[Q0]], align 4 +; FIXED-NEXT: [[I1:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[NEXTI1:%.*]], [[LOOP1]] ] +; FIXED-NEXT: [[OFFSET2:%.*]] = shl i64 [[I1]], 1 +; FIXED-NEXT: [[Q2:%.*]] = getelementptr i32, ptr [[P]], i64 [[OFFSET2]] +; FIXED-NEXT: [[X0:%.*]] = load i32, ptr [[Q2]], align 4 ; FIXED-NEXT: [[Y0:%.*]] = add i32 [[X0]], 1 -; FIXED-NEXT: store i32 [[Y0]], ptr [[Q0]], align 4 -; FIXED-NEXT: [[OFFSET1:%.*]] = add i64 [[OFFSET0]], 1 +; FIXED-NEXT: store i32 [[Y0]], ptr [[Q2]], align 4 +; FIXED-NEXT: [[OFFSET1:%.*]] = add i64 [[OFFSET2]], 1 ; FIXED-NEXT: [[Q1:%.*]] = getelementptr i32, ptr [[P]], i64 [[OFFSET1]] ; FIXED-NEXT: [[X1:%.*]] = load i32, ptr [[Q1]], align 4 ; FIXED-NEXT: [[Y1:%.*]] = add i32 [[X1]], 2 ; FIXED-NEXT: store i32 [[Y1]], ptr [[Q1]], align 4 -; FIXED-NEXT: [[NEXTI]] = add i64 [[I]], 1 -; FIXED-NEXT: [[DONE:%.*]] = icmp eq i64 [[NEXTI]], 1024 -; FIXED-NEXT: br i1 [[DONE]], label [[EXIT]], label [[LOOP]], !llvm.loop [[LOOP3:![0-9]+]] +; FIXED-NEXT: [[NEXTI1]] = add i64 [[I1]], 1 +; FIXED-NEXT: [[DONE:%.*]] = icmp eq i64 [[NEXTI1]], 1024 +; FIXED-NEXT: br i1 [[DONE]], label [[EXIT]], label [[LOOP1]], !llvm.loop [[LOOP3:![0-9]+]] ; FIXED: exit: ; FIXED-NEXT: ret void ; @@ -107,7 +107,7 @@ define void @load_store_factor2_i32(ptr %p) { ; SCALABLE-NEXT: [[TMP0:%.*]] = call i64 @llvm.vscale.i64() ; SCALABLE-NEXT: [[TMP1:%.*]] = mul i64 [[TMP0]], 4 ; SCALABLE-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 1024, [[TMP1]] -; SCALABLE-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] +; SCALABLE-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[ENTRY:%.*]] ; SCALABLE: vector.ph: ; SCALABLE-NEXT: [[TMP2:%.*]] = call i64 @llvm.vscale.i64() ; SCALABLE-NEXT: [[TMP3:%.*]] = mul i64 [[TMP2]], 4 @@ -115,44 +115,44 @@ define void @load_store_factor2_i32(ptr %p) { ; SCALABLE-NEXT: [[N_VEC:%.*]] = sub i64 1024, [[N_MOD_VF]] ; SCALABLE-NEXT: [[TMP4:%.*]] = call i64 @llvm.vscale.i64() ; SCALABLE-NEXT: [[TMP5:%.*]] = mul i64 [[TMP4]], 4 -; SCALABLE-NEXT: br label [[VECTOR_BODY:%.*]] +; SCALABLE-NEXT: br label [[LOOP:%.*]] ; SCALABLE: vector.body: -; SCALABLE-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] -; SCALABLE-NEXT: [[TMP6:%.*]] = add i64 [[INDEX]], 0 -; SCALABLE-NEXT: [[TMP7:%.*]] = shl i64 [[TMP6]], 1 -; SCALABLE-NEXT: [[TMP8:%.*]] = getelementptr i32, ptr [[P:%.*]], i64 [[TMP7]] -; SCALABLE-NEXT: [[WIDE_VEC:%.*]] = load , ptr [[TMP8]], align 4 +; SCALABLE-NEXT: [[I:%.*]] = phi i64 [ 0, [[ENTRY]] ], [ [[NEXTI:%.*]], [[LOOP]] ] +; SCALABLE-NEXT: [[TMP6:%.*]] = add i64 [[I]], 0 +; SCALABLE-NEXT: [[OFFSET0:%.*]] = shl i64 [[TMP6]], 1 +; SCALABLE-NEXT: [[Q0:%.*]] = getelementptr 
i32, ptr [[P:%.*]], i64 [[OFFSET0]] +; SCALABLE-NEXT: [[WIDE_VEC:%.*]] = load , ptr [[Q0]], align 4 ; SCALABLE-NEXT: [[STRIDED_VEC:%.*]] = call { , } @llvm.vector.deinterleave2.nxv8i32( [[WIDE_VEC]]) -; SCALABLE-NEXT: [[TMP10:%.*]] = extractvalue { , } [[STRIDED_VEC]], 0 -; SCALABLE-NEXT: [[TMP11:%.*]] = extractvalue { , } [[STRIDED_VEC]], 1 -; SCALABLE-NEXT: [[TMP12:%.*]] = add [[TMP10]], splat (i32 1) -; SCALABLE-NEXT: [[TMP15:%.*]] = add [[TMP11]], splat (i32 2) -; SCALABLE-NEXT: [[INTERLEAVED_VEC:%.*]] = call @llvm.vector.interleave2.nxv8i32( [[TMP12]], [[TMP15]]) -; SCALABLE-NEXT: store [[INTERLEAVED_VEC]], ptr [[TMP8]], align 4 -; SCALABLE-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP5]] -; SCALABLE-NEXT: [[TMP17:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] -; SCALABLE-NEXT: br i1 [[TMP17]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]] +; SCALABLE-NEXT: [[TMP9:%.*]] = extractvalue { , } [[STRIDED_VEC]], 0 +; SCALABLE-NEXT: [[TMP10:%.*]] = extractvalue { , } [[STRIDED_VEC]], 1 +; SCALABLE-NEXT: [[TMP11:%.*]] = add [[TMP9]], splat (i32 1) +; SCALABLE-NEXT: [[TMP12:%.*]] = add [[TMP10]], splat (i32 2) +; SCALABLE-NEXT: [[INTERLEAVED_VEC:%.*]] = call @llvm.vector.interleave2.nxv8i32( [[TMP11]], [[TMP12]]) +; SCALABLE-NEXT: store [[INTERLEAVED_VEC]], ptr [[Q0]], align 4 +; SCALABLE-NEXT: [[NEXTI]] = add nuw i64 [[I]], [[TMP5]] +; SCALABLE-NEXT: [[TMP13:%.*]] = icmp eq i64 [[NEXTI]], [[N_VEC]] +; SCALABLE-NEXT: br i1 [[TMP13]], label [[MIDDLE_BLOCK:%.*]], label [[LOOP]], !llvm.loop [[LOOP0:![0-9]+]] ; SCALABLE: middle.block: ; SCALABLE-NEXT: [[CMP_N:%.*]] = icmp eq i64 1024, [[N_VEC]] ; SCALABLE-NEXT: br i1 [[CMP_N]], label [[EXIT:%.*]], label [[SCALAR_PH]] ; SCALABLE: scalar.ph: -; SCALABLE-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ] -; SCALABLE-NEXT: br label [[LOOP:%.*]] +; SCALABLE-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY1:%.*]] ] +; SCALABLE-NEXT: br label [[LOOP1:%.*]] ; SCALABLE: loop: -; SCALABLE-NEXT: [[I:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[NEXTI:%.*]], [[LOOP]] ] -; SCALABLE-NEXT: [[OFFSET0:%.*]] = shl i64 [[I]], 1 -; SCALABLE-NEXT: [[Q0:%.*]] = getelementptr i32, ptr [[P]], i64 [[OFFSET0]] -; SCALABLE-NEXT: [[X0:%.*]] = load i32, ptr [[Q0]], align 4 +; SCALABLE-NEXT: [[I1:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[NEXTI1:%.*]], [[LOOP1]] ] +; SCALABLE-NEXT: [[OFFSET2:%.*]] = shl i64 [[I1]], 1 +; SCALABLE-NEXT: [[Q2:%.*]] = getelementptr i32, ptr [[P]], i64 [[OFFSET2]] +; SCALABLE-NEXT: [[X0:%.*]] = load i32, ptr [[Q2]], align 4 ; SCALABLE-NEXT: [[Y0:%.*]] = add i32 [[X0]], 1 -; SCALABLE-NEXT: store i32 [[Y0]], ptr [[Q0]], align 4 -; SCALABLE-NEXT: [[OFFSET1:%.*]] = add i64 [[OFFSET0]], 1 +; SCALABLE-NEXT: store i32 [[Y0]], ptr [[Q2]], align 4 +; SCALABLE-NEXT: [[OFFSET1:%.*]] = add i64 [[OFFSET2]], 1 ; SCALABLE-NEXT: [[Q1:%.*]] = getelementptr i32, ptr [[P]], i64 [[OFFSET1]] ; SCALABLE-NEXT: [[X1:%.*]] = load i32, ptr [[Q1]], align 4 ; SCALABLE-NEXT: [[Y1:%.*]] = add i32 [[X1]], 2 ; SCALABLE-NEXT: store i32 [[Y1]], ptr [[Q1]], align 4 -; SCALABLE-NEXT: [[NEXTI]] = add i64 [[I]], 1 -; SCALABLE-NEXT: [[DONE:%.*]] = icmp eq i64 [[NEXTI]], 1024 -; SCALABLE-NEXT: br i1 [[DONE]], label [[EXIT]], label [[LOOP]], !llvm.loop [[LOOP3:![0-9]+]] +; SCALABLE-NEXT: [[NEXTI1]] = add i64 [[I1]], 1 +; SCALABLE-NEXT: [[DONE:%.*]] = icmp eq i64 [[NEXTI1]], 1024 +; SCALABLE-NEXT: br i1 [[DONE]], label [[EXIT]], label [[LOOP1]], 
!llvm.loop [[LOOP3:![0-9]+]] ; SCALABLE: exit: ; SCALABLE-NEXT: ret void ; @@ -186,7 +186,7 @@ define void @load_store_factor2_i64(ptr %p) { ; CHECK-NEXT: [[TMP0:%.*]] = call i64 @llvm.vscale.i64() ; CHECK-NEXT: [[TMP1:%.*]] = mul i64 [[TMP0]], 2 ; CHECK-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 1024, [[TMP1]] -; CHECK-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] +; CHECK-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[ENTRY:%.*]] ; CHECK: vector.ph: ; CHECK-NEXT: [[TMP2:%.*]] = call i64 @llvm.vscale.i64() ; CHECK-NEXT: [[TMP3:%.*]] = mul i64 [[TMP2]], 2 @@ -194,88 +194,88 @@ define void @load_store_factor2_i64(ptr %p) { ; CHECK-NEXT: [[N_VEC:%.*]] = sub i64 1024, [[N_MOD_VF]] ; CHECK-NEXT: [[TMP4:%.*]] = call i64 @llvm.vscale.i64() ; CHECK-NEXT: [[TMP5:%.*]] = mul i64 [[TMP4]], 2 -; CHECK-NEXT: br label [[VECTOR_BODY:%.*]] +; CHECK-NEXT: br label [[LOOP:%.*]] ; CHECK: vector.body: -; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] -; CHECK-NEXT: [[TMP6:%.*]] = add i64 [[INDEX]], 0 -; CHECK-NEXT: [[TMP7:%.*]] = shl i64 [[TMP6]], 1 -; CHECK-NEXT: [[TMP8:%.*]] = getelementptr i64, ptr [[P:%.*]], i64 [[TMP7]] -; CHECK-NEXT: [[WIDE_VEC:%.*]] = load , ptr [[TMP8]], align 8 +; CHECK-NEXT: [[I:%.*]] = phi i64 [ 0, [[ENTRY]] ], [ [[NEXTI:%.*]], [[LOOP]] ] +; CHECK-NEXT: [[TMP6:%.*]] = add i64 [[I]], 0 +; CHECK-NEXT: [[OFFSET0:%.*]] = shl i64 [[TMP6]], 1 +; CHECK-NEXT: [[Q0:%.*]] = getelementptr i64, ptr [[P:%.*]], i64 [[OFFSET0]] +; CHECK-NEXT: [[WIDE_VEC:%.*]] = load , ptr [[Q0]], align 8 ; CHECK-NEXT: [[STRIDED_VEC:%.*]] = call { , } @llvm.vector.deinterleave2.nxv4i64( [[WIDE_VEC]]) -; CHECK-NEXT: [[TMP10:%.*]] = extractvalue { , } [[STRIDED_VEC]], 0 -; CHECK-NEXT: [[TMP11:%.*]] = extractvalue { , } [[STRIDED_VEC]], 1 -; CHECK-NEXT: [[TMP12:%.*]] = add [[TMP10]], splat (i64 1) -; CHECK-NEXT: [[TMP15:%.*]] = add [[TMP11]], splat (i64 2) -; CHECK-NEXT: [[INTERLEAVED_VEC:%.*]] = call @llvm.vector.interleave2.nxv4i64( [[TMP12]], [[TMP15]]) -; CHECK-NEXT: store [[INTERLEAVED_VEC]], ptr [[TMP8]], align 8 -; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP5]] -; CHECK-NEXT: [[TMP17:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] -; CHECK-NEXT: br i1 [[TMP17]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]] +; CHECK-NEXT: [[TMP9:%.*]] = extractvalue { , } [[STRIDED_VEC]], 0 +; CHECK-NEXT: [[TMP10:%.*]] = extractvalue { , } [[STRIDED_VEC]], 1 +; CHECK-NEXT: [[TMP11:%.*]] = add [[TMP9]], splat (i64 1) +; CHECK-NEXT: [[TMP12:%.*]] = add [[TMP10]], splat (i64 2) +; CHECK-NEXT: [[INTERLEAVED_VEC:%.*]] = call @llvm.vector.interleave2.nxv4i64( [[TMP11]], [[TMP12]]) +; CHECK-NEXT: store [[INTERLEAVED_VEC]], ptr [[Q0]], align 8 +; CHECK-NEXT: [[NEXTI]] = add nuw i64 [[I]], [[TMP5]] +; CHECK-NEXT: [[TMP13:%.*]] = icmp eq i64 [[NEXTI]], [[N_VEC]] +; CHECK-NEXT: br i1 [[TMP13]], label [[MIDDLE_BLOCK:%.*]], label [[LOOP]], !llvm.loop [[LOOP4:![0-9]+]] ; CHECK: middle.block: ; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 1024, [[N_VEC]] ; CHECK-NEXT: br i1 [[CMP_N]], label [[EXIT:%.*]], label [[SCALAR_PH]] ; CHECK: scalar.ph: -; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ] -; CHECK-NEXT: br label [[LOOP:%.*]] +; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY1:%.*]] ] +; CHECK-NEXT: br label [[LOOP1:%.*]] ; CHECK: loop: -; CHECK-NEXT: [[I:%.*]] = phi i64 [ [[BC_RESUME_VAL]], 
[[SCALAR_PH]] ], [ [[NEXTI:%.*]], [[LOOP]] ] -; CHECK-NEXT: [[OFFSET0:%.*]] = shl i64 [[I]], 1 -; CHECK-NEXT: [[Q0:%.*]] = getelementptr i64, ptr [[P]], i64 [[OFFSET0]] -; CHECK-NEXT: [[X0:%.*]] = load i64, ptr [[Q0]], align 8 +; CHECK-NEXT: [[I1:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[NEXTI1:%.*]], [[LOOP1]] ] +; CHECK-NEXT: [[OFFSET2:%.*]] = shl i64 [[I1]], 1 +; CHECK-NEXT: [[Q2:%.*]] = getelementptr i64, ptr [[P]], i64 [[OFFSET2]] +; CHECK-NEXT: [[X0:%.*]] = load i64, ptr [[Q2]], align 8 ; CHECK-NEXT: [[Y0:%.*]] = add i64 [[X0]], 1 -; CHECK-NEXT: store i64 [[Y0]], ptr [[Q0]], align 8 -; CHECK-NEXT: [[OFFSET1:%.*]] = add i64 [[OFFSET0]], 1 +; CHECK-NEXT: store i64 [[Y0]], ptr [[Q2]], align 8 +; CHECK-NEXT: [[OFFSET1:%.*]] = add i64 [[OFFSET2]], 1 ; CHECK-NEXT: [[Q1:%.*]] = getelementptr i64, ptr [[P]], i64 [[OFFSET1]] ; CHECK-NEXT: [[X1:%.*]] = load i64, ptr [[Q1]], align 8 ; CHECK-NEXT: [[Y1:%.*]] = add i64 [[X1]], 2 ; CHECK-NEXT: store i64 [[Y1]], ptr [[Q1]], align 8 -; CHECK-NEXT: [[NEXTI]] = add i64 [[I]], 1 -; CHECK-NEXT: [[DONE:%.*]] = icmp eq i64 [[NEXTI]], 1024 -; CHECK-NEXT: br i1 [[DONE]], label [[EXIT]], label [[LOOP]], !llvm.loop [[LOOP5:![0-9]+]] +; CHECK-NEXT: [[NEXTI1]] = add i64 [[I1]], 1 +; CHECK-NEXT: [[DONE:%.*]] = icmp eq i64 [[NEXTI1]], 1024 +; CHECK-NEXT: br i1 [[DONE]], label [[EXIT]], label [[LOOP1]], !llvm.loop [[LOOP5:![0-9]+]] ; CHECK: exit: ; CHECK-NEXT: ret void ; ; FIXED-LABEL: @load_store_factor2_i64( ; FIXED-NEXT: entry: -; FIXED-NEXT: br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] +; FIXED-NEXT: br i1 false, label [[SCALAR_PH:%.*]], label [[ENTRY:%.*]] ; FIXED: vector.ph: -; FIXED-NEXT: br label [[VECTOR_BODY:%.*]] +; FIXED-NEXT: br label [[LOOP:%.*]] ; FIXED: vector.body: -; FIXED-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] -; FIXED-NEXT: [[TMP0:%.*]] = add i64 [[INDEX]], 0 -; FIXED-NEXT: [[TMP1:%.*]] = shl i64 [[TMP0]], 1 -; FIXED-NEXT: [[TMP2:%.*]] = getelementptr i64, ptr [[P:%.*]], i64 [[TMP1]] -; FIXED-NEXT: [[WIDE_VEC:%.*]] = load <8 x i64>, ptr [[TMP2]], align 8 +; FIXED-NEXT: [[I:%.*]] = phi i64 [ 0, [[ENTRY]] ], [ [[NEXTI:%.*]], [[LOOP]] ] +; FIXED-NEXT: [[TMP0:%.*]] = add i64 [[I]], 0 +; FIXED-NEXT: [[OFFSET0:%.*]] = shl i64 [[TMP0]], 1 +; FIXED-NEXT: [[Q0:%.*]] = getelementptr i64, ptr [[P:%.*]], i64 [[OFFSET0]] +; FIXED-NEXT: [[WIDE_VEC:%.*]] = load <8 x i64>, ptr [[Q0]], align 8 ; FIXED-NEXT: [[STRIDED_VEC:%.*]] = shufflevector <8 x i64> [[WIDE_VEC]], <8 x i64> poison, <4 x i32> ; FIXED-NEXT: [[STRIDED_VEC1:%.*]] = shufflevector <8 x i64> [[WIDE_VEC]], <8 x i64> poison, <4 x i32> -; FIXED-NEXT: [[TMP4:%.*]] = add <4 x i64> [[STRIDED_VEC]], splat (i64 1) -; FIXED-NEXT: [[TMP7:%.*]] = add <4 x i64> [[STRIDED_VEC1]], splat (i64 2) -; FIXED-NEXT: [[TMP9:%.*]] = shufflevector <4 x i64> [[TMP4]], <4 x i64> [[TMP7]], <8 x i32> -; FIXED-NEXT: [[INTERLEAVED_VEC:%.*]] = shufflevector <8 x i64> [[TMP9]], <8 x i64> poison, <8 x i32> -; FIXED-NEXT: store <8 x i64> [[INTERLEAVED_VEC]], ptr [[TMP2]], align 8 -; FIXED-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4 -; FIXED-NEXT: [[TMP10:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1024 -; FIXED-NEXT: br i1 [[TMP10]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]] +; FIXED-NEXT: [[TMP3:%.*]] = add <4 x i64> [[STRIDED_VEC]], splat (i64 1) +; FIXED-NEXT: [[TMP4:%.*]] = add <4 x i64> [[STRIDED_VEC1]], splat (i64 2) +; FIXED-NEXT: [[TMP5:%.*]] = shufflevector <4 x i64> [[TMP3]], <4 x i64> [[TMP4]], <8 x i32> 
+; FIXED-NEXT: [[INTERLEAVED_VEC:%.*]] = shufflevector <8 x i64> [[TMP5]], <8 x i64> poison, <8 x i32> +; FIXED-NEXT: store <8 x i64> [[INTERLEAVED_VEC]], ptr [[Q0]], align 8 +; FIXED-NEXT: [[NEXTI]] = add nuw i64 [[I]], 4 +; FIXED-NEXT: [[TMP6:%.*]] = icmp eq i64 [[NEXTI]], 1024 +; FIXED-NEXT: br i1 [[TMP6]], label [[MIDDLE_BLOCK:%.*]], label [[LOOP]], !llvm.loop [[LOOP4:![0-9]+]] ; FIXED: middle.block: ; FIXED-NEXT: br i1 true, label [[EXIT:%.*]], label [[SCALAR_PH]] ; FIXED: scalar.ph: -; FIXED-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 1024, [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ] -; FIXED-NEXT: br label [[LOOP:%.*]] +; FIXED-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 1024, [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY1:%.*]] ] +; FIXED-NEXT: br label [[LOOP1:%.*]] ; FIXED: loop: -; FIXED-NEXT: [[I:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[NEXTI:%.*]], [[LOOP]] ] -; FIXED-NEXT: [[OFFSET0:%.*]] = shl i64 [[I]], 1 -; FIXED-NEXT: [[Q0:%.*]] = getelementptr i64, ptr [[P]], i64 [[OFFSET0]] -; FIXED-NEXT: [[X0:%.*]] = load i64, ptr [[Q0]], align 8 +; FIXED-NEXT: [[I1:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[NEXTI1:%.*]], [[LOOP1]] ] +; FIXED-NEXT: [[OFFSET2:%.*]] = shl i64 [[I1]], 1 +; FIXED-NEXT: [[Q2:%.*]] = getelementptr i64, ptr [[P]], i64 [[OFFSET2]] +; FIXED-NEXT: [[X0:%.*]] = load i64, ptr [[Q2]], align 8 ; FIXED-NEXT: [[Y0:%.*]] = add i64 [[X0]], 1 -; FIXED-NEXT: store i64 [[Y0]], ptr [[Q0]], align 8 -; FIXED-NEXT: [[OFFSET1:%.*]] = add i64 [[OFFSET0]], 1 +; FIXED-NEXT: store i64 [[Y0]], ptr [[Q2]], align 8 +; FIXED-NEXT: [[OFFSET1:%.*]] = add i64 [[OFFSET2]], 1 ; FIXED-NEXT: [[Q1:%.*]] = getelementptr i64, ptr [[P]], i64 [[OFFSET1]] ; FIXED-NEXT: [[X1:%.*]] = load i64, ptr [[Q1]], align 8 ; FIXED-NEXT: [[Y1:%.*]] = add i64 [[X1]], 2 ; FIXED-NEXT: store i64 [[Y1]], ptr [[Q1]], align 8 -; FIXED-NEXT: [[NEXTI]] = add i64 [[I]], 1 -; FIXED-NEXT: [[DONE:%.*]] = icmp eq i64 [[NEXTI]], 1024 -; FIXED-NEXT: br i1 [[DONE]], label [[EXIT]], label [[LOOP]], !llvm.loop [[LOOP5:![0-9]+]] +; FIXED-NEXT: [[NEXTI1]] = add i64 [[I1]], 1 +; FIXED-NEXT: [[DONE:%.*]] = icmp eq i64 [[NEXTI1]], 1024 +; FIXED-NEXT: br i1 [[DONE]], label [[EXIT]], label [[LOOP1]], !llvm.loop [[LOOP5:![0-9]+]] ; FIXED: exit: ; FIXED-NEXT: ret void ; @@ -284,7 +284,7 @@ define void @load_store_factor2_i64(ptr %p) { ; SCALABLE-NEXT: [[TMP0:%.*]] = call i64 @llvm.vscale.i64() ; SCALABLE-NEXT: [[TMP1:%.*]] = mul i64 [[TMP0]], 2 ; SCALABLE-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 1024, [[TMP1]] -; SCALABLE-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] +; SCALABLE-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[ENTRY:%.*]] ; SCALABLE: vector.ph: ; SCALABLE-NEXT: [[TMP2:%.*]] = call i64 @llvm.vscale.i64() ; SCALABLE-NEXT: [[TMP3:%.*]] = mul i64 [[TMP2]], 2 @@ -292,44 +292,44 @@ define void @load_store_factor2_i64(ptr %p) { ; SCALABLE-NEXT: [[N_VEC:%.*]] = sub i64 1024, [[N_MOD_VF]] ; SCALABLE-NEXT: [[TMP4:%.*]] = call i64 @llvm.vscale.i64() ; SCALABLE-NEXT: [[TMP5:%.*]] = mul i64 [[TMP4]], 2 -; SCALABLE-NEXT: br label [[VECTOR_BODY:%.*]] +; SCALABLE-NEXT: br label [[LOOP:%.*]] ; SCALABLE: vector.body: -; SCALABLE-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] -; SCALABLE-NEXT: [[TMP6:%.*]] = add i64 [[INDEX]], 0 -; SCALABLE-NEXT: [[TMP7:%.*]] = shl i64 [[TMP6]], 1 -; SCALABLE-NEXT: [[TMP8:%.*]] = getelementptr i64, ptr [[P:%.*]], i64 [[TMP7]] -; SCALABLE-NEXT: [[WIDE_VEC:%.*]] = load , ptr [[TMP8]], align 8 +; 
SCALABLE-NEXT: [[I:%.*]] = phi i64 [ 0, [[ENTRY]] ], [ [[NEXTI:%.*]], [[LOOP]] ] +; SCALABLE-NEXT: [[TMP6:%.*]] = add i64 [[I]], 0 +; SCALABLE-NEXT: [[OFFSET0:%.*]] = shl i64 [[TMP6]], 1 +; SCALABLE-NEXT: [[Q0:%.*]] = getelementptr i64, ptr [[P:%.*]], i64 [[OFFSET0]] +; SCALABLE-NEXT: [[WIDE_VEC:%.*]] = load , ptr [[Q0]], align 8 ; SCALABLE-NEXT: [[STRIDED_VEC:%.*]] = call { , } @llvm.vector.deinterleave2.nxv4i64( [[WIDE_VEC]]) -; SCALABLE-NEXT: [[TMP10:%.*]] = extractvalue { , } [[STRIDED_VEC]], 0 -; SCALABLE-NEXT: [[TMP11:%.*]] = extractvalue { , } [[STRIDED_VEC]], 1 -; SCALABLE-NEXT: [[TMP12:%.*]] = add [[TMP10]], splat (i64 1) -; SCALABLE-NEXT: [[TMP15:%.*]] = add [[TMP11]], splat (i64 2) -; SCALABLE-NEXT: [[INTERLEAVED_VEC:%.*]] = call @llvm.vector.interleave2.nxv4i64( [[TMP12]], [[TMP15]]) -; SCALABLE-NEXT: store [[INTERLEAVED_VEC]], ptr [[TMP8]], align 8 -; SCALABLE-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP5]] -; SCALABLE-NEXT: [[TMP17:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] -; SCALABLE-NEXT: br i1 [[TMP17]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]] +; SCALABLE-NEXT: [[TMP9:%.*]] = extractvalue { , } [[STRIDED_VEC]], 0 +; SCALABLE-NEXT: [[TMP10:%.*]] = extractvalue { , } [[STRIDED_VEC]], 1 +; SCALABLE-NEXT: [[TMP11:%.*]] = add [[TMP9]], splat (i64 1) +; SCALABLE-NEXT: [[TMP12:%.*]] = add [[TMP10]], splat (i64 2) +; SCALABLE-NEXT: [[INTERLEAVED_VEC:%.*]] = call @llvm.vector.interleave2.nxv4i64( [[TMP11]], [[TMP12]]) +; SCALABLE-NEXT: store [[INTERLEAVED_VEC]], ptr [[Q0]], align 8 +; SCALABLE-NEXT: [[NEXTI]] = add nuw i64 [[I]], [[TMP5]] +; SCALABLE-NEXT: [[TMP13:%.*]] = icmp eq i64 [[NEXTI]], [[N_VEC]] +; SCALABLE-NEXT: br i1 [[TMP13]], label [[MIDDLE_BLOCK:%.*]], label [[LOOP]], !llvm.loop [[LOOP4:![0-9]+]] ; SCALABLE: middle.block: ; SCALABLE-NEXT: [[CMP_N:%.*]] = icmp eq i64 1024, [[N_VEC]] ; SCALABLE-NEXT: br i1 [[CMP_N]], label [[EXIT:%.*]], label [[SCALAR_PH]] ; SCALABLE: scalar.ph: -; SCALABLE-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ] -; SCALABLE-NEXT: br label [[LOOP:%.*]] +; SCALABLE-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY1:%.*]] ] +; SCALABLE-NEXT: br label [[LOOP1:%.*]] ; SCALABLE: loop: -; SCALABLE-NEXT: [[I:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[NEXTI:%.*]], [[LOOP]] ] -; SCALABLE-NEXT: [[OFFSET0:%.*]] = shl i64 [[I]], 1 -; SCALABLE-NEXT: [[Q0:%.*]] = getelementptr i64, ptr [[P]], i64 [[OFFSET0]] -; SCALABLE-NEXT: [[X0:%.*]] = load i64, ptr [[Q0]], align 8 +; SCALABLE-NEXT: [[I1:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[NEXTI1:%.*]], [[LOOP1]] ] +; SCALABLE-NEXT: [[OFFSET2:%.*]] = shl i64 [[I1]], 1 +; SCALABLE-NEXT: [[Q2:%.*]] = getelementptr i64, ptr [[P]], i64 [[OFFSET2]] +; SCALABLE-NEXT: [[X0:%.*]] = load i64, ptr [[Q2]], align 8 ; SCALABLE-NEXT: [[Y0:%.*]] = add i64 [[X0]], 1 -; SCALABLE-NEXT: store i64 [[Y0]], ptr [[Q0]], align 8 -; SCALABLE-NEXT: [[OFFSET1:%.*]] = add i64 [[OFFSET0]], 1 +; SCALABLE-NEXT: store i64 [[Y0]], ptr [[Q2]], align 8 +; SCALABLE-NEXT: [[OFFSET1:%.*]] = add i64 [[OFFSET2]], 1 ; SCALABLE-NEXT: [[Q1:%.*]] = getelementptr i64, ptr [[P]], i64 [[OFFSET1]] ; SCALABLE-NEXT: [[X1:%.*]] = load i64, ptr [[Q1]], align 8 ; SCALABLE-NEXT: [[Y1:%.*]] = add i64 [[X1]], 2 ; SCALABLE-NEXT: store i64 [[Y1]], ptr [[Q1]], align 8 -; SCALABLE-NEXT: [[NEXTI]] = add i64 [[I]], 1 -; SCALABLE-NEXT: [[DONE:%.*]] = icmp eq i64 [[NEXTI]], 1024 -; SCALABLE-NEXT: br i1 [[DONE]], label 
[[EXIT]], label [[LOOP]], !llvm.loop [[LOOP5:![0-9]+]] +; SCALABLE-NEXT: [[NEXTI1]] = add i64 [[I1]], 1 +; SCALABLE-NEXT: [[DONE:%.*]] = icmp eq i64 [[NEXTI1]], 1024 +; SCALABLE-NEXT: br i1 [[DONE]], label [[EXIT]], label [[LOOP1]], !llvm.loop [[LOOP5:![0-9]+]] ; SCALABLE: exit: ; SCALABLE-NEXT: ret void ; @@ -360,42 +360,42 @@ exit: define void @load_store_factor3_i32(ptr %p) { ; CHECK-LABEL: @load_store_factor3_i32( ; CHECK-NEXT: entry: -; CHECK-NEXT: br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] +; CHECK-NEXT: br i1 false, label [[SCALAR_PH:%.*]], label [[ENTRY:%.*]] ; CHECK: vector.ph: -; CHECK-NEXT: br label [[VECTOR_BODY:%.*]] +; CHECK-NEXT: br label [[LOOP:%.*]] ; CHECK: vector.body: -; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] -; CHECK-NEXT: [[TMP0:%.*]] = add i64 [[INDEX]], 0 -; CHECK-NEXT: [[TMP1:%.*]] = mul i64 [[TMP0]], 3 -; CHECK-NEXT: [[TMP2:%.*]] = getelementptr i32, ptr [[P:%.*]], i64 [[TMP1]] -; CHECK-NEXT: [[WIDE_VEC:%.*]] = load <24 x i32>, ptr [[TMP2]], align 4 +; CHECK-NEXT: [[I:%.*]] = phi i64 [ 0, [[ENTRY]] ], [ [[NEXTI:%.*]], [[LOOP]] ] +; CHECK-NEXT: [[TMP0:%.*]] = add i64 [[I]], 0 +; CHECK-NEXT: [[OFFSET0:%.*]] = mul i64 [[TMP0]], 3 +; CHECK-NEXT: [[Q0:%.*]] = getelementptr i32, ptr [[P:%.*]], i64 [[OFFSET0]] +; CHECK-NEXT: [[WIDE_VEC:%.*]] = load <24 x i32>, ptr [[Q0]], align 4 ; CHECK-NEXT: [[STRIDED_VEC:%.*]] = shufflevector <24 x i32> [[WIDE_VEC]], <24 x i32> poison, <8 x i32> ; CHECK-NEXT: [[STRIDED_VEC1:%.*]] = shufflevector <24 x i32> [[WIDE_VEC]], <24 x i32> poison, <8 x i32> ; CHECK-NEXT: [[STRIDED_VEC2:%.*]] = shufflevector <24 x i32> [[WIDE_VEC]], <24 x i32> poison, <8 x i32> -; CHECK-NEXT: [[TMP4:%.*]] = add <8 x i32> [[STRIDED_VEC]], splat (i32 1) -; CHECK-NEXT: [[TMP6:%.*]] = add <8 x i32> [[STRIDED_VEC1]], splat (i32 2) -; CHECK-NEXT: [[TMP9:%.*]] = add <8 x i32> [[STRIDED_VEC2]], splat (i32 3) -; CHECK-NEXT: [[TMP11:%.*]] = shufflevector <8 x i32> [[TMP4]], <8 x i32> [[TMP6]], <16 x i32> -; CHECK-NEXT: [[TMP12:%.*]] = shufflevector <8 x i32> [[TMP9]], <8 x i32> poison, <16 x i32> -; CHECK-NEXT: [[TMP13:%.*]] = shufflevector <16 x i32> [[TMP11]], <16 x i32> [[TMP12]], <24 x i32> -; CHECK-NEXT: [[INTERLEAVED_VEC:%.*]] = shufflevector <24 x i32> [[TMP13]], <24 x i32> poison, <24 x i32> -; CHECK-NEXT: store <24 x i32> [[INTERLEAVED_VEC]], ptr [[TMP2]], align 4 -; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 8 -; CHECK-NEXT: [[TMP14:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1024 -; CHECK-NEXT: br i1 [[TMP14]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP6:![0-9]+]] +; CHECK-NEXT: [[TMP3:%.*]] = add <8 x i32> [[STRIDED_VEC]], splat (i32 1) +; CHECK-NEXT: [[TMP4:%.*]] = add <8 x i32> [[STRIDED_VEC1]], splat (i32 2) +; CHECK-NEXT: [[TMP5:%.*]] = add <8 x i32> [[STRIDED_VEC2]], splat (i32 3) +; CHECK-NEXT: [[TMP6:%.*]] = shufflevector <8 x i32> [[TMP3]], <8 x i32> [[TMP4]], <16 x i32> +; CHECK-NEXT: [[TMP7:%.*]] = shufflevector <8 x i32> [[TMP5]], <8 x i32> poison, <16 x i32> +; CHECK-NEXT: [[TMP8:%.*]] = shufflevector <16 x i32> [[TMP6]], <16 x i32> [[TMP7]], <24 x i32> +; CHECK-NEXT: [[INTERLEAVED_VEC:%.*]] = shufflevector <24 x i32> [[TMP8]], <24 x i32> poison, <24 x i32> +; CHECK-NEXT: store <24 x i32> [[INTERLEAVED_VEC]], ptr [[Q0]], align 4 +; CHECK-NEXT: [[NEXTI]] = add nuw i64 [[I]], 8 +; CHECK-NEXT: [[TMP9:%.*]] = icmp eq i64 [[NEXTI]], 1024 +; CHECK-NEXT: br i1 [[TMP9]], label [[MIDDLE_BLOCK:%.*]], label [[LOOP]], !llvm.loop [[LOOP6:![0-9]+]] ; CHECK: 
middle.block: ; CHECK-NEXT: br i1 true, label [[EXIT:%.*]], label [[SCALAR_PH]] ; CHECK: scalar.ph: -; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 1024, [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ] -; CHECK-NEXT: br label [[LOOP:%.*]] +; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 1024, [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY1:%.*]] ] +; CHECK-NEXT: br label [[LOOP1:%.*]] ; CHECK: loop: -; CHECK-NEXT: [[I:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[NEXTI:%.*]], [[LOOP]] ] -; CHECK-NEXT: [[OFFSET0:%.*]] = mul i64 [[I]], 3 -; CHECK-NEXT: [[Q0:%.*]] = getelementptr i32, ptr [[P]], i64 [[OFFSET0]] -; CHECK-NEXT: [[X0:%.*]] = load i32, ptr [[Q0]], align 4 +; CHECK-NEXT: [[I1:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[NEXTI1:%.*]], [[LOOP1]] ] +; CHECK-NEXT: [[OFFSET3:%.*]] = mul i64 [[I1]], 3 +; CHECK-NEXT: [[Q3:%.*]] = getelementptr i32, ptr [[P]], i64 [[OFFSET3]] +; CHECK-NEXT: [[X0:%.*]] = load i32, ptr [[Q3]], align 4 ; CHECK-NEXT: [[Y0:%.*]] = add i32 [[X0]], 1 -; CHECK-NEXT: store i32 [[Y0]], ptr [[Q0]], align 4 -; CHECK-NEXT: [[OFFSET1:%.*]] = add i64 [[OFFSET0]], 1 +; CHECK-NEXT: store i32 [[Y0]], ptr [[Q3]], align 4 +; CHECK-NEXT: [[OFFSET1:%.*]] = add i64 [[OFFSET3]], 1 ; CHECK-NEXT: [[Q1:%.*]] = getelementptr i32, ptr [[P]], i64 [[OFFSET1]] ; CHECK-NEXT: [[X1:%.*]] = load i32, ptr [[Q1]], align 4 ; CHECK-NEXT: [[Y1:%.*]] = add i32 [[X1]], 2 @@ -405,50 +405,50 @@ define void @load_store_factor3_i32(ptr %p) { ; CHECK-NEXT: [[X2:%.*]] = load i32, ptr [[Q2]], align 4 ; CHECK-NEXT: [[Y2:%.*]] = add i32 [[X2]], 3 ; CHECK-NEXT: store i32 [[Y2]], ptr [[Q2]], align 4 -; CHECK-NEXT: [[NEXTI]] = add i64 [[I]], 1 -; CHECK-NEXT: [[DONE:%.*]] = icmp eq i64 [[NEXTI]], 1024 -; CHECK-NEXT: br i1 [[DONE]], label [[EXIT]], label [[LOOP]], !llvm.loop [[LOOP7:![0-9]+]] +; CHECK-NEXT: [[NEXTI1]] = add i64 [[I1]], 1 +; CHECK-NEXT: [[DONE:%.*]] = icmp eq i64 [[NEXTI1]], 1024 +; CHECK-NEXT: br i1 [[DONE]], label [[EXIT]], label [[LOOP1]], !llvm.loop [[LOOP7:![0-9]+]] ; CHECK: exit: ; CHECK-NEXT: ret void ; ; FIXED-LABEL: @load_store_factor3_i32( ; FIXED-NEXT: entry: -; FIXED-NEXT: br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] +; FIXED-NEXT: br i1 false, label [[SCALAR_PH:%.*]], label [[ENTRY:%.*]] ; FIXED: vector.ph: -; FIXED-NEXT: br label [[VECTOR_BODY:%.*]] +; FIXED-NEXT: br label [[LOOP:%.*]] ; FIXED: vector.body: -; FIXED-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] -; FIXED-NEXT: [[TMP0:%.*]] = add i64 [[INDEX]], 0 -; FIXED-NEXT: [[TMP1:%.*]] = mul i64 [[TMP0]], 3 -; FIXED-NEXT: [[TMP2:%.*]] = getelementptr i32, ptr [[P:%.*]], i64 [[TMP1]] -; FIXED-NEXT: [[WIDE_VEC:%.*]] = load <24 x i32>, ptr [[TMP2]], align 4 +; FIXED-NEXT: [[I:%.*]] = phi i64 [ 0, [[ENTRY]] ], [ [[NEXTI:%.*]], [[LOOP]] ] +; FIXED-NEXT: [[TMP0:%.*]] = add i64 [[I]], 0 +; FIXED-NEXT: [[OFFSET0:%.*]] = mul i64 [[TMP0]], 3 +; FIXED-NEXT: [[Q0:%.*]] = getelementptr i32, ptr [[P:%.*]], i64 [[OFFSET0]] +; FIXED-NEXT: [[WIDE_VEC:%.*]] = load <24 x i32>, ptr [[Q0]], align 4 ; FIXED-NEXT: [[STRIDED_VEC:%.*]] = shufflevector <24 x i32> [[WIDE_VEC]], <24 x i32> poison, <8 x i32> ; FIXED-NEXT: [[STRIDED_VEC1:%.*]] = shufflevector <24 x i32> [[WIDE_VEC]], <24 x i32> poison, <8 x i32> ; FIXED-NEXT: [[STRIDED_VEC2:%.*]] = shufflevector <24 x i32> [[WIDE_VEC]], <24 x i32> poison, <8 x i32> -; FIXED-NEXT: [[TMP4:%.*]] = add <8 x i32> [[STRIDED_VEC]], splat (i32 1) -; FIXED-NEXT: [[TMP6:%.*]] = add <8 x i32> [[STRIDED_VEC1]], splat (i32 2) -; FIXED-NEXT: [[TMP9:%.*]] 
= add <8 x i32> [[STRIDED_VEC2]], splat (i32 3) -; FIXED-NEXT: [[TMP11:%.*]] = shufflevector <8 x i32> [[TMP4]], <8 x i32> [[TMP6]], <16 x i32> -; FIXED-NEXT: [[TMP12:%.*]] = shufflevector <8 x i32> [[TMP9]], <8 x i32> poison, <16 x i32> -; FIXED-NEXT: [[TMP13:%.*]] = shufflevector <16 x i32> [[TMP11]], <16 x i32> [[TMP12]], <24 x i32> -; FIXED-NEXT: [[INTERLEAVED_VEC:%.*]] = shufflevector <24 x i32> [[TMP13]], <24 x i32> poison, <24 x i32> -; FIXED-NEXT: store <24 x i32> [[INTERLEAVED_VEC]], ptr [[TMP2]], align 4 -; FIXED-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 8 -; FIXED-NEXT: [[TMP14:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1024 -; FIXED-NEXT: br i1 [[TMP14]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP6:![0-9]+]] +; FIXED-NEXT: [[TMP3:%.*]] = add <8 x i32> [[STRIDED_VEC]], splat (i32 1) +; FIXED-NEXT: [[TMP4:%.*]] = add <8 x i32> [[STRIDED_VEC1]], splat (i32 2) +; FIXED-NEXT: [[TMP5:%.*]] = add <8 x i32> [[STRIDED_VEC2]], splat (i32 3) +; FIXED-NEXT: [[TMP6:%.*]] = shufflevector <8 x i32> [[TMP3]], <8 x i32> [[TMP4]], <16 x i32> +; FIXED-NEXT: [[TMP7:%.*]] = shufflevector <8 x i32> [[TMP5]], <8 x i32> poison, <16 x i32> +; FIXED-NEXT: [[TMP8:%.*]] = shufflevector <16 x i32> [[TMP6]], <16 x i32> [[TMP7]], <24 x i32> +; FIXED-NEXT: [[INTERLEAVED_VEC:%.*]] = shufflevector <24 x i32> [[TMP8]], <24 x i32> poison, <24 x i32> +; FIXED-NEXT: store <24 x i32> [[INTERLEAVED_VEC]], ptr [[Q0]], align 4 +; FIXED-NEXT: [[NEXTI]] = add nuw i64 [[I]], 8 +; FIXED-NEXT: [[TMP9:%.*]] = icmp eq i64 [[NEXTI]], 1024 +; FIXED-NEXT: br i1 [[TMP9]], label [[MIDDLE_BLOCK:%.*]], label [[LOOP]], !llvm.loop [[LOOP6:![0-9]+]] ; FIXED: middle.block: ; FIXED-NEXT: br i1 true, label [[EXIT:%.*]], label [[SCALAR_PH]] ; FIXED: scalar.ph: -; FIXED-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 1024, [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ] -; FIXED-NEXT: br label [[LOOP:%.*]] +; FIXED-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 1024, [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY1:%.*]] ] +; FIXED-NEXT: br label [[LOOP1:%.*]] ; FIXED: loop: -; FIXED-NEXT: [[I:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[NEXTI:%.*]], [[LOOP]] ] -; FIXED-NEXT: [[OFFSET0:%.*]] = mul i64 [[I]], 3 -; FIXED-NEXT: [[Q0:%.*]] = getelementptr i32, ptr [[P]], i64 [[OFFSET0]] -; FIXED-NEXT: [[X0:%.*]] = load i32, ptr [[Q0]], align 4 +; FIXED-NEXT: [[I1:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[NEXTI1:%.*]], [[LOOP1]] ] +; FIXED-NEXT: [[OFFSET3:%.*]] = mul i64 [[I1]], 3 +; FIXED-NEXT: [[Q3:%.*]] = getelementptr i32, ptr [[P]], i64 [[OFFSET3]] +; FIXED-NEXT: [[X0:%.*]] = load i32, ptr [[Q3]], align 4 ; FIXED-NEXT: [[Y0:%.*]] = add i32 [[X0]], 1 -; FIXED-NEXT: store i32 [[Y0]], ptr [[Q0]], align 4 -; FIXED-NEXT: [[OFFSET1:%.*]] = add i64 [[OFFSET0]], 1 +; FIXED-NEXT: store i32 [[Y0]], ptr [[Q3]], align 4 +; FIXED-NEXT: [[OFFSET1:%.*]] = add i64 [[OFFSET3]], 1 ; FIXED-NEXT: [[Q1:%.*]] = getelementptr i32, ptr [[P]], i64 [[OFFSET1]] ; FIXED-NEXT: [[X1:%.*]] = load i32, ptr [[Q1]], align 4 ; FIXED-NEXT: [[Y1:%.*]] = add i32 [[X1]], 2 @@ -458,50 +458,50 @@ define void @load_store_factor3_i32(ptr %p) { ; FIXED-NEXT: [[X2:%.*]] = load i32, ptr [[Q2]], align 4 ; FIXED-NEXT: [[Y2:%.*]] = add i32 [[X2]], 3 ; FIXED-NEXT: store i32 [[Y2]], ptr [[Q2]], align 4 -; FIXED-NEXT: [[NEXTI]] = add i64 [[I]], 1 -; FIXED-NEXT: [[DONE:%.*]] = icmp eq i64 [[NEXTI]], 1024 -; FIXED-NEXT: br i1 [[DONE]], label [[EXIT]], label [[LOOP]], !llvm.loop [[LOOP7:![0-9]+]] +; FIXED-NEXT: [[NEXTI1]] = add i64 [[I1]], 1 +; FIXED-NEXT: [[DONE:%.*]] = icmp eq 
i64 [[NEXTI1]], 1024 +; FIXED-NEXT: br i1 [[DONE]], label [[EXIT]], label [[LOOP1]], !llvm.loop [[LOOP7:![0-9]+]] ; FIXED: exit: ; FIXED-NEXT: ret void ; ; SCALABLE-LABEL: @load_store_factor3_i32( ; SCALABLE-NEXT: entry: -; SCALABLE-NEXT: br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] +; SCALABLE-NEXT: br i1 false, label [[SCALAR_PH:%.*]], label [[ENTRY:%.*]] ; SCALABLE: vector.ph: -; SCALABLE-NEXT: br label [[VECTOR_BODY:%.*]] +; SCALABLE-NEXT: br label [[LOOP:%.*]] ; SCALABLE: vector.body: -; SCALABLE-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] -; SCALABLE-NEXT: [[TMP0:%.*]] = add i64 [[INDEX]], 0 -; SCALABLE-NEXT: [[TMP1:%.*]] = mul i64 [[TMP0]], 3 -; SCALABLE-NEXT: [[TMP2:%.*]] = getelementptr i32, ptr [[P:%.*]], i64 [[TMP1]] -; SCALABLE-NEXT: [[WIDE_VEC:%.*]] = load <24 x i32>, ptr [[TMP2]], align 4 +; SCALABLE-NEXT: [[I:%.*]] = phi i64 [ 0, [[ENTRY]] ], [ [[NEXTI:%.*]], [[LOOP]] ] +; SCALABLE-NEXT: [[TMP0:%.*]] = add i64 [[I]], 0 +; SCALABLE-NEXT: [[OFFSET0:%.*]] = mul i64 [[TMP0]], 3 +; SCALABLE-NEXT: [[Q0:%.*]] = getelementptr i32, ptr [[P:%.*]], i64 [[OFFSET0]] +; SCALABLE-NEXT: [[WIDE_VEC:%.*]] = load <24 x i32>, ptr [[Q0]], align 4 ; SCALABLE-NEXT: [[STRIDED_VEC:%.*]] = shufflevector <24 x i32> [[WIDE_VEC]], <24 x i32> poison, <8 x i32> ; SCALABLE-NEXT: [[STRIDED_VEC1:%.*]] = shufflevector <24 x i32> [[WIDE_VEC]], <24 x i32> poison, <8 x i32> ; SCALABLE-NEXT: [[STRIDED_VEC2:%.*]] = shufflevector <24 x i32> [[WIDE_VEC]], <24 x i32> poison, <8 x i32> -; SCALABLE-NEXT: [[TMP4:%.*]] = add <8 x i32> [[STRIDED_VEC]], splat (i32 1) -; SCALABLE-NEXT: [[TMP6:%.*]] = add <8 x i32> [[STRIDED_VEC1]], splat (i32 2) -; SCALABLE-NEXT: [[TMP9:%.*]] = add <8 x i32> [[STRIDED_VEC2]], splat (i32 3) -; SCALABLE-NEXT: [[TMP11:%.*]] = shufflevector <8 x i32> [[TMP4]], <8 x i32> [[TMP6]], <16 x i32> -; SCALABLE-NEXT: [[TMP12:%.*]] = shufflevector <8 x i32> [[TMP9]], <8 x i32> poison, <16 x i32> -; SCALABLE-NEXT: [[TMP13:%.*]] = shufflevector <16 x i32> [[TMP11]], <16 x i32> [[TMP12]], <24 x i32> -; SCALABLE-NEXT: [[INTERLEAVED_VEC:%.*]] = shufflevector <24 x i32> [[TMP13]], <24 x i32> poison, <24 x i32> -; SCALABLE-NEXT: store <24 x i32> [[INTERLEAVED_VEC]], ptr [[TMP2]], align 4 -; SCALABLE-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 8 -; SCALABLE-NEXT: [[TMP14:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1024 -; SCALABLE-NEXT: br i1 [[TMP14]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP6:![0-9]+]] +; SCALABLE-NEXT: [[TMP3:%.*]] = add <8 x i32> [[STRIDED_VEC]], splat (i32 1) +; SCALABLE-NEXT: [[TMP4:%.*]] = add <8 x i32> [[STRIDED_VEC1]], splat (i32 2) +; SCALABLE-NEXT: [[TMP5:%.*]] = add <8 x i32> [[STRIDED_VEC2]], splat (i32 3) +; SCALABLE-NEXT: [[TMP6:%.*]] = shufflevector <8 x i32> [[TMP3]], <8 x i32> [[TMP4]], <16 x i32> +; SCALABLE-NEXT: [[TMP7:%.*]] = shufflevector <8 x i32> [[TMP5]], <8 x i32> poison, <16 x i32> +; SCALABLE-NEXT: [[TMP8:%.*]] = shufflevector <16 x i32> [[TMP6]], <16 x i32> [[TMP7]], <24 x i32> +; SCALABLE-NEXT: [[INTERLEAVED_VEC:%.*]] = shufflevector <24 x i32> [[TMP8]], <24 x i32> poison, <24 x i32> +; SCALABLE-NEXT: store <24 x i32> [[INTERLEAVED_VEC]], ptr [[Q0]], align 4 +; SCALABLE-NEXT: [[NEXTI]] = add nuw i64 [[I]], 8 +; SCALABLE-NEXT: [[TMP9:%.*]] = icmp eq i64 [[NEXTI]], 1024 +; SCALABLE-NEXT: br i1 [[TMP9]], label [[MIDDLE_BLOCK:%.*]], label [[LOOP]], !llvm.loop [[LOOP6:![0-9]+]] ; SCALABLE: middle.block: ; SCALABLE-NEXT: br i1 true, label [[EXIT:%.*]], label [[SCALAR_PH]] ; SCALABLE: 
scalar.ph: -; SCALABLE-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 1024, [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ] -; SCALABLE-NEXT: br label [[LOOP:%.*]] +; SCALABLE-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 1024, [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY1:%.*]] ] +; SCALABLE-NEXT: br label [[LOOP1:%.*]] ; SCALABLE: loop: -; SCALABLE-NEXT: [[I:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[NEXTI:%.*]], [[LOOP]] ] -; SCALABLE-NEXT: [[OFFSET0:%.*]] = mul i64 [[I]], 3 -; SCALABLE-NEXT: [[Q0:%.*]] = getelementptr i32, ptr [[P]], i64 [[OFFSET0]] -; SCALABLE-NEXT: [[X0:%.*]] = load i32, ptr [[Q0]], align 4 +; SCALABLE-NEXT: [[I1:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[NEXTI1:%.*]], [[LOOP1]] ] +; SCALABLE-NEXT: [[OFFSET3:%.*]] = mul i64 [[I1]], 3 +; SCALABLE-NEXT: [[Q3:%.*]] = getelementptr i32, ptr [[P]], i64 [[OFFSET3]] +; SCALABLE-NEXT: [[X0:%.*]] = load i32, ptr [[Q3]], align 4 ; SCALABLE-NEXT: [[Y0:%.*]] = add i32 [[X0]], 1 -; SCALABLE-NEXT: store i32 [[Y0]], ptr [[Q0]], align 4 -; SCALABLE-NEXT: [[OFFSET1:%.*]] = add i64 [[OFFSET0]], 1 +; SCALABLE-NEXT: store i32 [[Y0]], ptr [[Q3]], align 4 +; SCALABLE-NEXT: [[OFFSET1:%.*]] = add i64 [[OFFSET3]], 1 ; SCALABLE-NEXT: [[Q1:%.*]] = getelementptr i32, ptr [[P]], i64 [[OFFSET1]] ; SCALABLE-NEXT: [[X1:%.*]] = load i32, ptr [[Q1]], align 4 ; SCALABLE-NEXT: [[Y1:%.*]] = add i32 [[X1]], 2 @@ -511,9 +511,9 @@ define void @load_store_factor3_i32(ptr %p) { ; SCALABLE-NEXT: [[X2:%.*]] = load i32, ptr [[Q2]], align 4 ; SCALABLE-NEXT: [[Y2:%.*]] = add i32 [[X2]], 3 ; SCALABLE-NEXT: store i32 [[Y2]], ptr [[Q2]], align 4 -; SCALABLE-NEXT: [[NEXTI]] = add i64 [[I]], 1 -; SCALABLE-NEXT: [[DONE:%.*]] = icmp eq i64 [[NEXTI]], 1024 -; SCALABLE-NEXT: br i1 [[DONE]], label [[EXIT]], label [[LOOP]], !llvm.loop [[LOOP7:![0-9]+]] +; SCALABLE-NEXT: [[NEXTI1]] = add i64 [[I1]], 1 +; SCALABLE-NEXT: [[DONE:%.*]] = icmp eq i64 [[NEXTI1]], 1024 +; SCALABLE-NEXT: br i1 [[DONE]], label [[EXIT]], label [[LOOP1]], !llvm.loop [[LOOP7:![0-9]+]] ; SCALABLE: exit: ; SCALABLE-NEXT: ret void ; @@ -550,42 +550,42 @@ exit: define void @load_store_factor3_i64(ptr %p) { ; CHECK-LABEL: @load_store_factor3_i64( ; CHECK-NEXT: entry: -; CHECK-NEXT: br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] +; CHECK-NEXT: br i1 false, label [[SCALAR_PH:%.*]], label [[ENTRY:%.*]] ; CHECK: vector.ph: -; CHECK-NEXT: br label [[VECTOR_BODY:%.*]] +; CHECK-NEXT: br label [[LOOP:%.*]] ; CHECK: vector.body: -; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] -; CHECK-NEXT: [[TMP0:%.*]] = add i64 [[INDEX]], 0 -; CHECK-NEXT: [[TMP1:%.*]] = mul i64 [[TMP0]], 3 -; CHECK-NEXT: [[TMP2:%.*]] = getelementptr i64, ptr [[P:%.*]], i64 [[TMP1]] -; CHECK-NEXT: [[WIDE_VEC:%.*]] = load <12 x i64>, ptr [[TMP2]], align 8 +; CHECK-NEXT: [[I:%.*]] = phi i64 [ 0, [[ENTRY]] ], [ [[NEXTI:%.*]], [[LOOP]] ] +; CHECK-NEXT: [[TMP0:%.*]] = add i64 [[I]], 0 +; CHECK-NEXT: [[OFFSET0:%.*]] = mul i64 [[TMP0]], 3 +; CHECK-NEXT: [[Q0:%.*]] = getelementptr i64, ptr [[P:%.*]], i64 [[OFFSET0]] +; CHECK-NEXT: [[WIDE_VEC:%.*]] = load <12 x i64>, ptr [[Q0]], align 8 ; CHECK-NEXT: [[STRIDED_VEC:%.*]] = shufflevector <12 x i64> [[WIDE_VEC]], <12 x i64> poison, <4 x i32> ; CHECK-NEXT: [[STRIDED_VEC1:%.*]] = shufflevector <12 x i64> [[WIDE_VEC]], <12 x i64> poison, <4 x i32> ; CHECK-NEXT: [[STRIDED_VEC2:%.*]] = shufflevector <12 x i64> [[WIDE_VEC]], <12 x i64> poison, <4 x i32> -; CHECK-NEXT: [[TMP4:%.*]] = add <4 x i64> [[STRIDED_VEC]], splat (i64 1) -; CHECK-NEXT: 
[[TMP6:%.*]] = add <4 x i64> [[STRIDED_VEC1]], splat (i64 2) -; CHECK-NEXT: [[TMP9:%.*]] = add <4 x i64> [[STRIDED_VEC2]], splat (i64 3) -; CHECK-NEXT: [[TMP11:%.*]] = shufflevector <4 x i64> [[TMP4]], <4 x i64> [[TMP6]], <8 x i32> -; CHECK-NEXT: [[TMP12:%.*]] = shufflevector <4 x i64> [[TMP9]], <4 x i64> poison, <8 x i32> -; CHECK-NEXT: [[TMP13:%.*]] = shufflevector <8 x i64> [[TMP11]], <8 x i64> [[TMP12]], <12 x i32> -; CHECK-NEXT: [[INTERLEAVED_VEC:%.*]] = shufflevector <12 x i64> [[TMP13]], <12 x i64> poison, <12 x i32> -; CHECK-NEXT: store <12 x i64> [[INTERLEAVED_VEC]], ptr [[TMP2]], align 8 -; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4 -; CHECK-NEXT: [[TMP14:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1024 -; CHECK-NEXT: br i1 [[TMP14]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP8:![0-9]+]] +; CHECK-NEXT: [[TMP3:%.*]] = add <4 x i64> [[STRIDED_VEC]], splat (i64 1) +; CHECK-NEXT: [[TMP4:%.*]] = add <4 x i64> [[STRIDED_VEC1]], splat (i64 2) +; CHECK-NEXT: [[TMP5:%.*]] = add <4 x i64> [[STRIDED_VEC2]], splat (i64 3) +; CHECK-NEXT: [[TMP6:%.*]] = shufflevector <4 x i64> [[TMP3]], <4 x i64> [[TMP4]], <8 x i32> +; CHECK-NEXT: [[TMP7:%.*]] = shufflevector <4 x i64> [[TMP5]], <4 x i64> poison, <8 x i32> +; CHECK-NEXT: [[TMP8:%.*]] = shufflevector <8 x i64> [[TMP6]], <8 x i64> [[TMP7]], <12 x i32> +; CHECK-NEXT: [[INTERLEAVED_VEC:%.*]] = shufflevector <12 x i64> [[TMP8]], <12 x i64> poison, <12 x i32> +; CHECK-NEXT: store <12 x i64> [[INTERLEAVED_VEC]], ptr [[Q0]], align 8 +; CHECK-NEXT: [[NEXTI]] = add nuw i64 [[I]], 4 +; CHECK-NEXT: [[TMP9:%.*]] = icmp eq i64 [[NEXTI]], 1024 +; CHECK-NEXT: br i1 [[TMP9]], label [[MIDDLE_BLOCK:%.*]], label [[LOOP]], !llvm.loop [[LOOP8:![0-9]+]] ; CHECK: middle.block: ; CHECK-NEXT: br i1 true, label [[EXIT:%.*]], label [[SCALAR_PH]] ; CHECK: scalar.ph: -; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 1024, [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ] -; CHECK-NEXT: br label [[LOOP:%.*]] +; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 1024, [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY1:%.*]] ] +; CHECK-NEXT: br label [[LOOP1:%.*]] ; CHECK: loop: -; CHECK-NEXT: [[I:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[NEXTI:%.*]], [[LOOP]] ] -; CHECK-NEXT: [[OFFSET0:%.*]] = mul i64 [[I]], 3 -; CHECK-NEXT: [[Q0:%.*]] = getelementptr i64, ptr [[P]], i64 [[OFFSET0]] -; CHECK-NEXT: [[X0:%.*]] = load i64, ptr [[Q0]], align 8 +; CHECK-NEXT: [[I1:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[NEXTI1:%.*]], [[LOOP1]] ] +; CHECK-NEXT: [[OFFSET3:%.*]] = mul i64 [[I1]], 3 +; CHECK-NEXT: [[Q3:%.*]] = getelementptr i64, ptr [[P]], i64 [[OFFSET3]] +; CHECK-NEXT: [[X0:%.*]] = load i64, ptr [[Q3]], align 8 ; CHECK-NEXT: [[Y0:%.*]] = add i64 [[X0]], 1 -; CHECK-NEXT: store i64 [[Y0]], ptr [[Q0]], align 8 -; CHECK-NEXT: [[OFFSET1:%.*]] = add i64 [[OFFSET0]], 1 +; CHECK-NEXT: store i64 [[Y0]], ptr [[Q3]], align 8 +; CHECK-NEXT: [[OFFSET1:%.*]] = add i64 [[OFFSET3]], 1 ; CHECK-NEXT: [[Q1:%.*]] = getelementptr i64, ptr [[P]], i64 [[OFFSET1]] ; CHECK-NEXT: [[X1:%.*]] = load i64, ptr [[Q1]], align 8 ; CHECK-NEXT: [[Y1:%.*]] = add i64 [[X1]], 2 @@ -595,50 +595,50 @@ define void @load_store_factor3_i64(ptr %p) { ; CHECK-NEXT: [[X2:%.*]] = load i64, ptr [[Q2]], align 8 ; CHECK-NEXT: [[Y2:%.*]] = add i64 [[X2]], 3 ; CHECK-NEXT: store i64 [[Y2]], ptr [[Q2]], align 8 -; CHECK-NEXT: [[NEXTI]] = add i64 [[I]], 1 -; CHECK-NEXT: [[DONE:%.*]] = icmp eq i64 [[NEXTI]], 1024 -; CHECK-NEXT: br i1 [[DONE]], label [[EXIT]], label [[LOOP]], !llvm.loop [[LOOP9:![0-9]+]] +; 
CHECK-NEXT: [[NEXTI1]] = add i64 [[I1]], 1 +; CHECK-NEXT: [[DONE:%.*]] = icmp eq i64 [[NEXTI1]], 1024 +; CHECK-NEXT: br i1 [[DONE]], label [[EXIT]], label [[LOOP1]], !llvm.loop [[LOOP9:![0-9]+]] ; CHECK: exit: ; CHECK-NEXT: ret void ; ; FIXED-LABEL: @load_store_factor3_i64( ; FIXED-NEXT: entry: -; FIXED-NEXT: br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] +; FIXED-NEXT: br i1 false, label [[SCALAR_PH:%.*]], label [[ENTRY:%.*]] ; FIXED: vector.ph: -; FIXED-NEXT: br label [[VECTOR_BODY:%.*]] +; FIXED-NEXT: br label [[LOOP:%.*]] ; FIXED: vector.body: -; FIXED-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] -; FIXED-NEXT: [[TMP0:%.*]] = add i64 [[INDEX]], 0 -; FIXED-NEXT: [[TMP1:%.*]] = mul i64 [[TMP0]], 3 -; FIXED-NEXT: [[TMP2:%.*]] = getelementptr i64, ptr [[P:%.*]], i64 [[TMP1]] -; FIXED-NEXT: [[WIDE_VEC:%.*]] = load <12 x i64>, ptr [[TMP2]], align 8 +; FIXED-NEXT: [[I:%.*]] = phi i64 [ 0, [[ENTRY]] ], [ [[NEXTI:%.*]], [[LOOP]] ] +; FIXED-NEXT: [[TMP0:%.*]] = add i64 [[I]], 0 +; FIXED-NEXT: [[OFFSET0:%.*]] = mul i64 [[TMP0]], 3 +; FIXED-NEXT: [[Q0:%.*]] = getelementptr i64, ptr [[P:%.*]], i64 [[OFFSET0]] +; FIXED-NEXT: [[WIDE_VEC:%.*]] = load <12 x i64>, ptr [[Q0]], align 8 ; FIXED-NEXT: [[STRIDED_VEC:%.*]] = shufflevector <12 x i64> [[WIDE_VEC]], <12 x i64> poison, <4 x i32> ; FIXED-NEXT: [[STRIDED_VEC1:%.*]] = shufflevector <12 x i64> [[WIDE_VEC]], <12 x i64> poison, <4 x i32> ; FIXED-NEXT: [[STRIDED_VEC2:%.*]] = shufflevector <12 x i64> [[WIDE_VEC]], <12 x i64> poison, <4 x i32> -; FIXED-NEXT: [[TMP4:%.*]] = add <4 x i64> [[STRIDED_VEC]], splat (i64 1) -; FIXED-NEXT: [[TMP6:%.*]] = add <4 x i64> [[STRIDED_VEC1]], splat (i64 2) -; FIXED-NEXT: [[TMP9:%.*]] = add <4 x i64> [[STRIDED_VEC2]], splat (i64 3) -; FIXED-NEXT: [[TMP11:%.*]] = shufflevector <4 x i64> [[TMP4]], <4 x i64> [[TMP6]], <8 x i32> -; FIXED-NEXT: [[TMP12:%.*]] = shufflevector <4 x i64> [[TMP9]], <4 x i64> poison, <8 x i32> -; FIXED-NEXT: [[TMP13:%.*]] = shufflevector <8 x i64> [[TMP11]], <8 x i64> [[TMP12]], <12 x i32> -; FIXED-NEXT: [[INTERLEAVED_VEC:%.*]] = shufflevector <12 x i64> [[TMP13]], <12 x i64> poison, <12 x i32> -; FIXED-NEXT: store <12 x i64> [[INTERLEAVED_VEC]], ptr [[TMP2]], align 8 -; FIXED-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4 -; FIXED-NEXT: [[TMP14:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1024 -; FIXED-NEXT: br i1 [[TMP14]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP8:![0-9]+]] +; FIXED-NEXT: [[TMP3:%.*]] = add <4 x i64> [[STRIDED_VEC]], splat (i64 1) +; FIXED-NEXT: [[TMP4:%.*]] = add <4 x i64> [[STRIDED_VEC1]], splat (i64 2) +; FIXED-NEXT: [[TMP5:%.*]] = add <4 x i64> [[STRIDED_VEC2]], splat (i64 3) +; FIXED-NEXT: [[TMP6:%.*]] = shufflevector <4 x i64> [[TMP3]], <4 x i64> [[TMP4]], <8 x i32> +; FIXED-NEXT: [[TMP7:%.*]] = shufflevector <4 x i64> [[TMP5]], <4 x i64> poison, <8 x i32> +; FIXED-NEXT: [[TMP8:%.*]] = shufflevector <8 x i64> [[TMP6]], <8 x i64> [[TMP7]], <12 x i32> +; FIXED-NEXT: [[INTERLEAVED_VEC:%.*]] = shufflevector <12 x i64> [[TMP8]], <12 x i64> poison, <12 x i32> +; FIXED-NEXT: store <12 x i64> [[INTERLEAVED_VEC]], ptr [[Q0]], align 8 +; FIXED-NEXT: [[NEXTI]] = add nuw i64 [[I]], 4 +; FIXED-NEXT: [[TMP9:%.*]] = icmp eq i64 [[NEXTI]], 1024 +; FIXED-NEXT: br i1 [[TMP9]], label [[MIDDLE_BLOCK:%.*]], label [[LOOP]], !llvm.loop [[LOOP8:![0-9]+]] ; FIXED: middle.block: ; FIXED-NEXT: br i1 true, label [[EXIT:%.*]], label [[SCALAR_PH]] ; FIXED: scalar.ph: -; FIXED-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 1024, 
[[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ] -; FIXED-NEXT: br label [[LOOP:%.*]] +; FIXED-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 1024, [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY1:%.*]] ] +; FIXED-NEXT: br label [[LOOP1:%.*]] ; FIXED: loop: -; FIXED-NEXT: [[I:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[NEXTI:%.*]], [[LOOP]] ] -; FIXED-NEXT: [[OFFSET0:%.*]] = mul i64 [[I]], 3 -; FIXED-NEXT: [[Q0:%.*]] = getelementptr i64, ptr [[P]], i64 [[OFFSET0]] -; FIXED-NEXT: [[X0:%.*]] = load i64, ptr [[Q0]], align 8 +; FIXED-NEXT: [[I1:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[NEXTI1:%.*]], [[LOOP1]] ] +; FIXED-NEXT: [[OFFSET3:%.*]] = mul i64 [[I1]], 3 +; FIXED-NEXT: [[Q3:%.*]] = getelementptr i64, ptr [[P]], i64 [[OFFSET3]] +; FIXED-NEXT: [[X0:%.*]] = load i64, ptr [[Q3]], align 8 ; FIXED-NEXT: [[Y0:%.*]] = add i64 [[X0]], 1 -; FIXED-NEXT: store i64 [[Y0]], ptr [[Q0]], align 8 -; FIXED-NEXT: [[OFFSET1:%.*]] = add i64 [[OFFSET0]], 1 +; FIXED-NEXT: store i64 [[Y0]], ptr [[Q3]], align 8 +; FIXED-NEXT: [[OFFSET1:%.*]] = add i64 [[OFFSET3]], 1 ; FIXED-NEXT: [[Q1:%.*]] = getelementptr i64, ptr [[P]], i64 [[OFFSET1]] ; FIXED-NEXT: [[X1:%.*]] = load i64, ptr [[Q1]], align 8 ; FIXED-NEXT: [[Y1:%.*]] = add i64 [[X1]], 2 @@ -648,50 +648,50 @@ define void @load_store_factor3_i64(ptr %p) { ; FIXED-NEXT: [[X2:%.*]] = load i64, ptr [[Q2]], align 8 ; FIXED-NEXT: [[Y2:%.*]] = add i64 [[X2]], 3 ; FIXED-NEXT: store i64 [[Y2]], ptr [[Q2]], align 8 -; FIXED-NEXT: [[NEXTI]] = add i64 [[I]], 1 -; FIXED-NEXT: [[DONE:%.*]] = icmp eq i64 [[NEXTI]], 1024 -; FIXED-NEXT: br i1 [[DONE]], label [[EXIT]], label [[LOOP]], !llvm.loop [[LOOP9:![0-9]+]] +; FIXED-NEXT: [[NEXTI1]] = add i64 [[I1]], 1 +; FIXED-NEXT: [[DONE:%.*]] = icmp eq i64 [[NEXTI1]], 1024 +; FIXED-NEXT: br i1 [[DONE]], label [[EXIT]], label [[LOOP1]], !llvm.loop [[LOOP9:![0-9]+]] ; FIXED: exit: ; FIXED-NEXT: ret void ; ; SCALABLE-LABEL: @load_store_factor3_i64( ; SCALABLE-NEXT: entry: -; SCALABLE-NEXT: br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] +; SCALABLE-NEXT: br i1 false, label [[SCALAR_PH:%.*]], label [[ENTRY:%.*]] ; SCALABLE: vector.ph: -; SCALABLE-NEXT: br label [[VECTOR_BODY:%.*]] +; SCALABLE-NEXT: br label [[LOOP:%.*]] ; SCALABLE: vector.body: -; SCALABLE-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] -; SCALABLE-NEXT: [[TMP0:%.*]] = add i64 [[INDEX]], 0 -; SCALABLE-NEXT: [[TMP1:%.*]] = mul i64 [[TMP0]], 3 -; SCALABLE-NEXT: [[TMP2:%.*]] = getelementptr i64, ptr [[P:%.*]], i64 [[TMP1]] -; SCALABLE-NEXT: [[WIDE_VEC:%.*]] = load <12 x i64>, ptr [[TMP2]], align 8 +; SCALABLE-NEXT: [[I:%.*]] = phi i64 [ 0, [[ENTRY]] ], [ [[NEXTI:%.*]], [[LOOP]] ] +; SCALABLE-NEXT: [[TMP0:%.*]] = add i64 [[I]], 0 +; SCALABLE-NEXT: [[OFFSET0:%.*]] = mul i64 [[TMP0]], 3 +; SCALABLE-NEXT: [[Q0:%.*]] = getelementptr i64, ptr [[P:%.*]], i64 [[OFFSET0]] +; SCALABLE-NEXT: [[WIDE_VEC:%.*]] = load <12 x i64>, ptr [[Q0]], align 8 ; SCALABLE-NEXT: [[STRIDED_VEC:%.*]] = shufflevector <12 x i64> [[WIDE_VEC]], <12 x i64> poison, <4 x i32> ; SCALABLE-NEXT: [[STRIDED_VEC1:%.*]] = shufflevector <12 x i64> [[WIDE_VEC]], <12 x i64> poison, <4 x i32> ; SCALABLE-NEXT: [[STRIDED_VEC2:%.*]] = shufflevector <12 x i64> [[WIDE_VEC]], <12 x i64> poison, <4 x i32> -; SCALABLE-NEXT: [[TMP4:%.*]] = add <4 x i64> [[STRIDED_VEC]], splat (i64 1) -; SCALABLE-NEXT: [[TMP6:%.*]] = add <4 x i64> [[STRIDED_VEC1]], splat (i64 2) -; SCALABLE-NEXT: [[TMP9:%.*]] = add <4 x i64> [[STRIDED_VEC2]], splat (i64 3) -; SCALABLE-NEXT: [[TMP11:%.*]] = 
shufflevector <4 x i64> [[TMP4]], <4 x i64> [[TMP6]], <8 x i32> -; SCALABLE-NEXT: [[TMP12:%.*]] = shufflevector <4 x i64> [[TMP9]], <4 x i64> poison, <8 x i32> -; SCALABLE-NEXT: [[TMP13:%.*]] = shufflevector <8 x i64> [[TMP11]], <8 x i64> [[TMP12]], <12 x i32> -; SCALABLE-NEXT: [[INTERLEAVED_VEC:%.*]] = shufflevector <12 x i64> [[TMP13]], <12 x i64> poison, <12 x i32> -; SCALABLE-NEXT: store <12 x i64> [[INTERLEAVED_VEC]], ptr [[TMP2]], align 8 -; SCALABLE-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4 -; SCALABLE-NEXT: [[TMP14:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1024 -; SCALABLE-NEXT: br i1 [[TMP14]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP8:![0-9]+]] +; SCALABLE-NEXT: [[TMP3:%.*]] = add <4 x i64> [[STRIDED_VEC]], splat (i64 1) +; SCALABLE-NEXT: [[TMP4:%.*]] = add <4 x i64> [[STRIDED_VEC1]], splat (i64 2) +; SCALABLE-NEXT: [[TMP5:%.*]] = add <4 x i64> [[STRIDED_VEC2]], splat (i64 3) +; SCALABLE-NEXT: [[TMP6:%.*]] = shufflevector <4 x i64> [[TMP3]], <4 x i64> [[TMP4]], <8 x i32> +; SCALABLE-NEXT: [[TMP7:%.*]] = shufflevector <4 x i64> [[TMP5]], <4 x i64> poison, <8 x i32> +; SCALABLE-NEXT: [[TMP8:%.*]] = shufflevector <8 x i64> [[TMP6]], <8 x i64> [[TMP7]], <12 x i32> +; SCALABLE-NEXT: [[INTERLEAVED_VEC:%.*]] = shufflevector <12 x i64> [[TMP8]], <12 x i64> poison, <12 x i32> +; SCALABLE-NEXT: store <12 x i64> [[INTERLEAVED_VEC]], ptr [[Q0]], align 8 +; SCALABLE-NEXT: [[NEXTI]] = add nuw i64 [[I]], 4 +; SCALABLE-NEXT: [[TMP9:%.*]] = icmp eq i64 [[NEXTI]], 1024 +; SCALABLE-NEXT: br i1 [[TMP9]], label [[MIDDLE_BLOCK:%.*]], label [[LOOP]], !llvm.loop [[LOOP8:![0-9]+]] ; SCALABLE: middle.block: ; SCALABLE-NEXT: br i1 true, label [[EXIT:%.*]], label [[SCALAR_PH]] ; SCALABLE: scalar.ph: -; SCALABLE-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 1024, [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ] -; SCALABLE-NEXT: br label [[LOOP:%.*]] +; SCALABLE-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 1024, [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY1:%.*]] ] +; SCALABLE-NEXT: br label [[LOOP1:%.*]] ; SCALABLE: loop: -; SCALABLE-NEXT: [[I:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[NEXTI:%.*]], [[LOOP]] ] -; SCALABLE-NEXT: [[OFFSET0:%.*]] = mul i64 [[I]], 3 -; SCALABLE-NEXT: [[Q0:%.*]] = getelementptr i64, ptr [[P]], i64 [[OFFSET0]] -; SCALABLE-NEXT: [[X0:%.*]] = load i64, ptr [[Q0]], align 8 +; SCALABLE-NEXT: [[I1:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[NEXTI1:%.*]], [[LOOP1]] ] +; SCALABLE-NEXT: [[OFFSET3:%.*]] = mul i64 [[I1]], 3 +; SCALABLE-NEXT: [[Q3:%.*]] = getelementptr i64, ptr [[P]], i64 [[OFFSET3]] +; SCALABLE-NEXT: [[X0:%.*]] = load i64, ptr [[Q3]], align 8 ; SCALABLE-NEXT: [[Y0:%.*]] = add i64 [[X0]], 1 -; SCALABLE-NEXT: store i64 [[Y0]], ptr [[Q0]], align 8 -; SCALABLE-NEXT: [[OFFSET1:%.*]] = add i64 [[OFFSET0]], 1 +; SCALABLE-NEXT: store i64 [[Y0]], ptr [[Q3]], align 8 +; SCALABLE-NEXT: [[OFFSET1:%.*]] = add i64 [[OFFSET3]], 1 ; SCALABLE-NEXT: [[Q1:%.*]] = getelementptr i64, ptr [[P]], i64 [[OFFSET1]] ; SCALABLE-NEXT: [[X1:%.*]] = load i64, ptr [[Q1]], align 8 ; SCALABLE-NEXT: [[Y1:%.*]] = add i64 [[X1]], 2 @@ -701,9 +701,9 @@ define void @load_store_factor3_i64(ptr %p) { ; SCALABLE-NEXT: [[X2:%.*]] = load i64, ptr [[Q2]], align 8 ; SCALABLE-NEXT: [[Y2:%.*]] = add i64 [[X2]], 3 ; SCALABLE-NEXT: store i64 [[Y2]], ptr [[Q2]], align 8 -; SCALABLE-NEXT: [[NEXTI]] = add i64 [[I]], 1 -; SCALABLE-NEXT: [[DONE:%.*]] = icmp eq i64 [[NEXTI]], 1024 -; SCALABLE-NEXT: br i1 [[DONE]], label [[EXIT]], label [[LOOP]], !llvm.loop [[LOOP9:![0-9]+]] +; SCALABLE-NEXT: [[NEXTI1]] 
= add i64 [[I1]], 1 +; SCALABLE-NEXT: [[DONE:%.*]] = icmp eq i64 [[NEXTI1]], 1024 +; SCALABLE-NEXT: br i1 [[DONE]], label [[EXIT]], label [[LOOP1]], !llvm.loop [[LOOP9:![0-9]+]] ; SCALABLE: exit: ; SCALABLE-NEXT: ret void ; @@ -740,56 +740,75 @@ exit: define void @load_store_factor8(ptr %p) { ; CHECK-LABEL: @load_store_factor8( ; CHECK-NEXT: entry: -; CHECK-NEXT: br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] +; CHECK-NEXT: [[TMP0:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 1024, [[TMP0]] +; CHECK-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[ENTRY:%.*]] ; CHECK: vector.ph: -; CHECK-NEXT: br label [[VECTOR_BODY:%.*]] +; CHECK-NEXT: [[TMP1:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-NEXT: [[N_MOD_VF:%.*]] = urem i64 1024, [[TMP1]] +; CHECK-NEXT: [[N_VEC:%.*]] = sub i64 1024, [[N_MOD_VF]] +; CHECK-NEXT: [[TMP2:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-NEXT: br label [[LOOP:%.*]] ; CHECK: vector.body: -; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] -; CHECK-NEXT: [[TMP0:%.*]] = add i64 [[INDEX]], 0 -; CHECK-NEXT: [[TMP1:%.*]] = shl i64 [[TMP0]], 3 -; CHECK-NEXT: [[TMP2:%.*]] = getelementptr i64, ptr [[P:%.*]], i64 [[TMP1]] -; CHECK-NEXT: [[WIDE_VEC:%.*]] = load <16 x i64>, ptr [[TMP2]], align 8 -; CHECK-NEXT: [[STRIDED_VEC:%.*]] = shufflevector <16 x i64> [[WIDE_VEC]], <16 x i64> poison, <2 x i32> -; CHECK-NEXT: [[STRIDED_VEC1:%.*]] = shufflevector <16 x i64> [[WIDE_VEC]], <16 x i64> poison, <2 x i32> -; CHECK-NEXT: [[STRIDED_VEC2:%.*]] = shufflevector <16 x i64> [[WIDE_VEC]], <16 x i64> poison, <2 x i32> -; CHECK-NEXT: [[STRIDED_VEC3:%.*]] = shufflevector <16 x i64> [[WIDE_VEC]], <16 x i64> poison, <2 x i32> -; CHECK-NEXT: [[STRIDED_VEC4:%.*]] = shufflevector <16 x i64> [[WIDE_VEC]], <16 x i64> poison, <2 x i32> -; CHECK-NEXT: [[STRIDED_VEC5:%.*]] = shufflevector <16 x i64> [[WIDE_VEC]], <16 x i64> poison, <2 x i32> -; CHECK-NEXT: [[STRIDED_VEC6:%.*]] = shufflevector <16 x i64> [[WIDE_VEC]], <16 x i64> poison, <2 x i32> -; CHECK-NEXT: [[STRIDED_VEC7:%.*]] = shufflevector <16 x i64> [[WIDE_VEC]], <16 x i64> poison, <2 x i32> -; CHECK-NEXT: [[TMP4:%.*]] = add <2 x i64> [[STRIDED_VEC]], splat (i64 1) -; CHECK-NEXT: [[TMP6:%.*]] = add <2 x i64> [[STRIDED_VEC1]], splat (i64 2) -; CHECK-NEXT: [[TMP8:%.*]] = add <2 x i64> [[STRIDED_VEC2]], splat (i64 3) -; CHECK-NEXT: [[TMP10:%.*]] = add <2 x i64> [[STRIDED_VEC3]], splat (i64 4) -; CHECK-NEXT: [[TMP12:%.*]] = add <2 x i64> [[STRIDED_VEC4]], splat (i64 5) -; CHECK-NEXT: [[TMP14:%.*]] = add <2 x i64> [[STRIDED_VEC5]], splat (i64 6) -; CHECK-NEXT: [[TMP16:%.*]] = add <2 x i64> [[STRIDED_VEC6]], splat (i64 7) -; CHECK-NEXT: [[TMP19:%.*]] = add <2 x i64> [[STRIDED_VEC7]], splat (i64 8) -; CHECK-NEXT: [[TMP21:%.*]] = shufflevector <2 x i64> [[TMP4]], <2 x i64> [[TMP6]], <4 x i32> -; CHECK-NEXT: [[TMP22:%.*]] = shufflevector <2 x i64> [[TMP8]], <2 x i64> [[TMP10]], <4 x i32> -; CHECK-NEXT: [[TMP23:%.*]] = shufflevector <2 x i64> [[TMP12]], <2 x i64> [[TMP14]], <4 x i32> -; CHECK-NEXT: [[TMP24:%.*]] = shufflevector <2 x i64> [[TMP16]], <2 x i64> [[TMP19]], <4 x i32> -; CHECK-NEXT: [[TMP25:%.*]] = shufflevector <4 x i64> [[TMP21]], <4 x i64> [[TMP22]], <8 x i32> -; CHECK-NEXT: [[TMP26:%.*]] = shufflevector <4 x i64> [[TMP23]], <4 x i64> [[TMP24]], <8 x i32> -; CHECK-NEXT: [[TMP27:%.*]] = shufflevector <8 x i64> [[TMP25]], <8 x i64> [[TMP26]], <16 x i32> -; CHECK-NEXT: [[INTERLEAVED_VEC:%.*]] = shufflevector <16 x i64> 
[[TMP27]], <16 x i64> poison, <16 x i32> -; CHECK-NEXT: store <16 x i64> [[INTERLEAVED_VEC]], ptr [[TMP2]], align 8 -; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 2 -; CHECK-NEXT: [[TMP28:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1024 -; CHECK-NEXT: br i1 [[TMP28]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP10:![0-9]+]] +; CHECK-NEXT: [[I:%.*]] = phi i64 [ 0, [[ENTRY]] ], [ [[NEXTI:%.*]], [[LOOP]] ] +; CHECK-NEXT: [[TMP3:%.*]] = add i64 [[I]], 0 +; CHECK-NEXT: [[OFFSET0:%.*]] = shl i64 [[TMP3]], 3 +; CHECK-NEXT: [[Q0:%.*]] = getelementptr i64, ptr [[P:%.*]], i64 [[OFFSET0]] +; CHECK-NEXT: [[WIDE_VEC:%.*]] = load , ptr [[Q0]], align 8 +; CHECK-NEXT: [[STRIDED_VEC:%.*]] = call { , } @llvm.vector.deinterleave2.nxv8i64( [[WIDE_VEC]]) +; CHECK-NEXT: [[TMP6:%.*]] = extractvalue { , } [[STRIDED_VEC]], 0 +; CHECK-NEXT: [[TMP7:%.*]] = extractvalue { , } [[STRIDED_VEC]], 1 +; CHECK-NEXT: [[STRIDED_VEC1:%.*]] = call { , } @llvm.vector.deinterleave2.nxv4i64( [[TMP6]]) +; CHECK-NEXT: [[STRIDED_VEC2:%.*]] = call { , } @llvm.vector.deinterleave2.nxv4i64( [[TMP7]]) +; CHECK-NEXT: [[TMP8:%.*]] = extractvalue { , } [[STRIDED_VEC1]], 0 +; CHECK-NEXT: [[TMP9:%.*]] = extractvalue { , } [[STRIDED_VEC2]], 0 +; CHECK-NEXT: [[TMP10:%.*]] = extractvalue { , } [[STRIDED_VEC1]], 1 +; CHECK-NEXT: [[TMP11:%.*]] = extractvalue { , } [[STRIDED_VEC2]], 1 +; CHECK-NEXT: [[STRIDED_VEC3:%.*]] = call { , } @llvm.vector.deinterleave2.nxv2i64( [[TMP8]]) +; CHECK-NEXT: [[STRIDED_VEC4:%.*]] = call { , } @llvm.vector.deinterleave2.nxv2i64( [[TMP9]]) +; CHECK-NEXT: [[STRIDED_VEC5:%.*]] = call { , } @llvm.vector.deinterleave2.nxv2i64( [[TMP10]]) +; CHECK-NEXT: [[STRIDED_VEC6:%.*]] = call { , } @llvm.vector.deinterleave2.nxv2i64( [[TMP11]]) +; CHECK-NEXT: [[TMP12:%.*]] = extractvalue { , } [[STRIDED_VEC3]], 0 +; CHECK-NEXT: [[TMP13:%.*]] = extractvalue { , } [[STRIDED_VEC4]], 0 +; CHECK-NEXT: [[TMP14:%.*]] = extractvalue { , } [[STRIDED_VEC5]], 0 +; CHECK-NEXT: [[TMP15:%.*]] = extractvalue { , } [[STRIDED_VEC6]], 0 +; CHECK-NEXT: [[TMP16:%.*]] = extractvalue { , } [[STRIDED_VEC3]], 1 +; CHECK-NEXT: [[TMP17:%.*]] = extractvalue { , } [[STRIDED_VEC4]], 1 +; CHECK-NEXT: [[TMP18:%.*]] = extractvalue { , } [[STRIDED_VEC5]], 1 +; CHECK-NEXT: [[TMP19:%.*]] = extractvalue { , } [[STRIDED_VEC6]], 1 +; CHECK-NEXT: [[TMP20:%.*]] = add [[TMP12]], splat (i64 1) +; CHECK-NEXT: [[TMP21:%.*]] = add [[TMP13]], splat (i64 2) +; CHECK-NEXT: [[TMP22:%.*]] = add [[TMP14]], splat (i64 3) +; CHECK-NEXT: [[TMP23:%.*]] = add [[TMP15]], splat (i64 4) +; CHECK-NEXT: [[TMP24:%.*]] = add [[TMP16]], splat (i64 5) +; CHECK-NEXT: [[TMP25:%.*]] = add [[TMP17]], splat (i64 6) +; CHECK-NEXT: [[TMP26:%.*]] = add [[TMP18]], splat (i64 7) +; CHECK-NEXT: [[TMP27:%.*]] = add [[TMP19]], splat (i64 8) +; CHECK-NEXT: [[INTERLEAVED_VEC:%.*]] = call @llvm.vector.interleave2.nxv2i64( [[TMP20]], [[TMP24]]) +; CHECK-NEXT: [[INTERLEAVED_VEC7:%.*]] = call @llvm.vector.interleave2.nxv2i64( [[TMP21]], [[TMP25]]) +; CHECK-NEXT: [[INTERLEAVED_VEC8:%.*]] = call @llvm.vector.interleave2.nxv2i64( [[TMP22]], [[TMP26]]) +; CHECK-NEXT: [[INTERLEAVED_VEC9:%.*]] = call @llvm.vector.interleave2.nxv2i64( [[TMP23]], [[TMP27]]) +; CHECK-NEXT: [[INTERLEAVED_VEC10:%.*]] = call @llvm.vector.interleave2.nxv4i64( [[INTERLEAVED_VEC]], [[INTERLEAVED_VEC8]]) +; CHECK-NEXT: [[INTERLEAVED_VEC11:%.*]] = call @llvm.vector.interleave2.nxv4i64( [[INTERLEAVED_VEC7]], [[INTERLEAVED_VEC9]]) +; CHECK-NEXT: [[INTERLEAVED_VEC12:%.*]] = call @llvm.vector.interleave2.nxv8i64( 
[[INTERLEAVED_VEC10]], [[INTERLEAVED_VEC11]]) +; CHECK-NEXT: store [[INTERLEAVED_VEC12]], ptr [[Q0]], align 8 +; CHECK-NEXT: [[NEXTI]] = add nuw i64 [[I]], [[TMP2]] +; CHECK-NEXT: [[TMP28:%.*]] = icmp eq i64 [[NEXTI]], [[N_VEC]] +; CHECK-NEXT: br i1 [[TMP28]], label [[MIDDLE_BLOCK:%.*]], label [[LOOP]], !llvm.loop [[LOOP10:![0-9]+]] ; CHECK: middle.block: -; CHECK-NEXT: br i1 true, label [[EXIT:%.*]], label [[SCALAR_PH]] +; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 1024, [[N_VEC]] +; CHECK-NEXT: br i1 [[CMP_N]], label [[EXIT:%.*]], label [[SCALAR_PH]] ; CHECK: scalar.ph: -; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 1024, [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ] -; CHECK-NEXT: br label [[LOOP:%.*]] +; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY1:%.*]] ] +; CHECK-NEXT: br label [[LOOP1:%.*]] ; CHECK: loop: -; CHECK-NEXT: [[I:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[NEXTI:%.*]], [[LOOP]] ] -; CHECK-NEXT: [[OFFSET0:%.*]] = shl i64 [[I]], 3 -; CHECK-NEXT: [[Q0:%.*]] = getelementptr i64, ptr [[P]], i64 [[OFFSET0]] -; CHECK-NEXT: [[X0:%.*]] = load i64, ptr [[Q0]], align 8 +; CHECK-NEXT: [[I1:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[NEXTI1:%.*]], [[LOOP1]] ] +; CHECK-NEXT: [[OFFSET8:%.*]] = shl i64 [[I1]], 3 +; CHECK-NEXT: [[Q8:%.*]] = getelementptr i64, ptr [[P]], i64 [[OFFSET8]] +; CHECK-NEXT: [[X0:%.*]] = load i64, ptr [[Q8]], align 8 ; CHECK-NEXT: [[Y0:%.*]] = add i64 [[X0]], 1 -; CHECK-NEXT: store i64 [[Y0]], ptr [[Q0]], align 8 -; CHECK-NEXT: [[OFFSET1:%.*]] = add i64 [[OFFSET0]], 1 +; CHECK-NEXT: store i64 [[Y0]], ptr [[Q8]], align 8 +; CHECK-NEXT: [[OFFSET1:%.*]] = add i64 [[OFFSET8]], 1 ; CHECK-NEXT: [[Q1:%.*]] = getelementptr i64, ptr [[P]], i64 [[OFFSET1]] ; CHECK-NEXT: [[X1:%.*]] = load i64, ptr [[Q1]], align 8 ; CHECK-NEXT: [[Y1:%.*]] = add i64 [[X1]], 2 @@ -824,23 +843,23 @@ define void @load_store_factor8(ptr %p) { ; CHECK-NEXT: [[X7:%.*]] = load i64, ptr [[Q7]], align 8 ; CHECK-NEXT: [[Y7:%.*]] = add i64 [[X7]], 8 ; CHECK-NEXT: store i64 [[Y7]], ptr [[Q7]], align 8 -; CHECK-NEXT: [[NEXTI]] = add i64 [[I]], 1 -; CHECK-NEXT: [[DONE:%.*]] = icmp eq i64 [[NEXTI]], 1024 -; CHECK-NEXT: br i1 [[DONE]], label [[EXIT]], label [[LOOP]], !llvm.loop [[LOOP11:![0-9]+]] +; CHECK-NEXT: [[NEXTI1]] = add i64 [[I1]], 1 +; CHECK-NEXT: [[DONE:%.*]] = icmp eq i64 [[NEXTI1]], 1024 +; CHECK-NEXT: br i1 [[DONE]], label [[EXIT]], label [[LOOP1]], !llvm.loop [[LOOP11:![0-9]+]] ; CHECK: exit: ; CHECK-NEXT: ret void ; ; FIXED-LABEL: @load_store_factor8( ; FIXED-NEXT: entry: -; FIXED-NEXT: br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] +; FIXED-NEXT: br i1 false, label [[SCALAR_PH:%.*]], label [[ENTRY:%.*]] ; FIXED: vector.ph: -; FIXED-NEXT: br label [[VECTOR_BODY:%.*]] +; FIXED-NEXT: br label [[LOOP:%.*]] ; FIXED: vector.body: -; FIXED-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] -; FIXED-NEXT: [[TMP0:%.*]] = add i64 [[INDEX]], 0 -; FIXED-NEXT: [[TMP1:%.*]] = shl i64 [[TMP0]], 3 -; FIXED-NEXT: [[TMP2:%.*]] = getelementptr i64, ptr [[P:%.*]], i64 [[TMP1]] -; FIXED-NEXT: [[WIDE_VEC:%.*]] = load <16 x i64>, ptr [[TMP2]], align 8 +; FIXED-NEXT: [[I:%.*]] = phi i64 [ 0, [[ENTRY]] ], [ [[NEXTI:%.*]], [[LOOP]] ] +; FIXED-NEXT: [[TMP0:%.*]] = add i64 [[I]], 0 +; FIXED-NEXT: [[OFFSET0:%.*]] = shl i64 [[TMP0]], 3 +; FIXED-NEXT: [[Q0:%.*]] = getelementptr i64, ptr [[P:%.*]], i64 [[OFFSET0]] +; FIXED-NEXT: [[WIDE_VEC:%.*]] = load <16 x i64>, ptr [[Q0]], align 8 ; FIXED-NEXT: 
[[STRIDED_VEC:%.*]] = shufflevector <16 x i64> [[WIDE_VEC]], <16 x i64> poison, <2 x i32> ; FIXED-NEXT: [[STRIDED_VEC1:%.*]] = shufflevector <16 x i64> [[WIDE_VEC]], <16 x i64> poison, <2 x i32> ; FIXED-NEXT: [[STRIDED_VEC2:%.*]] = shufflevector <16 x i64> [[WIDE_VEC]], <16 x i64> poison, <2 x i32> @@ -849,39 +868,39 @@ define void @load_store_factor8(ptr %p) { ; FIXED-NEXT: [[STRIDED_VEC5:%.*]] = shufflevector <16 x i64> [[WIDE_VEC]], <16 x i64> poison, <2 x i32> ; FIXED-NEXT: [[STRIDED_VEC6:%.*]] = shufflevector <16 x i64> [[WIDE_VEC]], <16 x i64> poison, <2 x i32> ; FIXED-NEXT: [[STRIDED_VEC7:%.*]] = shufflevector <16 x i64> [[WIDE_VEC]], <16 x i64> poison, <2 x i32> -; FIXED-NEXT: [[TMP4:%.*]] = add <2 x i64> [[STRIDED_VEC]], splat (i64 1) -; FIXED-NEXT: [[TMP6:%.*]] = add <2 x i64> [[STRIDED_VEC1]], splat (i64 2) -; FIXED-NEXT: [[TMP8:%.*]] = add <2 x i64> [[STRIDED_VEC2]], splat (i64 3) -; FIXED-NEXT: [[TMP10:%.*]] = add <2 x i64> [[STRIDED_VEC3]], splat (i64 4) -; FIXED-NEXT: [[TMP12:%.*]] = add <2 x i64> [[STRIDED_VEC4]], splat (i64 5) -; FIXED-NEXT: [[TMP14:%.*]] = add <2 x i64> [[STRIDED_VEC5]], splat (i64 6) -; FIXED-NEXT: [[TMP16:%.*]] = add <2 x i64> [[STRIDED_VEC6]], splat (i64 7) -; FIXED-NEXT: [[TMP19:%.*]] = add <2 x i64> [[STRIDED_VEC7]], splat (i64 8) -; FIXED-NEXT: [[TMP21:%.*]] = shufflevector <2 x i64> [[TMP4]], <2 x i64> [[TMP6]], <4 x i32> -; FIXED-NEXT: [[TMP22:%.*]] = shufflevector <2 x i64> [[TMP8]], <2 x i64> [[TMP10]], <4 x i32> -; FIXED-NEXT: [[TMP23:%.*]] = shufflevector <2 x i64> [[TMP12]], <2 x i64> [[TMP14]], <4 x i32> -; FIXED-NEXT: [[TMP24:%.*]] = shufflevector <2 x i64> [[TMP16]], <2 x i64> [[TMP19]], <4 x i32> -; FIXED-NEXT: [[TMP25:%.*]] = shufflevector <4 x i64> [[TMP21]], <4 x i64> [[TMP22]], <8 x i32> -; FIXED-NEXT: [[TMP26:%.*]] = shufflevector <4 x i64> [[TMP23]], <4 x i64> [[TMP24]], <8 x i32> -; FIXED-NEXT: [[TMP27:%.*]] = shufflevector <8 x i64> [[TMP25]], <8 x i64> [[TMP26]], <16 x i32> -; FIXED-NEXT: [[INTERLEAVED_VEC:%.*]] = shufflevector <16 x i64> [[TMP27]], <16 x i64> poison, <16 x i32> -; FIXED-NEXT: store <16 x i64> [[INTERLEAVED_VEC]], ptr [[TMP2]], align 8 -; FIXED-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 2 -; FIXED-NEXT: [[TMP28:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1024 -; FIXED-NEXT: br i1 [[TMP28]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP10:![0-9]+]] +; FIXED-NEXT: [[TMP3:%.*]] = add <2 x i64> [[STRIDED_VEC]], splat (i64 1) +; FIXED-NEXT: [[TMP4:%.*]] = add <2 x i64> [[STRIDED_VEC1]], splat (i64 2) +; FIXED-NEXT: [[TMP5:%.*]] = add <2 x i64> [[STRIDED_VEC2]], splat (i64 3) +; FIXED-NEXT: [[TMP6:%.*]] = add <2 x i64> [[STRIDED_VEC3]], splat (i64 4) +; FIXED-NEXT: [[TMP7:%.*]] = add <2 x i64> [[STRIDED_VEC4]], splat (i64 5) +; FIXED-NEXT: [[TMP8:%.*]] = add <2 x i64> [[STRIDED_VEC5]], splat (i64 6) +; FIXED-NEXT: [[TMP9:%.*]] = add <2 x i64> [[STRIDED_VEC6]], splat (i64 7) +; FIXED-NEXT: [[TMP10:%.*]] = add <2 x i64> [[STRIDED_VEC7]], splat (i64 8) +; FIXED-NEXT: [[TMP11:%.*]] = shufflevector <2 x i64> [[TMP3]], <2 x i64> [[TMP4]], <4 x i32> +; FIXED-NEXT: [[TMP12:%.*]] = shufflevector <2 x i64> [[TMP5]], <2 x i64> [[TMP6]], <4 x i32> +; FIXED-NEXT: [[TMP13:%.*]] = shufflevector <2 x i64> [[TMP7]], <2 x i64> [[TMP8]], <4 x i32> +; FIXED-NEXT: [[TMP14:%.*]] = shufflevector <2 x i64> [[TMP9]], <2 x i64> [[TMP10]], <4 x i32> +; FIXED-NEXT: [[TMP15:%.*]] = shufflevector <4 x i64> [[TMP11]], <4 x i64> [[TMP12]], <8 x i32> +; FIXED-NEXT: [[TMP16:%.*]] = shufflevector <4 x i64> [[TMP13]], <4 x i64> 
[[TMP14]], <8 x i32> +; FIXED-NEXT: [[TMP17:%.*]] = shufflevector <8 x i64> [[TMP15]], <8 x i64> [[TMP16]], <16 x i32> +; FIXED-NEXT: [[INTERLEAVED_VEC:%.*]] = shufflevector <16 x i64> [[TMP17]], <16 x i64> poison, <16 x i32> +; FIXED-NEXT: store <16 x i64> [[INTERLEAVED_VEC]], ptr [[Q0]], align 8 +; FIXED-NEXT: [[NEXTI]] = add nuw i64 [[I]], 2 +; FIXED-NEXT: [[TMP18:%.*]] = icmp eq i64 [[NEXTI]], 1024 +; FIXED-NEXT: br i1 [[TMP18]], label [[MIDDLE_BLOCK:%.*]], label [[LOOP]], !llvm.loop [[LOOP10:![0-9]+]] ; FIXED: middle.block: ; FIXED-NEXT: br i1 true, label [[EXIT:%.*]], label [[SCALAR_PH]] ; FIXED: scalar.ph: -; FIXED-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 1024, [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ] -; FIXED-NEXT: br label [[LOOP:%.*]] +; FIXED-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 1024, [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY1:%.*]] ] +; FIXED-NEXT: br label [[LOOP1:%.*]] ; FIXED: loop: -; FIXED-NEXT: [[I:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[NEXTI:%.*]], [[LOOP]] ] -; FIXED-NEXT: [[OFFSET0:%.*]] = shl i64 [[I]], 3 -; FIXED-NEXT: [[Q0:%.*]] = getelementptr i64, ptr [[P]], i64 [[OFFSET0]] -; FIXED-NEXT: [[X0:%.*]] = load i64, ptr [[Q0]], align 8 +; FIXED-NEXT: [[I1:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[NEXTI1:%.*]], [[LOOP1]] ] +; FIXED-NEXT: [[OFFSET8:%.*]] = shl i64 [[I1]], 3 +; FIXED-NEXT: [[Q8:%.*]] = getelementptr i64, ptr [[P]], i64 [[OFFSET8]] +; FIXED-NEXT: [[X0:%.*]] = load i64, ptr [[Q8]], align 8 ; FIXED-NEXT: [[Y0:%.*]] = add i64 [[X0]], 1 -; FIXED-NEXT: store i64 [[Y0]], ptr [[Q0]], align 8 -; FIXED-NEXT: [[OFFSET1:%.*]] = add i64 [[OFFSET0]], 1 +; FIXED-NEXT: store i64 [[Y0]], ptr [[Q8]], align 8 +; FIXED-NEXT: [[OFFSET1:%.*]] = add i64 [[OFFSET8]], 1 ; FIXED-NEXT: [[Q1:%.*]] = getelementptr i64, ptr [[P]], i64 [[OFFSET1]] ; FIXED-NEXT: [[X1:%.*]] = load i64, ptr [[Q1]], align 8 ; FIXED-NEXT: [[Y1:%.*]] = add i64 [[X1]], 2 @@ -916,64 +935,83 @@ define void @load_store_factor8(ptr %p) { ; FIXED-NEXT: [[X7:%.*]] = load i64, ptr [[Q7]], align 8 ; FIXED-NEXT: [[Y7:%.*]] = add i64 [[X7]], 8 ; FIXED-NEXT: store i64 [[Y7]], ptr [[Q7]], align 8 -; FIXED-NEXT: [[NEXTI]] = add i64 [[I]], 1 -; FIXED-NEXT: [[DONE:%.*]] = icmp eq i64 [[NEXTI]], 1024 -; FIXED-NEXT: br i1 [[DONE]], label [[EXIT]], label [[LOOP]], !llvm.loop [[LOOP11:![0-9]+]] +; FIXED-NEXT: [[NEXTI1]] = add i64 [[I1]], 1 +; FIXED-NEXT: [[DONE:%.*]] = icmp eq i64 [[NEXTI1]], 1024 +; FIXED-NEXT: br i1 [[DONE]], label [[EXIT]], label [[LOOP1]], !llvm.loop [[LOOP11:![0-9]+]] ; FIXED: exit: ; FIXED-NEXT: ret void ; ; SCALABLE-LABEL: @load_store_factor8( ; SCALABLE-NEXT: entry: -; SCALABLE-NEXT: br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] +; SCALABLE-NEXT: [[TMP0:%.*]] = call i64 @llvm.vscale.i64() +; SCALABLE-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 1024, [[TMP0]] +; SCALABLE-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[ENTRY:%.*]] ; SCALABLE: vector.ph: -; SCALABLE-NEXT: br label [[VECTOR_BODY:%.*]] +; SCALABLE-NEXT: [[TMP1:%.*]] = call i64 @llvm.vscale.i64() +; SCALABLE-NEXT: [[N_MOD_VF:%.*]] = urem i64 1024, [[TMP1]] +; SCALABLE-NEXT: [[N_VEC:%.*]] = sub i64 1024, [[N_MOD_VF]] +; SCALABLE-NEXT: [[TMP2:%.*]] = call i64 @llvm.vscale.i64() +; SCALABLE-NEXT: br label [[LOOP:%.*]] ; SCALABLE: vector.body: -; SCALABLE-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] -; SCALABLE-NEXT: [[TMP0:%.*]] = add i64 [[INDEX]], 0 -; SCALABLE-NEXT: [[TMP1:%.*]] = shl i64 [[TMP0]], 3 -; SCALABLE-NEXT: [[TMP2:%.*]] = 
getelementptr i64, ptr [[P:%.*]], i64 [[TMP1]] -; SCALABLE-NEXT: [[WIDE_VEC:%.*]] = load <16 x i64>, ptr [[TMP2]], align 8 -; SCALABLE-NEXT: [[STRIDED_VEC:%.*]] = shufflevector <16 x i64> [[WIDE_VEC]], <16 x i64> poison, <2 x i32> -; SCALABLE-NEXT: [[STRIDED_VEC1:%.*]] = shufflevector <16 x i64> [[WIDE_VEC]], <16 x i64> poison, <2 x i32> -; SCALABLE-NEXT: [[STRIDED_VEC2:%.*]] = shufflevector <16 x i64> [[WIDE_VEC]], <16 x i64> poison, <2 x i32> -; SCALABLE-NEXT: [[STRIDED_VEC3:%.*]] = shufflevector <16 x i64> [[WIDE_VEC]], <16 x i64> poison, <2 x i32> -; SCALABLE-NEXT: [[STRIDED_VEC4:%.*]] = shufflevector <16 x i64> [[WIDE_VEC]], <16 x i64> poison, <2 x i32> -; SCALABLE-NEXT: [[STRIDED_VEC5:%.*]] = shufflevector <16 x i64> [[WIDE_VEC]], <16 x i64> poison, <2 x i32> -; SCALABLE-NEXT: [[STRIDED_VEC6:%.*]] = shufflevector <16 x i64> [[WIDE_VEC]], <16 x i64> poison, <2 x i32> -; SCALABLE-NEXT: [[STRIDED_VEC7:%.*]] = shufflevector <16 x i64> [[WIDE_VEC]], <16 x i64> poison, <2 x i32> -; SCALABLE-NEXT: [[TMP4:%.*]] = add <2 x i64> [[STRIDED_VEC]], splat (i64 1) -; SCALABLE-NEXT: [[TMP6:%.*]] = add <2 x i64> [[STRIDED_VEC1]], splat (i64 2) -; SCALABLE-NEXT: [[TMP8:%.*]] = add <2 x i64> [[STRIDED_VEC2]], splat (i64 3) -; SCALABLE-NEXT: [[TMP10:%.*]] = add <2 x i64> [[STRIDED_VEC3]], splat (i64 4) -; SCALABLE-NEXT: [[TMP12:%.*]] = add <2 x i64> [[STRIDED_VEC4]], splat (i64 5) -; SCALABLE-NEXT: [[TMP14:%.*]] = add <2 x i64> [[STRIDED_VEC5]], splat (i64 6) -; SCALABLE-NEXT: [[TMP16:%.*]] = add <2 x i64> [[STRIDED_VEC6]], splat (i64 7) -; SCALABLE-NEXT: [[TMP19:%.*]] = add <2 x i64> [[STRIDED_VEC7]], splat (i64 8) -; SCALABLE-NEXT: [[TMP21:%.*]] = shufflevector <2 x i64> [[TMP4]], <2 x i64> [[TMP6]], <4 x i32> -; SCALABLE-NEXT: [[TMP22:%.*]] = shufflevector <2 x i64> [[TMP8]], <2 x i64> [[TMP10]], <4 x i32> -; SCALABLE-NEXT: [[TMP23:%.*]] = shufflevector <2 x i64> [[TMP12]], <2 x i64> [[TMP14]], <4 x i32> -; SCALABLE-NEXT: [[TMP24:%.*]] = shufflevector <2 x i64> [[TMP16]], <2 x i64> [[TMP19]], <4 x i32> -; SCALABLE-NEXT: [[TMP25:%.*]] = shufflevector <4 x i64> [[TMP21]], <4 x i64> [[TMP22]], <8 x i32> -; SCALABLE-NEXT: [[TMP26:%.*]] = shufflevector <4 x i64> [[TMP23]], <4 x i64> [[TMP24]], <8 x i32> -; SCALABLE-NEXT: [[TMP27:%.*]] = shufflevector <8 x i64> [[TMP25]], <8 x i64> [[TMP26]], <16 x i32> -; SCALABLE-NEXT: [[INTERLEAVED_VEC:%.*]] = shufflevector <16 x i64> [[TMP27]], <16 x i64> poison, <16 x i32> -; SCALABLE-NEXT: store <16 x i64> [[INTERLEAVED_VEC]], ptr [[TMP2]], align 8 -; SCALABLE-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 2 -; SCALABLE-NEXT: [[TMP28:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1024 -; SCALABLE-NEXT: br i1 [[TMP28]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP10:![0-9]+]] +; SCALABLE-NEXT: [[I:%.*]] = phi i64 [ 0, [[ENTRY]] ], [ [[NEXTI:%.*]], [[LOOP]] ] +; SCALABLE-NEXT: [[TMP3:%.*]] = add i64 [[I]], 0 +; SCALABLE-NEXT: [[OFFSET0:%.*]] = shl i64 [[TMP3]], 3 +; SCALABLE-NEXT: [[Q0:%.*]] = getelementptr i64, ptr [[P:%.*]], i64 [[OFFSET0]] +; SCALABLE-NEXT: [[WIDE_VEC:%.*]] = load , ptr [[Q0]], align 8 +; SCALABLE-NEXT: [[STRIDED_VEC:%.*]] = call { , } @llvm.vector.deinterleave2.nxv8i64( [[WIDE_VEC]]) +; SCALABLE-NEXT: [[TMP6:%.*]] = extractvalue { , } [[STRIDED_VEC]], 0 +; SCALABLE-NEXT: [[TMP7:%.*]] = extractvalue { , } [[STRIDED_VEC]], 1 +; SCALABLE-NEXT: [[STRIDED_VEC1:%.*]] = call { , } @llvm.vector.deinterleave2.nxv4i64( [[TMP6]]) +; SCALABLE-NEXT: [[STRIDED_VEC2:%.*]] = call { , } @llvm.vector.deinterleave2.nxv4i64( [[TMP7]]) +; SCALABLE-NEXT: 
[[TMP8:%.*]] = extractvalue { , } [[STRIDED_VEC1]], 0 +; SCALABLE-NEXT: [[TMP9:%.*]] = extractvalue { , } [[STRIDED_VEC2]], 0 +; SCALABLE-NEXT: [[TMP10:%.*]] = extractvalue { , } [[STRIDED_VEC1]], 1 +; SCALABLE-NEXT: [[TMP11:%.*]] = extractvalue { , } [[STRIDED_VEC2]], 1 +; SCALABLE-NEXT: [[STRIDED_VEC3:%.*]] = call { , } @llvm.vector.deinterleave2.nxv2i64( [[TMP8]]) +; SCALABLE-NEXT: [[STRIDED_VEC4:%.*]] = call { , } @llvm.vector.deinterleave2.nxv2i64( [[TMP9]]) +; SCALABLE-NEXT: [[STRIDED_VEC5:%.*]] = call { , } @llvm.vector.deinterleave2.nxv2i64( [[TMP10]]) +; SCALABLE-NEXT: [[STRIDED_VEC6:%.*]] = call { , } @llvm.vector.deinterleave2.nxv2i64( [[TMP11]]) +; SCALABLE-NEXT: [[TMP12:%.*]] = extractvalue { , } [[STRIDED_VEC3]], 0 +; SCALABLE-NEXT: [[TMP13:%.*]] = extractvalue { , } [[STRIDED_VEC4]], 0 +; SCALABLE-NEXT: [[TMP14:%.*]] = extractvalue { , } [[STRIDED_VEC5]], 0 +; SCALABLE-NEXT: [[TMP15:%.*]] = extractvalue { , } [[STRIDED_VEC6]], 0 +; SCALABLE-NEXT: [[TMP16:%.*]] = extractvalue { , } [[STRIDED_VEC3]], 1 +; SCALABLE-NEXT: [[TMP17:%.*]] = extractvalue { , } [[STRIDED_VEC4]], 1 +; SCALABLE-NEXT: [[TMP18:%.*]] = extractvalue { , } [[STRIDED_VEC5]], 1 +; SCALABLE-NEXT: [[TMP19:%.*]] = extractvalue { , } [[STRIDED_VEC6]], 1 +; SCALABLE-NEXT: [[TMP20:%.*]] = add [[TMP12]], splat (i64 1) +; SCALABLE-NEXT: [[TMP21:%.*]] = add [[TMP13]], splat (i64 2) +; SCALABLE-NEXT: [[TMP22:%.*]] = add [[TMP14]], splat (i64 3) +; SCALABLE-NEXT: [[TMP23:%.*]] = add [[TMP15]], splat (i64 4) +; SCALABLE-NEXT: [[TMP24:%.*]] = add [[TMP16]], splat (i64 5) +; SCALABLE-NEXT: [[TMP25:%.*]] = add [[TMP17]], splat (i64 6) +; SCALABLE-NEXT: [[TMP26:%.*]] = add [[TMP18]], splat (i64 7) +; SCALABLE-NEXT: [[TMP27:%.*]] = add [[TMP19]], splat (i64 8) +; SCALABLE-NEXT: [[INTERLEAVED_VEC:%.*]] = call @llvm.vector.interleave2.nxv2i64( [[TMP20]], [[TMP24]]) +; SCALABLE-NEXT: [[INTERLEAVED_VEC7:%.*]] = call @llvm.vector.interleave2.nxv2i64( [[TMP21]], [[TMP25]]) +; SCALABLE-NEXT: [[INTERLEAVED_VEC8:%.*]] = call @llvm.vector.interleave2.nxv2i64( [[TMP22]], [[TMP26]]) +; SCALABLE-NEXT: [[INTERLEAVED_VEC9:%.*]] = call @llvm.vector.interleave2.nxv2i64( [[TMP23]], [[TMP27]]) +; SCALABLE-NEXT: [[INTERLEAVED_VEC10:%.*]] = call @llvm.vector.interleave2.nxv4i64( [[INTERLEAVED_VEC]], [[INTERLEAVED_VEC8]]) +; SCALABLE-NEXT: [[INTERLEAVED_VEC11:%.*]] = call @llvm.vector.interleave2.nxv4i64( [[INTERLEAVED_VEC7]], [[INTERLEAVED_VEC9]]) +; SCALABLE-NEXT: [[INTERLEAVED_VEC12:%.*]] = call @llvm.vector.interleave2.nxv8i64( [[INTERLEAVED_VEC10]], [[INTERLEAVED_VEC11]]) +; SCALABLE-NEXT: store [[INTERLEAVED_VEC12]], ptr [[Q0]], align 8 +; SCALABLE-NEXT: [[NEXTI]] = add nuw i64 [[I]], [[TMP2]] +; SCALABLE-NEXT: [[TMP28:%.*]] = icmp eq i64 [[NEXTI]], [[N_VEC]] +; SCALABLE-NEXT: br i1 [[TMP28]], label [[MIDDLE_BLOCK:%.*]], label [[LOOP]], !llvm.loop [[LOOP10:![0-9]+]] ; SCALABLE: middle.block: -; SCALABLE-NEXT: br i1 true, label [[EXIT:%.*]], label [[SCALAR_PH]] +; SCALABLE-NEXT: [[CMP_N:%.*]] = icmp eq i64 1024, [[N_VEC]] +; SCALABLE-NEXT: br i1 [[CMP_N]], label [[EXIT:%.*]], label [[SCALAR_PH]] ; SCALABLE: scalar.ph: -; SCALABLE-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 1024, [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ] -; SCALABLE-NEXT: br label [[LOOP:%.*]] +; SCALABLE-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY1:%.*]] ] +; SCALABLE-NEXT: br label [[LOOP1:%.*]] ; SCALABLE: loop: -; SCALABLE-NEXT: [[I:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[NEXTI:%.*]], [[LOOP]] ] -; SCALABLE-NEXT: 
[[OFFSET0:%.*]] = shl i64 [[I]], 3 -; SCALABLE-NEXT: [[Q0:%.*]] = getelementptr i64, ptr [[P]], i64 [[OFFSET0]] -; SCALABLE-NEXT: [[X0:%.*]] = load i64, ptr [[Q0]], align 8 +; SCALABLE-NEXT: [[I1:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[NEXTI1:%.*]], [[LOOP1]] ] +; SCALABLE-NEXT: [[OFFSET8:%.*]] = shl i64 [[I1]], 3 +; SCALABLE-NEXT: [[Q8:%.*]] = getelementptr i64, ptr [[P]], i64 [[OFFSET8]] +; SCALABLE-NEXT: [[X0:%.*]] = load i64, ptr [[Q8]], align 8 ; SCALABLE-NEXT: [[Y0:%.*]] = add i64 [[X0]], 1 -; SCALABLE-NEXT: store i64 [[Y0]], ptr [[Q0]], align 8 -; SCALABLE-NEXT: [[OFFSET1:%.*]] = add i64 [[OFFSET0]], 1 +; SCALABLE-NEXT: store i64 [[Y0]], ptr [[Q8]], align 8 +; SCALABLE-NEXT: [[OFFSET1:%.*]] = add i64 [[OFFSET8]], 1 ; SCALABLE-NEXT: [[Q1:%.*]] = getelementptr i64, ptr [[P]], i64 [[OFFSET1]] ; SCALABLE-NEXT: [[X1:%.*]] = load i64, ptr [[Q1]], align 8 ; SCALABLE-NEXT: [[Y1:%.*]] = add i64 [[X1]], 2 @@ -1008,9 +1046,9 @@ define void @load_store_factor8(ptr %p) { ; SCALABLE-NEXT: [[X7:%.*]] = load i64, ptr [[Q7]], align 8 ; SCALABLE-NEXT: [[Y7:%.*]] = add i64 [[X7]], 8 ; SCALABLE-NEXT: store i64 [[Y7]], ptr [[Q7]], align 8 -; SCALABLE-NEXT: [[NEXTI]] = add i64 [[I]], 1 -; SCALABLE-NEXT: [[DONE:%.*]] = icmp eq i64 [[NEXTI]], 1024 -; SCALABLE-NEXT: br i1 [[DONE]], label [[EXIT]], label [[LOOP]], !llvm.loop [[LOOP11:![0-9]+]] +; SCALABLE-NEXT: [[NEXTI1]] = add i64 [[I1]], 1 +; SCALABLE-NEXT: [[DONE:%.*]] = icmp eq i64 [[NEXTI1]], 1024 +; SCALABLE-NEXT: br i1 [[DONE]], label [[EXIT]], label [[LOOP1]], !llvm.loop [[LOOP11:![0-9]+]] ; SCALABLE: exit: ; SCALABLE-NEXT: ret void ; @@ -1080,7 +1118,7 @@ define void @combine_load_factor2_i32(ptr noalias %p, ptr noalias %q) { ; CHECK-NEXT: [[TMP0:%.*]] = call i64 @llvm.vscale.i64() ; CHECK-NEXT: [[TMP1:%.*]] = mul i64 [[TMP0]], 4 ; CHECK-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 1024, [[TMP1]] -; CHECK-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] +; CHECK-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[ENTRY:%.*]] ; CHECK: vector.ph: ; CHECK-NEXT: [[TMP2:%.*]] = call i64 @llvm.vscale.i64() ; CHECK-NEXT: [[TMP3:%.*]] = mul i64 [[TMP2]], 4 @@ -1088,94 +1126,94 @@ define void @combine_load_factor2_i32(ptr noalias %p, ptr noalias %q) { ; CHECK-NEXT: [[N_VEC:%.*]] = sub i64 1024, [[N_MOD_VF]] ; CHECK-NEXT: [[TMP4:%.*]] = call i64 @llvm.vscale.i64() ; CHECK-NEXT: [[TMP5:%.*]] = mul i64 [[TMP4]], 4 -; CHECK-NEXT: br label [[VECTOR_BODY:%.*]] +; CHECK-NEXT: br label [[LOOP:%.*]] ; CHECK: vector.body: -; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] -; CHECK-NEXT: [[TMP6:%.*]] = add i64 [[INDEX]], 0 -; CHECK-NEXT: [[TMP7:%.*]] = shl i64 [[TMP6]], 1 -; CHECK-NEXT: [[TMP8:%.*]] = getelementptr i32, ptr [[P:%.*]], i64 [[TMP7]] -; CHECK-NEXT: [[WIDE_VEC:%.*]] = load , ptr [[TMP8]], align 4 +; CHECK-NEXT: [[I:%.*]] = phi i64 [ 0, [[ENTRY]] ], [ [[NEXTI:%.*]], [[LOOP]] ] +; CHECK-NEXT: [[TMP6:%.*]] = add i64 [[I]], 0 +; CHECK-NEXT: [[OFFSET0:%.*]] = shl i64 [[TMP6]], 1 +; CHECK-NEXT: [[Q0:%.*]] = getelementptr i32, ptr [[P:%.*]], i64 [[OFFSET0]] +; CHECK-NEXT: [[WIDE_VEC:%.*]] = load , ptr [[Q0]], align 4 ; CHECK-NEXT: [[STRIDED_VEC:%.*]] = call { , } @llvm.vector.deinterleave2.nxv8i32( [[WIDE_VEC]]) -; CHECK-NEXT: [[TMP10:%.*]] = extractvalue { , } [[STRIDED_VEC]], 0 -; CHECK-NEXT: [[TMP11:%.*]] = extractvalue { , } [[STRIDED_VEC]], 1 -; CHECK-NEXT: [[TMP12:%.*]] = add [[TMP10]], [[TMP11]] -; CHECK-NEXT: [[TMP13:%.*]] = getelementptr 
i32, ptr [[Q:%.*]], i64 [[TMP6]] -; CHECK-NEXT: [[TMP14:%.*]] = getelementptr i32, ptr [[TMP13]], i32 0 -; CHECK-NEXT: store [[TMP12]], ptr [[TMP14]], align 4 -; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP5]] -; CHECK-NEXT: [[TMP15:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] -; CHECK-NEXT: br i1 [[TMP15]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP12:![0-9]+]] +; CHECK-NEXT: [[TMP9:%.*]] = extractvalue { , } [[STRIDED_VEC]], 0 +; CHECK-NEXT: [[TMP10:%.*]] = extractvalue { , } [[STRIDED_VEC]], 1 +; CHECK-NEXT: [[TMP11:%.*]] = add [[TMP9]], [[TMP10]] +; CHECK-NEXT: [[TMP12:%.*]] = getelementptr i32, ptr [[Q:%.*]], i64 [[TMP6]] +; CHECK-NEXT: [[TMP13:%.*]] = getelementptr i32, ptr [[TMP12]], i32 0 +; CHECK-NEXT: store [[TMP11]], ptr [[TMP13]], align 4 +; CHECK-NEXT: [[NEXTI]] = add nuw i64 [[I]], [[TMP5]] +; CHECK-NEXT: [[TMP14:%.*]] = icmp eq i64 [[NEXTI]], [[N_VEC]] +; CHECK-NEXT: br i1 [[TMP14]], label [[MIDDLE_BLOCK:%.*]], label [[LOOP]], !llvm.loop [[LOOP12:![0-9]+]] ; CHECK: middle.block: ; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 1024, [[N_VEC]] ; CHECK-NEXT: br i1 [[CMP_N]], label [[EXIT:%.*]], label [[SCALAR_PH]] ; CHECK: scalar.ph: -; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ] -; CHECK-NEXT: br label [[LOOP:%.*]] +; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY1:%.*]] ] +; CHECK-NEXT: br label [[LOOP1:%.*]] ; CHECK: loop: -; CHECK-NEXT: [[I:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[NEXTI:%.*]], [[LOOP]] ] -; CHECK-NEXT: [[OFFSET0:%.*]] = shl i64 [[I]], 1 -; CHECK-NEXT: [[Q0:%.*]] = getelementptr i32, ptr [[P]], i64 [[OFFSET0]] -; CHECK-NEXT: [[X0:%.*]] = load i32, ptr [[Q0]], align 4 -; CHECK-NEXT: [[OFFSET1:%.*]] = add i64 [[OFFSET0]], 1 +; CHECK-NEXT: [[I1:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[NEXTI1:%.*]], [[LOOP1]] ] +; CHECK-NEXT: [[OFFSET2:%.*]] = shl i64 [[I1]], 1 +; CHECK-NEXT: [[Q2:%.*]] = getelementptr i32, ptr [[P]], i64 [[OFFSET2]] +; CHECK-NEXT: [[X0:%.*]] = load i32, ptr [[Q2]], align 4 +; CHECK-NEXT: [[OFFSET1:%.*]] = add i64 [[OFFSET2]], 1 ; CHECK-NEXT: [[Q1:%.*]] = getelementptr i32, ptr [[P]], i64 [[OFFSET1]] ; CHECK-NEXT: [[X1:%.*]] = load i32, ptr [[Q1]], align 4 ; CHECK-NEXT: [[RES:%.*]] = add i32 [[X0]], [[X1]] -; CHECK-NEXT: [[DST:%.*]] = getelementptr i32, ptr [[Q]], i64 [[I]] +; CHECK-NEXT: [[DST:%.*]] = getelementptr i32, ptr [[Q]], i64 [[I1]] ; CHECK-NEXT: store i32 [[RES]], ptr [[DST]], align 4 -; CHECK-NEXT: [[NEXTI]] = add i64 [[I]], 1 -; CHECK-NEXT: [[DONE:%.*]] = icmp eq i64 [[NEXTI]], 1024 -; CHECK-NEXT: br i1 [[DONE]], label [[EXIT]], label [[LOOP]], !llvm.loop [[LOOP13:![0-9]+]] +; CHECK-NEXT: [[NEXTI1]] = add i64 [[I1]], 1 +; CHECK-NEXT: [[DONE:%.*]] = icmp eq i64 [[NEXTI1]], 1024 +; CHECK-NEXT: br i1 [[DONE]], label [[EXIT]], label [[LOOP1]], !llvm.loop [[LOOP13:![0-9]+]] ; CHECK: exit: ; CHECK-NEXT: ret void ; ; FIXED-LABEL: @combine_load_factor2_i32( ; FIXED-NEXT: entry: -; FIXED-NEXT: br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] +; FIXED-NEXT: br i1 false, label [[SCALAR_PH:%.*]], label [[ENTRY:%.*]] ; FIXED: vector.ph: -; FIXED-NEXT: br label [[VECTOR_BODY:%.*]] +; FIXED-NEXT: br label [[LOOP:%.*]] ; FIXED: vector.body: -; FIXED-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] -; FIXED-NEXT: [[TMP0:%.*]] = add i64 [[INDEX]], 0 -; FIXED-NEXT: [[TMP1:%.*]] = add i64 [[INDEX]], 8 -; FIXED-NEXT: [[TMP2:%.*]] = shl 
i64 [[TMP0]], 1 +; FIXED-NEXT: [[I:%.*]] = phi i64 [ 0, [[ENTRY]] ], [ [[NEXTI:%.*]], [[LOOP]] ] +; FIXED-NEXT: [[TMP0:%.*]] = add i64 [[I]], 0 +; FIXED-NEXT: [[TMP1:%.*]] = add i64 [[I]], 8 +; FIXED-NEXT: [[OFFSET0:%.*]] = shl i64 [[TMP0]], 1 ; FIXED-NEXT: [[TMP3:%.*]] = shl i64 [[TMP1]], 1 -; FIXED-NEXT: [[TMP4:%.*]] = getelementptr i32, ptr [[P:%.*]], i64 [[TMP2]] +; FIXED-NEXT: [[Q0:%.*]] = getelementptr i32, ptr [[P:%.*]], i64 [[OFFSET0]] ; FIXED-NEXT: [[TMP5:%.*]] = getelementptr i32, ptr [[P]], i64 [[TMP3]] -; FIXED-NEXT: [[WIDE_VEC:%.*]] = load <16 x i32>, ptr [[TMP4]], align 4 +; FIXED-NEXT: [[WIDE_VEC:%.*]] = load <16 x i32>, ptr [[Q0]], align 4 ; FIXED-NEXT: [[STRIDED_VEC:%.*]] = shufflevector <16 x i32> [[WIDE_VEC]], <16 x i32> poison, <8 x i32> -; FIXED-NEXT: [[STRIDED_VEC3:%.*]] = shufflevector <16 x i32> [[WIDE_VEC]], <16 x i32> poison, <8 x i32> -; FIXED-NEXT: [[WIDE_VEC1:%.*]] = load <16 x i32>, ptr [[TMP5]], align 4 -; FIXED-NEXT: [[STRIDED_VEC2:%.*]] = shufflevector <16 x i32> [[WIDE_VEC1]], <16 x i32> poison, <8 x i32> -; FIXED-NEXT: [[STRIDED_VEC4:%.*]] = shufflevector <16 x i32> [[WIDE_VEC1]], <16 x i32> poison, <8 x i32> -; FIXED-NEXT: [[TMP8:%.*]] = add <8 x i32> [[STRIDED_VEC]], [[STRIDED_VEC3]] -; FIXED-NEXT: [[TMP9:%.*]] = add <8 x i32> [[STRIDED_VEC2]], [[STRIDED_VEC4]] -; FIXED-NEXT: [[TMP10:%.*]] = getelementptr i32, ptr [[Q:%.*]], i64 [[TMP0]] -; FIXED-NEXT: [[TMP12:%.*]] = getelementptr i32, ptr [[TMP10]], i32 0 -; FIXED-NEXT: [[TMP13:%.*]] = getelementptr i32, ptr [[TMP10]], i32 8 -; FIXED-NEXT: store <8 x i32> [[TMP8]], ptr [[TMP12]], align 4 -; FIXED-NEXT: store <8 x i32> [[TMP9]], ptr [[TMP13]], align 4 -; FIXED-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 16 -; FIXED-NEXT: [[TMP14:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1024 -; FIXED-NEXT: br i1 [[TMP14]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP12:![0-9]+]] +; FIXED-NEXT: [[STRIDED_VEC1:%.*]] = shufflevector <16 x i32> [[WIDE_VEC]], <16 x i32> poison, <8 x i32> +; FIXED-NEXT: [[WIDE_VEC2:%.*]] = load <16 x i32>, ptr [[TMP5]], align 4 +; FIXED-NEXT: [[STRIDED_VEC3:%.*]] = shufflevector <16 x i32> [[WIDE_VEC2]], <16 x i32> poison, <8 x i32> +; FIXED-NEXT: [[STRIDED_VEC4:%.*]] = shufflevector <16 x i32> [[WIDE_VEC2]], <16 x i32> poison, <8 x i32> +; FIXED-NEXT: [[TMP6:%.*]] = add <8 x i32> [[STRIDED_VEC]], [[STRIDED_VEC1]] +; FIXED-NEXT: [[TMP7:%.*]] = add <8 x i32> [[STRIDED_VEC3]], [[STRIDED_VEC4]] +; FIXED-NEXT: [[TMP8:%.*]] = getelementptr i32, ptr [[Q:%.*]], i64 [[TMP0]] +; FIXED-NEXT: [[TMP9:%.*]] = getelementptr i32, ptr [[TMP8]], i32 0 +; FIXED-NEXT: [[TMP10:%.*]] = getelementptr i32, ptr [[TMP8]], i32 8 +; FIXED-NEXT: store <8 x i32> [[TMP6]], ptr [[TMP9]], align 4 +; FIXED-NEXT: store <8 x i32> [[TMP7]], ptr [[TMP10]], align 4 +; FIXED-NEXT: [[NEXTI]] = add nuw i64 [[I]], 16 +; FIXED-NEXT: [[TMP11:%.*]] = icmp eq i64 [[NEXTI]], 1024 +; FIXED-NEXT: br i1 [[TMP11]], label [[MIDDLE_BLOCK:%.*]], label [[LOOP]], !llvm.loop [[LOOP12:![0-9]+]] ; FIXED: middle.block: ; FIXED-NEXT: br i1 true, label [[EXIT:%.*]], label [[SCALAR_PH]] ; FIXED: scalar.ph: -; FIXED-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 1024, [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ] -; FIXED-NEXT: br label [[LOOP:%.*]] +; FIXED-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 1024, [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY1:%.*]] ] +; FIXED-NEXT: br label [[LOOP1:%.*]] ; FIXED: loop: -; FIXED-NEXT: [[I:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[NEXTI:%.*]], [[LOOP]] ] -; FIXED-NEXT: [[OFFSET0:%.*]] = shl i64 [[I]], 1 
-; FIXED-NEXT: [[Q0:%.*]] = getelementptr i32, ptr [[P]], i64 [[OFFSET0]] -; FIXED-NEXT: [[X0:%.*]] = load i32, ptr [[Q0]], align 4 -; FIXED-NEXT: [[OFFSET1:%.*]] = add i64 [[OFFSET0]], 1 +; FIXED-NEXT: [[I1:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[NEXTI1:%.*]], [[LOOP1]] ] +; FIXED-NEXT: [[OFFSET2:%.*]] = shl i64 [[I1]], 1 +; FIXED-NEXT: [[Q2:%.*]] = getelementptr i32, ptr [[P]], i64 [[OFFSET2]] +; FIXED-NEXT: [[X0:%.*]] = load i32, ptr [[Q2]], align 4 +; FIXED-NEXT: [[OFFSET1:%.*]] = add i64 [[OFFSET2]], 1 ; FIXED-NEXT: [[Q1:%.*]] = getelementptr i32, ptr [[P]], i64 [[OFFSET1]] ; FIXED-NEXT: [[X1:%.*]] = load i32, ptr [[Q1]], align 4 ; FIXED-NEXT: [[RES:%.*]] = add i32 [[X0]], [[X1]] -; FIXED-NEXT: [[DST:%.*]] = getelementptr i32, ptr [[Q]], i64 [[I]] +; FIXED-NEXT: [[DST:%.*]] = getelementptr i32, ptr [[Q]], i64 [[I1]] ; FIXED-NEXT: store i32 [[RES]], ptr [[DST]], align 4 -; FIXED-NEXT: [[NEXTI]] = add i64 [[I]], 1 -; FIXED-NEXT: [[DONE:%.*]] = icmp eq i64 [[NEXTI]], 1024 -; FIXED-NEXT: br i1 [[DONE]], label [[EXIT]], label [[LOOP]], !llvm.loop [[LOOP13:![0-9]+]] +; FIXED-NEXT: [[NEXTI1]] = add i64 [[I1]], 1 +; FIXED-NEXT: [[DONE:%.*]] = icmp eq i64 [[NEXTI1]], 1024 +; FIXED-NEXT: br i1 [[DONE]], label [[EXIT]], label [[LOOP1]], !llvm.loop [[LOOP13:![0-9]+]] ; FIXED: exit: ; FIXED-NEXT: ret void ; @@ -1184,7 +1222,7 @@ define void @combine_load_factor2_i32(ptr noalias %p, ptr noalias %q) { ; SCALABLE-NEXT: [[TMP0:%.*]] = call i64 @llvm.vscale.i64() ; SCALABLE-NEXT: [[TMP1:%.*]] = mul i64 [[TMP0]], 4 ; SCALABLE-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 1024, [[TMP1]] -; SCALABLE-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] +; SCALABLE-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[ENTRY:%.*]] ; SCALABLE: vector.ph: ; SCALABLE-NEXT: [[TMP2:%.*]] = call i64 @llvm.vscale.i64() ; SCALABLE-NEXT: [[TMP3:%.*]] = mul i64 [[TMP2]], 4 @@ -1192,43 +1230,43 @@ define void @combine_load_factor2_i32(ptr noalias %p, ptr noalias %q) { ; SCALABLE-NEXT: [[N_VEC:%.*]] = sub i64 1024, [[N_MOD_VF]] ; SCALABLE-NEXT: [[TMP4:%.*]] = call i64 @llvm.vscale.i64() ; SCALABLE-NEXT: [[TMP5:%.*]] = mul i64 [[TMP4]], 4 -; SCALABLE-NEXT: br label [[VECTOR_BODY:%.*]] +; SCALABLE-NEXT: br label [[LOOP:%.*]] ; SCALABLE: vector.body: -; SCALABLE-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] -; SCALABLE-NEXT: [[TMP6:%.*]] = add i64 [[INDEX]], 0 -; SCALABLE-NEXT: [[TMP7:%.*]] = shl i64 [[TMP6]], 1 -; SCALABLE-NEXT: [[TMP8:%.*]] = getelementptr i32, ptr [[P:%.*]], i64 [[TMP7]] -; SCALABLE-NEXT: [[WIDE_VEC:%.*]] = load , ptr [[TMP8]], align 4 +; SCALABLE-NEXT: [[I:%.*]] = phi i64 [ 0, [[ENTRY]] ], [ [[NEXTI:%.*]], [[LOOP]] ] +; SCALABLE-NEXT: [[TMP6:%.*]] = add i64 [[I]], 0 +; SCALABLE-NEXT: [[OFFSET0:%.*]] = shl i64 [[TMP6]], 1 +; SCALABLE-NEXT: [[Q0:%.*]] = getelementptr i32, ptr [[P:%.*]], i64 [[OFFSET0]] +; SCALABLE-NEXT: [[WIDE_VEC:%.*]] = load , ptr [[Q0]], align 4 ; SCALABLE-NEXT: [[STRIDED_VEC:%.*]] = call { , } @llvm.vector.deinterleave2.nxv8i32( [[WIDE_VEC]]) -; SCALABLE-NEXT: [[TMP10:%.*]] = extractvalue { , } [[STRIDED_VEC]], 0 -; SCALABLE-NEXT: [[TMP11:%.*]] = extractvalue { , } [[STRIDED_VEC]], 1 -; SCALABLE-NEXT: [[TMP12:%.*]] = add [[TMP10]], [[TMP11]] -; SCALABLE-NEXT: [[TMP13:%.*]] = getelementptr i32, ptr [[Q:%.*]], i64 [[TMP6]] -; SCALABLE-NEXT: [[TMP14:%.*]] = getelementptr i32, ptr [[TMP13]], i32 0 -; SCALABLE-NEXT: store [[TMP12]], ptr [[TMP14]], align 4 -; SCALABLE-NEXT: 
[[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP5]] -; SCALABLE-NEXT: [[TMP15:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] -; SCALABLE-NEXT: br i1 [[TMP15]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP12:![0-9]+]] +; SCALABLE-NEXT: [[TMP9:%.*]] = extractvalue { , } [[STRIDED_VEC]], 0 +; SCALABLE-NEXT: [[TMP10:%.*]] = extractvalue { , } [[STRIDED_VEC]], 1 +; SCALABLE-NEXT: [[TMP11:%.*]] = add [[TMP9]], [[TMP10]] +; SCALABLE-NEXT: [[TMP12:%.*]] = getelementptr i32, ptr [[Q:%.*]], i64 [[TMP6]] +; SCALABLE-NEXT: [[TMP13:%.*]] = getelementptr i32, ptr [[TMP12]], i32 0 +; SCALABLE-NEXT: store [[TMP11]], ptr [[TMP13]], align 4 +; SCALABLE-NEXT: [[NEXTI]] = add nuw i64 [[I]], [[TMP5]] +; SCALABLE-NEXT: [[TMP14:%.*]] = icmp eq i64 [[NEXTI]], [[N_VEC]] +; SCALABLE-NEXT: br i1 [[TMP14]], label [[MIDDLE_BLOCK:%.*]], label [[LOOP]], !llvm.loop [[LOOP12:![0-9]+]] ; SCALABLE: middle.block: ; SCALABLE-NEXT: [[CMP_N:%.*]] = icmp eq i64 1024, [[N_VEC]] ; SCALABLE-NEXT: br i1 [[CMP_N]], label [[EXIT:%.*]], label [[SCALAR_PH]] ; SCALABLE: scalar.ph: -; SCALABLE-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ] -; SCALABLE-NEXT: br label [[LOOP:%.*]] +; SCALABLE-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY1:%.*]] ] +; SCALABLE-NEXT: br label [[LOOP1:%.*]] ; SCALABLE: loop: -; SCALABLE-NEXT: [[I:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[NEXTI:%.*]], [[LOOP]] ] -; SCALABLE-NEXT: [[OFFSET0:%.*]] = shl i64 [[I]], 1 -; SCALABLE-NEXT: [[Q0:%.*]] = getelementptr i32, ptr [[P]], i64 [[OFFSET0]] -; SCALABLE-NEXT: [[X0:%.*]] = load i32, ptr [[Q0]], align 4 -; SCALABLE-NEXT: [[OFFSET1:%.*]] = add i64 [[OFFSET0]], 1 +; SCALABLE-NEXT: [[I1:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[NEXTI1:%.*]], [[LOOP1]] ] +; SCALABLE-NEXT: [[OFFSET2:%.*]] = shl i64 [[I1]], 1 +; SCALABLE-NEXT: [[Q2:%.*]] = getelementptr i32, ptr [[P]], i64 [[OFFSET2]] +; SCALABLE-NEXT: [[X0:%.*]] = load i32, ptr [[Q2]], align 4 +; SCALABLE-NEXT: [[OFFSET1:%.*]] = add i64 [[OFFSET2]], 1 ; SCALABLE-NEXT: [[Q1:%.*]] = getelementptr i32, ptr [[P]], i64 [[OFFSET1]] ; SCALABLE-NEXT: [[X1:%.*]] = load i32, ptr [[Q1]], align 4 ; SCALABLE-NEXT: [[RES:%.*]] = add i32 [[X0]], [[X1]] -; SCALABLE-NEXT: [[DST:%.*]] = getelementptr i32, ptr [[Q]], i64 [[I]] +; SCALABLE-NEXT: [[DST:%.*]] = getelementptr i32, ptr [[Q]], i64 [[I1]] ; SCALABLE-NEXT: store i32 [[RES]], ptr [[DST]], align 4 -; SCALABLE-NEXT: [[NEXTI]] = add i64 [[I]], 1 -; SCALABLE-NEXT: [[DONE:%.*]] = icmp eq i64 [[NEXTI]], 1024 -; SCALABLE-NEXT: br i1 [[DONE]], label [[EXIT]], label [[LOOP]], !llvm.loop [[LOOP13:![0-9]+]] +; SCALABLE-NEXT: [[NEXTI1]] = add i64 [[I1]], 1 +; SCALABLE-NEXT: [[DONE:%.*]] = icmp eq i64 [[NEXTI1]], 1024 +; SCALABLE-NEXT: br i1 [[DONE]], label [[EXIT]], label [[LOOP1]], !llvm.loop [[LOOP13:![0-9]+]] ; SCALABLE: exit: ; SCALABLE-NEXT: ret void ; @@ -1263,7 +1301,7 @@ define void @combine_load_factor2_i64(ptr noalias %p, ptr noalias %q) { ; CHECK-NEXT: [[TMP0:%.*]] = call i64 @llvm.vscale.i64() ; CHECK-NEXT: [[TMP1:%.*]] = mul i64 [[TMP0]], 2 ; CHECK-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 1024, [[TMP1]] -; CHECK-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] +; CHECK-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[ENTRY:%.*]] ; CHECK: vector.ph: ; CHECK-NEXT: [[TMP2:%.*]] = call i64 @llvm.vscale.i64() ; CHECK-NEXT: [[TMP3:%.*]] = mul i64 [[TMP2]], 2 @@ -1271,94 +1309,94 @@ define void 
@combine_load_factor2_i64(ptr noalias %p, ptr noalias %q) { ; CHECK-NEXT: [[N_VEC:%.*]] = sub i64 1024, [[N_MOD_VF]] ; CHECK-NEXT: [[TMP4:%.*]] = call i64 @llvm.vscale.i64() ; CHECK-NEXT: [[TMP5:%.*]] = mul i64 [[TMP4]], 2 -; CHECK-NEXT: br label [[VECTOR_BODY:%.*]] +; CHECK-NEXT: br label [[LOOP:%.*]] ; CHECK: vector.body: -; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] -; CHECK-NEXT: [[TMP6:%.*]] = add i64 [[INDEX]], 0 -; CHECK-NEXT: [[TMP7:%.*]] = shl i64 [[TMP6]], 1 -; CHECK-NEXT: [[TMP8:%.*]] = getelementptr i64, ptr [[P:%.*]], i64 [[TMP7]] -; CHECK-NEXT: [[WIDE_VEC:%.*]] = load , ptr [[TMP8]], align 8 +; CHECK-NEXT: [[I:%.*]] = phi i64 [ 0, [[ENTRY]] ], [ [[NEXTI:%.*]], [[LOOP]] ] +; CHECK-NEXT: [[TMP6:%.*]] = add i64 [[I]], 0 +; CHECK-NEXT: [[OFFSET0:%.*]] = shl i64 [[TMP6]], 1 +; CHECK-NEXT: [[Q0:%.*]] = getelementptr i64, ptr [[P:%.*]], i64 [[OFFSET0]] +; CHECK-NEXT: [[WIDE_VEC:%.*]] = load , ptr [[Q0]], align 8 ; CHECK-NEXT: [[STRIDED_VEC:%.*]] = call { , } @llvm.vector.deinterleave2.nxv4i64( [[WIDE_VEC]]) -; CHECK-NEXT: [[TMP10:%.*]] = extractvalue { , } [[STRIDED_VEC]], 0 -; CHECK-NEXT: [[TMP11:%.*]] = extractvalue { , } [[STRIDED_VEC]], 1 -; CHECK-NEXT: [[TMP12:%.*]] = add [[TMP10]], [[TMP11]] -; CHECK-NEXT: [[TMP13:%.*]] = getelementptr i64, ptr [[Q:%.*]], i64 [[TMP6]] -; CHECK-NEXT: [[TMP14:%.*]] = getelementptr i64, ptr [[TMP13]], i32 0 -; CHECK-NEXT: store [[TMP12]], ptr [[TMP14]], align 8 -; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP5]] -; CHECK-NEXT: [[TMP15:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] -; CHECK-NEXT: br i1 [[TMP15]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP14:![0-9]+]] +; CHECK-NEXT: [[TMP9:%.*]] = extractvalue { , } [[STRIDED_VEC]], 0 +; CHECK-NEXT: [[TMP10:%.*]] = extractvalue { , } [[STRIDED_VEC]], 1 +; CHECK-NEXT: [[TMP11:%.*]] = add [[TMP9]], [[TMP10]] +; CHECK-NEXT: [[TMP12:%.*]] = getelementptr i64, ptr [[Q:%.*]], i64 [[TMP6]] +; CHECK-NEXT: [[TMP13:%.*]] = getelementptr i64, ptr [[TMP12]], i32 0 +; CHECK-NEXT: store [[TMP11]], ptr [[TMP13]], align 8 +; CHECK-NEXT: [[NEXTI]] = add nuw i64 [[I]], [[TMP5]] +; CHECK-NEXT: [[TMP14:%.*]] = icmp eq i64 [[NEXTI]], [[N_VEC]] +; CHECK-NEXT: br i1 [[TMP14]], label [[MIDDLE_BLOCK:%.*]], label [[LOOP]], !llvm.loop [[LOOP14:![0-9]+]] ; CHECK: middle.block: ; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 1024, [[N_VEC]] ; CHECK-NEXT: br i1 [[CMP_N]], label [[EXIT:%.*]], label [[SCALAR_PH]] ; CHECK: scalar.ph: -; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ] -; CHECK-NEXT: br label [[LOOP:%.*]] +; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY1:%.*]] ] +; CHECK-NEXT: br label [[LOOP1:%.*]] ; CHECK: loop: -; CHECK-NEXT: [[I:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[NEXTI:%.*]], [[LOOP]] ] -; CHECK-NEXT: [[OFFSET0:%.*]] = shl i64 [[I]], 1 -; CHECK-NEXT: [[Q0:%.*]] = getelementptr i64, ptr [[P]], i64 [[OFFSET0]] -; CHECK-NEXT: [[X0:%.*]] = load i64, ptr [[Q0]], align 8 -; CHECK-NEXT: [[OFFSET1:%.*]] = add i64 [[OFFSET0]], 1 +; CHECK-NEXT: [[I1:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[NEXTI1:%.*]], [[LOOP1]] ] +; CHECK-NEXT: [[OFFSET2:%.*]] = shl i64 [[I1]], 1 +; CHECK-NEXT: [[Q2:%.*]] = getelementptr i64, ptr [[P]], i64 [[OFFSET2]] +; CHECK-NEXT: [[X0:%.*]] = load i64, ptr [[Q2]], align 8 +; CHECK-NEXT: [[OFFSET1:%.*]] = add i64 [[OFFSET2]], 1 ; CHECK-NEXT: [[Q1:%.*]] = getelementptr i64, ptr [[P]], 
i64 [[OFFSET1]] ; CHECK-NEXT: [[X1:%.*]] = load i64, ptr [[Q1]], align 8 ; CHECK-NEXT: [[RES:%.*]] = add i64 [[X0]], [[X1]] -; CHECK-NEXT: [[DST:%.*]] = getelementptr i64, ptr [[Q]], i64 [[I]] +; CHECK-NEXT: [[DST:%.*]] = getelementptr i64, ptr [[Q]], i64 [[I1]] ; CHECK-NEXT: store i64 [[RES]], ptr [[DST]], align 8 -; CHECK-NEXT: [[NEXTI]] = add i64 [[I]], 1 -; CHECK-NEXT: [[DONE:%.*]] = icmp eq i64 [[NEXTI]], 1024 -; CHECK-NEXT: br i1 [[DONE]], label [[EXIT]], label [[LOOP]], !llvm.loop [[LOOP15:![0-9]+]] +; CHECK-NEXT: [[NEXTI1]] = add i64 [[I1]], 1 +; CHECK-NEXT: [[DONE:%.*]] = icmp eq i64 [[NEXTI1]], 1024 +; CHECK-NEXT: br i1 [[DONE]], label [[EXIT]], label [[LOOP1]], !llvm.loop [[LOOP15:![0-9]+]] ; CHECK: exit: ; CHECK-NEXT: ret void ; ; FIXED-LABEL: @combine_load_factor2_i64( ; FIXED-NEXT: entry: -; FIXED-NEXT: br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] +; FIXED-NEXT: br i1 false, label [[SCALAR_PH:%.*]], label [[ENTRY:%.*]] ; FIXED: vector.ph: -; FIXED-NEXT: br label [[VECTOR_BODY:%.*]] +; FIXED-NEXT: br label [[LOOP:%.*]] ; FIXED: vector.body: -; FIXED-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] -; FIXED-NEXT: [[TMP0:%.*]] = add i64 [[INDEX]], 0 -; FIXED-NEXT: [[TMP1:%.*]] = add i64 [[INDEX]], 4 -; FIXED-NEXT: [[TMP2:%.*]] = shl i64 [[TMP0]], 1 +; FIXED-NEXT: [[I:%.*]] = phi i64 [ 0, [[ENTRY]] ], [ [[NEXTI:%.*]], [[LOOP]] ] +; FIXED-NEXT: [[TMP0:%.*]] = add i64 [[I]], 0 +; FIXED-NEXT: [[TMP1:%.*]] = add i64 [[I]], 4 +; FIXED-NEXT: [[OFFSET0:%.*]] = shl i64 [[TMP0]], 1 ; FIXED-NEXT: [[TMP3:%.*]] = shl i64 [[TMP1]], 1 -; FIXED-NEXT: [[TMP4:%.*]] = getelementptr i64, ptr [[P:%.*]], i64 [[TMP2]] +; FIXED-NEXT: [[Q0:%.*]] = getelementptr i64, ptr [[P:%.*]], i64 [[OFFSET0]] ; FIXED-NEXT: [[TMP5:%.*]] = getelementptr i64, ptr [[P]], i64 [[TMP3]] -; FIXED-NEXT: [[WIDE_VEC:%.*]] = load <8 x i64>, ptr [[TMP4]], align 8 +; FIXED-NEXT: [[WIDE_VEC:%.*]] = load <8 x i64>, ptr [[Q0]], align 8 ; FIXED-NEXT: [[STRIDED_VEC:%.*]] = shufflevector <8 x i64> [[WIDE_VEC]], <8 x i64> poison, <4 x i32> -; FIXED-NEXT: [[STRIDED_VEC3:%.*]] = shufflevector <8 x i64> [[WIDE_VEC]], <8 x i64> poison, <4 x i32> -; FIXED-NEXT: [[WIDE_VEC1:%.*]] = load <8 x i64>, ptr [[TMP5]], align 8 -; FIXED-NEXT: [[STRIDED_VEC2:%.*]] = shufflevector <8 x i64> [[WIDE_VEC1]], <8 x i64> poison, <4 x i32> -; FIXED-NEXT: [[STRIDED_VEC4:%.*]] = shufflevector <8 x i64> [[WIDE_VEC1]], <8 x i64> poison, <4 x i32> -; FIXED-NEXT: [[TMP8:%.*]] = add <4 x i64> [[STRIDED_VEC]], [[STRIDED_VEC3]] -; FIXED-NEXT: [[TMP9:%.*]] = add <4 x i64> [[STRIDED_VEC2]], [[STRIDED_VEC4]] -; FIXED-NEXT: [[TMP10:%.*]] = getelementptr i64, ptr [[Q:%.*]], i64 [[TMP0]] -; FIXED-NEXT: [[TMP12:%.*]] = getelementptr i64, ptr [[TMP10]], i32 0 -; FIXED-NEXT: [[TMP13:%.*]] = getelementptr i64, ptr [[TMP10]], i32 4 -; FIXED-NEXT: store <4 x i64> [[TMP8]], ptr [[TMP12]], align 8 -; FIXED-NEXT: store <4 x i64> [[TMP9]], ptr [[TMP13]], align 8 -; FIXED-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 8 -; FIXED-NEXT: [[TMP14:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1024 -; FIXED-NEXT: br i1 [[TMP14]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP14:![0-9]+]] +; FIXED-NEXT: [[STRIDED_VEC1:%.*]] = shufflevector <8 x i64> [[WIDE_VEC]], <8 x i64> poison, <4 x i32> +; FIXED-NEXT: [[WIDE_VEC2:%.*]] = load <8 x i64>, ptr [[TMP5]], align 8 +; FIXED-NEXT: [[STRIDED_VEC3:%.*]] = shufflevector <8 x i64> [[WIDE_VEC2]], <8 x i64> poison, <4 x i32> +; FIXED-NEXT: [[STRIDED_VEC4:%.*]] = shufflevector <8 x 
i64> [[WIDE_VEC2]], <8 x i64> poison, <4 x i32> +; FIXED-NEXT: [[TMP6:%.*]] = add <4 x i64> [[STRIDED_VEC]], [[STRIDED_VEC1]] +; FIXED-NEXT: [[TMP7:%.*]] = add <4 x i64> [[STRIDED_VEC3]], [[STRIDED_VEC4]] +; FIXED-NEXT: [[TMP8:%.*]] = getelementptr i64, ptr [[Q:%.*]], i64 [[TMP0]] +; FIXED-NEXT: [[TMP9:%.*]] = getelementptr i64, ptr [[TMP8]], i32 0 +; FIXED-NEXT: [[TMP10:%.*]] = getelementptr i64, ptr [[TMP8]], i32 4 +; FIXED-NEXT: store <4 x i64> [[TMP6]], ptr [[TMP9]], align 8 +; FIXED-NEXT: store <4 x i64> [[TMP7]], ptr [[TMP10]], align 8 +; FIXED-NEXT: [[NEXTI]] = add nuw i64 [[I]], 8 +; FIXED-NEXT: [[TMP11:%.*]] = icmp eq i64 [[NEXTI]], 1024 +; FIXED-NEXT: br i1 [[TMP11]], label [[MIDDLE_BLOCK:%.*]], label [[LOOP]], !llvm.loop [[LOOP14:![0-9]+]] ; FIXED: middle.block: ; FIXED-NEXT: br i1 true, label [[EXIT:%.*]], label [[SCALAR_PH]] ; FIXED: scalar.ph: -; FIXED-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 1024, [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ] -; FIXED-NEXT: br label [[LOOP:%.*]] +; FIXED-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 1024, [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY1:%.*]] ] +; FIXED-NEXT: br label [[LOOP1:%.*]] ; FIXED: loop: -; FIXED-NEXT: [[I:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[NEXTI:%.*]], [[LOOP]] ] -; FIXED-NEXT: [[OFFSET0:%.*]] = shl i64 [[I]], 1 -; FIXED-NEXT: [[Q0:%.*]] = getelementptr i64, ptr [[P]], i64 [[OFFSET0]] -; FIXED-NEXT: [[X0:%.*]] = load i64, ptr [[Q0]], align 8 -; FIXED-NEXT: [[OFFSET1:%.*]] = add i64 [[OFFSET0]], 1 +; FIXED-NEXT: [[I1:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[NEXTI1:%.*]], [[LOOP1]] ] +; FIXED-NEXT: [[OFFSET2:%.*]] = shl i64 [[I1]], 1 +; FIXED-NEXT: [[Q2:%.*]] = getelementptr i64, ptr [[P]], i64 [[OFFSET2]] +; FIXED-NEXT: [[X0:%.*]] = load i64, ptr [[Q2]], align 8 +; FIXED-NEXT: [[OFFSET1:%.*]] = add i64 [[OFFSET2]], 1 ; FIXED-NEXT: [[Q1:%.*]] = getelementptr i64, ptr [[P]], i64 [[OFFSET1]] ; FIXED-NEXT: [[X1:%.*]] = load i64, ptr [[Q1]], align 8 ; FIXED-NEXT: [[RES:%.*]] = add i64 [[X0]], [[X1]] -; FIXED-NEXT: [[DST:%.*]] = getelementptr i64, ptr [[Q]], i64 [[I]] +; FIXED-NEXT: [[DST:%.*]] = getelementptr i64, ptr [[Q]], i64 [[I1]] ; FIXED-NEXT: store i64 [[RES]], ptr [[DST]], align 8 -; FIXED-NEXT: [[NEXTI]] = add i64 [[I]], 1 -; FIXED-NEXT: [[DONE:%.*]] = icmp eq i64 [[NEXTI]], 1024 -; FIXED-NEXT: br i1 [[DONE]], label [[EXIT]], label [[LOOP]], !llvm.loop [[LOOP15:![0-9]+]] +; FIXED-NEXT: [[NEXTI1]] = add i64 [[I1]], 1 +; FIXED-NEXT: [[DONE:%.*]] = icmp eq i64 [[NEXTI1]], 1024 +; FIXED-NEXT: br i1 [[DONE]], label [[EXIT]], label [[LOOP1]], !llvm.loop [[LOOP15:![0-9]+]] ; FIXED: exit: ; FIXED-NEXT: ret void ; @@ -1367,7 +1405,7 @@ define void @combine_load_factor2_i64(ptr noalias %p, ptr noalias %q) { ; SCALABLE-NEXT: [[TMP0:%.*]] = call i64 @llvm.vscale.i64() ; SCALABLE-NEXT: [[TMP1:%.*]] = mul i64 [[TMP0]], 2 ; SCALABLE-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 1024, [[TMP1]] -; SCALABLE-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] +; SCALABLE-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[ENTRY:%.*]] ; SCALABLE: vector.ph: ; SCALABLE-NEXT: [[TMP2:%.*]] = call i64 @llvm.vscale.i64() ; SCALABLE-NEXT: [[TMP3:%.*]] = mul i64 [[TMP2]], 2 @@ -1375,43 +1413,43 @@ define void @combine_load_factor2_i64(ptr noalias %p, ptr noalias %q) { ; SCALABLE-NEXT: [[N_VEC:%.*]] = sub i64 1024, [[N_MOD_VF]] ; SCALABLE-NEXT: [[TMP4:%.*]] = call i64 @llvm.vscale.i64() ; SCALABLE-NEXT: [[TMP5:%.*]] = mul i64 [[TMP4]], 2 -; SCALABLE-NEXT: br label [[VECTOR_BODY:%.*]] 
+; SCALABLE-NEXT: br label [[LOOP:%.*]] ; SCALABLE: vector.body: -; SCALABLE-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] -; SCALABLE-NEXT: [[TMP6:%.*]] = add i64 [[INDEX]], 0 -; SCALABLE-NEXT: [[TMP7:%.*]] = shl i64 [[TMP6]], 1 -; SCALABLE-NEXT: [[TMP8:%.*]] = getelementptr i64, ptr [[P:%.*]], i64 [[TMP7]] -; SCALABLE-NEXT: [[WIDE_VEC:%.*]] = load , ptr [[TMP8]], align 8 +; SCALABLE-NEXT: [[I:%.*]] = phi i64 [ 0, [[ENTRY]] ], [ [[NEXTI:%.*]], [[LOOP]] ] +; SCALABLE-NEXT: [[TMP6:%.*]] = add i64 [[I]], 0 +; SCALABLE-NEXT: [[OFFSET0:%.*]] = shl i64 [[TMP6]], 1 +; SCALABLE-NEXT: [[Q0:%.*]] = getelementptr i64, ptr [[P:%.*]], i64 [[OFFSET0]] +; SCALABLE-NEXT: [[WIDE_VEC:%.*]] = load , ptr [[Q0]], align 8 ; SCALABLE-NEXT: [[STRIDED_VEC:%.*]] = call { , } @llvm.vector.deinterleave2.nxv4i64( [[WIDE_VEC]]) -; SCALABLE-NEXT: [[TMP10:%.*]] = extractvalue { , } [[STRIDED_VEC]], 0 -; SCALABLE-NEXT: [[TMP11:%.*]] = extractvalue { , } [[STRIDED_VEC]], 1 -; SCALABLE-NEXT: [[TMP12:%.*]] = add [[TMP10]], [[TMP11]] -; SCALABLE-NEXT: [[TMP13:%.*]] = getelementptr i64, ptr [[Q:%.*]], i64 [[TMP6]] -; SCALABLE-NEXT: [[TMP14:%.*]] = getelementptr i64, ptr [[TMP13]], i32 0 -; SCALABLE-NEXT: store [[TMP12]], ptr [[TMP14]], align 8 -; SCALABLE-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP5]] -; SCALABLE-NEXT: [[TMP15:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] -; SCALABLE-NEXT: br i1 [[TMP15]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP14:![0-9]+]] +; SCALABLE-NEXT: [[TMP9:%.*]] = extractvalue { , } [[STRIDED_VEC]], 0 +; SCALABLE-NEXT: [[TMP10:%.*]] = extractvalue { , } [[STRIDED_VEC]], 1 +; SCALABLE-NEXT: [[TMP11:%.*]] = add [[TMP9]], [[TMP10]] +; SCALABLE-NEXT: [[TMP12:%.*]] = getelementptr i64, ptr [[Q:%.*]], i64 [[TMP6]] +; SCALABLE-NEXT: [[TMP13:%.*]] = getelementptr i64, ptr [[TMP12]], i32 0 +; SCALABLE-NEXT: store [[TMP11]], ptr [[TMP13]], align 8 +; SCALABLE-NEXT: [[NEXTI]] = add nuw i64 [[I]], [[TMP5]] +; SCALABLE-NEXT: [[TMP14:%.*]] = icmp eq i64 [[NEXTI]], [[N_VEC]] +; SCALABLE-NEXT: br i1 [[TMP14]], label [[MIDDLE_BLOCK:%.*]], label [[LOOP]], !llvm.loop [[LOOP14:![0-9]+]] ; SCALABLE: middle.block: ; SCALABLE-NEXT: [[CMP_N:%.*]] = icmp eq i64 1024, [[N_VEC]] ; SCALABLE-NEXT: br i1 [[CMP_N]], label [[EXIT:%.*]], label [[SCALAR_PH]] ; SCALABLE: scalar.ph: -; SCALABLE-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ] -; SCALABLE-NEXT: br label [[LOOP:%.*]] +; SCALABLE-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY1:%.*]] ] +; SCALABLE-NEXT: br label [[LOOP1:%.*]] ; SCALABLE: loop: -; SCALABLE-NEXT: [[I:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[NEXTI:%.*]], [[LOOP]] ] -; SCALABLE-NEXT: [[OFFSET0:%.*]] = shl i64 [[I]], 1 -; SCALABLE-NEXT: [[Q0:%.*]] = getelementptr i64, ptr [[P]], i64 [[OFFSET0]] -; SCALABLE-NEXT: [[X0:%.*]] = load i64, ptr [[Q0]], align 8 -; SCALABLE-NEXT: [[OFFSET1:%.*]] = add i64 [[OFFSET0]], 1 +; SCALABLE-NEXT: [[I1:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[NEXTI1:%.*]], [[LOOP1]] ] +; SCALABLE-NEXT: [[OFFSET2:%.*]] = shl i64 [[I1]], 1 +; SCALABLE-NEXT: [[Q2:%.*]] = getelementptr i64, ptr [[P]], i64 [[OFFSET2]] +; SCALABLE-NEXT: [[X0:%.*]] = load i64, ptr [[Q2]], align 8 +; SCALABLE-NEXT: [[OFFSET1:%.*]] = add i64 [[OFFSET2]], 1 ; SCALABLE-NEXT: [[Q1:%.*]] = getelementptr i64, ptr [[P]], i64 [[OFFSET1]] ; SCALABLE-NEXT: [[X1:%.*]] = load i64, ptr [[Q1]], align 8 ; SCALABLE-NEXT: [[RES:%.*]] = add i64 
[[X0]], [[X1]] -; SCALABLE-NEXT: [[DST:%.*]] = getelementptr i64, ptr [[Q]], i64 [[I]] +; SCALABLE-NEXT: [[DST:%.*]] = getelementptr i64, ptr [[Q]], i64 [[I1]] ; SCALABLE-NEXT: store i64 [[RES]], ptr [[DST]], align 8 -; SCALABLE-NEXT: [[NEXTI]] = add i64 [[I]], 1 -; SCALABLE-NEXT: [[DONE:%.*]] = icmp eq i64 [[NEXTI]], 1024 -; SCALABLE-NEXT: br i1 [[DONE]], label [[EXIT]], label [[LOOP]], !llvm.loop [[LOOP15:![0-9]+]] +; SCALABLE-NEXT: [[NEXTI1]] = add i64 [[I1]], 1 +; SCALABLE-NEXT: [[DONE:%.*]] = icmp eq i64 [[NEXTI1]], 1024 +; SCALABLE-NEXT: br i1 [[DONE]], label [[EXIT]], label [[LOOP1]], !llvm.loop [[LOOP15:![0-9]+]] ; SCALABLE: exit: ; SCALABLE-NEXT: ret void ; diff --git a/llvm/test/Transforms/LoopVectorize/RISCV/vectorize-force-tail-with-evl-fixed-order-recurrence.ll b/llvm/test/Transforms/LoopVectorize/RISCV/vectorize-force-tail-with-evl-fixed-order-recurrence.ll index 9f8cf169c0593..809b69900731a 100644 --- a/llvm/test/Transforms/LoopVectorize/RISCV/vectorize-force-tail-with-evl-fixed-order-recurrence.ll +++ b/llvm/test/Transforms/LoopVectorize/RISCV/vectorize-force-tail-with-evl-fixed-order-recurrence.ll @@ -11,6 +11,10 @@ ; RUN: -prefer-predicate-over-epilogue=predicate-else-scalar-epilogue \ ; RUN: -mtriple=riscv64 -mattr=+v,+f -S < %s| FileCheck %s --check-prefix=NO-VP +; FIXME: Fixed-order recurrence is not supported yet with EVL tail folding. +; The llvm.splice may cause unexpected behavior if the evl of the +; second-to-last iteration is not VF*UF. + define void @first_order_recurrence(ptr noalias %A, ptr noalias %B, i64 %TC) { ; IF-EVL-LABEL: define void @first_order_recurrence( ; IF-EVL-SAME: ptr noalias [[A:%.*]], ptr noalias [[B:%.*]], i64 [[TC:%.*]]) #[[ATTR0:[0-9]+]] { @@ -27,31 +31,35 @@ define void @first_order_recurrence(ptr noalias %A, ptr noalias %B, i64 %TC) { ; IF-EVL-NEXT: [[N_RND_UP:%.*]] = add i64 [[TC]], [[TMP6]] ; IF-EVL-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N_RND_UP]], [[TMP5]] ; IF-EVL-NEXT: [[N_VEC:%.*]] = sub i64 [[N_RND_UP]], [[N_MOD_VF]] +; IF-EVL-NEXT: [[TRIP_COUNT_MINUS_1:%.*]] = sub i64 [[TC]], 1 ; IF-EVL-NEXT: [[TMP7:%.*]] = call i64 @llvm.vscale.i64() ; IF-EVL-NEXT: [[TMP8:%.*]] = mul i64 [[TMP7]], 4 ; IF-EVL-NEXT: [[TMP9:%.*]] = call i32 @llvm.vscale.i32() ; IF-EVL-NEXT: [[TMP10:%.*]] = mul i32 [[TMP9]], 4 ; IF-EVL-NEXT: [[TMP11:%.*]] = sub i32 [[TMP10]], 1 ; IF-EVL-NEXT: [[VECTOR_RECUR_INIT:%.*]] = insertelement poison, i32 33, i32 [[TMP11]] +; IF-EVL-NEXT: [[BROADCAST_SPLATINSERT1:%.*]] = insertelement poison, i64 [[TRIP_COUNT_MINUS_1]], i64 0 +; IF-EVL-NEXT: [[BROADCAST_SPLAT2:%.*]] = shufflevector [[BROADCAST_SPLATINSERT1]], poison, zeroinitializer ; IF-EVL-NEXT: br label %[[VECTOR_BODY:.*]] ; IF-EVL: [[VECTOR_BODY]]: -; IF-EVL-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ] -; IF-EVL-NEXT: [[EVL_BASED_IV:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_EVL_NEXT:%.*]], %[[VECTOR_BODY]] ] +; IF-EVL-NEXT: [[EVL_BASED_IV:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ] ; IF-EVL-NEXT: [[VECTOR_RECUR:%.*]] = phi [ [[VECTOR_RECUR_INIT]], %[[VECTOR_PH]] ], [ [[VP_OP_LOAD:%.*]], %[[VECTOR_BODY]] ] -; IF-EVL-NEXT: [[AVL:%.*]] = sub i64 [[TC]], [[EVL_BASED_IV]] -; IF-EVL-NEXT: [[TMP12:%.*]] = call i32 @llvm.experimental.get.vector.length.i64(i64 [[AVL]], i32 4, i1 true) ; IF-EVL-NEXT: [[TMP13:%.*]] = add i64 [[EVL_BASED_IV]], 0 +; IF-EVL-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement poison, i64 [[EVL_BASED_IV]], i64 0 +; IF-EVL-NEXT: [[BROADCAST_SPLAT:%.*]] = 
shufflevector [[BROADCAST_SPLATINSERT]], poison, zeroinitializer +; IF-EVL-NEXT: [[TMP25:%.*]] = call @llvm.stepvector.nxv4i64() +; IF-EVL-NEXT: [[TMP26:%.*]] = add zeroinitializer, [[TMP25]] +; IF-EVL-NEXT: [[VEC_IV:%.*]] = add [[BROADCAST_SPLAT]], [[TMP26]] +; IF-EVL-NEXT: [[TMP27:%.*]] = icmp ule [[VEC_IV]], [[BROADCAST_SPLAT2]] ; IF-EVL-NEXT: [[TMP14:%.*]] = getelementptr inbounds nuw i32, ptr [[A]], i64 [[TMP13]] ; IF-EVL-NEXT: [[TMP15:%.*]] = getelementptr inbounds nuw i32, ptr [[TMP14]], i32 0 -; IF-EVL-NEXT: [[VP_OP_LOAD]] = call @llvm.vp.load.nxv4i32.p0(ptr align 4 [[TMP15]], splat (i1 true), i32 [[TMP12]]) +; IF-EVL-NEXT: [[VP_OP_LOAD]] = call @llvm.masked.load.nxv4i32.p0(ptr [[TMP15]], i32 4, [[TMP27]], poison) ; IF-EVL-NEXT: [[TMP16:%.*]] = call @llvm.vector.splice.nxv4i32( [[VECTOR_RECUR]], [[VP_OP_LOAD]], i32 -1) -; IF-EVL-NEXT: [[VP_OP:%.*]] = call @llvm.vp.add.nxv4i32( [[TMP16]], [[VP_OP_LOAD]], splat (i1 true), i32 [[TMP12]]) +; IF-EVL-NEXT: [[TMP19:%.*]] = add nsw [[TMP16]], [[VP_OP_LOAD]] ; IF-EVL-NEXT: [[TMP17:%.*]] = getelementptr inbounds nuw i32, ptr [[B]], i64 [[TMP13]] ; IF-EVL-NEXT: [[TMP18:%.*]] = getelementptr inbounds nuw i32, ptr [[TMP17]], i32 0 -; IF-EVL-NEXT: call void @llvm.vp.store.nxv4i32.p0( [[VP_OP]], ptr align 4 [[TMP18]], splat (i1 true), i32 [[TMP12]]) -; IF-EVL-NEXT: [[TMP19:%.*]] = zext i32 [[TMP12]] to i64 -; IF-EVL-NEXT: [[INDEX_EVL_NEXT]] = add i64 [[TMP19]], [[EVL_BASED_IV]] -; IF-EVL-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], [[TMP8]] +; IF-EVL-NEXT: call void @llvm.masked.store.nxv4i32.p0( [[TMP19]], ptr [[TMP18]], i32 4, [[TMP27]]) +; IF-EVL-NEXT: [[INDEX_NEXT]] = add i64 [[EVL_BASED_IV]], [[TMP8]] ; IF-EVL-NEXT: [[TMP20:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] ; IF-EVL-NEXT: br i1 [[TMP20]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]] ; IF-EVL: [[MIDDLE_BLOCK]]: @@ -172,6 +180,7 @@ define void @second_order_recurrence(ptr noalias %A, ptr noalias %B, i64 %TC) { ; IF-EVL-NEXT: [[N_RND_UP:%.*]] = add i64 [[TC]], [[TMP6]] ; IF-EVL-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N_RND_UP]], [[TMP5]] ; IF-EVL-NEXT: [[N_VEC:%.*]] = sub i64 [[N_RND_UP]], [[N_MOD_VF]] +; IF-EVL-NEXT: [[TRIP_COUNT_MINUS_1:%.*]] = sub i64 [[TC]], 1 ; IF-EVL-NEXT: [[TMP7:%.*]] = call i64 @llvm.vscale.i64() ; IF-EVL-NEXT: [[TMP8:%.*]] = mul i64 [[TMP7]], 4 ; IF-EVL-NEXT: [[TMP9:%.*]] = call i32 @llvm.vscale.i32() @@ -182,27 +191,30 @@ define void @second_order_recurrence(ptr noalias %A, ptr noalias %B, i64 %TC) { ; IF-EVL-NEXT: [[TMP13:%.*]] = mul i32 [[TMP12]], 4 ; IF-EVL-NEXT: [[TMP14:%.*]] = sub i32 [[TMP13]], 1 ; IF-EVL-NEXT: [[VECTOR_RECUR_INIT1:%.*]] = insertelement poison, i32 22, i32 [[TMP14]] +; IF-EVL-NEXT: [[BROADCAST_SPLATINSERT3:%.*]] = insertelement poison, i64 [[TRIP_COUNT_MINUS_1]], i64 0 +; IF-EVL-NEXT: [[BROADCAST_SPLAT4:%.*]] = shufflevector [[BROADCAST_SPLATINSERT3]], poison, zeroinitializer ; IF-EVL-NEXT: br label %[[VECTOR_BODY:.*]] ; IF-EVL: [[VECTOR_BODY]]: -; IF-EVL-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ] -; IF-EVL-NEXT: [[EVL_BASED_IV:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_EVL_NEXT:%.*]], %[[VECTOR_BODY]] ] +; IF-EVL-NEXT: [[EVL_BASED_IV:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ] ; IF-EVL-NEXT: [[VECTOR_RECUR:%.*]] = phi [ [[VECTOR_RECUR_INIT]], %[[VECTOR_PH]] ], [ [[VP_OP_LOAD:%.*]], %[[VECTOR_BODY]] ] ; IF-EVL-NEXT: [[VECTOR_RECUR2:%.*]] = phi [ [[VECTOR_RECUR_INIT1]], %[[VECTOR_PH]] ], [ [[TMP19:%.*]], 
%[[VECTOR_BODY]] ] -; IF-EVL-NEXT: [[AVL:%.*]] = sub i64 [[TC]], [[EVL_BASED_IV]] -; IF-EVL-NEXT: [[TMP15:%.*]] = call i32 @llvm.experimental.get.vector.length.i64(i64 [[AVL]], i32 4, i1 true) ; IF-EVL-NEXT: [[TMP16:%.*]] = add i64 [[EVL_BASED_IV]], 0 +; IF-EVL-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement poison, i64 [[EVL_BASED_IV]], i64 0 +; IF-EVL-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector [[BROADCAST_SPLATINSERT]], poison, zeroinitializer +; IF-EVL-NEXT: [[TMP32:%.*]] = call @llvm.stepvector.nxv4i64() +; IF-EVL-NEXT: [[TMP33:%.*]] = add zeroinitializer, [[TMP32]] +; IF-EVL-NEXT: [[VEC_IV:%.*]] = add [[BROADCAST_SPLAT]], [[TMP33]] +; IF-EVL-NEXT: [[TMP34:%.*]] = icmp ule [[VEC_IV]], [[BROADCAST_SPLAT4]] ; IF-EVL-NEXT: [[TMP17:%.*]] = getelementptr inbounds nuw i32, ptr [[A]], i64 [[TMP16]] ; IF-EVL-NEXT: [[TMP18:%.*]] = getelementptr inbounds nuw i32, ptr [[TMP17]], i32 0 -; IF-EVL-NEXT: [[VP_OP_LOAD]] = call @llvm.vp.load.nxv4i32.p0(ptr align 4 [[TMP18]], splat (i1 true), i32 [[TMP15]]) +; IF-EVL-NEXT: [[VP_OP_LOAD]] = call @llvm.masked.load.nxv4i32.p0(ptr [[TMP18]], i32 4, [[TMP34]], poison) ; IF-EVL-NEXT: [[TMP19]] = call @llvm.vector.splice.nxv4i32( [[VECTOR_RECUR]], [[VP_OP_LOAD]], i32 -1) ; IF-EVL-NEXT: [[TMP20:%.*]] = call @llvm.vector.splice.nxv4i32( [[VECTOR_RECUR2]], [[TMP19]], i32 -1) -; IF-EVL-NEXT: [[VP_OP:%.*]] = call @llvm.vp.add.nxv4i32( [[TMP19]], [[TMP20]], splat (i1 true), i32 [[TMP15]]) +; IF-EVL-NEXT: [[TMP23:%.*]] = add nsw [[TMP19]], [[TMP20]] ; IF-EVL-NEXT: [[TMP21:%.*]] = getelementptr inbounds nuw i32, ptr [[B]], i64 [[TMP16]] ; IF-EVL-NEXT: [[TMP22:%.*]] = getelementptr inbounds nuw i32, ptr [[TMP21]], i32 0 -; IF-EVL-NEXT: call void @llvm.vp.store.nxv4i32.p0( [[VP_OP]], ptr align 4 [[TMP22]], splat (i1 true), i32 [[TMP15]]) -; IF-EVL-NEXT: [[TMP23:%.*]] = zext i32 [[TMP15]] to i64 -; IF-EVL-NEXT: [[INDEX_EVL_NEXT]] = add i64 [[TMP23]], [[EVL_BASED_IV]] -; IF-EVL-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], [[TMP8]] +; IF-EVL-NEXT: call void @llvm.masked.store.nxv4i32.p0( [[TMP23]], ptr [[TMP22]], i32 4, [[TMP34]]) +; IF-EVL-NEXT: [[INDEX_NEXT]] = add i64 [[EVL_BASED_IV]], [[TMP8]] ; IF-EVL-NEXT: [[TMP24:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] ; IF-EVL-NEXT: br i1 [[TMP24]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]] ; IF-EVL: [[MIDDLE_BLOCK]]: @@ -218,12 +230,12 @@ define void @second_order_recurrence(ptr noalias %A, ptr noalias %B, i64 %TC) { ; IF-EVL: [[SCALAR_PH]]: ; IF-EVL-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], %[[MIDDLE_BLOCK]] ], [ 0, %[[ENTRY]] ] ; IF-EVL-NEXT: [[SCALAR_RECUR_INIT:%.*]] = phi i32 [ [[VECTOR_RECUR_EXTRACT]], %[[MIDDLE_BLOCK]] ], [ 33, %[[ENTRY]] ] -; IF-EVL-NEXT: [[SCALAR_RECUR_INIT4:%.*]] = phi i32 [ [[VECTOR_RECUR_EXTRACT3]], %[[MIDDLE_BLOCK]] ], [ 22, %[[ENTRY]] ] +; IF-EVL-NEXT: [[SCALAR_RECUR_INIT6:%.*]] = phi i32 [ [[VECTOR_RECUR_EXTRACT3]], %[[MIDDLE_BLOCK]] ], [ 22, %[[ENTRY]] ] ; IF-EVL-NEXT: br label %[[FOR_BODY:.*]] ; IF-EVL: [[FOR_BODY]]: ; IF-EVL-NEXT: [[INDVARS:%.*]] = phi i64 [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ], [ [[INDVARS_NEXT:%.*]], %[[FOR_BODY]] ] ; IF-EVL-NEXT: [[FOR1:%.*]] = phi i32 [ [[SCALAR_RECUR_INIT]], %[[SCALAR_PH]] ], [ [[TMP31:%.*]], %[[FOR_BODY]] ] -; IF-EVL-NEXT: [[FOR2:%.*]] = phi i32 [ [[SCALAR_RECUR_INIT4]], %[[SCALAR_PH]] ], [ [[FOR1]], %[[FOR_BODY]] ] +; IF-EVL-NEXT: [[FOR2:%.*]] = phi i32 [ [[SCALAR_RECUR_INIT6]], %[[SCALAR_PH]] ], [ [[FOR1]], %[[FOR_BODY]] ] ; IF-EVL-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds nuw i32, ptr [[A]], 
i64 [[INDVARS]] ; IF-EVL-NEXT: [[TMP31]] = load i32, ptr [[ARRAYIDX]], align 4 ; IF-EVL-NEXT: [[ADD:%.*]] = add nsw i32 [[FOR1]], [[FOR2]] @@ -342,6 +354,7 @@ define void @third_order_recurrence(ptr noalias %A, ptr noalias %B, i64 %TC) { ; IF-EVL-NEXT: [[N_RND_UP:%.*]] = add i64 [[TC]], [[TMP6]] ; IF-EVL-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N_RND_UP]], [[TMP5]] ; IF-EVL-NEXT: [[N_VEC:%.*]] = sub i64 [[N_RND_UP]], [[N_MOD_VF]] +; IF-EVL-NEXT: [[TRIP_COUNT_MINUS_1:%.*]] = sub i64 [[TC]], 1 ; IF-EVL-NEXT: [[TMP7:%.*]] = call i64 @llvm.vscale.i64() ; IF-EVL-NEXT: [[TMP8:%.*]] = mul i64 [[TMP7]], 4 ; IF-EVL-NEXT: [[TMP9:%.*]] = call i32 @llvm.vscale.i32() @@ -356,30 +369,33 @@ define void @third_order_recurrence(ptr noalias %A, ptr noalias %B, i64 %TC) { ; IF-EVL-NEXT: [[TMP16:%.*]] = mul i32 [[TMP15]], 4 ; IF-EVL-NEXT: [[TMP17:%.*]] = sub i32 [[TMP16]], 1 ; IF-EVL-NEXT: [[VECTOR_RECUR_INIT3:%.*]] = insertelement poison, i32 11, i32 [[TMP17]] +; IF-EVL-NEXT: [[BROADCAST_SPLATINSERT5:%.*]] = insertelement poison, i64 [[TRIP_COUNT_MINUS_1]], i64 0 +; IF-EVL-NEXT: [[BROADCAST_SPLAT6:%.*]] = shufflevector [[BROADCAST_SPLATINSERT5]], poison, zeroinitializer ; IF-EVL-NEXT: br label %[[VECTOR_BODY:.*]] ; IF-EVL: [[VECTOR_BODY]]: -; IF-EVL-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ] -; IF-EVL-NEXT: [[EVL_BASED_IV:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_EVL_NEXT:%.*]], %[[VECTOR_BODY]] ] +; IF-EVL-NEXT: [[EVL_BASED_IV:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ] ; IF-EVL-NEXT: [[VECTOR_RECUR:%.*]] = phi [ [[VECTOR_RECUR_INIT]], %[[VECTOR_PH]] ], [ [[VP_OP_LOAD:%.*]], %[[VECTOR_BODY]] ] ; IF-EVL-NEXT: [[VECTOR_RECUR2:%.*]] = phi [ [[VECTOR_RECUR_INIT1]], %[[VECTOR_PH]] ], [ [[TMP22:%.*]], %[[VECTOR_BODY]] ] ; IF-EVL-NEXT: [[VECTOR_RECUR4:%.*]] = phi [ [[VECTOR_RECUR_INIT3]], %[[VECTOR_PH]] ], [ [[TMP23:%.*]], %[[VECTOR_BODY]] ] -; IF-EVL-NEXT: [[AVL:%.*]] = sub i64 [[TC]], [[EVL_BASED_IV]] -; IF-EVL-NEXT: [[TMP18:%.*]] = call i32 @llvm.experimental.get.vector.length.i64(i64 [[AVL]], i32 4, i1 true) ; IF-EVL-NEXT: [[TMP19:%.*]] = add i64 [[EVL_BASED_IV]], 0 +; IF-EVL-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement poison, i64 [[EVL_BASED_IV]], i64 0 +; IF-EVL-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector [[BROADCAST_SPLATINSERT]], poison, zeroinitializer +; IF-EVL-NEXT: [[TMP39:%.*]] = call @llvm.stepvector.nxv4i64() +; IF-EVL-NEXT: [[TMP40:%.*]] = add zeroinitializer, [[TMP39]] +; IF-EVL-NEXT: [[VEC_IV:%.*]] = add [[BROADCAST_SPLAT]], [[TMP40]] +; IF-EVL-NEXT: [[TMP41:%.*]] = icmp ule [[VEC_IV]], [[BROADCAST_SPLAT6]] ; IF-EVL-NEXT: [[TMP20:%.*]] = getelementptr inbounds nuw i32, ptr [[A]], i64 [[TMP19]] ; IF-EVL-NEXT: [[TMP21:%.*]] = getelementptr inbounds nuw i32, ptr [[TMP20]], i32 0 -; IF-EVL-NEXT: [[VP_OP_LOAD]] = call @llvm.vp.load.nxv4i32.p0(ptr align 4 [[TMP21]], splat (i1 true), i32 [[TMP18]]) +; IF-EVL-NEXT: [[VP_OP_LOAD]] = call @llvm.masked.load.nxv4i32.p0(ptr [[TMP21]], i32 4, [[TMP41]], poison) ; IF-EVL-NEXT: [[TMP22]] = call @llvm.vector.splice.nxv4i32( [[VECTOR_RECUR]], [[VP_OP_LOAD]], i32 -1) ; IF-EVL-NEXT: [[TMP23]] = call @llvm.vector.splice.nxv4i32( [[VECTOR_RECUR2]], [[TMP22]], i32 -1) ; IF-EVL-NEXT: [[TMP24:%.*]] = call @llvm.vector.splice.nxv4i32( [[VECTOR_RECUR4]], [[TMP23]], i32 -1) -; IF-EVL-NEXT: [[VP_OP:%.*]] = call @llvm.vp.add.nxv4i32( [[TMP23]], [[TMP24]], splat (i1 true), i32 [[TMP18]]) -; IF-EVL-NEXT: [[VP_OP5:%.*]] = call @llvm.vp.add.nxv4i32( [[VP_OP]], [[TMP22]], splat (i1 
true), i32 [[TMP18]]) +; IF-EVL-NEXT: [[TMP27:%.*]] = add nsw [[TMP23]], [[TMP24]] +; IF-EVL-NEXT: [[TMP42:%.*]] = add [[TMP27]], [[TMP22]] ; IF-EVL-NEXT: [[TMP25:%.*]] = getelementptr inbounds nuw i32, ptr [[B]], i64 [[TMP19]] ; IF-EVL-NEXT: [[TMP26:%.*]] = getelementptr inbounds nuw i32, ptr [[TMP25]], i32 0 -; IF-EVL-NEXT: call void @llvm.vp.store.nxv4i32.p0( [[VP_OP5]], ptr align 4 [[TMP26]], splat (i1 true), i32 [[TMP18]]) -; IF-EVL-NEXT: [[TMP27:%.*]] = zext i32 [[TMP18]] to i64 -; IF-EVL-NEXT: [[INDEX_EVL_NEXT]] = add i64 [[TMP27]], [[EVL_BASED_IV]] -; IF-EVL-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], [[TMP8]] +; IF-EVL-NEXT: call void @llvm.masked.store.nxv4i32.p0( [[TMP42]], ptr [[TMP26]], i32 4, [[TMP41]]) +; IF-EVL-NEXT: [[INDEX_NEXT]] = add i64 [[EVL_BASED_IV]], [[TMP8]] ; IF-EVL-NEXT: [[TMP28:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] ; IF-EVL-NEXT: br i1 [[TMP28]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP6:![0-9]+]] ; IF-EVL: [[MIDDLE_BLOCK]]: @@ -399,14 +415,14 @@ define void @third_order_recurrence(ptr noalias %A, ptr noalias %B, i64 %TC) { ; IF-EVL: [[SCALAR_PH]]: ; IF-EVL-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], %[[MIDDLE_BLOCK]] ], [ 0, %[[ENTRY]] ] ; IF-EVL-NEXT: [[SCALAR_RECUR_INIT:%.*]] = phi i32 [ [[VECTOR_RECUR_EXTRACT]], %[[MIDDLE_BLOCK]] ], [ 33, %[[ENTRY]] ] -; IF-EVL-NEXT: [[SCALAR_RECUR_INIT8:%.*]] = phi i32 [ [[VECTOR_RECUR_EXTRACT6]], %[[MIDDLE_BLOCK]] ], [ 22, %[[ENTRY]] ] -; IF-EVL-NEXT: [[SCALAR_RECUR_INIT9:%.*]] = phi i32 [ [[VECTOR_RECUR_EXTRACT7]], %[[MIDDLE_BLOCK]] ], [ 11, %[[ENTRY]] ] +; IF-EVL-NEXT: [[SCALAR_RECUR_INIT9:%.*]] = phi i32 [ [[VECTOR_RECUR_EXTRACT6]], %[[MIDDLE_BLOCK]] ], [ 22, %[[ENTRY]] ] +; IF-EVL-NEXT: [[SCALAR_RECUR_INIT10:%.*]] = phi i32 [ [[VECTOR_RECUR_EXTRACT7]], %[[MIDDLE_BLOCK]] ], [ 11, %[[ENTRY]] ] ; IF-EVL-NEXT: br label %[[FOR_BODY:.*]] ; IF-EVL: [[FOR_BODY]]: ; IF-EVL-NEXT: [[INDVARS:%.*]] = phi i64 [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ], [ [[INDVARS_NEXT:%.*]], %[[FOR_BODY]] ] ; IF-EVL-NEXT: [[FOR1:%.*]] = phi i32 [ [[SCALAR_RECUR_INIT]], %[[SCALAR_PH]] ], [ [[TMP38:%.*]], %[[FOR_BODY]] ] -; IF-EVL-NEXT: [[FOR2:%.*]] = phi i32 [ [[SCALAR_RECUR_INIT8]], %[[SCALAR_PH]] ], [ [[FOR1]], %[[FOR_BODY]] ] -; IF-EVL-NEXT: [[FOR3:%.*]] = phi i32 [ [[SCALAR_RECUR_INIT9]], %[[SCALAR_PH]] ], [ [[FOR2]], %[[FOR_BODY]] ] +; IF-EVL-NEXT: [[FOR2:%.*]] = phi i32 [ [[SCALAR_RECUR_INIT9]], %[[SCALAR_PH]] ], [ [[FOR1]], %[[FOR_BODY]] ] +; IF-EVL-NEXT: [[FOR3:%.*]] = phi i32 [ [[SCALAR_RECUR_INIT10]], %[[SCALAR_PH]] ], [ [[FOR2]], %[[FOR_BODY]] ] ; IF-EVL-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds nuw i32, ptr [[A]], i64 [[INDVARS]] ; IF-EVL-NEXT: [[TMP38]] = load i32, ptr [[ARRAYIDX]], align 4 ; IF-EVL-NEXT: [[ADD:%.*]] = add nsw i32 [[FOR2]], [[FOR3]] diff --git a/llvm/test/Transforms/LoopVectorize/dereferenceable-info-from-assumption-constant-size.ll b/llvm/test/Transforms/LoopVectorize/dereferenceable-info-from-assumption-constant-size.ll index 572511a5ffb92..90671689f1dce 100644 --- a/llvm/test/Transforms/LoopVectorize/dereferenceable-info-from-assumption-constant-size.ll +++ b/llvm/test/Transforms/LoopVectorize/dereferenceable-info-from-assumption-constant-size.ll @@ -1,9 +1,9 @@ ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 5 -; RUN: opt -p loop-vectorize -force-vector-width=2 -use-dereferenceable-at-point-semantics=1 -S %s | FileCheck %s +; RUN: opt -p loop-vectorize -force-vector-width=2 -use-dereferenceable-at-point-semantics -S %s | 
FileCheck %s declare void @llvm.assume(i1) -define void @deref_assumption_in_header_constant_trip_count(ptr noalias %a, ptr noalias %b, ptr noalias %c) nofree { +define void @deref_assumption_in_header_constant_trip_count(ptr noalias %a, ptr noalias %b, ptr noalias %c) nofree nosync{ ; CHECK-LABEL: define void @deref_assumption_in_header_constant_trip_count( ; CHECK-SAME: ptr noalias [[A:%.*]], ptr noalias [[B:%.*]], ptr noalias [[C:%.*]]) #[[ATTR1:[0-9]+]] { ; CHECK-NEXT: [[ENTRY:.*]]: @@ -104,7 +104,7 @@ exit: ret void } -define void @align_deref_assumption_in_header_constant_trip_count_loop_invariant_ptr(ptr noalias %a, ptr noalias %b, ptr noalias %c) nofree { +define void @align_deref_assumption_in_header_constant_trip_count_loop_invariant_ptr(ptr noalias %a, ptr noalias %b, ptr noalias %c) nofree nosync{ ; CHECK-LABEL: define void @align_deref_assumption_in_header_constant_trip_count_loop_invariant_ptr( ; CHECK-SAME: ptr noalias [[A:%.*]], ptr noalias [[B:%.*]], ptr noalias [[C:%.*]]) #[[ATTR1]] { ; CHECK-NEXT: [[ENTRY:.*]]: @@ -181,7 +181,7 @@ exit: ret void } -define void @deref_assumption_too_small_in_header_constant_trip_count(ptr noalias %a, ptr noalias %b, ptr noalias %c) nofree { +define void @deref_assumption_too_small_in_header_constant_trip_count(ptr noalias %a, ptr noalias %b, ptr noalias %c) nofree nosync{ ; CHECK-LABEL: define void @deref_assumption_too_small_in_header_constant_trip_count( ; CHECK-SAME: ptr noalias [[A:%.*]], ptr noalias [[B:%.*]], ptr noalias [[C:%.*]]) #[[ATTR1]] { ; CHECK-NEXT: [[ENTRY:.*]]: @@ -282,7 +282,7 @@ exit: ret void } -define void @deref_assumption_in_header_constant_trip_count_align_1(ptr noalias %a, ptr noalias %b, ptr noalias %c) nofree { +define void @deref_assumption_in_header_constant_trip_count_align_1(ptr noalias %a, ptr noalias %b, ptr noalias %c) nofree nosync{ ; CHECK-LABEL: define void @deref_assumption_in_header_constant_trip_count_align_1( ; CHECK-SAME: ptr noalias [[A:%.*]], ptr noalias [[B:%.*]], ptr noalias [[C:%.*]]) #[[ATTR1]] { ; CHECK-NEXT: [[ENTRY:.*]]: @@ -383,7 +383,7 @@ exit: ret void } -define void @deref_assumption_in_header_constant_trip_count_align_via_arg_attribute(ptr noalias align 4 %a, ptr noalias %b, ptr noalias %c) nofree { +define void @deref_assumption_in_header_constant_trip_count_align_via_arg_attribute(ptr noalias align 4 %a, ptr noalias %b, ptr noalias %c) nofree nosync{ ; CHECK-LABEL: define void @deref_assumption_in_header_constant_trip_count_align_via_arg_attribute( ; CHECK-SAME: ptr noalias align 4 [[A:%.*]], ptr noalias [[B:%.*]], ptr noalias [[C:%.*]]) #[[ATTR1]] { ; CHECK-NEXT: [[ENTRY:.*]]: @@ -484,7 +484,7 @@ exit: ret void } -define void @deref_assumption_in_header_constant_trip_count_align_not_known(ptr noalias %a, ptr noalias %b, ptr noalias %c) nofree { +define void @deref_assumption_in_header_constant_trip_count_align_not_known(ptr noalias %a, ptr noalias %b, ptr noalias %c) nofree nosync{ ; CHECK-LABEL: define void @deref_assumption_in_header_constant_trip_count_align_not_known( ; CHECK-SAME: ptr noalias [[A:%.*]], ptr noalias [[B:%.*]], ptr noalias [[C:%.*]]) #[[ATTR1]] { ; CHECK-NEXT: [[ENTRY:.*]]: @@ -585,7 +585,7 @@ exit: ret void } -define void @deref_assumption_in_then_constant_trip_count(ptr noalias %a, ptr noalias %b, ptr noalias %c) nofree { +define void @deref_assumption_in_then_constant_trip_count(ptr noalias %a, ptr noalias %b, ptr noalias %c) nofree nosync{ ; CHECK-LABEL: define void @deref_assumption_in_then_constant_trip_count( ; CHECK-SAME: ptr noalias [[A:%.*]], ptr 
noalias [[B:%.*]], ptr noalias [[C:%.*]]) #[[ATTR1]] { ; CHECK-NEXT: [[ENTRY:.*]]: @@ -682,7 +682,7 @@ exit: ret void } -define void @deref_assumption_in_latch_constant_trip_count(ptr noalias %a, ptr noalias %b, ptr noalias %c) nofree { +define void @deref_assumption_in_latch_constant_trip_count(ptr noalias %a, ptr noalias %b, ptr noalias %c) nofree nosync{ ; CHECK-LABEL: define void @deref_assumption_in_latch_constant_trip_count( ; CHECK-SAME: ptr noalias [[A:%.*]], ptr noalias [[B:%.*]], ptr noalias [[C:%.*]]) #[[ATTR1]] { ; CHECK-NEXT: [[ENTRY:.*]]: @@ -785,7 +785,7 @@ exit: ret void } -define void @deref_assumption_in_header_variable_trip_count(ptr noalias %a, ptr noalias %b, ptr noalias %c, i64 %N) nofree { +define void @deref_assumption_in_header_variable_trip_count(ptr noalias %a, ptr noalias %b, ptr noalias %c, i64 %N) nofree nosync{ ; CHECK-LABEL: define void @deref_assumption_in_header_variable_trip_count( ; CHECK-SAME: ptr noalias [[A:%.*]], ptr noalias [[B:%.*]], ptr noalias [[C:%.*]], i64 [[N:%.*]]) #[[ATTR1]] { ; CHECK-NEXT: [[ENTRY:.*]]: @@ -890,7 +890,7 @@ exit: ret void } -define void @deref_assumption_in_preheader_constant_trip_count_align_1(ptr noalias %a, ptr noalias %b, ptr noalias %c) nofree { +define void @deref_assumption_in_preheader_constant_trip_count_align_1(ptr noalias %a, ptr noalias %b, ptr noalias %c) nofree nosync{ ; CHECK-LABEL: define void @deref_assumption_in_preheader_constant_trip_count_align_1( ; CHECK-SAME: ptr noalias [[A:%.*]], ptr noalias [[B:%.*]], ptr noalias [[C:%.*]]) #[[ATTR1]] { ; CHECK-NEXT: [[ENTRY:.*]]: @@ -968,7 +968,7 @@ exit: ret void } -define void @deref_assumption_too_small_in_preheader_constant_trip_count_align_1(ptr noalias %a, ptr noalias %b, ptr noalias %c) nofree { +define void @deref_assumption_too_small_in_preheader_constant_trip_count_align_1(ptr noalias %a, ptr noalias %b, ptr noalias %c) nofree nosync{ ; CHECK-LABEL: define void @deref_assumption_too_small_in_preheader_constant_trip_count_align_1( ; CHECK-SAME: ptr noalias [[A:%.*]], ptr noalias [[B:%.*]], ptr noalias [[C:%.*]]) #[[ATTR1]] { ; CHECK-NEXT: [[ENTRY:.*]]: @@ -1063,7 +1063,7 @@ exit: ret void } -define void @align_and_deref_assumption_in_preheader_constant_trip_count_align_4(ptr noalias %a, ptr noalias %b, ptr noalias %c) nofree { +define void @align_and_deref_assumption_in_preheader_constant_trip_count_align_4(ptr noalias %a, ptr noalias %b, ptr noalias %c) nofree nosync{ ; CHECK-LABEL: define void @align_and_deref_assumption_in_preheader_constant_trip_count_align_4( ; CHECK-SAME: ptr noalias [[A:%.*]], ptr noalias [[B:%.*]], ptr noalias [[C:%.*]]) #[[ATTR1]] { ; CHECK-NEXT: [[ENTRY:.*]]: @@ -1142,7 +1142,7 @@ exit: } -define void @deref_assumption_in_preheader_constant_trip_count_align_4_known_via_argument_attr(ptr noalias align 4 %a, ptr noalias %b, ptr noalias %c) nofree { +define void @deref_assumption_in_preheader_constant_trip_count_align_4_known_via_argument_attr(ptr noalias align 4 %a, ptr noalias %b, ptr noalias %c) nofree nosync{ ; CHECK-LABEL: define void @deref_assumption_in_preheader_constant_trip_count_align_4_known_via_argument_attr( ; CHECK-SAME: ptr noalias align 4 [[A:%.*]], ptr noalias [[B:%.*]], ptr noalias [[C:%.*]]) #[[ATTR1]] { ; CHECK-NEXT: [[ENTRY:.*]]: @@ -1220,7 +1220,7 @@ exit: ret void } -define void @deref_assumption_in_preheader_constant_trip_count_align_4_not_known(ptr noalias %a, ptr noalias %b, ptr noalias %c) nofree { +define void @deref_assumption_in_preheader_constant_trip_count_align_4_not_known(ptr noalias %a, ptr 
noalias %b, ptr noalias %c) nofree nosync{ ; CHECK-LABEL: define void @deref_assumption_in_preheader_constant_trip_count_align_4_not_known( ; CHECK-SAME: ptr noalias [[A:%.*]], ptr noalias [[B:%.*]], ptr noalias [[C:%.*]]) #[[ATTR1]] { ; CHECK-NEXT: [[ENTRY:.*]]: @@ -1315,7 +1315,7 @@ exit: ret void } -define void @deref_assumption_too_small_in_preheader_constant_trip_count_align_4(ptr noalias %a, ptr noalias %b, ptr noalias %c) nofree { +define void @deref_assumption_too_small_in_preheader_constant_trip_count_align_4(ptr noalias %a, ptr noalias %b, ptr noalias %c) nofree nosync{ ; CHECK-LABEL: define void @deref_assumption_too_small_in_preheader_constant_trip_count_align_4( ; CHECK-SAME: ptr noalias [[A:%.*]], ptr noalias [[B:%.*]], ptr noalias [[C:%.*]]) #[[ATTR1]] { ; CHECK-NEXT: [[ENTRY:.*]]: @@ -1410,8 +1410,8 @@ exit: ret void } -; %a may be freeed between the dereferenceable assumption and accesses. -; FIXME: It is not safe to use with -use-dereferenceable-at-point-semantics. +; %a may be freed between the dereferenceable assumption and accesses. +; It is not safe to use with -use-dereferenceable-at-point-semantics. define void @may_free_align_deref_assumption_in_header_constant_trip_count_loop_invariant_ptr(ptr noalias %a, ptr noalias %b, ptr noalias %c) { ; CHECK-LABEL: define void @may_free_align_deref_assumption_in_header_constant_trip_count_loop_invariant_ptr( ; CHECK-SAME: ptr noalias [[A:%.*]], ptr noalias [[B:%.*]], ptr noalias [[C:%.*]]) { @@ -1422,16 +1422,29 @@ define void @may_free_align_deref_assumption_in_header_constant_trip_count_loop_ ; CHECK: [[VECTOR_PH]]: ; CHECK-NEXT: br label %[[VECTOR_BODY:.*]] ; CHECK: [[VECTOR_BODY]]: -; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ] +; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[PRED_LOAD_CONTINUE2:.*]] ] ; CHECK-NEXT: [[TMP0:%.*]] = add i64 [[INDEX]], 0 ; CHECK-NEXT: [[TMP1:%.*]] = getelementptr inbounds i32, ptr [[B]], i64 [[TMP0]] ; CHECK-NEXT: [[TMP2:%.*]] = getelementptr inbounds i32, ptr [[TMP1]], i32 0 ; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <2 x i32>, ptr [[TMP2]], align 4 ; CHECK-NEXT: [[TMP3:%.*]] = icmp sge <2 x i32> [[WIDE_LOAD]], zeroinitializer -; CHECK-NEXT: [[TMP5:%.*]] = load i32, ptr [[A]], align 4 +; CHECK-NEXT: [[TMP4:%.*]] = xor <2 x i1> [[TMP3]], splat (i1 true) +; CHECK-NEXT: [[TMP5:%.*]] = extractelement <2 x i1> [[TMP4]], i32 0 +; CHECK-NEXT: br i1 [[TMP5]], label %[[PRED_LOAD_IF:.*]], label %[[PRED_LOAD_CONTINUE:.*]] +; CHECK: [[PRED_LOAD_IF]]: ; CHECK-NEXT: [[TMP15:%.*]] = load i32, ptr [[A]], align 4 -; CHECK-NEXT: [[TMP13:%.*]] = insertelement <2 x i32> poison, i32 [[TMP5]], i32 0 -; CHECK-NEXT: [[TMP11:%.*]] = insertelement <2 x i32> [[TMP13]], i32 [[TMP15]], i32 1 +; CHECK-NEXT: [[TMP7:%.*]] = insertelement <2 x i32> poison, i32 [[TMP15]], i32 0 +; CHECK-NEXT: br label %[[PRED_LOAD_CONTINUE]] +; CHECK: [[PRED_LOAD_CONTINUE]]: +; CHECK-NEXT: [[TMP12:%.*]] = phi <2 x i32> [ poison, %[[VECTOR_BODY]] ], [ [[TMP7]], %[[PRED_LOAD_IF]] ] +; CHECK-NEXT: [[TMP13:%.*]] = extractelement <2 x i1> [[TMP4]], i32 1 +; CHECK-NEXT: br i1 [[TMP13]], label %[[PRED_LOAD_IF1:.*]], label %[[PRED_LOAD_CONTINUE2]] +; CHECK: [[PRED_LOAD_IF1]]: +; CHECK-NEXT: [[TMP14:%.*]] = load i32, ptr [[A]], align 4 +; CHECK-NEXT: [[TMP16:%.*]] = insertelement <2 x i32> [[TMP12]], i32 [[TMP14]], i32 1 +; CHECK-NEXT: br label %[[PRED_LOAD_CONTINUE2]] +; CHECK: [[PRED_LOAD_CONTINUE2]]: +; CHECK-NEXT: [[TMP11:%.*]] = phi <2 x i32> [ [[TMP12]], 
%[[PRED_LOAD_CONTINUE]] ], [ [[TMP16]], %[[PRED_LOAD_IF1]] ] ; CHECK-NEXT: [[PREDPHI:%.*]] = select <2 x i1> [[TMP3]], <2 x i32> [[WIDE_LOAD]], <2 x i32> [[TMP11]] ; CHECK-NEXT: [[TMP8:%.*]] = getelementptr inbounds i32, ptr [[C]], i64 [[TMP0]] ; CHECK-NEXT: [[TMP9:%.*]] = getelementptr inbounds i32, ptr [[TMP8]], i32 0 @@ -1491,6 +1504,103 @@ exit: ret void } +; %a may be freed between the dereferenceable assumption and accesses. +; It is not safe to use with -use-dereferenceable-at-point-semantics. +define void @may_free_local_ptr_align_deref_assumption_in_header_constant_trip_count_loop_invariant_ptr(ptr noalias %b, ptr noalias %c) nofree nosync { +; CHECK-LABEL: define void @may_free_local_ptr_align_deref_assumption_in_header_constant_trip_count_loop_invariant_ptr( +; CHECK-SAME: ptr noalias [[B:%.*]], ptr noalias [[C:%.*]]) #[[ATTR1]] { +; CHECK-NEXT: [[ENTRY:.*]]: +; CHECK-NEXT: [[A:%.*]] = call ptr @get_ptr() +; CHECK-NEXT: call void @llvm.assume(i1 true) [ "align"(ptr [[A]], i64 4), "dereferenceable"(ptr [[A]], i64 4) ] +; CHECK-NEXT: call void @may_free() +; CHECK-NEXT: br i1 false, label %[[SCALAR_PH:.*]], label %[[VECTOR_PH:.*]] +; CHECK: [[VECTOR_PH]]: +; CHECK-NEXT: br label %[[VECTOR_BODY:.*]] +; CHECK: [[VECTOR_BODY]]: +; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[PRED_LOAD_CONTINUE2:.*]] ] +; CHECK-NEXT: [[TMP0:%.*]] = add i64 [[INDEX]], 0 +; CHECK-NEXT: [[TMP1:%.*]] = getelementptr inbounds i32, ptr [[B]], i64 [[TMP0]] +; CHECK-NEXT: [[TMP2:%.*]] = getelementptr inbounds i32, ptr [[TMP1]], i32 0 +; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <2 x i32>, ptr [[TMP2]], align 4 +; CHECK-NEXT: [[TMP3:%.*]] = icmp sge <2 x i32> [[WIDE_LOAD]], zeroinitializer +; CHECK-NEXT: [[TMP4:%.*]] = xor <2 x i1> [[TMP3]], splat (i1 true) +; CHECK-NEXT: [[TMP5:%.*]] = extractelement <2 x i1> [[TMP4]], i32 0 +; CHECK-NEXT: br i1 [[TMP5]], label %[[PRED_LOAD_IF:.*]], label %[[PRED_LOAD_CONTINUE:.*]] +; CHECK: [[PRED_LOAD_IF]]: +; CHECK-NEXT: [[TMP6:%.*]] = load i32, ptr [[A]], align 4 +; CHECK-NEXT: [[TMP7:%.*]] = insertelement <2 x i32> poison, i32 [[TMP6]], i32 0 +; CHECK-NEXT: br label %[[PRED_LOAD_CONTINUE]] +; CHECK: [[PRED_LOAD_CONTINUE]]: +; CHECK-NEXT: [[TMP8:%.*]] = phi <2 x i32> [ poison, %[[VECTOR_BODY]] ], [ [[TMP7]], %[[PRED_LOAD_IF]] ] +; CHECK-NEXT: [[TMP9:%.*]] = extractelement <2 x i1> [[TMP4]], i32 1 +; CHECK-NEXT: br i1 [[TMP9]], label %[[PRED_LOAD_IF1:.*]], label %[[PRED_LOAD_CONTINUE2]] +; CHECK: [[PRED_LOAD_IF1]]: +; CHECK-NEXT: [[TMP10:%.*]] = load i32, ptr [[A]], align 4 +; CHECK-NEXT: [[TMP11:%.*]] = insertelement <2 x i32> [[TMP8]], i32 [[TMP10]], i32 1 +; CHECK-NEXT: br label %[[PRED_LOAD_CONTINUE2]] +; CHECK: [[PRED_LOAD_CONTINUE2]]: +; CHECK-NEXT: [[TMP12:%.*]] = phi <2 x i32> [ [[TMP8]], %[[PRED_LOAD_CONTINUE]] ], [ [[TMP11]], %[[PRED_LOAD_IF1]] ] +; CHECK-NEXT: [[PREDPHI:%.*]] = select <2 x i1> [[TMP3]], <2 x i32> [[WIDE_LOAD]], <2 x i32> [[TMP12]] +; CHECK-NEXT: [[TMP13:%.*]] = getelementptr inbounds i32, ptr [[C]], i64 [[TMP0]] +; CHECK-NEXT: [[TMP14:%.*]] = getelementptr inbounds i32, ptr [[TMP13]], i32 0 +; CHECK-NEXT: store <2 x i32> [[PREDPHI]], ptr [[TMP14]], align 4 +; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 2 +; CHECK-NEXT: [[TMP15:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1000 +; CHECK-NEXT: br i1 [[TMP15]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP34:![0-9]+]] +; CHECK: [[MIDDLE_BLOCK]]: +; CHECK-NEXT: br i1 true, label %[[EXIT:.*]], label %[[SCALAR_PH]] +; CHECK: [[SCALAR_PH]]: +; 
CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 1000, %[[MIDDLE_BLOCK]] ], [ 0, %[[ENTRY]] ] +; CHECK-NEXT: br label %[[LOOP_HEADER:.*]] +; CHECK: [[LOOP_HEADER]]: +; CHECK-NEXT: [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], %[[LOOP_LATCH:.*]] ] +; CHECK-NEXT: [[GEP_B:%.*]] = getelementptr inbounds i32, ptr [[B]], i64 [[IV]] +; CHECK-NEXT: [[L_B:%.*]] = load i32, ptr [[GEP_B]], align 4 +; CHECK-NEXT: [[C_1:%.*]] = icmp sge i32 [[L_B]], 0 +; CHECK-NEXT: br i1 [[C_1]], label %[[LOOP_LATCH]], label %[[LOOP_THEN:.*]] +; CHECK: [[LOOP_THEN]]: +; CHECK-NEXT: [[L_A:%.*]] = load i32, ptr [[A]], align 4 +; CHECK-NEXT: br label %[[LOOP_LATCH]] +; CHECK: [[LOOP_LATCH]]: +; CHECK-NEXT: [[MERGE:%.*]] = phi i32 [ [[L_A]], %[[LOOP_THEN]] ], [ [[L_B]], %[[LOOP_HEADER]] ] +; CHECK-NEXT: [[GEP_C:%.*]] = getelementptr inbounds i32, ptr [[C]], i64 [[IV]] +; CHECK-NEXT: store i32 [[MERGE]], ptr [[GEP_C]], align 4 +; CHECK-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1 +; CHECK-NEXT: [[EC:%.*]] = icmp eq i64 [[IV_NEXT]], 1000 +; CHECK-NEXT: br i1 [[EC]], label %[[EXIT]], label %[[LOOP_HEADER]], !llvm.loop [[LOOP35:![0-9]+]] +; CHECK: [[EXIT]]: +; CHECK-NEXT: ret void +; +entry: + %a = call ptr @get_ptr() + call void @llvm.assume(i1 true) [ "align"(ptr %a, i64 4), "dereferenceable"(ptr %a, i64 4) ] + call void @may_free() + br label %loop.header + +loop.header: + %iv = phi i64 [ 0, %entry ], [ %iv.next, %loop.latch ] + %gep.b = getelementptr inbounds i32, ptr %b, i64 %iv + %l.b = load i32, ptr %gep.b, align 4 + %c.1 = icmp sge i32 %l.b, 0 + br i1 %c.1, label %loop.latch, label %loop.then + +loop.then: + %l.a = load i32, ptr %a, align 4 + br label %loop.latch + +loop.latch: + %merge = phi i32 [ %l.a, %loop.then ], [ %l.b, %loop.header ] + %gep.c = getelementptr inbounds i32, ptr %c, i64 %iv + store i32 %merge, ptr %gep.c, align 4 + %iv.next = add nuw nsw i64 %iv, 1 + %ec = icmp eq i64 %iv.next, 1000 + br i1 %ec, label %exit, label %loop.header + +exit: + ret void +} + +declare ptr @get_ptr() declare void @may_free() ;. @@ -1528,4 +1638,6 @@ declare void @may_free() ; CHECK: [[LOOP31]] = distinct !{[[LOOP31]], [[META2]], [[META1]]} ; CHECK: [[LOOP32]] = distinct !{[[LOOP32]], [[META1]], [[META2]]} ; CHECK: [[LOOP33]] = distinct !{[[LOOP33]], [[META2]], [[META1]]} +; CHECK: [[LOOP34]] = distinct !{[[LOOP34]], [[META1]], [[META2]]} +; CHECK: [[LOOP35]] = distinct !{[[LOOP35]], [[META2]], [[META1]]} ;. 
diff --git a/llvm/test/Transforms/PhaseOrdering/AArch64/sve-interleave-vectorization.ll b/llvm/test/Transforms/PhaseOrdering/AArch64/sve-interleave-vectorization.ll new file mode 100644 index 0000000000000..362ec22600f92 --- /dev/null +++ b/llvm/test/Transforms/PhaseOrdering/AArch64/sve-interleave-vectorization.ll @@ -0,0 +1,135 @@ +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py +; RUN: opt -passes=loop-vectorize,interleaved-access -mattr=+sve -S -o - %s | FileCheck %s + +target datalayout = "e-m:o-i64:64-i128:128-n32:64-S128" +target triple = "aarch64" + +%struct.xyzt = type { i32, i32, i32, i32 } +; for (int i = 0; i < 1024; ++i) { +; dst[i].x = a[i].x + b[i].x; +; dst[i].y = a[i].y - b[i].y; +; dst[i].z = a[i].z << b[i].z; +; dst[i].t = a[i].t >> b[i].t; +; } + +define void @interleave_deinterleave(ptr noalias %dst, ptr %a, ptr %b) { +; CHECK-LABEL: @interleave_deinterleave( +; CHECK-NEXT: entry: +; CHECK-NEXT: [[TMP0:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-NEXT: [[TMP1:%.*]] = mul i64 [[TMP0]], 4 +; CHECK-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 1024, [[TMP1]] +; CHECK-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[FOR_BODY_PREHEADER:%.*]], label [[VECTOR_PH:%.*]] +; CHECK: vector.ph: +; CHECK-NEXT: [[TMP2:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-NEXT: [[TMP3:%.*]] = mul i64 [[TMP2]], 4 +; CHECK-NEXT: [[N_MOD_VF:%.*]] = urem i64 1024, [[TMP3]] +; CHECK-NEXT: [[N_VEC:%.*]] = sub i64 1024, [[N_MOD_VF]] +; CHECK-NEXT: [[TMP4:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-NEXT: [[TMP5:%.*]] = mul i64 [[TMP4]], 4 +; CHECK-NEXT: br label [[VECTOR_BODY:%.*]] +; CHECK: vector.body: +; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] +; CHECK-NEXT: [[TMP6:%.*]] = add i64 [[INDEX]], 0 +; CHECK-NEXT: [[TMP7:%.*]] = getelementptr inbounds [[STRUCT_XYZT:%.*]], ptr [[A:%.*]], i64 [[TMP6]] +; CHECK-NEXT: [[LDN:%.*]] = call { <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32> } @llvm.aarch64.sve.ld4.sret.nxv4i32(<vscale x 4 x i1> splat (i1 true), ptr [[TMP7]]) +; CHECK-NEXT: [[TMP9:%.*]] = extractvalue { <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32> } [[LDN]], 0 +; CHECK-NEXT: [[TMP10:%.*]] = extractvalue { <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32> } [[LDN]], 1 +; CHECK-NEXT: [[TMP11:%.*]] = extractvalue { <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32> } [[LDN]], 2 +; CHECK-NEXT: [[TMP12:%.*]] = extractvalue { <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32> } [[LDN]], 3 +; CHECK-NEXT: [[TMP13:%.*]] = getelementptr inbounds [[STRUCT_XYZT]], ptr [[B:%.*]], i64 [[TMP6]] +; CHECK-NEXT: [[LDN9:%.*]] = call { <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32> } @llvm.aarch64.sve.ld4.sret.nxv4i32(<vscale x 4 x i1> splat (i1 true), ptr [[TMP13]]) +; CHECK-NEXT: [[TMP16:%.*]] = extractvalue { <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32> } [[LDN9]], 0 +; CHECK-NEXT: [[TMP17:%.*]] = extractvalue { <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32> } [[LDN9]], 1 +; CHECK-NEXT: [[TMP18:%.*]] = extractvalue { <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32> } [[LDN9]], 2 +; CHECK-NEXT: [[TMP19:%.*]] = extractvalue { <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32> } [[LDN9]], 3 +; CHECK-NEXT: [[TMP20:%.*]] = add nsw <vscale x 4 x i32> [[TMP16]], [[TMP9]] +; CHECK-NEXT: [[TMP21:%.*]] = getelementptr inbounds [[STRUCT_XYZT]], ptr [[DST:%.*]], i64 [[TMP6]] +; CHECK-NEXT: [[TMP22:%.*]] = sub nsw <vscale x 4 x i32> [[TMP10]], [[TMP17]] +; CHECK-NEXT: [[TMP23:%.*]] = shl <vscale x 4 x i32> [[TMP11]], [[TMP18]] +; CHECK-NEXT: [[TMP24:%.*]] = ashr <vscale x 4 x i32> [[TMP12]], [[TMP19]] +; CHECK-NEXT: call void @llvm.aarch64.sve.st4.nxv4i32(<vscale x 4 x i32> [[TMP20]], <vscale x 4 x i32> [[TMP22]], <vscale x 4 x i32> [[TMP23]], <vscale x 4 x i32> [[TMP24]], <vscale x 4 x i1> splat (i1 true), ptr [[TMP21]]) +; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP5]] +; CHECK-NEXT: [[TMP25:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] +; CHECK-NEXT: br i1 [[TMP25]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]] +; CHECK: middle.block: +; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 1024, [[N_VEC]] +; CHECK-NEXT: br i1 
[[CMP_N]], label [[FOR_END:%.*]], label [[FOR_BODY_PREHEADER]] +; CHECK: scalar.ph: +; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ] +; CHECK-NEXT: br label [[FOR_BODY:%.*]] +; CHECK: for.body: +; CHECK-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[FOR_BODY_PREHEADER]] ], [ [[INDVARS_IV_NEXT:%.*]], [[FOR_BODY]] ] +; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds [[STRUCT_XYZT]], ptr [[A]], i64 [[INDVARS_IV]] +; CHECK-NEXT: [[TMP31:%.*]] = load i32, ptr [[ARRAYIDX]], align 4 +; CHECK-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds [[STRUCT_XYZT]], ptr [[B]], i64 [[INDVARS_IV]] +; CHECK-NEXT: [[TMP32:%.*]] = load i32, ptr [[ARRAYIDX2]], align 4 +; CHECK-NEXT: [[ADD:%.*]] = add nsw i32 [[TMP32]], [[TMP31]] +; CHECK-NEXT: [[ARRAYIDX5:%.*]] = getelementptr inbounds [[STRUCT_XYZT]], ptr [[DST]], i64 [[INDVARS_IV]] +; CHECK-NEXT: store i32 [[ADD]], ptr [[ARRAYIDX5]], align 4 +; CHECK-NEXT: [[Y:%.*]] = getelementptr inbounds nuw i8, ptr [[ARRAYIDX]], i64 4 +; CHECK-NEXT: [[TMP33:%.*]] = load i32, ptr [[Y]], align 4 +; CHECK-NEXT: [[Y11:%.*]] = getelementptr inbounds nuw i8, ptr [[ARRAYIDX2]], i64 4 +; CHECK-NEXT: [[TMP26:%.*]] = load i32, ptr [[Y11]], align 4 +; CHECK-NEXT: [[SUB:%.*]] = sub nsw i32 [[TMP33]], [[TMP26]] +; CHECK-NEXT: [[Y14:%.*]] = getelementptr inbounds nuw i8, ptr [[ARRAYIDX5]], i64 4 +; CHECK-NEXT: store i32 [[SUB]], ptr [[Y14]], align 4 +; CHECK-NEXT: [[Z:%.*]] = getelementptr inbounds nuw i8, ptr [[ARRAYIDX]], i64 8 +; CHECK-NEXT: [[TMP27:%.*]] = load i32, ptr [[Z]], align 4 +; CHECK-NEXT: [[Z19:%.*]] = getelementptr inbounds nuw i8, ptr [[ARRAYIDX2]], i64 8 +; CHECK-NEXT: [[TMP28:%.*]] = load i32, ptr [[Z19]], align 4 +; CHECK-NEXT: [[SHL:%.*]] = shl i32 [[TMP27]], [[TMP28]] +; CHECK-NEXT: [[Z22:%.*]] = getelementptr inbounds nuw i8, ptr [[ARRAYIDX5]], i64 8 +; CHECK-NEXT: store i32 [[SHL]], ptr [[Z22]], align 4 +; CHECK-NEXT: [[T:%.*]] = getelementptr inbounds nuw i8, ptr [[ARRAYIDX]], i64 12 +; CHECK-NEXT: [[TMP29:%.*]] = load i32, ptr [[T]], align 4 +; CHECK-NEXT: [[T27:%.*]] = getelementptr inbounds nuw i8, ptr [[ARRAYIDX2]], i64 12 +; CHECK-NEXT: [[TMP30:%.*]] = load i32, ptr [[T27]], align 4 +; CHECK-NEXT: [[SHR:%.*]] = ashr i32 [[TMP29]], [[TMP30]] +; CHECK-NEXT: [[T30:%.*]] = getelementptr inbounds nuw i8, ptr [[ARRAYIDX5]], i64 12 +; CHECK-NEXT: store i32 [[SHR]], ptr [[T30]], align 4 +; CHECK-NEXT: [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1 +; CHECK-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT]], 1024 +; CHECK-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_END]], label [[FOR_BODY]], !llvm.loop [[LOOP3:![0-9]+]] +; CHECK: for.end: +; CHECK-NEXT: ret void +; +entry: + br label %for.body + +for.body: + %iv = phi i64 [ 0, %entry ], [ %iv.next, %for.body ] + %gep.a = getelementptr inbounds %struct.xyzt, ptr %a, i64 %iv + %a.0 = load i32, ptr %gep.a, align 4 + %gep.b = getelementptr inbounds %struct.xyzt, ptr %b, i64 %iv + %b.0 = load i32, ptr %gep.b, align 4 + %add = add nsw i32 %b.0, %a.0 + %gep.dst = getelementptr inbounds %struct.xyzt, ptr %dst, i64 %iv + store i32 %add, ptr %gep.dst, align 4 + %gep.a.1 = getelementptr inbounds nuw i8, ptr %gep.a, i64 4 + %a.1 = load i32, ptr %gep.a.1, align 4 + %gep.b.1 = getelementptr inbounds nuw i8, ptr %gep.b, i64 4 + %b.1 = load i32, ptr %gep.b.1, align 4 + %sub = sub nsw i32 %a.1, %b.1 + %gep.dst.1 = getelementptr inbounds nuw i8, ptr %gep.dst, i64 4 + store i32 %sub, ptr %gep.dst.1, align 4 + %gep.a.2 = getelementptr inbounds nuw i8, 
ptr %gep.a, i64 8 + %a.2 = load i32, ptr %gep.a.2, align 4 + %gep.b.2 = getelementptr inbounds nuw i8, ptr %gep.b, i64 8 + %b.2 = load i32, ptr %gep.b.2, align 4 + %shl = shl i32 %a.2, %b.2 + %gep.dst.2 = getelementptr inbounds nuw i8, ptr %gep.dst, i64 8 + store i32 %shl, ptr %gep.dst.2, align 4 + %gep.a.3 = getelementptr inbounds nuw i8, ptr %gep.a, i64 12 + %a.3 = load i32, ptr %gep.a.3, align 4 + %gep.b.3 = getelementptr inbounds nuw i8, ptr %gep.b, i64 12 + %b.3 = load i32, ptr %gep.b.3, align 4 + %shr = ashr i32 %a.3, %b.3 + %gep.dst.3 = getelementptr inbounds nuw i8, ptr %gep.dst, i64 12 + store i32 %shr, ptr %gep.dst.3, align 4 + %iv.next = add nuw nsw i64 %iv, 1 + %exitcond.not = icmp eq i64 %iv.next, 1024 + br i1 %exitcond.not, label %for.end, label %for.body + +for.end: + ret void +} diff --git a/llvm/unittests/CodeGen/SelectionDAGPatternMatchTest.cpp b/llvm/unittests/CodeGen/SelectionDAGPatternMatchTest.cpp index bf9c597d8ac5e..736a36da97f57 100644 --- a/llvm/unittests/CodeGen/SelectionDAGPatternMatchTest.cpp +++ b/llvm/unittests/CodeGen/SelectionDAGPatternMatchTest.cpp @@ -392,6 +392,7 @@ TEST_F(SelectionDAGPatternMatchTest, matchUnaryOp) { SDValue FPToSI = DAG->getNode(ISD::FP_TO_SINT, DL, FloatVT, Op2); SDValue FPToUI = DAG->getNode(ISD::FP_TO_UINT, DL, FloatVT, Op2); + SDValue Bcast = DAG->getNode(ISD::BITCAST, DL, FloatVT, Op0); SDValue Brev = DAG->getNode(ISD::BITREVERSE, DL, Int32VT, Op0); SDValue Bswap = DAG->getNode(ISD::BSWAP, DL, Int32VT, Op0); @@ -423,8 +424,12 @@ TEST_F(SelectionDAGPatternMatchTest, matchUnaryOp) { EXPECT_FALSE(sd_match(FPToUI, m_FPToSI(m_Value()))); EXPECT_FALSE(sd_match(FPToSI, m_FPToUI(m_Value()))); + EXPECT_TRUE(sd_match(Bcast, m_BitCast(m_Value()))); + EXPECT_TRUE(sd_match(Bcast, m_BitCast(m_SpecificVT(MVT::i32)))); EXPECT_TRUE(sd_match(Brev, m_BitReverse(m_Value()))); EXPECT_TRUE(sd_match(Bswap, m_BSwap(m_Value()))); + EXPECT_FALSE(sd_match(Bcast, m_BitReverse(m_Value()))); + EXPECT_FALSE(sd_match(Bcast, m_BitCast(m_SpecificVT(MVT::f32)))); EXPECT_FALSE(sd_match(Brev, m_BSwap(m_Value()))); EXPECT_FALSE(sd_match(Bswap, m_BitReverse(m_Value()))); diff --git a/mlir/include/mlir/Target/SPIRV/Deserialization.h b/mlir/include/mlir/Target/SPIRV/Deserialization.h index e39258beeaac8..a346a7fd1e5f7 100644 --- a/mlir/include/mlir/Target/SPIRV/Deserialization.h +++ b/mlir/include/mlir/Target/SPIRV/Deserialization.h @@ -15,6 +15,7 @@ #include "mlir/IR/OwningOpRef.h" #include "mlir/Support/LLVM.h" +#include <cstdint> namespace mlir { class MLIRContext; diff --git a/mlir/test/Target/LLVMIR/openmp-llvm.mlir b/mlir/test/Target/LLVMIR/openmp-llvm.mlir index 390ecabaef21b..4e4b9e5698fe9 100644 --- a/mlir/test/Target/LLVMIR/openmp-llvm.mlir +++ b/mlir/test/Target/LLVMIR/openmp-llvm.mlir @@ -1368,6 +1368,77 @@ llvm.func @omp_atomic_read(%arg0 : !llvm.ptr, %arg1 : !llvm.ptr) -> () { // ----- +// CHECK-LABEL: @omp_atomic_read_implicit_cast +llvm.func @omp_atomic_read_implicit_cast () { +//CHECK: %[[Z:.*]] = alloca float, i64 1, align 4 +//CHECK: %[[Y:.*]] = alloca double, i64 1, align 8 +//CHECK: %[[X:.*]] = alloca [2 x { float, float }], i64 1, align 8 +//CHECK: %[[W:.*]] = alloca i32, i64 1, align 4 +//CHECK: %[[X_ELEMENT:.*]] = getelementptr { float, float }, ptr %3, i64 0 + %0 = llvm.mlir.constant(1 : i64) : i64 + %1 = llvm.alloca %0 x f32 {bindc_name = "z"} : (i64) -> !llvm.ptr + %2 = llvm.mlir.constant(1 : i64) : i64 + %3 = llvm.alloca %2 x f64 {bindc_name = "y"} : (i64) -> !llvm.ptr + %4 = llvm.mlir.constant(1 : i64) : i64 + %5 = llvm.alloca %4 x !llvm.array<2 x 
struct<(f32, f32)>> {bindc_name = "x"} : (i64) -> !llvm.ptr + %6 = llvm.mlir.constant(1 : i64) : i64 + %7 = llvm.alloca %6 x i32 {bindc_name = "w"} : (i64) -> !llvm.ptr + %8 = llvm.mlir.constant(1 : index) : i64 + %9 = llvm.mlir.constant(2 : index) : i64 + %10 = llvm.mlir.constant(1 : i64) : i64 + %11 = llvm.mlir.constant(0 : i64) : i64 + %12 = llvm.sub %8, %10 overflow<nsw> : i64 + %13 = llvm.mul %12, %10 overflow<nsw> : i64 + %14 = llvm.mul %13, %10 overflow<nsw> : i64 + %15 = llvm.add %14, %11 overflow<nsw> : i64 + %16 = llvm.mul %10, %9 overflow<nsw> : i64 + %17 = llvm.getelementptr %5[%15] : (!llvm.ptr, i64) -> !llvm.ptr, !llvm.struct<(f32, f32)> + +//CHECK: %[[ATOMIC_LOAD_TEMP:.*]] = alloca { float, float }, align 8 +//CHECK: call void @__atomic_load(i64 8, ptr %[[X_ELEMENT]], ptr %[[ATOMIC_LOAD_TEMP]], i32 0) +//CHECK: %[[LOAD:.*]] = load { float, float }, ptr %[[ATOMIC_LOAD_TEMP]], align 8 +//CHECK: %[[EXT:.*]] = extractvalue { float, float } %[[LOAD]], 0 +//CHECK: store float %[[EXT]], ptr %[[Y]], align 4 + omp.atomic.read %3 = %17 : !llvm.ptr, !llvm.ptr, !llvm.struct<(f32, f32)> + +//CHECK: %[[ATOMIC_LOAD_TEMP:.*]] = load atomic i32, ptr %[[Z]] monotonic, align 4 +//CHECK: %[[CAST:.*]] = bitcast i32 %[[ATOMIC_LOAD_TEMP]] to float +//CHECK: %[[LOAD:.*]] = fpext float %[[CAST]] to double +//CHECK: store double %[[LOAD]], ptr %[[Y]], align 8 + omp.atomic.read %3 = %1 : !llvm.ptr, !llvm.ptr, f32 + +//CHECK: %[[ATOMIC_LOAD_TEMP:.*]] = load atomic i32, ptr %[[W]] monotonic, align 4 +//CHECK: %[[LOAD:.*]] = sitofp i32 %[[ATOMIC_LOAD_TEMP]] to double +//CHECK: store double %[[LOAD]], ptr %[[Y]], align 8 + omp.atomic.read %3 = %7 : !llvm.ptr, !llvm.ptr, i32 + +//CHECK: %[[ATOMIC_LOAD_TEMP:.*]] = load atomic i64, ptr %[[Y]] monotonic, align 4 +//CHECK: %[[CAST:.*]] = bitcast i64 %[[ATOMIC_LOAD_TEMP]] to double +//CHECK: %[[LOAD:.*]] = fptrunc double %[[CAST]] to float +//CHECK: store float %[[LOAD]], ptr %[[Z]], align 4 + omp.atomic.read %1 = %3 : !llvm.ptr, !llvm.ptr, f64 + +//CHECK: %[[ATOMIC_LOAD_TEMP:.*]] = load atomic i32, ptr %[[W]] monotonic, align 4 +//CHECK: %[[LOAD:.*]] = sitofp i32 %[[ATOMIC_LOAD_TEMP]] to float +//CHECK: store float %[[LOAD]], ptr %[[Z]], align 4 + omp.atomic.read %1 = %7 : !llvm.ptr, !llvm.ptr, i32 + +//CHECK: %[[ATOMIC_LOAD_TEMP:.*]] = load atomic i64, ptr %[[Y]] monotonic, align 4 +//CHECK: %[[CAST:.*]] = bitcast i64 %[[ATOMIC_LOAD_TEMP]] to double +//CHECK: %[[LOAD:.*]] = fptosi double %[[CAST]] to i32 +//CHECK: store i32 %[[LOAD]], ptr %[[W]], align 4 + omp.atomic.read %7 = %3 : !llvm.ptr, !llvm.ptr, f64 + +//CHECK: %[[ATOMIC_LOAD_TEMP:.*]] = load atomic i32, ptr %[[Z]] monotonic, align 4 +//CHECK: %[[CAST:.*]] = bitcast i32 %[[ATOMIC_LOAD_TEMP]] to float +//CHECK: %[[LOAD:.*]] = fptosi float %[[CAST]] to i32 +//CHECK: store i32 %[[LOAD]], ptr %[[W]], align 4 + omp.atomic.read %7 = %1 : !llvm.ptr, !llvm.ptr, f32 + llvm.return +} + +// ----- + // CHECK-LABEL: @omp_atomic_write // CHECK-SAME: (ptr %[[x:.*]], i32 %[[expr:.*]]) llvm.func @omp_atomic_write(%x: !llvm.ptr, %expr: i32) -> () { diff --git a/offload/cmake/caches/AMDGPUBot.cmake b/offload/cmake/caches/AMDGPUBot.cmake index d72b620ae3080..69bef91b2ce49 100644 --- a/offload/cmake/caches/AMDGPUBot.cmake +++ b/offload/cmake/caches/AMDGPUBot.cmake @@ -1,17 +1,19 @@ -# This file is meant for test builds on one basic AMDGPU buildbot only. 
+# This file is used across all AMDGPU-cmake builders # Install directory set to /tmp as this is a bot config set(CMAKE_INSTALL_PREFIX /tmp/llvm.install.test CACHE STRING "") +# General settings +set(CMAKE_BUILD_TYPE Release CACHE STRING "") +set(CMAKE_C_COMPILER_LAUNCHER ccache CACHE STRING "") +set(CMAKE_CXX_COMPILER_LAUNCHER ccache CACHE STRING "") + set(LLVM_ENABLE_PROJECTS "clang;lld" CACHE STRING "") set(LLVM_ENABLE_RUNTIMES "compiler-rt;openmp;offload" CACHE STRING "") + set(LLVM_ENABLE_PER_TARGET_RUNTIME_DIR ON CACHE BOOL "") set(LLVM_ENABLE_ASSERTIONS ON CACHE BOOL "") -set(LLVM_LIT_ARGS "-v --show-unsupported --timeout 100 --show-xfail -j 32" CACHE STRING "") set(LLVM_TARGETS_TO_BUILD "host;AMDGPU" CACHE STRING "") +set(LLVM_LIT_ARGS "-v --show-unsupported --timeout 100 --show-xfail -j 32" CACHE STRING "") set(CLANG_DEFAULT_LINKER "lld" CACHE STRING "") - -set(CMAKE_BUILD_TYPE Release CACHE STRING "") -set(CMAKE_C_COMPILER_LAUNCHER ccache CACHE STRING "") -set(CMAKE_CXX_COMPILER_LAUNCHER ccache CACHE STRING "") diff --git a/openmp/runtime/src/z_Linux_asm.S b/openmp/runtime/src/z_Linux_asm.S index cc5344cdd124a..0bf9f07a13f14 100644 --- a/openmp/runtime/src/z_Linux_asm.S +++ b/openmp/runtime/src/z_Linux_asm.S @@ -19,6 +19,16 @@ #if KMP_ARCH_X86 || KMP_ARCH_X86_64 +# if defined(__ELF__) && defined(__CET__) && defined(__has_include) +# if __has_include(<cet.h>) +# include <cet.h> +# endif +# endif + +# if !defined(_CET_ENDBR) +# define _CET_ENDBR +# endif + # if KMP_MIC // the 'delay r16/r32/r64' should be used instead of the 'pause'. // The delay operation has the effect of removing the current thread from @@ -66,6 +76,7 @@ ALIGN 4 .globl KMP_PREFIX_UNDERSCORE($0) KMP_PREFIX_UNDERSCORE($0): + _CET_ENDBR .endmacro # else // KMP_OS_DARWIN # define KMP_PREFIX_UNDERSCORE(x) x //no extra underscore for Linux* OS symbols @@ -92,6 +103,7 @@ KMP_PREFIX_UNDERSCORE($0): .globl KMP_PREFIX_UNDERSCORE(\proc) KMP_PREFIX_UNDERSCORE(\proc): .cfi_startproc + _CET_ENDBR .endm .macro KMP_CFI_DEF_OFFSET sz .cfi_def_cfa_offset \sz diff --git a/utils/bazel/llvm-project-overlay/clang/BUILD.bazel b/utils/bazel/llvm-project-overlay/clang/BUILD.bazel index f72babb646a85..2286d4cd35e08 100644 --- a/utils/bazel/llvm-project-overlay/clang/BUILD.bazel +++ b/utils/bazel/llvm-project-overlay/clang/BUILD.bazel @@ -700,6 +700,7 @@ cc_library( includes = ["include"], textual_hdrs = [ # keep sorted + "include/clang/Basic/AllDiagnosticKinds.inc", "include/clang/Basic/AttrHasAttributeImpl.inc", "include/clang/Basic/AttrList.inc", "include/clang/Basic/AttrSubMatchRulesList.inc",