From 33de6aa8b234febb21e3611950446ff551cadde7 Mon Sep 17 00:00:00 2001 From: Younan Zhang Date: Wed, 12 Nov 2025 20:10:35 +0800 Subject: [PATCH 01/15] [Clang][NFC] Fix a typo in FunctionDecl (#167677) Found it while looking at other stuffs. --- clang/include/clang/AST/Decl.h | 2 +- clang/lib/AST/Decl.cpp | 2 +- clang/lib/Sema/SemaDeclCXX.cpp | 2 +- clang/lib/Sema/SemaTemplateInstantiateDecl.cpp | 2 +- clang/lib/Serialization/ASTWriterDecl.cpp | 2 +- 5 files changed, 5 insertions(+), 5 deletions(-) diff --git a/clang/include/clang/AST/Decl.h b/clang/include/clang/AST/Decl.h index 406d79ebd6641..ee2321dd158d4 100644 --- a/clang/include/clang/AST/Decl.h +++ b/clang/include/clang/AST/Decl.h @@ -2335,7 +2335,7 @@ class FunctionDecl : public DeclaratorDecl, } void setDefaultedOrDeletedInfo(DefaultedOrDeletedFunctionInfo *Info); - DefaultedOrDeletedFunctionInfo *getDefalutedOrDeletedInfo() const; + DefaultedOrDeletedFunctionInfo *getDefaultedOrDeletedInfo() const; /// Whether this function is variadic. bool isVariadic() const; diff --git a/clang/lib/AST/Decl.cpp b/clang/lib/AST/Decl.cpp index 8579e51e45697..eff2b81d61a50 100644 --- a/clang/lib/AST/Decl.cpp +++ b/clang/lib/AST/Decl.cpp @@ -3180,7 +3180,7 @@ void FunctionDecl::DefaultedOrDeletedFunctionInfo::setDeletedMessage( } FunctionDecl::DefaultedOrDeletedFunctionInfo * -FunctionDecl::getDefalutedOrDeletedInfo() const { +FunctionDecl::getDefaultedOrDeletedInfo() const { return FunctionDeclBits.HasDefaultedOrDeletedInfo ? 
DefaultedOrDeletedInfo : nullptr; } diff --git a/clang/lib/Sema/SemaDeclCXX.cpp b/clang/lib/Sema/SemaDeclCXX.cpp index d41ab126c426f..8030aac3d8771 100644 --- a/clang/lib/Sema/SemaDeclCXX.cpp +++ b/clang/lib/Sema/SemaDeclCXX.cpp @@ -8035,7 +8035,7 @@ class DefaultedComparisonVisitor { DefaultedComparisonVisitor(Sema &S, CXXRecordDecl *RD, FunctionDecl *FD, DefaultedComparisonKind DCK) : S(S), RD(RD), FD(FD), DCK(DCK) { - if (auto *Info = FD->getDefalutedOrDeletedInfo()) { + if (auto *Info = FD->getDefaultedOrDeletedInfo()) { // FIXME: Change CreateOverloadedBinOp to take an ArrayRef instead of an // UnresolvedSet to avoid this copy. Fns.assign(Info->getUnqualifiedLookups().begin(), diff --git a/clang/lib/Sema/SemaTemplateInstantiateDecl.cpp b/clang/lib/Sema/SemaTemplateInstantiateDecl.cpp index a56017cd7b7e7..1b6b559c1227b 100644 --- a/clang/lib/Sema/SemaTemplateInstantiateDecl.cpp +++ b/clang/lib/Sema/SemaTemplateInstantiateDecl.cpp @@ -5465,7 +5465,7 @@ TemplateDeclInstantiator::InitMethodInstantiation(CXXMethodDecl *New, bool TemplateDeclInstantiator::SubstDefaultedFunction(FunctionDecl *New, FunctionDecl *Tmpl) { // Transfer across any unqualified lookups. - if (auto *DFI = Tmpl->getDefalutedOrDeletedInfo()) { + if (auto *DFI = Tmpl->getDefaultedOrDeletedInfo()) { SmallVector Lookups; Lookups.reserve(DFI->getUnqualifiedLookups().size()); bool AnyChanged = false; diff --git a/clang/lib/Serialization/ASTWriterDecl.cpp b/clang/lib/Serialization/ASTWriterDecl.cpp index a8c487005f6ec..c9f8797ab973f 100644 --- a/clang/lib/Serialization/ASTWriterDecl.cpp +++ b/clang/lib/Serialization/ASTWriterDecl.cpp @@ -903,7 +903,7 @@ void ASTDeclWriter::VisitFunctionDecl(FunctionDecl *D) { Record.push_back(D->getODRHash()); if (D->isDefaulted() || D->isDeletedAsWritten()) { - if (auto *FDI = D->getDefalutedOrDeletedInfo()) { + if (auto *FDI = D->getDefaultedOrDeletedInfo()) { // Store both that there is an DefaultedOrDeletedInfo and whether it // contains a DeletedMessage. 
StringLiteral *DeletedMessage = FDI->getDeletedMessage(); From 07cd105416561e1b1406173d73314c60c8dc961f Mon Sep 17 00:00:00 2001 From: Sander de Smalen Date: Wed, 12 Nov 2025 12:06:51 +0000 Subject: [PATCH 02/15] [AArch64] Add 'REQUIRES: asserts' to regalloc-hint-movprfx.mir This should fix the buildbot failure reported here: https://lab.llvm.org/buildbot/#/builders/11/builds/27869 --- llvm/test/CodeGen/AArch64/regalloc-hint-movprfx.mir | 1 + 1 file changed, 1 insertion(+) diff --git a/llvm/test/CodeGen/AArch64/regalloc-hint-movprfx.mir b/llvm/test/CodeGen/AArch64/regalloc-hint-movprfx.mir index c2d8f8e73772d..05f583e2e692f 100644 --- a/llvm/test/CodeGen/AArch64/regalloc-hint-movprfx.mir +++ b/llvm/test/CodeGen/AArch64/regalloc-hint-movprfx.mir @@ -1,4 +1,5 @@ # RUN: llc -mtriple=aarch64 -mattr=+sve -start-before=greedy -stop-after=virtregrewriter -debug-only=regalloc %s -o /dev/null 2>&1 | FileCheck %s --check-prefix=DBG +# REQUIRES: asserts # Check that the register allocator gets hints to reuse registers of one of it's operands. --- From 7838dbee3a307cd8bd129ee8dbb998209133bffe Mon Sep 17 00:00:00 2001 From: Jack Styles Date: Wed, 12 Nov 2025 13:15:34 +0000 Subject: [PATCH 03/15] [Flang][OpenMP] Add Lowering support for Collapse with Taskloop (#166791) Support for lowering collapse already exists within `genLoopNestClauses`, which is called when lowering taskloop. However, the TODO message still included the Collapse clause, so it was not activated. By removing this, it enables lowering of the Collapse clause in taskloop. 
--- flang/lib/Lower/OpenMP/OpenMP.cpp | 2 -- .../Lower/OpenMP/Todo/taskloop-collapse.f90 | 15 -------- flang/test/Lower/OpenMP/taskloop-collapse.f90 | 34 +++++++++++++++++++ flang/test/Semantics/OpenMP/taskloop04.f90 | 15 ++++++++ 4 files changed, 49 insertions(+), 17 deletions(-) delete mode 100644 flang/test/Lower/OpenMP/Todo/taskloop-collapse.f90 create mode 100644 flang/test/Lower/OpenMP/taskloop-collapse.f90 create mode 100644 flang/test/Semantics/OpenMP/taskloop04.f90 diff --git a/flang/lib/Lower/OpenMP/OpenMP.cpp b/flang/lib/Lower/OpenMP/OpenMP.cpp index 6e9f0716a02fe..4048aeea37b92 100644 --- a/flang/lib/Lower/OpenMP/OpenMP.cpp +++ b/flang/lib/Lower/OpenMP/OpenMP.cpp @@ -1781,8 +1781,6 @@ static void genTaskloopClauses( cp.processPriority(stmtCtx, clauseOps); cp.processReduction(loc, clauseOps, reductionSyms); cp.processUntied(clauseOps); - - cp.processTODO(loc, llvm::omp::Directive::OMPD_taskloop); } static void genTaskwaitClauses(lower::AbstractConverter &converter, diff --git a/flang/test/Lower/OpenMP/Todo/taskloop-collapse.f90 b/flang/test/Lower/OpenMP/Todo/taskloop-collapse.f90 deleted file mode 100644 index cd54f5eeba6c4..0000000000000 --- a/flang/test/Lower/OpenMP/Todo/taskloop-collapse.f90 +++ /dev/null @@ -1,15 +0,0 @@ -! RUN: %not_todo_cmd bbc -emit-fir -fopenmp -o - %s 2>&1 | FileCheck %s -! RUN: %not_todo_cmd %flang_fc1 -emit-fir -fopenmp -o - %s 2>&1 | FileCheck %s - -! CHECK: not yet implemented: Unhandled clause COLLAPSE in TASKLOOP construct -subroutine omp_taskloop_collapse() - integer x - x = 0 - !$omp taskloop collapse(2) - do i = 1, 100 - do j = 1, 100 - x = x + 1 - end do - end do - !$omp end taskloop -end subroutine omp_taskloop_collapse diff --git a/flang/test/Lower/OpenMP/taskloop-collapse.f90 b/flang/test/Lower/OpenMP/taskloop-collapse.f90 new file mode 100644 index 0000000000000..48243640d07b9 --- /dev/null +++ b/flang/test/Lower/OpenMP/taskloop-collapse.f90 @@ -0,0 +1,34 @@ +! 
Test the collapse clause when being used with the taskloop construct +! RUN: bbc -emit-hlfir -fopenmp -fopenmp-version=45 %s -o - 2>&1 | FileCheck %s +! RUN: %flang_fc1 -emit-hlfir -fopenmp -fopenmp-version=45 %s -o - 2>&1 | FileCheck %s + +! CHECK-LABEL: omp.private +! CHECK-SAME: {type = private} @[[J_PRIVATE:.*]] : i32 +! CHECK-LABEL: omp.private +! CHECK-SAME: {type = private} @[[I_PRIVATE:.*]] : i32 +! CHECK-LABEL: omp.private +! CHECK-SAME: {type = firstprivate} @[[SUM_FIRSTPRIVATE:.*]] : i32 copy + +! CHECK-LABEL: func.func @_QPtest() +! CHECK: %[[ALLOCA_I:.*]] = fir.alloca i32 {bindc_name = "i", uniq_name = "_QFtestEi"} +! CHECK: %[[DECLARE_I:.*]]:2 = hlfir.declare %1 {uniq_name = "_QFtestEi"} : (!fir.ref) -> (!fir.ref, !fir.ref) +! CHECK: %[[ALLOCA_J:.*]] = fir.alloca i32 {bindc_name = "j", uniq_name = "_QFtestEj"} +! CHECK: %[[DECLARE_J:.*]]:2 = hlfir.declare %3 {uniq_name = "_QFtestEj"} : (!fir.ref) -> (!fir.ref, !fir.ref) +! CHECK: %[[ALLOCA_SUM:.*]] = fir.alloca i32 {bindc_name = "sum", uniq_name = "_QFtestEsum"} +! CHECK: %[[DECLARE_SUM:.*]]:2 = hlfir.declare %5 {uniq_name = "_QFtestEsum"} : (!fir.ref) -> (!fir.ref, !fir.ref) + +subroutine test() + integer :: i, j, sum + + !$omp taskloop collapse(2) + ! CHECK-LABEL: omp.taskloop + ! CHECK-SAME: private(@_QFtestEsum_firstprivate_i32 %[[DECLARE_SUM]]#0 -> %arg0, @_QFtestEi_private_i32 %[[DECLARE_I]]#0 -> %arg1, @_QFtestEj_private_i32 %[[DECLARE_J]]#0 -> %arg2 : !fir.ref, !fir.ref, !fir.ref) + ! CHECK-LABEL: omp.loop_nest + ! CHECK-SAME: (%arg3, %arg4) : i32 = (%c1_i32, %c1_i32_1) to (%c10_i32, %c5_i32) inclusive step (%c1_i32_0, %c1_i32_2) collapse(2) + do i = 1, 10 + do j = 1, 5 + sum = sum + i + j + end do + end do + !$omp end taskloop +end subroutine diff --git a/flang/test/Semantics/OpenMP/taskloop04.f90 b/flang/test/Semantics/OpenMP/taskloop04.f90 new file mode 100644 index 0000000000000..4ffcf84f708e9 --- /dev/null +++ b/flang/test/Semantics/OpenMP/taskloop04.f90 @@ -0,0 +1,15 @@ +! 
When lowering Taskloop, it is possible for the TileSizes clause to be lowered, but this is not a supported clause. +! We should make sure that any use of Tilesizes with Taskloop is correctly rejected by the Semantics. +! RUN: %python %S/../test_errors.py %s %flang -fopenmp + +subroutine test + integer :: i, sum + + !ERROR: TILE cannot follow TASKLOOP + !ERROR: SIZES clause is not allowed on the TASKLOOP directive + !$omp taskloop tile sizes(2) + do i=1,10 + sum = sum + i + end do + !$omp end taskloop +end subroutine From c5eb7eb3dcd51415adf804ca45eeb719cbab4351 Mon Sep 17 00:00:00 2001 From: Krzysztof Parzyszek Date: Wed, 12 Nov 2025 08:03:32 -0600 Subject: [PATCH 04/15] [OpenMP] Add more comments to `ConstructDecompositionT.h`, NFC (#167564) --- .../Frontend/OpenMP/ConstructDecompositionT.h | 24 +++++++++++-------- 1 file changed, 14 insertions(+), 10 deletions(-) diff --git a/llvm/include/llvm/Frontend/OpenMP/ConstructDecompositionT.h b/llvm/include/llvm/Frontend/OpenMP/ConstructDecompositionT.h index d702273cec9ec..3918cecfc1e65 100644 --- a/llvm/include/llvm/Frontend/OpenMP/ConstructDecompositionT.h +++ b/llvm/include/llvm/Frontend/OpenMP/ConstructDecompositionT.h @@ -68,17 +68,16 @@ find_unique(Container &&container, Predicate &&pred) { namespace tomp { -// ClauseType - Either instance of ClauseT, or a type derived from ClauseT. -// -// This is the clause representation in the code using this infrastructure. -// -// HelperType - A class that implements two member functions: +// ClauseType: Either an instance of ClauseT, or a type derived from ClauseT. +// This is the clause representation in the code using this infrastructure. // +// HelperType: A class that implements two member functions: // // Return the base object of the given object, if any. // std::optional getBaseObject(const Object &object) const // // Return the iteration variable of the outermost loop associated // // with the construct being worked on, if any. 
// std::optional getLoopIterVar() const + template struct ConstructDecompositionT { using ClauseTy = ClauseType; @@ -181,27 +180,32 @@ struct ConstructDecompositionT { std::enable_if_t::UnionTrait::value, void> addClauseSymsToMap(U &&item, const ClauseTy *); - // Apply a clause to the only directive that allows it. If there are no + // Apply the clause to the only directive that allows it. If there are no // directives that allow it, or if there is more that one, do not apply // anything and return false, otherwise return true. bool applyToUnique(const ClauseTy *node); - // Apply a clause to the first directive in given range that allows it. + // Apply the clause to the first directive in given range that allows it. // If such a directive does not exist, return false, otherwise return true. template bool applyToFirst(const ClauseTy *node, llvm::iterator_range range); - // Apply a clause to the innermost directive that allows it. If such a + // Apply the clause to the innermost directive that allows it. If such a // directive does not exist, return false, otherwise return true. bool applyToInnermost(const ClauseTy *node); - // Apply a clause to the outermost directive that allows it. If such a + // Apply the clause to the outermost directive that allows it. If such a // directive does not exist, return false, otherwise return true. bool applyToOutermost(const ClauseTy *node); + // Apply the clause to all directives that allow it, and which satisfy + // the predicate: bool shouldApply(LeafReprInternal). If no such + // directives exist, return false, otherwise return true. template bool applyIf(const ClauseTy *node, Predicate shouldApply); + // Apply the clause to all directives that allow it. If no such directives + // exist, return false, otherwise return true. 
bool applyToAll(const ClauseTy *node); template @@ -983,7 +987,7 @@ bool ConstructDecompositionT::applyClause( return dir == llvm::omp::Directive::OMPD_simd || llvm::is_contained(getWorksharingLoop(), dir); case ReductionModifier::Task: - if (alreadyApplied) + if (alreadyApplied) // Not an error return false; // According to [5.2:135:16-18], "task" only applies to "parallel" and // worksharing constructs. From 0f4dc93608059469df7606b4bd0b2834f9035d54 Mon Sep 17 00:00:00 2001 From: Michael Buch Date: Wed, 12 Nov 2025 14:05:59 +0000 Subject: [PATCH 05/15] [lldb][Language] Pass SymbolNameFitsToLanguage parameter by const-ref (#167684) We've been seeing (rare) crashes from both `CPlusPlusLanguage::SymbolNameFitsToLanguage` and `ObjCLanguage::SymbolNameFitsToLanguage` when we try to read contents of the `ConstString`s of the `Mangled` parameter. I'm not entirely sure how that can happen (current theory is corrupted stack somehow which overwrites `ConstString::m_string` to an invalid pointer) but I'm not able to confirm that. One thing these crashes had in common is that they operate on the `Mangled` object we copied into `SymbolNameFitsToLanguage` by value. While I can't see off the top why that would cause it to contain unintiailized/corrupt `ConstString`s, the class is sufficiently large enough to probably pass it by `const &` anyway. This is what this patch does. 
rdar://164519648 --- lldb/include/lldb/Target/Language.h | 4 +++- lldb/source/Plugins/Language/CPlusPlus/CPlusPlusLanguage.cpp | 2 +- lldb/source/Plugins/Language/CPlusPlus/CPlusPlusLanguage.h | 2 +- lldb/source/Plugins/Language/ObjC/ObjCLanguage.cpp | 2 +- lldb/source/Plugins/Language/ObjC/ObjCLanguage.h | 2 +- 5 files changed, 7 insertions(+), 5 deletions(-) diff --git a/lldb/include/lldb/Target/Language.h b/lldb/include/lldb/Target/Language.h index 9958b6ea2f815..9292f790333a1 100644 --- a/lldb/include/lldb/Target/Language.h +++ b/lldb/include/lldb/Target/Language.h @@ -318,7 +318,9 @@ class Language : public PluginInterface { /// /// This function should only return true if there is a high confidence /// that the name actually belongs to this language. - virtual bool SymbolNameFitsToLanguage(Mangled name) const { return false; } + virtual bool SymbolNameFitsToLanguage(const Mangled &name) const { + return false; + } /// An individual data formatter may apply to several types and cross language /// boundaries. 
Each of those languages may want to customize the display of diff --git a/lldb/source/Plugins/Language/CPlusPlus/CPlusPlusLanguage.cpp b/lldb/source/Plugins/Language/CPlusPlus/CPlusPlusLanguage.cpp index a2199cb65cd35..e935ea8fab813 100644 --- a/lldb/source/Plugins/Language/CPlusPlus/CPlusPlusLanguage.cpp +++ b/lldb/source/Plugins/Language/CPlusPlus/CPlusPlusLanguage.cpp @@ -103,7 +103,7 @@ CPlusPlusLanguage::GetFunctionNameInfo(ConstString name) const { return {func_name_type, ConstString(basename)}; } -bool CPlusPlusLanguage::SymbolNameFitsToLanguage(Mangled mangled) const { +bool CPlusPlusLanguage::SymbolNameFitsToLanguage(const Mangled &mangled) const { auto mangling_scheme = Mangled::GetManglingScheme(mangled.GetMangledName().GetStringRef()); return mangling_scheme == Mangled::eManglingSchemeItanium || diff --git a/lldb/source/Plugins/Language/CPlusPlus/CPlusPlusLanguage.h b/lldb/source/Plugins/Language/CPlusPlus/CPlusPlusLanguage.h index 9a528ca7b03f9..13d436a68c691 100644 --- a/lldb/source/Plugins/Language/CPlusPlus/CPlusPlusLanguage.h +++ b/lldb/source/Plugins/Language/CPlusPlus/CPlusPlusLanguage.h @@ -92,7 +92,7 @@ class CPlusPlusLanguage : public Language { static llvm::StringRef GetPluginNameStatic() { return "cplusplus"; } - bool SymbolNameFitsToLanguage(Mangled mangled) const override; + bool SymbolNameFitsToLanguage(const Mangled &mangled) const override; bool DemangledNameContainsPath(llvm::StringRef path, ConstString demangled) const override; diff --git a/lldb/source/Plugins/Language/ObjC/ObjCLanguage.cpp b/lldb/source/Plugins/Language/ObjC/ObjCLanguage.cpp index 3b8e21cbb9269..5e31faccb315c 100644 --- a/lldb/source/Plugins/Language/ObjC/ObjCLanguage.cpp +++ b/lldb/source/Plugins/Language/ObjC/ObjCLanguage.cpp @@ -235,7 +235,7 @@ ObjCLanguage::GetFunctionNameInfo(ConstString name) const { return {func_name_type, std::nullopt}; } -bool ObjCLanguage::SymbolNameFitsToLanguage(Mangled mangled) const { +bool ObjCLanguage::SymbolNameFitsToLanguage(const 
Mangled &mangled) const { ConstString demangled_name = mangled.GetDemangledName(); if (!demangled_name) return false; diff --git a/lldb/source/Plugins/Language/ObjC/ObjCLanguage.h b/lldb/source/Plugins/Language/ObjC/ObjCLanguage.h index a68ea41c723de..729230dc2dccb 100644 --- a/lldb/source/Plugins/Language/ObjC/ObjCLanguage.h +++ b/lldb/source/Plugins/Language/ObjC/ObjCLanguage.h @@ -145,7 +145,7 @@ class ObjCLanguage : public Language { std::pair> GetFunctionNameInfo(ConstString name) const override; - bool SymbolNameFitsToLanguage(Mangled mangled) const override; + bool SymbolNameFitsToLanguage(const Mangled &mangled) const override; lldb::TypeCategoryImplSP GetFormatters() override; From f6cf44ac2d3ebc4b2c785c202ac4f33b7533d45f Mon Sep 17 00:00:00 2001 From: Michael Buch Date: Wed, 12 Nov 2025 14:22:15 +0000 Subject: [PATCH 06/15] [lldb][ObjC][NFC] Rewrite IsPossibleObjCMethodName in terms of llvm::StringRef (#167660) We've seen some crashes around this area (particularly around checking/handling raw C-strings). Dealing with `StringRef`s makes it a bit easier to reason about. This doesn't fix anything per se, but is an improvement in readability. 
rdar://164519648 --- .../Plugins/Language/ObjC/ObjCLanguage.cpp | 7 +++ .../Plugins/Language/ObjC/ObjCLanguage.h | 8 +--- .../Language/ObjC/ObjCLanguageTest.cpp | 43 +++++++++++++++++++ 3 files changed, 51 insertions(+), 7 deletions(-) diff --git a/lldb/source/Plugins/Language/ObjC/ObjCLanguage.cpp b/lldb/source/Plugins/Language/ObjC/ObjCLanguage.cpp index 5e31faccb315c..c0dcb958ad85f 100644 --- a/lldb/source/Plugins/Language/ObjC/ObjCLanguage.cpp +++ b/lldb/source/Plugins/Language/ObjC/ObjCLanguage.cpp @@ -1065,3 +1065,10 @@ ObjCLanguage::GetBooleanFromString(llvm::StringRef str) const { .Case("NO", {false}) .Default({}); } + +bool ObjCLanguage::IsPossibleObjCMethodName(llvm::StringRef name) { + if (!name.starts_with("-[") && !name.starts_with("+[")) + return false; + + return name.ends_with("]"); +} diff --git a/lldb/source/Plugins/Language/ObjC/ObjCLanguage.h b/lldb/source/Plugins/Language/ObjC/ObjCLanguage.h index 729230dc2dccb..ced6bd3290a86 100644 --- a/lldb/source/Plugins/Language/ObjC/ObjCLanguage.h +++ b/lldb/source/Plugins/Language/ObjC/ObjCLanguage.h @@ -175,13 +175,7 @@ class ObjCLanguage : public Language { static llvm::StringRef GetPluginNameStatic() { return "objc"; } - static bool IsPossibleObjCMethodName(const char *name) { - if (!name) - return false; - bool starts_right = (name[0] == '+' || name[0] == '-') && name[1] == '['; - bool ends_right = (name[strlen(name) - 1] == ']'); - return (starts_right && ends_right); - } + static bool IsPossibleObjCMethodName(llvm::StringRef name); static bool IsPossibleObjCSelector(const char *name) { if (!name) diff --git a/lldb/unittests/Language/ObjC/ObjCLanguageTest.cpp b/lldb/unittests/Language/ObjC/ObjCLanguageTest.cpp index 70baa7e6bc135..4b018a29f3587 100644 --- a/lldb/unittests/Language/ObjC/ObjCLanguageTest.cpp +++ b/lldb/unittests/Language/ObjC/ObjCLanguageTest.cpp @@ -112,3 +112,46 @@ TEST(ObjCLanguage, InvalidMethodNameParsing) { EXPECT_FALSE(lax_method.has_value()); } } + +struct ObjCMethodTestCase { 
+ llvm::StringRef name; + bool is_valid; +}; + +struct ObjCMethodNameTextFiture + : public testing::TestWithParam {}; + +static ObjCMethodTestCase g_objc_method_name_test_cases[] = { + {"", false}, + {"+[Uh oh!", false}, + {"-[Definitely not...", false}, + {"[Nice try ] :)", false}, + {"+MaybeIfYouSquintYourEyes]", false}, + {"?[Tricky]", false}, + {"[]", false}, + {"-[a", false}, + {"+[a", false}, + {"-]a]", false}, + {"+]a]", false}, + + // FIXME: should these count as valid? + {"+[]", true}, + {"-[]", true}, + {"-[[]", true}, + {"+[[]", true}, + {"+[a ]", true}, + {"-[a ]", true}, + + // Valid names + {"+[a a]", true}, + {"-[a a]", true}, +}; + +TEST_P(ObjCMethodNameTextFiture, TestIsPossibleObjCMethodName) { + // Tests ObjCLanguage::IsPossibleObjCMethodName + auto [name, expect_valid] = GetParam(); + EXPECT_EQ(ObjCLanguage::IsPossibleObjCMethodName(name), expect_valid); +} + +INSTANTIATE_TEST_SUITE_P(ObjCMethodNameTests, ObjCMethodNameTextFiture, + testing::ValuesIn(g_objc_method_name_test_cases)); From 448146d6479cdfd6b7c80cb33dbaed882dead2f1 Mon Sep 17 00:00:00 2001 From: Maxime Arthaud Date: Wed, 12 Nov 2025 15:36:56 +0100 Subject: [PATCH 07/15] [llvm-c] Add bindings for DbgRecord (#166383) In the LLVM-C library, there is currently no way to get information about a DbgRecord - which is the new way to attach debug information to llvm instructions. We can only iterate on debug records with LLVMGetFirstDbgRecord/ LLVMGetLastDbgRecord/LLVMGetNextDbgRecord, but there is no way to read information. This PR adds utility functions to read DbgRecord information. 
--- llvm/include/llvm-c/Core.h | 38 ++++++++++++++++++++++++++++++ llvm/lib/IR/Core.cpp | 31 ++++++++++++++++++++++++ llvm/tools/llvm-c-test/debuginfo.c | 37 +++++++++++++++++++++++++++++ 3 files changed, 106 insertions(+) diff --git a/llvm/include/llvm-c/Core.h b/llvm/include/llvm-c/Core.h index 4e380d9bd5969..83dd1eba876e6 100644 --- a/llvm/include/llvm-c/Core.h +++ b/llvm/include/llvm-c/Core.h @@ -531,6 +531,13 @@ enum { */ typedef unsigned LLVMGEPNoWrapFlags; +typedef enum { + LLVMDbgRecordLabel, + LLVMDbgRecordDeclare, + LLVMDbgRecordValue, + LLVMDbgRecordAssign, +} LLVMDbgRecordKind; + /** * @} */ @@ -3896,6 +3903,37 @@ LLVM_C_ABI LLVMDbgRecordRef LLVMGetNextDbgRecord(LLVMDbgRecordRef DbgRecord); LLVM_C_ABI LLVMDbgRecordRef LLVMGetPreviousDbgRecord(LLVMDbgRecordRef DbgRecord); +/** + * Get the debug location attached to the debug record. + * + * @see llvm::DbgRecord::getDebugLoc() + */ +LLVMMetadataRef LLVMDbgRecordGetDebugLoc(LLVMDbgRecordRef Rec); + +LLVMDbgRecordKind LLVMDbgRecordGetKind(LLVMDbgRecordRef Rec); + +/** + * Get the value of the DbgVariableRecord. + * + * @see llvm::DbgVariableRecord::getValue() + */ +LLVMValueRef LLVMDbgVariableRecordGetValue(LLVMDbgRecordRef Rec, + unsigned OpIdx); + +/** + * Get the debug info variable of the DbgVariableRecord. + * + * @see llvm::DbgVariableRecord::getVariable() + */ +LLVMMetadataRef LLVMDbgVariableRecordGetVariable(LLVMDbgRecordRef Rec); + +/** + * Get the debug info expression of the DbgVariableRecord. 
+ * + * @see llvm::DbgVariableRecord::getExpression() + */ +LLVMMetadataRef LLVMDbgVariableRecordGetExpression(LLVMDbgRecordRef Rec); + /** * @defgroup LLVMCCoreValueInstructionCall Call Sites and Invocations * diff --git a/llvm/lib/IR/Core.cpp b/llvm/lib/IR/Core.cpp index 27d8294b01264..604730e0d3004 100644 --- a/llvm/lib/IR/Core.cpp +++ b/llvm/lib/IR/Core.cpp @@ -3036,6 +3036,37 @@ LLVMDbgRecordRef LLVMGetPreviousDbgRecord(LLVMDbgRecordRef Rec) { return wrap(&*--I); } +LLVMMetadataRef LLVMDbgRecordGetDebugLoc(LLVMDbgRecordRef Rec) { + return wrap(unwrap(Rec)->getDebugLoc().getAsMDNode()); +} + +LLVMDbgRecordKind LLVMDbgRecordGetKind(LLVMDbgRecordRef Rec) { + DbgRecord *Record = unwrap(Rec); + if (isa(Record)) + return LLVMDbgRecordLabel; + DbgVariableRecord *VariableRecord = dyn_cast(Record); + assert(VariableRecord && "unexpected record"); + if (VariableRecord->isDbgDeclare()) + return LLVMDbgRecordDeclare; + if (VariableRecord->isDbgValue()) + return LLVMDbgRecordValue; + assert(VariableRecord->isDbgAssign() && "unexpected record"); + return LLVMDbgRecordAssign; +} + +LLVMValueRef LLVMDbgVariableRecordGetValue(LLVMDbgRecordRef Rec, + unsigned OpIdx) { + return wrap(unwrap(Rec)->getValue(OpIdx)); +} + +LLVMMetadataRef LLVMDbgVariableRecordGetVariable(LLVMDbgRecordRef Rec) { + return wrap(unwrap(Rec)->getRawVariable()); +} + +LLVMMetadataRef LLVMDbgVariableRecordGetExpression(LLVMDbgRecordRef Rec) { + return wrap(unwrap(Rec)->getRawExpression()); +} + unsigned LLVMGetNumArgOperands(LLVMValueRef Instr) { if (FuncletPadInst *FPI = dyn_cast(unwrap(Instr))) { return FPI->arg_size(); diff --git a/llvm/tools/llvm-c-test/debuginfo.c b/llvm/tools/llvm-c-test/debuginfo.c index 9db7aa0929aab..677722fea1a98 100644 --- a/llvm/tools/llvm-c-test/debuginfo.c +++ b/llvm/tools/llvm-c-test/debuginfo.c @@ -364,6 +364,43 @@ int llvm_test_dibuilder(void) { assert(AddDbgRecordUnderTheRange == NULL); (void)AddDbgRecordUnderTheRange; + // Test that we can read the first debug record. 
+ LLVMMetadataRef AddDbgRecordFirstDebugLoc = + LLVMDbgRecordGetDebugLoc(AddDbgRecordFirst); + (void)AddDbgRecordFirstDebugLoc; + assert(LLVMDILocationGetLine(AddDbgRecordFirstDebugLoc) == 43); + assert(LLVMDbgRecordGetKind(AddDbgRecordFirst) == LLVMDbgRecordValue); + LLVMValueRef AddDbgRecordFirstValue = + LLVMDbgVariableRecordGetValue(AddDbgRecordFirst, 0); + (void)AddDbgRecordFirstValue; + assert(LLVMGetValueKind(AddDbgRecordFirstValue) == LLVMConstantIntValueKind); + assert(LLVMConstIntGetZExtValue(AddDbgRecordFirstValue) == 0); + LLVMMetadataRef AddDbgRecordFirstVariable = + LLVMDbgVariableRecordGetVariable(AddDbgRecordFirst); + (void)AddDbgRecordFirstVariable; + assert(LLVMGetMetadataKind(AddDbgRecordFirstVariable) == + LLVMDILocalVariableMetadataKind); + // TODO: For now, there is no way to get the name. + LLVMMetadataRef AddDbgRecordFirstVariableScope = + LLVMDIVariableGetScope(AddDbgRecordFirstVariable); + (void)AddDbgRecordFirstVariableScope; + assert(LLVMGetMetadataKind(AddDbgRecordFirstVariableScope) == + LLVMDILexicalBlockMetadataKind); + LLVMMetadataRef AddDbgRecordFirstVariableFile = + LLVMDIScopeGetFile(AddDbgRecordFirstVariableScope); + (void)AddDbgRecordFirstVariableFile; + assert(LLVMGetMetadataKind(AddDbgRecordFirstVariableFile) == + LLVMDIFileMetadataKind); + unsigned FileLen = 0; + assert(strcmp(LLVMDIFileGetFilename(AddDbgRecordFirstVariableFile, &FileLen), + "debuginfo.c") == 0); + (void)FileLen; + LLVMMetadataRef AddDbgRecordFirstExpr = + LLVMDbgVariableRecordGetExpression(AddDbgRecordFirst); + assert(LLVMGetMetadataKind(AddDbgRecordFirstExpr) == + LLVMDIExpressionMetadataKind); + (void)AddDbgRecordFirstExpr; + char *MStr = LLVMPrintModuleToString(M); puts(MStr); LLVMDisposeMessage(MStr); From 1f58cbe60a01d4651cec3ada480ab8b63586b680 Mon Sep 17 00:00:00 2001 From: Chaitanya Koparkar Date: Wed, 12 Nov 2025 09:38:21 -0500 Subject: [PATCH 08/15] [DAG] Fold (umin (sub a b) a) -> (usubo a b); (select usubo.1 a usubo.0) (#161651) Fixes #161036. 
--- llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp | 19 +++ .../umin-sub-to-usubo-select-combine.ll | 151 +++++++++++++++++ .../X86/umin-sub-to-usubo-select-combine.ll | 156 ++++++++++++++++++ 3 files changed, 326 insertions(+) create mode 100644 llvm/test/CodeGen/AArch64/umin-sub-to-usubo-select-combine.ll create mode 100644 llvm/test/CodeGen/X86/umin-sub-to-usubo-select-combine.ll diff --git a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp index df353c4d91b1a..d9d3a3ec01757 100644 --- a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp +++ b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp @@ -6219,6 +6219,25 @@ SDValue DAGCombiner::visitIMINMAX(SDNode *N) { SDLoc(N), VT, N0, N1)) return SD; + if (TLI.isOperationLegalOrCustom(ISD::USUBO, VT) && + !TLI.isOperationLegalOrCustom(ISD::UMIN, VT)) { + SDValue B; + + // (umin (sub a, b), a) -> (usubo a, b); (select usubo.1, a, usubo.0) + if (sd_match(N0, m_Sub(m_Specific(N1), m_Value(B)))) { + SDVTList VTs = DAG.getVTList(VT, getSetCCResultType(VT)); + SDValue USO = DAG.getNode(ISD::USUBO, DL, VTs, N1, B); + return DAG.getSelect(DL, VT, USO.getValue(1), N1, USO.getValue(0)); + } + + // (umin a, (sub a, b)) -> (usubo a, b); (select usubo.1, a, usubo.0) + if (sd_match(N1, m_Sub(m_Specific(N0), m_Value(B)))) { + SDVTList VTs = DAG.getVTList(VT, getSetCCResultType(VT)); + SDValue USO = DAG.getNode(ISD::USUBO, DL, VTs, N0, B); + return DAG.getSelect(DL, VT, USO.getValue(1), N0, USO.getValue(0)); + } + } + // Simplify the operands using demanded-bits information. 
if (SimplifyDemandedBits(SDValue(N, 0))) return SDValue(N, 0); diff --git a/llvm/test/CodeGen/AArch64/umin-sub-to-usubo-select-combine.ll b/llvm/test/CodeGen/AArch64/umin-sub-to-usubo-select-combine.ll new file mode 100644 index 0000000000000..fe3eee06db65e --- /dev/null +++ b/llvm/test/CodeGen/AArch64/umin-sub-to-usubo-select-combine.ll @@ -0,0 +1,151 @@ +; RUN: llc < %s -mtriple=aarch64 | FileCheck %s + +; GitHub issue #161036 + +; Positive test : umin(sub(a,b),a) with scalar types should be folded +define i64 @underflow_compare_fold_i64(i64 %a, i64 %b) { +; CHECK-LABEL: underflow_compare_fold_i64 +; CHECK-LABEL: %bb.0: +; CHECK-NEXT: subs x8, x0, x1 +; CHECK-NEXT: csel x0, x0, x8, lo +; CHECK-NEXT: ret + %sub = sub i64 %a, %b + %cond = tail call i64 @llvm.umin.i64(i64 %sub, i64 %a) + ret i64 %cond +} + +; Positive test : umin(a,sub(a,b)) with scalar types should be folded +define i64 @underflow_compare_fold_i64_commute(i64 %a, i64 %b) { +; CHECK-LABEL: underflow_compare_fold_i64_commute +; CHECK-LABEL: %bb.0: +; CHECK-NEXT: subs x8, x0, x1 +; CHECK-NEXT: csel x0, x0, x8, lo +; CHECK-NEXT: ret + %sub = sub i64 %a, %b + %cond = tail call i64 @llvm.umin.i64(i64 %a, i64 %sub) + ret i64 %cond +} + +; Positive test : multi-use is OK since the sub instruction still runs once +define i64 @underflow_compare_fold_i64_multi_use(i64 %a, i64 %b, ptr addrspace(1) %ptr) { +; CHECK-LABEL: underflow_compare_fold_i64_multi_use +; CHECK-LABEL: %bb.0: +; CHECK-NEXT: subs x8, x0, x1 +; CHECK-NEXT: csel x0, x0, x8, lo +; CHECK-NEXT: str x8, [x2] +; CHECK-NEXT: ret + %sub = sub i64 %a, %b + store i64 %sub, ptr addrspace(1) %ptr + %cond = call i64 @llvm.umin.i64(i64 %sub, i64 %a) + ret i64 %cond +} + +; Positive test : i32 +define i32 @underflow_compare_fold_i32(i32 %a, i32 %b) { +; CHECK-LABEL: underflow_compare_fold_i32 +; CHECK-LABEL: %bb.0: +; CHECK-NEXT: subs w8, w0, w1 +; CHECK-NEXT: csel w0, w0, w8, lo +; CHECK-NEXT: ret + %sub = sub i32 %a, %b + %cond = tail call i32 
@llvm.umin.i32(i32 %sub, i32 %a) + ret i32 %cond +} + +; Positive test : i32 +define i32 @underflow_compare_fold_i32_commute(i32 %a, i32 %b) { +; CHECK-LABEL: underflow_compare_fold_i32_commute +; CHECK-LABEL: %bb.0: +; CHECK-NEXT: subs w8, w0, w1 +; CHECK-NEXT: csel w0, w0, w8, lo +; CHECK-NEXT: ret + %sub = sub i32 %a, %b + %cond = tail call i32 @llvm.umin.i32(i32 %a, i32 %sub) + ret i32 %cond +} + +; Positive test : i32 +define i32 @underflow_compare_fold_i32_multi_use(i32 %a, i32 %b, ptr addrspace(1) %ptr) { +; CHECK-LABEL: underflow_compare_fold_i32_multi_use +; CHECK-LABEL: %bb.0: +; CHECK-NEXT: subs w8, w0, w1 +; CHECK-NEXT: csel w0, w0, w8, lo +; CHECK-NEXT: str w8, [x2] +; CHECK-NEXT: ret + %sub = sub i32 %a, %b + store i32 %sub, ptr addrspace(1) %ptr + %cond = call i32 @llvm.umin.i32(i32 %sub, i32 %a) + ret i32 %cond +} + +; Negative test : i16 +define i16 @underflow_compare_fold_i16(i16 %a, i16 %b) { +; CHECK-LABEL: underflow_compare_fold_i16 +; CHECK-LABEL: %bb.0: +; CHECK-LABEL: sub w8, w0, w1 +; CHECK-LABEL: and w9, w0, #0xffff +; CHECK-LABEL: and w8, w8, #0xffff +; CHECK-LABEL: cmp w8, w9 +; CHECK-LABEL: csel w0, w8, w9, lo +; CHECK-LABEL: ret + %sub = sub i16 %a, %b + %cond = tail call i16 @llvm.umin.i16(i16 %sub, i16 %a) + ret i16 %cond +} + +; Negative test : i16 +define i16 @underflow_compare_fold_i16_commute(i16 %a, i16 %b) { +; CHECK-LABEL: underflow_compare_fold_i16_commute +; CHECK-LABEL: %bb.0: +; CHECK-LABEL: sub w8, w0, w1 +; CHECK-LABEL: and w9, w0, #0xffff +; CHECK-LABEL: and w8, w8, #0xffff +; CHECK-LABEL: cmp w9, w8 +; CHECK-LABEL: csel w0, w9, w8, lo +; CHECK-LABEL: ret + %sub = sub i16 %a, %b + %cond = tail call i16 @llvm.umin.i16(i16 %a, i16 %sub) + ret i16 %cond +} + +; Negative test : i16 +define i16 @underflow_compare_fold_i16_multi_use(i16 %a, i16 %b, ptr addrspace(1) %ptr) { +; CHECK-LABEL: underflow_compare_fold_i16_multi_use +; CHECK-LABEL: %bb.0: +; CHECK-LABEL: sub w8, w0, w1 +; CHECK-LABEL: and w9, w0, #0xffff +; 
CHECK-LABEL: and w10, w8, #0xffff +; CHECK-LABEL: strh w8, [x2] +; CHECK-LABEL: cmp w10, w9 +; CHECK-LABEL: csel w0, w10, w9, lo +; CHECK-LABEL: ret + %sub = sub i16 %a, %b + store i16 %sub, ptr addrspace(1) %ptr + %cond = call i16 @llvm.umin.i16(i16 %sub, i16 %a) + ret i16 %cond +} + +; Negative test, vector types : umin(sub(a,b),a) but with vectors +define <16 x i8> @underflow_compare_dontfold_vectors(<16 x i8> %a, <16 x i8> %b) { +; CHECK-LABEL: underflow_compare_dontfold_vectors +; CHECK-LABEL: %bb.0 +; CHECK-NEXT: sub v1.16b, v0.16b, v1.16b +; CHECK-NEXT: umin v0.16b, v1.16b, v0.16b +; CHECK-NEXT: ret + %sub = sub <16 x i8> %a, %b + %cond = tail call <16 x i8> @llvm.umin.v16i8(<16 x i8> %sub, <16 x i8> %a) + ret <16 x i8> %cond +} + +; Negative test, pattern mismatch : umin(add(a,b),a) +define i64 @umin_add(i64 %a, i64 %b) { +; CHECK-LABEL: umin_add +; CHECK-LABEL: %bb.0 +; CHECK-NEXT: add x8, x0, x1 +; CHECK-NEXT: cmp x8, x0 +; CHECK-NEXT: csel x0, x8, x0, lo +; CHECK-NEXT: ret + %add = add i64 %a, %b + %cond = tail call i64 @llvm.umin.i64(i64 %add, i64 %a) + ret i64 %cond +} diff --git a/llvm/test/CodeGen/X86/umin-sub-to-usubo-select-combine.ll b/llvm/test/CodeGen/X86/umin-sub-to-usubo-select-combine.ll new file mode 100644 index 0000000000000..e9756b411eb2c --- /dev/null +++ b/llvm/test/CodeGen/X86/umin-sub-to-usubo-select-combine.ll @@ -0,0 +1,156 @@ +; RUN: llc < %s -mtriple=x86_64 | FileCheck %s + +; GitHub issue #161036 + +; Positive test : umin(sub(a,b),a) with scalar types should be folded +define i64 @underflow_compare_fold_i64(i64 %a, i64 %b) { +; CHECK-LABEL: underflow_compare_fold_i64 +; CHECK-LABEL: %bb.0 +; CHECK-NEXT: movq %rdi, %rax +; CHECK-NEXT: subq %rsi, %rax +; CHECK-NEXT: cmovbq %rdi, %rax +; CHECK-NEXT: retq + %sub = sub i64 %a, %b + %cond = tail call i64 @llvm.umin.i64(i64 %sub, i64 %a) + ret i64 %cond +} + +; Positive test : umin(a,sub(a,b)) with scalar types should be folded +define i64 @underflow_compare_fold_i64_commute(i64 %a, i64 
%b) { +; CHECK-LABEL: underflow_compare_fold_i64_commute +; CHECK-LABEL: %bb.0 +; CHECK-NEXT: movq %rdi, %rax +; CHECK-NEXT: subq %rsi, %rax +; CHECK-NEXT: cmovbq %rdi, %rax +; CHECK-NEXT: retq + %sub = sub i64 %a, %b + %cond = tail call i64 @llvm.umin.i64(i64 %a, i64 %sub) + ret i64 %cond +} + +; Positive test : multi-use is OK since the sub instruction still runs once +define i64 @underflow_compare_fold_i64_multi_use(i64 %a, i64 %b, ptr addrspace(1) %ptr) { +; CHECK-LABEL: underflow_compare_fold_i64_multi_use +; CHECK-LABEL: %bb.0 +; CHECK-NEXT: movq %rdi, %rax +; CHECK-NEXT: subq %rsi, %rax +; CHECK-NEXT: movq %rax, (%rdx) +; CHECK-NEXT: cmovbq %rdi, %rax +; CHECK-NEXT: retq + %sub = sub i64 %a, %b + store i64 %sub, ptr addrspace(1) %ptr + %cond = call i64 @llvm.umin.i64(i64 %sub, i64 %a) + ret i64 %cond +} + +; Positive test : i32 +define i32 @underflow_compare_fold_i32(i32 %a, i32 %b) { +; CHECK-LABEL: underflow_compare_fold_i32 +; CHECK-LABEL: %bb.0 +; CHECK-NEXT: movl %edi, %eax +; CHECK-NEXT: subl %esi, %eax +; CHECK-NEXT: cmovbl %edi, %eax +; CHECK-NEXT: retq + %sub = sub i32 %a, %b + %cond = tail call i32 @llvm.umin.i32(i32 %sub, i32 %a) + ret i32 %cond +} + +; Positive test : i32 +define i32 @underflow_compare_fold_i32_commute(i32 %a, i32 %b) { +; CHECK-LABEL: underflow_compare_fold_i32_commute +; CHECK-LABEL: %bb.0 +; CHECK-NEXT: movl %edi, %eax +; CHECK-NEXT: subl %esi, %eax +; CHECK-NEXT: cmovbl %edi, %eax +; CHECK-NEXT: retq + %sub = sub i32 %a, %b + %cond = tail call i32 @llvm.umin.i32(i32 %a, i32 %sub) + ret i32 %cond +} + +; Positive test : i32 +define i32 @underflow_compare_fold_i32_multi_use(i32 %a, i32 %b, ptr addrspace(1) %ptr) { +; CHECK-LABEL: underflow_compare_fold_i32_multi_use +; CHECK-LABEL: %bb.0 +; CHECK-NEXT: movl %edi, %eax +; CHECK-NEXT: subl %esi, %eax +; CHECK-NEXT: movl %eax, (%rdx) +; CHECK-NEXT: cmovbl %edi, %eax +; CHECK-NEXT: retq + %sub = sub i32 %a, %b + store i32 %sub, ptr addrspace(1) %ptr + %cond = call i32 
@llvm.umin.i32(i32 %sub, i32 %a) + ret i32 %cond +} + +; Positive test : i16 +define i16 @underflow_compare_fold_i16(i16 %a, i16 %b) { +; CHECK-LABEL: underflow_compare_fold_i16 +; CHECK-LABEL: %bb.0: +; CHECK-NEXT: movl %edi, %eax +; CHECK-NEXT: subw %si, %ax +; CHECK-NEXT: cmovbl %edi, %eax +; CHECK-NEXT: # kill: def $ax killed $ax killed $eax +; CHECK-NEXT: retq + %sub = sub i16 %a, %b + %cond = tail call i16 @llvm.umin.i16(i16 %sub, i16 %a) + ret i16 %cond +} + +; Positive test : i16 +define i16 @underflow_compare_fold_i16_commute(i16 %a, i16 %b) { +; CHECK-LABEL: underflow_compare_fold_i16_commute +; CHECK-LABEL: %bb.0: +; CHECK-NEXT: movl %edi, %eax +; CHECK-NEXT: subw %si, %ax +; CHECK-NEXT: cmovbl %edi, %eax +; CHECK-NEXT: # kill: def $ax killed $ax killed $eax +; CHECK-NEXT: retq + %sub = sub i16 %a, %b + %cond = tail call i16 @llvm.umin.i16(i16 %a, i16 %sub) + ret i16 %cond +} + +; Positive test : i16 +define i16 @underflow_compare_fold_i16_multi_use(i16 %a, i16 %b, ptr addrspace(1) %ptr) { +; CHECK-LABEL: underflow_compare_fold_i16_multi_use +; CHECK-LABEL: %bb.0: +; CHECK-NEXT: movl %edi, %eax +; CHECK-NEXT: subw %si, %ax +; CHECK-NEXT: movw %ax, (%rdx) +; CHECK-NEXT: cmovbl %edi, %eax +; CHECK-NEXT: # kill: def $ax killed $ax killed $eax +; CHECK-NEXT: retq + %sub = sub i16 %a, %b + store i16 %sub, ptr addrspace(1) %ptr + %cond = call i16 @llvm.umin.i16(i16 %sub, i16 %a) + ret i16 %cond +} + + +; Negative test, vector types : umin(sub(a,b),a) but with vectors +define <16 x i8> @underflow_compare_dontfold_vectors(<16 x i8> %a, <16 x i8> %b) { +; CHECK-LABEL: underflow_compare_dontfold_vectors +; CHECK-LABEL: %bb.0 +; CHECK-NEXT: movdqa %xmm0, %xmm2 +; CHECK-NEXT: psubb %xmm1, %xmm2 +; CHECK-NEXT: pminub %xmm2, %xmm0 +; CHECK-NEXT: retq + %sub = sub <16 x i8> %a, %b + %cond = tail call <16 x i8> @llvm.umin.v16i8(<16 x i8> %sub, <16 x i8> %a) + ret <16 x i8> %cond +} + +; Negative test, pattern mismatch : umin(add(a,b),a) +define i64 @umin_add(i64 %a, i64 
%b) { +; CHECK-LABEL: umin_add +; CHECK-LABEL: %bb.0 +; CHECK-NEXT: leaq (%rsi,%rdi), %rax +; CHECK-NEXT: cmpq %rdi, %rax +; CHECK-NEXT: cmovaeq %rdi, %rax +; CHECK-NEXT: retq + %add = add i64 %a, %b + %cond = tail call i64 @llvm.umin.i64(i64 %add, i64 %a) + ret i64 %cond +} From aca28f10da54526a7955f62f090273b588888b6d Mon Sep 17 00:00:00 2001 From: Nikolas Klauser Date: Wed, 12 Nov 2025 15:53:37 +0100 Subject: [PATCH 09/15] [libc++] Optimize __tree copy/move constructor/assignment with allocator (#163558) This patch applies the same optimization as implemented in #151304 to the overloads taking an allocator as the second argument. Apple M4: ``` Benchmark old new Difference % Difference ----------------------------------------------------------- -------------- -------------- ------------ -------------- std::map::ctor(&&,_different_allocs)/0 14.59 12.78 -1.81 -12.41% std::map::ctor(&&,_different_allocs)/1024 16407.05 6265.11 -10141.94 -61.81% std::map::ctor(&&,_different_allocs)/32 395.99 199.76 -196.23 -49.56% std::map::ctor(&&,_different_allocs)/8192 141478.67 53767.84 -87710.83 -62.00% std::map::ctor(const&,_alloc)/0 12.83 12.71 -0.12 -0.94% std::map::ctor(const&,_alloc)/1024 9979.71 7849.11 -2130.59 -21.35% std::map::ctor(const&,_alloc)/32 283.82 266.05 -17.77 -6.26% std::map::ctor(const&,_alloc)/8192 81418.63 63190.41 -18228.21 -22.39% std::map::ctor(&&,_different_allocs)/0 14.58 12.68 -1.90 -13.00% std::map::ctor(&&,_different_allocs)/1024 19513.56 7806.04 -11707.52 -60.00% std::map::ctor(&&,_different_allocs)/32 477.80 247.28 -230.52 -48.25% std::map::ctor(&&,_different_allocs)/8192 504558.78 69592.21 -434966.56 -86.21% std::map::ctor(const&,_alloc)/0 12.64 12.60 -0.04 -0.33% std::map::ctor(const&,_alloc)/1024 43198.53 37220.54 -5977.99 -13.84% std::map::ctor(const&,_alloc)/32 928.39 867.03 -61.36 -6.61% std::map::ctor(const&,_alloc)/8192 461313.81 389200.82 -72112.99 -15.63% ``` --- libcxx/include/__tree | 156 +++++++++++------- libcxx/include/map | 39 
+---- libcxx/include/set | 40 +---- .../associative_container_benchmarks.h | 59 +++++++ .../containers/associative/flat_map.bench.cpp | 8 + .../associative/flat_multimap.bench.cpp | 8 + .../containers/associative/map.bench.cpp | 3 + .../containers/associative/multimap.bench.cpp | 3 + .../containers/associative/multiset.bench.cpp | 3 + .../containers/associative/set.bench.cpp | 3 + .../associative/unordered_map.bench.cpp | 3 + .../associative/unordered_multimap.bench.cpp | 3 + .../associative/unordered_multiset.bench.cpp | 3 + .../associative/unordered_set.bench.cpp | 3 + 14 files changed, 202 insertions(+), 132 deletions(-) diff --git a/libcxx/include/__tree b/libcxx/include/__tree index 694796922c914..ceae22bb48702 100644 --- a/libcxx/include/__tree +++ b/libcxx/include/__tree @@ -887,6 +887,18 @@ public: } _LIBCPP_HIDE_FROM_ABI __tree(const __tree& __t); + + _LIBCPP_HIDE_FROM_ABI __tree(const __tree& __other, const allocator_type& __alloc) + : __begin_node_(__end_node()), __node_alloc_(__alloc), __size_(0), __value_comp_(__other.value_comp()) { + if (__other.size() == 0) + return; + + *__root_ptr() = static_cast<__node_base_pointer>(__copy_construct_tree(__other.__root())); + __root()->__parent_ = __end_node(); + __begin_node_ = static_cast<__end_node_pointer>(std::__tree_min(__end_node()->__left_)); + __size_ = __other.size(); + } + _LIBCPP_HIDE_FROM_ABI __tree& operator=(const __tree& __t); template _LIBCPP_HIDE_FROM_ABI void __assign_unique(_ForwardIterator __first, _ForwardIterator __last); @@ -995,27 +1007,6 @@ public: std::forward<_Args>(__args)...); } - template , int> = 0> - _LIBCPP_HIDE_FROM_ABI void - __insert_unique_from_orphaned_node(const_iterator __p, __get_node_value_type_t<_Tp>&& __value) { - __emplace_hint_unique(__p, const_cast(__value.first), std::move(__value.second)); - } - - template , int> = 0> - _LIBCPP_HIDE_FROM_ABI void __insert_unique_from_orphaned_node(const_iterator __p, _Tp&& __value) { - __emplace_hint_unique(__p, 
std::move(__value)); - } - - template , int> = 0> - _LIBCPP_HIDE_FROM_ABI void __insert_multi_from_orphaned_node(const_iterator __p, value_type&& __value) { - __emplace_hint_multi(__p, const_cast(__value.first), std::move(__value.second)); - } - - template , int> = 0> - _LIBCPP_HIDE_FROM_ABI void __insert_multi_from_orphaned_node(const_iterator __p, _Tp&& __value) { - __emplace_hint_multi(__p, std::move(__value)); - } - template _LIBCPP_HIDE_FROM_ABI void __insert_range_multi(_InIter __first, _Sent __last) { if (__first == __last) @@ -1388,19 +1379,19 @@ private: // copy the exact structure 1:1. Since this is for copy construction _only_ we know that we get a correct tree. If we // didn't get a correct tree, the invariants of __tree are broken and we have a much bigger problem than an improperly // balanced tree. + template #ifdef _LIBCPP_COMPILER_CLANG_BASED // FIXME: GCC complains about not being able to always_inline a recursive function _LIBCPP_HIDE_FROM_ABI #endif - __node_pointer - __copy_construct_tree(__node_pointer __src) { + __node_pointer __construct_from_tree(__node_pointer __src, _NodeConstructor __construct) { if (!__src) return nullptr; - __node_holder __new_node = __construct_node(__src->__get_value()); + __node_holder __new_node = __construct(__src->__get_value()); unique_ptr<__node, __tree_deleter> __left( - __copy_construct_tree(static_cast<__node_pointer>(__src->__left_)), __node_alloc_); - __node_pointer __right = __copy_construct_tree(static_cast<__node_pointer>(__src->__right_)); + __construct_from_tree(static_cast<__node_pointer>(__src->__left_), __construct), __node_alloc_); + __node_pointer __right = __construct_from_tree(static_cast<__node_pointer>(__src->__right_), __construct); __node_pointer __new_node_ptr = __new_node.release(); @@ -1414,46 +1405,85 @@ private: return __new_node_ptr; } + _LIBCPP_HIDE_FROM_ABI __node_pointer __copy_construct_tree(__node_pointer __src) { + return __construct_from_tree(__src, [this](const value_type& 
__val) { return __construct_node(__val); }); + } + + template , int> = 0> + _LIBCPP_HIDE_FROM_ABI __node_pointer __move_construct_tree(__node_pointer __src) { + return __construct_from_tree(__src, [this](value_type& __val) { + return __construct_node(const_cast(__val.first), std::move(__val.second)); + }); + } + + template , int> = 0> + _LIBCPP_HIDE_FROM_ABI __node_pointer __move_construct_tree(__node_pointer __src) { + return __construct_from_tree(__src, [this](value_type& __val) { return __construct_node(std::move(__val)); }); + } + + template // This copy assignment will always produce a correct red-black-tree assuming the incoming tree is correct, since our // own tree is a red-black-tree and the incoming tree is a red-black-tree. The invariants of a red-black-tree are // temporarily not met until all of the incoming red-black tree is copied. #ifdef _LIBCPP_COMPILER_CLANG_BASED // FIXME: GCC complains about not being able to always_inline a recursive function _LIBCPP_HIDE_FROM_ABI #endif - __node_pointer - __copy_assign_tree(__node_pointer __dest, __node_pointer __src) { + __node_pointer __assign_from_tree( + __node_pointer __dest, __node_pointer __src, _Assignment __assign, _ConstructionAlg __construct_subtree) { if (!__src) { destroy(__dest); return nullptr; } - __assign_value(__dest->__get_value(), __src->__get_value()); + __assign(__dest->__get_value(), __src->__get_value()); __dest->__is_black_ = __src->__is_black_; // If we already have a left node in the destination tree, reuse it and copy-assign recursively if (__dest->__left_) { - __dest->__left_ = static_cast<__node_base_pointer>(__copy_assign_tree( - static_cast<__node_pointer>(__dest->__left_), static_cast<__node_pointer>(__src->__left_))); + __dest->__left_ = static_cast<__node_base_pointer>(__assign_from_tree( + static_cast<__node_pointer>(__dest->__left_), + static_cast<__node_pointer>(__src->__left_), + __assign, + __construct_subtree)); // Otherwise, we must create new nodes; copy-construct 
from here on } else if (__src->__left_) { - auto __new_left = __copy_construct_tree(static_cast<__node_pointer>(__src->__left_)); + auto __new_left = __construct_subtree(static_cast<__node_pointer>(__src->__left_)); __dest->__left_ = static_cast<__node_base_pointer>(__new_left); __new_left->__parent_ = static_cast<__end_node_pointer>(__dest); } // Identical to the left case above, just for the right nodes if (__dest->__right_) { - __dest->__right_ = static_cast<__node_base_pointer>(__copy_assign_tree( - static_cast<__node_pointer>(__dest->__right_), static_cast<__node_pointer>(__src->__right_))); + __dest->__right_ = static_cast<__node_base_pointer>(__assign_from_tree( + static_cast<__node_pointer>(__dest->__right_), + static_cast<__node_pointer>(__src->__right_), + __assign, + __construct_subtree)); } else if (__src->__right_) { - auto __new_right = __copy_construct_tree(static_cast<__node_pointer>(__src->__right_)); + auto __new_right = __construct_subtree(static_cast<__node_pointer>(__src->__right_)); __dest->__right_ = static_cast<__node_base_pointer>(__new_right); __new_right->__parent_ = static_cast<__end_node_pointer>(__dest); } return __dest; } + + _LIBCPP_HIDE_FROM_ABI __node_pointer __copy_assign_tree(__node_pointer __dest, __node_pointer __src) { + return __assign_from_tree( + __dest, + __src, + [](value_type& __lhs, const value_type& __rhs) { __assign_value(__lhs, __rhs); }, + [this](__node_pointer __nd) { return __copy_construct_tree(__nd); }); + } + + _LIBCPP_HIDE_FROM_ABI __node_pointer __move_assign_tree(__node_pointer __dest, __node_pointer __src) { + return __assign_from_tree( + __dest, + __src, + [](value_type& __lhs, value_type& __rhs) { __assign_value(__lhs, std::move(__rhs)); }, + [this](__node_pointer __nd) { return __move_construct_tree(__nd); }); + } }; // Precondition: __size_ != 0 @@ -1594,21 +1624,26 @@ __tree<_Tp, _Compare, _Allocator>::__tree(__tree&& __t) _NOEXCEPT_( template __tree<_Tp, _Compare, _Allocator>::__tree(__tree&& __t, 
const allocator_type& __a) - : __node_alloc_(__node_allocator(__a)), __size_(0), __value_comp_(std::move(__t.value_comp())) { + : __begin_node_(__end_node()), + __node_alloc_(__node_allocator(__a)), + __size_(0), + __value_comp_(std::move(__t.value_comp())) { + if (__t.size() == 0) + return; if (__a == __t.__alloc()) { - if (__t.__size_ == 0) - __begin_node_ = __end_node(); - else { - __begin_node_ = __t.__begin_node_; - __end_node()->__left_ = __t.__end_node()->__left_; - __end_node()->__left_->__parent_ = static_cast<__end_node_pointer>(__end_node()); - __size_ = __t.__size_; - __t.__begin_node_ = __t.__end_node(); - __t.__end_node()->__left_ = nullptr; - __t.__size_ = 0; - } + __begin_node_ = __t.__begin_node_; + __end_node()->__left_ = __t.__end_node()->__left_; + __end_node()->__left_->__parent_ = static_cast<__end_node_pointer>(__end_node()); + __size_ = __t.__size_; + __t.__begin_node_ = __t.__end_node(); + __t.__end_node()->__left_ = nullptr; + __t.__size_ = 0; } else { - __begin_node_ = __end_node(); + *__root_ptr() = static_cast<__node_base_pointer>(__move_construct_tree(__t.__root())); + __root()->__parent_ = __end_node(); + __begin_node_ = static_cast<__end_node_pointer>(std::__tree_min(__end_node()->__left_)); + __size_ = __t.size(); + __t.clear(); // Ensure that __t is in a valid state after moving out the keys } } @@ -1633,22 +1668,21 @@ void __tree<_Tp, _Compare, _Allocator>::__move_assign(__tree& __t, true_type) template void __tree<_Tp, _Compare, _Allocator>::__move_assign(__tree& __t, false_type) { - if (__node_alloc() == __t.__node_alloc()) + if (__node_alloc() == __t.__node_alloc()) { __move_assign(__t, true_type()); - else { - value_comp() = std::move(__t.value_comp()); - const_iterator __e = end(); + } else { + value_comp() = std::move(__t.value_comp()); if (__size_ != 0) { - _DetachedTreeCache __cache(this); - while (__cache.__get() != nullptr && __t.__size_ != 0) { - __assign_value(__cache.__get()->__get_value(), 
std::move(__t.remove(__t.begin())->__get_value())); - __node_insert_multi(__cache.__get()); - __cache.__advance(); - } - } - while (__t.__size_ != 0) { - __insert_multi_from_orphaned_node(__e, std::move(__t.remove(__t.begin())->__get_value())); + *__root_ptr() = static_cast<__node_base_pointer>(__move_assign_tree(__root(), __t.__root())); + } else { + *__root_ptr() = static_cast<__node_base_pointer>(__move_construct_tree(__t.__root())); + if (__root()) + __root()->__parent_ = __end_node(); } + __begin_node_ = + __end_node()->__left_ ? static_cast<__end_node_pointer>(std::__tree_min(__end_node()->__left_)) : __end_node(); + __size_ = __t.size(); + __t.clear(); // Ensure that __t is in a valid state after moving out the keys } } diff --git a/libcxx/include/map b/libcxx/include/map index cc8b8769189d1..0dca11cabd12e 100644 --- a/libcxx/include/map +++ b/libcxx/include/map @@ -995,7 +995,7 @@ public: _LIBCPP_HIDE_FROM_ABI map(map&& __m) = default; - _LIBCPP_HIDE_FROM_ABI map(map&& __m, const allocator_type& __a); + _LIBCPP_HIDE_FROM_ABI map(map&& __m, const allocator_type& __a) : __tree_(std::move(__m.__tree_), __a) {} _LIBCPP_HIDE_FROM_ABI map& operator=(map&& __m) = default; @@ -1023,10 +1023,7 @@ public: _LIBCPP_HIDE_FROM_ABI explicit map(const allocator_type& __a) : __tree_(typename __base::allocator_type(__a)) {} - _LIBCPP_HIDE_FROM_ABI map(const map& __m, const allocator_type& __a) - : __tree_(__m.__tree_.value_comp(), typename __base::allocator_type(__a)) { - insert(__m.begin(), __m.end()); - } + _LIBCPP_HIDE_FROM_ABI map(const map& __m, const allocator_type& __alloc) : __tree_(__m.__tree_, __alloc) {} _LIBCPP_HIDE_FROM_ABI ~map() { static_assert(sizeof(std::__diagnose_non_const_comparator<_Key, _Compare>()), ""); } @@ -1426,18 +1423,6 @@ map(initializer_list>, _Allocator) # endif # ifndef _LIBCPP_CXX03_LANG -template -map<_Key, _Tp, _Compare, _Allocator>::map(map&& __m, const allocator_type& __a) - : __tree_(std::move(__m.__tree_), typename 
__base::allocator_type(__a)) { - if (__a != __m.get_allocator()) { - const_iterator __e = cend(); - while (!__m.empty()) { - __tree_.__insert_unique_from_orphaned_node( - __e.__i_, std::move(__m.__tree_.remove(__m.begin().__i_)->__get_value())); - } - } -} - template _Tp& map<_Key, _Tp, _Compare, _Allocator>::operator[](const key_type& __k) { return __tree_.__emplace_unique(std::piecewise_construct, std::forward_as_tuple(__k), std::forward_as_tuple()) @@ -1683,7 +1668,7 @@ public: _LIBCPP_HIDE_FROM_ABI multimap(multimap&& __m) = default; - _LIBCPP_HIDE_FROM_ABI multimap(multimap&& __m, const allocator_type& __a); + _LIBCPP_HIDE_FROM_ABI multimap(multimap&& __m, const allocator_type& __a) : __tree_(std::move(__m.__tree_), __a) {} _LIBCPP_HIDE_FROM_ABI multimap& operator=(multimap&& __m) = default; @@ -1712,10 +1697,7 @@ public: _LIBCPP_HIDE_FROM_ABI explicit multimap(const allocator_type& __a) : __tree_(typename __base::allocator_type(__a)) {} - _LIBCPP_HIDE_FROM_ABI multimap(const multimap& __m, const allocator_type& __a) - : __tree_(__m.__tree_.value_comp(), typename __base::allocator_type(__a)) { - insert(__m.begin(), __m.end()); - } + _LIBCPP_HIDE_FROM_ABI multimap(const multimap& __m, const allocator_type& __a) : __tree_(__m.__tree_, __a) {} _LIBCPP_HIDE_FROM_ABI ~multimap() { static_assert(sizeof(std::__diagnose_non_const_comparator<_Key, _Compare>()), ""); @@ -1990,19 +1972,6 @@ multimap(initializer_list>, _Allocator) -> multimap, _Tp, less>, _Allocator>; # endif -# ifndef _LIBCPP_CXX03_LANG -template -multimap<_Key, _Tp, _Compare, _Allocator>::multimap(multimap&& __m, const allocator_type& __a) - : __tree_(std::move(__m.__tree_), typename __base::allocator_type(__a)) { - if (__a != __m.get_allocator()) { - const_iterator __e = cend(); - while (!__m.empty()) - __tree_.__insert_multi_from_orphaned_node( - __e.__i_, std::move(__m.__tree_.remove(__m.begin().__i_)->__get_value())); - } -} -# endif - template inline _LIBCPP_HIDE_FROM_ABI bool operator==(const 
multimap<_Key, _Tp, _Compare, _Allocator>& __x, const multimap<_Key, _Tp, _Compare, _Allocator>& __y) { diff --git a/libcxx/include/set b/libcxx/include/set index d58b6e96b061d..3d6f571a42a1a 100644 --- a/libcxx/include/set +++ b/libcxx/include/set @@ -671,12 +671,10 @@ public: _LIBCPP_HIDE_FROM_ABI explicit set(const allocator_type& __a) : __tree_(__a) {} - _LIBCPP_HIDE_FROM_ABI set(const set& __s, const allocator_type& __a) : __tree_(__s.__tree_.value_comp(), __a) { - insert(__s.begin(), __s.end()); - } + _LIBCPP_HIDE_FROM_ABI set(const set& __s, const allocator_type& __alloc) : __tree_(__s.__tree_, __alloc) {} # ifndef _LIBCPP_CXX03_LANG - _LIBCPP_HIDE_FROM_ABI set(set&& __s, const allocator_type& __a); + _LIBCPP_HIDE_FROM_ABI set(set&& __s, const allocator_type& __alloc) : __tree_(std::move(__s.__tree_), __alloc) {} _LIBCPP_HIDE_FROM_ABI set(initializer_list __il, const value_compare& __comp = value_compare()) : __tree_(__comp) { @@ -946,19 +944,6 @@ template , _Allocator) -> set<_Key, less<_Key>, _Allocator>; # endif -# ifndef _LIBCPP_CXX03_LANG - -template -set<_Key, _Compare, _Allocator>::set(set&& __s, const allocator_type& __a) : __tree_(std::move(__s.__tree_), __a) { - if (__a != __s.get_allocator()) { - const_iterator __e = cend(); - while (!__s.empty()) - insert(__e, std::move(__s.__tree_.remove(__s.begin())->__get_value())); - } -} - -# endif // _LIBCPP_CXX03_LANG - template inline _LIBCPP_HIDE_FROM_ABI bool operator==(const set<_Key, _Compare, _Allocator>& __x, const set<_Key, _Compare, _Allocator>& __y) { @@ -1128,13 +1113,10 @@ public: # ifndef _LIBCPP_CXX03_LANG _LIBCPP_HIDE_FROM_ABI multiset(multiset&& __s) = default; - _LIBCPP_HIDE_FROM_ABI multiset(multiset&& __s, const allocator_type& __a); + _LIBCPP_HIDE_FROM_ABI multiset(multiset&& __s, const allocator_type& __a) : __tree_(std::move(__s.__tree_), __a) {} # endif // _LIBCPP_CXX03_LANG _LIBCPP_HIDE_FROM_ABI explicit multiset(const allocator_type& __a) : __tree_(__a) {} - _LIBCPP_HIDE_FROM_ABI 
multiset(const multiset& __s, const allocator_type& __a) - : __tree_(__s.__tree_.value_comp(), __a) { - insert(__s.begin(), __s.end()); - } + _LIBCPP_HIDE_FROM_ABI multiset(const multiset& __s, const allocator_type& __a) : __tree_(__s.__tree_, __a) {} # ifndef _LIBCPP_CXX03_LANG _LIBCPP_HIDE_FROM_ABI multiset(initializer_list __il, const value_compare& __comp = value_compare()) @@ -1407,20 +1389,6 @@ template , _Allocator) -> multiset<_Key, less<_Key>, _Allocator>; # endif -# ifndef _LIBCPP_CXX03_LANG - -template -multiset<_Key, _Compare, _Allocator>::multiset(multiset&& __s, const allocator_type& __a) - : __tree_(std::move(__s.__tree_), __a) { - if (__a != __s.get_allocator()) { - const_iterator __e = cend(); - while (!__s.empty()) - insert(__e, std::move(__s.__tree_.remove(__s.begin())->__get_value())); - } -} - -# endif // _LIBCPP_CXX03_LANG - template inline _LIBCPP_HIDE_FROM_ABI bool operator==(const multiset<_Key, _Compare, _Allocator>& __x, const multiset<_Key, _Compare, _Allocator>& __y) { diff --git a/libcxx/test/benchmarks/containers/associative/associative_container_benchmarks.h b/libcxx/test/benchmarks/containers/associative/associative_container_benchmarks.h index 22a6d0d753b0c..5dd55f244d885 100644 --- a/libcxx/test/benchmarks/containers/associative/associative_container_benchmarks.h +++ b/libcxx/test/benchmarks/containers/associative/associative_container_benchmarks.h @@ -11,6 +11,7 @@ #include #include +#include #include #include #include @@ -33,6 +34,9 @@ struct adapt_operations { // using InsertionResult = ...; // static Container::iterator get_iterator(InsertionResult const&); + + // template + // using rebind_alloc = ...; }; template @@ -103,6 +107,61 @@ void associative_container_benchmarks(std::string container) { } }); + bench("ctor(const&, alloc)", [=](auto& st) { + const std::size_t size = st.range(0); + std::vector in = make_value_types(generate_unique_keys(size)); + Container src(in.begin(), in.end()); + ScratchSpace c[BatchSize]; + + 
while (st.KeepRunningBatch(BatchSize)) { + for (std::size_t i = 0; i != BatchSize; ++i) { + new (c + i) Container(src, std::allocator()); + benchmark::DoNotOptimize(c + i); + benchmark::ClobberMemory(); + } + + st.PauseTiming(); + for (std::size_t i = 0; i != BatchSize; ++i) { + reinterpret_cast(c + i)->~Container(); + } + st.ResumeTiming(); + } + }); + + bench("ctor(&&, different allocs)", [=](auto& st) { + using PMRContainer = adapt_operations::template rebind_alloc< + std::pmr::polymorphic_allocator>; + + const std::size_t size = st.range(0); + std::vector in = make_value_types(generate_unique_keys(size)); + std::pmr::monotonic_buffer_resource rs(size * 64 * BatchSize); // 64 bytes should be enough per node + std::vector srcs; + srcs.reserve(BatchSize); + for (size_t i = 0; i != BatchSize; ++i) + srcs.emplace_back(&rs).insert(in.begin(), in.end()); + alignas(PMRContainer) char c[BatchSize * sizeof(PMRContainer)]; + + std::pmr::monotonic_buffer_resource rs2(size * 64 * BatchSize); // 64 bytes should be enough per node + while (st.KeepRunningBatch(BatchSize)) { + for (std::size_t i = 0; i != BatchSize; ++i) { + new (c + i * sizeof(PMRContainer)) PMRContainer(std::move(srcs[i]), &rs2); + benchmark::DoNotOptimize(c + i); + benchmark::ClobberMemory(); + } + + st.PauseTiming(); + for (std::size_t i = 0; i != BatchSize; ++i) { + reinterpret_cast(c + i * sizeof(PMRContainer))->~PMRContainer(); + } + rs2.release(); + srcs.clear(); + for (size_t i = 0; i != BatchSize; ++i) + srcs.emplace_back(&rs).insert(in.begin(), in.end()); + + st.ResumeTiming(); + } + }); + bench("ctor(iterator, iterator) (unsorted sequence)", [=](auto& st) { const std::size_t size = st.range(0); std::mt19937 randomness; diff --git a/libcxx/test/benchmarks/containers/associative/flat_map.bench.cpp b/libcxx/test/benchmarks/containers/associative/flat_map.bench.cpp index f3b86554802ca..407afb14e1e13 100644 --- a/libcxx/test/benchmarks/containers/associative/flat_map.bench.cpp +++ 
b/libcxx/test/benchmarks/containers/associative/flat_map.bench.cpp @@ -24,6 +24,14 @@ struct support::adapt_operations> { using InsertionResult = std::pair::iterator, bool>; static auto get_iterator(InsertionResult const& result) { return result.first; } + + template + using rebind_alloc = + std::flat_map, + std::vector::template rebind_alloc>, + std::vector::template rebind_alloc>>; }; int main(int argc, char** argv) { diff --git a/libcxx/test/benchmarks/containers/associative/flat_multimap.bench.cpp b/libcxx/test/benchmarks/containers/associative/flat_multimap.bench.cpp index 80eaa549042c6..4f70d26116b0b 100644 --- a/libcxx/test/benchmarks/containers/associative/flat_multimap.bench.cpp +++ b/libcxx/test/benchmarks/containers/associative/flat_multimap.bench.cpp @@ -23,6 +23,14 @@ struct support::adapt_operations> { using InsertionResult = typename std::flat_multimap::iterator; static auto get_iterator(InsertionResult const& result) { return result; } + + template + using rebind_alloc = + std::flat_multimap, + std::vector::template rebind_alloc>, + std::vector::template rebind_alloc>>; }; int main(int argc, char** argv) { diff --git a/libcxx/test/benchmarks/containers/associative/map.bench.cpp b/libcxx/test/benchmarks/containers/associative/map.bench.cpp index 142229ae64cad..cc9ffd857caf2 100644 --- a/libcxx/test/benchmarks/containers/associative/map.bench.cpp +++ b/libcxx/test/benchmarks/containers/associative/map.bench.cpp @@ -38,6 +38,9 @@ struct support::adapt_operations> { using InsertionResult = std::pair::iterator, bool>; static auto get_iterator(InsertionResult const& result) { return result.first; } + + template + using rebind_alloc = std::map, Allocator>; }; int main(int argc, char** argv) { diff --git a/libcxx/test/benchmarks/containers/associative/multimap.bench.cpp b/libcxx/test/benchmarks/containers/associative/multimap.bench.cpp index 15a0b573081bb..8e3abf0b7cf8b 100644 --- a/libcxx/test/benchmarks/containers/associative/multimap.bench.cpp +++ 
b/libcxx/test/benchmarks/containers/associative/multimap.bench.cpp @@ -24,6 +24,9 @@ struct support::adapt_operations> { using InsertionResult = typename std::multimap::iterator; static auto get_iterator(InsertionResult const& result) { return result; } + + template + using rebind_alloc = std::multimap, Allocator>; }; int main(int argc, char** argv) { diff --git a/libcxx/test/benchmarks/containers/associative/multiset.bench.cpp b/libcxx/test/benchmarks/containers/associative/multiset.bench.cpp index c205e0a4f793f..7bafd0ab52dce 100644 --- a/libcxx/test/benchmarks/containers/associative/multiset.bench.cpp +++ b/libcxx/test/benchmarks/containers/associative/multiset.bench.cpp @@ -22,6 +22,9 @@ struct support::adapt_operations> { using InsertionResult = typename std::multiset::iterator; static auto get_iterator(InsertionResult const& result) { return result; } + + template + using rebind_alloc = std::multiset, Allocator>; }; int main(int argc, char** argv) { diff --git a/libcxx/test/benchmarks/containers/associative/set.bench.cpp b/libcxx/test/benchmarks/containers/associative/set.bench.cpp index 50ee142b6e8b3..e5a6cc58913d2 100644 --- a/libcxx/test/benchmarks/containers/associative/set.bench.cpp +++ b/libcxx/test/benchmarks/containers/associative/set.bench.cpp @@ -23,6 +23,9 @@ struct support::adapt_operations> { using InsertionResult = std::pair::iterator, bool>; static auto get_iterator(InsertionResult const& result) { return result.first; } + + template + using rebind_alloc = std::set, Allocator>; }; int main(int argc, char** argv) { diff --git a/libcxx/test/benchmarks/containers/associative/unordered_map.bench.cpp b/libcxx/test/benchmarks/containers/associative/unordered_map.bench.cpp index d670c531910ea..ddfc90c306010 100644 --- a/libcxx/test/benchmarks/containers/associative/unordered_map.bench.cpp +++ b/libcxx/test/benchmarks/containers/associative/unordered_map.bench.cpp @@ -37,6 +37,9 @@ struct support::adapt_operations> { using InsertionResult = 
std::pair::iterator, bool>; static auto get_iterator(InsertionResult const& result) { return result.first; } + + template + using rebind_alloc = std::unordered_map, std::equal_to, Allocator>; }; int main(int argc, char** argv) { diff --git a/libcxx/test/benchmarks/containers/associative/unordered_multimap.bench.cpp b/libcxx/test/benchmarks/containers/associative/unordered_multimap.bench.cpp index 8738ca4bf9f0c..5d92bd8b2deaf 100644 --- a/libcxx/test/benchmarks/containers/associative/unordered_multimap.bench.cpp +++ b/libcxx/test/benchmarks/containers/associative/unordered_multimap.bench.cpp @@ -23,6 +23,9 @@ struct support::adapt_operations> { using InsertionResult = typename std::unordered_multimap::iterator; static auto get_iterator(InsertionResult const& result) { return result; } + + template + using rebind_alloc = std::unordered_multimap, std::equal_to, Allocator>; }; int main(int argc, char** argv) { diff --git a/libcxx/test/benchmarks/containers/associative/unordered_multiset.bench.cpp b/libcxx/test/benchmarks/containers/associative/unordered_multiset.bench.cpp index 4888b01bfeba0..09412fc4aeae7 100644 --- a/libcxx/test/benchmarks/containers/associative/unordered_multiset.bench.cpp +++ b/libcxx/test/benchmarks/containers/associative/unordered_multiset.bench.cpp @@ -22,6 +22,9 @@ struct support::adapt_operations> { using InsertionResult = typename std::unordered_multiset::iterator; static auto get_iterator(InsertionResult const& result) { return result; } + + template + using rebind_alloc = std::unordered_multiset, std::equal_to, Allocator>; }; int main(int argc, char** argv) { diff --git a/libcxx/test/benchmarks/containers/associative/unordered_set.bench.cpp b/libcxx/test/benchmarks/containers/associative/unordered_set.bench.cpp index 89443a597e85a..1b6663321b43c 100644 --- a/libcxx/test/benchmarks/containers/associative/unordered_set.bench.cpp +++ b/libcxx/test/benchmarks/containers/associative/unordered_set.bench.cpp @@ -24,6 +24,9 @@ struct 
support::adapt_operations> { using InsertionResult = std::pair::iterator, bool>; static auto get_iterator(InsertionResult const& result) { return result.first; } + + template + using rebind_alloc = std::unordered_set, std::equal_to, Allocator>; }; int main(int argc, char** argv) { From 09fd430de7a6002d7a22669348d6589bd32bd5ad Mon Sep 17 00:00:00 2001 From: Ebuka Ezike Date: Wed, 12 Nov 2025 14:54:13 +0000 Subject: [PATCH 10/15] [lldb-dap] Refactor event thread (#166948) Handle each event type in a different function --- lldb/tools/lldb-dap/DAP.cpp | 330 +++++++++++++++++++----------------- lldb/tools/lldb-dap/DAP.h | 4 + 2 files changed, 178 insertions(+), 156 deletions(-) diff --git a/lldb/tools/lldb-dap/DAP.cpp b/lldb/tools/lldb-dap/DAP.cpp index 11aed33886edb..d4203a2f00983 100644 --- a/lldb/tools/lldb-dap/DAP.cpp +++ b/lldb/tools/lldb-dap/DAP.cpp @@ -1319,7 +1319,7 @@ void DAP::ProgressEventThread() { lldb::SBEvent event; bool done = false; while (!done) { - if (listener.WaitForEvent(1, event)) { + if (listener.WaitForEvent(UINT32_MAX, event)) { const auto event_mask = event.GetType(); if (event.BroadcasterMatchesRef(broadcaster)) { if (event_mask & eBroadcastBitStopProgressThread) { @@ -1377,7 +1377,6 @@ void DAP::ProgressEventThread() { // is required. void DAP::EventThread() { llvm::set_thread_name("lldb.DAP.client." 
+ m_client_name + ".event_handler"); - lldb::SBEvent event; lldb::SBListener listener = debugger.GetListener(); broadcaster.AddListener(listener, eBroadcastBitStopEventThread); debugger.GetBroadcaster().AddListener( @@ -1388,169 +1387,176 @@ void DAP::EventThread() { debugger, lldb::SBThread::GetBroadcasterClassName(), lldb::SBThread::eBroadcastBitStackChanged); + lldb::SBEvent event; bool done = false; while (!done) { - if (listener.WaitForEvent(1, event)) { - const auto event_mask = event.GetType(); - if (lldb::SBProcess::EventIsProcessEvent(event)) { - lldb::SBProcess process = lldb::SBProcess::GetProcessFromEvent(event); - if (event_mask & lldb::SBProcess::eBroadcastBitStateChanged) { - auto state = lldb::SBProcess::GetStateFromEvent(event); - switch (state) { - case lldb::eStateConnected: - case lldb::eStateDetached: - case lldb::eStateInvalid: - case lldb::eStateUnloaded: - break; - case lldb::eStateAttaching: - case lldb::eStateCrashed: - case lldb::eStateLaunching: - case lldb::eStateStopped: - case lldb::eStateSuspended: - // Only report a stopped event if the process was not - // automatically restarted. - if (!lldb::SBProcess::GetRestartedFromEvent(event)) { - SendStdOutStdErr(*this, process); - if (llvm::Error err = SendThreadStoppedEvent(*this)) - DAP_LOG_ERROR(log, std::move(err), - "({1}) reporting thread stopped: {0}", - m_client_name); - } - break; - case lldb::eStateRunning: - case lldb::eStateStepping: - WillContinue(); - SendContinuedEvent(*this); - break; - case lldb::eStateExited: - lldb::SBStream stream; - process.GetStatus(stream); - SendOutput(OutputType::Console, stream.GetData()); - - // When restarting, we can get an "exited" event for the process we - // just killed with the old PID, or even with no PID. In that case - // we don't have to terminate the session. 
- if (process.GetProcessID() == LLDB_INVALID_PROCESS_ID || - process.GetProcessID() == restarting_process_id) { - restarting_process_id = LLDB_INVALID_PROCESS_ID; - } else { - // Run any exit LLDB commands the user specified in the - // launch.json - RunExitCommands(); - SendProcessExitedEvent(*this, process); - SendTerminatedEvent(); - done = true; - } - break; - } - } else if ((event_mask & lldb::SBProcess::eBroadcastBitSTDOUT) || - (event_mask & lldb::SBProcess::eBroadcastBitSTDERR)) { - SendStdOutStdErr(*this, process); - } - } else if (lldb::SBTarget::EventIsTargetEvent(event)) { - if (event_mask & lldb::SBTarget::eBroadcastBitModulesLoaded || - event_mask & lldb::SBTarget::eBroadcastBitModulesUnloaded || - event_mask & lldb::SBTarget::eBroadcastBitSymbolsLoaded || - event_mask & lldb::SBTarget::eBroadcastBitSymbolsChanged) { - const uint32_t num_modules = - lldb::SBTarget::GetNumModulesFromEvent(event); - const bool remove_module = - event_mask & lldb::SBTarget::eBroadcastBitModulesUnloaded; - - // NOTE: Both mutexes must be acquired to prevent deadlock when - // handling `modules_request`, which also requires both locks. 
- lldb::SBMutex api_mutex = GetAPIMutex(); - const std::scoped_lock guard( - api_mutex, modules_mutex); - for (uint32_t i = 0; i < num_modules; ++i) { - lldb::SBModule module = - lldb::SBTarget::GetModuleAtIndexFromEvent(i, event); - - std::optional p_module = - CreateModule(target, module, remove_module); - if (!p_module) - continue; - - llvm::StringRef module_id = p_module->id; - - const bool module_exists = modules.contains(module_id); - if (remove_module && module_exists) { - modules.erase(module_id); - Send(protocol::Event{ - "module", ModuleEventBody{std::move(p_module).value(), - ModuleEventBody::eReasonRemoved}}); - } else if (module_exists) { - Send(protocol::Event{ - "module", ModuleEventBody{std::move(p_module).value(), - ModuleEventBody::eReasonChanged}}); - } else if (!remove_module) { - modules.insert(module_id); - Send(protocol::Event{ - "module", ModuleEventBody{std::move(p_module).value(), - ModuleEventBody::eReasonNew}}); - } - } - } - } else if (lldb::SBBreakpoint::EventIsBreakpointEvent(event)) { - if (event_mask & lldb::SBTarget::eBroadcastBitBreakpointChanged) { - auto event_type = - lldb::SBBreakpoint::GetBreakpointEventTypeFromEvent(event); - auto bp = Breakpoint( - *this, lldb::SBBreakpoint::GetBreakpointFromEvent(event)); - // If the breakpoint was set through DAP, it will have the - // BreakpointBase::kDAPBreakpointLabel. Regardless of whether - // locations were added, removed, or resolved, the breakpoint isn't - // going away and the reason is always "changed". - if ((event_type & lldb::eBreakpointEventTypeLocationsAdded || - event_type & lldb::eBreakpointEventTypeLocationsRemoved || - event_type & lldb::eBreakpointEventTypeLocationsResolved) && - bp.MatchesName(BreakpointBase::kDAPBreakpointLabel)) { - // As the DAP client already knows the path of this breakpoint, we - // don't need to send it back as part of the "changed" event. This - // avoids sending paths that should be source mapped. 
Note that - // CreateBreakpoint doesn't apply source mapping and certain - // implementation ignore the source part of this event anyway. - protocol::Breakpoint protocol_bp = bp.ToProtocolBreakpoint(); - - // "source" is not needed here, unless we add adapter data to be - // saved by the client. - if (protocol_bp.source && !protocol_bp.source->adapterData) - protocol_bp.source = std::nullopt; - - llvm::json::Object body; - body.try_emplace("breakpoint", protocol_bp); - body.try_emplace("reason", "changed"); - - llvm::json::Object bp_event = CreateEventObject("breakpoint"); - bp_event.try_emplace("body", std::move(body)); - - SendJSON(llvm::json::Value(std::move(bp_event))); - } - } + if (!listener.WaitForEvent(UINT32_MAX, event)) + continue; - } else if (lldb::SBThread::EventIsThreadEvent(event)) { - HandleThreadEvent(event); - } else if (event_mask & lldb::eBroadcastBitError || - event_mask & lldb::eBroadcastBitWarning) { - lldb::SBStructuredData data = - lldb::SBDebugger::GetDiagnosticFromEvent(event); - if (!data.IsValid()) - continue; - std::string type = GetStringValue(data.GetValueForKey("type")); - std::string message = GetStringValue(data.GetValueForKey("message")); - SendOutput(OutputType::Important, - llvm::formatv("{0}: {1}", type, message).str()); - } else if (event.BroadcasterMatchesRef(broadcaster)) { - if (event_mask & eBroadcastBitStopEventThread) { - done = true; - } + const uint32_t event_mask = event.GetType(); + if (lldb::SBProcess::EventIsProcessEvent(event)) { + HandleProcessEvent(event, /*&process_exited=*/done); + } else if (lldb::SBTarget::EventIsTargetEvent(event)) { + HandleTargetEvent(event); + } else if (lldb::SBBreakpoint::EventIsBreakpointEvent(event)) { + HandleBreakpointEvent(event); + } else if (lldb::SBThread::EventIsThreadEvent(event)) { + HandleThreadEvent(event); + } else if (event_mask & lldb::eBroadcastBitError || + event_mask & lldb::eBroadcastBitWarning) { + HandleDiagnosticEvent(event); + } else if 
(event.BroadcasterMatchesRef(broadcaster)) { + if (event_mask & eBroadcastBitStopEventThread) { + done = true; + } + } + } +} + +void DAP::HandleProcessEvent(const lldb::SBEvent &event, bool &process_exited) { + lldb::SBProcess process = lldb::SBProcess::GetProcessFromEvent(event); + const uint32_t event_mask = event.GetType(); + if (event_mask & lldb::SBProcess::eBroadcastBitStateChanged) { + auto state = lldb::SBProcess::GetStateFromEvent(event); + switch (state) { + case lldb::eStateConnected: + case lldb::eStateDetached: + case lldb::eStateInvalid: + case lldb::eStateUnloaded: + break; + case lldb::eStateAttaching: + case lldb::eStateCrashed: + case lldb::eStateLaunching: + case lldb::eStateStopped: + case lldb::eStateSuspended: + // Only report a stopped event if the process was not + // automatically restarted. + if (!lldb::SBProcess::GetRestartedFromEvent(event)) { + SendStdOutStdErr(*this, process); + if (llvm::Error err = SendThreadStoppedEvent(*this)) + DAP_LOG_ERROR(log, std::move(err), + "({1}) reporting thread stopped: {0}", m_client_name); + } + break; + case lldb::eStateRunning: + case lldb::eStateStepping: + WillContinue(); + SendContinuedEvent(*this); + break; + case lldb::eStateExited: + lldb::SBStream stream; + process.GetStatus(stream); + SendOutput(OutputType::Console, stream.GetData()); + + // When restarting, we can get an "exited" event for the process we + // just killed with the old PID, or even with no PID. In that case + // we don't have to terminate the session. 
+ if (process.GetProcessID() == LLDB_INVALID_PROCESS_ID || + process.GetProcessID() == restarting_process_id) { + restarting_process_id = LLDB_INVALID_PROCESS_ID; + } else { + // Run any exit LLDB commands the user specified in the + // launch.json + RunExitCommands(); + SendProcessExitedEvent(*this, process); + SendTerminatedEvent(); + process_exited = true; + } + break; + } + } else if ((event_mask & lldb::SBProcess::eBroadcastBitSTDOUT) || + (event_mask & lldb::SBProcess::eBroadcastBitSTDERR)) { + SendStdOutStdErr(*this, process); + } +} + +void DAP::HandleTargetEvent(const lldb::SBEvent &event) { + const uint32_t event_mask = event.GetType(); + if (event_mask & lldb::SBTarget::eBroadcastBitModulesLoaded || + event_mask & lldb::SBTarget::eBroadcastBitModulesUnloaded || + event_mask & lldb::SBTarget::eBroadcastBitSymbolsLoaded || + event_mask & lldb::SBTarget::eBroadcastBitSymbolsChanged) { + const uint32_t num_modules = lldb::SBTarget::GetNumModulesFromEvent(event); + const bool remove_module = + event_mask & lldb::SBTarget::eBroadcastBitModulesUnloaded; + + // NOTE: Both mutexes must be acquired to prevent deadlock when + // handling `modules_request`, which also requires both locks. 
+ lldb::SBMutex api_mutex = GetAPIMutex(); + const std::scoped_lock guard(api_mutex, + modules_mutex); + for (uint32_t i = 0; i < num_modules; ++i) { + lldb::SBModule module = + lldb::SBTarget::GetModuleAtIndexFromEvent(i, event); + + std::optional p_module = + CreateModule(target, module, remove_module); + if (!p_module) + continue; + + const llvm::StringRef module_id = p_module->id; + + const bool module_exists = modules.contains(module_id); + if (remove_module && module_exists) { + modules.erase(module_id); + Send(protocol::Event{"module", + ModuleEventBody{std::move(p_module).value(), + ModuleEventBody::eReasonRemoved}}); + } else if (module_exists) { + Send(protocol::Event{"module", + ModuleEventBody{std::move(p_module).value(), + ModuleEventBody::eReasonChanged}}); + } else if (!remove_module) { + modules.insert(module_id); + Send(protocol::Event{"module", + ModuleEventBody{std::move(p_module).value(), + ModuleEventBody::eReasonNew}}); } } } } +void DAP::HandleBreakpointEvent(const lldb::SBEvent &event) { + const uint32_t event_mask = event.GetType(); + if (!(event_mask & lldb::SBTarget::eBroadcastBitBreakpointChanged)) + return; + + auto event_type = lldb::SBBreakpoint::GetBreakpointEventTypeFromEvent(event); + auto bp = + Breakpoint(*this, lldb::SBBreakpoint::GetBreakpointFromEvent(event)); + // If the breakpoint was set through DAP, it will have the + // BreakpointBase::kDAPBreakpointLabel. Regardless of whether + // locations were added, removed, or resolved, the breakpoint isn't + // going away and the reason is always "changed". + if ((event_type & lldb::eBreakpointEventTypeLocationsAdded || + event_type & lldb::eBreakpointEventTypeLocationsRemoved || + event_type & lldb::eBreakpointEventTypeLocationsResolved) && + bp.MatchesName(BreakpointBase::kDAPBreakpointLabel)) { + // As the DAP client already knows the path of this breakpoint, we + // don't need to send it back as part of the "changed" event. 
This + // avoids sending paths that should be source mapped. Note that + // CreateBreakpoint doesn't apply source mapping and certain + // implementation ignore the source part of this event anyway. + protocol::Breakpoint protocol_bp = bp.ToProtocolBreakpoint(); + + // "source" is not needed here, unless we add adapter data to be + // saved by the client. + if (protocol_bp.source && !protocol_bp.source->adapterData) + protocol_bp.source = std::nullopt; + + llvm::json::Object body; + body.try_emplace("breakpoint", protocol_bp); + body.try_emplace("reason", "changed"); + + llvm::json::Object bp_event = CreateEventObject("breakpoint"); + bp_event.try_emplace("body", std::move(body)); + + SendJSON(llvm::json::Value(std::move(bp_event))); + } +} + void DAP::HandleThreadEvent(const lldb::SBEvent &event) { - uint32_t event_type = event.GetType(); + const uint32_t event_type = event.GetType(); if (event_type & lldb::SBThread::eBroadcastBitStackChanged) { const lldb::SBThread evt_thread = lldb::SBThread::GetThreadFromEvent(event); @@ -1559,6 +1565,18 @@ void DAP::HandleThreadEvent(const lldb::SBEvent &event) { } } +void DAP::HandleDiagnosticEvent(const lldb::SBEvent &event) { + const lldb::SBStructuredData data = + lldb::SBDebugger::GetDiagnosticFromEvent(event); + if (!data.IsValid()) + return; + + std::string type = GetStringValue(data.GetValueForKey("type")); + std::string message = GetStringValue(data.GetValueForKey("message")); + SendOutput(OutputType::Important, + llvm::formatv("{0}: {1}", type, message).str()); +} + std::vector DAP::SetSourceBreakpoints( const protocol::Source &source, const std::optional> &breakpoints) { diff --git a/lldb/tools/lldb-dap/DAP.h b/lldb/tools/lldb-dap/DAP.h index b4f111e4e720c..5d40341329f34 100644 --- a/lldb/tools/lldb-dap/DAP.h +++ b/lldb/tools/lldb-dap/DAP.h @@ -454,7 +454,11 @@ struct DAP final : public DAPTransport::MessageHandler { /// Event threads. 
/// @{ void EventThread(); + void HandleProcessEvent(const lldb::SBEvent &event, bool &process_exited); + void HandleTargetEvent(const lldb::SBEvent &event); + void HandleBreakpointEvent(const lldb::SBEvent &event); void HandleThreadEvent(const lldb::SBEvent &event); + void HandleDiagnosticEvent(const lldb::SBEvent &event); void ProgressEventThread(); std::thread event_thread; From 1e4b9e4059e12924b1ad38bd127cdbdb57e3c488 Mon Sep 17 00:00:00 2001 From: Tarun Prabhu Date: Wed, 12 Nov 2025 08:10:36 -0700 Subject: [PATCH 11/15] [flang][NFC] Strip trailing whitespace from tests (3 of N) Only the fortran source files in flang/test have been modified. The other files in the directory will be cleaned up in subsequent commits --- flang/test/Lower/HLFIR/charconvert.f90 | 2 +- .../HLFIR/procedure-pointer-component-default-init.f90 | 2 +- flang/test/Lower/HLFIR/procedure-pointer.f90 | 10 +++++----- flang/test/Lower/HLFIR/reshape.f90 | 2 +- flang/test/Lower/MIF/co_broadcast.f90 | 4 ++-- flang/test/Lower/MIF/co_max.f90 | 6 +++--- flang/test/Lower/MIF/co_min.f90 | 6 +++--- flang/test/Lower/MIF/co_sum.f90 | 2 +- flang/test/Lower/MIF/coarray-init.f90 | 2 +- flang/test/Lower/MIF/num_images.f90 | 2 +- flang/test/Lower/MIF/sync_all.f90 | 6 +++--- flang/test/Lower/MIF/sync_images.f90 | 8 ++++---- flang/test/Lower/MIF/sync_memory.f90 | 8 ++++---- flang/test/Lower/MIF/this_image.f90 | 2 +- 14 files changed, 31 insertions(+), 31 deletions(-) diff --git a/flang/test/Lower/HLFIR/charconvert.f90 b/flang/test/Lower/HLFIR/charconvert.f90 index 1044986a3db01..f4cd3b17fee41 100644 --- a/flang/test/Lower/HLFIR/charconvert.f90 +++ b/flang/test/Lower/HLFIR/charconvert.f90 @@ -56,7 +56,7 @@ subroutine charconvert3(c, c4) end subroutine ! CHECK-LABEL: func.func @_QPcharconvert3 -! CHECK-SAME: %[[ARG0:.*]]: !fir.boxchar<1> {{.*}}, %[[ARG1:.*]]: !fir.boxchar<4> +! CHECK-SAME: %[[ARG0:.*]]: !fir.boxchar<1> {{.*}}, %[[ARG1:.*]]: !fir.boxchar<4> ! 
CHECK: %[[VAL_0:.*]]:2 = fir.unboxchar %[[ARG0]] : (!fir.boxchar<1>) -> (!fir.ref>, index) ! CHECK: %[[VAL_1:.*]]:2 = hlfir.declare %[[VAL_0]]#0 typeparams %[[VAL_0]]#1 dummy_scope %{{[0-9]+}} arg {{[0-9]+}} {uniq_name = "_QFcharconvert3Ec"} : (!fir.ref>, index, !fir.dscope) -> (!fir.boxchar<1>, !fir.ref>) ! CHECK: %[[VAL_2:.*]]:2 = fir.unboxchar %[[ARG1]] : (!fir.boxchar<4>) -> (!fir.ref>, index) diff --git a/flang/test/Lower/HLFIR/procedure-pointer-component-default-init.f90 b/flang/test/Lower/HLFIR/procedure-pointer-component-default-init.f90 index 85931262b5892..61cc743ec9c59 100644 --- a/flang/test/Lower/HLFIR/procedure-pointer-component-default-init.f90 +++ b/flang/test/Lower/HLFIR/procedure-pointer-component-default-init.f90 @@ -1,5 +1,5 @@ ! Test procedure pointer component default initialization when the size -! of the derived type is 32 bytes and larger. +! of the derived type is 32 bytes and larger. ! RUN: bbc -emit-hlfir -o - %s | FileCheck %s interface diff --git a/flang/test/Lower/HLFIR/procedure-pointer.f90 b/flang/test/Lower/HLFIR/procedure-pointer.f90 index 9680497a1e7ac..75e81c165d25b 100644 --- a/flang/test/Lower/HLFIR/procedure-pointer.f90 +++ b/flang/test/Lower/HLFIR/procedure-pointer.f90 @@ -11,7 +11,7 @@ real function real_func(x) real :: x end function character(:) function char_func(x) - pointer :: char_func + pointer :: char_func integer :: x end function subroutine sub(x) @@ -148,7 +148,7 @@ subroutine sub5() use m procedure(real), pointer :: p3 - p3 => real_func + p3 => real_func ! CHECK: %[[VAL_0:.*]] = fir.alloca !fir.boxproc<() -> f32> {bindc_name = "p3", uniq_name = "_QFsub5Ep3"} ! CHECK: %[[VAL_1:.*]] = fir.zero_bits () -> f32 ! CHECK: %[[VAL_2:.*]] = fir.emboxproc %[[VAL_1]] : (() -> f32) -> !fir.boxproc<() -> f32> @@ -165,7 +165,7 @@ subroutine sub6() procedure(), pointer :: p4 real :: r - p4 => sub + p4 => sub ! CHECK: %[[VAL_0:.*]] = fir.alloca !fir.boxproc<() -> ()> {bindc_name = "p4", uniq_name = "_QFsub6Ep4"} ! 
CHECK: %[[VAL_1:.*]] = fir.zero_bits () -> () ! CHECK: %[[VAL_2:.*]] = fir.emboxproc %[[VAL_1]] : (() -> ()) -> !fir.boxproc<() -> ()> @@ -197,7 +197,7 @@ subroutine sub7(p1, p2) call foo2(p2) ! CHECK: fir.call @_QPfoo2(%[[VAL_1]]#0) fastmath : (!fir.ref ()>>) -> () -end +end subroutine sub8() use m @@ -338,7 +338,7 @@ subroutine sub12() ! CHECK: %[[VAL_16:.*]]:2 = hlfir.declare %[[VAL_0]] {uniq_name = ".tmp.intrinsic_result"} : (!fir.ref) -> !fir.box>>>>) -> (!fir.ref) -> !fir.box>>>>, !fir.ref) -> !fir.box>>>>) ! CHECK: %[[VAL_17:.*]] = fir.convert %[[VAL_16]]#0 : (!fir.ref) -> !fir.box>>>>) -> !fir.ref ()>> ! CHECK: fir.call @_QPfoo2(%[[VAL_17]]) fastmath : (!fir.ref ()>>) -> () -end +end subroutine test_opt_pointer() interface diff --git a/flang/test/Lower/HLFIR/reshape.f90 b/flang/test/Lower/HLFIR/reshape.f90 index 8bf3cfd08c6ac..83072d33d6052 100644 --- a/flang/test/Lower/HLFIR/reshape.f90 +++ b/flang/test/Lower/HLFIR/reshape.f90 @@ -49,7 +49,7 @@ end subroutine reshape_test_nopad ! CHECK: %[[VAL_10:.*]]:2 = hlfir.declare {{.*}}{uniq_name = "_QFreshape_test_nopadEsh"} : (!fir.ref>, !fir.shape<1>, !fir.dscope) -> (!fir.ref>, !fir.ref>) ! CHECK: %[[VAL_11:.*]]:2 = hlfir.declare {{.*}}{uniq_name = "_QFreshape_test_nopadEsource"} : (!fir.box>, !fir.dscope) -> (!fir.box>, !fir.box>) ! CHECK: %[[VAL_13:.*]] = hlfir.reshape %[[VAL_11]]#0 %[[VAL_10]]#0 order %[[VAL_7]]#0 : (!fir.box>, !fir.ref>, !fir.ref>) -> !hlfir.expr - + subroutine test_reshape_optional1(pad, order, source, shape) real, pointer :: pad(:, :) integer, pointer :: order(:) diff --git a/flang/test/Lower/MIF/co_broadcast.f90 b/flang/test/Lower/MIF/co_broadcast.f90 index 25e4330ade704..fadee5f6bcdf8 100644 --- a/flang/test/Lower/MIF/co_broadcast.f90 +++ b/flang/test/Lower/MIF/co_broadcast.f90 @@ -34,13 +34,13 @@ program test_co_broadcast ! CHECK: %[[V1:.*]] = fir.embox %[[ARRAY_I:.*]]#0(%[[SHAPE_2]]) : (!fir.ref>, !fir.shape<1>) -> !fir.box> ! 
CHECK: mif.co_broadcast %[[V1]] source %[[C1_i32:.*]] : (!fir.box>, i32) call co_broadcast(array_i, source_image=1) - + ! CHECK: %[[C1_i32:.*]] = arith.constant 1 : i32 ! CHECK: %[[SHAPE_2:.*]] = fir.shape %[[C2_2:.*]] : (index) -> !fir.shape<1> ! CHECK: %[[V1:.*]] = fir.embox %[[ARRAY_C:.*]]#0(%[[SHAPE_2]]) : (!fir.ref>>, !fir.shape<1>) -> !fir.box>> ! CHECK: mif.co_broadcast %[[V1]] source %[[C1_i32:.*]] : (!fir.box>>, i32) call co_broadcast(array_c, source_image=1) - + ! CHECK: %[[C1_i32:.*]] = arith.constant 1 : i32 ! CHECK: %[[SHAPE_2:.*]] = fir.shape %[[C2_2:.*]] : (index) -> !fir.shape<1> ! CHECK: %[[V1:.*]] = fir.embox %[[ARRAY_D:.*]]#0(%[[SHAPE_2]]) : (!fir.ref>, !fir.shape<1>) -> !fir.box> diff --git a/flang/test/Lower/MIF/co_max.f90 b/flang/test/Lower/MIF/co_max.f90 index 19e65626b50f2..0a179c832dce8 100644 --- a/flang/test/Lower/MIF/co_max.f90 +++ b/flang/test/Lower/MIF/co_max.f90 @@ -40,12 +40,12 @@ program test_co_max ! CHECK: %[[V1:.*]] = fir.embox %[[ARRAY_I:.*]]#0(%[[SHAPE_2]]) : (!fir.ref>, !fir.shape<1>) -> !fir.box> ! CHECK: mif.co_max %[[V1]] : (!fir.box>) call co_max(array_i) - + ! CHECK: %[[SHAPE_2:.*]] = fir.shape %[[C2_2:.*]] : (index) -> !fir.shape<1> ! CHECK: %[[V1:.*]] = fir.embox %[[ARRAY_C:.*]]#0(%[[SHAPE_2]]) : (!fir.ref>>, !fir.shape<1>) -> !fir.box>> ! CHECK: mif.co_max %[[V1]] result %[[C1_i32:.*]] : (!fir.box>>, i32) - call co_max(array_c, result_image=1) - + call co_max(array_c, result_image=1) + ! CHECK: %[[SHAPE_2:.*]] = fir.shape %[[C2_2:.*]] : (index) -> !fir.shape<1> ! CHECK: %[[V1:.*]] = fir.embox %[[ARRAY_D:.*]]#0(%[[SHAPE_2]]) : (!fir.ref>, !fir.shape<1>) -> !fir.box> ! CHECK: mif.co_max %[[V1]] result %[[C1_i32:.*]] stat %[[STATUS:.*]]#0 : (!fir.box>, i32, !fir.ref) diff --git a/flang/test/Lower/MIF/co_min.f90 b/flang/test/Lower/MIF/co_min.f90 index a7adc6b540147..bedee0e61619c 100644 --- a/flang/test/Lower/MIF/co_min.f90 +++ b/flang/test/Lower/MIF/co_min.f90 @@ -40,12 +40,12 @@ program test_co_min ! 
CHECK: %[[V1:.*]] = fir.embox %[[ARRAY_I:.*]]#0(%[[SHAPE_2]]) : (!fir.ref>, !fir.shape<1>) -> !fir.box> ! CHECK: mif.co_min %[[V1]] : (!fir.box>) call co_min(array_i) - + ! CHECK: %[[SHAPE_2:.*]] = fir.shape %[[C2_2:.*]] : (index) -> !fir.shape<1> ! CHECK: %[[V1:.*]] = fir.embox %[[ARRAY_C:.*]]#0(%[[SHAPE_2]]) : (!fir.ref>>, !fir.shape<1>) -> !fir.box>> ! CHECK: mif.co_min %[[V1]] result %[[C1_i32:.*]] : (!fir.box>>, i32) - call co_min(array_c, result_image=1) - + call co_min(array_c, result_image=1) + ! CHECK: %[[SHAPE_2:.*]] = fir.shape %[[C2_2:.*]] : (index) -> !fir.shape<1> ! CHECK: %[[V1:.*]] = fir.embox %[[ARRAY_D:.*]]#0(%[[SHAPE_2]]) : (!fir.ref>, !fir.shape<1>) -> !fir.box> ! CHECK: mif.co_min %[[V1]] result %[[C1_i32:.*]] stat %[[STATUS:.*]]#0 : (!fir.box>, i32, !fir.ref) diff --git a/flang/test/Lower/MIF/co_sum.f90 b/flang/test/Lower/MIF/co_sum.f90 index 0d8a25850ad5f..9710fd6d521ff 100644 --- a/flang/test/Lower/MIF/co_sum.f90 +++ b/flang/test/Lower/MIF/co_sum.f90 @@ -36,7 +36,7 @@ program test_co_sum ! CHECK: %[[V1:.*]] = fir.embox %[[ARRAY_I:.*]]#0(%[[SHAPE_2]]) : (!fir.ref>, !fir.shape<1>) -> !fir.box> ! CHECK: mif.co_sum %[[V1]] : (!fir.box>) call co_sum(array_i) - + ! CHECK: %[[SHAPE_2:.*]] = fir.shape %[[C2_2:.*]] : (index) -> !fir.shape<1> ! CHECK: %[[V1:.*]] = fir.embox %[[ARRAY_D:.*]]#0(%[[SHAPE_2]]) : (!fir.ref>, !fir.shape<1>) -> !fir.box> ! CHECK: mif.co_sum %[[V1]] result %[[C1_i32:.*]] stat %[[STATUS:.*]]#0 : (!fir.box>, i32, !fir.ref) diff --git a/flang/test/Lower/MIF/coarray-init.f90 b/flang/test/Lower/MIF/coarray-init.f90 index e3544736df284..e3526f6e09993 100644 --- a/flang/test/Lower/MIF/coarray-init.f90 +++ b/flang/test/Lower/MIF/coarray-init.f90 @@ -3,7 +3,7 @@ program test_init -end +end ! ALL-LABEL: func.func @main ! 
ALL: fir.call @_FortranAProgramStart diff --git a/flang/test/Lower/MIF/num_images.f90 b/flang/test/Lower/MIF/num_images.f90 index a673b6e8120f8..8f31ab4bc0090 100644 --- a/flang/test/Lower/MIF/num_images.f90 +++ b/flang/test/Lower/MIF/num_images.f90 @@ -3,7 +3,7 @@ program test use iso_fortran_env integer :: i - integer :: team_number + integer :: team_number type(team_type) :: team ! CHECK: mif.num_images : () -> i32 diff --git a/flang/test/Lower/MIF/sync_all.f90 b/flang/test/Lower/MIF/sync_all.f90 index 2b1997c8cc0b8..4d685df31abbb 100644 --- a/flang/test/Lower/MIF/sync_all.f90 +++ b/flang/test/Lower/MIF/sync_all.f90 @@ -4,7 +4,7 @@ program test_sync_all implicit none ! NOCOARRAY: Not yet implemented: Multi-image features are experimental and are disabled by default, use '-fcoarray' to enable. - + ! COARRAY: %[[ERRMSG:.*]]:2 = hlfir.declare %[[VAL_1:.*]] typeparams %[[C_128:.*]] {uniq_name = "_QFEerror_message"} : (!fir.ref>, index) -> (!fir.ref>, !fir.ref>) ! COARRAY: %[[STAT:.*]]:2 = hlfir.declare %[[VAL_2:.*]] {uniq_name = "_QFEsync_status"} : (!fir.ref) -> (!fir.ref, !fir.ref) integer sync_status @@ -15,11 +15,11 @@ program test_sync_all ! COARRAY: mif.sync_all stat %[[STAT]]#0 : (!fir.ref) sync all(stat=sync_status) - + ! COARRAY: %[[VAL_1:.*]] = fir.embox %[[ERRMSG]]#0 : (!fir.ref>) -> !fir.box> ! COARRAY: mif.sync_all errmsg %[[VAL_1]] : (!fir.box>) sync all( errmsg=error_message) - + ! COARRAY: %[[VAL_2:.*]] = fir.embox %[[ERRMSG]]#0 : (!fir.ref>) -> !fir.box> ! COARRAY: mif.sync_all stat %[[STAT]]#0 errmsg %[[VAL_2]] : (!fir.ref, !fir.box>) sync all(stat=sync_status, errmsg=error_message) diff --git a/flang/test/Lower/MIF/sync_images.f90 b/flang/test/Lower/MIF/sync_images.f90 index 7ee5936131750..1ef577ed4f158 100644 --- a/flang/test/Lower/MIF/sync_images.f90 +++ b/flang/test/Lower/MIF/sync_images.f90 @@ -4,7 +4,7 @@ program test_sync_images implicit none ! 
NOCOARRAY: Not yet implemented: Multi-image features are experimental and are disabled by default, use '-fcoarray' to enable. - + ! COARRAY: %[[ERRMSG:.*]]:2 = hlfir.declare %[[VAL_1:.*]] typeparams %[[C_128:.*]] {uniq_name = "_QFEerror_message"} : (!fir.ref>, index) -> (!fir.ref>, !fir.ref>) ! COARRAY: %[[ME:.*]]:2 = hlfir.declare %[[VAL_3:.*]] {uniq_name = "_QFEme"} : (!fir.ref) -> (!fir.ref, !fir.ref) ! COARRAY: %[[STAT:.*]]:2 = hlfir.declare %[[VAL_2:.*]] {uniq_name = "_QFEsync_status"} : (!fir.ref) -> (!fir.ref, !fir.ref) @@ -24,14 +24,14 @@ program test_sync_images ! COARRAY: %[[VAL_5:.*]] = fir.embox %[[IMG_SET:.*]]#0(%[[SHAPE_1:.*]]) : (!fir.ref>, !fir.shape<1>) -> !fir.box> ! COARRAY: mif.sync_images image_set %[[VAL_5]] stat %[[STAT]]#0 errmsg %[[VAL_4]] : (!fir.box>, !fir.ref, !fir.box>) sync images([1], stat=sync_status, errmsg=error_message) - + ! COARRAY: mif.sync_images : () sync images(*) - + ! COARRAY: %[[VAL_6:.*]] = fir.embox %[[ME]]#0 : (!fir.ref) -> !fir.box ! COARRAY: mif.sync_images image_set %[[VAL_6]] : (!fir.box) sync images(me) - + ! COARRAY: %[[VAL_7:.*]] = fir.embox %[[IMG_SET:.*]]#0(%[[SHAPE_3:.*]]) : (!fir.ref>, !fir.shape<1>) -> !fir.box> ! COARRAY: mif.sync_images image_set %[[VAL_7]] : (!fir.box>) sync images([1]) diff --git a/flang/test/Lower/MIF/sync_memory.f90 b/flang/test/Lower/MIF/sync_memory.f90 index e6e0fa1e7fdf3..a36fc2d1919a5 100644 --- a/flang/test/Lower/MIF/sync_memory.f90 +++ b/flang/test/Lower/MIF/sync_memory.f90 @@ -4,22 +4,22 @@ program test_sync_memory implicit none ! NOCOARRAY: Not yet implemented: Multi-image features are experimental and are disabled by default, use '-fcoarray' to enable. - + ! COARRAY: %[[ERRMSG:.*]]:2 = hlfir.declare %[[VAL_1:.*]] typeparams %[[C_128:.*]] {uniq_name = "_QFEerror_message"} : (!fir.ref>, index) -> (!fir.ref>, !fir.ref>) ! 
COARRAY: %[[STAT:.*]]:2 = hlfir.declare %[[VAL_2:.*]] {uniq_name = "_QFEsync_status"} : (!fir.ref) -> (!fir.ref, !fir.ref) integer sync_status character(len=128) :: error_message - ! COARRAY: mif.sync_memory : () + ! COARRAY: mif.sync_memory : () sync memory ! COARRAY: mif.sync_memory stat %[[STAT]]#0 : (!fir.ref) sync memory(stat=sync_status) - + ! COARRAY: %[[VAL_1:.*]] = fir.embox %[[ERRMSG]]#0 : (!fir.ref>) -> !fir.box> ! COARRAY: mif.sync_memory errmsg %[[VAL_1]] : (!fir.box>) sync memory( errmsg=error_message) - + ! COARRAY: %[[VAL_2:.*]] = fir.embox %[[ERRMSG]]#0 : (!fir.ref>) -> !fir.box> ! COARRAY: mif.sync_memory stat %[[STAT]]#0 errmsg %[[VAL_2]] : (!fir.ref, !fir.box>) sync memory(stat=sync_status, errmsg=error_message) diff --git a/flang/test/Lower/MIF/this_image.f90 b/flang/test/Lower/MIF/this_image.f90 index ce729b349e6cf..c6674c309f3f4 100644 --- a/flang/test/Lower/MIF/this_image.f90 +++ b/flang/test/Lower/MIF/this_image.f90 @@ -5,7 +5,7 @@ program test integer :: i type(team_type) :: team - ! CHECK: mif.this_image : () -> i32 + ! CHECK: mif.this_image : () -> i32 i = this_image() ! CHECK: mif.this_image team %[[TEAM:.*]] : ({{.*}}) -> i32 From 62d1a080e69e3c5e98840e000135afa7c688a77b Mon Sep 17 00:00:00 2001 From: Florian Hahn Date: Wed, 12 Nov 2025 15:11:00 +0000 Subject: [PATCH 12/15] [LV] Use ExtractLane(LastActiveLane, V) live outs when tail-folding. (#149042) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Building on top of https://github.com/llvm/llvm-project/pull/148817, introduce a new abstract LastActiveLane opcode that gets lowered to Not(Mask) → FirstActiveLane(NotMask) → Sub(result, 1). When folding the tail, update all extracts for uses outside the loop the extract the value of the last actice lane. 
See also https://github.com/llvm/llvm-project/issues/148603 PR: https://github.com/llvm/llvm-project/pull/149042 --- .../Vectorize/LoopVectorizationLegality.cpp | 18 - .../Transforms/Vectorize/LoopVectorize.cpp | 3 +- llvm/lib/Transforms/Vectorize/VPlan.h | 7 + .../Transforms/Vectorize/VPlanAnalysis.cpp | 1 + .../Transforms/Vectorize/VPlanPatternMatch.h | 22 + .../Transforms/Vectorize/VPlanPredicator.cpp | 39 +- .../lib/Transforms/Vectorize/VPlanRecipes.cpp | 29 + .../Transforms/Vectorize/VPlanTransforms.cpp | 61 +- llvm/lib/Transforms/Vectorize/VPlanUnroll.cpp | 13 +- .../Transforms/Vectorize/VPlanVerifier.cpp | 46 + .../LoopVectorize/RISCV/dead-ops-cost.ll | 68 +- .../Transforms/LoopVectorize/RISCV/divrem.ll | 101 +- .../first-order-recurrence-scalable-vf1.ll | 86 +- .../LoopVectorize/RISCV/scalable-tailfold.ll | 56 +- .../tail-folding-fixed-order-recurrence.ll | 72 +- .../LoopVectorize/RISCV/uniform-load-store.ll | 126 ++- .../LoopVectorize/X86/small-size.ll | 78 +- .../first-order-recurrence-tail-folding.ll | 868 +++++++++++++++--- llvm/test/Transforms/LoopVectorize/optsize.ll | 207 +++-- .../pr43166-fold-tail-by-masking.ll | 86 +- .../use-scalar-epilogue-if-tp-fails.ll | 223 ++++- 21 files changed, 1626 insertions(+), 584 deletions(-) diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorizationLegality.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorizationLegality.cpp index 03112c67dda7b..e522d2f617d8a 100644 --- a/llvm/lib/Transforms/Vectorize/LoopVectorizationLegality.cpp +++ b/llvm/lib/Transforms/Vectorize/LoopVectorizationLegality.cpp @@ -2097,24 +2097,6 @@ bool LoopVectorizationLegality::canFoldTailByMasking() const { for (const auto &Reduction : getReductionVars()) ReductionLiveOuts.insert(Reduction.second.getLoopExitInstr()); - // TODO: handle non-reduction outside users when tail is folded by masking. - for (auto *AE : AllowedExit) { - // Check that all users of allowed exit values are inside the loop or - // are the live-out of a reduction. 
- if (ReductionLiveOuts.count(AE)) - continue; - for (User *U : AE->users()) { - Instruction *UI = cast(U); - if (TheLoop->contains(UI)) - continue; - LLVM_DEBUG( - dbgs() - << "LV: Cannot fold tail by masking, loop has an outside user for " - << *UI << "\n"); - return false; - } - } - for (const auto &Entry : getInductionVars()) { PHINode *OrigPhi = Entry.first; for (User *U : OrigPhi->users()) { diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp index 5cc01a9f974b4..b9d4ff41c0755 100644 --- a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp +++ b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp @@ -8895,7 +8895,8 @@ void LoopVectorizationPlanner::adjustRecipesForReductions( if (FinalReductionResult == U || Parent->getParent()) continue; U->replaceUsesOfWith(OrigExitingVPV, FinalReductionResult); - if (match(U, m_ExtractLastElement(m_VPValue()))) + if (match(U, m_CombineOr(m_ExtractLastElement(m_VPValue()), + m_ExtractLane(m_VPValue(), m_VPValue())))) cast(U)->replaceAllUsesWith(FinalReductionResult); } diff --git a/llvm/lib/Transforms/Vectorize/VPlan.h b/llvm/lib/Transforms/Vectorize/VPlan.h index 72858e1265d86..08f77b75400bd 100644 --- a/llvm/lib/Transforms/Vectorize/VPlan.h +++ b/llvm/lib/Transforms/Vectorize/VPlan.h @@ -1047,6 +1047,13 @@ class LLVM_ABI_FOR_TEST VPInstruction : public VPRecipeWithIRFlags, // It produces the lane index across all unrolled iterations. Unrolling will // add all copies of its original operand as additional operands. FirstActiveLane, + // Calculates the last active lane index of the vector predicate operands. + // The predicates must be prefix-masks (all 1s before all 0s). Used when + // tail-folding to extract the correct live-out value from the last active + // iteration. It produces the lane index across all unrolled iterations. + // Unrolling will add all copies of its original operand as additional + // operands. 
+ LastActiveLane, // The opcodes below are used for VPInstructionWithType. // diff --git a/llvm/lib/Transforms/Vectorize/VPlanAnalysis.cpp b/llvm/lib/Transforms/Vectorize/VPlanAnalysis.cpp index 80a2e4bc3f754..fb0b029de3d41 100644 --- a/llvm/lib/Transforms/Vectorize/VPlanAnalysis.cpp +++ b/llvm/lib/Transforms/Vectorize/VPlanAnalysis.cpp @@ -115,6 +115,7 @@ Type *VPTypeAnalysis::inferScalarTypeForRecipe(const VPInstruction *R) { case VPInstruction::ExtractLane: return inferScalarType(R->getOperand(1)); case VPInstruction::FirstActiveLane: + case VPInstruction::LastActiveLane: return Type::getIntNTy(Ctx, 64); case VPInstruction::ExtractLastElement: case VPInstruction::ExtractLastLanePerPart: diff --git a/llvm/lib/Transforms/Vectorize/VPlanPatternMatch.h b/llvm/lib/Transforms/Vectorize/VPlanPatternMatch.h index b57c44872c1b6..aa2785252d376 100644 --- a/llvm/lib/Transforms/Vectorize/VPlanPatternMatch.h +++ b/llvm/lib/Transforms/Vectorize/VPlanPatternMatch.h @@ -395,12 +395,24 @@ m_ExtractElement(const Op0_t &Op0, const Op1_t &Op1) { return m_VPInstruction(Op0, Op1); } +template +inline VPInstruction_match +m_ExtractLane(const Op0_t &Op0, const Op1_t &Op1) { + return m_VPInstruction(Op0, Op1); +} + template inline VPInstruction_match m_ExtractLastLanePerPart(const Op0_t &Op0) { return m_VPInstruction(Op0); } +template +inline VPInstruction_match +m_ExtractPenultimateElement(const Op0_t &Op0) { + return m_VPInstruction(Op0); +} + template inline VPInstruction_match m_ActiveLaneMask(const Op0_t &Op0, const Op1_t &Op1, const Op2_t &Op2) { @@ -429,6 +441,16 @@ m_FirstActiveLane(const Op0_t &Op0) { return m_VPInstruction(Op0); } +template +inline VPInstruction_match +m_LastActiveLane(const Op0_t &Op0) { + return m_VPInstruction(Op0); +} + +inline VPInstruction_match m_StepVector() { + return m_VPInstruction(); +} + template inline AllRecipe_match m_Unary(const Op0_t &Op0) { return AllRecipe_match(Op0); diff --git a/llvm/lib/Transforms/Vectorize/VPlanPredicator.cpp 
b/llvm/lib/Transforms/Vectorize/VPlanPredicator.cpp index fb17d5dd62b9d..3579af21d8b07 100644 --- a/llvm/lib/Transforms/Vectorize/VPlanPredicator.cpp +++ b/llvm/lib/Transforms/Vectorize/VPlanPredicator.cpp @@ -44,11 +44,6 @@ class VPPredicator { /// possibly inserting new recipes at \p Dst (using Builder's insertion point) VPValue *createEdgeMask(VPBasicBlock *Src, VPBasicBlock *Dst); - /// Returns the *entry* mask for \p VPBB. - VPValue *getBlockInMask(VPBasicBlock *VPBB) const { - return BlockMaskCache.lookup(VPBB); - } - /// Record \p Mask as the *entry* mask of \p VPBB, which is expected to not /// already have a mask. void setBlockInMask(VPBasicBlock *VPBB, VPValue *Mask) { @@ -68,6 +63,11 @@ class VPPredicator { } public: + /// Returns the *entry* mask for \p VPBB. + VPValue *getBlockInMask(VPBasicBlock *VPBB) const { + return BlockMaskCache.lookup(VPBB); + } + /// Returns the precomputed predicate of the edge from \p Src to \p Dst. VPValue *getEdgeMask(const VPBasicBlock *Src, const VPBasicBlock *Dst) const { return EdgeMaskCache.lookup({Src, Dst}); @@ -301,5 +301,34 @@ VPlanTransforms::introduceMasksAndLinearize(VPlan &Plan, bool FoldTail) { PrevVPBB = VPBB; } + + // If we folded the tail and introduced a header mask, any extract of the + // last element must be updated to extract from the last active lane of the + // header mask instead (i.e., the lane corresponding to the last active + // iteration). 
+ if (FoldTail) { + assert(Plan.getExitBlocks().size() == 1 && + "only a single-exit block is supported currently"); + VPBasicBlock *EB = Plan.getExitBlocks().front(); + assert(EB->getSinglePredecessor() == Plan.getMiddleBlock() && + "the exit block must have middle block as single predecessor"); + + VPBuilder B(Plan.getMiddleBlock()->getTerminator()); + for (auto &P : EB->phis()) { + auto *ExitIRI = cast(&P); + VPValue *Inc = ExitIRI->getIncomingValue(0); + VPValue *Op; + if (!match(Inc, m_ExtractLastElement(m_VPValue(Op)))) + continue; + + // Compute the index of the last active lane. + VPValue *HeaderMask = Predicator.getBlockInMask(Header); + VPValue *LastActiveLane = + B.createNaryOp(VPInstruction::LastActiveLane, HeaderMask); + auto *Ext = + B.createNaryOp(VPInstruction::ExtractLane, {LastActiveLane, Op}); + Inc->replaceAllUsesWith(Ext); + } + } return Predicator.getBlockMaskCache(); } diff --git a/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp b/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp index beae8051e75dc..5e46659227262 100644 --- a/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp +++ b/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp @@ -547,6 +547,7 @@ unsigned VPInstruction::getNumOperandsForOpcode(unsigned Opcode) { case VPInstruction::ExtractLastLanePerPart: case VPInstruction::ExtractPenultimateElement: case VPInstruction::FirstActiveLane: + case VPInstruction::LastActiveLane: case VPInstruction::Not: case VPInstruction::Unpack: return 1; @@ -1156,6 +1157,29 @@ InstructionCost VPInstruction::computeCost(ElementCount VF, {PredTy, Type::getInt1Ty(Ctx.LLVMCtx)}); return Ctx.TTI.getIntrinsicInstrCost(Attrs, Ctx.CostKind); } + case VPInstruction::LastActiveLane: { + Type *ScalarTy = Ctx.Types.inferScalarType(getOperand(0)); + if (VF.isScalar()) + return Ctx.TTI.getCmpSelInstrCost(Instruction::ICmp, ScalarTy, + CmpInst::makeCmpResultType(ScalarTy), + CmpInst::ICMP_EQ, Ctx.CostKind); + // Calculate the cost of determining the lane index: NOT + cttz_elts + 
SUB. + auto *PredTy = toVectorTy(ScalarTy, VF); + IntrinsicCostAttributes Attrs(Intrinsic::experimental_cttz_elts, + Type::getInt64Ty(Ctx.LLVMCtx), + {PredTy, Type::getInt1Ty(Ctx.LLVMCtx)}); + InstructionCost Cost = Ctx.TTI.getIntrinsicInstrCost(Attrs, Ctx.CostKind); + // Add cost of NOT operation on the predicate. + Cost += Ctx.TTI.getArithmeticInstrCost( + Instruction::Xor, PredTy, Ctx.CostKind, + {TargetTransformInfo::OK_AnyValue, TargetTransformInfo::OP_None}, + {TargetTransformInfo::OK_UniformConstantValue, + TargetTransformInfo::OP_None}); + // Add cost of SUB operation on the index. + Cost += Ctx.TTI.getArithmeticInstrCost( + Instruction::Sub, Type::getInt64Ty(Ctx.LLVMCtx), Ctx.CostKind); + return Cost; + } case VPInstruction::FirstOrderRecurrenceSplice: { assert(VF.isVector() && "Scalar FirstOrderRecurrenceSplice?"); SmallVector Mask(VF.getKnownMinValue()); @@ -1210,6 +1234,7 @@ bool VPInstruction::isVectorToScalar() const { getOpcode() == Instruction::ExtractElement || getOpcode() == VPInstruction::ExtractLane || getOpcode() == VPInstruction::FirstActiveLane || + getOpcode() == VPInstruction::LastActiveLane || getOpcode() == VPInstruction::ComputeAnyOfResult || getOpcode() == VPInstruction::ComputeFindIVResult || getOpcode() == VPInstruction::ComputeReductionResult || @@ -1275,6 +1300,7 @@ bool VPInstruction::opcodeMayReadOrWriteFromMemory() const { case VPInstruction::ExtractPenultimateElement: case VPInstruction::ActiveLaneMask: case VPInstruction::FirstActiveLane: + case VPInstruction::LastActiveLane: case VPInstruction::FirstOrderRecurrenceSplice: case VPInstruction::LogicalAnd: case VPInstruction::Not: @@ -1451,6 +1477,9 @@ void VPInstruction::print(raw_ostream &O, const Twine &Indent, case VPInstruction::FirstActiveLane: O << "first-active-lane"; break; + case VPInstruction::LastActiveLane: + O << "last-active-lane"; + break; case VPInstruction::ReductionStartVector: O << "reduction-start-vector"; break; diff --git 
a/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp b/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp index 7cef98f465715..bb517aad31f19 100644 --- a/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp +++ b/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp @@ -805,8 +805,8 @@ static VPValue *optimizeEarlyExitInductionUser(VPlan &Plan, VPValue *Op, ScalarEvolution &SE) { VPValue *Incoming, *Mask; - if (!match(Op, m_VPInstruction( - m_FirstActiveLane(m_VPValue(Mask)), m_VPValue(Incoming)))) + if (!match(Op, m_ExtractLane(m_FirstActiveLane(m_VPValue(Mask)), + m_VPValue(Incoming)))) return nullptr; auto *WideIV = getOptimizableIVOf(Incoming, SE); @@ -1274,8 +1274,7 @@ static void simplifyRecipe(VPSingleDefRecipe *Def, VPTypeAnalysis &TypeInfo) { } // Look through ExtractPenultimateElement (BuildVector ....). - if (match(Def, m_VPInstruction( - m_BuildVector()))) { + if (match(Def, m_ExtractPenultimateElement(m_BuildVector()))) { auto *BuildVector = cast(Def->getOperand(0)); Def->replaceAllUsesWith( BuildVector->getOperand(BuildVector->getNumOperands() - 2)); @@ -2056,6 +2055,32 @@ bool VPlanTransforms::adjustFixedOrderRecurrences(VPlan &Plan, // Set the first operand of RecurSplice to FOR again, after replacing // all users. RecurSplice->setOperand(0, FOR); + + // Check for users extracting at the penultimate active lane of the FOR. + // If only a single lane is active in the current iteration, we need to + // select the last element from the previous iteration (from the FOR phi + // directly). 
+ for (VPUser *U : RecurSplice->users()) { + if (!match(U, m_ExtractLane(m_LastActiveLane(m_VPValue()), + m_Specific(RecurSplice)))) + continue; + + VPBuilder B(cast(U)); + VPValue *LastActiveLane = cast(U)->getOperand(0); + Type *I64Ty = Type::getInt64Ty(Plan.getContext()); + VPValue *Zero = Plan.getOrAddLiveIn(ConstantInt::get(I64Ty, 0)); + VPValue *One = Plan.getOrAddLiveIn(ConstantInt::get(I64Ty, 1)); + VPValue *PenultimateIndex = + B.createNaryOp(Instruction::Sub, {LastActiveLane, One}); + VPValue *PenultimateLastIter = + B.createNaryOp(VPInstruction::ExtractLane, + {PenultimateIndex, FOR->getBackedgeValue()}); + VPValue *LastPrevIter = + B.createNaryOp(VPInstruction::ExtractLastElement, FOR); + VPValue *Cmp = B.createICmp(CmpInst::ICMP_EQ, LastActiveLane, Zero); + VPValue *Sel = B.createSelect(Cmp, LastPrevIter, PenultimateLastIter); + cast(U)->replaceAllUsesWith(Sel); + } } return true; } @@ -3445,6 +3470,34 @@ void VPlanTransforms::convertToConcreteRecipes(VPlan &Plan) { ToRemove.push_back(Expr); } + // Expand LastActiveLane into Not + FirstActiveLane + Sub. + auto *LastActiveL = dyn_cast(&R); + if (LastActiveL && + LastActiveL->getOpcode() == VPInstruction::LastActiveLane) { + // Create Not(Mask) for all operands. + SmallVector NotMasks; + for (VPValue *Op : LastActiveL->operands()) { + VPValue *NotMask = Builder.createNot(Op, LastActiveL->getDebugLoc()); + NotMasks.push_back(NotMask); + } + + // Create FirstActiveLane on the inverted masks. + VPValue *FirstInactiveLane = Builder.createNaryOp( + VPInstruction::FirstActiveLane, NotMasks, + LastActiveL->getDebugLoc(), "first.inactive.lane"); + + // Subtract 1 to get the last active lane. 
+ VPValue *One = Plan.getOrAddLiveIn( + ConstantInt::get(Type::getInt64Ty(Plan.getContext()), 1)); + VPValue *LastLane = Builder.createNaryOp( + Instruction::Sub, {FirstInactiveLane, One}, + LastActiveL->getDebugLoc(), "last.active.lane"); + + LastActiveL->replaceAllUsesWith(LastLane); + ToRemove.push_back(LastActiveL); + continue; + } + VPValue *VectorStep; VPValue *ScalarStep; if (!match(&R, m_VPInstruction( diff --git a/llvm/lib/Transforms/Vectorize/VPlanUnroll.cpp b/llvm/lib/Transforms/Vectorize/VPlanUnroll.cpp index d4b8b72beb942..221ca4ab05370 100644 --- a/llvm/lib/Transforms/Vectorize/VPlanUnroll.cpp +++ b/llvm/lib/Transforms/Vectorize/VPlanUnroll.cpp @@ -352,6 +352,7 @@ void UnrollState::unrollBlock(VPBlockBase *VPB) { VPValue *Op1; if (match(&R, m_VPInstruction(m_VPValue(Op1))) || match(&R, m_FirstActiveLane(m_VPValue(Op1))) || + match(&R, m_LastActiveLane(m_VPValue(Op1))) || match(&R, m_VPInstruction( m_VPValue(), m_VPValue(), m_VPValue(Op1))) || match(&R, m_VPInstruction( @@ -364,17 +365,21 @@ void UnrollState::unrollBlock(VPBlockBase *VPB) { continue; } VPValue *Op0; - if (match(&R, m_VPInstruction( - m_VPValue(Op0), m_VPValue(Op1)))) { + if (match(&R, m_ExtractLane(m_VPValue(Op0), m_VPValue(Op1)))) { addUniformForAllParts(cast(&R)); for (unsigned Part = 1; Part != UF; ++Part) R.addOperand(getValueForPart(Op1, Part)); continue; } if (match(&R, m_ExtractLastElement(m_VPValue(Op0))) || - match(&R, m_VPInstruction( - m_VPValue(Op0)))) { + match(&R, m_ExtractPenultimateElement(m_VPValue(Op0)))) { addUniformForAllParts(cast(&R)); + if (isa(Op0)) { + assert(match(&R, m_ExtractLastElement(m_VPValue())) && + "can only extract last element of FOR"); + continue; + } + if (Plan.hasScalarVFOnly()) { auto *I = cast(&R); // Extracting from end with VF = 1 implies retrieving the last or diff --git a/llvm/lib/Transforms/Vectorize/VPlanVerifier.cpp b/llvm/lib/Transforms/Vectorize/VPlanVerifier.cpp index 34754a1ea3992..2d63d2a787f88 100644 --- 
a/llvm/lib/Transforms/Vectorize/VPlanVerifier.cpp +++ b/llvm/lib/Transforms/Vectorize/VPlanVerifier.cpp @@ -18,6 +18,7 @@ #include "VPlanDominatorTree.h" #include "VPlanHelpers.h" #include "VPlanPatternMatch.h" +#include "VPlanUtils.h" #include "llvm/ADT/SmallPtrSet.h" #include "llvm/ADT/TypeSwitch.h" @@ -44,6 +45,9 @@ class VPlanVerifier { /// incoming value into EVL's recipe. bool verifyEVLRecipe(const VPInstruction &EVL) const; + /// Verify that \p LastActiveLane's operand is guaranteed to be a prefix-mask. + bool verifyLastActiveLaneRecipe(const VPInstruction &LastActiveLane) const; + bool verifyVPBasicBlock(const VPBasicBlock *VPBB); bool verifyBlock(const VPBlockBase *VPB); @@ -221,6 +225,44 @@ bool VPlanVerifier::verifyEVLRecipe(const VPInstruction &EVL) const { }); } +bool VPlanVerifier::verifyLastActiveLaneRecipe( + const VPInstruction &LastActiveLane) const { + assert(LastActiveLane.getOpcode() == VPInstruction::LastActiveLane && + "must be called with VPInstruction::LastActiveLane"); + + if (LastActiveLane.getNumOperands() < 1) { + errs() << "LastActiveLane must have at least one operand\n"; + return false; + } + + const VPlan &Plan = *LastActiveLane.getParent()->getPlan(); + // All operands must be prefix-mask. Currently we check for header masks or + // EVL-derived masks, as those are currently the only operands in practice, + // but this may need updating in the future. + for (VPValue *Op : LastActiveLane.operands()) { + if (vputils::isHeaderMask(Op, Plan)) + continue; + + // Masks derived from EVL are also fine. 
+ auto BroadcastOrEVL = + m_CombineOr(m_Broadcast(m_EVL(m_VPValue())), m_EVL(m_VPValue())); + if (match(Op, m_CombineOr(m_ICmp(m_StepVector(), BroadcastOrEVL), + m_ICmp(BroadcastOrEVL, m_StepVector())))) + continue; + + errs() << "LastActiveLane operand "; +#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) + VPSlotTracker Tracker(&Plan); + Op->printAsOperand(errs(), Tracker); +#endif + errs() << " must be prefix mask (a header mask or an " + "EVL-derived mask currently)\n"; + return false; + } + + return true; +} + bool VPlanVerifier::verifyVPBasicBlock(const VPBasicBlock *VPBB) { if (!verifyPhiRecipes(VPBB)) return false; @@ -313,6 +355,10 @@ bool VPlanVerifier::verifyVPBasicBlock(const VPBasicBlock *VPBB) { return false; } break; + case VPInstruction::LastActiveLane: + if (!verifyLastActiveLaneRecipe(*VPI)) + return false; + break; default: break; } diff --git a/llvm/test/Transforms/LoopVectorize/RISCV/dead-ops-cost.ll b/llvm/test/Transforms/LoopVectorize/RISCV/dead-ops-cost.ll index b81637f50989d..a22f72fe929d1 100644 --- a/llvm/test/Transforms/LoopVectorize/RISCV/dead-ops-cost.ll +++ b/llvm/test/Transforms/LoopVectorize/RISCV/dead-ops-cost.ll @@ -69,51 +69,51 @@ exit: define i8 @dead_live_out_due_to_scalar_epilogue_required(ptr %src, ptr %dst) { ; CHECK-LABEL: define i8 @dead_live_out_due_to_scalar_epilogue_required( ; CHECK-SAME: ptr [[SRC:%.*]], ptr [[DST:%.*]]) #[[ATTR0]] { -; CHECK-NEXT: [[ENTRY:.*]]: -; CHECK-NEXT: [[TMP0:%.*]] = call i32 @llvm.vscale.i32() -; CHECK-NEXT: [[TMP1:%.*]] = shl nuw i32 [[TMP0]], 2 -; CHECK-NEXT: [[TMP2:%.*]] = call i32 @llvm.umax.i32(i32 [[TMP1]], i32 6) -; CHECK-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ule i32 252, [[TMP2]] -; CHECK-NEXT: br i1 [[MIN_ITERS_CHECK]], label %[[SCALAR_PH:.*]], label %[[VECTOR_MEMCHECK:.*]] +; CHECK-NEXT: [[ENTRY:.*:]] +; CHECK-NEXT: br label %[[VECTOR_MEMCHECK:.*]] ; CHECK: [[VECTOR_MEMCHECK]]: ; CHECK-NEXT: [[SCEVGEP:%.*]] = getelementptr i8, ptr [[DST]], i64 1005 ; CHECK-NEXT: [[SCEVGEP1:%.*]] = 
getelementptr i8, ptr [[SRC]], i64 1005 ; CHECK-NEXT: [[BOUND0:%.*]] = icmp ult ptr [[DST]], [[SCEVGEP1]] ; CHECK-NEXT: [[BOUND1:%.*]] = icmp ult ptr [[SRC]], [[SCEVGEP]] ; CHECK-NEXT: [[FOUND_CONFLICT:%.*]] = and i1 [[BOUND0]], [[BOUND1]] -; CHECK-NEXT: br i1 [[FOUND_CONFLICT]], label %[[SCALAR_PH]], label %[[VECTOR_PH:.*]] +; CHECK-NEXT: br i1 [[FOUND_CONFLICT]], label %[[SCALAR_PH:.*]], label %[[VECTOR_PH:.*]] ; CHECK: [[VECTOR_PH]]: -; CHECK-NEXT: [[TMP3:%.*]] = call i32 @llvm.vscale.i32() -; CHECK-NEXT: [[TMP4:%.*]] = mul nuw i32 [[TMP3]], 4 -; CHECK-NEXT: [[N_MOD_VF:%.*]] = urem i32 252, [[TMP4]] -; CHECK-NEXT: [[TMP5:%.*]] = icmp eq i32 [[N_MOD_VF]], 0 -; CHECK-NEXT: [[TMP6:%.*]] = select i1 [[TMP5]], i32 [[TMP4]], i32 [[N_MOD_VF]] -; CHECK-NEXT: [[N_VEC:%.*]] = sub i32 252, [[TMP6]] -; CHECK-NEXT: [[IND_END:%.*]] = mul i32 [[N_VEC]], 4 -; CHECK-NEXT: [[TMP9:%.*]] = call @llvm.stepvector.nxv4i32() -; CHECK-NEXT: [[TMP11:%.*]] = mul [[TMP9]], splat (i32 4) -; CHECK-NEXT: [[INDUCTION:%.*]] = add zeroinitializer, [[TMP11]] -; CHECK-NEXT: [[TMP14:%.*]] = mul i32 4, [[TMP4]] -; CHECK-NEXT: [[DOTSPLATINSERT:%.*]] = insertelement poison, i32 [[TMP14]], i64 0 -; CHECK-NEXT: [[DOTSPLAT:%.*]] = shufflevector [[DOTSPLATINSERT]], poison, zeroinitializer +; CHECK-NEXT: [[TMP0:%.*]] = call @llvm.stepvector.nxv16i32() +; CHECK-NEXT: [[TMP1:%.*]] = mul [[TMP0]], splat (i32 4) +; CHECK-NEXT: [[INDUCTION:%.*]] = add zeroinitializer, [[TMP1]] ; CHECK-NEXT: br label %[[VECTOR_BODY:.*]] ; CHECK: [[VECTOR_BODY]]: -; CHECK-NEXT: [[INDEX:%.*]] = phi i32 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ] -; CHECK-NEXT: [[VEC_IND:%.*]] = phi [ [[INDUCTION]], %[[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], %[[VECTOR_BODY]] ] -; CHECK-NEXT: [[TMP15:%.*]] = sext [[VEC_IND]] to -; CHECK-NEXT: [[TMP16:%.*]] = getelementptr i8, ptr [[DST]], [[TMP15]] -; CHECK-NEXT: call void @llvm.masked.scatter.nxv4i8.nxv4p0( zeroinitializer, align 1 [[TMP16]], splat (i1 true)), !alias.scope 
[[META3:![0-9]+]], !noalias [[META6:![0-9]+]] -; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i32 [[INDEX]], [[TMP4]] -; CHECK-NEXT: [[VEC_IND_NEXT]] = add [[VEC_IND]], [[DOTSPLAT]] -; CHECK-NEXT: [[TMP17:%.*]] = icmp eq i32 [[INDEX_NEXT]], [[N_VEC]] -; CHECK-NEXT: br i1 [[TMP17]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP8:![0-9]+]] +; CHECK-NEXT: [[VEC_IND:%.*]] = phi [ [[INDUCTION]], %[[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], %[[VECTOR_BODY]] ] +; CHECK-NEXT: [[AVL:%.*]] = phi i32 [ 252, %[[VECTOR_PH]] ], [ [[AVL_NEXT:%.*]], %[[VECTOR_BODY]] ] +; CHECK-NEXT: [[TMP2:%.*]] = call i32 @llvm.experimental.get.vector.length.i32(i32 [[AVL]], i32 16, i1 true) +; CHECK-NEXT: [[BROADCAST_SPLATINSERT2:%.*]] = insertelement poison, i32 [[TMP2]], i64 0 +; CHECK-NEXT: [[BROADCAST_SPLAT3:%.*]] = shufflevector [[BROADCAST_SPLATINSERT2]], poison, zeroinitializer +; CHECK-NEXT: [[TMP3:%.*]] = mul i32 4, [[TMP2]] +; CHECK-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement poison, i32 [[TMP3]], i64 0 +; CHECK-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector [[BROADCAST_SPLATINSERT]], poison, zeroinitializer +; CHECK-NEXT: [[TMP5:%.*]] = icmp uge [[TMP0]], [[BROADCAST_SPLAT3]] +; CHECK-NEXT: [[TMP9:%.*]] = sext [[VEC_IND]] to +; CHECK-NEXT: [[TMP6:%.*]] = getelementptr i8, ptr [[SRC]], [[TMP9]] +; CHECK-NEXT: [[WIDE_MASKED_GATHER:%.*]] = call @llvm.vp.gather.nxv16i8.nxv16p0( align 1 [[TMP6]], splat (i1 true), i32 [[TMP2]]), !alias.scope [[META3:![0-9]+]] +; CHECK-NEXT: [[TMP7:%.*]] = getelementptr i8, ptr [[DST]], [[TMP9]] +; CHECK-NEXT: call void @llvm.vp.scatter.nxv16i8.nxv16p0( zeroinitializer, align 1 [[TMP7]], splat (i1 true), i32 [[TMP2]]), !alias.scope [[META6:![0-9]+]], !noalias [[META3]] +; CHECK-NEXT: [[AVL_NEXT]] = sub nuw i32 [[AVL]], [[TMP2]] +; CHECK-NEXT: [[VEC_IND_NEXT]] = add [[VEC_IND]], [[BROADCAST_SPLAT]] +; CHECK-NEXT: [[TMP8:%.*]] = icmp eq i32 [[AVL_NEXT]], 0 +; CHECK-NEXT: br i1 [[TMP8]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], 
!llvm.loop [[LOOP8:![0-9]+]] ; CHECK: [[MIDDLE_BLOCK]]: -; CHECK-NEXT: br label %[[SCALAR_PH]] +; CHECK-NEXT: [[TMP10:%.*]] = call i64 @llvm.experimental.cttz.elts.i64.nxv16i1( [[TMP5]], i1 true) +; CHECK-NEXT: [[TMP11:%.*]] = sub i64 [[TMP10]], 1 +; CHECK-NEXT: [[TMP12:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-NEXT: [[TMP13:%.*]] = mul nuw i64 [[TMP12]], 16 +; CHECK-NEXT: [[TMP17:%.*]] = mul i64 [[TMP13]], 0 +; CHECK-NEXT: [[TMP15:%.*]] = extractelement [[WIDE_MASKED_GATHER]], i64 [[TMP11]] +; CHECK-NEXT: br label %[[EXIT:.*]] ; CHECK: [[SCALAR_PH]]: -; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i32 [ [[IND_END]], %[[MIDDLE_BLOCK]] ], [ 0, %[[ENTRY]] ], [ 0, %[[VECTOR_MEMCHECK]] ] ; CHECK-NEXT: br label %[[LOOP:.*]] ; CHECK: [[LOOP]]: -; CHECK-NEXT: [[IV:%.*]] = phi i32 [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], %[[LOOP]] ] +; CHECK-NEXT: [[IV:%.*]] = phi i32 [ 0, %[[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], %[[LOOP]] ] ; CHECK-NEXT: [[IDXPROM:%.*]] = sext i32 [[IV]] to i64 ; CHECK-NEXT: [[GEP_SRC:%.*]] = getelementptr i8, ptr [[SRC]], i64 [[IDXPROM]] ; CHECK-NEXT: [[L:%.*]] = load i8, ptr [[GEP_SRC]], align 1 @@ -121,9 +121,9 @@ define i8 @dead_live_out_due_to_scalar_epilogue_required(ptr %src, ptr %dst) { ; CHECK-NEXT: store i8 0, ptr [[GEP_DST]], align 1 ; CHECK-NEXT: [[IV_NEXT]] = add i32 [[IV]], 4 ; CHECK-NEXT: [[CMP:%.*]] = icmp ult i32 [[IV]], 1001 -; CHECK-NEXT: br i1 [[CMP]], label %[[LOOP]], label %[[EXIT:.*]], !llvm.loop [[LOOP9:![0-9]+]] +; CHECK-NEXT: br i1 [[CMP]], label %[[LOOP]], label %[[EXIT]], !llvm.loop [[LOOP9:![0-9]+]] ; CHECK: [[EXIT]]: -; CHECK-NEXT: [[R:%.*]] = phi i8 [ [[L]], %[[LOOP]] ] +; CHECK-NEXT: [[R:%.*]] = phi i8 [ [[L]], %[[LOOP]] ], [ [[TMP15]], %[[MIDDLE_BLOCK]] ] ; CHECK-NEXT: ret i8 [[R]] ; entry: diff --git a/llvm/test/Transforms/LoopVectorize/RISCV/divrem.ll b/llvm/test/Transforms/LoopVectorize/RISCV/divrem.ll index 8e71718061c9b..ebd80b2c2af4d 100644 --- a/llvm/test/Transforms/LoopVectorize/RISCV/divrem.ll +++ 
b/llvm/test/Transforms/LoopVectorize/RISCV/divrem.ll @@ -639,73 +639,50 @@ for.end: define i32 @udiv_sdiv_with_invariant_divisors(i8 %x, i16 %y, i1 %c) { ; CHECK-LABEL: @udiv_sdiv_with_invariant_divisors( ; CHECK-NEXT: entry: -; CHECK-NEXT: [[TMP0:%.*]] = call i32 @llvm.vscale.i32() -; CHECK-NEXT: [[TMP1:%.*]] = shl nuw i32 [[TMP0]], 1 -; CHECK-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i32 12, [[TMP1]] -; CHECK-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] +; CHECK-NEXT: br label [[VECTOR_PH:%.*]] ; CHECK: vector.ph: -; CHECK-NEXT: [[TMP2:%.*]] = call i32 @llvm.vscale.i32() -; CHECK-NEXT: [[TMP3:%.*]] = mul nuw i32 [[TMP2]], 2 -; CHECK-NEXT: [[N_MOD_VF:%.*]] = urem i32 12, [[TMP3]] -; CHECK-NEXT: [[N_VEC:%.*]] = sub i32 12, [[N_MOD_VF]] -; CHECK-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement poison, i16 [[Y:%.*]], i64 0 -; CHECK-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector [[BROADCAST_SPLATINSERT]], poison, zeroinitializer -; CHECK-NEXT: [[BROADCAST_SPLATINSERT1:%.*]] = insertelement poison, i8 [[X:%.*]], i64 0 -; CHECK-NEXT: [[BROADCAST_SPLAT2:%.*]] = shufflevector [[BROADCAST_SPLATINSERT1]], poison, zeroinitializer -; CHECK-NEXT: [[DOTCAST:%.*]] = trunc i32 [[N_VEC]] to i16 -; CHECK-NEXT: [[TMP4:%.*]] = add i16 -12, [[DOTCAST]] -; CHECK-NEXT: [[DOTCAST5:%.*]] = trunc i32 [[N_VEC]] to i8 -; CHECK-NEXT: [[TMP5:%.*]] = add i8 -12, [[DOTCAST5]] -; CHECK-NEXT: [[TMP6:%.*]] = select i1 [[C:%.*]], splat (i8 1), [[BROADCAST_SPLAT2]] -; CHECK-NEXT: [[TMP7:%.*]] = select i1 [[C]], splat (i16 1), [[BROADCAST_SPLAT]] -; CHECK-NEXT: [[TMP8:%.*]] = call @llvm.stepvector.nxv2i8() -; CHECK-NEXT: [[TMP9:%.*]] = mul [[TMP8]], splat (i8 1) -; CHECK-NEXT: [[INDUCTION:%.*]] = add splat (i8 -12), [[TMP9]] -; CHECK-NEXT: [[TMP10:%.*]] = trunc i32 [[TMP3]] to i8 -; CHECK-NEXT: [[BROADCAST_SPLATINSERT6:%.*]] = insertelement poison, i8 [[TMP10]], i64 0 -; CHECK-NEXT: [[BROADCAST_SPLAT7:%.*]] = shufflevector [[BROADCAST_SPLATINSERT6]], poison, 
zeroinitializer +; CHECK-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement poison, i1 [[C:%.*]], i64 0 +; CHECK-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector [[BROADCAST_SPLATINSERT]], poison, zeroinitializer +; CHECK-NEXT: [[TMP0:%.*]] = xor [[BROADCAST_SPLAT]], splat (i1 true) +; CHECK-NEXT: [[BROADCAST_SPLATINSERT1:%.*]] = insertelement poison, i8 [[X:%.*]], i64 0 +; CHECK-NEXT: [[BROADCAST_SPLAT2:%.*]] = shufflevector [[BROADCAST_SPLATINSERT1]], poison, zeroinitializer +; CHECK-NEXT: [[BROADCAST_SPLATINSERT3:%.*]] = insertelement poison, i16 [[Y:%.*]], i64 0 +; CHECK-NEXT: [[BROADCAST_SPLAT4:%.*]] = shufflevector [[BROADCAST_SPLATINSERT3]], poison, zeroinitializer +; CHECK-NEXT: [[TMP1:%.*]] = call @llvm.stepvector.nxv8i8() +; CHECK-NEXT: [[TMP2:%.*]] = mul [[TMP1]], splat (i8 1) +; CHECK-NEXT: [[INDUCTION:%.*]] = add splat (i8 -12), [[TMP2]] ; CHECK-NEXT: br label [[VECTOR_BODY:%.*]] ; CHECK: vector.body: -; CHECK-NEXT: [[INDEX:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] -; CHECK-NEXT: [[VEC_IND:%.*]] = phi [ [[INDUCTION]], [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[VECTOR_BODY]] ] -; CHECK-NEXT: [[TMP11:%.*]] = udiv [[VEC_IND]], [[TMP6]] -; CHECK-NEXT: [[TMP12:%.*]] = zext [[TMP11]] to -; CHECK-NEXT: [[TMP13:%.*]] = sdiv [[TMP12]], [[TMP7]] -; CHECK-NEXT: [[TMP14:%.*]] = sext [[TMP13]] to -; CHECK-NEXT: [[PREDPHI:%.*]] = select i1 [[C]], zeroinitializer, [[TMP14]] -; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i32 [[INDEX]], [[TMP3]] -; CHECK-NEXT: [[VEC_IND_NEXT]] = add [[VEC_IND]], [[BROADCAST_SPLAT7]] -; CHECK-NEXT: [[TMP15:%.*]] = icmp eq i32 [[INDEX_NEXT]], [[N_VEC]] -; CHECK-NEXT: br i1 [[TMP15]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP11:![0-9]+]] +; CHECK-NEXT: [[VEC_IND:%.*]] = phi [ [[INDUCTION]], [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[VECTOR_BODY]] ] +; CHECK-NEXT: [[AVL:%.*]] = phi i32 [ 12, [[VECTOR_PH]] ], [ [[AVL_NEXT:%.*]], [[VECTOR_BODY]] ] +; CHECK-NEXT: [[TMP3:%.*]] = call i32 
@llvm.experimental.get.vector.length.i32(i32 [[AVL]], i32 8, i1 true) +; CHECK-NEXT: [[BROADCAST_SPLATINSERT7:%.*]] = insertelement poison, i32 [[TMP3]], i64 0 +; CHECK-NEXT: [[BROADCAST_SPLAT8:%.*]] = shufflevector [[BROADCAST_SPLATINSERT7]], poison, zeroinitializer +; CHECK-NEXT: [[TMP4:%.*]] = trunc i32 [[TMP3]] to i8 +; CHECK-NEXT: [[BROADCAST_SPLATINSERT5:%.*]] = insertelement poison, i8 [[TMP4]], i64 0 +; CHECK-NEXT: [[BROADCAST_SPLAT6:%.*]] = shufflevector [[BROADCAST_SPLATINSERT5]], poison, zeroinitializer +; CHECK-NEXT: [[TMP5:%.*]] = call @llvm.stepvector.nxv8i32() +; CHECK-NEXT: [[TMP15:%.*]] = icmp uge [[TMP5]], [[BROADCAST_SPLAT8]] +; CHECK-NEXT: [[TMP8:%.*]] = call @llvm.vp.merge.nxv8i8( [[TMP0]], [[BROADCAST_SPLAT2]], splat (i8 1), i32 [[TMP3]]) +; CHECK-NEXT: [[TMP9:%.*]] = udiv [[VEC_IND]], [[TMP8]] +; CHECK-NEXT: [[TMP10:%.*]] = zext [[TMP9]] to +; CHECK-NEXT: [[TMP11:%.*]] = call @llvm.vp.merge.nxv8i16( [[TMP0]], [[BROADCAST_SPLAT4]], splat (i16 1), i32 [[TMP3]]) +; CHECK-NEXT: [[TMP12:%.*]] = sdiv [[TMP10]], [[TMP11]] +; CHECK-NEXT: [[TMP13:%.*]] = sext [[TMP12]] to +; CHECK-NEXT: [[PREDPHI:%.*]] = select i1 [[C]], zeroinitializer, [[TMP13]] +; CHECK-NEXT: [[AVL_NEXT]] = sub nuw i32 [[AVL]], [[TMP3]] +; CHECK-NEXT: [[VEC_IND_NEXT]] = add [[VEC_IND]], [[BROADCAST_SPLAT6]] +; CHECK-NEXT: [[TMP14:%.*]] = icmp eq i32 [[AVL_NEXT]], 0 +; CHECK-NEXT: br i1 [[TMP14]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP11:![0-9]+]] ; CHECK: middle.block: -; CHECK-NEXT: [[TMP16:%.*]] = call i32 @llvm.vscale.i32() -; CHECK-NEXT: [[TMP17:%.*]] = mul nuw i32 [[TMP16]], 2 -; CHECK-NEXT: [[TMP18:%.*]] = sub i32 [[TMP17]], 1 -; CHECK-NEXT: [[TMP19:%.*]] = extractelement [[PREDPHI]], i32 [[TMP18]] -; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i32 12, [[N_VEC]] -; CHECK-NEXT: br i1 [[CMP_N]], label [[EXIT:%.*]], label [[SCALAR_PH]] -; CHECK: scalar.ph: -; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i16 [ [[TMP4]], [[MIDDLE_BLOCK]] ], [ -12, [[ENTRY:%.*]] ] 
-; CHECK-NEXT: [[BC_RESUME_VAL8:%.*]] = phi i8 [ [[TMP5]], [[MIDDLE_BLOCK]] ], [ -12, [[ENTRY]] ] -; CHECK-NEXT: br label [[LOOP_HEADER:%.*]] -; CHECK: loop.header: -; CHECK-NEXT: [[IV:%.*]] = phi i16 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[LOOP_LATCH:%.*]] ] -; CHECK-NEXT: [[NARROW_IV:%.*]] = phi i8 [ [[BC_RESUME_VAL8]], [[SCALAR_PH]] ], [ [[IV_NEXT_TRUNC:%.*]], [[LOOP_LATCH]] ] -; CHECK-NEXT: br i1 [[C]], label [[LOOP_LATCH]], label [[THEN:%.*]] -; CHECK: then: -; CHECK-NEXT: [[UD:%.*]] = udiv i8 [[NARROW_IV]], [[X]] -; CHECK-NEXT: [[UD_EXT:%.*]] = zext i8 [[UD]] to i16 -; CHECK-NEXT: [[SD:%.*]] = sdiv i16 [[UD_EXT]], [[Y]] -; CHECK-NEXT: [[SD_EXT:%.*]] = sext i16 [[SD]] to i32 -; CHECK-NEXT: br label [[LOOP_LATCH]] -; CHECK: loop.latch: -; CHECK-NEXT: [[MERGE:%.*]] = phi i32 [ 0, [[LOOP_HEADER]] ], [ [[SD_EXT]], [[THEN]] ] -; CHECK-NEXT: [[IV_NEXT]] = add nsw i16 [[IV]], 1 -; CHECK-NEXT: [[EC:%.*]] = icmp eq i16 [[IV_NEXT]], 0 -; CHECK-NEXT: [[IV_NEXT_TRUNC]] = trunc i16 [[IV_NEXT]] to i8 -; CHECK-NEXT: br i1 [[EC]], label [[EXIT]], label [[LOOP_HEADER]], !llvm.loop [[LOOP12:![0-9]+]] +; CHECK-NEXT: [[TMP16:%.*]] = call i64 @llvm.experimental.cttz.elts.i64.nxv8i1( [[TMP15]], i1 true) +; CHECK-NEXT: [[TMP17:%.*]] = sub i64 [[TMP16]], 1 +; CHECK-NEXT: [[TMP18:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-NEXT: [[TMP19:%.*]] = mul nuw i64 [[TMP18]], 8 +; CHECK-NEXT: [[TMP20:%.*]] = mul i64 [[TMP19]], 0 +; CHECK-NEXT: [[MERGE_LCSSA:%.*]] = extractelement [[PREDPHI]], i64 [[TMP17]] +; CHECK-NEXT: br label [[LOOP_LATCH:%.*]] ; CHECK: exit: -; CHECK-NEXT: [[MERGE_LCSSA:%.*]] = phi i32 [ [[MERGE]], [[LOOP_LATCH]] ], [ [[TMP19]], [[MIDDLE_BLOCK]] ] ; CHECK-NEXT: ret i32 [[MERGE_LCSSA]] ; ; FIXED-LABEL: @udiv_sdiv_with_invariant_divisors( diff --git a/llvm/test/Transforms/LoopVectorize/RISCV/first-order-recurrence-scalable-vf1.ll b/llvm/test/Transforms/LoopVectorize/RISCV/first-order-recurrence-scalable-vf1.ll index 7eb3d7fc5a36d..0083da77dfea3 100644 
--- a/llvm/test/Transforms/LoopVectorize/RISCV/first-order-recurrence-scalable-vf1.ll +++ b/llvm/test/Transforms/LoopVectorize/RISCV/first-order-recurrence-scalable-vf1.ll @@ -7,55 +7,54 @@ target triple = "riscv64-unknown-linux-gnu" define i64 @pr97452_scalable_vf1_for(ptr %src, ptr noalias %dst) #0 { ; CHECK-LABEL: define i64 @pr97452_scalable_vf1_for( ; CHECK-SAME: ptr [[SRC:%.*]], ptr noalias [[DST:%.*]]) #[[ATTR0:[0-9]+]] { -; CHECK-NEXT: [[ENTRY:.*]]: -; CHECK-NEXT: [[TMP0:%.*]] = call i64 @llvm.vscale.i64() -; CHECK-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 23, [[TMP0]] -; CHECK-NEXT: br i1 [[MIN_ITERS_CHECK]], label %[[SCALAR_PH:.*]], label %[[VECTOR_PH:.*]] +; CHECK-NEXT: [[ENTRY:.*:]] +; CHECK-NEXT: br label %[[VECTOR_PH:.*]] ; CHECK: [[VECTOR_PH]]: -; CHECK-NEXT: [[TMP1:%.*]] = call i64 @llvm.vscale.i64() -; CHECK-NEXT: [[N_MOD_VF:%.*]] = urem i64 23, [[TMP1]] -; CHECK-NEXT: [[N_VEC:%.*]] = sub i64 23, [[N_MOD_VF]] +; CHECK-NEXT: [[TMP0:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-NEXT: [[TMP1:%.*]] = mul nuw i64 [[TMP0]], 2 +; CHECK-NEXT: [[TMP2:%.*]] = trunc i64 [[TMP1]] to i32 ; CHECK-NEXT: [[TMP3:%.*]] = call i32 @llvm.vscale.i32() -; CHECK-NEXT: [[TMP4:%.*]] = sub i32 [[TMP3]], 1 -; CHECK-NEXT: [[VECTOR_RECUR_INIT:%.*]] = insertelement poison, i64 0, i32 [[TMP4]] +; CHECK-NEXT: [[TMP4:%.*]] = mul nuw i32 [[TMP3]], 2 +; CHECK-NEXT: [[TMP5:%.*]] = sub i32 [[TMP4]], 1 +; CHECK-NEXT: [[VECTOR_RECUR_INIT:%.*]] = insertelement poison, i64 0, i32 [[TMP5]] ; CHECK-NEXT: br label %[[VECTOR_BODY:.*]] ; CHECK: [[VECTOR_BODY]]: -; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ] -; CHECK-NEXT: [[VECTOR_RECUR:%.*]] = phi [ [[VECTOR_RECUR_INIT]], %[[VECTOR_PH]] ], [ [[WIDE_LOAD:%.*]], %[[VECTOR_BODY]] ] -; CHECK-NEXT: [[TMP5:%.*]] = getelementptr inbounds i64, ptr [[SRC]], i64 [[INDEX]] -; CHECK-NEXT: [[WIDE_LOAD]] = load , ptr [[TMP5]], align 8 -; CHECK-NEXT: [[TMP7:%.*]] = call @llvm.vector.splice.nxv1i64( 
[[VECTOR_RECUR]], [[WIDE_LOAD]], i32 -1) -; CHECK-NEXT: [[TMP8:%.*]] = getelementptr inbounds i64, ptr [[DST]], i64 [[INDEX]] -; CHECK-NEXT: store [[TMP7]], ptr [[TMP8]], align 8 -; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP1]] -; CHECK-NEXT: [[TMP9:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] -; CHECK-NEXT: br i1 [[TMP9]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]] +; CHECK-NEXT: [[EVL_BASED_IV:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_EVL_NEXT:%.*]], %[[VECTOR_BODY]] ] +; CHECK-NEXT: [[VECTOR_RECUR:%.*]] = phi [ [[VECTOR_RECUR_INIT]], %[[VECTOR_PH]] ], [ [[VP_OP_LOAD:%.*]], %[[VECTOR_BODY]] ] +; CHECK-NEXT: [[AVL:%.*]] = phi i64 [ 23, %[[VECTOR_PH]] ], [ [[AVL_NEXT:%.*]], %[[VECTOR_BODY]] ] +; CHECK-NEXT: [[PREV_EVL:%.*]] = phi i32 [ [[TMP2]], %[[VECTOR_PH]] ], [ [[TMP6:%.*]], %[[VECTOR_BODY]] ] +; CHECK-NEXT: [[TMP6]] = call i32 @llvm.experimental.get.vector.length.i64(i64 [[AVL]], i32 2, i1 true) +; CHECK-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement poison, i32 [[TMP6]], i64 0 +; CHECK-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector [[BROADCAST_SPLATINSERT]], poison, zeroinitializer +; CHECK-NEXT: [[TMP7:%.*]] = call @llvm.stepvector.nxv2i32() +; CHECK-NEXT: [[TMP8:%.*]] = icmp uge [[TMP7]], [[BROADCAST_SPLAT]] +; CHECK-NEXT: [[TMP9:%.*]] = getelementptr inbounds i64, ptr [[SRC]], i64 [[EVL_BASED_IV]] +; CHECK-NEXT: [[VP_OP_LOAD]] = call @llvm.vp.load.nxv2i64.p0(ptr align 8 [[TMP9]], splat (i1 true), i32 [[TMP6]]) +; CHECK-NEXT: [[TMP10:%.*]] = call @llvm.experimental.vp.splice.nxv2i64( [[VECTOR_RECUR]], [[VP_OP_LOAD]], i32 -1, splat (i1 true), i32 [[PREV_EVL]], i32 [[TMP6]]) +; CHECK-NEXT: [[TMP11:%.*]] = getelementptr inbounds i64, ptr [[DST]], i64 [[EVL_BASED_IV]] +; CHECK-NEXT: call void @llvm.vp.store.nxv2i64.p0( [[TMP10]], ptr align 8 [[TMP11]], splat (i1 true), i32 [[TMP6]]) +; CHECK-NEXT: [[TMP12:%.*]] = zext i32 [[TMP6]] to i64 +; CHECK-NEXT: [[INDEX_EVL_NEXT]] = add nuw i64 [[TMP12]], 
[[EVL_BASED_IV]] +; CHECK-NEXT: [[AVL_NEXT]] = sub nuw i64 [[AVL]], [[TMP12]] +; CHECK-NEXT: [[TMP13:%.*]] = icmp eq i64 [[AVL_NEXT]], 0 +; CHECK-NEXT: br i1 [[TMP13]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]] ; CHECK: [[MIDDLE_BLOCK]]: -; CHECK-NEXT: [[TMP11:%.*]] = call i32 @llvm.vscale.i32() -; CHECK-NEXT: [[TMP12:%.*]] = sub i32 [[TMP11]], 1 -; CHECK-NEXT: [[VECTOR_RECUR_EXTRACT:%.*]] = extractelement [[WIDE_LOAD]], i32 [[TMP12]] -; CHECK-NEXT: [[TMP14:%.*]] = call i32 @llvm.vscale.i32() -; CHECK-NEXT: [[TMP15:%.*]] = sub i32 [[TMP14]], 1 -; CHECK-NEXT: [[TMP16:%.*]] = extractelement [[TMP7]], i32 [[TMP15]] -; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 23, [[N_VEC]] -; CHECK-NEXT: br i1 [[CMP_N]], label %[[EXIT:.*]], label %[[SCALAR_PH]] -; CHECK: [[SCALAR_PH]]: -; CHECK-NEXT: [[SCALAR_RECUR_INIT:%.*]] = phi i64 [ [[VECTOR_RECUR_EXTRACT]], %[[MIDDLE_BLOCK]] ], [ 0, %[[ENTRY]] ] -; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], %[[MIDDLE_BLOCK]] ], [ 0, %[[ENTRY]] ] -; CHECK-NEXT: br label %[[LOOP:.*]] -; CHECK: [[LOOP]]: -; CHECK-NEXT: [[FOR:%.*]] = phi i64 [ [[SCALAR_RECUR_INIT]], %[[SCALAR_PH]] ], [ [[L:%.*]], %[[LOOP]] ] -; CHECK-NEXT: [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], %[[LOOP]] ] -; CHECK-NEXT: [[IV_NEXT]] = add i64 [[IV]], 1 -; CHECK-NEXT: [[GEP_SRC:%.*]] = getelementptr inbounds i64, ptr [[SRC]], i64 [[IV]] -; CHECK-NEXT: [[L]] = load i64, ptr [[GEP_SRC]], align 8 -; CHECK-NEXT: [[GEP_DST:%.*]] = getelementptr inbounds i64, ptr [[DST]], i64 [[IV]] -; CHECK-NEXT: store i64 [[FOR]], ptr [[GEP_DST]], align 8 -; CHECK-NEXT: [[EC:%.*]] = icmp eq i64 [[IV]], 22 -; CHECK-NEXT: br i1 [[EC]], label %[[EXIT]], label %[[LOOP]], !llvm.loop [[LOOP3:![0-9]+]] +; CHECK-NEXT: [[TMP14:%.*]] = call i64 @llvm.experimental.cttz.elts.i64.nxv2i1( [[TMP8]], i1 true) +; CHECK-NEXT: [[TMP15:%.*]] = sub i64 [[TMP14]], 1 +; CHECK-NEXT: [[TMP16:%.*]] = sub i64 [[TMP15]], 1 +; CHECK-NEXT: 
[[TMP17:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-NEXT: [[TMP18:%.*]] = mul nuw i64 [[TMP17]], 2 +; CHECK-NEXT: [[TMP19:%.*]] = mul i64 [[TMP18]], 0 +; CHECK-NEXT: [[TMP20:%.*]] = extractelement [[VP_OP_LOAD]], i64 [[TMP16]] +; CHECK-NEXT: [[TMP21:%.*]] = call i32 @llvm.vscale.i32() +; CHECK-NEXT: [[TMP22:%.*]] = mul nuw i32 [[TMP21]], 2 +; CHECK-NEXT: [[TMP23:%.*]] = sub i32 [[TMP22]], 1 +; CHECK-NEXT: [[TMP24:%.*]] = extractelement [[VECTOR_RECUR]], i32 [[TMP23]] +; CHECK-NEXT: [[TMP25:%.*]] = icmp eq i64 [[TMP15]], 0 +; CHECK-NEXT: [[TMP26:%.*]] = select i1 [[TMP25]], i64 [[TMP24]], i64 [[TMP20]] +; CHECK-NEXT: br label %[[EXIT:.*]] ; CHECK: [[EXIT]]: -; CHECK-NEXT: [[RES:%.*]] = phi i64 [ [[FOR]], %[[LOOP]] ], [ [[TMP16]], %[[MIDDLE_BLOCK]] ] -; CHECK-NEXT: ret i64 [[RES]] +; CHECK-NEXT: ret i64 [[TMP26]] ; entry: br label %loop @@ -81,5 +80,4 @@ attributes #0 = { "target-features"="+64bit,+v,+zvl128b,+zvl256b" } ; CHECK: [[LOOP0]] = distinct !{[[LOOP0]], [[META1:![0-9]+]], [[META2:![0-9]+]]} ; CHECK: [[META1]] = !{!"llvm.loop.isvectorized", i32 1} ; CHECK: [[META2]] = !{!"llvm.loop.unroll.runtime.disable"} -; CHECK: [[LOOP3]] = distinct !{[[LOOP3]], [[META2]], [[META1]]} ;. 
diff --git a/llvm/test/Transforms/LoopVectorize/RISCV/scalable-tailfold.ll b/llvm/test/Transforms/LoopVectorize/RISCV/scalable-tailfold.ll index 3c90908b0a08f..e09284f26f6db 100644 --- a/llvm/test/Transforms/LoopVectorize/RISCV/scalable-tailfold.ll +++ b/llvm/test/Transforms/LoopVectorize/RISCV/scalable-tailfold.ll @@ -71,7 +71,7 @@ define void @indexed_store(ptr noalias nocapture %a, ptr noalias nocapture %b, i ; CHECK-NEXT: [[INDEX_EVL_NEXT]] = add nuw i64 [[TMP12]], [[EVL_BASED_IV]] ; CHECK-NEXT: [[AVL_NEXT]] = sub nuw i64 [[AVL]], [[TMP12]] ; CHECK-NEXT: [[TMP9:%.*]] = icmp eq i64 [[AVL_NEXT]], 0 -; CHECK-NEXT: br i1 [[TMP9]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]] +; CHECK-NEXT: br i1 [[TMP9]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP3:![0-9]+]] ; CHECK: middle.block: ; CHECK-NEXT: br label [[FOR_BODY:%.*]] ; CHECK: for.end: @@ -115,7 +115,7 @@ define i64 @indexed_load(ptr noalias nocapture %a, ptr noalias nocapture %b, i64 ; CHECK-NEXT: [[INDEX_EVL_NEXT]] = add nuw i64 [[TMP15]], [[EVL_BASED_IV]] ; CHECK-NEXT: [[AVL_NEXT]] = sub nuw i64 [[AVL]], [[TMP15]] ; CHECK-NEXT: [[TMP13:%.*]] = icmp eq i64 [[AVL_NEXT]], 0 -; CHECK-NEXT: br i1 [[TMP13]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP5:![0-9]+]] +; CHECK-NEXT: br i1 [[TMP13]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]] ; CHECK: middle.block: ; CHECK-NEXT: [[TMP14:%.*]] = call i64 @llvm.vector.reduce.add.nxv2i64( [[TMP11]]) ; CHECK-NEXT: br label [[FOR_BODY:%.*]] @@ -159,7 +159,7 @@ define void @splat_int(ptr noalias nocapture %a, i64 %v, i64 %n) { ; CHECK-NEXT: [[INDEX_EVL_NEXT]] = add nuw i64 [[TMP7]], [[EVL_BASED_IV]] ; CHECK-NEXT: [[AVL_NEXT]] = sub nuw i64 [[AVL]], [[TMP7]] ; CHECK-NEXT: [[TMP8:%.*]] = icmp eq i64 [[AVL_NEXT]], 0 -; CHECK-NEXT: br i1 [[TMP8]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP6:![0-9]+]] +; CHECK-NEXT: br i1 [[TMP8]], label 
[[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP5:![0-9]+]] ; CHECK: middle.block: ; CHECK-NEXT: br label [[FOR_BODY:%.*]] ; CHECK: for.end: @@ -199,7 +199,7 @@ define void @uniform_store(ptr noalias nocapture %a, ptr noalias nocapture %b, i ; CHECK-NEXT: [[INDEX_EVL_NEXT]] = add nuw i64 [[TMP9]], [[EVL_BASED_IV]] ; CHECK-NEXT: [[AVL_NEXT]] = sub nuw i64 [[AVL]], [[TMP9]] ; CHECK-NEXT: [[TMP10:%.*]] = icmp eq i64 [[AVL_NEXT]], 0 -; CHECK-NEXT: br i1 [[TMP10]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP7:![0-9]+]] +; CHECK-NEXT: br i1 [[TMP10]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP6:![0-9]+]] ; CHECK: middle.block: ; CHECK-NEXT: br label [[FOR_BODY:%.*]] ; CHECK: for.end: @@ -224,43 +224,37 @@ for.end: define i64 @uniform_load(ptr noalias nocapture %a, ptr noalias nocapture %b, i64 %n) { ; CHECK-LABEL: @uniform_load( ; CHECK-NEXT: entry: -; CHECK-NEXT: [[TMP0:%.*]] = call i64 @llvm.vscale.i64() -; CHECK-NEXT: [[TMP1:%.*]] = shl nuw i64 [[TMP0]], 1 -; CHECK-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 1025, [[TMP1]] -; CHECK-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[ENTRY:%.*]] +; CHECK-NEXT: br label [[ENTRY:%.*]] ; CHECK: vector.ph: -; CHECK-NEXT: [[TMP2:%.*]] = call i64 @llvm.vscale.i64() -; CHECK-NEXT: [[TMP3:%.*]] = mul nuw i64 [[TMP2]], 2 -; CHECK-NEXT: [[N_MOD_VF:%.*]] = urem i64 1025, [[TMP3]] -; CHECK-NEXT: [[N_VEC:%.*]] = sub i64 1025, [[N_MOD_VF]] ; CHECK-NEXT: br label [[FOR_BODY:%.*]] ; CHECK: vector.body: ; CHECK-NEXT: [[IV:%.*]] = phi i64 [ 0, [[ENTRY]] ], [ [[IV_NEXT:%.*]], [[FOR_BODY]] ] +; CHECK-NEXT: [[AVL:%.*]] = phi i64 [ 1025, [[ENTRY]] ], [ [[AVL_NEXT:%.*]], [[FOR_BODY]] ] +; CHECK-NEXT: [[TMP0:%.*]] = call i32 @llvm.experimental.get.vector.length.i64(i64 [[AVL]], i32 2, i1 true) +; CHECK-NEXT: [[BROADCAST_SPLATINSERT1:%.*]] = insertelement poison, i32 [[TMP0]], i64 0 +; CHECK-NEXT: [[BROADCAST_SPLAT1:%.*]] = shufflevector [[BROADCAST_SPLATINSERT1]], 
poison, zeroinitializer +; CHECK-NEXT: [[TMP1:%.*]] = call @llvm.stepvector.nxv2i32() +; CHECK-NEXT: [[TMP2:%.*]] = icmp uge [[TMP1]], [[BROADCAST_SPLAT1]] ; CHECK-NEXT: [[V:%.*]] = load i64, ptr [[B:%.*]], align 8 ; CHECK-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement poison, i64 [[V]], i64 0 ; CHECK-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector [[BROADCAST_SPLATINSERT]], poison, zeroinitializer ; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i64, ptr [[A:%.*]], i64 [[IV]] -; CHECK-NEXT: store [[BROADCAST_SPLAT]], ptr [[ARRAYIDX]], align 8 -; CHECK-NEXT: [[IV_NEXT]] = add nuw i64 [[IV]], [[TMP3]] -; CHECK-NEXT: [[TMP8:%.*]] = icmp eq i64 [[IV_NEXT]], [[N_VEC]] -; CHECK-NEXT: br i1 [[TMP8]], label [[MIDDLE_BLOCK:%.*]], label [[FOR_BODY]], !llvm.loop [[LOOP8:![0-9]+]] +; CHECK-NEXT: call void @llvm.vp.store.nxv2i64.p0( [[BROADCAST_SPLAT]], ptr align 8 [[ARRAYIDX]], splat (i1 true), i32 [[TMP0]]) +; CHECK-NEXT: [[TMP5:%.*]] = zext i32 [[TMP0]] to i64 +; CHECK-NEXT: [[IV_NEXT]] = add nuw i64 [[TMP5]], [[IV]] +; CHECK-NEXT: [[AVL_NEXT]] = sub nuw i64 [[AVL]], [[TMP5]] +; CHECK-NEXT: [[TMP6:%.*]] = icmp eq i64 [[AVL_NEXT]], 0 +; CHECK-NEXT: br i1 [[TMP6]], label [[MIDDLE_BLOCK:%.*]], label [[FOR_BODY]], !llvm.loop [[LOOP7:![0-9]+]] ; CHECK: middle.block: -; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 1025, [[N_VEC]] -; CHECK-NEXT: br i1 [[CMP_N]], label [[FOR_END:%.*]], label [[SCALAR_PH]] -; CHECK: scalar.ph: -; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY1:%.*]] ] +; CHECK-NEXT: [[TMP7:%.*]] = call i64 @llvm.experimental.cttz.elts.i64.nxv2i1( [[TMP2]], i1 true) +; CHECK-NEXT: [[TMP8:%.*]] = sub i64 [[TMP7]], 1 +; CHECK-NEXT: [[TMP9:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-NEXT: [[TMP10:%.*]] = mul nuw i64 [[TMP9]], 2 +; CHECK-NEXT: [[TMP11:%.*]] = mul i64 [[TMP10]], 0 +; CHECK-NEXT: [[TMP12:%.*]] = extractelement [[BROADCAST_SPLAT]], i64 [[TMP8]] ; CHECK-NEXT: br label [[FOR_BODY1:%.*]] -; CHECK: for.body: -; 
CHECK-NEXT: [[IV1:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[IV_NEXT1:%.*]], [[FOR_BODY1]] ] -; CHECK-NEXT: [[V1:%.*]] = load i64, ptr [[B]], align 8 -; CHECK-NEXT: [[ARRAYIDX1:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[IV1]] -; CHECK-NEXT: store i64 [[V1]], ptr [[ARRAYIDX1]], align 8 -; CHECK-NEXT: [[IV_NEXT1]] = add nuw nsw i64 [[IV1]], 1 -; CHECK-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV_NEXT1]], 1025 -; CHECK-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_END]], label [[FOR_BODY1]], !llvm.loop [[LOOP9:![0-9]+]] ; CHECK: for.end: -; CHECK-NEXT: [[V_LCSSA:%.*]] = phi i64 [ [[V1]], [[FOR_BODY1]] ], [ [[V]], [[MIDDLE_BLOCK]] ] -; CHECK-NEXT: ret i64 [[V_LCSSA]] +; CHECK-NEXT: ret i64 [[TMP12]] ; entry: br label %for.body @@ -299,7 +293,7 @@ define void @vector_add_trip1024(ptr noalias nocapture %a, i64 %v, i64 %n) { ; CHECK-NEXT: [[INDEX_EVL_NEXT]] = add nuw i64 [[TMP8]], [[EVL_BASED_IV]] ; CHECK-NEXT: [[AVL_NEXT]] = sub nuw i64 [[AVL]], [[TMP8]] ; CHECK-NEXT: [[TMP9:%.*]] = icmp eq i64 [[AVL_NEXT]], 0 -; CHECK-NEXT: br i1 [[TMP9]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP10:![0-9]+]] +; CHECK-NEXT: br i1 [[TMP9]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP8:![0-9]+]] ; CHECK: middle.block: ; CHECK-NEXT: br label [[FOR_BODY:%.*]] ; CHECK: for.end: diff --git a/llvm/test/Transforms/LoopVectorize/RISCV/tail-folding-fixed-order-recurrence.ll b/llvm/test/Transforms/LoopVectorize/RISCV/tail-folding-fixed-order-recurrence.ll index c7ba826295de8..a89435f4b24e3 100644 --- a/llvm/test/Transforms/LoopVectorize/RISCV/tail-folding-fixed-order-recurrence.ll +++ b/llvm/test/Transforms/LoopVectorize/RISCV/tail-folding-fixed-order-recurrence.ll @@ -400,61 +400,54 @@ for.end: define i32 @FOR_reduction(ptr noalias %A, ptr noalias %B, i64 %TC) { ; IF-EVL-LABEL: define i32 @FOR_reduction( ; IF-EVL-SAME: ptr noalias [[A:%.*]], ptr noalias [[B:%.*]], i64 [[TC:%.*]]) #[[ATTR0]] { -; IF-EVL-NEXT: [[ENTRY:.*]]: 
-; IF-EVL-NEXT: [[TMP9:%.*]] = call i64 @llvm.vscale.i64() -; IF-EVL-NEXT: [[TMP1:%.*]] = shl nuw i64 [[TMP9]], 2 -; IF-EVL-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[TC]], [[TMP1]] -; IF-EVL-NEXT: br i1 [[MIN_ITERS_CHECK]], label %[[SCALAR_PH:.*]], label %[[VECTOR_PH:.*]] +; IF-EVL-NEXT: [[ENTRY:.*:]] +; IF-EVL-NEXT: br label %[[VECTOR_PH:.*]] ; IF-EVL: [[VECTOR_PH]]: ; IF-EVL-NEXT: [[TMP2:%.*]] = call i64 @llvm.vscale.i64() ; IF-EVL-NEXT: [[TMP3:%.*]] = mul nuw i64 [[TMP2]], 4 -; IF-EVL-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[TC]], [[TMP3]] -; IF-EVL-NEXT: [[N_VEC:%.*]] = sub i64 [[TC]], [[N_MOD_VF]] +; IF-EVL-NEXT: [[TMP4:%.*]] = trunc i64 [[TMP3]] to i32 ; IF-EVL-NEXT: [[TMP6:%.*]] = call i32 @llvm.vscale.i32() ; IF-EVL-NEXT: [[TMP7:%.*]] = mul nuw i32 [[TMP6]], 4 ; IF-EVL-NEXT: [[TMP8:%.*]] = sub i32 [[TMP7]], 1 ; IF-EVL-NEXT: [[VECTOR_RECUR_INIT:%.*]] = insertelement poison, i32 33, i32 [[TMP8]] ; IF-EVL-NEXT: br label %[[VECTOR_BODY:.*]] ; IF-EVL: [[VECTOR_BODY]]: -; IF-EVL-NEXT: [[INDVARS:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ] +; IF-EVL-NEXT: [[INDVARS:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_EVL_NEXT:%.*]], %[[VECTOR_BODY]] ] ; IF-EVL-NEXT: [[VECTOR_RECUR:%.*]] = phi [ [[VECTOR_RECUR_INIT]], %[[VECTOR_PH]] ], [ [[WIDE_LOAD:%.*]], %[[VECTOR_BODY]] ] +; IF-EVL-NEXT: [[AVL:%.*]] = phi i64 [ [[TC]], %[[VECTOR_PH]] ], [ [[AVL_NEXT:%.*]], %[[VECTOR_BODY]] ] +; IF-EVL-NEXT: [[PREV_EVL:%.*]] = phi i32 [ [[TMP4]], %[[VECTOR_PH]] ], [ [[TMP9:%.*]], %[[VECTOR_BODY]] ] +; IF-EVL-NEXT: [[TMP9]] = call i32 @llvm.experimental.get.vector.length.i64(i64 [[AVL]], i32 4, i1 true) +; IF-EVL-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement poison, i32 [[TMP9]], i64 0 +; IF-EVL-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector [[BROADCAST_SPLATINSERT]], poison, zeroinitializer +; IF-EVL-NEXT: [[TMP22:%.*]] = call @llvm.stepvector.nxv4i32() +; IF-EVL-NEXT: [[TMP23:%.*]] = icmp uge [[TMP22]], [[BROADCAST_SPLAT]] ; IF-EVL-NEXT: 
[[ARRAYIDX:%.*]] = getelementptr inbounds nuw i32, ptr [[A]], i64 [[INDVARS]] -; IF-EVL-NEXT: [[WIDE_LOAD]] = load , ptr [[ARRAYIDX]], align 4 -; IF-EVL-NEXT: [[TMP10:%.*]] = call @llvm.vector.splice.nxv4i32( [[VECTOR_RECUR]], [[WIDE_LOAD]], i32 -1) +; IF-EVL-NEXT: [[WIDE_LOAD]] = call @llvm.vp.load.nxv4i32.p0(ptr align 4 [[ARRAYIDX]], splat (i1 true), i32 [[TMP9]]) +; IF-EVL-NEXT: [[TMP10:%.*]] = call @llvm.experimental.vp.splice.nxv4i32( [[VECTOR_RECUR]], [[WIDE_LOAD]], i32 -1, splat (i1 true), i32 [[PREV_EVL]], i32 [[TMP9]]) ; IF-EVL-NEXT: [[TMP11:%.*]] = add nsw [[TMP10]], [[WIDE_LOAD]] ; IF-EVL-NEXT: [[TMP12:%.*]] = getelementptr inbounds nuw i32, ptr [[B]], i64 [[INDVARS]] -; IF-EVL-NEXT: store [[TMP11]], ptr [[TMP12]], align 4 -; IF-EVL-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDVARS]], [[TMP3]] -; IF-EVL-NEXT: [[TMP13:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] -; IF-EVL-NEXT: br i1 [[TMP13]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP5:![0-9]+]] +; IF-EVL-NEXT: call void @llvm.vp.store.nxv4i32.p0( [[TMP11]], ptr align 4 [[TMP12]], splat (i1 true), i32 [[TMP9]]) +; IF-EVL-NEXT: [[TMP13:%.*]] = zext i32 [[TMP9]] to i64 +; IF-EVL-NEXT: [[INDEX_EVL_NEXT]] = add i64 [[TMP13]], [[INDVARS]] +; IF-EVL-NEXT: [[AVL_NEXT]] = sub nuw i64 [[AVL]], [[TMP13]] +; IF-EVL-NEXT: [[TMP24:%.*]] = icmp eq i64 [[AVL_NEXT]], 0 +; IF-EVL-NEXT: br i1 [[TMP24]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP5:![0-9]+]] ; IF-EVL: [[MIDDLE_BLOCK]]: +; IF-EVL-NEXT: [[TMP27:%.*]] = call i64 @llvm.experimental.cttz.elts.i64.nxv4i1( [[TMP23]], i1 true) +; IF-EVL-NEXT: [[TMP28:%.*]] = sub i64 [[TMP27]], 1 +; IF-EVL-NEXT: [[TMP17:%.*]] = sub i64 [[TMP28]], 1 +; IF-EVL-NEXT: [[TMP18:%.*]] = call i64 @llvm.vscale.i64() +; IF-EVL-NEXT: [[TMP19:%.*]] = mul nuw i64 [[TMP18]], 4 +; IF-EVL-NEXT: [[TMP20:%.*]] = mul i64 [[TMP19]], 0 +; IF-EVL-NEXT: [[TMP21:%.*]] = extractelement [[WIDE_LOAD]], i64 [[TMP17]] ; IF-EVL-NEXT: [[TMP14:%.*]] = call i32 
@llvm.vscale.i32() ; IF-EVL-NEXT: [[TMP15:%.*]] = mul nuw i32 [[TMP14]], 4 ; IF-EVL-NEXT: [[TMP16:%.*]] = sub i32 [[TMP15]], 1 -; IF-EVL-NEXT: [[VECTOR_RECUR_EXTRACT:%.*]] = extractelement [[WIDE_LOAD]], i32 [[TMP16]] -; IF-EVL-NEXT: [[TMP17:%.*]] = call i32 @llvm.vscale.i32() -; IF-EVL-NEXT: [[TMP18:%.*]] = mul nuw i32 [[TMP17]], 4 -; IF-EVL-NEXT: [[TMP19:%.*]] = sub i32 [[TMP18]], 2 -; IF-EVL-NEXT: [[VECTOR_RECUR_EXTRACT_FOR_PHI:%.*]] = extractelement [[WIDE_LOAD]], i32 [[TMP19]] -; IF-EVL-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[TC]], [[N_VEC]] -; IF-EVL-NEXT: br i1 [[CMP_N]], label %[[FOR_END:.*]], label %[[SCALAR_PH]] -; IF-EVL: [[SCALAR_PH]]: -; IF-EVL-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], %[[MIDDLE_BLOCK]] ], [ 0, %[[ENTRY]] ] -; IF-EVL-NEXT: [[SCALAR_RECUR_INIT:%.*]] = phi i32 [ [[VECTOR_RECUR_EXTRACT]], %[[MIDDLE_BLOCK]] ], [ 33, %[[ENTRY]] ] -; IF-EVL-NEXT: br label %[[FOR_BODY:.*]] -; IF-EVL: [[FOR_BODY]]: -; IF-EVL-NEXT: [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ], [ [[INDVARS_NEXT:%.*]], %[[FOR_BODY]] ] -; IF-EVL-NEXT: [[FOR1:%.*]] = phi i32 [ [[SCALAR_RECUR_INIT]], %[[SCALAR_PH]] ], [ [[TMP0:%.*]], %[[FOR_BODY]] ] -; IF-EVL-NEXT: [[ARRAYIDX1:%.*]] = getelementptr inbounds nuw i32, ptr [[A]], i64 [[IV]] -; IF-EVL-NEXT: [[TMP0]] = load i32, ptr [[ARRAYIDX1]], align 4 -; IF-EVL-NEXT: [[ADD:%.*]] = add nsw i32 [[FOR1]], [[TMP0]] -; IF-EVL-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds nuw i32, ptr [[B]], i64 [[IV]] -; IF-EVL-NEXT: store i32 [[ADD]], ptr [[ARRAYIDX2]], align 4 -; IF-EVL-NEXT: [[INDVARS_NEXT]] = add nuw nsw i64 [[IV]], 1 -; IF-EVL-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[INDVARS_NEXT]], [[TC]] -; IF-EVL-NEXT: br i1 [[EXITCOND_NOT]], label %[[FOR_END]], label %[[FOR_BODY]], !llvm.loop [[LOOP6:![0-9]+]] +; IF-EVL-NEXT: [[TMP25:%.*]] = extractelement [[VECTOR_RECUR]], i32 [[TMP16]] +; IF-EVL-NEXT: [[TMP26:%.*]] = icmp eq i64 [[TMP28]], 0 +; IF-EVL-NEXT: [[FOR1_LCSSA:%.*]] = select i1 [[TMP26]], i32 [[TMP25]], i32 
[[TMP21]] +; IF-EVL-NEXT: br label %[[FOR_END:.*]] ; IF-EVL: [[FOR_END]]: -; IF-EVL-NEXT: [[FOR1_LCSSA:%.*]] = phi i32 [ [[FOR1]], %[[FOR_BODY]] ], [ [[VECTOR_RECUR_EXTRACT_FOR_PHI]], %[[MIDDLE_BLOCK]] ] ; IF-EVL-NEXT: ret i32 [[FOR1_LCSSA]] ; ; NO-VP-LABEL: define i32 @FOR_reduction( @@ -570,7 +563,7 @@ define void @first_order_recurrence_indvar(ptr noalias %A, i64 %TC) { ; IF-EVL-NEXT: [[AVL_NEXT]] = sub nuw i64 [[AVL]], [[TMP7]] ; IF-EVL-NEXT: [[VEC_IND_NEXT]] = add [[VEC_IND]], [[BROADCAST_SPLAT]] ; IF-EVL-NEXT: [[TMP22:%.*]] = icmp eq i64 [[AVL_NEXT]], 0 -; IF-EVL-NEXT: br i1 [[TMP22]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP7:![0-9]+]] +; IF-EVL-NEXT: br i1 [[TMP22]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP6:![0-9]+]] ; IF-EVL: [[MIDDLE_BLOCK]]: ; IF-EVL-NEXT: br label %[[FOR_END:.*]] ; IF-EVL: [[FOR_END]]: @@ -662,8 +655,7 @@ for.end: ; IF-EVL: [[LOOP3]] = distinct !{[[LOOP3]], [[META1]], [[META2]]} ; IF-EVL: [[LOOP4]] = distinct !{[[LOOP4]], [[META1]], [[META2]]} ; IF-EVL: [[LOOP5]] = distinct !{[[LOOP5]], [[META1]], [[META2]]} -; IF-EVL: [[LOOP6]] = distinct !{[[LOOP6]], [[META2]], [[META1]]} -; IF-EVL: [[LOOP7]] = distinct !{[[LOOP7]], [[META1]], [[META2]]} +; IF-EVL: [[LOOP6]] = distinct !{[[LOOP6]], [[META1]], [[META2]]} ;. 
; NO-VP: [[LOOP0]] = distinct !{[[LOOP0]], [[META1:![0-9]+]], [[META2:![0-9]+]]} ; NO-VP: [[META1]] = !{!"llvm.loop.isvectorized", i32 1} diff --git a/llvm/test/Transforms/LoopVectorize/RISCV/uniform-load-store.ll b/llvm/test/Transforms/LoopVectorize/RISCV/uniform-load-store.ll index 1e21c753840e9..3f404daef6965 100644 --- a/llvm/test/Transforms/LoopVectorize/RISCV/uniform-load-store.ll +++ b/llvm/test/Transforms/LoopVectorize/RISCV/uniform-load-store.ll @@ -109,44 +109,38 @@ for.end: define i64 @uniform_load_outside_use(ptr noalias nocapture %a, ptr noalias nocapture %b, i64 %n) { ; SCALABLE-LABEL: define i64 @uniform_load_outside_use( ; SCALABLE-SAME: ptr noalias captures(none) [[A:%.*]], ptr noalias captures(none) [[B:%.*]], i64 [[N:%.*]]) #[[ATTR0]] { -; SCALABLE-NEXT: [[ENTRY:.*]]: -; SCALABLE-NEXT: [[TMP0:%.*]] = call i64 @llvm.vscale.i64() -; SCALABLE-NEXT: [[TMP1:%.*]] = shl nuw i64 [[TMP0]], 1 -; SCALABLE-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 1025, [[TMP1]] -; SCALABLE-NEXT: br i1 [[MIN_ITERS_CHECK]], label %[[SCALAR_PH:.*]], label %[[VECTOR_PH:.*]] +; SCALABLE-NEXT: [[ENTRY:.*:]] +; SCALABLE-NEXT: br label %[[VECTOR_PH:.*]] ; SCALABLE: [[VECTOR_PH]]: -; SCALABLE-NEXT: [[TMP2:%.*]] = call i64 @llvm.vscale.i64() -; SCALABLE-NEXT: [[TMP3:%.*]] = mul nuw i64 [[TMP2]], 2 -; SCALABLE-NEXT: [[N_MOD_VF:%.*]] = urem i64 1025, [[TMP3]] -; SCALABLE-NEXT: [[N_VEC:%.*]] = sub i64 1025, [[N_MOD_VF]] ; SCALABLE-NEXT: br label %[[VECTOR_BODY:.*]] ; SCALABLE: [[VECTOR_BODY]]: -; SCALABLE-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ] +; SCALABLE-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_EVL_NEXT:%.*]], %[[VECTOR_BODY]] ] +; SCALABLE-NEXT: [[AVL:%.*]] = phi i64 [ 1025, %[[VECTOR_PH]] ], [ [[AVL_NEXT:%.*]], %[[VECTOR_BODY]] ] +; SCALABLE-NEXT: [[TMP0:%.*]] = call i32 @llvm.experimental.get.vector.length.i64(i64 [[AVL]], i32 2, i1 true) +; SCALABLE-NEXT: [[BROADCAST_SPLATINSERT1:%.*]] = insertelement 
poison, i32 [[TMP0]], i64 0 +; SCALABLE-NEXT: [[BROADCAST_SPLAT1:%.*]] = shufflevector [[BROADCAST_SPLATINSERT1]], poison, zeroinitializer +; SCALABLE-NEXT: [[TMP1:%.*]] = call @llvm.stepvector.nxv2i32() +; SCALABLE-NEXT: [[TMP2:%.*]] = icmp uge [[TMP1]], [[BROADCAST_SPLAT1]] ; SCALABLE-NEXT: [[TMP6:%.*]] = load i64, ptr [[B]], align 8 ; SCALABLE-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement poison, i64 [[TMP6]], i64 0 ; SCALABLE-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector [[BROADCAST_SPLATINSERT]], poison, zeroinitializer ; SCALABLE-NEXT: [[TMP8:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[INDEX]] -; SCALABLE-NEXT: store [[BROADCAST_SPLAT]], ptr [[TMP8]], align 8 -; SCALABLE-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP3]] -; SCALABLE-NEXT: [[TMP10:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] +; SCALABLE-NEXT: call void @llvm.vp.store.nxv2i64.p0( [[BROADCAST_SPLAT]], ptr align 8 [[TMP8]], splat (i1 true), i32 [[TMP0]]) +; SCALABLE-NEXT: [[TMP5:%.*]] = zext i32 [[TMP0]] to i64 +; SCALABLE-NEXT: [[INDEX_EVL_NEXT]] = add nuw i64 [[TMP5]], [[INDEX]] +; SCALABLE-NEXT: [[AVL_NEXT]] = sub nuw i64 [[AVL]], [[TMP5]] +; SCALABLE-NEXT: [[TMP10:%.*]] = icmp eq i64 [[AVL_NEXT]], 0 ; SCALABLE-NEXT: br i1 [[TMP10]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP3:![0-9]+]] ; SCALABLE: [[MIDDLE_BLOCK]]: -; SCALABLE-NEXT: [[CMP_N:%.*]] = icmp eq i64 1025, [[N_VEC]] -; SCALABLE-NEXT: br i1 [[CMP_N]], label %[[FOR_END:.*]], label %[[SCALAR_PH]] -; SCALABLE: [[SCALAR_PH]]: -; SCALABLE-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], %[[MIDDLE_BLOCK]] ], [ 0, %[[ENTRY]] ] -; SCALABLE-NEXT: br label %[[FOR_BODY:.*]] -; SCALABLE: [[FOR_BODY]]: -; SCALABLE-NEXT: [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], %[[FOR_BODY]] ] -; SCALABLE-NEXT: [[V:%.*]] = load i64, ptr [[B]], align 8 -; SCALABLE-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[IV]] -; SCALABLE-NEXT: store i64 [[V]], ptr 
[[ARRAYIDX]], align 8 -; SCALABLE-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1 -; SCALABLE-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV_NEXT]], 1025 -; SCALABLE-NEXT: br i1 [[EXITCOND_NOT]], label %[[FOR_END]], label %[[FOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]] +; SCALABLE-NEXT: [[FIRST_INACTIVE_LANE:%.*]] = call i64 @llvm.experimental.cttz.elts.i64.nxv2i1( [[TMP2]], i1 true) +; SCALABLE-NEXT: [[LAST_ACTIVE_LANE:%.*]] = sub i64 [[FIRST_INACTIVE_LANE]], 1 +; SCALABLE-NEXT: [[TMP7:%.*]] = call i64 @llvm.vscale.i64() +; SCALABLE-NEXT: [[TMP11:%.*]] = mul nuw i64 [[TMP7]], 2 +; SCALABLE-NEXT: [[TMP9:%.*]] = mul i64 [[TMP11]], 0 +; SCALABLE-NEXT: [[TMP12:%.*]] = extractelement [[BROADCAST_SPLAT]], i64 [[LAST_ACTIVE_LANE]] +; SCALABLE-NEXT: br label %[[FOR_END:.*]] ; SCALABLE: [[FOR_END]]: -; SCALABLE-NEXT: [[V_LCSSA:%.*]] = phi i64 [ [[V]], %[[FOR_BODY]] ], [ [[TMP6]], %[[MIDDLE_BLOCK]] ] -; SCALABLE-NEXT: ret i64 [[V_LCSSA]] +; SCALABLE-NEXT: ret i64 [[TMP12]] ; ; FIXEDLEN-LABEL: define i64 @uniform_load_outside_use( ; FIXEDLEN-SAME: ptr noalias captures(none) [[A:%.*]], ptr noalias captures(none) [[B:%.*]], i64 [[N:%.*]]) #[[ATTR0]] { @@ -184,44 +178,38 @@ define i64 @uniform_load_outside_use(ptr noalias nocapture %a, ptr noalias nocap ; ; TF-SCALABLE-LABEL: define i64 @uniform_load_outside_use( ; TF-SCALABLE-SAME: ptr noalias captures(none) [[A:%.*]], ptr noalias captures(none) [[B:%.*]], i64 [[N:%.*]]) #[[ATTR0]] { -; TF-SCALABLE-NEXT: [[ENTRY:.*]]: -; TF-SCALABLE-NEXT: [[TMP0:%.*]] = call i64 @llvm.vscale.i64() -; TF-SCALABLE-NEXT: [[TMP1:%.*]] = shl nuw i64 [[TMP0]], 1 -; TF-SCALABLE-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 1025, [[TMP1]] -; TF-SCALABLE-NEXT: br i1 [[MIN_ITERS_CHECK]], label %[[SCALAR_PH:.*]], label %[[VECTOR_PH:.*]] +; TF-SCALABLE-NEXT: [[ENTRY:.*:]] +; TF-SCALABLE-NEXT: br label %[[VECTOR_PH:.*]] ; TF-SCALABLE: [[VECTOR_PH]]: -; TF-SCALABLE-NEXT: [[TMP2:%.*]] = call i64 @llvm.vscale.i64() -; TF-SCALABLE-NEXT: [[TMP3:%.*]] = mul nuw i64 
[[TMP2]], 2 -; TF-SCALABLE-NEXT: [[N_MOD_VF:%.*]] = urem i64 1025, [[TMP3]] -; TF-SCALABLE-NEXT: [[N_VEC:%.*]] = sub i64 1025, [[N_MOD_VF]] ; TF-SCALABLE-NEXT: br label %[[VECTOR_BODY:.*]] ; TF-SCALABLE: [[VECTOR_BODY]]: -; TF-SCALABLE-NEXT: [[IV:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ] +; TF-SCALABLE-NEXT: [[IV:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_EVL_NEXT:%.*]], %[[VECTOR_BODY]] ] +; TF-SCALABLE-NEXT: [[AVL:%.*]] = phi i64 [ 1025, %[[VECTOR_PH]] ], [ [[AVL_NEXT:%.*]], %[[VECTOR_BODY]] ] +; TF-SCALABLE-NEXT: [[TMP0:%.*]] = call i32 @llvm.experimental.get.vector.length.i64(i64 [[AVL]], i32 2, i1 true) +; TF-SCALABLE-NEXT: [[BROADCAST_SPLATINSERT1:%.*]] = insertelement poison, i32 [[TMP0]], i64 0 +; TF-SCALABLE-NEXT: [[BROADCAST_SPLAT1:%.*]] = shufflevector [[BROADCAST_SPLATINSERT1]], poison, zeroinitializer +; TF-SCALABLE-NEXT: [[TMP1:%.*]] = call @llvm.stepvector.nxv2i32() +; TF-SCALABLE-NEXT: [[TMP2:%.*]] = icmp uge [[TMP1]], [[BROADCAST_SPLAT1]] ; TF-SCALABLE-NEXT: [[V:%.*]] = load i64, ptr [[B]], align 8 ; TF-SCALABLE-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement poison, i64 [[V]], i64 0 ; TF-SCALABLE-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector [[BROADCAST_SPLATINSERT]], poison, zeroinitializer ; TF-SCALABLE-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[IV]] -; TF-SCALABLE-NEXT: store [[BROADCAST_SPLAT]], ptr [[ARRAYIDX]], align 8 -; TF-SCALABLE-NEXT: [[INDEX_NEXT]] = add nuw i64 [[IV]], [[TMP3]] -; TF-SCALABLE-NEXT: [[TMP8:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] -; TF-SCALABLE-NEXT: br i1 [[TMP8]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP3:![0-9]+]] +; TF-SCALABLE-NEXT: call void @llvm.vp.store.nxv2i64.p0( [[BROADCAST_SPLAT]], ptr align 8 [[ARRAYIDX]], splat (i1 true), i32 [[TMP0]]) +; TF-SCALABLE-NEXT: [[TMP5:%.*]] = zext i32 [[TMP0]] to i64 +; TF-SCALABLE-NEXT: [[INDEX_EVL_NEXT]] = add nuw i64 [[TMP5]], [[IV]] +; TF-SCALABLE-NEXT: 
[[AVL_NEXT]] = sub nuw i64 [[AVL]], [[TMP5]] +; TF-SCALABLE-NEXT: [[TMP6:%.*]] = icmp eq i64 [[AVL_NEXT]], 0 +; TF-SCALABLE-NEXT: br i1 [[TMP6]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP3:![0-9]+]] ; TF-SCALABLE: [[MIDDLE_BLOCK]]: -; TF-SCALABLE-NEXT: [[CMP_N:%.*]] = icmp eq i64 1025, [[N_VEC]] -; TF-SCALABLE-NEXT: br i1 [[CMP_N]], label %[[FOR_END:.*]], label %[[SCALAR_PH]] -; TF-SCALABLE: [[SCALAR_PH]]: -; TF-SCALABLE-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], %[[MIDDLE_BLOCK]] ], [ 0, %[[ENTRY]] ] -; TF-SCALABLE-NEXT: br label %[[FOR_BODY:.*]] -; TF-SCALABLE: [[FOR_BODY]]: -; TF-SCALABLE-NEXT: [[IV1:%.*]] = phi i64 [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], %[[FOR_BODY]] ] -; TF-SCALABLE-NEXT: [[V1:%.*]] = load i64, ptr [[B]], align 8 -; TF-SCALABLE-NEXT: [[ARRAYIDX1:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[IV1]] -; TF-SCALABLE-NEXT: store i64 [[V1]], ptr [[ARRAYIDX1]], align 8 -; TF-SCALABLE-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV1]], 1 -; TF-SCALABLE-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV_NEXT]], 1025 -; TF-SCALABLE-NEXT: br i1 [[EXITCOND_NOT]], label %[[FOR_END]], label %[[FOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]] +; TF-SCALABLE-NEXT: [[FIRST_INACTIVE_LANE:%.*]] = call i64 @llvm.experimental.cttz.elts.i64.nxv2i1( [[TMP2]], i1 true) +; TF-SCALABLE-NEXT: [[LAST_ACTIVE_LANE:%.*]] = sub i64 [[FIRST_INACTIVE_LANE]], 1 +; TF-SCALABLE-NEXT: [[TMP7:%.*]] = call i64 @llvm.vscale.i64() +; TF-SCALABLE-NEXT: [[TMP8:%.*]] = mul nuw i64 [[TMP7]], 2 +; TF-SCALABLE-NEXT: [[TMP9:%.*]] = mul i64 [[TMP8]], 0 +; TF-SCALABLE-NEXT: [[TMP12:%.*]] = extractelement [[BROADCAST_SPLAT]], i64 [[LAST_ACTIVE_LANE]] +; TF-SCALABLE-NEXT: br label %[[FOR_END:.*]] ; TF-SCALABLE: [[FOR_END]]: -; TF-SCALABLE-NEXT: [[V_LCSSA:%.*]] = phi i64 [ [[V1]], %[[FOR_BODY]] ], [ [[V]], %[[MIDDLE_BLOCK]] ] -; TF-SCALABLE-NEXT: ret i64 [[V_LCSSA]] +; TF-SCALABLE-NEXT: ret i64 [[TMP12]] ; entry: br label %for.body @@ -269,7 +257,7 @@ 
define void @conditional_uniform_load(ptr noalias nocapture %a, ptr noalias noca ; SCALABLE-NEXT: [[AVL_NEXT]] = sub nuw i64 [[AVL]], [[TMP8]] ; SCALABLE-NEXT: [[VEC_IND_NEXT]] = add [[VEC_IND]], [[DOTSPLAT]] ; SCALABLE-NEXT: [[TMP14:%.*]] = icmp eq i64 [[AVL_NEXT]], 0 -; SCALABLE-NEXT: br i1 [[TMP14]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP5:![0-9]+]] +; SCALABLE-NEXT: br i1 [[TMP14]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]] ; SCALABLE: [[MIDDLE_BLOCK]]: ; SCALABLE-NEXT: br label %[[FOR_END:.*]] ; SCALABLE: [[FOR_END]]: @@ -350,7 +338,7 @@ define void @conditional_uniform_load(ptr noalias nocapture %a, ptr noalias noca ; TF-SCALABLE-NEXT: [[AVL_NEXT]] = sub nuw i64 [[AVL]], [[TMP11]] ; TF-SCALABLE-NEXT: [[VEC_IND_NEXT]] = add [[VEC_IND]], [[DOTSPLAT]] ; TF-SCALABLE-NEXT: [[TMP17:%.*]] = icmp eq i64 [[AVL_NEXT]], 0 -; TF-SCALABLE-NEXT: br i1 [[TMP17]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP5:![0-9]+]] +; TF-SCALABLE-NEXT: br i1 [[TMP17]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]] ; TF-SCALABLE: [[MIDDLE_BLOCK]]: ; TF-SCALABLE-NEXT: br label %[[FOR_END:.*]] ; TF-SCALABLE: [[FOR_END]]: @@ -399,7 +387,7 @@ define void @uniform_load_unaligned(ptr noalias nocapture %a, ptr noalias nocapt ; SCALABLE-NEXT: [[INDEX_EVL_NEXT]] = add nuw i64 [[TMP11]], [[INDEX]] ; SCALABLE-NEXT: [[AVL_NEXT]] = sub nuw i64 [[AVL]], [[TMP11]] ; SCALABLE-NEXT: [[TMP7:%.*]] = icmp eq i64 [[AVL_NEXT]], 0 -; SCALABLE-NEXT: br i1 [[TMP7]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP6:![0-9]+]] +; SCALABLE-NEXT: br i1 [[TMP7]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP5:![0-9]+]] ; SCALABLE: [[MIDDLE_BLOCK]]: ; SCALABLE-NEXT: br label %[[FOR_END:.*]] ; SCALABLE: [[FOR_END]]: @@ -457,7 +445,7 @@ define void @uniform_load_unaligned(ptr noalias nocapture %a, ptr noalias nocapt ; TF-SCALABLE-NEXT: 
[[INDEX_EVL_NEXT]] = add nuw i64 [[TMP8]], [[INDEX]] ; TF-SCALABLE-NEXT: [[AVL_NEXT]] = sub nuw i64 [[AVL]], [[TMP8]] ; TF-SCALABLE-NEXT: [[TMP9:%.*]] = icmp eq i64 [[AVL_NEXT]], 0 -; TF-SCALABLE-NEXT: br i1 [[TMP9]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP6:![0-9]+]] +; TF-SCALABLE-NEXT: br i1 [[TMP9]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP5:![0-9]+]] ; TF-SCALABLE: [[MIDDLE_BLOCK]]: ; TF-SCALABLE-NEXT: br label %[[FOR_END:.*]] ; TF-SCALABLE: [[FOR_END]]: @@ -499,7 +487,7 @@ define void @uniform_store(ptr noalias nocapture %a, ptr noalias nocapture %b, i ; SCALABLE-NEXT: [[INDEX_EVL_NEXT]] = add nuw i64 [[TMP10]], [[INDEX]] ; SCALABLE-NEXT: [[AVL_NEXT]] = sub nuw i64 [[AVL]], [[TMP10]] ; SCALABLE-NEXT: [[TMP6:%.*]] = icmp eq i64 [[AVL_NEXT]], 0 -; SCALABLE-NEXT: br i1 [[TMP6]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP7:![0-9]+]] +; SCALABLE-NEXT: br i1 [[TMP6]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP6:![0-9]+]] ; SCALABLE: [[MIDDLE_BLOCK]]: ; SCALABLE-NEXT: br label %[[FOR_END:.*]] ; SCALABLE: [[FOR_END]]: @@ -557,7 +545,7 @@ define void @uniform_store(ptr noalias nocapture %a, ptr noalias nocapture %b, i ; TF-SCALABLE-NEXT: [[INDEX_EVL_NEXT]] = add nuw i64 [[TMP7]], [[INDEX]] ; TF-SCALABLE-NEXT: [[AVL_NEXT]] = sub nuw i64 [[AVL]], [[TMP7]] ; TF-SCALABLE-NEXT: [[TMP8:%.*]] = icmp eq i64 [[AVL_NEXT]], 0 -; TF-SCALABLE-NEXT: br i1 [[TMP8]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP7:![0-9]+]] +; TF-SCALABLE-NEXT: br i1 [[TMP8]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP6:![0-9]+]] ; TF-SCALABLE: [[MIDDLE_BLOCK]]: ; TF-SCALABLE-NEXT: br label %[[FOR_END:.*]] ; TF-SCALABLE: [[FOR_END]]: @@ -608,7 +596,7 @@ define void @uniform_store_of_loop_varying(ptr noalias nocapture %a, ptr noalias ; SCALABLE-NEXT: [[AVL_NEXT]] = sub nuw i64 [[AVL]], [[TMP8]] ; SCALABLE-NEXT: [[VEC_IND_NEXT]] = add 
[[VEC_IND]], [[DOTSPLAT]] ; SCALABLE-NEXT: [[TMP9:%.*]] = icmp eq i64 [[AVL_NEXT]], 0 -; SCALABLE-NEXT: br i1 [[TMP9]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP8:![0-9]+]] +; SCALABLE-NEXT: br i1 [[TMP9]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP7:![0-9]+]] ; SCALABLE: [[MIDDLE_BLOCK]]: ; SCALABLE-NEXT: br label %[[FOR_END:.*]] ; SCALABLE: [[FOR_END]]: @@ -679,7 +667,7 @@ define void @uniform_store_of_loop_varying(ptr noalias nocapture %a, ptr noalias ; TF-SCALABLE-NEXT: [[AVL_NEXT]] = sub nuw i64 [[AVL]], [[TMP13]] ; TF-SCALABLE-NEXT: [[VEC_IND_NEXT]] = add [[VEC_IND]], [[BROADCAST_SPLAT2]] ; TF-SCALABLE-NEXT: [[TMP12:%.*]] = icmp eq i64 [[AVL_NEXT]], 0 -; TF-SCALABLE-NEXT: br i1 [[TMP12]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP8:![0-9]+]] +; TF-SCALABLE-NEXT: br i1 [[TMP12]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP7:![0-9]+]] ; TF-SCALABLE: [[MIDDLE_BLOCK]]: ; TF-SCALABLE-NEXT: br label %[[FOR_END:.*]] ; TF-SCALABLE: [[FOR_END]]: @@ -731,7 +719,7 @@ define void @conditional_uniform_store(ptr noalias nocapture %a, ptr noalias noc ; SCALABLE-NEXT: [[AVL_NEXT]] = sub nuw i64 [[AVL]], [[TMP14]] ; SCALABLE-NEXT: [[VEC_IND_NEXT]] = add [[VEC_IND]], [[BROADCAST_SPLAT]] ; SCALABLE-NEXT: [[TMP11:%.*]] = icmp eq i64 [[AVL_NEXT]], 0 -; SCALABLE-NEXT: br i1 [[TMP11]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP9:![0-9]+]] +; SCALABLE-NEXT: br i1 [[TMP11]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP8:![0-9]+]] ; SCALABLE: [[MIDDLE_BLOCK]]: ; SCALABLE-NEXT: br label %[[FOR_END:.*]] ; SCALABLE: [[FOR_END]]: @@ -812,7 +800,7 @@ define void @conditional_uniform_store(ptr noalias nocapture %a, ptr noalias noc ; TF-SCALABLE-NEXT: [[AVL_NEXT]] = sub nuw i64 [[AVL]], [[TMP11]] ; TF-SCALABLE-NEXT: [[VEC_IND_NEXT]] = add [[VEC_IND]], [[BROADCAST_SPLAT]] ; TF-SCALABLE-NEXT: [[TMP13:%.*]] = icmp eq i64 [[AVL_NEXT]], 0 
-; TF-SCALABLE-NEXT: br i1 [[TMP13]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP9:![0-9]+]] +; TF-SCALABLE-NEXT: br i1 [[TMP13]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP8:![0-9]+]] ; TF-SCALABLE: [[MIDDLE_BLOCK]]: ; TF-SCALABLE-NEXT: br label %[[FOR_END:.*]] ; TF-SCALABLE: [[FOR_END]]: @@ -860,7 +848,7 @@ define void @uniform_store_unaligned(ptr noalias nocapture %a, ptr noalias nocap ; SCALABLE-NEXT: [[INDEX_EVL_NEXT]] = add nuw i64 [[TMP10]], [[INDEX]] ; SCALABLE-NEXT: [[AVL_NEXT]] = sub nuw i64 [[AVL]], [[TMP10]] ; SCALABLE-NEXT: [[TMP6:%.*]] = icmp eq i64 [[AVL_NEXT]], 0 -; SCALABLE-NEXT: br i1 [[TMP6]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP10:![0-9]+]] +; SCALABLE-NEXT: br i1 [[TMP6]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP9:![0-9]+]] ; SCALABLE: [[MIDDLE_BLOCK]]: ; SCALABLE-NEXT: br label %[[FOR_END:.*]] ; SCALABLE: [[FOR_END]]: @@ -918,7 +906,7 @@ define void @uniform_store_unaligned(ptr noalias nocapture %a, ptr noalias nocap ; TF-SCALABLE-NEXT: [[INDEX_EVL_NEXT]] = add nuw i64 [[TMP7]], [[INDEX]] ; TF-SCALABLE-NEXT: [[AVL_NEXT]] = sub nuw i64 [[AVL]], [[TMP7]] ; TF-SCALABLE-NEXT: [[TMP8:%.*]] = icmp eq i64 [[AVL_NEXT]], 0 -; TF-SCALABLE-NEXT: br i1 [[TMP8]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP10:![0-9]+]] +; TF-SCALABLE-NEXT: br i1 [[TMP8]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP9:![0-9]+]] ; TF-SCALABLE: [[MIDDLE_BLOCK]]: ; TF-SCALABLE-NEXT: br label %[[FOR_END:.*]] ; TF-SCALABLE: [[FOR_END]]: diff --git a/llvm/test/Transforms/LoopVectorize/X86/small-size.ll b/llvm/test/Transforms/LoopVectorize/X86/small-size.ll index e99ffda9e4043..c10dc5ddba2a9 100644 --- a/llvm/test/Transforms/LoopVectorize/X86/small-size.ll +++ b/llvm/test/Transforms/LoopVectorize/X86/small-size.ll @@ -524,22 +524,78 @@ define void @example23c(ptr noalias nocapture %src, ptr noalias nocapture %dst) ; 
induction is used outside the loop. define i64 @example23d(ptr noalias nocapture %src, ptr noalias nocapture %dst) optsize { ; CHECK-LABEL: @example23d( +; CHECK-NEXT: br label [[VECTOR_PH:%.*]] +; CHECK: vector.ph: ; CHECK-NEXT: br label [[TMP1:%.*]] -; CHECK: 1: -; CHECK-NEXT: [[DOT04:%.*]] = phi ptr [ [[SRC:%.*]], [[TMP0:%.*]] ], [ [[TMP2:%.*]], [[TMP1]] ] -; CHECK-NEXT: [[DOT013:%.*]] = phi ptr [ [[DST:%.*]], [[TMP0]] ], [ [[TMP6:%.*]], [[TMP1]] ] -; CHECK-NEXT: [[I_02:%.*]] = phi i64 [ 0, [[TMP0]] ], [ [[TMP7:%.*]], [[TMP1]] ] -; CHECK-NEXT: [[TMP2]] = getelementptr inbounds nuw i8, ptr [[DOT04]], i64 2 +; CHECK: vector.body: +; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[PRED_STORE_CONTINUE14:%.*]] ] +; CHECK-NEXT: [[VEC_IND:%.*]] = phi <4 x i64> [ , [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[PRED_STORE_CONTINUE14]] ] +; CHECK-NEXT: [[OFFSET_IDX:%.*]] = shl i64 [[INDEX]], 1 +; CHECK-NEXT: [[TMP9:%.*]] = getelementptr i8, ptr [[SRC:%.*]], i64 [[OFFSET_IDX]] +; CHECK-NEXT: [[NEXT_GEP1:%.*]] = getelementptr i8, ptr [[TMP9]], i64 2 +; CHECK-NEXT: [[TMP2:%.*]] = getelementptr i8, ptr [[SRC]], i64 [[OFFSET_IDX]] +; CHECK-NEXT: [[NEXT_GEP2:%.*]] = getelementptr i8, ptr [[TMP2]], i64 4 +; CHECK-NEXT: [[TMP10:%.*]] = getelementptr i8, ptr [[SRC]], i64 [[OFFSET_IDX]] +; CHECK-NEXT: [[NEXT_GEP3:%.*]] = getelementptr i8, ptr [[TMP10]], i64 6 +; CHECK-NEXT: [[OFFSET_IDX4:%.*]] = shl i64 [[INDEX]], 2 +; CHECK-NEXT: [[TMP11:%.*]] = getelementptr i8, ptr [[DST:%.*]], i64 [[OFFSET_IDX4]] +; CHECK-NEXT: [[NEXT_GEP6:%.*]] = getelementptr i8, ptr [[TMP11]], i64 4 +; CHECK-NEXT: [[TMP32:%.*]] = getelementptr i8, ptr [[DST]], i64 [[OFFSET_IDX4]] +; CHECK-NEXT: [[NEXT_GEP7:%.*]] = getelementptr i8, ptr [[TMP32]], i64 8 +; CHECK-NEXT: [[TMP6:%.*]] = getelementptr i8, ptr [[DST]], i64 [[OFFSET_IDX4]] +; CHECK-NEXT: [[NEXT_GEP8:%.*]] = getelementptr i8, ptr [[TMP6]], i64 12 +; CHECK-NEXT: [[TMP33:%.*]] = icmp ult <4 x i64> [[VEC_IND]], splat (i64 
257) +; CHECK-NEXT: [[TMP8:%.*]] = extractelement <4 x i1> [[TMP33]], i64 0 +; CHECK-NEXT: br i1 [[TMP8]], label [[PRED_STORE_IF:%.*]], label [[PRED_STORE_CONTINUE:%.*]] +; CHECK: pred.store.if: +; CHECK-NEXT: [[DOT013:%.*]] = getelementptr i8, ptr [[DST]], i64 [[OFFSET_IDX4]] +; CHECK-NEXT: [[DOT04:%.*]] = getelementptr i8, ptr [[SRC]], i64 [[OFFSET_IDX]] ; CHECK-NEXT: [[TMP3:%.*]] = load i16, ptr [[DOT04]], align 2 ; CHECK-NEXT: [[TMP4:%.*]] = zext i16 [[TMP3]] to i32 ; CHECK-NEXT: [[TMP5:%.*]] = shl nuw nsw i32 [[TMP4]], 7 -; CHECK-NEXT: [[TMP6]] = getelementptr inbounds nuw i8, ptr [[DOT013]], i64 4 ; CHECK-NEXT: store i32 [[TMP5]], ptr [[DOT013]], align 4 -; CHECK-NEXT: [[TMP7]] = add nuw nsw i64 [[I_02]], 1 -; CHECK-NEXT: [[EXITCOND:%.*]] = icmp eq i64 [[TMP7]], 257 -; CHECK-NEXT: br i1 [[EXITCOND]], label [[TMP8:%.*]], label [[TMP1]] -; CHECK: 8: -; CHECK-NEXT: ret i64 [[TMP7]] +; CHECK-NEXT: br label [[PRED_STORE_CONTINUE]] +; CHECK: pred.store.continue: +; CHECK-NEXT: [[TMP12:%.*]] = extractelement <4 x i1> [[TMP33]], i64 1 +; CHECK-NEXT: br i1 [[TMP12]], label [[PRED_STORE_IF9:%.*]], label [[PRED_STORE_CONTINUE10:%.*]] +; CHECK: pred.store.if9: +; CHECK-NEXT: [[TMP13:%.*]] = load i16, ptr [[NEXT_GEP1]], align 2 +; CHECK-NEXT: [[TMP14:%.*]] = zext i16 [[TMP13]] to i32 +; CHECK-NEXT: [[TMP15:%.*]] = shl nuw nsw i32 [[TMP14]], 7 +; CHECK-NEXT: store i32 [[TMP15]], ptr [[NEXT_GEP6]], align 4 +; CHECK-NEXT: br label [[PRED_STORE_CONTINUE10]] +; CHECK: pred.store.continue10: +; CHECK-NEXT: [[TMP16:%.*]] = extractelement <4 x i1> [[TMP33]], i64 2 +; CHECK-NEXT: br i1 [[TMP16]], label [[PRED_STORE_IF11:%.*]], label [[PRED_STORE_CONTINUE12:%.*]] +; CHECK: pred.store.if11: +; CHECK-NEXT: [[TMP17:%.*]] = load i16, ptr [[NEXT_GEP2]], align 2 +; CHECK-NEXT: [[TMP18:%.*]] = zext i16 [[TMP17]] to i32 +; CHECK-NEXT: [[TMP19:%.*]] = shl nuw nsw i32 [[TMP18]], 7 +; CHECK-NEXT: store i32 [[TMP19]], ptr [[NEXT_GEP7]], align 4 +; CHECK-NEXT: br label [[PRED_STORE_CONTINUE12]] 
+; CHECK: pred.store.continue12: +; CHECK-NEXT: [[TMP20:%.*]] = extractelement <4 x i1> [[TMP33]], i64 3 +; CHECK-NEXT: br i1 [[TMP20]], label [[PRED_STORE_IF13:%.*]], label [[PRED_STORE_CONTINUE14]] +; CHECK: pred.store.if13: +; CHECK-NEXT: [[TMP21:%.*]] = load i16, ptr [[NEXT_GEP3]], align 2 +; CHECK-NEXT: [[TMP22:%.*]] = zext i16 [[TMP21]] to i32 +; CHECK-NEXT: [[TMP23:%.*]] = shl nuw nsw i32 [[TMP22]], 7 +; CHECK-NEXT: store i32 [[TMP23]], ptr [[NEXT_GEP8]], align 4 +; CHECK-NEXT: br label [[PRED_STORE_CONTINUE14]] +; CHECK: pred.store.continue14: +; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4 +; CHECK-NEXT: [[VEC_IND_NEXT]] = add <4 x i64> [[VEC_IND]], splat (i64 4) +; CHECK-NEXT: [[TMP24:%.*]] = icmp eq i64 [[INDEX_NEXT]], 260 +; CHECK-NEXT: br i1 [[TMP24]], label [[MIDDLE_BLOCK:%.*]], label [[TMP1]], !llvm.loop [[LOOP8:![0-9]+]] +; CHECK: middle.block: +; CHECK-NEXT: br label [[TMP30:%.*]] +; CHECK: 25: +; CHECK-NEXT: [[TMP25:%.*]] = xor <4 x i1> [[TMP33]], splat (i1 true) +; CHECK-NEXT: [[TMP26:%.*]] = call i64 @llvm.experimental.cttz.elts.i64.v4i1(<4 x i1> [[TMP25]], i1 true) +; CHECK-NEXT: [[TMP27:%.*]] = add i64 [[TMP26]], -1 +; CHECK-NEXT: [[TMP28:%.*]] = extractelement <4 x i64> [[VEC_IND]], i64 [[TMP27]] +; CHECK-NEXT: [[TMP29:%.*]] = add nsw i64 [[TMP28]], 1 +; CHECK-NEXT: ret i64 [[TMP29]] ; br label %1 diff --git a/llvm/test/Transforms/LoopVectorize/first-order-recurrence-tail-folding.ll b/llvm/test/Transforms/LoopVectorize/first-order-recurrence-tail-folding.ll index e97d6e66d9d7a..58217069058f8 100644 --- a/llvm/test/Transforms/LoopVectorize/first-order-recurrence-tail-folding.ll +++ b/llvm/test/Transforms/LoopVectorize/first-order-recurrence-tail-folding.ll @@ -6,59 +6,276 @@ define i32 @FOR_used_outside(ptr noalias %A, ptr noalias %B, i64 %n) { ; VF2IC1-LABEL: define i32 @FOR_used_outside( ; VF2IC1-SAME: ptr noalias [[A:%.*]], ptr noalias [[B:%.*]], i64 [[N:%.*]]) { -; VF2IC1-NEXT: [[ENTRY:.*]]: -; VF2IC1-NEXT: br label %[[LOOP:.*]] 
-; VF2IC1: [[LOOP]]: -; VF2IC1-NEXT: [[TMP1:%.*]] = phi i64 [ 0, %[[ENTRY]] ], [ [[IV_NEXT:%.*]], %[[LOOP]] ] -; VF2IC1-NEXT: [[FOR:%.*]] = phi i32 [ 33, %[[ENTRY]] ], [ [[TMP10:%.*]], %[[LOOP]] ] +; VF2IC1-NEXT: [[ENTRY:.*:]] +; VF2IC1-NEXT: br label %[[VECTOR_PH:.*]] +; VF2IC1: [[VECTOR_PH]]: +; VF2IC1-NEXT: [[N_RND_UP:%.*]] = add i64 [[N]], 1 +; VF2IC1-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N_RND_UP]], 2 +; VF2IC1-NEXT: [[N_VEC:%.*]] = sub i64 [[N_RND_UP]], [[N_MOD_VF]] +; VF2IC1-NEXT: [[TRIP_COUNT_MINUS_1:%.*]] = sub i64 [[N]], 1 +; VF2IC1-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <2 x i64> poison, i64 [[TRIP_COUNT_MINUS_1]], i64 0 +; VF2IC1-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <2 x i64> [[BROADCAST_SPLATINSERT]], <2 x i64> poison, <2 x i32> zeroinitializer +; VF2IC1-NEXT: br label %[[VECTOR_BODY:.*]] +; VF2IC1: [[VECTOR_BODY]]: +; VF2IC1-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[PRED_STORE_CONTINUE4:.*]] ] +; VF2IC1-NEXT: [[VEC_IND:%.*]] = phi <2 x i64> [ , %[[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], %[[PRED_STORE_CONTINUE4]] ] +; VF2IC1-NEXT: [[VECTOR_RECUR:%.*]] = phi <2 x i32> [ , %[[VECTOR_PH]] ], [ [[TMP12:%.*]], %[[PRED_STORE_CONTINUE4]] ] +; VF2IC1-NEXT: [[TMP1:%.*]] = add i64 [[INDEX]], 0 +; VF2IC1-NEXT: [[TMP4:%.*]] = add i64 [[INDEX]], 1 +; VF2IC1-NEXT: [[TMP2:%.*]] = icmp ule <2 x i64> [[VEC_IND]], [[BROADCAST_SPLAT]] +; VF2IC1-NEXT: [[TMP3:%.*]] = extractelement <2 x i1> [[TMP2]], i32 0 +; VF2IC1-NEXT: br i1 [[TMP3]], label %[[PRED_LOAD_IF:.*]], label %[[PRED_LOAD_CONTINUE:.*]] +; VF2IC1: [[PRED_LOAD_IF]]: ; VF2IC1-NEXT: [[TMP9:%.*]] = getelementptr inbounds nuw i32, ptr [[A]], i64 [[TMP1]] -; VF2IC1-NEXT: [[TMP10]] = load i32, ptr [[TMP9]], align 4 -; VF2IC1-NEXT: [[TMP23:%.*]] = add nsw i32 [[FOR]], [[TMP10]] -; VF2IC1-NEXT: [[TMP20:%.*]] = getelementptr inbounds nuw i32, ptr [[B]], i64 [[TMP1]] +; VF2IC1-NEXT: [[TMP10:%.*]] = load i32, ptr [[TMP9]], align 4 +; VF2IC1-NEXT: [[TMP6:%.*]] = 
insertelement <2 x i32> poison, i32 [[TMP10]], i32 0 +; VF2IC1-NEXT: br label %[[PRED_LOAD_CONTINUE]] +; VF2IC1: [[PRED_LOAD_CONTINUE]]: +; VF2IC1-NEXT: [[TMP7:%.*]] = phi <2 x i32> [ poison, %[[VECTOR_BODY]] ], [ [[TMP6]], %[[PRED_LOAD_IF]] ] +; VF2IC1-NEXT: [[TMP8:%.*]] = extractelement <2 x i1> [[TMP2]], i32 1 +; VF2IC1-NEXT: br i1 [[TMP8]], label %[[PRED_LOAD_IF1:.*]], label %[[PRED_LOAD_CONTINUE2:.*]] +; VF2IC1: [[PRED_LOAD_IF1]]: +; VF2IC1-NEXT: [[TMP33:%.*]] = getelementptr inbounds nuw i32, ptr [[A]], i64 [[TMP4]] +; VF2IC1-NEXT: [[TMP34:%.*]] = load i32, ptr [[TMP33]], align 4 +; VF2IC1-NEXT: [[TMP11:%.*]] = insertelement <2 x i32> [[TMP7]], i32 [[TMP34]], i32 1 +; VF2IC1-NEXT: br label %[[PRED_LOAD_CONTINUE2]] +; VF2IC1: [[PRED_LOAD_CONTINUE2]]: +; VF2IC1-NEXT: [[TMP12]] = phi <2 x i32> [ [[TMP7]], %[[PRED_LOAD_CONTINUE]] ], [ [[TMP11]], %[[PRED_LOAD_IF1]] ] +; VF2IC1-NEXT: [[TMP13:%.*]] = shufflevector <2 x i32> [[VECTOR_RECUR]], <2 x i32> [[TMP12]], <2 x i32> +; VF2IC1-NEXT: [[TMP14:%.*]] = extractelement <2 x i1> [[TMP2]], i32 0 +; VF2IC1-NEXT: br i1 [[TMP14]], label %[[PRED_STORE_IF:.*]], label %[[PRED_STORE_CONTINUE:.*]] +; VF2IC1: [[PRED_STORE_IF]]: +; VF2IC1-NEXT: [[TMP15:%.*]] = getelementptr inbounds nuw i32, ptr [[B]], i64 [[TMP1]] +; VF2IC1-NEXT: [[TMP16:%.*]] = extractelement <2 x i32> [[TMP13]], i32 0 +; VF2IC1-NEXT: [[TMP17:%.*]] = extractelement <2 x i32> [[TMP12]], i32 0 +; VF2IC1-NEXT: [[TMP18:%.*]] = add nsw i32 [[TMP16]], [[TMP17]] +; VF2IC1-NEXT: store i32 [[TMP18]], ptr [[TMP15]], align 4 +; VF2IC1-NEXT: br label %[[PRED_STORE_CONTINUE]] +; VF2IC1: [[PRED_STORE_CONTINUE]]: +; VF2IC1-NEXT: [[TMP19:%.*]] = extractelement <2 x i1> [[TMP2]], i32 1 +; VF2IC1-NEXT: br i1 [[TMP19]], label %[[PRED_STORE_IF3:.*]], label %[[PRED_STORE_CONTINUE4]] +; VF2IC1: [[PRED_STORE_IF3]]: +; VF2IC1-NEXT: [[TMP20:%.*]] = getelementptr inbounds nuw i32, ptr [[B]], i64 [[TMP4]] +; VF2IC1-NEXT: [[TMP21:%.*]] = extractelement <2 x i32> [[TMP13]], i32 1 +; 
VF2IC1-NEXT: [[TMP22:%.*]] = extractelement <2 x i32> [[TMP12]], i32 1 +; VF2IC1-NEXT: [[TMP23:%.*]] = add nsw i32 [[TMP21]], [[TMP22]] ; VF2IC1-NEXT: store i32 [[TMP23]], ptr [[TMP20]], align 4 -; VF2IC1-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[TMP1]], 1 -; VF2IC1-NEXT: [[EC:%.*]] = icmp eq i64 [[IV_NEXT]], [[N]] -; VF2IC1-NEXT: br i1 [[EC]], label %[[FOR_END:.*]], label %[[LOOP]] +; VF2IC1-NEXT: br label %[[PRED_STORE_CONTINUE4]] +; VF2IC1: [[PRED_STORE_CONTINUE4]]: +; VF2IC1-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], 2 +; VF2IC1-NEXT: [[VEC_IND_NEXT]] = add <2 x i64> [[VEC_IND]], splat (i64 2) +; VF2IC1-NEXT: [[TMP24:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] +; VF2IC1-NEXT: br i1 [[TMP24]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]] +; VF2IC1: [[MIDDLE_BLOCK]]: +; VF2IC1-NEXT: [[TMP25:%.*]] = xor <2 x i1> [[TMP2]], splat (i1 true) +; VF2IC1-NEXT: [[TMP26:%.*]] = call i64 @llvm.experimental.cttz.elts.i64.v2i1(<2 x i1> [[TMP25]], i1 true) +; VF2IC1-NEXT: [[TMP27:%.*]] = sub i64 [[TMP26]], 1 +; VF2IC1-NEXT: [[TMP28:%.*]] = sub i64 [[TMP27]], 1 +; VF2IC1-NEXT: [[TMP29:%.*]] = extractelement <2 x i32> [[TMP12]], i64 [[TMP28]] +; VF2IC1-NEXT: [[TMP30:%.*]] = extractelement <2 x i32> [[VECTOR_RECUR]], i32 1 +; VF2IC1-NEXT: [[TMP31:%.*]] = icmp eq i64 [[TMP27]], 0 +; VF2IC1-NEXT: [[TMP32:%.*]] = select i1 [[TMP31]], i32 [[TMP30]], i32 [[TMP29]] +; VF2IC1-NEXT: br label %[[FOR_END:.*]] ; VF2IC1: [[FOR_END]]: -; VF2IC1-NEXT: [[TMP32:%.*]] = phi i32 [ [[FOR]], %[[LOOP]] ] ; VF2IC1-NEXT: ret i32 [[TMP32]] ; ; VF2IC2-LABEL: define i32 @FOR_used_outside( ; VF2IC2-SAME: ptr noalias [[A:%.*]], ptr noalias [[B:%.*]], i64 [[N:%.*]]) { -; VF2IC2-NEXT: [[ENTRY:.*]]: -; VF2IC2-NEXT: br label %[[LOOP:.*]] -; VF2IC2: [[LOOP]]: -; VF2IC2-NEXT: [[TMP3:%.*]] = phi i64 [ 0, %[[ENTRY]] ], [ [[IV_NEXT:%.*]], %[[LOOP]] ] -; VF2IC2-NEXT: [[FOR:%.*]] = phi i32 [ 33, %[[ENTRY]] ], [ [[TMP23:%.*]], %[[LOOP]] ] +; VF2IC2-NEXT: [[ENTRY:.*:]] +; VF2IC2-NEXT: br 
label %[[VECTOR_PH:.*]] +; VF2IC2: [[VECTOR_PH]]: +; VF2IC2-NEXT: [[N_RND_UP:%.*]] = add i64 [[N]], 3 +; VF2IC2-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N_RND_UP]], 4 +; VF2IC2-NEXT: [[N_VEC:%.*]] = sub i64 [[N_RND_UP]], [[N_MOD_VF]] +; VF2IC2-NEXT: [[TRIP_COUNT_MINUS_1:%.*]] = sub i64 [[N]], 1 +; VF2IC2-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <2 x i64> poison, i64 [[TRIP_COUNT_MINUS_1]], i64 0 +; VF2IC2-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <2 x i64> [[BROADCAST_SPLATINSERT]], <2 x i64> poison, <2 x i32> zeroinitializer +; VF2IC2-NEXT: br label %[[VECTOR_BODY:.*]] +; VF2IC2: [[VECTOR_BODY]]: +; VF2IC2-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[PRED_STORE_CONTINUE12:.*]] ] +; VF2IC2-NEXT: [[VEC_IND:%.*]] = phi <2 x i64> [ , %[[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], %[[PRED_STORE_CONTINUE12]] ] +; VF2IC2-NEXT: [[VECTOR_RECUR:%.*]] = phi <2 x i32> [ , %[[VECTOR_PH]] ], [ [[TMP25:%.*]], %[[PRED_STORE_CONTINUE12]] ] +; VF2IC2-NEXT: [[STEP_ADD:%.*]] = add <2 x i64> [[VEC_IND]], splat (i64 2) +; VF2IC2-NEXT: [[TMP3:%.*]] = add i64 [[INDEX]], 0 +; VF2IC2-NEXT: [[TMP1:%.*]] = add i64 [[INDEX]], 1 +; VF2IC2-NEXT: [[TMP2:%.*]] = add i64 [[INDEX]], 2 +; VF2IC2-NEXT: [[TMP7:%.*]] = add i64 [[INDEX]], 3 +; VF2IC2-NEXT: [[TMP4:%.*]] = icmp ule <2 x i64> [[VEC_IND]], [[BROADCAST_SPLAT]] +; VF2IC2-NEXT: [[TMP5:%.*]] = icmp ule <2 x i64> [[STEP_ADD]], [[BROADCAST_SPLAT]] +; VF2IC2-NEXT: [[TMP6:%.*]] = extractelement <2 x i1> [[TMP4]], i32 0 +; VF2IC2-NEXT: br i1 [[TMP6]], label %[[PRED_LOAD_IF:.*]], label %[[PRED_LOAD_CONTINUE:.*]] +; VF2IC2: [[PRED_LOAD_IF]]: ; VF2IC2-NEXT: [[TMP22:%.*]] = getelementptr inbounds nuw i32, ptr [[A]], i64 [[TMP3]] -; VF2IC2-NEXT: [[TMP23]] = load i32, ptr [[TMP22]], align 4 -; VF2IC2-NEXT: [[TMP47:%.*]] = add nsw i32 [[FOR]], [[TMP23]] -; VF2IC2-NEXT: [[TMP44:%.*]] = getelementptr inbounds nuw i32, ptr [[B]], i64 [[TMP3]] +; VF2IC2-NEXT: [[TMP23:%.*]] = load i32, ptr [[TMP22]], align 4 +; VF2IC2-NEXT: 
[[TMP9:%.*]] = insertelement <2 x i32> poison, i32 [[TMP23]], i32 0 +; VF2IC2-NEXT: br label %[[PRED_LOAD_CONTINUE]] +; VF2IC2: [[PRED_LOAD_CONTINUE]]: +; VF2IC2-NEXT: [[TMP10:%.*]] = phi <2 x i32> [ poison, %[[VECTOR_BODY]] ], [ [[TMP9]], %[[PRED_LOAD_IF]] ] +; VF2IC2-NEXT: [[TMP11:%.*]] = extractelement <2 x i1> [[TMP4]], i32 1 +; VF2IC2-NEXT: br i1 [[TMP11]], label %[[PRED_LOAD_IF1:.*]], label %[[PRED_LOAD_CONTINUE2:.*]] +; VF2IC2: [[PRED_LOAD_IF1]]: +; VF2IC2-NEXT: [[TMP12:%.*]] = getelementptr inbounds nuw i32, ptr [[A]], i64 [[TMP1]] +; VF2IC2-NEXT: [[TMP13:%.*]] = load i32, ptr [[TMP12]], align 4 +; VF2IC2-NEXT: [[TMP14:%.*]] = insertelement <2 x i32> [[TMP10]], i32 [[TMP13]], i32 1 +; VF2IC2-NEXT: br label %[[PRED_LOAD_CONTINUE2]] +; VF2IC2: [[PRED_LOAD_CONTINUE2]]: +; VF2IC2-NEXT: [[TMP15:%.*]] = phi <2 x i32> [ [[TMP10]], %[[PRED_LOAD_CONTINUE]] ], [ [[TMP14]], %[[PRED_LOAD_IF1]] ] +; VF2IC2-NEXT: [[TMP16:%.*]] = extractelement <2 x i1> [[TMP5]], i32 0 +; VF2IC2-NEXT: br i1 [[TMP16]], label %[[PRED_LOAD_IF3:.*]], label %[[PRED_LOAD_CONTINUE4:.*]] +; VF2IC2: [[PRED_LOAD_IF3]]: +; VF2IC2-NEXT: [[TMP17:%.*]] = getelementptr inbounds nuw i32, ptr [[A]], i64 [[TMP2]] +; VF2IC2-NEXT: [[TMP18:%.*]] = load i32, ptr [[TMP17]], align 4 +; VF2IC2-NEXT: [[TMP19:%.*]] = insertelement <2 x i32> poison, i32 [[TMP18]], i32 0 +; VF2IC2-NEXT: br label %[[PRED_LOAD_CONTINUE4]] +; VF2IC2: [[PRED_LOAD_CONTINUE4]]: +; VF2IC2-NEXT: [[TMP20:%.*]] = phi <2 x i32> [ poison, %[[PRED_LOAD_CONTINUE2]] ], [ [[TMP19]], %[[PRED_LOAD_IF3]] ] +; VF2IC2-NEXT: [[TMP21:%.*]] = extractelement <2 x i1> [[TMP5]], i32 1 +; VF2IC2-NEXT: br i1 [[TMP21]], label %[[PRED_LOAD_IF5:.*]], label %[[PRED_LOAD_CONTINUE6:.*]] +; VF2IC2: [[PRED_LOAD_IF5]]: +; VF2IC2-NEXT: [[TMP34:%.*]] = getelementptr inbounds nuw i32, ptr [[A]], i64 [[TMP7]] +; VF2IC2-NEXT: [[TMP37:%.*]] = load i32, ptr [[TMP34]], align 4 +; VF2IC2-NEXT: [[TMP24:%.*]] = insertelement <2 x i32> [[TMP20]], i32 [[TMP37]], i32 1 +; VF2IC2-NEXT: 
br label %[[PRED_LOAD_CONTINUE6]] +; VF2IC2: [[PRED_LOAD_CONTINUE6]]: +; VF2IC2-NEXT: [[TMP25]] = phi <2 x i32> [ [[TMP20]], %[[PRED_LOAD_CONTINUE4]] ], [ [[TMP24]], %[[PRED_LOAD_IF5]] ] +; VF2IC2-NEXT: [[TMP26:%.*]] = shufflevector <2 x i32> [[VECTOR_RECUR]], <2 x i32> [[TMP15]], <2 x i32> +; VF2IC2-NEXT: [[TMP27:%.*]] = shufflevector <2 x i32> [[TMP15]], <2 x i32> [[TMP25]], <2 x i32> +; VF2IC2-NEXT: [[TMP28:%.*]] = extractelement <2 x i1> [[TMP4]], i32 0 +; VF2IC2-NEXT: br i1 [[TMP28]], label %[[PRED_STORE_IF:.*]], label %[[PRED_STORE_CONTINUE:.*]] +; VF2IC2: [[PRED_STORE_IF]]: +; VF2IC2-NEXT: [[TMP29:%.*]] = getelementptr inbounds nuw i32, ptr [[B]], i64 [[TMP3]] +; VF2IC2-NEXT: [[TMP30:%.*]] = extractelement <2 x i32> [[TMP26]], i32 0 +; VF2IC2-NEXT: [[TMP31:%.*]] = extractelement <2 x i32> [[TMP15]], i32 0 +; VF2IC2-NEXT: [[TMP32:%.*]] = add nsw i32 [[TMP30]], [[TMP31]] +; VF2IC2-NEXT: store i32 [[TMP32]], ptr [[TMP29]], align 4 +; VF2IC2-NEXT: br label %[[PRED_STORE_CONTINUE]] +; VF2IC2: [[PRED_STORE_CONTINUE]]: +; VF2IC2-NEXT: [[TMP33:%.*]] = extractelement <2 x i1> [[TMP4]], i32 1 +; VF2IC2-NEXT: br i1 [[TMP33]], label %[[PRED_STORE_IF7:.*]], label %[[PRED_STORE_CONTINUE8:.*]] +; VF2IC2: [[PRED_STORE_IF7]]: +; VF2IC2-NEXT: [[TMP44:%.*]] = getelementptr inbounds nuw i32, ptr [[B]], i64 [[TMP1]] +; VF2IC2-NEXT: [[TMP35:%.*]] = extractelement <2 x i32> [[TMP26]], i32 1 +; VF2IC2-NEXT: [[TMP36:%.*]] = extractelement <2 x i32> [[TMP15]], i32 1 +; VF2IC2-NEXT: [[TMP47:%.*]] = add nsw i32 [[TMP35]], [[TMP36]] ; VF2IC2-NEXT: store i32 [[TMP47]], ptr [[TMP44]], align 4 -; VF2IC2-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[TMP3]], 1 -; VF2IC2-NEXT: [[EC:%.*]] = icmp eq i64 [[IV_NEXT]], [[N]] -; VF2IC2-NEXT: br i1 [[EC]], label %[[FOR_END:.*]], label %[[LOOP]] +; VF2IC2-NEXT: br label %[[PRED_STORE_CONTINUE8]] +; VF2IC2: [[PRED_STORE_CONTINUE8]]: +; VF2IC2-NEXT: [[TMP38:%.*]] = extractelement <2 x i1> [[TMP5]], i32 0 +; VF2IC2-NEXT: br i1 [[TMP38]], label 
%[[PRED_STORE_IF9:.*]], label %[[PRED_STORE_CONTINUE10:.*]] +; VF2IC2: [[PRED_STORE_IF9]]: +; VF2IC2-NEXT: [[TMP39:%.*]] = getelementptr inbounds nuw i32, ptr [[B]], i64 [[TMP2]] +; VF2IC2-NEXT: [[TMP40:%.*]] = extractelement <2 x i32> [[TMP27]], i32 0 +; VF2IC2-NEXT: [[TMP41:%.*]] = extractelement <2 x i32> [[TMP25]], i32 0 +; VF2IC2-NEXT: [[TMP42:%.*]] = add nsw i32 [[TMP40]], [[TMP41]] +; VF2IC2-NEXT: store i32 [[TMP42]], ptr [[TMP39]], align 4 +; VF2IC2-NEXT: br label %[[PRED_STORE_CONTINUE10]] +; VF2IC2: [[PRED_STORE_CONTINUE10]]: +; VF2IC2-NEXT: [[TMP43:%.*]] = extractelement <2 x i1> [[TMP5]], i32 1 +; VF2IC2-NEXT: br i1 [[TMP43]], label %[[PRED_STORE_IF11:.*]], label %[[PRED_STORE_CONTINUE12]] +; VF2IC2: [[PRED_STORE_IF11]]: +; VF2IC2-NEXT: [[TMP67:%.*]] = getelementptr inbounds nuw i32, ptr [[B]], i64 [[TMP7]] +; VF2IC2-NEXT: [[TMP45:%.*]] = extractelement <2 x i32> [[TMP27]], i32 1 +; VF2IC2-NEXT: [[TMP46:%.*]] = extractelement <2 x i32> [[TMP25]], i32 1 +; VF2IC2-NEXT: [[TMP68:%.*]] = add nsw i32 [[TMP45]], [[TMP46]] +; VF2IC2-NEXT: store i32 [[TMP68]], ptr [[TMP67]], align 4 +; VF2IC2-NEXT: br label %[[PRED_STORE_CONTINUE12]] +; VF2IC2: [[PRED_STORE_CONTINUE12]]: +; VF2IC2-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], 4 +; VF2IC2-NEXT: [[VEC_IND_NEXT]] = add <2 x i64> [[STEP_ADD]], splat (i64 2) +; VF2IC2-NEXT: [[TMP48:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] +; VF2IC2-NEXT: br i1 [[TMP48]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]] +; VF2IC2: [[MIDDLE_BLOCK]]: +; VF2IC2-NEXT: [[TMP49:%.*]] = xor <2 x i1> [[TMP4]], splat (i1 true) +; VF2IC2-NEXT: [[TMP50:%.*]] = xor <2 x i1> [[TMP5]], splat (i1 true) +; VF2IC2-NEXT: [[TMP51:%.*]] = call i64 @llvm.experimental.cttz.elts.i64.v2i1(<2 x i1> [[TMP50]], i1 true) +; VF2IC2-NEXT: [[TMP52:%.*]] = add i64 2, [[TMP51]] +; VF2IC2-NEXT: [[TMP53:%.*]] = call i64 @llvm.experimental.cttz.elts.i64.v2i1(<2 x i1> [[TMP49]], i1 true) +; VF2IC2-NEXT: [[TMP54:%.*]] = add i64 0, 
[[TMP53]] +; VF2IC2-NEXT: [[TMP55:%.*]] = icmp ne i64 [[TMP53]], 2 +; VF2IC2-NEXT: [[TMP56:%.*]] = select i1 [[TMP55]], i64 [[TMP54]], i64 [[TMP52]] +; VF2IC2-NEXT: [[TMP57:%.*]] = sub i64 [[TMP56]], 1 +; VF2IC2-NEXT: [[TMP58:%.*]] = sub i64 [[TMP57]], 1 +; VF2IC2-NEXT: [[TMP59:%.*]] = extractelement <2 x i32> [[TMP15]], i64 [[TMP58]] +; VF2IC2-NEXT: [[TMP60:%.*]] = sub i64 [[TMP58]], 2 +; VF2IC2-NEXT: [[TMP61:%.*]] = extractelement <2 x i32> [[TMP25]], i64 [[TMP60]] +; VF2IC2-NEXT: [[TMP62:%.*]] = icmp uge i64 [[TMP58]], 2 +; VF2IC2-NEXT: [[TMP63:%.*]] = select i1 [[TMP62]], i32 [[TMP61]], i32 [[TMP59]] +; VF2IC2-NEXT: [[TMP64:%.*]] = extractelement <2 x i32> [[VECTOR_RECUR]], i32 1 +; VF2IC2-NEXT: [[TMP65:%.*]] = icmp eq i64 [[TMP57]], 0 +; VF2IC2-NEXT: [[TMP66:%.*]] = select i1 [[TMP65]], i32 [[TMP64]], i32 [[TMP63]] +; VF2IC2-NEXT: br label %[[FOR_END:.*]] ; VF2IC2: [[FOR_END]]: -; VF2IC2-NEXT: [[TMP66:%.*]] = phi i32 [ [[FOR]], %[[LOOP]] ] ; VF2IC2-NEXT: ret i32 [[TMP66]] ; ; VF1IC2-LABEL: define i32 @FOR_used_outside( ; VF1IC2-SAME: ptr noalias [[A:%.*]], ptr noalias [[B:%.*]], i64 [[N:%.*]]) { -; VF1IC2-NEXT: [[ENTRY:.*]]: -; VF1IC2-NEXT: br label %[[LOOP:.*]] -; VF1IC2: [[LOOP]]: -; VF1IC2-NEXT: [[TMP0:%.*]] = phi i64 [ 0, %[[ENTRY]] ], [ [[IV_NEXT:%.*]], %[[LOOP]] ] -; VF1IC2-NEXT: [[FOR:%.*]] = phi i32 [ 33, %[[ENTRY]] ], [ [[TMP7:%.*]], %[[LOOP]] ] +; VF1IC2-NEXT: [[ENTRY:.*:]] +; VF1IC2-NEXT: br label %[[VECTOR_PH:.*]] +; VF1IC2: [[VECTOR_PH]]: +; VF1IC2-NEXT: [[N_RND_UP:%.*]] = add i64 [[N]], 1 +; VF1IC2-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N_RND_UP]], 2 +; VF1IC2-NEXT: [[N_VEC:%.*]] = sub i64 [[N_RND_UP]], [[N_MOD_VF]] +; VF1IC2-NEXT: [[TRIP_COUNT_MINUS_1:%.*]] = sub i64 [[N]], 1 +; VF1IC2-NEXT: br label %[[VECTOR_BODY:.*]] +; VF1IC2: [[VECTOR_BODY]]: +; VF1IC2-NEXT: [[TMP0:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[PRED_STORE_CONTINUE5:.*]] ] +; VF1IC2-NEXT: [[VECTOR_RECUR:%.*]] = phi i32 [ 33, %[[VECTOR_PH]] ], [ [[TMP8:%.*]], 
%[[PRED_STORE_CONTINUE5]] ] +; VF1IC2-NEXT: [[TMP3:%.*]] = add i64 [[TMP0]], 1 +; VF1IC2-NEXT: [[VEC_IV:%.*]] = add i64 [[TMP0]], 0 +; VF1IC2-NEXT: [[VEC_IV1:%.*]] = add i64 [[TMP0]], 1 +; VF1IC2-NEXT: [[TMP1:%.*]] = icmp ule i64 [[VEC_IV]], [[TRIP_COUNT_MINUS_1]] +; VF1IC2-NEXT: [[TMP2:%.*]] = icmp ule i64 [[VEC_IV1]], [[TRIP_COUNT_MINUS_1]] +; VF1IC2-NEXT: br i1 [[TMP1]], label %[[PRED_LOAD_IF:.*]], label %[[PRED_LOAD_CONTINUE:.*]] +; VF1IC2: [[PRED_LOAD_IF]]: ; VF1IC2-NEXT: [[TMP6:%.*]] = getelementptr inbounds nuw i32, ptr [[A]], i64 [[TMP0]] -; VF1IC2-NEXT: [[TMP7]] = load i32, ptr [[TMP6]], align 4 -; VF1IC2-NEXT: [[TMP12:%.*]] = add nsw i32 [[FOR]], [[TMP7]] -; VF1IC2-NEXT: [[TMP11:%.*]] = getelementptr inbounds nuw i32, ptr [[B]], i64 [[TMP0]] +; VF1IC2-NEXT: [[TMP7:%.*]] = load i32, ptr [[TMP6]], align 4 +; VF1IC2-NEXT: br label %[[PRED_LOAD_CONTINUE]] +; VF1IC2: [[PRED_LOAD_CONTINUE]]: +; VF1IC2-NEXT: [[TMP5:%.*]] = phi i32 [ poison, %[[VECTOR_BODY]] ], [ [[TMP7]], %[[PRED_LOAD_IF]] ] +; VF1IC2-NEXT: br i1 [[TMP2]], label %[[PRED_LOAD_IF2:.*]], label %[[PRED_LOAD_CONTINUE3:.*]] +; VF1IC2: [[PRED_LOAD_IF2]]: +; VF1IC2-NEXT: [[TMP31:%.*]] = getelementptr inbounds nuw i32, ptr [[A]], i64 [[TMP3]] +; VF1IC2-NEXT: [[TMP32:%.*]] = load i32, ptr [[TMP31]], align 4 +; VF1IC2-NEXT: br label %[[PRED_LOAD_CONTINUE3]] +; VF1IC2: [[PRED_LOAD_CONTINUE3]]: +; VF1IC2-NEXT: [[TMP8]] = phi i32 [ poison, %[[PRED_LOAD_CONTINUE]] ], [ [[TMP32]], %[[PRED_LOAD_IF2]] ] +; VF1IC2-NEXT: br i1 [[TMP1]], label %[[PRED_STORE_IF:.*]], label %[[PRED_STORE_CONTINUE:.*]] +; VF1IC2: [[PRED_STORE_IF]]: +; VF1IC2-NEXT: [[TMP9:%.*]] = getelementptr inbounds nuw i32, ptr [[B]], i64 [[TMP0]] +; VF1IC2-NEXT: [[TMP10:%.*]] = add nsw i32 [[VECTOR_RECUR]], [[TMP5]] +; VF1IC2-NEXT: store i32 [[TMP10]], ptr [[TMP9]], align 4 +; VF1IC2-NEXT: br label %[[PRED_STORE_CONTINUE]] +; VF1IC2: [[PRED_STORE_CONTINUE]]: +; VF1IC2-NEXT: br i1 [[TMP2]], label %[[PRED_STORE_IF4:.*]], label 
%[[PRED_STORE_CONTINUE5]] +; VF1IC2: [[PRED_STORE_IF4]]: +; VF1IC2-NEXT: [[TMP11:%.*]] = getelementptr inbounds nuw i32, ptr [[B]], i64 [[TMP3]] +; VF1IC2-NEXT: [[TMP12:%.*]] = add nsw i32 [[TMP5]], [[TMP8]] ; VF1IC2-NEXT: store i32 [[TMP12]], ptr [[TMP11]], align 4 -; VF1IC2-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[TMP0]], 1 -; VF1IC2-NEXT: [[EC:%.*]] = icmp eq i64 [[IV_NEXT]], [[N]] -; VF1IC2-NEXT: br i1 [[EC]], label %[[FOR_END:.*]], label %[[LOOP]] +; VF1IC2-NEXT: br label %[[PRED_STORE_CONTINUE5]] +; VF1IC2: [[PRED_STORE_CONTINUE5]]: +; VF1IC2-NEXT: [[INDEX_NEXT]] = add i64 [[TMP0]], 2 +; VF1IC2-NEXT: [[TMP13:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] +; VF1IC2-NEXT: br i1 [[TMP13]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]] +; VF1IC2: [[MIDDLE_BLOCK]]: +; VF1IC2-NEXT: [[TMP14:%.*]] = xor i1 [[TMP1]], true +; VF1IC2-NEXT: [[TMP15:%.*]] = xor i1 [[TMP2]], true +; VF1IC2-NEXT: [[TMP16:%.*]] = icmp eq i1 [[TMP15]], false +; VF1IC2-NEXT: [[TMP17:%.*]] = zext i1 [[TMP16]] to i64 +; VF1IC2-NEXT: [[TMP18:%.*]] = add i64 1, [[TMP17]] +; VF1IC2-NEXT: [[TMP19:%.*]] = icmp eq i1 [[TMP14]], false +; VF1IC2-NEXT: [[TMP20:%.*]] = zext i1 [[TMP19]] to i64 +; VF1IC2-NEXT: [[TMP21:%.*]] = add i64 0, [[TMP20]] +; VF1IC2-NEXT: [[TMP22:%.*]] = icmp ne i64 [[TMP20]], 1 +; VF1IC2-NEXT: [[TMP23:%.*]] = select i1 [[TMP22]], i64 [[TMP21]], i64 [[TMP18]] +; VF1IC2-NEXT: [[TMP24:%.*]] = sub i64 [[TMP23]], 1 +; VF1IC2-NEXT: [[TMP25:%.*]] = sub i64 [[TMP24]], 1 +; VF1IC2-NEXT: [[TMP26:%.*]] = sub i64 [[TMP25]], 1 +; VF1IC2-NEXT: [[TMP27:%.*]] = icmp uge i64 [[TMP25]], 1 +; VF1IC2-NEXT: [[TMP28:%.*]] = select i1 [[TMP27]], i32 [[TMP8]], i32 [[TMP5]] +; VF1IC2-NEXT: [[TMP29:%.*]] = icmp eq i64 [[TMP24]], 0 +; VF1IC2-NEXT: [[TMP30:%.*]] = select i1 [[TMP29]], i32 [[VECTOR_RECUR]], i32 [[TMP28]] +; VF1IC2-NEXT: br label %[[FOR_END:.*]] ; VF1IC2: [[FOR_END]]: -; VF1IC2-NEXT: [[TMP30:%.*]] = phi i32 [ [[FOR]], %[[LOOP]] ] ; VF1IC2-NEXT: ret i32 [[TMP30]] ; 
entry: @@ -83,59 +300,265 @@ for.end: define i32 @FOR_next_used_outside(ptr noalias %A, ptr noalias %B, i64 %n) { ; VF2IC1-LABEL: define i32 @FOR_next_used_outside( ; VF2IC1-SAME: ptr noalias [[A:%.*]], ptr noalias [[B:%.*]], i64 [[N:%.*]]) { -; VF2IC1-NEXT: [[ENTRY:.*]]: -; VF2IC1-NEXT: br label %[[LOOP:.*]] -; VF2IC1: [[LOOP]]: -; VF2IC1-NEXT: [[TMP1:%.*]] = phi i64 [ 0, %[[ENTRY]] ], [ [[IV_NEXT:%.*]], %[[LOOP]] ] -; VF2IC1-NEXT: [[FOR:%.*]] = phi i32 [ 33, %[[ENTRY]] ], [ [[TMP10:%.*]], %[[LOOP]] ] +; VF2IC1-NEXT: [[ENTRY:.*:]] +; VF2IC1-NEXT: br label %[[VECTOR_PH:.*]] +; VF2IC1: [[VECTOR_PH]]: +; VF2IC1-NEXT: [[N_RND_UP:%.*]] = add i64 [[N]], 1 +; VF2IC1-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N_RND_UP]], 2 +; VF2IC1-NEXT: [[N_VEC:%.*]] = sub i64 [[N_RND_UP]], [[N_MOD_VF]] +; VF2IC1-NEXT: [[TRIP_COUNT_MINUS_1:%.*]] = sub i64 [[N]], 1 +; VF2IC1-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <2 x i64> poison, i64 [[TRIP_COUNT_MINUS_1]], i64 0 +; VF2IC1-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <2 x i64> [[BROADCAST_SPLATINSERT]], <2 x i64> poison, <2 x i32> zeroinitializer +; VF2IC1-NEXT: br label %[[VECTOR_BODY:.*]] +; VF2IC1: [[VECTOR_BODY]]: +; VF2IC1-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[PRED_STORE_CONTINUE4:.*]] ] +; VF2IC1-NEXT: [[VEC_IND:%.*]] = phi <2 x i64> [ , %[[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], %[[PRED_STORE_CONTINUE4]] ] +; VF2IC1-NEXT: [[VECTOR_RECUR:%.*]] = phi <2 x i32> [ , %[[VECTOR_PH]] ], [ [[TMP12:%.*]], %[[PRED_STORE_CONTINUE4]] ] +; VF2IC1-NEXT: [[TMP1:%.*]] = add i64 [[INDEX]], 0 +; VF2IC1-NEXT: [[TMP4:%.*]] = add i64 [[INDEX]], 1 +; VF2IC1-NEXT: [[TMP2:%.*]] = icmp ule <2 x i64> [[VEC_IND]], [[BROADCAST_SPLAT]] +; VF2IC1-NEXT: [[TMP3:%.*]] = extractelement <2 x i1> [[TMP2]], i32 0 +; VF2IC1-NEXT: br i1 [[TMP3]], label %[[PRED_LOAD_IF:.*]], label %[[PRED_LOAD_CONTINUE:.*]] +; VF2IC1: [[PRED_LOAD_IF]]: ; VF2IC1-NEXT: [[TMP9:%.*]] = getelementptr inbounds nuw i32, ptr [[A]], i64 [[TMP1]] -; 
VF2IC1-NEXT: [[TMP10]] = load i32, ptr [[TMP9]], align 4 -; VF2IC1-NEXT: [[TMP23:%.*]] = add nsw i32 [[FOR]], [[TMP10]] -; VF2IC1-NEXT: [[TMP20:%.*]] = getelementptr inbounds nuw i32, ptr [[B]], i64 [[TMP1]] +; VF2IC1-NEXT: [[TMP10:%.*]] = load i32, ptr [[TMP9]], align 4 +; VF2IC1-NEXT: [[TMP6:%.*]] = insertelement <2 x i32> poison, i32 [[TMP10]], i32 0 +; VF2IC1-NEXT: br label %[[PRED_LOAD_CONTINUE]] +; VF2IC1: [[PRED_LOAD_CONTINUE]]: +; VF2IC1-NEXT: [[TMP7:%.*]] = phi <2 x i32> [ poison, %[[VECTOR_BODY]] ], [ [[TMP6]], %[[PRED_LOAD_IF]] ] +; VF2IC1-NEXT: [[TMP8:%.*]] = extractelement <2 x i1> [[TMP2]], i32 1 +; VF2IC1-NEXT: br i1 [[TMP8]], label %[[PRED_LOAD_IF1:.*]], label %[[PRED_LOAD_CONTINUE2:.*]] +; VF2IC1: [[PRED_LOAD_IF1]]: +; VF2IC1-NEXT: [[TMP29:%.*]] = getelementptr inbounds nuw i32, ptr [[A]], i64 [[TMP4]] +; VF2IC1-NEXT: [[TMP30:%.*]] = load i32, ptr [[TMP29]], align 4 +; VF2IC1-NEXT: [[TMP11:%.*]] = insertelement <2 x i32> [[TMP7]], i32 [[TMP30]], i32 1 +; VF2IC1-NEXT: br label %[[PRED_LOAD_CONTINUE2]] +; VF2IC1: [[PRED_LOAD_CONTINUE2]]: +; VF2IC1-NEXT: [[TMP12]] = phi <2 x i32> [ [[TMP7]], %[[PRED_LOAD_CONTINUE]] ], [ [[TMP11]], %[[PRED_LOAD_IF1]] ] +; VF2IC1-NEXT: [[TMP13:%.*]] = shufflevector <2 x i32> [[VECTOR_RECUR]], <2 x i32> [[TMP12]], <2 x i32> +; VF2IC1-NEXT: [[TMP14:%.*]] = extractelement <2 x i1> [[TMP2]], i32 0 +; VF2IC1-NEXT: br i1 [[TMP14]], label %[[PRED_STORE_IF:.*]], label %[[PRED_STORE_CONTINUE:.*]] +; VF2IC1: [[PRED_STORE_IF]]: +; VF2IC1-NEXT: [[TMP15:%.*]] = getelementptr inbounds nuw i32, ptr [[B]], i64 [[TMP1]] +; VF2IC1-NEXT: [[TMP16:%.*]] = extractelement <2 x i32> [[TMP13]], i32 0 +; VF2IC1-NEXT: [[TMP17:%.*]] = extractelement <2 x i32> [[TMP12]], i32 0 +; VF2IC1-NEXT: [[TMP18:%.*]] = add nsw i32 [[TMP16]], [[TMP17]] +; VF2IC1-NEXT: store i32 [[TMP18]], ptr [[TMP15]], align 4 +; VF2IC1-NEXT: br label %[[PRED_STORE_CONTINUE]] +; VF2IC1: [[PRED_STORE_CONTINUE]]: +; VF2IC1-NEXT: [[TMP19:%.*]] = extractelement <2 x i1> [[TMP2]], 
i32 1 +; VF2IC1-NEXT: br i1 [[TMP19]], label %[[PRED_STORE_IF3:.*]], label %[[PRED_STORE_CONTINUE4]] +; VF2IC1: [[PRED_STORE_IF3]]: +; VF2IC1-NEXT: [[TMP20:%.*]] = getelementptr inbounds nuw i32, ptr [[B]], i64 [[TMP4]] +; VF2IC1-NEXT: [[TMP21:%.*]] = extractelement <2 x i32> [[TMP13]], i32 1 +; VF2IC1-NEXT: [[TMP22:%.*]] = extractelement <2 x i32> [[TMP12]], i32 1 +; VF2IC1-NEXT: [[TMP23:%.*]] = add nsw i32 [[TMP21]], [[TMP22]] ; VF2IC1-NEXT: store i32 [[TMP23]], ptr [[TMP20]], align 4 -; VF2IC1-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[TMP1]], 1 -; VF2IC1-NEXT: [[EC:%.*]] = icmp eq i64 [[IV_NEXT]], [[N]] -; VF2IC1-NEXT: br i1 [[EC]], label %[[FOR_END:.*]], label %[[LOOP]] +; VF2IC1-NEXT: br label %[[PRED_STORE_CONTINUE4]] +; VF2IC1: [[PRED_STORE_CONTINUE4]]: +; VF2IC1-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], 2 +; VF2IC1-NEXT: [[VEC_IND_NEXT]] = add <2 x i64> [[VEC_IND]], splat (i64 2) +; VF2IC1-NEXT: [[TMP24:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] +; VF2IC1-NEXT: br i1 [[TMP24]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP3:![0-9]+]] +; VF2IC1: [[MIDDLE_BLOCK]]: +; VF2IC1-NEXT: [[TMP25:%.*]] = xor <2 x i1> [[TMP2]], splat (i1 true) +; VF2IC1-NEXT: [[TMP26:%.*]] = call i64 @llvm.experimental.cttz.elts.i64.v2i1(<2 x i1> [[TMP25]], i1 true) +; VF2IC1-NEXT: [[TMP27:%.*]] = sub i64 [[TMP26]], 1 +; VF2IC1-NEXT: [[TMP28:%.*]] = extractelement <2 x i32> [[TMP12]], i64 [[TMP27]] +; VF2IC1-NEXT: br label %[[FOR_END:.*]] ; VF2IC1: [[FOR_END]]: -; VF2IC1-NEXT: [[TMP28:%.*]] = phi i32 [ [[TMP10]], %[[LOOP]] ] ; VF2IC1-NEXT: ret i32 [[TMP28]] ; ; VF2IC2-LABEL: define i32 @FOR_next_used_outside( ; VF2IC2-SAME: ptr noalias [[A:%.*]], ptr noalias [[B:%.*]], i64 [[N:%.*]]) { -; VF2IC2-NEXT: [[ENTRY:.*]]: -; VF2IC2-NEXT: br label %[[LOOP:.*]] -; VF2IC2: [[LOOP]]: -; VF2IC2-NEXT: [[TMP3:%.*]] = phi i64 [ 0, %[[ENTRY]] ], [ [[IV_NEXT:%.*]], %[[LOOP]] ] -; VF2IC2-NEXT: [[FOR:%.*]] = phi i32 [ 33, %[[ENTRY]] ], [ [[TMP23:%.*]], %[[LOOP]] ] +; VF2IC2-NEXT: 
[[ENTRY:.*:]] +; VF2IC2-NEXT: br label %[[VECTOR_PH:.*]] +; VF2IC2: [[VECTOR_PH]]: +; VF2IC2-NEXT: [[N_RND_UP:%.*]] = add i64 [[N]], 3 +; VF2IC2-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N_RND_UP]], 4 +; VF2IC2-NEXT: [[N_VEC:%.*]] = sub i64 [[N_RND_UP]], [[N_MOD_VF]] +; VF2IC2-NEXT: [[TRIP_COUNT_MINUS_1:%.*]] = sub i64 [[N]], 1 +; VF2IC2-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <2 x i64> poison, i64 [[TRIP_COUNT_MINUS_1]], i64 0 +; VF2IC2-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <2 x i64> [[BROADCAST_SPLATINSERT]], <2 x i64> poison, <2 x i32> zeroinitializer +; VF2IC2-NEXT: br label %[[VECTOR_BODY:.*]] +; VF2IC2: [[VECTOR_BODY]]: +; VF2IC2-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[PRED_STORE_CONTINUE12:.*]] ] +; VF2IC2-NEXT: [[VEC_IND:%.*]] = phi <2 x i64> [ , %[[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], %[[PRED_STORE_CONTINUE12]] ] +; VF2IC2-NEXT: [[VECTOR_RECUR:%.*]] = phi <2 x i32> [ , %[[VECTOR_PH]] ], [ [[TMP25:%.*]], %[[PRED_STORE_CONTINUE12]] ] +; VF2IC2-NEXT: [[STEP_ADD:%.*]] = add <2 x i64> [[VEC_IND]], splat (i64 2) +; VF2IC2-NEXT: [[TMP3:%.*]] = add i64 [[INDEX]], 0 +; VF2IC2-NEXT: [[TMP1:%.*]] = add i64 [[INDEX]], 1 +; VF2IC2-NEXT: [[TMP2:%.*]] = add i64 [[INDEX]], 2 +; VF2IC2-NEXT: [[TMP7:%.*]] = add i64 [[INDEX]], 3 +; VF2IC2-NEXT: [[TMP4:%.*]] = icmp ule <2 x i64> [[VEC_IND]], [[BROADCAST_SPLAT]] +; VF2IC2-NEXT: [[TMP5:%.*]] = icmp ule <2 x i64> [[STEP_ADD]], [[BROADCAST_SPLAT]] +; VF2IC2-NEXT: [[TMP6:%.*]] = extractelement <2 x i1> [[TMP4]], i32 0 +; VF2IC2-NEXT: br i1 [[TMP6]], label %[[PRED_LOAD_IF:.*]], label %[[PRED_LOAD_CONTINUE:.*]] +; VF2IC2: [[PRED_LOAD_IF]]: ; VF2IC2-NEXT: [[TMP22:%.*]] = getelementptr inbounds nuw i32, ptr [[A]], i64 [[TMP3]] -; VF2IC2-NEXT: [[TMP23]] = load i32, ptr [[TMP22]], align 4 -; VF2IC2-NEXT: [[TMP47:%.*]] = add nsw i32 [[FOR]], [[TMP23]] -; VF2IC2-NEXT: [[TMP44:%.*]] = getelementptr inbounds nuw i32, ptr [[B]], i64 [[TMP3]] +; VF2IC2-NEXT: [[TMP23:%.*]] = load i32, ptr 
[[TMP22]], align 4 +; VF2IC2-NEXT: [[TMP9:%.*]] = insertelement <2 x i32> poison, i32 [[TMP23]], i32 0 +; VF2IC2-NEXT: br label %[[PRED_LOAD_CONTINUE]] +; VF2IC2: [[PRED_LOAD_CONTINUE]]: +; VF2IC2-NEXT: [[TMP10:%.*]] = phi <2 x i32> [ poison, %[[VECTOR_BODY]] ], [ [[TMP9]], %[[PRED_LOAD_IF]] ] +; VF2IC2-NEXT: [[TMP11:%.*]] = extractelement <2 x i1> [[TMP4]], i32 1 +; VF2IC2-NEXT: br i1 [[TMP11]], label %[[PRED_LOAD_IF1:.*]], label %[[PRED_LOAD_CONTINUE2:.*]] +; VF2IC2: [[PRED_LOAD_IF1]]: +; VF2IC2-NEXT: [[TMP12:%.*]] = getelementptr inbounds nuw i32, ptr [[A]], i64 [[TMP1]] +; VF2IC2-NEXT: [[TMP13:%.*]] = load i32, ptr [[TMP12]], align 4 +; VF2IC2-NEXT: [[TMP14:%.*]] = insertelement <2 x i32> [[TMP10]], i32 [[TMP13]], i32 1 +; VF2IC2-NEXT: br label %[[PRED_LOAD_CONTINUE2]] +; VF2IC2: [[PRED_LOAD_CONTINUE2]]: +; VF2IC2-NEXT: [[TMP15:%.*]] = phi <2 x i32> [ [[TMP10]], %[[PRED_LOAD_CONTINUE]] ], [ [[TMP14]], %[[PRED_LOAD_IF1]] ] +; VF2IC2-NEXT: [[TMP16:%.*]] = extractelement <2 x i1> [[TMP5]], i32 0 +; VF2IC2-NEXT: br i1 [[TMP16]], label %[[PRED_LOAD_IF3:.*]], label %[[PRED_LOAD_CONTINUE4:.*]] +; VF2IC2: [[PRED_LOAD_IF3]]: +; VF2IC2-NEXT: [[TMP17:%.*]] = getelementptr inbounds nuw i32, ptr [[A]], i64 [[TMP2]] +; VF2IC2-NEXT: [[TMP18:%.*]] = load i32, ptr [[TMP17]], align 4 +; VF2IC2-NEXT: [[TMP19:%.*]] = insertelement <2 x i32> poison, i32 [[TMP18]], i32 0 +; VF2IC2-NEXT: br label %[[PRED_LOAD_CONTINUE4]] +; VF2IC2: [[PRED_LOAD_CONTINUE4]]: +; VF2IC2-NEXT: [[TMP20:%.*]] = phi <2 x i32> [ poison, %[[PRED_LOAD_CONTINUE2]] ], [ [[TMP19]], %[[PRED_LOAD_IF3]] ] +; VF2IC2-NEXT: [[TMP21:%.*]] = extractelement <2 x i1> [[TMP5]], i32 1 +; VF2IC2-NEXT: br i1 [[TMP21]], label %[[PRED_LOAD_IF5:.*]], label %[[PRED_LOAD_CONTINUE6:.*]] +; VF2IC2: [[PRED_LOAD_IF5]]: +; VF2IC2-NEXT: [[TMP34:%.*]] = getelementptr inbounds nuw i32, ptr [[A]], i64 [[TMP7]] +; VF2IC2-NEXT: [[TMP37:%.*]] = load i32, ptr [[TMP34]], align 4 +; VF2IC2-NEXT: [[TMP24:%.*]] = insertelement <2 x i32> [[TMP20]], 
i32 [[TMP37]], i32 1 +; VF2IC2-NEXT: br label %[[PRED_LOAD_CONTINUE6]] +; VF2IC2: [[PRED_LOAD_CONTINUE6]]: +; VF2IC2-NEXT: [[TMP25]] = phi <2 x i32> [ [[TMP20]], %[[PRED_LOAD_CONTINUE4]] ], [ [[TMP24]], %[[PRED_LOAD_IF5]] ] +; VF2IC2-NEXT: [[TMP26:%.*]] = shufflevector <2 x i32> [[VECTOR_RECUR]], <2 x i32> [[TMP15]], <2 x i32> +; VF2IC2-NEXT: [[TMP27:%.*]] = shufflevector <2 x i32> [[TMP15]], <2 x i32> [[TMP25]], <2 x i32> +; VF2IC2-NEXT: [[TMP28:%.*]] = extractelement <2 x i1> [[TMP4]], i32 0 +; VF2IC2-NEXT: br i1 [[TMP28]], label %[[PRED_STORE_IF:.*]], label %[[PRED_STORE_CONTINUE:.*]] +; VF2IC2: [[PRED_STORE_IF]]: +; VF2IC2-NEXT: [[TMP29:%.*]] = getelementptr inbounds nuw i32, ptr [[B]], i64 [[TMP3]] +; VF2IC2-NEXT: [[TMP30:%.*]] = extractelement <2 x i32> [[TMP26]], i32 0 +; VF2IC2-NEXT: [[TMP31:%.*]] = extractelement <2 x i32> [[TMP15]], i32 0 +; VF2IC2-NEXT: [[TMP32:%.*]] = add nsw i32 [[TMP30]], [[TMP31]] +; VF2IC2-NEXT: store i32 [[TMP32]], ptr [[TMP29]], align 4 +; VF2IC2-NEXT: br label %[[PRED_STORE_CONTINUE]] +; VF2IC2: [[PRED_STORE_CONTINUE]]: +; VF2IC2-NEXT: [[TMP33:%.*]] = extractelement <2 x i1> [[TMP4]], i32 1 +; VF2IC2-NEXT: br i1 [[TMP33]], label %[[PRED_STORE_IF7:.*]], label %[[PRED_STORE_CONTINUE8:.*]] +; VF2IC2: [[PRED_STORE_IF7]]: +; VF2IC2-NEXT: [[TMP44:%.*]] = getelementptr inbounds nuw i32, ptr [[B]], i64 [[TMP1]] +; VF2IC2-NEXT: [[TMP35:%.*]] = extractelement <2 x i32> [[TMP26]], i32 1 +; VF2IC2-NEXT: [[TMP36:%.*]] = extractelement <2 x i32> [[TMP15]], i32 1 +; VF2IC2-NEXT: [[TMP47:%.*]] = add nsw i32 [[TMP35]], [[TMP36]] ; VF2IC2-NEXT: store i32 [[TMP47]], ptr [[TMP44]], align 4 -; VF2IC2-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[TMP3]], 1 -; VF2IC2-NEXT: [[EC:%.*]] = icmp eq i64 [[IV_NEXT]], [[N]] -; VF2IC2-NEXT: br i1 [[EC]], label %[[FOR_END:.*]], label %[[LOOP]] +; VF2IC2-NEXT: br label %[[PRED_STORE_CONTINUE8]] +; VF2IC2: [[PRED_STORE_CONTINUE8]]: +; VF2IC2-NEXT: [[TMP38:%.*]] = extractelement <2 x i1> [[TMP5]], i32 0 +; VF2IC2-NEXT: br 
i1 [[TMP38]], label %[[PRED_STORE_IF9:.*]], label %[[PRED_STORE_CONTINUE10:.*]] +; VF2IC2: [[PRED_STORE_IF9]]: +; VF2IC2-NEXT: [[TMP39:%.*]] = getelementptr inbounds nuw i32, ptr [[B]], i64 [[TMP2]] +; VF2IC2-NEXT: [[TMP40:%.*]] = extractelement <2 x i32> [[TMP27]], i32 0 +; VF2IC2-NEXT: [[TMP41:%.*]] = extractelement <2 x i32> [[TMP25]], i32 0 +; VF2IC2-NEXT: [[TMP42:%.*]] = add nsw i32 [[TMP40]], [[TMP41]] +; VF2IC2-NEXT: store i32 [[TMP42]], ptr [[TMP39]], align 4 +; VF2IC2-NEXT: br label %[[PRED_STORE_CONTINUE10]] +; VF2IC2: [[PRED_STORE_CONTINUE10]]: +; VF2IC2-NEXT: [[TMP43:%.*]] = extractelement <2 x i1> [[TMP5]], i32 1 +; VF2IC2-NEXT: br i1 [[TMP43]], label %[[PRED_STORE_IF11:.*]], label %[[PRED_STORE_CONTINUE12]] +; VF2IC2: [[PRED_STORE_IF11]]: +; VF2IC2-NEXT: [[TMP63:%.*]] = getelementptr inbounds nuw i32, ptr [[B]], i64 [[TMP7]] +; VF2IC2-NEXT: [[TMP45:%.*]] = extractelement <2 x i32> [[TMP27]], i32 1 +; VF2IC2-NEXT: [[TMP46:%.*]] = extractelement <2 x i32> [[TMP25]], i32 1 +; VF2IC2-NEXT: [[TMP64:%.*]] = add nsw i32 [[TMP45]], [[TMP46]] +; VF2IC2-NEXT: store i32 [[TMP64]], ptr [[TMP63]], align 4 +; VF2IC2-NEXT: br label %[[PRED_STORE_CONTINUE12]] +; VF2IC2: [[PRED_STORE_CONTINUE12]]: +; VF2IC2-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], 4 +; VF2IC2-NEXT: [[VEC_IND_NEXT]] = add <2 x i64> [[STEP_ADD]], splat (i64 2) +; VF2IC2-NEXT: [[TMP48:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] +; VF2IC2-NEXT: br i1 [[TMP48]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP3:![0-9]+]] +; VF2IC2: [[MIDDLE_BLOCK]]: +; VF2IC2-NEXT: [[TMP49:%.*]] = xor <2 x i1> [[TMP4]], splat (i1 true) +; VF2IC2-NEXT: [[TMP50:%.*]] = xor <2 x i1> [[TMP5]], splat (i1 true) +; VF2IC2-NEXT: [[TMP51:%.*]] = call i64 @llvm.experimental.cttz.elts.i64.v2i1(<2 x i1> [[TMP50]], i1 true) +; VF2IC2-NEXT: [[TMP52:%.*]] = add i64 2, [[TMP51]] +; VF2IC2-NEXT: [[TMP53:%.*]] = call i64 @llvm.experimental.cttz.elts.i64.v2i1(<2 x i1> [[TMP49]], i1 true) +; VF2IC2-NEXT: [[TMP54:%.*]] 
= add i64 0, [[TMP53]] +; VF2IC2-NEXT: [[TMP55:%.*]] = icmp ne i64 [[TMP53]], 2 +; VF2IC2-NEXT: [[TMP56:%.*]] = select i1 [[TMP55]], i64 [[TMP54]], i64 [[TMP52]] +; VF2IC2-NEXT: [[TMP57:%.*]] = sub i64 [[TMP56]], 1 +; VF2IC2-NEXT: [[TMP58:%.*]] = extractelement <2 x i32> [[TMP15]], i64 [[TMP57]] +; VF2IC2-NEXT: [[TMP59:%.*]] = sub i64 [[TMP57]], 2 +; VF2IC2-NEXT: [[TMP60:%.*]] = extractelement <2 x i32> [[TMP25]], i64 [[TMP59]] +; VF2IC2-NEXT: [[TMP61:%.*]] = icmp uge i64 [[TMP57]], 2 +; VF2IC2-NEXT: [[TMP62:%.*]] = select i1 [[TMP61]], i32 [[TMP60]], i32 [[TMP58]] +; VF2IC2-NEXT: br label %[[FOR_END:.*]] ; VF2IC2: [[FOR_END]]: -; VF2IC2-NEXT: [[TMP62:%.*]] = phi i32 [ [[TMP23]], %[[LOOP]] ] ; VF2IC2-NEXT: ret i32 [[TMP62]] ; ; VF1IC2-LABEL: define i32 @FOR_next_used_outside( ; VF1IC2-SAME: ptr noalias [[A:%.*]], ptr noalias [[B:%.*]], i64 [[N:%.*]]) { -; VF1IC2-NEXT: [[ENTRY:.*]]: -; VF1IC2-NEXT: br label %[[LOOP:.*]] -; VF1IC2: [[LOOP]]: -; VF1IC2-NEXT: [[TMP0:%.*]] = phi i64 [ 0, %[[ENTRY]] ], [ [[IV_NEXT:%.*]], %[[LOOP]] ] -; VF1IC2-NEXT: [[FOR:%.*]] = phi i32 [ 33, %[[ENTRY]] ], [ [[TMP7:%.*]], %[[LOOP]] ] +; VF1IC2-NEXT: [[ENTRY:.*:]] +; VF1IC2-NEXT: br label %[[VECTOR_PH:.*]] +; VF1IC2: [[VECTOR_PH]]: +; VF1IC2-NEXT: [[N_RND_UP:%.*]] = add i64 [[N]], 1 +; VF1IC2-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N_RND_UP]], 2 +; VF1IC2-NEXT: [[N_VEC:%.*]] = sub i64 [[N_RND_UP]], [[N_MOD_VF]] +; VF1IC2-NEXT: [[TRIP_COUNT_MINUS_1:%.*]] = sub i64 [[N]], 1 +; VF1IC2-NEXT: br label %[[VECTOR_BODY:.*]] +; VF1IC2: [[VECTOR_BODY]]: +; VF1IC2-NEXT: [[TMP0:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[PRED_STORE_CONTINUE5:.*]] ] +; VF1IC2-NEXT: [[VECTOR_RECUR:%.*]] = phi i32 [ 33, %[[VECTOR_PH]] ], [ [[TMP8:%.*]], %[[PRED_STORE_CONTINUE5]] ] +; VF1IC2-NEXT: [[TMP3:%.*]] = add i64 [[TMP0]], 1 +; VF1IC2-NEXT: [[VEC_IV:%.*]] = add i64 [[TMP0]], 0 +; VF1IC2-NEXT: [[VEC_IV1:%.*]] = add i64 [[TMP0]], 1 +; VF1IC2-NEXT: [[TMP1:%.*]] = icmp ule i64 [[VEC_IV]], 
[[TRIP_COUNT_MINUS_1]] +; VF1IC2-NEXT: [[TMP2:%.*]] = icmp ule i64 [[VEC_IV1]], [[TRIP_COUNT_MINUS_1]] +; VF1IC2-NEXT: br i1 [[TMP1]], label %[[PRED_LOAD_IF:.*]], label %[[PRED_LOAD_CONTINUE:.*]] +; VF1IC2: [[PRED_LOAD_IF]]: ; VF1IC2-NEXT: [[TMP6:%.*]] = getelementptr inbounds nuw i32, ptr [[A]], i64 [[TMP0]] -; VF1IC2-NEXT: [[TMP7]] = load i32, ptr [[TMP6]], align 4 -; VF1IC2-NEXT: [[TMP12:%.*]] = add nsw i32 [[FOR]], [[TMP7]] -; VF1IC2-NEXT: [[TMP11:%.*]] = getelementptr inbounds nuw i32, ptr [[B]], i64 [[TMP0]] +; VF1IC2-NEXT: [[TMP7:%.*]] = load i32, ptr [[TMP6]], align 4 +; VF1IC2-NEXT: br label %[[PRED_LOAD_CONTINUE]] +; VF1IC2: [[PRED_LOAD_CONTINUE]]: +; VF1IC2-NEXT: [[TMP5:%.*]] = phi i32 [ poison, %[[VECTOR_BODY]] ], [ [[TMP7]], %[[PRED_LOAD_IF]] ] +; VF1IC2-NEXT: br i1 [[TMP2]], label %[[PRED_LOAD_IF2:.*]], label %[[PRED_LOAD_CONTINUE3:.*]] +; VF1IC2: [[PRED_LOAD_IF2]]: +; VF1IC2-NEXT: [[TMP28:%.*]] = getelementptr inbounds nuw i32, ptr [[A]], i64 [[TMP3]] +; VF1IC2-NEXT: [[TMP29:%.*]] = load i32, ptr [[TMP28]], align 4 +; VF1IC2-NEXT: br label %[[PRED_LOAD_CONTINUE3]] +; VF1IC2: [[PRED_LOAD_CONTINUE3]]: +; VF1IC2-NEXT: [[TMP8]] = phi i32 [ poison, %[[PRED_LOAD_CONTINUE]] ], [ [[TMP29]], %[[PRED_LOAD_IF2]] ] +; VF1IC2-NEXT: br i1 [[TMP1]], label %[[PRED_STORE_IF:.*]], label %[[PRED_STORE_CONTINUE:.*]] +; VF1IC2: [[PRED_STORE_IF]]: +; VF1IC2-NEXT: [[TMP9:%.*]] = getelementptr inbounds nuw i32, ptr [[B]], i64 [[TMP0]] +; VF1IC2-NEXT: [[TMP10:%.*]] = add nsw i32 [[VECTOR_RECUR]], [[TMP5]] +; VF1IC2-NEXT: store i32 [[TMP10]], ptr [[TMP9]], align 4 +; VF1IC2-NEXT: br label %[[PRED_STORE_CONTINUE]] +; VF1IC2: [[PRED_STORE_CONTINUE]]: +; VF1IC2-NEXT: br i1 [[TMP2]], label %[[PRED_STORE_IF4:.*]], label %[[PRED_STORE_CONTINUE5]] +; VF1IC2: [[PRED_STORE_IF4]]: +; VF1IC2-NEXT: [[TMP11:%.*]] = getelementptr inbounds nuw i32, ptr [[B]], i64 [[TMP3]] +; VF1IC2-NEXT: [[TMP12:%.*]] = add nsw i32 [[TMP5]], [[TMP8]] ; VF1IC2-NEXT: store i32 [[TMP12]], ptr [[TMP11]], align 
4 -; VF1IC2-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[TMP0]], 1 -; VF1IC2-NEXT: [[EC:%.*]] = icmp eq i64 [[IV_NEXT]], [[N]] -; VF1IC2-NEXT: br i1 [[EC]], label %[[FOR_END:.*]], label %[[LOOP]] +; VF1IC2-NEXT: br label %[[PRED_STORE_CONTINUE5]] +; VF1IC2: [[PRED_STORE_CONTINUE5]]: +; VF1IC2-NEXT: [[INDEX_NEXT]] = add i64 [[TMP0]], 2 +; VF1IC2-NEXT: [[TMP13:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] +; VF1IC2-NEXT: br i1 [[TMP13]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP3:![0-9]+]] +; VF1IC2: [[MIDDLE_BLOCK]]: +; VF1IC2-NEXT: [[TMP14:%.*]] = xor i1 [[TMP1]], true +; VF1IC2-NEXT: [[TMP15:%.*]] = xor i1 [[TMP2]], true +; VF1IC2-NEXT: [[TMP16:%.*]] = icmp eq i1 [[TMP15]], false +; VF1IC2-NEXT: [[TMP17:%.*]] = zext i1 [[TMP16]] to i64 +; VF1IC2-NEXT: [[TMP18:%.*]] = add i64 1, [[TMP17]] +; VF1IC2-NEXT: [[TMP19:%.*]] = icmp eq i1 [[TMP14]], false +; VF1IC2-NEXT: [[TMP20:%.*]] = zext i1 [[TMP19]] to i64 +; VF1IC2-NEXT: [[TMP21:%.*]] = add i64 0, [[TMP20]] +; VF1IC2-NEXT: [[TMP22:%.*]] = icmp ne i64 [[TMP20]], 1 +; VF1IC2-NEXT: [[TMP23:%.*]] = select i1 [[TMP22]], i64 [[TMP21]], i64 [[TMP18]] +; VF1IC2-NEXT: [[TMP24:%.*]] = sub i64 [[TMP23]], 1 +; VF1IC2-NEXT: [[TMP25:%.*]] = sub i64 [[TMP24]], 1 +; VF1IC2-NEXT: [[TMP26:%.*]] = icmp uge i64 [[TMP24]], 1 +; VF1IC2-NEXT: [[TMP27:%.*]] = select i1 [[TMP26]], i32 [[TMP8]], i32 [[TMP5]] +; VF1IC2-NEXT: br label %[[FOR_END:.*]] ; VF1IC2: [[FOR_END]]: -; VF1IC2-NEXT: [[TMP27:%.*]] = phi i32 [ [[TMP7]], %[[LOOP]] ] ; VF1IC2-NEXT: ret i32 [[TMP27]] ; entry: @@ -160,64 +583,287 @@ for.end: define i32 @FOR_and_next_used_outside(ptr noalias %A, ptr noalias %B, i64 %n) { ; VF2IC1-LABEL: define i32 @FOR_and_next_used_outside( ; VF2IC1-SAME: ptr noalias [[A:%.*]], ptr noalias [[B:%.*]], i64 [[N:%.*]]) { -; VF2IC1-NEXT: [[ENTRY:.*]]: -; VF2IC1-NEXT: br label %[[LOOP:.*]] -; VF2IC1: [[LOOP]]: -; VF2IC1-NEXT: [[TMP1:%.*]] = phi i64 [ 0, %[[ENTRY]] ], [ [[IV_NEXT:%.*]], %[[LOOP]] ] -; VF2IC1-NEXT: [[FOR:%.*]] = 
phi i32 [ 33, %[[ENTRY]] ], [ [[TMP10:%.*]], %[[LOOP]] ] +; VF2IC1-NEXT: [[ENTRY:.*:]] +; VF2IC1-NEXT: br label %[[VECTOR_PH:.*]] +; VF2IC1: [[VECTOR_PH]]: +; VF2IC1-NEXT: [[N_RND_UP:%.*]] = add i64 [[N]], 1 +; VF2IC1-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N_RND_UP]], 2 +; VF2IC1-NEXT: [[N_VEC:%.*]] = sub i64 [[N_RND_UP]], [[N_MOD_VF]] +; VF2IC1-NEXT: [[TRIP_COUNT_MINUS_1:%.*]] = sub i64 [[N]], 1 +; VF2IC1-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <2 x i64> poison, i64 [[TRIP_COUNT_MINUS_1]], i64 0 +; VF2IC1-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <2 x i64> [[BROADCAST_SPLATINSERT]], <2 x i64> poison, <2 x i32> zeroinitializer +; VF2IC1-NEXT: br label %[[VECTOR_BODY:.*]] +; VF2IC1: [[VECTOR_BODY]]: +; VF2IC1-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[PRED_STORE_CONTINUE4:.*]] ] +; VF2IC1-NEXT: [[VEC_IND:%.*]] = phi <2 x i64> [ , %[[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], %[[PRED_STORE_CONTINUE4]] ] +; VF2IC1-NEXT: [[VECTOR_RECUR:%.*]] = phi <2 x i32> [ , %[[VECTOR_PH]] ], [ [[TMP12:%.*]], %[[PRED_STORE_CONTINUE4]] ] +; VF2IC1-NEXT: [[TMP1:%.*]] = add i64 [[INDEX]], 0 +; VF2IC1-NEXT: [[TMP4:%.*]] = add i64 [[INDEX]], 1 +; VF2IC1-NEXT: [[TMP2:%.*]] = icmp ule <2 x i64> [[VEC_IND]], [[BROADCAST_SPLAT]] +; VF2IC1-NEXT: [[TMP3:%.*]] = extractelement <2 x i1> [[TMP2]], i32 0 +; VF2IC1-NEXT: br i1 [[TMP3]], label %[[PRED_LOAD_IF:.*]], label %[[PRED_LOAD_CONTINUE:.*]] +; VF2IC1: [[PRED_LOAD_IF]]: ; VF2IC1-NEXT: [[TMP9:%.*]] = getelementptr inbounds nuw i32, ptr [[A]], i64 [[TMP1]] -; VF2IC1-NEXT: [[TMP10]] = load i32, ptr [[TMP9]], align 4 -; VF2IC1-NEXT: [[TMP23:%.*]] = add nsw i32 [[FOR]], [[TMP10]] -; VF2IC1-NEXT: [[TMP20:%.*]] = getelementptr inbounds nuw i32, ptr [[B]], i64 [[TMP1]] +; VF2IC1-NEXT: [[TMP10:%.*]] = load i32, ptr [[TMP9]], align 4 +; VF2IC1-NEXT: [[TMP6:%.*]] = insertelement <2 x i32> poison, i32 [[TMP10]], i32 0 +; VF2IC1-NEXT: br label %[[PRED_LOAD_CONTINUE]] +; VF2IC1: [[PRED_LOAD_CONTINUE]]: +; 
VF2IC1-NEXT: [[TMP7:%.*]] = phi <2 x i32> [ poison, %[[VECTOR_BODY]] ], [ [[TMP6]], %[[PRED_LOAD_IF]] ] +; VF2IC1-NEXT: [[TMP8:%.*]] = extractelement <2 x i1> [[TMP2]], i32 1 +; VF2IC1-NEXT: br i1 [[TMP8]], label %[[PRED_LOAD_IF1:.*]], label %[[PRED_LOAD_CONTINUE2:.*]] +; VF2IC1: [[PRED_LOAD_IF1]]: +; VF2IC1-NEXT: [[TMP34:%.*]] = getelementptr inbounds nuw i32, ptr [[A]], i64 [[TMP4]] +; VF2IC1-NEXT: [[TMP35:%.*]] = load i32, ptr [[TMP34]], align 4 +; VF2IC1-NEXT: [[TMP11:%.*]] = insertelement <2 x i32> [[TMP7]], i32 [[TMP35]], i32 1 +; VF2IC1-NEXT: br label %[[PRED_LOAD_CONTINUE2]] +; VF2IC1: [[PRED_LOAD_CONTINUE2]]: +; VF2IC1-NEXT: [[TMP12]] = phi <2 x i32> [ [[TMP7]], %[[PRED_LOAD_CONTINUE]] ], [ [[TMP11]], %[[PRED_LOAD_IF1]] ] +; VF2IC1-NEXT: [[TMP13:%.*]] = shufflevector <2 x i32> [[VECTOR_RECUR]], <2 x i32> [[TMP12]], <2 x i32> +; VF2IC1-NEXT: [[TMP14:%.*]] = extractelement <2 x i1> [[TMP2]], i32 0 +; VF2IC1-NEXT: br i1 [[TMP14]], label %[[PRED_STORE_IF:.*]], label %[[PRED_STORE_CONTINUE:.*]] +; VF2IC1: [[PRED_STORE_IF]]: +; VF2IC1-NEXT: [[TMP15:%.*]] = getelementptr inbounds nuw i32, ptr [[B]], i64 [[TMP1]] +; VF2IC1-NEXT: [[TMP16:%.*]] = extractelement <2 x i32> [[TMP13]], i32 0 +; VF2IC1-NEXT: [[TMP17:%.*]] = extractelement <2 x i32> [[TMP12]], i32 0 +; VF2IC1-NEXT: [[TMP18:%.*]] = add nsw i32 [[TMP16]], [[TMP17]] +; VF2IC1-NEXT: store i32 [[TMP18]], ptr [[TMP15]], align 4 +; VF2IC1-NEXT: br label %[[PRED_STORE_CONTINUE]] +; VF2IC1: [[PRED_STORE_CONTINUE]]: +; VF2IC1-NEXT: [[TMP19:%.*]] = extractelement <2 x i1> [[TMP2]], i32 1 +; VF2IC1-NEXT: br i1 [[TMP19]], label %[[PRED_STORE_IF3:.*]], label %[[PRED_STORE_CONTINUE4]] +; VF2IC1: [[PRED_STORE_IF3]]: +; VF2IC1-NEXT: [[TMP20:%.*]] = getelementptr inbounds nuw i32, ptr [[B]], i64 [[TMP4]] +; VF2IC1-NEXT: [[TMP21:%.*]] = extractelement <2 x i32> [[TMP13]], i32 1 +; VF2IC1-NEXT: [[TMP22:%.*]] = extractelement <2 x i32> [[TMP12]], i32 1 +; VF2IC1-NEXT: [[TMP23:%.*]] = add nsw i32 [[TMP21]], [[TMP22]] ; 
VF2IC1-NEXT: store i32 [[TMP23]], ptr [[TMP20]], align 4 -; VF2IC1-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[TMP1]], 1 -; VF2IC1-NEXT: [[EC:%.*]] = icmp eq i64 [[IV_NEXT]], [[N]] -; VF2IC1-NEXT: br i1 [[EC]], label %[[FOR_END:.*]], label %[[LOOP]] +; VF2IC1-NEXT: br label %[[PRED_STORE_CONTINUE4]] +; VF2IC1: [[PRED_STORE_CONTINUE4]]: +; VF2IC1-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], 2 +; VF2IC1-NEXT: [[VEC_IND_NEXT]] = add <2 x i64> [[VEC_IND]], splat (i64 2) +; VF2IC1-NEXT: [[TMP24:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] +; VF2IC1-NEXT: br i1 [[TMP24]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]] +; VF2IC1: [[MIDDLE_BLOCK]]: +; VF2IC1-NEXT: [[TMP25:%.*]] = xor <2 x i1> [[TMP2]], splat (i1 true) +; VF2IC1-NEXT: [[TMP26:%.*]] = call i64 @llvm.experimental.cttz.elts.i64.v2i1(<2 x i1> [[TMP25]], i1 true) +; VF2IC1-NEXT: [[TMP27:%.*]] = sub i64 [[TMP26]], 1 +; VF2IC1-NEXT: [[TMP28:%.*]] = sub i64 [[TMP27]], 1 +; VF2IC1-NEXT: [[TMP29:%.*]] = extractelement <2 x i32> [[TMP12]], i64 [[TMP28]] +; VF2IC1-NEXT: [[TMP30:%.*]] = extractelement <2 x i32> [[VECTOR_RECUR]], i32 1 +; VF2IC1-NEXT: [[TMP31:%.*]] = icmp eq i64 [[TMP27]], 0 +; VF2IC1-NEXT: [[TMP32:%.*]] = select i1 [[TMP31]], i32 [[TMP30]], i32 [[TMP29]] +; VF2IC1-NEXT: [[TMP33:%.*]] = extractelement <2 x i32> [[TMP12]], i64 [[TMP27]] +; VF2IC1-NEXT: br label %[[FOR_END:.*]] ; VF2IC1: [[FOR_END]]: -; VF2IC1-NEXT: [[TMP32:%.*]] = phi i32 [ [[FOR]], %[[LOOP]] ] -; VF2IC1-NEXT: [[TMP33:%.*]] = phi i32 [ [[TMP10]], %[[LOOP]] ] ; VF2IC1-NEXT: [[RES:%.*]] = add i32 [[TMP32]], [[TMP33]] ; VF2IC1-NEXT: ret i32 [[RES]] ; ; VF2IC2-LABEL: define i32 @FOR_and_next_used_outside( ; VF2IC2-SAME: ptr noalias [[A:%.*]], ptr noalias [[B:%.*]], i64 [[N:%.*]]) { -; VF2IC2-NEXT: [[ENTRY:.*]]: -; VF2IC2-NEXT: br label %[[LOOP:.*]] -; VF2IC2: [[LOOP]]: -; VF2IC2-NEXT: [[TMP3:%.*]] = phi i64 [ 0, %[[ENTRY]] ], [ [[IV_NEXT:%.*]], %[[LOOP]] ] -; VF2IC2-NEXT: [[FOR:%.*]] = phi i32 [ 33, %[[ENTRY]] ], [ 
[[TMP23:%.*]], %[[LOOP]] ] +; VF2IC2-NEXT: [[ENTRY:.*:]] +; VF2IC2-NEXT: br label %[[VECTOR_PH:.*]] +; VF2IC2: [[VECTOR_PH]]: +; VF2IC2-NEXT: [[N_RND_UP:%.*]] = add i64 [[N]], 3 +; VF2IC2-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N_RND_UP]], 4 +; VF2IC2-NEXT: [[N_VEC:%.*]] = sub i64 [[N_RND_UP]], [[N_MOD_VF]] +; VF2IC2-NEXT: [[TRIP_COUNT_MINUS_1:%.*]] = sub i64 [[N]], 1 +; VF2IC2-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <2 x i64> poison, i64 [[TRIP_COUNT_MINUS_1]], i64 0 +; VF2IC2-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <2 x i64> [[BROADCAST_SPLATINSERT]], <2 x i64> poison, <2 x i32> zeroinitializer +; VF2IC2-NEXT: br label %[[VECTOR_BODY:.*]] +; VF2IC2: [[VECTOR_BODY]]: +; VF2IC2-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[PRED_STORE_CONTINUE12:.*]] ] +; VF2IC2-NEXT: [[VEC_IND:%.*]] = phi <2 x i64> [ , %[[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], %[[PRED_STORE_CONTINUE12]] ] +; VF2IC2-NEXT: [[VECTOR_RECUR:%.*]] = phi <2 x i32> [ , %[[VECTOR_PH]] ], [ [[TMP25:%.*]], %[[PRED_STORE_CONTINUE12]] ] +; VF2IC2-NEXT: [[STEP_ADD:%.*]] = add <2 x i64> [[VEC_IND]], splat (i64 2) +; VF2IC2-NEXT: [[TMP3:%.*]] = add i64 [[INDEX]], 0 +; VF2IC2-NEXT: [[TMP1:%.*]] = add i64 [[INDEX]], 1 +; VF2IC2-NEXT: [[TMP2:%.*]] = add i64 [[INDEX]], 2 +; VF2IC2-NEXT: [[TMP7:%.*]] = add i64 [[INDEX]], 3 +; VF2IC2-NEXT: [[TMP4:%.*]] = icmp ule <2 x i64> [[VEC_IND]], [[BROADCAST_SPLAT]] +; VF2IC2-NEXT: [[TMP5:%.*]] = icmp ule <2 x i64> [[STEP_ADD]], [[BROADCAST_SPLAT]] +; VF2IC2-NEXT: [[TMP6:%.*]] = extractelement <2 x i1> [[TMP4]], i32 0 +; VF2IC2-NEXT: br i1 [[TMP6]], label %[[PRED_LOAD_IF:.*]], label %[[PRED_LOAD_CONTINUE:.*]] +; VF2IC2: [[PRED_LOAD_IF]]: ; VF2IC2-NEXT: [[TMP22:%.*]] = getelementptr inbounds nuw i32, ptr [[A]], i64 [[TMP3]] -; VF2IC2-NEXT: [[TMP23]] = load i32, ptr [[TMP22]], align 4 -; VF2IC2-NEXT: [[TMP47:%.*]] = add nsw i32 [[FOR]], [[TMP23]] -; VF2IC2-NEXT: [[TMP44:%.*]] = getelementptr inbounds nuw i32, ptr [[B]], i64 [[TMP3]] +; 
VF2IC2-NEXT: [[TMP23:%.*]] = load i32, ptr [[TMP22]], align 4 +; VF2IC2-NEXT: [[TMP9:%.*]] = insertelement <2 x i32> poison, i32 [[TMP23]], i32 0 +; VF2IC2-NEXT: br label %[[PRED_LOAD_CONTINUE]] +; VF2IC2: [[PRED_LOAD_CONTINUE]]: +; VF2IC2-NEXT: [[TMP10:%.*]] = phi <2 x i32> [ poison, %[[VECTOR_BODY]] ], [ [[TMP9]], %[[PRED_LOAD_IF]] ] +; VF2IC2-NEXT: [[TMP11:%.*]] = extractelement <2 x i1> [[TMP4]], i32 1 +; VF2IC2-NEXT: br i1 [[TMP11]], label %[[PRED_LOAD_IF1:.*]], label %[[PRED_LOAD_CONTINUE2:.*]] +; VF2IC2: [[PRED_LOAD_IF1]]: +; VF2IC2-NEXT: [[TMP12:%.*]] = getelementptr inbounds nuw i32, ptr [[A]], i64 [[TMP1]] +; VF2IC2-NEXT: [[TMP13:%.*]] = load i32, ptr [[TMP12]], align 4 +; VF2IC2-NEXT: [[TMP14:%.*]] = insertelement <2 x i32> [[TMP10]], i32 [[TMP13]], i32 1 +; VF2IC2-NEXT: br label %[[PRED_LOAD_CONTINUE2]] +; VF2IC2: [[PRED_LOAD_CONTINUE2]]: +; VF2IC2-NEXT: [[TMP15:%.*]] = phi <2 x i32> [ [[TMP10]], %[[PRED_LOAD_CONTINUE]] ], [ [[TMP14]], %[[PRED_LOAD_IF1]] ] +; VF2IC2-NEXT: [[TMP16:%.*]] = extractelement <2 x i1> [[TMP5]], i32 0 +; VF2IC2-NEXT: br i1 [[TMP16]], label %[[PRED_LOAD_IF3:.*]], label %[[PRED_LOAD_CONTINUE4:.*]] +; VF2IC2: [[PRED_LOAD_IF3]]: +; VF2IC2-NEXT: [[TMP17:%.*]] = getelementptr inbounds nuw i32, ptr [[A]], i64 [[TMP2]] +; VF2IC2-NEXT: [[TMP18:%.*]] = load i32, ptr [[TMP17]], align 4 +; VF2IC2-NEXT: [[TMP19:%.*]] = insertelement <2 x i32> poison, i32 [[TMP18]], i32 0 +; VF2IC2-NEXT: br label %[[PRED_LOAD_CONTINUE4]] +; VF2IC2: [[PRED_LOAD_CONTINUE4]]: +; VF2IC2-NEXT: [[TMP20:%.*]] = phi <2 x i32> [ poison, %[[PRED_LOAD_CONTINUE2]] ], [ [[TMP19]], %[[PRED_LOAD_IF3]] ] +; VF2IC2-NEXT: [[TMP21:%.*]] = extractelement <2 x i1> [[TMP5]], i32 1 +; VF2IC2-NEXT: br i1 [[TMP21]], label %[[PRED_LOAD_IF5:.*]], label %[[PRED_LOAD_CONTINUE6:.*]] +; VF2IC2: [[PRED_LOAD_IF5]]: +; VF2IC2-NEXT: [[TMP34:%.*]] = getelementptr inbounds nuw i32, ptr [[A]], i64 [[TMP7]] +; VF2IC2-NEXT: [[TMP37:%.*]] = load i32, ptr [[TMP34]], align 4 +; VF2IC2-NEXT: 
[[TMP24:%.*]] = insertelement <2 x i32> [[TMP20]], i32 [[TMP37]], i32 1 +; VF2IC2-NEXT: br label %[[PRED_LOAD_CONTINUE6]] +; VF2IC2: [[PRED_LOAD_CONTINUE6]]: +; VF2IC2-NEXT: [[TMP25]] = phi <2 x i32> [ [[TMP20]], %[[PRED_LOAD_CONTINUE4]] ], [ [[TMP24]], %[[PRED_LOAD_IF5]] ] +; VF2IC2-NEXT: [[TMP26:%.*]] = shufflevector <2 x i32> [[VECTOR_RECUR]], <2 x i32> [[TMP15]], <2 x i32> +; VF2IC2-NEXT: [[TMP27:%.*]] = shufflevector <2 x i32> [[TMP15]], <2 x i32> [[TMP25]], <2 x i32> +; VF2IC2-NEXT: [[TMP28:%.*]] = extractelement <2 x i1> [[TMP4]], i32 0 +; VF2IC2-NEXT: br i1 [[TMP28]], label %[[PRED_STORE_IF:.*]], label %[[PRED_STORE_CONTINUE:.*]] +; VF2IC2: [[PRED_STORE_IF]]: +; VF2IC2-NEXT: [[TMP29:%.*]] = getelementptr inbounds nuw i32, ptr [[B]], i64 [[TMP3]] +; VF2IC2-NEXT: [[TMP30:%.*]] = extractelement <2 x i32> [[TMP26]], i32 0 +; VF2IC2-NEXT: [[TMP31:%.*]] = extractelement <2 x i32> [[TMP15]], i32 0 +; VF2IC2-NEXT: [[TMP32:%.*]] = add nsw i32 [[TMP30]], [[TMP31]] +; VF2IC2-NEXT: store i32 [[TMP32]], ptr [[TMP29]], align 4 +; VF2IC2-NEXT: br label %[[PRED_STORE_CONTINUE]] +; VF2IC2: [[PRED_STORE_CONTINUE]]: +; VF2IC2-NEXT: [[TMP33:%.*]] = extractelement <2 x i1> [[TMP4]], i32 1 +; VF2IC2-NEXT: br i1 [[TMP33]], label %[[PRED_STORE_IF7:.*]], label %[[PRED_STORE_CONTINUE8:.*]] +; VF2IC2: [[PRED_STORE_IF7]]: +; VF2IC2-NEXT: [[TMP44:%.*]] = getelementptr inbounds nuw i32, ptr [[B]], i64 [[TMP1]] +; VF2IC2-NEXT: [[TMP35:%.*]] = extractelement <2 x i32> [[TMP26]], i32 1 +; VF2IC2-NEXT: [[TMP36:%.*]] = extractelement <2 x i32> [[TMP15]], i32 1 +; VF2IC2-NEXT: [[TMP47:%.*]] = add nsw i32 [[TMP35]], [[TMP36]] ; VF2IC2-NEXT: store i32 [[TMP47]], ptr [[TMP44]], align 4 -; VF2IC2-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[TMP3]], 1 -; VF2IC2-NEXT: [[EC:%.*]] = icmp eq i64 [[IV_NEXT]], [[N]] -; VF2IC2-NEXT: br i1 [[EC]], label %[[FOR_END:.*]], label %[[LOOP]] +; VF2IC2-NEXT: br label %[[PRED_STORE_CONTINUE8]] +; VF2IC2: [[PRED_STORE_CONTINUE8]]: +; VF2IC2-NEXT: [[TMP38:%.*]] = 
extractelement <2 x i1> [[TMP5]], i32 0 +; VF2IC2-NEXT: br i1 [[TMP38]], label %[[PRED_STORE_IF9:.*]], label %[[PRED_STORE_CONTINUE10:.*]] +; VF2IC2: [[PRED_STORE_IF9]]: +; VF2IC2-NEXT: [[TMP39:%.*]] = getelementptr inbounds nuw i32, ptr [[B]], i64 [[TMP2]] +; VF2IC2-NEXT: [[TMP40:%.*]] = extractelement <2 x i32> [[TMP27]], i32 0 +; VF2IC2-NEXT: [[TMP41:%.*]] = extractelement <2 x i32> [[TMP25]], i32 0 +; VF2IC2-NEXT: [[TMP42:%.*]] = add nsw i32 [[TMP40]], [[TMP41]] +; VF2IC2-NEXT: store i32 [[TMP42]], ptr [[TMP39]], align 4 +; VF2IC2-NEXT: br label %[[PRED_STORE_CONTINUE10]] +; VF2IC2: [[PRED_STORE_CONTINUE10]]: +; VF2IC2-NEXT: [[TMP43:%.*]] = extractelement <2 x i1> [[TMP5]], i32 1 +; VF2IC2-NEXT: br i1 [[TMP43]], label %[[PRED_STORE_IF11:.*]], label %[[PRED_STORE_CONTINUE12]] +; VF2IC2: [[PRED_STORE_IF11]]: +; VF2IC2-NEXT: [[TMP72:%.*]] = getelementptr inbounds nuw i32, ptr [[B]], i64 [[TMP7]] +; VF2IC2-NEXT: [[TMP45:%.*]] = extractelement <2 x i32> [[TMP27]], i32 1 +; VF2IC2-NEXT: [[TMP46:%.*]] = extractelement <2 x i32> [[TMP25]], i32 1 +; VF2IC2-NEXT: [[TMP73:%.*]] = add nsw i32 [[TMP45]], [[TMP46]] +; VF2IC2-NEXT: store i32 [[TMP73]], ptr [[TMP72]], align 4 +; VF2IC2-NEXT: br label %[[PRED_STORE_CONTINUE12]] +; VF2IC2: [[PRED_STORE_CONTINUE12]]: +; VF2IC2-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], 4 +; VF2IC2-NEXT: [[VEC_IND_NEXT]] = add <2 x i64> [[STEP_ADD]], splat (i64 2) +; VF2IC2-NEXT: [[TMP48:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] +; VF2IC2-NEXT: br i1 [[TMP48]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]] +; VF2IC2: [[MIDDLE_BLOCK]]: +; VF2IC2-NEXT: [[TMP49:%.*]] = xor <2 x i1> [[TMP4]], splat (i1 true) +; VF2IC2-NEXT: [[TMP50:%.*]] = xor <2 x i1> [[TMP5]], splat (i1 true) +; VF2IC2-NEXT: [[TMP51:%.*]] = call i64 @llvm.experimental.cttz.elts.i64.v2i1(<2 x i1> [[TMP50]], i1 true) +; VF2IC2-NEXT: [[TMP52:%.*]] = add i64 2, [[TMP51]] +; VF2IC2-NEXT: [[TMP53:%.*]] = call i64 
@llvm.experimental.cttz.elts.i64.v2i1(<2 x i1> [[TMP49]], i1 true) +; VF2IC2-NEXT: [[TMP54:%.*]] = add i64 0, [[TMP53]] +; VF2IC2-NEXT: [[TMP55:%.*]] = icmp ne i64 [[TMP53]], 2 +; VF2IC2-NEXT: [[TMP56:%.*]] = select i1 [[TMP55]], i64 [[TMP54]], i64 [[TMP52]] +; VF2IC2-NEXT: [[TMP57:%.*]] = sub i64 [[TMP56]], 1 +; VF2IC2-NEXT: [[TMP58:%.*]] = sub i64 [[TMP57]], 1 +; VF2IC2-NEXT: [[TMP59:%.*]] = extractelement <2 x i32> [[TMP15]], i64 [[TMP58]] +; VF2IC2-NEXT: [[TMP60:%.*]] = sub i64 [[TMP58]], 2 +; VF2IC2-NEXT: [[TMP61:%.*]] = extractelement <2 x i32> [[TMP25]], i64 [[TMP60]] +; VF2IC2-NEXT: [[TMP62:%.*]] = icmp uge i64 [[TMP58]], 2 +; VF2IC2-NEXT: [[TMP63:%.*]] = select i1 [[TMP62]], i32 [[TMP61]], i32 [[TMP59]] +; VF2IC2-NEXT: [[TMP64:%.*]] = extractelement <2 x i32> [[VECTOR_RECUR]], i32 1 +; VF2IC2-NEXT: [[TMP65:%.*]] = icmp eq i64 [[TMP57]], 0 +; VF2IC2-NEXT: [[TMP66:%.*]] = select i1 [[TMP65]], i32 [[TMP64]], i32 [[TMP63]] +; VF2IC2-NEXT: [[TMP67:%.*]] = extractelement <2 x i32> [[TMP15]], i64 [[TMP57]] +; VF2IC2-NEXT: [[TMP68:%.*]] = sub i64 [[TMP57]], 2 +; VF2IC2-NEXT: [[TMP69:%.*]] = extractelement <2 x i32> [[TMP25]], i64 [[TMP68]] +; VF2IC2-NEXT: [[TMP70:%.*]] = icmp uge i64 [[TMP57]], 2 +; VF2IC2-NEXT: [[TMP71:%.*]] = select i1 [[TMP70]], i32 [[TMP69]], i32 [[TMP67]] +; VF2IC2-NEXT: br label %[[FOR_END:.*]] ; VF2IC2: [[FOR_END]]: -; VF2IC2-NEXT: [[TMP66:%.*]] = phi i32 [ [[FOR]], %[[LOOP]] ] -; VF2IC2-NEXT: [[TMP71:%.*]] = phi i32 [ [[TMP23]], %[[LOOP]] ] ; VF2IC2-NEXT: [[RES:%.*]] = add i32 [[TMP66]], [[TMP71]] ; VF2IC2-NEXT: ret i32 [[RES]] ; ; VF1IC2-LABEL: define i32 @FOR_and_next_used_outside( ; VF1IC2-SAME: ptr noalias [[A:%.*]], ptr noalias [[B:%.*]], i64 [[N:%.*]]) { -; VF1IC2-NEXT: [[ENTRY:.*]]: -; VF1IC2-NEXT: br label %[[LOOP:.*]] -; VF1IC2: [[LOOP]]: -; VF1IC2-NEXT: [[TMP0:%.*]] = phi i64 [ 0, %[[ENTRY]] ], [ [[IV_NEXT:%.*]], %[[LOOP]] ] -; VF1IC2-NEXT: [[FOR:%.*]] = phi i32 [ 33, %[[ENTRY]] ], [ [[TMP7:%.*]], %[[LOOP]] ] +; VF1IC2-NEXT: 
[[ENTRY:.*:]] +; VF1IC2-NEXT: br label %[[VECTOR_PH:.*]] +; VF1IC2: [[VECTOR_PH]]: +; VF1IC2-NEXT: [[N_RND_UP:%.*]] = add i64 [[N]], 1 +; VF1IC2-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N_RND_UP]], 2 +; VF1IC2-NEXT: [[N_VEC:%.*]] = sub i64 [[N_RND_UP]], [[N_MOD_VF]] +; VF1IC2-NEXT: [[TRIP_COUNT_MINUS_1:%.*]] = sub i64 [[N]], 1 +; VF1IC2-NEXT: br label %[[VECTOR_BODY:.*]] +; VF1IC2: [[VECTOR_BODY]]: +; VF1IC2-NEXT: [[TMP0:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[PRED_STORE_CONTINUE5:.*]] ] +; VF1IC2-NEXT: [[VECTOR_RECUR:%.*]] = phi i32 [ 33, %[[VECTOR_PH]] ], [ [[TMP8:%.*]], %[[PRED_STORE_CONTINUE5]] ] +; VF1IC2-NEXT: [[TMP3:%.*]] = add i64 [[TMP0]], 1 +; VF1IC2-NEXT: [[VEC_IV:%.*]] = add i64 [[TMP0]], 0 +; VF1IC2-NEXT: [[VEC_IV1:%.*]] = add i64 [[TMP0]], 1 +; VF1IC2-NEXT: [[TMP1:%.*]] = icmp ule i64 [[VEC_IV]], [[TRIP_COUNT_MINUS_1]] +; VF1IC2-NEXT: [[TMP2:%.*]] = icmp ule i64 [[VEC_IV1]], [[TRIP_COUNT_MINUS_1]] +; VF1IC2-NEXT: br i1 [[TMP1]], label %[[PRED_LOAD_IF:.*]], label %[[PRED_LOAD_CONTINUE:.*]] +; VF1IC2: [[PRED_LOAD_IF]]: ; VF1IC2-NEXT: [[TMP6:%.*]] = getelementptr inbounds nuw i32, ptr [[A]], i64 [[TMP0]] -; VF1IC2-NEXT: [[TMP7]] = load i32, ptr [[TMP6]], align 4 -; VF1IC2-NEXT: [[TMP12:%.*]] = add nsw i32 [[FOR]], [[TMP7]] -; VF1IC2-NEXT: [[TMP11:%.*]] = getelementptr inbounds nuw i32, ptr [[B]], i64 [[TMP0]] +; VF1IC2-NEXT: [[TMP7:%.*]] = load i32, ptr [[TMP6]], align 4 +; VF1IC2-NEXT: br label %[[PRED_LOAD_CONTINUE]] +; VF1IC2: [[PRED_LOAD_CONTINUE]]: +; VF1IC2-NEXT: [[TMP5:%.*]] = phi i32 [ poison, %[[VECTOR_BODY]] ], [ [[TMP7]], %[[PRED_LOAD_IF]] ] +; VF1IC2-NEXT: br i1 [[TMP2]], label %[[PRED_LOAD_IF2:.*]], label %[[PRED_LOAD_CONTINUE3:.*]] +; VF1IC2: [[PRED_LOAD_IF2]]: +; VF1IC2-NEXT: [[TMP34:%.*]] = getelementptr inbounds nuw i32, ptr [[A]], i64 [[TMP3]] +; VF1IC2-NEXT: [[TMP35:%.*]] = load i32, ptr [[TMP34]], align 4 +; VF1IC2-NEXT: br label %[[PRED_LOAD_CONTINUE3]] +; VF1IC2: [[PRED_LOAD_CONTINUE3]]: +; VF1IC2-NEXT: [[TMP8]] = 
phi i32 [ poison, %[[PRED_LOAD_CONTINUE]] ], [ [[TMP35]], %[[PRED_LOAD_IF2]] ] +; VF1IC2-NEXT: br i1 [[TMP1]], label %[[PRED_STORE_IF:.*]], label %[[PRED_STORE_CONTINUE:.*]] +; VF1IC2: [[PRED_STORE_IF]]: +; VF1IC2-NEXT: [[TMP9:%.*]] = getelementptr inbounds nuw i32, ptr [[B]], i64 [[TMP0]] +; VF1IC2-NEXT: [[TMP10:%.*]] = add nsw i32 [[VECTOR_RECUR]], [[TMP5]] +; VF1IC2-NEXT: store i32 [[TMP10]], ptr [[TMP9]], align 4 +; VF1IC2-NEXT: br label %[[PRED_STORE_CONTINUE]] +; VF1IC2: [[PRED_STORE_CONTINUE]]: +; VF1IC2-NEXT: br i1 [[TMP2]], label %[[PRED_STORE_IF4:.*]], label %[[PRED_STORE_CONTINUE5]] +; VF1IC2: [[PRED_STORE_IF4]]: +; VF1IC2-NEXT: [[TMP11:%.*]] = getelementptr inbounds nuw i32, ptr [[B]], i64 [[TMP3]] +; VF1IC2-NEXT: [[TMP12:%.*]] = add nsw i32 [[TMP5]], [[TMP8]] ; VF1IC2-NEXT: store i32 [[TMP12]], ptr [[TMP11]], align 4 -; VF1IC2-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[TMP0]], 1 -; VF1IC2-NEXT: [[EC:%.*]] = icmp eq i64 [[IV_NEXT]], [[N]] -; VF1IC2-NEXT: br i1 [[EC]], label %[[FOR_END:.*]], label %[[LOOP]] +; VF1IC2-NEXT: br label %[[PRED_STORE_CONTINUE5]] +; VF1IC2: [[PRED_STORE_CONTINUE5]]: +; VF1IC2-NEXT: [[INDEX_NEXT]] = add i64 [[TMP0]], 2 +; VF1IC2-NEXT: [[TMP13:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] +; VF1IC2-NEXT: br i1 [[TMP13]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]] +; VF1IC2: [[MIDDLE_BLOCK]]: +; VF1IC2-NEXT: [[TMP14:%.*]] = xor i1 [[TMP1]], true +; VF1IC2-NEXT: [[TMP15:%.*]] = xor i1 [[TMP2]], true +; VF1IC2-NEXT: [[TMP16:%.*]] = icmp eq i1 [[TMP15]], false +; VF1IC2-NEXT: [[TMP17:%.*]] = zext i1 [[TMP16]] to i64 +; VF1IC2-NEXT: [[TMP18:%.*]] = add i64 1, [[TMP17]] +; VF1IC2-NEXT: [[TMP19:%.*]] = icmp eq i1 [[TMP14]], false +; VF1IC2-NEXT: [[TMP20:%.*]] = zext i1 [[TMP19]] to i64 +; VF1IC2-NEXT: [[TMP21:%.*]] = add i64 0, [[TMP20]] +; VF1IC2-NEXT: [[TMP22:%.*]] = icmp ne i64 [[TMP20]], 1 +; VF1IC2-NEXT: [[TMP23:%.*]] = select i1 [[TMP22]], i64 [[TMP21]], i64 [[TMP18]] +; VF1IC2-NEXT: [[TMP24:%.*]] 
= sub i64 [[TMP23]], 1 +; VF1IC2-NEXT: [[TMP25:%.*]] = sub i64 [[TMP24]], 1 +; VF1IC2-NEXT: [[TMP26:%.*]] = sub i64 [[TMP25]], 1 +; VF1IC2-NEXT: [[TMP27:%.*]] = icmp uge i64 [[TMP25]], 1 +; VF1IC2-NEXT: [[TMP28:%.*]] = select i1 [[TMP27]], i32 [[TMP8]], i32 [[TMP5]] +; VF1IC2-NEXT: [[TMP29:%.*]] = icmp eq i64 [[TMP24]], 0 +; VF1IC2-NEXT: [[TMP30:%.*]] = select i1 [[TMP29]], i32 [[VECTOR_RECUR]], i32 [[TMP28]] +; VF1IC2-NEXT: [[TMP31:%.*]] = sub i64 [[TMP24]], 1 +; VF1IC2-NEXT: [[TMP32:%.*]] = icmp uge i64 [[TMP24]], 1 +; VF1IC2-NEXT: [[TMP33:%.*]] = select i1 [[TMP32]], i32 [[TMP8]], i32 [[TMP5]] +; VF1IC2-NEXT: br label %[[FOR_END:.*]] ; VF1IC2: [[FOR_END]]: -; VF1IC2-NEXT: [[TMP30:%.*]] = phi i32 [ [[FOR]], %[[LOOP]] ] -; VF1IC2-NEXT: [[TMP33:%.*]] = phi i32 [ [[TMP7]], %[[LOOP]] ] ; VF1IC2-NEXT: [[RES:%.*]] = add i32 [[TMP30]], [[TMP33]] ; VF1IC2-NEXT: ret i32 [[RES]] ; diff --git a/llvm/test/Transforms/LoopVectorize/optsize.ll b/llvm/test/Transforms/LoopVectorize/optsize.ll index f9f7feb7bdfbc..9931137566c1a 100644 --- a/llvm/test/Transforms/LoopVectorize/optsize.ll +++ b/llvm/test/Transforms/LoopVectorize/optsize.ll @@ -476,45 +476,87 @@ define i32 @pr45526() optsize { ; ; CHECK-LABEL: define i32 @pr45526( ; CHECK-SAME: ) #[[ATTR0]] { -; CHECK-NEXT: [[ENTRY:.*]]: -; CHECK-NEXT: br label %[[LOOP:.*]] -; CHECK: [[LOOP]]: -; CHECK-NEXT: [[PIV:%.*]] = phi i32 [ 0, %[[ENTRY]] ], [ [[PIVPLUS1:%.*]], %[[LOOP]] ] -; CHECK-NEXT: [[FOR:%.*]] = phi i32 [ 5, %[[ENTRY]] ], [ [[PIVPLUS1]], %[[LOOP]] ] -; CHECK-NEXT: [[PIVPLUS1]] = add nuw nsw i32 [[PIV]], 1 -; CHECK-NEXT: [[COND:%.*]] = icmp ult i32 [[PIV]], 510 -; CHECK-NEXT: br i1 [[COND]], label %[[LOOP]], label %[[EXIT:.*]] +; CHECK-NEXT: [[ENTRY:.*:]] +; CHECK-NEXT: br label %[[VECTOR_PH:.*]] +; CHECK: [[VECTOR_PH]]: +; CHECK-NEXT: br label %[[VECTOR_BODY:.*]] +; CHECK: [[VECTOR_BODY]]: +; CHECK-NEXT: [[INDEX:%.*]] = phi i32 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ] +; CHECK-NEXT: [[VEC_IND:%.*]] 
= phi <4 x i32> [ , %[[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], %[[VECTOR_BODY]] ] +; CHECK-NEXT: [[TMP2:%.*]] = phi <4 x i32> [ , %[[VECTOR_PH]] ], [ [[TMP1:%.*]], %[[VECTOR_BODY]] ] +; CHECK-NEXT: [[TMP0:%.*]] = icmp ugt <4 x i32> [[VEC_IND]], splat (i32 510) +; CHECK-NEXT: [[TMP1]] = add nuw nsw <4 x i32> [[VEC_IND]], splat (i32 1) +; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i32 [[INDEX]], 4 +; CHECK-NEXT: [[VEC_IND_NEXT]] = add <4 x i32> [[VEC_IND]], splat (i32 4) +; CHECK-NEXT: [[TMP3:%.*]] = icmp eq i32 [[INDEX_NEXT]], 512 +; CHECK-NEXT: br i1 [[TMP3]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP19:![0-9]+]] +; CHECK: [[MIDDLE_BLOCK]]: +; CHECK-NEXT: [[TMP4:%.*]] = call i64 @llvm.experimental.cttz.elts.i64.v4i1(<4 x i1> [[TMP0]], i1 true) +; CHECK-NEXT: [[TMP5:%.*]] = sub i64 [[TMP4]], 1 +; CHECK-NEXT: [[TMP6:%.*]] = sub i64 [[TMP5]], 1 +; CHECK-NEXT: [[TMP7:%.*]] = extractelement <4 x i32> [[TMP1]], i64 [[TMP6]] +; CHECK-NEXT: [[TMP8:%.*]] = extractelement <4 x i32> [[TMP2]], i32 3 +; CHECK-NEXT: [[TMP9:%.*]] = icmp eq i64 [[TMP5]], 0 +; CHECK-NEXT: [[TMP10:%.*]] = select i1 [[TMP9]], i32 [[TMP8]], i32 [[TMP7]] +; CHECK-NEXT: br label %[[EXIT:.*]] ; CHECK: [[EXIT]]: -; CHECK-NEXT: [[FOR_LCSSA:%.*]] = phi i32 [ [[FOR]], %[[LOOP]] ] -; CHECK-NEXT: ret i32 [[FOR_LCSSA]] +; CHECK-NEXT: ret i32 [[TMP10]] ; ; PGSO-LABEL: define i32 @pr45526( ; PGSO-SAME: ) #[[ATTR0]] { -; PGSO-NEXT: [[ENTRY:.*]]: -; PGSO-NEXT: br label %[[LOOP:.*]] -; PGSO: [[LOOP]]: -; PGSO-NEXT: [[PIV:%.*]] = phi i32 [ 0, %[[ENTRY]] ], [ [[PIVPLUS1:%.*]], %[[LOOP]] ] -; PGSO-NEXT: [[FOR:%.*]] = phi i32 [ 5, %[[ENTRY]] ], [ [[PIVPLUS1]], %[[LOOP]] ] -; PGSO-NEXT: [[PIVPLUS1]] = add nuw nsw i32 [[PIV]], 1 -; PGSO-NEXT: [[COND:%.*]] = icmp ult i32 [[PIV]], 510 -; PGSO-NEXT: br i1 [[COND]], label %[[LOOP]], label %[[EXIT:.*]] +; PGSO-NEXT: [[ENTRY:.*:]] +; PGSO-NEXT: br label %[[VECTOR_PH:.*]] +; PGSO: [[VECTOR_PH]]: +; PGSO-NEXT: br label %[[VECTOR_BODY:.*]] +; PGSO: 
[[VECTOR_BODY]]: +; PGSO-NEXT: [[INDEX:%.*]] = phi i32 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ] +; PGSO-NEXT: [[VEC_IND:%.*]] = phi <4 x i32> [ , %[[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], %[[VECTOR_BODY]] ] +; PGSO-NEXT: [[TMP2:%.*]] = phi <4 x i32> [ , %[[VECTOR_PH]] ], [ [[TMP1:%.*]], %[[VECTOR_BODY]] ] +; PGSO-NEXT: [[TMP0:%.*]] = icmp ugt <4 x i32> [[VEC_IND]], splat (i32 510) +; PGSO-NEXT: [[TMP1]] = add nuw nsw <4 x i32> [[VEC_IND]], splat (i32 1) +; PGSO-NEXT: [[INDEX_NEXT]] = add nuw i32 [[INDEX]], 4 +; PGSO-NEXT: [[VEC_IND_NEXT]] = add <4 x i32> [[VEC_IND]], splat (i32 4) +; PGSO-NEXT: [[TMP3:%.*]] = icmp eq i32 [[INDEX_NEXT]], 512 +; PGSO-NEXT: br i1 [[TMP3]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP19:![0-9]+]] +; PGSO: [[MIDDLE_BLOCK]]: +; PGSO-NEXT: [[TMP4:%.*]] = call i64 @llvm.experimental.cttz.elts.i64.v4i1(<4 x i1> [[TMP0]], i1 true) +; PGSO-NEXT: [[TMP5:%.*]] = sub i64 [[TMP4]], 1 +; PGSO-NEXT: [[TMP6:%.*]] = sub i64 [[TMP5]], 1 +; PGSO-NEXT: [[TMP7:%.*]] = extractelement <4 x i32> [[TMP1]], i64 [[TMP6]] +; PGSO-NEXT: [[TMP8:%.*]] = extractelement <4 x i32> [[TMP2]], i32 3 +; PGSO-NEXT: [[TMP9:%.*]] = icmp eq i64 [[TMP5]], 0 +; PGSO-NEXT: [[TMP10:%.*]] = select i1 [[TMP9]], i32 [[TMP8]], i32 [[TMP7]] +; PGSO-NEXT: br label %[[EXIT:.*]] ; PGSO: [[EXIT]]: -; PGSO-NEXT: [[FOR_LCSSA:%.*]] = phi i32 [ [[FOR]], %[[LOOP]] ] -; PGSO-NEXT: ret i32 [[FOR_LCSSA]] +; PGSO-NEXT: ret i32 [[TMP10]] ; ; NPGSO-LABEL: define i32 @pr45526( ; NPGSO-SAME: ) #[[ATTR0]] { -; NPGSO-NEXT: [[ENTRY:.*]]: -; NPGSO-NEXT: br label %[[LOOP:.*]] -; NPGSO: [[LOOP]]: -; NPGSO-NEXT: [[PIV:%.*]] = phi i32 [ 0, %[[ENTRY]] ], [ [[PIVPLUS1:%.*]], %[[LOOP]] ] -; NPGSO-NEXT: [[FOR:%.*]] = phi i32 [ 5, %[[ENTRY]] ], [ [[PIVPLUS1]], %[[LOOP]] ] -; NPGSO-NEXT: [[PIVPLUS1]] = add nuw nsw i32 [[PIV]], 1 -; NPGSO-NEXT: [[COND:%.*]] = icmp ult i32 [[PIV]], 510 -; NPGSO-NEXT: br i1 [[COND]], label %[[LOOP]], label %[[EXIT:.*]] +; NPGSO-NEXT: 
[[ENTRY:.*:]] +; NPGSO-NEXT: br label %[[VECTOR_PH:.*]] +; NPGSO: [[VECTOR_PH]]: +; NPGSO-NEXT: br label %[[VECTOR_BODY:.*]] +; NPGSO: [[VECTOR_BODY]]: +; NPGSO-NEXT: [[INDEX:%.*]] = phi i32 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ] +; NPGSO-NEXT: [[VEC_IND:%.*]] = phi <4 x i32> [ , %[[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], %[[VECTOR_BODY]] ] +; NPGSO-NEXT: [[TMP2:%.*]] = phi <4 x i32> [ , %[[VECTOR_PH]] ], [ [[TMP1:%.*]], %[[VECTOR_BODY]] ] +; NPGSO-NEXT: [[TMP0:%.*]] = icmp ugt <4 x i32> [[VEC_IND]], splat (i32 510) +; NPGSO-NEXT: [[TMP1]] = add nuw nsw <4 x i32> [[VEC_IND]], splat (i32 1) +; NPGSO-NEXT: [[INDEX_NEXT]] = add nuw i32 [[INDEX]], 4 +; NPGSO-NEXT: [[VEC_IND_NEXT]] = add <4 x i32> [[VEC_IND]], splat (i32 4) +; NPGSO-NEXT: [[TMP3:%.*]] = icmp eq i32 [[INDEX_NEXT]], 512 +; NPGSO-NEXT: br i1 [[TMP3]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP22:![0-9]+]] +; NPGSO: [[MIDDLE_BLOCK]]: +; NPGSO-NEXT: [[TMP4:%.*]] = call i64 @llvm.experimental.cttz.elts.i64.v4i1(<4 x i1> [[TMP0]], i1 true) +; NPGSO-NEXT: [[TMP5:%.*]] = sub i64 [[TMP4]], 1 +; NPGSO-NEXT: [[TMP6:%.*]] = sub i64 [[TMP5]], 1 +; NPGSO-NEXT: [[TMP7:%.*]] = extractelement <4 x i32> [[TMP1]], i64 [[TMP6]] +; NPGSO-NEXT: [[TMP8:%.*]] = extractelement <4 x i32> [[TMP2]], i32 3 +; NPGSO-NEXT: [[TMP9:%.*]] = icmp eq i64 [[TMP5]], 0 +; NPGSO-NEXT: [[TMP10:%.*]] = select i1 [[TMP9]], i32 [[TMP8]], i32 [[TMP7]] +; NPGSO-NEXT: br label %[[EXIT:.*]] ; NPGSO: [[EXIT]]: -; NPGSO-NEXT: [[FOR_LCSSA:%.*]] = phi i32 [ [[FOR]], %[[LOOP]] ] -; NPGSO-NEXT: ret i32 [[FOR_LCSSA]] +; NPGSO-NEXT: ret i32 [[TMP10]] ; entry: br label %loop @@ -534,31 +576,59 @@ define i32 @pr45526_pgso() !prof !14 { ; ; CHECK-LABEL: define i32 @pr45526_pgso( ; CHECK-SAME: ) !prof [[PROF14]] { -; CHECK-NEXT: [[ENTRY:.*]]: -; CHECK-NEXT: br label %[[LOOP:.*]] -; CHECK: [[LOOP]]: -; CHECK-NEXT: [[PIV:%.*]] = phi i32 [ 0, %[[ENTRY]] ], [ [[PIVPLUS1:%.*]], %[[LOOP]] ] -; CHECK-NEXT: [[FOR:%.*]] = 
phi i32 [ 5, %[[ENTRY]] ], [ [[PIVPLUS1]], %[[LOOP]] ] -; CHECK-NEXT: [[PIVPLUS1]] = add nuw nsw i32 [[PIV]], 1 -; CHECK-NEXT: [[COND:%.*]] = icmp ult i32 [[PIV]], 510 -; CHECK-NEXT: br i1 [[COND]], label %[[LOOP]], label %[[EXIT:.*]] +; CHECK-NEXT: [[ENTRY:.*:]] +; CHECK-NEXT: br label %[[VECTOR_PH:.*]] +; CHECK: [[VECTOR_PH]]: +; CHECK-NEXT: br label %[[VECTOR_BODY:.*]] +; CHECK: [[VECTOR_BODY]]: +; CHECK-NEXT: [[INDEX:%.*]] = phi i32 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ] +; CHECK-NEXT: [[VEC_IND:%.*]] = phi <4 x i32> [ , %[[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], %[[VECTOR_BODY]] ] +; CHECK-NEXT: [[TMP2:%.*]] = phi <4 x i32> [ , %[[VECTOR_PH]] ], [ [[TMP1:%.*]], %[[VECTOR_BODY]] ] +; CHECK-NEXT: [[TMP0:%.*]] = icmp ugt <4 x i32> [[VEC_IND]], splat (i32 510) +; CHECK-NEXT: [[TMP1]] = add nuw nsw <4 x i32> [[VEC_IND]], splat (i32 1) +; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i32 [[INDEX]], 4 +; CHECK-NEXT: [[VEC_IND_NEXT]] = add <4 x i32> [[VEC_IND]], splat (i32 4) +; CHECK-NEXT: [[TMP3:%.*]] = icmp eq i32 [[INDEX_NEXT]], 512 +; CHECK-NEXT: br i1 [[TMP3]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP20:![0-9]+]] +; CHECK: [[MIDDLE_BLOCK]]: +; CHECK-NEXT: [[TMP4:%.*]] = call i64 @llvm.experimental.cttz.elts.i64.v4i1(<4 x i1> [[TMP0]], i1 true) +; CHECK-NEXT: [[TMP5:%.*]] = sub i64 [[TMP4]], 1 +; CHECK-NEXT: [[TMP6:%.*]] = sub i64 [[TMP5]], 1 +; CHECK-NEXT: [[TMP7:%.*]] = extractelement <4 x i32> [[TMP1]], i64 [[TMP6]] +; CHECK-NEXT: [[TMP8:%.*]] = extractelement <4 x i32> [[TMP2]], i32 3 +; CHECK-NEXT: [[TMP9:%.*]] = icmp eq i64 [[TMP5]], 0 +; CHECK-NEXT: [[TMP10:%.*]] = select i1 [[TMP9]], i32 [[TMP8]], i32 [[TMP7]] +; CHECK-NEXT: br label %[[EXIT:.*]] ; CHECK: [[EXIT]]: -; CHECK-NEXT: [[FOR_LCSSA:%.*]] = phi i32 [ [[FOR]], %[[LOOP]] ] -; CHECK-NEXT: ret i32 [[FOR_LCSSA]] +; CHECK-NEXT: ret i32 [[TMP10]] ; ; PGSO-LABEL: define i32 @pr45526_pgso( ; PGSO-SAME: ) !prof [[PROF14]] { -; PGSO-NEXT: [[ENTRY:.*]]: -; 
PGSO-NEXT: br label %[[LOOP:.*]] -; PGSO: [[LOOP]]: -; PGSO-NEXT: [[PIV:%.*]] = phi i32 [ 0, %[[ENTRY]] ], [ [[PIVPLUS1:%.*]], %[[LOOP]] ] -; PGSO-NEXT: [[FOR:%.*]] = phi i32 [ 5, %[[ENTRY]] ], [ [[PIVPLUS1]], %[[LOOP]] ] -; PGSO-NEXT: [[PIVPLUS1]] = add nuw nsw i32 [[PIV]], 1 -; PGSO-NEXT: [[COND:%.*]] = icmp ult i32 [[PIV]], 510 -; PGSO-NEXT: br i1 [[COND]], label %[[LOOP]], label %[[EXIT:.*]] +; PGSO-NEXT: [[ENTRY:.*:]] +; PGSO-NEXT: br label %[[VECTOR_PH:.*]] +; PGSO: [[VECTOR_PH]]: +; PGSO-NEXT: br label %[[VECTOR_BODY:.*]] +; PGSO: [[VECTOR_BODY]]: +; PGSO-NEXT: [[INDEX:%.*]] = phi i32 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ] +; PGSO-NEXT: [[VEC_IND:%.*]] = phi <4 x i32> [ , %[[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], %[[VECTOR_BODY]] ] +; PGSO-NEXT: [[TMP2:%.*]] = phi <4 x i32> [ , %[[VECTOR_PH]] ], [ [[TMP1:%.*]], %[[VECTOR_BODY]] ] +; PGSO-NEXT: [[TMP0:%.*]] = icmp ugt <4 x i32> [[VEC_IND]], splat (i32 510) +; PGSO-NEXT: [[TMP1]] = add nuw nsw <4 x i32> [[VEC_IND]], splat (i32 1) +; PGSO-NEXT: [[INDEX_NEXT]] = add nuw i32 [[INDEX]], 4 +; PGSO-NEXT: [[VEC_IND_NEXT]] = add <4 x i32> [[VEC_IND]], splat (i32 4) +; PGSO-NEXT: [[TMP3:%.*]] = icmp eq i32 [[INDEX_NEXT]], 512 +; PGSO-NEXT: br i1 [[TMP3]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP20:![0-9]+]] +; PGSO: [[MIDDLE_BLOCK]]: +; PGSO-NEXT: [[TMP4:%.*]] = call i64 @llvm.experimental.cttz.elts.i64.v4i1(<4 x i1> [[TMP0]], i1 true) +; PGSO-NEXT: [[TMP5:%.*]] = sub i64 [[TMP4]], 1 +; PGSO-NEXT: [[TMP6:%.*]] = sub i64 [[TMP5]], 1 +; PGSO-NEXT: [[TMP7:%.*]] = extractelement <4 x i32> [[TMP1]], i64 [[TMP6]] +; PGSO-NEXT: [[TMP8:%.*]] = extractelement <4 x i32> [[TMP2]], i32 3 +; PGSO-NEXT: [[TMP9:%.*]] = icmp eq i64 [[TMP5]], 0 +; PGSO-NEXT: [[TMP10:%.*]] = select i1 [[TMP9]], i32 [[TMP8]], i32 [[TMP7]] +; PGSO-NEXT: br label %[[EXIT:.*]] ; PGSO: [[EXIT]]: -; PGSO-NEXT: [[FOR_LCSSA:%.*]] = phi i32 [ [[FOR]], %[[LOOP]] ] -; PGSO-NEXT: ret i32 [[FOR_LCSSA]] +; 
PGSO-NEXT: ret i32 [[TMP10]] ; ; NPGSO-LABEL: define i32 @pr45526_pgso( ; NPGSO-SAME: ) !prof [[PROF14]] { @@ -573,7 +643,7 @@ define i32 @pr45526_pgso() !prof !14 { ; NPGSO-NEXT: [[INDEX_NEXT]] = add nuw i32 [[INDEX]], 4 ; NPGSO-NEXT: [[VEC_IND_NEXT]] = add <4 x i32> [[VEC_IND]], splat (i32 4) ; NPGSO-NEXT: [[TMP1:%.*]] = icmp eq i32 [[INDEX_NEXT]], 508 -; NPGSO-NEXT: br i1 [[TMP1]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP22:![0-9]+]] +; NPGSO-NEXT: br i1 [[TMP1]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP23:![0-9]+]] ; NPGSO: [[MIDDLE_BLOCK]]: ; NPGSO-NEXT: [[VECTOR_RECUR_EXTRACT:%.*]] = extractelement <4 x i32> [[TMP0]], i32 3 ; NPGSO-NEXT: br label %[[SCALAR_PH:.*]] @@ -584,7 +654,7 @@ define i32 @pr45526_pgso() !prof !14 { ; NPGSO-NEXT: [[FOR:%.*]] = phi i32 [ [[VECTOR_RECUR_EXTRACT]], %[[SCALAR_PH]] ], [ [[PIVPLUS1]], %[[LOOP]] ] ; NPGSO-NEXT: [[PIVPLUS1]] = add nuw nsw i32 [[PIV]], 1 ; NPGSO-NEXT: [[COND:%.*]] = icmp ult i32 [[PIV]], 510 -; NPGSO-NEXT: br i1 [[COND]], label %[[LOOP]], label %[[EXIT:.*]], !llvm.loop [[LOOP23:![0-9]+]] +; NPGSO-NEXT: br i1 [[COND]], label %[[LOOP]], label %[[EXIT:.*]], !llvm.loop [[LOOP24:![0-9]+]] ; NPGSO: [[EXIT]]: ; NPGSO-NEXT: [[FOR_LCSSA:%.*]] = phi i32 [ [[FOR]], %[[LOOP]] ] ; NPGSO-NEXT: ret i32 [[FOR_LCSSA]] @@ -640,7 +710,7 @@ define void @stride1(ptr noalias %B, i32 %BStride) optsize { ; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i32 [[INDEX]], 2 ; CHECK-NEXT: [[VEC_IND_NEXT]] = add <2 x i32> [[VEC_IND]], splat (i32 2) ; CHECK-NEXT: [[TMP8:%.*]] = icmp eq i32 [[INDEX_NEXT]], 1026 -; CHECK-NEXT: br i1 [[TMP8]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP19:![0-9]+]] +; CHECK-NEXT: br i1 [[TMP8]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP21:![0-9]+]] ; CHECK: [[MIDDLE_BLOCK]]: ; CHECK-NEXT: br label %[[FOR_END:.*]] ; CHECK: [[FOR_END]]: @@ -678,7 +748,7 @@ define void @stride1(ptr noalias %B, i32 %BStride) optsize 
{ ; PGSO-NEXT: [[INDEX_NEXT]] = add nuw i32 [[INDEX]], 2 ; PGSO-NEXT: [[VEC_IND_NEXT]] = add <2 x i32> [[VEC_IND]], splat (i32 2) ; PGSO-NEXT: [[TMP8:%.*]] = icmp eq i32 [[INDEX_NEXT]], 1026 -; PGSO-NEXT: br i1 [[TMP8]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP19:![0-9]+]] +; PGSO-NEXT: br i1 [[TMP8]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP21:![0-9]+]] ; PGSO: [[MIDDLE_BLOCK]]: ; PGSO-NEXT: br label %[[FOR_END:.*]] ; PGSO: [[FOR_END]]: @@ -716,7 +786,7 @@ define void @stride1(ptr noalias %B, i32 %BStride) optsize { ; NPGSO-NEXT: [[INDEX_NEXT]] = add nuw i32 [[INDEX]], 2 ; NPGSO-NEXT: [[VEC_IND_NEXT]] = add <2 x i32> [[VEC_IND]], splat (i32 2) ; NPGSO-NEXT: [[TMP8:%.*]] = icmp eq i32 [[INDEX_NEXT]], 1026 -; NPGSO-NEXT: br i1 [[TMP8]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP24:![0-9]+]] +; NPGSO-NEXT: br i1 [[TMP8]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP25:![0-9]+]] ; NPGSO: [[MIDDLE_BLOCK]]: ; NPGSO-NEXT: br label %[[FOR_END:.*]] ; NPGSO: [[FOR_END]]: @@ -757,7 +827,7 @@ define void @stride1_pgso(ptr noalias %B, i32 %BStride) !prof !14 { ; CHECK-NEXT: store <2 x i16> splat (i16 42), ptr [[TMP1]], align 4 ; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i32 [[TMP0]], 2 ; CHECK-NEXT: [[TMP3:%.*]] = icmp eq i32 [[INDEX_NEXT]], 1024 -; CHECK-NEXT: br i1 [[TMP3]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP20:![0-9]+]] +; CHECK-NEXT: br i1 [[TMP3]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP22:![0-9]+]] ; CHECK: [[MIDDLE_BLOCK]]: ; CHECK-NEXT: br label %[[SCALAR_PH]] ; CHECK: [[SCALAR_PH]]: @@ -770,7 +840,7 @@ define void @stride1_pgso(ptr noalias %B, i32 %BStride) !prof !14 { ; CHECK-NEXT: store i16 42, ptr [[GEPOFB]], align 4 ; CHECK-NEXT: [[IV_NEXT]] = add nuw nsw i32 [[IV]], 1 ; CHECK-NEXT: [[EXITCOND:%.*]] = icmp eq i32 [[IV_NEXT]], 1025 -; CHECK-NEXT: br i1 [[EXITCOND]], label %[[FOR_END:.*]], label 
%[[FOR_BODY]], !llvm.loop [[LOOP21:![0-9]+]] +; CHECK-NEXT: br i1 [[EXITCOND]], label %[[FOR_END:.*]], label %[[FOR_BODY]], !llvm.loop [[LOOP23:![0-9]+]] ; CHECK: [[FOR_END]]: ; CHECK-NEXT: ret void ; @@ -789,7 +859,7 @@ define void @stride1_pgso(ptr noalias %B, i32 %BStride) !prof !14 { ; PGSO-NEXT: store <2 x i16> splat (i16 42), ptr [[TMP1]], align 4 ; PGSO-NEXT: [[INDEX_NEXT]] = add nuw i32 [[TMP0]], 2 ; PGSO-NEXT: [[TMP3:%.*]] = icmp eq i32 [[INDEX_NEXT]], 1024 -; PGSO-NEXT: br i1 [[TMP3]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP20:![0-9]+]] +; PGSO-NEXT: br i1 [[TMP3]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP22:![0-9]+]] ; PGSO: [[MIDDLE_BLOCK]]: ; PGSO-NEXT: br label %[[SCALAR_PH]] ; PGSO: [[SCALAR_PH]]: @@ -802,7 +872,7 @@ define void @stride1_pgso(ptr noalias %B, i32 %BStride) !prof !14 { ; PGSO-NEXT: store i16 42, ptr [[GEPOFB]], align 4 ; PGSO-NEXT: [[IV_NEXT]] = add nuw nsw i32 [[IV]], 1 ; PGSO-NEXT: [[EXITCOND:%.*]] = icmp eq i32 [[IV_NEXT]], 1025 -; PGSO-NEXT: br i1 [[EXITCOND]], label %[[FOR_END:.*]], label %[[FOR_BODY]], !llvm.loop [[LOOP21:![0-9]+]] +; PGSO-NEXT: br i1 [[EXITCOND]], label %[[FOR_END:.*]], label %[[FOR_BODY]], !llvm.loop [[LOOP23:![0-9]+]] ; PGSO: [[FOR_END]]: ; PGSO-NEXT: ret void ; @@ -821,7 +891,7 @@ define void @stride1_pgso(ptr noalias %B, i32 %BStride) !prof !14 { ; NPGSO-NEXT: store <2 x i16> splat (i16 42), ptr [[TMP1]], align 4 ; NPGSO-NEXT: [[INDEX_NEXT]] = add nuw i32 [[TMP0]], 2 ; NPGSO-NEXT: [[TMP3:%.*]] = icmp eq i32 [[INDEX_NEXT]], 1024 -; NPGSO-NEXT: br i1 [[TMP3]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP25:![0-9]+]] +; NPGSO-NEXT: br i1 [[TMP3]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP26:![0-9]+]] ; NPGSO: [[MIDDLE_BLOCK]]: ; NPGSO-NEXT: br label %[[SCALAR_PH]] ; NPGSO: [[SCALAR_PH]]: @@ -834,7 +904,7 @@ define void @stride1_pgso(ptr noalias %B, i32 %BStride) !prof !14 { ; NPGSO-NEXT: store i16 42, 
ptr [[GEPOFB]], align 4 ; NPGSO-NEXT: [[IV_NEXT]] = add nuw nsw i32 [[IV]], 1 ; NPGSO-NEXT: [[EXITCOND:%.*]] = icmp eq i32 [[IV_NEXT]], 1025 -; NPGSO-NEXT: br i1 [[EXITCOND]], label %[[FOR_END:.*]], label %[[FOR_BODY]], !llvm.loop [[LOOP26:![0-9]+]] +; NPGSO-NEXT: br i1 [[EXITCOND]], label %[[FOR_END:.*]], label %[[FOR_BODY]], !llvm.loop [[LOOP27:![0-9]+]] ; NPGSO: [[FOR_END]]: ; NPGSO-NEXT: ret void ; @@ -1020,7 +1090,9 @@ exit: ; CHECK: [[LOOP18]] = distinct !{[[LOOP18]], [[META16]], [[META17]]} ; CHECK: [[LOOP19]] = distinct !{[[LOOP19]], [[META16]], [[META17]]} ; CHECK: [[LOOP20]] = distinct !{[[LOOP20]], [[META16]], [[META17]]} -; CHECK: [[LOOP21]] = distinct !{[[LOOP21]], [[META16]]} +; CHECK: [[LOOP21]] = distinct !{[[LOOP21]], [[META16]], [[META17]]} +; CHECK: [[LOOP22]] = distinct !{[[LOOP22]], [[META16]], [[META17]]} +; CHECK: [[LOOP23]] = distinct !{[[LOOP23]], [[META16]]} ;. ; PGSO: [[PROF14]] = !{!"function_entry_count", i64 0} ; PGSO: [[LOOP15]] = distinct !{[[LOOP15]], [[META16:![0-9]+]], [[META17:![0-9]+]]} @@ -1029,7 +1101,9 @@ exit: ; PGSO: [[LOOP18]] = distinct !{[[LOOP18]], [[META16]], [[META17]]} ; PGSO: [[LOOP19]] = distinct !{[[LOOP19]], [[META16]], [[META17]]} ; PGSO: [[LOOP20]] = distinct !{[[LOOP20]], [[META16]], [[META17]]} -; PGSO: [[LOOP21]] = distinct !{[[LOOP21]], [[META16]]} +; PGSO: [[LOOP21]] = distinct !{[[LOOP21]], [[META16]], [[META17]]} +; PGSO: [[LOOP22]] = distinct !{[[LOOP22]], [[META16]], [[META17]]} +; PGSO: [[LOOP23]] = distinct !{[[LOOP23]], [[META16]]} ;. 
; NPGSO: [[PROF14]] = !{!"function_entry_count", i64 0} ; NPGSO: [[LOOP15]] = distinct !{[[LOOP15]], [[META16:![0-9]+]], [[META17:![0-9]+]]} @@ -1040,8 +1114,9 @@ exit: ; NPGSO: [[LOOP20]] = distinct !{[[LOOP20]], [[META16]], [[META17]]} ; NPGSO: [[LOOP21]] = distinct !{[[LOOP21]], [[META16]]} ; NPGSO: [[LOOP22]] = distinct !{[[LOOP22]], [[META16]], [[META17]]} -; NPGSO: [[LOOP23]] = distinct !{[[LOOP23]], [[META17]], [[META16]]} -; NPGSO: [[LOOP24]] = distinct !{[[LOOP24]], [[META16]], [[META17]]} +; NPGSO: [[LOOP23]] = distinct !{[[LOOP23]], [[META16]], [[META17]]} +; NPGSO: [[LOOP24]] = distinct !{[[LOOP24]], [[META17]], [[META16]]} ; NPGSO: [[LOOP25]] = distinct !{[[LOOP25]], [[META16]], [[META17]]} -; NPGSO: [[LOOP26]] = distinct !{[[LOOP26]], [[META16]]} +; NPGSO: [[LOOP26]] = distinct !{[[LOOP26]], [[META16]], [[META17]]} +; NPGSO: [[LOOP27]] = distinct !{[[LOOP27]], [[META16]]} ;. diff --git a/llvm/test/Transforms/LoopVectorize/pr43166-fold-tail-by-masking.ll b/llvm/test/Transforms/LoopVectorize/pr43166-fold-tail-by-masking.ll index cbc9fccebb881..960065b09cfd1 100644 --- a/llvm/test/Transforms/LoopVectorize/pr43166-fold-tail-by-masking.ll +++ b/llvm/test/Transforms/LoopVectorize/pr43166-fold-tail-by-masking.ll @@ -39,22 +39,24 @@ define i64 @test1(i64 %y) { ; CHECK-LABEL: @test1( ; CHECK-NEXT: entry: +; CHECK-NEXT: br label [[VECTOR_PH:%.*]] +; CHECK: vector.ph: +; CHECK-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <4 x i64> poison, i64 [[Y:%.*]], i64 0 +; CHECK-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <4 x i64> [[BROADCAST_SPLATINSERT]], <4 x i64> poison, <4 x i32> zeroinitializer +; CHECK-NEXT: [[TMP0:%.*]] = icmp eq <4 x i64> [[BROADCAST_SPLAT]], zeroinitializer +; CHECK-NEXT: [[TMP1:%.*]] = xor <4 x i64> splat (i64 3), [[BROADCAST_SPLAT]] +; CHECK-NEXT: [[TMP2:%.*]] = extractelement <4 x i1> [[TMP0]], i32 0 +; CHECK-NEXT: [[PREDPHI:%.*]] = select i1 [[TMP2]], <4 x i64> splat (i64 77), <4 x i64> [[TMP1]] ; CHECK-NEXT: br label 
[[FOR_BODY:%.*]] -; CHECK: for.body: -; CHECK-NEXT: [[I:%.*]] = phi i32 [ 0, [[ENTRY:%.*]] ], [ [[INC:%.*]], [[COND_END:%.*]] ] -; CHECK-NEXT: [[CMP:%.*]] = icmp eq i64 [[Y:%.*]], 0 -; CHECK-NEXT: br i1 [[CMP]], label [[COND_END]], label [[COND_FALSE:%.*]] -; CHECK: cond.false: -; CHECK-NEXT: [[DIV:%.*]] = xor i64 3, [[Y]] -; CHECK-NEXT: br label [[COND_END]] -; CHECK: cond.end: -; CHECK-NEXT: [[COND:%.*]] = phi i64 [ [[DIV]], [[COND_FALSE]] ], [ 77, [[FOR_BODY]] ] -; CHECK-NEXT: [[INC]] = add nuw nsw i32 [[I]], 1 -; CHECK-NEXT: [[EXITCOND:%.*]] = icmp eq i32 [[INC]], 3 -; CHECK-NEXT: br i1 [[EXITCOND]], label [[FOR_COND_CLEANUP:%.*]], label [[FOR_BODY]] +; CHECK: vector.body: +; CHECK-NEXT: br label [[MIDDLE_BLOCK:%.*]] +; CHECK: middle.block: +; CHECK-NEXT: [[TMP3:%.*]] = call i64 @llvm.experimental.cttz.elts.i64.v4i1(<4 x i1> , i1 true) +; CHECK-NEXT: [[TMP4:%.*]] = sub i64 [[TMP3]], 1 +; CHECK-NEXT: [[TMP5:%.*]] = extractelement <4 x i64> [[PREDPHI]], i64 [[TMP4]] +; CHECK-NEXT: br label [[COND_END:%.*]] ; CHECK: for.cond.cleanup: -; CHECK-NEXT: [[COND_LCSSA:%.*]] = phi i64 [ [[COND]], [[COND_END]] ] -; CHECK-NEXT: ret i64 [[COND_LCSSA]] +; CHECK-NEXT: ret i64 [[TMP5]] ; entry: br label %for.body @@ -84,21 +86,23 @@ for.cond.cleanup: define i64 @test2(i64 %y) { ; CHECK-LABEL: @test2( ; CHECK-NEXT: entry: +; CHECK-NEXT: br label [[VECTOR_PH:%.*]] +; CHECK: vector.ph: +; CHECK-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <4 x i64> poison, i64 [[Y:%.*]], i64 0 +; CHECK-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <4 x i64> [[BROADCAST_SPLATINSERT]], <4 x i64> poison, <4 x i32> zeroinitializer +; CHECK-NEXT: [[TMP0:%.*]] = icmp eq <4 x i64> [[BROADCAST_SPLAT]], zeroinitializer +; CHECK-NEXT: [[TMP1:%.*]] = extractelement <4 x i1> [[TMP0]], i32 0 +; CHECK-NEXT: [[PREDPHI:%.*]] = select i1 [[TMP1]], <4 x i64> splat (i64 77), <4 x i64> splat (i64 55) ; CHECK-NEXT: br label [[FOR_BODY:%.*]] -; CHECK: for.body: -; CHECK-NEXT: [[I:%.*]] = phi i32 [ 0, 
[[ENTRY:%.*]] ], [ [[INC:%.*]], [[COND_END:%.*]] ] -; CHECK-NEXT: [[CMP:%.*]] = icmp eq i64 [[Y:%.*]], 0 -; CHECK-NEXT: br i1 [[CMP]], label [[COND_END]], label [[COND_FALSE:%.*]] -; CHECK: cond.false: -; CHECK-NEXT: br label [[COND_END]] -; CHECK: cond.end: -; CHECK-NEXT: [[COND:%.*]] = phi i64 [ 55, [[COND_FALSE]] ], [ 77, [[FOR_BODY]] ] -; CHECK-NEXT: [[INC]] = add nuw nsw i32 [[I]], 1 -; CHECK-NEXT: [[EXITCOND:%.*]] = icmp eq i32 [[INC]], 3 -; CHECK-NEXT: br i1 [[EXITCOND]], label [[FOR_COND_CLEANUP:%.*]], label [[FOR_BODY]] +; CHECK: vector.body: +; CHECK-NEXT: br label [[MIDDLE_BLOCK:%.*]] +; CHECK: middle.block: +; CHECK-NEXT: [[TMP2:%.*]] = call i64 @llvm.experimental.cttz.elts.i64.v4i1(<4 x i1> , i1 true) +; CHECK-NEXT: [[TMP3:%.*]] = sub i64 [[TMP2]], 1 +; CHECK-NEXT: [[TMP4:%.*]] = extractelement <4 x i64> [[PREDPHI]], i64 [[TMP3]] +; CHECK-NEXT: br label [[COND_END:%.*]] ; CHECK: for.cond.cleanup: -; CHECK-NEXT: [[COND_LCSSA:%.*]] = phi i64 [ [[COND]], [[COND_END]] ] -; CHECK-NEXT: ret i64 [[COND_LCSSA]] +; CHECK-NEXT: ret i64 [[TMP4]] ; entry: br label %for.body @@ -127,21 +131,23 @@ for.cond.cleanup: define i32 @test3(i64 %y) { ; CHECK-LABEL: @test3( ; CHECK-NEXT: entry: +; CHECK-NEXT: br label [[VECTOR_PH:%.*]] +; CHECK: vector.ph: +; CHECK-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <4 x i64> poison, i64 [[Y:%.*]], i64 0 +; CHECK-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <4 x i64> [[BROADCAST_SPLATINSERT]], <4 x i64> poison, <4 x i32> zeroinitializer +; CHECK-NEXT: [[TMP0:%.*]] = icmp eq <4 x i64> [[BROADCAST_SPLAT]], zeroinitializer ; CHECK-NEXT: br label [[FOR_BODY:%.*]] -; CHECK: for.body: -; CHECK-NEXT: [[I:%.*]] = phi i32 [ 0, [[ENTRY:%.*]] ], [ [[INC:%.*]], [[COND_END:%.*]] ] -; CHECK-NEXT: [[CMP:%.*]] = icmp eq i64 [[Y:%.*]], 0 -; CHECK-NEXT: br i1 [[CMP]], label [[COND_END]], label [[COND_FALSE:%.*]] -; CHECK: cond.false: -; CHECK-NEXT: br label [[COND_END]] -; CHECK: cond.end: -; CHECK-NEXT: [[COND:%.*]] = phi i32 [ 55, 
[[COND_FALSE]] ], [ [[I]], [[FOR_BODY]] ] -; CHECK-NEXT: [[INC]] = add nuw nsw i32 [[I]], 1 -; CHECK-NEXT: [[EXITCOND:%.*]] = icmp eq i32 [[INC]], 3 -; CHECK-NEXT: br i1 [[EXITCOND]], label [[FOR_COND_CLEANUP:%.*]], label [[FOR_BODY]] +; CHECK: vector.body: +; CHECK-NEXT: [[TMP1:%.*]] = extractelement <4 x i1> [[TMP0]], i32 0 +; CHECK-NEXT: [[PREDPHI:%.*]] = select i1 [[TMP1]], <4 x i32> , <4 x i32> splat (i32 55) +; CHECK-NEXT: br label [[MIDDLE_BLOCK:%.*]] +; CHECK: middle.block: +; CHECK-NEXT: [[TMP2:%.*]] = call i64 @llvm.experimental.cttz.elts.i64.v4i1(<4 x i1> , i1 true) +; CHECK-NEXT: [[TMP3:%.*]] = sub i64 [[TMP2]], 1 +; CHECK-NEXT: [[TMP4:%.*]] = extractelement <4 x i32> [[PREDPHI]], i64 [[TMP3]] +; CHECK-NEXT: br label [[COND_END:%.*]] ; CHECK: for.cond.cleanup: -; CHECK-NEXT: [[COND_LCSSA:%.*]] = phi i32 [ [[COND]], [[COND_END]] ] -; CHECK-NEXT: ret i32 [[COND_LCSSA]] +; CHECK-NEXT: ret i32 [[TMP4]] ; entry: br label %for.body diff --git a/llvm/test/Transforms/LoopVectorize/use-scalar-epilogue-if-tp-fails.ll b/llvm/test/Transforms/LoopVectorize/use-scalar-epilogue-if-tp-fails.ll index 3b34b75a4c511..52dbe931db8bc 100644 --- a/llvm/test/Transforms/LoopVectorize/use-scalar-epilogue-if-tp-fails.ll +++ b/llvm/test/Transforms/LoopVectorize/use-scalar-epilogue-if-tp-fails.ll @@ -1,5 +1,5 @@ ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py -; RUN: opt -S -passes=loop-vectorize -prefer-predicate-over-epilogue=predicate-else-scalar-epilogue < %s | FileCheck %s +; RUN: opt -S -passes=loop-vectorize -prefer-predicate-over-epilogue=predicate-else-scalar-epilogue < %s | FileCheck --check-prefix=FORCED-TF %s ; RUN: opt -S -passes=loop-vectorize < %s | FileCheck %s ; This tests should produce the same result as with default options, and when tail folding @@ -13,6 +13,24 @@ target datalayout = "e-m:e-p:32:32-Fi8-i64:64-v128:64:128-a:0:32-n32-S64" define void @basic_loop(ptr nocapture readonly %ptr, i32 %size, ptr %pos) { +; FORCED-TF-LABEL: 
@basic_loop( +; FORCED-TF-NEXT: header: +; FORCED-TF-NEXT: [[PTR0:%.*]] = load ptr, ptr [[POS:%.*]], align 4 +; FORCED-TF-NEXT: br label [[BODY:%.*]] +; FORCED-TF: body: +; FORCED-TF-NEXT: [[DEC66:%.*]] = phi i32 [ [[DEC:%.*]], [[BODY]] ], [ [[SIZE:%.*]], [[HEADER:%.*]] ] +; FORCED-TF-NEXT: [[BUFF:%.*]] = phi ptr [ [[INCDEC_PTR:%.*]], [[BODY]] ], [ [[PTR:%.*]], [[HEADER]] ] +; FORCED-TF-NEXT: [[INCDEC_PTR]] = getelementptr inbounds i8, ptr [[BUFF]], i32 1 +; FORCED-TF-NEXT: [[DEC]] = add nsw i32 [[DEC66]], -1 +; FORCED-TF-NEXT: [[TMP0:%.*]] = load i8, ptr [[INCDEC_PTR]], align 1 +; FORCED-TF-NEXT: store i8 [[TMP0]], ptr [[BUFF]], align 1 +; FORCED-TF-NEXT: [[TOBOOL11:%.*]] = icmp eq i32 [[DEC]], 0 +; FORCED-TF-NEXT: br i1 [[TOBOOL11]], label [[END:%.*]], label [[BODY]] +; FORCED-TF: end: +; FORCED-TF-NEXT: [[INCDEC_PTR_LCSSA:%.*]] = phi ptr [ [[INCDEC_PTR]], [[BODY]] ] +; FORCED-TF-NEXT: store ptr [[INCDEC_PTR_LCSSA]], ptr [[POS]], align 4 +; FORCED-TF-NEXT: ret void +; ; CHECK-LABEL: @basic_loop( ; CHECK-NEXT: header: ; CHECK-NEXT: [[PTR0:%.*]] = load ptr, ptr [[POS:%.*]], align 4 @@ -21,36 +39,36 @@ define void @basic_loop(ptr nocapture readonly %ptr, i32 %size, ptr %pos) { ; CHECK: vector.ph: ; CHECK-NEXT: [[N_MOD_VF:%.*]] = urem i32 [[SIZE]], 4 ; CHECK-NEXT: [[N_VEC:%.*]] = sub i32 [[SIZE]], [[N_MOD_VF]] -; CHECK-NEXT: [[IND_END:%.*]] = sub i32 [[SIZE]], [[N_VEC]] -; CHECK-NEXT: [[IND_END1:%.*]] = getelementptr i8, ptr [[PTR:%.*]], i32 [[N_VEC]] +; CHECK-NEXT: [[TMP0:%.*]] = sub i32 [[SIZE]], [[N_VEC]] +; CHECK-NEXT: [[TMP1:%.*]] = getelementptr i8, ptr [[PTR:%.*]], i32 [[N_VEC]] ; CHECK-NEXT: br label [[VECTOR_BODY:%.*]] ; CHECK: vector.body: ; CHECK-NEXT: [[INDEX:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] ; CHECK-NEXT: [[NEXT_GEP:%.*]] = getelementptr i8, ptr [[PTR]], i32 [[INDEX]] -; CHECK-NEXT: [[TMP1:%.*]] = getelementptr inbounds i8, ptr [[NEXT_GEP]], i32 1 -; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <4 x i8>, ptr [[TMP1]], 
align 1 +; CHECK-NEXT: [[TMP2:%.*]] = getelementptr inbounds i8, ptr [[NEXT_GEP]], i32 1 +; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <4 x i8>, ptr [[TMP2]], align 1 ; CHECK-NEXT: store <4 x i8> [[WIDE_LOAD]], ptr [[NEXT_GEP]], align 1 ; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i32 [[INDEX]], 4 -; CHECK-NEXT: [[TMP4:%.*]] = icmp eq i32 [[INDEX_NEXT]], [[N_VEC]] -; CHECK-NEXT: br i1 [[TMP4]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]] +; CHECK-NEXT: [[TMP3:%.*]] = icmp eq i32 [[INDEX_NEXT]], [[N_VEC]] +; CHECK-NEXT: br i1 [[TMP3]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]] ; CHECK: middle.block: ; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i32 [[SIZE]], [[N_VEC]] ; CHECK-NEXT: br i1 [[CMP_N]], label [[END:%.*]], label [[SCALAR_PH]] ; CHECK: scalar.ph: -; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i32 [ [[IND_END]], [[MIDDLE_BLOCK]] ], [ [[SIZE]], [[HEADER:%.*]] ] -; CHECK-NEXT: [[BC_RESUME_VAL2:%.*]] = phi ptr [ [[IND_END1]], [[MIDDLE_BLOCK]] ], [ [[PTR]], [[HEADER]] ] +; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i32 [ [[TMP0]], [[MIDDLE_BLOCK]] ], [ [[SIZE]], [[HEADER:%.*]] ] +; CHECK-NEXT: [[BC_RESUME_VAL1:%.*]] = phi ptr [ [[TMP1]], [[MIDDLE_BLOCK]] ], [ [[PTR]], [[HEADER]] ] ; CHECK-NEXT: br label [[BODY:%.*]] ; CHECK: body: ; CHECK-NEXT: [[DEC66:%.*]] = phi i32 [ [[DEC:%.*]], [[BODY]] ], [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ] -; CHECK-NEXT: [[BUFF:%.*]] = phi ptr [ [[INCDEC_PTR:%.*]], [[BODY]] ], [ [[BC_RESUME_VAL2]], [[SCALAR_PH]] ] +; CHECK-NEXT: [[BUFF:%.*]] = phi ptr [ [[INCDEC_PTR:%.*]], [[BODY]] ], [ [[BC_RESUME_VAL1]], [[SCALAR_PH]] ] ; CHECK-NEXT: [[INCDEC_PTR]] = getelementptr inbounds i8, ptr [[BUFF]], i32 1 ; CHECK-NEXT: [[DEC]] = add nsw i32 [[DEC66]], -1 -; CHECK-NEXT: [[TMP5:%.*]] = load i8, ptr [[INCDEC_PTR]], align 1 -; CHECK-NEXT: store i8 [[TMP5]], ptr [[BUFF]], align 1 +; CHECK-NEXT: [[TMP4:%.*]] = load i8, ptr [[INCDEC_PTR]], align 1 +; CHECK-NEXT: store i8 [[TMP4]], ptr [[BUFF]], align 1 ; 
CHECK-NEXT: [[TOBOOL11:%.*]] = icmp eq i32 [[DEC]], 0 ; CHECK-NEXT: br i1 [[TOBOOL11]], label [[END]], label [[BODY]], !llvm.loop [[LOOP3:![0-9]+]] ; CHECK: end: -; CHECK-NEXT: [[INCDEC_PTR_LCSSA:%.*]] = phi ptr [ [[INCDEC_PTR]], [[BODY]] ], [ [[IND_END1]], [[MIDDLE_BLOCK]] ] +; CHECK-NEXT: [[INCDEC_PTR_LCSSA:%.*]] = phi ptr [ [[INCDEC_PTR]], [[BODY]] ], [ [[TMP1]], [[MIDDLE_BLOCK]] ] ; CHECK-NEXT: store ptr [[INCDEC_PTR_LCSSA]], ptr [[POS]], align 4 ; CHECK-NEXT: ret void ; @@ -74,45 +92,162 @@ end: } define void @metadata(ptr nocapture readonly %ptr, i32 %size, ptr %pos) { +; FORCED-TF-LABEL: @metadata( +; FORCED-TF-NEXT: header: +; FORCED-TF-NEXT: [[PTR0:%.*]] = load ptr, ptr [[POS:%.*]], align 4 +; FORCED-TF-NEXT: br label [[VECTOR_PH:%.*]] +; FORCED-TF: vector.ph: +; FORCED-TF-NEXT: [[N_RND_UP:%.*]] = add i32 [[SIZE:%.*]], 3 +; FORCED-TF-NEXT: [[N_MOD_VF:%.*]] = urem i32 [[N_RND_UP]], 4 +; FORCED-TF-NEXT: [[N_VEC:%.*]] = sub i32 [[N_RND_UP]], [[N_MOD_VF]] +; FORCED-TF-NEXT: [[TRIP_COUNT_MINUS_1:%.*]] = sub i32 [[SIZE]], 1 +; FORCED-TF-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <4 x i32> poison, i32 [[TRIP_COUNT_MINUS_1]], i64 0 +; FORCED-TF-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <4 x i32> [[BROADCAST_SPLATINSERT]], <4 x i32> poison, <4 x i32> zeroinitializer +; FORCED-TF-NEXT: br label [[VECTOR_BODY:%.*]] +; FORCED-TF: vector.body: +; FORCED-TF-NEXT: [[INDEX:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[PRED_STORE_CONTINUE11:%.*]] ] +; FORCED-TF-NEXT: [[TMP0:%.*]] = add i32 [[INDEX]], 0 +; FORCED-TF-NEXT: [[TMP1:%.*]] = add i32 [[INDEX]], 1 +; FORCED-TF-NEXT: [[TMP2:%.*]] = add i32 [[INDEX]], 2 +; FORCED-TF-NEXT: [[TMP3:%.*]] = add i32 [[INDEX]], 3 +; FORCED-TF-NEXT: [[NEXT_GEP:%.*]] = getelementptr i8, ptr [[PTR:%.*]], i32 [[TMP0]] +; FORCED-TF-NEXT: [[NEXT_GEP1:%.*]] = getelementptr i8, ptr [[PTR]], i32 [[TMP1]] +; FORCED-TF-NEXT: [[NEXT_GEP2:%.*]] = getelementptr i8, ptr [[PTR]], i32 [[TMP2]] +; FORCED-TF-NEXT: 
[[NEXT_GEP3:%.*]] = getelementptr i8, ptr [[PTR]], i32 [[TMP3]] +; FORCED-TF-NEXT: [[TMP4:%.*]] = insertelement <4 x ptr> poison, ptr [[NEXT_GEP]], i32 0 +; FORCED-TF-NEXT: [[TMP5:%.*]] = insertelement <4 x ptr> [[TMP4]], ptr [[NEXT_GEP1]], i32 1 +; FORCED-TF-NEXT: [[TMP6:%.*]] = insertelement <4 x ptr> [[TMP5]], ptr [[NEXT_GEP2]], i32 2 +; FORCED-TF-NEXT: [[TMP7:%.*]] = insertelement <4 x ptr> [[TMP6]], ptr [[NEXT_GEP3]], i32 3 +; FORCED-TF-NEXT: [[BROADCAST_SPLATINSERT4:%.*]] = insertelement <4 x i32> poison, i32 [[INDEX]], i64 0 +; FORCED-TF-NEXT: [[BROADCAST_SPLAT5:%.*]] = shufflevector <4 x i32> [[BROADCAST_SPLATINSERT4]], <4 x i32> poison, <4 x i32> zeroinitializer +; FORCED-TF-NEXT: [[VEC_IV:%.*]] = add <4 x i32> [[BROADCAST_SPLAT5]], +; FORCED-TF-NEXT: [[TMP8:%.*]] = icmp ule <4 x i32> [[VEC_IV]], [[BROADCAST_SPLAT]] +; FORCED-TF-NEXT: [[TMP9:%.*]] = getelementptr inbounds i8, ptr [[NEXT_GEP]], i32 1 +; FORCED-TF-NEXT: [[TMP10:%.*]] = getelementptr inbounds i8, ptr [[NEXT_GEP1]], i32 1 +; FORCED-TF-NEXT: [[TMP11:%.*]] = getelementptr inbounds i8, ptr [[NEXT_GEP2]], i32 1 +; FORCED-TF-NEXT: [[TMP12:%.*]] = getelementptr inbounds i8, ptr [[NEXT_GEP3]], i32 1 +; FORCED-TF-NEXT: [[TMP13:%.*]] = insertelement <4 x ptr> poison, ptr [[TMP9]], i32 0 +; FORCED-TF-NEXT: [[TMP14:%.*]] = insertelement <4 x ptr> [[TMP13]], ptr [[TMP10]], i32 1 +; FORCED-TF-NEXT: [[TMP15:%.*]] = insertelement <4 x ptr> [[TMP14]], ptr [[TMP11]], i32 2 +; FORCED-TF-NEXT: [[TMP16:%.*]] = insertelement <4 x ptr> [[TMP15]], ptr [[TMP12]], i32 3 +; FORCED-TF-NEXT: [[TMP17:%.*]] = extractelement <4 x i1> [[TMP8]], i32 0 +; FORCED-TF-NEXT: br i1 [[TMP17]], label [[PRED_STORE_IF:%.*]], label [[PRED_STORE_CONTINUE:%.*]] +; FORCED-TF: pred.store.if: +; FORCED-TF-NEXT: [[TMP18:%.*]] = load i8, ptr [[TMP9]], align 1 +; FORCED-TF-NEXT: store i8 [[TMP18]], ptr [[NEXT_GEP]], align 1 +; FORCED-TF-NEXT: br label [[PRED_STORE_CONTINUE]] +; FORCED-TF: pred.store.continue: +; FORCED-TF-NEXT: [[TMP19:%.*]] = 
extractelement <4 x i1> [[TMP8]], i32 1 +; FORCED-TF-NEXT: br i1 [[TMP19]], label [[PRED_STORE_IF6:%.*]], label [[PRED_STORE_CONTINUE7:%.*]] +; FORCED-TF: pred.store.if6: +; FORCED-TF-NEXT: [[TMP20:%.*]] = load i8, ptr [[TMP10]], align 1 +; FORCED-TF-NEXT: store i8 [[TMP20]], ptr [[NEXT_GEP1]], align 1 +; FORCED-TF-NEXT: br label [[PRED_STORE_CONTINUE7]] +; FORCED-TF: pred.store.continue7: +; FORCED-TF-NEXT: [[TMP21:%.*]] = extractelement <4 x i1> [[TMP8]], i32 2 +; FORCED-TF-NEXT: br i1 [[TMP21]], label [[PRED_STORE_IF8:%.*]], label [[PRED_STORE_CONTINUE9:%.*]] +; FORCED-TF: pred.store.if8: +; FORCED-TF-NEXT: [[TMP22:%.*]] = load i8, ptr [[TMP11]], align 1 +; FORCED-TF-NEXT: store i8 [[TMP22]], ptr [[NEXT_GEP2]], align 1 +; FORCED-TF-NEXT: br label [[PRED_STORE_CONTINUE9]] +; FORCED-TF: pred.store.continue9: +; FORCED-TF-NEXT: [[TMP23:%.*]] = extractelement <4 x i1> [[TMP8]], i32 3 +; FORCED-TF-NEXT: br i1 [[TMP23]], label [[PRED_STORE_IF10:%.*]], label [[PRED_STORE_CONTINUE11]] +; FORCED-TF: pred.store.if10: +; FORCED-TF-NEXT: [[TMP24:%.*]] = load i8, ptr [[TMP12]], align 1 +; FORCED-TF-NEXT: store i8 [[TMP24]], ptr [[NEXT_GEP3]], align 1 +; FORCED-TF-NEXT: br label [[PRED_STORE_CONTINUE11]] +; FORCED-TF: pred.store.continue11: +; FORCED-TF-NEXT: [[INDEX_NEXT]] = add i32 [[INDEX]], 4 +; FORCED-TF-NEXT: [[TMP25:%.*]] = icmp eq i32 [[INDEX_NEXT]], [[N_VEC]] +; FORCED-TF-NEXT: br i1 [[TMP25]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]] +; FORCED-TF: middle.block: +; FORCED-TF-NEXT: [[TMP26:%.*]] = xor <4 x i1> [[TMP8]], splat (i1 true) +; FORCED-TF-NEXT: [[TMP27:%.*]] = call i64 @llvm.experimental.cttz.elts.i64.v4i1(<4 x i1> [[TMP26]], i1 true) +; FORCED-TF-NEXT: [[TMP28:%.*]] = sub i64 [[TMP27]], 1 +; FORCED-TF-NEXT: [[TMP29:%.*]] = extractelement <4 x ptr> [[TMP16]], i64 [[TMP28]] +; FORCED-TF-NEXT: br label [[END:%.*]] +; FORCED-TF: end: +; FORCED-TF-NEXT: store ptr [[TMP29]], ptr [[POS]], align 4 +; FORCED-TF-NEXT: ret void 
+; ; CHECK-LABEL: @metadata( ; CHECK-NEXT: header: ; CHECK-NEXT: [[PTR0:%.*]] = load ptr, ptr [[POS:%.*]], align 4 -; CHECK-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i32 [[SIZE:%.*]], 4 -; CHECK-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] +; CHECK-NEXT: br label [[VECTOR_PH:%.*]] ; CHECK: vector.ph: -; CHECK-NEXT: [[N_MOD_VF:%.*]] = urem i32 [[SIZE]], 4 -; CHECK-NEXT: [[N_VEC:%.*]] = sub i32 [[SIZE]], [[N_MOD_VF]] -; CHECK-NEXT: [[IND_END:%.*]] = sub i32 [[SIZE]], [[N_VEC]] -; CHECK-NEXT: [[IND_END1:%.*]] = getelementptr i8, ptr [[PTR:%.*]], i32 [[N_VEC]] +; CHECK-NEXT: [[N_RND_UP:%.*]] = add i32 [[SIZE:%.*]], 3 +; CHECK-NEXT: [[N_MOD_VF:%.*]] = urem i32 [[N_RND_UP]], 4 +; CHECK-NEXT: [[N_VEC:%.*]] = sub i32 [[N_RND_UP]], [[N_MOD_VF]] +; CHECK-NEXT: [[TRIP_COUNT_MINUS_1:%.*]] = sub i32 [[SIZE]], 1 +; CHECK-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <4 x i32> poison, i32 [[TRIP_COUNT_MINUS_1]], i64 0 +; CHECK-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <4 x i32> [[BROADCAST_SPLATINSERT]], <4 x i32> poison, <4 x i32> zeroinitializer ; CHECK-NEXT: br label [[VECTOR_BODY:%.*]] ; CHECK: vector.body: -; CHECK-NEXT: [[INDEX:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] -; CHECK-NEXT: [[NEXT_GEP:%.*]] = getelementptr i8, ptr [[PTR]], i32 [[INDEX]] -; CHECK-NEXT: [[TMP1:%.*]] = getelementptr inbounds i8, ptr [[NEXT_GEP]], i32 1 -; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <4 x i8>, ptr [[TMP1]], align 1 -; CHECK-NEXT: store <4 x i8> [[WIDE_LOAD]], ptr [[NEXT_GEP]], align 1 -; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i32 [[INDEX]], 4 -; CHECK-NEXT: [[TMP4:%.*]] = icmp eq i32 [[INDEX_NEXT]], [[N_VEC]] -; CHECK-NEXT: br i1 [[TMP4]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]] +; CHECK-NEXT: [[INDEX:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[PRED_STORE_CONTINUE11:%.*]] ] +; CHECK-NEXT: [[TMP0:%.*]] = add i32 [[INDEX]], 0 +; CHECK-NEXT: [[TMP1:%.*]] = add 
i32 [[INDEX]], 1 +; CHECK-NEXT: [[TMP2:%.*]] = add i32 [[INDEX]], 2 +; CHECK-NEXT: [[TMP3:%.*]] = add i32 [[INDEX]], 3 +; CHECK-NEXT: [[NEXT_GEP:%.*]] = getelementptr i8, ptr [[PTR:%.*]], i32 [[TMP0]] +; CHECK-NEXT: [[NEXT_GEP1:%.*]] = getelementptr i8, ptr [[PTR]], i32 [[TMP1]] +; CHECK-NEXT: [[NEXT_GEP2:%.*]] = getelementptr i8, ptr [[PTR]], i32 [[TMP2]] +; CHECK-NEXT: [[NEXT_GEP3:%.*]] = getelementptr i8, ptr [[PTR]], i32 [[TMP3]] +; CHECK-NEXT: [[TMP4:%.*]] = insertelement <4 x ptr> poison, ptr [[NEXT_GEP]], i32 0 +; CHECK-NEXT: [[TMP5:%.*]] = insertelement <4 x ptr> [[TMP4]], ptr [[NEXT_GEP1]], i32 1 +; CHECK-NEXT: [[TMP6:%.*]] = insertelement <4 x ptr> [[TMP5]], ptr [[NEXT_GEP2]], i32 2 +; CHECK-NEXT: [[TMP7:%.*]] = insertelement <4 x ptr> [[TMP6]], ptr [[NEXT_GEP3]], i32 3 +; CHECK-NEXT: [[BROADCAST_SPLATINSERT4:%.*]] = insertelement <4 x i32> poison, i32 [[INDEX]], i64 0 +; CHECK-NEXT: [[BROADCAST_SPLAT5:%.*]] = shufflevector <4 x i32> [[BROADCAST_SPLATINSERT4]], <4 x i32> poison, <4 x i32> zeroinitializer +; CHECK-NEXT: [[VEC_IV:%.*]] = add <4 x i32> [[BROADCAST_SPLAT5]], +; CHECK-NEXT: [[TMP8:%.*]] = icmp ule <4 x i32> [[VEC_IV]], [[BROADCAST_SPLAT]] +; CHECK-NEXT: [[TMP9:%.*]] = getelementptr inbounds i8, ptr [[NEXT_GEP]], i32 1 +; CHECK-NEXT: [[TMP10:%.*]] = getelementptr inbounds i8, ptr [[NEXT_GEP1]], i32 1 +; CHECK-NEXT: [[TMP11:%.*]] = getelementptr inbounds i8, ptr [[NEXT_GEP2]], i32 1 +; CHECK-NEXT: [[TMP12:%.*]] = getelementptr inbounds i8, ptr [[NEXT_GEP3]], i32 1 +; CHECK-NEXT: [[TMP13:%.*]] = insertelement <4 x ptr> poison, ptr [[TMP9]], i32 0 +; CHECK-NEXT: [[TMP14:%.*]] = insertelement <4 x ptr> [[TMP13]], ptr [[TMP10]], i32 1 +; CHECK-NEXT: [[TMP15:%.*]] = insertelement <4 x ptr> [[TMP14]], ptr [[TMP11]], i32 2 +; CHECK-NEXT: [[TMP16:%.*]] = insertelement <4 x ptr> [[TMP15]], ptr [[TMP12]], i32 3 +; CHECK-NEXT: [[TMP17:%.*]] = extractelement <4 x i1> [[TMP8]], i32 0 +; CHECK-NEXT: br i1 [[TMP17]], label [[PRED_STORE_IF:%.*]], label 
[[PRED_STORE_CONTINUE:%.*]] +; CHECK: pred.store.if: +; CHECK-NEXT: [[TMP18:%.*]] = load i8, ptr [[TMP9]], align 1 +; CHECK-NEXT: store i8 [[TMP18]], ptr [[NEXT_GEP]], align 1 +; CHECK-NEXT: br label [[PRED_STORE_CONTINUE]] +; CHECK: pred.store.continue: +; CHECK-NEXT: [[TMP19:%.*]] = extractelement <4 x i1> [[TMP8]], i32 1 +; CHECK-NEXT: br i1 [[TMP19]], label [[PRED_STORE_IF6:%.*]], label [[PRED_STORE_CONTINUE7:%.*]] +; CHECK: pred.store.if6: +; CHECK-NEXT: [[TMP20:%.*]] = load i8, ptr [[TMP10]], align 1 +; CHECK-NEXT: store i8 [[TMP20]], ptr [[NEXT_GEP1]], align 1 +; CHECK-NEXT: br label [[PRED_STORE_CONTINUE7]] +; CHECK: pred.store.continue7: +; CHECK-NEXT: [[TMP21:%.*]] = extractelement <4 x i1> [[TMP8]], i32 2 +; CHECK-NEXT: br i1 [[TMP21]], label [[PRED_STORE_IF8:%.*]], label [[PRED_STORE_CONTINUE9:%.*]] +; CHECK: pred.store.if8: +; CHECK-NEXT: [[TMP22:%.*]] = load i8, ptr [[TMP11]], align 1 +; CHECK-NEXT: store i8 [[TMP22]], ptr [[NEXT_GEP2]], align 1 +; CHECK-NEXT: br label [[PRED_STORE_CONTINUE9]] +; CHECK: pred.store.continue9: +; CHECK-NEXT: [[TMP23:%.*]] = extractelement <4 x i1> [[TMP8]], i32 3 +; CHECK-NEXT: br i1 [[TMP23]], label [[PRED_STORE_IF10:%.*]], label [[PRED_STORE_CONTINUE11]] +; CHECK: pred.store.if10: +; CHECK-NEXT: [[TMP24:%.*]] = load i8, ptr [[TMP12]], align 1 +; CHECK-NEXT: store i8 [[TMP24]], ptr [[NEXT_GEP3]], align 1 +; CHECK-NEXT: br label [[PRED_STORE_CONTINUE11]] +; CHECK: pred.store.continue11: +; CHECK-NEXT: [[INDEX_NEXT]] = add i32 [[INDEX]], 4 +; CHECK-NEXT: [[TMP25:%.*]] = icmp eq i32 [[INDEX_NEXT]], [[N_VEC]] +; CHECK-NEXT: br i1 [[TMP25]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]] ; CHECK: middle.block: -; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i32 [[SIZE]], [[N_VEC]] -; CHECK-NEXT: br i1 [[CMP_N]], label [[END:%.*]], label [[SCALAR_PH]] -; CHECK: scalar.ph: -; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i32 [ [[IND_END]], [[MIDDLE_BLOCK]] ], [ [[SIZE]], [[HEADER:%.*]] ] -; CHECK-NEXT: 
[[BC_RESUME_VAL2:%.*]] = phi ptr [ [[IND_END1]], [[MIDDLE_BLOCK]] ], [ [[PTR]], [[HEADER]] ] -; CHECK-NEXT: br label [[BODY:%.*]] -; CHECK: body: -; CHECK-NEXT: [[DEC66:%.*]] = phi i32 [ [[DEC:%.*]], [[BODY]] ], [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ] -; CHECK-NEXT: [[BUFF:%.*]] = phi ptr [ [[INCDEC_PTR:%.*]], [[BODY]] ], [ [[BC_RESUME_VAL2]], [[SCALAR_PH]] ] -; CHECK-NEXT: [[INCDEC_PTR]] = getelementptr inbounds i8, ptr [[BUFF]], i32 1 -; CHECK-NEXT: [[DEC]] = add nsw i32 [[DEC66]], -1 -; CHECK-NEXT: [[TMP5:%.*]] = load i8, ptr [[INCDEC_PTR]], align 1 -; CHECK-NEXT: store i8 [[TMP5]], ptr [[BUFF]], align 1 -; CHECK-NEXT: [[TOBOOL11:%.*]] = icmp eq i32 [[DEC]], 0 -; CHECK-NEXT: br i1 [[TOBOOL11]], label [[END]], label [[BODY]], !llvm.loop [[LOOP5:![0-9]+]] +; CHECK-NEXT: [[TMP26:%.*]] = xor <4 x i1> [[TMP8]], splat (i1 true) +; CHECK-NEXT: [[TMP27:%.*]] = call i64 @llvm.experimental.cttz.elts.i64.v4i1(<4 x i1> [[TMP26]], i1 true) +; CHECK-NEXT: [[TMP28:%.*]] = sub i64 [[TMP27]], 1 +; CHECK-NEXT: [[TMP29:%.*]] = extractelement <4 x ptr> [[TMP16]], i64 [[TMP28]] +; CHECK-NEXT: br label [[END:%.*]] ; CHECK: end: -; CHECK-NEXT: [[INCDEC_PTR_LCSSA:%.*]] = phi ptr [ [[INCDEC_PTR]], [[BODY]] ], [ [[IND_END1]], [[MIDDLE_BLOCK]] ] -; CHECK-NEXT: store ptr [[INCDEC_PTR_LCSSA]], ptr [[POS]], align 4 +; CHECK-NEXT: store ptr [[TMP29]], ptr [[POS]], align 4 ; CHECK-NEXT: ret void ; header: From b6dd511f70dcaa874cc0ff4dcd7cd32463da5ba2 Mon Sep 17 00:00:00 2001 From: Simon Pilgrim Date: Wed, 12 Nov 2025 15:20:38 +0000 Subject: [PATCH 13/15] [X86] AVX512 optimised CTLZ/CTTZ implementations for i256/i512 scalars (#164671) Make use of AVX512 VPLZCNT/VPOPCNT to perform the big integer bit counts per vector element and then use VPCOMPRESS to extract the first non-zero element result. There's more we can do here (widen/split other vector widths etc.) - but this is a good starting point. 
--- llvm/lib/Target/X86/X86ISelLowering.cpp | 67 ++ llvm/test/CodeGen/X86/bitcnt-big-integer.ll | 721 ++++++++------------ 2 files changed, 370 insertions(+), 418 deletions(-) diff --git a/llvm/lib/Target/X86/X86ISelLowering.cpp b/llvm/lib/Target/X86/X86ISelLowering.cpp index fa3dce256046f..6483e07afadee 100644 --- a/llvm/lib/Target/X86/X86ISelLowering.cpp +++ b/llvm/lib/Target/X86/X86ISelLowering.cpp @@ -2654,6 +2654,10 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM, ISD::AVGCEILU, ISD::AVGFLOORS, ISD::AVGFLOORU, + ISD::CTLZ, + ISD::CTTZ, + ISD::CTLZ_ZERO_UNDEF, + ISD::CTTZ_ZERO_UNDEF, ISD::BITREVERSE, ISD::ADD, ISD::FADD, @@ -55162,6 +55166,65 @@ static SDValue combineXor(SDNode *N, SelectionDAG &DAG, return combineFneg(N, DAG, DCI, Subtarget); } +// Fold i256/i512 CTLZ/CTTZ patterns to make use of AVX512 +// vXi64 CTLZ/CTTZ and VECTOR_COMPRESS. +// Compute the CTLZ/CTTZ of each element, add the element's bit offset, compress +// the result to remove all zero elements (passthru is set to scalar bitwidth if +// all elements are zero) and extract the lowest compressed element. 
+static SDValue combineCTZ(SDNode *N, SelectionDAG &DAG, + TargetLowering::DAGCombinerInfo &DCI, + const X86Subtarget &Subtarget) { + EVT VT = N->getValueType(0); + SDValue N0 = N->getOperand(0); + unsigned Opc = N->getOpcode(); + unsigned SizeInBits = VT.getSizeInBits(); + assert((Opc == ISD::CTLZ || Opc == ISD::CTLZ_ZERO_UNDEF || Opc == ISD::CTTZ || + Opc == ISD::CTTZ_ZERO_UNDEF) && + "Unsupported bit count"); + + if (VT.isScalarInteger() && Subtarget.hasCDI() && + ((SizeInBits == 512 && Subtarget.useAVX512Regs()) || + (SizeInBits == 256 && Subtarget.hasVLX() && + X86::mayFoldLoad(N0, Subtarget)))) { + MVT VecVT = MVT::getVectorVT(MVT::i64, SizeInBits / 64); + MVT BoolVT = VecVT.changeVectorElementType(MVT::i1); + SDValue Vec = DAG.getBitcast(VecVT, N0); + SDLoc DL(N); + + SmallVector RevMask; + SmallVector Offsets; + for (unsigned I = 0, E = VecVT.getVectorNumElements(); I != E; ++I) { + RevMask.push_back((int)((E - 1) - I)); + Offsets.push_back(DAG.getConstant(I * 64, DL, MVT::i64)); + } + + // CTLZ - reverse the elements as we want the top non-zero element at the + // bottom for compression. 
+ unsigned VecOpc = ISD::CTTZ; + if (Opc == ISD::CTLZ || Opc == ISD::CTLZ_ZERO_UNDEF) { + VecOpc = ISD::CTLZ; + Vec = DAG.getVectorShuffle(VecVT, DL, Vec, Vec, RevMask); + } + + SDValue PassThrough = DAG.getUNDEF(VecVT); + if (Opc == ISD::CTLZ || Opc == ISD::CTTZ) + PassThrough = DAG.getConstant(SizeInBits, DL, VecVT); + + SDValue IsNonZero = DAG.getSetCC(DL, BoolVT, Vec, + DAG.getConstant(0, DL, VecVT), ISD::SETNE); + SDValue Cnt = DAG.getNode(VecOpc, DL, VecVT, Vec); + Cnt = DAG.getNode(ISD::ADD, DL, VecVT, Cnt, + DAG.getBuildVector(VecVT, DL, Offsets)); + Cnt = DAG.getNode(ISD::VECTOR_COMPRESS, DL, VecVT, Cnt, IsNonZero, + PassThrough); + Cnt = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::i64, Cnt, + DAG.getVectorIdxConstant(0, DL)); + return DAG.getZExtOrTrunc(Cnt, DL, VT); + } + + return SDValue(); +} + static SDValue combineBITREVERSE(SDNode *N, SelectionDAG &DAG, TargetLowering::DAGCombinerInfo &DCI, const X86Subtarget &Subtarget) { @@ -60885,6 +60948,10 @@ SDValue X86TargetLowering::PerformDAGCombine(SDNode *N, case ISD::AND: return combineAnd(N, DAG, DCI, Subtarget); case ISD::OR: return combineOr(N, DAG, DCI, Subtarget); case ISD::XOR: return combineXor(N, DAG, DCI, Subtarget); + case ISD::CTLZ: + case ISD::CTTZ: + case ISD::CTLZ_ZERO_UNDEF: + case ISD::CTTZ_ZERO_UNDEF:return combineCTZ(N, DAG, DCI, Subtarget); case ISD::BITREVERSE: return combineBITREVERSE(N, DAG, DCI, Subtarget); case ISD::AVGCEILS: case ISD::AVGCEILU: diff --git a/llvm/test/CodeGen/X86/bitcnt-big-integer.ll b/llvm/test/CodeGen/X86/bitcnt-big-integer.ll index fe3fbf141682a..330c978d2a9f7 100644 --- a/llvm/test/CodeGen/X86/bitcnt-big-integer.ll +++ b/llvm/test/CodeGen/X86/bitcnt-big-integer.ll @@ -1,8 +1,8 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 6 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mcpu=x86-64-v2 | FileCheck %s --check-prefixes=CHECK,SSE ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mcpu=x86-64-v3 | FileCheck 
%s --check-prefixes=CHECK,AVX2 -; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mcpu=x86-64-v4 | FileCheck %s --check-prefixes=CHECK,AVX512 -; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mcpu=x86-64-v4 -mattr=+avx512vpopcntdq | FileCheck %s --check-prefixes=CHECK,AVX512 +; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mcpu=x86-64-v4 | FileCheck %s --check-prefixes=CHECK,AVX512,AVX512F +; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mcpu=x86-64-v4 -mattr=+avx512vpopcntdq | FileCheck %s --check-prefixes=CHECK,AVX512,AVX512POPCNT ; ; CTPOP @@ -712,23 +712,15 @@ define i32 @load_ctlz_i256(ptr %p0) nounwind { ; ; AVX512-LABEL: load_ctlz_i256: ; AVX512: # %bb.0: -; AVX512-NEXT: movq 8(%rdi), %rcx -; AVX512-NEXT: movq 16(%rdi), %rdx -; AVX512-NEXT: movq 24(%rdi), %rsi -; AVX512-NEXT: lzcntq %rsi, %rax -; AVX512-NEXT: lzcntq %rdx, %r8 -; AVX512-NEXT: addl $64, %r8d -; AVX512-NEXT: testq %rsi, %rsi -; AVX512-NEXT: cmovnel %eax, %r8d -; AVX512-NEXT: lzcntq %rcx, %r9 -; AVX512-NEXT: lzcntq (%rdi), %rax -; AVX512-NEXT: addl $64, %eax -; AVX512-NEXT: testq %rcx, %rcx -; AVX512-NEXT: cmovnel %r9d, %eax -; AVX512-NEXT: subl $-128, %eax -; AVX512-NEXT: orq %rsi, %rdx -; AVX512-NEXT: cmovnel %r8d, %eax +; AVX512-NEXT: vpermq {{.*#+}} ymm0 = mem[3,2,1,0] +; AVX512-NEXT: vplzcntq %ymm0, %ymm1 +; AVX512-NEXT: vpaddq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm1 +; AVX512-NEXT: vptestmq %ymm0, %ymm0, %k1 +; AVX512-NEXT: vpbroadcastq {{.*#+}} ymm0 = [256,256,256,256] +; AVX512-NEXT: vpcompressq %ymm1, %ymm0 {%k1} +; AVX512-NEXT: vmovq %xmm0, %rax ; AVX512-NEXT: # kill: def $eax killed $eax killed $rax +; AVX512-NEXT: vzeroupper ; AVX512-NEXT: retq %a0 = load i256, ptr %p0 %cnt = call i256 @llvm.ctlz.i256(i256 %a0, i1 0) @@ -845,47 +837,28 @@ define i32 @test_ctlz_i512(i512 %a0) nounwind { ; ; AVX512-LABEL: test_ctlz_i512: ; AVX512: # %bb.0: -; AVX512-NEXT: pushq %r15 -; AVX512-NEXT: pushq %r14 -; AVX512-NEXT: pushq %rbx -; AVX512-NEXT: movq {{[0-9]+}}(%rsp), %r10 -; AVX512-NEXT: 
movq {{[0-9]+}}(%rsp), %r11 -; AVX512-NEXT: lzcntq %r11, %rax -; AVX512-NEXT: lzcntq %r10, %r14 -; AVX512-NEXT: addl $64, %r14d -; AVX512-NEXT: testq %r11, %r11 -; AVX512-NEXT: cmovnel %eax, %r14d -; AVX512-NEXT: lzcntq %r9, %rax -; AVX512-NEXT: lzcntq %r8, %rbx -; AVX512-NEXT: addl $64, %ebx -; AVX512-NEXT: testq %r9, %r9 -; AVX512-NEXT: cmovnel %eax, %ebx -; AVX512-NEXT: subl $-128, %ebx -; AVX512-NEXT: movq %r10, %rax -; AVX512-NEXT: orq %r11, %rax -; AVX512-NEXT: cmovnel %r14d, %ebx -; AVX512-NEXT: lzcntq %rcx, %rax -; AVX512-NEXT: lzcntq %rdx, %r14 -; AVX512-NEXT: addl $64, %r14d -; AVX512-NEXT: testq %rcx, %rcx -; AVX512-NEXT: cmovnel %eax, %r14d -; AVX512-NEXT: lzcntq %rsi, %r15 -; AVX512-NEXT: lzcntq %rdi, %rax -; AVX512-NEXT: addl $64, %eax -; AVX512-NEXT: testq %rsi, %rsi -; AVX512-NEXT: cmovnel %r15d, %eax -; AVX512-NEXT: subl $-128, %eax -; AVX512-NEXT: orq %rcx, %rdx -; AVX512-NEXT: cmovnel %r14d, %eax -; AVX512-NEXT: addl $256, %eax # imm = 0x100 -; AVX512-NEXT: orq %r11, %r9 -; AVX512-NEXT: orq %r10, %r8 -; AVX512-NEXT: orq %r9, %r8 -; AVX512-NEXT: cmovnel %ebx, %eax -; AVX512-NEXT: # kill: def $eax killed $eax killed $rax -; AVX512-NEXT: popq %rbx -; AVX512-NEXT: popq %r14 -; AVX512-NEXT: popq %r15 +; AVX512-NEXT: vmovq %rdi, %xmm0 +; AVX512-NEXT: vmovq %rsi, %xmm1 +; AVX512-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm1[0],xmm0[0] +; AVX512-NEXT: vmovq %rdx, %xmm1 +; AVX512-NEXT: vmovq %rcx, %xmm2 +; AVX512-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm2[0],xmm1[0] +; AVX512-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm0 +; AVX512-NEXT: vmovq %r8, %xmm1 +; AVX512-NEXT: vmovq %r9, %xmm2 +; AVX512-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm2[0],xmm1[0] +; AVX512-NEXT: vmovq {{.*#+}} xmm2 = mem[0],zero +; AVX512-NEXT: vmovq {{.*#+}} xmm3 = mem[0],zero +; AVX512-NEXT: vpunpcklqdq {{.*#+}} xmm2 = xmm3[0],xmm2[0] +; AVX512-NEXT: vinserti128 $1, %xmm1, %ymm2, %ymm1 +; AVX512-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0 +; AVX512-NEXT: vplzcntq %zmm0, %zmm1 +; AVX512-NEXT: vpaddq 
{{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm1, %zmm1 +; AVX512-NEXT: vptestmq %zmm0, %zmm0, %k1 +; AVX512-NEXT: vpbroadcastq {{.*#+}} zmm0 = [512,512,512,512,512,512,512,512] +; AVX512-NEXT: vpcompressq %zmm1, %zmm0 {%k1} +; AVX512-NEXT: vmovd %xmm0, %eax +; AVX512-NEXT: vzeroupper ; AVX512-NEXT: retq %cnt = call i512 @llvm.ctlz.i512(i512 %a0, i1 0) %res = trunc i512 %cnt to i32 @@ -1010,50 +983,16 @@ define i32 @load_ctlz_i512(ptr %p0) nounwind { ; ; AVX512-LABEL: load_ctlz_i512: ; AVX512: # %bb.0: -; AVX512-NEXT: pushq %r14 -; AVX512-NEXT: pushq %rbx -; AVX512-NEXT: movq 8(%rdi), %r11 -; AVX512-NEXT: movq 16(%rdi), %r9 -; AVX512-NEXT: movq 24(%rdi), %r10 -; AVX512-NEXT: movq 32(%rdi), %rcx -; AVX512-NEXT: movq 40(%rdi), %rdx -; AVX512-NEXT: movq 48(%rdi), %rsi -; AVX512-NEXT: movq 56(%rdi), %r8 -; AVX512-NEXT: lzcntq %r8, %rax -; AVX512-NEXT: lzcntq %rsi, %r14 -; AVX512-NEXT: addl $64, %r14d -; AVX512-NEXT: testq %r8, %r8 -; AVX512-NEXT: cmovnel %eax, %r14d -; AVX512-NEXT: lzcntq %rdx, %rax -; AVX512-NEXT: lzcntq %rcx, %rbx -; AVX512-NEXT: addl $64, %ebx -; AVX512-NEXT: testq %rdx, %rdx -; AVX512-NEXT: cmovnel %eax, %ebx -; AVX512-NEXT: subl $-128, %ebx -; AVX512-NEXT: movq %rsi, %rax -; AVX512-NEXT: orq %r8, %rax -; AVX512-NEXT: cmovnel %r14d, %ebx -; AVX512-NEXT: lzcntq %r10, %rax -; AVX512-NEXT: lzcntq %r9, %r14 -; AVX512-NEXT: addl $64, %r14d -; AVX512-NEXT: testq %r10, %r10 -; AVX512-NEXT: cmovnel %eax, %r14d -; AVX512-NEXT: lzcntq (%rdi), %rax -; AVX512-NEXT: lzcntq %r11, %rdi -; AVX512-NEXT: addl $64, %eax -; AVX512-NEXT: testq %r11, %r11 -; AVX512-NEXT: cmovnel %edi, %eax -; AVX512-NEXT: subl $-128, %eax -; AVX512-NEXT: orq %r10, %r9 -; AVX512-NEXT: cmovnel %r14d, %eax -; AVX512-NEXT: addl $256, %eax # imm = 0x100 -; AVX512-NEXT: orq %r8, %rdx -; AVX512-NEXT: orq %rsi, %rcx -; AVX512-NEXT: orq %rdx, %rcx -; AVX512-NEXT: cmovnel %ebx, %eax +; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm0 = [7,6,5,4,3,2,1,0] +; AVX512-NEXT: vpermq (%rdi), %zmm0, %zmm0 +; AVX512-NEXT: 
vplzcntq %zmm0, %zmm1 +; AVX512-NEXT: vpaddq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm1, %zmm1 +; AVX512-NEXT: vptestmq %zmm0, %zmm0, %k1 +; AVX512-NEXT: vpbroadcastq {{.*#+}} zmm0 = [512,512,512,512,512,512,512,512] +; AVX512-NEXT: vpcompressq %zmm1, %zmm0 {%k1} +; AVX512-NEXT: vmovq %xmm0, %rax ; AVX512-NEXT: # kill: def $eax killed $eax killed $rax -; AVX512-NEXT: popq %rbx -; AVX512-NEXT: popq %r14 +; AVX512-NEXT: vzeroupper ; AVX512-NEXT: retq %a0 = load i512, ptr %p0 %cnt = call i512 @llvm.ctlz.i512(i512 %a0, i1 0) @@ -2002,23 +1941,14 @@ define i32 @load_ctlz_undef_i256(ptr %p0) nounwind { ; ; AVX512-LABEL: load_ctlz_undef_i256: ; AVX512: # %bb.0: -; AVX512-NEXT: movq 8(%rdi), %rcx -; AVX512-NEXT: movq 16(%rdi), %rdx -; AVX512-NEXT: movq 24(%rdi), %rsi -; AVX512-NEXT: lzcntq %rsi, %rax -; AVX512-NEXT: lzcntq %rdx, %r8 -; AVX512-NEXT: addl $64, %r8d -; AVX512-NEXT: testq %rsi, %rsi -; AVX512-NEXT: cmovnel %eax, %r8d -; AVX512-NEXT: lzcntq %rcx, %r9 -; AVX512-NEXT: lzcntq (%rdi), %rax -; AVX512-NEXT: addl $64, %eax -; AVX512-NEXT: testq %rcx, %rcx -; AVX512-NEXT: cmovnel %r9d, %eax -; AVX512-NEXT: subl $-128, %eax -; AVX512-NEXT: orq %rsi, %rdx -; AVX512-NEXT: cmovnel %r8d, %eax +; AVX512-NEXT: vpermq {{.*#+}} ymm0 = mem[3,2,1,0] +; AVX512-NEXT: vptestmq %ymm0, %ymm0, %k1 +; AVX512-NEXT: vplzcntq %ymm0, %ymm0 +; AVX512-NEXT: vpaddq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0 +; AVX512-NEXT: vpcompressq %ymm0, %ymm0 {%k1} {z} +; AVX512-NEXT: vmovq %xmm0, %rax ; AVX512-NEXT: # kill: def $eax killed $eax killed $rax +; AVX512-NEXT: vzeroupper ; AVX512-NEXT: retq %a0 = load i256, ptr %p0 %cnt = call i256 @llvm.ctlz.i256(i256 %a0, i1 -1) @@ -2134,47 +2064,27 @@ define i32 @test_ctlz_undef_i512(i512 %a0) nounwind { ; ; AVX512-LABEL: test_ctlz_undef_i512: ; AVX512: # %bb.0: -; AVX512-NEXT: pushq %r15 -; AVX512-NEXT: pushq %r14 -; AVX512-NEXT: pushq %rbx -; AVX512-NEXT: movq {{[0-9]+}}(%rsp), %r10 -; AVX512-NEXT: movq {{[0-9]+}}(%rsp), %r11 -; AVX512-NEXT: lzcntq %r11, 
%rax -; AVX512-NEXT: lzcntq %r10, %r14 -; AVX512-NEXT: addl $64, %r14d -; AVX512-NEXT: testq %r11, %r11 -; AVX512-NEXT: cmovnel %eax, %r14d -; AVX512-NEXT: lzcntq %r9, %rax -; AVX512-NEXT: lzcntq %r8, %rbx -; AVX512-NEXT: addl $64, %ebx -; AVX512-NEXT: testq %r9, %r9 -; AVX512-NEXT: cmovnel %eax, %ebx -; AVX512-NEXT: subl $-128, %ebx -; AVX512-NEXT: movq %r10, %rax -; AVX512-NEXT: orq %r11, %rax -; AVX512-NEXT: cmovnel %r14d, %ebx -; AVX512-NEXT: lzcntq %rcx, %rax -; AVX512-NEXT: lzcntq %rdx, %r14 -; AVX512-NEXT: addl $64, %r14d -; AVX512-NEXT: testq %rcx, %rcx -; AVX512-NEXT: cmovnel %eax, %r14d -; AVX512-NEXT: lzcntq %rsi, %r15 -; AVX512-NEXT: lzcntq %rdi, %rax -; AVX512-NEXT: addl $64, %eax -; AVX512-NEXT: testq %rsi, %rsi -; AVX512-NEXT: cmovnel %r15d, %eax -; AVX512-NEXT: subl $-128, %eax -; AVX512-NEXT: orq %rcx, %rdx -; AVX512-NEXT: cmovnel %r14d, %eax -; AVX512-NEXT: addl $256, %eax # imm = 0x100 -; AVX512-NEXT: orq %r11, %r9 -; AVX512-NEXT: orq %r10, %r8 -; AVX512-NEXT: orq %r9, %r8 -; AVX512-NEXT: cmovnel %ebx, %eax -; AVX512-NEXT: # kill: def $eax killed $eax killed $rax -; AVX512-NEXT: popq %rbx -; AVX512-NEXT: popq %r14 -; AVX512-NEXT: popq %r15 +; AVX512-NEXT: vmovq %rdi, %xmm0 +; AVX512-NEXT: vmovq %rsi, %xmm1 +; AVX512-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm1[0],xmm0[0] +; AVX512-NEXT: vmovq %rdx, %xmm1 +; AVX512-NEXT: vmovq %rcx, %xmm2 +; AVX512-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm2[0],xmm1[0] +; AVX512-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm0 +; AVX512-NEXT: vmovq %r8, %xmm1 +; AVX512-NEXT: vmovq %r9, %xmm2 +; AVX512-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm2[0],xmm1[0] +; AVX512-NEXT: vmovq {{.*#+}} xmm2 = mem[0],zero +; AVX512-NEXT: vmovq {{.*#+}} xmm3 = mem[0],zero +; AVX512-NEXT: vpunpcklqdq {{.*#+}} xmm2 = xmm3[0],xmm2[0] +; AVX512-NEXT: vinserti128 $1, %xmm1, %ymm2, %ymm1 +; AVX512-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0 +; AVX512-NEXT: vptestmq %zmm0, %zmm0, %k1 +; AVX512-NEXT: vplzcntq %zmm0, %zmm0 +; AVX512-NEXT: vpaddq 
{{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm0, %zmm0 +; AVX512-NEXT: vpcompressq %zmm0, %zmm0 {%k1} {z} +; AVX512-NEXT: vmovd %xmm0, %eax +; AVX512-NEXT: vzeroupper ; AVX512-NEXT: retq %cnt = call i512 @llvm.ctlz.i512(i512 %a0, i1 -1) %res = trunc i512 %cnt to i32 @@ -2298,50 +2208,15 @@ define i32 @load_ctlz_undef_i512(ptr %p0) nounwind { ; ; AVX512-LABEL: load_ctlz_undef_i512: ; AVX512: # %bb.0: -; AVX512-NEXT: pushq %r14 -; AVX512-NEXT: pushq %rbx -; AVX512-NEXT: movq 8(%rdi), %r11 -; AVX512-NEXT: movq 16(%rdi), %r9 -; AVX512-NEXT: movq 24(%rdi), %r10 -; AVX512-NEXT: movq 32(%rdi), %rcx -; AVX512-NEXT: movq 40(%rdi), %rdx -; AVX512-NEXT: movq 48(%rdi), %rsi -; AVX512-NEXT: movq 56(%rdi), %r8 -; AVX512-NEXT: lzcntq %r8, %rax -; AVX512-NEXT: lzcntq %rsi, %r14 -; AVX512-NEXT: addl $64, %r14d -; AVX512-NEXT: testq %r8, %r8 -; AVX512-NEXT: cmovnel %eax, %r14d -; AVX512-NEXT: lzcntq %rdx, %rax -; AVX512-NEXT: lzcntq %rcx, %rbx -; AVX512-NEXT: addl $64, %ebx -; AVX512-NEXT: testq %rdx, %rdx -; AVX512-NEXT: cmovnel %eax, %ebx -; AVX512-NEXT: subl $-128, %ebx -; AVX512-NEXT: movq %rsi, %rax -; AVX512-NEXT: orq %r8, %rax -; AVX512-NEXT: cmovnel %r14d, %ebx -; AVX512-NEXT: lzcntq %r10, %rax -; AVX512-NEXT: lzcntq %r9, %r14 -; AVX512-NEXT: addl $64, %r14d -; AVX512-NEXT: testq %r10, %r10 -; AVX512-NEXT: cmovnel %eax, %r14d -; AVX512-NEXT: lzcntq (%rdi), %rax -; AVX512-NEXT: lzcntq %r11, %rdi -; AVX512-NEXT: addl $64, %eax -; AVX512-NEXT: testq %r11, %r11 -; AVX512-NEXT: cmovnel %edi, %eax -; AVX512-NEXT: subl $-128, %eax -; AVX512-NEXT: orq %r10, %r9 -; AVX512-NEXT: cmovnel %r14d, %eax -; AVX512-NEXT: addl $256, %eax # imm = 0x100 -; AVX512-NEXT: orq %r8, %rdx -; AVX512-NEXT: orq %rsi, %rcx -; AVX512-NEXT: orq %rdx, %rcx -; AVX512-NEXT: cmovnel %ebx, %eax +; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm0 = [7,6,5,4,3,2,1,0] +; AVX512-NEXT: vpermq (%rdi), %zmm0, %zmm0 +; AVX512-NEXT: vptestmq %zmm0, %zmm0, %k1 +; AVX512-NEXT: vplzcntq %zmm0, %zmm0 +; AVX512-NEXT: vpaddq 
{{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm0, %zmm0 +; AVX512-NEXT: vpcompressq %zmm0, %zmm0 {%k1} {z} +; AVX512-NEXT: vmovq %xmm0, %rax ; AVX512-NEXT: # kill: def $eax killed $eax killed $rax -; AVX512-NEXT: popq %rbx -; AVX512-NEXT: popq %r14 +; AVX512-NEXT: vzeroupper ; AVX512-NEXT: retq %a0 = load i512, ptr %p0 %cnt = call i512 @llvm.ctlz.i512(i512 %a0, i1 -1) @@ -3282,26 +3157,38 @@ define i32 @load_cttz_i256(ptr %p0) nounwind { ; AVX2-NEXT: # kill: def $eax killed $eax killed $rax ; AVX2-NEXT: retq ; -; AVX512-LABEL: load_cttz_i256: -; AVX512: # %bb.0: -; AVX512-NEXT: movq 16(%rdi), %rcx -; AVX512-NEXT: movq (%rdi), %rdx -; AVX512-NEXT: movq 8(%rdi), %rsi -; AVX512-NEXT: tzcntq %rdx, %rax -; AVX512-NEXT: tzcntq %rsi, %r8 -; AVX512-NEXT: addl $64, %r8d -; AVX512-NEXT: testq %rdx, %rdx -; AVX512-NEXT: cmovnel %eax, %r8d -; AVX512-NEXT: tzcntq %rcx, %r9 -; AVX512-NEXT: tzcntq 24(%rdi), %rax -; AVX512-NEXT: addl $64, %eax -; AVX512-NEXT: testq %rcx, %rcx -; AVX512-NEXT: cmovnel %r9d, %eax -; AVX512-NEXT: subl $-128, %eax -; AVX512-NEXT: orq %rsi, %rdx -; AVX512-NEXT: cmovnel %r8d, %eax -; AVX512-NEXT: # kill: def $eax killed $eax killed $rax -; AVX512-NEXT: retq +; AVX512F-LABEL: load_cttz_i256: +; AVX512F: # %bb.0: +; AVX512F-NEXT: vmovdqu (%rdi), %ymm0 +; AVX512F-NEXT: vpcmpeqd %ymm1, %ymm1, %ymm1 +; AVX512F-NEXT: vpaddq %ymm1, %ymm0, %ymm1 +; AVX512F-NEXT: vpandn %ymm1, %ymm0, %ymm1 +; AVX512F-NEXT: vplzcntq %ymm1, %ymm1 +; AVX512F-NEXT: vmovdqa {{.*#+}} ymm2 = [64,128,192,256] +; AVX512F-NEXT: vpsubq %ymm1, %ymm2, %ymm1 +; AVX512F-NEXT: vptestmq %ymm0, %ymm0, %k1 +; AVX512F-NEXT: vpbroadcastq {{.*#+}} ymm0 = [256,256,256,256] +; AVX512F-NEXT: vpcompressq %ymm1, %ymm0 {%k1} +; AVX512F-NEXT: vmovq %xmm0, %rax +; AVX512F-NEXT: # kill: def $eax killed $eax killed $rax +; AVX512F-NEXT: vzeroupper +; AVX512F-NEXT: retq +; +; AVX512POPCNT-LABEL: load_cttz_i256: +; AVX512POPCNT: # %bb.0: +; AVX512POPCNT-NEXT: vmovdqu (%rdi), %ymm0 +; AVX512POPCNT-NEXT: vpcmpeqd %ymm1, 
%ymm1, %ymm1 +; AVX512POPCNT-NEXT: vpaddq %ymm1, %ymm0, %ymm1 +; AVX512POPCNT-NEXT: vpandn %ymm1, %ymm0, %ymm1 +; AVX512POPCNT-NEXT: vpopcntq %ymm1, %ymm1 +; AVX512POPCNT-NEXT: vpaddq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm1 +; AVX512POPCNT-NEXT: vptestmq %ymm0, %ymm0, %k1 +; AVX512POPCNT-NEXT: vpbroadcastq {{.*#+}} ymm0 = [256,256,256,256] +; AVX512POPCNT-NEXT: vpcompressq %ymm1, %ymm0 {%k1} +; AVX512POPCNT-NEXT: vmovq %xmm0, %rax +; AVX512POPCNT-NEXT: # kill: def $eax killed $eax killed $rax +; AVX512POPCNT-NEXT: vzeroupper +; AVX512POPCNT-NEXT: retq %a0 = load i256, ptr %p0 %cnt = call i256 @llvm.cttz.i256(i256 %a0, i1 0) %res = trunc i256 %cnt to i32 @@ -3399,47 +3286,58 @@ define i32 @test_cttz_i512(i512 %a0) nounwind { ; AVX2-NEXT: popq %r14 ; AVX2-NEXT: retq ; -; AVX512-LABEL: test_cttz_i512: -; AVX512: # %bb.0: -; AVX512-NEXT: pushq %r14 -; AVX512-NEXT: pushq %rbx -; AVX512-NEXT: movq {{[0-9]+}}(%rsp), %r11 -; AVX512-NEXT: tzcntq %rdi, %rax -; AVX512-NEXT: tzcntq %rsi, %rbx -; AVX512-NEXT: addl $64, %ebx -; AVX512-NEXT: testq %rdi, %rdi -; AVX512-NEXT: cmovnel %eax, %ebx -; AVX512-NEXT: tzcntq %rdx, %rax -; AVX512-NEXT: tzcntq %rcx, %r10 -; AVX512-NEXT: addl $64, %r10d -; AVX512-NEXT: testq %rdx, %rdx -; AVX512-NEXT: cmovnel %eax, %r10d -; AVX512-NEXT: subl $-128, %r10d -; AVX512-NEXT: movq %rdi, %rax -; AVX512-NEXT: orq %rsi, %rax -; AVX512-NEXT: cmovnel %ebx, %r10d -; AVX512-NEXT: tzcntq %r8, %rax -; AVX512-NEXT: tzcntq %r9, %rbx -; AVX512-NEXT: addl $64, %ebx -; AVX512-NEXT: testq %r8, %r8 -; AVX512-NEXT: cmovnel %eax, %ebx -; AVX512-NEXT: tzcntq {{[0-9]+}}(%rsp), %rax -; AVX512-NEXT: tzcntq %r11, %r14 -; AVX512-NEXT: addl $64, %eax -; AVX512-NEXT: testq %r11, %r11 -; AVX512-NEXT: cmovnel %r14d, %eax -; AVX512-NEXT: subl $-128, %eax -; AVX512-NEXT: orq %r9, %r8 -; AVX512-NEXT: cmovnel %ebx, %eax -; AVX512-NEXT: addl $256, %eax # imm = 0x100 -; AVX512-NEXT: orq %rcx, %rsi -; AVX512-NEXT: orq %rdx, %rdi -; AVX512-NEXT: orq %rsi, %rdi -; AVX512-NEXT: 
cmovnel %r10d, %eax -; AVX512-NEXT: # kill: def $eax killed $eax killed $rax -; AVX512-NEXT: popq %rbx -; AVX512-NEXT: popq %r14 -; AVX512-NEXT: retq +; AVX512F-LABEL: test_cttz_i512: +; AVX512F: # %bb.0: +; AVX512F-NEXT: vmovq %rcx, %xmm0 +; AVX512F-NEXT: vmovq %rdx, %xmm1 +; AVX512F-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm1[0],xmm0[0] +; AVX512F-NEXT: vmovq %rsi, %xmm1 +; AVX512F-NEXT: vmovq %rdi, %xmm2 +; AVX512F-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm2[0],xmm1[0] +; AVX512F-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm0 +; AVX512F-NEXT: vmovq %r9, %xmm1 +; AVX512F-NEXT: vmovq %r8, %xmm2 +; AVX512F-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm2[0],xmm1[0] +; AVX512F-NEXT: vinserti128 $1, {{[0-9]+}}(%rsp), %ymm1, %ymm1 +; AVX512F-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0 +; AVX512F-NEXT: vpternlogd {{.*#+}} zmm1 = -1 +; AVX512F-NEXT: vpaddq %zmm1, %zmm0, %zmm1 +; AVX512F-NEXT: vpandnq %zmm1, %zmm0, %zmm1 +; AVX512F-NEXT: vplzcntq %zmm1, %zmm1 +; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm2 = [64,128,192,256,320,384,448,512] +; AVX512F-NEXT: vpsubq %zmm1, %zmm2, %zmm1 +; AVX512F-NEXT: vptestmq %zmm0, %zmm0, %k1 +; AVX512F-NEXT: vpbroadcastq {{.*#+}} zmm0 = [512,512,512,512,512,512,512,512] +; AVX512F-NEXT: vpcompressq %zmm1, %zmm0 {%k1} +; AVX512F-NEXT: vmovd %xmm0, %eax +; AVX512F-NEXT: vzeroupper +; AVX512F-NEXT: retq +; +; AVX512POPCNT-LABEL: test_cttz_i512: +; AVX512POPCNT: # %bb.0: +; AVX512POPCNT-NEXT: vmovq %rcx, %xmm0 +; AVX512POPCNT-NEXT: vmovq %rdx, %xmm1 +; AVX512POPCNT-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm1[0],xmm0[0] +; AVX512POPCNT-NEXT: vmovq %rsi, %xmm1 +; AVX512POPCNT-NEXT: vmovq %rdi, %xmm2 +; AVX512POPCNT-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm2[0],xmm1[0] +; AVX512POPCNT-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm0 +; AVX512POPCNT-NEXT: vmovq %r9, %xmm1 +; AVX512POPCNT-NEXT: vmovq %r8, %xmm2 +; AVX512POPCNT-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm2[0],xmm1[0] +; AVX512POPCNT-NEXT: vinserti128 $1, {{[0-9]+}}(%rsp), %ymm1, %ymm1 +; AVX512POPCNT-NEXT: vinserti64x4 $1, %ymm1, 
%zmm0, %zmm0 +; AVX512POPCNT-NEXT: vpternlogd {{.*#+}} zmm1 = -1 +; AVX512POPCNT-NEXT: vpaddq %zmm1, %zmm0, %zmm1 +; AVX512POPCNT-NEXT: vpandnq %zmm1, %zmm0, %zmm1 +; AVX512POPCNT-NEXT: vpopcntq %zmm1, %zmm1 +; AVX512POPCNT-NEXT: vpaddq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm1, %zmm1 +; AVX512POPCNT-NEXT: vptestmq %zmm0, %zmm0, %k1 +; AVX512POPCNT-NEXT: vpbroadcastq {{.*#+}} zmm0 = [512,512,512,512,512,512,512,512] +; AVX512POPCNT-NEXT: vpcompressq %zmm1, %zmm0 {%k1} +; AVX512POPCNT-NEXT: vmovd %xmm0, %eax +; AVX512POPCNT-NEXT: vzeroupper +; AVX512POPCNT-NEXT: retq %cnt = call i512 @llvm.cttz.i512(i512 %a0, i1 0) %res = trunc i512 %cnt to i32 ret i32 %res @@ -3553,53 +3451,38 @@ define i32 @load_cttz_i512(ptr %p0) nounwind { ; AVX2-NEXT: popq %r15 ; AVX2-NEXT: retq ; -; AVX512-LABEL: load_cttz_i512: -; AVX512: # %bb.0: -; AVX512-NEXT: pushq %r14 -; AVX512-NEXT: pushq %rbx -; AVX512-NEXT: movq 48(%rdi), %r11 -; AVX512-NEXT: movq 40(%rdi), %r9 -; AVX512-NEXT: movq 32(%rdi), %r10 -; AVX512-NEXT: movq 24(%rdi), %r8 -; AVX512-NEXT: movq 16(%rdi), %rdx -; AVX512-NEXT: movq (%rdi), %rcx -; AVX512-NEXT: movq 8(%rdi), %rsi -; AVX512-NEXT: tzcntq %rcx, %rax -; AVX512-NEXT: tzcntq %rsi, %r14 -; AVX512-NEXT: addl $64, %r14d -; AVX512-NEXT: testq %rcx, %rcx -; AVX512-NEXT: cmovnel %eax, %r14d -; AVX512-NEXT: tzcntq %rdx, %rax -; AVX512-NEXT: tzcntq %r8, %rbx -; AVX512-NEXT: addl $64, %ebx -; AVX512-NEXT: testq %rdx, %rdx -; AVX512-NEXT: cmovnel %eax, %ebx -; AVX512-NEXT: subl $-128, %ebx -; AVX512-NEXT: movq %rcx, %rax -; AVX512-NEXT: orq %rsi, %rax -; AVX512-NEXT: cmovnel %r14d, %ebx -; AVX512-NEXT: tzcntq %r10, %rax -; AVX512-NEXT: tzcntq %r9, %r14 -; AVX512-NEXT: addl $64, %r14d -; AVX512-NEXT: testq %r10, %r10 -; AVX512-NEXT: cmovnel %eax, %r14d -; AVX512-NEXT: tzcntq 56(%rdi), %rax -; AVX512-NEXT: tzcntq %r11, %rdi -; AVX512-NEXT: addl $64, %eax -; AVX512-NEXT: testq %r11, %r11 -; AVX512-NEXT: cmovnel %edi, %eax -; AVX512-NEXT: subl $-128, %eax -; AVX512-NEXT: orq %r9, %r10 
-; AVX512-NEXT: cmovnel %r14d, %eax -; AVX512-NEXT: addl $256, %eax # imm = 0x100 -; AVX512-NEXT: orq %r8, %rsi -; AVX512-NEXT: orq %rdx, %rcx -; AVX512-NEXT: orq %rsi, %rcx -; AVX512-NEXT: cmovnel %ebx, %eax -; AVX512-NEXT: # kill: def $eax killed $eax killed $rax -; AVX512-NEXT: popq %rbx -; AVX512-NEXT: popq %r14 -; AVX512-NEXT: retq +; AVX512F-LABEL: load_cttz_i512: +; AVX512F: # %bb.0: +; AVX512F-NEXT: vmovdqu64 (%rdi), %zmm0 +; AVX512F-NEXT: vpternlogd {{.*#+}} zmm1 = -1 +; AVX512F-NEXT: vpaddq %zmm1, %zmm0, %zmm1 +; AVX512F-NEXT: vpandnq %zmm1, %zmm0, %zmm1 +; AVX512F-NEXT: vplzcntq %zmm1, %zmm1 +; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm2 = [64,128,192,256,320,384,448,512] +; AVX512F-NEXT: vpsubq %zmm1, %zmm2, %zmm1 +; AVX512F-NEXT: vptestmq %zmm0, %zmm0, %k1 +; AVX512F-NEXT: vpbroadcastq {{.*#+}} zmm0 = [512,512,512,512,512,512,512,512] +; AVX512F-NEXT: vpcompressq %zmm1, %zmm0 {%k1} +; AVX512F-NEXT: vmovq %xmm0, %rax +; AVX512F-NEXT: # kill: def $eax killed $eax killed $rax +; AVX512F-NEXT: vzeroupper +; AVX512F-NEXT: retq +; +; AVX512POPCNT-LABEL: load_cttz_i512: +; AVX512POPCNT: # %bb.0: +; AVX512POPCNT-NEXT: vmovdqu64 (%rdi), %zmm0 +; AVX512POPCNT-NEXT: vpternlogd {{.*#+}} zmm1 = -1 +; AVX512POPCNT-NEXT: vpaddq %zmm1, %zmm0, %zmm1 +; AVX512POPCNT-NEXT: vpandnq %zmm1, %zmm0, %zmm1 +; AVX512POPCNT-NEXT: vpopcntq %zmm1, %zmm1 +; AVX512POPCNT-NEXT: vpaddq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm1, %zmm1 +; AVX512POPCNT-NEXT: vptestmq %zmm0, %zmm0, %k1 +; AVX512POPCNT-NEXT: vpbroadcastq {{.*#+}} zmm0 = [512,512,512,512,512,512,512,512] +; AVX512POPCNT-NEXT: vpcompressq %zmm1, %zmm0 {%k1} +; AVX512POPCNT-NEXT: vmovq %xmm0, %rax +; AVX512POPCNT-NEXT: # kill: def $eax killed $eax killed $rax +; AVX512POPCNT-NEXT: vzeroupper +; AVX512POPCNT-NEXT: retq %a0 = load i512, ptr %p0 %cnt = call i512 @llvm.cttz.i512(i512 %a0, i1 0) %res = trunc i512 %cnt to i32 @@ -4492,26 +4375,36 @@ define i32 @load_cttz_undef_i256(ptr %p0) nounwind { ; AVX2-NEXT: # kill: def $eax killed 
$eax killed $rax ; AVX2-NEXT: retq ; -; AVX512-LABEL: load_cttz_undef_i256: -; AVX512: # %bb.0: -; AVX512-NEXT: movq 16(%rdi), %rcx -; AVX512-NEXT: movq (%rdi), %rdx -; AVX512-NEXT: movq 8(%rdi), %rsi -; AVX512-NEXT: tzcntq %rdx, %rax -; AVX512-NEXT: tzcntq %rsi, %r8 -; AVX512-NEXT: addl $64, %r8d -; AVX512-NEXT: testq %rdx, %rdx -; AVX512-NEXT: cmovnel %eax, %r8d -; AVX512-NEXT: tzcntq %rcx, %r9 -; AVX512-NEXT: tzcntq 24(%rdi), %rax -; AVX512-NEXT: addl $64, %eax -; AVX512-NEXT: testq %rcx, %rcx -; AVX512-NEXT: cmovnel %r9d, %eax -; AVX512-NEXT: subl $-128, %eax -; AVX512-NEXT: orq %rsi, %rdx -; AVX512-NEXT: cmovnel %r8d, %eax -; AVX512-NEXT: # kill: def $eax killed $eax killed $rax -; AVX512-NEXT: retq +; AVX512F-LABEL: load_cttz_undef_i256: +; AVX512F: # %bb.0: +; AVX512F-NEXT: vmovdqu (%rdi), %ymm0 +; AVX512F-NEXT: vpcmpeqd %ymm1, %ymm1, %ymm1 +; AVX512F-NEXT: vpaddq %ymm1, %ymm0, %ymm1 +; AVX512F-NEXT: vpandn %ymm1, %ymm0, %ymm1 +; AVX512F-NEXT: vmovdqa {{.*#+}} ymm2 = [64,128,192,256] +; AVX512F-NEXT: vplzcntq %ymm1, %ymm1 +; AVX512F-NEXT: vpsubq %ymm1, %ymm2, %ymm1 +; AVX512F-NEXT: vptestmq %ymm0, %ymm0, %k1 +; AVX512F-NEXT: vpcompressq %ymm1, %ymm0 {%k1} {z} +; AVX512F-NEXT: vmovq %xmm0, %rax +; AVX512F-NEXT: # kill: def $eax killed $eax killed $rax +; AVX512F-NEXT: vzeroupper +; AVX512F-NEXT: retq +; +; AVX512POPCNT-LABEL: load_cttz_undef_i256: +; AVX512POPCNT: # %bb.0: +; AVX512POPCNT-NEXT: vmovdqu (%rdi), %ymm0 +; AVX512POPCNT-NEXT: vpcmpeqd %ymm1, %ymm1, %ymm1 +; AVX512POPCNT-NEXT: vpaddq %ymm1, %ymm0, %ymm1 +; AVX512POPCNT-NEXT: vpandn %ymm1, %ymm0, %ymm1 +; AVX512POPCNT-NEXT: vpopcntq %ymm1, %ymm1 +; AVX512POPCNT-NEXT: vpaddq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm1 +; AVX512POPCNT-NEXT: vptestmq %ymm0, %ymm0, %k1 +; AVX512POPCNT-NEXT: vpcompressq %ymm1, %ymm0 {%k1} {z} +; AVX512POPCNT-NEXT: vmovq %xmm0, %rax +; AVX512POPCNT-NEXT: # kill: def $eax killed $eax killed $rax +; AVX512POPCNT-NEXT: vzeroupper +; AVX512POPCNT-NEXT: retq %a0 = load i256, 
ptr %p0 %cnt = call i256 @llvm.cttz.i256(i256 %a0, i1 -1) %res = trunc i256 %cnt to i32 @@ -4608,47 +4501,56 @@ define i32 @test_cttz_undef_i512(i512 %a0) nounwind { ; AVX2-NEXT: popq %r14 ; AVX2-NEXT: retq ; -; AVX512-LABEL: test_cttz_undef_i512: -; AVX512: # %bb.0: -; AVX512-NEXT: pushq %r14 -; AVX512-NEXT: pushq %rbx -; AVX512-NEXT: movq {{[0-9]+}}(%rsp), %r11 -; AVX512-NEXT: tzcntq %rdi, %rax -; AVX512-NEXT: tzcntq %rsi, %rbx -; AVX512-NEXT: addl $64, %ebx -; AVX512-NEXT: testq %rdi, %rdi -; AVX512-NEXT: cmovnel %eax, %ebx -; AVX512-NEXT: tzcntq %rdx, %rax -; AVX512-NEXT: tzcntq %rcx, %r10 -; AVX512-NEXT: addl $64, %r10d -; AVX512-NEXT: testq %rdx, %rdx -; AVX512-NEXT: cmovnel %eax, %r10d -; AVX512-NEXT: subl $-128, %r10d -; AVX512-NEXT: movq %rdi, %rax -; AVX512-NEXT: orq %rsi, %rax -; AVX512-NEXT: cmovnel %ebx, %r10d -; AVX512-NEXT: tzcntq %r8, %rax -; AVX512-NEXT: tzcntq %r9, %rbx -; AVX512-NEXT: addl $64, %ebx -; AVX512-NEXT: testq %r8, %r8 -; AVX512-NEXT: cmovnel %eax, %ebx -; AVX512-NEXT: tzcntq {{[0-9]+}}(%rsp), %rax -; AVX512-NEXT: tzcntq %r11, %r14 -; AVX512-NEXT: addl $64, %eax -; AVX512-NEXT: testq %r11, %r11 -; AVX512-NEXT: cmovnel %r14d, %eax -; AVX512-NEXT: subl $-128, %eax -; AVX512-NEXT: orq %r9, %r8 -; AVX512-NEXT: cmovnel %ebx, %eax -; AVX512-NEXT: addl $256, %eax # imm = 0x100 -; AVX512-NEXT: orq %rcx, %rsi -; AVX512-NEXT: orq %rdx, %rdi -; AVX512-NEXT: orq %rsi, %rdi -; AVX512-NEXT: cmovnel %r10d, %eax -; AVX512-NEXT: # kill: def $eax killed $eax killed $rax -; AVX512-NEXT: popq %rbx -; AVX512-NEXT: popq %r14 -; AVX512-NEXT: retq +; AVX512F-LABEL: test_cttz_undef_i512: +; AVX512F: # %bb.0: +; AVX512F-NEXT: vmovq %rcx, %xmm0 +; AVX512F-NEXT: vmovq %rdx, %xmm1 +; AVX512F-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm1[0],xmm0[0] +; AVX512F-NEXT: vmovq %rsi, %xmm1 +; AVX512F-NEXT: vmovq %rdi, %xmm2 +; AVX512F-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm2[0],xmm1[0] +; AVX512F-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm0 +; AVX512F-NEXT: vmovq %r9, %xmm1 +; 
AVX512F-NEXT: vmovq %r8, %xmm2 +; AVX512F-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm2[0],xmm1[0] +; AVX512F-NEXT: vinserti128 $1, {{[0-9]+}}(%rsp), %ymm1, %ymm1 +; AVX512F-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0 +; AVX512F-NEXT: vpternlogd {{.*#+}} zmm1 = -1 +; AVX512F-NEXT: vpaddq %zmm1, %zmm0, %zmm1 +; AVX512F-NEXT: vpandnq %zmm1, %zmm0, %zmm1 +; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm2 = [64,128,192,256,320,384,448,512] +; AVX512F-NEXT: vplzcntq %zmm1, %zmm1 +; AVX512F-NEXT: vpsubq %zmm1, %zmm2, %zmm1 +; AVX512F-NEXT: vptestmq %zmm0, %zmm0, %k1 +; AVX512F-NEXT: vpcompressq %zmm1, %zmm0 {%k1} {z} +; AVX512F-NEXT: vmovd %xmm0, %eax +; AVX512F-NEXT: vzeroupper +; AVX512F-NEXT: retq +; +; AVX512POPCNT-LABEL: test_cttz_undef_i512: +; AVX512POPCNT: # %bb.0: +; AVX512POPCNT-NEXT: vmovq %rcx, %xmm0 +; AVX512POPCNT-NEXT: vmovq %rdx, %xmm1 +; AVX512POPCNT-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm1[0],xmm0[0] +; AVX512POPCNT-NEXT: vmovq %rsi, %xmm1 +; AVX512POPCNT-NEXT: vmovq %rdi, %xmm2 +; AVX512POPCNT-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm2[0],xmm1[0] +; AVX512POPCNT-NEXT: vmovq %r9, %xmm2 +; AVX512POPCNT-NEXT: vmovq %r8, %xmm3 +; AVX512POPCNT-NEXT: vpunpcklqdq {{.*#+}} xmm2 = xmm3[0],xmm2[0] +; AVX512POPCNT-NEXT: vinserti128 $1, {{[0-9]+}}(%rsp), %ymm2, %ymm2 +; AVX512POPCNT-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm0 +; AVX512POPCNT-NEXT: vinserti64x4 $1, %ymm2, %zmm0, %zmm0 +; AVX512POPCNT-NEXT: vpternlogd {{.*#+}} zmm1 = -1 +; AVX512POPCNT-NEXT: vpaddq %zmm1, %zmm0, %zmm1 +; AVX512POPCNT-NEXT: vpandnq %zmm1, %zmm0, %zmm1 +; AVX512POPCNT-NEXT: vpopcntq %zmm1, %zmm1 +; AVX512POPCNT-NEXT: vpaddq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm1, %zmm1 +; AVX512POPCNT-NEXT: vptestmq %zmm0, %zmm0, %k1 +; AVX512POPCNT-NEXT: vpcompressq %zmm1, %zmm0 {%k1} {z} +; AVX512POPCNT-NEXT: vmovd %xmm0, %eax +; AVX512POPCNT-NEXT: vzeroupper +; AVX512POPCNT-NEXT: retq %cnt = call i512 @llvm.cttz.i512(i512 %a0, i1 -1) %res = trunc i512 %cnt to i32 ret i32 %res @@ -4761,53 +4663,36 @@ define i32 
@load_cttz_undef_i512(ptr %p0) nounwind { ; AVX2-NEXT: popq %r15 ; AVX2-NEXT: retq ; -; AVX512-LABEL: load_cttz_undef_i512: -; AVX512: # %bb.0: -; AVX512-NEXT: pushq %r14 -; AVX512-NEXT: pushq %rbx -; AVX512-NEXT: movq 48(%rdi), %r11 -; AVX512-NEXT: movq 40(%rdi), %r9 -; AVX512-NEXT: movq 32(%rdi), %r10 -; AVX512-NEXT: movq 24(%rdi), %r8 -; AVX512-NEXT: movq 16(%rdi), %rdx -; AVX512-NEXT: movq (%rdi), %rcx -; AVX512-NEXT: movq 8(%rdi), %rsi -; AVX512-NEXT: tzcntq %rcx, %rax -; AVX512-NEXT: tzcntq %rsi, %r14 -; AVX512-NEXT: addl $64, %r14d -; AVX512-NEXT: testq %rcx, %rcx -; AVX512-NEXT: cmovnel %eax, %r14d -; AVX512-NEXT: tzcntq %rdx, %rax -; AVX512-NEXT: tzcntq %r8, %rbx -; AVX512-NEXT: addl $64, %ebx -; AVX512-NEXT: testq %rdx, %rdx -; AVX512-NEXT: cmovnel %eax, %ebx -; AVX512-NEXT: subl $-128, %ebx -; AVX512-NEXT: movq %rcx, %rax -; AVX512-NEXT: orq %rsi, %rax -; AVX512-NEXT: cmovnel %r14d, %ebx -; AVX512-NEXT: tzcntq %r10, %rax -; AVX512-NEXT: tzcntq %r9, %r14 -; AVX512-NEXT: addl $64, %r14d -; AVX512-NEXT: testq %r10, %r10 -; AVX512-NEXT: cmovnel %eax, %r14d -; AVX512-NEXT: tzcntq 56(%rdi), %rax -; AVX512-NEXT: tzcntq %r11, %rdi -; AVX512-NEXT: addl $64, %eax -; AVX512-NEXT: testq %r11, %r11 -; AVX512-NEXT: cmovnel %edi, %eax -; AVX512-NEXT: subl $-128, %eax -; AVX512-NEXT: orq %r9, %r10 -; AVX512-NEXT: cmovnel %r14d, %eax -; AVX512-NEXT: addl $256, %eax # imm = 0x100 -; AVX512-NEXT: orq %r8, %rsi -; AVX512-NEXT: orq %rdx, %rcx -; AVX512-NEXT: orq %rsi, %rcx -; AVX512-NEXT: cmovnel %ebx, %eax -; AVX512-NEXT: # kill: def $eax killed $eax killed $rax -; AVX512-NEXT: popq %rbx -; AVX512-NEXT: popq %r14 -; AVX512-NEXT: retq +; AVX512F-LABEL: load_cttz_undef_i512: +; AVX512F: # %bb.0: +; AVX512F-NEXT: vmovdqu64 (%rdi), %zmm0 +; AVX512F-NEXT: vpternlogd {{.*#+}} zmm1 = -1 +; AVX512F-NEXT: vpaddq %zmm1, %zmm0, %zmm1 +; AVX512F-NEXT: vpandnq %zmm1, %zmm0, %zmm1 +; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm2 = [64,128,192,256,320,384,448,512] +; AVX512F-NEXT: vplzcntq %zmm1, 
%zmm1 +; AVX512F-NEXT: vpsubq %zmm1, %zmm2, %zmm1 +; AVX512F-NEXT: vptestmq %zmm0, %zmm0, %k1 +; AVX512F-NEXT: vpcompressq %zmm1, %zmm0 {%k1} {z} +; AVX512F-NEXT: vmovq %xmm0, %rax +; AVX512F-NEXT: # kill: def $eax killed $eax killed $rax +; AVX512F-NEXT: vzeroupper +; AVX512F-NEXT: retq +; +; AVX512POPCNT-LABEL: load_cttz_undef_i512: +; AVX512POPCNT: # %bb.0: +; AVX512POPCNT-NEXT: vmovdqu64 (%rdi), %zmm0 +; AVX512POPCNT-NEXT: vpternlogd {{.*#+}} zmm1 = -1 +; AVX512POPCNT-NEXT: vpaddq %zmm1, %zmm0, %zmm1 +; AVX512POPCNT-NEXT: vpandnq %zmm1, %zmm0, %zmm1 +; AVX512POPCNT-NEXT: vpopcntq %zmm1, %zmm1 +; AVX512POPCNT-NEXT: vpaddq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm1, %zmm1 +; AVX512POPCNT-NEXT: vptestmq %zmm0, %zmm0, %k1 +; AVX512POPCNT-NEXT: vpcompressq %zmm1, %zmm0 {%k1} {z} +; AVX512POPCNT-NEXT: vmovq %xmm0, %rax +; AVX512POPCNT-NEXT: # kill: def $eax killed $eax killed $rax +; AVX512POPCNT-NEXT: vzeroupper +; AVX512POPCNT-NEXT: retq %a0 = load i512, ptr %p0 %cnt = call i512 @llvm.cttz.i512(i512 %a0, i1 -1) %res = trunc i512 %cnt to i32 From 1deaedd972b21b9eff03e1b89207329b71b2824c Mon Sep 17 00:00:00 2001 From: Kazu Hirata Date: Wed, 12 Nov 2025 07:21:13 -0800 Subject: [PATCH 14/15] [ADT] Make DenseMapBase::swap the public entry point (#167650) Without this patch, DenseMap::swap and SmallDenseMap::swap are inconsistent because DenseMap::swap increments the epoch while SmallDenseMap::swap does not. This patch solves the inconsistency by making DenseMapBase::swap the public entry point and renaming the existing swap to swapImpl. To ease the review process, this patch does not move or group functions according to access specifiers like private: and protected:. 
--- llvm/include/llvm/ADT/DenseMap.h | 24 ++++++++++++++++-------- 1 file changed, 16 insertions(+), 8 deletions(-) diff --git a/llvm/include/llvm/ADT/DenseMap.h b/llvm/include/llvm/ADT/DenseMap.h index 22ef7ed64451e..d5b13e7731550 100644 --- a/llvm/include/llvm/ADT/DenseMap.h +++ b/llvm/include/llvm/ADT/DenseMap.h @@ -360,6 +360,12 @@ class DenseMapBase : public DebugEpochBase { return getBuckets(); } + void swap(DerivedT &RHS) { + this->incrementEpoch(); + RHS.incrementEpoch(); + derived().swapImpl(RHS); + } + protected: DenseMapBase() = default; @@ -736,7 +742,7 @@ class DenseMap : public DenseMapBase, DenseMap(DenseMap &&other) : BaseT() { init(0); - swap(other); + this->swap(other); } template DenseMap(const InputIt &I, const InputIt &E) { @@ -756,15 +762,15 @@ class DenseMap : public DenseMapBase, deallocateBuckets(); } - void swap(DenseMap &RHS) { - this->incrementEpoch(); - RHS.incrementEpoch(); +private: + void swapImpl(DenseMap &RHS) { std::swap(Buckets, RHS.Buckets); std::swap(NumEntries, RHS.NumEntries); std::swap(NumTombstones, RHS.NumTombstones); std::swap(NumBuckets, RHS.NumBuckets); } +public: DenseMap &operator=(const DenseMap &other) { if (&other != this) this->copyFrom(other); @@ -775,7 +781,7 @@ class DenseMap : public DenseMapBase, this->destroyAll(); deallocateBuckets(); init(0); - swap(other); + this->swap(other); return *this; } @@ -895,7 +901,7 @@ class SmallDenseMap SmallDenseMap(SmallDenseMap &&other) : BaseT() { init(0); - swap(other); + this->swap(other); } template @@ -916,7 +922,8 @@ class SmallDenseMap deallocateBuckets(); } - void swap(SmallDenseMap &RHS) { +private: + void swapImpl(SmallDenseMap &RHS) { unsigned TmpNumEntries = RHS.NumEntries; RHS.NumEntries = NumEntries; NumEntries = TmpNumEntries; @@ -987,6 +994,7 @@ class SmallDenseMap new (SmallSide.getLargeRep()) LargeRep(std::move(TmpRep)); } +public: SmallDenseMap &operator=(const SmallDenseMap &other) { if (&other != this) this->copyFrom(other); @@ -997,7 +1005,7 @@ class 
SmallDenseMap this->destroyAll(); deallocateBuckets(); init(0); - swap(other); + this->swap(other); return *this; } From 834a3cca3145775e948e9c081db9741152b11392 Mon Sep 17 00:00:00 2001 From: Steven Perron Date: Wed, 12 Nov 2025 10:27:31 -0500 Subject: [PATCH 15/15] [SPIRV] Handle ptrcast between array and vector types (#166418) This commit adds support for legalizing pointer casts between array and vector types within the SPIRV backend. This is necessary to handle cases where a vector is loaded from or stored to an array, which can occur with HLSL matrix types. The following changes are included: - Added to load a vector from an array. - Added to store a vector to an array. - Added the test case to verify the functionality. --- .../Target/SPIRV/SPIRVLegalizePointerCast.cpp | 80 +++++++++++++++++++ .../pointers/load-store-vec-from-array.ll | 54 +++++++++++++ 2 files changed, 134 insertions(+) create mode 100644 llvm/test/CodeGen/SPIRV/pointers/load-store-vec-from-array.ll diff --git a/llvm/lib/Target/SPIRV/SPIRVLegalizePointerCast.cpp b/llvm/lib/Target/SPIRV/SPIRVLegalizePointerCast.cpp index 65dffc7908b78..4ce871b6f5e5d 100644 --- a/llvm/lib/Target/SPIRV/SPIRVLegalizePointerCast.cpp +++ b/llvm/lib/Target/SPIRV/SPIRVLegalizePointerCast.cpp @@ -116,6 +116,81 @@ class SPIRVLegalizePointerCast : public FunctionPass { return LI; } + // Loads elements from an array and constructs a vector. + Value *loadVectorFromArray(IRBuilder<> &B, FixedVectorType *TargetType, + Value *Source) { + // Load each element of the array. + SmallVector LoadedElements; + for (unsigned i = 0; i < TargetType->getNumElements(); ++i) { + // Create a GEP to access the i-th element of the array. 
+ SmallVector Types = {Source->getType(), Source->getType()}; + SmallVector Args; + Args.push_back(B.getInt1(false)); + Args.push_back(Source); + Args.push_back(B.getInt32(0)); + Args.push_back(ConstantInt::get(B.getInt32Ty(), i)); + auto *ElementPtr = B.CreateIntrinsic(Intrinsic::spv_gep, {Types}, {Args}); + GR->buildAssignPtr(B, TargetType->getElementType(), ElementPtr); + + // Load the value from the element pointer. + Value *Load = B.CreateLoad(TargetType->getElementType(), ElementPtr); + buildAssignType(B, TargetType->getElementType(), Load); + LoadedElements.push_back(Load); + } + + // Build the vector from the loaded elements. + Value *NewVector = PoisonValue::get(TargetType); + buildAssignType(B, TargetType, NewVector); + + for (unsigned i = 0; i < TargetType->getNumElements(); ++i) { + Value *Index = B.getInt32(i); + SmallVector Types = {TargetType, TargetType, + TargetType->getElementType(), + Index->getType()}; + SmallVector Args = {NewVector, LoadedElements[i], Index}; + NewVector = B.CreateIntrinsic(Intrinsic::spv_insertelt, {Types}, {Args}); + buildAssignType(B, TargetType, NewVector); + } + return NewVector; + } + + // Stores elements from a vector into an array. + void storeArrayFromVector(IRBuilder<> &B, Value *SrcVector, + Value *DstArrayPtr, ArrayType *ArrTy, + Align Alignment) { + auto *VecTy = cast(SrcVector->getType()); + + // Ensure the element types of the array and vector are the same. + assert(VecTy->getElementType() == ArrTy->getElementType() && + "Element types of array and vector must be the same."); + + for (unsigned i = 0; i < VecTy->getNumElements(); ++i) { + // Create a GEP to access the i-th element of the array. 
+ SmallVector Types = {DstArrayPtr->getType(), + DstArrayPtr->getType()}; + SmallVector Args; + Args.push_back(B.getInt1(false)); + Args.push_back(DstArrayPtr); + Args.push_back(B.getInt32(0)); + Args.push_back(ConstantInt::get(B.getInt32Ty(), i)); + auto *ElementPtr = B.CreateIntrinsic(Intrinsic::spv_gep, {Types}, {Args}); + GR->buildAssignPtr(B, ArrTy->getElementType(), ElementPtr); + + // Extract the element from the vector and store it. + Value *Index = B.getInt32(i); + SmallVector EltTypes = {VecTy->getElementType(), VecTy, + Index->getType()}; + SmallVector EltArgs = {SrcVector, Index}; + Value *Element = + B.CreateIntrinsic(Intrinsic::spv_extractelt, {EltTypes}, {EltArgs}); + buildAssignType(B, VecTy->getElementType(), Element); + + Types = {Element->getType(), ElementPtr->getType()}; + Args = {Element, ElementPtr, B.getInt16(2), B.getInt8(Alignment.value())}; + B.CreateIntrinsic(Intrinsic::spv_store, {Types}, {Args}); + } + } + // Replaces the load instruction to get rid of the ptrcast used as source // operand. 
void transformLoad(IRBuilder<> &B, LoadInst *LI, Value *CastedOperand,
@@ -154,6 +229,8 @@ class SPIRVLegalizePointerCast : public FunctionPass {
// - float v = s.m;
else if (SST && SST->getTypeAtIndex(0u) == ToTy)
Output = loadFirstValueFromAggregate(B, ToTy, OriginalOperand, LI);
+ else if (SAT && DVT && SAT->getElementType() == DVT->getElementType())
+ Output = loadVectorFromArray(B, DVT, OriginalOperand);
else
llvm_unreachable("Unimplemented implicit down-cast from load.");

@@ -288,6 +365,7 @@ class SPIRVLegalizePointerCast : public FunctionPass {
auto *S_VT = dyn_cast<FixedVectorType>(FromTy);
auto *D_ST = dyn_cast<StructType>(ToTy);
auto *D_VT = dyn_cast<FixedVectorType>(ToTy);
+ auto *D_AT = dyn_cast<ArrayType>(ToTy);
B.SetInsertPoint(BadStore);

if (D_ST && isTypeFirstElementAggregate(FromTy, D_ST))
@@ -296,6 +374,8 @@ class SPIRVLegalizePointerCast : public FunctionPass {
storeVectorFromVector(B, Src, Dst, Alignment);
else if (D_VT && !S_VT && FromTy == D_VT->getElementType())
storeToFirstValueAggregate(B, Src, Dst, D_VT, Alignment);
+ else if (D_AT && S_VT && S_VT->getElementType() == D_AT->getElementType())
+ storeArrayFromVector(B, Src, Dst, D_AT, Alignment);
else
llvm_unreachable("Unsupported ptrcast use in store.
Please fix."); diff --git a/llvm/test/CodeGen/SPIRV/pointers/load-store-vec-from-array.ll b/llvm/test/CodeGen/SPIRV/pointers/load-store-vec-from-array.ll new file mode 100644 index 0000000000000..917bb27afad00 --- /dev/null +++ b/llvm/test/CodeGen/SPIRV/pointers/load-store-vec-from-array.ll @@ -0,0 +1,54 @@ +; RUN: llc -O0 -verify-machineinstrs -mtriple=spirv-unknown-vulkan %s -o - | FileCheck %s +; RUN: %if spirv-tools %{ llc -O0 -verify-machineinstrs -mtriple=spirv-unknown-vulkan %s -o - -filetype=obj | spirv-val %} + +; CHECK-DAG: [[FLOAT:%[0-9]+]] = OpTypeFloat 32 +; CHECK-DAG: [[VEC4FLOAT:%[0-9]+]] = OpTypeVector [[FLOAT]] 4 +; CHECK-DAG: [[UINT_TYPE:%[0-9]+]] = OpTypeInt 32 0 +; CHECK-DAG: [[UINT4:%[0-9]+]] = OpConstant [[UINT_TYPE]] 4 +; CHECK-DAG: [[ARRAY4FLOAT:%[0-9]+]] = OpTypeArray [[FLOAT]] [[UINT4]] +; CHECK-DAG: [[PTR_ARRAY4FLOAT:%[0-9]+]] = OpTypePointer Private [[ARRAY4FLOAT]] +; CHECK-DAG: [[G_IN:%[0-9]+]] = OpVariable [[PTR_ARRAY4FLOAT]] Private +; CHECK-DAG: [[G_OUT:%[0-9]+]] = OpVariable [[PTR_ARRAY4FLOAT]] Private +; CHECK-DAG: [[UINT0:%[0-9]+]] = OpConstant [[UINT_TYPE]] 0 +; CHECK-DAG: [[UINT1:%[0-9]+]] = OpConstant [[UINT_TYPE]] 1 +; CHECK-DAG: [[UINT2:%[0-9]+]] = OpConstant [[UINT_TYPE]] 2 +; CHECK-DAG: [[UINT3:%[0-9]+]] = OpConstant [[UINT_TYPE]] 3 +; CHECK-DAG: [[PTR_FLOAT:%[0-9]+]] = OpTypePointer Private [[FLOAT]] +; CHECK-DAG: [[UNDEF_VEC:%[0-9]+]] = OpUndef [[VEC4FLOAT]] + +@G_in = internal addrspace(10) global [4 x float] zeroinitializer +@G_out = internal addrspace(10) global [4 x float] zeroinitializer + +define spir_func void @main() { +entry: +; CHECK: [[GEP0:%[0-9]+]] = OpAccessChain [[PTR_FLOAT]] [[G_IN]] [[UINT0]] +; CHECK-NEXT: [[LOAD0:%[0-9]+]] = OpLoad [[FLOAT]] [[GEP0]] +; CHECK-NEXT: [[GEP1:%[0-9]+]] = OpAccessChain [[PTR_FLOAT]] [[G_IN]] [[UINT1]] +; CHECK-NEXT: [[LOAD1:%[0-9]+]] = OpLoad [[FLOAT]] [[GEP1]] +; CHECK-NEXT: [[GEP2:%[0-9]+]] = OpAccessChain [[PTR_FLOAT]] [[G_IN]] [[UINT2]] +; CHECK-NEXT: [[LOAD2:%[0-9]+]] = 
OpLoad [[FLOAT]] [[GEP2]] +; CHECK-NEXT: [[GEP3:%[0-9]+]] = OpAccessChain [[PTR_FLOAT]] [[G_IN]] [[UINT3]] +; CHECK-NEXT: [[LOAD3:%[0-9]+]] = OpLoad [[FLOAT]] [[GEP3]] +; CHECK-NEXT: [[VEC_INSERT0:%[0-9]+]] = OpCompositeInsert [[VEC4FLOAT]] [[LOAD0]] [[UNDEF_VEC]] 0 +; CHECK-NEXT: [[VEC_INSERT1:%[0-9]+]] = OpCompositeInsert [[VEC4FLOAT]] [[LOAD1]] [[VEC_INSERT0]] 1 +; CHECK-NEXT: [[VEC_INSERT2:%[0-9]+]] = OpCompositeInsert [[VEC4FLOAT]] [[LOAD2]] [[VEC_INSERT1]] 2 +; CHECK-NEXT: [[VEC:%[0-9]+]] = OpCompositeInsert [[VEC4FLOAT]] [[LOAD3]] [[VEC_INSERT2]] 3 + %0 = load <4 x float>, ptr addrspace(10) @G_in, align 64 + +; CHECK-NEXT: [[GEP_OUT0:%[0-9]+]] = OpAccessChain [[PTR_FLOAT]] [[G_OUT]] [[UINT0]] +; CHECK-NEXT: [[VEC_EXTRACT0:%[0-9]+]] = OpCompositeExtract [[FLOAT]] [[VEC]] 0 +; CHECK-NEXT: OpStore [[GEP_OUT0]] [[VEC_EXTRACT0]] +; CHECK-NEXT: [[GEP_OUT1:%[0-9]+]] = OpAccessChain [[PTR_FLOAT]] [[G_OUT]] [[UINT1]] +; CHECK-NEXT: [[VEC_EXTRACT1:%[0-9]+]] = OpCompositeExtract [[FLOAT]] [[VEC]] 1 +; CHECK-NEXT: OpStore [[GEP_OUT1]] [[VEC_EXTRACT1]] +; CHECK-NEXT: [[GEP_OUT2:%[0-9]+]] = OpAccessChain [[PTR_FLOAT]] [[G_OUT]] [[UINT2]] +; CHECK-NEXT: [[VEC_EXTRACT2:%[0-9]+]] = OpCompositeExtract [[FLOAT]] [[VEC]] 2 +; CHECK-NEXT: OpStore [[GEP_OUT2]] [[VEC_EXTRACT2]] +; CHECK-NEXT: [[GEP_OUT3:%[0-9]+]] = OpAccessChain [[PTR_FLOAT]] [[G_OUT]] [[UINT3]] +; CHECK-NEXT: [[VEC_EXTRACT3:%[0-9]+]] = OpCompositeExtract [[FLOAT]] [[VEC]] 3 +; CHECK-NEXT: OpStore [[GEP_OUT3]] [[VEC_EXTRACT3]] + store <4 x float> %0, ptr addrspace(10) @G_out, align 64 + +; CHECK-NEXT: OpReturn + ret void +}