From d05370e6863e28fcf988b8491dc583fcf5e4e1be Mon Sep 17 00:00:00 2001 From: Baranov Victor Date: Wed, 3 Dec 2025 08:56:24 +0300 Subject: [PATCH 01/36] [clang-tidy][NFC] Enable readability-use-anyofallof check (#167134) Closes https://github.com/llvm/llvm-project/issues/156161. Assisted-by: Claude Sonnet 4.5 via Claude Code --- clang-tools-extra/clang-tidy/.clang-tidy | 3 +-- .../ClangTidyDiagnosticConsumer.cpp | 9 ++++---- .../clang-tidy/bugprone/BranchCloneCheck.cpp | 9 +++----- .../CapturingThisInMemberVariableCheck.cpp | 23 +++++++++---------- .../EasilySwappableParametersCheck.cpp | 8 +++---- .../clang-tidy/bugprone/InfiniteLoopCheck.cpp | 18 +++++---------- .../bugprone/SuspiciousReallocUsageCheck.cpp | 7 +++--- .../ProBoundsArrayToPointerDecayCheck.cpp | 9 ++++---- .../fuchsia/VirtualInheritanceCheck.cpp | 7 +++--- .../misc/NewDeleteOverloadsCheck.cpp | 12 ++++------ .../clang-tidy/misc/UnusedParametersCheck.cpp | 11 ++++----- .../clang-tidy/modernize/LoopConvertCheck.cpp | 19 +++++++-------- .../clang-tidy/modernize/LoopConvertUtils.cpp | 19 ++++++--------- .../clang-tidy/modernize/PassByValueCheck.cpp | 6 +---- .../clang-tidy/modernize/UseEmplaceCheck.cpp | 17 +++++--------- .../modernize/UseTrailingReturnTypeCheck.cpp | 11 ++++----- .../clang-tidy/objc/MissingHashCheck.cpp | 8 +++---- .../TriviallyDestructibleCheck.cpp | 9 +++----- .../AmbiguousSmartptrResetCallCheck.cpp | 9 +++----- .../OperatorsRepresentationCheck.cpp | 8 +++---- .../SuspiciousCallArgumentCheck.cpp | 10 ++++---- .../clang-tidy/utils/Aliasing.cpp | 12 +++------- .../clang-tidy/utils/DeclRefExprUtils.cpp | 5 +--- .../clang-tidy/utils/ExprSequence.cpp | 9 +++----- .../clang-tidy/utils/TypeTraits.cpp | 20 ++++++---------- 25 files changed, 105 insertions(+), 173 deletions(-) diff --git a/clang-tools-extra/clang-tidy/.clang-tidy b/clang-tools-extra/clang-tidy/.clang-tidy index 82d0df8697178..576b4a7b8443e 100644 --- a/clang-tools-extra/clang-tidy/.clang-tidy +++ 
b/clang-tools-extra/clang-tidy/.clang-tidy @@ -32,8 +32,7 @@ Checks: > -readability-qualified-auto, -readability-simplify-boolean-expr, -readability-static-definition-in-anonymous-namespace, - -readability-suspicious-call-argument, - -readability-use-anyofallof + -readability-suspicious-call-argument CheckOptions: - key: performance-move-const-arg.CheckTriviallyCopyableMove diff --git a/clang-tools-extra/clang-tidy/ClangTidyDiagnosticConsumer.cpp b/clang-tools-extra/clang-tidy/ClangTidyDiagnosticConsumer.cpp index 6716d90a1acaf..a8fd499e45c92 100644 --- a/clang-tools-extra/clang-tidy/ClangTidyDiagnosticConsumer.cpp +++ b/clang-tools-extra/clang-tidy/ClangTidyDiagnosticConsumer.cpp @@ -478,11 +478,10 @@ bool ClangTidyDiagnosticConsumer::passesLineFilter(StringRef FileName, if (FileName.ends_with(Filter.Name)) { if (Filter.LineRanges.empty()) return true; - for (const FileFilter::LineRange &Range : Filter.LineRanges) { - if (Range.first <= LineNumber && LineNumber <= Range.second) - return true; - } - return false; + return llvm::any_of( + Filter.LineRanges, [&](const FileFilter::LineRange &Range) { + return Range.first <= LineNumber && LineNumber <= Range.second; + }); } } return false; diff --git a/clang-tools-extra/clang-tidy/bugprone/BranchCloneCheck.cpp b/clang-tools-extra/clang-tidy/bugprone/BranchCloneCheck.cpp index 103b403b9fe5d..4f33670a8500a 100644 --- a/clang-tools-extra/clang-tidy/bugprone/BranchCloneCheck.cpp +++ b/clang-tools-extra/clang-tidy/bugprone/BranchCloneCheck.cpp @@ -75,12 +75,9 @@ static bool isFallthroughSwitchBranch(const SwitchBranch &Branch) { if (!S) return true; - for (const Attr *A : S->getAttrs()) { - if (isa(A)) - return false; - } - - return true; + return llvm::all_of(S->getAttrs(), [](const Attr *A) { + return !isa(A); + }); } } Visitor; diff --git a/clang-tools-extra/clang-tidy/bugprone/CapturingThisInMemberVariableCheck.cpp b/clang-tools-extra/clang-tidy/bugprone/CapturingThisInMemberVariableCheck.cpp index 
a376de505dd70..6aed454813a22 100644 --- a/clang-tools-extra/clang-tidy/bugprone/CapturingThisInMemberVariableCheck.cpp +++ b/clang-tools-extra/clang-tidy/bugprone/CapturingThisInMemberVariableCheck.cpp @@ -44,18 +44,17 @@ AST_MATCHER(CXXRecordDecl, correctHandleCaptureThisLambda) { if (Node.hasSimpleMoveAssignment()) return false; - for (const CXXConstructorDecl *C : Node.ctors()) { - if (C->isCopyOrMoveConstructor() && C->isDefaulted() && !C->isDeleted()) - return false; - } - for (const CXXMethodDecl *M : Node.methods()) { - if (M->isCopyAssignmentOperator()) - llvm::errs() << M->isDeleted() << "\n"; - if (M->isCopyAssignmentOperator() && M->isDefaulted() && !M->isDeleted()) - return false; - if (M->isMoveAssignmentOperator() && M->isDefaulted() && !M->isDeleted()) - return false; - } + if (llvm::any_of(Node.ctors(), [](const CXXConstructorDecl *C) { + return C->isCopyOrMoveConstructor() && C->isDefaulted() && + !C->isDeleted(); + })) + return false; + if (llvm::any_of(Node.methods(), [](const CXXMethodDecl *M) { + return (M->isCopyAssignmentOperator() || + M->isMoveAssignmentOperator()) && + M->isDefaulted() && !M->isDeleted(); + })) + return false; // FIXME: find ways to identifier correct handle capture this lambda return true; } diff --git a/clang-tools-extra/clang-tidy/bugprone/EasilySwappableParametersCheck.cpp b/clang-tools-extra/clang-tidy/bugprone/EasilySwappableParametersCheck.cpp index a07a68c8a3e65..496f3e5015990 100644 --- a/clang-tools-extra/clang-tidy/bugprone/EasilySwappableParametersCheck.cpp +++ b/clang-tools-extra/clang-tidy/bugprone/EasilySwappableParametersCheck.cpp @@ -1589,11 +1589,9 @@ static bool lazyMapOfSetsIntersectionExists(const MapTy &Map, const ElemTy &E1, if (E1Iterator == Map.end() || E2Iterator == Map.end()) return false; - for (const auto &E1SetElem : E1Iterator->second) - if (E2Iterator->second.contains(E1SetElem)) - return true; - - return false; + return llvm::any_of(E1Iterator->second, [&E2Iterator](const auto &E1SetElem) { 
+ return E2Iterator->second.contains(E1SetElem); + }); } /// Implements the heuristic that marks two parameters related if there is diff --git a/clang-tools-extra/clang-tidy/bugprone/InfiniteLoopCheck.cpp b/clang-tools-extra/clang-tidy/bugprone/InfiniteLoopCheck.cpp index 50280d22be0d8..6749c59d5fd57 100644 --- a/clang-tools-extra/clang-tidy/bugprone/InfiniteLoopCheck.cpp +++ b/clang-tools-extra/clang-tidy/bugprone/InfiniteLoopCheck.cpp @@ -119,14 +119,9 @@ static bool isAtLeastOneCondVarChanged(const Decl *Func, const Stmt *LoopStmt, if (isVarThatIsPossiblyChanged(Func, LoopStmt, Cond, Context)) return true; - for (const Stmt *Child : Cond->children()) { - if (!Child) - continue; - - if (isAtLeastOneCondVarChanged(Func, LoopStmt, Child, Context)) - return true; - } - return false; + return llvm::any_of(Cond->children(), [&](const Stmt *Child) { + return Child && isAtLeastOneCondVarChanged(Func, LoopStmt, Child, Context); + }); } /// Return the variable names in `Cond`. @@ -240,10 +235,9 @@ static bool hasStaticLocalVariable(const Stmt *Cond) { return true; } - for (const Stmt *Child : Cond->children()) - if (Child && hasStaticLocalVariable(Child)) - return true; - return false; + return llvm::any_of(Cond->children(), [](const Stmt *Child) { + return Child && hasStaticLocalVariable(Child); + }); } /// Tests if the loop condition `Cond` involves static local variables and diff --git a/clang-tools-extra/clang-tidy/bugprone/SuspiciousReallocUsageCheck.cpp b/clang-tools-extra/clang-tidy/bugprone/SuspiciousReallocUsageCheck.cpp index 7cc3630204e63..bf31218131d5e 100644 --- a/clang-tools-extra/clang-tidy/bugprone/SuspiciousReallocUsageCheck.cpp +++ b/clang-tools-extra/clang-tidy/bugprone/SuspiciousReallocUsageCheck.cpp @@ -92,10 +92,9 @@ class FindAssignToVarBefore return false; } bool VisitStmt(const Stmt *S) { - for (const Stmt *Child : S->children()) - if (Child && Visit(Child)) - return true; - return false; + return llvm::any_of(S->children(), [this](const Stmt 
*Child) { + return Child && Visit(Child); + }); } }; diff --git a/clang-tools-extra/clang-tidy/cppcoreguidelines/ProBoundsArrayToPointerDecayCheck.cpp b/clang-tools-extra/clang-tidy/cppcoreguidelines/ProBoundsArrayToPointerDecayCheck.cpp index d0f86526d1a29..1c5c854cb4d84 100644 --- a/clang-tools-extra/clang-tidy/cppcoreguidelines/ProBoundsArrayToPointerDecayCheck.cpp +++ b/clang-tools-extra/clang-tidy/cppcoreguidelines/ProBoundsArrayToPointerDecayCheck.cpp @@ -19,10 +19,11 @@ namespace clang::tidy::cppcoreguidelines { namespace { AST_MATCHER_P(CXXForRangeStmt, hasRangeBeginEndStmt, ast_matchers::internal::Matcher, InnerMatcher) { - for (const DeclStmt *Stmt : {Node.getBeginStmt(), Node.getEndStmt()}) - if (Stmt != nullptr && InnerMatcher.matches(*Stmt, Finder, Builder)) - return true; - return false; + return llvm::any_of(llvm::ArrayRef{Node.getBeginStmt(), Node.getEndStmt()}, + [&](const DeclStmt *Stmt) { + return Stmt && + InnerMatcher.matches(*Stmt, Finder, Builder); + }); } AST_MATCHER(Stmt, isInsideOfRangeBeginEndStmt) { diff --git a/clang-tools-extra/clang-tidy/fuchsia/VirtualInheritanceCheck.cpp b/clang-tools-extra/clang-tidy/fuchsia/VirtualInheritanceCheck.cpp index b6fb22c66d374..9c98b4938844f 100644 --- a/clang-tools-extra/clang-tidy/fuchsia/VirtualInheritanceCheck.cpp +++ b/clang-tools-extra/clang-tidy/fuchsia/VirtualInheritanceCheck.cpp @@ -20,10 +20,9 @@ AST_MATCHER(CXXRecordDecl, hasDirectVirtualBaseClass) { return false; if (!Node.getNumVBases()) return false; - for (const CXXBaseSpecifier &Base : Node.bases()) - if (Base.isVirtual()) - return true; - return false; + return llvm::any_of(Node.bases(), [](const CXXBaseSpecifier &Base) { + return Base.isVirtual(); + }); } } // namespace diff --git a/clang-tools-extra/clang-tidy/misc/NewDeleteOverloadsCheck.cpp b/clang-tools-extra/clang-tidy/misc/NewDeleteOverloadsCheck.cpp index a44e9b381d982..0471ba8ae291d 100644 --- a/clang-tools-extra/clang-tidy/misc/NewDeleteOverloadsCheck.cpp +++ 
b/clang-tools-extra/clang-tidy/misc/NewDeleteOverloadsCheck.cpp @@ -114,17 +114,15 @@ hasCorrespondingOverloadInBaseClass(const CXXMethodDecl *MD, RD = MD->getParent(); } - for (const auto &BS : RD->bases()) { + return llvm::any_of(RD->bases(), [&](const CXXBaseSpecifier &BS) { // We can't say much about a dependent base class, but to avoid false // positives assume it can have a corresponding overload. if (BS.getType()->isDependentType()) return true; - if (const auto *BaseRD = BS.getType()->getAsCXXRecordDecl()) - if (hasCorrespondingOverloadInBaseClass(MD, BaseRD)) - return true; - } - - return false; + if (const CXXRecordDecl *BaseRD = BS.getType()->getAsCXXRecordDecl()) + return hasCorrespondingOverloadInBaseClass(MD, BaseRD); + return false; + }); } void NewDeleteOverloadsCheck::registerMatchers(MatchFinder *Finder) { diff --git a/clang-tools-extra/clang-tidy/misc/UnusedParametersCheck.cpp b/clang-tools-extra/clang-tidy/misc/UnusedParametersCheck.cpp index 870905087a9bd..9c38bb129022f 100644 --- a/clang-tools-extra/clang-tidy/misc/UnusedParametersCheck.cpp +++ b/clang-tools-extra/clang-tidy/misc/UnusedParametersCheck.cpp @@ -30,13 +30,10 @@ static bool isOverrideMethod(const FunctionDecl *Function) { static bool hasAttrAfterParam(const SourceManager *SourceManager, const ParmVarDecl *Param) { - for (const auto *Attr : Param->attrs()) { - if (SourceManager->isBeforeInTranslationUnit(Param->getLocation(), - Attr->getLocation())) { - return true; - } - } - return false; + return llvm::any_of(Param->attrs(), [&](const Attr *Attr) { + return SourceManager->isBeforeInTranslationUnit(Param->getLocation(), + Attr->getLocation()); + }); } void UnusedParametersCheck::registerMatchers(MatchFinder *Finder) { diff --git a/clang-tools-extra/clang-tidy/modernize/LoopConvertCheck.cpp b/clang-tools-extra/clang-tidy/modernize/LoopConvertCheck.cpp index 668ba08400b2b..19b406f05a746 100644 --- a/clang-tools-extra/clang-tidy/modernize/LoopConvertCheck.cpp +++ 
b/clang-tools-extra/clang-tidy/modernize/LoopConvertCheck.cpp @@ -510,27 +510,24 @@ static bool canBeModified(ASTContext *Context, const Expr *E) { /// Returns true when it can be guaranteed that the elements of the /// container are not being modified. static bool usagesAreConst(ASTContext *Context, const UsageResult &Usages) { - for (const Usage &U : Usages) { + return llvm::none_of(Usages, [&Context](const Usage &U) { // Lambda captures are just redeclarations (VarDecl) of the same variable, // not expressions. If we want to know if a variable that is captured by // reference can be modified in an usage inside the lambda's body, we need // to find the expression corresponding to that particular usage, later in // this loop. - if (U.Kind != Usage::UK_CaptureByCopy && U.Kind != Usage::UK_CaptureByRef && - canBeModified(Context, U.Expression)) - return false; - } - return true; + return U.Kind != Usage::UK_CaptureByCopy && + U.Kind != Usage::UK_CaptureByRef && + canBeModified(Context, U.Expression); + }); } /// Returns true if the elements of the container are never accessed /// by reference. static bool usagesReturnRValues(const UsageResult &Usages) { - for (const auto &U : Usages) { - if (U.Expression && !U.Expression->isPRValue()) - return false; - } - return true; + return llvm::all_of(Usages, [](const Usage &U) { + return !U.Expression || U.Expression->isPRValue(); + }); } /// Returns true if the container is const-qualified. diff --git a/clang-tools-extra/clang-tidy/modernize/LoopConvertUtils.cpp b/clang-tools-extra/clang-tidy/modernize/LoopConvertUtils.cpp index 170a4f6d8731f..f6685dda7e09e 100644 --- a/clang-tools-extra/clang-tidy/modernize/LoopConvertUtils.cpp +++ b/clang-tools-extra/clang-tidy/modernize/LoopConvertUtils.cpp @@ -89,13 +89,11 @@ bool DependencyFinderASTVisitor::VisitVarDecl(VarDecl *V) { // Next, check if the variable was removed from existence by an earlier // iteration. 
- for (const auto &I : *ReplacedVars) { - if (I.second == V) { - DependsOnInsideVariable = true; - return false; - } - } - return true; + if (llvm::none_of(*ReplacedVars, + [&](const auto &I) { return I.second == V; })) + return true; + DependsOnInsideVariable = true; + return false; } /// If we already created a variable for TheLoop, check to make sure @@ -234,11 +232,8 @@ static bool containsExpr(ASTContext *Context, const ContainerT *Container, const Expr *E) { llvm::FoldingSetNodeID ID; E->Profile(ID, *Context, true); - for (const auto &I : *Container) { - if (ID == I.second) - return true; - } - return false; + return llvm::any_of(*Container, + [&](const auto &I) { return ID == I.second; }); } /// Returns true when the index expression is a declaration reference to diff --git a/clang-tools-extra/clang-tidy/modernize/PassByValueCheck.cpp b/clang-tools-extra/clang-tidy/modernize/PassByValueCheck.cpp index a257f5325f780..09d98ee8bea6f 100644 --- a/clang-tools-extra/clang-tidy/modernize/PassByValueCheck.cpp +++ b/clang-tools-extra/clang-tidy/modernize/PassByValueCheck.cpp @@ -196,11 +196,7 @@ static bool hasRValueOverload(const CXXConstructorDecl *Ctor, return true; }; - for (const auto *Candidate : Record->ctors()) { - if (IsRValueOverload(Candidate)) - return true; - } - return false; + return llvm::any_of(Record->ctors(), IsRValueOverload); } /// Find all references to \p ParamDecl across all of the diff --git a/clang-tools-extra/clang-tidy/modernize/UseEmplaceCheck.cpp b/clang-tools-extra/clang-tidy/modernize/UseEmplaceCheck.cpp index e585dd1d40002..ca97b11b9990b 100644 --- a/clang-tools-extra/clang-tidy/modernize/UseEmplaceCheck.cpp +++ b/clang-tools-extra/clang-tidy/modernize/UseEmplaceCheck.cpp @@ -44,17 +44,12 @@ AST_MATCHER_P(NamedDecl, hasAnyNameIgnoringTemplates, std::vector, // clang/lib/ASTMatchers/ASTMatchersInternal.cpp and checks whether // FullNameTrimmed matches any of the given Names. 
const StringRef FullNameTrimmedRef = FullNameTrimmed; - for (const StringRef Pattern : Names) { - if (Pattern.starts_with("::")) { - if (FullNameTrimmed == Pattern) - return true; - } else if (FullNameTrimmedRef.ends_with(Pattern) && - FullNameTrimmedRef.drop_back(Pattern.size()).ends_with("::")) { - return true; - } - } - - return false; + return llvm::any_of(Names, [&](const StringRef Pattern) { + if (Pattern.starts_with("::")) + return FullNameTrimmed == Pattern; + return FullNameTrimmedRef.ends_with(Pattern) && + FullNameTrimmedRef.drop_back(Pattern.size()).ends_with("::"); + }); } // Checks if the given matcher is the last argument of the given CallExpr. diff --git a/clang-tools-extra/clang-tidy/modernize/UseTrailingReturnTypeCheck.cpp b/clang-tools-extra/clang-tidy/modernize/UseTrailingReturnTypeCheck.cpp index f9afd5044b584..02865b65a9ec2 100644 --- a/clang-tools-extra/clang-tidy/modernize/UseTrailingReturnTypeCheck.cpp +++ b/clang-tools-extra/clang-tidy/modernize/UseTrailingReturnTypeCheck.cpp @@ -55,13 +55,12 @@ struct UnqualNameVisitor : public RecursiveASTVisitor { bool visitUnqualName(StringRef UnqualName) { // Check for collisions with function arguments. 
- for (const ParmVarDecl *Param : F.parameters()) + Collision = llvm::any_of(F.parameters(), [&](const ParmVarDecl *Param) { if (const IdentifierInfo *Ident = Param->getIdentifier()) - if (Ident->getName() == UnqualName) { - Collision = true; - return true; - } - return false; + return Ident->getName() == UnqualName; + return false; + }); + return Collision; } bool TraverseTypeLoc(TypeLoc TL, bool TraverseQualifier = true) { diff --git a/clang-tools-extra/clang-tidy/objc/MissingHashCheck.cpp b/clang-tools-extra/clang-tidy/objc/MissingHashCheck.cpp index 7b48fd9f77bca..b8010e0d29eb5 100644 --- a/clang-tools-extra/clang-tidy/objc/MissingHashCheck.cpp +++ b/clang-tools-extra/clang-tidy/objc/MissingHashCheck.cpp @@ -25,11 +25,9 @@ AST_MATCHER_P(ObjCImplementationDecl, hasInterface, AST_MATCHER_P(ObjCContainerDecl, hasInstanceMethod, ast_matchers::internal::Matcher, Base) { // Check each instance method against the provided matcher. - for (const auto *I : Node.instance_methods()) { - if (Base.matches(*I, Finder, Builder)) - return true; - } - return false; + return llvm::any_of(Node.instance_methods(), [&](const ObjCMethodDecl *I) { + return Base.matches(*I, Finder, Builder); + }); } } // namespace diff --git a/clang-tools-extra/clang-tidy/performance/TriviallyDestructibleCheck.cpp b/clang-tools-extra/clang-tidy/performance/TriviallyDestructibleCheck.cpp index 2f54b17367b06..416c41d7acd66 100644 --- a/clang-tools-extra/clang-tidy/performance/TriviallyDestructibleCheck.cpp +++ b/clang-tools-extra/clang-tidy/performance/TriviallyDestructibleCheck.cpp @@ -23,12 +23,9 @@ namespace { AST_MATCHER(Decl, isFirstDecl) { return Node.isFirstDecl(); } AST_MATCHER_P(CXXRecordDecl, hasBase, Matcher, InnerMatcher) { - for (const CXXBaseSpecifier &BaseSpec : Node.bases()) { - const QualType BaseType = BaseSpec.getType(); - if (InnerMatcher.matches(BaseType, Finder, Builder)) - return true; - } - return false; + return llvm::any_of(Node.bases(), [&](const CXXBaseSpecifier &BaseSpec) { + 
return InnerMatcher.matches(BaseSpec.getType(), Finder, Builder); + }); } } // namespace diff --git a/clang-tools-extra/clang-tidy/readability/AmbiguousSmartptrResetCallCheck.cpp b/clang-tools-extra/clang-tidy/readability/AmbiguousSmartptrResetCallCheck.cpp index 22ff5ce1545a5..ef9263beebfdd 100644 --- a/clang-tools-extra/clang-tidy/readability/AmbiguousSmartptrResetCallCheck.cpp +++ b/clang-tools-extra/clang-tidy/readability/AmbiguousSmartptrResetCallCheck.cpp @@ -20,12 +20,9 @@ namespace clang::tidy::readability { namespace { AST_MATCHER(CXXMethodDecl, hasOnlyDefaultParameters) { - for (const auto *Param : Node.parameters()) { - if (!Param->hasDefaultArg()) - return false; - } - - return true; + return llvm::all_of(Node.parameters(), [](const ParmVarDecl *Param) { + return Param->hasDefaultArg(); + }); } const auto DefaultSmartPointers = "::std::shared_ptr;::std::unique_ptr;" diff --git a/clang-tools-extra/clang-tidy/readability/OperatorsRepresentationCheck.cpp b/clang-tools-extra/clang-tidy/readability/OperatorsRepresentationCheck.cpp index 269996dc07916..05f31c76c4c75 100644 --- a/clang-tools-extra/clang-tidy/readability/OperatorsRepresentationCheck.cpp +++ b/clang-tools-extra/clang-tidy/readability/OperatorsRepresentationCheck.cpp @@ -136,11 +136,9 @@ getRepresentation(const std::vector &Config, template static bool isAnyOperatorEnabled(const std::vector &Config, const T &Operators) { - for (const auto &[traditional, alternative] : Operators) { - if (!getRepresentation(Config, traditional, alternative).empty()) - return true; - } - return false; + return llvm::any_of(Operators, [&](const auto &Op) { + return !getRepresentation(Config, Op.first, Op.second).empty(); + }); } OperatorsRepresentationCheck::OperatorsRepresentationCheck( diff --git a/clang-tools-extra/clang-tidy/readability/SuspiciousCallArgumentCheck.cpp b/clang-tools-extra/clang-tidy/readability/SuspiciousCallArgumentCheck.cpp index d9ccd9920bb6f..0b52a9664a7a5 100644 --- 
a/clang-tools-extra/clang-tidy/readability/SuspiciousCallArgumentCheck.cpp +++ b/clang-tools-extra/clang-tidy/readability/SuspiciousCallArgumentCheck.cpp @@ -758,7 +758,7 @@ bool SuspiciousCallArgumentCheck::areParamAndArgComparable( bool SuspiciousCallArgumentCheck::areArgsSwapped(std::size_t Position1, std::size_t Position2) const { - for (const Heuristic H : AppliedHeuristics) { + return llvm::any_of(AppliedHeuristics, [&](Heuristic H) { const bool A1ToP2Similar = areNamesSimilar( ArgNames[Position2], ParamNames[Position1], H, BoundKind::SimilarAbove); const bool A2ToP1Similar = areNamesSimilar( @@ -771,11 +771,9 @@ bool SuspiciousCallArgumentCheck::areArgsSwapped(std::size_t Position1, !areNamesSimilar(ArgNames[Position2], ParamNames[Position2], H, BoundKind::DissimilarBelow); - if ((A1ToP2Similar || A2ToP1Similar) && A1ToP1Dissimilar && - A2ToP2Dissimilar) - return true; - } - return false; + return (A1ToP2Similar || A2ToP1Similar) && A1ToP1Dissimilar && + A2ToP2Dissimilar; + }); } bool SuspiciousCallArgumentCheck::areNamesSimilar(StringRef Arg, diff --git a/clang-tools-extra/clang-tidy/utils/Aliasing.cpp b/clang-tools-extra/clang-tidy/utils/Aliasing.cpp index a22d2358bc560..1b12859c7e450 100644 --- a/clang-tools-extra/clang-tidy/utils/Aliasing.cpp +++ b/clang-tools-extra/clang-tidy/utils/Aliasing.cpp @@ -65,15 +65,9 @@ static bool hasPtrOrReferenceInStmt(const Stmt *S, const ValueDecl *Var) { if (isPtrOrReferenceForVar(S, Var)) return true; - for (const Stmt *Child : S->children()) { - if (!Child) - continue; - - if (hasPtrOrReferenceInStmt(Child, Var)) - return true; - } - - return false; + return llvm::any_of(S->children(), [&](const Stmt *Child) { + return Child && hasPtrOrReferenceInStmt(Child, Var); + }); } static bool refersToEnclosingLambdaCaptureByRef(const Decl *Func, diff --git a/clang-tools-extra/clang-tidy/utils/DeclRefExprUtils.cpp b/clang-tools-extra/clang-tidy/utils/DeclRefExprUtils.cpp index 75a6dafed3c5e..a807c951a0e98 100644 --- 
a/clang-tools-extra/clang-tidy/utils/DeclRefExprUtils.cpp +++ b/clang-tools-extra/clang-tidy/utils/DeclRefExprUtils.cpp @@ -21,10 +21,7 @@ using llvm::SmallPtrSet; template static bool isSetDifferenceEmpty(const S &S1, const S &S2) { - for (auto E : S1) - if (S2.count(E) == 0) - return false; - return true; + return llvm::none_of(S1, [&S2](const auto &E) { return !S2.contains(E); }); } // Extracts all Nodes keyed by ID from Matches and inserts them into Nodes. diff --git a/clang-tools-extra/clang-tidy/utils/ExprSequence.cpp b/clang-tools-extra/clang-tidy/utils/ExprSequence.cpp index 0375d0f6c740f..45fcacf584157 100644 --- a/clang-tools-extra/clang-tidy/utils/ExprSequence.cpp +++ b/clang-tools-extra/clang-tidy/utils/ExprSequence.cpp @@ -148,12 +148,9 @@ bool ExprSequence::inSequence(const Stmt *Before, const Stmt *After) const { // If 'After' is a parent of 'Before' or is sequenced after one of these // parents, we know that it is sequenced after 'Before'. - for (const Stmt *Parent : BeforeParents) { - if (Parent == After || inSequence(Parent, After)) - return true; - } - - return false; + return llvm::any_of(BeforeParents, [&](const Stmt *Parent) { + return Parent == After || inSequence(Parent, After); + }); } bool ExprSequence::potentiallyAfter(const Stmt *After, diff --git a/clang-tools-extra/clang-tidy/utils/TypeTraits.cpp b/clang-tools-extra/clang-tidy/utils/TypeTraits.cpp index 98a5d40d49313..dde6e9a8dca70 100644 --- a/clang-tools-extra/clang-tidy/utils/TypeTraits.cpp +++ b/clang-tools-extra/clang-tidy/utils/TypeTraits.cpp @@ -24,11 +24,9 @@ static bool hasDeletedCopyConstructor(QualType Type) { auto *Record = Type->getAsCXXRecordDecl(); if (!Record || !Record->hasDefinition()) return false; - for (const auto *Constructor : Record->ctors()) { - if (Constructor->isCopyConstructor() && Constructor->isDeleted()) - return true; - } - return false; + return llvm::any_of(Record->ctors(), [](const auto *Constructor) { + return Constructor->isCopyConstructor() && 
Constructor->isDeleted(); + }); } std::optional isExpensiveToCopy(QualType Type, @@ -70,14 +68,10 @@ bool recordIsTriviallyDefaultConstructible(const RecordDecl &RecordDecl, return false; } // If all its direct bases are trivially constructible. - for (const CXXBaseSpecifier &Base : ClassDecl->bases()) { - if (!isTriviallyDefaultConstructible(Base.getType(), Context)) - return false; - if (Base.isVirtual()) - return false; - } - - return true; + return llvm::all_of(ClassDecl->bases(), [&](const CXXBaseSpecifier &Base) { + return isTriviallyDefaultConstructible(Base.getType(), Context) && + !Base.isVirtual(); + }); } // Based on QualType::isTrivial. From 73036cf9113b4748d4fbb28037e8714ff2486238 Mon Sep 17 00:00:00 2001 From: Baranov Victor Date: Wed, 3 Dec 2025 09:02:50 +0300 Subject: [PATCH 02/36] [clang-tidy][NFC] Fix miscellaneous clang-tidy warnings (#170424) --- .../clang-tidy/bugprone/ExceptionEscapeCheck.cpp | 3 ++- clang-tools-extra/clang-tidy/modernize/UseStdPrintCheck.h | 2 +- .../clang-tidy/readability/IdentifierNamingCheck.cpp | 4 ++-- 3 files changed, 5 insertions(+), 4 deletions(-) diff --git a/clang-tools-extra/clang-tidy/bugprone/ExceptionEscapeCheck.cpp b/clang-tools-extra/clang-tidy/bugprone/ExceptionEscapeCheck.cpp index b7de8395ffa05..1cfb1511fa94e 100644 --- a/clang-tools-extra/clang-tidy/bugprone/ExceptionEscapeCheck.cpp +++ b/clang-tools-extra/clang-tidy/bugprone/ExceptionEscapeCheck.cpp @@ -72,7 +72,8 @@ void ExceptionEscapeCheck::storeOptions(ClangTidyOptions::OptionMap &Opts) { void ExceptionEscapeCheck::registerMatchers(MatchFinder *Finder) { auto MatchIf = [](bool Enabled, const auto &Matcher) { - ast_matchers::internal::Matcher Nothing = unless(anything()); + const ast_matchers::internal::Matcher Nothing = + unless(anything()); return Enabled ? 
Matcher : Nothing; }; Finder->addMatcher( diff --git a/clang-tools-extra/clang-tidy/modernize/UseStdPrintCheck.h b/clang-tools-extra/clang-tidy/modernize/UseStdPrintCheck.h index 18cff9aa962b5..7d771c4446cd3 100644 --- a/clang-tools-extra/clang-tidy/modernize/UseStdPrintCheck.h +++ b/clang-tools-extra/clang-tidy/modernize/UseStdPrintCheck.h @@ -36,7 +36,7 @@ class UseStdPrintCheck : public ClangTidyCheck { } private: - Preprocessor *PP; + Preprocessor *PP = nullptr; bool StrictMode; std::vector PrintfLikeFunctions; std::vector FprintfLikeFunctions; diff --git a/clang-tools-extra/clang-tidy/readability/IdentifierNamingCheck.cpp b/clang-tools-extra/clang-tidy/readability/IdentifierNamingCheck.cpp index d1583a62a8e5e..79f8437057b23 100644 --- a/clang-tools-extra/clang-tidy/readability/IdentifierNamingCheck.cpp +++ b/clang-tools-extra/clang-tidy/readability/IdentifierNamingCheck.cpp @@ -318,8 +318,8 @@ std::string IdentifierNamingCheck::HungarianNotation::getDeclTypeName( if (!EOL) EOL = Begin + strlen(Begin); - const char *PosList[] = {strchr(Begin, '='), strchr(Begin, ';'), - strchr(Begin, ','), strchr(Begin, ')'), EOL}; + const char *const PosList[] = {strchr(Begin, '='), strchr(Begin, ';'), + strchr(Begin, ','), strchr(Begin, ')'), EOL}; for (const auto &Pos : PosList) { if (Pos > Begin) EOL = std::min(EOL, Pos); From 689b3cc7c700b1687cf4aaaf4ef2c81a4e988917 Mon Sep 17 00:00:00 2001 From: Jinjie Huang Date: Wed, 3 Dec 2025 14:08:20 +0800 Subject: [PATCH 03/36] [clang] Support header shadowing diagnostics in Clang header search (#162491) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit When including a header file, multiple files with the same name may exist across different search paths, like:   |-- main.cpp   |-- **header.h**   |-- include   |  └── **header.h** The compiler usually picks the first match it finds (typically following MSVC rules for current/include-chain paths first, then regular -I paths), which may not be 
the user’s intended header. This silent behavior can lead to subtle runtime API mismatches or increase the cost of resolving errors such as “error: use of undeclared identifier”, especially in large projects. Therefore, this patch tries to provide a diagnostic message without changing the current header selection. It does this by performing an additional search for duplicate filenames across all search paths (both MSVC rules and standard paths). This informs the user about a potential "header shadowing" issue and clarifies which header path was actually used. Since header searching is much cheaper than file loading, the added overhead should be within an acceptable range -- assuming the diagnostic message is valuable. --- clang/docs/ReleaseNotes.rst | 3 + clang/include/clang/Basic/DiagnosticGroups.td | 2 + .../include/clang/Basic/DiagnosticLexKinds.td | 4 ++ clang/include/clang/Lex/HeaderSearch.h | 6 ++ clang/lib/Lex/HeaderSearch.cpp | 67 +++++++++++++++++++ clang/test/Preprocessor/header-shadowing.c | 57 ++++++++++++++++ 6 files changed, 139 insertions(+) create mode 100644 clang/test/Preprocessor/header-shadowing.c diff --git a/clang/docs/ReleaseNotes.rst b/clang/docs/ReleaseNotes.rst index 3526ffb40f350..aa172ce402f8f 100644 --- a/clang/docs/ReleaseNotes.rst +++ b/clang/docs/ReleaseNotes.rst @@ -449,6 +449,9 @@ Improvements to Clang's diagnostics comparison operators when mixed with bitwise operators in enum value initializers. This can be locally disabled by explicitly casting the initializer value. +- A new warning ``-Wshadow-header`` has been added to detect when a header file + is found in multiple search directories (excluding system paths). 
+ Improvements to Clang's time-trace ---------------------------------- diff --git a/clang/include/clang/Basic/DiagnosticGroups.td b/clang/include/clang/Basic/DiagnosticGroups.td index 063957e7b18ae..80fc12caa1d24 100644 --- a/clang/include/clang/Basic/DiagnosticGroups.td +++ b/clang/include/clang/Basic/DiagnosticGroups.td @@ -790,6 +790,8 @@ def ShadowFieldInConstructor : DiagGroup<"shadow-field-in-constructor", def ShadowIvar : DiagGroup<"shadow-ivar">; def ShadowUncapturedLocal : DiagGroup<"shadow-uncaptured-local">; +def ShadowHeader : DiagGroup<"shadow-header">; + // -Wshadow-all is a catch-all for all shadowing. -Wshadow is just the // shadowing that we think is unsafe. def Shadow : DiagGroup<"shadow", [ShadowFieldInConstructorModified, diff --git a/clang/include/clang/Basic/DiagnosticLexKinds.td b/clang/include/clang/Basic/DiagnosticLexKinds.td index 417187222e448..a72d3f37b1b72 100644 --- a/clang/include/clang/Basic/DiagnosticLexKinds.td +++ b/clang/include/clang/Basic/DiagnosticLexKinds.td @@ -959,6 +959,10 @@ def warn_quoted_include_in_framework_header : Warning< def warn_framework_include_private_from_public : Warning< "public framework header includes private framework header '%0'" >, InGroup; +def warn_header_shadowing : Warning< + "multiple candidates for header '%0' found; " + "directory '%1' chosen, ignoring others including '%2'">, + InGroup, DefaultIgnore; def warn_deprecated_module_dot_map : Warning< "'%0' as a module map name is deprecated, rename it to %select{module.modulemap|module.private.modulemap}1%select{| in the 'Modules' directory of the framework}2">, InGroup; diff --git a/clang/include/clang/Lex/HeaderSearch.h b/clang/include/clang/Lex/HeaderSearch.h index 850aea41c4c3b..5369c872ac1cd 100644 --- a/clang/include/clang/Lex/HeaderSearch.h +++ b/clang/include/clang/Lex/HeaderSearch.h @@ -465,6 +465,12 @@ class HeaderSearch { ExternalSource = ES; } + void diagnoseHeaderShadowing( + StringRef Filename, OptionalFileEntryRef FE, bool 
&DiagnosedShadowing, + SourceLocation IncludeLoc, ConstSearchDirIterator FromDir, + ArrayRef> Includers, + bool isAngled, int IncluderLoopIndex, ConstSearchDirIterator MainLoopIt); + /// Set the target information for the header search, if not /// already known. void setTarget(const TargetInfo &Target); diff --git a/clang/lib/Lex/HeaderSearch.cpp b/clang/lib/Lex/HeaderSearch.cpp index f05c28fd7a123..b2ed24f765dab 100644 --- a/clang/lib/Lex/HeaderSearch.cpp +++ b/clang/lib/Lex/HeaderSearch.cpp @@ -881,6 +881,66 @@ diagnoseFrameworkInclude(DiagnosticsEngine &Diags, SourceLocation IncludeLoc, << IncludeFilename; } +void HeaderSearch::diagnoseHeaderShadowing( + StringRef Filename, OptionalFileEntryRef FE, bool &DiagnosedShadowing, + SourceLocation IncludeLoc, ConstSearchDirIterator FromDir, + ArrayRef> Includers, + bool isAngled, int IncluderLoopIndex, ConstSearchDirIterator MainLoopIt) { + + if (Diags.isIgnored(diag::warn_header_shadowing, IncludeLoc) || + DiagnosedShadowing) + return; + // Ignore diagnostics from system headers. + if (MainLoopIt && MainLoopIt->isSystemHeaderDirectory()) + return; + + DiagnosedShadowing = true; + + // Indicates that file is first found in the includer's directory + if (!MainLoopIt) { + for (size_t i = IncluderLoopIndex + 1; i < Includers.size(); ++i) { + const auto &IncluderAndDir = Includers[i]; + SmallString<1024> TmpDir = IncluderAndDir.second.getName(); + llvm::sys::path::append(TmpDir, Filename); + if (auto File = getFileMgr().getFileRef(TmpDir, false, false)) { + if (&File->getFileEntry() == *FE) + continue; + Diags.Report(IncludeLoc, diag::warn_header_shadowing) + << Filename << (*FE).getDir().getName() + << IncluderAndDir.second.getName(); + return; + } else { + llvm::errorToErrorCode(File.takeError()); + } + } + } + + // Continue searching in the regular search paths + ConstSearchDirIterator It = + isAngled ? 
angled_dir_begin() : search_dir_begin(); + if (MainLoopIt) { + It = std::next(MainLoopIt); + } else if (FromDir) { + It = FromDir; + } + for (; It != search_dir_end(); ++It) { + // Suppress check for system headers, as duplicates are often intentional. + if (It->getDirCharacteristic() != SrcMgr::C_User) + continue; + SmallString<1024> TmpPath = It->getName(); + llvm::sys::path::append(TmpPath, Filename); + if (auto File = getFileMgr().getFileRef(TmpPath, false, false)) { + if (&File->getFileEntry() == *FE) + continue; + Diags.Report(IncludeLoc, diag::warn_header_shadowing) + << Filename << (*FE).getDir().getName() << It->getName(); + return; + } else { + llvm::errorToErrorCode(File.takeError()); + } + } +} + /// LookupFile - Given a "foo" or \ reference, look up the indicated file, /// return null on failure. isAngled indicates whether the file reference is /// for system \#include's or not (i.e. using <> instead of ""). Includers, if @@ -930,6 +990,7 @@ OptionalFileEntryRef HeaderSearch::LookupFile( // This is the header that MSVC's header search would have found. ModuleMap::KnownHeader MSSuggestedModule; OptionalFileEntryRef MSFE; + bool DiagnosedShadowing = false; // Check to see if the file is in the #includer's directory. 
This cannot be // based on CurDir, because each includer could be a #include of a @@ -963,6 +1024,9 @@ OptionalFileEntryRef HeaderSearch::LookupFile( if (OptionalFileEntryRef FE = getFileAndSuggestModule( TmpDir, IncludeLoc, IncluderAndDir.second, IncluderIsSystemHeader, RequestingModule, SuggestedModule)) { + diagnoseHeaderShadowing(Filename, FE, DiagnosedShadowing, IncludeLoc, + FromDir, Includers, isAngled, + &IncluderAndDir - Includers.begin(), nullptr); if (!Includer) { assert(First && "only first includer can have no file"); return FE; @@ -1097,6 +1161,9 @@ OptionalFileEntryRef HeaderSearch::LookupFile( if (!File) continue; + diagnoseHeaderShadowing(Filename, File, DiagnosedShadowing, IncludeLoc, + FromDir, Includers, isAngled, -1, It); + CurDir = It; IncludeNames[*File] = Filename; diff --git a/clang/test/Preprocessor/header-shadowing.c b/clang/test/Preprocessor/header-shadowing.c new file mode 100644 index 0000000000000..c6d90d6f7760e --- /dev/null +++ b/clang/test/Preprocessor/header-shadowing.c @@ -0,0 +1,57 @@ +// RUN: rm -rf %t +// RUN: split-file %s %t + +/// Check that: +/// - Quoted includes ("...") trigger the diagnostic. +/// - System headers are ignored. +/// - #include_next does not cause a duplicate warning. +// RUN: %clang_cc1 -Wshadow-header -Eonly %t/main.c -I %t/include1 -I %t/include2 \ +// RUN: -isystem %t/system1 -isystem %t/system2 2>&1 | FileCheck %s --check-prefix=SHADOWING + +// SHADOWING: {{.*}} warning: multiple candidates for header 'header.h' found; directory '{{.*}}include1' chosen, ignoring others including '{{.*}}include2' [-Wshadow-header] +// SHADOWING: warning: include1/header.h included! +// SHADOWING-NOT: {{.*}} warning: multiple candidates for header 'header.h' found; directory '{{.*}}include2' chosen, ignoring others including '{{.*}}include1' [-Wshadow-header] +// SHADOWING: warning: include2/header.h included! 
+// SHADOWING-NOT: {{.*}} warning: multiple candidates for header 'stdio.h' found; directory '{{.*}}system1' chosen, ignoring others including '{{.*}}system2' [-Wshadow-header] +// SHADOWING: warning: system1/stdio.h included! + +/// Check that the diagnostic is only performed once in MSVC compatibility mode. +// RUN: %clang_cc1 -fms-compatibility -Wshadow-header -Eonly %t/t.c 2>&1 | FileCheck %s --check-prefix=SHADOWING-MS + +// SHADOWING-MS: {{.*}} warning: multiple candidates for header 't3.h' found; directory '{{.*}}foo' chosen, ignoring others including '{{.*}}' [-Wshadow-header] +// SHADOWING-MS-NOT: {{.*}} warning: multiple candidates for header 't3.h' found; directory '{{.*}}' chosen, ignoring others including '{{.*}}foo' [-Wshadow-header] +// SHADOWING-MS: warning: Found foo/t3.h. + +//--- main.c +#include "header.h" +#include + +//--- include1/header.h +#warning include1/header.h included! +#include_next "header.h" + +//--- include2/header.h +#warning include2/header.h included! + +//--- system1/stdio.h +#warning system1/stdio.h included! + +//--- system2/stdio.h +#warning system2/stdio.h included! + + +/// Used to test when running in MSVC compatibility +//--- t.c +#include "foo/t1.h" + +//--- foo/t1.h +#include "bar/t2.h" + +//--- foo/bar/t2.h +#include "t3.h" + +//--- foo/t3.h +#warning Found foo/t3.h. + +//--- t3.h +#warning Found t3.h. From 9f634c6777701794a6ed5577857ffb8f202513b8 Mon Sep 17 00:00:00 2001 From: Jianjian Guan Date: Wed, 3 Dec 2025 14:19:20 +0800 Subject: [PATCH 04/36] [RISCV][GISel] Fix legalize G_EXTRACT_SUBVECTOR (#169877) Fix the wrong mask type that is used by G_VSLIDEDOWN_VL.
--- .../Target/RISCV/GISel/RISCVLegalizerInfo.cpp | 2 +- .../rvv/legalize-extract-subvector.mir | 56 +++++++++---------- 2 files changed, 29 insertions(+), 29 deletions(-) diff --git a/llvm/lib/Target/RISCV/GISel/RISCVLegalizerInfo.cpp b/llvm/lib/Target/RISCV/GISel/RISCVLegalizerInfo.cpp index 1fba16d3d51c2..2cc594a33eb0d 100644 --- a/llvm/lib/Target/RISCV/GISel/RISCVLegalizerInfo.cpp +++ b/llvm/lib/Target/RISCV/GISel/RISCVLegalizerInfo.cpp @@ -1211,7 +1211,7 @@ bool RISCVLegalizerInfo::legalizeExtractSubvector(MachineInstr &MI, // to place the desired subvector starting at element 0. const LLT XLenTy(STI.getXLenVT()); auto SlidedownAmt = MIB.buildVScale(XLenTy, RemIdx); - auto [Mask, VL] = buildDefaultVLOps(LitTy, MIB, MRI); + auto [Mask, VL] = buildDefaultVLOps(InterLitTy, MIB, MRI); uint64_t Policy = RISCVVType::TAIL_AGNOSTIC | RISCVVType::MASK_AGNOSTIC; auto Slidedown = MIB.buildInstr( RISCV::G_VSLIDEDOWN_VL, {InterLitTy}, diff --git a/llvm/test/CodeGen/RISCV/GlobalISel/legalizer/rvv/legalize-extract-subvector.mir b/llvm/test/CodeGen/RISCV/GlobalISel/legalizer/rvv/legalize-extract-subvector.mir index dcee71432f4c3..ab352553b4b55 100644 --- a/llvm/test/CodeGen/RISCV/GlobalISel/legalizer/rvv/legalize-extract-subvector.mir +++ b/llvm/test/CodeGen/RISCV/GlobalISel/legalizer/rvv/legalize-extract-subvector.mir @@ -20,9 +20,9 @@ body: | ; RV32-NEXT: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 2 ; RV32-NEXT: [[LSHR:%[0-9]+]]:_(s32) = G_LSHR [[READ_VLENB]], [[C2]](s32) ; RV32-NEXT: [[C3:%[0-9]+]]:_(s32) = G_CONSTANT i32 -1 - ; RV32-NEXT: [[VMSET_VL:%[0-9]+]]:_() = G_VMSET_VL [[C3]](s32) + ; RV32-NEXT: [[VMSET_VL:%[0-9]+]]:_() = G_VMSET_VL [[C3]](s32) ; RV32-NEXT: [[DEF1:%[0-9]+]]:_() = G_IMPLICIT_DEF - ; RV32-NEXT: [[VSLIDEDOWN_VL:%[0-9]+]]:_() = G_VSLIDEDOWN_VL [[DEF1]], [[SELECT]], [[LSHR]](s32), [[VMSET_VL]](), [[C3]], 3 + ; RV32-NEXT: [[VSLIDEDOWN_VL:%[0-9]+]]:_() = G_VSLIDEDOWN_VL [[DEF1]], [[SELECT]], [[LSHR]](s32), [[VMSET_VL]](), [[C3]], 3 ; RV32-NEXT: 
[[EXTRACT_SUBVECTOR:%[0-9]+]]:_() = G_EXTRACT_SUBVECTOR [[VSLIDEDOWN_VL]](), 0 ; RV32-NEXT: [[C4:%[0-9]+]]:_(s32) = G_CONSTANT i32 0 ; RV32-NEXT: [[SPLAT_VECTOR2:%[0-9]+]]:_() = G_SPLAT_VECTOR [[C4]](s32) @@ -41,9 +41,9 @@ body: | ; RV64-NEXT: [[C2:%[0-9]+]]:_(s64) = G_CONSTANT i64 2 ; RV64-NEXT: [[LSHR:%[0-9]+]]:_(s64) = G_LSHR [[READ_VLENB]], [[C2]](s64) ; RV64-NEXT: [[C3:%[0-9]+]]:_(s64) = G_CONSTANT i64 -1 - ; RV64-NEXT: [[VMSET_VL:%[0-9]+]]:_() = G_VMSET_VL [[C3]](s64) + ; RV64-NEXT: [[VMSET_VL:%[0-9]+]]:_() = G_VMSET_VL [[C3]](s64) ; RV64-NEXT: [[DEF1:%[0-9]+]]:_() = G_IMPLICIT_DEF - ; RV64-NEXT: [[VSLIDEDOWN_VL:%[0-9]+]]:_() = G_VSLIDEDOWN_VL [[DEF1]], [[SELECT]], [[LSHR]](s64), [[VMSET_VL]](), [[C3]], 3 + ; RV64-NEXT: [[VSLIDEDOWN_VL:%[0-9]+]]:_() = G_VSLIDEDOWN_VL [[DEF1]], [[SELECT]], [[LSHR]](s64), [[VMSET_VL]](), [[C3]], 3 ; RV64-NEXT: [[EXTRACT_SUBVECTOR:%[0-9]+]]:_() = G_EXTRACT_SUBVECTOR [[VSLIDEDOWN_VL]](), 0 ; RV64-NEXT: [[C4:%[0-9]+]]:_(s64) = G_CONSTANT i64 0 ; RV64-NEXT: [[SPLAT_VECTOR2:%[0-9]+]]:_() = G_SPLAT_VECTOR [[C4]](s64) @@ -72,9 +72,9 @@ body: | ; RV32-NEXT: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 2 ; RV32-NEXT: [[LSHR:%[0-9]+]]:_(s32) = G_LSHR [[READ_VLENB]], [[C2]](s32) ; RV32-NEXT: [[C3:%[0-9]+]]:_(s32) = G_CONSTANT i32 -1 - ; RV32-NEXT: [[VMSET_VL:%[0-9]+]]:_() = G_VMSET_VL [[C3]](s32) + ; RV32-NEXT: [[VMSET_VL:%[0-9]+]]:_() = G_VMSET_VL [[C3]](s32) ; RV32-NEXT: [[DEF1:%[0-9]+]]:_() = G_IMPLICIT_DEF - ; RV32-NEXT: [[VSLIDEDOWN_VL:%[0-9]+]]:_() = G_VSLIDEDOWN_VL [[DEF1]], [[SELECT]], [[LSHR]](s32), [[VMSET_VL]](), [[C3]], 3 + ; RV32-NEXT: [[VSLIDEDOWN_VL:%[0-9]+]]:_() = G_VSLIDEDOWN_VL [[DEF1]], [[SELECT]], [[LSHR]](s32), [[VMSET_VL]](), [[C3]], 3 ; RV32-NEXT: [[EXTRACT_SUBVECTOR:%[0-9]+]]:_() = G_EXTRACT_SUBVECTOR [[VSLIDEDOWN_VL]](), 0 ; RV32-NEXT: [[C4:%[0-9]+]]:_(s32) = G_CONSTANT i32 0 ; RV32-NEXT: [[SPLAT_VECTOR2:%[0-9]+]]:_() = G_SPLAT_VECTOR [[C4]](s32) @@ -93,9 +93,9 @@ body: | ; RV64-NEXT: [[C2:%[0-9]+]]:_(s64) = G_CONSTANT 
i64 2 ; RV64-NEXT: [[LSHR:%[0-9]+]]:_(s64) = G_LSHR [[READ_VLENB]], [[C2]](s64) ; RV64-NEXT: [[C3:%[0-9]+]]:_(s64) = G_CONSTANT i64 -1 - ; RV64-NEXT: [[VMSET_VL:%[0-9]+]]:_() = G_VMSET_VL [[C3]](s64) + ; RV64-NEXT: [[VMSET_VL:%[0-9]+]]:_() = G_VMSET_VL [[C3]](s64) ; RV64-NEXT: [[DEF1:%[0-9]+]]:_() = G_IMPLICIT_DEF - ; RV64-NEXT: [[VSLIDEDOWN_VL:%[0-9]+]]:_() = G_VSLIDEDOWN_VL [[DEF1]], [[SELECT]], [[LSHR]](s64), [[VMSET_VL]](), [[C3]], 3 + ; RV64-NEXT: [[VSLIDEDOWN_VL:%[0-9]+]]:_() = G_VSLIDEDOWN_VL [[DEF1]], [[SELECT]], [[LSHR]](s64), [[VMSET_VL]](), [[C3]], 3 ; RV64-NEXT: [[EXTRACT_SUBVECTOR:%[0-9]+]]:_() = G_EXTRACT_SUBVECTOR [[VSLIDEDOWN_VL]](), 0 ; RV64-NEXT: [[C4:%[0-9]+]]:_(s64) = G_CONSTANT i64 0 ; RV64-NEXT: [[SPLAT_VECTOR2:%[0-9]+]]:_() = G_SPLAT_VECTOR [[C4]](s64) @@ -158,9 +158,9 @@ body: | ; RV32-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 1 ; RV32-NEXT: [[LSHR:%[0-9]+]]:_(s32) = G_LSHR [[READ_VLENB]], [[C]](s32) ; RV32-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 -1 - ; RV32-NEXT: [[VMSET_VL:%[0-9]+]]:_() = G_VMSET_VL [[C1]](s32) + ; RV32-NEXT: [[VMSET_VL:%[0-9]+]]:_() = G_VMSET_VL [[C1]](s32) ; RV32-NEXT: [[DEF1:%[0-9]+]]:_() = G_IMPLICIT_DEF - ; RV32-NEXT: [[VSLIDEDOWN_VL:%[0-9]+]]:_() = G_VSLIDEDOWN_VL [[DEF1]], [[BITCAST]], [[LSHR]](s32), [[VMSET_VL]](), [[C1]], 3 + ; RV32-NEXT: [[VSLIDEDOWN_VL:%[0-9]+]]:_() = G_VSLIDEDOWN_VL [[DEF1]], [[BITCAST]], [[LSHR]](s32), [[VMSET_VL]](), [[C1]], 3 ; RV32-NEXT: [[EXTRACT_SUBVECTOR:%[0-9]+]]:_() = G_EXTRACT_SUBVECTOR [[VSLIDEDOWN_VL]](), 0 ; RV32-NEXT: [[BITCAST1:%[0-9]+]]:_() = G_BITCAST [[EXTRACT_SUBVECTOR]]() ; RV32-NEXT: $v8 = COPY [[BITCAST1]]() @@ -173,9 +173,9 @@ body: | ; RV64-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 1 ; RV64-NEXT: [[LSHR:%[0-9]+]]:_(s64) = G_LSHR [[READ_VLENB]], [[C]](s64) ; RV64-NEXT: [[C1:%[0-9]+]]:_(s64) = G_CONSTANT i64 -1 - ; RV64-NEXT: [[VMSET_VL:%[0-9]+]]:_() = G_VMSET_VL [[C1]](s64) + ; RV64-NEXT: [[VMSET_VL:%[0-9]+]]:_() = G_VMSET_VL [[C1]](s64) ; RV64-NEXT: 
[[DEF1:%[0-9]+]]:_() = G_IMPLICIT_DEF - ; RV64-NEXT: [[VSLIDEDOWN_VL:%[0-9]+]]:_() = G_VSLIDEDOWN_VL [[DEF1]], [[BITCAST]], [[LSHR]](s64), [[VMSET_VL]](), [[C1]], 3 + ; RV64-NEXT: [[VSLIDEDOWN_VL:%[0-9]+]]:_() = G_VSLIDEDOWN_VL [[DEF1]], [[BITCAST]], [[LSHR]](s64), [[VMSET_VL]](), [[C1]], 3 ; RV64-NEXT: [[EXTRACT_SUBVECTOR:%[0-9]+]]:_() = G_EXTRACT_SUBVECTOR [[VSLIDEDOWN_VL]](), 0 ; RV64-NEXT: [[BITCAST1:%[0-9]+]]:_() = G_BITCAST [[EXTRACT_SUBVECTOR]]() ; RV64-NEXT: $v8 = COPY [[BITCAST1]]() @@ -317,8 +317,8 @@ body: | ; RV32-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 3 ; RV32-NEXT: [[LSHR:%[0-9]+]]:_(s32) = G_LSHR [[READ_VLENB]], [[C]](s32) ; RV32-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 -1 - ; RV32-NEXT: [[VMSET_VL:%[0-9]+]]:_() = G_VMSET_VL [[C1]](s32) - ; RV32-NEXT: [[VSLIDEDOWN_VL:%[0-9]+]]:_() = G_VSLIDEDOWN_VL [[DEF]], [[DEF]], [[LSHR]](s32), [[VMSET_VL]](), [[C1]], 3 + ; RV32-NEXT: [[VMSET_VL:%[0-9]+]]:_() = G_VMSET_VL [[C1]](s32) + ; RV32-NEXT: [[VSLIDEDOWN_VL:%[0-9]+]]:_() = G_VSLIDEDOWN_VL [[DEF]], [[DEF]], [[LSHR]](s32), [[VMSET_VL]](), [[C1]], 3 ; RV32-NEXT: [[EXTRACT_SUBVECTOR:%[0-9]+]]:_() = G_EXTRACT_SUBVECTOR [[VSLIDEDOWN_VL]](), 0 ; RV32-NEXT: $v8 = COPY [[EXTRACT_SUBVECTOR]]() ; RV32-NEXT: PseudoRET implicit $v8 @@ -329,8 +329,8 @@ body: | ; RV64-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 3 ; RV64-NEXT: [[LSHR:%[0-9]+]]:_(s64) = G_LSHR [[READ_VLENB]], [[C]](s64) ; RV64-NEXT: [[C1:%[0-9]+]]:_(s64) = G_CONSTANT i64 -1 - ; RV64-NEXT: [[VMSET_VL:%[0-9]+]]:_() = G_VMSET_VL [[C1]](s64) - ; RV64-NEXT: [[VSLIDEDOWN_VL:%[0-9]+]]:_() = G_VSLIDEDOWN_VL [[DEF]], [[DEF]], [[LSHR]](s64), [[VMSET_VL]](), [[C1]], 3 + ; RV64-NEXT: [[VMSET_VL:%[0-9]+]]:_() = G_VMSET_VL [[C1]](s64) + ; RV64-NEXT: [[VSLIDEDOWN_VL:%[0-9]+]]:_() = G_VSLIDEDOWN_VL [[DEF]], [[DEF]], [[LSHR]](s64), [[VMSET_VL]](), [[C1]], 3 ; RV64-NEXT: [[EXTRACT_SUBVECTOR:%[0-9]+]]:_() = G_EXTRACT_SUBVECTOR [[VSLIDEDOWN_VL]](), 0 ; RV64-NEXT: $v8 = COPY [[EXTRACT_SUBVECTOR]]() ; RV64-NEXT: 
PseudoRET implicit $v8 @@ -351,8 +351,8 @@ body: | ; RV32-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 2 ; RV32-NEXT: [[LSHR:%[0-9]+]]:_(s32) = G_LSHR [[READ_VLENB]], [[C]](s32) ; RV32-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 -1 - ; RV32-NEXT: [[VMSET_VL:%[0-9]+]]:_() = G_VMSET_VL [[C1]](s32) - ; RV32-NEXT: [[VSLIDEDOWN_VL:%[0-9]+]]:_() = G_VSLIDEDOWN_VL [[DEF]], [[DEF]], [[LSHR]](s32), [[VMSET_VL]](), [[C1]], 3 + ; RV32-NEXT: [[VMSET_VL:%[0-9]+]]:_() = G_VMSET_VL [[C1]](s32) + ; RV32-NEXT: [[VSLIDEDOWN_VL:%[0-9]+]]:_() = G_VSLIDEDOWN_VL [[DEF]], [[DEF]], [[LSHR]](s32), [[VMSET_VL]](), [[C1]], 3 ; RV32-NEXT: [[EXTRACT_SUBVECTOR:%[0-9]+]]:_() = G_EXTRACT_SUBVECTOR [[VSLIDEDOWN_VL]](), 0 ; RV32-NEXT: $v8 = COPY [[EXTRACT_SUBVECTOR]]() ; RV32-NEXT: PseudoRET implicit $v8 @@ -363,8 +363,8 @@ body: | ; RV64-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 2 ; RV64-NEXT: [[LSHR:%[0-9]+]]:_(s64) = G_LSHR [[READ_VLENB]], [[C]](s64) ; RV64-NEXT: [[C1:%[0-9]+]]:_(s64) = G_CONSTANT i64 -1 - ; RV64-NEXT: [[VMSET_VL:%[0-9]+]]:_() = G_VMSET_VL [[C1]](s64) - ; RV64-NEXT: [[VSLIDEDOWN_VL:%[0-9]+]]:_() = G_VSLIDEDOWN_VL [[DEF]], [[DEF]], [[LSHR]](s64), [[VMSET_VL]](), [[C1]], 3 + ; RV64-NEXT: [[VMSET_VL:%[0-9]+]]:_() = G_VMSET_VL [[C1]](s64) + ; RV64-NEXT: [[VSLIDEDOWN_VL:%[0-9]+]]:_() = G_VSLIDEDOWN_VL [[DEF]], [[DEF]], [[LSHR]](s64), [[VMSET_VL]](), [[C1]], 3 ; RV64-NEXT: [[EXTRACT_SUBVECTOR:%[0-9]+]]:_() = G_EXTRACT_SUBVECTOR [[VSLIDEDOWN_VL]](), 0 ; RV64-NEXT: $v8 = COPY [[EXTRACT_SUBVECTOR]]() ; RV64-NEXT: PseudoRET implicit $v8 @@ -418,9 +418,9 @@ body: | ; RV32-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 3 ; RV32-NEXT: [[LSHR:%[0-9]+]]:_(s32) = G_LSHR [[READ_VLENB]], [[C]](s32) ; RV32-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 -1 - ; RV32-NEXT: [[VMSET_VL:%[0-9]+]]:_() = G_VMSET_VL [[C1]](s32) + ; RV32-NEXT: [[VMSET_VL:%[0-9]+]]:_() = G_VMSET_VL [[C1]](s32) ; RV32-NEXT: [[DEF1:%[0-9]+]]:_() = G_IMPLICIT_DEF - ; RV32-NEXT: [[VSLIDEDOWN_VL:%[0-9]+]]:_() = G_VSLIDEDOWN_VL 
[[DEF1]], [[EXTRACT_SUBVECTOR]], [[LSHR]](s32), [[VMSET_VL]](), [[C1]], 3 + ; RV32-NEXT: [[VSLIDEDOWN_VL:%[0-9]+]]:_() = G_VSLIDEDOWN_VL [[DEF1]], [[EXTRACT_SUBVECTOR]], [[LSHR]](s32), [[VMSET_VL]](), [[C1]], 3 ; RV32-NEXT: [[EXTRACT_SUBVECTOR1:%[0-9]+]]:_() = G_EXTRACT_SUBVECTOR [[VSLIDEDOWN_VL]](), 0 ; RV32-NEXT: $v8 = COPY [[EXTRACT_SUBVECTOR1]]() ; RV32-NEXT: PseudoRET implicit $v8 @@ -432,9 +432,9 @@ body: | ; RV64-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 3 ; RV64-NEXT: [[LSHR:%[0-9]+]]:_(s64) = G_LSHR [[READ_VLENB]], [[C]](s64) ; RV64-NEXT: [[C1:%[0-9]+]]:_(s64) = G_CONSTANT i64 -1 - ; RV64-NEXT: [[VMSET_VL:%[0-9]+]]:_() = G_VMSET_VL [[C1]](s64) + ; RV64-NEXT: [[VMSET_VL:%[0-9]+]]:_() = G_VMSET_VL [[C1]](s64) ; RV64-NEXT: [[DEF1:%[0-9]+]]:_() = G_IMPLICIT_DEF - ; RV64-NEXT: [[VSLIDEDOWN_VL:%[0-9]+]]:_() = G_VSLIDEDOWN_VL [[DEF1]], [[EXTRACT_SUBVECTOR]], [[LSHR]](s64), [[VMSET_VL]](), [[C1]], 3 + ; RV64-NEXT: [[VSLIDEDOWN_VL:%[0-9]+]]:_() = G_VSLIDEDOWN_VL [[DEF1]], [[EXTRACT_SUBVECTOR]], [[LSHR]](s64), [[VMSET_VL]](), [[C1]], 3 ; RV64-NEXT: [[EXTRACT_SUBVECTOR1:%[0-9]+]]:_() = G_EXTRACT_SUBVECTOR [[VSLIDEDOWN_VL]](), 0 ; RV64-NEXT: $v8 = COPY [[EXTRACT_SUBVECTOR1]]() ; RV64-NEXT: PseudoRET implicit $v8 @@ -456,9 +456,9 @@ body: | ; RV32-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 3 ; RV32-NEXT: [[LSHR:%[0-9]+]]:_(s32) = G_LSHR [[READ_VLENB]], [[C]](s32) ; RV32-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 -1 - ; RV32-NEXT: [[VMSET_VL:%[0-9]+]]:_() = G_VMSET_VL [[C1]](s32) + ; RV32-NEXT: [[VMSET_VL:%[0-9]+]]:_() = G_VMSET_VL [[C1]](s32) ; RV32-NEXT: [[DEF1:%[0-9]+]]:_() = G_IMPLICIT_DEF - ; RV32-NEXT: [[VSLIDEDOWN_VL:%[0-9]+]]:_() = G_VSLIDEDOWN_VL [[DEF1]], [[EXTRACT_SUBVECTOR]], [[LSHR]](s32), [[VMSET_VL]](), [[C1]], 3 + ; RV32-NEXT: [[VSLIDEDOWN_VL:%[0-9]+]]:_() = G_VSLIDEDOWN_VL [[DEF1]], [[EXTRACT_SUBVECTOR]], [[LSHR]](s32), [[VMSET_VL]](), [[C1]], 3 ; RV32-NEXT: [[EXTRACT_SUBVECTOR1:%[0-9]+]]:_() = G_EXTRACT_SUBVECTOR [[VSLIDEDOWN_VL]](), 0 ; 
RV32-NEXT: $v8 = COPY [[EXTRACT_SUBVECTOR1]]() ; RV32-NEXT: PseudoRET implicit $v8 @@ -470,9 +470,9 @@ body: | ; RV64-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 3 ; RV64-NEXT: [[LSHR:%[0-9]+]]:_(s64) = G_LSHR [[READ_VLENB]], [[C]](s64) ; RV64-NEXT: [[C1:%[0-9]+]]:_(s64) = G_CONSTANT i64 -1 - ; RV64-NEXT: [[VMSET_VL:%[0-9]+]]:_() = G_VMSET_VL [[C1]](s64) + ; RV64-NEXT: [[VMSET_VL:%[0-9]+]]:_() = G_VMSET_VL [[C1]](s64) ; RV64-NEXT: [[DEF1:%[0-9]+]]:_() = G_IMPLICIT_DEF - ; RV64-NEXT: [[VSLIDEDOWN_VL:%[0-9]+]]:_() = G_VSLIDEDOWN_VL [[DEF1]], [[EXTRACT_SUBVECTOR]], [[LSHR]](s64), [[VMSET_VL]](), [[C1]], 3 + ; RV64-NEXT: [[VSLIDEDOWN_VL:%[0-9]+]]:_() = G_VSLIDEDOWN_VL [[DEF1]], [[EXTRACT_SUBVECTOR]], [[LSHR]](s64), [[VMSET_VL]](), [[C1]], 3 ; RV64-NEXT: [[EXTRACT_SUBVECTOR1:%[0-9]+]]:_() = G_EXTRACT_SUBVECTOR [[VSLIDEDOWN_VL]](), 0 ; RV64-NEXT: $v8 = COPY [[EXTRACT_SUBVECTOR1]]() ; RV64-NEXT: PseudoRET implicit $v8 From 042a38f0bfe5c9f49df5d4cb5e23092e512c9fbe Mon Sep 17 00:00:00 2001 From: Nikita Popov Date: Wed, 3 Dec 2025 07:55:06 +0100 Subject: [PATCH 05/36] [Support] Optimize DebugCounter (#170305) Currently, DebugCounters work by creating a unique counter ID during registration, and then using that ID to look up the counter information in the global registry. However, this means that anything working with counters has to always go through the global instance. This includes the fast path that checks whether any counters are enabled. Instead, we can drop the counter IDs, and make the counter variables use CounterInfo themselves. We can then directly check whether the specific counter is active without going through the global registry. This is both faster for the fast-path where all counters are disabled, and also faster for the case where only one counter is active (as the fast-path can now still be used for all the disabled counters). 
After this change, disabled counters become essentially free at runtime, and we should be able to enable them in non-assert builds as well. --- llvm/include/llvm/Support/DebugCounter.h | 130 ++++++++---------- llvm/lib/Support/DebugCounter.cpp | 66 ++++----- .../Transforms/Vectorize/SLPVectorizer.cpp | 1 + 3 files changed, 90 insertions(+), 107 deletions(-) diff --git a/llvm/include/llvm/Support/DebugCounter.h b/llvm/include/llvm/Support/DebugCounter.h index 028653224e62b..7e222eea6de2f 100644 --- a/llvm/include/llvm/Support/DebugCounter.h +++ b/llvm/include/llvm/Support/DebugCounter.h @@ -44,8 +44,8 @@ #include "llvm/ADT/ArrayRef.h" #include "llvm/ADT/DenseMap.h" +#include "llvm/ADT/MapVector.h" #include "llvm/ADT/StringRef.h" -#include "llvm/ADT/UniqueVector.h" #include "llvm/Support/Compiler.h" #include "llvm/Support/Debug.h" #include @@ -63,6 +63,29 @@ class DebugCounter { bool contains(int64_t Idx) const { return Idx >= Begin && Idx <= End; } }; + /// Struct to store counter info. + class CounterInfo { + friend class DebugCounter; + + /// Whether counting should be enabled, either due to -debug-counter or + /// -print-debug-counter. + bool Active = false; + /// Whether chunks for the counter are set (differs from Active in that + /// -print-debug-counter uses Active=true, IsSet=false). + bool IsSet = false; + + int64_t Count = 0; + uint64_t CurrChunkIdx = 0; + StringRef Name; + StringRef Desc; + SmallVector Chunks; + + public: + CounterInfo(StringRef Name, StringRef Desc) : Name(Name), Desc(Desc) { + DebugCounter::registerCounter(this); + } + }; + LLVM_ABI static void printChunks(raw_ostream &OS, ArrayRef); /// Return true on parsing error and print the error message on the @@ -75,28 +98,31 @@ class DebugCounter { // Used by the command line option parser to push a new value it parsed. LLVM_ABI void push_back(const std::string &); - // Register a counter with the specified name. + // Register a counter with the specified counter information. 
// // FIXME: Currently, counter registration is required to happen before command // line option parsing. The main reason to register counters is to produce a // nice list of them on the command line, but i'm not sure this is worth it. - static unsigned registerCounter(StringRef Name, StringRef Desc) { - return instance().addCounter(std::string(Name), std::string(Desc)); + static void registerCounter(CounterInfo *Info) { + instance().addCounter(Info); } - LLVM_ABI static bool shouldExecuteImpl(unsigned CounterName); + LLVM_ABI static bool shouldExecuteImpl(CounterInfo &Counter); - inline static bool shouldExecute(unsigned CounterName) { - if (!isCountingEnabled()) + inline static bool shouldExecute(CounterInfo &Counter) { + // Compile to nothing when debugging is off +#ifdef NDEBUG + return true; +#else + if (!Counter.Active) return true; - return shouldExecuteImpl(CounterName); + return shouldExecuteImpl(Counter); +#endif } // Return true if a given counter had values set (either programatically or on // the command line). This will return true even if those values are // currently in a state where the counter will always execute. - static bool isCounterSet(unsigned ID) { - return instance().Counters[ID].IsSet; - } + static bool isCounterSet(CounterInfo &Info) { return Info.IsSet; } struct CounterState { int64_t Count; @@ -104,19 +130,14 @@ class DebugCounter { }; // Return the state of a counter. This only works for set counters. - static CounterState getCounterState(unsigned ID) { - auto &Us = instance(); - auto Result = Us.Counters.find(ID); - assert(Result != Us.Counters.end() && "Asking about a non-set counter"); - return {Result->second.Count, Result->second.CurrChunkIdx}; + static CounterState getCounterState(CounterInfo &Info) { + return {Info.Count, Info.CurrChunkIdx}; } // Set a registered counter to a given state. 
- static void setCounterState(unsigned ID, CounterState State) { - auto &Us = instance(); - auto &Counter = Us.Counters[ID]; - Counter.Count = State.Count; - Counter.CurrChunkIdx = State.ChunkIdx; + static void setCounterState(CounterInfo &Info, CounterState State) { + Info.Count = State.Count; + Info.CurrChunkIdx = State.ChunkIdx; } #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) @@ -126,66 +147,38 @@ class DebugCounter { LLVM_ABI void print(raw_ostream &OS) const; - // Get the counter ID for a given named counter, or return 0 if none is found. - unsigned getCounterId(const std::string &Name) const { - return RegisteredCounters.idFor(Name); + // Get the counter info for a given named counter, + // or return null if none is found. + CounterInfo *getCounterInfo(StringRef Name) const { + return Counters.lookup(Name); } // Return the number of registered counters. - unsigned int getNumCounters() const { return RegisteredCounters.size(); } + unsigned int getNumCounters() const { return Counters.size(); } - // Return the name and description of the counter with the given ID. - std::pair getCounterInfo(unsigned ID) const { - return {RegisteredCounters[ID], Counters.lookup(ID).Desc}; + // Return the name and description of the counter with the given info. + std::pair getCounterDesc(CounterInfo *Info) const { + return {Info->Name, Info->Desc}; } // Iterate through the registered counters - using CounterVector = UniqueVector; - CounterVector::const_iterator begin() const { - return RegisteredCounters.begin(); + MapVector::const_iterator begin() const { + return Counters.begin(); + } + MapVector::const_iterator end() const { + return Counters.end(); } - CounterVector::const_iterator end() const { return RegisteredCounters.end(); } - - // Force-enables counting all DebugCounters. 
- // - // Since DebugCounters are incompatible with threading (not only do they not - // make sense, but we'll also see data races), this should only be used in - // contexts where we're certain we won't spawn threads. - static void enableAllCounters() { instance().Enabled = true; } - static bool isCountingEnabled() { -// Compile to nothing when debugging is off -#ifdef NDEBUG - return false; -#else - return instance().Enabled; -#endif + void activateAllCounters() { + for (auto &[_, Counter] : Counters) + Counter->Active = true; } protected: - unsigned addCounter(const std::string &Name, const std::string &Desc) { - unsigned Result = RegisteredCounters.insert(Name); - auto &C = Counters[Result]; - C = {}; - C.Desc = Desc; - return Result; - } - // Struct to store counter info. - struct CounterInfo { - int64_t Count = 0; - uint64_t CurrChunkIdx = 0; - bool IsSet = false; - std::string Desc; - SmallVector Chunks; - }; + void addCounter(CounterInfo *Info) { Counters[Info->Name] = Info; } bool handleCounterIncrement(CounterInfo &Info); - DenseMap Counters; - CounterVector RegisteredCounters; - - // Whether we should do DebugCounting at all. DebugCounters aren't - // thread-safe, so this should always be false in multithreaded scenarios. 
- bool Enabled = false; + MapVector Counters; bool ShouldPrintCounter = false; @@ -195,8 +188,7 @@ class DebugCounter { }; #define DEBUG_COUNTER(VARNAME, COUNTERNAME, DESC) \ - static const unsigned VARNAME = \ - DebugCounter::registerCounter(COUNTERNAME, DESC) + static DebugCounter::CounterInfo VARNAME(COUNTERNAME, DESC) } // namespace llvm #endif diff --git a/llvm/lib/Support/DebugCounter.cpp b/llvm/lib/Support/DebugCounter.cpp index 7fb841780ae2f..f7ead467669cb 100644 --- a/llvm/lib/Support/DebugCounter.cpp +++ b/llvm/lib/Support/DebugCounter.cpp @@ -4,6 +4,7 @@ #include "llvm/Support/CommandLine.h" #include "llvm/Support/Format.h" +#include "llvm/Support/ManagedStatic.h" using namespace llvm; @@ -110,12 +111,11 @@ class DebugCounterList : public cl::list { // width, so we do the same. Option::printHelpStr(HelpStr, GlobalWidth, ArgStr.size() + 6); const auto &CounterInstance = DebugCounter::instance(); - for (const auto &Name : CounterInstance) { - const auto Info = - CounterInstance.getCounterInfo(CounterInstance.getCounterId(Name)); - size_t NumSpaces = GlobalWidth - Info.first.size() - 8; - outs() << " =" << Info.first; - outs().indent(NumSpaces) << " - " << Info.second << '\n'; + for (const auto &Entry : CounterInstance) { + const auto &[Name, Desc] = CounterInstance.getCounterDesc(Entry.second); + size_t NumSpaces = GlobalWidth - Name.size() - 8; + outs() << " =" << Name; + outs().indent(NumSpaces) << " - " << Desc << '\n'; } } }; @@ -138,7 +138,7 @@ struct DebugCounterOwner : DebugCounter { cl::desc("Print out debug counter info after all counters accumulated"), cl::callback([&](const bool &Value) { if (Value) - enableAllCounters(); + activateAllCounters(); })}; cl::opt PrintDebugCounterQueries{ "print-debug-counter-queries", @@ -171,12 +171,14 @@ struct DebugCounterOwner : DebugCounter { } // anonymous namespace +// Use ManagedStatic instead of function-local static variable to ensure +// the destructor (which accesses counters and streams) runs during 
+// llvm_shutdown() rather than at some unspecified point. +static ManagedStatic Owner; + void llvm::initDebugCounterOptions() { (void)DebugCounter::instance(); } -DebugCounter &DebugCounter::instance() { - static DebugCounterOwner O; - return O; -} +DebugCounter &DebugCounter::instance() { return *Owner; } // This is called by the command line parser when it sees a value for the // debug-counter option defined above. @@ -202,32 +204,26 @@ void DebugCounter::push_back(const std::string &Val) { return; } - unsigned CounterID = getCounterId(std::string(CounterName)); - if (!CounterID) { + CounterInfo *Counter = getCounterInfo(CounterName); + if (!Counter) { errs() << "DebugCounter Error: " << CounterName << " is not a registered counter\n"; return; } - enableAllCounters(); - CounterInfo &Counter = Counters[CounterID]; - Counter.IsSet = true; - Counter.Chunks = std::move(Chunks); + Counter->Active = Counter->IsSet = true; + Counter->Chunks = std::move(Chunks); } void DebugCounter::print(raw_ostream &OS) const { - SmallVector CounterNames(RegisteredCounters.begin(), - RegisteredCounters.end()); + SmallVector CounterNames(Counters.keys()); sort(CounterNames); - auto &Us = instance(); OS << "Counters and values:\n"; - for (auto &CounterName : CounterNames) { - unsigned CounterID = getCounterId(std::string(CounterName)); - const CounterInfo &C = Us.Counters[CounterID]; - OS << left_justify(RegisteredCounters[CounterID], 32) << ": {" << C.Count - << ","; - printChunks(OS, C.Chunks); + for (StringRef CounterName : CounterNames) { + const CounterInfo *C = getCounterInfo(CounterName); + OS << left_justify(C->Name, 32) << ": {" << C->Count << ","; + printChunks(OS, C->Chunks); OS << "}\n"; } } @@ -257,20 +253,14 @@ bool DebugCounter::handleCounterIncrement(CounterInfo &Info) { return Res; } -bool DebugCounter::shouldExecuteImpl(unsigned CounterName) { +bool DebugCounter::shouldExecuteImpl(CounterInfo &Counter) { auto &Us = instance(); - auto Result = 
Us.Counters.find(CounterName); - if (Result != Us.Counters.end()) { - auto &CounterInfo = Result->second; - bool Res = Us.handleCounterIncrement(CounterInfo); - if (Us.ShouldPrintCounterQueries && CounterInfo.IsSet) { - dbgs() << "DebugCounter " << Us.RegisteredCounters[CounterName] << "=" - << (CounterInfo.Count - 1) << (Res ? " execute" : " skip") << "\n"; - } - return Res; + bool Res = Us.handleCounterIncrement(Counter); + if (Us.ShouldPrintCounterQueries && Counter.IsSet) { + dbgs() << "DebugCounter " << Counter.Name << "=" << (Counter.Count - 1) + << (Res ? " execute" : " skip") << "\n"; } - // Didn't find the counter, should we warn? - return true; + return Res; } #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) diff --git a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp index 0eb8ad8d3c93d..5ca89766dc837 100644 --- a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp +++ b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp @@ -95,6 +95,7 @@ #include #include #include +#include #include #include #include From 30f479fa2b08d6e480939a57384996f7a276eb91 Mon Sep 17 00:00:00 2001 From: Henrich Lauko Date: Wed, 3 Dec 2025 08:20:50 +0100 Subject: [PATCH 06/36] [CIR] Use default attribute printer/parser (NFC) (#170366) --- .../include/clang/CIR/Dialect/IR/CIRDialect.td | 9 +-------- clang/lib/CIR/Dialect/IR/CIRAttrs.cpp | 18 ------------------ 2 files changed, 1 insertion(+), 26 deletions(-) diff --git a/clang/include/clang/CIR/Dialect/IR/CIRDialect.td b/clang/include/clang/CIR/Dialect/IR/CIRDialect.td index 34df9af7fc06d..7c38492544b39 100644 --- a/clang/include/clang/CIR/Dialect/IR/CIRDialect.td +++ b/clang/include/clang/CIR/Dialect/IR/CIRDialect.td @@ -24,8 +24,7 @@ def CIR_Dialect : Dialect { let cppNamespace = "::cir"; - let useDefaultAttributePrinterParser = 0; - let useDefaultTypePrinterParser = 0; + let useDefaultAttributePrinterParser = 1; // Enable constant materialization for the CIR dialect. 
This generates a // declaration for the cir::CIRDialect::materializeConstant function. This @@ -52,12 +51,6 @@ def CIR_Dialect : Dialect { mlir::Type parseType(mlir::DialectAsmParser &parser) const override; void printType(mlir::Type type, mlir::DialectAsmPrinter &printer) const override; - - mlir::Attribute parseAttribute(mlir::DialectAsmParser &parser, - mlir::Type type) const override; - - void printAttribute(mlir::Attribute attr, - mlir::DialectAsmPrinter &os) const override; }]; } diff --git a/clang/lib/CIR/Dialect/IR/CIRAttrs.cpp b/clang/lib/CIR/Dialect/IR/CIRAttrs.cpp index 64ac97025e7c7..ee296f171e0d9 100644 --- a/clang/lib/CIR/Dialect/IR/CIRAttrs.cpp +++ b/clang/lib/CIR/Dialect/IR/CIRAttrs.cpp @@ -68,24 +68,6 @@ using namespace cir; // General CIR parsing / printing //===----------------------------------------------------------------------===// -Attribute CIRDialect::parseAttribute(DialectAsmParser &parser, - Type type) const { - llvm::SMLoc typeLoc = parser.getCurrentLocation(); - llvm::StringRef mnemonic; - Attribute genAttr; - OptionalParseResult parseResult = - generatedAttributeParser(parser, &mnemonic, type, genAttr); - if (parseResult.has_value()) - return genAttr; - parser.emitError(typeLoc, "unknown attribute in CIR dialect"); - return Attribute(); -} - -void CIRDialect::printAttribute(Attribute attr, DialectAsmPrinter &os) const { - if (failed(generatedAttributePrinter(attr, os))) - llvm_unreachable("unexpected CIR type kind"); -} - static void printRecordMembers(mlir::AsmPrinter &printer, mlir::ArrayAttr members) { printer << '{'; From e6110cb3395b22a941cba4726c9e36308e5b5613 Mon Sep 17 00:00:00 2001 From: Matthias Springer Date: Wed, 3 Dec 2025 08:35:05 +0100 Subject: [PATCH 07/36] [mlir][Transforms] Fix crash in `-remove-dead-values` on private functions (#169269) This commit fixes two crashes in the `-remove-dead-values` pass related to private functions. 
Private functions are considered entirely "dead" by the liveness analysis, which drives the `-remove-dead-values` pass. The `-remove-dead-values` pass removes dead block arguments from private functions. Private functions are entirely dead, so all of their block arguments are removed. However, the pass did not correctly update all users of these dropped block arguments. 1. A side-effecting operation must be removed if one of its operands is dead. Otherwise, the operation would end up with a NULL operand. Note: The liveness analysis would not have marked an SSA value as "dead" if it had a reachable side-effecting user. (Therefore, it is safe to erase such side-effecting operations.) 2. A branch operation must be removed if one of its non-forwarded operands is dead. (E.g., the condition value of a `cf.cond_br`.) Whenever a terminator is removed, a `ub.unreachable` operation is inserted. This fixes #158760. --- mlir/include/mlir/Transforms/Passes.td | 1 + mlir/lib/Transforms/CMakeLists.txt | 1 + mlir/lib/Transforms/RemoveDeadValues.cpp | 47 +++++++++++++++++++- mlir/test/Transforms/remove-dead-values.mlir | 27 +++++++++++ 4 files changed, 74 insertions(+), 2 deletions(-) diff --git a/mlir/include/mlir/Transforms/Passes.td b/mlir/include/mlir/Transforms/Passes.td index 28b4a01cf0ecd..55addfdb693e4 100644 --- a/mlir/include/mlir/Transforms/Passes.td +++ b/mlir/include/mlir/Transforms/Passes.td @@ -248,6 +248,7 @@ def RemoveDeadValues : Pass<"remove-dead-values"> { ``` }]; let constructor = "mlir::createRemoveDeadValuesPass()"; + let dependentDialects = ["ub::UBDialect"]; } def PrintIRPass : Pass<"print-ir"> { diff --git a/mlir/lib/Transforms/CMakeLists.txt b/mlir/lib/Transforms/CMakeLists.txt index 54b67f5c7a91e..06161293e907f 100644 --- a/mlir/lib/Transforms/CMakeLists.txt +++ b/mlir/lib/Transforms/CMakeLists.txt @@ -39,4 +39,5 @@ add_mlir_library(MLIRTransforms MLIRSideEffectInterfaces MLIRSupport MLIRTransformUtils + MLIRUBDialect ) diff --git 
a/mlir/lib/Transforms/RemoveDeadValues.cpp b/mlir/lib/Transforms/RemoveDeadValues.cpp index 989c614ef6617..e9ced064c9884 100644 --- a/mlir/lib/Transforms/RemoveDeadValues.cpp +++ b/mlir/lib/Transforms/RemoveDeadValues.cpp @@ -33,6 +33,7 @@ #include "mlir/Analysis/DataFlow/DeadCodeAnalysis.h" #include "mlir/Analysis/DataFlow/LivenessAnalysis.h" +#include "mlir/Dialect/UB/IR/UBOps.h" #include "mlir/IR/Builders.h" #include "mlir/IR/BuiltinAttributes.h" #include "mlir/IR/Dialect.h" @@ -260,6 +261,22 @@ static SmallVector operandsToOpOperands(OperandRange operands) { static void processSimpleOp(Operation *op, RunLivenessAnalysis &la, DenseSet &nonLiveSet, RDVFinalCleanupList &cl) { + // Operations that have dead operands can be erased regardless of their + // side effects. The liveness analysis would not have marked an SSA value as + // "dead" if it had a side-effecting user that is reachable. + bool hasDeadOperand = + markLives(op->getOperands(), nonLiveSet, la).flip().any(); + if (hasDeadOperand) { + LDBG() << "Simple op has dead operands, so the op must be dead: " + << OpWithFlags(op, OpPrintingFlags().skipRegions()); + assert(!hasLive(op->getResults(), nonLiveSet, la) && + "expected the op to have no live results"); + cl.operations.push_back(op); + collectNonLiveValues(nonLiveSet, op->getResults(), + BitVector(op->getNumResults(), true)); + return; + } + if (!isMemoryEffectFree(op) || hasLive(op->getResults(), nonLiveSet, la)) { LDBG() << "Simple op is not memory effect free or has live results, " "preserving it: " @@ -361,6 +378,8 @@ static void processFuncOp(FunctionOpInterface funcOp, Operation *module, // block other than the entry block, because every block has a terminator. 
for (Block &block : funcOp.getBlocks()) { Operation *returnOp = block.getTerminator(); + if (!returnOp->hasTrait()) + continue; if (returnOp && returnOp->getNumOperands() == numReturns) cl.operands.push_back({returnOp, nonLiveRets}); } @@ -700,7 +719,11 @@ static void processRegionBranchOp(RegionBranchOpInterface regionBranchOp, } /// Steps to process a `BranchOpInterface` operation: -/// Iterate through each successor block of `branchOp`. +/// +/// When a non-forwarded operand is dead (e.g., the condition value of a +/// conditional branch op), the entire operation is dead. +/// +/// Otherwise, iterate through each successor block of `branchOp`. /// (1) For each successor block, gather all operands from all successors. /// (2) Fetch their associated liveness analysis data and collect for future /// removal. @@ -711,7 +734,22 @@ static void processBranchOp(BranchOpInterface branchOp, RunLivenessAnalysis &la, DenseSet &nonLiveSet, RDVFinalCleanupList &cl) { LDBG() << "Processing branch op: " << *branchOp; + + // Check for dead non-forwarded operands. + BitVector deadNonForwardedOperands = + markLives(branchOp->getOperands(), nonLiveSet, la).flip(); unsigned numSuccessors = branchOp->getNumSuccessors(); + for (unsigned succIdx = 0; succIdx < numSuccessors; ++succIdx) { + SuccessorOperands successorOperands = + branchOp.getSuccessorOperands(succIdx); + // Remove all non-forwarded operands from the bit vector. + for (OpOperand &opOperand : successorOperands.getMutableForwardedOperands()) + deadNonForwardedOperands[opOperand.getOperandNumber()] = false; + } + if (deadNonForwardedOperands.any()) { + cl.operations.push_back(branchOp.getOperation()); + return; + } for (unsigned succIdx = 0; succIdx < numSuccessors; ++succIdx) { Block *successorBlock = branchOp->getSuccessor(succIdx); @@ -786,9 +824,14 @@ static void cleanUpDeadVals(RDVFinalCleanupList &list) { // 3. 
Operations LDBG() << "Cleaning up " << list.operations.size() << " operations"; - for (auto &op : list.operations) { + for (Operation *op : list.operations) { LDBG() << "Erasing operation: " << OpWithFlags(op, OpPrintingFlags().skipRegions()); + if (op->hasTrait()) { + // When erasing a terminator, insert an unreachable op in its place. + OpBuilder b(op); + ub::UnreachableOp::create(b, op->getLoc()); + } op->dropAllUses(); op->erase(); } diff --git a/mlir/test/Transforms/remove-dead-values.mlir b/mlir/test/Transforms/remove-dead-values.mlir index 4bae85dcf4f7d..71306676d48e9 100644 --- a/mlir/test/Transforms/remove-dead-values.mlir +++ b/mlir/test/Transforms/remove-dead-values.mlir @@ -118,6 +118,17 @@ func.func @main(%arg0 : i32) { // ----- +// CHECK-LABEL: func.func private @clean_func_op_remove_side_effecting_op() { +// CHECK-NEXT: return +// CHECK-NEXT: } +func.func private @clean_func_op_remove_side_effecting_op(%arg0: i32) -> (i32) { + // vector.print has a side effect but the op is dead. + vector.print %arg0 : i32 + return %arg0 : i32 +} + +// ----- + // %arg0 is not live because it is never used. %arg1 is not live because its // user `arith.addi` doesn't have any uses and the value that it is forwarded to // (%non_live_0) also doesn't have any uses. 
@@ -687,3 +698,19 @@ func.func @op_block_have_dead_arg(%arg0: index, %arg1: index, %arg2: i1) { // CHECK-NEXT: return return } + +// ----- + +// CHECK-LABEL: func private @remove_dead_branch_op() +// CHECK-NEXT: ub.unreachable +// CHECK-NEXT: ^{{.*}}: +// CHECK-NEXT: return +// CHECK-NEXT: ^{{.*}}: +// CHECK-NEXT: return +func.func private @remove_dead_branch_op(%c: i1, %arg0: i64, %arg1: i64) -> (i64) { + cf.cond_br %c, ^bb1, ^bb2 +^bb1: + return %arg0 : i64 +^bb2: + return %arg1 : i64 +} From 98182f4d209ded292cb6030f45bcae132096acae Mon Sep 17 00:00:00 2001 From: Sven van Haastregt Date: Wed, 3 Dec 2025 08:58:31 +0100 Subject: [PATCH 08/36] Move CodeGenFunction::EmitScalarOrConstFoldImmArg; NFC (#170286) This function is called from various .cpp files under `TargetBuiltins/`, and was moved unintentionally into `AMDGPU.cpp` in PR #132252. Move it to a common place. --- clang/lib/CodeGen/CodeGenFunction.cpp | 17 +++++++++++++++++ clang/lib/CodeGen/TargetBuiltins/AMDGPU.cpp | 17 ----------------- 2 files changed, 17 insertions(+), 17 deletions(-) diff --git a/clang/lib/CodeGen/CodeGenFunction.cpp b/clang/lib/CodeGen/CodeGenFunction.cpp index 64e594d09067b..ac25bd95f0463 100644 --- a/clang/lib/CodeGen/CodeGenFunction.cpp +++ b/clang/lib/CodeGen/CodeGenFunction.cpp @@ -2158,6 +2158,23 @@ void CodeGenFunction::EmitBranchOnBoolExpr( } } +llvm::Value *CodeGenFunction::EmitScalarOrConstFoldImmArg(unsigned ICEArguments, + unsigned Idx, + const CallExpr *E) { + llvm::Value *Arg = nullptr; + if ((ICEArguments & (1 << Idx)) == 0) { + Arg = EmitScalarExpr(E->getArg(Idx)); + } else { + // If this is required to be a constant, constant fold it so that we + // know that the generated intrinsic gets a ConstantInt. 
+ std::optional Result = + E->getArg(Idx)->getIntegerConstantExpr(getContext()); + assert(Result && "Expected argument to be a constant"); + Arg = llvm::ConstantInt::get(getLLVMContext(), *Result); + } + return Arg; +} + /// ErrorUnsupported - Print out an error that codegen doesn't support the /// specified stmt yet. void CodeGenFunction::ErrorUnsupported(const Stmt *S, const char *Type) { diff --git a/clang/lib/CodeGen/TargetBuiltins/AMDGPU.cpp b/clang/lib/CodeGen/TargetBuiltins/AMDGPU.cpp index 81b3fe9e79483..eabdc370da6b4 100644 --- a/clang/lib/CodeGen/TargetBuiltins/AMDGPU.cpp +++ b/clang/lib/CodeGen/TargetBuiltins/AMDGPU.cpp @@ -343,23 +343,6 @@ void CodeGenFunction::ProcessOrderScopeAMDGCN(Value *Order, Value *Scope, SSID = getLLVMContext().getOrInsertSyncScopeID(SSN); } -llvm::Value *CodeGenFunction::EmitScalarOrConstFoldImmArg(unsigned ICEArguments, - unsigned Idx, - const CallExpr *E) { - llvm::Value *Arg = nullptr; - if ((ICEArguments & (1 << Idx)) == 0) { - Arg = EmitScalarExpr(E->getArg(Idx)); - } else { - // If this is required to be a constant, constant fold it so that we - // know that the generated intrinsic gets a ConstantInt. - std::optional Result = - E->getArg(Idx)->getIntegerConstantExpr(getContext()); - assert(Result && "Expected argument to be a constant"); - Arg = llvm::ConstantInt::get(getLLVMContext(), *Result); - } - return Arg; -} - void CodeGenFunction::AddAMDGPUFenceAddressSpaceMMRA(llvm::Instruction *Inst, const CallExpr *E) { constexpr const char *Tag = "amdgpu-synchronize-as"; From 5ee6cff90ba5d8e08066eeeef0c27aa0b6f24d2c Mon Sep 17 00:00:00 2001 From: Jonas Hahnfeld Date: Wed, 3 Dec 2025 09:10:56 +0100 Subject: [PATCH 09/36] [clang] Propagate definition data to all redecls (#170090) Fix the propagation added in commit 0d490ae55f to include all redecls, not only previous ones. This fixes another instance of the assertion "Cannot get layout of forward declarations" in getASTRecordLayout(). 
Kudos to Alexander Kornienko for providing an initial version of the reproducer that I further simplified. Fixes #170084 --- clang/lib/Serialization/ASTReaderDecl.cpp | 4 +- clang/test/Modules/GH170084.cpp | 75 +++++++++++++++++++++++ 2 files changed, 77 insertions(+), 2 deletions(-) create mode 100644 clang/test/Modules/GH170084.cpp diff --git a/clang/lib/Serialization/ASTReaderDecl.cpp b/clang/lib/Serialization/ASTReaderDecl.cpp index 5456e73956659..882d54f31280a 100644 --- a/clang/lib/Serialization/ASTReaderDecl.cpp +++ b/clang/lib/Serialization/ASTReaderDecl.cpp @@ -2107,8 +2107,8 @@ void ASTDeclMerger::MergeDefinitionData( auto *Def = DD.Definition; DD = std::move(MergeDD); DD.Definition = Def; - while ((Def = Def->getPreviousDecl())) - cast(Def)->DefinitionData = ⅅ + for (auto *D : Def->redecls()) + cast(D)->DefinitionData = ⅅ return; } diff --git a/clang/test/Modules/GH170084.cpp b/clang/test/Modules/GH170084.cpp new file mode 100644 index 0000000000000..950499467a6bb --- /dev/null +++ b/clang/test/Modules/GH170084.cpp @@ -0,0 +1,75 @@ +// RUN: rm -rf %t +// RUN: mkdir -p %t +// RUN: split-file %s %t +// RUN: cd %t + +// RUN: %clang_cc1 -fmodule-name=stl -fno-cxx-modules -emit-module -fmodules -xc++ stl.cppmap -o stl.pcm +// RUN: %clang_cc1 -fmodule-name=d -fno-cxx-modules -emit-module -fmodules -fmodule-file=stl.pcm -xc++ d.cppmap -o d.pcm +// RUN: %clang_cc1 -fmodule-name=b -fno-cxx-modules -emit-module -fmodules -fmodule-file=stl.pcm -xc++ b.cppmap -o b.pcm +// RUN: %clang_cc1 -fmodule-name=a -fno-cxx-modules -emit-module -fmodules -fmodule-file=stl.pcm -fmodule-file=d.pcm -fmodule-file=b.pcm -xc++ a.cppmap -o a.pcm +// RUN: %clang_cc1 -fno-cxx-modules -fmodules -fmodule-file=a.pcm -emit-llvm -o /dev/null main.cpp + +//--- a.cppmap +module "a" { +header "a.h" +} + +//--- a.h +#include "b.h" +namespace { +void a(absl::set c) { + absl::set b; + c.end(); + c.contains(); +} +} // namespace + +//--- b.cppmap +module "b" { +header "b.h" +} + +//--- b.h 
+#include "c.h" +void b() { absl::set x; } + +//--- c.h +#include "stl.h" +namespace absl { +template +class set { + public: + struct iterator { + void u() const; + }; + iterator end() const { return {}; } + void contains() const { end().u(); } + pair e(); +}; +} // namespace absl + +//--- d.cppmap +module "d" { +header "d.h" +} + +//--- d.h +#include "c.h" +void d() { absl::set x; } + +//--- stl.cppmap +module "stl" { +header "stl.h" +} + +//--- stl.h +#ifndef _STL_H_ +#define _STL_H_ +template +struct pair; +#endif + +//--- main.cpp +// expected-no-diagnostics +#include "c.h" +void f(absl::set o) { o.contains(); } From befa4e85e4fab6a109203903a2fbeb979164cd2e Mon Sep 17 00:00:00 2001 From: Stanislav Mekhanoshin Date: Wed, 3 Dec 2025 00:14:32 -0800 Subject: [PATCH 10/36] [AMDGPU] Avoid undefs in hazard-gfx1250-flat-scr-hi.mir. NFC (#170396) --- .../AMDGPU/hazard-gfx1250-flat-scr-hi.mir | 89 ++++++++++++------- 1 file changed, 57 insertions(+), 32 deletions(-) diff --git a/llvm/test/CodeGen/AMDGPU/hazard-gfx1250-flat-scr-hi.mir b/llvm/test/CodeGen/AMDGPU/hazard-gfx1250-flat-scr-hi.mir index e3b28c5518695..e98c08248af75 100644 --- a/llvm/test/CodeGen/AMDGPU/hazard-gfx1250-flat-scr-hi.mir +++ b/llvm/test/CodeGen/AMDGPU/hazard-gfx1250-flat-scr-hi.mir @@ -8,10 +8,12 @@ body: | bb.0: ; GCN-LABEL: name: s_ashr_i64 - ; GCN: [[COPY:%[0-9]+]]:sreg_32 = COPY $src_flat_scratch_base_hi - ; GCN-NEXT: [[S_ASHR_I64_:%[0-9]+]]:sreg_64 = S_ASHR_I64 undef %2:sreg_64, [[COPY]], implicit-def $scc + ; GCN: [[DEF:%[0-9]+]]:sreg_64 = IMPLICIT_DEF + ; GCN-NEXT: [[COPY:%[0-9]+]]:sreg_32 = COPY $src_flat_scratch_base_hi + ; GCN-NEXT: [[S_ASHR_I64_:%[0-9]+]]:sreg_64 = S_ASHR_I64 [[DEF]], [[COPY]], implicit-def $scc + %1:sreg_64 = IMPLICIT_DEF %0:sreg_32 = COPY $src_flat_scratch_base_hi - %2:sreg_64 = S_ASHR_I64 undef %1:sreg_64, %0, implicit-def $scc + %2:sreg_64 = S_ASHR_I64 %1:sreg_64, %0, implicit-def $scc ... 
--- @@ -21,10 +23,12 @@ body: | bb.0: ; GCN-LABEL: name: s_lshl_b64 - ; GCN: [[COPY:%[0-9]+]]:sreg_32 = COPY $src_flat_scratch_base_hi - ; GCN-NEXT: [[S_LSHL_B64_:%[0-9]+]]:sreg_64 = S_LSHL_B64 undef %2:sreg_64, [[COPY]], implicit-def $scc + ; GCN: [[DEF:%[0-9]+]]:sreg_64 = IMPLICIT_DEF + ; GCN-NEXT: [[COPY:%[0-9]+]]:sreg_32 = COPY $src_flat_scratch_base_hi + ; GCN-NEXT: [[S_LSHL_B64_:%[0-9]+]]:sreg_64 = S_LSHL_B64 [[DEF]], [[COPY]], implicit-def $scc + %1:sreg_64 = IMPLICIT_DEF %0:sreg_32 = COPY $src_flat_scratch_base_hi - %2:sreg_64 = S_LSHL_B64 undef %1:sreg_64, %0, implicit-def $scc + %2:sreg_64 = S_LSHL_B64 %1:sreg_64, %0, implicit-def $scc ... --- @@ -34,10 +38,12 @@ body: | bb.0: ; GCN-LABEL: name: s_lshr_b64 - ; GCN: [[COPY:%[0-9]+]]:sreg_32 = COPY $src_flat_scratch_base_hi - ; GCN-NEXT: [[S_LSHR_B64_:%[0-9]+]]:sreg_64 = S_LSHR_B64 undef %2:sreg_64, [[COPY]], implicit-def $scc + ; GCN: [[DEF:%[0-9]+]]:sreg_64 = IMPLICIT_DEF + ; GCN-NEXT: [[COPY:%[0-9]+]]:sreg_32 = COPY $src_flat_scratch_base_hi + ; GCN-NEXT: [[S_LSHR_B64_:%[0-9]+]]:sreg_64 = S_LSHR_B64 [[DEF]], [[COPY]], implicit-def $scc + %1:sreg_64 = IMPLICIT_DEF %0:sreg_32 = COPY $src_flat_scratch_base_hi - %2:sreg_64 = S_LSHR_B64 undef %1:sreg_64, %0, implicit-def $scc + %2:sreg_64 = S_LSHR_B64 %1:sreg_64, %0, implicit-def $scc ... --- @@ -47,10 +53,12 @@ body: | bb.0: ; GCN-LABEL: name: s_bfe_i64 - ; GCN: [[COPY:%[0-9]+]]:sreg_32 = COPY $src_flat_scratch_base_hi - ; GCN-NEXT: [[S_BFE_I64_:%[0-9]+]]:sreg_64 = S_BFE_I64 undef %2:sreg_64, [[COPY]], implicit-def $scc + ; GCN: [[DEF:%[0-9]+]]:sreg_64 = IMPLICIT_DEF + ; GCN-NEXT: [[COPY:%[0-9]+]]:sreg_32 = COPY $src_flat_scratch_base_hi + ; GCN-NEXT: [[S_BFE_I64_:%[0-9]+]]:sreg_64 = S_BFE_I64 [[DEF]], [[COPY]], implicit-def $scc + %1:sreg_64 = IMPLICIT_DEF %0:sreg_32 = COPY $src_flat_scratch_base_hi - %2:sreg_64 = S_BFE_I64 undef %1:sreg_64, %0, implicit-def $scc + %2:sreg_64 = S_BFE_I64 %1:sreg_64, %0, implicit-def $scc ... 
--- @@ -60,10 +68,12 @@ body: | bb.0: ; GCN-LABEL: name: s_bfe_u64 - ; GCN: [[COPY:%[0-9]+]]:sreg_32 = COPY $src_flat_scratch_base_hi - ; GCN-NEXT: [[S_BFE_U64_:%[0-9]+]]:sreg_64 = S_BFE_U64 undef %2:sreg_64, [[COPY]], implicit-def $scc + ; GCN: [[DEF:%[0-9]+]]:sreg_64 = IMPLICIT_DEF + ; GCN-NEXT: [[COPY:%[0-9]+]]:sreg_32 = COPY $src_flat_scratch_base_hi + ; GCN-NEXT: [[S_BFE_U64_:%[0-9]+]]:sreg_64 = S_BFE_U64 [[DEF]], [[COPY]], implicit-def $scc + %1:sreg_64 = IMPLICIT_DEF %0:sreg_32 = COPY $src_flat_scratch_base_hi - %2:sreg_64 = S_BFE_U64 undef %1:sreg_64, %0, implicit-def $scc + %2:sreg_64 = S_BFE_U64 %1:sreg_64, %0, implicit-def $scc ... --- @@ -86,10 +96,14 @@ body: | bb.0: ; GCN-LABEL: name: s_bitcmp0_b64 - ; GCN: [[COPY:%[0-9]+]]:sreg_32 = COPY $src_flat_scratch_base_hi - ; GCN-NEXT: S_BITCMP0_B64 undef %1:sreg_64, [[COPY]], implicit undef $scc, implicit-def $scc + ; GCN: [[DEF:%[0-9]+]]:sreg_64 = IMPLICIT_DEF + ; GCN-NEXT: $scc = IMPLICIT_DEF + ; GCN-NEXT: [[COPY:%[0-9]+]]:sreg_32 = COPY $src_flat_scratch_base_hi + ; GCN-NEXT: S_BITCMP0_B64 [[DEF]], [[COPY]], implicit $scc, implicit-def $scc + %1:sreg_64 = IMPLICIT_DEF + $scc = IMPLICIT_DEF %0:sreg_32 = COPY $src_flat_scratch_base_hi - S_BITCMP0_B64 undef %1:sreg_64, %0, implicit undef $scc, implicit-def $scc + S_BITCMP0_B64 %1:sreg_64, %0, implicit $scc, implicit-def $scc ... 
--- @@ -99,10 +113,14 @@ body: | bb.0: ; GCN-LABEL: name: s_bitcmp1_b64 - ; GCN: [[COPY:%[0-9]+]]:sreg_32 = COPY $src_flat_scratch_base_hi - ; GCN-NEXT: S_BITCMP1_B64 undef %1:sreg_64, [[COPY]], implicit undef $scc, implicit-def $scc + ; GCN: [[DEF:%[0-9]+]]:sreg_64 = IMPLICIT_DEF + ; GCN-NEXT: $scc = IMPLICIT_DEF + ; GCN-NEXT: [[COPY:%[0-9]+]]:sreg_32 = COPY $src_flat_scratch_base_hi + ; GCN-NEXT: S_BITCMP1_B64 [[DEF]], [[COPY]], implicit $scc, implicit-def $scc + %1:sreg_64 = IMPLICIT_DEF + $scc = IMPLICIT_DEF %0:sreg_32 = COPY $src_flat_scratch_base_hi - S_BITCMP1_B64 undef %1:sreg_64, %0, implicit undef $scc, implicit-def $scc + S_BITCMP1_B64 %1:sreg_64, %0, implicit $scc, implicit-def $scc ... --- @@ -125,10 +143,12 @@ body: | bb.0: ; GCN-LABEL: name: s_bitset0_b64 - ; GCN: [[COPY:%[0-9]+]]:sreg_32 = COPY $src_flat_scratch_base_hi - ; GCN-NEXT: [[S_BITSET0_B64_:%[0-9]+]]:sreg_64 = S_BITSET0_B64 [[COPY]], undef [[S_BITSET0_B64_]], implicit-def $scc - %0:sreg_32 = COPY $src_flat_scratch_base_hi - %1:sreg_64 = S_BITSET0_B64 %0, undef %1:sreg_64, implicit-def $scc + ; GCN: $sgpr0_sgpr1 = IMPLICIT_DEF + ; GCN-NEXT: $sgpr2 = S_MOV_B32 $src_flat_scratch_base_hi + ; GCN-NEXT: $sgpr0_sgpr1 = S_BITSET0_B64 $sgpr2, $sgpr0_sgpr1, implicit-def $scc + $sgpr0_sgpr1 = IMPLICIT_DEF + $sgpr2 = S_MOV_B32 $src_flat_scratch_base_hi + $sgpr0_sgpr1 = S_BITSET0_B64 $sgpr2, $sgpr0_sgpr1, implicit-def $scc ... 
--- @@ -138,10 +158,12 @@ body: | bb.0: ; GCN-LABEL: name: s_bitset1_b64 - ; GCN: [[COPY:%[0-9]+]]:sreg_32 = COPY $src_flat_scratch_base_hi - ; GCN-NEXT: [[S_BITSET1_B64_:%[0-9]+]]:sreg_64 = S_BITSET1_B64 [[COPY]], undef [[S_BITSET1_B64_]], implicit-def $scc - %0:sreg_32 = COPY $src_flat_scratch_base_hi - %1:sreg_64 = S_BITSET1_B64 %0, undef %1:sreg_64, implicit-def $scc + ; GCN: $sgpr0_sgpr1 = IMPLICIT_DEF + ; GCN-NEXT: $sgpr2 = S_MOV_B32 $src_flat_scratch_base_hi + ; GCN-NEXT: $sgpr0_sgpr1 = S_BITSET1_B64 $sgpr2, $sgpr0_sgpr1, implicit-def $scc + $sgpr0_sgpr1 = IMPLICIT_DEF + $sgpr2 = S_MOV_B32 $src_flat_scratch_base_hi + $sgpr0_sgpr1 = S_BITSET1_B64 $sgpr2, $sgpr0_sgpr1, implicit-def $scc ... --- @@ -151,8 +173,11 @@ body: | bb.0: ; GCN-LABEL: name: s_ashr_i64_phys_dst - ; GCN: [[COPY:%[0-9]+]]:sreg_32 = COPY $src_flat_scratch_base_hi - ; GCN-NEXT: $sgpr0_sgpr1 = S_ASHR_I64 undef %1:sreg_64, [[COPY]], implicit-def $scc + ; GCN: $sgpr0_sgpr1 = IMPLICIT_DEF + ; GCN-NEXT: $sgpr2 = COPY $src_flat_scratch_base_hi + ; GCN-NEXT: $sgpr0_sgpr1 = S_ASHR_I64 $sgpr0_sgpr1, $sgpr2, implicit-def $scc + $sgpr0_sgpr1 = IMPLICIT_DEF + $sgpr2 = COPY $src_flat_scratch_base_hi %0:sreg_32 = COPY $src_flat_scratch_base_hi - $sgpr0_sgpr1 = S_ASHR_I64 undef %1:sreg_64, %0, implicit-def $scc + $sgpr0_sgpr1 = S_ASHR_I64 $sgpr0_sgpr1, $sgpr2, implicit-def $scc ... From ae4289f0e6e1bf61f45f88870aec220c9164800b Mon Sep 17 00:00:00 2001 From: Shih-Po Hung Date: Wed, 3 Dec 2025 16:24:56 +0800 Subject: [PATCH 11/36] [Hexagon][NFC] Drop no-op getMaskedMemoryOpCost/getGatherScatterOpCost stubs (#170426) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit These stubs (from 4bdf1aa416b02) don’t actually override anything. Removing them eliminates the need for a local getMemIntrinsicCost() forwarder in #169885. 
--- .../Target/Hexagon/HexagonTargetTransformInfo.cpp | 12 ------------ llvm/lib/Target/Hexagon/HexagonTargetTransformInfo.h | 6 ------ 2 files changed, 18 deletions(-) diff --git a/llvm/lib/Target/Hexagon/HexagonTargetTransformInfo.cpp b/llvm/lib/Target/Hexagon/HexagonTargetTransformInfo.cpp index c7d6ee306be84..59c6201e07081 100644 --- a/llvm/lib/Target/Hexagon/HexagonTargetTransformInfo.cpp +++ b/llvm/lib/Target/Hexagon/HexagonTargetTransformInfo.cpp @@ -223,12 +223,6 @@ InstructionCost HexagonTTIImpl::getMemoryOpCost(unsigned Opcode, Type *Src, OpInfo, I); } -InstructionCost -HexagonTTIImpl::getMaskedMemoryOpCost(const MemIntrinsicCostAttributes &MICA, - TTI::TargetCostKind CostKind) const { - return BaseT::getMaskedMemoryOpCost(MICA, CostKind); -} - InstructionCost HexagonTTIImpl::getShuffleCost(TTI::ShuffleKind Kind, VectorType *DstTy, VectorType *SrcTy, ArrayRef Mask, @@ -238,12 +232,6 @@ HexagonTTIImpl::getShuffleCost(TTI::ShuffleKind Kind, VectorType *DstTy, return 1; } -InstructionCost -HexagonTTIImpl::getGatherScatterOpCost(const MemIntrinsicCostAttributes &MICA, - TTI::TargetCostKind CostKind) const { - return BaseT::getGatherScatterOpCost(MICA, CostKind); -} - InstructionCost HexagonTTIImpl::getInterleavedMemoryOpCost( unsigned Opcode, Type *VecTy, unsigned Factor, ArrayRef Indices, Align Alignment, unsigned AddressSpace, TTI::TargetCostKind CostKind, diff --git a/llvm/lib/Target/Hexagon/HexagonTargetTransformInfo.h b/llvm/lib/Target/Hexagon/HexagonTargetTransformInfo.h index 3135958ba4b50..edf88cf476f6d 100644 --- a/llvm/lib/Target/Hexagon/HexagonTargetTransformInfo.h +++ b/llvm/lib/Target/Hexagon/HexagonTargetTransformInfo.h @@ -120,16 +120,10 @@ class HexagonTTIImpl final : public BasicTTIImplBase { TTI::OperandValueInfo OpInfo = {TTI::OK_AnyValue, TTI::OP_None}, const Instruction *I = nullptr) const override; InstructionCost - getMaskedMemoryOpCost(const MemIntrinsicCostAttributes &MICA, - TTI::TargetCostKind CostKind) const override; - 
InstructionCost getShuffleCost(TTI::ShuffleKind Kind, VectorType *DstTy, VectorType *SrcTy, ArrayRef Mask, TTI::TargetCostKind CostKind, int Index, VectorType *SubTp, ArrayRef Args = {}, const Instruction *CxtI = nullptr) const override; - InstructionCost - getGatherScatterOpCost(const MemIntrinsicCostAttributes &MICA, - TTI::TargetCostKind CostKind) const override; InstructionCost getInterleavedMemoryOpCost( unsigned Opcode, Type *VecTy, unsigned Factor, ArrayRef Indices, Align Alignment, unsigned AddressSpace, TTI::TargetCostKind CostKind, From cd86b2ab32bb2c444fb48e41a40f43c80a7eaeae Mon Sep 17 00:00:00 2001 From: Vikash Gupta Date: Wed, 3 Dec 2025 13:56:15 +0530 Subject: [PATCH 12/36] [CodeGen] Add MO_LaneMask type and a new COPY_LANEMASK instruction (#151944) Introduce MO_LaneMask as a new machine operand type. This can be used to hold liveness information at sub-register granularity for register-type operands. We also introduce a new COPY_LANEMASK instruction that uses MO_lanemask operand to perform partial copy from source register operand. One such use case of MO_LaneMask can be seen in #151123, where it can be used to store live regUnits information corresponding to the source register of the COPY instructions, later can be used during CopyPhysReg expansion. 
--- llvm/docs/MIRLangRef.rst | 16 ++++++++ llvm/include/llvm/CodeGen/MIR2Vec.h | 2 +- llvm/include/llvm/CodeGen/MachineInstr.h | 5 +++ .../llvm/CodeGen/MachineInstrBuilder.h | 5 +++ llvm/include/llvm/CodeGen/MachineOperand.h | 17 +++++++- llvm/include/llvm/Support/TargetOpcodes.def | 6 +++ llvm/include/llvm/Target/Target.td | 7 ++++ llvm/lib/CodeGen/MIRParser/MILexer.cpp | 1 + llvm/lib/CodeGen/MIRParser/MILexer.h | 1 + llvm/lib/CodeGen/MIRParser/MIParser.cpp | 28 +++++++++++++ llvm/lib/CodeGen/MIRPrinter.cpp | 3 +- llvm/lib/CodeGen/MIRVRegNamerUtils.cpp | 3 +- llvm/lib/CodeGen/MachineOperand.cpp | 19 +++++++-- llvm/lib/CodeGen/MachineStableHash.cpp | 4 ++ llvm/lib/CodeGen/MachineVerifier.cpp | 40 +++++++++++++++++++ llvm/lib/Target/ARM/ARMExpandPseudoInsts.cpp | 1 + .../parse-lanemask-operand-empty-lanemask.mir | 13 ++++++ ...arse-lanemask-operand-invalid-lanemask.mir | 13 ++++++ .../parse-lanemask-operand-missing-lparen.mir | 13 ++++++ .../parse-lanemask-operand-missing-rparen.mir | 13 ++++++ .../MIR/AMDGPU/parse-lanemask-operand.mir | 17 ++++++++ .../Inputs/reference_x86_vocab_print.txt | 2 + .../reference_x86_vocab_wo=0.5_print.txt | 2 + ...verifier-copyLanemask-invalid-lanemask.mir | 37 +++++++++++++++++ ...verifier-copyLanemask-missing-lanemask.mir | 19 +++++++++ .../match-table-cxx.td | 2 +- llvm/unittests/CodeGen/MachineOperandTest.cpp | 17 ++++++++ 27 files changed, 298 insertions(+), 8 deletions(-) create mode 100644 llvm/test/CodeGen/MIR/AMDGPU/parse-lanemask-operand-empty-lanemask.mir create mode 100644 llvm/test/CodeGen/MIR/AMDGPU/parse-lanemask-operand-invalid-lanemask.mir create mode 100644 llvm/test/CodeGen/MIR/AMDGPU/parse-lanemask-operand-missing-lparen.mir create mode 100644 llvm/test/CodeGen/MIR/AMDGPU/parse-lanemask-operand-missing-rparen.mir create mode 100644 llvm/test/CodeGen/MIR/AMDGPU/parse-lanemask-operand.mir create mode 100644 llvm/test/MachineVerifier/AMDGPU/verifier-copyLanemask-invalid-lanemask.mir create mode 100644 
llvm/test/MachineVerifier/AMDGPU/verifier-copyLanemask-missing-lanemask.mir diff --git a/llvm/docs/MIRLangRef.rst b/llvm/docs/MIRLangRef.rst index f7647c898c1e6..32c96d197a027 100644 --- a/llvm/docs/MIRLangRef.rst +++ b/llvm/docs/MIRLangRef.rst @@ -807,6 +807,22 @@ For an int eq predicate ``ICMP_EQ``, the syntax is: %2:gpr(s32) = G_ICMP intpred(eq), %0, %1 +LaneMask Operands +^^^^^^^^^^^^^^^^^ + +A LaneMask operand contains a LaneBitmask struct representing the covering of a +register with sub-registers. Instructions typically associate a LaneMask operand +with one or more Register operands, and use it to represent sub-register +granularity information like liveness for those associated Register operands. + + +For example, the COPY_LANEMASK instruction uses this operand to copy only active +lanes (of the source register) in the mask. The syntax for it would look like: + +.. code-block:: text + + $vgpr1 = COPY_LANEMASK $vgpr0, lanemask(0x00000000000000C0) + .. TODO: Describe the parsers default behaviour when optional YAML attributes are missing. .. TODO: Describe the syntax for virtual register YAML definitions. 
diff --git a/llvm/include/llvm/CodeGen/MIR2Vec.h b/llvm/include/llvm/CodeGen/MIR2Vec.h index c12d0043bc481..9e59b0d3da4f4 100644 --- a/llvm/include/llvm/CodeGen/MIR2Vec.h +++ b/llvm/include/llvm/CodeGen/MIR2Vec.h @@ -139,7 +139,7 @@ class MIRVocabulary { "FrameIndex", "ConstantPoolIndex", "TargetIndex", "JumpTableIndex", "ExternalSymbol", "GlobalAddress", "BlockAddress", "RegisterMask", "RegisterLiveOut", "Metadata", "MCSymbol", "CFIIndex", - "IntrinsicID", "Predicate", "ShuffleMask"}; + "IntrinsicID", "Predicate", "ShuffleMask", "LaneMask"}; static_assert(std::size(CommonOperandNames) == MachineOperand::MO_Last - 1 && "Common operand names size changed, update accordingly"); diff --git a/llvm/include/llvm/CodeGen/MachineInstr.h b/llvm/include/llvm/CodeGen/MachineInstr.h index 077e39b49df6f..8e2574974a82d 100644 --- a/llvm/include/llvm/CodeGen/MachineInstr.h +++ b/llvm/include/llvm/CodeGen/MachineInstr.h @@ -1451,6 +1451,10 @@ class MachineInstr return getOpcode() == TargetOpcode::COPY; } + bool isCopyLaneMask() const { + return getOpcode() == TargetOpcode::COPY_LANEMASK; + } + bool isFullCopy() const { return isCopy() && !getOperand(0).getSubReg() && !getOperand(1).getSubReg(); } @@ -1484,6 +1488,7 @@ class MachineInstr case TargetOpcode::PHI: case TargetOpcode::G_PHI: case TargetOpcode::COPY: + case TargetOpcode::COPY_LANEMASK: case TargetOpcode::INSERT_SUBREG: case TargetOpcode::SUBREG_TO_REG: case TargetOpcode::REG_SEQUENCE: diff --git a/llvm/include/llvm/CodeGen/MachineInstrBuilder.h b/llvm/include/llvm/CodeGen/MachineInstrBuilder.h index caeb430d6fd1c..060f0c41de73a 100644 --- a/llvm/include/llvm/CodeGen/MachineInstrBuilder.h +++ b/llvm/include/llvm/CodeGen/MachineInstrBuilder.h @@ -307,6 +307,11 @@ class MachineInstrBuilder { return *this; } + const MachineInstrBuilder &addLaneMask(LaneBitmask LaneMask) const { + MI->addOperand(*MF, MachineOperand::CreateLaneMask(LaneMask)); + return *this; + } + const MachineInstrBuilder &addSym(MCSymbol *Sym, unsigned char 
TargetFlags = 0) const { MI->addOperand(*MF, MachineOperand::CreateMCSymbol(Sym, TargetFlags)); diff --git a/llvm/include/llvm/CodeGen/MachineOperand.h b/llvm/include/llvm/CodeGen/MachineOperand.h index 9104e93ed9783..d85da5a4997f1 100644 --- a/llvm/include/llvm/CodeGen/MachineOperand.h +++ b/llvm/include/llvm/CodeGen/MachineOperand.h @@ -16,6 +16,7 @@ #include "llvm/ADT/DenseMapInfo.h" #include "llvm/CodeGen/Register.h" #include "llvm/IR/Intrinsics.h" +#include "llvm/MC/LaneBitmask.h" #include "llvm/Support/Compiler.h" #include @@ -69,7 +70,8 @@ class MachineOperand { MO_Predicate, ///< Generic predicate for ISel MO_ShuffleMask, ///< Other IR Constant for ISel (shuffle masks) MO_DbgInstrRef, ///< Integer indices referring to an instruction+operand - MO_Last = MO_DbgInstrRef + MO_LaneMask, ///< Mask to represent active parts of registers + MO_Last = MO_LaneMask }; private: @@ -178,6 +180,7 @@ class MachineOperand { Intrinsic::ID IntrinsicID; // For MO_IntrinsicID. unsigned Pred; // For MO_Predicate ArrayRef ShuffleMask; // For MO_ShuffleMask + LaneBitmask LaneMask; // For MO_LaneMask struct { // For MO_Register. // Register number is in SmallContents.RegNo. @@ -360,6 +363,7 @@ class MachineOperand { bool isIntrinsicID() const { return OpKind == MO_IntrinsicID; } bool isPredicate() const { return OpKind == MO_Predicate; } bool isShuffleMask() const { return OpKind == MO_ShuffleMask; } + bool isLaneMask() const { return OpKind == MO_LaneMask; } //===--------------------------------------------------------------------===// // Accessors for Register Operands //===--------------------------------------------------------------------===// @@ -624,6 +628,11 @@ class MachineOperand { return Contents.ShuffleMask; } + LaneBitmask getLaneMask() const { + assert(isLaneMask() && "Wrong MachineOperand accessor"); + return Contents.LaneMask; + } + /// Return the offset from the symbol in this operand. This always returns 0 /// for ExternalSymbol operands. 
int64_t getOffset() const { @@ -992,6 +1001,12 @@ class MachineOperand { return Op; } + static MachineOperand CreateLaneMask(LaneBitmask LaneMask) { + MachineOperand Op(MachineOperand::MO_LaneMask); + Op.Contents.LaneMask = LaneMask; + return Op; + } + friend class MachineInstr; friend class MachineRegisterInfo; diff --git a/llvm/include/llvm/Support/TargetOpcodes.def b/llvm/include/llvm/Support/TargetOpcodes.def index 341fc5e50b33c..0d92f50a09d38 100644 --- a/llvm/include/llvm/Support/TargetOpcodes.def +++ b/llvm/include/llvm/Support/TargetOpcodes.def @@ -114,6 +114,12 @@ HANDLE_TARGET_OPCODE(REG_SEQUENCE) /// used to copy between subregisters of virtual registers. HANDLE_TARGET_OPCODE(COPY) +/// COPY_LANEMASK - Target-independent partial register copy. The laneMask +/// operand indicates which parts of the source register are copied to the +/// destination. Other parts of the destination are undefined. It does not +/// support copy between virtual registers having subregister indices. +HANDLE_TARGET_OPCODE(COPY_LANEMASK) + /// BUNDLE - This instruction represents an instruction bundle. Instructions /// which immediately follow a BUNDLE instruction which are marked with /// 'InsideBundle' flag are inside the bundle. 
diff --git a/llvm/include/llvm/Target/Target.td b/llvm/include/llvm/Target/Target.td index ef2ccb0abeb1e..315de55b75510 100644 --- a/llvm/include/llvm/Target/Target.td +++ b/llvm/include/llvm/Target/Target.td @@ -1352,6 +1352,13 @@ def COPY : StandardPseudoInstruction { let isAsCheapAsAMove = true; let hasNoSchedulingInfo = false; } +def COPY_LANEMASK : StandardPseudoInstruction { + let OutOperandList = (outs unknown:$dst); + let InOperandList = (ins unknown:$src, unknown:$lanemask); + let AsmString = ""; + let hasSideEffects = false; + let isAsCheapAsAMove = true; +} def BUNDLE : StandardPseudoInstruction { let OutOperandList = (outs); let InOperandList = (ins variable_ops); diff --git a/llvm/lib/CodeGen/MIRParser/MILexer.cpp b/llvm/lib/CodeGen/MIRParser/MILexer.cpp index dbd56c7414f38..aa79aad781ee8 100644 --- a/llvm/lib/CodeGen/MIRParser/MILexer.cpp +++ b/llvm/lib/CodeGen/MIRParser/MILexer.cpp @@ -266,6 +266,7 @@ static MIToken::TokenKind getIdentifierKind(StringRef Identifier) { .Case("constant-pool", MIToken::kw_constant_pool) .Case("call-entry", MIToken::kw_call_entry) .Case("custom", MIToken::kw_custom) + .Case("lanemask", MIToken::kw_lanemask) .Case("liveout", MIToken::kw_liveout) .Case("landing-pad", MIToken::kw_landing_pad) .Case("inlineasm-br-indirect-target", diff --git a/llvm/lib/CodeGen/MIRParser/MILexer.h b/llvm/lib/CodeGen/MIRParser/MILexer.h index 0407a0e7540d7..ff4c0c6f6d59d 100644 --- a/llvm/lib/CodeGen/MIRParser/MILexer.h +++ b/llvm/lib/CodeGen/MIRParser/MILexer.h @@ -122,6 +122,7 @@ struct MIToken { kw_constant_pool, kw_call_entry, kw_custom, + kw_lanemask, kw_liveout, kw_landing_pad, kw_inlineasm_br_indirect_target, diff --git a/llvm/lib/CodeGen/MIRParser/MIParser.cpp b/llvm/lib/CodeGen/MIRParser/MIParser.cpp index f35274d4e2edf..51862b501cc47 100644 --- a/llvm/lib/CodeGen/MIRParser/MIParser.cpp +++ b/llvm/lib/CodeGen/MIRParser/MIParser.cpp @@ -496,6 +496,7 @@ class MIParser { bool parseTargetIndexOperand(MachineOperand &Dest); bool 
parseDbgInstrRefOperand(MachineOperand &Dest); bool parseCustomRegisterMaskOperand(MachineOperand &Dest); + bool parseLaneMaskOperand(MachineOperand &Dest); bool parseLiveoutRegisterMaskOperand(MachineOperand &Dest); bool parseMachineOperand(const unsigned OpCode, const unsigned OpIdx, MachineOperand &Dest, @@ -2886,6 +2887,31 @@ bool MIParser::parseCustomRegisterMaskOperand(MachineOperand &Dest) { return false; } +bool MIParser::parseLaneMaskOperand(MachineOperand &Dest) { + assert(Token.is(MIToken::kw_lanemask)); + + lex(); + if (expectAndConsume(MIToken::lparen)) + return true; + + // Parse lanemask. + if (Token.isNot(MIToken::IntegerLiteral) && Token.isNot(MIToken::HexLiteral)) + return error("expected a valid lane mask value"); + static_assert(sizeof(LaneBitmask::Type) == sizeof(uint64_t), + "Use correct get-function for lane mask."); + LaneBitmask::Type V; + if (getUint64(V)) + return true; + LaneBitmask LaneMask(V); + lex(); + + if (expectAndConsume(MIToken::rparen)) + return true; + + Dest = MachineOperand::CreateLaneMask(LaneMask); + return false; +} + bool MIParser::parseLiveoutRegisterMaskOperand(MachineOperand &Dest) { assert(Token.is(MIToken::kw_liveout)); uint32_t *Mask = MF.allocateRegMask(); @@ -2986,6 +3012,8 @@ bool MIParser::parseMachineOperand(const unsigned OpCode, const unsigned OpIdx, return parseIntrinsicOperand(Dest); case MIToken::kw_target_index: return parseTargetIndexOperand(Dest); + case MIToken::kw_lanemask: + return parseLaneMaskOperand(Dest); case MIToken::kw_liveout: return parseLiveoutRegisterMaskOperand(Dest); case MIToken::kw_floatpred: diff --git a/llvm/lib/CodeGen/MIRPrinter.cpp b/llvm/lib/CodeGen/MIRPrinter.cpp index c0554497653f8..02f07b474c048 100644 --- a/llvm/lib/CodeGen/MIRPrinter.cpp +++ b/llvm/lib/CodeGen/MIRPrinter.cpp @@ -965,7 +965,8 @@ static void printMIOperand(raw_ostream &OS, MFPrintState &State, case MachineOperand::MO_Predicate: case MachineOperand::MO_BlockAddress: case MachineOperand::MO_DbgInstrRef: - case 
MachineOperand::MO_ShuffleMask: { + case MachineOperand::MO_ShuffleMask: + case MachineOperand::MO_LaneMask: { unsigned TiedOperandIdx = 0; if (ShouldPrintRegisterTies && Op.isReg() && Op.isTied() && !Op.isDef()) TiedOperandIdx = Op.getParent()->findTiedOperandIdx(OpIdx); diff --git a/llvm/lib/CodeGen/MIRVRegNamerUtils.cpp b/llvm/lib/CodeGen/MIRVRegNamerUtils.cpp index a22cc91b90542..884c625f94cc7 100644 --- a/llvm/lib/CodeGen/MIRVRegNamerUtils.cpp +++ b/llvm/lib/CodeGen/MIRVRegNamerUtils.cpp @@ -93,11 +93,12 @@ std::string VRegRenamer::getInstructionOpcodeHash(MachineInstr &MI) { // is contributing to a hash collision but there's enough information // (Opcodes,other registers etc) that this will likely not be a problem. - // TODO: Handle the following Index/ID/Predicate cases. They can + // TODO: Handle the following Index/ID/Predicate/LaneMask cases. They can // be hashed on in a stable manner. case MachineOperand::MO_CFIIndex: case MachineOperand::MO_IntrinsicID: case MachineOperand::MO_Predicate: + case MachineOperand::MO_LaneMask: // In the cases below we havn't found a way to produce an artifact that will // result in a stable hash, in most cases because they are pointers. 
We want diff --git a/llvm/lib/CodeGen/MachineOperand.cpp b/llvm/lib/CodeGen/MachineOperand.cpp index 8c6d2194433d0..ac1f201bc8b83 100644 --- a/llvm/lib/CodeGen/MachineOperand.cpp +++ b/llvm/lib/CodeGen/MachineOperand.cpp @@ -394,6 +394,8 @@ bool MachineOperand::isIdenticalTo(const MachineOperand &Other) const { return getPredicate() == Other.getPredicate(); case MachineOperand::MO_ShuffleMask: return getShuffleMask() == Other.getShuffleMask(); + case MachineOperand::MO_LaneMask: + return getLaneMask() == Other.getLaneMask(); } llvm_unreachable("Invalid machine operand type"); } @@ -460,6 +462,9 @@ hash_code llvm::hash_value(const MachineOperand &MO) { return hash_combine(MO.getType(), MO.getTargetFlags(), MO.getPredicate()); case MachineOperand::MO_ShuffleMask: return hash_combine(MO.getType(), MO.getTargetFlags(), MO.getShuffleMask()); + case MachineOperand::MO_LaneMask: + return hash_combine(MO.getType(), MO.getTargetFlags(), + MO.getLaneMask().getAsInteger()); } llvm_unreachable("Invalid machine operand type"); } @@ -1019,11 +1024,11 @@ void MachineOperand::print(raw_ostream &OS, ModuleSlotTracker &MST, } case MachineOperand::MO_Predicate: { auto Pred = static_cast(getPredicate()); - OS << (CmpInst::isIntPredicate(Pred) ? "int" : "float") << "pred(" - << Pred << ')'; + OS << (CmpInst::isIntPredicate(Pred) ? 
"int" : "float") << "pred(" << Pred + << ')'; break; } - case MachineOperand::MO_ShuffleMask: + case MachineOperand::MO_ShuffleMask: { OS << "shufflemask("; ArrayRef Mask = getShuffleMask(); StringRef Separator; @@ -1038,6 +1043,14 @@ void MachineOperand::print(raw_ostream &OS, ModuleSlotTracker &MST, OS << ')'; break; } + case MachineOperand::MO_LaneMask: { + OS << "lanemask("; + LaneBitmask LaneMask = getLaneMask(); + OS << "0x" << PrintLaneMask(LaneMask); + OS << ')'; + break; + } + } } #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) diff --git a/llvm/lib/CodeGen/MachineStableHash.cpp b/llvm/lib/CodeGen/MachineStableHash.cpp index 6da708d51b95f..2f5f5aeccb2e4 100644 --- a/llvm/lib/CodeGen/MachineStableHash.cpp +++ b/llvm/lib/CodeGen/MachineStableHash.cpp @@ -165,6 +165,10 @@ stable_hash llvm::stableHashValue(const MachineOperand &MO) { return stable_hash_combine(MO.getType(), MO.getTargetFlags(), stable_hash_name(SymbolName)); } + case MachineOperand::MO_LaneMask: { + return stable_hash_combine(MO.getType(), MO.getTargetFlags(), + MO.getLaneMask().getAsInteger()); + } case MachineOperand::MO_CFIIndex: return stable_hash_combine(MO.getType(), MO.getTargetFlags(), MO.getCFIIndex()); diff --git a/llvm/lib/CodeGen/MachineVerifier.cpp b/llvm/lib/CodeGen/MachineVerifier.cpp index a2a66d6128348..9be741d96e456 100644 --- a/llvm/lib/CodeGen/MachineVerifier.cpp +++ b/llvm/lib/CodeGen/MachineVerifier.cpp @@ -2428,6 +2428,46 @@ void MachineVerifier::visitMachineInstrBefore(const MachineInstr *MI) { } break; } + case TargetOpcode::COPY_LANEMASK: { + const MachineOperand &DstOp = MI->getOperand(0); + const MachineOperand &SrcOp = MI->getOperand(1); + const MachineOperand &LaneMaskOp = MI->getOperand(2); + const Register SrcReg = SrcOp.getReg(); + const LaneBitmask LaneMask = LaneMaskOp.getLaneMask(); + LaneBitmask SrcMaxLaneMask = LaneBitmask::getAll(); + + if (DstOp.getSubReg()) + report("COPY_LANEMASK must not use a subregister index", &DstOp, 0); + + if 
(SrcOp.getSubReg()) + report("COPY_LANEMASK must not use a subregister index", &SrcOp, 1); + + if (LaneMask.none()) + report("COPY_LANEMASK must read at least one lane", MI); + + if (SrcReg.isPhysical()) { + const TargetRegisterClass *SrcRC = TRI->getMinimalPhysRegClass(SrcReg); + if (SrcRC) + SrcMaxLaneMask = SrcRC->getLaneMask(); + } else { + SrcMaxLaneMask = MRI->getMaxLaneMaskForVReg(SrcReg); + } + + // COPY_LANEMASK should be used only for partial copy. For full + // copy, one should strictly use the COPY instruction. + if (SrcMaxLaneMask == LaneMask) + report("COPY_LANEMASK cannot be used to do full copy", MI); + + // If LaneMask is greater than the SrcMaxLaneMask, it implies + // COPY_LANEMASK is attempting to read from the lanes that + // don't exists in the source register. + if (SrcMaxLaneMask < LaneMask) + report("COPY_LANEMASK attempts to read from the lanes that " + "don't exist in the source register", + MI); + + break; + } case TargetOpcode::STATEPOINT: { StatepointOpers SO(MI); if (!MI->getOperand(SO.getIDPos()).isImm() || diff --git a/llvm/lib/Target/ARM/ARMExpandPseudoInsts.cpp b/llvm/lib/Target/ARM/ARMExpandPseudoInsts.cpp index fffb63738166d..d69c09fcb39db 100644 --- a/llvm/lib/Target/ARM/ARMExpandPseudoInsts.cpp +++ b/llvm/lib/Target/ARM/ARMExpandPseudoInsts.cpp @@ -932,6 +932,7 @@ static bool IsAnAddressOperand(const MachineOperand &MO) { return true; case MachineOperand::MO_RegisterMask: case MachineOperand::MO_RegisterLiveOut: + case MachineOperand::MO_LaneMask: return false; case MachineOperand::MO_Metadata: case MachineOperand::MO_MCSymbol: diff --git a/llvm/test/CodeGen/MIR/AMDGPU/parse-lanemask-operand-empty-lanemask.mir b/llvm/test/CodeGen/MIR/AMDGPU/parse-lanemask-operand-empty-lanemask.mir new file mode 100644 index 0000000000000..68324f1b2f90e --- /dev/null +++ b/llvm/test/CodeGen/MIR/AMDGPU/parse-lanemask-operand-empty-lanemask.mir @@ -0,0 +1,13 @@ +# RUN: not llc -mtriple=amdgcn-amd-amdhsa -run-pass=none -filetype=null %s 2>&1 | 
FileCheck %s + +--- +name: test_empty_lanemask_type +tracksRegLiveness: true +body: | + bb.0: + liveins: $vgpr0 + + ; CHECK: [[@LINE+1]]:45: expected a valid lane mask value + $vgpr1 = COPY_LANEMASK $vgpr0, lanemask() + S_ENDPGM 0 +... diff --git a/llvm/test/CodeGen/MIR/AMDGPU/parse-lanemask-operand-invalid-lanemask.mir b/llvm/test/CodeGen/MIR/AMDGPU/parse-lanemask-operand-invalid-lanemask.mir new file mode 100644 index 0000000000000..647f6116f18f7 --- /dev/null +++ b/llvm/test/CodeGen/MIR/AMDGPU/parse-lanemask-operand-invalid-lanemask.mir @@ -0,0 +1,13 @@ +# RUN: not llc -mtriple=amdgcn-amd-amdhsa -run-pass=none -filetype=null %s 2>&1 | FileCheck %s + +--- +name: test_wrong_lanemask_type +tracksRegLiveness: true +body: | + bb.0: + liveins: $vgpr0 + + ; CHECK: [[@LINE+1]]:45: expected a valid lane mask value + $vgpr1 = COPY_LANEMASK $vgpr0, lanemask(undef) + S_ENDPGM 0 +... diff --git a/llvm/test/CodeGen/MIR/AMDGPU/parse-lanemask-operand-missing-lparen.mir b/llvm/test/CodeGen/MIR/AMDGPU/parse-lanemask-operand-missing-lparen.mir new file mode 100644 index 0000000000000..3382572f67213 --- /dev/null +++ b/llvm/test/CodeGen/MIR/AMDGPU/parse-lanemask-operand-missing-lparen.mir @@ -0,0 +1,13 @@ +# RUN: not llc -mtriple=amdgcn-amd-amdhsa -run-pass=none -filetype=null %s 2>&1 | FileCheck %s + +--- +name: test_missing_lparen +tracksRegLiveness: true +body: | + bb.0: + liveins: $vgpr0 + + ; CHECK: [[@LINE+1]]:45: expected '(' + $vgpr1 = COPY_LANEMASK $vgpr0, lanemask 14) + S_ENDPGM 0 +... 
diff --git a/llvm/test/CodeGen/MIR/AMDGPU/parse-lanemask-operand-missing-rparen.mir b/llvm/test/CodeGen/MIR/AMDGPU/parse-lanemask-operand-missing-rparen.mir new file mode 100644 index 0000000000000..052305d9f9c36 --- /dev/null +++ b/llvm/test/CodeGen/MIR/AMDGPU/parse-lanemask-operand-missing-rparen.mir @@ -0,0 +1,13 @@ +# RUN: not llc -mtriple=amdgcn-amd-amdhsa -run-pass=none -filetype=null %s 2>&1 | FileCheck %s + +--- +name: test_missing_rparen +tracksRegLiveness: true +body: | + bb.0: + liveins: $vgpr0 + + ; CHECK: [[@LINE+1]]:47: expected ')' + $vgpr1 = COPY_LANEMASK $vgpr0, lanemask(16 + S_ENDPGM 0 +... diff --git a/llvm/test/CodeGen/MIR/AMDGPU/parse-lanemask-operand.mir b/llvm/test/CodeGen/MIR/AMDGPU/parse-lanemask-operand.mir new file mode 100644 index 0000000000000..066bc8e79a56e --- /dev/null +++ b/llvm/test/CodeGen/MIR/AMDGPU/parse-lanemask-operand.mir @@ -0,0 +1,17 @@ +# RUN: llc -mtriple=amdgcn-amd-amdhsa -run-pass=none -verify-machineinstrs -o - %s | FileCheck %s + +# This test checks for the correctness of the MIR parser for lanemask + +# CHECK-LABEL: name: test_lanemask_operand +# CHECK: COPY_LANEMASK $vgpr0, lanemask(0x0000000000000002) +--- +name: test_lanemask_operand +tracksRegLiveness: true +body: | + bb.0: + liveins: $vgpr0 + + $vgpr1 = COPY_LANEMASK $vgpr0, lanemask(2) + S_ENDPGM 0 +... 
+ diff --git a/llvm/test/CodeGen/MIR2Vec/Inputs/reference_x86_vocab_print.txt b/llvm/test/CodeGen/MIR2Vec/Inputs/reference_x86_vocab_print.txt index 74ef1e608d4ba..62e07445ad12e 100644 --- a/llvm/test/CodeGen/MIR2Vec/Inputs/reference_x86_vocab_print.txt +++ b/llvm/test/CodeGen/MIR2Vec/Inputs/reference_x86_vocab_print.txt @@ -186,6 +186,7 @@ Key: CONVERGENCECTRL_ENTRY: [ 0.00 0.00 ] Key: CONVERGENCECTRL_GLUE: [ 0.00 0.00 ] Key: CONVERGENCECTRL_LOOP: [ 0.00 0.00 ] Key: COPY: [ 0.00 0.00 ] +Key: COPY_LANEMASK: [ 0.00 0.00 ] Key: COPY_TO_REGCLASS: [ 0.00 0.00 ] Key: CPUID: [ 0.00 0.00 ] Key: CQO: [ 0.00 0.00 ] @@ -6884,6 +6885,7 @@ Key: CFIIndex: [ 0.00 0.00 ] Key: IntrinsicID: [ 0.00 0.00 ] Key: Predicate: [ 0.00 0.00 ] Key: ShuffleMask: [ 0.00 0.00 ] +Key: LaneMask: [ 0.00 0.00 ] Key: PhyReg_GR8: [ 0.00 0.00 ] Key: PhyReg_GRH8: [ 0.00 0.00 ] Key: PhyReg_GR8_NOREX2: [ 0.00 0.00 ] diff --git a/llvm/test/CodeGen/MIR2Vec/Inputs/reference_x86_vocab_wo=0.5_print.txt b/llvm/test/CodeGen/MIR2Vec/Inputs/reference_x86_vocab_wo=0.5_print.txt index 1ba4f13e69c92..03a3fafc6b801 100644 --- a/llvm/test/CodeGen/MIR2Vec/Inputs/reference_x86_vocab_wo=0.5_print.txt +++ b/llvm/test/CodeGen/MIR2Vec/Inputs/reference_x86_vocab_wo=0.5_print.txt @@ -186,6 +186,7 @@ Key: CONVERGENCECTRL_ENTRY: [ 0.00 0.00 ] Key: CONVERGENCECTRL_GLUE: [ 0.00 0.00 ] Key: CONVERGENCECTRL_LOOP: [ 0.00 0.00 ] Key: COPY: [ 0.00 0.00 ] +Key: COPY_LANEMASK: [ 0.00 0.00 ] Key: COPY_TO_REGCLASS: [ 0.00 0.00 ] Key: CPUID: [ 0.00 0.00 ] Key: CQO: [ 0.00 0.00 ] @@ -6884,6 +6885,7 @@ Key: CFIIndex: [ 0.00 0.00 ] Key: IntrinsicID: [ 0.00 0.00 ] Key: Predicate: [ 0.00 0.00 ] Key: ShuffleMask: [ 0.00 0.00 ] +Key: LaneMask: [ 0.00 0.00 ] Key: PhyReg_GR8: [ 0.00 0.00 ] Key: PhyReg_GRH8: [ 0.00 0.00 ] Key: PhyReg_GR8_NOREX2: [ 0.00 0.00 ] diff --git a/llvm/test/MachineVerifier/AMDGPU/verifier-copyLanemask-invalid-lanemask.mir b/llvm/test/MachineVerifier/AMDGPU/verifier-copyLanemask-invalid-lanemask.mir new file mode 100644 index 
0000000000000..b7d775f7a1b35 --- /dev/null +++ b/llvm/test/MachineVerifier/AMDGPU/verifier-copyLanemask-invalid-lanemask.mir @@ -0,0 +1,37 @@ +# RUN: not --crash llc -o - -mtriple=amdgcn-amd-amdhsa -run-pass=none %s 2>&1 | FileCheck %s + +--- +name: test_copy_lanemask_instruction_0 +tracksRegLiveness: true +body: | + bb.0: + liveins: $vgpr0, $vgpr1 + + %0:vgpr_32 = IMPLICIT_DEF + + ; CHECK: *** Bad machine code: COPY_LANEMASK must read at least one lane *** + $vgpr2 = COPY_LANEMASK $vgpr0, lanemask(0) + + ; CHECK: *** Bad machine code: COPY_LANEMASK attempts to read from the lanes that don't exist in the source register *** + $vgpr3 = COPY_LANEMASK $vgpr1, lanemask(0xFFFFFFFFFFFFFFFF) + + ; CHECK: *** Bad machine code: COPY_LANEMASK cannot be used to do full copy *** + $vgpr4_vgpr5 = COPY_LANEMASK $vgpr2_vgpr3, lanemask(0x000000000000000F) + + ; CHECK: *** Bad machine code: COPY_LANEMASK cannot be used to do full copy *** + %1:vgpr_32 = COPY_LANEMASK %0, lanemask(0x0000000000000003) + + ; CHECK: *** Bad machine code: COPY_LANEMASK attempts to read from the lanes that don't exist in the source register *** + %2:vgpr_32 = COPY_LANEMASK %1, lanemask(0x0000000FFFFFFFFF) + + ; CHECK: *** Bad machine code: COPY_LANEMASK attempts to read from the lanes that don't exist in the source register *** + %3:vreg_64 = COPY_LANEMASK $vgpr4_vgpr5, lanemask(0x00000000000000FF) + + ; CHECK: *** Bad machine code: COPY_LANEMASK cannot be used to do full copy *** + $vgpr6_vgpr7 = COPY_LANEMASK %3, lanemask(0x000000000000000F) + + ; CHECK: *** Bad machine code: COPY_LANEMASK must not use a subregister index *** + %4:vgpr_32 = COPY_LANEMASK %3.sub0, lanemask(0x0000000000000003) + + S_ENDPGM 0 +... 
diff --git a/llvm/test/MachineVerifier/AMDGPU/verifier-copyLanemask-missing-lanemask.mir b/llvm/test/MachineVerifier/AMDGPU/verifier-copyLanemask-missing-lanemask.mir new file mode 100644 index 0000000000000..0b461107f5b5f --- /dev/null +++ b/llvm/test/MachineVerifier/AMDGPU/verifier-copyLanemask-missing-lanemask.mir @@ -0,0 +1,19 @@ +# RUN: not --crash llc -o - -mtriple=amdgcn-amd-amdhsa -run-pass=none %s 2>&1 | FileCheck %s + +# CHECK: *** Bad machine code: Too few operands *** +# CHECK-NEXT: - function: test_copy_lanemask_instruction_1 +# CHECK-NEXT: - basic block: %bb.0 +# CHECK-NEXT: - instruction: $vgpr2 = COPY_LANEMASK %0:vgpr_32 + +--- +name: test_copy_lanemask_instruction_1 +tracksRegLiveness: true +body: | + bb.0: + liveins: $vgpr0, $vgpr1 + + %0:vgpr_32 = COPY $vgpr0 + $vgpr2 = COPY_LANEMASK %0 + S_ENDPGM 0 +... + diff --git a/llvm/test/TableGen/GlobalISelCombinerEmitter/match-table-cxx.td b/llvm/test/TableGen/GlobalISelCombinerEmitter/match-table-cxx.td index a8488ca3b8e6a..28017700a0448 100644 --- a/llvm/test/TableGen/GlobalISelCombinerEmitter/match-table-cxx.td +++ b/llvm/test/TableGen/GlobalISelCombinerEmitter/match-table-cxx.td @@ -96,7 +96,7 @@ def MyCombiner: GICombiner<"GenMyCombiner", [ // CHECK: const uint8_t *GenMyCombiner::getMatchTable() const { // CHECK-NEXT: constexpr static uint8_t MatchTable0[] = { -// CHECK-NEXT: /* 0 */ GIM_SwitchOpcode, /*MI*/0, /*[*/GIMT_Encode2(104), GIMT_Encode2(216), /*)*//*default:*//*Label 5*/ GIMT_Encode4(524), +// CHECK-NEXT: /* 0 */ GIM_SwitchOpcode, /*MI*/0, /*[*/GIMT_Encode2(105), GIMT_Encode2(217), /*)*//*default:*//*Label 5*/ GIMT_Encode4(524), // CHECK-NEXT: /* 10 */ /*TargetOpcode::G_STORE*//*Label 0*/ GIMT_Encode4(458), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), 
GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), // CHECK-NEXT: /* 182 */ /*TargetOpcode::G_SEXT*//*Label 1*/ GIMT_Encode4(476), GIMT_Encode4(0), // CHECK-NEXT: /* 190 */ /*TargetOpcode::G_ZEXT*//*Label 2*/ GIMT_Encode4(488), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), diff --git a/llvm/unittests/CodeGen/MachineOperandTest.cpp b/llvm/unittests/CodeGen/MachineOperandTest.cpp index 0373c7a0f629b..c0b2b1895975a 100644 --- a/llvm/unittests/CodeGen/MachineOperandTest.cpp +++ b/llvm/unittests/CodeGen/MachineOperandTest.cpp @@ -288,6 +288,23 @@ TEST(MachineOperandTest, PrintGlobalAddress) { } } +TEST(MachineOperandTest, PrintLaneMask) { + // Create a MachineOperand with a lanemask and print it. 
+ LaneBitmask LaneMask = LaneBitmask(12); + MachineOperand MO = MachineOperand::CreateLaneMask(LaneMask); + + // Checking some preconditions on the newly created + // MachineOperand. + ASSERT_TRUE(MO.isLaneMask()); + ASSERT_EQ(MO.getLaneMask(), LaneMask); + + std::string str; + // Print a MachineOperand that is lanemask as in HEX representation. + raw_string_ostream OS(str); + MO.print(OS, /*TRI=*/nullptr); + ASSERT_EQ(str, "lanemask(0x000000000000000C)"); +} + TEST(MachineOperandTest, PrintRegisterLiveOut) { // Create a MachineOperand with a register live out list and print it. uint32_t Mask = 0; From c5ecdec9fb84e6865fe44f69e380afa1291c2adf Mon Sep 17 00:00:00 2001 From: Ebuka Ezike Date: Wed, 3 Dec 2025 08:30:35 +0000 Subject: [PATCH 13/36] [lldb-dap] start all sent protocol message from number one. (#170378) This aligns with the DAP [specification](https://microsoft.github.io/debug-adapter-protocol//specification.html#Base_Protocol_ProtocolMessage) Force it to be an error in test cases. 
--- .../Python/lldbsuite/test/tools/lldb-dap/dap_server.py | 2 +- lldb/tools/lldb-dap/DAP.cpp | 7 +++++-- lldb/tools/lldb-dap/Protocol/ProtocolBase.h | 4 ++-- 3 files changed, 8 insertions(+), 5 deletions(-) diff --git a/lldb/packages/Python/lldbsuite/test/tools/lldb-dap/dap_server.py b/lldb/packages/Python/lldbsuite/test/tools/lldb-dap/dap_server.py index 4a7ba78b63993..7a9d5a82983d7 100644 --- a/lldb/packages/Python/lldbsuite/test/tools/lldb-dap/dap_server.py +++ b/lldb/packages/Python/lldbsuite/test/tools/lldb-dap/dap_server.py @@ -391,7 +391,7 @@ def _process_recv_packets(self) -> None: with self._recv_condition: for packet in self._recv_packets: if packet and ("seq" not in packet or packet["seq"] == 0): - warnings.warn( + raise ValueError( f"received a malformed packet, expected 'seq != 0' for {packet!r}" ) # Handle events that may modify any stateful properties of diff --git a/lldb/tools/lldb-dap/DAP.cpp b/lldb/tools/lldb-dap/DAP.cpp index 465d85a07bd34..6971bfea5c128 100644 --- a/lldb/tools/lldb-dap/DAP.cpp +++ b/lldb/tools/lldb-dap/DAP.cpp @@ -274,8 +274,11 @@ Id DAP::Send(const Message &message) { std::lock_guard guard(call_mutex); Message msg = std::visit( [this](auto &&msg) -> Message { - if (msg.seq == kCalculateSeq) - msg.seq = seq++; + if (msg.seq == kCalculateSeq) { + seq++; + msg.seq = seq; + } + assert(msg.seq > 0 && "message sequence must be greater than zero."); return msg; }, Message(message)); diff --git a/lldb/tools/lldb-dap/Protocol/ProtocolBase.h b/lldb/tools/lldb-dap/Protocol/ProtocolBase.h index 42c6c8890af24..09ce6802b17c0 100644 --- a/lldb/tools/lldb-dap/Protocol/ProtocolBase.h +++ b/lldb/tools/lldb-dap/Protocol/ProtocolBase.h @@ -31,11 +31,11 @@ namespace lldb_dap::protocol { // MARK: Base Protocol /// Message unique identifier type. -using Id = int64_t; +using Id = uint64_t; /// A unique identifier that indicates the `seq` field should be calculated by /// the current session. 
-static constexpr Id kCalculateSeq = INT64_MAX; +static constexpr Id kCalculateSeq = UINT64_MAX; /// A client or debug adapter initiated request. struct Request { From 6638d59c972512d45da474c214abc67ec3cfe333 Mon Sep 17 00:00:00 2001 From: Felipe de Azevedo Piovezan Date: Wed, 3 Dec 2025 08:31:34 +0000 Subject: [PATCH 14/36] [lldb][NFC] Rename forward_branch_offset to branch_offset in UnwindAssemblyInstEmulation (#169631) This will reduce the diff in subsequent patches Part of a sequence of PRs: [lldb][NFCI] Rewrite UnwindAssemblyInstEmulation in terms of a CFG visit #169630 [lldb][NFC] Rename forward_branch_offset to branch_offset in UnwindAssemblyInstEmulation #169631 [lldb] Add DisassemblerLLVMC::IsBarrier API #169632 [lldb] Handle backwards branches in UnwindAssemblyInstEmulation #169633 commit-id:5e758a22 --- .../UnwindAssemblyInstEmulation.cpp | 19 +++++++++---------- .../UnwindAssemblyInstEmulation.h | 4 ++-- 2 files changed, 11 insertions(+), 12 deletions(-) diff --git a/lldb/source/Plugins/UnwindAssembly/InstEmulation/UnwindAssemblyInstEmulation.cpp b/lldb/source/Plugins/UnwindAssembly/InstEmulation/UnwindAssemblyInstEmulation.cpp index c3d92f6a13d97..3913684e4ac62 100644 --- a/lldb/source/Plugins/UnwindAssembly/InstEmulation/UnwindAssemblyInstEmulation.cpp +++ b/lldb/source/Plugins/UnwindAssembly/InstEmulation/UnwindAssemblyInstEmulation.cpp @@ -166,7 +166,7 @@ bool UnwindAssemblyInstEmulation::GetNonCallSiteUnwindPlanFromAssembly( DumpInstToLog(log, inst, inst_list); m_curr_row_modified = false; - m_forward_branch_offset = 0; + m_branch_offset = 0; lldb::addr_t current_offset = inst.GetAddress().GetFileAddress() - base_addr; @@ -211,13 +211,13 @@ bool UnwindAssemblyInstEmulation::GetNonCallSiteUnwindPlanFromAssembly( // If the current instruction is a branch forward then save the current // CFI information for the offset where we are branching. 
Address branch_address = inst.GetAddress(); - branch_address.Slide(m_forward_branch_offset); - if (m_forward_branch_offset != 0 && + branch_address.Slide(m_branch_offset); + if (m_branch_offset != 0 && range.ContainsFileAddress(branch_address.GetFileAddress())) { if (auto [it, inserted] = saved_unwind_states.emplace( - current_offset + m_forward_branch_offset, m_state); + current_offset + m_branch_offset, m_state); inserted) { - it->second.row.SetOffset(current_offset + m_forward_branch_offset); + it->second.row.SetOffset(current_offset + m_branch_offset); if (std::size_t dest_instr_index = inst_list.GetIndexOfInstructionAtAddress(branch_address); dest_instr_index < inst_list.GetSize()) { @@ -531,20 +531,19 @@ bool UnwindAssemblyInstEmulation::WriteRegister( case EmulateInstruction::eContextRelativeBranchImmediate: { if (context.GetInfoType() == EmulateInstruction::eInfoTypeISAAndImmediate && context.info.ISAAndImmediate.unsigned_data32 > 0) { - m_forward_branch_offset = context.info.ISAAndImmediate.unsigned_data32; + m_branch_offset = context.info.ISAAndImmediate.unsigned_data32; } else if (context.GetInfoType() == EmulateInstruction::eInfoTypeISAAndImmediateSigned && context.info.ISAAndImmediateSigned.signed_data32 > 0) { - m_forward_branch_offset = - context.info.ISAAndImmediateSigned.signed_data32; + m_branch_offset = context.info.ISAAndImmediateSigned.signed_data32; } else if (context.GetInfoType() == EmulateInstruction::eInfoTypeImmediate && context.info.unsigned_immediate > 0) { - m_forward_branch_offset = context.info.unsigned_immediate; + m_branch_offset = context.info.unsigned_immediate; } else if (context.GetInfoType() == EmulateInstruction::eInfoTypeImmediateSigned && context.info.signed_immediate > 0) { - m_forward_branch_offset = context.info.signed_immediate; + m_branch_offset = context.info.signed_immediate; } } break; diff --git a/lldb/source/Plugins/UnwindAssembly/InstEmulation/UnwindAssemblyInstEmulation.h 
b/lldb/source/Plugins/UnwindAssembly/InstEmulation/UnwindAssemblyInstEmulation.h index 6c0492f5dfc66..1c80199235b4b 100644 --- a/lldb/source/Plugins/UnwindAssembly/InstEmulation/UnwindAssemblyInstEmulation.h +++ b/lldb/source/Plugins/UnwindAssembly/InstEmulation/UnwindAssemblyInstEmulation.h @@ -64,7 +64,7 @@ class UnwindAssemblyInstEmulation : public lldb_private::UnwindAssembly { lldb_private::EmulateInstruction *inst_emulator) : UnwindAssembly(arch), m_inst_emulator_up(inst_emulator), m_range_ptr(nullptr), m_unwind_plan_ptr(nullptr), - m_curr_row_modified(false), m_forward_branch_offset(0) { + m_curr_row_modified(false) { if (m_inst_emulator_up) { m_inst_emulator_up->SetBaton(this); m_inst_emulator_up->SetCallbacks(ReadMemory, WriteMemory, ReadRegister, @@ -152,7 +152,7 @@ class UnwindAssemblyInstEmulation : public lldb_private::UnwindAssembly { bool m_curr_row_modified; // The instruction is branching forward with the given offset. 0 value means // no branching. - uint32_t m_forward_branch_offset; + uint32_t m_branch_offset = 0; }; #endif // LLDB_SOURCE_PLUGINS_UNWINDASSEMBLY_INSTEMULATION_UNWINDASSEMBLYINSTEMULATION_H From 4b0a9759395f3e9cbefa9c194ca331f4d88003bf Mon Sep 17 00:00:00 2001 From: Hongyu Chen Date: Wed, 3 Dec 2025 16:53:25 +0800 Subject: [PATCH 15/36] [OpenCL][NVPTX] Don't set calling convention for OpenCL kernel (#170170) Fixes #154772 We previously set `ptx_kernel` for all kernels. But it's incorrect to add `ptx_kernel` to the stub version of kernel introduced in #115821. This patch copies the workaround of AMDGPU. 
--- clang/lib/CodeGen/Targets/AMDGPU.cpp | 5 +---- clang/lib/CodeGen/Targets/NVPTX.cpp | 3 --- clang/lib/CodeGen/Targets/SPIR.cpp | 21 --------------------- clang/lib/Sema/SemaType.cpp | 4 +++- clang/test/CodeGenOpenCL/ptx-calls.cl | 26 +++++++++++++++++++++++--- clang/test/CodeGenOpenCL/reflect.cl | 2 +- 6 files changed, 28 insertions(+), 33 deletions(-) diff --git a/clang/lib/CodeGen/Targets/AMDGPU.cpp b/clang/lib/CodeGen/Targets/AMDGPU.cpp index e4ad078dab197..0ab6c753b8bad 100644 --- a/clang/lib/CodeGen/Targets/AMDGPU.cpp +++ b/clang/lib/CodeGen/Targets/AMDGPU.cpp @@ -439,11 +439,8 @@ void AMDGPUTargetCodeGenInfo::setTargetAttributes( return; const FunctionDecl *FD = dyn_cast_or_null(D); - if (FD) { + if (FD) setFunctionDeclAttributes(FD, F, M); - if (FD->hasAttr() && !M.getLangOpts().OpenCL) - F->setCallingConv(getDeviceKernelCallingConv()); - } if (!getABIInfo().getCodeGenOpts().EmitIEEENaNCompliantInsts) F->addFnAttr("amdgpu-ieee", "false"); } diff --git a/clang/lib/CodeGen/Targets/NVPTX.cpp b/clang/lib/CodeGen/Targets/NVPTX.cpp index f6715861d91bc..ba2acd821c704 100644 --- a/clang/lib/CodeGen/Targets/NVPTX.cpp +++ b/clang/lib/CodeGen/Targets/NVPTX.cpp @@ -276,9 +276,6 @@ void NVPTXTargetCodeGenInfo::setTargetAttributes( M.handleCUDALaunchBoundsAttr(F, Attr); } } - // Attach kernel metadata directly if compiling for NVPTX. 
- if (FD->hasAttr()) - F->setCallingConv(getDeviceKernelCallingConv()); } void NVPTXTargetCodeGenInfo::addNVVMMetadata(llvm::GlobalValue *GV, diff --git a/clang/lib/CodeGen/Targets/SPIR.cpp b/clang/lib/CodeGen/Targets/SPIR.cpp index 1a8c85d8871ec..ccc35a22d9938 100644 --- a/clang/lib/CodeGen/Targets/SPIR.cpp +++ b/clang/lib/CodeGen/Targets/SPIR.cpp @@ -77,8 +77,6 @@ class CommonSPIRTargetCodeGenInfo : public TargetCodeGenInfo { llvm::Constant *getNullPointer(const CodeGen::CodeGenModule &CGM, llvm::PointerType *T, QualType QT) const override; - void setTargetAttributes(const Decl *D, llvm::GlobalValue *GV, - CodeGen::CodeGenModule &M) const override; }; class SPIRVTargetCodeGenInfo : public CommonSPIRTargetCodeGenInfo { public: @@ -292,22 +290,6 @@ CommonSPIRTargetCodeGenInfo::getNullPointer(const CodeGen::CodeGenModule &CGM, llvm::ConstantPointerNull::get(NPT), PT); } -void CommonSPIRTargetCodeGenInfo::setTargetAttributes( - const Decl *D, llvm::GlobalValue *GV, CodeGen::CodeGenModule &M) const { - if (M.getLangOpts().OpenCL || GV->isDeclaration()) - return; - - const FunctionDecl *FD = dyn_cast(D); - if (!FD) - return; - - llvm::Function *F = dyn_cast(GV); - assert(F && "Expected GlobalValue to be a Function"); - - if (FD->hasAttr()) - F->setCallingConv(getDeviceKernelCallingConv()); -} - LangAS SPIRVTargetCodeGenInfo::getGlobalVarAddressSpace(CodeGenModule &CGM, const VarDecl *D) const { @@ -342,9 +324,6 @@ void SPIRVTargetCodeGenInfo::setTargetAttributes( llvm::Function *F = dyn_cast(GV); assert(F && "Expected GlobalValue to be a Function"); - if (FD->hasAttr()) - F->setCallingConv(getDeviceKernelCallingConv()); - if (!M.getLangOpts().HIP || M.getTarget().getTriple().getVendor() != llvm::Triple::AMD) return; diff --git a/clang/lib/Sema/SemaType.cpp b/clang/lib/Sema/SemaType.cpp index 9f5aa153d1cbe..fd64d4456cbfa 100644 --- a/clang/lib/Sema/SemaType.cpp +++ b/clang/lib/Sema/SemaType.cpp @@ -3798,8 +3798,10 @@ static CallingConv getCCForDeclaratorChunk( } } } + 
for (const ParsedAttr &AL : llvm::concat( - D.getDeclSpec().getAttributes(), D.getAttributes())) { + D.getDeclSpec().getAttributes(), D.getAttributes(), + D.getDeclarationAttributes())) { if (AL.getKind() == ParsedAttr::AT_DeviceKernel) { CC = CC_DeviceKernel; break; diff --git a/clang/test/CodeGenOpenCL/ptx-calls.cl b/clang/test/CodeGenOpenCL/ptx-calls.cl index ae187173b1730..17c25ee78ef45 100644 --- a/clang/test/CodeGenOpenCL/ptx-calls.cl +++ b/clang/test/CodeGenOpenCL/ptx-calls.cl @@ -1,11 +1,31 @@ +// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --include-generated-funcs --version 6 // RUN: %clang_cc1 %s -triple nvptx-unknown-unknown -emit-llvm -O0 -o - | FileCheck %s void device_function() { } -// CHECK-LABEL: define{{.*}} void @device_function() __kernel void kernel_function() { device_function(); } -// CHECK-LABEL: define{{.*}} ptx_kernel void @kernel_function() -// CHECK: call void @device_function() +// CHECK-LABEL: define dso_local void @device_function( +// CHECK-SAME: ) #[[ATTR0:[0-9]+]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: ret void +// +// +// CHECK-LABEL: define dso_local ptx_kernel void @kernel_function( +// CHECK-SAME: ) #[[ATTR1:[0-9]+]] !kernel_arg_addr_space [[META3:![0-9]+]] !kernel_arg_access_qual [[META3]] !kernel_arg_type [[META3]] !kernel_arg_base_type [[META3]] !kernel_arg_type_qual [[META3]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: call void @__clang_ocl_kern_imp_kernel_function() #[[ATTR2:[0-9]+]] +// CHECK-NEXT: ret void +// +// +// CHECK-LABEL: define dso_local void @__clang_ocl_kern_imp_kernel_function( +// CHECK-SAME: ) #[[ATTR0]] !kernel_arg_addr_space [[META3]] !kernel_arg_access_qual [[META3]] !kernel_arg_type [[META3]] !kernel_arg_base_type [[META3]] !kernel_arg_type_qual [[META3]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: call void @device_function() #[[ATTR2]] +// CHECK-NEXT: ret void +// +//. +// CHECK: [[META3]] = !{} +//. 
diff --git a/clang/test/CodeGenOpenCL/reflect.cl b/clang/test/CodeGenOpenCL/reflect.cl index 4abb40aa3ed50..a69e338641167 100644 --- a/clang/test/CodeGenOpenCL/reflect.cl +++ b/clang/test/CodeGenOpenCL/reflect.cl @@ -26,7 +26,7 @@ __kernel void kernel_function(__global int *i) { // CHECK-NEXT: ret void // // -// CHECK-LABEL: define dso_local ptx_kernel void @__clang_ocl_kern_imp_kernel_function( +// CHECK-LABEL: define dso_local void @__clang_ocl_kern_imp_kernel_function( // CHECK-SAME: ptr addrspace(1) noundef align 4 [[I:%.*]]) #[[ATTR0]] !kernel_arg_addr_space [[META3]] !kernel_arg_access_qual [[META4]] !kernel_arg_type [[META5]] !kernel_arg_base_type [[META5]] !kernel_arg_type_qual [[META6]] { // CHECK-NEXT: entry: // CHECK-NEXT: [[I_ADDR:%.*]] = alloca ptr addrspace(1), align 4 From 8b7a07a5f7e7b2a96417665f807cbf79a3161a76 Mon Sep 17 00:00:00 2001 From: Ebuka Ezike Date: Wed, 3 Dec 2025 08:55:11 +0000 Subject: [PATCH 16/36] [lldb] Fix abi_tag parsing for operator<< and operator-named tags (#170224) The parser now correctly handles: - abi_tags attached to operator<<: `operator<<[abi:SOMETAG]` - abi_tags with "operator" as the tag name: `func[abi:operator]` --- .../Language/CPlusPlus/CPlusPlusNameParser.cpp | 11 ++++++----- .../Language/CPlusPlus/CPlusPlusLanguageTest.cpp | 6 ++++++ 2 files changed, 12 insertions(+), 5 deletions(-) diff --git a/lldb/source/Plugins/Language/CPlusPlus/CPlusPlusNameParser.cpp b/lldb/source/Plugins/Language/CPlusPlus/CPlusPlusNameParser.cpp index d8c095d6edeb7..4d283bb02e533 100644 --- a/lldb/source/Plugins/Language/CPlusPlus/CPlusPlusNameParser.cpp +++ b/lldb/source/Plugins/Language/CPlusPlus/CPlusPlusNameParser.cpp @@ -315,7 +315,7 @@ bool CPlusPlusNameParser::ConsumeAbiTag() { // Consume the actual tag string (and allow some special characters) while (ConsumeToken(tok::raw_identifier, tok::comma, tok::period, - tok::numeric_constant)) + tok::numeric_constant, tok::kw_operator)) ; if (!ConsumeToken(tok::r_square)) @@ -420,10 
+420,11 @@ bool CPlusPlusNameParser::ConsumeOperator() { // Make sure we have more tokens before attempting to look ahead one more. if (m_next_token_index + 1 < m_tokens.size()) { // Look ahead two tokens. - clang::Token n_token = m_tokens[m_next_token_index + 1]; - // If we find ( or < then this is indeed operator<< no need for fix. - if (n_token.getKind() != tok::l_paren && n_token.getKind() != tok::less) { - clang::Token tmp_tok; + const clang::Token n_token = m_tokens[m_next_token_index + 1]; + // If we find `(`, `<` or `[` then this is indeed operator<< no need for + // fix. + if (!n_token.isOneOf(tok::l_paren, tok::less, tok::l_square)) { + clang::Token tmp_tok{}; tmp_tok.startToken(); tmp_tok.setLength(1); tmp_tok.setLocation(token.getLocation().getLocWithOffset(1)); diff --git a/lldb/unittests/Language/CPlusPlus/CPlusPlusLanguageTest.cpp b/lldb/unittests/Language/CPlusPlus/CPlusPlusLanguageTest.cpp index c05418168e62e..41df35f67a790 100644 --- a/lldb/unittests/Language/CPlusPlus/CPlusPlusLanguageTest.cpp +++ b/lldb/unittests/Language/CPlusPlus/CPlusPlusLanguageTest.cpp @@ -69,6 +69,12 @@ TEST(CPlusPlusLanguage, MethodNameParsing) { "const", "std::__1::ranges::__begin::__fn::operator()[abi:v160000]"}, + {"bool Ball[abi:BALL]::operator<<[abi:operator](int)", "bool", + "Ball[abi:BALL]", "operator<<[abi:operator]", "(int)", "", + "Ball[abi:BALL]::operator<<[abi:operator]"}, + {"bool Ball[abi:BALL]::operator>>[abi:operator](int)", "bool", + "Ball[abi:BALL]", "operator>>[abi:operator]", "(int)", "", + "Ball[abi:BALL]::operator>>[abi:operator]"}, // Internal classes {"operator<<(Cls, Cls)::Subclass::function()", "", "operator<<(Cls, Cls)::Subclass", "function", "()", "", From 7cdb27a4b3757879446596d6f042f87b5119c638 Mon Sep 17 00:00:00 2001 From: Aaditya <115080342+easyonaadit@users.noreply.github.com> Date: Wed, 3 Dec 2025 14:36:25 +0530 Subject: [PATCH 17/36] [NFC][AMDGPU] Refactor wave reduce test files (#170440) Separate out float wave-reduce intrinsic tests 
from the overloaded call. Moved float add/sub/min/max ops from: `llvm.amdgcn.reduce.add/sub/min/max` to `llvm.amdgcn.reduce.fadd/fsub/fmin/fmax`. --- .../CodeGen/AMDGPU/llvm.amdgcn.reduce.add.ll | 1001 ---------------- .../CodeGen/AMDGPU/llvm.amdgcn.reduce.fadd.ll | 1021 +++++++++++++++++ .../CodeGen/AMDGPU/llvm.amdgcn.reduce.fmax.ll | 928 +++++++++++++++ .../CodeGen/AMDGPU/llvm.amdgcn.reduce.fmin.ll | 928 +++++++++++++++ .../CodeGen/AMDGPU/llvm.amdgcn.reduce.fsub.ll | 1021 +++++++++++++++++ .../CodeGen/AMDGPU/llvm.amdgcn.reduce.max.ll | 911 --------------- .../CodeGen/AMDGPU/llvm.amdgcn.reduce.min.ll | 911 --------------- .../CodeGen/AMDGPU/llvm.amdgcn.reduce.sub.ll | 1001 ---------------- 8 files changed, 3898 insertions(+), 3824 deletions(-) create mode 100644 llvm/test/CodeGen/AMDGPU/llvm.amdgcn.reduce.fadd.ll create mode 100644 llvm/test/CodeGen/AMDGPU/llvm.amdgcn.reduce.fmax.ll create mode 100644 llvm/test/CodeGen/AMDGPU/llvm.amdgcn.reduce.fmin.ll create mode 100644 llvm/test/CodeGen/AMDGPU/llvm.amdgcn.reduce.fsub.ll diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.reduce.add.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.reduce.add.ll index 83b0847cd2589..e58bf6280a1f2 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.reduce.add.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.reduce.add.ll @@ -2019,1007 +2019,6 @@ endif: store i64 %combine, ptr addrspace(1) %out ret void } - -define amdgpu_kernel void @uniform_value_float(ptr addrspace(1) %out, float %in) { -; GFX8DAGISEL-LABEL: uniform_value_float: -; GFX8DAGISEL: ; %bb.0: ; %entry -; GFX8DAGISEL-NEXT: s_mov_b64 s[0:1], exec -; GFX8DAGISEL-NEXT: s_load_dword s2, s[4:5], 0x2c -; GFX8DAGISEL-NEXT: s_bcnt1_i32_b64 s0, s[0:1] -; GFX8DAGISEL-NEXT: v_cvt_f32_i32_e32 v0, s0 -; GFX8DAGISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 -; GFX8DAGISEL-NEXT: s_waitcnt lgkmcnt(0) -; GFX8DAGISEL-NEXT: v_mul_f32_e32 v0, s2, v0 -; GFX8DAGISEL-NEXT: v_readfirstlane_b32 s2, v0 -; GFX8DAGISEL-NEXT: v_mov_b32_e32 v0, s0 -; 
GFX8DAGISEL-NEXT: v_mov_b32_e32 v1, s1 -; GFX8DAGISEL-NEXT: v_mov_b32_e32 v2, s2 -; GFX8DAGISEL-NEXT: flat_store_dword v[0:1], v2 -; GFX8DAGISEL-NEXT: s_endpgm -; -; GFX8GISEL-LABEL: uniform_value_float: -; GFX8GISEL: ; %bb.0: ; %entry -; GFX8GISEL-NEXT: s_mov_b64 s[0:1], exec -; GFX8GISEL-NEXT: s_load_dword s2, s[4:5], 0x2c -; GFX8GISEL-NEXT: s_bcnt1_i32_b64 s0, s[0:1] -; GFX8GISEL-NEXT: v_cvt_f32_i32_e32 v0, s0 -; GFX8GISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 -; GFX8GISEL-NEXT: s_waitcnt lgkmcnt(0) -; GFX8GISEL-NEXT: v_mul_f32_e32 v0, s2, v0 -; GFX8GISEL-NEXT: v_readfirstlane_b32 s2, v0 -; GFX8GISEL-NEXT: v_mov_b32_e32 v0, s0 -; GFX8GISEL-NEXT: v_mov_b32_e32 v2, s2 -; GFX8GISEL-NEXT: v_mov_b32_e32 v1, s1 -; GFX8GISEL-NEXT: flat_store_dword v[0:1], v2 -; GFX8GISEL-NEXT: s_endpgm -; -; GFX9DAGISEL-LABEL: uniform_value_float: -; GFX9DAGISEL: ; %bb.0: ; %entry -; GFX9DAGISEL-NEXT: s_mov_b64 s[0:1], exec -; GFX9DAGISEL-NEXT: s_load_dword s2, s[4:5], 0x2c -; GFX9DAGISEL-NEXT: s_bcnt1_i32_b64 s0, s[0:1] -; GFX9DAGISEL-NEXT: v_cvt_f32_i32_e32 v0, s0 -; GFX9DAGISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 -; GFX9DAGISEL-NEXT: v_mov_b32_e32 v1, 0 -; GFX9DAGISEL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9DAGISEL-NEXT: v_mul_f32_e32 v0, s2, v0 -; GFX9DAGISEL-NEXT: v_readfirstlane_b32 s2, v0 -; GFX9DAGISEL-NEXT: v_mov_b32_e32 v0, s2 -; GFX9DAGISEL-NEXT: global_store_dword v1, v0, s[0:1] -; GFX9DAGISEL-NEXT: s_endpgm -; -; GFX9GISEL-LABEL: uniform_value_float: -; GFX9GISEL: ; %bb.0: ; %entry -; GFX9GISEL-NEXT: s_mov_b64 s[0:1], exec -; GFX9GISEL-NEXT: s_load_dword s2, s[4:5], 0x2c -; GFX9GISEL-NEXT: s_bcnt1_i32_b64 s0, s[0:1] -; GFX9GISEL-NEXT: v_cvt_f32_i32_e32 v0, s0 -; GFX9GISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 -; GFX9GISEL-NEXT: v_mov_b32_e32 v1, 0 -; GFX9GISEL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9GISEL-NEXT: v_mul_f32_e32 v0, s2, v0 -; GFX9GISEL-NEXT: v_readfirstlane_b32 s2, v0 -; GFX9GISEL-NEXT: v_mov_b32_e32 v0, s2 -; GFX9GISEL-NEXT: global_store_dword v1, v0, 
s[0:1] -; GFX9GISEL-NEXT: s_endpgm -; -; GFX1064DAGISEL-LABEL: uniform_value_float: -; GFX1064DAGISEL: ; %bb.0: ; %entry -; GFX1064DAGISEL-NEXT: s_load_dword s2, s[4:5], 0x2c -; GFX1064DAGISEL-NEXT: s_mov_b64 s[0:1], exec -; GFX1064DAGISEL-NEXT: s_bcnt1_i32_b64 s3, s[0:1] -; GFX1064DAGISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 -; GFX1064DAGISEL-NEXT: v_cvt_f32_i32_e32 v0, s3 -; GFX1064DAGISEL-NEXT: s_waitcnt lgkmcnt(0) -; GFX1064DAGISEL-NEXT: v_mul_f32_e32 v0, s2, v0 -; GFX1064DAGISEL-NEXT: v_readfirstlane_b32 s2, v0 -; GFX1064DAGISEL-NEXT: v_mov_b32_e32 v0, 0 -; GFX1064DAGISEL-NEXT: v_mov_b32_e32 v1, s2 -; GFX1064DAGISEL-NEXT: global_store_dword v0, v1, s[0:1] -; GFX1064DAGISEL-NEXT: s_endpgm -; -; GFX1064GISEL-LABEL: uniform_value_float: -; GFX1064GISEL: ; %bb.0: ; %entry -; GFX1064GISEL-NEXT: s_load_dword s2, s[4:5], 0x2c -; GFX1064GISEL-NEXT: s_mov_b64 s[0:1], exec -; GFX1064GISEL-NEXT: v_mov_b32_e32 v1, 0 -; GFX1064GISEL-NEXT: s_bcnt1_i32_b64 s0, s[0:1] -; GFX1064GISEL-NEXT: v_cvt_f32_i32_e32 v0, s0 -; GFX1064GISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 -; GFX1064GISEL-NEXT: s_waitcnt lgkmcnt(0) -; GFX1064GISEL-NEXT: v_mul_f32_e32 v0, s2, v0 -; GFX1064GISEL-NEXT: v_readfirstlane_b32 s2, v0 -; GFX1064GISEL-NEXT: v_mov_b32_e32 v0, s2 -; GFX1064GISEL-NEXT: global_store_dword v1, v0, s[0:1] -; GFX1064GISEL-NEXT: s_endpgm -; -; GFX1032DAGISEL-LABEL: uniform_value_float: -; GFX1032DAGISEL: ; %bb.0: ; %entry -; GFX1032DAGISEL-NEXT: s_load_dword s2, s[4:5], 0x2c -; GFX1032DAGISEL-NEXT: s_mov_b32 s0, exec_lo -; GFX1032DAGISEL-NEXT: s_bcnt1_i32_b32 s3, s0 -; GFX1032DAGISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 -; GFX1032DAGISEL-NEXT: v_cvt_f32_i32_e32 v0, s3 -; GFX1032DAGISEL-NEXT: s_waitcnt lgkmcnt(0) -; GFX1032DAGISEL-NEXT: v_mul_f32_e32 v0, s2, v0 -; GFX1032DAGISEL-NEXT: v_readfirstlane_b32 s2, v0 -; GFX1032DAGISEL-NEXT: v_mov_b32_e32 v0, 0 -; GFX1032DAGISEL-NEXT: v_mov_b32_e32 v1, s2 -; GFX1032DAGISEL-NEXT: global_store_dword v0, v1, s[0:1] -; 
GFX1032DAGISEL-NEXT: s_endpgm -; -; GFX1032GISEL-LABEL: uniform_value_float: -; GFX1032GISEL: ; %bb.0: ; %entry -; GFX1032GISEL-NEXT: s_load_dword s2, s[4:5], 0x2c -; GFX1032GISEL-NEXT: s_mov_b32 s0, exec_lo -; GFX1032GISEL-NEXT: v_mov_b32_e32 v1, 0 -; GFX1032GISEL-NEXT: s_bcnt1_i32_b32 s0, s0 -; GFX1032GISEL-NEXT: v_cvt_f32_i32_e32 v0, s0 -; GFX1032GISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 -; GFX1032GISEL-NEXT: s_waitcnt lgkmcnt(0) -; GFX1032GISEL-NEXT: v_mul_f32_e32 v0, s2, v0 -; GFX1032GISEL-NEXT: v_readfirstlane_b32 s2, v0 -; GFX1032GISEL-NEXT: v_mov_b32_e32 v0, s2 -; GFX1032GISEL-NEXT: global_store_dword v1, v0, s[0:1] -; GFX1032GISEL-NEXT: s_endpgm -; -; GFX1164DAGISEL-LABEL: uniform_value_float: -; GFX1164DAGISEL: ; %bb.0: ; %entry -; GFX1164DAGISEL-NEXT: s_load_b32 s2, s[4:5], 0x2c -; GFX1164DAGISEL-NEXT: s_mov_b64 s[0:1], exec -; GFX1164DAGISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_3) | instid1(VALU_DEP_1) -; GFX1164DAGISEL-NEXT: s_bcnt1_i32_b64 s3, s[0:1] -; GFX1164DAGISEL-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 -; GFX1164DAGISEL-NEXT: v_cvt_f32_i32_e32 v0, s3 -; GFX1164DAGISEL-NEXT: s_waitcnt lgkmcnt(0) -; GFX1164DAGISEL-NEXT: v_mul_f32_e32 v0, s2, v0 -; GFX1164DAGISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) -; GFX1164DAGISEL-NEXT: v_readfirstlane_b32 s2, v0 -; GFX1164DAGISEL-NEXT: v_mov_b32_e32 v0, 0 -; GFX1164DAGISEL-NEXT: v_mov_b32_e32 v1, s2 -; GFX1164DAGISEL-NEXT: global_store_b32 v0, v1, s[0:1] -; GFX1164DAGISEL-NEXT: s_endpgm -; -; GFX1164GISEL-LABEL: uniform_value_float: -; GFX1164GISEL: ; %bb.0: ; %entry -; GFX1164GISEL-NEXT: s_load_b32 s2, s[4:5], 0x2c -; GFX1164GISEL-NEXT: s_mov_b64 s[0:1], exec -; GFX1164GISEL-NEXT: v_mov_b32_e32 v1, 0 -; GFX1164GISEL-NEXT: s_bcnt1_i32_b64 s0, s[0:1] -; GFX1164GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_2) | instid1(VALU_DEP_1) -; GFX1164GISEL-NEXT: v_cvt_f32_i32_e32 v0, s0 -; GFX1164GISEL-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 -; 
GFX1164GISEL-NEXT: s_waitcnt lgkmcnt(0) -; GFX1164GISEL-NEXT: v_mul_f32_e32 v0, s2, v0 -; GFX1164GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1164GISEL-NEXT: v_readfirstlane_b32 s2, v0 -; GFX1164GISEL-NEXT: v_mov_b32_e32 v0, s2 -; GFX1164GISEL-NEXT: global_store_b32 v1, v0, s[0:1] -; GFX1164GISEL-NEXT: s_endpgm -; -; GFX1132DAGISEL-LABEL: uniform_value_float: -; GFX1132DAGISEL: ; %bb.0: ; %entry -; GFX1132DAGISEL-NEXT: s_load_b32 s2, s[4:5], 0x2c -; GFX1132DAGISEL-NEXT: s_mov_b32 s0, exec_lo -; GFX1132DAGISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_3) | instid1(VALU_DEP_1) -; GFX1132DAGISEL-NEXT: s_bcnt1_i32_b32 s3, s0 -; GFX1132DAGISEL-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 -; GFX1132DAGISEL-NEXT: v_cvt_f32_i32_e32 v0, s3 -; GFX1132DAGISEL-NEXT: s_waitcnt lgkmcnt(0) -; GFX1132DAGISEL-NEXT: v_mul_f32_e32 v0, s2, v0 -; GFX1132DAGISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1132DAGISEL-NEXT: v_readfirstlane_b32 s2, v0 -; GFX1132DAGISEL-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2 -; GFX1132DAGISEL-NEXT: global_store_b32 v0, v1, s[0:1] -; GFX1132DAGISEL-NEXT: s_endpgm -; -; GFX1132GISEL-LABEL: uniform_value_float: -; GFX1132GISEL: ; %bb.0: ; %entry -; GFX1132GISEL-NEXT: s_load_b32 s2, s[4:5], 0x2c -; GFX1132GISEL-NEXT: s_mov_b32 s0, exec_lo -; GFX1132GISEL-NEXT: v_mov_b32_e32 v1, 0 -; GFX1132GISEL-NEXT: s_bcnt1_i32_b32 s0, s0 -; GFX1132GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_2) | instid1(VALU_DEP_1) -; GFX1132GISEL-NEXT: v_cvt_f32_i32_e32 v0, s0 -; GFX1132GISEL-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 -; GFX1132GISEL-NEXT: s_waitcnt lgkmcnt(0) -; GFX1132GISEL-NEXT: v_mul_f32_e32 v0, s2, v0 -; GFX1132GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1132GISEL-NEXT: v_readfirstlane_b32 s2, v0 -; GFX1132GISEL-NEXT: v_mov_b32_e32 v0, s2 -; GFX1132GISEL-NEXT: global_store_b32 v1, v0, s[0:1] -; 
GFX1132GISEL-NEXT: s_endpgm -; -; GFX12DAGISEL-LABEL: uniform_value_float: -; GFX12DAGISEL: ; %bb.0: ; %entry -; GFX12DAGISEL-NEXT: s_load_b96 s[0:2], s[4:5], 0x24 -; GFX12DAGISEL-NEXT: s_mov_b32 s3, exec_lo -; GFX12DAGISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) -; GFX12DAGISEL-NEXT: s_bcnt1_i32_b32 s3, s3 -; GFX12DAGISEL-NEXT: v_cvt_f32_i32_e32 v0, s3 -; GFX12DAGISEL-NEXT: s_wait_kmcnt 0x0 -; GFX12DAGISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12DAGISEL-NEXT: v_mul_f32_e32 v0, s2, v0 -; GFX12DAGISEL-NEXT: v_readfirstlane_b32 s2, v0 -; GFX12DAGISEL-NEXT: s_wait_alu depctr_va_sdst(0) -; GFX12DAGISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX12DAGISEL-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2 -; GFX12DAGISEL-NEXT: global_store_b32 v0, v1, s[0:1] -; GFX12DAGISEL-NEXT: s_endpgm -entry: - %result = call float @llvm.amdgcn.wave.reduce.fadd(float %in, i32 1) - store float %result, ptr addrspace(1) %out - ret void -} - -define void @divergent_value_float(ptr addrspace(1) %out, float %id.x) { -; GFX8DAGISEL-LABEL: divergent_value_float: -; GFX8DAGISEL: ; %bb.0: ; %entry -; GFX8DAGISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX8DAGISEL-NEXT: s_mov_b64 s[4:5], exec -; GFX8DAGISEL-NEXT: s_brev_b32 s6, 1 -; GFX8DAGISEL-NEXT: .LBB7_1: ; =>This Inner Loop Header: Depth=1 -; GFX8DAGISEL-NEXT: s_ff1_i32_b64 s7, s[4:5] -; GFX8DAGISEL-NEXT: v_readlane_b32 s8, v2, s7 -; GFX8DAGISEL-NEXT: v_mov_b32_e32 v3, s8 -; GFX8DAGISEL-NEXT: s_bitset0_b64 s[4:5], s7 -; GFX8DAGISEL-NEXT: v_add_f32_e32 v3, s6, v3 -; GFX8DAGISEL-NEXT: s_cmp_lg_u64 s[4:5], 0 -; GFX8DAGISEL-NEXT: v_readfirstlane_b32 s6, v3 -; GFX8DAGISEL-NEXT: s_cbranch_scc1 .LBB7_1 -; GFX8DAGISEL-NEXT: ; %bb.2: -; GFX8DAGISEL-NEXT: v_mov_b32_e32 v2, s6 -; GFX8DAGISEL-NEXT: flat_store_dword v[0:1], v2 -; GFX8DAGISEL-NEXT: s_waitcnt vmcnt(0) -; GFX8DAGISEL-NEXT: s_setpc_b64 s[30:31] -; -; GFX8GISEL-LABEL: divergent_value_float: -; 
GFX8GISEL: ; %bb.0: ; %entry -; GFX8GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX8GISEL-NEXT: s_mov_b64 s[4:5], exec -; GFX8GISEL-NEXT: s_brev_b32 s6, 1 -; GFX8GISEL-NEXT: .LBB7_1: ; =>This Inner Loop Header: Depth=1 -; GFX8GISEL-NEXT: s_ff1_i32_b64 s7, s[4:5] -; GFX8GISEL-NEXT: v_readlane_b32 s8, v2, s7 -; GFX8GISEL-NEXT: v_mov_b32_e32 v3, s8 -; GFX8GISEL-NEXT: s_bitset0_b64 s[4:5], s7 -; GFX8GISEL-NEXT: v_add_f32_e32 v3, s6, v3 -; GFX8GISEL-NEXT: s_cmp_lg_u64 s[4:5], 0 -; GFX8GISEL-NEXT: v_readfirstlane_b32 s6, v3 -; GFX8GISEL-NEXT: s_cbranch_scc1 .LBB7_1 -; GFX8GISEL-NEXT: ; %bb.2: -; GFX8GISEL-NEXT: v_mov_b32_e32 v2, s6 -; GFX8GISEL-NEXT: flat_store_dword v[0:1], v2 -; GFX8GISEL-NEXT: s_waitcnt vmcnt(0) -; GFX8GISEL-NEXT: s_setpc_b64 s[30:31] -; -; GFX9DAGISEL-LABEL: divergent_value_float: -; GFX9DAGISEL: ; %bb.0: ; %entry -; GFX9DAGISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9DAGISEL-NEXT: s_mov_b64 s[4:5], exec -; GFX9DAGISEL-NEXT: s_brev_b32 s6, 1 -; GFX9DAGISEL-NEXT: .LBB7_1: ; =>This Inner Loop Header: Depth=1 -; GFX9DAGISEL-NEXT: s_ff1_i32_b64 s7, s[4:5] -; GFX9DAGISEL-NEXT: v_readlane_b32 s8, v2, s7 -; GFX9DAGISEL-NEXT: v_mov_b32_e32 v3, s8 -; GFX9DAGISEL-NEXT: s_bitset0_b64 s[4:5], s7 -; GFX9DAGISEL-NEXT: v_add_f32_e32 v3, s6, v3 -; GFX9DAGISEL-NEXT: s_cmp_lg_u64 s[4:5], 0 -; GFX9DAGISEL-NEXT: v_readfirstlane_b32 s6, v3 -; GFX9DAGISEL-NEXT: s_cbranch_scc1 .LBB7_1 -; GFX9DAGISEL-NEXT: ; %bb.2: -; GFX9DAGISEL-NEXT: v_mov_b32_e32 v2, s6 -; GFX9DAGISEL-NEXT: global_store_dword v[0:1], v2, off -; GFX9DAGISEL-NEXT: s_waitcnt vmcnt(0) -; GFX9DAGISEL-NEXT: s_setpc_b64 s[30:31] -; -; GFX9GISEL-LABEL: divergent_value_float: -; GFX9GISEL: ; %bb.0: ; %entry -; GFX9GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9GISEL-NEXT: s_mov_b64 s[4:5], exec -; GFX9GISEL-NEXT: s_brev_b32 s6, 1 -; GFX9GISEL-NEXT: .LBB7_1: ; =>This Inner Loop Header: Depth=1 -; GFX9GISEL-NEXT: s_ff1_i32_b64 s7, s[4:5] -; GFX9GISEL-NEXT: v_readlane_b32 s8, v2, 
s7 -; GFX9GISEL-NEXT: v_mov_b32_e32 v3, s8 -; GFX9GISEL-NEXT: s_bitset0_b64 s[4:5], s7 -; GFX9GISEL-NEXT: v_add_f32_e32 v3, s6, v3 -; GFX9GISEL-NEXT: s_cmp_lg_u64 s[4:5], 0 -; GFX9GISEL-NEXT: v_readfirstlane_b32 s6, v3 -; GFX9GISEL-NEXT: s_cbranch_scc1 .LBB7_1 -; GFX9GISEL-NEXT: ; %bb.2: -; GFX9GISEL-NEXT: v_mov_b32_e32 v2, s6 -; GFX9GISEL-NEXT: global_store_dword v[0:1], v2, off -; GFX9GISEL-NEXT: s_waitcnt vmcnt(0) -; GFX9GISEL-NEXT: s_setpc_b64 s[30:31] -; -; GFX1064DAGISEL-LABEL: divergent_value_float: -; GFX1064DAGISEL: ; %bb.0: ; %entry -; GFX1064DAGISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX1064DAGISEL-NEXT: s_mov_b64 s[4:5], exec -; GFX1064DAGISEL-NEXT: s_brev_b32 s6, 1 -; GFX1064DAGISEL-NEXT: .LBB7_1: ; =>This Inner Loop Header: Depth=1 -; GFX1064DAGISEL-NEXT: s_ff1_i32_b64 s7, s[4:5] -; GFX1064DAGISEL-NEXT: v_readlane_b32 s8, v2, s7 -; GFX1064DAGISEL-NEXT: s_bitset0_b64 s[4:5], s7 -; GFX1064DAGISEL-NEXT: s_cmp_lg_u64 s[4:5], 0 -; GFX1064DAGISEL-NEXT: v_add_f32_e64 v3, s6, s8 -; GFX1064DAGISEL-NEXT: v_readfirstlane_b32 s6, v3 -; GFX1064DAGISEL-NEXT: s_cbranch_scc1 .LBB7_1 -; GFX1064DAGISEL-NEXT: ; %bb.2: -; GFX1064DAGISEL-NEXT: v_mov_b32_e32 v2, s6 -; GFX1064DAGISEL-NEXT: global_store_dword v[0:1], v2, off -; GFX1064DAGISEL-NEXT: s_setpc_b64 s[30:31] -; -; GFX1064GISEL-LABEL: divergent_value_float: -; GFX1064GISEL: ; %bb.0: ; %entry -; GFX1064GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX1064GISEL-NEXT: s_mov_b64 s[4:5], exec -; GFX1064GISEL-NEXT: s_brev_b32 s6, 1 -; GFX1064GISEL-NEXT: .LBB7_1: ; =>This Inner Loop Header: Depth=1 -; GFX1064GISEL-NEXT: s_ff1_i32_b64 s7, s[4:5] -; GFX1064GISEL-NEXT: v_readlane_b32 s8, v2, s7 -; GFX1064GISEL-NEXT: s_bitset0_b64 s[4:5], s7 -; GFX1064GISEL-NEXT: s_cmp_lg_u64 s[4:5], 0 -; GFX1064GISEL-NEXT: v_add_f32_e64 v3, s6, s8 -; GFX1064GISEL-NEXT: v_readfirstlane_b32 s6, v3 -; GFX1064GISEL-NEXT: s_cbranch_scc1 .LBB7_1 -; GFX1064GISEL-NEXT: ; %bb.2: -; GFX1064GISEL-NEXT: v_mov_b32_e32 v2, s6 -; 
GFX1064GISEL-NEXT: global_store_dword v[0:1], v2, off -; GFX1064GISEL-NEXT: s_setpc_b64 s[30:31] -; -; GFX1032DAGISEL-LABEL: divergent_value_float: -; GFX1032DAGISEL: ; %bb.0: ; %entry -; GFX1032DAGISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX1032DAGISEL-NEXT: s_mov_b32 s4, exec_lo -; GFX1032DAGISEL-NEXT: s_brev_b32 s5, 1 -; GFX1032DAGISEL-NEXT: .LBB7_1: ; =>This Inner Loop Header: Depth=1 -; GFX1032DAGISEL-NEXT: s_ff1_i32_b32 s6, s4 -; GFX1032DAGISEL-NEXT: v_readlane_b32 s7, v2, s6 -; GFX1032DAGISEL-NEXT: s_bitset0_b32 s4, s6 -; GFX1032DAGISEL-NEXT: s_cmp_lg_u32 s4, 0 -; GFX1032DAGISEL-NEXT: v_add_f32_e64 v3, s5, s7 -; GFX1032DAGISEL-NEXT: v_readfirstlane_b32 s5, v3 -; GFX1032DAGISEL-NEXT: s_cbranch_scc1 .LBB7_1 -; GFX1032DAGISEL-NEXT: ; %bb.2: -; GFX1032DAGISEL-NEXT: v_mov_b32_e32 v2, s5 -; GFX1032DAGISEL-NEXT: global_store_dword v[0:1], v2, off -; GFX1032DAGISEL-NEXT: s_setpc_b64 s[30:31] -; -; GFX1032GISEL-LABEL: divergent_value_float: -; GFX1032GISEL: ; %bb.0: ; %entry -; GFX1032GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX1032GISEL-NEXT: s_mov_b32 s4, exec_lo -; GFX1032GISEL-NEXT: s_brev_b32 s5, 1 -; GFX1032GISEL-NEXT: .LBB7_1: ; =>This Inner Loop Header: Depth=1 -; GFX1032GISEL-NEXT: s_ff1_i32_b32 s6, s4 -; GFX1032GISEL-NEXT: v_readlane_b32 s7, v2, s6 -; GFX1032GISEL-NEXT: s_bitset0_b32 s4, s6 -; GFX1032GISEL-NEXT: s_cmp_lg_u32 s4, 0 -; GFX1032GISEL-NEXT: v_add_f32_e64 v3, s5, s7 -; GFX1032GISEL-NEXT: v_readfirstlane_b32 s5, v3 -; GFX1032GISEL-NEXT: s_cbranch_scc1 .LBB7_1 -; GFX1032GISEL-NEXT: ; %bb.2: -; GFX1032GISEL-NEXT: v_mov_b32_e32 v2, s5 -; GFX1032GISEL-NEXT: global_store_dword v[0:1], v2, off -; GFX1032GISEL-NEXT: s_setpc_b64 s[30:31] -; -; GFX1164DAGISEL-LABEL: divergent_value_float: -; GFX1164DAGISEL: ; %bb.0: ; %entry -; GFX1164DAGISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX1164DAGISEL-NEXT: s_mov_b64 s[0:1], exec -; GFX1164DAGISEL-NEXT: s_brev_b32 s2, 1 -; GFX1164DAGISEL-NEXT: .LBB7_1: ; =>This Inner Loop 
Header: Depth=1 -; GFX1164DAGISEL-NEXT: s_ctz_i32_b64 s3, s[0:1] -; GFX1164DAGISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) -; GFX1164DAGISEL-NEXT: v_readlane_b32 s4, v2, s3 -; GFX1164DAGISEL-NEXT: s_bitset0_b64 s[0:1], s3 -; GFX1164DAGISEL-NEXT: s_cmp_lg_u64 s[0:1], 0 -; GFX1164DAGISEL-NEXT: v_add_f32_e64 v3, s2, s4 -; GFX1164DAGISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX1164DAGISEL-NEXT: v_readfirstlane_b32 s2, v3 -; GFX1164DAGISEL-NEXT: s_cbranch_scc1 .LBB7_1 -; GFX1164DAGISEL-NEXT: ; %bb.2: -; GFX1164DAGISEL-NEXT: v_mov_b32_e32 v2, s2 -; GFX1164DAGISEL-NEXT: global_store_b32 v[0:1], v2, off -; GFX1164DAGISEL-NEXT: s_setpc_b64 s[30:31] -; -; GFX1164GISEL-LABEL: divergent_value_float: -; GFX1164GISEL: ; %bb.0: ; %entry -; GFX1164GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX1164GISEL-NEXT: s_mov_b64 s[0:1], exec -; GFX1164GISEL-NEXT: s_brev_b32 s2, 1 -; GFX1164GISEL-NEXT: .LBB7_1: ; =>This Inner Loop Header: Depth=1 -; GFX1164GISEL-NEXT: s_ctz_i32_b64 s3, s[0:1] -; GFX1164GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) -; GFX1164GISEL-NEXT: v_readlane_b32 s4, v2, s3 -; GFX1164GISEL-NEXT: s_bitset0_b64 s[0:1], s3 -; GFX1164GISEL-NEXT: s_cmp_lg_u64 s[0:1], 0 -; GFX1164GISEL-NEXT: v_add_f32_e64 v3, s2, s4 -; GFX1164GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX1164GISEL-NEXT: v_readfirstlane_b32 s2, v3 -; GFX1164GISEL-NEXT: s_cbranch_scc1 .LBB7_1 -; GFX1164GISEL-NEXT: ; %bb.2: -; GFX1164GISEL-NEXT: v_mov_b32_e32 v2, s2 -; GFX1164GISEL-NEXT: global_store_b32 v[0:1], v2, off -; GFX1164GISEL-NEXT: s_setpc_b64 s[30:31] -; -; GFX1132DAGISEL-LABEL: divergent_value_float: -; GFX1132DAGISEL: ; %bb.0: ; %entry -; GFX1132DAGISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX1132DAGISEL-NEXT: s_mov_b32 s0, exec_lo -; GFX1132DAGISEL-NEXT: s_brev_b32 s1, 1 -; GFX1132DAGISEL-NEXT: .LBB7_1: ; =>This Inner Loop Header: Depth=1 -; GFX1132DAGISEL-NEXT: s_ctz_i32_b32 s2, s0 
-; GFX1132DAGISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) -; GFX1132DAGISEL-NEXT: v_readlane_b32 s3, v2, s2 -; GFX1132DAGISEL-NEXT: s_bitset0_b32 s0, s2 -; GFX1132DAGISEL-NEXT: s_cmp_lg_u32 s0, 0 -; GFX1132DAGISEL-NEXT: v_add_f32_e64 v3, s1, s3 -; GFX1132DAGISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX1132DAGISEL-NEXT: v_readfirstlane_b32 s1, v3 -; GFX1132DAGISEL-NEXT: s_cbranch_scc1 .LBB7_1 -; GFX1132DAGISEL-NEXT: ; %bb.2: -; GFX1132DAGISEL-NEXT: v_mov_b32_e32 v2, s1 -; GFX1132DAGISEL-NEXT: global_store_b32 v[0:1], v2, off -; GFX1132DAGISEL-NEXT: s_setpc_b64 s[30:31] -; -; GFX1132GISEL-LABEL: divergent_value_float: -; GFX1132GISEL: ; %bb.0: ; %entry -; GFX1132GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX1132GISEL-NEXT: s_mov_b32 s0, exec_lo -; GFX1132GISEL-NEXT: s_brev_b32 s1, 1 -; GFX1132GISEL-NEXT: .LBB7_1: ; =>This Inner Loop Header: Depth=1 -; GFX1132GISEL-NEXT: s_ctz_i32_b32 s2, s0 -; GFX1132GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) -; GFX1132GISEL-NEXT: v_readlane_b32 s3, v2, s2 -; GFX1132GISEL-NEXT: s_bitset0_b32 s0, s2 -; GFX1132GISEL-NEXT: s_cmp_lg_u32 s0, 0 -; GFX1132GISEL-NEXT: v_add_f32_e64 v3, s1, s3 -; GFX1132GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX1132GISEL-NEXT: v_readfirstlane_b32 s1, v3 -; GFX1132GISEL-NEXT: s_cbranch_scc1 .LBB7_1 -; GFX1132GISEL-NEXT: ; %bb.2: -; GFX1132GISEL-NEXT: v_mov_b32_e32 v2, s1 -; GFX1132GISEL-NEXT: global_store_b32 v[0:1], v2, off -; GFX1132GISEL-NEXT: s_setpc_b64 s[30:31] -; -; GFX12DAGISEL-LABEL: divergent_value_float: -; GFX12DAGISEL: ; %bb.0: ; %entry -; GFX12DAGISEL-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX12DAGISEL-NEXT: s_wait_expcnt 0x0 -; GFX12DAGISEL-NEXT: s_wait_samplecnt 0x0 -; GFX12DAGISEL-NEXT: s_wait_bvhcnt 0x0 -; GFX12DAGISEL-NEXT: s_wait_kmcnt 0x0 -; GFX12DAGISEL-NEXT: s_mov_b32 s0, exec_lo -; GFX12DAGISEL-NEXT: s_brev_b32 s1, 1 -; GFX12DAGISEL-NEXT: .LBB7_1: ; =>This Inner Loop Header: 
Depth=1 -; GFX12DAGISEL-NEXT: s_wait_alu depctr_sa_sdst(0) -; GFX12DAGISEL-NEXT: s_ctz_i32_b32 s2, s0 -; GFX12DAGISEL-NEXT: s_wait_alu depctr_sa_sdst(0) -; GFX12DAGISEL-NEXT: v_readlane_b32 s3, v2, s2 -; GFX12DAGISEL-NEXT: s_bitset0_b32 s0, s2 -; GFX12DAGISEL-NEXT: s_wait_alu depctr_sa_sdst(0) -; GFX12DAGISEL-NEXT: s_cmp_lg_u32 s0, 0 -; GFX12DAGISEL-NEXT: v_add_f32_e64 v3, s1, s3 -; GFX12DAGISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX12DAGISEL-NEXT: v_readfirstlane_b32 s1, v3 -; GFX12DAGISEL-NEXT: s_cbranch_scc1 .LBB7_1 -; GFX12DAGISEL-NEXT: ; %bb.2: -; GFX12DAGISEL-NEXT: s_wait_alu depctr_va_sdst(0) -; GFX12DAGISEL-NEXT: v_mov_b32_e32 v2, s1 -; GFX12DAGISEL-NEXT: global_store_b32 v[0:1], v2, off -; GFX12DAGISEL-NEXT: s_setpc_b64 s[30:31] -entry: - %result = call float @llvm.amdgcn.wave.reduce.fadd(float %id.x, i32 1) - store float %result, ptr addrspace(1) %out - ret void -} - -define amdgpu_kernel void @divergent_cfg_float(ptr addrspace(1) %out, float %in, float %in2) { -; GFX8DAGISEL-LABEL: divergent_cfg_float: -; GFX8DAGISEL: ; %bb.0: ; %entry -; GFX8DAGISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x2c -; GFX8DAGISEL-NEXT: v_cmp_lt_u32_e32 vcc, 15, v0 -; GFX8DAGISEL-NEXT: ; implicit-def: $sgpr6 -; GFX8DAGISEL-NEXT: s_and_saveexec_b64 s[2:3], vcc -; GFX8DAGISEL-NEXT: s_xor_b64 s[2:3], exec, s[2:3] -; GFX8DAGISEL-NEXT: s_cbranch_execz .LBB8_2 -; GFX8DAGISEL-NEXT: ; %bb.1: ; %else -; GFX8DAGISEL-NEXT: s_mov_b64 s[6:7], exec -; GFX8DAGISEL-NEXT: s_bcnt1_i32_b64 s6, s[6:7] -; GFX8DAGISEL-NEXT: v_cvt_f32_i32_e32 v0, s6 -; GFX8DAGISEL-NEXT: s_waitcnt lgkmcnt(0) -; GFX8DAGISEL-NEXT: v_mul_f32_e32 v0, s0, v0 -; GFX8DAGISEL-NEXT: v_readfirstlane_b32 s6, v0 -; GFX8DAGISEL-NEXT: .LBB8_2: ; %Flow -; GFX8DAGISEL-NEXT: s_or_saveexec_b64 s[2:3], s[2:3] -; GFX8DAGISEL-NEXT: v_mov_b32_e32 v0, s6 -; GFX8DAGISEL-NEXT: s_xor_b64 exec, exec, s[2:3] -; GFX8DAGISEL-NEXT: s_cbranch_execz .LBB8_4 -; GFX8DAGISEL-NEXT: ; %bb.3: ; %if -; GFX8DAGISEL-NEXT: s_mov_b64 s[6:7], exec -; 
GFX8DAGISEL-NEXT: s_waitcnt lgkmcnt(0) -; GFX8DAGISEL-NEXT: s_bcnt1_i32_b64 s0, s[6:7] -; GFX8DAGISEL-NEXT: v_cvt_f32_i32_e32 v0, s0 -; GFX8DAGISEL-NEXT: v_mul_f32_e32 v0, s1, v0 -; GFX8DAGISEL-NEXT: v_readfirstlane_b32 s0, v0 -; GFX8DAGISEL-NEXT: v_mov_b32_e32 v0, s0 -; GFX8DAGISEL-NEXT: .LBB8_4: ; %endif -; GFX8DAGISEL-NEXT: s_or_b64 exec, exec, s[2:3] -; GFX8DAGISEL-NEXT: s_waitcnt lgkmcnt(0) -; GFX8DAGISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 -; GFX8DAGISEL-NEXT: s_waitcnt lgkmcnt(0) -; GFX8DAGISEL-NEXT: v_mov_b32_e32 v2, s1 -; GFX8DAGISEL-NEXT: v_mov_b32_e32 v1, s0 -; GFX8DAGISEL-NEXT: flat_store_dword v[1:2], v0 -; GFX8DAGISEL-NEXT: s_endpgm -; -; GFX8GISEL-LABEL: divergent_cfg_float: -; GFX8GISEL: ; %bb.0: ; %entry -; GFX8GISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x2c -; GFX8GISEL-NEXT: v_cmp_le_u32_e32 vcc, 16, v0 -; GFX8GISEL-NEXT: ; implicit-def: $sgpr6 -; GFX8GISEL-NEXT: s_and_saveexec_b64 s[2:3], vcc -; GFX8GISEL-NEXT: s_xor_b64 s[2:3], exec, s[2:3] -; GFX8GISEL-NEXT: s_cbranch_execz .LBB8_2 -; GFX8GISEL-NEXT: ; %bb.1: ; %else -; GFX8GISEL-NEXT: s_mov_b64 s[6:7], exec -; GFX8GISEL-NEXT: s_bcnt1_i32_b64 s6, s[6:7] -; GFX8GISEL-NEXT: v_cvt_f32_i32_e32 v0, s6 -; GFX8GISEL-NEXT: s_waitcnt lgkmcnt(0) -; GFX8GISEL-NEXT: v_mul_f32_e32 v0, s0, v0 -; GFX8GISEL-NEXT: v_readfirstlane_b32 s6, v0 -; GFX8GISEL-NEXT: .LBB8_2: ; %Flow -; GFX8GISEL-NEXT: s_andn2_saveexec_b64 s[2:3], s[2:3] -; GFX8GISEL-NEXT: s_cbranch_execz .LBB8_4 -; GFX8GISEL-NEXT: ; %bb.3: ; %if -; GFX8GISEL-NEXT: s_mov_b64 s[6:7], exec -; GFX8GISEL-NEXT: s_waitcnt lgkmcnt(0) -; GFX8GISEL-NEXT: s_bcnt1_i32_b64 s0, s[6:7] -; GFX8GISEL-NEXT: v_cvt_f32_i32_e32 v0, s0 -; GFX8GISEL-NEXT: v_mul_f32_e32 v0, s1, v0 -; GFX8GISEL-NEXT: v_readfirstlane_b32 s6, v0 -; GFX8GISEL-NEXT: .LBB8_4: ; %endif -; GFX8GISEL-NEXT: s_or_b64 exec, exec, s[2:3] -; GFX8GISEL-NEXT: s_waitcnt lgkmcnt(0) -; GFX8GISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 -; GFX8GISEL-NEXT: v_mov_b32_e32 v2, s6 -; GFX8GISEL-NEXT: 
s_waitcnt lgkmcnt(0) -; GFX8GISEL-NEXT: v_mov_b32_e32 v0, s0 -; GFX8GISEL-NEXT: v_mov_b32_e32 v1, s1 -; GFX8GISEL-NEXT: flat_store_dword v[0:1], v2 -; GFX8GISEL-NEXT: s_endpgm -; -; GFX9DAGISEL-LABEL: divergent_cfg_float: -; GFX9DAGISEL: ; %bb.0: ; %entry -; GFX9DAGISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x2c -; GFX9DAGISEL-NEXT: v_cmp_lt_u32_e32 vcc, 15, v0 -; GFX9DAGISEL-NEXT: ; implicit-def: $sgpr6 -; GFX9DAGISEL-NEXT: s_and_saveexec_b64 s[2:3], vcc -; GFX9DAGISEL-NEXT: s_xor_b64 s[2:3], exec, s[2:3] -; GFX9DAGISEL-NEXT: s_cbranch_execz .LBB8_2 -; GFX9DAGISEL-NEXT: ; %bb.1: ; %else -; GFX9DAGISEL-NEXT: s_mov_b64 s[6:7], exec -; GFX9DAGISEL-NEXT: s_bcnt1_i32_b64 s6, s[6:7] -; GFX9DAGISEL-NEXT: v_cvt_f32_i32_e32 v0, s6 -; GFX9DAGISEL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9DAGISEL-NEXT: v_mul_f32_e32 v0, s0, v0 -; GFX9DAGISEL-NEXT: v_readfirstlane_b32 s6, v0 -; GFX9DAGISEL-NEXT: .LBB8_2: ; %Flow -; GFX9DAGISEL-NEXT: s_or_saveexec_b64 s[2:3], s[2:3] -; GFX9DAGISEL-NEXT: v_mov_b32_e32 v0, s6 -; GFX9DAGISEL-NEXT: s_xor_b64 exec, exec, s[2:3] -; GFX9DAGISEL-NEXT: s_cbranch_execz .LBB8_4 -; GFX9DAGISEL-NEXT: ; %bb.3: ; %if -; GFX9DAGISEL-NEXT: s_mov_b64 s[6:7], exec -; GFX9DAGISEL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9DAGISEL-NEXT: s_bcnt1_i32_b64 s0, s[6:7] -; GFX9DAGISEL-NEXT: v_cvt_f32_i32_e32 v0, s0 -; GFX9DAGISEL-NEXT: v_mul_f32_e32 v0, s1, v0 -; GFX9DAGISEL-NEXT: v_readfirstlane_b32 s0, v0 -; GFX9DAGISEL-NEXT: v_mov_b32_e32 v0, s0 -; GFX9DAGISEL-NEXT: .LBB8_4: ; %endif -; GFX9DAGISEL-NEXT: s_or_b64 exec, exec, s[2:3] -; GFX9DAGISEL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9DAGISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 -; GFX9DAGISEL-NEXT: v_mov_b32_e32 v1, 0 -; GFX9DAGISEL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9DAGISEL-NEXT: global_store_dword v1, v0, s[0:1] -; GFX9DAGISEL-NEXT: s_endpgm -; -; GFX9GISEL-LABEL: divergent_cfg_float: -; GFX9GISEL: ; %bb.0: ; %entry -; GFX9GISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x2c -; GFX9GISEL-NEXT: v_cmp_le_u32_e32 vcc, 16, v0 -; 
GFX9GISEL-NEXT: ; implicit-def: $sgpr6 -; GFX9GISEL-NEXT: s_and_saveexec_b64 s[2:3], vcc -; GFX9GISEL-NEXT: s_xor_b64 s[2:3], exec, s[2:3] -; GFX9GISEL-NEXT: s_cbranch_execz .LBB8_2 -; GFX9GISEL-NEXT: ; %bb.1: ; %else -; GFX9GISEL-NEXT: s_mov_b64 s[6:7], exec -; GFX9GISEL-NEXT: s_bcnt1_i32_b64 s6, s[6:7] -; GFX9GISEL-NEXT: v_cvt_f32_i32_e32 v0, s6 -; GFX9GISEL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9GISEL-NEXT: v_mul_f32_e32 v0, s0, v0 -; GFX9GISEL-NEXT: v_readfirstlane_b32 s6, v0 -; GFX9GISEL-NEXT: .LBB8_2: ; %Flow -; GFX9GISEL-NEXT: s_andn2_saveexec_b64 s[2:3], s[2:3] -; GFX9GISEL-NEXT: s_cbranch_execz .LBB8_4 -; GFX9GISEL-NEXT: ; %bb.3: ; %if -; GFX9GISEL-NEXT: s_mov_b64 s[6:7], exec -; GFX9GISEL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9GISEL-NEXT: s_bcnt1_i32_b64 s0, s[6:7] -; GFX9GISEL-NEXT: v_cvt_f32_i32_e32 v0, s0 -; GFX9GISEL-NEXT: v_mul_f32_e32 v0, s1, v0 -; GFX9GISEL-NEXT: v_readfirstlane_b32 s6, v0 -; GFX9GISEL-NEXT: .LBB8_4: ; %endif -; GFX9GISEL-NEXT: s_or_b64 exec, exec, s[2:3] -; GFX9GISEL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9GISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 -; GFX9GISEL-NEXT: v_mov_b32_e32 v0, s6 -; GFX9GISEL-NEXT: v_mov_b32_e32 v1, 0 -; GFX9GISEL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9GISEL-NEXT: global_store_dword v1, v0, s[0:1] -; GFX9GISEL-NEXT: s_endpgm -; -; GFX1064DAGISEL-LABEL: divergent_cfg_float: -; GFX1064DAGISEL: ; %bb.0: ; %entry -; GFX1064DAGISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x2c -; GFX1064DAGISEL-NEXT: v_cmp_lt_u32_e32 vcc, 15, v0 -; GFX1064DAGISEL-NEXT: ; implicit-def: $sgpr6 -; GFX1064DAGISEL-NEXT: s_and_saveexec_b64 s[2:3], vcc -; GFX1064DAGISEL-NEXT: s_xor_b64 s[2:3], exec, s[2:3] -; GFX1064DAGISEL-NEXT: s_cbranch_execz .LBB8_2 -; GFX1064DAGISEL-NEXT: ; %bb.1: ; %else -; GFX1064DAGISEL-NEXT: s_mov_b64 s[6:7], exec -; GFX1064DAGISEL-NEXT: s_bcnt1_i32_b64 s6, s[6:7] -; GFX1064DAGISEL-NEXT: v_cvt_f32_i32_e32 v0, s6 -; GFX1064DAGISEL-NEXT: s_waitcnt lgkmcnt(0) -; GFX1064DAGISEL-NEXT: v_mul_f32_e32 v0, s0, v0 -; GFX1064DAGISEL-NEXT: 
v_readfirstlane_b32 s6, v0 -; GFX1064DAGISEL-NEXT: .LBB8_2: ; %Flow -; GFX1064DAGISEL-NEXT: s_or_saveexec_b64 s[2:3], s[2:3] -; GFX1064DAGISEL-NEXT: v_mov_b32_e32 v0, s6 -; GFX1064DAGISEL-NEXT: s_xor_b64 exec, exec, s[2:3] -; GFX1064DAGISEL-NEXT: s_cbranch_execz .LBB8_4 -; GFX1064DAGISEL-NEXT: ; %bb.3: ; %if -; GFX1064DAGISEL-NEXT: s_mov_b64 s[6:7], exec -; GFX1064DAGISEL-NEXT: s_waitcnt lgkmcnt(0) -; GFX1064DAGISEL-NEXT: s_bcnt1_i32_b64 s0, s[6:7] -; GFX1064DAGISEL-NEXT: v_cvt_f32_i32_e32 v0, s0 -; GFX1064DAGISEL-NEXT: v_mul_f32_e32 v0, s1, v0 -; GFX1064DAGISEL-NEXT: v_readfirstlane_b32 s0, v0 -; GFX1064DAGISEL-NEXT: v_mov_b32_e32 v0, s0 -; GFX1064DAGISEL-NEXT: .LBB8_4: ; %endif -; GFX1064DAGISEL-NEXT: s_or_b64 exec, exec, s[2:3] -; GFX1064DAGISEL-NEXT: s_waitcnt lgkmcnt(0) -; GFX1064DAGISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 -; GFX1064DAGISEL-NEXT: v_mov_b32_e32 v1, 0 -; GFX1064DAGISEL-NEXT: s_waitcnt lgkmcnt(0) -; GFX1064DAGISEL-NEXT: global_store_dword v1, v0, s[0:1] -; GFX1064DAGISEL-NEXT: s_endpgm -; -; GFX1064GISEL-LABEL: divergent_cfg_float: -; GFX1064GISEL: ; %bb.0: ; %entry -; GFX1064GISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x2c -; GFX1064GISEL-NEXT: v_cmp_le_u32_e32 vcc, 16, v0 -; GFX1064GISEL-NEXT: ; implicit-def: $sgpr6 -; GFX1064GISEL-NEXT: s_and_saveexec_b64 s[2:3], vcc -; GFX1064GISEL-NEXT: s_xor_b64 s[2:3], exec, s[2:3] -; GFX1064GISEL-NEXT: s_cbranch_execz .LBB8_2 -; GFX1064GISEL-NEXT: ; %bb.1: ; %else -; GFX1064GISEL-NEXT: s_mov_b64 s[6:7], exec -; GFX1064GISEL-NEXT: s_bcnt1_i32_b64 s6, s[6:7] -; GFX1064GISEL-NEXT: v_cvt_f32_i32_e32 v0, s6 -; GFX1064GISEL-NEXT: s_waitcnt lgkmcnt(0) -; GFX1064GISEL-NEXT: v_mul_f32_e32 v0, s0, v0 -; GFX1064GISEL-NEXT: v_readfirstlane_b32 s6, v0 -; GFX1064GISEL-NEXT: .LBB8_2: ; %Flow -; GFX1064GISEL-NEXT: s_andn2_saveexec_b64 s[2:3], s[2:3] -; GFX1064GISEL-NEXT: s_cbranch_execz .LBB8_4 -; GFX1064GISEL-NEXT: ; %bb.3: ; %if -; GFX1064GISEL-NEXT: s_mov_b64 s[6:7], exec -; GFX1064GISEL-NEXT: s_waitcnt 
lgkmcnt(0) -; GFX1064GISEL-NEXT: s_bcnt1_i32_b64 s0, s[6:7] -; GFX1064GISEL-NEXT: v_cvt_f32_i32_e32 v0, s0 -; GFX1064GISEL-NEXT: v_mul_f32_e32 v0, s1, v0 -; GFX1064GISEL-NEXT: v_readfirstlane_b32 s6, v0 -; GFX1064GISEL-NEXT: .LBB8_4: ; %endif -; GFX1064GISEL-NEXT: s_or_b64 exec, exec, s[2:3] -; GFX1064GISEL-NEXT: s_waitcnt lgkmcnt(0) -; GFX1064GISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 -; GFX1064GISEL-NEXT: v_mov_b32_e32 v0, s6 -; GFX1064GISEL-NEXT: v_mov_b32_e32 v1, 0 -; GFX1064GISEL-NEXT: s_waitcnt lgkmcnt(0) -; GFX1064GISEL-NEXT: global_store_dword v1, v0, s[0:1] -; GFX1064GISEL-NEXT: s_endpgm -; -; GFX1032DAGISEL-LABEL: divergent_cfg_float: -; GFX1032DAGISEL: ; %bb.0: ; %entry -; GFX1032DAGISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x2c -; GFX1032DAGISEL-NEXT: v_cmp_lt_u32_e32 vcc_lo, 15, v0 -; GFX1032DAGISEL-NEXT: ; implicit-def: $sgpr3 -; GFX1032DAGISEL-NEXT: s_and_saveexec_b32 s2, vcc_lo -; GFX1032DAGISEL-NEXT: s_xor_b32 s2, exec_lo, s2 -; GFX1032DAGISEL-NEXT: s_cbranch_execz .LBB8_2 -; GFX1032DAGISEL-NEXT: ; %bb.1: ; %else -; GFX1032DAGISEL-NEXT: s_mov_b32 s3, exec_lo -; GFX1032DAGISEL-NEXT: s_bcnt1_i32_b32 s3, s3 -; GFX1032DAGISEL-NEXT: v_cvt_f32_i32_e32 v0, s3 -; GFX1032DAGISEL-NEXT: s_waitcnt lgkmcnt(0) -; GFX1032DAGISEL-NEXT: v_mul_f32_e32 v0, s0, v0 -; GFX1032DAGISEL-NEXT: v_readfirstlane_b32 s3, v0 -; GFX1032DAGISEL-NEXT: .LBB8_2: ; %Flow -; GFX1032DAGISEL-NEXT: s_waitcnt lgkmcnt(0) -; GFX1032DAGISEL-NEXT: s_or_saveexec_b32 s0, s2 -; GFX1032DAGISEL-NEXT: v_mov_b32_e32 v0, s3 -; GFX1032DAGISEL-NEXT: s_xor_b32 exec_lo, exec_lo, s0 -; GFX1032DAGISEL-NEXT: s_cbranch_execz .LBB8_4 -; GFX1032DAGISEL-NEXT: ; %bb.3: ; %if -; GFX1032DAGISEL-NEXT: s_mov_b32 s2, exec_lo -; GFX1032DAGISEL-NEXT: s_bcnt1_i32_b32 s2, s2 -; GFX1032DAGISEL-NEXT: v_cvt_f32_i32_e32 v0, s2 -; GFX1032DAGISEL-NEXT: v_mul_f32_e32 v0, s1, v0 -; GFX1032DAGISEL-NEXT: v_readfirstlane_b32 s1, v0 -; GFX1032DAGISEL-NEXT: v_mov_b32_e32 v0, s1 -; GFX1032DAGISEL-NEXT: .LBB8_4: ; %endif -; 
GFX1032DAGISEL-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; GFX1032DAGISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 -; GFX1032DAGISEL-NEXT: v_mov_b32_e32 v1, 0 -; GFX1032DAGISEL-NEXT: s_waitcnt lgkmcnt(0) -; GFX1032DAGISEL-NEXT: global_store_dword v1, v0, s[0:1] -; GFX1032DAGISEL-NEXT: s_endpgm -; -; GFX1032GISEL-LABEL: divergent_cfg_float: -; GFX1032GISEL: ; %bb.0: ; %entry -; GFX1032GISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x2c -; GFX1032GISEL-NEXT: v_cmp_le_u32_e32 vcc_lo, 16, v0 -; GFX1032GISEL-NEXT: ; implicit-def: $sgpr2 -; GFX1032GISEL-NEXT: s_and_saveexec_b32 s3, vcc_lo -; GFX1032GISEL-NEXT: s_xor_b32 s3, exec_lo, s3 -; GFX1032GISEL-NEXT: s_cbranch_execz .LBB8_2 -; GFX1032GISEL-NEXT: ; %bb.1: ; %else -; GFX1032GISEL-NEXT: s_mov_b32 s2, exec_lo -; GFX1032GISEL-NEXT: s_bcnt1_i32_b32 s2, s2 -; GFX1032GISEL-NEXT: v_cvt_f32_i32_e32 v0, s2 -; GFX1032GISEL-NEXT: s_waitcnt lgkmcnt(0) -; GFX1032GISEL-NEXT: v_mul_f32_e32 v0, s0, v0 -; GFX1032GISEL-NEXT: v_readfirstlane_b32 s2, v0 -; GFX1032GISEL-NEXT: .LBB8_2: ; %Flow -; GFX1032GISEL-NEXT: s_waitcnt lgkmcnt(0) -; GFX1032GISEL-NEXT: s_andn2_saveexec_b32 s0, s3 -; GFX1032GISEL-NEXT: s_cbranch_execz .LBB8_4 -; GFX1032GISEL-NEXT: ; %bb.3: ; %if -; GFX1032GISEL-NEXT: s_mov_b32 s2, exec_lo -; GFX1032GISEL-NEXT: s_bcnt1_i32_b32 s2, s2 -; GFX1032GISEL-NEXT: v_cvt_f32_i32_e32 v0, s2 -; GFX1032GISEL-NEXT: v_mul_f32_e32 v0, s1, v0 -; GFX1032GISEL-NEXT: v_readfirstlane_b32 s2, v0 -; GFX1032GISEL-NEXT: .LBB8_4: ; %endif -; GFX1032GISEL-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; GFX1032GISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 -; GFX1032GISEL-NEXT: v_mov_b32_e32 v0, s2 -; GFX1032GISEL-NEXT: v_mov_b32_e32 v1, 0 -; GFX1032GISEL-NEXT: s_waitcnt lgkmcnt(0) -; GFX1032GISEL-NEXT: global_store_dword v1, v0, s[0:1] -; GFX1032GISEL-NEXT: s_endpgm -; -; GFX1164DAGISEL-LABEL: divergent_cfg_float: -; GFX1164DAGISEL: ; %bb.0: ; %entry -; GFX1164DAGISEL-NEXT: s_load_b64 s[0:1], s[4:5], 0x2c -; GFX1164DAGISEL-NEXT: v_and_b32_e32 v0, 0x3ff, v0 -; 
GFX1164DAGISEL-NEXT: s_mov_b64 s[2:3], exec -; GFX1164DAGISEL-NEXT: ; implicit-def: $sgpr6 -; GFX1164DAGISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX1164DAGISEL-NEXT: v_cmpx_lt_u32_e32 15, v0 -; GFX1164DAGISEL-NEXT: s_xor_b64 s[2:3], exec, s[2:3] -; GFX1164DAGISEL-NEXT: s_cbranch_execz .LBB8_2 -; GFX1164DAGISEL-NEXT: ; %bb.1: ; %else -; GFX1164DAGISEL-NEXT: s_mov_b64 s[6:7], exec -; GFX1164DAGISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) -; GFX1164DAGISEL-NEXT: s_bcnt1_i32_b64 s6, s[6:7] -; GFX1164DAGISEL-NEXT: v_cvt_f32_i32_e32 v0, s6 -; GFX1164DAGISEL-NEXT: s_waitcnt lgkmcnt(0) -; GFX1164DAGISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1164DAGISEL-NEXT: v_mul_f32_e32 v0, s0, v0 -; GFX1164DAGISEL-NEXT: v_readfirstlane_b32 s6, v0 -; GFX1164DAGISEL-NEXT: .LBB8_2: ; %Flow -; GFX1164DAGISEL-NEXT: s_or_saveexec_b64 s[2:3], s[2:3] -; GFX1164DAGISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX1164DAGISEL-NEXT: v_mov_b32_e32 v0, s6 -; GFX1164DAGISEL-NEXT: s_xor_b64 exec, exec, s[2:3] -; GFX1164DAGISEL-NEXT: s_cbranch_execz .LBB8_4 -; GFX1164DAGISEL-NEXT: ; %bb.3: ; %if -; GFX1164DAGISEL-NEXT: s_mov_b64 s[6:7], exec -; GFX1164DAGISEL-NEXT: s_waitcnt lgkmcnt(0) -; GFX1164DAGISEL-NEXT: s_bcnt1_i32_b64 s0, s[6:7] -; GFX1164DAGISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1164DAGISEL-NEXT: v_cvt_f32_i32_e32 v0, s0 -; GFX1164DAGISEL-NEXT: v_mul_f32_e32 v0, s1, v0 -; GFX1164DAGISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1164DAGISEL-NEXT: v_readfirstlane_b32 s0, v0 -; GFX1164DAGISEL-NEXT: v_mov_b32_e32 v0, s0 -; GFX1164DAGISEL-NEXT: .LBB8_4: ; %endif -; GFX1164DAGISEL-NEXT: s_or_b64 exec, exec, s[2:3] -; GFX1164DAGISEL-NEXT: s_waitcnt lgkmcnt(0) -; GFX1164DAGISEL-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 -; GFX1164DAGISEL-NEXT: v_mov_b32_e32 v1, 0 -; GFX1164DAGISEL-NEXT: s_waitcnt lgkmcnt(0) -; 
GFX1164DAGISEL-NEXT: global_store_b32 v1, v0, s[0:1] -; GFX1164DAGISEL-NEXT: s_endpgm -; -; GFX1164GISEL-LABEL: divergent_cfg_float: -; GFX1164GISEL: ; %bb.0: ; %entry -; GFX1164GISEL-NEXT: s_load_b64 s[0:1], s[4:5], 0x2c -; GFX1164GISEL-NEXT: v_and_b32_e32 v0, 0x3ff, v0 -; GFX1164GISEL-NEXT: s_mov_b64 s[2:3], exec -; GFX1164GISEL-NEXT: ; implicit-def: $sgpr6 -; GFX1164GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX1164GISEL-NEXT: v_cmpx_le_u32_e32 16, v0 -; GFX1164GISEL-NEXT: s_xor_b64 s[2:3], exec, s[2:3] -; GFX1164GISEL-NEXT: s_cbranch_execz .LBB8_2 -; GFX1164GISEL-NEXT: ; %bb.1: ; %else -; GFX1164GISEL-NEXT: s_mov_b64 s[6:7], exec -; GFX1164GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) -; GFX1164GISEL-NEXT: s_bcnt1_i32_b64 s6, s[6:7] -; GFX1164GISEL-NEXT: v_cvt_f32_i32_e32 v0, s6 -; GFX1164GISEL-NEXT: s_waitcnt lgkmcnt(0) -; GFX1164GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1164GISEL-NEXT: v_mul_f32_e32 v0, s0, v0 -; GFX1164GISEL-NEXT: v_readfirstlane_b32 s6, v0 -; GFX1164GISEL-NEXT: .LBB8_2: ; %Flow -; GFX1164GISEL-NEXT: s_and_not1_saveexec_b64 s[2:3], s[2:3] -; GFX1164GISEL-NEXT: s_cbranch_execz .LBB8_4 -; GFX1164GISEL-NEXT: ; %bb.3: ; %if -; GFX1164GISEL-NEXT: s_mov_b64 s[6:7], exec -; GFX1164GISEL-NEXT: s_waitcnt lgkmcnt(0) -; GFX1164GISEL-NEXT: s_bcnt1_i32_b64 s0, s[6:7] -; GFX1164GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1164GISEL-NEXT: v_cvt_f32_i32_e32 v0, s0 -; GFX1164GISEL-NEXT: v_mul_f32_e32 v0, s1, v0 -; GFX1164GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX1164GISEL-NEXT: v_readfirstlane_b32 s6, v0 -; GFX1164GISEL-NEXT: .LBB8_4: ; %endif -; GFX1164GISEL-NEXT: s_or_b64 exec, exec, s[2:3] -; GFX1164GISEL-NEXT: s_waitcnt lgkmcnt(0) -; GFX1164GISEL-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 -; GFX1164GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX1164GISEL-NEXT: v_mov_b32_e32 v0, s6 -; GFX1164GISEL-NEXT: 
v_mov_b32_e32 v1, 0 -; GFX1164GISEL-NEXT: s_waitcnt lgkmcnt(0) -; GFX1164GISEL-NEXT: global_store_b32 v1, v0, s[0:1] -; GFX1164GISEL-NEXT: s_endpgm -; -; GFX1132DAGISEL-LABEL: divergent_cfg_float: -; GFX1132DAGISEL: ; %bb.0: ; %entry -; GFX1132DAGISEL-NEXT: s_load_b64 s[0:1], s[4:5], 0x2c -; GFX1132DAGISEL-NEXT: v_and_b32_e32 v0, 0x3ff, v0 -; GFX1132DAGISEL-NEXT: s_mov_b32 s2, exec_lo -; GFX1132DAGISEL-NEXT: ; implicit-def: $sgpr3 -; GFX1132DAGISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX1132DAGISEL-NEXT: v_cmpx_lt_u32_e32 15, v0 -; GFX1132DAGISEL-NEXT: s_xor_b32 s2, exec_lo, s2 -; GFX1132DAGISEL-NEXT: s_cbranch_execz .LBB8_2 -; GFX1132DAGISEL-NEXT: ; %bb.1: ; %else -; GFX1132DAGISEL-NEXT: s_mov_b32 s3, exec_lo -; GFX1132DAGISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) -; GFX1132DAGISEL-NEXT: s_bcnt1_i32_b32 s3, s3 -; GFX1132DAGISEL-NEXT: v_cvt_f32_i32_e32 v0, s3 -; GFX1132DAGISEL-NEXT: s_waitcnt lgkmcnt(0) -; GFX1132DAGISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1132DAGISEL-NEXT: v_mul_f32_e32 v0, s0, v0 -; GFX1132DAGISEL-NEXT: v_readfirstlane_b32 s3, v0 -; GFX1132DAGISEL-NEXT: .LBB8_2: ; %Flow -; GFX1132DAGISEL-NEXT: s_waitcnt lgkmcnt(0) -; GFX1132DAGISEL-NEXT: s_or_saveexec_b32 s0, s2 -; GFX1132DAGISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX1132DAGISEL-NEXT: v_mov_b32_e32 v0, s3 -; GFX1132DAGISEL-NEXT: s_xor_b32 exec_lo, exec_lo, s0 -; GFX1132DAGISEL-NEXT: s_cbranch_execz .LBB8_4 -; GFX1132DAGISEL-NEXT: ; %bb.3: ; %if -; GFX1132DAGISEL-NEXT: s_mov_b32 s2, exec_lo -; GFX1132DAGISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) -; GFX1132DAGISEL-NEXT: s_bcnt1_i32_b32 s2, s2 -; GFX1132DAGISEL-NEXT: v_cvt_f32_i32_e32 v0, s2 -; GFX1132DAGISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1132DAGISEL-NEXT: v_mul_f32_e32 v0, s1, v0 -; GFX1132DAGISEL-NEXT: v_readfirstlane_b32 s1, v0 -; GFX1132DAGISEL-NEXT: 
s_delay_alu instid0(VALU_DEP_1) -; GFX1132DAGISEL-NEXT: v_mov_b32_e32 v0, s1 -; GFX1132DAGISEL-NEXT: .LBB8_4: ; %endif -; GFX1132DAGISEL-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; GFX1132DAGISEL-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 -; GFX1132DAGISEL-NEXT: v_mov_b32_e32 v1, 0 -; GFX1132DAGISEL-NEXT: s_waitcnt lgkmcnt(0) -; GFX1132DAGISEL-NEXT: global_store_b32 v1, v0, s[0:1] -; GFX1132DAGISEL-NEXT: s_endpgm -; -; GFX1132GISEL-LABEL: divergent_cfg_float: -; GFX1132GISEL: ; %bb.0: ; %entry -; GFX1132GISEL-NEXT: s_load_b64 s[0:1], s[4:5], 0x2c -; GFX1132GISEL-NEXT: v_and_b32_e32 v0, 0x3ff, v0 -; GFX1132GISEL-NEXT: s_mov_b32 s3, exec_lo -; GFX1132GISEL-NEXT: ; implicit-def: $sgpr2 -; GFX1132GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX1132GISEL-NEXT: v_cmpx_le_u32_e32 16, v0 -; GFX1132GISEL-NEXT: s_xor_b32 s3, exec_lo, s3 -; GFX1132GISEL-NEXT: s_cbranch_execz .LBB8_2 -; GFX1132GISEL-NEXT: ; %bb.1: ; %else -; GFX1132GISEL-NEXT: s_mov_b32 s2, exec_lo -; GFX1132GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) -; GFX1132GISEL-NEXT: s_bcnt1_i32_b32 s2, s2 -; GFX1132GISEL-NEXT: v_cvt_f32_i32_e32 v0, s2 -; GFX1132GISEL-NEXT: s_waitcnt lgkmcnt(0) -; GFX1132GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1132GISEL-NEXT: v_mul_f32_e32 v0, s0, v0 -; GFX1132GISEL-NEXT: v_readfirstlane_b32 s2, v0 -; GFX1132GISEL-NEXT: .LBB8_2: ; %Flow -; GFX1132GISEL-NEXT: s_waitcnt lgkmcnt(0) -; GFX1132GISEL-NEXT: s_and_not1_saveexec_b32 s0, s3 -; GFX1132GISEL-NEXT: s_cbranch_execz .LBB8_4 -; GFX1132GISEL-NEXT: ; %bb.3: ; %if -; GFX1132GISEL-NEXT: s_mov_b32 s2, exec_lo -; GFX1132GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) -; GFX1132GISEL-NEXT: s_bcnt1_i32_b32 s2, s2 -; GFX1132GISEL-NEXT: v_cvt_f32_i32_e32 v0, s2 -; GFX1132GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1132GISEL-NEXT: v_mul_f32_e32 v0, s1, v0 -; GFX1132GISEL-NEXT: 
v_readfirstlane_b32 s2, v0 -; GFX1132GISEL-NEXT: .LBB8_4: ; %endif -; GFX1132GISEL-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; GFX1132GISEL-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 -; GFX1132GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX1132GISEL-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, 0 -; GFX1132GISEL-NEXT: s_waitcnt lgkmcnt(0) -; GFX1132GISEL-NEXT: global_store_b32 v1, v0, s[0:1] -; GFX1132GISEL-NEXT: s_endpgm -; -; GFX12DAGISEL-LABEL: divergent_cfg_float: -; GFX12DAGISEL: ; %bb.0: ; %entry -; GFX12DAGISEL-NEXT: s_load_b64 s[0:1], s[4:5], 0x2c -; GFX12DAGISEL-NEXT: v_and_b32_e32 v0, 0x3ff, v0 -; GFX12DAGISEL-NEXT: s_mov_b32 s2, exec_lo -; GFX12DAGISEL-NEXT: ; implicit-def: $sgpr3 -; GFX12DAGISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX12DAGISEL-NEXT: v_cmpx_lt_u32_e32 15, v0 -; GFX12DAGISEL-NEXT: s_xor_b32 s2, exec_lo, s2 -; GFX12DAGISEL-NEXT: s_cbranch_execz .LBB8_2 -; GFX12DAGISEL-NEXT: ; %bb.1: ; %else -; GFX12DAGISEL-NEXT: s_mov_b32 s3, exec_lo -; GFX12DAGISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) -; GFX12DAGISEL-NEXT: s_bcnt1_i32_b32 s3, s3 -; GFX12DAGISEL-NEXT: v_cvt_f32_i32_e32 v0, s3 -; GFX12DAGISEL-NEXT: s_wait_kmcnt 0x0 -; GFX12DAGISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12DAGISEL-NEXT: v_mul_f32_e32 v0, s0, v0 -; GFX12DAGISEL-NEXT: v_readfirstlane_b32 s3, v0 -; GFX12DAGISEL-NEXT: .LBB8_2: ; %Flow -; GFX12DAGISEL-NEXT: s_wait_kmcnt 0x0 -; GFX12DAGISEL-NEXT: s_or_saveexec_b32 s0, s2 -; GFX12DAGISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX12DAGISEL-NEXT: v_mov_b32_e32 v0, s3 -; GFX12DAGISEL-NEXT: s_wait_alu depctr_sa_sdst(0) -; GFX12DAGISEL-NEXT: s_xor_b32 exec_lo, exec_lo, s0 -; GFX12DAGISEL-NEXT: s_cbranch_execz .LBB8_4 -; GFX12DAGISEL-NEXT: ; %bb.3: ; %if -; GFX12DAGISEL-NEXT: s_mov_b32 s2, exec_lo -; GFX12DAGISEL-NEXT: s_wait_alu depctr_sa_sdst(0) -; GFX12DAGISEL-NEXT: s_bcnt1_i32_b32 s2, s2 -; GFX12DAGISEL-NEXT: s_wait_alu depctr_sa_sdst(0) -; 
GFX12DAGISEL-NEXT: v_cvt_f32_i32_e32 v0, s2 -; GFX12DAGISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12DAGISEL-NEXT: v_mul_f32_e32 v0, s1, v0 -; GFX12DAGISEL-NEXT: v_readfirstlane_b32 s1, v0 -; GFX12DAGISEL-NEXT: s_wait_alu depctr_va_sdst(0) -; GFX12DAGISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX12DAGISEL-NEXT: v_mov_b32_e32 v0, s1 -; GFX12DAGISEL-NEXT: .LBB8_4: ; %endif -; GFX12DAGISEL-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; GFX12DAGISEL-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 -; GFX12DAGISEL-NEXT: v_mov_b32_e32 v1, 0 -; GFX12DAGISEL-NEXT: s_wait_kmcnt 0x0 -; GFX12DAGISEL-NEXT: global_store_b32 v1, v0, s[0:1] -; GFX12DAGISEL-NEXT: s_endpgm -entry: - %tid = call i32 @llvm.amdgcn.workitem.id.x() - %d_cmp = icmp ult i32 %tid, 16 - br i1 %d_cmp, label %if, label %else - -if: - %reducedValTid = call float @llvm.amdgcn.wave.reduce.fadd(float %in2, i32 1) - br label %endif - -else: - %reducedValIn = call float @llvm.amdgcn.wave.reduce.fadd(float %in, i32 1) - br label %endif - -endif: - %combine = phi float [%reducedValTid, %if], [%reducedValIn, %else] - store float %combine, ptr addrspace(1) %out - ret void -} ;; NOTE: These prefixes are unused and the list is autogenerated. 
Do not add tests below this line: ; GFX10DAGISEL: {{.*}} ; GFX10GISEL: {{.*}} diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.reduce.fadd.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.reduce.fadd.ll new file mode 100644 index 0000000000000..5d408dc65d68b --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.reduce.fadd.ll @@ -0,0 +1,1021 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 6 +; RUN: llc -mtriple=amdgcn -mcpu=tonga -global-isel=0 < %s | FileCheck -check-prefixes=GFX8DAGISEL %s +; RUN: llc -mtriple=amdgcn -mcpu=tonga -global-isel=1 < %s | FileCheck -check-prefixes=GFX8GISEL %s +; RUN: llc -mtriple=amdgcn -mcpu=gfx900 -global-isel=0 < %s | FileCheck -check-prefixes=GFX9DAGISEL %s +; RUN: llc -mtriple=amdgcn -mcpu=gfx900 -global-isel=1 < %s | FileCheck -check-prefixes=GFX9GISEL %s +; RUN: llc -mtriple=amdgcn -mcpu=gfx1010 -global-isel=0 -mattr=+wavefrontsize64 < %s | FileCheck -check-prefixes=GFX10DAGISEL,GFX1064DAGISEL %s +; RUN: llc -mtriple=amdgcn -mcpu=gfx1010 -global-isel=1 -mattr=+wavefrontsize64 < %s | FileCheck -check-prefixes=GFX10GISEL,GFX1064GISEL %s +; RUN: llc -mtriple=amdgcn -mcpu=gfx1010 -global-isel=0 < %s | FileCheck -check-prefixes=GFX10DAGISEL,GFX1032DAGISEL %s +; RUN: llc -mtriple=amdgcn -mcpu=gfx1010 -global-isel=1 < %s | FileCheck -check-prefixes=GFX10GISEL,GFX1032GISEL %s +; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -global-isel=0 -mattr=+wavefrontsize64 < %s | FileCheck -check-prefixes=GFX11DAGISEL,GFX1164DAGISEL %s +; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -global-isel=1 -mattr=+wavefrontsize64 < %s | FileCheck -check-prefixes=GFX11GISEL,GFX1164GISEL %s +; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -global-isel=0 < %s | FileCheck -check-prefixes=GFX11DAGISEL,GFX1132DAGISEL %s +; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -global-isel=1 < %s | FileCheck -check-prefixes=GFX11GISEL,GFX1132GISEL %s +; RUN: llc -mtriple=amdgcn -mcpu=gfx1200 -global-isel=0 < %s | FileCheck 
-check-prefixes=GFX12DAGISEL %s + + +define amdgpu_kernel void @uniform_value_float(ptr addrspace(1) %out, float %in) { +; GFX8DAGISEL-LABEL: uniform_value_float: +; GFX8DAGISEL: ; %bb.0: ; %entry +; GFX8DAGISEL-NEXT: s_mov_b64 s[0:1], exec +; GFX8DAGISEL-NEXT: s_load_dword s2, s[4:5], 0x2c +; GFX8DAGISEL-NEXT: s_bcnt1_i32_b64 s0, s[0:1] +; GFX8DAGISEL-NEXT: v_cvt_f32_i32_e32 v0, s0 +; GFX8DAGISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 +; GFX8DAGISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX8DAGISEL-NEXT: v_mul_f32_e32 v0, s2, v0 +; GFX8DAGISEL-NEXT: v_readfirstlane_b32 s2, v0 +; GFX8DAGISEL-NEXT: v_mov_b32_e32 v0, s0 +; GFX8DAGISEL-NEXT: v_mov_b32_e32 v1, s1 +; GFX8DAGISEL-NEXT: v_mov_b32_e32 v2, s2 +; GFX8DAGISEL-NEXT: flat_store_dword v[0:1], v2 +; GFX8DAGISEL-NEXT: s_endpgm +; +; GFX8GISEL-LABEL: uniform_value_float: +; GFX8GISEL: ; %bb.0: ; %entry +; GFX8GISEL-NEXT: s_mov_b64 s[0:1], exec +; GFX8GISEL-NEXT: s_load_dword s2, s[4:5], 0x2c +; GFX8GISEL-NEXT: s_bcnt1_i32_b64 s0, s[0:1] +; GFX8GISEL-NEXT: v_cvt_f32_i32_e32 v0, s0 +; GFX8GISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 +; GFX8GISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX8GISEL-NEXT: v_mul_f32_e32 v0, s2, v0 +; GFX8GISEL-NEXT: v_readfirstlane_b32 s2, v0 +; GFX8GISEL-NEXT: v_mov_b32_e32 v0, s0 +; GFX8GISEL-NEXT: v_mov_b32_e32 v2, s2 +; GFX8GISEL-NEXT: v_mov_b32_e32 v1, s1 +; GFX8GISEL-NEXT: flat_store_dword v[0:1], v2 +; GFX8GISEL-NEXT: s_endpgm +; +; GFX9DAGISEL-LABEL: uniform_value_float: +; GFX9DAGISEL: ; %bb.0: ; %entry +; GFX9DAGISEL-NEXT: s_mov_b64 s[0:1], exec +; GFX9DAGISEL-NEXT: s_load_dword s2, s[4:5], 0x2c +; GFX9DAGISEL-NEXT: s_bcnt1_i32_b64 s0, s[0:1] +; GFX9DAGISEL-NEXT: v_cvt_f32_i32_e32 v0, s0 +; GFX9DAGISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 +; GFX9DAGISEL-NEXT: v_mov_b32_e32 v1, 0 +; GFX9DAGISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX9DAGISEL-NEXT: v_mul_f32_e32 v0, s2, v0 +; GFX9DAGISEL-NEXT: v_readfirstlane_b32 s2, v0 +; GFX9DAGISEL-NEXT: v_mov_b32_e32 v0, s2 +; GFX9DAGISEL-NEXT: 
global_store_dword v1, v0, s[0:1] +; GFX9DAGISEL-NEXT: s_endpgm +; +; GFX9GISEL-LABEL: uniform_value_float: +; GFX9GISEL: ; %bb.0: ; %entry +; GFX9GISEL-NEXT: s_mov_b64 s[0:1], exec +; GFX9GISEL-NEXT: s_load_dword s2, s[4:5], 0x2c +; GFX9GISEL-NEXT: s_bcnt1_i32_b64 s0, s[0:1] +; GFX9GISEL-NEXT: v_cvt_f32_i32_e32 v0, s0 +; GFX9GISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 +; GFX9GISEL-NEXT: v_mov_b32_e32 v1, 0 +; GFX9GISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX9GISEL-NEXT: v_mul_f32_e32 v0, s2, v0 +; GFX9GISEL-NEXT: v_readfirstlane_b32 s2, v0 +; GFX9GISEL-NEXT: v_mov_b32_e32 v0, s2 +; GFX9GISEL-NEXT: global_store_dword v1, v0, s[0:1] +; GFX9GISEL-NEXT: s_endpgm +; +; GFX1064DAGISEL-LABEL: uniform_value_float: +; GFX1064DAGISEL: ; %bb.0: ; %entry +; GFX1064DAGISEL-NEXT: s_load_dword s2, s[4:5], 0x2c +; GFX1064DAGISEL-NEXT: s_mov_b64 s[0:1], exec +; GFX1064DAGISEL-NEXT: s_bcnt1_i32_b64 s3, s[0:1] +; GFX1064DAGISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 +; GFX1064DAGISEL-NEXT: v_cvt_f32_i32_e32 v0, s3 +; GFX1064DAGISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX1064DAGISEL-NEXT: v_mul_f32_e32 v0, s2, v0 +; GFX1064DAGISEL-NEXT: v_readfirstlane_b32 s2, v0 +; GFX1064DAGISEL-NEXT: v_mov_b32_e32 v0, 0 +; GFX1064DAGISEL-NEXT: v_mov_b32_e32 v1, s2 +; GFX1064DAGISEL-NEXT: global_store_dword v0, v1, s[0:1] +; GFX1064DAGISEL-NEXT: s_endpgm +; +; GFX1064GISEL-LABEL: uniform_value_float: +; GFX1064GISEL: ; %bb.0: ; %entry +; GFX1064GISEL-NEXT: s_load_dword s2, s[4:5], 0x2c +; GFX1064GISEL-NEXT: s_mov_b64 s[0:1], exec +; GFX1064GISEL-NEXT: v_mov_b32_e32 v1, 0 +; GFX1064GISEL-NEXT: s_bcnt1_i32_b64 s0, s[0:1] +; GFX1064GISEL-NEXT: v_cvt_f32_i32_e32 v0, s0 +; GFX1064GISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 +; GFX1064GISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX1064GISEL-NEXT: v_mul_f32_e32 v0, s2, v0 +; GFX1064GISEL-NEXT: v_readfirstlane_b32 s2, v0 +; GFX1064GISEL-NEXT: v_mov_b32_e32 v0, s2 +; GFX1064GISEL-NEXT: global_store_dword v1, v0, s[0:1] +; GFX1064GISEL-NEXT: s_endpgm +; +; 
GFX1032DAGISEL-LABEL: uniform_value_float: +; GFX1032DAGISEL: ; %bb.0: ; %entry +; GFX1032DAGISEL-NEXT: s_load_dword s2, s[4:5], 0x2c +; GFX1032DAGISEL-NEXT: s_mov_b32 s0, exec_lo +; GFX1032DAGISEL-NEXT: s_bcnt1_i32_b32 s3, s0 +; GFX1032DAGISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 +; GFX1032DAGISEL-NEXT: v_cvt_f32_i32_e32 v0, s3 +; GFX1032DAGISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX1032DAGISEL-NEXT: v_mul_f32_e32 v0, s2, v0 +; GFX1032DAGISEL-NEXT: v_readfirstlane_b32 s2, v0 +; GFX1032DAGISEL-NEXT: v_mov_b32_e32 v0, 0 +; GFX1032DAGISEL-NEXT: v_mov_b32_e32 v1, s2 +; GFX1032DAGISEL-NEXT: global_store_dword v0, v1, s[0:1] +; GFX1032DAGISEL-NEXT: s_endpgm +; +; GFX1032GISEL-LABEL: uniform_value_float: +; GFX1032GISEL: ; %bb.0: ; %entry +; GFX1032GISEL-NEXT: s_load_dword s2, s[4:5], 0x2c +; GFX1032GISEL-NEXT: s_mov_b32 s0, exec_lo +; GFX1032GISEL-NEXT: v_mov_b32_e32 v1, 0 +; GFX1032GISEL-NEXT: s_bcnt1_i32_b32 s0, s0 +; GFX1032GISEL-NEXT: v_cvt_f32_i32_e32 v0, s0 +; GFX1032GISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 +; GFX1032GISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX1032GISEL-NEXT: v_mul_f32_e32 v0, s2, v0 +; GFX1032GISEL-NEXT: v_readfirstlane_b32 s2, v0 +; GFX1032GISEL-NEXT: v_mov_b32_e32 v0, s2 +; GFX1032GISEL-NEXT: global_store_dword v1, v0, s[0:1] +; GFX1032GISEL-NEXT: s_endpgm +; +; GFX1164DAGISEL-LABEL: uniform_value_float: +; GFX1164DAGISEL: ; %bb.0: ; %entry +; GFX1164DAGISEL-NEXT: s_load_b32 s2, s[4:5], 0x2c +; GFX1164DAGISEL-NEXT: s_mov_b64 s[0:1], exec +; GFX1164DAGISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_3) | instid1(VALU_DEP_1) +; GFX1164DAGISEL-NEXT: s_bcnt1_i32_b64 s3, s[0:1] +; GFX1164DAGISEL-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 +; GFX1164DAGISEL-NEXT: v_cvt_f32_i32_e32 v0, s3 +; GFX1164DAGISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX1164DAGISEL-NEXT: v_mul_f32_e32 v0, s2, v0 +; GFX1164DAGISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX1164DAGISEL-NEXT: v_readfirstlane_b32 s2, v0 +; 
GFX1164DAGISEL-NEXT: v_mov_b32_e32 v0, 0 +; GFX1164DAGISEL-NEXT: v_mov_b32_e32 v1, s2 +; GFX1164DAGISEL-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX1164DAGISEL-NEXT: s_endpgm +; +; GFX1164GISEL-LABEL: uniform_value_float: +; GFX1164GISEL: ; %bb.0: ; %entry +; GFX1164GISEL-NEXT: s_load_b32 s2, s[4:5], 0x2c +; GFX1164GISEL-NEXT: s_mov_b64 s[0:1], exec +; GFX1164GISEL-NEXT: v_mov_b32_e32 v1, 0 +; GFX1164GISEL-NEXT: s_bcnt1_i32_b64 s0, s[0:1] +; GFX1164GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_2) | instid1(VALU_DEP_1) +; GFX1164GISEL-NEXT: v_cvt_f32_i32_e32 v0, s0 +; GFX1164GISEL-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 +; GFX1164GISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX1164GISEL-NEXT: v_mul_f32_e32 v0, s2, v0 +; GFX1164GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1164GISEL-NEXT: v_readfirstlane_b32 s2, v0 +; GFX1164GISEL-NEXT: v_mov_b32_e32 v0, s2 +; GFX1164GISEL-NEXT: global_store_b32 v1, v0, s[0:1] +; GFX1164GISEL-NEXT: s_endpgm +; +; GFX1132DAGISEL-LABEL: uniform_value_float: +; GFX1132DAGISEL: ; %bb.0: ; %entry +; GFX1132DAGISEL-NEXT: s_load_b32 s2, s[4:5], 0x2c +; GFX1132DAGISEL-NEXT: s_mov_b32 s0, exec_lo +; GFX1132DAGISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_3) | instid1(VALU_DEP_1) +; GFX1132DAGISEL-NEXT: s_bcnt1_i32_b32 s3, s0 +; GFX1132DAGISEL-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 +; GFX1132DAGISEL-NEXT: v_cvt_f32_i32_e32 v0, s3 +; GFX1132DAGISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX1132DAGISEL-NEXT: v_mul_f32_e32 v0, s2, v0 +; GFX1132DAGISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1132DAGISEL-NEXT: v_readfirstlane_b32 s2, v0 +; GFX1132DAGISEL-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2 +; GFX1132DAGISEL-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX1132DAGISEL-NEXT: s_endpgm +; +; GFX1132GISEL-LABEL: uniform_value_float: +; GFX1132GISEL: ; %bb.0: ; %entry +; GFX1132GISEL-NEXT: s_load_b32 s2, s[4:5], 0x2c +; GFX1132GISEL-NEXT: s_mov_b32 
s0, exec_lo +; GFX1132GISEL-NEXT: v_mov_b32_e32 v1, 0 +; GFX1132GISEL-NEXT: s_bcnt1_i32_b32 s0, s0 +; GFX1132GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_2) | instid1(VALU_DEP_1) +; GFX1132GISEL-NEXT: v_cvt_f32_i32_e32 v0, s0 +; GFX1132GISEL-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 +; GFX1132GISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX1132GISEL-NEXT: v_mul_f32_e32 v0, s2, v0 +; GFX1132GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1132GISEL-NEXT: v_readfirstlane_b32 s2, v0 +; GFX1132GISEL-NEXT: v_mov_b32_e32 v0, s2 +; GFX1132GISEL-NEXT: global_store_b32 v1, v0, s[0:1] +; GFX1132GISEL-NEXT: s_endpgm +; +; GFX12DAGISEL-LABEL: uniform_value_float: +; GFX12DAGISEL: ; %bb.0: ; %entry +; GFX12DAGISEL-NEXT: s_load_b96 s[0:2], s[4:5], 0x24 +; GFX12DAGISEL-NEXT: s_mov_b32 s3, exec_lo +; GFX12DAGISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX12DAGISEL-NEXT: s_bcnt1_i32_b32 s3, s3 +; GFX12DAGISEL-NEXT: v_cvt_f32_i32_e32 v0, s3 +; GFX12DAGISEL-NEXT: s_wait_kmcnt 0x0 +; GFX12DAGISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12DAGISEL-NEXT: v_mul_f32_e32 v0, s2, v0 +; GFX12DAGISEL-NEXT: v_readfirstlane_b32 s2, v0 +; GFX12DAGISEL-NEXT: s_wait_alu depctr_va_sdst(0) +; GFX12DAGISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12DAGISEL-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2 +; GFX12DAGISEL-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX12DAGISEL-NEXT: s_endpgm +entry: + %result = call float @llvm.amdgcn.wave.reduce.fadd(float %in, i32 1) + store float %result, ptr addrspace(1) %out + ret void +} + +define void @divergent_value_float(ptr addrspace(1) %out, float %id.x) { +; GFX8DAGISEL-LABEL: divergent_value_float: +; GFX8DAGISEL: ; %bb.0: ; %entry +; GFX8DAGISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX8DAGISEL-NEXT: s_mov_b64 s[4:5], exec +; GFX8DAGISEL-NEXT: s_brev_b32 s6, 1 +; GFX8DAGISEL-NEXT: .LBB1_1: ; =>This Inner Loop 
Header: Depth=1 +; GFX8DAGISEL-NEXT: s_ff1_i32_b64 s7, s[4:5] +; GFX8DAGISEL-NEXT: v_readlane_b32 s8, v2, s7 +; GFX8DAGISEL-NEXT: v_mov_b32_e32 v3, s8 +; GFX8DAGISEL-NEXT: s_bitset0_b64 s[4:5], s7 +; GFX8DAGISEL-NEXT: v_add_f32_e32 v3, s6, v3 +; GFX8DAGISEL-NEXT: s_cmp_lg_u64 s[4:5], 0 +; GFX8DAGISEL-NEXT: v_readfirstlane_b32 s6, v3 +; GFX8DAGISEL-NEXT: s_cbranch_scc1 .LBB1_1 +; GFX8DAGISEL-NEXT: ; %bb.2: +; GFX8DAGISEL-NEXT: v_mov_b32_e32 v2, s6 +; GFX8DAGISEL-NEXT: flat_store_dword v[0:1], v2 +; GFX8DAGISEL-NEXT: s_waitcnt vmcnt(0) +; GFX8DAGISEL-NEXT: s_setpc_b64 s[30:31] +; +; GFX8GISEL-LABEL: divergent_value_float: +; GFX8GISEL: ; %bb.0: ; %entry +; GFX8GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX8GISEL-NEXT: s_mov_b64 s[4:5], exec +; GFX8GISEL-NEXT: s_brev_b32 s6, 1 +; GFX8GISEL-NEXT: .LBB1_1: ; =>This Inner Loop Header: Depth=1 +; GFX8GISEL-NEXT: s_ff1_i32_b64 s7, s[4:5] +; GFX8GISEL-NEXT: v_readlane_b32 s8, v2, s7 +; GFX8GISEL-NEXT: v_mov_b32_e32 v3, s8 +; GFX8GISEL-NEXT: s_bitset0_b64 s[4:5], s7 +; GFX8GISEL-NEXT: v_add_f32_e32 v3, s6, v3 +; GFX8GISEL-NEXT: s_cmp_lg_u64 s[4:5], 0 +; GFX8GISEL-NEXT: v_readfirstlane_b32 s6, v3 +; GFX8GISEL-NEXT: s_cbranch_scc1 .LBB1_1 +; GFX8GISEL-NEXT: ; %bb.2: +; GFX8GISEL-NEXT: v_mov_b32_e32 v2, s6 +; GFX8GISEL-NEXT: flat_store_dword v[0:1], v2 +; GFX8GISEL-NEXT: s_waitcnt vmcnt(0) +; GFX8GISEL-NEXT: s_setpc_b64 s[30:31] +; +; GFX9DAGISEL-LABEL: divergent_value_float: +; GFX9DAGISEL: ; %bb.0: ; %entry +; GFX9DAGISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9DAGISEL-NEXT: s_mov_b64 s[4:5], exec +; GFX9DAGISEL-NEXT: s_brev_b32 s6, 1 +; GFX9DAGISEL-NEXT: .LBB1_1: ; =>This Inner Loop Header: Depth=1 +; GFX9DAGISEL-NEXT: s_ff1_i32_b64 s7, s[4:5] +; GFX9DAGISEL-NEXT: v_readlane_b32 s8, v2, s7 +; GFX9DAGISEL-NEXT: v_mov_b32_e32 v3, s8 +; GFX9DAGISEL-NEXT: s_bitset0_b64 s[4:5], s7 +; GFX9DAGISEL-NEXT: v_add_f32_e32 v3, s6, v3 +; GFX9DAGISEL-NEXT: s_cmp_lg_u64 s[4:5], 0 +; GFX9DAGISEL-NEXT: 
v_readfirstlane_b32 s6, v3 +; GFX9DAGISEL-NEXT: s_cbranch_scc1 .LBB1_1 +; GFX9DAGISEL-NEXT: ; %bb.2: +; GFX9DAGISEL-NEXT: v_mov_b32_e32 v2, s6 +; GFX9DAGISEL-NEXT: global_store_dword v[0:1], v2, off +; GFX9DAGISEL-NEXT: s_waitcnt vmcnt(0) +; GFX9DAGISEL-NEXT: s_setpc_b64 s[30:31] +; +; GFX9GISEL-LABEL: divergent_value_float: +; GFX9GISEL: ; %bb.0: ; %entry +; GFX9GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9GISEL-NEXT: s_mov_b64 s[4:5], exec +; GFX9GISEL-NEXT: s_brev_b32 s6, 1 +; GFX9GISEL-NEXT: .LBB1_1: ; =>This Inner Loop Header: Depth=1 +; GFX9GISEL-NEXT: s_ff1_i32_b64 s7, s[4:5] +; GFX9GISEL-NEXT: v_readlane_b32 s8, v2, s7 +; GFX9GISEL-NEXT: v_mov_b32_e32 v3, s8 +; GFX9GISEL-NEXT: s_bitset0_b64 s[4:5], s7 +; GFX9GISEL-NEXT: v_add_f32_e32 v3, s6, v3 +; GFX9GISEL-NEXT: s_cmp_lg_u64 s[4:5], 0 +; GFX9GISEL-NEXT: v_readfirstlane_b32 s6, v3 +; GFX9GISEL-NEXT: s_cbranch_scc1 .LBB1_1 +; GFX9GISEL-NEXT: ; %bb.2: +; GFX9GISEL-NEXT: v_mov_b32_e32 v2, s6 +; GFX9GISEL-NEXT: global_store_dword v[0:1], v2, off +; GFX9GISEL-NEXT: s_waitcnt vmcnt(0) +; GFX9GISEL-NEXT: s_setpc_b64 s[30:31] +; +; GFX1064DAGISEL-LABEL: divergent_value_float: +; GFX1064DAGISEL: ; %bb.0: ; %entry +; GFX1064DAGISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX1064DAGISEL-NEXT: s_mov_b64 s[4:5], exec +; GFX1064DAGISEL-NEXT: s_brev_b32 s6, 1 +; GFX1064DAGISEL-NEXT: .LBB1_1: ; =>This Inner Loop Header: Depth=1 +; GFX1064DAGISEL-NEXT: s_ff1_i32_b64 s7, s[4:5] +; GFX1064DAGISEL-NEXT: v_readlane_b32 s8, v2, s7 +; GFX1064DAGISEL-NEXT: s_bitset0_b64 s[4:5], s7 +; GFX1064DAGISEL-NEXT: s_cmp_lg_u64 s[4:5], 0 +; GFX1064DAGISEL-NEXT: v_add_f32_e64 v3, s6, s8 +; GFX1064DAGISEL-NEXT: v_readfirstlane_b32 s6, v3 +; GFX1064DAGISEL-NEXT: s_cbranch_scc1 .LBB1_1 +; GFX1064DAGISEL-NEXT: ; %bb.2: +; GFX1064DAGISEL-NEXT: v_mov_b32_e32 v2, s6 +; GFX1064DAGISEL-NEXT: global_store_dword v[0:1], v2, off +; GFX1064DAGISEL-NEXT: s_setpc_b64 s[30:31] +; +; GFX1064GISEL-LABEL: divergent_value_float: +; 
GFX1064GISEL: ; %bb.0: ; %entry +; GFX1064GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX1064GISEL-NEXT: s_mov_b64 s[4:5], exec +; GFX1064GISEL-NEXT: s_brev_b32 s6, 1 +; GFX1064GISEL-NEXT: .LBB1_1: ; =>This Inner Loop Header: Depth=1 +; GFX1064GISEL-NEXT: s_ff1_i32_b64 s7, s[4:5] +; GFX1064GISEL-NEXT: v_readlane_b32 s8, v2, s7 +; GFX1064GISEL-NEXT: s_bitset0_b64 s[4:5], s7 +; GFX1064GISEL-NEXT: s_cmp_lg_u64 s[4:5], 0 +; GFX1064GISEL-NEXT: v_add_f32_e64 v3, s6, s8 +; GFX1064GISEL-NEXT: v_readfirstlane_b32 s6, v3 +; GFX1064GISEL-NEXT: s_cbranch_scc1 .LBB1_1 +; GFX1064GISEL-NEXT: ; %bb.2: +; GFX1064GISEL-NEXT: v_mov_b32_e32 v2, s6 +; GFX1064GISEL-NEXT: global_store_dword v[0:1], v2, off +; GFX1064GISEL-NEXT: s_setpc_b64 s[30:31] +; +; GFX1032DAGISEL-LABEL: divergent_value_float: +; GFX1032DAGISEL: ; %bb.0: ; %entry +; GFX1032DAGISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX1032DAGISEL-NEXT: s_mov_b32 s4, exec_lo +; GFX1032DAGISEL-NEXT: s_brev_b32 s5, 1 +; GFX1032DAGISEL-NEXT: .LBB1_1: ; =>This Inner Loop Header: Depth=1 +; GFX1032DAGISEL-NEXT: s_ff1_i32_b32 s6, s4 +; GFX1032DAGISEL-NEXT: v_readlane_b32 s7, v2, s6 +; GFX1032DAGISEL-NEXT: s_bitset0_b32 s4, s6 +; GFX1032DAGISEL-NEXT: s_cmp_lg_u32 s4, 0 +; GFX1032DAGISEL-NEXT: v_add_f32_e64 v3, s5, s7 +; GFX1032DAGISEL-NEXT: v_readfirstlane_b32 s5, v3 +; GFX1032DAGISEL-NEXT: s_cbranch_scc1 .LBB1_1 +; GFX1032DAGISEL-NEXT: ; %bb.2: +; GFX1032DAGISEL-NEXT: v_mov_b32_e32 v2, s5 +; GFX1032DAGISEL-NEXT: global_store_dword v[0:1], v2, off +; GFX1032DAGISEL-NEXT: s_setpc_b64 s[30:31] +; +; GFX1032GISEL-LABEL: divergent_value_float: +; GFX1032GISEL: ; %bb.0: ; %entry +; GFX1032GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX1032GISEL-NEXT: s_mov_b32 s4, exec_lo +; GFX1032GISEL-NEXT: s_brev_b32 s5, 1 +; GFX1032GISEL-NEXT: .LBB1_1: ; =>This Inner Loop Header: Depth=1 +; GFX1032GISEL-NEXT: s_ff1_i32_b32 s6, s4 +; GFX1032GISEL-NEXT: v_readlane_b32 s7, v2, s6 +; GFX1032GISEL-NEXT: s_bitset0_b32 s4, s6 +; 
GFX1032GISEL-NEXT: s_cmp_lg_u32 s4, 0 +; GFX1032GISEL-NEXT: v_add_f32_e64 v3, s5, s7 +; GFX1032GISEL-NEXT: v_readfirstlane_b32 s5, v3 +; GFX1032GISEL-NEXT: s_cbranch_scc1 .LBB1_1 +; GFX1032GISEL-NEXT: ; %bb.2: +; GFX1032GISEL-NEXT: v_mov_b32_e32 v2, s5 +; GFX1032GISEL-NEXT: global_store_dword v[0:1], v2, off +; GFX1032GISEL-NEXT: s_setpc_b64 s[30:31] +; +; GFX1164DAGISEL-LABEL: divergent_value_float: +; GFX1164DAGISEL: ; %bb.0: ; %entry +; GFX1164DAGISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX1164DAGISEL-NEXT: s_mov_b64 s[0:1], exec +; GFX1164DAGISEL-NEXT: s_brev_b32 s2, 1 +; GFX1164DAGISEL-NEXT: .LBB1_1: ; =>This Inner Loop Header: Depth=1 +; GFX1164DAGISEL-NEXT: s_ctz_i32_b64 s3, s[0:1] +; GFX1164DAGISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) +; GFX1164DAGISEL-NEXT: v_readlane_b32 s4, v2, s3 +; GFX1164DAGISEL-NEXT: s_bitset0_b64 s[0:1], s3 +; GFX1164DAGISEL-NEXT: s_cmp_lg_u64 s[0:1], 0 +; GFX1164DAGISEL-NEXT: v_add_f32_e64 v3, s2, s4 +; GFX1164DAGISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1164DAGISEL-NEXT: v_readfirstlane_b32 s2, v3 +; GFX1164DAGISEL-NEXT: s_cbranch_scc1 .LBB1_1 +; GFX1164DAGISEL-NEXT: ; %bb.2: +; GFX1164DAGISEL-NEXT: v_mov_b32_e32 v2, s2 +; GFX1164DAGISEL-NEXT: global_store_b32 v[0:1], v2, off +; GFX1164DAGISEL-NEXT: s_setpc_b64 s[30:31] +; +; GFX1164GISEL-LABEL: divergent_value_float: +; GFX1164GISEL: ; %bb.0: ; %entry +; GFX1164GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX1164GISEL-NEXT: s_mov_b64 s[0:1], exec +; GFX1164GISEL-NEXT: s_brev_b32 s2, 1 +; GFX1164GISEL-NEXT: .LBB1_1: ; =>This Inner Loop Header: Depth=1 +; GFX1164GISEL-NEXT: s_ctz_i32_b64 s3, s[0:1] +; GFX1164GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) +; GFX1164GISEL-NEXT: v_readlane_b32 s4, v2, s3 +; GFX1164GISEL-NEXT: s_bitset0_b64 s[0:1], s3 +; GFX1164GISEL-NEXT: s_cmp_lg_u64 s[0:1], 0 +; GFX1164GISEL-NEXT: v_add_f32_e64 v3, s2, s4 +; GFX1164GISEL-NEXT: 
s_delay_alu instid0(VALU_DEP_1) +; GFX1164GISEL-NEXT: v_readfirstlane_b32 s2, v3 +; GFX1164GISEL-NEXT: s_cbranch_scc1 .LBB1_1 +; GFX1164GISEL-NEXT: ; %bb.2: +; GFX1164GISEL-NEXT: v_mov_b32_e32 v2, s2 +; GFX1164GISEL-NEXT: global_store_b32 v[0:1], v2, off +; GFX1164GISEL-NEXT: s_setpc_b64 s[30:31] +; +; GFX1132DAGISEL-LABEL: divergent_value_float: +; GFX1132DAGISEL: ; %bb.0: ; %entry +; GFX1132DAGISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX1132DAGISEL-NEXT: s_mov_b32 s0, exec_lo +; GFX1132DAGISEL-NEXT: s_brev_b32 s1, 1 +; GFX1132DAGISEL-NEXT: .LBB1_1: ; =>This Inner Loop Header: Depth=1 +; GFX1132DAGISEL-NEXT: s_ctz_i32_b32 s2, s0 +; GFX1132DAGISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) +; GFX1132DAGISEL-NEXT: v_readlane_b32 s3, v2, s2 +; GFX1132DAGISEL-NEXT: s_bitset0_b32 s0, s2 +; GFX1132DAGISEL-NEXT: s_cmp_lg_u32 s0, 0 +; GFX1132DAGISEL-NEXT: v_add_f32_e64 v3, s1, s3 +; GFX1132DAGISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1132DAGISEL-NEXT: v_readfirstlane_b32 s1, v3 +; GFX1132DAGISEL-NEXT: s_cbranch_scc1 .LBB1_1 +; GFX1132DAGISEL-NEXT: ; %bb.2: +; GFX1132DAGISEL-NEXT: v_mov_b32_e32 v2, s1 +; GFX1132DAGISEL-NEXT: global_store_b32 v[0:1], v2, off +; GFX1132DAGISEL-NEXT: s_setpc_b64 s[30:31] +; +; GFX1132GISEL-LABEL: divergent_value_float: +; GFX1132GISEL: ; %bb.0: ; %entry +; GFX1132GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX1132GISEL-NEXT: s_mov_b32 s0, exec_lo +; GFX1132GISEL-NEXT: s_brev_b32 s1, 1 +; GFX1132GISEL-NEXT: .LBB1_1: ; =>This Inner Loop Header: Depth=1 +; GFX1132GISEL-NEXT: s_ctz_i32_b32 s2, s0 +; GFX1132GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) +; GFX1132GISEL-NEXT: v_readlane_b32 s3, v2, s2 +; GFX1132GISEL-NEXT: s_bitset0_b32 s0, s2 +; GFX1132GISEL-NEXT: s_cmp_lg_u32 s0, 0 +; GFX1132GISEL-NEXT: v_add_f32_e64 v3, s1, s3 +; GFX1132GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1132GISEL-NEXT: v_readfirstlane_b32 s1, v3 +; 
GFX1132GISEL-NEXT: s_cbranch_scc1 .LBB1_1 +; GFX1132GISEL-NEXT: ; %bb.2: +; GFX1132GISEL-NEXT: v_mov_b32_e32 v2, s1 +; GFX1132GISEL-NEXT: global_store_b32 v[0:1], v2, off +; GFX1132GISEL-NEXT: s_setpc_b64 s[30:31] +; +; GFX12DAGISEL-LABEL: divergent_value_float: +; GFX12DAGISEL: ; %bb.0: ; %entry +; GFX12DAGISEL-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12DAGISEL-NEXT: s_wait_expcnt 0x0 +; GFX12DAGISEL-NEXT: s_wait_samplecnt 0x0 +; GFX12DAGISEL-NEXT: s_wait_bvhcnt 0x0 +; GFX12DAGISEL-NEXT: s_wait_kmcnt 0x0 +; GFX12DAGISEL-NEXT: s_mov_b32 s0, exec_lo +; GFX12DAGISEL-NEXT: s_brev_b32 s1, 1 +; GFX12DAGISEL-NEXT: .LBB1_1: ; =>This Inner Loop Header: Depth=1 +; GFX12DAGISEL-NEXT: s_wait_alu depctr_sa_sdst(0) +; GFX12DAGISEL-NEXT: s_ctz_i32_b32 s2, s0 +; GFX12DAGISEL-NEXT: s_wait_alu depctr_sa_sdst(0) +; GFX12DAGISEL-NEXT: v_readlane_b32 s3, v2, s2 +; GFX12DAGISEL-NEXT: s_bitset0_b32 s0, s2 +; GFX12DAGISEL-NEXT: s_wait_alu depctr_sa_sdst(0) +; GFX12DAGISEL-NEXT: s_cmp_lg_u32 s0, 0 +; GFX12DAGISEL-NEXT: v_add_f32_e64 v3, s1, s3 +; GFX12DAGISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12DAGISEL-NEXT: v_readfirstlane_b32 s1, v3 +; GFX12DAGISEL-NEXT: s_cbranch_scc1 .LBB1_1 +; GFX12DAGISEL-NEXT: ; %bb.2: +; GFX12DAGISEL-NEXT: s_wait_alu depctr_va_sdst(0) +; GFX12DAGISEL-NEXT: v_mov_b32_e32 v2, s1 +; GFX12DAGISEL-NEXT: global_store_b32 v[0:1], v2, off +; GFX12DAGISEL-NEXT: s_setpc_b64 s[30:31] +entry: + %result = call float @llvm.amdgcn.wave.reduce.fadd(float %id.x, i32 1) + store float %result, ptr addrspace(1) %out + ret void +} + +define amdgpu_kernel void @divergent_cfg_float(ptr addrspace(1) %out, float %in, float %in2) { +; GFX8DAGISEL-LABEL: divergent_cfg_float: +; GFX8DAGISEL: ; %bb.0: ; %entry +; GFX8DAGISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x2c +; GFX8DAGISEL-NEXT: v_cmp_lt_u32_e32 vcc, 15, v0 +; GFX8DAGISEL-NEXT: ; implicit-def: $sgpr6 +; GFX8DAGISEL-NEXT: s_and_saveexec_b64 s[2:3], vcc +; GFX8DAGISEL-NEXT: s_xor_b64 s[2:3], exec, s[2:3] +; GFX8DAGISEL-NEXT: 
s_cbranch_execz .LBB2_2 +; GFX8DAGISEL-NEXT: ; %bb.1: ; %else +; GFX8DAGISEL-NEXT: s_mov_b64 s[6:7], exec +; GFX8DAGISEL-NEXT: s_bcnt1_i32_b64 s6, s[6:7] +; GFX8DAGISEL-NEXT: v_cvt_f32_i32_e32 v0, s6 +; GFX8DAGISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX8DAGISEL-NEXT: v_mul_f32_e32 v0, s0, v0 +; GFX8DAGISEL-NEXT: v_readfirstlane_b32 s6, v0 +; GFX8DAGISEL-NEXT: .LBB2_2: ; %Flow +; GFX8DAGISEL-NEXT: s_or_saveexec_b64 s[2:3], s[2:3] +; GFX8DAGISEL-NEXT: v_mov_b32_e32 v0, s6 +; GFX8DAGISEL-NEXT: s_xor_b64 exec, exec, s[2:3] +; GFX8DAGISEL-NEXT: s_cbranch_execz .LBB2_4 +; GFX8DAGISEL-NEXT: ; %bb.3: ; %if +; GFX8DAGISEL-NEXT: s_mov_b64 s[6:7], exec +; GFX8DAGISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX8DAGISEL-NEXT: s_bcnt1_i32_b64 s0, s[6:7] +; GFX8DAGISEL-NEXT: v_cvt_f32_i32_e32 v0, s0 +; GFX8DAGISEL-NEXT: v_mul_f32_e32 v0, s1, v0 +; GFX8DAGISEL-NEXT: v_readfirstlane_b32 s0, v0 +; GFX8DAGISEL-NEXT: v_mov_b32_e32 v0, s0 +; GFX8DAGISEL-NEXT: .LBB2_4: ; %endif +; GFX8DAGISEL-NEXT: s_or_b64 exec, exec, s[2:3] +; GFX8DAGISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX8DAGISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 +; GFX8DAGISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX8DAGISEL-NEXT: v_mov_b32_e32 v2, s1 +; GFX8DAGISEL-NEXT: v_mov_b32_e32 v1, s0 +; GFX8DAGISEL-NEXT: flat_store_dword v[1:2], v0 +; GFX8DAGISEL-NEXT: s_endpgm +; +; GFX8GISEL-LABEL: divergent_cfg_float: +; GFX8GISEL: ; %bb.0: ; %entry +; GFX8GISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x2c +; GFX8GISEL-NEXT: v_cmp_le_u32_e32 vcc, 16, v0 +; GFX8GISEL-NEXT: ; implicit-def: $sgpr6 +; GFX8GISEL-NEXT: s_and_saveexec_b64 s[2:3], vcc +; GFX8GISEL-NEXT: s_xor_b64 s[2:3], exec, s[2:3] +; GFX8GISEL-NEXT: s_cbranch_execz .LBB2_2 +; GFX8GISEL-NEXT: ; %bb.1: ; %else +; GFX8GISEL-NEXT: s_mov_b64 s[6:7], exec +; GFX8GISEL-NEXT: s_bcnt1_i32_b64 s6, s[6:7] +; GFX8GISEL-NEXT: v_cvt_f32_i32_e32 v0, s6 +; GFX8GISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX8GISEL-NEXT: v_mul_f32_e32 v0, s0, v0 +; GFX8GISEL-NEXT: v_readfirstlane_b32 s6, v0 +; GFX8GISEL-NEXT: .LBB2_2: 
; %Flow +; GFX8GISEL-NEXT: s_andn2_saveexec_b64 s[2:3], s[2:3] +; GFX8GISEL-NEXT: s_cbranch_execz .LBB2_4 +; GFX8GISEL-NEXT: ; %bb.3: ; %if +; GFX8GISEL-NEXT: s_mov_b64 s[6:7], exec +; GFX8GISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX8GISEL-NEXT: s_bcnt1_i32_b64 s0, s[6:7] +; GFX8GISEL-NEXT: v_cvt_f32_i32_e32 v0, s0 +; GFX8GISEL-NEXT: v_mul_f32_e32 v0, s1, v0 +; GFX8GISEL-NEXT: v_readfirstlane_b32 s6, v0 +; GFX8GISEL-NEXT: .LBB2_4: ; %endif +; GFX8GISEL-NEXT: s_or_b64 exec, exec, s[2:3] +; GFX8GISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX8GISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 +; GFX8GISEL-NEXT: v_mov_b32_e32 v2, s6 +; GFX8GISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX8GISEL-NEXT: v_mov_b32_e32 v0, s0 +; GFX8GISEL-NEXT: v_mov_b32_e32 v1, s1 +; GFX8GISEL-NEXT: flat_store_dword v[0:1], v2 +; GFX8GISEL-NEXT: s_endpgm +; +; GFX9DAGISEL-LABEL: divergent_cfg_float: +; GFX9DAGISEL: ; %bb.0: ; %entry +; GFX9DAGISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x2c +; GFX9DAGISEL-NEXT: v_cmp_lt_u32_e32 vcc, 15, v0 +; GFX9DAGISEL-NEXT: ; implicit-def: $sgpr6 +; GFX9DAGISEL-NEXT: s_and_saveexec_b64 s[2:3], vcc +; GFX9DAGISEL-NEXT: s_xor_b64 s[2:3], exec, s[2:3] +; GFX9DAGISEL-NEXT: s_cbranch_execz .LBB2_2 +; GFX9DAGISEL-NEXT: ; %bb.1: ; %else +; GFX9DAGISEL-NEXT: s_mov_b64 s[6:7], exec +; GFX9DAGISEL-NEXT: s_bcnt1_i32_b64 s6, s[6:7] +; GFX9DAGISEL-NEXT: v_cvt_f32_i32_e32 v0, s6 +; GFX9DAGISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX9DAGISEL-NEXT: v_mul_f32_e32 v0, s0, v0 +; GFX9DAGISEL-NEXT: v_readfirstlane_b32 s6, v0 +; GFX9DAGISEL-NEXT: .LBB2_2: ; %Flow +; GFX9DAGISEL-NEXT: s_or_saveexec_b64 s[2:3], s[2:3] +; GFX9DAGISEL-NEXT: v_mov_b32_e32 v0, s6 +; GFX9DAGISEL-NEXT: s_xor_b64 exec, exec, s[2:3] +; GFX9DAGISEL-NEXT: s_cbranch_execz .LBB2_4 +; GFX9DAGISEL-NEXT: ; %bb.3: ; %if +; GFX9DAGISEL-NEXT: s_mov_b64 s[6:7], exec +; GFX9DAGISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX9DAGISEL-NEXT: s_bcnt1_i32_b64 s0, s[6:7] +; GFX9DAGISEL-NEXT: v_cvt_f32_i32_e32 v0, s0 +; GFX9DAGISEL-NEXT: v_mul_f32_e32 v0, s1, v0 
+; GFX9DAGISEL-NEXT: v_readfirstlane_b32 s0, v0 +; GFX9DAGISEL-NEXT: v_mov_b32_e32 v0, s0 +; GFX9DAGISEL-NEXT: .LBB2_4: ; %endif +; GFX9DAGISEL-NEXT: s_or_b64 exec, exec, s[2:3] +; GFX9DAGISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX9DAGISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 +; GFX9DAGISEL-NEXT: v_mov_b32_e32 v1, 0 +; GFX9DAGISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX9DAGISEL-NEXT: global_store_dword v1, v0, s[0:1] +; GFX9DAGISEL-NEXT: s_endpgm +; +; GFX9GISEL-LABEL: divergent_cfg_float: +; GFX9GISEL: ; %bb.0: ; %entry +; GFX9GISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x2c +; GFX9GISEL-NEXT: v_cmp_le_u32_e32 vcc, 16, v0 +; GFX9GISEL-NEXT: ; implicit-def: $sgpr6 +; GFX9GISEL-NEXT: s_and_saveexec_b64 s[2:3], vcc +; GFX9GISEL-NEXT: s_xor_b64 s[2:3], exec, s[2:3] +; GFX9GISEL-NEXT: s_cbranch_execz .LBB2_2 +; GFX9GISEL-NEXT: ; %bb.1: ; %else +; GFX9GISEL-NEXT: s_mov_b64 s[6:7], exec +; GFX9GISEL-NEXT: s_bcnt1_i32_b64 s6, s[6:7] +; GFX9GISEL-NEXT: v_cvt_f32_i32_e32 v0, s6 +; GFX9GISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX9GISEL-NEXT: v_mul_f32_e32 v0, s0, v0 +; GFX9GISEL-NEXT: v_readfirstlane_b32 s6, v0 +; GFX9GISEL-NEXT: .LBB2_2: ; %Flow +; GFX9GISEL-NEXT: s_andn2_saveexec_b64 s[2:3], s[2:3] +; GFX9GISEL-NEXT: s_cbranch_execz .LBB2_4 +; GFX9GISEL-NEXT: ; %bb.3: ; %if +; GFX9GISEL-NEXT: s_mov_b64 s[6:7], exec +; GFX9GISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX9GISEL-NEXT: s_bcnt1_i32_b64 s0, s[6:7] +; GFX9GISEL-NEXT: v_cvt_f32_i32_e32 v0, s0 +; GFX9GISEL-NEXT: v_mul_f32_e32 v0, s1, v0 +; GFX9GISEL-NEXT: v_readfirstlane_b32 s6, v0 +; GFX9GISEL-NEXT: .LBB2_4: ; %endif +; GFX9GISEL-NEXT: s_or_b64 exec, exec, s[2:3] +; GFX9GISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX9GISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 +; GFX9GISEL-NEXT: v_mov_b32_e32 v0, s6 +; GFX9GISEL-NEXT: v_mov_b32_e32 v1, 0 +; GFX9GISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX9GISEL-NEXT: global_store_dword v1, v0, s[0:1] +; GFX9GISEL-NEXT: s_endpgm +; +; GFX1064DAGISEL-LABEL: divergent_cfg_float: +; GFX1064DAGISEL: ; %bb.0: ; 
%entry +; GFX1064DAGISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x2c +; GFX1064DAGISEL-NEXT: v_cmp_lt_u32_e32 vcc, 15, v0 +; GFX1064DAGISEL-NEXT: ; implicit-def: $sgpr6 +; GFX1064DAGISEL-NEXT: s_and_saveexec_b64 s[2:3], vcc +; GFX1064DAGISEL-NEXT: s_xor_b64 s[2:3], exec, s[2:3] +; GFX1064DAGISEL-NEXT: s_cbranch_execz .LBB2_2 +; GFX1064DAGISEL-NEXT: ; %bb.1: ; %else +; GFX1064DAGISEL-NEXT: s_mov_b64 s[6:7], exec +; GFX1064DAGISEL-NEXT: s_bcnt1_i32_b64 s6, s[6:7] +; GFX1064DAGISEL-NEXT: v_cvt_f32_i32_e32 v0, s6 +; GFX1064DAGISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX1064DAGISEL-NEXT: v_mul_f32_e32 v0, s0, v0 +; GFX1064DAGISEL-NEXT: v_readfirstlane_b32 s6, v0 +; GFX1064DAGISEL-NEXT: .LBB2_2: ; %Flow +; GFX1064DAGISEL-NEXT: s_or_saveexec_b64 s[2:3], s[2:3] +; GFX1064DAGISEL-NEXT: v_mov_b32_e32 v0, s6 +; GFX1064DAGISEL-NEXT: s_xor_b64 exec, exec, s[2:3] +; GFX1064DAGISEL-NEXT: s_cbranch_execz .LBB2_4 +; GFX1064DAGISEL-NEXT: ; %bb.3: ; %if +; GFX1064DAGISEL-NEXT: s_mov_b64 s[6:7], exec +; GFX1064DAGISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX1064DAGISEL-NEXT: s_bcnt1_i32_b64 s0, s[6:7] +; GFX1064DAGISEL-NEXT: v_cvt_f32_i32_e32 v0, s0 +; GFX1064DAGISEL-NEXT: v_mul_f32_e32 v0, s1, v0 +; GFX1064DAGISEL-NEXT: v_readfirstlane_b32 s0, v0 +; GFX1064DAGISEL-NEXT: v_mov_b32_e32 v0, s0 +; GFX1064DAGISEL-NEXT: .LBB2_4: ; %endif +; GFX1064DAGISEL-NEXT: s_or_b64 exec, exec, s[2:3] +; GFX1064DAGISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX1064DAGISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 +; GFX1064DAGISEL-NEXT: v_mov_b32_e32 v1, 0 +; GFX1064DAGISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX1064DAGISEL-NEXT: global_store_dword v1, v0, s[0:1] +; GFX1064DAGISEL-NEXT: s_endpgm +; +; GFX1064GISEL-LABEL: divergent_cfg_float: +; GFX1064GISEL: ; %bb.0: ; %entry +; GFX1064GISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x2c +; GFX1064GISEL-NEXT: v_cmp_le_u32_e32 vcc, 16, v0 +; GFX1064GISEL-NEXT: ; implicit-def: $sgpr6 +; GFX1064GISEL-NEXT: s_and_saveexec_b64 s[2:3], vcc +; GFX1064GISEL-NEXT: s_xor_b64 s[2:3], exec, s[2:3] 
+; GFX1064GISEL-NEXT: s_cbranch_execz .LBB2_2 +; GFX1064GISEL-NEXT: ; %bb.1: ; %else +; GFX1064GISEL-NEXT: s_mov_b64 s[6:7], exec +; GFX1064GISEL-NEXT: s_bcnt1_i32_b64 s6, s[6:7] +; GFX1064GISEL-NEXT: v_cvt_f32_i32_e32 v0, s6 +; GFX1064GISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX1064GISEL-NEXT: v_mul_f32_e32 v0, s0, v0 +; GFX1064GISEL-NEXT: v_readfirstlane_b32 s6, v0 +; GFX1064GISEL-NEXT: .LBB2_2: ; %Flow +; GFX1064GISEL-NEXT: s_andn2_saveexec_b64 s[2:3], s[2:3] +; GFX1064GISEL-NEXT: s_cbranch_execz .LBB2_4 +; GFX1064GISEL-NEXT: ; %bb.3: ; %if +; GFX1064GISEL-NEXT: s_mov_b64 s[6:7], exec +; GFX1064GISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX1064GISEL-NEXT: s_bcnt1_i32_b64 s0, s[6:7] +; GFX1064GISEL-NEXT: v_cvt_f32_i32_e32 v0, s0 +; GFX1064GISEL-NEXT: v_mul_f32_e32 v0, s1, v0 +; GFX1064GISEL-NEXT: v_readfirstlane_b32 s6, v0 +; GFX1064GISEL-NEXT: .LBB2_4: ; %endif +; GFX1064GISEL-NEXT: s_or_b64 exec, exec, s[2:3] +; GFX1064GISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX1064GISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 +; GFX1064GISEL-NEXT: v_mov_b32_e32 v0, s6 +; GFX1064GISEL-NEXT: v_mov_b32_e32 v1, 0 +; GFX1064GISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX1064GISEL-NEXT: global_store_dword v1, v0, s[0:1] +; GFX1064GISEL-NEXT: s_endpgm +; +; GFX1032DAGISEL-LABEL: divergent_cfg_float: +; GFX1032DAGISEL: ; %bb.0: ; %entry +; GFX1032DAGISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x2c +; GFX1032DAGISEL-NEXT: v_cmp_lt_u32_e32 vcc_lo, 15, v0 +; GFX1032DAGISEL-NEXT: ; implicit-def: $sgpr3 +; GFX1032DAGISEL-NEXT: s_and_saveexec_b32 s2, vcc_lo +; GFX1032DAGISEL-NEXT: s_xor_b32 s2, exec_lo, s2 +; GFX1032DAGISEL-NEXT: s_cbranch_execz .LBB2_2 +; GFX1032DAGISEL-NEXT: ; %bb.1: ; %else +; GFX1032DAGISEL-NEXT: s_mov_b32 s3, exec_lo +; GFX1032DAGISEL-NEXT: s_bcnt1_i32_b32 s3, s3 +; GFX1032DAGISEL-NEXT: v_cvt_f32_i32_e32 v0, s3 +; GFX1032DAGISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX1032DAGISEL-NEXT: v_mul_f32_e32 v0, s0, v0 +; GFX1032DAGISEL-NEXT: v_readfirstlane_b32 s3, v0 +; GFX1032DAGISEL-NEXT: .LBB2_2: ; 
%Flow +; GFX1032DAGISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX1032DAGISEL-NEXT: s_or_saveexec_b32 s0, s2 +; GFX1032DAGISEL-NEXT: v_mov_b32_e32 v0, s3 +; GFX1032DAGISEL-NEXT: s_xor_b32 exec_lo, exec_lo, s0 +; GFX1032DAGISEL-NEXT: s_cbranch_execz .LBB2_4 +; GFX1032DAGISEL-NEXT: ; %bb.3: ; %if +; GFX1032DAGISEL-NEXT: s_mov_b32 s2, exec_lo +; GFX1032DAGISEL-NEXT: s_bcnt1_i32_b32 s2, s2 +; GFX1032DAGISEL-NEXT: v_cvt_f32_i32_e32 v0, s2 +; GFX1032DAGISEL-NEXT: v_mul_f32_e32 v0, s1, v0 +; GFX1032DAGISEL-NEXT: v_readfirstlane_b32 s1, v0 +; GFX1032DAGISEL-NEXT: v_mov_b32_e32 v0, s1 +; GFX1032DAGISEL-NEXT: .LBB2_4: ; %endif +; GFX1032DAGISEL-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX1032DAGISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 +; GFX1032DAGISEL-NEXT: v_mov_b32_e32 v1, 0 +; GFX1032DAGISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX1032DAGISEL-NEXT: global_store_dword v1, v0, s[0:1] +; GFX1032DAGISEL-NEXT: s_endpgm +; +; GFX1032GISEL-LABEL: divergent_cfg_float: +; GFX1032GISEL: ; %bb.0: ; %entry +; GFX1032GISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x2c +; GFX1032GISEL-NEXT: v_cmp_le_u32_e32 vcc_lo, 16, v0 +; GFX1032GISEL-NEXT: ; implicit-def: $sgpr2 +; GFX1032GISEL-NEXT: s_and_saveexec_b32 s3, vcc_lo +; GFX1032GISEL-NEXT: s_xor_b32 s3, exec_lo, s3 +; GFX1032GISEL-NEXT: s_cbranch_execz .LBB2_2 +; GFX1032GISEL-NEXT: ; %bb.1: ; %else +; GFX1032GISEL-NEXT: s_mov_b32 s2, exec_lo +; GFX1032GISEL-NEXT: s_bcnt1_i32_b32 s2, s2 +; GFX1032GISEL-NEXT: v_cvt_f32_i32_e32 v0, s2 +; GFX1032GISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX1032GISEL-NEXT: v_mul_f32_e32 v0, s0, v0 +; GFX1032GISEL-NEXT: v_readfirstlane_b32 s2, v0 +; GFX1032GISEL-NEXT: .LBB2_2: ; %Flow +; GFX1032GISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX1032GISEL-NEXT: s_andn2_saveexec_b32 s0, s3 +; GFX1032GISEL-NEXT: s_cbranch_execz .LBB2_4 +; GFX1032GISEL-NEXT: ; %bb.3: ; %if +; GFX1032GISEL-NEXT: s_mov_b32 s2, exec_lo +; GFX1032GISEL-NEXT: s_bcnt1_i32_b32 s2, s2 +; GFX1032GISEL-NEXT: v_cvt_f32_i32_e32 v0, s2 +; GFX1032GISEL-NEXT: v_mul_f32_e32 v0, 
s1, v0 +; GFX1032GISEL-NEXT: v_readfirstlane_b32 s2, v0 +; GFX1032GISEL-NEXT: .LBB2_4: ; %endif +; GFX1032GISEL-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX1032GISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 +; GFX1032GISEL-NEXT: v_mov_b32_e32 v0, s2 +; GFX1032GISEL-NEXT: v_mov_b32_e32 v1, 0 +; GFX1032GISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX1032GISEL-NEXT: global_store_dword v1, v0, s[0:1] +; GFX1032GISEL-NEXT: s_endpgm +; +; GFX1164DAGISEL-LABEL: divergent_cfg_float: +; GFX1164DAGISEL: ; %bb.0: ; %entry +; GFX1164DAGISEL-NEXT: s_load_b64 s[0:1], s[4:5], 0x2c +; GFX1164DAGISEL-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GFX1164DAGISEL-NEXT: s_mov_b64 s[2:3], exec +; GFX1164DAGISEL-NEXT: ; implicit-def: $sgpr6 +; GFX1164DAGISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1164DAGISEL-NEXT: v_cmpx_lt_u32_e32 15, v0 +; GFX1164DAGISEL-NEXT: s_xor_b64 s[2:3], exec, s[2:3] +; GFX1164DAGISEL-NEXT: s_cbranch_execz .LBB2_2 +; GFX1164DAGISEL-NEXT: ; %bb.1: ; %else +; GFX1164DAGISEL-NEXT: s_mov_b64 s[6:7], exec +; GFX1164DAGISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX1164DAGISEL-NEXT: s_bcnt1_i32_b64 s6, s[6:7] +; GFX1164DAGISEL-NEXT: v_cvt_f32_i32_e32 v0, s6 +; GFX1164DAGISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX1164DAGISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1164DAGISEL-NEXT: v_mul_f32_e32 v0, s0, v0 +; GFX1164DAGISEL-NEXT: v_readfirstlane_b32 s6, v0 +; GFX1164DAGISEL-NEXT: .LBB2_2: ; %Flow +; GFX1164DAGISEL-NEXT: s_or_saveexec_b64 s[2:3], s[2:3] +; GFX1164DAGISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1164DAGISEL-NEXT: v_mov_b32_e32 v0, s6 +; GFX1164DAGISEL-NEXT: s_xor_b64 exec, exec, s[2:3] +; GFX1164DAGISEL-NEXT: s_cbranch_execz .LBB2_4 +; GFX1164DAGISEL-NEXT: ; %bb.3: ; %if +; GFX1164DAGISEL-NEXT: s_mov_b64 s[6:7], exec +; GFX1164DAGISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX1164DAGISEL-NEXT: s_bcnt1_i32_b64 s0, s[6:7] +; GFX1164DAGISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | 
instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1164DAGISEL-NEXT: v_cvt_f32_i32_e32 v0, s0 +; GFX1164DAGISEL-NEXT: v_mul_f32_e32 v0, s1, v0 +; GFX1164DAGISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1164DAGISEL-NEXT: v_readfirstlane_b32 s0, v0 +; GFX1164DAGISEL-NEXT: v_mov_b32_e32 v0, s0 +; GFX1164DAGISEL-NEXT: .LBB2_4: ; %endif +; GFX1164DAGISEL-NEXT: s_or_b64 exec, exec, s[2:3] +; GFX1164DAGISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX1164DAGISEL-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 +; GFX1164DAGISEL-NEXT: v_mov_b32_e32 v1, 0 +; GFX1164DAGISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX1164DAGISEL-NEXT: global_store_b32 v1, v0, s[0:1] +; GFX1164DAGISEL-NEXT: s_endpgm +; +; GFX1164GISEL-LABEL: divergent_cfg_float: +; GFX1164GISEL: ; %bb.0: ; %entry +; GFX1164GISEL-NEXT: s_load_b64 s[0:1], s[4:5], 0x2c +; GFX1164GISEL-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GFX1164GISEL-NEXT: s_mov_b64 s[2:3], exec +; GFX1164GISEL-NEXT: ; implicit-def: $sgpr6 +; GFX1164GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1164GISEL-NEXT: v_cmpx_le_u32_e32 16, v0 +; GFX1164GISEL-NEXT: s_xor_b64 s[2:3], exec, s[2:3] +; GFX1164GISEL-NEXT: s_cbranch_execz .LBB2_2 +; GFX1164GISEL-NEXT: ; %bb.1: ; %else +; GFX1164GISEL-NEXT: s_mov_b64 s[6:7], exec +; GFX1164GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX1164GISEL-NEXT: s_bcnt1_i32_b64 s6, s[6:7] +; GFX1164GISEL-NEXT: v_cvt_f32_i32_e32 v0, s6 +; GFX1164GISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX1164GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1164GISEL-NEXT: v_mul_f32_e32 v0, s0, v0 +; GFX1164GISEL-NEXT: v_readfirstlane_b32 s6, v0 +; GFX1164GISEL-NEXT: .LBB2_2: ; %Flow +; GFX1164GISEL-NEXT: s_and_not1_saveexec_b64 s[2:3], s[2:3] +; GFX1164GISEL-NEXT: s_cbranch_execz .LBB2_4 +; GFX1164GISEL-NEXT: ; %bb.3: ; %if +; GFX1164GISEL-NEXT: s_mov_b64 s[6:7], exec +; GFX1164GISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX1164GISEL-NEXT: s_bcnt1_i32_b64 s0, 
s[6:7] +; GFX1164GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1164GISEL-NEXT: v_cvt_f32_i32_e32 v0, s0 +; GFX1164GISEL-NEXT: v_mul_f32_e32 v0, s1, v0 +; GFX1164GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1164GISEL-NEXT: v_readfirstlane_b32 s6, v0 +; GFX1164GISEL-NEXT: .LBB2_4: ; %endif +; GFX1164GISEL-NEXT: s_or_b64 exec, exec, s[2:3] +; GFX1164GISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX1164GISEL-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 +; GFX1164GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1164GISEL-NEXT: v_mov_b32_e32 v0, s6 +; GFX1164GISEL-NEXT: v_mov_b32_e32 v1, 0 +; GFX1164GISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX1164GISEL-NEXT: global_store_b32 v1, v0, s[0:1] +; GFX1164GISEL-NEXT: s_endpgm +; +; GFX1132DAGISEL-LABEL: divergent_cfg_float: +; GFX1132DAGISEL: ; %bb.0: ; %entry +; GFX1132DAGISEL-NEXT: s_load_b64 s[0:1], s[4:5], 0x2c +; GFX1132DAGISEL-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GFX1132DAGISEL-NEXT: s_mov_b32 s2, exec_lo +; GFX1132DAGISEL-NEXT: ; implicit-def: $sgpr3 +; GFX1132DAGISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1132DAGISEL-NEXT: v_cmpx_lt_u32_e32 15, v0 +; GFX1132DAGISEL-NEXT: s_xor_b32 s2, exec_lo, s2 +; GFX1132DAGISEL-NEXT: s_cbranch_execz .LBB2_2 +; GFX1132DAGISEL-NEXT: ; %bb.1: ; %else +; GFX1132DAGISEL-NEXT: s_mov_b32 s3, exec_lo +; GFX1132DAGISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX1132DAGISEL-NEXT: s_bcnt1_i32_b32 s3, s3 +; GFX1132DAGISEL-NEXT: v_cvt_f32_i32_e32 v0, s3 +; GFX1132DAGISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX1132DAGISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1132DAGISEL-NEXT: v_mul_f32_e32 v0, s0, v0 +; GFX1132DAGISEL-NEXT: v_readfirstlane_b32 s3, v0 +; GFX1132DAGISEL-NEXT: .LBB2_2: ; %Flow +; GFX1132DAGISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX1132DAGISEL-NEXT: s_or_saveexec_b32 s0, s2 +; GFX1132DAGISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1132DAGISEL-NEXT: v_mov_b32_e32 v0, 
s3 +; GFX1132DAGISEL-NEXT: s_xor_b32 exec_lo, exec_lo, s0 +; GFX1132DAGISEL-NEXT: s_cbranch_execz .LBB2_4 +; GFX1132DAGISEL-NEXT: ; %bb.3: ; %if +; GFX1132DAGISEL-NEXT: s_mov_b32 s2, exec_lo +; GFX1132DAGISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX1132DAGISEL-NEXT: s_bcnt1_i32_b32 s2, s2 +; GFX1132DAGISEL-NEXT: v_cvt_f32_i32_e32 v0, s2 +; GFX1132DAGISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1132DAGISEL-NEXT: v_mul_f32_e32 v0, s1, v0 +; GFX1132DAGISEL-NEXT: v_readfirstlane_b32 s1, v0 +; GFX1132DAGISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1132DAGISEL-NEXT: v_mov_b32_e32 v0, s1 +; GFX1132DAGISEL-NEXT: .LBB2_4: ; %endif +; GFX1132DAGISEL-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX1132DAGISEL-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 +; GFX1132DAGISEL-NEXT: v_mov_b32_e32 v1, 0 +; GFX1132DAGISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX1132DAGISEL-NEXT: global_store_b32 v1, v0, s[0:1] +; GFX1132DAGISEL-NEXT: s_endpgm +; +; GFX1132GISEL-LABEL: divergent_cfg_float: +; GFX1132GISEL: ; %bb.0: ; %entry +; GFX1132GISEL-NEXT: s_load_b64 s[0:1], s[4:5], 0x2c +; GFX1132GISEL-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GFX1132GISEL-NEXT: s_mov_b32 s3, exec_lo +; GFX1132GISEL-NEXT: ; implicit-def: $sgpr2 +; GFX1132GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1132GISEL-NEXT: v_cmpx_le_u32_e32 16, v0 +; GFX1132GISEL-NEXT: s_xor_b32 s3, exec_lo, s3 +; GFX1132GISEL-NEXT: s_cbranch_execz .LBB2_2 +; GFX1132GISEL-NEXT: ; %bb.1: ; %else +; GFX1132GISEL-NEXT: s_mov_b32 s2, exec_lo +; GFX1132GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX1132GISEL-NEXT: s_bcnt1_i32_b32 s2, s2 +; GFX1132GISEL-NEXT: v_cvt_f32_i32_e32 v0, s2 +; GFX1132GISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX1132GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1132GISEL-NEXT: v_mul_f32_e32 v0, s0, v0 +; GFX1132GISEL-NEXT: v_readfirstlane_b32 s2, v0 +; 
GFX1132GISEL-NEXT: .LBB2_2: ; %Flow +; GFX1132GISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX1132GISEL-NEXT: s_and_not1_saveexec_b32 s0, s3 +; GFX1132GISEL-NEXT: s_cbranch_execz .LBB2_4 +; GFX1132GISEL-NEXT: ; %bb.3: ; %if +; GFX1132GISEL-NEXT: s_mov_b32 s2, exec_lo +; GFX1132GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX1132GISEL-NEXT: s_bcnt1_i32_b32 s2, s2 +; GFX1132GISEL-NEXT: v_cvt_f32_i32_e32 v0, s2 +; GFX1132GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1132GISEL-NEXT: v_mul_f32_e32 v0, s1, v0 +; GFX1132GISEL-NEXT: v_readfirstlane_b32 s2, v0 +; GFX1132GISEL-NEXT: .LBB2_4: ; %endif +; GFX1132GISEL-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX1132GISEL-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 +; GFX1132GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1132GISEL-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, 0 +; GFX1132GISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX1132GISEL-NEXT: global_store_b32 v1, v0, s[0:1] +; GFX1132GISEL-NEXT: s_endpgm +; +; GFX12DAGISEL-LABEL: divergent_cfg_float: +; GFX12DAGISEL: ; %bb.0: ; %entry +; GFX12DAGISEL-NEXT: s_load_b64 s[0:1], s[4:5], 0x2c +; GFX12DAGISEL-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GFX12DAGISEL-NEXT: s_mov_b32 s2, exec_lo +; GFX12DAGISEL-NEXT: ; implicit-def: $sgpr3 +; GFX12DAGISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12DAGISEL-NEXT: v_cmpx_lt_u32_e32 15, v0 +; GFX12DAGISEL-NEXT: s_xor_b32 s2, exec_lo, s2 +; GFX12DAGISEL-NEXT: s_cbranch_execz .LBB2_2 +; GFX12DAGISEL-NEXT: ; %bb.1: ; %else +; GFX12DAGISEL-NEXT: s_mov_b32 s3, exec_lo +; GFX12DAGISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX12DAGISEL-NEXT: s_bcnt1_i32_b32 s3, s3 +; GFX12DAGISEL-NEXT: v_cvt_f32_i32_e32 v0, s3 +; GFX12DAGISEL-NEXT: s_wait_kmcnt 0x0 +; GFX12DAGISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12DAGISEL-NEXT: v_mul_f32_e32 v0, s0, v0 +; GFX12DAGISEL-NEXT: v_readfirstlane_b32 s3, 
v0 +; GFX12DAGISEL-NEXT: .LBB2_2: ; %Flow +; GFX12DAGISEL-NEXT: s_wait_kmcnt 0x0 +; GFX12DAGISEL-NEXT: s_or_saveexec_b32 s0, s2 +; GFX12DAGISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12DAGISEL-NEXT: v_mov_b32_e32 v0, s3 +; GFX12DAGISEL-NEXT: s_wait_alu depctr_sa_sdst(0) +; GFX12DAGISEL-NEXT: s_xor_b32 exec_lo, exec_lo, s0 +; GFX12DAGISEL-NEXT: s_cbranch_execz .LBB2_4 +; GFX12DAGISEL-NEXT: ; %bb.3: ; %if +; GFX12DAGISEL-NEXT: s_mov_b32 s2, exec_lo +; GFX12DAGISEL-NEXT: s_wait_alu depctr_sa_sdst(0) +; GFX12DAGISEL-NEXT: s_bcnt1_i32_b32 s2, s2 +; GFX12DAGISEL-NEXT: s_wait_alu depctr_sa_sdst(0) +; GFX12DAGISEL-NEXT: v_cvt_f32_i32_e32 v0, s2 +; GFX12DAGISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12DAGISEL-NEXT: v_mul_f32_e32 v0, s1, v0 +; GFX12DAGISEL-NEXT: v_readfirstlane_b32 s1, v0 +; GFX12DAGISEL-NEXT: s_wait_alu depctr_va_sdst(0) +; GFX12DAGISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12DAGISEL-NEXT: v_mov_b32_e32 v0, s1 +; GFX12DAGISEL-NEXT: .LBB2_4: ; %endif +; GFX12DAGISEL-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX12DAGISEL-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 +; GFX12DAGISEL-NEXT: v_mov_b32_e32 v1, 0 +; GFX12DAGISEL-NEXT: s_wait_kmcnt 0x0 +; GFX12DAGISEL-NEXT: global_store_b32 v1, v0, s[0:1] +; GFX12DAGISEL-NEXT: s_endpgm +entry: + %tid = call i32 @llvm.amdgcn.workitem.id.x() + %d_cmp = icmp ult i32 %tid, 16 + br i1 %d_cmp, label %if, label %else + +if: + %reducedValTid = call float @llvm.amdgcn.wave.reduce.fadd(float %in2, i32 1) + br label %endif + +else: + %reducedValIn = call float @llvm.amdgcn.wave.reduce.fadd(float %in, i32 1) + br label %endif + +endif: + %combine = phi float [%reducedValTid, %if], [%reducedValIn, %else] + store float %combine, ptr addrspace(1) %out + ret void +} +;; NOTE: These prefixes are unused and the list is autogenerated. 
Do not add tests below this line: +; GFX10DAGISEL: {{.*}} +; GFX10GISEL: {{.*}} +; GFX11DAGISEL: {{.*}} +; GFX11GISEL: {{.*}} diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.reduce.fmax.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.reduce.fmax.ll new file mode 100644 index 0000000000000..f02fd876f1aac --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.reduce.fmax.ll @@ -0,0 +1,928 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 6 +; RUN: llc -mtriple=amdgcn -mcpu=tonga -global-isel=0 < %s | FileCheck -check-prefixes=GFX8DAGISEL %s +; RUN: llc -mtriple=amdgcn -mcpu=tonga -global-isel=1 < %s | FileCheck -check-prefixes=GFX8GISEL %s +; RUN: llc -mtriple=amdgcn -mcpu=gfx900 -global-isel=0 < %s | FileCheck -check-prefixes=GFX9DAGISEL %s +; RUN: llc -mtriple=amdgcn -mcpu=gfx900 -global-isel=1 < %s | FileCheck -check-prefixes=GFX9GISEL %s +; RUN: llc -mtriple=amdgcn -mcpu=gfx1010 -global-isel=0 -mattr=+wavefrontsize64 < %s | FileCheck -check-prefixes=GFX10DAGISEL,GFX1064DAGISEL %s +; RUN: llc -mtriple=amdgcn -mcpu=gfx1010 -global-isel=1 -mattr=+wavefrontsize64 < %s | FileCheck -check-prefixes=GFX10GISEL,GFX1064GISEL %s +; RUN: llc -mtriple=amdgcn -mcpu=gfx1010 -global-isel=0 < %s | FileCheck -check-prefixes=GFX10DAGISEL,GFX1032DAGISEL %s +; RUN: llc -mtriple=amdgcn -mcpu=gfx1010 -global-isel=1 < %s | FileCheck -check-prefixes=GFX10GISEL,GFX1032GISEL %s +; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -global-isel=0 -mattr=+wavefrontsize64 < %s | FileCheck -check-prefixes=GFX11DAGISEL,GFX1164DAGISEL %s +; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -global-isel=1 -mattr=+wavefrontsize64 < %s | FileCheck -check-prefixes=GFX11GISEL,GFX1164GISEL %s +; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -global-isel=0 < %s | FileCheck -check-prefixes=GFX11DAGISEL,GFX1132DAGISEL %s +; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -global-isel=1 < %s | FileCheck -check-prefixes=GFX11GISEL,GFX1132GISEL %s + + +define amdgpu_kernel void 
@uniform_value_float(ptr addrspace(1) %out, float %in) { +; GFX8DAGISEL-LABEL: uniform_value_float: +; GFX8DAGISEL: ; %bb.0: ; %entry +; GFX8DAGISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 +; GFX8DAGISEL-NEXT: s_load_dword s2, s[4:5], 0x2c +; GFX8DAGISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX8DAGISEL-NEXT: v_mov_b32_e32 v0, s0 +; GFX8DAGISEL-NEXT: v_mov_b32_e32 v1, s1 +; GFX8DAGISEL-NEXT: v_mov_b32_e32 v2, s2 +; GFX8DAGISEL-NEXT: flat_store_dword v[0:1], v2 +; GFX8DAGISEL-NEXT: s_endpgm +; +; GFX8GISEL-LABEL: uniform_value_float: +; GFX8GISEL: ; %bb.0: ; %entry +; GFX8GISEL-NEXT: s_load_dword s2, s[4:5], 0x2c +; GFX8GISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 +; GFX8GISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX8GISEL-NEXT: v_mov_b32_e32 v2, s2 +; GFX8GISEL-NEXT: v_mov_b32_e32 v0, s0 +; GFX8GISEL-NEXT: v_mov_b32_e32 v1, s1 +; GFX8GISEL-NEXT: flat_store_dword v[0:1], v2 +; GFX8GISEL-NEXT: s_endpgm +; +; GFX9DAGISEL-LABEL: uniform_value_float: +; GFX9DAGISEL: ; %bb.0: ; %entry +; GFX9DAGISEL-NEXT: s_load_dword s2, s[4:5], 0x2c +; GFX9DAGISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 +; GFX9DAGISEL-NEXT: v_mov_b32_e32 v0, 0 +; GFX9DAGISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX9DAGISEL-NEXT: v_mov_b32_e32 v1, s2 +; GFX9DAGISEL-NEXT: global_store_dword v0, v1, s[0:1] +; GFX9DAGISEL-NEXT: s_endpgm +; +; GFX9GISEL-LABEL: uniform_value_float: +; GFX9GISEL: ; %bb.0: ; %entry +; GFX9GISEL-NEXT: s_load_dword s2, s[4:5], 0x2c +; GFX9GISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 +; GFX9GISEL-NEXT: v_mov_b32_e32 v1, 0 +; GFX9GISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX9GISEL-NEXT: v_mov_b32_e32 v0, s2 +; GFX9GISEL-NEXT: global_store_dword v1, v0, s[0:1] +; GFX9GISEL-NEXT: s_endpgm +; +; GFX10DAGISEL-LABEL: uniform_value_float: +; GFX10DAGISEL: ; %bb.0: ; %entry +; GFX10DAGISEL-NEXT: s_clause 0x1 +; GFX10DAGISEL-NEXT: s_load_dword s2, s[4:5], 0x2c +; GFX10DAGISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 +; GFX10DAGISEL-NEXT: v_mov_b32_e32 v0, 0 +; GFX10DAGISEL-NEXT: s_waitcnt lgkmcnt(0) +; 
GFX10DAGISEL-NEXT: v_mov_b32_e32 v1, s2 +; GFX10DAGISEL-NEXT: global_store_dword v0, v1, s[0:1] +; GFX10DAGISEL-NEXT: s_endpgm +; +; GFX10GISEL-LABEL: uniform_value_float: +; GFX10GISEL: ; %bb.0: ; %entry +; GFX10GISEL-NEXT: s_clause 0x1 +; GFX10GISEL-NEXT: s_load_dword s2, s[4:5], 0x2c +; GFX10GISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 +; GFX10GISEL-NEXT: v_mov_b32_e32 v1, 0 +; GFX10GISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX10GISEL-NEXT: v_mov_b32_e32 v0, s2 +; GFX10GISEL-NEXT: global_store_dword v1, v0, s[0:1] +; GFX10GISEL-NEXT: s_endpgm +; +; GFX1164DAGISEL-LABEL: uniform_value_float: +; GFX1164DAGISEL: ; %bb.0: ; %entry +; GFX1164DAGISEL-NEXT: s_clause 0x1 +; GFX1164DAGISEL-NEXT: s_load_b32 s2, s[4:5], 0x2c +; GFX1164DAGISEL-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 +; GFX1164DAGISEL-NEXT: v_mov_b32_e32 v0, 0 +; GFX1164DAGISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX1164DAGISEL-NEXT: v_mov_b32_e32 v1, s2 +; GFX1164DAGISEL-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX1164DAGISEL-NEXT: s_endpgm +; +; GFX1164GISEL-LABEL: uniform_value_float: +; GFX1164GISEL: ; %bb.0: ; %entry +; GFX1164GISEL-NEXT: s_clause 0x1 +; GFX1164GISEL-NEXT: s_load_b32 s2, s[4:5], 0x2c +; GFX1164GISEL-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 +; GFX1164GISEL-NEXT: v_mov_b32_e32 v1, 0 +; GFX1164GISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX1164GISEL-NEXT: v_mov_b32_e32 v0, s2 +; GFX1164GISEL-NEXT: global_store_b32 v1, v0, s[0:1] +; GFX1164GISEL-NEXT: s_endpgm +; +; GFX1132DAGISEL-LABEL: uniform_value_float: +; GFX1132DAGISEL: ; %bb.0: ; %entry +; GFX1132DAGISEL-NEXT: s_clause 0x1 +; GFX1132DAGISEL-NEXT: s_load_b32 s2, s[4:5], 0x2c +; GFX1132DAGISEL-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 +; GFX1132DAGISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX1132DAGISEL-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2 +; GFX1132DAGISEL-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX1132DAGISEL-NEXT: s_endpgm +; +; GFX1132GISEL-LABEL: uniform_value_float: +; GFX1132GISEL: ; %bb.0: ; %entry +; GFX1132GISEL-NEXT: s_clause 0x1 +; 
GFX1132GISEL-NEXT: s_load_b32 s2, s[4:5], 0x2c +; GFX1132GISEL-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 +; GFX1132GISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX1132GISEL-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v0, s2 +; GFX1132GISEL-NEXT: global_store_b32 v1, v0, s[0:1] +; GFX1132GISEL-NEXT: s_endpgm +entry: + %result = call float @llvm.amdgcn.wave.reduce.fmax(float %in, i32 1) + store float %result, ptr addrspace(1) %out + ret void +} + +define void @divergent_value_float(ptr addrspace(1) %out, float %in) { +; GFX8DAGISEL-LABEL: divergent_value_float: +; GFX8DAGISEL: ; %bb.0: ; %entry +; GFX8DAGISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX8DAGISEL-NEXT: s_mov_b64 s[4:5], exec +; GFX8DAGISEL-NEXT: s_mov_b32 s6, 0x7fc00000 +; GFX8DAGISEL-NEXT: .LBB1_1: ; =>This Inner Loop Header: Depth=1 +; GFX8DAGISEL-NEXT: s_ff1_i32_b64 s7, s[4:5] +; GFX8DAGISEL-NEXT: v_readlane_b32 s8, v2, s7 +; GFX8DAGISEL-NEXT: v_mov_b32_e32 v3, s8 +; GFX8DAGISEL-NEXT: s_bitset0_b64 s[4:5], s7 +; GFX8DAGISEL-NEXT: v_max_f32_e32 v3, s6, v3 +; GFX8DAGISEL-NEXT: s_cmp_lg_u64 s[4:5], 0 +; GFX8DAGISEL-NEXT: v_readfirstlane_b32 s6, v3 +; GFX8DAGISEL-NEXT: s_cbranch_scc1 .LBB1_1 +; GFX8DAGISEL-NEXT: ; %bb.2: +; GFX8DAGISEL-NEXT: v_mov_b32_e32 v2, s6 +; GFX8DAGISEL-NEXT: flat_store_dword v[0:1], v2 +; GFX8DAGISEL-NEXT: s_waitcnt vmcnt(0) +; GFX8DAGISEL-NEXT: s_setpc_b64 s[30:31] +; +; GFX8GISEL-LABEL: divergent_value_float: +; GFX8GISEL: ; %bb.0: ; %entry +; GFX8GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX8GISEL-NEXT: s_mov_b64 s[4:5], exec +; GFX8GISEL-NEXT: s_mov_b32 s6, 0x7fc00000 +; GFX8GISEL-NEXT: .LBB1_1: ; =>This Inner Loop Header: Depth=1 +; GFX8GISEL-NEXT: s_ff1_i32_b64 s7, s[4:5] +; GFX8GISEL-NEXT: v_readlane_b32 s8, v2, s7 +; GFX8GISEL-NEXT: v_mov_b32_e32 v3, s8 +; GFX8GISEL-NEXT: s_bitset0_b64 s[4:5], s7 +; GFX8GISEL-NEXT: v_max_f32_e32 v3, s6, v3 +; GFX8GISEL-NEXT: s_cmp_lg_u64 s[4:5], 0 +; GFX8GISEL-NEXT: v_readfirstlane_b32 s6, v3 +; GFX8GISEL-NEXT: s_cbranch_scc1 
.LBB1_1 +; GFX8GISEL-NEXT: ; %bb.2: +; GFX8GISEL-NEXT: v_mov_b32_e32 v2, s6 +; GFX8GISEL-NEXT: flat_store_dword v[0:1], v2 +; GFX8GISEL-NEXT: s_waitcnt vmcnt(0) +; GFX8GISEL-NEXT: s_setpc_b64 s[30:31] +; +; GFX9DAGISEL-LABEL: divergent_value_float: +; GFX9DAGISEL: ; %bb.0: ; %entry +; GFX9DAGISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9DAGISEL-NEXT: s_mov_b64 s[4:5], exec +; GFX9DAGISEL-NEXT: s_mov_b32 s6, 0x7fc00000 +; GFX9DAGISEL-NEXT: .LBB1_1: ; =>This Inner Loop Header: Depth=1 +; GFX9DAGISEL-NEXT: s_ff1_i32_b64 s7, s[4:5] +; GFX9DAGISEL-NEXT: v_readlane_b32 s8, v2, s7 +; GFX9DAGISEL-NEXT: v_mov_b32_e32 v3, s8 +; GFX9DAGISEL-NEXT: s_bitset0_b64 s[4:5], s7 +; GFX9DAGISEL-NEXT: v_max_f32_e32 v3, s6, v3 +; GFX9DAGISEL-NEXT: s_cmp_lg_u64 s[4:5], 0 +; GFX9DAGISEL-NEXT: v_readfirstlane_b32 s6, v3 +; GFX9DAGISEL-NEXT: s_cbranch_scc1 .LBB1_1 +; GFX9DAGISEL-NEXT: ; %bb.2: +; GFX9DAGISEL-NEXT: v_mov_b32_e32 v2, s6 +; GFX9DAGISEL-NEXT: global_store_dword v[0:1], v2, off +; GFX9DAGISEL-NEXT: s_waitcnt vmcnt(0) +; GFX9DAGISEL-NEXT: s_setpc_b64 s[30:31] +; +; GFX9GISEL-LABEL: divergent_value_float: +; GFX9GISEL: ; %bb.0: ; %entry +; GFX9GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9GISEL-NEXT: s_mov_b64 s[4:5], exec +; GFX9GISEL-NEXT: s_mov_b32 s6, 0x7fc00000 +; GFX9GISEL-NEXT: .LBB1_1: ; =>This Inner Loop Header: Depth=1 +; GFX9GISEL-NEXT: s_ff1_i32_b64 s7, s[4:5] +; GFX9GISEL-NEXT: v_readlane_b32 s8, v2, s7 +; GFX9GISEL-NEXT: v_mov_b32_e32 v3, s8 +; GFX9GISEL-NEXT: s_bitset0_b64 s[4:5], s7 +; GFX9GISEL-NEXT: v_max_f32_e32 v3, s6, v3 +; GFX9GISEL-NEXT: s_cmp_lg_u64 s[4:5], 0 +; GFX9GISEL-NEXT: v_readfirstlane_b32 s6, v3 +; GFX9GISEL-NEXT: s_cbranch_scc1 .LBB1_1 +; GFX9GISEL-NEXT: ; %bb.2: +; GFX9GISEL-NEXT: v_mov_b32_e32 v2, s6 +; GFX9GISEL-NEXT: global_store_dword v[0:1], v2, off +; GFX9GISEL-NEXT: s_waitcnt vmcnt(0) +; GFX9GISEL-NEXT: s_setpc_b64 s[30:31] +; +; GFX1064DAGISEL-LABEL: divergent_value_float: +; GFX1064DAGISEL: ; %bb.0: ; %entry +; 
GFX1064DAGISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX1064DAGISEL-NEXT: s_mov_b64 s[4:5], exec +; GFX1064DAGISEL-NEXT: s_mov_b32 s6, 0x7fc00000 +; GFX1064DAGISEL-NEXT: .LBB1_1: ; =>This Inner Loop Header: Depth=1 +; GFX1064DAGISEL-NEXT: s_ff1_i32_b64 s7, s[4:5] +; GFX1064DAGISEL-NEXT: v_readlane_b32 s8, v2, s7 +; GFX1064DAGISEL-NEXT: s_bitset0_b64 s[4:5], s7 +; GFX1064DAGISEL-NEXT: s_cmp_lg_u64 s[4:5], 0 +; GFX1064DAGISEL-NEXT: v_max_f32_e64 v3, s6, s8 +; GFX1064DAGISEL-NEXT: v_readfirstlane_b32 s6, v3 +; GFX1064DAGISEL-NEXT: s_cbranch_scc1 .LBB1_1 +; GFX1064DAGISEL-NEXT: ; %bb.2: +; GFX1064DAGISEL-NEXT: v_mov_b32_e32 v2, s6 +; GFX1064DAGISEL-NEXT: global_store_dword v[0:1], v2, off +; GFX1064DAGISEL-NEXT: s_setpc_b64 s[30:31] +; +; GFX1064GISEL-LABEL: divergent_value_float: +; GFX1064GISEL: ; %bb.0: ; %entry +; GFX1064GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX1064GISEL-NEXT: s_mov_b64 s[4:5], exec +; GFX1064GISEL-NEXT: s_mov_b32 s6, 0x7fc00000 +; GFX1064GISEL-NEXT: .LBB1_1: ; =>This Inner Loop Header: Depth=1 +; GFX1064GISEL-NEXT: s_ff1_i32_b64 s7, s[4:5] +; GFX1064GISEL-NEXT: v_readlane_b32 s8, v2, s7 +; GFX1064GISEL-NEXT: s_bitset0_b64 s[4:5], s7 +; GFX1064GISEL-NEXT: s_cmp_lg_u64 s[4:5], 0 +; GFX1064GISEL-NEXT: v_max_f32_e64 v3, s6, s8 +; GFX1064GISEL-NEXT: v_readfirstlane_b32 s6, v3 +; GFX1064GISEL-NEXT: s_cbranch_scc1 .LBB1_1 +; GFX1064GISEL-NEXT: ; %bb.2: +; GFX1064GISEL-NEXT: v_mov_b32_e32 v2, s6 +; GFX1064GISEL-NEXT: global_store_dword v[0:1], v2, off +; GFX1064GISEL-NEXT: s_setpc_b64 s[30:31] +; +; GFX1032DAGISEL-LABEL: divergent_value_float: +; GFX1032DAGISEL: ; %bb.0: ; %entry +; GFX1032DAGISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX1032DAGISEL-NEXT: s_mov_b32 s4, exec_lo +; GFX1032DAGISEL-NEXT: s_mov_b32 s5, 0x7fc00000 +; GFX1032DAGISEL-NEXT: .LBB1_1: ; =>This Inner Loop Header: Depth=1 +; GFX1032DAGISEL-NEXT: s_ff1_i32_b32 s6, s4 +; GFX1032DAGISEL-NEXT: v_readlane_b32 s7, v2, s6 +; GFX1032DAGISEL-NEXT: 
s_bitset0_b32 s4, s6 +; GFX1032DAGISEL-NEXT: s_cmp_lg_u32 s4, 0 +; GFX1032DAGISEL-NEXT: v_max_f32_e64 v3, s5, s7 +; GFX1032DAGISEL-NEXT: v_readfirstlane_b32 s5, v3 +; GFX1032DAGISEL-NEXT: s_cbranch_scc1 .LBB1_1 +; GFX1032DAGISEL-NEXT: ; %bb.2: +; GFX1032DAGISEL-NEXT: v_mov_b32_e32 v2, s5 +; GFX1032DAGISEL-NEXT: global_store_dword v[0:1], v2, off +; GFX1032DAGISEL-NEXT: s_setpc_b64 s[30:31] +; +; GFX1032GISEL-LABEL: divergent_value_float: +; GFX1032GISEL: ; %bb.0: ; %entry +; GFX1032GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX1032GISEL-NEXT: s_mov_b32 s4, exec_lo +; GFX1032GISEL-NEXT: s_mov_b32 s5, 0x7fc00000 +; GFX1032GISEL-NEXT: .LBB1_1: ; =>This Inner Loop Header: Depth=1 +; GFX1032GISEL-NEXT: s_ff1_i32_b32 s6, s4 +; GFX1032GISEL-NEXT: v_readlane_b32 s7, v2, s6 +; GFX1032GISEL-NEXT: s_bitset0_b32 s4, s6 +; GFX1032GISEL-NEXT: s_cmp_lg_u32 s4, 0 +; GFX1032GISEL-NEXT: v_max_f32_e64 v3, s5, s7 +; GFX1032GISEL-NEXT: v_readfirstlane_b32 s5, v3 +; GFX1032GISEL-NEXT: s_cbranch_scc1 .LBB1_1 +; GFX1032GISEL-NEXT: ; %bb.2: +; GFX1032GISEL-NEXT: v_mov_b32_e32 v2, s5 +; GFX1032GISEL-NEXT: global_store_dword v[0:1], v2, off +; GFX1032GISEL-NEXT: s_setpc_b64 s[30:31] +; +; GFX1164DAGISEL-LABEL: divergent_value_float: +; GFX1164DAGISEL: ; %bb.0: ; %entry +; GFX1164DAGISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX1164DAGISEL-NEXT: s_mov_b64 s[0:1], exec +; GFX1164DAGISEL-NEXT: s_mov_b32 s2, 0x7fc00000 +; GFX1164DAGISEL-NEXT: .LBB1_1: ; =>This Inner Loop Header: Depth=1 +; GFX1164DAGISEL-NEXT: s_ctz_i32_b64 s3, s[0:1] +; GFX1164DAGISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) +; GFX1164DAGISEL-NEXT: v_readlane_b32 s4, v2, s3 +; GFX1164DAGISEL-NEXT: s_bitset0_b64 s[0:1], s3 +; GFX1164DAGISEL-NEXT: s_cmp_lg_u64 s[0:1], 0 +; GFX1164DAGISEL-NEXT: v_max_f32_e64 v3, s2, s4 +; GFX1164DAGISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1164DAGISEL-NEXT: v_readfirstlane_b32 s2, v3 +; GFX1164DAGISEL-NEXT: s_cbranch_scc1 
.LBB1_1 +; GFX1164DAGISEL-NEXT: ; %bb.2: +; GFX1164DAGISEL-NEXT: v_mov_b32_e32 v2, s2 +; GFX1164DAGISEL-NEXT: global_store_b32 v[0:1], v2, off +; GFX1164DAGISEL-NEXT: s_setpc_b64 s[30:31] +; +; GFX1164GISEL-LABEL: divergent_value_float: +; GFX1164GISEL: ; %bb.0: ; %entry +; GFX1164GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX1164GISEL-NEXT: s_mov_b64 s[0:1], exec +; GFX1164GISEL-NEXT: s_mov_b32 s2, 0x7fc00000 +; GFX1164GISEL-NEXT: .LBB1_1: ; =>This Inner Loop Header: Depth=1 +; GFX1164GISEL-NEXT: s_ctz_i32_b64 s3, s[0:1] +; GFX1164GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) +; GFX1164GISEL-NEXT: v_readlane_b32 s4, v2, s3 +; GFX1164GISEL-NEXT: s_bitset0_b64 s[0:1], s3 +; GFX1164GISEL-NEXT: s_cmp_lg_u64 s[0:1], 0 +; GFX1164GISEL-NEXT: v_max_f32_e64 v3, s2, s4 +; GFX1164GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1164GISEL-NEXT: v_readfirstlane_b32 s2, v3 +; GFX1164GISEL-NEXT: s_cbranch_scc1 .LBB1_1 +; GFX1164GISEL-NEXT: ; %bb.2: +; GFX1164GISEL-NEXT: v_mov_b32_e32 v2, s2 +; GFX1164GISEL-NEXT: global_store_b32 v[0:1], v2, off +; GFX1164GISEL-NEXT: s_setpc_b64 s[30:31] +; +; GFX1132DAGISEL-LABEL: divergent_value_float: +; GFX1132DAGISEL: ; %bb.0: ; %entry +; GFX1132DAGISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX1132DAGISEL-NEXT: s_mov_b32 s0, exec_lo +; GFX1132DAGISEL-NEXT: s_mov_b32 s1, 0x7fc00000 +; GFX1132DAGISEL-NEXT: .LBB1_1: ; =>This Inner Loop Header: Depth=1 +; GFX1132DAGISEL-NEXT: s_ctz_i32_b32 s2, s0 +; GFX1132DAGISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) +; GFX1132DAGISEL-NEXT: v_readlane_b32 s3, v2, s2 +; GFX1132DAGISEL-NEXT: s_bitset0_b32 s0, s2 +; GFX1132DAGISEL-NEXT: s_cmp_lg_u32 s0, 0 +; GFX1132DAGISEL-NEXT: v_max_f32_e64 v3, s1, s3 +; GFX1132DAGISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1132DAGISEL-NEXT: v_readfirstlane_b32 s1, v3 +; GFX1132DAGISEL-NEXT: s_cbranch_scc1 .LBB1_1 +; GFX1132DAGISEL-NEXT: ; %bb.2: +; 
GFX1132DAGISEL-NEXT: v_mov_b32_e32 v2, s1 +; GFX1132DAGISEL-NEXT: global_store_b32 v[0:1], v2, off +; GFX1132DAGISEL-NEXT: s_setpc_b64 s[30:31] +; +; GFX1132GISEL-LABEL: divergent_value_float: +; GFX1132GISEL: ; %bb.0: ; %entry +; GFX1132GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX1132GISEL-NEXT: s_mov_b32 s0, exec_lo +; GFX1132GISEL-NEXT: s_mov_b32 s1, 0x7fc00000 +; GFX1132GISEL-NEXT: .LBB1_1: ; =>This Inner Loop Header: Depth=1 +; GFX1132GISEL-NEXT: s_ctz_i32_b32 s2, s0 +; GFX1132GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) +; GFX1132GISEL-NEXT: v_readlane_b32 s3, v2, s2 +; GFX1132GISEL-NEXT: s_bitset0_b32 s0, s2 +; GFX1132GISEL-NEXT: s_cmp_lg_u32 s0, 0 +; GFX1132GISEL-NEXT: v_max_f32_e64 v3, s1, s3 +; GFX1132GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1132GISEL-NEXT: v_readfirstlane_b32 s1, v3 +; GFX1132GISEL-NEXT: s_cbranch_scc1 .LBB1_1 +; GFX1132GISEL-NEXT: ; %bb.2: +; GFX1132GISEL-NEXT: v_mov_b32_e32 v2, s1 +; GFX1132GISEL-NEXT: global_store_b32 v[0:1], v2, off +; GFX1132GISEL-NEXT: s_setpc_b64 s[30:31] +entry: + %result = call float @llvm.amdgcn.wave.reduce.fmax(float %in, i32 1) + store float %result, ptr addrspace(1) %out + ret void +} + +define void @divergent_cfg_float(ptr addrspace(1) %out, float %in, float %in2) { +; GFX8DAGISEL-LABEL: divergent_cfg_float: +; GFX8DAGISEL: ; %bb.0: ; %entry +; GFX8DAGISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX8DAGISEL-NEXT: v_and_b32_e32 v4, 0x3ff, v31 +; GFX8DAGISEL-NEXT: v_cmp_lt_u32_e32 vcc, 15, v4 +; GFX8DAGISEL-NEXT: ; implicit-def: $vgpr4 +; GFX8DAGISEL-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GFX8DAGISEL-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; GFX8DAGISEL-NEXT: s_cbranch_execz .LBB2_4 +; GFX8DAGISEL-NEXT: ; %bb.1: ; %else +; GFX8DAGISEL-NEXT: s_mov_b64 s[6:7], exec +; GFX8DAGISEL-NEXT: s_mov_b32 s8, 0x7fc00000 +; GFX8DAGISEL-NEXT: .LBB2_2: ; =>This Inner Loop Header: Depth=1 +; GFX8DAGISEL-NEXT: s_ff1_i32_b64 s9, s[6:7] +; 
GFX8DAGISEL-NEXT: v_readlane_b32 s10, v2, s9 +; GFX8DAGISEL-NEXT: v_mov_b32_e32 v3, s10 +; GFX8DAGISEL-NEXT: s_bitset0_b64 s[6:7], s9 +; GFX8DAGISEL-NEXT: v_max_f32_e32 v3, s8, v3 +; GFX8DAGISEL-NEXT: s_cmp_lg_u64 s[6:7], 0 +; GFX8DAGISEL-NEXT: v_readfirstlane_b32 s8, v3 +; GFX8DAGISEL-NEXT: s_cbranch_scc1 .LBB2_2 +; GFX8DAGISEL-NEXT: ; %bb.3: +; GFX8DAGISEL-NEXT: v_mov_b32_e32 v4, s8 +; GFX8DAGISEL-NEXT: ; implicit-def: $vgpr3 +; GFX8DAGISEL-NEXT: .LBB2_4: ; %Flow +; GFX8DAGISEL-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; GFX8DAGISEL-NEXT: s_cbranch_execz .LBB2_8 +; GFX8DAGISEL-NEXT: ; %bb.5: ; %if +; GFX8DAGISEL-NEXT: s_mov_b64 s[6:7], exec +; GFX8DAGISEL-NEXT: s_mov_b32 s8, 0x7fc00000 +; GFX8DAGISEL-NEXT: .LBB2_6: ; =>This Inner Loop Header: Depth=1 +; GFX8DAGISEL-NEXT: s_ff1_i32_b64 s9, s[6:7] +; GFX8DAGISEL-NEXT: v_readlane_b32 s10, v3, s9 +; GFX8DAGISEL-NEXT: v_mov_b32_e32 v2, s10 +; GFX8DAGISEL-NEXT: s_bitset0_b64 s[6:7], s9 +; GFX8DAGISEL-NEXT: v_max_f32_e32 v2, s8, v2 +; GFX8DAGISEL-NEXT: s_cmp_lg_u64 s[6:7], 0 +; GFX8DAGISEL-NEXT: v_readfirstlane_b32 s8, v2 +; GFX8DAGISEL-NEXT: s_cbranch_scc1 .LBB2_6 +; GFX8DAGISEL-NEXT: ; %bb.7: +; GFX8DAGISEL-NEXT: v_mov_b32_e32 v4, s8 +; GFX8DAGISEL-NEXT: .LBB2_8: ; %endif +; GFX8DAGISEL-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX8DAGISEL-NEXT: flat_store_dword v[0:1], v4 +; GFX8DAGISEL-NEXT: s_waitcnt vmcnt(0) +; GFX8DAGISEL-NEXT: s_setpc_b64 s[30:31] +; +; GFX8GISEL-LABEL: divergent_cfg_float: +; GFX8GISEL: ; %bb.0: ; %entry +; GFX8GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX8GISEL-NEXT: v_and_b32_e32 v4, 0x3ff, v31 +; GFX8GISEL-NEXT: v_cmp_le_u32_e32 vcc, 16, v4 +; GFX8GISEL-NEXT: ; implicit-def: $sgpr8 +; GFX8GISEL-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GFX8GISEL-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; GFX8GISEL-NEXT: s_cbranch_execz .LBB2_3 +; GFX8GISEL-NEXT: ; %bb.1: ; %else +; GFX8GISEL-NEXT: s_mov_b64 s[6:7], exec +; GFX8GISEL-NEXT: s_mov_b32 s8, 0x7fc00000 +; GFX8GISEL-NEXT: ; implicit-def: $vgpr3 
+; GFX8GISEL-NEXT: .LBB2_2: ; =>This Inner Loop Header: Depth=1 +; GFX8GISEL-NEXT: s_ff1_i32_b64 s9, s[6:7] +; GFX8GISEL-NEXT: v_readlane_b32 s10, v2, s9 +; GFX8GISEL-NEXT: v_mov_b32_e32 v4, s10 +; GFX8GISEL-NEXT: s_bitset0_b64 s[6:7], s9 +; GFX8GISEL-NEXT: v_max_f32_e32 v4, s8, v4 +; GFX8GISEL-NEXT: s_cmp_lg_u64 s[6:7], 0 +; GFX8GISEL-NEXT: v_readfirstlane_b32 s8, v4 +; GFX8GISEL-NEXT: s_cbranch_scc1 .LBB2_2 +; GFX8GISEL-NEXT: .LBB2_3: ; %Flow +; GFX8GISEL-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; GFX8GISEL-NEXT: s_cbranch_execz .LBB2_6 +; GFX8GISEL-NEXT: ; %bb.4: ; %if +; GFX8GISEL-NEXT: s_mov_b64 s[6:7], exec +; GFX8GISEL-NEXT: s_mov_b32 s8, 0x7fc00000 +; GFX8GISEL-NEXT: .LBB2_5: ; =>This Inner Loop Header: Depth=1 +; GFX8GISEL-NEXT: s_ff1_i32_b64 s9, s[6:7] +; GFX8GISEL-NEXT: v_readlane_b32 s10, v3, s9 +; GFX8GISEL-NEXT: v_mov_b32_e32 v2, s10 +; GFX8GISEL-NEXT: s_bitset0_b64 s[6:7], s9 +; GFX8GISEL-NEXT: v_max_f32_e32 v2, s8, v2 +; GFX8GISEL-NEXT: s_cmp_lg_u64 s[6:7], 0 +; GFX8GISEL-NEXT: v_readfirstlane_b32 s8, v2 +; GFX8GISEL-NEXT: s_cbranch_scc1 .LBB2_5 +; GFX8GISEL-NEXT: .LBB2_6: ; %endif +; GFX8GISEL-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX8GISEL-NEXT: v_mov_b32_e32 v2, s8 +; GFX8GISEL-NEXT: flat_store_dword v[0:1], v2 +; GFX8GISEL-NEXT: s_waitcnt vmcnt(0) +; GFX8GISEL-NEXT: s_setpc_b64 s[30:31] +; +; GFX9DAGISEL-LABEL: divergent_cfg_float: +; GFX9DAGISEL: ; %bb.0: ; %entry +; GFX9DAGISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9DAGISEL-NEXT: v_and_b32_e32 v4, 0x3ff, v31 +; GFX9DAGISEL-NEXT: v_cmp_lt_u32_e32 vcc, 15, v4 +; GFX9DAGISEL-NEXT: ; implicit-def: $vgpr4 +; GFX9DAGISEL-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GFX9DAGISEL-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; GFX9DAGISEL-NEXT: s_cbranch_execz .LBB2_4 +; GFX9DAGISEL-NEXT: ; %bb.1: ; %else +; GFX9DAGISEL-NEXT: s_mov_b64 s[6:7], exec +; GFX9DAGISEL-NEXT: s_mov_b32 s8, 0x7fc00000 +; GFX9DAGISEL-NEXT: .LBB2_2: ; =>This Inner Loop Header: Depth=1 +; GFX9DAGISEL-NEXT: s_ff1_i32_b64 s9, 
s[6:7] +; GFX9DAGISEL-NEXT: v_readlane_b32 s10, v2, s9 +; GFX9DAGISEL-NEXT: v_mov_b32_e32 v3, s10 +; GFX9DAGISEL-NEXT: s_bitset0_b64 s[6:7], s9 +; GFX9DAGISEL-NEXT: v_max_f32_e32 v3, s8, v3 +; GFX9DAGISEL-NEXT: s_cmp_lg_u64 s[6:7], 0 +; GFX9DAGISEL-NEXT: v_readfirstlane_b32 s8, v3 +; GFX9DAGISEL-NEXT: s_cbranch_scc1 .LBB2_2 +; GFX9DAGISEL-NEXT: ; %bb.3: +; GFX9DAGISEL-NEXT: v_mov_b32_e32 v4, s8 +; GFX9DAGISEL-NEXT: ; implicit-def: $vgpr3 +; GFX9DAGISEL-NEXT: .LBB2_4: ; %Flow +; GFX9DAGISEL-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; GFX9DAGISEL-NEXT: s_cbranch_execz .LBB2_8 +; GFX9DAGISEL-NEXT: ; %bb.5: ; %if +; GFX9DAGISEL-NEXT: s_mov_b64 s[6:7], exec +; GFX9DAGISEL-NEXT: s_mov_b32 s8, 0x7fc00000 +; GFX9DAGISEL-NEXT: .LBB2_6: ; =>This Inner Loop Header: Depth=1 +; GFX9DAGISEL-NEXT: s_ff1_i32_b64 s9, s[6:7] +; GFX9DAGISEL-NEXT: v_readlane_b32 s10, v3, s9 +; GFX9DAGISEL-NEXT: v_mov_b32_e32 v2, s10 +; GFX9DAGISEL-NEXT: s_bitset0_b64 s[6:7], s9 +; GFX9DAGISEL-NEXT: v_max_f32_e32 v2, s8, v2 +; GFX9DAGISEL-NEXT: s_cmp_lg_u64 s[6:7], 0 +; GFX9DAGISEL-NEXT: v_readfirstlane_b32 s8, v2 +; GFX9DAGISEL-NEXT: s_cbranch_scc1 .LBB2_6 +; GFX9DAGISEL-NEXT: ; %bb.7: +; GFX9DAGISEL-NEXT: v_mov_b32_e32 v4, s8 +; GFX9DAGISEL-NEXT: .LBB2_8: ; %endif +; GFX9DAGISEL-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX9DAGISEL-NEXT: global_store_dword v[0:1], v4, off +; GFX9DAGISEL-NEXT: s_waitcnt vmcnt(0) +; GFX9DAGISEL-NEXT: s_setpc_b64 s[30:31] +; +; GFX9GISEL-LABEL: divergent_cfg_float: +; GFX9GISEL: ; %bb.0: ; %entry +; GFX9GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9GISEL-NEXT: v_and_b32_e32 v4, 0x3ff, v31 +; GFX9GISEL-NEXT: v_cmp_le_u32_e32 vcc, 16, v4 +; GFX9GISEL-NEXT: ; implicit-def: $sgpr8 +; GFX9GISEL-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GFX9GISEL-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; GFX9GISEL-NEXT: s_cbranch_execz .LBB2_3 +; GFX9GISEL-NEXT: ; %bb.1: ; %else +; GFX9GISEL-NEXT: s_mov_b64 s[6:7], exec +; GFX9GISEL-NEXT: s_mov_b32 s8, 0x7fc00000 +; GFX9GISEL-NEXT: ; 
implicit-def: $vgpr3 +; GFX9GISEL-NEXT: .LBB2_2: ; =>This Inner Loop Header: Depth=1 +; GFX9GISEL-NEXT: s_ff1_i32_b64 s9, s[6:7] +; GFX9GISEL-NEXT: v_readlane_b32 s10, v2, s9 +; GFX9GISEL-NEXT: v_mov_b32_e32 v4, s10 +; GFX9GISEL-NEXT: s_bitset0_b64 s[6:7], s9 +; GFX9GISEL-NEXT: v_max_f32_e32 v4, s8, v4 +; GFX9GISEL-NEXT: s_cmp_lg_u64 s[6:7], 0 +; GFX9GISEL-NEXT: v_readfirstlane_b32 s8, v4 +; GFX9GISEL-NEXT: s_cbranch_scc1 .LBB2_2 +; GFX9GISEL-NEXT: .LBB2_3: ; %Flow +; GFX9GISEL-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; GFX9GISEL-NEXT: s_cbranch_execz .LBB2_6 +; GFX9GISEL-NEXT: ; %bb.4: ; %if +; GFX9GISEL-NEXT: s_mov_b64 s[6:7], exec +; GFX9GISEL-NEXT: s_mov_b32 s8, 0x7fc00000 +; GFX9GISEL-NEXT: .LBB2_5: ; =>This Inner Loop Header: Depth=1 +; GFX9GISEL-NEXT: s_ff1_i32_b64 s9, s[6:7] +; GFX9GISEL-NEXT: v_readlane_b32 s10, v3, s9 +; GFX9GISEL-NEXT: v_mov_b32_e32 v2, s10 +; GFX9GISEL-NEXT: s_bitset0_b64 s[6:7], s9 +; GFX9GISEL-NEXT: v_max_f32_e32 v2, s8, v2 +; GFX9GISEL-NEXT: s_cmp_lg_u64 s[6:7], 0 +; GFX9GISEL-NEXT: v_readfirstlane_b32 s8, v2 +; GFX9GISEL-NEXT: s_cbranch_scc1 .LBB2_5 +; GFX9GISEL-NEXT: .LBB2_6: ; %endif +; GFX9GISEL-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX9GISEL-NEXT: v_mov_b32_e32 v2, s8 +; GFX9GISEL-NEXT: global_store_dword v[0:1], v2, off +; GFX9GISEL-NEXT: s_waitcnt vmcnt(0) +; GFX9GISEL-NEXT: s_setpc_b64 s[30:31] +; +; GFX1064DAGISEL-LABEL: divergent_cfg_float: +; GFX1064DAGISEL: ; %bb.0: ; %entry +; GFX1064DAGISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX1064DAGISEL-NEXT: v_and_b32_e32 v4, 0x3ff, v31 +; GFX1064DAGISEL-NEXT: v_cmp_lt_u32_e32 vcc, 15, v4 +; GFX1064DAGISEL-NEXT: ; implicit-def: $vgpr4 +; GFX1064DAGISEL-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GFX1064DAGISEL-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; GFX1064DAGISEL-NEXT: s_cbranch_execz .LBB2_4 +; GFX1064DAGISEL-NEXT: ; %bb.1: ; %else +; GFX1064DAGISEL-NEXT: s_mov_b64 s[6:7], exec +; GFX1064DAGISEL-NEXT: s_mov_b32 s8, 0x7fc00000 +; GFX1064DAGISEL-NEXT: .LBB2_2: ; =>This 
Inner Loop Header: Depth=1 +; GFX1064DAGISEL-NEXT: s_ff1_i32_b64 s9, s[6:7] +; GFX1064DAGISEL-NEXT: v_readlane_b32 s10, v2, s9 +; GFX1064DAGISEL-NEXT: s_bitset0_b64 s[6:7], s9 +; GFX1064DAGISEL-NEXT: s_cmp_lg_u64 s[6:7], 0 +; GFX1064DAGISEL-NEXT: v_max_f32_e64 v3, s8, s10 +; GFX1064DAGISEL-NEXT: v_readfirstlane_b32 s8, v3 +; GFX1064DAGISEL-NEXT: s_cbranch_scc1 .LBB2_2 +; GFX1064DAGISEL-NEXT: ; %bb.3: +; GFX1064DAGISEL-NEXT: v_mov_b32_e32 v4, s8 +; GFX1064DAGISEL-NEXT: ; implicit-def: $vgpr3 +; GFX1064DAGISEL-NEXT: .LBB2_4: ; %Flow +; GFX1064DAGISEL-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; GFX1064DAGISEL-NEXT: s_cbranch_execz .LBB2_8 +; GFX1064DAGISEL-NEXT: ; %bb.5: ; %if +; GFX1064DAGISEL-NEXT: s_mov_b64 s[6:7], exec +; GFX1064DAGISEL-NEXT: s_mov_b32 s8, 0x7fc00000 +; GFX1064DAGISEL-NEXT: .LBB2_6: ; =>This Inner Loop Header: Depth=1 +; GFX1064DAGISEL-NEXT: s_ff1_i32_b64 s9, s[6:7] +; GFX1064DAGISEL-NEXT: v_readlane_b32 s10, v3, s9 +; GFX1064DAGISEL-NEXT: s_bitset0_b64 s[6:7], s9 +; GFX1064DAGISEL-NEXT: s_cmp_lg_u64 s[6:7], 0 +; GFX1064DAGISEL-NEXT: v_max_f32_e64 v2, s8, s10 +; GFX1064DAGISEL-NEXT: v_readfirstlane_b32 s8, v2 +; GFX1064DAGISEL-NEXT: s_cbranch_scc1 .LBB2_6 +; GFX1064DAGISEL-NEXT: ; %bb.7: +; GFX1064DAGISEL-NEXT: v_mov_b32_e32 v4, s8 +; GFX1064DAGISEL-NEXT: .LBB2_8: ; %endif +; GFX1064DAGISEL-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX1064DAGISEL-NEXT: global_store_dword v[0:1], v4, off +; GFX1064DAGISEL-NEXT: s_setpc_b64 s[30:31] +; +; GFX1064GISEL-LABEL: divergent_cfg_float: +; GFX1064GISEL: ; %bb.0: ; %entry +; GFX1064GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX1064GISEL-NEXT: v_and_b32_e32 v4, 0x3ff, v31 +; GFX1064GISEL-NEXT: ; implicit-def: $sgpr8 +; GFX1064GISEL-NEXT: v_cmp_le_u32_e32 vcc, 16, v4 +; GFX1064GISEL-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GFX1064GISEL-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; GFX1064GISEL-NEXT: s_cbranch_execz .LBB2_3 +; GFX1064GISEL-NEXT: ; %bb.1: ; %else +; GFX1064GISEL-NEXT: s_mov_b64 s[6:7], exec +; 
GFX1064GISEL-NEXT: s_mov_b32 s8, 0x7fc00000 +; GFX1064GISEL-NEXT: .LBB2_2: ; =>This Inner Loop Header: Depth=1 +; GFX1064GISEL-NEXT: s_ff1_i32_b64 s9, s[6:7] +; GFX1064GISEL-NEXT: v_readlane_b32 s10, v2, s9 +; GFX1064GISEL-NEXT: s_bitset0_b64 s[6:7], s9 +; GFX1064GISEL-NEXT: s_cmp_lg_u64 s[6:7], 0 +; GFX1064GISEL-NEXT: v_max_f32_e64 v3, s8, s10 +; GFX1064GISEL-NEXT: v_readfirstlane_b32 s8, v3 +; GFX1064GISEL-NEXT: ; implicit-def: $vgpr3 +; GFX1064GISEL-NEXT: s_cbranch_scc1 .LBB2_2 +; GFX1064GISEL-NEXT: .LBB2_3: ; %Flow +; GFX1064GISEL-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; GFX1064GISEL-NEXT: s_cbranch_execz .LBB2_6 +; GFX1064GISEL-NEXT: ; %bb.4: ; %if +; GFX1064GISEL-NEXT: s_mov_b64 s[6:7], exec +; GFX1064GISEL-NEXT: s_mov_b32 s8, 0x7fc00000 +; GFX1064GISEL-NEXT: .LBB2_5: ; =>This Inner Loop Header: Depth=1 +; GFX1064GISEL-NEXT: s_ff1_i32_b64 s9, s[6:7] +; GFX1064GISEL-NEXT: v_readlane_b32 s10, v3, s9 +; GFX1064GISEL-NEXT: s_bitset0_b64 s[6:7], s9 +; GFX1064GISEL-NEXT: s_cmp_lg_u64 s[6:7], 0 +; GFX1064GISEL-NEXT: v_max_f32_e64 v2, s8, s10 +; GFX1064GISEL-NEXT: v_readfirstlane_b32 s8, v2 +; GFX1064GISEL-NEXT: s_cbranch_scc1 .LBB2_5 +; GFX1064GISEL-NEXT: .LBB2_6: ; %endif +; GFX1064GISEL-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX1064GISEL-NEXT: v_mov_b32_e32 v2, s8 +; GFX1064GISEL-NEXT: global_store_dword v[0:1], v2, off +; GFX1064GISEL-NEXT: s_setpc_b64 s[30:31] +; +; GFX1032DAGISEL-LABEL: divergent_cfg_float: +; GFX1032DAGISEL: ; %bb.0: ; %entry +; GFX1032DAGISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX1032DAGISEL-NEXT: v_and_b32_e32 v4, 0x3ff, v31 +; GFX1032DAGISEL-NEXT: v_cmp_lt_u32_e32 vcc_lo, 15, v4 +; GFX1032DAGISEL-NEXT: ; implicit-def: $vgpr4 +; GFX1032DAGISEL-NEXT: s_and_saveexec_b32 s4, vcc_lo +; GFX1032DAGISEL-NEXT: s_xor_b32 s4, exec_lo, s4 +; GFX1032DAGISEL-NEXT: s_cbranch_execz .LBB2_4 +; GFX1032DAGISEL-NEXT: ; %bb.1: ; %else +; GFX1032DAGISEL-NEXT: s_mov_b32 s5, exec_lo +; GFX1032DAGISEL-NEXT: s_mov_b32 s6, 0x7fc00000 +; 
GFX1032DAGISEL-NEXT: .LBB2_2: ; =>This Inner Loop Header: Depth=1 +; GFX1032DAGISEL-NEXT: s_ff1_i32_b32 s7, s5 +; GFX1032DAGISEL-NEXT: v_readlane_b32 s8, v2, s7 +; GFX1032DAGISEL-NEXT: s_bitset0_b32 s5, s7 +; GFX1032DAGISEL-NEXT: s_cmp_lg_u32 s5, 0 +; GFX1032DAGISEL-NEXT: v_max_f32_e64 v3, s6, s8 +; GFX1032DAGISEL-NEXT: v_readfirstlane_b32 s6, v3 +; GFX1032DAGISEL-NEXT: s_cbranch_scc1 .LBB2_2 +; GFX1032DAGISEL-NEXT: ; %bb.3: +; GFX1032DAGISEL-NEXT: v_mov_b32_e32 v4, s6 +; GFX1032DAGISEL-NEXT: ; implicit-def: $vgpr3 +; GFX1032DAGISEL-NEXT: .LBB2_4: ; %Flow +; GFX1032DAGISEL-NEXT: s_andn2_saveexec_b32 s4, s4 +; GFX1032DAGISEL-NEXT: s_cbranch_execz .LBB2_8 +; GFX1032DAGISEL-NEXT: ; %bb.5: ; %if +; GFX1032DAGISEL-NEXT: s_mov_b32 s5, exec_lo +; GFX1032DAGISEL-NEXT: s_mov_b32 s6, 0x7fc00000 +; GFX1032DAGISEL-NEXT: .LBB2_6: ; =>This Inner Loop Header: Depth=1 +; GFX1032DAGISEL-NEXT: s_ff1_i32_b32 s7, s5 +; GFX1032DAGISEL-NEXT: v_readlane_b32 s8, v3, s7 +; GFX1032DAGISEL-NEXT: s_bitset0_b32 s5, s7 +; GFX1032DAGISEL-NEXT: s_cmp_lg_u32 s5, 0 +; GFX1032DAGISEL-NEXT: v_max_f32_e64 v2, s6, s8 +; GFX1032DAGISEL-NEXT: v_readfirstlane_b32 s6, v2 +; GFX1032DAGISEL-NEXT: s_cbranch_scc1 .LBB2_6 +; GFX1032DAGISEL-NEXT: ; %bb.7: +; GFX1032DAGISEL-NEXT: v_mov_b32_e32 v4, s6 +; GFX1032DAGISEL-NEXT: .LBB2_8: ; %endif +; GFX1032DAGISEL-NEXT: s_or_b32 exec_lo, exec_lo, s4 +; GFX1032DAGISEL-NEXT: global_store_dword v[0:1], v4, off +; GFX1032DAGISEL-NEXT: s_setpc_b64 s[30:31] +; +; GFX1032GISEL-LABEL: divergent_cfg_float: +; GFX1032GISEL: ; %bb.0: ; %entry +; GFX1032GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX1032GISEL-NEXT: v_and_b32_e32 v4, 0x3ff, v31 +; GFX1032GISEL-NEXT: ; implicit-def: $sgpr4 +; GFX1032GISEL-NEXT: v_cmp_le_u32_e32 vcc_lo, 16, v4 +; GFX1032GISEL-NEXT: s_and_saveexec_b32 s5, vcc_lo +; GFX1032GISEL-NEXT: s_xor_b32 s5, exec_lo, s5 +; GFX1032GISEL-NEXT: s_cbranch_execz .LBB2_3 +; GFX1032GISEL-NEXT: ; %bb.1: ; %else +; GFX1032GISEL-NEXT: s_mov_b32 s6, exec_lo +; 
GFX1032GISEL-NEXT: s_mov_b32 s4, 0x7fc00000 +; GFX1032GISEL-NEXT: .LBB2_2: ; =>This Inner Loop Header: Depth=1 +; GFX1032GISEL-NEXT: s_ff1_i32_b32 s7, s6 +; GFX1032GISEL-NEXT: v_readlane_b32 s8, v2, s7 +; GFX1032GISEL-NEXT: s_bitset0_b32 s6, s7 +; GFX1032GISEL-NEXT: s_cmp_lg_u32 s6, 0 +; GFX1032GISEL-NEXT: v_max_f32_e64 v3, s4, s8 +; GFX1032GISEL-NEXT: v_readfirstlane_b32 s4, v3 +; GFX1032GISEL-NEXT: ; implicit-def: $vgpr3 +; GFX1032GISEL-NEXT: s_cbranch_scc1 .LBB2_2 +; GFX1032GISEL-NEXT: .LBB2_3: ; %Flow +; GFX1032GISEL-NEXT: s_andn2_saveexec_b32 s5, s5 +; GFX1032GISEL-NEXT: s_cbranch_execz .LBB2_6 +; GFX1032GISEL-NEXT: ; %bb.4: ; %if +; GFX1032GISEL-NEXT: s_mov_b32 s6, exec_lo +; GFX1032GISEL-NEXT: s_mov_b32 s4, 0x7fc00000 +; GFX1032GISEL-NEXT: .LBB2_5: ; =>This Inner Loop Header: Depth=1 +; GFX1032GISEL-NEXT: s_ff1_i32_b32 s7, s6 +; GFX1032GISEL-NEXT: v_readlane_b32 s8, v3, s7 +; GFX1032GISEL-NEXT: s_bitset0_b32 s6, s7 +; GFX1032GISEL-NEXT: s_cmp_lg_u32 s6, 0 +; GFX1032GISEL-NEXT: v_max_f32_e64 v2, s4, s8 +; GFX1032GISEL-NEXT: v_readfirstlane_b32 s4, v2 +; GFX1032GISEL-NEXT: s_cbranch_scc1 .LBB2_5 +; GFX1032GISEL-NEXT: .LBB2_6: ; %endif +; GFX1032GISEL-NEXT: s_or_b32 exec_lo, exec_lo, s5 +; GFX1032GISEL-NEXT: v_mov_b32_e32 v2, s4 +; GFX1032GISEL-NEXT: global_store_dword v[0:1], v2, off +; GFX1032GISEL-NEXT: s_setpc_b64 s[30:31] +; +; GFX1164DAGISEL-LABEL: divergent_cfg_float: +; GFX1164DAGISEL: ; %bb.0: ; %entry +; GFX1164DAGISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX1164DAGISEL-NEXT: v_and_b32_e32 v4, 0x3ff, v31 +; GFX1164DAGISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) +; GFX1164DAGISEL-NEXT: v_cmp_lt_u32_e32 vcc, 15, v4 +; GFX1164DAGISEL-NEXT: ; implicit-def: $vgpr4 +; GFX1164DAGISEL-NEXT: s_and_saveexec_b64 s[0:1], vcc +; GFX1164DAGISEL-NEXT: s_xor_b64 s[0:1], exec, s[0:1] +; GFX1164DAGISEL-NEXT: s_cbranch_execz .LBB2_4 +; GFX1164DAGISEL-NEXT: ; %bb.1: ; %else +; GFX1164DAGISEL-NEXT: s_mov_b64 s[2:3], 
exec +; GFX1164DAGISEL-NEXT: s_mov_b32 s4, 0x7fc00000 +; GFX1164DAGISEL-NEXT: .LBB2_2: ; =>This Inner Loop Header: Depth=1 +; GFX1164DAGISEL-NEXT: s_ctz_i32_b64 s5, s[2:3] +; GFX1164DAGISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) +; GFX1164DAGISEL-NEXT: v_readlane_b32 s6, v2, s5 +; GFX1164DAGISEL-NEXT: s_bitset0_b64 s[2:3], s5 +; GFX1164DAGISEL-NEXT: s_cmp_lg_u64 s[2:3], 0 +; GFX1164DAGISEL-NEXT: v_max_f32_e64 v3, s4, s6 +; GFX1164DAGISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1164DAGISEL-NEXT: v_readfirstlane_b32 s4, v3 +; GFX1164DAGISEL-NEXT: s_cbranch_scc1 .LBB2_2 +; GFX1164DAGISEL-NEXT: ; %bb.3: +; GFX1164DAGISEL-NEXT: v_mov_b32_e32 v4, s4 +; GFX1164DAGISEL-NEXT: ; implicit-def: $vgpr3 +; GFX1164DAGISEL-NEXT: .LBB2_4: ; %Flow +; GFX1164DAGISEL-NEXT: s_and_not1_saveexec_b64 s[0:1], s[0:1] +; GFX1164DAGISEL-NEXT: s_cbranch_execz .LBB2_8 +; GFX1164DAGISEL-NEXT: ; %bb.5: ; %if +; GFX1164DAGISEL-NEXT: s_mov_b64 s[2:3], exec +; GFX1164DAGISEL-NEXT: s_mov_b32 s4, 0x7fc00000 +; GFX1164DAGISEL-NEXT: .LBB2_6: ; =>This Inner Loop Header: Depth=1 +; GFX1164DAGISEL-NEXT: s_ctz_i32_b64 s5, s[2:3] +; GFX1164DAGISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) +; GFX1164DAGISEL-NEXT: v_readlane_b32 s6, v3, s5 +; GFX1164DAGISEL-NEXT: s_bitset0_b64 s[2:3], s5 +; GFX1164DAGISEL-NEXT: s_cmp_lg_u64 s[2:3], 0 +; GFX1164DAGISEL-NEXT: v_max_f32_e64 v2, s4, s6 +; GFX1164DAGISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1164DAGISEL-NEXT: v_readfirstlane_b32 s4, v2 +; GFX1164DAGISEL-NEXT: s_cbranch_scc1 .LBB2_6 +; GFX1164DAGISEL-NEXT: ; %bb.7: +; GFX1164DAGISEL-NEXT: v_mov_b32_e32 v4, s4 +; GFX1164DAGISEL-NEXT: .LBB2_8: ; %endif +; GFX1164DAGISEL-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX1164DAGISEL-NEXT: global_store_b32 v[0:1], v4, off +; GFX1164DAGISEL-NEXT: s_setpc_b64 s[30:31] +; +; GFX1164GISEL-LABEL: divergent_cfg_float: +; GFX1164GISEL: ; %bb.0: ; %entry +; GFX1164GISEL-NEXT: s_waitcnt vmcnt(0) 
expcnt(0) lgkmcnt(0) +; GFX1164GISEL-NEXT: v_and_b32_e32 v4, 0x3ff, v31 +; GFX1164GISEL-NEXT: s_mov_b64 s[0:1], exec +; GFX1164GISEL-NEXT: ; implicit-def: $sgpr4 +; GFX1164GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1164GISEL-NEXT: v_cmpx_le_u32_e32 16, v4 +; GFX1164GISEL-NEXT: s_xor_b64 s[0:1], exec, s[0:1] +; GFX1164GISEL-NEXT: s_cbranch_execz .LBB2_3 +; GFX1164GISEL-NEXT: ; %bb.1: ; %else +; GFX1164GISEL-NEXT: s_mov_b64 s[2:3], exec +; GFX1164GISEL-NEXT: s_mov_b32 s4, 0x7fc00000 +; GFX1164GISEL-NEXT: .LBB2_2: ; =>This Inner Loop Header: Depth=1 +; GFX1164GISEL-NEXT: s_ctz_i32_b64 s5, s[2:3] +; GFX1164GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) +; GFX1164GISEL-NEXT: v_readlane_b32 s6, v2, s5 +; GFX1164GISEL-NEXT: s_bitset0_b64 s[2:3], s5 +; GFX1164GISEL-NEXT: s_cmp_lg_u64 s[2:3], 0 +; GFX1164GISEL-NEXT: v_max_f32_e64 v3, s4, s6 +; GFX1164GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1164GISEL-NEXT: v_readfirstlane_b32 s4, v3 +; GFX1164GISEL-NEXT: ; implicit-def: $vgpr3 +; GFX1164GISEL-NEXT: s_cbranch_scc1 .LBB2_2 +; GFX1164GISEL-NEXT: .LBB2_3: ; %Flow +; GFX1164GISEL-NEXT: s_and_not1_saveexec_b64 s[0:1], s[0:1] +; GFX1164GISEL-NEXT: s_cbranch_execz .LBB2_6 +; GFX1164GISEL-NEXT: ; %bb.4: ; %if +; GFX1164GISEL-NEXT: s_mov_b64 s[2:3], exec +; GFX1164GISEL-NEXT: s_mov_b32 s4, 0x7fc00000 +; GFX1164GISEL-NEXT: .LBB2_5: ; =>This Inner Loop Header: Depth=1 +; GFX1164GISEL-NEXT: s_ctz_i32_b64 s5, s[2:3] +; GFX1164GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) +; GFX1164GISEL-NEXT: v_readlane_b32 s6, v3, s5 +; GFX1164GISEL-NEXT: s_bitset0_b64 s[2:3], s5 +; GFX1164GISEL-NEXT: s_cmp_lg_u64 s[2:3], 0 +; GFX1164GISEL-NEXT: v_max_f32_e64 v2, s4, s6 +; GFX1164GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1164GISEL-NEXT: v_readfirstlane_b32 s4, v2 +; GFX1164GISEL-NEXT: s_cbranch_scc1 .LBB2_5 +; GFX1164GISEL-NEXT: .LBB2_6: ; %endif +; GFX1164GISEL-NEXT: s_or_b64 exec, exec, s[0:1] 
+; GFX1164GISEL-NEXT: v_mov_b32_e32 v2, s4 +; GFX1164GISEL-NEXT: global_store_b32 v[0:1], v2, off +; GFX1164GISEL-NEXT: s_setpc_b64 s[30:31] +; +; GFX1132DAGISEL-LABEL: divergent_cfg_float: +; GFX1132DAGISEL: ; %bb.0: ; %entry +; GFX1132DAGISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX1132DAGISEL-NEXT: v_and_b32_e32 v4, 0x3ff, v31 +; GFX1132DAGISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) +; GFX1132DAGISEL-NEXT: v_cmp_lt_u32_e32 vcc_lo, 15, v4 +; GFX1132DAGISEL-NEXT: ; implicit-def: $vgpr4 +; GFX1132DAGISEL-NEXT: s_and_saveexec_b32 s0, vcc_lo +; GFX1132DAGISEL-NEXT: s_xor_b32 s0, exec_lo, s0 +; GFX1132DAGISEL-NEXT: s_cbranch_execz .LBB2_4 +; GFX1132DAGISEL-NEXT: ; %bb.1: ; %else +; GFX1132DAGISEL-NEXT: s_mov_b32 s1, exec_lo +; GFX1132DAGISEL-NEXT: s_mov_b32 s2, 0x7fc00000 +; GFX1132DAGISEL-NEXT: .LBB2_2: ; =>This Inner Loop Header: Depth=1 +; GFX1132DAGISEL-NEXT: s_ctz_i32_b32 s3, s1 +; GFX1132DAGISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) +; GFX1132DAGISEL-NEXT: v_readlane_b32 s4, v2, s3 +; GFX1132DAGISEL-NEXT: s_bitset0_b32 s1, s3 +; GFX1132DAGISEL-NEXT: s_cmp_lg_u32 s1, 0 +; GFX1132DAGISEL-NEXT: v_max_f32_e64 v3, s2, s4 +; GFX1132DAGISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1132DAGISEL-NEXT: v_readfirstlane_b32 s2, v3 +; GFX1132DAGISEL-NEXT: s_cbranch_scc1 .LBB2_2 +; GFX1132DAGISEL-NEXT: ; %bb.3: +; GFX1132DAGISEL-NEXT: v_mov_b32_e32 v4, s2 +; GFX1132DAGISEL-NEXT: ; implicit-def: $vgpr3 +; GFX1132DAGISEL-NEXT: .LBB2_4: ; %Flow +; GFX1132DAGISEL-NEXT: s_and_not1_saveexec_b32 s0, s0 +; GFX1132DAGISEL-NEXT: s_cbranch_execz .LBB2_8 +; GFX1132DAGISEL-NEXT: ; %bb.5: ; %if +; GFX1132DAGISEL-NEXT: s_mov_b32 s1, exec_lo +; GFX1132DAGISEL-NEXT: s_mov_b32 s2, 0x7fc00000 +; GFX1132DAGISEL-NEXT: .LBB2_6: ; =>This Inner Loop Header: Depth=1 +; GFX1132DAGISEL-NEXT: s_ctz_i32_b32 s3, s1 +; GFX1132DAGISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | 
instid1(SALU_CYCLE_1) +; GFX1132DAGISEL-NEXT: v_readlane_b32 s4, v3, s3 +; GFX1132DAGISEL-NEXT: s_bitset0_b32 s1, s3 +; GFX1132DAGISEL-NEXT: s_cmp_lg_u32 s1, 0 +; GFX1132DAGISEL-NEXT: v_max_f32_e64 v2, s2, s4 +; GFX1132DAGISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1132DAGISEL-NEXT: v_readfirstlane_b32 s2, v2 +; GFX1132DAGISEL-NEXT: s_cbranch_scc1 .LBB2_6 +; GFX1132DAGISEL-NEXT: ; %bb.7: +; GFX1132DAGISEL-NEXT: v_mov_b32_e32 v4, s2 +; GFX1132DAGISEL-NEXT: .LBB2_8: ; %endif +; GFX1132DAGISEL-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX1132DAGISEL-NEXT: global_store_b32 v[0:1], v4, off +; GFX1132DAGISEL-NEXT: s_setpc_b64 s[30:31] +; +; GFX1132GISEL-LABEL: divergent_cfg_float: +; GFX1132GISEL: ; %bb.0: ; %entry +; GFX1132GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX1132GISEL-NEXT: v_and_b32_e32 v4, 0x3ff, v31 +; GFX1132GISEL-NEXT: s_mov_b32 s1, exec_lo +; GFX1132GISEL-NEXT: ; implicit-def: $sgpr0 +; GFX1132GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1132GISEL-NEXT: v_cmpx_le_u32_e32 16, v4 +; GFX1132GISEL-NEXT: s_xor_b32 s1, exec_lo, s1 +; GFX1132GISEL-NEXT: s_cbranch_execz .LBB2_3 +; GFX1132GISEL-NEXT: ; %bb.1: ; %else +; GFX1132GISEL-NEXT: s_mov_b32 s2, exec_lo +; GFX1132GISEL-NEXT: s_mov_b32 s0, 0x7fc00000 +; GFX1132GISEL-NEXT: .LBB2_2: ; =>This Inner Loop Header: Depth=1 +; GFX1132GISEL-NEXT: s_ctz_i32_b32 s3, s2 +; GFX1132GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) +; GFX1132GISEL-NEXT: v_readlane_b32 s4, v2, s3 +; GFX1132GISEL-NEXT: s_bitset0_b32 s2, s3 +; GFX1132GISEL-NEXT: s_cmp_lg_u32 s2, 0 +; GFX1132GISEL-NEXT: v_max_f32_e64 v3, s0, s4 +; GFX1132GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1132GISEL-NEXT: v_readfirstlane_b32 s0, v3 +; GFX1132GISEL-NEXT: ; implicit-def: $vgpr3 +; GFX1132GISEL-NEXT: s_cbranch_scc1 .LBB2_2 +; GFX1132GISEL-NEXT: .LBB2_3: ; %Flow +; GFX1132GISEL-NEXT: s_and_not1_saveexec_b32 s1, s1 +; GFX1132GISEL-NEXT: s_cbranch_execz .LBB2_6 +; GFX1132GISEL-NEXT: ; %bb.4: ; 
%if +; GFX1132GISEL-NEXT: s_mov_b32 s2, exec_lo +; GFX1132GISEL-NEXT: s_mov_b32 s0, 0x7fc00000 +; GFX1132GISEL-NEXT: .LBB2_5: ; =>This Inner Loop Header: Depth=1 +; GFX1132GISEL-NEXT: s_ctz_i32_b32 s3, s2 +; GFX1132GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) +; GFX1132GISEL-NEXT: v_readlane_b32 s4, v3, s3 +; GFX1132GISEL-NEXT: s_bitset0_b32 s2, s3 +; GFX1132GISEL-NEXT: s_cmp_lg_u32 s2, 0 +; GFX1132GISEL-NEXT: v_max_f32_e64 v2, s0, s4 +; GFX1132GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1132GISEL-NEXT: v_readfirstlane_b32 s0, v2 +; GFX1132GISEL-NEXT: s_cbranch_scc1 .LBB2_5 +; GFX1132GISEL-NEXT: .LBB2_6: ; %endif +; GFX1132GISEL-NEXT: s_or_b32 exec_lo, exec_lo, s1 +; GFX1132GISEL-NEXT: v_mov_b32_e32 v2, s0 +; GFX1132GISEL-NEXT: global_store_b32 v[0:1], v2, off +; GFX1132GISEL-NEXT: s_setpc_b64 s[30:31] +entry: + %tid = call i32 @llvm.amdgcn.workitem.id.x() + %d_cmp = icmp ult i32 %tid, 16 + br i1 %d_cmp, label %if, label %else + +if: + %reducedValTid = call float @llvm.amdgcn.wave.reduce.fmax(float %in2, i32 1) + br label %endif + +else: + %reducedValIn = call float @llvm.amdgcn.wave.reduce.fmax(float %in, i32 1) + br label %endif + +endif: + %combine = phi float [%reducedValTid, %if], [%reducedValIn, %else] + store float %combine, ptr addrspace(1) %out + ret void +} +;; NOTE: These prefixes are unused and the list is autogenerated. 
Do not add tests below this line: +; GFX11DAGISEL: {{.*}} +; GFX11GISEL: {{.*}} diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.reduce.fmin.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.reduce.fmin.ll new file mode 100644 index 0000000000000..cb093cb14c4b5 --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.reduce.fmin.ll @@ -0,0 +1,928 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 6 +; RUN: llc -mtriple=amdgcn -mcpu=tonga -global-isel=0 < %s | FileCheck -check-prefixes=GFX8DAGISEL %s +; RUN: llc -mtriple=amdgcn -mcpu=tonga -global-isel=1 < %s | FileCheck -check-prefixes=GFX8GISEL %s +; RUN: llc -mtriple=amdgcn -mcpu=gfx900 -global-isel=0 < %s | FileCheck -check-prefixes=GFX9DAGISEL %s +; RUN: llc -mtriple=amdgcn -mcpu=gfx900 -global-isel=1 < %s | FileCheck -check-prefixes=GFX9GISEL %s +; RUN: llc -mtriple=amdgcn -mcpu=gfx1010 -global-isel=0 -mattr=+wavefrontsize64 < %s | FileCheck -check-prefixes=GFX10DAGISEL,GFX1064DAGISEL %s +; RUN: llc -mtriple=amdgcn -mcpu=gfx1010 -global-isel=1 -mattr=+wavefrontsize64 < %s | FileCheck -check-prefixes=GFX10GISEL,GFX1064GISEL %s +; RUN: llc -mtriple=amdgcn -mcpu=gfx1010 -global-isel=0 < %s | FileCheck -check-prefixes=GFX10DAGISEL,GFX1032DAGISEL %s +; RUN: llc -mtriple=amdgcn -mcpu=gfx1010 -global-isel=1 < %s | FileCheck -check-prefixes=GFX10GISEL,GFX1032GISEL %s +; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -global-isel=0 -mattr=+wavefrontsize64 < %s | FileCheck -check-prefixes=GFX11DAGISEL,GFX1164DAGISEL %s +; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -global-isel=1 -mattr=+wavefrontsize64 < %s | FileCheck -check-prefixes=GFX11GISEL,GFX1164GISEL %s +; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -global-isel=0 < %s | FileCheck -check-prefixes=GFX11DAGISEL,GFX1132DAGISEL %s +; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -global-isel=1 < %s | FileCheck -check-prefixes=GFX11GISEL,GFX1132GISEL %s + + +define amdgpu_kernel void @uniform_value_float(ptr addrspace(1) %out, float %in) { +; 
GFX8DAGISEL-LABEL: uniform_value_float: +; GFX8DAGISEL: ; %bb.0: ; %entry +; GFX8DAGISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 +; GFX8DAGISEL-NEXT: s_load_dword s2, s[4:5], 0x2c +; GFX8DAGISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX8DAGISEL-NEXT: v_mov_b32_e32 v0, s0 +; GFX8DAGISEL-NEXT: v_mov_b32_e32 v1, s1 +; GFX8DAGISEL-NEXT: v_mov_b32_e32 v2, s2 +; GFX8DAGISEL-NEXT: flat_store_dword v[0:1], v2 +; GFX8DAGISEL-NEXT: s_endpgm +; +; GFX8GISEL-LABEL: uniform_value_float: +; GFX8GISEL: ; %bb.0: ; %entry +; GFX8GISEL-NEXT: s_load_dword s2, s[4:5], 0x2c +; GFX8GISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 +; GFX8GISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX8GISEL-NEXT: v_mov_b32_e32 v2, s2 +; GFX8GISEL-NEXT: v_mov_b32_e32 v0, s0 +; GFX8GISEL-NEXT: v_mov_b32_e32 v1, s1 +; GFX8GISEL-NEXT: flat_store_dword v[0:1], v2 +; GFX8GISEL-NEXT: s_endpgm +; +; GFX9DAGISEL-LABEL: uniform_value_float: +; GFX9DAGISEL: ; %bb.0: ; %entry +; GFX9DAGISEL-NEXT: s_load_dword s2, s[4:5], 0x2c +; GFX9DAGISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 +; GFX9DAGISEL-NEXT: v_mov_b32_e32 v0, 0 +; GFX9DAGISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX9DAGISEL-NEXT: v_mov_b32_e32 v1, s2 +; GFX9DAGISEL-NEXT: global_store_dword v0, v1, s[0:1] +; GFX9DAGISEL-NEXT: s_endpgm +; +; GFX9GISEL-LABEL: uniform_value_float: +; GFX9GISEL: ; %bb.0: ; %entry +; GFX9GISEL-NEXT: s_load_dword s2, s[4:5], 0x2c +; GFX9GISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 +; GFX9GISEL-NEXT: v_mov_b32_e32 v1, 0 +; GFX9GISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX9GISEL-NEXT: v_mov_b32_e32 v0, s2 +; GFX9GISEL-NEXT: global_store_dword v1, v0, s[0:1] +; GFX9GISEL-NEXT: s_endpgm +; +; GFX10DAGISEL-LABEL: uniform_value_float: +; GFX10DAGISEL: ; %bb.0: ; %entry +; GFX10DAGISEL-NEXT: s_clause 0x1 +; GFX10DAGISEL-NEXT: s_load_dword s2, s[4:5], 0x2c +; GFX10DAGISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 +; GFX10DAGISEL-NEXT: v_mov_b32_e32 v0, 0 +; GFX10DAGISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX10DAGISEL-NEXT: v_mov_b32_e32 v1, s2 +; GFX10DAGISEL-NEXT: 
global_store_dword v0, v1, s[0:1] +; GFX10DAGISEL-NEXT: s_endpgm +; +; GFX10GISEL-LABEL: uniform_value_float: +; GFX10GISEL: ; %bb.0: ; %entry +; GFX10GISEL-NEXT: s_clause 0x1 +; GFX10GISEL-NEXT: s_load_dword s2, s[4:5], 0x2c +; GFX10GISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 +; GFX10GISEL-NEXT: v_mov_b32_e32 v1, 0 +; GFX10GISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX10GISEL-NEXT: v_mov_b32_e32 v0, s2 +; GFX10GISEL-NEXT: global_store_dword v1, v0, s[0:1] +; GFX10GISEL-NEXT: s_endpgm +; +; GFX1164DAGISEL-LABEL: uniform_value_float: +; GFX1164DAGISEL: ; %bb.0: ; %entry +; GFX1164DAGISEL-NEXT: s_clause 0x1 +; GFX1164DAGISEL-NEXT: s_load_b32 s2, s[4:5], 0x2c +; GFX1164DAGISEL-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 +; GFX1164DAGISEL-NEXT: v_mov_b32_e32 v0, 0 +; GFX1164DAGISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX1164DAGISEL-NEXT: v_mov_b32_e32 v1, s2 +; GFX1164DAGISEL-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX1164DAGISEL-NEXT: s_endpgm +; +; GFX1164GISEL-LABEL: uniform_value_float: +; GFX1164GISEL: ; %bb.0: ; %entry +; GFX1164GISEL-NEXT: s_clause 0x1 +; GFX1164GISEL-NEXT: s_load_b32 s2, s[4:5], 0x2c +; GFX1164GISEL-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 +; GFX1164GISEL-NEXT: v_mov_b32_e32 v1, 0 +; GFX1164GISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX1164GISEL-NEXT: v_mov_b32_e32 v0, s2 +; GFX1164GISEL-NEXT: global_store_b32 v1, v0, s[0:1] +; GFX1164GISEL-NEXT: s_endpgm +; +; GFX1132DAGISEL-LABEL: uniform_value_float: +; GFX1132DAGISEL: ; %bb.0: ; %entry +; GFX1132DAGISEL-NEXT: s_clause 0x1 +; GFX1132DAGISEL-NEXT: s_load_b32 s2, s[4:5], 0x2c +; GFX1132DAGISEL-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 +; GFX1132DAGISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX1132DAGISEL-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2 +; GFX1132DAGISEL-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX1132DAGISEL-NEXT: s_endpgm +; +; GFX1132GISEL-LABEL: uniform_value_float: +; GFX1132GISEL: ; %bb.0: ; %entry +; GFX1132GISEL-NEXT: s_clause 0x1 +; GFX1132GISEL-NEXT: s_load_b32 s2, s[4:5], 0x2c +; GFX1132GISEL-NEXT: 
s_load_b64 s[0:1], s[4:5], 0x24 +; GFX1132GISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX1132GISEL-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v0, s2 +; GFX1132GISEL-NEXT: global_store_b32 v1, v0, s[0:1] +; GFX1132GISEL-NEXT: s_endpgm +entry: + %result = call float @llvm.amdgcn.wave.reduce.fmin(float %in, i32 1) + store float %result, ptr addrspace(1) %out + ret void +} + +define void @divergent_value_float(ptr addrspace(1) %out, float %in) { +; GFX8DAGISEL-LABEL: divergent_value_float: +; GFX8DAGISEL: ; %bb.0: ; %entry +; GFX8DAGISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX8DAGISEL-NEXT: s_mov_b64 s[4:5], exec +; GFX8DAGISEL-NEXT: s_mov_b32 s6, 0x7fc00000 +; GFX8DAGISEL-NEXT: .LBB1_1: ; =>This Inner Loop Header: Depth=1 +; GFX8DAGISEL-NEXT: s_ff1_i32_b64 s7, s[4:5] +; GFX8DAGISEL-NEXT: v_readlane_b32 s8, v2, s7 +; GFX8DAGISEL-NEXT: v_mov_b32_e32 v3, s8 +; GFX8DAGISEL-NEXT: s_bitset0_b64 s[4:5], s7 +; GFX8DAGISEL-NEXT: v_min_f32_e32 v3, s6, v3 +; GFX8DAGISEL-NEXT: s_cmp_lg_u64 s[4:5], 0 +; GFX8DAGISEL-NEXT: v_readfirstlane_b32 s6, v3 +; GFX8DAGISEL-NEXT: s_cbranch_scc1 .LBB1_1 +; GFX8DAGISEL-NEXT: ; %bb.2: +; GFX8DAGISEL-NEXT: v_mov_b32_e32 v2, s6 +; GFX8DAGISEL-NEXT: flat_store_dword v[0:1], v2 +; GFX8DAGISEL-NEXT: s_waitcnt vmcnt(0) +; GFX8DAGISEL-NEXT: s_setpc_b64 s[30:31] +; +; GFX8GISEL-LABEL: divergent_value_float: +; GFX8GISEL: ; %bb.0: ; %entry +; GFX8GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX8GISEL-NEXT: s_mov_b64 s[4:5], exec +; GFX8GISEL-NEXT: s_mov_b32 s6, 0x7fc00000 +; GFX8GISEL-NEXT: .LBB1_1: ; =>This Inner Loop Header: Depth=1 +; GFX8GISEL-NEXT: s_ff1_i32_b64 s7, s[4:5] +; GFX8GISEL-NEXT: v_readlane_b32 s8, v2, s7 +; GFX8GISEL-NEXT: v_mov_b32_e32 v3, s8 +; GFX8GISEL-NEXT: s_bitset0_b64 s[4:5], s7 +; GFX8GISEL-NEXT: v_min_f32_e32 v3, s6, v3 +; GFX8GISEL-NEXT: s_cmp_lg_u64 s[4:5], 0 +; GFX8GISEL-NEXT: v_readfirstlane_b32 s6, v3 +; GFX8GISEL-NEXT: s_cbranch_scc1 .LBB1_1 +; GFX8GISEL-NEXT: ; %bb.2: +; GFX8GISEL-NEXT: v_mov_b32_e32 
v2, s6 +; GFX8GISEL-NEXT: flat_store_dword v[0:1], v2 +; GFX8GISEL-NEXT: s_waitcnt vmcnt(0) +; GFX8GISEL-NEXT: s_setpc_b64 s[30:31] +; +; GFX9DAGISEL-LABEL: divergent_value_float: +; GFX9DAGISEL: ; %bb.0: ; %entry +; GFX9DAGISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9DAGISEL-NEXT: s_mov_b64 s[4:5], exec +; GFX9DAGISEL-NEXT: s_mov_b32 s6, 0x7fc00000 +; GFX9DAGISEL-NEXT: .LBB1_1: ; =>This Inner Loop Header: Depth=1 +; GFX9DAGISEL-NEXT: s_ff1_i32_b64 s7, s[4:5] +; GFX9DAGISEL-NEXT: v_readlane_b32 s8, v2, s7 +; GFX9DAGISEL-NEXT: v_mov_b32_e32 v3, s8 +; GFX9DAGISEL-NEXT: s_bitset0_b64 s[4:5], s7 +; GFX9DAGISEL-NEXT: v_min_f32_e32 v3, s6, v3 +; GFX9DAGISEL-NEXT: s_cmp_lg_u64 s[4:5], 0 +; GFX9DAGISEL-NEXT: v_readfirstlane_b32 s6, v3 +; GFX9DAGISEL-NEXT: s_cbranch_scc1 .LBB1_1 +; GFX9DAGISEL-NEXT: ; %bb.2: +; GFX9DAGISEL-NEXT: v_mov_b32_e32 v2, s6 +; GFX9DAGISEL-NEXT: global_store_dword v[0:1], v2, off +; GFX9DAGISEL-NEXT: s_waitcnt vmcnt(0) +; GFX9DAGISEL-NEXT: s_setpc_b64 s[30:31] +; +; GFX9GISEL-LABEL: divergent_value_float: +; GFX9GISEL: ; %bb.0: ; %entry +; GFX9GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9GISEL-NEXT: s_mov_b64 s[4:5], exec +; GFX9GISEL-NEXT: s_mov_b32 s6, 0x7fc00000 +; GFX9GISEL-NEXT: .LBB1_1: ; =>This Inner Loop Header: Depth=1 +; GFX9GISEL-NEXT: s_ff1_i32_b64 s7, s[4:5] +; GFX9GISEL-NEXT: v_readlane_b32 s8, v2, s7 +; GFX9GISEL-NEXT: v_mov_b32_e32 v3, s8 +; GFX9GISEL-NEXT: s_bitset0_b64 s[4:5], s7 +; GFX9GISEL-NEXT: v_min_f32_e32 v3, s6, v3 +; GFX9GISEL-NEXT: s_cmp_lg_u64 s[4:5], 0 +; GFX9GISEL-NEXT: v_readfirstlane_b32 s6, v3 +; GFX9GISEL-NEXT: s_cbranch_scc1 .LBB1_1 +; GFX9GISEL-NEXT: ; %bb.2: +; GFX9GISEL-NEXT: v_mov_b32_e32 v2, s6 +; GFX9GISEL-NEXT: global_store_dword v[0:1], v2, off +; GFX9GISEL-NEXT: s_waitcnt vmcnt(0) +; GFX9GISEL-NEXT: s_setpc_b64 s[30:31] +; +; GFX1064DAGISEL-LABEL: divergent_value_float: +; GFX1064DAGISEL: ; %bb.0: ; %entry +; GFX1064DAGISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; 
GFX1064DAGISEL-NEXT: s_mov_b64 s[4:5], exec +; GFX1064DAGISEL-NEXT: s_mov_b32 s6, 0x7fc00000 +; GFX1064DAGISEL-NEXT: .LBB1_1: ; =>This Inner Loop Header: Depth=1 +; GFX1064DAGISEL-NEXT: s_ff1_i32_b64 s7, s[4:5] +; GFX1064DAGISEL-NEXT: v_readlane_b32 s8, v2, s7 +; GFX1064DAGISEL-NEXT: s_bitset0_b64 s[4:5], s7 +; GFX1064DAGISEL-NEXT: s_cmp_lg_u64 s[4:5], 0 +; GFX1064DAGISEL-NEXT: v_min_f32_e64 v3, s6, s8 +; GFX1064DAGISEL-NEXT: v_readfirstlane_b32 s6, v3 +; GFX1064DAGISEL-NEXT: s_cbranch_scc1 .LBB1_1 +; GFX1064DAGISEL-NEXT: ; %bb.2: +; GFX1064DAGISEL-NEXT: v_mov_b32_e32 v2, s6 +; GFX1064DAGISEL-NEXT: global_store_dword v[0:1], v2, off +; GFX1064DAGISEL-NEXT: s_setpc_b64 s[30:31] +; +; GFX1064GISEL-LABEL: divergent_value_float: +; GFX1064GISEL: ; %bb.0: ; %entry +; GFX1064GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX1064GISEL-NEXT: s_mov_b64 s[4:5], exec +; GFX1064GISEL-NEXT: s_mov_b32 s6, 0x7fc00000 +; GFX1064GISEL-NEXT: .LBB1_1: ; =>This Inner Loop Header: Depth=1 +; GFX1064GISEL-NEXT: s_ff1_i32_b64 s7, s[4:5] +; GFX1064GISEL-NEXT: v_readlane_b32 s8, v2, s7 +; GFX1064GISEL-NEXT: s_bitset0_b64 s[4:5], s7 +; GFX1064GISEL-NEXT: s_cmp_lg_u64 s[4:5], 0 +; GFX1064GISEL-NEXT: v_min_f32_e64 v3, s6, s8 +; GFX1064GISEL-NEXT: v_readfirstlane_b32 s6, v3 +; GFX1064GISEL-NEXT: s_cbranch_scc1 .LBB1_1 +; GFX1064GISEL-NEXT: ; %bb.2: +; GFX1064GISEL-NEXT: v_mov_b32_e32 v2, s6 +; GFX1064GISEL-NEXT: global_store_dword v[0:1], v2, off +; GFX1064GISEL-NEXT: s_setpc_b64 s[30:31] +; +; GFX1032DAGISEL-LABEL: divergent_value_float: +; GFX1032DAGISEL: ; %bb.0: ; %entry +; GFX1032DAGISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX1032DAGISEL-NEXT: s_mov_b32 s4, exec_lo +; GFX1032DAGISEL-NEXT: s_mov_b32 s5, 0x7fc00000 +; GFX1032DAGISEL-NEXT: .LBB1_1: ; =>This Inner Loop Header: Depth=1 +; GFX1032DAGISEL-NEXT: s_ff1_i32_b32 s6, s4 +; GFX1032DAGISEL-NEXT: v_readlane_b32 s7, v2, s6 +; GFX1032DAGISEL-NEXT: s_bitset0_b32 s4, s6 +; GFX1032DAGISEL-NEXT: s_cmp_lg_u32 s4, 0 +; 
GFX1032DAGISEL-NEXT: v_min_f32_e64 v3, s5, s7 +; GFX1032DAGISEL-NEXT: v_readfirstlane_b32 s5, v3 +; GFX1032DAGISEL-NEXT: s_cbranch_scc1 .LBB1_1 +; GFX1032DAGISEL-NEXT: ; %bb.2: +; GFX1032DAGISEL-NEXT: v_mov_b32_e32 v2, s5 +; GFX1032DAGISEL-NEXT: global_store_dword v[0:1], v2, off +; GFX1032DAGISEL-NEXT: s_setpc_b64 s[30:31] +; +; GFX1032GISEL-LABEL: divergent_value_float: +; GFX1032GISEL: ; %bb.0: ; %entry +; GFX1032GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX1032GISEL-NEXT: s_mov_b32 s4, exec_lo +; GFX1032GISEL-NEXT: s_mov_b32 s5, 0x7fc00000 +; GFX1032GISEL-NEXT: .LBB1_1: ; =>This Inner Loop Header: Depth=1 +; GFX1032GISEL-NEXT: s_ff1_i32_b32 s6, s4 +; GFX1032GISEL-NEXT: v_readlane_b32 s7, v2, s6 +; GFX1032GISEL-NEXT: s_bitset0_b32 s4, s6 +; GFX1032GISEL-NEXT: s_cmp_lg_u32 s4, 0 +; GFX1032GISEL-NEXT: v_min_f32_e64 v3, s5, s7 +; GFX1032GISEL-NEXT: v_readfirstlane_b32 s5, v3 +; GFX1032GISEL-NEXT: s_cbranch_scc1 .LBB1_1 +; GFX1032GISEL-NEXT: ; %bb.2: +; GFX1032GISEL-NEXT: v_mov_b32_e32 v2, s5 +; GFX1032GISEL-NEXT: global_store_dword v[0:1], v2, off +; GFX1032GISEL-NEXT: s_setpc_b64 s[30:31] +; +; GFX1164DAGISEL-LABEL: divergent_value_float: +; GFX1164DAGISEL: ; %bb.0: ; %entry +; GFX1164DAGISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX1164DAGISEL-NEXT: s_mov_b64 s[0:1], exec +; GFX1164DAGISEL-NEXT: s_mov_b32 s2, 0x7fc00000 +; GFX1164DAGISEL-NEXT: .LBB1_1: ; =>This Inner Loop Header: Depth=1 +; GFX1164DAGISEL-NEXT: s_ctz_i32_b64 s3, s[0:1] +; GFX1164DAGISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) +; GFX1164DAGISEL-NEXT: v_readlane_b32 s4, v2, s3 +; GFX1164DAGISEL-NEXT: s_bitset0_b64 s[0:1], s3 +; GFX1164DAGISEL-NEXT: s_cmp_lg_u64 s[0:1], 0 +; GFX1164DAGISEL-NEXT: v_min_f32_e64 v3, s2, s4 +; GFX1164DAGISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1164DAGISEL-NEXT: v_readfirstlane_b32 s2, v3 +; GFX1164DAGISEL-NEXT: s_cbranch_scc1 .LBB1_1 +; GFX1164DAGISEL-NEXT: ; %bb.2: +; GFX1164DAGISEL-NEXT: 
v_mov_b32_e32 v2, s2 +; GFX1164DAGISEL-NEXT: global_store_b32 v[0:1], v2, off +; GFX1164DAGISEL-NEXT: s_setpc_b64 s[30:31] +; +; GFX1164GISEL-LABEL: divergent_value_float: +; GFX1164GISEL: ; %bb.0: ; %entry +; GFX1164GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX1164GISEL-NEXT: s_mov_b64 s[0:1], exec +; GFX1164GISEL-NEXT: s_mov_b32 s2, 0x7fc00000 +; GFX1164GISEL-NEXT: .LBB1_1: ; =>This Inner Loop Header: Depth=1 +; GFX1164GISEL-NEXT: s_ctz_i32_b64 s3, s[0:1] +; GFX1164GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) +; GFX1164GISEL-NEXT: v_readlane_b32 s4, v2, s3 +; GFX1164GISEL-NEXT: s_bitset0_b64 s[0:1], s3 +; GFX1164GISEL-NEXT: s_cmp_lg_u64 s[0:1], 0 +; GFX1164GISEL-NEXT: v_min_f32_e64 v3, s2, s4 +; GFX1164GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1164GISEL-NEXT: v_readfirstlane_b32 s2, v3 +; GFX1164GISEL-NEXT: s_cbranch_scc1 .LBB1_1 +; GFX1164GISEL-NEXT: ; %bb.2: +; GFX1164GISEL-NEXT: v_mov_b32_e32 v2, s2 +; GFX1164GISEL-NEXT: global_store_b32 v[0:1], v2, off +; GFX1164GISEL-NEXT: s_setpc_b64 s[30:31] +; +; GFX1132DAGISEL-LABEL: divergent_value_float: +; GFX1132DAGISEL: ; %bb.0: ; %entry +; GFX1132DAGISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX1132DAGISEL-NEXT: s_mov_b32 s0, exec_lo +; GFX1132DAGISEL-NEXT: s_mov_b32 s1, 0x7fc00000 +; GFX1132DAGISEL-NEXT: .LBB1_1: ; =>This Inner Loop Header: Depth=1 +; GFX1132DAGISEL-NEXT: s_ctz_i32_b32 s2, s0 +; GFX1132DAGISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) +; GFX1132DAGISEL-NEXT: v_readlane_b32 s3, v2, s2 +; GFX1132DAGISEL-NEXT: s_bitset0_b32 s0, s2 +; GFX1132DAGISEL-NEXT: s_cmp_lg_u32 s0, 0 +; GFX1132DAGISEL-NEXT: v_min_f32_e64 v3, s1, s3 +; GFX1132DAGISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1132DAGISEL-NEXT: v_readfirstlane_b32 s1, v3 +; GFX1132DAGISEL-NEXT: s_cbranch_scc1 .LBB1_1 +; GFX1132DAGISEL-NEXT: ; %bb.2: +; GFX1132DAGISEL-NEXT: v_mov_b32_e32 v2, s1 +; GFX1132DAGISEL-NEXT: 
global_store_b32 v[0:1], v2, off +; GFX1132DAGISEL-NEXT: s_setpc_b64 s[30:31] +; +; GFX1132GISEL-LABEL: divergent_value_float: +; GFX1132GISEL: ; %bb.0: ; %entry +; GFX1132GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX1132GISEL-NEXT: s_mov_b32 s0, exec_lo +; GFX1132GISEL-NEXT: s_mov_b32 s1, 0x7fc00000 +; GFX1132GISEL-NEXT: .LBB1_1: ; =>This Inner Loop Header: Depth=1 +; GFX1132GISEL-NEXT: s_ctz_i32_b32 s2, s0 +; GFX1132GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) +; GFX1132GISEL-NEXT: v_readlane_b32 s3, v2, s2 +; GFX1132GISEL-NEXT: s_bitset0_b32 s0, s2 +; GFX1132GISEL-NEXT: s_cmp_lg_u32 s0, 0 +; GFX1132GISEL-NEXT: v_min_f32_e64 v3, s1, s3 +; GFX1132GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1132GISEL-NEXT: v_readfirstlane_b32 s1, v3 +; GFX1132GISEL-NEXT: s_cbranch_scc1 .LBB1_1 +; GFX1132GISEL-NEXT: ; %bb.2: +; GFX1132GISEL-NEXT: v_mov_b32_e32 v2, s1 +; GFX1132GISEL-NEXT: global_store_b32 v[0:1], v2, off +; GFX1132GISEL-NEXT: s_setpc_b64 s[30:31] +entry: + %result = call float @llvm.amdgcn.wave.reduce.fmin(float %in, i32 1) + store float %result, ptr addrspace(1) %out + ret void +} + +define void @divergent_cfg_float(ptr addrspace(1) %out, float %in, float %in2) { +; GFX8DAGISEL-LABEL: divergent_cfg_float: +; GFX8DAGISEL: ; %bb.0: ; %entry +; GFX8DAGISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX8DAGISEL-NEXT: v_and_b32_e32 v4, 0x3ff, v31 +; GFX8DAGISEL-NEXT: v_cmp_lt_u32_e32 vcc, 15, v4 +; GFX8DAGISEL-NEXT: ; implicit-def: $vgpr4 +; GFX8DAGISEL-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GFX8DAGISEL-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; GFX8DAGISEL-NEXT: s_cbranch_execz .LBB2_4 +; GFX8DAGISEL-NEXT: ; %bb.1: ; %else +; GFX8DAGISEL-NEXT: s_mov_b64 s[6:7], exec +; GFX8DAGISEL-NEXT: s_mov_b32 s8, 0x7fc00000 +; GFX8DAGISEL-NEXT: .LBB2_2: ; =>This Inner Loop Header: Depth=1 +; GFX8DAGISEL-NEXT: s_ff1_i32_b64 s9, s[6:7] +; GFX8DAGISEL-NEXT: v_readlane_b32 s10, v2, s9 +; GFX8DAGISEL-NEXT: v_mov_b32_e32 
v3, s10 +; GFX8DAGISEL-NEXT: s_bitset0_b64 s[6:7], s9 +; GFX8DAGISEL-NEXT: v_min_f32_e32 v3, s8, v3 +; GFX8DAGISEL-NEXT: s_cmp_lg_u64 s[6:7], 0 +; GFX8DAGISEL-NEXT: v_readfirstlane_b32 s8, v3 +; GFX8DAGISEL-NEXT: s_cbranch_scc1 .LBB2_2 +; GFX8DAGISEL-NEXT: ; %bb.3: +; GFX8DAGISEL-NEXT: v_mov_b32_e32 v4, s8 +; GFX8DAGISEL-NEXT: ; implicit-def: $vgpr3 +; GFX8DAGISEL-NEXT: .LBB2_4: ; %Flow +; GFX8DAGISEL-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; GFX8DAGISEL-NEXT: s_cbranch_execz .LBB2_8 +; GFX8DAGISEL-NEXT: ; %bb.5: ; %if +; GFX8DAGISEL-NEXT: s_mov_b64 s[6:7], exec +; GFX8DAGISEL-NEXT: s_mov_b32 s8, 0x7fc00000 +; GFX8DAGISEL-NEXT: .LBB2_6: ; =>This Inner Loop Header: Depth=1 +; GFX8DAGISEL-NEXT: s_ff1_i32_b64 s9, s[6:7] +; GFX8DAGISEL-NEXT: v_readlane_b32 s10, v3, s9 +; GFX8DAGISEL-NEXT: v_mov_b32_e32 v2, s10 +; GFX8DAGISEL-NEXT: s_bitset0_b64 s[6:7], s9 +; GFX8DAGISEL-NEXT: v_min_f32_e32 v2, s8, v2 +; GFX8DAGISEL-NEXT: s_cmp_lg_u64 s[6:7], 0 +; GFX8DAGISEL-NEXT: v_readfirstlane_b32 s8, v2 +; GFX8DAGISEL-NEXT: s_cbranch_scc1 .LBB2_6 +; GFX8DAGISEL-NEXT: ; %bb.7: +; GFX8DAGISEL-NEXT: v_mov_b32_e32 v4, s8 +; GFX8DAGISEL-NEXT: .LBB2_8: ; %endif +; GFX8DAGISEL-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX8DAGISEL-NEXT: flat_store_dword v[0:1], v4 +; GFX8DAGISEL-NEXT: s_waitcnt vmcnt(0) +; GFX8DAGISEL-NEXT: s_setpc_b64 s[30:31] +; +; GFX8GISEL-LABEL: divergent_cfg_float: +; GFX8GISEL: ; %bb.0: ; %entry +; GFX8GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX8GISEL-NEXT: v_and_b32_e32 v4, 0x3ff, v31 +; GFX8GISEL-NEXT: v_cmp_le_u32_e32 vcc, 16, v4 +; GFX8GISEL-NEXT: ; implicit-def: $sgpr8 +; GFX8GISEL-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GFX8GISEL-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; GFX8GISEL-NEXT: s_cbranch_execz .LBB2_3 +; GFX8GISEL-NEXT: ; %bb.1: ; %else +; GFX8GISEL-NEXT: s_mov_b64 s[6:7], exec +; GFX8GISEL-NEXT: s_mov_b32 s8, 0x7fc00000 +; GFX8GISEL-NEXT: ; implicit-def: $vgpr3 +; GFX8GISEL-NEXT: .LBB2_2: ; =>This Inner Loop Header: Depth=1 +; 
GFX8GISEL-NEXT: s_ff1_i32_b64 s9, s[6:7] +; GFX8GISEL-NEXT: v_readlane_b32 s10, v2, s9 +; GFX8GISEL-NEXT: v_mov_b32_e32 v4, s10 +; GFX8GISEL-NEXT: s_bitset0_b64 s[6:7], s9 +; GFX8GISEL-NEXT: v_min_f32_e32 v4, s8, v4 +; GFX8GISEL-NEXT: s_cmp_lg_u64 s[6:7], 0 +; GFX8GISEL-NEXT: v_readfirstlane_b32 s8, v4 +; GFX8GISEL-NEXT: s_cbranch_scc1 .LBB2_2 +; GFX8GISEL-NEXT: .LBB2_3: ; %Flow +; GFX8GISEL-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; GFX8GISEL-NEXT: s_cbranch_execz .LBB2_6 +; GFX8GISEL-NEXT: ; %bb.4: ; %if +; GFX8GISEL-NEXT: s_mov_b64 s[6:7], exec +; GFX8GISEL-NEXT: s_mov_b32 s8, 0x7fc00000 +; GFX8GISEL-NEXT: .LBB2_5: ; =>This Inner Loop Header: Depth=1 +; GFX8GISEL-NEXT: s_ff1_i32_b64 s9, s[6:7] +; GFX8GISEL-NEXT: v_readlane_b32 s10, v3, s9 +; GFX8GISEL-NEXT: v_mov_b32_e32 v2, s10 +; GFX8GISEL-NEXT: s_bitset0_b64 s[6:7], s9 +; GFX8GISEL-NEXT: v_min_f32_e32 v2, s8, v2 +; GFX8GISEL-NEXT: s_cmp_lg_u64 s[6:7], 0 +; GFX8GISEL-NEXT: v_readfirstlane_b32 s8, v2 +; GFX8GISEL-NEXT: s_cbranch_scc1 .LBB2_5 +; GFX8GISEL-NEXT: .LBB2_6: ; %endif +; GFX8GISEL-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX8GISEL-NEXT: v_mov_b32_e32 v2, s8 +; GFX8GISEL-NEXT: flat_store_dword v[0:1], v2 +; GFX8GISEL-NEXT: s_waitcnt vmcnt(0) +; GFX8GISEL-NEXT: s_setpc_b64 s[30:31] +; +; GFX9DAGISEL-LABEL: divergent_cfg_float: +; GFX9DAGISEL: ; %bb.0: ; %entry +; GFX9DAGISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9DAGISEL-NEXT: v_and_b32_e32 v4, 0x3ff, v31 +; GFX9DAGISEL-NEXT: v_cmp_lt_u32_e32 vcc, 15, v4 +; GFX9DAGISEL-NEXT: ; implicit-def: $vgpr4 +; GFX9DAGISEL-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GFX9DAGISEL-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; GFX9DAGISEL-NEXT: s_cbranch_execz .LBB2_4 +; GFX9DAGISEL-NEXT: ; %bb.1: ; %else +; GFX9DAGISEL-NEXT: s_mov_b64 s[6:7], exec +; GFX9DAGISEL-NEXT: s_mov_b32 s8, 0x7fc00000 +; GFX9DAGISEL-NEXT: .LBB2_2: ; =>This Inner Loop Header: Depth=1 +; GFX9DAGISEL-NEXT: s_ff1_i32_b64 s9, s[6:7] +; GFX9DAGISEL-NEXT: v_readlane_b32 s10, v2, s9 +; 
GFX9DAGISEL-NEXT: v_mov_b32_e32 v3, s10 +; GFX9DAGISEL-NEXT: s_bitset0_b64 s[6:7], s9 +; GFX9DAGISEL-NEXT: v_min_f32_e32 v3, s8, v3 +; GFX9DAGISEL-NEXT: s_cmp_lg_u64 s[6:7], 0 +; GFX9DAGISEL-NEXT: v_readfirstlane_b32 s8, v3 +; GFX9DAGISEL-NEXT: s_cbranch_scc1 .LBB2_2 +; GFX9DAGISEL-NEXT: ; %bb.3: +; GFX9DAGISEL-NEXT: v_mov_b32_e32 v4, s8 +; GFX9DAGISEL-NEXT: ; implicit-def: $vgpr3 +; GFX9DAGISEL-NEXT: .LBB2_4: ; %Flow +; GFX9DAGISEL-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; GFX9DAGISEL-NEXT: s_cbranch_execz .LBB2_8 +; GFX9DAGISEL-NEXT: ; %bb.5: ; %if +; GFX9DAGISEL-NEXT: s_mov_b64 s[6:7], exec +; GFX9DAGISEL-NEXT: s_mov_b32 s8, 0x7fc00000 +; GFX9DAGISEL-NEXT: .LBB2_6: ; =>This Inner Loop Header: Depth=1 +; GFX9DAGISEL-NEXT: s_ff1_i32_b64 s9, s[6:7] +; GFX9DAGISEL-NEXT: v_readlane_b32 s10, v3, s9 +; GFX9DAGISEL-NEXT: v_mov_b32_e32 v2, s10 +; GFX9DAGISEL-NEXT: s_bitset0_b64 s[6:7], s9 +; GFX9DAGISEL-NEXT: v_min_f32_e32 v2, s8, v2 +; GFX9DAGISEL-NEXT: s_cmp_lg_u64 s[6:7], 0 +; GFX9DAGISEL-NEXT: v_readfirstlane_b32 s8, v2 +; GFX9DAGISEL-NEXT: s_cbranch_scc1 .LBB2_6 +; GFX9DAGISEL-NEXT: ; %bb.7: +; GFX9DAGISEL-NEXT: v_mov_b32_e32 v4, s8 +; GFX9DAGISEL-NEXT: .LBB2_8: ; %endif +; GFX9DAGISEL-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX9DAGISEL-NEXT: global_store_dword v[0:1], v4, off +; GFX9DAGISEL-NEXT: s_waitcnt vmcnt(0) +; GFX9DAGISEL-NEXT: s_setpc_b64 s[30:31] +; +; GFX9GISEL-LABEL: divergent_cfg_float: +; GFX9GISEL: ; %bb.0: ; %entry +; GFX9GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9GISEL-NEXT: v_and_b32_e32 v4, 0x3ff, v31 +; GFX9GISEL-NEXT: v_cmp_le_u32_e32 vcc, 16, v4 +; GFX9GISEL-NEXT: ; implicit-def: $sgpr8 +; GFX9GISEL-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GFX9GISEL-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; GFX9GISEL-NEXT: s_cbranch_execz .LBB2_3 +; GFX9GISEL-NEXT: ; %bb.1: ; %else +; GFX9GISEL-NEXT: s_mov_b64 s[6:7], exec +; GFX9GISEL-NEXT: s_mov_b32 s8, 0x7fc00000 +; GFX9GISEL-NEXT: ; implicit-def: $vgpr3 +; GFX9GISEL-NEXT: .LBB2_2: ; =>This 
Inner Loop Header: Depth=1 +; GFX9GISEL-NEXT: s_ff1_i32_b64 s9, s[6:7] +; GFX9GISEL-NEXT: v_readlane_b32 s10, v2, s9 +; GFX9GISEL-NEXT: v_mov_b32_e32 v4, s10 +; GFX9GISEL-NEXT: s_bitset0_b64 s[6:7], s9 +; GFX9GISEL-NEXT: v_min_f32_e32 v4, s8, v4 +; GFX9GISEL-NEXT: s_cmp_lg_u64 s[6:7], 0 +; GFX9GISEL-NEXT: v_readfirstlane_b32 s8, v4 +; GFX9GISEL-NEXT: s_cbranch_scc1 .LBB2_2 +; GFX9GISEL-NEXT: .LBB2_3: ; %Flow +; GFX9GISEL-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; GFX9GISEL-NEXT: s_cbranch_execz .LBB2_6 +; GFX9GISEL-NEXT: ; %bb.4: ; %if +; GFX9GISEL-NEXT: s_mov_b64 s[6:7], exec +; GFX9GISEL-NEXT: s_mov_b32 s8, 0x7fc00000 +; GFX9GISEL-NEXT: .LBB2_5: ; =>This Inner Loop Header: Depth=1 +; GFX9GISEL-NEXT: s_ff1_i32_b64 s9, s[6:7] +; GFX9GISEL-NEXT: v_readlane_b32 s10, v3, s9 +; GFX9GISEL-NEXT: v_mov_b32_e32 v2, s10 +; GFX9GISEL-NEXT: s_bitset0_b64 s[6:7], s9 +; GFX9GISEL-NEXT: v_min_f32_e32 v2, s8, v2 +; GFX9GISEL-NEXT: s_cmp_lg_u64 s[6:7], 0 +; GFX9GISEL-NEXT: v_readfirstlane_b32 s8, v2 +; GFX9GISEL-NEXT: s_cbranch_scc1 .LBB2_5 +; GFX9GISEL-NEXT: .LBB2_6: ; %endif +; GFX9GISEL-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX9GISEL-NEXT: v_mov_b32_e32 v2, s8 +; GFX9GISEL-NEXT: global_store_dword v[0:1], v2, off +; GFX9GISEL-NEXT: s_waitcnt vmcnt(0) +; GFX9GISEL-NEXT: s_setpc_b64 s[30:31] +; +; GFX1064DAGISEL-LABEL: divergent_cfg_float: +; GFX1064DAGISEL: ; %bb.0: ; %entry +; GFX1064DAGISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX1064DAGISEL-NEXT: v_and_b32_e32 v4, 0x3ff, v31 +; GFX1064DAGISEL-NEXT: v_cmp_lt_u32_e32 vcc, 15, v4 +; GFX1064DAGISEL-NEXT: ; implicit-def: $vgpr4 +; GFX1064DAGISEL-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GFX1064DAGISEL-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; GFX1064DAGISEL-NEXT: s_cbranch_execz .LBB2_4 +; GFX1064DAGISEL-NEXT: ; %bb.1: ; %else +; GFX1064DAGISEL-NEXT: s_mov_b64 s[6:7], exec +; GFX1064DAGISEL-NEXT: s_mov_b32 s8, 0x7fc00000 +; GFX1064DAGISEL-NEXT: .LBB2_2: ; =>This Inner Loop Header: Depth=1 +; GFX1064DAGISEL-NEXT: 
s_ff1_i32_b64 s9, s[6:7] +; GFX1064DAGISEL-NEXT: v_readlane_b32 s10, v2, s9 +; GFX1064DAGISEL-NEXT: s_bitset0_b64 s[6:7], s9 +; GFX1064DAGISEL-NEXT: s_cmp_lg_u64 s[6:7], 0 +; GFX1064DAGISEL-NEXT: v_min_f32_e64 v3, s8, s10 +; GFX1064DAGISEL-NEXT: v_readfirstlane_b32 s8, v3 +; GFX1064DAGISEL-NEXT: s_cbranch_scc1 .LBB2_2 +; GFX1064DAGISEL-NEXT: ; %bb.3: +; GFX1064DAGISEL-NEXT: v_mov_b32_e32 v4, s8 +; GFX1064DAGISEL-NEXT: ; implicit-def: $vgpr3 +; GFX1064DAGISEL-NEXT: .LBB2_4: ; %Flow +; GFX1064DAGISEL-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; GFX1064DAGISEL-NEXT: s_cbranch_execz .LBB2_8 +; GFX1064DAGISEL-NEXT: ; %bb.5: ; %if +; GFX1064DAGISEL-NEXT: s_mov_b64 s[6:7], exec +; GFX1064DAGISEL-NEXT: s_mov_b32 s8, 0x7fc00000 +; GFX1064DAGISEL-NEXT: .LBB2_6: ; =>This Inner Loop Header: Depth=1 +; GFX1064DAGISEL-NEXT: s_ff1_i32_b64 s9, s[6:7] +; GFX1064DAGISEL-NEXT: v_readlane_b32 s10, v3, s9 +; GFX1064DAGISEL-NEXT: s_bitset0_b64 s[6:7], s9 +; GFX1064DAGISEL-NEXT: s_cmp_lg_u64 s[6:7], 0 +; GFX1064DAGISEL-NEXT: v_min_f32_e64 v2, s8, s10 +; GFX1064DAGISEL-NEXT: v_readfirstlane_b32 s8, v2 +; GFX1064DAGISEL-NEXT: s_cbranch_scc1 .LBB2_6 +; GFX1064DAGISEL-NEXT: ; %bb.7: +; GFX1064DAGISEL-NEXT: v_mov_b32_e32 v4, s8 +; GFX1064DAGISEL-NEXT: .LBB2_8: ; %endif +; GFX1064DAGISEL-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX1064DAGISEL-NEXT: global_store_dword v[0:1], v4, off +; GFX1064DAGISEL-NEXT: s_setpc_b64 s[30:31] +; +; GFX1064GISEL-LABEL: divergent_cfg_float: +; GFX1064GISEL: ; %bb.0: ; %entry +; GFX1064GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX1064GISEL-NEXT: v_and_b32_e32 v4, 0x3ff, v31 +; GFX1064GISEL-NEXT: ; implicit-def: $sgpr8 +; GFX1064GISEL-NEXT: v_cmp_le_u32_e32 vcc, 16, v4 +; GFX1064GISEL-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GFX1064GISEL-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; GFX1064GISEL-NEXT: s_cbranch_execz .LBB2_3 +; GFX1064GISEL-NEXT: ; %bb.1: ; %else +; GFX1064GISEL-NEXT: s_mov_b64 s[6:7], exec +; GFX1064GISEL-NEXT: s_mov_b32 s8, 0x7fc00000 +; 
GFX1064GISEL-NEXT: .LBB2_2: ; =>This Inner Loop Header: Depth=1 +; GFX1064GISEL-NEXT: s_ff1_i32_b64 s9, s[6:7] +; GFX1064GISEL-NEXT: v_readlane_b32 s10, v2, s9 +; GFX1064GISEL-NEXT: s_bitset0_b64 s[6:7], s9 +; GFX1064GISEL-NEXT: s_cmp_lg_u64 s[6:7], 0 +; GFX1064GISEL-NEXT: v_min_f32_e64 v3, s8, s10 +; GFX1064GISEL-NEXT: v_readfirstlane_b32 s8, v3 +; GFX1064GISEL-NEXT: ; implicit-def: $vgpr3 +; GFX1064GISEL-NEXT: s_cbranch_scc1 .LBB2_2 +; GFX1064GISEL-NEXT: .LBB2_3: ; %Flow +; GFX1064GISEL-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; GFX1064GISEL-NEXT: s_cbranch_execz .LBB2_6 +; GFX1064GISEL-NEXT: ; %bb.4: ; %if +; GFX1064GISEL-NEXT: s_mov_b64 s[6:7], exec +; GFX1064GISEL-NEXT: s_mov_b32 s8, 0x7fc00000 +; GFX1064GISEL-NEXT: .LBB2_5: ; =>This Inner Loop Header: Depth=1 +; GFX1064GISEL-NEXT: s_ff1_i32_b64 s9, s[6:7] +; GFX1064GISEL-NEXT: v_readlane_b32 s10, v3, s9 +; GFX1064GISEL-NEXT: s_bitset0_b64 s[6:7], s9 +; GFX1064GISEL-NEXT: s_cmp_lg_u64 s[6:7], 0 +; GFX1064GISEL-NEXT: v_min_f32_e64 v2, s8, s10 +; GFX1064GISEL-NEXT: v_readfirstlane_b32 s8, v2 +; GFX1064GISEL-NEXT: s_cbranch_scc1 .LBB2_5 +; GFX1064GISEL-NEXT: .LBB2_6: ; %endif +; GFX1064GISEL-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX1064GISEL-NEXT: v_mov_b32_e32 v2, s8 +; GFX1064GISEL-NEXT: global_store_dword v[0:1], v2, off +; GFX1064GISEL-NEXT: s_setpc_b64 s[30:31] +; +; GFX1032DAGISEL-LABEL: divergent_cfg_float: +; GFX1032DAGISEL: ; %bb.0: ; %entry +; GFX1032DAGISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX1032DAGISEL-NEXT: v_and_b32_e32 v4, 0x3ff, v31 +; GFX1032DAGISEL-NEXT: v_cmp_lt_u32_e32 vcc_lo, 15, v4 +; GFX1032DAGISEL-NEXT: ; implicit-def: $vgpr4 +; GFX1032DAGISEL-NEXT: s_and_saveexec_b32 s4, vcc_lo +; GFX1032DAGISEL-NEXT: s_xor_b32 s4, exec_lo, s4 +; GFX1032DAGISEL-NEXT: s_cbranch_execz .LBB2_4 +; GFX1032DAGISEL-NEXT: ; %bb.1: ; %else +; GFX1032DAGISEL-NEXT: s_mov_b32 s5, exec_lo +; GFX1032DAGISEL-NEXT: s_mov_b32 s6, 0x7fc00000 +; GFX1032DAGISEL-NEXT: .LBB2_2: ; =>This Inner Loop Header: 
Depth=1 +; GFX1032DAGISEL-NEXT: s_ff1_i32_b32 s7, s5 +; GFX1032DAGISEL-NEXT: v_readlane_b32 s8, v2, s7 +; GFX1032DAGISEL-NEXT: s_bitset0_b32 s5, s7 +; GFX1032DAGISEL-NEXT: s_cmp_lg_u32 s5, 0 +; GFX1032DAGISEL-NEXT: v_min_f32_e64 v3, s6, s8 +; GFX1032DAGISEL-NEXT: v_readfirstlane_b32 s6, v3 +; GFX1032DAGISEL-NEXT: s_cbranch_scc1 .LBB2_2 +; GFX1032DAGISEL-NEXT: ; %bb.3: +; GFX1032DAGISEL-NEXT: v_mov_b32_e32 v4, s6 +; GFX1032DAGISEL-NEXT: ; implicit-def: $vgpr3 +; GFX1032DAGISEL-NEXT: .LBB2_4: ; %Flow +; GFX1032DAGISEL-NEXT: s_andn2_saveexec_b32 s4, s4 +; GFX1032DAGISEL-NEXT: s_cbranch_execz .LBB2_8 +; GFX1032DAGISEL-NEXT: ; %bb.5: ; %if +; GFX1032DAGISEL-NEXT: s_mov_b32 s5, exec_lo +; GFX1032DAGISEL-NEXT: s_mov_b32 s6, 0x7fc00000 +; GFX1032DAGISEL-NEXT: .LBB2_6: ; =>This Inner Loop Header: Depth=1 +; GFX1032DAGISEL-NEXT: s_ff1_i32_b32 s7, s5 +; GFX1032DAGISEL-NEXT: v_readlane_b32 s8, v3, s7 +; GFX1032DAGISEL-NEXT: s_bitset0_b32 s5, s7 +; GFX1032DAGISEL-NEXT: s_cmp_lg_u32 s5, 0 +; GFX1032DAGISEL-NEXT: v_min_f32_e64 v2, s6, s8 +; GFX1032DAGISEL-NEXT: v_readfirstlane_b32 s6, v2 +; GFX1032DAGISEL-NEXT: s_cbranch_scc1 .LBB2_6 +; GFX1032DAGISEL-NEXT: ; %bb.7: +; GFX1032DAGISEL-NEXT: v_mov_b32_e32 v4, s6 +; GFX1032DAGISEL-NEXT: .LBB2_8: ; %endif +; GFX1032DAGISEL-NEXT: s_or_b32 exec_lo, exec_lo, s4 +; GFX1032DAGISEL-NEXT: global_store_dword v[0:1], v4, off +; GFX1032DAGISEL-NEXT: s_setpc_b64 s[30:31] +; +; GFX1032GISEL-LABEL: divergent_cfg_float: +; GFX1032GISEL: ; %bb.0: ; %entry +; GFX1032GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX1032GISEL-NEXT: v_and_b32_e32 v4, 0x3ff, v31 +; GFX1032GISEL-NEXT: ; implicit-def: $sgpr4 +; GFX1032GISEL-NEXT: v_cmp_le_u32_e32 vcc_lo, 16, v4 +; GFX1032GISEL-NEXT: s_and_saveexec_b32 s5, vcc_lo +; GFX1032GISEL-NEXT: s_xor_b32 s5, exec_lo, s5 +; GFX1032GISEL-NEXT: s_cbranch_execz .LBB2_3 +; GFX1032GISEL-NEXT: ; %bb.1: ; %else +; GFX1032GISEL-NEXT: s_mov_b32 s6, exec_lo +; GFX1032GISEL-NEXT: s_mov_b32 s4, 0x7fc00000 +; 
GFX1032GISEL-NEXT: .LBB2_2: ; =>This Inner Loop Header: Depth=1 +; GFX1032GISEL-NEXT: s_ff1_i32_b32 s7, s6 +; GFX1032GISEL-NEXT: v_readlane_b32 s8, v2, s7 +; GFX1032GISEL-NEXT: s_bitset0_b32 s6, s7 +; GFX1032GISEL-NEXT: s_cmp_lg_u32 s6, 0 +; GFX1032GISEL-NEXT: v_min_f32_e64 v3, s4, s8 +; GFX1032GISEL-NEXT: v_readfirstlane_b32 s4, v3 +; GFX1032GISEL-NEXT: ; implicit-def: $vgpr3 +; GFX1032GISEL-NEXT: s_cbranch_scc1 .LBB2_2 +; GFX1032GISEL-NEXT: .LBB2_3: ; %Flow +; GFX1032GISEL-NEXT: s_andn2_saveexec_b32 s5, s5 +; GFX1032GISEL-NEXT: s_cbranch_execz .LBB2_6 +; GFX1032GISEL-NEXT: ; %bb.4: ; %if +; GFX1032GISEL-NEXT: s_mov_b32 s6, exec_lo +; GFX1032GISEL-NEXT: s_mov_b32 s4, 0x7fc00000 +; GFX1032GISEL-NEXT: .LBB2_5: ; =>This Inner Loop Header: Depth=1 +; GFX1032GISEL-NEXT: s_ff1_i32_b32 s7, s6 +; GFX1032GISEL-NEXT: v_readlane_b32 s8, v3, s7 +; GFX1032GISEL-NEXT: s_bitset0_b32 s6, s7 +; GFX1032GISEL-NEXT: s_cmp_lg_u32 s6, 0 +; GFX1032GISEL-NEXT: v_min_f32_e64 v2, s4, s8 +; GFX1032GISEL-NEXT: v_readfirstlane_b32 s4, v2 +; GFX1032GISEL-NEXT: s_cbranch_scc1 .LBB2_5 +; GFX1032GISEL-NEXT: .LBB2_6: ; %endif +; GFX1032GISEL-NEXT: s_or_b32 exec_lo, exec_lo, s5 +; GFX1032GISEL-NEXT: v_mov_b32_e32 v2, s4 +; GFX1032GISEL-NEXT: global_store_dword v[0:1], v2, off +; GFX1032GISEL-NEXT: s_setpc_b64 s[30:31] +; +; GFX1164DAGISEL-LABEL: divergent_cfg_float: +; GFX1164DAGISEL: ; %bb.0: ; %entry +; GFX1164DAGISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX1164DAGISEL-NEXT: v_and_b32_e32 v4, 0x3ff, v31 +; GFX1164DAGISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) +; GFX1164DAGISEL-NEXT: v_cmp_lt_u32_e32 vcc, 15, v4 +; GFX1164DAGISEL-NEXT: ; implicit-def: $vgpr4 +; GFX1164DAGISEL-NEXT: s_and_saveexec_b64 s[0:1], vcc +; GFX1164DAGISEL-NEXT: s_xor_b64 s[0:1], exec, s[0:1] +; GFX1164DAGISEL-NEXT: s_cbranch_execz .LBB2_4 +; GFX1164DAGISEL-NEXT: ; %bb.1: ; %else +; GFX1164DAGISEL-NEXT: s_mov_b64 s[2:3], exec +; GFX1164DAGISEL-NEXT: s_mov_b32 s4, 
0x7fc00000 +; GFX1164DAGISEL-NEXT: .LBB2_2: ; =>This Inner Loop Header: Depth=1 +; GFX1164DAGISEL-NEXT: s_ctz_i32_b64 s5, s[2:3] +; GFX1164DAGISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) +; GFX1164DAGISEL-NEXT: v_readlane_b32 s6, v2, s5 +; GFX1164DAGISEL-NEXT: s_bitset0_b64 s[2:3], s5 +; GFX1164DAGISEL-NEXT: s_cmp_lg_u64 s[2:3], 0 +; GFX1164DAGISEL-NEXT: v_min_f32_e64 v3, s4, s6 +; GFX1164DAGISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1164DAGISEL-NEXT: v_readfirstlane_b32 s4, v3 +; GFX1164DAGISEL-NEXT: s_cbranch_scc1 .LBB2_2 +; GFX1164DAGISEL-NEXT: ; %bb.3: +; GFX1164DAGISEL-NEXT: v_mov_b32_e32 v4, s4 +; GFX1164DAGISEL-NEXT: ; implicit-def: $vgpr3 +; GFX1164DAGISEL-NEXT: .LBB2_4: ; %Flow +; GFX1164DAGISEL-NEXT: s_and_not1_saveexec_b64 s[0:1], s[0:1] +; GFX1164DAGISEL-NEXT: s_cbranch_execz .LBB2_8 +; GFX1164DAGISEL-NEXT: ; %bb.5: ; %if +; GFX1164DAGISEL-NEXT: s_mov_b64 s[2:3], exec +; GFX1164DAGISEL-NEXT: s_mov_b32 s4, 0x7fc00000 +; GFX1164DAGISEL-NEXT: .LBB2_6: ; =>This Inner Loop Header: Depth=1 +; GFX1164DAGISEL-NEXT: s_ctz_i32_b64 s5, s[2:3] +; GFX1164DAGISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) +; GFX1164DAGISEL-NEXT: v_readlane_b32 s6, v3, s5 +; GFX1164DAGISEL-NEXT: s_bitset0_b64 s[2:3], s5 +; GFX1164DAGISEL-NEXT: s_cmp_lg_u64 s[2:3], 0 +; GFX1164DAGISEL-NEXT: v_min_f32_e64 v2, s4, s6 +; GFX1164DAGISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1164DAGISEL-NEXT: v_readfirstlane_b32 s4, v2 +; GFX1164DAGISEL-NEXT: s_cbranch_scc1 .LBB2_6 +; GFX1164DAGISEL-NEXT: ; %bb.7: +; GFX1164DAGISEL-NEXT: v_mov_b32_e32 v4, s4 +; GFX1164DAGISEL-NEXT: .LBB2_8: ; %endif +; GFX1164DAGISEL-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX1164DAGISEL-NEXT: global_store_b32 v[0:1], v4, off +; GFX1164DAGISEL-NEXT: s_setpc_b64 s[30:31] +; +; GFX1164GISEL-LABEL: divergent_cfg_float: +; GFX1164GISEL: ; %bb.0: ; %entry +; GFX1164GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX1164GISEL-NEXT: 
v_and_b32_e32 v4, 0x3ff, v31 +; GFX1164GISEL-NEXT: s_mov_b64 s[0:1], exec +; GFX1164GISEL-NEXT: ; implicit-def: $sgpr4 +; GFX1164GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1164GISEL-NEXT: v_cmpx_le_u32_e32 16, v4 +; GFX1164GISEL-NEXT: s_xor_b64 s[0:1], exec, s[0:1] +; GFX1164GISEL-NEXT: s_cbranch_execz .LBB2_3 +; GFX1164GISEL-NEXT: ; %bb.1: ; %else +; GFX1164GISEL-NEXT: s_mov_b64 s[2:3], exec +; GFX1164GISEL-NEXT: s_mov_b32 s4, 0x7fc00000 +; GFX1164GISEL-NEXT: .LBB2_2: ; =>This Inner Loop Header: Depth=1 +; GFX1164GISEL-NEXT: s_ctz_i32_b64 s5, s[2:3] +; GFX1164GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) +; GFX1164GISEL-NEXT: v_readlane_b32 s6, v2, s5 +; GFX1164GISEL-NEXT: s_bitset0_b64 s[2:3], s5 +; GFX1164GISEL-NEXT: s_cmp_lg_u64 s[2:3], 0 +; GFX1164GISEL-NEXT: v_min_f32_e64 v3, s4, s6 +; GFX1164GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1164GISEL-NEXT: v_readfirstlane_b32 s4, v3 +; GFX1164GISEL-NEXT: ; implicit-def: $vgpr3 +; GFX1164GISEL-NEXT: s_cbranch_scc1 .LBB2_2 +; GFX1164GISEL-NEXT: .LBB2_3: ; %Flow +; GFX1164GISEL-NEXT: s_and_not1_saveexec_b64 s[0:1], s[0:1] +; GFX1164GISEL-NEXT: s_cbranch_execz .LBB2_6 +; GFX1164GISEL-NEXT: ; %bb.4: ; %if +; GFX1164GISEL-NEXT: s_mov_b64 s[2:3], exec +; GFX1164GISEL-NEXT: s_mov_b32 s4, 0x7fc00000 +; GFX1164GISEL-NEXT: .LBB2_5: ; =>This Inner Loop Header: Depth=1 +; GFX1164GISEL-NEXT: s_ctz_i32_b64 s5, s[2:3] +; GFX1164GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) +; GFX1164GISEL-NEXT: v_readlane_b32 s6, v3, s5 +; GFX1164GISEL-NEXT: s_bitset0_b64 s[2:3], s5 +; GFX1164GISEL-NEXT: s_cmp_lg_u64 s[2:3], 0 +; GFX1164GISEL-NEXT: v_min_f32_e64 v2, s4, s6 +; GFX1164GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1164GISEL-NEXT: v_readfirstlane_b32 s4, v2 +; GFX1164GISEL-NEXT: s_cbranch_scc1 .LBB2_5 +; GFX1164GISEL-NEXT: .LBB2_6: ; %endif +; GFX1164GISEL-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX1164GISEL-NEXT: v_mov_b32_e32 v2, s4 
+; GFX1164GISEL-NEXT: global_store_b32 v[0:1], v2, off +; GFX1164GISEL-NEXT: s_setpc_b64 s[30:31] +; +; GFX1132DAGISEL-LABEL: divergent_cfg_float: +; GFX1132DAGISEL: ; %bb.0: ; %entry +; GFX1132DAGISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX1132DAGISEL-NEXT: v_and_b32_e32 v4, 0x3ff, v31 +; GFX1132DAGISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) +; GFX1132DAGISEL-NEXT: v_cmp_lt_u32_e32 vcc_lo, 15, v4 +; GFX1132DAGISEL-NEXT: ; implicit-def: $vgpr4 +; GFX1132DAGISEL-NEXT: s_and_saveexec_b32 s0, vcc_lo +; GFX1132DAGISEL-NEXT: s_xor_b32 s0, exec_lo, s0 +; GFX1132DAGISEL-NEXT: s_cbranch_execz .LBB2_4 +; GFX1132DAGISEL-NEXT: ; %bb.1: ; %else +; GFX1132DAGISEL-NEXT: s_mov_b32 s1, exec_lo +; GFX1132DAGISEL-NEXT: s_mov_b32 s2, 0x7fc00000 +; GFX1132DAGISEL-NEXT: .LBB2_2: ; =>This Inner Loop Header: Depth=1 +; GFX1132DAGISEL-NEXT: s_ctz_i32_b32 s3, s1 +; GFX1132DAGISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) +; GFX1132DAGISEL-NEXT: v_readlane_b32 s4, v2, s3 +; GFX1132DAGISEL-NEXT: s_bitset0_b32 s1, s3 +; GFX1132DAGISEL-NEXT: s_cmp_lg_u32 s1, 0 +; GFX1132DAGISEL-NEXT: v_min_f32_e64 v3, s2, s4 +; GFX1132DAGISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1132DAGISEL-NEXT: v_readfirstlane_b32 s2, v3 +; GFX1132DAGISEL-NEXT: s_cbranch_scc1 .LBB2_2 +; GFX1132DAGISEL-NEXT: ; %bb.3: +; GFX1132DAGISEL-NEXT: v_mov_b32_e32 v4, s2 +; GFX1132DAGISEL-NEXT: ; implicit-def: $vgpr3 +; GFX1132DAGISEL-NEXT: .LBB2_4: ; %Flow +; GFX1132DAGISEL-NEXT: s_and_not1_saveexec_b32 s0, s0 +; GFX1132DAGISEL-NEXT: s_cbranch_execz .LBB2_8 +; GFX1132DAGISEL-NEXT: ; %bb.5: ; %if +; GFX1132DAGISEL-NEXT: s_mov_b32 s1, exec_lo +; GFX1132DAGISEL-NEXT: s_mov_b32 s2, 0x7fc00000 +; GFX1132DAGISEL-NEXT: .LBB2_6: ; =>This Inner Loop Header: Depth=1 +; GFX1132DAGISEL-NEXT: s_ctz_i32_b32 s3, s1 +; GFX1132DAGISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) +; GFX1132DAGISEL-NEXT: 
v_readlane_b32 s4, v3, s3 +; GFX1132DAGISEL-NEXT: s_bitset0_b32 s1, s3 +; GFX1132DAGISEL-NEXT: s_cmp_lg_u32 s1, 0 +; GFX1132DAGISEL-NEXT: v_min_f32_e64 v2, s2, s4 +; GFX1132DAGISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1132DAGISEL-NEXT: v_readfirstlane_b32 s2, v2 +; GFX1132DAGISEL-NEXT: s_cbranch_scc1 .LBB2_6 +; GFX1132DAGISEL-NEXT: ; %bb.7: +; GFX1132DAGISEL-NEXT: v_mov_b32_e32 v4, s2 +; GFX1132DAGISEL-NEXT: .LBB2_8: ; %endif +; GFX1132DAGISEL-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX1132DAGISEL-NEXT: global_store_b32 v[0:1], v4, off +; GFX1132DAGISEL-NEXT: s_setpc_b64 s[30:31] +; +; GFX1132GISEL-LABEL: divergent_cfg_float: +; GFX1132GISEL: ; %bb.0: ; %entry +; GFX1132GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX1132GISEL-NEXT: v_and_b32_e32 v4, 0x3ff, v31 +; GFX1132GISEL-NEXT: s_mov_b32 s1, exec_lo +; GFX1132GISEL-NEXT: ; implicit-def: $sgpr0 +; GFX1132GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1132GISEL-NEXT: v_cmpx_le_u32_e32 16, v4 +; GFX1132GISEL-NEXT: s_xor_b32 s1, exec_lo, s1 +; GFX1132GISEL-NEXT: s_cbranch_execz .LBB2_3 +; GFX1132GISEL-NEXT: ; %bb.1: ; %else +; GFX1132GISEL-NEXT: s_mov_b32 s2, exec_lo +; GFX1132GISEL-NEXT: s_mov_b32 s0, 0x7fc00000 +; GFX1132GISEL-NEXT: .LBB2_2: ; =>This Inner Loop Header: Depth=1 +; GFX1132GISEL-NEXT: s_ctz_i32_b32 s3, s2 +; GFX1132GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) +; GFX1132GISEL-NEXT: v_readlane_b32 s4, v2, s3 +; GFX1132GISEL-NEXT: s_bitset0_b32 s2, s3 +; GFX1132GISEL-NEXT: s_cmp_lg_u32 s2, 0 +; GFX1132GISEL-NEXT: v_min_f32_e64 v3, s0, s4 +; GFX1132GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1132GISEL-NEXT: v_readfirstlane_b32 s0, v3 +; GFX1132GISEL-NEXT: ; implicit-def: $vgpr3 +; GFX1132GISEL-NEXT: s_cbranch_scc1 .LBB2_2 +; GFX1132GISEL-NEXT: .LBB2_3: ; %Flow +; GFX1132GISEL-NEXT: s_and_not1_saveexec_b32 s1, s1 +; GFX1132GISEL-NEXT: s_cbranch_execz .LBB2_6 +; GFX1132GISEL-NEXT: ; %bb.4: ; %if +; GFX1132GISEL-NEXT: s_mov_b32 s2, 
exec_lo +; GFX1132GISEL-NEXT: s_mov_b32 s0, 0x7fc00000 +; GFX1132GISEL-NEXT: .LBB2_5: ; =>This Inner Loop Header: Depth=1 +; GFX1132GISEL-NEXT: s_ctz_i32_b32 s3, s2 +; GFX1132GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) +; GFX1132GISEL-NEXT: v_readlane_b32 s4, v3, s3 +; GFX1132GISEL-NEXT: s_bitset0_b32 s2, s3 +; GFX1132GISEL-NEXT: s_cmp_lg_u32 s2, 0 +; GFX1132GISEL-NEXT: v_min_f32_e64 v2, s0, s4 +; GFX1132GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1132GISEL-NEXT: v_readfirstlane_b32 s0, v2 +; GFX1132GISEL-NEXT: s_cbranch_scc1 .LBB2_5 +; GFX1132GISEL-NEXT: .LBB2_6: ; %endif +; GFX1132GISEL-NEXT: s_or_b32 exec_lo, exec_lo, s1 +; GFX1132GISEL-NEXT: v_mov_b32_e32 v2, s0 +; GFX1132GISEL-NEXT: global_store_b32 v[0:1], v2, off +; GFX1132GISEL-NEXT: s_setpc_b64 s[30:31] +entry: + %tid = call i32 @llvm.amdgcn.workitem.id.x() + %d_cmp = icmp ult i32 %tid, 16 + br i1 %d_cmp, label %if, label %else + +if: + %reducedValTid = call float @llvm.amdgcn.wave.reduce.fmin(float %in2, i32 1) + br label %endif + +else: + %reducedValIn = call float @llvm.amdgcn.wave.reduce.fmin(float %in, i32 1) + br label %endif + +endif: + %combine = phi float [%reducedValTid, %if], [%reducedValIn, %else] + store float %combine, ptr addrspace(1) %out + ret void +} +;; NOTE: These prefixes are unused and the list is autogenerated. 
Do not add tests below this line: +; GFX11DAGISEL: {{.*}} +; GFX11GISEL: {{.*}} diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.reduce.fsub.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.reduce.fsub.ll new file mode 100644 index 0000000000000..29dfb0b504f81 --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.reduce.fsub.ll @@ -0,0 +1,1021 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 6 +; RUN: llc -mtriple=amdgcn -mcpu=tonga -global-isel=0 < %s | FileCheck -check-prefixes=GFX8DAGISEL %s +; RUN: llc -mtriple=amdgcn -mcpu=tonga -global-isel=1 < %s | FileCheck -check-prefixes=GFX8GISEL %s +; RUN: llc -mtriple=amdgcn -mcpu=gfx900 -global-isel=0 < %s | FileCheck -check-prefixes=GFX9DAGISEL %s +; RUN: llc -mtriple=amdgcn -mcpu=gfx900 -global-isel=1 < %s | FileCheck -check-prefixes=GFX9GISEL %s +; RUN: llc -mtriple=amdgcn -mcpu=gfx1010 -global-isel=0 -mattr=+wavefrontsize64 < %s | FileCheck -check-prefixes=GFX10DAGISEL,GFX1064DAGISEL %s +; RUN: llc -mtriple=amdgcn -mcpu=gfx1010 -global-isel=1 -mattr=+wavefrontsize64 < %s | FileCheck -check-prefixes=GFX10GISEL,GFX1064GISEL %s +; RUN: llc -mtriple=amdgcn -mcpu=gfx1010 -global-isel=0 < %s | FileCheck -check-prefixes=GFX10DAGISEL,GFX1032DAGISEL %s +; RUN: llc -mtriple=amdgcn -mcpu=gfx1010 -global-isel=1 < %s | FileCheck -check-prefixes=GFX10GISEL,GFX1032GISEL %s +; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -global-isel=0 -mattr=+wavefrontsize64 < %s | FileCheck -check-prefixes=GFX11DAGISEL,GFX1164DAGISEL %s +; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -global-isel=1 -mattr=+wavefrontsize64 < %s | FileCheck -check-prefixes=GFX11GISEL,GFX1164GISEL %s +; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -global-isel=0 < %s | FileCheck -check-prefixes=GFX11DAGISEL,GFX1132DAGISEL %s +; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -global-isel=1 < %s | FileCheck -check-prefixes=GFX11GISEL,GFX1132GISEL %s +; RUN: llc -mtriple=amdgcn -mcpu=gfx1200 -global-isel=0 < %s | FileCheck 
-check-prefixes=GFX12DAGISEL %s + + +define amdgpu_kernel void @uniform_value_float(ptr addrspace(1) %out, float %in) { +; GFX8DAGISEL-LABEL: uniform_value_float: +; GFX8DAGISEL: ; %bb.0: ; %entry +; GFX8DAGISEL-NEXT: s_mov_b64 s[0:1], exec +; GFX8DAGISEL-NEXT: s_load_dword s2, s[4:5], 0x2c +; GFX8DAGISEL-NEXT: s_bcnt1_i32_b64 s0, s[0:1] +; GFX8DAGISEL-NEXT: v_cvt_f32_i32_e32 v0, s0 +; GFX8DAGISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 +; GFX8DAGISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX8DAGISEL-NEXT: v_mul_f32_e64 v0, -s2, v0 +; GFX8DAGISEL-NEXT: v_readfirstlane_b32 s2, v0 +; GFX8DAGISEL-NEXT: v_mov_b32_e32 v0, s0 +; GFX8DAGISEL-NEXT: v_mov_b32_e32 v1, s1 +; GFX8DAGISEL-NEXT: v_mov_b32_e32 v2, s2 +; GFX8DAGISEL-NEXT: flat_store_dword v[0:1], v2 +; GFX8DAGISEL-NEXT: s_endpgm +; +; GFX8GISEL-LABEL: uniform_value_float: +; GFX8GISEL: ; %bb.0: ; %entry +; GFX8GISEL-NEXT: s_mov_b64 s[0:1], exec +; GFX8GISEL-NEXT: s_load_dword s2, s[4:5], 0x2c +; GFX8GISEL-NEXT: s_bcnt1_i32_b64 s0, s[0:1] +; GFX8GISEL-NEXT: v_cvt_f32_i32_e32 v0, s0 +; GFX8GISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 +; GFX8GISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX8GISEL-NEXT: v_mul_f32_e64 v0, -s2, v0 +; GFX8GISEL-NEXT: v_readfirstlane_b32 s2, v0 +; GFX8GISEL-NEXT: v_mov_b32_e32 v0, s0 +; GFX8GISEL-NEXT: v_mov_b32_e32 v2, s2 +; GFX8GISEL-NEXT: v_mov_b32_e32 v1, s1 +; GFX8GISEL-NEXT: flat_store_dword v[0:1], v2 +; GFX8GISEL-NEXT: s_endpgm +; +; GFX9DAGISEL-LABEL: uniform_value_float: +; GFX9DAGISEL: ; %bb.0: ; %entry +; GFX9DAGISEL-NEXT: s_mov_b64 s[0:1], exec +; GFX9DAGISEL-NEXT: s_load_dword s2, s[4:5], 0x2c +; GFX9DAGISEL-NEXT: s_bcnt1_i32_b64 s0, s[0:1] +; GFX9DAGISEL-NEXT: v_cvt_f32_i32_e32 v0, s0 +; GFX9DAGISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 +; GFX9DAGISEL-NEXT: v_mov_b32_e32 v1, 0 +; GFX9DAGISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX9DAGISEL-NEXT: v_mul_f32_e64 v0, -s2, v0 +; GFX9DAGISEL-NEXT: v_readfirstlane_b32 s2, v0 +; GFX9DAGISEL-NEXT: v_mov_b32_e32 v0, s2 +; GFX9DAGISEL-NEXT: 
global_store_dword v1, v0, s[0:1] +; GFX9DAGISEL-NEXT: s_endpgm +; +; GFX9GISEL-LABEL: uniform_value_float: +; GFX9GISEL: ; %bb.0: ; %entry +; GFX9GISEL-NEXT: s_mov_b64 s[0:1], exec +; GFX9GISEL-NEXT: s_load_dword s2, s[4:5], 0x2c +; GFX9GISEL-NEXT: s_bcnt1_i32_b64 s0, s[0:1] +; GFX9GISEL-NEXT: v_cvt_f32_i32_e32 v0, s0 +; GFX9GISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 +; GFX9GISEL-NEXT: v_mov_b32_e32 v1, 0 +; GFX9GISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX9GISEL-NEXT: v_mul_f32_e64 v0, -s2, v0 +; GFX9GISEL-NEXT: v_readfirstlane_b32 s2, v0 +; GFX9GISEL-NEXT: v_mov_b32_e32 v0, s2 +; GFX9GISEL-NEXT: global_store_dword v1, v0, s[0:1] +; GFX9GISEL-NEXT: s_endpgm +; +; GFX1064DAGISEL-LABEL: uniform_value_float: +; GFX1064DAGISEL: ; %bb.0: ; %entry +; GFX1064DAGISEL-NEXT: s_load_dword s2, s[4:5], 0x2c +; GFX1064DAGISEL-NEXT: s_mov_b64 s[0:1], exec +; GFX1064DAGISEL-NEXT: s_bcnt1_i32_b64 s3, s[0:1] +; GFX1064DAGISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 +; GFX1064DAGISEL-NEXT: v_cvt_f32_i32_e32 v0, s3 +; GFX1064DAGISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX1064DAGISEL-NEXT: v_mul_f32_e64 v0, -s2, v0 +; GFX1064DAGISEL-NEXT: v_readfirstlane_b32 s2, v0 +; GFX1064DAGISEL-NEXT: v_mov_b32_e32 v0, 0 +; GFX1064DAGISEL-NEXT: v_mov_b32_e32 v1, s2 +; GFX1064DAGISEL-NEXT: global_store_dword v0, v1, s[0:1] +; GFX1064DAGISEL-NEXT: s_endpgm +; +; GFX1064GISEL-LABEL: uniform_value_float: +; GFX1064GISEL: ; %bb.0: ; %entry +; GFX1064GISEL-NEXT: s_load_dword s2, s[4:5], 0x2c +; GFX1064GISEL-NEXT: s_mov_b64 s[0:1], exec +; GFX1064GISEL-NEXT: v_mov_b32_e32 v1, 0 +; GFX1064GISEL-NEXT: s_bcnt1_i32_b64 s0, s[0:1] +; GFX1064GISEL-NEXT: v_cvt_f32_i32_e32 v0, s0 +; GFX1064GISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 +; GFX1064GISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX1064GISEL-NEXT: v_mul_f32_e64 v0, -s2, v0 +; GFX1064GISEL-NEXT: v_readfirstlane_b32 s2, v0 +; GFX1064GISEL-NEXT: v_mov_b32_e32 v0, s2 +; GFX1064GISEL-NEXT: global_store_dword v1, v0, s[0:1] +; GFX1064GISEL-NEXT: s_endpgm +; +; 
GFX1032DAGISEL-LABEL: uniform_value_float: +; GFX1032DAGISEL: ; %bb.0: ; %entry +; GFX1032DAGISEL-NEXT: s_load_dword s2, s[4:5], 0x2c +; GFX1032DAGISEL-NEXT: s_mov_b32 s0, exec_lo +; GFX1032DAGISEL-NEXT: s_bcnt1_i32_b32 s3, s0 +; GFX1032DAGISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 +; GFX1032DAGISEL-NEXT: v_cvt_f32_i32_e32 v0, s3 +; GFX1032DAGISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX1032DAGISEL-NEXT: v_mul_f32_e64 v0, -s2, v0 +; GFX1032DAGISEL-NEXT: v_readfirstlane_b32 s2, v0 +; GFX1032DAGISEL-NEXT: v_mov_b32_e32 v0, 0 +; GFX1032DAGISEL-NEXT: v_mov_b32_e32 v1, s2 +; GFX1032DAGISEL-NEXT: global_store_dword v0, v1, s[0:1] +; GFX1032DAGISEL-NEXT: s_endpgm +; +; GFX1032GISEL-LABEL: uniform_value_float: +; GFX1032GISEL: ; %bb.0: ; %entry +; GFX1032GISEL-NEXT: s_load_dword s2, s[4:5], 0x2c +; GFX1032GISEL-NEXT: s_mov_b32 s0, exec_lo +; GFX1032GISEL-NEXT: v_mov_b32_e32 v1, 0 +; GFX1032GISEL-NEXT: s_bcnt1_i32_b32 s0, s0 +; GFX1032GISEL-NEXT: v_cvt_f32_i32_e32 v0, s0 +; GFX1032GISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 +; GFX1032GISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX1032GISEL-NEXT: v_mul_f32_e64 v0, -s2, v0 +; GFX1032GISEL-NEXT: v_readfirstlane_b32 s2, v0 +; GFX1032GISEL-NEXT: v_mov_b32_e32 v0, s2 +; GFX1032GISEL-NEXT: global_store_dword v1, v0, s[0:1] +; GFX1032GISEL-NEXT: s_endpgm +; +; GFX1164DAGISEL-LABEL: uniform_value_float: +; GFX1164DAGISEL: ; %bb.0: ; %entry +; GFX1164DAGISEL-NEXT: s_load_b32 s2, s[4:5], 0x2c +; GFX1164DAGISEL-NEXT: s_mov_b64 s[0:1], exec +; GFX1164DAGISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_3) | instid1(VALU_DEP_1) +; GFX1164DAGISEL-NEXT: s_bcnt1_i32_b64 s3, s[0:1] +; GFX1164DAGISEL-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 +; GFX1164DAGISEL-NEXT: v_cvt_f32_i32_e32 v0, s3 +; GFX1164DAGISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX1164DAGISEL-NEXT: v_mul_f32_e64 v0, -s2, v0 +; GFX1164DAGISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX1164DAGISEL-NEXT: v_readfirstlane_b32 s2, v0 +; 
GFX1164DAGISEL-NEXT: v_mov_b32_e32 v0, 0 +; GFX1164DAGISEL-NEXT: v_mov_b32_e32 v1, s2 +; GFX1164DAGISEL-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX1164DAGISEL-NEXT: s_endpgm +; +; GFX1164GISEL-LABEL: uniform_value_float: +; GFX1164GISEL: ; %bb.0: ; %entry +; GFX1164GISEL-NEXT: s_load_b32 s2, s[4:5], 0x2c +; GFX1164GISEL-NEXT: s_mov_b64 s[0:1], exec +; GFX1164GISEL-NEXT: v_mov_b32_e32 v1, 0 +; GFX1164GISEL-NEXT: s_bcnt1_i32_b64 s0, s[0:1] +; GFX1164GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_2) | instid1(VALU_DEP_1) +; GFX1164GISEL-NEXT: v_cvt_f32_i32_e32 v0, s0 +; GFX1164GISEL-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 +; GFX1164GISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX1164GISEL-NEXT: v_mul_f32_e64 v0, -s2, v0 +; GFX1164GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1164GISEL-NEXT: v_readfirstlane_b32 s2, v0 +; GFX1164GISEL-NEXT: v_mov_b32_e32 v0, s2 +; GFX1164GISEL-NEXT: global_store_b32 v1, v0, s[0:1] +; GFX1164GISEL-NEXT: s_endpgm +; +; GFX1132DAGISEL-LABEL: uniform_value_float: +; GFX1132DAGISEL: ; %bb.0: ; %entry +; GFX1132DAGISEL-NEXT: s_load_b32 s2, s[4:5], 0x2c +; GFX1132DAGISEL-NEXT: s_mov_b32 s0, exec_lo +; GFX1132DAGISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_3) | instid1(VALU_DEP_1) +; GFX1132DAGISEL-NEXT: s_bcnt1_i32_b32 s3, s0 +; GFX1132DAGISEL-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 +; GFX1132DAGISEL-NEXT: v_cvt_f32_i32_e32 v0, s3 +; GFX1132DAGISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX1132DAGISEL-NEXT: v_mul_f32_e64 v0, -s2, v0 +; GFX1132DAGISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1132DAGISEL-NEXT: v_readfirstlane_b32 s2, v0 +; GFX1132DAGISEL-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2 +; GFX1132DAGISEL-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX1132DAGISEL-NEXT: s_endpgm +; +; GFX1132GISEL-LABEL: uniform_value_float: +; GFX1132GISEL: ; %bb.0: ; %entry +; GFX1132GISEL-NEXT: s_load_b32 s2, s[4:5], 0x2c +; GFX1132GISEL-NEXT: 
s_mov_b32 s0, exec_lo +; GFX1132GISEL-NEXT: v_mov_b32_e32 v1, 0 +; GFX1132GISEL-NEXT: s_bcnt1_i32_b32 s0, s0 +; GFX1132GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_2) | instid1(VALU_DEP_1) +; GFX1132GISEL-NEXT: v_cvt_f32_i32_e32 v0, s0 +; GFX1132GISEL-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 +; GFX1132GISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX1132GISEL-NEXT: v_mul_f32_e64 v0, -s2, v0 +; GFX1132GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1132GISEL-NEXT: v_readfirstlane_b32 s2, v0 +; GFX1132GISEL-NEXT: v_mov_b32_e32 v0, s2 +; GFX1132GISEL-NEXT: global_store_b32 v1, v0, s[0:1] +; GFX1132GISEL-NEXT: s_endpgm +; +; GFX12DAGISEL-LABEL: uniform_value_float: +; GFX12DAGISEL: ; %bb.0: ; %entry +; GFX12DAGISEL-NEXT: s_load_b96 s[0:2], s[4:5], 0x24 +; GFX12DAGISEL-NEXT: s_mov_b32 s3, exec_lo +; GFX12DAGISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX12DAGISEL-NEXT: s_bcnt1_i32_b32 s3, s3 +; GFX12DAGISEL-NEXT: v_cvt_f32_i32_e32 v0, s3 +; GFX12DAGISEL-NEXT: s_wait_kmcnt 0x0 +; GFX12DAGISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12DAGISEL-NEXT: v_mul_f32_e64 v0, -s2, v0 +; GFX12DAGISEL-NEXT: v_readfirstlane_b32 s2, v0 +; GFX12DAGISEL-NEXT: s_wait_alu depctr_va_sdst(0) +; GFX12DAGISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12DAGISEL-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2 +; GFX12DAGISEL-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX12DAGISEL-NEXT: s_endpgm +entry: + %result = call float @llvm.amdgcn.wave.reduce.fsub(float %in, i32 1) + store float %result, ptr addrspace(1) %out + ret void +} + +define void @divergent_value_float(ptr addrspace(1) %out, float %id.x) { +; GFX8DAGISEL-LABEL: divergent_value_float: +; GFX8DAGISEL: ; %bb.0: ; %entry +; GFX8DAGISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX8DAGISEL-NEXT: s_mov_b64 s[4:5], exec +; GFX8DAGISEL-NEXT: s_mov_b32 s6, 0 +; GFX8DAGISEL-NEXT: .LBB1_1: ; =>This 
Inner Loop Header: Depth=1 +; GFX8DAGISEL-NEXT: s_ff1_i32_b64 s7, s[4:5] +; GFX8DAGISEL-NEXT: v_readlane_b32 s8, v2, s7 +; GFX8DAGISEL-NEXT: v_mov_b32_e32 v3, s8 +; GFX8DAGISEL-NEXT: s_bitset0_b64 s[4:5], s7 +; GFX8DAGISEL-NEXT: v_sub_f32_e32 v3, s6, v3 +; GFX8DAGISEL-NEXT: s_cmp_lg_u64 s[4:5], 0 +; GFX8DAGISEL-NEXT: v_readfirstlane_b32 s6, v3 +; GFX8DAGISEL-NEXT: s_cbranch_scc1 .LBB1_1 +; GFX8DAGISEL-NEXT: ; %bb.2: +; GFX8DAGISEL-NEXT: v_mov_b32_e32 v2, s6 +; GFX8DAGISEL-NEXT: flat_store_dword v[0:1], v2 +; GFX8DAGISEL-NEXT: s_waitcnt vmcnt(0) +; GFX8DAGISEL-NEXT: s_setpc_b64 s[30:31] +; +; GFX8GISEL-LABEL: divergent_value_float: +; GFX8GISEL: ; %bb.0: ; %entry +; GFX8GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX8GISEL-NEXT: s_mov_b64 s[4:5], exec +; GFX8GISEL-NEXT: s_mov_b32 s6, 0 +; GFX8GISEL-NEXT: .LBB1_1: ; =>This Inner Loop Header: Depth=1 +; GFX8GISEL-NEXT: s_ff1_i32_b64 s7, s[4:5] +; GFX8GISEL-NEXT: v_readlane_b32 s8, v2, s7 +; GFX8GISEL-NEXT: v_mov_b32_e32 v3, s8 +; GFX8GISEL-NEXT: s_bitset0_b64 s[4:5], s7 +; GFX8GISEL-NEXT: v_sub_f32_e32 v3, s6, v3 +; GFX8GISEL-NEXT: s_cmp_lg_u64 s[4:5], 0 +; GFX8GISEL-NEXT: v_readfirstlane_b32 s6, v3 +; GFX8GISEL-NEXT: s_cbranch_scc1 .LBB1_1 +; GFX8GISEL-NEXT: ; %bb.2: +; GFX8GISEL-NEXT: v_mov_b32_e32 v2, s6 +; GFX8GISEL-NEXT: flat_store_dword v[0:1], v2 +; GFX8GISEL-NEXT: s_waitcnt vmcnt(0) +; GFX8GISEL-NEXT: s_setpc_b64 s[30:31] +; +; GFX9DAGISEL-LABEL: divergent_value_float: +; GFX9DAGISEL: ; %bb.0: ; %entry +; GFX9DAGISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9DAGISEL-NEXT: s_mov_b64 s[4:5], exec +; GFX9DAGISEL-NEXT: s_mov_b32 s6, 0 +; GFX9DAGISEL-NEXT: .LBB1_1: ; =>This Inner Loop Header: Depth=1 +; GFX9DAGISEL-NEXT: s_ff1_i32_b64 s7, s[4:5] +; GFX9DAGISEL-NEXT: v_readlane_b32 s8, v2, s7 +; GFX9DAGISEL-NEXT: v_mov_b32_e32 v3, s8 +; GFX9DAGISEL-NEXT: s_bitset0_b64 s[4:5], s7 +; GFX9DAGISEL-NEXT: v_sub_f32_e32 v3, s6, v3 +; GFX9DAGISEL-NEXT: s_cmp_lg_u64 s[4:5], 0 +; GFX9DAGISEL-NEXT: 
v_readfirstlane_b32 s6, v3 +; GFX9DAGISEL-NEXT: s_cbranch_scc1 .LBB1_1 +; GFX9DAGISEL-NEXT: ; %bb.2: +; GFX9DAGISEL-NEXT: v_mov_b32_e32 v2, s6 +; GFX9DAGISEL-NEXT: global_store_dword v[0:1], v2, off +; GFX9DAGISEL-NEXT: s_waitcnt vmcnt(0) +; GFX9DAGISEL-NEXT: s_setpc_b64 s[30:31] +; +; GFX9GISEL-LABEL: divergent_value_float: +; GFX9GISEL: ; %bb.0: ; %entry +; GFX9GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9GISEL-NEXT: s_mov_b64 s[4:5], exec +; GFX9GISEL-NEXT: s_mov_b32 s6, 0 +; GFX9GISEL-NEXT: .LBB1_1: ; =>This Inner Loop Header: Depth=1 +; GFX9GISEL-NEXT: s_ff1_i32_b64 s7, s[4:5] +; GFX9GISEL-NEXT: v_readlane_b32 s8, v2, s7 +; GFX9GISEL-NEXT: v_mov_b32_e32 v3, s8 +; GFX9GISEL-NEXT: s_bitset0_b64 s[4:5], s7 +; GFX9GISEL-NEXT: v_sub_f32_e32 v3, s6, v3 +; GFX9GISEL-NEXT: s_cmp_lg_u64 s[4:5], 0 +; GFX9GISEL-NEXT: v_readfirstlane_b32 s6, v3 +; GFX9GISEL-NEXT: s_cbranch_scc1 .LBB1_1 +; GFX9GISEL-NEXT: ; %bb.2: +; GFX9GISEL-NEXT: v_mov_b32_e32 v2, s6 +; GFX9GISEL-NEXT: global_store_dword v[0:1], v2, off +; GFX9GISEL-NEXT: s_waitcnt vmcnt(0) +; GFX9GISEL-NEXT: s_setpc_b64 s[30:31] +; +; GFX1064DAGISEL-LABEL: divergent_value_float: +; GFX1064DAGISEL: ; %bb.0: ; %entry +; GFX1064DAGISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX1064DAGISEL-NEXT: s_mov_b64 s[4:5], exec +; GFX1064DAGISEL-NEXT: s_mov_b32 s6, 0 +; GFX1064DAGISEL-NEXT: .LBB1_1: ; =>This Inner Loop Header: Depth=1 +; GFX1064DAGISEL-NEXT: s_ff1_i32_b64 s7, s[4:5] +; GFX1064DAGISEL-NEXT: v_readlane_b32 s8, v2, s7 +; GFX1064DAGISEL-NEXT: s_bitset0_b64 s[4:5], s7 +; GFX1064DAGISEL-NEXT: s_cmp_lg_u64 s[4:5], 0 +; GFX1064DAGISEL-NEXT: v_sub_f32_e64 v3, s6, s8 +; GFX1064DAGISEL-NEXT: v_readfirstlane_b32 s6, v3 +; GFX1064DAGISEL-NEXT: s_cbranch_scc1 .LBB1_1 +; GFX1064DAGISEL-NEXT: ; %bb.2: +; GFX1064DAGISEL-NEXT: v_mov_b32_e32 v2, s6 +; GFX1064DAGISEL-NEXT: global_store_dword v[0:1], v2, off +; GFX1064DAGISEL-NEXT: s_setpc_b64 s[30:31] +; +; GFX1064GISEL-LABEL: divergent_value_float: +; 
GFX1064GISEL: ; %bb.0: ; %entry +; GFX1064GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX1064GISEL-NEXT: s_mov_b64 s[4:5], exec +; GFX1064GISEL-NEXT: s_mov_b32 s6, 0 +; GFX1064GISEL-NEXT: .LBB1_1: ; =>This Inner Loop Header: Depth=1 +; GFX1064GISEL-NEXT: s_ff1_i32_b64 s7, s[4:5] +; GFX1064GISEL-NEXT: v_readlane_b32 s8, v2, s7 +; GFX1064GISEL-NEXT: s_bitset0_b64 s[4:5], s7 +; GFX1064GISEL-NEXT: s_cmp_lg_u64 s[4:5], 0 +; GFX1064GISEL-NEXT: v_sub_f32_e64 v3, s6, s8 +; GFX1064GISEL-NEXT: v_readfirstlane_b32 s6, v3 +; GFX1064GISEL-NEXT: s_cbranch_scc1 .LBB1_1 +; GFX1064GISEL-NEXT: ; %bb.2: +; GFX1064GISEL-NEXT: v_mov_b32_e32 v2, s6 +; GFX1064GISEL-NEXT: global_store_dword v[0:1], v2, off +; GFX1064GISEL-NEXT: s_setpc_b64 s[30:31] +; +; GFX1032DAGISEL-LABEL: divergent_value_float: +; GFX1032DAGISEL: ; %bb.0: ; %entry +; GFX1032DAGISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX1032DAGISEL-NEXT: s_mov_b32 s4, exec_lo +; GFX1032DAGISEL-NEXT: s_mov_b32 s5, 0 +; GFX1032DAGISEL-NEXT: .LBB1_1: ; =>This Inner Loop Header: Depth=1 +; GFX1032DAGISEL-NEXT: s_ff1_i32_b32 s6, s4 +; GFX1032DAGISEL-NEXT: v_readlane_b32 s7, v2, s6 +; GFX1032DAGISEL-NEXT: s_bitset0_b32 s4, s6 +; GFX1032DAGISEL-NEXT: s_cmp_lg_u32 s4, 0 +; GFX1032DAGISEL-NEXT: v_sub_f32_e64 v3, s5, s7 +; GFX1032DAGISEL-NEXT: v_readfirstlane_b32 s5, v3 +; GFX1032DAGISEL-NEXT: s_cbranch_scc1 .LBB1_1 +; GFX1032DAGISEL-NEXT: ; %bb.2: +; GFX1032DAGISEL-NEXT: v_mov_b32_e32 v2, s5 +; GFX1032DAGISEL-NEXT: global_store_dword v[0:1], v2, off +; GFX1032DAGISEL-NEXT: s_setpc_b64 s[30:31] +; +; GFX1032GISEL-LABEL: divergent_value_float: +; GFX1032GISEL: ; %bb.0: ; %entry +; GFX1032GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX1032GISEL-NEXT: s_mov_b32 s4, exec_lo +; GFX1032GISEL-NEXT: s_mov_b32 s5, 0 +; GFX1032GISEL-NEXT: .LBB1_1: ; =>This Inner Loop Header: Depth=1 +; GFX1032GISEL-NEXT: s_ff1_i32_b32 s6, s4 +; GFX1032GISEL-NEXT: v_readlane_b32 s7, v2, s6 +; GFX1032GISEL-NEXT: s_bitset0_b32 s4, s6 +; 
GFX1032GISEL-NEXT: s_cmp_lg_u32 s4, 0 +; GFX1032GISEL-NEXT: v_sub_f32_e64 v3, s5, s7 +; GFX1032GISEL-NEXT: v_readfirstlane_b32 s5, v3 +; GFX1032GISEL-NEXT: s_cbranch_scc1 .LBB1_1 +; GFX1032GISEL-NEXT: ; %bb.2: +; GFX1032GISEL-NEXT: v_mov_b32_e32 v2, s5 +; GFX1032GISEL-NEXT: global_store_dword v[0:1], v2, off +; GFX1032GISEL-NEXT: s_setpc_b64 s[30:31] +; +; GFX1164DAGISEL-LABEL: divergent_value_float: +; GFX1164DAGISEL: ; %bb.0: ; %entry +; GFX1164DAGISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX1164DAGISEL-NEXT: s_mov_b64 s[0:1], exec +; GFX1164DAGISEL-NEXT: s_mov_b32 s2, 0 +; GFX1164DAGISEL-NEXT: .LBB1_1: ; =>This Inner Loop Header: Depth=1 +; GFX1164DAGISEL-NEXT: s_ctz_i32_b64 s3, s[0:1] +; GFX1164DAGISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) +; GFX1164DAGISEL-NEXT: v_readlane_b32 s4, v2, s3 +; GFX1164DAGISEL-NEXT: s_bitset0_b64 s[0:1], s3 +; GFX1164DAGISEL-NEXT: s_cmp_lg_u64 s[0:1], 0 +; GFX1164DAGISEL-NEXT: v_sub_f32_e64 v3, s2, s4 +; GFX1164DAGISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1164DAGISEL-NEXT: v_readfirstlane_b32 s2, v3 +; GFX1164DAGISEL-NEXT: s_cbranch_scc1 .LBB1_1 +; GFX1164DAGISEL-NEXT: ; %bb.2: +; GFX1164DAGISEL-NEXT: v_mov_b32_e32 v2, s2 +; GFX1164DAGISEL-NEXT: global_store_b32 v[0:1], v2, off +; GFX1164DAGISEL-NEXT: s_setpc_b64 s[30:31] +; +; GFX1164GISEL-LABEL: divergent_value_float: +; GFX1164GISEL: ; %bb.0: ; %entry +; GFX1164GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX1164GISEL-NEXT: s_mov_b64 s[0:1], exec +; GFX1164GISEL-NEXT: s_mov_b32 s2, 0 +; GFX1164GISEL-NEXT: .LBB1_1: ; =>This Inner Loop Header: Depth=1 +; GFX1164GISEL-NEXT: s_ctz_i32_b64 s3, s[0:1] +; GFX1164GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) +; GFX1164GISEL-NEXT: v_readlane_b32 s4, v2, s3 +; GFX1164GISEL-NEXT: s_bitset0_b64 s[0:1], s3 +; GFX1164GISEL-NEXT: s_cmp_lg_u64 s[0:1], 0 +; GFX1164GISEL-NEXT: v_sub_f32_e64 v3, s2, s4 +; GFX1164GISEL-NEXT: 
s_delay_alu instid0(VALU_DEP_1) +; GFX1164GISEL-NEXT: v_readfirstlane_b32 s2, v3 +; GFX1164GISEL-NEXT: s_cbranch_scc1 .LBB1_1 +; GFX1164GISEL-NEXT: ; %bb.2: +; GFX1164GISEL-NEXT: v_mov_b32_e32 v2, s2 +; GFX1164GISEL-NEXT: global_store_b32 v[0:1], v2, off +; GFX1164GISEL-NEXT: s_setpc_b64 s[30:31] +; +; GFX1132DAGISEL-LABEL: divergent_value_float: +; GFX1132DAGISEL: ; %bb.0: ; %entry +; GFX1132DAGISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX1132DAGISEL-NEXT: s_mov_b32 s0, exec_lo +; GFX1132DAGISEL-NEXT: s_mov_b32 s1, 0 +; GFX1132DAGISEL-NEXT: .LBB1_1: ; =>This Inner Loop Header: Depth=1 +; GFX1132DAGISEL-NEXT: s_ctz_i32_b32 s2, s0 +; GFX1132DAGISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) +; GFX1132DAGISEL-NEXT: v_readlane_b32 s3, v2, s2 +; GFX1132DAGISEL-NEXT: s_bitset0_b32 s0, s2 +; GFX1132DAGISEL-NEXT: s_cmp_lg_u32 s0, 0 +; GFX1132DAGISEL-NEXT: v_sub_f32_e64 v3, s1, s3 +; GFX1132DAGISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1132DAGISEL-NEXT: v_readfirstlane_b32 s1, v3 +; GFX1132DAGISEL-NEXT: s_cbranch_scc1 .LBB1_1 +; GFX1132DAGISEL-NEXT: ; %bb.2: +; GFX1132DAGISEL-NEXT: v_mov_b32_e32 v2, s1 +; GFX1132DAGISEL-NEXT: global_store_b32 v[0:1], v2, off +; GFX1132DAGISEL-NEXT: s_setpc_b64 s[30:31] +; +; GFX1132GISEL-LABEL: divergent_value_float: +; GFX1132GISEL: ; %bb.0: ; %entry +; GFX1132GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX1132GISEL-NEXT: s_mov_b32 s0, exec_lo +; GFX1132GISEL-NEXT: s_mov_b32 s1, 0 +; GFX1132GISEL-NEXT: .LBB1_1: ; =>This Inner Loop Header: Depth=1 +; GFX1132GISEL-NEXT: s_ctz_i32_b32 s2, s0 +; GFX1132GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) +; GFX1132GISEL-NEXT: v_readlane_b32 s3, v2, s2 +; GFX1132GISEL-NEXT: s_bitset0_b32 s0, s2 +; GFX1132GISEL-NEXT: s_cmp_lg_u32 s0, 0 +; GFX1132GISEL-NEXT: v_sub_f32_e64 v3, s1, s3 +; GFX1132GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1132GISEL-NEXT: v_readfirstlane_b32 s1, v3 +; 
GFX1132GISEL-NEXT: s_cbranch_scc1 .LBB1_1 +; GFX1132GISEL-NEXT: ; %bb.2: +; GFX1132GISEL-NEXT: v_mov_b32_e32 v2, s1 +; GFX1132GISEL-NEXT: global_store_b32 v[0:1], v2, off +; GFX1132GISEL-NEXT: s_setpc_b64 s[30:31] +; +; GFX12DAGISEL-LABEL: divergent_value_float: +; GFX12DAGISEL: ; %bb.0: ; %entry +; GFX12DAGISEL-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12DAGISEL-NEXT: s_wait_expcnt 0x0 +; GFX12DAGISEL-NEXT: s_wait_samplecnt 0x0 +; GFX12DAGISEL-NEXT: s_wait_bvhcnt 0x0 +; GFX12DAGISEL-NEXT: s_wait_kmcnt 0x0 +; GFX12DAGISEL-NEXT: s_mov_b32 s0, exec_lo +; GFX12DAGISEL-NEXT: s_mov_b32 s1, 0 +; GFX12DAGISEL-NEXT: .LBB1_1: ; =>This Inner Loop Header: Depth=1 +; GFX12DAGISEL-NEXT: s_wait_alu depctr_sa_sdst(0) +; GFX12DAGISEL-NEXT: s_ctz_i32_b32 s2, s0 +; GFX12DAGISEL-NEXT: s_wait_alu depctr_sa_sdst(0) +; GFX12DAGISEL-NEXT: v_readlane_b32 s3, v2, s2 +; GFX12DAGISEL-NEXT: s_bitset0_b32 s0, s2 +; GFX12DAGISEL-NEXT: s_wait_alu depctr_sa_sdst(0) +; GFX12DAGISEL-NEXT: s_cmp_lg_u32 s0, 0 +; GFX12DAGISEL-NEXT: v_sub_f32_e64 v3, s1, s3 +; GFX12DAGISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12DAGISEL-NEXT: v_readfirstlane_b32 s1, v3 +; GFX12DAGISEL-NEXT: s_cbranch_scc1 .LBB1_1 +; GFX12DAGISEL-NEXT: ; %bb.2: +; GFX12DAGISEL-NEXT: s_wait_alu depctr_va_sdst(0) +; GFX12DAGISEL-NEXT: v_mov_b32_e32 v2, s1 +; GFX12DAGISEL-NEXT: global_store_b32 v[0:1], v2, off +; GFX12DAGISEL-NEXT: s_setpc_b64 s[30:31] +entry: + %result = call float @llvm.amdgcn.wave.reduce.fsub(float %id.x, i32 1) + store float %result, ptr addrspace(1) %out + ret void +} + +define amdgpu_kernel void @divergent_cfg_float(ptr addrspace(1) %out, float %in, float %in2) { +; GFX8DAGISEL-LABEL: divergent_cfg_float: +; GFX8DAGISEL: ; %bb.0: ; %entry +; GFX8DAGISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x2c +; GFX8DAGISEL-NEXT: v_cmp_lt_u32_e32 vcc, 15, v0 +; GFX8DAGISEL-NEXT: ; implicit-def: $sgpr6 +; GFX8DAGISEL-NEXT: s_and_saveexec_b64 s[2:3], vcc +; GFX8DAGISEL-NEXT: s_xor_b64 s[2:3], exec, s[2:3] +; GFX8DAGISEL-NEXT: 
s_cbranch_execz .LBB2_2 +; GFX8DAGISEL-NEXT: ; %bb.1: ; %else +; GFX8DAGISEL-NEXT: s_mov_b64 s[6:7], exec +; GFX8DAGISEL-NEXT: s_bcnt1_i32_b64 s6, s[6:7] +; GFX8DAGISEL-NEXT: v_cvt_f32_i32_e32 v0, s6 +; GFX8DAGISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX8DAGISEL-NEXT: v_mul_f32_e64 v0, -s0, v0 +; GFX8DAGISEL-NEXT: v_readfirstlane_b32 s6, v0 +; GFX8DAGISEL-NEXT: .LBB2_2: ; %Flow +; GFX8DAGISEL-NEXT: s_or_saveexec_b64 s[2:3], s[2:3] +; GFX8DAGISEL-NEXT: v_mov_b32_e32 v0, s6 +; GFX8DAGISEL-NEXT: s_xor_b64 exec, exec, s[2:3] +; GFX8DAGISEL-NEXT: s_cbranch_execz .LBB2_4 +; GFX8DAGISEL-NEXT: ; %bb.3: ; %if +; GFX8DAGISEL-NEXT: s_mov_b64 s[6:7], exec +; GFX8DAGISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX8DAGISEL-NEXT: s_bcnt1_i32_b64 s0, s[6:7] +; GFX8DAGISEL-NEXT: v_cvt_f32_i32_e32 v0, s0 +; GFX8DAGISEL-NEXT: v_mul_f32_e64 v0, -s1, v0 +; GFX8DAGISEL-NEXT: v_readfirstlane_b32 s0, v0 +; GFX8DAGISEL-NEXT: v_mov_b32_e32 v0, s0 +; GFX8DAGISEL-NEXT: .LBB2_4: ; %endif +; GFX8DAGISEL-NEXT: s_or_b64 exec, exec, s[2:3] +; GFX8DAGISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX8DAGISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 +; GFX8DAGISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX8DAGISEL-NEXT: v_mov_b32_e32 v2, s1 +; GFX8DAGISEL-NEXT: v_mov_b32_e32 v1, s0 +; GFX8DAGISEL-NEXT: flat_store_dword v[1:2], v0 +; GFX8DAGISEL-NEXT: s_endpgm +; +; GFX8GISEL-LABEL: divergent_cfg_float: +; GFX8GISEL: ; %bb.0: ; %entry +; GFX8GISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x2c +; GFX8GISEL-NEXT: v_cmp_le_u32_e32 vcc, 16, v0 +; GFX8GISEL-NEXT: ; implicit-def: $sgpr6 +; GFX8GISEL-NEXT: s_and_saveexec_b64 s[2:3], vcc +; GFX8GISEL-NEXT: s_xor_b64 s[2:3], exec, s[2:3] +; GFX8GISEL-NEXT: s_cbranch_execz .LBB2_2 +; GFX8GISEL-NEXT: ; %bb.1: ; %else +; GFX8GISEL-NEXT: s_mov_b64 s[6:7], exec +; GFX8GISEL-NEXT: s_bcnt1_i32_b64 s6, s[6:7] +; GFX8GISEL-NEXT: v_cvt_f32_i32_e32 v0, s6 +; GFX8GISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX8GISEL-NEXT: v_mul_f32_e64 v0, -s0, v0 +; GFX8GISEL-NEXT: v_readfirstlane_b32 s6, v0 +; GFX8GISEL-NEXT: 
.LBB2_2: ; %Flow +; GFX8GISEL-NEXT: s_andn2_saveexec_b64 s[2:3], s[2:3] +; GFX8GISEL-NEXT: s_cbranch_execz .LBB2_4 +; GFX8GISEL-NEXT: ; %bb.3: ; %if +; GFX8GISEL-NEXT: s_mov_b64 s[6:7], exec +; GFX8GISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX8GISEL-NEXT: s_bcnt1_i32_b64 s0, s[6:7] +; GFX8GISEL-NEXT: v_cvt_f32_i32_e32 v0, s0 +; GFX8GISEL-NEXT: v_mul_f32_e64 v0, -s1, v0 +; GFX8GISEL-NEXT: v_readfirstlane_b32 s6, v0 +; GFX8GISEL-NEXT: .LBB2_4: ; %endif +; GFX8GISEL-NEXT: s_or_b64 exec, exec, s[2:3] +; GFX8GISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX8GISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 +; GFX8GISEL-NEXT: v_mov_b32_e32 v2, s6 +; GFX8GISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX8GISEL-NEXT: v_mov_b32_e32 v0, s0 +; GFX8GISEL-NEXT: v_mov_b32_e32 v1, s1 +; GFX8GISEL-NEXT: flat_store_dword v[0:1], v2 +; GFX8GISEL-NEXT: s_endpgm +; +; GFX9DAGISEL-LABEL: divergent_cfg_float: +; GFX9DAGISEL: ; %bb.0: ; %entry +; GFX9DAGISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x2c +; GFX9DAGISEL-NEXT: v_cmp_lt_u32_e32 vcc, 15, v0 +; GFX9DAGISEL-NEXT: ; implicit-def: $sgpr6 +; GFX9DAGISEL-NEXT: s_and_saveexec_b64 s[2:3], vcc +; GFX9DAGISEL-NEXT: s_xor_b64 s[2:3], exec, s[2:3] +; GFX9DAGISEL-NEXT: s_cbranch_execz .LBB2_2 +; GFX9DAGISEL-NEXT: ; %bb.1: ; %else +; GFX9DAGISEL-NEXT: s_mov_b64 s[6:7], exec +; GFX9DAGISEL-NEXT: s_bcnt1_i32_b64 s6, s[6:7] +; GFX9DAGISEL-NEXT: v_cvt_f32_i32_e32 v0, s6 +; GFX9DAGISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX9DAGISEL-NEXT: v_mul_f32_e64 v0, -s0, v0 +; GFX9DAGISEL-NEXT: v_readfirstlane_b32 s6, v0 +; GFX9DAGISEL-NEXT: .LBB2_2: ; %Flow +; GFX9DAGISEL-NEXT: s_or_saveexec_b64 s[2:3], s[2:3] +; GFX9DAGISEL-NEXT: v_mov_b32_e32 v0, s6 +; GFX9DAGISEL-NEXT: s_xor_b64 exec, exec, s[2:3] +; GFX9DAGISEL-NEXT: s_cbranch_execz .LBB2_4 +; GFX9DAGISEL-NEXT: ; %bb.3: ; %if +; GFX9DAGISEL-NEXT: s_mov_b64 s[6:7], exec +; GFX9DAGISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX9DAGISEL-NEXT: s_bcnt1_i32_b64 s0, s[6:7] +; GFX9DAGISEL-NEXT: v_cvt_f32_i32_e32 v0, s0 +; GFX9DAGISEL-NEXT: v_mul_f32_e64 
v0, -s1, v0 +; GFX9DAGISEL-NEXT: v_readfirstlane_b32 s0, v0 +; GFX9DAGISEL-NEXT: v_mov_b32_e32 v0, s0 +; GFX9DAGISEL-NEXT: .LBB2_4: ; %endif +; GFX9DAGISEL-NEXT: s_or_b64 exec, exec, s[2:3] +; GFX9DAGISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX9DAGISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 +; GFX9DAGISEL-NEXT: v_mov_b32_e32 v1, 0 +; GFX9DAGISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX9DAGISEL-NEXT: global_store_dword v1, v0, s[0:1] +; GFX9DAGISEL-NEXT: s_endpgm +; +; GFX9GISEL-LABEL: divergent_cfg_float: +; GFX9GISEL: ; %bb.0: ; %entry +; GFX9GISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x2c +; GFX9GISEL-NEXT: v_cmp_le_u32_e32 vcc, 16, v0 +; GFX9GISEL-NEXT: ; implicit-def: $sgpr6 +; GFX9GISEL-NEXT: s_and_saveexec_b64 s[2:3], vcc +; GFX9GISEL-NEXT: s_xor_b64 s[2:3], exec, s[2:3] +; GFX9GISEL-NEXT: s_cbranch_execz .LBB2_2 +; GFX9GISEL-NEXT: ; %bb.1: ; %else +; GFX9GISEL-NEXT: s_mov_b64 s[6:7], exec +; GFX9GISEL-NEXT: s_bcnt1_i32_b64 s6, s[6:7] +; GFX9GISEL-NEXT: v_cvt_f32_i32_e32 v0, s6 +; GFX9GISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX9GISEL-NEXT: v_mul_f32_e64 v0, -s0, v0 +; GFX9GISEL-NEXT: v_readfirstlane_b32 s6, v0 +; GFX9GISEL-NEXT: .LBB2_2: ; %Flow +; GFX9GISEL-NEXT: s_andn2_saveexec_b64 s[2:3], s[2:3] +; GFX9GISEL-NEXT: s_cbranch_execz .LBB2_4 +; GFX9GISEL-NEXT: ; %bb.3: ; %if +; GFX9GISEL-NEXT: s_mov_b64 s[6:7], exec +; GFX9GISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX9GISEL-NEXT: s_bcnt1_i32_b64 s0, s[6:7] +; GFX9GISEL-NEXT: v_cvt_f32_i32_e32 v0, s0 +; GFX9GISEL-NEXT: v_mul_f32_e64 v0, -s1, v0 +; GFX9GISEL-NEXT: v_readfirstlane_b32 s6, v0 +; GFX9GISEL-NEXT: .LBB2_4: ; %endif +; GFX9GISEL-NEXT: s_or_b64 exec, exec, s[2:3] +; GFX9GISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX9GISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 +; GFX9GISEL-NEXT: v_mov_b32_e32 v0, s6 +; GFX9GISEL-NEXT: v_mov_b32_e32 v1, 0 +; GFX9GISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX9GISEL-NEXT: global_store_dword v1, v0, s[0:1] +; GFX9GISEL-NEXT: s_endpgm +; +; GFX1064DAGISEL-LABEL: divergent_cfg_float: +; GFX1064DAGISEL: ; 
%bb.0: ; %entry +; GFX1064DAGISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x2c +; GFX1064DAGISEL-NEXT: v_cmp_lt_u32_e32 vcc, 15, v0 +; GFX1064DAGISEL-NEXT: ; implicit-def: $sgpr6 +; GFX1064DAGISEL-NEXT: s_and_saveexec_b64 s[2:3], vcc +; GFX1064DAGISEL-NEXT: s_xor_b64 s[2:3], exec, s[2:3] +; GFX1064DAGISEL-NEXT: s_cbranch_execz .LBB2_2 +; GFX1064DAGISEL-NEXT: ; %bb.1: ; %else +; GFX1064DAGISEL-NEXT: s_mov_b64 s[6:7], exec +; GFX1064DAGISEL-NEXT: s_bcnt1_i32_b64 s6, s[6:7] +; GFX1064DAGISEL-NEXT: v_cvt_f32_i32_e32 v0, s6 +; GFX1064DAGISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX1064DAGISEL-NEXT: v_mul_f32_e64 v0, -s0, v0 +; GFX1064DAGISEL-NEXT: v_readfirstlane_b32 s6, v0 +; GFX1064DAGISEL-NEXT: .LBB2_2: ; %Flow +; GFX1064DAGISEL-NEXT: s_or_saveexec_b64 s[2:3], s[2:3] +; GFX1064DAGISEL-NEXT: v_mov_b32_e32 v0, s6 +; GFX1064DAGISEL-NEXT: s_xor_b64 exec, exec, s[2:3] +; GFX1064DAGISEL-NEXT: s_cbranch_execz .LBB2_4 +; GFX1064DAGISEL-NEXT: ; %bb.3: ; %if +; GFX1064DAGISEL-NEXT: s_mov_b64 s[6:7], exec +; GFX1064DAGISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX1064DAGISEL-NEXT: s_bcnt1_i32_b64 s0, s[6:7] +; GFX1064DAGISEL-NEXT: v_cvt_f32_i32_e32 v0, s0 +; GFX1064DAGISEL-NEXT: v_mul_f32_e64 v0, -s1, v0 +; GFX1064DAGISEL-NEXT: v_readfirstlane_b32 s0, v0 +; GFX1064DAGISEL-NEXT: v_mov_b32_e32 v0, s0 +; GFX1064DAGISEL-NEXT: .LBB2_4: ; %endif +; GFX1064DAGISEL-NEXT: s_or_b64 exec, exec, s[2:3] +; GFX1064DAGISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX1064DAGISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 +; GFX1064DAGISEL-NEXT: v_mov_b32_e32 v1, 0 +; GFX1064DAGISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX1064DAGISEL-NEXT: global_store_dword v1, v0, s[0:1] +; GFX1064DAGISEL-NEXT: s_endpgm +; +; GFX1064GISEL-LABEL: divergent_cfg_float: +; GFX1064GISEL: ; %bb.0: ; %entry +; GFX1064GISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x2c +; GFX1064GISEL-NEXT: v_cmp_le_u32_e32 vcc, 16, v0 +; GFX1064GISEL-NEXT: ; implicit-def: $sgpr6 +; GFX1064GISEL-NEXT: s_and_saveexec_b64 s[2:3], vcc +; GFX1064GISEL-NEXT: s_xor_b64 s[2:3], 
exec, s[2:3] +; GFX1064GISEL-NEXT: s_cbranch_execz .LBB2_2 +; GFX1064GISEL-NEXT: ; %bb.1: ; %else +; GFX1064GISEL-NEXT: s_mov_b64 s[6:7], exec +; GFX1064GISEL-NEXT: s_bcnt1_i32_b64 s6, s[6:7] +; GFX1064GISEL-NEXT: v_cvt_f32_i32_e32 v0, s6 +; GFX1064GISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX1064GISEL-NEXT: v_mul_f32_e64 v0, -s0, v0 +; GFX1064GISEL-NEXT: v_readfirstlane_b32 s6, v0 +; GFX1064GISEL-NEXT: .LBB2_2: ; %Flow +; GFX1064GISEL-NEXT: s_andn2_saveexec_b64 s[2:3], s[2:3] +; GFX1064GISEL-NEXT: s_cbranch_execz .LBB2_4 +; GFX1064GISEL-NEXT: ; %bb.3: ; %if +; GFX1064GISEL-NEXT: s_mov_b64 s[6:7], exec +; GFX1064GISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX1064GISEL-NEXT: s_bcnt1_i32_b64 s0, s[6:7] +; GFX1064GISEL-NEXT: v_cvt_f32_i32_e32 v0, s0 +; GFX1064GISEL-NEXT: v_mul_f32_e64 v0, -s1, v0 +; GFX1064GISEL-NEXT: v_readfirstlane_b32 s6, v0 +; GFX1064GISEL-NEXT: .LBB2_4: ; %endif +; GFX1064GISEL-NEXT: s_or_b64 exec, exec, s[2:3] +; GFX1064GISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX1064GISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 +; GFX1064GISEL-NEXT: v_mov_b32_e32 v0, s6 +; GFX1064GISEL-NEXT: v_mov_b32_e32 v1, 0 +; GFX1064GISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX1064GISEL-NEXT: global_store_dword v1, v0, s[0:1] +; GFX1064GISEL-NEXT: s_endpgm +; +; GFX1032DAGISEL-LABEL: divergent_cfg_float: +; GFX1032DAGISEL: ; %bb.0: ; %entry +; GFX1032DAGISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x2c +; GFX1032DAGISEL-NEXT: v_cmp_lt_u32_e32 vcc_lo, 15, v0 +; GFX1032DAGISEL-NEXT: ; implicit-def: $sgpr3 +; GFX1032DAGISEL-NEXT: s_and_saveexec_b32 s2, vcc_lo +; GFX1032DAGISEL-NEXT: s_xor_b32 s2, exec_lo, s2 +; GFX1032DAGISEL-NEXT: s_cbranch_execz .LBB2_2 +; GFX1032DAGISEL-NEXT: ; %bb.1: ; %else +; GFX1032DAGISEL-NEXT: s_mov_b32 s3, exec_lo +; GFX1032DAGISEL-NEXT: s_bcnt1_i32_b32 s3, s3 +; GFX1032DAGISEL-NEXT: v_cvt_f32_i32_e32 v0, s3 +; GFX1032DAGISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX1032DAGISEL-NEXT: v_mul_f32_e64 v0, -s0, v0 +; GFX1032DAGISEL-NEXT: v_readfirstlane_b32 s3, v0 +; GFX1032DAGISEL-NEXT: 
.LBB2_2: ; %Flow +; GFX1032DAGISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX1032DAGISEL-NEXT: s_or_saveexec_b32 s0, s2 +; GFX1032DAGISEL-NEXT: v_mov_b32_e32 v0, s3 +; GFX1032DAGISEL-NEXT: s_xor_b32 exec_lo, exec_lo, s0 +; GFX1032DAGISEL-NEXT: s_cbranch_execz .LBB2_4 +; GFX1032DAGISEL-NEXT: ; %bb.3: ; %if +; GFX1032DAGISEL-NEXT: s_mov_b32 s2, exec_lo +; GFX1032DAGISEL-NEXT: s_bcnt1_i32_b32 s2, s2 +; GFX1032DAGISEL-NEXT: v_cvt_f32_i32_e32 v0, s2 +; GFX1032DAGISEL-NEXT: v_mul_f32_e64 v0, -s1, v0 +; GFX1032DAGISEL-NEXT: v_readfirstlane_b32 s1, v0 +; GFX1032DAGISEL-NEXT: v_mov_b32_e32 v0, s1 +; GFX1032DAGISEL-NEXT: .LBB2_4: ; %endif +; GFX1032DAGISEL-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX1032DAGISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 +; GFX1032DAGISEL-NEXT: v_mov_b32_e32 v1, 0 +; GFX1032DAGISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX1032DAGISEL-NEXT: global_store_dword v1, v0, s[0:1] +; GFX1032DAGISEL-NEXT: s_endpgm +; +; GFX1032GISEL-LABEL: divergent_cfg_float: +; GFX1032GISEL: ; %bb.0: ; %entry +; GFX1032GISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x2c +; GFX1032GISEL-NEXT: v_cmp_le_u32_e32 vcc_lo, 16, v0 +; GFX1032GISEL-NEXT: ; implicit-def: $sgpr2 +; GFX1032GISEL-NEXT: s_and_saveexec_b32 s3, vcc_lo +; GFX1032GISEL-NEXT: s_xor_b32 s3, exec_lo, s3 +; GFX1032GISEL-NEXT: s_cbranch_execz .LBB2_2 +; GFX1032GISEL-NEXT: ; %bb.1: ; %else +; GFX1032GISEL-NEXT: s_mov_b32 s2, exec_lo +; GFX1032GISEL-NEXT: s_bcnt1_i32_b32 s2, s2 +; GFX1032GISEL-NEXT: v_cvt_f32_i32_e32 v0, s2 +; GFX1032GISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX1032GISEL-NEXT: v_mul_f32_e64 v0, -s0, v0 +; GFX1032GISEL-NEXT: v_readfirstlane_b32 s2, v0 +; GFX1032GISEL-NEXT: .LBB2_2: ; %Flow +; GFX1032GISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX1032GISEL-NEXT: s_andn2_saveexec_b32 s0, s3 +; GFX1032GISEL-NEXT: s_cbranch_execz .LBB2_4 +; GFX1032GISEL-NEXT: ; %bb.3: ; %if +; GFX1032GISEL-NEXT: s_mov_b32 s2, exec_lo +; GFX1032GISEL-NEXT: s_bcnt1_i32_b32 s2, s2 +; GFX1032GISEL-NEXT: v_cvt_f32_i32_e32 v0, s2 +; GFX1032GISEL-NEXT: 
v_mul_f32_e64 v0, -s1, v0 +; GFX1032GISEL-NEXT: v_readfirstlane_b32 s2, v0 +; GFX1032GISEL-NEXT: .LBB2_4: ; %endif +; GFX1032GISEL-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX1032GISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 +; GFX1032GISEL-NEXT: v_mov_b32_e32 v0, s2 +; GFX1032GISEL-NEXT: v_mov_b32_e32 v1, 0 +; GFX1032GISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX1032GISEL-NEXT: global_store_dword v1, v0, s[0:1] +; GFX1032GISEL-NEXT: s_endpgm +; +; GFX1164DAGISEL-LABEL: divergent_cfg_float: +; GFX1164DAGISEL: ; %bb.0: ; %entry +; GFX1164DAGISEL-NEXT: s_load_b64 s[0:1], s[4:5], 0x2c +; GFX1164DAGISEL-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GFX1164DAGISEL-NEXT: s_mov_b64 s[2:3], exec +; GFX1164DAGISEL-NEXT: ; implicit-def: $sgpr6 +; GFX1164DAGISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1164DAGISEL-NEXT: v_cmpx_lt_u32_e32 15, v0 +; GFX1164DAGISEL-NEXT: s_xor_b64 s[2:3], exec, s[2:3] +; GFX1164DAGISEL-NEXT: s_cbranch_execz .LBB2_2 +; GFX1164DAGISEL-NEXT: ; %bb.1: ; %else +; GFX1164DAGISEL-NEXT: s_mov_b64 s[6:7], exec +; GFX1164DAGISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX1164DAGISEL-NEXT: s_bcnt1_i32_b64 s6, s[6:7] +; GFX1164DAGISEL-NEXT: v_cvt_f32_i32_e32 v0, s6 +; GFX1164DAGISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX1164DAGISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1164DAGISEL-NEXT: v_mul_f32_e64 v0, -s0, v0 +; GFX1164DAGISEL-NEXT: v_readfirstlane_b32 s6, v0 +; GFX1164DAGISEL-NEXT: .LBB2_2: ; %Flow +; GFX1164DAGISEL-NEXT: s_or_saveexec_b64 s[2:3], s[2:3] +; GFX1164DAGISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1164DAGISEL-NEXT: v_mov_b32_e32 v0, s6 +; GFX1164DAGISEL-NEXT: s_xor_b64 exec, exec, s[2:3] +; GFX1164DAGISEL-NEXT: s_cbranch_execz .LBB2_4 +; GFX1164DAGISEL-NEXT: ; %bb.3: ; %if +; GFX1164DAGISEL-NEXT: s_mov_b64 s[6:7], exec +; GFX1164DAGISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX1164DAGISEL-NEXT: s_bcnt1_i32_b64 s0, s[6:7] +; GFX1164DAGISEL-NEXT: s_delay_alu 
instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1164DAGISEL-NEXT: v_cvt_f32_i32_e32 v0, s0 +; GFX1164DAGISEL-NEXT: v_mul_f32_e64 v0, -s1, v0 +; GFX1164DAGISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1164DAGISEL-NEXT: v_readfirstlane_b32 s0, v0 +; GFX1164DAGISEL-NEXT: v_mov_b32_e32 v0, s0 +; GFX1164DAGISEL-NEXT: .LBB2_4: ; %endif +; GFX1164DAGISEL-NEXT: s_or_b64 exec, exec, s[2:3] +; GFX1164DAGISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX1164DAGISEL-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 +; GFX1164DAGISEL-NEXT: v_mov_b32_e32 v1, 0 +; GFX1164DAGISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX1164DAGISEL-NEXT: global_store_b32 v1, v0, s[0:1] +; GFX1164DAGISEL-NEXT: s_endpgm +; +; GFX1164GISEL-LABEL: divergent_cfg_float: +; GFX1164GISEL: ; %bb.0: ; %entry +; GFX1164GISEL-NEXT: s_load_b64 s[0:1], s[4:5], 0x2c +; GFX1164GISEL-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GFX1164GISEL-NEXT: s_mov_b64 s[2:3], exec +; GFX1164GISEL-NEXT: ; implicit-def: $sgpr6 +; GFX1164GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1164GISEL-NEXT: v_cmpx_le_u32_e32 16, v0 +; GFX1164GISEL-NEXT: s_xor_b64 s[2:3], exec, s[2:3] +; GFX1164GISEL-NEXT: s_cbranch_execz .LBB2_2 +; GFX1164GISEL-NEXT: ; %bb.1: ; %else +; GFX1164GISEL-NEXT: s_mov_b64 s[6:7], exec +; GFX1164GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX1164GISEL-NEXT: s_bcnt1_i32_b64 s6, s[6:7] +; GFX1164GISEL-NEXT: v_cvt_f32_i32_e32 v0, s6 +; GFX1164GISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX1164GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1164GISEL-NEXT: v_mul_f32_e64 v0, -s0, v0 +; GFX1164GISEL-NEXT: v_readfirstlane_b32 s6, v0 +; GFX1164GISEL-NEXT: .LBB2_2: ; %Flow +; GFX1164GISEL-NEXT: s_and_not1_saveexec_b64 s[2:3], s[2:3] +; GFX1164GISEL-NEXT: s_cbranch_execz .LBB2_4 +; GFX1164GISEL-NEXT: ; %bb.3: ; %if +; GFX1164GISEL-NEXT: s_mov_b64 s[6:7], exec +; GFX1164GISEL-NEXT: s_waitcnt lgkmcnt(0) +; 
GFX1164GISEL-NEXT: s_bcnt1_i32_b64 s0, s[6:7] +; GFX1164GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1164GISEL-NEXT: v_cvt_f32_i32_e32 v0, s0 +; GFX1164GISEL-NEXT: v_mul_f32_e64 v0, -s1, v0 +; GFX1164GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1164GISEL-NEXT: v_readfirstlane_b32 s6, v0 +; GFX1164GISEL-NEXT: .LBB2_4: ; %endif +; GFX1164GISEL-NEXT: s_or_b64 exec, exec, s[2:3] +; GFX1164GISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX1164GISEL-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 +; GFX1164GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1164GISEL-NEXT: v_mov_b32_e32 v0, s6 +; GFX1164GISEL-NEXT: v_mov_b32_e32 v1, 0 +; GFX1164GISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX1164GISEL-NEXT: global_store_b32 v1, v0, s[0:1] +; GFX1164GISEL-NEXT: s_endpgm +; +; GFX1132DAGISEL-LABEL: divergent_cfg_float: +; GFX1132DAGISEL: ; %bb.0: ; %entry +; GFX1132DAGISEL-NEXT: s_load_b64 s[0:1], s[4:5], 0x2c +; GFX1132DAGISEL-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GFX1132DAGISEL-NEXT: s_mov_b32 s2, exec_lo +; GFX1132DAGISEL-NEXT: ; implicit-def: $sgpr3 +; GFX1132DAGISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1132DAGISEL-NEXT: v_cmpx_lt_u32_e32 15, v0 +; GFX1132DAGISEL-NEXT: s_xor_b32 s2, exec_lo, s2 +; GFX1132DAGISEL-NEXT: s_cbranch_execz .LBB2_2 +; GFX1132DAGISEL-NEXT: ; %bb.1: ; %else +; GFX1132DAGISEL-NEXT: s_mov_b32 s3, exec_lo +; GFX1132DAGISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX1132DAGISEL-NEXT: s_bcnt1_i32_b32 s3, s3 +; GFX1132DAGISEL-NEXT: v_cvt_f32_i32_e32 v0, s3 +; GFX1132DAGISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX1132DAGISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1132DAGISEL-NEXT: v_mul_f32_e64 v0, -s0, v0 +; GFX1132DAGISEL-NEXT: v_readfirstlane_b32 s3, v0 +; GFX1132DAGISEL-NEXT: .LBB2_2: ; %Flow +; GFX1132DAGISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX1132DAGISEL-NEXT: s_or_saveexec_b32 s0, s2 +; GFX1132DAGISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) +; 
GFX1132DAGISEL-NEXT: v_mov_b32_e32 v0, s3 +; GFX1132DAGISEL-NEXT: s_xor_b32 exec_lo, exec_lo, s0 +; GFX1132DAGISEL-NEXT: s_cbranch_execz .LBB2_4 +; GFX1132DAGISEL-NEXT: ; %bb.3: ; %if +; GFX1132DAGISEL-NEXT: s_mov_b32 s2, exec_lo +; GFX1132DAGISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX1132DAGISEL-NEXT: s_bcnt1_i32_b32 s2, s2 +; GFX1132DAGISEL-NEXT: v_cvt_f32_i32_e32 v0, s2 +; GFX1132DAGISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1132DAGISEL-NEXT: v_mul_f32_e64 v0, -s1, v0 +; GFX1132DAGISEL-NEXT: v_readfirstlane_b32 s1, v0 +; GFX1132DAGISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1132DAGISEL-NEXT: v_mov_b32_e32 v0, s1 +; GFX1132DAGISEL-NEXT: .LBB2_4: ; %endif +; GFX1132DAGISEL-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX1132DAGISEL-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 +; GFX1132DAGISEL-NEXT: v_mov_b32_e32 v1, 0 +; GFX1132DAGISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX1132DAGISEL-NEXT: global_store_b32 v1, v0, s[0:1] +; GFX1132DAGISEL-NEXT: s_endpgm +; +; GFX1132GISEL-LABEL: divergent_cfg_float: +; GFX1132GISEL: ; %bb.0: ; %entry +; GFX1132GISEL-NEXT: s_load_b64 s[0:1], s[4:5], 0x2c +; GFX1132GISEL-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GFX1132GISEL-NEXT: s_mov_b32 s3, exec_lo +; GFX1132GISEL-NEXT: ; implicit-def: $sgpr2 +; GFX1132GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1132GISEL-NEXT: v_cmpx_le_u32_e32 16, v0 +; GFX1132GISEL-NEXT: s_xor_b32 s3, exec_lo, s3 +; GFX1132GISEL-NEXT: s_cbranch_execz .LBB2_2 +; GFX1132GISEL-NEXT: ; %bb.1: ; %else +; GFX1132GISEL-NEXT: s_mov_b32 s2, exec_lo +; GFX1132GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX1132GISEL-NEXT: s_bcnt1_i32_b32 s2, s2 +; GFX1132GISEL-NEXT: v_cvt_f32_i32_e32 v0, s2 +; GFX1132GISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX1132GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1132GISEL-NEXT: v_mul_f32_e64 v0, -s0, v0 +; GFX1132GISEL-NEXT: 
v_readfirstlane_b32 s2, v0 +; GFX1132GISEL-NEXT: .LBB2_2: ; %Flow +; GFX1132GISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX1132GISEL-NEXT: s_and_not1_saveexec_b32 s0, s3 +; GFX1132GISEL-NEXT: s_cbranch_execz .LBB2_4 +; GFX1132GISEL-NEXT: ; %bb.3: ; %if +; GFX1132GISEL-NEXT: s_mov_b32 s2, exec_lo +; GFX1132GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX1132GISEL-NEXT: s_bcnt1_i32_b32 s2, s2 +; GFX1132GISEL-NEXT: v_cvt_f32_i32_e32 v0, s2 +; GFX1132GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1132GISEL-NEXT: v_mul_f32_e64 v0, -s1, v0 +; GFX1132GISEL-NEXT: v_readfirstlane_b32 s2, v0 +; GFX1132GISEL-NEXT: .LBB2_4: ; %endif +; GFX1132GISEL-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX1132GISEL-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 +; GFX1132GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1132GISEL-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, 0 +; GFX1132GISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX1132GISEL-NEXT: global_store_b32 v1, v0, s[0:1] +; GFX1132GISEL-NEXT: s_endpgm +; +; GFX12DAGISEL-LABEL: divergent_cfg_float: +; GFX12DAGISEL: ; %bb.0: ; %entry +; GFX12DAGISEL-NEXT: s_load_b64 s[0:1], s[4:5], 0x2c +; GFX12DAGISEL-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GFX12DAGISEL-NEXT: s_mov_b32 s2, exec_lo +; GFX12DAGISEL-NEXT: ; implicit-def: $sgpr3 +; GFX12DAGISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12DAGISEL-NEXT: v_cmpx_lt_u32_e32 15, v0 +; GFX12DAGISEL-NEXT: s_xor_b32 s2, exec_lo, s2 +; GFX12DAGISEL-NEXT: s_cbranch_execz .LBB2_2 +; GFX12DAGISEL-NEXT: ; %bb.1: ; %else +; GFX12DAGISEL-NEXT: s_mov_b32 s3, exec_lo +; GFX12DAGISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX12DAGISEL-NEXT: s_bcnt1_i32_b32 s3, s3 +; GFX12DAGISEL-NEXT: v_cvt_f32_i32_e32 v0, s3 +; GFX12DAGISEL-NEXT: s_wait_kmcnt 0x0 +; GFX12DAGISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12DAGISEL-NEXT: v_mul_f32_e64 v0, -s0, v0 +; 
GFX12DAGISEL-NEXT: v_readfirstlane_b32 s3, v0 +; GFX12DAGISEL-NEXT: .LBB2_2: ; %Flow +; GFX12DAGISEL-NEXT: s_wait_kmcnt 0x0 +; GFX12DAGISEL-NEXT: s_or_saveexec_b32 s0, s2 +; GFX12DAGISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12DAGISEL-NEXT: v_mov_b32_e32 v0, s3 +; GFX12DAGISEL-NEXT: s_wait_alu depctr_sa_sdst(0) +; GFX12DAGISEL-NEXT: s_xor_b32 exec_lo, exec_lo, s0 +; GFX12DAGISEL-NEXT: s_cbranch_execz .LBB2_4 +; GFX12DAGISEL-NEXT: ; %bb.3: ; %if +; GFX12DAGISEL-NEXT: s_mov_b32 s2, exec_lo +; GFX12DAGISEL-NEXT: s_wait_alu depctr_sa_sdst(0) +; GFX12DAGISEL-NEXT: s_bcnt1_i32_b32 s2, s2 +; GFX12DAGISEL-NEXT: s_wait_alu depctr_sa_sdst(0) +; GFX12DAGISEL-NEXT: v_cvt_f32_i32_e32 v0, s2 +; GFX12DAGISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12DAGISEL-NEXT: v_mul_f32_e64 v0, -s1, v0 +; GFX12DAGISEL-NEXT: v_readfirstlane_b32 s1, v0 +; GFX12DAGISEL-NEXT: s_wait_alu depctr_va_sdst(0) +; GFX12DAGISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12DAGISEL-NEXT: v_mov_b32_e32 v0, s1 +; GFX12DAGISEL-NEXT: .LBB2_4: ; %endif +; GFX12DAGISEL-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX12DAGISEL-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 +; GFX12DAGISEL-NEXT: v_mov_b32_e32 v1, 0 +; GFX12DAGISEL-NEXT: s_wait_kmcnt 0x0 +; GFX12DAGISEL-NEXT: global_store_b32 v1, v0, s[0:1] +; GFX12DAGISEL-NEXT: s_endpgm +entry: + %tid = call i32 @llvm.amdgcn.workitem.id.x() + %d_cmp = icmp ult i32 %tid, 16 + br i1 %d_cmp, label %if, label %else + +if: + %reducedValTid = call float @llvm.amdgcn.wave.reduce.fsub(float %in2, i32 1) + br label %endif + +else: + %reducedValIn = call float @llvm.amdgcn.wave.reduce.fsub(float %in, i32 1) + br label %endif + +endif: + %combine = phi float [%reducedValTid, %if], [%reducedValIn, %else] + store float %combine, ptr addrspace(1) %out + ret void +} +;; NOTE: These prefixes are unused and the list is autogenerated. 
Do not add tests below this line: +; GFX10DAGISEL: {{.*}} +; GFX10GISEL: {{.*}} +; GFX11DAGISEL: {{.*}} +; GFX11GISEL: {{.*}} diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.reduce.max.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.reduce.max.ll index 55b8e8541ecfb..6f299ab8bb9cf 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.reduce.max.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.reduce.max.ll @@ -1534,917 +1534,6 @@ endif: store i64 %combine, ptr addrspace(1) %out ret void } - -define amdgpu_kernel void @uniform_value_float(ptr addrspace(1) %out, float %in) { -; GFX8DAGISEL-LABEL: uniform_value_float: -; GFX8DAGISEL: ; %bb.0: ; %entry -; GFX8DAGISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 -; GFX8DAGISEL-NEXT: s_load_dword s2, s[4:5], 0x2c -; GFX8DAGISEL-NEXT: s_waitcnt lgkmcnt(0) -; GFX8DAGISEL-NEXT: v_mov_b32_e32 v0, s0 -; GFX8DAGISEL-NEXT: v_mov_b32_e32 v1, s1 -; GFX8DAGISEL-NEXT: v_mov_b32_e32 v2, s2 -; GFX8DAGISEL-NEXT: flat_store_dword v[0:1], v2 -; GFX8DAGISEL-NEXT: s_endpgm -; -; GFX8GISEL-LABEL: uniform_value_float: -; GFX8GISEL: ; %bb.0: ; %entry -; GFX8GISEL-NEXT: s_load_dword s2, s[4:5], 0x2c -; GFX8GISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 -; GFX8GISEL-NEXT: s_waitcnt lgkmcnt(0) -; GFX8GISEL-NEXT: v_mov_b32_e32 v2, s2 -; GFX8GISEL-NEXT: v_mov_b32_e32 v0, s0 -; GFX8GISEL-NEXT: v_mov_b32_e32 v1, s1 -; GFX8GISEL-NEXT: flat_store_dword v[0:1], v2 -; GFX8GISEL-NEXT: s_endpgm -; -; GFX9DAGISEL-LABEL: uniform_value_float: -; GFX9DAGISEL: ; %bb.0: ; %entry -; GFX9DAGISEL-NEXT: s_load_dword s2, s[4:5], 0x2c -; GFX9DAGISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 -; GFX9DAGISEL-NEXT: v_mov_b32_e32 v0, 0 -; GFX9DAGISEL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9DAGISEL-NEXT: v_mov_b32_e32 v1, s2 -; GFX9DAGISEL-NEXT: global_store_dword v0, v1, s[0:1] -; GFX9DAGISEL-NEXT: s_endpgm -; -; GFX9GISEL-LABEL: uniform_value_float: -; GFX9GISEL: ; %bb.0: ; %entry -; GFX9GISEL-NEXT: s_load_dword s2, s[4:5], 0x2c -; GFX9GISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 -; 
GFX9GISEL-NEXT: v_mov_b32_e32 v1, 0 -; GFX9GISEL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9GISEL-NEXT: v_mov_b32_e32 v0, s2 -; GFX9GISEL-NEXT: global_store_dword v1, v0, s[0:1] -; GFX9GISEL-NEXT: s_endpgm -; -; GFX10DAGISEL-LABEL: uniform_value_float: -; GFX10DAGISEL: ; %bb.0: ; %entry -; GFX10DAGISEL-NEXT: s_clause 0x1 -; GFX10DAGISEL-NEXT: s_load_dword s2, s[4:5], 0x2c -; GFX10DAGISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 -; GFX10DAGISEL-NEXT: v_mov_b32_e32 v0, 0 -; GFX10DAGISEL-NEXT: s_waitcnt lgkmcnt(0) -; GFX10DAGISEL-NEXT: v_mov_b32_e32 v1, s2 -; GFX10DAGISEL-NEXT: global_store_dword v0, v1, s[0:1] -; GFX10DAGISEL-NEXT: s_endpgm -; -; GFX10GISEL-LABEL: uniform_value_float: -; GFX10GISEL: ; %bb.0: ; %entry -; GFX10GISEL-NEXT: s_clause 0x1 -; GFX10GISEL-NEXT: s_load_dword s2, s[4:5], 0x2c -; GFX10GISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 -; GFX10GISEL-NEXT: v_mov_b32_e32 v1, 0 -; GFX10GISEL-NEXT: s_waitcnt lgkmcnt(0) -; GFX10GISEL-NEXT: v_mov_b32_e32 v0, s2 -; GFX10GISEL-NEXT: global_store_dword v1, v0, s[0:1] -; GFX10GISEL-NEXT: s_endpgm -; -; GFX1164DAGISEL-LABEL: uniform_value_float: -; GFX1164DAGISEL: ; %bb.0: ; %entry -; GFX1164DAGISEL-NEXT: s_clause 0x1 -; GFX1164DAGISEL-NEXT: s_load_b32 s2, s[4:5], 0x2c -; GFX1164DAGISEL-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 -; GFX1164DAGISEL-NEXT: v_mov_b32_e32 v0, 0 -; GFX1164DAGISEL-NEXT: s_waitcnt lgkmcnt(0) -; GFX1164DAGISEL-NEXT: v_mov_b32_e32 v1, s2 -; GFX1164DAGISEL-NEXT: global_store_b32 v0, v1, s[0:1] -; GFX1164DAGISEL-NEXT: s_endpgm -; -; GFX1164GISEL-LABEL: uniform_value_float: -; GFX1164GISEL: ; %bb.0: ; %entry -; GFX1164GISEL-NEXT: s_clause 0x1 -; GFX1164GISEL-NEXT: s_load_b32 s2, s[4:5], 0x2c -; GFX1164GISEL-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 -; GFX1164GISEL-NEXT: v_mov_b32_e32 v1, 0 -; GFX1164GISEL-NEXT: s_waitcnt lgkmcnt(0) -; GFX1164GISEL-NEXT: v_mov_b32_e32 v0, s2 -; GFX1164GISEL-NEXT: global_store_b32 v1, v0, s[0:1] -; GFX1164GISEL-NEXT: s_endpgm -; -; GFX1132DAGISEL-LABEL: uniform_value_float: 
-; GFX1132DAGISEL: ; %bb.0: ; %entry -; GFX1132DAGISEL-NEXT: s_clause 0x1 -; GFX1132DAGISEL-NEXT: s_load_b32 s2, s[4:5], 0x2c -; GFX1132DAGISEL-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 -; GFX1132DAGISEL-NEXT: s_waitcnt lgkmcnt(0) -; GFX1132DAGISEL-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2 -; GFX1132DAGISEL-NEXT: global_store_b32 v0, v1, s[0:1] -; GFX1132DAGISEL-NEXT: s_endpgm -; -; GFX1132GISEL-LABEL: uniform_value_float: -; GFX1132GISEL: ; %bb.0: ; %entry -; GFX1132GISEL-NEXT: s_clause 0x1 -; GFX1132GISEL-NEXT: s_load_b32 s2, s[4:5], 0x2c -; GFX1132GISEL-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 -; GFX1132GISEL-NEXT: s_waitcnt lgkmcnt(0) -; GFX1132GISEL-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v0, s2 -; GFX1132GISEL-NEXT: global_store_b32 v1, v0, s[0:1] -; GFX1132GISEL-NEXT: s_endpgm -entry: - %result = call float @llvm.amdgcn.wave.reduce.fmax(float %in, i32 1) - store float %result, ptr addrspace(1) %out - ret void -} - -define void @divergent_value_float(ptr addrspace(1) %out, float %in) { -; GFX8DAGISEL-LABEL: divergent_value_float: -; GFX8DAGISEL: ; %bb.0: ; %entry -; GFX8DAGISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX8DAGISEL-NEXT: s_mov_b64 s[4:5], exec -; GFX8DAGISEL-NEXT: s_mov_b32 s6, 0x7fc00000 -; GFX8DAGISEL-NEXT: .LBB7_1: ; =>This Inner Loop Header: Depth=1 -; GFX8DAGISEL-NEXT: s_ff1_i32_b64 s7, s[4:5] -; GFX8DAGISEL-NEXT: v_readlane_b32 s8, v2, s7 -; GFX8DAGISEL-NEXT: v_mov_b32_e32 v3, s8 -; GFX8DAGISEL-NEXT: s_bitset0_b64 s[4:5], s7 -; GFX8DAGISEL-NEXT: v_max_f32_e32 v3, s6, v3 -; GFX8DAGISEL-NEXT: s_cmp_lg_u64 s[4:5], 0 -; GFX8DAGISEL-NEXT: v_readfirstlane_b32 s6, v3 -; GFX8DAGISEL-NEXT: s_cbranch_scc1 .LBB7_1 -; GFX8DAGISEL-NEXT: ; %bb.2: -; GFX8DAGISEL-NEXT: v_mov_b32_e32 v2, s6 -; GFX8DAGISEL-NEXT: flat_store_dword v[0:1], v2 -; GFX8DAGISEL-NEXT: s_waitcnt vmcnt(0) -; GFX8DAGISEL-NEXT: s_setpc_b64 s[30:31] -; -; GFX8GISEL-LABEL: divergent_value_float: -; GFX8GISEL: ; %bb.0: ; %entry -; GFX8GISEL-NEXT: s_waitcnt vmcnt(0) 
expcnt(0) lgkmcnt(0) -; GFX8GISEL-NEXT: s_mov_b64 s[4:5], exec -; GFX8GISEL-NEXT: s_mov_b32 s6, 0x7fc00000 -; GFX8GISEL-NEXT: .LBB7_1: ; =>This Inner Loop Header: Depth=1 -; GFX8GISEL-NEXT: s_ff1_i32_b64 s7, s[4:5] -; GFX8GISEL-NEXT: v_readlane_b32 s8, v2, s7 -; GFX8GISEL-NEXT: v_mov_b32_e32 v3, s8 -; GFX8GISEL-NEXT: s_bitset0_b64 s[4:5], s7 -; GFX8GISEL-NEXT: v_max_f32_e32 v3, s6, v3 -; GFX8GISEL-NEXT: s_cmp_lg_u64 s[4:5], 0 -; GFX8GISEL-NEXT: v_readfirstlane_b32 s6, v3 -; GFX8GISEL-NEXT: s_cbranch_scc1 .LBB7_1 -; GFX8GISEL-NEXT: ; %bb.2: -; GFX8GISEL-NEXT: v_mov_b32_e32 v2, s6 -; GFX8GISEL-NEXT: flat_store_dword v[0:1], v2 -; GFX8GISEL-NEXT: s_waitcnt vmcnt(0) -; GFX8GISEL-NEXT: s_setpc_b64 s[30:31] -; -; GFX9DAGISEL-LABEL: divergent_value_float: -; GFX9DAGISEL: ; %bb.0: ; %entry -; GFX9DAGISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9DAGISEL-NEXT: s_mov_b64 s[4:5], exec -; GFX9DAGISEL-NEXT: s_mov_b32 s6, 0x7fc00000 -; GFX9DAGISEL-NEXT: .LBB7_1: ; =>This Inner Loop Header: Depth=1 -; GFX9DAGISEL-NEXT: s_ff1_i32_b64 s7, s[4:5] -; GFX9DAGISEL-NEXT: v_readlane_b32 s8, v2, s7 -; GFX9DAGISEL-NEXT: v_mov_b32_e32 v3, s8 -; GFX9DAGISEL-NEXT: s_bitset0_b64 s[4:5], s7 -; GFX9DAGISEL-NEXT: v_max_f32_e32 v3, s6, v3 -; GFX9DAGISEL-NEXT: s_cmp_lg_u64 s[4:5], 0 -; GFX9DAGISEL-NEXT: v_readfirstlane_b32 s6, v3 -; GFX9DAGISEL-NEXT: s_cbranch_scc1 .LBB7_1 -; GFX9DAGISEL-NEXT: ; %bb.2: -; GFX9DAGISEL-NEXT: v_mov_b32_e32 v2, s6 -; GFX9DAGISEL-NEXT: global_store_dword v[0:1], v2, off -; GFX9DAGISEL-NEXT: s_waitcnt vmcnt(0) -; GFX9DAGISEL-NEXT: s_setpc_b64 s[30:31] -; -; GFX9GISEL-LABEL: divergent_value_float: -; GFX9GISEL: ; %bb.0: ; %entry -; GFX9GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9GISEL-NEXT: s_mov_b64 s[4:5], exec -; GFX9GISEL-NEXT: s_mov_b32 s6, 0x7fc00000 -; GFX9GISEL-NEXT: .LBB7_1: ; =>This Inner Loop Header: Depth=1 -; GFX9GISEL-NEXT: s_ff1_i32_b64 s7, s[4:5] -; GFX9GISEL-NEXT: v_readlane_b32 s8, v2, s7 -; GFX9GISEL-NEXT: v_mov_b32_e32 v3, s8 
-; GFX9GISEL-NEXT: s_bitset0_b64 s[4:5], s7 -; GFX9GISEL-NEXT: v_max_f32_e32 v3, s6, v3 -; GFX9GISEL-NEXT: s_cmp_lg_u64 s[4:5], 0 -; GFX9GISEL-NEXT: v_readfirstlane_b32 s6, v3 -; GFX9GISEL-NEXT: s_cbranch_scc1 .LBB7_1 -; GFX9GISEL-NEXT: ; %bb.2: -; GFX9GISEL-NEXT: v_mov_b32_e32 v2, s6 -; GFX9GISEL-NEXT: global_store_dword v[0:1], v2, off -; GFX9GISEL-NEXT: s_waitcnt vmcnt(0) -; GFX9GISEL-NEXT: s_setpc_b64 s[30:31] -; -; GFX1064DAGISEL-LABEL: divergent_value_float: -; GFX1064DAGISEL: ; %bb.0: ; %entry -; GFX1064DAGISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX1064DAGISEL-NEXT: s_mov_b64 s[4:5], exec -; GFX1064DAGISEL-NEXT: s_mov_b32 s6, 0x7fc00000 -; GFX1064DAGISEL-NEXT: .LBB7_1: ; =>This Inner Loop Header: Depth=1 -; GFX1064DAGISEL-NEXT: s_ff1_i32_b64 s7, s[4:5] -; GFX1064DAGISEL-NEXT: v_readlane_b32 s8, v2, s7 -; GFX1064DAGISEL-NEXT: s_bitset0_b64 s[4:5], s7 -; GFX1064DAGISEL-NEXT: s_cmp_lg_u64 s[4:5], 0 -; GFX1064DAGISEL-NEXT: v_max_f32_e64 v3, s6, s8 -; GFX1064DAGISEL-NEXT: v_readfirstlane_b32 s6, v3 -; GFX1064DAGISEL-NEXT: s_cbranch_scc1 .LBB7_1 -; GFX1064DAGISEL-NEXT: ; %bb.2: -; GFX1064DAGISEL-NEXT: v_mov_b32_e32 v2, s6 -; GFX1064DAGISEL-NEXT: global_store_dword v[0:1], v2, off -; GFX1064DAGISEL-NEXT: s_setpc_b64 s[30:31] -; -; GFX1064GISEL-LABEL: divergent_value_float: -; GFX1064GISEL: ; %bb.0: ; %entry -; GFX1064GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX1064GISEL-NEXT: s_mov_b64 s[4:5], exec -; GFX1064GISEL-NEXT: s_mov_b32 s6, 0x7fc00000 -; GFX1064GISEL-NEXT: .LBB7_1: ; =>This Inner Loop Header: Depth=1 -; GFX1064GISEL-NEXT: s_ff1_i32_b64 s7, s[4:5] -; GFX1064GISEL-NEXT: v_readlane_b32 s8, v2, s7 -; GFX1064GISEL-NEXT: s_bitset0_b64 s[4:5], s7 -; GFX1064GISEL-NEXT: s_cmp_lg_u64 s[4:5], 0 -; GFX1064GISEL-NEXT: v_max_f32_e64 v3, s6, s8 -; GFX1064GISEL-NEXT: v_readfirstlane_b32 s6, v3 -; GFX1064GISEL-NEXT: s_cbranch_scc1 .LBB7_1 -; GFX1064GISEL-NEXT: ; %bb.2: -; GFX1064GISEL-NEXT: v_mov_b32_e32 v2, s6 -; GFX1064GISEL-NEXT: 
global_store_dword v[0:1], v2, off -; GFX1064GISEL-NEXT: s_setpc_b64 s[30:31] -; -; GFX1032DAGISEL-LABEL: divergent_value_float: -; GFX1032DAGISEL: ; %bb.0: ; %entry -; GFX1032DAGISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX1032DAGISEL-NEXT: s_mov_b32 s4, exec_lo -; GFX1032DAGISEL-NEXT: s_mov_b32 s5, 0x7fc00000 -; GFX1032DAGISEL-NEXT: .LBB7_1: ; =>This Inner Loop Header: Depth=1 -; GFX1032DAGISEL-NEXT: s_ff1_i32_b32 s6, s4 -; GFX1032DAGISEL-NEXT: v_readlane_b32 s7, v2, s6 -; GFX1032DAGISEL-NEXT: s_bitset0_b32 s4, s6 -; GFX1032DAGISEL-NEXT: s_cmp_lg_u32 s4, 0 -; GFX1032DAGISEL-NEXT: v_max_f32_e64 v3, s5, s7 -; GFX1032DAGISEL-NEXT: v_readfirstlane_b32 s5, v3 -; GFX1032DAGISEL-NEXT: s_cbranch_scc1 .LBB7_1 -; GFX1032DAGISEL-NEXT: ; %bb.2: -; GFX1032DAGISEL-NEXT: v_mov_b32_e32 v2, s5 -; GFX1032DAGISEL-NEXT: global_store_dword v[0:1], v2, off -; GFX1032DAGISEL-NEXT: s_setpc_b64 s[30:31] -; -; GFX1032GISEL-LABEL: divergent_value_float: -; GFX1032GISEL: ; %bb.0: ; %entry -; GFX1032GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX1032GISEL-NEXT: s_mov_b32 s4, exec_lo -; GFX1032GISEL-NEXT: s_mov_b32 s5, 0x7fc00000 -; GFX1032GISEL-NEXT: .LBB7_1: ; =>This Inner Loop Header: Depth=1 -; GFX1032GISEL-NEXT: s_ff1_i32_b32 s6, s4 -; GFX1032GISEL-NEXT: v_readlane_b32 s7, v2, s6 -; GFX1032GISEL-NEXT: s_bitset0_b32 s4, s6 -; GFX1032GISEL-NEXT: s_cmp_lg_u32 s4, 0 -; GFX1032GISEL-NEXT: v_max_f32_e64 v3, s5, s7 -; GFX1032GISEL-NEXT: v_readfirstlane_b32 s5, v3 -; GFX1032GISEL-NEXT: s_cbranch_scc1 .LBB7_1 -; GFX1032GISEL-NEXT: ; %bb.2: -; GFX1032GISEL-NEXT: v_mov_b32_e32 v2, s5 -; GFX1032GISEL-NEXT: global_store_dword v[0:1], v2, off -; GFX1032GISEL-NEXT: s_setpc_b64 s[30:31] -; -; GFX1164DAGISEL-LABEL: divergent_value_float: -; GFX1164DAGISEL: ; %bb.0: ; %entry -; GFX1164DAGISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX1164DAGISEL-NEXT: s_mov_b64 s[0:1], exec -; GFX1164DAGISEL-NEXT: s_mov_b32 s2, 0x7fc00000 -; GFX1164DAGISEL-NEXT: .LBB7_1: ; =>This Inner 
Loop Header: Depth=1 -; GFX1164DAGISEL-NEXT: s_ctz_i32_b64 s3, s[0:1] -; GFX1164DAGISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) -; GFX1164DAGISEL-NEXT: v_readlane_b32 s4, v2, s3 -; GFX1164DAGISEL-NEXT: s_bitset0_b64 s[0:1], s3 -; GFX1164DAGISEL-NEXT: s_cmp_lg_u64 s[0:1], 0 -; GFX1164DAGISEL-NEXT: v_max_f32_e64 v3, s2, s4 -; GFX1164DAGISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX1164DAGISEL-NEXT: v_readfirstlane_b32 s2, v3 -; GFX1164DAGISEL-NEXT: s_cbranch_scc1 .LBB7_1 -; GFX1164DAGISEL-NEXT: ; %bb.2: -; GFX1164DAGISEL-NEXT: v_mov_b32_e32 v2, s2 -; GFX1164DAGISEL-NEXT: global_store_b32 v[0:1], v2, off -; GFX1164DAGISEL-NEXT: s_setpc_b64 s[30:31] -; -; GFX1164GISEL-LABEL: divergent_value_float: -; GFX1164GISEL: ; %bb.0: ; %entry -; GFX1164GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX1164GISEL-NEXT: s_mov_b64 s[0:1], exec -; GFX1164GISEL-NEXT: s_mov_b32 s2, 0x7fc00000 -; GFX1164GISEL-NEXT: .LBB7_1: ; =>This Inner Loop Header: Depth=1 -; GFX1164GISEL-NEXT: s_ctz_i32_b64 s3, s[0:1] -; GFX1164GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) -; GFX1164GISEL-NEXT: v_readlane_b32 s4, v2, s3 -; GFX1164GISEL-NEXT: s_bitset0_b64 s[0:1], s3 -; GFX1164GISEL-NEXT: s_cmp_lg_u64 s[0:1], 0 -; GFX1164GISEL-NEXT: v_max_f32_e64 v3, s2, s4 -; GFX1164GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX1164GISEL-NEXT: v_readfirstlane_b32 s2, v3 -; GFX1164GISEL-NEXT: s_cbranch_scc1 .LBB7_1 -; GFX1164GISEL-NEXT: ; %bb.2: -; GFX1164GISEL-NEXT: v_mov_b32_e32 v2, s2 -; GFX1164GISEL-NEXT: global_store_b32 v[0:1], v2, off -; GFX1164GISEL-NEXT: s_setpc_b64 s[30:31] -; -; GFX1132DAGISEL-LABEL: divergent_value_float: -; GFX1132DAGISEL: ; %bb.0: ; %entry -; GFX1132DAGISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX1132DAGISEL-NEXT: s_mov_b32 s0, exec_lo -; GFX1132DAGISEL-NEXT: s_mov_b32 s1, 0x7fc00000 -; GFX1132DAGISEL-NEXT: .LBB7_1: ; =>This Inner Loop Header: Depth=1 -; GFX1132DAGISEL-NEXT: 
s_ctz_i32_b32 s2, s0 -; GFX1132DAGISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) -; GFX1132DAGISEL-NEXT: v_readlane_b32 s3, v2, s2 -; GFX1132DAGISEL-NEXT: s_bitset0_b32 s0, s2 -; GFX1132DAGISEL-NEXT: s_cmp_lg_u32 s0, 0 -; GFX1132DAGISEL-NEXT: v_max_f32_e64 v3, s1, s3 -; GFX1132DAGISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX1132DAGISEL-NEXT: v_readfirstlane_b32 s1, v3 -; GFX1132DAGISEL-NEXT: s_cbranch_scc1 .LBB7_1 -; GFX1132DAGISEL-NEXT: ; %bb.2: -; GFX1132DAGISEL-NEXT: v_mov_b32_e32 v2, s1 -; GFX1132DAGISEL-NEXT: global_store_b32 v[0:1], v2, off -; GFX1132DAGISEL-NEXT: s_setpc_b64 s[30:31] -; -; GFX1132GISEL-LABEL: divergent_value_float: -; GFX1132GISEL: ; %bb.0: ; %entry -; GFX1132GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX1132GISEL-NEXT: s_mov_b32 s0, exec_lo -; GFX1132GISEL-NEXT: s_mov_b32 s1, 0x7fc00000 -; GFX1132GISEL-NEXT: .LBB7_1: ; =>This Inner Loop Header: Depth=1 -; GFX1132GISEL-NEXT: s_ctz_i32_b32 s2, s0 -; GFX1132GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) -; GFX1132GISEL-NEXT: v_readlane_b32 s3, v2, s2 -; GFX1132GISEL-NEXT: s_bitset0_b32 s0, s2 -; GFX1132GISEL-NEXT: s_cmp_lg_u32 s0, 0 -; GFX1132GISEL-NEXT: v_max_f32_e64 v3, s1, s3 -; GFX1132GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX1132GISEL-NEXT: v_readfirstlane_b32 s1, v3 -; GFX1132GISEL-NEXT: s_cbranch_scc1 .LBB7_1 -; GFX1132GISEL-NEXT: ; %bb.2: -; GFX1132GISEL-NEXT: v_mov_b32_e32 v2, s1 -; GFX1132GISEL-NEXT: global_store_b32 v[0:1], v2, off -; GFX1132GISEL-NEXT: s_setpc_b64 s[30:31] -entry: - %result = call float @llvm.amdgcn.wave.reduce.fmax(float %in, i32 1) - store float %result, ptr addrspace(1) %out - ret void -} - -define void @divergent_cfg_float(ptr addrspace(1) %out, float %in, float %in2) { -; GFX8DAGISEL-LABEL: divergent_cfg_float: -; GFX8DAGISEL: ; %bb.0: ; %entry -; GFX8DAGISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX8DAGISEL-NEXT: v_and_b32_e32 v4, 0x3ff, v31 
-; GFX8DAGISEL-NEXT: v_cmp_lt_u32_e32 vcc, 15, v4 -; GFX8DAGISEL-NEXT: ; implicit-def: $vgpr4 -; GFX8DAGISEL-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GFX8DAGISEL-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; GFX8DAGISEL-NEXT: s_cbranch_execz .LBB8_4 -; GFX8DAGISEL-NEXT: ; %bb.1: ; %else -; GFX8DAGISEL-NEXT: s_mov_b64 s[6:7], exec -; GFX8DAGISEL-NEXT: s_mov_b32 s8, 0x7fc00000 -; GFX8DAGISEL-NEXT: .LBB8_2: ; =>This Inner Loop Header: Depth=1 -; GFX8DAGISEL-NEXT: s_ff1_i32_b64 s9, s[6:7] -; GFX8DAGISEL-NEXT: v_readlane_b32 s10, v2, s9 -; GFX8DAGISEL-NEXT: v_mov_b32_e32 v3, s10 -; GFX8DAGISEL-NEXT: s_bitset0_b64 s[6:7], s9 -; GFX8DAGISEL-NEXT: v_max_f32_e32 v3, s8, v3 -; GFX8DAGISEL-NEXT: s_cmp_lg_u64 s[6:7], 0 -; GFX8DAGISEL-NEXT: v_readfirstlane_b32 s8, v3 -; GFX8DAGISEL-NEXT: s_cbranch_scc1 .LBB8_2 -; GFX8DAGISEL-NEXT: ; %bb.3: -; GFX8DAGISEL-NEXT: v_mov_b32_e32 v4, s8 -; GFX8DAGISEL-NEXT: ; implicit-def: $vgpr3 -; GFX8DAGISEL-NEXT: .LBB8_4: ; %Flow -; GFX8DAGISEL-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GFX8DAGISEL-NEXT: s_cbranch_execz .LBB8_8 -; GFX8DAGISEL-NEXT: ; %bb.5: ; %if -; GFX8DAGISEL-NEXT: s_mov_b64 s[6:7], exec -; GFX8DAGISEL-NEXT: s_mov_b32 s8, 0x7fc00000 -; GFX8DAGISEL-NEXT: .LBB8_6: ; =>This Inner Loop Header: Depth=1 -; GFX8DAGISEL-NEXT: s_ff1_i32_b64 s9, s[6:7] -; GFX8DAGISEL-NEXT: v_readlane_b32 s10, v3, s9 -; GFX8DAGISEL-NEXT: v_mov_b32_e32 v2, s10 -; GFX8DAGISEL-NEXT: s_bitset0_b64 s[6:7], s9 -; GFX8DAGISEL-NEXT: v_max_f32_e32 v2, s8, v2 -; GFX8DAGISEL-NEXT: s_cmp_lg_u64 s[6:7], 0 -; GFX8DAGISEL-NEXT: v_readfirstlane_b32 s8, v2 -; GFX8DAGISEL-NEXT: s_cbranch_scc1 .LBB8_6 -; GFX8DAGISEL-NEXT: ; %bb.7: -; GFX8DAGISEL-NEXT: v_mov_b32_e32 v4, s8 -; GFX8DAGISEL-NEXT: .LBB8_8: ; %endif -; GFX8DAGISEL-NEXT: s_or_b64 exec, exec, s[4:5] -; GFX8DAGISEL-NEXT: flat_store_dword v[0:1], v4 -; GFX8DAGISEL-NEXT: s_waitcnt vmcnt(0) -; GFX8DAGISEL-NEXT: s_setpc_b64 s[30:31] -; -; GFX8GISEL-LABEL: divergent_cfg_float: -; GFX8GISEL: ; %bb.0: ; %entry -; 
GFX8GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX8GISEL-NEXT: v_and_b32_e32 v4, 0x3ff, v31 -; GFX8GISEL-NEXT: v_cmp_le_u32_e32 vcc, 16, v4 -; GFX8GISEL-NEXT: ; implicit-def: $sgpr8 -; GFX8GISEL-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GFX8GISEL-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; GFX8GISEL-NEXT: s_cbranch_execz .LBB8_3 -; GFX8GISEL-NEXT: ; %bb.1: ; %else -; GFX8GISEL-NEXT: s_mov_b64 s[6:7], exec -; GFX8GISEL-NEXT: s_mov_b32 s8, 0x7fc00000 -; GFX8GISEL-NEXT: ; implicit-def: $vgpr3 -; GFX8GISEL-NEXT: .LBB8_2: ; =>This Inner Loop Header: Depth=1 -; GFX8GISEL-NEXT: s_ff1_i32_b64 s9, s[6:7] -; GFX8GISEL-NEXT: v_readlane_b32 s10, v2, s9 -; GFX8GISEL-NEXT: v_mov_b32_e32 v4, s10 -; GFX8GISEL-NEXT: s_bitset0_b64 s[6:7], s9 -; GFX8GISEL-NEXT: v_max_f32_e32 v4, s8, v4 -; GFX8GISEL-NEXT: s_cmp_lg_u64 s[6:7], 0 -; GFX8GISEL-NEXT: v_readfirstlane_b32 s8, v4 -; GFX8GISEL-NEXT: s_cbranch_scc1 .LBB8_2 -; GFX8GISEL-NEXT: .LBB8_3: ; %Flow -; GFX8GISEL-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GFX8GISEL-NEXT: s_cbranch_execz .LBB8_6 -; GFX8GISEL-NEXT: ; %bb.4: ; %if -; GFX8GISEL-NEXT: s_mov_b64 s[6:7], exec -; GFX8GISEL-NEXT: s_mov_b32 s8, 0x7fc00000 -; GFX8GISEL-NEXT: .LBB8_5: ; =>This Inner Loop Header: Depth=1 -; GFX8GISEL-NEXT: s_ff1_i32_b64 s9, s[6:7] -; GFX8GISEL-NEXT: v_readlane_b32 s10, v3, s9 -; GFX8GISEL-NEXT: v_mov_b32_e32 v2, s10 -; GFX8GISEL-NEXT: s_bitset0_b64 s[6:7], s9 -; GFX8GISEL-NEXT: v_max_f32_e32 v2, s8, v2 -; GFX8GISEL-NEXT: s_cmp_lg_u64 s[6:7], 0 -; GFX8GISEL-NEXT: v_readfirstlane_b32 s8, v2 -; GFX8GISEL-NEXT: s_cbranch_scc1 .LBB8_5 -; GFX8GISEL-NEXT: .LBB8_6: ; %endif -; GFX8GISEL-NEXT: s_or_b64 exec, exec, s[4:5] -; GFX8GISEL-NEXT: v_mov_b32_e32 v2, s8 -; GFX8GISEL-NEXT: flat_store_dword v[0:1], v2 -; GFX8GISEL-NEXT: s_waitcnt vmcnt(0) -; GFX8GISEL-NEXT: s_setpc_b64 s[30:31] -; -; GFX9DAGISEL-LABEL: divergent_cfg_float: -; GFX9DAGISEL: ; %bb.0: ; %entry -; GFX9DAGISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9DAGISEL-NEXT: 
v_and_b32_e32 v4, 0x3ff, v31 -; GFX9DAGISEL-NEXT: v_cmp_lt_u32_e32 vcc, 15, v4 -; GFX9DAGISEL-NEXT: ; implicit-def: $vgpr4 -; GFX9DAGISEL-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GFX9DAGISEL-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; GFX9DAGISEL-NEXT: s_cbranch_execz .LBB8_4 -; GFX9DAGISEL-NEXT: ; %bb.1: ; %else -; GFX9DAGISEL-NEXT: s_mov_b64 s[6:7], exec -; GFX9DAGISEL-NEXT: s_mov_b32 s8, 0x7fc00000 -; GFX9DAGISEL-NEXT: .LBB8_2: ; =>This Inner Loop Header: Depth=1 -; GFX9DAGISEL-NEXT: s_ff1_i32_b64 s9, s[6:7] -; GFX9DAGISEL-NEXT: v_readlane_b32 s10, v2, s9 -; GFX9DAGISEL-NEXT: v_mov_b32_e32 v3, s10 -; GFX9DAGISEL-NEXT: s_bitset0_b64 s[6:7], s9 -; GFX9DAGISEL-NEXT: v_max_f32_e32 v3, s8, v3 -; GFX9DAGISEL-NEXT: s_cmp_lg_u64 s[6:7], 0 -; GFX9DAGISEL-NEXT: v_readfirstlane_b32 s8, v3 -; GFX9DAGISEL-NEXT: s_cbranch_scc1 .LBB8_2 -; GFX9DAGISEL-NEXT: ; %bb.3: -; GFX9DAGISEL-NEXT: v_mov_b32_e32 v4, s8 -; GFX9DAGISEL-NEXT: ; implicit-def: $vgpr3 -; GFX9DAGISEL-NEXT: .LBB8_4: ; %Flow -; GFX9DAGISEL-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GFX9DAGISEL-NEXT: s_cbranch_execz .LBB8_8 -; GFX9DAGISEL-NEXT: ; %bb.5: ; %if -; GFX9DAGISEL-NEXT: s_mov_b64 s[6:7], exec -; GFX9DAGISEL-NEXT: s_mov_b32 s8, 0x7fc00000 -; GFX9DAGISEL-NEXT: .LBB8_6: ; =>This Inner Loop Header: Depth=1 -; GFX9DAGISEL-NEXT: s_ff1_i32_b64 s9, s[6:7] -; GFX9DAGISEL-NEXT: v_readlane_b32 s10, v3, s9 -; GFX9DAGISEL-NEXT: v_mov_b32_e32 v2, s10 -; GFX9DAGISEL-NEXT: s_bitset0_b64 s[6:7], s9 -; GFX9DAGISEL-NEXT: v_max_f32_e32 v2, s8, v2 -; GFX9DAGISEL-NEXT: s_cmp_lg_u64 s[6:7], 0 -; GFX9DAGISEL-NEXT: v_readfirstlane_b32 s8, v2 -; GFX9DAGISEL-NEXT: s_cbranch_scc1 .LBB8_6 -; GFX9DAGISEL-NEXT: ; %bb.7: -; GFX9DAGISEL-NEXT: v_mov_b32_e32 v4, s8 -; GFX9DAGISEL-NEXT: .LBB8_8: ; %endif -; GFX9DAGISEL-NEXT: s_or_b64 exec, exec, s[4:5] -; GFX9DAGISEL-NEXT: global_store_dword v[0:1], v4, off -; GFX9DAGISEL-NEXT: s_waitcnt vmcnt(0) -; GFX9DAGISEL-NEXT: s_setpc_b64 s[30:31] -; -; GFX9GISEL-LABEL: divergent_cfg_float: -; GFX9GISEL: 
; %bb.0: ; %entry -; GFX9GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9GISEL-NEXT: v_and_b32_e32 v4, 0x3ff, v31 -; GFX9GISEL-NEXT: v_cmp_le_u32_e32 vcc, 16, v4 -; GFX9GISEL-NEXT: ; implicit-def: $sgpr8 -; GFX9GISEL-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GFX9GISEL-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; GFX9GISEL-NEXT: s_cbranch_execz .LBB8_3 -; GFX9GISEL-NEXT: ; %bb.1: ; %else -; GFX9GISEL-NEXT: s_mov_b64 s[6:7], exec -; GFX9GISEL-NEXT: s_mov_b32 s8, 0x7fc00000 -; GFX9GISEL-NEXT: ; implicit-def: $vgpr3 -; GFX9GISEL-NEXT: .LBB8_2: ; =>This Inner Loop Header: Depth=1 -; GFX9GISEL-NEXT: s_ff1_i32_b64 s9, s[6:7] -; GFX9GISEL-NEXT: v_readlane_b32 s10, v2, s9 -; GFX9GISEL-NEXT: v_mov_b32_e32 v4, s10 -; GFX9GISEL-NEXT: s_bitset0_b64 s[6:7], s9 -; GFX9GISEL-NEXT: v_max_f32_e32 v4, s8, v4 -; GFX9GISEL-NEXT: s_cmp_lg_u64 s[6:7], 0 -; GFX9GISEL-NEXT: v_readfirstlane_b32 s8, v4 -; GFX9GISEL-NEXT: s_cbranch_scc1 .LBB8_2 -; GFX9GISEL-NEXT: .LBB8_3: ; %Flow -; GFX9GISEL-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GFX9GISEL-NEXT: s_cbranch_execz .LBB8_6 -; GFX9GISEL-NEXT: ; %bb.4: ; %if -; GFX9GISEL-NEXT: s_mov_b64 s[6:7], exec -; GFX9GISEL-NEXT: s_mov_b32 s8, 0x7fc00000 -; GFX9GISEL-NEXT: .LBB8_5: ; =>This Inner Loop Header: Depth=1 -; GFX9GISEL-NEXT: s_ff1_i32_b64 s9, s[6:7] -; GFX9GISEL-NEXT: v_readlane_b32 s10, v3, s9 -; GFX9GISEL-NEXT: v_mov_b32_e32 v2, s10 -; GFX9GISEL-NEXT: s_bitset0_b64 s[6:7], s9 -; GFX9GISEL-NEXT: v_max_f32_e32 v2, s8, v2 -; GFX9GISEL-NEXT: s_cmp_lg_u64 s[6:7], 0 -; GFX9GISEL-NEXT: v_readfirstlane_b32 s8, v2 -; GFX9GISEL-NEXT: s_cbranch_scc1 .LBB8_5 -; GFX9GISEL-NEXT: .LBB8_6: ; %endif -; GFX9GISEL-NEXT: s_or_b64 exec, exec, s[4:5] -; GFX9GISEL-NEXT: v_mov_b32_e32 v2, s8 -; GFX9GISEL-NEXT: global_store_dword v[0:1], v2, off -; GFX9GISEL-NEXT: s_waitcnt vmcnt(0) -; GFX9GISEL-NEXT: s_setpc_b64 s[30:31] -; -; GFX1064DAGISEL-LABEL: divergent_cfg_float: -; GFX1064DAGISEL: ; %bb.0: ; %entry -; GFX1064DAGISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) 
lgkmcnt(0) -; GFX1064DAGISEL-NEXT: v_and_b32_e32 v4, 0x3ff, v31 -; GFX1064DAGISEL-NEXT: v_cmp_lt_u32_e32 vcc, 15, v4 -; GFX1064DAGISEL-NEXT: ; implicit-def: $vgpr4 -; GFX1064DAGISEL-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GFX1064DAGISEL-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; GFX1064DAGISEL-NEXT: s_cbranch_execz .LBB8_4 -; GFX1064DAGISEL-NEXT: ; %bb.1: ; %else -; GFX1064DAGISEL-NEXT: s_mov_b64 s[6:7], exec -; GFX1064DAGISEL-NEXT: s_mov_b32 s8, 0x7fc00000 -; GFX1064DAGISEL-NEXT: .LBB8_2: ; =>This Inner Loop Header: Depth=1 -; GFX1064DAGISEL-NEXT: s_ff1_i32_b64 s9, s[6:7] -; GFX1064DAGISEL-NEXT: v_readlane_b32 s10, v2, s9 -; GFX1064DAGISEL-NEXT: s_bitset0_b64 s[6:7], s9 -; GFX1064DAGISEL-NEXT: s_cmp_lg_u64 s[6:7], 0 -; GFX1064DAGISEL-NEXT: v_max_f32_e64 v3, s8, s10 -; GFX1064DAGISEL-NEXT: v_readfirstlane_b32 s8, v3 -; GFX1064DAGISEL-NEXT: s_cbranch_scc1 .LBB8_2 -; GFX1064DAGISEL-NEXT: ; %bb.3: -; GFX1064DAGISEL-NEXT: v_mov_b32_e32 v4, s8 -; GFX1064DAGISEL-NEXT: ; implicit-def: $vgpr3 -; GFX1064DAGISEL-NEXT: .LBB8_4: ; %Flow -; GFX1064DAGISEL-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GFX1064DAGISEL-NEXT: s_cbranch_execz .LBB8_8 -; GFX1064DAGISEL-NEXT: ; %bb.5: ; %if -; GFX1064DAGISEL-NEXT: s_mov_b64 s[6:7], exec -; GFX1064DAGISEL-NEXT: s_mov_b32 s8, 0x7fc00000 -; GFX1064DAGISEL-NEXT: .LBB8_6: ; =>This Inner Loop Header: Depth=1 -; GFX1064DAGISEL-NEXT: s_ff1_i32_b64 s9, s[6:7] -; GFX1064DAGISEL-NEXT: v_readlane_b32 s10, v3, s9 -; GFX1064DAGISEL-NEXT: s_bitset0_b64 s[6:7], s9 -; GFX1064DAGISEL-NEXT: s_cmp_lg_u64 s[6:7], 0 -; GFX1064DAGISEL-NEXT: v_max_f32_e64 v2, s8, s10 -; GFX1064DAGISEL-NEXT: v_readfirstlane_b32 s8, v2 -; GFX1064DAGISEL-NEXT: s_cbranch_scc1 .LBB8_6 -; GFX1064DAGISEL-NEXT: ; %bb.7: -; GFX1064DAGISEL-NEXT: v_mov_b32_e32 v4, s8 -; GFX1064DAGISEL-NEXT: .LBB8_8: ; %endif -; GFX1064DAGISEL-NEXT: s_or_b64 exec, exec, s[4:5] -; GFX1064DAGISEL-NEXT: global_store_dword v[0:1], v4, off -; GFX1064DAGISEL-NEXT: s_setpc_b64 s[30:31] -; -; GFX1064GISEL-LABEL: 
divergent_cfg_float: -; GFX1064GISEL: ; %bb.0: ; %entry -; GFX1064GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX1064GISEL-NEXT: v_and_b32_e32 v4, 0x3ff, v31 -; GFX1064GISEL-NEXT: ; implicit-def: $sgpr8 -; GFX1064GISEL-NEXT: v_cmp_le_u32_e32 vcc, 16, v4 -; GFX1064GISEL-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GFX1064GISEL-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; GFX1064GISEL-NEXT: s_cbranch_execz .LBB8_3 -; GFX1064GISEL-NEXT: ; %bb.1: ; %else -; GFX1064GISEL-NEXT: s_mov_b64 s[6:7], exec -; GFX1064GISEL-NEXT: s_mov_b32 s8, 0x7fc00000 -; GFX1064GISEL-NEXT: .LBB8_2: ; =>This Inner Loop Header: Depth=1 -; GFX1064GISEL-NEXT: s_ff1_i32_b64 s9, s[6:7] -; GFX1064GISEL-NEXT: v_readlane_b32 s10, v2, s9 -; GFX1064GISEL-NEXT: s_bitset0_b64 s[6:7], s9 -; GFX1064GISEL-NEXT: s_cmp_lg_u64 s[6:7], 0 -; GFX1064GISEL-NEXT: v_max_f32_e64 v3, s8, s10 -; GFX1064GISEL-NEXT: v_readfirstlane_b32 s8, v3 -; GFX1064GISEL-NEXT: ; implicit-def: $vgpr3 -; GFX1064GISEL-NEXT: s_cbranch_scc1 .LBB8_2 -; GFX1064GISEL-NEXT: .LBB8_3: ; %Flow -; GFX1064GISEL-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GFX1064GISEL-NEXT: s_cbranch_execz .LBB8_6 -; GFX1064GISEL-NEXT: ; %bb.4: ; %if -; GFX1064GISEL-NEXT: s_mov_b64 s[6:7], exec -; GFX1064GISEL-NEXT: s_mov_b32 s8, 0x7fc00000 -; GFX1064GISEL-NEXT: .LBB8_5: ; =>This Inner Loop Header: Depth=1 -; GFX1064GISEL-NEXT: s_ff1_i32_b64 s9, s[6:7] -; GFX1064GISEL-NEXT: v_readlane_b32 s10, v3, s9 -; GFX1064GISEL-NEXT: s_bitset0_b64 s[6:7], s9 -; GFX1064GISEL-NEXT: s_cmp_lg_u64 s[6:7], 0 -; GFX1064GISEL-NEXT: v_max_f32_e64 v2, s8, s10 -; GFX1064GISEL-NEXT: v_readfirstlane_b32 s8, v2 -; GFX1064GISEL-NEXT: s_cbranch_scc1 .LBB8_5 -; GFX1064GISEL-NEXT: .LBB8_6: ; %endif -; GFX1064GISEL-NEXT: s_or_b64 exec, exec, s[4:5] -; GFX1064GISEL-NEXT: v_mov_b32_e32 v2, s8 -; GFX1064GISEL-NEXT: global_store_dword v[0:1], v2, off -; GFX1064GISEL-NEXT: s_setpc_b64 s[30:31] -; -; GFX1032DAGISEL-LABEL: divergent_cfg_float: -; GFX1032DAGISEL: ; %bb.0: ; %entry -; 
GFX1032DAGISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX1032DAGISEL-NEXT: v_and_b32_e32 v4, 0x3ff, v31 -; GFX1032DAGISEL-NEXT: v_cmp_lt_u32_e32 vcc_lo, 15, v4 -; GFX1032DAGISEL-NEXT: ; implicit-def: $vgpr4 -; GFX1032DAGISEL-NEXT: s_and_saveexec_b32 s4, vcc_lo -; GFX1032DAGISEL-NEXT: s_xor_b32 s4, exec_lo, s4 -; GFX1032DAGISEL-NEXT: s_cbranch_execz .LBB8_4 -; GFX1032DAGISEL-NEXT: ; %bb.1: ; %else -; GFX1032DAGISEL-NEXT: s_mov_b32 s5, exec_lo -; GFX1032DAGISEL-NEXT: s_mov_b32 s6, 0x7fc00000 -; GFX1032DAGISEL-NEXT: .LBB8_2: ; =>This Inner Loop Header: Depth=1 -; GFX1032DAGISEL-NEXT: s_ff1_i32_b32 s7, s5 -; GFX1032DAGISEL-NEXT: v_readlane_b32 s8, v2, s7 -; GFX1032DAGISEL-NEXT: s_bitset0_b32 s5, s7 -; GFX1032DAGISEL-NEXT: s_cmp_lg_u32 s5, 0 -; GFX1032DAGISEL-NEXT: v_max_f32_e64 v3, s6, s8 -; GFX1032DAGISEL-NEXT: v_readfirstlane_b32 s6, v3 -; GFX1032DAGISEL-NEXT: s_cbranch_scc1 .LBB8_2 -; GFX1032DAGISEL-NEXT: ; %bb.3: -; GFX1032DAGISEL-NEXT: v_mov_b32_e32 v4, s6 -; GFX1032DAGISEL-NEXT: ; implicit-def: $vgpr3 -; GFX1032DAGISEL-NEXT: .LBB8_4: ; %Flow -; GFX1032DAGISEL-NEXT: s_andn2_saveexec_b32 s4, s4 -; GFX1032DAGISEL-NEXT: s_cbranch_execz .LBB8_8 -; GFX1032DAGISEL-NEXT: ; %bb.5: ; %if -; GFX1032DAGISEL-NEXT: s_mov_b32 s5, exec_lo -; GFX1032DAGISEL-NEXT: s_mov_b32 s6, 0x7fc00000 -; GFX1032DAGISEL-NEXT: .LBB8_6: ; =>This Inner Loop Header: Depth=1 -; GFX1032DAGISEL-NEXT: s_ff1_i32_b32 s7, s5 -; GFX1032DAGISEL-NEXT: v_readlane_b32 s8, v3, s7 -; GFX1032DAGISEL-NEXT: s_bitset0_b32 s5, s7 -; GFX1032DAGISEL-NEXT: s_cmp_lg_u32 s5, 0 -; GFX1032DAGISEL-NEXT: v_max_f32_e64 v2, s6, s8 -; GFX1032DAGISEL-NEXT: v_readfirstlane_b32 s6, v2 -; GFX1032DAGISEL-NEXT: s_cbranch_scc1 .LBB8_6 -; GFX1032DAGISEL-NEXT: ; %bb.7: -; GFX1032DAGISEL-NEXT: v_mov_b32_e32 v4, s6 -; GFX1032DAGISEL-NEXT: .LBB8_8: ; %endif -; GFX1032DAGISEL-NEXT: s_or_b32 exec_lo, exec_lo, s4 -; GFX1032DAGISEL-NEXT: global_store_dword v[0:1], v4, off -; GFX1032DAGISEL-NEXT: s_setpc_b64 s[30:31] -; -; 
GFX1032GISEL-LABEL: divergent_cfg_float: -; GFX1032GISEL: ; %bb.0: ; %entry -; GFX1032GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX1032GISEL-NEXT: v_and_b32_e32 v4, 0x3ff, v31 -; GFX1032GISEL-NEXT: ; implicit-def: $sgpr4 -; GFX1032GISEL-NEXT: v_cmp_le_u32_e32 vcc_lo, 16, v4 -; GFX1032GISEL-NEXT: s_and_saveexec_b32 s5, vcc_lo -; GFX1032GISEL-NEXT: s_xor_b32 s5, exec_lo, s5 -; GFX1032GISEL-NEXT: s_cbranch_execz .LBB8_3 -; GFX1032GISEL-NEXT: ; %bb.1: ; %else -; GFX1032GISEL-NEXT: s_mov_b32 s6, exec_lo -; GFX1032GISEL-NEXT: s_mov_b32 s4, 0x7fc00000 -; GFX1032GISEL-NEXT: .LBB8_2: ; =>This Inner Loop Header: Depth=1 -; GFX1032GISEL-NEXT: s_ff1_i32_b32 s7, s6 -; GFX1032GISEL-NEXT: v_readlane_b32 s8, v2, s7 -; GFX1032GISEL-NEXT: s_bitset0_b32 s6, s7 -; GFX1032GISEL-NEXT: s_cmp_lg_u32 s6, 0 -; GFX1032GISEL-NEXT: v_max_f32_e64 v3, s4, s8 -; GFX1032GISEL-NEXT: v_readfirstlane_b32 s4, v3 -; GFX1032GISEL-NEXT: ; implicit-def: $vgpr3 -; GFX1032GISEL-NEXT: s_cbranch_scc1 .LBB8_2 -; GFX1032GISEL-NEXT: .LBB8_3: ; %Flow -; GFX1032GISEL-NEXT: s_andn2_saveexec_b32 s5, s5 -; GFX1032GISEL-NEXT: s_cbranch_execz .LBB8_6 -; GFX1032GISEL-NEXT: ; %bb.4: ; %if -; GFX1032GISEL-NEXT: s_mov_b32 s6, exec_lo -; GFX1032GISEL-NEXT: s_mov_b32 s4, 0x7fc00000 -; GFX1032GISEL-NEXT: .LBB8_5: ; =>This Inner Loop Header: Depth=1 -; GFX1032GISEL-NEXT: s_ff1_i32_b32 s7, s6 -; GFX1032GISEL-NEXT: v_readlane_b32 s8, v3, s7 -; GFX1032GISEL-NEXT: s_bitset0_b32 s6, s7 -; GFX1032GISEL-NEXT: s_cmp_lg_u32 s6, 0 -; GFX1032GISEL-NEXT: v_max_f32_e64 v2, s4, s8 -; GFX1032GISEL-NEXT: v_readfirstlane_b32 s4, v2 -; GFX1032GISEL-NEXT: s_cbranch_scc1 .LBB8_5 -; GFX1032GISEL-NEXT: .LBB8_6: ; %endif -; GFX1032GISEL-NEXT: s_or_b32 exec_lo, exec_lo, s5 -; GFX1032GISEL-NEXT: v_mov_b32_e32 v2, s4 -; GFX1032GISEL-NEXT: global_store_dword v[0:1], v2, off -; GFX1032GISEL-NEXT: s_setpc_b64 s[30:31] -; -; GFX1164DAGISEL-LABEL: divergent_cfg_float: -; GFX1164DAGISEL: ; %bb.0: ; %entry -; GFX1164DAGISEL-NEXT: s_waitcnt 
vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX1164DAGISEL-NEXT: v_and_b32_e32 v4, 0x3ff, v31 -; GFX1164DAGISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) -; GFX1164DAGISEL-NEXT: v_cmp_lt_u32_e32 vcc, 15, v4 -; GFX1164DAGISEL-NEXT: ; implicit-def: $vgpr4 -; GFX1164DAGISEL-NEXT: s_and_saveexec_b64 s[0:1], vcc -; GFX1164DAGISEL-NEXT: s_xor_b64 s[0:1], exec, s[0:1] -; GFX1164DAGISEL-NEXT: s_cbranch_execz .LBB8_4 -; GFX1164DAGISEL-NEXT: ; %bb.1: ; %else -; GFX1164DAGISEL-NEXT: s_mov_b64 s[2:3], exec -; GFX1164DAGISEL-NEXT: s_mov_b32 s4, 0x7fc00000 -; GFX1164DAGISEL-NEXT: .LBB8_2: ; =>This Inner Loop Header: Depth=1 -; GFX1164DAGISEL-NEXT: s_ctz_i32_b64 s5, s[2:3] -; GFX1164DAGISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) -; GFX1164DAGISEL-NEXT: v_readlane_b32 s6, v2, s5 -; GFX1164DAGISEL-NEXT: s_bitset0_b64 s[2:3], s5 -; GFX1164DAGISEL-NEXT: s_cmp_lg_u64 s[2:3], 0 -; GFX1164DAGISEL-NEXT: v_max_f32_e64 v3, s4, s6 -; GFX1164DAGISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX1164DAGISEL-NEXT: v_readfirstlane_b32 s4, v3 -; GFX1164DAGISEL-NEXT: s_cbranch_scc1 .LBB8_2 -; GFX1164DAGISEL-NEXT: ; %bb.3: -; GFX1164DAGISEL-NEXT: v_mov_b32_e32 v4, s4 -; GFX1164DAGISEL-NEXT: ; implicit-def: $vgpr3 -; GFX1164DAGISEL-NEXT: .LBB8_4: ; %Flow -; GFX1164DAGISEL-NEXT: s_and_not1_saveexec_b64 s[0:1], s[0:1] -; GFX1164DAGISEL-NEXT: s_cbranch_execz .LBB8_8 -; GFX1164DAGISEL-NEXT: ; %bb.5: ; %if -; GFX1164DAGISEL-NEXT: s_mov_b64 s[2:3], exec -; GFX1164DAGISEL-NEXT: s_mov_b32 s4, 0x7fc00000 -; GFX1164DAGISEL-NEXT: .LBB8_6: ; =>This Inner Loop Header: Depth=1 -; GFX1164DAGISEL-NEXT: s_ctz_i32_b64 s5, s[2:3] -; GFX1164DAGISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) -; GFX1164DAGISEL-NEXT: v_readlane_b32 s6, v3, s5 -; GFX1164DAGISEL-NEXT: s_bitset0_b64 s[2:3], s5 -; GFX1164DAGISEL-NEXT: s_cmp_lg_u64 s[2:3], 0 -; GFX1164DAGISEL-NEXT: v_max_f32_e64 v2, s4, s6 -; GFX1164DAGISEL-NEXT: 
s_delay_alu instid0(VALU_DEP_1) -; GFX1164DAGISEL-NEXT: v_readfirstlane_b32 s4, v2 -; GFX1164DAGISEL-NEXT: s_cbranch_scc1 .LBB8_6 -; GFX1164DAGISEL-NEXT: ; %bb.7: -; GFX1164DAGISEL-NEXT: v_mov_b32_e32 v4, s4 -; GFX1164DAGISEL-NEXT: .LBB8_8: ; %endif -; GFX1164DAGISEL-NEXT: s_or_b64 exec, exec, s[0:1] -; GFX1164DAGISEL-NEXT: global_store_b32 v[0:1], v4, off -; GFX1164DAGISEL-NEXT: s_setpc_b64 s[30:31] -; -; GFX1164GISEL-LABEL: divergent_cfg_float: -; GFX1164GISEL: ; %bb.0: ; %entry -; GFX1164GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX1164GISEL-NEXT: v_and_b32_e32 v4, 0x3ff, v31 -; GFX1164GISEL-NEXT: s_mov_b64 s[0:1], exec -; GFX1164GISEL-NEXT: ; implicit-def: $sgpr4 -; GFX1164GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX1164GISEL-NEXT: v_cmpx_le_u32_e32 16, v4 -; GFX1164GISEL-NEXT: s_xor_b64 s[0:1], exec, s[0:1] -; GFX1164GISEL-NEXT: s_cbranch_execz .LBB8_3 -; GFX1164GISEL-NEXT: ; %bb.1: ; %else -; GFX1164GISEL-NEXT: s_mov_b64 s[2:3], exec -; GFX1164GISEL-NEXT: s_mov_b32 s4, 0x7fc00000 -; GFX1164GISEL-NEXT: .LBB8_2: ; =>This Inner Loop Header: Depth=1 -; GFX1164GISEL-NEXT: s_ctz_i32_b64 s5, s[2:3] -; GFX1164GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) -; GFX1164GISEL-NEXT: v_readlane_b32 s6, v2, s5 -; GFX1164GISEL-NEXT: s_bitset0_b64 s[2:3], s5 -; GFX1164GISEL-NEXT: s_cmp_lg_u64 s[2:3], 0 -; GFX1164GISEL-NEXT: v_max_f32_e64 v3, s4, s6 -; GFX1164GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX1164GISEL-NEXT: v_readfirstlane_b32 s4, v3 -; GFX1164GISEL-NEXT: ; implicit-def: $vgpr3 -; GFX1164GISEL-NEXT: s_cbranch_scc1 .LBB8_2 -; GFX1164GISEL-NEXT: .LBB8_3: ; %Flow -; GFX1164GISEL-NEXT: s_and_not1_saveexec_b64 s[0:1], s[0:1] -; GFX1164GISEL-NEXT: s_cbranch_execz .LBB8_6 -; GFX1164GISEL-NEXT: ; %bb.4: ; %if -; GFX1164GISEL-NEXT: s_mov_b64 s[2:3], exec -; GFX1164GISEL-NEXT: s_mov_b32 s4, 0x7fc00000 -; GFX1164GISEL-NEXT: .LBB8_5: ; =>This Inner Loop Header: Depth=1 -; GFX1164GISEL-NEXT: s_ctz_i32_b64 s5, 
s[2:3] -; GFX1164GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) -; GFX1164GISEL-NEXT: v_readlane_b32 s6, v3, s5 -; GFX1164GISEL-NEXT: s_bitset0_b64 s[2:3], s5 -; GFX1164GISEL-NEXT: s_cmp_lg_u64 s[2:3], 0 -; GFX1164GISEL-NEXT: v_max_f32_e64 v2, s4, s6 -; GFX1164GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX1164GISEL-NEXT: v_readfirstlane_b32 s4, v2 -; GFX1164GISEL-NEXT: s_cbranch_scc1 .LBB8_5 -; GFX1164GISEL-NEXT: .LBB8_6: ; %endif -; GFX1164GISEL-NEXT: s_or_b64 exec, exec, s[0:1] -; GFX1164GISEL-NEXT: v_mov_b32_e32 v2, s4 -; GFX1164GISEL-NEXT: global_store_b32 v[0:1], v2, off -; GFX1164GISEL-NEXT: s_setpc_b64 s[30:31] -; -; GFX1132DAGISEL-LABEL: divergent_cfg_float: -; GFX1132DAGISEL: ; %bb.0: ; %entry -; GFX1132DAGISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX1132DAGISEL-NEXT: v_and_b32_e32 v4, 0x3ff, v31 -; GFX1132DAGISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) -; GFX1132DAGISEL-NEXT: v_cmp_lt_u32_e32 vcc_lo, 15, v4 -; GFX1132DAGISEL-NEXT: ; implicit-def: $vgpr4 -; GFX1132DAGISEL-NEXT: s_and_saveexec_b32 s0, vcc_lo -; GFX1132DAGISEL-NEXT: s_xor_b32 s0, exec_lo, s0 -; GFX1132DAGISEL-NEXT: s_cbranch_execz .LBB8_4 -; GFX1132DAGISEL-NEXT: ; %bb.1: ; %else -; GFX1132DAGISEL-NEXT: s_mov_b32 s1, exec_lo -; GFX1132DAGISEL-NEXT: s_mov_b32 s2, 0x7fc00000 -; GFX1132DAGISEL-NEXT: .LBB8_2: ; =>This Inner Loop Header: Depth=1 -; GFX1132DAGISEL-NEXT: s_ctz_i32_b32 s3, s1 -; GFX1132DAGISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) -; GFX1132DAGISEL-NEXT: v_readlane_b32 s4, v2, s3 -; GFX1132DAGISEL-NEXT: s_bitset0_b32 s1, s3 -; GFX1132DAGISEL-NEXT: s_cmp_lg_u32 s1, 0 -; GFX1132DAGISEL-NEXT: v_max_f32_e64 v3, s2, s4 -; GFX1132DAGISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX1132DAGISEL-NEXT: v_readfirstlane_b32 s2, v3 -; GFX1132DAGISEL-NEXT: s_cbranch_scc1 .LBB8_2 -; GFX1132DAGISEL-NEXT: ; %bb.3: -; GFX1132DAGISEL-NEXT: v_mov_b32_e32 v4, s2 
-; GFX1132DAGISEL-NEXT: ; implicit-def: $vgpr3 -; GFX1132DAGISEL-NEXT: .LBB8_4: ; %Flow -; GFX1132DAGISEL-NEXT: s_and_not1_saveexec_b32 s0, s0 -; GFX1132DAGISEL-NEXT: s_cbranch_execz .LBB8_8 -; GFX1132DAGISEL-NEXT: ; %bb.5: ; %if -; GFX1132DAGISEL-NEXT: s_mov_b32 s1, exec_lo -; GFX1132DAGISEL-NEXT: s_mov_b32 s2, 0x7fc00000 -; GFX1132DAGISEL-NEXT: .LBB8_6: ; =>This Inner Loop Header: Depth=1 -; GFX1132DAGISEL-NEXT: s_ctz_i32_b32 s3, s1 -; GFX1132DAGISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) -; GFX1132DAGISEL-NEXT: v_readlane_b32 s4, v3, s3 -; GFX1132DAGISEL-NEXT: s_bitset0_b32 s1, s3 -; GFX1132DAGISEL-NEXT: s_cmp_lg_u32 s1, 0 -; GFX1132DAGISEL-NEXT: v_max_f32_e64 v2, s2, s4 -; GFX1132DAGISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX1132DAGISEL-NEXT: v_readfirstlane_b32 s2, v2 -; GFX1132DAGISEL-NEXT: s_cbranch_scc1 .LBB8_6 -; GFX1132DAGISEL-NEXT: ; %bb.7: -; GFX1132DAGISEL-NEXT: v_mov_b32_e32 v4, s2 -; GFX1132DAGISEL-NEXT: .LBB8_8: ; %endif -; GFX1132DAGISEL-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; GFX1132DAGISEL-NEXT: global_store_b32 v[0:1], v4, off -; GFX1132DAGISEL-NEXT: s_setpc_b64 s[30:31] -; -; GFX1132GISEL-LABEL: divergent_cfg_float: -; GFX1132GISEL: ; %bb.0: ; %entry -; GFX1132GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX1132GISEL-NEXT: v_and_b32_e32 v4, 0x3ff, v31 -; GFX1132GISEL-NEXT: s_mov_b32 s1, exec_lo -; GFX1132GISEL-NEXT: ; implicit-def: $sgpr0 -; GFX1132GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX1132GISEL-NEXT: v_cmpx_le_u32_e32 16, v4 -; GFX1132GISEL-NEXT: s_xor_b32 s1, exec_lo, s1 -; GFX1132GISEL-NEXT: s_cbranch_execz .LBB8_3 -; GFX1132GISEL-NEXT: ; %bb.1: ; %else -; GFX1132GISEL-NEXT: s_mov_b32 s2, exec_lo -; GFX1132GISEL-NEXT: s_mov_b32 s0, 0x7fc00000 -; GFX1132GISEL-NEXT: .LBB8_2: ; =>This Inner Loop Header: Depth=1 -; GFX1132GISEL-NEXT: s_ctz_i32_b32 s3, s2 -; GFX1132GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) -; GFX1132GISEL-NEXT: 
v_readlane_b32 s4, v2, s3 -; GFX1132GISEL-NEXT: s_bitset0_b32 s2, s3 -; GFX1132GISEL-NEXT: s_cmp_lg_u32 s2, 0 -; GFX1132GISEL-NEXT: v_max_f32_e64 v3, s0, s4 -; GFX1132GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX1132GISEL-NEXT: v_readfirstlane_b32 s0, v3 -; GFX1132GISEL-NEXT: ; implicit-def: $vgpr3 -; GFX1132GISEL-NEXT: s_cbranch_scc1 .LBB8_2 -; GFX1132GISEL-NEXT: .LBB8_3: ; %Flow -; GFX1132GISEL-NEXT: s_and_not1_saveexec_b32 s1, s1 -; GFX1132GISEL-NEXT: s_cbranch_execz .LBB8_6 -; GFX1132GISEL-NEXT: ; %bb.4: ; %if -; GFX1132GISEL-NEXT: s_mov_b32 s2, exec_lo -; GFX1132GISEL-NEXT: s_mov_b32 s0, 0x7fc00000 -; GFX1132GISEL-NEXT: .LBB8_5: ; =>This Inner Loop Header: Depth=1 -; GFX1132GISEL-NEXT: s_ctz_i32_b32 s3, s2 -; GFX1132GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) -; GFX1132GISEL-NEXT: v_readlane_b32 s4, v3, s3 -; GFX1132GISEL-NEXT: s_bitset0_b32 s2, s3 -; GFX1132GISEL-NEXT: s_cmp_lg_u32 s2, 0 -; GFX1132GISEL-NEXT: v_max_f32_e64 v2, s0, s4 -; GFX1132GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX1132GISEL-NEXT: v_readfirstlane_b32 s0, v2 -; GFX1132GISEL-NEXT: s_cbranch_scc1 .LBB8_5 -; GFX1132GISEL-NEXT: .LBB8_6: ; %endif -; GFX1132GISEL-NEXT: s_or_b32 exec_lo, exec_lo, s1 -; GFX1132GISEL-NEXT: v_mov_b32_e32 v2, s0 -; GFX1132GISEL-NEXT: global_store_b32 v[0:1], v2, off -; GFX1132GISEL-NEXT: s_setpc_b64 s[30:31] -entry: - %tid = call i32 @llvm.amdgcn.workitem.id.x() - %d_cmp = icmp ult i32 %tid, 16 - br i1 %d_cmp, label %if, label %else - -if: - %reducedValTid = call float @llvm.amdgcn.wave.reduce.fmax(float %in2, i32 1) - br label %endif - -else: - %reducedValIn = call float @llvm.amdgcn.wave.reduce.fmax(float %in, i32 1) - br label %endif - -endif: - %combine = phi float [%reducedValTid, %if], [%reducedValIn, %else] - store float %combine, ptr addrspace(1) %out - ret void -} ;; NOTE: These prefixes are unused and the list is autogenerated. 
Do not add tests below this line: ; GFX11DAGISEL: {{.*}} ; GFX11GISEL: {{.*}} diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.reduce.min.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.reduce.min.ll index e2ee20dd96c62..3c4cbc74aedc1 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.reduce.min.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.reduce.min.ll @@ -1534,917 +1534,6 @@ endif: store i64 %combine, ptr addrspace(1) %out ret void } - -define amdgpu_kernel void @uniform_value_float(ptr addrspace(1) %out, float %in) { -; GFX8DAGISEL-LABEL: uniform_value_float: -; GFX8DAGISEL: ; %bb.0: ; %entry -; GFX8DAGISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 -; GFX8DAGISEL-NEXT: s_load_dword s2, s[4:5], 0x2c -; GFX8DAGISEL-NEXT: s_waitcnt lgkmcnt(0) -; GFX8DAGISEL-NEXT: v_mov_b32_e32 v0, s0 -; GFX8DAGISEL-NEXT: v_mov_b32_e32 v1, s1 -; GFX8DAGISEL-NEXT: v_mov_b32_e32 v2, s2 -; GFX8DAGISEL-NEXT: flat_store_dword v[0:1], v2 -; GFX8DAGISEL-NEXT: s_endpgm -; -; GFX8GISEL-LABEL: uniform_value_float: -; GFX8GISEL: ; %bb.0: ; %entry -; GFX8GISEL-NEXT: s_load_dword s2, s[4:5], 0x2c -; GFX8GISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 -; GFX8GISEL-NEXT: s_waitcnt lgkmcnt(0) -; GFX8GISEL-NEXT: v_mov_b32_e32 v2, s2 -; GFX8GISEL-NEXT: v_mov_b32_e32 v0, s0 -; GFX8GISEL-NEXT: v_mov_b32_e32 v1, s1 -; GFX8GISEL-NEXT: flat_store_dword v[0:1], v2 -; GFX8GISEL-NEXT: s_endpgm -; -; GFX9DAGISEL-LABEL: uniform_value_float: -; GFX9DAGISEL: ; %bb.0: ; %entry -; GFX9DAGISEL-NEXT: s_load_dword s2, s[4:5], 0x2c -; GFX9DAGISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 -; GFX9DAGISEL-NEXT: v_mov_b32_e32 v0, 0 -; GFX9DAGISEL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9DAGISEL-NEXT: v_mov_b32_e32 v1, s2 -; GFX9DAGISEL-NEXT: global_store_dword v0, v1, s[0:1] -; GFX9DAGISEL-NEXT: s_endpgm -; -; GFX9GISEL-LABEL: uniform_value_float: -; GFX9GISEL: ; %bb.0: ; %entry -; GFX9GISEL-NEXT: s_load_dword s2, s[4:5], 0x2c -; GFX9GISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 -; GFX9GISEL-NEXT: v_mov_b32_e32 v1, 0 -; 
GFX9GISEL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9GISEL-NEXT: v_mov_b32_e32 v0, s2 -; GFX9GISEL-NEXT: global_store_dword v1, v0, s[0:1] -; GFX9GISEL-NEXT: s_endpgm -; -; GFX10DAGISEL-LABEL: uniform_value_float: -; GFX10DAGISEL: ; %bb.0: ; %entry -; GFX10DAGISEL-NEXT: s_clause 0x1 -; GFX10DAGISEL-NEXT: s_load_dword s2, s[4:5], 0x2c -; GFX10DAGISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 -; GFX10DAGISEL-NEXT: v_mov_b32_e32 v0, 0 -; GFX10DAGISEL-NEXT: s_waitcnt lgkmcnt(0) -; GFX10DAGISEL-NEXT: v_mov_b32_e32 v1, s2 -; GFX10DAGISEL-NEXT: global_store_dword v0, v1, s[0:1] -; GFX10DAGISEL-NEXT: s_endpgm -; -; GFX10GISEL-LABEL: uniform_value_float: -; GFX10GISEL: ; %bb.0: ; %entry -; GFX10GISEL-NEXT: s_clause 0x1 -; GFX10GISEL-NEXT: s_load_dword s2, s[4:5], 0x2c -; GFX10GISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 -; GFX10GISEL-NEXT: v_mov_b32_e32 v1, 0 -; GFX10GISEL-NEXT: s_waitcnt lgkmcnt(0) -; GFX10GISEL-NEXT: v_mov_b32_e32 v0, s2 -; GFX10GISEL-NEXT: global_store_dword v1, v0, s[0:1] -; GFX10GISEL-NEXT: s_endpgm -; -; GFX1164DAGISEL-LABEL: uniform_value_float: -; GFX1164DAGISEL: ; %bb.0: ; %entry -; GFX1164DAGISEL-NEXT: s_clause 0x1 -; GFX1164DAGISEL-NEXT: s_load_b32 s2, s[4:5], 0x2c -; GFX1164DAGISEL-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 -; GFX1164DAGISEL-NEXT: v_mov_b32_e32 v0, 0 -; GFX1164DAGISEL-NEXT: s_waitcnt lgkmcnt(0) -; GFX1164DAGISEL-NEXT: v_mov_b32_e32 v1, s2 -; GFX1164DAGISEL-NEXT: global_store_b32 v0, v1, s[0:1] -; GFX1164DAGISEL-NEXT: s_endpgm -; -; GFX1164GISEL-LABEL: uniform_value_float: -; GFX1164GISEL: ; %bb.0: ; %entry -; GFX1164GISEL-NEXT: s_clause 0x1 -; GFX1164GISEL-NEXT: s_load_b32 s2, s[4:5], 0x2c -; GFX1164GISEL-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 -; GFX1164GISEL-NEXT: v_mov_b32_e32 v1, 0 -; GFX1164GISEL-NEXT: s_waitcnt lgkmcnt(0) -; GFX1164GISEL-NEXT: v_mov_b32_e32 v0, s2 -; GFX1164GISEL-NEXT: global_store_b32 v1, v0, s[0:1] -; GFX1164GISEL-NEXT: s_endpgm -; -; GFX1132DAGISEL-LABEL: uniform_value_float: -; GFX1132DAGISEL: ; %bb.0: ; %entry -; 
GFX1132DAGISEL-NEXT: s_clause 0x1 -; GFX1132DAGISEL-NEXT: s_load_b32 s2, s[4:5], 0x2c -; GFX1132DAGISEL-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 -; GFX1132DAGISEL-NEXT: s_waitcnt lgkmcnt(0) -; GFX1132DAGISEL-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2 -; GFX1132DAGISEL-NEXT: global_store_b32 v0, v1, s[0:1] -; GFX1132DAGISEL-NEXT: s_endpgm -; -; GFX1132GISEL-LABEL: uniform_value_float: -; GFX1132GISEL: ; %bb.0: ; %entry -; GFX1132GISEL-NEXT: s_clause 0x1 -; GFX1132GISEL-NEXT: s_load_b32 s2, s[4:5], 0x2c -; GFX1132GISEL-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 -; GFX1132GISEL-NEXT: s_waitcnt lgkmcnt(0) -; GFX1132GISEL-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v0, s2 -; GFX1132GISEL-NEXT: global_store_b32 v1, v0, s[0:1] -; GFX1132GISEL-NEXT: s_endpgm -entry: - %result = call float @llvm.amdgcn.wave.reduce.fmin(float %in, i32 1) - store float %result, ptr addrspace(1) %out - ret void -} - -define void @divergent_value_float(ptr addrspace(1) %out, float %in) { -; GFX8DAGISEL-LABEL: divergent_value_float: -; GFX8DAGISEL: ; %bb.0: ; %entry -; GFX8DAGISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX8DAGISEL-NEXT: s_mov_b64 s[4:5], exec -; GFX8DAGISEL-NEXT: s_mov_b32 s6, 0x7fc00000 -; GFX8DAGISEL-NEXT: .LBB7_1: ; =>This Inner Loop Header: Depth=1 -; GFX8DAGISEL-NEXT: s_ff1_i32_b64 s7, s[4:5] -; GFX8DAGISEL-NEXT: v_readlane_b32 s8, v2, s7 -; GFX8DAGISEL-NEXT: v_mov_b32_e32 v3, s8 -; GFX8DAGISEL-NEXT: s_bitset0_b64 s[4:5], s7 -; GFX8DAGISEL-NEXT: v_min_f32_e32 v3, s6, v3 -; GFX8DAGISEL-NEXT: s_cmp_lg_u64 s[4:5], 0 -; GFX8DAGISEL-NEXT: v_readfirstlane_b32 s6, v3 -; GFX8DAGISEL-NEXT: s_cbranch_scc1 .LBB7_1 -; GFX8DAGISEL-NEXT: ; %bb.2: -; GFX8DAGISEL-NEXT: v_mov_b32_e32 v2, s6 -; GFX8DAGISEL-NEXT: flat_store_dword v[0:1], v2 -; GFX8DAGISEL-NEXT: s_waitcnt vmcnt(0) -; GFX8DAGISEL-NEXT: s_setpc_b64 s[30:31] -; -; GFX8GISEL-LABEL: divergent_value_float: -; GFX8GISEL: ; %bb.0: ; %entry -; GFX8GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX8GISEL-NEXT: 
s_mov_b64 s[4:5], exec -; GFX8GISEL-NEXT: s_mov_b32 s6, 0x7fc00000 -; GFX8GISEL-NEXT: .LBB7_1: ; =>This Inner Loop Header: Depth=1 -; GFX8GISEL-NEXT: s_ff1_i32_b64 s7, s[4:5] -; GFX8GISEL-NEXT: v_readlane_b32 s8, v2, s7 -; GFX8GISEL-NEXT: v_mov_b32_e32 v3, s8 -; GFX8GISEL-NEXT: s_bitset0_b64 s[4:5], s7 -; GFX8GISEL-NEXT: v_min_f32_e32 v3, s6, v3 -; GFX8GISEL-NEXT: s_cmp_lg_u64 s[4:5], 0 -; GFX8GISEL-NEXT: v_readfirstlane_b32 s6, v3 -; GFX8GISEL-NEXT: s_cbranch_scc1 .LBB7_1 -; GFX8GISEL-NEXT: ; %bb.2: -; GFX8GISEL-NEXT: v_mov_b32_e32 v2, s6 -; GFX8GISEL-NEXT: flat_store_dword v[0:1], v2 -; GFX8GISEL-NEXT: s_waitcnt vmcnt(0) -; GFX8GISEL-NEXT: s_setpc_b64 s[30:31] -; -; GFX9DAGISEL-LABEL: divergent_value_float: -; GFX9DAGISEL: ; %bb.0: ; %entry -; GFX9DAGISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9DAGISEL-NEXT: s_mov_b64 s[4:5], exec -; GFX9DAGISEL-NEXT: s_mov_b32 s6, 0x7fc00000 -; GFX9DAGISEL-NEXT: .LBB7_1: ; =>This Inner Loop Header: Depth=1 -; GFX9DAGISEL-NEXT: s_ff1_i32_b64 s7, s[4:5] -; GFX9DAGISEL-NEXT: v_readlane_b32 s8, v2, s7 -; GFX9DAGISEL-NEXT: v_mov_b32_e32 v3, s8 -; GFX9DAGISEL-NEXT: s_bitset0_b64 s[4:5], s7 -; GFX9DAGISEL-NEXT: v_min_f32_e32 v3, s6, v3 -; GFX9DAGISEL-NEXT: s_cmp_lg_u64 s[4:5], 0 -; GFX9DAGISEL-NEXT: v_readfirstlane_b32 s6, v3 -; GFX9DAGISEL-NEXT: s_cbranch_scc1 .LBB7_1 -; GFX9DAGISEL-NEXT: ; %bb.2: -; GFX9DAGISEL-NEXT: v_mov_b32_e32 v2, s6 -; GFX9DAGISEL-NEXT: global_store_dword v[0:1], v2, off -; GFX9DAGISEL-NEXT: s_waitcnt vmcnt(0) -; GFX9DAGISEL-NEXT: s_setpc_b64 s[30:31] -; -; GFX9GISEL-LABEL: divergent_value_float: -; GFX9GISEL: ; %bb.0: ; %entry -; GFX9GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9GISEL-NEXT: s_mov_b64 s[4:5], exec -; GFX9GISEL-NEXT: s_mov_b32 s6, 0x7fc00000 -; GFX9GISEL-NEXT: .LBB7_1: ; =>This Inner Loop Header: Depth=1 -; GFX9GISEL-NEXT: s_ff1_i32_b64 s7, s[4:5] -; GFX9GISEL-NEXT: v_readlane_b32 s8, v2, s7 -; GFX9GISEL-NEXT: v_mov_b32_e32 v3, s8 -; GFX9GISEL-NEXT: s_bitset0_b64 s[4:5], 
s7 -; GFX9GISEL-NEXT: v_min_f32_e32 v3, s6, v3 -; GFX9GISEL-NEXT: s_cmp_lg_u64 s[4:5], 0 -; GFX9GISEL-NEXT: v_readfirstlane_b32 s6, v3 -; GFX9GISEL-NEXT: s_cbranch_scc1 .LBB7_1 -; GFX9GISEL-NEXT: ; %bb.2: -; GFX9GISEL-NEXT: v_mov_b32_e32 v2, s6 -; GFX9GISEL-NEXT: global_store_dword v[0:1], v2, off -; GFX9GISEL-NEXT: s_waitcnt vmcnt(0) -; GFX9GISEL-NEXT: s_setpc_b64 s[30:31] -; -; GFX1064DAGISEL-LABEL: divergent_value_float: -; GFX1064DAGISEL: ; %bb.0: ; %entry -; GFX1064DAGISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX1064DAGISEL-NEXT: s_mov_b64 s[4:5], exec -; GFX1064DAGISEL-NEXT: s_mov_b32 s6, 0x7fc00000 -; GFX1064DAGISEL-NEXT: .LBB7_1: ; =>This Inner Loop Header: Depth=1 -; GFX1064DAGISEL-NEXT: s_ff1_i32_b64 s7, s[4:5] -; GFX1064DAGISEL-NEXT: v_readlane_b32 s8, v2, s7 -; GFX1064DAGISEL-NEXT: s_bitset0_b64 s[4:5], s7 -; GFX1064DAGISEL-NEXT: s_cmp_lg_u64 s[4:5], 0 -; GFX1064DAGISEL-NEXT: v_min_f32_e64 v3, s6, s8 -; GFX1064DAGISEL-NEXT: v_readfirstlane_b32 s6, v3 -; GFX1064DAGISEL-NEXT: s_cbranch_scc1 .LBB7_1 -; GFX1064DAGISEL-NEXT: ; %bb.2: -; GFX1064DAGISEL-NEXT: v_mov_b32_e32 v2, s6 -; GFX1064DAGISEL-NEXT: global_store_dword v[0:1], v2, off -; GFX1064DAGISEL-NEXT: s_setpc_b64 s[30:31] -; -; GFX1064GISEL-LABEL: divergent_value_float: -; GFX1064GISEL: ; %bb.0: ; %entry -; GFX1064GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX1064GISEL-NEXT: s_mov_b64 s[4:5], exec -; GFX1064GISEL-NEXT: s_mov_b32 s6, 0x7fc00000 -; GFX1064GISEL-NEXT: .LBB7_1: ; =>This Inner Loop Header: Depth=1 -; GFX1064GISEL-NEXT: s_ff1_i32_b64 s7, s[4:5] -; GFX1064GISEL-NEXT: v_readlane_b32 s8, v2, s7 -; GFX1064GISEL-NEXT: s_bitset0_b64 s[4:5], s7 -; GFX1064GISEL-NEXT: s_cmp_lg_u64 s[4:5], 0 -; GFX1064GISEL-NEXT: v_min_f32_e64 v3, s6, s8 -; GFX1064GISEL-NEXT: v_readfirstlane_b32 s6, v3 -; GFX1064GISEL-NEXT: s_cbranch_scc1 .LBB7_1 -; GFX1064GISEL-NEXT: ; %bb.2: -; GFX1064GISEL-NEXT: v_mov_b32_e32 v2, s6 -; GFX1064GISEL-NEXT: global_store_dword v[0:1], v2, off -; 
GFX1064GISEL-NEXT: s_setpc_b64 s[30:31] -; -; GFX1032DAGISEL-LABEL: divergent_value_float: -; GFX1032DAGISEL: ; %bb.0: ; %entry -; GFX1032DAGISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX1032DAGISEL-NEXT: s_mov_b32 s4, exec_lo -; GFX1032DAGISEL-NEXT: s_mov_b32 s5, 0x7fc00000 -; GFX1032DAGISEL-NEXT: .LBB7_1: ; =>This Inner Loop Header: Depth=1 -; GFX1032DAGISEL-NEXT: s_ff1_i32_b32 s6, s4 -; GFX1032DAGISEL-NEXT: v_readlane_b32 s7, v2, s6 -; GFX1032DAGISEL-NEXT: s_bitset0_b32 s4, s6 -; GFX1032DAGISEL-NEXT: s_cmp_lg_u32 s4, 0 -; GFX1032DAGISEL-NEXT: v_min_f32_e64 v3, s5, s7 -; GFX1032DAGISEL-NEXT: v_readfirstlane_b32 s5, v3 -; GFX1032DAGISEL-NEXT: s_cbranch_scc1 .LBB7_1 -; GFX1032DAGISEL-NEXT: ; %bb.2: -; GFX1032DAGISEL-NEXT: v_mov_b32_e32 v2, s5 -; GFX1032DAGISEL-NEXT: global_store_dword v[0:1], v2, off -; GFX1032DAGISEL-NEXT: s_setpc_b64 s[30:31] -; -; GFX1032GISEL-LABEL: divergent_value_float: -; GFX1032GISEL: ; %bb.0: ; %entry -; GFX1032GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX1032GISEL-NEXT: s_mov_b32 s4, exec_lo -; GFX1032GISEL-NEXT: s_mov_b32 s5, 0x7fc00000 -; GFX1032GISEL-NEXT: .LBB7_1: ; =>This Inner Loop Header: Depth=1 -; GFX1032GISEL-NEXT: s_ff1_i32_b32 s6, s4 -; GFX1032GISEL-NEXT: v_readlane_b32 s7, v2, s6 -; GFX1032GISEL-NEXT: s_bitset0_b32 s4, s6 -; GFX1032GISEL-NEXT: s_cmp_lg_u32 s4, 0 -; GFX1032GISEL-NEXT: v_min_f32_e64 v3, s5, s7 -; GFX1032GISEL-NEXT: v_readfirstlane_b32 s5, v3 -; GFX1032GISEL-NEXT: s_cbranch_scc1 .LBB7_1 -; GFX1032GISEL-NEXT: ; %bb.2: -; GFX1032GISEL-NEXT: v_mov_b32_e32 v2, s5 -; GFX1032GISEL-NEXT: global_store_dword v[0:1], v2, off -; GFX1032GISEL-NEXT: s_setpc_b64 s[30:31] -; -; GFX1164DAGISEL-LABEL: divergent_value_float: -; GFX1164DAGISEL: ; %bb.0: ; %entry -; GFX1164DAGISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX1164DAGISEL-NEXT: s_mov_b64 s[0:1], exec -; GFX1164DAGISEL-NEXT: s_mov_b32 s2, 0x7fc00000 -; GFX1164DAGISEL-NEXT: .LBB7_1: ; =>This Inner Loop Header: Depth=1 -; 
GFX1164DAGISEL-NEXT: s_ctz_i32_b64 s3, s[0:1] -; GFX1164DAGISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) -; GFX1164DAGISEL-NEXT: v_readlane_b32 s4, v2, s3 -; GFX1164DAGISEL-NEXT: s_bitset0_b64 s[0:1], s3 -; GFX1164DAGISEL-NEXT: s_cmp_lg_u64 s[0:1], 0 -; GFX1164DAGISEL-NEXT: v_min_f32_e64 v3, s2, s4 -; GFX1164DAGISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX1164DAGISEL-NEXT: v_readfirstlane_b32 s2, v3 -; GFX1164DAGISEL-NEXT: s_cbranch_scc1 .LBB7_1 -; GFX1164DAGISEL-NEXT: ; %bb.2: -; GFX1164DAGISEL-NEXT: v_mov_b32_e32 v2, s2 -; GFX1164DAGISEL-NEXT: global_store_b32 v[0:1], v2, off -; GFX1164DAGISEL-NEXT: s_setpc_b64 s[30:31] -; -; GFX1164GISEL-LABEL: divergent_value_float: -; GFX1164GISEL: ; %bb.0: ; %entry -; GFX1164GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX1164GISEL-NEXT: s_mov_b64 s[0:1], exec -; GFX1164GISEL-NEXT: s_mov_b32 s2, 0x7fc00000 -; GFX1164GISEL-NEXT: .LBB7_1: ; =>This Inner Loop Header: Depth=1 -; GFX1164GISEL-NEXT: s_ctz_i32_b64 s3, s[0:1] -; GFX1164GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) -; GFX1164GISEL-NEXT: v_readlane_b32 s4, v2, s3 -; GFX1164GISEL-NEXT: s_bitset0_b64 s[0:1], s3 -; GFX1164GISEL-NEXT: s_cmp_lg_u64 s[0:1], 0 -; GFX1164GISEL-NEXT: v_min_f32_e64 v3, s2, s4 -; GFX1164GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX1164GISEL-NEXT: v_readfirstlane_b32 s2, v3 -; GFX1164GISEL-NEXT: s_cbranch_scc1 .LBB7_1 -; GFX1164GISEL-NEXT: ; %bb.2: -; GFX1164GISEL-NEXT: v_mov_b32_e32 v2, s2 -; GFX1164GISEL-NEXT: global_store_b32 v[0:1], v2, off -; GFX1164GISEL-NEXT: s_setpc_b64 s[30:31] -; -; GFX1132DAGISEL-LABEL: divergent_value_float: -; GFX1132DAGISEL: ; %bb.0: ; %entry -; GFX1132DAGISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX1132DAGISEL-NEXT: s_mov_b32 s0, exec_lo -; GFX1132DAGISEL-NEXT: s_mov_b32 s1, 0x7fc00000 -; GFX1132DAGISEL-NEXT: .LBB7_1: ; =>This Inner Loop Header: Depth=1 -; GFX1132DAGISEL-NEXT: s_ctz_i32_b32 s2, s0 -; 
GFX1132DAGISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) -; GFX1132DAGISEL-NEXT: v_readlane_b32 s3, v2, s2 -; GFX1132DAGISEL-NEXT: s_bitset0_b32 s0, s2 -; GFX1132DAGISEL-NEXT: s_cmp_lg_u32 s0, 0 -; GFX1132DAGISEL-NEXT: v_min_f32_e64 v3, s1, s3 -; GFX1132DAGISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX1132DAGISEL-NEXT: v_readfirstlane_b32 s1, v3 -; GFX1132DAGISEL-NEXT: s_cbranch_scc1 .LBB7_1 -; GFX1132DAGISEL-NEXT: ; %bb.2: -; GFX1132DAGISEL-NEXT: v_mov_b32_e32 v2, s1 -; GFX1132DAGISEL-NEXT: global_store_b32 v[0:1], v2, off -; GFX1132DAGISEL-NEXT: s_setpc_b64 s[30:31] -; -; GFX1132GISEL-LABEL: divergent_value_float: -; GFX1132GISEL: ; %bb.0: ; %entry -; GFX1132GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX1132GISEL-NEXT: s_mov_b32 s0, exec_lo -; GFX1132GISEL-NEXT: s_mov_b32 s1, 0x7fc00000 -; GFX1132GISEL-NEXT: .LBB7_1: ; =>This Inner Loop Header: Depth=1 -; GFX1132GISEL-NEXT: s_ctz_i32_b32 s2, s0 -; GFX1132GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) -; GFX1132GISEL-NEXT: v_readlane_b32 s3, v2, s2 -; GFX1132GISEL-NEXT: s_bitset0_b32 s0, s2 -; GFX1132GISEL-NEXT: s_cmp_lg_u32 s0, 0 -; GFX1132GISEL-NEXT: v_min_f32_e64 v3, s1, s3 -; GFX1132GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX1132GISEL-NEXT: v_readfirstlane_b32 s1, v3 -; GFX1132GISEL-NEXT: s_cbranch_scc1 .LBB7_1 -; GFX1132GISEL-NEXT: ; %bb.2: -; GFX1132GISEL-NEXT: v_mov_b32_e32 v2, s1 -; GFX1132GISEL-NEXT: global_store_b32 v[0:1], v2, off -; GFX1132GISEL-NEXT: s_setpc_b64 s[30:31] -entry: - %result = call float @llvm.amdgcn.wave.reduce.fmin(float %in, i32 1) - store float %result, ptr addrspace(1) %out - ret void -} - -define void @divergent_cfg_float(ptr addrspace(1) %out, float %in, float %in2) { -; GFX8DAGISEL-LABEL: divergent_cfg_float: -; GFX8DAGISEL: ; %bb.0: ; %entry -; GFX8DAGISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX8DAGISEL-NEXT: v_and_b32_e32 v4, 0x3ff, v31 -; GFX8DAGISEL-NEXT: 
v_cmp_lt_u32_e32 vcc, 15, v4 -; GFX8DAGISEL-NEXT: ; implicit-def: $vgpr4 -; GFX8DAGISEL-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GFX8DAGISEL-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; GFX8DAGISEL-NEXT: s_cbranch_execz .LBB8_4 -; GFX8DAGISEL-NEXT: ; %bb.1: ; %else -; GFX8DAGISEL-NEXT: s_mov_b64 s[6:7], exec -; GFX8DAGISEL-NEXT: s_mov_b32 s8, 0x7fc00000 -; GFX8DAGISEL-NEXT: .LBB8_2: ; =>This Inner Loop Header: Depth=1 -; GFX8DAGISEL-NEXT: s_ff1_i32_b64 s9, s[6:7] -; GFX8DAGISEL-NEXT: v_readlane_b32 s10, v2, s9 -; GFX8DAGISEL-NEXT: v_mov_b32_e32 v3, s10 -; GFX8DAGISEL-NEXT: s_bitset0_b64 s[6:7], s9 -; GFX8DAGISEL-NEXT: v_min_f32_e32 v3, s8, v3 -; GFX8DAGISEL-NEXT: s_cmp_lg_u64 s[6:7], 0 -; GFX8DAGISEL-NEXT: v_readfirstlane_b32 s8, v3 -; GFX8DAGISEL-NEXT: s_cbranch_scc1 .LBB8_2 -; GFX8DAGISEL-NEXT: ; %bb.3: -; GFX8DAGISEL-NEXT: v_mov_b32_e32 v4, s8 -; GFX8DAGISEL-NEXT: ; implicit-def: $vgpr3 -; GFX8DAGISEL-NEXT: .LBB8_4: ; %Flow -; GFX8DAGISEL-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GFX8DAGISEL-NEXT: s_cbranch_execz .LBB8_8 -; GFX8DAGISEL-NEXT: ; %bb.5: ; %if -; GFX8DAGISEL-NEXT: s_mov_b64 s[6:7], exec -; GFX8DAGISEL-NEXT: s_mov_b32 s8, 0x7fc00000 -; GFX8DAGISEL-NEXT: .LBB8_6: ; =>This Inner Loop Header: Depth=1 -; GFX8DAGISEL-NEXT: s_ff1_i32_b64 s9, s[6:7] -; GFX8DAGISEL-NEXT: v_readlane_b32 s10, v3, s9 -; GFX8DAGISEL-NEXT: v_mov_b32_e32 v2, s10 -; GFX8DAGISEL-NEXT: s_bitset0_b64 s[6:7], s9 -; GFX8DAGISEL-NEXT: v_min_f32_e32 v2, s8, v2 -; GFX8DAGISEL-NEXT: s_cmp_lg_u64 s[6:7], 0 -; GFX8DAGISEL-NEXT: v_readfirstlane_b32 s8, v2 -; GFX8DAGISEL-NEXT: s_cbranch_scc1 .LBB8_6 -; GFX8DAGISEL-NEXT: ; %bb.7: -; GFX8DAGISEL-NEXT: v_mov_b32_e32 v4, s8 -; GFX8DAGISEL-NEXT: .LBB8_8: ; %endif -; GFX8DAGISEL-NEXT: s_or_b64 exec, exec, s[4:5] -; GFX8DAGISEL-NEXT: flat_store_dword v[0:1], v4 -; GFX8DAGISEL-NEXT: s_waitcnt vmcnt(0) -; GFX8DAGISEL-NEXT: s_setpc_b64 s[30:31] -; -; GFX8GISEL-LABEL: divergent_cfg_float: -; GFX8GISEL: ; %bb.0: ; %entry -; GFX8GISEL-NEXT: s_waitcnt vmcnt(0) 
expcnt(0) lgkmcnt(0) -; GFX8GISEL-NEXT: v_and_b32_e32 v4, 0x3ff, v31 -; GFX8GISEL-NEXT: v_cmp_le_u32_e32 vcc, 16, v4 -; GFX8GISEL-NEXT: ; implicit-def: $sgpr8 -; GFX8GISEL-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GFX8GISEL-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; GFX8GISEL-NEXT: s_cbranch_execz .LBB8_3 -; GFX8GISEL-NEXT: ; %bb.1: ; %else -; GFX8GISEL-NEXT: s_mov_b64 s[6:7], exec -; GFX8GISEL-NEXT: s_mov_b32 s8, 0x7fc00000 -; GFX8GISEL-NEXT: ; implicit-def: $vgpr3 -; GFX8GISEL-NEXT: .LBB8_2: ; =>This Inner Loop Header: Depth=1 -; GFX8GISEL-NEXT: s_ff1_i32_b64 s9, s[6:7] -; GFX8GISEL-NEXT: v_readlane_b32 s10, v2, s9 -; GFX8GISEL-NEXT: v_mov_b32_e32 v4, s10 -; GFX8GISEL-NEXT: s_bitset0_b64 s[6:7], s9 -; GFX8GISEL-NEXT: v_min_f32_e32 v4, s8, v4 -; GFX8GISEL-NEXT: s_cmp_lg_u64 s[6:7], 0 -; GFX8GISEL-NEXT: v_readfirstlane_b32 s8, v4 -; GFX8GISEL-NEXT: s_cbranch_scc1 .LBB8_2 -; GFX8GISEL-NEXT: .LBB8_3: ; %Flow -; GFX8GISEL-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GFX8GISEL-NEXT: s_cbranch_execz .LBB8_6 -; GFX8GISEL-NEXT: ; %bb.4: ; %if -; GFX8GISEL-NEXT: s_mov_b64 s[6:7], exec -; GFX8GISEL-NEXT: s_mov_b32 s8, 0x7fc00000 -; GFX8GISEL-NEXT: .LBB8_5: ; =>This Inner Loop Header: Depth=1 -; GFX8GISEL-NEXT: s_ff1_i32_b64 s9, s[6:7] -; GFX8GISEL-NEXT: v_readlane_b32 s10, v3, s9 -; GFX8GISEL-NEXT: v_mov_b32_e32 v2, s10 -; GFX8GISEL-NEXT: s_bitset0_b64 s[6:7], s9 -; GFX8GISEL-NEXT: v_min_f32_e32 v2, s8, v2 -; GFX8GISEL-NEXT: s_cmp_lg_u64 s[6:7], 0 -; GFX8GISEL-NEXT: v_readfirstlane_b32 s8, v2 -; GFX8GISEL-NEXT: s_cbranch_scc1 .LBB8_5 -; GFX8GISEL-NEXT: .LBB8_6: ; %endif -; GFX8GISEL-NEXT: s_or_b64 exec, exec, s[4:5] -; GFX8GISEL-NEXT: v_mov_b32_e32 v2, s8 -; GFX8GISEL-NEXT: flat_store_dword v[0:1], v2 -; GFX8GISEL-NEXT: s_waitcnt vmcnt(0) -; GFX8GISEL-NEXT: s_setpc_b64 s[30:31] -; -; GFX9DAGISEL-LABEL: divergent_cfg_float: -; GFX9DAGISEL: ; %bb.0: ; %entry -; GFX9DAGISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9DAGISEL-NEXT: v_and_b32_e32 v4, 0x3ff, v31 -; 
GFX9DAGISEL-NEXT: v_cmp_lt_u32_e32 vcc, 15, v4 -; GFX9DAGISEL-NEXT: ; implicit-def: $vgpr4 -; GFX9DAGISEL-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GFX9DAGISEL-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; GFX9DAGISEL-NEXT: s_cbranch_execz .LBB8_4 -; GFX9DAGISEL-NEXT: ; %bb.1: ; %else -; GFX9DAGISEL-NEXT: s_mov_b64 s[6:7], exec -; GFX9DAGISEL-NEXT: s_mov_b32 s8, 0x7fc00000 -; GFX9DAGISEL-NEXT: .LBB8_2: ; =>This Inner Loop Header: Depth=1 -; GFX9DAGISEL-NEXT: s_ff1_i32_b64 s9, s[6:7] -; GFX9DAGISEL-NEXT: v_readlane_b32 s10, v2, s9 -; GFX9DAGISEL-NEXT: v_mov_b32_e32 v3, s10 -; GFX9DAGISEL-NEXT: s_bitset0_b64 s[6:7], s9 -; GFX9DAGISEL-NEXT: v_min_f32_e32 v3, s8, v3 -; GFX9DAGISEL-NEXT: s_cmp_lg_u64 s[6:7], 0 -; GFX9DAGISEL-NEXT: v_readfirstlane_b32 s8, v3 -; GFX9DAGISEL-NEXT: s_cbranch_scc1 .LBB8_2 -; GFX9DAGISEL-NEXT: ; %bb.3: -; GFX9DAGISEL-NEXT: v_mov_b32_e32 v4, s8 -; GFX9DAGISEL-NEXT: ; implicit-def: $vgpr3 -; GFX9DAGISEL-NEXT: .LBB8_4: ; %Flow -; GFX9DAGISEL-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GFX9DAGISEL-NEXT: s_cbranch_execz .LBB8_8 -; GFX9DAGISEL-NEXT: ; %bb.5: ; %if -; GFX9DAGISEL-NEXT: s_mov_b64 s[6:7], exec -; GFX9DAGISEL-NEXT: s_mov_b32 s8, 0x7fc00000 -; GFX9DAGISEL-NEXT: .LBB8_6: ; =>This Inner Loop Header: Depth=1 -; GFX9DAGISEL-NEXT: s_ff1_i32_b64 s9, s[6:7] -; GFX9DAGISEL-NEXT: v_readlane_b32 s10, v3, s9 -; GFX9DAGISEL-NEXT: v_mov_b32_e32 v2, s10 -; GFX9DAGISEL-NEXT: s_bitset0_b64 s[6:7], s9 -; GFX9DAGISEL-NEXT: v_min_f32_e32 v2, s8, v2 -; GFX9DAGISEL-NEXT: s_cmp_lg_u64 s[6:7], 0 -; GFX9DAGISEL-NEXT: v_readfirstlane_b32 s8, v2 -; GFX9DAGISEL-NEXT: s_cbranch_scc1 .LBB8_6 -; GFX9DAGISEL-NEXT: ; %bb.7: -; GFX9DAGISEL-NEXT: v_mov_b32_e32 v4, s8 -; GFX9DAGISEL-NEXT: .LBB8_8: ; %endif -; GFX9DAGISEL-NEXT: s_or_b64 exec, exec, s[4:5] -; GFX9DAGISEL-NEXT: global_store_dword v[0:1], v4, off -; GFX9DAGISEL-NEXT: s_waitcnt vmcnt(0) -; GFX9DAGISEL-NEXT: s_setpc_b64 s[30:31] -; -; GFX9GISEL-LABEL: divergent_cfg_float: -; GFX9GISEL: ; %bb.0: ; %entry -; 
GFX9GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9GISEL-NEXT: v_and_b32_e32 v4, 0x3ff, v31 -; GFX9GISEL-NEXT: v_cmp_le_u32_e32 vcc, 16, v4 -; GFX9GISEL-NEXT: ; implicit-def: $sgpr8 -; GFX9GISEL-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GFX9GISEL-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; GFX9GISEL-NEXT: s_cbranch_execz .LBB8_3 -; GFX9GISEL-NEXT: ; %bb.1: ; %else -; GFX9GISEL-NEXT: s_mov_b64 s[6:7], exec -; GFX9GISEL-NEXT: s_mov_b32 s8, 0x7fc00000 -; GFX9GISEL-NEXT: ; implicit-def: $vgpr3 -; GFX9GISEL-NEXT: .LBB8_2: ; =>This Inner Loop Header: Depth=1 -; GFX9GISEL-NEXT: s_ff1_i32_b64 s9, s[6:7] -; GFX9GISEL-NEXT: v_readlane_b32 s10, v2, s9 -; GFX9GISEL-NEXT: v_mov_b32_e32 v4, s10 -; GFX9GISEL-NEXT: s_bitset0_b64 s[6:7], s9 -; GFX9GISEL-NEXT: v_min_f32_e32 v4, s8, v4 -; GFX9GISEL-NEXT: s_cmp_lg_u64 s[6:7], 0 -; GFX9GISEL-NEXT: v_readfirstlane_b32 s8, v4 -; GFX9GISEL-NEXT: s_cbranch_scc1 .LBB8_2 -; GFX9GISEL-NEXT: .LBB8_3: ; %Flow -; GFX9GISEL-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GFX9GISEL-NEXT: s_cbranch_execz .LBB8_6 -; GFX9GISEL-NEXT: ; %bb.4: ; %if -; GFX9GISEL-NEXT: s_mov_b64 s[6:7], exec -; GFX9GISEL-NEXT: s_mov_b32 s8, 0x7fc00000 -; GFX9GISEL-NEXT: .LBB8_5: ; =>This Inner Loop Header: Depth=1 -; GFX9GISEL-NEXT: s_ff1_i32_b64 s9, s[6:7] -; GFX9GISEL-NEXT: v_readlane_b32 s10, v3, s9 -; GFX9GISEL-NEXT: v_mov_b32_e32 v2, s10 -; GFX9GISEL-NEXT: s_bitset0_b64 s[6:7], s9 -; GFX9GISEL-NEXT: v_min_f32_e32 v2, s8, v2 -; GFX9GISEL-NEXT: s_cmp_lg_u64 s[6:7], 0 -; GFX9GISEL-NEXT: v_readfirstlane_b32 s8, v2 -; GFX9GISEL-NEXT: s_cbranch_scc1 .LBB8_5 -; GFX9GISEL-NEXT: .LBB8_6: ; %endif -; GFX9GISEL-NEXT: s_or_b64 exec, exec, s[4:5] -; GFX9GISEL-NEXT: v_mov_b32_e32 v2, s8 -; GFX9GISEL-NEXT: global_store_dword v[0:1], v2, off -; GFX9GISEL-NEXT: s_waitcnt vmcnt(0) -; GFX9GISEL-NEXT: s_setpc_b64 s[30:31] -; -; GFX1064DAGISEL-LABEL: divergent_cfg_float: -; GFX1064DAGISEL: ; %bb.0: ; %entry -; GFX1064DAGISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; 
GFX1064DAGISEL-NEXT: v_and_b32_e32 v4, 0x3ff, v31 -; GFX1064DAGISEL-NEXT: v_cmp_lt_u32_e32 vcc, 15, v4 -; GFX1064DAGISEL-NEXT: ; implicit-def: $vgpr4 -; GFX1064DAGISEL-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GFX1064DAGISEL-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; GFX1064DAGISEL-NEXT: s_cbranch_execz .LBB8_4 -; GFX1064DAGISEL-NEXT: ; %bb.1: ; %else -; GFX1064DAGISEL-NEXT: s_mov_b64 s[6:7], exec -; GFX1064DAGISEL-NEXT: s_mov_b32 s8, 0x7fc00000 -; GFX1064DAGISEL-NEXT: .LBB8_2: ; =>This Inner Loop Header: Depth=1 -; GFX1064DAGISEL-NEXT: s_ff1_i32_b64 s9, s[6:7] -; GFX1064DAGISEL-NEXT: v_readlane_b32 s10, v2, s9 -; GFX1064DAGISEL-NEXT: s_bitset0_b64 s[6:7], s9 -; GFX1064DAGISEL-NEXT: s_cmp_lg_u64 s[6:7], 0 -; GFX1064DAGISEL-NEXT: v_min_f32_e64 v3, s8, s10 -; GFX1064DAGISEL-NEXT: v_readfirstlane_b32 s8, v3 -; GFX1064DAGISEL-NEXT: s_cbranch_scc1 .LBB8_2 -; GFX1064DAGISEL-NEXT: ; %bb.3: -; GFX1064DAGISEL-NEXT: v_mov_b32_e32 v4, s8 -; GFX1064DAGISEL-NEXT: ; implicit-def: $vgpr3 -; GFX1064DAGISEL-NEXT: .LBB8_4: ; %Flow -; GFX1064DAGISEL-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GFX1064DAGISEL-NEXT: s_cbranch_execz .LBB8_8 -; GFX1064DAGISEL-NEXT: ; %bb.5: ; %if -; GFX1064DAGISEL-NEXT: s_mov_b64 s[6:7], exec -; GFX1064DAGISEL-NEXT: s_mov_b32 s8, 0x7fc00000 -; GFX1064DAGISEL-NEXT: .LBB8_6: ; =>This Inner Loop Header: Depth=1 -; GFX1064DAGISEL-NEXT: s_ff1_i32_b64 s9, s[6:7] -; GFX1064DAGISEL-NEXT: v_readlane_b32 s10, v3, s9 -; GFX1064DAGISEL-NEXT: s_bitset0_b64 s[6:7], s9 -; GFX1064DAGISEL-NEXT: s_cmp_lg_u64 s[6:7], 0 -; GFX1064DAGISEL-NEXT: v_min_f32_e64 v2, s8, s10 -; GFX1064DAGISEL-NEXT: v_readfirstlane_b32 s8, v2 -; GFX1064DAGISEL-NEXT: s_cbranch_scc1 .LBB8_6 -; GFX1064DAGISEL-NEXT: ; %bb.7: -; GFX1064DAGISEL-NEXT: v_mov_b32_e32 v4, s8 -; GFX1064DAGISEL-NEXT: .LBB8_8: ; %endif -; GFX1064DAGISEL-NEXT: s_or_b64 exec, exec, s[4:5] -; GFX1064DAGISEL-NEXT: global_store_dword v[0:1], v4, off -; GFX1064DAGISEL-NEXT: s_setpc_b64 s[30:31] -; -; GFX1064GISEL-LABEL: 
divergent_cfg_float: -; GFX1064GISEL: ; %bb.0: ; %entry -; GFX1064GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX1064GISEL-NEXT: v_and_b32_e32 v4, 0x3ff, v31 -; GFX1064GISEL-NEXT: ; implicit-def: $sgpr8 -; GFX1064GISEL-NEXT: v_cmp_le_u32_e32 vcc, 16, v4 -; GFX1064GISEL-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GFX1064GISEL-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; GFX1064GISEL-NEXT: s_cbranch_execz .LBB8_3 -; GFX1064GISEL-NEXT: ; %bb.1: ; %else -; GFX1064GISEL-NEXT: s_mov_b64 s[6:7], exec -; GFX1064GISEL-NEXT: s_mov_b32 s8, 0x7fc00000 -; GFX1064GISEL-NEXT: .LBB8_2: ; =>This Inner Loop Header: Depth=1 -; GFX1064GISEL-NEXT: s_ff1_i32_b64 s9, s[6:7] -; GFX1064GISEL-NEXT: v_readlane_b32 s10, v2, s9 -; GFX1064GISEL-NEXT: s_bitset0_b64 s[6:7], s9 -; GFX1064GISEL-NEXT: s_cmp_lg_u64 s[6:7], 0 -; GFX1064GISEL-NEXT: v_min_f32_e64 v3, s8, s10 -; GFX1064GISEL-NEXT: v_readfirstlane_b32 s8, v3 -; GFX1064GISEL-NEXT: ; implicit-def: $vgpr3 -; GFX1064GISEL-NEXT: s_cbranch_scc1 .LBB8_2 -; GFX1064GISEL-NEXT: .LBB8_3: ; %Flow -; GFX1064GISEL-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GFX1064GISEL-NEXT: s_cbranch_execz .LBB8_6 -; GFX1064GISEL-NEXT: ; %bb.4: ; %if -; GFX1064GISEL-NEXT: s_mov_b64 s[6:7], exec -; GFX1064GISEL-NEXT: s_mov_b32 s8, 0x7fc00000 -; GFX1064GISEL-NEXT: .LBB8_5: ; =>This Inner Loop Header: Depth=1 -; GFX1064GISEL-NEXT: s_ff1_i32_b64 s9, s[6:7] -; GFX1064GISEL-NEXT: v_readlane_b32 s10, v3, s9 -; GFX1064GISEL-NEXT: s_bitset0_b64 s[6:7], s9 -; GFX1064GISEL-NEXT: s_cmp_lg_u64 s[6:7], 0 -; GFX1064GISEL-NEXT: v_min_f32_e64 v2, s8, s10 -; GFX1064GISEL-NEXT: v_readfirstlane_b32 s8, v2 -; GFX1064GISEL-NEXT: s_cbranch_scc1 .LBB8_5 -; GFX1064GISEL-NEXT: .LBB8_6: ; %endif -; GFX1064GISEL-NEXT: s_or_b64 exec, exec, s[4:5] -; GFX1064GISEL-NEXT: v_mov_b32_e32 v2, s8 -; GFX1064GISEL-NEXT: global_store_dword v[0:1], v2, off -; GFX1064GISEL-NEXT: s_setpc_b64 s[30:31] -; -; GFX1032DAGISEL-LABEL: divergent_cfg_float: -; GFX1032DAGISEL: ; %bb.0: ; %entry -; 
GFX1032DAGISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX1032DAGISEL-NEXT: v_and_b32_e32 v4, 0x3ff, v31 -; GFX1032DAGISEL-NEXT: v_cmp_lt_u32_e32 vcc_lo, 15, v4 -; GFX1032DAGISEL-NEXT: ; implicit-def: $vgpr4 -; GFX1032DAGISEL-NEXT: s_and_saveexec_b32 s4, vcc_lo -; GFX1032DAGISEL-NEXT: s_xor_b32 s4, exec_lo, s4 -; GFX1032DAGISEL-NEXT: s_cbranch_execz .LBB8_4 -; GFX1032DAGISEL-NEXT: ; %bb.1: ; %else -; GFX1032DAGISEL-NEXT: s_mov_b32 s5, exec_lo -; GFX1032DAGISEL-NEXT: s_mov_b32 s6, 0x7fc00000 -; GFX1032DAGISEL-NEXT: .LBB8_2: ; =>This Inner Loop Header: Depth=1 -; GFX1032DAGISEL-NEXT: s_ff1_i32_b32 s7, s5 -; GFX1032DAGISEL-NEXT: v_readlane_b32 s8, v2, s7 -; GFX1032DAGISEL-NEXT: s_bitset0_b32 s5, s7 -; GFX1032DAGISEL-NEXT: s_cmp_lg_u32 s5, 0 -; GFX1032DAGISEL-NEXT: v_min_f32_e64 v3, s6, s8 -; GFX1032DAGISEL-NEXT: v_readfirstlane_b32 s6, v3 -; GFX1032DAGISEL-NEXT: s_cbranch_scc1 .LBB8_2 -; GFX1032DAGISEL-NEXT: ; %bb.3: -; GFX1032DAGISEL-NEXT: v_mov_b32_e32 v4, s6 -; GFX1032DAGISEL-NEXT: ; implicit-def: $vgpr3 -; GFX1032DAGISEL-NEXT: .LBB8_4: ; %Flow -; GFX1032DAGISEL-NEXT: s_andn2_saveexec_b32 s4, s4 -; GFX1032DAGISEL-NEXT: s_cbranch_execz .LBB8_8 -; GFX1032DAGISEL-NEXT: ; %bb.5: ; %if -; GFX1032DAGISEL-NEXT: s_mov_b32 s5, exec_lo -; GFX1032DAGISEL-NEXT: s_mov_b32 s6, 0x7fc00000 -; GFX1032DAGISEL-NEXT: .LBB8_6: ; =>This Inner Loop Header: Depth=1 -; GFX1032DAGISEL-NEXT: s_ff1_i32_b32 s7, s5 -; GFX1032DAGISEL-NEXT: v_readlane_b32 s8, v3, s7 -; GFX1032DAGISEL-NEXT: s_bitset0_b32 s5, s7 -; GFX1032DAGISEL-NEXT: s_cmp_lg_u32 s5, 0 -; GFX1032DAGISEL-NEXT: v_min_f32_e64 v2, s6, s8 -; GFX1032DAGISEL-NEXT: v_readfirstlane_b32 s6, v2 -; GFX1032DAGISEL-NEXT: s_cbranch_scc1 .LBB8_6 -; GFX1032DAGISEL-NEXT: ; %bb.7: -; GFX1032DAGISEL-NEXT: v_mov_b32_e32 v4, s6 -; GFX1032DAGISEL-NEXT: .LBB8_8: ; %endif -; GFX1032DAGISEL-NEXT: s_or_b32 exec_lo, exec_lo, s4 -; GFX1032DAGISEL-NEXT: global_store_dword v[0:1], v4, off -; GFX1032DAGISEL-NEXT: s_setpc_b64 s[30:31] -; -; 
GFX1032GISEL-LABEL: divergent_cfg_float: -; GFX1032GISEL: ; %bb.0: ; %entry -; GFX1032GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX1032GISEL-NEXT: v_and_b32_e32 v4, 0x3ff, v31 -; GFX1032GISEL-NEXT: ; implicit-def: $sgpr4 -; GFX1032GISEL-NEXT: v_cmp_le_u32_e32 vcc_lo, 16, v4 -; GFX1032GISEL-NEXT: s_and_saveexec_b32 s5, vcc_lo -; GFX1032GISEL-NEXT: s_xor_b32 s5, exec_lo, s5 -; GFX1032GISEL-NEXT: s_cbranch_execz .LBB8_3 -; GFX1032GISEL-NEXT: ; %bb.1: ; %else -; GFX1032GISEL-NEXT: s_mov_b32 s6, exec_lo -; GFX1032GISEL-NEXT: s_mov_b32 s4, 0x7fc00000 -; GFX1032GISEL-NEXT: .LBB8_2: ; =>This Inner Loop Header: Depth=1 -; GFX1032GISEL-NEXT: s_ff1_i32_b32 s7, s6 -; GFX1032GISEL-NEXT: v_readlane_b32 s8, v2, s7 -; GFX1032GISEL-NEXT: s_bitset0_b32 s6, s7 -; GFX1032GISEL-NEXT: s_cmp_lg_u32 s6, 0 -; GFX1032GISEL-NEXT: v_min_f32_e64 v3, s4, s8 -; GFX1032GISEL-NEXT: v_readfirstlane_b32 s4, v3 -; GFX1032GISEL-NEXT: ; implicit-def: $vgpr3 -; GFX1032GISEL-NEXT: s_cbranch_scc1 .LBB8_2 -; GFX1032GISEL-NEXT: .LBB8_3: ; %Flow -; GFX1032GISEL-NEXT: s_andn2_saveexec_b32 s5, s5 -; GFX1032GISEL-NEXT: s_cbranch_execz .LBB8_6 -; GFX1032GISEL-NEXT: ; %bb.4: ; %if -; GFX1032GISEL-NEXT: s_mov_b32 s6, exec_lo -; GFX1032GISEL-NEXT: s_mov_b32 s4, 0x7fc00000 -; GFX1032GISEL-NEXT: .LBB8_5: ; =>This Inner Loop Header: Depth=1 -; GFX1032GISEL-NEXT: s_ff1_i32_b32 s7, s6 -; GFX1032GISEL-NEXT: v_readlane_b32 s8, v3, s7 -; GFX1032GISEL-NEXT: s_bitset0_b32 s6, s7 -; GFX1032GISEL-NEXT: s_cmp_lg_u32 s6, 0 -; GFX1032GISEL-NEXT: v_min_f32_e64 v2, s4, s8 -; GFX1032GISEL-NEXT: v_readfirstlane_b32 s4, v2 -; GFX1032GISEL-NEXT: s_cbranch_scc1 .LBB8_5 -; GFX1032GISEL-NEXT: .LBB8_6: ; %endif -; GFX1032GISEL-NEXT: s_or_b32 exec_lo, exec_lo, s5 -; GFX1032GISEL-NEXT: v_mov_b32_e32 v2, s4 -; GFX1032GISEL-NEXT: global_store_dword v[0:1], v2, off -; GFX1032GISEL-NEXT: s_setpc_b64 s[30:31] -; -; GFX1164DAGISEL-LABEL: divergent_cfg_float: -; GFX1164DAGISEL: ; %bb.0: ; %entry -; GFX1164DAGISEL-NEXT: s_waitcnt 
vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX1164DAGISEL-NEXT: v_and_b32_e32 v4, 0x3ff, v31 -; GFX1164DAGISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) -; GFX1164DAGISEL-NEXT: v_cmp_lt_u32_e32 vcc, 15, v4 -; GFX1164DAGISEL-NEXT: ; implicit-def: $vgpr4 -; GFX1164DAGISEL-NEXT: s_and_saveexec_b64 s[0:1], vcc -; GFX1164DAGISEL-NEXT: s_xor_b64 s[0:1], exec, s[0:1] -; GFX1164DAGISEL-NEXT: s_cbranch_execz .LBB8_4 -; GFX1164DAGISEL-NEXT: ; %bb.1: ; %else -; GFX1164DAGISEL-NEXT: s_mov_b64 s[2:3], exec -; GFX1164DAGISEL-NEXT: s_mov_b32 s4, 0x7fc00000 -; GFX1164DAGISEL-NEXT: .LBB8_2: ; =>This Inner Loop Header: Depth=1 -; GFX1164DAGISEL-NEXT: s_ctz_i32_b64 s5, s[2:3] -; GFX1164DAGISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) -; GFX1164DAGISEL-NEXT: v_readlane_b32 s6, v2, s5 -; GFX1164DAGISEL-NEXT: s_bitset0_b64 s[2:3], s5 -; GFX1164DAGISEL-NEXT: s_cmp_lg_u64 s[2:3], 0 -; GFX1164DAGISEL-NEXT: v_min_f32_e64 v3, s4, s6 -; GFX1164DAGISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX1164DAGISEL-NEXT: v_readfirstlane_b32 s4, v3 -; GFX1164DAGISEL-NEXT: s_cbranch_scc1 .LBB8_2 -; GFX1164DAGISEL-NEXT: ; %bb.3: -; GFX1164DAGISEL-NEXT: v_mov_b32_e32 v4, s4 -; GFX1164DAGISEL-NEXT: ; implicit-def: $vgpr3 -; GFX1164DAGISEL-NEXT: .LBB8_4: ; %Flow -; GFX1164DAGISEL-NEXT: s_and_not1_saveexec_b64 s[0:1], s[0:1] -; GFX1164DAGISEL-NEXT: s_cbranch_execz .LBB8_8 -; GFX1164DAGISEL-NEXT: ; %bb.5: ; %if -; GFX1164DAGISEL-NEXT: s_mov_b64 s[2:3], exec -; GFX1164DAGISEL-NEXT: s_mov_b32 s4, 0x7fc00000 -; GFX1164DAGISEL-NEXT: .LBB8_6: ; =>This Inner Loop Header: Depth=1 -; GFX1164DAGISEL-NEXT: s_ctz_i32_b64 s5, s[2:3] -; GFX1164DAGISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) -; GFX1164DAGISEL-NEXT: v_readlane_b32 s6, v3, s5 -; GFX1164DAGISEL-NEXT: s_bitset0_b64 s[2:3], s5 -; GFX1164DAGISEL-NEXT: s_cmp_lg_u64 s[2:3], 0 -; GFX1164DAGISEL-NEXT: v_min_f32_e64 v2, s4, s6 -; GFX1164DAGISEL-NEXT: 
s_delay_alu instid0(VALU_DEP_1) -; GFX1164DAGISEL-NEXT: v_readfirstlane_b32 s4, v2 -; GFX1164DAGISEL-NEXT: s_cbranch_scc1 .LBB8_6 -; GFX1164DAGISEL-NEXT: ; %bb.7: -; GFX1164DAGISEL-NEXT: v_mov_b32_e32 v4, s4 -; GFX1164DAGISEL-NEXT: .LBB8_8: ; %endif -; GFX1164DAGISEL-NEXT: s_or_b64 exec, exec, s[0:1] -; GFX1164DAGISEL-NEXT: global_store_b32 v[0:1], v4, off -; GFX1164DAGISEL-NEXT: s_setpc_b64 s[30:31] -; -; GFX1164GISEL-LABEL: divergent_cfg_float: -; GFX1164GISEL: ; %bb.0: ; %entry -; GFX1164GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX1164GISEL-NEXT: v_and_b32_e32 v4, 0x3ff, v31 -; GFX1164GISEL-NEXT: s_mov_b64 s[0:1], exec -; GFX1164GISEL-NEXT: ; implicit-def: $sgpr4 -; GFX1164GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX1164GISEL-NEXT: v_cmpx_le_u32_e32 16, v4 -; GFX1164GISEL-NEXT: s_xor_b64 s[0:1], exec, s[0:1] -; GFX1164GISEL-NEXT: s_cbranch_execz .LBB8_3 -; GFX1164GISEL-NEXT: ; %bb.1: ; %else -; GFX1164GISEL-NEXT: s_mov_b64 s[2:3], exec -; GFX1164GISEL-NEXT: s_mov_b32 s4, 0x7fc00000 -; GFX1164GISEL-NEXT: .LBB8_2: ; =>This Inner Loop Header: Depth=1 -; GFX1164GISEL-NEXT: s_ctz_i32_b64 s5, s[2:3] -; GFX1164GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) -; GFX1164GISEL-NEXT: v_readlane_b32 s6, v2, s5 -; GFX1164GISEL-NEXT: s_bitset0_b64 s[2:3], s5 -; GFX1164GISEL-NEXT: s_cmp_lg_u64 s[2:3], 0 -; GFX1164GISEL-NEXT: v_min_f32_e64 v3, s4, s6 -; GFX1164GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX1164GISEL-NEXT: v_readfirstlane_b32 s4, v3 -; GFX1164GISEL-NEXT: ; implicit-def: $vgpr3 -; GFX1164GISEL-NEXT: s_cbranch_scc1 .LBB8_2 -; GFX1164GISEL-NEXT: .LBB8_3: ; %Flow -; GFX1164GISEL-NEXT: s_and_not1_saveexec_b64 s[0:1], s[0:1] -; GFX1164GISEL-NEXT: s_cbranch_execz .LBB8_6 -; GFX1164GISEL-NEXT: ; %bb.4: ; %if -; GFX1164GISEL-NEXT: s_mov_b64 s[2:3], exec -; GFX1164GISEL-NEXT: s_mov_b32 s4, 0x7fc00000 -; GFX1164GISEL-NEXT: .LBB8_5: ; =>This Inner Loop Header: Depth=1 -; GFX1164GISEL-NEXT: s_ctz_i32_b64 s5, 
s[2:3] -; GFX1164GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) -; GFX1164GISEL-NEXT: v_readlane_b32 s6, v3, s5 -; GFX1164GISEL-NEXT: s_bitset0_b64 s[2:3], s5 -; GFX1164GISEL-NEXT: s_cmp_lg_u64 s[2:3], 0 -; GFX1164GISEL-NEXT: v_min_f32_e64 v2, s4, s6 -; GFX1164GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX1164GISEL-NEXT: v_readfirstlane_b32 s4, v2 -; GFX1164GISEL-NEXT: s_cbranch_scc1 .LBB8_5 -; GFX1164GISEL-NEXT: .LBB8_6: ; %endif -; GFX1164GISEL-NEXT: s_or_b64 exec, exec, s[0:1] -; GFX1164GISEL-NEXT: v_mov_b32_e32 v2, s4 -; GFX1164GISEL-NEXT: global_store_b32 v[0:1], v2, off -; GFX1164GISEL-NEXT: s_setpc_b64 s[30:31] -; -; GFX1132DAGISEL-LABEL: divergent_cfg_float: -; GFX1132DAGISEL: ; %bb.0: ; %entry -; GFX1132DAGISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX1132DAGISEL-NEXT: v_and_b32_e32 v4, 0x3ff, v31 -; GFX1132DAGISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) -; GFX1132DAGISEL-NEXT: v_cmp_lt_u32_e32 vcc_lo, 15, v4 -; GFX1132DAGISEL-NEXT: ; implicit-def: $vgpr4 -; GFX1132DAGISEL-NEXT: s_and_saveexec_b32 s0, vcc_lo -; GFX1132DAGISEL-NEXT: s_xor_b32 s0, exec_lo, s0 -; GFX1132DAGISEL-NEXT: s_cbranch_execz .LBB8_4 -; GFX1132DAGISEL-NEXT: ; %bb.1: ; %else -; GFX1132DAGISEL-NEXT: s_mov_b32 s1, exec_lo -; GFX1132DAGISEL-NEXT: s_mov_b32 s2, 0x7fc00000 -; GFX1132DAGISEL-NEXT: .LBB8_2: ; =>This Inner Loop Header: Depth=1 -; GFX1132DAGISEL-NEXT: s_ctz_i32_b32 s3, s1 -; GFX1132DAGISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) -; GFX1132DAGISEL-NEXT: v_readlane_b32 s4, v2, s3 -; GFX1132DAGISEL-NEXT: s_bitset0_b32 s1, s3 -; GFX1132DAGISEL-NEXT: s_cmp_lg_u32 s1, 0 -; GFX1132DAGISEL-NEXT: v_min_f32_e64 v3, s2, s4 -; GFX1132DAGISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX1132DAGISEL-NEXT: v_readfirstlane_b32 s2, v3 -; GFX1132DAGISEL-NEXT: s_cbranch_scc1 .LBB8_2 -; GFX1132DAGISEL-NEXT: ; %bb.3: -; GFX1132DAGISEL-NEXT: v_mov_b32_e32 v4, s2 
-; GFX1132DAGISEL-NEXT: ; implicit-def: $vgpr3 -; GFX1132DAGISEL-NEXT: .LBB8_4: ; %Flow -; GFX1132DAGISEL-NEXT: s_and_not1_saveexec_b32 s0, s0 -; GFX1132DAGISEL-NEXT: s_cbranch_execz .LBB8_8 -; GFX1132DAGISEL-NEXT: ; %bb.5: ; %if -; GFX1132DAGISEL-NEXT: s_mov_b32 s1, exec_lo -; GFX1132DAGISEL-NEXT: s_mov_b32 s2, 0x7fc00000 -; GFX1132DAGISEL-NEXT: .LBB8_6: ; =>This Inner Loop Header: Depth=1 -; GFX1132DAGISEL-NEXT: s_ctz_i32_b32 s3, s1 -; GFX1132DAGISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) -; GFX1132DAGISEL-NEXT: v_readlane_b32 s4, v3, s3 -; GFX1132DAGISEL-NEXT: s_bitset0_b32 s1, s3 -; GFX1132DAGISEL-NEXT: s_cmp_lg_u32 s1, 0 -; GFX1132DAGISEL-NEXT: v_min_f32_e64 v2, s2, s4 -; GFX1132DAGISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX1132DAGISEL-NEXT: v_readfirstlane_b32 s2, v2 -; GFX1132DAGISEL-NEXT: s_cbranch_scc1 .LBB8_6 -; GFX1132DAGISEL-NEXT: ; %bb.7: -; GFX1132DAGISEL-NEXT: v_mov_b32_e32 v4, s2 -; GFX1132DAGISEL-NEXT: .LBB8_8: ; %endif -; GFX1132DAGISEL-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; GFX1132DAGISEL-NEXT: global_store_b32 v[0:1], v4, off -; GFX1132DAGISEL-NEXT: s_setpc_b64 s[30:31] -; -; GFX1132GISEL-LABEL: divergent_cfg_float: -; GFX1132GISEL: ; %bb.0: ; %entry -; GFX1132GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX1132GISEL-NEXT: v_and_b32_e32 v4, 0x3ff, v31 -; GFX1132GISEL-NEXT: s_mov_b32 s1, exec_lo -; GFX1132GISEL-NEXT: ; implicit-def: $sgpr0 -; GFX1132GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX1132GISEL-NEXT: v_cmpx_le_u32_e32 16, v4 -; GFX1132GISEL-NEXT: s_xor_b32 s1, exec_lo, s1 -; GFX1132GISEL-NEXT: s_cbranch_execz .LBB8_3 -; GFX1132GISEL-NEXT: ; %bb.1: ; %else -; GFX1132GISEL-NEXT: s_mov_b32 s2, exec_lo -; GFX1132GISEL-NEXT: s_mov_b32 s0, 0x7fc00000 -; GFX1132GISEL-NEXT: .LBB8_2: ; =>This Inner Loop Header: Depth=1 -; GFX1132GISEL-NEXT: s_ctz_i32_b32 s3, s2 -; GFX1132GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) -; GFX1132GISEL-NEXT: 
v_readlane_b32 s4, v2, s3 -; GFX1132GISEL-NEXT: s_bitset0_b32 s2, s3 -; GFX1132GISEL-NEXT: s_cmp_lg_u32 s2, 0 -; GFX1132GISEL-NEXT: v_min_f32_e64 v3, s0, s4 -; GFX1132GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX1132GISEL-NEXT: v_readfirstlane_b32 s0, v3 -; GFX1132GISEL-NEXT: ; implicit-def: $vgpr3 -; GFX1132GISEL-NEXT: s_cbranch_scc1 .LBB8_2 -; GFX1132GISEL-NEXT: .LBB8_3: ; %Flow -; GFX1132GISEL-NEXT: s_and_not1_saveexec_b32 s1, s1 -; GFX1132GISEL-NEXT: s_cbranch_execz .LBB8_6 -; GFX1132GISEL-NEXT: ; %bb.4: ; %if -; GFX1132GISEL-NEXT: s_mov_b32 s2, exec_lo -; GFX1132GISEL-NEXT: s_mov_b32 s0, 0x7fc00000 -; GFX1132GISEL-NEXT: .LBB8_5: ; =>This Inner Loop Header: Depth=1 -; GFX1132GISEL-NEXT: s_ctz_i32_b32 s3, s2 -; GFX1132GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) -; GFX1132GISEL-NEXT: v_readlane_b32 s4, v3, s3 -; GFX1132GISEL-NEXT: s_bitset0_b32 s2, s3 -; GFX1132GISEL-NEXT: s_cmp_lg_u32 s2, 0 -; GFX1132GISEL-NEXT: v_min_f32_e64 v2, s0, s4 -; GFX1132GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX1132GISEL-NEXT: v_readfirstlane_b32 s0, v2 -; GFX1132GISEL-NEXT: s_cbranch_scc1 .LBB8_5 -; GFX1132GISEL-NEXT: .LBB8_6: ; %endif -; GFX1132GISEL-NEXT: s_or_b32 exec_lo, exec_lo, s1 -; GFX1132GISEL-NEXT: v_mov_b32_e32 v2, s0 -; GFX1132GISEL-NEXT: global_store_b32 v[0:1], v2, off -; GFX1132GISEL-NEXT: s_setpc_b64 s[30:31] -entry: - %tid = call i32 @llvm.amdgcn.workitem.id.x() - %d_cmp = icmp ult i32 %tid, 16 - br i1 %d_cmp, label %if, label %else - -if: - %reducedValTid = call float @llvm.amdgcn.wave.reduce.fmin(float %in2, i32 1) - br label %endif - -else: - %reducedValIn = call float @llvm.amdgcn.wave.reduce.fmin(float %in, i32 1) - br label %endif - -endif: - %combine = phi float [%reducedValTid, %if], [%reducedValIn, %else] - store float %combine, ptr addrspace(1) %out - ret void -} ;; NOTE: These prefixes are unused and the list is autogenerated. 
Do not add tests below this line: ; GFX11DAGISEL: {{.*}} ; GFX11GISEL: {{.*}} diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.reduce.sub.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.reduce.sub.ll index e9a163c214011..f094213731684 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.reduce.sub.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.reduce.sub.ll @@ -2220,1007 +2220,6 @@ endif: store i64 %combine, ptr addrspace(1) %out ret void } - -define amdgpu_kernel void @uniform_value_float(ptr addrspace(1) %out, float %in) { -; GFX8DAGISEL-LABEL: uniform_value_float: -; GFX8DAGISEL: ; %bb.0: ; %entry -; GFX8DAGISEL-NEXT: s_mov_b64 s[0:1], exec -; GFX8DAGISEL-NEXT: s_load_dword s2, s[4:5], 0x2c -; GFX8DAGISEL-NEXT: s_bcnt1_i32_b64 s0, s[0:1] -; GFX8DAGISEL-NEXT: v_cvt_f32_i32_e32 v0, s0 -; GFX8DAGISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 -; GFX8DAGISEL-NEXT: s_waitcnt lgkmcnt(0) -; GFX8DAGISEL-NEXT: v_mul_f32_e64 v0, -s2, v0 -; GFX8DAGISEL-NEXT: v_readfirstlane_b32 s2, v0 -; GFX8DAGISEL-NEXT: v_mov_b32_e32 v0, s0 -; GFX8DAGISEL-NEXT: v_mov_b32_e32 v1, s1 -; GFX8DAGISEL-NEXT: v_mov_b32_e32 v2, s2 -; GFX8DAGISEL-NEXT: flat_store_dword v[0:1], v2 -; GFX8DAGISEL-NEXT: s_endpgm -; -; GFX8GISEL-LABEL: uniform_value_float: -; GFX8GISEL: ; %bb.0: ; %entry -; GFX8GISEL-NEXT: s_mov_b64 s[0:1], exec -; GFX8GISEL-NEXT: s_load_dword s2, s[4:5], 0x2c -; GFX8GISEL-NEXT: s_bcnt1_i32_b64 s0, s[0:1] -; GFX8GISEL-NEXT: v_cvt_f32_i32_e32 v0, s0 -; GFX8GISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 -; GFX8GISEL-NEXT: s_waitcnt lgkmcnt(0) -; GFX8GISEL-NEXT: v_mul_f32_e64 v0, -s2, v0 -; GFX8GISEL-NEXT: v_readfirstlane_b32 s2, v0 -; GFX8GISEL-NEXT: v_mov_b32_e32 v0, s0 -; GFX8GISEL-NEXT: v_mov_b32_e32 v2, s2 -; GFX8GISEL-NEXT: v_mov_b32_e32 v1, s1 -; GFX8GISEL-NEXT: flat_store_dword v[0:1], v2 -; GFX8GISEL-NEXT: s_endpgm -; -; GFX9DAGISEL-LABEL: uniform_value_float: -; GFX9DAGISEL: ; %bb.0: ; %entry -; GFX9DAGISEL-NEXT: s_mov_b64 s[0:1], exec -; GFX9DAGISEL-NEXT: s_load_dword s2, s[4:5], 0x2c 
-; GFX9DAGISEL-NEXT: s_bcnt1_i32_b64 s0, s[0:1] -; GFX9DAGISEL-NEXT: v_cvt_f32_i32_e32 v0, s0 -; GFX9DAGISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 -; GFX9DAGISEL-NEXT: v_mov_b32_e32 v1, 0 -; GFX9DAGISEL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9DAGISEL-NEXT: v_mul_f32_e64 v0, -s2, v0 -; GFX9DAGISEL-NEXT: v_readfirstlane_b32 s2, v0 -; GFX9DAGISEL-NEXT: v_mov_b32_e32 v0, s2 -; GFX9DAGISEL-NEXT: global_store_dword v1, v0, s[0:1] -; GFX9DAGISEL-NEXT: s_endpgm -; -; GFX9GISEL-LABEL: uniform_value_float: -; GFX9GISEL: ; %bb.0: ; %entry -; GFX9GISEL-NEXT: s_mov_b64 s[0:1], exec -; GFX9GISEL-NEXT: s_load_dword s2, s[4:5], 0x2c -; GFX9GISEL-NEXT: s_bcnt1_i32_b64 s0, s[0:1] -; GFX9GISEL-NEXT: v_cvt_f32_i32_e32 v0, s0 -; GFX9GISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 -; GFX9GISEL-NEXT: v_mov_b32_e32 v1, 0 -; GFX9GISEL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9GISEL-NEXT: v_mul_f32_e64 v0, -s2, v0 -; GFX9GISEL-NEXT: v_readfirstlane_b32 s2, v0 -; GFX9GISEL-NEXT: v_mov_b32_e32 v0, s2 -; GFX9GISEL-NEXT: global_store_dword v1, v0, s[0:1] -; GFX9GISEL-NEXT: s_endpgm -; -; GFX1064DAGISEL-LABEL: uniform_value_float: -; GFX1064DAGISEL: ; %bb.0: ; %entry -; GFX1064DAGISEL-NEXT: s_load_dword s2, s[4:5], 0x2c -; GFX1064DAGISEL-NEXT: s_mov_b64 s[0:1], exec -; GFX1064DAGISEL-NEXT: s_bcnt1_i32_b64 s3, s[0:1] -; GFX1064DAGISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 -; GFX1064DAGISEL-NEXT: v_cvt_f32_i32_e32 v0, s3 -; GFX1064DAGISEL-NEXT: s_waitcnt lgkmcnt(0) -; GFX1064DAGISEL-NEXT: v_mul_f32_e64 v0, -s2, v0 -; GFX1064DAGISEL-NEXT: v_readfirstlane_b32 s2, v0 -; GFX1064DAGISEL-NEXT: v_mov_b32_e32 v0, 0 -; GFX1064DAGISEL-NEXT: v_mov_b32_e32 v1, s2 -; GFX1064DAGISEL-NEXT: global_store_dword v0, v1, s[0:1] -; GFX1064DAGISEL-NEXT: s_endpgm -; -; GFX1064GISEL-LABEL: uniform_value_float: -; GFX1064GISEL: ; %bb.0: ; %entry -; GFX1064GISEL-NEXT: s_load_dword s2, s[4:5], 0x2c -; GFX1064GISEL-NEXT: s_mov_b64 s[0:1], exec -; GFX1064GISEL-NEXT: v_mov_b32_e32 v1, 0 -; GFX1064GISEL-NEXT: s_bcnt1_i32_b64 s0, 
s[0:1] -; GFX1064GISEL-NEXT: v_cvt_f32_i32_e32 v0, s0 -; GFX1064GISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 -; GFX1064GISEL-NEXT: s_waitcnt lgkmcnt(0) -; GFX1064GISEL-NEXT: v_mul_f32_e64 v0, -s2, v0 -; GFX1064GISEL-NEXT: v_readfirstlane_b32 s2, v0 -; GFX1064GISEL-NEXT: v_mov_b32_e32 v0, s2 -; GFX1064GISEL-NEXT: global_store_dword v1, v0, s[0:1] -; GFX1064GISEL-NEXT: s_endpgm -; -; GFX1032DAGISEL-LABEL: uniform_value_float: -; GFX1032DAGISEL: ; %bb.0: ; %entry -; GFX1032DAGISEL-NEXT: s_load_dword s2, s[4:5], 0x2c -; GFX1032DAGISEL-NEXT: s_mov_b32 s0, exec_lo -; GFX1032DAGISEL-NEXT: s_bcnt1_i32_b32 s3, s0 -; GFX1032DAGISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 -; GFX1032DAGISEL-NEXT: v_cvt_f32_i32_e32 v0, s3 -; GFX1032DAGISEL-NEXT: s_waitcnt lgkmcnt(0) -; GFX1032DAGISEL-NEXT: v_mul_f32_e64 v0, -s2, v0 -; GFX1032DAGISEL-NEXT: v_readfirstlane_b32 s2, v0 -; GFX1032DAGISEL-NEXT: v_mov_b32_e32 v0, 0 -; GFX1032DAGISEL-NEXT: v_mov_b32_e32 v1, s2 -; GFX1032DAGISEL-NEXT: global_store_dword v0, v1, s[0:1] -; GFX1032DAGISEL-NEXT: s_endpgm -; -; GFX1032GISEL-LABEL: uniform_value_float: -; GFX1032GISEL: ; %bb.0: ; %entry -; GFX1032GISEL-NEXT: s_load_dword s2, s[4:5], 0x2c -; GFX1032GISEL-NEXT: s_mov_b32 s0, exec_lo -; GFX1032GISEL-NEXT: v_mov_b32_e32 v1, 0 -; GFX1032GISEL-NEXT: s_bcnt1_i32_b32 s0, s0 -; GFX1032GISEL-NEXT: v_cvt_f32_i32_e32 v0, s0 -; GFX1032GISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 -; GFX1032GISEL-NEXT: s_waitcnt lgkmcnt(0) -; GFX1032GISEL-NEXT: v_mul_f32_e64 v0, -s2, v0 -; GFX1032GISEL-NEXT: v_readfirstlane_b32 s2, v0 -; GFX1032GISEL-NEXT: v_mov_b32_e32 v0, s2 -; GFX1032GISEL-NEXT: global_store_dword v1, v0, s[0:1] -; GFX1032GISEL-NEXT: s_endpgm -; -; GFX1164DAGISEL-LABEL: uniform_value_float: -; GFX1164DAGISEL: ; %bb.0: ; %entry -; GFX1164DAGISEL-NEXT: s_load_b32 s2, s[4:5], 0x2c -; GFX1164DAGISEL-NEXT: s_mov_b64 s[0:1], exec -; GFX1164DAGISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_3) | instid1(VALU_DEP_1) -; GFX1164DAGISEL-NEXT: 
s_bcnt1_i32_b64 s3, s[0:1] -; GFX1164DAGISEL-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 -; GFX1164DAGISEL-NEXT: v_cvt_f32_i32_e32 v0, s3 -; GFX1164DAGISEL-NEXT: s_waitcnt lgkmcnt(0) -; GFX1164DAGISEL-NEXT: v_mul_f32_e64 v0, -s2, v0 -; GFX1164DAGISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) -; GFX1164DAGISEL-NEXT: v_readfirstlane_b32 s2, v0 -; GFX1164DAGISEL-NEXT: v_mov_b32_e32 v0, 0 -; GFX1164DAGISEL-NEXT: v_mov_b32_e32 v1, s2 -; GFX1164DAGISEL-NEXT: global_store_b32 v0, v1, s[0:1] -; GFX1164DAGISEL-NEXT: s_endpgm -; -; GFX1164GISEL-LABEL: uniform_value_float: -; GFX1164GISEL: ; %bb.0: ; %entry -; GFX1164GISEL-NEXT: s_load_b32 s2, s[4:5], 0x2c -; GFX1164GISEL-NEXT: s_mov_b64 s[0:1], exec -; GFX1164GISEL-NEXT: v_mov_b32_e32 v1, 0 -; GFX1164GISEL-NEXT: s_bcnt1_i32_b64 s0, s[0:1] -; GFX1164GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_2) | instid1(VALU_DEP_1) -; GFX1164GISEL-NEXT: v_cvt_f32_i32_e32 v0, s0 -; GFX1164GISEL-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 -; GFX1164GISEL-NEXT: s_waitcnt lgkmcnt(0) -; GFX1164GISEL-NEXT: v_mul_f32_e64 v0, -s2, v0 -; GFX1164GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1164GISEL-NEXT: v_readfirstlane_b32 s2, v0 -; GFX1164GISEL-NEXT: v_mov_b32_e32 v0, s2 -; GFX1164GISEL-NEXT: global_store_b32 v1, v0, s[0:1] -; GFX1164GISEL-NEXT: s_endpgm -; -; GFX1132DAGISEL-LABEL: uniform_value_float: -; GFX1132DAGISEL: ; %bb.0: ; %entry -; GFX1132DAGISEL-NEXT: s_load_b32 s2, s[4:5], 0x2c -; GFX1132DAGISEL-NEXT: s_mov_b32 s0, exec_lo -; GFX1132DAGISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_3) | instid1(VALU_DEP_1) -; GFX1132DAGISEL-NEXT: s_bcnt1_i32_b32 s3, s0 -; GFX1132DAGISEL-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 -; GFX1132DAGISEL-NEXT: v_cvt_f32_i32_e32 v0, s3 -; GFX1132DAGISEL-NEXT: s_waitcnt lgkmcnt(0) -; GFX1132DAGISEL-NEXT: v_mul_f32_e64 v0, -s2, v0 -; GFX1132DAGISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | 
instid1(VALU_DEP_1) -; GFX1132DAGISEL-NEXT: v_readfirstlane_b32 s2, v0 -; GFX1132DAGISEL-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2 -; GFX1132DAGISEL-NEXT: global_store_b32 v0, v1, s[0:1] -; GFX1132DAGISEL-NEXT: s_endpgm -; -; GFX1132GISEL-LABEL: uniform_value_float: -; GFX1132GISEL: ; %bb.0: ; %entry -; GFX1132GISEL-NEXT: s_load_b32 s2, s[4:5], 0x2c -; GFX1132GISEL-NEXT: s_mov_b32 s0, exec_lo -; GFX1132GISEL-NEXT: v_mov_b32_e32 v1, 0 -; GFX1132GISEL-NEXT: s_bcnt1_i32_b32 s0, s0 -; GFX1132GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_2) | instid1(VALU_DEP_1) -; GFX1132GISEL-NEXT: v_cvt_f32_i32_e32 v0, s0 -; GFX1132GISEL-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 -; GFX1132GISEL-NEXT: s_waitcnt lgkmcnt(0) -; GFX1132GISEL-NEXT: v_mul_f32_e64 v0, -s2, v0 -; GFX1132GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1132GISEL-NEXT: v_readfirstlane_b32 s2, v0 -; GFX1132GISEL-NEXT: v_mov_b32_e32 v0, s2 -; GFX1132GISEL-NEXT: global_store_b32 v1, v0, s[0:1] -; GFX1132GISEL-NEXT: s_endpgm -; -; GFX12DAGISEL-LABEL: uniform_value_float: -; GFX12DAGISEL: ; %bb.0: ; %entry -; GFX12DAGISEL-NEXT: s_load_b96 s[0:2], s[4:5], 0x24 -; GFX12DAGISEL-NEXT: s_mov_b32 s3, exec_lo -; GFX12DAGISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) -; GFX12DAGISEL-NEXT: s_bcnt1_i32_b32 s3, s3 -; GFX12DAGISEL-NEXT: v_cvt_f32_i32_e32 v0, s3 -; GFX12DAGISEL-NEXT: s_wait_kmcnt 0x0 -; GFX12DAGISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12DAGISEL-NEXT: v_mul_f32_e64 v0, -s2, v0 -; GFX12DAGISEL-NEXT: v_readfirstlane_b32 s2, v0 -; GFX12DAGISEL-NEXT: s_wait_alu depctr_va_sdst(0) -; GFX12DAGISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX12DAGISEL-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2 -; GFX12DAGISEL-NEXT: global_store_b32 v0, v1, s[0:1] -; GFX12DAGISEL-NEXT: s_endpgm -entry: - %result = call float @llvm.amdgcn.wave.reduce.fsub(float %in, i32 1) - store 
float %result, ptr addrspace(1) %out - ret void -} - -define void @divergent_value_float(ptr addrspace(1) %out, float %id.x) { -; GFX8DAGISEL-LABEL: divergent_value_float: -; GFX8DAGISEL: ; %bb.0: ; %entry -; GFX8DAGISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX8DAGISEL-NEXT: s_mov_b64 s[4:5], exec -; GFX8DAGISEL-NEXT: s_mov_b32 s6, 0 -; GFX8DAGISEL-NEXT: .LBB7_1: ; =>This Inner Loop Header: Depth=1 -; GFX8DAGISEL-NEXT: s_ff1_i32_b64 s7, s[4:5] -; GFX8DAGISEL-NEXT: v_readlane_b32 s8, v2, s7 -; GFX8DAGISEL-NEXT: v_mov_b32_e32 v3, s8 -; GFX8DAGISEL-NEXT: s_bitset0_b64 s[4:5], s7 -; GFX8DAGISEL-NEXT: v_sub_f32_e32 v3, s6, v3 -; GFX8DAGISEL-NEXT: s_cmp_lg_u64 s[4:5], 0 -; GFX8DAGISEL-NEXT: v_readfirstlane_b32 s6, v3 -; GFX8DAGISEL-NEXT: s_cbranch_scc1 .LBB7_1 -; GFX8DAGISEL-NEXT: ; %bb.2: -; GFX8DAGISEL-NEXT: v_mov_b32_e32 v2, s6 -; GFX8DAGISEL-NEXT: flat_store_dword v[0:1], v2 -; GFX8DAGISEL-NEXT: s_waitcnt vmcnt(0) -; GFX8DAGISEL-NEXT: s_setpc_b64 s[30:31] -; -; GFX8GISEL-LABEL: divergent_value_float: -; GFX8GISEL: ; %bb.0: ; %entry -; GFX8GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX8GISEL-NEXT: s_mov_b64 s[4:5], exec -; GFX8GISEL-NEXT: s_mov_b32 s6, 0 -; GFX8GISEL-NEXT: .LBB7_1: ; =>This Inner Loop Header: Depth=1 -; GFX8GISEL-NEXT: s_ff1_i32_b64 s7, s[4:5] -; GFX8GISEL-NEXT: v_readlane_b32 s8, v2, s7 -; GFX8GISEL-NEXT: v_mov_b32_e32 v3, s8 -; GFX8GISEL-NEXT: s_bitset0_b64 s[4:5], s7 -; GFX8GISEL-NEXT: v_sub_f32_e32 v3, s6, v3 -; GFX8GISEL-NEXT: s_cmp_lg_u64 s[4:5], 0 -; GFX8GISEL-NEXT: v_readfirstlane_b32 s6, v3 -; GFX8GISEL-NEXT: s_cbranch_scc1 .LBB7_1 -; GFX8GISEL-NEXT: ; %bb.2: -; GFX8GISEL-NEXT: v_mov_b32_e32 v2, s6 -; GFX8GISEL-NEXT: flat_store_dword v[0:1], v2 -; GFX8GISEL-NEXT: s_waitcnt vmcnt(0) -; GFX8GISEL-NEXT: s_setpc_b64 s[30:31] -; -; GFX9DAGISEL-LABEL: divergent_value_float: -; GFX9DAGISEL: ; %bb.0: ; %entry -; GFX9DAGISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9DAGISEL-NEXT: s_mov_b64 s[4:5], exec -; 
GFX9DAGISEL-NEXT: s_mov_b32 s6, 0 -; GFX9DAGISEL-NEXT: .LBB7_1: ; =>This Inner Loop Header: Depth=1 -; GFX9DAGISEL-NEXT: s_ff1_i32_b64 s7, s[4:5] -; GFX9DAGISEL-NEXT: v_readlane_b32 s8, v2, s7 -; GFX9DAGISEL-NEXT: v_mov_b32_e32 v3, s8 -; GFX9DAGISEL-NEXT: s_bitset0_b64 s[4:5], s7 -; GFX9DAGISEL-NEXT: v_sub_f32_e32 v3, s6, v3 -; GFX9DAGISEL-NEXT: s_cmp_lg_u64 s[4:5], 0 -; GFX9DAGISEL-NEXT: v_readfirstlane_b32 s6, v3 -; GFX9DAGISEL-NEXT: s_cbranch_scc1 .LBB7_1 -; GFX9DAGISEL-NEXT: ; %bb.2: -; GFX9DAGISEL-NEXT: v_mov_b32_e32 v2, s6 -; GFX9DAGISEL-NEXT: global_store_dword v[0:1], v2, off -; GFX9DAGISEL-NEXT: s_waitcnt vmcnt(0) -; GFX9DAGISEL-NEXT: s_setpc_b64 s[30:31] -; -; GFX9GISEL-LABEL: divergent_value_float: -; GFX9GISEL: ; %bb.0: ; %entry -; GFX9GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9GISEL-NEXT: s_mov_b64 s[4:5], exec -; GFX9GISEL-NEXT: s_mov_b32 s6, 0 -; GFX9GISEL-NEXT: .LBB7_1: ; =>This Inner Loop Header: Depth=1 -; GFX9GISEL-NEXT: s_ff1_i32_b64 s7, s[4:5] -; GFX9GISEL-NEXT: v_readlane_b32 s8, v2, s7 -; GFX9GISEL-NEXT: v_mov_b32_e32 v3, s8 -; GFX9GISEL-NEXT: s_bitset0_b64 s[4:5], s7 -; GFX9GISEL-NEXT: v_sub_f32_e32 v3, s6, v3 -; GFX9GISEL-NEXT: s_cmp_lg_u64 s[4:5], 0 -; GFX9GISEL-NEXT: v_readfirstlane_b32 s6, v3 -; GFX9GISEL-NEXT: s_cbranch_scc1 .LBB7_1 -; GFX9GISEL-NEXT: ; %bb.2: -; GFX9GISEL-NEXT: v_mov_b32_e32 v2, s6 -; GFX9GISEL-NEXT: global_store_dword v[0:1], v2, off -; GFX9GISEL-NEXT: s_waitcnt vmcnt(0) -; GFX9GISEL-NEXT: s_setpc_b64 s[30:31] -; -; GFX1064DAGISEL-LABEL: divergent_value_float: -; GFX1064DAGISEL: ; %bb.0: ; %entry -; GFX1064DAGISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX1064DAGISEL-NEXT: s_mov_b64 s[4:5], exec -; GFX1064DAGISEL-NEXT: s_mov_b32 s6, 0 -; GFX1064DAGISEL-NEXT: .LBB7_1: ; =>This Inner Loop Header: Depth=1 -; GFX1064DAGISEL-NEXT: s_ff1_i32_b64 s7, s[4:5] -; GFX1064DAGISEL-NEXT: v_readlane_b32 s8, v2, s7 -; GFX1064DAGISEL-NEXT: s_bitset0_b64 s[4:5], s7 -; GFX1064DAGISEL-NEXT: s_cmp_lg_u64 s[4:5], 
0 -; GFX1064DAGISEL-NEXT: v_sub_f32_e64 v3, s6, s8 -; GFX1064DAGISEL-NEXT: v_readfirstlane_b32 s6, v3 -; GFX1064DAGISEL-NEXT: s_cbranch_scc1 .LBB7_1 -; GFX1064DAGISEL-NEXT: ; %bb.2: -; GFX1064DAGISEL-NEXT: v_mov_b32_e32 v2, s6 -; GFX1064DAGISEL-NEXT: global_store_dword v[0:1], v2, off -; GFX1064DAGISEL-NEXT: s_setpc_b64 s[30:31] -; -; GFX1064GISEL-LABEL: divergent_value_float: -; GFX1064GISEL: ; %bb.0: ; %entry -; GFX1064GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX1064GISEL-NEXT: s_mov_b64 s[4:5], exec -; GFX1064GISEL-NEXT: s_mov_b32 s6, 0 -; GFX1064GISEL-NEXT: .LBB7_1: ; =>This Inner Loop Header: Depth=1 -; GFX1064GISEL-NEXT: s_ff1_i32_b64 s7, s[4:5] -; GFX1064GISEL-NEXT: v_readlane_b32 s8, v2, s7 -; GFX1064GISEL-NEXT: s_bitset0_b64 s[4:5], s7 -; GFX1064GISEL-NEXT: s_cmp_lg_u64 s[4:5], 0 -; GFX1064GISEL-NEXT: v_sub_f32_e64 v3, s6, s8 -; GFX1064GISEL-NEXT: v_readfirstlane_b32 s6, v3 -; GFX1064GISEL-NEXT: s_cbranch_scc1 .LBB7_1 -; GFX1064GISEL-NEXT: ; %bb.2: -; GFX1064GISEL-NEXT: v_mov_b32_e32 v2, s6 -; GFX1064GISEL-NEXT: global_store_dword v[0:1], v2, off -; GFX1064GISEL-NEXT: s_setpc_b64 s[30:31] -; -; GFX1032DAGISEL-LABEL: divergent_value_float: -; GFX1032DAGISEL: ; %bb.0: ; %entry -; GFX1032DAGISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX1032DAGISEL-NEXT: s_mov_b32 s4, exec_lo -; GFX1032DAGISEL-NEXT: s_mov_b32 s5, 0 -; GFX1032DAGISEL-NEXT: .LBB7_1: ; =>This Inner Loop Header: Depth=1 -; GFX1032DAGISEL-NEXT: s_ff1_i32_b32 s6, s4 -; GFX1032DAGISEL-NEXT: v_readlane_b32 s7, v2, s6 -; GFX1032DAGISEL-NEXT: s_bitset0_b32 s4, s6 -; GFX1032DAGISEL-NEXT: s_cmp_lg_u32 s4, 0 -; GFX1032DAGISEL-NEXT: v_sub_f32_e64 v3, s5, s7 -; GFX1032DAGISEL-NEXT: v_readfirstlane_b32 s5, v3 -; GFX1032DAGISEL-NEXT: s_cbranch_scc1 .LBB7_1 -; GFX1032DAGISEL-NEXT: ; %bb.2: -; GFX1032DAGISEL-NEXT: v_mov_b32_e32 v2, s5 -; GFX1032DAGISEL-NEXT: global_store_dword v[0:1], v2, off -; GFX1032DAGISEL-NEXT: s_setpc_b64 s[30:31] -; -; GFX1032GISEL-LABEL: divergent_value_float: -; 
GFX1032GISEL: ; %bb.0: ; %entry -; GFX1032GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX1032GISEL-NEXT: s_mov_b32 s4, exec_lo -; GFX1032GISEL-NEXT: s_mov_b32 s5, 0 -; GFX1032GISEL-NEXT: .LBB7_1: ; =>This Inner Loop Header: Depth=1 -; GFX1032GISEL-NEXT: s_ff1_i32_b32 s6, s4 -; GFX1032GISEL-NEXT: v_readlane_b32 s7, v2, s6 -; GFX1032GISEL-NEXT: s_bitset0_b32 s4, s6 -; GFX1032GISEL-NEXT: s_cmp_lg_u32 s4, 0 -; GFX1032GISEL-NEXT: v_sub_f32_e64 v3, s5, s7 -; GFX1032GISEL-NEXT: v_readfirstlane_b32 s5, v3 -; GFX1032GISEL-NEXT: s_cbranch_scc1 .LBB7_1 -; GFX1032GISEL-NEXT: ; %bb.2: -; GFX1032GISEL-NEXT: v_mov_b32_e32 v2, s5 -; GFX1032GISEL-NEXT: global_store_dword v[0:1], v2, off -; GFX1032GISEL-NEXT: s_setpc_b64 s[30:31] -; -; GFX1164DAGISEL-LABEL: divergent_value_float: -; GFX1164DAGISEL: ; %bb.0: ; %entry -; GFX1164DAGISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX1164DAGISEL-NEXT: s_mov_b64 s[0:1], exec -; GFX1164DAGISEL-NEXT: s_mov_b32 s2, 0 -; GFX1164DAGISEL-NEXT: .LBB7_1: ; =>This Inner Loop Header: Depth=1 -; GFX1164DAGISEL-NEXT: s_ctz_i32_b64 s3, s[0:1] -; GFX1164DAGISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) -; GFX1164DAGISEL-NEXT: v_readlane_b32 s4, v2, s3 -; GFX1164DAGISEL-NEXT: s_bitset0_b64 s[0:1], s3 -; GFX1164DAGISEL-NEXT: s_cmp_lg_u64 s[0:1], 0 -; GFX1164DAGISEL-NEXT: v_sub_f32_e64 v3, s2, s4 -; GFX1164DAGISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX1164DAGISEL-NEXT: v_readfirstlane_b32 s2, v3 -; GFX1164DAGISEL-NEXT: s_cbranch_scc1 .LBB7_1 -; GFX1164DAGISEL-NEXT: ; %bb.2: -; GFX1164DAGISEL-NEXT: v_mov_b32_e32 v2, s2 -; GFX1164DAGISEL-NEXT: global_store_b32 v[0:1], v2, off -; GFX1164DAGISEL-NEXT: s_setpc_b64 s[30:31] -; -; GFX1164GISEL-LABEL: divergent_value_float: -; GFX1164GISEL: ; %bb.0: ; %entry -; GFX1164GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX1164GISEL-NEXT: s_mov_b64 s[0:1], exec -; GFX1164GISEL-NEXT: s_mov_b32 s2, 0 -; GFX1164GISEL-NEXT: .LBB7_1: ; =>This Inner Loop 
Header: Depth=1 -; GFX1164GISEL-NEXT: s_ctz_i32_b64 s3, s[0:1] -; GFX1164GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) -; GFX1164GISEL-NEXT: v_readlane_b32 s4, v2, s3 -; GFX1164GISEL-NEXT: s_bitset0_b64 s[0:1], s3 -; GFX1164GISEL-NEXT: s_cmp_lg_u64 s[0:1], 0 -; GFX1164GISEL-NEXT: v_sub_f32_e64 v3, s2, s4 -; GFX1164GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX1164GISEL-NEXT: v_readfirstlane_b32 s2, v3 -; GFX1164GISEL-NEXT: s_cbranch_scc1 .LBB7_1 -; GFX1164GISEL-NEXT: ; %bb.2: -; GFX1164GISEL-NEXT: v_mov_b32_e32 v2, s2 -; GFX1164GISEL-NEXT: global_store_b32 v[0:1], v2, off -; GFX1164GISEL-NEXT: s_setpc_b64 s[30:31] -; -; GFX1132DAGISEL-LABEL: divergent_value_float: -; GFX1132DAGISEL: ; %bb.0: ; %entry -; GFX1132DAGISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX1132DAGISEL-NEXT: s_mov_b32 s0, exec_lo -; GFX1132DAGISEL-NEXT: s_mov_b32 s1, 0 -; GFX1132DAGISEL-NEXT: .LBB7_1: ; =>This Inner Loop Header: Depth=1 -; GFX1132DAGISEL-NEXT: s_ctz_i32_b32 s2, s0 -; GFX1132DAGISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) -; GFX1132DAGISEL-NEXT: v_readlane_b32 s3, v2, s2 -; GFX1132DAGISEL-NEXT: s_bitset0_b32 s0, s2 -; GFX1132DAGISEL-NEXT: s_cmp_lg_u32 s0, 0 -; GFX1132DAGISEL-NEXT: v_sub_f32_e64 v3, s1, s3 -; GFX1132DAGISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX1132DAGISEL-NEXT: v_readfirstlane_b32 s1, v3 -; GFX1132DAGISEL-NEXT: s_cbranch_scc1 .LBB7_1 -; GFX1132DAGISEL-NEXT: ; %bb.2: -; GFX1132DAGISEL-NEXT: v_mov_b32_e32 v2, s1 -; GFX1132DAGISEL-NEXT: global_store_b32 v[0:1], v2, off -; GFX1132DAGISEL-NEXT: s_setpc_b64 s[30:31] -; -; GFX1132GISEL-LABEL: divergent_value_float: -; GFX1132GISEL: ; %bb.0: ; %entry -; GFX1132GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX1132GISEL-NEXT: s_mov_b32 s0, exec_lo -; GFX1132GISEL-NEXT: s_mov_b32 s1, 0 -; GFX1132GISEL-NEXT: .LBB7_1: ; =>This Inner Loop Header: Depth=1 -; GFX1132GISEL-NEXT: s_ctz_i32_b32 s2, s0 -; 
GFX1132GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) -; GFX1132GISEL-NEXT: v_readlane_b32 s3, v2, s2 -; GFX1132GISEL-NEXT: s_bitset0_b32 s0, s2 -; GFX1132GISEL-NEXT: s_cmp_lg_u32 s0, 0 -; GFX1132GISEL-NEXT: v_sub_f32_e64 v3, s1, s3 -; GFX1132GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX1132GISEL-NEXT: v_readfirstlane_b32 s1, v3 -; GFX1132GISEL-NEXT: s_cbranch_scc1 .LBB7_1 -; GFX1132GISEL-NEXT: ; %bb.2: -; GFX1132GISEL-NEXT: v_mov_b32_e32 v2, s1 -; GFX1132GISEL-NEXT: global_store_b32 v[0:1], v2, off -; GFX1132GISEL-NEXT: s_setpc_b64 s[30:31] -; -; GFX12DAGISEL-LABEL: divergent_value_float: -; GFX12DAGISEL: ; %bb.0: ; %entry -; GFX12DAGISEL-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX12DAGISEL-NEXT: s_wait_expcnt 0x0 -; GFX12DAGISEL-NEXT: s_wait_samplecnt 0x0 -; GFX12DAGISEL-NEXT: s_wait_bvhcnt 0x0 -; GFX12DAGISEL-NEXT: s_wait_kmcnt 0x0 -; GFX12DAGISEL-NEXT: s_mov_b32 s0, exec_lo -; GFX12DAGISEL-NEXT: s_mov_b32 s1, 0 -; GFX12DAGISEL-NEXT: .LBB7_1: ; =>This Inner Loop Header: Depth=1 -; GFX12DAGISEL-NEXT: s_wait_alu depctr_sa_sdst(0) -; GFX12DAGISEL-NEXT: s_ctz_i32_b32 s2, s0 -; GFX12DAGISEL-NEXT: s_wait_alu depctr_sa_sdst(0) -; GFX12DAGISEL-NEXT: v_readlane_b32 s3, v2, s2 -; GFX12DAGISEL-NEXT: s_bitset0_b32 s0, s2 -; GFX12DAGISEL-NEXT: s_wait_alu depctr_sa_sdst(0) -; GFX12DAGISEL-NEXT: s_cmp_lg_u32 s0, 0 -; GFX12DAGISEL-NEXT: v_sub_f32_e64 v3, s1, s3 -; GFX12DAGISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX12DAGISEL-NEXT: v_readfirstlane_b32 s1, v3 -; GFX12DAGISEL-NEXT: s_cbranch_scc1 .LBB7_1 -; GFX12DAGISEL-NEXT: ; %bb.2: -; GFX12DAGISEL-NEXT: s_wait_alu depctr_va_sdst(0) -; GFX12DAGISEL-NEXT: v_mov_b32_e32 v2, s1 -; GFX12DAGISEL-NEXT: global_store_b32 v[0:1], v2, off -; GFX12DAGISEL-NEXT: s_setpc_b64 s[30:31] -entry: - %result = call float @llvm.amdgcn.wave.reduce.fsub(float %id.x, i32 1) - store float %result, ptr addrspace(1) %out - ret void -} - -define amdgpu_kernel void @divergent_cfg_float(ptr addrspace(1) %out, 
float %in, float %in2) { -; GFX8DAGISEL-LABEL: divergent_cfg_float: -; GFX8DAGISEL: ; %bb.0: ; %entry -; GFX8DAGISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x2c -; GFX8DAGISEL-NEXT: v_cmp_lt_u32_e32 vcc, 15, v0 -; GFX8DAGISEL-NEXT: ; implicit-def: $sgpr6 -; GFX8DAGISEL-NEXT: s_and_saveexec_b64 s[2:3], vcc -; GFX8DAGISEL-NEXT: s_xor_b64 s[2:3], exec, s[2:3] -; GFX8DAGISEL-NEXT: s_cbranch_execz .LBB8_2 -; GFX8DAGISEL-NEXT: ; %bb.1: ; %else -; GFX8DAGISEL-NEXT: s_mov_b64 s[6:7], exec -; GFX8DAGISEL-NEXT: s_bcnt1_i32_b64 s6, s[6:7] -; GFX8DAGISEL-NEXT: v_cvt_f32_i32_e32 v0, s6 -; GFX8DAGISEL-NEXT: s_waitcnt lgkmcnt(0) -; GFX8DAGISEL-NEXT: v_mul_f32_e64 v0, -s0, v0 -; GFX8DAGISEL-NEXT: v_readfirstlane_b32 s6, v0 -; GFX8DAGISEL-NEXT: .LBB8_2: ; %Flow -; GFX8DAGISEL-NEXT: s_or_saveexec_b64 s[2:3], s[2:3] -; GFX8DAGISEL-NEXT: v_mov_b32_e32 v0, s6 -; GFX8DAGISEL-NEXT: s_xor_b64 exec, exec, s[2:3] -; GFX8DAGISEL-NEXT: s_cbranch_execz .LBB8_4 -; GFX8DAGISEL-NEXT: ; %bb.3: ; %if -; GFX8DAGISEL-NEXT: s_mov_b64 s[6:7], exec -; GFX8DAGISEL-NEXT: s_waitcnt lgkmcnt(0) -; GFX8DAGISEL-NEXT: s_bcnt1_i32_b64 s0, s[6:7] -; GFX8DAGISEL-NEXT: v_cvt_f32_i32_e32 v0, s0 -; GFX8DAGISEL-NEXT: v_mul_f32_e64 v0, -s1, v0 -; GFX8DAGISEL-NEXT: v_readfirstlane_b32 s0, v0 -; GFX8DAGISEL-NEXT: v_mov_b32_e32 v0, s0 -; GFX8DAGISEL-NEXT: .LBB8_4: ; %endif -; GFX8DAGISEL-NEXT: s_or_b64 exec, exec, s[2:3] -; GFX8DAGISEL-NEXT: s_waitcnt lgkmcnt(0) -; GFX8DAGISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 -; GFX8DAGISEL-NEXT: s_waitcnt lgkmcnt(0) -; GFX8DAGISEL-NEXT: v_mov_b32_e32 v2, s1 -; GFX8DAGISEL-NEXT: v_mov_b32_e32 v1, s0 -; GFX8DAGISEL-NEXT: flat_store_dword v[1:2], v0 -; GFX8DAGISEL-NEXT: s_endpgm -; -; GFX8GISEL-LABEL: divergent_cfg_float: -; GFX8GISEL: ; %bb.0: ; %entry -; GFX8GISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x2c -; GFX8GISEL-NEXT: v_cmp_le_u32_e32 vcc, 16, v0 -; GFX8GISEL-NEXT: ; implicit-def: $sgpr6 -; GFX8GISEL-NEXT: s_and_saveexec_b64 s[2:3], vcc -; GFX8GISEL-NEXT: s_xor_b64 s[2:3], 
exec, s[2:3] -; GFX8GISEL-NEXT: s_cbranch_execz .LBB8_2 -; GFX8GISEL-NEXT: ; %bb.1: ; %else -; GFX8GISEL-NEXT: s_mov_b64 s[6:7], exec -; GFX8GISEL-NEXT: s_bcnt1_i32_b64 s6, s[6:7] -; GFX8GISEL-NEXT: v_cvt_f32_i32_e32 v0, s6 -; GFX8GISEL-NEXT: s_waitcnt lgkmcnt(0) -; GFX8GISEL-NEXT: v_mul_f32_e64 v0, -s0, v0 -; GFX8GISEL-NEXT: v_readfirstlane_b32 s6, v0 -; GFX8GISEL-NEXT: .LBB8_2: ; %Flow -; GFX8GISEL-NEXT: s_andn2_saveexec_b64 s[2:3], s[2:3] -; GFX8GISEL-NEXT: s_cbranch_execz .LBB8_4 -; GFX8GISEL-NEXT: ; %bb.3: ; %if -; GFX8GISEL-NEXT: s_mov_b64 s[6:7], exec -; GFX8GISEL-NEXT: s_waitcnt lgkmcnt(0) -; GFX8GISEL-NEXT: s_bcnt1_i32_b64 s0, s[6:7] -; GFX8GISEL-NEXT: v_cvt_f32_i32_e32 v0, s0 -; GFX8GISEL-NEXT: v_mul_f32_e64 v0, -s1, v0 -; GFX8GISEL-NEXT: v_readfirstlane_b32 s6, v0 -; GFX8GISEL-NEXT: .LBB8_4: ; %endif -; GFX8GISEL-NEXT: s_or_b64 exec, exec, s[2:3] -; GFX8GISEL-NEXT: s_waitcnt lgkmcnt(0) -; GFX8GISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 -; GFX8GISEL-NEXT: v_mov_b32_e32 v2, s6 -; GFX8GISEL-NEXT: s_waitcnt lgkmcnt(0) -; GFX8GISEL-NEXT: v_mov_b32_e32 v0, s0 -; GFX8GISEL-NEXT: v_mov_b32_e32 v1, s1 -; GFX8GISEL-NEXT: flat_store_dword v[0:1], v2 -; GFX8GISEL-NEXT: s_endpgm -; -; GFX9DAGISEL-LABEL: divergent_cfg_float: -; GFX9DAGISEL: ; %bb.0: ; %entry -; GFX9DAGISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x2c -; GFX9DAGISEL-NEXT: v_cmp_lt_u32_e32 vcc, 15, v0 -; GFX9DAGISEL-NEXT: ; implicit-def: $sgpr6 -; GFX9DAGISEL-NEXT: s_and_saveexec_b64 s[2:3], vcc -; GFX9DAGISEL-NEXT: s_xor_b64 s[2:3], exec, s[2:3] -; GFX9DAGISEL-NEXT: s_cbranch_execz .LBB8_2 -; GFX9DAGISEL-NEXT: ; %bb.1: ; %else -; GFX9DAGISEL-NEXT: s_mov_b64 s[6:7], exec -; GFX9DAGISEL-NEXT: s_bcnt1_i32_b64 s6, s[6:7] -; GFX9DAGISEL-NEXT: v_cvt_f32_i32_e32 v0, s6 -; GFX9DAGISEL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9DAGISEL-NEXT: v_mul_f32_e64 v0, -s0, v0 -; GFX9DAGISEL-NEXT: v_readfirstlane_b32 s6, v0 -; GFX9DAGISEL-NEXT: .LBB8_2: ; %Flow -; GFX9DAGISEL-NEXT: s_or_saveexec_b64 s[2:3], s[2:3] -; 
GFX9DAGISEL-NEXT: v_mov_b32_e32 v0, s6 -; GFX9DAGISEL-NEXT: s_xor_b64 exec, exec, s[2:3] -; GFX9DAGISEL-NEXT: s_cbranch_execz .LBB8_4 -; GFX9DAGISEL-NEXT: ; %bb.3: ; %if -; GFX9DAGISEL-NEXT: s_mov_b64 s[6:7], exec -; GFX9DAGISEL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9DAGISEL-NEXT: s_bcnt1_i32_b64 s0, s[6:7] -; GFX9DAGISEL-NEXT: v_cvt_f32_i32_e32 v0, s0 -; GFX9DAGISEL-NEXT: v_mul_f32_e64 v0, -s1, v0 -; GFX9DAGISEL-NEXT: v_readfirstlane_b32 s0, v0 -; GFX9DAGISEL-NEXT: v_mov_b32_e32 v0, s0 -; GFX9DAGISEL-NEXT: .LBB8_4: ; %endif -; GFX9DAGISEL-NEXT: s_or_b64 exec, exec, s[2:3] -; GFX9DAGISEL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9DAGISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 -; GFX9DAGISEL-NEXT: v_mov_b32_e32 v1, 0 -; GFX9DAGISEL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9DAGISEL-NEXT: global_store_dword v1, v0, s[0:1] -; GFX9DAGISEL-NEXT: s_endpgm -; -; GFX9GISEL-LABEL: divergent_cfg_float: -; GFX9GISEL: ; %bb.0: ; %entry -; GFX9GISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x2c -; GFX9GISEL-NEXT: v_cmp_le_u32_e32 vcc, 16, v0 -; GFX9GISEL-NEXT: ; implicit-def: $sgpr6 -; GFX9GISEL-NEXT: s_and_saveexec_b64 s[2:3], vcc -; GFX9GISEL-NEXT: s_xor_b64 s[2:3], exec, s[2:3] -; GFX9GISEL-NEXT: s_cbranch_execz .LBB8_2 -; GFX9GISEL-NEXT: ; %bb.1: ; %else -; GFX9GISEL-NEXT: s_mov_b64 s[6:7], exec -; GFX9GISEL-NEXT: s_bcnt1_i32_b64 s6, s[6:7] -; GFX9GISEL-NEXT: v_cvt_f32_i32_e32 v0, s6 -; GFX9GISEL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9GISEL-NEXT: v_mul_f32_e64 v0, -s0, v0 -; GFX9GISEL-NEXT: v_readfirstlane_b32 s6, v0 -; GFX9GISEL-NEXT: .LBB8_2: ; %Flow -; GFX9GISEL-NEXT: s_andn2_saveexec_b64 s[2:3], s[2:3] -; GFX9GISEL-NEXT: s_cbranch_execz .LBB8_4 -; GFX9GISEL-NEXT: ; %bb.3: ; %if -; GFX9GISEL-NEXT: s_mov_b64 s[6:7], exec -; GFX9GISEL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9GISEL-NEXT: s_bcnt1_i32_b64 s0, s[6:7] -; GFX9GISEL-NEXT: v_cvt_f32_i32_e32 v0, s0 -; GFX9GISEL-NEXT: v_mul_f32_e64 v0, -s1, v0 -; GFX9GISEL-NEXT: v_readfirstlane_b32 s6, v0 -; GFX9GISEL-NEXT: .LBB8_4: ; %endif -; GFX9GISEL-NEXT: 
s_or_b64 exec, exec, s[2:3] -; GFX9GISEL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9GISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 -; GFX9GISEL-NEXT: v_mov_b32_e32 v0, s6 -; GFX9GISEL-NEXT: v_mov_b32_e32 v1, 0 -; GFX9GISEL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9GISEL-NEXT: global_store_dword v1, v0, s[0:1] -; GFX9GISEL-NEXT: s_endpgm -; -; GFX1064DAGISEL-LABEL: divergent_cfg_float: -; GFX1064DAGISEL: ; %bb.0: ; %entry -; GFX1064DAGISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x2c -; GFX1064DAGISEL-NEXT: v_cmp_lt_u32_e32 vcc, 15, v0 -; GFX1064DAGISEL-NEXT: ; implicit-def: $sgpr6 -; GFX1064DAGISEL-NEXT: s_and_saveexec_b64 s[2:3], vcc -; GFX1064DAGISEL-NEXT: s_xor_b64 s[2:3], exec, s[2:3] -; GFX1064DAGISEL-NEXT: s_cbranch_execz .LBB8_2 -; GFX1064DAGISEL-NEXT: ; %bb.1: ; %else -; GFX1064DAGISEL-NEXT: s_mov_b64 s[6:7], exec -; GFX1064DAGISEL-NEXT: s_bcnt1_i32_b64 s6, s[6:7] -; GFX1064DAGISEL-NEXT: v_cvt_f32_i32_e32 v0, s6 -; GFX1064DAGISEL-NEXT: s_waitcnt lgkmcnt(0) -; GFX1064DAGISEL-NEXT: v_mul_f32_e64 v0, -s0, v0 -; GFX1064DAGISEL-NEXT: v_readfirstlane_b32 s6, v0 -; GFX1064DAGISEL-NEXT: .LBB8_2: ; %Flow -; GFX1064DAGISEL-NEXT: s_or_saveexec_b64 s[2:3], s[2:3] -; GFX1064DAGISEL-NEXT: v_mov_b32_e32 v0, s6 -; GFX1064DAGISEL-NEXT: s_xor_b64 exec, exec, s[2:3] -; GFX1064DAGISEL-NEXT: s_cbranch_execz .LBB8_4 -; GFX1064DAGISEL-NEXT: ; %bb.3: ; %if -; GFX1064DAGISEL-NEXT: s_mov_b64 s[6:7], exec -; GFX1064DAGISEL-NEXT: s_waitcnt lgkmcnt(0) -; GFX1064DAGISEL-NEXT: s_bcnt1_i32_b64 s0, s[6:7] -; GFX1064DAGISEL-NEXT: v_cvt_f32_i32_e32 v0, s0 -; GFX1064DAGISEL-NEXT: v_mul_f32_e64 v0, -s1, v0 -; GFX1064DAGISEL-NEXT: v_readfirstlane_b32 s0, v0 -; GFX1064DAGISEL-NEXT: v_mov_b32_e32 v0, s0 -; GFX1064DAGISEL-NEXT: .LBB8_4: ; %endif -; GFX1064DAGISEL-NEXT: s_or_b64 exec, exec, s[2:3] -; GFX1064DAGISEL-NEXT: s_waitcnt lgkmcnt(0) -; GFX1064DAGISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 -; GFX1064DAGISEL-NEXT: v_mov_b32_e32 v1, 0 -; GFX1064DAGISEL-NEXT: s_waitcnt lgkmcnt(0) -; GFX1064DAGISEL-NEXT: 
global_store_dword v1, v0, s[0:1] -; GFX1064DAGISEL-NEXT: s_endpgm -; -; GFX1064GISEL-LABEL: divergent_cfg_float: -; GFX1064GISEL: ; %bb.0: ; %entry -; GFX1064GISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x2c -; GFX1064GISEL-NEXT: v_cmp_le_u32_e32 vcc, 16, v0 -; GFX1064GISEL-NEXT: ; implicit-def: $sgpr6 -; GFX1064GISEL-NEXT: s_and_saveexec_b64 s[2:3], vcc -; GFX1064GISEL-NEXT: s_xor_b64 s[2:3], exec, s[2:3] -; GFX1064GISEL-NEXT: s_cbranch_execz .LBB8_2 -; GFX1064GISEL-NEXT: ; %bb.1: ; %else -; GFX1064GISEL-NEXT: s_mov_b64 s[6:7], exec -; GFX1064GISEL-NEXT: s_bcnt1_i32_b64 s6, s[6:7] -; GFX1064GISEL-NEXT: v_cvt_f32_i32_e32 v0, s6 -; GFX1064GISEL-NEXT: s_waitcnt lgkmcnt(0) -; GFX1064GISEL-NEXT: v_mul_f32_e64 v0, -s0, v0 -; GFX1064GISEL-NEXT: v_readfirstlane_b32 s6, v0 -; GFX1064GISEL-NEXT: .LBB8_2: ; %Flow -; GFX1064GISEL-NEXT: s_andn2_saveexec_b64 s[2:3], s[2:3] -; GFX1064GISEL-NEXT: s_cbranch_execz .LBB8_4 -; GFX1064GISEL-NEXT: ; %bb.3: ; %if -; GFX1064GISEL-NEXT: s_mov_b64 s[6:7], exec -; GFX1064GISEL-NEXT: s_waitcnt lgkmcnt(0) -; GFX1064GISEL-NEXT: s_bcnt1_i32_b64 s0, s[6:7] -; GFX1064GISEL-NEXT: v_cvt_f32_i32_e32 v0, s0 -; GFX1064GISEL-NEXT: v_mul_f32_e64 v0, -s1, v0 -; GFX1064GISEL-NEXT: v_readfirstlane_b32 s6, v0 -; GFX1064GISEL-NEXT: .LBB8_4: ; %endif -; GFX1064GISEL-NEXT: s_or_b64 exec, exec, s[2:3] -; GFX1064GISEL-NEXT: s_waitcnt lgkmcnt(0) -; GFX1064GISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 -; GFX1064GISEL-NEXT: v_mov_b32_e32 v0, s6 -; GFX1064GISEL-NEXT: v_mov_b32_e32 v1, 0 -; GFX1064GISEL-NEXT: s_waitcnt lgkmcnt(0) -; GFX1064GISEL-NEXT: global_store_dword v1, v0, s[0:1] -; GFX1064GISEL-NEXT: s_endpgm -; -; GFX1032DAGISEL-LABEL: divergent_cfg_float: -; GFX1032DAGISEL: ; %bb.0: ; %entry -; GFX1032DAGISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x2c -; GFX1032DAGISEL-NEXT: v_cmp_lt_u32_e32 vcc_lo, 15, v0 -; GFX1032DAGISEL-NEXT: ; implicit-def: $sgpr3 -; GFX1032DAGISEL-NEXT: s_and_saveexec_b32 s2, vcc_lo -; GFX1032DAGISEL-NEXT: s_xor_b32 s2, exec_lo, s2 -; 
GFX1032DAGISEL-NEXT: s_cbranch_execz .LBB8_2 -; GFX1032DAGISEL-NEXT: ; %bb.1: ; %else -; GFX1032DAGISEL-NEXT: s_mov_b32 s3, exec_lo -; GFX1032DAGISEL-NEXT: s_bcnt1_i32_b32 s3, s3 -; GFX1032DAGISEL-NEXT: v_cvt_f32_i32_e32 v0, s3 -; GFX1032DAGISEL-NEXT: s_waitcnt lgkmcnt(0) -; GFX1032DAGISEL-NEXT: v_mul_f32_e64 v0, -s0, v0 -; GFX1032DAGISEL-NEXT: v_readfirstlane_b32 s3, v0 -; GFX1032DAGISEL-NEXT: .LBB8_2: ; %Flow -; GFX1032DAGISEL-NEXT: s_waitcnt lgkmcnt(0) -; GFX1032DAGISEL-NEXT: s_or_saveexec_b32 s0, s2 -; GFX1032DAGISEL-NEXT: v_mov_b32_e32 v0, s3 -; GFX1032DAGISEL-NEXT: s_xor_b32 exec_lo, exec_lo, s0 -; GFX1032DAGISEL-NEXT: s_cbranch_execz .LBB8_4 -; GFX1032DAGISEL-NEXT: ; %bb.3: ; %if -; GFX1032DAGISEL-NEXT: s_mov_b32 s2, exec_lo -; GFX1032DAGISEL-NEXT: s_bcnt1_i32_b32 s2, s2 -; GFX1032DAGISEL-NEXT: v_cvt_f32_i32_e32 v0, s2 -; GFX1032DAGISEL-NEXT: v_mul_f32_e64 v0, -s1, v0 -; GFX1032DAGISEL-NEXT: v_readfirstlane_b32 s1, v0 -; GFX1032DAGISEL-NEXT: v_mov_b32_e32 v0, s1 -; GFX1032DAGISEL-NEXT: .LBB8_4: ; %endif -; GFX1032DAGISEL-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; GFX1032DAGISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 -; GFX1032DAGISEL-NEXT: v_mov_b32_e32 v1, 0 -; GFX1032DAGISEL-NEXT: s_waitcnt lgkmcnt(0) -; GFX1032DAGISEL-NEXT: global_store_dword v1, v0, s[0:1] -; GFX1032DAGISEL-NEXT: s_endpgm -; -; GFX1032GISEL-LABEL: divergent_cfg_float: -; GFX1032GISEL: ; %bb.0: ; %entry -; GFX1032GISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x2c -; GFX1032GISEL-NEXT: v_cmp_le_u32_e32 vcc_lo, 16, v0 -; GFX1032GISEL-NEXT: ; implicit-def: $sgpr2 -; GFX1032GISEL-NEXT: s_and_saveexec_b32 s3, vcc_lo -; GFX1032GISEL-NEXT: s_xor_b32 s3, exec_lo, s3 -; GFX1032GISEL-NEXT: s_cbranch_execz .LBB8_2 -; GFX1032GISEL-NEXT: ; %bb.1: ; %else -; GFX1032GISEL-NEXT: s_mov_b32 s2, exec_lo -; GFX1032GISEL-NEXT: s_bcnt1_i32_b32 s2, s2 -; GFX1032GISEL-NEXT: v_cvt_f32_i32_e32 v0, s2 -; GFX1032GISEL-NEXT: s_waitcnt lgkmcnt(0) -; GFX1032GISEL-NEXT: v_mul_f32_e64 v0, -s0, v0 -; GFX1032GISEL-NEXT: 
v_readfirstlane_b32 s2, v0 -; GFX1032GISEL-NEXT: .LBB8_2: ; %Flow -; GFX1032GISEL-NEXT: s_waitcnt lgkmcnt(0) -; GFX1032GISEL-NEXT: s_andn2_saveexec_b32 s0, s3 -; GFX1032GISEL-NEXT: s_cbranch_execz .LBB8_4 -; GFX1032GISEL-NEXT: ; %bb.3: ; %if -; GFX1032GISEL-NEXT: s_mov_b32 s2, exec_lo -; GFX1032GISEL-NEXT: s_bcnt1_i32_b32 s2, s2 -; GFX1032GISEL-NEXT: v_cvt_f32_i32_e32 v0, s2 -; GFX1032GISEL-NEXT: v_mul_f32_e64 v0, -s1, v0 -; GFX1032GISEL-NEXT: v_readfirstlane_b32 s2, v0 -; GFX1032GISEL-NEXT: .LBB8_4: ; %endif -; GFX1032GISEL-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; GFX1032GISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 -; GFX1032GISEL-NEXT: v_mov_b32_e32 v0, s2 -; GFX1032GISEL-NEXT: v_mov_b32_e32 v1, 0 -; GFX1032GISEL-NEXT: s_waitcnt lgkmcnt(0) -; GFX1032GISEL-NEXT: global_store_dword v1, v0, s[0:1] -; GFX1032GISEL-NEXT: s_endpgm -; -; GFX1164DAGISEL-LABEL: divergent_cfg_float: -; GFX1164DAGISEL: ; %bb.0: ; %entry -; GFX1164DAGISEL-NEXT: s_load_b64 s[0:1], s[4:5], 0x2c -; GFX1164DAGISEL-NEXT: v_and_b32_e32 v0, 0x3ff, v0 -; GFX1164DAGISEL-NEXT: s_mov_b64 s[2:3], exec -; GFX1164DAGISEL-NEXT: ; implicit-def: $sgpr6 -; GFX1164DAGISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX1164DAGISEL-NEXT: v_cmpx_lt_u32_e32 15, v0 -; GFX1164DAGISEL-NEXT: s_xor_b64 s[2:3], exec, s[2:3] -; GFX1164DAGISEL-NEXT: s_cbranch_execz .LBB8_2 -; GFX1164DAGISEL-NEXT: ; %bb.1: ; %else -; GFX1164DAGISEL-NEXT: s_mov_b64 s[6:7], exec -; GFX1164DAGISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) -; GFX1164DAGISEL-NEXT: s_bcnt1_i32_b64 s6, s[6:7] -; GFX1164DAGISEL-NEXT: v_cvt_f32_i32_e32 v0, s6 -; GFX1164DAGISEL-NEXT: s_waitcnt lgkmcnt(0) -; GFX1164DAGISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1164DAGISEL-NEXT: v_mul_f32_e64 v0, -s0, v0 -; GFX1164DAGISEL-NEXT: v_readfirstlane_b32 s6, v0 -; GFX1164DAGISEL-NEXT: .LBB8_2: ; %Flow -; GFX1164DAGISEL-NEXT: s_or_saveexec_b64 s[2:3], s[2:3] -; GFX1164DAGISEL-NEXT: s_delay_alu 
instid0(VALU_DEP_1) -; GFX1164DAGISEL-NEXT: v_mov_b32_e32 v0, s6 -; GFX1164DAGISEL-NEXT: s_xor_b64 exec, exec, s[2:3] -; GFX1164DAGISEL-NEXT: s_cbranch_execz .LBB8_4 -; GFX1164DAGISEL-NEXT: ; %bb.3: ; %if -; GFX1164DAGISEL-NEXT: s_mov_b64 s[6:7], exec -; GFX1164DAGISEL-NEXT: s_waitcnt lgkmcnt(0) -; GFX1164DAGISEL-NEXT: s_bcnt1_i32_b64 s0, s[6:7] -; GFX1164DAGISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1164DAGISEL-NEXT: v_cvt_f32_i32_e32 v0, s0 -; GFX1164DAGISEL-NEXT: v_mul_f32_e64 v0, -s1, v0 -; GFX1164DAGISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1164DAGISEL-NEXT: v_readfirstlane_b32 s0, v0 -; GFX1164DAGISEL-NEXT: v_mov_b32_e32 v0, s0 -; GFX1164DAGISEL-NEXT: .LBB8_4: ; %endif -; GFX1164DAGISEL-NEXT: s_or_b64 exec, exec, s[2:3] -; GFX1164DAGISEL-NEXT: s_waitcnt lgkmcnt(0) -; GFX1164DAGISEL-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 -; GFX1164DAGISEL-NEXT: v_mov_b32_e32 v1, 0 -; GFX1164DAGISEL-NEXT: s_waitcnt lgkmcnt(0) -; GFX1164DAGISEL-NEXT: global_store_b32 v1, v0, s[0:1] -; GFX1164DAGISEL-NEXT: s_endpgm -; -; GFX1164GISEL-LABEL: divergent_cfg_float: -; GFX1164GISEL: ; %bb.0: ; %entry -; GFX1164GISEL-NEXT: s_load_b64 s[0:1], s[4:5], 0x2c -; GFX1164GISEL-NEXT: v_and_b32_e32 v0, 0x3ff, v0 -; GFX1164GISEL-NEXT: s_mov_b64 s[2:3], exec -; GFX1164GISEL-NEXT: ; implicit-def: $sgpr6 -; GFX1164GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX1164GISEL-NEXT: v_cmpx_le_u32_e32 16, v0 -; GFX1164GISEL-NEXT: s_xor_b64 s[2:3], exec, s[2:3] -; GFX1164GISEL-NEXT: s_cbranch_execz .LBB8_2 -; GFX1164GISEL-NEXT: ; %bb.1: ; %else -; GFX1164GISEL-NEXT: s_mov_b64 s[6:7], exec -; GFX1164GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) -; GFX1164GISEL-NEXT: s_bcnt1_i32_b64 s6, s[6:7] -; GFX1164GISEL-NEXT: v_cvt_f32_i32_e32 v0, s6 -; GFX1164GISEL-NEXT: s_waitcnt lgkmcnt(0) -; GFX1164GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; 
GFX1164GISEL-NEXT: v_mul_f32_e64 v0, -s0, v0 -; GFX1164GISEL-NEXT: v_readfirstlane_b32 s6, v0 -; GFX1164GISEL-NEXT: .LBB8_2: ; %Flow -; GFX1164GISEL-NEXT: s_and_not1_saveexec_b64 s[2:3], s[2:3] -; GFX1164GISEL-NEXT: s_cbranch_execz .LBB8_4 -; GFX1164GISEL-NEXT: ; %bb.3: ; %if -; GFX1164GISEL-NEXT: s_mov_b64 s[6:7], exec -; GFX1164GISEL-NEXT: s_waitcnt lgkmcnt(0) -; GFX1164GISEL-NEXT: s_bcnt1_i32_b64 s0, s[6:7] -; GFX1164GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1164GISEL-NEXT: v_cvt_f32_i32_e32 v0, s0 -; GFX1164GISEL-NEXT: v_mul_f32_e64 v0, -s1, v0 -; GFX1164GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX1164GISEL-NEXT: v_readfirstlane_b32 s6, v0 -; GFX1164GISEL-NEXT: .LBB8_4: ; %endif -; GFX1164GISEL-NEXT: s_or_b64 exec, exec, s[2:3] -; GFX1164GISEL-NEXT: s_waitcnt lgkmcnt(0) -; GFX1164GISEL-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 -; GFX1164GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX1164GISEL-NEXT: v_mov_b32_e32 v0, s6 -; GFX1164GISEL-NEXT: v_mov_b32_e32 v1, 0 -; GFX1164GISEL-NEXT: s_waitcnt lgkmcnt(0) -; GFX1164GISEL-NEXT: global_store_b32 v1, v0, s[0:1] -; GFX1164GISEL-NEXT: s_endpgm -; -; GFX1132DAGISEL-LABEL: divergent_cfg_float: -; GFX1132DAGISEL: ; %bb.0: ; %entry -; GFX1132DAGISEL-NEXT: s_load_b64 s[0:1], s[4:5], 0x2c -; GFX1132DAGISEL-NEXT: v_and_b32_e32 v0, 0x3ff, v0 -; GFX1132DAGISEL-NEXT: s_mov_b32 s2, exec_lo -; GFX1132DAGISEL-NEXT: ; implicit-def: $sgpr3 -; GFX1132DAGISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX1132DAGISEL-NEXT: v_cmpx_lt_u32_e32 15, v0 -; GFX1132DAGISEL-NEXT: s_xor_b32 s2, exec_lo, s2 -; GFX1132DAGISEL-NEXT: s_cbranch_execz .LBB8_2 -; GFX1132DAGISEL-NEXT: ; %bb.1: ; %else -; GFX1132DAGISEL-NEXT: s_mov_b32 s3, exec_lo -; GFX1132DAGISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) -; GFX1132DAGISEL-NEXT: s_bcnt1_i32_b32 s3, s3 -; GFX1132DAGISEL-NEXT: v_cvt_f32_i32_e32 v0, s3 -; GFX1132DAGISEL-NEXT: s_waitcnt lgkmcnt(0) -; 
GFX1132DAGISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1132DAGISEL-NEXT: v_mul_f32_e64 v0, -s0, v0 -; GFX1132DAGISEL-NEXT: v_readfirstlane_b32 s3, v0 -; GFX1132DAGISEL-NEXT: .LBB8_2: ; %Flow -; GFX1132DAGISEL-NEXT: s_waitcnt lgkmcnt(0) -; GFX1132DAGISEL-NEXT: s_or_saveexec_b32 s0, s2 -; GFX1132DAGISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX1132DAGISEL-NEXT: v_mov_b32_e32 v0, s3 -; GFX1132DAGISEL-NEXT: s_xor_b32 exec_lo, exec_lo, s0 -; GFX1132DAGISEL-NEXT: s_cbranch_execz .LBB8_4 -; GFX1132DAGISEL-NEXT: ; %bb.3: ; %if -; GFX1132DAGISEL-NEXT: s_mov_b32 s2, exec_lo -; GFX1132DAGISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) -; GFX1132DAGISEL-NEXT: s_bcnt1_i32_b32 s2, s2 -; GFX1132DAGISEL-NEXT: v_cvt_f32_i32_e32 v0, s2 -; GFX1132DAGISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1132DAGISEL-NEXT: v_mul_f32_e64 v0, -s1, v0 -; GFX1132DAGISEL-NEXT: v_readfirstlane_b32 s1, v0 -; GFX1132DAGISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX1132DAGISEL-NEXT: v_mov_b32_e32 v0, s1 -; GFX1132DAGISEL-NEXT: .LBB8_4: ; %endif -; GFX1132DAGISEL-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; GFX1132DAGISEL-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 -; GFX1132DAGISEL-NEXT: v_mov_b32_e32 v1, 0 -; GFX1132DAGISEL-NEXT: s_waitcnt lgkmcnt(0) -; GFX1132DAGISEL-NEXT: global_store_b32 v1, v0, s[0:1] -; GFX1132DAGISEL-NEXT: s_endpgm -; -; GFX1132GISEL-LABEL: divergent_cfg_float: -; GFX1132GISEL: ; %bb.0: ; %entry -; GFX1132GISEL-NEXT: s_load_b64 s[0:1], s[4:5], 0x2c -; GFX1132GISEL-NEXT: v_and_b32_e32 v0, 0x3ff, v0 -; GFX1132GISEL-NEXT: s_mov_b32 s3, exec_lo -; GFX1132GISEL-NEXT: ; implicit-def: $sgpr2 -; GFX1132GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX1132GISEL-NEXT: v_cmpx_le_u32_e32 16, v0 -; GFX1132GISEL-NEXT: s_xor_b32 s3, exec_lo, s3 -; GFX1132GISEL-NEXT: s_cbranch_execz .LBB8_2 -; GFX1132GISEL-NEXT: ; %bb.1: ; %else -; GFX1132GISEL-NEXT: s_mov_b32 s2, exec_lo -; 
GFX1132GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) -; GFX1132GISEL-NEXT: s_bcnt1_i32_b32 s2, s2 -; GFX1132GISEL-NEXT: v_cvt_f32_i32_e32 v0, s2 -; GFX1132GISEL-NEXT: s_waitcnt lgkmcnt(0) -; GFX1132GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1132GISEL-NEXT: v_mul_f32_e64 v0, -s0, v0 -; GFX1132GISEL-NEXT: v_readfirstlane_b32 s2, v0 -; GFX1132GISEL-NEXT: .LBB8_2: ; %Flow -; GFX1132GISEL-NEXT: s_waitcnt lgkmcnt(0) -; GFX1132GISEL-NEXT: s_and_not1_saveexec_b32 s0, s3 -; GFX1132GISEL-NEXT: s_cbranch_execz .LBB8_4 -; GFX1132GISEL-NEXT: ; %bb.3: ; %if -; GFX1132GISEL-NEXT: s_mov_b32 s2, exec_lo -; GFX1132GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) -; GFX1132GISEL-NEXT: s_bcnt1_i32_b32 s2, s2 -; GFX1132GISEL-NEXT: v_cvt_f32_i32_e32 v0, s2 -; GFX1132GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1132GISEL-NEXT: v_mul_f32_e64 v0, -s1, v0 -; GFX1132GISEL-NEXT: v_readfirstlane_b32 s2, v0 -; GFX1132GISEL-NEXT: .LBB8_4: ; %endif -; GFX1132GISEL-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; GFX1132GISEL-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 -; GFX1132GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX1132GISEL-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, 0 -; GFX1132GISEL-NEXT: s_waitcnt lgkmcnt(0) -; GFX1132GISEL-NEXT: global_store_b32 v1, v0, s[0:1] -; GFX1132GISEL-NEXT: s_endpgm -; -; GFX12DAGISEL-LABEL: divergent_cfg_float: -; GFX12DAGISEL: ; %bb.0: ; %entry -; GFX12DAGISEL-NEXT: s_load_b64 s[0:1], s[4:5], 0x2c -; GFX12DAGISEL-NEXT: v_and_b32_e32 v0, 0x3ff, v0 -; GFX12DAGISEL-NEXT: s_mov_b32 s2, exec_lo -; GFX12DAGISEL-NEXT: ; implicit-def: $sgpr3 -; GFX12DAGISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX12DAGISEL-NEXT: v_cmpx_lt_u32_e32 15, v0 -; GFX12DAGISEL-NEXT: s_xor_b32 s2, exec_lo, s2 -; GFX12DAGISEL-NEXT: s_cbranch_execz .LBB8_2 -; GFX12DAGISEL-NEXT: ; %bb.1: ; %else -; GFX12DAGISEL-NEXT: s_mov_b32 
s3, exec_lo -; GFX12DAGISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) -; GFX12DAGISEL-NEXT: s_bcnt1_i32_b32 s3, s3 -; GFX12DAGISEL-NEXT: v_cvt_f32_i32_e32 v0, s3 -; GFX12DAGISEL-NEXT: s_wait_kmcnt 0x0 -; GFX12DAGISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12DAGISEL-NEXT: v_mul_f32_e64 v0, -s0, v0 -; GFX12DAGISEL-NEXT: v_readfirstlane_b32 s3, v0 -; GFX12DAGISEL-NEXT: .LBB8_2: ; %Flow -; GFX12DAGISEL-NEXT: s_wait_kmcnt 0x0 -; GFX12DAGISEL-NEXT: s_or_saveexec_b32 s0, s2 -; GFX12DAGISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX12DAGISEL-NEXT: v_mov_b32_e32 v0, s3 -; GFX12DAGISEL-NEXT: s_wait_alu depctr_sa_sdst(0) -; GFX12DAGISEL-NEXT: s_xor_b32 exec_lo, exec_lo, s0 -; GFX12DAGISEL-NEXT: s_cbranch_execz .LBB8_4 -; GFX12DAGISEL-NEXT: ; %bb.3: ; %if -; GFX12DAGISEL-NEXT: s_mov_b32 s2, exec_lo -; GFX12DAGISEL-NEXT: s_wait_alu depctr_sa_sdst(0) -; GFX12DAGISEL-NEXT: s_bcnt1_i32_b32 s2, s2 -; GFX12DAGISEL-NEXT: s_wait_alu depctr_sa_sdst(0) -; GFX12DAGISEL-NEXT: v_cvt_f32_i32_e32 v0, s2 -; GFX12DAGISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12DAGISEL-NEXT: v_mul_f32_e64 v0, -s1, v0 -; GFX12DAGISEL-NEXT: v_readfirstlane_b32 s1, v0 -; GFX12DAGISEL-NEXT: s_wait_alu depctr_va_sdst(0) -; GFX12DAGISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX12DAGISEL-NEXT: v_mov_b32_e32 v0, s1 -; GFX12DAGISEL-NEXT: .LBB8_4: ; %endif -; GFX12DAGISEL-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; GFX12DAGISEL-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 -; GFX12DAGISEL-NEXT: v_mov_b32_e32 v1, 0 -; GFX12DAGISEL-NEXT: s_wait_kmcnt 0x0 -; GFX12DAGISEL-NEXT: global_store_b32 v1, v0, s[0:1] -; GFX12DAGISEL-NEXT: s_endpgm -entry: - %tid = call i32 @llvm.amdgcn.workitem.id.x() - %d_cmp = icmp ult i32 %tid, 16 - br i1 %d_cmp, label %if, label %else - -if: - %reducedValTid = call float @llvm.amdgcn.wave.reduce.fsub(float %in2, i32 1) - br label %endif - -else: - %reducedValIn = call float 
@llvm.amdgcn.wave.reduce.fsub(float %in, i32 1) - br label %endif - -endif: - %combine = phi float [%reducedValTid, %if], [%reducedValIn, %else] - store float %combine, ptr addrspace(1) %out - ret void -} ;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line: ; GFX10DAGISEL: {{.*}} ; GFX10GISEL: {{.*}} From 2b725ab8bf08b0bde29910ec4fa1c610eaaffa63 Mon Sep 17 00:00:00 2001 From: Felipe de Azevedo Piovezan Date: Wed, 3 Dec 2025 09:08:05 +0000 Subject: [PATCH 18/36] [lldb] Add DisassemblerLLVMC::IsBarrier API (#169632) This will allow the instruction emulation unwinder to reason about instructions that prevent the subsequent instruction from executing. Part of a sequence of PRs: [lldb][NFCI] Rewrite UnwindAssemblyInstEmulation in terms of a CFG visit #169630 [lldb][NFC] Rename forward_branch_offset to branch_offset in UnwindAssemblyInstEmulation #169631 [lldb] Add DisassemblerLLVMC::IsBarrier API #169632 [lldb] Handle backwards branches in UnwindAssemblyInstEmulation #169633 commit-id:bb5df4aa --- lldb/include/lldb/Core/Disassembler.h | 4 ++++ lldb/source/Core/Disassembler.cpp | 5 +++++ .../Disassembler/LLVMC/DisassemblerLLVMC.cpp | 13 +++++++++++++ 3 files changed, 22 insertions(+) diff --git a/lldb/include/lldb/Core/Disassembler.h b/lldb/include/lldb/Core/Disassembler.h index 5de314109b0cc..ab0f4ac804a7c 100644 --- a/lldb/include/lldb/Core/Disassembler.h +++ b/lldb/include/lldb/Core/Disassembler.h @@ -167,6 +167,8 @@ class Instruction { virtual bool IsLoad() = 0; + virtual bool IsBarrier() = 0; + virtual bool IsAuthenticated() = 0; bool CanSetBreakpoint(); @@ -367,6 +369,8 @@ class PseudoInstruction : public Instruction { bool IsLoad() override; + bool IsBarrier() override; + bool IsAuthenticated() override; void CalculateMnemonicOperandsAndComment( diff --git a/lldb/source/Core/Disassembler.cpp b/lldb/source/Core/Disassembler.cpp index f2eb887986bfb..ed32caf361e0a 100644 --- a/lldb/source/Core/Disassembler.cpp +++ 
b/lldb/source/Core/Disassembler.cpp @@ -1341,6 +1341,11 @@ bool PseudoInstruction::DoesBranch() { return false; } +bool PseudoInstruction::IsBarrier() { + // This is NOT a valid question for a pseudo instruction. + return false; +} + bool PseudoInstruction::HasDelaySlot() { // This is NOT a valid question for a pseudo instruction. return false; diff --git a/lldb/source/Plugins/Disassembler/LLVMC/DisassemblerLLVMC.cpp b/lldb/source/Plugins/Disassembler/LLVMC/DisassemblerLLVMC.cpp index 66d0a50985be7..e8bb706f7aab6 100644 --- a/lldb/source/Plugins/Disassembler/LLVMC/DisassemblerLLVMC.cpp +++ b/lldb/source/Plugins/Disassembler/LLVMC/DisassemblerLLVMC.cpp @@ -70,6 +70,7 @@ class DisassemblerLLVMC::MCDisasmInstance { bool HasDelaySlot(llvm::MCInst &mc_inst) const; bool IsCall(llvm::MCInst &mc_inst) const; bool IsLoad(llvm::MCInst &mc_inst) const; + bool IsBarrier(llvm::MCInst &mc_inst) const; bool IsAuthenticated(llvm::MCInst &mc_inst) const; private: @@ -436,6 +437,11 @@ class InstructionLLVMC : public lldb_private::Instruction { return m_is_load; } + bool IsBarrier() override { + VisitInstruction(); + return m_is_barrier; + } + bool IsAuthenticated() override { VisitInstruction(); return m_is_authenticated; @@ -1195,6 +1201,7 @@ class InstructionLLVMC : public lldb_private::Instruction { bool m_is_call = false; bool m_is_load = false; bool m_is_authenticated = false; + bool m_is_barrier = false; void VisitInstruction() { if (m_has_visited_instruction) @@ -1227,6 +1234,7 @@ class InstructionLLVMC : public lldb_private::Instruction { m_is_call = mc_disasm_ptr->IsCall(inst); m_is_load = mc_disasm_ptr->IsLoad(inst); m_is_authenticated = mc_disasm_ptr->IsAuthenticated(inst); + m_is_barrier = mc_disasm_ptr->IsBarrier(inst); } private: @@ -1432,6 +1440,11 @@ bool DisassemblerLLVMC::MCDisasmInstance::IsLoad(llvm::MCInst &mc_inst) const { return m_instr_info_up->get(mc_inst.getOpcode()).mayLoad(); } +bool DisassemblerLLVMC::MCDisasmInstance::IsBarrier( + llvm::MCInst &mc_inst) 
const { + return m_instr_info_up->get(mc_inst.getOpcode()).isBarrier(); +} + bool DisassemblerLLVMC::MCDisasmInstance::IsAuthenticated( llvm::MCInst &mc_inst) const { const auto &InstrDesc = m_instr_info_up->get(mc_inst.getOpcode()); From 9296223b28029095c1e734ba9373b9bcfc853d7b Mon Sep 17 00:00:00 2001 From: mitchell Date: Wed, 3 Dec 2025 17:22:34 +0800 Subject: [PATCH 19/36] [clang-tidy] Fix `cppcoreguidelines-pro-type-member-init` check (#169832) Closes [#169677](https://github.com/llvm/llvm-project/issues/169677) --------- Co-authored-by: EugeneZelenko --- .../ProTypeMemberInitCheck.cpp | 47 +++++++++++++------ clang-tools-extra/docs/ReleaseNotes.rst | 5 ++ .../pro-type-member-init.ignorearrays.cpp | 36 ++++++++++++++ 3 files changed, 74 insertions(+), 14 deletions(-) diff --git a/clang-tools-extra/clang-tidy/cppcoreguidelines/ProTypeMemberInitCheck.cpp b/clang-tools-extra/clang-tidy/cppcoreguidelines/ProTypeMemberInitCheck.cpp index 66508da89f0dd..f2676468b6871 100644 --- a/clang-tools-extra/clang-tidy/cppcoreguidelines/ProTypeMemberInitCheck.cpp +++ b/clang-tools-extra/clang-tidy/cppcoreguidelines/ProTypeMemberInitCheck.cpp @@ -361,7 +361,8 @@ void ProTypeMemberInitCheck::storeOptions(ClangTidyOptions::OptionMap &Opts) { } // FIXME: Copied from clang/lib/Sema/SemaDeclCXX.cpp. 
-static bool isIncompleteOrZeroLengthArrayType(ASTContext &Context, QualType T) { +static bool isIncompleteOrZeroLengthArrayType(const ASTContext &Context, + QualType T) { if (T->isIncompleteArrayType()) return true; @@ -375,7 +376,7 @@ static bool isIncompleteOrZeroLengthArrayType(ASTContext &Context, QualType T) { return false; } -static bool isEmpty(ASTContext &Context, const QualType &Type) { +static bool isEmpty(const ASTContext &Context, const QualType &Type) { if (const CXXRecordDecl *ClassDecl = Type->getAsCXXRecordDecl()) { return ClassDecl->isEmpty(); } @@ -431,19 +432,13 @@ static llvm::StringLiteral getInitializer(QualType QT, bool UseAssignment) { } } -void ProTypeMemberInitCheck::checkMissingMemberInitializer( - ASTContext &Context, const CXXRecordDecl &ClassDecl, - const CXXConstructorDecl *Ctor) { - const bool IsUnion = ClassDecl.isUnion(); - - if (IsUnion && ClassDecl.hasInClassInitializer()) - return; - - // Gather all fields (direct and indirect) that need to be initialized. - SmallPtrSet FieldsToInit; +static void +computeFieldsToInit(const ASTContext &Context, const RecordDecl &Record, + bool IgnoreArrays, + SmallPtrSetImpl &FieldsToInit) { bool AnyMemberHasInitPerUnion = false; forEachFieldWithFilter( - ClassDecl, ClassDecl.fields(), AnyMemberHasInitPerUnion, + Record, Record.fields(), AnyMemberHasInitPerUnion, [&](const FieldDecl *F) { if (IgnoreArrays && F->getType()->isArrayType()) return; @@ -458,6 +453,19 @@ void ProTypeMemberInitCheck::checkMissingMemberInitializer( !AnyMemberHasInitPerUnion) FieldsToInit.insert(F); }); +} + +void ProTypeMemberInitCheck::checkMissingMemberInitializer( + ASTContext &Context, const CXXRecordDecl &ClassDecl, + const CXXConstructorDecl *Ctor) { + const bool IsUnion = ClassDecl.isUnion(); + + if (IsUnion && ClassDecl.hasInClassInitializer()) + return; + + // Gather all fields (direct and indirect) that need to be initialized. 
+ SmallPtrSet FieldsToInit; + computeFieldsToInit(Context, ClassDecl, IgnoreArrays, FieldsToInit); if (FieldsToInit.empty()) return; @@ -507,7 +515,7 @@ void ProTypeMemberInitCheck::checkMissingMemberInitializer( // Collect all fields but only suggest a fix for the first member of unions, // as initializing more than one union member is an error. SmallPtrSet FieldsToFix; - AnyMemberHasInitPerUnion = false; + bool AnyMemberHasInitPerUnion = false; forEachFieldWithFilter(ClassDecl, ClassDecl.fields(), AnyMemberHasInitPerUnion, [&](const FieldDecl *F) { if (!FieldsToInit.contains(F)) @@ -582,6 +590,17 @@ void ProTypeMemberInitCheck::checkMissingBaseClassInitializer( void ProTypeMemberInitCheck::checkUninitializedTrivialType( const ASTContext &Context, const VarDecl *Var) { + // Verify that the record actually needs initialization + const CXXRecordDecl *Record = Var->getType()->getAsCXXRecordDecl(); + if (!Record) + return; + + SmallPtrSet FieldsToInit; + computeFieldsToInit(Context, *Record, IgnoreArrays, FieldsToInit); + + if (FieldsToInit.empty()) + return; + const DiagnosticBuilder Diag = diag(Var->getBeginLoc(), "uninitialized record type: %0") << Var; diff --git a/clang-tools-extra/docs/ReleaseNotes.rst b/clang-tools-extra/docs/ReleaseNotes.rst index 79a768e599cfd..8d63fa85c0a8f 100644 --- a/clang-tools-extra/docs/ReleaseNotes.rst +++ b/clang-tools-extra/docs/ReleaseNotes.rst @@ -424,6 +424,11 @@ Changes in existing checks adding an option to allow pointer arithmetic via prefix/postfix increment or decrement operators. +- Improved :doc:`cppcoreguidelines-pro-type-member-init + ` check to + correctly ignore ``std::array`` and other array-like containers when + `IgnoreArrays` option is set to `true`. + - Improved :doc:`google-readability-casting ` check by adding fix-it notes for downcasts and casts to void pointer. 
diff --git a/clang-tools-extra/test/clang-tidy/checkers/cppcoreguidelines/pro-type-member-init.ignorearrays.cpp b/clang-tools-extra/test/clang-tidy/checkers/cppcoreguidelines/pro-type-member-init.ignorearrays.cpp index 01859b3ad98f4..e4cfe679cfce9 100644 --- a/clang-tools-extra/test/clang-tidy/checkers/cppcoreguidelines/pro-type-member-init.ignorearrays.cpp +++ b/clang-tools-extra/test/clang-tidy/checkers/cppcoreguidelines/pro-type-member-init.ignorearrays.cpp @@ -14,3 +14,39 @@ struct HasArrayMember { int RawArray[4]; int Number; }; + +namespace std { +template +struct array { + T _Elems[N]; + void fill(const T &); +}; +} + +void test_local_std_array() { + std::array a; +} + +struct OnlyArray { + int a[4]; +}; + +void test_local_only_array() { + OnlyArray a; +} + +struct Mixed { + int a[4]; + int b; +}; + +void test_local_mixed() { + Mixed m; + // CHECK-MESSAGES: :[[@LINE-1]]:3: warning: uninitialized record type: 'm' +} + +void test_std_array_fill() { + std::array someArray; + // CHECK-MESSAGES-NOT: warning: uninitialized record type: 'someArray' + someArray.fill('n'); +} From 114ca6522e4ea425115adb778c39fd89745a6853 Mon Sep 17 00:00:00 2001 From: Shih-Po Hung Date: Wed, 3 Dec 2025 17:24:40 +0800 Subject: [PATCH 20/36] [TTI] Use MemIntrinsicCostAttributes for getStridedOpCost (#170436) - Following #168029. This is a step toward a unified interface for masked/gather-scatter/strided/expand-compress cost modeling. - Replace the ad-hoc parameter list with a single attributes object. API change: ``` - InstructionCost getStridedMemoryOpCost(unsigned Opcode, Type *DataTy, const Value *Ptr, bool VariableMask, Align Alignment, TTI::TargetCostKind CostKind, const Instruction *I = nullptr); + InstructionCost getStridedMemoryOpCost(MemIntrinsicCostAttributes, + CostKind); ``` Notes: - NFCI intended: callers populate MemIntrinsicCostAttributes with same information as before. 
--- .../llvm/Analysis/TargetTransformInfoImpl.h | 6 ++-- llvm/include/llvm/CodeGen/BasicTTIImpl.h | 32 +++++++------------ .../Target/RISCV/RISCVTargetTransformInfo.cpp | 22 ++++++++----- .../Target/RISCV/RISCVTargetTransformInfo.h | 8 ++--- 4 files changed, 30 insertions(+), 38 deletions(-) diff --git a/llvm/include/llvm/Analysis/TargetTransformInfoImpl.h b/llvm/include/llvm/Analysis/TargetTransformInfoImpl.h index 9b2a7f432a544..5f1d855621c93 100644 --- a/llvm/include/llvm/Analysis/TargetTransformInfoImpl.h +++ b/llvm/include/llvm/Analysis/TargetTransformInfoImpl.h @@ -864,10 +864,8 @@ class TargetTransformInfoImplBase { } virtual InstructionCost - getStridedMemoryOpCost(unsigned Opcode, Type *DataTy, const Value *Ptr, - bool VariableMask, Align Alignment, - TTI::TargetCostKind CostKind, - const Instruction *I = nullptr) const { + getStridedMemoryOpCost(const MemIntrinsicCostAttributes &MICA, + TTI::TargetCostKind CostKind) const { return InstructionCost::getInvalid(); } diff --git a/llvm/include/llvm/CodeGen/BasicTTIImpl.h b/llvm/include/llvm/CodeGen/BasicTTIImpl.h index 314830652f0b6..fceff5f93b765 100644 --- a/llvm/include/llvm/CodeGen/BasicTTIImpl.h +++ b/llvm/include/llvm/CodeGen/BasicTTIImpl.h @@ -1599,19 +1599,19 @@ class BasicTTIImplBase : public TargetTransformInfoImplCRTPBase { /*IsGatherScatter*/ true, CostKind); } - InstructionCost getStridedMemoryOpCost(unsigned Opcode, Type *DataTy, - const Value *Ptr, bool VariableMask, - Align Alignment, - TTI::TargetCostKind CostKind, - const Instruction *I) const override { + InstructionCost + getStridedMemoryOpCost(const MemIntrinsicCostAttributes &MICA, + TTI::TargetCostKind CostKind) const override { // For a target without strided memory operations (or for an illegal // operation type on one which does), assume we lower to a gather/scatter // operation. (Which may in turn be scalarized.) - unsigned IID = Opcode == Instruction::Load ? 
Intrinsic::masked_gather - : Intrinsic::masked_scatter; + unsigned IID = MICA.getID() == Intrinsic::experimental_vp_strided_load + ? Intrinsic::masked_gather + : Intrinsic::masked_scatter; return thisT()->getGatherScatterOpCost( - MemIntrinsicCostAttributes(IID, DataTy, Ptr, VariableMask, Alignment, - I), + MemIntrinsicCostAttributes(IID, MICA.getDataType(), MICA.getPointer(), + MICA.getVariableMask(), MICA.getAlignment(), + MICA.getInst()), CostKind); } @@ -3062,21 +3062,11 @@ class BasicTTIImplBase : public TargetTransformInfoImplCRTPBase { getMemIntrinsicInstrCost(const MemIntrinsicCostAttributes &MICA, TTI::TargetCostKind CostKind) const override { unsigned Id = MICA.getID(); - Type *DataTy = MICA.getDataType(); - const Value *Ptr = MICA.getPointer(); - const Instruction *I = MICA.getInst(); - bool VariableMask = MICA.getVariableMask(); - Align Alignment = MICA.getAlignment(); switch (Id) { case Intrinsic::experimental_vp_strided_load: - case Intrinsic::experimental_vp_strided_store: { - unsigned Opcode = Id == Intrinsic::experimental_vp_strided_load - ? 
Instruction::Load - : Instruction::Store; - return thisT()->getStridedMemoryOpCost(Opcode, DataTy, Ptr, VariableMask, - Alignment, CostKind, I); - } + case Intrinsic::experimental_vp_strided_store: + return thisT()->getStridedMemoryOpCost(MICA, CostKind); case Intrinsic::masked_scatter: case Intrinsic::masked_gather: case Intrinsic::vp_scatter: diff --git a/llvm/lib/Target/RISCV/RISCVTargetTransformInfo.cpp b/llvm/lib/Target/RISCV/RISCVTargetTransformInfo.cpp index 1d431959eaea3..74c2c896a8a88 100644 --- a/llvm/lib/Target/RISCV/RISCVTargetTransformInfo.cpp +++ b/llvm/lib/Target/RISCV/RISCVTargetTransformInfo.cpp @@ -1212,14 +1212,20 @@ InstructionCost RISCVTTIImpl::getExpandCompressMemoryOpCost( LT.first * getRISCVInstructionCost(Opcodes, LT.second, CostKind); } -InstructionCost RISCVTTIImpl::getStridedMemoryOpCost( - unsigned Opcode, Type *DataTy, const Value *Ptr, bool VariableMask, - Align Alignment, TTI::TargetCostKind CostKind, const Instruction *I) const { - if (((Opcode == Instruction::Load || Opcode == Instruction::Store) && - !isLegalStridedLoadStore(DataTy, Alignment)) || - (Opcode != Instruction::Load && Opcode != Instruction::Store)) - return BaseT::getStridedMemoryOpCost(Opcode, DataTy, Ptr, VariableMask, - Alignment, CostKind, I); +InstructionCost +RISCVTTIImpl::getStridedMemoryOpCost(const MemIntrinsicCostAttributes &MICA, + TTI::TargetCostKind CostKind) const { + + unsigned Opcode = MICA.getID() == Intrinsic::experimental_vp_strided_load + ? 
Instruction::Load + : Instruction::Store; + + Type *DataTy = MICA.getDataType(); + Align Alignment = MICA.getAlignment(); + const Instruction *I = MICA.getInst(); + + if (!isLegalStridedLoadStore(DataTy, Alignment)) + return BaseT::getStridedMemoryOpCost(MICA, CostKind); if (CostKind == TTI::TCK_CodeSize) return TTI::TCC_Basic; diff --git a/llvm/lib/Target/RISCV/RISCVTargetTransformInfo.h b/llvm/lib/Target/RISCV/RISCVTargetTransformInfo.h index e32b1c553c57a..c1746e6d13166 100644 --- a/llvm/lib/Target/RISCV/RISCVTargetTransformInfo.h +++ b/llvm/lib/Target/RISCV/RISCVTargetTransformInfo.h @@ -202,11 +202,9 @@ class RISCVTTIImpl final : public BasicTTIImplBase { getExpandCompressMemoryOpCost(const MemIntrinsicCostAttributes &MICA, TTI::TargetCostKind CostKind) const override; - InstructionCost getStridedMemoryOpCost(unsigned Opcode, Type *DataTy, - const Value *Ptr, bool VariableMask, - Align Alignment, - TTI::TargetCostKind CostKind, - const Instruction *I) const override; + InstructionCost + getStridedMemoryOpCost(const MemIntrinsicCostAttributes &MICA, + TTI::TargetCostKind CostKind) const override; InstructionCost getCostOfKeepingLiveOverCall(ArrayRef Tys) const override; From 5ccf8c90d1e4020d5f9bc255fe521aa0763f2b2b Mon Sep 17 00:00:00 2001 From: Tom Eccles Date: Wed, 3 Dec 2025 09:36:22 +0000 Subject: [PATCH 21/36] [flang] implement VECTOR VECTORLENGTH directive (#170114) This should match exactly the llvm attributes generated by classic flang. 
--- flang/docs/Directives.md | 9 +++ flang/include/flang/Parser/dump-parse-tree.h | 2 + flang/include/flang/Parser/parse-tree.h | 12 +++- flang/lib/Lower/Bridge.cpp | 44 ++++++++++-- flang/lib/Parser/Fortran-parsers.cpp | 11 +++ flang/lib/Parser/unparse.cpp | 19 ++++++ .../lib/Semantics/canonicalize-directives.cpp | 4 ++ flang/lib/Semantics/resolve-names.cpp | 1 + flang/test/Lower/vectorlength.f90 | 67 +++++++++++++++++++ flang/test/Parser/compiler-directives.f90 | 22 ++++++ 10 files changed, 183 insertions(+), 8 deletions(-) create mode 100644 flang/test/Lower/vectorlength.f90 diff --git a/flang/docs/Directives.md b/flang/docs/Directives.md index 128d8f9b6b707..5640e44e16bae 100644 --- a/flang/docs/Directives.md +++ b/flang/docs/Directives.md @@ -57,6 +57,15 @@ A list of non-standard directives supported by Flang * `!dir$ vector always` forces vectorization on the following loop regardless of cost model decisions. The loop must still be vectorizable. [This directive currently only works on plain do loops without labels]. +* `!dir$ vector vectorlength({fixed|scalable|<n>|<n>,fixed|<n>,scalable})` + specifies a hint to the compiler about the desired vectorization factor. If + `fixed` is used, the compiler should prefer fixed-width vectorization. + Scalable vectorization instructions may still be used with a fixed-width + predicate. If `scalable` is used the compiler should prefer scalable + vectorization, though it can choose to use fixed length vectorization or not + at all. `<n>` means that the compiler should consider using this specific + vectorization factor, which should be an integer literal. This directive + currently has the same limitations as `!dir$ vector always`. * `!dir$ unroll [n]` specifies that the compiler ought to unroll the immediately following loop `n` times. When `n` is `0` or `1`, the loop should not be unrolled at all.
When `n` is `2` or greater, the loop should be unrolled exactly `n` diff --git a/flang/include/flang/Parser/dump-parse-tree.h b/flang/include/flang/Parser/dump-parse-tree.h index f460e61fbb915..18218cee40ec9 100644 --- a/flang/include/flang/Parser/dump-parse-tree.h +++ b/flang/include/flang/Parser/dump-parse-tree.h @@ -229,6 +229,8 @@ class ParseTreeDumper { NODE(CompilerDirective, NoInline) NODE(CompilerDirective, Unrecognized) NODE(CompilerDirective, VectorAlways) + NODE_ENUM(CompilerDirective::VectorLength, VectorLength::Kind) + NODE(CompilerDirective, VectorLength) NODE(CompilerDirective, Unroll) NODE(CompilerDirective, UnrollAndJam) NODE(CompilerDirective, NoVector) diff --git a/flang/include/flang/Parser/parse-tree.h b/flang/include/flang/Parser/parse-tree.h index dd928e1244a2f..10820b61a9ee8 100644 --- a/flang/include/flang/Parser/parse-tree.h +++ b/flang/include/flang/Parser/parse-tree.h @@ -3384,6 +3384,12 @@ struct CompilerDirective { std::tuple, uint64_t> t; }; EMPTY_CLASS(VectorAlways); + struct VectorLength { + TUPLE_CLASS_BOILERPLATE(VectorLength); + ENUM_CLASS(Kind, Auto, Fixed, Scalable); + + std::tuple t; + }; struct NameValue { TUPLE_CLASS_BOILERPLATE(NameValue); std::tuple> t; @@ -3408,9 +3414,9 @@ struct CompilerDirective { EMPTY_CLASS(Unrecognized); CharBlock source; std::variant, LoopCount, std::list, - VectorAlways, std::list, Unroll, UnrollAndJam, Unrecognized, - NoVector, NoUnroll, NoUnrollAndJam, ForceInline, Inline, NoInline, - Prefetch, IVDep> + VectorAlways, VectorLength, std::list, Unroll, UnrollAndJam, + Unrecognized, NoVector, NoUnroll, NoUnrollAndJam, ForceInline, Inline, + NoInline, Prefetch, IVDep> u; }; diff --git a/flang/lib/Lower/Bridge.cpp b/flang/lib/Lower/Bridge.cpp index ba13b8f098da8..d175e2a8a73cb 100644 --- a/flang/lib/Lower/Bridge.cpp +++ b/flang/lib/Lower/Bridge.cpp @@ -2584,12 +2584,16 @@ class FirConverter : public Fortran::lower::AbstractConverter { // Enabling loop vectorization attribute. 
mlir::LLVM::LoopVectorizeAttr - genLoopVectorizeAttr(mlir::BoolAttr disableAttr) { + genLoopVectorizeAttr(mlir::BoolAttr disableAttr, + mlir::BoolAttr scalableEnable, + mlir::IntegerAttr vectorWidth) { mlir::LLVM::LoopVectorizeAttr va; if (disableAttr) - va = mlir::LLVM::LoopVectorizeAttr::get(builder->getContext(), - /*disable=*/disableAttr, {}, {}, - {}, {}, {}, {}); + va = mlir::LLVM::LoopVectorizeAttr::get( + builder->getContext(), + /*disable=*/disableAttr, /*predicate=*/{}, + /*scalableEnable=*/scalableEnable, + /*vectorWidth=*/vectorWidth, {}, {}, {}); return va; } @@ -2597,6 +2601,8 @@ class FirConverter : public Fortran::lower::AbstractConverter { IncrementLoopInfo &info, llvm::SmallVectorImpl &dirs) { mlir::BoolAttr disableVecAttr; + mlir::BoolAttr scalableEnable; + mlir::IntegerAttr vectorWidth; mlir::LLVM::LoopUnrollAttr ua; mlir::LLVM::LoopUnrollAndJamAttr uja; llvm::SmallVector aga; @@ -2609,6 +2615,30 @@ class FirConverter : public Fortran::lower::AbstractConverter { mlir::BoolAttr::get(builder->getContext(), false); has_attrs = true; }, + [&](const Fortran::parser::CompilerDirective::VectorLength &vl) { + using Kind = + Fortran::parser::CompilerDirective::VectorLength::Kind; + Kind kind = std::get(vl.t); + uint64_t length = std::get(vl.t); + disableVecAttr = + mlir::BoolAttr::get(builder->getContext(), false); + if (length != 0) + vectorWidth = + builder->getIntegerAttr(builder->getI64Type(), length); + switch (kind) { + case Kind::Scalable: + scalableEnable = + mlir::BoolAttr::get(builder->getContext(), true); + break; + case Kind::Fixed: + scalableEnable = + mlir::BoolAttr::get(builder->getContext(), false); + break; + case Kind::Auto: + break; + } + has_attrs = true; + }, [&](const Fortran::parser::CompilerDirective::Unroll &u) { ua = genLoopUnrollAttr(u.v); has_attrs = true; @@ -2640,7 +2670,8 @@ class FirConverter : public Fortran::lower::AbstractConverter { [&](const auto &) {}}, dir->u); } - mlir::LLVM::LoopVectorizeAttr va = 
genLoopVectorizeAttr(disableVecAttr); + mlir::LLVM::LoopVectorizeAttr va = + genLoopVectorizeAttr(disableVecAttr, scalableEnable, vectorWidth); mlir::LLVM::LoopAnnotationAttr la = mlir::LLVM::LoopAnnotationAttr::get( builder->getContext(), {}, /*vectorize=*/va, {}, /*unroll*/ ua, /*unroll_and_jam*/ uja, {}, {}, {}, {}, {}, {}, {}, {}, {}, @@ -3347,6 +3378,9 @@ class FirConverter : public Fortran::lower::AbstractConverter { [&](const Fortran::parser::CompilerDirective::VectorAlways &) { attachDirectiveToLoop(dir, &eval); }, + [&](const Fortran::parser::CompilerDirective::VectorLength &) { + attachDirectiveToLoop(dir, &eval); + }, [&](const Fortran::parser::CompilerDirective::Unroll &) { attachDirectiveToLoop(dir, &eval); }, diff --git a/flang/lib/Parser/Fortran-parsers.cpp b/flang/lib/Parser/Fortran-parsers.cpp index fccb9d82f4fc9..988db5450abc9 100644 --- a/flang/lib/Parser/Fortran-parsers.cpp +++ b/flang/lib/Parser/Fortran-parsers.cpp @@ -1295,6 +1295,7 @@ TYPE_PARSER(construct("STAT =" >> statVariable) || // Directives, extensions, and deprecated statements // !DIR$ IGNORE_TKR [ [(tkrdmac...)] name ]... // !DIR$ LOOP COUNT (n1[, n2]...) +// !DIR$ VECTOR VECTORLENGTH ({FIXED|SCALABLE||,FIXED|,SCALABLE}) // !DIR$ name[=value] [, name[=value]]... // !DIR$ UNROLL [n] // !DIR$ PREFETCH designator[, designator]... 
@@ -1311,6 +1312,15 @@ constexpr auto assumeAligned{"ASSUME_ALIGNED" >> indirect(designator), ":"_tok >> digitString64))}; constexpr auto vectorAlways{ "VECTOR ALWAYS" >> construct()}; +constexpr auto vectorLengthKind{ + "FIXED" >> pure(CompilerDirective::VectorLength::Kind::Fixed) || + "SCALABLE" >> pure(CompilerDirective::VectorLength::Kind::Scalable)}; +constexpr auto vectorLength{"VECTOR VECTORLENGTH" >> + parenthesized(construct( + digitString64, ","_tok >> vectorLengthKind) || + construct(pure(0), vectorLengthKind) || + construct( + digitString64, pure(CompilerDirective::VectorLength::Kind::Auto)))}; constexpr auto unroll{ "UNROLL" >> construct(maybe(digitString64))}; constexpr auto prefetch{"PREFETCH" >> @@ -1332,6 +1342,7 @@ TYPE_PARSER(beginDirective >> "DIR$ "_tok >> construct(loopCount) || construct(assumeAligned) || construct(vectorAlways) || + construct(vectorLength) || construct(unrollAndJam) || construct(unroll) || construct(prefetch) || diff --git a/flang/lib/Parser/unparse.cpp b/flang/lib/Parser/unparse.cpp index 8e9c7d04bc522..5421ee6f948a2 100644 --- a/flang/lib/Parser/unparse.cpp +++ b/flang/lib/Parser/unparse.cpp @@ -1848,6 +1848,25 @@ class UnparseVisitor { [&](const CompilerDirective::VectorAlways &valways) { Word("!DIR$ VECTOR ALWAYS"); }, + [&](const CompilerDirective::VectorLength &vlength) { + using Kind = CompilerDirective::VectorLength::Kind; + std::uint64_t length = std::get(vlength.t); + Kind kind = std::get(vlength.t); + + Word("!DIR$ VECTOR VECTORLENGTH ("); + // || kind == Kind::Auto handles the case of VECTORLENGTH(0) so we + // don't print nothing + if (length != 0 || kind == Kind::Auto) { + Walk(length); + } + if (length != 0 && kind != Kind::Auto) { + Word(", "); + } + if (kind != Kind::Auto) { + Word(CompilerDirective::VectorLength::EnumToString(kind)); + } + Word(")"); + }, [&](const std::list &names) { Walk("!DIR$ ", names, " "); }, diff --git a/flang/lib/Semantics/canonicalize-directives.cpp 
b/flang/lib/Semantics/canonicalize-directives.cpp index b21da4d041a97..f32a3d34c6572 100644 --- a/flang/lib/Semantics/canonicalize-directives.cpp +++ b/flang/lib/Semantics/canonicalize-directives.cpp @@ -56,6 +56,7 @@ bool CanonicalizeDirectives( static bool IsExecutionDirective(const parser::CompilerDirective &dir) { return std::holds_alternative( dir.u) || + std::holds_alternative(dir.u) || std::holds_alternative(dir.u) || std::holds_alternative(dir.u) || std::holds_alternative(dir.u) || @@ -121,6 +122,9 @@ void CanonicalizationOfDirectives::Post(parser::Block &block) { common::visitors{[&](parser::CompilerDirective::VectorAlways &) { CheckLoopDirective(*dir, block, it); }, + [&](parser::CompilerDirective::VectorLength &) { + CheckLoopDirective(*dir, block, it); + }, [&](parser::CompilerDirective::Unroll &) { CheckLoopDirective(*dir, block, it); }, diff --git a/flang/lib/Semantics/resolve-names.cpp b/flang/lib/Semantics/resolve-names.cpp index 2a487a6d39d51..5814841053132 100644 --- a/flang/lib/Semantics/resolve-names.cpp +++ b/flang/lib/Semantics/resolve-names.cpp @@ -10075,6 +10075,7 @@ void ResolveNamesVisitor::Post(const parser::AssignedGotoStmt &x) { void ResolveNamesVisitor::Post(const parser::CompilerDirective &x) { if (std::holds_alternative(x.u) || + std::holds_alternative(x.u) || std::holds_alternative(x.u) || std::holds_alternative(x.u) || std::holds_alternative(x.u) || diff --git a/flang/test/Lower/vectorlength.f90 b/flang/test/Lower/vectorlength.f90 new file mode 100644 index 0000000000000..95753c3f78090 --- /dev/null +++ b/flang/test/Lower/vectorlength.f90 @@ -0,0 +1,67 @@ +! RUN: %flang_fc1 -emit-hlfir -o - %s | FileCheck %s + +! CHECK: #[[FIXED:.*]] = #llvm.loop_vectorize +! CHECK: #[[SCALABLE:.*]] = #llvm.loop_vectorize +! CHECK: #[[WIDTH2:.*]] = #llvm.loop_vectorize +! CHECK: #[[FIXED_WIDTH2:.*]] = #llvm.loop_vectorize +! CHECK: #[[SCALABLE_WIDTH2:.*]] = #llvm.loop_vectorize +! CHECK: #[[FIXED_TAG:.*]] = #llvm.loop_annotation +! 
CHECK: #[[SCALABLE_TAG:.*]] = #llvm.loop_annotation +! CHECK: #[[WIDTH2_TAG:.*]] = #llvm.loop_annotation +! CHECK: #[[FIXED_WIDTH2_TAG:.*]] = #llvm.loop_annotation +! CHECK: #[[SCALABLE_WIDTH2_TAG:.*]] = #llvm.loop_annotation + +! CHECK-LABEL: func.func @_QPfixed( +subroutine fixed(a, b, m) + integer :: i, m, a(m), b(m) + + !dir$ vector vectorlength(fixed) + ! CHECK: fir.do_loop {{.*}} attributes {loopAnnotation = #[[FIXED_TAG]]} + do i = 1, m + b(i) = a(i) + 1 + end do +end subroutine + +! CHECK-LABEL: func.func @_QPscalable( +subroutine scalable(a, b, m) + integer :: i, m, a(m), b(m) + + !dir$ vector vectorlength(scalable) + ! CHECK: fir.do_loop {{.*}} attributes {loopAnnotation = #[[SCALABLE_TAG]]} + do i = 1, m + b(i) = a(i) + 1 + end do +end subroutine + +! CHECK-LABEL: func.func @_QPlen2( +subroutine len2(a, b, m) + integer :: i, m, a(m), b(m) + + !dir$ vector vectorlength(2) + ! CHECK: fir.do_loop {{.*}} attributes {loopAnnotation = #[[WIDTH2_TAG]]} + do i = 1, m + b(i) = a(i) + 1 + end do +end subroutine + +! CHECK-LABEL: func.func @_QPlen2fixed( +subroutine len2fixed(a, b, m) + integer :: i, m, a(m), b(m) + + !dir$ vector vectorlength(2,fixed) + ! CHECK: fir.do_loop {{.*}} attributes {loopAnnotation = #[[FIXED_WIDTH2_TAG]]} + do i = 1, m + b(i) = a(i) + 1 + end do +end subroutine + +! CHECK-LABEL: func.func @_QPlen2scalable( +subroutine len2scalable(a, b, m) + integer :: i, m, a(m), b(m) + + !dir$ vector vectorlength(2,scalable) + ! CHECK: fir.do_loop {{.*}} attributes {loopAnnotation = #[[SCALABLE_WIDTH2_TAG]]} + do i = 1, m + b(i) = a(i) + 1 + end do +end subroutine diff --git a/flang/test/Parser/compiler-directives.f90 b/flang/test/Parser/compiler-directives.f90 index 56a10f9177997..ce592692cfc67 100644 --- a/flang/test/Parser/compiler-directives.f90 +++ b/flang/test/Parser/compiler-directives.f90 @@ -36,6 +36,28 @@ subroutine vector_always enddo end subroutine +subroutine vector_vectorlength + !dir$ vector vectorlength(fixed) + ! 
CHECK: !DIR$ VECTOR VECTORLENGTH (FIXED) + do i=1,10 + enddo + + !dir$ vector vectorlength(scalable) + ! CHECK: !DIR$ VECTOR VECTORLENGTH (SCALABLE) + do i=1,10 + enddo + + !dir$ vector vectorlength(8,scalable) + ! CHECK: !DIR$ VECTOR VECTORLENGTH (8, SCALABLE) + do i=1,10 + enddo + + !dir$ vector vectorlength(4) + ! CHECK: !DIR$ VECTOR VECTORLENGTH (4) + do i=1,10 + enddo +end subroutine + subroutine unroll !dir$ unroll ! CHECK: !DIR$ UNROLL From 8feb6762ba9fb83f8e13ef9486c3b743e1b5cfa7 Mon Sep 17 00:00:00 2001 From: Pierre van Houtryve Date: Wed, 3 Dec 2025 10:37:58 +0100 Subject: [PATCH 22/36] [AMDGPU] Take BUF instructions into account in mayAccessScratchThroughFlat (#170274) BUF instructions can access the scratch address space, so SIInsertWaitCnt needs to be able to track the SCRATCH_WRITE_ACCESS event for such BUF instructions. The release-vgprs.mir test had to be updated because BUF instructions w/o a MMO are now tracked as a SCRATCH_WRITE_ACCESS. I added a MMO that touches global to keep the test result unchanged. I also added a couple of testcases with no MMO to test the corrected behavior. --- llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp | 4 +- llvm/lib/Target/AMDGPU/SIInstrInfo.cpp | 5 +- llvm/lib/Target/AMDGPU/SIInstrInfo.h | 6 +- llvm/test/CodeGen/AMDGPU/release-vgprs.mir | 70 ++++++++++++++------- 4 files changed, 56 insertions(+), 29 deletions(-) diff --git a/llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp b/llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp index 70db7b4918515..64a29e1841245 100644 --- a/llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp +++ b/llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp @@ -552,9 +552,7 @@ class SIInsertWaitcnts { return VMEM_ACCESS; if (Inst.mayStore() && (!Inst.mayLoad() || SIInstrInfo::isAtomicNoRet(Inst))) { - // FLAT and SCRATCH instructions may access scratch. Other VMEM - // instructions do not. 
- if (TII->mayAccessScratchThroughFlat(Inst)) + if (TII->mayAccessScratch(Inst)) return SCRATCH_WRITE_ACCESS; return VMEM_WRITE_ACCESS; } diff --git a/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp b/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp index 088533306c79b..7535407741f1f 100644 --- a/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp +++ b/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp @@ -4399,8 +4399,9 @@ bool SIInstrInfo::isAlwaysGDS(uint16_t Opcode) const { Opcode == AMDGPU::DS_SUB_GS_REG_RTN || isGWS(Opcode); } -bool SIInstrInfo::mayAccessScratchThroughFlat(const MachineInstr &MI) const { - if (!isFLAT(MI) || isFLATGlobal(MI)) +bool SIInstrInfo::mayAccessScratch(const MachineInstr &MI) const { + // Instructions that access scratch use FLAT encoding or BUF encodings. + if ((!isFLAT(MI) || isFLATGlobal(MI)) && !isBUF(MI)) return false; // If scratch is not initialized, we can never access it. diff --git a/llvm/lib/Target/AMDGPU/SIInstrInfo.h b/llvm/lib/Target/AMDGPU/SIInstrInfo.h index 3174bfafb4154..b1d6563bf3c0b 100644 --- a/llvm/lib/Target/AMDGPU/SIInstrInfo.h +++ b/llvm/lib/Target/AMDGPU/SIInstrInfo.h @@ -694,11 +694,11 @@ class SIInstrInfo final : public AMDGPUGenInstrInfo { return get(Opcode).TSFlags & SIInstrFlags::FLAT; } - /// \returns true for SCRATCH_ instructions, or FLAT_ instructions with - /// SCRATCH_ memory operands. + /// \returns true for SCRATCH_ instructions, or FLAT/BUF instructions unless + /// the MMOs do not include scratch. /// Conservatively correct; will return true if \p MI cannot be proven /// to not hit scratch. - bool mayAccessScratchThroughFlat(const MachineInstr &MI) const; + bool mayAccessScratch(const MachineInstr &MI) const; /// \returns true for FLAT instructions that can access VMEM. 
bool mayAccessVMEMThroughFlat(const MachineInstr &MI) const; diff --git a/llvm/test/CodeGen/AMDGPU/release-vgprs.mir b/llvm/test/CodeGen/AMDGPU/release-vgprs.mir index c845a4c82b9cc..57db12e6f41d8 100644 --- a/llvm/test/CodeGen/AMDGPU/release-vgprs.mir +++ b/llvm/test/CodeGen/AMDGPU/release-vgprs.mir @@ -25,6 +25,8 @@ define amdgpu_cs void @with_calls() { ret void } define fastcc void @with_tail_calls() { ret void } define amdgpu_cs void @waveslot_limited() { ret void } + define amdgpu_ps void @tbuffer_without_mmo_may_hit_scratch() { ret void } + define amdgpu_ps void @buffer_without_mmo_may_hit_scratch() { ret void } ... --- @@ -34,15 +36,15 @@ machineFunctionInfo: body: | bb.0: ; OPT-LABEL: name: tbuffer_store1 - ; OPT: TBUFFER_STORE_FORMAT_XYZW_OFFSET_exact killed renamable $vgpr0_vgpr1_vgpr2_vgpr3, killed renamable $sgpr0_sgpr1_sgpr2_sgpr3, killed renamable $sgpr4, 42, 117, 0, 0, implicit $exec + ; OPT: TBUFFER_STORE_FORMAT_XYZW_OFFSET_exact killed renamable $vgpr0_vgpr1_vgpr2_vgpr3, killed renamable $sgpr0_sgpr1_sgpr2_sgpr3, killed renamable $sgpr4, 42, 117, 0, 0, implicit $exec :: (volatile store (s32), addrspace 1) ; OPT-NEXT: S_NOP 0 ; OPT-NEXT: S_SENDMSG 3, implicit $exec, implicit $m0 ; OPT-NEXT: S_ENDPGM 0, implicit $vgpr97 ; ; NOOPT-LABEL: name: tbuffer_store1 - ; NOOPT: TBUFFER_STORE_FORMAT_XYZW_OFFSET_exact killed renamable $vgpr0_vgpr1_vgpr2_vgpr3, killed renamable $sgpr0_sgpr1_sgpr2_sgpr3, killed renamable $sgpr4, 42, 117, 0, 0, implicit $exec + ; NOOPT: TBUFFER_STORE_FORMAT_XYZW_OFFSET_exact killed renamable $vgpr0_vgpr1_vgpr2_vgpr3, killed renamable $sgpr0_sgpr1_sgpr2_sgpr3, killed renamable $sgpr4, 42, 117, 0, 0, implicit $exec :: (volatile store (s32), addrspace 1) ; NOOPT-NEXT: S_ENDPGM 0, implicit $vgpr97 - TBUFFER_STORE_FORMAT_XYZW_OFFSET_exact killed renamable $vgpr0_vgpr1_vgpr2_vgpr3, killed renamable $sgpr0_sgpr1_sgpr2_sgpr3, killed renamable $sgpr4, 42, 117, 0, 0, implicit $exec + TBUFFER_STORE_FORMAT_XYZW_OFFSET_exact killed renamable 
$vgpr0_vgpr1_vgpr2_vgpr3, killed renamable $sgpr0_sgpr1_sgpr2_sgpr3, killed renamable $sgpr4, 42, 117, 0, 0, implicit $exec :: (volatile store (s32), addrspace 1) S_ENDPGM 0, implicit $vgpr97 ... @@ -107,15 +109,15 @@ machineFunctionInfo: body: | bb.0: ; OPT-LABEL: name: buffer_store_format - ; OPT: BUFFER_STORE_FORMAT_D16_X_OFFEN_exact killed renamable $vgpr0, killed renamable $vgpr1, killed renamable $sgpr0_sgpr1_sgpr2_sgpr3, killed renamable $sgpr4, 0, 0, 0, implicit $exec + ; OPT: BUFFER_STORE_FORMAT_D16_X_OFFEN_exact killed renamable $vgpr0, killed renamable $vgpr1, killed renamable $sgpr0_sgpr1_sgpr2_sgpr3, killed renamable $sgpr4, 0, 0, 0, implicit $exec :: (volatile store (s32), addrspace 1) ; OPT-NEXT: S_NOP 0 ; OPT-NEXT: S_SENDMSG 3, implicit $exec, implicit $m0 ; OPT-NEXT: S_ENDPGM 0, implicit $vgpr97 ; ; NOOPT-LABEL: name: buffer_store_format - ; NOOPT: BUFFER_STORE_FORMAT_D16_X_OFFEN_exact killed renamable $vgpr0, killed renamable $vgpr1, killed renamable $sgpr0_sgpr1_sgpr2_sgpr3, killed renamable $sgpr4, 0, 0, 0, implicit $exec + ; NOOPT: BUFFER_STORE_FORMAT_D16_X_OFFEN_exact killed renamable $vgpr0, killed renamable $vgpr1, killed renamable $sgpr0_sgpr1_sgpr2_sgpr3, killed renamable $sgpr4, 0, 0, 0, implicit $exec :: (volatile store (s32), addrspace 1) ; NOOPT-NEXT: S_ENDPGM 0, implicit $vgpr97 - BUFFER_STORE_FORMAT_D16_X_OFFEN_exact killed renamable $vgpr0, killed renamable $vgpr1, killed renamable $sgpr0_sgpr1_sgpr2_sgpr3, killed renamable $sgpr4, 0, 0, 0, implicit $exec + BUFFER_STORE_FORMAT_D16_X_OFFEN_exact killed renamable $vgpr0, killed renamable $vgpr1, killed renamable $sgpr0_sgpr1_sgpr2_sgpr3, killed renamable $sgpr4, 0, 0, 0, implicit $exec :: (volatile store (s32), addrspace 1) S_ENDPGM 0, implicit $vgpr97 ... 
@@ -218,7 +220,7 @@ body: | ; OPT: bb.0: ; OPT-NEXT: successors: %bb.2(0x80000000) ; OPT-NEXT: {{ $}} - ; OPT-NEXT: TBUFFER_STORE_FORMAT_X_OFFSET_exact killed renamable $vgpr0, killed renamable $sgpr0_sgpr1_sgpr2_sgpr3, 0, 0, 125, 0, 0, implicit $exec + ; OPT-NEXT: TBUFFER_STORE_FORMAT_X_OFFSET_exact killed renamable $vgpr0, killed renamable $sgpr0_sgpr1_sgpr2_sgpr3, 0, 0, 125, 0, 0, implicit $exec :: (volatile store (s32), addrspace 1) ; OPT-NEXT: $vgpr1 = V_ADD_U32_e32 $vgpr0, $vgpr2, implicit $exec ; OPT-NEXT: S_BRANCH %bb.2 ; OPT-NEXT: {{ $}} @@ -226,7 +228,7 @@ body: | ; OPT-NEXT: successors: %bb.2(0x80000000) ; OPT-NEXT: {{ $}} ; OPT-NEXT: $vgpr1 = V_ADD_U32_e32 $vgpr0, $vgpr2, implicit $exec - ; OPT-NEXT: TBUFFER_STORE_FORMAT_X_OFFSET_exact killed renamable $vgpr0, killed renamable $sgpr0_sgpr1_sgpr2_sgpr3, 0, 0, 125, 0, 0, implicit $exec + ; OPT-NEXT: TBUFFER_STORE_FORMAT_X_OFFSET_exact killed renamable $vgpr0, killed renamable $sgpr0_sgpr1_sgpr2_sgpr3, 0, 0, 125, 0, 0, implicit $exec :: (volatile store (s32), addrspace 1) ; OPT-NEXT: S_BRANCH %bb.2 ; OPT-NEXT: {{ $}} ; OPT-NEXT: bb.2: @@ -238,7 +240,7 @@ body: | ; NOOPT: bb.0: ; NOOPT-NEXT: successors: %bb.2(0x80000000) ; NOOPT-NEXT: {{ $}} - ; NOOPT-NEXT: TBUFFER_STORE_FORMAT_X_OFFSET_exact killed renamable $vgpr0, killed renamable $sgpr0_sgpr1_sgpr2_sgpr3, 0, 0, 125, 0, 0, implicit $exec + ; NOOPT-NEXT: TBUFFER_STORE_FORMAT_X_OFFSET_exact killed renamable $vgpr0, killed renamable $sgpr0_sgpr1_sgpr2_sgpr3, 0, 0, 125, 0, 0, implicit $exec :: (volatile store (s32), addrspace 1) ; NOOPT-NEXT: $vgpr1 = V_ADD_U32_e32 $vgpr0, $vgpr2, implicit $exec ; NOOPT-NEXT: S_BRANCH %bb.2 ; NOOPT-NEXT: {{ $}} @@ -246,7 +248,7 @@ body: | ; NOOPT-NEXT: successors: %bb.2(0x80000000) ; NOOPT-NEXT: {{ $}} ; NOOPT-NEXT: $vgpr1 = V_ADD_U32_e32 $vgpr0, $vgpr2, implicit $exec - ; NOOPT-NEXT: TBUFFER_STORE_FORMAT_X_OFFSET_exact killed renamable $vgpr0, killed renamable $sgpr0_sgpr1_sgpr2_sgpr3, 0, 0, 125, 0, 0, implicit $exec + ; 
NOOPT-NEXT: TBUFFER_STORE_FORMAT_X_OFFSET_exact killed renamable $vgpr0, killed renamable $sgpr0_sgpr1_sgpr2_sgpr3, 0, 0, 125, 0, 0, implicit $exec :: (volatile store (s32), addrspace 1) ; NOOPT-NEXT: S_BRANCH %bb.2 ; NOOPT-NEXT: {{ $}} ; NOOPT-NEXT: bb.2: @@ -254,7 +256,7 @@ body: | bb.0: successors: %bb.2 - TBUFFER_STORE_FORMAT_X_OFFSET_exact killed renamable $vgpr0, killed renamable $sgpr0_sgpr1_sgpr2_sgpr3, 0, 0, 125, 0, 0, implicit $exec + TBUFFER_STORE_FORMAT_X_OFFSET_exact killed renamable $vgpr0, killed renamable $sgpr0_sgpr1_sgpr2_sgpr3, 0, 0, 125, 0, 0, implicit $exec :: (volatile store (s32), addrspace 1) $vgpr1 = V_ADD_U32_e32 renamable $vgpr0, renamable $vgpr2, implicit $exec S_BRANCH %bb.2 @@ -262,7 +264,7 @@ body: | successors: %bb.2 $vgpr1 = V_ADD_U32_e32 renamable $vgpr0, renamable $vgpr2, implicit $exec - TBUFFER_STORE_FORMAT_X_OFFSET_exact killed renamable $vgpr0, killed renamable $sgpr0_sgpr1_sgpr2_sgpr3, 0, 0, 125, 0, 0, implicit $exec + TBUFFER_STORE_FORMAT_X_OFFSET_exact killed renamable $vgpr0, killed renamable $sgpr0_sgpr1_sgpr2_sgpr3, 0, 0, 125, 0, 0, implicit $exec :: (volatile store (s32), addrspace 1) S_BRANCH %bb.2 bb.2: @@ -281,7 +283,7 @@ body: | ; OPT-NEXT: successors: %bb.2(0x80000000) ; OPT-NEXT: {{ $}} ; OPT-NEXT: $vgpr1 = V_ADD_U32_e32 $vgpr0, $vgpr2, implicit $exec - ; OPT-NEXT: TBUFFER_STORE_FORMAT_X_OFFSET_exact killed renamable $vgpr0, killed renamable $sgpr0_sgpr1_sgpr2_sgpr3, 0, 0, 125, 0, 0, implicit $exec + ; OPT-NEXT: TBUFFER_STORE_FORMAT_X_OFFSET_exact killed renamable $vgpr0, killed renamable $sgpr0_sgpr1_sgpr2_sgpr3, 0, 0, 125, 0, 0, implicit $exec :: (volatile store (s32), addrspace 1) ; OPT-NEXT: S_BRANCH %bb.2 ; OPT-NEXT: {{ $}} ; OPT-NEXT: bb.1: @@ -311,7 +313,7 @@ body: | ; NOOPT-NEXT: successors: %bb.2(0x80000000) ; NOOPT-NEXT: {{ $}} ; NOOPT-NEXT: $vgpr1 = V_ADD_U32_e32 $vgpr0, $vgpr2, implicit $exec - ; NOOPT-NEXT: TBUFFER_STORE_FORMAT_X_OFFSET_exact killed renamable $vgpr0, killed renamable 
$sgpr0_sgpr1_sgpr2_sgpr3, 0, 0, 125, 0, 0, implicit $exec + ; NOOPT-NEXT: TBUFFER_STORE_FORMAT_X_OFFSET_exact killed renamable $vgpr0, killed renamable $sgpr0_sgpr1_sgpr2_sgpr3, 0, 0, 125, 0, 0, implicit $exec :: (volatile store (s32), addrspace 1) ; NOOPT-NEXT: S_BRANCH %bb.2 ; NOOPT-NEXT: {{ $}} ; NOOPT-NEXT: bb.1: @@ -337,7 +339,7 @@ body: | successors: %bb.2 $vgpr1 = V_ADD_U32_e32 renamable $vgpr0, renamable $vgpr2, implicit $exec - TBUFFER_STORE_FORMAT_X_OFFSET_exact killed renamable $vgpr0, killed renamable $sgpr0_sgpr1_sgpr2_sgpr3, 0, 0, 125, 0, 0, implicit $exec + TBUFFER_STORE_FORMAT_X_OFFSET_exact killed renamable $vgpr0, killed renamable $sgpr0_sgpr1_sgpr2_sgpr3, 0, 0, 125, 0, 0, implicit $exec :: (volatile store (s32), addrspace 1) S_BRANCH %bb.2 bb.1: @@ -408,14 +410,14 @@ body: | ; OPT: bb.0: ; OPT-NEXT: successors: %bb.1(0x80000000) ; OPT-NEXT: {{ $}} - ; OPT-NEXT: renamable $vgpr0 = BUFFER_LOAD_FORMAT_X_IDXEN killed renamable $vgpr0, renamable $sgpr0_sgpr1_sgpr2_sgpr3, 0, 0, 0, 0, implicit $exec + ; OPT-NEXT: renamable $vgpr0 = BUFFER_LOAD_FORMAT_X_IDXEN killed renamable $vgpr0, renamable $sgpr0_sgpr1_sgpr2_sgpr3, 0, 0, 0, 0, implicit $exec :: (volatile load (s32), addrspace 1) ; OPT-NEXT: S_BRANCH %bb.1 ; OPT-NEXT: {{ $}} ; OPT-NEXT: bb.1: ; OPT-NEXT: successors: %bb.1(0x40000000), %bb.2(0x40000000) ; OPT-NEXT: {{ $}} ; OPT-NEXT: S_WAITCNT 1015 - ; OPT-NEXT: TBUFFER_STORE_FORMAT_XYZW_OFFEN_exact killed renamable $vgpr0_vgpr1_vgpr2_vgpr3, killed renamable $vgpr4, killed renamable $sgpr0_sgpr1_sgpr2_sgpr3, 0, 0, 115, 0, 0, implicit $exec + ; OPT-NEXT: TBUFFER_STORE_FORMAT_XYZW_OFFEN_exact killed renamable $vgpr0_vgpr1_vgpr2_vgpr3, killed renamable $vgpr4, killed renamable $sgpr0_sgpr1_sgpr2_sgpr3, 0, 0, 115, 0, 0, implicit $exec :: (volatile store (s32), addrspace 1) ; OPT-NEXT: S_CMP_LG_U32 killed renamable $sgpr3, renamable $sgpr4, implicit-def $scc ; OPT-NEXT: S_CBRANCH_SCC1 %bb.1, implicit killed $scc ; OPT-NEXT: S_BRANCH %bb.2 @@ -429,14 +431,14 
@@ body: | ; NOOPT: bb.0: ; NOOPT-NEXT: successors: %bb.1(0x80000000) ; NOOPT-NEXT: {{ $}} - ; NOOPT-NEXT: renamable $vgpr0 = BUFFER_LOAD_FORMAT_X_IDXEN killed renamable $vgpr0, renamable $sgpr0_sgpr1_sgpr2_sgpr3, 0, 0, 0, 0, implicit $exec + ; NOOPT-NEXT: renamable $vgpr0 = BUFFER_LOAD_FORMAT_X_IDXEN killed renamable $vgpr0, renamable $sgpr0_sgpr1_sgpr2_sgpr3, 0, 0, 0, 0, implicit $exec :: (volatile load (s32), addrspace 1) ; NOOPT-NEXT: S_BRANCH %bb.1 ; NOOPT-NEXT: {{ $}} ; NOOPT-NEXT: bb.1: ; NOOPT-NEXT: successors: %bb.1(0x40000000), %bb.2(0x40000000) ; NOOPT-NEXT: {{ $}} ; NOOPT-NEXT: S_WAITCNT 1015 - ; NOOPT-NEXT: TBUFFER_STORE_FORMAT_XYZW_OFFEN_exact killed renamable $vgpr0_vgpr1_vgpr2_vgpr3, killed renamable $vgpr4, killed renamable $sgpr0_sgpr1_sgpr2_sgpr3, 0, 0, 115, 0, 0, implicit $exec + ; NOOPT-NEXT: TBUFFER_STORE_FORMAT_XYZW_OFFEN_exact killed renamable $vgpr0_vgpr1_vgpr2_vgpr3, killed renamable $vgpr4, killed renamable $sgpr0_sgpr1_sgpr2_sgpr3, 0, 0, 115, 0, 0, implicit $exec :: (volatile store (s32), addrspace 1) ; NOOPT-NEXT: S_CMP_LG_U32 killed renamable $sgpr3, renamable $sgpr4, implicit-def $scc ; NOOPT-NEXT: S_CBRANCH_SCC1 %bb.1, implicit killed $scc ; NOOPT-NEXT: S_BRANCH %bb.2 @@ -446,13 +448,13 @@ body: | bb.0: successors: %bb.1 - renamable $vgpr0 = BUFFER_LOAD_FORMAT_X_IDXEN killed renamable $vgpr0, renamable $sgpr0_sgpr1_sgpr2_sgpr3, 0, 0, 0, 0, implicit $exec + renamable $vgpr0 = BUFFER_LOAD_FORMAT_X_IDXEN killed renamable $vgpr0, renamable $sgpr0_sgpr1_sgpr2_sgpr3, 0, 0, 0, 0, implicit $exec :: (volatile load (s32), addrspace 1) S_BRANCH %bb.1 bb.1: successors: %bb.1, %bb.2 - TBUFFER_STORE_FORMAT_XYZW_OFFEN_exact killed renamable $vgpr0_vgpr1_vgpr2_vgpr3, killed renamable $vgpr4, killed renamable $sgpr0_sgpr1_sgpr2_sgpr3, 0, 0, 115, 0, 0, implicit $exec + TBUFFER_STORE_FORMAT_XYZW_OFFEN_exact killed renamable $vgpr0_vgpr1_vgpr2_vgpr3, killed renamable $vgpr4, killed renamable $sgpr0_sgpr1_sgpr2_sgpr3, 0, 0, 115, 0, 0, implicit $exec :: 
(volatile store (s32), addrspace 1) S_CMP_LG_U32 killed renamable $sgpr3, renamable $sgpr4, implicit-def $scc S_CBRANCH_SCC1 %bb.1, implicit killed $scc S_BRANCH %bb.2 @@ -619,3 +621,29 @@ body: | GLOBAL_STORE_DWORD undef renamable $vgpr0_vgpr1, killed renamable $vgpr96, 0, 4, implicit $exec S_ENDPGM 0 ... + +--- +name: tbuffer_without_mmo_may_hit_scratch +machineFunctionInfo: + isEntryFunction: true +body: | + bb.0: + ; CHECK-LABEL: name: tbuffer_without_mmo_may_hit_scratch + ; CHECK: TBUFFER_STORE_FORMAT_XYZW_OFFSET_exact killed renamable $vgpr0_vgpr1_vgpr2_vgpr3, killed renamable $sgpr0_sgpr1_sgpr2_sgpr3, killed renamable $sgpr4, 42, 117, 0, 0, implicit $exec + ; CHECK-NEXT: S_ENDPGM 0, implicit $vgpr97 + TBUFFER_STORE_FORMAT_XYZW_OFFSET_exact killed renamable $vgpr0_vgpr1_vgpr2_vgpr3, killed renamable $sgpr0_sgpr1_sgpr2_sgpr3, killed renamable $sgpr4, 42, 117, 0, 0, implicit $exec + S_ENDPGM 0, implicit $vgpr97 +... + +--- +name: buffer_without_mmo_may_hit_scratch +machineFunctionInfo: + isEntryFunction: true +body: | + bb.0: + ; CHECK-LABEL: name: buffer_without_mmo_may_hit_scratch + ; CHECK: BUFFER_ATOMIC_ADD_F32_OFFEN killed renamable $vgpr0, killed renamable $vgpr2, killed renamable $sgpr0_sgpr1_sgpr2_sgpr3, 0, 0, 0, implicit $exec + ; CHECK-NEXT: S_ENDPGM 0, implicit $vgpr97 + BUFFER_ATOMIC_ADD_F32_OFFEN killed renamable $vgpr0, killed renamable $vgpr2, killed renamable $sgpr0_sgpr1_sgpr2_sgpr3, 0, 0, 0, implicit $exec + S_ENDPGM 0, implicit $vgpr97 +... From 2697c8cb459c1705f6c3a60c908462ca099e657f Mon Sep 17 00:00:00 2001 From: Fabian Ritter Date: Wed, 3 Dec 2025 11:13:52 +0100 Subject: [PATCH 23/36] [LowerMemIntrinsics] Factor control flow generation out of the memcpy lowering (#169039) So far, memcpy with known size, memcpy with unknown size, memmove with known size, and memmove with unknown size have individual optimized loop lowering implementations, while memset and memset.pattern use an unoptimized loop lowering. 
This patch extracts the parts of the memcpy lowerings (for known and unknown sizes) that generate the control flow for the loop expansion into an `insertLoopExpansion` function. The `createMemCpyLoop(Unk|K)nownSize` functions then only collect the necessary arguments for `insertLoopExpansion`, call it, and fill the generated loop basic blocks. The immediate benefit of this is that logic from the two memcpy lowerings is deduplicated. Moreover, it enables follow-up patches that will use `insertLoopExpansion` to optimize the memset and memset.pattern implementations similarly to memcpy, since they can use the exact same control flow patterns. The test changes are due to more consistent and useful basic block names in the loop expansion and an improvement in basic block ordering: previously, the basic block that determines if the residual loop is executed would be put at the end of the function, now it is put before the residual loop body. Otherwise, the generated code should be equivalent. This patch doesn't affect memmove; deduplicating its logic would also be nice, but to extract all CF generation from the memmove lowering, `insertLoopExpansion` would need to be able to also create code that iterates backwards over the argument buffers. That would make `insertLoopExpansion` a lot more complex for a code path that's only used for memmove, so it's probably not worth refactoring. For SWDEV-543208. 
--- .../Transforms/Utils/LowerMemIntrinsics.cpp | 518 +++++++++++------- .../CodeGen/AMDGPU/GlobalISel/llvm.memcpy.ll | 4 +- .../AMDGPU/buffer-fat-pointers-memcpy.ll | 32 +- .../lower-buffer-fat-pointers-mem-transfer.ll | 144 ++--- .../CodeGen/AMDGPU/lower-mem-intrinsics.ll | 420 +++++++------- .../CodeGen/AMDGPU/memcpy-crash-issue63986.ll | 54 +- .../CodeGen/AMDGPU/memintrinsic-unroll.ll | 60 +- llvm/test/CodeGen/AMDGPU/memmove-var-size.ll | 120 ++-- llvm/test/CodeGen/NVPTX/lower-aggr-copies.ll | 28 +- .../Transforms/Utils/MemTransferLowering.cpp | 9 +- 10 files changed, 733 insertions(+), 656 deletions(-) diff --git a/llvm/lib/Transforms/Utils/LowerMemIntrinsics.cpp b/llvm/lib/Transforms/Utils/LowerMemIntrinsics.cpp index 18b0f617ca232..4ab99edd64baa 100644 --- a/llvm/lib/Transforms/Utils/LowerMemIntrinsics.cpp +++ b/llvm/lib/Transforms/Utils/LowerMemIntrinsics.cpp @@ -21,6 +21,218 @@ using namespace llvm; +/// \returns \p Len urem \p OpSize, checking for optimization opportunities. +/// \p OpSizeVal must be the integer value of the \c ConstantInt \p OpSize. +static Value *getRuntimeLoopRemainder(IRBuilderBase &B, Value *Len, + Value *OpSize, unsigned OpSizeVal) { + // For powers of 2, we can and by (OpSizeVal - 1) instead of using urem. + if (isPowerOf2_32(OpSizeVal)) + return B.CreateAnd(Len, OpSizeVal - 1); + return B.CreateURem(Len, OpSize); +} + +/// \returns (\p Len udiv \p OpSize) mul \p OpSize, checking for optimization +/// opportunities. +/// If \p RTLoopRemainder is provided, it must be the result of +/// \c getRuntimeLoopRemainder() with the same arguments. +static Value *getRuntimeLoopUnits(IRBuilderBase &B, Value *Len, Value *OpSize, + unsigned OpSizeVal, + Value *RTLoopRemainder = nullptr) { + if (!RTLoopRemainder) + RTLoopRemainder = getRuntimeLoopRemainder(B, Len, OpSize, OpSizeVal); + return B.CreateSub(Len, RTLoopRemainder); +} + +namespace { +/// Container for the return values of insertLoopExpansion. 
+struct LoopExpansionInfo { + /// The instruction at the end of the main loop body. + Instruction *MainLoopIP = nullptr; + + /// The unit index in the main loop body. + Value *MainLoopIndex = nullptr; + + /// The instruction at the end of the residual loop body. Can be nullptr if no + /// residual is required. + Instruction *ResidualLoopIP = nullptr; + + /// The unit index in the residual loop body. Can be nullptr if no residual is + /// required. + Value *ResidualLoopIndex = nullptr; +}; +} // namespace + +/// Insert the control flow and loop counters for a memcpy/memset loop +/// expansion. +/// +/// This function inserts IR corresponding to the following C code before +/// \p InsertBefore: +/// \code +/// LoopUnits = (Len / MainLoopStep) * MainLoopStep; +/// ResidualUnits = Len - LoopUnits; +/// MainLoopIndex = 0; +/// if (LoopUnits > 0) { +/// do { +/// // MainLoopIP +/// MainLoopIndex += MainLoopStep; +/// } while (MainLoopIndex < LoopUnits); +/// } +/// for (size_t i = 0; i < ResidualUnits; i += ResidualLoopStep) { +/// ResidualLoopIndex = LoopUnits + i; +/// // ResidualLoopIP +/// } +/// \endcode +/// +/// \p MainLoopStep and \p ResidualLoopStep determine by how many "units" the +/// loop index is increased in each iteration of the main and residual loops, +/// respectively. In most cases, the "unit" will be bytes, but larger units are +/// useful for lowering memset.pattern. +/// +/// The computation of \c LoopUnits and \c ResidualUnits is performed at compile +/// time if \p Len is a \c ConstantInt. +/// The second (residual) loop is omitted if \p ResidualLoopStep is 0 or equal +/// to \p MainLoopStep. +/// The generated \c MainLoopIP, \c MainLoopIndex, \c ResidualLoopIP, and +/// \c ResidualLoopIndex are returned in a \c LoopExpansionInfo object. 
+static LoopExpansionInfo insertLoopExpansion(Instruction *InsertBefore, + Value *Len, unsigned MainLoopStep, + unsigned ResidualLoopStep, + StringRef BBNamePrefix) { + assert((ResidualLoopStep == 0 || MainLoopStep % ResidualLoopStep == 0) && + "ResidualLoopStep must divide MainLoopStep if specified"); + assert(ResidualLoopStep <= MainLoopStep && + "ResidualLoopStep cannot be larger than MainLoopStep"); + assert(MainLoopStep > 0 && "MainLoopStep must be non-zero"); + LoopExpansionInfo LEI; + BasicBlock *PreLoopBB = InsertBefore->getParent(); + BasicBlock *PostLoopBB = PreLoopBB->splitBasicBlock( + InsertBefore, BBNamePrefix + "-post-expansion"); + Function *ParentFunc = PreLoopBB->getParent(); + LLVMContext &Ctx = PreLoopBB->getContext(); + IRBuilder<> PreLoopBuilder(PreLoopBB->getTerminator()); + + // Calculate the main loop trip count and remaining units to cover after the + // loop. + Type *LenType = Len->getType(); + IntegerType *ILenType = cast(LenType); + ConstantInt *CIMainLoopStep = ConstantInt::get(ILenType, MainLoopStep); + + Value *LoopUnits = Len; + Value *ResidualUnits = nullptr; + // We can make a conditional branch unconditional if we know that the + // MainLoop must be executed at least once. + bool MustTakeMainLoop = false; + if (MainLoopStep != 1) { + if (auto *CLen = dyn_cast(Len)) { + uint64_t TotalUnits = CLen->getZExtValue(); + uint64_t LoopEndCount = alignDown(TotalUnits, MainLoopStep); + uint64_t ResidualCount = TotalUnits - LoopEndCount; + LoopUnits = ConstantInt::get(LenType, LoopEndCount); + ResidualUnits = ConstantInt::get(LenType, ResidualCount); + MustTakeMainLoop = LoopEndCount > 0; + // As an optimization, we could skip generating the residual loop if + // ResidualCount is known to be 0. However, current uses of this function + // don't request a residual loop if the length is constant (they generate + // a (potentially empty) sequence of loads and stores instead), so this + // optimization would have no effect here. 
+ } else { + ResidualUnits = getRuntimeLoopRemainder(PreLoopBuilder, Len, + CIMainLoopStep, MainLoopStep); + LoopUnits = getRuntimeLoopUnits(PreLoopBuilder, Len, CIMainLoopStep, + MainLoopStep, ResidualUnits); + } + } else if (auto *CLen = dyn_cast(Len)) { + MustTakeMainLoop = CLen->getZExtValue() > 0; + } + + BasicBlock *MainLoopBB = BasicBlock::Create( + Ctx, BBNamePrefix + "-expansion-main-body", ParentFunc, PostLoopBB); + IRBuilder<> LoopBuilder(MainLoopBB); + + PHINode *LoopIndex = LoopBuilder.CreatePHI(LenType, 2, "loop-index"); + LEI.MainLoopIndex = LoopIndex; + LoopIndex->addIncoming(ConstantInt::get(LenType, 0U), PreLoopBB); + + Value *NewIndex = + LoopBuilder.CreateAdd(LoopIndex, ConstantInt::get(LenType, MainLoopStep)); + LoopIndex->addIncoming(NewIndex, MainLoopBB); + + // One argument of the addition is a loop-variant PHI, so it must be an + // Instruction (i.e., it cannot be a Constant). + LEI.MainLoopIP = cast(NewIndex); + + if (ResidualLoopStep > 0 && ResidualLoopStep < MainLoopStep) { + // Loop body for the residual accesses. + BasicBlock *ResLoopBB = + BasicBlock::Create(Ctx, BBNamePrefix + "-expansion-residual-body", + PreLoopBB->getParent(), PostLoopBB); + // BB to check if the residual loop is needed. + BasicBlock *ResidualCondBB = + BasicBlock::Create(Ctx, BBNamePrefix + "-expansion-residual-cond", + PreLoopBB->getParent(), ResLoopBB); + + // Enter the MainLoop unless no main loop iteration is required. + ConstantInt *Zero = ConstantInt::get(ILenType, 0U); + if (MustTakeMainLoop) + PreLoopBuilder.CreateBr(MainLoopBB); + else + PreLoopBuilder.CreateCondBr(PreLoopBuilder.CreateICmpNE(LoopUnits, Zero), + MainLoopBB, ResidualCondBB); + PreLoopBB->getTerminator()->eraseFromParent(); + + // Stay in the MainLoop until we have handled all the LoopUnits. Then go to + // the residual condition BB. 
+ LoopBuilder.CreateCondBr(LoopBuilder.CreateICmpULT(NewIndex, LoopUnits), + MainLoopBB, ResidualCondBB); + + // Determine if we need to branch to the residual loop or bypass it. + IRBuilder<> RCBuilder(ResidualCondBB); + RCBuilder.CreateCondBr(RCBuilder.CreateICmpNE(ResidualUnits, Zero), + ResLoopBB, PostLoopBB); + + IRBuilder<> ResBuilder(ResLoopBB); + PHINode *ResidualIndex = + ResBuilder.CreatePHI(LenType, 2, "residual-loop-index"); + ResidualIndex->addIncoming(Zero, ResidualCondBB); + + // Add the offset at the end of the main loop to the loop counter of the + // residual loop to get the proper index. + Value *FullOffset = ResBuilder.CreateAdd(LoopUnits, ResidualIndex); + LEI.ResidualLoopIndex = FullOffset; + + Value *ResNewIndex = ResBuilder.CreateAdd( + ResidualIndex, ConstantInt::get(LenType, ResidualLoopStep)); + ResidualIndex->addIncoming(ResNewIndex, ResLoopBB); + + // One argument of the addition is a loop-variant PHI, so it must be an + // Instruction (i.e., it cannot be a Constant). + LEI.ResidualLoopIP = cast(ResNewIndex); + + // Stay in the residual loop until all ResidualUnits are handled. + ResBuilder.CreateCondBr( + ResBuilder.CreateICmpULT(ResNewIndex, ResidualUnits), ResLoopBB, + PostLoopBB); + } else { + // There is no need for a residual loop after the main loop. We do however + // need to patch up the control flow by creating the terminators for the + // preloop block and the main loop. + + // Enter the MainLoop unless no main loop iteration is required. + if (MustTakeMainLoop) { + PreLoopBuilder.CreateBr(MainLoopBB); + } else { + ConstantInt *Zero = ConstantInt::get(ILenType, 0U); + PreLoopBuilder.CreateCondBr(PreLoopBuilder.CreateICmpNE(LoopUnits, Zero), + MainLoopBB, PostLoopBB); + } + PreLoopBB->getTerminator()->eraseFromParent(); + // Stay in the MainLoop until we have handled all the LoopUnits. 
+ LoopBuilder.CreateCondBr(LoopBuilder.CreateICmpULT(NewIndex, LoopUnits), + MainLoopBB, PostLoopBB); + } + return LEI; +} + void llvm::createMemCpyLoopKnownSize( Instruction *InsertBefore, Value *SrcAddr, Value *DstAddr, ConstantInt *CopyLen, Align SrcAlign, Align DstAlign, bool SrcIsVolatile, @@ -31,7 +243,6 @@ void llvm::createMemCpyLoopKnownSize( return; BasicBlock *PreLoopBB = InsertBefore->getParent(); - BasicBlock *PostLoopBB = nullptr; Function *ParentFunc = PreLoopBB->getParent(); LLVMContext &Ctx = PreLoopBB->getContext(); const DataLayout &DL = ParentFunc->getDataLayout(); @@ -56,37 +267,32 @@ void llvm::createMemCpyLoopKnownSize( uint64_t LoopEndCount = alignDown(CopyLen->getZExtValue(), LoopOpSize); + // Skip the loop expansion entirely if the loop would never be taken. if (LoopEndCount != 0) { - // Split - PostLoopBB = PreLoopBB->splitBasicBlock(InsertBefore, "memcpy-split"); - BasicBlock *LoopBB = - BasicBlock::Create(Ctx, "load-store-loop", ParentFunc, PostLoopBB); - PreLoopBB->getTerminator()->setSuccessor(0, LoopBB); - - IRBuilder<> PLBuilder(PreLoopBB->getTerminator()); + LoopExpansionInfo LEI = insertLoopExpansion(InsertBefore, CopyLen, + LoopOpSize, 0, "static-memcpy"); + // Fill MainLoopBB + IRBuilder<> MainLoopBuilder(LEI.MainLoopIP); Align PartDstAlign(commonAlignment(DstAlign, LoopOpSize)); Align PartSrcAlign(commonAlignment(SrcAlign, LoopOpSize)); - IRBuilder<> LoopBuilder(LoopBB); - PHINode *LoopIndex = LoopBuilder.CreatePHI(TypeOfCopyLen, 2, "loop-index"); - LoopIndex->addIncoming(ConstantInt::get(TypeOfCopyLen, 0U), PreLoopBB); - // Loop Body - // If we used LoopOpType as GEP element type, we would iterate over the // buffers in TypeStoreSize strides while copying TypeAllocSize bytes, i.e., // we would miss bytes if TypeStoreSize != TypeAllocSize. Therefore, use // byte offsets computed from the TypeStoreSize. 
- Value *SrcGEP = LoopBuilder.CreateInBoundsGEP(Int8Type, SrcAddr, LoopIndex); - LoadInst *Load = LoopBuilder.CreateAlignedLoad(LoopOpType, SrcGEP, - PartSrcAlign, SrcIsVolatile); + Value *SrcGEP = + MainLoopBuilder.CreateInBoundsGEP(Int8Type, SrcAddr, LEI.MainLoopIndex); + LoadInst *Load = MainLoopBuilder.CreateAlignedLoad( + LoopOpType, SrcGEP, PartSrcAlign, SrcIsVolatile); if (!CanOverlap) { // Set alias scope for loads. Load->setMetadata(LLVMContext::MD_alias_scope, MDNode::get(Ctx, NewScope)); } - Value *DstGEP = LoopBuilder.CreateInBoundsGEP(Int8Type, DstAddr, LoopIndex); - StoreInst *Store = LoopBuilder.CreateAlignedStore( + Value *DstGEP = + MainLoopBuilder.CreateInBoundsGEP(Int8Type, DstAddr, LEI.MainLoopIndex); + StoreInst *Store = MainLoopBuilder.CreateAlignedStore( Load, DstGEP, PartDstAlign, DstIsVolatile); if (!CanOverlap) { // Indicate that stores don't overlap loads. @@ -96,96 +302,63 @@ void llvm::createMemCpyLoopKnownSize( Load->setAtomic(AtomicOrdering::Unordered); Store->setAtomic(AtomicOrdering::Unordered); } - Value *NewIndex = LoopBuilder.CreateAdd( - LoopIndex, ConstantInt::get(TypeOfCopyLen, LoopOpSize)); - LoopIndex->addIncoming(NewIndex, LoopBB); - - // Create the loop branch condition. - Constant *LoopEndCI = ConstantInt::get(TypeOfCopyLen, LoopEndCount); - LoopBuilder.CreateCondBr(LoopBuilder.CreateICmpULT(NewIndex, LoopEndCI), - LoopBB, PostLoopBB); + assert(!LEI.ResidualLoopIP && !LEI.ResidualLoopIndex && + "No residual loop was requested"); } + // Copy the remaining bytes with straight-line code. uint64_t BytesCopied = LoopEndCount; uint64_t RemainingBytes = CopyLen->getZExtValue() - BytesCopied; - if (RemainingBytes) { - BasicBlock::iterator InsertIt = PostLoopBB ? 
PostLoopBB->getFirstNonPHIIt() - : InsertBefore->getIterator(); - IRBuilder<> RBuilder(InsertIt->getParent(), InsertIt); + if (RemainingBytes == 0) + return; - SmallVector RemainingOps; - TTI.getMemcpyLoopResidualLoweringType(RemainingOps, Ctx, RemainingBytes, - SrcAS, DstAS, SrcAlign, DstAlign, - AtomicElementSize); + IRBuilder<> RBuilder(InsertBefore); + SmallVector RemainingOps; + TTI.getMemcpyLoopResidualLoweringType(RemainingOps, Ctx, RemainingBytes, + SrcAS, DstAS, SrcAlign, DstAlign, + AtomicElementSize); - for (auto *OpTy : RemainingOps) { - Align PartSrcAlign(commonAlignment(SrcAlign, BytesCopied)); - Align PartDstAlign(commonAlignment(DstAlign, BytesCopied)); - - unsigned OperandSize = DL.getTypeStoreSize(OpTy); - assert( - (!AtomicElementSize || OperandSize % *AtomicElementSize == 0) && - "Atomic memcpy lowering is not supported for selected operand size"); - - Value *SrcGEP = RBuilder.CreateInBoundsGEP( - Int8Type, SrcAddr, ConstantInt::get(TypeOfCopyLen, BytesCopied)); - LoadInst *Load = - RBuilder.CreateAlignedLoad(OpTy, SrcGEP, PartSrcAlign, SrcIsVolatile); - if (!CanOverlap) { - // Set alias scope for loads. - Load->setMetadata(LLVMContext::MD_alias_scope, - MDNode::get(Ctx, NewScope)); - } - Value *DstGEP = RBuilder.CreateInBoundsGEP( - Int8Type, DstAddr, ConstantInt::get(TypeOfCopyLen, BytesCopied)); - StoreInst *Store = RBuilder.CreateAlignedStore(Load, DstGEP, PartDstAlign, - DstIsVolatile); - if (!CanOverlap) { - // Indicate that stores don't overlap loads. 
- Store->setMetadata(LLVMContext::MD_noalias, MDNode::get(Ctx, NewScope)); - } - if (AtomicElementSize) { - Load->setAtomic(AtomicOrdering::Unordered); - Store->setAtomic(AtomicOrdering::Unordered); - } - BytesCopied += OperandSize; + for (auto *OpTy : RemainingOps) { + Align PartSrcAlign(commonAlignment(SrcAlign, BytesCopied)); + Align PartDstAlign(commonAlignment(DstAlign, BytesCopied)); + + unsigned OperandSize = DL.getTypeStoreSize(OpTy); + assert((!AtomicElementSize || OperandSize % *AtomicElementSize == 0) && + "Atomic memcpy lowering is not supported for selected operand size"); + + Value *SrcGEP = RBuilder.CreateInBoundsGEP( + Int8Type, SrcAddr, ConstantInt::get(TypeOfCopyLen, BytesCopied)); + LoadInst *Load = + RBuilder.CreateAlignedLoad(OpTy, SrcGEP, PartSrcAlign, SrcIsVolatile); + if (!CanOverlap) { + // Set alias scope for loads. + Load->setMetadata(LLVMContext::MD_alias_scope, + MDNode::get(Ctx, NewScope)); + } + Value *DstGEP = RBuilder.CreateInBoundsGEP( + Int8Type, DstAddr, ConstantInt::get(TypeOfCopyLen, BytesCopied)); + StoreInst *Store = + RBuilder.CreateAlignedStore(Load, DstGEP, PartDstAlign, DstIsVolatile); + if (!CanOverlap) { + // Indicate that stores don't overlap loads. + Store->setMetadata(LLVMContext::MD_noalias, MDNode::get(Ctx, NewScope)); } + if (AtomicElementSize) { + Load->setAtomic(AtomicOrdering::Unordered); + Store->setAtomic(AtomicOrdering::Unordered); + } + BytesCopied += OperandSize; } assert(BytesCopied == CopyLen->getZExtValue() && "Bytes copied should match size in the call!"); } -// \returns \p Len urem \p OpSize, checking for optimization opportunities. -static Value *getRuntimeLoopRemainder(const DataLayout &DL, IRBuilderBase &B, - Value *Len, Value *OpSize, - unsigned OpSizeVal) { - // For powers of 2, we can and by (OpSizeVal - 1) instead of using urem. 
- if (isPowerOf2_32(OpSizeVal)) - return B.CreateAnd(Len, OpSizeVal - 1); - return B.CreateURem(Len, OpSize); -} - -// \returns (\p Len udiv \p OpSize) mul \p OpSize, checking for optimization -// opportunities. -// If RTLoopRemainder is provided, it must be the result of -// getRuntimeLoopRemainder() with the same arguments. -static Value *getRuntimeLoopBytes(const DataLayout &DL, IRBuilderBase &B, - Value *Len, Value *OpSize, unsigned OpSizeVal, - Value *RTLoopRemainder = nullptr) { - if (!RTLoopRemainder) - RTLoopRemainder = getRuntimeLoopRemainder(DL, B, Len, OpSize, OpSizeVal); - return B.CreateSub(Len, RTLoopRemainder); -} - void llvm::createMemCpyLoopUnknownSize( Instruction *InsertBefore, Value *SrcAddr, Value *DstAddr, Value *CopyLen, Align SrcAlign, Align DstAlign, bool SrcIsVolatile, bool DstIsVolatile, bool CanOverlap, const TargetTransformInfo &TTI, std::optional AtomicElementSize) { BasicBlock *PreLoopBB = InsertBefore->getParent(); - BasicBlock *PostLoopBB = - PreLoopBB->splitBasicBlock(InsertBefore, "post-loop-memcpy-expansion"); - Function *ParentFunc = PreLoopBB->getParent(); const DataLayout &DL = ParentFunc->getDataLayout(); LLVMContext &Ctx = PreLoopBB->getContext(); @@ -205,50 +378,39 @@ void llvm::createMemCpyLoopUnknownSize( assert((!AtomicElementSize || LoopOpSize % *AtomicElementSize == 0) && "Atomic memcpy lowering is not supported for selected operand size"); - IRBuilder<> PLBuilder(PreLoopBB->getTerminator()); - - // Calculate the loop trip count, and remaining bytes to copy after the loop. 
- Type *CopyLenType = CopyLen->getType(); - IntegerType *ILengthType = dyn_cast(CopyLenType); - assert(ILengthType && - "expected size argument to memcpy to be an integer type!"); Type *Int8Type = Type::getInt8Ty(Ctx); - bool LoopOpIsInt8 = LoopOpType == Int8Type; - ConstantInt *CILoopOpSize = ConstantInt::get(ILengthType, LoopOpSize); - Value *RuntimeLoopBytes = CopyLen; - Value *RuntimeResidualBytes = nullptr; - if (!LoopOpIsInt8) { - RuntimeResidualBytes = getRuntimeLoopRemainder(DL, PLBuilder, CopyLen, - CILoopOpSize, LoopOpSize); - RuntimeLoopBytes = getRuntimeLoopBytes(DL, PLBuilder, CopyLen, CILoopOpSize, - LoopOpSize, RuntimeResidualBytes); - } + Type *ResidualLoopOpType = AtomicElementSize + ? Type::getIntNTy(Ctx, *AtomicElementSize * 8) + : Int8Type; + unsigned ResidualLoopOpSize = DL.getTypeStoreSize(ResidualLoopOpType); + assert(ResidualLoopOpSize == (AtomicElementSize ? *AtomicElementSize : 1) && + "Store size is expected to match type size"); - BasicBlock *LoopBB = - BasicBlock::Create(Ctx, "loop-memcpy-expansion", ParentFunc, PostLoopBB); - IRBuilder<> LoopBuilder(LoopBB); + LoopExpansionInfo LEI = insertLoopExpansion( + InsertBefore, CopyLen, LoopOpSize, ResidualLoopOpSize, "dynamic-memcpy"); + // Fill MainLoopBB + IRBuilder<> MainLoopBuilder(LEI.MainLoopIP); Align PartSrcAlign(commonAlignment(SrcAlign, LoopOpSize)); Align PartDstAlign(commonAlignment(DstAlign, LoopOpSize)); - PHINode *LoopIndex = LoopBuilder.CreatePHI(CopyLenType, 2, "loop-index"); - LoopIndex->addIncoming(ConstantInt::get(CopyLenType, 0U), PreLoopBB); - // If we used LoopOpType as GEP element type, we would iterate over the // buffers in TypeStoreSize strides while copying TypeAllocSize bytes, i.e., // we would miss bytes if TypeStoreSize != TypeAllocSize. Therefore, use byte // offsets computed from the TypeStoreSize. 
- Value *SrcGEP = LoopBuilder.CreateInBoundsGEP(Int8Type, SrcAddr, LoopIndex); - LoadInst *Load = LoopBuilder.CreateAlignedLoad(LoopOpType, SrcGEP, - PartSrcAlign, SrcIsVolatile); + Value *SrcGEP = + MainLoopBuilder.CreateInBoundsGEP(Int8Type, SrcAddr, LEI.MainLoopIndex); + LoadInst *Load = MainLoopBuilder.CreateAlignedLoad( + LoopOpType, SrcGEP, PartSrcAlign, SrcIsVolatile); if (!CanOverlap) { // Set alias scope for loads. Load->setMetadata(LLVMContext::MD_alias_scope, MDNode::get(Ctx, NewScope)); } - Value *DstGEP = LoopBuilder.CreateInBoundsGEP(Int8Type, DstAddr, LoopIndex); - StoreInst *Store = - LoopBuilder.CreateAlignedStore(Load, DstGEP, PartDstAlign, DstIsVolatile); + Value *DstGEP = + MainLoopBuilder.CreateInBoundsGEP(Int8Type, DstAddr, LEI.MainLoopIndex); + StoreInst *Store = MainLoopBuilder.CreateAlignedStore( + Load, DstGEP, PartDstAlign, DstIsVolatile); if (!CanOverlap) { // Indicate that stores don't overlap loads. Store->setMetadata(LLVMContext::MD_noalias, MDNode::get(Ctx, NewScope)); @@ -257,95 +419,35 @@ void llvm::createMemCpyLoopUnknownSize( Load->setAtomic(AtomicOrdering::Unordered); Store->setAtomic(AtomicOrdering::Unordered); } - Value *NewIndex = LoopBuilder.CreateAdd( - LoopIndex, ConstantInt::get(CopyLenType, LoopOpSize)); - LoopIndex->addIncoming(NewIndex, LoopBB); - - bool RequiresResidual = - !LoopOpIsInt8 && !(AtomicElementSize && LoopOpSize == AtomicElementSize); - if (RequiresResidual) { - Type *ResLoopOpType = AtomicElementSize - ? Type::getIntNTy(Ctx, *AtomicElementSize * 8) - : Int8Type; - unsigned ResLoopOpSize = DL.getTypeStoreSize(ResLoopOpType); - assert((ResLoopOpSize == AtomicElementSize ? *AtomicElementSize : 1) && - "Store size is expected to match type size"); - - Align ResSrcAlign(commonAlignment(PartSrcAlign, ResLoopOpSize)); - Align ResDstAlign(commonAlignment(PartDstAlign, ResLoopOpSize)); - - // Loop body for the residual copy. 
- BasicBlock *ResLoopBB = BasicBlock::Create( - Ctx, "loop-memcpy-residual", PreLoopBB->getParent(), PostLoopBB); - // Residual loop header. - BasicBlock *ResHeaderBB = BasicBlock::Create( - Ctx, "loop-memcpy-residual-header", PreLoopBB->getParent(), nullptr); - - // Need to update the pre-loop basic block to branch to the correct place. - // branch to the main loop if the count is non-zero, branch to the residual - // loop if the copy size is smaller then 1 iteration of the main loop but - // non-zero and finally branch to after the residual loop if the memcpy - // size is zero. - ConstantInt *Zero = ConstantInt::get(ILengthType, 0U); - PLBuilder.CreateCondBr(PLBuilder.CreateICmpNE(RuntimeLoopBytes, Zero), - LoopBB, ResHeaderBB); - PreLoopBB->getTerminator()->eraseFromParent(); - LoopBuilder.CreateCondBr( - LoopBuilder.CreateICmpULT(NewIndex, RuntimeLoopBytes), LoopBB, - ResHeaderBB); - - // Determine if we need to branch to the residual loop or bypass it. - IRBuilder<> RHBuilder(ResHeaderBB); - RHBuilder.CreateCondBr(RHBuilder.CreateICmpNE(RuntimeResidualBytes, Zero), - ResLoopBB, PostLoopBB); + // Fill ResidualLoopBB. + if (!LEI.ResidualLoopIP) + return; - // Copy the residual with single byte load/store loop. - IRBuilder<> ResBuilder(ResLoopBB); - PHINode *ResidualIndex = - ResBuilder.CreatePHI(CopyLenType, 2, "residual-loop-index"); - ResidualIndex->addIncoming(Zero, ResHeaderBB); + Align ResSrcAlign(commonAlignment(PartSrcAlign, ResidualLoopOpSize)); + Align ResDstAlign(commonAlignment(PartDstAlign, ResidualLoopOpSize)); - Value *FullOffset = ResBuilder.CreateAdd(RuntimeLoopBytes, ResidualIndex); - Value *SrcGEP = ResBuilder.CreateInBoundsGEP(Int8Type, SrcAddr, FullOffset); - LoadInst *Load = ResBuilder.CreateAlignedLoad(ResLoopOpType, SrcGEP, - ResSrcAlign, SrcIsVolatile); - if (!CanOverlap) { - // Set alias scope for loads. 
- Load->setMetadata(LLVMContext::MD_alias_scope, - MDNode::get(Ctx, NewScope)); - } - Value *DstGEP = ResBuilder.CreateInBoundsGEP(Int8Type, DstAddr, FullOffset); - StoreInst *Store = - ResBuilder.CreateAlignedStore(Load, DstGEP, ResDstAlign, DstIsVolatile); - if (!CanOverlap) { - // Indicate that stores don't overlap loads. - Store->setMetadata(LLVMContext::MD_noalias, MDNode::get(Ctx, NewScope)); - } - if (AtomicElementSize) { - Load->setAtomic(AtomicOrdering::Unordered); - Store->setAtomic(AtomicOrdering::Unordered); - } - Value *ResNewIndex = ResBuilder.CreateAdd( - ResidualIndex, ConstantInt::get(CopyLenType, ResLoopOpSize)); - ResidualIndex->addIncoming(ResNewIndex, ResLoopBB); - - // Create the loop branch condition. - ResBuilder.CreateCondBr( - ResBuilder.CreateICmpULT(ResNewIndex, RuntimeResidualBytes), ResLoopBB, - PostLoopBB); - } else { - // In this case the loop operand type was a byte, and there is no need for a - // residual loop to copy the remaining memory after the main loop. - // We do however need to patch up the control flow by creating the - // terminators for the preloop block and the memcpy loop. - ConstantInt *Zero = ConstantInt::get(ILengthType, 0U); - PLBuilder.CreateCondBr(PLBuilder.CreateICmpNE(RuntimeLoopBytes, Zero), - LoopBB, PostLoopBB); - PreLoopBB->getTerminator()->eraseFromParent(); - LoopBuilder.CreateCondBr( - LoopBuilder.CreateICmpULT(NewIndex, RuntimeLoopBytes), LoopBB, - PostLoopBB); + IRBuilder<> ResLoopBuilder(LEI.ResidualLoopIP); + Value *ResSrcGEP = ResLoopBuilder.CreateInBoundsGEP(Int8Type, SrcAddr, + LEI.ResidualLoopIndex); + LoadInst *ResLoad = ResLoopBuilder.CreateAlignedLoad( + ResidualLoopOpType, ResSrcGEP, ResSrcAlign, SrcIsVolatile); + if (!CanOverlap) { + // Set alias scope for loads. 
+ ResLoad->setMetadata(LLVMContext::MD_alias_scope, + MDNode::get(Ctx, NewScope)); + } + Value *ResDstGEP = ResLoopBuilder.CreateInBoundsGEP(Int8Type, DstAddr, + LEI.ResidualLoopIndex); + StoreInst *ResStore = ResLoopBuilder.CreateAlignedStore( + ResLoad, ResDstGEP, ResDstAlign, DstIsVolatile); + if (!CanOverlap) { + // Indicate that stores don't overlap loads. + ResStore->setMetadata(LLVMContext::MD_noalias, MDNode::get(Ctx, NewScope)); + } + if (AtomicElementSize) { + ResLoad->setAtomic(AtomicOrdering::Unordered); + ResStore->setAtomic(AtomicOrdering::Unordered); } } @@ -439,9 +541,9 @@ static void createMemMoveLoopUnknownSize(Instruction *InsertBefore, Value *RuntimeLoopRemainder = nullptr; Value *SkipResidualCondition = nullptr; if (RequiresResidual) { - RuntimeLoopRemainder = getRuntimeLoopRemainder(DL, PLBuilder, CopyLen, - CILoopOpSize, LoopOpSize); - RuntimeLoopBytes = getRuntimeLoopBytes(DL, PLBuilder, CopyLen, CILoopOpSize, + RuntimeLoopRemainder = + getRuntimeLoopRemainder(PLBuilder, CopyLen, CILoopOpSize, LoopOpSize); + RuntimeLoopBytes = getRuntimeLoopUnits(PLBuilder, CopyLen, CILoopOpSize, LoopOpSize, RuntimeLoopRemainder); SkipResidualCondition = PLBuilder.CreateICmpEQ(RuntimeLoopRemainder, Zero, "skip_residual"); diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.memcpy.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.memcpy.ll index e0016b0a5a64d..993fb7eeb3aa9 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.memcpy.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.memcpy.ll @@ -12,7 +12,7 @@ define amdgpu_cs void @memcpy_p1i8(ptr addrspace(1) %dst, ptr addrspace(1) %src) ; LOOP-NEXT: s_mov_b32 s3, 0xf000 ; LOOP-NEXT: v_mov_b32_e32 v5, s1 ; LOOP-NEXT: v_mov_b32_e32 v4, s0 -; LOOP-NEXT: .LBB0_1: ; %load-store-loop +; LOOP-NEXT: .LBB0_1: ; %static-memcpy-expansion-main-body ; LOOP-NEXT: ; =>This Inner Loop Header: Depth=1 ; LOOP-NEXT: v_add_i32_e32 v6, vcc, v2, v4 ; LOOP-NEXT: v_addc_u32_e32 v7, vcc, v3, v5, vcc @@ -177,7 +177,7 @@ define 
amdgpu_cs void @memcpy_p1i8(ptr addrspace(1) %dst, ptr addrspace(1) %src) ; LOOP-NEXT: buffer_store_byte v30, v[6:7], s[0:3], 0 addr64 offset:30 ; LOOP-NEXT: buffer_store_byte v13, v[6:7], s[0:3], 0 addr64 offset:31 ; LOOP-NEXT: s_cbranch_vccnz .LBB0_1 -; LOOP-NEXT: ; %bb.2: ; %memcpy-split +; LOOP-NEXT: ; %bb.2: ; %static-memcpy-post-expansion ; LOOP-NEXT: s_mov_b32 s2, 0 ; LOOP-NEXT: s_mov_b32 s3, 0xf000 ; LOOP-NEXT: s_mov_b64 s[0:1], 0 diff --git a/llvm/test/CodeGen/AMDGPU/buffer-fat-pointers-memcpy.ll b/llvm/test/CodeGen/AMDGPU/buffer-fat-pointers-memcpy.ll index 931a62298812f..1e79c4f63cd42 100644 --- a/llvm/test/CodeGen/AMDGPU/buffer-fat-pointers-memcpy.ll +++ b/llvm/test/CodeGen/AMDGPU/buffer-fat-pointers-memcpy.ll @@ -255,7 +255,7 @@ define amdgpu_kernel void @memcpy_known(ptr addrspace(7) %src, ptr addrspace(7) ; SDAG-GFX942-NEXT: s_mov_b32 s17, s10 ; SDAG-GFX942-NEXT: s_mov_b32 s2, s9 ; SDAG-GFX942-NEXT: s_or_b64 s[12:13], s[2:3], s[16:17] -; SDAG-GFX942-NEXT: .LBB0_1: ; %load-store-loop +; SDAG-GFX942-NEXT: .LBB0_1: ; %static-memcpy-expansion-main-body ; SDAG-GFX942-NEXT: ; =>This Inner Loop Header: Depth=1 ; SDAG-GFX942-NEXT: s_add_i32 s1, s0, s16 ; SDAG-GFX942-NEXT: v_mov_b32_e32 v0, s1 @@ -312,7 +312,7 @@ define amdgpu_kernel void @memcpy_known(ptr addrspace(7) %src, ptr addrspace(7) ; SDAG-GFX942-NEXT: s_waitcnt vmcnt(15) ; SDAG-GFX942-NEXT: buffer_store_dwordx4 a[0:3], v0, s[12:15], 0 offen offset:240 ; SDAG-GFX942-NEXT: s_cbranch_scc1 .LBB0_1 -; SDAG-GFX942-NEXT: ; %bb.2: ; %memcpy-split +; SDAG-GFX942-NEXT: ; %bb.2: ; %static-memcpy-post-expansion ; SDAG-GFX942-NEXT: s_endpgm ; ; SDAG-GFX1100-LABEL: memcpy_known: @@ -341,7 +341,7 @@ define amdgpu_kernel void @memcpy_known(ptr addrspace(7) %src, ptr addrspace(7) ; SDAG-GFX1100-NEXT: s_mov_b32 s3, s16 ; SDAG-GFX1100-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; SDAG-GFX1100-NEXT: s_or_b64 s[12:13], s[2:3], s[16:17] -; SDAG-GFX1100-NEXT: .LBB0_1: ; %load-store-loop +; SDAG-GFX1100-NEXT: .LBB0_1: ; 
%static-memcpy-expansion-main-body ; SDAG-GFX1100-NEXT: ; =>This Inner Loop Header: Depth=1 ; SDAG-GFX1100-NEXT: s_add_i32 s1, s0, s16 ; SDAG-GFX1100-NEXT: s_delay_alu instid0(SALU_CYCLE_1) @@ -400,7 +400,7 @@ define amdgpu_kernel void @memcpy_known(ptr addrspace(7) %src, ptr addrspace(7) ; SDAG-GFX1100-NEXT: s_waitcnt vmcnt(0) ; SDAG-GFX1100-NEXT: buffer_store_b128 v[60:63], v64, s[12:15], 0 offen offset:240 ; SDAG-GFX1100-NEXT: s_cbranch_scc1 .LBB0_1 -; SDAG-GFX1100-NEXT: ; %bb.2: ; %memcpy-split +; SDAG-GFX1100-NEXT: ; %bb.2: ; %static-memcpy-post-expansion ; SDAG-GFX1100-NEXT: s_endpgm ; ; GISEL-GFX942-LABEL: memcpy_known: @@ -419,7 +419,7 @@ define amdgpu_kernel void @memcpy_known(ptr addrspace(7) %src, ptr addrspace(7) ; GISEL-GFX942-NEXT: s_mov_b32 s5, s14 ; GISEL-GFX942-NEXT: s_mov_b32 s6, s15 ; GISEL-GFX942-NEXT: v_mov_b32_e32 v1, s16 -; GISEL-GFX942-NEXT: .LBB0_1: ; %load-store-loop +; GISEL-GFX942-NEXT: .LBB0_1: ; %static-memcpy-expansion-main-body ; GISEL-GFX942-NEXT: ; =>This Inner Loop Header: Depth=1 ; GISEL-GFX942-NEXT: v_add_u32_e32 v62, s0, v1 ; GISEL-GFX942-NEXT: buffer_load_dwordx4 v[2:5], v62, s[8:11], 0 offen @@ -477,7 +477,7 @@ define amdgpu_kernel void @memcpy_known(ptr addrspace(7) %src, ptr addrspace(7) ; GISEL-GFX942-NEXT: s_waitcnt vmcnt(0) ; GISEL-GFX942-NEXT: buffer_store_dwordx4 v[2:5], v63, s[4:7], 0 offen offset:240 ; GISEL-GFX942-NEXT: s_cbranch_vccnz .LBB0_1 -; GISEL-GFX942-NEXT: ; %bb.2: ; %memcpy-split +; GISEL-GFX942-NEXT: ; %bb.2: ; %static-memcpy-post-expansion ; GISEL-GFX942-NEXT: s_endpgm ; ; GISEL-GFX1100-LABEL: memcpy_known: @@ -497,7 +497,7 @@ define amdgpu_kernel void @memcpy_known(ptr addrspace(7) %src, ptr addrspace(7) ; GISEL-GFX1100-NEXT: s_mov_b32 s12, s9 ; GISEL-GFX1100-NEXT: s_mov_b32 s13, s10 ; GISEL-GFX1100-NEXT: s_mov_b32 s14, s11 -; GISEL-GFX1100-NEXT: .LBB0_1: ; %load-store-loop +; GISEL-GFX1100-NEXT: .LBB0_1: ; %static-memcpy-expansion-main-body ; GISEL-GFX1100-NEXT: ; =>This Inner Loop Header: Depth=1 ; 
GISEL-GFX1100-NEXT: v_add_nc_u32_e32 v61, s0, v0 ; GISEL-GFX1100-NEXT: v_add_nc_u32_e32 v65, s8, v0 @@ -553,7 +553,7 @@ define amdgpu_kernel void @memcpy_known(ptr addrspace(7) %src, ptr addrspace(7) ; GISEL-GFX1100-NEXT: buffer_store_b128 v[61:64], v65, s[12:15], 0 offen offset:240 ; GISEL-GFX1100-NEXT: v_cmp_gt_u32_e32 vcc_lo, 0x2000, v0 ; GISEL-GFX1100-NEXT: s_cbranch_vccnz .LBB0_1 -; GISEL-GFX1100-NEXT: ; %bb.2: ; %memcpy-split +; GISEL-GFX1100-NEXT: ; %bb.2: ; %static-memcpy-post-expansion ; GISEL-GFX1100-NEXT: s_endpgm call void @llvm.memcpy.p7.p7.i32(ptr addrspace(7) noundef nonnull align 16 %dst, ptr addrspace(7) noundef nonnull align 16 %src, i32 8192, i1 false) ret void @@ -787,7 +787,7 @@ define amdgpu_kernel void @memcpy_known_medium(ptr addrspace(7) %src, ptr addrsp ; SDAG-GFX942-NEXT: s_mov_b32 s17, s10 ; SDAG-GFX942-NEXT: s_mov_b32 s2, s9 ; SDAG-GFX942-NEXT: s_or_b64 s[12:13], s[2:3], s[16:17] -; SDAG-GFX942-NEXT: .LBB1_1: ; %load-store-loop +; SDAG-GFX942-NEXT: .LBB1_1: ; %static-memcpy-expansion-main-body ; SDAG-GFX942-NEXT: ; =>This Inner Loop Header: Depth=1 ; SDAG-GFX942-NEXT: s_add_i32 s1, s0, s16 ; SDAG-GFX942-NEXT: v_mov_b32_e32 v0, s1 @@ -844,7 +844,7 @@ define amdgpu_kernel void @memcpy_known_medium(ptr addrspace(7) %src, ptr addrsp ; SDAG-GFX942-NEXT: s_waitcnt vmcnt(15) ; SDAG-GFX942-NEXT: buffer_store_dwordx4 a[0:3], v0, s[12:15], 0 offen offset:240 ; SDAG-GFX942-NEXT: s_cbranch_scc1 .LBB1_1 -; SDAG-GFX942-NEXT: ; %bb.2: ; %memcpy-split +; SDAG-GFX942-NEXT: ; %bb.2: ; %static-memcpy-post-expansion ; SDAG-GFX942-NEXT: s_endpgm ; ; SDAG-GFX1100-LABEL: memcpy_known_medium: @@ -873,7 +873,7 @@ define amdgpu_kernel void @memcpy_known_medium(ptr addrspace(7) %src, ptr addrsp ; SDAG-GFX1100-NEXT: s_mov_b32 s3, s16 ; SDAG-GFX1100-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; SDAG-GFX1100-NEXT: s_or_b64 s[12:13], s[2:3], s[16:17] -; SDAG-GFX1100-NEXT: .LBB1_1: ; %load-store-loop +; SDAG-GFX1100-NEXT: .LBB1_1: ; %static-memcpy-expansion-main-body ; 
SDAG-GFX1100-NEXT: ; =>This Inner Loop Header: Depth=1 ; SDAG-GFX1100-NEXT: s_add_i32 s1, s0, s16 ; SDAG-GFX1100-NEXT: s_delay_alu instid0(SALU_CYCLE_1) @@ -932,7 +932,7 @@ define amdgpu_kernel void @memcpy_known_medium(ptr addrspace(7) %src, ptr addrsp ; SDAG-GFX1100-NEXT: s_waitcnt vmcnt(0) ; SDAG-GFX1100-NEXT: buffer_store_b128 v[60:63], v64, s[12:15], 0 offen offset:240 ; SDAG-GFX1100-NEXT: s_cbranch_scc1 .LBB1_1 -; SDAG-GFX1100-NEXT: ; %bb.2: ; %memcpy-split +; SDAG-GFX1100-NEXT: ; %bb.2: ; %static-memcpy-post-expansion ; SDAG-GFX1100-NEXT: s_endpgm ; ; GISEL-GFX942-LABEL: memcpy_known_medium: @@ -951,7 +951,7 @@ define amdgpu_kernel void @memcpy_known_medium(ptr addrspace(7) %src, ptr addrsp ; GISEL-GFX942-NEXT: s_mov_b32 s5, s14 ; GISEL-GFX942-NEXT: s_mov_b32 s6, s15 ; GISEL-GFX942-NEXT: v_mov_b32_e32 v1, s16 -; GISEL-GFX942-NEXT: .LBB1_1: ; %load-store-loop +; GISEL-GFX942-NEXT: .LBB1_1: ; %static-memcpy-expansion-main-body ; GISEL-GFX942-NEXT: ; =>This Inner Loop Header: Depth=1 ; GISEL-GFX942-NEXT: v_add_u32_e32 v62, s0, v1 ; GISEL-GFX942-NEXT: buffer_load_dwordx4 v[2:5], v62, s[8:11], 0 offen @@ -1009,7 +1009,7 @@ define amdgpu_kernel void @memcpy_known_medium(ptr addrspace(7) %src, ptr addrsp ; GISEL-GFX942-NEXT: s_waitcnt vmcnt(0) ; GISEL-GFX942-NEXT: buffer_store_dwordx4 v[2:5], v63, s[4:7], 0 offen offset:240 ; GISEL-GFX942-NEXT: s_cbranch_vccnz .LBB1_1 -; GISEL-GFX942-NEXT: ; %bb.2: ; %memcpy-split +; GISEL-GFX942-NEXT: ; %bb.2: ; %static-memcpy-post-expansion ; GISEL-GFX942-NEXT: s_endpgm ; ; GISEL-GFX1100-LABEL: memcpy_known_medium: @@ -1029,7 +1029,7 @@ define amdgpu_kernel void @memcpy_known_medium(ptr addrspace(7) %src, ptr addrsp ; GISEL-GFX1100-NEXT: s_mov_b32 s12, s9 ; GISEL-GFX1100-NEXT: s_mov_b32 s13, s10 ; GISEL-GFX1100-NEXT: s_mov_b32 s14, s11 -; GISEL-GFX1100-NEXT: .LBB1_1: ; %load-store-loop +; GISEL-GFX1100-NEXT: .LBB1_1: ; %static-memcpy-expansion-main-body ; GISEL-GFX1100-NEXT: ; =>This Inner Loop Header: Depth=1 ; 
GISEL-GFX1100-NEXT: v_add_nc_u32_e32 v61, s0, v0 ; GISEL-GFX1100-NEXT: v_add_nc_u32_e32 v65, s8, v0 @@ -1085,7 +1085,7 @@ define amdgpu_kernel void @memcpy_known_medium(ptr addrspace(7) %src, ptr addrsp ; GISEL-GFX1100-NEXT: buffer_store_b128 v[61:64], v65, s[12:15], 0 offen offset:240 ; GISEL-GFX1100-NEXT: v_cmp_gt_u32_e32 vcc_lo, 0x100, v0 ; GISEL-GFX1100-NEXT: s_cbranch_vccnz .LBB1_1 -; GISEL-GFX1100-NEXT: ; %bb.2: ; %memcpy-split +; GISEL-GFX1100-NEXT: ; %bb.2: ; %static-memcpy-post-expansion ; GISEL-GFX1100-NEXT: s_endpgm call void @llvm.memcpy.p7.p7.i32(ptr addrspace(7) noundef nonnull align 16 %dst, ptr addrspace(7) noundef nonnull align 16 %src, i32 256, i1 false) ret void diff --git a/llvm/test/CodeGen/AMDGPU/lower-buffer-fat-pointers-mem-transfer.ll b/llvm/test/CodeGen/AMDGPU/lower-buffer-fat-pointers-mem-transfer.ll index ad0b4fd8d902e..83d6f4f5882b4 100644 --- a/llvm/test/CodeGen/AMDGPU/lower-buffer-fat-pointers-mem-transfer.ll +++ b/llvm/test/CodeGen/AMDGPU/lower-buffer-fat-pointers-mem-transfer.ll @@ -19,9 +19,9 @@ define void @memcpy_known(ptr addrspace(7) inreg %src, ptr addrspace(7) inreg %d ; CHECK-NEXT: [[DST_OFF:%.*]] = extractvalue { ptr addrspace(8), i32 } [[DST]], 1 ; CHECK-NEXT: [[SRC_RSRC:%.*]] = extractvalue { ptr addrspace(8), i32 } [[SRC]], 0 ; CHECK-NEXT: [[SRC_OFF:%.*]] = extractvalue { ptr addrspace(8), i32 } [[SRC]], 1 -; CHECK-NEXT: br label %[[LOAD_STORE_LOOP:.*]] -; CHECK: [[LOAD_STORE_LOOP]]: -; CHECK-NEXT: [[LOOP_INDEX:%.*]] = phi i32 [ 0, [[TMP0:%.*]] ], [ [[TMP4:%.*]], %[[LOAD_STORE_LOOP]] ] +; CHECK-NEXT: br label %[[STATIC_MEMCPY_LOOP_EXPANSION_MAIN_BODY:.*]] +; CHECK: [[STATIC_MEMCPY_LOOP_EXPANSION_MAIN_BODY]]: +; CHECK-NEXT: [[LOOP_INDEX:%.*]] = phi i32 [ 0, [[TMP0:%.*]] ], [ [[TMP4:%.*]], %[[STATIC_MEMCPY_LOOP_EXPANSION_MAIN_BODY]] ] ; CHECK-NEXT: [[TMP1:%.*]] = add i32 [[SRC_OFF]], [[LOOP_INDEX]] ; CHECK-NEXT: [[DOTOFF_0:%.*]] = call <4 x i32> @llvm.amdgcn.raw.ptr.buffer.load.v4i32(ptr addrspace(8) align 16 [[SRC_RSRC]], 
i32 [[TMP1]], i32 0, i32 0) ; CHECK-NEXT: [[DOTEXT_0:%.*]] = shufflevector <4 x i32> [[DOTOFF_0]], <4 x i32> poison, <64 x i32> @@ -135,8 +135,8 @@ define void @memcpy_known(ptr addrspace(7) inreg %src, ptr addrspace(7) inreg %d ; CHECK-NEXT: call void @llvm.amdgcn.raw.ptr.buffer.store.v4i32(<4 x i32> [[DOTOFF_240]], ptr addrspace(8) align 16 [[DST_RSRC]], i32 [[DOTPART_60]], i32 0, i32 0) ; CHECK-NEXT: [[TMP4]] = add i32 [[LOOP_INDEX]], 256 ; CHECK-NEXT: [[TMP5:%.*]] = icmp ult i32 [[TMP4]], 8192 -; CHECK-NEXT: br i1 [[TMP5]], label %[[LOAD_STORE_LOOP]], label %[[MEMCPY_SPLIT:.*]] -; CHECK: [[MEMCPY_SPLIT]]: +; CHECK-NEXT: br i1 [[TMP5]], label %[[STATIC_MEMCPY_LOOP_EXPANSION_MAIN_BODY]], label %[[STATIC_MEMCPY_POST_LOOP_EXPANSION:.*]] +; CHECK: [[STATIC_MEMCPY_POST_LOOP_EXPANSION]]: ; CHECK-NEXT: ret void ; call void @llvm.memcpy.p7.p7.i32(ptr addrspace(7) noundef nonnull align 16 %dst, ptr addrspace(7) noundef nonnull align 16 %src, i32 8192, i1 false) @@ -211,9 +211,9 @@ define void @memcpy_known_i64(ptr addrspace(7) inreg %src, ptr addrspace(7) inre ; CHECK-NEXT: [[DST_OFF:%.*]] = extractvalue { ptr addrspace(8), i32 } [[DST]], 1 ; CHECK-NEXT: [[SRC_RSRC:%.*]] = extractvalue { ptr addrspace(8), i32 } [[SRC]], 0 ; CHECK-NEXT: [[SRC_OFF:%.*]] = extractvalue { ptr addrspace(8), i32 } [[SRC]], 1 -; CHECK-NEXT: br label %[[LOAD_STORE_LOOP:.*]] -; CHECK: [[LOAD_STORE_LOOP]]: -; CHECK-NEXT: [[LOOP_INDEX:%.*]] = phi i64 [ 0, [[TMP0:%.*]] ], [ [[TMP4:%.*]], %[[LOAD_STORE_LOOP]] ] +; CHECK-NEXT: br label %[[STATIC_MEMCPY_LOOP_EXPANSION_MAIN_BODY:.*]] +; CHECK: [[STATIC_MEMCPY_LOOP_EXPANSION_MAIN_BODY]]: +; CHECK-NEXT: [[LOOP_INDEX:%.*]] = phi i64 [ 0, [[TMP0:%.*]] ], [ [[TMP4:%.*]], %[[STATIC_MEMCPY_LOOP_EXPANSION_MAIN_BODY]] ] ; CHECK-NEXT: [[LOOP_INDEX_C:%.*]] = trunc i64 [[LOOP_INDEX]] to i32 ; CHECK-NEXT: [[TMP1:%.*]] = add i32 [[SRC_OFF]], [[LOOP_INDEX_C]] ; CHECK-NEXT: [[DOTOFF_0:%.*]] = call <4 x i32> @llvm.amdgcn.raw.ptr.buffer.load.v4i32(ptr addrspace(8) align 
1 [[SRC_RSRC]], i32 [[TMP1]], i32 0, i32 0) @@ -329,8 +329,8 @@ define void @memcpy_known_i64(ptr addrspace(7) inreg %src, ptr addrspace(7) inre ; CHECK-NEXT: call void @llvm.amdgcn.raw.ptr.buffer.store.v4i32(<4 x i32> [[DOTOFF_240]], ptr addrspace(8) align 1 [[DST_RSRC]], i32 [[DOTPART_60]], i32 0, i32 0) ; CHECK-NEXT: [[TMP4]] = add i64 [[LOOP_INDEX]], 256 ; CHECK-NEXT: [[TMP5:%.*]] = icmp ult i64 [[TMP4]], 8192 -; CHECK-NEXT: br i1 [[TMP5]], label %[[LOAD_STORE_LOOP]], label %[[MEMCPY_SPLIT:.*]] -; CHECK: [[MEMCPY_SPLIT]]: +; CHECK-NEXT: br i1 [[TMP5]], label %[[STATIC_MEMCPY_LOOP_EXPANSION_MAIN_BODY]], label %[[STATIC_MEMCPY_POST_LOOP_EXPANSION:.*]] +; CHECK: [[STATIC_MEMCPY_POST_LOOP_EXPANSION]]: ; CHECK-NEXT: ret void ; call void @llvm.memcpy.p7.p7.i64(ptr addrspace(7) %dst, ptr addrspace(7) %src, i64 8192, i1 false) @@ -366,18 +366,21 @@ define void @memcpy_unknown(ptr addrspace(7) inreg %src, ptr addrspace(7) inreg ; CHECK-NEXT: [[TMP1:%.*]] = and i32 [[LENGTH]], 15 ; CHECK-NEXT: [[TMP2:%.*]] = sub i32 [[LENGTH]], [[TMP1]] ; CHECK-NEXT: [[TMP3:%.*]] = icmp ne i32 [[TMP2]], 0 -; CHECK-NEXT: br i1 [[TMP3]], label %[[LOOP_MEMCPY_EXPANSION:.*]], label %[[LOOP_MEMCPY_RESIDUAL_HEADER:.*]] -; CHECK: [[LOOP_MEMCPY_EXPANSION]]: -; CHECK-NEXT: [[LOOP_INDEX:%.*]] = phi i32 [ 0, [[TMP0:%.*]] ], [ [[TMP7:%.*]], %[[LOOP_MEMCPY_EXPANSION]] ] +; CHECK-NEXT: br i1 [[TMP3]], label %[[DYNAMIC_MEMCPY_LOOP_EXPANSION_MAIN_BODY:.*]], label %[[DYNAMIC_MEMCPY_LOOP_EXPANSION_RESIDUAL_COND:.*]] +; CHECK: [[DYNAMIC_MEMCPY_LOOP_EXPANSION_MAIN_BODY]]: +; CHECK-NEXT: [[LOOP_INDEX:%.*]] = phi i32 [ 0, [[TMP0:%.*]] ], [ [[TMP7:%.*]], %[[DYNAMIC_MEMCPY_LOOP_EXPANSION_MAIN_BODY]] ] ; CHECK-NEXT: [[TMP4:%.*]] = add i32 [[SRC_OFF]], [[LOOP_INDEX]] ; CHECK-NEXT: [[TMP5:%.*]] = call <4 x i32> @llvm.amdgcn.raw.ptr.buffer.load.v4i32(ptr addrspace(8) align 1 [[SRC_RSRC]], i32 [[TMP4]], i32 0, i32 0) ; CHECK-NEXT: [[TMP6:%.*]] = add i32 [[DST_OFF]], [[LOOP_INDEX]] ; CHECK-NEXT: call void 
@llvm.amdgcn.raw.ptr.buffer.store.v4i32(<4 x i32> [[TMP5]], ptr addrspace(8) align 1 [[DST_RSRC]], i32 [[TMP6]], i32 0, i32 0) ; CHECK-NEXT: [[TMP7]] = add i32 [[LOOP_INDEX]], 16 ; CHECK-NEXT: [[TMP8:%.*]] = icmp ult i32 [[TMP7]], [[TMP2]] -; CHECK-NEXT: br i1 [[TMP8]], label %[[LOOP_MEMCPY_EXPANSION]], label %[[LOOP_MEMCPY_RESIDUAL_HEADER]] -; CHECK: [[LOOP_MEMCPY_RESIDUAL:.*]]: -; CHECK-NEXT: [[RESIDUAL_LOOP_INDEX:%.*]] = phi i32 [ 0, %[[LOOP_MEMCPY_RESIDUAL_HEADER]] ], [ [[TMP13:%.*]], %[[LOOP_MEMCPY_RESIDUAL]] ] +; CHECK-NEXT: br i1 [[TMP8]], label %[[DYNAMIC_MEMCPY_LOOP_EXPANSION_MAIN_BODY]], label %[[DYNAMIC_MEMCPY_LOOP_EXPANSION_RESIDUAL_COND]] +; CHECK: [[DYNAMIC_MEMCPY_LOOP_EXPANSION_RESIDUAL_COND]]: +; CHECK-NEXT: [[TMP15:%.*]] = icmp ne i32 [[TMP1]], 0 +; CHECK-NEXT: br i1 [[TMP15]], label %[[DYNAMIC_MEMCPY_LOOP_EXPANSION_RESIDUAL_BODY:.*]], label %[[DYNAMIC_MEMCPY_POST_LOOP_EXPANSION:.*]] +; CHECK: [[DYNAMIC_MEMCPY_LOOP_EXPANSION_RESIDUAL_BODY]]: +; CHECK-NEXT: [[RESIDUAL_LOOP_INDEX:%.*]] = phi i32 [ 0, %[[DYNAMIC_MEMCPY_LOOP_EXPANSION_RESIDUAL_COND]] ], [ [[TMP13:%.*]], %[[DYNAMIC_MEMCPY_LOOP_EXPANSION_RESIDUAL_BODY]] ] ; CHECK-NEXT: [[TMP9:%.*]] = add i32 [[TMP2]], [[RESIDUAL_LOOP_INDEX]] ; CHECK-NEXT: [[TMP10:%.*]] = add i32 [[SRC_OFF]], [[TMP9]] ; CHECK-NEXT: [[TMP11:%.*]] = call i8 @llvm.amdgcn.raw.ptr.buffer.load.i8(ptr addrspace(8) align 1 [[SRC_RSRC]], i32 [[TMP10]], i32 0, i32 0) @@ -385,12 +388,9 @@ define void @memcpy_unknown(ptr addrspace(7) inreg %src, ptr addrspace(7) inreg ; CHECK-NEXT: call void @llvm.amdgcn.raw.ptr.buffer.store.i8(i8 [[TMP11]], ptr addrspace(8) align 1 [[DST_RSRC]], i32 [[TMP12]], i32 0, i32 0) ; CHECK-NEXT: [[TMP13]] = add i32 [[RESIDUAL_LOOP_INDEX]], 1 ; CHECK-NEXT: [[TMP14:%.*]] = icmp ult i32 [[TMP13]], [[TMP1]] -; CHECK-NEXT: br i1 [[TMP14]], label %[[LOOP_MEMCPY_RESIDUAL]], label %[[POST_LOOP_MEMCPY_EXPANSION:.*]] -; CHECK: [[POST_LOOP_MEMCPY_EXPANSION]]: +; CHECK-NEXT: br i1 [[TMP14]], label 
%[[DYNAMIC_MEMCPY_LOOP_EXPANSION_RESIDUAL_BODY]], label %[[DYNAMIC_MEMCPY_POST_LOOP_EXPANSION]] +; CHECK: [[DYNAMIC_MEMCPY_POST_LOOP_EXPANSION]]: ; CHECK-NEXT: ret void -; CHECK: [[LOOP_MEMCPY_RESIDUAL_HEADER]]: -; CHECK-NEXT: [[TMP15:%.*]] = icmp ne i32 [[TMP1]], 0 -; CHECK-NEXT: br i1 [[TMP15]], label %[[LOOP_MEMCPY_RESIDUAL]], label %[[POST_LOOP_MEMCPY_EXPANSION]] ; call void @llvm.memcpy.p7.p7.i32(ptr addrspace(7) %dst, ptr addrspace(7) %src, i32 %length, i1 false) ret void @@ -401,9 +401,9 @@ define void @memcpy_known_p1_to_p7(ptr addrspace(1) inreg %src, ptr addrspace(7) ; CHECK-SAME: ptr addrspace(1) inreg [[SRC:%.*]], { ptr addrspace(8), i32 } inreg [[DST:%.*]]) #[[ATTR0]] { ; CHECK-NEXT: [[DST_RSRC:%.*]] = extractvalue { ptr addrspace(8), i32 } [[DST]], 0 ; CHECK-NEXT: [[DST_OFF:%.*]] = extractvalue { ptr addrspace(8), i32 } [[DST]], 1 -; CHECK-NEXT: br label %[[LOAD_STORE_LOOP:.*]] -; CHECK: [[LOAD_STORE_LOOP]]: -; CHECK-NEXT: [[LOOP_INDEX:%.*]] = phi i32 [ 0, [[TMP0:%.*]] ], [ [[TMP4:%.*]], %[[LOAD_STORE_LOOP]] ] +; CHECK-NEXT: br label %[[STATIC_MEMCPY_LOOP_EXPANSION_MAIN_BODY:.*]] +; CHECK: [[STATIC_MEMCPY_LOOP_EXPANSION_MAIN_BODY]]: +; CHECK-NEXT: [[LOOP_INDEX:%.*]] = phi i32 [ 0, [[TMP0:%.*]] ], [ [[TMP4:%.*]], %[[STATIC_MEMCPY_LOOP_EXPANSION_MAIN_BODY]] ] ; CHECK-NEXT: [[TMP1:%.*]] = getelementptr inbounds i8, ptr addrspace(1) [[SRC]], i32 [[LOOP_INDEX]] ; CHECK-NEXT: [[TMP2:%.*]] = load <64 x i32>, ptr addrspace(1) [[TMP1]], align 16 ; CHECK-NEXT: [[TMP3:%.*]] = add i32 [[DST_OFF]], [[LOOP_INDEX]] @@ -456,8 +456,8 @@ define void @memcpy_known_p1_to_p7(ptr addrspace(1) inreg %src, ptr addrspace(7) ; CHECK-NEXT: call void @llvm.amdgcn.raw.ptr.buffer.store.v4i32(<4 x i32> [[DOTSLICE_60]], ptr addrspace(8) align 16 [[DST_RSRC]], i32 [[DOTPART_60]], i32 0, i32 0) ; CHECK-NEXT: [[TMP4]] = add i32 [[LOOP_INDEX]], 256 ; CHECK-NEXT: [[TMP5:%.*]] = icmp ult i32 [[TMP4]], 8192 -; CHECK-NEXT: br i1 [[TMP5]], label %[[LOAD_STORE_LOOP]], label 
%[[MEMCPY_SPLIT:.*]] -; CHECK: [[MEMCPY_SPLIT]]: +; CHECK-NEXT: br i1 [[TMP5]], label %[[STATIC_MEMCPY_LOOP_EXPANSION_MAIN_BODY]], label %[[STATIC_MEMCPY_POST_LOOP_EXPANSION:.*]] +; CHECK: [[STATIC_MEMCPY_POST_LOOP_EXPANSION]]: ; CHECK-NEXT: ret void ; call void @llvm.memcpy.p7.p1.i32(ptr addrspace(7) noundef nonnull align 16 %dst, ptr addrspace(1) noundef nonnull align 16 %src, i32 8192, i1 false) @@ -469,9 +469,9 @@ define void @memcpy_known_p7_to_p1(ptr addrspace(7) inreg %src, ptr addrspace(1) ; CHECK-SAME: { ptr addrspace(8), i32 } inreg [[SRC:%.*]], ptr addrspace(1) inreg [[DST:%.*]]) #[[ATTR0]] { ; CHECK-NEXT: [[SRC_RSRC:%.*]] = extractvalue { ptr addrspace(8), i32 } [[SRC]], 0 ; CHECK-NEXT: [[SRC_OFF:%.*]] = extractvalue { ptr addrspace(8), i32 } [[SRC]], 1 -; CHECK-NEXT: br label %[[LOAD_STORE_LOOP:.*]] -; CHECK: [[LOAD_STORE_LOOP]]: -; CHECK-NEXT: [[LOOP_INDEX:%.*]] = phi i32 [ 0, [[TMP0:%.*]] ], [ [[TMP4:%.*]], %[[LOAD_STORE_LOOP]] ] +; CHECK-NEXT: br label %[[STATIC_MEMCPY_LOOP_EXPANSION_MAIN_BODY:.*]] +; CHECK: [[STATIC_MEMCPY_LOOP_EXPANSION_MAIN_BODY]]: +; CHECK-NEXT: [[LOOP_INDEX:%.*]] = phi i32 [ 0, [[TMP0:%.*]] ], [ [[TMP4:%.*]], %[[STATIC_MEMCPY_LOOP_EXPANSION_MAIN_BODY]] ] ; CHECK-NEXT: [[TMP1:%.*]] = add i32 [[SRC_OFF]], [[LOOP_INDEX]] ; CHECK-NEXT: [[DOTOFF_0:%.*]] = call <4 x i32> @llvm.amdgcn.raw.ptr.buffer.load.v4i32(ptr addrspace(8) align 16 [[SRC_RSRC]], i32 [[TMP1]], i32 0, i32 0) ; CHECK-NEXT: [[DOTEXT_0:%.*]] = shufflevector <4 x i32> [[DOTOFF_0]], <4 x i32> poison, <64 x i32> @@ -540,8 +540,8 @@ define void @memcpy_known_p7_to_p1(ptr addrspace(7) inreg %src, ptr addrspace(1) ; CHECK-NEXT: store <64 x i32> [[TMP2]], ptr addrspace(1) [[TMP3]], align 16 ; CHECK-NEXT: [[TMP4]] = add i32 [[LOOP_INDEX]], 256 ; CHECK-NEXT: [[TMP5:%.*]] = icmp ult i32 [[TMP4]], 8192 -; CHECK-NEXT: br i1 [[TMP5]], label %[[LOAD_STORE_LOOP]], label %[[MEMCPY_SPLIT:.*]] -; CHECK: [[MEMCPY_SPLIT]]: +; CHECK-NEXT: br i1 [[TMP5]], label 
%[[STATIC_MEMCPY_LOOP_EXPANSION_MAIN_BODY]], label %[[STATIC_MEMCPY_POST_LOOP_EXPANSION:.*]] +; CHECK: [[STATIC_MEMCPY_POST_LOOP_EXPANSION]]: ; CHECK-NEXT: ret void ; call void @llvm.memcpy.p1.p7.i32(ptr addrspace(1) noundef nonnull align 16 %dst, ptr addrspace(7) noundef nonnull align 16 %src, i32 8192, i1 false) @@ -582,9 +582,9 @@ define void @memcpy_known_p7_to_p3_long(ptr addrspace(7) inreg %src, ptr addrspa ; CHECK-SAME: { ptr addrspace(8), i32 } inreg [[SRC:%.*]], ptr addrspace(3) inreg [[DST:%.*]]) #[[ATTR0]] { ; CHECK-NEXT: [[SRC_RSRC:%.*]] = extractvalue { ptr addrspace(8), i32 } [[SRC]], 0 ; CHECK-NEXT: [[SRC_OFF:%.*]] = extractvalue { ptr addrspace(8), i32 } [[SRC]], 1 -; CHECK-NEXT: br label %[[LOAD_STORE_LOOP:.*]] -; CHECK: [[LOAD_STORE_LOOP]]: -; CHECK-NEXT: [[LOOP_INDEX:%.*]] = phi i32 [ 0, [[TMP0:%.*]] ], [ [[TMP4:%.*]], %[[LOAD_STORE_LOOP]] ] +; CHECK-NEXT: br label %[[STATIC_MEMCPY_LOOP_EXPANSION_MAIN_BODY:.*]] +; CHECK: [[STATIC_MEMCPY_LOOP_EXPANSION_MAIN_BODY]]: +; CHECK-NEXT: [[LOOP_INDEX:%.*]] = phi i32 [ 0, [[TMP0:%.*]] ], [ [[TMP4:%.*]], %[[STATIC_MEMCPY_LOOP_EXPANSION_MAIN_BODY]] ] ; CHECK-NEXT: [[TMP1:%.*]] = add i32 [[SRC_OFF]], [[LOOP_INDEX]] ; CHECK-NEXT: [[DOTOFF_0:%.*]] = call <4 x i32> @llvm.amdgcn.raw.ptr.buffer.load.v4i32(ptr addrspace(8) align 16 [[SRC_RSRC]], i32 [[TMP1]], i32 0, i32 0) ; CHECK-NEXT: [[DOTEXT_0:%.*]] = shufflevector <4 x i32> [[DOTOFF_0]], <4 x i32> poison, <64 x i32> @@ -653,8 +653,8 @@ define void @memcpy_known_p7_to_p3_long(ptr addrspace(7) inreg %src, ptr addrspa ; CHECK-NEXT: store <64 x i32> [[TMP2]], ptr addrspace(3) [[TMP3]], align 16 ; CHECK-NEXT: [[TMP4]] = add i32 [[LOOP_INDEX]], 256 ; CHECK-NEXT: [[TMP5:%.*]] = icmp ult i32 [[TMP4]], 8192 -; CHECK-NEXT: br i1 [[TMP5]], label %[[LOAD_STORE_LOOP]], label %[[MEMCPY_SPLIT:.*]] -; CHECK: [[MEMCPY_SPLIT]]: +; CHECK-NEXT: br i1 [[TMP5]], label %[[STATIC_MEMCPY_LOOP_EXPANSION_MAIN_BODY]], label %[[STATIC_MEMCPY_POST_LOOP_EXPANSION:.*]] +; CHECK: 
[[STATIC_MEMCPY_POST_LOOP_EXPANSION]]: ; CHECK-NEXT: ret void ; call void @llvm.memcpy.p3.p7.i32(ptr addrspace(3) noundef nonnull align 16 %dst, ptr addrspace(7) noundef nonnull align 16 %src, i32 8192, i1 false) @@ -676,9 +676,9 @@ define void @memcpy.inline_known(ptr addrspace(7) inreg %src, ptr addrspace(7) i ; CHECK-NEXT: [[DST_OFF:%.*]] = extractvalue { ptr addrspace(8), i32 } [[DST]], 1 ; CHECK-NEXT: [[SRC_RSRC:%.*]] = extractvalue { ptr addrspace(8), i32 } [[SRC]], 0 ; CHECK-NEXT: [[SRC_OFF:%.*]] = extractvalue { ptr addrspace(8), i32 } [[SRC]], 1 -; CHECK-NEXT: br label %[[LOAD_STORE_LOOP:.*]] -; CHECK: [[LOAD_STORE_LOOP]]: -; CHECK-NEXT: [[LOOP_INDEX:%.*]] = phi i32 [ 0, [[TMP0:%.*]] ], [ [[TMP4:%.*]], %[[LOAD_STORE_LOOP]] ] +; CHECK-NEXT: br label %[[STATIC_MEMCPY_LOOP_EXPANSION_MAIN_BODY:.*]] +; CHECK: [[STATIC_MEMCPY_LOOP_EXPANSION_MAIN_BODY]]: +; CHECK-NEXT: [[LOOP_INDEX:%.*]] = phi i32 [ 0, [[TMP0:%.*]] ], [ [[TMP4:%.*]], %[[STATIC_MEMCPY_LOOP_EXPANSION_MAIN_BODY]] ] ; CHECK-NEXT: [[TMP1:%.*]] = add i32 [[SRC_OFF]], [[LOOP_INDEX]] ; CHECK-NEXT: [[DOTOFF_0:%.*]] = call <4 x i32> @llvm.amdgcn.raw.ptr.buffer.load.v4i32(ptr addrspace(8) align 16 [[SRC_RSRC]], i32 [[TMP1]], i32 0, i32 0) ; CHECK-NEXT: [[DOTEXT_0:%.*]] = shufflevector <4 x i32> [[DOTOFF_0]], <4 x i32> poison, <64 x i32> @@ -792,8 +792,8 @@ define void @memcpy.inline_known(ptr addrspace(7) inreg %src, ptr addrspace(7) i ; CHECK-NEXT: call void @llvm.amdgcn.raw.ptr.buffer.store.v4i32(<4 x i32> [[DOTOFF_240]], ptr addrspace(8) align 16 [[DST_RSRC]], i32 [[DOTPART_60]], i32 0, i32 0) ; CHECK-NEXT: [[TMP4]] = add i32 [[LOOP_INDEX]], 256 ; CHECK-NEXT: [[TMP5:%.*]] = icmp ult i32 [[TMP4]], 8192 -; CHECK-NEXT: br i1 [[TMP5]], label %[[LOAD_STORE_LOOP]], label %[[MEMCPY_SPLIT:.*]] -; CHECK: [[MEMCPY_SPLIT]]: +; CHECK-NEXT: br i1 [[TMP5]], label %[[STATIC_MEMCPY_LOOP_EXPANSION_MAIN_BODY]], label %[[STATIC_MEMCPY_POST_LOOP_EXPANSION:.*]] +; CHECK: [[STATIC_MEMCPY_POST_LOOP_EXPANSION]]: ; CHECK-NEXT: 
ret void ; call void @llvm.memcpy.inline.p7.p7.i32(ptr addrspace(7) noundef nonnull align 16 %dst, ptr addrspace(7) noundef nonnull align 16 %src, i32 8192, i1 false) @@ -868,9 +868,9 @@ define void @memcpy.inline_known_i64(ptr addrspace(7) inreg %src, ptr addrspace( ; CHECK-NEXT: [[DST_OFF:%.*]] = extractvalue { ptr addrspace(8), i32 } [[DST]], 1 ; CHECK-NEXT: [[SRC_RSRC:%.*]] = extractvalue { ptr addrspace(8), i32 } [[SRC]], 0 ; CHECK-NEXT: [[SRC_OFF:%.*]] = extractvalue { ptr addrspace(8), i32 } [[SRC]], 1 -; CHECK-NEXT: br label %[[LOAD_STORE_LOOP:.*]] -; CHECK: [[LOAD_STORE_LOOP]]: -; CHECK-NEXT: [[LOOP_INDEX:%.*]] = phi i64 [ 0, [[TMP0:%.*]] ], [ [[TMP4:%.*]], %[[LOAD_STORE_LOOP]] ] +; CHECK-NEXT: br label %[[STATIC_MEMCPY_LOOP_EXPANSION_MAIN_BODY:.*]] +; CHECK: [[STATIC_MEMCPY_LOOP_EXPANSION_MAIN_BODY]]: +; CHECK-NEXT: [[LOOP_INDEX:%.*]] = phi i64 [ 0, [[TMP0:%.*]] ], [ [[TMP4:%.*]], %[[STATIC_MEMCPY_LOOP_EXPANSION_MAIN_BODY]] ] ; CHECK-NEXT: [[LOOP_INDEX_C:%.*]] = trunc i64 [[LOOP_INDEX]] to i32 ; CHECK-NEXT: [[TMP1:%.*]] = add i32 [[SRC_OFF]], [[LOOP_INDEX_C]] ; CHECK-NEXT: [[DOTOFF_0:%.*]] = call <4 x i32> @llvm.amdgcn.raw.ptr.buffer.load.v4i32(ptr addrspace(8) align 1 [[SRC_RSRC]], i32 [[TMP1]], i32 0, i32 0) @@ -986,8 +986,8 @@ define void @memcpy.inline_known_i64(ptr addrspace(7) inreg %src, ptr addrspace( ; CHECK-NEXT: call void @llvm.amdgcn.raw.ptr.buffer.store.v4i32(<4 x i32> [[DOTOFF_240]], ptr addrspace(8) align 1 [[DST_RSRC]], i32 [[DOTPART_60]], i32 0, i32 0) ; CHECK-NEXT: [[TMP4]] = add i64 [[LOOP_INDEX]], 256 ; CHECK-NEXT: [[TMP5:%.*]] = icmp ult i64 [[TMP4]], 8192 -; CHECK-NEXT: br i1 [[TMP5]], label %[[LOAD_STORE_LOOP]], label %[[MEMCPY_SPLIT:.*]] -; CHECK: [[MEMCPY_SPLIT]]: +; CHECK-NEXT: br i1 [[TMP5]], label %[[STATIC_MEMCPY_LOOP_EXPANSION_MAIN_BODY]], label %[[STATIC_MEMCPY_POST_LOOP_EXPANSION:.*]] +; CHECK: [[STATIC_MEMCPY_POST_LOOP_EXPANSION]]: ; CHECK-NEXT: ret void ; call void @llvm.memcpy.inline.p7.p7.i64(ptr addrspace(7) %dst, ptr 
addrspace(7) %src, i64 8192, i1 false) @@ -1023,18 +1023,21 @@ define void @memcpy.inline_unknown(ptr addrspace(7) inreg %src, ptr addrspace(7) ; CHECK-NEXT: [[TMP1:%.*]] = and i32 [[LENGTH]], 15 ; CHECK-NEXT: [[TMP2:%.*]] = sub i32 [[LENGTH]], [[TMP1]] ; CHECK-NEXT: [[TMP3:%.*]] = icmp ne i32 [[TMP2]], 0 -; CHECK-NEXT: br i1 [[TMP3]], label %[[LOOP_MEMCPY_EXPANSION:.*]], label %[[LOOP_MEMCPY_RESIDUAL_HEADER:.*]] -; CHECK: [[LOOP_MEMCPY_EXPANSION]]: -; CHECK-NEXT: [[LOOP_INDEX:%.*]] = phi i32 [ 0, [[TMP0:%.*]] ], [ [[TMP7:%.*]], %[[LOOP_MEMCPY_EXPANSION]] ] +; CHECK-NEXT: br i1 [[TMP3]], label %[[DYNAMIC_MEMCPY_LOOP_EXPANSION_MAIN_BODY:.*]], label %[[DYNAMIC_MEMCPY_LOOP_EXPANSION_RESIDUAL_COND:.*]] +; CHECK: [[DYNAMIC_MEMCPY_LOOP_EXPANSION_MAIN_BODY]]: +; CHECK-NEXT: [[LOOP_INDEX:%.*]] = phi i32 [ 0, [[TMP0:%.*]] ], [ [[TMP7:%.*]], %[[DYNAMIC_MEMCPY_LOOP_EXPANSION_MAIN_BODY]] ] ; CHECK-NEXT: [[TMP4:%.*]] = add i32 [[SRC_OFF]], [[LOOP_INDEX]] ; CHECK-NEXT: [[TMP5:%.*]] = call <4 x i32> @llvm.amdgcn.raw.ptr.buffer.load.v4i32(ptr addrspace(8) align 1 [[SRC_RSRC]], i32 [[TMP4]], i32 0, i32 0) ; CHECK-NEXT: [[TMP6:%.*]] = add i32 [[DST_OFF]], [[LOOP_INDEX]] ; CHECK-NEXT: call void @llvm.amdgcn.raw.ptr.buffer.store.v4i32(<4 x i32> [[TMP5]], ptr addrspace(8) align 1 [[DST_RSRC]], i32 [[TMP6]], i32 0, i32 0) ; CHECK-NEXT: [[TMP7]] = add i32 [[LOOP_INDEX]], 16 ; CHECK-NEXT: [[TMP8:%.*]] = icmp ult i32 [[TMP7]], [[TMP2]] -; CHECK-NEXT: br i1 [[TMP8]], label %[[LOOP_MEMCPY_EXPANSION]], label %[[LOOP_MEMCPY_RESIDUAL_HEADER]] -; CHECK: [[LOOP_MEMCPY_RESIDUAL:.*]]: -; CHECK-NEXT: [[RESIDUAL_LOOP_INDEX:%.*]] = phi i32 [ 0, %[[LOOP_MEMCPY_RESIDUAL_HEADER]] ], [ [[TMP13:%.*]], %[[LOOP_MEMCPY_RESIDUAL]] ] +; CHECK-NEXT: br i1 [[TMP8]], label %[[DYNAMIC_MEMCPY_LOOP_EXPANSION_MAIN_BODY]], label %[[DYNAMIC_MEMCPY_LOOP_EXPANSION_RESIDUAL_COND]] +; CHECK: [[DYNAMIC_MEMCPY_LOOP_EXPANSION_RESIDUAL_COND]]: +; CHECK-NEXT: [[TMP15:%.*]] = icmp ne i32 [[TMP1]], 0 +; CHECK-NEXT: br i1 
[[TMP15]], label %[[DYNAMIC_MEMCPY_LOOP_EXPANSION_RESIDUAL_BODY:.*]], label %[[DYNAMIC_MEMCPY_POST_LOOP_EXPANSION:.*]] +; CHECK: [[DYNAMIC_MEMCPY_LOOP_EXPANSION_RESIDUAL_BODY]]: +; CHECK-NEXT: [[RESIDUAL_LOOP_INDEX:%.*]] = phi i32 [ 0, %[[DYNAMIC_MEMCPY_LOOP_EXPANSION_RESIDUAL_COND]] ], [ [[TMP13:%.*]], %[[DYNAMIC_MEMCPY_LOOP_EXPANSION_RESIDUAL_BODY]] ] ; CHECK-NEXT: [[TMP9:%.*]] = add i32 [[TMP2]], [[RESIDUAL_LOOP_INDEX]] ; CHECK-NEXT: [[TMP10:%.*]] = add i32 [[SRC_OFF]], [[TMP9]] ; CHECK-NEXT: [[TMP11:%.*]] = call i8 @llvm.amdgcn.raw.ptr.buffer.load.i8(ptr addrspace(8) align 1 [[SRC_RSRC]], i32 [[TMP10]], i32 0, i32 0) @@ -1042,12 +1045,9 @@ define void @memcpy.inline_unknown(ptr addrspace(7) inreg %src, ptr addrspace(7) ; CHECK-NEXT: call void @llvm.amdgcn.raw.ptr.buffer.store.i8(i8 [[TMP11]], ptr addrspace(8) align 1 [[DST_RSRC]], i32 [[TMP12]], i32 0, i32 0) ; CHECK-NEXT: [[TMP13]] = add i32 [[RESIDUAL_LOOP_INDEX]], 1 ; CHECK-NEXT: [[TMP14:%.*]] = icmp ult i32 [[TMP13]], [[TMP1]] -; CHECK-NEXT: br i1 [[TMP14]], label %[[LOOP_MEMCPY_RESIDUAL]], label %[[POST_LOOP_MEMCPY_EXPANSION:.*]] -; CHECK: [[POST_LOOP_MEMCPY_EXPANSION]]: +; CHECK-NEXT: br i1 [[TMP14]], label %[[DYNAMIC_MEMCPY_LOOP_EXPANSION_RESIDUAL_BODY]], label %[[DYNAMIC_MEMCPY_POST_LOOP_EXPANSION]] +; CHECK: [[DYNAMIC_MEMCPY_POST_LOOP_EXPANSION]]: ; CHECK-NEXT: ret void -; CHECK: [[LOOP_MEMCPY_RESIDUAL_HEADER]]: -; CHECK-NEXT: [[TMP15:%.*]] = icmp ne i32 [[TMP1]], 0 -; CHECK-NEXT: br i1 [[TMP15]], label %[[LOOP_MEMCPY_RESIDUAL]], label %[[POST_LOOP_MEMCPY_EXPANSION]] ; call void @llvm.memcpy.inline.p7.p7.i32(ptr addrspace(7) %dst, ptr addrspace(7) %src, i32 %length, i1 false) ret void @@ -1058,9 +1058,9 @@ define void @memcpy.inline_known_p1_to_p7(ptr addrspace(1) inreg %src, ptr addrs ; CHECK-SAME: ptr addrspace(1) inreg [[SRC:%.*]], { ptr addrspace(8), i32 } inreg [[DST:%.*]]) #[[ATTR0]] { ; CHECK-NEXT: [[DST_RSRC:%.*]] = extractvalue { ptr addrspace(8), i32 } [[DST]], 0 ; CHECK-NEXT: [[DST_OFF:%.*]] 
= extractvalue { ptr addrspace(8), i32 } [[DST]], 1 -; CHECK-NEXT: br label %[[LOAD_STORE_LOOP:.*]] -; CHECK: [[LOAD_STORE_LOOP]]: -; CHECK-NEXT: [[LOOP_INDEX:%.*]] = phi i32 [ 0, [[TMP0:%.*]] ], [ [[TMP4:%.*]], %[[LOAD_STORE_LOOP]] ] +; CHECK-NEXT: br label %[[STATIC_MEMCPY_LOOP_EXPANSION_MAIN_BODY:.*]] +; CHECK: [[STATIC_MEMCPY_LOOP_EXPANSION_MAIN_BODY]]: +; CHECK-NEXT: [[LOOP_INDEX:%.*]] = phi i32 [ 0, [[TMP0:%.*]] ], [ [[TMP4:%.*]], %[[STATIC_MEMCPY_LOOP_EXPANSION_MAIN_BODY]] ] ; CHECK-NEXT: [[TMP1:%.*]] = getelementptr inbounds i8, ptr addrspace(1) [[SRC]], i32 [[LOOP_INDEX]] ; CHECK-NEXT: [[TMP2:%.*]] = load <64 x i32>, ptr addrspace(1) [[TMP1]], align 16 ; CHECK-NEXT: [[TMP3:%.*]] = add i32 [[DST_OFF]], [[LOOP_INDEX]] @@ -1113,8 +1113,8 @@ define void @memcpy.inline_known_p1_to_p7(ptr addrspace(1) inreg %src, ptr addrs ; CHECK-NEXT: call void @llvm.amdgcn.raw.ptr.buffer.store.v4i32(<4 x i32> [[DOTSLICE_60]], ptr addrspace(8) align 16 [[DST_RSRC]], i32 [[DOTPART_60]], i32 0, i32 0) ; CHECK-NEXT: [[TMP4]] = add i32 [[LOOP_INDEX]], 256 ; CHECK-NEXT: [[TMP5:%.*]] = icmp ult i32 [[TMP4]], 8192 -; CHECK-NEXT: br i1 [[TMP5]], label %[[LOAD_STORE_LOOP]], label %[[MEMCPY_SPLIT:.*]] -; CHECK: [[MEMCPY_SPLIT]]: +; CHECK-NEXT: br i1 [[TMP5]], label %[[STATIC_MEMCPY_LOOP_EXPANSION_MAIN_BODY]], label %[[STATIC_MEMCPY_POST_LOOP_EXPANSION:.*]] +; CHECK: [[STATIC_MEMCPY_POST_LOOP_EXPANSION]]: ; CHECK-NEXT: ret void ; call void @llvm.memcpy.inline.p7.p1.i32(ptr addrspace(7) noundef nonnull align 16 %dst, ptr addrspace(1) noundef nonnull align 16 %src, i32 8192, i1 false) @@ -1126,9 +1126,9 @@ define void @memcpy.inline_known_p7_to_p1(ptr addrspace(7) inreg %src, ptr addrs ; CHECK-SAME: { ptr addrspace(8), i32 } inreg [[SRC:%.*]], ptr addrspace(1) inreg [[DST:%.*]]) #[[ATTR0]] { ; CHECK-NEXT: [[SRC_RSRC:%.*]] = extractvalue { ptr addrspace(8), i32 } [[SRC]], 0 ; CHECK-NEXT: [[SRC_OFF:%.*]] = extractvalue { ptr addrspace(8), i32 } [[SRC]], 1 -; CHECK-NEXT: br label 
%[[LOAD_STORE_LOOP:.*]] -; CHECK: [[LOAD_STORE_LOOP]]: -; CHECK-NEXT: [[LOOP_INDEX:%.*]] = phi i32 [ 0, [[TMP0:%.*]] ], [ [[TMP4:%.*]], %[[LOAD_STORE_LOOP]] ] +; CHECK-NEXT: br label %[[STATIC_MEMCPY_LOOP_EXPANSION_MAIN_BODY:.*]] +; CHECK: [[STATIC_MEMCPY_LOOP_EXPANSION_MAIN_BODY]]: +; CHECK-NEXT: [[LOOP_INDEX:%.*]] = phi i32 [ 0, [[TMP0:%.*]] ], [ [[TMP4:%.*]], %[[STATIC_MEMCPY_LOOP_EXPANSION_MAIN_BODY]] ] ; CHECK-NEXT: [[TMP1:%.*]] = add i32 [[SRC_OFF]], [[LOOP_INDEX]] ; CHECK-NEXT: [[DOTOFF_0:%.*]] = call <4 x i32> @llvm.amdgcn.raw.ptr.buffer.load.v4i32(ptr addrspace(8) align 16 [[SRC_RSRC]], i32 [[TMP1]], i32 0, i32 0) ; CHECK-NEXT: [[DOTEXT_0:%.*]] = shufflevector <4 x i32> [[DOTOFF_0]], <4 x i32> poison, <64 x i32> @@ -1197,8 +1197,8 @@ define void @memcpy.inline_known_p7_to_p1(ptr addrspace(7) inreg %src, ptr addrs ; CHECK-NEXT: store <64 x i32> [[TMP2]], ptr addrspace(1) [[TMP3]], align 16 ; CHECK-NEXT: [[TMP4]] = add i32 [[LOOP_INDEX]], 256 ; CHECK-NEXT: [[TMP5:%.*]] = icmp ult i32 [[TMP4]], 8192 -; CHECK-NEXT: br i1 [[TMP5]], label %[[LOAD_STORE_LOOP]], label %[[MEMCPY_SPLIT:.*]] -; CHECK: [[MEMCPY_SPLIT]]: +; CHECK-NEXT: br i1 [[TMP5]], label %[[STATIC_MEMCPY_LOOP_EXPANSION_MAIN_BODY]], label %[[STATIC_MEMCPY_POST_LOOP_EXPANSION:.*]] +; CHECK: [[STATIC_MEMCPY_POST_LOOP_EXPANSION]]: ; CHECK-NEXT: ret void ; call void @llvm.memcpy.inline.p1.p7.i32(ptr addrspace(1) noundef nonnull align 16 %dst, ptr addrspace(7) noundef nonnull align 16 %src, i32 8192, i1 false) @@ -1239,9 +1239,9 @@ define void @memcpy.inline_known_p7_to_p3_long(ptr addrspace(7) inreg %src, ptr ; CHECK-SAME: { ptr addrspace(8), i32 } inreg [[SRC:%.*]], ptr addrspace(3) inreg [[DST:%.*]]) #[[ATTR0]] { ; CHECK-NEXT: [[SRC_RSRC:%.*]] = extractvalue { ptr addrspace(8), i32 } [[SRC]], 0 ; CHECK-NEXT: [[SRC_OFF:%.*]] = extractvalue { ptr addrspace(8), i32 } [[SRC]], 1 -; CHECK-NEXT: br label %[[LOAD_STORE_LOOP:.*]] -; CHECK: [[LOAD_STORE_LOOP]]: -; CHECK-NEXT: [[LOOP_INDEX:%.*]] = phi i32 [ 0, 
[[TMP0:%.*]] ], [ [[TMP4:%.*]], %[[LOAD_STORE_LOOP]] ] +; CHECK-NEXT: br label %[[STATIC_MEMCPY_LOOP_EXPANSION_MAIN_BODY:.*]] +; CHECK: [[STATIC_MEMCPY_LOOP_EXPANSION_MAIN_BODY]]: +; CHECK-NEXT: [[LOOP_INDEX:%.*]] = phi i32 [ 0, [[TMP0:%.*]] ], [ [[TMP4:%.*]], %[[STATIC_MEMCPY_LOOP_EXPANSION_MAIN_BODY]] ] ; CHECK-NEXT: [[TMP1:%.*]] = add i32 [[SRC_OFF]], [[LOOP_INDEX]] ; CHECK-NEXT: [[DOTOFF_0:%.*]] = call <4 x i32> @llvm.amdgcn.raw.ptr.buffer.load.v4i32(ptr addrspace(8) align 16 [[SRC_RSRC]], i32 [[TMP1]], i32 0, i32 0) ; CHECK-NEXT: [[DOTEXT_0:%.*]] = shufflevector <4 x i32> [[DOTOFF_0]], <4 x i32> poison, <64 x i32> @@ -1310,8 +1310,8 @@ define void @memcpy.inline_known_p7_to_p3_long(ptr addrspace(7) inreg %src, ptr ; CHECK-NEXT: store <64 x i32> [[TMP2]], ptr addrspace(3) [[TMP3]], align 16 ; CHECK-NEXT: [[TMP4]] = add i32 [[LOOP_INDEX]], 256 ; CHECK-NEXT: [[TMP5:%.*]] = icmp ult i32 [[TMP4]], 8192 -; CHECK-NEXT: br i1 [[TMP5]], label %[[LOAD_STORE_LOOP]], label %[[MEMCPY_SPLIT:.*]] -; CHECK: [[MEMCPY_SPLIT]]: +; CHECK-NEXT: br i1 [[TMP5]], label %[[STATIC_MEMCPY_LOOP_EXPANSION_MAIN_BODY]], label %[[STATIC_MEMCPY_POST_LOOP_EXPANSION:.*]] +; CHECK: [[STATIC_MEMCPY_POST_LOOP_EXPANSION]]: ; CHECK-NEXT: ret void ; call void @llvm.memcpy.inline.p3.p7.i32(ptr addrspace(3) noundef nonnull align 16 %dst, ptr addrspace(7) noundef nonnull align 16 %src, i32 8192, i1 false) diff --git a/llvm/test/CodeGen/AMDGPU/lower-mem-intrinsics.ll b/llvm/test/CodeGen/AMDGPU/lower-mem-intrinsics.ll index 5a9f53ec0077d..20a34dc997bbc 100644 --- a/llvm/test/CodeGen/AMDGPU/lower-mem-intrinsics.ll +++ b/llvm/test/CodeGen/AMDGPU/lower-mem-intrinsics.ll @@ -43,7 +43,7 @@ define amdgpu_kernel void @max_size_small_static_memcpy_caller0(ptr addrspace(1) ; ; ALL-LABEL: @max_size_small_static_memcpy_caller0( ; ALL-NEXT: br label [[LOAD_STORE_LOOP:%.*]] -; ALL: load-store-loop: +; ALL: static-memcpy-expansion-main-body: ; ALL-NEXT: [[LOOP_INDEX:%.*]] = phi i64 [ 0, [[TMP0:%.*]] ], [ [[TMP4:%.*]], 
[[LOAD_STORE_LOOP]] ] ; ALL-NEXT: [[TMP1:%.*]] = getelementptr inbounds i8, ptr addrspace(1) [[SRC:%.*]], i64 [[LOOP_INDEX]] ; ALL-NEXT: [[TMP2:%.*]] = load <64 x i32>, ptr addrspace(1) [[TMP1]], align 1 @@ -52,7 +52,7 @@ define amdgpu_kernel void @max_size_small_static_memcpy_caller0(ptr addrspace(1) ; ALL-NEXT: [[TMP4]] = add i64 [[LOOP_INDEX]], 256 ; ALL-NEXT: [[TMP5:%.*]] = icmp ult i64 [[TMP4]], 1024 ; ALL-NEXT: br i1 [[TMP5]], label [[LOAD_STORE_LOOP]], label [[MEMCPY_SPLIT:%.*]] -; ALL: memcpy-split: +; ALL: static-memcpy-post-expansion: ; ALL-NEXT: ret void ; call void @llvm.memcpy.p1.p1.i64(ptr addrspace(1) %dst, ptr addrspace(1) %src, i64 1024, i1 false) @@ -63,7 +63,7 @@ define amdgpu_kernel void @max_size_small_static_memcpy_caller0(ptr addrspace(1) define amdgpu_kernel void @min_size_large_static_memcpy_caller0(ptr addrspace(1) %dst, ptr addrspace(1) %src) #0 { ; OPT-LABEL: @min_size_large_static_memcpy_caller0( ; OPT-NEXT: br label [[LOAD_STORE_LOOP:%.*]] -; OPT: load-store-loop: +; OPT: static-memcpy-expansion-main-body: ; OPT-NEXT: [[LOOP_INDEX:%.*]] = phi i64 [ 0, [[TMP0:%.*]] ], [ [[TMP4:%.*]], [[LOAD_STORE_LOOP]] ] ; OPT-NEXT: [[TMP1:%.*]] = getelementptr inbounds i8, ptr addrspace(1) [[SRC:%.*]], i64 [[LOOP_INDEX]] ; OPT-NEXT: [[TMP2:%.*]] = load <64 x i32>, ptr addrspace(1) [[TMP1]], align 1 @@ -72,7 +72,7 @@ define amdgpu_kernel void @min_size_large_static_memcpy_caller0(ptr addrspace(1) ; OPT-NEXT: [[TMP4]] = add i64 [[LOOP_INDEX]], 256 ; OPT-NEXT: [[TMP5:%.*]] = icmp ult i64 [[TMP4]], 1024 ; OPT-NEXT: br i1 [[TMP5]], label [[LOAD_STORE_LOOP]], label [[MEMCPY_SPLIT:%.*]] -; OPT: memcpy-split: +; OPT: static-memcpy-post-expansion: ; OPT-NEXT: [[TMP6:%.*]] = getelementptr inbounds i8, ptr addrspace(1) [[SRC]], i64 1024 ; OPT-NEXT: [[TMP7:%.*]] = load i8, ptr addrspace(1) [[TMP6]], align 1 ; OPT-NEXT: [[TMP8:%.*]] = getelementptr inbounds i8, ptr addrspace(1) [[DST]], i64 1024 @@ -201,7 +201,7 @@ define amdgpu_kernel void 
@variable_memcpy_caller0(ptr addrspace(1) %dst, ptr ad ; OPT-NEXT: [[TMP3:%.*]] = sub i64 [[N]], [[TMP2]] ; OPT-NEXT: [[TMP4:%.*]] = icmp ne i64 [[TMP3]], 0 ; OPT-NEXT: br i1 [[TMP4]], label [[LOOP_MEMCPY_EXPANSION:%.*]], label [[LOOP_MEMCPY_RESIDUAL_HEADER:%.*]] -; OPT: loop-memcpy-expansion: +; OPT: dynamic-memcpy-expansion-main-body: ; OPT-NEXT: [[LOOP_INDEX:%.*]] = phi i64 [ 0, [[TMP0:%.*]] ], [ [[TMP8:%.*]], [[LOOP_MEMCPY_EXPANSION]] ] ; OPT-NEXT: [[TMP5:%.*]] = getelementptr inbounds i8, ptr addrspace(1) [[SRC:%.*]], i64 [[LOOP_INDEX]] ; OPT-NEXT: [[TMP6:%.*]] = load <4 x i32>, ptr addrspace(1) [[TMP5]], align 1 @@ -210,8 +210,11 @@ define amdgpu_kernel void @variable_memcpy_caller0(ptr addrspace(1) %dst, ptr ad ; OPT-NEXT: [[TMP8]] = add i64 [[LOOP_INDEX]], 16 ; OPT-NEXT: [[TMP9:%.*]] = icmp ult i64 [[TMP8]], [[TMP3]] ; OPT-NEXT: br i1 [[TMP9]], label [[LOOP_MEMCPY_EXPANSION]], label [[LOOP_MEMCPY_RESIDUAL_HEADER]] -; OPT: loop-memcpy-residual: -; OPT-NEXT: [[RESIDUAL_LOOP_INDEX:%.*]] = phi i64 [ 0, [[LOOP_MEMCPY_RESIDUAL_HEADER]] ], [ [[TMP14:%.*]], [[LOOP_MEMCPY_RESIDUAL:%.*]] ] +; OPT: dynamic-memcpy-expansion-residual-cond: +; OPT-NEXT: [[TMP16:%.*]] = icmp ne i64 [[TMP2]], 0 +; OPT-NEXT: br i1 [[TMP16]], label [[LOOP_MEMCPY_RESIDUAL:%.*]], label [[POST_LOOP_MEMCPY_EXPANSION:%.*]] +; OPT: dynamic-memcpy-expansion-residual-body: +; OPT-NEXT: [[RESIDUAL_LOOP_INDEX:%.*]] = phi i64 [ 0, [[LOOP_MEMCPY_RESIDUAL_HEADER]] ], [ [[TMP14:%.*]], [[LOOP_MEMCPY_RESIDUAL]] ] ; OPT-NEXT: [[TMP10:%.*]] = add i64 [[TMP3]], [[RESIDUAL_LOOP_INDEX]] ; OPT-NEXT: [[TMP11:%.*]] = getelementptr inbounds i8, ptr addrspace(1) [[SRC]], i64 [[TMP10]] ; OPT-NEXT: [[TMP12:%.*]] = load i8, ptr addrspace(1) [[TMP11]], align 1 @@ -219,12 +222,9 @@ define amdgpu_kernel void @variable_memcpy_caller0(ptr addrspace(1) %dst, ptr ad ; OPT-NEXT: store i8 [[TMP12]], ptr addrspace(1) [[TMP13]], align 1 ; OPT-NEXT: [[TMP14]] = add i64 [[RESIDUAL_LOOP_INDEX]], 1 ; OPT-NEXT: [[TMP15:%.*]] = icmp ult 
i64 [[TMP14]], [[TMP2]] -; OPT-NEXT: br i1 [[TMP15]], label [[LOOP_MEMCPY_RESIDUAL]], label [[POST_LOOP_MEMCPY_EXPANSION:%.*]] -; OPT: post-loop-memcpy-expansion: +; OPT-NEXT: br i1 [[TMP15]], label [[LOOP_MEMCPY_RESIDUAL]], label [[POST_LOOP_MEMCPY_EXPANSION]] +; OPT: dynamic-memcpy-post-expansion: ; OPT-NEXT: ret void -; OPT: loop-memcpy-residual-header: -; OPT-NEXT: [[TMP16:%.*]] = icmp ne i64 [[TMP2]], 0 -; OPT-NEXT: br i1 [[TMP16]], label [[LOOP_MEMCPY_RESIDUAL]], label [[POST_LOOP_MEMCPY_EXPANSION]] ; call void @llvm.memcpy.p1.p1.i64(ptr addrspace(1) %dst, ptr addrspace(1) %src, i64 %n, i1 false) ret void @@ -236,7 +236,7 @@ define amdgpu_kernel void @variable_memcpy_caller1(ptr addrspace(1) %dst, ptr ad ; OPT-NEXT: [[TMP3:%.*]] = sub i64 [[N]], [[TMP2]] ; OPT-NEXT: [[TMP4:%.*]] = icmp ne i64 [[TMP3]], 0 ; OPT-NEXT: br i1 [[TMP4]], label [[LOOP_MEMCPY_EXPANSION:%.*]], label [[LOOP_MEMCPY_RESIDUAL_HEADER:%.*]] -; OPT: loop-memcpy-expansion: +; OPT: dynamic-memcpy-expansion-main-body: ; OPT-NEXT: [[LOOP_INDEX:%.*]] = phi i64 [ 0, [[TMP0:%.*]] ], [ [[TMP8:%.*]], [[LOOP_MEMCPY_EXPANSION]] ] ; OPT-NEXT: [[TMP5:%.*]] = getelementptr inbounds i8, ptr addrspace(1) [[SRC:%.*]], i64 [[LOOP_INDEX]] ; OPT-NEXT: [[TMP6:%.*]] = load <4 x i32>, ptr addrspace(1) [[TMP5]], align 1 @@ -245,8 +245,11 @@ define amdgpu_kernel void @variable_memcpy_caller1(ptr addrspace(1) %dst, ptr ad ; OPT-NEXT: [[TMP8]] = add i64 [[LOOP_INDEX]], 16 ; OPT-NEXT: [[TMP9:%.*]] = icmp ult i64 [[TMP8]], [[TMP3]] ; OPT-NEXT: br i1 [[TMP9]], label [[LOOP_MEMCPY_EXPANSION]], label [[LOOP_MEMCPY_RESIDUAL_HEADER]] -; OPT: loop-memcpy-residual: -; OPT-NEXT: [[RESIDUAL_LOOP_INDEX:%.*]] = phi i64 [ 0, [[LOOP_MEMCPY_RESIDUAL_HEADER]] ], [ [[TMP14:%.*]], [[LOOP_MEMCPY_RESIDUAL:%.*]] ] +; OPT: dynamic-memcpy-expansion-residual-cond: +; OPT-NEXT: [[TMP16:%.*]] = icmp ne i64 [[TMP2]], 0 +; OPT-NEXT: br i1 [[TMP16]], label [[LOOP_MEMCPY_RESIDUAL:%.*]], label [[POST_LOOP_MEMCPY_EXPANSION:%.*]] +; OPT: 
dynamic-memcpy-expansion-residual-body: +; OPT-NEXT: [[RESIDUAL_LOOP_INDEX:%.*]] = phi i64 [ 0, [[LOOP_MEMCPY_RESIDUAL_HEADER]] ], [ [[TMP14:%.*]], [[LOOP_MEMCPY_RESIDUAL]] ] ; OPT-NEXT: [[TMP10:%.*]] = add i64 [[TMP3]], [[RESIDUAL_LOOP_INDEX]] ; OPT-NEXT: [[TMP11:%.*]] = getelementptr inbounds i8, ptr addrspace(1) [[SRC]], i64 [[TMP10]] ; OPT-NEXT: [[TMP12:%.*]] = load i8, ptr addrspace(1) [[TMP11]], align 1 @@ -254,12 +257,9 @@ define amdgpu_kernel void @variable_memcpy_caller1(ptr addrspace(1) %dst, ptr ad ; OPT-NEXT: store i8 [[TMP12]], ptr addrspace(1) [[TMP13]], align 1 ; OPT-NEXT: [[TMP14]] = add i64 [[RESIDUAL_LOOP_INDEX]], 1 ; OPT-NEXT: [[TMP15:%.*]] = icmp ult i64 [[TMP14]], [[TMP2]] -; OPT-NEXT: br i1 [[TMP15]], label [[LOOP_MEMCPY_RESIDUAL]], label [[POST_LOOP_MEMCPY_EXPANSION:%.*]] -; OPT: post-loop-memcpy-expansion: +; OPT-NEXT: br i1 [[TMP15]], label [[LOOP_MEMCPY_RESIDUAL]], label [[POST_LOOP_MEMCPY_EXPANSION]] +; OPT: dynamic-memcpy-post-expansion: ; OPT-NEXT: ret void -; OPT: loop-memcpy-residual-header: -; OPT-NEXT: [[TMP16:%.*]] = icmp ne i64 [[TMP2]], 0 -; OPT-NEXT: br i1 [[TMP16]], label [[LOOP_MEMCPY_RESIDUAL]], label [[POST_LOOP_MEMCPY_EXPANSION]] ; call void @llvm.memcpy.p1.p1.i64(ptr addrspace(1) %dst, ptr addrspace(1) %src, i64 %n, i1 false) ret void @@ -271,7 +271,7 @@ define amdgpu_kernel void @memcpy_multi_use_one_function(ptr addrspace(1) %dst0, ; OPT-NEXT: [[TMP3:%.*]] = sub i64 [[N]], [[TMP2]] ; OPT-NEXT: [[TMP4:%.*]] = icmp ne i64 [[TMP3]], 0 ; OPT-NEXT: br i1 [[TMP4]], label [[LOOP_MEMCPY_EXPANSION2:%.*]], label [[LOOP_MEMCPY_RESIDUAL_HEADER5:%.*]] -; OPT: loop-memcpy-expansion2: +; OPT: dynamic-memcpy-expansion-main-body2: ; OPT-NEXT: [[LOOP_INDEX3:%.*]] = phi i64 [ 0, [[TMP0:%.*]] ], [ [[TMP8:%.*]], [[LOOP_MEMCPY_EXPANSION2]] ] ; OPT-NEXT: [[TMP5:%.*]] = getelementptr inbounds i8, ptr addrspace(1) [[SRC:%.*]], i64 [[LOOP_INDEX3]] ; OPT-NEXT: [[TMP6:%.*]] = load <4 x i32>, ptr addrspace(1) [[TMP5]], align 1 @@ -280,8 +280,11 @@ 
define amdgpu_kernel void @memcpy_multi_use_one_function(ptr addrspace(1) %dst0, ; OPT-NEXT: [[TMP8]] = add i64 [[LOOP_INDEX3]], 16 ; OPT-NEXT: [[TMP9:%.*]] = icmp ult i64 [[TMP8]], [[TMP3]] ; OPT-NEXT: br i1 [[TMP9]], label [[LOOP_MEMCPY_EXPANSION2]], label [[LOOP_MEMCPY_RESIDUAL_HEADER5]] -; OPT: loop-memcpy-residual4: -; OPT-NEXT: [[RESIDUAL_LOOP_INDEX6:%.*]] = phi i64 [ 0, [[LOOP_MEMCPY_RESIDUAL_HEADER5]] ], [ [[TMP14:%.*]], [[LOOP_MEMCPY_RESIDUAL4:%.*]] ] +; OPT: dynamic-memcpy-expansion-residual-cond5: +; OPT-NEXT: [[TMP16:%.*]] = icmp ne i64 [[TMP2]], 0 +; OPT-NEXT: br i1 [[TMP16]], label [[LOOP_MEMCPY_RESIDUAL4:%.*]], label [[POST_LOOP_MEMCPY_EXPANSION1:%.*]] +; OPT: dynamic-memcpy-expansion-residual-body4: +; OPT-NEXT: [[RESIDUAL_LOOP_INDEX6:%.*]] = phi i64 [ 0, [[LOOP_MEMCPY_RESIDUAL_HEADER5]] ], [ [[TMP14:%.*]], [[LOOP_MEMCPY_RESIDUAL4]] ] ; OPT-NEXT: [[TMP10:%.*]] = add i64 [[TMP3]], [[RESIDUAL_LOOP_INDEX6]] ; OPT-NEXT: [[TMP11:%.*]] = getelementptr inbounds i8, ptr addrspace(1) [[SRC]], i64 [[TMP10]] ; OPT-NEXT: [[TMP12:%.*]] = load i8, ptr addrspace(1) [[TMP11]], align 1 @@ -289,13 +292,13 @@ define amdgpu_kernel void @memcpy_multi_use_one_function(ptr addrspace(1) %dst0, ; OPT-NEXT: store i8 [[TMP12]], ptr addrspace(1) [[TMP13]], align 1 ; OPT-NEXT: [[TMP14]] = add i64 [[RESIDUAL_LOOP_INDEX6]], 1 ; OPT-NEXT: [[TMP15:%.*]] = icmp ult i64 [[TMP14]], [[TMP2]] -; OPT-NEXT: br i1 [[TMP15]], label [[LOOP_MEMCPY_RESIDUAL4]], label [[POST_LOOP_MEMCPY_EXPANSION1:%.*]] -; OPT: post-loop-memcpy-expansion1: +; OPT-NEXT: br i1 [[TMP15]], label [[LOOP_MEMCPY_RESIDUAL4]], label [[POST_LOOP_MEMCPY_EXPANSION1]] +; OPT: dynamic-memcpy-post-expansion1: ; OPT-NEXT: [[TMP17:%.*]] = and i64 [[M:%.*]], 15 ; OPT-NEXT: [[TMP18:%.*]] = sub i64 [[M]], [[TMP17]] ; OPT-NEXT: [[TMP19:%.*]] = icmp ne i64 [[TMP18]], 0 ; OPT-NEXT: br i1 [[TMP19]], label [[LOOP_MEMCPY_EXPANSION:%.*]], label [[LOOP_MEMCPY_RESIDUAL_HEADER:%.*]] -; OPT: loop-memcpy-expansion: +; OPT: 
dynamic-memcpy-expansion-main-body: ; OPT-NEXT: [[LOOP_INDEX:%.*]] = phi i64 [ 0, [[POST_LOOP_MEMCPY_EXPANSION1]] ], [ [[TMP23:%.*]], [[LOOP_MEMCPY_EXPANSION]] ] ; OPT-NEXT: [[TMP20:%.*]] = getelementptr inbounds i8, ptr addrspace(1) [[SRC]], i64 [[LOOP_INDEX]] ; OPT-NEXT: [[TMP21:%.*]] = load <4 x i32>, ptr addrspace(1) [[TMP20]], align 1 @@ -304,8 +307,11 @@ define amdgpu_kernel void @memcpy_multi_use_one_function(ptr addrspace(1) %dst0, ; OPT-NEXT: [[TMP23]] = add i64 [[LOOP_INDEX]], 16 ; OPT-NEXT: [[TMP24:%.*]] = icmp ult i64 [[TMP23]], [[TMP18]] ; OPT-NEXT: br i1 [[TMP24]], label [[LOOP_MEMCPY_EXPANSION]], label [[LOOP_MEMCPY_RESIDUAL_HEADER]] -; OPT: loop-memcpy-residual: -; OPT-NEXT: [[RESIDUAL_LOOP_INDEX:%.*]] = phi i64 [ 0, [[LOOP_MEMCPY_RESIDUAL_HEADER]] ], [ [[TMP29:%.*]], [[LOOP_MEMCPY_RESIDUAL:%.*]] ] +; OPT: dynamic-memcpy-expansion-residual-cond: +; OPT-NEXT: [[TMP31:%.*]] = icmp ne i64 [[TMP17]], 0 +; OPT-NEXT: br i1 [[TMP31]], label [[LOOP_MEMCPY_RESIDUAL:%.*]], label [[POST_LOOP_MEMCPY_EXPANSION:%.*]] +; OPT: dynamic-memcpy-expansion-residual-body: +; OPT-NEXT: [[RESIDUAL_LOOP_INDEX:%.*]] = phi i64 [ 0, [[LOOP_MEMCPY_RESIDUAL_HEADER]] ], [ [[TMP29:%.*]], [[LOOP_MEMCPY_RESIDUAL]] ] ; OPT-NEXT: [[TMP25:%.*]] = add i64 [[TMP18]], [[RESIDUAL_LOOP_INDEX]] ; OPT-NEXT: [[TMP26:%.*]] = getelementptr inbounds i8, ptr addrspace(1) [[SRC]], i64 [[TMP25]] ; OPT-NEXT: [[TMP27:%.*]] = load i8, ptr addrspace(1) [[TMP26]], align 1 @@ -313,15 +319,9 @@ define amdgpu_kernel void @memcpy_multi_use_one_function(ptr addrspace(1) %dst0, ; OPT-NEXT: store i8 [[TMP27]], ptr addrspace(1) [[TMP28]], align 1 ; OPT-NEXT: [[TMP29]] = add i64 [[RESIDUAL_LOOP_INDEX]], 1 ; OPT-NEXT: [[TMP30:%.*]] = icmp ult i64 [[TMP29]], [[TMP17]] -; OPT-NEXT: br i1 [[TMP30]], label [[LOOP_MEMCPY_RESIDUAL]], label [[POST_LOOP_MEMCPY_EXPANSION:%.*]] -; OPT: post-loop-memcpy-expansion: +; OPT-NEXT: br i1 [[TMP30]], label [[LOOP_MEMCPY_RESIDUAL]], label [[POST_LOOP_MEMCPY_EXPANSION]] +; OPT: 
dynamic-memcpy-post-expansion: ; OPT-NEXT: ret void -; OPT: loop-memcpy-residual-header: -; OPT-NEXT: [[TMP31:%.*]] = icmp ne i64 [[TMP17]], 0 -; OPT-NEXT: br i1 [[TMP31]], label [[LOOP_MEMCPY_RESIDUAL]], label [[POST_LOOP_MEMCPY_EXPANSION]] -; OPT: loop-memcpy-residual-header5: -; OPT-NEXT: [[TMP32:%.*]] = icmp ne i64 [[TMP2]], 0 -; OPT-NEXT: br i1 [[TMP32]], label [[LOOP_MEMCPY_RESIDUAL4]], label [[POST_LOOP_MEMCPY_EXPANSION1]] ; call void @llvm.memcpy.p1.p1.i64(ptr addrspace(1) %dst0, ptr addrspace(1) %src, i64 %n, i1 false) call void @llvm.memcpy.p1.p1.i64(ptr addrspace(1) %dst1, ptr addrspace(1) %src, i64 %m, i1 false) @@ -334,7 +334,7 @@ define amdgpu_kernel void @memcpy_alt_type(ptr addrspace(1) %dst, ptr addrspace( ; OPT-NEXT: [[TMP3:%.*]] = sub i32 [[N]], [[TMP2]] ; OPT-NEXT: [[TMP4:%.*]] = icmp ne i32 [[TMP3]], 0 ; OPT-NEXT: br i1 [[TMP4]], label [[LOOP_MEMCPY_EXPANSION:%.*]], label [[LOOP_MEMCPY_RESIDUAL_HEADER:%.*]] -; OPT: loop-memcpy-expansion: +; OPT: dynamic-memcpy-expansion-main-body: ; OPT-NEXT: [[LOOP_INDEX:%.*]] = phi i32 [ 0, [[TMP0:%.*]] ], [ [[TMP8:%.*]], [[LOOP_MEMCPY_EXPANSION]] ] ; OPT-NEXT: [[TMP5:%.*]] = getelementptr inbounds i8, ptr addrspace(3) [[SRC:%.*]], i32 [[LOOP_INDEX]] ; OPT-NEXT: [[TMP6:%.*]] = load <4 x i32>, ptr addrspace(3) [[TMP5]], align 1 @@ -343,8 +343,11 @@ define amdgpu_kernel void @memcpy_alt_type(ptr addrspace(1) %dst, ptr addrspace( ; OPT-NEXT: [[TMP8]] = add i32 [[LOOP_INDEX]], 16 ; OPT-NEXT: [[TMP9:%.*]] = icmp ult i32 [[TMP8]], [[TMP3]] ; OPT-NEXT: br i1 [[TMP9]], label [[LOOP_MEMCPY_EXPANSION]], label [[LOOP_MEMCPY_RESIDUAL_HEADER]] -; OPT: loop-memcpy-residual: -; OPT-NEXT: [[RESIDUAL_LOOP_INDEX:%.*]] = phi i32 [ 0, [[LOOP_MEMCPY_RESIDUAL_HEADER]] ], [ [[TMP14:%.*]], [[LOOP_MEMCPY_RESIDUAL:%.*]] ] +; OPT: dynamic-memcpy-expansion-residual-cond: +; OPT-NEXT: [[TMP16:%.*]] = icmp ne i32 [[TMP2]], 0 +; OPT-NEXT: br i1 [[TMP16]], label [[LOOP_MEMCPY_RESIDUAL:%.*]], label [[POST_LOOP_MEMCPY_EXPANSION:%.*]] +; OPT: 
dynamic-memcpy-expansion-residual-body: +; OPT-NEXT: [[RESIDUAL_LOOP_INDEX:%.*]] = phi i32 [ 0, [[LOOP_MEMCPY_RESIDUAL_HEADER]] ], [ [[TMP14:%.*]], [[LOOP_MEMCPY_RESIDUAL]] ] ; OPT-NEXT: [[TMP10:%.*]] = add i32 [[TMP3]], [[RESIDUAL_LOOP_INDEX]] ; OPT-NEXT: [[TMP11:%.*]] = getelementptr inbounds i8, ptr addrspace(3) [[SRC]], i32 [[TMP10]] ; OPT-NEXT: [[TMP12:%.*]] = load i8, ptr addrspace(3) [[TMP11]], align 1 @@ -352,12 +355,9 @@ define amdgpu_kernel void @memcpy_alt_type(ptr addrspace(1) %dst, ptr addrspace( ; OPT-NEXT: store i8 [[TMP12]], ptr addrspace(1) [[TMP13]], align 1 ; OPT-NEXT: [[TMP14]] = add i32 [[RESIDUAL_LOOP_INDEX]], 1 ; OPT-NEXT: [[TMP15:%.*]] = icmp ult i32 [[TMP14]], [[TMP2]] -; OPT-NEXT: br i1 [[TMP15]], label [[LOOP_MEMCPY_RESIDUAL]], label [[POST_LOOP_MEMCPY_EXPANSION:%.*]] -; OPT: post-loop-memcpy-expansion: +; OPT-NEXT: br i1 [[TMP15]], label [[LOOP_MEMCPY_RESIDUAL]], label [[POST_LOOP_MEMCPY_EXPANSION]] +; OPT: dynamic-memcpy-post-expansion: ; OPT-NEXT: ret void -; OPT: loop-memcpy-residual-header: -; OPT-NEXT: [[TMP16:%.*]] = icmp ne i32 [[TMP2]], 0 -; OPT-NEXT: br i1 [[TMP16]], label [[LOOP_MEMCPY_RESIDUAL]], label [[POST_LOOP_MEMCPY_EXPANSION]] ; call void @llvm.memcpy.p1.p3.i32(ptr addrspace(1) %dst, ptr addrspace(3) %src, i32 %n, i1 false) ret void @@ -370,7 +370,7 @@ define amdgpu_kernel void @memcpy_multi_use_one_function_keep_small(ptr addrspac ; MAX1024-NEXT: [[TMP3:%.*]] = sub i64 [[N]], [[TMP2]] ; MAX1024-NEXT: [[TMP4:%.*]] = icmp ne i64 [[TMP3]], 0 ; MAX1024-NEXT: br i1 [[TMP4]], label [[LOOP_MEMCPY_EXPANSION:%.*]], label [[LOOP_MEMCPY_RESIDUAL_HEADER:%.*]] -; MAX1024: loop-memcpy-expansion: +; MAX1024: dynamic-memcpy-expansion-main-body: ; MAX1024-NEXT: [[LOOP_INDEX:%.*]] = phi i64 [ 0, [[TMP0:%.*]] ], [ [[TMP8:%.*]], [[LOOP_MEMCPY_EXPANSION]] ] ; MAX1024-NEXT: [[TMP5:%.*]] = getelementptr inbounds i8, ptr addrspace(1) [[SRC:%.*]], i64 [[LOOP_INDEX]] ; MAX1024-NEXT: [[TMP6:%.*]] = load <4 x i32>, ptr addrspace(1) [[TMP5]], align 
1 @@ -379,8 +379,11 @@ define amdgpu_kernel void @memcpy_multi_use_one_function_keep_small(ptr addrspac ; MAX1024-NEXT: [[TMP8]] = add i64 [[LOOP_INDEX]], 16 ; MAX1024-NEXT: [[TMP9:%.*]] = icmp ult i64 [[TMP8]], [[TMP3]] ; MAX1024-NEXT: br i1 [[TMP9]], label [[LOOP_MEMCPY_EXPANSION]], label [[LOOP_MEMCPY_RESIDUAL_HEADER]] -; MAX1024: loop-memcpy-residual: -; MAX1024-NEXT: [[RESIDUAL_LOOP_INDEX:%.*]] = phi i64 [ 0, [[LOOP_MEMCPY_RESIDUAL_HEADER]] ], [ [[TMP14:%.*]], [[LOOP_MEMCPY_RESIDUAL:%.*]] ] +; MAX1024: dynamic-memcpy-expansion-residual-cond: +; MAX1024-NEXT: [[TMP16:%.*]] = icmp ne i64 [[TMP2]], 0 +; MAX1024-NEXT: br i1 [[TMP16]], label [[LOOP_MEMCPY_RESIDUAL:%.*]], label [[POST_LOOP_MEMCPY_EXPANSION:%.*]] +; MAX1024: dynamic-memcpy-expansion-residual-body: +; MAX1024-NEXT: [[RESIDUAL_LOOP_INDEX:%.*]] = phi i64 [ 0, [[LOOP_MEMCPY_RESIDUAL_HEADER]] ], [ [[TMP14:%.*]], [[LOOP_MEMCPY_RESIDUAL]] ] ; MAX1024-NEXT: [[TMP10:%.*]] = add i64 [[TMP3]], [[RESIDUAL_LOOP_INDEX]] ; MAX1024-NEXT: [[TMP11:%.*]] = getelementptr inbounds i8, ptr addrspace(1) [[SRC]], i64 [[TMP10]] ; MAX1024-NEXT: [[TMP12:%.*]] = load i8, ptr addrspace(1) [[TMP11]], align 1 @@ -388,20 +391,17 @@ define amdgpu_kernel void @memcpy_multi_use_one_function_keep_small(ptr addrspac ; MAX1024-NEXT: store i8 [[TMP12]], ptr addrspace(1) [[TMP13]], align 1 ; MAX1024-NEXT: [[TMP14]] = add i64 [[RESIDUAL_LOOP_INDEX]], 1 ; MAX1024-NEXT: [[TMP15:%.*]] = icmp ult i64 [[TMP14]], [[TMP2]] -; MAX1024-NEXT: br i1 [[TMP15]], label [[LOOP_MEMCPY_RESIDUAL]], label [[POST_LOOP_MEMCPY_EXPANSION:%.*]] -; MAX1024: post-loop-memcpy-expansion: +; MAX1024-NEXT: br i1 [[TMP15]], label [[LOOP_MEMCPY_RESIDUAL]], label [[POST_LOOP_MEMCPY_EXPANSION]] +; MAX1024: dynamic-memcpy-post-expansion: ; MAX1024-NEXT: call void @llvm.memcpy.p1.p1.i64(ptr addrspace(1) [[DST1:%.*]], ptr addrspace(1) [[SRC]], i64 102, i1 false) ; MAX1024-NEXT: ret void -; MAX1024: loop-memcpy-residual-header: -; MAX1024-NEXT: [[TMP16:%.*]] = icmp ne i64 
[[TMP2]], 0 -; MAX1024-NEXT: br i1 [[TMP16]], label [[LOOP_MEMCPY_RESIDUAL]], label [[POST_LOOP_MEMCPY_EXPANSION]] ; ; ALL-LABEL: @memcpy_multi_use_one_function_keep_small( ; ALL-NEXT: [[TMP2:%.*]] = and i64 [[N:%.*]], 15 ; ALL-NEXT: [[TMP3:%.*]] = sub i64 [[N]], [[TMP2]] ; ALL-NEXT: [[TMP4:%.*]] = icmp ne i64 [[TMP3]], 0 ; ALL-NEXT: br i1 [[TMP4]], label [[LOOP_MEMCPY_EXPANSION:%.*]], label [[LOOP_MEMCPY_RESIDUAL_HEADER:%.*]] -; ALL: loop-memcpy-expansion: +; ALL: dynamic-memcpy-expansion-main-body: ; ALL-NEXT: [[LOOP_INDEX1:%.*]] = phi i64 [ 0, [[TMP0:%.*]] ], [ [[TMP8:%.*]], [[LOOP_MEMCPY_EXPANSION]] ] ; ALL-NEXT: [[TMP5:%.*]] = getelementptr inbounds i8, ptr addrspace(1) [[SRC:%.*]], i64 [[LOOP_INDEX1]] ; ALL-NEXT: [[TMP6:%.*]] = load <4 x i32>, ptr addrspace(1) [[TMP5]], align 1 @@ -410,8 +410,11 @@ define amdgpu_kernel void @memcpy_multi_use_one_function_keep_small(ptr addrspac ; ALL-NEXT: [[TMP8]] = add i64 [[LOOP_INDEX1]], 16 ; ALL-NEXT: [[TMP9:%.*]] = icmp ult i64 [[TMP8]], [[TMP3]] ; ALL-NEXT: br i1 [[TMP9]], label [[LOOP_MEMCPY_EXPANSION]], label [[LOOP_MEMCPY_RESIDUAL_HEADER]] -; ALL: loop-memcpy-residual: -; ALL-NEXT: [[RESIDUAL_LOOP_INDEX:%.*]] = phi i64 [ 0, [[LOOP_MEMCPY_RESIDUAL_HEADER]] ], [ [[TMP14:%.*]], [[LOOP_MEMCPY_RESIDUAL:%.*]] ] +; ALL: dynamic-memcpy-expansion-residual-cond: +; ALL-NEXT: [[TMP27:%.*]] = icmp ne i64 [[TMP2]], 0 +; ALL-NEXT: br i1 [[TMP27]], label [[LOOP_MEMCPY_RESIDUAL:%.*]], label [[POST_LOOP_MEMCPY_EXPANSION:%.*]] +; ALL: dynamic-memcpy-expansion-residual-body: +; ALL-NEXT: [[RESIDUAL_LOOP_INDEX:%.*]] = phi i64 [ 0, [[LOOP_MEMCPY_RESIDUAL_HEADER]] ], [ [[TMP14:%.*]], [[LOOP_MEMCPY_RESIDUAL]] ] ; ALL-NEXT: [[TMP10:%.*]] = add i64 [[TMP3]], [[RESIDUAL_LOOP_INDEX]] ; ALL-NEXT: [[TMP11:%.*]] = getelementptr inbounds i8, ptr addrspace(1) [[SRC]], i64 [[TMP10]] ; ALL-NEXT: [[TMP12:%.*]] = load i8, ptr addrspace(1) [[TMP11]], align 1 @@ -419,8 +422,8 @@ define amdgpu_kernel void @memcpy_multi_use_one_function_keep_small(ptr 
addrspac ; ALL-NEXT: store i8 [[TMP12]], ptr addrspace(1) [[TMP13]], align 1 ; ALL-NEXT: [[TMP14]] = add i64 [[RESIDUAL_LOOP_INDEX]], 1 ; ALL-NEXT: [[TMP15:%.*]] = icmp ult i64 [[TMP14]], [[TMP2]] -; ALL-NEXT: br i1 [[TMP15]], label [[LOOP_MEMCPY_RESIDUAL]], label [[POST_LOOP_MEMCPY_EXPANSION:%.*]] -; ALL: post-loop-memcpy-expansion: +; ALL-NEXT: br i1 [[TMP15]], label [[LOOP_MEMCPY_RESIDUAL]], label [[POST_LOOP_MEMCPY_EXPANSION]] +; ALL: dynamic-memcpy-post-expansion: ; ALL-NEXT: [[TMP16:%.*]] = getelementptr inbounds i8, ptr addrspace(1) [[SRC]], i64 0 ; ALL-NEXT: [[TMP17:%.*]] = load <4 x i32>, ptr addrspace(1) [[TMP16]], align 1 ; ALL-NEXT: [[TMP18:%.*]] = getelementptr inbounds i8, ptr addrspace(1) [[DST1:%.*]], i64 0 @@ -454,9 +457,6 @@ define amdgpu_kernel void @memcpy_multi_use_one_function_keep_small(ptr addrspac ; ALL-NEXT: [[TMP26:%.*]] = getelementptr inbounds i8, ptr addrspace(1) [[DST1]], i64 100 ; ALL-NEXT: store i16 [[TMP25]], ptr addrspace(1) [[TMP26]], align 1 ; ALL-NEXT: ret void -; ALL: loop-memcpy-residual-header: -; ALL-NEXT: [[TMP27:%.*]] = icmp ne i64 [[TMP2]], 0 -; ALL-NEXT: br i1 [[TMP27]], label [[LOOP_MEMCPY_RESIDUAL]], label [[POST_LOOP_MEMCPY_EXPANSION]] ; call void @llvm.memcpy.p1.p1.i64(ptr addrspace(1) %dst0, ptr addrspace(1) %src, i64 %n, i1 false) call void @llvm.memcpy.p1.p1.i64(ptr addrspace(1) %dst1, ptr addrspace(1) %src, i64 102, i1 false) @@ -466,7 +466,7 @@ define amdgpu_kernel void @memcpy_multi_use_one_function_keep_small(ptr addrspac define amdgpu_kernel void @memcpy_global_align4_global_align4_1028(ptr addrspace(1) %dst, ptr addrspace(1) %src) #0 { ; OPT-LABEL: @memcpy_global_align4_global_align4_1028( ; OPT-NEXT: br label [[LOAD_STORE_LOOP:%.*]] -; OPT: load-store-loop: +; OPT: static-memcpy-expansion-main-body: ; OPT-NEXT: [[LOOP_INDEX:%.*]] = phi i64 [ 0, [[TMP0:%.*]] ], [ [[TMP4:%.*]], [[LOAD_STORE_LOOP]] ] ; OPT-NEXT: [[TMP1:%.*]] = getelementptr inbounds i8, ptr addrspace(1) [[SRC:%.*]], i64 [[LOOP_INDEX]] ; 
OPT-NEXT: [[TMP2:%.*]] = load <64 x i32>, ptr addrspace(1) [[TMP1]], align 4 @@ -475,7 +475,7 @@ define amdgpu_kernel void @memcpy_global_align4_global_align4_1028(ptr addrspace ; OPT-NEXT: [[TMP4]] = add i64 [[LOOP_INDEX]], 256 ; OPT-NEXT: [[TMP5:%.*]] = icmp ult i64 [[TMP4]], 1024 ; OPT-NEXT: br i1 [[TMP5]], label [[LOAD_STORE_LOOP]], label [[MEMCPY_SPLIT:%.*]] -; OPT: memcpy-split: +; OPT: static-memcpy-post-expansion: ; OPT-NEXT: [[TMP6:%.*]] = getelementptr inbounds i8, ptr addrspace(1) [[SRC]], i64 1024 ; OPT-NEXT: [[TMP7:%.*]] = load i32, ptr addrspace(1) [[TMP6]], align 4 ; OPT-NEXT: [[TMP8:%.*]] = getelementptr inbounds i8, ptr addrspace(1) [[DST]], i64 1024 @@ -489,7 +489,7 @@ define amdgpu_kernel void @memcpy_global_align4_global_align4_1028(ptr addrspace define amdgpu_kernel void @memcpy_global_align4_global_align4_1025(ptr addrspace(1) %dst, ptr addrspace(1) %src) #0 { ; OPT-LABEL: @memcpy_global_align4_global_align4_1025( ; OPT-NEXT: br label [[LOAD_STORE_LOOP:%.*]] -; OPT: load-store-loop: +; OPT: static-memcpy-expansion-main-body: ; OPT-NEXT: [[LOOP_INDEX:%.*]] = phi i64 [ 0, [[TMP0:%.*]] ], [ [[TMP4:%.*]], [[LOAD_STORE_LOOP]] ] ; OPT-NEXT: [[TMP1:%.*]] = getelementptr inbounds i8, ptr addrspace(1) [[SRC:%.*]], i64 [[LOOP_INDEX]] ; OPT-NEXT: [[TMP2:%.*]] = load <64 x i32>, ptr addrspace(1) [[TMP1]], align 4 @@ -498,7 +498,7 @@ define amdgpu_kernel void @memcpy_global_align4_global_align4_1025(ptr addrspace ; OPT-NEXT: [[TMP4]] = add i64 [[LOOP_INDEX]], 256 ; OPT-NEXT: [[TMP5:%.*]] = icmp ult i64 [[TMP4]], 1024 ; OPT-NEXT: br i1 [[TMP5]], label [[LOAD_STORE_LOOP]], label [[MEMCPY_SPLIT:%.*]] -; OPT: memcpy-split: +; OPT: static-memcpy-post-expansion: ; OPT-NEXT: [[TMP6:%.*]] = getelementptr inbounds i8, ptr addrspace(1) [[SRC]], i64 1024 ; OPT-NEXT: [[TMP7:%.*]] = load i8, ptr addrspace(1) [[TMP6]], align 4 ; OPT-NEXT: [[TMP8:%.*]] = getelementptr inbounds i8, ptr addrspace(1) [[DST]], i64 1024 @@ -512,7 +512,7 @@ define amdgpu_kernel void 
@memcpy_global_align4_global_align4_1025(ptr addrspace define amdgpu_kernel void @memcpy_global_align4_global_align4_1026(ptr addrspace(1) %dst, ptr addrspace(1) %src) #0 { ; OPT-LABEL: @memcpy_global_align4_global_align4_1026( ; OPT-NEXT: br label [[LOAD_STORE_LOOP:%.*]] -; OPT: load-store-loop: +; OPT: static-memcpy-expansion-main-body: ; OPT-NEXT: [[LOOP_INDEX:%.*]] = phi i64 [ 0, [[TMP0:%.*]] ], [ [[TMP4:%.*]], [[LOAD_STORE_LOOP]] ] ; OPT-NEXT: [[TMP1:%.*]] = getelementptr inbounds i8, ptr addrspace(1) [[SRC:%.*]], i64 [[LOOP_INDEX]] ; OPT-NEXT: [[TMP2:%.*]] = load <64 x i32>, ptr addrspace(1) [[TMP1]], align 4 @@ -521,7 +521,7 @@ define amdgpu_kernel void @memcpy_global_align4_global_align4_1026(ptr addrspace ; OPT-NEXT: [[TMP4]] = add i64 [[LOOP_INDEX]], 256 ; OPT-NEXT: [[TMP5:%.*]] = icmp ult i64 [[TMP4]], 1024 ; OPT-NEXT: br i1 [[TMP5]], label [[LOAD_STORE_LOOP]], label [[MEMCPY_SPLIT:%.*]] -; OPT: memcpy-split: +; OPT: static-memcpy-post-expansion: ; OPT-NEXT: [[TMP6:%.*]] = getelementptr inbounds i8, ptr addrspace(1) [[SRC]], i64 1024 ; OPT-NEXT: [[TMP7:%.*]] = load i16, ptr addrspace(1) [[TMP6]], align 4 ; OPT-NEXT: [[TMP8:%.*]] = getelementptr inbounds i8, ptr addrspace(1) [[DST]], i64 1024 @@ -535,7 +535,7 @@ define amdgpu_kernel void @memcpy_global_align4_global_align4_1026(ptr addrspace define amdgpu_kernel void @memcpy_global_align4_global_align4_1032(ptr addrspace(1) %dst, ptr addrspace(1) %src) #0 { ; OPT-LABEL: @memcpy_global_align4_global_align4_1032( ; OPT-NEXT: br label [[LOAD_STORE_LOOP:%.*]] -; OPT: load-store-loop: +; OPT: static-memcpy-expansion-main-body: ; OPT-NEXT: [[LOOP_INDEX:%.*]] = phi i64 [ 0, [[TMP0:%.*]] ], [ [[TMP4:%.*]], [[LOAD_STORE_LOOP]] ] ; OPT-NEXT: [[TMP1:%.*]] = getelementptr inbounds i8, ptr addrspace(1) [[SRC:%.*]], i64 [[LOOP_INDEX]] ; OPT-NEXT: [[TMP2:%.*]] = load <64 x i32>, ptr addrspace(1) [[TMP1]], align 4 @@ -544,7 +544,7 @@ define amdgpu_kernel void @memcpy_global_align4_global_align4_1032(ptr addrspace ; 
OPT-NEXT: [[TMP4]] = add i64 [[LOOP_INDEX]], 256 ; OPT-NEXT: [[TMP5:%.*]] = icmp ult i64 [[TMP4]], 1024 ; OPT-NEXT: br i1 [[TMP5]], label [[LOAD_STORE_LOOP]], label [[MEMCPY_SPLIT:%.*]] -; OPT: memcpy-split: +; OPT: static-memcpy-post-expansion: ; OPT-NEXT: [[TMP6:%.*]] = getelementptr inbounds i8, ptr addrspace(1) [[SRC]], i64 1024 ; OPT-NEXT: [[TMP7:%.*]] = load i64, ptr addrspace(1) [[TMP6]], align 4 ; OPT-NEXT: [[TMP8:%.*]] = getelementptr inbounds i8, ptr addrspace(1) [[DST]], i64 1024 @@ -558,7 +558,7 @@ define amdgpu_kernel void @memcpy_global_align4_global_align4_1032(ptr addrspace define amdgpu_kernel void @memcpy_global_align4_global_align4_1034(ptr addrspace(1) %dst, ptr addrspace(1) %src) #0 { ; OPT-LABEL: @memcpy_global_align4_global_align4_1034( ; OPT-NEXT: br label [[LOAD_STORE_LOOP:%.*]] -; OPT: load-store-loop: +; OPT: static-memcpy-expansion-main-body: ; OPT-NEXT: [[LOOP_INDEX:%.*]] = phi i64 [ 0, [[TMP0:%.*]] ], [ [[TMP4:%.*]], [[LOAD_STORE_LOOP]] ] ; OPT-NEXT: [[TMP1:%.*]] = getelementptr inbounds i8, ptr addrspace(1) [[SRC:%.*]], i64 [[LOOP_INDEX]] ; OPT-NEXT: [[TMP2:%.*]] = load <64 x i32>, ptr addrspace(1) [[TMP1]], align 4 @@ -567,7 +567,7 @@ define amdgpu_kernel void @memcpy_global_align4_global_align4_1034(ptr addrspace ; OPT-NEXT: [[TMP4]] = add i64 [[LOOP_INDEX]], 256 ; OPT-NEXT: [[TMP5:%.*]] = icmp ult i64 [[TMP4]], 1024 ; OPT-NEXT: br i1 [[TMP5]], label [[LOAD_STORE_LOOP]], label [[MEMCPY_SPLIT:%.*]] -; OPT: memcpy-split: +; OPT: static-memcpy-post-expansion: ; OPT-NEXT: [[TMP6:%.*]] = getelementptr inbounds i8, ptr addrspace(1) [[SRC]], i64 1024 ; OPT-NEXT: [[TMP7:%.*]] = load i64, ptr addrspace(1) [[TMP6]], align 4 ; OPT-NEXT: [[TMP8:%.*]] = getelementptr inbounds i8, ptr addrspace(1) [[DST]], i64 1024 @@ -585,7 +585,7 @@ define amdgpu_kernel void @memcpy_global_align4_global_align4_1034(ptr addrspace define amdgpu_kernel void @memcpy_global_align4_global_align4_1035(ptr addrspace(1) %dst, ptr addrspace(1) %src) #0 { ; OPT-LABEL: 
@memcpy_global_align4_global_align4_1035( ; OPT-NEXT: br label [[LOAD_STORE_LOOP:%.*]] -; OPT: load-store-loop: +; OPT: static-memcpy-expansion-main-body: ; OPT-NEXT: [[LOOP_INDEX:%.*]] = phi i64 [ 0, [[TMP0:%.*]] ], [ [[TMP4:%.*]], [[LOAD_STORE_LOOP]] ] ; OPT-NEXT: [[TMP1:%.*]] = getelementptr inbounds i8, ptr addrspace(1) [[SRC:%.*]], i64 [[LOOP_INDEX]] ; OPT-NEXT: [[TMP2:%.*]] = load <64 x i32>, ptr addrspace(1) [[TMP1]], align 4 @@ -594,7 +594,7 @@ define amdgpu_kernel void @memcpy_global_align4_global_align4_1035(ptr addrspace ; OPT-NEXT: [[TMP4]] = add i64 [[LOOP_INDEX]], 256 ; OPT-NEXT: [[TMP5:%.*]] = icmp ult i64 [[TMP4]], 1024 ; OPT-NEXT: br i1 [[TMP5]], label [[LOAD_STORE_LOOP]], label [[MEMCPY_SPLIT:%.*]] -; OPT: memcpy-split: +; OPT: static-memcpy-post-expansion: ; OPT-NEXT: [[TMP6:%.*]] = getelementptr inbounds i8, ptr addrspace(1) [[SRC]], i64 1024 ; OPT-NEXT: [[TMP7:%.*]] = load i64, ptr addrspace(1) [[TMP6]], align 4 ; OPT-NEXT: [[TMP8:%.*]] = getelementptr inbounds i8, ptr addrspace(1) [[DST]], i64 1024 @@ -616,7 +616,7 @@ define amdgpu_kernel void @memcpy_global_align4_global_align4_1035(ptr addrspace define amdgpu_kernel void @memcpy_global_align4_global_align4_1036(ptr addrspace(1) %dst, ptr addrspace(1) %src) #0 { ; OPT-LABEL: @memcpy_global_align4_global_align4_1036( ; OPT-NEXT: br label [[LOAD_STORE_LOOP:%.*]] -; OPT: load-store-loop: +; OPT: static-memcpy-expansion-main-body: ; OPT-NEXT: [[LOOP_INDEX:%.*]] = phi i64 [ 0, [[TMP0:%.*]] ], [ [[TMP4:%.*]], [[LOAD_STORE_LOOP]] ] ; OPT-NEXT: [[TMP1:%.*]] = getelementptr inbounds i8, ptr addrspace(1) [[SRC:%.*]], i64 [[LOOP_INDEX]] ; OPT-NEXT: [[TMP2:%.*]] = load <64 x i32>, ptr addrspace(1) [[TMP1]], align 4 @@ -625,7 +625,7 @@ define amdgpu_kernel void @memcpy_global_align4_global_align4_1036(ptr addrspace ; OPT-NEXT: [[TMP4]] = add i64 [[LOOP_INDEX]], 256 ; OPT-NEXT: [[TMP5:%.*]] = icmp ult i64 [[TMP4]], 1024 ; OPT-NEXT: br i1 [[TMP5]], label [[LOAD_STORE_LOOP]], label [[MEMCPY_SPLIT:%.*]] -; 
OPT: memcpy-split: +; OPT: static-memcpy-post-expansion: ; OPT-NEXT: [[TMP6:%.*]] = getelementptr inbounds i8, ptr addrspace(1) [[SRC]], i64 1024 ; OPT-NEXT: [[TMP7:%.*]] = load i64, ptr addrspace(1) [[TMP6]], align 4 ; OPT-NEXT: [[TMP8:%.*]] = getelementptr inbounds i8, ptr addrspace(1) [[DST]], i64 1024 @@ -643,7 +643,7 @@ define amdgpu_kernel void @memcpy_global_align4_global_align4_1036(ptr addrspace define amdgpu_kernel void @memcpy_global_align4_global_align4_1039(ptr addrspace(1) %dst, ptr addrspace(1) %src) #0 { ; OPT-LABEL: @memcpy_global_align4_global_align4_1039( ; OPT-NEXT: br label [[LOAD_STORE_LOOP:%.*]] -; OPT: load-store-loop: +; OPT: static-memcpy-expansion-main-body: ; OPT-NEXT: [[LOOP_INDEX:%.*]] = phi i64 [ 0, [[TMP0:%.*]] ], [ [[TMP4:%.*]], [[LOAD_STORE_LOOP]] ] ; OPT-NEXT: [[TMP1:%.*]] = getelementptr inbounds i8, ptr addrspace(1) [[SRC:%.*]], i64 [[LOOP_INDEX]] ; OPT-NEXT: [[TMP2:%.*]] = load <64 x i32>, ptr addrspace(1) [[TMP1]], align 4 @@ -652,7 +652,7 @@ define amdgpu_kernel void @memcpy_global_align4_global_align4_1039(ptr addrspace ; OPT-NEXT: [[TMP4]] = add i64 [[LOOP_INDEX]], 256 ; OPT-NEXT: [[TMP5:%.*]] = icmp ult i64 [[TMP4]], 1024 ; OPT-NEXT: br i1 [[TMP5]], label [[LOAD_STORE_LOOP]], label [[MEMCPY_SPLIT:%.*]] -; OPT: memcpy-split: +; OPT: static-memcpy-post-expansion: ; OPT-NEXT: [[TMP6:%.*]] = getelementptr inbounds i8, ptr addrspace(1) [[SRC]], i64 1024 ; OPT-NEXT: [[TMP7:%.*]] = load i64, ptr addrspace(1) [[TMP6]], align 4 ; OPT-NEXT: [[TMP8:%.*]] = getelementptr inbounds i8, ptr addrspace(1) [[DST]], i64 1024 @@ -678,7 +678,7 @@ define amdgpu_kernel void @memcpy_global_align4_global_align4_1039(ptr addrspace define amdgpu_kernel void @memcpy_global_align2_global_align2_1039(ptr addrspace(1) %dst, ptr addrspace(1) %src) #0 { ; OPT-LABEL: @memcpy_global_align2_global_align2_1039( ; OPT-NEXT: br label [[LOAD_STORE_LOOP:%.*]] -; OPT: load-store-loop: +; OPT: static-memcpy-expansion-main-body: ; OPT-NEXT: [[LOOP_INDEX:%.*]] = phi 
i64 [ 0, [[TMP0:%.*]] ], [ [[TMP4:%.*]], [[LOAD_STORE_LOOP]] ] ; OPT-NEXT: [[TMP1:%.*]] = getelementptr inbounds i8, ptr addrspace(1) [[SRC:%.*]], i64 [[LOOP_INDEX]] ; OPT-NEXT: [[TMP2:%.*]] = load <64 x i32>, ptr addrspace(1) [[TMP1]], align 2 @@ -687,7 +687,7 @@ define amdgpu_kernel void @memcpy_global_align2_global_align2_1039(ptr addrspace ; OPT-NEXT: [[TMP4]] = add i64 [[LOOP_INDEX]], 256 ; OPT-NEXT: [[TMP5:%.*]] = icmp ult i64 [[TMP4]], 1024 ; OPT-NEXT: br i1 [[TMP5]], label [[LOAD_STORE_LOOP]], label [[MEMCPY_SPLIT:%.*]] -; OPT: memcpy-split: +; OPT: static-memcpy-post-expansion: ; OPT-NEXT: [[TMP15:%.*]] = getelementptr inbounds i8, ptr addrspace(1) [[SRC]], i64 1024 ; OPT-NEXT: [[TMP16:%.*]] = load i64, ptr addrspace(1) [[TMP15]], align 2 ; OPT-NEXT: [[TMP17:%.*]] = getelementptr inbounds i8, ptr addrspace(1) [[DST]], i64 1024 @@ -713,7 +713,7 @@ define amdgpu_kernel void @memcpy_global_align2_global_align2_1039(ptr addrspace define amdgpu_kernel void @memcpy_global_align4_global_align4_1027(ptr addrspace(1) %dst, ptr addrspace(1) %src) #0 { ; OPT-LABEL: @memcpy_global_align4_global_align4_1027( ; OPT-NEXT: br label [[LOAD_STORE_LOOP:%.*]] -; OPT: load-store-loop: +; OPT: static-memcpy-expansion-main-body: ; OPT-NEXT: [[LOOP_INDEX:%.*]] = phi i64 [ 0, [[TMP0:%.*]] ], [ [[TMP4:%.*]], [[LOAD_STORE_LOOP]] ] ; OPT-NEXT: [[TMP1:%.*]] = getelementptr inbounds i8, ptr addrspace(1) [[SRC:%.*]], i64 [[LOOP_INDEX]] ; OPT-NEXT: [[TMP2:%.*]] = load <64 x i32>, ptr addrspace(1) [[TMP1]], align 4 @@ -722,7 +722,7 @@ define amdgpu_kernel void @memcpy_global_align4_global_align4_1027(ptr addrspace ; OPT-NEXT: [[TMP4]] = add i64 [[LOOP_INDEX]], 256 ; OPT-NEXT: [[TMP5:%.*]] = icmp ult i64 [[TMP4]], 1024 ; OPT-NEXT: br i1 [[TMP5]], label [[LOAD_STORE_LOOP]], label [[MEMCPY_SPLIT:%.*]] -; OPT: memcpy-split: +; OPT: static-memcpy-post-expansion: ; OPT-NEXT: [[TMP6:%.*]] = getelementptr inbounds i8, ptr addrspace(1) [[SRC]], i64 1024 ; OPT-NEXT: [[TMP7:%.*]] = load i16, ptr 
addrspace(1) [[TMP6]], align 4 ; OPT-NEXT: [[TMP8:%.*]] = getelementptr inbounds i8, ptr addrspace(1) [[DST]], i64 1024 @@ -740,7 +740,7 @@ define amdgpu_kernel void @memcpy_global_align4_global_align4_1027(ptr addrspace define amdgpu_kernel void @memcpy_global_align2_global_align4_1027(ptr addrspace(1) %dst, ptr addrspace(1) %src) #0 { ; OPT-LABEL: @memcpy_global_align2_global_align4_1027( ; OPT-NEXT: br label [[LOAD_STORE_LOOP:%.*]] -; OPT: load-store-loop: +; OPT: static-memcpy-expansion-main-body: ; OPT-NEXT: [[LOOP_INDEX:%.*]] = phi i64 [ 0, [[TMP0:%.*]] ], [ [[TMP4:%.*]], [[LOAD_STORE_LOOP]] ] ; OPT-NEXT: [[TMP1:%.*]] = getelementptr inbounds i8, ptr addrspace(1) [[SRC:%.*]], i64 [[LOOP_INDEX]] ; OPT-NEXT: [[TMP2:%.*]] = load <64 x i32>, ptr addrspace(1) [[TMP1]], align 4 @@ -749,7 +749,7 @@ define amdgpu_kernel void @memcpy_global_align2_global_align4_1027(ptr addrspace ; OPT-NEXT: [[TMP4]] = add i64 [[LOOP_INDEX]], 256 ; OPT-NEXT: [[TMP5:%.*]] = icmp ult i64 [[TMP4]], 1024 ; OPT-NEXT: br i1 [[TMP5]], label [[LOAD_STORE_LOOP]], label [[MEMCPY_SPLIT:%.*]] -; OPT: memcpy-split: +; OPT: static-memcpy-post-expansion: ; OPT-NEXT: [[TMP9:%.*]] = getelementptr inbounds i8, ptr addrspace(1) [[SRC]], i64 1024 ; OPT-NEXT: [[TMP10:%.*]] = load i16, ptr addrspace(1) [[TMP9]], align 4 ; OPT-NEXT: [[TMP11:%.*]] = getelementptr inbounds i8, ptr addrspace(1) [[DST]], i64 1024 @@ -767,7 +767,7 @@ define amdgpu_kernel void @memcpy_global_align2_global_align4_1027(ptr addrspace define amdgpu_kernel void @memcpy_global_align4_global_align2_1027(ptr addrspace(1) %dst, ptr addrspace(1) %src) #0 { ; OPT-LABEL: @memcpy_global_align4_global_align2_1027( ; OPT-NEXT: br label [[LOAD_STORE_LOOP:%.*]] -; OPT: load-store-loop: +; OPT: static-memcpy-expansion-main-body: ; OPT-NEXT: [[LOOP_INDEX:%.*]] = phi i64 [ 0, [[TMP0:%.*]] ], [ [[TMP4:%.*]], [[LOAD_STORE_LOOP]] ] ; OPT-NEXT: [[TMP1:%.*]] = getelementptr inbounds i8, ptr addrspace(1) [[SRC:%.*]], i64 [[LOOP_INDEX]] ; OPT-NEXT: 
[[TMP2:%.*]] = load <64 x i32>, ptr addrspace(1) [[TMP1]], align 2 @@ -776,7 +776,7 @@ define amdgpu_kernel void @memcpy_global_align4_global_align2_1027(ptr addrspace ; OPT-NEXT: [[TMP4]] = add i64 [[LOOP_INDEX]], 256 ; OPT-NEXT: [[TMP5:%.*]] = icmp ult i64 [[TMP4]], 1024 ; OPT-NEXT: br i1 [[TMP5]], label [[LOAD_STORE_LOOP]], label [[MEMCPY_SPLIT:%.*]] -; OPT: memcpy-split: +; OPT: static-memcpy-post-expansion: ; OPT-NEXT: [[TMP9:%.*]] = getelementptr inbounds i8, ptr addrspace(1) [[SRC]], i64 1024 ; OPT-NEXT: [[TMP10:%.*]] = load i16, ptr addrspace(1) [[TMP9]], align 2 ; OPT-NEXT: [[TMP11:%.*]] = getelementptr inbounds i8, ptr addrspace(1) [[DST]], i64 1024 @@ -794,7 +794,7 @@ define amdgpu_kernel void @memcpy_global_align4_global_align2_1027(ptr addrspace define amdgpu_kernel void @memcpy_private_align4_private_align4_1027(ptr addrspace(5) %dst, ptr addrspace(5) %src) #0 { ; OPT-LABEL: @memcpy_private_align4_private_align4_1027( ; OPT-NEXT: br label [[LOAD_STORE_LOOP:%.*]] -; OPT: load-store-loop: +; OPT: static-memcpy-expansion-main-body: ; OPT-NEXT: [[LOOP_INDEX:%.*]] = phi i32 [ 0, [[TMP0:%.*]] ], [ [[TMP4:%.*]], [[LOAD_STORE_LOOP]] ] ; OPT-NEXT: [[TMP1:%.*]] = getelementptr inbounds i8, ptr addrspace(5) [[SRC:%.*]], i32 [[LOOP_INDEX]] ; OPT-NEXT: [[TMP2:%.*]] = load <64 x i32>, ptr addrspace(5) [[TMP1]], align 4 @@ -803,7 +803,7 @@ define amdgpu_kernel void @memcpy_private_align4_private_align4_1027(ptr addrspa ; OPT-NEXT: [[TMP4]] = add i32 [[LOOP_INDEX]], 256 ; OPT-NEXT: [[TMP5:%.*]] = icmp ult i32 [[TMP4]], 1024 ; OPT-NEXT: br i1 [[TMP5]], label [[LOAD_STORE_LOOP]], label [[MEMCPY_SPLIT:%.*]] -; OPT: memcpy-split: +; OPT: static-memcpy-post-expansion: ; OPT-NEXT: [[TMP6:%.*]] = getelementptr inbounds i8, ptr addrspace(5) [[SRC]], i32 1024 ; OPT-NEXT: [[TMP7:%.*]] = load i16, ptr addrspace(5) [[TMP6]], align 4 ; OPT-NEXT: [[TMP8:%.*]] = getelementptr inbounds i8, ptr addrspace(5) [[DST]], i32 1024 @@ -821,7 +821,7 @@ define amdgpu_kernel void 
@memcpy_private_align4_private_align4_1027(ptr addrspa define amdgpu_kernel void @memcpy_private_align2_private_align4_1027(ptr addrspace(5) %dst, ptr addrspace(5) %src) #0 { ; OPT-LABEL: @memcpy_private_align2_private_align4_1027( ; OPT-NEXT: br label [[LOAD_STORE_LOOP:%.*]] -; OPT: load-store-loop: +; OPT: static-memcpy-expansion-main-body: ; OPT-NEXT: [[LOOP_INDEX:%.*]] = phi i32 [ 0, [[TMP0:%.*]] ], [ [[TMP4:%.*]], [[LOAD_STORE_LOOP]] ] ; OPT-NEXT: [[TMP1:%.*]] = getelementptr inbounds i8, ptr addrspace(5) [[SRC:%.*]], i32 [[LOOP_INDEX]] ; OPT-NEXT: [[TMP2:%.*]] = load <64 x i32>, ptr addrspace(5) [[TMP1]], align 4 @@ -830,7 +830,7 @@ define amdgpu_kernel void @memcpy_private_align2_private_align4_1027(ptr addrspa ; OPT-NEXT: [[TMP4]] = add i32 [[LOOP_INDEX]], 256 ; OPT-NEXT: [[TMP5:%.*]] = icmp ult i32 [[TMP4]], 1024 ; OPT-NEXT: br i1 [[TMP5]], label [[LOAD_STORE_LOOP]], label [[MEMCPY_SPLIT:%.*]] -; OPT: memcpy-split: +; OPT: static-memcpy-post-expansion: ; OPT-NEXT: [[TMP9:%.*]] = getelementptr inbounds i8, ptr addrspace(5) [[SRC]], i32 1024 ; OPT-NEXT: [[TMP10:%.*]] = load i16, ptr addrspace(5) [[TMP9]], align 4 ; OPT-NEXT: [[TMP11:%.*]] = getelementptr inbounds i8, ptr addrspace(5) [[DST]], i32 1024 @@ -848,7 +848,7 @@ define amdgpu_kernel void @memcpy_private_align2_private_align4_1027(ptr addrspa define amdgpu_kernel void @memcpy_private_align1_private_align4_1027(ptr addrspace(5) %dst, ptr addrspace(5) %src) #0 { ; OPT-LABEL: @memcpy_private_align1_private_align4_1027( ; OPT-NEXT: br label [[LOAD_STORE_LOOP:%.*]] -; OPT: load-store-loop: +; OPT: static-memcpy-expansion-main-body: ; OPT-NEXT: [[LOOP_INDEX:%.*]] = phi i32 [ 0, [[TMP0:%.*]] ], [ [[TMP4:%.*]], [[LOAD_STORE_LOOP]] ] ; OPT-NEXT: [[TMP1:%.*]] = getelementptr inbounds i8, ptr addrspace(5) [[SRC:%.*]], i32 [[LOOP_INDEX]] ; OPT-NEXT: [[TMP2:%.*]] = load <64 x i32>, ptr addrspace(5) [[TMP1]], align 4 @@ -857,7 +857,7 @@ define amdgpu_kernel void @memcpy_private_align1_private_align4_1027(ptr 
addrspa ; OPT-NEXT: [[TMP4]] = add i32 [[LOOP_INDEX]], 256 ; OPT-NEXT: [[TMP5:%.*]] = icmp ult i32 [[TMP4]], 1024 ; OPT-NEXT: br i1 [[TMP5]], label [[LOAD_STORE_LOOP]], label [[MEMCPY_SPLIT:%.*]] -; OPT: memcpy-split: +; OPT: static-memcpy-post-expansion: ; OPT-NEXT: [[TMP6:%.*]] = getelementptr inbounds i8, ptr addrspace(5) [[SRC]], i32 1024 ; OPT-NEXT: [[TMP7:%.*]] = load i16, ptr addrspace(5) [[TMP6]], align 4 ; OPT-NEXT: [[TMP8:%.*]] = getelementptr inbounds i8, ptr addrspace(5) [[DST]], i32 1024 @@ -875,7 +875,7 @@ define amdgpu_kernel void @memcpy_private_align1_private_align4_1027(ptr addrspa define amdgpu_kernel void @memcpy_private_align4_private_align2_1027(ptr addrspace(5) %dst, ptr addrspace(5) %src) #0 { ; OPT-LABEL: @memcpy_private_align4_private_align2_1027( ; OPT-NEXT: br label [[LOAD_STORE_LOOP:%.*]] -; OPT: load-store-loop: +; OPT: static-memcpy-expansion-main-body: ; OPT-NEXT: [[LOOP_INDEX:%.*]] = phi i32 [ 0, [[TMP0:%.*]] ], [ [[TMP4:%.*]], [[LOAD_STORE_LOOP]] ] ; OPT-NEXT: [[TMP1:%.*]] = getelementptr inbounds i8, ptr addrspace(5) [[SRC:%.*]], i32 [[LOOP_INDEX]] ; OPT-NEXT: [[TMP2:%.*]] = load <64 x i32>, ptr addrspace(5) [[TMP1]], align 2 @@ -884,7 +884,7 @@ define amdgpu_kernel void @memcpy_private_align4_private_align2_1027(ptr addrspa ; OPT-NEXT: [[TMP4]] = add i32 [[LOOP_INDEX]], 256 ; OPT-NEXT: [[TMP5:%.*]] = icmp ult i32 [[TMP4]], 1024 ; OPT-NEXT: br i1 [[TMP5]], label [[LOAD_STORE_LOOP]], label [[MEMCPY_SPLIT:%.*]] -; OPT: memcpy-split: +; OPT: static-memcpy-post-expansion: ; OPT-NEXT: [[TMP9:%.*]] = getelementptr inbounds i8, ptr addrspace(5) [[SRC]], i32 1024 ; OPT-NEXT: [[TMP10:%.*]] = load i16, ptr addrspace(5) [[TMP9]], align 2 ; OPT-NEXT: [[TMP11:%.*]] = getelementptr inbounds i8, ptr addrspace(5) [[DST]], i32 1024 @@ -902,7 +902,7 @@ define amdgpu_kernel void @memcpy_private_align4_private_align2_1027(ptr addrspa define amdgpu_kernel void @memcpy_private_align4_private_align1_1027(ptr addrspace(5) %dst, ptr addrspace(5) %src) #0 
{ ; OPT-LABEL: @memcpy_private_align4_private_align1_1027( ; OPT-NEXT: br label [[LOAD_STORE_LOOP:%.*]] -; OPT: load-store-loop: +; OPT: static-memcpy-expansion-main-body: ; OPT-NEXT: [[LOOP_INDEX:%.*]] = phi i32 [ 0, [[TMP0:%.*]] ], [ [[TMP4:%.*]], [[LOAD_STORE_LOOP]] ] ; OPT-NEXT: [[TMP1:%.*]] = getelementptr inbounds i8, ptr addrspace(5) [[SRC:%.*]], i32 [[LOOP_INDEX]] ; OPT-NEXT: [[TMP2:%.*]] = load <64 x i32>, ptr addrspace(5) [[TMP1]], align 1 @@ -911,7 +911,7 @@ define amdgpu_kernel void @memcpy_private_align4_private_align1_1027(ptr addrspa ; OPT-NEXT: [[TMP4]] = add i32 [[LOOP_INDEX]], 256 ; OPT-NEXT: [[TMP5:%.*]] = icmp ult i32 [[TMP4]], 1024 ; OPT-NEXT: br i1 [[TMP5]], label [[LOAD_STORE_LOOP]], label [[MEMCPY_SPLIT:%.*]] -; OPT: memcpy-split: +; OPT: static-memcpy-post-expansion: ; OPT-NEXT: [[TMP6:%.*]] = getelementptr inbounds i8, ptr addrspace(5) [[SRC]], i32 1024 ; OPT-NEXT: [[TMP7:%.*]] = load i16, ptr addrspace(5) [[TMP6]], align 1 ; OPT-NEXT: [[TMP8:%.*]] = getelementptr inbounds i8, ptr addrspace(5) [[DST]], i32 1024 @@ -929,7 +929,7 @@ define amdgpu_kernel void @memcpy_private_align4_private_align1_1027(ptr addrspa define amdgpu_kernel void @memcpy_private_align2_private_align2_1027(ptr addrspace(5) %dst, ptr addrspace(5) %src) #0 { ; OPT-LABEL: @memcpy_private_align2_private_align2_1027( ; OPT-NEXT: br label [[LOAD_STORE_LOOP:%.*]] -; OPT: load-store-loop: +; OPT: static-memcpy-expansion-main-body: ; OPT-NEXT: [[LOOP_INDEX:%.*]] = phi i32 [ 0, [[TMP0:%.*]] ], [ [[TMP4:%.*]], [[LOAD_STORE_LOOP]] ] ; OPT-NEXT: [[TMP1:%.*]] = getelementptr inbounds i8, ptr addrspace(5) [[SRC:%.*]], i32 [[LOOP_INDEX]] ; OPT-NEXT: [[TMP2:%.*]] = load <64 x i32>, ptr addrspace(5) [[TMP1]], align 2 @@ -938,7 +938,7 @@ define amdgpu_kernel void @memcpy_private_align2_private_align2_1027(ptr addrspa ; OPT-NEXT: [[TMP4]] = add i32 [[LOOP_INDEX]], 256 ; OPT-NEXT: [[TMP5:%.*]] = icmp ult i32 [[TMP4]], 1024 ; OPT-NEXT: br i1 [[TMP5]], label [[LOAD_STORE_LOOP]], label 
[[MEMCPY_SPLIT:%.*]] -; OPT: memcpy-split: +; OPT: static-memcpy-post-expansion: ; OPT-NEXT: [[TMP9:%.*]] = getelementptr inbounds i8, ptr addrspace(5) [[SRC]], i32 1024 ; OPT-NEXT: [[TMP10:%.*]] = load i16, ptr addrspace(5) [[TMP9]], align 2 ; OPT-NEXT: [[TMP11:%.*]] = getelementptr inbounds i8, ptr addrspace(5) [[DST]], i32 1024 @@ -959,7 +959,7 @@ define amdgpu_kernel void @memcpy_global_align4_global_align4_variable(ptr addrs ; OPT-NEXT: [[TMP3:%.*]] = sub i64 [[N]], [[TMP2]] ; OPT-NEXT: [[TMP4:%.*]] = icmp ne i64 [[TMP3]], 0 ; OPT-NEXT: br i1 [[TMP4]], label [[LOOP_MEMCPY_EXPANSION:%.*]], label [[LOOP_MEMCPY_RESIDUAL_HEADER:%.*]] -; OPT: loop-memcpy-expansion: +; OPT: dynamic-memcpy-expansion-main-body: ; OPT-NEXT: [[LOOP_INDEX:%.*]] = phi i64 [ 0, [[TMP0:%.*]] ], [ [[TMP8:%.*]], [[LOOP_MEMCPY_EXPANSION]] ] ; OPT-NEXT: [[TMP5:%.*]] = getelementptr inbounds i8, ptr addrspace(1) [[SRC:%.*]], i64 [[LOOP_INDEX]] ; OPT-NEXT: [[TMP6:%.*]] = load <4 x i32>, ptr addrspace(1) [[TMP5]], align 4 @@ -968,8 +968,11 @@ define amdgpu_kernel void @memcpy_global_align4_global_align4_variable(ptr addrs ; OPT-NEXT: [[TMP8]] = add i64 [[LOOP_INDEX]], 16 ; OPT-NEXT: [[TMP9:%.*]] = icmp ult i64 [[TMP8]], [[TMP3]] ; OPT-NEXT: br i1 [[TMP9]], label [[LOOP_MEMCPY_EXPANSION]], label [[LOOP_MEMCPY_RESIDUAL_HEADER]] -; OPT: loop-memcpy-residual: -; OPT-NEXT: [[RESIDUAL_LOOP_INDEX:%.*]] = phi i64 [ 0, [[LOOP_MEMCPY_RESIDUAL_HEADER]] ], [ [[TMP14:%.*]], [[LOOP_MEMCPY_RESIDUAL:%.*]] ] +; OPT: dynamic-memcpy-expansion-residual-cond: +; OPT-NEXT: [[TMP16:%.*]] = icmp ne i64 [[TMP2]], 0 +; OPT-NEXT: br i1 [[TMP16]], label [[LOOP_MEMCPY_RESIDUAL:%.*]], label [[POST_LOOP_MEMCPY_EXPANSION:%.*]] +; OPT: dynamic-memcpy-expansion-residual-body: +; OPT-NEXT: [[RESIDUAL_LOOP_INDEX:%.*]] = phi i64 [ 0, [[LOOP_MEMCPY_RESIDUAL_HEADER]] ], [ [[TMP14:%.*]], [[LOOP_MEMCPY_RESIDUAL]] ] ; OPT-NEXT: [[TMP10:%.*]] = add i64 [[TMP3]], [[RESIDUAL_LOOP_INDEX]] ; OPT-NEXT: [[TMP11:%.*]] = getelementptr inbounds i8, 
ptr addrspace(1) [[SRC]], i64 [[TMP10]] ; OPT-NEXT: [[TMP12:%.*]] = load i8, ptr addrspace(1) [[TMP11]], align 1 @@ -977,12 +980,9 @@ define amdgpu_kernel void @memcpy_global_align4_global_align4_variable(ptr addrs ; OPT-NEXT: store i8 [[TMP12]], ptr addrspace(1) [[TMP13]], align 1 ; OPT-NEXT: [[TMP14]] = add i64 [[RESIDUAL_LOOP_INDEX]], 1 ; OPT-NEXT: [[TMP15:%.*]] = icmp ult i64 [[TMP14]], [[TMP2]] -; OPT-NEXT: br i1 [[TMP15]], label [[LOOP_MEMCPY_RESIDUAL]], label [[POST_LOOP_MEMCPY_EXPANSION:%.*]] -; OPT: post-loop-memcpy-expansion: +; OPT-NEXT: br i1 [[TMP15]], label [[LOOP_MEMCPY_RESIDUAL]], label [[POST_LOOP_MEMCPY_EXPANSION]] +; OPT: dynamic-memcpy-post-expansion: ; OPT-NEXT: ret void -; OPT: loop-memcpy-residual-header: -; OPT-NEXT: [[TMP16:%.*]] = icmp ne i64 [[TMP2]], 0 -; OPT-NEXT: br i1 [[TMP16]], label [[LOOP_MEMCPY_RESIDUAL]], label [[POST_LOOP_MEMCPY_EXPANSION]] ; call void @llvm.memcpy.p1.p1.i64(ptr addrspace(1) align 4 %dst, ptr addrspace(1) align 4 %src, i64 %n, i1 false) ret void @@ -994,7 +994,7 @@ define amdgpu_kernel void @memcpy_global_align2_global_align2_variable(ptr addrs ; OPT-NEXT: [[TMP3:%.*]] = sub i64 [[N]], [[TMP2]] ; OPT-NEXT: [[TMP4:%.*]] = icmp ne i64 [[TMP3]], 0 ; OPT-NEXT: br i1 [[TMP4]], label [[LOOP_MEMCPY_EXPANSION:%.*]], label [[LOOP_MEMCPY_RESIDUAL_HEADER:%.*]] -; OPT: loop-memcpy-expansion: +; OPT: dynamic-memcpy-expansion-main-body: ; OPT-NEXT: [[LOOP_INDEX:%.*]] = phi i64 [ 0, [[TMP0:%.*]] ], [ [[TMP8:%.*]], [[LOOP_MEMCPY_EXPANSION]] ] ; OPT-NEXT: [[TMP5:%.*]] = getelementptr inbounds i8, ptr addrspace(1) [[SRC:%.*]], i64 [[LOOP_INDEX]] ; OPT-NEXT: [[TMP6:%.*]] = load <4 x i32>, ptr addrspace(1) [[TMP5]], align 2 @@ -1003,8 +1003,11 @@ define amdgpu_kernel void @memcpy_global_align2_global_align2_variable(ptr addrs ; OPT-NEXT: [[TMP8]] = add i64 [[LOOP_INDEX]], 16 ; OPT-NEXT: [[TMP9:%.*]] = icmp ult i64 [[TMP8]], [[TMP3]] ; OPT-NEXT: br i1 [[TMP9]], label [[LOOP_MEMCPY_EXPANSION]], label [[LOOP_MEMCPY_RESIDUAL_HEADER]] 
-; OPT: loop-memcpy-residual: -; OPT-NEXT: [[RESIDUAL_LOOP_INDEX:%.*]] = phi i64 [ 0, [[LOOP_MEMCPY_RESIDUAL_HEADER]] ], [ [[TMP14:%.*]], [[LOOP_MEMCPY_RESIDUAL:%.*]] ] +; OPT: dynamic-memcpy-expansion-residual-cond: +; OPT-NEXT: [[TMP16:%.*]] = icmp ne i64 [[TMP2]], 0 +; OPT-NEXT: br i1 [[TMP16]], label [[LOOP_MEMCPY_RESIDUAL:%.*]], label [[POST_LOOP_MEMCPY_EXPANSION:%.*]] +; OPT: dynamic-memcpy-expansion-residual-body: +; OPT-NEXT: [[RESIDUAL_LOOP_INDEX:%.*]] = phi i64 [ 0, [[LOOP_MEMCPY_RESIDUAL_HEADER]] ], [ [[TMP14:%.*]], [[LOOP_MEMCPY_RESIDUAL]] ] ; OPT-NEXT: [[TMP10:%.*]] = add i64 [[TMP3]], [[RESIDUAL_LOOP_INDEX]] ; OPT-NEXT: [[TMP11:%.*]] = getelementptr inbounds i8, ptr addrspace(1) [[SRC]], i64 [[TMP10]] ; OPT-NEXT: [[TMP12:%.*]] = load i8, ptr addrspace(1) [[TMP11]], align 1 @@ -1012,12 +1015,9 @@ define amdgpu_kernel void @memcpy_global_align2_global_align2_variable(ptr addrs ; OPT-NEXT: store i8 [[TMP12]], ptr addrspace(1) [[TMP13]], align 1 ; OPT-NEXT: [[TMP14]] = add i64 [[RESIDUAL_LOOP_INDEX]], 1 ; OPT-NEXT: [[TMP15:%.*]] = icmp ult i64 [[TMP14]], [[TMP2]] -; OPT-NEXT: br i1 [[TMP15]], label [[LOOP_MEMCPY_RESIDUAL]], label [[POST_LOOP_MEMCPY_EXPANSION:%.*]] -; OPT: post-loop-memcpy-expansion: +; OPT-NEXT: br i1 [[TMP15]], label [[LOOP_MEMCPY_RESIDUAL]], label [[POST_LOOP_MEMCPY_EXPANSION]] +; OPT: dynamic-memcpy-post-expansion: ; OPT-NEXT: ret void -; OPT: loop-memcpy-residual-header: -; OPT-NEXT: [[TMP16:%.*]] = icmp ne i64 [[TMP2]], 0 -; OPT-NEXT: br i1 [[TMP16]], label [[LOOP_MEMCPY_RESIDUAL]], label [[POST_LOOP_MEMCPY_EXPANSION]] ; call void @llvm.memcpy.p1.p1.i64(ptr addrspace(1) align 2 %dst, ptr addrspace(1) align 2 %src, i64 %n, i1 false) ret void @@ -1029,7 +1029,7 @@ define amdgpu_kernel void @memcpy_global_align1_global_align1_variable(ptr addrs ; OPT-NEXT: [[TMP3:%.*]] = sub i64 [[N]], [[TMP2]] ; OPT-NEXT: [[TMP4:%.*]] = icmp ne i64 [[TMP3]], 0 ; OPT-NEXT: br i1 [[TMP4]], label [[LOOP_MEMCPY_EXPANSION:%.*]], label 
[[LOOP_MEMCPY_RESIDUAL_HEADER:%.*]] -; OPT: loop-memcpy-expansion: +; OPT: dynamic-memcpy-expansion-main-body: ; OPT-NEXT: [[LOOP_INDEX:%.*]] = phi i64 [ 0, [[TMP0:%.*]] ], [ [[TMP8:%.*]], [[LOOP_MEMCPY_EXPANSION]] ] ; OPT-NEXT: [[TMP5:%.*]] = getelementptr inbounds i8, ptr addrspace(1) [[SRC:%.*]], i64 [[LOOP_INDEX]] ; OPT-NEXT: [[TMP6:%.*]] = load <4 x i32>, ptr addrspace(1) [[TMP5]], align 1 @@ -1038,8 +1038,11 @@ define amdgpu_kernel void @memcpy_global_align1_global_align1_variable(ptr addrs ; OPT-NEXT: [[TMP8]] = add i64 [[LOOP_INDEX]], 16 ; OPT-NEXT: [[TMP9:%.*]] = icmp ult i64 [[TMP8]], [[TMP3]] ; OPT-NEXT: br i1 [[TMP9]], label [[LOOP_MEMCPY_EXPANSION]], label [[LOOP_MEMCPY_RESIDUAL_HEADER]] -; OPT: loop-memcpy-residual: -; OPT-NEXT: [[RESIDUAL_LOOP_INDEX:%.*]] = phi i64 [ 0, [[LOOP_MEMCPY_RESIDUAL_HEADER]] ], [ [[TMP14:%.*]], [[LOOP_MEMCPY_RESIDUAL:%.*]] ] +; OPT: dynamic-memcpy-expansion-residual-cond: +; OPT-NEXT: [[TMP16:%.*]] = icmp ne i64 [[TMP2]], 0 +; OPT-NEXT: br i1 [[TMP16]], label [[LOOP_MEMCPY_RESIDUAL:%.*]], label [[POST_LOOP_MEMCPY_EXPANSION:%.*]] +; OPT: dynamic-memcpy-expansion-residual-body: +; OPT-NEXT: [[RESIDUAL_LOOP_INDEX:%.*]] = phi i64 [ 0, [[LOOP_MEMCPY_RESIDUAL_HEADER]] ], [ [[TMP14:%.*]], [[LOOP_MEMCPY_RESIDUAL]] ] ; OPT-NEXT: [[TMP10:%.*]] = add i64 [[TMP3]], [[RESIDUAL_LOOP_INDEX]] ; OPT-NEXT: [[TMP11:%.*]] = getelementptr inbounds i8, ptr addrspace(1) [[SRC]], i64 [[TMP10]] ; OPT-NEXT: [[TMP12:%.*]] = load i8, ptr addrspace(1) [[TMP11]], align 1 @@ -1047,12 +1050,9 @@ define amdgpu_kernel void @memcpy_global_align1_global_align1_variable(ptr addrs ; OPT-NEXT: store i8 [[TMP12]], ptr addrspace(1) [[TMP13]], align 1 ; OPT-NEXT: [[TMP14]] = add i64 [[RESIDUAL_LOOP_INDEX]], 1 ; OPT-NEXT: [[TMP15:%.*]] = icmp ult i64 [[TMP14]], [[TMP2]] -; OPT-NEXT: br i1 [[TMP15]], label [[LOOP_MEMCPY_RESIDUAL]], label [[POST_LOOP_MEMCPY_EXPANSION:%.*]] -; OPT: post-loop-memcpy-expansion: +; OPT-NEXT: br i1 [[TMP15]], label 
[[LOOP_MEMCPY_RESIDUAL]], label [[POST_LOOP_MEMCPY_EXPANSION]] +; OPT: dynamic-memcpy-post-expansion: ; OPT-NEXT: ret void -; OPT: loop-memcpy-residual-header: -; OPT-NEXT: [[TMP16:%.*]] = icmp ne i64 [[TMP2]], 0 -; OPT-NEXT: br i1 [[TMP16]], label [[LOOP_MEMCPY_RESIDUAL]], label [[POST_LOOP_MEMCPY_EXPANSION]] ; call void @llvm.memcpy.p1.p1.i64(ptr addrspace(1) align 1 %dst, ptr addrspace(1) align 1 %src, i64 %n, i1 false) ret void @@ -1064,7 +1064,7 @@ define amdgpu_kernel void @memcpy_local_align4_local_align4_variable(ptr addrspa ; OPT-NEXT: [[TMP3:%.*]] = sub i32 [[N]], [[TMP2]] ; OPT-NEXT: [[TMP4:%.*]] = icmp ne i32 [[TMP3]], 0 ; OPT-NEXT: br i1 [[TMP4]], label [[LOOP_MEMCPY_EXPANSION:%.*]], label [[LOOP_MEMCPY_RESIDUAL_HEADER:%.*]] -; OPT: loop-memcpy-expansion: +; OPT: dynamic-memcpy-expansion-main-body: ; OPT-NEXT: [[LOOP_INDEX:%.*]] = phi i32 [ 0, [[TMP0:%.*]] ], [ [[TMP8:%.*]], [[LOOP_MEMCPY_EXPANSION]] ] ; OPT-NEXT: [[TMP5:%.*]] = getelementptr inbounds i8, ptr addrspace(3) [[SRC:%.*]], i32 [[LOOP_INDEX]] ; OPT-NEXT: [[TMP6:%.*]] = load <4 x i32>, ptr addrspace(3) [[TMP5]], align 4 @@ -1073,8 +1073,11 @@ define amdgpu_kernel void @memcpy_local_align4_local_align4_variable(ptr addrspa ; OPT-NEXT: [[TMP8]] = add i32 [[LOOP_INDEX]], 16 ; OPT-NEXT: [[TMP9:%.*]] = icmp ult i32 [[TMP8]], [[TMP3]] ; OPT-NEXT: br i1 [[TMP9]], label [[LOOP_MEMCPY_EXPANSION]], label [[LOOP_MEMCPY_RESIDUAL_HEADER]] -; OPT: loop-memcpy-residual: -; OPT-NEXT: [[RESIDUAL_LOOP_INDEX:%.*]] = phi i32 [ 0, [[LOOP_MEMCPY_RESIDUAL_HEADER]] ], [ [[TMP14:%.*]], [[LOOP_MEMCPY_RESIDUAL:%.*]] ] +; OPT: dynamic-memcpy-expansion-residual-cond: +; OPT-NEXT: [[TMP16:%.*]] = icmp ne i32 [[TMP2]], 0 +; OPT-NEXT: br i1 [[TMP16]], label [[LOOP_MEMCPY_RESIDUAL:%.*]], label [[POST_LOOP_MEMCPY_EXPANSION:%.*]] +; OPT: dynamic-memcpy-expansion-residual-body: +; OPT-NEXT: [[RESIDUAL_LOOP_INDEX:%.*]] = phi i32 [ 0, [[LOOP_MEMCPY_RESIDUAL_HEADER]] ], [ [[TMP14:%.*]], [[LOOP_MEMCPY_RESIDUAL]] ] ; OPT-NEXT: 
[[TMP10:%.*]] = add i32 [[TMP3]], [[RESIDUAL_LOOP_INDEX]] ; OPT-NEXT: [[TMP11:%.*]] = getelementptr inbounds i8, ptr addrspace(3) [[SRC]], i32 [[TMP10]] ; OPT-NEXT: [[TMP12:%.*]] = load i8, ptr addrspace(3) [[TMP11]], align 1 @@ -1082,12 +1085,9 @@ define amdgpu_kernel void @memcpy_local_align4_local_align4_variable(ptr addrspa ; OPT-NEXT: store i8 [[TMP12]], ptr addrspace(3) [[TMP13]], align 1 ; OPT-NEXT: [[TMP14]] = add i32 [[RESIDUAL_LOOP_INDEX]], 1 ; OPT-NEXT: [[TMP15:%.*]] = icmp ult i32 [[TMP14]], [[TMP2]] -; OPT-NEXT: br i1 [[TMP15]], label [[LOOP_MEMCPY_RESIDUAL]], label [[POST_LOOP_MEMCPY_EXPANSION:%.*]] -; OPT: post-loop-memcpy-expansion: +; OPT-NEXT: br i1 [[TMP15]], label [[LOOP_MEMCPY_RESIDUAL]], label [[POST_LOOP_MEMCPY_EXPANSION]] +; OPT: dynamic-memcpy-post-expansion: ; OPT-NEXT: ret void -; OPT: loop-memcpy-residual-header: -; OPT-NEXT: [[TMP16:%.*]] = icmp ne i32 [[TMP2]], 0 -; OPT-NEXT: br i1 [[TMP16]], label [[LOOP_MEMCPY_RESIDUAL]], label [[POST_LOOP_MEMCPY_EXPANSION]] ; call void @llvm.memcpy.p3.p3.i32(ptr addrspace(3) align 4 %dst, ptr addrspace(3) align 4 %src, i32 %n, i1 false) ret void @@ -1099,7 +1099,7 @@ define amdgpu_kernel void @memcpy_local_align2_local_align2_variable(ptr addrspa ; OPT-NEXT: [[TMP3:%.*]] = sub i32 [[N]], [[TMP2]] ; OPT-NEXT: [[TMP4:%.*]] = icmp ne i32 [[TMP3]], 0 ; OPT-NEXT: br i1 [[TMP4]], label [[LOOP_MEMCPY_EXPANSION:%.*]], label [[LOOP_MEMCPY_RESIDUAL_HEADER:%.*]] -; OPT: loop-memcpy-expansion: +; OPT: dynamic-memcpy-expansion-main-body: ; OPT-NEXT: [[LOOP_INDEX:%.*]] = phi i32 [ 0, [[TMP0:%.*]] ], [ [[TMP8:%.*]], [[LOOP_MEMCPY_EXPANSION]] ] ; OPT-NEXT: [[TMP5:%.*]] = getelementptr inbounds i8, ptr addrspace(3) [[SRC:%.*]], i32 [[LOOP_INDEX]] ; OPT-NEXT: [[TMP6:%.*]] = load <4 x i32>, ptr addrspace(3) [[TMP5]], align 2 @@ -1108,8 +1108,11 @@ define amdgpu_kernel void @memcpy_local_align2_local_align2_variable(ptr addrspa ; OPT-NEXT: [[TMP8]] = add i32 [[LOOP_INDEX]], 16 ; OPT-NEXT: [[TMP9:%.*]] = icmp ult i32 
[[TMP8]], [[TMP3]] ; OPT-NEXT: br i1 [[TMP9]], label [[LOOP_MEMCPY_EXPANSION]], label [[LOOP_MEMCPY_RESIDUAL_HEADER]] -; OPT: loop-memcpy-residual: -; OPT-NEXT: [[RESIDUAL_LOOP_INDEX:%.*]] = phi i32 [ 0, [[LOOP_MEMCPY_RESIDUAL_HEADER]] ], [ [[TMP14:%.*]], [[LOOP_MEMCPY_RESIDUAL:%.*]] ] +; OPT: dynamic-memcpy-expansion-residual-cond: +; OPT-NEXT: [[TMP16:%.*]] = icmp ne i32 [[TMP2]], 0 +; OPT-NEXT: br i1 [[TMP16]], label [[LOOP_MEMCPY_RESIDUAL:%.*]], label [[POST_LOOP_MEMCPY_EXPANSION:%.*]] +; OPT: dynamic-memcpy-expansion-residual-body: +; OPT-NEXT: [[RESIDUAL_LOOP_INDEX:%.*]] = phi i32 [ 0, [[LOOP_MEMCPY_RESIDUAL_HEADER]] ], [ [[TMP14:%.*]], [[LOOP_MEMCPY_RESIDUAL]] ] ; OPT-NEXT: [[TMP10:%.*]] = add i32 [[TMP3]], [[RESIDUAL_LOOP_INDEX]] ; OPT-NEXT: [[TMP11:%.*]] = getelementptr inbounds i8, ptr addrspace(3) [[SRC]], i32 [[TMP10]] ; OPT-NEXT: [[TMP12:%.*]] = load i8, ptr addrspace(3) [[TMP11]], align 1 @@ -1117,12 +1120,9 @@ define amdgpu_kernel void @memcpy_local_align2_local_align2_variable(ptr addrspa ; OPT-NEXT: store i8 [[TMP12]], ptr addrspace(3) [[TMP13]], align 1 ; OPT-NEXT: [[TMP14]] = add i32 [[RESIDUAL_LOOP_INDEX]], 1 ; OPT-NEXT: [[TMP15:%.*]] = icmp ult i32 [[TMP14]], [[TMP2]] -; OPT-NEXT: br i1 [[TMP15]], label [[LOOP_MEMCPY_RESIDUAL]], label [[POST_LOOP_MEMCPY_EXPANSION:%.*]] -; OPT: post-loop-memcpy-expansion: +; OPT-NEXT: br i1 [[TMP15]], label [[LOOP_MEMCPY_RESIDUAL]], label [[POST_LOOP_MEMCPY_EXPANSION]] +; OPT: dynamic-memcpy-post-expansion: ; OPT-NEXT: ret void -; OPT: loop-memcpy-residual-header: -; OPT-NEXT: [[TMP16:%.*]] = icmp ne i32 [[TMP2]], 0 -; OPT-NEXT: br i1 [[TMP16]], label [[LOOP_MEMCPY_RESIDUAL]], label [[POST_LOOP_MEMCPY_EXPANSION]] ; call void @llvm.memcpy.p3.p3.i32(ptr addrspace(3) align 2 %dst, ptr addrspace(3) align 2 %src, i32 %n, i1 false) ret void @@ -1134,7 +1134,7 @@ define amdgpu_kernel void @memcpy_local_align1_local_align1_variable(ptr addrspa ; OPT-NEXT: [[TMP3:%.*]] = sub i32 [[N]], [[TMP2]] ; OPT-NEXT: [[TMP4:%.*]] = 
icmp ne i32 [[TMP3]], 0 ; OPT-NEXT: br i1 [[TMP4]], label [[LOOP_MEMCPY_EXPANSION:%.*]], label [[LOOP_MEMCPY_RESIDUAL_HEADER:%.*]] -; OPT: loop-memcpy-expansion: +; OPT: dynamic-memcpy-expansion-main-body: ; OPT-NEXT: [[LOOP_INDEX:%.*]] = phi i32 [ 0, [[TMP0:%.*]] ], [ [[TMP8:%.*]], [[LOOP_MEMCPY_EXPANSION]] ] ; OPT-NEXT: [[TMP5:%.*]] = getelementptr inbounds i8, ptr addrspace(3) [[SRC:%.*]], i32 [[LOOP_INDEX]] ; OPT-NEXT: [[TMP6:%.*]] = load <4 x i32>, ptr addrspace(3) [[TMP5]], align 1 @@ -1143,8 +1143,11 @@ define amdgpu_kernel void @memcpy_local_align1_local_align1_variable(ptr addrspa ; OPT-NEXT: [[TMP8]] = add i32 [[LOOP_INDEX]], 16 ; OPT-NEXT: [[TMP9:%.*]] = icmp ult i32 [[TMP8]], [[TMP3]] ; OPT-NEXT: br i1 [[TMP9]], label [[LOOP_MEMCPY_EXPANSION]], label [[LOOP_MEMCPY_RESIDUAL_HEADER]] -; OPT: loop-memcpy-residual: -; OPT-NEXT: [[RESIDUAL_LOOP_INDEX:%.*]] = phi i32 [ 0, [[LOOP_MEMCPY_RESIDUAL_HEADER]] ], [ [[TMP14:%.*]], [[LOOP_MEMCPY_RESIDUAL:%.*]] ] +; OPT: dynamic-memcpy-expansion-residual-cond: +; OPT-NEXT: [[TMP16:%.*]] = icmp ne i32 [[TMP2]], 0 +; OPT-NEXT: br i1 [[TMP16]], label [[LOOP_MEMCPY_RESIDUAL:%.*]], label [[POST_LOOP_MEMCPY_EXPANSION:%.*]] +; OPT: dynamic-memcpy-expansion-residual-body: +; OPT-NEXT: [[RESIDUAL_LOOP_INDEX:%.*]] = phi i32 [ 0, [[LOOP_MEMCPY_RESIDUAL_HEADER]] ], [ [[TMP14:%.*]], [[LOOP_MEMCPY_RESIDUAL]] ] ; OPT-NEXT: [[TMP10:%.*]] = add i32 [[TMP3]], [[RESIDUAL_LOOP_INDEX]] ; OPT-NEXT: [[TMP11:%.*]] = getelementptr inbounds i8, ptr addrspace(3) [[SRC]], i32 [[TMP10]] ; OPT-NEXT: [[TMP12:%.*]] = load i8, ptr addrspace(3) [[TMP11]], align 1 @@ -1152,12 +1155,9 @@ define amdgpu_kernel void @memcpy_local_align1_local_align1_variable(ptr addrspa ; OPT-NEXT: store i8 [[TMP12]], ptr addrspace(3) [[TMP13]], align 1 ; OPT-NEXT: [[TMP14]] = add i32 [[RESIDUAL_LOOP_INDEX]], 1 ; OPT-NEXT: [[TMP15:%.*]] = icmp ult i32 [[TMP14]], [[TMP2]] -; OPT-NEXT: br i1 [[TMP15]], label [[LOOP_MEMCPY_RESIDUAL]], label [[POST_LOOP_MEMCPY_EXPANSION:%.*]] 
-; OPT: post-loop-memcpy-expansion: +; OPT-NEXT: br i1 [[TMP15]], label [[LOOP_MEMCPY_RESIDUAL]], label [[POST_LOOP_MEMCPY_EXPANSION]] +; OPT: dynamic-memcpy-post-expansion: ; OPT-NEXT: ret void -; OPT: loop-memcpy-residual-header: -; OPT-NEXT: [[TMP16:%.*]] = icmp ne i32 [[TMP2]], 0 -; OPT-NEXT: br i1 [[TMP16]], label [[LOOP_MEMCPY_RESIDUAL]], label [[POST_LOOP_MEMCPY_EXPANSION]] ; call void @llvm.memcpy.p3.p3.i32(ptr addrspace(3) align 1 %dst, ptr addrspace(3) align 1 %src, i32 %n, i1 false) ret void @@ -1169,7 +1169,7 @@ define amdgpu_kernel void @memcpy_local_align4_global_align4_variable(ptr addrsp ; OPT-NEXT: [[TMP3:%.*]] = sub i32 [[N]], [[TMP2]] ; OPT-NEXT: [[TMP4:%.*]] = icmp ne i32 [[TMP3]], 0 ; OPT-NEXT: br i1 [[TMP4]], label [[LOOP_MEMCPY_EXPANSION:%.*]], label [[LOOP_MEMCPY_RESIDUAL_HEADER:%.*]] -; OPT: loop-memcpy-expansion: +; OPT: dynamic-memcpy-expansion-main-body: ; OPT-NEXT: [[LOOP_INDEX:%.*]] = phi i32 [ 0, [[TMP0:%.*]] ], [ [[TMP8:%.*]], [[LOOP_MEMCPY_EXPANSION]] ] ; OPT-NEXT: [[TMP5:%.*]] = getelementptr inbounds i8, ptr addrspace(1) [[SRC:%.*]], i32 [[LOOP_INDEX]] ; OPT-NEXT: [[TMP6:%.*]] = load <4 x i32>, ptr addrspace(1) [[TMP5]], align 4 @@ -1178,8 +1178,11 @@ define amdgpu_kernel void @memcpy_local_align4_global_align4_variable(ptr addrsp ; OPT-NEXT: [[TMP8]] = add i32 [[LOOP_INDEX]], 16 ; OPT-NEXT: [[TMP9:%.*]] = icmp ult i32 [[TMP8]], [[TMP3]] ; OPT-NEXT: br i1 [[TMP9]], label [[LOOP_MEMCPY_EXPANSION]], label [[LOOP_MEMCPY_RESIDUAL_HEADER]] -; OPT: loop-memcpy-residual: -; OPT-NEXT: [[RESIDUAL_LOOP_INDEX:%.*]] = phi i32 [ 0, [[LOOP_MEMCPY_RESIDUAL_HEADER]] ], [ [[TMP14:%.*]], [[LOOP_MEMCPY_RESIDUAL:%.*]] ] +; OPT: dynamic-memcpy-expansion-residual-cond: +; OPT-NEXT: [[TMP16:%.*]] = icmp ne i32 [[TMP2]], 0 +; OPT-NEXT: br i1 [[TMP16]], label [[LOOP_MEMCPY_RESIDUAL:%.*]], label [[POST_LOOP_MEMCPY_EXPANSION:%.*]] +; OPT: dynamic-memcpy-expansion-residual-body: +; OPT-NEXT: [[RESIDUAL_LOOP_INDEX:%.*]] = phi i32 [ 0, 
[[LOOP_MEMCPY_RESIDUAL_HEADER]] ], [ [[TMP14:%.*]], [[LOOP_MEMCPY_RESIDUAL]] ] ; OPT-NEXT: [[TMP10:%.*]] = add i32 [[TMP3]], [[RESIDUAL_LOOP_INDEX]] ; OPT-NEXT: [[TMP11:%.*]] = getelementptr inbounds i8, ptr addrspace(1) [[SRC]], i32 [[TMP10]] ; OPT-NEXT: [[TMP12:%.*]] = load i8, ptr addrspace(1) [[TMP11]], align 1 @@ -1187,12 +1190,9 @@ define amdgpu_kernel void @memcpy_local_align4_global_align4_variable(ptr addrsp ; OPT-NEXT: store i8 [[TMP12]], ptr addrspace(3) [[TMP13]], align 1 ; OPT-NEXT: [[TMP14]] = add i32 [[RESIDUAL_LOOP_INDEX]], 1 ; OPT-NEXT: [[TMP15:%.*]] = icmp ult i32 [[TMP14]], [[TMP2]] -; OPT-NEXT: br i1 [[TMP15]], label [[LOOP_MEMCPY_RESIDUAL]], label [[POST_LOOP_MEMCPY_EXPANSION:%.*]] -; OPT: post-loop-memcpy-expansion: +; OPT-NEXT: br i1 [[TMP15]], label [[LOOP_MEMCPY_RESIDUAL]], label [[POST_LOOP_MEMCPY_EXPANSION]] +; OPT: dynamic-memcpy-post-expansion: ; OPT-NEXT: ret void -; OPT: loop-memcpy-residual-header: -; OPT-NEXT: [[TMP16:%.*]] = icmp ne i32 [[TMP2]], 0 -; OPT-NEXT: br i1 [[TMP16]], label [[LOOP_MEMCPY_RESIDUAL]], label [[POST_LOOP_MEMCPY_EXPANSION]] ; call void @llvm.memcpy.p3.p1.i32(ptr addrspace(3) align 4 %dst, ptr addrspace(1) align 4 %src, i32 %n, i1 false) ret void @@ -1204,7 +1204,7 @@ define amdgpu_kernel void @memcpy_global_align4_local_align4_variable(ptr addrsp ; OPT-NEXT: [[TMP3:%.*]] = sub i32 [[N]], [[TMP2]] ; OPT-NEXT: [[TMP4:%.*]] = icmp ne i32 [[TMP3]], 0 ; OPT-NEXT: br i1 [[TMP4]], label [[LOOP_MEMCPY_EXPANSION:%.*]], label [[LOOP_MEMCPY_RESIDUAL_HEADER:%.*]] -; OPT: loop-memcpy-expansion: +; OPT: dynamic-memcpy-expansion-main-body: ; OPT-NEXT: [[LOOP_INDEX:%.*]] = phi i32 [ 0, [[TMP0:%.*]] ], [ [[TMP8:%.*]], [[LOOP_MEMCPY_EXPANSION]] ] ; OPT-NEXT: [[TMP5:%.*]] = getelementptr inbounds i8, ptr addrspace(3) [[SRC:%.*]], i32 [[LOOP_INDEX]] ; OPT-NEXT: [[TMP6:%.*]] = load <4 x i32>, ptr addrspace(3) [[TMP5]], align 4 @@ -1213,8 +1213,11 @@ define amdgpu_kernel void @memcpy_global_align4_local_align4_variable(ptr addrsp ; 
OPT-NEXT: [[TMP8]] = add i32 [[LOOP_INDEX]], 16 ; OPT-NEXT: [[TMP9:%.*]] = icmp ult i32 [[TMP8]], [[TMP3]] ; OPT-NEXT: br i1 [[TMP9]], label [[LOOP_MEMCPY_EXPANSION]], label [[LOOP_MEMCPY_RESIDUAL_HEADER]] -; OPT: loop-memcpy-residual: -; OPT-NEXT: [[RESIDUAL_LOOP_INDEX:%.*]] = phi i32 [ 0, [[LOOP_MEMCPY_RESIDUAL_HEADER]] ], [ [[TMP14:%.*]], [[LOOP_MEMCPY_RESIDUAL:%.*]] ] +; OPT: dynamic-memcpy-expansion-residual-cond: +; OPT-NEXT: [[TMP16:%.*]] = icmp ne i32 [[TMP2]], 0 +; OPT-NEXT: br i1 [[TMP16]], label [[LOOP_MEMCPY_RESIDUAL:%.*]], label [[POST_LOOP_MEMCPY_EXPANSION:%.*]] +; OPT: dynamic-memcpy-expansion-residual-body: +; OPT-NEXT: [[RESIDUAL_LOOP_INDEX:%.*]] = phi i32 [ 0, [[LOOP_MEMCPY_RESIDUAL_HEADER]] ], [ [[TMP14:%.*]], [[LOOP_MEMCPY_RESIDUAL]] ] ; OPT-NEXT: [[TMP10:%.*]] = add i32 [[TMP3]], [[RESIDUAL_LOOP_INDEX]] ; OPT-NEXT: [[TMP11:%.*]] = getelementptr inbounds i8, ptr addrspace(3) [[SRC]], i32 [[TMP10]] ; OPT-NEXT: [[TMP12:%.*]] = load i8, ptr addrspace(3) [[TMP11]], align 1 @@ -1222,12 +1225,9 @@ define amdgpu_kernel void @memcpy_global_align4_local_align4_variable(ptr addrsp ; OPT-NEXT: store i8 [[TMP12]], ptr addrspace(1) [[TMP13]], align 1 ; OPT-NEXT: [[TMP14]] = add i32 [[RESIDUAL_LOOP_INDEX]], 1 ; OPT-NEXT: [[TMP15:%.*]] = icmp ult i32 [[TMP14]], [[TMP2]] -; OPT-NEXT: br i1 [[TMP15]], label [[LOOP_MEMCPY_RESIDUAL]], label [[POST_LOOP_MEMCPY_EXPANSION:%.*]] -; OPT: post-loop-memcpy-expansion: +; OPT-NEXT: br i1 [[TMP15]], label [[LOOP_MEMCPY_RESIDUAL]], label [[POST_LOOP_MEMCPY_EXPANSION]] +; OPT: dynamic-memcpy-post-expansion: ; OPT-NEXT: ret void -; OPT: loop-memcpy-residual-header: -; OPT-NEXT: [[TMP16:%.*]] = icmp ne i32 [[TMP2]], 0 -; OPT-NEXT: br i1 [[TMP16]], label [[LOOP_MEMCPY_RESIDUAL]], label [[POST_LOOP_MEMCPY_EXPANSION]] ; call void @llvm.memcpy.p1.p3.i32(ptr addrspace(1) align 4 %dst, ptr addrspace(3) align 4 %src, i32 %n, i1 false) ret void @@ -1496,7 +1496,7 @@ define amdgpu_kernel void @memmove_private_align1_global_align1(ptr 
addrspace(5) ; ; ALL-LABEL: @memmove_private_align1_global_align1( ; ALL-NEXT: br label [[LOAD_STORE_LOOP:%.*]] -; ALL: load-store-loop: +; ALL: static-memcpy-expansion-main-body: ; ALL-NEXT: [[LOOP_INDEX:%.*]] = phi i64 [ 0, [[TMP0:%.*]] ], [ [[TMP4:%.*]], [[LOAD_STORE_LOOP]] ] ; ALL-NEXT: [[TMP1:%.*]] = getelementptr inbounds i8, ptr addrspace(1) [[SRC:%.*]], i64 [[LOOP_INDEX]] ; ALL-NEXT: [[TMP2:%.*]] = load <64 x i32>, ptr addrspace(1) [[TMP1]], align 1, !alias.scope [[META0:![0-9]+]] @@ -1505,7 +1505,7 @@ define amdgpu_kernel void @memmove_private_align1_global_align1(ptr addrspace(5) ; ALL-NEXT: [[TMP4]] = add i64 [[LOOP_INDEX]], 256 ; ALL-NEXT: [[TMP5:%.*]] = icmp ult i64 [[TMP4]], 256 ; ALL-NEXT: br i1 [[TMP5]], label [[LOAD_STORE_LOOP]], label [[MEMCPY_SPLIT:%.*]] -; ALL: memcpy-split: +; ALL: static-memcpy-post-expansion: ; ALL-NEXT: ret void ; call void @llvm.memmove.p5.p1.i64(ptr addrspace(5) %dst, ptr addrspace(1) %src, i64 256, i1 false) @@ -1519,7 +1519,7 @@ define amdgpu_kernel void @memmove_global_align1_private_align1(ptr addrspace(1) ; ; ALL-LABEL: @memmove_global_align1_private_align1( ; ALL-NEXT: br label [[LOAD_STORE_LOOP:%.*]] -; ALL: load-store-loop: +; ALL: static-memcpy-expansion-main-body: ; ALL-NEXT: [[LOOP_INDEX:%.*]] = phi i64 [ 0, [[TMP0:%.*]] ], [ [[TMP4:%.*]], [[LOAD_STORE_LOOP]] ] ; ALL-NEXT: [[TMP1:%.*]] = getelementptr inbounds i8, ptr addrspace(5) [[SRC:%.*]], i64 [[LOOP_INDEX]] ; ALL-NEXT: [[TMP2:%.*]] = load <64 x i32>, ptr addrspace(5) [[TMP1]], align 1, !alias.scope [[META3:![0-9]+]] @@ -1528,7 +1528,7 @@ define amdgpu_kernel void @memmove_global_align1_private_align1(ptr addrspace(1) ; ALL-NEXT: [[TMP4]] = add i64 [[LOOP_INDEX]], 256 ; ALL-NEXT: [[TMP5:%.*]] = icmp ult i64 [[TMP4]], 256 ; ALL-NEXT: br i1 [[TMP5]], label [[LOAD_STORE_LOOP]], label [[MEMCPY_SPLIT:%.*]] -; ALL: memcpy-split: +; ALL: static-memcpy-post-expansion: ; ALL-NEXT: ret void ; call void @llvm.memmove.p1.p5.i64(ptr addrspace(1) %dst, ptr addrspace(5) 
%src, i64 256, i1 false) @@ -1722,7 +1722,7 @@ define amdgpu_kernel void @memmove_local_align1_private_align1(ptr addrspace(3) ; ; ALL-LABEL: @memmove_local_align1_private_align1( ; ALL-NEXT: br label [[LOAD_STORE_LOOP:%.*]] -; ALL: load-store-loop: +; ALL: static-memcpy-expansion-main-body: ; ALL-NEXT: [[LOOP_INDEX:%.*]] = phi i32 [ 0, [[TMP0:%.*]] ], [ [[TMP4:%.*]], [[LOAD_STORE_LOOP]] ] ; ALL-NEXT: [[TMP1:%.*]] = getelementptr inbounds i8, ptr addrspace(5) [[SRC:%.*]], i32 [[LOOP_INDEX]] ; ALL-NEXT: [[TMP2:%.*]] = load <64 x i32>, ptr addrspace(5) [[TMP1]], align 1, !alias.scope [[META6:![0-9]+]] @@ -1731,7 +1731,7 @@ define amdgpu_kernel void @memmove_local_align1_private_align1(ptr addrspace(3) ; ALL-NEXT: [[TMP4]] = add i32 [[LOOP_INDEX]], 256 ; ALL-NEXT: [[TMP5:%.*]] = icmp ult i32 [[TMP4]], 256 ; ALL-NEXT: br i1 [[TMP5]], label [[LOAD_STORE_LOOP]], label [[MEMCPY_SPLIT:%.*]] -; ALL: memcpy-split: +; ALL: static-memcpy-post-expansion: ; ALL-NEXT: ret void ; call void @llvm.memmove.p3.p5.i32(ptr addrspace(3) %dst, ptr addrspace(5) %src, i32 256, i1 false) @@ -1744,7 +1744,7 @@ define amdgpu_kernel void @memmove_local_align1_private_align1_unknown_size(ptr ; MAX1024-NEXT: [[TMP3:%.*]] = sub i32 [[SIZE]], [[TMP2]] ; MAX1024-NEXT: [[TMP4:%.*]] = icmp ne i32 [[TMP3]], 0 ; MAX1024-NEXT: br i1 [[TMP4]], label [[LOOP_MEMCPY_EXPANSION:%.*]], label [[LOOP_MEMCPY_RESIDUAL_HEADER:%.*]] -; MAX1024: loop-memcpy-expansion: +; MAX1024: dynamic-memcpy-expansion-main-body: ; MAX1024-NEXT: [[LOOP_INDEX:%.*]] = phi i32 [ 0, [[TMP0:%.*]] ], [ [[TMP8:%.*]], [[LOOP_MEMCPY_EXPANSION]] ] ; MAX1024-NEXT: [[TMP7:%.*]] = getelementptr inbounds i8, ptr addrspace(5) [[SRC:%.*]], i32 [[LOOP_INDEX]] ; MAX1024-NEXT: [[TMP5:%.*]] = load <4 x i32>, ptr addrspace(5) [[TMP7]], align 1, !alias.scope [[META0:![0-9]+]] @@ -1753,8 +1753,11 @@ define amdgpu_kernel void @memmove_local_align1_private_align1_unknown_size(ptr ; MAX1024-NEXT: [[TMP8]] = add i32 [[LOOP_INDEX]], 16 ; MAX1024-NEXT: 
[[TMP9:%.*]] = icmp ult i32 [[TMP8]], [[TMP3]] ; MAX1024-NEXT: br i1 [[TMP9]], label [[LOOP_MEMCPY_EXPANSION]], label [[LOOP_MEMCPY_RESIDUAL_HEADER]] -; MAX1024: loop-memcpy-residual: -; MAX1024-NEXT: [[RESIDUAL_LOOP_INDEX:%.*]] = phi i32 [ 0, [[LOOP_MEMCPY_RESIDUAL_HEADER]] ], [ [[TMP14:%.*]], [[LOOP_MEMCPY_RESIDUAL:%.*]] ] +; MAX1024: dynamic-memcpy-expansion-residual-cond: +; MAX1024-NEXT: [[TMP16:%.*]] = icmp ne i32 [[TMP2]], 0 +; MAX1024-NEXT: br i1 [[TMP16]], label [[LOOP_MEMCPY_RESIDUAL:%.*]], label [[POST_LOOP_MEMCPY_EXPANSION:%.*]] +; MAX1024: dynamic-memcpy-expansion-residual-body: +; MAX1024-NEXT: [[RESIDUAL_LOOP_INDEX:%.*]] = phi i32 [ 0, [[LOOP_MEMCPY_RESIDUAL_HEADER]] ], [ [[TMP14:%.*]], [[LOOP_MEMCPY_RESIDUAL]] ] ; MAX1024-NEXT: [[TMP10:%.*]] = add i32 [[TMP3]], [[RESIDUAL_LOOP_INDEX]] ; MAX1024-NEXT: [[TMP11:%.*]] = getelementptr inbounds i8, ptr addrspace(5) [[SRC]], i32 [[TMP10]] ; MAX1024-NEXT: [[TMP12:%.*]] = load i8, ptr addrspace(5) [[TMP11]], align 1, !alias.scope [[META0]] @@ -1762,19 +1765,16 @@ define amdgpu_kernel void @memmove_local_align1_private_align1_unknown_size(ptr ; MAX1024-NEXT: store i8 [[TMP12]], ptr addrspace(3) [[TMP13]], align 1, !noalias [[META0]] ; MAX1024-NEXT: [[TMP14]] = add i32 [[RESIDUAL_LOOP_INDEX]], 1 ; MAX1024-NEXT: [[TMP15:%.*]] = icmp ult i32 [[TMP14]], [[TMP2]] -; MAX1024-NEXT: br i1 [[TMP15]], label [[LOOP_MEMCPY_RESIDUAL]], label [[POST_LOOP_MEMCPY_EXPANSION:%.*]] -; MAX1024: post-loop-memcpy-expansion: +; MAX1024-NEXT: br i1 [[TMP15]], label [[LOOP_MEMCPY_RESIDUAL]], label [[POST_LOOP_MEMCPY_EXPANSION]] +; MAX1024: dynamic-memcpy-post-expansion: ; MAX1024-NEXT: ret void -; MAX1024: loop-memcpy-residual-header: -; MAX1024-NEXT: [[TMP16:%.*]] = icmp ne i32 [[TMP2]], 0 -; MAX1024-NEXT: br i1 [[TMP16]], label [[LOOP_MEMCPY_RESIDUAL]], label [[POST_LOOP_MEMCPY_EXPANSION]] ; ; ALL-LABEL: @memmove_local_align1_private_align1_unknown_size( ; ALL-NEXT: [[TMP2:%.*]] = and i32 [[SIZE:%.*]], 15 ; ALL-NEXT: [[TMP3:%.*]] = 
sub i32 [[SIZE]], [[TMP2]] ; ALL-NEXT: [[TMP4:%.*]] = icmp ne i32 [[TMP3]], 0 ; ALL-NEXT: br i1 [[TMP4]], label [[LOOP_MEMCPY_EXPANSION:%.*]], label [[LOOP_MEMCPY_RESIDUAL_HEADER:%.*]] -; ALL: loop-memcpy-expansion: +; ALL: dynamic-memcpy-expansion-main-body: ; ALL-NEXT: [[LOOP_INDEX:%.*]] = phi i32 [ 0, [[TMP0:%.*]] ], [ [[TMP8:%.*]], [[LOOP_MEMCPY_EXPANSION]] ] ; ALL-NEXT: [[TMP7:%.*]] = getelementptr inbounds i8, ptr addrspace(5) [[SRC:%.*]], i32 [[LOOP_INDEX]] ; ALL-NEXT: [[TMP5:%.*]] = load <4 x i32>, ptr addrspace(5) [[TMP7]], align 1, !alias.scope [[META9:![0-9]+]] @@ -1783,8 +1783,11 @@ define amdgpu_kernel void @memmove_local_align1_private_align1_unknown_size(ptr ; ALL-NEXT: [[TMP8]] = add i32 [[LOOP_INDEX]], 16 ; ALL-NEXT: [[TMP9:%.*]] = icmp ult i32 [[TMP8]], [[TMP3]] ; ALL-NEXT: br i1 [[TMP9]], label [[LOOP_MEMCPY_EXPANSION]], label [[LOOP_MEMCPY_RESIDUAL_HEADER]] -; ALL: loop-memcpy-residual: -; ALL-NEXT: [[RESIDUAL_LOOP_INDEX:%.*]] = phi i32 [ 0, [[LOOP_MEMCPY_RESIDUAL_HEADER]] ], [ [[TMP14:%.*]], [[LOOP_MEMCPY_RESIDUAL:%.*]] ] +; ALL: dynamic-memcpy-expansion-residual-cond: +; ALL-NEXT: [[TMP16:%.*]] = icmp ne i32 [[TMP2]], 0 +; ALL-NEXT: br i1 [[TMP16]], label [[LOOP_MEMCPY_RESIDUAL:%.*]], label [[POST_LOOP_MEMCPY_EXPANSION:%.*]] +; ALL: dynamic-memcpy-expansion-residual-body: +; ALL-NEXT: [[RESIDUAL_LOOP_INDEX:%.*]] = phi i32 [ 0, [[LOOP_MEMCPY_RESIDUAL_HEADER]] ], [ [[TMP14:%.*]], [[LOOP_MEMCPY_RESIDUAL]] ] ; ALL-NEXT: [[TMP10:%.*]] = add i32 [[TMP3]], [[RESIDUAL_LOOP_INDEX]] ; ALL-NEXT: [[TMP11:%.*]] = getelementptr inbounds i8, ptr addrspace(5) [[SRC]], i32 [[TMP10]] ; ALL-NEXT: [[TMP12:%.*]] = load i8, ptr addrspace(5) [[TMP11]], align 1, !alias.scope [[META9]] @@ -1792,12 +1795,9 @@ define amdgpu_kernel void @memmove_local_align1_private_align1_unknown_size(ptr ; ALL-NEXT: store i8 [[TMP12]], ptr addrspace(3) [[TMP13]], align 1, !noalias [[META9]] ; ALL-NEXT: [[TMP14]] = add i32 [[RESIDUAL_LOOP_INDEX]], 1 ; ALL-NEXT: [[TMP15:%.*]] = icmp ult 
i32 [[TMP14]], [[TMP2]] -; ALL-NEXT: br i1 [[TMP15]], label [[LOOP_MEMCPY_RESIDUAL]], label [[POST_LOOP_MEMCPY_EXPANSION:%.*]] -; ALL: post-loop-memcpy-expansion: +; ALL-NEXT: br i1 [[TMP15]], label [[LOOP_MEMCPY_RESIDUAL]], label [[POST_LOOP_MEMCPY_EXPANSION]] +; ALL: dynamic-memcpy-post-expansion: ; ALL-NEXT: ret void -; ALL: loop-memcpy-residual-header: -; ALL-NEXT: [[TMP16:%.*]] = icmp ne i32 [[TMP2]], 0 -; ALL-NEXT: br i1 [[TMP16]], label [[LOOP_MEMCPY_RESIDUAL]], label [[POST_LOOP_MEMCPY_EXPANSION]] ; call void @llvm.memmove.p3.p5.i32(ptr addrspace(3) %dst, ptr addrspace(5) %src, i32 %size, i1 false) ret void @@ -1810,7 +1810,7 @@ define amdgpu_kernel void @memmove_private_align1_local_align1(ptr addrspace(5) ; ; ALL-LABEL: @memmove_private_align1_local_align1( ; ALL-NEXT: br label [[LOAD_STORE_LOOP:%.*]] -; ALL: load-store-loop: +; ALL: static-memcpy-expansion-main-body: ; ALL-NEXT: [[LOOP_INDEX:%.*]] = phi i32 [ 0, [[TMP0:%.*]] ], [ [[TMP4:%.*]], [[LOAD_STORE_LOOP]] ] ; ALL-NEXT: [[TMP1:%.*]] = getelementptr inbounds i8, ptr addrspace(3) [[SRC:%.*]], i32 [[LOOP_INDEX]] ; ALL-NEXT: [[TMP2:%.*]] = load <64 x i32>, ptr addrspace(3) [[TMP1]], align 1, !alias.scope [[META12:![0-9]+]] @@ -1819,7 +1819,7 @@ define amdgpu_kernel void @memmove_private_align1_local_align1(ptr addrspace(5) ; ALL-NEXT: [[TMP4]] = add i32 [[LOOP_INDEX]], 256 ; ALL-NEXT: [[TMP5:%.*]] = icmp ult i32 [[TMP4]], 256 ; ALL-NEXT: br i1 [[TMP5]], label [[LOAD_STORE_LOOP]], label [[MEMCPY_SPLIT:%.*]] -; ALL: memcpy-split: +; ALL: static-memcpy-post-expansion: ; ALL-NEXT: ret void ; call void @llvm.memmove.p5.p3.i32(ptr addrspace(5) %dst, ptr addrspace(3) %src, i32 256, i1 false) @@ -1832,7 +1832,7 @@ define amdgpu_kernel void @memmove_private_align1_local_align1_unknown_size(ptr ; MAX1024-NEXT: [[TMP3:%.*]] = sub i32 [[SIZE]], [[TMP2]] ; MAX1024-NEXT: [[TMP4:%.*]] = icmp ne i32 [[TMP3]], 0 ; MAX1024-NEXT: br i1 [[TMP4]], label [[LOOP_MEMCPY_EXPANSION:%.*]], label 
[[LOOP_MEMCPY_RESIDUAL_HEADER:%.*]] -; MAX1024: loop-memcpy-expansion: +; MAX1024: dynamic-memcpy-expansion-main-body: ; MAX1024-NEXT: [[LOOP_INDEX:%.*]] = phi i32 [ 0, [[TMP0:%.*]] ], [ [[TMP8:%.*]], [[LOOP_MEMCPY_EXPANSION]] ] ; MAX1024-NEXT: [[TMP7:%.*]] = getelementptr inbounds i8, ptr addrspace(3) [[SRC:%.*]], i32 [[LOOP_INDEX]] ; MAX1024-NEXT: [[TMP5:%.*]] = load <4 x i32>, ptr addrspace(3) [[TMP7]], align 1, !alias.scope [[META3:![0-9]+]] @@ -1841,8 +1841,11 @@ define amdgpu_kernel void @memmove_private_align1_local_align1_unknown_size(ptr ; MAX1024-NEXT: [[TMP8]] = add i32 [[LOOP_INDEX]], 16 ; MAX1024-NEXT: [[TMP9:%.*]] = icmp ult i32 [[TMP8]], [[TMP3]] ; MAX1024-NEXT: br i1 [[TMP9]], label [[LOOP_MEMCPY_EXPANSION]], label [[LOOP_MEMCPY_RESIDUAL_HEADER]] -; MAX1024: loop-memcpy-residual: -; MAX1024-NEXT: [[RESIDUAL_LOOP_INDEX:%.*]] = phi i32 [ 0, [[LOOP_MEMCPY_RESIDUAL_HEADER]] ], [ [[TMP14:%.*]], [[LOOP_MEMCPY_RESIDUAL:%.*]] ] +; MAX1024: dynamic-memcpy-expansion-residual-cond: +; MAX1024-NEXT: [[TMP16:%.*]] = icmp ne i32 [[TMP2]], 0 +; MAX1024-NEXT: br i1 [[TMP16]], label [[LOOP_MEMCPY_RESIDUAL:%.*]], label [[POST_LOOP_MEMCPY_EXPANSION:%.*]] +; MAX1024: dynamic-memcpy-expansion-residual-body: +; MAX1024-NEXT: [[RESIDUAL_LOOP_INDEX:%.*]] = phi i32 [ 0, [[LOOP_MEMCPY_RESIDUAL_HEADER]] ], [ [[TMP14:%.*]], [[LOOP_MEMCPY_RESIDUAL]] ] ; MAX1024-NEXT: [[TMP10:%.*]] = add i32 [[TMP3]], [[RESIDUAL_LOOP_INDEX]] ; MAX1024-NEXT: [[TMP11:%.*]] = getelementptr inbounds i8, ptr addrspace(3) [[SRC]], i32 [[TMP10]] ; MAX1024-NEXT: [[TMP12:%.*]] = load i8, ptr addrspace(3) [[TMP11]], align 1, !alias.scope [[META3]] @@ -1850,19 +1853,16 @@ define amdgpu_kernel void @memmove_private_align1_local_align1_unknown_size(ptr ; MAX1024-NEXT: store i8 [[TMP12]], ptr addrspace(5) [[TMP13]], align 1, !noalias [[META3]] ; MAX1024-NEXT: [[TMP14]] = add i32 [[RESIDUAL_LOOP_INDEX]], 1 ; MAX1024-NEXT: [[TMP15:%.*]] = icmp ult i32 [[TMP14]], [[TMP2]] -; MAX1024-NEXT: br i1 [[TMP15]], label 
[[LOOP_MEMCPY_RESIDUAL]], label [[POST_LOOP_MEMCPY_EXPANSION:%.*]] -; MAX1024: post-loop-memcpy-expansion: +; MAX1024-NEXT: br i1 [[TMP15]], label [[LOOP_MEMCPY_RESIDUAL]], label [[POST_LOOP_MEMCPY_EXPANSION]] +; MAX1024: dynamic-memcpy-post-expansion: ; MAX1024-NEXT: ret void -; MAX1024: loop-memcpy-residual-header: -; MAX1024-NEXT: [[TMP16:%.*]] = icmp ne i32 [[TMP2]], 0 -; MAX1024-NEXT: br i1 [[TMP16]], label [[LOOP_MEMCPY_RESIDUAL]], label [[POST_LOOP_MEMCPY_EXPANSION]] ; ; ALL-LABEL: @memmove_private_align1_local_align1_unknown_size( ; ALL-NEXT: [[TMP2:%.*]] = and i32 [[SIZE:%.*]], 15 ; ALL-NEXT: [[TMP3:%.*]] = sub i32 [[SIZE]], [[TMP2]] ; ALL-NEXT: [[TMP4:%.*]] = icmp ne i32 [[TMP3]], 0 ; ALL-NEXT: br i1 [[TMP4]], label [[LOOP_MEMCPY_EXPANSION:%.*]], label [[LOOP_MEMCPY_RESIDUAL_HEADER:%.*]] -; ALL: loop-memcpy-expansion: +; ALL: dynamic-memcpy-expansion-main-body: ; ALL-NEXT: [[LOOP_INDEX:%.*]] = phi i32 [ 0, [[TMP0:%.*]] ], [ [[TMP8:%.*]], [[LOOP_MEMCPY_EXPANSION]] ] ; ALL-NEXT: [[TMP7:%.*]] = getelementptr inbounds i8, ptr addrspace(3) [[SRC:%.*]], i32 [[LOOP_INDEX]] ; ALL-NEXT: [[TMP5:%.*]] = load <4 x i32>, ptr addrspace(3) [[TMP7]], align 1, !alias.scope [[META15:![0-9]+]] @@ -1871,8 +1871,11 @@ define amdgpu_kernel void @memmove_private_align1_local_align1_unknown_size(ptr ; ALL-NEXT: [[TMP8]] = add i32 [[LOOP_INDEX]], 16 ; ALL-NEXT: [[TMP9:%.*]] = icmp ult i32 [[TMP8]], [[TMP3]] ; ALL-NEXT: br i1 [[TMP9]], label [[LOOP_MEMCPY_EXPANSION]], label [[LOOP_MEMCPY_RESIDUAL_HEADER]] -; ALL: loop-memcpy-residual: -; ALL-NEXT: [[RESIDUAL_LOOP_INDEX:%.*]] = phi i32 [ 0, [[LOOP_MEMCPY_RESIDUAL_HEADER]] ], [ [[TMP14:%.*]], [[LOOP_MEMCPY_RESIDUAL:%.*]] ] +; ALL: dynamic-memcpy-expansion-residual-cond: +; ALL-NEXT: [[TMP16:%.*]] = icmp ne i32 [[TMP2]], 0 +; ALL-NEXT: br i1 [[TMP16]], label [[LOOP_MEMCPY_RESIDUAL:%.*]], label [[POST_LOOP_MEMCPY_EXPANSION:%.*]] +; ALL: dynamic-memcpy-expansion-residual-body: +; ALL-NEXT: [[RESIDUAL_LOOP_INDEX:%.*]] = phi i32 [ 0, 
[[LOOP_MEMCPY_RESIDUAL_HEADER]] ], [ [[TMP14:%.*]], [[LOOP_MEMCPY_RESIDUAL]] ] ; ALL-NEXT: [[TMP10:%.*]] = add i32 [[TMP3]], [[RESIDUAL_LOOP_INDEX]] ; ALL-NEXT: [[TMP11:%.*]] = getelementptr inbounds i8, ptr addrspace(3) [[SRC]], i32 [[TMP10]] ; ALL-NEXT: [[TMP12:%.*]] = load i8, ptr addrspace(3) [[TMP11]], align 1, !alias.scope [[META15]] @@ -1880,12 +1883,9 @@ define amdgpu_kernel void @memmove_private_align1_local_align1_unknown_size(ptr ; ALL-NEXT: store i8 [[TMP12]], ptr addrspace(5) [[TMP13]], align 1, !noalias [[META15]] ; ALL-NEXT: [[TMP14]] = add i32 [[RESIDUAL_LOOP_INDEX]], 1 ; ALL-NEXT: [[TMP15:%.*]] = icmp ult i32 [[TMP14]], [[TMP2]] -; ALL-NEXT: br i1 [[TMP15]], label [[LOOP_MEMCPY_RESIDUAL]], label [[POST_LOOP_MEMCPY_EXPANSION:%.*]] -; ALL: post-loop-memcpy-expansion: +; ALL-NEXT: br i1 [[TMP15]], label [[LOOP_MEMCPY_RESIDUAL]], label [[POST_LOOP_MEMCPY_EXPANSION]] +; ALL: dynamic-memcpy-post-expansion: ; ALL-NEXT: ret void -; ALL: loop-memcpy-residual-header: -; ALL-NEXT: [[TMP16:%.*]] = icmp ne i32 [[TMP2]], 0 -; ALL-NEXT: br i1 [[TMP16]], label [[LOOP_MEMCPY_RESIDUAL]], label [[POST_LOOP_MEMCPY_EXPANSION]] ; call void @llvm.memmove.p5.p3.i32(ptr addrspace(5) %dst, ptr addrspace(3) %src, i32 %size, i1 false) ret void @@ -2367,7 +2367,7 @@ define void @test_umin(i64 %0, i64 %idxprom, ptr %x, ptr %y) { ; OPT-NEXT: [[TMP3:%.*]] = sub i64 [[SPEC_SELECT]], [[TMP2]] ; OPT-NEXT: [[TMP4:%.*]] = icmp ne i64 [[TMP3]], 0 ; OPT-NEXT: br i1 [[TMP4]], label [[LOOP_MEMCPY_EXPANSION:%.*]], label [[LOOP_MEMCPY_RESIDUAL_HEADER:%.*]] -; OPT: loop-memcpy-expansion: +; OPT: dynamic-memcpy-expansion-main-body: ; OPT-NEXT: [[LOOP_INDEX:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ], [ [[TMP8:%.*]], [[LOOP_MEMCPY_EXPANSION]] ] ; OPT-NEXT: [[TMP5:%.*]] = getelementptr inbounds i8, ptr [[X:%.*]], i64 [[LOOP_INDEX]] ; OPT-NEXT: [[TMP6:%.*]] = load <4 x i32>, ptr [[TMP5]], align 1 @@ -2376,8 +2376,11 @@ define void @test_umin(i64 %0, i64 %idxprom, ptr %x, ptr %y) { ; OPT-NEXT: [[TMP8]] 
= add i64 [[LOOP_INDEX]], 16 ; OPT-NEXT: [[TMP9:%.*]] = icmp ult i64 [[TMP8]], [[TMP3]] ; OPT-NEXT: br i1 [[TMP9]], label [[LOOP_MEMCPY_EXPANSION]], label [[LOOP_MEMCPY_RESIDUAL_HEADER]] -; OPT: loop-memcpy-residual: -; OPT-NEXT: [[RESIDUAL_LOOP_INDEX:%.*]] = phi i64 [ 0, [[LOOP_MEMCPY_RESIDUAL_HEADER]] ], [ [[TMP14:%.*]], [[LOOP_MEMCPY_RESIDUAL:%.*]] ] +; OPT: dynamic-memcpy-expansion-residual-cond: +; OPT-NEXT: [[TMP16:%.*]] = icmp ne i64 [[TMP2]], 0 +; OPT-NEXT: br i1 [[TMP16]], label [[LOOP_MEMCPY_RESIDUAL:%.*]], label [[POST_LOOP_MEMCPY_EXPANSION:%.*]] +; OPT: dynamic-memcpy-expansion-residual-body: +; OPT-NEXT: [[RESIDUAL_LOOP_INDEX:%.*]] = phi i64 [ 0, [[LOOP_MEMCPY_RESIDUAL_HEADER]] ], [ [[TMP14:%.*]], [[LOOP_MEMCPY_RESIDUAL]] ] ; OPT-NEXT: [[TMP10:%.*]] = add i64 [[TMP3]], [[RESIDUAL_LOOP_INDEX]] ; OPT-NEXT: [[TMP11:%.*]] = getelementptr inbounds i8, ptr [[X]], i64 [[TMP10]] ; OPT-NEXT: [[TMP12:%.*]] = load i8, ptr [[TMP11]], align 1 @@ -2385,12 +2388,9 @@ define void @test_umin(i64 %0, i64 %idxprom, ptr %x, ptr %y) { ; OPT-NEXT: store i8 [[TMP12]], ptr [[TMP13]], align 1 ; OPT-NEXT: [[TMP14]] = add i64 [[RESIDUAL_LOOP_INDEX]], 1 ; OPT-NEXT: [[TMP15:%.*]] = icmp ult i64 [[TMP14]], [[TMP2]] -; OPT-NEXT: br i1 [[TMP15]], label [[LOOP_MEMCPY_RESIDUAL]], label [[POST_LOOP_MEMCPY_EXPANSION:%.*]] -; OPT: post-loop-memcpy-expansion: +; OPT-NEXT: br i1 [[TMP15]], label [[LOOP_MEMCPY_RESIDUAL]], label [[POST_LOOP_MEMCPY_EXPANSION]] +; OPT: dynamic-memcpy-post-expansion: ; OPT-NEXT: ret void -; OPT: loop-memcpy-residual-header: -; OPT-NEXT: [[TMP16:%.*]] = icmp ne i64 [[TMP2]], 0 -; OPT-NEXT: br i1 [[TMP16]], label [[LOOP_MEMCPY_RESIDUAL]], label [[POST_LOOP_MEMCPY_EXPANSION]] ; entry: %arrayidx = getelementptr [32 x [8 x i64]], ptr %y, i64 0, i64 %idxprom @@ -2439,7 +2439,7 @@ define amdgpu_kernel void @memcpy_volatile(ptr addrspace(1) %dst, ptr addrspace( ; ; ALL-LABEL: @memcpy_volatile( ; ALL-NEXT: br label [[LOAD_STORE_LOOP:%.*]] -; ALL: load-store-loop: +; ALL: 
static-memcpy-expansion-main-body: ; ALL-NEXT: [[LOOP_INDEX:%.*]] = phi i64 [ 0, [[TMP0:%.*]] ], [ [[TMP4:%.*]], [[LOAD_STORE_LOOP]] ] ; ALL-NEXT: [[TMP1:%.*]] = getelementptr inbounds i8, ptr addrspace(1) [[SRC:%.*]], i64 [[LOOP_INDEX]] ; ALL-NEXT: [[TMP2:%.*]] = load volatile <64 x i32>, ptr addrspace(1) [[TMP1]], align 1 @@ -2448,7 +2448,7 @@ define amdgpu_kernel void @memcpy_volatile(ptr addrspace(1) %dst, ptr addrspace( ; ALL-NEXT: [[TMP4]] = add i64 [[LOOP_INDEX]], 256 ; ALL-NEXT: [[TMP5:%.*]] = icmp ult i64 [[TMP4]], 512 ; ALL-NEXT: br i1 [[TMP5]], label [[LOAD_STORE_LOOP]], label [[MEMCPY_SPLIT:%.*]] -; ALL: memcpy-split: +; ALL: static-memcpy-post-expansion: ; ALL-NEXT: ret void ; call void @llvm.memcpy.p1.p1.i64(ptr addrspace(1) %dst, ptr addrspace(1) %src, i64 512, i1 true) diff --git a/llvm/test/CodeGen/AMDGPU/memcpy-crash-issue63986.ll b/llvm/test/CodeGen/AMDGPU/memcpy-crash-issue63986.ll index 43752c22b1f3e..faf70f55876f7 100644 --- a/llvm/test/CodeGen/AMDGPU/memcpy-crash-issue63986.ll +++ b/llvm/test/CodeGen/AMDGPU/memcpy-crash-issue63986.ll @@ -12,7 +12,7 @@ define void @issue63986(i64 %0, i64 %idxprom, ptr inreg %ptr) { ; CHECK-NEXT: v_add_co_u32_e32 v8, vcc, s16, v4 ; CHECK-NEXT: v_addc_co_u32_e32 v9, vcc, v6, v5, vcc ; CHECK-NEXT: s_mov_b64 s[4:5], 0 -; CHECK-NEXT: .LBB0_1: ; %loop-memcpy-expansion +; CHECK-NEXT: .LBB0_1: ; %dynamic-memcpy-expansion-main-body ; CHECK-NEXT: ; =>This Inner Loop Header: Depth=1 ; CHECK-NEXT: v_mov_b32_e32 v7, s5 ; CHECK-NEXT: v_mov_b32_e32 v6, s4 @@ -20,28 +20,28 @@ define void @issue63986(i64 %0, i64 %idxprom, ptr inreg %ptr) { ; CHECK-NEXT: v_add_co_u32_e32 v6, vcc, s4, v8 ; CHECK-NEXT: s_add_u32 s4, s4, 16 ; CHECK-NEXT: s_addc_u32 s5, s5, 0 -; CHECK-NEXT: v_cmp_ge_u64_e64 s[6:7], s[4:5], 32 +; CHECK-NEXT: v_cmp_lt_u64_e64 s[6:7], s[4:5], 32 ; CHECK-NEXT: v_addc_co_u32_e32 v7, vcc, v9, v7, vcc ; CHECK-NEXT: s_and_b64 vcc, exec, s[6:7] ; CHECK-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; CHECK-NEXT: flat_store_dwordx4 
v[6:7], v[10:13] -; CHECK-NEXT: s_cbranch_vccz .LBB0_1 -; CHECK-NEXT: ; %bb.2: ; %loop-memcpy-residual-header +; CHECK-NEXT: s_cbranch_vccnz .LBB0_1 +; CHECK-NEXT: ; %bb.2: ; %dynamic-memcpy-expansion-residual-cond ; CHECK-NEXT: s_branch .LBB0_4 ; CHECK-NEXT: ; %bb.3: ; CHECK-NEXT: ; implicit-def: $vgpr6_vgpr7 ; CHECK-NEXT: s_branch .LBB0_5 -; CHECK-NEXT: .LBB0_4: ; %loop-memcpy-residual-header.post-loop-memcpy-expansion_crit_edge +; CHECK-NEXT: .LBB0_4: ; %dynamic-memcpy-expansion-residual-cond.dynamic-memcpy-post-expansion_crit_edge ; CHECK-NEXT: v_lshlrev_b64 v[6:7], 6, v[2:3] ; CHECK-NEXT: s_cbranch_execnz .LBB0_8 -; CHECK-NEXT: .LBB0_5: ; %loop-memcpy-residual.preheader +; CHECK-NEXT: .LBB0_5: ; %dynamic-memcpy-expansion-residual-body.preheader ; CHECK-NEXT: s_add_u32 s4, s16, 32 ; CHECK-NEXT: s_addc_u32 s5, s17, 0 ; CHECK-NEXT: v_mov_b32_e32 v3, s5 ; CHECK-NEXT: v_add_co_u32_e32 v2, vcc, s4, v4 ; CHECK-NEXT: v_addc_co_u32_e32 v3, vcc, v3, v5, vcc ; CHECK-NEXT: s_mov_b64 s[4:5], 0 -; CHECK-NEXT: ; %bb.6: ; %loop-memcpy-residual +; CHECK-NEXT: ; %bb.6: ; %dynamic-memcpy-expansion-residual-body ; CHECK-NEXT: s_add_u32 s6, 32, s4 ; CHECK-NEXT: s_addc_u32 s7, 0, s5 ; CHECK-NEXT: v_mov_b32_e32 v6, s6 @@ -57,7 +57,7 @@ define void @issue63986(i64 %0, i64 %idxprom, ptr inreg %ptr) { ; CHECK-NEXT: ; %bb.7: ; CHECK-NEXT: v_mov_b32_e32 v7, v5 ; CHECK-NEXT: v_mov_b32_e32 v6, v4 -; CHECK-NEXT: .LBB0_8: ; %post-loop-memcpy-expansion +; CHECK-NEXT: .LBB0_8: ; %dynamic-memcpy-post-expansion ; CHECK-NEXT: v_and_b32_e32 v2, 15, v0 ; CHECK-NEXT: v_and_b32_e32 v0, -16, v0 ; CHECK-NEXT: v_add_co_u32_e32 v4, vcc, v6, v0 @@ -76,18 +76,18 @@ define void @issue63986(i64 %0, i64 %idxprom, ptr inreg %ptr) { ; CHECK-NEXT: .LBB0_10: ; %Flow16 ; CHECK-NEXT: ; in Loop: Header=BB0_11 Depth=1 ; CHECK-NEXT: s_andn2_b64 vcc, exec, s[8:9] -; CHECK-NEXT: s_cbranch_vccz .LBB0_19 +; CHECK-NEXT: s_cbranch_vccz .LBB0_18 ; CHECK-NEXT: .LBB0_11: ; %while.cond ; CHECK-NEXT: ; =>This Loop Header: 
Depth=1 ; CHECK-NEXT: ; Child Loop BB0_13 Depth 2 ; CHECK-NEXT: ; Child Loop BB0_17 Depth 2 ; CHECK-NEXT: s_and_saveexec_b64 s[8:9], s[4:5] ; CHECK-NEXT: s_cbranch_execz .LBB0_14 -; CHECK-NEXT: ; %bb.12: ; %loop-memcpy-expansion2.preheader +; CHECK-NEXT: ; %bb.12: ; %dynamic-memcpy-expansion-main-body2.preheader ; CHECK-NEXT: ; in Loop: Header=BB0_11 Depth=1 ; CHECK-NEXT: s_mov_b64 s[10:11], 0 ; CHECK-NEXT: s_mov_b64 s[12:13], 0 -; CHECK-NEXT: .LBB0_13: ; %loop-memcpy-expansion2 +; CHECK-NEXT: .LBB0_13: ; %dynamic-memcpy-expansion-main-body2 ; CHECK-NEXT: ; Parent Loop BB0_11 Depth=1 ; CHECK-NEXT: ; => This Inner Loop Header: Depth=2 ; CHECK-NEXT: v_mov_b32_e32 v6, s10 @@ -108,37 +108,33 @@ define void @issue63986(i64 %0, i64 %idxprom, ptr inreg %ptr) { ; CHECK-NEXT: s_or_b64 exec, exec, s[8:9] ; CHECK-NEXT: s_mov_b64 s[8:9], -1 ; CHECK-NEXT: s_cbranch_execz .LBB0_10 -; CHECK-NEXT: ; %bb.15: ; %loop-memcpy-residual-header5 +; CHECK-NEXT: ; %bb.15: ; %dynamic-memcpy-expansion-residual-cond5 ; CHECK-NEXT: ; in Loop: Header=BB0_11 Depth=1 -; CHECK-NEXT: s_and_saveexec_b64 s[8:9], s[6:7] -; CHECK-NEXT: s_xor_b64 s[10:11], exec, s[8:9] +; CHECK-NEXT: s_and_saveexec_b64 s[10:11], s[6:7] ; CHECK-NEXT: s_cbranch_execz .LBB0_9 -; CHECK-NEXT: ; %bb.16: ; %loop-memcpy-residual4.preheader +; CHECK-NEXT: ; %bb.16: ; %dynamic-memcpy-expansion-residual-body4.preheader ; CHECK-NEXT: ; in Loop: Header=BB0_11 Depth=1 -; CHECK-NEXT: s_mov_b64 s[14:15], 0 ; CHECK-NEXT: s_mov_b64 s[12:13], 0 -; CHECK-NEXT: .LBB0_17: ; %loop-memcpy-residual4 +; CHECK-NEXT: s_mov_b64 s[14:15], 0 +; CHECK-NEXT: .LBB0_17: ; %dynamic-memcpy-expansion-residual-body4 ; CHECK-NEXT: ; Parent Loop BB0_11 Depth=1 ; CHECK-NEXT: ; => This Inner Loop Header: Depth=2 -; CHECK-NEXT: v_mov_b32_e32 v10, s15 -; CHECK-NEXT: v_add_co_u32_e32 v6, vcc, s14, v0 +; CHECK-NEXT: v_mov_b32_e32 v10, s13 +; CHECK-NEXT: v_add_co_u32_e32 v6, vcc, s12, v0 ; CHECK-NEXT: v_addc_co_u32_e32 v7, vcc, v1, v10, vcc ; CHECK-NEXT: 
flat_load_ubyte v11, v[6:7] -; CHECK-NEXT: v_add_co_u32_e32 v6, vcc, s14, v4 -; CHECK-NEXT: s_add_u32 s14, s14, 1 -; CHECK-NEXT: s_addc_u32 s15, s15, 0 -; CHECK-NEXT: v_cmp_ge_u64_e64 s[8:9], s[14:15], v[2:3] +; CHECK-NEXT: v_add_co_u32_e32 v6, vcc, s12, v4 +; CHECK-NEXT: s_add_u32 s12, s12, 1 +; CHECK-NEXT: s_addc_u32 s13, s13, 0 +; CHECK-NEXT: v_cmp_ge_u64_e64 s[8:9], s[12:13], v[2:3] ; CHECK-NEXT: v_addc_co_u32_e32 v7, vcc, v5, v10, vcc -; CHECK-NEXT: s_or_b64 s[12:13], s[8:9], s[12:13] +; CHECK-NEXT: s_or_b64 s[14:15], s[8:9], s[14:15] ; CHECK-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; CHECK-NEXT: flat_store_byte v[6:7], v11 -; CHECK-NEXT: s_andn2_b64 exec, exec, s[12:13] +; CHECK-NEXT: s_andn2_b64 exec, exec, s[14:15] ; CHECK-NEXT: s_cbranch_execnz .LBB0_17 -; CHECK-NEXT: ; %bb.18: ; %Flow -; CHECK-NEXT: ; in Loop: Header=BB0_11 Depth=1 -; CHECK-NEXT: s_or_b64 exec, exec, s[12:13] ; CHECK-NEXT: s_branch .LBB0_9 -; CHECK-NEXT: .LBB0_19: ; %DummyReturnBlock +; CHECK-NEXT: .LBB0_18: ; %DummyReturnBlock ; CHECK-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; CHECK-NEXT: s_setpc_b64 s[30:31] entry: diff --git a/llvm/test/CodeGen/AMDGPU/memintrinsic-unroll.ll b/llvm/test/CodeGen/AMDGPU/memintrinsic-unroll.ll index cb68a987c243b..4f2816538b1ff 100644 --- a/llvm/test/CodeGen/AMDGPU/memintrinsic-unroll.ll +++ b/llvm/test/CodeGen/AMDGPU/memintrinsic-unroll.ll @@ -14,7 +14,7 @@ define void @memcpy_p0_p0_sz2048(ptr addrspace(0) align 1 %dst, ptr addrspace(0) ; CHECK: ; %bb.0: ; %entry ; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; CHECK-NEXT: s_mov_b64 s[4:5], 0 -; CHECK-NEXT: .LBB0_1: ; %load-store-loop +; CHECK-NEXT: .LBB0_1: ; %static-memcpy-expansion-main-body ; CHECK-NEXT: ; =>This Inner Loop Header: Depth=1 ; CHECK-NEXT: v_add_co_u32 v24, vcc_lo, v2, s4 ; CHECK-NEXT: v_add_co_ci_u32_e64 v25, null, s5, v3, vcc_lo @@ -83,7 +83,7 @@ define void @memcpy_p0_p0_sz2048(ptr addrspace(0) align 1 %dst, ptr addrspace(0) ; CHECK-NEXT: flat_store_dwordx4 v[102:103], v[96:99] ; 
CHECK-NEXT: s_and_b32 vcc_lo, exec_lo, s6 ; CHECK-NEXT: s_cbranch_vccnz .LBB0_1 -; CHECK-NEXT: ; %bb.2: ; %memcpy-split +; CHECK-NEXT: ; %bb.2: ; %static-memcpy-post-expansion ; CHECK-NEXT: s_waitcnt lgkmcnt(0) ; CHECK-NEXT: s_setpc_b64 s[30:31] ; @@ -108,7 +108,7 @@ define void @memcpy_p0_p0_sz2048(ptr addrspace(0) align 1 %dst, ptr addrspace(0) ; ALIGNED-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill ; ALIGNED-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill ; ALIGNED-NEXT: buffer_store_dword v72, off, s[0:3], s32 ; 4-byte Folded Spill -; ALIGNED-NEXT: .LBB0_1: ; %load-store-loop +; ALIGNED-NEXT: .LBB0_1: ; %static-memcpy-expansion-main-body ; ALIGNED-NEXT: ; =>This Inner Loop Header: Depth=1 ; ALIGNED-NEXT: v_add_co_u32 v4, vcc_lo, v2, s4 ; ALIGNED-NEXT: v_add_co_ci_u32_e64 v5, null, s5, v3, vcc_lo @@ -757,7 +757,7 @@ define void @memcpy_p0_p0_sz2048(ptr addrspace(0) align 1 %dst, ptr addrspace(0) ; ALIGNED-NEXT: flat_store_byte v[20:21], v11 offset:48 ; ALIGNED-NEXT: flat_store_byte v[20:21], v4 offset:46 ; ALIGNED-NEXT: s_cbranch_vccnz .LBB0_1 -; ALIGNED-NEXT: ; %bb.2: ; %memcpy-split +; ALIGNED-NEXT: ; %bb.2: ; %static-memcpy-post-expansion ; ALIGNED-NEXT: s_clause 0x10 ; 68-byte Folded Reload ; ALIGNED-NEXT: buffer_load_dword v72, off, s[0:3], s32 ; ALIGNED-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:4 @@ -784,7 +784,7 @@ define void @memcpy_p0_p0_sz2048(ptr addrspace(0) align 1 %dst, ptr addrspace(0) ; UNROLL3-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; UNROLL3-NEXT: s_mov_b64 s[4:5], 0 ; UNROLL3-NEXT: .p2align 6 -; UNROLL3-NEXT: .LBB0_1: ; %load-store-loop +; UNROLL3-NEXT: .LBB0_1: ; %static-memcpy-expansion-main-body ; UNROLL3-NEXT: ; =>This Inner Loop Header: Depth=1 ; UNROLL3-NEXT: v_add_co_u32 v12, vcc_lo, v2, s4 ; UNROLL3-NEXT: v_add_co_ci_u32_e64 v13, null, s5, v3, vcc_lo @@ -805,7 +805,7 @@ define void @memcpy_p0_p0_sz2048(ptr addrspace(0) align 1 %dst, ptr addrspace(0) ; 
UNROLL3-NEXT: v_cmp_gt_u64_e64 s6, 0x7e0, s[4:5] ; UNROLL3-NEXT: s_and_b32 vcc_lo, exec_lo, s6 ; UNROLL3-NEXT: s_cbranch_vccnz .LBB0_1 -; UNROLL3-NEXT: ; %bb.2: ; %memcpy-split +; UNROLL3-NEXT: ; %bb.2: ; %static-memcpy-post-expansion ; UNROLL3-NEXT: flat_load_dwordx4 v[4:7], v[2:3] offset:2016 ; UNROLL3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; UNROLL3-NEXT: flat_store_dwordx4 v[0:1], v[4:7] offset:2016 @@ -824,7 +824,7 @@ define void @memcpy_p1_p1_sz2048(ptr addrspace(1) align 1 %dst, ptr addrspace(1) ; CHECK: ; %bb.0: ; %entry ; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; CHECK-NEXT: s_mov_b64 s[4:5], 0 -; CHECK-NEXT: .LBB1_1: ; %load-store-loop +; CHECK-NEXT: .LBB1_1: ; %static-memcpy-expansion-main-body ; CHECK-NEXT: ; =>This Inner Loop Header: Depth=1 ; CHECK-NEXT: v_add_co_u32 v96, vcc_lo, v2, s4 ; CHECK-NEXT: v_add_co_ci_u32_e64 v97, null, s5, v3, vcc_lo @@ -884,7 +884,7 @@ define void @memcpy_p1_p1_sz2048(ptr addrspace(1) align 1 %dst, ptr addrspace(1) ; CHECK-NEXT: v_cmp_gt_u64_e64 s6, 0x800, s[4:5] ; CHECK-NEXT: s_and_b32 vcc_lo, exec_lo, s6 ; CHECK-NEXT: s_cbranch_vccnz .LBB1_1 -; CHECK-NEXT: ; %bb.2: ; %memcpy-split +; CHECK-NEXT: ; %bb.2: ; %static-memcpy-post-expansion ; CHECK-NEXT: s_setpc_b64 s[30:31] ; ; ALIGNED-LABEL: memcpy_p1_p1_sz2048: @@ -899,7 +899,7 @@ define void @memcpy_p1_p1_sz2048(ptr addrspace(1) align 1 %dst, ptr addrspace(1) ; ALIGNED-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill ; ALIGNED-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill ; ALIGNED-NEXT: buffer_store_dword v47, off, s[0:3], s32 ; 4-byte Folded Spill -; ALIGNED-NEXT: .LBB1_1: ; %load-store-loop +; ALIGNED-NEXT: .LBB1_1: ; %static-memcpy-expansion-main-body ; ALIGNED-NEXT: ; =>This Inner Loop Header: Depth=1 ; ALIGNED-NEXT: v_add_co_u32 v24, vcc_lo, v2, s4 ; ALIGNED-NEXT: v_add_co_ci_u32_e64 v25, null, s5, v3, vcc_lo @@ -1520,7 +1520,7 @@ define void @memcpy_p1_p1_sz2048(ptr addrspace(1) align 1 %dst, 
ptr addrspace(1) ; ALIGNED-NEXT: global_store_byte v[16:17], v11, off offset:3 ; ALIGNED-NEXT: global_store_byte v[16:17], v4, off offset:1 ; ALIGNED-NEXT: s_cbranch_vccnz .LBB1_1 -; ALIGNED-NEXT: ; %bb.2: ; %memcpy-split +; ALIGNED-NEXT: ; %bb.2: ; %static-memcpy-post-expansion ; ALIGNED-NEXT: s_clause 0x7 ; 32-byte Folded Reload ; ALIGNED-NEXT: buffer_load_dword v47, off, s[0:3], s32 ; ALIGNED-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:4 @@ -1538,7 +1538,7 @@ define void @memcpy_p1_p1_sz2048(ptr addrspace(1) align 1 %dst, ptr addrspace(1) ; UNROLL3-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; UNROLL3-NEXT: s_mov_b64 s[4:5], 0 ; UNROLL3-NEXT: .p2align 6 -; UNROLL3-NEXT: .LBB1_1: ; %load-store-loop +; UNROLL3-NEXT: .LBB1_1: ; %static-memcpy-expansion-main-body ; UNROLL3-NEXT: ; =>This Inner Loop Header: Depth=1 ; UNROLL3-NEXT: v_add_co_u32 v12, vcc_lo, v2, s4 ; UNROLL3-NEXT: v_add_co_ci_u32_e64 v13, null, s5, v3, vcc_lo @@ -1559,7 +1559,7 @@ define void @memcpy_p1_p1_sz2048(ptr addrspace(1) align 1 %dst, ptr addrspace(1) ; UNROLL3-NEXT: v_cmp_gt_u64_e64 s6, 0x7e0, s[4:5] ; UNROLL3-NEXT: s_and_b32 vcc_lo, exec_lo, s6 ; UNROLL3-NEXT: s_cbranch_vccnz .LBB1_1 -; UNROLL3-NEXT: ; %bb.2: ; %memcpy-split +; UNROLL3-NEXT: ; %bb.2: ; %static-memcpy-post-expansion ; UNROLL3-NEXT: global_load_dwordx4 v[4:7], v[2:3], off offset:2016 ; UNROLL3-NEXT: s_waitcnt vmcnt(0) ; UNROLL3-NEXT: global_store_dwordx4 v[0:1], v[4:7], off offset:2016 @@ -1577,7 +1577,7 @@ define void @memcpy_p0_p4_sz2048(ptr addrspace(0) align 1 %dst, ptr addrspace(4) ; CHECK: ; %bb.0: ; %entry ; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; CHECK-NEXT: s_mov_b64 s[4:5], 0 -; CHECK-NEXT: .LBB2_1: ; %load-store-loop +; CHECK-NEXT: .LBB2_1: ; %static-memcpy-expansion-main-body ; CHECK-NEXT: ; =>This Inner Loop Header: Depth=1 ; CHECK-NEXT: v_add_co_u32 v96, vcc_lo, v2, s4 ; CHECK-NEXT: v_add_co_ci_u32_e64 v97, null, s5, v3, vcc_lo @@ -1639,7 +1639,7 @@ define void @memcpy_p0_p4_sz2048(ptr 
addrspace(0) align 1 %dst, ptr addrspace(4) ; CHECK-NEXT: flat_store_dwordx4 v[100:101], v[96:99] ; CHECK-NEXT: s_and_b32 vcc_lo, exec_lo, s6 ; CHECK-NEXT: s_cbranch_vccnz .LBB2_1 -; CHECK-NEXT: ; %bb.2: ; %memcpy-split +; CHECK-NEXT: ; %bb.2: ; %static-memcpy-post-expansion ; CHECK-NEXT: s_waitcnt lgkmcnt(0) ; CHECK-NEXT: s_setpc_b64 s[30:31] ; @@ -1647,7 +1647,7 @@ define void @memcpy_p0_p4_sz2048(ptr addrspace(0) align 1 %dst, ptr addrspace(4) ; ALIGNED: ; %bb.0: ; %entry ; ALIGNED-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; ALIGNED-NEXT: s_mov_b64 s[4:5], 0 -; ALIGNED-NEXT: .LBB2_1: ; %load-store-loop +; ALIGNED-NEXT: .LBB2_1: ; %static-memcpy-expansion-main-body ; ALIGNED-NEXT: ; =>This Inner Loop Header: Depth=1 ; ALIGNED-NEXT: v_add_co_u32 v8, vcc_lo, v2, s4 ; ALIGNED-NEXT: v_add_co_ci_u32_e64 v9, null, s5, v3, vcc_lo @@ -2141,7 +2141,7 @@ define void @memcpy_p0_p4_sz2048(ptr addrspace(0) align 1 %dst, ptr addrspace(4) ; ALIGNED-NEXT: flat_store_byte v[84:85], v65 offset:1 ; ALIGNED-NEXT: flat_store_byte v[84:85], v4 ; ALIGNED-NEXT: s_cbranch_vccnz .LBB2_1 -; ALIGNED-NEXT: ; %bb.2: ; %memcpy-split +; ALIGNED-NEXT: ; %bb.2: ; %static-memcpy-post-expansion ; ALIGNED-NEXT: s_waitcnt lgkmcnt(0) ; ALIGNED-NEXT: s_setpc_b64 s[30:31] ; @@ -2150,7 +2150,7 @@ define void @memcpy_p0_p4_sz2048(ptr addrspace(0) align 1 %dst, ptr addrspace(4) ; UNROLL3-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; UNROLL3-NEXT: s_mov_b64 s[4:5], 0 ; UNROLL3-NEXT: .p2align 6 -; UNROLL3-NEXT: .LBB2_1: ; %load-store-loop +; UNROLL3-NEXT: .LBB2_1: ; %static-memcpy-expansion-main-body ; UNROLL3-NEXT: ; =>This Inner Loop Header: Depth=1 ; UNROLL3-NEXT: v_add_co_u32 v12, vcc_lo, v2, s4 ; UNROLL3-NEXT: v_add_co_ci_u32_e64 v13, null, s5, v3, vcc_lo @@ -2171,7 +2171,7 @@ define void @memcpy_p0_p4_sz2048(ptr addrspace(0) align 1 %dst, ptr addrspace(4) ; UNROLL3-NEXT: v_cmp_gt_u64_e64 s6, 0x7e0, s[4:5] ; UNROLL3-NEXT: s_and_b32 vcc_lo, exec_lo, s6 ; UNROLL3-NEXT: s_cbranch_vccnz .LBB2_1 -; 
UNROLL3-NEXT: ; %bb.2: ; %memcpy-split +; UNROLL3-NEXT: ; %bb.2: ; %static-memcpy-post-expansion ; UNROLL3-NEXT: s_clause 0x1 ; UNROLL3-NEXT: global_load_dwordx4 v[4:7], v[2:3], off offset:2016 ; UNROLL3-NEXT: global_load_dwordx4 v[8:11], v[2:3], off offset:2032 @@ -2191,7 +2191,7 @@ define void @memcpy_p5_p5_sz2048(ptr addrspace(5) align 1 %dst, ptr addrspace(5) ; CHECK: ; %bb.0: ; %entry ; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; CHECK-NEXT: s_mov_b64 s[4:5], 0 -; CHECK-NEXT: .LBB3_1: ; %load-store-loop +; CHECK-NEXT: .LBB3_1: ; %static-memcpy-expansion-main-body ; CHECK-NEXT: ; =>This Inner Loop Header: Depth=1 ; CHECK-NEXT: s_clause 0x3e ; CHECK-NEXT: buffer_load_dword v2, v1, s[0:3], 0 offen offset:252 @@ -2392,7 +2392,7 @@ define void @memcpy_p5_p5_sz2048(ptr addrspace(5) align 1 %dst, ptr addrspace(5) ; CHECK-NEXT: v_add_nc_u32_e32 v0, 0x100, v0 ; CHECK-NEXT: s_and_b32 vcc_lo, exec_lo, s6 ; CHECK-NEXT: s_cbranch_vccnz .LBB3_1 -; CHECK-NEXT: ; %bb.2: ; %memcpy-split +; CHECK-NEXT: ; %bb.2: ; %static-memcpy-post-expansion ; CHECK-NEXT: s_setpc_b64 s[30:31] ; ; ALIGNED-LABEL: memcpy_p5_p5_sz2048: @@ -2447,7 +2447,7 @@ define void @memcpy_p5_p5_sz2048(ptr addrspace(5) align 1 %dst, ptr addrspace(5) ; ALIGNED-NEXT: buffer_store_dword v125, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill ; ALIGNED-NEXT: buffer_store_dword v126, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill ; ALIGNED-NEXT: buffer_store_dword v127, off, s[0:3], s32 ; 4-byte Folded Spill -; ALIGNED-NEXT: .LBB3_1: ; %load-store-loop +; ALIGNED-NEXT: .LBB3_1: ; %static-memcpy-expansion-main-body ; ALIGNED-NEXT: ; =>This Inner Loop Header: Depth=1 ; ALIGNED-NEXT: s_clause 0x34 ; ALIGNED-NEXT: buffer_load_ubyte v116, v1, s[0:3], 0 offen offset:255 @@ -3495,7 +3495,7 @@ define void @memcpy_p5_p5_sz2048(ptr addrspace(5) align 1 %dst, ptr addrspace(5) ; ALIGNED-NEXT: buffer_store_byte v2, v0, s[0:3], 0 offen ; ALIGNED-NEXT: v_add_nc_u32_e32 v0, 0x100, v0 ; ALIGNED-NEXT: s_cbranch_vccnz 
.LBB3_1 -; ALIGNED-NEXT: ; %bb.2: ; %memcpy-split +; ALIGNED-NEXT: ; %bb.2: ; %static-memcpy-post-expansion ; ALIGNED-NEXT: s_clause 0x2f ; 192-byte Folded Reload ; ALIGNED-NEXT: buffer_load_dword v127, off, s[0:3], s32 ; ALIGNED-NEXT: buffer_load_dword v126, off, s[0:3], s32 offset:4 @@ -3554,7 +3554,7 @@ define void @memcpy_p5_p5_sz2048(ptr addrspace(5) align 1 %dst, ptr addrspace(5) ; UNROLL3-NEXT: v_mov_b32_e32 v2, v1 ; UNROLL3-NEXT: v_mov_b32_e32 v3, v0 ; UNROLL3-NEXT: s_mov_b64 s[4:5], 0 -; UNROLL3-NEXT: .LBB3_1: ; %load-store-loop +; UNROLL3-NEXT: .LBB3_1: ; %static-memcpy-expansion-main-body ; UNROLL3-NEXT: ; =>This Inner Loop Header: Depth=1 ; UNROLL3-NEXT: s_clause 0xb ; UNROLL3-NEXT: buffer_load_dword v4, v2, s[0:3], 0 offen offset:44 @@ -3600,7 +3600,7 @@ define void @memcpy_p5_p5_sz2048(ptr addrspace(5) align 1 %dst, ptr addrspace(5) ; UNROLL3-NEXT: v_add_nc_u32_e32 v3, 48, v3 ; UNROLL3-NEXT: s_and_b32 vcc_lo, exec_lo, s6 ; UNROLL3-NEXT: s_cbranch_vccnz .LBB3_1 -; UNROLL3-NEXT: ; %bb.2: ; %memcpy-split +; UNROLL3-NEXT: ; %bb.2: ; %static-memcpy-post-expansion ; UNROLL3-NEXT: s_clause 0x3 ; UNROLL3-NEXT: buffer_load_dword v2, v1, s[0:3], 0 offen offset:2028 ; UNROLL3-NEXT: buffer_load_dword v3, v1, s[0:3], 0 offen offset:2024 @@ -3638,7 +3638,7 @@ define void @memcpy_p0_p5_sz2048(ptr addrspace(0) align 1 %dst, ptr addrspace(5) ; CHECK: ; %bb.0: ; %entry ; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; CHECK-NEXT: s_mov_b64 s[4:5], 0 -; CHECK-NEXT: .LBB4_1: ; %load-store-loop +; CHECK-NEXT: .LBB4_1: ; %static-memcpy-expansion-main-body ; CHECK-NEXT: ; =>This Inner Loop Header: Depth=1 ; CHECK-NEXT: s_clause 0x3e ; CHECK-NEXT: buffer_load_dword v3, v2, s[0:3], 0 offen offset:32 @@ -3741,7 +3741,7 @@ define void @memcpy_p0_p5_sz2048(ptr addrspace(0) align 1 %dst, ptr addrspace(5) ; CHECK-NEXT: flat_store_dwordx4 v[100:101], v[84:87] ; CHECK-NEXT: s_and_b32 vcc_lo, exec_lo, s6 ; CHECK-NEXT: s_cbranch_vccnz .LBB4_1 -; CHECK-NEXT: ; %bb.2: ; 
%memcpy-split +; CHECK-NEXT: ; %bb.2: ; %static-memcpy-post-expansion ; CHECK-NEXT: s_waitcnt lgkmcnt(0) ; CHECK-NEXT: s_setpc_b64 s[30:31] ; @@ -3799,7 +3799,7 @@ define void @memcpy_p0_p5_sz2048(ptr addrspace(0) align 1 %dst, ptr addrspace(5) ; ALIGNED-NEXT: buffer_store_dword v127, off, s[0:3], s32 ; 4-byte Folded Spill ; ALIGNED-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:1224 ; 4-byte Folded Spill ; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1228 ; 4-byte Folded Spill -; ALIGNED-NEXT: .LBB4_1: ; %load-store-loop +; ALIGNED-NEXT: .LBB4_1: ; %static-memcpy-expansion-main-body ; ALIGNED-NEXT: ; =>This Inner Loop Header: Depth=1 ; ALIGNED-NEXT: s_clause 0x3e ; ALIGNED-NEXT: buffer_load_ubyte v0, v2, s[0:3], 0 offen offset:20 @@ -5282,7 +5282,7 @@ define void @memcpy_p0_p5_sz2048(ptr addrspace(0) align 1 %dst, ptr addrspace(5) ; ALIGNED-NEXT: s_waitcnt vmcnt(0) ; ALIGNED-NEXT: flat_store_byte v[3:4], v0 ; ALIGNED-NEXT: s_cbranch_vccnz .LBB4_1 -; ALIGNED-NEXT: ; %bb.2: ; %memcpy-split +; ALIGNED-NEXT: ; %bb.2: ; %static-memcpy-post-expansion ; ALIGNED-NEXT: s_clause 0x2f ; 192-byte Folded Reload ; ALIGNED-NEXT: buffer_load_dword v127, off, s[0:3], s32 ; ALIGNED-NEXT: buffer_load_dword v126, off, s[0:3], s32 offset:4 @@ -5342,7 +5342,7 @@ define void @memcpy_p0_p5_sz2048(ptr addrspace(0) align 1 %dst, ptr addrspace(5) ; UNROLL3-NEXT: s_mov_b64 s[4:5], 0 ; UNROLL3-NEXT: s_inst_prefetch 0x1 ; UNROLL3-NEXT: .p2align 6 -; UNROLL3-NEXT: .LBB4_1: ; %load-store-loop +; UNROLL3-NEXT: .LBB4_1: ; %static-memcpy-expansion-main-body ; UNROLL3-NEXT: ; =>This Inner Loop Header: Depth=1 ; UNROLL3-NEXT: s_clause 0xb ; UNROLL3-NEXT: buffer_load_dword v4, v3, s[0:3], 0 offen @@ -5370,7 +5370,7 @@ define void @memcpy_p0_p5_sz2048(ptr addrspace(0) align 1 %dst, ptr addrspace(5) ; UNROLL3-NEXT: flat_store_dwordx4 v[16:17], v[12:15] offset:32 ; UNROLL3-NEXT: s_and_b32 vcc_lo, exec_lo, s6 ; UNROLL3-NEXT: s_cbranch_vccnz .LBB4_1 -; UNROLL3-NEXT: ; %bb.2: ; 
%memcpy-split +; UNROLL3-NEXT: ; %bb.2: ; %static-memcpy-post-expansion ; UNROLL3-NEXT: s_inst_prefetch 0x2 ; UNROLL3-NEXT: s_clause 0x3 ; UNROLL3-NEXT: buffer_load_dword v3, v2, s[0:3], 0 offen offset:2016 diff --git a/llvm/test/CodeGen/AMDGPU/memmove-var-size.ll b/llvm/test/CodeGen/AMDGPU/memmove-var-size.ll index 953511db10b29..d95965caa81ab 100644 --- a/llvm/test/CodeGen/AMDGPU/memmove-var-size.ll +++ b/llvm/test/CodeGen/AMDGPU/memmove-var-size.ll @@ -1051,10 +1051,10 @@ define void @memmove_p1_p3(ptr addrspace(1) align 1 %dst, ptr addrspace(3) align ; CHECK-NEXT: s_mov_b32 s6, exec_lo ; CHECK-NEXT: v_cmpx_ne_u64_e32 0, v[7:8] ; CHECK-NEXT: s_cbranch_execz .LBB7_3 -; CHECK-NEXT: ; %bb.1: ; %loop-memcpy-expansion.preheader +; CHECK-NEXT: ; %bb.1: ; %dynamic-memcpy-expansion-main-body.preheader ; CHECK-NEXT: v_mov_b32_e32 v9, v2 ; CHECK-NEXT: s_mov_b32 s7, 0 -; CHECK-NEXT: .LBB7_2: ; %loop-memcpy-expansion +; CHECK-NEXT: .LBB7_2: ; %dynamic-memcpy-expansion-main-body ; CHECK-NEXT: ; =>This Inner Loop Header: Depth=1 ; CHECK-NEXT: ds_read_b128 v[10:13], v9 ; CHECK-NEXT: v_add_co_u32 v14, vcc_lo, v0, s4 @@ -1073,15 +1073,14 @@ define void @memmove_p1_p3(ptr addrspace(1) align 1 %dst, ptr addrspace(3) align ; CHECK-NEXT: s_mov_b64 s[4:5], 0 ; CHECK-NEXT: s_mov_b32 s6, exec_lo ; CHECK-NEXT: v_cmpx_ne_u64_e32 0, v[5:6] -; CHECK-NEXT: s_xor_b32 s6, exec_lo, s6 -; CHECK-NEXT: s_cbranch_execz .LBB7_7 -; CHECK-NEXT: ; %bb.4: ; %loop-memcpy-residual.preheader +; CHECK-NEXT: s_cbranch_execz .LBB7_6 +; CHECK-NEXT: ; %bb.4: ; %dynamic-memcpy-expansion-residual-body.preheader ; CHECK-NEXT: v_and_b32_e32 v3, -16, v3 ; CHECK-NEXT: s_mov_b32 s7, 0 ; CHECK-NEXT: v_add_co_u32 v0, vcc_lo, v0, v3 ; CHECK-NEXT: v_add_nc_u32_e32 v2, v2, v3 ; CHECK-NEXT: v_add_co_ci_u32_e64 v1, null, v1, v4, vcc_lo -; CHECK-NEXT: .LBB7_5: ; %loop-memcpy-residual +; CHECK-NEXT: .LBB7_5: ; %dynamic-memcpy-expansion-residual-body ; CHECK-NEXT: ; =>This Inner Loop Header: Depth=1 ; CHECK-NEXT: ds_read_u8 
v7, v2 ; CHECK-NEXT: v_add_co_u32 v3, vcc_lo, v0, s4 @@ -1095,9 +1094,7 @@ define void @memmove_p1_p3(ptr addrspace(1) align 1 %dst, ptr addrspace(3) align ; CHECK-NEXT: global_store_byte v[3:4], v7, off ; CHECK-NEXT: s_andn2_b32 exec_lo, exec_lo, s7 ; CHECK-NEXT: s_cbranch_execnz .LBB7_5 -; CHECK-NEXT: ; %bb.6: ; %Flow -; CHECK-NEXT: s_or_b32 exec_lo, exec_lo, s7 -; CHECK-NEXT: .LBB7_7: ; %Flow7 +; CHECK-NEXT: .LBB7_6: ; %Flow7 ; CHECK-NEXT: s_or_b32 exec_lo, exec_lo, s6 ; CHECK-NEXT: s_setpc_b64 s[30:31] entry: @@ -1263,11 +1260,11 @@ define void @memmove_p1_p5(ptr addrspace(1) align 1 %dst, ptr addrspace(5) align ; CHECK-NEXT: s_mov_b32 s6, exec_lo ; CHECK-NEXT: v_cmpx_ne_u64_e32 0, v[7:8] ; CHECK-NEXT: s_cbranch_execz .LBB9_3 -; CHECK-NEXT: ; %bb.1: ; %loop-memcpy-expansion.preheader +; CHECK-NEXT: ; %bb.1: ; %dynamic-memcpy-expansion-main-body.preheader ; CHECK-NEXT: v_mov_b32_e32 v9, v2 ; CHECK-NEXT: s_mov_b32 s7, 0 ; CHECK-NEXT: .p2align 6 -; CHECK-NEXT: .LBB9_2: ; %loop-memcpy-expansion +; CHECK-NEXT: .LBB9_2: ; %dynamic-memcpy-expansion-main-body ; CHECK-NEXT: ; =>This Inner Loop Header: Depth=1 ; CHECK-NEXT: s_clause 0x3 ; CHECK-NEXT: buffer_load_dword v10, v9, s[0:3], 0 offen @@ -1290,15 +1287,14 @@ define void @memmove_p1_p5(ptr addrspace(1) align 1 %dst, ptr addrspace(5) align ; CHECK-NEXT: s_mov_b64 s[4:5], 0 ; CHECK-NEXT: s_mov_b32 s6, exec_lo ; CHECK-NEXT: v_cmpx_ne_u64_e32 0, v[5:6] -; CHECK-NEXT: s_xor_b32 s6, exec_lo, s6 -; CHECK-NEXT: s_cbranch_execz .LBB9_7 -; CHECK-NEXT: ; %bb.4: ; %loop-memcpy-residual.preheader +; CHECK-NEXT: s_cbranch_execz .LBB9_6 +; CHECK-NEXT: ; %bb.4: ; %dynamic-memcpy-expansion-residual-body.preheader ; CHECK-NEXT: v_and_b32_e32 v3, -16, v3 ; CHECK-NEXT: s_mov_b32 s7, 0 ; CHECK-NEXT: v_add_co_u32 v0, vcc_lo, v0, v3 ; CHECK-NEXT: v_add_nc_u32_e32 v2, v2, v3 ; CHECK-NEXT: v_add_co_ci_u32_e64 v1, null, v1, v4, vcc_lo -; CHECK-NEXT: .LBB9_5: ; %loop-memcpy-residual +; CHECK-NEXT: .LBB9_5: ; 
%dynamic-memcpy-expansion-residual-body ; CHECK-NEXT: ; =>This Inner Loop Header: Depth=1 ; CHECK-NEXT: buffer_load_ubyte v7, v2, s[0:3], 0 offen ; CHECK-NEXT: v_add_co_u32 v3, vcc_lo, v0, s4 @@ -1312,9 +1308,7 @@ define void @memmove_p1_p5(ptr addrspace(1) align 1 %dst, ptr addrspace(5) align ; CHECK-NEXT: global_store_byte v[3:4], v7, off ; CHECK-NEXT: s_andn2_b32 exec_lo, exec_lo, s7 ; CHECK-NEXT: s_cbranch_execnz .LBB9_5 -; CHECK-NEXT: ; %bb.6: ; %Flow -; CHECK-NEXT: s_or_b32 exec_lo, exec_lo, s7 -; CHECK-NEXT: .LBB9_7: ; %Flow7 +; CHECK-NEXT: .LBB9_6: ; %Flow7 ; CHECK-NEXT: s_or_b32 exec_lo, exec_lo, s6 ; CHECK-NEXT: s_setpc_b64 s[30:31] entry: @@ -1479,10 +1473,10 @@ define void @memmove_p3_p1(ptr addrspace(3) align 1 %dst, ptr addrspace(1) align ; CHECK-NEXT: s_mov_b32 s6, exec_lo ; CHECK-NEXT: v_cmpx_ne_u64_e32 0, v[7:8] ; CHECK-NEXT: s_cbranch_execz .LBB11_3 -; CHECK-NEXT: ; %bb.1: ; %loop-memcpy-expansion.preheader +; CHECK-NEXT: ; %bb.1: ; %dynamic-memcpy-expansion-main-body.preheader ; CHECK-NEXT: v_mov_b32_e32 v9, v0 ; CHECK-NEXT: s_mov_b32 s7, 0 -; CHECK-NEXT: .LBB11_2: ; %loop-memcpy-expansion +; CHECK-NEXT: .LBB11_2: ; %dynamic-memcpy-expansion-main-body ; CHECK-NEXT: ; =>This Inner Loop Header: Depth=1 ; CHECK-NEXT: v_add_co_u32 v10, vcc_lo, v1, s4 ; CHECK-NEXT: v_add_co_ci_u32_e64 v11, null, s5, v2, vcc_lo @@ -1501,15 +1495,14 @@ define void @memmove_p3_p1(ptr addrspace(3) align 1 %dst, ptr addrspace(1) align ; CHECK-NEXT: s_mov_b64 s[4:5], 0 ; CHECK-NEXT: s_mov_b32 s6, exec_lo ; CHECK-NEXT: v_cmpx_ne_u64_e32 0, v[5:6] -; CHECK-NEXT: s_xor_b32 s6, exec_lo, s6 -; CHECK-NEXT: s_cbranch_execz .LBB11_7 -; CHECK-NEXT: ; %bb.4: ; %loop-memcpy-residual.preheader +; CHECK-NEXT: s_cbranch_execz .LBB11_6 +; CHECK-NEXT: ; %bb.4: ; %dynamic-memcpy-expansion-residual-body.preheader ; CHECK-NEXT: v_and_b32_e32 v3, -16, v3 ; CHECK-NEXT: s_mov_b32 s7, 0 ; CHECK-NEXT: v_add_co_u32 v1, vcc_lo, v1, v3 ; CHECK-NEXT: v_add_nc_u32_e32 v0, v0, v3 ; CHECK-NEXT: 
v_add_co_ci_u32_e64 v2, null, v2, v4, vcc_lo -; CHECK-NEXT: .LBB11_5: ; %loop-memcpy-residual +; CHECK-NEXT: .LBB11_5: ; %dynamic-memcpy-expansion-residual-body ; CHECK-NEXT: ; =>This Inner Loop Header: Depth=1 ; CHECK-NEXT: v_add_co_u32 v3, vcc_lo, v1, s4 ; CHECK-NEXT: v_add_co_ci_u32_e64 v4, null, s5, v2, vcc_lo @@ -1523,9 +1516,7 @@ define void @memmove_p3_p1(ptr addrspace(3) align 1 %dst, ptr addrspace(1) align ; CHECK-NEXT: v_add_nc_u32_e32 v0, 1, v0 ; CHECK-NEXT: s_andn2_b32 exec_lo, exec_lo, s7 ; CHECK-NEXT: s_cbranch_execnz .LBB11_5 -; CHECK-NEXT: ; %bb.6: ; %Flow -; CHECK-NEXT: s_or_b32 exec_lo, exec_lo, s7 -; CHECK-NEXT: .LBB11_7: ; %Flow7 +; CHECK-NEXT: .LBB11_6: ; %Flow7 ; CHECK-NEXT: s_or_b32 exec_lo, exec_lo, s6 ; CHECK-NEXT: s_waitcnt lgkmcnt(0) ; CHECK-NEXT: s_setpc_b64 s[30:31] @@ -1673,10 +1664,10 @@ define void @memmove_p3_p4(ptr addrspace(3) align 1 %dst, ptr addrspace(4) align ; CHECK-NEXT: s_mov_b32 s6, exec_lo ; CHECK-NEXT: v_cmpx_ne_u64_e32 0, v[7:8] ; CHECK-NEXT: s_cbranch_execz .LBB13_3 -; CHECK-NEXT: ; %bb.1: ; %loop-memcpy-expansion.preheader +; CHECK-NEXT: ; %bb.1: ; %dynamic-memcpy-expansion-main-body.preheader ; CHECK-NEXT: v_mov_b32_e32 v9, v0 ; CHECK-NEXT: s_mov_b32 s7, 0 -; CHECK-NEXT: .LBB13_2: ; %loop-memcpy-expansion +; CHECK-NEXT: .LBB13_2: ; %dynamic-memcpy-expansion-main-body ; CHECK-NEXT: ; =>This Inner Loop Header: Depth=1 ; CHECK-NEXT: v_add_co_u32 v10, vcc_lo, v1, s4 ; CHECK-NEXT: v_add_co_ci_u32_e64 v11, null, s5, v2, vcc_lo @@ -1695,15 +1686,14 @@ define void @memmove_p3_p4(ptr addrspace(3) align 1 %dst, ptr addrspace(4) align ; CHECK-NEXT: s_mov_b64 s[4:5], 0 ; CHECK-NEXT: s_mov_b32 s6, exec_lo ; CHECK-NEXT: v_cmpx_ne_u64_e32 0, v[5:6] -; CHECK-NEXT: s_xor_b32 s6, exec_lo, s6 -; CHECK-NEXT: s_cbranch_execz .LBB13_7 -; CHECK-NEXT: ; %bb.4: ; %loop-memcpy-residual.preheader +; CHECK-NEXT: s_cbranch_execz .LBB13_6 +; CHECK-NEXT: ; %bb.4: ; %dynamic-memcpy-expansion-residual-body.preheader ; CHECK-NEXT: v_and_b32_e32 v3, 
-16, v3 ; CHECK-NEXT: s_mov_b32 s7, 0 ; CHECK-NEXT: v_add_co_u32 v1, vcc_lo, v1, v3 ; CHECK-NEXT: v_add_nc_u32_e32 v0, v0, v3 ; CHECK-NEXT: v_add_co_ci_u32_e64 v2, null, v2, v4, vcc_lo -; CHECK-NEXT: .LBB13_5: ; %loop-memcpy-residual +; CHECK-NEXT: .LBB13_5: ; %dynamic-memcpy-expansion-residual-body ; CHECK-NEXT: ; =>This Inner Loop Header: Depth=1 ; CHECK-NEXT: v_add_co_u32 v3, vcc_lo, v1, s4 ; CHECK-NEXT: v_add_co_ci_u32_e64 v4, null, s5, v2, vcc_lo @@ -1717,9 +1707,7 @@ define void @memmove_p3_p4(ptr addrspace(3) align 1 %dst, ptr addrspace(4) align ; CHECK-NEXT: v_add_nc_u32_e32 v0, 1, v0 ; CHECK-NEXT: s_andn2_b32 exec_lo, exec_lo, s7 ; CHECK-NEXT: s_cbranch_execnz .LBB13_5 -; CHECK-NEXT: ; %bb.6: ; %Flow -; CHECK-NEXT: s_or_b32 exec_lo, exec_lo, s7 -; CHECK-NEXT: .LBB13_7: ; %Flow7 +; CHECK-NEXT: .LBB13_6: ; %Flow7 ; CHECK-NEXT: s_or_b32 exec_lo, exec_lo, s6 ; CHECK-NEXT: s_waitcnt lgkmcnt(0) ; CHECK-NEXT: s_setpc_b64 s[30:31] @@ -1740,12 +1728,12 @@ define void @memmove_p3_p5(ptr addrspace(3) align 1 %dst, ptr addrspace(5) align ; CHECK-NEXT: v_and_b32_e32 v5, 15, v4 ; CHECK-NEXT: v_cmpx_ne_u64_e32 0, v[2:3] ; CHECK-NEXT: s_cbranch_execz .LBB14_3 -; CHECK-NEXT: ; %bb.1: ; %loop-memcpy-expansion.preheader +; CHECK-NEXT: ; %bb.1: ; %dynamic-memcpy-expansion-main-body.preheader ; CHECK-NEXT: v_mov_b32_e32 v7, v1 ; CHECK-NEXT: v_mov_b32_e32 v8, v0 ; CHECK-NEXT: s_mov_b32 s7, 0 ; CHECK-NEXT: .p2align 6 -; CHECK-NEXT: .LBB14_2: ; %loop-memcpy-expansion +; CHECK-NEXT: .LBB14_2: ; %dynamic-memcpy-expansion-main-body ; CHECK-NEXT: ; =>This Inner Loop Header: Depth=1 ; CHECK-NEXT: s_clause 0x3 ; CHECK-NEXT: buffer_load_dword v9, v7, s[0:3], 0 offen @@ -1767,14 +1755,13 @@ define void @memmove_p3_p5(ptr addrspace(3) align 1 %dst, ptr addrspace(5) align ; CHECK-NEXT: s_mov_b64 s[4:5], 0 ; CHECK-NEXT: s_mov_b32 s6, exec_lo ; CHECK-NEXT: v_cmpx_ne_u64_e32 0, v[5:6] -; CHECK-NEXT: s_xor_b32 s6, exec_lo, s6 -; CHECK-NEXT: s_cbranch_execz .LBB14_7 -; CHECK-NEXT: ; %bb.4: ; 
%loop-memcpy-residual.preheader +; CHECK-NEXT: s_cbranch_execz .LBB14_6 +; CHECK-NEXT: ; %bb.4: ; %dynamic-memcpy-expansion-residual-body.preheader ; CHECK-NEXT: v_and_b32_e32 v2, -16, v4 ; CHECK-NEXT: s_mov_b32 s7, 0 ; CHECK-NEXT: v_add_nc_u32_e32 v0, v0, v2 ; CHECK-NEXT: v_add_nc_u32_e32 v1, v1, v2 -; CHECK-NEXT: .LBB14_5: ; %loop-memcpy-residual +; CHECK-NEXT: .LBB14_5: ; %dynamic-memcpy-expansion-residual-body ; CHECK-NEXT: ; =>This Inner Loop Header: Depth=1 ; CHECK-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen ; CHECK-NEXT: s_add_u32 s4, s4, 1 @@ -1787,9 +1774,7 @@ define void @memmove_p3_p5(ptr addrspace(3) align 1 %dst, ptr addrspace(5) align ; CHECK-NEXT: v_add_nc_u32_e32 v0, 1, v0 ; CHECK-NEXT: s_andn2_b32 exec_lo, exec_lo, s7 ; CHECK-NEXT: s_cbranch_execnz .LBB14_5 -; CHECK-NEXT: ; %bb.6: ; %Flow -; CHECK-NEXT: s_or_b32 exec_lo, exec_lo, s7 -; CHECK-NEXT: .LBB14_7: ; %Flow12 +; CHECK-NEXT: .LBB14_6: ; %Flow12 ; CHECK-NEXT: s_or_b32 exec_lo, exec_lo, s6 ; CHECK-NEXT: s_waitcnt lgkmcnt(0) ; CHECK-NEXT: s_setpc_b64 s[30:31] @@ -1959,11 +1944,11 @@ define void @memmove_p5_p1(ptr addrspace(5) align 1 %dst, ptr addrspace(1) align ; CHECK-NEXT: s_mov_b32 s6, exec_lo ; CHECK-NEXT: v_cmpx_ne_u64_e32 0, v[7:8] ; CHECK-NEXT: s_cbranch_execz .LBB16_3 -; CHECK-NEXT: ; %bb.1: ; %loop-memcpy-expansion.preheader +; CHECK-NEXT: ; %bb.1: ; %dynamic-memcpy-expansion-main-body.preheader ; CHECK-NEXT: v_mov_b32_e32 v9, v0 ; CHECK-NEXT: s_mov_b32 s7, 0 ; CHECK-NEXT: .p2align 6 -; CHECK-NEXT: .LBB16_2: ; %loop-memcpy-expansion +; CHECK-NEXT: .LBB16_2: ; %dynamic-memcpy-expansion-main-body ; CHECK-NEXT: ; =>This Inner Loop Header: Depth=1 ; CHECK-NEXT: v_add_co_u32 v10, vcc_lo, v1, s4 ; CHECK-NEXT: v_add_co_ci_u32_e64 v11, null, s5, v2, vcc_lo @@ -1985,15 +1970,14 @@ define void @memmove_p5_p1(ptr addrspace(5) align 1 %dst, ptr addrspace(1) align ; CHECK-NEXT: s_mov_b64 s[4:5], 0 ; CHECK-NEXT: s_mov_b32 s6, exec_lo ; CHECK-NEXT: v_cmpx_ne_u64_e32 0, v[5:6] -; CHECK-NEXT: 
s_xor_b32 s6, exec_lo, s6 -; CHECK-NEXT: s_cbranch_execz .LBB16_7 -; CHECK-NEXT: ; %bb.4: ; %loop-memcpy-residual.preheader +; CHECK-NEXT: s_cbranch_execz .LBB16_6 +; CHECK-NEXT: ; %bb.4: ; %dynamic-memcpy-expansion-residual-body.preheader ; CHECK-NEXT: v_and_b32_e32 v3, -16, v3 ; CHECK-NEXT: s_mov_b32 s7, 0 ; CHECK-NEXT: v_add_co_u32 v1, vcc_lo, v1, v3 ; CHECK-NEXT: v_add_nc_u32_e32 v0, v0, v3 ; CHECK-NEXT: v_add_co_ci_u32_e64 v2, null, v2, v4, vcc_lo -; CHECK-NEXT: .LBB16_5: ; %loop-memcpy-residual +; CHECK-NEXT: .LBB16_5: ; %dynamic-memcpy-expansion-residual-body ; CHECK-NEXT: ; =>This Inner Loop Header: Depth=1 ; CHECK-NEXT: v_add_co_u32 v3, vcc_lo, v1, s4 ; CHECK-NEXT: v_add_co_ci_u32_e64 v4, null, s5, v2, vcc_lo @@ -2007,9 +1991,7 @@ define void @memmove_p5_p1(ptr addrspace(5) align 1 %dst, ptr addrspace(1) align ; CHECK-NEXT: v_add_nc_u32_e32 v0, 1, v0 ; CHECK-NEXT: s_andn2_b32 exec_lo, exec_lo, s7 ; CHECK-NEXT: s_cbranch_execnz .LBB16_5 -; CHECK-NEXT: ; %bb.6: ; %Flow -; CHECK-NEXT: s_or_b32 exec_lo, exec_lo, s7 -; CHECK-NEXT: .LBB16_7: ; %Flow7 +; CHECK-NEXT: .LBB16_6: ; %Flow7 ; CHECK-NEXT: s_or_b32 exec_lo, exec_lo, s6 ; CHECK-NEXT: s_setpc_b64 s[30:31] entry: @@ -2029,12 +2011,12 @@ define void @memmove_p5_p3(ptr addrspace(5) align 1 %dst, ptr addrspace(3) align ; CHECK-NEXT: v_and_b32_e32 v5, 15, v4 ; CHECK-NEXT: v_cmpx_ne_u64_e32 0, v[2:3] ; CHECK-NEXT: s_cbranch_execz .LBB17_3 -; CHECK-NEXT: ; %bb.1: ; %loop-memcpy-expansion.preheader +; CHECK-NEXT: ; %bb.1: ; %dynamic-memcpy-expansion-main-body.preheader ; CHECK-NEXT: v_mov_b32_e32 v7, v1 ; CHECK-NEXT: v_mov_b32_e32 v8, v0 ; CHECK-NEXT: s_mov_b32 s7, 0 ; CHECK-NEXT: .p2align 6 -; CHECK-NEXT: .LBB17_2: ; %loop-memcpy-expansion +; CHECK-NEXT: .LBB17_2: ; %dynamic-memcpy-expansion-main-body ; CHECK-NEXT: ; =>This Inner Loop Header: Depth=1 ; CHECK-NEXT: ds_read_b128 v[9:12], v7 ; CHECK-NEXT: s_add_u32 s4, s4, 16 @@ -2055,14 +2037,13 @@ define void @memmove_p5_p3(ptr addrspace(5) align 1 %dst, ptr 
addrspace(3) align ; CHECK-NEXT: s_mov_b64 s[4:5], 0 ; CHECK-NEXT: s_mov_b32 s6, exec_lo ; CHECK-NEXT: v_cmpx_ne_u64_e32 0, v[5:6] -; CHECK-NEXT: s_xor_b32 s6, exec_lo, s6 -; CHECK-NEXT: s_cbranch_execz .LBB17_7 -; CHECK-NEXT: ; %bb.4: ; %loop-memcpy-residual.preheader +; CHECK-NEXT: s_cbranch_execz .LBB17_6 +; CHECK-NEXT: ; %bb.4: ; %dynamic-memcpy-expansion-residual-body.preheader ; CHECK-NEXT: v_and_b32_e32 v2, -16, v4 ; CHECK-NEXT: s_mov_b32 s7, 0 ; CHECK-NEXT: v_add_nc_u32_e32 v0, v0, v2 ; CHECK-NEXT: v_add_nc_u32_e32 v1, v1, v2 -; CHECK-NEXT: .LBB17_5: ; %loop-memcpy-residual +; CHECK-NEXT: .LBB17_5: ; %dynamic-memcpy-expansion-residual-body ; CHECK-NEXT: ; =>This Inner Loop Header: Depth=1 ; CHECK-NEXT: ds_read_u8 v2, v1 ; CHECK-NEXT: s_add_u32 s4, s4, 1 @@ -2075,9 +2056,7 @@ define void @memmove_p5_p3(ptr addrspace(5) align 1 %dst, ptr addrspace(3) align ; CHECK-NEXT: v_add_nc_u32_e32 v0, 1, v0 ; CHECK-NEXT: s_andn2_b32 exec_lo, exec_lo, s7 ; CHECK-NEXT: s_cbranch_execnz .LBB17_5 -; CHECK-NEXT: ; %bb.6: ; %Flow -; CHECK-NEXT: s_or_b32 exec_lo, exec_lo, s7 -; CHECK-NEXT: .LBB17_7: ; %Flow12 +; CHECK-NEXT: .LBB17_6: ; %Flow12 ; CHECK-NEXT: s_or_b32 exec_lo, exec_lo, s6 ; CHECK-NEXT: s_setpc_b64 s[30:31] entry: @@ -2097,11 +2076,11 @@ define void @memmove_p5_p4(ptr addrspace(5) align 1 %dst, ptr addrspace(4) align ; CHECK-NEXT: s_mov_b32 s6, exec_lo ; CHECK-NEXT: v_cmpx_ne_u64_e32 0, v[7:8] ; CHECK-NEXT: s_cbranch_execz .LBB18_3 -; CHECK-NEXT: ; %bb.1: ; %loop-memcpy-expansion.preheader +; CHECK-NEXT: ; %bb.1: ; %dynamic-memcpy-expansion-main-body.preheader ; CHECK-NEXT: v_mov_b32_e32 v9, v0 ; CHECK-NEXT: s_mov_b32 s7, 0 ; CHECK-NEXT: .p2align 6 -; CHECK-NEXT: .LBB18_2: ; %loop-memcpy-expansion +; CHECK-NEXT: .LBB18_2: ; %dynamic-memcpy-expansion-main-body ; CHECK-NEXT: ; =>This Inner Loop Header: Depth=1 ; CHECK-NEXT: v_add_co_u32 v10, vcc_lo, v1, s4 ; CHECK-NEXT: v_add_co_ci_u32_e64 v11, null, s5, v2, vcc_lo @@ -2123,15 +2102,14 @@ define void 
@memmove_p5_p4(ptr addrspace(5) align 1 %dst, ptr addrspace(4) align ; CHECK-NEXT: s_mov_b64 s[4:5], 0 ; CHECK-NEXT: s_mov_b32 s6, exec_lo ; CHECK-NEXT: v_cmpx_ne_u64_e32 0, v[5:6] -; CHECK-NEXT: s_xor_b32 s6, exec_lo, s6 -; CHECK-NEXT: s_cbranch_execz .LBB18_7 -; CHECK-NEXT: ; %bb.4: ; %loop-memcpy-residual.preheader +; CHECK-NEXT: s_cbranch_execz .LBB18_6 +; CHECK-NEXT: ; %bb.4: ; %dynamic-memcpy-expansion-residual-body.preheader ; CHECK-NEXT: v_and_b32_e32 v3, -16, v3 ; CHECK-NEXT: s_mov_b32 s7, 0 ; CHECK-NEXT: v_add_co_u32 v1, vcc_lo, v1, v3 ; CHECK-NEXT: v_add_nc_u32_e32 v0, v0, v3 ; CHECK-NEXT: v_add_co_ci_u32_e64 v2, null, v2, v4, vcc_lo -; CHECK-NEXT: .LBB18_5: ; %loop-memcpy-residual +; CHECK-NEXT: .LBB18_5: ; %dynamic-memcpy-expansion-residual-body ; CHECK-NEXT: ; =>This Inner Loop Header: Depth=1 ; CHECK-NEXT: v_add_co_u32 v3, vcc_lo, v1, s4 ; CHECK-NEXT: v_add_co_ci_u32_e64 v4, null, s5, v2, vcc_lo @@ -2145,9 +2123,7 @@ define void @memmove_p5_p4(ptr addrspace(5) align 1 %dst, ptr addrspace(4) align ; CHECK-NEXT: v_add_nc_u32_e32 v0, 1, v0 ; CHECK-NEXT: s_andn2_b32 exec_lo, exec_lo, s7 ; CHECK-NEXT: s_cbranch_execnz .LBB18_5 -; CHECK-NEXT: ; %bb.6: ; %Flow -; CHECK-NEXT: s_or_b32 exec_lo, exec_lo, s7 -; CHECK-NEXT: .LBB18_7: ; %Flow7 +; CHECK-NEXT: .LBB18_6: ; %Flow7 ; CHECK-NEXT: s_or_b32 exec_lo, exec_lo, s6 ; CHECK-NEXT: s_setpc_b64 s[30:31] entry: diff --git a/llvm/test/CodeGen/NVPTX/lower-aggr-copies.ll b/llvm/test/CodeGen/NVPTX/lower-aggr-copies.ll index 297b2b984cdae..ad78e0fe7438b 100644 --- a/llvm/test/CodeGen/NVPTX/lower-aggr-copies.ll +++ b/llvm/test/CodeGen/NVPTX/lower-aggr-copies.ll @@ -20,19 +20,19 @@ entry: ; IR-LABEL: @memcpy_caller ; IR: entry: ; IR: [[Cond:%[0-9]+]] = icmp ne i64 %n, 0 -; IR: br i1 [[Cond]], label %loop-memcpy-expansion, label %post-loop-memcpy-expansion +; IR: br i1 [[Cond]], label %dynamic-memcpy-expansion-main-body, label %dynamic-memcpy-post-expansion -; IR: loop-memcpy-expansion: -; IR: %loop-index = phi i64 [ 0, 
%entry ], [ [[IndexInc:%[0-9]+]], %loop-memcpy-expansion ] +; IR: dynamic-memcpy-expansion-main-body: +; IR: %loop-index = phi i64 [ 0, %entry ], [ [[IndexInc:%[0-9]+]], %dynamic-memcpy-expansion-main-body ] ; IR: [[SrcGep:%[0-9]+]] = getelementptr inbounds i8, ptr %src, i64 %loop-index ; IR: [[Load:%[0-9]+]] = load i8, ptr [[SrcGep]] ; IR: [[DstGep:%[0-9]+]] = getelementptr inbounds i8, ptr %dst, i64 %loop-index ; IR: store i8 [[Load]], ptr [[DstGep]] ; IR: [[IndexInc]] = add i64 %loop-index, 1 ; IR: [[Cond2:%[0-9]+]] = icmp ult i64 [[IndexInc]], %n -; IR: br i1 [[Cond2]], label %loop-memcpy-expansion, label %post-loop-memcpy-expansion +; IR: br i1 [[Cond2]], label %dynamic-memcpy-expansion-main-body, label %dynamic-memcpy-post-expansion -; IR-LABEL: post-loop-memcpy-expansion: +; IR-LABEL: dynamic-memcpy-post-expansion: ; IR: ret ptr %dst ; PTX-LABEL: .visible .func (.param .b64 func_retval0) memcpy_caller @@ -53,19 +53,19 @@ entry: ; IR-LABEL: @memcpy_volatile_caller ; IR: entry: ; IR: [[Cond:%[0-9]+]] = icmp ne i64 %n, 0 -; IR: br i1 [[Cond]], label %loop-memcpy-expansion, label %post-loop-memcpy-expansion +; IR: br i1 [[Cond]], label %dynamic-memcpy-expansion-main-body, label %dynamic-memcpy-post-expansion -; IR: loop-memcpy-expansion: -; IR: %loop-index = phi i64 [ 0, %entry ], [ [[IndexInc:%[0-9]+]], %loop-memcpy-expansion ] +; IR: dynamic-memcpy-expansion-main-body: +; IR: %loop-index = phi i64 [ 0, %entry ], [ [[IndexInc:%[0-9]+]], %dynamic-memcpy-expansion-main-body ] ; IR: [[SrcGep:%[0-9]+]] = getelementptr inbounds i8, ptr %src, i64 %loop-index ; IR: [[Load:%[0-9]+]] = load volatile i8, ptr [[SrcGep]] ; IR: [[DstGep:%[0-9]+]] = getelementptr inbounds i8, ptr %dst, i64 %loop-index ; IR: store volatile i8 [[Load]], ptr [[DstGep]] ; IR: [[IndexInc]] = add i64 %loop-index, 1 ; IR: [[Cond2:%[0-9]+]] = icmp ult i64 [[IndexInc]], %n -; IR: br i1 [[Cond2]], label %loop-memcpy-expansion, label %post-loop-memcpy-expansion +; IR: br i1 [[Cond2]], label 
%dynamic-memcpy-expansion-main-body, label %dynamic-memcpy-post-expansion -; IR-LABEL: post-loop-memcpy-expansion: +; IR-LABEL: dynamic-memcpy-post-expansion: ; IR: ret ptr %dst @@ -97,16 +97,16 @@ entry: ; Check that calls with compile-time constant size are handled correctly ; IR-LABEL: @memcpy_known_size ; IR: entry: -; IR: br label %load-store-loop -; IR: load-store-loop: -; IR: %loop-index = phi i64 [ 0, %entry ], [ [[IndexInc:%[0-9]+]], %load-store-loop ] +; IR: br label %static-memcpy-expansion-main-body +; IR: static-memcpy-expansion-main-body: +; IR: %loop-index = phi i64 [ 0, %entry ], [ [[IndexInc:%[0-9]+]], %static-memcpy-expansion-main-body ] ; IR: [[SrcGep:%[0-9]+]] = getelementptr inbounds i8, ptr %src, i64 %loop-index ; IR: [[Load:%[0-9]+]] = load i8, ptr [[SrcGep]] ; IR: [[DstGep:%[0-9]+]] = getelementptr inbounds i8, ptr %dst, i64 %loop-index ; IR: store i8 [[Load]], ptr [[DstGep]] ; IR: [[IndexInc]] = add i64 %loop-index, 1 ; IR: [[Cond:%[0-9]+]] = icmp ult i64 %3, 144 -; IR: br i1 [[Cond]], label %load-store-loop, label %memcpy-split +; IR: br i1 [[Cond]], label %static-memcpy-expansion-main-body, label %static-memcpy-post-expansion } define ptr @memset_caller(ptr %dst, i32 %c, i64 %n) #0 { diff --git a/llvm/unittests/Transforms/Utils/MemTransferLowering.cpp b/llvm/unittests/Transforms/Utils/MemTransferLowering.cpp index dd03b4f2ae971..752029e54f394 100644 --- a/llvm/unittests/Transforms/Utils/MemTransferLowering.cpp +++ b/llvm/unittests/Transforms/Utils/MemTransferLowering.cpp @@ -120,7 +120,8 @@ TEST_F(MemTransferLowerTest, MemCpyKnownLength) { MemCpyInst *MemCpyI = cast(Inst); auto &SE = FAM.getResult(F); expandMemCpyAsLoop(MemCpyI, TTI, &SE); - auto *CopyLoopBB = getBasicBlockByName(F, "load-store-loop"); + auto *CopyLoopBB = + getBasicBlockByName(F, "static-memcpy-expansion-main-body"); Instruction *LoadInst = getInstructionByOpcode(*CopyLoopBB, Instruction::Load, 1); EXPECT_NE(nullptr, LoadInst->getMetadata(LLVMContext::MD_alias_scope)); 
@@ -203,7 +204,8 @@ TEST_F(MemTransferLowerTest, AtomicMemCpyKnownLength) { AnyMemCpyInst *MemCpyI = cast(Inst); auto &SE = FAM.getResult(F); expandAtomicMemCpyAsLoop(MemCpyI, TTI, &SE); - auto *CopyLoopBB = getBasicBlockByName(F, "load-store-loop"); + auto *CopyLoopBB = + getBasicBlockByName(F, "static-memcpy-expansion-main-body"); Instruction *LoadInst = getInstructionByOpcode(*CopyLoopBB, Instruction::Load, 1); EXPECT_TRUE(LoadInst->isAtomic()); @@ -248,7 +250,8 @@ TEST_F(MemTransferLowerTest, AtomicMemCpyUnKnownLength) { auto *MemCpyI = cast(Inst); auto &SE = FAM.getResult(F); expandAtomicMemCpyAsLoop(MemCpyI, TTI, &SE); - auto *CopyLoopBB = getBasicBlockByName(F, "loop-memcpy-expansion"); + auto *CopyLoopBB = + getBasicBlockByName(F, "dynamic-memcpy-expansion-main-body"); Instruction *LoadInst = getInstructionByOpcode(*CopyLoopBB, Instruction::Load, 1); EXPECT_TRUE(LoadInst->isAtomic()); From f17abc280c708c16f622be2de2ab7d0710cc8bc1 Mon Sep 17 00:00:00 2001 From: Gil Rapaport Date: Wed, 3 Dec 2025 12:41:29 +0200 Subject: [PATCH 24/36] [mlir][emitc] Add address-of and dereference ops (#72569) EmitC currently models C's `&` and `*` operators via its `apply` op, which has several drawbacks: - Its pre-lvalue semantics combines dereferencing with memory access. - Representing multiple opcodes (selected by an attribute) in a single op complicates the code by adding a second, attribute-based selection layer on top of MLIR's standard `isa<>` mechanism. This patch adds two distinct, lvalue-based ops to model these C operators. EmitC passes were converted to use the new ops instead of `apply`, which is now deprecated. 
--- mlir/include/mlir/Dialect/EmitC/IR/EmitC.td | 58 ++++++++++++++++++- .../MemRefToEmitC/MemRefToEmitC.cpp | 14 ++--- mlir/lib/Dialect/EmitC/IR/EmitC.cpp | 29 ++++++++++ mlir/lib/Target/Cpp/TranslateToCpp.cpp | 29 ++++++++-- .../memref-to-emitc-alloc-copy.mlir | 12 ++-- .../MemRefToEmitC/memref-to-emitc-copy.mlir | 6 +- .../MemRefToEmitC/memref-to-emitc.mlir | 2 +- mlir/test/Dialect/EmitC/invalid_ops.mlir | 16 +++++ mlir/test/Dialect/EmitC/ops.mlir | 10 ++++ mlir/test/Target/Cpp/common-cpp.mlir | 19 ++++++ mlir/test/Target/Cpp/expressions.mlir | 30 ++++++++-- 11 files changed, 199 insertions(+), 26 deletions(-) diff --git a/mlir/include/mlir/Dialect/EmitC/IR/EmitC.td b/mlir/include/mlir/Dialect/EmitC/IR/EmitC.td index 4c1db58da45f0..c1820904f2665 100644 --- a/mlir/include/mlir/Dialect/EmitC/IR/EmitC.td +++ b/mlir/include/mlir/Dialect/EmitC/IR/EmitC.td @@ -116,6 +116,37 @@ def EmitC_FileOp let skipDefaultBuilders = 1; } +def EmitC_AddressOfOp : EmitC_Op<"address_of", [ + CExpressionInterface, + TypesMatchWith<"input and result reference the same type", "reference", "result", + "emitc::PointerType::get(::llvm::cast($_self).getValueType())"> +]> { + let summary = "Address operation"; + let description = [{ + This operation models the C & (address of) operator for a single operand, + which must be an emitc.lvalue, and returns an emitc pointer to its location. + + Example: + + ```mlir + // Custom form of applying the & operator. 
+ %0 = emitc.address_of %arg0 : (!emitc.lvalue) -> !emitc.ptr + ``` + }]; + let arguments = (ins EmitC_LValueType:$reference); + let results = (outs EmitC_PointerType:$result); + let assemblyFormat = [{ + $reference `:` qualified(type($reference)) attr-dict + }]; + let hasVerifier = 1; + + let extraClassDeclaration = [{ + bool hasSideEffects() { + return false; + } + }]; +} + def EmitC_AddOp : EmitC_BinaryOp<"add", []> { let summary = "Addition operation"; let description = [{ @@ -140,7 +171,7 @@ def EmitC_AddOp : EmitC_BinaryOp<"add", []> { } def EmitC_ApplyOp : EmitC_Op<"apply", [CExpressionInterface]> { - let summary = "Apply operation"; + let summary = "Deprecated (use address_of/dereference)"; let description = [{ With the `emitc.apply` operation the operators & (address of) and * (contents of) can be applied to a single operand. @@ -439,6 +470,31 @@ def EmitC_ConstantOp }]; } +def EmitC_DereferenceOp : EmitC_Op<"dereference", [ + TypesMatchWith<"input and result reference the same type", "pointer", "result", + "emitc::LValueType::get(::llvm::cast($_self).getPointee())"> +]> { + let summary = "Dereference operation"; + let description = [{ + This operation models the C * (dereference) operator, which must be of + !emitc.ptr<> type, returning an !emitc.lvalue<> the value pointed to by the + pointer. + + Example: + + ```mlir + // Custom form of the dereference operator. 
+ %0 = emitc.dereference %arg0 : (!emitc.ptr) -> !emitc.lvalue + ``` + }]; + let arguments = (ins EmitC_PointerType:$pointer); + let results = (outs EmitC_LValueType:$result); + let assemblyFormat = [{ + $pointer `:` qualified(type($pointer)) attr-dict + }]; + let hasVerifier = 1; +} + def EmitC_DivOp : EmitC_BinaryOp<"div", []> { let summary = "Division operation"; let description = [{ diff --git a/mlir/lib/Conversion/MemRefToEmitC/MemRefToEmitC.cpp b/mlir/lib/Conversion/MemRefToEmitC/MemRefToEmitC.cpp index 11f866c103639..0a382d812f362 100644 --- a/mlir/lib/Conversion/MemRefToEmitC/MemRefToEmitC.cpp +++ b/mlir/lib/Conversion/MemRefToEmitC/MemRefToEmitC.cpp @@ -122,7 +122,7 @@ static Value calculateMemrefTotalSizeBytes(Location loc, MemRefType memrefType, return totalSizeBytes.getResult(); } -static emitc::ApplyOp +static emitc::AddressOfOp createPointerFromEmitcArray(Location loc, OpBuilder &builder, TypedValue arrayValue) { @@ -133,9 +133,9 @@ createPointerFromEmitcArray(Location loc, OpBuilder &builder, llvm::SmallVector indices(arrayType.getRank(), zeroIndex); emitc::SubscriptOp subPtr = emitc::SubscriptOp::create(builder, loc, arrayValue, ValueRange(indices)); - emitc::ApplyOp ptr = emitc::ApplyOp::create( + emitc::AddressOfOp ptr = emitc::AddressOfOp::create( builder, loc, emitc::PointerType::get(arrayType.getElementType()), - builder.getStringAttr("&"), subPtr); + subPtr); return ptr; } @@ -225,12 +225,12 @@ struct ConvertCopy final : public OpConversionPattern { auto srcArrayValue = cast>(operands.getSource()); - emitc::ApplyOp srcPtr = + emitc::AddressOfOp srcPtr = createPointerFromEmitcArray(loc, rewriter, srcArrayValue); auto targetArrayValue = cast>(operands.getTarget()); - emitc::ApplyOp targetPtr = + emitc::AddressOfOp targetPtr = createPointerFromEmitcArray(loc, rewriter, targetArrayValue); emitc::CallOpaqueOp memCpyCall = emitc::CallOpaqueOp::create( @@ -319,8 +319,8 @@ struct ConvertGetGlobal final emitc::GetGlobalOp globalLValue = 
emitc::GetGlobalOp::create( rewriter, op.getLoc(), lvalueType, operands.getNameAttr()); emitc::PointerType pointerType = emitc::PointerType::get(resultTy); - rewriter.replaceOpWithNewOp( - op, pointerType, rewriter.getStringAttr("&"), globalLValue); + rewriter.replaceOpWithNewOp(op, pointerType, + globalLValue); return success(); } rewriter.replaceOpWithNewOp(op, resultTy, diff --git a/mlir/lib/Dialect/EmitC/IR/EmitC.cpp b/mlir/lib/Dialect/EmitC/IR/EmitC.cpp index d478220221f7a..b0566dd10f490 100644 --- a/mlir/lib/Dialect/EmitC/IR/EmitC.cpp +++ b/mlir/lib/Dialect/EmitC/IR/EmitC.cpp @@ -225,6 +225,21 @@ FailureOr> parseFormatString( return items; } +//===----------------------------------------------------------------------===// +// AddressOfOp +//===----------------------------------------------------------------------===// + +LogicalResult AddressOfOp::verify() { + emitc::LValueType referenceType = getReference().getType(); + emitc::PointerType resultType = getResult().getType(); + + if (referenceType.getValueType() != resultType.getPointee()) + return emitOpError("requires result to be a pointer to the type " + "referenced by operand"); + + return success(); +} + //===----------------------------------------------------------------------===// // AddOp //===----------------------------------------------------------------------===// @@ -379,6 +394,20 @@ LogicalResult emitc::ConstantOp::verify() { OpFoldResult emitc::ConstantOp::fold(FoldAdaptor adaptor) { return getValue(); } +//===----------------------------------------------------------------------===// +// DereferenceOp +//===----------------------------------------------------------------------===// + +LogicalResult DereferenceOp::verify() { + emitc::PointerType pointerType = getPointer().getType(); + + if (pointerType.getPointee() != getResult().getType().getValueType()) + return emitOpError("requires result to be an lvalue of the type " + "pointed to by operand"); + + return success(); +} + 
//===----------------------------------------------------------------------===// // ExpressionOp //===----------------------------------------------------------------------===// diff --git a/mlir/lib/Target/Cpp/TranslateToCpp.cpp b/mlir/lib/Target/Cpp/TranslateToCpp.cpp index bcce0a5145221..a37ea5e87a316 100644 --- a/mlir/lib/Target/Cpp/TranslateToCpp.cpp +++ b/mlir/lib/Target/Cpp/TranslateToCpp.cpp @@ -70,6 +70,7 @@ static inline LogicalResult interleaveCommaWithError(const Container &c, /// imply higher precedence. static FailureOr getOperatorPrecedence(Operation *operation) { return llvm::TypeSwitch>(operation) + .Case([&](auto op) { return 15; }) .Case([&](auto op) { return 12; }) .Case([&](auto op) { return 15; }) .Case([&](auto op) { return 7; }) @@ -396,6 +397,15 @@ static bool shouldBeInlined(ExpressionOp expressionOp) { return false; } +static LogicalResult printOperation(CppEmitter &emitter, + emitc::DereferenceOp dereferenceOp) { + std::string out; + llvm::raw_string_ostream ss(out); + ss << "*" << emitter.getOrCreateName(dereferenceOp.getPointer()); + emitter.cacheDeferredOpResult(dereferenceOp.getResult(), out); + return success(); +} + static LogicalResult printOperation(CppEmitter &emitter, emitc::GetFieldOp getFieldOp) { emitter.cacheDeferredOpResult(getFieldOp.getResult(), @@ -479,6 +489,17 @@ static LogicalResult printConstantOp(CppEmitter &emitter, Operation *operation, return emitter.emitAttribute(operation->getLoc(), value); } +static LogicalResult printOperation(CppEmitter &emitter, + emitc::AddressOfOp addressOfOp) { + raw_ostream &os = emitter.ostream(); + Operation &op = *addressOfOp.getOperation(); + + if (failed(emitter.emitAssignPrefix(op))) + return failure(); + os << "&"; + return emitter.emitOperand(addressOfOp.getReference()); +} + static LogicalResult printOperation(CppEmitter &emitter, emitc::ConstantOp constantOp) { Operation *operation = constantOp.getOperation(); @@ -1768,14 +1789,14 @@ LogicalResult 
CppEmitter::emitOperation(Operation &op, bool trailingSemicolon) { .Case( [&](auto op) { return printOperation(*this, op); }) // EmitC ops. - .Case) { // CHECK: %[[UNREALIZED_CONVERSION_CAST_1:.*]] = builtin.unrealized_conversion_cast %[[CAST_0]] : !emitc.ptr to !emitc.array<999xi32> // CHECK: %[[VAL_1:.*]] = "emitc.constant"() <{value = 0 : index}> : () -> index // CHECK: %[[SUBSCRIPT_0:.*]] = emitc.subscript %[[UNREALIZED_CONVERSION_CAST_0]]{{\[}}%[[VAL_1]]] : (!emitc.array<999xi32>, index) -> !emitc.lvalue -// CHECK: %[[APPLY_0:.*]] = emitc.apply "&"(%[[SUBSCRIPT_0]]) : (!emitc.lvalue) -> !emitc.ptr +// CHECK: %[[ADDRESS_OF_0:.*]] = emitc.address_of %[[SUBSCRIPT_0]] : !emitc.lvalue // CHECK: %[[VAL_2:.*]] = "emitc.constant"() <{value = 0 : index}> : () -> index // CHECK: %[[SUBSCRIPT_1:.*]] = emitc.subscript %[[UNREALIZED_CONVERSION_CAST_1]]{{\[}}%[[VAL_2]]] : (!emitc.array<999xi32>, index) -> !emitc.lvalue -// CHECK: %[[APPLY_1:.*]] = emitc.apply "&"(%[[SUBSCRIPT_1]]) : (!emitc.lvalue) -> !emitc.ptr +// CHECK: %[[ADDRESS_OF_1:.*]] = emitc.address_of %[[SUBSCRIPT_1]] : !emitc.lvalue // CHECK: %[[CALL_OPAQUE_2:.*]] = emitc.call_opaque "sizeof"() {args = [i32]} : () -> !emitc.size_t // CHECK: %[[VAL_3:.*]] = "emitc.constant"() <{value = 999 : index}> : () -> index // CHECK: %[[MUL_1:.*]] = emitc.mul %[[CALL_OPAQUE_2]], %[[VAL_3]] : (!emitc.size_t, index) -> !emitc.size_t -// CHECK: emitc.call_opaque "memcpy"(%[[APPLY_1]], %[[APPLY_0]], %[[MUL_1]]) : (!emitc.ptr, !emitc.ptr, !emitc.size_t) -> () +// CHECK: emitc.call_opaque "memcpy"(%[[ADDRESS_OF_1]], %[[ADDRESS_OF_0]], %[[MUL_1]]) : (!emitc.ptr, !emitc.ptr, !emitc.size_t) -> () // CHECK: %[[CALL_OPAQUE_3:.*]] = emitc.call_opaque "sizeof"() {args = [i32]} : () -> !emitc.size_t // CHECK: %[[VAL_4:.*]] = "emitc.constant"() <{value = 999 : index}> : () -> index // CHECK: %[[MUL_2:.*]] = emitc.mul %[[CALL_OPAQUE_3]], %[[VAL_4]] : (!emitc.size_t, index) -> !emitc.size_t @@ -42,13 +42,13 @@ func.func @alloc_copy(%arg0: 
memref<999xi32>) { // CHECK: %[[UNREALIZED_CONVERSION_CAST_2:.*]] = builtin.unrealized_conversion_cast %[[CAST_1]] : !emitc.ptr to !emitc.array<999xi32> // CHECK: %[[VAL_5:.*]] = "emitc.constant"() <{value = 0 : index}> : () -> index // CHECK: %[[SUBSCRIPT_2:.*]] = emitc.subscript %[[UNREALIZED_CONVERSION_CAST_0]]{{\[}}%[[VAL_5]]] : (!emitc.array<999xi32>, index) -> !emitc.lvalue -// CHECK: %[[APPLY_2:.*]] = emitc.apply "&"(%[[SUBSCRIPT_2]]) : (!emitc.lvalue) -> !emitc.ptr +// CHECK: %[[ADDRESS_OF_2:.*]] = emitc.address_of %[[SUBSCRIPT_2]] : !emitc.lvalue // CHECK: %[[VAL_6:.*]] = "emitc.constant"() <{value = 0 : index}> : () -> index // CHECK: %[[SUBSCRIPT_3:.*]] = emitc.subscript %[[UNREALIZED_CONVERSION_CAST_2]]{{\[}}%[[VAL_6]]] : (!emitc.array<999xi32>, index) -> !emitc.lvalue -// CHECK: %[[APPLY_3:.*]] = emitc.apply "&"(%[[SUBSCRIPT_3]]) : (!emitc.lvalue) -> !emitc.ptr +// CHECK: %[[ADDRESS_OF_3:.*]] = emitc.address_of %[[SUBSCRIPT_3]] : !emitc.lvalue // CHECK: %[[CALL_OPAQUE_5:.*]] = emitc.call_opaque "sizeof"() {args = [i32]} : () -> !emitc.size_t // CHECK: %[[VAL_7:.*]] = "emitc.constant"() <{value = 999 : index}> : () -> index // CHECK: %[[MUL_3:.*]] = emitc.mul %[[CALL_OPAQUE_5]], %[[VAL_7]] : (!emitc.size_t, index) -> !emitc.size_t -// CHECK: emitc.call_opaque "memcpy"(%[[APPLY_3]], %[[APPLY_2]], %[[MUL_3]]) : (!emitc.ptr, !emitc.ptr, !emitc.size_t) -> () +// CHECK: emitc.call_opaque "memcpy"(%[[ADDRESS_OF_3]], %[[ADDRESS_OF_2]], %[[MUL_3]]) : (!emitc.ptr, !emitc.ptr, !emitc.size_t) -> () // CHECK: return // CHECK: } diff --git a/mlir/test/Conversion/MemRefToEmitC/memref-to-emitc-copy.mlir b/mlir/test/Conversion/MemRefToEmitC/memref-to-emitc-copy.mlir index 829f267743d93..3de2d25f2b0d4 100644 --- a/mlir/test/Conversion/MemRefToEmitC/memref-to-emitc-copy.mlir +++ b/mlir/test/Conversion/MemRefToEmitC/memref-to-emitc-copy.mlir @@ -17,14 +17,14 @@ func.func @copying(%arg0 : memref<9x4x5x7xf32>, %arg1 : memref<9x4x5x7xf32>) { // CHECK: 
%[[UNREALIZED_CONVERSION_CAST_1:.*]] = builtin.unrealized_conversion_cast %[[ARG0]] : memref<9x4x5x7xf32> to !emitc.array<9x4x5x7xf32> // CHECK: %[[VAL_0:.*]] = "emitc.constant"() <{value = 0 : index}> : () -> index // CHECK: %[[SUBSCRIPT_0:.*]] = emitc.subscript %[[UNREALIZED_CONVERSION_CAST_1]]{{\[}}%[[VAL_0]], %[[VAL_0]], %[[VAL_0]], %[[VAL_0]]] : (!emitc.array<9x4x5x7xf32>, index, index, index, index) -> !emitc.lvalue -// CHECK: %[[APPLY_0:.*]] = emitc.apply "&"(%[[SUBSCRIPT_0]]) : (!emitc.lvalue) -> !emitc.ptr +// CHECK: %[[ADDRESS_OF_0:.*]] = emitc.address_of %[[SUBSCRIPT_0]] : !emitc.lvalue // CHECK: %[[VAL_1:.*]] = "emitc.constant"() <{value = 0 : index}> : () -> index // CHECK: %[[SUBSCRIPT_1:.*]] = emitc.subscript %[[UNREALIZED_CONVERSION_CAST_0]]{{\[}}%[[VAL_1]], %[[VAL_1]], %[[VAL_1]], %[[VAL_1]]] : (!emitc.array<9x4x5x7xf32>, index, index, index, index) -> !emitc.lvalue -// CHECK: %[[APPLY_1:.*]] = emitc.apply "&"(%[[SUBSCRIPT_1]]) : (!emitc.lvalue) -> !emitc.ptr +// CHECK: %[[ADDRESS_OF_1:.*]] = emitc.address_of %[[SUBSCRIPT_1]] : !emitc.lvalue // CHECK: %[[CALL_OPAQUE_0:.*]] = emitc.call_opaque "sizeof"() {args = [f32]} : () -> !emitc.size_t // CHECK: %[[VAL_2:.*]] = "emitc.constant"() <{value = 1260 : index}> : () -> index // CHECK: %[[MUL_0:.*]] = emitc.mul %[[CALL_OPAQUE_0]], %[[VAL_2]] : (!emitc.size_t, index) -> !emitc.size_t -// CHECK: emitc.call_opaque "memcpy"(%[[APPLY_1]], %[[APPLY_0]], %[[MUL_0]]) : (!emitc.ptr, !emitc.ptr, !emitc.size_t) -> () +// CHECK: emitc.call_opaque "memcpy"(%[[ADDRESS_OF_1]], %[[ADDRESS_OF_0]], %[[MUL_0]]) : (!emitc.ptr, !emitc.ptr, !emitc.size_t) -> () // CHECK: return // CHECK: } diff --git a/mlir/test/Conversion/MemRefToEmitC/memref-to-emitc.mlir b/mlir/test/Conversion/MemRefToEmitC/memref-to-emitc.mlir index 2b4eda37903d4..c7b043b8e2370 100644 --- a/mlir/test/Conversion/MemRefToEmitC/memref-to-emitc.mlir +++ b/mlir/test/Conversion/MemRefToEmitC/memref-to-emitc.mlir @@ -53,7 +53,7 @@ module @globals { // 
CHECK-NEXT: emitc.get_global @public_global : !emitc.array<3x7xf32> %0 = memref.get_global @public_global : memref<3x7xf32> // CHECK-NEXT: emitc.get_global @__constant_xi32 : !emitc.lvalue - // CHECK-NEXT: emitc.apply "&"(%1) : (!emitc.lvalue) -> !emitc.ptr + // CHECK-NEXT: emitc.address_of %1 : !emitc.lvalue %1 = memref.get_global @__constant_xi32 : memref return } diff --git a/mlir/test/Dialect/EmitC/invalid_ops.mlir b/mlir/test/Dialect/EmitC/invalid_ops.mlir index f285196d466ce..d1601bed29ca9 100644 --- a/mlir/test/Dialect/EmitC/invalid_ops.mlir +++ b/mlir/test/Dialect/EmitC/invalid_ops.mlir @@ -914,3 +914,19 @@ func.func @test_for_unmatch_type(%arg0: index) { ) : (index, index, index) -> () return } + +// ----- + +func.func @address_of(%arg0: !emitc.lvalue) { + // expected-error @+1 {{failed to verify that input and result reference the same type}} + %1 = "emitc.address_of"(%arg0) : (!emitc.lvalue) -> !emitc.ptr + return +} + +// ----- + +func.func @dereference(%arg0: !emitc.ptr) { + // expected-error @+1 {{failed to verify that input and result reference the same type}} + %1 = "emitc.dereference"(%arg0) : (!emitc.ptr) -> !emitc.lvalue + return +} diff --git a/mlir/test/Dialect/EmitC/ops.mlir b/mlir/test/Dialect/EmitC/ops.mlir index 1259748dfce84..b2c8b843ec14b 100644 --- a/mlir/test/Dialect/EmitC/ops.mlir +++ b/mlir/test/Dialect/EmitC/ops.mlir @@ -355,3 +355,13 @@ func.func @do(%arg0 : !emitc.ptr) { return } + +func.func @address_of(%arg0: !emitc.lvalue) { + %1 = emitc.address_of %arg0 : !emitc.lvalue + return +} + +func.func @dereference(%arg0: !emitc.ptr) { + %1 = emitc.dereference %arg0 : !emitc.ptr + return +} diff --git a/mlir/test/Target/Cpp/common-cpp.mlir b/mlir/test/Target/Cpp/common-cpp.mlir index 294e6af65bf14..abf85c8e9a359 100644 --- a/mlir/test/Target/Cpp/common-cpp.mlir +++ b/mlir/test/Target/Cpp/common-cpp.mlir @@ -105,6 +105,25 @@ func.func @apply() -> !emitc.ptr { return %1 : !emitc.ptr } + +// CHECK-LABEL: void address_of() { +func.func 
@address_of() { + // CHECK-NEXT: int32_t [[V1:[^ ]*]]; + %0 = "emitc.variable"() <{value = #emitc.opaque<"">}> : () -> !emitc.lvalue + // CHECK-NEXT: int32_t* [[V2:[^ ]*]] = &[[V1]]; + %1 = emitc.address_of %0 : !emitc.lvalue + return +} + +// CHECK-LABEL: void dereference +// CHECK-SAME: (int32_t* [[ARG0:[^ ]*]]) { +func.func @dereference(%arg0: !emitc.ptr) { + // CHECK: int32_t [[V1:[^ ]*]] = *[[ARG0]]; + %2 = emitc.dereference %arg0 : !emitc.ptr + emitc.load %2 : !emitc.lvalue + return +} + // CHECK: void array_type(int32_t v1[3], float v2[10][20]) func.func @array_type(%arg0: !emitc.array<3xi32>, %arg1: !emitc.array<10x20xf32>) { return diff --git a/mlir/test/Target/Cpp/expressions.mlir b/mlir/test/Target/Cpp/expressions.mlir index 9f1c816ddabbc..2de94d0b11fc8 100644 --- a/mlir/test/Target/Cpp/expressions.mlir +++ b/mlir/test/Target/Cpp/expressions.mlir @@ -314,14 +314,14 @@ func.func @different_expressions(%arg0: i32, %arg1: i32, %arg2: i32, %arg3: i32) return %v_load : i32 } -// CPP-DEFAULT: int32_t expression_with_dereference(int32_t [[VAL_1:v[0-9]+]], int32_t* [[VAL_2]]) { +// CPP-DEFAULT: int32_t expression_with_dereference_apply(int32_t [[VAL_1:v[0-9]+]], int32_t* [[VAL_2]]) { // CPP-DEFAULT-NEXT: return *([[VAL_2]] - [[VAL_1]]); // CPP-DEFAULT-NEXT: } -// CPP-DECLTOP: int32_t expression_with_dereference(int32_t [[VAL_1:v[0-9]+]], int32_t* [[VAL_2]]) { +// CPP-DECLTOP: int32_t expression_with_dereference_apply(int32_t [[VAL_1:v[0-9]+]], int32_t* [[VAL_2]]) { // CPP-DECLTOP-NEXT: return *([[VAL_2]] - [[VAL_1]]); // CPP-DECLTOP-NEXT: } -emitc.func @expression_with_dereference(%arg1: i32, %arg2: !emitc.ptr) -> i32 { +emitc.func @expression_with_dereference_apply(%arg1: i32, %arg2: !emitc.ptr) -> i32 { %c = emitc.expression %arg1, %arg2 : (i32, !emitc.ptr) -> i32 { %e = emitc.sub %arg2, %arg1 : (!emitc.ptr, i32) -> !emitc.ptr %d = emitc.apply "*"(%e) : (!emitc.ptr) -> i32 @@ -330,6 +330,28 @@ emitc.func @expression_with_dereference(%arg1: i32, %arg2: 
!emitc.ptr) -> i return %c : i32 } +// CPP-DEFAULT: bool expression_with_address_taken_apply(int32_t [[VAL_1:v[0-9]+]], int32_t [[VAL_2:v[0-9]+]], int32_t* [[VAL_3]]) { +// CPP-DEFAULT-NEXT: int32_t [[VAL_4:v[0-9]+]] = 42; +// CPP-DEFAULT-NEXT: return &[[VAL_4]] - [[VAL_2]] < [[VAL_3]]; +// CPP-DEFAULT-NEXT: } + +// CPP-DECLTOP: bool expression_with_address_taken_apply(int32_t [[VAL_1:v[0-9]+]], int32_t [[VAL_2:v[0-9]+]], int32_t* [[VAL_3]]) { +// CPP-DECLTOP-NEXT: int32_t [[VAL_4:v[0-9]+]]; +// CPP-DECLTOP-NEXT: [[VAL_4]] = 42; +// CPP-DECLTOP-NEXT: return &[[VAL_4]] - [[VAL_2]] < [[VAL_3]]; +// CPP-DECLTOP-NEXT: } + +func.func @expression_with_address_taken_apply(%arg0: i32, %arg1: i32, %arg2: !emitc.ptr) -> i1 { + %a = "emitc.variable"(){value = 42 : i32} : () -> !emitc.lvalue + %c = emitc.expression %arg1, %arg2, %a : (i32, !emitc.ptr, !emitc.lvalue) -> i1 { + %d = emitc.apply "&"(%a) : (!emitc.lvalue) -> !emitc.ptr + %e = emitc.sub %d, %arg1 : (!emitc.ptr, i32) -> !emitc.ptr + %f = emitc.cmp lt, %e, %arg2 : (!emitc.ptr, !emitc.ptr) -> i1 + emitc.yield %f : i1 + } + return %c : i1 +} + // CPP-DEFAULT: bool expression_with_address_taken(int32_t [[VAL_1:v[0-9]+]], int32_t [[VAL_2:v[0-9]+]], int32_t* [[VAL_3]]) { // CPP-DEFAULT-NEXT: int32_t [[VAL_4:v[0-9]+]] = 42; // CPP-DEFAULT-NEXT: return &[[VAL_4]] - [[VAL_2]] < [[VAL_3]]; @@ -344,7 +366,7 @@ emitc.func @expression_with_dereference(%arg1: i32, %arg2: !emitc.ptr) -> i func.func @expression_with_address_taken(%arg0: i32, %arg1: i32, %arg2: !emitc.ptr) -> i1 { %a = "emitc.variable"(){value = 42 : i32} : () -> !emitc.lvalue %c = emitc.expression %arg1, %arg2, %a : (i32, !emitc.ptr, !emitc.lvalue) -> i1 { - %d = emitc.apply "&"(%a) : (!emitc.lvalue) -> !emitc.ptr + %d = emitc.address_of %a : !emitc.lvalue %e = emitc.sub %d, %arg1 : (!emitc.ptr, i32) -> !emitc.ptr %f = emitc.cmp lt, %e, %arg2 : (!emitc.ptr, !emitc.ptr) -> i1 emitc.yield %f : i1 From 49774448d69b55f5c46aef2147b45537fd61276a Mon Sep 17 00:00:00 2001 
From: mitchell Date: Wed, 3 Dec 2025 18:41:45 +0800 Subject: [PATCH 25/36] [clang-tidy] Fix false positive in `readability-redundant-typename` (#170034) Closes #169166 --------- Co-authored-by: Victor Chernyakin --- .../readability/RedundantTypenameCheck.cpp | 6 +++-- .../readability/redundant-typename.cpp | 24 +++++++++++++++++++ 2 files changed, 28 insertions(+), 2 deletions(-) diff --git a/clang-tools-extra/clang-tidy/readability/RedundantTypenameCheck.cpp b/clang-tools-extra/clang-tidy/readability/RedundantTypenameCheck.cpp index a4edd2b46b86b..5f2519ce9d5c3 100644 --- a/clang-tools-extra/clang-tidy/readability/RedundantTypenameCheck.cpp +++ b/clang-tools-extra/clang-tidy/readability/RedundantTypenameCheck.cpp @@ -47,6 +47,9 @@ void RedundantTypenameCheck::check(const MatchFinder::MatchResult &Result) { const SourceLocation ElaboratedKeywordLoc = [&] { if (const auto *NonDependentTypeLoc = Result.Nodes.getNodeAs("nonDependentTypeLoc")) { + if (NonDependentTypeLoc->getType()->isDependentType()) + return SourceLocation(); + if (const auto TL = NonDependentTypeLoc->getAs()) return TL.getElaboratedKeywordLoc(); @@ -59,8 +62,7 @@ void RedundantTypenameCheck::check(const MatchFinder::MatchResult &Result) { if (const auto TL = NonDependentTypeLoc->getAs()) - if (!TL.getType()->isDependentType()) - return TL.getElaboratedKeywordLoc(); + return TL.getElaboratedKeywordLoc(); } else { TypeLoc InnermostTypeLoc = *Result.Nodes.getNodeAs("dependentTypeLoc"); diff --git a/clang-tools-extra/test/clang-tidy/checkers/readability/redundant-typename.cpp b/clang-tools-extra/test/clang-tidy/checkers/readability/redundant-typename.cpp index 2efafd1a9a649..e8fcd9bcd5731 100644 --- a/clang-tools-extra/test/clang-tidy/checkers/readability/redundant-typename.cpp +++ b/clang-tools-extra/test/clang-tidy/checkers/readability/redundant-typename.cpp @@ -267,3 +267,27 @@ WHOLE_TYPE_IN_MACRO Macro2; #define WHOLE_DECLARATION_IN_MACRO typename NotDependent::R Macro3 WHOLE_DECLARATION_IN_MACRO; + 
+template struct Wrapper {}; +template +struct ClassWrapper { + using R = T; + Wrapper f(); +}; + +template +Wrapper::R> ClassWrapper::f() { + return {}; +} + +template struct StructWrapper {}; +template +class ClassWithNestedStruct { + struct Nested {}; + StructWrapper f(); +}; + +template +StructWrapper::Nested> ClassWithNestedStruct::f() { + return {}; +} From 4e4763a8a4659dc252429a003c613f762d5a1083 Mon Sep 17 00:00:00 2001 From: Felipe de Azevedo Piovezan Date: Wed, 3 Dec 2025 10:43:48 +0000 Subject: [PATCH 26/36] [lldb] Handle backwards branches in UnwindAssemblyInstEmulation (#169633) This allows the unwinder to handle code with mid-function epilogues where the subsequent code is reachable through a backwards branch. Two changes are required to accomplish this: 1. Do not enqueue the subsequent instruction if the current instruction is a barrier(*). 2. When processing an instruction, stop ignoring branches with negative offsets. (*) As per the definition in LLVM's MC layer, a barrier is any instruction that "stops control flow from executing the instruction immediately following it". 
See `MCInstrDesc::isBarrier` in MCInstrDesc.h Part of a sequence of PRs: [lldb][NFCI] Rewrite UnwindAssemblyInstEmulation in terms of a CFG visit #169630 [lldb][NFC] Rename forward_branch_offset to branch_offset in UnwindAssemblyInstEmulation #169631 [lldb] Add DisassemblerLLVMC::IsBarrier API #169632 [lldb] Handle backwards branches in UnwindAssemblyInstEmulation #169633 commit-id:fd266c13 --- .../UnwindAssemblyInstEmulation.cpp | 12 ++++++---- .../UnwindAssemblyInstEmulation.h | 2 +- .../ARM64/TestArm64InstEmulation.cpp | 23 +++++++++++++++---- 3 files changed, 28 insertions(+), 9 deletions(-) diff --git a/lldb/source/Plugins/UnwindAssembly/InstEmulation/UnwindAssemblyInstEmulation.cpp b/lldb/source/Plugins/UnwindAssembly/InstEmulation/UnwindAssemblyInstEmulation.cpp index 3913684e4ac62..19ae1cf392efa 100644 --- a/lldb/source/Plugins/UnwindAssembly/InstEmulation/UnwindAssemblyInstEmulation.cpp +++ b/lldb/source/Plugins/UnwindAssembly/InstEmulation/UnwindAssemblyInstEmulation.cpp @@ -227,6 +227,10 @@ bool UnwindAssemblyInstEmulation::GetNonCallSiteUnwindPlanFromAssembly( } } + // If inst is a barrier, do not propagate state to the next instruction. + if (inst.IsBarrier()) + continue; + // Were there any changes to the CFI while evaluating this instruction? 
if (m_curr_row_modified) { // Save the modified row if we don't already have a CFI row in the @@ -530,19 +534,19 @@ bool UnwindAssemblyInstEmulation::WriteRegister( case EmulateInstruction::eContextAbsoluteBranchRegister: case EmulateInstruction::eContextRelativeBranchImmediate: { if (context.GetInfoType() == EmulateInstruction::eInfoTypeISAAndImmediate && - context.info.ISAAndImmediate.unsigned_data32 > 0) { + context.info.ISAAndImmediate.unsigned_data32 != 0) { m_branch_offset = context.info.ISAAndImmediate.unsigned_data32; } else if (context.GetInfoType() == EmulateInstruction::eInfoTypeISAAndImmediateSigned && - context.info.ISAAndImmediateSigned.signed_data32 > 0) { + context.info.ISAAndImmediateSigned.signed_data32 != 0) { m_branch_offset = context.info.ISAAndImmediateSigned.signed_data32; } else if (context.GetInfoType() == EmulateInstruction::eInfoTypeImmediate && - context.info.unsigned_immediate > 0) { + context.info.unsigned_immediate != 0) { m_branch_offset = context.info.unsigned_immediate; } else if (context.GetInfoType() == EmulateInstruction::eInfoTypeImmediateSigned && - context.info.signed_immediate > 0) { + context.info.signed_immediate != 0) { m_branch_offset = context.info.signed_immediate; } } break; diff --git a/lldb/source/Plugins/UnwindAssembly/InstEmulation/UnwindAssemblyInstEmulation.h b/lldb/source/Plugins/UnwindAssembly/InstEmulation/UnwindAssemblyInstEmulation.h index 1c80199235b4b..43daf1c9f9fd6 100644 --- a/lldb/source/Plugins/UnwindAssembly/InstEmulation/UnwindAssemblyInstEmulation.h +++ b/lldb/source/Plugins/UnwindAssembly/InstEmulation/UnwindAssemblyInstEmulation.h @@ -152,7 +152,7 @@ class UnwindAssemblyInstEmulation : public lldb_private::UnwindAssembly { bool m_curr_row_modified; // The instruction is branching forward with the given offset. 0 value means // no branching. 
- uint32_t m_branch_offset = 0; + int64_t m_branch_offset = 0; }; #endif // LLDB_SOURCE_PLUGINS_UNWINDASSEMBLY_INSTEMULATION_UNWINDASSEMBLYINSTEMULATION_H diff --git a/lldb/unittests/UnwindAssembly/ARM64/TestArm64InstEmulation.cpp b/lldb/unittests/UnwindAssembly/ARM64/TestArm64InstEmulation.cpp index 6c74860971674..b72abc188b4c2 100644 --- a/lldb/unittests/UnwindAssembly/ARM64/TestArm64InstEmulation.cpp +++ b/lldb/unittests/UnwindAssembly/ARM64/TestArm64InstEmulation.cpp @@ -987,7 +987,7 @@ TEST_F(TestArm64InstEmulation, TestMidFunctionEpilogueAndBackwardsJump) { 0xfd, 0x7b, 0x42, 0xa9, // <+20>: ldp x29, x30, [sp, #0x20] 0xff, 0xc3, 0x00, 0x91, // <+24>: add sp, sp, #0x30 0xc0, 0x03, 0x5f, 0xd6, // <+28>: ret - // AFTER_EPILOGUE: LLDB computes the next 5 unwind states incorrectly. + // AFTER_EPILOGUE 0x37, 0x00, 0x80, 0xd2, // <+32>: mov x23, #0x1 0xf6, 0x5f, 0x41, 0xa9, // <+36>: ldp x22, x23, [sp, #0x10] 0xfd, 0x7b, 0x42, 0xa9, // <+40>: ldp x29, x30, [sp, #0x20] @@ -1054,12 +1054,19 @@ TEST_F(TestArm64InstEmulation, TestMidFunctionEpilogueAndBackwardsJump) { EXPECT_TRUE(row->GetCFAValue().GetRegisterNumber() == gpr_sp_arm64); EXPECT_EQ(row->GetCFAValue().GetOffset(), 0); - // FIXME: Row for offset +32 incorrectly inherits the state of the `ret` - // instruction, but +32 _never_ executes after the `ret`. + // Row for offset +32 should not inherit the state of the `ret` instruction + // in +28. Instead, it should inherit the state of the branch in +64. + // Check for register x22, which is available in row +64. 
// <+28>: ret // <+32>: mov x23, #0x1 row = unwind_plan.GetRowForFunctionOffset(32); - // FIXME: EXPECT_NE(28, row->GetOffset()); + EXPECT_EQ(32, row->GetOffset()); + { + UnwindPlan::Row::AbstractRegisterLocation loc; + EXPECT_TRUE(row->GetRegisterInfo(gpr_x22_arm64, loc)); + EXPECT_TRUE(loc.IsAtCFAPlusOffset()); + EXPECT_EQ(loc.GetOffset(), -32); + } // Check that the state of this branch // <+16>: b.ne ; <+52> DO_SOMETHING_AND_GOTO_AFTER_EPILOGUE @@ -1070,4 +1077,12 @@ TEST_F(TestArm64InstEmulation, TestMidFunctionEpilogueAndBackwardsJump) { EXPECT_TRUE(row->GetCFAValue().IsRegisterPlusOffset()); EXPECT_EQ(row->GetCFAValue().GetRegisterNumber(), gpr_fp_arm64); EXPECT_EQ(row->GetCFAValue().GetOffset(), 16); + + row = unwind_plan.GetRowForFunctionOffset(64); + { + UnwindPlan::Row::AbstractRegisterLocation loc; + EXPECT_TRUE(row->GetRegisterInfo(gpr_x22_arm64, loc)); + EXPECT_TRUE(loc.IsAtCFAPlusOffset()); + EXPECT_EQ(loc.GetOffset(), -32); + } } From bfde296d081098605bdad0e4487c4bad9ca19c95 Mon Sep 17 00:00:00 2001 From: Med Ismail Bennani Date: Wed, 3 Dec 2025 02:46:01 -0800 Subject: [PATCH 27/36] [lldb/docs] Add ScriptingFrameProvider documentation to the website This patch adds the documentation for ScriptedFrameProviders to the lldb website. 
Signed-off-by: Med Ismail Bennani --- lldb/docs/CMakeLists.txt | 1 + 1 file changed, 1 insertion(+) diff --git a/lldb/docs/CMakeLists.txt b/lldb/docs/CMakeLists.txt index 092bdc1c30f5c..bbecf606f1f8f 100644 --- a/lldb/docs/CMakeLists.txt +++ b/lldb/docs/CMakeLists.txt @@ -28,6 +28,7 @@ if (LLDB_ENABLE_PYTHON AND SPHINX_FOUND) add_custom_target(lldb-python-doc-package COMMAND "${CMAKE_COMMAND}" -E copy "${lldb_bindings_dir}/lldb.py" "${CMAKE_CURRENT_BINARY_DIR}/lldb/__init__.py" COMMAND "${CMAKE_COMMAND}" -E make_directory "${CMAKE_CURRENT_BINARY_DIR}/lldb/plugins" + COMMAND "${CMAKE_COMMAND}" -E copy "${LLDB_SOURCE_DIR}/examples/python/templates/scripted_frame_provider.py" "${CMAKE_CURRENT_BINARY_DIR}/lldb/plugins/" COMMAND "${CMAKE_COMMAND}" -E copy "${LLDB_SOURCE_DIR}/examples/python/templates/scripted_process.py" "${CMAKE_CURRENT_BINARY_DIR}/lldb/plugins/" COMMAND "${CMAKE_COMMAND}" -E copy "${LLDB_SOURCE_DIR}/examples/python/templates/scripted_platform.py" "${CMAKE_CURRENT_BINARY_DIR}/lldb/plugins/" COMMAND "${CMAKE_COMMAND}" -E copy "${LLDB_SOURCE_DIR}/examples/python/templates/operating_system.py" "${CMAKE_CURRENT_BINARY_DIR}/lldb/plugins/" From c0f0936f5a47270d47486f6d5860b5f8e30e0e32 Mon Sep 17 00:00:00 2001 From: Felipe de Azevedo Piovezan Date: Wed, 3 Dec 2025 10:51:27 +0000 Subject: [PATCH 28/36] [lldb] Fix ThreadPlanStepOut::DoPlanExplainsStop inspection of BreakpointSite (#169799) Suppose two threads are performing the exact same step out plan. They will both have an internal breakpoint set at their parent frame. Now supposed both of those breakpoints are in the same address (i.e. the same BreakpointSite). At the end of `ThreadPlanStepOut::DoPlanExplainsStop`, we see this: ``` // If there was only one owner, then we're done. But if we also hit // some user breakpoint on our way out, we should mark ourselves as // done, but also not claim to explain the stop, since it is more // important to report the user breakpoint than the step out // completion. 
if (site_sp->GetNumberOfConstituents() == 1) return true; ``` In other words, the plan looks at the name number of constituents of the site to decide whether it explains the stop, the logic being that a _user_ might have put a breakpoint there. However, the implementation is not correct; in particular, it will fail in the situation described above. We should only care about non-internal breakpoints that would stop for the current thread. It is tricky to test this, as it depends on the timing of threads, but I was able to consistently reproduce the issue with a swift program using concurrency. rdar://165481473 --- lldb/include/lldb/Breakpoint/BreakpointSite.h | 4 ++++ lldb/source/Breakpoint/BreakpointSite.cpp | 16 ++++++++++++++++ lldb/source/Target/ThreadPlanStepOut.cpp | 11 ++++------- 3 files changed, 24 insertions(+), 7 deletions(-) diff --git a/lldb/include/lldb/Breakpoint/BreakpointSite.h b/lldb/include/lldb/Breakpoint/BreakpointSite.h index a935b2441c02a..e189ed77e261b 100644 --- a/lldb/include/lldb/Breakpoint/BreakpointSite.h +++ b/lldb/include/lldb/Breakpoint/BreakpointSite.h @@ -156,6 +156,10 @@ class BreakpointSite : public std::enable_shared_from_this, /// would be valid for this thread, false otherwise. bool ValidForThisThread(Thread &thread); + /// Returns true if at least one constituent is both public and valid for + /// `thread`. + bool ContainsUserBreakpointForThread(Thread &thread); + /// Print a description of this breakpoint site to the stream \a s. /// GetDescription tells you about the breakpoint site's constituents. 
Use /// BreakpointSite::Dump(Stream *) to get information about the breakpoint diff --git a/lldb/source/Breakpoint/BreakpointSite.cpp b/lldb/source/Breakpoint/BreakpointSite.cpp index fd7666be6b1bf..8639379afe1df 100644 --- a/lldb/source/Breakpoint/BreakpointSite.cpp +++ b/lldb/source/Breakpoint/BreakpointSite.cpp @@ -168,6 +168,22 @@ bool BreakpointSite::ValidForThisThread(Thread &thread) { return m_constituents.ValidForThisThread(thread); } +bool BreakpointSite::ContainsUserBreakpointForThread(Thread &thread) { + if (ThreadSP backed_thread = thread.GetBackedThread()) + return ContainsUserBreakpointForThread(*backed_thread); + + std::lock_guard guard(m_constituents_mutex); + for (const BreakpointLocationSP &bp_loc : + m_constituents.BreakpointLocations()) { + const Breakpoint &bp = bp_loc->GetBreakpoint(); + if (bp.IsInternal()) + continue; + if (bp_loc->ValidForThisThread(thread)) + return true; + } + return false; +} + void BreakpointSite::BumpHitCounts() { std::lock_guard guard(m_constituents_mutex); for (BreakpointLocationSP loc_sp : m_constituents.BreakpointLocations()) { diff --git a/lldb/source/Target/ThreadPlanStepOut.cpp b/lldb/source/Target/ThreadPlanStepOut.cpp index d49a01bdbcef7..0307b38d7d94b 100644 --- a/lldb/source/Target/ThreadPlanStepOut.cpp +++ b/lldb/source/Target/ThreadPlanStepOut.cpp @@ -356,13 +356,10 @@ bool ThreadPlanStepOut::DoPlanExplainsStop(Event *event_ptr) { } } - // If there was only one owner, then we're done. But if we also hit - // some user breakpoint on our way out, we should mark ourselves as - // done, but also not claim to explain the stop, since it is more - // important to report the user breakpoint than the step out - // completion. - - if (site_sp->GetNumberOfConstituents() == 1) + // If the thread also hit a user breakpoint on its way out, the plan is + // done but should not claim to explain the stop. It is more important + // to report the user breakpoint than the step out completion. 
+ if (!site_sp->ContainsUserBreakpointForThread(GetThread())) return true; } return false; From aeb36a92523427b63466555d92b35bd3aa26ee40 Mon Sep 17 00:00:00 2001 From: Lang Hames Date: Wed, 3 Dec 2025 22:03:23 +1100 Subject: [PATCH 29/36] [ORC] Port CallableTraitsHelper from the new ORC runtime. (#170441) The code for this commit was taken with minimal modification to fit LLVM style from llvm-project/orc-rt/include/CallableTraitsHelper.h and llvm-project/orc-rt/unittests/CallableTraitsHelperTest.cpp (originally commited in 40fce325011) CallableTraitsHelper identifies the return type and argument types of a callable type and passes those to an implementation class template to operate on. E.g. the CallableArgInfoImpl class exposes these types as typedefs. Porting CallableTraitsHelper from the new ORC runtime will allow us to simplify existing and upcoming "callable-traits" classes in ORC. --- .../Orc/CallableTraitsHelper.h | 74 +++++++++++++++++++ .../ExecutionEngine/Orc/CMakeLists.txt | 1 + .../Orc/CallableTraitsHelperTest.cpp | 70 ++++++++++++++++++ 3 files changed, 145 insertions(+) create mode 100644 llvm/include/llvm/ExecutionEngine/Orc/CallableTraitsHelper.h create mode 100644 llvm/unittests/ExecutionEngine/Orc/CallableTraitsHelperTest.cpp diff --git a/llvm/include/llvm/ExecutionEngine/Orc/CallableTraitsHelper.h b/llvm/include/llvm/ExecutionEngine/Orc/CallableTraitsHelper.h new file mode 100644 index 0000000000000..11bafa9745693 --- /dev/null +++ b/llvm/include/llvm/ExecutionEngine/Orc/CallableTraitsHelper.h @@ -0,0 +1,74 @@ +//===- CallableTraitsHelper.h - Callable arg/ret type extractor -*- C++ -*-===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// +// CallableTraitsHelper API. 
+// +//===----------------------------------------------------------------------===// + +#ifndef LLVM_EXECUTIONENGINE_ORC_CALLABLETRAITSHELPER_H +#define LLVM_EXECUTIONENGINE_ORC_CALLABLETRAITSHELPER_H + +#include +#include + +namespace llvm::orc { + +/// CallableTraitsHelper takes an implementation class template Impl and some +/// callable type C and passes the return and argument types of C to the Impl +/// class template. +/// +/// This can be used to simplify the implementation of classes that need to +/// operate on callable types. +template