diff --git a/bolt/docs/CommandLineArgumentReference.md b/bolt/docs/CommandLineArgumentReference.md
index 7c6e01d669b74..0dbf6f59d5e88 100644
--- a/bolt/docs/CommandLineArgumentReference.md
+++ b/bolt/docs/CommandLineArgumentReference.md
@@ -811,6 +811,15 @@
 
   Specify file name of the runtime instrumentation library
 
+- `--runtime-lib-init-hook=<value>`
+
+  Primary target for hooking runtime library initialization, used in
+  fallback order of availability in the input binary (entry_point -> init
+  -> init_array) (default: entry_point)
+  - `entry_point`: use ELF Header Entry Point
+  - `init`: use ELF DT_INIT entry
+  - `init_array`: use ELF .init_array entry
+
 - `--sctc-mode=<value>`
 
   Mode for simplify conditional tail calls
diff --git a/bolt/docs/PacRetDesign.md b/bolt/docs/PacRetDesign.md
index f3fe5fbd522cb..2e3cb7b91e0ce 100644
--- a/bolt/docs/PacRetDesign.md
+++ b/bolt/docs/PacRetDesign.md
@@ -200,15 +200,22 @@ This pass runs after optimizations. It performs the _inverse_ of MarkRAState pa
 
 Some BOLT passes can add new Instructions. In InsertNegateRAStatePass, we have
 to know what RA state these have.
 
-The current solution has the `inferUnknownStates` function to cover these, using
-a fairly simple strategy: unknown states inherit the last known state.
+> [!important]
+> As issue #160989 explains, unwind info is missing from stubs.
+> For this same reason, we cannot generate correct pac-specific unwind info: the
+> signedness of the _incorrect_ return address is meaningless.
 
-This will be updated to a more robust solution.
+Assignment of RAStates to newly generated instructions is done in `inferUnknownStates`.
+We have two different cases to cover:
 
-> [!important]
-> As issue #160989 describes, unwind info is incorrect in stubs with multiple callers.
-> For this same reason, we cannot generate correct pac-specific unwind info: the signess
-> of the _incorrect_ return address is meaningless.
+1. If a BasicBlock has some instructions with known RA state, and some without, we
+   can copy the RAState of known instructions to the unknown ones. As the control
+   flow only changes between BasicBlocks, instructions in the same BasicBlock have
+   the same return address. (The exception is noreturn calls, but these would only
+   cause problems if the newly inserted instruction is right after the call.)
+
+2. If a BasicBlock has no instructions with known RAState, we have to copy the
+   RAState of the previous BasicBlock in layout order.
 
 ### Optimizations requiring special attention
diff --git a/bolt/include/bolt/Core/BinaryContext.h b/bolt/include/bolt/Core/BinaryContext.h
index 2af1d330b7545..8a90febcea3cc 100644
--- a/bolt/include/bolt/Core/BinaryContext.h
+++ b/bolt/include/bolt/Core/BinaryContext.h
@@ -807,6 +807,15 @@ class BinaryContext {
   /// the execution of the binary is completed.
   std::optional<uint64_t> FiniFunctionAddress;
 
+  /// DT_INIT.
+  std::optional<uint64_t> InitAddress;
+
+  /// DT_INIT_ARRAY. Only used when DT_INIT is not set.
+  std::optional<uint64_t> InitArrayAddress;
+
+  /// DT_INIT_ARRAYSZ. Only used when DT_INIT is not set.
+  std::optional<uint64_t> InitArraySize;
+
   /// DT_FINI.
   std::optional<uint64_t> FiniAddress;
diff --git a/bolt/include/bolt/Passes/InsertNegateRAStatePass.h b/bolt/include/bolt/Passes/InsertNegateRAStatePass.h
index 836948bf5e9c0..3f003af96162d 100644
--- a/bolt/include/bolt/Passes/InsertNegateRAStatePass.h
+++ b/bolt/include/bolt/Passes/InsertNegateRAStatePass.h
@@ -1,4 +1,4 @@
-//===- bolt/Passes/InsertNegateRAStatePass.cpp ----------------------------===//
+//===- bolt/Passes/InsertNegateRAStatePass.h ------------------------------===//
 //
 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
 // See https://llvm.org/LICENSE.txt for license information.
@@ -30,9 +30,30 @@ class InsertNegateRAState : public BinaryFunctionPass {
 private:
   /// Because states are tracked as MCAnnotations on individual instructions,
   /// newly inserted instructions do not have a state associated with them.
-  /// New states are "inherited" from the last known state.
+  /// Uses fillUnknownStateInBB and fillUnknownStubs.
   void inferUnknownStates(BinaryFunction &BF);
 
+  /// Simple case: copy RAStates to unknown insts from previous inst.
+  /// If the first inst has unknown state, set it to the first known state.
+  /// Accounts for signing and authenticating insts.
+  void fillUnknownStateInBB(BinaryContext &BC, BinaryBasicBlock &BB);
+
+  /// Fill in RAState in BasicBlocks consisting entirely of new instructions.
+  /// As of #160989, we have to copy the RAState from the previous BB in the
+  /// layout, because CFIs are already incorrect here.
+  void fillUnknownStubs(BinaryFunction &BF);
+
+  /// Returns the first known RAState from \p BB, or std::nullopt if all are
+  /// unknown.
+  std::optional<bool> getFirstKnownRAState(BinaryContext &BC,
+                                           BinaryBasicBlock &BB);
+
+  /// Returns true if all instructions in \p BB have unknown RAState.
+  bool isUnknownBlock(BinaryContext &BC, BinaryBasicBlock &BB);
+
+  /// Set all instructions in \p BB to \p State.
+  void markUnknownBlock(BinaryContext &BC, BinaryBasicBlock &BB, bool State);
+
   /// Support for function splitting:
   /// if two consecutive BBs with Signed state are going to end up in different
   /// functions (so are held by different FunctionFragments), we have to add a
diff --git a/bolt/include/bolt/Rewrite/RewriteInstance.h b/bolt/include/bolt/Rewrite/RewriteInstance.h
index 35abf6b4d4ddd..5950b3c1630e1 100644
--- a/bolt/include/bolt/Rewrite/RewriteInstance.h
+++ b/bolt/include/bolt/Rewrite/RewriteInstance.h
@@ -93,14 +93,23 @@ class RewriteInstance {
   /// section allocations if found.
   void discoverBOLTReserved();
 
+  /// Check whether we should use DT_INIT or DT_INIT_ARRAY for instrumentation.
+  /// DT_INIT is preferred; DT_INIT_ARRAY is only used when no DT_INIT entry was
+  /// found.
+  Error discoverRtInitAddress();
+
   /// Check whether we should use DT_FINI or DT_FINI_ARRAY for instrumentation.
   /// DT_FINI is preferred; DT_FINI_ARRAY is only used when no DT_FINI entry was
   /// found.
   Error discoverRtFiniAddress();
 
+  /// If DT_INIT_ARRAY is used for instrumentation, update the relocation of its
+  /// first entry to point to the instrumentation library's init address.
+  Error updateRtInitReloc();
+
   /// If DT_FINI_ARRAY is used for instrumentation, update the relocation of its
   /// first entry to point to the instrumentation library's fini address.
-  void updateRtFiniReloc();
+  Error updateRtFiniReloc();
 
   /// Create and initialize metadata rewriters for this instance.
   void initializeMetadataManager();
diff --git a/bolt/lib/Passes/Inliner.cpp b/bolt/lib/Passes/Inliner.cpp
index 5a7d02a34b4d8..0740fcef9102b 100644
--- a/bolt/lib/Passes/Inliner.cpp
+++ b/bolt/lib/Passes/Inliner.cpp
@@ -491,6 +491,32 @@ bool Inliner::inlineCallsInFunction(BinaryFunction &Function) {
         }
       }
 
+      // AArch64 BTI:
+      // If the callee has an indirect tailcall (BR), we would transform it to
+      // an indirect call (BLR) in InlineCall. Because of this, we would have to
+      // update the BTI at the target of the tailcall. However, these targets
+      // are not known. Instead, we skip inlining blocks with indirect
+      // tailcalls.
+      auto HasIndirectTailCall = [&](const BinaryFunction &BF) -> bool {
+        for (const auto &BB : BF) {
+          for (const auto &II : BB) {
+            if (BC.MIB->isIndirectBranch(II) && BC.MIB->isTailCall(II)) {
+              return true;
+            }
+          }
+        }
+        return false;
+      };
+
+      if (BC.isAArch64() && BC.usesBTI() &&
+          HasIndirectTailCall(*TargetFunction)) {
+        ++InstIt;
+        LLVM_DEBUG(dbgs() << "BOLT-DEBUG: Skipping inlining block with tailcall"
+                          << " in " << Function << " : " << BB->getName()
+                          << " to keep BTIs consistent.\n");
+        continue;
+      }
+
       LLVM_DEBUG(dbgs() << "BOLT-DEBUG: inlining call to " << *TargetFunction
                         << " in " << Function << " : " << BB->getName()
                         << ". Count: " << BB->getKnownExecutionCount()
diff --git a/bolt/lib/Passes/InsertNegateRAStatePass.cpp b/bolt/lib/Passes/InsertNegateRAStatePass.cpp
index 775b7795e77c5..ed4de8a56f89f 100644
--- a/bolt/lib/Passes/InsertNegateRAStatePass.cpp
+++ b/bolt/lib/Passes/InsertNegateRAStatePass.cpp
@@ -52,8 +52,8 @@ void InsertNegateRAState::runOnFunction(BinaryFunction &BF) {
       MCInst &Inst = *It;
       if (BC.MIB->isCFI(Inst))
         continue;
-      auto RAState = BC.MIB->getRAState(Inst);
-      if (!RAState) {
+      std::optional<bool> RAState = BC.MIB->getRAState(Inst);
+      if (!RAState.has_value()) {
         BC.errs() << "BOLT-ERROR: unknown RAState after inferUnknownStates "
                   << " in function " << BF.getPrintName() << "\n";
         PassFailed = true;
@@ -74,6 +74,20 @@ void InsertNegateRAState::runOnFunction(BinaryFunction &BF) {
   }
 }
 
+void InsertNegateRAState::inferUnknownStates(BinaryFunction &BF) {
+  BinaryContext &BC = BF.getBinaryContext();
+
+  // Fill in missing RAStates in simple cases (inside BBs).
+  for (BinaryBasicBlock &BB : BF) {
+    fillUnknownStateInBB(BC, BB);
+  }
+  // BasicBlocks which are made entirely of "new instructions" (instructions
+  // without RAState annotation) are stubs, and do not have correct unwind info.
+  // We should iterate in layout order and fill them based on previous known
+  // RAState.
+  fillUnknownStubs(BF);
+}
+
 void InsertNegateRAState::coverFunctionFragmentStart(BinaryFunction &BF,
                                                      FunctionFragment &FF) {
   BinaryContext &BC = BF.getBinaryContext();
@@ -92,8 +106,8 @@ void InsertNegateRAState::coverFunctionFragmentStart(BinaryFunction &BF,
   // If a function is already split in the input, the first FF can also start
   // with Signed state. This covers that scenario as well.
   auto II = (*FirstNonEmpty)->getFirstNonPseudo();
-  auto RAState = BC.MIB->getRAState(*II);
-  if (!RAState) {
+  std::optional<bool> RAState = BC.MIB->getRAState(*II);
+  if (!RAState.has_value()) {
     BC.errs() << "BOLT-ERROR: unknown RAState after inferUnknownStates "
               << " in function " << BF.getPrintName() << "\n";
     PassFailed = true;
@@ -104,32 +118,119 @@ void InsertNegateRAState::coverFunctionFragmentStart(BinaryFunction &BF,
       MCCFIInstruction::createNegateRAState(nullptr));
 }
 
-void InsertNegateRAState::inferUnknownStates(BinaryFunction &BF) {
+std::optional<bool>
+InsertNegateRAState::getFirstKnownRAState(BinaryContext &BC,
+                                          BinaryBasicBlock &BB) {
+  for (const MCInst &Inst : BB) {
+    if (BC.MIB->isCFI(Inst))
+      continue;
+    std::optional<bool> RAState = BC.MIB->getRAState(Inst);
+    if (RAState.has_value())
+      return RAState;
+  }
+  return std::nullopt;
+}
+
+bool InsertNegateRAState::isUnknownBlock(BinaryContext &BC,
+                                         BinaryBasicBlock &BB) {
+  std::optional<bool> FirstRAState = getFirstKnownRAState(BC, BB);
+  return !FirstRAState.has_value();
+}
+
+void InsertNegateRAState::fillUnknownStateInBB(BinaryContext &BC,
+                                               BinaryBasicBlock &BB) {
+
+  auto First = BB.getFirstNonPseudo();
+  if (First == BB.end())
+    return;
+  // If the first instruction has unknown RAState, we should copy the first
+  // known RAState.
+  std::optional<bool> RAState = BC.MIB->getRAState(*First);
+  if (!RAState.has_value()) {
+    std::optional<bool> FirstRAState = getFirstKnownRAState(BC, BB);
+    if (!FirstRAState.has_value())
+      // We fill unknown BBs later.
+      return;
+
+    BC.MIB->setRAState(*First, *FirstRAState);
+  }
+
+  // At this point we know the RAState of the first instruction,
+  // so we can propagate the RAStates to all subsequent unknown instructions.
+  MCInst Prev = *First;
+  for (auto It = First + 1; It != BB.end(); ++It) {
+    MCInst &Inst = *It;
+    if (BC.MIB->isCFI(Inst))
+      continue;
+
+    // No need to check for nullopt: we only entered this loop after the first
+    // instruction had its RAState set, and RAState is always set for the
+    // previous instruction in the previous iteration of the loop.
+    std::optional<bool> PrevRAState = BC.MIB->getRAState(Prev);
+
+    std::optional<bool> RAState = BC.MIB->getRAState(Inst);
+    if (!RAState.has_value()) {
+      if (BC.MIB->isPSignOnLR(Prev))
+        PrevRAState = true;
+      else if (BC.MIB->isPAuthOnLR(Prev))
+        PrevRAState = false;
+      BC.MIB->setRAState(Inst, *PrevRAState);
+    }
+    Prev = Inst;
+  }
+}
+
+void InsertNegateRAState::markUnknownBlock(BinaryContext &BC,
+                                           BinaryBasicBlock &BB, bool State) {
+  // If we call this when an Instruction has either kRASigned or kRAUnsigned
+  // annotation, setRASigned or setRAUnsigned would fail.
+  assert(isUnknownBlock(BC, BB) &&
+         "markUnknownBlock should only be called on unknown blocks");
+  for (MCInst &Inst : BB) {
+    if (BC.MIB->isCFI(Inst))
+      continue;
+    BC.MIB->setRAState(Inst, State);
+  }
+}
+
+void InsertNegateRAState::fillUnknownStubs(BinaryFunction &BF) {
   BinaryContext &BC = BF.getBinaryContext();
   bool FirstIter = true;
   MCInst PrevInst;
-  for (BinaryBasicBlock &BB : BF) {
-    for (MCInst &Inst : BB) {
-      if (BC.MIB->isCFI(Inst))
-        continue;
+  for (FunctionFragment &FF : BF.getLayout().fragments()) {
+    for (BinaryBasicBlock *BB : FF) {
+      if (FirstIter) {
+        FirstIter = false;
+        if (isUnknownBlock(BC, *BB))
+          // If the first BasicBlock is unknown, the function's entry RAState
+          // should be used.
+          markUnknownBlock(BC, *BB, BF.getInitialRAState());
+      } else if (isUnknownBlock(BC, *BB)) {
+        // As explained in issue #160989, the unwind info is incorrect for
+        // stubs. Indicating the correct RAState without the rest of the unwind
+        // info being correct is not useful. Instead, we copy the RAState from
+        // the previous instruction.
+        std::optional<bool> PrevRAState = BC.MIB->getRAState(PrevInst);
+        if (!PrevRAState.has_value()) {
+          // No non-cfi instruction encountered in the function yet.
+          // This means the RAState is the same as at the function entry.
+          markUnknownBlock(BC, *BB, BF.getInitialRAState());
+          continue;
+        }
 
-      auto RAState = BC.MIB->getRAState(Inst);
-      if (!FirstIter && !RAState) {
         if (BC.MIB->isPSignOnLR(PrevInst))
-          RAState = true;
+          PrevRAState = true;
         else if (BC.MIB->isPAuthOnLR(PrevInst))
-          RAState = false;
-        else {
-          auto PrevRAState = BC.MIB->getRAState(PrevInst);
-          RAState = PrevRAState ? *PrevRAState : false;
-        }
-        BC.MIB->setRAState(Inst, *RAState);
-      } else {
-        FirstIter = false;
-        if (!RAState)
-          BC.MIB->setRAState(Inst, BF.getInitialRAState());
+          PrevRAState = false;
+        markUnknownBlock(BC, *BB, *PrevRAState);
       }
-      PrevInst = Inst;
+      // This function iterates on BasicBlocks, so the PrevInst has to be
+      // updated to the last instruction of the current BasicBlock. If the
+      // BasicBlock is empty, or only has PseudoInstructions, PrevInst will not
+      // be updated.
+      auto Last = BB->getLastNonPseudo();
+      if (Last != BB->rend())
+        PrevInst = *Last;
     }
   }
 }
diff --git a/bolt/lib/Rewrite/RewriteInstance.cpp b/bolt/lib/Rewrite/RewriteInstance.cpp
index 8a5bbe28e5f66..1c6244b2d2bf8 100644
--- a/bolt/lib/Rewrite/RewriteInstance.cpp
+++ b/bolt/lib/Rewrite/RewriteInstance.cpp
@@ -294,6 +294,28 @@ cl::bits<GadgetScannerKind> GadgetScannersToRun(
         clEnumValN(GS_ALL, "all", "All implemented scanners")),
     cl::ZeroOrMore, cl::CommaSeparated, cl::cat(BinaryAnalysisCategory));
 
+// Primary targets for hooking runtime library initialization, with fallback
+// to the next item in case the current one is not available in the input
+// binary.
+enum RuntimeLibInitHookTarget : char {
+  RLIH_ENTRY_POINT = 0, /// Use ELF Header Entry Point
+  RLIH_INIT = 1,        /// Use ELF DT_INIT entry
+  RLIH_INIT_ARRAY = 2,  /// Use ELF .init_array entry
+};
+
+cl::opt<RuntimeLibInitHookTarget> RuntimeLibInitHook(
+    "runtime-lib-init-hook",
+    cl::desc("Primary target for hooking runtime library initialization, used "
+             "in fallback order of availability in the input binary "
+             "(entry_point -> init -> init_array) (default: entry_point)"),
+    cl::Hidden, cl::init(RLIH_ENTRY_POINT),
+    cl::values(clEnumValN(RLIH_ENTRY_POINT, "entry_point",
+                          "use ELF Header Entry Point"),
+               clEnumValN(RLIH_INIT, "init", "use ELF DT_INIT entry"),
+               clEnumValN(RLIH_INIT_ARRAY, "init_array",
+                          "use ELF .init_array entry")),
+    cl::ZeroOrMore, cl::cat(BoltOptCategory));
+
 } // namespace opts
 
 // FIXME: implement a better way to mark sections for replacement.
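The enum values above are deliberately ordered so that the fallback chain (entry_point -> init -> init_array) can be expressed with plain `<=` comparisons, which is exactly what `discoverRtInitAddress` below does. A minimal standalone sketch of that selection logic, with hypothetical `HasInterp`/`HasInit`/`HasInitArray` flags standing in for the checks on `BinaryContext`:

#include <cassert>

enum Target : char { EntryPoint = 0, Init = 1, InitArray = 2 };

// Sketch only: pick the hook mechanism the same way the rewriter does,
// falling through to the next mechanism when the preferred one is not
// available in the input binary.
Target selectInitHook(Target Primary, bool HasInterp, bool HasInit,
                      bool HasInitArray) {
  if (HasInterp && Primary == EntryPoint)
    return EntryPoint; // patch e_entry in the ELF header
  if (HasInit && Primary <= Init)
    return Init; // patch the DT_INIT dynamic entry
  assert(HasInitArray && "need DT_INIT or DT_INIT_ARRAY");
  return InitArray; // patch slot 0 of .init_array
}

A requested `entry_point` target therefore also accepts `init` and `init_array` as fallbacks, while `init_array` accepts nothing earlier in the chain.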
@@ -741,9 +763,12 @@ Error RewriteInstance::run() {
   adjustCommandLineOptions();
   discoverFileObjects();
 
-  if (opts::Instrument && !BC->IsStaticExecutable)
+  if (opts::Instrument && !BC->IsStaticExecutable) {
+    if (Error E = discoverRtInitAddress())
+      return E;
     if (Error E = discoverRtFiniAddress())
       return E;
+  }
 
   preprocessProfileData();
 
@@ -785,8 +810,12 @@ Error RewriteInstance::run() {
 
   updateMetadata();
 
-  if (opts::Instrument && !BC->IsStaticExecutable)
-    updateRtFiniReloc();
+  if (opts::Instrument && !BC->IsStaticExecutable) {
+    if (Error E = updateRtInitReloc())
+      return E;
+    if (Error E = updateRtFiniReloc())
+      return E;
+  }
 
   if (opts::OutputFilename == "/dev/null") {
     BC->outs() << "BOLT-INFO: skipping writing final binary to disk\n";
@@ -1411,6 +1440,65 @@ void RewriteInstance::discoverBOLTReserved() {
   NextAvailableAddress = BC->BOLTReserved.start();
 }
 
+Error RewriteInstance::discoverRtInitAddress() {
+  if (BC->HasInterpHeader && opts::RuntimeLibInitHook == opts::RLIH_ENTRY_POINT)
+    return Error::success();
+
+  // Use DT_INIT if it's available.
+  if (BC->InitAddress && opts::RuntimeLibInitHook <= opts::RLIH_INIT) {
+    BC->StartFunctionAddress = BC->InitAddress;
+    return Error::success();
+  }
+
+  if (!BC->InitArrayAddress || !BC->InitArraySize) {
+    return createStringError(std::errc::not_supported,
+                             "Instrumentation of shared library needs either "
+                             "DT_INIT or DT_INIT_ARRAY");
+  }
+
+  if (*BC->InitArraySize < BC->AsmInfo->getCodePointerSize()) {
+    return createStringError(std::errc::not_supported,
+                             "Need at least 1 DT_INIT_ARRAY slot");
+  }
+
+  ErrorOr<BinarySection &> InitArraySection =
+      BC->getSectionForAddress(*BC->InitArrayAddress);
+  if (auto EC = InitArraySection.getError())
+    return errorCodeToError(EC);
+
+  if (InitArraySection->getAddress() != *BC->InitArrayAddress) {
+    return createStringError(std::errc::not_supported,
+                             "Inconsistent address of .init_array section");
+  }
+
+  if (const Relocation *Reloc = InitArraySection->getDynamicRelocationAt(0)) {
+    if (Reloc->isRelative()) {
+      BC->StartFunctionAddress = Reloc->Addend;
+    } else {
+      MCSymbol *Sym = Reloc->Symbol;
+      if (!Sym)
+        return createStringError(
+            std::errc::not_supported,
+            "Failed to locate symbol for 0 entry of .init_array");
+      const BinaryFunction *BF = BC->getFunctionForSymbol(Sym);
+      if (!BF)
+        return createStringError(
+            std::errc::not_supported,
+            "Failed to locate binary function for 0 entry of .init_array");
+      BC->StartFunctionAddress = BF->getAddress() + Reloc->Addend;
+    }
+    return Error::success();
+  }
+
+  if (const Relocation *Reloc = InitArraySection->getRelocationAt(0)) {
+    BC->StartFunctionAddress = Reloc->Value;
+    return Error::success();
+  }
+
+  return createStringError(std::errc::not_supported,
+                           "No relocation for first DT_INIT_ARRAY slot");
+}
+
 Error RewriteInstance::discoverRtFiniAddress() {
   // Use DT_FINI if it's available.
   if (BC->FiniAddress) {
@@ -1434,6 +1522,11 @@ Error RewriteInstance::discoverRtFiniAddress() {
   if (auto EC = FiniArraySection.getError())
     return errorCodeToError(EC);
 
+  if (FiniArraySection->getAddress() != *BC->FiniArrayAddress) {
+    return createStringError(std::errc::not_supported,
+                             "Inconsistent address of .fini_array section");
+  }
+
   if (const Relocation *Reloc = FiniArraySection->getDynamicRelocationAt(0)) {
     BC->FiniFunctionAddress = Reloc->Addend;
     return Error::success();
@@ -1448,26 +1541,95 @@ Error RewriteInstance::discoverRtFiniAddress() {
                            "No relocation for first DT_FINI_ARRAY slot");
 }
 
-void RewriteInstance::updateRtFiniReloc() {
+Error RewriteInstance::updateRtInitReloc() {
+  if (BC->HasInterpHeader && opts::RuntimeLibInitHook == opts::RLIH_ENTRY_POINT)
+    return Error::success();
+
+  // Updating DT_INIT is handled by patchELFDynamic.
+  if (BC->InitAddress && opts::RuntimeLibInitHook <= opts::RLIH_INIT)
+    return Error::success();
+
+  const RuntimeLibrary *RT = BC->getRuntimeLibrary();
+  if (!RT || !RT->getRuntimeStartAddress())
+    return Error::success();
+
+  if (!BC->InitArrayAddress)
+    return Error::success();
+
+  if (!BC->InitArrayAddress || !BC->InitArraySize)
+    return createStringError(std::errc::not_supported,
+                             "inconsistent .init_array state");
+
+  ErrorOr<BinarySection &> InitArraySection =
+      BC->getSectionForAddress(*BC->InitArrayAddress);
+  if (!InitArraySection)
+    return createStringError(std::errc::not_supported, ".init_array removed");
+
+  if (std::optional<Relocation> Reloc =
+          InitArraySection->takeDynamicRelocationAt(0)) {
+    if (Reloc->isRelative()) {
+      if (Reloc->Addend != BC->StartFunctionAddress)
+        return createStringError(std::errc::not_supported,
+                                 "inconsistent .init_array dynamic relocation");
+      Reloc->Addend = RT->getRuntimeStartAddress();
+      InitArraySection->addDynamicRelocation(*Reloc);
+    } else {
+      MCSymbol *Sym = Reloc->Symbol;
+      if (!Sym)
+        return createStringError(
+            std::errc::not_supported,
+            "Failed to locate symbol for 0 entry of .init_array");
+      const BinaryFunction *BF = BC->getFunctionForSymbol(Sym);
+      if (!BF)
+        return createStringError(
+            std::errc::not_supported,
+            "Failed to locate binary function for 0 entry of .init_array");
+      if (BF->getAddress() + Reloc->Addend != BC->StartFunctionAddress)
+        return createStringError(std::errc::not_supported,
+                                 "inconsistent .init_array dynamic relocation");
+      InitArraySection->addDynamicRelocation(Relocation{
+          /*Offset*/ 0, /*Symbol*/ nullptr, /*Type*/ Relocation::getAbs64(),
+          /*Addend*/ RT->getRuntimeStartAddress(), /*Value*/ 0});
+    }
+  }
+  // Update the static relocation by adding a pending relocation which will get
+  // patched when flushPendingRelocations is called in rewriteFile. Note that
+  // flushPendingRelocations will calculate the value to patch as
+  // "Symbol + Addend". Since we don't have a symbol, just set the addend to the
+  // desired value.
+  InitArraySection->addPendingRelocation(Relocation{
+      /*Offset*/ 0, /*Symbol*/ nullptr, /*Type*/ Relocation::getAbs64(),
+      /*Addend*/ RT->getRuntimeStartAddress(), /*Value*/ 0});
+  BC->outs()
+      << "BOLT-INFO: runtime library initialization was hooked via .init_array "
+         "entry, set to 0x"
+      << Twine::utohexstr(RT->getRuntimeStartAddress()) << "\n";
+  return Error::success();
+}
+
+Error RewriteInstance::updateRtFiniReloc() {
   // Updating DT_FINI is handled by patchELFDynamic.
   if (BC->FiniAddress)
-    return;
+    return Error::success();
 
   const RuntimeLibrary *RT = BC->getRuntimeLibrary();
   if (!RT || !RT->getRuntimeFiniAddress())
-    return;
+    return Error::success();
 
-  assert(BC->FiniArrayAddress && BC->FiniArraySize &&
-         "inconsistent .fini_array state");
+  if (!BC->FiniArrayAddress || !BC->FiniArraySize)
+    return createStringError(std::errc::not_supported,
+                             "inconsistent .fini_array state");
 
   ErrorOr<BinarySection &> FiniArraySection =
       BC->getSectionForAddress(*BC->FiniArrayAddress);
-  assert(FiniArraySection && ".fini_array removed");
+  if (!FiniArraySection)
+    return createStringError(std::errc::not_supported, ".fini_array removed");
 
   if (std::optional<Relocation> Reloc =
           FiniArraySection->takeDynamicRelocationAt(0)) {
-    assert(Reloc->Addend == BC->FiniFunctionAddress &&
-           "inconsistent .fini_array dynamic relocation");
+    if (Reloc->Addend != BC->FiniFunctionAddress)
+      return createStringError(std::errc::not_supported,
+                               "inconsistent .fini_array dynamic relocation");
     Reloc->Addend = RT->getRuntimeFiniAddress();
     FiniArraySection->addDynamicRelocation(*Reloc);
   }
@@ -1480,6 +1642,10 @@ Error RewriteInstance::updateRtFiniReloc() {
   FiniArraySection->addPendingRelocation(Relocation{
       /*Offset*/ 0, /*Symbol*/ nullptr, /*Type*/ Relocation::getAbs64(),
       /*Addend*/ RT->getRuntimeFiniAddress(), /*Value*/ 0});
+  BC->outs() << "BOLT-INFO: runtime library finalization was hooked via "
+                ".fini_array entry, set to 0x"
+             << Twine::utohexstr(RT->getRuntimeFiniAddress()) << "\n";
+  return Error::success();
 }
 
 void RewriteInstance::registerFragments() {
@@ -2178,6 +2344,14 @@ void RewriteInstance::adjustCommandLineOptions() {
     exit(1);
   }
 
+  if (opts::Instrument && opts::RuntimeLibInitHook == opts::RLIH_ENTRY_POINT &&
+      !BC->HasInterpHeader) {
+    BC->errs()
+        << "BOLT-WARNING: adjusted runtime-lib-init-hook to 'init' due to "
+           "absence of INTERP header\n";
+    opts::RuntimeLibInitHook = opts::RLIH_INIT;
+  }
+
   if (opts::HotText && opts::HotTextMoveSections.getNumOccurrences() == 0) {
     opts::HotTextMoveSections.addValue(".stub");
     opts::HotTextMoveSections.addValue(".mover");
@@ -4849,9 +5023,14 @@ void RewriteInstance::patchELFSectionHeaderTable(ELFObjectFile<ELFT> *File) {
   ELFEhdrTy NewEhdr = Obj.getHeader();
 
   if (BC->HasRelocations) {
-    if (RuntimeLibrary *RtLibrary = BC->getRuntimeLibrary())
+    RuntimeLibrary *RtLibrary = BC->getRuntimeLibrary();
+    if (RtLibrary && opts::RuntimeLibInitHook == opts::RLIH_ENTRY_POINT) {
       NewEhdr.e_entry = RtLibrary->getRuntimeStartAddress();
-    else
+      BC->outs()
+          << "BOLT-INFO: runtime library initialization was hooked via ELF "
+             "Header Entry Point, set to 0x"
+          << Twine::utohexstr(NewEhdr.e_entry) << "\n";
+    } else
       NewEhdr.e_entry = getNewFunctionAddress(NewEhdr.e_entry);
     assert((NewEhdr.e_entry || !Obj.getHeader().e_entry) &&
            "cannot find new address for entry point");
@@ -5692,14 +5871,23 @@ void RewriteInstance::patchELFDynamic(ELFObjectFile<ELFT> *File) {
     }
     RuntimeLibrary *RtLibrary = BC->getRuntimeLibrary();
     if (RtLibrary && Dyn.getTag() == ELF::DT_FINI) {
-      if (uint64_t Addr = RtLibrary->getRuntimeFiniAddress())
+      if (uint64_t Addr = RtLibrary->getRuntimeFiniAddress()) {
         NewDE.d_un.d_ptr = Addr;
+        BC->outs()
+            << "BOLT-INFO: runtime library finalization was hooked via "
+               "DT_FINI, set to 0x"
+            << Twine::utohexstr(Addr) << "\n";
+      }
     }
-    if (RtLibrary && Dyn.getTag() == ELF::DT_INIT && !BC->HasInterpHeader) {
+    if (RtLibrary && Dyn.getTag() == ELF::DT_INIT &&
+        (!BC->HasInterpHeader ||
+         opts::RuntimeLibInitHook == opts::RLIH_INIT)) {
       if (auto Addr = RtLibrary->getRuntimeStartAddress()) {
-        LLVM_DEBUG(dbgs() << "BOLT-DEBUG: Set DT_INIT to 0x"
-                          << Twine::utohexstr(Addr) << '\n');
         NewDE.d_un.d_ptr = Addr;
+        BC->outs()
+            << "BOLT-INFO: runtime library initialization was hooked via "
+               "DT_INIT, set to 0x"
+            << Twine::utohexstr(Addr) << "\n";
       }
     }
     break;
@@ -5767,10 +5955,13 @@ Error RewriteInstance::readELFDynamic(ELFObjectFile<ELFT> *File) {
   for (const Elf_Dyn &Dyn : DynamicEntries) {
     switch (Dyn.d_tag) {
     case ELF::DT_INIT:
-      if (!BC->HasInterpHeader) {
-        LLVM_DEBUG(dbgs() << "BOLT-DEBUG: Set start function address\n");
-        BC->StartFunctionAddress = Dyn.getPtr();
-      }
+      BC->InitAddress = Dyn.getPtr();
+      break;
+    case ELF::DT_INIT_ARRAY:
+      BC->InitArrayAddress = Dyn.getPtr();
+      break;
+    case ELF::DT_INIT_ARRAYSZ:
+      BC->InitArraySize = Dyn.getPtr();
       break;
     case ELF::DT_FINI:
       BC->FiniAddress = Dyn.getPtr();
diff --git a/bolt/test/AArch64/hook-fini.s b/bolt/test/AArch64/hook-fini.s
index 4f321d463ef32..3bb95f9317b1b 100644
--- a/bolt/test/AArch64/hook-fini.s
+++ b/bolt/test/AArch64/hook-fini.s
@@ -15,13 +15,13 @@
 # RUN: %clang %cflags -pie %s -Wl,-q -o %t.exe
 # RUN: llvm-readelf -d %t.exe | FileCheck --check-prefix=DYN-FINI %s
 # RUN: llvm-readelf -r %t.exe | FileCheck --check-prefix=RELOC-PIE %s
-# RUN: llvm-bolt %t.exe -o %t --instrument
+# RUN: llvm-bolt %t.exe -o %t --instrument | FileCheck --check-prefix=CHECK-BOLT-RT-FINI %s
 # RUN: llvm-readelf -drs %t | FileCheck --check-prefix=CHECK-FINI %s
 
 # RUN: %clang %cflags -pie %s -Wl,-q,-fini=0 -o %t-no-fini.exe
 # RUN: llvm-readelf -d %t-no-fini.exe | FileCheck --check-prefix=DYN-NO-FINI %s
 # RUN: llvm-readelf -r %t-no-fini.exe | FileCheck --check-prefix=RELOC-PIE %s
-# RUN: llvm-bolt %t-no-fini.exe -o %t-no-fini --instrument
+# RUN: llvm-bolt %t-no-fini.exe -o %t-no-fini --instrument | FileCheck --check-prefix=CHECK-BOLT-RT-FINI-ARRAY %s
 # RUN: llvm-readelf -drs %t-no-fini | FileCheck --check-prefix=CHECK-NO-FINI %s
 # RUN: llvm-readelf -ds -x .fini_array %t-no-fini | FileCheck --check-prefix=CHECK-NO-FINI-RELOC %s
 
@@ -29,7 +29,7 @@
 # RUN: %clang %cflags %p/../Inputs/stub.c -fPIC -shared -o %t-stub.so
 # RUN: %clang %cflags %s -no-pie -Wl,-q,-fini=0 %t-stub.so -o %t-no-pie-no-fini.exe
 # RUN: llvm-readelf -r %t-no-pie-no-fini.exe | FileCheck --check-prefix=RELOC-NO-PIE %s
-# RUN: llvm-bolt %t-no-pie-no-fini.exe -o %t-no-pie-no-fini --instrument
+# RUN: llvm-bolt %t-no-pie-no-fini.exe -o %t-no-pie-no-fini --instrument | FileCheck --check-prefix=CHECK-BOLT-RT-FINI-ARRAY %s
 # RUN: llvm-readelf -ds -x .fini_array %t-no-pie-no-fini | FileCheck --check-prefix=CHECK-NO-PIE-NO-FINI %s
 
 ## With fini: dynamic section should contain DT_FINI
@@ -46,6 +46,14 @@
 ## Without PIE: binary should not have relative relocations
 # RELOC-NO-PIE-NOT: R_AARCH64_RELATIVE
 
+## Check BOLT output for the finalization hook (DT_FINI)
+# CHECK-BOLT-RT-FINI: runtime library finalization was hooked via DT_FINI
+# CHECK-BOLT-RT-FINI-NOT: runtime library finalization was hooked via .fini_array entry
+
+## Check BOLT output for the finalization hook (.fini_array entry)
+# CHECK-BOLT-RT-FINI-ARRAY-NOT: runtime library finalization was hooked via DT_FINI
+# CHECK-BOLT-RT-FINI-ARRAY: runtime library finalization was hooked via .fini_array entry
+
 ## Check that DT_FINI is set to __bolt_runtime_fini
 # CHECK-FINI: Dynamic section at offset {{.*}} contains {{.*}} entries:
 # CHECK-FINI-DAG: (FINI) 0x[[FINI:[[:xdigit:]]+]]
diff --git a/bolt/test/AArch64/hook-init.s b/bolt/test/AArch64/hook-init.s
new file mode 100644
index 0000000000000..a48328b630fa0
--- /dev/null
+++ b/bolt/test/AArch64/hook-init.s
@@ -0,0 +1,221 @@
+## Test the different ways of hooking the init function for instrumentation (via
+## entry point, DT_INIT and DT_INIT_ARRAY). We test the latter for both PIE
+## and non-PIE binaries, and for both executables and shared libraries, because
+## of the different ways of handling relocations (static or dynamic).
+## All tests perform the following steps:
+## - Compile and link for the case to be tested
+## - Some sanity-checks on the dynamic section and relocations in the binary to
+##   verify it has the shape we want for testing:
+##   - INTERP in Program Headers
+##   - DT_INIT or DT_INIT_ARRAY in dynamic section
+##   - No relative relocations for non-PIE
+## - Instrument (with extra --runtime-lib-init-hook=init/init_array options
+##   in some cases)
+## - Verify generated binary
+# REQUIRES: system-linux,bolt-runtime,target=aarch64{{.*}}
+
+# RUN: %clang %cflags -pie %s -Wl,-q -o %t.exe
+# RUN: llvm-readelf -d %t.exe | FileCheck --check-prefix=DYN-INIT %s
+# RUN: llvm-readelf -l %t.exe | FileCheck --check-prefix=PH-INTERP %s
+# RUN: llvm-readelf -r %t.exe | FileCheck --check-prefix=RELOC-PIE %s
+# RUN: llvm-bolt %t.exe -o %t --instrument | FileCheck --check-prefix=CHECK-BOLT-RT-EP %s
+# RUN: llvm-readelf -hdrs %t | FileCheck --check-prefix=CHECK-INIT-EP %s
+# RUN: llvm-bolt %t.exe -o %t-no-ep --instrument --runtime-lib-init-hook=init | FileCheck --check-prefix=CHECK-BOLT-RT-INIT %s
+# RUN: llvm-readelf -hdrs %t-no-ep | FileCheck --check-prefix=CHECK-INIT-NO-EP %s
+# RUN: llvm-bolt %t.exe -o %t-no-ep --instrument --runtime-lib-init-hook=init_array | FileCheck --check-prefix=CHECK-BOLT-RT-INIT-ARRAY %s
+# RUN: llvm-readelf -hdrs %t-no-ep | FileCheck --check-prefix=CHECK-INIT-ARRAY-NO-EP %s
+
+# RUN: %clang -shared %cflags -pie %s -Wl,-q -o %t-shared.exe
+# RUN: llvm-readelf -d %t-shared.exe | FileCheck --check-prefix=DYN-INIT %s
+# RUN: llvm-readelf -l %t-shared.exe | FileCheck --check-prefix=PH-INTERP-SHARED %s
+# RUN: llvm-readelf -r %t-shared.exe | FileCheck --check-prefix=RELOC-SHARED-PIE %s
+# RUN: llvm-bolt %t-shared.exe -o %t-shared --instrument | FileCheck --check-prefix=CHECK-BOLT-RT-INIT %s
+# RUN: llvm-readelf -hdrs %t-shared | FileCheck --check-prefix=CHECK-SHARED-INIT %s
+
+# RUN: %clang %cflags -pie %s -Wl,-q,-init=0 -o %t-no-init.exe
+# RUN: llvm-readelf -d %t-no-init.exe | FileCheck --check-prefix=DYN-NO-INIT %s
+# RUN: llvm-readelf -l %t-no-init.exe | FileCheck --check-prefix=PH-INTERP %s
+# RUN: llvm-readelf -r %t-no-init.exe | FileCheck --check-prefix=RELOC-PIE %s
+# RUN: llvm-bolt %t-no-init.exe -o %t-no-init --instrument | FileCheck --check-prefix=CHECK-BOLT-RT-EP %s
+# RUN: llvm-readelf -hdrs %t-no-init | FileCheck --check-prefix=CHECK-NO-INIT-EP %s
+# RUN: llvm-bolt %t-no-init.exe -o %t-no-init-no-ep --instrument --runtime-lib-init-hook=init | FileCheck --check-prefix=CHECK-BOLT-RT-INIT-ARRAY %s
+# RUN: llvm-readelf -hdrs %t-no-init-no-ep | FileCheck --check-prefix=CHECK-NO-INIT-NO-EP %s
+
+# RUN: %clang -shared %cflags -pie %s -Wl,-q,-init=0 -o %t-shared-no-init.exe
+# RUN: llvm-readelf -d %t-shared-no-init.exe | FileCheck --check-prefix=DYN-NO-INIT %s
+# RUN: llvm-readelf -l %t-shared-no-init.exe | FileCheck --check-prefix=PH-INTERP-SHARED %s
+# RUN: llvm-readelf -r %t-shared-no-init.exe | FileCheck --check-prefix=RELOC-SHARED-PIE %s
+# RUN: llvm-bolt %t-shared-no-init.exe -o %t-shared-no-init --instrument | FileCheck --check-prefix=CHECK-BOLT-RT-INIT-ARRAY %s
+# RUN: llvm-readelf -drs %t-shared-no-init | FileCheck --check-prefix=CHECK-SHARED-NO-INIT %s
+
+## Create a dummy shared library to link against to force creation of the dynamic section.
+# RUN: %clang %cflags %p/../Inputs/stub.c -fPIC -shared -o %t-stub.so
+# RUN: %clang %cflags %s -no-pie -Wl,-q,-init=0 %t-stub.so -o %t-no-pie-no-init.exe
+# RUN: llvm-readelf -r %t-no-pie-no-init.exe | FileCheck --check-prefix=RELOC-NO-PIE %s
+# RUN: llvm-bolt %t-no-pie-no-init.exe -o %t-no-pie-no-init --instrument | FileCheck --check-prefix=CHECK-BOLT-RT-EP %s
+# RUN: llvm-readelf -hds %t-no-pie-no-init | FileCheck --check-prefix=CHECK-NO-PIE-NO-INIT-EP %s
+
+## With init: dynamic section should contain DT_INIT
+# DYN-INIT: (INIT)
+
+## Without init: dynamic section should only contain DT_INIT_ARRAY
+# DYN-NO-INIT-NOT: (INIT)
+# DYN-NO-INIT: (INIT_ARRAY)
+# DYN-NO-INIT: (INIT_ARRAYSZ)
+
+## With interp program header (executable)
+# PH-INTERP: Program Headers:
+# PH-INTERP: INTERP
+
+## Without interp program header (shared library)
+# PH-INTERP-SHARED: Program Headers:
+# PH-INTERP-SHARED-NOT: INTERP
+
+## With PIE: binary should have relative relocations
+# RELOC-PIE: R_AARCH64_RELATIVE
+
+## Shared library: binary should have absolute relocations
+# RELOC-SHARED-PIE: R_AARCH64_ABS64
+
+## Without PIE: binary should not have relative relocations
+# RELOC-NO-PIE-NOT: R_AARCH64_RELATIVE
+
+## Check BOLT output for the initialization hook (ELF Header Entry Point)
+# CHECK-BOLT-RT-EP: runtime library initialization was hooked via ELF Header Entry Point
+# CHECK-BOLT-RT-EP-NOT: runtime library initialization was hooked via DT_INIT
+# CHECK-BOLT-RT-EP-NOT: runtime library initialization was hooked via .init_array entry
+
+## Check BOLT output for the initialization hook (DT_INIT)
+# CHECK-BOLT-RT-INIT-NOT: runtime library initialization was hooked via ELF Header Entry Point
+# CHECK-BOLT-RT-INIT: runtime library initialization was hooked via DT_INIT
+# CHECK-BOLT-RT-INIT-NOT: runtime library initialization was hooked via .init_array entry
+
+## Check BOLT output for the initialization hook (.init_array entry)
+# CHECK-BOLT-RT-INIT-ARRAY-NOT: runtime library initialization was hooked via ELF Header Entry Point
+# CHECK-BOLT-RT-INIT-ARRAY-NOT: runtime library initialization was hooked via DT_INIT
+# CHECK-BOLT-RT-INIT-ARRAY: runtime library initialization was hooked via .init_array entry
+
+## Check that entry point address is set to __bolt_runtime_start for PIE executable with DT_INIT
+# CHECK-INIT-EP: ELF Header:
+# CHECK-INIT-EP: Entry point address: 0x[[#%x,EP_ADDR:]]
+## Check that the dynamic relocation at .init and .init_array were not patched
+# CHECK-INIT-EP: Dynamic section at offset {{.*}} contains {{.*}} entries:
+# CHECK-INIT-EP-NOT: (INIT) 0x[[#%x, EP_ADDR]]
+# CHECK-INIT-EP-NOT: (INIT_ARRAY) 0x[[#%x, EP_ADDR]]
+## Check that the new entry point address points to __bolt_runtime_start
+# CHECK-INIT-EP: Symbol table '.symtab' contains {{.*}} entries:
+# CHECK-INIT-EP: {{0+}}[[#%x, EP_ADDR]] {{.*}} __bolt_runtime_start
+
+## Check that DT_INIT address is set to __bolt_runtime_start for PIE executable with DT_INIT
+# CHECK-INIT-NO-EP: ELF Header:
+# CHECK-INIT-NO-EP: Entry point address: 0x[[#%x,EP_ADDR:]]
+## Read Dynamic section DT_INIT and DT_INIT_ARRAY entries
+# CHECK-INIT-NO-EP: Dynamic section at offset {{.*}} contains {{.*}} entries:
+# CHECK-INIT-NO-EP-DAG: (INIT) 0x[[#%x,INIT:]]
+# CHECK-INIT-NO-EP-DAG: (INIT_ARRAY) 0x[[#%x,INIT_ARRAY:]]
+## Check if ELF entry point address points to _start symbol and new DT_INIT entry points to __bolt_runtime_start
+# CHECK-INIT-NO-EP: Symbol table '.symtab' contains {{.*}} entries:
+# CHECK-INIT-NO-EP-DAG: {{0+}}[[#%x, EP_ADDR]] {{.*}} _start
+# CHECK-INIT-NO-EP-DAG: {{0+}}[[#%x, INIT]] {{.*}} __bolt_runtime_start
+
+## Check that 1st entry of DT_INIT_ARRAY is set to __bolt_runtime_start and DT_INIT was not changed
+# CHECK-INIT-ARRAY-NO-EP: ELF Header:
+# CHECK-INIT-ARRAY-NO-EP: Entry point address: 0x[[#%x,EP_ADDR:]]
+## Read Dynamic section DT_INIT and DT_INIT_ARRAY entries
+# CHECK-INIT-ARRAY-NO-EP: Dynamic section at offset {{.*}} contains {{.*}} entries:
+# CHECK-INIT-ARRAY-NO-EP-DAG: (INIT) 0x[[#%x,INIT:]]
+# CHECK-INIT-ARRAY-NO-EP-DAG: (INIT_ARRAY) 0x[[#%x,INIT_ARRAY:]]
+## Read the dynamic relocation from 1st entry of .init_array
+# CHECK-INIT-ARRAY-NO-EP: Relocation section '.rela.dyn' at offset {{.*}} contains {{.*}} entries
+# CHECK-INIT-ARRAY-NO-EP: {{0+}}[[#%x,INIT_ARRAY]] {{.*}} R_AARCH64_RELATIVE [[#%x,INIT_ADDR:]]
+# CHECK-INIT-ARRAY-NO-EP-NOT: {{0+}}[[#%x,INIT_ARRAY]] {{.*}} R_AARCH64_RELATIVE [[#%x,INIT]]
+## Check that 1st entry of .init_array points to __bolt_runtime_start
+# CHECK-INIT-ARRAY-NO-EP: Symbol table '.symtab' contains {{.*}} entries:
+# CHECK-INIT-ARRAY-NO-EP-DAG: {{0+}}[[#%x, EP_ADDR]] {{.*}} _start
+# CHECK-INIT-ARRAY-NO-EP-DAG: {{[0-9]*}}: {{0+}}[[#%x, INIT_ADDR]] {{.*}} __bolt_runtime_start
+
+## Check that entry point address is set to __bolt_runtime_start for PIE executable without DT_INIT
+# CHECK-NO-INIT-EP: ELF Header:
+# CHECK-NO-INIT-EP: Entry point address: 0x[[#%x,EP_ADDR:]]
+## Check that the dynamic relocation at .init and .init_array were not patched
+# CHECK-NO-INIT-EP: Dynamic section at offset {{.*}} contains {{.*}} entries:
+# CHECK-NO-INIT-EP-NOT: (INIT) 0x[[#%x, EP_ADDR]]
+# CHECK-NO-INIT-EP-NOT: (INIT_ARRAY) 0x[[#%x, EP_ADDR]]
+## Check that the new entry point address points to __bolt_runtime_start
+# CHECK-NO-INIT-EP: Symbol table '.symtab' contains {{.*}} entries:
+# CHECK-NO-INIT-EP: {{0+}}[[#%x, EP_ADDR]] {{.*}} __bolt_runtime_start
+
+## Check that DT_INIT is set to __bolt_runtime_start for shared library with DT_INIT
+# CHECK-SHARED-INIT: Dynamic section at offset {{.*}} contains {{.*}} entries:
+# CHECK-SHARED-INIT-DAG: (INIT) 0x[[#%x, INIT:]]
+# CHECK-SHARED-INIT-DAG: (INIT_ARRAY) 0x[[#%x, INIT_ARRAY:]]
+## Check that the dynamic relocation at .init_array was not patched
+# CHECK-SHARED-INIT: Relocation section '.rela.dyn' at offset {{.*}} contains {{.*}} entries
+# CHECK-SHARED-INIT-NOT: {{0+}}[[#%x, INIT_ARRAY]] {{.*}} R_AARCH64_ABS64 {{0+}}[[#%x, INIT]]
+## Check that dynamic section DT_INIT points to __bolt_runtime_start
+# CHECK-SHARED-INIT: Symbol table '.symtab' contains {{.*}} entries:
+# CHECK-SHARED-INIT: {{0+}}[[#%x, INIT]] {{.*}} __bolt_runtime_start
+
+## Check that entry point address is set to __bolt_runtime_start for PIE executable without DT_INIT
+# CHECK-NO-INIT-NO-EP: ELF Header:
+# CHECK-NO-INIT-NO-EP: Entry point address: 0x[[#%x,EP_ADDR:]]
+# CHECK-NO-INIT-NO-EP: Dynamic section at offset {{.*}} contains {{.*}} entries:
+# CHECK-NO-INIT-NO-EP-NOT: (INIT)
+# CHECK-NO-INIT-NO-EP: (INIT_ARRAY) 0x[[#%x,INIT_ARRAY:]]
+## Read the dynamic relocation from 1st entry of .init_array
+# CHECK-NO-INIT-NO-EP: Relocation section '.rela.dyn' at offset {{.*}} contains {{.*}} entries
+# CHECK-NO-INIT-NO-EP: {{0+}}[[#%x,INIT_ARRAY]] {{.*}} R_AARCH64_RELATIVE [[#%x,INIT_ADDR:]]
+## Check that 1st entry of .init_array points to __bolt_runtime_start
+# CHECK-NO-INIT-NO-EP: Symbol table '.symtab' contains {{.*}} entries:
+# CHECK-NO-INIT-NO-EP-DAG: {{0+}}[[#%x, EP_ADDR]] {{.*}} _start
+# CHECK-NO-INIT-NO-EP-DAG: {{[0-9]*}}: {{0+}}[[#%x, INIT_ADDR]] {{.*}} __bolt_runtime_start
+
+## Check that entry point address is set to __bolt_runtime_start for shared library without DT_INIT
+# CHECK-SHARED-NO-INIT: Dynamic section at offset {{.*}} contains {{.*}} entries:
+# CHECK-SHARED-NO-INIT-NOT: (INIT)
+# CHECK-SHARED-NO-INIT: (INIT_ARRAY) 0x[[#%x,INIT_ARRAY:]]
+## Read the dynamic relocation from 1st entry of .init_array
+# CHECK-SHARED-NO-INIT: Relocation section '.rela.dyn' at offset {{.*}} contains {{.*}} entries
+# CHECK-SHARED-NO-INIT: {{0+}}[[#%x, INIT_ARRAY]] {{.*}} R_AARCH64_ABS64 [[#%x,INIT_ADDR:]]
+## Check that 1st entry of .init_array points to __bolt_runtime_start
+# CHECK-SHARED-NO-INIT: Symbol table '.symtab' contains {{.*}} entries:
+# CHECK-SHARED-NO-INIT: {{[0-9]*}}: {{0+}}[[#%x, INIT_ADDR]] {{.*}} __bolt_runtime_start
+
+## Check that entry point address is set to __bolt_runtime_start for non-PIE executable with DT_INIT
+# CHECK-NO-PIE-NO-INIT-EP: ELF Header:
+# CHECK-NO-PIE-NO-INIT-EP: Entry point address: 0x[[#%x,EP_ADDR:]]
+## Check that the dynamic relocation at .init and .init_array were not patched
+# CHECK-NO-PIE-NO-INIT-EP: Dynamic section at offset {{.*}} contains {{.*}} entries:
+# CHECK-NO-PIE-NO-INIT-EP-NOT: (INIT) 0x[[#%x, EP_ADDR]]
+# CHECK-NO-PIE-NO-INIT-EP-NOT: (INIT_ARRAY) 0x[[#%x, EP_ADDR]]
+## Check that the new entry point address points to __bolt_runtime_start
+# CHECK-NO-PIE-NO-INIT-EP: Symbol table '.symtab' contains {{.*}} entries:
+# CHECK-NO-PIE-NO-INIT-EP: {{0+}}[[#%x, EP_ADDR]] {{.*}} __bolt_runtime_start
+
+  .globl _start
+  .type _start, %function
+_start:
+  # Dummy relocation to force relocation mode.
+  .reloc 0, R_AARCH64_NONE
+  ret
+.size _start, .-_start
+
+  .globl _init
+  .type _init, %function
+_init:
+  ret
+  .size _init, .-_init
+
+  .globl _fini
+  .type _fini, %function
+_fini:
+  ret
+  .size _fini, .-_fini
+
+  .section .init_array,"aw"
+  .align 3
+  .dword _init
+
+  .section .fini_array,"aw"
+  .align 3
+  .dword _fini
diff --git a/bolt/test/AArch64/inline-bti-dbg.s b/bolt/test/AArch64/inline-bti-dbg.s
new file mode 100644
index 0000000000000..a0db4589d39ac
--- /dev/null
+++ b/bolt/test/AArch64/inline-bti-dbg.s
@@ -0,0 +1,40 @@
+# This test checks that for AArch64 binaries with BTI, we do not inline blocks with indirect tailcalls.
+# Same as inline-bti.s, but checks the debug output, and therefore requires assertions.
+
+# REQUIRES: system-linux, assertions
+
+# RUN: llvm-mc -filetype=obj -triple aarch64-unknown-unknown %s -o %t.o
+# RUN: %clang %cflags -O0 %t.o -o %t.exe -Wl,-q -Wl,-z,force-bti
+# RUN: llvm-bolt --inline-all %t.exe -o %t.bolt --debug 2>&1 | FileCheck %s
+
+# For BTI, we should not inline foo.
+# CHECK: BOLT-DEBUG: Skipping inlining block with tailcall in _Z3barP1A : .LBB01 to keep BTIs consistent.
+# CHECK-NOT: BOLT-INFO: inlined {{[0-9]+}} calls at {{[0-9]+}} call sites in {{[0-9]+}} iteration(s). Change in binary size: {{[0-9]+}} bytes.
+
+  .text
+  .globl _Z3fooP1A
+  .type _Z3fooP1A,@function
+_Z3fooP1A:
+  ldr x8, [x0]
+  ldr w0, [x8]
+  br x30
+  .size _Z3fooP1A, .-_Z3fooP1A
+
+  .globl _Z3barP1A
+  .type _Z3barP1A,@function
+_Z3barP1A:
+  stp x29, x30, [sp, #-16]!
+  mov x29, sp
+  bl _Z3fooP1A
+  mul w0, w0, w0
+  ldp x29, x30, [sp], #16
+  ret
+  .size _Z3barP1A, .-_Z3barP1A
+
+  .globl main
+  .p2align 2
+  .type main,@function
+main:
+  mov w0, wzr
+  ret
+  .size main, .-main
diff --git a/bolt/test/AArch64/inline-bti.s b/bolt/test/AArch64/inline-bti.s
new file mode 100644
index 0000000000000..62f6ea6f4b63a
--- /dev/null
+++ b/bolt/test/AArch64/inline-bti.s
@@ -0,0 +1,38 @@
+## This test checks that for AArch64 binaries with BTI, we do not inline blocks with indirect tailcalls.
+
+# REQUIRES: system-linux
+
+# RUN: llvm-mc -filetype=obj -triple aarch64-unknown-unknown %s -o %t.o
+# RUN: %clang %cflags -O0 %t.o -o %t.exe -Wl,-q -Wl,-z,force-bti
+# RUN: llvm-bolt --inline-all %t.exe -o %t.bolt | FileCheck %s
+
+# For BTI, we should not inline foo.
+# CHECK-NOT: BOLT-INFO: inlined {{[0-9]+}} calls at {{[0-9]+}} call sites in {{[0-9]+}} iteration(s). Change in binary size: {{[0-9]+}} bytes.
+
+  .text
+  .globl _Z3fooP1A
+  .type _Z3fooP1A,@function
+_Z3fooP1A:
+  ldr x8, [x0]
+  ldr w0, [x8]
+  br x30
+  .size _Z3fooP1A, .-_Z3fooP1A
+
+  .globl _Z3barP1A
+  .type _Z3barP1A,@function
+_Z3barP1A:
+  stp x29, x30, [sp, #-16]!
+  mov x29, sp
+  bl _Z3fooP1A
+  mul w0, w0, w0
+  ldp x29, x30, [sp], #16
+  ret
+  .size _Z3barP1A, .-_Z3barP1A
+
+  .globl main
+  .p2align 2
+  .type main,@function
+main:
+  mov w0, wzr
+  ret
+  .size main, .-main
diff --git a/bolt/test/X86/hook-init.s b/bolt/test/X86/hook-init.s
new file mode 100644
index 0000000000000..3184541f040b9
--- /dev/null
+++ b/bolt/test/X86/hook-init.s
@@ -0,0 +1,221 @@
+## Test the different ways of hooking the init function for instrumentation (via
+## entry point, DT_INIT and DT_INIT_ARRAY). We test the latter for both PIE
+## and non-PIE binaries, and for both executables and shared libraries, because
+## of the different ways of handling relocations (static or dynamic).
+## All tests perform the following steps:
+## - Compile and link for the case to be tested
+## - Some sanity-checks on the dynamic section and relocations in the binary to
+##   verify it has the shape we want for testing:
+##   - INTERP in Program Headers
+##   - DT_INIT or DT_INIT_ARRAY in dynamic section
+##   - No relative relocations for non-PIE
+## - Instrument (with extra --runtime-lib-init-hook=init/init_array options
+##   in some cases)
+## - Verify generated binary
+# REQUIRES: system-linux,bolt-runtime,target=x86_64-{{.*}}
+
+# RUN: %clang %cflags -pie %s -Wl,-q -o %t.exe
+# RUN: llvm-readelf -d %t.exe | FileCheck --check-prefix=DYN-INIT %s
+# RUN: llvm-readelf -l %t.exe | FileCheck --check-prefix=PH-INTERP %s
+# RUN: llvm-readelf -r %t.exe | FileCheck --check-prefix=RELOC-PIE %s
+# RUN: llvm-bolt %t.exe -o %t --instrument | FileCheck --check-prefix=CHECK-BOLT-RT-EP %s
+# RUN: llvm-readelf -hdrs %t | FileCheck --check-prefix=CHECK-INIT-EP %s
+# RUN: llvm-bolt %t.exe -o %t-no-ep --instrument --runtime-lib-init-hook=init | FileCheck --check-prefix=CHECK-BOLT-RT-INIT %s
+# RUN: llvm-readelf -hdrs %t-no-ep | FileCheck --check-prefix=CHECK-INIT-NO-EP %s
+# RUN: llvm-bolt %t.exe -o %t-no-ep --instrument --runtime-lib-init-hook=init_array | FileCheck --check-prefix=CHECK-BOLT-RT-INIT-ARRAY %s
+# RUN: llvm-readelf -hdrs %t-no-ep | FileCheck --check-prefix=CHECK-INIT-ARRAY-NO-EP %s
+
+# RUN: %clang -shared %cflags -pie %s -Wl,-q -o %t-shared.exe
+# RUN: llvm-readelf -d %t-shared.exe | FileCheck --check-prefix=DYN-INIT %s
+# RUN: llvm-readelf -l %t-shared.exe | FileCheck --check-prefix=PH-INTERP-SHARED %s
+# RUN: llvm-readelf -r %t-shared.exe | FileCheck --check-prefix=RELOC-SHARED-PIE %s
+# RUN: llvm-bolt %t-shared.exe -o %t-shared --instrument | FileCheck --check-prefix=CHECK-BOLT-RT-INIT %s
+# RUN: llvm-readelf -hdrs %t-shared | FileCheck --check-prefix=CHECK-SHARED-INIT %s
+
+# RUN: %clang %cflags -pie %s -Wl,-q,-init=0 -o %t-no-init.exe
+# RUN: llvm-readelf -d %t-no-init.exe | FileCheck --check-prefix=DYN-NO-INIT %s
+# RUN: llvm-readelf -l %t-no-init.exe | FileCheck --check-prefix=PH-INTERP %s
+# RUN: llvm-readelf -r %t-no-init.exe | FileCheck --check-prefix=RELOC-PIE %s
+# RUN: llvm-bolt %t-no-init.exe -o %t-no-init --instrument | FileCheck --check-prefix=CHECK-BOLT-RT-EP %s
+# RUN: llvm-readelf -hdrs %t-no-init | FileCheck --check-prefix=CHECK-NO-INIT-EP %s
+# RUN: llvm-bolt %t-no-init.exe -o %t-no-init-no-ep --instrument --runtime-lib-init-hook=init | FileCheck --check-prefix=CHECK-BOLT-RT-INIT-ARRAY %s
+# RUN: llvm-readelf -hdrs %t-no-init-no-ep | FileCheck --check-prefix=CHECK-NO-INIT-NO-EP %s
+
+# RUN: %clang -shared %cflags -pie %s -Wl,-q,-init=0 -o %t-shared-no-init.exe
+# RUN: llvm-readelf -d %t-shared-no-init.exe | FileCheck --check-prefix=DYN-NO-INIT %s
+# RUN: llvm-readelf -l %t-shared-no-init.exe | FileCheck --check-prefix=PH-INTERP-SHARED %s
+# RUN: llvm-readelf -r %t-shared-no-init.exe | FileCheck --check-prefix=RELOC-SHARED-PIE %s
+# RUN: llvm-bolt %t-shared-no-init.exe -o %t-shared-no-init --instrument | FileCheck --check-prefix=CHECK-BOLT-RT-INIT-ARRAY %s
+# RUN: llvm-readelf -drs %t-shared-no-init | FileCheck --check-prefix=CHECK-SHARED-NO-INIT %s
+
+## Create a dummy shared library to link against to force creation of the dynamic section.
+# RUN: %clang %cflags %p/../Inputs/stub.c -fPIC -shared -o %t-stub.so
+# RUN: %clang %cflags %s -no-pie -Wl,-q,-init=0 %t-stub.so -o %t-no-pie-no-init.exe
+# RUN: llvm-readelf -r %t-no-pie-no-init.exe | FileCheck --check-prefix=RELOC-NO-PIE %s
+# RUN: llvm-bolt %t-no-pie-no-init.exe -o %t-no-pie-no-init --instrument | FileCheck --check-prefix=CHECK-BOLT-RT-EP %s
+# RUN: llvm-readelf -hds %t-no-pie-no-init | FileCheck --check-prefix=CHECK-NO-PIE-NO-INIT-EP %s
+
+## With init: dynamic section should contain DT_INIT
+# DYN-INIT: (INIT)
+
+## Without init: dynamic section should only contain DT_INIT_ARRAY
+# DYN-NO-INIT-NOT: (INIT)
+# DYN-NO-INIT: (INIT_ARRAY)
+# DYN-NO-INIT: (INIT_ARRAYSZ)
+
+## With interp program header (executable)
+# PH-INTERP: Program Headers:
+# PH-INTERP: INTERP
+
+## Without interp program header (shared library)
+# PH-INTERP-SHARED: Program Headers:
+# PH-INTERP-SHARED-NOT: INTERP
+
+## With PIE: binary should have relative relocations
+# RELOC-PIE: R_X86_64_RELATIVE
+
+## Shared library: binary should have absolute relocations
+# RELOC-SHARED-PIE: R_X86_64_64
+
+## Without PIE: binary should not have relative relocations
+# RELOC-NO-PIE-NOT: R_X86_64_RELATIVE
+
+## Check BOLT output for the initialization hook (ELF Header Entry Point)
+# CHECK-BOLT-RT-EP: runtime library initialization was hooked via ELF Header Entry Point
+# CHECK-BOLT-RT-EP-NOT: runtime library initialization was hooked via DT_INIT
+# CHECK-BOLT-RT-EP-NOT: runtime library initialization was hooked via .init_array entry
+
+## Check BOLT output for the initialization hook (DT_INIT)
+# CHECK-BOLT-RT-INIT-NOT: runtime library initialization was hooked via ELF Header Entry Point
+# CHECK-BOLT-RT-INIT: runtime library initialization was hooked via DT_INIT
+# CHECK-BOLT-RT-INIT-NOT: runtime library initialization was hooked via .init_array entry
+
+## Check BOLT output for the initialization hook (1st entry of .init_array)
+# CHECK-BOLT-RT-INIT-ARRAY-NOT: runtime library initialization was hooked via ELF Header Entry Point
+# CHECK-BOLT-RT-INIT-ARRAY-NOT: runtime library initialization was hooked via DT_INIT
+# CHECK-BOLT-RT-INIT-ARRAY: runtime library initialization was hooked via .init_array entry
+
+## Check that entry point address is set to __bolt_runtime_start for PIE executable with DT_INIT
+# CHECK-INIT-EP: ELF Header:
+# CHECK-INIT-EP: Entry point address: 0x[[#%x,EP_ADDR:]]
+## Check that the dynamic relocation at .init and .init_array were not patched
+# CHECK-INIT-EP: Dynamic section at offset {{.*}} contains {{.*}} entries:
+# CHECK-INIT-EP-NOT: (INIT) 0x[[#%x, EP_ADDR]]
+# CHECK-INIT-EP-NOT: (INIT_ARRAY) 0x[[#%x, EP_ADDR]]
+## Check that the new entry point address points to __bolt_runtime_start
+# CHECK-INIT-EP: Symbol table '.symtab' contains {{.*}} entries:
+# CHECK-INIT-EP: {{0+}}[[#%x, EP_ADDR]] {{.*}} __bolt_runtime_start
+
+## Check that DT_INIT address is set to __bolt_runtime_start for PIE executable with DT_INIT
+# CHECK-INIT-NO-EP: ELF Header:
+# CHECK-INIT-NO-EP: Entry point address: 0x[[#%x,EP_ADDR:]]
+## Read Dynamic section DT_INIT and DT_INIT_ARRAY entries
+# CHECK-INIT-NO-EP: Dynamic section at offset {{.*}} contains {{.*}} entries:
+# CHECK-INIT-NO-EP-DAG: (INIT) 0x[[#%x,INIT:]]
+# CHECK-INIT-NO-EP-DAG: (INIT_ARRAY) 0x[[#%x,INIT_ARRAY:]]
+## Check if ELF entry point address points to _start symbol and new DT_INIT entry points to __bolt_runtime_start
+# CHECK-INIT-NO-EP: Symbol table '.symtab' contains {{.*}} entries:
+# CHECK-INIT-NO-EP-DAG: {{0+}}[[#%x, EP_ADDR]] {{.*}} _start
+# CHECK-INIT-NO-EP-DAG: {{0+}}[[#%x, INIT]] {{.*}} __bolt_runtime_start
+
+## Check that 1st entry of DT_INIT_ARRAY is set to __bolt_runtime_start and DT_INIT was not changed
+# CHECK-INIT-ARRAY-NO-EP: ELF Header:
+# CHECK-INIT-ARRAY-NO-EP: Entry point address: 0x[[#%x,EP_ADDR:]]
+## Read Dynamic section DT_INIT and DT_INIT_ARRAY entries
+# CHECK-INIT-ARRAY-NO-EP: Dynamic section at offset {{.*}} contains {{.*}} entries:
+# CHECK-INIT-ARRAY-NO-EP-DAG: (INIT) 0x[[#%x,INIT:]]
+# CHECK-INIT-ARRAY-NO-EP-DAG: (INIT_ARRAY) 0x[[#%x,INIT_ARRAY:]]
+## Read the dynamic relocation from 1st entry of .init_array
+# CHECK-INIT-ARRAY-NO-EP: Relocation section '.rela.dyn' at offset {{.*}} contains {{.*}} entries
+# CHECK-INIT-ARRAY-NO-EP: {{0+}}[[#%x,INIT_ARRAY]] {{.*}} R_X86_64_RELATIVE [[#%x,INIT_ADDR:]]
+# CHECK-INIT-ARRAY-NO-EP-NOT: {{0+}}[[#%x,INIT_ARRAY]] {{.*}} R_X86_64_RELATIVE [[#%x,INIT]]
+## Check that 1st entry of .init_array points to __bolt_runtime_start
+# CHECK-INIT-ARRAY-NO-EP: Symbol table '.symtab' contains {{.*}} entries:
+# CHECK-INIT-ARRAY-NO-EP-DAG: {{0+}}[[#%x, EP_ADDR]] {{.*}} _start
+# CHECK-INIT-ARRAY-NO-EP-DAG: {{[0-9]*}}: {{0+}}[[#%x, INIT_ADDR]] {{.*}} __bolt_runtime_start
+
+## Check that entry point address is set to __bolt_runtime_start for PIE executable without DT_INIT
+# CHECK-NO-INIT-EP: ELF Header:
+# CHECK-NO-INIT-EP: Entry point address: 0x[[#%x,EP_ADDR:]]
+## Check that the dynamic relocation at .init and .init_array were not patched
+# CHECK-NO-INIT-EP: Dynamic section at offset {{.*}} contains {{.*}} entries:
+# CHECK-NO-INIT-EP-NOT: (INIT) 0x[[#%x, EP_ADDR]]
+# CHECK-NO-INIT-EP-NOT: (INIT_ARRAY) 0x[[#%x, EP_ADDR]]
+## Check that the new entry point address points to __bolt_runtime_start
+# CHECK-NO-INIT-EP: Symbol table '.symtab' contains {{.*}} entries:
+# CHECK-NO-INIT-EP: {{0+}}[[#%x, EP_ADDR]] {{.*}} __bolt_runtime_start
+
+## Check that DT_INIT is set to __bolt_runtime_start for shared library with DT_INIT
+# CHECK-SHARED-INIT: Dynamic section at offset {{.*}} contains {{.*}} entries:
+# CHECK-SHARED-INIT-DAG: (INIT) 0x[[#%x, INIT:]]
+# CHECK-SHARED-INIT-DAG: (INIT_ARRAY) 0x[[#%x, INIT_ARRAY:]]
+## Check that the dynamic relocation at .init_array was not patched
+# CHECK-SHARED-INIT: Relocation section '.rela.dyn' at offset {{.*}} contains {{.*}} entries
+# CHECK-SHARED-INIT-NOT: {{0+}}[[#%x, INIT_ARRAY]] {{.*}} R_X86_64_64 {{0+}}[[#%x, INIT]]
+## Check that dynamic section DT_INIT points to __bolt_runtime_start
+# CHECK-SHARED-INIT: Symbol table '.symtab' contains {{.*}} entries:
+# CHECK-SHARED-INIT: {{0+}}[[#%x, INIT]] {{.*}} __bolt_runtime_start
+
+## Check that entry point address is set to __bolt_runtime_start for PIE executable without DT_INIT
+# CHECK-NO-INIT-NO-EP: ELF Header:
+# CHECK-NO-INIT-NO-EP: Entry point address: 0x[[#%x,EP_ADDR:]]
+# CHECK-NO-INIT-NO-EP: Dynamic section at offset {{.*}} contains {{.*}} entries:
+# CHECK-NO-INIT-NO-EP-NOT: (INIT)
+# CHECK-NO-INIT-NO-EP: (INIT_ARRAY) 0x[[#%x,INIT_ARRAY:]]
+## Read the dynamic relocation from 1st entry of .init_array
+# CHECK-NO-INIT-NO-EP: Relocation section '.rela.dyn' at offset {{.*}} contains {{.*}} entries
+# CHECK-NO-INIT-NO-EP: {{0+}}[[#%x,INIT_ARRAY]] {{.*}} R_X86_64_RELATIVE [[#%x,INIT_ADDR:]]
+## Check that 1st entry of .init_array points to __bolt_runtime_start
+# CHECK-NO-INIT-NO-EP: Symbol table '.symtab' contains {{.*}} entries:
+# CHECK-NO-INIT-NO-EP-DAG: {{0+}}[[#%x, EP_ADDR]] {{.*}} _start
+# CHECK-NO-INIT-NO-EP-DAG: {{[0-9]*}}: {{0+}}[[#%x, INIT_ADDR]] {{.*}} __bolt_runtime_start
+
+## Check that entry point address is set to __bolt_runtime_start for shared library without DT_INIT
+# CHECK-SHARED-NO-INIT: Dynamic section at offset {{.*}} contains {{.*}} entries:
+# CHECK-SHARED-NO-INIT-NOT: (INIT)
+# CHECK-SHARED-NO-INIT: (INIT_ARRAY) 0x[[#%x,INIT_ARRAY:]]
+## Read the dynamic relocation from 1st entry of .init_array
+# CHECK-SHARED-NO-INIT: Relocation section '.rela.dyn' at offset {{.*}} contains {{.*}} entries
+# CHECK-SHARED-NO-INIT: {{0+}}[[#%x, INIT_ARRAY]] {{.*}} R_X86_64_64 [[#%x,INIT_ADDR:]]
+## Check that 1st entry of .init_array points to __bolt_runtime_start
+# CHECK-SHARED-NO-INIT: Symbol table '.symtab' contains {{.*}} entries:
+# CHECK-SHARED-NO-INIT: {{[0-9]*}}: {{0+}}[[#%x, INIT_ADDR]] {{.*}} __bolt_runtime_start
+
+## Check that entry point address is set to __bolt_runtime_start for non-PIE executable with DT_INIT
+# CHECK-NO-PIE-NO-INIT-EP: ELF Header:
+# CHECK-NO-PIE-NO-INIT-EP: Entry point address: 0x[[#%x,EP_ADDR:]]
+## Check that the dynamic relocation at .init and .init_array were not patched
+# CHECK-NO-PIE-NO-INIT-EP: Dynamic section at offset {{.*}} contains {{.*}} entries:
+# CHECK-NO-PIE-NO-INIT-EP-NOT: (INIT) 0x[[#%x, EP_ADDR]]
+# CHECK-NO-PIE-NO-INIT-EP-NOT: (INIT_ARRAY) 0x[[#%x, EP_ADDR]]
+## Check that the new entry point address points to __bolt_runtime_start
+# CHECK-NO-PIE-NO-INIT-EP: Symbol table '.symtab' contains {{.*}} entries:
+# CHECK-NO-PIE-NO-INIT-EP: {{0+}}[[#%x, EP_ADDR]] {{.*}} __bolt_runtime_start
+
+  .globl _start
+  .type _start, %function
+_start:
+  # Dummy relocation to force relocation mode.
+  .reloc 0, R_X86_64_NONE
+  retq
+.size _start, .-_start
+
+  .globl _init
+  .type _init, %function
+_init:
+  retq
+  .size _init, .-_init
+
+  .globl _fini
+  .type _fini, %function
+_fini:
+  retq
+  .size _fini, .-_fini
+
+  .section .init_array,"aw"
+  .align 8
+  .quad _init
+
+  .section .fini_array,"aw"
+  .align 8
+  .quad _fini
diff --git a/bolt/test/X86/internal-call-instrument-so.s b/bolt/test/X86/internal-call-instrument-so.s
index 99e5b29221409..fe23bc61afa32 100644
--- a/bolt/test/X86/internal-call-instrument-so.s
+++ b/bolt/test/X86/internal-call-instrument-so.s
@@ -5,7 +5,7 @@
 # RUN: llvm-mc -filetype=obj -triple x86_64-unknown-unknown %s -o %t.o
 # Delete our BB symbols so BOLT doesn't mark them as entry points
 # RUN: llvm-strip --strip-unneeded %t.o
-# RUN: ld.lld %t.o -o %t.exe -q -shared -fini=_fini
+# RUN: ld.lld %t.o -o %t.exe -q -shared -fini=_fini -init=_init
 # RUN: llvm-bolt --instrument %t.exe --relocs -o %t.out
 
   .text
@@ -48,6 +48,13 @@
 _fini:
   hlt
   .size _fini, .-_fini
+
+  .globl _init
+  .type _init, %function
+  .p2align 4
+_init:
+  retq
+  .size _init, .-_init
 
   .data
   .globl var
 var:
diff --git a/bolt/test/runtime/X86/instrument-wrong-target.s b/bolt/test/runtime/X86/instrument-wrong-target.s
index 343d93a89ed13..fa40d43f10a0f 100644
--- a/bolt/test/runtime/X86/instrument-wrong-target.s
+++ b/bolt/test/runtime/X86/instrument-wrong-target.s
@@ -19,6 +19,13 @@
 _start:
   ret
   .size _start, .-_start
+
+  .globl _init
+  .type _init, %function
+  # Force DT_INIT to be created (needed for instrumentation).
+_init:
+  ret
+  .size _init, .-_init
 
   .globl _fini
   .type _fini, %function
   # Force DT_FINI to be created (needed for instrumentation).
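The unit tests added below exercise the in-block inference rule described in PacRetDesign.md: an instruction with no RAState annotation inherits the state in effect after its predecessor, switching to signed after an LR-signing instruction (e.g. paciasp) and to unsigned after an LR-authenticating one (e.g. autiasp). A minimal standalone model of that rule, using a hypothetical Inst record rather than the BOLT MCInst/MCPlusBuilder API:

#include <optional>
#include <vector>

struct Inst {
  bool SignsLR = false;        // e.g. paciasp
  bool AuthsLR = false;        // e.g. autiasp
  std::optional<bool> RAState; // nullopt == newly inserted, unknown
};

// Forward propagation over one basic block, mirroring what
// fillUnknownStateInBB does: an unknown instruction copies the state
// that holds after the previous instruction executed.
void fillUnknown(std::vector<Inst> &Block, bool EntryState) {
  bool Prev = EntryState;
  for (Inst &I : Block) {
    if (!I.RAState)
      I.RAState = Prev;
    // The state seen by the *next* instruction flips across sign/auth.
    Prev = I.SignsLR ? true : I.AuthsLR ? false : *I.RAState;
  }
}

In this model a block consisting entirely of unknown instructions simply inherits EntryState throughout, which corresponds to the fillUnknownStubs case where the previous block's trailing state (or the function's initial RAState) is used.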
diff --git a/bolt/unittests/CMakeLists.txt b/bolt/unittests/CMakeLists.txt
index 64414b83d39fe..d47ddc46b7388 100644
--- a/bolt/unittests/CMakeLists.txt
+++ b/bolt/unittests/CMakeLists.txt
@@ -7,3 +7,4 @@ endfunction()
 
 add_subdirectory(Core)
 add_subdirectory(Profile)
+add_subdirectory(Passes)
diff --git a/bolt/unittests/Passes/CMakeLists.txt b/bolt/unittests/Passes/CMakeLists.txt
new file mode 100644
index 0000000000000..3dc578adeb357
--- /dev/null
+++ b/bolt/unittests/Passes/CMakeLists.txt
@@ -0,0 +1,30 @@
+set(LLVM_LINK_COMPONENTS
+  DebugInfoDWARF
+  Object
+  MC
+  ${BOLT_TARGETS_TO_BUILD}
+  )
+
+add_bolt_unittest(PassTests
+  InsertNegateRAState.cpp
+
+  DISABLE_LLVM_LINK_LLVM_DYLIB
+  )
+
+target_link_libraries(PassTests
+  PRIVATE
+  LLVMBOLTCore
+  LLVMBOLTRewrite
+  LLVMBOLTPasses
+  LLVMBOLTProfile
+  LLVMBOLTUtils
+  )
+
+foreach (tgt ${BOLT_TARGETS_TO_BUILD})
+  include_directories(
+    ${LLVM_MAIN_SRC_DIR}/lib/Target/${tgt}
+    ${LLVM_BINARY_DIR}/lib/Target/${tgt}
+  )
+  string(TOUPPER "${tgt}" upper)
+  target_compile_definitions(PassTests PRIVATE "${upper}_AVAILABLE")
+endforeach()
diff --git a/bolt/unittests/Passes/InsertNegateRAState.cpp b/bolt/unittests/Passes/InsertNegateRAState.cpp
new file mode 100644
index 0000000000000..2ef78d381e570
--- /dev/null
+++ b/bolt/unittests/Passes/InsertNegateRAState.cpp
@@ -0,0 +1,333 @@
+//===- bolt/unittest/Passes/InsertNegateRAState.cpp -----------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#ifdef AARCH64_AVAILABLE
+#include "AArch64Subtarget.h"
+#include "MCTargetDesc/AArch64MCTargetDesc.h"
+#endif // AARCH64_AVAILABLE
+
+#include "bolt/Core/BinaryBasicBlock.h"
+#include "bolt/Core/BinaryFunction.h"
+#include "bolt/Passes/InsertNegateRAStatePass.h"
+#include "bolt/Rewrite/BinaryPassManager.h"
+#include "bolt/Rewrite/RewriteInstance.h"
+#include "llvm/BinaryFormat/ELF.h"
+#include "llvm/MC/MCDwarf.h"
+#include "llvm/MC/MCInstBuilder.h"
+#include "llvm/Support/TargetSelect.h"
+#include "gtest/gtest.h"
+
+using namespace llvm;
+using namespace llvm::object;
+using namespace llvm::ELF;
+using namespace bolt;
+
+namespace {
+struct PassTester : public testing::TestWithParam<Triple::ArchType> {
+  void SetUp() override {
+    initializeLLVM();
+    prepareElf();
+    initializeBolt();
+  }
+
+protected:
+  void initializeLLVM() {
+#define BOLT_TARGET(target)                                                    \
+  LLVMInitialize##target##TargetInfo();                                        \
+  LLVMInitialize##target##TargetMC();                                          \
+  LLVMInitialize##target##AsmParser();                                         \
+  LLVMInitialize##target##Disassembler();                                      \
+  LLVMInitialize##target##Target();                                            \
+  LLVMInitialize##target##AsmPrinter();
+
+#include "bolt/Core/TargetConfig.def"
+  }
+
+#define PREPARE_FUNC(name)                                                     \
+  constexpr uint64_t FunctionAddress = 0x1000;                                 \
+  BinaryFunction *BF = BC->createBinaryFunction(                               \
+      name, *TextSection, FunctionAddress, /*Size=*/0, /*SymbolSize=*/0,       \
+      /*Alignment=*/16);                                                       \
+  /* Make sure the pass runs on the BF.*/                                      \
+  BF->updateState(BinaryFunction::State::CFG);                                 \
+  BF->setContainedNegateRAState();                                             \
+  /* All tests need at least one BB. 
*/                                                                             \
+  BinaryBasicBlock *BB = BF->addBasicBlock();                                  \
+  BF->addEntryPoint(*BB);                                                      \
+  BB->setCFIState(0);
+
+  void prepareElf() {
+    memcpy(ElfBuf, "\177ELF", 4);
+    ELF64LE::Ehdr *EHdr = reinterpret_cast<typename ELF64LE::Ehdr *>(ElfBuf);
+    EHdr->e_ident[llvm::ELF::EI_CLASS] = llvm::ELF::ELFCLASS64;
+    EHdr->e_ident[llvm::ELF::EI_DATA] = llvm::ELF::ELFDATA2LSB;
+    EHdr->e_machine = GetParam() == Triple::aarch64 ? EM_AARCH64 : EM_X86_64;
+    MemoryBufferRef Source(StringRef(ElfBuf, sizeof(ElfBuf)), "ELF");
+    ObjFile = cantFail(ObjectFile::createObjectFile(Source));
+  }
+  void initializeBolt() {
+    Relocation::Arch = ObjFile->makeTriple().getArch();
+    BC = cantFail(BinaryContext::createBinaryContext(
+        ObjFile->makeTriple(), std::make_shared<orc::SymbolStringPool>(),
+        ObjFile->getFileName(), nullptr, true, DWARFContext::create(*ObjFile),
+        {llvm::outs(), llvm::errs()}));
+    ASSERT_FALSE(!BC);
+    BC->initializeTarget(std::unique_ptr<MCPlusBuilder>(
+        createMCPlusBuilder(GetParam(), BC->MIA.get(), BC->MII.get(),
+                            BC->MRI.get(), BC->STI.get())));
+
+    PassManager = std::make_unique<BinaryPassManager>(*BC);
+    PassManager->registerPass(std::make_unique<InsertNegateRAState>());
+
+    TextSection = &BC->registerOrUpdateSection(
+        ".text", ELF::SHT_PROGBITS, ELF::SHF_ALLOC | ELF::SHF_EXECINSTR,
+        /*Data=*/nullptr, /*Size=*/0,
+        /*Alignment=*/16);
+  }
+
+  std::vector<int> findCFIOffsets(BinaryFunction &BF) {
+    std::vector<int> Locations;
+    int Idx = 0;
+    int InstSize = 4; // AArch64
+    for (BinaryBasicBlock &BB : BF) {
+      for (MCInst &Inst : BB) {
+        if (BC->MIB->isCFI(Inst)) {
+          const MCCFIInstruction *CFI = BF.getCFIFor(Inst);
+          if (CFI->getOperation() == MCCFIInstruction::OpNegateRAState)
+            Locations.push_back(Idx * InstSize);
+        }
+        Idx++;
+      }
+    }
+    return Locations;
+  }
+
+  char ElfBuf[sizeof(typename ELF64LE::Ehdr)] = {};
+  std::unique_ptr<ObjectFile> ObjFile;
+  std::unique_ptr<BinaryContext> BC;
+  std::unique_ptr<BinaryPassManager> PassManager;
+  BinarySection *TextSection;
+};
+} // namespace
+
+TEST_P(PassTester, ExampleTest) {
+  if (GetParam() != Triple::aarch64)
+    GTEST_SKIP();
+
+  ASSERT_NE(TextSection, nullptr);
+
+  PREPARE_FUNC("ExampleFunction");
+
+  MCInst UnsignedInst = MCInstBuilder(AArch64::ADDSXri)
+                            .addReg(AArch64::X0)
+                            .addReg(AArch64::X0)
+                            .addImm(0)
+                            .addImm(0);
+  BC->MIB->setRAState(UnsignedInst, false);
+  BB->addInstruction(UnsignedInst);
+
+  MCInst SignedInst = MCInstBuilder(AArch64::ADDSXri)
+                          .addReg(AArch64::X0)
+                          .addReg(AArch64::X0)
+                          .addImm(1)
+                          .addImm(0);
+  BC->MIB->setRAState(SignedInst, true);
+  BB->addInstruction(SignedInst);
+
+  Error E = PassManager->runPasses();
+  EXPECT_FALSE(E);
+
+  /* Expected layout of BF after the pass:
+
+  .LBB0 (3 instructions, align : 1)
+    Entry Point
+    CFI State : 0
+      00000000:   adds    x0, x0, #0x0
+      00000004:   !CFI    $0      ; OpNegateRAState
+      00000004:   adds    x0, x0, #0x1
+    CFI State: 0
+  */
+  auto CFILoc = findCFIOffsets(*BF);
+  EXPECT_EQ(CFILoc.size(), 1u);
+  EXPECT_EQ(CFILoc[0], 4);
+}
+
+TEST_P(PassTester, fillUnknownStateInBBTest) {
+  /* Check that if a BB starts with unknown RAState, we can fill the unknown
+     states based on following instructions with known RAStates.
+   *
+   * .LBB0 (1 instructions, align : 1)
+   *   Entry Point
+   *   CFI State : 0
+   *     00000000:   adds    x0, x0, #0x0
+   *   CFI State: 0
+   *
+   * .LBB1 (4 instructions, align : 1)
+   *   CFI State : 0
+   *     00000004:   !CFI    $0      ; OpNegateRAState
+   *     00000004:   adds    x0, x0, #0x1
+   *     00000008:   adds    x0, x0, #0x2
+   *     0000000c:   adds    x0, x0, #0x3
+   *   CFI State: 0
+   */
+  if (GetParam() != Triple::aarch64)
+    GTEST_SKIP();
+
+  ASSERT_NE(TextSection, nullptr);
+
+  PREPARE_FUNC("FuncWithUnknownStateInBB");
+  BinaryBasicBlock *BB2 = BF->addBasicBlock();
+  BB2->setCFIState(0);
+
+  MCInst Unsigned = MCInstBuilder(AArch64::ADDSXri)
+                        .addReg(AArch64::X0)
+                        .addReg(AArch64::X0)
+                        .addImm(0)
+                        .addImm(0);
+  BC->MIB->setRAState(Unsigned, false);
+  BB->addInstruction(Unsigned);
+
+  MCInst Unknown = MCInstBuilder(AArch64::ADDSXri)
+                       .addReg(AArch64::X0)
+                       .addReg(AArch64::X0)
+                       .addImm(1)
+                       .addImm(0);
+  MCInst Unknown1 = MCInstBuilder(AArch64::ADDSXri)
+                        .addReg(AArch64::X0)
+                        .addReg(AArch64::X0)
+                        .addImm(2)
+                        .addImm(0);
+  MCInst Signed = MCInstBuilder(AArch64::ADDSXri)
+                      .addReg(AArch64::X0)
+                      .addReg(AArch64::X0)
+                      .addImm(3)
+                      .addImm(0);
+  BC->MIB->setRAState(Signed, true);
+  BB2->addInstruction(Unknown);
+  BB2->addInstruction(Unknown1);
+  BB2->addInstruction(Signed);
+
+  Error E = PassManager->runPasses();
+  EXPECT_FALSE(E);
+
+  auto CFILoc = findCFIOffsets(*BF);
+  EXPECT_EQ(CFILoc.size(), 1u);
+  EXPECT_EQ(CFILoc[0], 4);
+  // Check that the pass set Unknown and Unknown1 to signed.
+  // begin() is the CFI, begin() + 1 is Unknown, begin() + 2 is Unknown1.
+  std::optional<bool> RAState = BC->MIB->getRAState(*(BB2->begin() + 1));
+  EXPECT_TRUE(RAState.has_value());
+  EXPECT_TRUE(*RAState);
+  std::optional<bool> RAState1 = BC->MIB->getRAState(*(BB2->begin() + 2));
+  EXPECT_TRUE(RAState1.has_value());
+  EXPECT_TRUE(*RAState1);
+}
+
+TEST_P(PassTester, fillUnknownStubs) {
+  /*
+   * Stubs that are not part of the function's CFG should inherit the RAState
+   * of the preceding BasicBlock.
+   *
+   * LBB1 is not part of the CFG: LBB0 jumps unconditionally to LBB2.
+   * LBB1 would be a stub inserted in LongJmp in real code.
+   * We do not add any NegateRAState CFIs, as other CFIs are not added either.
+   * See issue #160989 for more details.
+   *
+   * .LBB0 (1 instructions, align : 1)
+   *   Entry Point
+   *     00000000:   b   .LBB2
+   *   Successors: .LBB2
+   *
+   * .LBB1 (1 instructions, align : 1)
+   *     00000004:   ret
+   *
+   * .LBB2 (1 instructions, align : 1)
+   *   Predecessors: .LBB0
+   *     00000008:   ret
+   */
+  if (GetParam() != Triple::aarch64)
+    GTEST_SKIP();
+
+  ASSERT_NE(TextSection, nullptr);
+
+  PREPARE_FUNC("FuncWithStub");
+  BinaryBasicBlock *BB2 = BF->addBasicBlock();
+  BB2->setCFIState(0);
+  BinaryBasicBlock *BB3 = BF->addBasicBlock();
+  BB3->setCFIState(0);
+
+  BB->addSuccessor(BB3);
+
+  // Jumping over BB2, to BB3. Set the RAState before adding the instruction,
+  // since addInstruction stores a copy.
+  MCInst Jump;
+  BC->MIB->createUncondBranch(Jump, BB3->getLabel(), BC->Ctx.get());
+  BC->MIB->setRAState(Jump, false);
+  BB->addInstruction(Jump);
+
+  // BB2, in real code it would be a ShortJmp.
+  // Unknown RAState.
+  MCInst StubInst;
+  BC->MIB->createReturn(StubInst);
+  BB2->addInstruction(StubInst);
+
+  // Can be any instruction.
+  MCInst Ret;
+  BC->MIB->createReturn(Ret);
+  BC->MIB->setRAState(Ret, false);
+  BB3->addInstruction(Ret);
+
+  Error E = PassManager->runPasses();
+  EXPECT_FALSE(E);
+
+  // Check that we did not generate any NegateRAState CFIs.
+  auto CFILoc = findCFIOffsets(*BF);
+  EXPECT_EQ(CFILoc.size(), 0u);
+}
+
+TEST_P(PassTester, fillUnknownStubsEmpty) {
+  /*
+   * This test checks that BOLT can set the RAState of unknown BBs,
+   * even if all previous BBs are empty and hence no PrevInst gets set.
+   *
+   * As this means that the current (empty) BB is the first with non-pseudo
+   * instructions, the function's initialRAState should be used.
+   */
+  if (GetParam() != Triple::aarch64)
+    GTEST_SKIP();
+
+  ASSERT_NE(TextSection, nullptr);
+
+  PREPARE_FUNC("FuncWithStub");
+  BF->setInitialRAState(false);
+  BinaryBasicBlock *BB2 = BF->addBasicBlock();
+  BB2->setCFIState(0);
+
+  // BB is empty.
+  BB->addSuccessor(BB2);
+
+  // BB2, in real code it would be a ShortJmp.
+  // Unknown RAState.
+  MCInst StubInst;
+  BC->MIB->createReturn(StubInst);
+  BB2->addInstruction(StubInst);
+
+  Error E = PassManager->runPasses();
+  EXPECT_FALSE(E);
+
+  // Check that BOLT added an RAState to BB2.
+  std::optional<bool> RAState = BC->MIB->getRAState(*(BB2->begin()));
+  EXPECT_TRUE(RAState.has_value());
+  // BB2 should be set to BF.initialRAState (false).
+  EXPECT_FALSE(*RAState);
+}
+
+#ifdef AARCH64_AVAILABLE
+INSTANTIATE_TEST_SUITE_P(AArch64, PassTester,
+                         ::testing::Values(Triple::aarch64));
+#endif
diff --git a/clang-tools-extra/docs/clang-tidy/checks/cppcoreguidelines/pro-bounds-avoid-unchecked-container-access.rst b/clang-tools-extra/docs/clang-tidy/checks/cppcoreguidelines/pro-bounds-avoid-unchecked-container-access.rst
index fe78ad8056443..38143c94cd3ae 100644
--- a/clang-tools-extra/docs/clang-tidy/checks/cppcoreguidelines/pro-bounds-avoid-unchecked-container-access.rst
+++ b/clang-tools-extra/docs/clang-tidy/checks/cppcoreguidelines/pro-bounds-avoid-unchecked-container-access.rst
@@ -29,9 +29,9 @@ STL containers for which ``operator[]`` is well-defined for all inputs
 are excluded from this check (e.g.: ``std::map::operator[]``).
 
 This check enforces part of the `SL.con.3
-`
+`_
 guideline and is part of the `Bounds Safety (Bounds 4)
-`
+`_
 profile from the C++ Core Guidelines.
 
 Options
diff --git a/clang/docs/OpenMPSupport.rst b/clang/docs/OpenMPSupport.rst
index e7ca7b0bd0792..ab3f2c48983ca 100644
--- a/clang/docs/OpenMPSupport.rst
+++ b/clang/docs/OpenMPSupport.rst
@@ -266,7 +266,7 @@ implementation.
+------------------------------+--------------------------------------------------------------+--------------------------+-----------------------------------------------------------------------+ | device | has_device_addr clause on target construct | :none:`unclaimed` | | +------------------------------+--------------------------------------------------------------+--------------------------+-----------------------------------------------------------------------+ -| device | iterators in map clause or motion clauses | :none:`unclaimed` | | +| device | iterators in map clause or motion clauses | :none:`done` | https://github.com/llvm/llvm-project/pull/159112 | +------------------------------+--------------------------------------------------------------+--------------------------+-----------------------------------------------------------------------+ | device | indirect clause on declare target directive | :part:`In Progress` | | +------------------------------+--------------------------------------------------------------+--------------------------+-----------------------------------------------------------------------+ diff --git a/clang/include/clang/AST/Decl.h b/clang/include/clang/AST/Decl.h index ee2321dd158d4..5394b2558b407 100644 --- a/clang/include/clang/AST/Decl.h +++ b/clang/include/clang/AST/Decl.h @@ -4524,6 +4524,11 @@ class RecordDecl : public TagDecl { return field_begin() == field_end(); } + /// Returns the number of fields (non-static data members) in this record. + unsigned getNumFields() const { + return std::distance(field_begin(), field_end()); + } + /// noload_fields - Iterate over the fields stored in this record /// that are currently loaded; don't attempt to retrieve anything /// from an external source. diff --git a/clang/include/clang/AST/OpenMPClause.h b/clang/include/clang/AST/OpenMPClause.h index 72c5efde7449b..d9c3cf239451e 100644 --- a/clang/include/clang/AST/OpenMPClause.h +++ b/clang/include/clang/AST/OpenMPClause.h @@ -7582,7 +7582,8 @@ class OMPToClause final : public OMPMappableExprListClause, /// Motion-modifiers for the 'to' clause. OpenMPMotionModifierKind MotionModifiers[NumberOfOMPMotionModifiers] = { - OMPC_MOTION_MODIFIER_unknown, OMPC_MOTION_MODIFIER_unknown}; + OMPC_MOTION_MODIFIER_unknown, OMPC_MOTION_MODIFIER_unknown, + OMPC_MOTION_MODIFIER_unknown}; /// Location of motion-modifiers for the 'to' clause. SourceLocation MotionModifiersLoc[NumberOfOMPMotionModifiers]; @@ -7654,6 +7655,9 @@ class OMPToClause final : public OMPMappableExprListClause, MotionModifiersLoc[I] = TLoc; } + void setIteratorModifier(Expr *IteratorModifier) { + getTrailingObjects()[2 * varlist_size()] = IteratorModifier; + } /// Set colon location. void setColonLoc(SourceLocation Loc) { ColonLoc = Loc; } @@ -7662,7 +7666,7 @@ class OMPToClause final : public OMPMappableExprListClause, size_t numTrailingObjects(OverloadToken) const { // There are varlist_size() of expressions, and varlist_size() of // user-defined mappers. - return 2 * varlist_size(); + return 2 * varlist_size() + 1; } size_t numTrailingObjects(OverloadToken) const { return getUniqueDeclarationsNum(); @@ -7688,15 +7692,14 @@ class OMPToClause final : public OMPMappableExprListClause, /// \param UDMQualifierLoc C++ nested name specifier for the associated /// user-defined mapper. /// \param MapperId The identifier of associated user-defined mapper. 
- static OMPToClause *Create(const ASTContext &C, const OMPVarListLocTy &Locs, - ArrayRef Vars, - ArrayRef Declarations, - MappableExprComponentListsRef ComponentLists, - ArrayRef UDMapperRefs, - ArrayRef MotionModifiers, - ArrayRef MotionModifiersLoc, - NestedNameSpecifierLoc UDMQualifierLoc, - DeclarationNameInfo MapperId); + static OMPToClause * + Create(const ASTContext &C, const OMPVarListLocTy &Locs, + ArrayRef Vars, ArrayRef Declarations, + MappableExprComponentListsRef ComponentLists, + ArrayRef UDMapperRefs, Expr *IteratorModifier, + ArrayRef MotionModifiers, + ArrayRef MotionModifiersLoc, + NestedNameSpecifierLoc UDMQualifierLoc, DeclarationNameInfo MapperId); /// Creates an empty clause with the place for \a NumVars variables. /// @@ -7717,7 +7720,9 @@ class OMPToClause final : public OMPMappableExprListClause, "Requested modifier exceeds the total number of modifiers."); return MotionModifiers[Cnt]; } - + Expr *getIteratorModifier() const { + return getTrailingObjects()[2 * varlist_size()]; + } /// Fetches the motion-modifier location at 'Cnt' index of array of modifiers' /// locations. /// @@ -7782,7 +7787,8 @@ class OMPFromClause final /// Motion-modifiers for the 'from' clause. OpenMPMotionModifierKind MotionModifiers[NumberOfOMPMotionModifiers] = { - OMPC_MOTION_MODIFIER_unknown, OMPC_MOTION_MODIFIER_unknown}; + OMPC_MOTION_MODIFIER_unknown, OMPC_MOTION_MODIFIER_unknown, + OMPC_MOTION_MODIFIER_unknown}; /// Location of motion-modifiers for the 'from' clause. SourceLocation MotionModifiersLoc[NumberOfOMPMotionModifiers]; @@ -7843,7 +7849,9 @@ class OMPFromClause final "Unexpected index to store motion modifier, exceeds array size."); MotionModifiers[I] = T; } - + void setIteratorModifier(Expr *IteratorModifier) { + getTrailingObjects()[2 * varlist_size()] = IteratorModifier; + } /// Set location for the motion-modifier. /// /// \param I index for motion-modifier location. @@ -7862,7 +7870,7 @@ class OMPFromClause final size_t numTrailingObjects(OverloadToken) const { // There are varlist_size() of expressions, and varlist_size() of // user-defined mappers. - return 2 * varlist_size(); + return 2 * varlist_size() + 1; } size_t numTrailingObjects(OverloadToken) const { return getUniqueDeclarationsNum(); @@ -7892,7 +7900,7 @@ class OMPFromClause final Create(const ASTContext &C, const OMPVarListLocTy &Locs, ArrayRef Vars, ArrayRef Declarations, MappableExprComponentListsRef ComponentLists, - ArrayRef UDMapperRefs, + ArrayRef UDMapperRefs, Expr *IteratorExpr, ArrayRef MotionModifiers, ArrayRef MotionModifiersLoc, NestedNameSpecifierLoc UDMQualifierLoc, DeclarationNameInfo MapperId); @@ -7916,7 +7924,9 @@ class OMPFromClause final "Requested modifier exceeds the total number of modifiers."); return MotionModifiers[Cnt]; } - + Expr *getIteratorModifier() const { + return getTrailingObjects()[2 * varlist_size()]; + } /// Fetches the motion-modifier location at 'Cnt' index of array of modifiers' /// locations. /// diff --git a/clang/include/clang/Basic/OpenMPKinds.def b/clang/include/clang/Basic/OpenMPKinds.def index b98b946cad75a..ceac89d3aba6d 100644 --- a/clang/include/clang/Basic/OpenMPKinds.def +++ b/clang/include/clang/Basic/OpenMPKinds.def @@ -207,6 +207,7 @@ OPENMP_MAP_MODIFIER_KIND(ompx_hold) // Modifiers for 'to' or 'from' clause. OPENMP_MOTION_MODIFIER_KIND(mapper) +OPENMP_MOTION_MODIFIER_KIND(iterator) OPENMP_MOTION_MODIFIER_KIND(present) // Static attributes for 'dist_schedule' clause. 
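
The `2 * varlist_size() + 1` storage change above is easy to misread, so here is a simplified stand-alone model of where the single iterator-modifier expression lives; this is not the real `OMPToClause`/`OMPFromClause` layout (which uses `llvm::TrailingObjects` on an in-place allocation), and `MotionClauseStorage` is an invented name:

```cpp
#include <cassert>
#include <cstddef>
#include <vector>

struct Expr; // stand-in for clang::Expr

// Slots [0, N) hold the variable references, [N, 2N) the user-defined
// mapper references, and slot 2N the single iterator-modifier expression.
class MotionClauseStorage {
  std::size_t NumVars;
  std::vector<Expr *> Trailing; // models getTrailingObjects<Expr *>()

public:
  explicit MotionClauseStorage(std::size_t N)
      : NumVars(N), Trailing(2 * N + 1, nullptr) {}

  void setVarRef(std::size_t I, Expr *E) {
    assert(I < NumVars && "variable slot out of range");
    Trailing[I] = E;
  }
  void setUDMapperRef(std::size_t I, Expr *E) {
    assert(I < NumVars && "mapper slot out of range");
    Trailing[NumVars + I] = E;
  }
  void setIteratorModifier(Expr *E) { Trailing[2 * NumVars] = E; }
  Expr *getIteratorModifier() const { return Trailing[2 * NumVars]; }
};
```

This also shows why the `CreateEmpty` paths above explicitly call `setIteratorModifier(nullptr)`: the real trailing buffer is allocated uninitialized, and deserialization only writes the extra slot when the `iterator` modifier is present.
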
diff --git a/clang/include/clang/Sema/SemaOpenMP.h b/clang/include/clang/Sema/SemaOpenMP.h index 686e51ee92a08..2d05b4423140b 100644 --- a/clang/include/clang/Sema/SemaOpenMP.h +++ b/clang/include/clang/Sema/SemaOpenMP.h @@ -1351,7 +1351,7 @@ class SemaOpenMP : public SemaBase { OMPClause * ActOnOpenMPToClause(ArrayRef MotionModifiers, ArrayRef MotionModifiersLoc, - CXXScopeSpec &MapperIdScopeSpec, + Expr *IteratorModifier, CXXScopeSpec &MapperIdScopeSpec, DeclarationNameInfo &MapperId, SourceLocation ColonLoc, ArrayRef VarList, const OMPVarListLocTy &Locs, ArrayRef UnresolvedMappers = {}); @@ -1359,7 +1359,7 @@ class SemaOpenMP : public SemaBase { OMPClause * ActOnOpenMPFromClause(ArrayRef MotionModifiers, ArrayRef MotionModifiersLoc, - CXXScopeSpec &MapperIdScopeSpec, + Expr *IteratorModifier, CXXScopeSpec &MapperIdScopeSpec, DeclarationNameInfo &MapperId, SourceLocation ColonLoc, ArrayRef VarList, const OMPVarListLocTy &Locs, ArrayRef UnresolvedMappers = {}); diff --git a/clang/lib/AST/ByteCode/Compiler.cpp b/clang/lib/AST/ByteCode/Compiler.cpp index dd0b8e790d444..58e84ef70abb7 100644 --- a/clang/lib/AST/ByteCode/Compiler.cpp +++ b/clang/lib/AST/ByteCode/Compiler.cpp @@ -1705,6 +1705,9 @@ bool Compiler::VisitFixedPointUnaryOperator(const UnaryOperator *E) { template bool Compiler::VisitImplicitValueInitExpr( const ImplicitValueInitExpr *E) { + if (DiscardResult) + return true; + QualType QT = E->getType(); if (OptPrimType T = classify(QT)) diff --git a/clang/lib/AST/ByteCode/InterpBuiltin.cpp b/clang/lib/AST/ByteCode/InterpBuiltin.cpp index 8496b58105c7a..971fce541bb88 100644 --- a/clang/lib/AST/ByteCode/InterpBuiltin.cpp +++ b/clang/lib/AST/ByteCode/InterpBuiltin.cpp @@ -1921,6 +1921,10 @@ static bool interp__builtin_memcmp(InterpState &S, CodePtr OpPC, if (PtrA.isDummy() || PtrB.isDummy()) return false; + if (!CheckRange(S, OpPC, PtrA, AK_Read) || + !CheckRange(S, OpPC, PtrB, AK_Read)) + return false; + // Now, read both pointers to a buffer and compare those. BitcastBuffer BufferA( Bits(ASTCtx.getTypeSize(ElemTypeA) * PtrA.getNumElems())); diff --git a/clang/lib/AST/ComparisonCategories.cpp b/clang/lib/AST/ComparisonCategories.cpp index 0c7a7f4eacbbf..1b9c938e2ace3 100644 --- a/clang/lib/AST/ComparisonCategories.cpp +++ b/clang/lib/AST/ComparisonCategories.cpp @@ -49,7 +49,7 @@ bool ComparisonCategoryInfo::ValueInfo::hasValidIntValue() const { // Before we attempt to get the value of the first field, ensure that we // actually have one (and only one) field. 
const auto *Record = VD->getType()->getAsCXXRecordDecl(); - if (std::distance(Record->field_begin(), Record->field_end()) != 1 || + if (Record->getNumFields() != 1 || !Record->field_begin()->getType()->isIntegralOrEnumerationType()) return false; diff --git a/clang/lib/AST/ExprConstant.cpp b/clang/lib/AST/ExprConstant.cpp index b986ee6ca4fa3..e5af4cb049ba9 100644 --- a/clang/lib/AST/ExprConstant.cpp +++ b/clang/lib/AST/ExprConstant.cpp @@ -3971,8 +3971,7 @@ static bool constructAggregate(EvalInfo &Info, const FPOptions FPO, if (auto *CXXRD = dyn_cast(RD)) NumBases = CXXRD->getNumBases(); - *Res = APValue(APValue::UninitStruct(), NumBases, - std::distance(RD->field_begin(), RD->field_end())); + *Res = APValue(APValue::UninitStruct(), NumBases, RD->getNumFields()); SmallVector> ReverseList; // we need to traverse backwards @@ -5529,8 +5528,8 @@ static bool handleDefaultInitValue(QualType T, APValue &Result) { Result = APValue((const FieldDecl *)nullptr); return true; } - Result = APValue(APValue::UninitStruct(), RD->getNumBases(), - std::distance(RD->field_begin(), RD->field_end())); + Result = + APValue(APValue::UninitStruct(), RD->getNumBases(), RD->getNumFields()); unsigned Index = 0; for (CXXRecordDecl::base_class_const_iterator I = RD->bases_begin(), @@ -7184,7 +7183,7 @@ static bool HandleConstructorCall(const Expr *E, const LValue &This, if (!Result.hasValue()) { if (!RD->isUnion()) Result = APValue(APValue::UninitStruct(), RD->getNumBases(), - std::distance(RD->field_begin(), RD->field_end())); + RD->getNumFields()); else // A union starts with no active member. Result = APValue((const FieldDecl*)nullptr); @@ -8135,8 +8134,7 @@ class BufferToAPValueConverter { if (auto *CXXRD = dyn_cast(RD)) NumBases = CXXRD->getNumBases(); - APValue ResultVal(APValue::UninitStruct(), NumBases, - std::distance(RD->field_begin(), RD->field_end())); + APValue ResultVal(APValue::UninitStruct(), NumBases, RD->getNumFields()); // Visit the base classes. if (auto *CXXRD = dyn_cast(RD)) { @@ -11146,7 +11144,7 @@ static bool HandleClassZeroInitialization(EvalInfo &Info, const Expr *E, assert(!RD->isUnion() && "Expected non-union class type"); const CXXRecordDecl *CD = dyn_cast(RD); Result = APValue(APValue::UninitStruct(), CD ? CD->getNumBases() : 0, - std::distance(RD->field_begin(), RD->field_end())); + RD->getNumFields()); if (RD->isInvalidDecl()) return false; const ASTRecordLayout &Layout = Info.Ctx.getASTRecordLayout(RD); @@ -11342,7 +11340,7 @@ bool RecordExprEvaluator::VisitCXXParenListOrInitListExpr( if (!Result.hasValue()) Result = APValue(APValue::UninitStruct(), CXXRD ? 
CXXRD->getNumBases() : 0, - std::distance(RD->field_begin(), RD->field_end())); + RD->getNumFields()); unsigned ElementNo = 0; bool Success = true; @@ -11549,8 +11547,7 @@ bool RecordExprEvaluator::VisitLambdaExpr(const LambdaExpr *E) { if (ClosureClass->isInvalidDecl()) return false; - const size_t NumFields = - std::distance(ClosureClass->field_begin(), ClosureClass->field_end()); + const size_t NumFields = ClosureClass->getNumFields(); assert(NumFields == (size_t)std::distance(E->capture_init_begin(), E->capture_init_end()) && diff --git a/clang/lib/AST/OpenMPClause.cpp b/clang/lib/AST/OpenMPClause.cpp index 0640fed823771..2183d77de8fa7 100644 --- a/clang/lib/AST/OpenMPClause.cpp +++ b/clang/lib/AST/OpenMPClause.cpp @@ -1321,7 +1321,7 @@ OMPToClause *OMPToClause::Create( const ASTContext &C, const OMPVarListLocTy &Locs, ArrayRef Vars, ArrayRef Declarations, MappableExprComponentListsRef ComponentLists, ArrayRef UDMapperRefs, - ArrayRef MotionModifiers, + Expr *IteratorModifier, ArrayRef MotionModifiers, ArrayRef MotionModifiersLoc, NestedNameSpecifierLoc UDMQualifierLoc, DeclarationNameInfo MapperId) { OMPMappableExprListSizeTy Sizes; @@ -1343,7 +1343,7 @@ OMPToClause *OMPToClause::Create( void *Mem = C.Allocate( totalSizeToAlloc( - 2 * Sizes.NumVars, Sizes.NumUniqueDeclarations, + 2 * Sizes.NumVars + 1, Sizes.NumUniqueDeclarations, Sizes.NumUniqueDeclarations + Sizes.NumComponentLists, Sizes.NumComponents)); @@ -1353,6 +1353,7 @@ OMPToClause *OMPToClause::Create( Clause->setVarRefs(Vars); Clause->setUDMapperRefs(UDMapperRefs); Clause->setClauseInfo(Declarations, ComponentLists); + Clause->setIteratorModifier(IteratorModifier); return Clause; } @@ -1361,17 +1362,19 @@ OMPToClause *OMPToClause::CreateEmpty(const ASTContext &C, void *Mem = C.Allocate( totalSizeToAlloc( - 2 * Sizes.NumVars, Sizes.NumUniqueDeclarations, + 2 * Sizes.NumVars + 1, Sizes.NumUniqueDeclarations, Sizes.NumUniqueDeclarations + Sizes.NumComponentLists, Sizes.NumComponents)); - return new (Mem) OMPToClause(Sizes); + OMPToClause *Clause = new (Mem) OMPToClause(Sizes); + Clause->setIteratorModifier(nullptr); + return Clause; } OMPFromClause *OMPFromClause::Create( const ASTContext &C, const OMPVarListLocTy &Locs, ArrayRef Vars, ArrayRef Declarations, MappableExprComponentListsRef ComponentLists, ArrayRef UDMapperRefs, - ArrayRef MotionModifiers, + Expr *IteratorModifier, ArrayRef MotionModifiers, ArrayRef MotionModifiersLoc, NestedNameSpecifierLoc UDMQualifierLoc, DeclarationNameInfo MapperId) { OMPMappableExprListSizeTy Sizes; @@ -1393,7 +1396,7 @@ OMPFromClause *OMPFromClause::Create( void *Mem = C.Allocate( totalSizeToAlloc( - 2 * Sizes.NumVars, Sizes.NumUniqueDeclarations, + 2 * Sizes.NumVars + 1, Sizes.NumUniqueDeclarations, Sizes.NumUniqueDeclarations + Sizes.NumComponentLists, Sizes.NumComponents)); @@ -1404,6 +1407,7 @@ OMPFromClause *OMPFromClause::Create( Clause->setVarRefs(Vars); Clause->setUDMapperRefs(UDMapperRefs); Clause->setClauseInfo(Declarations, ComponentLists); + Clause->setIteratorModifier(IteratorModifier); return Clause; } @@ -1413,10 +1417,12 @@ OMPFromClause::CreateEmpty(const ASTContext &C, void *Mem = C.Allocate( totalSizeToAlloc( - 2 * Sizes.NumVars, Sizes.NumUniqueDeclarations, + 2 * Sizes.NumVars + 1, Sizes.NumUniqueDeclarations, Sizes.NumUniqueDeclarations + Sizes.NumComponentLists, Sizes.NumComponents)); - return new (Mem) OMPFromClause(Sizes); + OMPFromClause *Clause = new (Mem) OMPFromClause(Sizes); + Clause->setIteratorModifier(nullptr); + return Clause; } void 
OMPUseDevicePtrClause::setPrivateCopies(ArrayRef VL) { @@ -2694,12 +2700,16 @@ template void OMPClausePrinter::VisitOMPMotionClause(T *Node) { OS << '('; for (unsigned I = 0; I < NumberOfOMPMotionModifiers; ++I) { if (Node->getMotionModifier(I) != OMPC_MOTION_MODIFIER_unknown) { - OS << getOpenMPSimpleClauseTypeName(Node->getClauseKind(), - Node->getMotionModifier(I)); - if (Node->getMotionModifier(I) == OMPC_MOTION_MODIFIER_mapper) - PrintMapper(OS, Node, Policy); - if (I < ModifierCount - 1) - OS << ", "; + if (Node->getMotionModifier(I) == OMPC_MOTION_MODIFIER_iterator) { + PrintIterator(OS, Node, Policy); + } else { + OS << getOpenMPSimpleClauseTypeName(Node->getClauseKind(), + Node->getMotionModifier(I)); + if (Node->getMotionModifier(I) == OMPC_MOTION_MODIFIER_mapper) + PrintMapper(OS, Node, Policy); + if (I < ModifierCount - 1) + OS << ", "; + } } } OS << ':'; diff --git a/clang/lib/CodeGen/CGHLSLRuntime.cpp b/clang/lib/CodeGen/CGHLSLRuntime.cpp index f5c07fe2e33ff..bbe85986b07fc 100644 --- a/clang/lib/CodeGen/CGHLSLRuntime.cpp +++ b/clang/lib/CodeGen/CGHLSLRuntime.cpp @@ -816,8 +816,7 @@ CGHLSLRuntime::handleStructSemanticLoad( const llvm::StructType *ST = cast(Type); const clang::RecordDecl *RD = Decl->getType()->getAsRecordDecl(); - assert(std::distance(RD->field_begin(), RD->field_end()) == - ST->getNumElements()); + assert(RD->getNumFields() == ST->getNumElements()); llvm::Value *Aggregate = llvm::PoisonValue::get(Type); auto FieldDecl = RD->field_begin(); @@ -849,8 +848,7 @@ CGHLSLRuntime::handleStructSemanticStore( RD = Decl->getType()->getAsRecordDecl(); assert(RD); - assert(std::distance(RD->field_begin(), RD->field_end()) == - ST->getNumElements()); + assert(RD->getNumFields() == ST->getNumElements()); auto FieldDecl = RD->field_begin(); for (unsigned I = 0; I < ST->getNumElements(); ++I) { diff --git a/clang/lib/CodeGen/CGOpenMPRuntime.cpp b/clang/lib/CodeGen/CGOpenMPRuntime.cpp index 5ceaaf30b8d24..a735295d6e78c 100644 --- a/clang/lib/CodeGen/CGOpenMPRuntime.cpp +++ b/clang/lib/CodeGen/CGOpenMPRuntime.cpp @@ -8683,6 +8683,15 @@ class MappableExprsHandler { if (llvm::is_contained(C->getMotionModifiers(), OMPC_MOTION_MODIFIER_present)) Kind = Present; + if (llvm::is_contained(C->getMotionModifiers(), + OMPC_MOTION_MODIFIER_iterator)) { + if (auto *IteratorExpr = dyn_cast( + C->getIteratorModifier()->IgnoreParenImpCasts())) { + const auto *VD = cast(IteratorExpr->getIteratorDecl(0)); + CGF.EmitVarDecl(*VD); + } + } + const auto *EI = C->getVarRefs().begin(); for (const auto L : C->component_lists()) { InfoGen(std::get<0>(L), Kind, std::get<1>(L), OMPC_MAP_to, {}, @@ -8699,6 +8708,15 @@ class MappableExprsHandler { if (llvm::is_contained(C->getMotionModifiers(), OMPC_MOTION_MODIFIER_present)) Kind = Present; + if (llvm::is_contained(C->getMotionModifiers(), + OMPC_MOTION_MODIFIER_iterator)) { + if (auto *IteratorExpr = dyn_cast( + C->getIteratorModifier()->IgnoreParenImpCasts())) { + const auto *VD = cast(IteratorExpr->getIteratorDecl(0)); + CGF.EmitVarDecl(*VD); + } + } + const auto *EI = C->getVarRefs().begin(); for (const auto L : C->component_lists()) { InfoGen(std::get<0>(L), Kind, std::get<1>(L), OMPC_MAP_from, {}, diff --git a/clang/lib/Parse/ParseOpenMP.cpp b/clang/lib/Parse/ParseOpenMP.cpp index 3b69c286634bb..15c3f7594bf44 100644 --- a/clang/lib/Parse/ParseOpenMP.cpp +++ b/clang/lib/Parse/ParseOpenMP.cpp @@ -4925,19 +4925,28 @@ bool Parser::ParseOpenMPVarList(OpenMPDirectiveKind DKind, break; Data.MotionModifiers.push_back(Modifier); 
Data.MotionModifiersLoc.push_back(Tok.getLocation()); - ConsumeToken(); - if (Modifier == OMPC_MOTION_MODIFIER_mapper) { - IsInvalidMapperModifier = parseMapperModifier(Data); - if (IsInvalidMapperModifier) + if (PP.getSpelling(Tok) == "iterator" && getLangOpts().OpenMP >= 51) { + ExprResult Tail; + Tail = ParseOpenMPIteratorsExpr(); + Tail = Actions.ActOnFinishFullExpr(Tail.get(), T.getOpenLocation(), + /*DiscardedValue=*/false); + if (Tail.isUsable()) + Data.IteratorExpr = Tail.get(); + } else { + ConsumeToken(); + if (Modifier == OMPC_MOTION_MODIFIER_mapper) { + IsInvalidMapperModifier = parseMapperModifier(Data); + if (IsInvalidMapperModifier) + break; + } + // OpenMP < 5.1 doesn't permit a ',' or additional modifiers. + if (getLangOpts().OpenMP < 51) break; + // OpenMP 5.1 accepts an optional ',' even if the next character is ':'. + // TODO: Is that intentional? + if (Tok.is(tok::comma)) + ConsumeToken(); } - // OpenMP < 5.1 doesn't permit a ',' or additional modifiers. - if (getLangOpts().OpenMP < 51) - break; - // OpenMP 5.1 accepts an optional ',' even if the next character is ':'. - // TODO: Is that intentional? - if (Tok.is(tok::comma)) - ConsumeToken(); } if (!Data.MotionModifiers.empty() && Tok.isNot(tok::colon)) { if (!IsInvalidMapperModifier) { diff --git a/clang/lib/Sema/CodeCompleteConsumer.cpp b/clang/lib/Sema/CodeCompleteConsumer.cpp index e3fc7c11f4594..50a552272f421 100644 --- a/clang/lib/Sema/CodeCompleteConsumer.cpp +++ b/clang/lib/Sema/CodeCompleteConsumer.cpp @@ -539,8 +539,7 @@ unsigned CodeCompleteConsumer::OverloadCandidate::getNumParams() const { return Template->getTemplateParameters()->size(); if (Kind == CK_Aggregate) { - unsigned Count = - std::distance(AggregateType->field_begin(), AggregateType->field_end()); + unsigned Count = AggregateType->getNumFields(); if (const auto *CRD = dyn_cast(AggregateType)) Count += CRD->getNumBases(); return Count; diff --git a/clang/lib/Sema/SemaOpenMP.cpp b/clang/lib/Sema/SemaOpenMP.cpp index 836f5517f13f1..4a88891954acd 100644 --- a/clang/lib/Sema/SemaOpenMP.cpp +++ b/clang/lib/Sema/SemaOpenMP.cpp @@ -19155,16 +19155,16 @@ OMPClause *SemaOpenMP::ActOnOpenMPVarListClause(OpenMPClauseKind Kind, ExtraModifierLoc, ColonLoc, VarList, Locs); break; case OMPC_to: - Res = - ActOnOpenMPToClause(Data.MotionModifiers, Data.MotionModifiersLoc, - Data.ReductionOrMapperIdScopeSpec, - Data.ReductionOrMapperId, ColonLoc, VarList, Locs); + Res = ActOnOpenMPToClause( + Data.MotionModifiers, Data.MotionModifiersLoc, Data.IteratorExpr, + Data.ReductionOrMapperIdScopeSpec, Data.ReductionOrMapperId, ColonLoc, + VarList, Locs); break; case OMPC_from: - Res = ActOnOpenMPFromClause(Data.MotionModifiers, Data.MotionModifiersLoc, - Data.ReductionOrMapperIdScopeSpec, - Data.ReductionOrMapperId, ColonLoc, VarList, - Locs); + Res = ActOnOpenMPFromClause( + Data.MotionModifiers, Data.MotionModifiersLoc, Data.IteratorExpr, + Data.ReductionOrMapperIdScopeSpec, Data.ReductionOrMapperId, ColonLoc, + VarList, Locs); break; case OMPC_use_device_ptr: Res = ActOnOpenMPUseDevicePtrClause(VarList, Locs); @@ -24902,11 +24902,12 @@ void SemaOpenMP::ActOnOpenMPDeclareTargetInitializer(Decl *TargetDecl) { OMPClause *SemaOpenMP::ActOnOpenMPToClause( ArrayRef MotionModifiers, - ArrayRef MotionModifiersLoc, + ArrayRef MotionModifiersLoc, Expr *IteratorExpr, CXXScopeSpec &MapperIdScopeSpec, DeclarationNameInfo &MapperId, SourceLocation ColonLoc, ArrayRef VarList, const OMPVarListLocTy &Locs, ArrayRef UnresolvedMappers) { OpenMPMotionModifierKind Modifiers[] = 
{OMPC_MOTION_MODIFIER_unknown, + OMPC_MOTION_MODIFIER_unknown, OMPC_MOTION_MODIFIER_unknown}; SourceLocation ModifiersLoc[NumberOfOMPMotionModifiers]; @@ -24930,20 +24931,25 @@ OMPClause *SemaOpenMP::ActOnOpenMPToClause( MapperIdScopeSpec, MapperId, UnresolvedMappers); if (MVLI.ProcessedVarList.empty()) return nullptr; - + if (IteratorExpr) + if (auto *DRE = dyn_cast(IteratorExpr)) + if (auto *VD = dyn_cast(DRE->getDecl())) + DSAStack->addIteratorVarDecl(VD); return OMPToClause::Create( getASTContext(), Locs, MVLI.ProcessedVarList, MVLI.VarBaseDeclarations, - MVLI.VarComponents, MVLI.UDMapperList, Modifiers, ModifiersLoc, - MapperIdScopeSpec.getWithLocInContext(getASTContext()), MapperId); + MVLI.VarComponents, MVLI.UDMapperList, IteratorExpr, Modifiers, + ModifiersLoc, MapperIdScopeSpec.getWithLocInContext(getASTContext()), + MapperId); } OMPClause *SemaOpenMP::ActOnOpenMPFromClause( ArrayRef MotionModifiers, - ArrayRef MotionModifiersLoc, + ArrayRef MotionModifiersLoc, Expr *IteratorExpr, CXXScopeSpec &MapperIdScopeSpec, DeclarationNameInfo &MapperId, SourceLocation ColonLoc, ArrayRef VarList, const OMPVarListLocTy &Locs, ArrayRef UnresolvedMappers) { OpenMPMotionModifierKind Modifiers[] = {OMPC_MOTION_MODIFIER_unknown, + OMPC_MOTION_MODIFIER_unknown, OMPC_MOTION_MODIFIER_unknown}; SourceLocation ModifiersLoc[NumberOfOMPMotionModifiers]; @@ -24967,11 +24973,15 @@ OMPClause *SemaOpenMP::ActOnOpenMPFromClause( MapperIdScopeSpec, MapperId, UnresolvedMappers); if (MVLI.ProcessedVarList.empty()) return nullptr; - + if (IteratorExpr) + if (auto *DRE = dyn_cast(IteratorExpr)) + if (auto *VD = dyn_cast(DRE->getDecl())) + DSAStack->addIteratorVarDecl(VD); return OMPFromClause::Create( getASTContext(), Locs, MVLI.ProcessedVarList, MVLI.VarBaseDeclarations, - MVLI.VarComponents, MVLI.UDMapperList, Modifiers, ModifiersLoc, - MapperIdScopeSpec.getWithLocInContext(getASTContext()), MapperId); + MVLI.VarComponents, MVLI.UDMapperList, IteratorExpr, Modifiers, + ModifiersLoc, MapperIdScopeSpec.getWithLocInContext(getASTContext()), + MapperId); } OMPClause * diff --git a/clang/lib/Sema/TreeTransform.h b/clang/lib/Sema/TreeTransform.h index 0e8b674a006d0..8e5dbeb792348 100644 --- a/clang/lib/Sema/TreeTransform.h +++ b/clang/lib/Sema/TreeTransform.h @@ -2221,13 +2221,14 @@ class TreeTransform { OMPClause * RebuildOMPToClause(ArrayRef MotionModifiers, ArrayRef MotionModifiersLoc, - CXXScopeSpec &MapperIdScopeSpec, + Expr *IteratorModifier, CXXScopeSpec &MapperIdScopeSpec, DeclarationNameInfo &MapperId, SourceLocation ColonLoc, ArrayRef VarList, const OMPVarListLocTy &Locs, ArrayRef UnresolvedMappers) { return getSema().OpenMP().ActOnOpenMPToClause( - MotionModifiers, MotionModifiersLoc, MapperIdScopeSpec, MapperId, - ColonLoc, VarList, Locs, UnresolvedMappers); + MotionModifiers, MotionModifiersLoc, IteratorModifier, + MapperIdScopeSpec, MapperId, ColonLoc, VarList, Locs, + UnresolvedMappers); } /// Build a new OpenMP 'from' clause. 
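
For orientation, this is the kind of source the plumbing above accepts once parsing, Sema, and codegen agree. The snippet is illustrative only (OpenMP 5.1 syntax, not a test from this patch; the function and variable names are made up):

```cpp
// Illustrative OpenMP 5.1 use of the new 'iterator' motion modifier: 'it'
// ranges over 0..n-1, and each a[it] is transferred by 'target update'.
void refresh_on_device(int *a, int n) {
#pragma omp target update to(iterator(it = 0 : n) : a[it])
}
```

TreeTransform must transform the stored iterator expression itself, as the hunks below do, because template instantiation can rewrite the iterator bounds.
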
@@ -2237,13 +2238,14 @@ class TreeTransform { OMPClause * RebuildOMPFromClause(ArrayRef MotionModifiers, ArrayRef MotionModifiersLoc, - CXXScopeSpec &MapperIdScopeSpec, + Expr *IteratorModifier, CXXScopeSpec &MapperIdScopeSpec, DeclarationNameInfo &MapperId, SourceLocation ColonLoc, ArrayRef VarList, const OMPVarListLocTy &Locs, ArrayRef UnresolvedMappers) { return getSema().OpenMP().ActOnOpenMPFromClause( - MotionModifiers, MotionModifiersLoc, MapperIdScopeSpec, MapperId, - ColonLoc, VarList, Locs, UnresolvedMappers); + MotionModifiers, MotionModifiersLoc, IteratorModifier, + MapperIdScopeSpec, MapperId, ColonLoc, VarList, Locs, + UnresolvedMappers); } /// Build a new OpenMP 'use_device_ptr' clause. @@ -11535,6 +11537,13 @@ template OMPClause *TreeTransform::TransformOMPToClause(OMPToClause *C) { OMPVarListLocTy Locs(C->getBeginLoc(), C->getLParenLoc(), C->getEndLoc()); llvm::SmallVector Vars; + Expr *IteratorModifier = C->getIteratorModifier(); + if (IteratorModifier) { + ExprResult MapModRes = getDerived().TransformExpr(IteratorModifier); + if (MapModRes.isInvalid()) + return nullptr; + IteratorModifier = MapModRes.get(); + } CXXScopeSpec MapperIdScopeSpec; DeclarationNameInfo MapperIdInfo; llvm::SmallVector UnresolvedMappers; @@ -11542,14 +11551,22 @@ OMPClause *TreeTransform::TransformOMPToClause(OMPToClause *C) { *this, C, Vars, MapperIdScopeSpec, MapperIdInfo, UnresolvedMappers)) return nullptr; return getDerived().RebuildOMPToClause( - C->getMotionModifiers(), C->getMotionModifiersLoc(), MapperIdScopeSpec, - MapperIdInfo, C->getColonLoc(), Vars, Locs, UnresolvedMappers); + C->getMotionModifiers(), C->getMotionModifiersLoc(), IteratorModifier, + MapperIdScopeSpec, MapperIdInfo, C->getColonLoc(), Vars, Locs, + UnresolvedMappers); } template OMPClause *TreeTransform::TransformOMPFromClause(OMPFromClause *C) { OMPVarListLocTy Locs(C->getBeginLoc(), C->getLParenLoc(), C->getEndLoc()); llvm::SmallVector Vars; + Expr *IteratorModifier = C->getIteratorModifier(); + if (IteratorModifier) { + ExprResult MapModRes = getDerived().TransformExpr(IteratorModifier); + if (MapModRes.isInvalid()) + return nullptr; + IteratorModifier = MapModRes.get(); + } CXXScopeSpec MapperIdScopeSpec; DeclarationNameInfo MapperIdInfo; llvm::SmallVector UnresolvedMappers; @@ -11557,8 +11574,9 @@ OMPClause *TreeTransform::TransformOMPFromClause(OMPFromClause *C) { *this, C, Vars, MapperIdScopeSpec, MapperIdInfo, UnresolvedMappers)) return nullptr; return getDerived().RebuildOMPFromClause( - C->getMotionModifiers(), C->getMotionModifiersLoc(), MapperIdScopeSpec, - MapperIdInfo, C->getColonLoc(), Vars, Locs, UnresolvedMappers); + C->getMotionModifiers(), C->getMotionModifiersLoc(), IteratorModifier, + MapperIdScopeSpec, MapperIdInfo, C->getColonLoc(), Vars, Locs, + UnresolvedMappers); } template diff --git a/clang/lib/Serialization/ASTReader.cpp b/clang/lib/Serialization/ASTReader.cpp index b0c7bae46f09e..47a1dc8f5a478 100644 --- a/clang/lib/Serialization/ASTReader.cpp +++ b/clang/lib/Serialization/ASTReader.cpp @@ -12387,6 +12387,8 @@ void OMPClauseReader::VisitOMPToClause(OMPToClause *C) { C->setMotionModifier( I, static_cast(Record.readInt())); C->setMotionModifierLoc(I, Record.readSourceLocation()); + if (C->getMotionModifier(I) == OMPC_MOTION_MODIFIER_iterator) + C->setIteratorModifier(Record.readExpr()); } C->setMapperQualifierLoc(Record.readNestedNameSpecifierLoc()); C->setMapperIdInfo(Record.readDeclarationNameInfo()); @@ -12443,6 +12445,8 @@ void OMPClauseReader::VisitOMPFromClause(OMPFromClause *C) { 
C->setMotionModifier( I, static_cast(Record.readInt())); C->setMotionModifierLoc(I, Record.readSourceLocation()); + if (C->getMotionModifier(I) == OMPC_MOTION_MODIFIER_iterator) + C->setIteratorModifier(Record.readExpr()); } C->setMapperQualifierLoc(Record.readNestedNameSpecifierLoc()); C->setMapperIdInfo(Record.readDeclarationNameInfo()); diff --git a/clang/lib/Serialization/ASTWriter.cpp b/clang/lib/Serialization/ASTWriter.cpp index e8c0d3f2b4ee9..fcee93c0ebbd3 100644 --- a/clang/lib/Serialization/ASTWriter.cpp +++ b/clang/lib/Serialization/ASTWriter.cpp @@ -8417,6 +8417,8 @@ void OMPClauseWriter::VisitOMPToClause(OMPToClause *C) { for (unsigned I = 0; I < NumberOfOMPMotionModifiers; ++I) { Record.push_back(C->getMotionModifier(I)); Record.AddSourceLocation(C->getMotionModifierLoc(I)); + if (C->getMotionModifier(I) == OMPC_MOTION_MODIFIER_iterator) + Record.AddStmt(C->getIteratorModifier()); } Record.AddNestedNameSpecifierLoc(C->getMapperQualifierLoc()); Record.AddDeclarationNameInfo(C->getMapperIdInfo()); @@ -8447,6 +8449,8 @@ void OMPClauseWriter::VisitOMPFromClause(OMPFromClause *C) { for (unsigned I = 0; I < NumberOfOMPMotionModifiers; ++I) { Record.push_back(C->getMotionModifier(I)); Record.AddSourceLocation(C->getMotionModifierLoc(I)); + if (C->getMotionModifier(I) == OMPC_MOTION_MODIFIER_iterator) + Record.AddStmt(C->getIteratorModifier()); } Record.AddNestedNameSpecifierLoc(C->getMapperQualifierLoc()); Record.AddDeclarationNameInfo(C->getMapperIdInfo()); diff --git a/clang/test/AST/ByteCode/builtin-functions.cpp b/clang/test/AST/ByteCode/builtin-functions.cpp index 4a53cb66b2fdd..3076b5239ebbe 100644 --- a/clang/test/AST/ByteCode/builtin-functions.cpp +++ b/clang/test/AST/ByteCode/builtin-functions.cpp @@ -1545,6 +1545,13 @@ namespace Memcmp { int unknown; void foo(void) { unknown *= __builtin_memcmp(0, 0, 2); } + + constexpr int onepasttheend(char a) { + __builtin_memcmp(&a, &a + 1, 1); // both-note {{read of dereferenced one-past-the-end pointer}} + return 1; + } + static_assert(onepasttheend(10)); // both-error {{not an integral constant expression}} \ + // both-note {{in call to}} } namespace Memchr { diff --git a/clang/test/AST/ByteCode/c.c b/clang/test/AST/ByteCode/c.c index bffd557ff77a6..0d3d97b5eeab2 100644 --- a/clang/test/AST/ByteCode/c.c +++ b/clang/test/AST/ByteCode/c.c @@ -392,3 +392,16 @@ void plainComplex(void) { _Complex cd; // all-warning {{_Complex double}} cd = *(_Complex *)&(struct { double r, i; }){0.0, 0.0}; // all-warning {{_Complex double}} } + +/// This test results in an ImplicitValueInitExpr with DiscardResult set. 
+struct M{ + char c; +}; +typedef struct S64 { + struct M m; + char a[64]; +} I64; + +_Static_assert((((I64){}, 1)), ""); // all-warning {{left operand of comma operator has no effect}} \ + // pedantic-warning {{use of an empty initializer is a C23 extension}} \ + // pedantic-warning {{expression is not an integer constant expression; folding it to a constant is a GNU extension}} diff --git a/clang/test/OpenMP/cancel_codegen.cpp b/clang/test/OpenMP/cancel_codegen.cpp index 16e7542a8e826..600aae211087a 100644 --- a/clang/test/OpenMP/cancel_codegen.cpp +++ b/clang/test/OpenMP/cancel_codegen.cpp @@ -774,10 +774,8 @@ for (int i = 0; i < argc; ++i) { // CHECK3-NEXT: call void @__kmpc_barrier(ptr @[[GLOB2:[0-9]+]], i32 [[OMP_GLOBAL_THREAD_NUM12]]) // CHECK3-NEXT: br label [[OMP_SECTION_LOOP_AFTER:%.*]] // CHECK3: omp_section_loop.after: -// CHECK3-NEXT: br label [[OMP_SECTION_LOOP_AFTERSECTIONS_FINI:%.*]] -// CHECK3: omp_section_loop.aftersections.fini: -// CHECK3-NEXT: br label [[OMP_SECTION_LOOP_PREHEADER13:%.*]] -// CHECK3: omp_section_loop.preheader13: +// CHECK3-NEXT: br label [[OMP_SECTION_LOOP_PREHEADER16:%.*]] +// CHECK3: omp_section_loop.preheader16: // CHECK3-NEXT: store i32 0, ptr [[P_LOWERBOUND29]], align 4 // CHECK3-NEXT: store i32 1, ptr [[P_UPPERBOUND30]], align 4 // CHECK3-NEXT: store i32 1, ptr [[P_STRIDE31]], align 4 @@ -787,54 +785,52 @@ for (int i = 0; i < argc; ++i) { // CHECK3-NEXT: [[TMP10:%.*]] = load i32, ptr [[P_UPPERBOUND30]], align 4 // CHECK3-NEXT: [[TMP11:%.*]] = sub i32 [[TMP10]], [[TMP9]] // CHECK3-NEXT: [[TMP12:%.*]] = add i32 [[TMP11]], 1 -// CHECK3-NEXT: br label [[OMP_SECTION_LOOP_HEADER14:%.*]] -// CHECK3: omp_section_loop.header14: -// CHECK3-NEXT: [[OMP_SECTION_LOOP_IV20:%.*]] = phi i32 [ 0, [[OMP_SECTION_LOOP_PREHEADER13]] ], [ [[OMP_SECTION_LOOP_NEXT22:%.*]], [[OMP_SECTION_LOOP_INC17:%.*]] ] -// CHECK3-NEXT: br label [[OMP_SECTION_LOOP_COND15:%.*]] -// CHECK3: omp_section_loop.cond15: +// CHECK3-NEXT: br label [[OMP_SECTION_LOOP_HEADER17:%.*]] +// CHECK3: omp_section_loop.header17: +// CHECK3-NEXT: [[OMP_SECTION_LOOP_IV20:%.*]] = phi i32 [ 0, [[OMP_SECTION_LOOP_PREHEADER16]] ], [ [[OMP_SECTION_LOOP_NEXT22:%.*]], [[OMP_SECTION_LOOP_INC17:%.*]] ] +// CHECK3-NEXT: br label [[OMP_SECTION_LOOP_COND18:%.*]] +// CHECK3: omp_section_loop.cond18: // CHECK3-NEXT: [[OMP_SECTION_LOOP_CMP21:%.*]] = icmp ult i32 [[OMP_SECTION_LOOP_IV20]], [[TMP12]] -// CHECK3-NEXT: br i1 [[OMP_SECTION_LOOP_CMP21]], label [[OMP_SECTION_LOOP_BODY16:%.*]], label [[OMP_SECTION_LOOP_EXIT18:%.*]] -// CHECK3: omp_section_loop.body16: +// CHECK3-NEXT: br i1 [[OMP_SECTION_LOOP_CMP21]], label [[OMP_SECTION_LOOP_BODY19:%.*]], label [[OMP_SECTION_LOOP_EXIT21:%.*]] +// CHECK3: omp_section_loop.body19: // CHECK3-NEXT: [[TMP13:%.*]] = add i32 [[OMP_SECTION_LOOP_IV20]], [[TMP9]] // CHECK3-NEXT: [[TMP14:%.*]] = mul i32 [[TMP13]], 1 // CHECK3-NEXT: [[TMP15:%.*]] = add i32 [[TMP14]], 0 // CHECK3-NEXT: switch i32 [[TMP15]], label [[OMP_SECTION_LOOP_BODY16_SECTIONS_AFTER:%.*]] [ -// CHECK3-NEXT: i32 0, label [[OMP_SECTION_LOOP_BODY_CASE23:%.*]] -// CHECK3-NEXT: i32 1, label [[OMP_SECTION_LOOP_BODY_CASE25:%.*]] +// CHECK3-NEXT: i32 0, label [[OMP_SECTION_LOOP_BODY_CASE26:%.*]] +// CHECK3-NEXT: i32 1, label [[OMP_SECTION_LOOP_BODY_CASE29:%.*]] // CHECK3-NEXT: ] -// CHECK3: omp_section_loop.body.case23: +// CHECK3: omp_section_loop.body.case26: // CHECK3-NEXT: [[OMP_GLOBAL_THREAD_NUM24:%.*]] = call i32 @__kmpc_global_thread_num(ptr @[[GLOB1]]) // CHECK3-NEXT: [[TMP16:%.*]] = call i32 @__kmpc_cancel(ptr 
@[[GLOB1]], i32 [[OMP_GLOBAL_THREAD_NUM24]], i32 3) // CHECK3-NEXT: [[TMP17:%.*]] = icmp eq i32 [[TMP16]], 0 -// CHECK3-NEXT: br i1 [[TMP17]], label [[OMP_SECTION_LOOP_BODY_CASE23_SPLIT:%.*]], label [[OMP_SECTION_LOOP_BODY_CASE23_CNCL:%.*]] -// CHECK3: omp_section_loop.body.case23.split: -// CHECK3-NEXT: br label [[OMP_SECTION_LOOP_BODY_CASE23_SECTION_AFTER:%.*]] -// CHECK3: omp_section_loop.body.case23.section.after: +// CHECK3-NEXT: br i1 [[TMP17]], label [[OMP_SECTION_LOOP_BODY_CASE26_SPLIT:%.*]], label [[OMP_SECTION_LOOP_BODY_CASE26_CNCL:%.*]] +// CHECK3: omp_section_loop.body.case26.split: +// CHECK3-NEXT: br label [[OMP_SECTION_LOOP_BODY_CASE26_SECTION_AFTER:%.*]] +// CHECK3: omp_section_loop.body.case26.section.after: // CHECK3-NEXT: br label [[OMP_SECTION_LOOP_BODY16_SECTIONS_AFTER]] -// CHECK3: omp_section_loop.body.case25: +// CHECK3: omp_section_loop.body.case29: // CHECK3-NEXT: [[OMP_GLOBAL_THREAD_NUM27:%.*]] = call i32 @__kmpc_global_thread_num(ptr @[[GLOB1]]) // CHECK3-NEXT: [[TMP18:%.*]] = call i32 @__kmpc_cancel(ptr @[[GLOB1]], i32 [[OMP_GLOBAL_THREAD_NUM27]], i32 3) // CHECK3-NEXT: [[TMP19:%.*]] = icmp eq i32 [[TMP18]], 0 -// CHECK3-NEXT: br i1 [[TMP19]], label [[OMP_SECTION_LOOP_BODY_CASE25_SPLIT:%.*]], label [[OMP_SECTION_LOOP_BODY_CASE25_CNCL:%.*]] -// CHECK3: omp_section_loop.body.case25.split: -// CHECK3-NEXT: br label [[OMP_SECTION_LOOP_BODY_CASE25_SECTION_AFTER26:%.*]] -// CHECK3: omp_section_loop.body.case25.section.after26: -// CHECK3-NEXT: br label [[OMP_SECTION_LOOP_BODY_CASE25_SECTION_AFTER:%.*]] -// CHECK3: omp_section_loop.body.case25.section.after: -// CHECK3-NEXT: br label [[OMP_SECTION_LOOP_BODY16_SECTIONS_AFTER]] -// CHECK3: omp_section_loop.body16.sections.after: -// CHECK3-NEXT: br label [[OMP_SECTION_LOOP_INC17]] -// CHECK3: omp_section_loop.inc17: +// CHECK3-NEXT: br i1 [[TMP19]], label [[OMP_SECTION_LOOP_BODY_CASE29_SPLIT:%.*]], label [[OMP_SECTION_LOOP_BODY_CASE29_CNCL:%.*]] +// CHECK3: omp_section_loop.body.case29.split: +// CHECK3-NEXT: br label [[OMP_SECTION_LOOP_BODY_CASE25_SECTION_AFTER29:%.*]] +// CHECK3: omp_section_loop.body.case29.section.after30: +// CHECK3-NEXT: br label [[OMP_SECTION_LOOP_BODY_CASE29_SECTION_AFTER:%.*]] +// CHECK3: omp_section_loop.body.case29.section.after: +// CHECK3-NEXT: br label [[OMP_SECTION_LOOP_BODY19_SECTIONS_AFTER:.*]] +// CHECK3: omp_section_loop.body19.sections.after: +// CHECK3-NEXT: br label [[OMP_SECTION_LOOP_INC20:.*]] +// CHECK3: omp_section_loop.inc20: // CHECK3-NEXT: [[OMP_SECTION_LOOP_NEXT22]] = add nuw i32 [[OMP_SECTION_LOOP_IV20]], 1 -// CHECK3-NEXT: br label [[OMP_SECTION_LOOP_HEADER14]] -// CHECK3: omp_section_loop.exit18: +// CHECK3-NEXT: br label [[OMP_SECTION_LOOP_HEADER17]] +// CHECK3: omp_section_loop.exit21: // CHECK3-NEXT: call void @__kmpc_for_static_fini(ptr @[[GLOB1]], i32 [[OMP_GLOBAL_THREAD_NUM32]]) // CHECK3-NEXT: [[OMP_GLOBAL_THREAD_NUM33:%.*]] = call i32 @__kmpc_global_thread_num(ptr @[[GLOB1]]) // CHECK3-NEXT: call void @__kmpc_barrier(ptr @[[GLOB2]], i32 [[OMP_GLOBAL_THREAD_NUM33]]) -// CHECK3-NEXT: br label [[OMP_SECTION_LOOP_AFTER19:%.*]] -// CHECK3: omp_section_loop.after19: -// CHECK3-NEXT: br label [[OMP_SECTION_LOOP_AFTER19SECTIONS_FINI:%.*]] -// CHECK3: omp_section_loop.after19sections.fini: +// CHECK3-NEXT: br label [[OMP_SECTION_LOOP_AFTER22:%.*]] +// CHECK3: omp_section_loop.after22: // CHECK3-NEXT: [[TMP20:%.*]] = load i32, ptr [[ARGC_ADDR]], align 4 // CHECK3-NEXT: store i32 [[TMP20]], ptr [[DOTCAPTURE_EXPR_]], align 4 // CHECK3-NEXT: [[TMP21:%.*]] = load i32, ptr 
[[DOTCAPTURE_EXPR_]], align 4 @@ -891,11 +887,11 @@ for (int i = 0; i < argc; ++i) { // CHECK3: .cancel.exit: // CHECK3-NEXT: br label [[CANCEL_EXIT:%.*]] // CHECK3: omp_section_loop.body.case.cncl: -// CHECK3-NEXT: br label [[OMP_SECTION_LOOP_EXIT]] -// CHECK3: omp_section_loop.body.case23.cncl: -// CHECK3-NEXT: br label [[OMP_SECTION_LOOP_EXIT18]] -// CHECK3: omp_section_loop.body.case25.cncl: -// CHECK3-NEXT: br label [[OMP_SECTION_LOOP_EXIT18]] +// CHECK3-NEXT: br label [[OMP_SECTION_LOOP_EXIT:.*]] +// CHECK3: omp_section_loop.body.case26.cncl: +// CHECK3-NEXT: br label [[OMP_SECTION_LOOP_EXIT18:.*]] +// CHECK3: omp_section_loop.body.case29.cncl: +// CHECK3-NEXT: br label [[OMP_SECTION_LOOP_EXIT21:.*]] // CHECK3: .cancel.continue: // CHECK3-NEXT: br label [[OMP_IF_END:%.*]] // CHECK3: omp_if.else: @@ -954,8 +950,17 @@ for (int i = 0; i < argc; ++i) { // CHECK3-NEXT: [[TOBOOL:%.*]] = fcmp une float [[TMP2]], 0.000000e+00 // CHECK3-NEXT: br i1 [[TOBOOL]], label [[TMP14:%.*]], label [[TMP3:%.*]] // CHECK3: 3: -// CHECK3-NEXT: br label [[TMP4:%.*]] -// CHECK3: 4: +// CHECK3-NEXT: %[[GTN:.*]] = call i32 @__kmpc_global_thread_num(ptr @1) +// CHECK3-NEXT: %[[CANCEL_POINT:.*]] = call i32 @__kmpc_cancellationpoint(ptr @1, i32 %[[GTN]], i32 1) +// CHECK3-NEXT: %[[COND:.*]] = icmp eq i32 %[[CANCEL_POINT]], 0 +// CHECK3-NEXT: br i1 %[[COND]], label %[[SPLIT:.*]], label %[[CNCL:.*]] +// CHECK3: .cncl: +// CHECK3-NEXT: br label %[[FINI:.*]] +// CHECK3: .fini: +// CHECK3-NEXT: br label %[[EXIT_STUB:omp.par.exit.exitStub]] +// CHECK3: .split: +// CHECK3-NEXT: br label [[TMP6:%.*]] +// CHECK3: 6: // CHECK3-NEXT: [[TMP5:%.*]] = load i32, ptr [[LOADGEP_ARGC_ADDR]], align 4 // CHECK3-NEXT: [[CONV:%.*]] = trunc i32 [[TMP5]] to i8 // CHECK3-NEXT: [[TMP6:%.*]] = load ptr, ptr [[LOADGEP_ARGV_ADDR]], align 8 @@ -967,8 +972,8 @@ for (int i = 0; i < argc; ++i) { // CHECK3-NEXT: [[TMP8:%.*]] = call i32 @__kmpc_cancel_barrier(ptr @[[GLOB3:[0-9]+]], i32 [[OMP_GLOBAL_THREAD_NUM4]]) // CHECK3-NEXT: [[TMP9:%.*]] = icmp eq i32 [[TMP8]], 0 // CHECK3-NEXT: br i1 [[TMP9]], label [[DOTCONT:%.*]], label [[DOTCNCL5:%.*]] -// CHECK3: .cncl5: -// CHECK3-NEXT: br label [[OMP_PAR_OUTLINED_EXIT_EXITSTUB:%.*]] +// CHECK3: .cncl7: +// CHECK3-NEXT: br label %[[FINI]] // CHECK3: .cont: // CHECK3-NEXT: [[TMP10:%.*]] = load i32, ptr [[LOADGEP_ARGC_ADDR]], align 4 // CHECK3-NEXT: [[TMP11:%.*]] = load ptr, ptr [[LOADGEP_ARGV_ADDR]], align 8 @@ -984,18 +989,16 @@ for (int i = 0; i < argc; ++i) { // CHECK3: omp.par.region.parallel.after: // CHECK3-NEXT: br label [[OMP_PAR_PRE_FINALIZE:%.*]] // CHECK3: omp.par.pre_finalize: -// CHECK3-NEXT: br label [[OMP_PAR_OUTLINED_EXIT_EXITSTUB]] -// CHECK3: 14: +// CHECK3-NEXT: br label %[[FINI]] +// CHECK3: 16: // CHECK3-NEXT: [[OMP_GLOBAL_THREAD_NUM1:%.*]] = call i32 @__kmpc_global_thread_num(ptr @[[GLOB1]]) // CHECK3-NEXT: [[TMP15:%.*]] = call i32 @__kmpc_cancel(ptr @[[GLOB1]], i32 [[OMP_GLOBAL_THREAD_NUM1]], i32 1) // CHECK3-NEXT: [[TMP16:%.*]] = icmp eq i32 [[TMP15]], 0 // CHECK3-NEXT: br i1 [[TMP16]], label [[DOTSPLIT:%.*]], label [[DOTCNCL:%.*]] -// CHECK3: .cncl: -// CHECK3-NEXT: [[OMP_GLOBAL_THREAD_NUM2:%.*]] = call i32 @__kmpc_global_thread_num(ptr @[[GLOB1]]) -// CHECK3-NEXT: [[TMP17:%.*]] = call i32 @__kmpc_cancel_barrier(ptr @[[GLOB2]], i32 [[OMP_GLOBAL_THREAD_NUM2]]) -// CHECK3-NEXT: br label [[OMP_PAR_OUTLINED_EXIT_EXITSTUB]] -// CHECK3: .split: -// CHECK3-NEXT: br label [[TMP4]] +// CHECK3: .cncl4: +// CHECK3-NEXT: br label %[[FINI]] +// CHECK3: .split3: +// CHECK3-NEXT: br label {{.+}} 
// CHECK3: omp.par.exit.exitStub: // CHECK3-NEXT: ret void // @@ -1089,7 +1092,7 @@ for (int i = 0; i < argc; ++i) { // CHECK3: .omp.sections.case.split: // CHECK3-NEXT: br label [[DOTOMP_SECTIONS_EXIT]] // CHECK3: .omp.sections.case.cncl: -// CHECK3-NEXT: br label [[CANCEL_CONT:%.*]] +// CHECK3-NEXT: br label [[FINI:%.*]] // CHECK3: .omp.sections.exit: // CHECK3-NEXT: br label [[OMP_INNER_FOR_INC:%.*]] // CHECK3: omp.inner.for.inc: @@ -1100,7 +1103,7 @@ for (int i = 0; i < argc; ++i) { // CHECK3: omp.inner.for.end: // CHECK3-NEXT: [[OMP_GLOBAL_THREAD_NUM3:%.*]] = call i32 @__kmpc_global_thread_num(ptr @[[GLOB19:[0-9]+]]) // CHECK3-NEXT: call void @__kmpc_for_static_fini(ptr @[[GLOB15]], i32 [[OMP_GLOBAL_THREAD_NUM3]]) -// CHECK3-NEXT: br label [[CANCEL_CONT]] +// CHECK3-NEXT: br label [[CANCEL_CONT:.*]] // CHECK3: cancel.cont: // CHECK3-NEXT: ret void // CHECK3: cancel.exit: @@ -1153,6 +1156,8 @@ for (int i = 0; i < argc; ++i) { // CHECK3: .omp.sections.case.split: // CHECK3-NEXT: br label [[DOTOMP_SECTIONS_EXIT]] // CHECK3: .omp.sections.case.cncl: +// CHECK3-NEXT: br label [[DOTFINI:.%*]] +// CHECK3: .fini: // CHECK3-NEXT: br label [[CANCEL_CONT:%.*]] // CHECK3: .omp.sections.case2: // CHECK3-NEXT: [[OMP_GLOBAL_THREAD_NUM3:%.*]] = call i32 @__kmpc_global_thread_num(ptr @[[GLOB1]]) @@ -1162,9 +1167,11 @@ for (int i = 0; i < argc; ++i) { // CHECK3: .omp.sections.case2.split: // CHECK3-NEXT: br label [[DOTOMP_SECTIONS_CASE2_SECTION_AFTER:%.*]] // CHECK3: .omp.sections.case2.section.after: -// CHECK3-NEXT: br label [[DOTOMP_SECTIONS_EXIT]] +// CHECK3-NEXT: br label [[OMP_REGION_FINALIZE:.*]] +// CHECK3: omp_region.finalize: +// CHECK3-NEXT: br label [[OMP_SECTIONS_EXIT:.*]] // CHECK3: .omp.sections.case2.cncl: -// CHECK3-NEXT: br label [[OMP_INNER_FOR_END]] +// CHECK3-NEXT: br label [[FINI:.*]] // CHECK3: .omp.sections.exit: // CHECK3-NEXT: br label [[OMP_INNER_FOR_INC:%.*]] // CHECK3: omp.inner.for.inc: diff --git a/clang/test/OpenMP/critical_codegen.cpp b/clang/test/OpenMP/critical_codegen.cpp index 5c752d354804b..9620613dfdb87 100644 --- a/clang/test/OpenMP/critical_codegen.cpp +++ b/clang/test/OpenMP/critical_codegen.cpp @@ -35,6 +35,8 @@ int main() { // ALL-NEXT: store i8 2, ptr [[A_ADDR]] // IRBUILDER-NEXT: br label %[[AFTER:[^ ,]+]] // IRBUILDER: [[AFTER]] +// IRBUILDER-NEXT: br label %[[OMP_REGION_FINALIZE:[^ ,]+]] +// IRBUILDER: [[OMP_REGION_FINALIZE]] // ALL-NEXT: call {{.*}}void @__kmpc_end_critical(ptr [[DEFAULT_LOC]], i32 [[GTID]], ptr [[UNNAMED_LOCK]]) #pragma omp critical a = 2; diff --git a/clang/test/OpenMP/critical_codegen_attr.cpp b/clang/test/OpenMP/critical_codegen_attr.cpp index 32482a92e76b8..50b0b04fcfd4a 100644 --- a/clang/test/OpenMP/critical_codegen_attr.cpp +++ b/clang/test/OpenMP/critical_codegen_attr.cpp @@ -35,6 +35,8 @@ int main() { // ALL-NEXT: store i8 2, ptr [[A_ADDR]] // IRBUILDER-NEXT: br label %[[AFTER:[^ ,]+]] // IRBUILDER: [[AFTER]] +// IRBUILDER-NEXT: br label %[[OMP_REGION_FINALIZE:[^ ,]+]] +// IRBUILDER: [[OMP_REGION_FINALIZE]] // ALL-NEXT: call {{.*}}void @__kmpc_end_critical(ptr [[DEFAULT_LOC]], i32 [[GTID]], ptr [[UNNAMED_LOCK]]) [[omp::directive(critical)]] a = 2; diff --git a/clang/test/OpenMP/irbuilder_nested_parallel_for.c b/clang/test/OpenMP/irbuilder_nested_parallel_for.c index 607d5a60b5cfb..f946c09726853 100644 --- a/clang/test/OpenMP/irbuilder_nested_parallel_for.c +++ b/clang/test/OpenMP/irbuilder_nested_parallel_for.c @@ -137,6 +137,8 @@ void parallel_for_2(float *r, int a, double b) { // CHECK: omp.par.region.parallel.after: // 
CHECK-NEXT: br label [[OMP_PAR_PRE_FINALIZE:%.*]] // CHECK: omp.par.pre_finalize: +// CHECK-NEXT: br label [[DOTFINI:%.*]] +// CHECK: .fini: // CHECK-NEXT: br label [[OMP_PAR_EXIT_EXITSTUB:%.*]] // CHECK: omp_loop.body: // CHECK-NEXT: [[TMP8:%.*]] = add i32 [[OMP_LOOP_IV]], [[TMP5]] @@ -161,7 +163,7 @@ void parallel_for_2(float *r, int a, double b) { // CHECK-NEXT: store ptr [[__CONTEXT]], ptr [[__CONTEXT_ADDR]], align 8 // CHECK-NEXT: [[TMP0:%.*]] = load ptr, ptr [[__CONTEXT_ADDR]], align 8 // CHECK-NEXT: [[TMP1:%.*]] = getelementptr inbounds nuw [[STRUCT_ANON:%.*]], ptr [[TMP0]], i32 0, i32 0 -// CHECK-NEXT: [[TMP2:%.*]] = load ptr, ptr [[TMP1]], align 8 +// CHECK-NEXT: [[TMP2:%.*]] = load ptr, ptr [[TMP1]], align 8, !nonnull [[META3:![0-9]+]], !align [[META4:![0-9]+]] // CHECK-NEXT: [[TMP3:%.*]] = load i32, ptr [[TMP2]], align 4 // CHECK-NEXT: store i32 [[TMP3]], ptr [[DOTSTART]], align 4 // CHECK-NEXT: store i32 100, ptr [[DOTSTOP]], align 4 @@ -184,7 +186,7 @@ void parallel_for_2(float *r, int a, double b) { // CHECK-NEXT: br label [[COND_END]] // CHECK: cond.end: // CHECK-NEXT: [[COND:%.*]] = phi i32 [ [[DIV]], [[COND_TRUE]] ], [ 0, [[COND_FALSE]] ] -// CHECK-NEXT: [[TMP10:%.*]] = load ptr, ptr [[DISTANCE_ADDR]], align 8 +// CHECK-NEXT: [[TMP10:%.*]] = load ptr, ptr [[DISTANCE_ADDR]], align 8, !nonnull [[META3]], !align [[META4]] // CHECK-NEXT: store i32 [[COND]], ptr [[TMP10]], align 4 // CHECK-NEXT: ret void // @@ -204,7 +206,7 @@ void parallel_for_2(float *r, int a, double b) { // CHECK-NEXT: [[TMP3:%.*]] = load i32, ptr [[LOGICAL_ADDR]], align 4 // CHECK-NEXT: [[MUL:%.*]] = mul i32 1, [[TMP3]] // CHECK-NEXT: [[ADD:%.*]] = add i32 [[TMP2]], [[MUL]] -// CHECK-NEXT: [[TMP4:%.*]] = load ptr, ptr [[LOOPVAR_ADDR]], align 8 +// CHECK-NEXT: [[TMP4:%.*]] = load ptr, ptr [[LOOPVAR_ADDR]], align 8, !nonnull [[META3]], !align [[META4]] // CHECK-NEXT: store i32 [[ADD]], ptr [[TMP4]], align 4 // CHECK-NEXT: ret void // @@ -238,11 +240,11 @@ void parallel_for_2(float *r, int a, double b) { // CHECK-SAME: (ptr noalias [[TID_ADDR:%.*]], ptr noalias [[ZERO_ADDR:%.*]], ptr [[TMP0:%.*]]) #[[ATTR1]] { // CHECK-NEXT: omp.par.entry: // CHECK-NEXT: [[GEP_A_ADDR:%.*]] = getelementptr { ptr, ptr, ptr }, ptr [[TMP0]], i32 0, i32 0 -// CHECK-NEXT: [[LOADGEP_A_ADDR:%.*]] = load ptr, ptr [[GEP_A_ADDR]], align 8, !align [[META5:![0-9]+]] +// CHECK-NEXT: [[LOADGEP_A_ADDR:%.*]] = load ptr, ptr [[GEP_A_ADDR]], align 8, !align [[META4]] // CHECK-NEXT: [[GEP_B_ADDR:%.*]] = getelementptr { ptr, ptr, ptr }, ptr [[TMP0]], i32 0, i32 1 -// CHECK-NEXT: [[LOADGEP_B_ADDR:%.*]] = load ptr, ptr [[GEP_B_ADDR]], align 8, !align [[META6:![0-9]+]] +// CHECK-NEXT: [[LOADGEP_B_ADDR:%.*]] = load ptr, ptr [[GEP_B_ADDR]], align 8, !align [[META7:![0-9]+]] // CHECK-NEXT: [[GEP_R_ADDR:%.*]] = getelementptr { ptr, ptr, ptr }, ptr [[TMP0]], i32 0, i32 2 -// CHECK-NEXT: [[LOADGEP_R_ADDR:%.*]] = load ptr, ptr [[GEP_R_ADDR]], align 8, !align [[META6]] +// CHECK-NEXT: [[LOADGEP_R_ADDR:%.*]] = load ptr, ptr [[GEP_R_ADDR]], align 8, !align [[META7]] // CHECK-NEXT: [[STRUCTARG:%.*]] = alloca { ptr, ptr, ptr }, align 8 // CHECK-NEXT: [[TID_ADDR_LOCAL:%.*]] = alloca i32, align 4 // CHECK-NEXT: [[TMP1:%.*]] = load i32, ptr [[TID_ADDR]], align 4 @@ -266,6 +268,8 @@ void parallel_for_2(float *r, int a, double b) { // CHECK: omp.par.region.parallel.after: // CHECK-NEXT: br label [[OMP_PAR_PRE_FINALIZE:%.*]] // CHECK: omp.par.pre_finalize: +// CHECK-NEXT: br label [[DOTFINI16:%.*]] +// CHECK: .fini16: // CHECK-NEXT: br label 
[[OMP_PAR_EXIT_EXITSTUB:%.*]] // CHECK: omp.par.exit.exitStub: // CHECK-NEXT: ret void @@ -275,11 +279,11 @@ void parallel_for_2(float *r, int a, double b) { // CHECK-SAME: (ptr noalias [[TID_ADDR2:%.*]], ptr noalias [[ZERO_ADDR3:%.*]], ptr [[TMP0:%.*]]) #[[ATTR1]] { // CHECK-NEXT: omp.par.entry4: // CHECK-NEXT: [[GEP_A_ADDR:%.*]] = getelementptr { ptr, ptr, ptr }, ptr [[TMP0]], i32 0, i32 0 -// CHECK-NEXT: [[LOADGEP_A_ADDR:%.*]] = load ptr, ptr [[GEP_A_ADDR]], align 8, !align [[META5]] +// CHECK-NEXT: [[LOADGEP_A_ADDR:%.*]] = load ptr, ptr [[GEP_A_ADDR]], align 8, !align [[META4]] // CHECK-NEXT: [[GEP_B_ADDR:%.*]] = getelementptr { ptr, ptr, ptr }, ptr [[TMP0]], i32 0, i32 1 -// CHECK-NEXT: [[LOADGEP_B_ADDR:%.*]] = load ptr, ptr [[GEP_B_ADDR]], align 8, !align [[META6]] +// CHECK-NEXT: [[LOADGEP_B_ADDR:%.*]] = load ptr, ptr [[GEP_B_ADDR]], align 8, !align [[META7]] // CHECK-NEXT: [[GEP_R_ADDR:%.*]] = getelementptr { ptr, ptr, ptr }, ptr [[TMP0]], i32 0, i32 2 -// CHECK-NEXT: [[LOADGEP_R_ADDR:%.*]] = load ptr, ptr [[GEP_R_ADDR]], align 8, !align [[META6]] +// CHECK-NEXT: [[LOADGEP_R_ADDR:%.*]] = load ptr, ptr [[GEP_R_ADDR]], align 8, !align [[META7]] // CHECK-NEXT: [[P_LASTITER:%.*]] = alloca i32, align 4 // CHECK-NEXT: [[P_LOWERBOUND:%.*]] = alloca i32, align 4 // CHECK-NEXT: [[P_UPPERBOUND:%.*]] = alloca i32, align 4 @@ -331,6 +335,8 @@ void parallel_for_2(float *r, int a, double b) { // CHECK: omp.par.region5.parallel.after: // CHECK-NEXT: br label [[OMP_PAR_PRE_FINALIZE6:%.*]] // CHECK: omp.par.pre_finalize6: +// CHECK-NEXT: br label [[DOTFINI:%.*]] +// CHECK: .fini: // CHECK-NEXT: br label [[OMP_PAR_EXIT7_EXITSTUB:%.*]] // CHECK: omp_loop.body: // CHECK-NEXT: [[TMP9:%.*]] = add i32 [[OMP_LOOP_IV]], [[TMP6]] @@ -362,7 +368,7 @@ void parallel_for_2(float *r, int a, double b) { // CHECK-NEXT: store ptr [[__CONTEXT]], ptr [[__CONTEXT_ADDR]], align 8 // CHECK-NEXT: [[TMP0:%.*]] = load ptr, ptr [[__CONTEXT_ADDR]], align 8 // CHECK-NEXT: [[TMP1:%.*]] = getelementptr inbounds nuw [[STRUCT_ANON_1:%.*]], ptr [[TMP0]], i32 0, i32 0 -// CHECK-NEXT: [[TMP2:%.*]] = load ptr, ptr [[TMP1]], align 8 +// CHECK-NEXT: [[TMP2:%.*]] = load ptr, ptr [[TMP1]], align 8, !nonnull [[META3]], !align [[META4]] // CHECK-NEXT: [[TMP3:%.*]] = load i32, ptr [[TMP2]], align 4 // CHECK-NEXT: store i32 [[TMP3]], ptr [[DOTSTART]], align 4 // CHECK-NEXT: store i32 100, ptr [[DOTSTOP]], align 4 @@ -385,7 +391,7 @@ void parallel_for_2(float *r, int a, double b) { // CHECK-NEXT: br label [[COND_END]] // CHECK: cond.end: // CHECK-NEXT: [[COND:%.*]] = phi i32 [ [[DIV]], [[COND_TRUE]] ], [ 0, [[COND_FALSE]] ] -// CHECK-NEXT: [[TMP10:%.*]] = load ptr, ptr [[DISTANCE_ADDR]], align 8 +// CHECK-NEXT: [[TMP10:%.*]] = load ptr, ptr [[DISTANCE_ADDR]], align 8, !nonnull [[META3]], !align [[META4]] // CHECK-NEXT: store i32 [[COND]], ptr [[TMP10]], align 4 // CHECK-NEXT: ret void // @@ -405,7 +411,7 @@ void parallel_for_2(float *r, int a, double b) { // CHECK-NEXT: [[TMP3:%.*]] = load i32, ptr [[LOGICAL_ADDR]], align 4 // CHECK-NEXT: [[MUL:%.*]] = mul i32 1, [[TMP3]] // CHECK-NEXT: [[ADD:%.*]] = add i32 [[TMP2]], [[MUL]] -// CHECK-NEXT: [[TMP4:%.*]] = load ptr, ptr [[LOOPVAR_ADDR]], align 8 +// CHECK-NEXT: [[TMP4:%.*]] = load ptr, ptr [[LOOPVAR_ADDR]], align 8, !nonnull [[META3]], !align [[META4]] // CHECK-NEXT: store i32 [[ADD]], ptr [[TMP4]], align 4 // CHECK-NEXT: ret void // @@ -417,14 +423,14 @@ void parallel_for_2(float *r, int a, double b) { // CHECK-NEXT: [[R_ADDR:%.*]] = alloca ptr, align 8 // CHECK-NEXT: [[A_ADDR:%.*]] = alloca 
i32, align 4 // CHECK-NEXT: [[B_ADDR:%.*]] = alloca double, align 8 -// CHECK-NEXT: [[I188:%.*]] = alloca i32, align 4 -// CHECK-NEXT: [[AGG_CAPTURED189:%.*]] = alloca [[STRUCT_ANON_17:%.*]], align 8 -// CHECK-NEXT: [[AGG_CAPTURED190:%.*]] = alloca [[STRUCT_ANON_18:%.*]], align 4 -// CHECK-NEXT: [[DOTCOUNT_ADDR191:%.*]] = alloca i32, align 4 -// CHECK-NEXT: [[P_LASTITER206:%.*]] = alloca i32, align 4 -// CHECK-NEXT: [[P_LOWERBOUND207:%.*]] = alloca i32, align 4 -// CHECK-NEXT: [[P_UPPERBOUND208:%.*]] = alloca i32, align 4 -// CHECK-NEXT: [[P_STRIDE209:%.*]] = alloca i32, align 4 +// CHECK-NEXT: [[I191:%.*]] = alloca i32, align 4 +// CHECK-NEXT: [[AGG_CAPTURED192:%.*]] = alloca [[STRUCT_ANON_17:%.*]], align 8 +// CHECK-NEXT: [[AGG_CAPTURED193:%.*]] = alloca [[STRUCT_ANON_18:%.*]], align 4 +// CHECK-NEXT: [[DOTCOUNT_ADDR194:%.*]] = alloca i32, align 4 +// CHECK-NEXT: [[P_LASTITER209:%.*]] = alloca i32, align 4 +// CHECK-NEXT: [[P_LOWERBOUND210:%.*]] = alloca i32, align 4 +// CHECK-NEXT: [[P_UPPERBOUND211:%.*]] = alloca i32, align 4 +// CHECK-NEXT: [[P_STRIDE212:%.*]] = alloca i32, align 4 // CHECK-NEXT: store ptr [[R]], ptr [[R_ADDR]], align 8 // CHECK-NEXT: store i32 [[A]], ptr [[A_ADDR]], align 4 // CHECK-NEXT: store double [[B]], ptr [[B_ADDR]], align 8 @@ -440,53 +446,53 @@ void parallel_for_2(float *r, int a, double b) { // CHECK-NEXT: call void (ptr, i32, ptr, ...) @__kmpc_fork_call(ptr @[[GLOB1]], i32 1, ptr @_Z14parallel_for_2Pfid..omp_par.23, ptr [[STRUCTARG]]) // CHECK-NEXT: br label [[OMP_PAR_EXIT:%.*]] // CHECK: omp.par.exit: -// CHECK-NEXT: store i32 0, ptr [[I188]], align 4 -// CHECK-NEXT: [[TMP0:%.*]] = getelementptr inbounds nuw [[STRUCT_ANON_17]], ptr [[AGG_CAPTURED189]], i32 0, i32 0 -// CHECK-NEXT: store ptr [[I188]], ptr [[TMP0]], align 8 -// CHECK-NEXT: [[TMP1:%.*]] = getelementptr inbounds nuw [[STRUCT_ANON_18]], ptr [[AGG_CAPTURED190]], i32 0, i32 0 -// CHECK-NEXT: [[TMP2:%.*]] = load i32, ptr [[I188]], align 4 +// CHECK-NEXT: store i32 0, ptr [[I191]], align 4 +// CHECK-NEXT: [[TMP0:%.*]] = getelementptr inbounds nuw [[STRUCT_ANON_17]], ptr [[AGG_CAPTURED192]], i32 0, i32 0 +// CHECK-NEXT: store ptr [[I191]], ptr [[TMP0]], align 8 +// CHECK-NEXT: [[TMP1:%.*]] = getelementptr inbounds nuw [[STRUCT_ANON_18]], ptr [[AGG_CAPTURED193]], i32 0, i32 0 +// CHECK-NEXT: [[TMP2:%.*]] = load i32, ptr [[I191]], align 4 // CHECK-NEXT: store i32 [[TMP2]], ptr [[TMP1]], align 4 -// CHECK-NEXT: call void @__captured_stmt.19(ptr [[DOTCOUNT_ADDR191]], ptr [[AGG_CAPTURED189]]) -// CHECK-NEXT: [[DOTCOUNT192:%.*]] = load i32, ptr [[DOTCOUNT_ADDR191]], align 4 -// CHECK-NEXT: br label [[OMP_LOOP_PREHEADER193:%.*]] -// CHECK: omp_loop.preheader193: -// CHECK-NEXT: store i32 0, ptr [[P_LOWERBOUND207]], align 4 -// CHECK-NEXT: [[TMP3:%.*]] = sub i32 [[DOTCOUNT192]], 1 -// CHECK-NEXT: store i32 [[TMP3]], ptr [[P_UPPERBOUND208]], align 4 -// CHECK-NEXT: store i32 1, ptr [[P_STRIDE209]], align 4 -// CHECK-NEXT: [[OMP_GLOBAL_THREAD_NUM210:%.*]] = call i32 @__kmpc_global_thread_num(ptr @[[GLOB1]]) -// CHECK-NEXT: call void @__kmpc_for_static_init_4u(ptr @[[GLOB1]], i32 [[OMP_GLOBAL_THREAD_NUM210]], i32 34, ptr [[P_LASTITER206]], ptr [[P_LOWERBOUND207]], ptr [[P_UPPERBOUND208]], ptr [[P_STRIDE209]], i32 1, i32 0) -// CHECK-NEXT: [[TMP4:%.*]] = load i32, ptr [[P_LOWERBOUND207]], align 4 -// CHECK-NEXT: [[TMP5:%.*]] = load i32, ptr [[P_UPPERBOUND208]], align 4 -// CHECK-NEXT: [[TRIP_COUNT_MINUS1211:%.*]] = sub i32 [[TMP5]], [[TMP4]] -// CHECK-NEXT: [[TMP6:%.*]] = add i32 [[TRIP_COUNT_MINUS1211]], 1 
-// CHECK-NEXT: br label [[OMP_LOOP_HEADER194:%.*]] -// CHECK: omp_loop.header194: -// CHECK-NEXT: [[OMP_LOOP_IV200:%.*]] = phi i32 [ 0, [[OMP_LOOP_PREHEADER193]] ], [ [[OMP_LOOP_NEXT202:%.*]], [[OMP_LOOP_INC197:%.*]] ] -// CHECK-NEXT: br label [[OMP_LOOP_COND195:%.*]] -// CHECK: omp_loop.cond195: -// CHECK-NEXT: [[OMP_LOOP_CMP201:%.*]] = icmp ult i32 [[OMP_LOOP_IV200]], [[TMP6]] -// CHECK-NEXT: br i1 [[OMP_LOOP_CMP201]], label [[OMP_LOOP_BODY196:%.*]], label [[OMP_LOOP_EXIT198:%.*]] -// CHECK: omp_loop.body196: -// CHECK-NEXT: [[TMP7:%.*]] = add i32 [[OMP_LOOP_IV200]], [[TMP4]] -// CHECK-NEXT: call void @__captured_stmt.20(ptr [[I188]], i32 [[TMP7]], ptr [[AGG_CAPTURED190]]) +// CHECK-NEXT: call void @__captured_stmt.19(ptr [[DOTCOUNT_ADDR194]], ptr [[AGG_CAPTURED192]]) +// CHECK-NEXT: [[DOTCOUNT195:%.*]] = load i32, ptr [[DOTCOUNT_ADDR194]], align 4 +// CHECK-NEXT: br label [[OMP_LOOP_PREHEADER196:%.*]] +// CHECK: omp_loop.preheader196: +// CHECK-NEXT: store i32 0, ptr [[P_LOWERBOUND210]], align 4 +// CHECK-NEXT: [[TMP3:%.*]] = sub i32 [[DOTCOUNT195]], 1 +// CHECK-NEXT: store i32 [[TMP3]], ptr [[P_UPPERBOUND211]], align 4 +// CHECK-NEXT: store i32 1, ptr [[P_STRIDE212]], align 4 +// CHECK-NEXT: [[OMP_GLOBAL_THREAD_NUM213:%.*]] = call i32 @__kmpc_global_thread_num(ptr @[[GLOB1]]) +// CHECK-NEXT: call void @__kmpc_for_static_init_4u(ptr @[[GLOB1]], i32 [[OMP_GLOBAL_THREAD_NUM213]], i32 34, ptr [[P_LASTITER209]], ptr [[P_LOWERBOUND210]], ptr [[P_UPPERBOUND211]], ptr [[P_STRIDE212]], i32 1, i32 0) +// CHECK-NEXT: [[TMP4:%.*]] = load i32, ptr [[P_LOWERBOUND210]], align 4 +// CHECK-NEXT: [[TMP5:%.*]] = load i32, ptr [[P_UPPERBOUND211]], align 4 +// CHECK-NEXT: [[TRIP_COUNT_MINUS1214:%.*]] = sub i32 [[TMP5]], [[TMP4]] +// CHECK-NEXT: [[TMP6:%.*]] = add i32 [[TRIP_COUNT_MINUS1214]], 1 +// CHECK-NEXT: br label [[OMP_LOOP_HEADER197:%.*]] +// CHECK: omp_loop.header197: +// CHECK-NEXT: [[OMP_LOOP_IV203:%.*]] = phi i32 [ 0, [[OMP_LOOP_PREHEADER196]] ], [ [[OMP_LOOP_NEXT205:%.*]], [[OMP_LOOP_INC200:%.*]] ] +// CHECK-NEXT: br label [[OMP_LOOP_COND198:%.*]] +// CHECK: omp_loop.cond198: +// CHECK-NEXT: [[OMP_LOOP_CMP204:%.*]] = icmp ult i32 [[OMP_LOOP_IV203]], [[TMP6]] +// CHECK-NEXT: br i1 [[OMP_LOOP_CMP204]], label [[OMP_LOOP_BODY199:%.*]], label [[OMP_LOOP_EXIT201:%.*]] +// CHECK: omp_loop.body199: +// CHECK-NEXT: [[TMP7:%.*]] = add i32 [[OMP_LOOP_IV203]], [[TMP4]] +// CHECK-NEXT: call void @__captured_stmt.20(ptr [[I191]], i32 [[TMP7]], ptr [[AGG_CAPTURED193]]) // CHECK-NEXT: [[TMP8:%.*]] = load i32, ptr [[A_ADDR]], align 4 -// CHECK-NEXT: [[CONV203:%.*]] = sitofp i32 [[TMP8]] to double +// CHECK-NEXT: [[CONV206:%.*]] = sitofp i32 [[TMP8]] to double // CHECK-NEXT: [[TMP9:%.*]] = load double, ptr [[B_ADDR]], align 8 -// CHECK-NEXT: [[ADD204:%.*]] = fadd double [[CONV203]], [[TMP9]] -// CHECK-NEXT: [[CONV205:%.*]] = fptrunc double [[ADD204]] to float +// CHECK-NEXT: [[ADD207:%.*]] = fadd double [[CONV206]], [[TMP9]] +// CHECK-NEXT: [[CONV208:%.*]] = fptrunc double [[ADD207]] to float // CHECK-NEXT: [[TMP10:%.*]] = load ptr, ptr [[R_ADDR]], align 8 -// CHECK-NEXT: store float [[CONV205]], ptr [[TMP10]], align 4 -// CHECK-NEXT: br label [[OMP_LOOP_INC197]] -// CHECK: omp_loop.inc197: -// CHECK-NEXT: [[OMP_LOOP_NEXT202]] = add nuw i32 [[OMP_LOOP_IV200]], 1 -// CHECK-NEXT: br label [[OMP_LOOP_HEADER194]] -// CHECK: omp_loop.exit198: -// CHECK-NEXT: call void @__kmpc_for_static_fini(ptr @[[GLOB1]], i32 [[OMP_GLOBAL_THREAD_NUM210]]) -// CHECK-NEXT: [[OMP_GLOBAL_THREAD_NUM212:%.*]] = call i32 
@__kmpc_global_thread_num(ptr @[[GLOB1]]) -// CHECK-NEXT: call void @__kmpc_barrier(ptr @[[GLOB2]], i32 [[OMP_GLOBAL_THREAD_NUM212]]) -// CHECK-NEXT: br label [[OMP_LOOP_AFTER199:%.*]] -// CHECK: omp_loop.after199: +// CHECK-NEXT: store float [[CONV208]], ptr [[TMP10]], align 4 +// CHECK-NEXT: br label [[OMP_LOOP_INC200]] +// CHECK: omp_loop.inc200: +// CHECK-NEXT: [[OMP_LOOP_NEXT205]] = add nuw i32 [[OMP_LOOP_IV203]], 1 +// CHECK-NEXT: br label [[OMP_LOOP_HEADER197]] +// CHECK: omp_loop.exit201: +// CHECK-NEXT: call void @__kmpc_for_static_fini(ptr @[[GLOB1]], i32 [[OMP_GLOBAL_THREAD_NUM213]]) +// CHECK-NEXT: [[OMP_GLOBAL_THREAD_NUM215:%.*]] = call i32 @__kmpc_global_thread_num(ptr @[[GLOB1]]) +// CHECK-NEXT: call void @__kmpc_barrier(ptr @[[GLOB2]], i32 [[OMP_GLOBAL_THREAD_NUM215]]) +// CHECK-NEXT: br label [[OMP_LOOP_AFTER202:%.*]] +// CHECK: omp_loop.after202: // CHECK-NEXT: ret void // // @@ -494,16 +500,16 @@ void parallel_for_2(float *r, int a, double b) { // CHECK-SAME: (ptr noalias [[TID_ADDR:%.*]], ptr noalias [[ZERO_ADDR:%.*]], ptr [[TMP0:%.*]]) #[[ATTR1]] { // CHECK-NEXT: omp.par.entry: // CHECK-NEXT: [[GEP_A_ADDR:%.*]] = getelementptr { ptr, ptr, ptr }, ptr [[TMP0]], i32 0, i32 0 -// CHECK-NEXT: [[LOADGEP_A_ADDR:%.*]] = load ptr, ptr [[GEP_A_ADDR]], align 8, !align [[META5]] +// CHECK-NEXT: [[LOADGEP_A_ADDR:%.*]] = load ptr, ptr [[GEP_A_ADDR]], align 8, !align [[META4]] // CHECK-NEXT: [[GEP_B_ADDR:%.*]] = getelementptr { ptr, ptr, ptr }, ptr [[TMP0]], i32 0, i32 1 -// CHECK-NEXT: [[LOADGEP_B_ADDR:%.*]] = load ptr, ptr [[GEP_B_ADDR]], align 8, !align [[META6]] +// CHECK-NEXT: [[LOADGEP_B_ADDR:%.*]] = load ptr, ptr [[GEP_B_ADDR]], align 8, !align [[META7]] // CHECK-NEXT: [[GEP_R_ADDR:%.*]] = getelementptr { ptr, ptr, ptr }, ptr [[TMP0]], i32 0, i32 2 -// CHECK-NEXT: [[LOADGEP_R_ADDR:%.*]] = load ptr, ptr [[GEP_R_ADDR]], align 8, !align [[META6]] +// CHECK-NEXT: [[LOADGEP_R_ADDR:%.*]] = load ptr, ptr [[GEP_R_ADDR]], align 8, !align [[META7]] // CHECK-NEXT: [[STRUCTARG:%.*]] = alloca { ptr, ptr, ptr }, align 8 -// CHECK-NEXT: [[P_LASTITER181:%.*]] = alloca i32, align 4 -// CHECK-NEXT: [[P_LOWERBOUND182:%.*]] = alloca i32, align 4 -// CHECK-NEXT: [[P_UPPERBOUND183:%.*]] = alloca i32, align 4 -// CHECK-NEXT: [[P_STRIDE184:%.*]] = alloca i32, align 4 +// CHECK-NEXT: [[P_LASTITER183:%.*]] = alloca i32, align 4 +// CHECK-NEXT: [[P_LOWERBOUND184:%.*]] = alloca i32, align 4 +// CHECK-NEXT: [[P_UPPERBOUND185:%.*]] = alloca i32, align 4 +// CHECK-NEXT: [[P_STRIDE186:%.*]] = alloca i32, align 4 // CHECK-NEXT: [[P_LASTITER:%.*]] = alloca i32, align 4 // CHECK-NEXT: [[P_LOWERBOUND:%.*]] = alloca i32, align 4 // CHECK-NEXT: [[P_UPPERBOUND:%.*]] = alloca i32, align 4 @@ -516,10 +522,10 @@ void parallel_for_2(float *r, int a, double b) { // CHECK-NEXT: [[AGG_CAPTURED:%.*]] = alloca [[STRUCT_ANON_3:%.*]], align 8 // CHECK-NEXT: [[AGG_CAPTURED1:%.*]] = alloca [[STRUCT_ANON_4:%.*]], align 4 // CHECK-NEXT: [[DOTCOUNT_ADDR:%.*]] = alloca i32, align 4 -// CHECK-NEXT: [[I163:%.*]] = alloca i32, align 4 -// CHECK-NEXT: [[AGG_CAPTURED164:%.*]] = alloca [[STRUCT_ANON_15:%.*]], align 8 -// CHECK-NEXT: [[AGG_CAPTURED165:%.*]] = alloca [[STRUCT_ANON_16:%.*]], align 4 -// CHECK-NEXT: [[DOTCOUNT_ADDR166:%.*]] = alloca i32, align 4 +// CHECK-NEXT: [[I165:%.*]] = alloca i32, align 4 +// CHECK-NEXT: [[AGG_CAPTURED166:%.*]] = alloca [[STRUCT_ANON_15:%.*]], align 8 +// CHECK-NEXT: [[AGG_CAPTURED167:%.*]] = alloca [[STRUCT_ANON_16:%.*]], align 4 +// CHECK-NEXT: [[DOTCOUNT_ADDR168:%.*]] = alloca i32, align 4 // 
CHECK-NEXT: br label [[OMP_PAR_REGION:%.*]] // CHECK: omp.par.region: // CHECK-NEXT: store i32 0, ptr [[I]], align 4 @@ -567,58 +573,60 @@ void parallel_for_2(float *r, int a, double b) { // CHECK-NEXT: call void (ptr, i32, ptr, ...) @__kmpc_fork_call(ptr @[[GLOB1]], i32 1, ptr @_Z14parallel_for_2Pfid..omp_par.22, ptr [[STRUCTARG]]) // CHECK-NEXT: br label [[OMP_PAR_EXIT11:%.*]] // CHECK: omp.par.exit11: -// CHECK-NEXT: store i32 0, ptr [[I163]], align 4 -// CHECK-NEXT: [[TMP9:%.*]] = getelementptr inbounds nuw [[STRUCT_ANON_15]], ptr [[AGG_CAPTURED164]], i32 0, i32 0 -// CHECK-NEXT: store ptr [[I163]], ptr [[TMP9]], align 8 -// CHECK-NEXT: [[TMP10:%.*]] = getelementptr inbounds nuw [[STRUCT_ANON_16]], ptr [[AGG_CAPTURED165]], i32 0, i32 0 -// CHECK-NEXT: [[TMP11:%.*]] = load i32, ptr [[I163]], align 4 +// CHECK-NEXT: store i32 0, ptr [[I165]], align 4 +// CHECK-NEXT: [[TMP9:%.*]] = getelementptr inbounds nuw [[STRUCT_ANON_15]], ptr [[AGG_CAPTURED166]], i32 0, i32 0 +// CHECK-NEXT: store ptr [[I165]], ptr [[TMP9]], align 8 +// CHECK-NEXT: [[TMP10:%.*]] = getelementptr inbounds nuw [[STRUCT_ANON_16]], ptr [[AGG_CAPTURED167]], i32 0, i32 0 +// CHECK-NEXT: [[TMP11:%.*]] = load i32, ptr [[I165]], align 4 // CHECK-NEXT: store i32 [[TMP11]], ptr [[TMP10]], align 4 -// CHECK-NEXT: call void @__captured_stmt.17(ptr [[DOTCOUNT_ADDR166]], ptr [[AGG_CAPTURED164]]) -// CHECK-NEXT: [[DOTCOUNT167:%.*]] = load i32, ptr [[DOTCOUNT_ADDR166]], align 4 -// CHECK-NEXT: br label [[OMP_LOOP_PREHEADER168:%.*]] -// CHECK: omp_loop.preheader168: -// CHECK-NEXT: store i32 0, ptr [[P_LOWERBOUND182]], align 4 -// CHECK-NEXT: [[TMP12:%.*]] = sub i32 [[DOTCOUNT167]], 1 -// CHECK-NEXT: store i32 [[TMP12]], ptr [[P_UPPERBOUND183]], align 4 -// CHECK-NEXT: store i32 1, ptr [[P_STRIDE184]], align 4 -// CHECK-NEXT: [[OMP_GLOBAL_THREAD_NUM185:%.*]] = call i32 @__kmpc_global_thread_num(ptr @[[GLOB1]]) -// CHECK-NEXT: call void @__kmpc_for_static_init_4u(ptr @[[GLOB1]], i32 [[OMP_GLOBAL_THREAD_NUM185]], i32 34, ptr [[P_LASTITER181]], ptr [[P_LOWERBOUND182]], ptr [[P_UPPERBOUND183]], ptr [[P_STRIDE184]], i32 1, i32 0) -// CHECK-NEXT: [[TMP13:%.*]] = load i32, ptr [[P_LOWERBOUND182]], align 4 -// CHECK-NEXT: [[TMP14:%.*]] = load i32, ptr [[P_UPPERBOUND183]], align 4 -// CHECK-NEXT: [[TRIP_COUNT_MINUS1186:%.*]] = sub i32 [[TMP14]], [[TMP13]] -// CHECK-NEXT: [[TMP15:%.*]] = add i32 [[TRIP_COUNT_MINUS1186]], 1 -// CHECK-NEXT: br label [[OMP_LOOP_HEADER169:%.*]] -// CHECK: omp_loop.header169: -// CHECK-NEXT: [[OMP_LOOP_IV175:%.*]] = phi i32 [ 0, [[OMP_LOOP_PREHEADER168]] ], [ [[OMP_LOOP_NEXT177:%.*]], [[OMP_LOOP_INC172:%.*]] ] -// CHECK-NEXT: br label [[OMP_LOOP_COND170:%.*]] -// CHECK: omp_loop.cond170: -// CHECK-NEXT: [[OMP_LOOP_CMP176:%.*]] = icmp ult i32 [[OMP_LOOP_IV175]], [[TMP15]] -// CHECK-NEXT: br i1 [[OMP_LOOP_CMP176]], label [[OMP_LOOP_BODY171:%.*]], label [[OMP_LOOP_EXIT173:%.*]] -// CHECK: omp_loop.exit173: -// CHECK-NEXT: call void @__kmpc_for_static_fini(ptr @[[GLOB1]], i32 [[OMP_GLOBAL_THREAD_NUM185]]) +// CHECK-NEXT: call void @__captured_stmt.17(ptr [[DOTCOUNT_ADDR168]], ptr [[AGG_CAPTURED166]]) +// CHECK-NEXT: [[DOTCOUNT169:%.*]] = load i32, ptr [[DOTCOUNT_ADDR168]], align 4 +// CHECK-NEXT: br label [[OMP_LOOP_PREHEADER170:%.*]] +// CHECK: omp_loop.preheader170: +// CHECK-NEXT: store i32 0, ptr [[P_LOWERBOUND184]], align 4 +// CHECK-NEXT: [[TMP12:%.*]] = sub i32 [[DOTCOUNT169]], 1 +// CHECK-NEXT: store i32 [[TMP12]], ptr [[P_UPPERBOUND185]], align 4 +// CHECK-NEXT: store i32 1, ptr [[P_STRIDE186]], align 4 // 
CHECK-NEXT: [[OMP_GLOBAL_THREAD_NUM187:%.*]] = call i32 @__kmpc_global_thread_num(ptr @[[GLOB1]]) -// CHECK-NEXT: call void @__kmpc_barrier(ptr @[[GLOB2]], i32 [[OMP_GLOBAL_THREAD_NUM187]]) -// CHECK-NEXT: br label [[OMP_LOOP_AFTER174:%.*]] -// CHECK: omp_loop.after174: +// CHECK-NEXT: call void @__kmpc_for_static_init_4u(ptr @[[GLOB1]], i32 [[OMP_GLOBAL_THREAD_NUM187]], i32 34, ptr [[P_LASTITER183]], ptr [[P_LOWERBOUND184]], ptr [[P_UPPERBOUND185]], ptr [[P_STRIDE186]], i32 1, i32 0) +// CHECK-NEXT: [[TMP13:%.*]] = load i32, ptr [[P_LOWERBOUND184]], align 4 +// CHECK-NEXT: [[TMP14:%.*]] = load i32, ptr [[P_UPPERBOUND185]], align 4 +// CHECK-NEXT: [[TRIP_COUNT_MINUS1188:%.*]] = sub i32 [[TMP14]], [[TMP13]] +// CHECK-NEXT: [[TMP15:%.*]] = add i32 [[TRIP_COUNT_MINUS1188]], 1 +// CHECK-NEXT: br label [[OMP_LOOP_HEADER171:%.*]] +// CHECK: omp_loop.header171: +// CHECK-NEXT: [[OMP_LOOP_IV177:%.*]] = phi i32 [ 0, [[OMP_LOOP_PREHEADER170]] ], [ [[OMP_LOOP_NEXT179:%.*]], [[OMP_LOOP_INC174:%.*]] ] +// CHECK-NEXT: br label [[OMP_LOOP_COND172:%.*]] +// CHECK: omp_loop.cond172: +// CHECK-NEXT: [[OMP_LOOP_CMP178:%.*]] = icmp ult i32 [[OMP_LOOP_IV177]], [[TMP15]] +// CHECK-NEXT: br i1 [[OMP_LOOP_CMP178]], label [[OMP_LOOP_BODY173:%.*]], label [[OMP_LOOP_EXIT175:%.*]] +// CHECK: omp_loop.exit175: +// CHECK-NEXT: call void @__kmpc_for_static_fini(ptr @[[GLOB1]], i32 [[OMP_GLOBAL_THREAD_NUM187]]) +// CHECK-NEXT: [[OMP_GLOBAL_THREAD_NUM189:%.*]] = call i32 @__kmpc_global_thread_num(ptr @[[GLOB1]]) +// CHECK-NEXT: call void @__kmpc_barrier(ptr @[[GLOB2]], i32 [[OMP_GLOBAL_THREAD_NUM189]]) +// CHECK-NEXT: br label [[OMP_LOOP_AFTER176:%.*]] +// CHECK: omp_loop.after176: // CHECK-NEXT: br label [[OMP_PAR_REGION_PARALLEL_AFTER:%.*]] // CHECK: omp.par.region.parallel.after: // CHECK-NEXT: br label [[OMP_PAR_PRE_FINALIZE:%.*]] // CHECK: omp.par.pre_finalize: +// CHECK-NEXT: br label [[DOTFINI190:%.*]] +// CHECK: .fini190: // CHECK-NEXT: br label [[OMP_PAR_EXIT_EXITSTUB:%.*]] -// CHECK: omp_loop.body171: -// CHECK-NEXT: [[TMP16:%.*]] = add i32 [[OMP_LOOP_IV175]], [[TMP13]] -// CHECK-NEXT: call void @__captured_stmt.18(ptr [[I163]], i32 [[TMP16]], ptr [[AGG_CAPTURED165]]) +// CHECK: omp_loop.body173: +// CHECK-NEXT: [[TMP16:%.*]] = add i32 [[OMP_LOOP_IV177]], [[TMP13]] +// CHECK-NEXT: call void @__captured_stmt.18(ptr [[I165]], i32 [[TMP16]], ptr [[AGG_CAPTURED167]]) // CHECK-NEXT: [[TMP17:%.*]] = load i32, ptr [[LOADGEP_A_ADDR]], align 4 -// CHECK-NEXT: [[CONV178:%.*]] = sitofp i32 [[TMP17]] to double +// CHECK-NEXT: [[CONV180:%.*]] = sitofp i32 [[TMP17]] to double // CHECK-NEXT: [[TMP18:%.*]] = load double, ptr [[LOADGEP_B_ADDR]], align 8 -// CHECK-NEXT: [[ADD179:%.*]] = fadd double [[CONV178]], [[TMP18]] -// CHECK-NEXT: [[CONV180:%.*]] = fptrunc double [[ADD179]] to float +// CHECK-NEXT: [[ADD181:%.*]] = fadd double [[CONV180]], [[TMP18]] +// CHECK-NEXT: [[CONV182:%.*]] = fptrunc double [[ADD181]] to float // CHECK-NEXT: [[TMP19:%.*]] = load ptr, ptr [[LOADGEP_R_ADDR]], align 8 -// CHECK-NEXT: store float [[CONV180]], ptr [[TMP19]], align 4 -// CHECK-NEXT: br label [[OMP_LOOP_INC172]] -// CHECK: omp_loop.inc172: -// CHECK-NEXT: [[OMP_LOOP_NEXT177]] = add nuw i32 [[OMP_LOOP_IV175]], 1 -// CHECK-NEXT: br label [[OMP_LOOP_HEADER169]] +// CHECK-NEXT: store float [[CONV182]], ptr [[TMP19]], align 4 +// CHECK-NEXT: br label [[OMP_LOOP_INC174]] +// CHECK: omp_loop.inc174: +// CHECK-NEXT: [[OMP_LOOP_NEXT179]] = add nuw i32 [[OMP_LOOP_IV177]], 1 +// CHECK-NEXT: br label [[OMP_LOOP_HEADER171]] // CHECK: omp_loop.body: // 
CHECK-NEXT: [[TMP20:%.*]] = add i32 [[OMP_LOOP_IV]], [[TMP6]] // CHECK-NEXT: call void @__captured_stmt.6(ptr [[I]], i32 [[TMP20]], ptr [[AGG_CAPTURED1]]) @@ -641,17 +649,17 @@ void parallel_for_2(float *r, int a, double b) { // CHECK-SAME: (ptr noalias [[TID_ADDR6:%.*]], ptr noalias [[ZERO_ADDR7:%.*]], ptr [[TMP0:%.*]]) #[[ATTR1]] { // CHECK-NEXT: omp.par.entry8: // CHECK-NEXT: [[GEP_A_ADDR:%.*]] = getelementptr { ptr, ptr, ptr }, ptr [[TMP0]], i32 0, i32 0 -// CHECK-NEXT: [[LOADGEP_A_ADDR:%.*]] = load ptr, ptr [[GEP_A_ADDR]], align 8, !align [[META5]] +// CHECK-NEXT: [[LOADGEP_A_ADDR:%.*]] = load ptr, ptr [[GEP_A_ADDR]], align 8, !align [[META4]] // CHECK-NEXT: [[GEP_B_ADDR:%.*]] = getelementptr { ptr, ptr, ptr }, ptr [[TMP0]], i32 0, i32 1 -// CHECK-NEXT: [[LOADGEP_B_ADDR:%.*]] = load ptr, ptr [[GEP_B_ADDR]], align 8, !align [[META6]] +// CHECK-NEXT: [[LOADGEP_B_ADDR:%.*]] = load ptr, ptr [[GEP_B_ADDR]], align 8, !align [[META7]] // CHECK-NEXT: [[GEP_R_ADDR:%.*]] = getelementptr { ptr, ptr, ptr }, ptr [[TMP0]], i32 0, i32 2 -// CHECK-NEXT: [[LOADGEP_R_ADDR:%.*]] = load ptr, ptr [[GEP_R_ADDR]], align 8, !align [[META6]] -// CHECK-NEXT: [[STRUCTARG213:%.*]] = alloca { ptr, ptr, ptr }, align 8 +// CHECK-NEXT: [[LOADGEP_R_ADDR:%.*]] = load ptr, ptr [[GEP_R_ADDR]], align 8, !align [[META7]] +// CHECK-NEXT: [[STRUCTARG216:%.*]] = alloca { ptr, ptr, ptr }, align 8 // CHECK-NEXT: [[STRUCTARG:%.*]] = alloca { ptr, ptr, ptr }, align 8 -// CHECK-NEXT: [[P_LASTITER156:%.*]] = alloca i32, align 4 -// CHECK-NEXT: [[P_LOWERBOUND157:%.*]] = alloca i32, align 4 -// CHECK-NEXT: [[P_UPPERBOUND158:%.*]] = alloca i32, align 4 -// CHECK-NEXT: [[P_STRIDE159:%.*]] = alloca i32, align 4 +// CHECK-NEXT: [[P_LASTITER157:%.*]] = alloca i32, align 4 +// CHECK-NEXT: [[P_LOWERBOUND158:%.*]] = alloca i32, align 4 +// CHECK-NEXT: [[P_UPPERBOUND159:%.*]] = alloca i32, align 4 +// CHECK-NEXT: [[P_STRIDE160:%.*]] = alloca i32, align 4 // CHECK-NEXT: [[P_LASTITER95:%.*]] = alloca i32, align 4 // CHECK-NEXT: [[P_LOWERBOUND96:%.*]] = alloca i32, align 4 // CHECK-NEXT: [[P_UPPERBOUND97:%.*]] = alloca i32, align 4 @@ -672,10 +680,10 @@ void parallel_for_2(float *r, int a, double b) { // CHECK-NEXT: [[AGG_CAPTURED78:%.*]] = alloca [[STRUCT_ANON_9:%.*]], align 8 // CHECK-NEXT: [[AGG_CAPTURED79:%.*]] = alloca [[STRUCT_ANON_10:%.*]], align 4 // CHECK-NEXT: [[DOTCOUNT_ADDR80:%.*]] = alloca i32, align 4 -// CHECK-NEXT: [[I138:%.*]] = alloca i32, align 4 -// CHECK-NEXT: [[AGG_CAPTURED139:%.*]] = alloca [[STRUCT_ANON_13:%.*]], align 8 -// CHECK-NEXT: [[AGG_CAPTURED140:%.*]] = alloca [[STRUCT_ANON_14:%.*]], align 4 -// CHECK-NEXT: [[DOTCOUNT_ADDR141:%.*]] = alloca i32, align 4 +// CHECK-NEXT: [[I139:%.*]] = alloca i32, align 4 +// CHECK-NEXT: [[AGG_CAPTURED140:%.*]] = alloca [[STRUCT_ANON_13:%.*]], align 8 +// CHECK-NEXT: [[AGG_CAPTURED141:%.*]] = alloca [[STRUCT_ANON_14:%.*]], align 4 +// CHECK-NEXT: [[DOTCOUNT_ADDR142:%.*]] = alloca i32, align 4 // CHECK-NEXT: br label [[OMP_PAR_REGION9:%.*]] // CHECK: omp.par.region9: // CHECK-NEXT: store i32 0, ptr [[I16]], align 4 @@ -757,69 +765,71 @@ void parallel_for_2(float *r, int a, double b) { // CHECK-NEXT: br label [[OMP_LOOP_AFTER88:%.*]] // CHECK: omp_loop.after88: // CHECK-NEXT: [[OMP_GLOBAL_THREAD_NUM102:%.*]] = call i32 @__kmpc_global_thread_num(ptr @[[GLOB1]]) -// CHECK-NEXT: br label [[OMP_PARALLEL217:%.*]] -// CHECK: omp_parallel217: -// CHECK-NEXT: [[GEP_A_ADDR214:%.*]] = getelementptr { ptr, ptr, ptr }, ptr [[STRUCTARG213]], i32 0, i32 0 -// CHECK-NEXT: store ptr 
[[LOADGEP_A_ADDR]], ptr [[GEP_A_ADDR214]], align 8 -// CHECK-NEXT: [[GEP_B_ADDR215:%.*]] = getelementptr { ptr, ptr, ptr }, ptr [[STRUCTARG213]], i32 0, i32 1 -// CHECK-NEXT: store ptr [[LOADGEP_B_ADDR]], ptr [[GEP_B_ADDR215]], align 8 -// CHECK-NEXT: [[GEP_R_ADDR216:%.*]] = getelementptr { ptr, ptr, ptr }, ptr [[STRUCTARG213]], i32 0, i32 2 -// CHECK-NEXT: store ptr [[LOADGEP_R_ADDR]], ptr [[GEP_R_ADDR216]], align 8 -// CHECK-NEXT: call void (ptr, i32, ptr, ...) @__kmpc_fork_call(ptr @[[GLOB1]], i32 1, ptr @_Z14parallel_for_2Pfid..omp_par.21, ptr [[STRUCTARG213]]) +// CHECK-NEXT: br label [[OMP_PARALLEL220:%.*]] +// CHECK: omp_parallel220: +// CHECK-NEXT: [[GEP_A_ADDR217:%.*]] = getelementptr { ptr, ptr, ptr }, ptr [[STRUCTARG216]], i32 0, i32 0 +// CHECK-NEXT: store ptr [[LOADGEP_A_ADDR]], ptr [[GEP_A_ADDR217]], align 8 +// CHECK-NEXT: [[GEP_B_ADDR218:%.*]] = getelementptr { ptr, ptr, ptr }, ptr [[STRUCTARG216]], i32 0, i32 1 +// CHECK-NEXT: store ptr [[LOADGEP_B_ADDR]], ptr [[GEP_B_ADDR218]], align 8 +// CHECK-NEXT: [[GEP_R_ADDR219:%.*]] = getelementptr { ptr, ptr, ptr }, ptr [[STRUCTARG216]], i32 0, i32 2 +// CHECK-NEXT: store ptr [[LOADGEP_R_ADDR]], ptr [[GEP_R_ADDR219]], align 8 +// CHECK-NEXT: call void (ptr, i32, ptr, ...) @__kmpc_fork_call(ptr @[[GLOB1]], i32 1, ptr @_Z14parallel_for_2Pfid..omp_par.21, ptr [[STRUCTARG216]]) // CHECK-NEXT: br label [[OMP_PAR_EXIT108:%.*]] // CHECK: omp.par.exit108: -// CHECK-NEXT: store i32 0, ptr [[I138]], align 4 -// CHECK-NEXT: [[TMP16:%.*]] = getelementptr inbounds nuw [[STRUCT_ANON_13]], ptr [[AGG_CAPTURED139]], i32 0, i32 0 -// CHECK-NEXT: store ptr [[I138]], ptr [[TMP16]], align 8 -// CHECK-NEXT: [[TMP17:%.*]] = getelementptr inbounds nuw [[STRUCT_ANON_14]], ptr [[AGG_CAPTURED140]], i32 0, i32 0 -// CHECK-NEXT: [[TMP18:%.*]] = load i32, ptr [[I138]], align 4 +// CHECK-NEXT: store i32 0, ptr [[I139]], align 4 +// CHECK-NEXT: [[TMP16:%.*]] = getelementptr inbounds nuw [[STRUCT_ANON_13]], ptr [[AGG_CAPTURED140]], i32 0, i32 0 +// CHECK-NEXT: store ptr [[I139]], ptr [[TMP16]], align 8 +// CHECK-NEXT: [[TMP17:%.*]] = getelementptr inbounds nuw [[STRUCT_ANON_14]], ptr [[AGG_CAPTURED141]], i32 0, i32 0 +// CHECK-NEXT: [[TMP18:%.*]] = load i32, ptr [[I139]], align 4 // CHECK-NEXT: store i32 [[TMP18]], ptr [[TMP17]], align 4 -// CHECK-NEXT: call void @__captured_stmt.15(ptr [[DOTCOUNT_ADDR141]], ptr [[AGG_CAPTURED139]]) -// CHECK-NEXT: [[DOTCOUNT142:%.*]] = load i32, ptr [[DOTCOUNT_ADDR141]], align 4 -// CHECK-NEXT: br label [[OMP_LOOP_PREHEADER143:%.*]] -// CHECK: omp_loop.preheader143: -// CHECK-NEXT: store i32 0, ptr [[P_LOWERBOUND157]], align 4 -// CHECK-NEXT: [[TMP19:%.*]] = sub i32 [[DOTCOUNT142]], 1 -// CHECK-NEXT: store i32 [[TMP19]], ptr [[P_UPPERBOUND158]], align 4 -// CHECK-NEXT: store i32 1, ptr [[P_STRIDE159]], align 4 -// CHECK-NEXT: [[OMP_GLOBAL_THREAD_NUM160:%.*]] = call i32 @__kmpc_global_thread_num(ptr @[[GLOB1]]) -// CHECK-NEXT: call void @__kmpc_for_static_init_4u(ptr @[[GLOB1]], i32 [[OMP_GLOBAL_THREAD_NUM160]], i32 34, ptr [[P_LASTITER156]], ptr [[P_LOWERBOUND157]], ptr [[P_UPPERBOUND158]], ptr [[P_STRIDE159]], i32 1, i32 0) -// CHECK-NEXT: [[TMP20:%.*]] = load i32, ptr [[P_LOWERBOUND157]], align 4 -// CHECK-NEXT: [[TMP21:%.*]] = load i32, ptr [[P_UPPERBOUND158]], align 4 -// CHECK-NEXT: [[TRIP_COUNT_MINUS1161:%.*]] = sub i32 [[TMP21]], [[TMP20]] -// CHECK-NEXT: [[TMP22:%.*]] = add i32 [[TRIP_COUNT_MINUS1161]], 1 -// CHECK-NEXT: br label [[OMP_LOOP_HEADER144:%.*]] -// CHECK: omp_loop.header144: -// CHECK-NEXT: 
[[OMP_LOOP_IV150:%.*]] = phi i32 [ 0, [[OMP_LOOP_PREHEADER143]] ], [ [[OMP_LOOP_NEXT152:%.*]], [[OMP_LOOP_INC147:%.*]] ] -// CHECK-NEXT: br label [[OMP_LOOP_COND145:%.*]] -// CHECK: omp_loop.cond145: -// CHECK-NEXT: [[OMP_LOOP_CMP151:%.*]] = icmp ult i32 [[OMP_LOOP_IV150]], [[TMP22]] -// CHECK-NEXT: br i1 [[OMP_LOOP_CMP151]], label [[OMP_LOOP_BODY146:%.*]], label [[OMP_LOOP_EXIT148:%.*]] -// CHECK: omp_loop.exit148: -// CHECK-NEXT: call void @__kmpc_for_static_fini(ptr @[[GLOB1]], i32 [[OMP_GLOBAL_THREAD_NUM160]]) -// CHECK-NEXT: [[OMP_GLOBAL_THREAD_NUM162:%.*]] = call i32 @__kmpc_global_thread_num(ptr @[[GLOB1]]) -// CHECK-NEXT: call void @__kmpc_barrier(ptr @[[GLOB2]], i32 [[OMP_GLOBAL_THREAD_NUM162]]) -// CHECK-NEXT: br label [[OMP_LOOP_AFTER149:%.*]] -// CHECK: omp_loop.after149: +// CHECK-NEXT: call void @__captured_stmt.15(ptr [[DOTCOUNT_ADDR142]], ptr [[AGG_CAPTURED140]]) +// CHECK-NEXT: [[DOTCOUNT143:%.*]] = load i32, ptr [[DOTCOUNT_ADDR142]], align 4 +// CHECK-NEXT: br label [[OMP_LOOP_PREHEADER144:%.*]] +// CHECK: omp_loop.preheader144: +// CHECK-NEXT: store i32 0, ptr [[P_LOWERBOUND158]], align 4 +// CHECK-NEXT: [[TMP19:%.*]] = sub i32 [[DOTCOUNT143]], 1 +// CHECK-NEXT: store i32 [[TMP19]], ptr [[P_UPPERBOUND159]], align 4 +// CHECK-NEXT: store i32 1, ptr [[P_STRIDE160]], align 4 +// CHECK-NEXT: [[OMP_GLOBAL_THREAD_NUM161:%.*]] = call i32 @__kmpc_global_thread_num(ptr @[[GLOB1]]) +// CHECK-NEXT: call void @__kmpc_for_static_init_4u(ptr @[[GLOB1]], i32 [[OMP_GLOBAL_THREAD_NUM161]], i32 34, ptr [[P_LASTITER157]], ptr [[P_LOWERBOUND158]], ptr [[P_UPPERBOUND159]], ptr [[P_STRIDE160]], i32 1, i32 0) +// CHECK-NEXT: [[TMP20:%.*]] = load i32, ptr [[P_LOWERBOUND158]], align 4 +// CHECK-NEXT: [[TMP21:%.*]] = load i32, ptr [[P_UPPERBOUND159]], align 4 +// CHECK-NEXT: [[TRIP_COUNT_MINUS1162:%.*]] = sub i32 [[TMP21]], [[TMP20]] +// CHECK-NEXT: [[TMP22:%.*]] = add i32 [[TRIP_COUNT_MINUS1162]], 1 +// CHECK-NEXT: br label [[OMP_LOOP_HEADER145:%.*]] +// CHECK: omp_loop.header145: +// CHECK-NEXT: [[OMP_LOOP_IV151:%.*]] = phi i32 [ 0, [[OMP_LOOP_PREHEADER144]] ], [ [[OMP_LOOP_NEXT153:%.*]], [[OMP_LOOP_INC148:%.*]] ] +// CHECK-NEXT: br label [[OMP_LOOP_COND146:%.*]] +// CHECK: omp_loop.cond146: +// CHECK-NEXT: [[OMP_LOOP_CMP152:%.*]] = icmp ult i32 [[OMP_LOOP_IV151]], [[TMP22]] +// CHECK-NEXT: br i1 [[OMP_LOOP_CMP152]], label [[OMP_LOOP_BODY147:%.*]], label [[OMP_LOOP_EXIT149:%.*]] +// CHECK: omp_loop.exit149: +// CHECK-NEXT: call void @__kmpc_for_static_fini(ptr @[[GLOB1]], i32 [[OMP_GLOBAL_THREAD_NUM161]]) +// CHECK-NEXT: [[OMP_GLOBAL_THREAD_NUM163:%.*]] = call i32 @__kmpc_global_thread_num(ptr @[[GLOB1]]) +// CHECK-NEXT: call void @__kmpc_barrier(ptr @[[GLOB2]], i32 [[OMP_GLOBAL_THREAD_NUM163]]) +// CHECK-NEXT: br label [[OMP_LOOP_AFTER150:%.*]] +// CHECK: omp_loop.after150: // CHECK-NEXT: br label [[OMP_PAR_REGION9_PARALLEL_AFTER:%.*]] // CHECK: omp.par.region9.parallel.after: // CHECK-NEXT: br label [[OMP_PAR_PRE_FINALIZE10:%.*]] // CHECK: omp.par.pre_finalize10: +// CHECK-NEXT: br label [[DOTFINI164:%.*]] +// CHECK: .fini164: // CHECK-NEXT: br label [[OMP_PAR_EXIT11_EXITSTUB:%.*]] -// CHECK: omp_loop.body146: -// CHECK-NEXT: [[TMP23:%.*]] = add i32 [[OMP_LOOP_IV150]], [[TMP20]] -// CHECK-NEXT: call void @__captured_stmt.16(ptr [[I138]], i32 [[TMP23]], ptr [[AGG_CAPTURED140]]) +// CHECK: omp_loop.body147: +// CHECK-NEXT: [[TMP23:%.*]] = add i32 [[OMP_LOOP_IV151]], [[TMP20]] +// CHECK-NEXT: call void @__captured_stmt.16(ptr [[I139]], i32 [[TMP23]], ptr [[AGG_CAPTURED141]]) // CHECK-NEXT: 
[[TMP24:%.*]] = load i32, ptr [[LOADGEP_A_ADDR]], align 4 -// CHECK-NEXT: [[CONV153:%.*]] = sitofp i32 [[TMP24]] to double +// CHECK-NEXT: [[CONV154:%.*]] = sitofp i32 [[TMP24]] to double // CHECK-NEXT: [[TMP25:%.*]] = load double, ptr [[LOADGEP_B_ADDR]], align 8 -// CHECK-NEXT: [[ADD154:%.*]] = fadd double [[CONV153]], [[TMP25]] -// CHECK-NEXT: [[CONV155:%.*]] = fptrunc double [[ADD154]] to float +// CHECK-NEXT: [[ADD155:%.*]] = fadd double [[CONV154]], [[TMP25]] +// CHECK-NEXT: [[CONV156:%.*]] = fptrunc double [[ADD155]] to float // CHECK-NEXT: [[TMP26:%.*]] = load ptr, ptr [[LOADGEP_R_ADDR]], align 8 -// CHECK-NEXT: store float [[CONV155]], ptr [[TMP26]], align 4 -// CHECK-NEXT: br label [[OMP_LOOP_INC147]] -// CHECK: omp_loop.inc147: -// CHECK-NEXT: [[OMP_LOOP_NEXT152]] = add nuw i32 [[OMP_LOOP_IV150]], 1 -// CHECK-NEXT: br label [[OMP_LOOP_HEADER144]] +// CHECK-NEXT: store float [[CONV156]], ptr [[TMP26]], align 4 +// CHECK-NEXT: br label [[OMP_LOOP_INC148]] +// CHECK: omp_loop.inc148: +// CHECK-NEXT: [[OMP_LOOP_NEXT153]] = add nuw i32 [[OMP_LOOP_IV151]], 1 +// CHECK-NEXT: br label [[OMP_LOOP_HEADER145]] // CHECK: omp_loop.body85: // CHECK-NEXT: [[TMP27:%.*]] = add i32 [[OMP_LOOP_IV89]], [[TMP13]] // CHECK-NEXT: call void @__captured_stmt.12(ptr [[I77]], i32 [[TMP27]], ptr [[AGG_CAPTURED79]]) @@ -856,11 +866,11 @@ void parallel_for_2(float *r, int a, double b) { // CHECK-SAME: (ptr noalias [[TID_ADDR103:%.*]], ptr noalias [[ZERO_ADDR104:%.*]], ptr [[TMP0:%.*]]) #[[ATTR1]] { // CHECK-NEXT: omp.par.entry105: // CHECK-NEXT: [[GEP_A_ADDR:%.*]] = getelementptr { ptr, ptr, ptr }, ptr [[TMP0]], i32 0, i32 0 -// CHECK-NEXT: [[LOADGEP_A_ADDR:%.*]] = load ptr, ptr [[GEP_A_ADDR]], align 8, !align [[META5]] +// CHECK-NEXT: [[LOADGEP_A_ADDR:%.*]] = load ptr, ptr [[GEP_A_ADDR]], align 8, !align [[META4]] // CHECK-NEXT: [[GEP_B_ADDR:%.*]] = getelementptr { ptr, ptr, ptr }, ptr [[TMP0]], i32 0, i32 1 -// CHECK-NEXT: [[LOADGEP_B_ADDR:%.*]] = load ptr, ptr [[GEP_B_ADDR]], align 8, !align [[META6]] +// CHECK-NEXT: [[LOADGEP_B_ADDR:%.*]] = load ptr, ptr [[GEP_B_ADDR]], align 8, !align [[META7]] // CHECK-NEXT: [[GEP_R_ADDR:%.*]] = getelementptr { ptr, ptr, ptr }, ptr [[TMP0]], i32 0, i32 2 -// CHECK-NEXT: [[LOADGEP_R_ADDR:%.*]] = load ptr, ptr [[GEP_R_ADDR]], align 8, !align [[META6]] +// CHECK-NEXT: [[LOADGEP_R_ADDR:%.*]] = load ptr, ptr [[GEP_R_ADDR]], align 8, !align [[META7]] // CHECK-NEXT: [[P_LASTITER131:%.*]] = alloca i32, align 4 // CHECK-NEXT: [[P_LOWERBOUND132:%.*]] = alloca i32, align 4 // CHECK-NEXT: [[P_UPPERBOUND133:%.*]] = alloca i32, align 4 @@ -912,6 +922,8 @@ void parallel_for_2(float *r, int a, double b) { // CHECK: omp.par.region106.parallel.after: // CHECK-NEXT: br label [[OMP_PAR_PRE_FINALIZE107:%.*]] // CHECK: omp.par.pre_finalize107: +// CHECK-NEXT: br label [[DOTFINI138:%.*]] +// CHECK: .fini138: // CHECK-NEXT: br label [[OMP_PAR_EXIT108_EXITSTUB:%.*]] // CHECK: omp_loop.body121: // CHECK-NEXT: [[TMP9:%.*]] = add i32 [[OMP_LOOP_IV125]], [[TMP6]] @@ -935,11 +947,11 @@ void parallel_for_2(float *r, int a, double b) { // CHECK-SAME: (ptr noalias [[TID_ADDR42:%.*]], ptr noalias [[ZERO_ADDR43:%.*]], ptr [[TMP0:%.*]]) #[[ATTR1]] { // CHECK-NEXT: omp.par.entry44: // CHECK-NEXT: [[GEP_A_ADDR:%.*]] = getelementptr { ptr, ptr, ptr }, ptr [[TMP0]], i32 0, i32 0 -// CHECK-NEXT: [[LOADGEP_A_ADDR:%.*]] = load ptr, ptr [[GEP_A_ADDR]], align 8, !align [[META5]] +// CHECK-NEXT: [[LOADGEP_A_ADDR:%.*]] = load ptr, ptr [[GEP_A_ADDR]], align 8, !align [[META4]] // CHECK-NEXT: [[GEP_B_ADDR:%.*]] = 
getelementptr { ptr, ptr, ptr }, ptr [[TMP0]], i32 0, i32 1 -// CHECK-NEXT: [[LOADGEP_B_ADDR:%.*]] = load ptr, ptr [[GEP_B_ADDR]], align 8, !align [[META6]] +// CHECK-NEXT: [[LOADGEP_B_ADDR:%.*]] = load ptr, ptr [[GEP_B_ADDR]], align 8, !align [[META7]] // CHECK-NEXT: [[GEP_R_ADDR:%.*]] = getelementptr { ptr, ptr, ptr }, ptr [[TMP0]], i32 0, i32 2 -// CHECK-NEXT: [[LOADGEP_R_ADDR:%.*]] = load ptr, ptr [[GEP_R_ADDR]], align 8, !align [[META6]] +// CHECK-NEXT: [[LOADGEP_R_ADDR:%.*]] = load ptr, ptr [[GEP_R_ADDR]], align 8, !align [[META7]] // CHECK-NEXT: [[P_LASTITER70:%.*]] = alloca i32, align 4 // CHECK-NEXT: [[P_LOWERBOUND71:%.*]] = alloca i32, align 4 // CHECK-NEXT: [[P_UPPERBOUND72:%.*]] = alloca i32, align 4 @@ -991,6 +1003,8 @@ void parallel_for_2(float *r, int a, double b) { // CHECK: omp.par.region45.parallel.after: // CHECK-NEXT: br label [[OMP_PAR_PRE_FINALIZE46:%.*]] // CHECK: omp.par.pre_finalize46: +// CHECK-NEXT: br label [[DOTFINI:%.*]] +// CHECK: .fini: // CHECK-NEXT: br label [[OMP_PAR_EXIT47_EXITSTUB:%.*]] // CHECK: omp_loop.body60: // CHECK-NEXT: [[TMP9:%.*]] = add i32 [[OMP_LOOP_IV64]], [[TMP6]] @@ -1022,7 +1036,7 @@ void parallel_for_2(float *r, int a, double b) { // CHECK-NEXT: store ptr [[__CONTEXT]], ptr [[__CONTEXT_ADDR]], align 8 // CHECK-NEXT: [[TMP0:%.*]] = load ptr, ptr [[__CONTEXT_ADDR]], align 8 // CHECK-NEXT: [[TMP1:%.*]] = getelementptr inbounds nuw [[STRUCT_ANON_3:%.*]], ptr [[TMP0]], i32 0, i32 0 -// CHECK-NEXT: [[TMP2:%.*]] = load ptr, ptr [[TMP1]], align 8 +// CHECK-NEXT: [[TMP2:%.*]] = load ptr, ptr [[TMP1]], align 8, !nonnull [[META3]], !align [[META4]] // CHECK-NEXT: [[TMP3:%.*]] = load i32, ptr [[TMP2]], align 4 // CHECK-NEXT: store i32 [[TMP3]], ptr [[DOTSTART]], align 4 // CHECK-NEXT: store i32 100, ptr [[DOTSTOP]], align 4 @@ -1045,7 +1059,7 @@ void parallel_for_2(float *r, int a, double b) { // CHECK-NEXT: br label [[COND_END]] // CHECK: cond.end: // CHECK-NEXT: [[COND:%.*]] = phi i32 [ [[DIV]], [[COND_TRUE]] ], [ 0, [[COND_FALSE]] ] -// CHECK-NEXT: [[TMP10:%.*]] = load ptr, ptr [[DISTANCE_ADDR]], align 8 +// CHECK-NEXT: [[TMP10:%.*]] = load ptr, ptr [[DISTANCE_ADDR]], align 8, !nonnull [[META3]], !align [[META4]] // CHECK-NEXT: store i32 [[COND]], ptr [[TMP10]], align 4 // CHECK-NEXT: ret void // @@ -1065,7 +1079,7 @@ void parallel_for_2(float *r, int a, double b) { // CHECK-NEXT: [[TMP3:%.*]] = load i32, ptr [[LOGICAL_ADDR]], align 4 // CHECK-NEXT: [[MUL:%.*]] = mul i32 1, [[TMP3]] // CHECK-NEXT: [[ADD:%.*]] = add i32 [[TMP2]], [[MUL]] -// CHECK-NEXT: [[TMP4:%.*]] = load ptr, ptr [[LOOPVAR_ADDR]], align 8 +// CHECK-NEXT: [[TMP4:%.*]] = load ptr, ptr [[LOOPVAR_ADDR]], align 8, !nonnull [[META3]], !align [[META4]] // CHECK-NEXT: store i32 [[ADD]], ptr [[TMP4]], align 4 // CHECK-NEXT: ret void // @@ -1082,7 +1096,7 @@ void parallel_for_2(float *r, int a, double b) { // CHECK-NEXT: store ptr [[__CONTEXT]], ptr [[__CONTEXT_ADDR]], align 8 // CHECK-NEXT: [[TMP0:%.*]] = load ptr, ptr [[__CONTEXT_ADDR]], align 8 // CHECK-NEXT: [[TMP1:%.*]] = getelementptr inbounds nuw [[STRUCT_ANON_5:%.*]], ptr [[TMP0]], i32 0, i32 0 -// CHECK-NEXT: [[TMP2:%.*]] = load ptr, ptr [[TMP1]], align 8 +// CHECK-NEXT: [[TMP2:%.*]] = load ptr, ptr [[TMP1]], align 8, !nonnull [[META3]], !align [[META4]] // CHECK-NEXT: [[TMP3:%.*]] = load i32, ptr [[TMP2]], align 4 // CHECK-NEXT: store i32 [[TMP3]], ptr [[DOTSTART]], align 4 // CHECK-NEXT: store i32 100, ptr [[DOTSTOP]], align 4 @@ -1105,7 +1119,7 @@ void parallel_for_2(float *r, int a, double b) { // CHECK-NEXT: br label 
[[COND_END]] // CHECK: cond.end: // CHECK-NEXT: [[COND:%.*]] = phi i32 [ [[DIV]], [[COND_TRUE]] ], [ 0, [[COND_FALSE]] ] -// CHECK-NEXT: [[TMP10:%.*]] = load ptr, ptr [[DISTANCE_ADDR]], align 8 +// CHECK-NEXT: [[TMP10:%.*]] = load ptr, ptr [[DISTANCE_ADDR]], align 8, !nonnull [[META3]], !align [[META4]] // CHECK-NEXT: store i32 [[COND]], ptr [[TMP10]], align 4 // CHECK-NEXT: ret void // @@ -1125,7 +1139,7 @@ void parallel_for_2(float *r, int a, double b) { // CHECK-NEXT: [[TMP3:%.*]] = load i32, ptr [[LOGICAL_ADDR]], align 4 // CHECK-NEXT: [[MUL:%.*]] = mul i32 1, [[TMP3]] // CHECK-NEXT: [[ADD:%.*]] = add i32 [[TMP2]], [[MUL]] -// CHECK-NEXT: [[TMP4:%.*]] = load ptr, ptr [[LOOPVAR_ADDR]], align 8 +// CHECK-NEXT: [[TMP4:%.*]] = load ptr, ptr [[LOOPVAR_ADDR]], align 8, !nonnull [[META3]], !align [[META4]] // CHECK-NEXT: store i32 [[ADD]], ptr [[TMP4]], align 4 // CHECK-NEXT: ret void // @@ -1142,7 +1156,7 @@ void parallel_for_2(float *r, int a, double b) { // CHECK-NEXT: store ptr [[__CONTEXT]], ptr [[__CONTEXT_ADDR]], align 8 // CHECK-NEXT: [[TMP0:%.*]] = load ptr, ptr [[__CONTEXT_ADDR]], align 8 // CHECK-NEXT: [[TMP1:%.*]] = getelementptr inbounds nuw [[STRUCT_ANON_7:%.*]], ptr [[TMP0]], i32 0, i32 0 -// CHECK-NEXT: [[TMP2:%.*]] = load ptr, ptr [[TMP1]], align 8 +// CHECK-NEXT: [[TMP2:%.*]] = load ptr, ptr [[TMP1]], align 8, !nonnull [[META3]], !align [[META4]] // CHECK-NEXT: [[TMP3:%.*]] = load i32, ptr [[TMP2]], align 4 // CHECK-NEXT: store i32 [[TMP3]], ptr [[DOTSTART]], align 4 // CHECK-NEXT: store i32 100, ptr [[DOTSTOP]], align 4 @@ -1165,7 +1179,7 @@ void parallel_for_2(float *r, int a, double b) { // CHECK-NEXT: br label [[COND_END]] // CHECK: cond.end: // CHECK-NEXT: [[COND:%.*]] = phi i32 [ [[DIV]], [[COND_TRUE]] ], [ 0, [[COND_FALSE]] ] -// CHECK-NEXT: [[TMP10:%.*]] = load ptr, ptr [[DISTANCE_ADDR]], align 8 +// CHECK-NEXT: [[TMP10:%.*]] = load ptr, ptr [[DISTANCE_ADDR]], align 8, !nonnull [[META3]], !align [[META4]] // CHECK-NEXT: store i32 [[COND]], ptr [[TMP10]], align 4 // CHECK-NEXT: ret void // @@ -1185,7 +1199,7 @@ void parallel_for_2(float *r, int a, double b) { // CHECK-NEXT: [[TMP3:%.*]] = load i32, ptr [[LOGICAL_ADDR]], align 4 // CHECK-NEXT: [[MUL:%.*]] = mul i32 1, [[TMP3]] // CHECK-NEXT: [[ADD:%.*]] = add i32 [[TMP2]], [[MUL]] -// CHECK-NEXT: [[TMP4:%.*]] = load ptr, ptr [[LOOPVAR_ADDR]], align 8 +// CHECK-NEXT: [[TMP4:%.*]] = load ptr, ptr [[LOOPVAR_ADDR]], align 8, !nonnull [[META3]], !align [[META4]] // CHECK-NEXT: store i32 [[ADD]], ptr [[TMP4]], align 4 // CHECK-NEXT: ret void // @@ -1202,7 +1216,7 @@ void parallel_for_2(float *r, int a, double b) { // CHECK-NEXT: store ptr [[__CONTEXT]], ptr [[__CONTEXT_ADDR]], align 8 // CHECK-NEXT: [[TMP0:%.*]] = load ptr, ptr [[__CONTEXT_ADDR]], align 8 // CHECK-NEXT: [[TMP1:%.*]] = getelementptr inbounds nuw [[STRUCT_ANON_9:%.*]], ptr [[TMP0]], i32 0, i32 0 -// CHECK-NEXT: [[TMP2:%.*]] = load ptr, ptr [[TMP1]], align 8 +// CHECK-NEXT: [[TMP2:%.*]] = load ptr, ptr [[TMP1]], align 8, !nonnull [[META3]], !align [[META4]] // CHECK-NEXT: [[TMP3:%.*]] = load i32, ptr [[TMP2]], align 4 // CHECK-NEXT: store i32 [[TMP3]], ptr [[DOTSTART]], align 4 // CHECK-NEXT: store i32 100, ptr [[DOTSTOP]], align 4 @@ -1225,7 +1239,7 @@ void parallel_for_2(float *r, int a, double b) { // CHECK-NEXT: br label [[COND_END]] // CHECK: cond.end: // CHECK-NEXT: [[COND:%.*]] = phi i32 [ [[DIV]], [[COND_TRUE]] ], [ 0, [[COND_FALSE]] ] -// CHECK-NEXT: [[TMP10:%.*]] = load ptr, ptr [[DISTANCE_ADDR]], align 8 +// CHECK-NEXT: [[TMP10:%.*]] = load ptr, ptr 
[[DISTANCE_ADDR]], align 8, !nonnull [[META3]], !align [[META4]] // CHECK-NEXT: store i32 [[COND]], ptr [[TMP10]], align 4 // CHECK-NEXT: ret void // @@ -1245,7 +1259,7 @@ void parallel_for_2(float *r, int a, double b) { // CHECK-NEXT: [[TMP3:%.*]] = load i32, ptr [[LOGICAL_ADDR]], align 4 // CHECK-NEXT: [[MUL:%.*]] = mul i32 1, [[TMP3]] // CHECK-NEXT: [[ADD:%.*]] = add i32 [[TMP2]], [[MUL]] -// CHECK-NEXT: [[TMP4:%.*]] = load ptr, ptr [[LOOPVAR_ADDR]], align 8 +// CHECK-NEXT: [[TMP4:%.*]] = load ptr, ptr [[LOOPVAR_ADDR]], align 8, !nonnull [[META3]], !align [[META4]] // CHECK-NEXT: store i32 [[ADD]], ptr [[TMP4]], align 4 // CHECK-NEXT: ret void // @@ -1262,7 +1276,7 @@ void parallel_for_2(float *r, int a, double b) { // CHECK-NEXT: store ptr [[__CONTEXT]], ptr [[__CONTEXT_ADDR]], align 8 // CHECK-NEXT: [[TMP0:%.*]] = load ptr, ptr [[__CONTEXT_ADDR]], align 8 // CHECK-NEXT: [[TMP1:%.*]] = getelementptr inbounds nuw [[STRUCT_ANON_11:%.*]], ptr [[TMP0]], i32 0, i32 0 -// CHECK-NEXT: [[TMP2:%.*]] = load ptr, ptr [[TMP1]], align 8 +// CHECK-NEXT: [[TMP2:%.*]] = load ptr, ptr [[TMP1]], align 8, !nonnull [[META3]], !align [[META4]] // CHECK-NEXT: [[TMP3:%.*]] = load i32, ptr [[TMP2]], align 4 // CHECK-NEXT: store i32 [[TMP3]], ptr [[DOTSTART]], align 4 // CHECK-NEXT: store i32 100, ptr [[DOTSTOP]], align 4 @@ -1285,7 +1299,7 @@ void parallel_for_2(float *r, int a, double b) { // CHECK-NEXT: br label [[COND_END]] // CHECK: cond.end: // CHECK-NEXT: [[COND:%.*]] = phi i32 [ [[DIV]], [[COND_TRUE]] ], [ 0, [[COND_FALSE]] ] -// CHECK-NEXT: [[TMP10:%.*]] = load ptr, ptr [[DISTANCE_ADDR]], align 8 +// CHECK-NEXT: [[TMP10:%.*]] = load ptr, ptr [[DISTANCE_ADDR]], align 8, !nonnull [[META3]], !align [[META4]] // CHECK-NEXT: store i32 [[COND]], ptr [[TMP10]], align 4 // CHECK-NEXT: ret void // @@ -1305,7 +1319,7 @@ void parallel_for_2(float *r, int a, double b) { // CHECK-NEXT: [[TMP3:%.*]] = load i32, ptr [[LOGICAL_ADDR]], align 4 // CHECK-NEXT: [[MUL:%.*]] = mul i32 1, [[TMP3]] // CHECK-NEXT: [[ADD:%.*]] = add i32 [[TMP2]], [[MUL]] -// CHECK-NEXT: [[TMP4:%.*]] = load ptr, ptr [[LOOPVAR_ADDR]], align 8 +// CHECK-NEXT: [[TMP4:%.*]] = load ptr, ptr [[LOOPVAR_ADDR]], align 8, !nonnull [[META3]], !align [[META4]] // CHECK-NEXT: store i32 [[ADD]], ptr [[TMP4]], align 4 // CHECK-NEXT: ret void // @@ -1322,7 +1336,7 @@ void parallel_for_2(float *r, int a, double b) { // CHECK-NEXT: store ptr [[__CONTEXT]], ptr [[__CONTEXT_ADDR]], align 8 // CHECK-NEXT: [[TMP0:%.*]] = load ptr, ptr [[__CONTEXT_ADDR]], align 8 // CHECK-NEXT: [[TMP1:%.*]] = getelementptr inbounds nuw [[STRUCT_ANON_13:%.*]], ptr [[TMP0]], i32 0, i32 0 -// CHECK-NEXT: [[TMP2:%.*]] = load ptr, ptr [[TMP1]], align 8 +// CHECK-NEXT: [[TMP2:%.*]] = load ptr, ptr [[TMP1]], align 8, !nonnull [[META3]], !align [[META4]] // CHECK-NEXT: [[TMP3:%.*]] = load i32, ptr [[TMP2]], align 4 // CHECK-NEXT: store i32 [[TMP3]], ptr [[DOTSTART]], align 4 // CHECK-NEXT: store i32 100, ptr [[DOTSTOP]], align 4 @@ -1345,7 +1359,7 @@ void parallel_for_2(float *r, int a, double b) { // CHECK-NEXT: br label [[COND_END]] // CHECK: cond.end: // CHECK-NEXT: [[COND:%.*]] = phi i32 [ [[DIV]], [[COND_TRUE]] ], [ 0, [[COND_FALSE]] ] -// CHECK-NEXT: [[TMP10:%.*]] = load ptr, ptr [[DISTANCE_ADDR]], align 8 +// CHECK-NEXT: [[TMP10:%.*]] = load ptr, ptr [[DISTANCE_ADDR]], align 8, !nonnull [[META3]], !align [[META4]] // CHECK-NEXT: store i32 [[COND]], ptr [[TMP10]], align 4 // CHECK-NEXT: ret void // @@ -1365,7 +1379,7 @@ void parallel_for_2(float *r, int a, double b) { // CHECK-NEXT: 
[[TMP3:%.*]] = load i32, ptr [[LOGICAL_ADDR]], align 4 // CHECK-NEXT: [[MUL:%.*]] = mul i32 1, [[TMP3]] // CHECK-NEXT: [[ADD:%.*]] = add i32 [[TMP2]], [[MUL]] -// CHECK-NEXT: [[TMP4:%.*]] = load ptr, ptr [[LOOPVAR_ADDR]], align 8 +// CHECK-NEXT: [[TMP4:%.*]] = load ptr, ptr [[LOOPVAR_ADDR]], align 8, !nonnull [[META3]], !align [[META4]] // CHECK-NEXT: store i32 [[ADD]], ptr [[TMP4]], align 4 // CHECK-NEXT: ret void // @@ -1382,7 +1396,7 @@ void parallel_for_2(float *r, int a, double b) { // CHECK-NEXT: store ptr [[__CONTEXT]], ptr [[__CONTEXT_ADDR]], align 8 // CHECK-NEXT: [[TMP0:%.*]] = load ptr, ptr [[__CONTEXT_ADDR]], align 8 // CHECK-NEXT: [[TMP1:%.*]] = getelementptr inbounds nuw [[STRUCT_ANON_15:%.*]], ptr [[TMP0]], i32 0, i32 0 -// CHECK-NEXT: [[TMP2:%.*]] = load ptr, ptr [[TMP1]], align 8 +// CHECK-NEXT: [[TMP2:%.*]] = load ptr, ptr [[TMP1]], align 8, !nonnull [[META3]], !align [[META4]] // CHECK-NEXT: [[TMP3:%.*]] = load i32, ptr [[TMP2]], align 4 // CHECK-NEXT: store i32 [[TMP3]], ptr [[DOTSTART]], align 4 // CHECK-NEXT: store i32 100, ptr [[DOTSTOP]], align 4 @@ -1405,7 +1419,7 @@ void parallel_for_2(float *r, int a, double b) { // CHECK-NEXT: br label [[COND_END]] // CHECK: cond.end: // CHECK-NEXT: [[COND:%.*]] = phi i32 [ [[DIV]], [[COND_TRUE]] ], [ 0, [[COND_FALSE]] ] -// CHECK-NEXT: [[TMP10:%.*]] = load ptr, ptr [[DISTANCE_ADDR]], align 8 +// CHECK-NEXT: [[TMP10:%.*]] = load ptr, ptr [[DISTANCE_ADDR]], align 8, !nonnull [[META3]], !align [[META4]] // CHECK-NEXT: store i32 [[COND]], ptr [[TMP10]], align 4 // CHECK-NEXT: ret void // @@ -1425,7 +1439,7 @@ void parallel_for_2(float *r, int a, double b) { // CHECK-NEXT: [[TMP3:%.*]] = load i32, ptr [[LOGICAL_ADDR]], align 4 // CHECK-NEXT: [[MUL:%.*]] = mul i32 1, [[TMP3]] // CHECK-NEXT: [[ADD:%.*]] = add i32 [[TMP2]], [[MUL]] -// CHECK-NEXT: [[TMP4:%.*]] = load ptr, ptr [[LOOPVAR_ADDR]], align 8 +// CHECK-NEXT: [[TMP4:%.*]] = load ptr, ptr [[LOOPVAR_ADDR]], align 8, !nonnull [[META3]], !align [[META4]] // CHECK-NEXT: store i32 [[ADD]], ptr [[TMP4]], align 4 // CHECK-NEXT: ret void // @@ -1442,7 +1456,7 @@ void parallel_for_2(float *r, int a, double b) { // CHECK-NEXT: store ptr [[__CONTEXT]], ptr [[__CONTEXT_ADDR]], align 8 // CHECK-NEXT: [[TMP0:%.*]] = load ptr, ptr [[__CONTEXT_ADDR]], align 8 // CHECK-NEXT: [[TMP1:%.*]] = getelementptr inbounds nuw [[STRUCT_ANON_17:%.*]], ptr [[TMP0]], i32 0, i32 0 -// CHECK-NEXT: [[TMP2:%.*]] = load ptr, ptr [[TMP1]], align 8 +// CHECK-NEXT: [[TMP2:%.*]] = load ptr, ptr [[TMP1]], align 8, !nonnull [[META3]], !align [[META4]] // CHECK-NEXT: [[TMP3:%.*]] = load i32, ptr [[TMP2]], align 4 // CHECK-NEXT: store i32 [[TMP3]], ptr [[DOTSTART]], align 4 // CHECK-NEXT: store i32 100, ptr [[DOTSTOP]], align 4 @@ -1465,7 +1479,7 @@ void parallel_for_2(float *r, int a, double b) { // CHECK-NEXT: br label [[COND_END]] // CHECK: cond.end: // CHECK-NEXT: [[COND:%.*]] = phi i32 [ [[DIV]], [[COND_TRUE]] ], [ 0, [[COND_FALSE]] ] -// CHECK-NEXT: [[TMP10:%.*]] = load ptr, ptr [[DISTANCE_ADDR]], align 8 +// CHECK-NEXT: [[TMP10:%.*]] = load ptr, ptr [[DISTANCE_ADDR]], align 8, !nonnull [[META3]], !align [[META4]] // CHECK-NEXT: store i32 [[COND]], ptr [[TMP10]], align 4 // CHECK-NEXT: ret void // @@ -1485,7 +1499,7 @@ void parallel_for_2(float *r, int a, double b) { // CHECK-NEXT: [[TMP3:%.*]] = load i32, ptr [[LOGICAL_ADDR]], align 4 // CHECK-NEXT: [[MUL:%.*]] = mul i32 1, [[TMP3]] // CHECK-NEXT: [[ADD:%.*]] = add i32 [[TMP2]], [[MUL]] -// CHECK-NEXT: [[TMP4:%.*]] = load ptr, ptr [[LOOPVAR_ADDR]], align 8 +// 
CHECK-NEXT: [[TMP4:%.*]] = load ptr, ptr [[LOOPVAR_ADDR]], align 8, !nonnull [[META3]], !align [[META4]] // CHECK-NEXT: store i32 [[ADD]], ptr [[TMP4]], align 4 // CHECK-NEXT: ret void // @@ -1557,6 +1571,8 @@ void parallel_for_2(float *r, int a, double b) { // CHECK-DEBUG: omp.par.region.parallel.after: // CHECK-DEBUG-NEXT: br label [[OMP_PAR_PRE_FINALIZE:%.*]] // CHECK-DEBUG: omp.par.pre_finalize: +// CHECK-DEBUG-NEXT: br label [[DOTFINI:%.*]] +// CHECK-DEBUG: .fini: // CHECK-DEBUG-NEXT: br label [[OMP_PAR_EXIT_EXITSTUB:%.*]], !dbg [[DBG30]] // CHECK-DEBUG: omp_loop.body: // CHECK-DEBUG-NEXT: [[TMP8:%.*]] = add i32 [[OMP_LOOP_IV]], [[TMP5]], !dbg [[DBG29]] @@ -1584,73 +1600,73 @@ void parallel_for_2(float *r, int a, double b) { // CHECK-DEBUG-NEXT: [[TMP0:%.*]] = load ptr, ptr [[__CONTEXT_ADDR]], align 8 // CHECK-DEBUG-NEXT: #dbg_declare(ptr [[DOTSTART]], [[META42:![0-9]+]], !DIExpression(), [[META44:![0-9]+]]) // CHECK-DEBUG-NEXT: [[TMP1:%.*]] = getelementptr inbounds nuw [[STRUCT_ANON:%.*]], ptr [[TMP0]], i32 0, i32 0, !dbg [[DBG45:![0-9]+]] -// CHECK-DEBUG-NEXT: [[TMP2:%.*]] = load ptr, ptr [[TMP1]], align 8, !dbg [[DBG45]] +// CHECK-DEBUG-NEXT: [[TMP2:%.*]] = load ptr, ptr [[TMP1]], align 8, !dbg [[DBG45]], !nonnull [[META12:![0-9]+]], !align [[META47:![0-9]+]] // CHECK-DEBUG-NEXT: [[TMP3:%.*]] = load i32, ptr [[TMP2]], align 4, !dbg [[DBG45]] // CHECK-DEBUG-NEXT: store i32 [[TMP3]], ptr [[DOTSTART]], align 4, !dbg [[META44]] -// CHECK-DEBUG-NEXT: #dbg_declare(ptr [[DOTSTOP]], [[META47:![0-9]+]], !DIExpression(), [[META48:![0-9]+]]) -// CHECK-DEBUG-NEXT: store i32 100, ptr [[DOTSTOP]], align 4, !dbg [[META48]] -// CHECK-DEBUG-NEXT: #dbg_declare(ptr [[DOTSTEP]], [[META49:![0-9]+]], !DIExpression(), [[META48]]) -// CHECK-DEBUG-NEXT: store i32 1, ptr [[DOTSTEP]], align 4, !dbg [[META48]] -// CHECK-DEBUG-NEXT: [[TMP4:%.*]] = load i32, ptr [[DOTSTART]], align 4, !dbg [[META48]] -// CHECK-DEBUG-NEXT: [[TMP5:%.*]] = load i32, ptr [[DOTSTOP]], align 4, !dbg [[META48]] -// CHECK-DEBUG-NEXT: [[CMP:%.*]] = icmp slt i32 [[TMP4]], [[TMP5]], !dbg [[META48]] -// CHECK-DEBUG-NEXT: br i1 [[CMP]], label [[COND_TRUE:%.*]], label [[COND_FALSE:%.*]], !dbg [[META48]] +// CHECK-DEBUG-NEXT: #dbg_declare(ptr [[DOTSTOP]], [[META48:![0-9]+]], !DIExpression(), [[META49:![0-9]+]]) +// CHECK-DEBUG-NEXT: store i32 100, ptr [[DOTSTOP]], align 4, !dbg [[META49]] +// CHECK-DEBUG-NEXT: #dbg_declare(ptr [[DOTSTEP]], [[META50:![0-9]+]], !DIExpression(), [[META49]]) +// CHECK-DEBUG-NEXT: store i32 1, ptr [[DOTSTEP]], align 4, !dbg [[META49]] +// CHECK-DEBUG-NEXT: [[TMP4:%.*]] = load i32, ptr [[DOTSTART]], align 4, !dbg [[META49]] +// CHECK-DEBUG-NEXT: [[TMP5:%.*]] = load i32, ptr [[DOTSTOP]], align 4, !dbg [[META49]] +// CHECK-DEBUG-NEXT: [[CMP:%.*]] = icmp slt i32 [[TMP4]], [[TMP5]], !dbg [[META49]] +// CHECK-DEBUG-NEXT: br i1 [[CMP]], label [[COND_TRUE:%.*]], label [[COND_FALSE:%.*]], !dbg [[META49]] // CHECK-DEBUG: cond.true: -// CHECK-DEBUG-NEXT: [[TMP6:%.*]] = load i32, ptr [[DOTSTOP]], align 4, !dbg [[META48]] -// CHECK-DEBUG-NEXT: [[TMP7:%.*]] = load i32, ptr [[DOTSTART]], align 4, !dbg [[META48]] -// CHECK-DEBUG-NEXT: [[SUB:%.*]] = sub nsw i32 [[TMP6]], [[TMP7]], !dbg [[META48]] -// CHECK-DEBUG-NEXT: [[TMP8:%.*]] = load i32, ptr [[DOTSTEP]], align 4, !dbg [[META48]] -// CHECK-DEBUG-NEXT: [[SUB1:%.*]] = sub i32 [[TMP8]], 1, !dbg [[META48]] -// CHECK-DEBUG-NEXT: [[ADD:%.*]] = add i32 [[SUB]], [[SUB1]], !dbg [[META48]] -// CHECK-DEBUG-NEXT: [[TMP9:%.*]] = load i32, ptr [[DOTSTEP]], align 4, !dbg [[META48]] -// 
CHECK-DEBUG-NEXT: [[DIV:%.*]] = udiv i32 [[ADD]], [[TMP9]], !dbg [[META48]]
-// CHECK-DEBUG-NEXT: br label [[COND_END:%.*]], !dbg [[META48]]
+// CHECK-DEBUG-NEXT: [[TMP6:%.*]] = load i32, ptr [[DOTSTOP]], align 4, !dbg [[META49]]
+// CHECK-DEBUG-NEXT: [[TMP7:%.*]] = load i32, ptr [[DOTSTART]], align 4, !dbg [[META49]]
+// CHECK-DEBUG-NEXT: [[SUB:%.*]] = sub nsw i32 [[TMP6]], [[TMP7]], !dbg [[META49]]
+// CHECK-DEBUG-NEXT: [[TMP8:%.*]] = load i32, ptr [[DOTSTEP]], align 4, !dbg [[META49]]
+// CHECK-DEBUG-NEXT: [[SUB1:%.*]] = sub i32 [[TMP8]], 1, !dbg [[META49]]
+// CHECK-DEBUG-NEXT: [[ADD:%.*]] = add i32 [[SUB]], [[SUB1]], !dbg [[META49]]
+// CHECK-DEBUG-NEXT: [[TMP9:%.*]] = load i32, ptr [[DOTSTEP]], align 4, !dbg [[META49]]
+// CHECK-DEBUG-NEXT: [[DIV:%.*]] = udiv i32 [[ADD]], [[TMP9]], !dbg [[META49]]
+// CHECK-DEBUG-NEXT: br label [[COND_END:%.*]], !dbg [[META49]]
 // CHECK-DEBUG: cond.false:
-// CHECK-DEBUG-NEXT: br label [[COND_END]], !dbg [[META48]]
+// CHECK-DEBUG-NEXT: br label [[COND_END]], !dbg [[META49]]
 // CHECK-DEBUG: cond.end:
-// CHECK-DEBUG-NEXT: [[COND:%.*]] = phi i32 [ [[DIV]], [[COND_TRUE]] ], [ 0, [[COND_FALSE]] ], !dbg [[META48]]
-// CHECK-DEBUG-NEXT: [[TMP10:%.*]] = load ptr, ptr [[DISTANCE_ADDR]], align 8, !dbg [[META48]]
-// CHECK-DEBUG-NEXT: store i32 [[COND]], ptr [[TMP10]], align 4, !dbg [[META48]]
-// CHECK-DEBUG-NEXT: ret void, !dbg [[DBG50:![0-9]+]]
+// CHECK-DEBUG-NEXT: [[COND:%.*]] = phi i32 [ [[DIV]], [[COND_TRUE]] ], [ 0, [[COND_FALSE]] ], !dbg [[META49]]
+// CHECK-DEBUG-NEXT: [[TMP10:%.*]] = load ptr, ptr [[DISTANCE_ADDR]], align 8, !dbg [[META49]], !nonnull [[META12]], !align [[META47]]
+// CHECK-DEBUG-NEXT: store i32 [[COND]], ptr [[TMP10]], align 4, !dbg [[META49]]
+// CHECK-DEBUG-NEXT: ret void, !dbg [[DBG51:![0-9]+]]
 //
 //
 // CHECK-DEBUG-LABEL: define {{[^@]+}}@__captured_stmt.1
-// CHECK-DEBUG-SAME: (ptr noundef nonnull align 4 dereferenceable(4) [[LOOPVAR:%.*]], i32 noundef [[LOGICAL:%.*]], ptr noalias noundef [[__CONTEXT:%.*]]) #[[ATTR3]] !dbg [[DBG52:![0-9]+]] {
+// CHECK-DEBUG-SAME: (ptr noundef nonnull align 4 dereferenceable(4) [[LOOPVAR:%.*]], i32 noundef [[LOGICAL:%.*]], ptr noalias noundef [[__CONTEXT:%.*]]) #[[ATTR3]] !dbg [[DBG53:![0-9]+]] {
 // CHECK-DEBUG-NEXT: entry:
 // CHECK-DEBUG-NEXT: [[LOOPVAR_ADDR:%.*]] = alloca ptr, align 8
 // CHECK-DEBUG-NEXT: [[LOGICAL_ADDR:%.*]] = alloca i32, align 4
 // CHECK-DEBUG-NEXT: [[__CONTEXT_ADDR:%.*]] = alloca ptr, align 8
 // CHECK-DEBUG-NEXT: store ptr [[LOOPVAR]], ptr [[LOOPVAR_ADDR]], align 8
-// CHECK-DEBUG-NEXT: #dbg_declare(ptr [[LOOPVAR_ADDR]], [[META60:![0-9]+]], !DIExpression(), [[META61:![0-9]+]])
+// CHECK-DEBUG-NEXT: #dbg_declare(ptr [[LOOPVAR_ADDR]], [[META61:![0-9]+]], !DIExpression(), [[META62:![0-9]+]])
 // CHECK-DEBUG-NEXT: store i32 [[LOGICAL]], ptr [[LOGICAL_ADDR]], align 4
-// CHECK-DEBUG-NEXT: #dbg_declare(ptr [[LOGICAL_ADDR]], [[META62:![0-9]+]], !DIExpression(), [[META61]])
+// CHECK-DEBUG-NEXT: #dbg_declare(ptr [[LOGICAL_ADDR]], [[META63:![0-9]+]], !DIExpression(), [[META62]])
 // CHECK-DEBUG-NEXT: store ptr [[__CONTEXT]], ptr [[__CONTEXT_ADDR]], align 8
-// CHECK-DEBUG-NEXT: #dbg_declare(ptr [[__CONTEXT_ADDR]], [[META63:![0-9]+]], !DIExpression(), [[META61]])
+// CHECK-DEBUG-NEXT: #dbg_declare(ptr [[__CONTEXT_ADDR]], [[META64:![0-9]+]], !DIExpression(), [[META62]])
 // CHECK-DEBUG-NEXT: [[TMP0:%.*]] = load ptr, ptr [[__CONTEXT_ADDR]], align 8
-// CHECK-DEBUG-NEXT: [[TMP1:%.*]] = getelementptr inbounds nuw [[STRUCT_ANON_0:%.*]], ptr [[TMP0]], i32 0, i32 0, !dbg [[DBG64:![0-9]+]]
-// CHECK-DEBUG-NEXT: [[TMP2:%.*]] = load i32, ptr [[TMP1]], align 4, !dbg [[DBG64]]
-// CHECK-DEBUG-NEXT: [[TMP3:%.*]] = load i32, ptr [[LOGICAL_ADDR]], align 4, !dbg [[DBG66:![0-9]+]]
-// CHECK-DEBUG-NEXT: [[MUL:%.*]] = mul i32 1, [[TMP3]], !dbg [[DBG66]]
-// CHECK-DEBUG-NEXT: [[ADD:%.*]] = add i32 [[TMP2]], [[MUL]], !dbg [[DBG66]]
-// CHECK-DEBUG-NEXT: [[TMP4:%.*]] = load ptr, ptr [[LOOPVAR_ADDR]], align 8, !dbg [[DBG66]]
-// CHECK-DEBUG-NEXT: store i32 [[ADD]], ptr [[TMP4]], align 4, !dbg [[META61]]
-// CHECK-DEBUG-NEXT: ret void, !dbg [[DBG64]]
+// CHECK-DEBUG-NEXT: [[TMP1:%.*]] = getelementptr inbounds nuw [[STRUCT_ANON_0:%.*]], ptr [[TMP0]], i32 0, i32 0, !dbg [[DBG65:![0-9]+]]
+// CHECK-DEBUG-NEXT: [[TMP2:%.*]] = load i32, ptr [[TMP1]], align 4, !dbg [[DBG65]]
+// CHECK-DEBUG-NEXT: [[TMP3:%.*]] = load i32, ptr [[LOGICAL_ADDR]], align 4, !dbg [[DBG67:![0-9]+]]
+// CHECK-DEBUG-NEXT: [[MUL:%.*]] = mul i32 1, [[TMP3]], !dbg [[DBG67]]
+// CHECK-DEBUG-NEXT: [[ADD:%.*]] = add i32 [[TMP2]], [[MUL]], !dbg [[DBG67]]
+// CHECK-DEBUG-NEXT: [[TMP4:%.*]] = load ptr, ptr [[LOOPVAR_ADDR]], align 8, !dbg [[DBG67]], !nonnull [[META12]], !align [[META47]]
+// CHECK-DEBUG-NEXT: store i32 [[ADD]], ptr [[TMP4]], align 4, !dbg [[META62]]
+// CHECK-DEBUG-NEXT: ret void, !dbg [[DBG65]]
 //
 //
 // CHECK-DEBUG-LABEL: define {{[^@]+}}@_Z14parallel_for_1Pfid
-// CHECK-DEBUG-SAME: (ptr noundef [[R:%.*]], i32 noundef [[A:%.*]], double noundef [[B:%.*]]) #[[ATTR0]] !dbg [[DBG69:![0-9]+]] {
+// CHECK-DEBUG-SAME: (ptr noundef [[R:%.*]], i32 noundef [[A:%.*]], double noundef [[B:%.*]]) #[[ATTR0]] !dbg [[DBG70:![0-9]+]] {
 // CHECK-DEBUG-NEXT: entry:
 // CHECK-DEBUG-NEXT: [[STRUCTARG:%.*]] = alloca { ptr, ptr, ptr }, align 8
 // CHECK-DEBUG-NEXT: [[R_ADDR:%.*]] = alloca ptr, align 8
 // CHECK-DEBUG-NEXT: [[A_ADDR:%.*]] = alloca i32, align 4
 // CHECK-DEBUG-NEXT: [[B_ADDR:%.*]] = alloca double, align 8
 // CHECK-DEBUG-NEXT: store ptr [[R]], ptr [[R_ADDR]], align 8
-// CHECK-DEBUG-NEXT: #dbg_declare(ptr [[R_ADDR]], [[META75:![0-9]+]], !DIExpression(), [[META76:![0-9]+]])
+// CHECK-DEBUG-NEXT: #dbg_declare(ptr [[R_ADDR]], [[META76:![0-9]+]], !DIExpression(), [[META77:![0-9]+]])
 // CHECK-DEBUG-NEXT: store i32 [[A]], ptr [[A_ADDR]], align 4
-// CHECK-DEBUG-NEXT: #dbg_declare(ptr [[A_ADDR]], [[META77:![0-9]+]], !DIExpression(), [[META78:![0-9]+]])
+// CHECK-DEBUG-NEXT: #dbg_declare(ptr [[A_ADDR]], [[META78:![0-9]+]], !DIExpression(), [[META79:![0-9]+]])
 // CHECK-DEBUG-NEXT: store double [[B]], ptr [[B_ADDR]], align 8
-// CHECK-DEBUG-NEXT: #dbg_declare(ptr [[B_ADDR]], [[META79:![0-9]+]], !DIExpression(), [[META80:![0-9]+]])
-// CHECK-DEBUG-NEXT: [[OMP_GLOBAL_THREAD_NUM:%.*]] = call i32 @__kmpc_global_thread_num(ptr @[[GLOB6:[0-9]+]]), !dbg [[DBG81:![0-9]+]]
+// CHECK-DEBUG-NEXT: #dbg_declare(ptr [[B_ADDR]], [[META80:![0-9]+]], !DIExpression(), [[META81:![0-9]+]])
+// CHECK-DEBUG-NEXT: [[OMP_GLOBAL_THREAD_NUM:%.*]] = call i32 @__kmpc_global_thread_num(ptr @[[GLOB6:[0-9]+]]), !dbg [[DBG82:![0-9]+]]
 // CHECK-DEBUG-NEXT: br label [[OMP_PARALLEL:%.*]]
 // CHECK-DEBUG: omp_parallel:
 // CHECK-DEBUG-NEXT: [[GEP_A_ADDR:%.*]] = getelementptr { ptr, ptr, ptr }, ptr [[STRUCTARG]], i32 0, i32 0
@@ -1659,17 +1675,17 @@ void parallel_for_2(float *r, int a, double b) {
 // CHECK-DEBUG-NEXT: store ptr [[B_ADDR]], ptr [[GEP_B_ADDR]], align 8
 // CHECK-DEBUG-NEXT: [[GEP_R_ADDR:%.*]] = getelementptr { ptr, ptr, ptr }, ptr [[STRUCTARG]], i32 0, i32 2
 // CHECK-DEBUG-NEXT: store ptr [[R_ADDR]], ptr [[GEP_R_ADDR]], align 8
-// CHECK-DEBUG-NEXT: call void (ptr, i32, ptr, ...) @__kmpc_fork_call(ptr @[[GLOB6]], i32 1, ptr @_Z14parallel_for_1Pfid..omp_par.4, ptr [[STRUCTARG]]), !dbg [[DBG82:![0-9]+]]
+// CHECK-DEBUG-NEXT: call void (ptr, i32, ptr, ...) @__kmpc_fork_call(ptr @[[GLOB6]], i32 1, ptr @_Z14parallel_for_1Pfid..omp_par.4, ptr [[STRUCTARG]]), !dbg [[DBG83:![0-9]+]]
 // CHECK-DEBUG-NEXT: br label [[OMP_PAR_EXIT:%.*]]
 // CHECK-DEBUG: omp.par.exit:
-// CHECK-DEBUG-NEXT: ret void, !dbg [[DBG84:![0-9]+]]
+// CHECK-DEBUG-NEXT: ret void, !dbg [[DBG85:![0-9]+]]
 //
 //
 // CHECK-DEBUG-LABEL: define {{[^@]+}}@_Z14parallel_for_1Pfid..omp_par.4
-// CHECK-DEBUG-SAME: (ptr noalias [[TID_ADDR:%.*]], ptr noalias [[ZERO_ADDR:%.*]], ptr [[TMP0:%.*]]) #[[ATTR1]] !dbg [[DBG85:![0-9]+]] {
+// CHECK-DEBUG-SAME: (ptr noalias [[TID_ADDR:%.*]], ptr noalias [[ZERO_ADDR:%.*]], ptr [[TMP0:%.*]]) #[[ATTR1]] !dbg [[DBG86:![0-9]+]] {
 // CHECK-DEBUG-NEXT: omp.par.entry:
 // CHECK-DEBUG-NEXT: [[GEP_A_ADDR:%.*]] = getelementptr { ptr, ptr, ptr }, ptr [[TMP0]], i32 0, i32 0
-// CHECK-DEBUG-NEXT: [[LOADGEP_A_ADDR:%.*]] = load ptr, ptr [[GEP_A_ADDR]], align 8, !align [[META86:![0-9]+]]
+// CHECK-DEBUG-NEXT: [[LOADGEP_A_ADDR:%.*]] = load ptr, ptr [[GEP_A_ADDR]], align 8, !align [[META47]]
 // CHECK-DEBUG-NEXT: [[GEP_B_ADDR:%.*]] = getelementptr { ptr, ptr, ptr }, ptr [[TMP0]], i32 0, i32 1
 // CHECK-DEBUG-NEXT: [[LOADGEP_B_ADDR:%.*]] = load ptr, ptr [[GEP_B_ADDR]], align 8, !align [[META87:![0-9]+]]
 // CHECK-DEBUG-NEXT: [[GEP_R_ADDR:%.*]] = getelementptr { ptr, ptr, ptr }, ptr [[TMP0]], i32 0, i32 2
@@ -1700,6 +1716,8 @@ void parallel_for_2(float *r, int a, double b) {
 // CHECK-DEBUG: omp.par.region.parallel.after:
 // CHECK-DEBUG-NEXT: br label [[OMP_PAR_PRE_FINALIZE:%.*]]
 // CHECK-DEBUG: omp.par.pre_finalize:
+// CHECK-DEBUG-NEXT: br label [[DOTFINI16:%.*]]
+// CHECK-DEBUG: .fini16:
 // CHECK-DEBUG-NEXT: br label [[OMP_PAR_EXIT_EXITSTUB:%.*]], !dbg [[DBG100]]
 // CHECK-DEBUG: omp.par.exit.exitStub:
 // CHECK-DEBUG-NEXT: ret void
@@ -1709,7 +1727,7 @@ void parallel_for_2(float *r, int a, double b) {
 // CHECK-DEBUG-SAME: (ptr noalias [[TID_ADDR2:%.*]], ptr noalias [[ZERO_ADDR3:%.*]], ptr [[TMP0:%.*]]) #[[ATTR1]] !dbg [[DBG101:![0-9]+]] {
 // CHECK-DEBUG-NEXT: omp.par.entry4:
 // CHECK-DEBUG-NEXT: [[GEP_A_ADDR:%.*]] = getelementptr { ptr, ptr, ptr }, ptr [[TMP0]], i32 0, i32 0
-// CHECK-DEBUG-NEXT: [[LOADGEP_A_ADDR:%.*]] = load ptr, ptr [[GEP_A_ADDR]], align 8, !align [[META86]]
+// CHECK-DEBUG-NEXT: [[LOADGEP_A_ADDR:%.*]] = load ptr, ptr [[GEP_A_ADDR]], align 8, !align [[META47]]
 // CHECK-DEBUG-NEXT: [[GEP_B_ADDR:%.*]] = getelementptr { ptr, ptr, ptr }, ptr [[TMP0]], i32 0, i32 1
 // CHECK-DEBUG-NEXT: [[LOADGEP_B_ADDR:%.*]] = load ptr, ptr [[GEP_B_ADDR]], align 8, !align [[META87]]
 // CHECK-DEBUG-NEXT: [[GEP_R_ADDR:%.*]] = getelementptr { ptr, ptr, ptr }, ptr [[TMP0]], i32 0, i32 2
@@ -1769,6 +1787,8 @@ void parallel_for_2(float *r, int a, double b) {
 // CHECK-DEBUG: omp.par.region5.parallel.after:
 // CHECK-DEBUG-NEXT: br label [[OMP_PAR_PRE_FINALIZE6:%.*]]
 // CHECK-DEBUG: omp.par.pre_finalize6:
+// CHECK-DEBUG-NEXT: br label [[DOTFINI:%.*]]
+// CHECK-DEBUG: .fini:
 // CHECK-DEBUG-NEXT: br label [[OMP_PAR_EXIT7_EXITSTUB:%.*]], !dbg [[DBG117]]
 // CHECK-DEBUG: omp_loop.body:
 // CHECK-DEBUG-NEXT: [[TMP9:%.*]] = add i32 [[OMP_LOOP_IV]], [[TMP6]], !dbg [[DBG116]]
@@ -1803,7 +1823,7 @@ void parallel_for_2(float *r, int a, double b) {
 // CHECK-DEBUG-NEXT: [[TMP0:%.*]] = load ptr, ptr [[__CONTEXT_ADDR]], align 8
 // CHECK-DEBUG-NEXT: #dbg_declare(ptr [[DOTSTART]], [[META128:![0-9]+]], !DIExpression(), [[META130:![0-9]+]])
 // CHECK-DEBUG-NEXT: [[TMP1:%.*]] = getelementptr inbounds nuw [[STRUCT_ANON_1:%.*]], ptr [[TMP0]], i32 0, i32 0, !dbg [[DBG131:![0-9]+]]
-// CHECK-DEBUG-NEXT: [[TMP2:%.*]] = load ptr, ptr [[TMP1]], align 8, !dbg [[DBG131]]
+// CHECK-DEBUG-NEXT: [[TMP2:%.*]] = load ptr, ptr [[TMP1]], align 8, !dbg [[DBG131]], !nonnull [[META12]], !align [[META47]]
 // CHECK-DEBUG-NEXT: [[TMP3:%.*]] = load i32, ptr [[TMP2]], align 4, !dbg [[DBG131]]
 // CHECK-DEBUG-NEXT: store i32 [[TMP3]], ptr [[DOTSTART]], align 4, !dbg [[META130]]
 // CHECK-DEBUG-NEXT: #dbg_declare(ptr [[DOTSTOP]], [[META133:![0-9]+]], !DIExpression(), [[META134:![0-9]+]])
@@ -1828,7 +1848,7 @@ void parallel_for_2(float *r, int a, double b) {
 // CHECK-DEBUG-NEXT: br label [[COND_END]], !dbg [[META134]]
 // CHECK-DEBUG: cond.end:
 // CHECK-DEBUG-NEXT: [[COND:%.*]] = phi i32 [ [[DIV]], [[COND_TRUE]] ], [ 0, [[COND_FALSE]] ], !dbg [[META134]]
-// CHECK-DEBUG-NEXT: [[TMP10:%.*]] = load ptr, ptr [[DISTANCE_ADDR]], align 8, !dbg [[META134]]
+// CHECK-DEBUG-NEXT: [[TMP10:%.*]] = load ptr, ptr [[DISTANCE_ADDR]], align 8, !dbg [[META134]], !nonnull [[META12]], !align [[META47]]
 // CHECK-DEBUG-NEXT: store i32 [[COND]], ptr [[TMP10]], align 4, !dbg [[META134]]
 // CHECK-DEBUG-NEXT: ret void, !dbg [[DBG136:![0-9]+]]
 //
@@ -1851,7 +1871,7 @@ void parallel_for_2(float *r, int a, double b) {
 // CHECK-DEBUG-NEXT: [[TMP3:%.*]] = load i32, ptr [[LOGICAL_ADDR]], align 4, !dbg [[DBG145:![0-9]+]]
 // CHECK-DEBUG-NEXT: [[MUL:%.*]] = mul i32 1, [[TMP3]], !dbg [[DBG145]]
 // CHECK-DEBUG-NEXT: [[ADD:%.*]] = add i32 [[TMP2]], [[MUL]], !dbg [[DBG145]]
-// CHECK-DEBUG-NEXT: [[TMP4:%.*]] = load ptr, ptr [[LOOPVAR_ADDR]], align 8, !dbg [[DBG145]]
+// CHECK-DEBUG-NEXT: [[TMP4:%.*]] = load ptr, ptr [[LOOPVAR_ADDR]], align 8, !dbg [[DBG145]], !nonnull [[META12]], !align [[META47]]
 // CHECK-DEBUG-NEXT: store i32 [[ADD]], ptr [[TMP4]], align 4, !dbg [[META140]]
 // CHECK-DEBUG-NEXT: ret void, !dbg [[DBG143]]
 //
@@ -1863,14 +1883,14 @@ void parallel_for_2(float *r, int a, double b) {
 // CHECK-DEBUG-NEXT: [[R_ADDR:%.*]] = alloca ptr, align 8
 // CHECK-DEBUG-NEXT: [[A_ADDR:%.*]] = alloca i32, align 4
 // CHECK-DEBUG-NEXT: [[B_ADDR:%.*]] = alloca double, align 8
-// CHECK-DEBUG-NEXT: [[I188:%.*]] = alloca i32, align 4
-// CHECK-DEBUG-NEXT: [[AGG_CAPTURED189:%.*]] = alloca [[STRUCT_ANON_17:%.*]], align 8
-// CHECK-DEBUG-NEXT: [[AGG_CAPTURED190:%.*]] = alloca [[STRUCT_ANON_18:%.*]], align 4
-// CHECK-DEBUG-NEXT: [[DOTCOUNT_ADDR191:%.*]] = alloca i32, align 4
-// CHECK-DEBUG-NEXT: [[P_LASTITER206:%.*]] = alloca i32, align 4
-// CHECK-DEBUG-NEXT: [[P_LOWERBOUND207:%.*]] = alloca i32, align 4
-// CHECK-DEBUG-NEXT: [[P_UPPERBOUND208:%.*]] = alloca i32, align 4
-// CHECK-DEBUG-NEXT: [[P_STRIDE209:%.*]] = alloca i32, align 4
+// CHECK-DEBUG-NEXT: [[I191:%.*]] = alloca i32, align 4
+// CHECK-DEBUG-NEXT: [[AGG_CAPTURED192:%.*]] = alloca [[STRUCT_ANON_17:%.*]], align 8
+// CHECK-DEBUG-NEXT: [[AGG_CAPTURED193:%.*]] = alloca [[STRUCT_ANON_18:%.*]], align 4
+// CHECK-DEBUG-NEXT: [[DOTCOUNT_ADDR194:%.*]] = alloca i32, align 4
+// CHECK-DEBUG-NEXT: [[P_LASTITER209:%.*]] = alloca i32, align 4
+// CHECK-DEBUG-NEXT: [[P_LOWERBOUND210:%.*]] = alloca i32, align 4
+// CHECK-DEBUG-NEXT: [[P_UPPERBOUND211:%.*]] = alloca i32, align 4
+// CHECK-DEBUG-NEXT: [[P_STRIDE212:%.*]] = alloca i32, align 4
 // CHECK-DEBUG-NEXT: store ptr [[R]], ptr [[R_ADDR]], align 8
 // CHECK-DEBUG-NEXT: #dbg_declare(ptr [[R_ADDR]], [[META147:![0-9]+]], !DIExpression(), [[META148:![0-9]+]])
 // CHECK-DEBUG-NEXT: store i32 [[A]], ptr [[A_ADDR]], align 4
@@ -1889,54 +1909,54 @@ void parallel_for_2(float *r, int a, double b) {
 // CHECK-DEBUG-NEXT: call void (ptr, i32, ptr, ...) @__kmpc_fork_call(ptr @[[GLOB13]], i32 1, ptr @_Z14parallel_for_2Pfid..omp_par.23, ptr [[STRUCTARG]]), !dbg [[DBG154:![0-9]+]]
 // CHECK-DEBUG-NEXT: br label [[OMP_PAR_EXIT:%.*]]
 // CHECK-DEBUG: omp.par.exit:
-// CHECK-DEBUG-NEXT: #dbg_declare(ptr [[I188]], [[META158:![0-9]+]], !DIExpression(), [[META161:![0-9]+]])
-// CHECK-DEBUG-NEXT: store i32 0, ptr [[I188]], align 4, !dbg [[META161]]
-// CHECK-DEBUG-NEXT: [[TMP0:%.*]] = getelementptr inbounds nuw [[STRUCT_ANON_17]], ptr [[AGG_CAPTURED189]], i32 0, i32 0, !dbg [[DBG162:![0-9]+]]
-// CHECK-DEBUG-NEXT: store ptr [[I188]], ptr [[TMP0]], align 8, !dbg [[DBG162]]
-// CHECK-DEBUG-NEXT: [[TMP1:%.*]] = getelementptr inbounds nuw [[STRUCT_ANON_18]], ptr [[AGG_CAPTURED190]], i32 0, i32 0, !dbg [[DBG162]]
-// CHECK-DEBUG-NEXT: [[TMP2:%.*]] = load i32, ptr [[I188]], align 4, !dbg [[DBG163:![0-9]+]]
+// CHECK-DEBUG-NEXT: #dbg_declare(ptr [[I191]], [[META158:![0-9]+]], !DIExpression(), [[META161:![0-9]+]])
+// CHECK-DEBUG-NEXT: store i32 0, ptr [[I191]], align 4, !dbg [[META161]]
+// CHECK-DEBUG-NEXT: [[TMP0:%.*]] = getelementptr inbounds nuw [[STRUCT_ANON_17]], ptr [[AGG_CAPTURED192]], i32 0, i32 0, !dbg [[DBG162:![0-9]+]]
+// CHECK-DEBUG-NEXT: store ptr [[I191]], ptr [[TMP0]], align 8, !dbg [[DBG162]]
+// CHECK-DEBUG-NEXT: [[TMP1:%.*]] = getelementptr inbounds nuw [[STRUCT_ANON_18]], ptr [[AGG_CAPTURED193]], i32 0, i32 0, !dbg [[DBG162]]
+// CHECK-DEBUG-NEXT: [[TMP2:%.*]] = load i32, ptr [[I191]], align 4, !dbg [[DBG163:![0-9]+]]
 // CHECK-DEBUG-NEXT: store i32 [[TMP2]], ptr [[TMP1]], align 4, !dbg [[DBG162]]
-// CHECK-DEBUG-NEXT: call void @__captured_stmt.19(ptr [[DOTCOUNT_ADDR191]], ptr [[AGG_CAPTURED189]]), !dbg [[DBG162]]
-// CHECK-DEBUG-NEXT: [[DOTCOUNT192:%.*]] = load i32, ptr [[DOTCOUNT_ADDR191]], align 4, !dbg [[DBG162]]
-// CHECK-DEBUG-NEXT: br label [[OMP_LOOP_PREHEADER193:%.*]], !dbg [[DBG162]]
-// CHECK-DEBUG: omp_loop.preheader193:
-// CHECK-DEBUG-NEXT: store i32 0, ptr [[P_LOWERBOUND207]], align 4, !dbg [[DBG162]]
-// CHECK-DEBUG-NEXT: [[TMP3:%.*]] = sub i32 [[DOTCOUNT192]], 1, !dbg [[DBG162]]
-// CHECK-DEBUG-NEXT: store i32 [[TMP3]], ptr [[P_UPPERBOUND208]], align 4, !dbg [[DBG162]]
-// CHECK-DEBUG-NEXT: store i32 1, ptr [[P_STRIDE209]], align 4, !dbg [[DBG162]]
-// CHECK-DEBUG-NEXT: [[OMP_GLOBAL_THREAD_NUM210:%.*]] = call i32 @__kmpc_global_thread_num(ptr @[[GLOB42:[0-9]+]]), !dbg [[DBG162]]
-// CHECK-DEBUG-NEXT: call void @__kmpc_for_static_init_4u(ptr @[[GLOB42]], i32 [[OMP_GLOBAL_THREAD_NUM210]], i32 34, ptr [[P_LASTITER206]], ptr [[P_LOWERBOUND207]], ptr [[P_UPPERBOUND208]], ptr [[P_STRIDE209]], i32 1, i32 0), !dbg [[DBG162]]
-// CHECK-DEBUG-NEXT: [[TMP4:%.*]] = load i32, ptr [[P_LOWERBOUND207]], align 4, !dbg [[DBG162]]
-// CHECK-DEBUG-NEXT: [[TMP5:%.*]] = load i32, ptr [[P_UPPERBOUND208]], align 4, !dbg [[DBG162]]
-// CHECK-DEBUG-NEXT: [[TRIP_COUNT_MINUS1211:%.*]] = sub i32 [[TMP5]], [[TMP4]], !dbg [[DBG162]]
-// CHECK-DEBUG-NEXT: [[TMP6:%.*]] = add i32 [[TRIP_COUNT_MINUS1211]], 1, !dbg [[DBG162]]
-// CHECK-DEBUG-NEXT: br label [[OMP_LOOP_HEADER194:%.*]], !dbg [[DBG162]]
-// CHECK-DEBUG: omp_loop.header194:
-// CHECK-DEBUG-NEXT: [[OMP_LOOP_IV200:%.*]] = phi i32 [ 0, [[OMP_LOOP_PREHEADER193]] ], [ [[OMP_LOOP_NEXT202:%.*]], [[OMP_LOOP_INC197:%.*]] ], !dbg [[DBG162]]
-// CHECK-DEBUG-NEXT: br label [[OMP_LOOP_COND195:%.*]], !dbg [[DBG162]]
-// CHECK-DEBUG: omp_loop.cond195:
-// CHECK-DEBUG-NEXT: [[OMP_LOOP_CMP201:%.*]] = icmp ult i32 [[OMP_LOOP_IV200]], [[TMP6]], !dbg [[DBG162]]
-// CHECK-DEBUG-NEXT: br i1 [[OMP_LOOP_CMP201]], label [[OMP_LOOP_BODY196:%.*]], label [[OMP_LOOP_EXIT198:%.*]], !dbg [[DBG162]]
-// CHECK-DEBUG: omp_loop.body196:
-// CHECK-DEBUG-NEXT: [[TMP7:%.*]] = add i32 [[OMP_LOOP_IV200]], [[TMP4]], !dbg [[DBG164:![0-9]+]]
-// CHECK-DEBUG-NEXT: call void @__captured_stmt.20(ptr [[I188]], i32 [[TMP7]], ptr [[AGG_CAPTURED190]]), !dbg [[DBG162]]
+// CHECK-DEBUG-NEXT: call void @__captured_stmt.19(ptr [[DOTCOUNT_ADDR194]], ptr [[AGG_CAPTURED192]]), !dbg [[DBG162]]
+// CHECK-DEBUG-NEXT: [[DOTCOUNT195:%.*]] = load i32, ptr [[DOTCOUNT_ADDR194]], align 4, !dbg [[DBG162]]
+// CHECK-DEBUG-NEXT: br label [[OMP_LOOP_PREHEADER196:%.*]], !dbg [[DBG162]]
+// CHECK-DEBUG: omp_loop.preheader196:
+// CHECK-DEBUG-NEXT: store i32 0, ptr [[P_LOWERBOUND210]], align 4, !dbg [[DBG162]]
+// CHECK-DEBUG-NEXT: [[TMP3:%.*]] = sub i32 [[DOTCOUNT195]], 1, !dbg [[DBG162]]
+// CHECK-DEBUG-NEXT: store i32 [[TMP3]], ptr [[P_UPPERBOUND211]], align 4, !dbg [[DBG162]]
+// CHECK-DEBUG-NEXT: store i32 1, ptr [[P_STRIDE212]], align 4, !dbg [[DBG162]]
+// CHECK-DEBUG-NEXT: [[OMP_GLOBAL_THREAD_NUM213:%.*]] = call i32 @__kmpc_global_thread_num(ptr @[[GLOB42:[0-9]+]]), !dbg [[DBG162]]
+// CHECK-DEBUG-NEXT: call void @__kmpc_for_static_init_4u(ptr @[[GLOB42]], i32 [[OMP_GLOBAL_THREAD_NUM213]], i32 34, ptr [[P_LASTITER209]], ptr [[P_LOWERBOUND210]], ptr [[P_UPPERBOUND211]], ptr [[P_STRIDE212]], i32 1, i32 0), !dbg [[DBG162]]
+// CHECK-DEBUG-NEXT: [[TMP4:%.*]] = load i32, ptr [[P_LOWERBOUND210]], align 4, !dbg [[DBG162]]
+// CHECK-DEBUG-NEXT: [[TMP5:%.*]] = load i32, ptr [[P_UPPERBOUND211]], align 4, !dbg [[DBG162]]
+// CHECK-DEBUG-NEXT: [[TRIP_COUNT_MINUS1214:%.*]] = sub i32 [[TMP5]], [[TMP4]], !dbg [[DBG162]]
+// CHECK-DEBUG-NEXT: [[TMP6:%.*]] = add i32 [[TRIP_COUNT_MINUS1214]], 1, !dbg [[DBG162]]
+// CHECK-DEBUG-NEXT: br label [[OMP_LOOP_HEADER197:%.*]], !dbg [[DBG162]]
+// CHECK-DEBUG: omp_loop.header197:
+// CHECK-DEBUG-NEXT: [[OMP_LOOP_IV203:%.*]] = phi i32 [ 0, [[OMP_LOOP_PREHEADER196]] ], [ [[OMP_LOOP_NEXT205:%.*]], [[OMP_LOOP_INC200:%.*]] ], !dbg [[DBG162]]
+// CHECK-DEBUG-NEXT: br label [[OMP_LOOP_COND198:%.*]], !dbg [[DBG162]]
+// CHECK-DEBUG: omp_loop.cond198:
+// CHECK-DEBUG-NEXT: [[OMP_LOOP_CMP204:%.*]] = icmp ult i32 [[OMP_LOOP_IV203]], [[TMP6]], !dbg [[DBG162]]
+// CHECK-DEBUG-NEXT: br i1 [[OMP_LOOP_CMP204]], label [[OMP_LOOP_BODY199:%.*]], label [[OMP_LOOP_EXIT201:%.*]], !dbg [[DBG162]]
+// CHECK-DEBUG: omp_loop.body199:
+// CHECK-DEBUG-NEXT: [[TMP7:%.*]] = add i32 [[OMP_LOOP_IV203]], [[TMP4]], !dbg [[DBG164:![0-9]+]]
+// CHECK-DEBUG-NEXT: call void @__captured_stmt.20(ptr [[I191]], i32 [[TMP7]], ptr [[AGG_CAPTURED193]]), !dbg [[DBG162]]
 // CHECK-DEBUG-NEXT: [[TMP8:%.*]] = load i32, ptr [[A_ADDR]], align 4, !dbg [[DBG165:![0-9]+]]
-// CHECK-DEBUG-NEXT: [[CONV203:%.*]] = sitofp i32 [[TMP8]] to double, !dbg [[DBG165]]
+// CHECK-DEBUG-NEXT: [[CONV206:%.*]] = sitofp i32 [[TMP8]] to double, !dbg [[DBG165]]
 // CHECK-DEBUG-NEXT: [[TMP9:%.*]] = load double, ptr [[B_ADDR]], align 8, !dbg [[DBG164]]
-// CHECK-DEBUG-NEXT: [[ADD204:%.*]] = fadd double [[CONV203]], [[TMP9]], !dbg [[DBG166:![0-9]+]]
-// CHECK-DEBUG-NEXT: [[CONV205:%.*]] = fptrunc double [[ADD204]] to float, !dbg [[DBG165]]
+// CHECK-DEBUG-NEXT: [[ADD207:%.*]] = fadd double [[CONV206]], [[TMP9]], !dbg [[DBG166:![0-9]+]]
+// CHECK-DEBUG-NEXT: [[CONV208:%.*]] = fptrunc double [[ADD207]] to float, !dbg [[DBG165]]
 // CHECK-DEBUG-NEXT: [[TMP10:%.*]] = load ptr, ptr [[R_ADDR]], align 8, !dbg [[DBG167:![0-9]+]]
-// CHECK-DEBUG-NEXT: store float [[CONV205]], ptr [[TMP10]], align 4, !dbg [[DBG168:![0-9]+]]
-// CHECK-DEBUG-NEXT: br label [[OMP_LOOP_INC197]], !dbg [[DBG162]]
-// CHECK-DEBUG: omp_loop.inc197:
-// CHECK-DEBUG-NEXT: [[OMP_LOOP_NEXT202]] = add nuw i32 [[OMP_LOOP_IV200]], 1, !dbg [[DBG162]]
-// CHECK-DEBUG-NEXT: br label [[OMP_LOOP_HEADER194]], !dbg [[DBG162]]
-// CHECK-DEBUG: omp_loop.exit198:
-// CHECK-DEBUG-NEXT: call void @__kmpc_for_static_fini(ptr @[[GLOB42]], i32 [[OMP_GLOBAL_THREAD_NUM210]]), !dbg [[DBG162]]
-// CHECK-DEBUG-NEXT: [[OMP_GLOBAL_THREAD_NUM212:%.*]] = call i32 @__kmpc_global_thread_num(ptr @[[GLOB42]]), !dbg [[DBG164]]
-// CHECK-DEBUG-NEXT: call void @__kmpc_barrier(ptr @[[GLOB43:[0-9]+]], i32 [[OMP_GLOBAL_THREAD_NUM212]]), !dbg [[DBG164]]
-// CHECK-DEBUG-NEXT: br label [[OMP_LOOP_AFTER199:%.*]], !dbg [[DBG162]]
-// CHECK-DEBUG: omp_loop.after199:
+// CHECK-DEBUG-NEXT: store float [[CONV208]], ptr [[TMP10]], align 4, !dbg [[DBG168:![0-9]+]]
+// CHECK-DEBUG-NEXT: br label [[OMP_LOOP_INC200]], !dbg [[DBG162]]
+// CHECK-DEBUG: omp_loop.inc200:
+// CHECK-DEBUG-NEXT: [[OMP_LOOP_NEXT205]] = add nuw i32 [[OMP_LOOP_IV203]], 1, !dbg [[DBG162]]
+// CHECK-DEBUG-NEXT: br label [[OMP_LOOP_HEADER197]], !dbg [[DBG162]]
+// CHECK-DEBUG: omp_loop.exit201:
+// CHECK-DEBUG-NEXT: call void @__kmpc_for_static_fini(ptr @[[GLOB42]], i32 [[OMP_GLOBAL_THREAD_NUM213]]), !dbg [[DBG162]]
+// CHECK-DEBUG-NEXT: [[OMP_GLOBAL_THREAD_NUM215:%.*]] = call i32 @__kmpc_global_thread_num(ptr @[[GLOB42]]), !dbg [[DBG164]]
+// CHECK-DEBUG-NEXT: call void @__kmpc_barrier(ptr @[[GLOB43:[0-9]+]], i32 [[OMP_GLOBAL_THREAD_NUM215]]), !dbg [[DBG164]]
+// CHECK-DEBUG-NEXT: br label [[OMP_LOOP_AFTER202:%.*]], !dbg [[DBG162]]
+// CHECK-DEBUG: omp_loop.after202:
 // CHECK-DEBUG-NEXT: ret void, !dbg [[DBG169:![0-9]+]]
 //
 //
@@ -1944,16 +1964,16 @@ void parallel_for_2(float *r, int a, double b) {
 // CHECK-DEBUG-SAME: (ptr noalias [[TID_ADDR:%.*]], ptr noalias [[ZERO_ADDR:%.*]], ptr [[TMP0:%.*]]) #[[ATTR1]] !dbg [[DBG170:![0-9]+]] {
 // CHECK-DEBUG-NEXT: omp.par.entry:
 // CHECK-DEBUG-NEXT: [[GEP_A_ADDR:%.*]] = getelementptr { ptr, ptr, ptr }, ptr [[TMP0]], i32 0, i32 0
-// CHECK-DEBUG-NEXT: [[LOADGEP_A_ADDR:%.*]] = load ptr, ptr [[GEP_A_ADDR]], align 8, !align [[META86]]
+// CHECK-DEBUG-NEXT: [[LOADGEP_A_ADDR:%.*]] = load ptr, ptr [[GEP_A_ADDR]], align 8, !align [[META47]]
 // CHECK-DEBUG-NEXT: [[GEP_B_ADDR:%.*]] = getelementptr { ptr, ptr, ptr }, ptr [[TMP0]], i32 0, i32 1
 // CHECK-DEBUG-NEXT: [[LOADGEP_B_ADDR:%.*]] = load ptr, ptr [[GEP_B_ADDR]], align 8, !align [[META87]]
 // CHECK-DEBUG-NEXT: [[GEP_R_ADDR:%.*]] = getelementptr { ptr, ptr, ptr }, ptr [[TMP0]], i32 0, i32 2
 // CHECK-DEBUG-NEXT: [[LOADGEP_R_ADDR:%.*]] = load ptr, ptr [[GEP_R_ADDR]], align 8, !align [[META87]]
 // CHECK-DEBUG-NEXT: [[STRUCTARG:%.*]] = alloca { ptr, ptr, ptr }, align 8
-// CHECK-DEBUG-NEXT: [[P_LASTITER181:%.*]] = alloca i32, align 4
-// CHECK-DEBUG-NEXT: [[P_LOWERBOUND182:%.*]] = alloca i32, align 4
-// CHECK-DEBUG-NEXT: [[P_UPPERBOUND183:%.*]] = alloca i32, align 4
-// CHECK-DEBUG-NEXT: [[P_STRIDE184:%.*]] = alloca i32, align 4
+// CHECK-DEBUG-NEXT: [[P_LASTITER183:%.*]] = alloca i32, align 4
+// CHECK-DEBUG-NEXT: [[P_LOWERBOUND184:%.*]] = alloca i32, align 4
+// CHECK-DEBUG-NEXT: [[P_UPPERBOUND185:%.*]] = alloca i32, align 4
+// CHECK-DEBUG-NEXT: [[P_STRIDE186:%.*]] = alloca i32, align 4
 // CHECK-DEBUG-NEXT: [[P_LASTITER:%.*]] = alloca i32, align 4
 // CHECK-DEBUG-NEXT: [[P_LOWERBOUND:%.*]] = alloca i32, align 4
 // CHECK-DEBUG-NEXT: [[P_UPPERBOUND:%.*]] = alloca i32, align 4
@@ -1966,10 +1986,10 @@ void parallel_for_2(float *r, int a, double b) {
 // CHECK-DEBUG-NEXT: [[AGG_CAPTURED:%.*]] = alloca [[STRUCT_ANON_3:%.*]], align 8
 // CHECK-DEBUG-NEXT: [[AGG_CAPTURED1:%.*]] = alloca [[STRUCT_ANON_4:%.*]], align 4
 // CHECK-DEBUG-NEXT: [[DOTCOUNT_ADDR:%.*]] = alloca i32, align 4
-// CHECK-DEBUG-NEXT: [[I163:%.*]] = alloca i32, align 4
-// CHECK-DEBUG-NEXT: [[AGG_CAPTURED164:%.*]] = alloca [[STRUCT_ANON_15:%.*]], align 8
-// CHECK-DEBUG-NEXT: [[AGG_CAPTURED165:%.*]] = alloca [[STRUCT_ANON_16:%.*]], align 4
-// CHECK-DEBUG-NEXT: [[DOTCOUNT_ADDR166:%.*]] = alloca i32, align 4
+// CHECK-DEBUG-NEXT: [[I165:%.*]] = alloca i32, align 4
+// CHECK-DEBUG-NEXT: [[AGG_CAPTURED166:%.*]] = alloca [[STRUCT_ANON_15:%.*]], align 8
+// CHECK-DEBUG-NEXT: [[AGG_CAPTURED167:%.*]] = alloca [[STRUCT_ANON_16:%.*]], align 4
+// CHECK-DEBUG-NEXT: [[DOTCOUNT_ADDR168:%.*]] = alloca i32, align 4
 // CHECK-DEBUG-NEXT: #dbg_declare(ptr [[LOADGEP_A_ADDR]], [[META171:![0-9]+]], !DIExpression(), [[META172:![0-9]+]])
 // CHECK-DEBUG-NEXT: #dbg_declare(ptr [[LOADGEP_B_ADDR]], [[META173:![0-9]+]], !DIExpression(), [[META174:![0-9]+]])
 // CHECK-DEBUG-NEXT: #dbg_declare(ptr [[LOADGEP_R_ADDR]], [[META175:![0-9]+]], !DIExpression(), [[META176:![0-9]+]])
@@ -2021,59 +2041,61 @@ void parallel_for_2(float *r, int a, double b) {
 // CHECK-DEBUG-NEXT: call void (ptr, i32, ptr, ...) @__kmpc_fork_call(ptr @[[GLOB18]], i32 1, ptr @_Z14parallel_for_2Pfid..omp_par.22, ptr [[STRUCTARG]]), !dbg [[DBG186:![0-9]+]]
 // CHECK-DEBUG-NEXT: br label [[OMP_PAR_EXIT11:%.*]]
 // CHECK-DEBUG: omp.par.exit11:
-// CHECK-DEBUG-NEXT: #dbg_declare(ptr [[I163]], [[META190:![0-9]+]], !DIExpression(), [[META193:![0-9]+]])
-// CHECK-DEBUG-NEXT: store i32 0, ptr [[I163]], align 4, !dbg [[META193]]
-// CHECK-DEBUG-NEXT: [[TMP9:%.*]] = getelementptr inbounds nuw [[STRUCT_ANON_15]], ptr [[AGG_CAPTURED164]], i32 0, i32 0, !dbg [[DBG194:![0-9]+]]
-// CHECK-DEBUG-NEXT: store ptr [[I163]], ptr [[TMP9]], align 8, !dbg [[DBG194]]
-// CHECK-DEBUG-NEXT: [[TMP10:%.*]] = getelementptr inbounds nuw [[STRUCT_ANON_16]], ptr [[AGG_CAPTURED165]], i32 0, i32 0, !dbg [[DBG194]]
-// CHECK-DEBUG-NEXT: [[TMP11:%.*]] = load i32, ptr [[I163]], align 4, !dbg [[DBG195:![0-9]+]]
+// CHECK-DEBUG-NEXT: #dbg_declare(ptr [[I165]], [[META190:![0-9]+]], !DIExpression(), [[META193:![0-9]+]])
+// CHECK-DEBUG-NEXT: store i32 0, ptr [[I165]], align 4, !dbg [[META193]]
+// CHECK-DEBUG-NEXT: [[TMP9:%.*]] = getelementptr inbounds nuw [[STRUCT_ANON_15]], ptr [[AGG_CAPTURED166]], i32 0, i32 0, !dbg [[DBG194:![0-9]+]]
+// CHECK-DEBUG-NEXT: store ptr [[I165]], ptr [[TMP9]], align 8, !dbg [[DBG194]]
+// CHECK-DEBUG-NEXT: [[TMP10:%.*]] = getelementptr inbounds nuw [[STRUCT_ANON_16]], ptr [[AGG_CAPTURED167]], i32 0, i32 0, !dbg [[DBG194]]
+// CHECK-DEBUG-NEXT: [[TMP11:%.*]] = load i32, ptr [[I165]], align 4, !dbg [[DBG195:![0-9]+]]
 // CHECK-DEBUG-NEXT: store i32 [[TMP11]], ptr [[TMP10]], align 4, !dbg [[DBG194]]
-// CHECK-DEBUG-NEXT: call void @__captured_stmt.17(ptr [[DOTCOUNT_ADDR166]], ptr [[AGG_CAPTURED164]]), !dbg [[DBG194]]
-// CHECK-DEBUG-NEXT: [[DOTCOUNT167:%.*]] = load i32, ptr [[DOTCOUNT_ADDR166]], align 4, !dbg [[DBG194]]
-// CHECK-DEBUG-NEXT: br label [[OMP_LOOP_PREHEADER168:%.*]], !dbg [[DBG194]]
-// CHECK-DEBUG: omp_loop.preheader168:
-// CHECK-DEBUG-NEXT: store i32 0, ptr [[P_LOWERBOUND182]], align 4, !dbg [[DBG194]]
-// CHECK-DEBUG-NEXT: [[TMP12:%.*]] = sub i32 [[DOTCOUNT167]], 1, !dbg [[DBG194]]
-// CHECK-DEBUG-NEXT: store i32 [[TMP12]], ptr [[P_UPPERBOUND183]], align 4, !dbg [[DBG194]]
-// CHECK-DEBUG-NEXT: store i32 1, ptr [[P_STRIDE184]], align 4, !dbg [[DBG194]]
-// CHECK-DEBUG-NEXT: [[OMP_GLOBAL_THREAD_NUM185:%.*]] = call i32 @__kmpc_global_thread_num(ptr @[[GLOB39:[0-9]+]]), !dbg [[DBG194]]
-// CHECK-DEBUG-NEXT: call void @__kmpc_for_static_init_4u(ptr @[[GLOB39]], i32 [[OMP_GLOBAL_THREAD_NUM185]], i32 34, ptr [[P_LASTITER181]], ptr [[P_LOWERBOUND182]], ptr [[P_UPPERBOUND183]], ptr [[P_STRIDE184]], i32 1, i32 0), !dbg [[DBG194]]
-// CHECK-DEBUG-NEXT: [[TMP13:%.*]] = load i32, ptr [[P_LOWERBOUND182]], align 4, !dbg [[DBG194]]
-// CHECK-DEBUG-NEXT: [[TMP14:%.*]] = load i32, ptr [[P_UPPERBOUND183]], align 4, !dbg [[DBG194]]
-// CHECK-DEBUG-NEXT: [[TRIP_COUNT_MINUS1186:%.*]] = sub i32 [[TMP14]], [[TMP13]], !dbg [[DBG194]]
-// CHECK-DEBUG-NEXT: [[TMP15:%.*]] = add i32 [[TRIP_COUNT_MINUS1186]], 1, !dbg [[DBG194]]
-// CHECK-DEBUG-NEXT: br label [[OMP_LOOP_HEADER169:%.*]], !dbg [[DBG194]]
-// CHECK-DEBUG: omp_loop.header169:
-// CHECK-DEBUG-NEXT: [[OMP_LOOP_IV175:%.*]] = phi i32 [ 0, [[OMP_LOOP_PREHEADER168]] ], [ [[OMP_LOOP_NEXT177:%.*]], [[OMP_LOOP_INC172:%.*]] ], !dbg [[DBG194]]
-// CHECK-DEBUG-NEXT: br label [[OMP_LOOP_COND170:%.*]], !dbg [[DBG194]]
-// CHECK-DEBUG: omp_loop.cond170:
-// CHECK-DEBUG-NEXT: [[OMP_LOOP_CMP176:%.*]] = icmp ult i32 [[OMP_LOOP_IV175]], [[TMP15]], !dbg [[DBG194]]
-// CHECK-DEBUG-NEXT: br i1 [[OMP_LOOP_CMP176]], label [[OMP_LOOP_BODY171:%.*]], label [[OMP_LOOP_EXIT173:%.*]], !dbg [[DBG194]]
-// CHECK-DEBUG: omp_loop.exit173:
-// CHECK-DEBUG-NEXT: call void @__kmpc_for_static_fini(ptr @[[GLOB39]], i32 [[OMP_GLOBAL_THREAD_NUM185]]), !dbg [[DBG194]]
-// CHECK-DEBUG-NEXT: [[OMP_GLOBAL_THREAD_NUM187:%.*]] = call i32 @__kmpc_global_thread_num(ptr @[[GLOB39]]), !dbg [[DBG196:![0-9]+]]
-// CHECK-DEBUG-NEXT: call void @__kmpc_barrier(ptr @[[GLOB40:[0-9]+]], i32 [[OMP_GLOBAL_THREAD_NUM187]]), !dbg [[DBG196]]
-// CHECK-DEBUG-NEXT: br label [[OMP_LOOP_AFTER174:%.*]], !dbg [[DBG194]]
-// CHECK-DEBUG: omp_loop.after174:
+// CHECK-DEBUG-NEXT: call void @__captured_stmt.17(ptr [[DOTCOUNT_ADDR168]], ptr [[AGG_CAPTURED166]]), !dbg [[DBG194]]
+// CHECK-DEBUG-NEXT: [[DOTCOUNT169:%.*]] = load i32, ptr [[DOTCOUNT_ADDR168]], align 4, !dbg [[DBG194]]
+// CHECK-DEBUG-NEXT: br label [[OMP_LOOP_PREHEADER170:%.*]], !dbg [[DBG194]]
+// CHECK-DEBUG: omp_loop.preheader170:
+// CHECK-DEBUG-NEXT: store i32 0, ptr [[P_LOWERBOUND184]], align 4, !dbg [[DBG194]]
+// CHECK-DEBUG-NEXT: [[TMP12:%.*]] = sub i32 [[DOTCOUNT169]], 1, !dbg [[DBG194]]
+// CHECK-DEBUG-NEXT: store i32 [[TMP12]], ptr [[P_UPPERBOUND185]], align 4, !dbg [[DBG194]]
+// CHECK-DEBUG-NEXT: store i32 1, ptr [[P_STRIDE186]], align 4, !dbg [[DBG194]]
+// CHECK-DEBUG-NEXT: [[OMP_GLOBAL_THREAD_NUM187:%.*]] = call i32 @__kmpc_global_thread_num(ptr @[[GLOB39:[0-9]+]]), !dbg [[DBG194]]
+// CHECK-DEBUG-NEXT: call void @__kmpc_for_static_init_4u(ptr @[[GLOB39]], i32 [[OMP_GLOBAL_THREAD_NUM187]], i32 34, ptr [[P_LASTITER183]], ptr [[P_LOWERBOUND184]], ptr [[P_UPPERBOUND185]], ptr [[P_STRIDE186]], i32 1, i32 0), !dbg [[DBG194]]
+// CHECK-DEBUG-NEXT: [[TMP13:%.*]] = load i32, ptr [[P_LOWERBOUND184]], align 4, !dbg [[DBG194]]
+// CHECK-DEBUG-NEXT: [[TMP14:%.*]] = load i32, ptr [[P_UPPERBOUND185]], align 4, !dbg [[DBG194]]
+// CHECK-DEBUG-NEXT: [[TRIP_COUNT_MINUS1188:%.*]] = sub i32 [[TMP14]], [[TMP13]], !dbg [[DBG194]]
+// CHECK-DEBUG-NEXT: [[TMP15:%.*]] = add i32 [[TRIP_COUNT_MINUS1188]], 1, !dbg [[DBG194]]
+// CHECK-DEBUG-NEXT: br label [[OMP_LOOP_HEADER171:%.*]], !dbg [[DBG194]]
+// CHECK-DEBUG: omp_loop.header171:
+// CHECK-DEBUG-NEXT: [[OMP_LOOP_IV177:%.*]] = phi i32 [ 0, [[OMP_LOOP_PREHEADER170]] ], [ [[OMP_LOOP_NEXT179:%.*]], [[OMP_LOOP_INC174:%.*]] ], !dbg [[DBG194]]
+// CHECK-DEBUG-NEXT: br label [[OMP_LOOP_COND172:%.*]], !dbg [[DBG194]]
+// CHECK-DEBUG: omp_loop.cond172:
+// CHECK-DEBUG-NEXT: [[OMP_LOOP_CMP178:%.*]] = icmp ult i32 [[OMP_LOOP_IV177]], [[TMP15]], !dbg [[DBG194]]
+// CHECK-DEBUG-NEXT: br i1 [[OMP_LOOP_CMP178]], label [[OMP_LOOP_BODY173:%.*]], label [[OMP_LOOP_EXIT175:%.*]], !dbg [[DBG194]]
+// CHECK-DEBUG: omp_loop.exit175:
+// CHECK-DEBUG-NEXT: call void @__kmpc_for_static_fini(ptr @[[GLOB39]], i32 [[OMP_GLOBAL_THREAD_NUM187]]), !dbg [[DBG194]]
+// CHECK-DEBUG-NEXT: [[OMP_GLOBAL_THREAD_NUM189:%.*]] = call i32 @__kmpc_global_thread_num(ptr @[[GLOB39]]), !dbg [[DBG196:![0-9]+]]
+// CHECK-DEBUG-NEXT: call void @__kmpc_barrier(ptr @[[GLOB40:[0-9]+]], i32 [[OMP_GLOBAL_THREAD_NUM189]]), !dbg [[DBG196]]
+// CHECK-DEBUG-NEXT: br label [[OMP_LOOP_AFTER176:%.*]], !dbg [[DBG194]]
+// CHECK-DEBUG: omp_loop.after176:
 // CHECK-DEBUG-NEXT: br label [[OMP_PAR_REGION_PARALLEL_AFTER:%.*]], !dbg [[DBG197:![0-9]+]]
 // CHECK-DEBUG: omp.par.region.parallel.after:
 // CHECK-DEBUG-NEXT: br label [[OMP_PAR_PRE_FINALIZE:%.*]]
 // CHECK-DEBUG: omp.par.pre_finalize:
+// CHECK-DEBUG-NEXT: br label [[DOTFINI190:%.*]]
+// CHECK-DEBUG: .fini190:
 // CHECK-DEBUG-NEXT: br label [[OMP_PAR_EXIT_EXITSTUB:%.*]], !dbg [[DBG197]]
-// CHECK-DEBUG: omp_loop.body171:
-// CHECK-DEBUG-NEXT: [[TMP16:%.*]] = add i32 [[OMP_LOOP_IV175]], [[TMP13]], !dbg [[DBG196]]
-// CHECK-DEBUG-NEXT: call void @__captured_stmt.18(ptr [[I163]], i32 [[TMP16]], ptr [[AGG_CAPTURED165]]), !dbg [[DBG194]]
+// CHECK-DEBUG: omp_loop.body173:
+// CHECK-DEBUG-NEXT: [[TMP16:%.*]] = add i32 [[OMP_LOOP_IV177]], [[TMP13]], !dbg [[DBG196]]
+// CHECK-DEBUG-NEXT: call void @__captured_stmt.18(ptr [[I165]], i32 [[TMP16]], ptr [[AGG_CAPTURED167]]), !dbg [[DBG194]]
 // CHECK-DEBUG-NEXT: [[TMP17:%.*]] = load i32, ptr [[LOADGEP_A_ADDR]], align 4, !dbg [[DBG198:![0-9]+]]
-// CHECK-DEBUG-NEXT: [[CONV178:%.*]] = sitofp i32 [[TMP17]] to double, !dbg [[DBG198]]
+// CHECK-DEBUG-NEXT: [[CONV180:%.*]] = sitofp i32 [[TMP17]] to double, !dbg [[DBG198]]
 // CHECK-DEBUG-NEXT: [[TMP18:%.*]] = load double, ptr [[LOADGEP_B_ADDR]], align 8, !dbg [[DBG196]]
-// CHECK-DEBUG-NEXT: [[ADD179:%.*]] = fadd double [[CONV178]], [[TMP18]], !dbg [[DBG199:![0-9]+]]
-// CHECK-DEBUG-NEXT: [[CONV180:%.*]] = fptrunc double [[ADD179]] to float, !dbg [[DBG198]]
+// CHECK-DEBUG-NEXT: [[ADD181:%.*]] = fadd double [[CONV180]], [[TMP18]], !dbg [[DBG199:![0-9]+]]
+// CHECK-DEBUG-NEXT: [[CONV182:%.*]] = fptrunc double [[ADD181]] to float, !dbg [[DBG198]]
 // CHECK-DEBUG-NEXT: [[TMP19:%.*]] = load ptr, ptr [[LOADGEP_R_ADDR]], align 8, !dbg [[DBG200:![0-9]+]]
-// CHECK-DEBUG-NEXT: store float [[CONV180]], ptr [[TMP19]], align 4, !dbg [[DBG201:![0-9]+]]
-// CHECK-DEBUG-NEXT: br label [[OMP_LOOP_INC172]], !dbg [[DBG194]]
-// CHECK-DEBUG: omp_loop.inc172:
-// CHECK-DEBUG-NEXT: [[OMP_LOOP_NEXT177]] = add nuw i32 [[OMP_LOOP_IV175]], 1, !dbg [[DBG194]]
-// CHECK-DEBUG-NEXT: br label [[OMP_LOOP_HEADER169]], !dbg [[DBG194]]
+// CHECK-DEBUG-NEXT: store float [[CONV182]], ptr [[TMP19]], align 4, !dbg [[DBG201:![0-9]+]]
+// CHECK-DEBUG-NEXT: br label [[OMP_LOOP_INC174]], !dbg [[DBG194]]
+// CHECK-DEBUG: omp_loop.inc174:
+// CHECK-DEBUG-NEXT: [[OMP_LOOP_NEXT179]] = add nuw i32 [[OMP_LOOP_IV177]], 1, !dbg [[DBG194]]
+// CHECK-DEBUG-NEXT: br label [[OMP_LOOP_HEADER171]], !dbg [[DBG194]]
 // CHECK-DEBUG: omp_loop.body:
 // CHECK-DEBUG-NEXT: [[TMP20:%.*]] = add i32 [[OMP_LOOP_IV]], [[TMP6]], !dbg [[DBG184]]
 // CHECK-DEBUG-NEXT: call void @__captured_stmt.6(ptr [[I]], i32 [[TMP20]], ptr [[AGG_CAPTURED1]]), !dbg [[DBG182]]
@@ -2096,17 +2118,17 @@ void parallel_for_2(float *r, int a, double b) {
 // CHECK-DEBUG-SAME: (ptr noalias [[TID_ADDR6:%.*]], ptr noalias [[ZERO_ADDR7:%.*]], ptr [[TMP0:%.*]]) #[[ATTR1]] !dbg [[DBG206:![0-9]+]] {
 // CHECK-DEBUG-NEXT: omp.par.entry8:
 // CHECK-DEBUG-NEXT: [[GEP_A_ADDR:%.*]] = getelementptr { ptr, ptr, ptr }, ptr [[TMP0]], i32 0, i32 0
-// CHECK-DEBUG-NEXT: [[LOADGEP_A_ADDR:%.*]] = load ptr, ptr [[GEP_A_ADDR]], align 8, !align [[META86]]
+// CHECK-DEBUG-NEXT: [[LOADGEP_A_ADDR:%.*]] = load ptr, ptr [[GEP_A_ADDR]], align 8, !align [[META47]]
 // CHECK-DEBUG-NEXT: [[GEP_B_ADDR:%.*]] = getelementptr { ptr, ptr, ptr }, ptr [[TMP0]], i32 0, i32 1
 // CHECK-DEBUG-NEXT: [[LOADGEP_B_ADDR:%.*]] = load ptr, ptr [[GEP_B_ADDR]], align 8, !align [[META87]]
 // CHECK-DEBUG-NEXT: [[GEP_R_ADDR:%.*]] = getelementptr { ptr, ptr, ptr }, ptr [[TMP0]], i32 0, i32 2
 // CHECK-DEBUG-NEXT: [[LOADGEP_R_ADDR:%.*]] = load ptr, ptr [[GEP_R_ADDR]], align 8, !align [[META87]]
-// CHECK-DEBUG-NEXT: [[STRUCTARG213:%.*]] = alloca { ptr, ptr, ptr }, align 8
+// CHECK-DEBUG-NEXT: [[STRUCTARG216:%.*]] = alloca { ptr, ptr, ptr }, align 8
 // CHECK-DEBUG-NEXT: [[STRUCTARG:%.*]] = alloca { ptr, ptr, ptr }, align 8
-// CHECK-DEBUG-NEXT: [[P_LASTITER156:%.*]] = alloca i32, align 4
-// CHECK-DEBUG-NEXT: [[P_LOWERBOUND157:%.*]] = alloca i32, align 4
-// CHECK-DEBUG-NEXT: [[P_UPPERBOUND158:%.*]] = alloca i32, align 4
-// CHECK-DEBUG-NEXT: [[P_STRIDE159:%.*]] = alloca i32, align 4
+// CHECK-DEBUG-NEXT: [[P_LASTITER157:%.*]] = alloca i32, align 4
+// CHECK-DEBUG-NEXT: [[P_LOWERBOUND158:%.*]] = alloca i32, align 4
+// CHECK-DEBUG-NEXT: [[P_UPPERBOUND159:%.*]] = alloca i32, align 4
+// CHECK-DEBUG-NEXT: [[P_STRIDE160:%.*]] = alloca i32, align 4
 // CHECK-DEBUG-NEXT: [[P_LASTITER95:%.*]] = alloca i32, align 4
 // CHECK-DEBUG-NEXT: [[P_LOWERBOUND96:%.*]] = alloca i32, align 4
 // CHECK-DEBUG-NEXT: [[P_UPPERBOUND97:%.*]] = alloca i32, align 4
@@ -2127,10 +2149,10 @@ void parallel_for_2(float *r, int a, double b) {
 // CHECK-DEBUG-NEXT: [[AGG_CAPTURED78:%.*]] = alloca [[STRUCT_ANON_9:%.*]], align 8
 // CHECK-DEBUG-NEXT: [[AGG_CAPTURED79:%.*]] = alloca [[STRUCT_ANON_10:%.*]], align 4
 // CHECK-DEBUG-NEXT: [[DOTCOUNT_ADDR80:%.*]] = alloca i32, align 4
-// CHECK-DEBUG-NEXT: [[I138:%.*]] = alloca i32, align 4
-// CHECK-DEBUG-NEXT: [[AGG_CAPTURED139:%.*]] = alloca [[STRUCT_ANON_13:%.*]], align 8
-// CHECK-DEBUG-NEXT: [[AGG_CAPTURED140:%.*]] = alloca [[STRUCT_ANON_14:%.*]], align 4
-// CHECK-DEBUG-NEXT: [[DOTCOUNT_ADDR141:%.*]] = alloca i32, align 4
+// CHECK-DEBUG-NEXT: [[I139:%.*]] = alloca i32, align 4
+// CHECK-DEBUG-NEXT: [[AGG_CAPTURED140:%.*]] = alloca [[STRUCT_ANON_13:%.*]], align 8
+// CHECK-DEBUG-NEXT: [[AGG_CAPTURED141:%.*]] = alloca [[STRUCT_ANON_14:%.*]], align 4
+// CHECK-DEBUG-NEXT: [[DOTCOUNT_ADDR142:%.*]] = alloca i32, align 4
 // CHECK-DEBUG-NEXT: #dbg_declare(ptr [[LOADGEP_A_ADDR]], [[META207:![0-9]+]], !DIExpression(), [[META208:![0-9]+]])
 // CHECK-DEBUG-NEXT: #dbg_declare(ptr [[LOADGEP_B_ADDR]], [[META209:![0-9]+]], !DIExpression(), [[META210:![0-9]+]])
 // CHECK-DEBUG-NEXT: #dbg_declare(ptr [[LOADGEP_R_ADDR]], [[META211:![0-9]+]], !DIExpression(), [[META212:![0-9]+]])
@@ -2217,70 +2239,72 @@ void parallel_for_2(float *r, int a, double b) {
 // CHECK-DEBUG-NEXT: br label [[OMP_LOOP_AFTER88:%.*]], !dbg [[DBG231]]
 // CHECK-DEBUG: omp_loop.after88:
 // CHECK-DEBUG-NEXT: [[OMP_GLOBAL_THREAD_NUM102:%.*]] = call i32 @__kmpc_global_thread_num(ptr @[[GLOB31:[0-9]+]]), !dbg [[DBG234:![0-9]+]]
-// CHECK-DEBUG-NEXT: br label [[OMP_PARALLEL217:%.*]]
-// CHECK-DEBUG: omp_parallel217:
-// CHECK-DEBUG-NEXT: [[GEP_A_ADDR214:%.*]] = getelementptr { ptr, ptr, ptr }, ptr [[STRUCTARG213]], i32 0, i32 0
-// CHECK-DEBUG-NEXT: store ptr [[LOADGEP_A_ADDR]], ptr [[GEP_A_ADDR214]], align 8
-// CHECK-DEBUG-NEXT: [[GEP_B_ADDR215:%.*]] = getelementptr { ptr, ptr, ptr }, ptr [[STRUCTARG213]], i32 0, i32 1
-// CHECK-DEBUG-NEXT: store ptr [[LOADGEP_B_ADDR]], ptr [[GEP_B_ADDR215]], align 8
-// CHECK-DEBUG-NEXT: [[GEP_R_ADDR216:%.*]] = getelementptr { ptr, ptr, ptr }, ptr [[STRUCTARG213]], i32 0, i32 2
-// CHECK-DEBUG-NEXT: store ptr [[LOADGEP_R_ADDR]], ptr [[GEP_R_ADDR216]], align 8
-// CHECK-DEBUG-NEXT: call void (ptr, i32, ptr, ...) @__kmpc_fork_call(ptr @[[GLOB31]], i32 1, ptr @_Z14parallel_for_2Pfid..omp_par.21, ptr [[STRUCTARG213]]), !dbg [[DBG235:![0-9]+]]
+// CHECK-DEBUG-NEXT: br label [[OMP_PARALLEL220:%.*]]
+// CHECK-DEBUG: omp_parallel220:
+// CHECK-DEBUG-NEXT: [[GEP_A_ADDR217:%.*]] = getelementptr { ptr, ptr, ptr }, ptr [[STRUCTARG216]], i32 0, i32 0
+// CHECK-DEBUG-NEXT: store ptr [[LOADGEP_A_ADDR]], ptr [[GEP_A_ADDR217]], align 8
+// CHECK-DEBUG-NEXT: [[GEP_B_ADDR218:%.*]] = getelementptr { ptr, ptr, ptr }, ptr [[STRUCTARG216]], i32 0, i32 1
+// CHECK-DEBUG-NEXT: store ptr [[LOADGEP_B_ADDR]], ptr [[GEP_B_ADDR218]], align 8
+// CHECK-DEBUG-NEXT: [[GEP_R_ADDR219:%.*]] = getelementptr { ptr, ptr, ptr }, ptr [[STRUCTARG216]], i32 0, i32 2
+// CHECK-DEBUG-NEXT: store ptr [[LOADGEP_R_ADDR]], ptr [[GEP_R_ADDR219]], align 8
+// CHECK-DEBUG-NEXT: call void (ptr, i32, ptr, ...) @__kmpc_fork_call(ptr @[[GLOB31]], i32 1, ptr @_Z14parallel_for_2Pfid..omp_par.21, ptr [[STRUCTARG216]]), !dbg [[DBG235:![0-9]+]]
 // CHECK-DEBUG-NEXT: br label [[OMP_PAR_EXIT108:%.*]]
 // CHECK-DEBUG: omp.par.exit108:
-// CHECK-DEBUG-NEXT: #dbg_declare(ptr [[I138]], [[META239:![0-9]+]], !DIExpression(), [[META242:![0-9]+]])
-// CHECK-DEBUG-NEXT: store i32 0, ptr [[I138]], align 4, !dbg [[META242]]
-// CHECK-DEBUG-NEXT: [[TMP16:%.*]] = getelementptr inbounds nuw [[STRUCT_ANON_13]], ptr [[AGG_CAPTURED139]], i32 0, i32 0, !dbg [[DBG243:![0-9]+]]
-// CHECK-DEBUG-NEXT: store ptr [[I138]], ptr [[TMP16]], align 8, !dbg [[DBG243]]
-// CHECK-DEBUG-NEXT: [[TMP17:%.*]] = getelementptr inbounds nuw [[STRUCT_ANON_14]], ptr [[AGG_CAPTURED140]], i32 0, i32 0, !dbg [[DBG243]]
-// CHECK-DEBUG-NEXT: [[TMP18:%.*]] = load i32, ptr [[I138]], align 4, !dbg [[DBG244:![0-9]+]]
+// CHECK-DEBUG-NEXT: #dbg_declare(ptr [[I139]], [[META239:![0-9]+]], !DIExpression(), [[META242:![0-9]+]])
+// CHECK-DEBUG-NEXT: store i32 0, ptr [[I139]], align 4, !dbg [[META242]]
+// CHECK-DEBUG-NEXT: [[TMP16:%.*]] = getelementptr inbounds nuw [[STRUCT_ANON_13]], ptr [[AGG_CAPTURED140]], i32 0, i32 0, !dbg [[DBG243:![0-9]+]]
+// CHECK-DEBUG-NEXT: store ptr [[I139]], ptr [[TMP16]], align 8, !dbg [[DBG243]]
+// CHECK-DEBUG-NEXT: [[TMP17:%.*]] = getelementptr inbounds nuw [[STRUCT_ANON_14]], ptr [[AGG_CAPTURED141]], i32 0, i32 0, !dbg [[DBG243]]
+// CHECK-DEBUG-NEXT: [[TMP18:%.*]] = load i32, ptr [[I139]], align 4, !dbg [[DBG244:![0-9]+]]
 // CHECK-DEBUG-NEXT: store i32 [[TMP18]], ptr [[TMP17]], align 4, !dbg [[DBG243]]
-// CHECK-DEBUG-NEXT: call void @__captured_stmt.15(ptr [[DOTCOUNT_ADDR141]], ptr [[AGG_CAPTURED139]]), !dbg [[DBG243]]
-// CHECK-DEBUG-NEXT: [[DOTCOUNT142:%.*]] = load i32, ptr [[DOTCOUNT_ADDR141]], align 4, !dbg [[DBG243]]
-// CHECK-DEBUG-NEXT: br label [[OMP_LOOP_PREHEADER143:%.*]], !dbg [[DBG243]]
-// CHECK-DEBUG: omp_loop.preheader143:
-// CHECK-DEBUG-NEXT: store i32 0, ptr [[P_LOWERBOUND157]], align 4, !dbg [[DBG243]]
-// CHECK-DEBUG-NEXT: [[TMP19:%.*]] = sub i32 [[DOTCOUNT142]], 1, !dbg [[DBG243]]
-// CHECK-DEBUG-NEXT: store i32 [[TMP19]], ptr [[P_UPPERBOUND158]], align 4, !dbg [[DBG243]]
-// CHECK-DEBUG-NEXT: store i32 1, ptr [[P_STRIDE159]], align 4, !dbg [[DBG243]]
-// CHECK-DEBUG-NEXT: [[OMP_GLOBAL_THREAD_NUM160:%.*]] = call i32 @__kmpc_global_thread_num(ptr @[[GLOB36:[0-9]+]]), !dbg [[DBG243]]
-// CHECK-DEBUG-NEXT: call void @__kmpc_for_static_init_4u(ptr @[[GLOB36]], i32 [[OMP_GLOBAL_THREAD_NUM160]], i32 34, ptr [[P_LASTITER156]], ptr [[P_LOWERBOUND157]], ptr [[P_UPPERBOUND158]], ptr [[P_STRIDE159]], i32 1, i32 0), !dbg [[DBG243]]
-// CHECK-DEBUG-NEXT: [[TMP20:%.*]] = load i32, ptr [[P_LOWERBOUND157]], align 4, !dbg [[DBG243]]
-// CHECK-DEBUG-NEXT: [[TMP21:%.*]] = load i32, ptr [[P_UPPERBOUND158]], align 4, !dbg [[DBG243]]
-// CHECK-DEBUG-NEXT: [[TRIP_COUNT_MINUS1161:%.*]] = sub i32 [[TMP21]], [[TMP20]], !dbg [[DBG243]]
-// CHECK-DEBUG-NEXT: [[TMP22:%.*]] = add i32 [[TRIP_COUNT_MINUS1161]], 1, !dbg [[DBG243]]
-// CHECK-DEBUG-NEXT: br label [[OMP_LOOP_HEADER144:%.*]], !dbg [[DBG243]]
-// CHECK-DEBUG: omp_loop.header144:
-// CHECK-DEBUG-NEXT: [[OMP_LOOP_IV150:%.*]] = phi i32 [ 0, [[OMP_LOOP_PREHEADER143]] ], [ [[OMP_LOOP_NEXT152:%.*]], [[OMP_LOOP_INC147:%.*]] ], !dbg [[DBG243]]
-// CHECK-DEBUG-NEXT: br label [[OMP_LOOP_COND145:%.*]], !dbg [[DBG243]]
-// CHECK-DEBUG: omp_loop.cond145:
-// CHECK-DEBUG-NEXT: [[OMP_LOOP_CMP151:%.*]] = icmp ult i32 [[OMP_LOOP_IV150]], [[TMP22]], !dbg [[DBG243]]
-// CHECK-DEBUG-NEXT: br i1 [[OMP_LOOP_CMP151]], label [[OMP_LOOP_BODY146:%.*]], label [[OMP_LOOP_EXIT148:%.*]], !dbg [[DBG243]]
-// CHECK-DEBUG: omp_loop.exit148:
-// CHECK-DEBUG-NEXT: call void @__kmpc_for_static_fini(ptr @[[GLOB36]], i32 [[OMP_GLOBAL_THREAD_NUM160]]), !dbg [[DBG243]]
-// CHECK-DEBUG-NEXT: [[OMP_GLOBAL_THREAD_NUM162:%.*]] = call i32 @__kmpc_global_thread_num(ptr @[[GLOB36]]), !dbg [[DBG245:![0-9]+]]
-// CHECK-DEBUG-NEXT: call void @__kmpc_barrier(ptr @[[GLOB37:[0-9]+]], i32 [[OMP_GLOBAL_THREAD_NUM162]]), !dbg [[DBG245]]
-// CHECK-DEBUG-NEXT: br label [[OMP_LOOP_AFTER149:%.*]], !dbg [[DBG243]]
-// CHECK-DEBUG: omp_loop.after149:
+// CHECK-DEBUG-NEXT: call void @__captured_stmt.15(ptr [[DOTCOUNT_ADDR142]], ptr [[AGG_CAPTURED140]]), !dbg [[DBG243]]
+// CHECK-DEBUG-NEXT: [[DOTCOUNT143:%.*]] = load i32, ptr [[DOTCOUNT_ADDR142]], align 4, !dbg [[DBG243]]
+// CHECK-DEBUG-NEXT: br label [[OMP_LOOP_PREHEADER144:%.*]], !dbg [[DBG243]]
+// CHECK-DEBUG: omp_loop.preheader144:
+// CHECK-DEBUG-NEXT: store i32 0, ptr [[P_LOWERBOUND158]], align 4, !dbg [[DBG243]]
+// CHECK-DEBUG-NEXT: [[TMP19:%.*]] = sub i32 [[DOTCOUNT143]], 1, !dbg [[DBG243]]
+// CHECK-DEBUG-NEXT: store i32 [[TMP19]], ptr [[P_UPPERBOUND159]], align 4, !dbg [[DBG243]]
+// CHECK-DEBUG-NEXT: store i32 1, ptr [[P_STRIDE160]], align 4, !dbg [[DBG243]]
+// CHECK-DEBUG-NEXT: [[OMP_GLOBAL_THREAD_NUM161:%.*]] = call i32 @__kmpc_global_thread_num(ptr @[[GLOB36:[0-9]+]]), !dbg [[DBG243]]
+// CHECK-DEBUG-NEXT: call void @__kmpc_for_static_init_4u(ptr @[[GLOB36]], i32 [[OMP_GLOBAL_THREAD_NUM161]], i32 34, ptr [[P_LASTITER157]], ptr [[P_LOWERBOUND158]], ptr [[P_UPPERBOUND159]], ptr [[P_STRIDE160]], i32 1, i32 0), !dbg [[DBG243]]
+// CHECK-DEBUG-NEXT: [[TMP20:%.*]] = load i32, ptr [[P_LOWERBOUND158]], align 4, !dbg [[DBG243]]
+// CHECK-DEBUG-NEXT: [[TMP21:%.*]] = load i32, ptr [[P_UPPERBOUND159]], align 4, !dbg [[DBG243]]
+// CHECK-DEBUG-NEXT: [[TRIP_COUNT_MINUS1162:%.*]] = sub i32 [[TMP21]], [[TMP20]], !dbg [[DBG243]]
+// CHECK-DEBUG-NEXT: [[TMP22:%.*]] = add i32 [[TRIP_COUNT_MINUS1162]], 1, !dbg [[DBG243]]
+// CHECK-DEBUG-NEXT: br label [[OMP_LOOP_HEADER145:%.*]], !dbg [[DBG243]]
+// CHECK-DEBUG: omp_loop.header145:
+// CHECK-DEBUG-NEXT: [[OMP_LOOP_IV151:%.*]] = phi i32 [ 0, [[OMP_LOOP_PREHEADER144]] ], [ [[OMP_LOOP_NEXT153:%.*]], [[OMP_LOOP_INC148:%.*]] ], !dbg [[DBG243]]
+// CHECK-DEBUG-NEXT: br label [[OMP_LOOP_COND146:%.*]], !dbg [[DBG243]]
+// CHECK-DEBUG: omp_loop.cond146:
+// CHECK-DEBUG-NEXT: [[OMP_LOOP_CMP152:%.*]] = icmp ult i32 [[OMP_LOOP_IV151]], [[TMP22]], !dbg [[DBG243]]
+// CHECK-DEBUG-NEXT: br i1 [[OMP_LOOP_CMP152]], label [[OMP_LOOP_BODY147:%.*]], label [[OMP_LOOP_EXIT149:%.*]], !dbg [[DBG243]]
+// CHECK-DEBUG: omp_loop.exit149:
+// CHECK-DEBUG-NEXT: call void @__kmpc_for_static_fini(ptr @[[GLOB36]], i32 [[OMP_GLOBAL_THREAD_NUM161]]), !dbg [[DBG243]]
+// CHECK-DEBUG-NEXT: [[OMP_GLOBAL_THREAD_NUM163:%.*]] = call i32 @__kmpc_global_thread_num(ptr @[[GLOB36]]), !dbg [[DBG245:![0-9]+]]
+// CHECK-DEBUG-NEXT: call void @__kmpc_barrier(ptr @[[GLOB37:[0-9]+]], i32 [[OMP_GLOBAL_THREAD_NUM163]]), !dbg [[DBG245]]
+// CHECK-DEBUG-NEXT: br label [[OMP_LOOP_AFTER150:%.*]], !dbg [[DBG243]]
+// CHECK-DEBUG: omp_loop.after150:
 // CHECK-DEBUG-NEXT: br label [[OMP_PAR_REGION9_PARALLEL_AFTER:%.*]], !dbg [[DBG246:![0-9]+]]
 // CHECK-DEBUG: omp.par.region9.parallel.after:
 // CHECK-DEBUG-NEXT: br label [[OMP_PAR_PRE_FINALIZE10:%.*]]
 // CHECK-DEBUG: omp.par.pre_finalize10:
+// CHECK-DEBUG-NEXT: br label [[DOTFINI164:%.*]]
+// CHECK-DEBUG: .fini164:
 // CHECK-DEBUG-NEXT: br label [[OMP_PAR_EXIT11_EXITSTUB:%.*]], !dbg [[DBG246]]
-// CHECK-DEBUG: omp_loop.body146:
-// CHECK-DEBUG-NEXT: [[TMP23:%.*]] = add i32 [[OMP_LOOP_IV150]], [[TMP20]], !dbg [[DBG245]]
-// CHECK-DEBUG-NEXT: call void @__captured_stmt.16(ptr [[I138]], i32 [[TMP23]], ptr [[AGG_CAPTURED140]]), !dbg [[DBG243]]
+// CHECK-DEBUG: omp_loop.body147:
+// CHECK-DEBUG-NEXT: [[TMP23:%.*]] = add i32 [[OMP_LOOP_IV151]], [[TMP20]], !dbg [[DBG245]]
+// CHECK-DEBUG-NEXT: call void @__captured_stmt.16(ptr [[I139]], i32 [[TMP23]], ptr [[AGG_CAPTURED141]]), !dbg [[DBG243]]
 // CHECK-DEBUG-NEXT: [[TMP24:%.*]] = load i32, ptr [[LOADGEP_A_ADDR]], align 4, !dbg [[DBG247:![0-9]+]]
-// CHECK-DEBUG-NEXT: [[CONV153:%.*]] = sitofp i32 [[TMP24]] to double, !dbg [[DBG247]]
+// CHECK-DEBUG-NEXT: [[CONV154:%.*]] = sitofp i32 [[TMP24]] to double, !dbg [[DBG247]]
 // CHECK-DEBUG-NEXT: [[TMP25:%.*]] = load double, ptr [[LOADGEP_B_ADDR]], align 8, !dbg [[DBG245]]
-// CHECK-DEBUG-NEXT: [[ADD154:%.*]] = fadd double [[CONV153]], [[TMP25]], !dbg [[DBG248:![0-9]+]]
-// CHECK-DEBUG-NEXT: [[CONV155:%.*]] = fptrunc double [[ADD154]] to float, !dbg [[DBG247]]
+// CHECK-DEBUG-NEXT: [[ADD155:%.*]] = fadd double [[CONV154]], [[TMP25]], !dbg [[DBG248:![0-9]+]]
+// CHECK-DEBUG-NEXT: [[CONV156:%.*]] = fptrunc double [[ADD155]] to float, !dbg [[DBG247]]
 // CHECK-DEBUG-NEXT: [[TMP26:%.*]] = load ptr, ptr [[LOADGEP_R_ADDR]], align 8, !dbg [[DBG249:![0-9]+]]
-// CHECK-DEBUG-NEXT: store float [[CONV155]], ptr [[TMP26]], align 4, !dbg [[DBG250:![0-9]+]]
-// CHECK-DEBUG-NEXT: br label [[OMP_LOOP_INC147]], !dbg [[DBG243]]
-// CHECK-DEBUG: omp_loop.inc147:
-// CHECK-DEBUG-NEXT: [[OMP_LOOP_NEXT152]] = add nuw i32 [[OMP_LOOP_IV150]], 1, !dbg [[DBG243]]
-// CHECK-DEBUG-NEXT: br label [[OMP_LOOP_HEADER144]], !dbg [[DBG243]]
+// CHECK-DEBUG-NEXT: store float [[CONV156]], ptr [[TMP26]], align 4, !dbg [[DBG250:![0-9]+]]
+// CHECK-DEBUG-NEXT: br label [[OMP_LOOP_INC148]], !dbg [[DBG243]]
+// CHECK-DEBUG: omp_loop.inc148:
+// CHECK-DEBUG-NEXT: [[OMP_LOOP_NEXT153]] = add nuw i32 [[OMP_LOOP_IV151]], 1, !dbg [[DBG243]]
+// CHECK-DEBUG-NEXT: br label [[OMP_LOOP_HEADER145]], !dbg [[DBG243]]
 // CHECK-DEBUG: omp_loop.body85:
 // CHECK-DEBUG-NEXT: [[TMP27:%.*]] = add i32 [[OMP_LOOP_IV89]], [[TMP13]], !dbg [[DBG233]]
 // CHECK-DEBUG-NEXT: call void @__captured_stmt.12(ptr [[I77]], i32 [[TMP27]], ptr [[AGG_CAPTURED79]]), !dbg [[DBG231]]
@@ -2317,7 +2341,7 @@ void parallel_for_2(float *r, int a, double b) {
 // CHECK-DEBUG-SAME: (ptr noalias [[TID_ADDR103:%.*]], ptr noalias [[ZERO_ADDR104:%.*]], ptr [[TMP0:%.*]]) #[[ATTR1]] !dbg [[DBG259:![0-9]+]] {
 // CHECK-DEBUG-NEXT: omp.par.entry105:
 // CHECK-DEBUG-NEXT: [[GEP_A_ADDR:%.*]] = getelementptr { ptr, ptr, ptr }, ptr [[TMP0]], i32 0, i32 0
-// CHECK-DEBUG-NEXT: [[LOADGEP_A_ADDR:%.*]] = load ptr, ptr [[GEP_A_ADDR]], align 8, !align [[META86]]
+// CHECK-DEBUG-NEXT: [[LOADGEP_A_ADDR:%.*]] = load ptr, ptr [[GEP_A_ADDR]], align 8, !align [[META47]]
 // CHECK-DEBUG-NEXT: [[GEP_B_ADDR:%.*]] = getelementptr { ptr, ptr, ptr }, ptr [[TMP0]], i32 0, i32 1
 // CHECK-DEBUG-NEXT: [[LOADGEP_B_ADDR:%.*]] = load ptr, ptr [[GEP_B_ADDR]], align 8, !align [[META87]]
 // CHECK-DEBUG-NEXT: [[GEP_R_ADDR:%.*]] = getelementptr { ptr, ptr, ptr }, ptr [[TMP0]], i32 0, i32 2
@@ -2377,6 +2401,8 @@ void parallel_for_2(float *r, int a, double b) {
 // CHECK-DEBUG: omp.par.region106.parallel.after:
 // CHECK-DEBUG-NEXT: br label [[OMP_PAR_PRE_FINALIZE107:%.*]]
 // CHECK-DEBUG: omp.par.pre_finalize107:
+// CHECK-DEBUG-NEXT: br label [[DOTFINI138:%.*]]
+// CHECK-DEBUG: .fini138:
 // CHECK-DEBUG-NEXT: br label [[OMP_PAR_EXIT108_EXITSTUB:%.*]], !dbg [[DBG276]]
 // CHECK-DEBUG: omp_loop.body121:
 // CHECK-DEBUG-NEXT: [[TMP9:%.*]] = add i32 [[OMP_LOOP_IV125]], [[TMP6]], !dbg [[DBG275]]
@@ -2400,7 +2426,7 @@ void parallel_for_2(float *r, int a, double b) {
 // CHECK-DEBUG-SAME: (ptr noalias [[TID_ADDR42:%.*]], ptr noalias [[ZERO_ADDR43:%.*]], ptr [[TMP0:%.*]]) #[[ATTR1]] !dbg [[DBG281:![0-9]+]] {
 // CHECK-DEBUG-NEXT: omp.par.entry44:
 // CHECK-DEBUG-NEXT: [[GEP_A_ADDR:%.*]] = getelementptr { ptr, ptr, ptr }, ptr [[TMP0]], i32 0, i32 0
-// CHECK-DEBUG-NEXT: [[LOADGEP_A_ADDR:%.*]] = load ptr, ptr [[GEP_A_ADDR]], align 8, !align [[META86]]
+// CHECK-DEBUG-NEXT: [[LOADGEP_A_ADDR:%.*]] = load ptr, ptr [[GEP_A_ADDR]], align 8, !align [[META47]]
 // CHECK-DEBUG-NEXT: [[GEP_B_ADDR:%.*]] = getelementptr { ptr, ptr, ptr }, ptr [[TMP0]], i32 0, i32 1
 // CHECK-DEBUG-NEXT: [[LOADGEP_B_ADDR:%.*]] = load ptr, ptr [[GEP_B_ADDR]], align 8, !align [[META87]]
 // CHECK-DEBUG-NEXT: [[GEP_R_ADDR:%.*]] = getelementptr { ptr, ptr, ptr }, ptr [[TMP0]], i32 0, i32 2
@@ -2460,6 +2486,8 @@ void parallel_for_2(float *r, int a, double b) {
 // CHECK-DEBUG: omp.par.region45.parallel.after:
 // CHECK-DEBUG-NEXT: br label [[OMP_PAR_PRE_FINALIZE46:%.*]]
 // CHECK-DEBUG: omp.par.pre_finalize46:
+// CHECK-DEBUG-NEXT: br label [[DOTFINI:%.*]]
+// CHECK-DEBUG: .fini:
 // CHECK-DEBUG-NEXT: br label [[OMP_PAR_EXIT47_EXITSTUB:%.*]], !dbg [[DBG298]]
 // CHECK-DEBUG: omp_loop.body60:
 // CHECK-DEBUG-NEXT: [[TMP9:%.*]] = add i32 [[OMP_LOOP_IV64]], [[TMP6]], !dbg [[DBG297]]
@@ -2494,7 +2522,7 @@ void parallel_for_2(float *r, int a, double b) {
 // CHECK-DEBUG-NEXT: [[TMP0:%.*]] = load ptr, ptr [[__CONTEXT_ADDR]], align 8
 // CHECK-DEBUG-NEXT: #dbg_declare(ptr [[DOTSTART]], [[META307:![0-9]+]], !DIExpression(), [[META309:![0-9]+]])
 // CHECK-DEBUG-NEXT: [[TMP1:%.*]] = getelementptr inbounds nuw [[STRUCT_ANON_3:%.*]], ptr [[TMP0]], i32 0, i32 0, !dbg [[DBG310:![0-9]+]]
-// CHECK-DEBUG-NEXT: [[TMP2:%.*]] = load ptr, ptr [[TMP1]], align 8, !dbg [[DBG310]]
+// CHECK-DEBUG-NEXT: [[TMP2:%.*]] = load ptr, ptr [[TMP1]], align 8, !dbg [[DBG310]], !nonnull [[META12]], !align [[META47]]
 // CHECK-DEBUG-NEXT: [[TMP3:%.*]] = load i32, ptr [[TMP2]], align 4, !dbg [[DBG310]]
 // CHECK-DEBUG-NEXT: store i32 [[TMP3]], ptr [[DOTSTART]], align 4, !dbg [[META309]]
 // CHECK-DEBUG-NEXT: #dbg_declare(ptr [[DOTSTOP]], [[META312:![0-9]+]], !DIExpression(), [[META313:![0-9]+]])
@@ -2519,7 +2547,7 @@ void parallel_for_2(float *r, int a, double b) {
 // CHECK-DEBUG-NEXT: br label [[COND_END]], !dbg [[META313]]
 // CHECK-DEBUG: cond.end:
 // CHECK-DEBUG-NEXT: [[COND:%.*]] = phi i32 [ [[DIV]], [[COND_TRUE]] ], [ 0, [[COND_FALSE]] ], !dbg [[META313]]
-// CHECK-DEBUG-NEXT: [[TMP10:%.*]] = load ptr, ptr [[DISTANCE_ADDR]], align 8, !dbg [[META313]]
+// CHECK-DEBUG-NEXT: [[TMP10:%.*]] = load ptr, ptr [[DISTANCE_ADDR]], align 8, !dbg [[META313]], !nonnull [[META12]], !align [[META47]]
 // CHECK-DEBUG-NEXT: store i32 [[COND]], ptr [[TMP10]], align 4, !dbg [[META313]]
 // CHECK-DEBUG-NEXT: ret void, !dbg [[DBG315:![0-9]+]]
 //
@@ -2542,7 +2570,7 @@ void parallel_for_2(float *r, int a, double b) {
 // CHECK-DEBUG-NEXT: [[TMP3:%.*]] = load i32, ptr [[LOGICAL_ADDR]], align 4, !dbg [[DBG324:![0-9]+]]
 // CHECK-DEBUG-NEXT: [[MUL:%.*]] = mul i32 1, [[TMP3]], !dbg [[DBG324]]
 // CHECK-DEBUG-NEXT: [[ADD:%.*]] = add i32 [[TMP2]], [[MUL]], !dbg [[DBG324]]
-// CHECK-DEBUG-NEXT: [[TMP4:%.*]] = load ptr, ptr [[LOOPVAR_ADDR]], align 8, !dbg [[DBG324]]
+// CHECK-DEBUG-NEXT: [[TMP4:%.*]] = load ptr, ptr [[LOOPVAR_ADDR]], align 8, !dbg [[DBG324]], !nonnull [[META12]], !align [[META47]]
 // CHECK-DEBUG-NEXT: store i32 [[ADD]], ptr [[TMP4]], align 4, !dbg [[META319]]
 // CHECK-DEBUG-NEXT: ret void, !dbg [[DBG322]]
 //
@@ -2562,7 +2590,7 @@ void parallel_for_2(float *r, int a, double b) {
 // CHECK-DEBUG-NEXT: [[TMP0:%.*]] = load ptr, ptr [[__CONTEXT_ADDR]], align 8
 // CHECK-DEBUG-NEXT: #dbg_declare(ptr [[DOTSTART]], [[META329:![0-9]+]], !DIExpression(), [[META331:![0-9]+]])
 // CHECK-DEBUG-NEXT: [[TMP1:%.*]] = getelementptr inbounds nuw [[STRUCT_ANON_5:%.*]], ptr [[TMP0]], i32 0, i32 0, !dbg [[DBG332:![0-9]+]]
-// CHECK-DEBUG-NEXT: [[TMP2:%.*]] = load ptr, ptr [[TMP1]], align 8, !dbg [[DBG332]]
+// CHECK-DEBUG-NEXT: [[TMP2:%.*]] = load ptr, ptr [[TMP1]], align 8, !dbg [[DBG332]], !nonnull [[META12]], !align [[META47]]
 // CHECK-DEBUG-NEXT: [[TMP3:%.*]] = load i32, ptr [[TMP2]], align 4, !dbg [[DBG332]]
 // CHECK-DEBUG-NEXT: store i32 [[TMP3]], ptr [[DOTSTART]], align 4, !dbg [[META331]]
 // CHECK-DEBUG-NEXT: #dbg_declare(ptr [[DOTSTOP]], [[META334:![0-9]+]], !DIExpression(), [[META335:![0-9]+]])
@@ -2587,7 +2615,7 @@ void parallel_for_2(float *r, int a, double b) {
 // CHECK-DEBUG-NEXT: br label [[COND_END]], !dbg [[META335]]
 // CHECK-DEBUG: cond.end:
 // CHECK-DEBUG-NEXT: [[COND:%.*]] = phi i32 [ [[DIV]], [[COND_TRUE]] ], [ 0, [[COND_FALSE]] ], !dbg [[META335]]
-// CHECK-DEBUG-NEXT: [[TMP10:%.*]] = load ptr, ptr [[DISTANCE_ADDR]], align 8, !dbg [[META335]]
+// CHECK-DEBUG-NEXT: [[TMP10:%.*]] = load ptr, ptr [[DISTANCE_ADDR]], align 8, !dbg [[META335]], !nonnull [[META12]], !align [[META47]]
 // CHECK-DEBUG-NEXT: store i32 [[COND]], ptr [[TMP10]], align 4, !dbg [[META335]]
 // CHECK-DEBUG-NEXT: ret void, !dbg [[DBG337:![0-9]+]]
 //
@@ -2610,7 +2638,7 @@ void parallel_for_2(float *r, int a, double b) {
 // CHECK-DEBUG-NEXT: [[TMP3:%.*]] = load i32, ptr [[LOGICAL_ADDR]], align 4, !dbg [[DBG346:![0-9]+]]
 // CHECK-DEBUG-NEXT: [[MUL:%.*]] = mul i32 1, [[TMP3]], !dbg [[DBG346]]
 // CHECK-DEBUG-NEXT: [[ADD:%.*]] = add i32 [[TMP2]], [[MUL]], !dbg [[DBG346]]
-// CHECK-DEBUG-NEXT: [[TMP4:%.*]] = load ptr, ptr [[LOOPVAR_ADDR]], align 8, !dbg [[DBG346]]
+// CHECK-DEBUG-NEXT: [[TMP4:%.*]] = load ptr, ptr [[LOOPVAR_ADDR]], align 8, !dbg [[DBG346]], !nonnull [[META12]], !align [[META47]]
 // CHECK-DEBUG-NEXT: store i32 [[ADD]], ptr [[TMP4]], align 4, !dbg [[META341]]
 // CHECK-DEBUG-NEXT: ret void, !dbg [[DBG344]]
 //
@@ -2630,7 +2658,7 @@ void parallel_for_2(float *r, int a, double b) {
 // CHECK-DEBUG-NEXT: [[TMP0:%.*]] = load ptr, ptr [[__CONTEXT_ADDR]], align 8
 // CHECK-DEBUG-NEXT: #dbg_declare(ptr [[DOTSTART]], [[META351:![0-9]+]], !DIExpression(), [[META353:![0-9]+]])
 // CHECK-DEBUG-NEXT: [[TMP1:%.*]] = getelementptr inbounds nuw [[STRUCT_ANON_7:%.*]], ptr [[TMP0]], i32 0, i32 0, !dbg [[DBG354:![0-9]+]]
-// CHECK-DEBUG-NEXT: [[TMP2:%.*]] = load ptr, ptr [[TMP1]], align 8, !dbg [[DBG354]]
+// CHECK-DEBUG-NEXT: [[TMP2:%.*]] = load ptr, ptr [[TMP1]], align 8, !dbg [[DBG354]], !nonnull [[META12]], !align [[META47]]
 // CHECK-DEBUG-NEXT: [[TMP3:%.*]] = load i32, ptr [[TMP2]], align 4, !dbg [[DBG354]]
 // CHECK-DEBUG-NEXT: store i32 [[TMP3]], ptr [[DOTSTART]], align 4, !dbg [[META353]]
 // CHECK-DEBUG-NEXT: #dbg_declare(ptr [[DOTSTOP]], [[META356:![0-9]+]], !DIExpression(), [[META357:![0-9]+]])
@@ -2655,7 +2683,7 @@ void parallel_for_2(float *r, int a, double b) {
 // CHECK-DEBUG-NEXT: br label [[COND_END]], !dbg [[META357]]
 // CHECK-DEBUG: cond.end:
 // CHECK-DEBUG-NEXT: [[COND:%.*]] = phi i32 [ [[DIV]], [[COND_TRUE]] ], [ 0, [[COND_FALSE]] ], !dbg [[META357]]
-// CHECK-DEBUG-NEXT: [[TMP10:%.*]] = load ptr, ptr [[DISTANCE_ADDR]], align 8, !dbg [[META357]]
+// CHECK-DEBUG-NEXT: [[TMP10:%.*]] = load ptr, ptr [[DISTANCE_ADDR]], align 8, !dbg [[META357]], !nonnull [[META12]], !align [[META47]]
 // CHECK-DEBUG-NEXT: store i32 [[COND]], ptr [[TMP10]], align 4, !dbg [[META357]]
 // CHECK-DEBUG-NEXT: ret void, !dbg [[DBG359:![0-9]+]]
 //
@@ -2678,7 +2706,7 @@ void parallel_for_2(float *r, int a, double b) {
 // CHECK-DEBUG-NEXT: [[TMP3:%.*]] = load i32, ptr [[LOGICAL_ADDR]], align 4, !dbg [[DBG368:![0-9]+]]
 // CHECK-DEBUG-NEXT: [[MUL:%.*]] = mul i32 1, [[TMP3]], !dbg [[DBG368]]
 // CHECK-DEBUG-NEXT: [[ADD:%.*]] = add i32 [[TMP2]], [[MUL]], !dbg [[DBG368]]
-// CHECK-DEBUG-NEXT: [[TMP4:%.*]] = load ptr, ptr [[LOOPVAR_ADDR]], align 8, !dbg [[DBG368]]
+// CHECK-DEBUG-NEXT: [[TMP4:%.*]] = load ptr, ptr [[LOOPVAR_ADDR]], align 8, !dbg [[DBG368]], !nonnull [[META12]], !align [[META47]]
 // CHECK-DEBUG-NEXT: store i32 [[ADD]], ptr [[TMP4]], align 4, !dbg [[META363]]
 // CHECK-DEBUG-NEXT: ret void, !dbg [[DBG366]]
 //
@@ -2698,7 +2726,7 @@ void parallel_for_2(float *r, int a, double b) {
 // CHECK-DEBUG-NEXT: [[TMP0:%.*]] = load ptr, ptr [[__CONTEXT_ADDR]], align 8
 // CHECK-DEBUG-NEXT: #dbg_declare(ptr [[DOTSTART]], [[META373:![0-9]+]], !DIExpression(), [[META375:![0-9]+]])
 // CHECK-DEBUG-NEXT: [[TMP1:%.*]] = getelementptr inbounds nuw [[STRUCT_ANON_9:%.*]], ptr [[TMP0]], i32 0, i32 0, !dbg [[DBG376:![0-9]+]]
-// CHECK-DEBUG-NEXT: [[TMP2:%.*]] = load ptr, ptr [[TMP1]], align 8, !dbg [[DBG376]]
+// CHECK-DEBUG-NEXT: [[TMP2:%.*]] = load ptr, ptr [[TMP1]], align 8, !dbg [[DBG376]], !nonnull [[META12]], !align [[META47]]
 // CHECK-DEBUG-NEXT: [[TMP3:%.*]] = load i32, ptr [[TMP2]], align 4, !dbg [[DBG376]]
 // CHECK-DEBUG-NEXT: store i32 [[TMP3]], ptr [[DOTSTART]], align 4, !dbg [[META375]]
 // CHECK-DEBUG-NEXT: #dbg_declare(ptr [[DOTSTOP]], [[META378:![0-9]+]], !DIExpression(), [[META379:![0-9]+]])
@@ -2723,7 +2751,7 @@ void parallel_for_2(float *r, int a, double b) {
 // CHECK-DEBUG-NEXT: br label [[COND_END]], !dbg [[META379]]
 // CHECK-DEBUG: cond.end:
 // CHECK-DEBUG-NEXT: [[COND:%.*]] = phi i32 [ [[DIV]], [[COND_TRUE]] ], [ 0, [[COND_FALSE]] ], !dbg [[META379]]
-// CHECK-DEBUG-NEXT: [[TMP10:%.*]] = load ptr, ptr [[DISTANCE_ADDR]], align 8, !dbg [[META379]]
+// CHECK-DEBUG-NEXT: [[TMP10:%.*]] = load ptr, ptr [[DISTANCE_ADDR]], align 8, !dbg [[META379]], !nonnull [[META12]], !align [[META47]]
 // CHECK-DEBUG-NEXT: store i32 [[COND]], ptr [[TMP10]], align 4, !dbg [[META379]]
 // CHECK-DEBUG-NEXT: ret void, !dbg [[DBG381:![0-9]+]]
 //
@@ -2746,7 +2774,7 @@ void parallel_for_2(float *r, int a, double b) {
 // CHECK-DEBUG-NEXT: [[TMP3:%.*]] = load i32, ptr [[LOGICAL_ADDR]], align 4, !dbg [[DBG390:![0-9]+]]
 // CHECK-DEBUG-NEXT: [[MUL:%.*]] = mul i32 1, [[TMP3]], !dbg [[DBG390]]
 // CHECK-DEBUG-NEXT: [[ADD:%.*]] = add i32 [[TMP2]], [[MUL]], !dbg [[DBG390]]
-// CHECK-DEBUG-NEXT: [[TMP4:%.*]] = load ptr, ptr [[LOOPVAR_ADDR]], align 8, !dbg [[DBG390]]
+// CHECK-DEBUG-NEXT: [[TMP4:%.*]] = load ptr, ptr [[LOOPVAR_ADDR]], align 8, !dbg [[DBG390]], !nonnull [[META12]], !align [[META47]]
 // CHECK-DEBUG-NEXT: store i32 [[ADD]], ptr [[TMP4]], align 4, !dbg [[META385]]
 // CHECK-DEBUG-NEXT: ret void, !dbg [[DBG388]]
 //
@@ -2766,7 +2794,7 @@ void parallel_for_2(float *r, int a, double b) {
 // CHECK-DEBUG-NEXT: [[TMP0:%.*]] = load ptr, ptr [[__CONTEXT_ADDR]], align 8
 // CHECK-DEBUG-NEXT: #dbg_declare(ptr [[DOTSTART]], [[META395:![0-9]+]], !DIExpression(), [[META397:![0-9]+]])
 // CHECK-DEBUG-NEXT: [[TMP1:%.*]] = getelementptr inbounds nuw [[STRUCT_ANON_11:%.*]], ptr [[TMP0]], i32 0, i32 0, !dbg [[DBG398:![0-9]+]]
-// CHECK-DEBUG-NEXT: [[TMP2:%.*]] = load ptr, ptr [[TMP1]], align 8, !dbg [[DBG398]]
+// CHECK-DEBUG-NEXT: [[TMP2:%.*]] = load ptr, ptr [[TMP1]], align 8, !dbg [[DBG398]], !nonnull [[META12]], !align [[META47]]
 // CHECK-DEBUG-NEXT: [[TMP3:%.*]] = load i32, ptr [[TMP2]], align 4, !dbg [[DBG398]]
 // CHECK-DEBUG-NEXT: store i32 [[TMP3]], ptr [[DOTSTART]], align 4, !dbg [[META397]]
 // CHECK-DEBUG-NEXT: #dbg_declare(ptr [[DOTSTOP]], [[META400:![0-9]+]], !DIExpression(), [[META401:![0-9]+]])
@@ -2791,7 +2819,7 @@ void parallel_for_2(float *r, int a, double b) {
 // CHECK-DEBUG-NEXT: br label [[COND_END]], !dbg [[META401]]
 // CHECK-DEBUG: cond.end:
 // CHECK-DEBUG-NEXT: [[COND:%.*]] = phi i32 [ [[DIV]], [[COND_TRUE]] ], [ 0, [[COND_FALSE]] ], !dbg [[META401]]
-// CHECK-DEBUG-NEXT: [[TMP10:%.*]] = load ptr, ptr [[DISTANCE_ADDR]], align 8, !dbg [[META401]]
+// CHECK-DEBUG-NEXT: [[TMP10:%.*]] = load ptr, ptr [[DISTANCE_ADDR]], align 8, !dbg [[META401]], !nonnull [[META12]], !align [[META47]]
 // CHECK-DEBUG-NEXT: store i32 [[COND]], ptr [[TMP10]], align 4, !dbg [[META401]]
 // CHECK-DEBUG-NEXT: ret void, !dbg [[DBG403:![0-9]+]]
 //
@@ -2814,7 +2842,7 @@ void parallel_for_2(float *r, int a, double b) {
 // CHECK-DEBUG-NEXT: [[TMP3:%.*]] = load i32, ptr [[LOGICAL_ADDR]], align 4, !dbg [[DBG412:![0-9]+]]
 // CHECK-DEBUG-NEXT: [[MUL:%.*]] = mul i32 1, [[TMP3]], !dbg [[DBG412]]
 // CHECK-DEBUG-NEXT: [[ADD:%.*]] = add i32 [[TMP2]], [[MUL]], !dbg [[DBG412]]
-// CHECK-DEBUG-NEXT: [[TMP4:%.*]] = load ptr, ptr [[LOOPVAR_ADDR]], align 8, !dbg [[DBG412]]
+// CHECK-DEBUG-NEXT: [[TMP4:%.*]] = load ptr, ptr [[LOOPVAR_ADDR]], align 8, !dbg [[DBG412]], !nonnull [[META12]], !align [[META47]]
 // CHECK-DEBUG-NEXT: store i32 [[ADD]], ptr [[TMP4]], align 4, !dbg [[META407]]
 // CHECK-DEBUG-NEXT: ret void, !dbg [[DBG410]]
 //
@@ -2834,7 +2862,7 @@ void parallel_for_2(float *r, int a, double b) {
 // CHECK-DEBUG-NEXT: [[TMP0:%.*]] = load ptr, ptr [[__CONTEXT_ADDR]], align 8
 // CHECK-DEBUG-NEXT: #dbg_declare(ptr [[DOTSTART]], [[META417:![0-9]+]], !DIExpression(), [[META419:![0-9]+]])
 // CHECK-DEBUG-NEXT: [[TMP1:%.*]] = getelementptr inbounds nuw [[STRUCT_ANON_13:%.*]], ptr [[TMP0]], i32 0, i32 0, !dbg [[DBG420:![0-9]+]]
-// CHECK-DEBUG-NEXT: [[TMP2:%.*]] = load ptr, ptr [[TMP1]], align 8, !dbg [[DBG420]]
+// CHECK-DEBUG-NEXT: [[TMP2:%.*]] = load ptr, ptr [[TMP1]], align 8, !dbg [[DBG420]], !nonnull [[META12]], !align [[META47]]
 // CHECK-DEBUG-NEXT: [[TMP3:%.*]] = load i32, ptr [[TMP2]], align 4, !dbg [[DBG420]]
 // CHECK-DEBUG-NEXT: store i32 [[TMP3]], ptr [[DOTSTART]], align 4, !dbg [[META419]]
 // CHECK-DEBUG-NEXT: #dbg_declare(ptr [[DOTSTOP]], [[META422:![0-9]+]], !DIExpression(), [[META423:![0-9]+]])
@@ -2859,7 +2887,7 @@ void parallel_for_2(float *r, int a, double b) {
 // CHECK-DEBUG-NEXT: br label [[COND_END]], !dbg [[META423]]
 // CHECK-DEBUG: cond.end:
 // CHECK-DEBUG-NEXT: [[COND:%.*]] = phi i32 [ [[DIV]], [[COND_TRUE]] ], [ 0, [[COND_FALSE]] ], !dbg [[META423]]
-// CHECK-DEBUG-NEXT: [[TMP10:%.*]] = load ptr, ptr [[DISTANCE_ADDR]], align 8, !dbg [[META423]]
+// CHECK-DEBUG-NEXT: [[TMP10:%.*]] = load ptr, ptr [[DISTANCE_ADDR]], align 8, !dbg [[META423]], !nonnull [[META12]], !align [[META47]]
 // CHECK-DEBUG-NEXT: store i32 [[COND]], ptr [[TMP10]], align 4, !dbg [[META423]]
 // CHECK-DEBUG-NEXT: ret void, !dbg [[DBG425:![0-9]+]]
 //
@@ -2882,7 +2910,7 @@ void parallel_for_2(float *r, int a, double b) {
 // CHECK-DEBUG-NEXT: [[TMP3:%.*]] = load i32, ptr [[LOGICAL_ADDR]], align 4, !dbg [[DBG434:![0-9]+]]
 // CHECK-DEBUG-NEXT: [[MUL:%.*]] = mul i32 1, [[TMP3]], !dbg [[DBG434]]
 // CHECK-DEBUG-NEXT: [[ADD:%.*]] = add i32 [[TMP2]], [[MUL]], !dbg [[DBG434]]
-// CHECK-DEBUG-NEXT: [[TMP4:%.*]] = load ptr, ptr [[LOOPVAR_ADDR]], align 8, !dbg [[DBG434]]
+// CHECK-DEBUG-NEXT: [[TMP4:%.*]] = load ptr, ptr [[LOOPVAR_ADDR]], align 8, !dbg [[DBG434]], !nonnull [[META12]], !align [[META47]]
 // CHECK-DEBUG-NEXT: store i32 [[ADD]], ptr [[TMP4]], align 4, !dbg [[META429]]
 // CHECK-DEBUG-NEXT: ret void, !dbg [[DBG432]]
 //
@@ -2902,7 +2930,7 @@ void parallel_for_2(float *r, int a, double b) {
 // CHECK-DEBUG-NEXT: [[TMP0:%.*]] = load ptr, ptr [[__CONTEXT_ADDR]], align 8
 // CHECK-DEBUG-NEXT: #dbg_declare(ptr [[DOTSTART]], [[META439:![0-9]+]], !DIExpression(), [[META441:![0-9]+]])
 // CHECK-DEBUG-NEXT: [[TMP1:%.*]] = getelementptr inbounds nuw [[STRUCT_ANON_15:%.*]], ptr [[TMP0]], i32 0, i32 0, !dbg [[DBG442:![0-9]+]]
-// CHECK-DEBUG-NEXT: [[TMP2:%.*]] = load ptr, ptr [[TMP1]], align 8, !dbg [[DBG442]]
+// CHECK-DEBUG-NEXT: [[TMP2:%.*]] = load ptr, ptr [[TMP1]], align 8, !dbg [[DBG442]], !nonnull [[META12]], !align [[META47]]
 // CHECK-DEBUG-NEXT: [[TMP3:%.*]] = load i32, ptr [[TMP2]], align 4, !dbg [[DBG442]]
 // CHECK-DEBUG-NEXT: store i32 [[TMP3]], ptr [[DOTSTART]], align 4, !dbg [[META441]]
 // CHECK-DEBUG-NEXT: #dbg_declare(ptr [[DOTSTOP]], [[META444:![0-9]+]], !DIExpression(), [[META445:![0-9]+]])
@@ -2927,7 +2955,7 @@ void parallel_for_2(float *r, int a, double b) {
 // CHECK-DEBUG-NEXT: br label [[COND_END]], !dbg [[META445]]
 // CHECK-DEBUG: cond.end:
 // CHECK-DEBUG-NEXT: [[COND:%.*]] = phi i32 [ [[DIV]], [[COND_TRUE]] ], [ 0, [[COND_FALSE]] ], !dbg [[META445]]
-// CHECK-DEBUG-NEXT: [[TMP10:%.*]] = load ptr, ptr [[DISTANCE_ADDR]], align 8, !dbg [[META445]]
+// CHECK-DEBUG-NEXT: [[TMP10:%.*]] = load ptr, ptr [[DISTANCE_ADDR]], align 8, !dbg [[META445]], !nonnull [[META12]], !align [[META47]]
 // CHECK-DEBUG-NEXT: store i32 [[COND]], ptr [[TMP10]], align 4, !dbg [[META445]]
 // CHECK-DEBUG-NEXT: ret void, !dbg [[DBG447:![0-9]+]]
 //
@@ -2950,7 +2978,7 @@ void parallel_for_2(float *r, int a, double b) {
 // CHECK-DEBUG-NEXT: [[TMP3:%.*]] = load i32, ptr [[LOGICAL_ADDR]], align 4, !dbg [[DBG456:![0-9]+]]
 // CHECK-DEBUG-NEXT: [[MUL:%.*]] = mul i32 1, [[TMP3]], !dbg [[DBG456]]
 // CHECK-DEBUG-NEXT: [[ADD:%.*]] = add i32 [[TMP2]], [[MUL]], !dbg [[DBG456]]
-// CHECK-DEBUG-NEXT: [[TMP4:%.*]] = load ptr, ptr [[LOOPVAR_ADDR]], align 8, !dbg [[DBG456]]
+// CHECK-DEBUG-NEXT: [[TMP4:%.*]] = load ptr, ptr [[LOOPVAR_ADDR]], align 8, !dbg [[DBG456]], !nonnull [[META12]], !align [[META47]]
 // CHECK-DEBUG-NEXT: store i32 [[ADD]], ptr [[TMP4]], align 4, !dbg [[META451]]
 // CHECK-DEBUG-NEXT: ret void, !dbg [[DBG454]]
 //
@@ -2970,7 +2998,7 @@ void parallel_for_2(float *r, int a, double b) {
 // CHECK-DEBUG-NEXT: [[TMP0:%.*]] = load ptr, ptr [[__CONTEXT_ADDR]], align 8
 // CHECK-DEBUG-NEXT: #dbg_declare(ptr [[DOTSTART]], [[META461:![0-9]+]], !DIExpression(), [[META463:![0-9]+]])
 // CHECK-DEBUG-NEXT: [[TMP1:%.*]] = getelementptr inbounds nuw [[STRUCT_ANON_17:%.*]], ptr [[TMP0]], i32 0, i32 0, !dbg [[DBG464:![0-9]+]]
-// CHECK-DEBUG-NEXT: [[TMP2:%.*]] = load ptr, ptr [[TMP1]], align 8, !dbg [[DBG464]]
+// CHECK-DEBUG-NEXT: [[TMP2:%.*]] = load ptr, ptr [[TMP1]], align 8, !dbg [[DBG464]], !nonnull [[META12]], !align [[META47]]
 // CHECK-DEBUG-NEXT:
[[TMP3:%.*]] = load i32, ptr [[TMP2]], align 4, !dbg [[DBG464]] // CHECK-DEBUG-NEXT: store i32 [[TMP3]], ptr [[DOTSTART]], align 4, !dbg [[META463]] // CHECK-DEBUG-NEXT: #dbg_declare(ptr [[DOTSTOP]], [[META466:![0-9]+]], !DIExpression(), [[META467:![0-9]+]]) @@ -2995,7 +3023,7 @@ void parallel_for_2(float *r, int a, double b) { // CHECK-DEBUG-NEXT: br label [[COND_END]], !dbg [[META467]] // CHECK-DEBUG: cond.end: // CHECK-DEBUG-NEXT: [[COND:%.*]] = phi i32 [ [[DIV]], [[COND_TRUE]] ], [ 0, [[COND_FALSE]] ], !dbg [[META467]] -// CHECK-DEBUG-NEXT: [[TMP10:%.*]] = load ptr, ptr [[DISTANCE_ADDR]], align 8, !dbg [[META467]] +// CHECK-DEBUG-NEXT: [[TMP10:%.*]] = load ptr, ptr [[DISTANCE_ADDR]], align 8, !dbg [[META467]], !nonnull [[META12]], !align [[META47]] // CHECK-DEBUG-NEXT: store i32 [[COND]], ptr [[TMP10]], align 4, !dbg [[META467]] // CHECK-DEBUG-NEXT: ret void, !dbg [[DBG469:![0-9]+]] // @@ -3018,7 +3046,7 @@ void parallel_for_2(float *r, int a, double b) { // CHECK-DEBUG-NEXT: [[TMP3:%.*]] = load i32, ptr [[LOGICAL_ADDR]], align 4, !dbg [[DBG478:![0-9]+]] // CHECK-DEBUG-NEXT: [[MUL:%.*]] = mul i32 1, [[TMP3]], !dbg [[DBG478]] // CHECK-DEBUG-NEXT: [[ADD:%.*]] = add i32 [[TMP2]], [[MUL]], !dbg [[DBG478]] -// CHECK-DEBUG-NEXT: [[TMP4:%.*]] = load ptr, ptr [[LOOPVAR_ADDR]], align 8, !dbg [[DBG478]] +// CHECK-DEBUG-NEXT: [[TMP4:%.*]] = load ptr, ptr [[LOOPVAR_ADDR]], align 8, !dbg [[DBG478]], !nonnull [[META12]], !align [[META47]] // CHECK-DEBUG-NEXT: store i32 [[ADD]], ptr [[TMP4]], align 4, !dbg [[META473]] // CHECK-DEBUG-NEXT: ret void, !dbg [[DBG476]] // diff --git a/clang/test/OpenMP/masked_codegen.cpp b/clang/test/OpenMP/masked_codegen.cpp index a39de12d69337..bc6f68de9b248 100644 --- a/clang/test/OpenMP/masked_codegen.cpp +++ b/clang/test/OpenMP/masked_codegen.cpp @@ -35,6 +35,8 @@ int main() { // ALL-NEXT: store i8 2, ptr [[A_ADDR]] // IRBUILDER-NEXT: br label %[[AFTER:[^ ,]+]] // IRBUILDER: [[AFTER]] +// IRBUILDER-NEXT: br label %[[OMP_REGION_FINALIZE:[^ ,]+]] +// IRBUILDER: [[OMP_REGION_FINALIZE]] // ALL-NEXT: call {{.*}}void @__kmpc_end_masked(ptr [[DEFAULT_LOC]], i32 [[GTID]]) // ALL-NEXT: br label {{%?}}[[EXIT]] // ALL: [[EXIT]] diff --git a/clang/test/OpenMP/master_codegen.cpp b/clang/test/OpenMP/master_codegen.cpp index a7af326caacfe..5a92444d9a927 100644 --- a/clang/test/OpenMP/master_codegen.cpp +++ b/clang/test/OpenMP/master_codegen.cpp @@ -35,6 +35,8 @@ int main() { // ALL-NEXT: store i8 2, ptr [[A_ADDR]] // IRBUILDER-NEXT: br label %[[AFTER:[^ ,]+]] // IRBUILDER: [[AFTER]] +// IRBUILDER-NEXT: br label %[[OMP_REGION_FINALIZE:[^ ,]+]] +// IRBUILDER: [[OMP_REGION_FINALIZE]] // ALL-NEXT: call {{.*}}void @__kmpc_end_master(ptr [[DEFAULT_LOC]], i32 [[GTID]]) // ALL-NEXT: br label {{%?}}[[EXIT]] // ALL: [[EXIT]] diff --git a/clang/test/OpenMP/nested_loop_codegen.cpp b/clang/test/OpenMP/nested_loop_codegen.cpp index 9aefc6a739e51..e01fd0da31ee8 100644 --- a/clang/test/OpenMP/nested_loop_codegen.cpp +++ b/clang/test/OpenMP/nested_loop_codegen.cpp @@ -904,6 +904,8 @@ int inline_decl() { // CHECK4: omp.par.region.parallel.after: // CHECK4-NEXT: br label [[OMP_PAR_PRE_FINALIZE:%.*]] // CHECK4: omp.par.pre_finalize: +// CHECK4-NEXT: br label [[FINI:%.*]] +// CHECK4: .fini: // CHECK4-NEXT: br label [[OMP_PAR_OUTLINED_EXIT_EXITSTUB:%.*]], !dbg [[DBG27]] // CHECK4: for.body: // CHECK4-NEXT: store i32 0, ptr [[LOADGEP_K]], align 4, !dbg [[DBG28:![0-9]+]] @@ -1083,6 +1085,8 @@ int inline_decl() { // CHECK4: omp.par.region.parallel.after: // CHECK4-NEXT: br label 
[[OMP_PAR_PRE_FINALIZE:%.*]] // CHECK4: omp.par.pre_finalize: +// CHECK4-NEXT: br label [[FINI:%.*]] +// CHECK4: .fini: // CHECK4-NEXT: br label [[OMP_PAR_OUTLINED_EXIT_EXITSTUB:%.*]], !dbg [[DBG90]] // CHECK4: for.body: // CHECK4-NEXT: #dbg_declare(ptr [[K]], [[META91:![0-9]+]], !DIExpression(), [[META95:![0-9]+]]) diff --git a/clang/test/OpenMP/ordered_codegen.cpp b/clang/test/OpenMP/ordered_codegen.cpp index 5cd95f1927e5c..3b29feac7caa2 100644 --- a/clang/test/OpenMP/ordered_codegen.cpp +++ b/clang/test/OpenMP/ordered_codegen.cpp @@ -794,6 +794,8 @@ void foo_simd(int low, int up) { // CHECK1-IRBUILDER-NEXT: store float [[MUL8]], ptr [[ARRAYIDX10]], align 4 // CHECK1-IRBUILDER-NEXT: br label [[OMP_INNER_FOR_BODY_ORDERED_AFTER:%.*]] // CHECK1-IRBUILDER: omp.inner.for.body.ordered.after: +// CHECK1-IRBUILDER-NEXT: br label [[OMP_REGION_FINALIZE:%.*]] +// CHECK1-IRBUILDER: omp_region.finalize: // CHECK1-IRBUILDER-NEXT: call void @__kmpc_end_ordered(ptr @[[GLOB1]], i32 [[OMP_GLOBAL_THREAD_NUM2]]) // CHECK1-IRBUILDER-NEXT: br label [[OMP_BODY_CONTINUE:%.*]] // CHECK1-IRBUILDER: omp.body.continue: @@ -884,6 +886,8 @@ void foo_simd(int low, int up) { // CHECK1-IRBUILDER-NEXT: store float [[MUL7]], ptr [[ARRAYIDX8]], align 4 // CHECK1-IRBUILDER-NEXT: br label [[OMP_INNER_FOR_BODY_ORDERED_AFTER:%.*]] // CHECK1-IRBUILDER: omp.inner.for.body.ordered.after: +// CHECK1-IRBUILDER-NEXT: br label [[OMP_REGION_FINALIZE:%.*]] +// CHECK1-IRBUILDER: omp_region.finalize: // CHECK1-IRBUILDER-NEXT: call void @__kmpc_end_ordered(ptr @[[GLOB1]], i32 [[OMP_GLOBAL_THREAD_NUM3]]) // CHECK1-IRBUILDER-NEXT: br label [[OMP_BODY_CONTINUE:%.*]] // CHECK1-IRBUILDER: omp.body.continue: @@ -1022,6 +1026,8 @@ void foo_simd(int low, int up) { // CHECK1-IRBUILDER-NEXT: store float [[MUL29]], ptr [[ARRAYIDX31]], align 4 // CHECK1-IRBUILDER-NEXT: br label [[OMP_INNER_FOR_BODY_ORDERED_AFTER:%.*]] // CHECK1-IRBUILDER: omp.inner.for.body.ordered.after: +// CHECK1-IRBUILDER-NEXT: br label [[OMP_REGION_FINALIZE:%.*]] +// CHECK1-IRBUILDER: omp_region.finalize: // CHECK1-IRBUILDER-NEXT: call void @__kmpc_end_ordered(ptr @[[GLOB1]], i32 [[OMP_GLOBAL_THREAD_NUM23]]) // CHECK1-IRBUILDER-NEXT: br label [[OMP_BODY_CONTINUE:%.*]] // CHECK1-IRBUILDER: omp.body.continue: @@ -1131,6 +1137,8 @@ void foo_simd(int low, int up) { // CHECK1-IRBUILDER-NEXT: store float [[MUL14]], ptr [[ARRAYIDX16]], align 4 // CHECK1-IRBUILDER-NEXT: br label [[OMP_INNER_FOR_BODY_ORDERED_AFTER:%.*]] // CHECK1-IRBUILDER: omp.inner.for.body.ordered.after: +// CHECK1-IRBUILDER-NEXT: br label [[OMP_REGION_FINALIZE:%.*]] +// CHECK1-IRBUILDER: omp_region.finalize: // CHECK1-IRBUILDER-NEXT: call void @__kmpc_end_ordered(ptr @[[GLOB1]], i32 [[OMP_GLOBAL_THREAD_NUM8]]) // CHECK1-IRBUILDER-NEXT: br label [[OMP_BODY_CONTINUE:%.*]] // CHECK1-IRBUILDER: omp.body.continue: @@ -1296,17 +1304,19 @@ void foo_simd(int low, int up) { // CHECK1-IRBUILDER-NEXT: call void @__captured_stmt.1(ptr [[I28]]) // CHECK1-IRBUILDER-NEXT: br label [[OMP_INNER_FOR_BODY33_ORDERED_AFTER:%.*]] // CHECK1-IRBUILDER: omp.inner.for.body33.ordered.after: -// CHECK1-IRBUILDER-NEXT: br label [[OMP_BODY_CONTINUE38:%.*]] -// CHECK1-IRBUILDER: omp.body.continue38: -// CHECK1-IRBUILDER-NEXT: br label [[OMP_INNER_FOR_INC39:%.*]] -// CHECK1-IRBUILDER: omp.inner.for.inc39: +// CHECK1-IRBUILDER-NEXT: br label [[OMP_REGION_FINALIZE38:%.*]] +// CHECK1-IRBUILDER: omp_region.finalize38: +// CHECK1-IRBUILDER-NEXT: br label [[OMP_BODY_CONTINUE39:%.*]] +// CHECK1-IRBUILDER: omp.body.continue39: +// CHECK1-IRBUILDER-NEXT: br 
label [[OMP_INNER_FOR_INC40:%.*]] +// CHECK1-IRBUILDER: omp.inner.for.inc40: // CHECK1-IRBUILDER-NEXT: [[TMP32:%.*]] = load i32, ptr [[DOTOMP_IV16]], align 4 // CHECK1-IRBUILDER-NEXT: [[ADD40:%.*]] = add i32 [[TMP32]], 1 // CHECK1-IRBUILDER-NEXT: store i32 [[ADD40]], ptr [[DOTOMP_IV16]], align 4 // CHECK1-IRBUILDER-NEXT: [[OMP_GLOBAL_THREAD_NUM41:%.*]] = call i32 @__kmpc_global_thread_num(ptr @[[GLOB12]]) // CHECK1-IRBUILDER-NEXT: call void @__kmpc_dispatch_fini_4u(ptr @[[GLOB1]], i32 [[OMP_GLOBAL_THREAD_NUM41]]) // CHECK1-IRBUILDER-NEXT: br label [[OMP_INNER_FOR_COND30]], !llvm.loop [[LOOP5:![0-9]+]] -// CHECK1-IRBUILDER: omp.inner.for.end42: +// CHECK1-IRBUILDER: omp.inner.for.end43: // CHECK1-IRBUILDER-NEXT: br label [[OMP_DISPATCH_INC:%.*]] // CHECK1-IRBUILDER: omp.dispatch.inc: // CHECK1-IRBUILDER-NEXT: br label [[OMP_DISPATCH_COND]] @@ -2034,6 +2044,8 @@ void foo_simd(int low, int up) { // CHECK3-IRBUILDER-NEXT: store float [[MUL8]], ptr [[ARRAYIDX10]], align 4 // CHECK3-IRBUILDER-NEXT: br label [[OMP_INNER_FOR_BODY_ORDERED_AFTER:%.*]] // CHECK3-IRBUILDER: omp.inner.for.body.ordered.after: +// CHECK3-IRBUILDER-NEXT: br label [[OMP_REGION_FINALIZE:%.*]] +// CHECK3-IRBUILDER: omp_region.finalize: // CHECK3-IRBUILDER-NEXT: call void @__kmpc_end_ordered(ptr @[[GLOB1]], i32 [[OMP_GLOBAL_THREAD_NUM2]]) // CHECK3-IRBUILDER-NEXT: br label [[OMP_BODY_CONTINUE:%.*]] // CHECK3-IRBUILDER: omp.body.continue: @@ -2124,6 +2136,8 @@ void foo_simd(int low, int up) { // CHECK3-IRBUILDER-NEXT: store float [[MUL7]], ptr [[ARRAYIDX8]], align 4 // CHECK3-IRBUILDER-NEXT: br label [[OMP_INNER_FOR_BODY_ORDERED_AFTER:%.*]] // CHECK3-IRBUILDER: omp.inner.for.body.ordered.after: +// CHECK3-IRBUILDER-NEXT: br label [[OMP_REGION_FINALIZE:%.*]] +// CHECK3-IRBUILDER: omp_region.finalize: // CHECK3-IRBUILDER-NEXT: call void @__kmpc_end_ordered(ptr @[[GLOB1]], i32 [[OMP_GLOBAL_THREAD_NUM3]]) // CHECK3-IRBUILDER-NEXT: br label [[OMP_BODY_CONTINUE:%.*]] // CHECK3-IRBUILDER: omp.body.continue: @@ -2262,6 +2276,8 @@ void foo_simd(int low, int up) { // CHECK3-IRBUILDER-NEXT: store float [[MUL29]], ptr [[ARRAYIDX31]], align 4 // CHECK3-IRBUILDER-NEXT: br label [[OMP_INNER_FOR_BODY_ORDERED_AFTER:%.*]] // CHECK3-IRBUILDER: omp.inner.for.body.ordered.after: +// CHECK3-IRBUILDER-NEXT: br label [[OMP_REGION_FINALIZE:%.*]] +// CHECK3-IRBUILDER: omp_region.finalize: // CHECK3-IRBUILDER-NEXT: call void @__kmpc_end_ordered(ptr @[[GLOB1]], i32 [[OMP_GLOBAL_THREAD_NUM23]]) // CHECK3-IRBUILDER-NEXT: br label [[OMP_BODY_CONTINUE:%.*]] // CHECK3-IRBUILDER: omp.body.continue: @@ -2371,6 +2387,8 @@ void foo_simd(int low, int up) { // CHECK3-IRBUILDER-NEXT: store float [[MUL14]], ptr [[ARRAYIDX16]], align 4 // CHECK3-IRBUILDER-NEXT: br label [[OMP_INNER_FOR_BODY_ORDERED_AFTER:%.*]] // CHECK3-IRBUILDER: omp.inner.for.body.ordered.after: +// CHECK3-IRBUILDER-NEXT: br label [[OMP_REGION_FINALIZE:%.*]] +// CHECK3-IRBUILDER: omp_region.finalize: // CHECK3-IRBUILDER-NEXT: call void @__kmpc_end_ordered(ptr @[[GLOB1]], i32 [[OMP_GLOBAL_THREAD_NUM8]]) // CHECK3-IRBUILDER-NEXT: br label [[OMP_BODY_CONTINUE:%.*]] // CHECK3-IRBUILDER: omp.body.continue: @@ -2536,17 +2554,19 @@ void foo_simd(int low, int up) { // CHECK3-IRBUILDER-NEXT: call void @__captured_stmt.1(ptr [[I28]]) // CHECK3-IRBUILDER-NEXT: br label [[OMP_INNER_FOR_BODY33_ORDERED_AFTER:%.*]] // CHECK3-IRBUILDER: omp.inner.for.body33.ordered.after: -// CHECK3-IRBUILDER-NEXT: br label [[OMP_BODY_CONTINUE38:%.*]] -// CHECK3-IRBUILDER: omp.body.continue38: -// CHECK3-IRBUILDER-NEXT: br label 
[[OMP_INNER_FOR_INC39:%.*]] -// CHECK3-IRBUILDER: omp.inner.for.inc39: +// CHECK3-IRBUILDER-NEXT: br label [[OMP_REGION_FINALIZE38:%.*]] +// CHECK3-IRBUILDER: omp_region.finalize38: +// CHECK3-IRBUILDER-NEXT: br label [[OMP_BODY_CONTINUE39:%.*]] +// CHECK3-IRBUILDER: omp.body.continue39: +// CHECK3-IRBUILDER-NEXT: br label [[OMP_INNER_FOR_INC40:%.*]] +// CHECK3-IRBUILDER: omp.inner.for.inc40: // CHECK3-IRBUILDER-NEXT: [[TMP32:%.*]] = load i32, ptr [[DOTOMP_IV16]], align 4 // CHECK3-IRBUILDER-NEXT: [[ADD40:%.*]] = add i32 [[TMP32]], 1 // CHECK3-IRBUILDER-NEXT: store i32 [[ADD40]], ptr [[DOTOMP_IV16]], align 4 // CHECK3-IRBUILDER-NEXT: [[OMP_GLOBAL_THREAD_NUM41:%.*]] = call i32 @__kmpc_global_thread_num(ptr @[[GLOB12]]) // CHECK3-IRBUILDER-NEXT: call void @__kmpc_dispatch_fini_4u(ptr @[[GLOB1]], i32 [[OMP_GLOBAL_THREAD_NUM41]]) // CHECK3-IRBUILDER-NEXT: br label [[OMP_INNER_FOR_COND30]], !llvm.loop [[LOOP5:![0-9]+]] -// CHECK3-IRBUILDER: omp.inner.for.end42: +// CHECK3-IRBUILDER: omp.inner.for.end43: // CHECK3-IRBUILDER-NEXT: br label [[OMP_DISPATCH_INC:%.*]] // CHECK3-IRBUILDER: omp.dispatch.inc: // CHECK3-IRBUILDER-NEXT: br label [[OMP_DISPATCH_COND]] diff --git a/clang/test/OpenMP/parallel_codegen.cpp b/clang/test/OpenMP/parallel_codegen.cpp index e8e57aedaa164..9f6004e37db9c 100644 --- a/clang/test/OpenMP/parallel_codegen.cpp +++ b/clang/test/OpenMP/parallel_codegen.cpp @@ -906,6 +906,8 @@ int main (int argc, char **argv) { // CHECK4: omp.par.region.parallel.after: // CHECK4-NEXT: br label [[OMP_PAR_PRE_FINALIZE:%.*]] // CHECK4: omp.par.pre_finalize: +// CHECK4-NEXT: br label [[FINI:%.*]] +// CHECK4: .fini: // CHECK4-NEXT: br label [[OMP_PAR_OUTLINED_EXIT_EXITSTUB:%.*]], !dbg [[DBG35]] // CHECK4: omp.par.exit.exitStub: // CHECK4-NEXT: ret void @@ -975,6 +977,8 @@ int main (int argc, char **argv) { // CHECK4: omp.par.region.parallel.after: // CHECK4-NEXT: br label [[OMP_PAR_PRE_FINALIZE:%.*]] // CHECK4: omp.par.pre_finalize: +// CHECK4-NEXT: br label [[FINI:%.*]] +// CHECK4: .fini: // CHECK4-NEXT: br label [[OMP_PAR_OUTLINED_EXIT_EXITSTUB:%.*]], !dbg [[DBG66]] // CHECK4: omp.par.exit.exitStub: // CHECK4-NEXT: ret void diff --git a/clang/test/OpenMP/target_update_codegen.cpp b/clang/test/OpenMP/target_update_codegen.cpp index c8211f475c7fc..6c754c1c953ea 100644 --- a/clang/test/OpenMP/target_update_codegen.cpp +++ b/clang/test/OpenMP/target_update_codegen.cpp @@ -1560,5 +1560,37 @@ void foo(int arg) { { ++arg; } } +#endif +// RUN: %clang_cc1 -DCK26 -verify -Wno-vla -fopenmp -fopenmp-version=51 -fopenmp-targets=powerpc64le-ibm-linux-gnu -x c++ -triple powerpc64le-unknown-unknown -emit-llvm %s -o - | FileCheck %s --check-prefix CK26 --check-prefix CK26-64 +// RUN: %clang_cc1 -DCK26 -fopenmp -fopenmp-version=51 -fopenmp-targets=powerpc64le-ibm-linux-gnu -x c++ -std=c++11 -triple powerpc64le-unknown-unknown -emit-pch -o %t %s +// RUN: %clang_cc1 -fopenmp -fopenmp-version=51 -fopenmp-targets=powerpc64le-ibm-linux-gnu -x c++ -triple powerpc64le-unknown-unknown -std=c++11 -include-pch %t -verify -Wno-vla %s -emit-llvm -o - | FileCheck %s --check-prefix CK26 --check-prefix CK26-64 +// RUN: %clang_cc1 -DCK26 -fopenmp-version=51 -verify -Wno-vla -fopenmp -fopenmp-targets=i386-pc-linux-gnu -x c++ -triple i386-unknown-unknown -emit-llvm %s -o - | FileCheck %s --check-prefix CK26 --check-prefix CK26-32 +// RUN: %clang_cc1 -DCK26 -fopenmp -fopenmp-version=51 -fopenmp-targets=i386-pc-linux-gnu -x c++ -std=c++11 -triple i386-unknown-unknown -emit-pch -o %t %s +// RUN: %clang_cc1 -fopenmp 
-fopenmp-version=51 -fopenmp-targets=i386-pc-linux-gnu -x c++ -triple i386-unknown-unknown -std=c++11 -include-pch %t -verify -Wno-vla %s -emit-llvm -o - | FileCheck %s --check-prefix CK26 --check-prefix CK26-32 + +// RUN: %clang_cc1 -DCK26 -verify -Wno-vla -fopenmp-simd -fopenmp-version=51 -fopenmp-targets=powerpc64le-ibm-linux-gnu -x c++ -triple powerpc64le-unknown-unknown -emit-llvm %s -o - | FileCheck --check-prefix SIMD-ONLY19 %s +// RUN: %clang_cc1 -DCK26 -fopenmp-simd -fopenmp-version=51 -fopenmp-targets=powerpc64le-ibm-linux-gnu -x c++ -std=c++11 -triple powerpc64le-unknown-unknown -emit-pch -o %t %s +// RUN: %clang_cc1 -fopenmp-simd -fopenmp-version=51 -fopenmp-targets=powerpc64le-ibm-linux-gnu -x c++ -triple powerpc64le-unknown-unknown -std=c++11 -include-pch %t -verify -Wno-vla %s -emit-llvm -o - | FileCheck --check-prefix SIMD-ONLY19 %s +// RUN: %clang_cc1 -DCK26 -verify -Wno-vla -fopenmp-simd -fopenmp-version=51 -fopenmp-targets=i386-pc-linux-gnu -x c++ -triple i386-unknown-unknown -emit-llvm %s -o - | FileCheck --check-prefix SIMD-ONLY19 %s +// RUN: %clang_cc1 -DCK26 -fopenmp-simd -fopenmp-version=51 -fopenmp-targets=i386-pc-linux-gnu -x c++ -std=c++11 -triple i386-unknown-unknown -emit-pch -o %t %s +// RUN: %clang_cc1 -fopenmp-simd -fopenmp-version=51 -fopenmp-targets=i386-pc-linux-gnu -x c++ -triple i386-unknown-unknown -std=c++11 -include-pch %t -verify -Wno-vla %s -emit-llvm -o - | FileCheck --check-prefix SIMD-ONLY19 %s +// SIMD-ONLY19-NOT: {{__kmpc|__tgt}} +#ifdef CK26 +void foo() { +int a[10]; +#pragma omp target update to(iterator(int it = 0:10) : a[it]) +// CK26-LABEL: define {{.+}}foo +// CK26: %[[ITER:[a-zA-Z0-9_]+]] = alloca i32, align 4 +// CK26: %[[LOAD2:.*]] = load i32, ptr %[[ITER]], align 4 +} + +void foo1() { +int a[10]; +#pragma omp target update from(iterator(int it = 0:10) : a[it]) +// CK26-LABEL: define {{.+}}foo1 +// CK26: %[[ITER:[a-zA-Z0-9_]+]] = alloca i32, align 4 +// CK26: %[[LOAD2:.*]] = load i32, ptr %[[ITER]], align 4 +} + #endif #endif diff --git a/clang/test/OpenMP/target_update_iterator_ast_print.cpp b/clang/test/OpenMP/target_update_iterator_ast_print.cpp new file mode 100644 index 0000000000000..322f565c9c732 --- /dev/null +++ b/clang/test/OpenMP/target_update_iterator_ast_print.cpp @@ -0,0 +1,16 @@ +// RUN: %clang_cc1 -verify -fopenmp -fopenmp-version=51 -ast-print %s | FileCheck %s +// expected-no-diagnostics + +#ifndef HEADER +#define HEADER + +void test() { + int a[10]; + #pragma omp target update to(iterator(int it = 0:10): a[it]) + // CHECK: int a[10]; + // CHECK: #pragma omp target update to(iterator(int it = 0:10): a[it]) + #pragma omp target update from(iterator(int it = 0:10): a[it]) + // CHECK: #pragma omp target update from(iterator(int it = 0:10): a[it]) +} + +#endif diff --git a/clang/test/OpenMP/target_update_iterator_serialization.cpp b/clang/test/OpenMP/target_update_iterator_serialization.cpp new file mode 100644 index 0000000000000..c1ad380f7c9a5 --- /dev/null +++ b/clang/test/OpenMP/target_update_iterator_serialization.cpp @@ -0,0 +1,35 @@ +// Test without serialization: +// RUN: %clang_cc1 -std=c++20 -fopenmp %s -ast-dump | FileCheck %s + +// Test with serialization: +// RUN: %clang_cc1 -std=c++20 -fopenmp -emit-pch -o %t %s +// RUN: %clang_cc1 -x c++ -std=c++20 -fopenmp -include-pch %t -ast-dump-all /dev/null \ +// RUN: | sed -e "s/ <undeserialized declarations>//" -e "s/ imported//" \ +// RUN: | FileCheck %s + +// CHECK: OMPTargetUpdateDirective +// CHECK-NEXT: OMPFromClause +// CHECK-NEXT: ArraySubscriptExpr +// CHECK: DeclRefExpr {{.*}} 'a' +// 
CHECK: DeclRefExpr {{.*}} 'it' + + +void foo1() { + int a[10]; + +#pragma omp target update from(iterator(int it = 0:10) : a[it]) + ; +} + +// CHECK: OMPTargetUpdateDirective +// CHECK-NEXT: OMPToClause +// CHECK-NEXT: ArraySubscriptExpr +// CHECK: DeclRefExpr {{.*}} 'a' +// CHECK: DeclRefExpr {{.*}} 'it' + +void foo2() { + int a[10]; + +#pragma omp target update to(iterator(int it = 0:10) : a[it]) + ; +} diff --git a/flang/test/Integration/OpenMP/parallel-private-reduction-worstcase.f90 b/flang/test/Integration/OpenMP/parallel-private-reduction-worstcase.f90 index cf77c46346b7f..fd59d39b552da 100644 --- a/flang/test/Integration/OpenMP/parallel-private-reduction-worstcase.f90 +++ b/flang/test/Integration/OpenMP/parallel-private-reduction-worstcase.f90 @@ -174,10 +174,13 @@ subroutine worst_case(a, b, c, d) ! CHECK-NEXT: br label %omp.par.pre_finalize ! CHECK: omp.par.pre_finalize: ; preds = %reduce.finalize +! CHECK-NEXT: br label %.fini + +! CHECK: .fini: ! CHECK-NEXT: %{{.*}} = load ptr, ptr ! CHECK-NEXT: br label %omp.reduction.cleanup -! CHECK: omp.reduction.cleanup: ; preds = %omp.par.pre_finalize +! CHECK: omp.reduction.cleanup: ; preds = %.fini ! [null check] ! CHECK: br i1 %{{.*}}, label %omp.reduction.cleanup43, label %omp.reduction.cleanup44 diff --git a/lldb/tools/debugserver/source/DNB.cpp b/lldb/tools/debugserver/source/DNB.cpp index 0cd48d91a682a..4d5afcf93a44b 100644 --- a/lldb/tools/debugserver/source/DNB.cpp +++ b/lldb/tools/debugserver/source/DNB.cpp @@ -1101,7 +1101,7 @@ DNBGetLibrariesInfoForAddresses(nub_process_t pid, JSONGenerator::ObjectSP DNBGetSharedCacheInfo(nub_process_t pid) { MachProcessSP procSP; if (GetProcessSP(pid, procSP)) { - return procSP->GetSharedCacheInfo(pid); + return procSP->GetInferiorSharedCacheInfo(pid); } return JSONGenerator::ObjectSP(); } diff --git a/lldb/tools/debugserver/source/MacOSX/MachProcess.h b/lldb/tools/debugserver/source/MacOSX/MachProcess.h index 56bc9d6c7461e..67b27b9902999 100644 --- a/lldb/tools/debugserver/source/MacOSX/MachProcess.h +++ b/lldb/tools/debugserver/source/MacOSX/MachProcess.h @@ -283,7 +283,10 @@ class MachProcess { JSONGenerator::ObjectSP GetAllLoadedLibrariesInfos(nub_process_t pid, bool fetch_report_load_commands); - JSONGenerator::ObjectSP GetSharedCacheInfo(nub_process_t pid); + bool GetDebugserverSharedCacheInfo(uuid_t &uuid, + std::string &shared_cache_path); + bool GetInferiorSharedCacheFilepath(std::string &inferior_sc_path); + JSONGenerator::ObjectSP GetInferiorSharedCacheInfo(nub_process_t pid); nub_size_t GetNumThreads() const; nub_thread_t GetThreadAtIndex(nub_size_t thread_idx) const; @@ -474,6 +477,14 @@ class MachProcess { void *(*m_dyld_process_info_create)(task_t task, uint64_t timestamp, kern_return_t *kernelError); + void *(*m_dyld_process_create_for_task)(task_read_t task, kern_return_t *kr); + void *(*m_dyld_process_snapshot_create_for_process)(void *process, + kern_return_t *kr); + void *(*m_dyld_process_snapshot_get_shared_cache)(void *snapshot); + void (*m_dyld_shared_cache_for_each_file)( + void *cache, void (^block)(const char *file_path)); + void (*m_dyld_process_snapshot_dispose)(void *snapshot); + void (*m_dyld_process_dispose)(void *process); void (*m_dyld_process_info_for_each_image)( void *info, void (^callback)(uint64_t machHeaderAddress, const uuid_t uuid, const char *path)); @@ -481,6 +492,7 @@ class MachProcess { void (*m_dyld_process_info_get_cache)(void *info, void *cacheInfo); uint32_t (*m_dyld_process_info_get_platform)(void *info); void 
(*m_dyld_process_info_get_state)(void *info, void *stateInfo); + const char *(*m_dyld_shared_cache_file_path)(); }; #endif // LLDB_TOOLS_DEBUGSERVER_SOURCE_MACOSX_MACHPROCESS_H diff --git a/lldb/tools/debugserver/source/MacOSX/MachProcess.mm b/lldb/tools/debugserver/source/MacOSX/MachProcess.mm index 3b875e61a268d..10ed8045a9211 100644 --- a/lldb/tools/debugserver/source/MacOSX/MachProcess.mm +++ b/lldb/tools/debugserver/source/MacOSX/MachProcess.mm @@ -534,13 +534,35 @@ static bool FBSAddEventDataToOptions(NSMutableDictionary *options, m_image_infos_baton(NULL), m_sent_interrupt_signo(0), m_auto_resume_signo(0), m_did_exec(false), m_dyld_process_info_create(nullptr), + m_dyld_process_create_for_task(nullptr), + m_dyld_process_snapshot_create_for_process(nullptr), + m_dyld_process_snapshot_get_shared_cache(nullptr), + m_dyld_shared_cache_for_each_file(nullptr), + m_dyld_process_snapshot_dispose(nullptr), m_dyld_process_dispose(nullptr), m_dyld_process_info_for_each_image(nullptr), m_dyld_process_info_release(nullptr), m_dyld_process_info_get_cache(nullptr), - m_dyld_process_info_get_state(nullptr) { + m_dyld_process_info_get_state(nullptr), + m_dyld_shared_cache_file_path(nullptr) { m_dyld_process_info_create = (void *(*)(task_t task, uint64_t timestamp, kern_return_t * kernelError)) dlsym(RTLD_DEFAULT, "_dyld_process_info_create"); + + m_dyld_process_create_for_task = + (void *(*)(task_read_t, kern_return_t *))dlsym( + RTLD_DEFAULT, "dyld_process_create_for_task"); + m_dyld_process_snapshot_create_for_process = + (void *(*)(void *, kern_return_t *))dlsym( + RTLD_DEFAULT, "dyld_process_snapshot_create_for_process"); + m_dyld_process_snapshot_get_shared_cache = (void *(*)(void *))dlsym( + RTLD_DEFAULT, "dyld_process_snapshot_get_shared_cache"); + m_dyld_shared_cache_for_each_file = + (void (*)(void *, void (^)(const char *)))dlsym( + RTLD_DEFAULT, "dyld_shared_cache_for_each_file"); + m_dyld_process_snapshot_dispose = + (void (*)(void *))dlsym(RTLD_DEFAULT, "dyld_process_snapshot_dispose"); + m_dyld_process_dispose = + (void (*)(void *))dlsym(RTLD_DEFAULT, "dyld_process_dispose"); m_dyld_process_info_for_each_image = (void (*)(void *info, void (^)(uint64_t machHeaderAddress, const uuid_t uuid, const char *path))) @@ -553,6 +575,8 @@ static bool FBSAddEventDataToOptions(NSMutableDictionary *options, RTLD_DEFAULT, "_dyld_process_info_get_platform"); m_dyld_process_info_get_state = (void (*)(void *info, void *stateInfo))dlsym( RTLD_DEFAULT, "_dyld_process_info_get_state"); + m_dyld_shared_cache_file_path = + (const char *(*)())dlsym(RTLD_DEFAULT, "dyld_shared_cache_file_path"); DNBLogThreadedIf(LOG_PROCESS | LOG_VERBOSE, "%s", __PRETTY_FUNCTION__); } @@ -1179,13 +1203,82 @@ static bool mach_header_validity_test(uint32_t magic, uint32_t cputype) { /* report_load_commands = */ true); } -// From dyld's internal podyld_process_info.h: +bool MachProcess::GetDebugserverSharedCacheInfo( + uuid_t &uuid, std::string &shared_cache_path) { + uuid_clear(uuid); + shared_cache_path.clear(); + + if (m_dyld_process_info_create && m_dyld_process_info_get_cache) { + kern_return_t kern_ret; + dyld_process_info info = + m_dyld_process_info_create(mach_task_self(), 0, &kern_ret); + if (info) { + struct dyld_process_cache_info shared_cache_info; + m_dyld_process_info_get_cache(info, &shared_cache_info); + uuid_copy(uuid, shared_cache_info.cacheUUID); + m_dyld_process_info_release(info); + } + } + if (m_dyld_shared_cache_file_path) { + const char *cache_path = m_dyld_shared_cache_file_path(); + if (cache_path) + 
shared_cache_path = cache_path; + } + if (!uuid_is_null(uuid)) + return true; + return false; +} + +bool MachProcess::GetInferiorSharedCacheFilepath( + std::string &inferior_sc_path) { + inferior_sc_path.clear(); + + if (!m_dyld_process_create_for_task || + !m_dyld_process_snapshot_create_for_process || + !m_dyld_process_snapshot_get_shared_cache || + !m_dyld_shared_cache_for_each_file || !m_dyld_process_snapshot_dispose || + !m_dyld_process_dispose) + return false; + + __block std::string sc_path; + kern_return_t kr; + void *process = m_dyld_process_create_for_task(m_task.TaskPort(), &kr); + if (kr != KERN_SUCCESS) + return false; + void *snapshot = m_dyld_process_snapshot_create_for_process(process, &kr); + if (kr != KERN_SUCCESS) + return false; + void *cache = m_dyld_process_snapshot_get_shared_cache(snapshot); + + // The shared cache is a collection of files on disk; this callback + // will iterate over all of them. + // The first filepath provided is the base filename of the cache. + __block bool done = false; + m_dyld_shared_cache_for_each_file(cache, ^(const char *path) { + if (done) { + return; + } + done = true; + sc_path = path; + }); + m_dyld_process_snapshot_dispose(snapshot); + m_dyld_process_dispose(process); + + inferior_sc_path = sc_path; + if (!sc_path.empty()) + return true; + return false; +} + +// From dyld's internal dyld_process_info.h: -JSONGenerator::ObjectSP MachProcess::GetSharedCacheInfo(nub_process_t pid) { +JSONGenerator::ObjectSP +MachProcess::GetInferiorSharedCacheInfo(nub_process_t pid) { JSONGenerator::DictionarySP reply_sp(new JSONGenerator::Dictionary()); - kern_return_t kern_ret; + uuid_t inferior_sc_uuid; if (m_dyld_process_info_create && m_dyld_process_info_get_cache) { + kern_return_t kern_ret; dyld_process_info info = m_dyld_process_info_create(m_task.TaskPort(), 0, &kern_ret); if (info) { @@ -1197,6 +1290,7 @@ static bool mach_header_validity_test(uint32_t magic, uint32_t cputype) { uuid_string_t uuidstr; uuid_unparse_upper(shared_cache_info.cacheUUID, uuidstr); + uuid_copy(inferior_sc_uuid, shared_cache_info.cacheUUID); reply_sp->AddStringItem("shared_cache_uuid", uuidstr); reply_sp->AddBooleanItem("no_shared_cache", shared_cache_info.noCache); @@ -1206,6 +1300,29 @@ static bool mach_header_validity_test(uint32_t magic, uint32_t cputype) { m_dyld_process_info_release(info); } } + + // If debugserver and the inferior have the same cache UUID, + // use the simple call to get the filepath to debugserver's shared + // cache, and return that. + uuid_t debugserver_sc_uuid; + std::string debugserver_sc_path; + bool found_sc_filepath = false; + if (GetDebugserverSharedCacheInfo(debugserver_sc_uuid, debugserver_sc_path)) { + if (uuid_compare(inferior_sc_uuid, debugserver_sc_uuid) == 0 && + !debugserver_sc_path.empty()) { + reply_sp->AddStringItem("shared_cache_path", debugserver_sc_path); + found_sc_filepath = true; + } + } + + // Use SPI that are only available on newer OSes to fetch the + // filepath of the shared cache of the inferior, if available. 
+ if (!found_sc_filepath) { + std::string inferior_sc_path; + if (GetInferiorSharedCacheFilepath(inferior_sc_path)) + reply_sp->AddStringItem("shared_cache_path", inferior_sc_path); + } + return reply_sp; } diff --git a/llvm/include/llvm/Analysis/DependenceAnalysis.h b/llvm/include/llvm/Analysis/DependenceAnalysis.h index 8286d8e8e45cc..ad46d2f1466cf 100644 --- a/llvm/include/llvm/Analysis/DependenceAnalysis.h +++ b/llvm/include/llvm/Analysis/DependenceAnalysis.h @@ -506,17 +506,6 @@ class DependenceInfo { bool isKnownPredicate(ICmpInst::Predicate Pred, const SCEV *X, const SCEV *Y) const; - /// isKnownLessThan - Compare to see if S is less than Size - /// Another wrapper for isKnownNegative(S - max(Size, 1)) with some extra - /// checking if S is an AddRec and we can prove lessthan using the loop - /// bounds. - bool isKnownLessThan(const SCEV *S, const SCEV *Size) const; - - /// isKnownNonNegative - Compare to see if S is known not to be negative - /// Uses the fact that S comes from Ptr, which may be an inbound GEP, - /// Proving there is no wrapping going on. - bool isKnownNonNegative(const SCEV *S, const Value *Ptr) const; - /// collectUpperBound - All subscripts are the same type (on my machine, /// an i64). The loop bound may be a smaller type. collectUpperBound /// find the bound, if available, and zero extends it to the Type T. diff --git a/llvm/include/llvm/CodeGen/SelectionDAG.h b/llvm/include/llvm/CodeGen/SelectionDAG.h index 501cbc947132e..21f622ea471e1 100644 --- a/llvm/include/llvm/CodeGen/SelectionDAG.h +++ b/llvm/include/llvm/CodeGen/SelectionDAG.h @@ -1185,11 +1185,11 @@ class SelectionDAG { SDValue getPOISON(EVT VT) { return getNode(ISD::POISON, SDLoc(), VT); } /// Return a node that represents the runtime scaling 'MulImm * RuntimeVL'. - LLVM_ABI SDValue getVScale(const SDLoc &DL, EVT VT, APInt MulImm, - bool ConstantFold = true); + LLVM_ABI SDValue getVScale(const SDLoc &DL, EVT VT, APInt MulImm); - LLVM_ABI SDValue getElementCount(const SDLoc &DL, EVT VT, ElementCount EC, - bool ConstantFold = true); + LLVM_ABI SDValue getElementCount(const SDLoc &DL, EVT VT, ElementCount EC); + + LLVM_ABI SDValue getTypeSize(const SDLoc &DL, EVT VT, TypeSize TS); /// Return a GLOBAL_OFFSET_TABLE node. This does not have a useful SDLoc. SDValue getGLOBAL_OFFSET_TABLE(EVT VT) { diff --git a/llvm/include/llvm/Frontend/OpenMP/OMPIRBuilder.h b/llvm/include/llvm/Frontend/OpenMP/OMPIRBuilder.h index 06f4fd30664a8..fa29d4afbec6f 100644 --- a/llvm/include/llvm/Frontend/OpenMP/OMPIRBuilder.h +++ b/llvm/include/llvm/Frontend/OpenMP/OMPIRBuilder.h @@ -578,16 +578,33 @@ class OpenMPIRBuilder { using FinalizeCallbackTy = std::function; struct FinalizationInfo { - /// The finalization callback provided by the last in-flight invocation of - /// createXXXX for the directive of kind DK. - FinalizeCallbackTy FiniCB; - + FinalizationInfo(FinalizeCallbackTy FiniCB, omp::Directive DK, + bool IsCancellable) + : DK(DK), IsCancellable(IsCancellable), FiniCB(std::move(FiniCB)) {} /// The directive kind of the innermost directive that has an associated /// region which might require finalization when it is left. - omp::Directive DK; + const omp::Directive DK; /// Flag to indicate if the directive is cancellable. - bool IsCancellable; + const bool IsCancellable; + + /// The basic block to which control should be transferred to + /// implement the FiniCB. Memoized to avoid generating finalization + /// multiple times. 
+ Expected getFiniBB(IRBuilderBase &Builder); + + /// For cases where there is an unavoidable existing finalization block + /// (e.g. loop finalization after omp sections). The existing finalization + /// block must not contain any non-finalization code. + Error mergeFiniBB(IRBuilderBase &Builder, BasicBlock *ExistingFiniBB); + + private: + /// Access via getFiniBB. + BasicBlock *FiniBB = nullptr; + + /// The finalization callback provided by the last in-flight invocation of + /// createXXXX for the directive of kind DK. + FinalizeCallbackTy FiniCB; }; /// Push a finalization callback on the finalization stack. @@ -2268,8 +2285,7 @@ class OpenMPIRBuilder { /// /// \return an error, if any were triggered during execution. LLVM_ABI Error emitCancelationCheckImpl(Value *CancelFlag, - omp::Directive CanceledDirective, - FinalizeCallbackTy ExitCB = {}); + omp::Directive CanceledDirective); /// Generate a target region entry call. /// @@ -3496,7 +3512,8 @@ class OpenMPIRBuilder { /// Common interface to finalize the region /// /// \param OMPD Directive to generate exiting code for - /// \param FinIP Insertion point for emitting Finalization code and exit call + /// \param FinIP Insertion point for emitting Finalization code and exit call. + /// This block must not contain any non-finalization code. /// \param ExitCall Call to the ending OMP Runtime Function /// \param HasFinalize indicate if the directive will require finalization /// and has a finalization callback in the stack that diff --git a/llvm/lib/CodeGen/SelectionDAG/LegalizeVectorTypes.cpp b/llvm/lib/CodeGen/SelectionDAG/LegalizeVectorTypes.cpp index 4274e951446b8..53b7aede7b4a5 100644 --- a/llvm/lib/CodeGen/SelectionDAG/LegalizeVectorTypes.cpp +++ b/llvm/lib/CodeGen/SelectionDAG/LegalizeVectorTypes.cpp @@ -1702,10 +1702,8 @@ void DAGTypeLegalizer::SplitVecRes_LOOP_DEPENDENCE_MASK(SDNode *N, SDValue &Lo, Lo = DAG.getNode(N->getOpcode(), DL, LoVT, PtrA, PtrB, N->getOperand(2)); unsigned EltSize = N->getConstantOperandVal(2); - unsigned Offset = EltSize * HiVT.getVectorMinNumElements(); - SDValue Addend = HiVT.isScalableVT() - ? 
DAG.getVScale(DL, MVT::i64, APInt(64, Offset)) - : DAG.getConstant(Offset, DL, MVT::i64); + ElementCount Offset = HiVT.getVectorElementCount() * EltSize; + SDValue Addend = DAG.getElementCount(DL, MVT::i64, Offset); PtrA = DAG.getNode(ISD::ADD, DL, MVT::i64, PtrA, Addend); Hi = DAG.getNode(N->getOpcode(), DL, HiVT, PtrA, PtrB, N->getOperand(2)); diff --git a/llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp b/llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp index 42786db653fa5..4d9dd842ce4c7 100644 --- a/llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp +++ b/llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp @@ -2098,32 +2098,43 @@ SDValue SelectionDAG::getCondCode(ISD::CondCode Cond) { return SDValue(CondCodeNodes[Cond], 0); } -SDValue SelectionDAG::getVScale(const SDLoc &DL, EVT VT, APInt MulImm, - bool ConstantFold) { +SDValue SelectionDAG::getVScale(const SDLoc &DL, EVT VT, APInt MulImm) { assert(MulImm.getBitWidth() == VT.getSizeInBits() && "APInt size does not match type size!"); if (MulImm == 0) return getConstant(0, DL, VT); - if (ConstantFold) { - const MachineFunction &MF = getMachineFunction(); - const Function &F = MF.getFunction(); - ConstantRange CR = getVScaleRange(&F, 64); - if (const APInt *C = CR.getSingleElement()) - return getConstant(MulImm * C->getZExtValue(), DL, VT); - } + const MachineFunction &MF = getMachineFunction(); + const Function &F = MF.getFunction(); + ConstantRange CR = getVScaleRange(&F, 64); + if (const APInt *C = CR.getSingleElement()) + return getConstant(MulImm * C->getZExtValue(), DL, VT); return getNode(ISD::VSCALE, DL, VT, getConstant(MulImm, DL, VT)); } -SDValue SelectionDAG::getElementCount(const SDLoc &DL, EVT VT, ElementCount EC, - bool ConstantFold) { - if (EC.isScalable()) - return getVScale(DL, VT, - APInt(VT.getSizeInBits(), EC.getKnownMinValue())); +/// \returns a value of type \p VT that represents the runtime value of \p +/// Quantity, i.e. scaled by vscale if it's scalable, or a fixed constant +/// otherwise. Quantity should be a FixedOrScalableQuantity, i.e. ElementCount +/// or TypeSize. 
+template +static SDValue getFixedOrScalableQuantity(SelectionDAG &DAG, const SDLoc &DL, + EVT VT, Ty Quantity) { + if (Quantity.isScalable()) + return DAG.getVScale( + DL, VT, APInt(VT.getSizeInBits(), Quantity.getKnownMinValue())); + + return DAG.getConstant(Quantity.getKnownMinValue(), DL, VT); +} + +SDValue SelectionDAG::getElementCount(const SDLoc &DL, EVT VT, + ElementCount EC) { + return getFixedOrScalableQuantity(*this, DL, VT, EC); +} - return getConstant(EC.getKnownMinValue(), DL, VT); +SDValue SelectionDAG::getTypeSize(const SDLoc &DL, EVT VT, TypeSize TS) { + return getFixedOrScalableQuantity(*this, DL, VT, TS); } SDValue SelectionDAG::getStepVector(const SDLoc &DL, EVT ResVT) { @@ -8500,16 +8511,7 @@ static SDValue getMemsetStringVal(EVT VT, const SDLoc &dl, SelectionDAG &DAG, SDValue SelectionDAG::getMemBasePlusOffset(SDValue Base, TypeSize Offset, const SDLoc &DL, const SDNodeFlags Flags) { - EVT VT = Base.getValueType(); - SDValue Index; - - if (Offset.isScalable()) - Index = getVScale(DL, Base.getValueType(), - APInt(Base.getValueSizeInBits().getFixedValue(), - Offset.getKnownMinValue())); - else - Index = getConstant(Offset.getFixedValue(), DL, VT); - + SDValue Index = getTypeSize(DL, Base.getValueType(), Offset); return getMemBasePlusOffset(Base, Index, DL, Flags); } @@ -13614,11 +13616,8 @@ std::pair SelectionDAG::SplitEVL(SDValue N, EVT VecVT, EVT VT = N.getValueType(); assert(VecVT.getVectorElementCount().isKnownEven() && "Expecting the mask to be an evenly-sized vector"); - unsigned HalfMinNumElts = VecVT.getVectorMinNumElements() / 2; - SDValue HalfNumElts = - VecVT.isFixedLengthVector() - ? getConstant(HalfMinNumElts, DL, VT) - : getVScale(DL, VT, APInt(VT.getScalarSizeInBits(), HalfMinNumElts)); + SDValue HalfNumElts = getElementCount( + DL, VT, VecVT.getVectorElementCount().divideCoefficientBy(2)); SDValue Lo = getNode(ISD::UMIN, DL, VT, N, HalfNumElts); SDValue Hi = getNode(ISD::USUBSAT, DL, VT, N, HalfNumElts); return std::make_pair(Lo, Hi); diff --git a/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp b/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp index 3b7db2c54bae0..73510db3d140d 100644 --- a/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp +++ b/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp @@ -4584,17 +4584,9 @@ void SelectionDAGBuilder::visitAlloca(const AllocaInst &I) { if (AllocSize.getValueType() != IntPtr) AllocSize = DAG.getZExtOrTrunc(AllocSize, dl, IntPtr); - if (TySize.isScalable()) - AllocSize = DAG.getNode(ISD::MUL, dl, IntPtr, AllocSize, - DAG.getVScale(dl, IntPtr, - APInt(IntPtr.getScalarSizeInBits(), - TySize.getKnownMinValue()))); - else { - SDValue TySizeValue = - DAG.getConstant(TySize.getFixedValue(), dl, MVT::getIntegerVT(64)); - AllocSize = DAG.getNode(ISD::MUL, dl, IntPtr, AllocSize, - DAG.getZExtOrTrunc(TySizeValue, dl, IntPtr)); - } + AllocSize = DAG.getNode( + ISD::MUL, dl, IntPtr, AllocSize, + DAG.getZExtOrTrunc(DAG.getTypeSize(dl, MVT::i64, TySize), dl, IntPtr)); // Handle alignment. If the requested alignment is less than or equal to // the stack alignment, ignore it. 
If the size is greater than or equal to diff --git a/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp b/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp index 521d8f07434e6..783ec4b0bd211 100644 --- a/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp +++ b/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp @@ -10628,12 +10628,8 @@ TargetLowering::IncrementMemoryAddress(SDValue Addr, SDValue Mask, AddrVT); Increment = DAG.getZExtOrTrunc(Increment, DL, AddrVT); Increment = DAG.getNode(ISD::MUL, DL, AddrVT, Increment, Scale); - } else if (DataVT.isScalableVector()) { - Increment = DAG.getVScale(DL, AddrVT, - APInt(AddrVT.getFixedSizeInBits(), - DataVT.getStoreSize().getKnownMinValue())); } else - Increment = DAG.getConstant(DataVT.getStoreSize(), DL, AddrVT); + Increment = DAG.getTypeSize(DL, AddrVT, DataVT.getStoreSize()); return DAG.getNode(ISD::ADD, DL, AddrVT, Addr, Increment); } @@ -11926,10 +11922,8 @@ SDValue TargetLowering::expandVectorSplice(SDNode *Node, // Store the lo part of CONCAT_VECTORS(V1, V2) SDValue StoreV1 = DAG.getStore(DAG.getEntryNode(), DL, V1, StackPtr, PtrInfo); // Store the hi part of CONCAT_VECTORS(V1, V2) - SDValue OffsetToV2 = DAG.getVScale( - DL, PtrVT, - APInt(PtrVT.getFixedSizeInBits(), VT.getStoreSize().getKnownMinValue())); - SDValue StackPtr2 = DAG.getNode(ISD::ADD, DL, PtrVT, StackPtr, OffsetToV2); + SDValue VTBytes = DAG.getTypeSize(DL, PtrVT, VT.getStoreSize()); + SDValue StackPtr2 = DAG.getNode(ISD::ADD, DL, PtrVT, StackPtr, VTBytes); SDValue StoreV2 = DAG.getStore(StoreV1, DL, V2, StackPtr2, PtrInfo); if (Imm >= 0) { @@ -11948,13 +11942,8 @@ SDValue TargetLowering::expandVectorSplice(SDNode *Node, SDValue TrailingBytes = DAG.getConstant(TrailingElts * EltByteSize, DL, PtrVT); - if (TrailingElts > VT.getVectorMinNumElements()) { - SDValue VLBytes = - DAG.getVScale(DL, PtrVT, - APInt(PtrVT.getFixedSizeInBits(), - VT.getStoreSize().getKnownMinValue())); - TrailingBytes = DAG.getNode(ISD::UMIN, DL, PtrVT, TrailingBytes, VLBytes); - } + if (TrailingElts > VT.getVectorMinNumElements()) + TrailingBytes = DAG.getNode(ISD::UMIN, DL, PtrVT, TrailingBytes, VTBytes); // Calculate the start address of the spliced result. StackPtr2 = DAG.getNode(ISD::SUB, DL, PtrVT, StackPtr2, TrailingBytes); diff --git a/llvm/lib/Frontend/OpenMP/OMPIRBuilder.cpp b/llvm/lib/Frontend/OpenMP/OMPIRBuilder.cpp index 29900115d7a25..cfaf0935e240b 100644 --- a/llvm/lib/Frontend/OpenMP/OMPIRBuilder.cpp +++ b/llvm/lib/Frontend/OpenMP/OMPIRBuilder.cpp @@ -809,6 +809,47 @@ FunctionCallee OpenMPIRBuilder::unsignedGetOrCreateAtomicCASRuntimeFunction( return {FnTy, C}; } +Expected +OpenMPIRBuilder::FinalizationInfo::getFiniBB(IRBuilderBase &Builder) { + if (!FiniBB) { + Function *ParentFunc = Builder.GetInsertBlock()->getParent(); + IRBuilderBase::InsertPointGuard Guard(Builder); + FiniBB = BasicBlock::Create(Builder.getContext(), ".fini", ParentFunc); + Builder.SetInsertPoint(FiniBB); + // FiniCB adds the branch to the exit stub. + if (Error Err = FiniCB(Builder.saveIP())) + return Err; + } + return FiniBB; +} + +Error OpenMPIRBuilder::FinalizationInfo::mergeFiniBB(IRBuilderBase &Builder, + BasicBlock *OtherFiniBB) { + // Simple case: FiniBB does not exist yet: re-use OtherFiniBB. + if (!FiniBB) { + FiniBB = OtherFiniBB; + + Builder.SetInsertPoint(FiniBB->getFirstNonPHIIt()); + if (Error Err = FiniCB(Builder.saveIP())) + return Err; + + return Error::success(); + } + + // Move instructions from FiniBB to the start of OtherFiniBB. 
+ auto EndIt = FiniBB->end(); + if (FiniBB->size() >= 1) + if (auto Prev = std::prev(EndIt); Prev->isTerminator()) + EndIt = Prev; + OtherFiniBB->splice(OtherFiniBB->getFirstNonPHIIt(), FiniBB, FiniBB->begin(), + EndIt); + + FiniBB->replaceAllUsesWith(OtherFiniBB); + FiniBB->eraseFromParent(); + FiniBB = OtherFiniBB; + return Error::success(); +} + Function *OpenMPIRBuilder::getOrCreateRuntimeFunctionPtr(RuntimeFunction FnID) { FunctionCallee RTLFn = getOrCreateRuntimeFunction(M, FnID); auto *Fn = dyn_cast(RTLFn.getCallee()); @@ -1231,8 +1272,20 @@ OpenMPIRBuilder::createCancel(const LocationDescription &Loc, auto *UI = Builder.CreateUnreachable(); Instruction *ThenTI = UI, *ElseTI = nullptr; - if (IfCondition) + if (IfCondition) { SplitBlockAndInsertIfThenElse(IfCondition, UI, &ThenTI, &ElseTI); + + // Even if the if condition evaluates to false, this should count as a + // cancellation point + Builder.SetInsertPoint(ElseTI); + auto ElseIP = Builder.saveIP(); + + InsertPointOrErrorTy IPOrErr = createCancellationPoint( + LocationDescription{ElseIP, Loc.DL}, CanceledDirective); + if (!IPOrErr) + return IPOrErr; + } + Builder.SetInsertPoint(ThenTI); Value *CancelKind = nullptr; @@ -1252,21 +1305,9 @@ OpenMPIRBuilder::createCancel(const LocationDescription &Loc, Value *Args[] = {Ident, getOrCreateThreadID(Ident), CancelKind}; Value *Result = createRuntimeFunctionCall( getOrCreateRuntimeFunctionPtr(OMPRTL___kmpc_cancel), Args); - auto ExitCB = [this, CanceledDirective, Loc](InsertPointTy IP) -> Error { - if (CanceledDirective == OMPD_parallel) { - IRBuilder<>::InsertPointGuard IPG(Builder); - Builder.restoreIP(IP); - return createBarrier(LocationDescription(Builder.saveIP(), Loc.DL), - omp::Directive::OMPD_unknown, - /* ForceSimpleCall */ false, - /* CheckCancelFlag */ false) - .takeError(); - } - return Error::success(); - }; // The actual cancel logic is shared with others, e.g., cancel_barriers. - if (Error Err = emitCancelationCheckImpl(Result, CanceledDirective, ExitCB)) + if (Error Err = emitCancelationCheckImpl(Result, CanceledDirective)) return Err; // Update the insertion point and remove the terminator we introduced. @@ -1303,21 +1344,9 @@ OpenMPIRBuilder::createCancellationPoint(const LocationDescription &Loc, Value *Args[] = {Ident, getOrCreateThreadID(Ident), CancelKind}; Value *Result = createRuntimeFunctionCall( getOrCreateRuntimeFunctionPtr(OMPRTL___kmpc_cancellationpoint), Args); - auto ExitCB = [this, CanceledDirective, Loc](InsertPointTy IP) -> Error { - if (CanceledDirective == OMPD_parallel) { - IRBuilder<>::InsertPointGuard IPG(Builder); - Builder.restoreIP(IP); - return createBarrier(LocationDescription(Builder.saveIP(), Loc.DL), - omp::Directive::OMPD_unknown, - /* ForceSimpleCall */ false, - /* CheckCancelFlag */ false) - .takeError(); - } - return Error::success(); - }; // The actual cancel logic is shared with others, e.g., cancel_barriers. - if (Error Err = emitCancelationCheckImpl(Result, CanceledDirective, ExitCB)) + if (Error Err = emitCancelationCheckImpl(Result, CanceledDirective)) return Err; // Update the insertion point and remove the terminator we introduced. 
@@ -1420,8 +1449,7 @@ OpenMPIRBuilder::InsertPointOrErrorTy OpenMPIRBuilder::emitKernelLaunch( } Error OpenMPIRBuilder::emitCancelationCheckImpl( - Value *CancelFlag, omp::Directive CanceledDirective, - FinalizeCallbackTy ExitCB) { + Value *CancelFlag, omp::Directive CanceledDirective) { assert(isLastFinalizationInfoCancellable(CanceledDirective) && "Unexpected cancellation!"); @@ -1448,13 +1476,12 @@ Error OpenMPIRBuilder::emitCancelationCheckImpl( // From the cancellation block we finalize all variables and go to the // post finalization block that is known to the FiniCB callback. - Builder.SetInsertPoint(CancellationBlock); - if (ExitCB) - if (Error Err = ExitCB(Builder.saveIP())) - return Err; auto &FI = FinalizationStack.back(); - if (Error Err = FI.FiniCB(Builder.saveIP())) - return Err; + Expected FiniBBOrErr = FI.getFiniBB(Builder); + if (!FiniBBOrErr) + return FiniBBOrErr.takeError(); + Builder.SetInsertPoint(CancellationBlock); + Builder.CreateBr(*FiniBBOrErr); // The continuation block is where code generation continues. Builder.SetInsertPoint(NonCancellationBlock, NonCancellationBlock->begin()); @@ -2053,8 +2080,18 @@ OpenMPIRBuilder::InsertPointOrErrorTy OpenMPIRBuilder::createParallel( Instruction *PRegPreFiniTI = PRegPreFiniBB->getTerminator(); InsertPointTy PreFiniIP(PRegPreFiniBB, PRegPreFiniTI->getIterator()); - if (Error Err = FiniCB(PreFiniIP)) - return Err; + Expected FiniBBOrErr = FiniInfo.getFiniBB(Builder); + if (!FiniBBOrErr) + return FiniBBOrErr.takeError(); + { + IRBuilderBase::InsertPointGuard Guard(Builder); + Builder.restoreIP(PreFiniIP); + Builder.CreateBr(*FiniBBOrErr); + // There's currently a branch to omp.par.exit. Delete it. We will get there + // via the fini block + if (Instruction *Term = Builder.GetInsertBlock()->getTerminator()) + Term->eraseFromParent(); + } // Register the outlined info. addOutlineInfo(std::move(OI)); @@ -2493,23 +2530,7 @@ OpenMPIRBuilder::InsertPointOrErrorTy OpenMPIRBuilder::createSections( if (!updateToLocation(Loc)) return Loc.IP; - // FiniCBWrapper needs to create a branch to the loop finalization block, but - // this has not been created yet at some times when this callback runs. - SmallVector CancellationBranches; - auto FiniCBWrapper = [&](InsertPointTy IP) { - if (IP.getBlock()->end() != IP.getPoint()) - return FiniCB(IP); - // This must be done otherwise any nested constructs using FinalizeOMPRegion - // will fail because that function requires the Finalization Basic Block to - // have a terminator, which is already removed by EmitOMPRegionBody. - // IP is currently at cancelation block. 
- BranchInst *DummyBranch = Builder.CreateBr(IP.getBlock()); - IP = InsertPointTy(DummyBranch->getParent(), DummyBranch->getIterator()); - CancellationBranches.push_back(DummyBranch); - return FiniCB(IP); - }; - - FinalizationStack.push_back({FiniCBWrapper, OMPD_sections, IsCancellable}); + FinalizationStack.push_back({FiniCB, OMPD_sections, IsCancellable}); // Each section is emitted as a switch case // Each finalization callback is handled from clang.EmitOMPSectionDirective() @@ -2576,20 +2597,8 @@ OpenMPIRBuilder::InsertPointOrErrorTy OpenMPIRBuilder::createSections( auto FiniInfo = FinalizationStack.pop_back_val(); assert(FiniInfo.DK == OMPD_sections && "Unexpected finalization stack state!"); - if (FinalizeCallbackTy &CB = FiniInfo.FiniCB) { - Builder.restoreIP(AfterIP); - BasicBlock *FiniBB = - splitBBWithSuffix(Builder, /*CreateBranch=*/true, "sections.fini"); - if (Error Err = CB(Builder.saveIP())) - return Err; - AfterIP = {FiniBB, FiniBB->begin()}; - } - - // Now we can fix the dummy branch to point to the right place - for (BranchInst *DummyBranch : CancellationBranches) { - assert(DummyBranch->getNumSuccessors() == 1); - DummyBranch->setSuccessor(0, LoopFini); - } + if (Error Err = FiniInfo.mergeFiniBB(Builder, LoopFini)) + return Err; return AfterIP; } @@ -6957,9 +6966,6 @@ OpenMPIRBuilder::InsertPointOrErrorTy OpenMPIRBuilder::EmitOMPInlinedRegion( emitCommonDirectiveExit(OMPD, FinIP, ExitCall, HasFinalize); if (!AfterIP) return AfterIP.takeError(); - assert(FiniBB->getUniquePredecessor()->getUniqueSuccessor() == FiniBB && - "Unexpected Control Flow State!"); - MergeBlockIntoPredecessor(FiniBB); // If we are skipping the region of a non conditional, remove the exit // block, and clear the builder's insertion point. @@ -7019,14 +7025,12 @@ OpenMPIRBuilder::InsertPointOrErrorTy OpenMPIRBuilder::emitCommonDirectiveExit( FinalizationInfo Fi = FinalizationStack.pop_back_val(); assert(Fi.DK == OMPD && "Unexpected Directive for Finalization call!"); - if (Error Err = Fi.FiniCB(FinIP)) - return Err; - - BasicBlock *FiniBB = FinIP.getBlock(); - Instruction *FiniBBTI = FiniBB->getTerminator(); + if (Error Err = Fi.mergeFiniBB(Builder, FinIP.getBlock())) + return std::move(Err); - // set Builder IP for call creation - Builder.SetInsertPoint(FiniBBTI); + // Exit condition: insertion point is before the terminator of the new Fini + // block + Builder.SetInsertPoint(FinIP.getBlock()->getTerminator()); } if (!ExitCall) diff --git a/llvm/lib/IR/ConstantRange.cpp b/llvm/lib/IR/ConstantRange.cpp index b454c9a4cd3ae..9beaee60d0bc1 100644 --- a/llvm/lib/IR/ConstantRange.cpp +++ b/llvm/lib/IR/ConstantRange.cpp @@ -841,6 +841,8 @@ ConstantRange ConstantRange::zeroExtend(uint32_t DstTySize) const { if (isEmptySet()) return getEmpty(DstTySize); unsigned SrcTySize = getBitWidth(); + if (DstTySize == SrcTySize) + return *this; assert(SrcTySize < DstTySize && "Not a value extension"); if (isFullSet() || isUpperWrapped()) { // Change into [0, 1 << src bit width) @@ -858,6 +860,8 @@ ConstantRange ConstantRange::signExtend(uint32_t DstTySize) const { if (isEmptySet()) return getEmpty(DstTySize); unsigned SrcTySize = getBitWidth(); + if (DstTySize == SrcTySize) + return *this; assert(SrcTySize < DstTySize && "Not a value extension"); // special case: [X, INT_MIN) -- not really wrapping around @@ -874,6 +878,8 @@ ConstantRange ConstantRange::signExtend(uint32_t DstTySize) const { ConstantRange ConstantRange::truncate(uint32_t DstTySize, unsigned NoWrapKind) const { + if (DstTySize == getBitWidth()) + return 
*this; assert(getBitWidth() > DstTySize && "Not a value truncation"); if (isEmptySet()) return getEmpty(DstTySize); diff --git a/llvm/lib/Target/AArch64/AArch64ExpandPseudoInsts.cpp b/llvm/lib/Target/AArch64/AArch64ExpandPseudoInsts.cpp index 34d74d04c4419..60e6a82d41cc8 100644 --- a/llvm/lib/Target/AArch64/AArch64ExpandPseudoInsts.cpp +++ b/llvm/lib/Target/AArch64/AArch64ExpandPseudoInsts.cpp @@ -1717,6 +1717,7 @@ bool AArch64ExpandPseudo::expandMI(MachineBasicBlock &MBB, } case AArch64::InOutZAUsePseudo: case AArch64::RequiresZASavePseudo: + case AArch64::RequiresZT0SavePseudo: case AArch64::SMEStateAllocPseudo: case AArch64::COALESCER_BARRIER_FPR16: case AArch64::COALESCER_BARRIER_FPR32: diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp index 6072fd9d8f242..b4f47d249885d 100644 --- a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp +++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp @@ -8647,7 +8647,7 @@ SDValue AArch64TargetLowering::LowerFormalArguments( Subtarget->isWindowsArm64EC()) && "Indirect arguments should be scalable on most subtargets"); - uint64_t PartSize = VA.getValVT().getStoreSize().getKnownMinValue(); + TypeSize PartSize = VA.getValVT().getStoreSize(); unsigned NumParts = 1; if (Ins[i].Flags.isInConsecutiveRegs()) { while (!Ins[i + NumParts - 1].Flags.isInConsecutiveRegsLast()) @@ -8664,16 +8664,8 @@ SDValue AArch64TargetLowering::LowerFormalArguments( InVals.push_back(ArgValue); NumParts--; if (NumParts > 0) { - SDValue BytesIncrement; - if (PartLoad.isScalableVector()) { - BytesIncrement = DAG.getVScale( - DL, Ptr.getValueType(), - APInt(Ptr.getValueSizeInBits().getFixedValue(), PartSize)); - } else { - BytesIncrement = DAG.getConstant( - APInt(Ptr.getValueSizeInBits().getFixedValue(), PartSize), DL, - Ptr.getValueType()); - } + SDValue BytesIncrement = + DAG.getTypeSize(DL, Ptr.getValueType(), PartSize); Ptr = DAG.getNode(ISD::ADD, DL, Ptr.getValueType(), Ptr, BytesIncrement, SDNodeFlags::NoUnsignedWrap); ExtraArgLocs++; @@ -9642,6 +9634,8 @@ AArch64TargetLowering::LowerCall(CallLoweringInfo &CLI, if (CallAttrs.requiresLazySave() || CallAttrs.requiresPreservingAllZAState()) ZAMarkerNode = AArch64ISD::REQUIRES_ZA_SAVE; + else if (CallAttrs.requiresPreservingZT0()) + ZAMarkerNode = AArch64ISD::REQUIRES_ZT0_SAVE; else if (CallAttrs.caller().hasZAState() || CallAttrs.caller().hasZT0State()) ZAMarkerNode = AArch64ISD::INOUT_ZA_USE; @@ -9761,7 +9755,8 @@ AArch64TargetLowering::LowerCall(CallLoweringInfo &CLI, SDValue ZTFrameIdx; MachineFrameInfo &MFI = MF.getFrameInfo(); - bool ShouldPreserveZT0 = CallAttrs.requiresPreservingZT0(); + bool ShouldPreserveZT0 = + !UseNewSMEABILowering && CallAttrs.requiresPreservingZT0(); // If the caller has ZT0 state which will not be preserved by the callee, // spill ZT0 before the call. @@ -9774,7 +9769,8 @@ AArch64TargetLowering::LowerCall(CallLoweringInfo &CLI, // If caller shares ZT0 but the callee is not shared ZA, we need to stop // PSTATE.ZA before the call if there is no lazy-save active. 
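+  // Note: with the new SME ABI lowering, the PSTATE.ZA switches around calls
+  // are emitted later by MachineSMEABIPass, so they are skipped here.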
- bool DisableZA = CallAttrs.requiresDisablingZABeforeCall(); + bool DisableZA = + !UseNewSMEABILowering && CallAttrs.requiresDisablingZABeforeCall(); assert((!DisableZA || !RequiresLazySave) && "Lazy-save should have PSTATE.SM=1 on entry to the function"); @@ -9876,8 +9872,8 @@ AArch64TargetLowering::LowerCall(CallLoweringInfo &CLI, assert((isScalable || Subtarget->isWindowsArm64EC()) && "Indirect arguments should be scalable on most subtargets"); - uint64_t StoreSize = VA.getValVT().getStoreSize().getKnownMinValue(); - uint64_t PartSize = StoreSize; + TypeSize StoreSize = VA.getValVT().getStoreSize(); + TypeSize PartSize = StoreSize; unsigned NumParts = 1; if (Outs[i].Flags.isInConsecutiveRegs()) { while (!Outs[i + NumParts - 1].Flags.isInConsecutiveRegsLast()) @@ -9888,7 +9884,8 @@ AArch64TargetLowering::LowerCall(CallLoweringInfo &CLI, Type *Ty = EVT(VA.getValVT()).getTypeForEVT(*DAG.getContext()); Align Alignment = DAG.getDataLayout().getPrefTypeAlign(Ty); MachineFrameInfo &MFI = MF.getFrameInfo(); - int FI = MFI.CreateStackObject(StoreSize, Alignment, false); + int FI = + MFI.CreateStackObject(StoreSize.getKnownMinValue(), Alignment, false); if (isScalable) { bool IsPred = VA.getValVT() == MVT::aarch64svcount || VA.getValVT().getVectorElementType() == MVT::i1; @@ -9909,16 +9906,8 @@ AArch64TargetLowering::LowerCall(CallLoweringInfo &CLI, NumParts--; if (NumParts > 0) { - SDValue BytesIncrement; - if (isScalable) { - BytesIncrement = DAG.getVScale( - DL, Ptr.getValueType(), - APInt(Ptr.getValueSizeInBits().getFixedValue(), PartSize)); - } else { - BytesIncrement = DAG.getConstant( - APInt(Ptr.getValueSizeInBits().getFixedValue(), PartSize), DL, - Ptr.getValueType()); - } + SDValue BytesIncrement = + DAG.getTypeSize(DL, Ptr.getValueType(), PartSize); MPI = MachinePointerInfo(MPI.getAddrSpace()); Ptr = DAG.getNode(ISD::ADD, DL, Ptr.getValueType(), Ptr, BytesIncrement, SDNodeFlags::NoUnsignedWrap); @@ -10263,7 +10252,8 @@ AArch64TargetLowering::LowerCall(CallLoweringInfo &CLI, getSMToggleCondition(CallAttrs)); } - if (RequiresLazySave || CallAttrs.requiresEnablingZAAfterCall()) + if (!UseNewSMEABILowering && + (RequiresLazySave || CallAttrs.requiresEnablingZAAfterCall())) // Unconditionally resume ZA. 
 Result = DAG.getNode(
         AArch64ISD::SMSTART, DL, DAG.getVTList(MVT::Other, MVT::Glue), Result,
diff --git a/llvm/lib/Target/AArch64/AArch64SMEInstrInfo.td b/llvm/lib/Target/AArch64/AArch64SMEInstrInfo.td
index 737169253ddb3..b099f15ecf7e3 100644
--- a/llvm/lib/Target/AArch64/AArch64SMEInstrInfo.td
+++ b/llvm/lib/Target/AArch64/AArch64SMEInstrInfo.td
@@ -102,6 +102,7 @@ def : Pat<(i64 (AArch64AllocateSMESaveBuffer GPR64:$size)),
 let hasSideEffects = 1, isMeta = 1 in {
   def InOutZAUsePseudo : Pseudo<(outs), (ins), []>, Sched<[]>;
   def RequiresZASavePseudo : Pseudo<(outs), (ins), []>, Sched<[]>;
+  def RequiresZT0SavePseudo : Pseudo<(outs), (ins), []>, Sched<[]>;
 }
 def SMEStateAllocPseudo : Pseudo<(outs), (ins), []>, Sched<[]>;
@@ -122,6 +123,11 @@ def AArch64_requires_za_save
                      [SDNPHasChain, SDNPInGlue, SDNPOutGlue]>;
 def : Pat<(AArch64_requires_za_save), (RequiresZASavePseudo)>;
+def AArch64_requires_zt0_save
+    : SDNode<"AArch64ISD::REQUIRES_ZT0_SAVE", SDTypeProfile<0, 0, []>,
+             [SDNPHasChain, SDNPInGlue, SDNPOutGlue]>;
+def : Pat<(AArch64_requires_zt0_save), (RequiresZT0SavePseudo)>;
+
 def AArch64_sme_state_alloc
     : SDNode<"AArch64ISD::SME_STATE_ALLOC", SDTypeProfile<0, 0,[]>,
              [SDNPHasChain]>;
diff --git a/llvm/lib/Target/AArch64/MachineSMEABIPass.cpp b/llvm/lib/Target/AArch64/MachineSMEABIPass.cpp
index ead1dfceb96a0..b96f6f12a58d6 100644
--- a/llvm/lib/Target/AArch64/MachineSMEABIPass.cpp
+++ b/llvm/lib/Target/AArch64/MachineSMEABIPass.cpp
@@ -72,20 +72,34 @@ using namespace llvm;
 
 namespace {
 
-enum ZAState {
+// Note: For agnostic ZA, we assume the function is always entered/exited in
+// the "ACTIVE" state -- this _may_ not be the case (since OFF is also a
+// possibility), but for the purpose of placing ZA saves/restores, that does
+// not matter.
+enum ZAState : uint8_t {
   // Any/unknown state (not valid)
   ANY = 0,
 
   // ZA is in use and active (i.e. within the accumulator)
   ACTIVE,
 
+  // ZA is active, but ZT0 has been saved.
+  // This handles the edge case of sharedZA && !sharesZT0.
+  ACTIVE_ZT0_SAVED,
+
+  // A ZA save has been set up or committed (i.e. ZA is dormant or off).
+  // If the function uses ZT0 it must also be saved.
   LOCAL_SAVED,
 
+  // ZA has been committed to the lazy save buffer of the current function.
+  // If the function uses ZT0 it must also be saved.
+  // ZA is off.
+  LOCAL_COMMITTED,
+
+  // The ZA/ZT0 state on entry to the function.
   ENTRY,
 
-  // ZA is off
+  // ZA is off.
   OFF,
 
   // The number of ZA states (not a valid state)
@@ -164,6 +178,14 @@ class EmitContext {
     return AgnosticZABufferPtr;
   }
 
+  int getZT0SaveSlot(MachineFunction &MF) {
+    if (ZT0SaveFI)
+      return *ZT0SaveFI;
+    MachineFrameInfo &MFI = MF.getFrameInfo();
+    ZT0SaveFI = MFI.CreateSpillStackObject(64, Align(16));
+    return *ZT0SaveFI;
+  }
+
   /// Returns true if the function must allocate a ZA save buffer on entry. This
   /// will be the case if, at any point in the function, a ZA save was emitted.
   bool needsSaveBuffer() const {
@@ -173,6 +195,7 @@
 private:
+  std::optional<int> ZT0SaveFI;
   std::optional<int> TPIDR2BlockFI;
   Register AgnosticZABufferPtr = AArch64::NoRegister;
 };
@@ -184,8 +207,10 @@ class EmitContext {
 /// state would not be legal, as transitioning to it drops the content of ZA.
 static bool isLegalEdgeBundleZAState(ZAState State) {
   switch (State) {
-  case ZAState::ACTIVE:      // ZA state within the accumulator/ZT0.
-  case ZAState::LOCAL_SAVED: // ZA state is saved on the stack.
+  case ZAState::ACTIVE:              // ZA state within the accumulator/ZT0.
+  case ZAState::ACTIVE_ZT0_SAVED:    // ZT0 is saved (ZA is active).
+  case ZAState::LOCAL_SAVED:         // ZA state may be saved on the stack.
+  case ZAState::LOCAL_COMMITTED:     // ZA state is saved on the stack.
     return true;
   default:
     return false;
@@ -199,7 +224,9 @@ StringRef getZAStateString(ZAState State) {
   switch (State) {
     MAKE_CASE(ZAState::ANY)
     MAKE_CASE(ZAState::ACTIVE)
+    MAKE_CASE(ZAState::ACTIVE_ZT0_SAVED)
     MAKE_CASE(ZAState::LOCAL_SAVED)
+    MAKE_CASE(ZAState::LOCAL_COMMITTED)
     MAKE_CASE(ZAState::ENTRY)
     MAKE_CASE(ZAState::OFF)
   default:
@@ -221,18 +248,39 @@ static bool isZAorZTRegOp(const TargetRegisterInfo &TRI,
 /// Returns the required ZA state needed before \p MI and an iterator pointing
 /// to where any code required to change the ZA state should be inserted.
 static std::pair<ZAState, MachineBasicBlock::iterator>
-getZAStateBeforeInst(const TargetRegisterInfo &TRI, MachineInstr &MI,
-                     bool ZAOffAtReturn) {
+getInstNeededZAState(const TargetRegisterInfo &TRI, MachineInstr &MI,
+                     SMEAttrs SMEFnAttrs) {
   MachineBasicBlock::iterator InsertPt(MI);
+  // Note: InOutZAUsePseudo, RequiresZASavePseudo, and RequiresZT0SavePseudo are
+  // intended to mark the position immediately before a call. Due to
+  // SelectionDAG constraints, these markers occur after the ADJCALLSTACKDOWN,
+  // so we use std::prev(InsertPt) to get the position before the call.
+
   if (MI.getOpcode() == AArch64::InOutZAUsePseudo)
     return {ZAState::ACTIVE, std::prev(InsertPt)};
+  // Note: If we need to save both ZA and ZT0 we use RequiresZASavePseudo.
   if (MI.getOpcode() == AArch64::RequiresZASavePseudo)
     return {ZAState::LOCAL_SAVED, std::prev(InsertPt)};
-  if (MI.isReturn())
+  // If we only need to save ZT0, there are two cases to consider:
+  // 1. The function has ZA state (that we don't need to save).
+  //    - In this case we switch to the "ACTIVE_ZT0_SAVED" state.
+  //      This only saves ZT0.
+  // 2. The function does not have ZA state.
+  //    - In this case we switch to the "LOCAL_COMMITTED" state.
+  //      This saves ZT0 and turns ZA off.
+  if (MI.getOpcode() == AArch64::RequiresZT0SavePseudo) {
+    return {SMEFnAttrs.hasZAState() ? ZAState::ACTIVE_ZT0_SAVED
+                                    : ZAState::LOCAL_COMMITTED,
+            std::prev(InsertPt)};
+  }
+
+  if (MI.isReturn()) {
+    bool ZAOffAtReturn = SMEFnAttrs.hasPrivateZAInterface();
     return {ZAOffAtReturn ? ZAState::OFF : ZAState::ACTIVE, InsertPt};
+  }
 
   for (auto &MO : MI.operands()) {
     if (isZAorZTRegOp(TRI, MO))
   void emitSetupFullZASave(MachineBasicBlock &MBB,
@@ -409,7 +460,7 @@ FunctionInfo MachineSMEABI::collectNeededZAStates(SMEAttrs SMEFnAttrs) {
       Block.FixedEntryState = ZAState::ENTRY;
     } else if (MBB.isEHPad()) {
       // EH entry block:
-      Block.FixedEntryState = ZAState::LOCAL_SAVED;
+      Block.FixedEntryState = ZAState::LOCAL_COMMITTED;
     }
 
     LiveRegUnits LiveUnits(*TRI);
@@ -431,8 +482,7 @@ FunctionInfo MachineSMEABI::collectNeededZAStates(SMEAttrs SMEFnAttrs) {
         PhysLiveRegsAfterSMEPrologue = PhysLiveRegs;
       }
       // Note: We treat Agnostic ZA as inout_za with an alternate save/restore.
-      auto [NeededState, InsertPt] = getZAStateBeforeInst(
-          *TRI, MI, /*ZAOffAtReturn=*/SMEFnAttrs.hasPrivateZAInterface());
+      auto [NeededState, InsertPt] = getInstNeededZAState(*TRI, MI, SMEFnAttrs);
       assert((InsertPt == MBBI || isCallStartOpcode(InsertPt->getOpcode())) &&
              "Unexpected state change insertion point!");
       // TODO: Do something to avoid state changes where NZCV is live.
@@ -752,9 +802,9 @@ void MachineSMEABI::emitRestoreLazySave(EmitContext &Context,
   restorePhyRegSave(RegSave, MBB, MBBI, DL);
 }
 
-void MachineSMEABI::emitZAOff(MachineBasicBlock &MBB,
-                              MachineBasicBlock::iterator MBBI,
-                              bool ClearTPIDR2) {
+void MachineSMEABI::emitZAMode(MachineBasicBlock &MBB,
+                               MachineBasicBlock::iterator MBBI,
+                               bool ClearTPIDR2, bool On) {
   DebugLoc DL = getDebugLoc(MBB, MBBI);
 
   if (ClearTPIDR2)
@@ -765,7 +815,7 @@ void MachineSMEABI::emitZAOff(MachineBasicBlock &MBB,
-  // Disable ZA.
+  // Enable or disable ZA.
   BuildMI(MBB, MBBI, DL, TII->get(AArch64::MSRpstatesvcrImm1))
       .addImm(AArch64SVCR::SVCRZA)
-      .addImm(0);
+      .addImm(On ? 1 : 0);
 }
 
 void MachineSMEABI::emitAllocateLazySaveBuffer(
@@ -891,6 +941,28 @@ void MachineSMEABI::emitFullZASaveRestore(EmitContext &Context,
   restorePhyRegSave(RegSave, MBB, MBBI, DL);
 }
 
+void MachineSMEABI::emitZT0SaveRestore(EmitContext &Context,
+                                       MachineBasicBlock &MBB,
+                                       MachineBasicBlock::iterator MBBI,
+                                       bool IsSave) {
+  DebugLoc DL = getDebugLoc(MBB, MBBI);
+  Register ZT0Save = MRI->createVirtualRegister(&AArch64::GPR64spRegClass);
+
+  BuildMI(MBB, MBBI, DL, TII->get(AArch64::ADDXri), ZT0Save)
+      .addFrameIndex(Context.getZT0SaveSlot(*MF))
+      .addImm(0)
+      .addImm(0);
+
+  if (IsSave) {
+    BuildMI(MBB, MBBI, DL, TII->get(AArch64::STR_TX))
+        .addReg(AArch64::ZT0)
+        .addReg(ZT0Save);
+  } else {
+    BuildMI(MBB, MBBI, DL, TII->get(AArch64::LDR_TX), AArch64::ZT0)
+        .addReg(ZT0Save);
+  }
+}
+
 void MachineSMEABI::emitAllocateFullZASaveBuffer(
     EmitContext &Context, MachineBasicBlock &MBB,
     MachineBasicBlock::iterator MBBI, LiveRegs PhysLiveRegs) {
@@ -935,6 +1007,17 @@ void MachineSMEABI::emitAllocateFullZASaveBuffer(
   restorePhyRegSave(RegSave, MBB, MBBI, DL);
 }
 
+struct FromState {
+  ZAState From;
+
+  constexpr uint8_t to(ZAState To) const {
+    static_assert(NUM_ZA_STATE < 16, "expected ZAState to fit in 4 bits");
+    return uint8_t(From) << 4 | uint8_t(To);
+  }
+};
+
+constexpr FromState transitionFrom(ZAState From) { return FromState{From}; }
+
 void MachineSMEABI::emitStateChange(EmitContext &Context,
                                     MachineBasicBlock &MBB,
                                     MachineBasicBlock::iterator InsertPt,
@@ -949,8 +1032,6 @@ void MachineSMEABI::emitStateChange(EmitContext &Context,
   if (From == ZAState::ENTRY && To == ZAState::OFF)
     return;
 
-  [[maybe_unused]] SMEAttrs SMEFnAttrs = AFI->getSMEFnAttrs();
-
   // TODO: Avoid setting up the save buffer if there's no transition to
   // LOCAL_SAVED.
   if (From == ZAState::ENTRY) {
@@ -966,17 +1047,67 @@ void MachineSMEABI::emitStateChange(EmitContext &Context,
     From = ZAState::ACTIVE;
   }
 
-  if (From == ZAState::ACTIVE && To == ZAState::LOCAL_SAVED)
-    emitZASave(Context, MBB, InsertPt, PhysLiveRegs);
-  else if (From == ZAState::LOCAL_SAVED && To == ZAState::ACTIVE)
-    emitZARestore(Context, MBB, InsertPt, PhysLiveRegs);
-  else if (To == ZAState::OFF) {
-    assert(From != ZAState::ENTRY &&
-           "ENTRY to OFF should have already been handled");
-    assert(!SMEFnAttrs.hasAgnosticZAInterface() &&
-           "Should not turn ZA off in agnostic ZA function");
-    emitZAOff(MBB, InsertPt, /*ClearTPIDR2=*/From == ZAState::LOCAL_SAVED);
-  } else {
+  SMEAttrs SMEFnAttrs = AFI->getSMEFnAttrs();
+  bool IsAgnosticZA = SMEFnAttrs.hasAgnosticZAInterface();
+  bool HasZT0State = SMEFnAttrs.hasZT0State();
+  bool HasZAState = IsAgnosticZA || SMEFnAttrs.hasZAState();
+
+  switch (transitionFrom(From).to(To)) {
+  // This section handles: ACTIVE <-> ACTIVE_ZT0_SAVED
+  case transitionFrom(ZAState::ACTIVE).to(ZAState::ACTIVE_ZT0_SAVED):
+    emitZT0SaveRestore(Context, MBB, InsertPt, /*IsSave=*/true);
+    break;
+  case transitionFrom(ZAState::ACTIVE_ZT0_SAVED).to(ZAState::ACTIVE):
+    emitZT0SaveRestore(Context, MBB, InsertPt, /*IsSave=*/false);
+    break;
+
+  // This section handles: ACTIVE[_ZT0_SAVED] -> LOCAL_SAVED
+  case transitionFrom(ZAState::ACTIVE).to(ZAState::LOCAL_SAVED):
+  case transitionFrom(ZAState::ACTIVE_ZT0_SAVED).to(ZAState::LOCAL_SAVED):
+    if (HasZT0State && From == ZAState::ACTIVE)
+      emitZT0SaveRestore(Context, MBB, InsertPt, /*IsSave=*/true);
+    if (HasZAState)
+      emitZASave(Context, MBB, InsertPt, PhysLiveRegs);
+    break;
+
+  // This section handles: ACTIVE -> LOCAL_COMMITTED
+  case transitionFrom(ZAState::ACTIVE).to(ZAState::LOCAL_COMMITTED):
+    // TODO: We could support ZA state here, but this transition is currently
+    // only possible when we _don't_ have ZA state.
+    assert(HasZT0State && !HasZAState && "Expect to only have ZT0 state.");
+    emitZT0SaveRestore(Context, MBB, InsertPt, /*IsSave=*/true);
+    emitZAMode(MBB, InsertPt, /*ClearTPIDR2=*/false, /*On=*/false);
+    break;
+
+  // This section handles: LOCAL_COMMITTED -> (OFF|LOCAL_SAVED)
+  case transitionFrom(ZAState::LOCAL_COMMITTED).to(ZAState::OFF):
+  case transitionFrom(ZAState::LOCAL_COMMITTED).to(ZAState::LOCAL_SAVED):
+    // These transitions are no-ops.
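+    // (LOCAL_COMMITTED already implies ZA is off and, if the function uses
+    // ZT0, that ZT0 has been saved, so both target states already hold.)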
+    break;
+
+  // This section handles: LOCAL_(SAVED|COMMITTED) -> ACTIVE[_ZT0_SAVED]
+  case transitionFrom(ZAState::LOCAL_COMMITTED).to(ZAState::ACTIVE):
+  case transitionFrom(ZAState::LOCAL_COMMITTED).to(ZAState::ACTIVE_ZT0_SAVED):
+  case transitionFrom(ZAState::LOCAL_SAVED).to(ZAState::ACTIVE):
+    if (HasZAState)
+      emitZARestore(Context, MBB, InsertPt, PhysLiveRegs);
+    else
+      emitZAMode(MBB, InsertPt, /*ClearTPIDR2=*/false, /*On=*/true);
+    if (HasZT0State && To == ZAState::ACTIVE)
+      emitZT0SaveRestore(Context, MBB, InsertPt, /*IsSave=*/false);
+    break;
+
+  // This section handles transitions to OFF (not previously covered)
+  case transitionFrom(ZAState::ACTIVE).to(ZAState::OFF):
+  case transitionFrom(ZAState::ACTIVE_ZT0_SAVED).to(ZAState::OFF):
+  case transitionFrom(ZAState::LOCAL_SAVED).to(ZAState::OFF):
+    assert(SMEFnAttrs.hasPrivateZAInterface() &&
+           "Did not expect to turn ZA off in shared/agnostic ZA function");
+    emitZAMode(MBB, InsertPt, /*ClearTPIDR2=*/From == ZAState::LOCAL_SAVED,
+               /*On=*/false);
+    break;
+
+  default:
     dbgs() << "Error: Transition from " << getZAStateString(From) << " to "
            << getZAStateString(To) << '\n';
     llvm_unreachable("Unimplemented state transition");
diff --git a/llvm/lib/Target/RISCV/RISCVISelLowering.cpp b/llvm/lib/Target/RISCV/RISCVISelLowering.cpp
index a6212f5cc84be..afbbc0dbeb7ad 100644
--- a/llvm/lib/Target/RISCV/RISCVISelLowering.cpp
+++ b/llvm/lib/Target/RISCV/RISCVISelLowering.cpp
@@ -12783,10 +12783,7 @@ SDValue RISCVTargetLowering::lowerVECTOR_INTERLEAVE(SDValue Op,
   SmallVector<SDValue> Loads(Factor);
 
-  SDValue Increment =
-      DAG.getVScale(DL, PtrVT,
-                    APInt(PtrVT.getFixedSizeInBits(),
-                          VecVT.getStoreSize().getKnownMinValue()));
+  SDValue Increment = DAG.getTypeSize(DL, PtrVT, VecVT.getStoreSize());
   for (unsigned i = 0; i != Factor; ++i) {
     if (i != 0)
       StackPtr = DAG.getNode(ISD::ADD, DL, PtrVT, StackPtr, Increment);
@@ -14184,9 +14181,8 @@ RISCVTargetLowering::lowerVPReverseExperimental(SDValue Op,
   // Slide off any elements from past EVL that were reversed into the low
   // elements.
-  unsigned MinElts = GatherVT.getVectorMinNumElements();
   SDValue VLMax =
-      DAG.getVScale(DL, XLenVT, APInt(XLenVT.getSizeInBits(), MinElts));
+      DAG.getElementCount(DL, XLenVT, GatherVT.getVectorElementCount());
   SDValue Diff = DAG.getNode(ISD::SUB, DL, XLenVT, VLMax, EVL);
 
   Result = getVSlidedown(DAG, Subtarget, DL, GatherVT,
diff --git a/llvm/lib/Target/WebAssembly/WebAssemblyInstrInfo.td b/llvm/lib/Target/WebAssembly/WebAssemblyInstrInfo.td
index 13d048a98d6ea..ce4db2e112fa0 100644
--- a/llvm/lib/Target/WebAssembly/WebAssemblyInstrInfo.td
+++ b/llvm/lib/Target/WebAssembly/WebAssemblyInstrInfo.td
@@ -460,8 +460,8 @@ def : Pat<(i64 (WebAssemblyWrapperREL texternalsym:$addr)),
 include "WebAssemblyInstrMemory.td"
 include "WebAssemblyInstrCall.td"
 include "WebAssemblyInstrControl.td"
-include "WebAssemblyInstrInteger.td"
 include "WebAssemblyInstrConv.td"
+include "WebAssemblyInstrInteger.td"
 include "WebAssemblyInstrFloat.td"
 include "WebAssemblyInstrAtomics.td"
 include "WebAssemblyInstrSIMD.td"
diff --git a/llvm/lib/Target/WebAssembly/WebAssemblyInstrInteger.td b/llvm/lib/Target/WebAssembly/WebAssemblyInstrInteger.td
index d4c8f92c883e7..eb692679f5971 100644
--- a/llvm/lib/Target/WebAssembly/WebAssemblyInstrInteger.td
+++ b/llvm/lib/Target/WebAssembly/WebAssemblyInstrInteger.td
@@ -107,6 +107,9 @@ def : Pat<(rotr I32:$lhs, (and I32:$rhs, 31)), (ROTR_I32 I32:$lhs, I32:$rhs)>;
 def : Pat<(rotl I64:$lhs, (and I64:$rhs, 63)), (ROTL_I64 I64:$lhs, I64:$rhs)>;
 def : Pat<(rotr I64:$lhs, (and I64:$rhs, 63)), (ROTR_I64 I64:$lhs, I64:$rhs)>;
 
+def : Pat<(shl I64:$lhs, (zext (and I32:$rhs, 63))),
+          (SHL_I64 I64:$lhs, (I64_EXTEND_U_I32 I32:$rhs))>;
+
 defm SELECT_I32 : I<(outs I32:$dst), (ins I32:$lhs, I32:$rhs, I32:$cond),
                     (outs), (ins),
                     [(set I32:$dst, (select I32:$cond, I32:$lhs, I32:$rhs))],
diff --git a/llvm/lib/Transforms/InstCombine/InstCombineSelect.cpp b/llvm/lib/Transforms/InstCombine/InstCombineSelect.cpp
index e7dc366b13798..c9f51e4b294b1 100644
--- a/llvm/lib/Transforms/InstCombine/InstCombineSelect.cpp
+++ b/llvm/lib/Transforms/InstCombine/InstCombineSelect.cpp
@@ -1163,6 +1163,7 @@ static Value *canonicalizeSaturatedAddSigned(ICmpInst *Cmp, Value *TVal,
   // (X >= Y) ? INT_MAX : (X + C) --> sadd.sat(X, C)
   // where Y is INT_MAX - C or INT_MAX - C - 1, and C > 0
   if ((Pred == ICmpInst::ICMP_SGT || Pred == ICmpInst::ICMP_SGE) &&
+      isa<Constant>(Cmp1) &&
       match(FVal, m_Add(m_Specific(Cmp0), m_StrictlyPositive(C)))) {
     APInt IntMax =
         APInt::getSignedMaxValue(Cmp1->getType()->getScalarSizeInBits());
diff --git a/llvm/lib/Transforms/Utils/SCCPSolver.cpp b/llvm/lib/Transforms/Utils/SCCPSolver.cpp
index 4947d03a2dc66..021bf0618754a 100644
--- a/llvm/lib/Transforms/Utils/SCCPSolver.cpp
+++ b/llvm/lib/Transforms/Utils/SCCPSolver.cpp
@@ -2098,6 +2098,38 @@ void SCCPInstVisitor::handleCallResult(CallBase &CB) {
     return (void)mergeInValue(ValueState[II], II,
                               ValueLatticeElement::getRange(Result));
   }
+  if (II->getIntrinsicID() == Intrinsic::experimental_get_vector_length) {
+    Value *CountArg = II->getArgOperand(0);
+    Value *VF = II->getArgOperand(1);
+    bool Scalable = cast<ConstantInt>(II->getArgOperand(2))->isOne();
+
+    // Computation happens in the larger type.
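+    // (Count and VF may have different integer widths, so both ranges are
+    // zero-extended to the wider width before being compared.)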
+ unsigned BitWidth = std::max(CountArg->getType()->getScalarSizeInBits(), + VF->getType()->getScalarSizeInBits()); + + ConstantRange Count = getValueState(CountArg) + .asConstantRange(CountArg->getType(), false) + .zeroExtend(BitWidth); + ConstantRange MaxLanes = getValueState(VF) + .asConstantRange(VF->getType(), false) + .zeroExtend(BitWidth); + if (Scalable) + MaxLanes = + MaxLanes.multiply(getVScaleRange(II->getFunction(), BitWidth)); + + // The result is always less than both Count and MaxLanes. + ConstantRange Result( + APInt::getZero(BitWidth), + APIntOps::umin(Count.getUpper(), MaxLanes.getUpper())); + + // If Count <= MaxLanes, getvectorlength(Count, MaxLanes) = Count + if (Count.icmp(CmpInst::ICMP_ULE, MaxLanes)) + Result = Count; + + Result = Result.truncate(II->getType()->getScalarSizeInBits()); + return (void)mergeInValue(ValueState[II], II, + ValueLatticeElement::getRange(Result)); + } if (ConstantRange::isIntrinsicSupported(II->getIntrinsicID())) { // Compute result range for intrinsics supported by ConstantRange. diff --git a/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp b/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp index b12f8ccc73c7e..f7281283bae81 100644 --- a/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp +++ b/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp @@ -769,7 +769,8 @@ static void legalizeAndOptimizeInductions(VPlan &Plan) { // Replace wide pointer inductions which have only their scalars used by // PtrAdd(IndStart, ScalarIVSteps (0, Step)). if (auto *PtrIV = dyn_cast(&Phi)) { - if (!PtrIV->onlyScalarsGenerated(Plan.hasScalableVF())) + if (!Plan.hasScalarVFOnly() && + !PtrIV->onlyScalarsGenerated(Plan.hasScalableVF())) continue; VPValue *PtrAdd = scalarizeVPWidenPointerInduction(PtrIV, Plan, Builder); diff --git a/llvm/test/CodeGen/AArch64/sme-peephole-opts.ll b/llvm/test/CodeGen/AArch64/sme-peephole-opts.ll index a3027f01e73cf..ea1341186ddfa 100644 --- a/llvm/test/CodeGen/AArch64/sme-peephole-opts.ll +++ b/llvm/test/CodeGen/AArch64/sme-peephole-opts.ll @@ -230,10 +230,6 @@ define void @test7() nounwind "aarch64_inout_zt0" { ; CHECK-NEXT: str zt0, [x19] ; CHECK-NEXT: smstop za ; CHECK-NEXT: bl callee -; CHECK-NEXT: smstart za -; CHECK-NEXT: ldr zt0, [x19] -; CHECK-NEXT: str zt0, [x19] -; CHECK-NEXT: smstop za ; CHECK-NEXT: bl callee ; CHECK-NEXT: smstart za ; CHECK-NEXT: ldr zt0, [x19] diff --git a/llvm/test/CodeGen/AArch64/sme-za-exceptions.ll b/llvm/test/CodeGen/AArch64/sme-za-exceptions.ll index ef74825e02881..3947127c47844 100644 --- a/llvm/test/CodeGen/AArch64/sme-za-exceptions.ll +++ b/llvm/test/CodeGen/AArch64/sme-za-exceptions.ll @@ -511,7 +511,6 @@ exit: ; ; This code may require reloading ZT0 in the cleanup for ~ZT0Resource(). ; -; FIXME: Codegen with `-aarch64-new-sme-abi` is broken with ZT0 (as it is not implemented). define void @try_catch_shared_zt0_callee() "aarch64_inout_zt0" personality ptr @__gxx_personality_v0 { ; CHECK-LABEL: try_catch_shared_zt0_callee: ; CHECK: .Lfunc_begin3: @@ -519,52 +518,37 @@ define void @try_catch_shared_zt0_callee() "aarch64_inout_zt0" personality ptr @ ; CHECK-NEXT: .cfi_personality 156, DW.ref.__gxx_personality_v0 ; CHECK-NEXT: .cfi_lsda 28, .Lexception3 ; CHECK-NEXT: // %bb.0: -; CHECK-NEXT: stp x29, x30, [sp, #-32]! 
// 16-byte Folded Spill -; CHECK-NEXT: stp x20, x19, [sp, #16] // 16-byte Folded Spill -; CHECK-NEXT: mov x29, sp -; CHECK-NEXT: sub sp, sp, #80 -; CHECK-NEXT: .cfi_def_cfa w29, 32 +; CHECK-NEXT: sub sp, sp, #96 +; CHECK-NEXT: str x30, [sp, #64] // 8-byte Spill +; CHECK-NEXT: stp x20, x19, [sp, #80] // 16-byte Folded Spill +; CHECK-NEXT: .cfi_def_cfa_offset 96 ; CHECK-NEXT: .cfi_offset w19, -8 ; CHECK-NEXT: .cfi_offset w20, -16 -; CHECK-NEXT: .cfi_offset w30, -24 -; CHECK-NEXT: .cfi_offset w29, -32 -; CHECK-NEXT: rdsvl x8, #1 -; CHECK-NEXT: mov x9, sp -; CHECK-NEXT: msub x9, x8, x8, x9 -; CHECK-NEXT: mov sp, x9 -; CHECK-NEXT: stp x9, x8, [x29, #-80] +; CHECK-NEXT: .cfi_offset w30, -32 ; CHECK-NEXT: .Ltmp9: // EH_LABEL -; CHECK-NEXT: sub x19, x29, #64 +; CHECK-NEXT: mov x19, sp ; CHECK-NEXT: str zt0, [x19] ; CHECK-NEXT: smstop za ; CHECK-NEXT: bl may_throw +; CHECK-NEXT: .Ltmp10: // EH_LABEL ; CHECK-NEXT: smstart za ; CHECK-NEXT: ldr zt0, [x19] -; CHECK-NEXT: .Ltmp10: // EH_LABEL ; CHECK-NEXT: // %bb.1: // %return_normally -; CHECK-NEXT: mov sp, x29 -; CHECK-NEXT: ldp x20, x19, [sp, #16] // 16-byte Folded Reload -; CHECK-NEXT: ldp x29, x30, [sp], #32 // 16-byte Folded Reload +; CHECK-NEXT: ldp x20, x19, [sp, #80] // 16-byte Folded Reload +; CHECK-NEXT: ldr x30, [sp, #64] // 8-byte Reload +; CHECK-NEXT: add sp, sp, #96 ; CHECK-NEXT: ret ; CHECK-NEXT: .LBB3_2: // %unwind_dtors ; CHECK-NEXT: .Ltmp11: // EH_LABEL -; CHECK-NEXT: sub x20, x29, #64 +; CHECK-NEXT: mov x20, sp ; CHECK-NEXT: mov x19, x0 ; CHECK-NEXT: smstart za -; CHECK-NEXT: mrs x8, TPIDR2_EL0 -; CHECK-NEXT: sub x0, x29, #80 -; CHECK-NEXT: cbnz x8, .LBB3_4 -; CHECK-NEXT: // %bb.3: // %unwind_dtors -; CHECK-NEXT: bl __arm_tpidr2_restore -; CHECK-NEXT: .LBB3_4: // %unwind_dtors -; CHECK-NEXT: msr TPIDR2_EL0, xzr +; CHECK-NEXT: ldr zt0, [x20] ; CHECK-NEXT: bl shared_zt0_call ; CHECK-NEXT: str zt0, [x20] ; CHECK-NEXT: smstop za ; CHECK-NEXT: mov x0, x19 ; CHECK-NEXT: bl _Unwind_Resume -; CHECK-NEXT: smstart za -; CHECK-NEXT: ldr zt0, [x20] ; ; CHECK-SDAG-LABEL: try_catch_shared_zt0_callee: ; CHECK-SDAG: .Lfunc_begin3: @@ -965,6 +949,239 @@ exit: ret void } +define void @try_catch_inout_zt0() "aarch64_inout_zt0" personality ptr @__gxx_personality_v0 { +; CHECK-LABEL: try_catch_inout_zt0: +; CHECK: .Lfunc_begin7: +; CHECK-NEXT: .cfi_startproc +; CHECK-NEXT: .cfi_personality 156, DW.ref.__gxx_personality_v0 +; CHECK-NEXT: .cfi_lsda 28, .Lexception7 +; CHECK-NEXT: // %bb.0: // %entry +; CHECK-NEXT: sub sp, sp, #80 +; CHECK-NEXT: stp x30, x19, [sp, #64] // 16-byte Folded Spill +; CHECK-NEXT: .cfi_def_cfa_offset 80 +; CHECK-NEXT: .cfi_offset w19, -8 +; CHECK-NEXT: .cfi_offset w30, -16 +; CHECK-NEXT: .Ltmp21: // EH_LABEL +; CHECK-NEXT: mov x19, sp +; CHECK-NEXT: str zt0, [x19] +; CHECK-NEXT: smstop za +; CHECK-NEXT: bl may_throw +; CHECK-NEXT: .Ltmp22: // EH_LABEL +; CHECK-NEXT: .LBB7_1: // %exit +; CHECK-NEXT: smstart za +; CHECK-NEXT: ldr zt0, [x19] +; CHECK-NEXT: ldp x30, x19, [sp, #64] // 16-byte Folded Reload +; CHECK-NEXT: add sp, sp, #80 +; CHECK-NEXT: ret +; CHECK-NEXT: .LBB7_2: // %catch +; CHECK-NEXT: .Ltmp23: // EH_LABEL +; CHECK-NEXT: bl __cxa_begin_catch +; CHECK-NEXT: bl __cxa_end_catch +; CHECK-NEXT: b .LBB7_1 +; +; CHECK-SDAG-LABEL: try_catch_inout_zt0: +; CHECK-SDAG: .Lfunc_begin7: +; CHECK-SDAG-NEXT: .cfi_startproc +; CHECK-SDAG-NEXT: .cfi_personality 156, DW.ref.__gxx_personality_v0 +; CHECK-SDAG-NEXT: .cfi_lsda 28, .Lexception7 +; CHECK-SDAG-NEXT: // %bb.0: // %entry +; CHECK-SDAG-NEXT: sub sp, sp, #80 +; CHECK-SDAG-NEXT: 
stp x30, x19, [sp, #64] // 16-byte Folded Spill +; CHECK-SDAG-NEXT: .cfi_def_cfa_offset 80 +; CHECK-SDAG-NEXT: .cfi_offset w19, -8 +; CHECK-SDAG-NEXT: .cfi_offset w30, -16 +; CHECK-SDAG-NEXT: .Ltmp21: // EH_LABEL +; CHECK-SDAG-NEXT: mov x19, sp +; CHECK-SDAG-NEXT: str zt0, [x19] +; CHECK-SDAG-NEXT: smstop za +; CHECK-SDAG-NEXT: bl may_throw +; CHECK-SDAG-NEXT: smstart za +; CHECK-SDAG-NEXT: ldr zt0, [x19] +; CHECK-SDAG-NEXT: .Ltmp22: // EH_LABEL +; CHECK-SDAG-NEXT: .LBB7_1: // %exit +; CHECK-SDAG-NEXT: ldp x30, x19, [sp, #64] // 16-byte Folded Reload +; CHECK-SDAG-NEXT: add sp, sp, #80 +; CHECK-SDAG-NEXT: ret +; CHECK-SDAG-NEXT: .LBB7_2: // %catch +; CHECK-SDAG-NEXT: .Ltmp23: // EH_LABEL +; CHECK-SDAG-NEXT: smstart za +; CHECK-SDAG-NEXT: ldr zt0, [x19] +; CHECK-SDAG-NEXT: str zt0, [x19] +; CHECK-SDAG-NEXT: smstop za +; CHECK-SDAG-NEXT: bl __cxa_begin_catch +; CHECK-SDAG-NEXT: smstart za +; CHECK-SDAG-NEXT: ldr zt0, [x19] +; CHECK-SDAG-NEXT: str zt0, [x19] +; CHECK-SDAG-NEXT: smstop za +; CHECK-SDAG-NEXT: bl __cxa_end_catch +; CHECK-SDAG-NEXT: smstart za +; CHECK-SDAG-NEXT: ldr zt0, [x19] +; CHECK-SDAG-NEXT: b .LBB7_1 +entry: + invoke void @may_throw() + to label %exit unwind label %catch + +catch: + %eh_info = landingpad { ptr, i32 } + catch ptr null + %exception_ptr = extractvalue { ptr, i32 } %eh_info, 0 + tail call ptr @__cxa_begin_catch(ptr %exception_ptr) + tail call void @__cxa_end_catch() + br label %exit + +exit: + ret void +} + +define void @try_catch_shared_za_callee_zt0_saved(ptr %callee) "aarch64_inout_za" "aarch64_in_zt0" personality ptr @__gxx_personality_v0 { +; CHECK-LABEL: try_catch_shared_za_callee_zt0_saved: +; CHECK: .Lfunc_begin8: +; CHECK-NEXT: .cfi_startproc +; CHECK-NEXT: .cfi_personality 156, DW.ref.__gxx_personality_v0 +; CHECK-NEXT: .cfi_lsda 28, .Lexception8 +; CHECK-NEXT: // %bb.0: +; CHECK-NEXT: stp x29, x30, [sp, #-32]! 
// 16-byte Folded Spill +; CHECK-NEXT: stp x20, x19, [sp, #16] // 16-byte Folded Spill +; CHECK-NEXT: mov x29, sp +; CHECK-NEXT: sub sp, sp, #80 +; CHECK-NEXT: .cfi_def_cfa w29, 32 +; CHECK-NEXT: .cfi_offset w19, -8 +; CHECK-NEXT: .cfi_offset w20, -16 +; CHECK-NEXT: .cfi_offset w30, -24 +; CHECK-NEXT: .cfi_offset w29, -32 +; CHECK-NEXT: rdsvl x8, #1 +; CHECK-NEXT: mov x9, sp +; CHECK-NEXT: msub x9, x8, x8, x9 +; CHECK-NEXT: mov sp, x9 +; CHECK-NEXT: mov x19, x0 +; CHECK-NEXT: stp x9, x8, [x29, #-80] +; CHECK-NEXT: .Ltmp24: // EH_LABEL +; CHECK-NEXT: sub x20, x29, #64 +; CHECK-NEXT: sub x8, x29, #80 +; CHECK-NEXT: str zt0, [x20] +; CHECK-NEXT: msr TPIDR2_EL0, x8 +; CHECK-NEXT: bl may_throw +; CHECK-NEXT: .Ltmp25: // EH_LABEL +; CHECK-NEXT: smstart za +; CHECK-NEXT: mrs x8, TPIDR2_EL0 +; CHECK-NEXT: sub x0, x29, #80 +; CHECK-NEXT: cbnz x8, .LBB8_2 +; CHECK-NEXT: // %bb.1: +; CHECK-NEXT: bl __arm_tpidr2_restore +; CHECK-NEXT: .LBB8_2: +; CHECK-NEXT: msr TPIDR2_EL0, xzr +; CHECK-NEXT: ldr zt0, [x20] +; CHECK-NEXT: // %bb.3: // %return_normally +; CHECK-NEXT: mov sp, x29 +; CHECK-NEXT: ldp x20, x19, [sp, #16] // 16-byte Folded Reload +; CHECK-NEXT: ldp x29, x30, [sp], #32 // 16-byte Folded Reload +; CHECK-NEXT: ret +; CHECK-NEXT: .LBB8_4: // %unwind_dtors +; CHECK-NEXT: .Ltmp26: // EH_LABEL +; CHECK-NEXT: mov x20, x0 +; CHECK-NEXT: smstart za +; CHECK-NEXT: mrs x8, TPIDR2_EL0 +; CHECK-NEXT: sub x0, x29, #80 +; CHECK-NEXT: cbnz x8, .LBB8_6 +; CHECK-NEXT: // %bb.5: // %unwind_dtors +; CHECK-NEXT: bl __arm_tpidr2_restore +; CHECK-NEXT: .LBB8_6: // %unwind_dtors +; CHECK-NEXT: msr TPIDR2_EL0, xzr +; CHECK-NEXT: blr x19 +; CHECK-NEXT: sub x8, x29, #80 +; CHECK-NEXT: mov x0, x20 +; CHECK-NEXT: msr TPIDR2_EL0, x8 +; CHECK-NEXT: bl _Unwind_Resume +; +; CHECK-SDAG-LABEL: try_catch_shared_za_callee_zt0_saved: +; CHECK-SDAG: .Lfunc_begin8: +; CHECK-SDAG-NEXT: .cfi_startproc +; CHECK-SDAG-NEXT: .cfi_personality 156, DW.ref.__gxx_personality_v0 +; CHECK-SDAG-NEXT: .cfi_lsda 28, .Lexception8 +; CHECK-SDAG-NEXT: // %bb.0: +; CHECK-SDAG-NEXT: stp x29, x30, [sp, #-48]! 
// 16-byte Folded Spill +; CHECK-SDAG-NEXT: stp x22, x21, [sp, #16] // 16-byte Folded Spill +; CHECK-SDAG-NEXT: mov x29, sp +; CHECK-SDAG-NEXT: stp x20, x19, [sp, #32] // 16-byte Folded Spill +; CHECK-SDAG-NEXT: sub sp, sp, #80 +; CHECK-SDAG-NEXT: .cfi_def_cfa w29, 48 +; CHECK-SDAG-NEXT: .cfi_offset w19, -8 +; CHECK-SDAG-NEXT: .cfi_offset w20, -16 +; CHECK-SDAG-NEXT: .cfi_offset w21, -24 +; CHECK-SDAG-NEXT: .cfi_offset w22, -32 +; CHECK-SDAG-NEXT: .cfi_offset w30, -40 +; CHECK-SDAG-NEXT: .cfi_offset w29, -48 +; CHECK-SDAG-NEXT: rdsvl x8, #1 +; CHECK-SDAG-NEXT: mov x9, sp +; CHECK-SDAG-NEXT: mov x19, x0 +; CHECK-SDAG-NEXT: msub x9, x8, x8, x9 +; CHECK-SDAG-NEXT: mov sp, x9 +; CHECK-SDAG-NEXT: stp x9, x8, [x29, #-16] +; CHECK-SDAG-NEXT: .Ltmp24: // EH_LABEL +; CHECK-SDAG-NEXT: sub x8, x29, #16 +; CHECK-SDAG-NEXT: sub x20, x29, #80 +; CHECK-SDAG-NEXT: msr TPIDR2_EL0, x8 +; CHECK-SDAG-NEXT: str zt0, [x20] +; CHECK-SDAG-NEXT: bl may_throw +; CHECK-SDAG-NEXT: smstart za +; CHECK-SDAG-NEXT: ldr zt0, [x20] +; CHECK-SDAG-NEXT: mrs x8, TPIDR2_EL0 +; CHECK-SDAG-NEXT: sub x0, x29, #16 +; CHECK-SDAG-NEXT: cbnz x8, .LBB8_2 +; CHECK-SDAG-NEXT: // %bb.1: +; CHECK-SDAG-NEXT: bl __arm_tpidr2_restore +; CHECK-SDAG-NEXT: .LBB8_2: +; CHECK-SDAG-NEXT: msr TPIDR2_EL0, xzr +; CHECK-SDAG-NEXT: .Ltmp25: // EH_LABEL +; CHECK-SDAG-NEXT: // %bb.3: // %return_normally +; CHECK-SDAG-NEXT: mov sp, x29 +; CHECK-SDAG-NEXT: ldp x20, x19, [sp, #32] // 16-byte Folded Reload +; CHECK-SDAG-NEXT: ldp x22, x21, [sp, #16] // 16-byte Folded Reload +; CHECK-SDAG-NEXT: ldp x29, x30, [sp], #48 // 16-byte Folded Reload +; CHECK-SDAG-NEXT: ret +; CHECK-SDAG-NEXT: .LBB8_4: // %unwind_dtors +; CHECK-SDAG-NEXT: .Ltmp26: // EH_LABEL +; CHECK-SDAG-NEXT: sub x21, x29, #80 +; CHECK-SDAG-NEXT: sub x22, x29, #16 +; CHECK-SDAG-NEXT: mov x20, x0 +; CHECK-SDAG-NEXT: smstart za +; CHECK-SDAG-NEXT: ldr zt0, [x21] +; CHECK-SDAG-NEXT: mrs x8, TPIDR2_EL0 +; CHECK-SDAG-NEXT: sub x0, x29, #16 +; CHECK-SDAG-NEXT: cbnz x8, .LBB8_6 +; CHECK-SDAG-NEXT: // %bb.5: // %unwind_dtors +; CHECK-SDAG-NEXT: bl __arm_tpidr2_restore +; CHECK-SDAG-NEXT: .LBB8_6: // %unwind_dtors +; CHECK-SDAG-NEXT: msr TPIDR2_EL0, xzr +; CHECK-SDAG-NEXT: str zt0, [x21] +; CHECK-SDAG-NEXT: blr x19 +; CHECK-SDAG-NEXT: ldr zt0, [x21] +; CHECK-SDAG-NEXT: mov x0, x20 +; CHECK-SDAG-NEXT: msr TPIDR2_EL0, x22 +; CHECK-SDAG-NEXT: str zt0, [x21] +; CHECK-SDAG-NEXT: bl _Unwind_Resume +; CHECK-SDAG-NEXT: smstart za +; CHECK-SDAG-NEXT: ldr zt0, [x21] +; CHECK-SDAG-NEXT: mrs x8, TPIDR2_EL0 +; CHECK-SDAG-NEXT: sub x0, x29, #16 +; CHECK-SDAG-NEXT: cbnz x8, .LBB8_8 +; CHECK-SDAG-NEXT: // %bb.7: // %unwind_dtors +; CHECK-SDAG-NEXT: bl __arm_tpidr2_restore +; CHECK-SDAG-NEXT: .LBB8_8: // %unwind_dtors +; CHECK-SDAG-NEXT: msr TPIDR2_EL0, xzr + invoke void @may_throw() + to label %return_normally unwind label %unwind_dtors + +unwind_dtors: + %5 = landingpad { ptr, i32 } + cleanup + call void %callee() "aarch64_inout_za" + resume { ptr, i32 } %5 + +return_normally: + ret void +} + declare ptr @__cxa_allocate_exception(i64) declare void @__cxa_throw(ptr, ptr, ptr) declare ptr @__cxa_begin_catch(ptr) diff --git a/llvm/test/CodeGen/AArch64/sme-zt0-state.ll b/llvm/test/CodeGen/AArch64/sme-zt0-state.ll index 69c69f027a33f..0d4a39b2eeb2f 100644 --- a/llvm/test/CodeGen/AArch64/sme-zt0-state.ll +++ b/llvm/test/CodeGen/AArch64/sme-zt0-state.ll @@ -193,7 +193,7 @@ define void @zt0_new_caller_zt0_new_callee(ptr %callee) "aarch64_new_zt0" nounwi ; CHECK-NEWLOWERING-LABEL: zt0_new_caller_zt0_new_callee: ; 
CHECK-NEWLOWERING: // %bb.0: ; CHECK-NEWLOWERING-NEXT: sub sp, sp, #80 -; CHECK-NEWLOWERING-NEXT: stp x30, x19, [sp, #64] // 16-byte Folded Spill +; CHECK-NEWLOWERING-NEXT: str x30, [sp, #64] // 8-byte Spill ; CHECK-NEWLOWERING-NEXT: mrs x8, TPIDR2_EL0 ; CHECK-NEWLOWERING-NEXT: cbz x8, .LBB6_2 ; CHECK-NEWLOWERING-NEXT: // %bb.1: @@ -202,14 +202,11 @@ define void @zt0_new_caller_zt0_new_callee(ptr %callee) "aarch64_new_zt0" nounwi ; CHECK-NEWLOWERING-NEXT: zero { zt0 } ; CHECK-NEWLOWERING-NEXT: .LBB6_2: ; CHECK-NEWLOWERING-NEXT: smstart za -; CHECK-NEWLOWERING-NEXT: mov x19, sp -; CHECK-NEWLOWERING-NEXT: str zt0, [x19] +; CHECK-NEWLOWERING-NEXT: mov x8, sp +; CHECK-NEWLOWERING-NEXT: str zt0, [x8] ; CHECK-NEWLOWERING-NEXT: smstop za ; CHECK-NEWLOWERING-NEXT: blr x0 -; CHECK-NEWLOWERING-NEXT: smstart za -; CHECK-NEWLOWERING-NEXT: ldr zt0, [x19] -; CHECK-NEWLOWERING-NEXT: smstop za -; CHECK-NEWLOWERING-NEXT: ldp x30, x19, [sp, #64] // 16-byte Folded Reload +; CHECK-NEWLOWERING-NEXT: ldr x30, [sp, #64] // 8-byte Reload ; CHECK-NEWLOWERING-NEXT: add sp, sp, #80 ; CHECK-NEWLOWERING-NEXT: ret call void %callee() "aarch64_new_zt0"; @@ -246,7 +243,7 @@ define i64 @zt0_new_caller_abi_routine_callee() "aarch64_new_zt0" nounwind { ; CHECK-NEWLOWERING-LABEL: zt0_new_caller_abi_routine_callee: ; CHECK-NEWLOWERING: // %bb.0: ; CHECK-NEWLOWERING-NEXT: sub sp, sp, #80 -; CHECK-NEWLOWERING-NEXT: stp x30, x19, [sp, #64] // 16-byte Folded Spill +; CHECK-NEWLOWERING-NEXT: str x30, [sp, #64] // 8-byte Spill ; CHECK-NEWLOWERING-NEXT: mrs x8, TPIDR2_EL0 ; CHECK-NEWLOWERING-NEXT: cbz x8, .LBB7_2 ; CHECK-NEWLOWERING-NEXT: // %bb.1: @@ -255,12 +252,11 @@ define i64 @zt0_new_caller_abi_routine_callee() "aarch64_new_zt0" nounwind { ; CHECK-NEWLOWERING-NEXT: zero { zt0 } ; CHECK-NEWLOWERING-NEXT: .LBB7_2: ; CHECK-NEWLOWERING-NEXT: smstart za -; CHECK-NEWLOWERING-NEXT: mov x19, sp -; CHECK-NEWLOWERING-NEXT: str zt0, [x19] -; CHECK-NEWLOWERING-NEXT: bl __arm_sme_state -; CHECK-NEWLOWERING-NEXT: ldr zt0, [x19] +; CHECK-NEWLOWERING-NEXT: mov x8, sp +; CHECK-NEWLOWERING-NEXT: str zt0, [x8] ; CHECK-NEWLOWERING-NEXT: smstop za -; CHECK-NEWLOWERING-NEXT: ldp x30, x19, [sp, #64] // 16-byte Folded Reload +; CHECK-NEWLOWERING-NEXT: bl __arm_sme_state +; CHECK-NEWLOWERING-NEXT: ldr x30, [sp, #64] // 8-byte Reload ; CHECK-NEWLOWERING-NEXT: add sp, sp, #80 ; CHECK-NEWLOWERING-NEXT: ret %res = call {i64, i64} @__arm_sme_state() @@ -382,37 +378,57 @@ define void @shared_za_new_zt0(ptr %callee) "aarch64_inout_za" "aarch64_new_zt0" define void @zt0_multiple_private_za_calls(ptr %callee) "aarch64_in_zt0" nounwind { -; CHECK-COMMON-LABEL: zt0_multiple_private_za_calls: -; CHECK-COMMON: // %bb.0: -; CHECK-COMMON-NEXT: sub sp, sp, #96 -; CHECK-COMMON-NEXT: stp x20, x19, [sp, #80] // 16-byte Folded Spill -; CHECK-COMMON-NEXT: mov x20, sp -; CHECK-COMMON-NEXT: mov x19, x0 -; CHECK-COMMON-NEXT: str x30, [sp, #64] // 8-byte Spill -; CHECK-COMMON-NEXT: str zt0, [x20] -; CHECK-COMMON-NEXT: smstop za -; CHECK-COMMON-NEXT: blr x0 -; CHECK-COMMON-NEXT: smstart za -; CHECK-COMMON-NEXT: ldr zt0, [x20] -; CHECK-COMMON-NEXT: str zt0, [x20] -; CHECK-COMMON-NEXT: smstop za -; CHECK-COMMON-NEXT: blr x19 -; CHECK-COMMON-NEXT: smstart za -; CHECK-COMMON-NEXT: ldr zt0, [x20] -; CHECK-COMMON-NEXT: str zt0, [x20] -; CHECK-COMMON-NEXT: smstop za -; CHECK-COMMON-NEXT: blr x19 -; CHECK-COMMON-NEXT: smstart za -; CHECK-COMMON-NEXT: ldr zt0, [x20] -; CHECK-COMMON-NEXT: str zt0, [x20] -; CHECK-COMMON-NEXT: smstop za -; CHECK-COMMON-NEXT: blr x19 -; CHECK-COMMON-NEXT: 
smstart za -; CHECK-COMMON-NEXT: ldr zt0, [x20] -; CHECK-COMMON-NEXT: ldp x20, x19, [sp, #80] // 16-byte Folded Reload -; CHECK-COMMON-NEXT: ldr x30, [sp, #64] // 8-byte Reload -; CHECK-COMMON-NEXT: add sp, sp, #96 -; CHECK-COMMON-NEXT: ret +; CHECK-LABEL: zt0_multiple_private_za_calls: +; CHECK: // %bb.0: +; CHECK-NEXT: sub sp, sp, #96 +; CHECK-NEXT: stp x20, x19, [sp, #80] // 16-byte Folded Spill +; CHECK-NEXT: mov x20, sp +; CHECK-NEXT: mov x19, x0 +; CHECK-NEXT: str x30, [sp, #64] // 8-byte Spill +; CHECK-NEXT: str zt0, [x20] +; CHECK-NEXT: smstop za +; CHECK-NEXT: blr x0 +; CHECK-NEXT: smstart za +; CHECK-NEXT: ldr zt0, [x20] +; CHECK-NEXT: str zt0, [x20] +; CHECK-NEXT: smstop za +; CHECK-NEXT: blr x19 +; CHECK-NEXT: smstart za +; CHECK-NEXT: ldr zt0, [x20] +; CHECK-NEXT: str zt0, [x20] +; CHECK-NEXT: smstop za +; CHECK-NEXT: blr x19 +; CHECK-NEXT: smstart za +; CHECK-NEXT: ldr zt0, [x20] +; CHECK-NEXT: str zt0, [x20] +; CHECK-NEXT: smstop za +; CHECK-NEXT: blr x19 +; CHECK-NEXT: smstart za +; CHECK-NEXT: ldr zt0, [x20] +; CHECK-NEXT: ldp x20, x19, [sp, #80] // 16-byte Folded Reload +; CHECK-NEXT: ldr x30, [sp, #64] // 8-byte Reload +; CHECK-NEXT: add sp, sp, #96 +; CHECK-NEXT: ret +; +; CHECK-NEWLOWERING-LABEL: zt0_multiple_private_za_calls: +; CHECK-NEWLOWERING: // %bb.0: +; CHECK-NEWLOWERING-NEXT: sub sp, sp, #96 +; CHECK-NEWLOWERING-NEXT: stp x20, x19, [sp, #80] // 16-byte Folded Spill +; CHECK-NEWLOWERING-NEXT: mov x20, sp +; CHECK-NEWLOWERING-NEXT: mov x19, x0 +; CHECK-NEWLOWERING-NEXT: str x30, [sp, #64] // 8-byte Spill +; CHECK-NEWLOWERING-NEXT: str zt0, [x20] +; CHECK-NEWLOWERING-NEXT: smstop za +; CHECK-NEWLOWERING-NEXT: blr x0 +; CHECK-NEWLOWERING-NEXT: blr x19 +; CHECK-NEWLOWERING-NEXT: blr x19 +; CHECK-NEWLOWERING-NEXT: blr x19 +; CHECK-NEWLOWERING-NEXT: smstart za +; CHECK-NEWLOWERING-NEXT: ldr zt0, [x20] +; CHECK-NEWLOWERING-NEXT: ldp x20, x19, [sp, #80] // 16-byte Folded Reload +; CHECK-NEWLOWERING-NEXT: ldr x30, [sp, #64] // 8-byte Reload +; CHECK-NEWLOWERING-NEXT: add sp, sp, #96 +; CHECK-NEWLOWERING-NEXT: ret call void %callee() call void %callee() call void %callee() diff --git a/llvm/test/CodeGen/RISCV/zicond-fp-select-zfinx.ll b/llvm/test/CodeGen/RISCV/zicond-fp-select-zfinx.ll index b505c84166eb1..0e8a0c704207d 100644 --- a/llvm/test/CodeGen/RISCV/zicond-fp-select-zfinx.ll +++ b/llvm/test/CodeGen/RISCV/zicond-fp-select-zfinx.ll @@ -1,19 +1,19 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 6 ; Zicond with zfinx(implies by zdinx) -; RUN: llc -mtriple=riscv64 -mattr=+zdinx,+zicond -verify-machineinstrs < %s | FileCheck %s --check-prefix=RV64ZDINX_ZICOND -; RUN: llc -mtriple=riscv64 -mattr=+zdinx -verify-machineinstrs < %s | FileCheck %s --check-prefix=RV64ZDINX_NOZICOND +; RUN: llc -mtriple=riscv64 -mattr=+zdinx,+zicond -verify-machineinstrs < %s | FileCheck %s --check-prefixes=ZDINX_ZICOND,RV64ZDINX_ZICOND +; RUN: llc -mtriple=riscv64 -mattr=+zdinx -verify-machineinstrs < %s | FileCheck %s --check-prefixes=ZDINX_NOZICOND,RV64ZDINX_NOZICOND ; Zicond with zfinx(implies by zhinx) -; RUN: llc -mtriple=riscv64 -mattr=+zhinx,+zicond -verify-machineinstrs < %s | FileCheck %s --check-prefix=RV64ZHINX_ZICOND +; RUN: llc -mtriple=riscv64 -mattr=+zhinx,+zicond -verify-machineinstrs < %s | FileCheck %s --check-prefixes=ZHINX_ZICOND,RV64ZHINX_ZICOND ; Baseline with classic FP registers (no *inx); zicond select should NOT trigger ; RUN: llc -mtriple=riscv64 -mattr=+f,+d -verify-machineinstrs < %s | FileCheck %s 
--check-prefix=RV64FD ; Check same optimize work on 32bit machine -; RUN: llc -mtriple=riscv32 -mattr=+zfinx,+zicond -verify-machineinstrs < %s | FileCheck %s --check-prefix=RV32ZFINX_ZICOND +; RUN: llc -mtriple=riscv32 -mattr=+zfinx,+zicond -verify-machineinstrs < %s | FileCheck %s --check-prefixes=ZHINX_ZICOND,RV32ZFINX_ZICOND ; RUN: llc -mtriple=riscv32 -mattr=+zfinx -verify-machineinstrs < %s | FileCheck %s --check-prefix=RV32ZFINX_NOZICOND -; RUN: llc -mtriple=riscv32 -mattr=+zdinx,+zicond -verify-machineinstrs < %s | FileCheck %s --check-prefix=RV32ZDINX_ZICOND -; RUN: llc -mtriple=riscv32 -mattr=+zdinx -verify-machineinstrs < %s | FileCheck %s --check-prefix=RV32ZDINX_NOZICOND +; RUN: llc -mtriple=riscv32 -mattr=+zdinx,+zicond -verify-machineinstrs < %s | FileCheck %s --check-prefixes=ZDINX_ZICOND,RV32ZDINX_ZICOND +; RUN: llc -mtriple=riscv32 -mattr=+zdinx -verify-machineinstrs < %s | FileCheck %s --check-prefixes=ZDINX_NOZICOND,RV32ZDINX_NOZICOND ; This test checks that floating-point SELECT is lowered through integer ; SELECT (and thus to Zicond czero.* sequence) when FP values live in GPRs @@ -25,37 +25,37 @@ ; ----------------------------------------------------------------------------- define float @select_f32_i1(i1 %cond, float %t, float %f) nounwind { -; RV64ZDINX_ZICOND-LABEL: select_f32_i1: -; RV64ZDINX_ZICOND: # %bb.0: # %entry -; RV64ZDINX_ZICOND-NEXT: # kill: def $x12_w killed $x12_w def $x12 -; RV64ZDINX_ZICOND-NEXT: # kill: def $x11_w killed $x11_w def $x11 -; RV64ZDINX_ZICOND-NEXT: andi a0, a0, 1 -; RV64ZDINX_ZICOND-NEXT: czero.nez a2, a2, a0 -; RV64ZDINX_ZICOND-NEXT: czero.eqz a0, a1, a0 -; RV64ZDINX_ZICOND-NEXT: or a0, a0, a2 -; RV64ZDINX_ZICOND-NEXT: # kill: def $x10_w killed $x10_w killed $x10 -; RV64ZDINX_ZICOND-NEXT: ret -; -; RV64ZDINX_NOZICOND-LABEL: select_f32_i1: -; RV64ZDINX_NOZICOND: # %bb.0: # %entry -; RV64ZDINX_NOZICOND-NEXT: andi a3, a0, 1 -; RV64ZDINX_NOZICOND-NEXT: mv a0, a1 -; RV64ZDINX_NOZICOND-NEXT: bnez a3, .LBB0_2 -; RV64ZDINX_NOZICOND-NEXT: # %bb.1: # %entry -; RV64ZDINX_NOZICOND-NEXT: mv a0, a2 -; RV64ZDINX_NOZICOND-NEXT: .LBB0_2: # %entry -; RV64ZDINX_NOZICOND-NEXT: ret -; -; RV64ZHINX_ZICOND-LABEL: select_f32_i1: -; RV64ZHINX_ZICOND: # %bb.0: # %entry -; RV64ZHINX_ZICOND-NEXT: # kill: def $x12_w killed $x12_w def $x12 -; RV64ZHINX_ZICOND-NEXT: # kill: def $x11_w killed $x11_w def $x11 -; RV64ZHINX_ZICOND-NEXT: andi a0, a0, 1 -; RV64ZHINX_ZICOND-NEXT: czero.nez a2, a2, a0 -; RV64ZHINX_ZICOND-NEXT: czero.eqz a0, a1, a0 -; RV64ZHINX_ZICOND-NEXT: or a0, a0, a2 -; RV64ZHINX_ZICOND-NEXT: # kill: def $x10_w killed $x10_w killed $x10 -; RV64ZHINX_ZICOND-NEXT: ret +; ZDINX_ZICOND-LABEL: select_f32_i1: +; ZDINX_ZICOND: # %bb.0: # %entry +; ZDINX_ZICOND-NEXT: # kill: def $x12_w killed $x12_w def $x12 +; ZDINX_ZICOND-NEXT: # kill: def $x11_w killed $x11_w def $x11 +; ZDINX_ZICOND-NEXT: andi a0, a0, 1 +; ZDINX_ZICOND-NEXT: czero.nez a2, a2, a0 +; ZDINX_ZICOND-NEXT: czero.eqz a0, a1, a0 +; ZDINX_ZICOND-NEXT: or a0, a0, a2 +; ZDINX_ZICOND-NEXT: # kill: def $x10_w killed $x10_w killed $x10 +; ZDINX_ZICOND-NEXT: ret +; +; ZDINX_NOZICOND-LABEL: select_f32_i1: +; ZDINX_NOZICOND: # %bb.0: # %entry +; ZDINX_NOZICOND-NEXT: andi a3, a0, 1 +; ZDINX_NOZICOND-NEXT: mv a0, a1 +; ZDINX_NOZICOND-NEXT: bnez a3, .LBB0_2 +; ZDINX_NOZICOND-NEXT: # %bb.1: # %entry +; ZDINX_NOZICOND-NEXT: mv a0, a2 +; ZDINX_NOZICOND-NEXT: .LBB0_2: # %entry +; ZDINX_NOZICOND-NEXT: ret +; +; ZHINX_ZICOND-LABEL: select_f32_i1: +; ZHINX_ZICOND: # %bb.0: # %entry +; ZHINX_ZICOND-NEXT: # kill: def 
$x12_w killed $x12_w def $x12 +; ZHINX_ZICOND-NEXT: # kill: def $x11_w killed $x11_w def $x11 +; ZHINX_ZICOND-NEXT: andi a0, a0, 1 +; ZHINX_ZICOND-NEXT: czero.nez a2, a2, a0 +; ZHINX_ZICOND-NEXT: czero.eqz a0, a1, a0 +; ZHINX_ZICOND-NEXT: or a0, a0, a2 +; ZHINX_ZICOND-NEXT: # kill: def $x10_w killed $x10_w killed $x10 +; ZHINX_ZICOND-NEXT: ret ; ; RV64FD-LABEL: select_f32_i1: ; RV64FD: # %bb.0: # %entry @@ -66,17 +66,6 @@ define float @select_f32_i1(i1 %cond, float %t, float %f) nounwind { ; RV64FD-NEXT: .LBB0_2: # %entry ; RV64FD-NEXT: ret ; -; RV32ZFINX_ZICOND-LABEL: select_f32_i1: -; RV32ZFINX_ZICOND: # %bb.0: # %entry -; RV32ZFINX_ZICOND-NEXT: # kill: def $x12_w killed $x12_w def $x12 -; RV32ZFINX_ZICOND-NEXT: # kill: def $x11_w killed $x11_w def $x11 -; RV32ZFINX_ZICOND-NEXT: andi a0, a0, 1 -; RV32ZFINX_ZICOND-NEXT: czero.nez a2, a2, a0 -; RV32ZFINX_ZICOND-NEXT: czero.eqz a0, a1, a0 -; RV32ZFINX_ZICOND-NEXT: or a0, a0, a2 -; RV32ZFINX_ZICOND-NEXT: # kill: def $x10_w killed $x10_w killed $x10 -; RV32ZFINX_ZICOND-NEXT: ret -; ; RV32ZFINX_NOZICOND-LABEL: select_f32_i1: ; RV32ZFINX_NOZICOND: # %bb.0: # %entry ; RV32ZFINX_NOZICOND-NEXT: andi a3, a0, 1 @@ -86,27 +75,6 @@ define float @select_f32_i1(i1 %cond, float %t, float %f) nounwind { ; RV32ZFINX_NOZICOND-NEXT: mv a0, a2 ; RV32ZFINX_NOZICOND-NEXT: .LBB0_2: # %entry ; RV32ZFINX_NOZICOND-NEXT: ret -; -; RV32ZDINX_ZICOND-LABEL: select_f32_i1: -; RV32ZDINX_ZICOND: # %bb.0: # %entry -; RV32ZDINX_ZICOND-NEXT: # kill: def $x12_w killed $x12_w def $x12 -; RV32ZDINX_ZICOND-NEXT: # kill: def $x11_w killed $x11_w def $x11 -; RV32ZDINX_ZICOND-NEXT: andi a0, a0, 1 -; RV32ZDINX_ZICOND-NEXT: czero.nez a2, a2, a0 -; RV32ZDINX_ZICOND-NEXT: czero.eqz a0, a1, a0 -; RV32ZDINX_ZICOND-NEXT: or a0, a0, a2 -; RV32ZDINX_ZICOND-NEXT: # kill: def $x10_w killed $x10_w killed $x10 -; RV32ZDINX_ZICOND-NEXT: ret -; -; RV32ZDINX_NOZICOND-LABEL: select_f32_i1: -; RV32ZDINX_NOZICOND: # %bb.0: # %entry -; RV32ZDINX_NOZICOND-NEXT: andi a3, a0, 1 -; RV32ZDINX_NOZICOND-NEXT: mv a0, a1 -; RV32ZDINX_NOZICOND-NEXT: bnez a3, .LBB0_2 -; RV32ZDINX_NOZICOND-NEXT: # %bb.1: # %entry -; RV32ZDINX_NOZICOND-NEXT: mv a0, a2 -; RV32ZDINX_NOZICOND-NEXT: .LBB0_2: # %entry -; RV32ZDINX_NOZICOND-NEXT: ret entry: %sel = select i1 %cond, float %t, float %f ret float %sel @@ -353,32 +321,32 @@ entry: ; ----------------------------------------------------------------------------- define dso_local noundef half @select_half_i1(i1 %cond, half %a, half %b) nounwind { -; RV64ZDINX_ZICOND-LABEL: select_half_i1: -; RV64ZDINX_ZICOND: # %bb.0: # %entry -; RV64ZDINX_ZICOND-NEXT: # kill: def $x12_w killed $x12_w def $x12 -; RV64ZDINX_ZICOND-NEXT: # kill: def $x11_w killed $x11_w def $x11 -; RV64ZDINX_ZICOND-NEXT: andi a0, a0, 1 -; RV64ZDINX_ZICOND-NEXT: czero.nez a2, a2, a0 -; RV64ZDINX_ZICOND-NEXT: czero.eqz a0, a1, a0 -; RV64ZDINX_ZICOND-NEXT: or a0, a0, a2 -; RV64ZDINX_ZICOND-NEXT: lui a1, 1048560 -; RV64ZDINX_ZICOND-NEXT: or a0, a0, a1 -; RV64ZDINX_ZICOND-NEXT: # kill: def $x10_w killed $x10_w killed $x10 -; RV64ZDINX_ZICOND-NEXT: ret -; -; RV64ZDINX_NOZICOND-LABEL: select_half_i1: -; RV64ZDINX_NOZICOND: # %bb.0: # %entry -; RV64ZDINX_NOZICOND-NEXT: # kill: def $x12_w killed $x12_w def $x12 -; RV64ZDINX_NOZICOND-NEXT: andi a0, a0, 1 -; RV64ZDINX_NOZICOND-NEXT: # kill: def $x11_w killed $x11_w def $x11 -; RV64ZDINX_NOZICOND-NEXT: bnez a0, .LBB3_2 -; RV64ZDINX_NOZICOND-NEXT: # %bb.1: # %entry -; RV64ZDINX_NOZICOND-NEXT: mv a1, a2 -; RV64ZDINX_NOZICOND-NEXT: .LBB3_2: # %entry -; RV64ZDINX_NOZICOND-NEXT: 
lui a0, 1048560 -; RV64ZDINX_NOZICOND-NEXT: or a0, a1, a0 -; RV64ZDINX_NOZICOND-NEXT: # kill: def $x10_w killed $x10_w killed $x10 -; RV64ZDINX_NOZICOND-NEXT: ret +; ZDINX_ZICOND-LABEL: select_half_i1: +; ZDINX_ZICOND: # %bb.0: # %entry +; ZDINX_ZICOND-NEXT: # kill: def $x12_w killed $x12_w def $x12 +; ZDINX_ZICOND-NEXT: # kill: def $x11_w killed $x11_w def $x11 +; ZDINX_ZICOND-NEXT: andi a0, a0, 1 +; ZDINX_ZICOND-NEXT: czero.nez a2, a2, a0 +; ZDINX_ZICOND-NEXT: czero.eqz a0, a1, a0 +; ZDINX_ZICOND-NEXT: or a0, a0, a2 +; ZDINX_ZICOND-NEXT: lui a1, 1048560 +; ZDINX_ZICOND-NEXT: or a0, a0, a1 +; ZDINX_ZICOND-NEXT: # kill: def $x10_w killed $x10_w killed $x10 +; ZDINX_ZICOND-NEXT: ret +; +; ZDINX_NOZICOND-LABEL: select_half_i1: +; ZDINX_NOZICOND: # %bb.0: # %entry +; ZDINX_NOZICOND-NEXT: # kill: def $x12_w killed $x12_w def $x12 +; ZDINX_NOZICOND-NEXT: andi a0, a0, 1 +; ZDINX_NOZICOND-NEXT: # kill: def $x11_w killed $x11_w def $x11 +; ZDINX_NOZICOND-NEXT: bnez a0, .LBB3_2 +; ZDINX_NOZICOND-NEXT: # %bb.1: # %entry +; ZDINX_NOZICOND-NEXT: mv a1, a2 +; ZDINX_NOZICOND-NEXT: .LBB3_2: # %entry +; ZDINX_NOZICOND-NEXT: lui a0, 1048560 +; ZDINX_NOZICOND-NEXT: or a0, a1, a0 +; ZDINX_NOZICOND-NEXT: # kill: def $x10_w killed $x10_w killed $x10 +; ZDINX_NOZICOND-NEXT: ret ; ; RV64ZHINX_ZICOND-LABEL: select_half_i1: ; RV64ZHINX_ZICOND: # %bb.0: # %entry @@ -432,33 +400,6 @@ define dso_local noundef half @select_half_i1(i1 %cond, half %a, half %b) nounwi ; RV32ZFINX_NOZICOND-NEXT: or a0, a1, a0 ; RV32ZFINX_NOZICOND-NEXT: # kill: def $x10_w killed $x10_w killed $x10 ; RV32ZFINX_NOZICOND-NEXT: ret -; -; RV32ZDINX_ZICOND-LABEL: select_half_i1: -; RV32ZDINX_ZICOND: # %bb.0: # %entry -; RV32ZDINX_ZICOND-NEXT: # kill: def $x12_w killed $x12_w def $x12 -; RV32ZDINX_ZICOND-NEXT: # kill: def $x11_w killed $x11_w def $x11 -; RV32ZDINX_ZICOND-NEXT: andi a0, a0, 1 -; RV32ZDINX_ZICOND-NEXT: czero.nez a2, a2, a0 -; RV32ZDINX_ZICOND-NEXT: czero.eqz a0, a1, a0 -; RV32ZDINX_ZICOND-NEXT: or a0, a0, a2 -; RV32ZDINX_ZICOND-NEXT: lui a1, 1048560 -; RV32ZDINX_ZICOND-NEXT: or a0, a0, a1 -; RV32ZDINX_ZICOND-NEXT: # kill: def $x10_w killed $x10_w killed $x10 -; RV32ZDINX_ZICOND-NEXT: ret -; -; RV32ZDINX_NOZICOND-LABEL: select_half_i1: -; RV32ZDINX_NOZICOND: # %bb.0: # %entry -; RV32ZDINX_NOZICOND-NEXT: # kill: def $x12_w killed $x12_w def $x12 -; RV32ZDINX_NOZICOND-NEXT: andi a0, a0, 1 -; RV32ZDINX_NOZICOND-NEXT: # kill: def $x11_w killed $x11_w def $x11 -; RV32ZDINX_NOZICOND-NEXT: bnez a0, .LBB3_2 -; RV32ZDINX_NOZICOND-NEXT: # %bb.1: # %entry -; RV32ZDINX_NOZICOND-NEXT: mv a1, a2 -; RV32ZDINX_NOZICOND-NEXT: .LBB3_2: # %entry -; RV32ZDINX_NOZICOND-NEXT: lui a0, 1048560 -; RV32ZDINX_NOZICOND-NEXT: or a0, a1, a0 -; RV32ZDINX_NOZICOND-NEXT: # kill: def $x10_w killed $x10_w killed $x10 -; RV32ZDINX_NOZICOND-NEXT: ret entry: %sel = select i1 %cond, half %a, half %b ret half %sel @@ -468,31 +409,31 @@ entry: ; Test select with i1 condition and zero ret val (cond ? 
a : 0), Zfinx ; ----------------------------------------------------------------------------- define dso_local noundef float @select_i1_f32_0(i1 %cond, float %t) nounwind { -; RV64ZDINX_ZICOND-LABEL: select_i1_f32_0: -; RV64ZDINX_ZICOND: # %bb.0: # %entry -; RV64ZDINX_ZICOND-NEXT: # kill: def $x11_w killed $x11_w def $x11 -; RV64ZDINX_ZICOND-NEXT: andi a0, a0, 1 -; RV64ZDINX_ZICOND-NEXT: czero.eqz a0, a1, a0 -; RV64ZDINX_ZICOND-NEXT: # kill: def $x10_w killed $x10_w killed $x10 -; RV64ZDINX_ZICOND-NEXT: ret -; -; RV64ZDINX_NOZICOND-LABEL: select_i1_f32_0: -; RV64ZDINX_NOZICOND: # %bb.0: # %entry -; RV64ZDINX_NOZICOND-NEXT: andi a2, a0, 1 -; RV64ZDINX_NOZICOND-NEXT: mv a0, a1 -; RV64ZDINX_NOZICOND-NEXT: bnez a2, .LBB4_2 -; RV64ZDINX_NOZICOND-NEXT: # %bb.1: # %entry -; RV64ZDINX_NOZICOND-NEXT: li a0, 0 -; RV64ZDINX_NOZICOND-NEXT: .LBB4_2: # %entry -; RV64ZDINX_NOZICOND-NEXT: ret -; -; RV64ZHINX_ZICOND-LABEL: select_i1_f32_0: -; RV64ZHINX_ZICOND: # %bb.0: # %entry -; RV64ZHINX_ZICOND-NEXT: # kill: def $x11_w killed $x11_w def $x11 -; RV64ZHINX_ZICOND-NEXT: andi a0, a0, 1 -; RV64ZHINX_ZICOND-NEXT: czero.eqz a0, a1, a0 -; RV64ZHINX_ZICOND-NEXT: # kill: def $x10_w killed $x10_w killed $x10 -; RV64ZHINX_ZICOND-NEXT: ret +; ZDINX_ZICOND-LABEL: select_i1_f32_0: +; ZDINX_ZICOND: # %bb.0: # %entry +; ZDINX_ZICOND-NEXT: # kill: def $x11_w killed $x11_w def $x11 +; ZDINX_ZICOND-NEXT: andi a0, a0, 1 +; ZDINX_ZICOND-NEXT: czero.eqz a0, a1, a0 +; ZDINX_ZICOND-NEXT: # kill: def $x10_w killed $x10_w killed $x10 +; ZDINX_ZICOND-NEXT: ret +; +; ZDINX_NOZICOND-LABEL: select_i1_f32_0: +; ZDINX_NOZICOND: # %bb.0: # %entry +; ZDINX_NOZICOND-NEXT: andi a2, a0, 1 +; ZDINX_NOZICOND-NEXT: mv a0, a1 +; ZDINX_NOZICOND-NEXT: bnez a2, .LBB4_2 +; ZDINX_NOZICOND-NEXT: # %bb.1: # %entry +; ZDINX_NOZICOND-NEXT: li a0, 0 +; ZDINX_NOZICOND-NEXT: .LBB4_2: # %entry +; ZDINX_NOZICOND-NEXT: ret +; +; ZHINX_ZICOND-LABEL: select_i1_f32_0: +; ZHINX_ZICOND: # %bb.0: # %entry +; ZHINX_ZICOND-NEXT: # kill: def $x11_w killed $x11_w def $x11 +; ZHINX_ZICOND-NEXT: andi a0, a0, 1 +; ZHINX_ZICOND-NEXT: czero.eqz a0, a1, a0 +; ZHINX_ZICOND-NEXT: # kill: def $x10_w killed $x10_w killed $x10 +; ZHINX_ZICOND-NEXT: ret ; ; RV64FD-LABEL: select_i1_f32_0: ; RV64FD: # %bb.0: # %entry @@ -503,14 +444,6 @@ define dso_local noundef float @select_i1_f32_0(i1 %cond, float %t) nounwind { ; RV64FD-NEXT: .LBB4_2: # %entry ; RV64FD-NEXT: ret ; -; RV32ZFINX_ZICOND-LABEL: select_i1_f32_0: -; RV32ZFINX_ZICOND: # %bb.0: # %entry -; RV32ZFINX_ZICOND-NEXT: # kill: def $x11_w killed $x11_w def $x11 -; RV32ZFINX_ZICOND-NEXT: andi a0, a0, 1 -; RV32ZFINX_ZICOND-NEXT: czero.eqz a0, a1, a0 -; RV32ZFINX_ZICOND-NEXT: # kill: def $x10_w killed $x10_w killed $x10 -; RV32ZFINX_ZICOND-NEXT: ret -; ; RV32ZFINX_NOZICOND-LABEL: select_i1_f32_0: ; RV32ZFINX_NOZICOND: # %bb.0: # %entry ; RV32ZFINX_NOZICOND-NEXT: andi a2, a0, 1 @@ -520,24 +453,6 @@ define dso_local noundef float @select_i1_f32_0(i1 %cond, float %t) nounwind { ; RV32ZFINX_NOZICOND-NEXT: li a0, 0 ; RV32ZFINX_NOZICOND-NEXT: .LBB4_2: # %entry ; RV32ZFINX_NOZICOND-NEXT: ret -; -; RV32ZDINX_ZICOND-LABEL: select_i1_f32_0: -; RV32ZDINX_ZICOND: # %bb.0: # %entry -; RV32ZDINX_ZICOND-NEXT: # kill: def $x11_w killed $x11_w def $x11 -; RV32ZDINX_ZICOND-NEXT: andi a0, a0, 1 -; RV32ZDINX_ZICOND-NEXT: czero.eqz a0, a1, a0 -; RV32ZDINX_ZICOND-NEXT: # kill: def $x10_w killed $x10_w killed $x10 -; RV32ZDINX_ZICOND-NEXT: ret -; -; RV32ZDINX_NOZICOND-LABEL: select_i1_f32_0: -; RV32ZDINX_NOZICOND: # %bb.0: # %entry -; 
RV32ZDINX_NOZICOND-NEXT: andi a2, a0, 1 -; RV32ZDINX_NOZICOND-NEXT: mv a0, a1 -; RV32ZDINX_NOZICOND-NEXT: bnez a2, .LBB4_2 -; RV32ZDINX_NOZICOND-NEXT: # %bb.1: # %entry -; RV32ZDINX_NOZICOND-NEXT: li a0, 0 -; RV32ZDINX_NOZICOND-NEXT: .LBB4_2: # %entry -; RV32ZDINX_NOZICOND-NEXT: ret entry: %sel = select i1 %cond, float %t, float 0.000000e+00 ret float %sel @@ -547,15 +462,15 @@ entry: ; Test select with i1 condition and zero ret val for half fp (cond ? a : 0) ; ----------------------------------------------------------------------------- define dso_local noundef half @select_i1_half_0(i1 %cond, half %val) nounwind { -; RV64ZDINX_ZICOND-LABEL: select_i1_half_0: -; RV64ZDINX_ZICOND: # %bb.0: # %entry -; RV64ZDINX_ZICOND-NEXT: # kill: def $x11_w killed $x11_w def $x11 -; RV64ZDINX_ZICOND-NEXT: andi a0, a0, 1 -; RV64ZDINX_ZICOND-NEXT: czero.eqz a0, a1, a0 -; RV64ZDINX_ZICOND-NEXT: lui a1, 1048560 -; RV64ZDINX_ZICOND-NEXT: or a0, a0, a1 -; RV64ZDINX_ZICOND-NEXT: # kill: def $x10_w killed $x10_w killed $x10 -; RV64ZDINX_ZICOND-NEXT: ret +; ZDINX_ZICOND-LABEL: select_i1_half_0: +; ZDINX_ZICOND: # %bb.0: # %entry +; ZDINX_ZICOND-NEXT: # kill: def $x11_w killed $x11_w def $x11 +; ZDINX_ZICOND-NEXT: andi a0, a0, 1 +; ZDINX_ZICOND-NEXT: czero.eqz a0, a1, a0 +; ZDINX_ZICOND-NEXT: lui a1, 1048560 +; ZDINX_ZICOND-NEXT: or a0, a0, a1 +; ZDINX_ZICOND-NEXT: # kill: def $x10_w killed $x10_w killed $x10 +; ZDINX_ZICOND-NEXT: ret ; ; RV64ZDINX_NOZICOND-LABEL: select_i1_half_0: ; RV64ZDINX_NOZICOND: # %bb.0: # %entry @@ -608,16 +523,6 @@ define dso_local noundef half @select_i1_half_0(i1 %cond, half %val) nounwind { ; RV32ZFINX_NOZICOND-NEXT: # kill: def $x10_w killed $x10_w killed $x10 ; RV32ZFINX_NOZICOND-NEXT: ret ; -; RV32ZDINX_ZICOND-LABEL: select_i1_half_0: -; RV32ZDINX_ZICOND: # %bb.0: # %entry -; RV32ZDINX_ZICOND-NEXT: # kill: def $x11_w killed $x11_w def $x11 -; RV32ZDINX_ZICOND-NEXT: andi a0, a0, 1 -; RV32ZDINX_ZICOND-NEXT: czero.eqz a0, a1, a0 -; RV32ZDINX_ZICOND-NEXT: lui a1, 1048560 -; RV32ZDINX_ZICOND-NEXT: or a0, a0, a1 -; RV32ZDINX_ZICOND-NEXT: # kill: def $x10_w killed $x10_w killed $x10 -; RV32ZDINX_ZICOND-NEXT: ret -; ; RV32ZDINX_NOZICOND-LABEL: select_i1_half_0: ; RV32ZDINX_NOZICOND: # %bb.0: # %entry ; RV32ZDINX_NOZICOND-NEXT: # kill: def $x11_w killed $x11_w def $x11 diff --git a/llvm/test/CodeGen/WebAssembly/masked-shifts.ll b/llvm/test/CodeGen/WebAssembly/masked-shifts.ll index 5bcb023e546b5..368f30fd5d7ed 100644 --- a/llvm/test/CodeGen/WebAssembly/masked-shifts.ll +++ b/llvm/test/CodeGen/WebAssembly/masked-shifts.ll @@ -18,6 +18,21 @@ define i32 @shl_i32(i32 %v, i32 %x) { ret i32 %a } +define i64 @shl_i64_zext(i64 %v, i32 %x) { +; CHECK-LABEL: shl_i64_zext: +; CHECK: .functype shl_i64_zext (i64, i32) -> (i64) +; CHECK-NEXT: # %bb.0: +; CHECK-NEXT: local.get 0 +; CHECK-NEXT: local.get 1 +; CHECK-NEXT: i64.extend_i32_u +; CHECK-NEXT: i64.shl +; CHECK-NEXT: # fallthrough-return + %m = and i32 %x, 63 + %z = zext i32 %m to i64 + %a = shl i64 %v, %z + ret i64 %a +} + define i32 @sra_i32(i32 %v, i32 %x) { ; CHECK-LABEL: sra_i32: ; CHECK: .functype sra_i32 (i32, i32) -> (i32) diff --git a/llvm/test/CodeGen/X86/combine-fceil.ll b/llvm/test/CodeGen/X86/combine-fceil.ll new file mode 100644 index 0000000000000..78f1476a49152 --- /dev/null +++ b/llvm/test/CodeGen/X86/combine-fceil.ll @@ -0,0 +1,175 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mcpu=x86-64-v2 | FileCheck %s --check-prefixes=SSE +; RUN: llc < %s 
-mtriple=x86_64-unknown-unknown -mcpu=sandybridge | FileCheck %s --check-prefixes=AVX,AVX1OR2 +; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mcpu=x86-64-v3 | FileCheck %s --check-prefixes=AVX,AVX1OR2 +; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mcpu=x86-64-v4 | FileCheck %s --check-prefixes=AVX,AVX512 + +define <4 x double> @concat_ceil_v4f64_v2f64(<2 x double> %a0, <2 x double> %a1) { +; SSE-LABEL: concat_ceil_v4f64_v2f64: +; SSE: # %bb.0: +; SSE-NEXT: roundpd $10, %xmm0, %xmm0 +; SSE-NEXT: roundpd $10, %xmm1, %xmm1 +; SSE-NEXT: retq +; +; AVX-LABEL: concat_ceil_v4f64_v2f64: +; AVX: # %bb.0: +; AVX-NEXT: vroundpd $10, %xmm0, %xmm0 +; AVX-NEXT: vroundpd $10, %xmm1, %xmm1 +; AVX-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 +; AVX-NEXT: retq + %v0 = call <2 x double> @llvm.ceil.v2f64(<2 x double> %a0) + %v1 = call <2 x double> @llvm.ceil.v2f64(<2 x double> %a1) + %res = shufflevector <2 x double> %v0, <2 x double> %v1, <4 x i32> <i32 0, i32 1, i32 2, i32 3> + ret <4 x double> %res +} + +define <8 x float> @concat_ceil_v8f32_v4f32(<4 x float> %a0, <4 x float> %a1) { +; SSE-LABEL: concat_ceil_v8f32_v4f32: +; SSE: # %bb.0: +; SSE-NEXT: roundps $10, %xmm0, %xmm0 +; SSE-NEXT: roundps $10, %xmm1, %xmm1 +; SSE-NEXT: retq +; +; AVX-LABEL: concat_ceil_v8f32_v4f32: +; AVX: # %bb.0: +; AVX-NEXT: vroundps $10, %xmm0, %xmm0 +; AVX-NEXT: vroundps $10, %xmm1, %xmm1 +; AVX-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 +; AVX-NEXT: retq + %v0 = call <4 x float> @llvm.ceil.v4f32(<4 x float> %a0) + %v1 = call <4 x float> @llvm.ceil.v4f32(<4 x float> %a1) + %res = shufflevector <4 x float> %v0, <4 x float> %v1, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7> + ret <8 x float> %res +} + +define <8 x double> @concat_ceil_v8f64_v2f64(<2 x double> %a0, <2 x double> %a1, <2 x double> %a2, <2 x double> %a3) { +; SSE-LABEL: concat_ceil_v8f64_v2f64: +; SSE: # %bb.0: +; SSE-NEXT: roundpd $10, %xmm0, %xmm0 +; SSE-NEXT: roundpd $10, %xmm1, %xmm1 +; SSE-NEXT: roundpd $10, %xmm2, %xmm2 +; SSE-NEXT: roundpd $10, %xmm3, %xmm3 +; SSE-NEXT: retq +; +; AVX1OR2-LABEL: concat_ceil_v8f64_v2f64: +; AVX1OR2: # %bb.0: +; AVX1OR2-NEXT: vroundpd $10, %xmm0, %xmm0 +; AVX1OR2-NEXT: vroundpd $10, %xmm1, %xmm1 +; AVX1OR2-NEXT: vroundpd $10, %xmm2, %xmm2 +; AVX1OR2-NEXT: vroundpd $10, %xmm3, %xmm3 +; AVX1OR2-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 +; AVX1OR2-NEXT: vinsertf128 $1, %xmm3, %ymm2, %ymm1 +; AVX1OR2-NEXT: retq +; +; AVX512-LABEL: concat_ceil_v8f64_v2f64: +; AVX512: # %bb.0: +; AVX512-NEXT: vroundpd $10, %xmm0, %xmm0 +; AVX512-NEXT: vroundpd $10, %xmm1, %xmm1 +; AVX512-NEXT: vroundpd $10, %xmm2, %xmm2 +; AVX512-NEXT: vroundpd $10, %xmm3, %xmm3 +; AVX512-NEXT: vinsertf128 $1, %xmm3, %ymm2, %ymm2 +; AVX512-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 +; AVX512-NEXT: vinsertf64x4 $1, %ymm2, %zmm0, %zmm0 +; AVX512-NEXT: retq + %v0 = call <2 x double> @llvm.ceil.v2f64(<2 x double> %a0) + %v1 = call <2 x double> @llvm.ceil.v2f64(<2 x double> %a1) + %v2 = call <2 x double> @llvm.ceil.v2f64(<2 x double> %a2) + %v3 = call <2 x double> @llvm.ceil.v2f64(<2 x double> %a3) + %r01 = shufflevector <2 x double> %v0, <2 x double> %v1, <4 x i32> <i32 0, i32 1, i32 2, i32 3> + %r23 = shufflevector <2 x double> %v2, <2 x double> %v3, <4 x i32> <i32 0, i32 1, i32 2, i32 3> + %res = shufflevector <4 x double> %r01, <4 x double> %r23, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7> + ret <8 x double> %res +} + +define <16 x float> @concat_ceil_v16f32_v4f32(<4 x float> %a0, <4 x float> %a1, <4 x float> %a2, <4 x float> %a3) { +; SSE-LABEL: concat_ceil_v16f32_v4f32: +; SSE: # %bb.0: +; SSE-NEXT: roundps $10, %xmm0, %xmm0 +; SSE-NEXT: roundps $10, %xmm1, %xmm1 +; SSE-NEXT: roundps $10, %xmm2, %xmm2 +; SSE-NEXT:
roundps $10, %xmm3, %xmm3 +; SSE-NEXT: retq +; +; AVX1OR2-LABEL: concat_ceil_v16f32_v4f32: +; AVX1OR2: # %bb.0: +; AVX1OR2-NEXT: vroundps $10, %xmm0, %xmm0 +; AVX1OR2-NEXT: vroundps $10, %xmm1, %xmm1 +; AVX1OR2-NEXT: vroundps $10, %xmm2, %xmm2 +; AVX1OR2-NEXT: vroundps $10, %xmm3, %xmm3 +; AVX1OR2-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 +; AVX1OR2-NEXT: vinsertf128 $1, %xmm3, %ymm2, %ymm1 +; AVX1OR2-NEXT: retq +; +; AVX512-LABEL: concat_ceil_v16f32_v4f32: +; AVX512: # %bb.0: +; AVX512-NEXT: vroundps $10, %xmm0, %xmm0 +; AVX512-NEXT: vroundps $10, %xmm1, %xmm1 +; AVX512-NEXT: vroundps $10, %xmm2, %xmm2 +; AVX512-NEXT: vroundps $10, %xmm3, %xmm3 +; AVX512-NEXT: vinsertf128 $1, %xmm3, %ymm2, %ymm2 +; AVX512-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 +; AVX512-NEXT: vinsertf64x4 $1, %ymm2, %zmm0, %zmm0 +; AVX512-NEXT: retq + %v0 = call <4 x float> @llvm.ceil.v4f32(<4 x float> %a0) + %v1 = call <4 x float> @llvm.ceil.v4f32(<4 x float> %a1) + %v2 = call <4 x float> @llvm.ceil.v4f32(<4 x float> %a2) + %v3 = call <4 x float> @llvm.ceil.v4f32(<4 x float> %a3) + %r01 = shufflevector <4 x float> %v0, <4 x float> %v1, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7> + %r23 = shufflevector <4 x float> %v2, <4 x float> %v3, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7> + %res = shufflevector <8 x float> %r01, <8 x float> %r23, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15> + ret <16 x float> %res +} + +define <8 x double> @concat_ceil_v8f64_v4f64(<4 x double> %a0, <4 x double> %a1) { +; SSE-LABEL: concat_ceil_v8f64_v4f64: +; SSE: # %bb.0: +; SSE-NEXT: roundpd $10, %xmm0, %xmm0 +; SSE-NEXT: roundpd $10, %xmm1, %xmm1 +; SSE-NEXT: roundpd $10, %xmm2, %xmm2 +; SSE-NEXT: roundpd $10, %xmm3, %xmm3 +; SSE-NEXT: retq +; +; AVX1OR2-LABEL: concat_ceil_v8f64_v4f64: +; AVX1OR2: # %bb.0: +; AVX1OR2-NEXT: vroundpd $10, %ymm0, %ymm0 +; AVX1OR2-NEXT: vroundpd $10, %ymm1, %ymm1 +; AVX1OR2-NEXT: retq +; +; AVX512-LABEL: concat_ceil_v8f64_v4f64: +; AVX512: # %bb.0: +; AVX512-NEXT: vroundpd $10, %ymm0, %ymm0 +; AVX512-NEXT: vroundpd $10, %ymm1, %ymm1 +; AVX512-NEXT: vinsertf64x4 $1, %ymm1, %zmm0, %zmm0 +; AVX512-NEXT: retq + %v0 = call <4 x double> @llvm.ceil.v4f64(<4 x double> %a0) + %v1 = call <4 x double> @llvm.ceil.v4f64(<4 x double> %a1) + %res = shufflevector <4 x double> %v0, <4 x double> %v1, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7> + ret <8 x double> %res +} + +define <16 x float> @concat_ceil_v16f32_v8f32(<8 x float> %a0, <8 x float> %a1) { +; SSE-LABEL: concat_ceil_v16f32_v8f32: +; SSE: # %bb.0: +; SSE-NEXT: roundps $10, %xmm0, %xmm0 +; SSE-NEXT: roundps $10, %xmm1, %xmm1 +; SSE-NEXT: roundps $10, %xmm2, %xmm2 +; SSE-NEXT: roundps $10, %xmm3, %xmm3 +; SSE-NEXT: retq +; +; AVX1OR2-LABEL: concat_ceil_v16f32_v8f32: +; AVX1OR2: # %bb.0: +; AVX1OR2-NEXT: vroundps $10, %ymm0, %ymm0 +; AVX1OR2-NEXT: vroundps $10, %ymm1, %ymm1 +; AVX1OR2-NEXT: retq +; +; AVX512-LABEL: concat_ceil_v16f32_v8f32: +; AVX512: # %bb.0: +; AVX512-NEXT: vroundps $10, %ymm0, %ymm0 +; AVX512-NEXT: vroundps $10, %ymm1, %ymm1 +; AVX512-NEXT: vinsertf64x4 $1, %ymm1, %zmm0, %zmm0 +; AVX512-NEXT: retq + %v0 = call <8 x float> @llvm.ceil.v8f32(<8 x float> %a0) + %v1 = call <8 x float> @llvm.ceil.v8f32(<8 x float> %a1) + %res = shufflevector <8 x float> %v0, <8 x float> %v1, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15> + ret <16 x float> %res +} diff --git a/llvm/test/CodeGen/X86/combine-fnearbyint.ll b/llvm/test/CodeGen/X86/combine-fnearbyint.ll new file mode 100644 index 0000000000000..14d1017aec630 --- /dev/null +++ b/llvm/test/CodeGen/X86/combine-fnearbyint.ll @@ -0,0 +1,175 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mcpu=x86-64-v2 |
FileCheck %s --check-prefixes=SSE +; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mcpu=sandybridge | FileCheck %s --check-prefixes=AVX,AVX1OR2 +; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mcpu=x86-64-v3 | FileCheck %s --check-prefixes=AVX,AVX1OR2 +; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mcpu=x86-64-v4 | FileCheck %s --check-prefixes=AVX,AVX512 + +define <4 x double> @concat_nearbyint_v4f64_v2f64(<2 x double> %a0, <2 x double> %a1) { +; SSE-LABEL: concat_nearbyint_v4f64_v2f64: +; SSE: # %bb.0: +; SSE-NEXT: roundpd $12, %xmm0, %xmm0 +; SSE-NEXT: roundpd $12, %xmm1, %xmm1 +; SSE-NEXT: retq +; +; AVX-LABEL: concat_nearbyint_v4f64_v2f64: +; AVX: # %bb.0: +; AVX-NEXT: vroundpd $12, %xmm0, %xmm0 +; AVX-NEXT: vroundpd $12, %xmm1, %xmm1 +; AVX-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 +; AVX-NEXT: retq + %v0 = call <2 x double> @llvm.nearbyint.v2f64(<2 x double> %a0) + %v1 = call <2 x double> @llvm.nearbyint.v2f64(<2 x double> %a1) + %res = shufflevector <2 x double> %v0, <2 x double> %v1, <4 x i32> <i32 0, i32 1, i32 2, i32 3> + ret <4 x double> %res +} + +define <8 x float> @concat_nearbyint_v8f32_v4f32(<4 x float> %a0, <4 x float> %a1) { +; SSE-LABEL: concat_nearbyint_v8f32_v4f32: +; SSE: # %bb.0: +; SSE-NEXT: roundps $12, %xmm0, %xmm0 +; SSE-NEXT: roundps $12, %xmm1, %xmm1 +; SSE-NEXT: retq +; +; AVX-LABEL: concat_nearbyint_v8f32_v4f32: +; AVX: # %bb.0: +; AVX-NEXT: vroundps $12, %xmm0, %xmm0 +; AVX-NEXT: vroundps $12, %xmm1, %xmm1 +; AVX-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 +; AVX-NEXT: retq + %v0 = call <4 x float> @llvm.nearbyint.v4f32(<4 x float> %a0) + %v1 = call <4 x float> @llvm.nearbyint.v4f32(<4 x float> %a1) + %res = shufflevector <4 x float> %v0, <4 x float> %v1, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7> + ret <8 x float> %res +} + +define <8 x double> @concat_nearbyint_v8f64_v2f64(<2 x double> %a0, <2 x double> %a1, <2 x double> %a2, <2 x double> %a3) { +; SSE-LABEL: concat_nearbyint_v8f64_v2f64: +; SSE: # %bb.0: +; SSE-NEXT: roundpd $12, %xmm0, %xmm0 +; SSE-NEXT: roundpd $12, %xmm1, %xmm1 +; SSE-NEXT: roundpd $12, %xmm2, %xmm2 +; SSE-NEXT: roundpd $12, %xmm3, %xmm3 +; SSE-NEXT: retq +; +; AVX1OR2-LABEL: concat_nearbyint_v8f64_v2f64: +; AVX1OR2: # %bb.0: +; AVX1OR2-NEXT: vroundpd $12, %xmm0, %xmm0 +; AVX1OR2-NEXT: vroundpd $12, %xmm1, %xmm1 +; AVX1OR2-NEXT: vroundpd $12, %xmm2, %xmm2 +; AVX1OR2-NEXT: vroundpd $12, %xmm3, %xmm3 +; AVX1OR2-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 +; AVX1OR2-NEXT: vinsertf128 $1, %xmm3, %ymm2, %ymm1 +; AVX1OR2-NEXT: retq +; +; AVX512-LABEL: concat_nearbyint_v8f64_v2f64: +; AVX512: # %bb.0: +; AVX512-NEXT: vroundpd $12, %xmm0, %xmm0 +; AVX512-NEXT: vroundpd $12, %xmm1, %xmm1 +; AVX512-NEXT: vroundpd $12, %xmm2, %xmm2 +; AVX512-NEXT: vroundpd $12, %xmm3, %xmm3 +; AVX512-NEXT: vinsertf128 $1, %xmm3, %ymm2, %ymm2 +; AVX512-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 +; AVX512-NEXT: vinsertf64x4 $1, %ymm2, %zmm0, %zmm0 +; AVX512-NEXT: retq + %v0 = call <2 x double> @llvm.nearbyint.v2f64(<2 x double> %a0) + %v1 = call <2 x double> @llvm.nearbyint.v2f64(<2 x double> %a1) + %v2 = call <2 x double> @llvm.nearbyint.v2f64(<2 x double> %a2) + %v3 = call <2 x double> @llvm.nearbyint.v2f64(<2 x double> %a3) + %r01 = shufflevector <2 x double> %v0, <2 x double> %v1, <4 x i32> <i32 0, i32 1, i32 2, i32 3> + %r23 = shufflevector <2 x double> %v2, <2 x double> %v3, <4 x i32> <i32 0, i32 1, i32 2, i32 3> + %res = shufflevector <4 x double> %r01, <4 x double> %r23, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7> + ret <8 x double> %res +} + +define <16 x float> @concat_nearbyint_v16f32_v4f32(<4 x float> %a0, <4 x float> %a1, <4 x float> %a2, <4 x float> %a3) { +; SSE-LABEL: concat_nearbyint_v16f32_v4f32:
+; SSE: # %bb.0: +; SSE-NEXT: roundps $12, %xmm0, %xmm0 +; SSE-NEXT: roundps $12, %xmm1, %xmm1 +; SSE-NEXT: roundps $12, %xmm2, %xmm2 +; SSE-NEXT: roundps $12, %xmm3, %xmm3 +; SSE-NEXT: retq +; +; AVX1OR2-LABEL: concat_nearbyint_v16f32_v4f32: +; AVX1OR2: # %bb.0: +; AVX1OR2-NEXT: vroundps $12, %xmm0, %xmm0 +; AVX1OR2-NEXT: vroundps $12, %xmm1, %xmm1 +; AVX1OR2-NEXT: vroundps $12, %xmm2, %xmm2 +; AVX1OR2-NEXT: vroundps $12, %xmm3, %xmm3 +; AVX1OR2-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 +; AVX1OR2-NEXT: vinsertf128 $1, %xmm3, %ymm2, %ymm1 +; AVX1OR2-NEXT: retq +; +; AVX512-LABEL: concat_nearbyint_v16f32_v4f32: +; AVX512: # %bb.0: +; AVX512-NEXT: vroundps $12, %xmm0, %xmm0 +; AVX512-NEXT: vroundps $12, %xmm1, %xmm1 +; AVX512-NEXT: vroundps $12, %xmm2, %xmm2 +; AVX512-NEXT: vroundps $12, %xmm3, %xmm3 +; AVX512-NEXT: vinsertf128 $1, %xmm3, %ymm2, %ymm2 +; AVX512-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 +; AVX512-NEXT: vinsertf64x4 $1, %ymm2, %zmm0, %zmm0 +; AVX512-NEXT: retq + %v0 = call <4 x float> @llvm.nearbyint.v4f32(<4 x float> %a0) + %v1 = call <4 x float> @llvm.nearbyint.v4f32(<4 x float> %a1) + %v2 = call <4 x float> @llvm.nearbyint.v4f32(<4 x float> %a2) + %v3 = call <4 x float> @llvm.nearbyint.v4f32(<4 x float> %a3) + %r01 = shufflevector <4 x float> %v0, <4 x float> %v1, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7> + %r23 = shufflevector <4 x float> %v2, <4 x float> %v3, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7> + %res = shufflevector <8 x float> %r01, <8 x float> %r23, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15> + ret <16 x float> %res +} + +define <8 x double> @concat_nearbyint_v8f64_v4f64(<4 x double> %a0, <4 x double> %a1) { +; SSE-LABEL: concat_nearbyint_v8f64_v4f64: +; SSE: # %bb.0: +; SSE-NEXT: roundpd $12, %xmm0, %xmm0 +; SSE-NEXT: roundpd $12, %xmm1, %xmm1 +; SSE-NEXT: roundpd $12, %xmm2, %xmm2 +; SSE-NEXT: roundpd $12, %xmm3, %xmm3 +; SSE-NEXT: retq +; +; AVX1OR2-LABEL: concat_nearbyint_v8f64_v4f64: +; AVX1OR2: # %bb.0: +; AVX1OR2-NEXT: vroundpd $12, %ymm0, %ymm0 +; AVX1OR2-NEXT: vroundpd $12, %ymm1, %ymm1 +; AVX1OR2-NEXT: retq +; +; AVX512-LABEL: concat_nearbyint_v8f64_v4f64: +; AVX512: # %bb.0: +; AVX512-NEXT: vroundpd $12, %ymm0, %ymm0 +; AVX512-NEXT: vroundpd $12, %ymm1, %ymm1 +; AVX512-NEXT: vinsertf64x4 $1, %ymm1, %zmm0, %zmm0 +; AVX512-NEXT: retq + %v0 = call <4 x double> @llvm.nearbyint.v4f64(<4 x double> %a0) + %v1 = call <4 x double> @llvm.nearbyint.v4f64(<4 x double> %a1) + %res = shufflevector <4 x double> %v0, <4 x double> %v1, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7> + ret <8 x double> %res +} + +define <16 x float> @concat_nearbyint_v16f32_v8f32(<8 x float> %a0, <8 x float> %a1) { +; SSE-LABEL: concat_nearbyint_v16f32_v8f32: +; SSE: # %bb.0: +; SSE-NEXT: roundps $12, %xmm0, %xmm0 +; SSE-NEXT: roundps $12, %xmm1, %xmm1 +; SSE-NEXT: roundps $12, %xmm2, %xmm2 +; SSE-NEXT: roundps $12, %xmm3, %xmm3 +; SSE-NEXT: retq +; +; AVX1OR2-LABEL: concat_nearbyint_v16f32_v8f32: +; AVX1OR2: # %bb.0: +; AVX1OR2-NEXT: vroundps $12, %ymm0, %ymm0 +; AVX1OR2-NEXT: vroundps $12, %ymm1, %ymm1 +; AVX1OR2-NEXT: retq +; +; AVX512-LABEL: concat_nearbyint_v16f32_v8f32: +; AVX512: # %bb.0: +; AVX512-NEXT: vroundps $12, %ymm0, %ymm0 +; AVX512-NEXT: vroundps $12, %ymm1, %ymm1 +; AVX512-NEXT: vinsertf64x4 $1, %ymm1, %zmm0, %zmm0 +; AVX512-NEXT: retq + %v0 = call <8 x float> @llvm.nearbyint.v8f32(<8 x float> %a0) + %v1 = call <8 x float> @llvm.nearbyint.v8f32(<8 x float> %a1) + %res = shufflevector <8 x float> %v0, <8 x float> %v1, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15> + ret <16 x float> %res +} diff --git a/llvm/test/CodeGen/X86/combine-frint.ll b/llvm/test/CodeGen/X86/combine-frint.ll new file mode 100644 index 0000000000000..901ce2c1f0d82
--- /dev/null +++ b/llvm/test/CodeGen/X86/combine-frint.ll @@ -0,0 +1,175 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mcpu=x86-64-v2 | FileCheck %s --check-prefixes=SSE +; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mcpu=sandybridge | FileCheck %s --check-prefixes=AVX,AVX1OR2 +; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mcpu=x86-64-v3 | FileCheck %s --check-prefixes=AVX,AVX1OR2 +; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mcpu=x86-64-v4 | FileCheck %s --check-prefixes=AVX,AVX512 + +define <4 x double> @concat_rint_v4f64_v2f64(<2 x double> %a0, <2 x double> %a1) { +; SSE-LABEL: concat_rint_v4f64_v2f64: +; SSE: # %bb.0: +; SSE-NEXT: roundpd $4, %xmm0, %xmm0 +; SSE-NEXT: roundpd $4, %xmm1, %xmm1 +; SSE-NEXT: retq +; +; AVX-LABEL: concat_rint_v4f64_v2f64: +; AVX: # %bb.0: +; AVX-NEXT: vroundpd $4, %xmm0, %xmm0 +; AVX-NEXT: vroundpd $4, %xmm1, %xmm1 +; AVX-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 +; AVX-NEXT: retq + %v0 = call <2 x double> @llvm.rint.v2f64(<2 x double> %a0) + %v1 = call <2 x double> @llvm.rint.v2f64(<2 x double> %a1) + %res = shufflevector <2 x double> %v0, <2 x double> %v1, <4 x i32> <i32 0, i32 1, i32 2, i32 3> + ret <4 x double> %res +} + +define <8 x float> @concat_rint_v8f32_v4f32(<4 x float> %a0, <4 x float> %a1) { +; SSE-LABEL: concat_rint_v8f32_v4f32: +; SSE: # %bb.0: +; SSE-NEXT: roundps $4, %xmm0, %xmm0 +; SSE-NEXT: roundps $4, %xmm1, %xmm1 +; SSE-NEXT: retq +; +; AVX-LABEL: concat_rint_v8f32_v4f32: +; AVX: # %bb.0: +; AVX-NEXT: vroundps $4, %xmm0, %xmm0 +; AVX-NEXT: vroundps $4, %xmm1, %xmm1 +; AVX-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 +; AVX-NEXT: retq + %v0 = call <4 x float> @llvm.rint.v4f32(<4 x float> %a0) + %v1 = call <4 x float> @llvm.rint.v4f32(<4 x float> %a1) + %res = shufflevector <4 x float> %v0, <4 x float> %v1, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7> + ret <8 x float> %res +} + +define <8 x double> @concat_rint_v8f64_v2f64(<2 x double> %a0, <2 x double> %a1, <2 x double> %a2, <2 x double> %a3) { +; SSE-LABEL: concat_rint_v8f64_v2f64: +; SSE: # %bb.0: +; SSE-NEXT: roundpd $4, %xmm0, %xmm0 +; SSE-NEXT: roundpd $4, %xmm1, %xmm1 +; SSE-NEXT: roundpd $4, %xmm2, %xmm2 +; SSE-NEXT: roundpd $4, %xmm3, %xmm3 +; SSE-NEXT: retq +; +; AVX1OR2-LABEL: concat_rint_v8f64_v2f64: +; AVX1OR2: # %bb.0: +; AVX1OR2-NEXT: vroundpd $4, %xmm0, %xmm0 +; AVX1OR2-NEXT: vroundpd $4, %xmm1, %xmm1 +; AVX1OR2-NEXT: vroundpd $4, %xmm2, %xmm2 +; AVX1OR2-NEXT: vroundpd $4, %xmm3, %xmm3 +; AVX1OR2-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 +; AVX1OR2-NEXT: vinsertf128 $1, %xmm3, %ymm2, %ymm1 +; AVX1OR2-NEXT: retq +; +; AVX512-LABEL: concat_rint_v8f64_v2f64: +; AVX512: # %bb.0: +; AVX512-NEXT: vroundpd $4, %xmm0, %xmm0 +; AVX512-NEXT: vroundpd $4, %xmm1, %xmm1 +; AVX512-NEXT: vroundpd $4, %xmm2, %xmm2 +; AVX512-NEXT: vroundpd $4, %xmm3, %xmm3 +; AVX512-NEXT: vinsertf128 $1, %xmm3, %ymm2, %ymm2 +; AVX512-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 +; AVX512-NEXT: vinsertf64x4 $1, %ymm2, %zmm0, %zmm0 +; AVX512-NEXT: retq + %v0 = call <2 x double> @llvm.rint.v2f64(<2 x double> %a0) + %v1 = call <2 x double> @llvm.rint.v2f64(<2 x double> %a1) + %v2 = call <2 x double> @llvm.rint.v2f64(<2 x double> %a2) + %v3 = call <2 x double> @llvm.rint.v2f64(<2 x double> %a3) + %r01 = shufflevector <2 x double> %v0, <2 x double> %v1, <4 x i32> <i32 0, i32 1, i32 2, i32 3> + %r23 = shufflevector <2 x double> %v2, <2 x double> %v3, <4 x i32> <i32 0, i32 1, i32 2, i32 3> + %res = shufflevector <4 x double> %r01, <4 x double> %r23, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7> + ret <8 x double> %res +} + +define <16 x float> @concat_rint_v16f32_v4f32(<4 x
float> %a0, <4 x float> %a1, <4 x float> %a2, <4 x float> %a3) { +; SSE-LABEL: concat_rint_v16f32_v4f32: +; SSE: # %bb.0: +; SSE-NEXT: roundps $4, %xmm0, %xmm0 +; SSE-NEXT: roundps $4, %xmm1, %xmm1 +; SSE-NEXT: roundps $4, %xmm2, %xmm2 +; SSE-NEXT: roundps $4, %xmm3, %xmm3 +; SSE-NEXT: retq +; +; AVX1OR2-LABEL: concat_rint_v16f32_v4f32: +; AVX1OR2: # %bb.0: +; AVX1OR2-NEXT: vroundps $4, %xmm0, %xmm0 +; AVX1OR2-NEXT: vroundps $4, %xmm1, %xmm1 +; AVX1OR2-NEXT: vroundps $4, %xmm2, %xmm2 +; AVX1OR2-NEXT: vroundps $4, %xmm3, %xmm3 +; AVX1OR2-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 +; AVX1OR2-NEXT: vinsertf128 $1, %xmm3, %ymm2, %ymm1 +; AVX1OR2-NEXT: retq +; +; AVX512-LABEL: concat_rint_v16f32_v4f32: +; AVX512: # %bb.0: +; AVX512-NEXT: vroundps $4, %xmm0, %xmm0 +; AVX512-NEXT: vroundps $4, %xmm1, %xmm1 +; AVX512-NEXT: vroundps $4, %xmm2, %xmm2 +; AVX512-NEXT: vroundps $4, %xmm3, %xmm3 +; AVX512-NEXT: vinsertf128 $1, %xmm3, %ymm2, %ymm2 +; AVX512-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 +; AVX512-NEXT: vinsertf64x4 $1, %ymm2, %zmm0, %zmm0 +; AVX512-NEXT: retq + %v0 = call <4 x float> @llvm.rint.v4f32(<4 x float> %a0) + %v1 = call <4 x float> @llvm.rint.v4f32(<4 x float> %a1) + %v2 = call <4 x float> @llvm.rint.v4f32(<4 x float> %a2) + %v3 = call <4 x float> @llvm.rint.v4f32(<4 x float> %a3) + %r01 = shufflevector <4 x float> %v0, <4 x float> %v1, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7> + %r23 = shufflevector <4 x float> %v2, <4 x float> %v3, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7> + %res = shufflevector <8 x float> %r01, <8 x float> %r23, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15> + ret <16 x float> %res +} + +define <8 x double> @concat_rint_v8f64_v4f64(<4 x double> %a0, <4 x double> %a1) { +; SSE-LABEL: concat_rint_v8f64_v4f64: +; SSE: # %bb.0: +; SSE-NEXT: roundpd $4, %xmm0, %xmm0 +; SSE-NEXT: roundpd $4, %xmm1, %xmm1 +; SSE-NEXT: roundpd $4, %xmm2, %xmm2 +; SSE-NEXT: roundpd $4, %xmm3, %xmm3 +; SSE-NEXT: retq +; +; AVX1OR2-LABEL: concat_rint_v8f64_v4f64: +; AVX1OR2: # %bb.0: +; AVX1OR2-NEXT: vroundpd $4, %ymm0, %ymm0 +; AVX1OR2-NEXT: vroundpd $4, %ymm1, %ymm1 +; AVX1OR2-NEXT: retq +; +; AVX512-LABEL: concat_rint_v8f64_v4f64: +; AVX512: # %bb.0: +; AVX512-NEXT: vroundpd $4, %ymm0, %ymm0 +; AVX512-NEXT: vroundpd $4, %ymm1, %ymm1 +; AVX512-NEXT: vinsertf64x4 $1, %ymm1, %zmm0, %zmm0 +; AVX512-NEXT: retq + %v0 = call <4 x double> @llvm.rint.v4f64(<4 x double> %a0) + %v1 = call <4 x double> @llvm.rint.v4f64(<4 x double> %a1) + %res = shufflevector <4 x double> %v0, <4 x double> %v1, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7> + ret <8 x double> %res +} + +define <16 x float> @concat_rint_v16f32_v8f32(<8 x float> %a0, <8 x float> %a1) { +; SSE-LABEL: concat_rint_v16f32_v8f32: +; SSE: # %bb.0: +; SSE-NEXT: roundps $4, %xmm0, %xmm0 +; SSE-NEXT: roundps $4, %xmm1, %xmm1 +; SSE-NEXT: roundps $4, %xmm2, %xmm2 +; SSE-NEXT: roundps $4, %xmm3, %xmm3 +; SSE-NEXT: retq +; +; AVX1OR2-LABEL: concat_rint_v16f32_v8f32: +; AVX1OR2: # %bb.0: +; AVX1OR2-NEXT: vroundps $4, %ymm0, %ymm0 +; AVX1OR2-NEXT: vroundps $4, %ymm1, %ymm1 +; AVX1OR2-NEXT: retq +; +; AVX512-LABEL: concat_rint_v16f32_v8f32: +; AVX512: # %bb.0: +; AVX512-NEXT: vroundps $4, %ymm0, %ymm0 +; AVX512-NEXT: vroundps $4, %ymm1, %ymm1 +; AVX512-NEXT: vinsertf64x4 $1, %ymm1, %zmm0, %zmm0 +; AVX512-NEXT: retq + %v0 = call <8 x float> @llvm.rint.v8f32(<8 x float> %a0) + %v1 = call <8 x float> @llvm.rint.v8f32(<8 x float> %a1) + %res = shufflevector <8 x float> %v0, <8 x float> %v1, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15> + ret <16 x float> %res +} diff --git a/llvm/test/CodeGen/X86/combine-froundeven.ll b/llvm/test/CodeGen/X86/combine-froundeven.ll new file mode 100644 index 0000000000000..484e3a9680450 ---
/dev/null +++ b/llvm/test/CodeGen/X86/combine-froundeven.ll @@ -0,0 +1,175 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mcpu=x86-64-v2 | FileCheck %s --check-prefixes=SSE +; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mcpu=sandybridge | FileCheck %s --check-prefixes=AVX,AVX1OR2 +; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mcpu=x86-64-v3 | FileCheck %s --check-prefixes=AVX,AVX1OR2 +; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mcpu=x86-64-v4 | FileCheck %s --check-prefixes=AVX,AVX512 + +define <4 x double> @concat_roundeven_v4f64_v2f64(<2 x double> %a0, <2 x double> %a1) { +; SSE-LABEL: concat_roundeven_v4f64_v2f64: +; SSE: # %bb.0: +; SSE-NEXT: roundpd $8, %xmm0, %xmm0 +; SSE-NEXT: roundpd $8, %xmm1, %xmm1 +; SSE-NEXT: retq +; +; AVX-LABEL: concat_roundeven_v4f64_v2f64: +; AVX: # %bb.0: +; AVX-NEXT: vroundpd $8, %xmm0, %xmm0 +; AVX-NEXT: vroundpd $8, %xmm1, %xmm1 +; AVX-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 +; AVX-NEXT: retq + %v0 = call <2 x double> @llvm.roundeven.v2f64(<2 x double> %a0) + %v1 = call <2 x double> @llvm.roundeven.v2f64(<2 x double> %a1) + %res = shufflevector <2 x double> %v0, <2 x double> %v1, <4 x i32> <i32 0, i32 1, i32 2, i32 3> + ret <4 x double> %res +} + +define <8 x float> @concat_roundeven_v8f32_v4f32(<4 x float> %a0, <4 x float> %a1) { +; SSE-LABEL: concat_roundeven_v8f32_v4f32: +; SSE: # %bb.0: +; SSE-NEXT: roundps $8, %xmm0, %xmm0 +; SSE-NEXT: roundps $8, %xmm1, %xmm1 +; SSE-NEXT: retq +; +; AVX-LABEL: concat_roundeven_v8f32_v4f32: +; AVX: # %bb.0: +; AVX-NEXT: vroundps $8, %xmm0, %xmm0 +; AVX-NEXT: vroundps $8, %xmm1, %xmm1 +; AVX-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 +; AVX-NEXT: retq + %v0 = call <4 x float> @llvm.roundeven.v4f32(<4 x float> %a0) + %v1 = call <4 x float> @llvm.roundeven.v4f32(<4 x float> %a1) + %res = shufflevector <4 x float> %v0, <4 x float> %v1, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7> + ret <8 x float> %res +} + +define <8 x double> @concat_roundeven_v8f64_v2f64(<2 x double> %a0, <2 x double> %a1, <2 x double> %a2, <2 x double> %a3) { +; SSE-LABEL: concat_roundeven_v8f64_v2f64: +; SSE: # %bb.0: +; SSE-NEXT: roundpd $8, %xmm0, %xmm0 +; SSE-NEXT: roundpd $8, %xmm1, %xmm1 +; SSE-NEXT: roundpd $8, %xmm2, %xmm2 +; SSE-NEXT: roundpd $8, %xmm3, %xmm3 +; SSE-NEXT: retq +; +; AVX1OR2-LABEL: concat_roundeven_v8f64_v2f64: +; AVX1OR2: # %bb.0: +; AVX1OR2-NEXT: vroundpd $8, %xmm0, %xmm0 +; AVX1OR2-NEXT: vroundpd $8, %xmm1, %xmm1 +; AVX1OR2-NEXT: vroundpd $8, %xmm2, %xmm2 +; AVX1OR2-NEXT: vroundpd $8, %xmm3, %xmm3 +; AVX1OR2-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 +; AVX1OR2-NEXT: vinsertf128 $1, %xmm3, %ymm2, %ymm1 +; AVX1OR2-NEXT: retq +; +; AVX512-LABEL: concat_roundeven_v8f64_v2f64: +; AVX512: # %bb.0: +; AVX512-NEXT: vroundpd $8, %xmm0, %xmm0 +; AVX512-NEXT: vroundpd $8, %xmm1, %xmm1 +; AVX512-NEXT: vroundpd $8, %xmm2, %xmm2 +; AVX512-NEXT: vroundpd $8, %xmm3, %xmm3 +; AVX512-NEXT: vinsertf128 $1, %xmm3, %ymm2, %ymm2 +; AVX512-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 +; AVX512-NEXT: vinsertf64x4 $1, %ymm2, %zmm0, %zmm0 +; AVX512-NEXT: retq + %v0 = call <2 x double> @llvm.roundeven.v2f64(<2 x double> %a0) + %v1 = call <2 x double> @llvm.roundeven.v2f64(<2 x double> %a1) + %v2 = call <2 x double> @llvm.roundeven.v2f64(<2 x double> %a2) + %v3 = call <2 x double> @llvm.roundeven.v2f64(<2 x double> %a3) + %r01 = shufflevector <2 x double> %v0, <2 x double> %v1, <4 x i32> <i32 0, i32 1, i32 2, i32 3> + %r23 = shufflevector <2 x double> %v2, <2 x double> %v3, <4 x i32> <i32 0, i32 1, i32 2, i32 3> + %res = shufflevector <4 x double> %r01, <4 x double> %r23, <8
x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7> + ret <8 x double> %res +} + +define <16 x float> @concat_roundeven_v16f32_v4f32(<4 x float> %a0, <4 x float> %a1, <4 x float> %a2, <4 x float> %a3) { +; SSE-LABEL: concat_roundeven_v16f32_v4f32: +; SSE: # %bb.0: +; SSE-NEXT: roundps $8, %xmm0, %xmm0 +; SSE-NEXT: roundps $8, %xmm1, %xmm1 +; SSE-NEXT: roundps $8, %xmm2, %xmm2 +; SSE-NEXT: roundps $8, %xmm3, %xmm3 +; SSE-NEXT: retq +; +; AVX1OR2-LABEL: concat_roundeven_v16f32_v4f32: +; AVX1OR2: # %bb.0: +; AVX1OR2-NEXT: vroundps $8, %xmm0, %xmm0 +; AVX1OR2-NEXT: vroundps $8, %xmm1, %xmm1 +; AVX1OR2-NEXT: vroundps $8, %xmm2, %xmm2 +; AVX1OR2-NEXT: vroundps $8, %xmm3, %xmm3 +; AVX1OR2-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 +; AVX1OR2-NEXT: vinsertf128 $1, %xmm3, %ymm2, %ymm1 +; AVX1OR2-NEXT: retq +; +; AVX512-LABEL: concat_roundeven_v16f32_v4f32: +; AVX512: # %bb.0: +; AVX512-NEXT: vroundps $8, %xmm0, %xmm0 +; AVX512-NEXT: vroundps $8, %xmm1, %xmm1 +; AVX512-NEXT: vroundps $8, %xmm2, %xmm2 +; AVX512-NEXT: vroundps $8, %xmm3, %xmm3 +; AVX512-NEXT: vinsertf128 $1, %xmm3, %ymm2, %ymm2 +; AVX512-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 +; AVX512-NEXT: vinsertf64x4 $1, %ymm2, %zmm0, %zmm0 +; AVX512-NEXT: retq + %v0 = call <4 x float> @llvm.roundeven.v4f32(<4 x float> %a0) + %v1 = call <4 x float> @llvm.roundeven.v4f32(<4 x float> %a1) + %v2 = call <4 x float> @llvm.roundeven.v4f32(<4 x float> %a2) + %v3 = call <4 x float> @llvm.roundeven.v4f32(<4 x float> %a3) + %r01 = shufflevector <4 x float> %v0, <4 x float> %v1, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7> + %r23 = shufflevector <4 x float> %v2, <4 x float> %v3, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7> + %res = shufflevector <8 x float> %r01, <8 x float> %r23, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15> + ret <16 x float> %res +} + +define <8 x double> @concat_roundeven_v8f64_v4f64(<4 x double> %a0, <4 x double> %a1) { +; SSE-LABEL: concat_roundeven_v8f64_v4f64: +; SSE: # %bb.0: +; SSE-NEXT: roundpd $8, %xmm0, %xmm0 +; SSE-NEXT: roundpd $8, %xmm1, %xmm1 +; SSE-NEXT: roundpd $8, %xmm2, %xmm2 +; SSE-NEXT: roundpd $8, %xmm3, %xmm3 +; SSE-NEXT: retq +; +; AVX1OR2-LABEL: concat_roundeven_v8f64_v4f64: +; AVX1OR2: # %bb.0: +; AVX1OR2-NEXT: vroundpd $8, %ymm0, %ymm0 +; AVX1OR2-NEXT: vroundpd $8, %ymm1, %ymm1 +; AVX1OR2-NEXT: retq +; +; AVX512-LABEL: concat_roundeven_v8f64_v4f64: +; AVX512: # %bb.0: +; AVX512-NEXT: vroundpd $8, %ymm0, %ymm0 +; AVX512-NEXT: vroundpd $8, %ymm1, %ymm1 +; AVX512-NEXT: vinsertf64x4 $1, %ymm1, %zmm0, %zmm0 +; AVX512-NEXT: retq + %v0 = call <4 x double> @llvm.roundeven.v4f64(<4 x double> %a0) + %v1 = call <4 x double> @llvm.roundeven.v4f64(<4 x double> %a1) + %res = shufflevector <4 x double> %v0, <4 x double> %v1, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7> + ret <8 x double> %res +} + +define <16 x float> @concat_roundeven_v16f32_v8f32(<8 x float> %a0, <8 x float> %a1) { +; SSE-LABEL: concat_roundeven_v16f32_v8f32: +; SSE: # %bb.0: +; SSE-NEXT: roundps $8, %xmm0, %xmm0 +; SSE-NEXT: roundps $8, %xmm1, %xmm1 +; SSE-NEXT: roundps $8, %xmm2, %xmm2 +; SSE-NEXT: roundps $8, %xmm3, %xmm3 +; SSE-NEXT: retq +; +; AVX1OR2-LABEL: concat_roundeven_v16f32_v8f32: +; AVX1OR2: # %bb.0: +; AVX1OR2-NEXT: vroundps $8, %ymm0, %ymm0 +; AVX1OR2-NEXT: vroundps $8, %ymm1, %ymm1 +; AVX1OR2-NEXT: retq +; +; AVX512-LABEL: concat_roundeven_v16f32_v8f32: +; AVX512: # %bb.0: +; AVX512-NEXT: vroundps $8, %ymm0, %ymm0 +; AVX512-NEXT: vroundps $8, %ymm1, %ymm1 +; AVX512-NEXT: vinsertf64x4 $1, %ymm1, %zmm0, %zmm0 +; AVX512-NEXT: retq + %v0 = call <8 x float> @llvm.roundeven.v8f32(<8 x float> %a0) + %v1 = call <8 x float> @llvm.roundeven.v8f32(<8 x float> %a1) + %res = shufflevector <8 x float> %v0, <8 x float> %v1, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15> +
ret <16 x float> %res +} diff --git a/llvm/test/CodeGen/X86/combine-fsqrt.ll b/llvm/test/CodeGen/X86/combine-fsqrt.ll new file mode 100644 index 0000000000000..ddd7d3ac24315 --- /dev/null +++ b/llvm/test/CodeGen/X86/combine-fsqrt.ll @@ -0,0 +1,91 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mcpu=x86-64 | FileCheck %s --check-prefixes=SSE +; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mcpu=x86-64-v2 | FileCheck %s --check-prefixes=SSE +; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mcpu=sandybridge | FileCheck %s --check-prefixes=AVX,AVX1OR2 +; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mcpu=x86-64-v3 | FileCheck %s --check-prefixes=AVX,AVX1OR2 +; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mcpu=x86-64-v4 | FileCheck %s --check-prefixes=AVX,AVX512 + +define <8 x float> @concat_sqrt_v8f32_v4f32(<4 x float> %a0, <4 x float> %a1) { +; SSE-LABEL: concat_sqrt_v8f32_v4f32: +; SSE: # %bb.0: +; SSE-NEXT: sqrtps %xmm0, %xmm0 +; SSE-NEXT: sqrtps %xmm1, %xmm1 +; SSE-NEXT: retq +; +; AVX-LABEL: concat_sqrt_v8f32_v4f32: +; AVX: # %bb.0: +; AVX-NEXT: vsqrtps %xmm0, %xmm0 +; AVX-NEXT: vsqrtps %xmm1, %xmm1 +; AVX-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 +; AVX-NEXT: retq + %v0 = call <4 x float> @llvm.sqrt.v4f32(<4 x float> %a0) + %v1 = call <4 x float> @llvm.sqrt.v4f32(<4 x float> %a1) + %res = shufflevector <4 x float> %v0, <4 x float> %v1, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7> + ret <8 x float> %res +} + +define <16 x float> @concat_sqrt_v16f32_v4f32(<4 x float> %a0, <4 x float> %a1, <4 x float> %a2, <4 x float> %a3) { +; SSE-LABEL: concat_sqrt_v16f32_v4f32: +; SSE: # %bb.0: +; SSE-NEXT: sqrtps %xmm0, %xmm0 +; SSE-NEXT: sqrtps %xmm1, %xmm1 +; SSE-NEXT: sqrtps %xmm2, %xmm2 +; SSE-NEXT: sqrtps %xmm3, %xmm3 +; SSE-NEXT: retq +; +; AVX1OR2-LABEL: concat_sqrt_v16f32_v4f32: +; AVX1OR2: # %bb.0: +; AVX1OR2-NEXT: vsqrtps %xmm0, %xmm0 +; AVX1OR2-NEXT: vsqrtps %xmm1, %xmm1 +; AVX1OR2-NEXT: vsqrtps %xmm2, %xmm2 +; AVX1OR2-NEXT: vsqrtps %xmm3, %xmm3 +; AVX1OR2-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 +; AVX1OR2-NEXT: vinsertf128 $1, %xmm3, %ymm2, %ymm1 +; AVX1OR2-NEXT: retq +; +; AVX512-LABEL: concat_sqrt_v16f32_v4f32: +; AVX512: # %bb.0: +; AVX512-NEXT: vsqrtps %xmm0, %xmm0 +; AVX512-NEXT: vsqrtps %xmm1, %xmm1 +; AVX512-NEXT: vsqrtps %xmm2, %xmm2 +; AVX512-NEXT: vsqrtps %xmm3, %xmm3 +; AVX512-NEXT: vinsertf128 $1, %xmm3, %ymm2, %ymm2 +; AVX512-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 +; AVX512-NEXT: vinsertf64x4 $1, %ymm2, %zmm0, %zmm0 +; AVX512-NEXT: retq + %v0 = call <4 x float> @llvm.sqrt.v4f32(<4 x float> %a0) + %v1 = call <4 x float> @llvm.sqrt.v4f32(<4 x float> %a1) + %v2 = call <4 x float> @llvm.sqrt.v4f32(<4 x float> %a2) + %v3 = call <4 x float> @llvm.sqrt.v4f32(<4 x float> %a3) + %r01 = shufflevector <4 x float> %v0, <4 x float> %v1, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7> + %r23 = shufflevector <4 x float> %v2, <4 x float> %v3, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7> + %res = shufflevector <8 x float> %r01, <8 x float> %r23, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15> + ret <16 x float> %res +} + +define <16 x float> @concat_sqrt_v16f32_v8f32(<8 x float> %a0, <8 x float> %a1) { +; SSE-LABEL: concat_sqrt_v16f32_v8f32: +; SSE: # %bb.0: +; SSE-NEXT: sqrtps %xmm0, %xmm0 +; SSE-NEXT: sqrtps %xmm1, %xmm1 +; SSE-NEXT: sqrtps %xmm2, %xmm2 +; SSE-NEXT: sqrtps %xmm3, %xmm3 +; SSE-NEXT: retq +; +; AVX1OR2-LABEL: concat_sqrt_v16f32_v8f32: +; AVX1OR2: # %bb.0: +; AVX1OR2-NEXT: vsqrtps %ymm0, %ymm0 +; AVX1OR2-NEXT: vsqrtps %ymm1, %ymm1 +; AVX1OR2-NEXT: retq +; +; AVX512-LABEL: concat_sqrt_v16f32_v8f32: +; AVX512: # %bb.0: +;
AVX512-NEXT: vsqrtps %ymm0, %ymm0 +; AVX512-NEXT: vsqrtps %ymm1, %ymm1 +; AVX512-NEXT: vinsertf64x4 $1, %ymm1, %zmm0, %zmm0 +; AVX512-NEXT: retq + %v0 = call <8 x float> @llvm.sqrt.v8f32(<8 x float> %a0) + %v1 = call <8 x float> @llvm.sqrt.v8f32(<8 x float> %a1) + %res = shufflevector <8 x float> %v0, <8 x float> %v1, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15> + ret <16 x float> %res +} diff --git a/llvm/test/CodeGen/X86/combine-ftrunc.ll b/llvm/test/CodeGen/X86/combine-ftrunc.ll new file mode 100644 index 0000000000000..a6c703a1cbeae --- /dev/null +++ b/llvm/test/CodeGen/X86/combine-ftrunc.ll @@ -0,0 +1,175 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mcpu=x86-64-v2 | FileCheck %s --check-prefixes=SSE +; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mcpu=sandybridge | FileCheck %s --check-prefixes=AVX,AVX1OR2 +; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mcpu=x86-64-v3 | FileCheck %s --check-prefixes=AVX,AVX1OR2 +; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mcpu=x86-64-v4 | FileCheck %s --check-prefixes=AVX,AVX512 + +define <4 x double> @concat_trunc_v4f64_v2f64(<2 x double> %a0, <2 x double> %a1) { +; SSE-LABEL: concat_trunc_v4f64_v2f64: +; SSE: # %bb.0: +; SSE-NEXT: roundpd $11, %xmm0, %xmm0 +; SSE-NEXT: roundpd $11, %xmm1, %xmm1 +; SSE-NEXT: retq +; +; AVX-LABEL: concat_trunc_v4f64_v2f64: +; AVX: # %bb.0: +; AVX-NEXT: vroundpd $11, %xmm0, %xmm0 +; AVX-NEXT: vroundpd $11, %xmm1, %xmm1 +; AVX-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 +; AVX-NEXT: retq + %v0 = call <2 x double> @llvm.trunc.v2f64(<2 x double> %a0) + %v1 = call <2 x double> @llvm.trunc.v2f64(<2 x double> %a1) + %res = shufflevector <2 x double> %v0, <2 x double> %v1, <4 x i32> <i32 0, i32 1, i32 2, i32 3> + ret <4 x double> %res +} + +define <8 x float> @concat_trunc_v8f32_v4f32(<4 x float> %a0, <4 x float> %a1) { +; SSE-LABEL: concat_trunc_v8f32_v4f32: +; SSE: # %bb.0: +; SSE-NEXT: roundps $11, %xmm0, %xmm0 +; SSE-NEXT: roundps $11, %xmm1, %xmm1 +; SSE-NEXT: retq +; +; AVX-LABEL: concat_trunc_v8f32_v4f32: +; AVX: # %bb.0: +; AVX-NEXT: vroundps $11, %xmm0, %xmm0 +; AVX-NEXT: vroundps $11, %xmm1, %xmm1 +; AVX-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 +; AVX-NEXT: retq + %v0 = call <4 x float> @llvm.trunc.v4f32(<4 x float> %a0) + %v1 = call <4 x float> @llvm.trunc.v4f32(<4 x float> %a1) + %res = shufflevector <4 x float> %v0, <4 x float> %v1, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7> + ret <8 x float> %res +} + +define <8 x double> @concat_trunc_v8f64_v2f64(<2 x double> %a0, <2 x double> %a1, <2 x double> %a2, <2 x double> %a3) { +; SSE-LABEL: concat_trunc_v8f64_v2f64: +; SSE: # %bb.0: +; SSE-NEXT: roundpd $11, %xmm0, %xmm0 +; SSE-NEXT: roundpd $11, %xmm1, %xmm1 +; SSE-NEXT: roundpd $11, %xmm2, %xmm2 +; SSE-NEXT: roundpd $11, %xmm3, %xmm3 +; SSE-NEXT: retq +; +; AVX1OR2-LABEL: concat_trunc_v8f64_v2f64: +; AVX1OR2: # %bb.0: +; AVX1OR2-NEXT: vroundpd $11, %xmm0, %xmm0 +; AVX1OR2-NEXT: vroundpd $11, %xmm1, %xmm1 +; AVX1OR2-NEXT: vroundpd $11, %xmm2, %xmm2 +; AVX1OR2-NEXT: vroundpd $11, %xmm3, %xmm3 +; AVX1OR2-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 +; AVX1OR2-NEXT: vinsertf128 $1, %xmm3, %ymm2, %ymm1 +; AVX1OR2-NEXT: retq +; +; AVX512-LABEL: concat_trunc_v8f64_v2f64: +; AVX512: # %bb.0: +; AVX512-NEXT: vroundpd $11, %xmm0, %xmm0 +; AVX512-NEXT: vroundpd $11, %xmm1, %xmm1 +; AVX512-NEXT: vroundpd $11, %xmm2, %xmm2 +; AVX512-NEXT: vroundpd $11, %xmm3, %xmm3 +; AVX512-NEXT: vinsertf128 $1, %xmm3, %ymm2, %ymm2 +; AVX512-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 +; AVX512-NEXT: vinsertf64x4 $1, %ymm2, %zmm0, %zmm0 +;
AVX512-NEXT: retq + %v0 = call <2 x double> @llvm.trunc.v2f64(<2 x double> %a0) + %v1 = call <2 x double> @llvm.trunc.v2f64(<2 x double> %a1) + %v2 = call <2 x double> @llvm.trunc.v2f64(<2 x double> %a2) + %v3 = call <2 x double> @llvm.trunc.v2f64(<2 x double> %a3) + %r01 = shufflevector <2 x double> %v0, <2 x double> %v1, <4 x i32> <i32 0, i32 1, i32 2, i32 3> + %r23 = shufflevector <2 x double> %v2, <2 x double> %v3, <4 x i32> <i32 0, i32 1, i32 2, i32 3> + %res = shufflevector <4 x double> %r01, <4 x double> %r23, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7> + ret <8 x double> %res +} + +define <16 x float> @concat_trunc_v16f32_v4f32(<4 x float> %a0, <4 x float> %a1, <4 x float> %a2, <4 x float> %a3) { +; SSE-LABEL: concat_trunc_v16f32_v4f32: +; SSE: # %bb.0: +; SSE-NEXT: roundps $11, %xmm0, %xmm0 +; SSE-NEXT: roundps $11, %xmm1, %xmm1 +; SSE-NEXT: roundps $11, %xmm2, %xmm2 +; SSE-NEXT: roundps $11, %xmm3, %xmm3 +; SSE-NEXT: retq +; +; AVX1OR2-LABEL: concat_trunc_v16f32_v4f32: +; AVX1OR2: # %bb.0: +; AVX1OR2-NEXT: vroundps $11, %xmm0, %xmm0 +; AVX1OR2-NEXT: vroundps $11, %xmm1, %xmm1 +; AVX1OR2-NEXT: vroundps $11, %xmm2, %xmm2 +; AVX1OR2-NEXT: vroundps $11, %xmm3, %xmm3 +; AVX1OR2-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 +; AVX1OR2-NEXT: vinsertf128 $1, %xmm3, %ymm2, %ymm1 +; AVX1OR2-NEXT: retq +; +; AVX512-LABEL: concat_trunc_v16f32_v4f32: +; AVX512: # %bb.0: +; AVX512-NEXT: vroundps $11, %xmm0, %xmm0 +; AVX512-NEXT: vroundps $11, %xmm1, %xmm1 +; AVX512-NEXT: vroundps $11, %xmm2, %xmm2 +; AVX512-NEXT: vroundps $11, %xmm3, %xmm3 +; AVX512-NEXT: vinsertf128 $1, %xmm3, %ymm2, %ymm2 +; AVX512-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 +; AVX512-NEXT: vinsertf64x4 $1, %ymm2, %zmm0, %zmm0 +; AVX512-NEXT: retq + %v0 = call <4 x float> @llvm.trunc.v4f32(<4 x float> %a0) + %v1 = call <4 x float> @llvm.trunc.v4f32(<4 x float> %a1) + %v2 = call <4 x float> @llvm.trunc.v4f32(<4 x float> %a2) + %v3 = call <4 x float> @llvm.trunc.v4f32(<4 x float> %a3) + %r01 = shufflevector <4 x float> %v0, <4 x float> %v1, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7> + %r23 = shufflevector <4 x float> %v2, <4 x float> %v3, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7> + %res = shufflevector <8 x float> %r01, <8 x float> %r23, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15> + ret <16 x float> %res +} + +define <8 x double> @concat_trunc_v8f64_v4f64(<4 x double> %a0, <4 x double> %a1) { +; SSE-LABEL: concat_trunc_v8f64_v4f64: +; SSE: # %bb.0: +; SSE-NEXT: roundpd $11, %xmm0, %xmm0 +; SSE-NEXT: roundpd $11, %xmm1, %xmm1 +; SSE-NEXT: roundpd $11, %xmm2, %xmm2 +; SSE-NEXT: roundpd $11, %xmm3, %xmm3 +; SSE-NEXT: retq +; +; AVX1OR2-LABEL: concat_trunc_v8f64_v4f64: +; AVX1OR2: # %bb.0: +; AVX1OR2-NEXT: vroundpd $11, %ymm0, %ymm0 +; AVX1OR2-NEXT: vroundpd $11, %ymm1, %ymm1 +; AVX1OR2-NEXT: retq +; +; AVX512-LABEL: concat_trunc_v8f64_v4f64: +; AVX512: # %bb.0: +; AVX512-NEXT: vroundpd $11, %ymm0, %ymm0 +; AVX512-NEXT: vroundpd $11, %ymm1, %ymm1 +; AVX512-NEXT: vinsertf64x4 $1, %ymm1, %zmm0, %zmm0 +; AVX512-NEXT: retq + %v0 = call <4 x double> @llvm.trunc.v4f64(<4 x double> %a0) + %v1 = call <4 x double> @llvm.trunc.v4f64(<4 x double> %a1) + %res = shufflevector <4 x double> %v0, <4 x double> %v1, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7> + ret <8 x double> %res +} + +define <16 x float> @concat_trunc_v16f32_v8f32(<8 x float> %a0, <8 x float> %a1) { +; SSE-LABEL: concat_trunc_v16f32_v8f32: +; SSE: # %bb.0: +; SSE-NEXT: roundps $11, %xmm0, %xmm0 +; SSE-NEXT: roundps $11, %xmm1, %xmm1 +; SSE-NEXT: roundps $11, %xmm2, %xmm2 +; SSE-NEXT: roundps $11, %xmm3, %xmm3 +; SSE-NEXT: retq +; +; AVX1OR2-LABEL: concat_trunc_v16f32_v8f32: +; AVX1OR2: # %bb.0: +; AVX1OR2-NEXT: vroundps $11, %ymm0, %ymm0 +; AVX1OR2-NEXT: vroundps $11, %ymm1, %ymm1 +; AVX1OR2-NEXT: retq
+; +; AVX512-LABEL: concat_trunc_v16f32_v8f32: +; AVX512: # %bb.0: +; AVX512-NEXT: vroundps $11, %ymm0, %ymm0 +; AVX512-NEXT: vroundps $11, %ymm1, %ymm1 +; AVX512-NEXT: vinsertf64x4 $1, %ymm1, %zmm0, %zmm0 +; AVX512-NEXT: retq + %v0 = call <8 x float> @llvm.trunc.v8f32(<8 x float> %a0) + %v1 = call <8 x float> @llvm.trunc.v8f32(<8 x float> %a1) + %res = shufflevector <8 x float> %v0, <8 x float> %v1, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15> + ret <16 x float> %res +} diff --git a/llvm/test/CodeGen/X86/combine-rcp.ll b/llvm/test/CodeGen/X86/combine-rcp.ll new file mode 100644 index 0000000000000..7de3e96d592db --- /dev/null +++ b/llvm/test/CodeGen/X86/combine-rcp.ll @@ -0,0 +1,65 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mcpu=x86-64 | FileCheck %s --check-prefixes=SSE +; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mcpu=x86-64-v2 | FileCheck %s --check-prefixes=SSE +; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mcpu=sandybridge | FileCheck %s --check-prefixes=AVX,AVX1OR2 +; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mcpu=x86-64-v3 | FileCheck %s --check-prefixes=AVX,AVX1OR2 +; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mcpu=x86-64-v4 | FileCheck %s --check-prefixes=AVX,AVX512 + +define <8 x float> @concat_rcp_v8f32_v4f32(<4 x float> %a0, <4 x float> %a1) { +; SSE-LABEL: concat_rcp_v8f32_v4f32: +; SSE: # %bb.0: +; SSE-NEXT: rcpps %xmm0, %xmm0 +; SSE-NEXT: rcpps %xmm1, %xmm1 +; SSE-NEXT: retq +; +; AVX-LABEL: concat_rcp_v8f32_v4f32: +; AVX: # %bb.0: +; AVX-NEXT: vrcpps %xmm0, %xmm0 +; AVX-NEXT: vrcpps %xmm1, %xmm1 +; AVX-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 +; AVX-NEXT: retq + %v0 = call <4 x float> @llvm.x86.sse.rcp.ps(<4 x float> %a0) + %v1 = call <4 x float> @llvm.x86.sse.rcp.ps(<4 x float> %a1) + %res = shufflevector <4 x float> %v0, <4 x float> %v1, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7> + ret <8 x float> %res +} + +; Ensure we don't convert rcpps to rcp14ps +define <16 x float> @concat_rcp_v16f32_v4f32(<4 x float> %a0, <4 x float> %a1, <4 x float> %a2, <4 x float> %a3) { +; SSE-LABEL: concat_rcp_v16f32_v4f32: +; SSE: # %bb.0: +; SSE-NEXT: rcpps %xmm0, %xmm0 +; SSE-NEXT: rcpps %xmm1, %xmm1 +; SSE-NEXT: rcpps %xmm2, %xmm2 +; SSE-NEXT: rcpps %xmm3, %xmm3 +; SSE-NEXT: retq +; +; AVX1OR2-LABEL: concat_rcp_v16f32_v4f32: +; AVX1OR2: # %bb.0: +; AVX1OR2-NEXT: vrcpps %xmm0, %xmm0 +; AVX1OR2-NEXT: vrcpps %xmm1, %xmm1 +; AVX1OR2-NEXT: vrcpps %xmm2, %xmm2 +; AVX1OR2-NEXT: vrcpps %xmm3, %xmm3 +; AVX1OR2-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 +; AVX1OR2-NEXT: vinsertf128 $1, %xmm3, %ymm2, %ymm1 +; AVX1OR2-NEXT: retq +; +; AVX512-LABEL: concat_rcp_v16f32_v4f32: +; AVX512: # %bb.0: +; AVX512-NEXT: vrcpps %xmm0, %xmm0 +; AVX512-NEXT: vrcpps %xmm1, %xmm1 +; AVX512-NEXT: vrcpps %xmm2, %xmm2 +; AVX512-NEXT: vrcpps %xmm3, %xmm3 +; AVX512-NEXT: vinsertf128 $1, %xmm3, %ymm2, %ymm2 +; AVX512-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 +; AVX512-NEXT: vinsertf64x4 $1, %ymm2, %zmm0, %zmm0 +; AVX512-NEXT: retq + %v0 = call <4 x float> @llvm.x86.sse.rcp.ps(<4 x float> %a0) + %v1 = call <4 x float> @llvm.x86.sse.rcp.ps(<4 x float> %a1) + %v2 = call <4 x float> @llvm.x86.sse.rcp.ps(<4 x float> %a2) + %v3 = call <4 x float> @llvm.x86.sse.rcp.ps(<4 x float> %a3) + %r01 = shufflevector <4 x float> %v0, <4 x float> %v1, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7> + %r23 = shufflevector <4 x float> %v2, <4 x float> %v3, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7> + %res = shufflevector <8 x float> %r01, <8 x float> %r23, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15> + ret <16 x float> %res +} diff --git a/llvm/test/CodeGen/X86/combine-rndscale.ll
b/llvm/test/CodeGen/X86/combine-rndscale.ll new file mode 100644 index 0000000000000..25117e864b512 --- /dev/null +++ b/llvm/test/CodeGen/X86/combine-rndscale.ll @@ -0,0 +1,144 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mcpu=sandybridge | FileCheck %s --check-prefixes=AVX,AVX1OR2 +; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mcpu=x86-64-v3 | FileCheck %s --check-prefixes=AVX,AVX1OR2 +; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mcpu=x86-64-v4 | FileCheck %s --check-prefixes=AVX,AVX512 + +define <4 x double> @concat_roundpd_v4f64_v2f64(<2 x double> %a0, <2 x double> %a1) { +; AVX-LABEL: concat_roundpd_v4f64_v2f64: +; AVX: # %bb.0: +; AVX-NEXT: vroundpd $4, %xmm0, %xmm0 +; AVX-NEXT: vroundpd $4, %xmm1, %xmm1 +; AVX-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 +; AVX-NEXT: retq + %v0 = call <2 x double> @llvm.x86.sse41.round.pd(<2 x double> %a0, i32 4) + %v1 = call <2 x double> @llvm.x86.sse41.round.pd(<2 x double> %a1, i32 4) + %res = shufflevector <2 x double> %v0, <2 x double> %v1, <4 x i32> <i32 0, i32 1, i32 2, i32 3> + ret <4 x double> %res +} + +define <8 x float> @concat_roundps_v8f32_v4f32(<4 x float> %a0, <4 x float> %a1) { +; AVX-LABEL: concat_roundps_v8f32_v4f32: +; AVX: # %bb.0: +; AVX-NEXT: vroundps $4, %xmm0, %xmm0 +; AVX-NEXT: vroundps $4, %xmm1, %xmm1 +; AVX-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 +; AVX-NEXT: retq + %v0 = call <4 x float> @llvm.x86.sse41.round.ps(<4 x float> %a0, i32 4) + %v1 = call <4 x float> @llvm.x86.sse41.round.ps(<4 x float> %a1, i32 4) + %res = shufflevector <4 x float> %v0, <4 x float> %v1, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7> + ret <8 x float> %res +} + +define <8 x double> @concat_roundpd_v8f64_v2f64(<2 x double> %a0, <2 x double> %a1, <2 x double> %a2, <2 x double> %a3) { +; AVX1OR2-LABEL: concat_roundpd_v8f64_v2f64: +; AVX1OR2: # %bb.0: +; AVX1OR2-NEXT: vroundpd $4, %xmm0, %xmm0 +; AVX1OR2-NEXT: vroundpd $4, %xmm1, %xmm1 +; AVX1OR2-NEXT: vroundpd $4, %xmm2, %xmm2 +; AVX1OR2-NEXT: vroundpd $4, %xmm3, %xmm3 +; AVX1OR2-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 +; AVX1OR2-NEXT: vinsertf128 $1, %xmm3, %ymm2, %ymm1 +; AVX1OR2-NEXT: retq +; +; AVX512-LABEL: concat_roundpd_v8f64_v2f64: +; AVX512: # %bb.0: +; AVX512-NEXT: vroundpd $4, %xmm0, %xmm0 +; AVX512-NEXT: vroundpd $4, %xmm1, %xmm1 +; AVX512-NEXT: vroundpd $4, %xmm2, %xmm2 +; AVX512-NEXT: vroundpd $4, %xmm3, %xmm3 +; AVX512-NEXT: vinsertf128 $1, %xmm3, %ymm2, %ymm2 +; AVX512-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 +; AVX512-NEXT: vinsertf64x4 $1, %ymm2, %zmm0, %zmm0 +; AVX512-NEXT: retq + %v0 = call <2 x double> @llvm.x86.sse41.round.pd(<2 x double> %a0, i32 4) + %v1 = call <2 x double> @llvm.x86.sse41.round.pd(<2 x double> %a1, i32 4) + %v2 = call <2 x double> @llvm.x86.sse41.round.pd(<2 x double> %a2, i32 4) + %v3 = call <2 x double> @llvm.x86.sse41.round.pd(<2 x double> %a3, i32 4) + %r01 = shufflevector <2 x double> %v0, <2 x double> %v1, <4 x i32> <i32 0, i32 1, i32 2, i32 3> + %r23 = shufflevector <2 x double> %v2, <2 x double> %v3, <4 x i32> <i32 0, i32 1, i32 2, i32 3> + %res = shufflevector <4 x double> %r01, <4 x double> %r23, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7> + ret <8 x double> %res +} + +define <16 x float> @concat_roundps_v16f32_v4f32(<4 x float> %a0, <4 x float> %a1, <4 x float> %a2, <4 x float> %a3) { +; AVX1OR2-LABEL: concat_roundps_v16f32_v4f32: +; AVX1OR2: # %bb.0: +; AVX1OR2-NEXT: vroundps $4, %xmm0, %xmm0 +; AVX1OR2-NEXT: vroundps $4, %xmm1, %xmm1 +; AVX1OR2-NEXT: vroundps $4, %xmm2, %xmm2 +; AVX1OR2-NEXT: vroundps $4, %xmm3, %xmm3 +; AVX1OR2-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 +; AVX1OR2-NEXT: vinsertf128 $1,
%xmm3, %ymm2, %ymm1 +; AVX1OR2-NEXT: retq +; +; AVX512-LABEL: concat_roundps_v16f32_v4f32: +; AVX512: # %bb.0: +; AVX512-NEXT: vroundps $4, %xmm0, %xmm0 +; AVX512-NEXT: vroundps $4, %xmm1, %xmm1 +; AVX512-NEXT: vroundps $4, %xmm2, %xmm2 +; AVX512-NEXT: vroundps $4, %xmm3, %xmm3 +; AVX512-NEXT: vinsertf128 $1, %xmm3, %ymm2, %ymm2 +; AVX512-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 +; AVX512-NEXT: vinsertf64x4 $1, %ymm2, %zmm0, %zmm0 +; AVX512-NEXT: retq + %v0 = call <4 x float> @llvm.x86.sse41.round.ps(<4 x float> %a0, i32 4) + %v1 = call <4 x float> @llvm.x86.sse41.round.ps(<4 x float> %a1, i32 4) + %v2 = call <4 x float> @llvm.x86.sse41.round.ps(<4 x float> %a2, i32 4) + %v3 = call <4 x float> @llvm.x86.sse41.round.ps(<4 x float> %a3, i32 4) + %r01 = shufflevector <4 x float> %v0, <4 x float> %v1, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7> + %r23 = shufflevector <4 x float> %v2, <4 x float> %v3, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7> + %res = shufflevector <8 x float> %r01, <8 x float> %r23, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15> + ret <16 x float> %res +} + +define <8 x double> @concat_roundpd_v8f64_v4f64(<4 x double> %a0, <4 x double> %a1) { +; AVX1OR2-LABEL: concat_roundpd_v8f64_v4f64: +; AVX1OR2: # %bb.0: +; AVX1OR2-NEXT: vroundpd $4, %ymm0, %ymm0 +; AVX1OR2-NEXT: vroundpd $4, %ymm1, %ymm1 +; AVX1OR2-NEXT: retq +; +; AVX512-LABEL: concat_roundpd_v8f64_v4f64: +; AVX512: # %bb.0: +; AVX512-NEXT: vroundpd $4, %ymm0, %ymm0 +; AVX512-NEXT: vroundpd $4, %ymm1, %ymm1 +; AVX512-NEXT: vinsertf64x4 $1, %ymm1, %zmm0, %zmm0 +; AVX512-NEXT: retq + %v0 = call <4 x double> @llvm.x86.avx.round.pd.256(<4 x double> %a0, i32 4) + %v1 = call <4 x double> @llvm.x86.avx.round.pd.256(<4 x double> %a1, i32 4) + %res = shufflevector <4 x double> %v0, <4 x double> %v1, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7> + ret <8 x double> %res +} + +define <16 x float> @concat_roundps_v16f32_v8f32(<8 x float> %a0, <8 x float> %a1) { +; AVX1OR2-LABEL: concat_roundps_v16f32_v8f32: +; AVX1OR2: # %bb.0: +; AVX1OR2-NEXT: vroundps $4, %ymm0, %ymm0 +; AVX1OR2-NEXT: vroundps $4, %ymm1, %ymm1 +; AVX1OR2-NEXT: retq +; +; AVX512-LABEL: concat_roundps_v16f32_v8f32: +; AVX512: # %bb.0: +; AVX512-NEXT: vroundps $4, %ymm0, %ymm0 +; AVX512-NEXT: vroundps $4, %ymm1, %ymm1 +; AVX512-NEXT: vinsertf64x4 $1, %ymm1, %zmm0, %zmm0 +; AVX512-NEXT: retq + %v0 = call <8 x float> @llvm.x86.avx.round.ps.256(<8 x float> %a0, i32 4) + %v1 = call <8 x float> @llvm.x86.avx.round.ps.256(<8 x float> %a1, i32 4) + %res = shufflevector <8 x float> %v0, <8 x float> %v1, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15> + ret <16 x float> %res +} + +; negative test - rounding mode mismatch +define <8 x float> @concat_roundps_v8f32_v4f32_mismatch(<4 x float> %a0, <4 x float> %a1) { +; AVX-LABEL: concat_roundps_v8f32_v4f32_mismatch: +; AVX: # %bb.0: +; AVX-NEXT: vroundps $0, %xmm0, %xmm0 +; AVX-NEXT: vroundps $4, %xmm1, %xmm1 +; AVX-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 +; AVX-NEXT: retq + %v0 = call <4 x float> @llvm.x86.sse41.round.ps(<4 x float> %a0, i32 0) + %v1 = call <4 x float> @llvm.x86.sse41.round.ps(<4 x float> %a1, i32 4) + %res = shufflevector <4 x float> %v0, <4 x float> %v1, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7> + ret <8 x float> %res +} diff --git a/llvm/test/CodeGen/X86/combine-rsqrt.ll b/llvm/test/CodeGen/X86/combine-rsqrt.ll new file mode 100644 index 0000000000000..78688701f8cd3 --- /dev/null +++ b/llvm/test/CodeGen/X86/combine-rsqrt.ll @@ -0,0 +1,65 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mcpu=x86-64 | FileCheck %s --check-prefixes=SSE +; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mcpu=x86-64-v2 | FileCheck %s
--check-prefixes=SSE +; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mcpu=sandybridge | FileCheck %s --check-prefixes=AVX,AVX1OR2 +; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mcpu=x86-64-v3 | FileCheck %s --check-prefixes=AVX,AVX1OR2 +; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mcpu=x86-64-v4 | FileCheck %s --check-prefixes=AVX,AVX512 + +define <8 x float> @concat_rsqrt_v8f32_v4f32(<4 x float> %a0, <4 x float> %a1) { +; SSE-LABEL: concat_rsqrt_v8f32_v4f32: +; SSE: # %bb.0: +; SSE-NEXT: rsqrtps %xmm0, %xmm0 +; SSE-NEXT: rsqrtps %xmm1, %xmm1 +; SSE-NEXT: retq +; +; AVX-LABEL: concat_rsqrt_v8f32_v4f32: +; AVX: # %bb.0: +; AVX-NEXT: vrsqrtps %xmm0, %xmm0 +; AVX-NEXT: vrsqrtps %xmm1, %xmm1 +; AVX-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 +; AVX-NEXT: retq + %v0 = call <4 x float> @llvm.x86.sse.rsqrt.ps(<4 x float> %a0) + %v1 = call <4 x float> @llvm.x86.sse.rsqrt.ps(<4 x float> %a1) + %res = shufflevector <4 x float> %v0, <4 x float> %v1, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7> + ret <8 x float> %res +} + +; Ensure we don't convert rsqrtps to rsqrt14ps +define <16 x float> @concat_rsqrt_v16f32_v4f32(<4 x float> %a0, <4 x float> %a1, <4 x float> %a2, <4 x float> %a3) { +; SSE-LABEL: concat_rsqrt_v16f32_v4f32: +; SSE: # %bb.0: +; SSE-NEXT: rsqrtps %xmm0, %xmm0 +; SSE-NEXT: rsqrtps %xmm1, %xmm1 +; SSE-NEXT: rsqrtps %xmm2, %xmm2 +; SSE-NEXT: rsqrtps %xmm3, %xmm3 +; SSE-NEXT: retq +; +; AVX1OR2-LABEL: concat_rsqrt_v16f32_v4f32: +; AVX1OR2: # %bb.0: +; AVX1OR2-NEXT: vrsqrtps %xmm0, %xmm0 +; AVX1OR2-NEXT: vrsqrtps %xmm1, %xmm1 +; AVX1OR2-NEXT: vrsqrtps %xmm2, %xmm2 +; AVX1OR2-NEXT: vrsqrtps %xmm3, %xmm3 +; AVX1OR2-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 +; AVX1OR2-NEXT: vinsertf128 $1, %xmm3, %ymm2, %ymm1 +; AVX1OR2-NEXT: retq +; +; AVX512-LABEL: concat_rsqrt_v16f32_v4f32: +; AVX512: # %bb.0: +; AVX512-NEXT: vrsqrtps %xmm0, %xmm0 +; AVX512-NEXT: vrsqrtps %xmm1, %xmm1 +; AVX512-NEXT: vrsqrtps %xmm2, %xmm2 +; AVX512-NEXT: vrsqrtps %xmm3, %xmm3 +; AVX512-NEXT: vinsertf128 $1, %xmm3, %ymm2, %ymm2 +; AVX512-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 +; AVX512-NEXT: vinsertf64x4 $1, %ymm2, %zmm0, %zmm0 +; AVX512-NEXT: retq + %v0 = call <4 x float> @llvm.x86.sse.rsqrt.ps(<4 x float> %a0) + %v1 = call <4 x float> @llvm.x86.sse.rsqrt.ps(<4 x float> %a1) + %v2 = call <4 x float> @llvm.x86.sse.rsqrt.ps(<4 x float> %a2) + %v3 = call <4 x float> @llvm.x86.sse.rsqrt.ps(<4 x float> %a3) + %r01 = shufflevector <4 x float> %v0, <4 x float> %v1, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7> + %r23 = shufflevector <4 x float> %v2, <4 x float> %v3, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7> + %res = shufflevector <8 x float> %r01, <8 x float> %r23, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15> + ret <16 x float> %res +} diff --git a/llvm/test/Transforms/InstCombine/saturating-add-sub.ll b/llvm/test/Transforms/InstCombine/saturating-add-sub.ll index c0ad5818e448a..1294f867f07c0 100644 --- a/llvm/test/Transforms/InstCombine/saturating-add-sub.ll +++ b/llvm/test/Transforms/InstCombine/saturating-add-sub.ll @@ -2671,3 +2671,19 @@ define i8 @neg_neg_constant(i8 %x, i8 %y) { %s = select i1 %cmp, i8 127, i8 %d ret i8 %s } + +; Make sure we don't crash in this case.
+define i32 @pr153053_strict_pred_with_nonconstant_rhs(i32 %x, i32 %y) { +; CHECK-LABEL: @pr153053_strict_pred_with_nonconstant_rhs( +; CHECK-NEXT: entry: +; CHECK-NEXT: [[CMP:%.*]] = icmp slt i32 [[X:%.*]], [[Y:%.*]] +; CHECK-NEXT: [[ADD:%.*]] = add i32 [[X]], 1 +; CHECK-NEXT: [[RES:%.*]] = select i1 [[CMP]], i32 [[ADD]], i32 2147483647 +; CHECK-NEXT: ret i32 [[RES]] +; +entry: + %cmp = icmp slt i32 %x, %y + %add = add i32 %x, 1 + %res = select i1 %cmp, i32 %add, i32 2147483647 + ret i32 %res +} diff --git a/llvm/test/Transforms/LoopVectorize/iv_outside_user.ll b/llvm/test/Transforms/LoopVectorize/iv_outside_user.ll index b4fd06316a2e5..4f19a7c586bc3 100644 --- a/llvm/test/Transforms/LoopVectorize/iv_outside_user.ll +++ b/llvm/test/Transforms/LoopVectorize/iv_outside_user.ll @@ -152,59 +152,106 @@ for.end: ret ptr %ptr.phi } -define ptr @both(i32 %k) { -; CHECK-LABEL: define ptr @both( -; CHECK-SAME: i32 [[K:%.*]]) { -; CHECK-NEXT: [[ENTRY:.*]]: -; CHECK-NEXT: [[BASE:%.*]] = getelementptr inbounds i32, ptr undef, i64 1 -; CHECK-NEXT: [[TMP0:%.*]] = add i32 [[K]], -1 -; CHECK-NEXT: [[TMP1:%.*]] = zext i32 [[TMP0]] to i64 -; CHECK-NEXT: [[TMP2:%.*]] = add nuw nsw i64 [[TMP1]], 1 -; CHECK-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[TMP2]], 2 -; CHECK-NEXT: br i1 [[MIN_ITERS_CHECK]], label %[[SCALAR_PH:.*]], label %[[VECTOR_PH:.*]] -; CHECK: [[VECTOR_PH]]: -; CHECK-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[TMP2]], 2 -; CHECK-NEXT: [[N_VEC:%.*]] = sub i64 [[TMP2]], [[N_MOD_VF]] -; CHECK-NEXT: [[IND_END:%.*]] = trunc i64 [[N_VEC]] to i32 -; CHECK-NEXT: [[TMP3:%.*]] = mul i64 [[N_VEC]], 4 -; CHECK-NEXT: [[IND_END1:%.*]] = getelementptr i8, ptr [[BASE]], i64 [[TMP3]] -; CHECK-NEXT: [[TMP4:%.*]] = mul i64 [[N_VEC]], 4 -; CHECK-NEXT: [[IND_END2:%.*]] = getelementptr i8, ptr undef, i64 [[TMP4]] -; CHECK-NEXT: br label %[[VECTOR_BODY:.*]] -; CHECK: [[VECTOR_BODY]]: -; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ] -; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 2 -; CHECK-NEXT: [[TMP5:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] -; CHECK-NEXT: br i1 [[TMP5]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], {{!llvm.loop ![0-9]+}} -; CHECK: [[MIDDLE_BLOCK]]: -; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[TMP2]], [[N_VEC]] -; CHECK-NEXT: [[IND_ESCAPE:%.*]] = getelementptr i8, ptr [[IND_END1]], i64 -4 -; CHECK-NEXT: br i1 [[CMP_N]], label %[[FOR_END:.*]], label %[[SCALAR_PH]] -; CHECK: [[SCALAR_PH]]: -; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i32 [ [[IND_END]], %[[MIDDLE_BLOCK]] ], [ 0, %[[ENTRY]] ] -; CHECK-NEXT: [[BC_RESUME_VAL1:%.*]] = phi ptr [ [[IND_END1]], %[[MIDDLE_BLOCK]] ], [ [[BASE]], %[[ENTRY]] ] -; CHECK-NEXT: [[BC_RESUME_VAL2:%.*]] = phi ptr [ [[IND_END2]], %[[MIDDLE_BLOCK]] ], [ undef, %[[ENTRY]] ] -; CHECK-NEXT: br label %[[FOR_BODY:.*]] -; CHECK: [[FOR_BODY]]: -; CHECK-NEXT: [[INC_PHI:%.*]] = phi i32 [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ], [ [[INC:%.*]], %[[FOR_BODY]] ] -; CHECK-NEXT: [[INC_LAG1:%.*]] = phi ptr [ [[BC_RESUME_VAL1]], %[[SCALAR_PH]] ], [ [[TMP:%.*]], %[[FOR_BODY]] ] -; CHECK-NEXT: [[INC_LAG2:%.*]] = phi ptr [ [[BC_RESUME_VAL2]], %[[SCALAR_PH]] ], [ [[INC_LAG1]], %[[FOR_BODY]] ] -; CHECK-NEXT: [[TMP]] = getelementptr inbounds i32, ptr [[INC_LAG1]], i64 1 -; CHECK-NEXT: [[INC]] = add nsw i32 [[INC_PHI]], 1 -; CHECK-NEXT: [[CMP:%.*]] = icmp eq i32 [[INC]], [[K]] -; CHECK-NEXT: br i1 [[CMP]], label %[[FOR_END]], label %[[FOR_BODY]], {{!llvm.loop ![0-9]+}} -; CHECK: [[FOR_END]]: -; CHECK-NEXT: [[INC_LAG1_LCSSA:%.*]] = 
phi ptr [ [[INC_LAG1]], %[[FOR_BODY]] ], [ [[IND_ESCAPE]], %[[MIDDLE_BLOCK]] ]
-; CHECK-NEXT:    ret ptr [[INC_LAG1_LCSSA]]
+define ptr @both(ptr %p, i32 %k) {
+; VEC-LABEL: define ptr @both(
+; VEC-SAME: ptr [[P:%.*]], i32 [[K:%.*]]) {
+; VEC-NEXT:  [[ENTRY:.*]]:
+; VEC-NEXT:    [[BASE:%.*]] = getelementptr inbounds i32, ptr [[P]], i64 1
+; VEC-NEXT:    [[TMP0:%.*]] = add i32 [[K]], -1
+; VEC-NEXT:    [[TMP1:%.*]] = zext i32 [[TMP0]] to i64
+; VEC-NEXT:    [[TMP2:%.*]] = add nuw nsw i64 [[TMP1]], 1
+; VEC-NEXT:    [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[TMP2]], 2
+; VEC-NEXT:    br i1 [[MIN_ITERS_CHECK]], label %[[SCALAR_PH:.*]], label %[[VECTOR_PH:.*]]
+; VEC:       [[VECTOR_PH]]:
+; VEC-NEXT:    [[N_MOD_VF:%.*]] = urem i64 [[TMP2]], 2
+; VEC-NEXT:    [[N_VEC:%.*]] = sub i64 [[TMP2]], [[N_MOD_VF]]
+; VEC-NEXT:    [[TMP3:%.*]] = trunc i64 [[N_VEC]] to i32
+; VEC-NEXT:    [[TMP4:%.*]] = mul i64 [[N_VEC]], 4
+; VEC-NEXT:    [[TMP5:%.*]] = getelementptr i8, ptr [[BASE]], i64 [[TMP4]]
+; VEC-NEXT:    br label %[[VECTOR_BODY:.*]]
+; VEC:       [[VECTOR_BODY]]:
+; VEC-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ]
+; VEC-NEXT:    [[POINTER_PHI:%.*]] = phi ptr [ [[BASE]], %[[VECTOR_PH]] ], [ [[PTR_IND:%.*]], %[[VECTOR_BODY]] ]
+; VEC-NEXT:    [[VECTOR_GEP:%.*]] = getelementptr i8, ptr [[POINTER_PHI]], <2 x i64> <i64 0, i64 4>
+; VEC-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 2
+; VEC-NEXT:    [[PTR_IND]] = getelementptr i8, ptr [[POINTER_PHI]], i64 8
+; VEC-NEXT:    [[TMP6:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
+; VEC-NEXT:    br i1 [[TMP6]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], {{!llvm.loop ![0-9]+}}
+; VEC:       [[MIDDLE_BLOCK]]:
+; VEC-NEXT:    [[VECTOR_RECUR_EXTRACT:%.*]] = extractelement <2 x ptr> [[VECTOR_GEP]], i32 1
+; VEC-NEXT:    [[CMP_N:%.*]] = icmp eq i64 [[TMP2]], [[N_VEC]]
+; VEC-NEXT:    [[IND_ESCAPE:%.*]] = getelementptr i8, ptr [[TMP5]], i64 -4
+; VEC-NEXT:    br i1 [[CMP_N]], label %[[FOR_END:.*]], label %[[SCALAR_PH]]
+; VEC:       [[SCALAR_PH]]:
+; VEC-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i32 [ [[TMP3]], %[[MIDDLE_BLOCK]] ], [ 0, %[[ENTRY]] ]
+; VEC-NEXT:    [[BC_RESUME_VAL1:%.*]] = phi ptr [ [[TMP5]], %[[MIDDLE_BLOCK]] ], [ [[BASE]], %[[ENTRY]] ]
+; VEC-NEXT:    [[SCALAR_RECUR_INIT:%.*]] = phi ptr [ [[VECTOR_RECUR_EXTRACT]], %[[MIDDLE_BLOCK]] ], [ [[BASE]], %[[ENTRY]] ]
+; VEC-NEXT:    br label %[[FOR_BODY:.*]]
+; VEC:       [[FOR_BODY]]:
+; VEC-NEXT:    [[INC_PHI:%.*]] = phi i32 [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ], [ [[INC:%.*]], %[[FOR_BODY]] ]
+; VEC-NEXT:    [[INC_LAG1:%.*]] = phi ptr [ [[BC_RESUME_VAL1]], %[[SCALAR_PH]] ], [ [[TMP:%.*]], %[[FOR_BODY]] ]
+; VEC-NEXT:    [[INC_LAG2:%.*]] = phi ptr [ [[SCALAR_RECUR_INIT]], %[[SCALAR_PH]] ], [ [[INC_LAG1]], %[[FOR_BODY]] ]
+; VEC-NEXT:    [[TMP]] = getelementptr inbounds i32, ptr [[INC_LAG1]], i64 1
+; VEC-NEXT:    [[INC]] = add nsw i32 [[INC_PHI]], 1
+; VEC-NEXT:    [[CMP:%.*]] = icmp eq i32 [[INC]], [[K]]
+; VEC-NEXT:    br i1 [[CMP]], label %[[FOR_END]], label %[[FOR_BODY]], {{!llvm.loop ![0-9]+}}
+; VEC:       [[FOR_END]]:
+; VEC-NEXT:    [[INC_LAG1_LCSSA:%.*]] = phi ptr [ [[INC_LAG1]], %[[FOR_BODY]] ], [ [[IND_ESCAPE]], %[[MIDDLE_BLOCK]] ]
+; VEC-NEXT:    ret ptr [[INC_LAG1_LCSSA]]
+;
+; INTERLEAVE-LABEL: define ptr @both(
+; INTERLEAVE-SAME: ptr [[P:%.*]], i32 [[K:%.*]]) {
+; INTERLEAVE-NEXT:  [[ENTRY:.*]]:
+; INTERLEAVE-NEXT:    [[BASE:%.*]] = getelementptr inbounds i32, ptr [[P]], i64 1
+; INTERLEAVE-NEXT:    [[TMP0:%.*]] = add i32 [[K]], -1
+; INTERLEAVE-NEXT:    [[TMP1:%.*]] = zext i32 [[TMP0]] to i64
+; INTERLEAVE-NEXT:    [[TMP2:%.*]] = add nuw nsw i64 [[TMP1]], 1
+; INTERLEAVE-NEXT:
[[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[TMP2]], 2 +; INTERLEAVE-NEXT: br i1 [[MIN_ITERS_CHECK]], label %[[SCALAR_PH:.*]], label %[[VECTOR_PH:.*]] +; INTERLEAVE: [[VECTOR_PH]]: +; INTERLEAVE-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[TMP2]], 2 +; INTERLEAVE-NEXT: [[N_VEC:%.*]] = sub i64 [[TMP2]], [[N_MOD_VF]] +; INTERLEAVE-NEXT: [[TMP3:%.*]] = trunc i64 [[N_VEC]] to i32 +; INTERLEAVE-NEXT: [[TMP6:%.*]] = mul i64 [[N_VEC]], 4 +; INTERLEAVE-NEXT: [[NEXT_GEP:%.*]] = getelementptr i8, ptr [[BASE]], i64 [[TMP6]] +; INTERLEAVE-NEXT: br label %[[VECTOR_BODY:.*]] +; INTERLEAVE: [[VECTOR_BODY]]: +; INTERLEAVE-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ] +; INTERLEAVE-NEXT: [[OFFSET_IDX:%.*]] = mul i64 [[INDEX]], 4 +; INTERLEAVE-NEXT: [[TMP8:%.*]] = add i64 [[OFFSET_IDX]], 4 +; INTERLEAVE-NEXT: [[NEXT_GEP1:%.*]] = getelementptr i8, ptr [[BASE]], i64 [[TMP8]] +; INTERLEAVE-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 2 +; INTERLEAVE-NEXT: [[TMP7:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] +; INTERLEAVE-NEXT: br i1 [[TMP7]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], {{!llvm.loop ![0-9]+}} +; INTERLEAVE: [[MIDDLE_BLOCK]]: +; INTERLEAVE-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[TMP2]], [[N_VEC]] +; INTERLEAVE-NEXT: [[IND_ESCAPE:%.*]] = getelementptr i8, ptr [[NEXT_GEP]], i64 -4 +; INTERLEAVE-NEXT: br i1 [[CMP_N]], label %[[FOR_END:.*]], label %[[SCALAR_PH]] +; INTERLEAVE: [[SCALAR_PH]]: +; INTERLEAVE-NEXT: [[BC_RESUME_VAL:%.*]] = phi i32 [ [[TMP3]], %[[MIDDLE_BLOCK]] ], [ 0, %[[ENTRY]] ] +; INTERLEAVE-NEXT: [[BC_RESUME_VAL1:%.*]] = phi ptr [ [[NEXT_GEP]], %[[MIDDLE_BLOCK]] ], [ [[BASE]], %[[ENTRY]] ] +; INTERLEAVE-NEXT: [[SCALAR_RECUR_INIT:%.*]] = phi ptr [ [[NEXT_GEP1]], %[[MIDDLE_BLOCK]] ], [ [[BASE]], %[[ENTRY]] ] +; INTERLEAVE-NEXT: br label %[[FOR_BODY:.*]] +; INTERLEAVE: [[FOR_BODY]]: +; INTERLEAVE-NEXT: [[INC_PHI:%.*]] = phi i32 [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ], [ [[INC:%.*]], %[[FOR_BODY]] ] +; INTERLEAVE-NEXT: [[INC_LAG1:%.*]] = phi ptr [ [[BC_RESUME_VAL1]], %[[SCALAR_PH]] ], [ [[TMP:%.*]], %[[FOR_BODY]] ] +; INTERLEAVE-NEXT: [[INC_LAG2:%.*]] = phi ptr [ [[SCALAR_RECUR_INIT]], %[[SCALAR_PH]] ], [ [[INC_LAG1]], %[[FOR_BODY]] ] +; INTERLEAVE-NEXT: [[TMP]] = getelementptr inbounds i32, ptr [[INC_LAG1]], i64 1 +; INTERLEAVE-NEXT: [[INC]] = add nsw i32 [[INC_PHI]], 1 +; INTERLEAVE-NEXT: [[CMP:%.*]] = icmp eq i32 [[INC]], [[K]] +; INTERLEAVE-NEXT: br i1 [[CMP]], label %[[FOR_END]], label %[[FOR_BODY]], {{!llvm.loop ![0-9]+}} +; INTERLEAVE: [[FOR_END]]: +; INTERLEAVE-NEXT: [[INC_LAG1_LCSSA:%.*]] = phi ptr [ [[INC_LAG1]], %[[FOR_BODY]] ], [ [[IND_ESCAPE]], %[[MIDDLE_BLOCK]] ] +; INTERLEAVE-NEXT: ret ptr [[INC_LAG1_LCSSA]] ; entry: - %base = getelementptr inbounds i32, ptr undef, i64 1 + %base = getelementptr inbounds i32, ptr %p, i64 1 br label %for.body for.body: %inc.phi = phi i32 [ 0, %entry ], [ %inc, %for.body ] %inc.lag1 = phi ptr [ %base, %entry ], [ %tmp, %for.body] - %inc.lag2 = phi ptr [ undef, %entry ], [ %inc.lag1, %for.body] + %inc.lag2 = phi ptr [ %base, %entry ], [ %inc.lag1, %for.body] %tmp = getelementptr inbounds i32, ptr %inc.lag1, i64 1 %inc = add nsw i32 %inc.phi, 1 %cmp = icmp eq i32 %inc, %k diff --git a/llvm/test/Transforms/LoopVectorize/struct-return.ll b/llvm/test/Transforms/LoopVectorize/struct-return.ll index f2e2e2846614b..70c6c7e900c51 100644 --- a/llvm/test/Transforms/LoopVectorize/struct-return.ll +++ b/llvm/test/Transforms/LoopVectorize/struct-return.ll @@ -29,8 +29,9 @@ define void @struct_return_f32_widen(ptr 
noalias %in, ptr noalias writeonly %out ; CHECK-NEXT: [[TMP6:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1024 ; CHECK-NEXT: br i1 [[TMP6]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]] ; CHECK: [[MIDDLE_BLOCK]]: -; CHECK-NEXT: br [[EXIT:label %.*]] -; CHECK: [[SCALAR_PH:.*:]] +; CHECK-NEXT: br label %[[EXIT:.*]] +; CHECK: [[EXIT]]: +; CHECK-NEXT: ret void ; entry: br label %for.body @@ -77,8 +78,9 @@ define void @struct_return_f64_widen(ptr noalias %in, ptr noalias writeonly %out ; CHECK-NEXT: [[TMP6:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1024 ; CHECK-NEXT: br i1 [[TMP6]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP3:![0-9]+]] ; CHECK: [[MIDDLE_BLOCK]]: -; CHECK-NEXT: br [[EXIT:label %.*]] -; CHECK: [[SCALAR_PH:.*:]] +; CHECK-NEXT: br label %[[EXIT:.*]] +; CHECK: [[EXIT]]: +; CHECK-NEXT: ret void ; entry: br label %for.body @@ -232,8 +234,9 @@ define void @struct_return_i32_three_results_widen(ptr noalias %in, ptr noalias ; CHECK-NEXT: [[TMP4:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1024 ; CHECK-NEXT: br i1 [[TMP4]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP6:![0-9]+]] ; CHECK: [[MIDDLE_BLOCK]]: -; CHECK-NEXT: br [[EXIT:label %.*]] -; CHECK: [[SCALAR_PH:.*:]] +; CHECK-NEXT: br label %[[EXIT:.*]] +; CHECK: [[EXIT]]: +; CHECK-NEXT: ret void ; entry: br label %for.body @@ -273,7 +276,7 @@ define void @scalarized_predicated_struct_return(ptr %a) { ; CHECK-NEXT: br i1 [[TMP2]], label %[[PRED_STORE_IF:.*]], label %[[PRED_STORE_CONTINUE:.*]] ; CHECK: [[PRED_STORE_IF]]: ; CHECK-NEXT: [[TMP3:%.*]] = extractelement <2 x i64> [[WIDE_LOAD]], i32 0 -; CHECK-NEXT: [[TMP4:%.*]] = tail call { i64, i64 } @bar_i64(i64 [[TMP3]]) #[[ATTR4:[0-9]+]] +; CHECK-NEXT: [[TMP4:%.*]] = tail call { i64, i64 } @bar_i64(i64 [[TMP3]]) #[[ATTR2:[0-9]+]] ; CHECK-NEXT: [[TMP5:%.*]] = extractvalue { i64, i64 } [[TMP4]], 0 ; CHECK-NEXT: [[TMP6:%.*]] = extractelement <2 x i64> [[WIDE_LOAD]], i32 0 ; CHECK-NEXT: [[TMP7:%.*]] = udiv i64 [[TMP5]], [[TMP6]] @@ -286,7 +289,7 @@ define void @scalarized_predicated_struct_return(ptr %a) { ; CHECK-NEXT: br i1 [[TMP10]], label %[[PRED_STORE_IF1:.*]], label %[[PRED_STORE_CONTINUE2]] ; CHECK: [[PRED_STORE_IF1]]: ; CHECK-NEXT: [[TMP11:%.*]] = extractelement <2 x i64> [[WIDE_LOAD]], i32 1 -; CHECK-NEXT: [[TMP12:%.*]] = tail call { i64, i64 } @bar_i64(i64 [[TMP11]]) #[[ATTR4]] +; CHECK-NEXT: [[TMP12:%.*]] = tail call { i64, i64 } @bar_i64(i64 [[TMP11]]) #[[ATTR2]] ; CHECK-NEXT: [[TMP13:%.*]] = extractvalue { i64, i64 } [[TMP12]], 0 ; CHECK-NEXT: [[TMP14:%.*]] = extractelement <2 x i64> [[WIDE_LOAD]], i32 1 ; CHECK-NEXT: [[TMP15:%.*]] = udiv i64 [[TMP13]], [[TMP14]] @@ -299,8 +302,9 @@ define void @scalarized_predicated_struct_return(ptr %a) { ; CHECK-NEXT: [[TMP18:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1024 ; CHECK-NEXT: br i1 [[TMP18]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP7:![0-9]+]] ; CHECK: [[MIDDLE_BLOCK]]: -; CHECK-NEXT: br [[EXIT:label %.*]] -; CHECK: [[SCALAR_PH:.*:]] +; CHECK-NEXT: br label %[[EXIT:.*]] +; CHECK: [[EXIT]]: +; CHECK-NEXT: ret void ; entry: br label %for.body @@ -385,7 +389,7 @@ define void @negative_mixed_element_type_struct_return(ptr noalias %in, ptr noal ; CHECK-NEXT: [[IV:%.*]] = phi i64 [ 0, %[[ENTRY]] ], [ [[IV_NEXT:%.*]], %[[FOR_BODY]] ] ; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds float, ptr [[IN]], i64 [[IV]] ; CHECK-NEXT: [[IN_VAL:%.*]] = load float, ptr [[ARRAYIDX]], align 4 -; CHECK-NEXT: [[CALL:%.*]] = tail call { float, i32 } @baz(float 
[[IN_VAL]]) #[[ATTR5:[0-9]+]] +; CHECK-NEXT: [[CALL:%.*]] = tail call { float, i32 } @baz(float [[IN_VAL]]) #[[ATTR3:[0-9]+]] ; CHECK-NEXT: [[EXTRACT_A:%.*]] = extractvalue { float, i32 } [[CALL]], 0 ; CHECK-NEXT: [[EXTRACT_B:%.*]] = extractvalue { float, i32 } [[CALL]], 1 ; CHECK-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds float, ptr [[OUT_A]], i64 [[IV]] @@ -433,7 +437,7 @@ define void @negative_named_struct_return(ptr noalias readonly %in, ptr noalias ; CHECK-NEXT: [[IV:%.*]] = phi i64 [ 0, %[[ENTRY]] ], [ [[IV_NEXT:%.*]], %[[FOR_BODY]] ] ; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds double, ptr [[IN]], i64 [[IV]] ; CHECK-NEXT: [[IN_VAL:%.*]] = load double, ptr [[ARRAYIDX]], align 8 -; CHECK-NEXT: [[CALL:%.*]] = tail call [[NAMED_STRUCT:%.*]] @[[BAR_NAMED:[a-zA-Z0-9_$\"\\.-]*[a-zA-Z_$\"\\.-][a-zA-Z0-9_$\"\\.-]*]](double [[IN_VAL]]) #[[ATTR6:[0-9]+]] +; CHECK-NEXT: [[CALL:%.*]] = tail call [[NAMED_STRUCT:%.*]] @[[BAR_NAMED:[a-zA-Z0-9_$\"\\.-]*[a-zA-Z_$\"\\.-][a-zA-Z0-9_$\"\\.-]*]](double [[IN_VAL]]) #[[ATTR4:[0-9]+]] ; CHECK-NEXT: [[EXTRACT_A:%.*]] = extractvalue [[NAMED_STRUCT]] [[CALL]], 0 ; CHECK-NEXT: [[EXTRACT_B:%.*]] = extractvalue [[NAMED_STRUCT]] [[CALL]], 1 ; CHECK-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds double, ptr [[OUT_A]], i64 [[IV]] diff --git a/llvm/test/Transforms/OpenMP/parallel_region_merging.ll b/llvm/test/Transforms/OpenMP/parallel_region_merging.ll index 83452e72b56b9..1bbac5cc3154b 100644 --- a/llvm/test/Transforms/OpenMP/parallel_region_merging.ll +++ b/llvm/test/Transforms/OpenMP/parallel_region_merging.ll @@ -4880,6 +4880,8 @@ entry: ; CHECK2: omp.par.merged.split: ; CHECK2-NEXT: br label [[OMP_REGION_BODY_SPLIT:%.*]] ; CHECK2: omp_region.body.split: +; CHECK2-NEXT: br label [[OMP_REGION_FINALIZE:%.*]] +; CHECK2: omp_region.finalize: ; CHECK2-NEXT: call void @__kmpc_end_master(ptr @[[GLOB2]], i32 [[OMP_GLOBAL_THREAD_NUM]]) ; CHECK2-NEXT: br label [[OMP_REGION_END]] ; CHECK2: omp.par.exit.exitStub: @@ -4974,6 +4976,8 @@ entry: ; CHECK2: omp.par.merged.split: ; CHECK2-NEXT: br label [[OMP_REGION_BODY_SPLIT:%.*]] ; CHECK2: omp_region.body.split: +; CHECK2-NEXT: br label [[OMP_REGION_FINALIZE:%.*]] +; CHECK2: omp_region.finalize: ; CHECK2-NEXT: call void @__kmpc_end_master(ptr @[[GLOB2]], i32 [[OMP_GLOBAL_THREAD_NUM]]) ; CHECK2-NEXT: br label [[OMP_REGION_END]] ; CHECK2: omp.par.exit.exitStub: @@ -5070,6 +5074,8 @@ entry: ; CHECK2: omp.par.merged.split: ; CHECK2-NEXT: br label [[OMP_REGION_BODY_SPLIT:%.*]] ; CHECK2: omp_region.body.split: +; CHECK2-NEXT: br label [[OMP_REGION_FINALIZE:%.*]] +; CHECK2: omp_region.finalize: ; CHECK2-NEXT: call void @__kmpc_end_master(ptr @[[GLOB2]], i32 [[OMP_GLOBAL_THREAD_NUM]]) ; CHECK2-NEXT: br label [[OMP_REGION_END]] ; CHECK2: omp.par.exit.exitStub: @@ -5157,6 +5163,8 @@ entry: ; CHECK2: omp.par.merged.split: ; CHECK2-NEXT: br label [[OMP_REGION_BODY_SPLIT:%.*]] ; CHECK2: omp_region.body.split: +; CHECK2-NEXT: br label [[OMP_REGION_FINALIZE:%.*]] +; CHECK2: omp_region.finalize: ; CHECK2-NEXT: call void @__kmpc_end_master(ptr @[[GLOB2]], i32 [[OMP_GLOBAL_THREAD_NUM]]) ; CHECK2-NEXT: br label [[OMP_REGION_END]] ; CHECK2: omp.par.exit.exitStub: @@ -5254,6 +5262,8 @@ entry: ; CHECK2: omp.par.merged.split: ; CHECK2-NEXT: br label [[OMP_REGION_BODY_SPLIT:%.*]] ; CHECK2: omp_region.body.split: +; CHECK2-NEXT: br label [[OMP_REGION_FINALIZE:%.*]] +; CHECK2: omp_region.finalize: ; CHECK2-NEXT: call void @__kmpc_end_master(ptr @[[GLOB2]], i32 [[OMP_GLOBAL_THREAD_NUM]]) ; CHECK2-NEXT: br label [[OMP_REGION_END]] ; 
CHECK2: omp.par.exit.exitStub: @@ -5434,6 +5444,8 @@ entry: ; CHECK2: omp.par.merged.split: ; CHECK2-NEXT: br label [[OMP_REGION_BODY_SPLIT:%.*]] ; CHECK2: omp_region.body.split: +; CHECK2-NEXT: br label [[OMP_REGION_FINALIZE:%.*]] +; CHECK2: omp_region.finalize: ; CHECK2-NEXT: call void @__kmpc_end_master(ptr @[[GLOB2]], i32 [[OMP_GLOBAL_THREAD_NUM]]) ; CHECK2-NEXT: br label [[OMP_REGION_END]] ; CHECK2: omp.par.exit.exitStub: @@ -5624,8 +5636,10 @@ entry: ; CHECK2: omp.par.region.split: ; CHECK2-NEXT: br label [[OMP_PAR_PRE_FINALIZE:%.*]] ; CHECK2: omp.par.pre_finalize: -; CHECK2-NEXT: br label [[OMP_PAR_OUTLINED_EXIT_EXITSTUB:%.*]] -; CHECK2: omp_region.body5: +; CHECK2-NEXT: br label [[FINI:%.*]] +; CHECK2: .fini: +; CHECK2-NEXT: br label [[OMP_PAR_EXIT_EXITSTUB:.*]] +; CHECK2: omp_region.body6: ; CHECK2-NEXT: br label [[SEQ_PAR_MERGED2:%.*]] ; CHECK2: seq.par.merged2: ; CHECK2-NEXT: [[ADD_SEQ_OUTPUT_LOAD:%.*]] = load i32, ptr [[LOADGEP_ADD_SEQ_OUTPUT_ALLOC]], align 4 @@ -5634,7 +5648,9 @@ entry: ; CHECK2-NEXT: br label [[OMP_PAR_MERGED_SPLIT_SPLIT_SPLIT:%.*]] ; CHECK2: omp.par.merged.split.split.split: ; CHECK2-NEXT: br label [[OMP_REGION_BODY5_SPLIT:%.*]] -; CHECK2: omp_region.body5.split: +; CHECK2: omp_region.body6.split: +; CHECK2-NEXT: br label [[OMP_REGION_FINALIZE5:%.*]] +; CHECK2: omp_region.finalize{{.*}}: ; CHECK2-NEXT: call void @__kmpc_end_master(ptr @[[GLOB2]], i32 [[OMP_GLOBAL_THREAD_NUM3]]) ; CHECK2-NEXT: br label [[OMP_REGION_END4]] ; CHECK2: omp_region.body: @@ -5646,6 +5662,8 @@ entry: ; CHECK2: omp.par.merged.split: ; CHECK2-NEXT: br label [[OMP_REGION_BODY_SPLIT:%.*]] ; CHECK2: omp_region.body.split: +; CHECK2-NEXT: br label [[OMP_REGION_FINALIZE:%.*]] +; CHECK2: omp_region.finalize: ; CHECK2-NEXT: call void @__kmpc_end_master(ptr @[[GLOB2]], i32 [[OMP_GLOBAL_THREAD_NUM]]) ; CHECK2-NEXT: br label [[OMP_REGION_END]] ; CHECK2: omp.par.exit.exitStub: diff --git a/llvm/test/Transforms/SCCP/get_vector_length-intrinsic.ll b/llvm/test/Transforms/SCCP/get_vector_length-intrinsic.ll new file mode 100644 index 0000000000000..d0741161e729e --- /dev/null +++ b/llvm/test/Transforms/SCCP/get_vector_length-intrinsic.ll @@ -0,0 +1,147 @@ +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 6 +; RUN: opt < %s -p sccp -S | FileCheck %s + +define i1 @result_le_count() { +; CHECK-LABEL: define i1 @result_le_count() { +; CHECK-NEXT: ret i1 true +; + %x = call i32 @llvm.experimental.get.vector.length(i32 3, i32 4, i1 false) + %res = icmp ule i32 %x, 3 + ret i1 %res +} + +define i1 @result_le_max_lanes(i32 %count) { +; CHECK-LABEL: define i1 @result_le_max_lanes( +; CHECK-SAME: i32 [[COUNT:%.*]]) { +; CHECK-NEXT: [[X:%.*]] = call i32 @llvm.experimental.get.vector.length.i32(i32 [[COUNT]], i32 3, i1 false) +; CHECK-NEXT: ret i1 true +; + %x = call i32 @llvm.experimental.get.vector.length(i32 %count, i32 3, i1 false) + %res = icmp ule i32 %x, 3 + ret i1 %res +} + +define i1 @result_le_max_lanes_scalable(i32 %count) vscale_range(2, 4) { +; CHECK-LABEL: define i1 @result_le_max_lanes_scalable( +; CHECK-SAME: i32 [[COUNT:%.*]]) #[[ATTR0:[0-9]+]] { +; CHECK-NEXT: [[X:%.*]] = call i32 @llvm.experimental.get.vector.length.i32(i32 [[COUNT]], i32 4, i1 true) +; CHECK-NEXT: ret i1 true +; + %x = call i32 @llvm.experimental.get.vector.length(i32 %count, i32 4, i1 true) + %res = icmp ule i32 %x, 16 + ret i1 %res +} + +define i32 @count_le_max_lanes() { +; CHECK-LABEL: define i32 @count_le_max_lanes() { +; CHECK-NEXT: [[ENTRY:.*:]] +; CHECK-NEXT: br label 
%[[LOOP:.*]] +; CHECK: [[LOOP]]: +; CHECK-NEXT: br label %[[EXIT:.*]] +; CHECK: [[EXIT]]: +; CHECK-NEXT: ret i32 4 +; +entry: + br label %loop + +loop: + %iv = phi i32 [4, %entry], [%iv.next, %loop] + %x = call i32 @llvm.experimental.get.vector.length(i32 %iv, i32 4, i1 false) + %iv.next = sub i32 %iv, %x + %ec = icmp eq i32 %iv.next, 0 + br i1 %ec, label %exit, label %loop + +exit: + ret i32 %x +} + +; Can't simplify because %iv isn't <= max lanes. +define i32 @count_not_le_max_lanes() { +; CHECK-LABEL: define range(i32 0, 5) i32 @count_not_le_max_lanes() { +; CHECK-NEXT: [[ENTRY:.*]]: +; CHECK-NEXT: br label %[[LOOP:.*]] +; CHECK: [[LOOP]]: +; CHECK-NEXT: [[IV:%.*]] = phi i32 [ 6, %[[ENTRY]] ], [ [[IV_NEXT:%.*]], %[[LOOP]] ] +; CHECK-NEXT: [[X:%.*]] = call i32 @llvm.experimental.get.vector.length.i32(i32 [[IV]], i32 4, i1 false) +; CHECK-NEXT: [[IV_NEXT]] = sub i32 [[IV]], [[X]] +; CHECK-NEXT: [[EC:%.*]] = icmp eq i32 [[IV_NEXT]], 0 +; CHECK-NEXT: br i1 [[EC]], label %[[EXIT:.*]], label %[[LOOP]] +; CHECK: [[EXIT]]: +; CHECK-NEXT: ret i32 [[X]] +; +entry: + br label %loop + +loop: + %iv = phi i32 [6, %entry], [%iv.next, %loop] + %x = call i32 @llvm.experimental.get.vector.length(i32 %iv, i32 4, i1 false) + %iv.next = sub i32 %iv, %x + %ec = icmp eq i32 %iv.next, 0 + br i1 %ec, label %exit, label %loop + +exit: + ret i32 %x +} + +define i32 @count_le_max_lanes_scalable_known() vscale_range(4, 8) { +; CHECK-LABEL: define i32 @count_le_max_lanes_scalable_known( +; CHECK-SAME: ) #[[ATTR1:[0-9]+]] { +; CHECK-NEXT: [[ENTRY:.*:]] +; CHECK-NEXT: br label %[[LOOP:.*]] +; CHECK: [[LOOP]]: +; CHECK-NEXT: br label %[[EXIT:.*]] +; CHECK: [[EXIT]]: +; CHECK-NEXT: ret i32 16 +; +entry: + br label %loop + +loop: + %iv = phi i32 [16, %entry], [%iv.next, %loop] + %x = call i32 @llvm.experimental.get.vector.length(i32 %iv, i32 4, i1 true) + %iv.next = sub i32 %iv, %x + %ec = icmp eq i32 %iv.next, 0 + br i1 %ec, label %exit, label %loop + +exit: + ret i32 %x +} + +; Can't simplify because %iv isn't guaranteed <= max lanes. 
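+; Unlike the vscale_range variants above, this function carries no vscale_range
+; attribute, so the maximum element count (4 x vscale) is unbounded.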
+define i32 @count_le_max_lanes_scalable_unknown() { +; CHECK-LABEL: define range(i32 0, -1) i32 @count_le_max_lanes_scalable_unknown() { +; CHECK-NEXT: [[ENTRY:.*]]: +; CHECK-NEXT: br label %[[LOOP:.*]] +; CHECK: [[LOOP]]: +; CHECK-NEXT: [[IV:%.*]] = phi i32 [ 16, %[[ENTRY]] ], [ [[IV_NEXT:%.*]], %[[LOOP]] ] +; CHECK-NEXT: [[X:%.*]] = call i32 @llvm.experimental.get.vector.length.i32(i32 [[IV]], i32 4, i1 true) +; CHECK-NEXT: [[IV_NEXT]] = sub i32 [[IV]], [[X]] +; CHECK-NEXT: [[EC:%.*]] = icmp eq i32 [[IV_NEXT]], 0 +; CHECK-NEXT: br i1 [[EC]], label %[[EXIT:.*]], label %[[LOOP]] +; CHECK: [[EXIT]]: +; CHECK-NEXT: ret i32 [[X]] +; +entry: + br label %loop + +loop: + %iv = phi i32 [16, %entry], [%iv.next, %loop] + %x = call i32 @llvm.experimental.get.vector.length(i32 %iv, i32 4, i1 true) + %iv.next = sub i32 %iv, %x + %ec = icmp eq i32 %iv.next, 0 + br i1 %ec, label %exit, label %loop + +exit: + ret i32 %x +} + +define i1 @result_le_overflow() { +; CHECK-LABEL: define i1 @result_le_overflow() { +; CHECK-NEXT: [[X:%.*]] = call i32 @llvm.experimental.get.vector.length.i64(i64 4294967296, i32 4, i1 false) +; CHECK-NEXT: [[RES:%.*]] = icmp ule i32 [[X]], 3 +; CHECK-NEXT: ret i1 [[RES]] +; + %x = call i32 @llvm.experimental.get.vector.length(i64 u0x100000000, i32 4, i1 false) + %res = icmp ule i32 %x, 3 + ret i1 %res +} diff --git a/llvm/unittests/CAS/CASTestConfig.h b/llvm/unittests/CAS/CASTestConfig.h index b1c0e59ff2b92..20a95dd2f6aa6 100644 --- a/llvm/unittests/CAS/CASTestConfig.h +++ b/llvm/unittests/CAS/CASTestConfig.h @@ -15,6 +15,11 @@ #include "gtest/gtest.h" #include +#ifdef _WIN32 +#include "llvm/Support/VersionTuple.h" +#include "llvm/Support/Windows/WindowsSupport.h" +#endif + namespace llvm::unittest::cas { class MockEnv { void anchor(); @@ -68,6 +73,10 @@ class CASTest } void SetUp() override { +#ifdef _WIN32 + if (llvm::GetWindowsOSVersion() < llvm::VersionTuple(10, 0, 0, 17763)) + GTEST_SKIP() << "CAS tests skipped on older windows version"; +#endif NextCASIndex = 0; setMaxOnDiskCASMappingSize(); } diff --git a/llvm/unittests/Frontend/OpenMPIRBuilderTest.cpp b/llvm/unittests/Frontend/OpenMPIRBuilderTest.cpp index 1fe32eba9b3c3..9ba4a91755bd1 100644 --- a/llvm/unittests/Frontend/OpenMPIRBuilderTest.cpp +++ b/llvm/unittests/Frontend/OpenMPIRBuilderTest.cpp @@ -429,8 +429,8 @@ TEST_F(OpenMPIRBuilderTest, CreateCancel) { OMPBuilder.createCancel(Loc, nullptr, OMPD_parallel)); Builder.restoreIP(NewIP); EXPECT_FALSE(M->global_empty()); - EXPECT_EQ(M->size(), 4U); - EXPECT_EQ(F->size(), 4U); + EXPECT_EQ(M->size(), 3U); + EXPECT_EQ(F->size(), 5U); EXPECT_EQ(BB->size(), 4U); CallInst *GTID = dyn_cast(&BB->front()); @@ -450,23 +450,16 @@ TEST_F(OpenMPIRBuilderTest, CreateCancel) { Instruction *CancelBBTI = Cancel->getParent()->getTerminator(); EXPECT_EQ(CancelBBTI->getNumSuccessors(), 2U); EXPECT_EQ(CancelBBTI->getSuccessor(0), NewIP.getBlock()); - EXPECT_EQ(CancelBBTI->getSuccessor(1)->size(), 3U); - CallInst *GTID1 = dyn_cast(&CancelBBTI->getSuccessor(1)->front()); - EXPECT_NE(GTID1, nullptr); - EXPECT_EQ(GTID1->arg_size(), 1U); - EXPECT_EQ(GTID1->getCalledFunction()->getName(), "__kmpc_global_thread_num"); - EXPECT_FALSE(GTID1->getCalledFunction()->doesNotAccessMemory()); - EXPECT_FALSE(GTID1->getCalledFunction()->doesNotFreeMemory()); - CallInst *Barrier = dyn_cast(GTID1->getNextNode()); - EXPECT_NE(Barrier, nullptr); - EXPECT_EQ(Barrier->arg_size(), 2U); - EXPECT_EQ(Barrier->getCalledFunction()->getName(), "__kmpc_cancel_barrier"); - 
EXPECT_FALSE(Barrier->getCalledFunction()->doesNotAccessMemory()); - EXPECT_FALSE(Barrier->getCalledFunction()->doesNotFreeMemory()); - EXPECT_TRUE(Barrier->use_empty()); + EXPECT_EQ(CancelBBTI->getSuccessor(1)->size(), 1U); EXPECT_EQ(CancelBBTI->getSuccessor(1)->getTerminator()->getNumSuccessors(), 1U); - EXPECT_EQ(CancelBBTI->getSuccessor(1)->getTerminator()->getSuccessor(0), CBB); + // cancel branch instruction (1) -> .cncl -> .fini -> CBB + EXPECT_EQ(CancelBBTI->getSuccessor(1) + ->getTerminator() + ->getSuccessor(0) + ->getTerminator() + ->getSuccessor(0), + CBB); EXPECT_EQ(cast(Cancel)->getArgOperand(1), GTID); @@ -499,7 +492,7 @@ TEST_F(OpenMPIRBuilderTest, CreateCancelIfCond) { Builder.restoreIP(NewIP); EXPECT_FALSE(M->global_empty()); EXPECT_EQ(M->size(), 4U); - EXPECT_EQ(F->size(), 7U); + EXPECT_EQ(F->size(), 10U); EXPECT_EQ(BB->size(), 1U); ASSERT_TRUE(isa(BB->getTerminator())); ASSERT_EQ(BB->getTerminator()->getNumSuccessors(), 2U); @@ -525,23 +518,15 @@ TEST_F(OpenMPIRBuilderTest, CreateCancelIfCond) { EXPECT_EQ(CancelBBTI->getSuccessor(0)->size(), 1U); EXPECT_EQ(CancelBBTI->getSuccessor(0)->getUniqueSuccessor(), NewIP.getBlock()); - EXPECT_EQ(CancelBBTI->getSuccessor(1)->size(), 3U); - CallInst *GTID1 = dyn_cast(&CancelBBTI->getSuccessor(1)->front()); - EXPECT_NE(GTID1, nullptr); - EXPECT_EQ(GTID1->arg_size(), 1U); - EXPECT_EQ(GTID1->getCalledFunction()->getName(), "__kmpc_global_thread_num"); - EXPECT_FALSE(GTID1->getCalledFunction()->doesNotAccessMemory()); - EXPECT_FALSE(GTID1->getCalledFunction()->doesNotFreeMemory()); - CallInst *Barrier = dyn_cast(GTID1->getNextNode()); - EXPECT_NE(Barrier, nullptr); - EXPECT_EQ(Barrier->arg_size(), 2U); - EXPECT_EQ(Barrier->getCalledFunction()->getName(), "__kmpc_cancel_barrier"); - EXPECT_FALSE(Barrier->getCalledFunction()->doesNotAccessMemory()); - EXPECT_FALSE(Barrier->getCalledFunction()->doesNotFreeMemory()); - EXPECT_TRUE(Barrier->use_empty()); + EXPECT_EQ(CancelBBTI->getSuccessor(1)->size(), 1U); EXPECT_EQ(CancelBBTI->getSuccessor(1)->getTerminator()->getNumSuccessors(), 1U); - EXPECT_EQ(CancelBBTI->getSuccessor(1)->getTerminator()->getSuccessor(0), CBB); + EXPECT_EQ(CancelBBTI->getSuccessor(1) + ->getTerminator() + ->getSuccessor(0) + ->getTerminator() + ->getSuccessor(0), + CBB); EXPECT_EQ(cast(Cancel)->getArgOperand(1), GTID); @@ -573,7 +558,7 @@ TEST_F(OpenMPIRBuilderTest, CreateCancelBarrier) { Builder.restoreIP(NewIP); EXPECT_FALSE(M->global_empty()); EXPECT_EQ(M->size(), 3U); - EXPECT_EQ(F->size(), 4U); + EXPECT_EQ(F->size(), 5U); EXPECT_EQ(BB->size(), 4U); CallInst *GTID = dyn_cast(&BB->front()); @@ -596,7 +581,11 @@ TEST_F(OpenMPIRBuilderTest, CreateCancelBarrier) { EXPECT_EQ(BarrierBBTI->getSuccessor(1)->size(), 1U); EXPECT_EQ(BarrierBBTI->getSuccessor(1)->getTerminator()->getNumSuccessors(), 1U); - EXPECT_EQ(BarrierBBTI->getSuccessor(1)->getTerminator()->getSuccessor(0), + EXPECT_EQ(BarrierBBTI->getSuccessor(1) + ->getTerminator() + ->getSuccessor(0) + ->getTerminator() + ->getSuccessor(0), CBB); EXPECT_EQ(cast(Barrier)->getArgOperand(1), GTID); @@ -1323,8 +1312,8 @@ TEST_F(OpenMPIRBuilderTest, ParallelCancelBarrier) { EXPECT_EQ(NumBodiesGenerated, 1U); EXPECT_EQ(NumPrivatizedVars, 0U); - EXPECT_EQ(NumFinalizationPoints, 2U); - EXPECT_TRUE(FakeDestructor->hasNUses(2)); + EXPECT_EQ(NumFinalizationPoints, 1U); + EXPECT_TRUE(FakeDestructor->hasNUses(1)); Builder.restoreIP(AfterIP); Builder.CreateRetVoid(); @@ -2961,7 +2950,8 @@ TEST_F(OpenMPIRBuilderTest, MasterDirective) { BranchInst *EntryBr = 
cast(EntryBB->getTerminator()); EXPECT_TRUE(EntryBr->isConditional()); EXPECT_EQ(EntryBr->getSuccessor(0), ThenBB); - BasicBlock *ExitBB = ThenBB->getUniqueSuccessor(); + BasicBlock *FinalizeBB = ThenBB->getUniqueSuccessor(); + BasicBlock *ExitBB = FinalizeBB->getUniqueSuccessor(); EXPECT_EQ(EntryBr->getSuccessor(1), ExitBB); CmpInst *CondInst = cast(EntryBr->getCondition()); @@ -2973,7 +2963,7 @@ TEST_F(OpenMPIRBuilderTest, MasterDirective) { EXPECT_TRUE(isa(MasterEntryCI->getArgOperand(0))); CallInst *MasterEndCI = nullptr; - for (auto &FI : *ThenBB) { + for (auto &FI : *FinalizeBB) { Instruction *cur = &FI; if (isa(cur)) { MasterEndCI = cast(cur); @@ -3044,7 +3034,8 @@ TEST_F(OpenMPIRBuilderTest, MaskedDirective) { BranchInst *EntryBr = cast(EntryBB->getTerminator()); EXPECT_TRUE(EntryBr->isConditional()); EXPECT_EQ(EntryBr->getSuccessor(0), ThenBB); - BasicBlock *ExitBB = ThenBB->getUniqueSuccessor(); + BasicBlock *FinalizeBB = ThenBB->getUniqueSuccessor(); + BasicBlock *ExitBB = FinalizeBB->getUniqueSuccessor(); EXPECT_EQ(EntryBr->getSuccessor(1), ExitBB); CmpInst *CondInst = cast(EntryBr->getCondition()); @@ -3056,7 +3047,7 @@ TEST_F(OpenMPIRBuilderTest, MaskedDirective) { EXPECT_TRUE(isa(MaskedEntryCI->getArgOperand(0))); CallInst *MaskedEndCI = nullptr; - for (auto &FI : *ThenBB) { + for (auto &FI : *FinalizeBB) { Instruction *cur = &FI; if (isa(cur)) { MaskedEndCI = cast(cur); @@ -3109,6 +3100,9 @@ TEST_F(OpenMPIRBuilderTest, CriticalDirective) { FINICB_WRAPPER(FiniCB), "testCRT", nullptr)); Builder.restoreIP(AfterIP); + BasicBlock *FinalizeBB = EntryBB->getUniqueSuccessor(); + EXPECT_NE(FinalizeBB, nullptr); + CallInst *CriticalEntryCI = nullptr; for (auto &EI : *EntryBB) { Instruction *cur = &EI; @@ -3125,7 +3119,7 @@ TEST_F(OpenMPIRBuilderTest, CriticalDirective) { EXPECT_TRUE(isa(CriticalEntryCI->getArgOperand(0))); CallInst *CriticalEndCI = nullptr; - for (auto &FI : *EntryBB) { + for (auto &FI : *FinalizeBB) { Instruction *cur = &FI; if (isa(cur)) { CriticalEndCI = cast(cur); @@ -3360,6 +3354,9 @@ TEST_F(OpenMPIRBuilderTest, OrderedDirectiveThreads) { FINICB_WRAPPER(FiniCB), true)); Builder.restoreIP(AfterIP); + BasicBlock *FinalizeBB = EntryBB->getUniqueSuccessor(); + EXPECT_NE(FinalizeBB, nullptr); + Builder.CreateRetVoid(); OMPBuilder.finalize(); EXPECT_FALSE(verifyModule(*M, &errs())); @@ -3382,7 +3379,7 @@ TEST_F(OpenMPIRBuilderTest, OrderedDirectiveThreads) { EXPECT_TRUE(isa(OrderedEntryCI->getArgOperand(0))); CallInst *OrderedEndCI = nullptr; - for (auto &FI : *EntryBB) { + for (auto &FI : *FinalizeBB) { Instruction *Cur = &FI; if (isa(Cur)) { OrderedEndCI = cast(Cur); @@ -3558,7 +3555,8 @@ TEST_F(OpenMPIRBuilderTest, SingleDirective) { BranchInst *EntryBr = cast(EntryBB->getTerminator()); EXPECT_TRUE(EntryBr->isConditional()); EXPECT_EQ(EntryBr->getSuccessor(0), ThenBB); - BasicBlock *ExitBB = ThenBB->getUniqueSuccessor(); + BasicBlock *FinalizeBB = ThenBB->getUniqueSuccessor(); + BasicBlock *ExitBB = FinalizeBB->getUniqueSuccessor(); EXPECT_EQ(EntryBr->getSuccessor(1), ExitBB); CmpInst *CondInst = cast(EntryBr->getCondition()); @@ -3570,7 +3568,7 @@ TEST_F(OpenMPIRBuilderTest, SingleDirective) { EXPECT_TRUE(isa(SingleEntryCI->getArgOperand(0))); CallInst *SingleEndCI = nullptr; - for (auto &FI : *ThenBB) { + for (auto &FI : *FinalizeBB) { Instruction *cur = &FI; if (isa(cur)) { SingleEndCI = cast(cur); @@ -3652,7 +3650,8 @@ TEST_F(OpenMPIRBuilderTest, SingleDirectiveNowait) { BranchInst *EntryBr = cast(EntryBB->getTerminator()); 
EXPECT_TRUE(EntryBr->isConditional()); EXPECT_EQ(EntryBr->getSuccessor(0), ThenBB); - BasicBlock *ExitBB = ThenBB->getUniqueSuccessor(); + BasicBlock *FinalizeBB = ThenBB->getUniqueSuccessor(); + BasicBlock *ExitBB = FinalizeBB->getUniqueSuccessor(); EXPECT_EQ(EntryBr->getSuccessor(1), ExitBB); CmpInst *CondInst = cast(EntryBr->getCondition()); @@ -3664,7 +3663,7 @@ TEST_F(OpenMPIRBuilderTest, SingleDirectiveNowait) { EXPECT_TRUE(isa(SingleEntryCI->getArgOperand(0))); CallInst *SingleEndCI = nullptr; - for (auto &FI : *ThenBB) { + for (auto &FI : *FinalizeBB) { Instruction *cur = &FI; if (isa(cur)) { SingleEndCI = cast(cur); @@ -3776,7 +3775,8 @@ TEST_F(OpenMPIRBuilderTest, SingleDirectiveCopyPrivate) { BranchInst *EntryBr = cast(EntryBB->getTerminator()); EXPECT_TRUE(EntryBr->isConditional()); EXPECT_EQ(EntryBr->getSuccessor(0), ThenBB); - BasicBlock *ExitBB = ThenBB->getUniqueSuccessor(); + BasicBlock *FinalizeBB = ThenBB->getUniqueSuccessor(); + BasicBlock *ExitBB = FinalizeBB->getUniqueSuccessor(); EXPECT_EQ(EntryBr->getSuccessor(1), ExitBB); CmpInst *CondInst = cast(EntryBr->getCondition()); @@ -3795,25 +3795,28 @@ TEST_F(OpenMPIRBuilderTest, SingleDirectiveCopyPrivate) { EXPECT_EQ(PrivLI->getPointerOperand(), PrivAI); // icmp EXPECT_TRUE(ThenBBI.next()); + + // check FinalizeBB + BBInstIter FinalizeBBI(FinalizeBB); // store 1, DidIt - auto *DidItSI = ThenBBI.next(); + auto *DidItSI = FinalizeBBI.next(); EXPECT_NE(DidItSI, nullptr); EXPECT_EQ(DidItSI->getValueOperand(), ConstantInt::get(Type::getInt32Ty(Ctx), 1)); Value *DidIt = DidItSI->getPointerOperand(); // call __kmpc_end_single - auto *SingleEndCI = ThenBBI.next(); + auto *SingleEndCI = FinalizeBBI.next(); EXPECT_NE(SingleEndCI, nullptr); EXPECT_EQ(SingleEndCI->getCalledFunction()->getName(), "__kmpc_end_single"); EXPECT_EQ(SingleEndCI->arg_size(), 2U); EXPECT_TRUE(isa(SingleEndCI->getArgOperand(0))); EXPECT_EQ(SingleEndCI->getArgOperand(1), SingleEntryCI->getArgOperand(1)); // br ExitBB - auto *ExitBBBI = ThenBBI.next(); + auto *ExitBBBI = FinalizeBBI.next(); EXPECT_NE(ExitBBBI, nullptr); EXPECT_TRUE(ExitBBBI->isUnconditional()); EXPECT_EQ(ExitBBBI->getOperand(0), ExitBB); - EXPECT_FALSE(ThenBBI.hasNext()); + EXPECT_FALSE(FinalizeBBI.hasNext()); // check ExitBB BBInstIter ExitBBI(ExitBB); diff --git a/llvm/unittests/IR/ConstantRangeTest.cpp b/llvm/unittests/IR/ConstantRangeTest.cpp index 53d581c8db7c9..13712a76d3edf 100644 --- a/llvm/unittests/IR/ConstantRangeTest.cpp +++ b/llvm/unittests/IR/ConstantRangeTest.cpp @@ -449,6 +449,9 @@ TEST_F(ConstantRangeTest, Trunc) { // trunc([7, 1), 3->2) = [3, 1) ConstantRange SevenOne(APInt(3, 7), APInt(3, 1)); EXPECT_EQ(SevenOne.truncate(2), ConstantRange(APInt(2, 3), APInt(2, 1))); + + ConstantRange Nop = Full.truncate(Full.getBitWidth()); + EXPECT_EQ(Full, Nop); } TEST_F(ConstantRangeTest, TruncNuw) { @@ -527,6 +530,9 @@ TEST_F(ConstantRangeTest, ZExt) { // zext([5, 0), 3->7) = [5, 8) ConstantRange FiveZero(APInt(3, 5), APInt(3, 0)); EXPECT_EQ(FiveZero.zeroExtend(7), ConstantRange(APInt(7, 5), APInt(7, 8))); + + ConstantRange Nop = Full.zeroExtend(Full.getBitWidth()); + EXPECT_EQ(Full, Nop); } TEST_F(ConstantRangeTest, SExt) { @@ -550,6 +556,9 @@ TEST_F(ConstantRangeTest, SExt) { EXPECT_EQ(ConstantRange(APInt(16, 0x0200), APInt(16, 0x8000)).signExtend(19), ConstantRange(APInt(19, 0x0200), APInt(19, 0x8000))); + + ConstantRange Nop = Full.signExtend(Full.getBitWidth()); + EXPECT_EQ(Full, Nop); } TEST_F(ConstantRangeTest, IntersectWith) { diff --git a/llvm/utils/lit/lit/run.py 
b/llvm/utils/lit/lit/run.py
index 3fc4a1b9b40bd..9c54511bfd625 100644
--- a/llvm/utils/lit/lit/run.py
+++ b/llvm/utils/lit/lit/run.py
@@ -7,6 +7,14 @@
 import lit.util
 import lit.worker
 
+# Windows has a limit of 60 workers per pool.
+# This is defined in the multiprocessing module implementation.
+# See: https://github.com/python/cpython/blob/6bc65c30ff1fd0b581a2c93416496fc720bc442c/Lib/concurrent/futures/process.py#L669-L672
+WINDOWS_MAX_WORKERS_PER_POOL = 60
+
+
+def _ceilDiv(a, b):
+    return (a + b - 1) // b
 
 
 class MaxFailuresError(Exception):
     pass
@@ -72,25 +80,65 @@ def _execute(self, deadline):
             if v is not None
         }
 
-        pool = multiprocessing.Pool(
-            self.workers, lit.worker.initialize, (self.lit_config, semaphores)
+        # Windows has a limit of 60 workers per pool, so multiple pools are
+        # needed when more workers than that are requested. The limit can be
+        # overridden with the LIT_WINDOWS_MAX_WORKERS_PER_POOL environment
+        # variable.
+        max_workers_per_pool = (
+            WINDOWS_MAX_WORKERS_PER_POOL if os.name == "nt" else self.workers
         )
+        max_workers_per_pool = int(
+            os.getenv("LIT_WINDOWS_MAX_WORKERS_PER_POOL", max_workers_per_pool)
+        )
-        async_results = [
-            pool.apply_async(
-                lit.worker.execute, args=[test], callback=self.progress_callback
+        num_pools = max(1, _ceilDiv(self.workers, max_workers_per_pool))
+
+        # Distribute self.workers across num_pools as evenly as possible
+        workers_per_pool_list = [self.workers // num_pools] * num_pools
+        for pool_idx in range(self.workers % num_pools):
+            workers_per_pool_list[pool_idx] += 1
+
+        if num_pools > 1:
+            self.lit_config.note(
+                "Using %d pools balancing %d workers total distributed as %s (Windows worker limit workaround)"
+                % (num_pools, self.workers, workers_per_pool_list)
             )
-            for test in self.tests
-        ]
-        pool.close()
+
+        # Create multiple pools
+        pools = []
+        for pool_size in workers_per_pool_list:
+            pool = multiprocessing.Pool(
+                pool_size, lit.worker.initialize, (self.lit_config, semaphores)
+            )
+            pools.append(pool)
+
+        # Distribute tests across pools
+        tests_per_pool = _ceilDiv(len(self.tests), num_pools)
+        async_results = []
+
+        for pool_idx, pool in enumerate(pools):
+            start_idx = pool_idx * tests_per_pool
+            end_idx = min(start_idx + tests_per_pool, len(self.tests))
+            for test in self.tests[start_idx:end_idx]:
+                ar = pool.apply_async(
+                    lit.worker.execute, args=[test], callback=self.progress_callback
+                )
+                async_results.append(ar)
+
+        # Close all pools
+        for pool in pools:
+            pool.close()
 
         try:
             self._wait_for(async_results, deadline)
         except:
-            pool.terminate()
+            # Terminate all pools on exception
+            for pool in pools:
+                pool.terminate()
             raise
         finally:
-            pool.join()
+            # Join all pools
+            for pool in pools:
+                pool.join()
 
     def _wait_for(self, async_results, deadline):
         timeout = deadline - time.time()
diff --git a/llvm/utils/lit/lit/util.py b/llvm/utils/lit/lit/util.py
index e4e031b3e0898..6f25fbc94b757 100644
--- a/llvm/utils/lit/lit/util.py
+++ b/llvm/utils/lit/lit/util.py
@@ -114,11 +114,6 @@ def usable_core_count():
     except AttributeError:
         n = os.cpu_count() or 1
 
-    # On Windows with more than 60 processes, multiprocessing's call to
-    # _winapi.WaitForMultipleObjects() prints an error and lit hangs.
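# Illustrative sketch, not part of the patch: the pool-splitting arithmetic
# added to run.py above, pulled out as a standalone helper. The name
# `distribute` is hypothetical; only the ceil division and the remainder
# spreading mirror the patched code.
#
#   def distribute(workers, max_per_pool):
#       pools = max(1, (workers + max_per_pool - 1) // max_per_pool)
#       sizes = [workers // pools] * pools
#       for i in range(workers % pools):  # earlier pools absorb the remainder
#           sizes[i] += 1
#       return sizes
#
#   distribute(17, 5) == [5, 4, 4, 4] and distribute(20, 15) == [10, 10],
#   matching the expectations in llvm/utils/lit/tests/windows-pools.py.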
- if platform.system() == "Windows": - return min(n, 60) - return n def abs_path_preserve_drive(path): diff --git a/llvm/utils/lit/tests/windows-pools.py b/llvm/utils/lit/tests/windows-pools.py new file mode 100644 index 0000000000000..85110b37c2601 --- /dev/null +++ b/llvm/utils/lit/tests/windows-pools.py @@ -0,0 +1,27 @@ +# Create a directory with 20 files and check the number of pools and workers per pool that lit will use. + +# RUN: rm -Rf %t.dir && mkdir -p %t.dir +# RUN: python -c "for i in range(20): open(rf'%t.dir/file{i}.txt', 'w').write('RUN:')" + +# RUN: echo "import lit.formats" > %t.dir/lit.cfg +# RUN: echo "config.name = \"top-level-suite\"" >> %t.dir/lit.cfg +# RUN: echo "config.suffixes = [\".txt\"]" >> %t.dir/lit.cfg +# RUN: echo "config.test_format = lit.formats.ShTest()" >> %t.dir/lit.cfg + + +# 15 workers per pool max, 100 workers total max: we expect lit to cap the workers to the number of files +# RUN: env "LIT_WINDOWS_MAX_WORKERS_PER_POOL=15" %{lit} -s %t.dir/ -j100 > %t.out 2>&1 +# CHECK: Using 2 pools balancing 20 workers total distributed as [10, 10] +# CHECK: Passed: 20 + +# 5 workers per pool max, 17 workers total max +# RUN: env "LIT_WINDOWS_MAX_WORKERS_PER_POOL=5" %{lit} -s %t.dir/ -j17 >> %t.out 2>&1 +# CHECK: Using 4 pools balancing 17 workers total distributed as [5, 4, 4, 4] +# CHECK: Passed: 20 + +# 19 workers per pool max, 19 workers total max +# RUN: env "LIT_WINDOWS_MAX_WORKERS_PER_POOL=19" %{lit} -s %t.dir/ -j19 >> %t.out 2>&1 +# CHECK-NOT: workers total distributed as +# CHECK: Passed: 20 + +# RUN: cat %t.out | FileCheck %s diff --git a/mlir/lib/Conversion/ArithToAPFloat/ArithToAPFloat.cpp b/mlir/lib/Conversion/ArithToAPFloat/ArithToAPFloat.cpp index 81fbdb1611deb..5c68236526b7d 100644 --- a/mlir/lib/Conversion/ArithToAPFloat/ArithToAPFloat.cpp +++ b/mlir/lib/Conversion/ArithToAPFloat/ArithToAPFloat.cpp @@ -41,15 +41,17 @@ static FuncOp createFnDecl(OpBuilder &b, SymbolOpInterface symTable, } /// Helper function to look up or create the symbol for a runtime library -/// function with the given parameter types. Always returns an int64_t. +/// function with the given parameter types. Returns an int64_t, unless a +/// different result type is specified. static FailureOr lookupOrCreateApFloatFn(OpBuilder &b, SymbolOpInterface symTable, StringRef name, TypeRange paramTypes, - SymbolTableCollection *symbolTables = nullptr) { - auto i64Type = IntegerType::get(symTable->getContext(), 64); - + SymbolTableCollection *symbolTables = nullptr, + Type resultType = {}) { + if (!resultType) + resultType = IntegerType::get(symTable->getContext(), 64); std::string funcName = (llvm::Twine("_mlir_apfloat_") + name).str(); - auto funcT = FunctionType::get(b.getContext(), paramTypes, {i64Type}); + auto funcT = FunctionType::get(b.getContext(), paramTypes, {resultType}); FailureOr func = lookupFnDecl(symTable, funcName, funcT, symbolTables); // Failed due to type mismatch. @@ -308,6 +310,188 @@ struct IntToFpConversion final : OpRewritePattern { bool isUnsigned; }; +struct CmpFOpToAPFloatConversion final : OpRewritePattern { + CmpFOpToAPFloatConversion(MLIRContext *context, SymbolOpInterface symTable, + PatternBenefit benefit = 1) + : OpRewritePattern(context, benefit), symTable(symTable) {} + + LogicalResult matchAndRewrite(arith::CmpFOp op, + PatternRewriter &rewriter) const override { + // Get APFloat function from runtime library. 
+ auto i1Type = IntegerType::get(symTable->getContext(), 1); + auto i8Type = IntegerType::get(symTable->getContext(), 8); + auto i32Type = IntegerType::get(symTable->getContext(), 32); + auto i64Type = IntegerType::get(symTable->getContext(), 64); + FailureOr fn = + lookupOrCreateApFloatFn(rewriter, symTable, "compare", + {i32Type, i64Type, i64Type}, nullptr, i8Type); + if (failed(fn)) + return fn; + + // Cast operands to 64-bit integers. + rewriter.setInsertionPoint(op); + Location loc = op.getLoc(); + auto floatTy = cast(op.getLhs().getType()); + auto intWType = rewriter.getIntegerType(floatTy.getWidth()); + Value lhsBits = arith::ExtUIOp::create( + rewriter, loc, i64Type, + arith::BitcastOp::create(rewriter, loc, intWType, op.getLhs())); + Value rhsBits = arith::ExtUIOp::create( + rewriter, loc, i64Type, + arith::BitcastOp::create(rewriter, loc, intWType, op.getRhs())); + + // Call APFloat function. + Value semValue = getSemanticsValue(rewriter, loc, floatTy); + SmallVector params = {semValue, lhsBits, rhsBits}; + Value comparisonResult = + func::CallOp::create(rewriter, loc, TypeRange(i8Type), + SymbolRefAttr::get(*fn), params) + ->getResult(0); + + // Generate an i1 SSA value that is "true" if the comparison result matches + // the given `val`. + auto checkResult = [&](llvm::APFloat::cmpResult val) { + return arith::CmpIOp::create( + rewriter, loc, arith::CmpIPredicate::eq, comparisonResult, + arith::ConstantOp::create( + rewriter, loc, i8Type, + rewriter.getIntegerAttr(i8Type, static_cast(val))) + .getResult()); + }; + // Generate an i1 SSA value that is "true" if the comparison result matches + // any of the given `vals`. + std::function)> checkResults = + [&](ArrayRef vals) { + Value first = checkResult(vals.front()); + if (vals.size() == 1) + return first; + Value rest = checkResults(vals.drop_front()); + return arith::OrIOp::create(rewriter, loc, first, rest).getResult(); + }; + + // This switch-case statement was taken from arith::applyCmpPredicate. + Value result; + switch (op.getPredicate()) { + case arith::CmpFPredicate::AlwaysFalse: + result = arith::ConstantOp::create(rewriter, loc, i1Type, + rewriter.getIntegerAttr(i1Type, 0)) + .getResult(); + break; + case arith::CmpFPredicate::OEQ: + result = checkResult(llvm::APFloat::cmpEqual); + break; + case arith::CmpFPredicate::OGT: + result = checkResult(llvm::APFloat::cmpGreaterThan); + break; + case arith::CmpFPredicate::OGE: + result = checkResults( + {llvm::APFloat::cmpGreaterThan, llvm::APFloat::cmpEqual}); + break; + case arith::CmpFPredicate::OLT: + result = checkResult(llvm::APFloat::cmpLessThan); + break; + case arith::CmpFPredicate::OLE: + result = + checkResults({llvm::APFloat::cmpLessThan, llvm::APFloat::cmpEqual}); + break; + case arith::CmpFPredicate::ONE: + // Not cmpUnordered and not cmpUnordered. + result = checkResults( + {llvm::APFloat::cmpLessThan, llvm::APFloat::cmpGreaterThan}); + break; + case arith::CmpFPredicate::ORD: + // Not cmpUnordered. 
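+      // ORD holds when neither operand is NaN, i.e. the runtime comparison
+      // returned one of the three ordered results below.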
+ result = checkResults({llvm::APFloat::cmpLessThan, + llvm::APFloat::cmpGreaterThan, + llvm::APFloat::cmpEqual}); + break; + case arith::CmpFPredicate::UEQ: + result = + checkResults({llvm::APFloat::cmpUnordered, llvm::APFloat::cmpEqual}); + break; + case arith::CmpFPredicate::UGT: + result = checkResults( + {llvm::APFloat::cmpUnordered, llvm::APFloat::cmpGreaterThan}); + break; + case arith::CmpFPredicate::UGE: + result = checkResults({llvm::APFloat::cmpUnordered, + llvm::APFloat::cmpGreaterThan, + llvm::APFloat::cmpEqual}); + break; + case arith::CmpFPredicate::ULT: + result = checkResults( + {llvm::APFloat::cmpUnordered, llvm::APFloat::cmpLessThan}); + break; + case arith::CmpFPredicate::ULE: + result = + checkResults({llvm::APFloat::cmpUnordered, llvm::APFloat::cmpLessThan, + llvm::APFloat::cmpEqual}); + break; + case arith::CmpFPredicate::UNE: + // Not cmpEqual. + result = checkResults({llvm::APFloat::cmpLessThan, + llvm::APFloat::cmpGreaterThan, + llvm::APFloat::cmpUnordered}); + break; + case arith::CmpFPredicate::UNO: + result = checkResult(llvm::APFloat::cmpUnordered); + break; + case arith::CmpFPredicate::AlwaysTrue: + result = arith::ConstantOp::create(rewriter, loc, i1Type, + rewriter.getIntegerAttr(i1Type, 1)) + .getResult(); + break; + } + rewriter.replaceOp(op, result); + return success(); + } + + SymbolOpInterface symTable; +}; + +struct NegFOpToAPFloatConversion final : OpRewritePattern { + NegFOpToAPFloatConversion(MLIRContext *context, SymbolOpInterface symTable, + PatternBenefit benefit = 1) + : OpRewritePattern(context, benefit), symTable(symTable) {} + + LogicalResult matchAndRewrite(arith::NegFOp op, + PatternRewriter &rewriter) const override { + // Get APFloat function from runtime library. + auto i32Type = IntegerType::get(symTable->getContext(), 32); + auto i64Type = IntegerType::get(symTable->getContext(), 64); + FailureOr fn = + lookupOrCreateApFloatFn(rewriter, symTable, "neg", {i32Type, i64Type}); + if (failed(fn)) + return fn; + + // Cast operand to 64-bit integer. + rewriter.setInsertionPoint(op); + Location loc = op.getLoc(); + auto floatTy = cast(op.getOperand().getType()); + auto intWType = rewriter.getIntegerType(floatTy.getWidth()); + Value operandBits = arith::ExtUIOp::create( + rewriter, loc, i64Type, arith::BitcastOp::create(rewriter, loc, intWType, op.getOperand())); + + // Call APFloat function. + Value semValue = getSemanticsValue(rewriter, loc, floatTy); + SmallVector params = {semValue, operandBits}; + Value negatedBits = + func::CallOp::create(rewriter, loc, TypeRange(i64Type), + SymbolRefAttr::get(*fn), params) + ->getResult(0); + + // Truncate result to the original width. 
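+    // The runtime call carries the result bits in the low bits of an i64;
+    // truncating to the storage width and bitcasting recovers the float.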
+ Value truncatedBits = arith::TruncIOp::create(rewriter, loc, intWType, + negatedBits); + Value result = + arith::BitcastOp::create(rewriter, loc, floatTy, truncatedBits); + rewriter.replaceOp(op, result); + return success(); + } + + SymbolOpInterface symTable; +}; + namespace { struct ArithToAPFloatConversionPass final : impl::ArithToAPFloatConversionPassBase { @@ -329,8 +513,17 @@ void ArithToAPFloatConversionPass::runOnOperation() { context, "divide", getOperation()); patterns.add>( context, "remainder", getOperation()); + patterns.add>( + context, "minnum", getOperation()); + patterns.add>( + context, "maxnum", getOperation()); + patterns.add>( + context, "minimum", getOperation()); + patterns.add>( + context, "maximum", getOperation()); patterns - .add, FpToFpConversion>( + .add, FpToFpConversion, + CmpFOpToAPFloatConversion, NegFOpToAPFloatConversion>( context, getOperation()); patterns.add>(context, getOperation(), /*isUnsigned=*/false); diff --git a/mlir/lib/Dialect/LLVMIR/IR/NVVMDialect.cpp b/mlir/lib/Dialect/LLVMIR/IR/NVVMDialect.cpp index d3c305555fde8..b98f15cfe6d75 100644 --- a/mlir/lib/Dialect/LLVMIR/IR/NVVMDialect.cpp +++ b/mlir/lib/Dialect/LLVMIR/IR/NVVMDialect.cpp @@ -4707,16 +4707,20 @@ LogicalResult NVVMTargetAttr::verifyTarget(Operation *gpuModule) { "Minimum NVVM target SM version is sm_20"); } - gpuModuleOp->walk([&](Operation *op) { - if (auto reqOp = llvm::dyn_cast(op)) { - const NVVMCheckSMVersion requirement = reqOp.getRequiredMinSMVersion(); - if (!requirement.isCompatibleWith(targetSMVersion)) { - op->emitOpError() << "is not supported on " << getChip(); - return WalkResult::interrupt(); - } - } - return WalkResult::advance(); - }); + if (gpuModuleOp + ->walk([&](Operation *op) { + if (auto reqOp = llvm::dyn_cast(op)) { + const NVVMCheckSMVersion requirement = + reqOp.getRequiredMinSMVersion(); + if (!requirement.isCompatibleWith(targetSMVersion)) { + op->emitOpError() << "is not supported on " << getChip(); + return WalkResult::interrupt(); + } + } + return WalkResult::advance(); + }) + .wasInterrupted()) + return failure(); return success(); } diff --git a/mlir/lib/Dialect/Linalg/Utils/Utils.cpp b/mlir/lib/Dialect/Linalg/Utils/Utils.cpp index e85a2ab26bd32..01e6e1e248658 100644 --- a/mlir/lib/Dialect/Linalg/Utils/Utils.cpp +++ b/mlir/lib/Dialect/Linalg/Utils/Utils.cpp @@ -430,19 +430,33 @@ static bool convLayoutMatches(ArrayRef> mapListExpected, }))); } -/// Enum of all kinds of Pooling Op's type. -enum PoolingType { - NONE, - MAX_SIGNED, - MAX_UNSIGNED, - MIN_SIGNED, - MIN_UNSIGNED, - SUM +/// Enum representing pooling operation types used by ConvMatcherBuilder. +enum class PoolingType { + None, + MaxSigned, + MaxUnsigned, + MinSigned, + MinUnsigned, + Sum }; /// Helper class for building convolution op matchers with minimal boilerplate. /// Reduces repetitive code across Conv1D/2D/3D and Depthwise variants as well /// as Pooling ops. +/// +/// Usage: Create an instance with the op, spatial rank, and output pointers for +/// extracted dilations/strides. Then chain matchStride() calls for each spatial +/// dimension, followed by matchMaps() to verify indexing maps, and finally +/// matchBody() to verify the operation body pattern. +/// +/// The `matched` flag starts as `true` and is set to `false` if any match step +/// fails. This allows chaining multiple match calls; once any match fails, all +/// subsequent calls become no-ops and the final result is `false`. 
+/// +/// The `dilations` and `strides` pointers are output parameters that get +/// populated with the extracted dilation and stride values from the operation's +/// indexing maps during matchStride() calls. These values are initially set to +/// 1 for each spatial dimension and updated as patterns are matched. class ConvMatcherBuilder { LinalgOp op; MLIRContext *ctx; @@ -454,7 +468,7 @@ class ConvMatcherBuilder { public: ConvMatcherBuilder(LinalgOp op, unsigned spatialRank, SmallVector *d, SmallVector *s, - PoolingType poolingType = PoolingType::NONE) + PoolingType poolingType = PoolingType::None) : op(op), ctx(op->getContext()), dilations(d), strides(s), indexingMaps(op.getIndexingMaps()), poolingType(poolingType) { *dilations = SmallVector(spatialRank, 1); @@ -474,16 +488,16 @@ class ConvMatcherBuilder { ConvMatcherBuilder &matchStride(unsigned iDim, unsigned fDim, unsigned oDim, unsigned idx) { if (matched) { - matched = matchConvDimAddExprPattern(indexingMaps, iDim, fDim, oDim, - (*dilations)[idx], (*strides)[idx]); + matched &= matchConvDimAddExprPattern(indexingMaps, iDim, fDim, oDim, + (*dilations)[idx], (*strides)[idx]); } return *this; } /// Match expected indexing maps layout. Returns *this for method chaining. - ConvMatcherBuilder &expectMaps(ArrayRef> maps) { + ConvMatcherBuilder &matchMaps(ArrayRef> maps) { if (matched) - matched = convLayoutMatches(maps, indexingMaps, ctx); + matched &= convLayoutMatches(maps, indexingMaps, ctx); return *this; } @@ -494,17 +508,17 @@ class ConvMatcherBuilder { Block *body = op.getBlock(); auto yieldOp = cast(body->getTerminator()); switch (poolingType) { - case PoolingType::NONE: + case PoolingType::None: return bodyMatcherForConvolutionOps(yieldOp.getOperand(0), body); - case PoolingType::MAX_SIGNED: + case PoolingType::MaxSigned: return bodyMatcherForMaxSignedPoolOps(yieldOp.getOperand(0), body); - case PoolingType::MAX_UNSIGNED: + case PoolingType::MaxUnsigned: return bodyMatcherForMaxUnsignedPoolOps(yieldOp.getOperand(0), body); - case PoolingType::MIN_SIGNED: + case PoolingType::MinSigned: return bodyMatcherForMinSignedPoolOps(yieldOp.getOperand(0), body); - case PoolingType::MIN_UNSIGNED: + case PoolingType::MinUnsigned: return bodyMatcherForMinUnsignedPoolOps(yieldOp.getOperand(0), body); - case PoolingType::SUM: + case PoolingType::Sum: return bodyMatcherForSumPoolOps(yieldOp.getOperand(0), body); } return false; @@ -533,9 +547,9 @@ bool isaConvolutionOpOfType(LinalgOp op, AffineExpr w = m.dim(1); return m.matchStride(/*iDim=*/0, /*fDim=*/0, /*oDim=*/0, /*idx=*/0) - .expectMaps({/*inputMap=*/{m.strided(W, w, 0)}, - /*filterMap=*/{w}, - /*outputMap=*/{W}}) + .matchMaps({/*inputMap=*/{m.strided(W, w, 0)}, + /*filterMap=*/{w}, + /*outputMap=*/{W}}) .matchBody(); } @@ -560,9 +574,9 @@ bool isaConvolutionOpOfType( AffineExpr c = m.dim(4); return m.matchStride(/*iDim=*/1, /*fDim=*/0, /*oDim=*/1, /*idx=*/0) - .expectMaps({/*inputMap=*/{N, m.strided(W, w, 0), c}, - /*filterMap=*/{w, c, F}, - /*outputMap=*/{N, W, F}}) + .matchMaps({/*inputMap=*/{N, m.strided(W, w, 0), c}, + /*filterMap=*/{w, c, F}, + /*outputMap=*/{N, W, F}}) .matchBody(); } @@ -587,9 +601,9 @@ bool isaConvolutionOpOfType( AffineExpr w = m.dim(4); return m.matchStride(/*iDim=*/2, /*fDim=*/2, /*oDim=*/2, /*idx=*/0) - .expectMaps({/*inputMap=*/{N, c, m.strided(W, w, 0)}, - /*filterMap=*/{F, c, w}, - /*outputMap=*/{N, F, W}}) + .matchMaps({/*inputMap=*/{N, c, m.strided(W, w, 0)}, + /*filterMap=*/{F, c, w}, + /*outputMap=*/{N, F, W}}) .matchBody(); } @@ -614,9 +628,9 @@ bool 
isaConvolutionOpOfType(LinalgOp op, return m.matchStride(/*iDim=*/0, /*fDim=*/0, /*oDim=*/0, /*idx=*/0) .matchStride(/*iDim=*/1, /*fDim=*/1, /*oDim=*/1, /*idx=*/1) - .expectMaps({/*inputMap=*/{m.strided(H, h, 0), m.strided(W, w, 1)}, - /*filterMap=*/{h, w}, - /*outputMap=*/{H, W}}) + .matchMaps({/*inputMap=*/{m.strided(H, h, 0), m.strided(W, w, 1)}, + /*filterMap=*/{h, w}, + /*outputMap=*/{H, W}}) .matchBody(); } @@ -644,10 +658,10 @@ bool isaConvolutionOpOfType(LinalgOp op, return m.matchStride(/*iDim=*/0, /*fDim=*/0, /*oDim=*/0, /*idx=*/0) .matchStride(/*iDim=*/1, /*fDim=*/1, /*oDim=*/1, /*idx=*/1) .matchStride(/*iDim=*/2, /*fDim=*/2, /*oDim=*/2, /*idx=*/2) - .expectMaps({/*inputMap=*/{m.strided(D, d, 0), m.strided(H, h, 1), - m.strided(W, w, 2)}, - /*filterMap=*/{d, h, w}, - /*outputMap=*/{D, H, W}}) + .matchMaps({/*inputMap=*/{m.strided(D, d, 0), m.strided(H, h, 1), + m.strided(W, w, 2)}, + /*filterMap=*/{d, h, w}, + /*outputMap=*/{D, H, W}}) .matchBody(); } @@ -671,9 +685,9 @@ bool isaConvolutionOpOfType( AffineExpr w = m.dim(3); return m.matchStride(/*iDim=*/2, /*fDim=*/1, /*oDim=*/2, /*idx=*/0) - .expectMaps({/*inputMap=*/{N, C, m.strided(W, w, 0)}, - /*filterMap=*/{C, w}, - /*outputMap=*/{N, C, W}}) + .matchMaps({/*inputMap=*/{N, C, m.strided(W, w, 0)}, + /*filterMap=*/{C, w}, + /*outputMap=*/{N, C, W}}) .matchBody(); } @@ -697,9 +711,9 @@ bool isaConvolutionOpOfType( AffineExpr w = m.dim(3); return m.matchStride(/*iDim=*/1, /*fDim=*/0, /*oDim=*/1, /*idx=*/0) - .expectMaps({/*inputMap=*/{N, m.strided(W, w, 0), C}, - /*filterMap=*/{w, C}, - /*outputMap=*/{N, W, C}}) + .matchMaps({/*inputMap=*/{N, m.strided(W, w, 0), C}, + /*filterMap=*/{w, C}, + /*outputMap=*/{N, W, C}}) .matchBody(); } @@ -724,9 +738,9 @@ bool isaConvolutionOpOfType( AffineExpr w = m.dim(4); return m.matchStride(/*iDim=*/1, /*fDim=*/0, /*oDim=*/1, /*idx=*/0) - .expectMaps({/*inputMap=*/{N, m.strided(W, w, 0), C}, - /*filterMap=*/{w, C, CM}, - /*outputMap=*/{N, W, C, CM}}) + .matchMaps({/*inputMap=*/{N, m.strided(W, w, 0), C}, + /*filterMap=*/{w, C, CM}, + /*outputMap=*/{N, W, C, CM}}) .matchBody(); } @@ -753,9 +767,9 @@ bool isaConvolutionOpOfType( return m.matchStride(/*iDim=*/2, /*fDim=*/1, /*oDim=*/2, /*idx=*/0) .matchStride(/*iDim=*/3, /*fDim=*/2, /*oDim=*/3, /*idx=*/1) - .expectMaps({/*inputMap=*/{N, C, m.strided(H, h, 0), m.strided(W, w, 1)}, - /*filterMap=*/{C, h, w}, - /*outputMap=*/{N, C, H, W}}) + .matchMaps({/*inputMap=*/{N, C, m.strided(H, h, 0), m.strided(W, w, 1)}, + /*filterMap=*/{C, h, w}, + /*outputMap=*/{N, C, H, W}}) .matchBody(); } @@ -789,10 +803,10 @@ bool isaConvolutionOpOfType( return m.matchStride(/*iDim=*/1, /*fDim=*/0, /*oDim=*/1, /*idx=*/0) .matchStride(/*iDim=*/2, /*fDim=*/1, /*oDim=*/2, /*idx=*/1) .matchStride(/*iDim=*/3, /*fDim=*/2, /*oDim=*/3, /*idx=*/2) - .expectMaps({/*inputMap=*/{N, m.strided(D, d, 0), m.strided(H, h, 1), - m.strided(W, w, 2), C}, - /*filterMap=*/{d, h, w, C, CM}, - /*outputMap=*/{N, D, H, W, C, CM}}) + .matchMaps({/*inputMap=*/{N, m.strided(D, d, 0), m.strided(H, h, 1), + m.strided(W, w, 2), C}, + /*filterMap=*/{d, h, w, C, CM}, + /*outputMap=*/{N, D, H, W, C, CM}}) .matchBody(); } @@ -810,7 +824,7 @@ bool isaConvolutionOpOfType( "expected op to implement ConvolutionOpInterface"); ConvMatcherBuilder m(op, /*spatialRank=*/2, dilations, strides, - PoolingType::MAX_SIGNED); + PoolingType::MaxSigned); AffineExpr N = m.dim(0); AffineExpr H = m.dim(1); AffineExpr W = m.dim(2); @@ -820,9 +834,9 @@ bool isaConvolutionOpOfType( return m.matchStride(/*iDim=*/1, /*fDim=*/0, 
/*oDim=*/1, /*idx=*/0) .matchStride(/*iDim=*/2, /*fDim=*/1, /*oDim=*/2, /*idx=*/1) - .expectMaps({/*inputMap=*/{N, m.strided(H, h, 0), m.strided(W, w, 1), C}, - /*filterMap=*/{h, w}, - /*outputMap=*/{N, H, W, C}}) + .matchMaps({/*inputMap=*/{N, m.strided(H, h, 0), m.strided(W, w, 1), C}, + /*filterMap=*/{h, w}, + /*outputMap=*/{N, H, W, C}}) .matchBody(); } @@ -840,7 +854,7 @@ bool isaConvolutionOpOfType( "expected op to implement ConvolutionOpInterface"); ConvMatcherBuilder m(op, /*spatialRank=*/2, dilations, strides, - PoolingType::MIN_SIGNED); + PoolingType::MinSigned); AffineExpr N = m.dim(0); AffineExpr H = m.dim(1); AffineExpr W = m.dim(2); @@ -850,9 +864,9 @@ bool isaConvolutionOpOfType( return m.matchStride(/*iDim=*/1, /*fDim=*/0, /*oDim=*/1, /*idx=*/0) .matchStride(/*iDim=*/2, /*fDim=*/1, /*oDim=*/2, /*idx=*/1) - .expectMaps({/*inputMap=*/{N, m.strided(H, h, 0), m.strided(W, w, 1), C}, - /*filterMap=*/{h, w}, - /*outputMap=*/{N, H, W, C}}) + .matchMaps({/*inputMap=*/{N, m.strided(H, h, 0), m.strided(W, w, 1), C}, + /*filterMap=*/{h, w}, + /*outputMap=*/{N, H, W, C}}) .matchBody(); } @@ -870,7 +884,7 @@ bool isaConvolutionOpOfType( "expected op to implement ConvolutionOpInterface"); ConvMatcherBuilder m(op, /*spatialRank=*/2, dilations, strides, - PoolingType::SUM); + PoolingType::Sum); AffineExpr N = m.dim(0); AffineExpr H = m.dim(1); AffineExpr W = m.dim(2); @@ -880,9 +894,9 @@ bool isaConvolutionOpOfType( return m.matchStride(/*iDim=*/1, /*fDim=*/0, /*oDim=*/1, /*idx=*/0) .matchStride(/*iDim=*/2, /*fDim=*/1, /*oDim=*/2, /*idx=*/1) - .expectMaps({/*inputMap=*/{N, m.strided(H, h, 0), m.strided(W, w, 1), C}, - /*filterMap=*/{h, w}, - /*outputMap=*/{N, H, W, C}}) + .matchMaps({/*inputMap=*/{N, m.strided(H, h, 0), m.strided(W, w, 1), C}, + /*filterMap=*/{h, w}, + /*outputMap=*/{N, H, W, C}}) .matchBody(); } @@ -900,7 +914,7 @@ bool isaConvolutionOpOfType( "expected op to implement ConvolutionOpInterface"); ConvMatcherBuilder m(op, /*spatialRank=*/2, dilations, strides, - PoolingType::MAX_UNSIGNED); + PoolingType::MaxUnsigned); AffineExpr N = m.dim(0); AffineExpr H = m.dim(1); AffineExpr W = m.dim(2); @@ -910,9 +924,9 @@ bool isaConvolutionOpOfType( return m.matchStride(/*iDim=*/1, /*fDim=*/0, /*oDim=*/1, /*idx=*/0) .matchStride(/*iDim=*/2, /*fDim=*/1, /*oDim=*/2, /*idx=*/1) - .expectMaps({/*inputMap=*/{N, m.strided(H, h, 0), m.strided(W, w, 1), C}, - /*filterMap=*/{h, w}, - /*outputMap=*/{N, H, W, C}}) + .matchMaps({/*inputMap=*/{N, m.strided(H, h, 0), m.strided(W, w, 1), C}, + /*filterMap=*/{h, w}, + /*outputMap=*/{N, H, W, C}}) .matchBody(); } @@ -930,7 +944,7 @@ bool isaConvolutionOpOfType( "expected op to implement ConvolutionOpInterface"); ConvMatcherBuilder m(op, /*spatialRank=*/2, dilations, strides, - PoolingType::MIN_UNSIGNED); + PoolingType::MinUnsigned); AffineExpr N = m.dim(0); AffineExpr H = m.dim(1); AffineExpr W = m.dim(2); @@ -940,9 +954,9 @@ bool isaConvolutionOpOfType( return m.matchStride(/*iDim=*/1, /*fDim=*/0, /*oDim=*/1, /*idx=*/0) .matchStride(/*iDim=*/2, /*fDim=*/1, /*oDim=*/2, /*idx=*/1) - .expectMaps({/*inputMap=*/{N, m.strided(H, h, 0), m.strided(W, w, 1), C}, - /*filterMap=*/{h, w}, - /*outputMap=*/{N, H, W, C}}) + .matchMaps({/*inputMap=*/{N, m.strided(H, h, 0), m.strided(W, w, 1), C}, + /*filterMap=*/{h, w}, + /*outputMap=*/{N, H, W, C}}) .matchBody(); } diff --git a/mlir/lib/Dialect/SCF/IR/CMakeLists.txt b/mlir/lib/Dialect/SCF/IR/CMakeLists.txt index 423e1c3e1e042..b111117410ba3 100644 --- a/mlir/lib/Dialect/SCF/IR/CMakeLists.txt +++ 
b/mlir/lib/Dialect/SCF/IR/CMakeLists.txt @@ -19,5 +19,5 @@ add_mlir_dialect_library(MLIRSCFDialect MLIRSideEffectInterfaces MLIRTensorDialect MLIRValueBoundsOpInterface + MLIRTransformUtils ) - diff --git a/mlir/lib/Dialect/SCF/IR/SCF.cpp b/mlir/lib/Dialect/SCF/IR/SCF.cpp index a09b9639f8dd1..a63e44f4808e0 100644 --- a/mlir/lib/Dialect/SCF/IR/SCF.cpp +++ b/mlir/lib/Dialect/SCF/IR/SCF.cpp @@ -26,6 +26,7 @@ #include "mlir/Interfaces/ParallelCombiningOpInterface.h" #include "mlir/Interfaces/ValueBoundsOpInterface.h" #include "mlir/Transforms/InliningUtils.h" +#include "mlir/Transforms/RegionUtils.h" #include "llvm/ADT/MapVector.h" #include "llvm/ADT/STLExtras.h" #include "llvm/ADT/SmallPtrSet.h" @@ -3682,6 +3683,133 @@ LogicalResult scf::WhileOp::verify() { } namespace { +/// Move an scf.if op that is directly before the scf.condition op in the while +/// before region, and whose condition matches the condition of the +/// scf.condition op, down into the while after region. +/// +/// scf.while (..) : (...) -> ... { +/// %additional_used_values = ... +/// %cond = ... +/// ... +/// %res = scf.if %cond -> (...) { +/// use(%additional_used_values) +/// ... // then block +/// scf.yield %then_value +/// } else { +/// scf.yield %else_value +/// } +/// scf.condition(%cond) %res, ... +/// } do { +/// ^bb0(%res_arg, ...): +/// use(%res_arg) +/// ... +/// +/// becomes +/// scf.while (..) : (...) -> ... { +/// %additional_used_values = ... +/// %cond = ... +/// ... +/// scf.condition(%cond) %else_value, ..., %additional_used_values +/// } do { +/// ^bb0(%res_arg, ..., %additional_args): +/// use(%additional_args) +/// ... // if then block +/// use(%then_value) +/// ... +struct WhileMoveIfDown : public OpRewritePattern<scf::WhileOp> { + using OpRewritePattern::OpRewritePattern; + + LogicalResult matchAndRewrite(scf::WhileOp op, + PatternRewriter &rewriter) const override { + auto conditionOp = op.getConditionOp(); + + // Only support an ifOp right before the condition at the moment. Relaxing + // this would require us to: + // - check that the body does not have side-effects conflicting with + // operations between the if and the condition. + // - check that results of the if operation are only used as arguments to + // the condition. + auto ifOp = dyn_cast_or_null<scf::IfOp>(conditionOp->getPrevNode()); + + // Check that the ifOp is directly before the conditionOp and that it + // matches the condition of the conditionOp. Also ensure that the ifOp has + // no else block with content, as that would complicate the transformation. + // TODO: support else blocks with content. + if (!ifOp || ifOp.getCondition() != conditionOp.getCondition() || + (ifOp.elseBlock() && !ifOp.elseBlock()->without_terminator().empty())) + return failure(); + + assert((ifOp->use_empty() || (llvm::all_equal(ifOp->getUsers()) && + *ifOp->user_begin() == conditionOp)) && + "ifOp has unexpected uses"); + + Location loc = op.getLoc(); + + // Replace uses of ifOp results in the conditionOp with the yielded values + // from the ifOp branches. + for (auto [idx, arg] : llvm::enumerate(conditionOp.getArgs())) { + auto it = llvm::find(ifOp->getResults(), arg); + if (it != ifOp->getResults().end()) { + size_t ifOpIdx = it.getIndex(); + Value thenValue = ifOp.thenYield()->getOperand(ifOpIdx); + Value elseValue = ifOp.elseYield()->getOperand(ifOpIdx); + + rewriter.replaceAllUsesWith(ifOp->getResults()[ifOpIdx], elseValue); + rewriter.replaceAllUsesWith(op.getAfterArguments()[idx], thenValue); + } + } + + // Collect additional used values from before region.
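+ // These are values defined in the before region and used inside the then + // block; once that block is inlined into the after region they would no + // longer be in scope, so they must be forwarded through scf.condition as + // extra arguments. Values defined in enclosing regions stay visible from + // the after region and need no forwarding.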
+ SetVector<Value> additionalUsedValuesSet; + visitUsedValuesDefinedAbove(ifOp.getThenRegion(), [&](OpOperand *operand) { + if (&op.getBefore() == operand->get().getParentRegion()) + additionalUsedValuesSet.insert(operand->get()); + }); + + // Create new whileOp with additional used values as results. + auto additionalUsedValues = additionalUsedValuesSet.getArrayRef(); + auto additionalValueTypes = llvm::map_to_vector( + additionalUsedValues, [](Value val) { return val.getType(); }); + size_t additionalValueSize = additionalUsedValues.size(); + SmallVector<Type> newResultTypes(op.getResultTypes()); + newResultTypes.append(additionalValueTypes); + + auto newWhileOp = + scf::WhileOp::create(rewriter, loc, newResultTypes, op.getInits()); + + rewriter.modifyOpInPlace(newWhileOp, [&] { + newWhileOp.getBefore().takeBody(op.getBefore()); + newWhileOp.getAfter().takeBody(op.getAfter()); + newWhileOp.getAfter().addArguments( + additionalValueTypes, + SmallVector<Location>(additionalValueSize, loc)); + }); + + rewriter.modifyOpInPlace(conditionOp, [&] { + conditionOp.getArgsMutable().append(additionalUsedValues); + }); + + // Replace uses of additional used values inside the ifOp then region with + // the whileOp after region arguments. + rewriter.replaceUsesWithIf( + additionalUsedValues, + newWhileOp.getAfterArguments().take_back(additionalValueSize), + [&](OpOperand &use) { + return ifOp.getThenRegion().isAncestor( + use.getOwner()->getParentRegion()); + }); + + // Inline ifOp then region into new whileOp after region. + rewriter.eraseOp(ifOp.thenYield()); + rewriter.inlineBlockBefore(ifOp.thenBlock(), newWhileOp.getAfterBody(), + newWhileOp.getAfterBody()->begin()); + rewriter.eraseOp(ifOp); + rewriter.replaceOp(op, + newWhileOp->getResults().drop_back(additionalValueSize)); + return success(); + } +}; + /// Replace uses of the condition within the do block with true, since otherwise /// the block would not be evaluated. /// @@ -4394,7 +4522,8 @@ void WhileOp::getCanonicalizationPatterns(RewritePatternSet &results, results.add<RemoveLoopInvariantArgsFromBeforeBlock, RemoveLoopInvariantValueYielded, WhileConditionTruth, WhileCmpCond, WhileUnusedResult, WhileRemoveDuplicatedResults, - WhileRemoveUnusedArgs, WhileOpAlignBeforeArgs>(context); + WhileRemoveUnusedArgs, WhileOpAlignBeforeArgs, WhileMoveIfDown>( + context); } //===----------------------------------------------------------------------===// diff --git a/mlir/lib/Dialect/SCF/Transforms/UpliftWhileToFor.cpp b/mlir/lib/Dialect/SCF/Transforms/UpliftWhileToFor.cpp index 9f242f9e62b8e..ec1044aaa42ac 100644 --- a/mlir/lib/Dialect/SCF/Transforms/UpliftWhileToFor.cpp +++ b/mlir/lib/Dialect/SCF/Transforms/UpliftWhileToFor.cpp @@ -19,83 +19,6 @@ using namespace mlir; namespace { -/// Move an scf.if op that is directly before the scf.condition op in the while -/// before region, and whose condition matches the condition of the -/// scf.condition op, down into the while after region. -/// -/// scf.while (%init) : (...) -> ... { -/// %cond = ... -/// %res = scf.if %cond -> (...) { -/// use1(%init) -/// %then_val = ... -/// ... // then block -/// scf.yield %then_val -/// } else { -/// scf.yield %init -/// } -/// scf.condition(%cond) %res -/// } do { -/// ^bb0(%arg): -/// use2(%arg) -/// ... -/// -/// becomes -/// scf.while (%init) : (...) -> ... { -/// %cond = ... -/// scf.condition(%cond) %init -/// } do { -/// ^bb0(%arg): : -/// use1(%arg) -/// ... // if then block -/// %then_val = ... -/// use2(%then_val) -/// ...
-struct WhileMoveIfDown : public OpRewritePattern<scf::WhileOp> { - using OpRewritePattern::OpRewritePattern; - - LogicalResult matchAndRewrite(scf::WhileOp op, - PatternRewriter &rewriter) const override { - // Check that the first opeation produces one result and that result must - // have exactly two uses (these two uses come from the `scf.if` and - // `scf.condition` operations). - Operation &condOp = op.getBeforeBody()->front(); - if (condOp.getNumResults() != 1 || !condOp.getResult(0).hasNUses(2)) - return failure(); - - Value condVal = condOp.getResult(0); - auto ifOp = dyn_cast<scf::IfOp>(condOp.getNextNode()); - if (!ifOp || ifOp.getCondition() != condVal) - return failure(); - - auto term = dyn_cast<scf::ConditionOp>(ifOp->getNextNode()); - if (!term || term.getCondition() != condVal) - return failure(); - - // Check that if results and else yield operands match the scf.condition op - // arguments and while before arguments respectively. - if (!llvm::equal(ifOp->getResults(), term.getArgs()) || - !llvm::equal(ifOp.elseYield()->getOperands(), op.getBeforeArguments())) - return failure(); - - // Update uses and move the if op into the after region. - rewriter.replaceAllUsesWith(op.getAfterArguments(), - ifOp.thenYield()->getOperands()); - rewriter.replaceUsesWithIf(op.getBeforeArguments(), op.getAfterArguments(), - [&](OpOperand &use) { - return ifOp.getThenRegion().isAncestor( - use.getOwner()->getParentRegion()); - }); - rewriter.modifyOpInPlace( - term, [&]() { term.getArgsMutable().assign(op.getBeforeArguments()); }); - - rewriter.eraseOp(ifOp.thenYield()); - rewriter.inlineBlockBefore(ifOp.thenBlock(), op.getAfterBody(), - op.getAfterBody()->begin()); - rewriter.eraseOp(ifOp); - return success(); - } -}; - struct UpliftWhileOp : public OpRewritePattern<scf::WhileOp> { using OpRewritePattern::OpRewritePattern; @@ -344,5 +267,5 @@ FailureOr<scf::ForOp> mlir::scf::upliftWhileToForLoop(RewriterBase &rewriter, } void mlir::scf::populateUpliftWhileToForPatterns(RewritePatternSet &patterns) { - patterns.add<UpliftWhileOp, WhileMoveIfDown>(patterns.getContext()); + patterns.add<UpliftWhileOp>(patterns.getContext()); } diff --git a/mlir/lib/ExecutionEngine/APFloatWrappers.cpp b/mlir/lib/ExecutionEngine/APFloatWrappers.cpp index 44980ccd77491..f3e38eb8ffa2d 100644 --- a/mlir/lib/ExecutionEngine/APFloatWrappers.cpp +++ b/mlir/lib/ExecutionEngine/APFloatWrappers.cpp @@ -131,4 +131,44 @@ MLIR_APFLOAT_WRAPPERS_EXPORT uint64_t _mlir_apfloat_convert_from_int( llvm::RoundingMode::NearestTiesToEven); return result.bitcastToAPInt().getZExtValue(); } + +MLIR_APFLOAT_WRAPPERS_EXPORT int8_t _mlir_apfloat_compare(int32_t semantics, + uint64_t a, + uint64_t b) { + const llvm::fltSemantics &sem = llvm::APFloatBase::EnumToSemantics( + static_cast<llvm::APFloatBase::Semantics>(semantics)); + unsigned bitWidth = llvm::APFloatBase::semanticsSizeInBits(sem); + llvm::APFloat x(sem, llvm::APInt(bitWidth, a)); + llvm::APFloat y(sem, llvm::APInt(bitWidth, b)); + return static_cast<int8_t>(x.compare(y)); +} + +MLIR_APFLOAT_WRAPPERS_EXPORT uint64_t _mlir_apfloat_neg(int32_t semantics, uint64_t a) { + const llvm::fltSemantics &sem = llvm::APFloatBase::EnumToSemantics( + static_cast<llvm::APFloatBase::Semantics>(semantics)); + unsigned bitWidth = llvm::APFloatBase::semanticsSizeInBits(sem); + llvm::APFloat x(sem, llvm::APInt(bitWidth, a)); + x.changeSign(); + return x.bitcastToAPInt().getZExtValue(); +} + +/// Min/max operations.
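+/// Each generated wrapper decodes the fltSemantics from the semantics enum, +/// reconstructs both operands from their raw bit patterns, applies the +/// corresponding llvm::minimum, llvm::maximum, llvm::minnum, or llvm::maxnum +/// helper, and returns the result's bit pattern; e.g. +/// APFLOAT_MIN_MAX_OP(minimum) emits the exported symbol +/// _mlir_apfloat_minimum.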
+#define APFLOAT_MIN_MAX_OP(OP) \ + MLIR_APFLOAT_WRAPPERS_EXPORT uint64_t _mlir_apfloat_##OP( \ + int32_t semantics, uint64_t a, uint64_t b) { \ + const llvm::fltSemantics &sem = llvm::APFloatBase::EnumToSemantics( \ + static_cast<llvm::APFloatBase::Semantics>(semantics)); \ + unsigned bitWidth = llvm::APFloatBase::semanticsSizeInBits(sem); \ + llvm::APFloat lhs(sem, llvm::APInt(bitWidth, a)); \ + llvm::APFloat rhs(sem, llvm::APInt(bitWidth, b)); \ + llvm::APFloat result = llvm::OP(lhs, rhs); \ + return result.bitcastToAPInt().getZExtValue(); \ + } + +APFLOAT_MIN_MAX_OP(minimum) +APFLOAT_MIN_MAX_OP(maximum) +APFLOAT_MIN_MAX_OP(minnum) +APFLOAT_MIN_MAX_OP(maxnum) + +#undef APFLOAT_MIN_MAX_OP } diff --git a/mlir/lib/Target/LLVMIR/Dialect/OpenMP/OpenMPToLLVMIRTranslation.cpp b/mlir/lib/Target/LLVMIR/Dialect/OpenMP/OpenMPToLLVMIRTranslation.cpp index 0cab8f352ada7..a2871b36fd9d8 100644 --- a/mlir/lib/Target/LLVMIR/Dialect/OpenMP/OpenMPToLLVMIRTranslation.cpp +++ b/mlir/lib/Target/LLVMIR/Dialect/OpenMP/OpenMPToLLVMIRTranslation.cpp @@ -2809,6 +2809,7 @@ convertOmpParallel(omp::ParallelOp opInst, llvm::IRBuilderBase &builder, ArrayRef<bool> isByRef = getIsByRef(opInst.getReductionByref()); assert(isByRef.size() == opInst.getNumReductionVars()); llvm::OpenMPIRBuilder *ompBuilder = moduleTranslation.getOpenMPBuilder(); + bool isCancellable = constructIsCancellable(opInst); if (failed(checkImplementationStatus(*opInst))) return failure(); @@ -2946,6 +2947,18 @@ convertOmpParallel(omp::ParallelOp opInst, llvm::IRBuilderBase &builder, opInst.getLoc(), privateVarsInfo))) return llvm::make_error<PreviouslyReportedError>(); + // If we could be performing cancellation, add the cancellation barrier on + // the way out of the outlined region. + if (isCancellable) { + auto IPOrErr = ompBuilder->createBarrier( + llvm::OpenMPIRBuilder::LocationDescription(builder), + llvm::omp::Directive::OMPD_unknown, + /* ForceSimpleCall */ false, + /* CheckCancelFlag */ false); + if (!IPOrErr) + return IPOrErr.takeError(); + } + builder.restoreIP(oldIP); return llvm::Error::success(); }; @@ -2959,7 +2972,6 @@ convertOmpParallel(omp::ParallelOp opInst, llvm::IRBuilderBase &builder, auto pbKind = llvm::omp::OMP_PROC_BIND_default; if (auto bind = opInst.getProcBindKind()) pbKind = getProcBindKind(*bind); - bool isCancellable = constructIsCancellable(opInst); llvm::SmallVector deallocIPs; llvm::OpenMPIRBuilder::InsertPointTy allocIP = diff --git a/mlir/lib/Target/SPIRV/Deserialization/Deserializer.cpp b/mlir/lib/Target/SPIRV/Deserialization/Deserializer.cpp index 252be796488c5..d08e7ecf326ca 100644 --- a/mlir/lib/Target/SPIRV/Deserialization/Deserializer.cpp +++ b/mlir/lib/Target/SPIRV/Deserialization/Deserializer.cpp @@ -2923,9 +2923,6 @@ LogicalResult spirv::Deserializer::structurizeControlFlow() { return failure(); } - // TODO: This loop is non-deterministic. Iteration order may vary between runs - // for the same shader as the key to the map is a pointer. See: - // https://github.com/llvm/llvm-project/issues/128547 while (!blockMergeInfo.empty()) { Block *headerBlock = blockMergeInfo.begin()->first; BlockMergeInfo mergeInfo = blockMergeInfo.begin()->second; diff --git a/mlir/lib/Target/SPIRV/Deserialization/Deserializer.h b/mlir/lib/Target/SPIRV/Deserialization/Deserializer.h index 243e6fd70ae43..6d09d556c4d02 100644 --- a/mlir/lib/Target/SPIRV/Deserialization/Deserializer.h +++ b/mlir/lib/Target/SPIRV/Deserialization/Deserializer.h @@ -58,7 +58,9 @@ struct DebugLine { }; /// Map from a selection/loop's header block to its merge (and continue) target.
-using BlockMergeInfoMap = DenseMap<Block *, BlockMergeInfo>; +/// Use `MapVector<>` to ensure a deterministic iteration order with a pointer +/// key. +using BlockMergeInfoMap = llvm::MapVector<Block *, BlockMergeInfo>; /// A "deferred struct type" is a struct type with one or more member types not /// known when the Deserializer first encounters the struct. This happens, for diff --git a/mlir/test/Conversion/ArithToApfloat/arith-to-apfloat.mlir b/mlir/test/Conversion/ArithToApfloat/arith-to-apfloat.mlir index d71d81dddcd4f..950d2cecefa95 100644 --- a/mlir/test/Conversion/ArithToApfloat/arith-to-apfloat.mlir +++ b/mlir/test/Conversion/ArithToApfloat/arith-to-apfloat.mlir @@ -198,3 +198,68 @@ func.func @uitofp(%arg0: i32) { %0 = arith.uitofp %arg0 : i32 to f4E2M1FN return } + +// ----- + +// CHECK: func.func private @_mlir_apfloat_compare(i32, i64, i64) -> i8 +// CHECK: %[[sem:.*]] = arith.constant 18 : i32 +// CHECK: %[[cmp:.*]] = call @_mlir_apfloat_compare(%[[sem]], %{{.*}}, %{{.*}}) : (i32, i64, i64) -> i8 +// CHECK: %[[c3:.*]] = arith.constant 3 : i8 +// CHECK: %[[is_unordered:.*]] = arith.cmpi eq, %[[cmp]], %[[c3]] : i8 +// CHECK: %[[c0:.*]] = arith.constant 0 : i8 +// CHECK: %[[is_lt:.*]] = arith.cmpi eq, %[[cmp]], %[[c0]] : i8 +// CHECK: arith.ori %[[is_unordered]], %[[is_lt]] : i1 +func.func @cmpf(%arg0: f4E2M1FN, %arg1: f4E2M1FN) { + %0 = arith.cmpf "ult", %arg0, %arg1 : f4E2M1FN + return +} + +// ----- + +// CHECK: func.func private @_mlir_apfloat_neg(i32, i64) -> i64 +// CHECK: %[[sem:.*]] = arith.constant 2 : i32 +// CHECK: %[[res:.*]] = call @_mlir_apfloat_neg(%[[sem]], %{{.*}}) : (i32, i64) -> i64 +func.func @negf(%arg0: f32) { + %0 = arith.negf %arg0 : f32 + return +} + +// ----- + +// CHECK: func.func private @_mlir_apfloat_minimum(i32, i64, i64) -> i64 +// CHECK: %[[sem:.*]] = arith.constant 2 : i32 +// CHECK: %[[res:.*]] = call @_mlir_apfloat_minimum(%[[sem]], %{{.*}}, %{{.*}}) : (i32, i64, i64) -> i64 +func.func @minimumf(%arg0: f32, %arg1: f32) { + %0 = arith.minimumf %arg0, %arg1 : f32 + return +} + +// ----- + +// CHECK: func.func private @_mlir_apfloat_maximum(i32, i64, i64) -> i64 +// CHECK: %[[sem:.*]] = arith.constant 2 : i32 +// CHECK: %[[res:.*]] = call @_mlir_apfloat_maximum(%[[sem]], %{{.*}}, %{{.*}}) : (i32, i64, i64) -> i64 +func.func @maximumf(%arg0: f32, %arg1: f32) { + %0 = arith.maximumf %arg0, %arg1 : f32 + return +} + +// ----- + +// CHECK: func.func private @_mlir_apfloat_minnum(i32, i64, i64) -> i64 +// CHECK: %[[sem:.*]] = arith.constant 2 : i32 +// CHECK: %[[res:.*]] = call @_mlir_apfloat_minnum(%[[sem]], %{{.*}}, %{{.*}}) : (i32, i64, i64) -> i64 +func.func @minnumf(%arg0: f32, %arg1: f32) { + %0 = arith.minnumf %arg0, %arg1 : f32 + return +} + +// ----- + +// CHECK: func.func private @_mlir_apfloat_maxnum(i32, i64, i64) -> i64 +// CHECK: %[[sem:.*]] = arith.constant 2 : i32 +// CHECK: %[[res:.*]] = call @_mlir_apfloat_maxnum(%[[sem]], %{{.*}}, %{{.*}}) : (i32, i64, i64) -> i64 +func.func @maxnumf(%arg0: f32, %arg1: f32) { + %0 = arith.maxnumf %arg0, %arg1 : f32 + return +} diff --git a/mlir/test/Conversion/UBToSPIRV/ub-to-spirv.mlir b/mlir/test/Conversion/UBToSPIRV/ub-to-spirv.mlir index edbe8b8001bba..9c277cf99b9a8 100644 --- a/mlir/test/Conversion/UBToSPIRV/ub-to-spirv.mlir +++ b/mlir/test/Conversion/UBToSPIRV/ub-to-spirv.mlir @@ -1,4 +1,4 @@ -// RUN: mlir-opt -split-input-file -convert-ub-to-spirv -verify-diagnostics %s | FileCheck %s +// RUN: mlir-opt -split-input-file -convert-ub-to-spirv %s | FileCheck %s module attributes { spirv.target_env = #spirv.target_env< #spirv.vce, #spirv.resource_limits<>> } { @@ -22,15 +22,17 @@
func.func @check_poison() { // ----- -// No successful test because the dialect conversion framework does not convert -// unreachable blocks. - module attributes { spirv.target_env = #spirv.target_env< #spirv.vce, #spirv.resource_limits<>> } { -func.func @check_unrechable() { -// expected-error@+1{{cannot be used in reachable block}} - spirv.Unreachable +// CHECK-LABEL: @check_unrechable +func.func @check_unrechable(%c: i1) { + cf.cond_br %c, ^bb1, ^bb2 +^bb1: +// CHECK: spirv.Unreachable + ub.unreachable +^bb2: + return } } diff --git a/mlir/test/Dialect/LLVMIR/nvvm-target-invalid.mlir b/mlir/test/Dialect/LLVMIR/nvvm-target-invalid.mlir new file mode 100644 index 0000000000000..c2cfa7689978b --- /dev/null +++ b/mlir/test/Dialect/LLVMIR/nvvm-target-invalid.mlir @@ -0,0 +1,11 @@ +// RUN: not mlir-opt %s 2>&1 | FileCheck %s +// CHECK: 'nvvm.tcgen05.alloc' op is not supported on sm_90 + +module { + gpu.module @mod [#nvvm.target<chip = "sm_90">] { + func.func @tcgen05_alloc(%arg0: !llvm.ptr<7>, %arg1: i32) { + nvvm.tcgen05.alloc %arg0, %arg1 : !llvm.ptr<7>, i32 + return + } + } +} diff --git a/mlir/test/Dialect/SCF/canonicalize.mlir b/mlir/test/Dialect/SCF/canonicalize.mlir index 084c3fc065de3..ac590fc0c47b9 100644 --- a/mlir/test/Dialect/SCF/canonicalize.mlir +++ b/mlir/test/Dialect/SCF/canonicalize.mlir @@ -974,6 +974,56 @@ func.func @replace_if_with_cond3(%arg0 : i1, %arg2: i64) -> (i32, i64) { // ----- +// CHECK-LABEL: @while_move_if_down +func.func @while_move_if_down() -> i32 { + %defined_outside = "test.get_some_value0" () : () -> (i32) + %0 = scf.while () : () -> (i32) { + %used_value = "test.get_some_value1" () : () -> (i32) + %used_by_subregion = "test.get_some_value2" () : () -> (i32) + %else_value = "test.get_some_value3" () : () -> (i32) + %condition = "test.condition"() : () -> i1 + %res = scf.if %condition -> (i32) { + "test.use0" (%defined_outside) : (i32) -> () + "test.use1" (%used_value) : (i32) -> () + test.alloca_scope_region { + "test.use2" (%used_by_subregion) : (i32) -> () + } + %then_value = "test.get_some_value4" () : () -> (i32) + scf.yield %then_value : i32 + } else { + scf.yield %else_value : i32 + } + scf.condition(%condition) %res : i32 + } do { + ^bb0(%res_arg: i32): + "test.use3" (%res_arg) : (i32) -> () + scf.yield + } + return %0 : i32 +} +// CHECK: %[[defined_outside:.*]] = "test.get_some_value0"() : () -> i32 +// CHECK: %[[WHILE_RES:.*]]:3 = scf.while : () -> (i32, i32, i32) { +// CHECK: %[[used_value:.*]] = "test.get_some_value1"() : () -> i32 +// CHECK: %[[used_by_subregion:.*]] = "test.get_some_value2"() : () -> i32 +// CHECK: %[[else_value:.*]] = "test.get_some_value3"() : () -> i32 +// CHECK: %[[condition:.*]] = "test.condition"() : () -> i1 +// CHECK: scf.condition(%[[condition]]) %[[else_value]], %[[used_value]], %[[used_by_subregion]] : i32, i32, i32 +// CHECK: } do { +// CHECK: ^bb0(%[[res_arg:.*]]: i32, %[[used_value_arg:.*]]: i32, %[[used_by_subregion_arg:.*]]: i32): +// CHECK: "test.use0"(%[[defined_outside]]) : (i32) -> () +// CHECK: "test.use1"(%[[used_value_arg]]) : (i32) -> () +// CHECK: test.alloca_scope_region { +// CHECK: "test.use2"(%[[used_by_subregion_arg]]) : (i32) -> () +// CHECK: } +// CHECK: %[[then_value:.*]] = "test.get_some_value4"() : () -> i32 +// CHECK: "test.use3"(%[[then_value]]) : (i32) -> () +// CHECK: scf.yield +// CHECK: } +// CHECK: return %[[WHILE_RES]]#0 : i32 +// CHECK: } + +// ----- + // CHECK-LABEL: @while_cond_true func.func @while_cond_true() -> i1 { %0 = scf.while () : () -> i1 { diff --git
a/mlir/test/Dialect/SCF/uplift-while.mlir b/mlir/test/Dialect/SCF/uplift-while.mlir index 736112824c515..cbe2ce5076ad2 100644 --- a/mlir/test/Dialect/SCF/uplift-while.mlir +++ b/mlir/test/Dialect/SCF/uplift-while.mlir @@ -185,34 +185,3 @@ func.func @uplift_while(%arg0: index, %arg1: index, %arg2: index) -> (i32, f32) // CHECK: %[[T2:.*]] = "test.test2"(%[[ARG2]]) : (f32) -> f32 // CHECK: scf.yield %[[T1]], %[[T2]] : i32, f32 // CHECK: return %[[RES]]#0, %[[RES]]#1 : i32, f32 - -// ----- - -func.func @uplift_while(%low: index, %upper: index, %val : i32) -> i32 { - %c1 = arith.constant 1 : index - %1:2 = scf.while (%iv = %low, %iter = %val) : (index, i32) -> (index, i32) { - %2 = arith.cmpi slt, %iv, %upper : index - %3:2 = scf.if %2 -> (index, i32) { - %4 = "test.test"(%iter) : (i32) -> i32 - %5 = arith.addi %iv, %c1 : index - scf.yield %5, %4 : index, i32 - } else { - scf.yield %iv, %iter : index, i32 - } - scf.condition(%2) %3#0, %3#1 : index, i32 - } do { - ^bb0(%arg0: index, %arg1: i32): - scf.yield %arg0, %arg1 : index, i32 - } - return %1#1 : i32 -} - -// CHECK-LABEL: func.func @uplift_while( -// CHECK-SAME: %[[ARG0:.*]]: index, %[[ARG1:.*]]: index, %[[ARG2:.*]]: i32) -> i32 { -// CHECK: %[[CONSTANT_0:.*]] = arith.constant 1 : index -// CHECK: %[[FOR_0:.*]] = scf.for %[[VAL_0:.*]] = %[[ARG0]] to %[[ARG1]] step %[[CONSTANT_0]] iter_args(%[[VAL_1:.*]] = %[[ARG2]]) -> (i32) { -// CHECK: %[[VAL_2:.*]] = "test.test"(%[[VAL_1]]) : (i32) -> i32 -// CHECK: scf.yield %[[VAL_2]] : i32 -// CHECK: } -// CHECK: return %[[FOR_0]] : i32 -// CHECK: } diff --git a/mlir/test/Integration/Dialect/Arith/CPU/test-apfloat-emulation.mlir b/mlir/test/Integration/Dialect/Arith/CPU/test-apfloat-emulation.mlir index 8046610d479a8..7f72dd5931488 100644 --- a/mlir/test/Integration/Dialect/Arith/CPU/test-apfloat-emulation.mlir +++ b/mlir/test/Integration/Dialect/Arith/CPU/test-apfloat-emulation.mlir @@ -43,6 +43,18 @@ func.func @entry() { %cvt = arith.truncf %b2 : f32 to f8E4M3FN vector.print %cvt : f8E4M3FN + // CHECK-NEXT: -2.25 + %negated = arith.negf %cvt : f8E4M3FN + vector.print %negated : f8E4M3FN + + // CHECK-NEXT: -2.25 + %min = arith.minimumf %cvt, %negated : f8E4M3FN + vector.print %min : f8E4M3FN + + // CHECK-NEXT: 1 + %cmp1 = arith.cmpf "olt", %cvt, %c1 : f8E4M3FN + vector.print %cmp1 : i1 + // CHECK-NEXT: 1 // Bit pattern: 01, interpreted as signed integer: 1 %cvt_int_signed = arith.fptosi %cvt : f8E4M3FN to i2 diff --git a/mlir/test/Target/LLVMIR/openmp-barrier-cancel.mlir b/mlir/test/Target/LLVMIR/openmp-barrier-cancel.mlir index c4b245667a1f3..6585549de7f96 100644 --- a/mlir/test/Target/LLVMIR/openmp-barrier-cancel.mlir +++ b/mlir/test/Target/LLVMIR/openmp-barrier-cancel.mlir @@ -29,22 +29,24 @@ llvm.func @test() { // CHECK: %[[VAL_14:.*]] = icmp eq i32 %[[VAL_13]], 0 // CHECK: br i1 %[[VAL_14]], label %[[VAL_15:.*]], label %[[VAL_16:.*]] // CHECK: omp.par.region1.cncl: ; preds = %[[VAL_11]] -// CHECK: %[[VAL_17:.*]] = call i32 @__kmpc_global_thread_num(ptr @1) -// CHECK: %[[VAL_18:.*]] = call i32 @__kmpc_cancel_barrier(ptr @2, i32 %[[VAL_17]]) -// CHECK: br label %[[VAL_19:.*]] +// CHECK: br label %[[FINI:.*]] +// CHECK: .fini: +// CHECK: %[[TID:.*]] = call i32 @__kmpc_global_thread_num(ptr @1) +// CHECK: %[[CNCL_BARRIER:.*]] = call i32 @__kmpc_cancel_barrier(ptr @2, i32 %[[TID]]) +// CHECK: br label %[[EXIT_STUB:.*]] // CHECK: omp.par.region1.split: ; preds = %[[VAL_11]] // CHECK: %[[VAL_20:.*]] = call i32 @__kmpc_global_thread_num(ptr @1) // CHECK: %[[VAL_21:.*]] = call i32 
@__kmpc_cancel_barrier(ptr @3, i32 %[[VAL_20]]) // CHECK: %[[VAL_22:.*]] = icmp eq i32 %[[VAL_21]], 0 // CHECK: br i1 %[[VAL_22]], label %[[VAL_23:.*]], label %[[VAL_24:.*]] // CHECK: omp.par.region1.split.cncl: ; preds = %[[VAL_15]] -// CHECK: br label %[[VAL_19]] +// CHECK: br label %[[FINI]] // CHECK: omp.par.region1.split.cont: ; preds = %[[VAL_15]] // CHECK: br label %[[VAL_25:.*]] // CHECK: omp.region.cont: ; preds = %[[VAL_23]] // CHECK: br label %[[VAL_26:.*]] // CHECK: omp.par.pre_finalize: ; preds = %[[VAL_25]] -// CHECK: br label %[[VAL_19]] -// CHECK: omp.par.exit.exitStub: ; preds = %[[VAL_26]], %[[VAL_24]], %[[VAL_16]] +// CHECK: br label %[[FINI]] +// CHECK: omp.par.exit.exitStub: // CHECK: ret void diff --git a/mlir/test/Target/LLVMIR/openmp-cancel.mlir b/mlir/test/Target/LLVMIR/openmp-cancel.mlir index e1abb15fbb476..59f2c36a2523a 100644 --- a/mlir/test/Target/LLVMIR/openmp-cancel.mlir +++ b/mlir/test/Target/LLVMIR/openmp-cancel.mlir @@ -25,16 +25,18 @@ llvm.func @cancel_parallel() { // CHECK: %[[VAL_15:.*]] = icmp eq i32 %[[VAL_14]], 0 // CHECK: br i1 %[[VAL_15]], label %[[VAL_16:.*]], label %[[VAL_17:.*]] // CHECK: omp.par.region1.cncl: ; preds = %[[VAL_12]] +// CHECK: br label %[[VAL_20:.*]] +// CHECK: .fini: // CHECK: %[[VAL_18:.*]] = call i32 @__kmpc_global_thread_num(ptr @1) // CHECK: %[[VAL_19:.*]] = call i32 @__kmpc_cancel_barrier(ptr @2, i32 %[[VAL_18]]) -// CHECK: br label %[[VAL_20:.*]] +// CHECK: br label %[[EXIT_STUB:.*]] // CHECK: omp.par.region1.split: ; preds = %[[VAL_12]] // CHECK: br label %[[VAL_21:.*]] // CHECK: omp.region.cont: ; preds = %[[VAL_16]] // CHECK: br label %[[VAL_22:.*]] // CHECK: omp.par.pre_finalize: ; preds = %[[VAL_21]] // CHECK: br label %[[VAL_20]] -// CHECK: omp.par.exit.exitStub: ; preds = %[[VAL_22]], %[[VAL_17]] +// CHECK: omp.par.exit.exitStub: // CHECK: ret void llvm.func @cancel_parallel_if(%arg0 : i1) { @@ -59,27 +61,36 @@ llvm.func @cancel_parallel_if(%arg0 : i1) { // CHECK: omp.par.region: ; preds = %[[VAL_17]] // CHECK: br label %[[VAL_20:.*]] // CHECK: omp.par.region1: ; preds = %[[VAL_19]] -// CHECK: br i1 %[[VAL_16]], label %[[VAL_21:.*]], label %[[VAL_22:.*]] +// CHECK: br i1 %[[VAL_16]], label %[[SPLIT:.*]], label %[[VAL_22:.*]] // CHECK: 3: ; preds = %[[VAL_20]] -// CHECK: br label %[[VAL_23:.*]] -// CHECK: 4: ; preds = %[[VAL_22]], %[[VAL_24:.*]] +// CHECK: %[[GTN:.*]] = call i32 @__kmpc_global_thread_num(ptr @1) +// CHECK: %[[NOT_CANCELLED:.*]] = call i32 @__kmpc_cancellationpoint(ptr @1, i32 %[[GTN]], i32 1) +// CHECK: %[[COND:.*]] = icmp eq i32 %[[NOT_CANCELLED]], 0 +// CHECK: br i1 %[[COND]], label %[[VAL_23:.*]], label %[[CNCL:.*]] +// CHECK: .cncl: +// CHECK: br label %[[FINI:.*]] +// CHECK: .fini: +// CHECK: %[[VAL_32:.*]] = call i32 @__kmpc_global_thread_num(ptr @1) +// CHECK: %[[VAL_33:.*]] = call i32 @__kmpc_cancel_barrier(ptr @2, i32 %[[VAL_32]]) +// CHECK: br label %[[EXIT_STUB:.*]] +// CHECK: .split: +// CHECK: br label %[[SEVEN:.*]] +// CHECK: 7: // CHECK: br label %[[VAL_25:.*]] -// CHECK: omp.region.cont: ; preds = %[[VAL_23]] +// CHECK: omp.region.cont: // CHECK: br label %[[VAL_26:.*]] // CHECK: omp.par.pre_finalize: ; preds = %[[VAL_25]] // CHECK: br label %[[VAL_27:.*]] -// CHECK: 5: ; preds = %[[VAL_20]] +// CHECK: 8: ; preds = %[[VAL_20]] // CHECK: %[[VAL_28:.*]] = call i32 @__kmpc_global_thread_num(ptr @1) // CHECK: %[[VAL_29:.*]] = call i32 @__kmpc_cancel(ptr @1, i32 %[[VAL_28]], i32 1) // CHECK: %[[VAL_30:.*]] = icmp eq i32 %[[VAL_29]], 0 -// CHECK: br i1 %[[VAL_30]], label %[[VAL_24]], label 
%[[VAL_31:.*]] -// CHECK: .cncl: ; preds = %[[VAL_21]] -// CHECK: %[[VAL_32:.*]] = call i32 @__kmpc_global_thread_num(ptr @1) -// CHECK: %[[VAL_33:.*]] = call i32 @__kmpc_cancel_barrier(ptr @2, i32 %[[VAL_32]]) -// CHECK: br label %[[VAL_27]] -// CHECK: .split: ; preds = %[[VAL_21]] -// CHECK: br label %[[VAL_23]] -// CHECK: omp.par.exit.exitStub: ; preds = %[[VAL_31]], %[[VAL_26]] +// CHECK: br i1 %[[VAL_30]], label %[[SPLIT5:.*]], label %[[VAL_31:.*]] +// CHECK: .cncl{{.*}}: +// CHECK: br label %[[FINI]] +// CHECK: .split{{.*}}: +// CHECK: br label %[[SEVEN]] +// CHECK: omp.par.exit.exitStub: // CHECK: ret void llvm.func @cancel_sections_if(%cond : i1) { @@ -133,11 +144,16 @@ llvm.func @cancel_sections_if(%cond : i1) { // CHECK: %[[VAL_30:.*]] = call i32 @__kmpc_cancel(ptr @1, i32 %[[VAL_29]], i32 3) // CHECK: %[[VAL_31:.*]] = icmp eq i32 %[[VAL_30]], 0 // CHECK: br i1 %[[VAL_31]], label %[[VAL_32:.*]], label %[[VAL_33:.*]] -// CHECK: .split: ; preds = %[[VAL_27]] +// CHECK: .split{{.*}}: ; preds = %[[VAL_27]] // CHECK: br label %[[VAL_34:.*]] -// CHECK: 11: ; preds = %[[VAL_25]] +// CHECK: 12: ; preds = %[[VAL_25]] +// CHECK: %[[GTN:.*]] = call i32 @__kmpc_global_thread_num(ptr @1) +// CHECK: %[[CANCEL_POINT:.*]] = call i32 @__kmpc_cancellationpoint(ptr @1, i32 %[[GTN]], i32 3) +// CHECK: %[[COND:.*]] = icmp eq i32 %13, 0 +// CHECK: br i1 %[[COND]], label %[[SPLIT:.*]], label %[[CNCL:.*]] +// CHECK: .split{{.*}}: // CHECK: br label %[[VAL_34]] -// CHECK: 12: ; preds = %[[VAL_28]], %[[VAL_32]] +// CHECK: 15: // CHECK: br label %[[VAL_35:.*]] // CHECK: omp.region.cont: ; preds = %[[VAL_34]] // CHECK: br label %[[VAL_23]] @@ -146,17 +162,17 @@ llvm.func @cancel_sections_if(%cond : i1) { // CHECK: omp_section_loop.inc: ; preds = %[[VAL_23]] // CHECK: %[[VAL_15]] = add nuw i32 %[[VAL_14]], 1 // CHECK: br label %[[VAL_12]] -// CHECK: omp_section_loop.exit: ; preds = %[[VAL_33]], %[[VAL_16]] +// CHECK: omp_section_loop.exit: // CHECK: call void @__kmpc_for_static_fini(ptr @1, i32 %[[VAL_7]]) // CHECK: %[[VAL_36:.*]] = call i32 @__kmpc_global_thread_num(ptr @1) // CHECK: call void @__kmpc_barrier(ptr @2, i32 %[[VAL_36]]) // CHECK: br label %[[VAL_37:.*]] // CHECK: omp_section_loop.after: ; preds = %[[VAL_19]] -// CHECK: br label %[[VAL_38:.*]] -// CHECK: omp_section_loop.aftersections.fini: ; preds = %[[VAL_37]] // CHECK: ret void -// CHECK: .cncl: ; preds = %[[VAL_27]] -// CHECK: br label %[[VAL_19]] +// CHECK: .cncl: +// CHECK: br label %[[OMP_SECTION_LOOP_EXIT:.*]] +// CHECK: .cncl{{.*}}: +// CHECK: br label %[[OMP_SECTION_LOOP_EXIT:.*]] llvm.func @cancel_wsloop_if(%lb : i32, %ub : i32, %step : i32, %cond : i1) { omp.wsloop { @@ -222,18 +238,23 @@ llvm.func @cancel_wsloop_if(%lb : i32, %ub : i32, %step : i32, %cond : i1) { // CHECK: %[[VAL_47:.*]] = call i32 @__kmpc_cancel(ptr @1, i32 %[[VAL_46]], i32 2) // CHECK: %[[VAL_48:.*]] = icmp eq i32 %[[VAL_47]], 0 // CHECK: br i1 %[[VAL_48]], label %[[VAL_49:.*]], label %[[VAL_50:.*]] -// CHECK: .split: ; preds = %[[VAL_44]] +// CHECK: .split{{.*}}: // CHECK: br label %[[VAL_51:.*]] -// CHECK: 28: ; preds = %[[VAL_42]] +// CHECK: 28: +// CHECK: %[[GTN:.*]] = call i32 @__kmpc_global_thread_num(ptr @1) +// CHECK: %[[CANCEL_POINT:.*]] = call i32 @__kmpc_cancellationpoint(ptr @1, i32 %[[GTN]], i32 2) +// CHECK: %[[COND:.*]] = icmp eq i32 %[[CANCEL_POINT]], 0 +// CHECK: br i1 %[[COND]], label %[[SPLIT3:.*]], label %[[CNCL4:.*]] +// CHECK: .split{{.*}}: // CHECK: br label %[[VAL_51]] -// CHECK: 29: ; preds = %[[VAL_45]], %[[VAL_49]] +// CHECK: 31: // 
CHECK: br label %[[VAL_52:.*]] // CHECK: omp.region.cont1: ; preds = %[[VAL_51]] // CHECK: br label %[[VAL_32]] // CHECK: omp_loop.inc: ; preds = %[[VAL_52]] // CHECK: %[[VAL_34]] = add nuw i32 %[[VAL_33]], 1 // CHECK: br label %[[VAL_31]] -// CHECK: omp_loop.exit: ; preds = %[[VAL_50]], %[[VAL_35]] +// CHECK: omp_loop.exit: // CHECK: call void @__kmpc_for_static_fini(ptr @1, i32 %[[VAL_26]]) // CHECK: %[[VAL_53:.*]] = call i32 @__kmpc_global_thread_num(ptr @1) // CHECK: call void @__kmpc_barrier(ptr @2, i32 %[[VAL_53]]) @@ -242,8 +263,12 @@ llvm.func @cancel_wsloop_if(%lb : i32, %ub : i32, %step : i32, %cond : i1) { // CHECK: br label %[[VAL_55:.*]] // CHECK: omp.region.cont: ; preds = %[[VAL_54]] // CHECK: ret void -// CHECK: .cncl: ; preds = %[[VAL_44]] -// CHECK: br label %[[VAL_38]] +// CHECK: .cncl{{.*}}: +// CHECK: br label %[[FINI:.*]] +// CHECK: .fini: +// CHECK: br label %[[OMP_LOOP_EXIT:.*]] +// CHECK: .cncl{{.*}}: +// CHECK: br label %[[FINI:.*]] omp.private {type = firstprivate} @i32_priv : i32 copy { ^bb0(%arg0: !llvm.ptr, %arg1: !llvm.ptr): diff --git a/mlir/test/Target/LLVMIR/openmp-cancellation-point.mlir b/mlir/test/Target/LLVMIR/openmp-cancellation-point.mlir index 5e0d3f9f7e293..93fa2064ab99a 100644 --- a/mlir/test/Target/LLVMIR/openmp-cancellation-point.mlir +++ b/mlir/test/Target/LLVMIR/openmp-cancellation-point.mlir @@ -24,16 +24,18 @@ llvm.func @cancellation_point_parallel() { // CHECK: %[[VAL_15:.*]] = icmp eq i32 %[[VAL_14]], 0 // CHECK: br i1 %[[VAL_15]], label %[[VAL_16:.*]], label %[[VAL_17:.*]] // CHECK: omp.par.region1.cncl: ; preds = %[[VAL_12]] +// CHECK: br label %[[FINI:.*]] +// CHECK: .fini: // CHECK: %[[VAL_18:.*]] = call i32 @__kmpc_global_thread_num(ptr @1) // CHECK: %[[VAL_19:.*]] = call i32 @__kmpc_cancel_barrier(ptr @2, i32 %[[VAL_18]]) -// CHECK: br label %[[VAL_20:.*]] +// CHECK: br label %[[EXIT_STUB:.*]] // CHECK: omp.par.region1.split: ; preds = %[[VAL_12]] // CHECK: br label %[[VAL_21:.*]] // CHECK: omp.region.cont: ; preds = %[[VAL_16]] // CHECK: br label %[[VAL_22:.*]] // CHECK: omp.par.pre_finalize: ; preds = %[[VAL_21]] -// CHECK: br label %[[VAL_20]] -// CHECK: omp.par.exit.exitStub: ; preds = %[[VAL_22]], %[[VAL_17]] +// CHECK: br label %[[FINI]] +// CHECK: omp.par.exit.exitStub: // CHECK: ret void llvm.func @cancellation_point_sections() { @@ -94,14 +96,12 @@ llvm.func @cancellation_point_sections() { // CHECK: omp_section_loop.inc: ; preds = %[[VAL_46]] // CHECK: %[[VAL_38]] = add nuw i32 %[[VAL_37]], 1 // CHECK: br label %[[VAL_35]] -// CHECK: omp_section_loop.exit: ; preds = %[[VAL_53]], %[[VAL_39]] +// CHECK: omp_section_loop.exit: // CHECK: call void @__kmpc_for_static_fini(ptr @1, i32 %[[VAL_30]]) // CHECK: %[[VAL_55:.*]] = call i32 @__kmpc_global_thread_num(ptr @1) // CHECK: call void @__kmpc_barrier(ptr @2, i32 %[[VAL_55]]) // CHECK: br label %[[VAL_56:.*]] // CHECK: omp_section_loop.after: ; preds = %[[VAL_42]] -// CHECK: br label %[[VAL_57:.*]] -// CHECK: omp_section_loop.aftersections.fini: ; preds = %[[VAL_56]] // CHECK: ret void // CHECK: omp.section.region.cncl: ; preds = %[[VAL_48]] // CHECK: br label %[[VAL_42]] @@ -175,7 +175,7 @@ llvm.func @cancellation_point_wsloop(%lb : i32, %ub : i32, %step : i32) { // CHECK: omp_loop.inc: ; preds = %[[VAL_106]] // CHECK: %[[VAL_92]] = add nuw i32 %[[VAL_91]], 1 // CHECK: br label %[[VAL_89]] -// CHECK: omp_loop.exit: ; preds = %[[VAL_105]], %[[VAL_93]] +// CHECK: omp_loop.exit: // CHECK: call void @__kmpc_for_static_fini(ptr @1, i32 %[[VAL_84]]) // CHECK: %[[VAL_107:.*]] = call 
i32 @__kmpc_global_thread_num(ptr @1) // CHECK: call void @__kmpc_barrier(ptr @2, i32 %[[VAL_107]]) diff --git a/mlir/test/Target/LLVMIR/openmp-outline-infinite-loop.mlir b/mlir/test/Target/LLVMIR/openmp-outline-infinite-loop.mlir index faccfc678adfe..99f37c7e79be8 100644 --- a/mlir/test/Target/LLVMIR/openmp-outline-infinite-loop.mlir +++ b/mlir/test/Target/LLVMIR/openmp-outline-infinite-loop.mlir @@ -21,9 +21,11 @@ llvm.func @parallel_infinite_loop() -> () { // CHECK: omp.region.cont: ; No predecessors! // CHECK: br label %[[VAL_4:.*]] // CHECK: omp.par.pre_finalize: ; preds = %[[VAL_5:.*]] -// CHECK: br label %[[VAL_6:.*]] -// CHECK: omp.par.exit: ; preds = %[[VAL_4]] +// CHECK: br label %[[FINI:.*]] +// CHECK: [[OMP_PAR_EXIT:omp.par.exit]]: ; preds = %[[FINI]] // CHECK: ret void +// CHECK: [[FINI]]: +// CHECK: br label %[[OMP_PAR_EXIT]] // CHECK: } // CHECK-LABEL: define internal void @parallel_infinite_loop..omp_par( diff --git a/mlir/test/Target/LLVMIR/openmp-parallel-reduction-multiblock.mlir b/mlir/test/Target/LLVMIR/openmp-parallel-reduction-multiblock.mlir index 887d2977e45cc..c79c369b69d7f 100644 --- a/mlir/test/Target/LLVMIR/openmp-parallel-reduction-multiblock.mlir +++ b/mlir/test/Target/LLVMIR/openmp-parallel-reduction-multiblock.mlir @@ -108,6 +108,8 @@ llvm.func @missordered_blocks_(%arg0: !llvm.ptr {fir.bindc_name = "x"}, %arg1: ! // CHECK: reduce.finalize: ; preds = %[[VAL_49]], %[[VAL_43]] // CHECK: br label %[[VAL_53:.*]] // CHECK: omp.par.pre_finalize: ; preds = %[[VAL_48]] +// CHECK: br label %[[FINI:.*]] +// CHECK: .fini: // CHECK: %[[VAL_54:.*]] = load ptr, ptr %[[VAL_20]], align 8 // CHECK: %[[VAL_55:.*]] = load ptr, ptr %[[VAL_21]], align 8 // CHECK: br label %[[VAL_56:.*]] @@ -115,5 +117,5 @@ llvm.func @missordered_blocks_(%arg0: !llvm.ptr {fir.bindc_name = "x"}, %arg1: ! 
// CHECK: br label %[[VAL_38]] // CHECK: omp.reduction.neutral1: ; preds = %[[VAL_25]] // CHECK: br label %[[VAL_30]] -// CHECK: omp.par.exit.exitStub: ; preds = %[[VAL_53]] +// CHECK: omp.par.exit.exitStub: ; preds = %[[FINI]] // CHECK: ret void diff --git a/mlir/test/Target/LLVMIR/openmp-reduction-array-sections.mlir b/mlir/test/Target/LLVMIR/openmp-reduction-array-sections.mlir index b302b4b20edd5..13f52f054869e 100644 --- a/mlir/test/Target/LLVMIR/openmp-reduction-array-sections.mlir +++ b/mlir/test/Target/LLVMIR/openmp-reduction-array-sections.mlir @@ -127,8 +127,6 @@ llvm.func @sectionsreduction_(%arg0: !llvm.ptr {fir.bindc_name = "x"}) attribute // CHECK: call void @__kmpc_barrier(ptr @2, i32 %[[VAL_36]]) // CHECK: br label %[[VAL_37:.*]] // CHECK: omp_section_loop.after: ; preds = %[[VAL_35]] -// CHECK: br label %[[VAL_38:.*]] -// CHECK: omp_section_loop.aftersections.fini: ; preds = %[[VAL_37]] // CHECK: %[[VAL_39:.*]] = getelementptr inbounds [1 x ptr], ptr %[[VAL_14]], i64 0, i64 0 // CHECK: store ptr %[[VAL_21]], ptr %[[VAL_39]], align 8 // CHECK: %[[VAL_40:.*]] = call i32 @__kmpc_global_thread_num(ptr @1) @@ -137,9 +135,9 @@ llvm.func @sectionsreduction_(%arg0: !llvm.ptr {fir.bindc_name = "x"}) attribute // CHECK: i32 1, label %[[VAL_43:.*]] // CHECK: i32 2, label %[[VAL_44:.*]] // CHECK: ] -// CHECK: reduce.switch.atomic: ; preds = %[[VAL_38]] +// CHECK: reduce.switch.atomic: ; preds = %[[VAL_37]] // CHECK: unreachable -// CHECK: reduce.switch.nonatomic: ; preds = %[[VAL_38]] +// CHECK: reduce.switch.nonatomic: ; preds = %[[VAL_37]] // CHECK: %[[VAL_45:.*]] = load ptr, ptr %[[VAL_21]], align 8 // CHECK: br label %[[VAL_46:.*]] // CHECK: omp.reduction.nonatomic.body: ; preds = %[[VAL_43]] @@ -157,7 +155,7 @@ llvm.func @sectionsreduction_(%arg0: !llvm.ptr {fir.bindc_name = "x"}) attribute // CHECK: omp.reduction.nonatomic.body17: ; preds = %[[VAL_47]] // CHECK: %[[VAL_50]] = sub i64 %[[VAL_49]], 1 // CHECK: br label %[[VAL_47]] -// CHECK: reduce.finalize: ; preds = %[[VAL_53]], %[[VAL_38]] +// CHECK: reduce.finalize: ; preds = %[[VAL_53]], %[[VAL_37]] // CHECK: %[[VAL_55:.*]] = call i32 @__kmpc_global_thread_num(ptr @1) // CHECK: call void @__kmpc_barrier(ptr @2, i32 %[[VAL_55]]) // CHECK: %[[VAL_56:.*]] = load ptr, ptr %[[VAL_21]], align 8 @@ -173,7 +171,9 @@ llvm.func @sectionsreduction_(%arg0: !llvm.ptr {fir.bindc_name = "x"}) attribute // CHECK: omp.region.cont: ; preds = %[[VAL_62]] // CHECK: br label %[[VAL_64:.*]] // CHECK: omp.par.pre_finalize: ; preds = %[[VAL_63]] -// CHECK: br label %[[VAL_65:.*]] +// CHECK: br label %[[FINI:.fini.*]] +// CHECK: [[FINI]]: +// CHECK: br label %[[EXIT:.*]] // CHECK: omp.reduction.cleanup21: ; preds = %[[VAL_57]] // CHECK: br label %[[VAL_61]] // CHECK: omp_section_loop.body: ; preds = %[[VAL_32]] @@ -219,5 +219,5 @@ llvm.func @sectionsreduction_(%arg0: !llvm.ptr {fir.bindc_name = "x"}) attribute // CHECK: omp_section_loop.inc: ; preds = %[[VAL_69]] // CHECK: %[[VAL_31]] = add nuw i32 %[[VAL_30]], 1 // CHECK: br label %[[VAL_28]] -// CHECK: omp.par.exit.exitStub: ; preds = %[[VAL_64]] +// CHECK: omp.par.exit.exitStub: ; preds = %[[FINI]] // CHECK: ret void diff --git a/mlir/test/Target/LLVMIR/openmp-reduction-init-arg.mlir b/mlir/test/Target/LLVMIR/openmp-reduction-init-arg.mlir index a714ca68a1e95..cb30d3b2f4473 100644 --- a/mlir/test/Target/LLVMIR/openmp-reduction-init-arg.mlir +++ b/mlir/test/Target/LLVMIR/openmp-reduction-init-arg.mlir @@ -96,8 +96,10 @@ module { // CHECK: reduce.finalize: ; preds = %[[VAL_34]], %[[VAL_28]] // 
CHECK: br label %[[VAL_38:.*]] // CHECK: omp.par.pre_finalize: ; preds = %[[VAL_33]] +// CHECK: br label %[[FINI:.*]] +// CHECK: [[FINI]]: // CHECK: br label %[[VAL_39:.*]] -// CHECK: omp.par.exit.exitStub: ; preds = %[[VAL_38]] +// CHECK: omp.par.exit.exitStub: ; preds = %[[FINI]] // CHECK: ret void // CHECK: %[[VAL_40:.*]] = getelementptr inbounds [2 x ptr], ptr %[[VAL_41:.*]], i64 0, i64 0 // CHECK: %[[VAL_42:.*]] = load ptr, ptr %[[VAL_40]], align 8 diff --git a/mlir/test/Target/LLVMIR/openmp-reduction-sections.mlir b/mlir/test/Target/LLVMIR/openmp-reduction-sections.mlir index 19da6f8517fcd..00f6c1b02206e 100644 --- a/mlir/test/Target/LLVMIR/openmp-reduction-sections.mlir +++ b/mlir/test/Target/LLVMIR/openmp-reduction-sections.mlir @@ -86,8 +86,6 @@ llvm.func @sections_(%arg0: !llvm.ptr {fir.bindc_name = "x"}) attributes {fir.in // CHECK: call void @__kmpc_barrier(ptr @2, i32 %[[VAL_40]]) // CHECK: br label %[[VAL_41:.*]] // CHECK: omp_section_loop.after: ; preds = %[[VAL_39]] -// CHECK: br label %[[VAL_42:.*]] -// CHECK: omp_section_loop.aftersections.fini: ; preds = %[[VAL_41]] // CHECK: %[[VAL_43:.*]] = getelementptr inbounds [1 x ptr], ptr %[[VAL_21]], i64 0, i64 0 // CHECK: store ptr %[[VAL_20]], ptr %[[VAL_43]], align 8 // CHECK: %[[VAL_44:.*]] = call i32 @__kmpc_global_thread_num(ptr @1) @@ -96,23 +94,25 @@ llvm.func @sections_(%arg0: !llvm.ptr {fir.bindc_name = "x"}) attributes {fir.in // CHECK: i32 1, label %[[VAL_47:.*]] // CHECK: i32 2, label %[[VAL_48:.*]] // CHECK: ] -// CHECK: reduce.switch.atomic: ; preds = %[[VAL_42]] +// CHECK: reduce.switch.atomic: ; preds = %[[VAL_41]] // CHECK: unreachable -// CHECK: reduce.switch.nonatomic: ; preds = %[[VAL_42]] +// CHECK: reduce.switch.nonatomic: ; preds = %[[VAL_41]] // CHECK: %[[VAL_49:.*]] = load float, ptr %[[VAL_11]], align 4 // CHECK: %[[VAL_50:.*]] = load float, ptr %[[VAL_20]], align 4 // CHECK: %[[VAL_51:.*]] = fadd contract float %[[VAL_49]], %[[VAL_50]] // CHECK: store float %[[VAL_51]], ptr %[[VAL_11]], align 4 // CHECK: call void @__kmpc_end_reduce(ptr @1, i32 %[[VAL_44]], ptr @.gomp_critical_user_.reduction.var) // CHECK: br label %[[VAL_46]] -// CHECK: reduce.finalize: ; preds = %[[VAL_47]], %[[VAL_42]] +// CHECK: reduce.finalize: ; preds = %[[VAL_47]], %[[VAL_41]] // CHECK: %[[VAL_52:.*]] = call i32 @__kmpc_global_thread_num(ptr @1) // CHECK: call void @__kmpc_barrier(ptr @2, i32 %[[VAL_52]]) // CHECK: br label %[[VAL_53:.*]] // CHECK: omp.region.cont: ; preds = %[[VAL_46]] // CHECK: br label %[[VAL_54:.*]] // CHECK: omp.par.pre_finalize: ; preds = %[[VAL_53]] -// CHECK: br label %[[VAL_55:.*]] +// CHECK: br label %[[FINI:.fini.*]] +// CHECK: [[FINI]]: +// CHECK: br label %[[EXIT:.*]] // CHECK: omp_section_loop.body: ; preds = %[[VAL_36]] // CHECK: %[[VAL_56:.*]] = add i32 %[[VAL_34]], %[[VAL_28]] // CHECK: %[[VAL_57:.*]] = mul i32 %[[VAL_56]], 1 @@ -144,8 +144,10 @@ llvm.func @sections_(%arg0: !llvm.ptr {fir.bindc_name = "x"}) attributes {fir.in // CHECK: omp_section_loop.inc: ; preds = %[[VAL_59]] // CHECK: %[[VAL_35]] = add nuw i32 %[[VAL_34]], 1 // CHECK: br label %[[VAL_32]] -// CHECK: omp.par.exit.exitStub: ; preds = %[[VAL_54]] +// CHECK: omp.par.exit.exitStub: ; preds = %[[FINI]] // CHECK: ret void + +// CHECK-LABEL: define internal void @.omp.reduction.func // CHECK: %[[VAL_70:.*]] = getelementptr inbounds [1 x ptr], ptr %[[VAL_71:.*]], i64 0, i64 0 // CHECK: %[[VAL_72:.*]] = load ptr, ptr %[[VAL_70]], align 8 // CHECK: %[[VAL_73:.*]] = load float, ptr %[[VAL_72]], align 4 diff --git 
a/utils/bazel/llvm-project-overlay/mlir/BUILD.bazel b/utils/bazel/llvm-project-overlay/mlir/BUILD.bazel index 13a7705091b24..c574ba5877b3d 100644 --- a/utils/bazel/llvm-project-overlay/mlir/BUILD.bazel +++ b/utils/bazel/llvm-project-overlay/mlir/BUILD.bazel @@ -4444,6 +4444,7 @@ cc_library( ":SCFIncGen", ":SideEffectInterfaces", ":TensorDialect", + ":TransformUtils", ":ValueBoundsOpInterface", ":ViewLikeInterface", "//llvm:Support",