From 81c5d468cf00d6e41112fba6c89d6c40013bcbda Mon Sep 17 00:00:00 2001 From: Men-cotton Date: Mon, 1 Dec 2025 13:20:13 +0900 Subject: [PATCH 01/39] [MLIR][NVVM] Propagate verification failure for unsupported SM targets (#170001) Fixes: https://github.com/llvm/llvm-project/issues/169113 Correctly propagate verification failure when `NVVM::RequiresSMInterface` check fails during `gpu.module` verification. Previously, the walk was interrupted but the function returned `success()`, causing a mismatch between the emitted diagnostic and the return status. This led to assertion failures in Python bindings which expect `failure()` when diagnostics are emitted. CC: @grypp --- mlir/lib/Dialect/LLVMIR/IR/NVVMDialect.cpp | 24 +++++++++++-------- .../Dialect/LLVMIR/nvvm-target-invalid.mlir | 11 +++++++++ 2 files changed, 25 insertions(+), 10 deletions(-) create mode 100644 mlir/test/Dialect/LLVMIR/nvvm-target-invalid.mlir diff --git a/mlir/lib/Dialect/LLVMIR/IR/NVVMDialect.cpp b/mlir/lib/Dialect/LLVMIR/IR/NVVMDialect.cpp index d3c305555fde8..b98f15cfe6d75 100644 --- a/mlir/lib/Dialect/LLVMIR/IR/NVVMDialect.cpp +++ b/mlir/lib/Dialect/LLVMIR/IR/NVVMDialect.cpp @@ -4707,16 +4707,20 @@ LogicalResult NVVMTargetAttr::verifyTarget(Operation *gpuModule) { "Minimum NVVM target SM version is sm_20"); } - gpuModuleOp->walk([&](Operation *op) { - if (auto reqOp = llvm::dyn_cast(op)) { - const NVVMCheckSMVersion requirement = reqOp.getRequiredMinSMVersion(); - if (!requirement.isCompatibleWith(targetSMVersion)) { - op->emitOpError() << "is not supported on " << getChip(); - return WalkResult::interrupt(); - } - } - return WalkResult::advance(); - }); + if (gpuModuleOp + ->walk([&](Operation *op) { + if (auto reqOp = llvm::dyn_cast(op)) { + const NVVMCheckSMVersion requirement = + reqOp.getRequiredMinSMVersion(); + if (!requirement.isCompatibleWith(targetSMVersion)) { + op->emitOpError() << "is not supported on " << getChip(); + return WalkResult::interrupt(); + } + } + return 
WalkResult::advance(); + }) + .wasInterrupted()) + return failure(); return success(); } diff --git a/mlir/test/Dialect/LLVMIR/nvvm-target-invalid.mlir b/mlir/test/Dialect/LLVMIR/nvvm-target-invalid.mlir new file mode 100644 index 0000000000000..c2cfa7689978b --- /dev/null +++ b/mlir/test/Dialect/LLVMIR/nvvm-target-invalid.mlir @@ -0,0 +1,11 @@ +// RUN: not mlir-opt %s 2>&1 | FileCheck %s +// CHECK: 'nvvm.tcgen05.alloc' op is not supported on sm_90 + +module { + gpu.module @mod [#nvvm.target] { + func.func @tcgen05_alloc(%arg0: !llvm.ptr<7>, %arg1: i32) { + nvvm.tcgen05.alloc %arg0, %arg1 : !llvm.ptr<7>, i32 + return + } + } +} From 036279addf48cc5a5d7596f4abd06d33242f4f19 Mon Sep 17 00:00:00 2001 From: Jason Molenda Date: Sun, 30 Nov 2025 21:40:13 -0800 Subject: [PATCH 02/39] [lldb][debugserver] Return shared cache filepath in jGetSharedCacheInfo (#168474) Add a "shared_cache_path" key-value to the jGetSharedCacheInfo response, if we can fetch the shared cache path. If debugserver and the inferior process are running with the same shared cache UUID, there is a simple SPI to get debugserver's own shared cache filepath and we will return that. On newer OSes, there are SPI we can use to get the inferior process' shared cache filepath, use that if necessary and the SPI are available. The response for the jGetSharedCacheInfo packet will now look like {"shared_cache_base_address":6609256448,"shared_cache_uuid":"B69FF43C-DBFD-3FB1-B4FE-A8FE32EA1062","no_shared_cache":false,"shared_cache_private_cache":false,"shared_cache_path":"/System/Volumes/Preboot/Cryptexes/OS/System/Library/dyld/dyld_shared_cache_arm64e"} when we have the full information about the shared cache in the inferior. There are three possible types of responses: 1. inferior has not yet mapped in a shared cache (read: when stopped at dyld_start and dyld hasn't started executing yet). In this case, no "shared_cache_path" is listed. 
("shared_cache_base_address" will be 0, "shared_cache_uuid" will be all-zeroes uuid) 2. inferior has a shared cache, but it is different than debugserver's and we do not have the new SPI to query the shared cache filepath. No "shared_cache_path" is listed. 3. We were able to find the shared cache filepath, and it is included in the response, as above. I'm not using this information in lldb yet, but changes that build on this will be forthcoming. rdar://148939795 --- lldb/tools/debugserver/source/DNB.cpp | 2 +- .../debugserver/source/MacOSX/MachProcess.h | 14 +- .../debugserver/source/MacOSX/MachProcess.mm | 125 +++++++++++++++++- 3 files changed, 135 insertions(+), 6 deletions(-) diff --git a/lldb/tools/debugserver/source/DNB.cpp b/lldb/tools/debugserver/source/DNB.cpp index 0cd48d91a682a..4d5afcf93a44b 100644 --- a/lldb/tools/debugserver/source/DNB.cpp +++ b/lldb/tools/debugserver/source/DNB.cpp @@ -1101,7 +1101,7 @@ DNBGetLibrariesInfoForAddresses(nub_process_t pid, JSONGenerator::ObjectSP DNBGetSharedCacheInfo(nub_process_t pid) { MachProcessSP procSP; if (GetProcessSP(pid, procSP)) { - return procSP->GetSharedCacheInfo(pid); + return procSP->GetInferiorSharedCacheInfo(pid); } return JSONGenerator::ObjectSP(); } diff --git a/lldb/tools/debugserver/source/MacOSX/MachProcess.h b/lldb/tools/debugserver/source/MacOSX/MachProcess.h index 56bc9d6c7461e..67b27b9902999 100644 --- a/lldb/tools/debugserver/source/MacOSX/MachProcess.h +++ b/lldb/tools/debugserver/source/MacOSX/MachProcess.h @@ -283,7 +283,10 @@ class MachProcess { JSONGenerator::ObjectSP GetAllLoadedLibrariesInfos(nub_process_t pid, bool fetch_report_load_commands); - JSONGenerator::ObjectSP GetSharedCacheInfo(nub_process_t pid); + bool GetDebugserverSharedCacheInfo(uuid_t &uuid, + std::string &shared_cache_path); + bool GetInferiorSharedCacheFilepath(std::string &inferior_sc_path); + JSONGenerator::ObjectSP GetInferiorSharedCacheInfo(nub_process_t pid); nub_size_t GetNumThreads() const; nub_thread_t 
GetThreadAtIndex(nub_size_t thread_idx) const; @@ -474,6 +477,14 @@ class MachProcess { void *(*m_dyld_process_info_create)(task_t task, uint64_t timestamp, kern_return_t *kernelError); + void *(*m_dyld_process_create_for_task)(task_read_t task, kern_return_t *kr); + void *(*m_dyld_process_snapshot_create_for_process)(void *process, + kern_return_t *kr); + void *(*m_dyld_process_snapshot_get_shared_cache)(void *snapshot); + void (*m_dyld_shared_cache_for_each_file)( + void *cache, void (^block)(const char *file_path)); + void (*m_dyld_process_snapshot_dispose)(void *snapshot); + void (*m_dyld_process_dispose)(void *process); void (*m_dyld_process_info_for_each_image)( void *info, void (^callback)(uint64_t machHeaderAddress, const uuid_t uuid, const char *path)); @@ -481,6 +492,7 @@ class MachProcess { void (*m_dyld_process_info_get_cache)(void *info, void *cacheInfo); uint32_t (*m_dyld_process_info_get_platform)(void *info); void (*m_dyld_process_info_get_state)(void *info, void *stateInfo); + const char *(*m_dyld_shared_cache_file_path)(); }; #endif // LLDB_TOOLS_DEBUGSERVER_SOURCE_MACOSX_MACHPROCESS_H diff --git a/lldb/tools/debugserver/source/MacOSX/MachProcess.mm b/lldb/tools/debugserver/source/MacOSX/MachProcess.mm index 3b875e61a268d..10ed8045a9211 100644 --- a/lldb/tools/debugserver/source/MacOSX/MachProcess.mm +++ b/lldb/tools/debugserver/source/MacOSX/MachProcess.mm @@ -534,13 +534,35 @@ static bool FBSAddEventDataToOptions(NSMutableDictionary *options, m_image_infos_baton(NULL), m_sent_interrupt_signo(0), m_auto_resume_signo(0), m_did_exec(false), m_dyld_process_info_create(nullptr), + m_dyld_process_create_for_task(nullptr), + m_dyld_process_snapshot_create_for_process(nullptr), + m_dyld_process_snapshot_get_shared_cache(nullptr), + m_dyld_shared_cache_for_each_file(nullptr), + m_dyld_process_snapshot_dispose(nullptr), m_dyld_process_dispose(nullptr), m_dyld_process_info_for_each_image(nullptr), m_dyld_process_info_release(nullptr), 
m_dyld_process_info_get_cache(nullptr), - m_dyld_process_info_get_state(nullptr) { + m_dyld_process_info_get_state(nullptr), + m_dyld_shared_cache_file_path(nullptr) { m_dyld_process_info_create = (void *(*)(task_t task, uint64_t timestamp, kern_return_t * kernelError)) dlsym(RTLD_DEFAULT, "_dyld_process_info_create"); + + m_dyld_process_create_for_task = + (void *(*)(task_read_t, kern_return_t *))dlsym( + RTLD_DEFAULT, "dyld_process_create_for_task"); + m_dyld_process_snapshot_create_for_process = + (void *(*)(void *, kern_return_t *))dlsym( + RTLD_DEFAULT, "dyld_process_snapshot_create_for_process"); + m_dyld_process_snapshot_get_shared_cache = (void *(*)(void *))dlsym( + RTLD_DEFAULT, "dyld_process_snapshot_get_shared_cache"); + m_dyld_shared_cache_for_each_file = + (void (*)(void *, void (^)(const char *)))dlsym( + RTLD_DEFAULT, "dyld_shared_cache_for_each_file"); + m_dyld_process_snapshot_dispose = + (void (*)(void *))dlsym(RTLD_DEFAULT, "dyld_process_snapshot_dispose"); + m_dyld_process_dispose = + (void (*)(void *))dlsym(RTLD_DEFAULT, "dyld_process_dispose"); m_dyld_process_info_for_each_image = (void (*)(void *info, void (^)(uint64_t machHeaderAddress, const uuid_t uuid, const char *path))) @@ -553,6 +575,8 @@ static bool FBSAddEventDataToOptions(NSMutableDictionary *options, RTLD_DEFAULT, "_dyld_process_info_get_platform"); m_dyld_process_info_get_state = (void (*)(void *info, void *stateInfo))dlsym( RTLD_DEFAULT, "_dyld_process_info_get_state"); + m_dyld_shared_cache_file_path = + (const char *(*)())dlsym(RTLD_DEFAULT, "dyld_shared_cache_file_path"); DNBLogThreadedIf(LOG_PROCESS | LOG_VERBOSE, "%s", __PRETTY_FUNCTION__); } @@ -1179,13 +1203,82 @@ static bool mach_header_validity_test(uint32_t magic, uint32_t cputype) { /* report_load_commands = */ true); } -// From dyld's internal podyld_process_info.h: +bool MachProcess::GetDebugserverSharedCacheInfo( + uuid_t &uuid, std::string &shared_cache_path) { + uuid_clear(uuid); + shared_cache_path.clear(); + + 
if (m_dyld_process_info_create && m_dyld_process_info_get_cache) { + kern_return_t kern_ret; + dyld_process_info info = + m_dyld_process_info_create(mach_task_self(), 0, &kern_ret); + if (info) { + struct dyld_process_cache_info shared_cache_info; + m_dyld_process_info_get_cache(info, &shared_cache_info); + uuid_copy(uuid, shared_cache_info.cacheUUID); + m_dyld_process_info_release(info); + } + } + if (m_dyld_shared_cache_file_path) { + const char *cache_path = m_dyld_shared_cache_file_path(); + if (cache_path) + shared_cache_path = cache_path; + } + if (!uuid_is_null(uuid)) + return true; + return false; +} + +bool MachProcess::GetInferiorSharedCacheFilepath( + std::string &inferior_sc_path) { + inferior_sc_path.clear(); + + if (!m_dyld_process_create_for_task || + !m_dyld_process_snapshot_create_for_process || + !m_dyld_process_snapshot_get_shared_cache || + !m_dyld_shared_cache_for_each_file || !m_dyld_process_snapshot_dispose || + !m_dyld_process_dispose) + return false; + + __block std::string sc_path; + kern_return_t kr; + void *process = m_dyld_process_create_for_task(m_task.TaskPort(), &kr); + if (kr != KERN_SUCCESS) + return false; + void *snapshot = m_dyld_process_snapshot_create_for_process(process, &kr); + if (kr != KERN_SUCCESS) + return false; + void *cache = m_dyld_process_snapshot_get_shared_cache(snapshot); + + // The shared cache is a collection of files on disk, this callback + // will iterate over all of them. + // The first filepath provided is the base filename of the cache. 
+ __block bool done = false; + m_dyld_shared_cache_for_each_file(cache, ^(const char *path) { + if (done) { + return; + } + done = true; + sc_path = path; + }); + m_dyld_process_snapshot_dispose(snapshot); + m_dyld_process_dispose(process); + + inferior_sc_path = sc_path; + if (!sc_path.empty()) + return true; + return false; +} + +// From dyld's internal dyld_process_info.h: -JSONGenerator::ObjectSP MachProcess::GetSharedCacheInfo(nub_process_t pid) { +JSONGenerator::ObjectSP +MachProcess::GetInferiorSharedCacheInfo(nub_process_t pid) { JSONGenerator::DictionarySP reply_sp(new JSONGenerator::Dictionary()); - kern_return_t kern_ret; + uuid_t inferior_sc_uuid; if (m_dyld_process_info_create && m_dyld_process_info_get_cache) { + kern_return_t kern_ret; dyld_process_info info = m_dyld_process_info_create(m_task.TaskPort(), 0, &kern_ret); if (info) { @@ -1197,6 +1290,7 @@ static bool mach_header_validity_test(uint32_t magic, uint32_t cputype) { uuid_string_t uuidstr; uuid_unparse_upper(shared_cache_info.cacheUUID, uuidstr); + uuid_copy(inferior_sc_uuid, shared_cache_info.cacheUUID); reply_sp->AddStringItem("shared_cache_uuid", uuidstr); reply_sp->AddBooleanItem("no_shared_cache", shared_cache_info.noCache); @@ -1206,6 +1300,29 @@ static bool mach_header_validity_test(uint32_t magic, uint32_t cputype) { m_dyld_process_info_release(info); } } + + // If debugserver and the inferior are have the same cache UUID, + // use the simple call to get the filepath to debugserver's shared + // cache, return that. 
+ uuid_t debugserver_sc_uuid; + std::string debugserver_sc_path; + bool found_sc_filepath = false; + if (GetDebugserverSharedCacheInfo(debugserver_sc_uuid, debugserver_sc_path)) { + if (uuid_compare(inferior_sc_uuid, debugserver_sc_uuid) == 0 && + !debugserver_sc_path.empty()) { + reply_sp->AddStringItem("shared_cache_path", debugserver_sc_path); + found_sc_filepath = true; + } + } + + // Use SPI that are only available on newer OSes to fetch the + // filepath of the shared cache of the inferior, if available. + if (!found_sc_filepath) { + std::string inferior_sc_path; + if (GetInferiorSharedCacheFilepath(inferior_sc_path)) + reply_sp->AddStringItem("shared_cache_path", inferior_sc_path); + } + return reply_sp; } From 9416b19e4f3b471216dcc3fcabac98f2a430faea Mon Sep 17 00:00:00 2001 From: Yingwei Zheng Date: Mon, 1 Dec 2025 15:20:45 +0800 Subject: [PATCH 03/39] [InstCombine] Add missing constant check (#170068) `cast` is not guarded by a type check during canonicalization of predicates. This patch adds a type check in the outer if to avoid the crash. `dyn_cast` may introduce another nested if, so I just use `isa` instead. Address the crash reported in https://github.com/llvm/llvm-project/pull/153053#issuecomment-3593914124. --- .../Transforms/InstCombine/InstCombineSelect.cpp | 1 + .../Transforms/InstCombine/saturating-add-sub.ll | 16 ++++++++++++++++ 2 files changed, 17 insertions(+) diff --git a/llvm/lib/Transforms/InstCombine/InstCombineSelect.cpp b/llvm/lib/Transforms/InstCombine/InstCombineSelect.cpp index e7dc366b13798..c9f51e4b294b1 100644 --- a/llvm/lib/Transforms/InstCombine/InstCombineSelect.cpp +++ b/llvm/lib/Transforms/InstCombine/InstCombineSelect.cpp @@ -1163,6 +1163,7 @@ static Value *canonicalizeSaturatedAddSigned(ICmpInst *Cmp, Value *TVal, // (X >= Y) ? 
INT_MAX : (X + C) --> sadd.sat(X, C) // where Y is INT_MAX - C or INT_MAX - C - 1, and C > 0 if ((Pred == ICmpInst::ICMP_SGT || Pred == ICmpInst::ICMP_SGE) && + isa(Cmp1) && match(FVal, m_Add(m_Specific(Cmp0), m_StrictlyPositive(C)))) { APInt IntMax = APInt::getSignedMaxValue(Cmp1->getType()->getScalarSizeInBits()); diff --git a/llvm/test/Transforms/InstCombine/saturating-add-sub.ll b/llvm/test/Transforms/InstCombine/saturating-add-sub.ll index c0ad5818e448a..1294f867f07c0 100644 --- a/llvm/test/Transforms/InstCombine/saturating-add-sub.ll +++ b/llvm/test/Transforms/InstCombine/saturating-add-sub.ll @@ -2671,3 +2671,19 @@ define i8 @neg_neg_constant(i8 %x, i8 %y) { %s = select i1 %cmp, i8 127, i8 %d ret i8 %s } + +; Make sure we don't crash in this case. +define i32 @pr153053_strict_pred_with_nonconstant_rhs(i32 %x, i32 %y) { +; CHECK-LABEL: @pr153053_strict_pred_with_nonconstant_rhs( +; CHECK-NEXT: entry: +; CHECK-NEXT: [[CMP:%.*]] = icmp slt i32 [[X:%.*]], [[Y:%.*]] +; CHECK-NEXT: [[ADD:%.*]] = add i32 [[X]], 1 +; CHECK-NEXT: [[RES:%.*]] = select i1 [[CMP]], i32 [[ADD]], i32 2147483647 +; CHECK-NEXT: ret i32 [[RES]] +; +entry: + %cmp = icmp slt i32 %x, %y + %add = add i32 %x, 1 + %res = select i1 %cmp, i32 %add, i32 2147483647 + ret i32 %res +} From dc5ce79cc143e2e33e9cabbaa41349199b919cda Mon Sep 17 00:00:00 2001 From: Luke Lau Date: Mon, 1 Dec 2025 15:22:45 +0800 Subject: [PATCH 04/39] [LV] Regenerate some check lines. 
NFC The scalar loop doesn't exist anymore after 8907b6d39371d439461cdd3475d5590f87821377 --- .../Transforms/LoopVectorize/struct-return.ll | 28 +++++++++++-------- 1 file changed, 16 insertions(+), 12 deletions(-) diff --git a/llvm/test/Transforms/LoopVectorize/struct-return.ll b/llvm/test/Transforms/LoopVectorize/struct-return.ll index f2e2e2846614b..70c6c7e900c51 100644 --- a/llvm/test/Transforms/LoopVectorize/struct-return.ll +++ b/llvm/test/Transforms/LoopVectorize/struct-return.ll @@ -29,8 +29,9 @@ define void @struct_return_f32_widen(ptr noalias %in, ptr noalias writeonly %out ; CHECK-NEXT: [[TMP6:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1024 ; CHECK-NEXT: br i1 [[TMP6]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]] ; CHECK: [[MIDDLE_BLOCK]]: -; CHECK-NEXT: br [[EXIT:label %.*]] -; CHECK: [[SCALAR_PH:.*:]] +; CHECK-NEXT: br label %[[EXIT:.*]] +; CHECK: [[EXIT]]: +; CHECK-NEXT: ret void ; entry: br label %for.body @@ -77,8 +78,9 @@ define void @struct_return_f64_widen(ptr noalias %in, ptr noalias writeonly %out ; CHECK-NEXT: [[TMP6:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1024 ; CHECK-NEXT: br i1 [[TMP6]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP3:![0-9]+]] ; CHECK: [[MIDDLE_BLOCK]]: -; CHECK-NEXT: br [[EXIT:label %.*]] -; CHECK: [[SCALAR_PH:.*:]] +; CHECK-NEXT: br label %[[EXIT:.*]] +; CHECK: [[EXIT]]: +; CHECK-NEXT: ret void ; entry: br label %for.body @@ -232,8 +234,9 @@ define void @struct_return_i32_three_results_widen(ptr noalias %in, ptr noalias ; CHECK-NEXT: [[TMP4:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1024 ; CHECK-NEXT: br i1 [[TMP4]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP6:![0-9]+]] ; CHECK: [[MIDDLE_BLOCK]]: -; CHECK-NEXT: br [[EXIT:label %.*]] -; CHECK: [[SCALAR_PH:.*:]] +; CHECK-NEXT: br label %[[EXIT:.*]] +; CHECK: [[EXIT]]: +; CHECK-NEXT: ret void ; entry: br label %for.body @@ -273,7 +276,7 @@ define void @scalarized_predicated_struct_return(ptr %a) { ; 
CHECK-NEXT: br i1 [[TMP2]], label %[[PRED_STORE_IF:.*]], label %[[PRED_STORE_CONTINUE:.*]] ; CHECK: [[PRED_STORE_IF]]: ; CHECK-NEXT: [[TMP3:%.*]] = extractelement <2 x i64> [[WIDE_LOAD]], i32 0 -; CHECK-NEXT: [[TMP4:%.*]] = tail call { i64, i64 } @bar_i64(i64 [[TMP3]]) #[[ATTR4:[0-9]+]] +; CHECK-NEXT: [[TMP4:%.*]] = tail call { i64, i64 } @bar_i64(i64 [[TMP3]]) #[[ATTR2:[0-9]+]] ; CHECK-NEXT: [[TMP5:%.*]] = extractvalue { i64, i64 } [[TMP4]], 0 ; CHECK-NEXT: [[TMP6:%.*]] = extractelement <2 x i64> [[WIDE_LOAD]], i32 0 ; CHECK-NEXT: [[TMP7:%.*]] = udiv i64 [[TMP5]], [[TMP6]] @@ -286,7 +289,7 @@ define void @scalarized_predicated_struct_return(ptr %a) { ; CHECK-NEXT: br i1 [[TMP10]], label %[[PRED_STORE_IF1:.*]], label %[[PRED_STORE_CONTINUE2]] ; CHECK: [[PRED_STORE_IF1]]: ; CHECK-NEXT: [[TMP11:%.*]] = extractelement <2 x i64> [[WIDE_LOAD]], i32 1 -; CHECK-NEXT: [[TMP12:%.*]] = tail call { i64, i64 } @bar_i64(i64 [[TMP11]]) #[[ATTR4]] +; CHECK-NEXT: [[TMP12:%.*]] = tail call { i64, i64 } @bar_i64(i64 [[TMP11]]) #[[ATTR2]] ; CHECK-NEXT: [[TMP13:%.*]] = extractvalue { i64, i64 } [[TMP12]], 0 ; CHECK-NEXT: [[TMP14:%.*]] = extractelement <2 x i64> [[WIDE_LOAD]], i32 1 ; CHECK-NEXT: [[TMP15:%.*]] = udiv i64 [[TMP13]], [[TMP14]] @@ -299,8 +302,9 @@ define void @scalarized_predicated_struct_return(ptr %a) { ; CHECK-NEXT: [[TMP18:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1024 ; CHECK-NEXT: br i1 [[TMP18]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP7:![0-9]+]] ; CHECK: [[MIDDLE_BLOCK]]: -; CHECK-NEXT: br [[EXIT:label %.*]] -; CHECK: [[SCALAR_PH:.*:]] +; CHECK-NEXT: br label %[[EXIT:.*]] +; CHECK: [[EXIT]]: +; CHECK-NEXT: ret void ; entry: br label %for.body @@ -385,7 +389,7 @@ define void @negative_mixed_element_type_struct_return(ptr noalias %in, ptr noal ; CHECK-NEXT: [[IV:%.*]] = phi i64 [ 0, %[[ENTRY]] ], [ [[IV_NEXT:%.*]], %[[FOR_BODY]] ] ; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds float, ptr [[IN]], i64 [[IV]] ; CHECK-NEXT: 
[[IN_VAL:%.*]] = load float, ptr [[ARRAYIDX]], align 4 -; CHECK-NEXT: [[CALL:%.*]] = tail call { float, i32 } @baz(float [[IN_VAL]]) #[[ATTR5:[0-9]+]] +; CHECK-NEXT: [[CALL:%.*]] = tail call { float, i32 } @baz(float [[IN_VAL]]) #[[ATTR3:[0-9]+]] ; CHECK-NEXT: [[EXTRACT_A:%.*]] = extractvalue { float, i32 } [[CALL]], 0 ; CHECK-NEXT: [[EXTRACT_B:%.*]] = extractvalue { float, i32 } [[CALL]], 1 ; CHECK-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds float, ptr [[OUT_A]], i64 [[IV]] @@ -433,7 +437,7 @@ define void @negative_named_struct_return(ptr noalias readonly %in, ptr noalias ; CHECK-NEXT: [[IV:%.*]] = phi i64 [ 0, %[[ENTRY]] ], [ [[IV_NEXT:%.*]], %[[FOR_BODY]] ] ; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds double, ptr [[IN]], i64 [[IV]] ; CHECK-NEXT: [[IN_VAL:%.*]] = load double, ptr [[ARRAYIDX]], align 8 -; CHECK-NEXT: [[CALL:%.*]] = tail call [[NAMED_STRUCT:%.*]] @[[BAR_NAMED:[a-zA-Z0-9_$\"\\.-]*[a-zA-Z_$\"\\.-][a-zA-Z0-9_$\"\\.-]*]](double [[IN_VAL]]) #[[ATTR6:[0-9]+]] +; CHECK-NEXT: [[CALL:%.*]] = tail call [[NAMED_STRUCT:%.*]] @[[BAR_NAMED:[a-zA-Z0-9_$\"\\.-]*[a-zA-Z_$\"\\.-][a-zA-Z0-9_$\"\\.-]*]](double [[IN_VAL]]) #[[ATTR4:[0-9]+]] ; CHECK-NEXT: [[EXTRACT_A:%.*]] = extractvalue [[NAMED_STRUCT]] [[CALL]], 0 ; CHECK-NEXT: [[EXTRACT_B:%.*]] = extractvalue [[NAMED_STRUCT]] [[CALL]], 1 ; CHECK-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds double, ptr [[OUT_A]], i64 [[IV]] From bbb0dbadfaf292766922f5914f1c8946e2ef8519 Mon Sep 17 00:00:00 2001 From: Timm Baeder Date: Mon, 1 Dec 2025 08:33:54 +0100 Subject: [PATCH 05/39] [clang][AST] Add `RecordDecl::getNumFields()` (#170022) Not sure why that didn't exist yet, but we have quite a few places using the same `std::distance` pattern. 
--- clang/include/clang/AST/Decl.h | 5 +++++ clang/lib/AST/ComparisonCategories.cpp | 2 +- clang/lib/AST/ExprConstant.cpp | 19 ++++++++----------- clang/lib/CodeGen/CGHLSLRuntime.cpp | 6 ++---- clang/lib/Sema/CodeCompleteConsumer.cpp | 3 +-- 5 files changed, 17 insertions(+), 18 deletions(-) diff --git a/clang/include/clang/AST/Decl.h b/clang/include/clang/AST/Decl.h index ee2321dd158d4..5394b2558b407 100644 --- a/clang/include/clang/AST/Decl.h +++ b/clang/include/clang/AST/Decl.h @@ -4524,6 +4524,11 @@ class RecordDecl : public TagDecl { return field_begin() == field_end(); } + /// Returns the number of fields (non-static data members) in this record. + unsigned getNumFields() const { + return std::distance(field_begin(), field_end()); + } + /// noload_fields - Iterate over the fields stored in this record /// that are currently loaded; don't attempt to retrieve anything /// from an external source. diff --git a/clang/lib/AST/ComparisonCategories.cpp b/clang/lib/AST/ComparisonCategories.cpp index 0c7a7f4eacbbf..1b9c938e2ace3 100644 --- a/clang/lib/AST/ComparisonCategories.cpp +++ b/clang/lib/AST/ComparisonCategories.cpp @@ -49,7 +49,7 @@ bool ComparisonCategoryInfo::ValueInfo::hasValidIntValue() const { // Before we attempt to get the value of the first field, ensure that we // actually have one (and only one) field. 
const auto *Record = VD->getType()->getAsCXXRecordDecl(); - if (std::distance(Record->field_begin(), Record->field_end()) != 1 || + if (Record->getNumFields() != 1 || !Record->field_begin()->getType()->isIntegralOrEnumerationType()) return false; diff --git a/clang/lib/AST/ExprConstant.cpp b/clang/lib/AST/ExprConstant.cpp index b986ee6ca4fa3..e5af4cb049ba9 100644 --- a/clang/lib/AST/ExprConstant.cpp +++ b/clang/lib/AST/ExprConstant.cpp @@ -3971,8 +3971,7 @@ static bool constructAggregate(EvalInfo &Info, const FPOptions FPO, if (auto *CXXRD = dyn_cast(RD)) NumBases = CXXRD->getNumBases(); - *Res = APValue(APValue::UninitStruct(), NumBases, - std::distance(RD->field_begin(), RD->field_end())); + *Res = APValue(APValue::UninitStruct(), NumBases, RD->getNumFields()); SmallVector> ReverseList; // we need to traverse backwards @@ -5529,8 +5528,8 @@ static bool handleDefaultInitValue(QualType T, APValue &Result) { Result = APValue((const FieldDecl *)nullptr); return true; } - Result = APValue(APValue::UninitStruct(), RD->getNumBases(), - std::distance(RD->field_begin(), RD->field_end())); + Result = + APValue(APValue::UninitStruct(), RD->getNumBases(), RD->getNumFields()); unsigned Index = 0; for (CXXRecordDecl::base_class_const_iterator I = RD->bases_begin(), @@ -7184,7 +7183,7 @@ static bool HandleConstructorCall(const Expr *E, const LValue &This, if (!Result.hasValue()) { if (!RD->isUnion()) Result = APValue(APValue::UninitStruct(), RD->getNumBases(), - std::distance(RD->field_begin(), RD->field_end())); + RD->getNumFields()); else // A union starts with no active member. Result = APValue((const FieldDecl*)nullptr); @@ -8135,8 +8134,7 @@ class BufferToAPValueConverter { if (auto *CXXRD = dyn_cast(RD)) NumBases = CXXRD->getNumBases(); - APValue ResultVal(APValue::UninitStruct(), NumBases, - std::distance(RD->field_begin(), RD->field_end())); + APValue ResultVal(APValue::UninitStruct(), NumBases, RD->getNumFields()); // Visit the base classes. 
if (auto *CXXRD = dyn_cast(RD)) { @@ -11146,7 +11144,7 @@ static bool HandleClassZeroInitialization(EvalInfo &Info, const Expr *E, assert(!RD->isUnion() && "Expected non-union class type"); const CXXRecordDecl *CD = dyn_cast(RD); Result = APValue(APValue::UninitStruct(), CD ? CD->getNumBases() : 0, - std::distance(RD->field_begin(), RD->field_end())); + RD->getNumFields()); if (RD->isInvalidDecl()) return false; const ASTRecordLayout &Layout = Info.Ctx.getASTRecordLayout(RD); @@ -11342,7 +11340,7 @@ bool RecordExprEvaluator::VisitCXXParenListOrInitListExpr( if (!Result.hasValue()) Result = APValue(APValue::UninitStruct(), CXXRD ? CXXRD->getNumBases() : 0, - std::distance(RD->field_begin(), RD->field_end())); + RD->getNumFields()); unsigned ElementNo = 0; bool Success = true; @@ -11549,8 +11547,7 @@ bool RecordExprEvaluator::VisitLambdaExpr(const LambdaExpr *E) { if (ClosureClass->isInvalidDecl()) return false; - const size_t NumFields = - std::distance(ClosureClass->field_begin(), ClosureClass->field_end()); + const size_t NumFields = ClosureClass->getNumFields(); assert(NumFields == (size_t)std::distance(E->capture_init_begin(), E->capture_init_end()) && diff --git a/clang/lib/CodeGen/CGHLSLRuntime.cpp b/clang/lib/CodeGen/CGHLSLRuntime.cpp index f5c07fe2e33ff..bbe85986b07fc 100644 --- a/clang/lib/CodeGen/CGHLSLRuntime.cpp +++ b/clang/lib/CodeGen/CGHLSLRuntime.cpp @@ -816,8 +816,7 @@ CGHLSLRuntime::handleStructSemanticLoad( const llvm::StructType *ST = cast(Type); const clang::RecordDecl *RD = Decl->getType()->getAsRecordDecl(); - assert(std::distance(RD->field_begin(), RD->field_end()) == - ST->getNumElements()); + assert(RD->getNumFields() == ST->getNumElements()); llvm::Value *Aggregate = llvm::PoisonValue::get(Type); auto FieldDecl = RD->field_begin(); @@ -849,8 +848,7 @@ CGHLSLRuntime::handleStructSemanticStore( RD = Decl->getType()->getAsRecordDecl(); assert(RD); - assert(std::distance(RD->field_begin(), RD->field_end()) == - ST->getNumElements()); + 
assert(RD->getNumFields() == ST->getNumElements()); auto FieldDecl = RD->field_begin(); for (unsigned I = 0; I < ST->getNumElements(); ++I) { diff --git a/clang/lib/Sema/CodeCompleteConsumer.cpp b/clang/lib/Sema/CodeCompleteConsumer.cpp index e3fc7c11f4594..50a552272f421 100644 --- a/clang/lib/Sema/CodeCompleteConsumer.cpp +++ b/clang/lib/Sema/CodeCompleteConsumer.cpp @@ -539,8 +539,7 @@ unsigned CodeCompleteConsumer::OverloadCandidate::getNumParams() const { return Template->getTemplateParameters()->size(); if (Kind == CK_Aggregate) { - unsigned Count = - std::distance(AggregateType->field_begin(), AggregateType->field_end()); + unsigned Count = AggregateType->getNumFields(); if (const auto *CRD = dyn_cast(AggregateType)) Count += CRD->getNumBases(); return Count; From a751ed97acf1ea760d6724bc6ea72b1b9b59a448 Mon Sep 17 00:00:00 2001 From: Vasily Leonenko Date: Mon, 1 Dec 2025 10:55:00 +0300 Subject: [PATCH 06/39] [BOLT] Support runtime library hook via DT_INIT_ARRAY (#167467) Major part of this PR is commit implementing support for DT_INIT_ARRAY for BOLT runtime libraries initialization. Also, it adds related hook-init test & fixes couple of X86 instrumentation tests. This commit follows implementation of instrumentation hook via DT_FINI_ARRAY (https://github.com/llvm/llvm-project/pull/67348) and extends it for BOLT runtime libraries (including instrumentation library) initialization hooking. Initialization has has differences compared to finalization: - Executables always use ELF entry point address. Update code checks it and updates init_array entry if ELF is shared library (have no interp entry) and have no DT_INIT entry. Also this commit introduces "runtime-lib-init-hook" option to select primary initialization hook (entry_point, init, init_array) with fall back to next available hook in input binary. e.g. in case of libc we can explicitly set it to init_array. 
- Shared library init_array entries relocations usually has R_AARCH64_ABS64 type on AArch64 binaries. We check relocation type and adjust methods for reading init_array relocations in discovery and update methods. --------- Co-authored-by: Vasily Leonenko --- bolt/docs/CommandLineArgumentReference.md | 9 + bolt/include/bolt/Core/BinaryContext.h | 9 + bolt/include/bolt/Rewrite/RewriteInstance.h | 11 +- bolt/lib/Rewrite/RewriteInstance.cpp | 233 ++++++++++++++++-- bolt/test/AArch64/hook-fini.s | 14 +- bolt/test/AArch64/hook-init.s | 221 +++++++++++++++++ bolt/test/X86/hook-init.s | 221 +++++++++++++++++ bolt/test/X86/internal-call-instrument-so.s | 9 +- .../runtime/X86/instrument-wrong-target.s | 7 + 9 files changed, 708 insertions(+), 26 deletions(-) create mode 100644 bolt/test/AArch64/hook-init.s create mode 100644 bolt/test/X86/hook-init.s diff --git a/bolt/docs/CommandLineArgumentReference.md b/bolt/docs/CommandLineArgumentReference.md index 7c6e01d669b74..0dbf6f59d5e88 100644 --- a/bolt/docs/CommandLineArgumentReference.md +++ b/bolt/docs/CommandLineArgumentReference.md @@ -811,6 +811,15 @@ Specify file name of the runtime instrumentation library +- `--runtime-lib-init-hook=` + + Primary target for hooking runtime library initialization, used in + fallback order of availability in input binary (entry_point -> init + -> init_array) (default: entry_point) + - `entry_point`: use ELF Header Entry Point + - `init`: use ELF DT_INIT entry + - `init_array`: use ELF .init_array entry + - `--sctc-mode=` Mode for simplify conditional tail calls diff --git a/bolt/include/bolt/Core/BinaryContext.h b/bolt/include/bolt/Core/BinaryContext.h index 2af1d330b7545..8a90febcea3cc 100644 --- a/bolt/include/bolt/Core/BinaryContext.h +++ b/bolt/include/bolt/Core/BinaryContext.h @@ -807,6 +807,15 @@ class BinaryContext { /// the execution of the binary is completed. std::optional FiniFunctionAddress; + /// DT_INIT. + std::optional InitAddress; + + /// DT_INIT_ARRAY. 
Only used when DT_INIT is not set. + std::optional InitArrayAddress; + + /// DT_INIT_ARRAYSZ. Only used when DT_INIT is not set. + std::optional InitArraySize; + /// DT_FINI. std::optional FiniAddress; diff --git a/bolt/include/bolt/Rewrite/RewriteInstance.h b/bolt/include/bolt/Rewrite/RewriteInstance.h index 35abf6b4d4ddd..5950b3c1630e1 100644 --- a/bolt/include/bolt/Rewrite/RewriteInstance.h +++ b/bolt/include/bolt/Rewrite/RewriteInstance.h @@ -93,14 +93,23 @@ class RewriteInstance { /// section allocations if found. void discoverBOLTReserved(); + /// Check whether we should use DT_INIT or DT_INIT_ARRAY for instrumentation. + /// DT_INIT is preferred; DT_INIT_ARRAY is only used when no DT_INIT entry was + /// found. + Error discoverRtInitAddress(); + /// Check whether we should use DT_FINI or DT_FINI_ARRAY for instrumentation. /// DT_FINI is preferred; DT_FINI_ARRAY is only used when no DT_FINI entry was /// found. Error discoverRtFiniAddress(); + /// If DT_INIT_ARRAY is used for instrumentation, update the relocation of its + /// first entry to point to the instrumentation library's init address. + Error updateRtInitReloc(); + /// If DT_FINI_ARRAY is used for instrumentation, update the relocation of its /// first entry to point to the instrumentation library's fini address. - void updateRtFiniReloc(); + Error updateRtFiniReloc(); /// Create and initialize metadata rewriters for this instance. 
void initializeMetadataManager(); diff --git a/bolt/lib/Rewrite/RewriteInstance.cpp b/bolt/lib/Rewrite/RewriteInstance.cpp index 8a5bbe28e5f66..1c6244b2d2bf8 100644 --- a/bolt/lib/Rewrite/RewriteInstance.cpp +++ b/bolt/lib/Rewrite/RewriteInstance.cpp @@ -294,6 +294,28 @@ cl::bits<GadgetScannerKind> GadgetScannersToRun( clEnumValN(GS_ALL, "all", "All implemented scanners")), cl::ZeroOrMore, cl::CommaSeparated, cl::cat(BinaryAnalysisCategory)); +// Primary targets for hooking runtime library initialization, +// with fallback to the next item in case the current item is not available +// in the input binary. +enum RuntimeLibInitHookTarget : char { + RLIH_ENTRY_POINT = 0, /// Use ELF Header Entry Point + RLIH_INIT = 1, /// Use ELF DT_INIT entry + RLIH_INIT_ARRAY = 2, /// Use ELF .init_array entry +}; + +cl::opt<RuntimeLibInitHookTarget> RuntimeLibInitHook( + "runtime-lib-init-hook", + cl::desc("Primary target for hooking runtime library initialization, used " + "in fallback order of availability in input binary (entry_point -> " + "init -> init_array) (default: entry_point)"), + cl::Hidden, cl::init(RLIH_ENTRY_POINT), + cl::values(clEnumValN(RLIH_ENTRY_POINT, "entry_point", + "use ELF Header Entry Point"), + clEnumValN(RLIH_INIT, "init", "use ELF DT_INIT entry"), + clEnumValN(RLIH_INIT_ARRAY, "init_array", + "use ELF .init_array entry")), + cl::ZeroOrMore, cl::cat(BoltOptCategory)); + } // namespace opts // FIXME: implement a better way to mark sections for replacement.
@@ -741,9 +763,12 @@ Error RewriteInstance::run() { adjustCommandLineOptions(); discoverFileObjects(); - if (opts::Instrument && !BC->IsStaticExecutable) + if (opts::Instrument && !BC->IsStaticExecutable) { + if (Error E = discoverRtInitAddress()) + return E; if (Error E = discoverRtFiniAddress()) return E; + } preprocessProfileData(); @@ -785,8 +810,12 @@ Error RewriteInstance::run() { updateMetadata(); - if (opts::Instrument && !BC->IsStaticExecutable) - updateRtFiniReloc(); + if (opts::Instrument && !BC->IsStaticExecutable) { + if (Error E = updateRtInitReloc()) + return E; + if (Error E = updateRtFiniReloc()) + return E; + } if (opts::OutputFilename == "/dev/null") { BC->outs() << "BOLT-INFO: skipping writing final binary to disk\n"; @@ -1411,6 +1440,65 @@ void RewriteInstance::discoverBOLTReserved() { NextAvailableAddress = BC->BOLTReserved.start(); } +Error RewriteInstance::discoverRtInitAddress() { + if (BC->HasInterpHeader && opts::RuntimeLibInitHook == opts::RLIH_ENTRY_POINT) + return Error::success(); + + // Use DT_INIT if it's available. 
+ if (BC->InitAddress && opts::RuntimeLibInitHook <= opts::RLIH_INIT) { + BC->StartFunctionAddress = BC->InitAddress; + return Error::success(); + } + + if (!BC->InitArrayAddress || !BC->InitArraySize) { + return createStringError(std::errc::not_supported, + "Instrumentation of shared library needs either " + "DT_INIT or DT_INIT_ARRAY"); + } + + if (*BC->InitArraySize < BC->AsmInfo->getCodePointerSize()) { + return createStringError(std::errc::not_supported, + "Need at least 1 DT_INIT_ARRAY slot"); + } + + ErrorOr InitArraySection = + BC->getSectionForAddress(*BC->InitArrayAddress); + if (auto EC = InitArraySection.getError()) + return errorCodeToError(EC); + + if (InitArraySection->getAddress() != *BC->InitArrayAddress) { + return createStringError(std::errc::not_supported, + "Inconsistent address of .init_array section"); + } + + if (const Relocation *Reloc = InitArraySection->getDynamicRelocationAt(0)) { + if (Reloc->isRelative()) { + BC->StartFunctionAddress = Reloc->Addend; + } else { + MCSymbol *Sym = Reloc->Symbol; + if (!Sym) + return createStringError( + std::errc::not_supported, + "Failed to locate symbol for 0 entry of .init_array"); + const BinaryFunction *BF = BC->getFunctionForSymbol(Sym); + if (!BF) + return createStringError( + std::errc::not_supported, + "Failed to locate binary function for 0 entry of .init_array"); + BC->StartFunctionAddress = BF->getAddress() + Reloc->Addend; + } + return Error::success(); + } + + if (const Relocation *Reloc = InitArraySection->getRelocationAt(0)) { + BC->StartFunctionAddress = Reloc->Value; + return Error::success(); + } + + return createStringError(std::errc::not_supported, + "No relocation for first DT_INIT_ARRAY slot"); +} + Error RewriteInstance::discoverRtFiniAddress() { // Use DT_FINI if it's available. 
if (BC->FiniAddress) { @@ -1434,6 +1522,11 @@ Error RewriteInstance::discoverRtFiniAddress() { if (auto EC = FiniArraySection.getError()) return errorCodeToError(EC); + if (FiniArraySection->getAddress() != *BC->FiniArrayAddress) { + return createStringError(std::errc::not_supported, + "Inconsistent address of .fini_array section"); + } + if (const Relocation *Reloc = FiniArraySection->getDynamicRelocationAt(0)) { BC->FiniFunctionAddress = Reloc->Addend; return Error::success(); @@ -1448,26 +1541,95 @@ Error RewriteInstance::discoverRtFiniAddress() { "No relocation for first DT_FINI_ARRAY slot"); } -void RewriteInstance::updateRtFiniReloc() { +Error RewriteInstance::updateRtInitReloc() { + if (BC->HasInterpHeader && opts::RuntimeLibInitHook == opts::RLIH_ENTRY_POINT) + return Error::success(); + + // Updating DT_INIT is handled by patchELFDynamic. + if (BC->InitAddress && opts::RuntimeLibInitHook <= opts::RLIH_INIT) + return Error::success(); + + const RuntimeLibrary *RT = BC->getRuntimeLibrary(); + if (!RT || !RT->getRuntimeStartAddress()) + return Error::success(); + + if (!BC->InitArrayAddress) + return Error::success(); + + if (!BC->InitArrayAddress || !BC->InitArraySize) + return createStringError(std::errc::not_supported, + "inconsistent .init_array state"); + + ErrorOr InitArraySection = + BC->getSectionForAddress(*BC->InitArrayAddress); + if (!InitArraySection) + return createStringError(std::errc::not_supported, ".init_array removed"); + + if (std::optional Reloc = + InitArraySection->takeDynamicRelocationAt(0)) { + if (Reloc->isRelative()) { + if (Reloc->Addend != BC->StartFunctionAddress) + return createStringError(std::errc::not_supported, + "inconsistent .init_array dynamic relocation"); + Reloc->Addend = RT->getRuntimeStartAddress(); + InitArraySection->addDynamicRelocation(*Reloc); + } else { + MCSymbol *Sym = Reloc->Symbol; + if (!Sym) + return createStringError( + std::errc::not_supported, + "Failed to locate symbol for 0 entry of .init_array"); + 
const BinaryFunction *BF = BC->getFunctionForSymbol(Sym); + if (!BF) + return createStringError( + std::errc::not_supported, + "Failed to locate binary function for 0 entry of .init_array"); + if (BF->getAddress() + Reloc->Addend != BC->StartFunctionAddress) + return createStringError(std::errc::not_supported, + "inconsistent .init_array dynamic relocation"); + InitArraySection->addDynamicRelocation(Relocation{ + /*Offset*/ 0, /*Symbol*/ nullptr, /*Type*/ Relocation::getAbs64(), + /*Addend*/ RT->getRuntimeStartAddress(), /*Value*/ 0}); + } + } + // Update the static relocation by adding a pending relocation which will get + // patched when flushPendingRelocations is called in rewriteFile. Note that + // flushPendingRelocations will calculate the value to patch as + // "Symbol + Addend". Since we don't have a symbol, just set the addend to the + // desired value. + InitArraySection->addPendingRelocation(Relocation{ + /*Offset*/ 0, /*Symbol*/ nullptr, /*Type*/ Relocation::getAbs64(), + /*Addend*/ RT->getRuntimeStartAddress(), /*Value*/ 0}); + BC->outs() + << "BOLT-INFO: runtime library initialization was hooked via .init_array " + "entry, set to 0x" + << Twine::utohexstr(RT->getRuntimeStartAddress()) << "\n"; + return Error::success(); +} + +Error RewriteInstance::updateRtFiniReloc() { // Updating DT_FINI is handled by patchELFDynamic. 
if (BC->FiniAddress) - return; + return Error::success(); const RuntimeLibrary *RT = BC->getRuntimeLibrary(); if (!RT || !RT->getRuntimeFiniAddress()) - return; + return Error::success(); - assert(BC->FiniArrayAddress && BC->FiniArraySize && - "inconsistent .fini_array state"); + if (!BC->FiniArrayAddress || !BC->FiniArraySize) + return createStringError(std::errc::not_supported, + "inconsistent .fini_array state"); ErrorOr FiniArraySection = BC->getSectionForAddress(*BC->FiniArrayAddress); - assert(FiniArraySection && ".fini_array removed"); + if (!FiniArraySection) + return createStringError(std::errc::not_supported, ".fini_array removed"); if (std::optional Reloc = FiniArraySection->takeDynamicRelocationAt(0)) { - assert(Reloc->Addend == BC->FiniFunctionAddress && - "inconsistent .fini_array dynamic relocation"); + if (Reloc->Addend != BC->FiniFunctionAddress) + return createStringError(std::errc::not_supported, + "inconsistent .fini_array dynamic relocation"); Reloc->Addend = RT->getRuntimeFiniAddress(); FiniArraySection->addDynamicRelocation(*Reloc); } @@ -1480,6 +1642,10 @@ void RewriteInstance::updateRtFiniReloc() { FiniArraySection->addPendingRelocation(Relocation{ /*Offset*/ 0, /*Symbol*/ nullptr, /*Type*/ Relocation::getAbs64(), /*Addend*/ RT->getRuntimeFiniAddress(), /*Value*/ 0}); + BC->outs() << "BOLT-INFO: runtime library finalization was hooked via " + ".fini_array entry, set to 0x" + << Twine::utohexstr(RT->getRuntimeFiniAddress()) << "\n"; + return Error::success(); } void RewriteInstance::registerFragments() { @@ -2178,6 +2344,14 @@ void RewriteInstance::adjustCommandLineOptions() { exit(1); } + if (opts::Instrument && opts::RuntimeLibInitHook == opts::RLIH_ENTRY_POINT && + !BC->HasInterpHeader) { + BC->errs() + << "BOLT-WARNING: adjusted runtime-lib-init-hook to 'init' due to " + "absence of INTERP header\n"; + opts::RuntimeLibInitHook = opts::RLIH_INIT; + } + if (opts::HotText && opts::HotTextMoveSections.getNumOccurrences() == 0) { 
opts::HotTextMoveSections.addValue(".stub"); opts::HotTextMoveSections.addValue(".mover"); @@ -4849,9 +5023,14 @@ void RewriteInstance::patchELFSectionHeaderTable(ELFObjectFile *File) { ELFEhdrTy NewEhdr = Obj.getHeader(); if (BC->HasRelocations) { - if (RuntimeLibrary *RtLibrary = BC->getRuntimeLibrary()) + RuntimeLibrary *RtLibrary = BC->getRuntimeLibrary(); + if (RtLibrary && opts::RuntimeLibInitHook == opts::RLIH_ENTRY_POINT) { NewEhdr.e_entry = RtLibrary->getRuntimeStartAddress(); - else + BC->outs() + << "BOLT-INFO: runtime library initialization was hooked via ELF " + "Header Entry Point, set to 0x" + << Twine::utohexstr(NewEhdr.e_entry) << "\n"; + } else NewEhdr.e_entry = getNewFunctionAddress(NewEhdr.e_entry); assert((NewEhdr.e_entry || !Obj.getHeader().e_entry) && "cannot find new address for entry point"); @@ -5692,14 +5871,23 @@ void RewriteInstance::patchELFDynamic(ELFObjectFile *File) { } RuntimeLibrary *RtLibrary = BC->getRuntimeLibrary(); if (RtLibrary && Dyn.getTag() == ELF::DT_FINI) { - if (uint64_t Addr = RtLibrary->getRuntimeFiniAddress()) + if (uint64_t Addr = RtLibrary->getRuntimeFiniAddress()) { NewDE.d_un.d_ptr = Addr; + BC->outs() + << "BOLT-INFO: runtime library finalization was hooked via " + "DT_FINI, set to 0x" + << Twine::utohexstr(Addr) << "\n"; + } } - if (RtLibrary && Dyn.getTag() == ELF::DT_INIT && !BC->HasInterpHeader) { + if (RtLibrary && Dyn.getTag() == ELF::DT_INIT && + (!BC->HasInterpHeader || + opts::RuntimeLibInitHook == opts::RLIH_INIT)) { if (auto Addr = RtLibrary->getRuntimeStartAddress()) { - LLVM_DEBUG(dbgs() << "BOLT-DEBUG: Set DT_INIT to 0x" - << Twine::utohexstr(Addr) << '\n'); NewDE.d_un.d_ptr = Addr; + BC->outs() + << "BOLT-INFO: runtime library initialization was hooked via " + "DT_INIT, set to 0x" + << Twine::utohexstr(Addr) << "\n"; } } break; @@ -5767,10 +5955,13 @@ Error RewriteInstance::readELFDynamic(ELFObjectFile *File) { for (const Elf_Dyn &Dyn : DynamicEntries) { switch (Dyn.d_tag) { case ELF::DT_INIT: - 
if (!BC->HasInterpHeader) { - LLVM_DEBUG(dbgs() << "BOLT-DEBUG: Set start function address\n"); - BC->StartFunctionAddress = Dyn.getPtr(); - } + BC->InitAddress = Dyn.getPtr(); + break; + case ELF::DT_INIT_ARRAY: + BC->InitArrayAddress = Dyn.getPtr(); + break; + case ELF::DT_INIT_ARRAYSZ: + BC->InitArraySize = Dyn.getPtr(); break; case ELF::DT_FINI: BC->FiniAddress = Dyn.getPtr(); diff --git a/bolt/test/AArch64/hook-fini.s b/bolt/test/AArch64/hook-fini.s index 4f321d463ef32..3bb95f9317b1b 100644 --- a/bolt/test/AArch64/hook-fini.s +++ b/bolt/test/AArch64/hook-fini.s @@ -15,13 +15,13 @@ # RUN: %clang %cflags -pie %s -Wl,-q -o %t.exe # RUN: llvm-readelf -d %t.exe | FileCheck --check-prefix=DYN-FINI %s # RUN: llvm-readelf -r %t.exe | FileCheck --check-prefix=RELOC-PIE %s -# RUN: llvm-bolt %t.exe -o %t --instrument +# RUN: llvm-bolt %t.exe -o %t --instrument | FileCheck --check-prefix=CHECK-BOLT-RT-FINI %s # RUN: llvm-readelf -drs %t | FileCheck --check-prefix=CHECK-FINI %s # RUN: %clang %cflags -pie %s -Wl,-q,-fini=0 -o %t-no-fini.exe # RUN: llvm-readelf -d %t-no-fini.exe | FileCheck --check-prefix=DYN-NO-FINI %s # RUN: llvm-readelf -r %t-no-fini.exe | FileCheck --check-prefix=RELOC-PIE %s -# RUN: llvm-bolt %t-no-fini.exe -o %t-no-fini --instrument +# RUN: llvm-bolt %t-no-fini.exe -o %t-no-fini --instrument | FileCheck --check-prefix=CHECK-BOLT-RT-FINI-ARRAY %s # RUN: llvm-readelf -drs %t-no-fini | FileCheck --check-prefix=CHECK-NO-FINI %s # RUN: llvm-readelf -ds -x .fini_array %t-no-fini | FileCheck --check-prefix=CHECK-NO-FINI-RELOC %s @@ -29,7 +29,7 @@ # RUN: %clang %cflags %p/../Inputs/stub.c -fPIC -shared -o %t-stub.so # RUN: %clang %cflags %s -no-pie -Wl,-q,-fini=0 %t-stub.so -o %t-no-pie-no-fini.exe # RUN: llvm-readelf -r %t-no-pie-no-fini.exe | FileCheck --check-prefix=RELOC-NO-PIE %s -# RUN: llvm-bolt %t-no-pie-no-fini.exe -o %t-no-pie-no-fini --instrument +# RUN: llvm-bolt %t-no-pie-no-fini.exe -o %t-no-pie-no-fini --instrument | FileCheck 
--check-prefix=CHECK-BOLT-RT-FINI-ARRAY %s # RUN: llvm-readelf -ds -x .fini_array %t-no-pie-no-fini | FileCheck --check-prefix=CHECK-NO-PIE-NO-FINI %s ## With fini: dynamic section should contain DT_FINI @@ -46,6 +46,14 @@ ## Without PIE: binary should not have relative relocations # RELOC-NO-PIE-NOT: R_AARCH64_RELATIVE +## Check BOLT output output finalization hook (DT_FINI) +# CHECK-BOLT-RT-FINI: runtime library finalization was hooked via DT_FINI +# CHECK-BOLT-RT-FINI-NOT: runtime library finalization was hooked via .fini_array entry + +## Check BOLT output output finalization hook (.fini_array entry) +# CHECK-BOLT-RT-FINI-ARRAY-NOT: runtime library finalization was hooked via DT_FINI +# CHECK-BOLT-RT-FINI-ARRAY: runtime library finalization was hooked via .fini_array entry + ## Check that DT_FINI is set to __bolt_runtime_fini # CHECK-FINI: Dynamic section at offset {{.*}} contains {{.*}} entries: # CHECK-FINI-DAG: (FINI) 0x[[FINI:[[:xdigit:]]+]] diff --git a/bolt/test/AArch64/hook-init.s b/bolt/test/AArch64/hook-init.s new file mode 100644 index 0000000000000..a48328b630fa0 --- /dev/null +++ b/bolt/test/AArch64/hook-init.s @@ -0,0 +1,221 @@ +## Test the different ways of hooking the init function for instrumentation (via +## entry point, DT_INIT and via DT_INIT_ARRAY). We test the latter for both PIE +## and non-PIE binaries because of the different ways of handling relocations +## (static or dynamic), executable and shared library. 
+## All tests perform the following steps: +## - Compile and link for the case to be tested +## - Some sanity-checks on the dynamic section and relocations in the binary to +## verify it has the shape we want for testing: +## - INTERP in Program Headers +## - DT_INIT or DT_INIT_ARRAY in dynamic section +## - No relative relocations for non-PIE +## - Instrument (with extra --runtime-lib-init-hook=init/init_array options +## in some cases) +## - Verify generated binary +# REQUIRES: system-linux,bolt-runtime,target=aarch64{{.*}} + +# RUN: %clang %cflags -pie %s -Wl,-q -o %t.exe +# RUN: llvm-readelf -d %t.exe | FileCheck --check-prefix=DYN-INIT %s +# RUN: llvm-readelf -l %t.exe | FileCheck --check-prefix=PH-INTERP %s +# RUN: llvm-readelf -r %t.exe | FileCheck --check-prefix=RELOC-PIE %s +# RUN: llvm-bolt %t.exe -o %t --instrument | FileCheck --check-prefix=CHECK-BOLT-RT-EP %s +# RUN: llvm-readelf -hdrs %t | FileCheck --check-prefix=CHECK-INIT-EP %s +# RUN: llvm-bolt %t.exe -o %t-no-ep --instrument --runtime-lib-init-hook=init | FileCheck --check-prefix=CHECK-BOLT-RT-INIT %s +# RUN: llvm-readelf -hdrs %t-no-ep | FileCheck --check-prefix=CHECK-INIT-NO-EP %s +# RUN: llvm-bolt %t.exe -o %t-no-ep --instrument --runtime-lib-init-hook=init_array | FileCheck --check-prefix=CHECK-BOLT-RT-INIT-ARRAY %s +# RUN: llvm-readelf -hdrs %t-no-ep | FileCheck --check-prefix=CHECK-INIT-ARRAY-NO-EP %s + +# RUN: %clang -shared %cflags -pie %s -Wl,-q -o %t-shared.exe +# RUN: llvm-readelf -d %t-shared.exe | FileCheck --check-prefix=DYN-INIT %s +# RUN: llvm-readelf -l %t-shared.exe | FileCheck --check-prefix=PH-INTERP-SHARED %s +# RUN: llvm-readelf -r %t-shared.exe | FileCheck --check-prefix=RELOC-SHARED-PIE %s +# RUN: llvm-bolt %t-shared.exe -o %t-shared --instrument | FileCheck --check-prefix=CHECK-BOLT-RT-INIT %s +# RUN: llvm-readelf -hdrs %t-shared | FileCheck --check-prefix=CHECK-SHARED-INIT %s + +# RUN: %clang %cflags -pie %s -Wl,-q,-init=0 -o %t-no-init.exe +# RUN: llvm-readelf -d 
%t-no-init.exe | FileCheck --check-prefix=DYN-NO-INIT %s +# RUN: llvm-readelf -l %t-no-init.exe | FileCheck --check-prefix=PH-INTERP %s +# RUN: llvm-readelf -r %t-no-init.exe | FileCheck --check-prefix=RELOC-PIE %s +# RUN: llvm-bolt %t-no-init.exe -o %t-no-init --instrument | FileCheck --check-prefix=CHECK-BOLT-RT-EP %s +# RUN: llvm-readelf -hdrs %t-no-init | FileCheck --check-prefix=CHECK-NO-INIT-EP %s +# RUN: llvm-bolt %t-no-init.exe -o %t-no-init-no-ep --instrument --runtime-lib-init-hook=init | FileCheck --check-prefix=CHECK-BOLT-RT-INIT-ARRAY %s +# RUN: llvm-readelf -hdrs %t-no-init-no-ep | FileCheck --check-prefix=CHECK-NO-INIT-NO-EP %s + +# RUN: %clang -shared %cflags -pie %s -Wl,-q,-init=0 -o %t-shared-no-init.exe +# RUN: llvm-readelf -d %t-shared-no-init.exe | FileCheck --check-prefix=DYN-NO-INIT %s +# RUN: llvm-readelf -l %t-shared-no-init.exe | FileCheck --check-prefix=PH-INTERP-SHARED %s +# RUN: llvm-readelf -r %t-shared-no-init.exe | FileCheck --check-prefix=RELOC-SHARED-PIE %s +# RUN: llvm-bolt %t-shared-no-init.exe -o %t-shared-no-init --instrument | FileCheck --check-prefix=CHECK-BOLT-RT-INIT-ARRAY %s +# RUN: llvm-readelf -drs %t-shared-no-init | FileCheck --check-prefix=CHECK-SHARED-NO-INIT %s + +## Create a dummy shared library to link against to force creation of the dynamic section. 
+# RUN: %clang %cflags %p/../Inputs/stub.c -fPIC -shared -o %t-stub.so +# RUN: %clang %cflags %s -no-pie -Wl,-q,-init=0 %t-stub.so -o %t-no-pie-no-init.exe +# RUN: llvm-readelf -r %t-no-pie-no-init.exe | FileCheck --check-prefix=RELOC-NO-PIE %s +# RUN: llvm-bolt %t-no-pie-no-init.exe -o %t-no-pie-no-init --instrument | FileCheck --check-prefix=CHECK-BOLT-RT-EP %s +# RUN: llvm-readelf -hds %t-no-pie-no-init | FileCheck --check-prefix=CHECK-NO-PIE-NO-INIT-EP %s + +## With init: dynamic section should contain DT_INIT +# DYN-INIT: (INIT) + +## Without init: dynamic section should only contain DT_INIT_ARRAY +# DYN-NO-INIT-NOT: (INIT) +# DYN-NO-INIT: (INIT_ARRAY) +# DYN-NO-INIT: (INIT_ARRAYSZ) + +## With interp program header (executable) +# PH-INTERP: Program Headers: +# PH-INTERP: INTERP + +## Without interp program header (shared library) +# PH-INTERP-SHARED: Program Headers: +# PH-INTERP-SHARED-NOT: INTERP + +## With PIE: binary should have relative relocations +# RELOC-PIE: R_AARCH64_RELATIVE + +## With PIE: binary should have relative relocations +# RELOC-SHARED-PIE: R_AARCH64_ABS64 + +## Without PIE: binary should not have relative relocations +# RELOC-NO-PIE-NOT: R_AARCH64_RELATIVE + +## Check BOLT output output initialization hook (ELF Header Entry Point) +# CHECK-BOLT-RT-EP: runtime library initialization was hooked via ELF Header Entry Point +# CHECK-BOLT-RT-EP-NOT: runtime library initialization was hooked via DT_INIT +# CHECK-BOLT-RT-EP-NOT: runtime library initialization was hooked via .init_array entry + +## Check BOLT output output initialization hook (DT_INIT) +# CHECK-BOLT-RT-INIT-NOT: runtime library initialization was hooked via ELF Header Entry Point +# CHECK-BOLT-RT-INIT: runtime library initialization was hooked via DT_INIT +# CHECK-BOLT-RT-INIT-NOT: runtime library initialization was hooked via .init_array entry + +## Check BOLT output output initialization hook (.init_array entry) +# CHECK-BOLT-RT-INIT-ARRAY-NOT: runtime library initialization 
was hooked via ELF Header Entry Point +# CHECK-BOLT-RT-INIT-ARRAY-NOT: runtime library initialization was hooked via DT_INIT +# CHECK-BOLT-RT-INIT-ARRAY: runtime library initialization was hooked via .init_array entry + +## Check that entry point address is set to __bolt_runtime_start for PIE executable with DT_INIT +# CHECK-INIT-EP: ELF Header: +# CHECK-INIT-EP: Entry point address: 0x[[#%x,EP_ADDR:]] +## Check that the dynamic relocation at .init and .init_array were not patched +# CHECK-INIT-EP: Dynamic section at offset {{.*}} contains {{.*}} entries: +# CHECK-INIT-EP-NOT: (INIT) 0x[[#%x, EP_ADDR]] +# CHECK-INIT-EP-NOT: (INIT_ARRAY) 0x[[#%x, EP_ADDR]] +## Check that the new entry point address points to __bolt_runtime_start +# CHECK-INIT-EP: Symbol table '.symtab' contains {{.*}} entries: +# CHECK-INIT-EP: {{0+}}[[#%x, EP_ADDR]] {{.*}} __bolt_runtime_start + +## Check that DT_INIT address is set to __bolt_runtime_start for PIE executable with DT_INIT +# CHECK-INIT-NO-EP: ELF Header: +# CHECK-INIT-NO-EP: Entry point address: 0x[[#%x,EP_ADDR:]] +## Read Dynamic section DT_INIT and DT_INIT_ARRAY entries +# CHECK-INIT-NO-EP: Dynamic section at offset {{.*}} contains {{.*}} entries: +# CHECK-INIT-NO-EP-DAG: (INIT) 0x[[#%x,INIT:]] +# CHECK-INIT-NO-EP-DAG: (INIT_ARRAY) 0x[[#%x,INIT_ARRAY:]] +## Check if ELF entry point address points to _start symbol and new DT_INIT entry points to __bolt_runtime_start +# CHECK-INIT-NO-EP: Symbol table '.symtab' contains {{.*}} entries: +# CHECK-INIT-NO-EP-DAG: {{0+}}[[#%x, EP_ADDR]] {{.*}} _start +# CHECK-INIT-NO-EP-DAG: {{0+}}[[#%x, INIT]] {{.*}} __bolt_runtime_start + +## Check that 1st entry of DT_INIT_ARRAY is set to __bolt_runtime_start and DT_INIT was not changed +# CHECK-INIT-ARRAY-NO-EP: ELF Header: +# CHECK-INIT-ARRAY-NO-EP: Entry point address: 0x[[#%x,EP_ADDR:]] +## Read Dynamic section DT_INIT and DT_INIT_ARRAY entries +# CHECK-INIT-ARRAY-NO-EP: Dynamic section at offset {{.*}} contains {{.*}} entries: +# 
CHECK-INIT-ARRAY-NO-EP-DAG: (INIT) 0x[[#%x,INIT:]] +# CHECK-INIT-ARRAY-NO-EP-DAG: (INIT_ARRAY) 0x[[#%x,INIT_ARRAY:]] +## Read the dynamic relocation from 1st entry of .init_array +# CHECK-INIT-ARRAY-NO-EP: Relocation section '.rela.dyn' at offset {{.*}} contains {{.*}} entries +# CHECK-INIT-ARRAY-NO-EP: {{0+}}[[#%x,INIT_ARRAY]] {{.*}} R_AARCH64_RELATIVE [[#%x,INIT_ADDR:]] +# CHECK-INIT-ARRAY-NO-EP-NOT: {{0+}}[[#%x,INIT_ARRAY]] {{.*}} R_AARCH64_RELATIVE [[#%x,INIT]] +## Check that 1st entry of .init_array points to __bolt_runtime_start +# CHECK-INIT-ARRAY-NO-EP: Symbol table '.symtab' contains {{.*}} entries: +# CHECK-INIT-ARRAY-NO-EP-DAG: {{0+}}[[#%x, EP_ADDR]] {{.*}} _start +# CHECK-INIT-ARRAY-NO-EP-DAG: {{[0-9]]*}}: {{0+}}[[#%x, INIT_ADDR]] {{.*}} __bolt_runtime_start + +## Check that entry point address is set to __bolt_runtime_start for PIE executable without DT_INIT +# CHECK-NO-INIT-EP: ELF Header: +# CHECK-NO-INIT-EP: Entry point address: 0x[[#%x,EP_ADDR:]] +## Check that the dynamic relocation at .init and .init_array were not patched +# CHECK-NO-INIT-EP: Dynamic section at offset {{.*}} contains {{.*}} entries: +# CHECK-NO-INIT-EP-NOT: (INIT) 0x[[#%x, EP_ADDR]] +# CHECK-NO-INIT-EP-NOT: (INIT_ARRAY) 0x[[#%x, EP_ADDR]] +## Check that the new entry point address points to __bolt_runtime_start +# CHECK-NO-INIT-EP: Symbol table '.symtab' contains {{.*}} entries: +# CHECK-NO-INIT-EP: {{0+}}[[#%x, EP_ADDR]] {{.*}} __bolt_runtime_start + +## Check that DT_INIT is set to __bolt_runtime_start for shared library with DT_INIT +# CHECK-SHARED-INIT: Dynamic section at offset {{.*}} contains {{.*}} entries: +# CHECK-SHARED-INIT-DAG: (INIT) 0x[[#%x, INIT:]] +# CHECK-SHARED-INIT-DAG: (INIT_ARRAY) 0x[[#%x, INIT_ARRAY:]] +## Check that the dynamic relocation at .init_array was not patched +# CHECK-SHARED-INIT: Relocation section '.rela.dyn' at offset {{.*}} contains {{.*}} entries +# CHECK-SHARED-INIT-NOT: {{0+}}[[#%x, INIT_ARRAY]] {{.*}} R_AARCH64_ABS64 {{0+}}[[#%x, INIT]] 
+## Check that dynamic section DT_INIT points to __bolt_runtime_start +# CHECK-SHARED-INIT: Symbol table '.symtab' contains {{.*}} entries: +# CHECK-SHARED-INIT: {{0+}}[[#%x, INIT]] {{.*}} __bolt_runtime_start + +## Check that entry point address is set to __bolt_runtime_start for PIE executable without DT_INIT +# CHECK-NO-INIT-NO-EP: ELF Header: +# CHECK-NO-INIT-NO-EP: Entry point address: 0x[[#%x,EP_ADDR:]] +# CHECK-NO-INIT-NO-EP: Dynamic section at offset {{.*}} contains {{.*}} entries: +# CHECK-NO-INIT-NO-EP-NOT: (INIT) +# CHECK-NO-INIT-NO-EP: (INIT_ARRAY) 0x[[#%x,INIT_ARRAY:]] +## Read the dynamic relocation from 1st entry of .init_array +# CHECK-NO-INIT-NO-EP: Relocation section '.rela.dyn' at offset {{.*}} contains {{.*}} entries +# CHECK-NO-INIT-NO-EP: {{0+}}[[#%x,INIT_ARRAY]] {{.*}} R_AARCH64_RELATIVE [[#%x,INIT_ADDR:]] +## Check that 1st entry of .init_array points to __bolt_runtime_start +# CHECK-NO-INIT-NO-EP: Symbol table '.symtab' contains {{.*}} entries: +# CHECK-NO-INIT-NO-EP-DAG: {{0+}}[[#%x, EP_ADDR]] {{.*}} _start +# CHECK-NO-INIT-NO-EP-DAG: {{[0-9]]*}}: {{0+}}[[#%x, INIT_ADDR]] {{.*}} __bolt_runtime_start + +## Check that entry point address is set to __bolt_runtime_start for shared library without DT_INIT +# CHECK-SHARED-NO-INIT: Dynamic section at offset {{.*}} contains {{.*}} entries: +# CHECK-SHARED-NO-INIT-NOT: (INIT) +# CHECK-SHARED-NO-INIT: (INIT_ARRAY) 0x[[#%x,INIT_ARRAY:]] +## Read the dynamic relocation from 1st entry of .init_array +# CHECK-SHARED-NO-INIT: Relocation section '.rela.dyn' at offset {{.*}} contains {{.*}} entries +# CHECK-SHARED-NO-INIT: {{0+}}[[#%x, INIT_ARRAY]] {{.*}} R_AARCH64_ABS64 [[#%x,INIT_ADDR:]] +## Check that 1st entry of .init_array points to __bolt_runtime_start +# CHECK-SHARED-NO-INIT: Symbol table '.symtab' contains {{.*}} entries: +# CHECK-SHARED-NO-INIT: {{[0-9]]*}}: {{0+}}[[#%x, INIT_ADDR]] {{.*}} __bolt_runtime_start + +## Check that entry point address is set to __bolt_runtime_start for non-PIE 
executable without DT_INIT +# CHECK-NO-PIE-NO-INIT-EP: ELF Header: +# CHECK-NO-PIE-NO-INIT-EP: Entry point address: 0x[[#%x,EP_ADDR:]] +## Check that the dynamic relocation at .init and .init_array were not patched +# CHECK-NO-PIE-NO-INIT-EP: Dynamic section at offset {{.*}} contains {{.*}} entries: +# CHECK-NO-PIE-NO-INIT-EP-NOT: (INIT) 0x[[#%x, EP_ADDR]] +# CHECK-NO-PIE-NO-INIT-EP-NOT: (INIT_ARRAY) 0x[[#%x, EP_ADDR]] +## Check that the new entry point address points to __bolt_runtime_start +# CHECK-NO-PIE-NO-INIT-EP: Symbol table '.symtab' contains {{.*}} entries: +# CHECK-NO-PIE-NO-INIT-EP: {{0+}}[[#%x, EP_ADDR]] {{.*}} __bolt_runtime_start + + .globl _start + .type _start, %function +_start: + # Dummy relocation to force relocation mode. + .reloc 0, R_AARCH64_NONE + ret +.size _start, .-_start + + .globl _init + .type _init, %function +_init: + ret + .size _init, .-_init + + .globl _fini + .type _fini, %function +_fini: + ret + .size _fini, .-_fini + + .section .init_array,"aw" + .align 3 + .dword _init + + .section .fini_array,"aw" + .align 3 + .dword _fini diff --git a/bolt/test/X86/hook-init.s b/bolt/test/X86/hook-init.s new file mode 100644 index 0000000000000..3184541f040b9 --- /dev/null +++ b/bolt/test/X86/hook-init.s @@ -0,0 +1,221 @@ +## Test the different ways of hooking the init function for instrumentation (via +## entry point, DT_INIT and via DT_INIT_ARRAY). We test the latter for both PIE +## and non-PIE binaries because of the different ways of handling relocations +## (static or dynamic), executable and shared library.
+## All tests perform the following steps: +## - Compile and link for the case to be tested +## - Some sanity-checks on the dynamic section and relocations in the binary to +## verify it has the shape we want for testing: +## - INTERP in Program Headers +## - DT_INIT or DT_INIT_ARRAY in dynamic section +## - No relative relocations for non-PIE +## - Instrument (with extra --runtime-lib-init-hook=init/init_array options +## in some cases) +## - Verify generated binary +# REQUIRES: system-linux,bolt-runtime,target=x86_64-{{.*}} + +# RUN: %clang %cflags -pie %s -Wl,-q -o %t.exe +# RUN: llvm-readelf -d %t.exe | FileCheck --check-prefix=DYN-INIT %s +# RUN: llvm-readelf -l %t.exe | FileCheck --check-prefix=PH-INTERP %s +# RUN: llvm-readelf -r %t.exe | FileCheck --check-prefix=RELOC-PIE %s +# RUN: llvm-bolt %t.exe -o %t --instrument | FileCheck --check-prefix=CHECK-BOLT-RT-EP %s +# RUN: llvm-readelf -hdrs %t | FileCheck --check-prefix=CHECK-INIT-EP %s +# RUN: llvm-bolt %t.exe -o %t-no-ep --instrument --runtime-lib-init-hook=init | FileCheck --check-prefix=CHECK-BOLT-RT-INIT %s +# RUN: llvm-readelf -hdrs %t-no-ep | FileCheck --check-prefix=CHECK-INIT-NO-EP %s +# RUN: llvm-bolt %t.exe -o %t-no-ep --instrument --runtime-lib-init-hook=init_array | FileCheck --check-prefix=CHECK-BOLT-RT-INIT-ARRAY %s +# RUN: llvm-readelf -hdrs %t-no-ep | FileCheck --check-prefix=CHECK-INIT-ARRAY-NO-EP %s + +# RUN: %clang -shared %cflags -pie %s -Wl,-q -o %t-shared.exe +# RUN: llvm-readelf -d %t-shared.exe | FileCheck --check-prefix=DYN-INIT %s +# RUN: llvm-readelf -l %t-shared.exe | FileCheck --check-prefix=PH-INTERP-SHARED %s +# RUN: llvm-readelf -r %t-shared.exe | FileCheck --check-prefix=RELOC-SHARED-PIE %s +# RUN: llvm-bolt %t-shared.exe -o %t-shared --instrument | FileCheck --check-prefix=CHECK-BOLT-RT-INIT %s +# RUN: llvm-readelf -hdrs %t-shared | FileCheck --check-prefix=CHECK-SHARED-INIT %s + +# RUN: %clang %cflags -pie %s -Wl,-q,-init=0 -o %t-no-init.exe +# RUN: llvm-readelf -d 
%t-no-init.exe | FileCheck --check-prefix=DYN-NO-INIT %s +# RUN: llvm-readelf -l %t-no-init.exe | FileCheck --check-prefix=PH-INTERP %s +# RUN: llvm-readelf -r %t-no-init.exe | FileCheck --check-prefix=RELOC-PIE %s +# RUN: llvm-bolt %t-no-init.exe -o %t-no-init --instrument | FileCheck --check-prefix=CHECK-BOLT-RT-EP %s +# RUN: llvm-readelf -hdrs %t-no-init | FileCheck --check-prefix=CHECK-NO-INIT-EP %s +# RUN: llvm-bolt %t-no-init.exe -o %t-no-init-no-ep --instrument --runtime-lib-init-hook=init | FileCheck --check-prefix=CHECK-BOLT-RT-INIT-ARRAY %s +# RUN: llvm-readelf -hdrs %t-no-init-no-ep | FileCheck --check-prefix=CHECK-NO-INIT-NO-EP %s + +# RUN: %clang -shared %cflags -pie %s -Wl,-q,-init=0 -o %t-shared-no-init.exe +# RUN: llvm-readelf -d %t-shared-no-init.exe | FileCheck --check-prefix=DYN-NO-INIT %s +# RUN: llvm-readelf -l %t-shared-no-init.exe | FileCheck --check-prefix=PH-INTERP-SHARED %s +# RUN: llvm-readelf -r %t-shared-no-init.exe | FileCheck --check-prefix=RELOC-SHARED-PIE %s +# RUN: llvm-bolt %t-shared-no-init.exe -o %t-shared-no-init --instrument | FileCheck --check-prefix=CHECK-BOLT-RT-INIT-ARRAY %s +# RUN: llvm-readelf -drs %t-shared-no-init | FileCheck --check-prefix=CHECK-SHARED-NO-INIT %s + +## Create a dummy shared library to link against to force creation of the dynamic section. 
+# RUN: %clang %cflags %p/../Inputs/stub.c -fPIC -shared -o %t-stub.so +# RUN: %clang %cflags %s -no-pie -Wl,-q,-init=0 %t-stub.so -o %t-no-pie-no-init.exe +# RUN: llvm-readelf -r %t-no-pie-no-init.exe | FileCheck --check-prefix=RELOC-NO-PIE %s +# RUN: llvm-bolt %t-no-pie-no-init.exe -o %t-no-pie-no-init --instrument | FileCheck --check-prefix=CHECK-BOLT-RT-EP %s +# RUN: llvm-readelf -hds %t-no-pie-no-init | FileCheck --check-prefix=CHECK-NO-PIE-NO-INIT-EP %s + +## With init: dynamic section should contain DT_INIT +# DYN-INIT: (INIT) + +## Without init: dynamic section should only contain DT_INIT_ARRAY +# DYN-NO-INIT-NOT: (INIT) +# DYN-NO-INIT: (INIT_ARRAY) +# DYN-NO-INIT: (INIT_ARRAYSZ) + +## With interp program header (executable) +# PH-INTERP: Program Headers: +# PH-INTERP: INTERP + +## Without interp program header (shared library) +# PH-INTERP-SHARED: Program Headers: +# PH-INTERP-SHARED-NOT: INTERP + +## With PIE: binary should have relative relocations +# RELOC-PIE: R_X86_64_RELATIVE + +## With PIE: binary should have relative relocations +# RELOC-SHARED-PIE: R_X86_64_64 + +## Without PIE: binary should not have relative relocations +# RELOC-NO-PIE-NOT: R_X86_64_RELATIVE + +## Check BOLT output output initialization hook (ELF Header Entry Point) +# CHECK-BOLT-RT-EP: runtime library initialization was hooked via ELF Header Entry Point +# CHECK-BOLT-RT-EP-NOT: runtime library initialization was hooked via DT_INIT +# CHECK-BOLT-RT-EP-NOT: runtime library initialization was hooked via .init_array entry + +## Check BOLT output output initialization hook (DT_INIT) +# CHECK-BOLT-RT-INIT-NOT: runtime library initialization was hooked via ELF Header Entry Point +# CHECK-BOLT-RT-INIT: runtime library initialization was hooked via DT_INIT +# CHECK-BOLT-RT-INIT-NOT: runtime library initialization was hooked via .init_array entry + +## Check BOLT output output initialization hook (1st entry of .init_array) +# CHECK-BOLT-RT-INIT-ARRAY-NOT: runtime library initialization 
was hooked via ELF Header Entry Point +# CHECK-BOLT-RT-INIT-ARRAY-NOT: runtime library initialization was hooked via DT_INIT +# CHECK-BOLT-RT-INIT-ARRAY: runtime library initialization was hooked via .init_array entry + +## Check that entry point address is set to __bolt_runtime_start for PIE executable with DT_INIT +# CHECK-INIT-EP: ELF Header: +# CHECK-INIT-EP: Entry point address: 0x[[#%x,EP_ADDR:]] +## Check that the dynamic relocation at .init and .init_array were not patched +# CHECK-INIT-EP: Dynamic section at offset {{.*}} contains {{.*}} entries: +# CHECK-INIT-EP-NOT: (INIT) 0x[[#%x, EP_ADDR]] +# CHECK-INIT-EP-NOT: (INIT_ARRAY) 0x[[#%x, EP_ADDR]] +## Check that the new entry point address points to __bolt_runtime_start +# CHECK-INIT-EP: Symbol table '.symtab' contains {{.*}} entries: +# CHECK-INIT-EP: {{0+}}[[#%x, EP_ADDR]] {{.*}} __bolt_runtime_start + +## Check that DT_INIT address is set to __bolt_runtime_start for PIE executable with DT_INIT +# CHECK-INIT-NO-EP: ELF Header: +# CHECK-INIT-NO-EP: Entry point address: 0x[[#%x,EP_ADDR:]] +## Read Dynamic section DT_INIT and DT_INIT_ARRAY entries +# CHECK-INIT-NO-EP: Dynamic section at offset {{.*}} contains {{.*}} entries: +# CHECK-INIT-NO-EP-DAG: (INIT) 0x[[#%x,INIT:]] +# CHECK-INIT-NO-EP-DAG: (INIT_ARRAY) 0x[[#%x,INIT_ARRAY:]] +## Check if ELF entry point address points to _start symbol and new DT_INIT entry points to __bolt_runtime_start +# CHECK-INIT-NO-EP: Symbol table '.symtab' contains {{.*}} entries: +# CHECK-INIT-NO-EP-DAG: {{0+}}[[#%x, EP_ADDR]] {{.*}} _start +# CHECK-INIT-NO-EP-DAG: {{0+}}[[#%x, INIT]] {{.*}} __bolt_runtime_start + +## Check that 1st entry of DT_INIT_ARRAY is set to __bolt_runtime_start and DT_INIT was not changed +# CHECK-INIT-ARRAY-NO-EP: ELF Header: +# CHECK-INIT-ARRAY-NO-EP: Entry point address: 0x[[#%x,EP_ADDR:]] +## Read Dynamic section DT_INIT and DT_INIT_ARRAY entries +# CHECK-INIT-ARRAY-NO-EP: Dynamic section at offset {{.*}} contains {{.*}} entries: +# 
CHECK-INIT-ARRAY-NO-EP-DAG: (INIT) 0x[[#%x,INIT:]] +# CHECK-INIT-ARRAY-NO-EP-DAG: (INIT_ARRAY) 0x[[#%x,INIT_ARRAY:]] +## Read the dynamic relocation from 1st entry of .init_array +# CHECK-INIT-ARRAY-NO-EP: Relocation section '.rela.dyn' at offset {{.*}} contains {{.*}} entries +# CHECK-INIT-ARRAY-NO-EP: {{0+}}[[#%x,INIT_ARRAY]] {{.*}} R_X86_64_RELATIVE [[#%x,INIT_ADDR:]] +# CHECK-INIT-ARRAY-NO-EP-NOT: {{0+}}[[#%x,INIT_ARRAY]] {{.*}} R_X86_64_RELATIVE [[#%x,INIT]] +## Check that 1st entry of .init_array points to __bolt_runtime_start +# CHECK-INIT-ARRAY-NO-EP: Symbol table '.symtab' contains {{.*}} entries: +# CHECK-INIT-ARRAY-NO-EP-DAG: {{0+}}[[#%x, EP_ADDR]] {{.*}} _start +# CHECK-INIT-ARRAY-NO-EP-DAG: {{[0-9]*}}: {{0+}}[[#%x, INIT_ADDR]] {{.*}} __bolt_runtime_start + +## Check that entry point address is set to __bolt_runtime_start for PIE executable without DT_INIT +# CHECK-NO-INIT-EP: ELF Header: +# CHECK-NO-INIT-EP: Entry point address: 0x[[#%x,EP_ADDR:]] +## Check that the dynamic relocation at .init and .init_array were not patched +# CHECK-NO-INIT-EP: Dynamic section at offset {{.*}} contains {{.*}} entries: +# CHECK-NO-INIT-EP-NOT: (INIT) 0x[[#%x, EP_ADDR]] +# CHECK-NO-INIT-EP-NOT: (INIT_ARRAY) 0x[[#%x, EP_ADDR]] +## Check that the new entry point address points to __bolt_runtime_start +# CHECK-NO-INIT-EP: Symbol table '.symtab' contains {{.*}} entries: +# CHECK-NO-INIT-EP: {{0+}}[[#%x, EP_ADDR]] {{.*}} __bolt_runtime_start + +## Check that DT_INIT is set to __bolt_runtime_start for shared library with DT_INIT +# CHECK-SHARED-INIT: Dynamic section at offset {{.*}} contains {{.*}} entries: +# CHECK-SHARED-INIT-DAG: (INIT) 0x[[#%x, INIT:]] +# CHECK-SHARED-INIT-DAG: (INIT_ARRAY) 0x[[#%x, INIT_ARRAY:]] +## Check that the dynamic relocation at .init_array was not patched +# CHECK-SHARED-INIT: Relocation section '.rela.dyn' at offset {{.*}} contains {{.*}} entries +# CHECK-SHARED-INIT-NOT: {{0+}}[[#%x, INIT_ARRAY]] {{.*}} R_X86_64_64 {{0+}}[[#%x, INIT]] +##
Check that dynamic section DT_INIT points to __bolt_runtime_start +# CHECK-SHARED-INIT: Symbol table '.symtab' contains {{.*}} entries: +# CHECK-SHARED-INIT: {{0+}}[[#%x, INIT]] {{.*}} __bolt_runtime_start + +## Check that 1st entry of .init_array is set to __bolt_runtime_start for PIE executable without DT_INIT +# CHECK-NO-INIT-NO-EP: ELF Header: +# CHECK-NO-INIT-NO-EP: Entry point address: 0x[[#%x,EP_ADDR:]] +# CHECK-NO-INIT-NO-EP: Dynamic section at offset {{.*}} contains {{.*}} entries: +# CHECK-NO-INIT-NO-EP-NOT: (INIT) +# CHECK-NO-INIT-NO-EP: (INIT_ARRAY) 0x[[#%x,INIT_ARRAY:]] +## Read the dynamic relocation from 1st entry of .init_array +# CHECK-NO-INIT-NO-EP: Relocation section '.rela.dyn' at offset {{.*}} contains {{.*}} entries +# CHECK-NO-INIT-NO-EP: {{0+}}[[#%x,INIT_ARRAY]] {{.*}} R_X86_64_RELATIVE [[#%x,INIT_ADDR:]] +## Check that 1st entry of .init_array points to __bolt_runtime_start +# CHECK-NO-INIT-NO-EP: Symbol table '.symtab' contains {{.*}} entries: +# CHECK-NO-INIT-NO-EP-DAG: {{0+}}[[#%x, EP_ADDR]] {{.*}} _start +# CHECK-NO-INIT-NO-EP-DAG: {{[0-9]*}}: {{0+}}[[#%x, INIT_ADDR]] {{.*}} __bolt_runtime_start + +## Check that 1st entry of .init_array is set to __bolt_runtime_start for shared library without DT_INIT +# CHECK-SHARED-NO-INIT: Dynamic section at offset {{.*}} contains {{.*}} entries: +# CHECK-SHARED-NO-INIT-NOT: (INIT) +# CHECK-SHARED-NO-INIT: (INIT_ARRAY) 0x[[#%x,INIT_ARRAY:]] +## Read the dynamic relocation from 1st entry of .init_array +# CHECK-SHARED-NO-INIT: Relocation section '.rela.dyn' at offset {{.*}} contains {{.*}} entries +# CHECK-SHARED-NO-INIT: {{0+}}[[#%x, INIT_ARRAY]] {{.*}} R_X86_64_64 [[#%x,INIT_ADDR:]] +## Check that 1st entry of .init_array points to __bolt_runtime_start +# CHECK-SHARED-NO-INIT: Symbol table '.symtab' contains {{.*}} entries: +# CHECK-SHARED-NO-INIT: {{[0-9]*}}: {{0+}}[[#%x, INIT_ADDR]] {{.*}} __bolt_runtime_start + +## Check that entry point address is set to __bolt_runtime_start for non-PIE executable without
DT_INIT +# CHECK-NO-PIE-NO-INIT-EP: ELF Header: +# CHECK-NO-PIE-NO-INIT-EP: Entry point address: 0x[[#%x,EP_ADDR:]] +## Check that the dynamic relocation at .init and .init_array were not patched +# CHECK-NO-PIE-NO-INIT-EP: Dynamic section at offset {{.*}} contains {{.*}} entries: +# CHECK-NO-PIE-NO-INIT-EP-NOT: (INIT) 0x[[#%x, EP_ADDR]] +# CHECK-NO-PIE-NO-INIT-EP-NOT: (INIT_ARRAY) 0x[[#%x, EP_ADDR]] +## Check that the new entry point address points to __bolt_runtime_start +# CHECK-NO-PIE-NO-INIT-EP: Symbol table '.symtab' contains {{.*}} entries: +# CHECK-NO-PIE-NO-INIT-EP: {{0+}}[[#%x, EP_ADDR]] {{.*}} __bolt_runtime_start + + .globl _start + .type _start, %function +_start: + # Dummy relocation to force relocation mode. + .reloc 0, R_X86_64_NONE + retq +.size _start, .-_start + + .globl _init + .type _init, %function +_init: + retq + .size _init, .-_init + + .globl _fini + .type _fini, %function +_fini: + retq + .size _fini, .-_fini + + .section .init_array,"aw" + .align 8 + .quad _init + + .section .fini_array,"aw" + .align 8 + .quad _fini diff --git a/bolt/test/X86/internal-call-instrument-so.s b/bolt/test/X86/internal-call-instrument-so.s index 99e5b29221409..fe23bc61afa32 100644 --- a/bolt/test/X86/internal-call-instrument-so.s +++ b/bolt/test/X86/internal-call-instrument-so.s @@ -5,7 +5,7 @@ # RUN: llvm-mc -filetype=obj -triple x86_64-unknown-unknown %s -o %t.o # Delete our BB symbols so BOLT doesn't mark them as entry points # RUN: llvm-strip --strip-unneeded %t.o -# RUN: ld.lld %t.o -o %t.exe -q -shared -fini=_fini +# RUN: ld.lld %t.o -o %t.exe -q -shared -fini=_fini -init=_init # RUN: llvm-bolt --instrument %t.exe --relocs -o %t.out .text @@ -48,6 +48,13 @@ _fini: hlt .size _fini, .-_fini + .globl _init + .type _init, %function + .p2align 4 +_init: + retq + .size _init, .-_init + .data .globl var var: diff --git a/bolt/test/runtime/X86/instrument-wrong-target.s b/bolt/test/runtime/X86/instrument-wrong-target.s index 343d93a89ed13..fa40d43f10a0f 100644 
--- a/bolt/test/runtime/X86/instrument-wrong-target.s +++ b/bolt/test/runtime/X86/instrument-wrong-target.s @@ -19,6 +19,13 @@ _start: ret .size _start, .-_start + .globl _init + .type _init, %function + # Force DT_INIT to be created (needed for instrumentation). +_init: + ret + .size _init, .-_init + .globl _fini .type _fini, %function # Force DT_FINI to be created (needed for instrumentation). From 4d7abe535512e1076ff7e5fea14afde29615a8ed Mon Sep 17 00:00:00 2001 From: Matthias Springer Date: Mon, 1 Dec 2025 16:12:11 +0800 Subject: [PATCH 07/39] [mlir][arith] Add support for `cmpf` to `ArithToAPFloat` (#169753) Add support for `arith.cmpf`. --- .../ArithToAPFloat/ArithToAPFloat.cpp | 152 +++++++++++++++++- mlir/lib/ExecutionEngine/APFloatWrappers.cpp | 11 ++ .../ArithToApfloat/arith-to-apfloat.mlir | 15 ++ .../Arith/CPU/test-apfloat-emulation.mlir | 4 + 4 files changed, 177 insertions(+), 5 deletions(-) diff --git a/mlir/lib/Conversion/ArithToAPFloat/ArithToAPFloat.cpp b/mlir/lib/Conversion/ArithToAPFloat/ArithToAPFloat.cpp index 81fbdb1611deb..566632bd8707f 100644 --- a/mlir/lib/Conversion/ArithToAPFloat/ArithToAPFloat.cpp +++ b/mlir/lib/Conversion/ArithToAPFloat/ArithToAPFloat.cpp @@ -41,15 +41,17 @@ static FuncOp createFnDecl(OpBuilder &b, SymbolOpInterface symTable, } /// Helper function to look up or create the symbol for a runtime library -/// function with the given parameter types. Always returns an int64_t. +/// function with the given parameter types. Returns an int64_t, unless a +/// different result type is specified. 
static FailureOr lookupOrCreateApFloatFn(OpBuilder &b, SymbolOpInterface symTable, StringRef name, TypeRange paramTypes, - SymbolTableCollection *symbolTables = nullptr) { - auto i64Type = IntegerType::get(symTable->getContext(), 64); - + SymbolTableCollection *symbolTables = nullptr, + Type resultType = {}) { + if (!resultType) + resultType = IntegerType::get(symTable->getContext(), 64); std::string funcName = (llvm::Twine("_mlir_apfloat_") + name).str(); - auto funcT = FunctionType::get(b.getContext(), paramTypes, {i64Type}); + auto funcT = FunctionType::get(b.getContext(), paramTypes, {resultType}); FailureOr func = lookupFnDecl(symTable, funcName, funcT, symbolTables); // Failed due to type mismatch. @@ -308,6 +310,145 @@ struct IntToFpConversion final : OpRewritePattern { bool isUnsigned; }; +struct CmpFOpToAPFloatConversion final : OpRewritePattern { + CmpFOpToAPFloatConversion(MLIRContext *context, SymbolOpInterface symTable, + PatternBenefit benefit = 1) + : OpRewritePattern(context, benefit), symTable(symTable) {} + + LogicalResult matchAndRewrite(arith::CmpFOp op, + PatternRewriter &rewriter) const override { + // Get APFloat function from runtime library. + auto i1Type = IntegerType::get(symTable->getContext(), 1); + auto i8Type = IntegerType::get(symTable->getContext(), 8); + auto i32Type = IntegerType::get(symTable->getContext(), 32); + auto i64Type = IntegerType::get(symTable->getContext(), 64); + FailureOr fn = + lookupOrCreateApFloatFn(rewriter, symTable, "compare", + {i32Type, i64Type, i64Type}, nullptr, i8Type); + if (failed(fn)) + return fn; + + // Cast operands to 64-bit integers. 
+ rewriter.setInsertionPoint(op); + Location loc = op.getLoc(); + auto floatTy = cast(op.getLhs().getType()); + auto intWType = rewriter.getIntegerType(floatTy.getWidth()); + Value lhsBits = arith::ExtUIOp::create( + rewriter, loc, i64Type, + arith::BitcastOp::create(rewriter, loc, intWType, op.getLhs())); + Value rhsBits = arith::ExtUIOp::create( + rewriter, loc, i64Type, + arith::BitcastOp::create(rewriter, loc, intWType, op.getRhs())); + + // Call APFloat function. + Value semValue = getSemanticsValue(rewriter, loc, floatTy); + SmallVector params = {semValue, lhsBits, rhsBits}; + Value comparisonResult = + func::CallOp::create(rewriter, loc, TypeRange(i8Type), + SymbolRefAttr::get(*fn), params) + ->getResult(0); + + // Generate an i1 SSA value that is "true" if the comparison result matches + // the given `val`. + auto checkResult = [&](llvm::APFloat::cmpResult val) { + return arith::CmpIOp::create( + rewriter, loc, arith::CmpIPredicate::eq, comparisonResult, + arith::ConstantOp::create( + rewriter, loc, i8Type, + rewriter.getIntegerAttr(i8Type, static_cast(val))) + .getResult()); + }; + // Generate an i1 SSA value that is "true" if the comparison result matches + // any of the given `vals`. + std::function)> checkResults = + [&](ArrayRef vals) { + Value first = checkResult(vals.front()); + if (vals.size() == 1) + return first; + Value rest = checkResults(vals.drop_front()); + return arith::OrIOp::create(rewriter, loc, first, rest).getResult(); + }; + + // This switch-case statement was taken from arith::applyCmpPredicate. 
+ Value result; + switch (op.getPredicate()) { + case arith::CmpFPredicate::AlwaysFalse: + result = arith::ConstantOp::create(rewriter, loc, i1Type, + rewriter.getIntegerAttr(i1Type, 0)) + .getResult(); + break; + case arith::CmpFPredicate::OEQ: + result = checkResult(llvm::APFloat::cmpEqual); + break; + case arith::CmpFPredicate::OGT: + result = checkResult(llvm::APFloat::cmpGreaterThan); + break; + case arith::CmpFPredicate::OGE: + result = checkResults( + {llvm::APFloat::cmpGreaterThan, llvm::APFloat::cmpEqual}); + break; + case arith::CmpFPredicate::OLT: + result = checkResult(llvm::APFloat::cmpLessThan); + break; + case arith::CmpFPredicate::OLE: + result = + checkResults({llvm::APFloat::cmpLessThan, llvm::APFloat::cmpEqual}); + break; + case arith::CmpFPredicate::ONE: + // Not cmpUnordered and not cmpUnordered. + result = checkResults( + {llvm::APFloat::cmpLessThan, llvm::APFloat::cmpGreaterThan}); + break; + case arith::CmpFPredicate::ORD: + // Not cmpUnordered. + result = checkResults({llvm::APFloat::cmpLessThan, + llvm::APFloat::cmpGreaterThan, + llvm::APFloat::cmpEqual}); + break; + case arith::CmpFPredicate::UEQ: + result = + checkResults({llvm::APFloat::cmpUnordered, llvm::APFloat::cmpEqual}); + break; + case arith::CmpFPredicate::UGT: + result = checkResults( + {llvm::APFloat::cmpUnordered, llvm::APFloat::cmpGreaterThan}); + break; + case arith::CmpFPredicate::UGE: + result = checkResults({llvm::APFloat::cmpUnordered, + llvm::APFloat::cmpGreaterThan, + llvm::APFloat::cmpEqual}); + break; + case arith::CmpFPredicate::ULT: + result = checkResults( + {llvm::APFloat::cmpUnordered, llvm::APFloat::cmpLessThan}); + break; + case arith::CmpFPredicate::ULE: + result = + checkResults({llvm::APFloat::cmpUnordered, llvm::APFloat::cmpLessThan, + llvm::APFloat::cmpEqual}); + break; + case arith::CmpFPredicate::UNE: + // Not cmpEqual. 
+ result = checkResults({llvm::APFloat::cmpLessThan, + llvm::APFloat::cmpGreaterThan, + llvm::APFloat::cmpUnordered}); + break; + case arith::CmpFPredicate::UNO: + result = checkResult(llvm::APFloat::cmpUnordered); + break; + case arith::CmpFPredicate::AlwaysTrue: + result = arith::ConstantOp::create(rewriter, loc, i1Type, + rewriter.getIntegerAttr(i1Type, 1)) + .getResult(); + break; + } + rewriter.replaceOp(op, result); + return success(); + } + + SymbolOpInterface symTable; +}; + namespace { struct ArithToAPFloatConversionPass final : impl::ArithToAPFloatConversionPassBase { @@ -340,6 +481,7 @@ void ArithToAPFloatConversionPass::runOnOperation() { /*isUnsigned=*/false); patterns.add>(context, getOperation(), /*isUnsigned=*/true); + patterns.add(context, getOperation()); LogicalResult result = success(); ScopedDiagnosticHandler scopedHandler(context, [&result](Diagnostic &diag) { if (diag.getSeverity() == DiagnosticSeverity::Error) { diff --git a/mlir/lib/ExecutionEngine/APFloatWrappers.cpp b/mlir/lib/ExecutionEngine/APFloatWrappers.cpp index 44980ccd77491..77f7137264888 100644 --- a/mlir/lib/ExecutionEngine/APFloatWrappers.cpp +++ b/mlir/lib/ExecutionEngine/APFloatWrappers.cpp @@ -131,4 +131,15 @@ MLIR_APFLOAT_WRAPPERS_EXPORT uint64_t _mlir_apfloat_convert_from_int( llvm::RoundingMode::NearestTiesToEven); return result.bitcastToAPInt().getZExtValue(); } + +MLIR_APFLOAT_WRAPPERS_EXPORT int8_t _mlir_apfloat_compare(int32_t semantics, + uint64_t a, + uint64_t b) { + const llvm::fltSemantics &sem = llvm::APFloatBase::EnumToSemantics( + static_cast(semantics)); + unsigned bitWidth = llvm::APFloatBase::semanticsSizeInBits(sem); + llvm::APFloat x(sem, llvm::APInt(bitWidth, a)); + llvm::APFloat y(sem, llvm::APInt(bitWidth, b)); + return static_cast(x.compare(y)); +} } diff --git a/mlir/test/Conversion/ArithToApfloat/arith-to-apfloat.mlir b/mlir/test/Conversion/ArithToApfloat/arith-to-apfloat.mlir index d71d81dddcd4f..78ce3640ecc67 100644 --- 
a/mlir/test/Conversion/ArithToApfloat/arith-to-apfloat.mlir +++ b/mlir/test/Conversion/ArithToApfloat/arith-to-apfloat.mlir @@ -198,3 +198,18 @@ func.func @uitofp(%arg0: i32) { %0 = arith.uitofp %arg0 : i32 to f4E2M1FN return } + +// ----- + +// CHECK: func.func private @_mlir_apfloat_compare(i32, i64, i64) -> i8 +// CHECK: %[[sem:.*]] = arith.constant 18 : i32 +// CHECK: %[[cmp:.*]] = call @_mlir_apfloat_compare(%[[sem]], %{{.*}}, %{{.*}}) : (i32, i64, i64) -> i8 +// CHECK: %[[c3:.*]] = arith.constant 3 : i8 +// CHECK: %[[is_unordered:.*]] = arith.cmpi eq, %[[cmp]], %[[c3]] : i8 +// CHECK: %[[c0:.*]] = arith.constant 0 : i8 +// CHECK: %[[is_lt:.*]] = arith.cmpi eq, %[[cmp]], %[[c0]] : i8 +// CHECK: arith.ori %[[is_unordered]], %[[is_lt]] : i1 +func.func @cmpf(%arg0: f4E2M1FN, %arg1: f4E2M1FN) { + %0 = arith.cmpf "ult", %arg0, %arg1 : f4E2M1FN + return +} diff --git a/mlir/test/Integration/Dialect/Arith/CPU/test-apfloat-emulation.mlir b/mlir/test/Integration/Dialect/Arith/CPU/test-apfloat-emulation.mlir index 8046610d479a8..433d058d025cf 100644 --- a/mlir/test/Integration/Dialect/Arith/CPU/test-apfloat-emulation.mlir +++ b/mlir/test/Integration/Dialect/Arith/CPU/test-apfloat-emulation.mlir @@ -43,6 +43,10 @@ func.func @entry() { %cvt = arith.truncf %b2 : f32 to f8E4M3FN vector.print %cvt : f8E4M3FN + // CHECK-NEXT: 1 + %cmp1 = arith.cmpf "olt", %cvt, %c1 : f8E4M3FN + vector.print %cmp1 : i1 + // CHECK-NEXT: 1 // Bit pattern: 01, interpreted as signed integer: 1 %cvt_int_signed = arith.fptosi %cvt : f8E4M3FN to i2 From 17677ad7eb2b2391d61c976887bbd2616e7d6c3e Mon Sep 17 00:00:00 2001 From: David Sherwood Date: Mon, 1 Dec 2025 08:12:41 +0000 Subject: [PATCH 08/39] [LV] Don't create WidePtrAdd recipes for scalar VFs (#169344) While attempting to remove the use of undef from more loop vectoriser tests I discovered a bug where this assert was firing: ``` llvm::Constant* llvm::Constant::getSplatValue(bool) const: Assertion `this->getType()->isVectorTy() && "Only valid 
for vectors!"' failed. ... #8 0x0000aaaab9e2fba4 llvm::Constant::getSplatValue #9 0x0000aaaab9dfb844 llvm::ConstantFoldBinaryInstruction ``` This seems to be happening because we are incorrectly generating WidePtrAdd recipes for scalar VFs. The PR fixes this by checking whether a plan has a scalar VF only in legalizeAndOptimizeInductions. This PR also removes the use of undef from the test `both` in Transforms/LoopVectorize/iv_outside_user.ll, which is what started triggering the assert. Fixes #169334 --- .../Transforms/Vectorize/VPlanTransforms.cpp | 3 +- .../LoopVectorize/iv_outside_user.ll | 139 ++++++++++++------ 2 files changed, 95 insertions(+), 47 deletions(-) diff --git a/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp b/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp index b12f8ccc73c7e..f7281283bae81 100644 --- a/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp +++ b/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp @@ -769,7 +769,8 @@ static void legalizeAndOptimizeInductions(VPlan &Plan) { // Replace wide pointer inductions which have only their scalars used by // PtrAdd(IndStart, ScalarIVSteps (0, Step)). 
if (auto *PtrIV = dyn_cast(&Phi)) { - if (!PtrIV->onlyScalarsGenerated(Plan.hasScalableVF())) + if (!Plan.hasScalarVFOnly() && + !PtrIV->onlyScalarsGenerated(Plan.hasScalableVF())) continue; VPValue *PtrAdd = scalarizeVPWidenPointerInduction(PtrIV, Plan, Builder); diff --git a/llvm/test/Transforms/LoopVectorize/iv_outside_user.ll b/llvm/test/Transforms/LoopVectorize/iv_outside_user.ll index b4fd06316a2e5..4f19a7c586bc3 100644 --- a/llvm/test/Transforms/LoopVectorize/iv_outside_user.ll +++ b/llvm/test/Transforms/LoopVectorize/iv_outside_user.ll @@ -152,59 +152,106 @@ for.end: ret ptr %ptr.phi } -define ptr @both(i32 %k) { -; CHECK-LABEL: define ptr @both( -; CHECK-SAME: i32 [[K:%.*]]) { -; CHECK-NEXT: [[ENTRY:.*]]: -; CHECK-NEXT: [[BASE:%.*]] = getelementptr inbounds i32, ptr undef, i64 1 -; CHECK-NEXT: [[TMP0:%.*]] = add i32 [[K]], -1 -; CHECK-NEXT: [[TMP1:%.*]] = zext i32 [[TMP0]] to i64 -; CHECK-NEXT: [[TMP2:%.*]] = add nuw nsw i64 [[TMP1]], 1 -; CHECK-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[TMP2]], 2 -; CHECK-NEXT: br i1 [[MIN_ITERS_CHECK]], label %[[SCALAR_PH:.*]], label %[[VECTOR_PH:.*]] -; CHECK: [[VECTOR_PH]]: -; CHECK-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[TMP2]], 2 -; CHECK-NEXT: [[N_VEC:%.*]] = sub i64 [[TMP2]], [[N_MOD_VF]] -; CHECK-NEXT: [[IND_END:%.*]] = trunc i64 [[N_VEC]] to i32 -; CHECK-NEXT: [[TMP3:%.*]] = mul i64 [[N_VEC]], 4 -; CHECK-NEXT: [[IND_END1:%.*]] = getelementptr i8, ptr [[BASE]], i64 [[TMP3]] -; CHECK-NEXT: [[TMP4:%.*]] = mul i64 [[N_VEC]], 4 -; CHECK-NEXT: [[IND_END2:%.*]] = getelementptr i8, ptr undef, i64 [[TMP4]] -; CHECK-NEXT: br label %[[VECTOR_BODY:.*]] -; CHECK: [[VECTOR_BODY]]: -; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ] -; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 2 -; CHECK-NEXT: [[TMP5:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] -; CHECK-NEXT: br i1 [[TMP5]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], {{!llvm.loop ![0-9]+}} -; CHECK: 
[[MIDDLE_BLOCK]]: -; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[TMP2]], [[N_VEC]] -; CHECK-NEXT: [[IND_ESCAPE:%.*]] = getelementptr i8, ptr [[IND_END1]], i64 -4 -; CHECK-NEXT: br i1 [[CMP_N]], label %[[FOR_END:.*]], label %[[SCALAR_PH]] -; CHECK: [[SCALAR_PH]]: -; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i32 [ [[IND_END]], %[[MIDDLE_BLOCK]] ], [ 0, %[[ENTRY]] ] -; CHECK-NEXT: [[BC_RESUME_VAL1:%.*]] = phi ptr [ [[IND_END1]], %[[MIDDLE_BLOCK]] ], [ [[BASE]], %[[ENTRY]] ] -; CHECK-NEXT: [[BC_RESUME_VAL2:%.*]] = phi ptr [ [[IND_END2]], %[[MIDDLE_BLOCK]] ], [ undef, %[[ENTRY]] ] -; CHECK-NEXT: br label %[[FOR_BODY:.*]] -; CHECK: [[FOR_BODY]]: -; CHECK-NEXT: [[INC_PHI:%.*]] = phi i32 [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ], [ [[INC:%.*]], %[[FOR_BODY]] ] -; CHECK-NEXT: [[INC_LAG1:%.*]] = phi ptr [ [[BC_RESUME_VAL1]], %[[SCALAR_PH]] ], [ [[TMP:%.*]], %[[FOR_BODY]] ] -; CHECK-NEXT: [[INC_LAG2:%.*]] = phi ptr [ [[BC_RESUME_VAL2]], %[[SCALAR_PH]] ], [ [[INC_LAG1]], %[[FOR_BODY]] ] -; CHECK-NEXT: [[TMP]] = getelementptr inbounds i32, ptr [[INC_LAG1]], i64 1 -; CHECK-NEXT: [[INC]] = add nsw i32 [[INC_PHI]], 1 -; CHECK-NEXT: [[CMP:%.*]] = icmp eq i32 [[INC]], [[K]] -; CHECK-NEXT: br i1 [[CMP]], label %[[FOR_END]], label %[[FOR_BODY]], {{!llvm.loop ![0-9]+}} -; CHECK: [[FOR_END]]: -; CHECK-NEXT: [[INC_LAG1_LCSSA:%.*]] = phi ptr [ [[INC_LAG1]], %[[FOR_BODY]] ], [ [[IND_ESCAPE]], %[[MIDDLE_BLOCK]] ] -; CHECK-NEXT: ret ptr [[INC_LAG1_LCSSA]] +define ptr @both(ptr %p, i32 %k) { +; VEC-LABEL: define ptr @both( +; VEC-SAME: ptr [[P:%.*]], i32 [[K:%.*]]) { +; VEC-NEXT: [[ENTRY:.*]]: +; VEC-NEXT: [[BASE:%.*]] = getelementptr inbounds i32, ptr [[P]], i64 1 +; VEC-NEXT: [[TMP0:%.*]] = add i32 [[K]], -1 +; VEC-NEXT: [[TMP1:%.*]] = zext i32 [[TMP0]] to i64 +; VEC-NEXT: [[TMP2:%.*]] = add nuw nsw i64 [[TMP1]], 1 +; VEC-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[TMP2]], 2 +; VEC-NEXT: br i1 [[MIN_ITERS_CHECK]], label %[[SCALAR_PH:.*]], label %[[VECTOR_PH:.*]] +; VEC: [[VECTOR_PH]]: +; 
VEC-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[TMP2]], 2 +; VEC-NEXT: [[N_VEC:%.*]] = sub i64 [[TMP2]], [[N_MOD_VF]] +; VEC-NEXT: [[TMP3:%.*]] = trunc i64 [[N_VEC]] to i32 +; VEC-NEXT: [[TMP4:%.*]] = mul i64 [[N_VEC]], 4 +; VEC-NEXT: [[TMP5:%.*]] = getelementptr i8, ptr [[BASE]], i64 [[TMP4]] +; VEC-NEXT: br label %[[VECTOR_BODY:.*]] +; VEC: [[VECTOR_BODY]]: +; VEC-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ] +; VEC-NEXT: [[POINTER_PHI:%.*]] = phi ptr [ [[BASE]], %[[VECTOR_PH]] ], [ [[PTR_IND:%.*]], %[[VECTOR_BODY]] ] +; VEC-NEXT: [[VECTOR_GEP:%.*]] = getelementptr i8, ptr [[POINTER_PHI]], <2 x i64> +; VEC-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 2 +; VEC-NEXT: [[PTR_IND]] = getelementptr i8, ptr [[POINTER_PHI]], i64 8 +; VEC-NEXT: [[TMP6:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] +; VEC-NEXT: br i1 [[TMP6]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], {{!llvm.loop ![0-9]+}} +; VEC: [[MIDDLE_BLOCK]]: +; VEC-NEXT: [[VECTOR_RECUR_EXTRACT:%.*]] = extractelement <2 x ptr> [[VECTOR_GEP]], i32 1 +; VEC-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[TMP2]], [[N_VEC]] +; VEC-NEXT: [[IND_ESCAPE:%.*]] = getelementptr i8, ptr [[TMP5]], i64 -4 +; VEC-NEXT: br i1 [[CMP_N]], label %[[FOR_END:.*]], label %[[SCALAR_PH]] +; VEC: [[SCALAR_PH]]: +; VEC-NEXT: [[BC_RESUME_VAL:%.*]] = phi i32 [ [[TMP3]], %[[MIDDLE_BLOCK]] ], [ 0, %[[ENTRY]] ] +; VEC-NEXT: [[BC_RESUME_VAL1:%.*]] = phi ptr [ [[TMP5]], %[[MIDDLE_BLOCK]] ], [ [[BASE]], %[[ENTRY]] ] +; VEC-NEXT: [[SCALAR_RECUR_INIT:%.*]] = phi ptr [ [[VECTOR_RECUR_EXTRACT]], %[[MIDDLE_BLOCK]] ], [ [[BASE]], %[[ENTRY]] ] +; VEC-NEXT: br label %[[FOR_BODY:.*]] +; VEC: [[FOR_BODY]]: +; VEC-NEXT: [[INC_PHI:%.*]] = phi i32 [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ], [ [[INC:%.*]], %[[FOR_BODY]] ] +; VEC-NEXT: [[INC_LAG1:%.*]] = phi ptr [ [[BC_RESUME_VAL1]], %[[SCALAR_PH]] ], [ [[TMP:%.*]], %[[FOR_BODY]] ] +; VEC-NEXT: [[INC_LAG2:%.*]] = phi ptr [ [[SCALAR_RECUR_INIT]], %[[SCALAR_PH]] ], [ [[INC_LAG1]], 
%[[FOR_BODY]] ] +; VEC-NEXT: [[TMP]] = getelementptr inbounds i32, ptr [[INC_LAG1]], i64 1 +; VEC-NEXT: [[INC]] = add nsw i32 [[INC_PHI]], 1 +; VEC-NEXT: [[CMP:%.*]] = icmp eq i32 [[INC]], [[K]] +; VEC-NEXT: br i1 [[CMP]], label %[[FOR_END]], label %[[FOR_BODY]], {{!llvm.loop ![0-9]+}} +; VEC: [[FOR_END]]: +; VEC-NEXT: [[INC_LAG1_LCSSA:%.*]] = phi ptr [ [[INC_LAG1]], %[[FOR_BODY]] ], [ [[IND_ESCAPE]], %[[MIDDLE_BLOCK]] ] +; VEC-NEXT: ret ptr [[INC_LAG1_LCSSA]] +; +; INTERLEAVE-LABEL: define ptr @both( +; INTERLEAVE-SAME: ptr [[P:%.*]], i32 [[K:%.*]]) { +; INTERLEAVE-NEXT: [[ENTRY:.*]]: +; INTERLEAVE-NEXT: [[BASE:%.*]] = getelementptr inbounds i32, ptr [[P]], i64 1 +; INTERLEAVE-NEXT: [[TMP0:%.*]] = add i32 [[K]], -1 +; INTERLEAVE-NEXT: [[TMP1:%.*]] = zext i32 [[TMP0]] to i64 +; INTERLEAVE-NEXT: [[TMP2:%.*]] = add nuw nsw i64 [[TMP1]], 1 +; INTERLEAVE-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[TMP2]], 2 +; INTERLEAVE-NEXT: br i1 [[MIN_ITERS_CHECK]], label %[[SCALAR_PH:.*]], label %[[VECTOR_PH:.*]] +; INTERLEAVE: [[VECTOR_PH]]: +; INTERLEAVE-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[TMP2]], 2 +; INTERLEAVE-NEXT: [[N_VEC:%.*]] = sub i64 [[TMP2]], [[N_MOD_VF]] +; INTERLEAVE-NEXT: [[TMP3:%.*]] = trunc i64 [[N_VEC]] to i32 +; INTERLEAVE-NEXT: [[TMP6:%.*]] = mul i64 [[N_VEC]], 4 +; INTERLEAVE-NEXT: [[NEXT_GEP:%.*]] = getelementptr i8, ptr [[BASE]], i64 [[TMP6]] +; INTERLEAVE-NEXT: br label %[[VECTOR_BODY:.*]] +; INTERLEAVE: [[VECTOR_BODY]]: +; INTERLEAVE-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ] +; INTERLEAVE-NEXT: [[OFFSET_IDX:%.*]] = mul i64 [[INDEX]], 4 +; INTERLEAVE-NEXT: [[TMP8:%.*]] = add i64 [[OFFSET_IDX]], 4 +; INTERLEAVE-NEXT: [[NEXT_GEP1:%.*]] = getelementptr i8, ptr [[BASE]], i64 [[TMP8]] +; INTERLEAVE-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 2 +; INTERLEAVE-NEXT: [[TMP7:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] +; INTERLEAVE-NEXT: br i1 [[TMP7]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], 
{{!llvm.loop ![0-9]+}} +; INTERLEAVE: [[MIDDLE_BLOCK]]: +; INTERLEAVE-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[TMP2]], [[N_VEC]] +; INTERLEAVE-NEXT: [[IND_ESCAPE:%.*]] = getelementptr i8, ptr [[NEXT_GEP]], i64 -4 +; INTERLEAVE-NEXT: br i1 [[CMP_N]], label %[[FOR_END:.*]], label %[[SCALAR_PH]] +; INTERLEAVE: [[SCALAR_PH]]: +; INTERLEAVE-NEXT: [[BC_RESUME_VAL:%.*]] = phi i32 [ [[TMP3]], %[[MIDDLE_BLOCK]] ], [ 0, %[[ENTRY]] ] +; INTERLEAVE-NEXT: [[BC_RESUME_VAL1:%.*]] = phi ptr [ [[NEXT_GEP]], %[[MIDDLE_BLOCK]] ], [ [[BASE]], %[[ENTRY]] ] +; INTERLEAVE-NEXT: [[SCALAR_RECUR_INIT:%.*]] = phi ptr [ [[NEXT_GEP1]], %[[MIDDLE_BLOCK]] ], [ [[BASE]], %[[ENTRY]] ] +; INTERLEAVE-NEXT: br label %[[FOR_BODY:.*]] +; INTERLEAVE: [[FOR_BODY]]: +; INTERLEAVE-NEXT: [[INC_PHI:%.*]] = phi i32 [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ], [ [[INC:%.*]], %[[FOR_BODY]] ] +; INTERLEAVE-NEXT: [[INC_LAG1:%.*]] = phi ptr [ [[BC_RESUME_VAL1]], %[[SCALAR_PH]] ], [ [[TMP:%.*]], %[[FOR_BODY]] ] +; INTERLEAVE-NEXT: [[INC_LAG2:%.*]] = phi ptr [ [[SCALAR_RECUR_INIT]], %[[SCALAR_PH]] ], [ [[INC_LAG1]], %[[FOR_BODY]] ] +; INTERLEAVE-NEXT: [[TMP]] = getelementptr inbounds i32, ptr [[INC_LAG1]], i64 1 +; INTERLEAVE-NEXT: [[INC]] = add nsw i32 [[INC_PHI]], 1 +; INTERLEAVE-NEXT: [[CMP:%.*]] = icmp eq i32 [[INC]], [[K]] +; INTERLEAVE-NEXT: br i1 [[CMP]], label %[[FOR_END]], label %[[FOR_BODY]], {{!llvm.loop ![0-9]+}} +; INTERLEAVE: [[FOR_END]]: +; INTERLEAVE-NEXT: [[INC_LAG1_LCSSA:%.*]] = phi ptr [ [[INC_LAG1]], %[[FOR_BODY]] ], [ [[IND_ESCAPE]], %[[MIDDLE_BLOCK]] ] +; INTERLEAVE-NEXT: ret ptr [[INC_LAG1_LCSSA]] ; entry: - %base = getelementptr inbounds i32, ptr undef, i64 1 + %base = getelementptr inbounds i32, ptr %p, i64 1 br label %for.body for.body: %inc.phi = phi i32 [ 0, %entry ], [ %inc, %for.body ] %inc.lag1 = phi ptr [ %base, %entry ], [ %tmp, %for.body] - %inc.lag2 = phi ptr [ undef, %entry ], [ %inc.lag1, %for.body] + %inc.lag2 = phi ptr [ %base, %entry ], [ %inc.lag1, %for.body] %tmp = getelementptr 
inbounds i32, ptr %inc.lag1, i64 1 %inc = add nsw i32 %inc.phi, 1 %cmp = icmp eq i32 %inc, %k From 7ce71414ec3c7eebe77c1c248c119a7df5067369 Mon Sep 17 00:00:00 2001 From: Abhishek Varma Date: Mon, 1 Dec 2025 13:44:15 +0530 Subject: [PATCH 09/39] [NFC][Linalg] Follow-up on ConvMatchBuilder (#170080) -- This commit addresses [follow-up review comments on 169704](https://github.com/llvm/llvm-project/pull/169704#pullrequestreview-3521785548). -- Contains NFC nit/minor changes. Signed-off-by: Abhishek Varma --- mlir/lib/Dialect/Linalg/Utils/Utils.cpp | 156 +++++++++++++----------- 1 file changed, 85 insertions(+), 71 deletions(-) diff --git a/mlir/lib/Dialect/Linalg/Utils/Utils.cpp b/mlir/lib/Dialect/Linalg/Utils/Utils.cpp index e85a2ab26bd32..01e6e1e248658 100644 --- a/mlir/lib/Dialect/Linalg/Utils/Utils.cpp +++ b/mlir/lib/Dialect/Linalg/Utils/Utils.cpp @@ -430,19 +430,33 @@ static bool convLayoutMatches(ArrayRef> mapListExpected, }))); } -/// Enum of all kinds of Pooling Op's type. -enum PoolingType { - NONE, - MAX_SIGNED, - MAX_UNSIGNED, - MIN_SIGNED, - MIN_UNSIGNED, - SUM +/// Enum representing pooling operation types used by ConvMatcherBuilder. +enum class PoolingType { + None, + MaxSigned, + MaxUnsigned, + MinSigned, + MinUnsigned, + Sum }; /// Helper class for building convolution op matchers with minimal boilerplate. /// Reduces repetitive code across Conv1D/2D/3D and Depthwise variants as well /// as Pooling ops. +/// +/// Usage: Create an instance with the op, spatial rank, and output pointers for +/// extracted dilations/strides. Then chain matchStride() calls for each spatial +/// dimension, followed by matchMaps() to verify indexing maps, and finally +/// matchBody() to verify the operation body pattern. +/// +/// The `matched` flag starts as `true` and is set to `false` if any match step +/// fails. This allows chaining multiple match calls; once any match fails, all +/// subsequent calls become no-ops and the final result is `false`. 
+/// +/// The `dilations` and `strides` pointers are output parameters that get +/// populated with the extracted dilation and stride values from the operation's +/// indexing maps during matchStride() calls. These values are initially set to +/// 1 for each spatial dimension and updated as patterns are matched. class ConvMatcherBuilder { LinalgOp op; MLIRContext *ctx; @@ -454,7 +468,7 @@ class ConvMatcherBuilder { public: ConvMatcherBuilder(LinalgOp op, unsigned spatialRank, SmallVector *d, SmallVector *s, - PoolingType poolingType = PoolingType::NONE) + PoolingType poolingType = PoolingType::None) : op(op), ctx(op->getContext()), dilations(d), strides(s), indexingMaps(op.getIndexingMaps()), poolingType(poolingType) { *dilations = SmallVector(spatialRank, 1); @@ -474,16 +488,16 @@ class ConvMatcherBuilder { ConvMatcherBuilder &matchStride(unsigned iDim, unsigned fDim, unsigned oDim, unsigned idx) { if (matched) { - matched = matchConvDimAddExprPattern(indexingMaps, iDim, fDim, oDim, - (*dilations)[idx], (*strides)[idx]); + matched &= matchConvDimAddExprPattern(indexingMaps, iDim, fDim, oDim, + (*dilations)[idx], (*strides)[idx]); } return *this; } /// Match expected indexing maps layout. Returns *this for method chaining. 
- ConvMatcherBuilder &expectMaps(ArrayRef> maps) { + ConvMatcherBuilder &matchMaps(ArrayRef> maps) { if (matched) - matched = convLayoutMatches(maps, indexingMaps, ctx); + matched &= convLayoutMatches(maps, indexingMaps, ctx); return *this; } @@ -494,17 +508,17 @@ class ConvMatcherBuilder { Block *body = op.getBlock(); auto yieldOp = cast(body->getTerminator()); switch (poolingType) { - case PoolingType::NONE: + case PoolingType::None: return bodyMatcherForConvolutionOps(yieldOp.getOperand(0), body); - case PoolingType::MAX_SIGNED: + case PoolingType::MaxSigned: return bodyMatcherForMaxSignedPoolOps(yieldOp.getOperand(0), body); - case PoolingType::MAX_UNSIGNED: + case PoolingType::MaxUnsigned: return bodyMatcherForMaxUnsignedPoolOps(yieldOp.getOperand(0), body); - case PoolingType::MIN_SIGNED: + case PoolingType::MinSigned: return bodyMatcherForMinSignedPoolOps(yieldOp.getOperand(0), body); - case PoolingType::MIN_UNSIGNED: + case PoolingType::MinUnsigned: return bodyMatcherForMinUnsignedPoolOps(yieldOp.getOperand(0), body); - case PoolingType::SUM: + case PoolingType::Sum: return bodyMatcherForSumPoolOps(yieldOp.getOperand(0), body); } return false; @@ -533,9 +547,9 @@ bool isaConvolutionOpOfType(LinalgOp op, AffineExpr w = m.dim(1); return m.matchStride(/*iDim=*/0, /*fDim=*/0, /*oDim=*/0, /*idx=*/0) - .expectMaps({/*inputMap=*/{m.strided(W, w, 0)}, - /*filterMap=*/{w}, - /*outputMap=*/{W}}) + .matchMaps({/*inputMap=*/{m.strided(W, w, 0)}, + /*filterMap=*/{w}, + /*outputMap=*/{W}}) .matchBody(); } @@ -560,9 +574,9 @@ bool isaConvolutionOpOfType( AffineExpr c = m.dim(4); return m.matchStride(/*iDim=*/1, /*fDim=*/0, /*oDim=*/1, /*idx=*/0) - .expectMaps({/*inputMap=*/{N, m.strided(W, w, 0), c}, - /*filterMap=*/{w, c, F}, - /*outputMap=*/{N, W, F}}) + .matchMaps({/*inputMap=*/{N, m.strided(W, w, 0), c}, + /*filterMap=*/{w, c, F}, + /*outputMap=*/{N, W, F}}) .matchBody(); } @@ -587,9 +601,9 @@ bool isaConvolutionOpOfType( AffineExpr w = m.dim(4); return 
m.matchStride(/*iDim=*/2, /*fDim=*/2, /*oDim=*/2, /*idx=*/0) - .expectMaps({/*inputMap=*/{N, c, m.strided(W, w, 0)}, - /*filterMap=*/{F, c, w}, - /*outputMap=*/{N, F, W}}) + .matchMaps({/*inputMap=*/{N, c, m.strided(W, w, 0)}, + /*filterMap=*/{F, c, w}, + /*outputMap=*/{N, F, W}}) .matchBody(); } @@ -614,9 +628,9 @@ bool isaConvolutionOpOfType(LinalgOp op, return m.matchStride(/*iDim=*/0, /*fDim=*/0, /*oDim=*/0, /*idx=*/0) .matchStride(/*iDim=*/1, /*fDim=*/1, /*oDim=*/1, /*idx=*/1) - .expectMaps({/*inputMap=*/{m.strided(H, h, 0), m.strided(W, w, 1)}, - /*filterMap=*/{h, w}, - /*outputMap=*/{H, W}}) + .matchMaps({/*inputMap=*/{m.strided(H, h, 0), m.strided(W, w, 1)}, + /*filterMap=*/{h, w}, + /*outputMap=*/{H, W}}) .matchBody(); } @@ -644,10 +658,10 @@ bool isaConvolutionOpOfType(LinalgOp op, return m.matchStride(/*iDim=*/0, /*fDim=*/0, /*oDim=*/0, /*idx=*/0) .matchStride(/*iDim=*/1, /*fDim=*/1, /*oDim=*/1, /*idx=*/1) .matchStride(/*iDim=*/2, /*fDim=*/2, /*oDim=*/2, /*idx=*/2) - .expectMaps({/*inputMap=*/{m.strided(D, d, 0), m.strided(H, h, 1), - m.strided(W, w, 2)}, - /*filterMap=*/{d, h, w}, - /*outputMap=*/{D, H, W}}) + .matchMaps({/*inputMap=*/{m.strided(D, d, 0), m.strided(H, h, 1), + m.strided(W, w, 2)}, + /*filterMap=*/{d, h, w}, + /*outputMap=*/{D, H, W}}) .matchBody(); } @@ -671,9 +685,9 @@ bool isaConvolutionOpOfType( AffineExpr w = m.dim(3); return m.matchStride(/*iDim=*/2, /*fDim=*/1, /*oDim=*/2, /*idx=*/0) - .expectMaps({/*inputMap=*/{N, C, m.strided(W, w, 0)}, - /*filterMap=*/{C, w}, - /*outputMap=*/{N, C, W}}) + .matchMaps({/*inputMap=*/{N, C, m.strided(W, w, 0)}, + /*filterMap=*/{C, w}, + /*outputMap=*/{N, C, W}}) .matchBody(); } @@ -697,9 +711,9 @@ bool isaConvolutionOpOfType( AffineExpr w = m.dim(3); return m.matchStride(/*iDim=*/1, /*fDim=*/0, /*oDim=*/1, /*idx=*/0) - .expectMaps({/*inputMap=*/{N, m.strided(W, w, 0), C}, - /*filterMap=*/{w, C}, - /*outputMap=*/{N, W, C}}) + .matchMaps({/*inputMap=*/{N, m.strided(W, w, 0), C}, + /*filterMap=*/{w, 
C}, + /*outputMap=*/{N, W, C}}) .matchBody(); } @@ -724,9 +738,9 @@ bool isaConvolutionOpOfType( AffineExpr w = m.dim(4); return m.matchStride(/*iDim=*/1, /*fDim=*/0, /*oDim=*/1, /*idx=*/0) - .expectMaps({/*inputMap=*/{N, m.strided(W, w, 0), C}, - /*filterMap=*/{w, C, CM}, - /*outputMap=*/{N, W, C, CM}}) + .matchMaps({/*inputMap=*/{N, m.strided(W, w, 0), C}, + /*filterMap=*/{w, C, CM}, + /*outputMap=*/{N, W, C, CM}}) .matchBody(); } @@ -753,9 +767,9 @@ bool isaConvolutionOpOfType( return m.matchStride(/*iDim=*/2, /*fDim=*/1, /*oDim=*/2, /*idx=*/0) .matchStride(/*iDim=*/3, /*fDim=*/2, /*oDim=*/3, /*idx=*/1) - .expectMaps({/*inputMap=*/{N, C, m.strided(H, h, 0), m.strided(W, w, 1)}, - /*filterMap=*/{C, h, w}, - /*outputMap=*/{N, C, H, W}}) + .matchMaps({/*inputMap=*/{N, C, m.strided(H, h, 0), m.strided(W, w, 1)}, + /*filterMap=*/{C, h, w}, + /*outputMap=*/{N, C, H, W}}) .matchBody(); } @@ -789,10 +803,10 @@ bool isaConvolutionOpOfType( return m.matchStride(/*iDim=*/1, /*fDim=*/0, /*oDim=*/1, /*idx=*/0) .matchStride(/*iDim=*/2, /*fDim=*/1, /*oDim=*/2, /*idx=*/1) .matchStride(/*iDim=*/3, /*fDim=*/2, /*oDim=*/3, /*idx=*/2) - .expectMaps({/*inputMap=*/{N, m.strided(D, d, 0), m.strided(H, h, 1), - m.strided(W, w, 2), C}, - /*filterMap=*/{d, h, w, C, CM}, - /*outputMap=*/{N, D, H, W, C, CM}}) + .matchMaps({/*inputMap=*/{N, m.strided(D, d, 0), m.strided(H, h, 1), + m.strided(W, w, 2), C}, + /*filterMap=*/{d, h, w, C, CM}, + /*outputMap=*/{N, D, H, W, C, CM}}) .matchBody(); } @@ -810,7 +824,7 @@ bool isaConvolutionOpOfType( "expected op to implement ConvolutionOpInterface"); ConvMatcherBuilder m(op, /*spatialRank=*/2, dilations, strides, - PoolingType::MAX_SIGNED); + PoolingType::MaxSigned); AffineExpr N = m.dim(0); AffineExpr H = m.dim(1); AffineExpr W = m.dim(2); @@ -820,9 +834,9 @@ bool isaConvolutionOpOfType( return m.matchStride(/*iDim=*/1, /*fDim=*/0, /*oDim=*/1, /*idx=*/0) .matchStride(/*iDim=*/2, /*fDim=*/1, /*oDim=*/2, /*idx=*/1) - .expectMaps({/*inputMap=*/{N, 
m.strided(H, h, 0), m.strided(W, w, 1), C}, - /*filterMap=*/{h, w}, - /*outputMap=*/{N, H, W, C}}) + .matchMaps({/*inputMap=*/{N, m.strided(H, h, 0), m.strided(W, w, 1), C}, + /*filterMap=*/{h, w}, + /*outputMap=*/{N, H, W, C}}) .matchBody(); } @@ -840,7 +854,7 @@ bool isaConvolutionOpOfType( "expected op to implement ConvolutionOpInterface"); ConvMatcherBuilder m(op, /*spatialRank=*/2, dilations, strides, - PoolingType::MIN_SIGNED); + PoolingType::MinSigned); AffineExpr N = m.dim(0); AffineExpr H = m.dim(1); AffineExpr W = m.dim(2); @@ -850,9 +864,9 @@ bool isaConvolutionOpOfType( return m.matchStride(/*iDim=*/1, /*fDim=*/0, /*oDim=*/1, /*idx=*/0) .matchStride(/*iDim=*/2, /*fDim=*/1, /*oDim=*/2, /*idx=*/1) - .expectMaps({/*inputMap=*/{N, m.strided(H, h, 0), m.strided(W, w, 1), C}, - /*filterMap=*/{h, w}, - /*outputMap=*/{N, H, W, C}}) + .matchMaps({/*inputMap=*/{N, m.strided(H, h, 0), m.strided(W, w, 1), C}, + /*filterMap=*/{h, w}, + /*outputMap=*/{N, H, W, C}}) .matchBody(); } @@ -870,7 +884,7 @@ bool isaConvolutionOpOfType( "expected op to implement ConvolutionOpInterface"); ConvMatcherBuilder m(op, /*spatialRank=*/2, dilations, strides, - PoolingType::SUM); + PoolingType::Sum); AffineExpr N = m.dim(0); AffineExpr H = m.dim(1); AffineExpr W = m.dim(2); @@ -880,9 +894,9 @@ bool isaConvolutionOpOfType( return m.matchStride(/*iDim=*/1, /*fDim=*/0, /*oDim=*/1, /*idx=*/0) .matchStride(/*iDim=*/2, /*fDim=*/1, /*oDim=*/2, /*idx=*/1) - .expectMaps({/*inputMap=*/{N, m.strided(H, h, 0), m.strided(W, w, 1), C}, - /*filterMap=*/{h, w}, - /*outputMap=*/{N, H, W, C}}) + .matchMaps({/*inputMap=*/{N, m.strided(H, h, 0), m.strided(W, w, 1), C}, + /*filterMap=*/{h, w}, + /*outputMap=*/{N, H, W, C}}) .matchBody(); } @@ -900,7 +914,7 @@ bool isaConvolutionOpOfType( "expected op to implement ConvolutionOpInterface"); ConvMatcherBuilder m(op, /*spatialRank=*/2, dilations, strides, - PoolingType::MAX_UNSIGNED); + PoolingType::MaxUnsigned); AffineExpr N = m.dim(0); AffineExpr H = 
m.dim(1); AffineExpr W = m.dim(2); @@ -910,9 +924,9 @@ bool isaConvolutionOpOfType( return m.matchStride(/*iDim=*/1, /*fDim=*/0, /*oDim=*/1, /*idx=*/0) .matchStride(/*iDim=*/2, /*fDim=*/1, /*oDim=*/2, /*idx=*/1) - .expectMaps({/*inputMap=*/{N, m.strided(H, h, 0), m.strided(W, w, 1), C}, - /*filterMap=*/{h, w}, - /*outputMap=*/{N, H, W, C}}) + .matchMaps({/*inputMap=*/{N, m.strided(H, h, 0), m.strided(W, w, 1), C}, + /*filterMap=*/{h, w}, + /*outputMap=*/{N, H, W, C}}) .matchBody(); } @@ -930,7 +944,7 @@ bool isaConvolutionOpOfType( "expected op to implement ConvolutionOpInterface"); ConvMatcherBuilder m(op, /*spatialRank=*/2, dilations, strides, - PoolingType::MIN_UNSIGNED); + PoolingType::MinUnsigned); AffineExpr N = m.dim(0); AffineExpr H = m.dim(1); AffineExpr W = m.dim(2); @@ -940,9 +954,9 @@ bool isaConvolutionOpOfType( return m.matchStride(/*iDim=*/1, /*fDim=*/0, /*oDim=*/1, /*idx=*/0) .matchStride(/*iDim=*/2, /*fDim=*/1, /*oDim=*/2, /*idx=*/1) - .expectMaps({/*inputMap=*/{N, m.strided(H, h, 0), m.strided(W, w, 1), C}, - /*filterMap=*/{h, w}, - /*outputMap=*/{N, H, W, C}}) + .matchMaps({/*inputMap=*/{N, m.strided(H, h, 0), m.strided(W, w, 1), C}, + /*filterMap=*/{h, w}, + /*outputMap=*/{N, H, W, C}}) .matchBody(); } From f67b01847031aadd4d9d9b90e82c99d0490c4287 Mon Sep 17 00:00:00 2001 From: Matthias Springer Date: Mon, 1 Dec 2025 16:15:15 +0800 Subject: [PATCH 10/39] [mlir][SPIRV] Improve ub.unreachable lowering test case (#170083) Addresses a comment on the PR that introduces the ub.reachable -> spriv.Unreachable lowering (https://github.com/llvm/llvm-project/pull/169872#discussion_r2573670611). 
--- mlir/test/Conversion/UBToSPIRV/ub-to-spirv.mlir | 16 +++++++++------- 1 file changed, 9 insertions(+), 7 deletions(-) diff --git a/mlir/test/Conversion/UBToSPIRV/ub-to-spirv.mlir b/mlir/test/Conversion/UBToSPIRV/ub-to-spirv.mlir index edbe8b8001bba..9c277cf99b9a8 100644 --- a/mlir/test/Conversion/UBToSPIRV/ub-to-spirv.mlir +++ b/mlir/test/Conversion/UBToSPIRV/ub-to-spirv.mlir @@ -1,4 +1,4 @@ -// RUN: mlir-opt -split-input-file -convert-ub-to-spirv -verify-diagnostics %s | FileCheck %s +// RUN: mlir-opt -split-input-file -convert-ub-to-spirv %s | FileCheck %s module attributes { spirv.target_env = #spirv.target_env< @@ -22,15 +22,17 @@ func.func @check_poison() { // ----- -// No successful test because the dialect conversion framework does not convert -// unreachable blocks. - module attributes { spirv.target_env = #spirv.target_env< #spirv.vce, #spirv.resource_limits<>> } { -func.func @check_unrechable() { -// expected-error@+1{{cannot be used in reachable block}} - spirv.Unreachable +// CHECK-LABEL: @check_unrechable +func.func @check_unrechable(%c: i1) { + cf.cond_br %c, ^bb1, ^bb2 +^bb1: +// CHECK: spirv.Unreachable + ub.unreachable +^bb2: + return } } From 05b19895510af314a78ed42c6a969c4478a8f496 Mon Sep 17 00:00:00 2001 From: Matthias Springer Date: Mon, 1 Dec 2025 16:28:23 +0800 Subject: [PATCH 11/39] [mlir][arith] Add support for `negf` to `ArithToAPFloat` (#169759) Add support for `arith.negf`. 
--- .../ArithToAPFloat/ArithToAPFloat.cpp | 47 ++++++++++++++++++- mlir/lib/ExecutionEngine/APFloatWrappers.cpp | 9 ++++ .../ArithToApfloat/arith-to-apfloat.mlir | 10 ++++ .../Arith/CPU/test-apfloat-emulation.mlir | 4 ++ 4 files changed, 68 insertions(+), 2 deletions(-) diff --git a/mlir/lib/Conversion/ArithToAPFloat/ArithToAPFloat.cpp b/mlir/lib/Conversion/ArithToAPFloat/ArithToAPFloat.cpp index 566632bd8707f..024a97b03c14e 100644 --- a/mlir/lib/Conversion/ArithToAPFloat/ArithToAPFloat.cpp +++ b/mlir/lib/Conversion/ArithToAPFloat/ArithToAPFloat.cpp @@ -449,6 +449,49 @@ struct CmpFOpToAPFloatConversion final : OpRewritePattern { SymbolOpInterface symTable; }; +struct NegFOpToAPFloatConversion final : OpRewritePattern { + NegFOpToAPFloatConversion(MLIRContext *context, SymbolOpInterface symTable, + PatternBenefit benefit = 1) + : OpRewritePattern(context, benefit), symTable(symTable) {} + + LogicalResult matchAndRewrite(arith::NegFOp op, + PatternRewriter &rewriter) const override { + // Get APFloat function from runtime library. + auto i32Type = IntegerType::get(symTable->getContext(), 32); + auto i64Type = IntegerType::get(symTable->getContext(), 64); + FailureOr fn = + lookupOrCreateApFloatFn(rewriter, symTable, "neg", {i32Type, i64Type}); + if (failed(fn)) + return fn; + + // Cast operand to 64-bit integer. + rewriter.setInsertionPoint(op); + Location loc = op.getLoc(); + auto floatTy = cast(op.getOperand().getType()); + auto intWType = rewriter.getIntegerType(floatTy.getWidth()); + Value operandBits = arith::ExtUIOp::create( + rewriter, loc, i64Type, arith::BitcastOp::create(rewriter, loc, intWType, op.getOperand())); + + // Call APFloat function. + Value semValue = getSemanticsValue(rewriter, loc, floatTy); + SmallVector params = {semValue, operandBits}; + Value negatedBits = + func::CallOp::create(rewriter, loc, TypeRange(i64Type), + SymbolRefAttr::get(*fn), params) + ->getResult(0); + + // Truncate result to the original width. 
+ Value truncatedBits = arith::TruncIOp::create(rewriter, loc, intWType, + negatedBits); + Value result = + arith::BitcastOp::create(rewriter, loc, floatTy, truncatedBits); + rewriter.replaceOp(op, result); + return success(); + } + + SymbolOpInterface symTable; +}; + namespace { struct ArithToAPFloatConversionPass final : impl::ArithToAPFloatConversionPassBase { @@ -471,7 +514,8 @@ void ArithToAPFloatConversionPass::runOnOperation() { patterns.add>( context, "remainder", getOperation()); patterns - .add, FpToFpConversion>( + .add, FpToFpConversion, + CmpFOpToAPFloatConversion, NegFOpToAPFloatConversion>( context, getOperation()); patterns.add>(context, getOperation(), /*isUnsigned=*/false); @@ -481,7 +525,6 @@ void ArithToAPFloatConversionPass::runOnOperation() { /*isUnsigned=*/false); patterns.add>(context, getOperation(), /*isUnsigned=*/true); - patterns.add(context, getOperation()); LogicalResult result = success(); ScopedDiagnosticHandler scopedHandler(context, [&result](Diagnostic &diag) { if (diag.getSeverity() == DiagnosticSeverity::Error) { diff --git a/mlir/lib/ExecutionEngine/APFloatWrappers.cpp b/mlir/lib/ExecutionEngine/APFloatWrappers.cpp index 77f7137264888..f2d5254be6b57 100644 --- a/mlir/lib/ExecutionEngine/APFloatWrappers.cpp +++ b/mlir/lib/ExecutionEngine/APFloatWrappers.cpp @@ -142,4 +142,13 @@ MLIR_APFLOAT_WRAPPERS_EXPORT int8_t _mlir_apfloat_compare(int32_t semantics, llvm::APFloat y(sem, llvm::APInt(bitWidth, b)); return static_cast(x.compare(y)); } + +MLIR_APFLOAT_WRAPPERS_EXPORT uint64_t _mlir_apfloat_neg(int32_t semantics, uint64_t a) { + const llvm::fltSemantics &sem = llvm::APFloatBase::EnumToSemantics( + static_cast(semantics)); + unsigned bitWidth = llvm::APFloatBase::semanticsSizeInBits(sem); + llvm::APFloat x(sem, llvm::APInt(bitWidth, a)); + x.changeSign(); + return x.bitcastToAPInt().getZExtValue(); +} } diff --git a/mlir/test/Conversion/ArithToApfloat/arith-to-apfloat.mlir 
b/mlir/test/Conversion/ArithToApfloat/arith-to-apfloat.mlir index 78ce3640ecc67..775cb5ea60f22 100644 --- a/mlir/test/Conversion/ArithToApfloat/arith-to-apfloat.mlir +++ b/mlir/test/Conversion/ArithToApfloat/arith-to-apfloat.mlir @@ -213,3 +213,13 @@ func.func @cmpf(%arg0: f4E2M1FN, %arg1: f4E2M1FN) { %0 = arith.cmpf "ult", %arg0, %arg1 : f4E2M1FN return } + +// ----- + +// CHECK: func.func private @_mlir_apfloat_neg(i32, i64) -> i64 +// CHECK: %[[sem:.*]] = arith.constant 2 : i32 +// CHECK: %[[res:.*]] = call @_mlir_apfloat_neg(%[[sem]], %{{.*}}) : (i32, i64) -> i64 +func.func @negf(%arg0: f32) { + %0 = arith.negf %arg0 : f32 + return +} diff --git a/mlir/test/Integration/Dialect/Arith/CPU/test-apfloat-emulation.mlir b/mlir/test/Integration/Dialect/Arith/CPU/test-apfloat-emulation.mlir index 433d058d025cf..555cc9a531966 100644 --- a/mlir/test/Integration/Dialect/Arith/CPU/test-apfloat-emulation.mlir +++ b/mlir/test/Integration/Dialect/Arith/CPU/test-apfloat-emulation.mlir @@ -43,6 +43,10 @@ func.func @entry() { %cvt = arith.truncf %b2 : f32 to f8E4M3FN vector.print %cvt : f8E4M3FN + // CHECK-NEXT: -2.25 + %negated = arith.negf %cvt : f8E4M3FN + vector.print %negated : f8E4M3FN + // CHECK-NEXT: 1 %cmp1 = arith.cmpf "olt", %cvt, %c1 : f8E4M3FN vector.print %cmp1 : i1 From 9afb651613a9383923b0f52885fb2221a5ec134f Mon Sep 17 00:00:00 2001 From: ShashwathiNavada Date: Mon, 1 Dec 2025 14:03:32 +0530 Subject: [PATCH 12/39] Adding support for iterator in motion clauses. (#159112) As described in section 2.14.6 of openmp spec, the patch implements support for iterator in motion clauses. 
--------- Co-authored-by: Shashwathi N --- clang/docs/OpenMPSupport.rst | 2 +- clang/include/clang/AST/OpenMPClause.h | 44 ++++++++++++------- clang/include/clang/Basic/OpenMPKinds.def | 1 + clang/include/clang/Sema/SemaOpenMP.h | 4 +- clang/lib/AST/OpenMPClause.cpp | 38 ++++++++++------ clang/lib/CodeGen/CGOpenMPRuntime.cpp | 18 ++++++++ clang/lib/Parse/ParseOpenMP.cpp | 31 ++++++++----- clang/lib/Sema/SemaOpenMP.cpp | 42 +++++++++++------- clang/lib/Sema/TreeTransform.h | 38 +++++++++++----- clang/lib/Serialization/ASTReader.cpp | 4 ++ clang/lib/Serialization/ASTWriter.cpp | 4 ++ clang/test/OpenMP/target_update_codegen.cpp | 32 ++++++++++++++ .../target_update_iterator_ast_print.cpp | 16 +++++++ .../target_update_iterator_serialization.cpp | 35 +++++++++++++++ 14 files changed, 238 insertions(+), 71 deletions(-) create mode 100644 clang/test/OpenMP/target_update_iterator_ast_print.cpp create mode 100644 clang/test/OpenMP/target_update_iterator_serialization.cpp diff --git a/clang/docs/OpenMPSupport.rst b/clang/docs/OpenMPSupport.rst index e7ca7b0bd0792..ab3f2c48983ca 100644 --- a/clang/docs/OpenMPSupport.rst +++ b/clang/docs/OpenMPSupport.rst @@ -266,7 +266,7 @@ implementation. 
+------------------------------+--------------------------------------------------------------+--------------------------+-----------------------------------------------------------------------+ | device | has_device_addr clause on target construct | :none:`unclaimed` | | +------------------------------+--------------------------------------------------------------+--------------------------+-----------------------------------------------------------------------+ -| device | iterators in map clause or motion clauses | :none:`unclaimed` | | +| device | iterators in map clause or motion clauses | :none:`done` | https://github.com/llvm/llvm-project/pull/159112 | +------------------------------+--------------------------------------------------------------+--------------------------+-----------------------------------------------------------------------+ | device | indirect clause on declare target directive | :part:`In Progress` | | +------------------------------+--------------------------------------------------------------+--------------------------+-----------------------------------------------------------------------+ diff --git a/clang/include/clang/AST/OpenMPClause.h b/clang/include/clang/AST/OpenMPClause.h index 72c5efde7449b..d9c3cf239451e 100644 --- a/clang/include/clang/AST/OpenMPClause.h +++ b/clang/include/clang/AST/OpenMPClause.h @@ -7582,7 +7582,8 @@ class OMPToClause final : public OMPMappableExprListClause, /// Motion-modifiers for the 'to' clause. OpenMPMotionModifierKind MotionModifiers[NumberOfOMPMotionModifiers] = { - OMPC_MOTION_MODIFIER_unknown, OMPC_MOTION_MODIFIER_unknown}; + OMPC_MOTION_MODIFIER_unknown, OMPC_MOTION_MODIFIER_unknown, + OMPC_MOTION_MODIFIER_unknown}; /// Location of motion-modifiers for the 'to' clause. 
SourceLocation MotionModifiersLoc[NumberOfOMPMotionModifiers]; @@ -7654,6 +7655,9 @@ class OMPToClause final : public OMPMappableExprListClause, MotionModifiersLoc[I] = TLoc; } + void setIteratorModifier(Expr *IteratorModifier) { + getTrailingObjects()[2 * varlist_size()] = IteratorModifier; + } /// Set colon location. void setColonLoc(SourceLocation Loc) { ColonLoc = Loc; } @@ -7662,7 +7666,7 @@ class OMPToClause final : public OMPMappableExprListClause, size_t numTrailingObjects(OverloadToken) const { // There are varlist_size() of expressions, and varlist_size() of // user-defined mappers. - return 2 * varlist_size(); + return 2 * varlist_size() + 1; } size_t numTrailingObjects(OverloadToken) const { return getUniqueDeclarationsNum(); @@ -7688,15 +7692,14 @@ class OMPToClause final : public OMPMappableExprListClause, /// \param UDMQualifierLoc C++ nested name specifier for the associated /// user-defined mapper. /// \param MapperId The identifier of associated user-defined mapper. - static OMPToClause *Create(const ASTContext &C, const OMPVarListLocTy &Locs, - ArrayRef Vars, - ArrayRef Declarations, - MappableExprComponentListsRef ComponentLists, - ArrayRef UDMapperRefs, - ArrayRef MotionModifiers, - ArrayRef MotionModifiersLoc, - NestedNameSpecifierLoc UDMQualifierLoc, - DeclarationNameInfo MapperId); + static OMPToClause * + Create(const ASTContext &C, const OMPVarListLocTy &Locs, + ArrayRef Vars, ArrayRef Declarations, + MappableExprComponentListsRef ComponentLists, + ArrayRef UDMapperRefs, Expr *IteratorModifier, + ArrayRef MotionModifiers, + ArrayRef MotionModifiersLoc, + NestedNameSpecifierLoc UDMQualifierLoc, DeclarationNameInfo MapperId); /// Creates an empty clause with the place for \a NumVars variables. 
/// @@ -7717,7 +7720,9 @@ class OMPToClause final : public OMPMappableExprListClause, "Requested modifier exceeds the total number of modifiers."); return MotionModifiers[Cnt]; } - + Expr *getIteratorModifier() const { + return getTrailingObjects()[2 * varlist_size()]; + } /// Fetches the motion-modifier location at 'Cnt' index of array of modifiers' /// locations. /// @@ -7782,7 +7787,8 @@ class OMPFromClause final /// Motion-modifiers for the 'from' clause. OpenMPMotionModifierKind MotionModifiers[NumberOfOMPMotionModifiers] = { - OMPC_MOTION_MODIFIER_unknown, OMPC_MOTION_MODIFIER_unknown}; + OMPC_MOTION_MODIFIER_unknown, OMPC_MOTION_MODIFIER_unknown, + OMPC_MOTION_MODIFIER_unknown}; /// Location of motion-modifiers for the 'from' clause. SourceLocation MotionModifiersLoc[NumberOfOMPMotionModifiers]; @@ -7843,7 +7849,9 @@ class OMPFromClause final "Unexpected index to store motion modifier, exceeds array size."); MotionModifiers[I] = T; } - + void setIteratorModifier(Expr *IteratorModifier) { + getTrailingObjects()[2 * varlist_size()] = IteratorModifier; + } /// Set location for the motion-modifier. /// /// \param I index for motion-modifier location. @@ -7862,7 +7870,7 @@ class OMPFromClause final size_t numTrailingObjects(OverloadToken) const { // There are varlist_size() of expressions, and varlist_size() of // user-defined mappers. 
- return 2 * varlist_size(); + return 2 * varlist_size() + 1; } size_t numTrailingObjects(OverloadToken) const { return getUniqueDeclarationsNum(); @@ -7892,7 +7900,7 @@ class OMPFromClause final Create(const ASTContext &C, const OMPVarListLocTy &Locs, ArrayRef Vars, ArrayRef Declarations, MappableExprComponentListsRef ComponentLists, - ArrayRef UDMapperRefs, + ArrayRef UDMapperRefs, Expr *IteratorExpr, ArrayRef MotionModifiers, ArrayRef MotionModifiersLoc, NestedNameSpecifierLoc UDMQualifierLoc, DeclarationNameInfo MapperId); @@ -7916,7 +7924,9 @@ class OMPFromClause final "Requested modifier exceeds the total number of modifiers."); return MotionModifiers[Cnt]; } - + Expr *getIteratorModifier() const { + return getTrailingObjects()[2 * varlist_size()]; + } /// Fetches the motion-modifier location at 'Cnt' index of array of modifiers' /// locations. /// diff --git a/clang/include/clang/Basic/OpenMPKinds.def b/clang/include/clang/Basic/OpenMPKinds.def index b98b946cad75a..ceac89d3aba6d 100644 --- a/clang/include/clang/Basic/OpenMPKinds.def +++ b/clang/include/clang/Basic/OpenMPKinds.def @@ -207,6 +207,7 @@ OPENMP_MAP_MODIFIER_KIND(ompx_hold) // Modifiers for 'to' or 'from' clause. OPENMP_MOTION_MODIFIER_KIND(mapper) +OPENMP_MOTION_MODIFIER_KIND(iterator) OPENMP_MOTION_MODIFIER_KIND(present) // Static attributes for 'dist_schedule' clause. 
diff --git a/clang/include/clang/Sema/SemaOpenMP.h b/clang/include/clang/Sema/SemaOpenMP.h index 686e51ee92a08..2d05b4423140b 100644 --- a/clang/include/clang/Sema/SemaOpenMP.h +++ b/clang/include/clang/Sema/SemaOpenMP.h @@ -1351,7 +1351,7 @@ class SemaOpenMP : public SemaBase { OMPClause * ActOnOpenMPToClause(ArrayRef MotionModifiers, ArrayRef MotionModifiersLoc, - CXXScopeSpec &MapperIdScopeSpec, + Expr *IteratorModifier, CXXScopeSpec &MapperIdScopeSpec, DeclarationNameInfo &MapperId, SourceLocation ColonLoc, ArrayRef VarList, const OMPVarListLocTy &Locs, ArrayRef UnresolvedMappers = {}); @@ -1359,7 +1359,7 @@ class SemaOpenMP : public SemaBase { OMPClause * ActOnOpenMPFromClause(ArrayRef MotionModifiers, ArrayRef MotionModifiersLoc, - CXXScopeSpec &MapperIdScopeSpec, + Expr *IteratorModifier, CXXScopeSpec &MapperIdScopeSpec, DeclarationNameInfo &MapperId, SourceLocation ColonLoc, ArrayRef VarList, const OMPVarListLocTy &Locs, ArrayRef UnresolvedMappers = {}); diff --git a/clang/lib/AST/OpenMPClause.cpp b/clang/lib/AST/OpenMPClause.cpp index 0640fed823771..2183d77de8fa7 100644 --- a/clang/lib/AST/OpenMPClause.cpp +++ b/clang/lib/AST/OpenMPClause.cpp @@ -1321,7 +1321,7 @@ OMPToClause *OMPToClause::Create( const ASTContext &C, const OMPVarListLocTy &Locs, ArrayRef Vars, ArrayRef Declarations, MappableExprComponentListsRef ComponentLists, ArrayRef UDMapperRefs, - ArrayRef MotionModifiers, + Expr *IteratorModifier, ArrayRef MotionModifiers, ArrayRef MotionModifiersLoc, NestedNameSpecifierLoc UDMQualifierLoc, DeclarationNameInfo MapperId) { OMPMappableExprListSizeTy Sizes; @@ -1343,7 +1343,7 @@ OMPToClause *OMPToClause::Create( void *Mem = C.Allocate( totalSizeToAlloc( - 2 * Sizes.NumVars, Sizes.NumUniqueDeclarations, + 2 * Sizes.NumVars + 1, Sizes.NumUniqueDeclarations, Sizes.NumUniqueDeclarations + Sizes.NumComponentLists, Sizes.NumComponents)); @@ -1353,6 +1353,7 @@ OMPToClause *OMPToClause::Create( Clause->setVarRefs(Vars); Clause->setUDMapperRefs(UDMapperRefs); 
Clause->setClauseInfo(Declarations, ComponentLists); + Clause->setIteratorModifier(IteratorModifier); return Clause; } @@ -1361,17 +1362,19 @@ OMPToClause *OMPToClause::CreateEmpty(const ASTContext &C, void *Mem = C.Allocate( totalSizeToAlloc( - 2 * Sizes.NumVars, Sizes.NumUniqueDeclarations, + 2 * Sizes.NumVars + 1, Sizes.NumUniqueDeclarations, Sizes.NumUniqueDeclarations + Sizes.NumComponentLists, Sizes.NumComponents)); - return new (Mem) OMPToClause(Sizes); + OMPToClause *Clause = new (Mem) OMPToClause(Sizes); + Clause->setIteratorModifier(nullptr); + return Clause; } OMPFromClause *OMPFromClause::Create( const ASTContext &C, const OMPVarListLocTy &Locs, ArrayRef Vars, ArrayRef Declarations, MappableExprComponentListsRef ComponentLists, ArrayRef UDMapperRefs, - ArrayRef MotionModifiers, + Expr *IteratorModifier, ArrayRef MotionModifiers, ArrayRef MotionModifiersLoc, NestedNameSpecifierLoc UDMQualifierLoc, DeclarationNameInfo MapperId) { OMPMappableExprListSizeTy Sizes; @@ -1393,7 +1396,7 @@ OMPFromClause *OMPFromClause::Create( void *Mem = C.Allocate( totalSizeToAlloc( - 2 * Sizes.NumVars, Sizes.NumUniqueDeclarations, + 2 * Sizes.NumVars + 1, Sizes.NumUniqueDeclarations, Sizes.NumUniqueDeclarations + Sizes.NumComponentLists, Sizes.NumComponents)); @@ -1404,6 +1407,7 @@ OMPFromClause *OMPFromClause::Create( Clause->setVarRefs(Vars); Clause->setUDMapperRefs(UDMapperRefs); Clause->setClauseInfo(Declarations, ComponentLists); + Clause->setIteratorModifier(IteratorModifier); return Clause; } @@ -1413,10 +1417,12 @@ OMPFromClause::CreateEmpty(const ASTContext &C, void *Mem = C.Allocate( totalSizeToAlloc( - 2 * Sizes.NumVars, Sizes.NumUniqueDeclarations, + 2 * Sizes.NumVars + 1, Sizes.NumUniqueDeclarations, Sizes.NumUniqueDeclarations + Sizes.NumComponentLists, Sizes.NumComponents)); - return new (Mem) OMPFromClause(Sizes); + OMPFromClause *Clause = new (Mem) OMPFromClause(Sizes); + Clause->setIteratorModifier(nullptr); + return Clause; } void 
OMPUseDevicePtrClause::setPrivateCopies(ArrayRef VL) { @@ -2694,12 +2700,16 @@ template void OMPClausePrinter::VisitOMPMotionClause(T *Node) { OS << '('; for (unsigned I = 0; I < NumberOfOMPMotionModifiers; ++I) { if (Node->getMotionModifier(I) != OMPC_MOTION_MODIFIER_unknown) { - OS << getOpenMPSimpleClauseTypeName(Node->getClauseKind(), - Node->getMotionModifier(I)); - if (Node->getMotionModifier(I) == OMPC_MOTION_MODIFIER_mapper) - PrintMapper(OS, Node, Policy); - if (I < ModifierCount - 1) - OS << ", "; + if (Node->getMotionModifier(I) == OMPC_MOTION_MODIFIER_iterator) { + PrintIterator(OS, Node, Policy); + } else { + OS << getOpenMPSimpleClauseTypeName(Node->getClauseKind(), + Node->getMotionModifier(I)); + if (Node->getMotionModifier(I) == OMPC_MOTION_MODIFIER_mapper) + PrintMapper(OS, Node, Policy); + if (I < ModifierCount - 1) + OS << ", "; + } } } OS << ':'; diff --git a/clang/lib/CodeGen/CGOpenMPRuntime.cpp b/clang/lib/CodeGen/CGOpenMPRuntime.cpp index a8255ac74cfcf..9bd6da4a38df8 100644 --- a/clang/lib/CodeGen/CGOpenMPRuntime.cpp +++ b/clang/lib/CodeGen/CGOpenMPRuntime.cpp @@ -8634,6 +8634,15 @@ class MappableExprsHandler { if (llvm::is_contained(C->getMotionModifiers(), OMPC_MOTION_MODIFIER_present)) Kind = Present; + if (llvm::is_contained(C->getMotionModifiers(), + OMPC_MOTION_MODIFIER_iterator)) { + if (auto *IteratorExpr = dyn_cast( + C->getIteratorModifier()->IgnoreParenImpCasts())) { + const auto *VD = cast(IteratorExpr->getIteratorDecl(0)); + CGF.EmitVarDecl(*VD); + } + } + const auto *EI = C->getVarRefs().begin(); for (const auto L : C->component_lists()) { InfoGen(std::get<0>(L), Kind, std::get<1>(L), OMPC_MAP_to, {}, @@ -8650,6 +8659,15 @@ class MappableExprsHandler { if (llvm::is_contained(C->getMotionModifiers(), OMPC_MOTION_MODIFIER_present)) Kind = Present; + if (llvm::is_contained(C->getMotionModifiers(), + OMPC_MOTION_MODIFIER_iterator)) { + if (auto *IteratorExpr = dyn_cast( + C->getIteratorModifier()->IgnoreParenImpCasts())) { + const 
auto *VD = cast(IteratorExpr->getIteratorDecl(0)); + CGF.EmitVarDecl(*VD); + } + } + const auto *EI = C->getVarRefs().begin(); for (const auto L : C->component_lists()) { InfoGen(std::get<0>(L), Kind, std::get<1>(L), OMPC_MAP_from, {}, diff --git a/clang/lib/Parse/ParseOpenMP.cpp b/clang/lib/Parse/ParseOpenMP.cpp index 3b69c286634bb..15c3f7594bf44 100644 --- a/clang/lib/Parse/ParseOpenMP.cpp +++ b/clang/lib/Parse/ParseOpenMP.cpp @@ -4925,19 +4925,28 @@ bool Parser::ParseOpenMPVarList(OpenMPDirectiveKind DKind, break; Data.MotionModifiers.push_back(Modifier); Data.MotionModifiersLoc.push_back(Tok.getLocation()); - ConsumeToken(); - if (Modifier == OMPC_MOTION_MODIFIER_mapper) { - IsInvalidMapperModifier = parseMapperModifier(Data); - if (IsInvalidMapperModifier) + if (PP.getSpelling(Tok) == "iterator" && getLangOpts().OpenMP >= 51) { + ExprResult Tail; + Tail = ParseOpenMPIteratorsExpr(); + Tail = Actions.ActOnFinishFullExpr(Tail.get(), T.getOpenLocation(), + /*DiscardedValue=*/false); + if (Tail.isUsable()) + Data.IteratorExpr = Tail.get(); + } else { + ConsumeToken(); + if (Modifier == OMPC_MOTION_MODIFIER_mapper) { + IsInvalidMapperModifier = parseMapperModifier(Data); + if (IsInvalidMapperModifier) + break; + } + // OpenMP < 5.1 doesn't permit a ',' or additional modifiers. + if (getLangOpts().OpenMP < 51) break; + // OpenMP 5.1 accepts an optional ',' even if the next character is ':'. + // TODO: Is that intentional? + if (Tok.is(tok::comma)) + ConsumeToken(); } - // OpenMP < 5.1 doesn't permit a ',' or additional modifiers. - if (getLangOpts().OpenMP < 51) - break; - // OpenMP 5.1 accepts an optional ',' even if the next character is ':'. - // TODO: Is that intentional? 
- if (Tok.is(tok::comma)) - ConsumeToken(); } if (!Data.MotionModifiers.empty() && Tok.isNot(tok::colon)) { if (!IsInvalidMapperModifier) { diff --git a/clang/lib/Sema/SemaOpenMP.cpp b/clang/lib/Sema/SemaOpenMP.cpp index 31c8f0cd30c56..431c545c07e47 100644 --- a/clang/lib/Sema/SemaOpenMP.cpp +++ b/clang/lib/Sema/SemaOpenMP.cpp @@ -18712,16 +18712,16 @@ OMPClause *SemaOpenMP::ActOnOpenMPVarListClause(OpenMPClauseKind Kind, ExtraModifierLoc, ColonLoc, VarList, Locs); break; case OMPC_to: - Res = - ActOnOpenMPToClause(Data.MotionModifiers, Data.MotionModifiersLoc, - Data.ReductionOrMapperIdScopeSpec, - Data.ReductionOrMapperId, ColonLoc, VarList, Locs); + Res = ActOnOpenMPToClause( + Data.MotionModifiers, Data.MotionModifiersLoc, Data.IteratorExpr, + Data.ReductionOrMapperIdScopeSpec, Data.ReductionOrMapperId, ColonLoc, + VarList, Locs); break; case OMPC_from: - Res = ActOnOpenMPFromClause(Data.MotionModifiers, Data.MotionModifiersLoc, - Data.ReductionOrMapperIdScopeSpec, - Data.ReductionOrMapperId, ColonLoc, VarList, - Locs); + Res = ActOnOpenMPFromClause( + Data.MotionModifiers, Data.MotionModifiersLoc, Data.IteratorExpr, + Data.ReductionOrMapperIdScopeSpec, Data.ReductionOrMapperId, ColonLoc, + VarList, Locs); break; case OMPC_use_device_ptr: Res = ActOnOpenMPUseDevicePtrClause(VarList, Locs); @@ -24457,11 +24457,12 @@ void SemaOpenMP::ActOnOpenMPDeclareTargetInitializer(Decl *TargetDecl) { OMPClause *SemaOpenMP::ActOnOpenMPToClause( ArrayRef MotionModifiers, - ArrayRef MotionModifiersLoc, + ArrayRef MotionModifiersLoc, Expr *IteratorExpr, CXXScopeSpec &MapperIdScopeSpec, DeclarationNameInfo &MapperId, SourceLocation ColonLoc, ArrayRef VarList, const OMPVarListLocTy &Locs, ArrayRef UnresolvedMappers) { OpenMPMotionModifierKind Modifiers[] = {OMPC_MOTION_MODIFIER_unknown, + OMPC_MOTION_MODIFIER_unknown, OMPC_MOTION_MODIFIER_unknown}; SourceLocation ModifiersLoc[NumberOfOMPMotionModifiers]; @@ -24485,20 +24486,25 @@ OMPClause *SemaOpenMP::ActOnOpenMPToClause( 
MapperIdScopeSpec, MapperId, UnresolvedMappers); if (MVLI.ProcessedVarList.empty()) return nullptr; - + if (IteratorExpr) + if (auto *DRE = dyn_cast(IteratorExpr)) + if (auto *VD = dyn_cast(DRE->getDecl())) + DSAStack->addIteratorVarDecl(VD); return OMPToClause::Create( getASTContext(), Locs, MVLI.ProcessedVarList, MVLI.VarBaseDeclarations, - MVLI.VarComponents, MVLI.UDMapperList, Modifiers, ModifiersLoc, - MapperIdScopeSpec.getWithLocInContext(getASTContext()), MapperId); + MVLI.VarComponents, MVLI.UDMapperList, IteratorExpr, Modifiers, + ModifiersLoc, MapperIdScopeSpec.getWithLocInContext(getASTContext()), + MapperId); } OMPClause *SemaOpenMP::ActOnOpenMPFromClause( ArrayRef MotionModifiers, - ArrayRef MotionModifiersLoc, + ArrayRef MotionModifiersLoc, Expr *IteratorExpr, CXXScopeSpec &MapperIdScopeSpec, DeclarationNameInfo &MapperId, SourceLocation ColonLoc, ArrayRef VarList, const OMPVarListLocTy &Locs, ArrayRef UnresolvedMappers) { OpenMPMotionModifierKind Modifiers[] = {OMPC_MOTION_MODIFIER_unknown, + OMPC_MOTION_MODIFIER_unknown, OMPC_MOTION_MODIFIER_unknown}; SourceLocation ModifiersLoc[NumberOfOMPMotionModifiers]; @@ -24522,11 +24528,15 @@ OMPClause *SemaOpenMP::ActOnOpenMPFromClause( MapperIdScopeSpec, MapperId, UnresolvedMappers); if (MVLI.ProcessedVarList.empty()) return nullptr; - + if (IteratorExpr) + if (auto *DRE = dyn_cast(IteratorExpr)) + if (auto *VD = dyn_cast(DRE->getDecl())) + DSAStack->addIteratorVarDecl(VD); return OMPFromClause::Create( getASTContext(), Locs, MVLI.ProcessedVarList, MVLI.VarBaseDeclarations, - MVLI.VarComponents, MVLI.UDMapperList, Modifiers, ModifiersLoc, - MapperIdScopeSpec.getWithLocInContext(getASTContext()), MapperId); + MVLI.VarComponents, MVLI.UDMapperList, IteratorExpr, Modifiers, + ModifiersLoc, MapperIdScopeSpec.getWithLocInContext(getASTContext()), + MapperId); } OMPClause * diff --git a/clang/lib/Sema/TreeTransform.h b/clang/lib/Sema/TreeTransform.h index 0e8b674a006d0..8e5dbeb792348 100644 --- 
a/clang/lib/Sema/TreeTransform.h +++ b/clang/lib/Sema/TreeTransform.h @@ -2221,13 +2221,14 @@ class TreeTransform { OMPClause * RebuildOMPToClause(ArrayRef MotionModifiers, ArrayRef MotionModifiersLoc, - CXXScopeSpec &MapperIdScopeSpec, + Expr *IteratorModifier, CXXScopeSpec &MapperIdScopeSpec, DeclarationNameInfo &MapperId, SourceLocation ColonLoc, ArrayRef VarList, const OMPVarListLocTy &Locs, ArrayRef UnresolvedMappers) { return getSema().OpenMP().ActOnOpenMPToClause( - MotionModifiers, MotionModifiersLoc, MapperIdScopeSpec, MapperId, - ColonLoc, VarList, Locs, UnresolvedMappers); + MotionModifiers, MotionModifiersLoc, IteratorModifier, + MapperIdScopeSpec, MapperId, ColonLoc, VarList, Locs, + UnresolvedMappers); } /// Build a new OpenMP 'from' clause. @@ -2237,13 +2238,14 @@ class TreeTransform { OMPClause * RebuildOMPFromClause(ArrayRef MotionModifiers, ArrayRef MotionModifiersLoc, - CXXScopeSpec &MapperIdScopeSpec, + Expr *IteratorModifier, CXXScopeSpec &MapperIdScopeSpec, DeclarationNameInfo &MapperId, SourceLocation ColonLoc, ArrayRef VarList, const OMPVarListLocTy &Locs, ArrayRef UnresolvedMappers) { return getSema().OpenMP().ActOnOpenMPFromClause( - MotionModifiers, MotionModifiersLoc, MapperIdScopeSpec, MapperId, - ColonLoc, VarList, Locs, UnresolvedMappers); + MotionModifiers, MotionModifiersLoc, IteratorModifier, + MapperIdScopeSpec, MapperId, ColonLoc, VarList, Locs, + UnresolvedMappers); } /// Build a new OpenMP 'use_device_ptr' clause. 
@@ -11535,6 +11537,13 @@ template OMPClause *TreeTransform::TransformOMPToClause(OMPToClause *C) { OMPVarListLocTy Locs(C->getBeginLoc(), C->getLParenLoc(), C->getEndLoc()); llvm::SmallVector Vars; + Expr *IteratorModifier = C->getIteratorModifier(); + if (IteratorModifier) { + ExprResult MapModRes = getDerived().TransformExpr(IteratorModifier); + if (MapModRes.isInvalid()) + return nullptr; + IteratorModifier = MapModRes.get(); + } CXXScopeSpec MapperIdScopeSpec; DeclarationNameInfo MapperIdInfo; llvm::SmallVector UnresolvedMappers; @@ -11542,14 +11551,22 @@ OMPClause *TreeTransform::TransformOMPToClause(OMPToClause *C) { *this, C, Vars, MapperIdScopeSpec, MapperIdInfo, UnresolvedMappers)) return nullptr; return getDerived().RebuildOMPToClause( - C->getMotionModifiers(), C->getMotionModifiersLoc(), MapperIdScopeSpec, - MapperIdInfo, C->getColonLoc(), Vars, Locs, UnresolvedMappers); + C->getMotionModifiers(), C->getMotionModifiersLoc(), IteratorModifier, + MapperIdScopeSpec, MapperIdInfo, C->getColonLoc(), Vars, Locs, + UnresolvedMappers); } template OMPClause *TreeTransform::TransformOMPFromClause(OMPFromClause *C) { OMPVarListLocTy Locs(C->getBeginLoc(), C->getLParenLoc(), C->getEndLoc()); llvm::SmallVector Vars; + Expr *IteratorModifier = C->getIteratorModifier(); + if (IteratorModifier) { + ExprResult MapModRes = getDerived().TransformExpr(IteratorModifier); + if (MapModRes.isInvalid()) + return nullptr; + IteratorModifier = MapModRes.get(); + } CXXScopeSpec MapperIdScopeSpec; DeclarationNameInfo MapperIdInfo; llvm::SmallVector UnresolvedMappers; @@ -11557,8 +11574,9 @@ OMPClause *TreeTransform::TransformOMPFromClause(OMPFromClause *C) { *this, C, Vars, MapperIdScopeSpec, MapperIdInfo, UnresolvedMappers)) return nullptr; return getDerived().RebuildOMPFromClause( - C->getMotionModifiers(), C->getMotionModifiersLoc(), MapperIdScopeSpec, - MapperIdInfo, C->getColonLoc(), Vars, Locs, UnresolvedMappers); + C->getMotionModifiers(), C->getMotionModifiersLoc(), 
IteratorModifier, + MapperIdScopeSpec, MapperIdInfo, C->getColonLoc(), Vars, Locs, + UnresolvedMappers); } template diff --git a/clang/lib/Serialization/ASTReader.cpp b/clang/lib/Serialization/ASTReader.cpp index 55c52154c4113..67ba1fd70dff7 100644 --- a/clang/lib/Serialization/ASTReader.cpp +++ b/clang/lib/Serialization/ASTReader.cpp @@ -12387,6 +12387,8 @@ void OMPClauseReader::VisitOMPToClause(OMPToClause *C) { C->setMotionModifier( I, static_cast(Record.readInt())); C->setMotionModifierLoc(I, Record.readSourceLocation()); + if (C->getMotionModifier(I) == OMPC_MOTION_MODIFIER_iterator) + C->setIteratorModifier(Record.readExpr()); } C->setMapperQualifierLoc(Record.readNestedNameSpecifierLoc()); C->setMapperIdInfo(Record.readDeclarationNameInfo()); @@ -12443,6 +12445,8 @@ void OMPClauseReader::VisitOMPFromClause(OMPFromClause *C) { C->setMotionModifier( I, static_cast(Record.readInt())); C->setMotionModifierLoc(I, Record.readSourceLocation()); + if (C->getMotionModifier(I) == OMPC_MOTION_MODIFIER_iterator) + C->setIteratorModifier(Record.readExpr()); } C->setMapperQualifierLoc(Record.readNestedNameSpecifierLoc()); C->setMapperIdInfo(Record.readDeclarationNameInfo()); diff --git a/clang/lib/Serialization/ASTWriter.cpp b/clang/lib/Serialization/ASTWriter.cpp index e8c0d3f2b4ee9..fcee93c0ebbd3 100644 --- a/clang/lib/Serialization/ASTWriter.cpp +++ b/clang/lib/Serialization/ASTWriter.cpp @@ -8417,6 +8417,8 @@ void OMPClauseWriter::VisitOMPToClause(OMPToClause *C) { for (unsigned I = 0; I < NumberOfOMPMotionModifiers; ++I) { Record.push_back(C->getMotionModifier(I)); Record.AddSourceLocation(C->getMotionModifierLoc(I)); + if (C->getMotionModifier(I) == OMPC_MOTION_MODIFIER_iterator) + Record.AddStmt(C->getIteratorModifier()); } Record.AddNestedNameSpecifierLoc(C->getMapperQualifierLoc()); Record.AddDeclarationNameInfo(C->getMapperIdInfo()); @@ -8447,6 +8449,8 @@ void OMPClauseWriter::VisitOMPFromClause(OMPFromClause *C) { for (unsigned I = 0; I < 
NumberOfOMPMotionModifiers; ++I) { Record.push_back(C->getMotionModifier(I)); Record.AddSourceLocation(C->getMotionModifierLoc(I)); + if (C->getMotionModifier(I) == OMPC_MOTION_MODIFIER_iterator) + Record.AddStmt(C->getIteratorModifier()); } Record.AddNestedNameSpecifierLoc(C->getMapperQualifierLoc()); Record.AddDeclarationNameInfo(C->getMapperIdInfo()); diff --git a/clang/test/OpenMP/target_update_codegen.cpp b/clang/test/OpenMP/target_update_codegen.cpp index c8211f475c7fc..6c754c1c953ea 100644 --- a/clang/test/OpenMP/target_update_codegen.cpp +++ b/clang/test/OpenMP/target_update_codegen.cpp @@ -1560,5 +1560,37 @@ void foo(int arg) { { ++arg; } } +#endif +// RUN: %clang_cc1 -DCK26 -verify -Wno-vla -fopenmp -fopenmp-version=51 -fopenmp-targets=powerpc64le-ibm-linux-gnu -x c++ -triple powerpc64le-unknown-unknown -emit-llvm %s -o - | FileCheck %s --check-prefix CK26 --check-prefix CK26-64 +// RUN: %clang_cc1 -DCK26 -fopenmp -fopenmp-version=51 -fopenmp-targets=powerpc64le-ibm-linux-gnu -x c++ -std=c++11 -triple powerpc64le-unknown-unknown -emit-pch -o %t %s +// RUN: %clang_cc1 -fopenmp -fopenmp-version=51 -fopenmp-targets=powerpc64le-ibm-linux-gnu -x c++ -triple powerpc64le-unknown-unknown -std=c++11 -include-pch %t -verify -Wno-vla %s -emit-llvm -o - | FileCheck %s --check-prefix CK26 --check-prefix CK26-64 +// RUN: %clang_cc1 -DCK26 -fopenmp-version=51 -verify -Wno-vla -fopenmp -fopenmp-targets=i386-pc-linux-gnu -x c++ -triple i386-unknown-unknown -emit-llvm %s -o - | FileCheck %s --check-prefix CK26 --check-prefix CK26-32 +// RUN: %clang_cc1 -DCK26 -fopenmp -fopenmp-version=51 -fopenmp-targets=i386-pc-linux-gnu -x c++ -std=c++11 -triple i386-unknown-unknown -emit-pch -o %t %s +// RUN: %clang_cc1 -fopenmp -fopenmp-version=51 -fopenmp-targets=i386-pc-linux-gnu -x c++ -triple i386-unknown-unknown -std=c++11 -include-pch %t -verify -Wno-vla %s -emit-llvm -o - | FileCheck %s --check-prefix CK26 --check-prefix CK26-32 + +// RUN: %clang_cc1 -DCK26 -verify -Wno-vla 
-fopenmp-simd -fopenmp-version=51 -fopenmp-targets=powerpc64le-ibm-linux-gnu -x c++ -triple powerpc64le-unknown-unknown -emit-llvm %s -o - | FileCheck --check-prefix SIMD-ONLY19 %s +// RUN: %clang_cc1 -DCK26 -fopenmp-simd -fopenmp-version=51 -fopenmp-targets=powerpc64le-ibm-linux-gnu -x c++ -std=c++11 -triple powerpc64le-unknown-unknown -emit-pch -o %t %s +// RUN: %clang_cc1 -fopenmp-simd -fopenmp-version=51 -fopenmp-targets=powerpc64le-ibm-linux-gnu -x c++ -triple powerpc64le-unknown-unknown -std=c++11 -include-pch %t -verify -Wno-vla %s -emit-llvm -o - | FileCheck --check-prefix SIMD-ONLY19 %s +// RUN: %clang_cc1 -DCK26 -verify -Wno-vla -fopenmp-simd -fopenmp-version=51 -fopenmp-targets=i386-pc-linux-gnu -x c++ -triple i386-unknown-unknown -emit-llvm %s -o - | FileCheck --check-prefix SIMD-ONLY19 %s +// RUN: %clang_cc1 -DCK26 -fopenmp-simd -fopenmp-version=51 -fopenmp-targets=i386-pc-linux-gnu -x c++ -std=c++11 -triple i386-unknown-unknown -emit-pch -o %t %s +// RUN: %clang_cc1 -fopenmp-simd -fopenmp-version=51 -fopenmp-targets=i386-pc-linux-gnu -x c++ -triple i386-unknown-unknown -std=c++11 -include-pch %t -verify -Wno-vla %s -emit-llvm -o - | FileCheck --check-prefix SIMD-ONLY19 %s +// SIMD-ONLY19-NOT: {{__kmpc|__tgt}} +#ifdef CK26 +void foo() { +int a[10]; +#pragma omp target update to(iterator(int it = 0:10) : a[it]) +// CK26-LABEL: define {{.+}}foo +// CK26: %[[ITER:[a-zA-Z0-9_]+]] = alloca i32, align 4 +// CK26: %[[LOAD2:.*]] = load i32, ptr %[[ITER]], align 4 +} + +void foo1() { +int a[10]; +#pragma omp target update from(iterator(int it = 0:10) : a[it]) +// CK26-LABEL: define {{.+}}foo1 +// CK26: %[[ITER:[a-zA-Z0-9_]+]] = alloca i32, align 4 +// CK26: %[[LOAD2:.*]] = load i32, ptr %[[ITER]], align 4 +} + #endif #endif diff --git a/clang/test/OpenMP/target_update_iterator_ast_print.cpp b/clang/test/OpenMP/target_update_iterator_ast_print.cpp new file mode 100644 index 0000000000000..322f565c9c732 --- /dev/null +++ 
b/clang/test/OpenMP/target_update_iterator_ast_print.cpp @@ -0,0 +1,16 @@ +// RUN: %clang_cc1 -verify -fopenmp -fopenmp-version=51 -ast-print %s | FileCheck %s +// expected-no-diagnostics + +#ifndef HEADER +#define HEADER + +void test() { + int a[10]; + #pragma omp target update to(iterator(int it = 0:10): a[it]) + // CHECK: int a[10]; + // CHECK: #pragma omp target update to(iterator(int it = 0:10): a[it]) + #pragma omp target update from(iterator(int it = 0:10): a[it]) + // CHECK: #pragma omp target update from(iterator(int it = 0:10): a[it]) +} + +#endif diff --git a/clang/test/OpenMP/target_update_iterator_serialization.cpp b/clang/test/OpenMP/target_update_iterator_serialization.cpp new file mode 100644 index 0000000000000..c1ad380f7c9a5 --- /dev/null +++ b/clang/test/OpenMP/target_update_iterator_serialization.cpp @@ -0,0 +1,35 @@ +// Test without serialization: +// RUN: %clang_cc1 -std=c++20 -fopenmp %s -ast-dump | FileCheck %s + +// Test with serialization: +// RUN: %clang_cc1 -std=c++20 -fopenmp -emit-pch -o %t %s +// RUN: %clang_cc1 -x c++ -std=c++20 -fopenmp -include-pch %t -ast-dump-all /dev/null \ +// RUN: | sed -e "s/ //" -e "s/ imported//" \ +// RUN: | FileCheck %s + +// CHECK: OMPTargetUpdateDirective +// CHECK-NEXT: OMPFromClause +// CHECK-NEXT: ArraySubscriptExpr +// CHECK: DeclRefExpr {{.*}} 'a' +// CHECK: DeclRefExpr {{.*}} 'it' + + +void foo1() { + int a[10]; + +#pragma omp target update from(iterator(int it = 0:10) : a[it]) + ; +} + +// CHECK: OMPTargetUpdateDirective +// CHECK-NEXT: OMPToClause +// CHECK-NEXT: ArraySubscriptExpr +// CHECK: DeclRefExpr {{.*}} 'a' +// CHECK: DeclRefExpr {{.*}} 'it' + +void foo2() { + int a[10]; + +#pragma omp target update to(iterator(int it = 0:10) : a[it]) + ; +} From 147c466bcd0efcd3efe7b403db441ec8d4912d6a Mon Sep 17 00:00:00 2001 From: Matthias Springer Date: Mon, 1 Dec 2025 16:50:02 +0800 Subject: [PATCH 13/39] [mlir][arith] Add support for min/max to `ArithToAPFloat` (#169760) Add support for 
`arith.minnumf`, `arith.maxnumf`, `arith.minimumf`, `arith.maximumf`. --- .../ArithToAPFloat/ArithToAPFloat.cpp | 8 ++++ mlir/lib/ExecutionEngine/APFloatWrappers.cpp | 20 ++++++++++ .../ArithToApfloat/arith-to-apfloat.mlir | 40 +++++++++++++++++++ .../Arith/CPU/test-apfloat-emulation.mlir | 4 ++ 4 files changed, 72 insertions(+) diff --git a/mlir/lib/Conversion/ArithToAPFloat/ArithToAPFloat.cpp b/mlir/lib/Conversion/ArithToAPFloat/ArithToAPFloat.cpp index 024a97b03c14e..5c68236526b7d 100644 --- a/mlir/lib/Conversion/ArithToAPFloat/ArithToAPFloat.cpp +++ b/mlir/lib/Conversion/ArithToAPFloat/ArithToAPFloat.cpp @@ -513,6 +513,14 @@ void ArithToAPFloatConversionPass::runOnOperation() { context, "divide", getOperation()); patterns.add>( context, "remainder", getOperation()); + patterns.add>( + context, "minnum", getOperation()); + patterns.add>( + context, "maxnum", getOperation()); + patterns.add>( + context, "minimum", getOperation()); + patterns.add>( + context, "maximum", getOperation()); patterns .add, FpToFpConversion, CmpFOpToAPFloatConversion, NegFOpToAPFloatConversion>( diff --git a/mlir/lib/ExecutionEngine/APFloatWrappers.cpp b/mlir/lib/ExecutionEngine/APFloatWrappers.cpp index f2d5254be6b57..f3e38eb8ffa2d 100644 --- a/mlir/lib/ExecutionEngine/APFloatWrappers.cpp +++ b/mlir/lib/ExecutionEngine/APFloatWrappers.cpp @@ -151,4 +151,24 @@ MLIR_APFLOAT_WRAPPERS_EXPORT uint64_t _mlir_apfloat_neg(int32_t semantics, uint6 x.changeSign(); return x.bitcastToAPInt().getZExtValue(); } + +/// Min/max operations. 
+#define APFLOAT_MIN_MAX_OP(OP) \ + MLIR_APFLOAT_WRAPPERS_EXPORT uint64_t _mlir_apfloat_##OP( \ + int32_t semantics, uint64_t a, uint64_t b) { \ + const llvm::fltSemantics &sem = llvm::APFloatBase::EnumToSemantics( \ + static_cast(semantics)); \ + unsigned bitWidth = llvm::APFloatBase::semanticsSizeInBits(sem); \ + llvm::APFloat lhs(sem, llvm::APInt(bitWidth, a)); \ + llvm::APFloat rhs(sem, llvm::APInt(bitWidth, b)); \ + llvm::APFloat result = llvm::OP(lhs, rhs); \ + return result.bitcastToAPInt().getZExtValue(); \ + } + +APFLOAT_MIN_MAX_OP(minimum) +APFLOAT_MIN_MAX_OP(maximum) +APFLOAT_MIN_MAX_OP(minnum) +APFLOAT_MIN_MAX_OP(maxnum) + +#undef APFLOAT_MIN_MAX_OP } diff --git a/mlir/test/Conversion/ArithToApfloat/arith-to-apfloat.mlir b/mlir/test/Conversion/ArithToApfloat/arith-to-apfloat.mlir index 775cb5ea60f22..950d2cecefa95 100644 --- a/mlir/test/Conversion/ArithToApfloat/arith-to-apfloat.mlir +++ b/mlir/test/Conversion/ArithToApfloat/arith-to-apfloat.mlir @@ -223,3 +223,43 @@ func.func @negf(%arg0: f32) { %0 = arith.negf %arg0 : f32 return } + +// ----- + +// CHECK: func.func private @_mlir_apfloat_minimum(i32, i64, i64) -> i64 +// CHECK: %[[sem:.*]] = arith.constant 2 : i32 +// CHECK: %[[res:.*]] = call @_mlir_apfloat_minimum(%[[sem]], %{{.*}}, %{{.*}}) : (i32, i64, i64) -> i64 +func.func @minimumf(%arg0: f32, %arg1: f32) { + %0 = arith.minimumf %arg0, %arg1 : f32 + return +} + +// ----- + +// CHECK: func.func private @_mlir_apfloat_maximum(i32, i64, i64) -> i64 +// CHECK: %[[sem:.*]] = arith.constant 2 : i32 +// CHECK: %[[res:.*]] = call @_mlir_apfloat_maximum(%[[sem]], %{{.*}}, %{{.*}}) : (i32, i64, i64) -> i64 +func.func @maximumf(%arg0: f32, %arg1: f32) { + %0 = arith.maximumf %arg0, %arg1 : f32 + return +} + +// ----- + +// CHECK: func.func private @_mlir_apfloat_minnum(i32, i64, i64) -> i64 +// CHECK: %[[sem:.*]] = arith.constant 2 : i32 +// CHECK: %[[res:.*]] = call @_mlir_apfloat_minnum(%[[sem]], %{{.*}}, %{{.*}}) : (i32, i64, i64) -> i64 +func.func 
@minnumf(%arg0: f32, %arg1: f32) { + %0 = arith.minnumf %arg0, %arg1 : f32 + return +} + +// ----- + +// CHECK: func.func private @_mlir_apfloat_maxnum(i32, i64, i64) -> i64 +// CHECK: %[[sem:.*]] = arith.constant 2 : i32 +// CHECK: %[[res:.*]] = call @_mlir_apfloat_maxnum(%[[sem]], %{{.*}}, %{{.*}}) : (i32, i64, i64) -> i64 +func.func @maxnumf(%arg0: f32, %arg1: f32) { + %0 = arith.maxnumf %arg0, %arg1 : f32 + return +} diff --git a/mlir/test/Integration/Dialect/Arith/CPU/test-apfloat-emulation.mlir b/mlir/test/Integration/Dialect/Arith/CPU/test-apfloat-emulation.mlir index 555cc9a531966..7f72dd5931488 100644 --- a/mlir/test/Integration/Dialect/Arith/CPU/test-apfloat-emulation.mlir +++ b/mlir/test/Integration/Dialect/Arith/CPU/test-apfloat-emulation.mlir @@ -47,6 +47,10 @@ func.func @entry() { %negated = arith.negf %cvt : f8E4M3FN vector.print %negated : f8E4M3FN + // CHECK-NEXT: -2.25 + %min = arith.minimumf %cvt, %negated : f8E4M3FN + vector.print %min : f8E4M3FN + // CHECK-NEXT: 1 %cmp1 = arith.cmpf "olt", %cvt, %c1 : f8E4M3FN vector.print %cmp1 : i1 From eb711d8e142683e06ae14b652218b881896f5046 Mon Sep 17 00:00:00 2001 From: Carlos Galvez Date: Mon, 1 Dec 2025 09:50:19 +0100 Subject: [PATCH 14/39] =?UTF-8?q?[clang-tidy][doc]=20Fix=20incorrect=20lin?= =?UTF-8?q?k=20syntax=20in=20cppcoreguidelines-pro-=E2=80=A6=20(#170088)?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit …bounds-avoid-unchecked-container-access Missing a trailing underscore to render it as a link. 
Co-authored-by: Carlos Gálvez --- .../pro-bounds-avoid-unchecked-container-access.rst | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/clang-tools-extra/docs/clang-tidy/checks/cppcoreguidelines/pro-bounds-avoid-unchecked-container-access.rst b/clang-tools-extra/docs/clang-tidy/checks/cppcoreguidelines/pro-bounds-avoid-unchecked-container-access.rst index fe78ad8056443..38143c94cd3ae 100644 --- a/clang-tools-extra/docs/clang-tidy/checks/cppcoreguidelines/pro-bounds-avoid-unchecked-container-access.rst +++ b/clang-tools-extra/docs/clang-tidy/checks/cppcoreguidelines/pro-bounds-avoid-unchecked-container-access.rst @@ -29,9 +29,9 @@ STL containers for which ``operator[]`` is well-defined for all inputs are excluded from this check (e.g.: ``std::map::operator[]``). This check enforces part of the `SL.con.3 -` +`_ guideline and is part of the `Bounds Safety (Bounds 4) -` +`_ profile from the C++ Core Guidelines. Options From 8079d033c97f3ad8d289fa014b0f1c85cf3bbbad Mon Sep 17 00:00:00 2001 From: Steven Wu Date: Mon, 1 Dec 2025 17:10:39 +0800 Subject: [PATCH 15/39] [CAS] Temporarily skip tests on old windows version (#170063) --- llvm/unittests/CAS/CASTestConfig.h | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/llvm/unittests/CAS/CASTestConfig.h b/llvm/unittests/CAS/CASTestConfig.h index b1c0e59ff2b92..20a95dd2f6aa6 100644 --- a/llvm/unittests/CAS/CASTestConfig.h +++ b/llvm/unittests/CAS/CASTestConfig.h @@ -15,6 +15,11 @@ #include "gtest/gtest.h" #include +#ifdef _WIN32 +#include "llvm/Support/VersionTuple.h" +#include "llvm/Support/Windows/WindowsSupport.h" +#endif + namespace llvm::unittest::cas { class MockEnv { void anchor(); @@ -68,6 +73,10 @@ class CASTest } void SetUp() override { +#ifdef _WIN32 + if (llvm::GetWindowsOSVersion() < llvm::VersionTuple(10, 0, 0, 17763)) + GTEST_SKIP() << "CAS tests skipped on older windows version"; +#endif NextCASIndex = 0; setMaxOnDiskCASMappingSize(); } From 8e6fb0ee84dcfba7e712f3ee4cc9d9819bc2a757 
Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Gergely=20B=C3=A1lint?= Date: Mon, 1 Dec 2025 10:20:23 +0100 Subject: [PATCH 16/39] Reapply "[BOLT][BTI] Skip inlining BasicBlocks containing indirect tailcalls" (#169881) (#169929) This reapplies commit 5d6d74359d69d3aada6a46c7cf51d84eb0848b70. Fix: added assertions to the requirements of the test -------- Original commit message: In the Inliner pass, tailcalls are converted to calls in the inlined BasicBlock. If the tailcall is indirect, the `BR` is converted to `BLR`. These instructions require different BTI landing pads at their targets. As the targets of indirect tailcalls are unknown, inlining such blocks is unsound for BTI: they should be skipped instead. --- bolt/lib/Passes/Inliner.cpp | 26 +++++++++++++++++++ bolt/test/AArch64/inline-bti-dbg.s | 40 ++++++++++++++++++++++++++++++ bolt/test/AArch64/inline-bti.s | 38 ++++++++++++++++++++++++++++ 3 files changed, 104 insertions(+) create mode 100644 bolt/test/AArch64/inline-bti-dbg.s create mode 100644 bolt/test/AArch64/inline-bti.s diff --git a/bolt/lib/Passes/Inliner.cpp b/bolt/lib/Passes/Inliner.cpp index 5a7d02a34b4d8..0740fcef9102b 100644 --- a/bolt/lib/Passes/Inliner.cpp +++ b/bolt/lib/Passes/Inliner.cpp @@ -491,6 +491,32 @@ bool Inliner::inlineCallsInFunction(BinaryFunction &Function) { } } + // AArch64 BTI: + // If the callee has an indirect tailcall (BR), we would transform it to + // an indirect call (BLR) in InlineCall. Because of this, we would have to + // update the BTI at the target of the tailcall. However, these targets + // are not known. Instead, we skip inlining blocks with indirect + // tailcalls. 
+ auto HasIndirectTailCall = [&](const BinaryFunction &BF) -> bool { + for (const auto &BB : BF) { + for (const auto &II : BB) { + if (BC.MIB->isIndirectBranch(II) && BC.MIB->isTailCall(II)) { + return true; + } + } + } + return false; + }; + + if (BC.isAArch64() && BC.usesBTI() && + HasIndirectTailCall(*TargetFunction)) { + ++InstIt; + LLVM_DEBUG(dbgs() << "BOLT-DEBUG: Skipping inlining block with tailcall" + << " in " << Function << " : " << BB->getName() + << " to keep BTIs consistent.\n"); + continue; + } + LLVM_DEBUG(dbgs() << "BOLT-DEBUG: inlining call to " << *TargetFunction << " in " << Function << " : " << BB->getName() << ". Count: " << BB->getKnownExecutionCount() diff --git a/bolt/test/AArch64/inline-bti-dbg.s b/bolt/test/AArch64/inline-bti-dbg.s new file mode 100644 index 0000000000000..a0db4589d39ac --- /dev/null +++ b/bolt/test/AArch64/inline-bti-dbg.s @@ -0,0 +1,40 @@ +# This test checks that for AArch64 binaries with BTI, we do not inline blocks with indirect tailcalls. +# Same as inline-bti.s, but checks the debug output, and therefore requires assertions. + +# REQUIRES: system-linux, assertions + +# RUN: llvm-mc -filetype=obj -triple aarch64-unknown-unknown %s -o %t.o +# RUN: %clang %cflags -O0 %t.o -o %t.exe -Wl,-q -Wl,-z,force-bti +# RUN: llvm-bolt --inline-all %t.exe -o %t.bolt --debug 2>&1 | FileCheck %s + +# For BTI, we should not inline foo. +# CHECK: BOLT-DEBUG: Skipping inlining block with tailcall in _Z3barP1A : .LBB01 to keep BTIs consistent. +# CHECK-NOT: BOLT-INFO: inlined {{[0-9]+}} calls at {{[0-9]+}} call sites in {{[0-9]+}} iteration(s). Change in binary size: {{[0-9]+}} bytes. + + .text + .globl _Z3fooP1A + .type _Z3fooP1A,@function +_Z3fooP1A: + ldr x8, [x0] + ldr w0, [x8] + br x30 + .size _Z3fooP1A, .-_Z3fooP1A + + .globl _Z3barP1A + .type _Z3barP1A,@function +_Z3barP1A: + stp x29, x30, [sp, #-16]! 
+ mov x29, sp + bl _Z3fooP1A + mul w0, w0, w0 + ldp x29, x30, [sp], #16 + ret + .size _Z3barP1A, .-_Z3barP1A + + .globl main + .p2align 2 + .type main,@function +main: + mov w0, wzr + ret + .size main, .-main diff --git a/bolt/test/AArch64/inline-bti.s b/bolt/test/AArch64/inline-bti.s new file mode 100644 index 0000000000000..62f6ea6f4b63a --- /dev/null +++ b/bolt/test/AArch64/inline-bti.s @@ -0,0 +1,38 @@ +## This test checks that for AArch64 binaries with BTI, we do not inline blocks with indirect tailcalls. + +# REQUIRES: system-linux + +# RUN: llvm-mc -filetype=obj -triple aarch64-unknown-unknown %s -o %t.o +# RUN: %clang %cflags -O0 %t.o -o %t.exe -Wl,-q -Wl,-z,force-bti +# RUN: llvm-bolt --inline-all %t.exe -o %t.bolt | FileCheck %s + +# For BTI, we should not inline foo. +# CHECK-NOT: BOLT-INFO: inlined {{[0-9]+}} calls at {{[0-9]+}} call sites in {{[0-9]+}} iteration(s). Change in binary size: {{[0-9]+}} bytes. + + .text + .globl _Z3fooP1A + .type _Z3fooP1A,@function +_Z3fooP1A: + ldr x8, [x0] + ldr w0, [x8] + br x30 + .size _Z3fooP1A, .-_Z3fooP1A + + .globl _Z3barP1A + .type _Z3barP1A,@function +_Z3barP1A: + stp x29, x30, [sp, #-16]! + mov x29, sp + bl _Z3fooP1A + mul w0, w0, w0 + ldp x29, x30, [sp], #16 + ret + .size _Z3barP1A, .-_Z3barP1A + + .globl main + .p2align 2 + .type main,@function +main: + mov w0, wzr + ret + .size main, .-main From dda15ad0aadf0bf485498e3d5f22e5caf94925e5 Mon Sep 17 00:00:00 2001 From: Igor Wodiany Date: Mon, 1 Dec 2025 09:43:25 +0000 Subject: [PATCH 17/39] [mlir][spirv] Use MapVector for BlockMergeInfoMap (#169636) This should ensure that the structurizer while loop is deterministic across runs. Use of `MapVector` addresses the source of the nondeterminism which is use of a `Block*` as a map key. 
fixes #128547 --- mlir/lib/Target/SPIRV/Deserialization/Deserializer.cpp | 3 --- mlir/lib/Target/SPIRV/Deserialization/Deserializer.h | 4 +++- 2 files changed, 3 insertions(+), 4 deletions(-) diff --git a/mlir/lib/Target/SPIRV/Deserialization/Deserializer.cpp b/mlir/lib/Target/SPIRV/Deserialization/Deserializer.cpp index 252be796488c5..d08e7ecf326ca 100644 --- a/mlir/lib/Target/SPIRV/Deserialization/Deserializer.cpp +++ b/mlir/lib/Target/SPIRV/Deserialization/Deserializer.cpp @@ -2923,9 +2923,6 @@ LogicalResult spirv::Deserializer::structurizeControlFlow() { return failure(); } - // TODO: This loop is non-deterministic. Iteration order may vary between runs - // for the same shader as the key to the map is a pointer. See: - // https://github.com/llvm/llvm-project/issues/128547 while (!blockMergeInfo.empty()) { Block *headerBlock = blockMergeInfo.begin()->first; BlockMergeInfo mergeInfo = blockMergeInfo.begin()->second; diff --git a/mlir/lib/Target/SPIRV/Deserialization/Deserializer.h b/mlir/lib/Target/SPIRV/Deserialization/Deserializer.h index 243e6fd70ae43..6d09d556c4d02 100644 --- a/mlir/lib/Target/SPIRV/Deserialization/Deserializer.h +++ b/mlir/lib/Target/SPIRV/Deserialization/Deserializer.h @@ -58,7 +58,9 @@ struct DebugLine { }; /// Map from a selection/loop's header block to its merge (and continue) target. -using BlockMergeInfoMap = DenseMap; +/// Use `MapVector<>` to ensure a deterministic iteration order with a pointer +/// key. +using BlockMergeInfoMap = llvm::MapVector; /// A "deferred struct type" is a struct type with one or more member types not /// known when the Deserializer first encounters the struct. 
This happens, for From 1317083530b95fcf052f3017394a7719a67546fa Mon Sep 17 00:00:00 2001 From: Benjamin Maxwell Date: Mon, 1 Dec 2025 09:55:49 +0000 Subject: [PATCH 18/39] [AArch64][SME] Support saving/restoring ZT0 in the MachineSMEABIPass (#166362) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit This patch extends the MachineSMEABIPass to support ZT0. This is done with the addition of two new states: - `ACTIVE_ZT0_SAVED` * This is used when calling a function that shares ZA, but does not share ZT0 (i.e., no ZT0 attributes) * This state indicates ZT0 must be saved to the save slot, but ZA must remain on, with no lazy save setup - `LOCAL_COMMITTED` * This is used for saving ZT0 in functions without ZA state * This state indicates ZA is off and ZT0 has been saved * This state is general enough to support ZA, but the required transitions have not been implemented† To aid with readability, the state transitions have been reworked to a switch of `transitionFrom().to()`, rather than nested ifs, which helps manage more transitions. † This could be implemented to handle some cases of undefined behavior better. 
--- .../AArch64/AArch64ExpandPseudoInsts.cpp | 1 + .../Target/AArch64/AArch64ISelLowering.cpp | 11 +- .../lib/Target/AArch64/AArch64SMEInstrInfo.td | 6 + llvm/lib/Target/AArch64/MachineSMEABIPass.cpp | 189 ++++++++++-- .../test/CodeGen/AArch64/sme-peephole-opts.ll | 4 - .../test/CodeGen/AArch64/sme-za-exceptions.ll | 273 ++++++++++++++++-- llvm/test/CodeGen/AArch64/sme-zt0-state.ll | 104 ++++--- 7 files changed, 480 insertions(+), 108 deletions(-) diff --git a/llvm/lib/Target/AArch64/AArch64ExpandPseudoInsts.cpp b/llvm/lib/Target/AArch64/AArch64ExpandPseudoInsts.cpp index 34d74d04c4419..60e6a82d41cc8 100644 --- a/llvm/lib/Target/AArch64/AArch64ExpandPseudoInsts.cpp +++ b/llvm/lib/Target/AArch64/AArch64ExpandPseudoInsts.cpp @@ -1717,6 +1717,7 @@ bool AArch64ExpandPseudo::expandMI(MachineBasicBlock &MBB, } case AArch64::InOutZAUsePseudo: case AArch64::RequiresZASavePseudo: + case AArch64::RequiresZT0SavePseudo: case AArch64::SMEStateAllocPseudo: case AArch64::COALESCER_BARRIER_FPR16: case AArch64::COALESCER_BARRIER_FPR32: diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp index 6072fd9d8f242..5ba8f05b09012 100644 --- a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp +++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp @@ -9642,6 +9642,8 @@ AArch64TargetLowering::LowerCall(CallLoweringInfo &CLI, if (CallAttrs.requiresLazySave() || CallAttrs.requiresPreservingAllZAState()) ZAMarkerNode = AArch64ISD::REQUIRES_ZA_SAVE; + else if (CallAttrs.requiresPreservingZT0()) + ZAMarkerNode = AArch64ISD::REQUIRES_ZT0_SAVE; else if (CallAttrs.caller().hasZAState() || CallAttrs.caller().hasZT0State()) ZAMarkerNode = AArch64ISD::INOUT_ZA_USE; @@ -9761,7 +9763,8 @@ AArch64TargetLowering::LowerCall(CallLoweringInfo &CLI, SDValue ZTFrameIdx; MachineFrameInfo &MFI = MF.getFrameInfo(); - bool ShouldPreserveZT0 = CallAttrs.requiresPreservingZT0(); + bool ShouldPreserveZT0 = + !UseNewSMEABILowering && 
CallAttrs.requiresPreservingZT0(); // If the caller has ZT0 state which will not be preserved by the callee, // spill ZT0 before the call. @@ -9774,7 +9777,8 @@ AArch64TargetLowering::LowerCall(CallLoweringInfo &CLI, // If caller shares ZT0 but the callee is not shared ZA, we need to stop // PSTATE.ZA before the call if there is no lazy-save active. - bool DisableZA = CallAttrs.requiresDisablingZABeforeCall(); + bool DisableZA = + !UseNewSMEABILowering && CallAttrs.requiresDisablingZABeforeCall(); assert((!DisableZA || !RequiresLazySave) && "Lazy-save should have PSTATE.SM=1 on entry to the function"); @@ -10263,7 +10267,8 @@ AArch64TargetLowering::LowerCall(CallLoweringInfo &CLI, getSMToggleCondition(CallAttrs)); } - if (RequiresLazySave || CallAttrs.requiresEnablingZAAfterCall()) + if (!UseNewSMEABILowering && + (RequiresLazySave || CallAttrs.requiresEnablingZAAfterCall())) // Unconditionally resume ZA. Result = DAG.getNode( AArch64ISD::SMSTART, DL, DAG.getVTList(MVT::Other, MVT::Glue), Result, diff --git a/llvm/lib/Target/AArch64/AArch64SMEInstrInfo.td b/llvm/lib/Target/AArch64/AArch64SMEInstrInfo.td index 737169253ddb3..b099f15ecf7e3 100644 --- a/llvm/lib/Target/AArch64/AArch64SMEInstrInfo.td +++ b/llvm/lib/Target/AArch64/AArch64SMEInstrInfo.td @@ -102,6 +102,7 @@ def : Pat<(i64 (AArch64AllocateSMESaveBuffer GPR64:$size)), let hasSideEffects = 1, isMeta = 1 in { def InOutZAUsePseudo : Pseudo<(outs), (ins), []>, Sched<[]>; def RequiresZASavePseudo : Pseudo<(outs), (ins), []>, Sched<[]>; + def RequiresZT0SavePseudo : Pseudo<(outs), (ins), []>, Sched<[]>; } def SMEStateAllocPseudo : Pseudo<(outs), (ins), []>, Sched<[]>; @@ -122,6 +123,11 @@ def AArch64_requires_za_save [SDNPHasChain, SDNPInGlue, SDNPOutGlue]>; def : Pat<(AArch64_requires_za_save), (RequiresZASavePseudo)>; +def AArch64_requires_zt0_save + : SDNode<"AArch64ISD::REQUIRES_ZT0_SAVE", SDTypeProfile<0, 0, []>, + [SDNPHasChain, SDNPInGlue, SDNPOutGlue]>; +def : Pat<(AArch64_requires_zt0_save), 
(RequiresZT0SavePseudo)>; + def AArch64_sme_state_alloc : SDNode<"AArch64ISD::SME_STATE_ALLOC", SDTypeProfile<0, 0,[]>, [SDNPHasChain]>; diff --git a/llvm/lib/Target/AArch64/MachineSMEABIPass.cpp b/llvm/lib/Target/AArch64/MachineSMEABIPass.cpp index ead1dfceb96a0..b96f6f12a58d6 100644 --- a/llvm/lib/Target/AArch64/MachineSMEABIPass.cpp +++ b/llvm/lib/Target/AArch64/MachineSMEABIPass.cpp @@ -72,20 +72,34 @@ using namespace llvm; namespace { -enum ZAState { +// Note: For agnostic ZA, we assume the function is always entered/exited in the +// "ACTIVE" state -- this _may_ not be the case (since OFF is also a +// possibility, but for the purpose of placing ZA saves/restores, that does not +// matter). +enum ZAState : uint8_t { // Any/unknown state (not valid) ANY = 0, // ZA is in use and active (i.e. within the accumulator) ACTIVE, + // ZA is active, but ZT0 has been saved. + // This handles the edge case of sharedZA && !sharesZT0. + ACTIVE_ZT0_SAVED, + // A ZA save has been set up or committed (i.e. ZA is dormant or off) + // If the function uses ZT0 it must also be saved. LOCAL_SAVED, + // ZA has been committed to the lazy save buffer of the current function. + // If the function uses ZT0 it must also be saved. + // ZA is off. + LOCAL_COMMITTED, + // The ZA/ZT0 state on entry to the function. ENTRY, - // ZA is off + // ZA is off. OFF, // The number of ZA states (not a valid state) @@ -164,6 +178,14 @@ class EmitContext { return AgnosticZABufferPtr; } + int getZT0SaveSlot(MachineFunction &MF) { + if (ZT0SaveFI) + return *ZT0SaveFI; + MachineFrameInfo &MFI = MF.getFrameInfo(); + ZT0SaveFI = MFI.CreateSpillStackObject(64, Align(16)); + return *ZT0SaveFI; + } + /// Returns true if the function must allocate a ZA save buffer on entry. This /// will be the case if, at any point in the function, a ZA save was emitted. 
bool needsSaveBuffer() const { @@ -173,6 +195,7 @@ class EmitContext { } private: + std::optional ZT0SaveFI; std::optional TPIDR2BlockFI; Register AgnosticZABufferPtr = AArch64::NoRegister; }; @@ -184,8 +207,10 @@ class EmitContext { /// state would not be legal, as transitioning to it drops the content of ZA. static bool isLegalEdgeBundleZAState(ZAState State) { switch (State) { - case ZAState::ACTIVE: // ZA state within the accumulator/ZT0. - case ZAState::LOCAL_SAVED: // ZA state is saved on the stack. + case ZAState::ACTIVE: // ZA state within the accumulator/ZT0. + case ZAState::ACTIVE_ZT0_SAVED: // ZT0 is saved (ZA is active). + case ZAState::LOCAL_SAVED: // ZA state may be saved on the stack. + case ZAState::LOCAL_COMMITTED: // ZA state is saved on the stack. return true; default: return false; @@ -199,7 +224,9 @@ StringRef getZAStateString(ZAState State) { switch (State) { MAKE_CASE(ZAState::ANY) MAKE_CASE(ZAState::ACTIVE) + MAKE_CASE(ZAState::ACTIVE_ZT0_SAVED) MAKE_CASE(ZAState::LOCAL_SAVED) + MAKE_CASE(ZAState::LOCAL_COMMITTED) MAKE_CASE(ZAState::ENTRY) MAKE_CASE(ZAState::OFF) default: @@ -221,18 +248,39 @@ static bool isZAorZTRegOp(const TargetRegisterInfo &TRI, /// Returns the required ZA state needed before \p MI and an iterator pointing /// to where any code required to change the ZA state should be inserted. static std::pair -getZAStateBeforeInst(const TargetRegisterInfo &TRI, MachineInstr &MI, - bool ZAOffAtReturn) { +getInstNeededZAState(const TargetRegisterInfo &TRI, MachineInstr &MI, + SMEAttrs SMEFnAttrs) { MachineBasicBlock::iterator InsertPt(MI); + // Note: InOutZAUsePseudo, RequiresZASavePseudo, and RequiresZT0SavePseudo are + // intended to mark the position immediately before a call. Due to + // SelectionDAG constraints, these markers occur after the ADJCALLSTACKDOWN, + // so we use std::prev(InsertPt) to get the position before the call. 
+ if (MI.getOpcode() == AArch64::InOutZAUsePseudo) return {ZAState::ACTIVE, std::prev(InsertPt)}; + // Note: If we need to save both ZA and ZT0 we use RequiresZASavePseudo. if (MI.getOpcode() == AArch64::RequiresZASavePseudo) return {ZAState::LOCAL_SAVED, std::prev(InsertPt)}; - if (MI.isReturn()) + // If we only need to save ZT0 there's two cases to consider: + // 1. The function has ZA state (that we don't need to save). + // - In this case we switch to the "ACTIVE_ZT0_SAVED" state. + // This only saves ZT0. + // 2. The function does not have ZA state + // - In this case we switch to "LOCAL_COMMITTED" state. + // This saves ZT0 and turns ZA off. + if (MI.getOpcode() == AArch64::RequiresZT0SavePseudo) { + return {SMEFnAttrs.hasZAState() ? ZAState::ACTIVE_ZT0_SAVED + : ZAState::LOCAL_COMMITTED, + std::prev(InsertPt)}; + } + + if (MI.isReturn()) { + bool ZAOffAtReturn = SMEFnAttrs.hasPrivateZAInterface(); return {ZAOffAtReturn ? ZAState::OFF : ZAState::ACTIVE, InsertPt}; + } for (auto &MO : MI.operands()) { if (isZAorZTRegOp(TRI, MO)) @@ -280,6 +328,9 @@ struct MachineSMEABI : public MachineFunctionPass { /// predecessors). void propagateDesiredStates(FunctionInfo &FnInfo, bool Forwards = true); + void emitZT0SaveRestore(EmitContext &, MachineBasicBlock &MBB, + MachineBasicBlock::iterator MBBI, bool IsSave); + // Emission routines for private and shared ZA functions (using lazy saves). void emitSMEPrologue(MachineBasicBlock &MBB, MachineBasicBlock::iterator MBBI); @@ -290,8 +341,8 @@ struct MachineSMEABI : public MachineFunctionPass { MachineBasicBlock::iterator MBBI); void emitAllocateLazySaveBuffer(EmitContext &, MachineBasicBlock &MBB, MachineBasicBlock::iterator MBBI); - void emitZAOff(MachineBasicBlock &MBB, MachineBasicBlock::iterator MBBI, - bool ClearTPIDR2); + void emitZAMode(MachineBasicBlock &MBB, MachineBasicBlock::iterator MBBI, + bool ClearTPIDR2, bool On); // Emission routines for agnostic ZA functions. 
void emitSetupFullZASave(MachineBasicBlock &MBB, @@ -409,7 +460,7 @@ FunctionInfo MachineSMEABI::collectNeededZAStates(SMEAttrs SMEFnAttrs) { Block.FixedEntryState = ZAState::ENTRY; } else if (MBB.isEHPad()) { // EH entry block: - Block.FixedEntryState = ZAState::LOCAL_SAVED; + Block.FixedEntryState = ZAState::LOCAL_COMMITTED; } LiveRegUnits LiveUnits(*TRI); @@ -431,8 +482,7 @@ FunctionInfo MachineSMEABI::collectNeededZAStates(SMEAttrs SMEFnAttrs) { PhysLiveRegsAfterSMEPrologue = PhysLiveRegs; } // Note: We treat Agnostic ZA as inout_za with an alternate save/restore. - auto [NeededState, InsertPt] = getZAStateBeforeInst( - *TRI, MI, /*ZAOffAtReturn=*/SMEFnAttrs.hasPrivateZAInterface()); + auto [NeededState, InsertPt] = getInstNeededZAState(*TRI, MI, SMEFnAttrs); assert((InsertPt == MBBI || isCallStartOpcode(InsertPt->getOpcode())) && "Unexpected state change insertion point!"); // TODO: Do something to avoid state changes where NZCV is live. @@ -752,9 +802,9 @@ void MachineSMEABI::emitRestoreLazySave(EmitContext &Context, restorePhyRegSave(RegSave, MBB, MBBI, DL); } -void MachineSMEABI::emitZAOff(MachineBasicBlock &MBB, - MachineBasicBlock::iterator MBBI, - bool ClearTPIDR2) { +void MachineSMEABI::emitZAMode(MachineBasicBlock &MBB, + MachineBasicBlock::iterator MBBI, + bool ClearTPIDR2, bool On) { DebugLoc DL = getDebugLoc(MBB, MBBI); if (ClearTPIDR2) @@ -765,7 +815,7 @@ void MachineSMEABI::emitZAOff(MachineBasicBlock &MBB, // Disable ZA. BuildMI(MBB, MBBI, DL, TII->get(AArch64::MSRpstatesvcrImm1)) .addImm(AArch64SVCR::SVCRZA) - .addImm(0); + .addImm(On ? 
1 : 0); } void MachineSMEABI::emitAllocateLazySaveBuffer( @@ -891,6 +941,28 @@ void MachineSMEABI::emitFullZASaveRestore(EmitContext &Context, restorePhyRegSave(RegSave, MBB, MBBI, DL); } +void MachineSMEABI::emitZT0SaveRestore(EmitContext &Context, + MachineBasicBlock &MBB, + MachineBasicBlock::iterator MBBI, + bool IsSave) { + DebugLoc DL = getDebugLoc(MBB, MBBI); + Register ZT0Save = MRI->createVirtualRegister(&AArch64::GPR64spRegClass); + + BuildMI(MBB, MBBI, DL, TII->get(AArch64::ADDXri), ZT0Save) + .addFrameIndex(Context.getZT0SaveSlot(*MF)) + .addImm(0) + .addImm(0); + + if (IsSave) { + BuildMI(MBB, MBBI, DL, TII->get(AArch64::STR_TX)) + .addReg(AArch64::ZT0) + .addReg(ZT0Save); + } else { + BuildMI(MBB, MBBI, DL, TII->get(AArch64::LDR_TX), AArch64::ZT0) + .addReg(ZT0Save); + } +} + void MachineSMEABI::emitAllocateFullZASaveBuffer( EmitContext &Context, MachineBasicBlock &MBB, MachineBasicBlock::iterator MBBI, LiveRegs PhysLiveRegs) { @@ -935,6 +1007,17 @@ void MachineSMEABI::emitAllocateFullZASaveBuffer( restorePhyRegSave(RegSave, MBB, MBBI, DL); } +struct FromState { + ZAState From; + + constexpr uint8_t to(ZAState To) const { + static_assert(NUM_ZA_STATE < 16, "expected ZAState to fit in 4-bits"); + return uint8_t(From) << 4 | uint8_t(To); + } +}; + +constexpr FromState transitionFrom(ZAState From) { return FromState{From}; } + void MachineSMEABI::emitStateChange(EmitContext &Context, MachineBasicBlock &MBB, MachineBasicBlock::iterator InsertPt, @@ -949,8 +1032,6 @@ void MachineSMEABI::emitStateChange(EmitContext &Context, if (From == ZAState::ENTRY && To == ZAState::OFF) return; - [[maybe_unused]] SMEAttrs SMEFnAttrs = AFI->getSMEFnAttrs(); - // TODO: Avoid setting up the save buffer if there's no transition to // LOCAL_SAVED. 
if (From == ZAState::ENTRY) { @@ -966,17 +1047,67 @@ void MachineSMEABI::emitStateChange(EmitContext &Context, From = ZAState::ACTIVE; } - if (From == ZAState::ACTIVE && To == ZAState::LOCAL_SAVED) - emitZASave(Context, MBB, InsertPt, PhysLiveRegs); - else if (From == ZAState::LOCAL_SAVED && To == ZAState::ACTIVE) - emitZARestore(Context, MBB, InsertPt, PhysLiveRegs); - else if (To == ZAState::OFF) { - assert(From != ZAState::ENTRY && - "ENTRY to OFF should have already been handled"); - assert(!SMEFnAttrs.hasAgnosticZAInterface() && - "Should not turn ZA off in agnostic ZA function"); - emitZAOff(MBB, InsertPt, /*ClearTPIDR2=*/From == ZAState::LOCAL_SAVED); - } else { + SMEAttrs SMEFnAttrs = AFI->getSMEFnAttrs(); + bool IsAgnosticZA = SMEFnAttrs.hasAgnosticZAInterface(); + bool HasZT0State = SMEFnAttrs.hasZT0State(); + bool HasZAState = IsAgnosticZA || SMEFnAttrs.hasZAState(); + + switch (transitionFrom(From).to(To)) { + // This section handles: ACTIVE <-> ACTIVE_ZT0_SAVED + case transitionFrom(ZAState::ACTIVE).to(ZAState::ACTIVE_ZT0_SAVED): + emitZT0SaveRestore(Context, MBB, InsertPt, /*IsSave=*/true); + break; + case transitionFrom(ZAState::ACTIVE_ZT0_SAVED).to(ZAState::ACTIVE): + emitZT0SaveRestore(Context, MBB, InsertPt, /*IsSave=*/false); + break; + + // This section handles: ACTIVE[_ZT0_SAVED] -> LOCAL_SAVED + case transitionFrom(ZAState::ACTIVE).to(ZAState::LOCAL_SAVED): + case transitionFrom(ZAState::ACTIVE_ZT0_SAVED).to(ZAState::LOCAL_SAVED): + if (HasZT0State && From == ZAState::ACTIVE) + emitZT0SaveRestore(Context, MBB, InsertPt, /*IsSave=*/true); + if (HasZAState) + emitZASave(Context, MBB, InsertPt, PhysLiveRegs); + break; + + // This section handles: ACTIVE -> LOCAL_COMMITTED + case transitionFrom(ZAState::ACTIVE).to(ZAState::LOCAL_COMMITTED): + // TODO: We could support ZA state here, but this transition is currently + // only possible when we _don't_ have ZA state. 
+ assert(HasZT0State && !HasZAState && "Expect to only have ZT0 state."); + emitZT0SaveRestore(Context, MBB, InsertPt, /*IsSave=*/true); + emitZAMode(MBB, InsertPt, /*ClearTPIDR2=*/false, /*On=*/false); + break; + + // This section handles: LOCAL_COMMITTED -> (OFF|LOCAL_SAVED) + case transitionFrom(ZAState::LOCAL_COMMITTED).to(ZAState::OFF): + case transitionFrom(ZAState::LOCAL_COMMITTED).to(ZAState::LOCAL_SAVED): + // These transitions are a no-op. + break; + + // This section handles: LOCAL_(SAVED|COMMITTED) -> ACTIVE[_ZT0_SAVED] + case transitionFrom(ZAState::LOCAL_COMMITTED).to(ZAState::ACTIVE): + case transitionFrom(ZAState::LOCAL_COMMITTED).to(ZAState::ACTIVE_ZT0_SAVED): + case transitionFrom(ZAState::LOCAL_SAVED).to(ZAState::ACTIVE): + if (HasZAState) + emitZARestore(Context, MBB, InsertPt, PhysLiveRegs); + else + emitZAMode(MBB, InsertPt, /*ClearTPIDR2=*/false, /*On=*/true); + if (HasZT0State && To == ZAState::ACTIVE) + emitZT0SaveRestore(Context, MBB, InsertPt, /*IsSave=*/false); + break; + + // This section handles transitions to OFF (not previously covered) + case transitionFrom(ZAState::ACTIVE).to(ZAState::OFF): + case transitionFrom(ZAState::ACTIVE_ZT0_SAVED).to(ZAState::OFF): + case transitionFrom(ZAState::LOCAL_SAVED).to(ZAState::OFF): + assert(SMEFnAttrs.hasPrivateZAInterface() && + "Did not expect to turn ZA off in shared/agnostic ZA function"); + emitZAMode(MBB, InsertPt, /*ClearTPIDR2=*/From == ZAState::LOCAL_SAVED, + /*On=*/false); + break; + + default: dbgs() << "Error: Transition from " << getZAStateString(From) << " to " << getZAStateString(To) << '\n'; llvm_unreachable("Unimplemented state transition"); diff --git a/llvm/test/CodeGen/AArch64/sme-peephole-opts.ll b/llvm/test/CodeGen/AArch64/sme-peephole-opts.ll index a3027f01e73cf..ea1341186ddfa 100644 --- a/llvm/test/CodeGen/AArch64/sme-peephole-opts.ll +++ b/llvm/test/CodeGen/AArch64/sme-peephole-opts.ll @@ -230,10 +230,6 @@ define void @test7() nounwind "aarch64_inout_zt0" { ; 
CHECK-NEXT: str zt0, [x19] ; CHECK-NEXT: smstop za ; CHECK-NEXT: bl callee -; CHECK-NEXT: smstart za -; CHECK-NEXT: ldr zt0, [x19] -; CHECK-NEXT: str zt0, [x19] -; CHECK-NEXT: smstop za ; CHECK-NEXT: bl callee ; CHECK-NEXT: smstart za ; CHECK-NEXT: ldr zt0, [x19] diff --git a/llvm/test/CodeGen/AArch64/sme-za-exceptions.ll b/llvm/test/CodeGen/AArch64/sme-za-exceptions.ll index ef74825e02881..3947127c47844 100644 --- a/llvm/test/CodeGen/AArch64/sme-za-exceptions.ll +++ b/llvm/test/CodeGen/AArch64/sme-za-exceptions.ll @@ -511,7 +511,6 @@ exit: ; ; This code may require reloading ZT0 in the cleanup for ~ZT0Resource(). ; -; FIXME: Codegen with `-aarch64-new-sme-abi` is broken with ZT0 (as it is not implemented). define void @try_catch_shared_zt0_callee() "aarch64_inout_zt0" personality ptr @__gxx_personality_v0 { ; CHECK-LABEL: try_catch_shared_zt0_callee: ; CHECK: .Lfunc_begin3: @@ -519,52 +518,37 @@ define void @try_catch_shared_zt0_callee() "aarch64_inout_zt0" personality ptr @ ; CHECK-NEXT: .cfi_personality 156, DW.ref.__gxx_personality_v0 ; CHECK-NEXT: .cfi_lsda 28, .Lexception3 ; CHECK-NEXT: // %bb.0: -; CHECK-NEXT: stp x29, x30, [sp, #-32]! 
// 16-byte Folded Spill -; CHECK-NEXT: stp x20, x19, [sp, #16] // 16-byte Folded Spill -; CHECK-NEXT: mov x29, sp -; CHECK-NEXT: sub sp, sp, #80 -; CHECK-NEXT: .cfi_def_cfa w29, 32 +; CHECK-NEXT: sub sp, sp, #96 +; CHECK-NEXT: str x30, [sp, #64] // 8-byte Spill +; CHECK-NEXT: stp x20, x19, [sp, #80] // 16-byte Folded Spill +; CHECK-NEXT: .cfi_def_cfa_offset 96 ; CHECK-NEXT: .cfi_offset w19, -8 ; CHECK-NEXT: .cfi_offset w20, -16 -; CHECK-NEXT: .cfi_offset w30, -24 -; CHECK-NEXT: .cfi_offset w29, -32 -; CHECK-NEXT: rdsvl x8, #1 -; CHECK-NEXT: mov x9, sp -; CHECK-NEXT: msub x9, x8, x8, x9 -; CHECK-NEXT: mov sp, x9 -; CHECK-NEXT: stp x9, x8, [x29, #-80] +; CHECK-NEXT: .cfi_offset w30, -32 ; CHECK-NEXT: .Ltmp9: // EH_LABEL -; CHECK-NEXT: sub x19, x29, #64 +; CHECK-NEXT: mov x19, sp ; CHECK-NEXT: str zt0, [x19] ; CHECK-NEXT: smstop za ; CHECK-NEXT: bl may_throw +; CHECK-NEXT: .Ltmp10: // EH_LABEL ; CHECK-NEXT: smstart za ; CHECK-NEXT: ldr zt0, [x19] -; CHECK-NEXT: .Ltmp10: // EH_LABEL ; CHECK-NEXT: // %bb.1: // %return_normally -; CHECK-NEXT: mov sp, x29 -; CHECK-NEXT: ldp x20, x19, [sp, #16] // 16-byte Folded Reload -; CHECK-NEXT: ldp x29, x30, [sp], #32 // 16-byte Folded Reload +; CHECK-NEXT: ldp x20, x19, [sp, #80] // 16-byte Folded Reload +; CHECK-NEXT: ldr x30, [sp, #64] // 8-byte Reload +; CHECK-NEXT: add sp, sp, #96 ; CHECK-NEXT: ret ; CHECK-NEXT: .LBB3_2: // %unwind_dtors ; CHECK-NEXT: .Ltmp11: // EH_LABEL -; CHECK-NEXT: sub x20, x29, #64 +; CHECK-NEXT: mov x20, sp ; CHECK-NEXT: mov x19, x0 ; CHECK-NEXT: smstart za -; CHECK-NEXT: mrs x8, TPIDR2_EL0 -; CHECK-NEXT: sub x0, x29, #80 -; CHECK-NEXT: cbnz x8, .LBB3_4 -; CHECK-NEXT: // %bb.3: // %unwind_dtors -; CHECK-NEXT: bl __arm_tpidr2_restore -; CHECK-NEXT: .LBB3_4: // %unwind_dtors -; CHECK-NEXT: msr TPIDR2_EL0, xzr +; CHECK-NEXT: ldr zt0, [x20] ; CHECK-NEXT: bl shared_zt0_call ; CHECK-NEXT: str zt0, [x20] ; CHECK-NEXT: smstop za ; CHECK-NEXT: mov x0, x19 ; CHECK-NEXT: bl _Unwind_Resume -; CHECK-NEXT: smstart za 
-; CHECK-NEXT: ldr zt0, [x20] ; ; CHECK-SDAG-LABEL: try_catch_shared_zt0_callee: ; CHECK-SDAG: .Lfunc_begin3: @@ -965,6 +949,239 @@ exit: ret void } +define void @try_catch_inout_zt0() "aarch64_inout_zt0" personality ptr @__gxx_personality_v0 { +; CHECK-LABEL: try_catch_inout_zt0: +; CHECK: .Lfunc_begin7: +; CHECK-NEXT: .cfi_startproc +; CHECK-NEXT: .cfi_personality 156, DW.ref.__gxx_personality_v0 +; CHECK-NEXT: .cfi_lsda 28, .Lexception7 +; CHECK-NEXT: // %bb.0: // %entry +; CHECK-NEXT: sub sp, sp, #80 +; CHECK-NEXT: stp x30, x19, [sp, #64] // 16-byte Folded Spill +; CHECK-NEXT: .cfi_def_cfa_offset 80 +; CHECK-NEXT: .cfi_offset w19, -8 +; CHECK-NEXT: .cfi_offset w30, -16 +; CHECK-NEXT: .Ltmp21: // EH_LABEL +; CHECK-NEXT: mov x19, sp +; CHECK-NEXT: str zt0, [x19] +; CHECK-NEXT: smstop za +; CHECK-NEXT: bl may_throw +; CHECK-NEXT: .Ltmp22: // EH_LABEL +; CHECK-NEXT: .LBB7_1: // %exit +; CHECK-NEXT: smstart za +; CHECK-NEXT: ldr zt0, [x19] +; CHECK-NEXT: ldp x30, x19, [sp, #64] // 16-byte Folded Reload +; CHECK-NEXT: add sp, sp, #80 +; CHECK-NEXT: ret +; CHECK-NEXT: .LBB7_2: // %catch +; CHECK-NEXT: .Ltmp23: // EH_LABEL +; CHECK-NEXT: bl __cxa_begin_catch +; CHECK-NEXT: bl __cxa_end_catch +; CHECK-NEXT: b .LBB7_1 +; +; CHECK-SDAG-LABEL: try_catch_inout_zt0: +; CHECK-SDAG: .Lfunc_begin7: +; CHECK-SDAG-NEXT: .cfi_startproc +; CHECK-SDAG-NEXT: .cfi_personality 156, DW.ref.__gxx_personality_v0 +; CHECK-SDAG-NEXT: .cfi_lsda 28, .Lexception7 +; CHECK-SDAG-NEXT: // %bb.0: // %entry +; CHECK-SDAG-NEXT: sub sp, sp, #80 +; CHECK-SDAG-NEXT: stp x30, x19, [sp, #64] // 16-byte Folded Spill +; CHECK-SDAG-NEXT: .cfi_def_cfa_offset 80 +; CHECK-SDAG-NEXT: .cfi_offset w19, -8 +; CHECK-SDAG-NEXT: .cfi_offset w30, -16 +; CHECK-SDAG-NEXT: .Ltmp21: // EH_LABEL +; CHECK-SDAG-NEXT: mov x19, sp +; CHECK-SDAG-NEXT: str zt0, [x19] +; CHECK-SDAG-NEXT: smstop za +; CHECK-SDAG-NEXT: bl may_throw +; CHECK-SDAG-NEXT: smstart za +; CHECK-SDAG-NEXT: ldr zt0, [x19] +; CHECK-SDAG-NEXT: .Ltmp22: // 
EH_LABEL +; CHECK-SDAG-NEXT: .LBB7_1: // %exit +; CHECK-SDAG-NEXT: ldp x30, x19, [sp, #64] // 16-byte Folded Reload +; CHECK-SDAG-NEXT: add sp, sp, #80 +; CHECK-SDAG-NEXT: ret +; CHECK-SDAG-NEXT: .LBB7_2: // %catch +; CHECK-SDAG-NEXT: .Ltmp23: // EH_LABEL +; CHECK-SDAG-NEXT: smstart za +; CHECK-SDAG-NEXT: ldr zt0, [x19] +; CHECK-SDAG-NEXT: str zt0, [x19] +; CHECK-SDAG-NEXT: smstop za +; CHECK-SDAG-NEXT: bl __cxa_begin_catch +; CHECK-SDAG-NEXT: smstart za +; CHECK-SDAG-NEXT: ldr zt0, [x19] +; CHECK-SDAG-NEXT: str zt0, [x19] +; CHECK-SDAG-NEXT: smstop za +; CHECK-SDAG-NEXT: bl __cxa_end_catch +; CHECK-SDAG-NEXT: smstart za +; CHECK-SDAG-NEXT: ldr zt0, [x19] +; CHECK-SDAG-NEXT: b .LBB7_1 +entry: + invoke void @may_throw() + to label %exit unwind label %catch + +catch: + %eh_info = landingpad { ptr, i32 } + catch ptr null + %exception_ptr = extractvalue { ptr, i32 } %eh_info, 0 + tail call ptr @__cxa_begin_catch(ptr %exception_ptr) + tail call void @__cxa_end_catch() + br label %exit + +exit: + ret void +} + +define void @try_catch_shared_za_callee_zt0_saved(ptr %callee) "aarch64_inout_za" "aarch64_in_zt0" personality ptr @__gxx_personality_v0 { +; CHECK-LABEL: try_catch_shared_za_callee_zt0_saved: +; CHECK: .Lfunc_begin8: +; CHECK-NEXT: .cfi_startproc +; CHECK-NEXT: .cfi_personality 156, DW.ref.__gxx_personality_v0 +; CHECK-NEXT: .cfi_lsda 28, .Lexception8 +; CHECK-NEXT: // %bb.0: +; CHECK-NEXT: stp x29, x30, [sp, #-32]! 
// 16-byte Folded Spill +; CHECK-NEXT: stp x20, x19, [sp, #16] // 16-byte Folded Spill +; CHECK-NEXT: mov x29, sp +; CHECK-NEXT: sub sp, sp, #80 +; CHECK-NEXT: .cfi_def_cfa w29, 32 +; CHECK-NEXT: .cfi_offset w19, -8 +; CHECK-NEXT: .cfi_offset w20, -16 +; CHECK-NEXT: .cfi_offset w30, -24 +; CHECK-NEXT: .cfi_offset w29, -32 +; CHECK-NEXT: rdsvl x8, #1 +; CHECK-NEXT: mov x9, sp +; CHECK-NEXT: msub x9, x8, x8, x9 +; CHECK-NEXT: mov sp, x9 +; CHECK-NEXT: mov x19, x0 +; CHECK-NEXT: stp x9, x8, [x29, #-80] +; CHECK-NEXT: .Ltmp24: // EH_LABEL +; CHECK-NEXT: sub x20, x29, #64 +; CHECK-NEXT: sub x8, x29, #80 +; CHECK-NEXT: str zt0, [x20] +; CHECK-NEXT: msr TPIDR2_EL0, x8 +; CHECK-NEXT: bl may_throw +; CHECK-NEXT: .Ltmp25: // EH_LABEL +; CHECK-NEXT: smstart za +; CHECK-NEXT: mrs x8, TPIDR2_EL0 +; CHECK-NEXT: sub x0, x29, #80 +; CHECK-NEXT: cbnz x8, .LBB8_2 +; CHECK-NEXT: // %bb.1: +; CHECK-NEXT: bl __arm_tpidr2_restore +; CHECK-NEXT: .LBB8_2: +; CHECK-NEXT: msr TPIDR2_EL0, xzr +; CHECK-NEXT: ldr zt0, [x20] +; CHECK-NEXT: // %bb.3: // %return_normally +; CHECK-NEXT: mov sp, x29 +; CHECK-NEXT: ldp x20, x19, [sp, #16] // 16-byte Folded Reload +; CHECK-NEXT: ldp x29, x30, [sp], #32 // 16-byte Folded Reload +; CHECK-NEXT: ret +; CHECK-NEXT: .LBB8_4: // %unwind_dtors +; CHECK-NEXT: .Ltmp26: // EH_LABEL +; CHECK-NEXT: mov x20, x0 +; CHECK-NEXT: smstart za +; CHECK-NEXT: mrs x8, TPIDR2_EL0 +; CHECK-NEXT: sub x0, x29, #80 +; CHECK-NEXT: cbnz x8, .LBB8_6 +; CHECK-NEXT: // %bb.5: // %unwind_dtors +; CHECK-NEXT: bl __arm_tpidr2_restore +; CHECK-NEXT: .LBB8_6: // %unwind_dtors +; CHECK-NEXT: msr TPIDR2_EL0, xzr +; CHECK-NEXT: blr x19 +; CHECK-NEXT: sub x8, x29, #80 +; CHECK-NEXT: mov x0, x20 +; CHECK-NEXT: msr TPIDR2_EL0, x8 +; CHECK-NEXT: bl _Unwind_Resume +; +; CHECK-SDAG-LABEL: try_catch_shared_za_callee_zt0_saved: +; CHECK-SDAG: .Lfunc_begin8: +; CHECK-SDAG-NEXT: .cfi_startproc +; CHECK-SDAG-NEXT: .cfi_personality 156, DW.ref.__gxx_personality_v0 +; CHECK-SDAG-NEXT: .cfi_lsda 28, 
.Lexception8 +; CHECK-SDAG-NEXT: // %bb.0: +; CHECK-SDAG-NEXT: stp x29, x30, [sp, #-48]! // 16-byte Folded Spill +; CHECK-SDAG-NEXT: stp x22, x21, [sp, #16] // 16-byte Folded Spill +; CHECK-SDAG-NEXT: mov x29, sp +; CHECK-SDAG-NEXT: stp x20, x19, [sp, #32] // 16-byte Folded Spill +; CHECK-SDAG-NEXT: sub sp, sp, #80 +; CHECK-SDAG-NEXT: .cfi_def_cfa w29, 48 +; CHECK-SDAG-NEXT: .cfi_offset w19, -8 +; CHECK-SDAG-NEXT: .cfi_offset w20, -16 +; CHECK-SDAG-NEXT: .cfi_offset w21, -24 +; CHECK-SDAG-NEXT: .cfi_offset w22, -32 +; CHECK-SDAG-NEXT: .cfi_offset w30, -40 +; CHECK-SDAG-NEXT: .cfi_offset w29, -48 +; CHECK-SDAG-NEXT: rdsvl x8, #1 +; CHECK-SDAG-NEXT: mov x9, sp +; CHECK-SDAG-NEXT: mov x19, x0 +; CHECK-SDAG-NEXT: msub x9, x8, x8, x9 +; CHECK-SDAG-NEXT: mov sp, x9 +; CHECK-SDAG-NEXT: stp x9, x8, [x29, #-16] +; CHECK-SDAG-NEXT: .Ltmp24: // EH_LABEL +; CHECK-SDAG-NEXT: sub x8, x29, #16 +; CHECK-SDAG-NEXT: sub x20, x29, #80 +; CHECK-SDAG-NEXT: msr TPIDR2_EL0, x8 +; CHECK-SDAG-NEXT: str zt0, [x20] +; CHECK-SDAG-NEXT: bl may_throw +; CHECK-SDAG-NEXT: smstart za +; CHECK-SDAG-NEXT: ldr zt0, [x20] +; CHECK-SDAG-NEXT: mrs x8, TPIDR2_EL0 +; CHECK-SDAG-NEXT: sub x0, x29, #16 +; CHECK-SDAG-NEXT: cbnz x8, .LBB8_2 +; CHECK-SDAG-NEXT: // %bb.1: +; CHECK-SDAG-NEXT: bl __arm_tpidr2_restore +; CHECK-SDAG-NEXT: .LBB8_2: +; CHECK-SDAG-NEXT: msr TPIDR2_EL0, xzr +; CHECK-SDAG-NEXT: .Ltmp25: // EH_LABEL +; CHECK-SDAG-NEXT: // %bb.3: // %return_normally +; CHECK-SDAG-NEXT: mov sp, x29 +; CHECK-SDAG-NEXT: ldp x20, x19, [sp, #32] // 16-byte Folded Reload +; CHECK-SDAG-NEXT: ldp x22, x21, [sp, #16] // 16-byte Folded Reload +; CHECK-SDAG-NEXT: ldp x29, x30, [sp], #48 // 16-byte Folded Reload +; CHECK-SDAG-NEXT: ret +; CHECK-SDAG-NEXT: .LBB8_4: // %unwind_dtors +; CHECK-SDAG-NEXT: .Ltmp26: // EH_LABEL +; CHECK-SDAG-NEXT: sub x21, x29, #80 +; CHECK-SDAG-NEXT: sub x22, x29, #16 +; CHECK-SDAG-NEXT: mov x20, x0 +; CHECK-SDAG-NEXT: smstart za +; CHECK-SDAG-NEXT: ldr zt0, [x21] +; CHECK-SDAG-NEXT: mrs 
x8, TPIDR2_EL0 +; CHECK-SDAG-NEXT: sub x0, x29, #16 +; CHECK-SDAG-NEXT: cbnz x8, .LBB8_6 +; CHECK-SDAG-NEXT: // %bb.5: // %unwind_dtors +; CHECK-SDAG-NEXT: bl __arm_tpidr2_restore +; CHECK-SDAG-NEXT: .LBB8_6: // %unwind_dtors +; CHECK-SDAG-NEXT: msr TPIDR2_EL0, xzr +; CHECK-SDAG-NEXT: str zt0, [x21] +; CHECK-SDAG-NEXT: blr x19 +; CHECK-SDAG-NEXT: ldr zt0, [x21] +; CHECK-SDAG-NEXT: mov x0, x20 +; CHECK-SDAG-NEXT: msr TPIDR2_EL0, x22 +; CHECK-SDAG-NEXT: str zt0, [x21] +; CHECK-SDAG-NEXT: bl _Unwind_Resume +; CHECK-SDAG-NEXT: smstart za +; CHECK-SDAG-NEXT: ldr zt0, [x21] +; CHECK-SDAG-NEXT: mrs x8, TPIDR2_EL0 +; CHECK-SDAG-NEXT: sub x0, x29, #16 +; CHECK-SDAG-NEXT: cbnz x8, .LBB8_8 +; CHECK-SDAG-NEXT: // %bb.7: // %unwind_dtors +; CHECK-SDAG-NEXT: bl __arm_tpidr2_restore +; CHECK-SDAG-NEXT: .LBB8_8: // %unwind_dtors +; CHECK-SDAG-NEXT: msr TPIDR2_EL0, xzr + invoke void @may_throw() + to label %return_normally unwind label %unwind_dtors + +unwind_dtors: + %5 = landingpad { ptr, i32 } + cleanup + call void %callee() "aarch64_inout_za" + resume { ptr, i32 } %5 + +return_normally: + ret void +} + declare ptr @__cxa_allocate_exception(i64) declare void @__cxa_throw(ptr, ptr, ptr) declare ptr @__cxa_begin_catch(ptr) diff --git a/llvm/test/CodeGen/AArch64/sme-zt0-state.ll b/llvm/test/CodeGen/AArch64/sme-zt0-state.ll index 69c69f027a33f..0d4a39b2eeb2f 100644 --- a/llvm/test/CodeGen/AArch64/sme-zt0-state.ll +++ b/llvm/test/CodeGen/AArch64/sme-zt0-state.ll @@ -193,7 +193,7 @@ define void @zt0_new_caller_zt0_new_callee(ptr %callee) "aarch64_new_zt0" nounwi ; CHECK-NEWLOWERING-LABEL: zt0_new_caller_zt0_new_callee: ; CHECK-NEWLOWERING: // %bb.0: ; CHECK-NEWLOWERING-NEXT: sub sp, sp, #80 -; CHECK-NEWLOWERING-NEXT: stp x30, x19, [sp, #64] // 16-byte Folded Spill +; CHECK-NEWLOWERING-NEXT: str x30, [sp, #64] // 8-byte Spill ; CHECK-NEWLOWERING-NEXT: mrs x8, TPIDR2_EL0 ; CHECK-NEWLOWERING-NEXT: cbz x8, .LBB6_2 ; CHECK-NEWLOWERING-NEXT: // %bb.1: @@ -202,14 +202,11 @@ define void 
@zt0_new_caller_zt0_new_callee(ptr %callee) "aarch64_new_zt0" nounwi ; CHECK-NEWLOWERING-NEXT: zero { zt0 } ; CHECK-NEWLOWERING-NEXT: .LBB6_2: ; CHECK-NEWLOWERING-NEXT: smstart za -; CHECK-NEWLOWERING-NEXT: mov x19, sp -; CHECK-NEWLOWERING-NEXT: str zt0, [x19] +; CHECK-NEWLOWERING-NEXT: mov x8, sp +; CHECK-NEWLOWERING-NEXT: str zt0, [x8] ; CHECK-NEWLOWERING-NEXT: smstop za ; CHECK-NEWLOWERING-NEXT: blr x0 -; CHECK-NEWLOWERING-NEXT: smstart za -; CHECK-NEWLOWERING-NEXT: ldr zt0, [x19] -; CHECK-NEWLOWERING-NEXT: smstop za -; CHECK-NEWLOWERING-NEXT: ldp x30, x19, [sp, #64] // 16-byte Folded Reload +; CHECK-NEWLOWERING-NEXT: ldr x30, [sp, #64] // 8-byte Reload ; CHECK-NEWLOWERING-NEXT: add sp, sp, #80 ; CHECK-NEWLOWERING-NEXT: ret call void %callee() "aarch64_new_zt0"; @@ -246,7 +243,7 @@ define i64 @zt0_new_caller_abi_routine_callee() "aarch64_new_zt0" nounwind { ; CHECK-NEWLOWERING-LABEL: zt0_new_caller_abi_routine_callee: ; CHECK-NEWLOWERING: // %bb.0: ; CHECK-NEWLOWERING-NEXT: sub sp, sp, #80 -; CHECK-NEWLOWERING-NEXT: stp x30, x19, [sp, #64] // 16-byte Folded Spill +; CHECK-NEWLOWERING-NEXT: str x30, [sp, #64] // 8-byte Spill ; CHECK-NEWLOWERING-NEXT: mrs x8, TPIDR2_EL0 ; CHECK-NEWLOWERING-NEXT: cbz x8, .LBB7_2 ; CHECK-NEWLOWERING-NEXT: // %bb.1: @@ -255,12 +252,11 @@ define i64 @zt0_new_caller_abi_routine_callee() "aarch64_new_zt0" nounwind { ; CHECK-NEWLOWERING-NEXT: zero { zt0 } ; CHECK-NEWLOWERING-NEXT: .LBB7_2: ; CHECK-NEWLOWERING-NEXT: smstart za -; CHECK-NEWLOWERING-NEXT: mov x19, sp -; CHECK-NEWLOWERING-NEXT: str zt0, [x19] -; CHECK-NEWLOWERING-NEXT: bl __arm_sme_state -; CHECK-NEWLOWERING-NEXT: ldr zt0, [x19] +; CHECK-NEWLOWERING-NEXT: mov x8, sp +; CHECK-NEWLOWERING-NEXT: str zt0, [x8] ; CHECK-NEWLOWERING-NEXT: smstop za -; CHECK-NEWLOWERING-NEXT: ldp x30, x19, [sp, #64] // 16-byte Folded Reload +; CHECK-NEWLOWERING-NEXT: bl __arm_sme_state +; CHECK-NEWLOWERING-NEXT: ldr x30, [sp, #64] // 8-byte Reload ; CHECK-NEWLOWERING-NEXT: add sp, sp, #80 ; 
CHECK-NEWLOWERING-NEXT: ret %res = call {i64, i64} @__arm_sme_state() @@ -382,37 +378,57 @@ define void @shared_za_new_zt0(ptr %callee) "aarch64_inout_za" "aarch64_new_zt0" define void @zt0_multiple_private_za_calls(ptr %callee) "aarch64_in_zt0" nounwind { -; CHECK-COMMON-LABEL: zt0_multiple_private_za_calls: -; CHECK-COMMON: // %bb.0: -; CHECK-COMMON-NEXT: sub sp, sp, #96 -; CHECK-COMMON-NEXT: stp x20, x19, [sp, #80] // 16-byte Folded Spill -; CHECK-COMMON-NEXT: mov x20, sp -; CHECK-COMMON-NEXT: mov x19, x0 -; CHECK-COMMON-NEXT: str x30, [sp, #64] // 8-byte Spill -; CHECK-COMMON-NEXT: str zt0, [x20] -; CHECK-COMMON-NEXT: smstop za -; CHECK-COMMON-NEXT: blr x0 -; CHECK-COMMON-NEXT: smstart za -; CHECK-COMMON-NEXT: ldr zt0, [x20] -; CHECK-COMMON-NEXT: str zt0, [x20] -; CHECK-COMMON-NEXT: smstop za -; CHECK-COMMON-NEXT: blr x19 -; CHECK-COMMON-NEXT: smstart za -; CHECK-COMMON-NEXT: ldr zt0, [x20] -; CHECK-COMMON-NEXT: str zt0, [x20] -; CHECK-COMMON-NEXT: smstop za -; CHECK-COMMON-NEXT: blr x19 -; CHECK-COMMON-NEXT: smstart za -; CHECK-COMMON-NEXT: ldr zt0, [x20] -; CHECK-COMMON-NEXT: str zt0, [x20] -; CHECK-COMMON-NEXT: smstop za -; CHECK-COMMON-NEXT: blr x19 -; CHECK-COMMON-NEXT: smstart za -; CHECK-COMMON-NEXT: ldr zt0, [x20] -; CHECK-COMMON-NEXT: ldp x20, x19, [sp, #80] // 16-byte Folded Reload -; CHECK-COMMON-NEXT: ldr x30, [sp, #64] // 8-byte Reload -; CHECK-COMMON-NEXT: add sp, sp, #96 -; CHECK-COMMON-NEXT: ret +; CHECK-LABEL: zt0_multiple_private_za_calls: +; CHECK: // %bb.0: +; CHECK-NEXT: sub sp, sp, #96 +; CHECK-NEXT: stp x20, x19, [sp, #80] // 16-byte Folded Spill +; CHECK-NEXT: mov x20, sp +; CHECK-NEXT: mov x19, x0 +; CHECK-NEXT: str x30, [sp, #64] // 8-byte Spill +; CHECK-NEXT: str zt0, [x20] +; CHECK-NEXT: smstop za +; CHECK-NEXT: blr x0 +; CHECK-NEXT: smstart za +; CHECK-NEXT: ldr zt0, [x20] +; CHECK-NEXT: str zt0, [x20] +; CHECK-NEXT: smstop za +; CHECK-NEXT: blr x19 +; CHECK-NEXT: smstart za +; CHECK-NEXT: ldr zt0, [x20] +; CHECK-NEXT: str zt0, 
[x20] +; CHECK-NEXT: smstop za +; CHECK-NEXT: blr x19 +; CHECK-NEXT: smstart za +; CHECK-NEXT: ldr zt0, [x20] +; CHECK-NEXT: str zt0, [x20] +; CHECK-NEXT: smstop za +; CHECK-NEXT: blr x19 +; CHECK-NEXT: smstart za +; CHECK-NEXT: ldr zt0, [x20] +; CHECK-NEXT: ldp x20, x19, [sp, #80] // 16-byte Folded Reload +; CHECK-NEXT: ldr x30, [sp, #64] // 8-byte Reload +; CHECK-NEXT: add sp, sp, #96 +; CHECK-NEXT: ret +; +; CHECK-NEWLOWERING-LABEL: zt0_multiple_private_za_calls: +; CHECK-NEWLOWERING: // %bb.0: +; CHECK-NEWLOWERING-NEXT: sub sp, sp, #96 +; CHECK-NEWLOWERING-NEXT: stp x20, x19, [sp, #80] // 16-byte Folded Spill +; CHECK-NEWLOWERING-NEXT: mov x20, sp +; CHECK-NEWLOWERING-NEXT: mov x19, x0 +; CHECK-NEWLOWERING-NEXT: str x30, [sp, #64] // 8-byte Spill +; CHECK-NEWLOWERING-NEXT: str zt0, [x20] +; CHECK-NEWLOWERING-NEXT: smstop za +; CHECK-NEWLOWERING-NEXT: blr x0 +; CHECK-NEWLOWERING-NEXT: blr x19 +; CHECK-NEWLOWERING-NEXT: blr x19 +; CHECK-NEWLOWERING-NEXT: blr x19 +; CHECK-NEWLOWERING-NEXT: smstart za +; CHECK-NEWLOWERING-NEXT: ldr zt0, [x20] +; CHECK-NEWLOWERING-NEXT: ldp x20, x19, [sp, #80] // 16-byte Folded Reload +; CHECK-NEWLOWERING-NEXT: ldr x30, [sp, #64] // 8-byte Reload +; CHECK-NEWLOWERING-NEXT: add sp, sp, #96 +; CHECK-NEWLOWERING-NEXT: ret call void %callee() call void %callee() call void %callee() From 34c44f21ae9bf5532e467fa2e942fe61715d1394 Mon Sep 17 00:00:00 2001 From: Tom Eccles Date: Mon, 1 Dec 2025 10:05:56 +0000 Subject: [PATCH 19/39] [flang][TBAA] refine TARGET/POINTER encoding (#169544) Previously we were less specific for POINTER/TARGET: encoding that they could alias with (almost) anything. In the new system, the "target data" tree is now a sibling of the other trees (e.g. "global data"). POITNTER variables go at the root of the "target data" tree, whereas TARGET variables get their own nodes under that tree. 
For example, ``` integer, pointer :: ip real, pointer :: rp integer, target :: it integer, target :: it2(:) real, target :: rt integer :: i real :: r ``` - `ip` and `rp` may alias with any variable except `i` and `r`. - `it`, `it2`, and `rt` may alias only with `ip` or `rp`. - `i` and `r` cannot alias with any other variable. Fortran 2023 15.5.2.14 gives restrictions on entities associated with dummy arguments. These do not allow non-target globals to be modified through dummy arguments and therefore I don't think we need to make all globals alias with dummy arguments. I haven't implemented it in this patch, but I wonder whether it is ever possible for `ip` to alias with `rt` or even `it2`. While I was updating the tests I fixed up some tests that still assumed that local alloc tbaa wasn't the default. I found no functional regressions in the gfortran test suite, fujitsu test suite, spec2017, or a selection of HPC apps we test internally. --- .../flang/Optimizer/Analysis/TBAAForest.h | 24 +++-- flang/lib/Optimizer/Analysis/TBAAForest.cpp | 9 +- .../lib/Optimizer/Transforms/AddAliasTags.cpp | 18 +++- flang/test/Driver/tco-test-gen.fir | 8 +- flang/test/Fir/tbaa-codegen2.fir | 1 - .../test/Transforms/tbaa-for-common-vars.fir | 29 +++--- .../Transforms/tbaa-for-global-equiv-vars.fir | 6 +- flang/test/Transforms/tbaa-for-local-vars.fir | 32 ++++--- .../test/Transforms/tbaa-with-dummy-scope.fir | 22 +++-- .../Transforms/tbaa-with-dummy-scope2.fir | 32 ++++--- flang/test/Transforms/tbaa2.fir | 20 ++--- flang/test/Transforms/tbaa3.fir | 89 +++++++++---------- flang/test/Transforms/tbaa4.fir | 32 +++---- 13 files changed, 162 insertions(+), 160 deletions(-) diff --git a/flang/include/flang/Optimizer/Analysis/TBAAForest.h b/flang/include/flang/Optimizer/Analysis/TBAAForest.h index b4932594114a1..0b70778eba3af 100644 --- a/flang/include/flang/Optimizer/Analysis/TBAAForest.h +++ b/flang/include/flang/Optimizer/Analysis/TBAAForest.h @@ -99,11 +99,25 @@ struct TBAATree { // |- 
"any data access" // | // |- "dummy arg data" - // |- "target data" - // | - // |- "allocated data" - // |- "direct data" - // |- "global data" + // | + // |- + // |- + // |- "target data" <-- Any POINTER variable or TARGET dummy arg + // | + // |- <--- any TARGET variable which isn't a dummy arg + // |- + // |- "allocated data" + // | + // |- + // |- + // |- "direct data" + // | + // |- + // |- + // |- "global data" + // | + // |- + // |- static TBAATree buildTree(mlir::StringAttr functionName); private: diff --git a/flang/lib/Optimizer/Analysis/TBAAForest.cpp b/flang/lib/Optimizer/Analysis/TBAAForest.cpp index 44a0348da3a6f..7154785c62c75 100644 --- a/flang/lib/Optimizer/Analysis/TBAAForest.cpp +++ b/flang/lib/Optimizer/Analysis/TBAAForest.cpp @@ -66,12 +66,9 @@ fir::TBAATree::TBAATree(mlir::LLVM::TBAATypeDescriptorAttr anyAccess, mlir::LLVM::TBAATypeDescriptorAttr dataRoot, mlir::LLVM::TBAATypeDescriptorAttr boxMemberTypeDesc) : targetDataTree(dataRoot.getContext(), "target data", dataRoot), - globalDataTree(dataRoot.getContext(), "global data", - targetDataTree.getRoot()), - allocatedDataTree(dataRoot.getContext(), "allocated data", - targetDataTree.getRoot()), + globalDataTree(dataRoot.getContext(), "global data", dataRoot), + allocatedDataTree(dataRoot.getContext(), "allocated data", dataRoot), dummyArgDataTree(dataRoot.getContext(), "dummy arg data", dataRoot), - directDataTree(dataRoot.getContext(), "direct data", - targetDataTree.getRoot()), + directDataTree(dataRoot.getContext(), "direct data", dataRoot), anyAccessDesc(anyAccess), boxMemberTypeDesc(boxMemberTypeDesc), anyDataTypeDesc(dataRoot) {} diff --git a/flang/lib/Optimizer/Transforms/AddAliasTags.cpp b/flang/lib/Optimizer/Transforms/AddAliasTags.cpp index 0221c7a8184d7..b592cee794f33 100644 --- a/flang/lib/Optimizer/Transforms/AddAliasTags.cpp +++ b/flang/lib/Optimizer/Transforms/AddAliasTags.cpp @@ -692,8 +692,9 @@ void AddAliasTagsPass::runOnAliasInterface(fir::FirAliasTagOpInterface op, 
LLVM_DEBUG(llvm::dbgs().indent(2) << "Found reference to dummy argument at " << *op << "\n"); std::string name = getFuncArgName(llvm::cast(source.origin.u)); - // If it is a TARGET or POINTER, then we do not care about the name, - // because the tag points to the root of the subtree currently. + // POINTERS can alias with any POINTER or TARGET. Assume that TARGET dummy + // arguments might alias with each other (because of the "TARGET" hole for + // dummy arguments). See flang/docs/Aliasing.md. if (source.isTargetOrPointer()) { tag = state.getFuncTreeWithScope(func, scopeOp).targetDataTree.getTag(); } else if (!name.empty()) { @@ -716,7 +717,12 @@ void AddAliasTagsPass::runOnAliasInterface(fir::FirAliasTagOpInterface op, << "Found reference to global " << globalName.str() << " at " << *op << "\n"); if (source.isPointer()) { + // Pointers can alias with any pointer or target. tag = state.getFuncTreeWithScope(func, scopeOp).targetDataTree.getTag(); + } else if (source.isTarget()) { + // Targets could alias with any pointer but not with each other. + tag = state.getFuncTreeWithScope(func, scopeOp) + .targetDataTree.getTag(globalName); } else { // In general, place the tags under the "global data" root. fir::TBAATree::SubtreeState *subTree = @@ -776,9 +782,17 @@ void AddAliasTagsPass::runOnAliasInterface(fir::FirAliasTagOpInterface op, const char *name = glbl.getRootReference().data(); LLVM_DEBUG(llvm::dbgs().indent(2) << "Found reference to direct " << name << " at " << *op << "\n"); + // Pointer can alias with any pointer or target so that gets the root. if (source.isPointer()) tag = state.getFuncTreeWithScope(func, scopeOp).targetDataTree.getTag(); + // Targets could alias with any pointer but not with each other so they + // get their own node inside of the target data tree. 
+ else if (source.isTarget()) + tag = state.getFuncTreeWithScope(func, scopeOp) + .targetDataTree.getTag(name); else + // Boxes that are not pointers or targets cannot alias with those that + // are. Put them under global data. tag = state.getFuncTreeWithScope(func, scopeOp) .directDataTree.getTag(name); } else { diff --git a/flang/test/Driver/tco-test-gen.fir b/flang/test/Driver/tco-test-gen.fir index b39295d72918f..438804ce42b76 100644 --- a/flang/test/Driver/tco-test-gen.fir +++ b/flang/test/Driver/tco-test-gen.fir @@ -77,13 +77,13 @@ func.func @_QPtest(%arg0: !fir.ref {fir.bindc_name = "num"}, %arg1: !fir.re // CHECK: llvm.cond_br %[[VAL_17]], ^bb2, ^bb3 // CHECK: ^bb2: -// AA: llvm.store %[[VAL_15]], %[[VAL_1]] {tbaa = [#llvm.tbaa_tag, 0>}>, 0>}>, 0>}>, 0>}>, 0>}>, access_type = , 0>}>, 0>}>, 0>}>, 0>}>, 0>}>, offset = 0>]} : i32, !llvm.ptr +// AA: llvm.store %[[VAL_15]], %[[VAL_1]] {tbaa = [#llvm.tbaa_tag, 0>}>, 0>}>, 0>}>, 0>}>, access_type = , 0>}>, 0>}>, 0>}>, 0>}>, offset = 0>]} : i32, !llvm.ptr // NOAA: llvm.store %[[VAL_15]], %{{.*}} : i32, !llvm.ptr // AA: %[[VAL_18:.*]] = llvm.load %[[ARG0]] {tbaa = [#llvm.tbaa_tag, 0>}>, 0>}>, 0>}>, 0>}>, access_type = , 0>}>, 0>}>, 0>}>, 0>}>, offset = 0>]} : !llvm.ptr -> i32 // NOAA: %[[VAL_18:.*]] = llvm.load %[[ARG0]] : !llvm.ptr -> i32 -// AA: %[[VAL_19:.*]] = llvm.load %[[VAL_1]] {tbaa = [#llvm.tbaa_tag, 0>}>, 0>}>, 0>}>, 0>}>, 0>}>, access_type = , 0>}>, 0>}>, 0>}>, 0>}>, 0>}>, offset = 0>]} : !llvm.ptr -> i32 +// AA: %[[VAL_19:.*]] = llvm.load %[[VAL_1]] {tbaa = [#llvm.tbaa_tag, 0>}>, 0>}>, 0>}>, 0>}>, access_type = , 0>}>, 0>}>, 0>}>, 0>}>, offset = 0>]} : !llvm.ptr -> i32 // NOAA: %[[VAL_19:.*]] = llvm.load %{{.*}} : !llvm.ptr -> i32 // CHECK: %[[VAL_20:.*]] = llvm.add %[[VAL_18]], %[[VAL_19]] : i32 @@ -92,7 +92,7 @@ func.func @_QPtest(%arg0: !fir.ref {fir.bindc_name = "num"}, %arg1: !fir.re // CHECK: %[[VAL_21:.*]] = llvm.trunc %[[VAL_10]] : i64 to i32 -// AA: %[[VAL_22:.*]] = llvm.load %[[VAL_1]] {tbaa = 
[#llvm.tbaa_tag, 0>}>, 0>}>, 0>}>, 0>}>, 0>}>, access_type = , 0>}>, 0>}>, 0>}>, 0>}>, 0>}>, offset = 0>]} : !llvm.ptr -> i32 +// AA: %[[VAL_22:.*]] = llvm.load %[[VAL_1]] {tbaa = [#llvm.tbaa_tag, 0>}>, 0>}>, 0>}>, 0>}>, access_type = , 0>}>, 0>}>, 0>}>, 0>}>, offset = 0>]} : !llvm.ptr -> i32 // NOAA: %[[VAL_22:.*]] = llvm.load %{{.*}} : !llvm.ptr -> i32 // CHECK: %[[VAL_23:.*]] = llvm.add %[[VAL_22]], %[[VAL_21]] overflow : i32 @@ -100,7 +100,7 @@ func.func @_QPtest(%arg0: !fir.ref {fir.bindc_name = "num"}, %arg1: !fir.re // CHECK: llvm.br ^bb1(%[[VAL_23]], %[[VAL_24]] : i32, i64) // CHECK: ^bb3: -// AA: llvm.store %[[VAL_15]], %[[VAL_1]] {tbaa = [#llvm.tbaa_tag, 0>}>, 0>}>, 0>}>, 0>}>, 0>}>, access_type = , 0>}>, 0>}>, 0>}>, 0>}>, 0>}>, offset = 0>]} : i32, !llvm.ptr +// AA: llvm.store %[[VAL_15]], %[[VAL_1]] {tbaa = [#llvm.tbaa_tag, 0>}>, 0>}>, 0>}>, 0>}>, access_type = , 0>}>, 0>}>, 0>}>, 0>}>, offset = 0>]} : i32, !llvm.ptr // NOAA: llvm.store %[[VAL_15]], %{{.*}} : i32, !llvm.ptr // CHECK: llvm.return diff --git a/flang/test/Fir/tbaa-codegen2.fir b/flang/test/Fir/tbaa-codegen2.fir index 4907aa03ec5a5..071d3ec89394c 100644 --- a/flang/test/Fir/tbaa-codegen2.fir +++ b/flang/test/Fir/tbaa-codegen2.fir @@ -114,4 +114,3 @@ module attributes {fir.defaultkind = "a1c4d8i4l4r4", fir.kindmap = "", llvm.targ // CHECK: ![[TMP_DATA_ACCESS_TAG]] = !{![[TMP_DATA_ACCESS_TYPE:.*]], ![[TMP_DATA_ACCESS_TYPE]], i64 0} // CHECK: ![[TMP_DATA_ACCESS_TYPE]] = !{!"allocated data/", ![[TMP_ACCESS_TYPE:.*]], i64 0} // CHECK: ![[TMP_ACCESS_TYPE]] = !{!"allocated data", ![[TARGET_ACCESS_TAG:.*]], i64 0} -// CHECK: ![[TARGET_ACCESS_TAG]] = !{!"target data", ![[DATA_ACCESS_TYPE]], i64 0} diff --git a/flang/test/Transforms/tbaa-for-common-vars.fir b/flang/test/Transforms/tbaa-for-common-vars.fir index a8dd86bff72ed..087e6938f8acb 100644 --- a/flang/test/Transforms/tbaa-for-common-vars.fir +++ b/flang/test/Transforms/tbaa-for-common-vars.fir @@ -28,8 +28,7 @@ module attributes {dlti.dl_spec 
= #dlti.dl_spec : vector<4 // CHECK: #[[$ATTR_0:.+]] = #llvm.tbaa_root // CHECK: #[[$ATTR_1:.+]] = #llvm.tbaa_type_desc}> // CHECK: #[[$ATTR_2:.+]] = #llvm.tbaa_type_desc}> -// CHECK: #[[$ATTR_3:.+]] = #llvm.tbaa_type_desc}> -// CHECK: #[[$ATTR_4:.+]] = #llvm.tbaa_type_desc}> +// CHECK: #[[$ATTR_4:.+]] = #llvm.tbaa_type_desc}> // CHECK: #[[$ATTR_5:.+]] = #llvm.tbaa_type_desc}> // CHECK: #[[$ATTR_6:.+]] = #llvm.tbaa_type_desc}> // CHECK: #[[$ATTR_7:.+]] = #llvm.tbaa_type_desc}> @@ -66,8 +65,7 @@ module attributes {dlti.dl_spec = #dlti.dl_spec : vector<4 // CHECK: #[[$ATTR_10:.+]] = #llvm.tbaa_root // CHECK: #[[$ATTR_11:.+]] = #llvm.tbaa_type_desc}> // CHECK: #[[$ATTR_12:.+]] = #llvm.tbaa_type_desc}> -// CHECK: #[[$ATTR_13:.+]] = #llvm.tbaa_type_desc}> -// CHECK: #[[$ATTR_14:.+]] = #llvm.tbaa_type_desc}> +// CHECK: #[[$ATTR_14:.+]] = #llvm.tbaa_type_desc}> // CHECK: #[[$ATTR_15:.+]] = #llvm.tbaa_type_desc}> // CHECK: #[[$ATTR_16:.+]] = #llvm.tbaa_type_desc}> // CHECK: #[[$ATTR_18:.+]] = #llvm.tbaa_tag @@ -118,14 +116,13 @@ module attributes {dlti.dl_spec = #dlti.dl_spec : vector<4 // CHECK: #[[ANYACC3INNER:.+]] = #llvm.tbaa_type_desc}> // CHECK: #[[ANYDATA3:.+]] = #llvm.tbaa_type_desc}> // CHECK: #[[ANYDATA3INNER:.+]] = #llvm.tbaa_type_desc}> -// CHECK: #[[TARGETDATA3:.+]] = #llvm.tbaa_type_desc}> +// CHECK: #[[GLOBALDATA3:.+]] = #llvm.tbaa_type_desc}> // CHECK: #[[DUMMYARG3INNER:.+]] = #llvm.tbaa_type_desc}> -// CHECK: #[[GLOBALDATA3:.+]] = #llvm.tbaa_type_desc}> +// CHECK: #[[GLOBALDATA3COMMON3:.+]] = #llvm.tbaa_type_desc}> // CHECK: #[[DUMMYD:.+]] = #llvm.tbaa_type_desc}> // CHECK: #[[DUMMYC:.+]] = #llvm.tbaa_type_desc}> // CHECK: #[[DUMMYDTAG:.+]] = #llvm.tbaa_tag // CHECK: #[[DUMMYCTAG:.+]] = #llvm.tbaa_tag -// CHECK: #[[GLOBALDATA3COMMON3:.+]] = #llvm.tbaa_type_desc}> // CHECK: #[[GLOBALB:.+]] = #llvm.tbaa_type_desc}> // CHECK: #[[GLOBALA:.+]] = #llvm.tbaa_type_desc}> // CHECK: #[[GLOBALBTAG:.+]] = #llvm.tbaa_tag @@ -180,10 +177,8 @@ module attributes 
{dlti.dl_spec = #dlti.dl_spec : vector<4 // CHECK: #[[INNER4ANYACC:.+]] = #llvm.tbaa_type_desc}> // CHECK: #[[TEST4ANYDATA:.+]] = #llvm.tbaa_type_desc}> // CHECK: #[[INNER4ANYDATA:.+]] = #llvm.tbaa_type_desc}> -// CHECK: #[[TEST4TARGET:.+]] = #llvm.tbaa_type_desc}> -// CHECK: #[[INNER4TARGET:.+]] = #llvm.tbaa_type_desc}> -// CHECK: #[[TEST4GLOBAL:.+]] = #llvm.tbaa_type_desc}> -// CHECK: #[[INNER4GLOBAL:.+]] = #llvm.tbaa_type_desc}> +// CHECK: #[[TEST4GLOBAL:.+]] = #llvm.tbaa_type_desc}> +// CHECK: #[[INNER4GLOBAL:.+]] = #llvm.tbaa_type_desc}> // CHECK: #[[TEST4COMMON:.+]] = #llvm.tbaa_type_desc}> // CHECK: #[[INNER4COMMON:.+]] = #llvm.tbaa_type_desc}> // CHECK: #[[TEST4B:.+]] = #llvm.tbaa_type_desc}> @@ -229,8 +224,7 @@ module attributes {dlti.dl_spec = #dlti.dl_spec : vector<4 // CHECK: #[[TEST5ROOT:.+]] = #llvm.tbaa_root // CHECK: #[[TEST5ANYACC:.+]] = #llvm.tbaa_type_desc}> // CHECK: #[[TEST5ANYDATA:.+]] = #llvm.tbaa_type_desc}> -// CHECK: #[[TEST5TARGET:.+]] = #llvm.tbaa_type_desc}> -// CHECK: #[[TEST5GLOBAL:.+]] = #llvm.tbaa_type_desc}> +// CHECK: #[[TEST5GLOBAL:.+]] = #llvm.tbaa_type_desc}> // CHECK: #[[TEST5COMMON5:.+]] = #llvm.tbaa_type_desc}> // CHECK: #[[TEST5COMMON5TAG:.+]] = #llvm.tbaa_tag // CHECK: #[[TEST5A:.+]] = #llvm.tbaa_type_desc}> @@ -288,8 +282,7 @@ module attributes {dlti.dl_spec = #dlti.dl_spec : vector<4 // CHECK: #[[$ATTR_0:.+]] = #llvm.tbaa_root // CHECK: #[[$ATTR_1:.+]] = #llvm.tbaa_type_desc}> // CHECK: #[[$ATTR_2:.+]] = #llvm.tbaa_type_desc}> -// CHECK: #[[$ATTR_3:.+]] = #llvm.tbaa_type_desc}> -// CHECK: #[[$ATTR_4:.+]] = #llvm.tbaa_type_desc}> +// CHECK: #[[$ATTR_4:.+]] = #llvm.tbaa_type_desc}> // CHECK: #[[$ATTR_5:.+]] = #llvm.tbaa_type_desc}> // CHECK: #[[$ATTR_6:.+]] = #llvm.tbaa_type_desc}> // CHECK: #[[$ATTR_7:.+]] = #llvm.tbaa_tag @@ -354,8 +347,8 @@ module attributes {dlti.dl_spec = #dlti.dl_spec : vector<4 // CHECK: #[[$ATTR_74:.+]] = #llvm.tbaa_type_desc}> // CHECK: #[[$ATTR_75:.+]] = #llvm.tbaa_type_desc}> // CHECK: 
#[[$ATTR_76:.+]] = #llvm.tbaa_type_desc}> +// CHECK: #[[$ATTR_78:.+]] = #llvm.tbaa_type_desc}> // CHECK: #[[$ATTR_77:.+]] = #llvm.tbaa_tag -// CHECK: #[[$ATTR_78:.+]] = #llvm.tbaa_type_desc}> // CHECK: #[[$ATTR_79:.+]] = #llvm.tbaa_type_desc}> // CHECK: #[[$ATTR_80:.+]] = #llvm.tbaa_type_desc}> // CHECK: #[[$ATTR_81:.+]] = #llvm.tbaa_tag @@ -425,10 +418,10 @@ module attributes {dlti.dl_spec = #dlti.dl_spec : vector<4 // CHECK: #[[$ATTR_82:.+]] = #llvm.tbaa_root // CHECK: #[[$ATTR_83:.+]] = #llvm.tbaa_type_desc}> // CHECK: #[[$ATTR_84:.+]] = #llvm.tbaa_type_desc}> +// CHECK: #[[$ATTR_87:.+]] = #llvm.tbaa_type_desc}> // CHECK: #[[$ATTR_85:.+]] = #llvm.tbaa_type_desc}> -// CHECK: #[[$ATTR_86:.+]] = #llvm.tbaa_tag -// CHECK: #[[$ATTR_87:.+]] = #llvm.tbaa_type_desc}> // CHECK: #[[$ATTR_88:.+]] = #llvm.tbaa_tag +// CHECK: #[[$ATTR_86:.+]] = #llvm.tbaa_tag // CHECK-LABEL: func.func @_QPtest8() { // CHECK: fir.load %{{[0-9]+}} : !fir.ref>> // CHECK: fir.load %{{[0-9]+}} {tbaa = [#[[$ATTR_86]]]} : !fir.ptr diff --git a/flang/test/Transforms/tbaa-for-global-equiv-vars.fir b/flang/test/Transforms/tbaa-for-global-equiv-vars.fir index dbefa3f8e3f5f..0d082c7504024 100644 --- a/flang/test/Transforms/tbaa-for-global-equiv-vars.fir +++ b/flang/test/Transforms/tbaa-for-global-equiv-vars.fir @@ -30,8 +30,7 @@ module attributes {dlti.dl_spec = #dlti.dl_spec : vector<4 // CHECK: #[[ROOT1:.+]] = #llvm.tbaa_root // CHECK: #[[ANYACC1:.+]] = #llvm.tbaa_type_desc}> // CHECK: #[[ANYDATA1:.+]] = #llvm.tbaa_type_desc}> -// CHECK: #[[TARGETDATA1:.+]] = #llvm.tbaa_type_desc}> -// CHECK: #[[GLOBALDATA1:.+]] = #llvm.tbaa_type_desc}> +// CHECK: #[[GLOBALDATA1:.+]] = #llvm.tbaa_type_desc}> // CHECK: #[[GLOB1COMMON:.+]] = #llvm.tbaa_type_desc}> // CHECK: #[[GLOB1:.+]] = #llvm.tbaa_type_desc}> // CHECK: #[[TAG:.+]] = #llvm.tbaa_tag @@ -74,8 +73,7 @@ module attributes {dlti.dl_spec = #dlti.dl_spec : vector<4 // CHECK: #[[ROOT2:.+]] = #llvm.tbaa_root // CHECK: #[[ANYACC2:.+]] = #llvm.tbaa_type_desc}> // 
CHECK: #[[ANYDATA2:.+]] = #llvm.tbaa_type_desc}> -// CHECK: #[[TARGETDATA2:.+]] = #llvm.tbaa_type_desc}> -// CHECK: #[[GLOBALDATA2:.+]] = #llvm.tbaa_type_desc}> +// CHECK: #[[GLOBALDATA2:.+]] = #llvm.tbaa_type_desc}> // CHECK: #[[GLOB1COMMON:.+]] = #llvm.tbaa_type_desc}> // CHECK: #[[GLOB1GLOB2:.+]] = #llvm.tbaa_type_desc}> // CHECK: #[[GLOB3:.+]] = #llvm.tbaa_type_desc}> diff --git a/flang/test/Transforms/tbaa-for-local-vars.fir b/flang/test/Transforms/tbaa-for-local-vars.fir index 4eb6b2ecf31c4..fde5c400c75ed 100644 --- a/flang/test/Transforms/tbaa-for-local-vars.fir +++ b/flang/test/Transforms/tbaa-for-local-vars.fir @@ -35,18 +35,22 @@ // scope's TBAA tree. // RUN: fir-opt --fir-add-alias-tags %s | FileCheck %s -// CHECK: #[[$ATTR_0:.+]] = #llvm.tbaa_root -// CHECK: #[[$ATTR_1:.+]] = #llvm.tbaa_root -// CHECK: #[[$ATTR_2:.+]] = #llvm.tbaa_type_desc}> -// CHECK: #[[$ATTR_3:.+]] = #llvm.tbaa_type_desc}> -// CHECK: #[[$ATTR_4:.+]] = #llvm.tbaa_type_desc}> -// CHECK: #[[$ATTR_5:.+]] = #llvm.tbaa_type_desc}> -// CHECK: #[[$ATTR_6:.+]] = #llvm.tbaa_type_desc}> -// CHECK: #[[$ATTR_7:.+]] = #llvm.tbaa_type_desc}> -// CHECK: #[[$ATTR_9:.+]] = #llvm.tbaa_type_desc}> -// CHECK: #[[$ATTR_10:.+]] = #llvm.tbaa_type_desc}> -// CHECK: #[[$ATTR_12:.+]] = #llvm.tbaa_tag -// CHECK: #[[$ATTR_13:.+]] = #llvm.tbaa_tag +// CHECK: #[[$SCOPE_2:.+]] = #llvm.tbaa_root +// CHECK: #[[$SCOPE_1:.+]] = #llvm.tbaa_root +// CHECK: #[[$ANY_ACCESS2:.+]] = #llvm.tbaa_type_desc}> +// CHECK: #[[$ANY_ACCESS1:.+]] = #llvm.tbaa_type_desc}> +// CHECK: #[[$ANY_DATA2:.+]] = #llvm.tbaa_type_desc}> +// CHECK: #[[$ANY_DATA1:.+]] = #llvm.tbaa_type_desc}> +// CHECK: #[[$DUMMY_ARG2:.+]] = #llvm.tbaa_type_desc}> +// CHECK: #[[$ALLOCATED_DATA1:.+]] = #llvm.tbaa_type_desc}> +// CHECK: #[[$DUMMY_ARG1:.+]] = #llvm.tbaa_type_desc}> +// CHECK: #[[$ALLOCATED_DATA1_TAG:.+]] = #llvm.tbaa_tag +// CHECK: #[[$BAR_THIS2:.+]] = #llvm.tbaa_type_desc}> +// CHECK: #[[$TEST_VAR1:.+]] = #llvm.tbaa_type_desc}> +// CHECK: 
#[[$TEST_ARG1:.+]] = #llvm.tbaa_type_desc}> +// CHECK: #[[$BAR_THIS2_TAG:.+]] = #llvm.tbaa_tag +// CHECK: #[[$TEST_VAR1_TAG:.+]] = #llvm.tbaa_tag +// CHECK: #[[$TEST_ARG2_TAG:.+]] = #llvm.tbaa_tag // CHECK-LABEL: func.func @_QMmPtest( // CHECK-SAME: %[[ARG0:.*]]: !fir.ref {fir.bindc_name = "arg"}) { @@ -61,10 +65,10 @@ // CHECK: %[[VAL_10:.*]] = fir.dummy_scope : !fir.dscope // CHECK: %[[VAL_11:.*]] = fir.declare %[[VAL_9]] dummy_scope %[[VAL_10]] {fortran_attrs = #fir.var_attrs, uniq_name = "_QMmFbarEthis"} : (!fir.class>, !fir.dscope) -> !fir.class> // CHECK: %[[VAL_12:.*]] = fir.coordinate_of %[[VAL_11]], x : (!fir.class>) -> !fir.ref -// CHECK: fir.store %[[VAL_0]] to %[[VAL_12]] {tbaa = [#[[$ATTR_12]]]} : !fir.ref +// CHECK: fir.store %[[VAL_0]] to %[[VAL_12]] {tbaa = [#[[$BAR_THIS2_TAG]]]} : !fir.ref // CHECK: %[[VAL_13:.*]] = fir.declare %[[VAL_1]] {uniq_name = ".tmp.func_result"} : (!fir.ref>) -> !fir.ref> // CHECK: %[[VAL_14:.*]] = fir.coordinate_of %[[VAL_13]], x : (!fir.ref>) -> !fir.ref -// CHECK: %[[VAL_16:.*]] = fir.load %[[VAL_14]] {tbaa = [#[[$ATTR_13]]]} : !fir.ref +// CHECK: %[[VAL_16:.*]] = fir.load %[[VAL_14]] {tbaa = [#[[$ALLOCATED_DATA1_TAG]]]} : !fir.ref module attributes {dlti.dl_spec = #dlti.dl_spec : vector<4xi64>, i1 = dense<8> : vector<2xi64>, i8 = dense<8> : vector<2xi64>, i16 = dense<16> : vector<2xi64>, i32 = dense<32> : vector<2xi64>, i64 = dense<[32, 64]> : vector<2xi64>, f16 = dense<16> : vector<2xi64>, f64 = dense<64> : vector<2xi64>, f128 = dense<128> : vector<2xi64>, "dlti.endianness" = "little">, llvm.data_layout = ""} { func.func @_QMmPtest(%arg0: !fir.ref {fir.bindc_name = "arg"}) { %cst = arith.constant 1.000000e+00 : f32 diff --git a/flang/test/Transforms/tbaa-with-dummy-scope.fir b/flang/test/Transforms/tbaa-with-dummy-scope.fir index 4ae2b8efe2581..d7f33776150ae 100644 --- a/flang/test/Transforms/tbaa-with-dummy-scope.fir +++ b/flang/test/Transforms/tbaa-with-dummy-scope.fir @@ -24,7 +24,7 @@ // CHECK: #[[TARGETDATA:.+]] 
= #llvm.tbaa_type_desc}> // CHECK: #[[$ATTR_6:.+]] = #llvm.tbaa_type_desc}> // CHECK: #[[$ATTR_7:.+]] = #llvm.tbaa_type_desc}> -// CHECK: #[[TARGETTAG:.+]] = #llvm.tbaa_tag +// CHECK: #[[TARGETDATA_TAG:.+]] = #llvm.tbaa_tag // CHECK: #[[$ATTR_8:.+]] = #llvm.tbaa_type_desc}> // CHECK: #[[$ATTR_9:.+]] = #llvm.tbaa_type_desc}> // CHECK: #[[$ATTR_10:.+]] = #llvm.tbaa_type_desc}> @@ -34,8 +34,8 @@ // CHECK: #[[$ATTR_14:.+]] = #llvm.tbaa_tag // CHECK: #[[$ATTR_15:.+]] = #llvm.tbaa_tag // CHECK: func.func @test1( -// CHECK: %[[VAL_5:.*]] = fir.load %{{.*}} {tbaa = [#[[TARGETTAG]]]} : !fir.ref -// CHECK: fir.store %{{.*}} {tbaa = [#[[TARGETTAG]]]} : !fir.ref +// CHECK: %[[VAL_5:.*]] = fir.load %{{.*}} {tbaa = [#[[TARGETDATA_TAG]]]} : !fir.ref +// CHECK: fir.store %{{.*}} {tbaa = [#[[TARGETDATA_TAG]]]} : !fir.ref // CHECK: %[[VAL_6:.*]] = fir.dummy_scope : !fir.dscope // CHECK: %[[VAL_9:.*]] = fir.load %{{.*}} {tbaa = [#[[$ATTR_12]]]} : !fir.ref // CHECK: fir.store %{{.*}} {tbaa = [#[[$ATTR_13]]]} : !fir.ref @@ -83,23 +83,21 @@ func.func @test1(%arg0: !fir.ref {fir.bindc_name = "x", fir.target}, %arg1: // CHECK: #[[$ATTR_33:.+]] = #llvm.tbaa_root // CHECK: #[[$ATTR_34:.+]] = #llvm.tbaa_type_desc}> // CHECK: #[[$ATTR_35:.+]] = #llvm.tbaa_type_desc}> -// CHECK: #[[$ATTR_36:.+]] = #llvm.tbaa_type_desc}> -// CHECK: #[[$ATTR_37:.+]] = #llvm.tbaa_type_desc}> -// CHECK: #[[CALLERTARGETDATA:.+]] = #llvm.tbaa_type_desc}> -// CHECK: #[[CALLEETARGETDATA:.+]] = #llvm.tbaa_type_desc}> -// CHECK: #[[$ATTR_40:.+]] = #llvm.tbaa_type_desc}> -// CHECK: #[[$ATTR_38:.+]] = #llvm.tbaa_type_desc}> -// CHECK: #[[$ATTR_39:.+]] = #llvm.tbaa_type_desc}> -// CHECK: #[[$ATTR_45:.+]] = #llvm.tbaa_type_desc}> -// CHECK: #[[$ATTR_50:.+]] = #llvm.tbaa_tag +// CHECK: #[[$CALLERANYDATA:.+]] = #llvm.tbaa_type_desc}> +// CHECK: #[[$CALLEEANYDATA:.+]] = #llvm.tbaa_type_desc}> +// CHECK: #[[$ATTR_38:.+]] = #llvm.tbaa_type_desc}> +// CHECK: #[[$ATTR_39:.+]] = #llvm.tbaa_type_desc}> +// CHECK: #[[$ATTR_40:.+]] = 
#llvm.tbaa_type_desc}> // CHECK: #[[$ATTR_41:.+]] = #llvm.tbaa_type_desc}> // CHECK: #[[$ATTR_42:.+]] = #llvm.tbaa_type_desc}> // CHECK: #[[$ATTR_43:.+]] = #llvm.tbaa_type_desc}> // CHECK: #[[$ATTR_44:.+]] = #llvm.tbaa_type_desc}> +// CHECK: #[[$ATTR_45:.+]] = #llvm.tbaa_type_desc}> // CHECK: #[[$ATTR_46:.+]] = #llvm.tbaa_tag // CHECK: #[[$ATTR_47:.+]] = #llvm.tbaa_tag // CHECK: #[[$ATTR_48:.+]] = #llvm.tbaa_tag // CHECK: #[[$ATTR_49:.+]] = #llvm.tbaa_tag +// CHECK: #[[$ATTR_50:.+]] = #llvm.tbaa_tag // CHECK: func.func @_QMtestPcaller( // CHECK-SAME: %[[VAL_0:.*]]: !fir.ref {fir.bindc_name = "z"}) { // CHECK: %[[VAL_1:.*]] = fir.dummy_scope : !fir.dscope diff --git a/flang/test/Transforms/tbaa-with-dummy-scope2.fir b/flang/test/Transforms/tbaa-with-dummy-scope2.fir index 54902ca7d41e1..6f5ed69fbc9c6 100644 --- a/flang/test/Transforms/tbaa-with-dummy-scope2.fir +++ b/flang/test/Transforms/tbaa-with-dummy-scope2.fir @@ -44,16 +44,15 @@ func.func @_QPtest1() attributes {noinline} { } // CHECK: #[[$ATTR_0:.+]] = #llvm.tbaa_root // CHECK: #[[$ATTR_1:.+]] = #llvm.tbaa_type_desc}> -// CHECK: #[[$ATTR_2:.+]] = #llvm.tbaa_type_desc}> -// CHECK: #[[$TARGETDATA:.+]] = #llvm.tbaa_type_desc}> -// CHECK: #[[$ATTR_3:.+]] = #llvm.tbaa_type_desc}> -// CHECK: #[[$LOCAL_ATTR_0:.+]] = #llvm.tbaa_type_desc}> -// CHECK: #[[$ATTR_5:.+]] = #llvm.tbaa_type_desc}> -// CHECK: #[[$ATTR_4:.+]] = #llvm.tbaa_type_desc}> -// CHECK: #[[$ATTR_7:.+]] = #llvm.tbaa_tag +// CHECK: #[[$ANYDATA:.+]] = #llvm.tbaa_type_desc}> +// CHECK: #[[$LOCAL_ATTR_0:.+]] = #llvm.tbaa_type_desc}> +// CHECK: #[[$ATTR_3:.+]] = #llvm.tbaa_type_desc}> +// CHECK: #[[$ATTR_4:.+]] = #llvm.tbaa_type_desc}> // CHECK: #[[$LOCAL_ATTR_1:.+]] = #llvm.tbaa_type_desc}> +// CHECK: #[[$ATTR_5:.+]] = #llvm.tbaa_type_desc}> // CHECK: #[[$ATTR_6:.+]] = #llvm.tbaa_type_desc}> // CHECK: #[[$LOCAL_ATTR_2:.+]] = #llvm.tbaa_tag +// CHECK: #[[$ATTR_7:.+]] = #llvm.tbaa_tag // CHECK: #[[$ATTR_8:.+]] = #llvm.tbaa_tag // CHECK-LABEL: func.func 
@_QPtest1() attributes {noinline} { // CHECK: %[[VAL_2:.*]] = fir.alloca i32 {bindc_name = "y", uniq_name = "_QFtest1FinnerEy"} @@ -90,19 +89,18 @@ func.func @_QPtest2() attributes {noinline} { } // CHECK: #[[$ATTR_0:.+]] = #llvm.tbaa_root // CHECK: #[[$ATTR_1:.+]] = #llvm.tbaa_root -// CHECK: #[[$ATTR_2:.+]] = #llvm.tbaa_type_desc}> -// CHECK: #[[$ATTR_3:.+]] = #llvm.tbaa_type_desc}> -// CHECK: #[[$ATTR_4:.+]] = #llvm.tbaa_type_desc}> -// CHECK: #[[$ATTR_5:.+]] = #llvm.tbaa_type_desc}> -// CHECK: #[[$TARGETDATA_0:.+]] = #llvm.tbaa_type_desc}> -// CHECK: #[[$ATTR_6:.+]] = #llvm.tbaa_type_desc}> -// CHECK: #[[$LOCAL_ATTR_0:.+]] = #llvm.tbaa_type_desc}> -// CHECK: #[[$ATTR_8:.+]] = #llvm.tbaa_type_desc}> -// CHECK: #[[$ATTR_7:.+]] = #llvm.tbaa_type_desc}> -// CHECK: #[[$ATTR_10:.+]] = #llvm.tbaa_tag +// CHECK: #[[$ANY_ACCESS_0:.+]] = #llvm.tbaa_type_desc}> +// CHECK: #[[$ANY_ACCESS_1:.+]] = #llvm.tbaa_type_desc}> +// CHECK: #[[$ANY_DATA_0:.+]] = #llvm.tbaa_type_desc}> +// CHECK: #[[$ANY_DATA_1:.+]] = #llvm.tbaa_type_desc}> +// CHECK: #[[$LOCAL_ATTR_0:.+]] = #llvm.tbaa_type_desc}> +// CHECK: #[[$ATTR_6:.+]] = #llvm.tbaa_type_desc}> +// CHECK: #[[$ATTR_7:.+]] = #llvm.tbaa_type_desc}> // CHECK: #[[$LOCAL_ATTR_1:.+]] = #llvm.tbaa_type_desc}> +// CHECK: #[[$ATTR_8:.+]] = #llvm.tbaa_type_desc}> // CHECK: #[[$ATTR_9:.+]] = #llvm.tbaa_type_desc}> // CHECK: #[[$LOCAL_ATTR_2:.+]] = #llvm.tbaa_tag +// CHECK: #[[$ATTR_10:.+]] = #llvm.tbaa_tag // CHECK: #[[$ATTR_11:.+]] = #llvm.tbaa_tag // CHECK-LABEL: func.func @_QPtest2() attributes {noinline} { // CHECK: %[[VAL_2:.*]] = fir.alloca i32 {bindc_name = "y", uniq_name = "_QFtest2FinnerEy"} diff --git a/flang/test/Transforms/tbaa2.fir b/flang/test/Transforms/tbaa2.fir index a594e6b32fdac..9b5307ba69d17 100644 --- a/flang/test/Transforms/tbaa2.fir +++ b/flang/test/Transforms/tbaa2.fir @@ -48,18 +48,10 @@ module attributes {dlti.dl_spec = #dlti.dl_spec : vector<4 // CHECK: #[[ROOT:.+]] = #llvm.tbaa_root // CHECK: #[[ANY_ACCESS:.+]] = 
#llvm.tbaa_type_desc}> // CHECK: #[[ANY_DATA:.+]] = #llvm.tbaa_type_desc}> -// CHECK: #[[TARGETDATA:.+]] = #llvm.tbaa_type_desc}> +// CHECK: #[[ANY_GLBL:.+]] = #llvm.tbaa_type_desc}> +// CHECK: #[[ANY_LOCAL:.+]] = #llvm.tbaa_type_desc}> // CHECK: #[[ANY_ARG:.+]] = #llvm.tbaa_type_desc}> -// CHECK: #[[ANY_GLBL:.+]] = #llvm.tbaa_type_desc}> -// CHECK: #[[ANY_LOCAL:.+]] = #llvm.tbaa_type_desc}> -// CHECK: #[[ARG_LOW:.+]] = #llvm.tbaa_type_desc}> -// CHECK: #[[ANY_DIRECT:.+]] = #llvm.tbaa_type_desc}> -// CHECK: #[[ARG_Z:.+]] = #llvm.tbaa_type_desc}> -// CHECK: #[[ARG_Y:.+]] = #llvm.tbaa_type_desc}> - -// CHECK: #[[ARG_LOW_TAG:.+]] = #llvm.tbaa_tag -// CHECK: #[[ARG_Z_TAG:.+]] = #llvm.tbaa_tag -// CHECK: #[[ARG_Y_TAG:.+]] = #llvm.tbaa_tag +// CHECK: #[[ANY_DIRECT:.+]] = #llvm.tbaa_type_desc}> // CHECK: #[[GLBL_ZSTART:.+]] = #llvm.tbaa_type_desc}> // CHECK: #[[GLBL_ZSTOP:.+]] = #llvm.tbaa_type_desc}> @@ -69,10 +61,13 @@ module attributes {dlti.dl_spec = #dlti.dl_spec : vector<4 // CHECK: #[[LOCAL2_ALLOC:.+]] = #llvm.tbaa_type_desc}> // CHECK: #[[GLBL_XSTART:.+]] = #llvm.tbaa_type_desc}> // CHECK: #[[LOCAL3_ALLOC:.+]] = #llvm.tbaa_type_desc}> +// CHECK: #[[ARG_LOW:.+]] = #llvm.tbaa_type_desc}> // CHECK: #[[LOCAL4_ALLOC:.+]] = #llvm.tbaa_type_desc}> // CHECK: #[[DIRECT_A:.+]] = #llvm.tbaa_type_desc}> // CHECK: #[[DIRECT_B:.+]] = #llvm.tbaa_type_desc}> +// CHECK: #[[ARG_Z:.+]] = #llvm.tbaa_type_desc}> // CHECK: #[[GLBL_DYINV:.+]] = #llvm.tbaa_type_desc}> +// CHECK: #[[ARG_Y:.+]] = #llvm.tbaa_type_desc}> // CHECK: #[[LOCAL5_ALLOC:.+]] = #llvm.tbaa_type_desc}> // CHECK: #[[GLBL_ZSTART_TAG:.+]] = #llvm.tbaa_tag @@ -83,10 +78,13 @@ module attributes {dlti.dl_spec = #dlti.dl_spec : vector<4 // CHECK: #[[LOCAL2_ALLOC_TAG:.+]] = #llvm.tbaa_tag // CHECK: #[[GLBL_XSTART_TAG:.+]] = #llvm.tbaa_tag // CHECK: #[[LOCAL3_ALLOC_TAG:.+]] = #llvm.tbaa_tag +// CHECK: #[[ARG_LOW_TAG:.+]] = #llvm.tbaa_tag // CHECK: #[[LOCAL4_ALLOC_TAG:.+]] = #llvm.tbaa_tag // CHECK: #[[DIRECT_A_TAG:.+]] = 
#llvm.tbaa_tag // CHECK: #[[DIRECT_B_TAG:.+]] = #llvm.tbaa_tag +// CHECK: #[[ARG_Z_TAG:.+]] = #llvm.tbaa_tag // CHECK: #[[GLBL_DYINV_TAG:.+]] = #llvm.tbaa_tag +// CHECK: #[[ARG_Y_TAG:.+]] = #llvm.tbaa_tag // CHECK: #[[LOCAL5_ALLOC_TAG:.+]] = #llvm.tbaa_tag func.func @_QMmodPcallee(%arg0: !fir.box> {fir.bindc_name = "z"}, %arg1: !fir.box> {fir.bindc_name = "y"}, %arg2: !fir.ref>>> {fir.bindc_name = "low"}) { diff --git a/flang/test/Transforms/tbaa3.fir b/flang/test/Transforms/tbaa3.fir index abcb7e000bac1..7a9a819ea102a 100644 --- a/flang/test/Transforms/tbaa3.fir +++ b/flang/test/Transforms/tbaa3.fir @@ -1,5 +1,4 @@ -// RUN: fir-opt --fir-add-alias-tags %s | FileCheck --check-prefixes=ALL,DEFAULT %s -// RUN: fir-opt --fir-add-alias-tags --local-alloc-tbaa %s | FileCheck --check-prefixes=ALL,LOCAL %s +// RUN: fir-opt --fir-add-alias-tags %s | FileCheck --check-prefixes=ALL %s // Test AddAliasTagsPass creating sub-tree for TARGET/POINTER variables. @@ -56,56 +55,57 @@ // | |- "dummy arg data/_QFtest1Edummyas" // | |- "dummy arg data/_QFtest1Edummya" // | -// |- "target data" <- all pointers and taget dummys -// | -// |- "global data" -// | | -// | |- "global data/_QMdataEglob" -// | |- "global data/_QMdataEglobt" -// | -// |- "direct data" -// | | -// | |- "direct data/_QMdataEgloba" -// | |- "direct data/_QMdataEglobat" +// |- "target data" <--- all pointers and target dummy arguments go here +// | |- "target data/_QMdataEglobt" +// | |- "target data/_QMdataEglobat" +// | |- "target data/_QFtest1Elocalt" +// | |- "target data/_QFtest1Elocalat" +// | +// |- "global data" +// | | +// | |- "global data/_QMdataEglob" +// | +// |- "direct data" +// | | +// | |- "direct data/_QMdataEgloba" +// | +// |- "allocated data" // | -// |- "allocated data" -// | -// |- "allocated data/_QFtest1Elocal" -// |- "allocated data/_QFtest1Elocalt" -// |- "allocated data/_QFtest1Elocala" -// |- "allocated data/_QFtest1Elocalat" +// |- "allocated data/_QFtest1Elocal" +// |- "allocated 
data/_QFtest1Elocala" // ALL: #[[FUNCROOT:.+]] = #llvm.tbaa_root // ALL: #[[ANYACCESS:.+]] = #llvm.tbaa_type_desc}> // ALL: #[[ANYDATA:.+]] = #llvm.tbaa_type_desc}> +// ALL: #[[GLOBALDATA:.+]] = #llvm.tbaa_type_desc}> // ALL: #[[TARGETDATA:.+]] = #llvm.tbaa_type_desc}> +// ALL: #[[DIRECTDATA:.+]] = #llvm.tbaa_type_desc}> // ALL: #[[DUMMYDATA:.+]] = #llvm.tbaa_type_desc}> +// ALL: #[[LOCALDATA:.+]] = #llvm.tbaa_type_desc}> // ALL: #[[TARGETTAG:.+]] = #llvm.tbaa_tag -// ALL: #[[GLOBALDATA:.+]] = #llvm.tbaa_type_desc}> -// ALL: #[[DIRECTDATA:.+]] = #llvm.tbaa_type_desc}> +// ALL: #[[GLOBVAR:.+]] = #llvm.tbaa_type_desc}> +// ALL: #[[GLOBTVAR:.+]] = #llvm.tbaa_type_desc}> +// ALL: #[[GLOBAVAR:.+]] = #llvm.tbaa_type_desc}> +// ALL: #[[GLOBATVAR:.+]] = #llvm.tbaa_type_desc}> // ALL: #[[DUMMYFVAR:.+]] = #llvm.tbaa_type_desc}> // ALL: #[[DUMMYASVAR:.+]] = #llvm.tbaa_type_desc}> // ALL: #[[DUMMYAVAR:.+]] = #llvm.tbaa_type_desc}> -// LOCAL: #[[LOCALDATA:.+]] = #llvm.tbaa_type_desc}> -// ALL: #[[DUMMYFTAG:.+]] = #llvm.tbaa_tag -// ALL: #[[DUMMYASTAG:.+]] = #llvm.tbaa_tag -// ALL: #[[DUMMYATAG:.+]] = #llvm.tbaa_tag -// ALL: #[[GLOBVAR:.+]] = #llvm.tbaa_type_desc}> -// ALL: #[[GLOBTVAR:.+]] = #llvm.tbaa_type_desc}> -// ALL: #[[GLOBAVAR:.+]] = #llvm.tbaa_type_desc}> -// ALL: #[[GLOBATVAR:.+]] = #llvm.tbaa_type_desc}> -// LOCAL: #[[LOCALVAR:.+]] = #llvm.tbaa_type_desc}> -// LOCAL: #[[LOCALTVAR:.+]] = #llvm.tbaa_type_desc}> -// LOCAL: #[[LOCALAVAR:.+]] = #llvm.tbaa_type_desc}> -// LOCAL: #[[LOCALATVAR:.+]] = #llvm.tbaa_type_desc}> +// ALL: #[[LOCALVAR:.+]] = #llvm.tbaa_type_desc}> +// ALL: #[[LOCALTVAR:.+]] = #llvm.tbaa_type_desc}> +// ALL: #[[LOCALAVAR:.+]] = #llvm.tbaa_type_desc}> +// ALL: #[[LOCALATVAR:.+]] = #llvm.tbaa_type_desc}> + // ALL: #[[GLOBTAG:.+]] = #llvm.tbaa_tag // ALL: #[[GLOBTTAG:.+]] = #llvm.tbaa_tag // ALL: #[[GLOBATAG:.+]] = #llvm.tbaa_tag // ALL: #[[GLOBATTAG:.+]] = #llvm.tbaa_tag -// LOCAL: #[[LOCALTAG:.+]] = #llvm.tbaa_tag -// LOCAL: #[[LOCALTTAG:.+]] = 
#llvm.tbaa_tag -// LOCAL: #[[LOCALATAG:.+]] = #llvm.tbaa_tag -// LOCAL: #[[LOCALATTAG:.+]] = #llvm.tbaa_tag +// ALL: #[[DUMMYFTAG:.+]] = #llvm.tbaa_tag +// ALL: #[[DUMMYASTAG:.+]] = #llvm.tbaa_tag +// ALL: #[[DUMMYATAG:.+]] = #llvm.tbaa_tag +// ALL: #[[LOCALTAG:.+]] = #llvm.tbaa_tag +// ALL: #[[LOCALTTAG:.+]] = #llvm.tbaa_tag +// ALL: #[[LOCALATAG:.+]] = #llvm.tbaa_tag +// ALL: #[[LOCALATTAG:.+]] = #llvm.tbaa_tag module attributes {dlti.dl_spec = #dlti.dl_spec : vector<4xi64>, i1 = dense<8> : vector<2xi64>, i8 = dense<8> : vector<2xi64>, i16 = dense<16> : vector<2xi64>, i32 = dense<32> : vector<2xi64>, i64 = dense<[32, 64]> : vector<2xi64>, f16 = dense<16> : vector<2xi64>, f64 = dense<64> : vector<2xi64>, f128 = dense<128> : vector<2xi64>, "dlti.endianness" = "little">, llvm.data_layout = ""} { fir.global @_QMdataEglob : !fir.array<10xf32> { @@ -263,13 +263,11 @@ module attributes {dlti.dl_spec = #dlti.dl_spec : vector<4 fir.store %cst to %67 : !fir.ref %68 = fir.array_coor %20(%5) %c1 : (!fir.ref>, !fir.shape<1>, index) -> !fir.ref // real :: local(10) -// DEFAULT: fir.store{{.*}}tbaa -// LOCAL: fir.store{{.*}}{tbaa = [#[[LOCALTAG]]]} : !fir.ref +// ALL: fir.store{{.*}}{tbaa = [#[[LOCALTAG]]]} : !fir.ref fir.store %cst to %68 : !fir.ref %69 = fir.array_coor %33(%5) %c1 : (!fir.ref>, !fir.shape<1>, index) -> !fir.ref // real, target :: localt(10) -// DEFAULT: fir.store{{.*}}tbaa -// LOCAL: fir.store{{.*}}{tbaa = [#[[LOCALTTAG]]]} : !fir.ref +// ALL: fir.store{{.*}}{tbaa = [#[[LOCALTTAG]]]} : !fir.ref fir.store %cst to %69 : !fir.ref // ALL-NOT: fir.load{{.*}}tbaa %70 = fir.load %25 : !fir.ref>>> @@ -278,8 +276,7 @@ module attributes {dlti.dl_spec = #dlti.dl_spec : vector<4 %73 = fir.shape_shift %72#0, %72#1 : (index, index) -> !fir.shapeshift<1> %74 = fir.array_coor %71(%73) %c1 : (!fir.heap>, !fir.shapeshift<1>, index) -> !fir.ref // real, allocatable :: locala(:) -// DEFAULT: fir.store{{.*}}tbaa -// LOCAL: fir.store{{.*}}{tbaa = [#[[LOCALATAG]]]} : !fir.ref +// 
ALL: fir.store{{.*}}{tbaa = [#[[LOCALATAG]]]} : !fir.ref fir.store %cst to %74 : !fir.ref // ALL-NOT: fir.load{{.*}}tbaa %75 = fir.load %27 : !fir.ref>>> @@ -288,8 +285,7 @@ module attributes {dlti.dl_spec = #dlti.dl_spec : vector<4 %78 = fir.shape_shift %77#0, %77#1 : (index, index) -> !fir.shapeshift<1> %79 = fir.array_coor %76(%78) %c1 : (!fir.heap>, !fir.shapeshift<1>, index) -> !fir.ref // real, allocatable, target :: localat(:) -// DEFAULT: fir.store{{.*}}tbaa -// LOCAL: fir.store{{.*}}{tbaa = [#[[LOCALATTAG]]]} : !fir.ref +// ALL: fir.store{{.*}}{tbaa = [#[[LOCALATTAG]]]} : !fir.ref fir.store %cst to %79 : !fir.ref // ALL-NOT: fir.load{{.*}}tbaa %80 = fir.load %31 : !fir.ref>>> @@ -297,8 +293,7 @@ module attributes {dlti.dl_spec = #dlti.dl_spec : vector<4 %82 = fir.shift %81#0 : (index) -> !fir.shift<1> %83 = fir.array_coor %80(%82) %c1 : (!fir.box>>, !fir.shift<1>, index) -> !fir.ref // real, pointer :: localp(:) -// DEFAULT: fir.store{{.*}}tbaa -// LOCAL: fir.store{{.*}}{tbaa = [#[[TARGETTAG]]]} : !fir.ref +// ALL: fir.store{{.*}}{tbaa = [#[[TARGETTAG]]]} : !fir.ref fir.store %cst to %83 : !fir.ref // ALL-NOT: fir.load{{.*}}tbaa %84 = fir.load %27 : !fir.ref>>> diff --git a/flang/test/Transforms/tbaa4.fir b/flang/test/Transforms/tbaa4.fir index c368a3d06c2ba..5e29014af8935 100644 --- a/flang/test/Transforms/tbaa4.fir +++ b/flang/test/Transforms/tbaa4.fir @@ -1,12 +1,10 @@ // Test TBAA tags for common and equivalence. 
-// RUN: fir-opt --fir-add-alias-tags --split-input-file %s | FileCheck --check-prefixes=ALL,DEFAULT %s -// RUN: fir-opt --fir-add-alias-tags --local-alloc-tbaa --split-input-file %s | FileCheck --check-prefixes=ALL,LOCAL %s +// RUN: fir-opt --fir-add-alias-tags --split-input-file %s | FileCheck --check-prefixes=ALL %s // ALL: #[[ROOT:.+]] = #llvm.tbaa_root // ALL: #[[ANY:.+]] = #llvm.tbaa_type_desc}> // ALL: #[[ANYDATA:.+]] = #llvm.tbaa_type_desc}> -// ALL: #[[TARGETDATA:.+]] = #llvm.tbaa_type_desc}> -// ALL: #[[GLOBALDATA:.+]] = #llvm.tbaa_type_desc}> +// ALL: #[[GLOBALDATA:.+]] = #llvm.tbaa_type_desc}> // ALL: #[[BLK:.+]] = #llvm.tbaa_type_desc}> // ALL: #[[BLK_A:.+]] = #llvm.tbaa_type_desc}> // ALL: #[[BLK_C:.+]] = #llvm.tbaa_type_desc}> @@ -54,19 +52,17 @@ module attributes {dlti.dl_spec = #dlti.dl_spec : vector<4 // ----- -// LOCAL: #[[ROOT:.+]] = #llvm.tbaa_root -// LOCAL: #[[ANY:.+]] = #llvm.tbaa_type_desc}> -// LOCAL: #[[ANYDATA:.+]] = #llvm.tbaa_type_desc}> -// LOCAL: #[[TARGETDATA:.+]] = #llvm.tbaa_type_desc}> -// LOCAL: #[[ALLOCATEDDATA:.+]] = #llvm.tbaa_type_desc}> -// LOCAL: #[[EQUIV:.+]] = #llvm.tbaa_type_desc}> -// LOCAL: #[[TAG:.+]] = #llvm.tbaa_tag +// ALL: #[[ROOT:.+]] = #llvm.tbaa_root +// ALL: #[[ANY:.+]] = #llvm.tbaa_type_desc}> +// ALL: #[[ANYDATA:.+]] = #llvm.tbaa_type_desc}> +// ALL: #[[ALLOCATEDDATA:.+]] = #llvm.tbaa_type_desc}> +// ALL: #[[EQUIV:.+]] = #llvm.tbaa_type_desc}> +// ALL: #[[TAG:.+]] = #llvm.tbaa_tag // ALL-LABEL: func.func @_QPtest_local_equiv() { -// LOCAL: fir.store{{.*}}{tbaa = [#[[TAG]]]} : !fir.ptr -// LOCAL: fir.store{{.*}}{tbaa = [#[[TAG]]]} : !fir.ref -// LOCAL: fir.store{{.*}}{tbaa = [#[[TAG]]]} : !fir.ptr -// DEFAULT-NOT: fir.store{{.}}tbaa +// ALL: fir.store{{.*}}{tbaa = [#[[TAG]]]} : !fir.ptr +// ALL: fir.store{{.*}}{tbaa = [#[[TAG]]]} : !fir.ref +// ALL: fir.store{{.*}}{tbaa = [#[[TAG]]]} : !fir.ptr module attributes {dlti.dl_spec = #dlti.dl_spec : vector<4xi64>, i1 = dense<8> : vector<2xi64>, i8 = dense<8> : 
vector<2xi64>, i16 = dense<16> : vector<2xi64>, i32 = dense<32> : vector<2xi64>, i64 = dense<[32, 64]> : vector<2xi64>, f16 = dense<16> : vector<2xi64>, f64 = dense<64> : vector<2xi64>, f128 = dense<128> : vector<2xi64>, "dlti.endianness" = "little">, llvm.data_layout = ""} { func.func @_QPtest_local_equiv() { %c1 = arith.constant 1 : index @@ -98,8 +94,7 @@ func.func @_QPtest_local_equiv() { // ALL: #[[ROOT:.+]] = #llvm.tbaa_root // ALL: #[[ANY:.+]] = #llvm.tbaa_type_desc}> // ALL: #[[ANYDATA:.+]] = #llvm.tbaa_type_desc}> -// ALL: #[[TARGETDATA:.+]] = #llvm.tbaa_type_desc}> -// ALL: #[[GLOBALDATA:.+]] = #llvm.tbaa_type_desc}> +// ALL: #[[GLOBALDATA:.+]] = #llvm.tbaa_type_desc}> // ALL: #[[EQUIV:.+]] = #llvm.tbaa_type_desc}> // ALL: #[[TAG:.+]] = #llvm.tbaa_tag module attributes {dlti.dl_spec = #dlti.dl_spec : vector<4xi64>, i1 = dense<8> : vector<2xi64>, i8 = dense<8> : vector<2xi64>, i16 = dense<16> : vector<2xi64>, i32 = dense<32> : vector<2xi64>, i64 = dense<[32, 64]> : vector<2xi64>, f16 = dense<16> : vector<2xi64>, f64 = dense<64> : vector<2xi64>, f128 = dense<128> : vector<2xi64>, "dlti.endianness" = "little">, llvm.data_layout = ""} { @@ -143,8 +138,7 @@ module attributes {dlti.dl_spec = #dlti.dl_spec : vector<4 // ALL: #[[ROOT:.+]] = #llvm.tbaa_root // ALL: #[[ANY:.+]] = #llvm.tbaa_type_desc}> // ALL: #[[ANYDATA:.+]] = #llvm.tbaa_type_desc}> -// ALL: #[[TARGETDATA:.+]] = #llvm.tbaa_type_desc}> -// ALL: #[[GLOBALDATA:.+]] = #llvm.tbaa_type_desc}> +// ALL: #[[GLOBALDATA:.+]] = #llvm.tbaa_type_desc}> // ALL: #[[EQUIV:.+]] = #llvm.tbaa_type_desc}> // ALL: #[[TAG:.+]] = #llvm.tbaa_tag From 8ec2112ec8b43a0fdf8f5e000f0c6376b6105987 Mon Sep 17 00:00:00 2001 From: Tom Eccles Date: Mon, 1 Dec 2025 10:07:19 +0000 Subject: [PATCH 20/39] [OMPIRBuilder] re-land cancel barriers patch #164586 (#169931) A barrier will pause execution until all threads reach it. If some go to a different barrier then we deadlock. 
This manifests in that the finalization callback must only be run once. Fix by ensuring we always go through the same finalization block whether the thread in cancelled or not and no matter which cancellation point causes the cancellation. The old callback only affected PARALLEL, so it has been moved into the code generating PARALLEL. For this reason, we don't need similar changes for other cancellable constructs. We need to create the barrier on the shared exit from the outlined function instead of only on the cancelled branch to make sure that threads exiting normally (without cancellation) meet the same barriers as those which were cancelled. For example, previously we might have generated code like ``` ... %ret = call i32 @__kmpc_cancel(...) %cond = icmp eq i32 %ret, 0 br i1 %cond, label %continue, label %cancel continue: // do the rest of the callback, eventually branching to %fini br label %fini cancel: // Populated by the callback: // unsafe: if any thread makes it to the end without being cancelled // it won't reach this barrier and then the program will deadlock %unused = call i32 @__kmpc_cancel_barrier(...) br label %fini fini: // run destructors etc ret ``` In the new version the barrier is moved into fini. I generate it *after* the destructors because the standard describes the barrier as occurring after the end of the parallel region. ``` ... %ret = call i32 @__kmpc_cancel(...) %cond = icmp eq i32 %ret, 0 br i1 %cond, label %continue, label %cancel continue: // do the rest of the callback, eventually branching to %fini br label %fini cancel: br label %fini fini: // run destructors etc // safe so long as every exit from the function happens via this block: %unused = call i32 @__kmpc_cancel_barrier(...) ret ``` To achieve this, the barrier is now generated alongside the finalization code instead of in the callback. This is the reason for the changes to the unit test. 
I'm unsure if I should keep the incorrect barrier generation callback only on the cancellation branch in clang with the OMPIRBuilder backend because that would match clang's ordinary codegen. Right now I have opted to remove it entirely because it is a deadlock waiting to happen. --- This re-lands #164586 with a small fix for a failing buildbot running address sanitizer on clang lit tests. In the previous version of the patch I added an insertion point guard "just to be safe" and never removed it. There isn't insertion point guarding on the other route out of this function and we do not preserve the insertion point around getFiniBB either so it is not needed here. The problem flagged by the sanitizers was because the saved insertion point pointed to an instruction which was then removed inside the FiniCB for some clang codegen functions. The instruction was freed when it was removed. Then accessing it to restore the insertion point was a use after free bug. --- clang/test/OpenMP/cancel_codegen.cpp | 40 ++--- clang/test/OpenMP/critical_codegen.cpp | 2 + clang/test/OpenMP/critical_codegen_attr.cpp | 2 + .../OpenMP/irbuilder_nested_parallel_for.c | 108 +++++++------ clang/test/OpenMP/masked_codegen.cpp | 2 + clang/test/OpenMP/master_codegen.cpp | 2 + clang/test/OpenMP/nested_loop_codegen.cpp | 4 + clang/test/OpenMP/ordered_codegen.cpp | 40 +++-- clang/test/OpenMP/parallel_codegen.cpp | 4 + .../parallel-private-reduction-worstcase.f90 | 5 +- .../llvm/Frontend/OpenMP/OMPIRBuilder.h | 35 +++-- llvm/lib/Frontend/OpenMP/OMPIRBuilder.cpp | 146 +++++++++--------- .../OpenMP/parallel_region_merging.ll | 24 ++- .../Frontend/OpenMPIRBuilderTest.cpp | 109 ++++++------- .../OpenMP/OpenMPToLLVMIRTranslation.cpp | 14 +- .../Target/LLVMIR/openmp-barrier-cancel.mlir | 14 +- mlir/test/Target/LLVMIR/openmp-cancel.mlir | 22 +-- .../LLVMIR/openmp-cancellation-point.mlir | 14 +- .../LLVMIR/openmp-outline-infinite-loop.mlir | 6 +- .../openmp-parallel-reduction-multiblock.mlir | 4 +- 
.../openmp-reduction-array-sections.mlir | 14 +- .../LLVMIR/openmp-reduction-init-arg.mlir | 4 +- .../LLVMIR/openmp-reduction-sections.mlir | 16 +- 23 files changed, 369 insertions(+), 262 deletions(-) diff --git a/clang/test/OpenMP/cancel_codegen.cpp b/clang/test/OpenMP/cancel_codegen.cpp index 16e7542a8e826..6090a91b6a3d9 100644 --- a/clang/test/OpenMP/cancel_codegen.cpp +++ b/clang/test/OpenMP/cancel_codegen.cpp @@ -774,8 +774,6 @@ for (int i = 0; i < argc; ++i) { // CHECK3-NEXT: call void @__kmpc_barrier(ptr @[[GLOB2:[0-9]+]], i32 [[OMP_GLOBAL_THREAD_NUM12]]) // CHECK3-NEXT: br label [[OMP_SECTION_LOOP_AFTER:%.*]] // CHECK3: omp_section_loop.after: -// CHECK3-NEXT: br label [[OMP_SECTION_LOOP_AFTERSECTIONS_FINI:%.*]] -// CHECK3: omp_section_loop.aftersections.fini: // CHECK3-NEXT: br label [[OMP_SECTION_LOOP_PREHEADER13:%.*]] // CHECK3: omp_section_loop.preheader13: // CHECK3-NEXT: store i32 0, ptr [[P_LOWERBOUND29]], align 4 @@ -811,16 +809,16 @@ for (int i = 0; i < argc; ++i) { // CHECK3-NEXT: br label [[OMP_SECTION_LOOP_BODY_CASE23_SECTION_AFTER:%.*]] // CHECK3: omp_section_loop.body.case23.section.after: // CHECK3-NEXT: br label [[OMP_SECTION_LOOP_BODY16_SECTIONS_AFTER]] -// CHECK3: omp_section_loop.body.case25: +// CHECK3: omp_section_loop.body.case26: // CHECK3-NEXT: [[OMP_GLOBAL_THREAD_NUM27:%.*]] = call i32 @__kmpc_global_thread_num(ptr @[[GLOB1]]) // CHECK3-NEXT: [[TMP18:%.*]] = call i32 @__kmpc_cancel(ptr @[[GLOB1]], i32 [[OMP_GLOBAL_THREAD_NUM27]], i32 3) // CHECK3-NEXT: [[TMP19:%.*]] = icmp eq i32 [[TMP18]], 0 // CHECK3-NEXT: br i1 [[TMP19]], label [[OMP_SECTION_LOOP_BODY_CASE25_SPLIT:%.*]], label [[OMP_SECTION_LOOP_BODY_CASE25_CNCL:%.*]] -// CHECK3: omp_section_loop.body.case25.split: +// CHECK3: omp_section_loop.body.case26.split: // CHECK3-NEXT: br label [[OMP_SECTION_LOOP_BODY_CASE25_SECTION_AFTER26:%.*]] -// CHECK3: omp_section_loop.body.case25.section.after26: +// CHECK3: omp_section_loop.body.case26.section.after27: // CHECK3-NEXT: br label 
[[OMP_SECTION_LOOP_BODY_CASE25_SECTION_AFTER:%.*]] -// CHECK3: omp_section_loop.body.case25.section.after: +// CHECK3: omp_section_loop.body.case26.section.after: // CHECK3-NEXT: br label [[OMP_SECTION_LOOP_BODY16_SECTIONS_AFTER]] // CHECK3: omp_section_loop.body16.sections.after: // CHECK3-NEXT: br label [[OMP_SECTION_LOOP_INC17]] @@ -833,8 +831,6 @@ for (int i = 0; i < argc; ++i) { // CHECK3-NEXT: call void @__kmpc_barrier(ptr @[[GLOB2]], i32 [[OMP_GLOBAL_THREAD_NUM33]]) // CHECK3-NEXT: br label [[OMP_SECTION_LOOP_AFTER19:%.*]] // CHECK3: omp_section_loop.after19: -// CHECK3-NEXT: br label [[OMP_SECTION_LOOP_AFTER19SECTIONS_FINI:%.*]] -// CHECK3: omp_section_loop.after19sections.fini: // CHECK3-NEXT: [[TMP20:%.*]] = load i32, ptr [[ARGC_ADDR]], align 4 // CHECK3-NEXT: store i32 [[TMP20]], ptr [[DOTCAPTURE_EXPR_]], align 4 // CHECK3-NEXT: [[TMP21:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_]], align 4 @@ -894,8 +890,8 @@ for (int i = 0; i < argc; ++i) { // CHECK3-NEXT: br label [[OMP_SECTION_LOOP_EXIT]] // CHECK3: omp_section_loop.body.case23.cncl: // CHECK3-NEXT: br label [[OMP_SECTION_LOOP_EXIT18]] -// CHECK3: omp_section_loop.body.case25.cncl: -// CHECK3-NEXT: br label [[OMP_SECTION_LOOP_EXIT18]] +// CHECK3: omp_section_loop.body.case26.cncl: +// CHECK3-NEXT: br label [[OMP_REGION_FINALIZE:.*]] // CHECK3: .cancel.continue: // CHECK3-NEXT: br label [[OMP_IF_END:%.*]] // CHECK3: omp_if.else: @@ -967,8 +963,10 @@ for (int i = 0; i < argc; ++i) { // CHECK3-NEXT: [[TMP8:%.*]] = call i32 @__kmpc_cancel_barrier(ptr @[[GLOB3:[0-9]+]], i32 [[OMP_GLOBAL_THREAD_NUM4]]) // CHECK3-NEXT: [[TMP9:%.*]] = icmp eq i32 [[TMP8]], 0 // CHECK3-NEXT: br i1 [[TMP9]], label [[DOTCONT:%.*]], label [[DOTCNCL5:%.*]] -// CHECK3: .cncl5: -// CHECK3-NEXT: br label [[OMP_PAR_OUTLINED_EXIT_EXITSTUB:%.*]] +// CHECK3: .cncl4: +// CHECK3-NEXT: br label [[FINI:%.*]] +// CHECK3: .fini +// CHECK3-NEXT: br label %[[EXIT_STUB:omp.par.exit.exitStub]] // CHECK3: .cont: // CHECK3-NEXT: [[TMP10:%.*]] = load 
i32, ptr [[LOADGEP_ARGC_ADDR]], align 4 // CHECK3-NEXT: [[TMP11:%.*]] = load ptr, ptr [[LOADGEP_ARGV_ADDR]], align 8 @@ -984,16 +982,14 @@ for (int i = 0; i < argc; ++i) { // CHECK3: omp.par.region.parallel.after: // CHECK3-NEXT: br label [[OMP_PAR_PRE_FINALIZE:%.*]] // CHECK3: omp.par.pre_finalize: -// CHECK3-NEXT: br label [[OMP_PAR_OUTLINED_EXIT_EXITSTUB]] +// CHECK3-NEXT: br label [[FINI]] // CHECK3: 14: // CHECK3-NEXT: [[OMP_GLOBAL_THREAD_NUM1:%.*]] = call i32 @__kmpc_global_thread_num(ptr @[[GLOB1]]) // CHECK3-NEXT: [[TMP15:%.*]] = call i32 @__kmpc_cancel(ptr @[[GLOB1]], i32 [[OMP_GLOBAL_THREAD_NUM1]], i32 1) // CHECK3-NEXT: [[TMP16:%.*]] = icmp eq i32 [[TMP15]], 0 // CHECK3-NEXT: br i1 [[TMP16]], label [[DOTSPLIT:%.*]], label [[DOTCNCL:%.*]] // CHECK3: .cncl: -// CHECK3-NEXT: [[OMP_GLOBAL_THREAD_NUM2:%.*]] = call i32 @__kmpc_global_thread_num(ptr @[[GLOB1]]) -// CHECK3-NEXT: [[TMP17:%.*]] = call i32 @__kmpc_cancel_barrier(ptr @[[GLOB2]], i32 [[OMP_GLOBAL_THREAD_NUM2]]) -// CHECK3-NEXT: br label [[OMP_PAR_OUTLINED_EXIT_EXITSTUB]] +// CHECK3-NEXT: br label [[FINI]] // CHECK3: .split: // CHECK3-NEXT: br label [[TMP4]] // CHECK3: omp.par.exit.exitStub: @@ -1089,7 +1085,7 @@ for (int i = 0; i < argc; ++i) { // CHECK3: .omp.sections.case.split: // CHECK3-NEXT: br label [[DOTOMP_SECTIONS_EXIT]] // CHECK3: .omp.sections.case.cncl: -// CHECK3-NEXT: br label [[CANCEL_CONT:%.*]] +// CHECK3-NEXT: br label [[FINI:%.*]] // CHECK3: .omp.sections.exit: // CHECK3-NEXT: br label [[OMP_INNER_FOR_INC:%.*]] // CHECK3: omp.inner.for.inc: @@ -1100,7 +1096,7 @@ for (int i = 0; i < argc; ++i) { // CHECK3: omp.inner.for.end: // CHECK3-NEXT: [[OMP_GLOBAL_THREAD_NUM3:%.*]] = call i32 @__kmpc_global_thread_num(ptr @[[GLOB19:[0-9]+]]) // CHECK3-NEXT: call void @__kmpc_for_static_fini(ptr @[[GLOB15]], i32 [[OMP_GLOBAL_THREAD_NUM3]]) -// CHECK3-NEXT: br label [[CANCEL_CONT]] +// CHECK3-NEXT: br label [[CANCEL_CONT:.*]] // CHECK3: cancel.cont: // CHECK3-NEXT: ret void // CHECK3: 
cancel.exit: @@ -1153,6 +1149,8 @@ for (int i = 0; i < argc; ++i) { // CHECK3: .omp.sections.case.split: // CHECK3-NEXT: br label [[DOTOMP_SECTIONS_EXIT]] // CHECK3: .omp.sections.case.cncl: +// CHECK3-NEXT: br label [[DOTFINI:.%*]] +// CHECK3: .fini: // CHECK3-NEXT: br label [[CANCEL_CONT:%.*]] // CHECK3: .omp.sections.case2: // CHECK3-NEXT: [[OMP_GLOBAL_THREAD_NUM3:%.*]] = call i32 @__kmpc_global_thread_num(ptr @[[GLOB1]]) @@ -1162,9 +1160,11 @@ for (int i = 0; i < argc; ++i) { // CHECK3: .omp.sections.case2.split: // CHECK3-NEXT: br label [[DOTOMP_SECTIONS_CASE2_SECTION_AFTER:%.*]] // CHECK3: .omp.sections.case2.section.after: -// CHECK3-NEXT: br label [[DOTOMP_SECTIONS_EXIT]] +// CHECK3-NEXT: br label [[OMP_REGION_FINALIZE]] +// CHECK3: omp_region.finalize: +// CHECK3-NEXT: br label [[OMP_SECTIONS_EXIT:.*]] // CHECK3: .omp.sections.case2.cncl: -// CHECK3-NEXT: br label [[OMP_INNER_FOR_END]] +// CHECK3-NEXT: br label [[FINI:.*]] // CHECK3: .omp.sections.exit: // CHECK3-NEXT: br label [[OMP_INNER_FOR_INC:%.*]] // CHECK3: omp.inner.for.inc: diff --git a/clang/test/OpenMP/critical_codegen.cpp b/clang/test/OpenMP/critical_codegen.cpp index 5c752d354804b..9620613dfdb87 100644 --- a/clang/test/OpenMP/critical_codegen.cpp +++ b/clang/test/OpenMP/critical_codegen.cpp @@ -35,6 +35,8 @@ int main() { // ALL-NEXT: store i8 2, ptr [[A_ADDR]] // IRBUILDER-NEXT: br label %[[AFTER:[^ ,]+]] // IRBUILDER: [[AFTER]] +// IRBUILDER-NEXT: br label %[[OMP_REGION_FINALIZE:[^ ,]+]] +// IRBUILDER: [[OMP_REGION_FINALIZE]] // ALL-NEXT: call {{.*}}void @__kmpc_end_critical(ptr [[DEFAULT_LOC]], i32 [[GTID]], ptr [[UNNAMED_LOCK]]) #pragma omp critical a = 2; diff --git a/clang/test/OpenMP/critical_codegen_attr.cpp b/clang/test/OpenMP/critical_codegen_attr.cpp index 32482a92e76b8..50b0b04fcfd4a 100644 --- a/clang/test/OpenMP/critical_codegen_attr.cpp +++ b/clang/test/OpenMP/critical_codegen_attr.cpp @@ -35,6 +35,8 @@ int main() { // ALL-NEXT: store i8 2, ptr [[A_ADDR]] // IRBUILDER-NEXT: br 
label %[[AFTER:[^ ,]+]] // IRBUILDER: [[AFTER]] +// IRBUILDER-NEXT: br label %[[OMP_REGION_FINALIZE:[^ ,]+]] +// IRBUILDER: [[OMP_REGION_FINALIZE]] // ALL-NEXT: call {{.*}}void @__kmpc_end_critical(ptr [[DEFAULT_LOC]], i32 [[GTID]], ptr [[UNNAMED_LOCK]]) [[omp::directive(critical)]] a = 2; diff --git a/clang/test/OpenMP/irbuilder_nested_parallel_for.c b/clang/test/OpenMP/irbuilder_nested_parallel_for.c index 5cc5640a5173b..56cf9644de5ed 100644 --- a/clang/test/OpenMP/irbuilder_nested_parallel_for.c +++ b/clang/test/OpenMP/irbuilder_nested_parallel_for.c @@ -449,7 +449,7 @@ void parallel_for_2(float *r, int a, double b) { // CHECK-NEXT: call void @__captured_stmt.19(ptr [[DOTCOUNT_ADDR188]], ptr [[AGG_CAPTURED186]]) // CHECK-NEXT: [[DOTCOUNT189:%.*]] = load i32, ptr [[DOTCOUNT_ADDR188]], align 4 // CHECK-NEXT: br label [[OMP_LOOP_PREHEADER190:%.*]] -// CHECK: omp_loop.preheader187: +// CHECK: omp_loop.preheader190: // CHECK-NEXT: store i32 0, ptr [[P_LOWERBOUND204]], align 4 // CHECK-NEXT: [[TMP3:%.*]] = sub i32 [[DOTCOUNT189]], 1 // CHECK-NEXT: store i32 [[TMP3]], ptr [[P_UPPERBOUND205]], align 4 @@ -461,13 +461,13 @@ void parallel_for_2(float *r, int a, double b) { // CHECK-NEXT: [[TMP6:%.*]] = sub i32 [[TMP5]], [[TMP4]] // CHECK-NEXT: [[TMP7:%.*]] = add i32 [[TMP6]], 1 // CHECK-NEXT: br label [[OMP_LOOP_HEADER191:%.*]] -// CHECK: omp_loop.header188: +// CHECK: omp_loop.header191: // CHECK-NEXT: [[OMP_LOOP_IV197:%.*]] = phi i32 [ 0, [[OMP_LOOP_PREHEADER190]] ], [ [[OMP_LOOP_NEXT199:%.*]], [[OMP_LOOP_INC194:%.*]] ] // CHECK-NEXT: br label [[OMP_LOOP_COND192:%.*]] -// CHECK: omp_loop.cond189: +// CHECK: omp_loop.cond192: // CHECK-NEXT: [[OMP_LOOP_CMP198:%.*]] = icmp ult i32 [[OMP_LOOP_IV197]], [[TMP7]] // CHECK-NEXT: br i1 [[OMP_LOOP_CMP198]], label [[OMP_LOOP_BODY193:%.*]], label [[OMP_LOOP_EXIT195:%.*]] -// CHECK: omp_loop.body190: +// CHECK: omp_loop.body193: // CHECK-NEXT: [[TMP8:%.*]] = add i32 [[OMP_LOOP_IV197]], [[TMP4]] // CHECK-NEXT: call void 
@__captured_stmt.20(ptr [[I185]], i32 [[TMP8]], ptr [[AGG_CAPTURED187]]) // CHECK-NEXT: [[TMP9:%.*]] = load i32, ptr [[A_ADDR]], align 4 @@ -478,15 +478,15 @@ void parallel_for_2(float *r, int a, double b) { // CHECK-NEXT: [[TMP11:%.*]] = load ptr, ptr [[R_ADDR]], align 8 // CHECK-NEXT: store float [[CONV202]], ptr [[TMP11]], align 4 // CHECK-NEXT: br label [[OMP_LOOP_INC194]] -// CHECK: omp_loop.inc191: +// CHECK: omp_loop.inc194: // CHECK-NEXT: [[OMP_LOOP_NEXT199]] = add nuw i32 [[OMP_LOOP_IV197]], 1 // CHECK-NEXT: br label [[OMP_LOOP_HEADER191]] -// CHECK: omp_loop.exit192: +// CHECK: omp_loop.exit195: // CHECK-NEXT: call void @__kmpc_for_static_fini(ptr @[[GLOB1]], i32 [[OMP_GLOBAL_THREAD_NUM207]]) // CHECK-NEXT: [[OMP_GLOBAL_THREAD_NUM208:%.*]] = call i32 @__kmpc_global_thread_num(ptr @[[GLOB1]]) // CHECK-NEXT: call void @__kmpc_barrier(ptr @[[GLOB2]], i32 [[OMP_GLOBAL_THREAD_NUM208]]) // CHECK-NEXT: br label [[OMP_LOOP_AFTER196:%.*]] -// CHECK: omp_loop.after193: +// CHECK: omp_loop.after196: // CHECK-NEXT: ret void // // @@ -576,7 +576,7 @@ void parallel_for_2(float *r, int a, double b) { // CHECK-NEXT: call void @__captured_stmt.17(ptr [[DOTCOUNT_ADDR163]], ptr [[AGG_CAPTURED161]]) // CHECK-NEXT: [[DOTCOUNT164:%.*]] = load i32, ptr [[DOTCOUNT_ADDR163]], align 4 // CHECK-NEXT: br label [[OMP_LOOP_PREHEADER165:%.*]] -// CHECK: omp_loop.preheader163: +// CHECK: omp_loop.preheader165: // CHECK-NEXT: store i32 0, ptr [[P_LOWERBOUND179]], align 4 // CHECK-NEXT: [[TMP13:%.*]] = sub i32 [[DOTCOUNT164]], 1 // CHECK-NEXT: store i32 [[TMP13]], ptr [[P_UPPERBOUND180]], align 4 @@ -588,24 +588,24 @@ void parallel_for_2(float *r, int a, double b) { // CHECK-NEXT: [[TMP16:%.*]] = sub i32 [[TMP15]], [[TMP14]] // CHECK-NEXT: [[TMP17:%.*]] = add i32 [[TMP16]], 1 // CHECK-NEXT: br label [[OMP_LOOP_HEADER166:%.*]] -// CHECK: omp_loop.header164: +// CHECK: omp_loop.header166: // CHECK-NEXT: [[OMP_LOOP_IV172:%.*]] = phi i32 [ 0, [[OMP_LOOP_PREHEADER165]] ], [ 
[[OMP_LOOP_NEXT174:%.*]], [[OMP_LOOP_INC169:%.*]] ] // CHECK-NEXT: br label [[OMP_LOOP_COND167:%.*]] -// CHECK: omp_loop.cond165: +// CHECK: omp_loop.cond167: // CHECK-NEXT: [[OMP_LOOP_CMP173:%.*]] = icmp ult i32 [[OMP_LOOP_IV172]], [[TMP17]] // CHECK-NEXT: br i1 [[OMP_LOOP_CMP173]], label [[OMP_LOOP_BODY168:%.*]], label [[OMP_LOOP_EXIT170:%.*]] -// CHECK: omp_loop.exit168: +// CHECK: omp_loop.exit170: // CHECK-NEXT: call void @__kmpc_for_static_fini(ptr @[[GLOB1]], i32 [[OMP_GLOBAL_THREAD_NUM182]]) // CHECK-NEXT: [[OMP_GLOBAL_THREAD_NUM183:%.*]] = call i32 @__kmpc_global_thread_num(ptr @[[GLOB1]]) // CHECK-NEXT: call void @__kmpc_barrier(ptr @[[GLOB2]], i32 [[OMP_GLOBAL_THREAD_NUM183]]) // CHECK-NEXT: br label [[OMP_LOOP_AFTER171:%.*]] -// CHECK: omp_loop.after169: +// CHECK: omp_loop.after171: // CHECK-NEXT: br label [[OMP_PAR_REGION_PARALLEL_AFTER:%.*]] // CHECK: omp.par.region.parallel.after: // CHECK-NEXT: br label [[OMP_PAR_PRE_FINALIZE:%.*]] // CHECK: omp.par.pre_finalize: // CHECK-NEXT: br label [[OMP_PAR_OUTLINED_EXIT184_EXITSTUB:%.*]] -// CHECK: omp_loop.body166: +// CHECK: omp_loop.body168: // CHECK-NEXT: [[TMP18:%.*]] = add i32 [[OMP_LOOP_IV172]], [[TMP14]] // CHECK-NEXT: call void @__captured_stmt.18(ptr [[I160]], i32 [[TMP18]], ptr [[AGG_CAPTURED162]]) // CHECK-NEXT: [[TMP19:%.*]] = load i32, ptr [[LOADGEP_A_ADDR]], align 4 @@ -616,7 +616,7 @@ void parallel_for_2(float *r, int a, double b) { // CHECK-NEXT: [[TMP21:%.*]] = load ptr, ptr [[LOADGEP_R_ADDR]], align 8 // CHECK-NEXT: store float [[CONV177]], ptr [[TMP21]], align 4 // CHECK-NEXT: br label [[OMP_LOOP_INC169]] -// CHECK: omp_loop.inc167: +// CHECK: omp_loop.inc169: // CHECK-NEXT: [[OMP_LOOP_NEXT174]] = add nuw i32 [[OMP_LOOP_IV172]], 1 // CHECK-NEXT: br label [[OMP_LOOP_HEADER166]] // CHECK: omp_loop.body: @@ -758,7 +758,7 @@ void parallel_for_2(float *r, int a, double b) { // CHECK: omp_loop.after86: // CHECK-NEXT: [[OMP_GLOBAL_THREAD_NUM99:%.*]] = call i32 @__kmpc_global_thread_num(ptr 
@[[GLOB1]]) // CHECK-NEXT: br label [[OMP_PARALLEL213:%.*]] -// CHECK: omp_parallel210: +// CHECK: omp_parallel213: // CHECK-NEXT: [[GEP_A_ADDR210:%.*]] = getelementptr { ptr, ptr, ptr }, ptr [[STRUCTARG209]], i32 0, i32 0 // CHECK-NEXT: store ptr [[LOADGEP_A_ADDR]], ptr [[GEP_A_ADDR210]], align 8 // CHECK-NEXT: [[GEP_B_ADDR211:%.*]] = getelementptr { ptr, ptr, ptr }, ptr [[STRUCTARG209]], i32 0, i32 1 @@ -777,7 +777,7 @@ void parallel_for_2(float *r, int a, double b) { // CHECK-NEXT: call void @__captured_stmt.15(ptr [[DOTCOUNT_ADDR138]], ptr [[AGG_CAPTURED136]]) // CHECK-NEXT: [[DOTCOUNT139:%.*]] = load i32, ptr [[DOTCOUNT_ADDR138]], align 4 // CHECK-NEXT: br label [[OMP_LOOP_PREHEADER140:%.*]] -// CHECK: omp_loop.preheader139: +// CHECK: omp_loop.preheader140: // CHECK-NEXT: store i32 0, ptr [[P_LOWERBOUND154]], align 4 // CHECK-NEXT: [[TMP21:%.*]] = sub i32 [[DOTCOUNT139]], 1 // CHECK-NEXT: store i32 [[TMP21]], ptr [[P_UPPERBOUND155]], align 4 @@ -789,24 +789,26 @@ void parallel_for_2(float *r, int a, double b) { // CHECK-NEXT: [[TMP24:%.*]] = sub i32 [[TMP23]], [[TMP22]] // CHECK-NEXT: [[TMP25:%.*]] = add i32 [[TMP24]], 1 // CHECK-NEXT: br label [[OMP_LOOP_HEADER141:%.*]] -// CHECK: omp_loop.header140: +// CHECK: omp_loop.header141: // CHECK-NEXT: [[OMP_LOOP_IV147:%.*]] = phi i32 [ 0, [[OMP_LOOP_PREHEADER140]] ], [ [[OMP_LOOP_NEXT149:%.*]], [[OMP_LOOP_INC144:%.*]] ] // CHECK-NEXT: br label [[OMP_LOOP_COND142:%.*]] -// CHECK: omp_loop.cond141: +// CHECK: omp_loop.cond142: // CHECK-NEXT: [[OMP_LOOP_CMP148:%.*]] = icmp ult i32 [[OMP_LOOP_IV147]], [[TMP25]] // CHECK-NEXT: br i1 [[OMP_LOOP_CMP148]], label [[OMP_LOOP_BODY143:%.*]], label [[OMP_LOOP_EXIT145:%.*]] -// CHECK: omp_loop.exit144: +// CHECK: omp_loop.exit145: // CHECK-NEXT: call void @__kmpc_for_static_fini(ptr @[[GLOB1]], i32 [[OMP_GLOBAL_THREAD_NUM157]]) // CHECK-NEXT: [[OMP_GLOBAL_THREAD_NUM158:%.*]] = call i32 @__kmpc_global_thread_num(ptr @[[GLOB1]]) // CHECK-NEXT: call void @__kmpc_barrier(ptr 
@[[GLOB2]], i32 [[OMP_GLOBAL_THREAD_NUM158]]) // CHECK-NEXT: br label [[OMP_LOOP_AFTER146:%.*]] -// CHECK: omp_loop.after145: +// CHECK: omp_loop.after146: // CHECK-NEXT: br label [[OMP_PAR_REGION9_PARALLEL_AFTER:%.*]] // CHECK: omp.par.region9.parallel.after: // CHECK-NEXT: br label [[OMP_PAR_PRE_FINALIZE10:%.*]] // CHECK: omp.par.pre_finalize10: -// CHECK-NEXT: br label [[OMP_PAR_OUTLINED_EXIT159_EXITSTUB:%.*]] -// CHECK: omp_loop.body142: +// CHECK-NEXT: br label [[FINI159:%.*]] +// CHECK: .fini159: +// CHECK-NEXT: br label [[OMP_PAR_EXIT11_EXITSTUB:%.*]] +// CHECK: omp_loop.body143: // CHECK-NEXT: [[TMP26:%.*]] = add i32 [[OMP_LOOP_IV147]], [[TMP22]] // CHECK-NEXT: call void @__captured_stmt.16(ptr [[I135]], i32 [[TMP26]], ptr [[AGG_CAPTURED137]]) // CHECK-NEXT: [[TMP27:%.*]] = load i32, ptr [[LOADGEP_A_ADDR]], align 4 @@ -817,7 +819,7 @@ void parallel_for_2(float *r, int a, double b) { // CHECK-NEXT: [[TMP29:%.*]] = load ptr, ptr [[LOADGEP_R_ADDR]], align 8 // CHECK-NEXT: store float [[CONV152]], ptr [[TMP29]], align 4 // CHECK-NEXT: br label [[OMP_LOOP_INC144]] -// CHECK: omp_loop.inc143: +// CHECK: omp_loop.inc144: // CHECK-NEXT: [[OMP_LOOP_NEXT149]] = add nuw i32 [[OMP_LOOP_IV147]], 1 // CHECK-NEXT: br label [[OMP_LOOP_HEADER141]] // CHECK: omp_loop.body83: @@ -1557,6 +1559,8 @@ void parallel_for_2(float *r, int a, double b) { // CHECK-DEBUG: omp.par.region.parallel.after: // CHECK-DEBUG-NEXT: br label [[OMP_PAR_PRE_FINALIZE:%.*]] // CHECK-DEBUG: omp.par.pre_finalize: +// CHECK-DEBUG-NEXT: br label [[FINI:.*]] +// CHECK-DEBUG: .fini: // CHECK-DEBUG-NEXT: br label [[OMP_PAR_OUTLINED_EXIT_EXITSTUB:%.*]], !dbg [[DBG30]] // CHECK-DEBUG: omp_loop.body: // CHECK-DEBUG-NEXT: [[TMP9:%.*]] = add i32 [[OMP_LOOP_IV]], [[TMP5]], !dbg [[DBG29]] @@ -1700,6 +1704,8 @@ void parallel_for_2(float *r, int a, double b) { // CHECK-DEBUG: omp.par.region.parallel.after: // CHECK-DEBUG-NEXT: br label [[OMP_PAR_PRE_FINALIZE:%.*]] // CHECK-DEBUG: omp.par.pre_finalize: +// 
CHECK-DEBUG-NEXT: br label [[FINI16:%.*]] +// CHECK-DEBUG: .fini16: // CHECK-DEBUG-NEXT: br label [[OMP_PAR_OUTLINED_EXIT16_EXITSTUB:%.*]], !dbg [[DBG92]] // CHECK-DEBUG: omp.par.exit.exitStub: // CHECK-DEBUG-NEXT: ret void @@ -1769,6 +1775,8 @@ void parallel_for_2(float *r, int a, double b) { // CHECK-DEBUG: omp.par.region5.parallel.after: // CHECK-DEBUG-NEXT: br label [[OMP_PAR_PRE_FINALIZE6:%.*]] // CHECK-DEBUG: omp.par.pre_finalize6: +// CHECK-DEBUG-NEXT: br label [[FINI:%.*]] +// CHECK-DEBUG: .fini: // CHECK-DEBUG-NEXT: br label [[OMP_PAR_OUTLINED_EXIT_EXITSTUB:%.*]], !dbg [[DBG103]] // CHECK-DEBUG: omp_loop.body: // CHECK-DEBUG-NEXT: [[TMP10:%.*]] = add i32 [[OMP_LOOP_IV]], [[TMP6]], !dbg [[DBG102]] @@ -1899,7 +1907,7 @@ void parallel_for_2(float *r, int a, double b) { // CHECK-DEBUG-NEXT: call void @__captured_stmt.19(ptr [[DOTCOUNT_ADDR188]], ptr [[AGG_CAPTURED186]]), !dbg [[DBG148]] // CHECK-DEBUG-NEXT: [[DOTCOUNT189:%.*]] = load i32, ptr [[DOTCOUNT_ADDR188]], align 4, !dbg [[DBG148]] // CHECK-DEBUG-NEXT: br label [[OMP_LOOP_PREHEADER190:%.*]], !dbg [[DBG148]] -// CHECK-DEBUG: omp_loop.preheader187: +// CHECK-DEBUG: omp_loop.preheader190: // CHECK-DEBUG-NEXT: store i32 0, ptr [[P_LOWERBOUND204]], align 4, !dbg [[DBG148]] // CHECK-DEBUG-NEXT: [[TMP3:%.*]] = sub i32 [[DOTCOUNT189]], 1, !dbg [[DBG148]] // CHECK-DEBUG-NEXT: store i32 [[TMP3]], ptr [[P_UPPERBOUND205]], align 4, !dbg [[DBG148]] @@ -1911,13 +1919,13 @@ void parallel_for_2(float *r, int a, double b) { // CHECK-DEBUG-NEXT: [[TMP6:%.*]] = sub i32 [[TMP5]], [[TMP4]], !dbg [[DBG148]] // CHECK-DEBUG-NEXT: [[TMP7:%.*]] = add i32 [[TMP6]], 1, !dbg [[DBG148]] // CHECK-DEBUG-NEXT: br label [[OMP_LOOP_HEADER191:%.*]], !dbg [[DBG148]] -// CHECK-DEBUG: omp_loop.header188: +// CHECK-DEBUG: omp_loop.header191: // CHECK-DEBUG-NEXT: [[OMP_LOOP_IV197:%.*]] = phi i32 [ 0, [[OMP_LOOP_PREHEADER190]] ], [ [[OMP_LOOP_NEXT199:%.*]], [[OMP_LOOP_INC194:%.*]] ], !dbg [[DBG148]] // CHECK-DEBUG-NEXT: br label 
[[OMP_LOOP_COND192:%.*]], !dbg [[DBG148]] -// CHECK-DEBUG: omp_loop.cond189: +// CHECK-DEBUG: omp_loop.cond192: // CHECK-DEBUG-NEXT: [[OMP_LOOP_CMP198:%.*]] = icmp ult i32 [[OMP_LOOP_IV197]], [[TMP7]], !dbg [[DBG148]] // CHECK-DEBUG-NEXT: br i1 [[OMP_LOOP_CMP198]], label [[OMP_LOOP_BODY193:%.*]], label [[OMP_LOOP_EXIT195:%.*]], !dbg [[DBG148]] -// CHECK-DEBUG: omp_loop.body190: +// CHECK-DEBUG: omp_loop.body193: // CHECK-DEBUG-NEXT: [[TMP8:%.*]] = add i32 [[OMP_LOOP_IV197]], [[TMP4]], !dbg [[DBG150:![0-9]+]] // CHECK-DEBUG-NEXT: call void @__captured_stmt.20(ptr [[I185]], i32 [[TMP8]], ptr [[AGG_CAPTURED187]]), !dbg [[DBG148]] // CHECK-DEBUG-NEXT: [[TMP9:%.*]] = load i32, ptr [[A_ADDR]], align 4, !dbg [[DBG151:![0-9]+]] @@ -1928,15 +1936,15 @@ void parallel_for_2(float *r, int a, double b) { // CHECK-DEBUG-NEXT: [[TMP11:%.*]] = load ptr, ptr [[R_ADDR]], align 8, !dbg [[DBG153:![0-9]+]] // CHECK-DEBUG-NEXT: store float [[CONV202]], ptr [[TMP11]], align 4, !dbg [[DBG154:![0-9]+]] // CHECK-DEBUG-NEXT: br label [[OMP_LOOP_INC194]], !dbg [[DBG148]] -// CHECK-DEBUG: omp_loop.inc191: +// CHECK-DEBUG: omp_loop.inc194: // CHECK-DEBUG-NEXT: [[OMP_LOOP_NEXT199]] = add nuw i32 [[OMP_LOOP_IV197]], 1, !dbg [[DBG148]] // CHECK-DEBUG-NEXT: br label [[OMP_LOOP_HEADER191]], !dbg [[DBG148]] -// CHECK-DEBUG: omp_loop.exit192: +// CHECK-DEBUG: omp_loop.exit195: // CHECK-DEBUG-NEXT: call void @__kmpc_for_static_fini(ptr @[[GLOB42]], i32 [[OMP_GLOBAL_THREAD_NUM207]]), !dbg [[DBG148]] // CHECK-DEBUG-NEXT: [[OMP_GLOBAL_THREAD_NUM208:%.*]] = call i32 @__kmpc_global_thread_num(ptr @[[GLOB42]]), !dbg [[DBG150]] // CHECK-DEBUG-NEXT: call void @__kmpc_barrier(ptr @[[GLOB43:[0-9]+]], i32 [[OMP_GLOBAL_THREAD_NUM208]]), !dbg [[DBG150]] // CHECK-DEBUG-NEXT: br label [[OMP_LOOP_AFTER196:%.*]], !dbg [[DBG148]] -// CHECK-DEBUG: omp_loop.after193: +// CHECK-DEBUG: omp_loop.after196: // CHECK-DEBUG-NEXT: ret void, !dbg [[DBG155:![0-9]+]] // // @@ -2031,7 +2039,7 @@ void parallel_for_2(float *r, int a, 
double b) { // CHECK-DEBUG-NEXT: call void @__captured_stmt.17(ptr [[DOTCOUNT_ADDR163]], ptr [[AGG_CAPTURED161]]), !dbg [[DBG174]] // CHECK-DEBUG-NEXT: [[DOTCOUNT164:%.*]] = load i32, ptr [[DOTCOUNT_ADDR163]], align 4, !dbg [[DBG174]] // CHECK-DEBUG-NEXT: br label [[OMP_LOOP_PREHEADER165:%.*]], !dbg [[DBG174]] -// CHECK-DEBUG: omp_loop.preheader163: +// CHECK-DEBUG: omp_loop.preheader165: // CHECK-DEBUG-NEXT: store i32 0, ptr [[P_LOWERBOUND179]], align 4, !dbg [[DBG174]] // CHECK-DEBUG-NEXT: [[TMP13:%.*]] = sub i32 [[DOTCOUNT164]], 1, !dbg [[DBG174]] // CHECK-DEBUG-NEXT: store i32 [[TMP13]], ptr [[P_UPPERBOUND180]], align 4, !dbg [[DBG174]] @@ -2043,24 +2051,26 @@ void parallel_for_2(float *r, int a, double b) { // CHECK-DEBUG-NEXT: [[TMP16:%.*]] = sub i32 [[TMP15]], [[TMP14]], !dbg [[DBG174]] // CHECK-DEBUG-NEXT: [[TMP17:%.*]] = add i32 [[TMP16]], 1, !dbg [[DBG174]] // CHECK-DEBUG-NEXT: br label [[OMP_LOOP_HEADER166:%.*]], !dbg [[DBG174]] -// CHECK-DEBUG: omp_loop.header164: +// CHECK-DEBUG: omp_loop.header166: // CHECK-DEBUG-NEXT: [[OMP_LOOP_IV172:%.*]] = phi i32 [ 0, [[OMP_LOOP_PREHEADER165]] ], [ [[OMP_LOOP_NEXT174:%.*]], [[OMP_LOOP_INC169:%.*]] ], !dbg [[DBG174]] // CHECK-DEBUG-NEXT: br label [[OMP_LOOP_COND167:%.*]], !dbg [[DBG174]] -// CHECK-DEBUG: omp_loop.cond165: +// CHECK-DEBUG: omp_loop.cond167: // CHECK-DEBUG-NEXT: [[OMP_LOOP_CMP173:%.*]] = icmp ult i32 [[OMP_LOOP_IV172]], [[TMP17]], !dbg [[DBG174]] // CHECK-DEBUG-NEXT: br i1 [[OMP_LOOP_CMP173]], label [[OMP_LOOP_BODY168:%.*]], label [[OMP_LOOP_EXIT170:%.*]], !dbg [[DBG174]] -// CHECK-DEBUG: omp_loop.exit168: +// CHECK-DEBUG: omp_loop.exit170: // CHECK-DEBUG-NEXT: call void @__kmpc_for_static_fini(ptr @[[GLOB39]], i32 [[OMP_GLOBAL_THREAD_NUM182]]), !dbg [[DBG174]] // CHECK-DEBUG-NEXT: [[OMP_GLOBAL_THREAD_NUM183:%.*]] = call i32 @__kmpc_global_thread_num(ptr @[[GLOB39]]), !dbg [[DBG176:![0-9]+]] // CHECK-DEBUG-NEXT: call void @__kmpc_barrier(ptr @[[GLOB40:[0-9]+]], i32 [[OMP_GLOBAL_THREAD_NUM183]]), 
!dbg [[DBG176]] // CHECK-DEBUG-NEXT: br label [[OMP_LOOP_AFTER171:%.*]], !dbg [[DBG174]] -// CHECK-DEBUG: omp_loop.after169: +// CHECK-DEBUG: omp_loop.after171: // CHECK-DEBUG-NEXT: br label [[OMP_PAR_REGION_PARALLEL_AFTER:%.*]], !dbg [[DBG177:![0-9]+]] // CHECK-DEBUG: omp.par.region.parallel.after: // CHECK-DEBUG-NEXT: br label [[OMP_PAR_PRE_FINALIZE:%.*]] // CHECK-DEBUG: omp.par.pre_finalize: +// CHECK-DEBUG-NEXT: br label [[FINI184:%.*]] +// CHECK-DEBUG: .fini184: // CHECK-DEBUG-NEXT: br label [[OMP_PAR_OUTLINED_EXIT184_EXITSTUB:%.*]], !dbg [[DBG177]] -// CHECK-DEBUG: omp_loop.body166: +// CHECK-DEBUG: omp_loop.body168: // CHECK-DEBUG-NEXT: [[TMP18:%.*]] = add i32 [[OMP_LOOP_IV172]], [[TMP14]], !dbg [[DBG176]] // CHECK-DEBUG-NEXT: call void @__captured_stmt.18(ptr [[I160]], i32 [[TMP18]], ptr [[AGG_CAPTURED162]]), !dbg [[DBG174]] // CHECK-DEBUG-NEXT: [[TMP19:%.*]] = load i32, ptr [[LOADGEP_A_ADDR]], align 4, !dbg [[DBG178:![0-9]+]] @@ -2071,7 +2081,7 @@ void parallel_for_2(float *r, int a, double b) { // CHECK-DEBUG-NEXT: [[TMP21:%.*]] = load ptr, ptr [[LOADGEP_R_ADDR]], align 8, !dbg [[DBG180:![0-9]+]] // CHECK-DEBUG-NEXT: store float [[CONV177]], ptr [[TMP21]], align 4, !dbg [[DBG181:![0-9]+]] // CHECK-DEBUG-NEXT: br label [[OMP_LOOP_INC169]], !dbg [[DBG174]] -// CHECK-DEBUG: omp_loop.inc167: +// CHECK-DEBUG: omp_loop.inc169: // CHECK-DEBUG-NEXT: [[OMP_LOOP_NEXT174]] = add nuw i32 [[OMP_LOOP_IV172]], 1, !dbg [[DBG174]] // CHECK-DEBUG-NEXT: br label [[OMP_LOOP_HEADER166]], !dbg [[DBG174]] // CHECK-DEBUG: omp_loop.body: @@ -2218,7 +2228,7 @@ void parallel_for_2(float *r, int a, double b) { // CHECK-DEBUG: omp_loop.after86: // CHECK-DEBUG-NEXT: [[OMP_GLOBAL_THREAD_NUM99:%.*]] = call i32 @__kmpc_global_thread_num(ptr @[[GLOB31:[0-9]+]]), !dbg [[DBG208:![0-9]+]] // CHECK-DEBUG-NEXT: br label [[OMP_PARALLEL213:%.*]] -// CHECK-DEBUG: omp_parallel210: +// CHECK-DEBUG: omp_parallel213: // CHECK-DEBUG-NEXT: [[GEP_A_ADDR210:%.*]] = getelementptr { ptr, ptr, ptr }, ptr 
[[STRUCTARG209]], i32 0, i32 0 // CHECK-DEBUG-NEXT: store ptr [[LOADGEP_A_ADDR]], ptr [[GEP_A_ADDR210]], align 8 // CHECK-DEBUG-NEXT: [[GEP_B_ADDR211:%.*]] = getelementptr { ptr, ptr, ptr }, ptr [[STRUCTARG209]], i32 0, i32 1 @@ -2238,7 +2248,7 @@ void parallel_for_2(float *r, int a, double b) { // CHECK-DEBUG-NEXT: call void @__captured_stmt.15(ptr [[DOTCOUNT_ADDR138]], ptr [[AGG_CAPTURED136]]), !dbg [[DBG217]] // CHECK-DEBUG-NEXT: [[DOTCOUNT139:%.*]] = load i32, ptr [[DOTCOUNT_ADDR138]], align 4, !dbg [[DBG217]] // CHECK-DEBUG-NEXT: br label [[OMP_LOOP_PREHEADER140:%.*]], !dbg [[DBG217]] -// CHECK-DEBUG: omp_loop.preheader139: +// CHECK-DEBUG: omp_loop.preheader140: // CHECK-DEBUG-NEXT: store i32 0, ptr [[P_LOWERBOUND154]], align 4, !dbg [[DBG217]] // CHECK-DEBUG-NEXT: [[TMP21:%.*]] = sub i32 [[DOTCOUNT139]], 1, !dbg [[DBG217]] // CHECK-DEBUG-NEXT: store i32 [[TMP21]], ptr [[P_UPPERBOUND155]], align 4, !dbg [[DBG217]] @@ -2250,24 +2260,26 @@ void parallel_for_2(float *r, int a, double b) { // CHECK-DEBUG-NEXT: [[TMP24:%.*]] = sub i32 [[TMP23]], [[TMP22]], !dbg [[DBG217]] // CHECK-DEBUG-NEXT: [[TMP25:%.*]] = add i32 [[TMP24]], 1, !dbg [[DBG217]] // CHECK-DEBUG-NEXT: br label [[OMP_LOOP_HEADER141:%.*]], !dbg [[DBG217]] -// CHECK-DEBUG: omp_loop.header140: +// CHECK-DEBUG: omp_loop.header141: // CHECK-DEBUG-NEXT: [[OMP_LOOP_IV147:%.*]] = phi i32 [ 0, [[OMP_LOOP_PREHEADER140]] ], [ [[OMP_LOOP_NEXT149:%.*]], [[OMP_LOOP_INC144:%.*]] ], !dbg [[DBG217]] // CHECK-DEBUG-NEXT: br label [[OMP_LOOP_COND142:%.*]], !dbg [[DBG217]] -// CHECK-DEBUG: omp_loop.cond141: +// CHECK-DEBUG: omp_loop.cond142: // CHECK-DEBUG-NEXT: [[OMP_LOOP_CMP148:%.*]] = icmp ult i32 [[OMP_LOOP_IV147]], [[TMP25]], !dbg [[DBG217]] // CHECK-DEBUG-NEXT: br i1 [[OMP_LOOP_CMP148]], label [[OMP_LOOP_BODY143:%.*]], label [[OMP_LOOP_EXIT145:%.*]], !dbg [[DBG217]] -// CHECK-DEBUG: omp_loop.exit144: +// CHECK-DEBUG: omp_loop.exit145: // CHECK-DEBUG-NEXT: call void @__kmpc_for_static_fini(ptr @[[GLOB36]], i32 
[[OMP_GLOBAL_THREAD_NUM157]]), !dbg [[DBG217]] // CHECK-DEBUG-NEXT: [[OMP_GLOBAL_THREAD_NUM158:%.*]] = call i32 @__kmpc_global_thread_num(ptr @[[GLOB36]]), !dbg [[DBG219:![0-9]+]] // CHECK-DEBUG-NEXT: call void @__kmpc_barrier(ptr @[[GLOB37:[0-9]+]], i32 [[OMP_GLOBAL_THREAD_NUM158]]), !dbg [[DBG219]] // CHECK-DEBUG-NEXT: br label [[OMP_LOOP_AFTER146:%.*]], !dbg [[DBG217]] -// CHECK-DEBUG: omp_loop.after145: +// CHECK-DEBUG: omp_loop.after146: // CHECK-DEBUG-NEXT: br label [[OMP_PAR_REGION9_PARALLEL_AFTER:%.*]], !dbg [[DBG220:![0-9]+]] // CHECK-DEBUG: omp.par.region9.parallel.after: // CHECK-DEBUG-NEXT: br label [[OMP_PAR_PRE_FINALIZE10:%.*]] // CHECK-DEBUG: omp.par.pre_finalize10: +// CHECK-DEBUG-NEXT: br label [[FINI159:%.*]] +// CHECK-DEBUG: .fini159: // CHECK-DEBUG-NEXT: br label [[OMP_PAR_OUTLINED_EXIT159_EXITSTUB:%.*]], !dbg [[DBG220]] -// CHECK-DEBUG: omp_loop.body142: +// CHECK-DEBUG: omp_loop.body143: // CHECK-DEBUG-NEXT: [[TMP26:%.*]] = add i32 [[OMP_LOOP_IV147]], [[TMP22]], !dbg [[DBG219]] // CHECK-DEBUG-NEXT: call void @__captured_stmt.16(ptr [[I135]], i32 [[TMP26]], ptr [[AGG_CAPTURED137]]), !dbg [[DBG217]] // CHECK-DEBUG-NEXT: [[TMP27:%.*]] = load i32, ptr [[LOADGEP_A_ADDR]], align 4, !dbg [[DBG221:![0-9]+]] @@ -2278,7 +2290,7 @@ void parallel_for_2(float *r, int a, double b) { // CHECK-DEBUG-NEXT: [[TMP29:%.*]] = load ptr, ptr [[LOADGEP_R_ADDR]], align 8, !dbg [[DBG223:![0-9]+]] // CHECK-DEBUG-NEXT: store float [[CONV152]], ptr [[TMP29]], align 4, !dbg [[DBG224:![0-9]+]] // CHECK-DEBUG-NEXT: br label [[OMP_LOOP_INC144]], !dbg [[DBG217]] -// CHECK-DEBUG: omp_loop.inc143: +// CHECK-DEBUG: omp_loop.inc144: // CHECK-DEBUG-NEXT: [[OMP_LOOP_NEXT149]] = add nuw i32 [[OMP_LOOP_IV147]], 1, !dbg [[DBG217]] // CHECK-DEBUG-NEXT: br label [[OMP_LOOP_HEADER141]], !dbg [[DBG217]] // CHECK-DEBUG: omp_loop.body83: @@ -2375,8 +2387,8 @@ void parallel_for_2(float *r, int a, double b) { // CHECK-DEBUG: omp_loop.after121: // CHECK-DEBUG-NEXT: br label 
[[OMP_PAR_REGION103_PARALLEL_AFTER:%.*]], !dbg [[DBG244:![0-9]+]] // CHECK-DEBUG: omp.par.region103.parallel.after: -// CHECK-DEBUG-NEXT: br label [[OMP_PAR_PRE_FINALIZE104:%.*]] -// CHECK-DEBUG: omp.par.pre_finalize104: +// CHECK-DEBUG-NEXT: br label [[FINI134:%.*]] +// CHECK-DEBUG: .fini134: // CHECK-DEBUG-NEXT: br label [[OMP_PAR_OUTLINED_EXIT134_EXITSTUB:%.*]], !dbg [[DBG244]] // CHECK-DEBUG: omp_loop.body118: // CHECK-DEBUG-NEXT: [[TMP10:%.*]] = add i32 [[OMP_LOOP_IV122]], [[TMP6]], !dbg [[DBG243]] @@ -2460,6 +2472,8 @@ void parallel_for_2(float *r, int a, double b) { // CHECK-DEBUG: omp.par.region44.parallel.after: // CHECK-DEBUG-NEXT: br label [[OMP_PAR_PRE_FINALIZE45:%.*]] // CHECK-DEBUG: omp.par.pre_finalize45: +// CHECK-DEBUG-NEXT: br label [[FINI:%.*]] +// CHECK-DEBUG: .fini: // CHECK-DEBUG-NEXT: br label [[OMP_PAR_OUTLINED_EXIT_EXITSTUB:%.*]], !dbg [[DBG260]] // CHECK-DEBUG: omp_loop.body59: // CHECK-DEBUG-NEXT: [[TMP10:%.*]] = add i32 [[OMP_LOOP_IV63]], [[TMP6]], !dbg [[DBG259]] diff --git a/clang/test/OpenMP/masked_codegen.cpp b/clang/test/OpenMP/masked_codegen.cpp index a39de12d69337..bc6f68de9b248 100644 --- a/clang/test/OpenMP/masked_codegen.cpp +++ b/clang/test/OpenMP/masked_codegen.cpp @@ -35,6 +35,8 @@ int main() { // ALL-NEXT: store i8 2, ptr [[A_ADDR]] // IRBUILDER-NEXT: br label %[[AFTER:[^ ,]+]] // IRBUILDER: [[AFTER]] +// IRBUILDER-NEXT: br label %[[OMP_REGION_FINALIZE:[^ ,]+]] +// IRBUILDER: [[OMP_REGION_FINALIZE]] // ALL-NEXT: call {{.*}}void @__kmpc_end_masked(ptr [[DEFAULT_LOC]], i32 [[GTID]]) // ALL-NEXT: br label {{%?}}[[EXIT]] // ALL: [[EXIT]] diff --git a/clang/test/OpenMP/master_codegen.cpp b/clang/test/OpenMP/master_codegen.cpp index a7af326caacfe..5a92444d9a927 100644 --- a/clang/test/OpenMP/master_codegen.cpp +++ b/clang/test/OpenMP/master_codegen.cpp @@ -35,6 +35,8 @@ int main() { // ALL-NEXT: store i8 2, ptr [[A_ADDR]] // IRBUILDER-NEXT: br label %[[AFTER:[^ ,]+]] // IRBUILDER: [[AFTER]] +// IRBUILDER-NEXT: br label 
%[[OMP_REGION_FINALIZE:[^ ,]+]] +// IRBUILDER: [[OMP_REGION_FINALIZE]] // ALL-NEXT: call {{.*}}void @__kmpc_end_master(ptr [[DEFAULT_LOC]], i32 [[GTID]]) // ALL-NEXT: br label {{%?}}[[EXIT]] // ALL: [[EXIT]] diff --git a/clang/test/OpenMP/nested_loop_codegen.cpp b/clang/test/OpenMP/nested_loop_codegen.cpp index 9aefc6a739e51..e01fd0da31ee8 100644 --- a/clang/test/OpenMP/nested_loop_codegen.cpp +++ b/clang/test/OpenMP/nested_loop_codegen.cpp @@ -904,6 +904,8 @@ int inline_decl() { // CHECK4: omp.par.region.parallel.after: // CHECK4-NEXT: br label [[OMP_PAR_PRE_FINALIZE:%.*]] // CHECK4: omp.par.pre_finalize: +// CHECK4-NEXT: br label [[FINI:%.*]] +// CHECK4: .fini: // CHECK4-NEXT: br label [[OMP_PAR_OUTLINED_EXIT_EXITSTUB:%.*]], !dbg [[DBG27]] // CHECK4: for.body: // CHECK4-NEXT: store i32 0, ptr [[LOADGEP_K]], align 4, !dbg [[DBG28:![0-9]+]] @@ -1083,6 +1085,8 @@ int inline_decl() { // CHECK4: omp.par.region.parallel.after: // CHECK4-NEXT: br label [[OMP_PAR_PRE_FINALIZE:%.*]] // CHECK4: omp.par.pre_finalize: +// CHECK4-NEXT: br label [[FINI:%.*]] +// CHECK4: .fini: // CHECK4-NEXT: br label [[OMP_PAR_OUTLINED_EXIT_EXITSTUB:%.*]], !dbg [[DBG90]] // CHECK4: for.body: // CHECK4-NEXT: #dbg_declare(ptr [[K]], [[META91:![0-9]+]], !DIExpression(), [[META95:![0-9]+]]) diff --git a/clang/test/OpenMP/ordered_codegen.cpp b/clang/test/OpenMP/ordered_codegen.cpp index 5cd95f1927e5c..3b29feac7caa2 100644 --- a/clang/test/OpenMP/ordered_codegen.cpp +++ b/clang/test/OpenMP/ordered_codegen.cpp @@ -794,6 +794,8 @@ void foo_simd(int low, int up) { // CHECK1-IRBUILDER-NEXT: store float [[MUL8]], ptr [[ARRAYIDX10]], align 4 // CHECK1-IRBUILDER-NEXT: br label [[OMP_INNER_FOR_BODY_ORDERED_AFTER:%.*]] // CHECK1-IRBUILDER: omp.inner.for.body.ordered.after: +// CHECK1-IRBUILDER-NEXT: br label [[OMP_REGION_FINALIZE:%.*]] +// CHECK1-IRBUILDER: omp_region.finalize: // CHECK1-IRBUILDER-NEXT: call void @__kmpc_end_ordered(ptr @[[GLOB1]], i32 [[OMP_GLOBAL_THREAD_NUM2]]) // CHECK1-IRBUILDER-NEXT: 
br label [[OMP_BODY_CONTINUE:%.*]] // CHECK1-IRBUILDER: omp.body.continue: @@ -884,6 +886,8 @@ void foo_simd(int low, int up) { // CHECK1-IRBUILDER-NEXT: store float [[MUL7]], ptr [[ARRAYIDX8]], align 4 // CHECK1-IRBUILDER-NEXT: br label [[OMP_INNER_FOR_BODY_ORDERED_AFTER:%.*]] // CHECK1-IRBUILDER: omp.inner.for.body.ordered.after: +// CHECK1-IRBUILDER-NEXT: br label [[OMP_REGION_FINALIZE:%.*]] +// CHECK1-IRBUILDER: omp_region.finalize: // CHECK1-IRBUILDER-NEXT: call void @__kmpc_end_ordered(ptr @[[GLOB1]], i32 [[OMP_GLOBAL_THREAD_NUM3]]) // CHECK1-IRBUILDER-NEXT: br label [[OMP_BODY_CONTINUE:%.*]] // CHECK1-IRBUILDER: omp.body.continue: @@ -1022,6 +1026,8 @@ void foo_simd(int low, int up) { // CHECK1-IRBUILDER-NEXT: store float [[MUL29]], ptr [[ARRAYIDX31]], align 4 // CHECK1-IRBUILDER-NEXT: br label [[OMP_INNER_FOR_BODY_ORDERED_AFTER:%.*]] // CHECK1-IRBUILDER: omp.inner.for.body.ordered.after: +// CHECK1-IRBUILDER-NEXT: br label [[OMP_REGION_FINALIZE:%.*]] +// CHECK1-IRBUILDER: omp_region.finalize: // CHECK1-IRBUILDER-NEXT: call void @__kmpc_end_ordered(ptr @[[GLOB1]], i32 [[OMP_GLOBAL_THREAD_NUM23]]) // CHECK1-IRBUILDER-NEXT: br label [[OMP_BODY_CONTINUE:%.*]] // CHECK1-IRBUILDER: omp.body.continue: @@ -1131,6 +1137,8 @@ void foo_simd(int low, int up) { // CHECK1-IRBUILDER-NEXT: store float [[MUL14]], ptr [[ARRAYIDX16]], align 4 // CHECK1-IRBUILDER-NEXT: br label [[OMP_INNER_FOR_BODY_ORDERED_AFTER:%.*]] // CHECK1-IRBUILDER: omp.inner.for.body.ordered.after: +// CHECK1-IRBUILDER-NEXT: br label [[OMP_REGION_FINALIZE:%.*]] +// CHECK1-IRBUILDER: omp_region.finalize: // CHECK1-IRBUILDER-NEXT: call void @__kmpc_end_ordered(ptr @[[GLOB1]], i32 [[OMP_GLOBAL_THREAD_NUM8]]) // CHECK1-IRBUILDER-NEXT: br label [[OMP_BODY_CONTINUE:%.*]] // CHECK1-IRBUILDER: omp.body.continue: @@ -1296,17 +1304,19 @@ void foo_simd(int low, int up) { // CHECK1-IRBUILDER-NEXT: call void @__captured_stmt.1(ptr [[I28]]) // CHECK1-IRBUILDER-NEXT: br label [[OMP_INNER_FOR_BODY33_ORDERED_AFTER:%.*]] 
// CHECK1-IRBUILDER: omp.inner.for.body33.ordered.after: -// CHECK1-IRBUILDER-NEXT: br label [[OMP_BODY_CONTINUE38:%.*]] -// CHECK1-IRBUILDER: omp.body.continue38: -// CHECK1-IRBUILDER-NEXT: br label [[OMP_INNER_FOR_INC39:%.*]] -// CHECK1-IRBUILDER: omp.inner.for.inc39: +// CHECK1-IRBUILDER-NEXT: br label [[OMP_REGION_FINALIZE38:%.*]] +// CHECK1-IRBUILDER: omp_region.finalize38: +// CHECK1-IRBUILDER-NEXT: br label [[OMP_BODY_CONTINUE39:%.*]] +// CHECK1-IRBUILDER: omp.body.continue39: +// CHECK1-IRBUILDER-NEXT: br label [[OMP_INNER_FOR_INC40:%.*]] +// CHECK1-IRBUILDER: omp.inner.for.inc40: // CHECK1-IRBUILDER-NEXT: [[TMP32:%.*]] = load i32, ptr [[DOTOMP_IV16]], align 4 // CHECK1-IRBUILDER-NEXT: [[ADD40:%.*]] = add i32 [[TMP32]], 1 // CHECK1-IRBUILDER-NEXT: store i32 [[ADD40]], ptr [[DOTOMP_IV16]], align 4 // CHECK1-IRBUILDER-NEXT: [[OMP_GLOBAL_THREAD_NUM41:%.*]] = call i32 @__kmpc_global_thread_num(ptr @[[GLOB12]]) // CHECK1-IRBUILDER-NEXT: call void @__kmpc_dispatch_fini_4u(ptr @[[GLOB1]], i32 [[OMP_GLOBAL_THREAD_NUM41]]) // CHECK1-IRBUILDER-NEXT: br label [[OMP_INNER_FOR_COND30]], !llvm.loop [[LOOP5:![0-9]+]] -// CHECK1-IRBUILDER: omp.inner.for.end42: +// CHECK1-IRBUILDER: omp.inner.for.end43: // CHECK1-IRBUILDER-NEXT: br label [[OMP_DISPATCH_INC:%.*]] // CHECK1-IRBUILDER: omp.dispatch.inc: // CHECK1-IRBUILDER-NEXT: br label [[OMP_DISPATCH_COND]] @@ -2034,6 +2044,8 @@ void foo_simd(int low, int up) { // CHECK3-IRBUILDER-NEXT: store float [[MUL8]], ptr [[ARRAYIDX10]], align 4 // CHECK3-IRBUILDER-NEXT: br label [[OMP_INNER_FOR_BODY_ORDERED_AFTER:%.*]] // CHECK3-IRBUILDER: omp.inner.for.body.ordered.after: +// CHECK3-IRBUILDER-NEXT: br label [[OMP_REGION_FINALIZE:%.*]] +// CHECK3-IRBUILDER: omp_region.finalize: // CHECK3-IRBUILDER-NEXT: call void @__kmpc_end_ordered(ptr @[[GLOB1]], i32 [[OMP_GLOBAL_THREAD_NUM2]]) // CHECK3-IRBUILDER-NEXT: br label [[OMP_BODY_CONTINUE:%.*]] // CHECK3-IRBUILDER: omp.body.continue: @@ -2124,6 +2136,8 @@ void foo_simd(int low, int up) { 
// CHECK3-IRBUILDER-NEXT: store float [[MUL7]], ptr [[ARRAYIDX8]], align 4 // CHECK3-IRBUILDER-NEXT: br label [[OMP_INNER_FOR_BODY_ORDERED_AFTER:%.*]] // CHECK3-IRBUILDER: omp.inner.for.body.ordered.after: +// CHECK3-IRBUILDER-NEXT: br label [[OMP_REGION_FINALIZE:%.*]] +// CHECK3-IRBUILDER: omp_region.finalize: // CHECK3-IRBUILDER-NEXT: call void @__kmpc_end_ordered(ptr @[[GLOB1]], i32 [[OMP_GLOBAL_THREAD_NUM3]]) // CHECK3-IRBUILDER-NEXT: br label [[OMP_BODY_CONTINUE:%.*]] // CHECK3-IRBUILDER: omp.body.continue: @@ -2262,6 +2276,8 @@ void foo_simd(int low, int up) { // CHECK3-IRBUILDER-NEXT: store float [[MUL29]], ptr [[ARRAYIDX31]], align 4 // CHECK3-IRBUILDER-NEXT: br label [[OMP_INNER_FOR_BODY_ORDERED_AFTER:%.*]] // CHECK3-IRBUILDER: omp.inner.for.body.ordered.after: +// CHECK3-IRBUILDER-NEXT: br label [[OMP_REGION_FINALIZE:%.*]] +// CHECK3-IRBUILDER: omp_region.finalize: // CHECK3-IRBUILDER-NEXT: call void @__kmpc_end_ordered(ptr @[[GLOB1]], i32 [[OMP_GLOBAL_THREAD_NUM23]]) // CHECK3-IRBUILDER-NEXT: br label [[OMP_BODY_CONTINUE:%.*]] // CHECK3-IRBUILDER: omp.body.continue: @@ -2371,6 +2387,8 @@ void foo_simd(int low, int up) { // CHECK3-IRBUILDER-NEXT: store float [[MUL14]], ptr [[ARRAYIDX16]], align 4 // CHECK3-IRBUILDER-NEXT: br label [[OMP_INNER_FOR_BODY_ORDERED_AFTER:%.*]] // CHECK3-IRBUILDER: omp.inner.for.body.ordered.after: +// CHECK3-IRBUILDER-NEXT: br label [[OMP_REGION_FINALIZE:%.*]] +// CHECK3-IRBUILDER: omp_region.finalize: // CHECK3-IRBUILDER-NEXT: call void @__kmpc_end_ordered(ptr @[[GLOB1]], i32 [[OMP_GLOBAL_THREAD_NUM8]]) // CHECK3-IRBUILDER-NEXT: br label [[OMP_BODY_CONTINUE:%.*]] // CHECK3-IRBUILDER: omp.body.continue: @@ -2536,17 +2554,19 @@ void foo_simd(int low, int up) { // CHECK3-IRBUILDER-NEXT: call void @__captured_stmt.1(ptr [[I28]]) // CHECK3-IRBUILDER-NEXT: br label [[OMP_INNER_FOR_BODY33_ORDERED_AFTER:%.*]] // CHECK3-IRBUILDER: omp.inner.for.body33.ordered.after: -// CHECK3-IRBUILDER-NEXT: br label [[OMP_BODY_CONTINUE38:%.*]] -// 
CHECK3-IRBUILDER: omp.body.continue38: -// CHECK3-IRBUILDER-NEXT: br label [[OMP_INNER_FOR_INC39:%.*]] -// CHECK3-IRBUILDER: omp.inner.for.inc39: +// CHECK3-IRBUILDER-NEXT: br label [[OMP_REGION_FINALIZE38:%.*]] +// CHECK3-IRBUILDER: omp_region.finalize38: +// CHECK3-IRBUILDER-NEXT: br label [[OMP_BODY_CONTINUE39:%.*]] +// CHECK3-IRBUILDER: omp.body.continue39: +// CHECK3-IRBUILDER-NEXT: br label [[OMP_INNER_FOR_INC40:%.*]] +// CHECK3-IRBUILDER: omp.inner.for.inc40: // CHECK3-IRBUILDER-NEXT: [[TMP32:%.*]] = load i32, ptr [[DOTOMP_IV16]], align 4 // CHECK3-IRBUILDER-NEXT: [[ADD40:%.*]] = add i32 [[TMP32]], 1 // CHECK3-IRBUILDER-NEXT: store i32 [[ADD40]], ptr [[DOTOMP_IV16]], align 4 // CHECK3-IRBUILDER-NEXT: [[OMP_GLOBAL_THREAD_NUM41:%.*]] = call i32 @__kmpc_global_thread_num(ptr @[[GLOB12]]) // CHECK3-IRBUILDER-NEXT: call void @__kmpc_dispatch_fini_4u(ptr @[[GLOB1]], i32 [[OMP_GLOBAL_THREAD_NUM41]]) // CHECK3-IRBUILDER-NEXT: br label [[OMP_INNER_FOR_COND30]], !llvm.loop [[LOOP5:![0-9]+]] -// CHECK3-IRBUILDER: omp.inner.for.end42: +// CHECK3-IRBUILDER: omp.inner.for.end43: // CHECK3-IRBUILDER-NEXT: br label [[OMP_DISPATCH_INC:%.*]] // CHECK3-IRBUILDER: omp.dispatch.inc: // CHECK3-IRBUILDER-NEXT: br label [[OMP_DISPATCH_COND]] diff --git a/clang/test/OpenMP/parallel_codegen.cpp b/clang/test/OpenMP/parallel_codegen.cpp index e8e57aedaa164..9f6004e37db9c 100644 --- a/clang/test/OpenMP/parallel_codegen.cpp +++ b/clang/test/OpenMP/parallel_codegen.cpp @@ -906,6 +906,8 @@ int main (int argc, char **argv) { // CHECK4: omp.par.region.parallel.after: // CHECK4-NEXT: br label [[OMP_PAR_PRE_FINALIZE:%.*]] // CHECK4: omp.par.pre_finalize: +// CHECK4-NEXT: br label [[FINI:%.*]] +// CHECK4: .fini: // CHECK4-NEXT: br label [[OMP_PAR_OUTLINED_EXIT_EXITSTUB:%.*]], !dbg [[DBG35]] // CHECK4: omp.par.exit.exitStub: // CHECK4-NEXT: ret void @@ -975,6 +977,8 @@ int main (int argc, char **argv) { // CHECK4: omp.par.region.parallel.after: // CHECK4-NEXT: br label 
[[OMP_PAR_PRE_FINALIZE:%.*]] // CHECK4: omp.par.pre_finalize: +// CHECK4-NEXT: br label [[FINI:%.*]] +// CHECK4: .fini: // CHECK4-NEXT: br label [[OMP_PAR_OUTLINED_EXIT_EXITSTUB:%.*]], !dbg [[DBG66]] // CHECK4: omp.par.exit.exitStub: // CHECK4-NEXT: ret void diff --git a/flang/test/Integration/OpenMP/parallel-private-reduction-worstcase.f90 b/flang/test/Integration/OpenMP/parallel-private-reduction-worstcase.f90 index cf77c46346b7f..fd59d39b552da 100644 --- a/flang/test/Integration/OpenMP/parallel-private-reduction-worstcase.f90 +++ b/flang/test/Integration/OpenMP/parallel-private-reduction-worstcase.f90 @@ -174,10 +174,13 @@ subroutine worst_case(a, b, c, d) ! CHECK-NEXT: br label %omp.par.pre_finalize ! CHECK: omp.par.pre_finalize: ; preds = %reduce.finalize +! CHECK-NEXT: br label %.fini + +! CHECK: .fini: ! CHECK-NEXT: %{{.*}} = load ptr, ptr ! CHECK-NEXT: br label %omp.reduction.cleanup -! CHECK: omp.reduction.cleanup: ; preds = %omp.par.pre_finalize +! CHECK: omp.reduction.cleanup: ; preds = %.fini ! [null check] ! CHECK: br i1 %{{.*}}, label %omp.reduction.cleanup43, label %omp.reduction.cleanup44 diff --git a/llvm/include/llvm/Frontend/OpenMP/OMPIRBuilder.h b/llvm/include/llvm/Frontend/OpenMP/OMPIRBuilder.h index b801e212ceced..3efbdc4fe17d6 100644 --- a/llvm/include/llvm/Frontend/OpenMP/OMPIRBuilder.h +++ b/llvm/include/llvm/Frontend/OpenMP/OMPIRBuilder.h @@ -576,16 +576,33 @@ class OpenMPIRBuilder { using FinalizeCallbackTy = std::function; struct FinalizationInfo { - /// The finalization callback provided by the last in-flight invocation of - /// createXXXX for the directive of kind DK. - FinalizeCallbackTy FiniCB; - + FinalizationInfo(FinalizeCallbackTy FiniCB, omp::Directive DK, + bool IsCancellable) + : DK(DK), IsCancellable(IsCancellable), FiniCB(std::move(FiniCB)) {} /// The directive kind of the innermost directive that has an associated /// region which might require finalization when it is left. 
- omp::Directive DK; + const omp::Directive DK; /// Flag to indicate if the directive is cancellable. - bool IsCancellable; + const bool IsCancellable; + + /// The basic block to which control should be transferred to + /// implement the FiniCB. Memoized to avoid generating finalization + /// multiple times. + Expected getFiniBB(IRBuilderBase &Builder); + + /// For cases where there is an unavoidable existing finalization block + /// (e.g. loop finialization after omp sections). The existing finalization + /// block must not contain any non-finalization code. + Error mergeFiniBB(IRBuilderBase &Builder, BasicBlock *ExistingFiniBB); + + private: + /// Access via getFiniBB. + BasicBlock *FiniBB = nullptr; + + /// The finalization callback provided by the last in-flight invocation of + /// createXXXX for the directive of kind DK. + FinalizeCallbackTy FiniCB; }; /// Push a finalization callback on the finalization stack. @@ -2246,8 +2263,7 @@ class OpenMPIRBuilder { /// /// \return an error, if any were triggered during execution. LLVM_ABI Error emitCancelationCheckImpl(Value *CancelFlag, - omp::Directive CanceledDirective, - FinalizeCallbackTy ExitCB = {}); + omp::Directive CanceledDirective); /// Generate a target region entry call. /// @@ -3402,7 +3418,8 @@ class OpenMPIRBuilder { /// Common interface to finalize the region /// /// \param OMPD Directive to generate exiting code for - /// \param FinIP Insertion point for emitting Finalization code and exit call + /// \param FinIP Insertion point for emitting Finalization code and exit call. + /// This block must not contain any non-finalization code. 
/// \param ExitCall Call to the ending OMP Runtime Function /// \param HasFinalize indicate if the directive will require finalization /// and has a finalization callback in the stack that diff --git a/llvm/lib/Frontend/OpenMP/OMPIRBuilder.cpp b/llvm/lib/Frontend/OpenMP/OMPIRBuilder.cpp index cf88c4309974f..0d196be2ee696 100644 --- a/llvm/lib/Frontend/OpenMP/OMPIRBuilder.cpp +++ b/llvm/lib/Frontend/OpenMP/OMPIRBuilder.cpp @@ -682,6 +682,47 @@ OpenMPIRBuilder::getOrCreateRuntimeFunction(Module &M, RuntimeFunction FnID) { return {FnTy, Fn}; } +Expected +OpenMPIRBuilder::FinalizationInfo::getFiniBB(IRBuilderBase &Builder) { + if (!FiniBB) { + Function *ParentFunc = Builder.GetInsertBlock()->getParent(); + IRBuilderBase::InsertPointGuard Guard(Builder); + FiniBB = BasicBlock::Create(Builder.getContext(), ".fini", ParentFunc); + Builder.SetInsertPoint(FiniBB); + // FiniCB adds the branch to the exit stub. + if (Error Err = FiniCB(Builder.saveIP())) + return Err; + } + return FiniBB; +} + +Error OpenMPIRBuilder::FinalizationInfo::mergeFiniBB(IRBuilderBase &Builder, + BasicBlock *OtherFiniBB) { + // Simple case: FiniBB does not exist yet: re-use OtherFiniBB. + if (!FiniBB) { + FiniBB = OtherFiniBB; + + Builder.SetInsertPoint(FiniBB->getFirstNonPHIIt()); + if (Error Err = FiniCB(Builder.saveIP())) + return Err; + + return Error::success(); + } + + // Move instructions from FiniBB to the start of OtherFiniBB. 
+ auto EndIt = FiniBB->end(); + if (FiniBB->size() >= 1) + if (auto Prev = std::prev(EndIt); Prev->isTerminator()) + EndIt = Prev; + OtherFiniBB->splice(OtherFiniBB->getFirstNonPHIIt(), FiniBB, FiniBB->begin(), + EndIt); + + FiniBB->replaceAllUsesWith(OtherFiniBB); + FiniBB->eraseFromParent(); + FiniBB = OtherFiniBB; + return Error::success(); +} + Function *OpenMPIRBuilder::getOrCreateRuntimeFunctionPtr(RuntimeFunction FnID) { FunctionCallee RTLFn = getOrCreateRuntimeFunction(M, FnID); auto *Fn = dyn_cast(RTLFn.getCallee()); @@ -1129,21 +1170,9 @@ OpenMPIRBuilder::createCancel(const LocationDescription &Loc, Value *Args[] = {Ident, getOrCreateThreadID(Ident), CancelKind}; Value *Result = createRuntimeFunctionCall( getOrCreateRuntimeFunctionPtr(OMPRTL___kmpc_cancel), Args); - auto ExitCB = [this, CanceledDirective, Loc](InsertPointTy IP) -> Error { - if (CanceledDirective == OMPD_parallel) { - IRBuilder<>::InsertPointGuard IPG(Builder); - Builder.restoreIP(IP); - return createBarrier(LocationDescription(Builder.saveIP(), Loc.DL), - omp::Directive::OMPD_unknown, - /* ForceSimpleCall */ false, - /* CheckCancelFlag */ false) - .takeError(); - } - return Error::success(); - }; // The actual cancel logic is shared with others, e.g., cancel_barriers. - if (Error Err = emitCancelationCheckImpl(Result, CanceledDirective, ExitCB)) + if (Error Err = emitCancelationCheckImpl(Result, CanceledDirective)) return Err; // Update the insertion point and remove the terminator we introduced. 
@@ -1180,21 +1209,9 @@ OpenMPIRBuilder::createCancellationPoint(const LocationDescription &Loc, Value *Args[] = {Ident, getOrCreateThreadID(Ident), CancelKind}; Value *Result = createRuntimeFunctionCall( getOrCreateRuntimeFunctionPtr(OMPRTL___kmpc_cancellationpoint), Args); - auto ExitCB = [this, CanceledDirective, Loc](InsertPointTy IP) -> Error { - if (CanceledDirective == OMPD_parallel) { - IRBuilder<>::InsertPointGuard IPG(Builder); - Builder.restoreIP(IP); - return createBarrier(LocationDescription(Builder.saveIP(), Loc.DL), - omp::Directive::OMPD_unknown, - /* ForceSimpleCall */ false, - /* CheckCancelFlag */ false) - .takeError(); - } - return Error::success(); - }; // The actual cancel logic is shared with others, e.g., cancel_barriers. - if (Error Err = emitCancelationCheckImpl(Result, CanceledDirective, ExitCB)) + if (Error Err = emitCancelationCheckImpl(Result, CanceledDirective)) return Err; // Update the insertion point and remove the terminator we introduced. @@ -1298,8 +1315,7 @@ OpenMPIRBuilder::InsertPointOrErrorTy OpenMPIRBuilder::emitKernelLaunch( } Error OpenMPIRBuilder::emitCancelationCheckImpl( - Value *CancelFlag, omp::Directive CanceledDirective, - FinalizeCallbackTy ExitCB) { + Value *CancelFlag, omp::Directive CanceledDirective) { assert(isLastFinalizationInfoCancellable(CanceledDirective) && "Unexpected cancellation!"); @@ -1326,13 +1342,12 @@ Error OpenMPIRBuilder::emitCancelationCheckImpl( // From the cancellation block we finalize all variables and go to the // post finalization block that is known to the FiniCB callback. 
- Builder.SetInsertPoint(CancellationBlock); - if (ExitCB) - if (Error Err = ExitCB(Builder.saveIP())) - return Err; auto &FI = FinalizationStack.back(); - if (Error Err = FI.FiniCB(Builder.saveIP())) - return Err; + Expected FiniBBOrErr = FI.getFiniBB(Builder); + if (!FiniBBOrErr) + return FiniBBOrErr.takeError(); + Builder.SetInsertPoint(CancellationBlock); + Builder.CreateBr(*FiniBBOrErr); // The continuation block is where code generation continues. Builder.SetInsertPoint(NonCancellationBlock, NonCancellationBlock->begin()); @@ -1821,8 +1836,18 @@ OpenMPIRBuilder::InsertPointOrErrorTy OpenMPIRBuilder::createParallel( Instruction *PRegPreFiniTI = PRegPreFiniBB->getTerminator(); InsertPointTy PreFiniIP(PRegPreFiniBB, PRegPreFiniTI->getIterator()); - if (Error Err = FiniCB(PreFiniIP)) - return Err; + Expected FiniBBOrErr = FiniInfo.getFiniBB(Builder); + if (!FiniBBOrErr) + return FiniBBOrErr.takeError(); + { + IRBuilderBase::InsertPointGuard Guard(Builder); + Builder.restoreIP(PreFiniIP); + Builder.CreateBr(*FiniBBOrErr); + // There's currently a branch to omp.par.exit. Delete it. We will get there + // via the fini block + if (Instruction *Term = Builder.GetInsertBlock()->getTerminator()) + Term->eraseFromParent(); + } // Register the outlined info. addOutlineInfo(std::move(OI)); @@ -2258,23 +2283,7 @@ OpenMPIRBuilder::InsertPointOrErrorTy OpenMPIRBuilder::createSections( if (!updateToLocation(Loc)) return Loc.IP; - // FiniCBWrapper needs to create a branch to the loop finalization block, but - // this has not been created yet at some times when this callback runs. - SmallVector CancellationBranches; - auto FiniCBWrapper = [&](InsertPointTy IP) { - if (IP.getBlock()->end() != IP.getPoint()) - return FiniCB(IP); - // This must be done otherwise any nested constructs using FinalizeOMPRegion - // will fail because that function requires the Finalization Basic Block to - // have a terminator, which is already removed by EmitOMPRegionBody. 
- // IP is currently at cancelation block. - BranchInst *DummyBranch = Builder.CreateBr(IP.getBlock()); - IP = InsertPointTy(DummyBranch->getParent(), DummyBranch->getIterator()); - CancellationBranches.push_back(DummyBranch); - return FiniCB(IP); - }; - - FinalizationStack.push_back({FiniCBWrapper, OMPD_sections, IsCancellable}); + FinalizationStack.push_back({FiniCB, OMPD_sections, IsCancellable}); // Each section is emitted as a switch case // Each finalization callback is handled from clang.EmitOMPSectionDirective() @@ -2340,20 +2349,8 @@ OpenMPIRBuilder::InsertPointOrErrorTy OpenMPIRBuilder::createSections( auto FiniInfo = FinalizationStack.pop_back_val(); assert(FiniInfo.DK == OMPD_sections && "Unexpected finalization stack state!"); - if (FinalizeCallbackTy &CB = FiniInfo.FiniCB) { - Builder.restoreIP(AfterIP); - BasicBlock *FiniBB = - splitBBWithSuffix(Builder, /*CreateBranch=*/true, "sections.fini"); - if (Error Err = CB(Builder.saveIP())) - return Err; - AfterIP = {FiniBB, FiniBB->begin()}; - } - - // Now we can fix the dummy branch to point to the right place - for (BranchInst *DummyBranch : CancellationBranches) { - assert(DummyBranch->getNumSuccessors() == 1); - DummyBranch->setSuccessor(0, LoopFini); - } + if (Error Err = FiniInfo.mergeFiniBB(Builder, LoopFini)) + return Err; return AfterIP; } @@ -6718,9 +6715,6 @@ OpenMPIRBuilder::InsertPointOrErrorTy OpenMPIRBuilder::EmitOMPInlinedRegion( emitCommonDirectiveExit(OMPD, FinIP, ExitCall, HasFinalize); if (!AfterIP) return AfterIP.takeError(); - assert(FiniBB->getUniquePredecessor()->getUniqueSuccessor() == FiniBB && - "Unexpected Control Flow State!"); - MergeBlockIntoPredecessor(FiniBB); // If we are skipping the region of a non conditional, remove the exit // block, and clear the builder's insertion point. 
@@ -6780,14 +6774,12 @@ OpenMPIRBuilder::InsertPointOrErrorTy OpenMPIRBuilder::emitCommonDirectiveExit( FinalizationInfo Fi = FinalizationStack.pop_back_val(); assert(Fi.DK == OMPD && "Unexpected Directive for Finalization call!"); - if (Error Err = Fi.FiniCB(FinIP)) - return Err; - - BasicBlock *FiniBB = FinIP.getBlock(); - Instruction *FiniBBTI = FiniBB->getTerminator(); + if (Error Err = Fi.mergeFiniBB(Builder, FinIP.getBlock())) + return std::move(Err); - // set Builder IP for call creation - Builder.SetInsertPoint(FiniBBTI); + // Exit condition: insertion point is before the terminator of the new Fini + // block + Builder.SetInsertPoint(FinIP.getBlock()->getTerminator()); } if (!ExitCall) diff --git a/llvm/test/Transforms/OpenMP/parallel_region_merging.ll b/llvm/test/Transforms/OpenMP/parallel_region_merging.ll index 83452e72b56b9..1bbac5cc3154b 100644 --- a/llvm/test/Transforms/OpenMP/parallel_region_merging.ll +++ b/llvm/test/Transforms/OpenMP/parallel_region_merging.ll @@ -4880,6 +4880,8 @@ entry: ; CHECK2: omp.par.merged.split: ; CHECK2-NEXT: br label [[OMP_REGION_BODY_SPLIT:%.*]] ; CHECK2: omp_region.body.split: +; CHECK2-NEXT: br label [[OMP_REGION_FINALIZE:%.*]] +; CHECK2: omp_region.finalize: ; CHECK2-NEXT: call void @__kmpc_end_master(ptr @[[GLOB2]], i32 [[OMP_GLOBAL_THREAD_NUM]]) ; CHECK2-NEXT: br label [[OMP_REGION_END]] ; CHECK2: omp.par.exit.exitStub: @@ -4974,6 +4976,8 @@ entry: ; CHECK2: omp.par.merged.split: ; CHECK2-NEXT: br label [[OMP_REGION_BODY_SPLIT:%.*]] ; CHECK2: omp_region.body.split: +; CHECK2-NEXT: br label [[OMP_REGION_FINALIZE:%.*]] +; CHECK2: omp_region.finalize: ; CHECK2-NEXT: call void @__kmpc_end_master(ptr @[[GLOB2]], i32 [[OMP_GLOBAL_THREAD_NUM]]) ; CHECK2-NEXT: br label [[OMP_REGION_END]] ; CHECK2: omp.par.exit.exitStub: @@ -5070,6 +5074,8 @@ entry: ; CHECK2: omp.par.merged.split: ; CHECK2-NEXT: br label [[OMP_REGION_BODY_SPLIT:%.*]] ; CHECK2: omp_region.body.split: +; CHECK2-NEXT: br label [[OMP_REGION_FINALIZE:%.*]] +; 
CHECK2: omp_region.finalize: ; CHECK2-NEXT: call void @__kmpc_end_master(ptr @[[GLOB2]], i32 [[OMP_GLOBAL_THREAD_NUM]]) ; CHECK2-NEXT: br label [[OMP_REGION_END]] ; CHECK2: omp.par.exit.exitStub: @@ -5157,6 +5163,8 @@ entry: ; CHECK2: omp.par.merged.split: ; CHECK2-NEXT: br label [[OMP_REGION_BODY_SPLIT:%.*]] ; CHECK2: omp_region.body.split: +; CHECK2-NEXT: br label [[OMP_REGION_FINALIZE:%.*]] +; CHECK2: omp_region.finalize: ; CHECK2-NEXT: call void @__kmpc_end_master(ptr @[[GLOB2]], i32 [[OMP_GLOBAL_THREAD_NUM]]) ; CHECK2-NEXT: br label [[OMP_REGION_END]] ; CHECK2: omp.par.exit.exitStub: @@ -5254,6 +5262,8 @@ entry: ; CHECK2: omp.par.merged.split: ; CHECK2-NEXT: br label [[OMP_REGION_BODY_SPLIT:%.*]] ; CHECK2: omp_region.body.split: +; CHECK2-NEXT: br label [[OMP_REGION_FINALIZE:%.*]] +; CHECK2: omp_region.finalize: ; CHECK2-NEXT: call void @__kmpc_end_master(ptr @[[GLOB2]], i32 [[OMP_GLOBAL_THREAD_NUM]]) ; CHECK2-NEXT: br label [[OMP_REGION_END]] ; CHECK2: omp.par.exit.exitStub: @@ -5434,6 +5444,8 @@ entry: ; CHECK2: omp.par.merged.split: ; CHECK2-NEXT: br label [[OMP_REGION_BODY_SPLIT:%.*]] ; CHECK2: omp_region.body.split: +; CHECK2-NEXT: br label [[OMP_REGION_FINALIZE:%.*]] +; CHECK2: omp_region.finalize: ; CHECK2-NEXT: call void @__kmpc_end_master(ptr @[[GLOB2]], i32 [[OMP_GLOBAL_THREAD_NUM]]) ; CHECK2-NEXT: br label [[OMP_REGION_END]] ; CHECK2: omp.par.exit.exitStub: @@ -5624,8 +5636,10 @@ entry: ; CHECK2: omp.par.region.split: ; CHECK2-NEXT: br label [[OMP_PAR_PRE_FINALIZE:%.*]] ; CHECK2: omp.par.pre_finalize: -; CHECK2-NEXT: br label [[OMP_PAR_OUTLINED_EXIT_EXITSTUB:%.*]] -; CHECK2: omp_region.body5: +; CHECK2-NEXT: br label [[FINI:%.*]] +; CHECK2: .fini: +; CHECK2-NEXT: br label [[OMP_PAR_EXIT_EXITSTUB:.*]] +; CHECK2: omp_region.body6: ; CHECK2-NEXT: br label [[SEQ_PAR_MERGED2:%.*]] ; CHECK2: seq.par.merged2: ; CHECK2-NEXT: [[ADD_SEQ_OUTPUT_LOAD:%.*]] = load i32, ptr [[LOADGEP_ADD_SEQ_OUTPUT_ALLOC]], align 4 @@ -5634,7 +5648,9 @@ entry: ; CHECK2-NEXT: br 
label [[OMP_PAR_MERGED_SPLIT_SPLIT_SPLIT:%.*]] ; CHECK2: omp.par.merged.split.split.split: ; CHECK2-NEXT: br label [[OMP_REGION_BODY5_SPLIT:%.*]] -; CHECK2: omp_region.body5.split: +; CHECK2: omp_region.body6.split: +; CHECK2-NEXT: br label [[OMP_REGION_FINALIZE5:%.*]] +; CHECK2: omp_region.finalize{{.*}}: ; CHECK2-NEXT: call void @__kmpc_end_master(ptr @[[GLOB2]], i32 [[OMP_GLOBAL_THREAD_NUM3]]) ; CHECK2-NEXT: br label [[OMP_REGION_END4]] ; CHECK2: omp_region.body: @@ -5646,6 +5662,8 @@ entry: ; CHECK2: omp.par.merged.split: ; CHECK2-NEXT: br label [[OMP_REGION_BODY_SPLIT:%.*]] ; CHECK2: omp_region.body.split: +; CHECK2-NEXT: br label [[OMP_REGION_FINALIZE:%.*]] +; CHECK2: omp_region.finalize: ; CHECK2-NEXT: call void @__kmpc_end_master(ptr @[[GLOB2]], i32 [[OMP_GLOBAL_THREAD_NUM]]) ; CHECK2-NEXT: br label [[OMP_REGION_END]] ; CHECK2: omp.par.exit.exitStub: diff --git a/llvm/unittests/Frontend/OpenMPIRBuilderTest.cpp b/llvm/unittests/Frontend/OpenMPIRBuilderTest.cpp index 1f35b7a5cfaa4..dab0a46eeb3bc 100644 --- a/llvm/unittests/Frontend/OpenMPIRBuilderTest.cpp +++ b/llvm/unittests/Frontend/OpenMPIRBuilderTest.cpp @@ -428,8 +428,8 @@ TEST_F(OpenMPIRBuilderTest, CreateCancel) { OMPBuilder.createCancel(Loc, nullptr, OMPD_parallel)); Builder.restoreIP(NewIP); EXPECT_FALSE(M->global_empty()); - EXPECT_EQ(M->size(), 4U); - EXPECT_EQ(F->size(), 4U); + EXPECT_EQ(M->size(), 3U); + EXPECT_EQ(F->size(), 5U); EXPECT_EQ(BB->size(), 4U); CallInst *GTID = dyn_cast(&BB->front()); @@ -449,23 +449,16 @@ TEST_F(OpenMPIRBuilderTest, CreateCancel) { Instruction *CancelBBTI = Cancel->getParent()->getTerminator(); EXPECT_EQ(CancelBBTI->getNumSuccessors(), 2U); EXPECT_EQ(CancelBBTI->getSuccessor(0), NewIP.getBlock()); - EXPECT_EQ(CancelBBTI->getSuccessor(1)->size(), 3U); - CallInst *GTID1 = dyn_cast(&CancelBBTI->getSuccessor(1)->front()); - EXPECT_NE(GTID1, nullptr); - EXPECT_EQ(GTID1->arg_size(), 1U); - EXPECT_EQ(GTID1->getCalledFunction()->getName(), "__kmpc_global_thread_num"); - 
EXPECT_FALSE(GTID1->getCalledFunction()->doesNotAccessMemory()); - EXPECT_FALSE(GTID1->getCalledFunction()->doesNotFreeMemory()); - CallInst *Barrier = dyn_cast(GTID1->getNextNode()); - EXPECT_NE(Barrier, nullptr); - EXPECT_EQ(Barrier->arg_size(), 2U); - EXPECT_EQ(Barrier->getCalledFunction()->getName(), "__kmpc_cancel_barrier"); - EXPECT_FALSE(Barrier->getCalledFunction()->doesNotAccessMemory()); - EXPECT_FALSE(Barrier->getCalledFunction()->doesNotFreeMemory()); - EXPECT_TRUE(Barrier->use_empty()); + EXPECT_EQ(CancelBBTI->getSuccessor(1)->size(), 1U); EXPECT_EQ(CancelBBTI->getSuccessor(1)->getTerminator()->getNumSuccessors(), 1U); - EXPECT_EQ(CancelBBTI->getSuccessor(1)->getTerminator()->getSuccessor(0), CBB); + // cancel branch instruction (1) -> .cncl -> .fini -> CBB + EXPECT_EQ(CancelBBTI->getSuccessor(1) + ->getTerminator() + ->getSuccessor(0) + ->getTerminator() + ->getSuccessor(0), + CBB); EXPECT_EQ(cast(Cancel)->getArgOperand(1), GTID); @@ -497,8 +490,8 @@ TEST_F(OpenMPIRBuilderTest, CreateCancelIfCond) { OMPBuilder.createCancel(Loc, Builder.getTrue(), OMPD_parallel)); Builder.restoreIP(NewIP); EXPECT_FALSE(M->global_empty()); - EXPECT_EQ(M->size(), 4U); - EXPECT_EQ(F->size(), 7U); + EXPECT_EQ(M->size(), 3U); + EXPECT_EQ(F->size(), 8U); EXPECT_EQ(BB->size(), 1U); ASSERT_TRUE(isa(BB->getTerminator())); ASSERT_EQ(BB->getTerminator()->getNumSuccessors(), 2U); @@ -524,23 +517,15 @@ TEST_F(OpenMPIRBuilderTest, CreateCancelIfCond) { EXPECT_EQ(CancelBBTI->getSuccessor(0)->size(), 1U); EXPECT_EQ(CancelBBTI->getSuccessor(0)->getUniqueSuccessor(), NewIP.getBlock()); - EXPECT_EQ(CancelBBTI->getSuccessor(1)->size(), 3U); - CallInst *GTID1 = dyn_cast(&CancelBBTI->getSuccessor(1)->front()); - EXPECT_NE(GTID1, nullptr); - EXPECT_EQ(GTID1->arg_size(), 1U); - EXPECT_EQ(GTID1->getCalledFunction()->getName(), "__kmpc_global_thread_num"); - EXPECT_FALSE(GTID1->getCalledFunction()->doesNotAccessMemory()); - EXPECT_FALSE(GTID1->getCalledFunction()->doesNotFreeMemory()); - 
CallInst *Barrier = dyn_cast(GTID1->getNextNode()); - EXPECT_NE(Barrier, nullptr); - EXPECT_EQ(Barrier->arg_size(), 2U); - EXPECT_EQ(Barrier->getCalledFunction()->getName(), "__kmpc_cancel_barrier"); - EXPECT_FALSE(Barrier->getCalledFunction()->doesNotAccessMemory()); - EXPECT_FALSE(Barrier->getCalledFunction()->doesNotFreeMemory()); - EXPECT_TRUE(Barrier->use_empty()); + EXPECT_EQ(CancelBBTI->getSuccessor(1)->size(), 1U); EXPECT_EQ(CancelBBTI->getSuccessor(1)->getTerminator()->getNumSuccessors(), 1U); - EXPECT_EQ(CancelBBTI->getSuccessor(1)->getTerminator()->getSuccessor(0), CBB); + EXPECT_EQ(CancelBBTI->getSuccessor(1) + ->getTerminator() + ->getSuccessor(0) + ->getTerminator() + ->getSuccessor(0), + CBB); EXPECT_EQ(cast(Cancel)->getArgOperand(1), GTID); @@ -572,7 +557,7 @@ TEST_F(OpenMPIRBuilderTest, CreateCancelBarrier) { Builder.restoreIP(NewIP); EXPECT_FALSE(M->global_empty()); EXPECT_EQ(M->size(), 3U); - EXPECT_EQ(F->size(), 4U); + EXPECT_EQ(F->size(), 5U); EXPECT_EQ(BB->size(), 4U); CallInst *GTID = dyn_cast(&BB->front()); @@ -595,7 +580,11 @@ TEST_F(OpenMPIRBuilderTest, CreateCancelBarrier) { EXPECT_EQ(BarrierBBTI->getSuccessor(1)->size(), 1U); EXPECT_EQ(BarrierBBTI->getSuccessor(1)->getTerminator()->getNumSuccessors(), 1U); - EXPECT_EQ(BarrierBBTI->getSuccessor(1)->getTerminator()->getSuccessor(0), + EXPECT_EQ(BarrierBBTI->getSuccessor(1) + ->getTerminator() + ->getSuccessor(0) + ->getTerminator() + ->getSuccessor(0), CBB); EXPECT_EQ(cast(Barrier)->getArgOperand(1), GTID); @@ -1291,8 +1280,8 @@ TEST_F(OpenMPIRBuilderTest, ParallelCancelBarrier) { EXPECT_EQ(NumBodiesGenerated, 1U); EXPECT_EQ(NumPrivatizedVars, 0U); - EXPECT_EQ(NumFinalizationPoints, 2U); - EXPECT_TRUE(FakeDestructor->hasNUses(2)); + EXPECT_EQ(NumFinalizationPoints, 1U); + EXPECT_TRUE(FakeDestructor->hasNUses(1)); Builder.restoreIP(AfterIP); Builder.CreateRetVoid(); @@ -2916,7 +2905,8 @@ TEST_F(OpenMPIRBuilderTest, MasterDirective) { BranchInst *EntryBr = cast(EntryBB->getTerminator()); 
EXPECT_TRUE(EntryBr->isConditional()); EXPECT_EQ(EntryBr->getSuccessor(0), ThenBB); - BasicBlock *ExitBB = ThenBB->getUniqueSuccessor(); + BasicBlock *FinalizeBB = ThenBB->getUniqueSuccessor(); + BasicBlock *ExitBB = FinalizeBB->getUniqueSuccessor(); EXPECT_EQ(EntryBr->getSuccessor(1), ExitBB); CmpInst *CondInst = cast(EntryBr->getCondition()); @@ -2928,7 +2918,7 @@ TEST_F(OpenMPIRBuilderTest, MasterDirective) { EXPECT_TRUE(isa(MasterEntryCI->getArgOperand(0))); CallInst *MasterEndCI = nullptr; - for (auto &FI : *ThenBB) { + for (auto &FI : *FinalizeBB) { Instruction *cur = &FI; if (isa(cur)) { MasterEndCI = cast(cur); @@ -2998,7 +2988,8 @@ TEST_F(OpenMPIRBuilderTest, MaskedDirective) { BranchInst *EntryBr = cast(EntryBB->getTerminator()); EXPECT_TRUE(EntryBr->isConditional()); EXPECT_EQ(EntryBr->getSuccessor(0), ThenBB); - BasicBlock *ExitBB = ThenBB->getUniqueSuccessor(); + BasicBlock *FinalizeBB = ThenBB->getUniqueSuccessor(); + BasicBlock *ExitBB = FinalizeBB->getUniqueSuccessor(); EXPECT_EQ(EntryBr->getSuccessor(1), ExitBB); CmpInst *CondInst = cast(EntryBr->getCondition()); @@ -3010,7 +3001,7 @@ TEST_F(OpenMPIRBuilderTest, MaskedDirective) { EXPECT_TRUE(isa(MaskedEntryCI->getArgOperand(0))); CallInst *MaskedEndCI = nullptr; - for (auto &FI : *ThenBB) { + for (auto &FI : *FinalizeBB) { Instruction *cur = &FI; if (isa(cur)) { MaskedEndCI = cast(cur); @@ -3062,6 +3053,9 @@ TEST_F(OpenMPIRBuilderTest, CriticalDirective) { FINICB_WRAPPER(FiniCB), "testCRT", nullptr)); Builder.restoreIP(AfterIP); + BasicBlock *FinalizeBB = EntryBB->getUniqueSuccessor(); + EXPECT_NE(FinalizeBB, nullptr); + CallInst *CriticalEntryCI = nullptr; for (auto &EI : *EntryBB) { Instruction *cur = &EI; @@ -3078,7 +3072,7 @@ TEST_F(OpenMPIRBuilderTest, CriticalDirective) { EXPECT_TRUE(isa(CriticalEntryCI->getArgOperand(0))); CallInst *CriticalEndCI = nullptr; - for (auto &FI : *EntryBB) { + for (auto &FI : *FinalizeBB) { Instruction *cur = &FI; if (isa(cur)) { CriticalEndCI = cast(cur); @@ 
-3312,6 +3306,9 @@ TEST_F(OpenMPIRBuilderTest, OrderedDirectiveThreads) { FINICB_WRAPPER(FiniCB), true)); Builder.restoreIP(AfterIP); + BasicBlock *FinalizeBB = EntryBB->getUniqueSuccessor(); + EXPECT_NE(FinalizeBB, nullptr); + Builder.CreateRetVoid(); OMPBuilder.finalize(); EXPECT_FALSE(verifyModule(*M, &errs())); @@ -3334,7 +3331,7 @@ TEST_F(OpenMPIRBuilderTest, OrderedDirectiveThreads) { EXPECT_TRUE(isa(OrderedEntryCI->getArgOperand(0))); CallInst *OrderedEndCI = nullptr; - for (auto &FI : *EntryBB) { + for (auto &FI : *FinalizeBB) { Instruction *Cur = &FI; if (isa(Cur)) { OrderedEndCI = cast(Cur); @@ -3508,7 +3505,8 @@ TEST_F(OpenMPIRBuilderTest, SingleDirective) { BranchInst *EntryBr = cast(EntryBB->getTerminator()); EXPECT_TRUE(EntryBr->isConditional()); EXPECT_EQ(EntryBr->getSuccessor(0), ThenBB); - BasicBlock *ExitBB = ThenBB->getUniqueSuccessor(); + BasicBlock *FinalizeBB = ThenBB->getUniqueSuccessor(); + BasicBlock *ExitBB = FinalizeBB->getUniqueSuccessor(); EXPECT_EQ(EntryBr->getSuccessor(1), ExitBB); CmpInst *CondInst = cast(EntryBr->getCondition()); @@ -3520,7 +3518,7 @@ TEST_F(OpenMPIRBuilderTest, SingleDirective) { EXPECT_TRUE(isa(SingleEntryCI->getArgOperand(0))); CallInst *SingleEndCI = nullptr; - for (auto &FI : *ThenBB) { + for (auto &FI : *FinalizeBB) { Instruction *cur = &FI; if (isa(cur)) { SingleEndCI = cast(cur); @@ -3601,7 +3599,8 @@ TEST_F(OpenMPIRBuilderTest, SingleDirectiveNowait) { BranchInst *EntryBr = cast(EntryBB->getTerminator()); EXPECT_TRUE(EntryBr->isConditional()); EXPECT_EQ(EntryBr->getSuccessor(0), ThenBB); - BasicBlock *ExitBB = ThenBB->getUniqueSuccessor(); + BasicBlock *FinalizeBB = ThenBB->getUniqueSuccessor(); + BasicBlock *ExitBB = FinalizeBB->getUniqueSuccessor(); EXPECT_EQ(EntryBr->getSuccessor(1), ExitBB); CmpInst *CondInst = cast(EntryBr->getCondition()); @@ -3613,7 +3612,7 @@ TEST_F(OpenMPIRBuilderTest, SingleDirectiveNowait) { EXPECT_TRUE(isa(SingleEntryCI->getArgOperand(0))); CallInst *SingleEndCI = nullptr; - for 
(auto &FI : *ThenBB) { + for (auto &FI : *FinalizeBB) { Instruction *cur = &FI; if (isa(cur)) { SingleEndCI = cast(cur); @@ -3724,7 +3723,8 @@ TEST_F(OpenMPIRBuilderTest, SingleDirectiveCopyPrivate) { BranchInst *EntryBr = cast(EntryBB->getTerminator()); EXPECT_TRUE(EntryBr->isConditional()); EXPECT_EQ(EntryBr->getSuccessor(0), ThenBB); - BasicBlock *ExitBB = ThenBB->getUniqueSuccessor(); + BasicBlock *FinalizeBB = ThenBB->getUniqueSuccessor(); + BasicBlock *ExitBB = FinalizeBB->getUniqueSuccessor(); EXPECT_EQ(EntryBr->getSuccessor(1), ExitBB); CmpInst *CondInst = cast(EntryBr->getCondition()); @@ -3743,25 +3743,28 @@ TEST_F(OpenMPIRBuilderTest, SingleDirectiveCopyPrivate) { EXPECT_EQ(PrivLI->getPointerOperand(), PrivAI); // icmp EXPECT_TRUE(ThenBBI.next()); + + // check FinalizeBB + BBInstIter FinalizeBBI(FinalizeBB); // store 1, DidIt - auto *DidItSI = ThenBBI.next(); + auto *DidItSI = FinalizeBBI.next(); EXPECT_NE(DidItSI, nullptr); EXPECT_EQ(DidItSI->getValueOperand(), ConstantInt::get(Type::getInt32Ty(Ctx), 1)); Value *DidIt = DidItSI->getPointerOperand(); // call __kmpc_end_single - auto *SingleEndCI = ThenBBI.next(); + auto *SingleEndCI = FinalizeBBI.next(); EXPECT_NE(SingleEndCI, nullptr); EXPECT_EQ(SingleEndCI->getCalledFunction()->getName(), "__kmpc_end_single"); EXPECT_EQ(SingleEndCI->arg_size(), 2U); EXPECT_TRUE(isa(SingleEndCI->getArgOperand(0))); EXPECT_EQ(SingleEndCI->getArgOperand(1), SingleEntryCI->getArgOperand(1)); // br ExitBB - auto *ExitBBBI = ThenBBI.next(); + auto *ExitBBBI = FinalizeBBI.next(); EXPECT_NE(ExitBBBI, nullptr); EXPECT_TRUE(ExitBBBI->isUnconditional()); EXPECT_EQ(ExitBBBI->getOperand(0), ExitBB); - EXPECT_FALSE(ThenBBI.hasNext()); + EXPECT_FALSE(FinalizeBBI.hasNext()); // check ExitBB BBInstIter ExitBBI(ExitBB); diff --git a/mlir/lib/Target/LLVMIR/Dialect/OpenMP/OpenMPToLLVMIRTranslation.cpp b/mlir/lib/Target/LLVMIR/Dialect/OpenMP/OpenMPToLLVMIRTranslation.cpp index 0d5b553c8e652..cdab9f87a8758 100644 --- 
a/mlir/lib/Target/LLVMIR/Dialect/OpenMP/OpenMPToLLVMIRTranslation.cpp +++ b/mlir/lib/Target/LLVMIR/Dialect/OpenMP/OpenMPToLLVMIRTranslation.cpp @@ -2729,6 +2729,7 @@ convertOmpParallel(omp::ParallelOp opInst, llvm::IRBuilderBase &builder, ArrayRef isByRef = getIsByRef(opInst.getReductionByref()); assert(isByRef.size() == opInst.getNumReductionVars()); llvm::OpenMPIRBuilder *ompBuilder = moduleTranslation.getOpenMPBuilder(); + bool isCancellable = constructIsCancellable(opInst); if (failed(checkImplementationStatus(*opInst))) return failure(); @@ -2867,6 +2868,18 @@ convertOmpParallel(omp::ParallelOp opInst, llvm::IRBuilderBase &builder, privateVarsInfo.privatizers))) return llvm::make_error(); + // If we could be performing cancellation, add the cancellation barrier on + // the way out of the outlined region. + if (isCancellable) { + auto IPOrErr = ompBuilder->createBarrier( + llvm::OpenMPIRBuilder::LocationDescription(builder), + llvm::omp::Directive::OMPD_unknown, + /* ForceSimpleCall */ false, + /* CheckCancelFlag */ false); + if (!IPOrErr) + return IPOrErr.takeError(); + } + builder.restoreIP(oldIP); return llvm::Error::success(); }; @@ -2880,7 +2893,6 @@ convertOmpParallel(omp::ParallelOp opInst, llvm::IRBuilderBase &builder, auto pbKind = llvm::omp::OMP_PROC_BIND_default; if (auto bind = opInst.getProcBindKind()) pbKind = getProcBindKind(*bind); - bool isCancellable = constructIsCancellable(opInst); llvm::OpenMPIRBuilder::InsertPointTy allocaIP = findAllocaInsertPoint(builder, moduleTranslation); diff --git a/mlir/test/Target/LLVMIR/openmp-barrier-cancel.mlir b/mlir/test/Target/LLVMIR/openmp-barrier-cancel.mlir index c4b245667a1f3..6585549de7f96 100644 --- a/mlir/test/Target/LLVMIR/openmp-barrier-cancel.mlir +++ b/mlir/test/Target/LLVMIR/openmp-barrier-cancel.mlir @@ -29,22 +29,24 @@ llvm.func @test() { // CHECK: %[[VAL_14:.*]] = icmp eq i32 %[[VAL_13]], 0 // CHECK: br i1 %[[VAL_14]], label %[[VAL_15:.*]], label %[[VAL_16:.*]] // CHECK: omp.par.region1.cncl: 
; preds = %[[VAL_11]] -// CHECK: %[[VAL_17:.*]] = call i32 @__kmpc_global_thread_num(ptr @1) -// CHECK: %[[VAL_18:.*]] = call i32 @__kmpc_cancel_barrier(ptr @2, i32 %[[VAL_17]]) -// CHECK: br label %[[VAL_19:.*]] +// CHECK: br label %[[FINI:.*]] +// CHECK: .fini: +// CHECK: %[[TID:.*]] = call i32 @__kmpc_global_thread_num(ptr @1) +// CHECK: %[[CNCL_BARRIER:.*]] = call i32 @__kmpc_cancel_barrier(ptr @2, i32 %[[TID]]) +// CHECK: br label %[[EXIT_STUB:.*]] // CHECK: omp.par.region1.split: ; preds = %[[VAL_11]] // CHECK: %[[VAL_20:.*]] = call i32 @__kmpc_global_thread_num(ptr @1) // CHECK: %[[VAL_21:.*]] = call i32 @__kmpc_cancel_barrier(ptr @3, i32 %[[VAL_20]]) // CHECK: %[[VAL_22:.*]] = icmp eq i32 %[[VAL_21]], 0 // CHECK: br i1 %[[VAL_22]], label %[[VAL_23:.*]], label %[[VAL_24:.*]] // CHECK: omp.par.region1.split.cncl: ; preds = %[[VAL_15]] -// CHECK: br label %[[VAL_19]] +// CHECK: br label %[[FINI]] // CHECK: omp.par.region1.split.cont: ; preds = %[[VAL_15]] // CHECK: br label %[[VAL_25:.*]] // CHECK: omp.region.cont: ; preds = %[[VAL_23]] // CHECK: br label %[[VAL_26:.*]] // CHECK: omp.par.pre_finalize: ; preds = %[[VAL_25]] -// CHECK: br label %[[VAL_19]] -// CHECK: omp.par.exit.exitStub: ; preds = %[[VAL_26]], %[[VAL_24]], %[[VAL_16]] +// CHECK: br label %[[FINI]] +// CHECK: omp.par.exit.exitStub: // CHECK: ret void diff --git a/mlir/test/Target/LLVMIR/openmp-cancel.mlir b/mlir/test/Target/LLVMIR/openmp-cancel.mlir index 21241702ad569..5e20b8793f499 100644 --- a/mlir/test/Target/LLVMIR/openmp-cancel.mlir +++ b/mlir/test/Target/LLVMIR/openmp-cancel.mlir @@ -24,16 +24,18 @@ llvm.func @cancel_parallel() { // CHECK: %[[VAL_15:.*]] = icmp eq i32 %[[VAL_14]], 0 // CHECK: br i1 %[[VAL_15]], label %[[VAL_16:.*]], label %[[VAL_17:.*]] // CHECK: omp.par.region1.cncl: ; preds = %[[VAL_12]] +// CHECK: br label %[[VAL_20:.*]] +// CHECK: .fini: // CHECK: %[[VAL_18:.*]] = call i32 @__kmpc_global_thread_num(ptr @1) // CHECK: %[[VAL_19:.*]] = call i32 
@__kmpc_cancel_barrier(ptr @2, i32 %[[VAL_18]]) -// CHECK: br label %[[VAL_20:.*]] +// CHECK: br label %[[EXIT_STUB:.*]] // CHECK: omp.par.region1.split: ; preds = %[[VAL_12]] // CHECK: br label %[[VAL_21:.*]] // CHECK: omp.region.cont: ; preds = %[[VAL_16]] // CHECK: br label %[[VAL_22:.*]] // CHECK: omp.par.pre_finalize: ; preds = %[[VAL_21]] // CHECK: br label %[[VAL_20]] -// CHECK: omp.par.exit.exitStub: ; preds = %[[VAL_22]], %[[VAL_17]] +// CHECK: omp.par.exit.exitStub: // CHECK: ret void llvm.func @cancel_parallel_if(%arg0 : i1) { @@ -67,18 +69,20 @@ llvm.func @cancel_parallel_if(%arg0 : i1) { // CHECK: br label %[[VAL_26:.*]] // CHECK: omp.par.pre_finalize: ; preds = %[[VAL_25]] // CHECK: br label %[[VAL_27:.*]] -// CHECK: 5: ; preds = %[[VAL_20]] +// CHECK: .fini: +// CHECK: %[[VAL_32:.*]] = call i32 @__kmpc_global_thread_num(ptr @1) +// CHECK: %[[VAL_33:.*]] = call i32 @__kmpc_cancel_barrier(ptr @2, i32 %[[VAL_32]]) +// CHECK: br label %[[EXIT_STUB:.*]] +// CHECK: 6: ; preds = %[[VAL_20]] // CHECK: %[[VAL_28:.*]] = call i32 @__kmpc_global_thread_num(ptr @1) // CHECK: %[[VAL_29:.*]] = call i32 @__kmpc_cancel(ptr @1, i32 %[[VAL_28]], i32 1) // CHECK: %[[VAL_30:.*]] = icmp eq i32 %[[VAL_29]], 0 // CHECK: br i1 %[[VAL_30]], label %[[VAL_24]], label %[[VAL_31:.*]] // CHECK: .cncl: ; preds = %[[VAL_21]] -// CHECK: %[[VAL_32:.*]] = call i32 @__kmpc_global_thread_num(ptr @1) -// CHECK: %[[VAL_33:.*]] = call i32 @__kmpc_cancel_barrier(ptr @2, i32 %[[VAL_32]]) // CHECK: br label %[[VAL_27]] // CHECK: .split: ; preds = %[[VAL_21]] // CHECK: br label %[[VAL_23]] -// CHECK: omp.par.exit.exitStub: ; preds = %[[VAL_31]], %[[VAL_26]] +// CHECK: omp.par.exit.exitStub: // CHECK: ret void llvm.func @cancel_sections_if(%cond : i1) { @@ -145,14 +149,12 @@ llvm.func @cancel_sections_if(%cond : i1) { // CHECK: omp_section_loop.inc: ; preds = %[[VAL_23]] // CHECK: %[[VAL_15]] = add nuw i32 %[[VAL_14]], 1 // CHECK: br label %[[VAL_12]] -// CHECK: omp_section_loop.exit: ; preds = 
%[[VAL_33]], %[[VAL_16]] +// CHECK: omp_section_loop.exit: // CHECK: call void @__kmpc_for_static_fini(ptr @1, i32 %[[VAL_7]]) // CHECK: %[[VAL_36:.*]] = call i32 @__kmpc_global_thread_num(ptr @1) // CHECK: call void @__kmpc_barrier(ptr @2, i32 %[[VAL_36]]) // CHECK: br label %[[VAL_37:.*]] // CHECK: omp_section_loop.after: ; preds = %[[VAL_19]] -// CHECK: br label %[[VAL_38:.*]] -// CHECK: omp_section_loop.aftersections.fini: ; preds = %[[VAL_37]] // CHECK: ret void // CHECK: .cncl: ; preds = %[[VAL_27]] // CHECK: br label %[[VAL_19]] @@ -232,7 +234,7 @@ llvm.func @cancel_wsloop_if(%lb : i32, %ub : i32, %step : i32, %cond : i1) { // CHECK: omp_loop.inc: ; preds = %[[VAL_52]] // CHECK: %[[VAL_34]] = add nuw i32 %[[VAL_33]], 1 // CHECK: br label %[[VAL_31]] -// CHECK: omp_loop.exit: ; preds = %[[VAL_50]], %[[VAL_35]] +// CHECK: omp_loop.exit: // CHECK: call void @__kmpc_for_static_fini(ptr @1, i32 %[[VAL_26]]) // CHECK: %[[VAL_53:.*]] = call i32 @__kmpc_global_thread_num(ptr @1) // CHECK: call void @__kmpc_barrier(ptr @2, i32 %[[VAL_53]]) diff --git a/mlir/test/Target/LLVMIR/openmp-cancellation-point.mlir b/mlir/test/Target/LLVMIR/openmp-cancellation-point.mlir index 5e0d3f9f7e293..93fa2064ab99a 100644 --- a/mlir/test/Target/LLVMIR/openmp-cancellation-point.mlir +++ b/mlir/test/Target/LLVMIR/openmp-cancellation-point.mlir @@ -24,16 +24,18 @@ llvm.func @cancellation_point_parallel() { // CHECK: %[[VAL_15:.*]] = icmp eq i32 %[[VAL_14]], 0 // CHECK: br i1 %[[VAL_15]], label %[[VAL_16:.*]], label %[[VAL_17:.*]] // CHECK: omp.par.region1.cncl: ; preds = %[[VAL_12]] +// CHECK: br label %[[FINI:.*]] +// CHECK: .fini: // CHECK: %[[VAL_18:.*]] = call i32 @__kmpc_global_thread_num(ptr @1) // CHECK: %[[VAL_19:.*]] = call i32 @__kmpc_cancel_barrier(ptr @2, i32 %[[VAL_18]]) -// CHECK: br label %[[VAL_20:.*]] +// CHECK: br label %[[EXIT_STUB:.*]] // CHECK: omp.par.region1.split: ; preds = %[[VAL_12]] // CHECK: br label %[[VAL_21:.*]] // CHECK: omp.region.cont: ; preds = 
%[[VAL_16]] // CHECK: br label %[[VAL_22:.*]] // CHECK: omp.par.pre_finalize: ; preds = %[[VAL_21]] -// CHECK: br label %[[VAL_20]] -// CHECK: omp.par.exit.exitStub: ; preds = %[[VAL_22]], %[[VAL_17]] +// CHECK: br label %[[FINI]] +// CHECK: omp.par.exit.exitStub: // CHECK: ret void llvm.func @cancellation_point_sections() { @@ -94,14 +96,12 @@ llvm.func @cancellation_point_sections() { // CHECK: omp_section_loop.inc: ; preds = %[[VAL_46]] // CHECK: %[[VAL_38]] = add nuw i32 %[[VAL_37]], 1 // CHECK: br label %[[VAL_35]] -// CHECK: omp_section_loop.exit: ; preds = %[[VAL_53]], %[[VAL_39]] +// CHECK: omp_section_loop.exit: // CHECK: call void @__kmpc_for_static_fini(ptr @1, i32 %[[VAL_30]]) // CHECK: %[[VAL_55:.*]] = call i32 @__kmpc_global_thread_num(ptr @1) // CHECK: call void @__kmpc_barrier(ptr @2, i32 %[[VAL_55]]) // CHECK: br label %[[VAL_56:.*]] // CHECK: omp_section_loop.after: ; preds = %[[VAL_42]] -// CHECK: br label %[[VAL_57:.*]] -// CHECK: omp_section_loop.aftersections.fini: ; preds = %[[VAL_56]] // CHECK: ret void // CHECK: omp.section.region.cncl: ; preds = %[[VAL_48]] // CHECK: br label %[[VAL_42]] @@ -175,7 +175,7 @@ llvm.func @cancellation_point_wsloop(%lb : i32, %ub : i32, %step : i32) { // CHECK: omp_loop.inc: ; preds = %[[VAL_106]] // CHECK: %[[VAL_92]] = add nuw i32 %[[VAL_91]], 1 // CHECK: br label %[[VAL_89]] -// CHECK: omp_loop.exit: ; preds = %[[VAL_105]], %[[VAL_93]] +// CHECK: omp_loop.exit: // CHECK: call void @__kmpc_for_static_fini(ptr @1, i32 %[[VAL_84]]) // CHECK: %[[VAL_107:.*]] = call i32 @__kmpc_global_thread_num(ptr @1) // CHECK: call void @__kmpc_barrier(ptr @2, i32 %[[VAL_107]]) diff --git a/mlir/test/Target/LLVMIR/openmp-outline-infinite-loop.mlir b/mlir/test/Target/LLVMIR/openmp-outline-infinite-loop.mlir index faccfc678adfe..99f37c7e79be8 100644 --- a/mlir/test/Target/LLVMIR/openmp-outline-infinite-loop.mlir +++ b/mlir/test/Target/LLVMIR/openmp-outline-infinite-loop.mlir @@ -21,9 +21,11 @@ llvm.func @parallel_infinite_loop() 
-> () { // CHECK: omp.region.cont: ; No predecessors! // CHECK: br label %[[VAL_4:.*]] // CHECK: omp.par.pre_finalize: ; preds = %[[VAL_5:.*]] -// CHECK: br label %[[VAL_6:.*]] -// CHECK: omp.par.exit: ; preds = %[[VAL_4]] +// CHECK: br label %[[FINI:.*]] +// CHECK: [[OMP_PAR_EXIT:omp.par.exit]]: ; preds = %[[FINI]] // CHECK: ret void +// CHECK: [[FINI]]: +// CHECK: br label %[[OMP_PAR_EXIT]] // CHECK: } // CHECK-LABEL: define internal void @parallel_infinite_loop..omp_par( diff --git a/mlir/test/Target/LLVMIR/openmp-parallel-reduction-multiblock.mlir b/mlir/test/Target/LLVMIR/openmp-parallel-reduction-multiblock.mlir index 887d2977e45cc..c79c369b69d7f 100644 --- a/mlir/test/Target/LLVMIR/openmp-parallel-reduction-multiblock.mlir +++ b/mlir/test/Target/LLVMIR/openmp-parallel-reduction-multiblock.mlir @@ -108,6 +108,8 @@ llvm.func @missordered_blocks_(%arg0: !llvm.ptr {fir.bindc_name = "x"}, %arg1: ! // CHECK: reduce.finalize: ; preds = %[[VAL_49]], %[[VAL_43]] // CHECK: br label %[[VAL_53:.*]] // CHECK: omp.par.pre_finalize: ; preds = %[[VAL_48]] +// CHECK: br label %[[FINI:.*]] +// CHECK: .fini: // CHECK: %[[VAL_54:.*]] = load ptr, ptr %[[VAL_20]], align 8 // CHECK: %[[VAL_55:.*]] = load ptr, ptr %[[VAL_21]], align 8 // CHECK: br label %[[VAL_56:.*]] @@ -115,5 +117,5 @@ llvm.func @missordered_blocks_(%arg0: !llvm.ptr {fir.bindc_name = "x"}, %arg1: ! 
// CHECK: br label %[[VAL_38]] // CHECK: omp.reduction.neutral1: ; preds = %[[VAL_25]] // CHECK: br label %[[VAL_30]] -// CHECK: omp.par.exit.exitStub: ; preds = %[[VAL_53]] +// CHECK: omp.par.exit.exitStub: ; preds = %[[FINI]] // CHECK: ret void diff --git a/mlir/test/Target/LLVMIR/openmp-reduction-array-sections.mlir b/mlir/test/Target/LLVMIR/openmp-reduction-array-sections.mlir index b302b4b20edd5..13f52f054869e 100644 --- a/mlir/test/Target/LLVMIR/openmp-reduction-array-sections.mlir +++ b/mlir/test/Target/LLVMIR/openmp-reduction-array-sections.mlir @@ -127,8 +127,6 @@ llvm.func @sectionsreduction_(%arg0: !llvm.ptr {fir.bindc_name = "x"}) attribute // CHECK: call void @__kmpc_barrier(ptr @2, i32 %[[VAL_36]]) // CHECK: br label %[[VAL_37:.*]] // CHECK: omp_section_loop.after: ; preds = %[[VAL_35]] -// CHECK: br label %[[VAL_38:.*]] -// CHECK: omp_section_loop.aftersections.fini: ; preds = %[[VAL_37]] // CHECK: %[[VAL_39:.*]] = getelementptr inbounds [1 x ptr], ptr %[[VAL_14]], i64 0, i64 0 // CHECK: store ptr %[[VAL_21]], ptr %[[VAL_39]], align 8 // CHECK: %[[VAL_40:.*]] = call i32 @__kmpc_global_thread_num(ptr @1) @@ -137,9 +135,9 @@ llvm.func @sectionsreduction_(%arg0: !llvm.ptr {fir.bindc_name = "x"}) attribute // CHECK: i32 1, label %[[VAL_43:.*]] // CHECK: i32 2, label %[[VAL_44:.*]] // CHECK: ] -// CHECK: reduce.switch.atomic: ; preds = %[[VAL_38]] +// CHECK: reduce.switch.atomic: ; preds = %[[VAL_37]] // CHECK: unreachable -// CHECK: reduce.switch.nonatomic: ; preds = %[[VAL_38]] +// CHECK: reduce.switch.nonatomic: ; preds = %[[VAL_37]] // CHECK: %[[VAL_45:.*]] = load ptr, ptr %[[VAL_21]], align 8 // CHECK: br label %[[VAL_46:.*]] // CHECK: omp.reduction.nonatomic.body: ; preds = %[[VAL_43]] @@ -157,7 +155,7 @@ llvm.func @sectionsreduction_(%arg0: !llvm.ptr {fir.bindc_name = "x"}) attribute // CHECK: omp.reduction.nonatomic.body17: ; preds = %[[VAL_47]] // CHECK: %[[VAL_50]] = sub i64 %[[VAL_49]], 1 // CHECK: br label %[[VAL_47]] -// CHECK: 
reduce.finalize: ; preds = %[[VAL_53]], %[[VAL_38]] +// CHECK: reduce.finalize: ; preds = %[[VAL_53]], %[[VAL_37]] // CHECK: %[[VAL_55:.*]] = call i32 @__kmpc_global_thread_num(ptr @1) // CHECK: call void @__kmpc_barrier(ptr @2, i32 %[[VAL_55]]) // CHECK: %[[VAL_56:.*]] = load ptr, ptr %[[VAL_21]], align 8 @@ -173,7 +171,9 @@ llvm.func @sectionsreduction_(%arg0: !llvm.ptr {fir.bindc_name = "x"}) attribute // CHECK: omp.region.cont: ; preds = %[[VAL_62]] // CHECK: br label %[[VAL_64:.*]] // CHECK: omp.par.pre_finalize: ; preds = %[[VAL_63]] -// CHECK: br label %[[VAL_65:.*]] +// CHECK: br label %[[FINI:.fini.*]] +// CHECK: [[FINI]]: +// CHECK: br label %[[EXIT:.*]] // CHECK: omp.reduction.cleanup21: ; preds = %[[VAL_57]] // CHECK: br label %[[VAL_61]] // CHECK: omp_section_loop.body: ; preds = %[[VAL_32]] @@ -219,5 +219,5 @@ llvm.func @sectionsreduction_(%arg0: !llvm.ptr {fir.bindc_name = "x"}) attribute // CHECK: omp_section_loop.inc: ; preds = %[[VAL_69]] // CHECK: %[[VAL_31]] = add nuw i32 %[[VAL_30]], 1 // CHECK: br label %[[VAL_28]] -// CHECK: omp.par.exit.exitStub: ; preds = %[[VAL_64]] +// CHECK: omp.par.exit.exitStub: ; preds = %[[FINI]] // CHECK: ret void diff --git a/mlir/test/Target/LLVMIR/openmp-reduction-init-arg.mlir b/mlir/test/Target/LLVMIR/openmp-reduction-init-arg.mlir index a714ca68a1e95..cb30d3b2f4473 100644 --- a/mlir/test/Target/LLVMIR/openmp-reduction-init-arg.mlir +++ b/mlir/test/Target/LLVMIR/openmp-reduction-init-arg.mlir @@ -96,8 +96,10 @@ module { // CHECK: reduce.finalize: ; preds = %[[VAL_34]], %[[VAL_28]] // CHECK: br label %[[VAL_38:.*]] // CHECK: omp.par.pre_finalize: ; preds = %[[VAL_33]] +// CHECK: br label %[[FINI:.*]] +// CHECK: [[FINI]]: // CHECK: br label %[[VAL_39:.*]] -// CHECK: omp.par.exit.exitStub: ; preds = %[[VAL_38]] +// CHECK: omp.par.exit.exitStub: ; preds = %[[FINI]] // CHECK: ret void // CHECK: %[[VAL_40:.*]] = getelementptr inbounds [2 x ptr], ptr %[[VAL_41:.*]], i64 0, i64 0 // CHECK: %[[VAL_42:.*]] = load ptr, 
ptr %[[VAL_40]], align 8 diff --git a/mlir/test/Target/LLVMIR/openmp-reduction-sections.mlir b/mlir/test/Target/LLVMIR/openmp-reduction-sections.mlir index 19da6f8517fcd..00f6c1b02206e 100644 --- a/mlir/test/Target/LLVMIR/openmp-reduction-sections.mlir +++ b/mlir/test/Target/LLVMIR/openmp-reduction-sections.mlir @@ -86,8 +86,6 @@ llvm.func @sections_(%arg0: !llvm.ptr {fir.bindc_name = "x"}) attributes {fir.in // CHECK: call void @__kmpc_barrier(ptr @2, i32 %[[VAL_40]]) // CHECK: br label %[[VAL_41:.*]] // CHECK: omp_section_loop.after: ; preds = %[[VAL_39]] -// CHECK: br label %[[VAL_42:.*]] -// CHECK: omp_section_loop.aftersections.fini: ; preds = %[[VAL_41]] // CHECK: %[[VAL_43:.*]] = getelementptr inbounds [1 x ptr], ptr %[[VAL_21]], i64 0, i64 0 // CHECK: store ptr %[[VAL_20]], ptr %[[VAL_43]], align 8 // CHECK: %[[VAL_44:.*]] = call i32 @__kmpc_global_thread_num(ptr @1) @@ -96,23 +94,25 @@ llvm.func @sections_(%arg0: !llvm.ptr {fir.bindc_name = "x"}) attributes {fir.in // CHECK: i32 1, label %[[VAL_47:.*]] // CHECK: i32 2, label %[[VAL_48:.*]] // CHECK: ] -// CHECK: reduce.switch.atomic: ; preds = %[[VAL_42]] +// CHECK: reduce.switch.atomic: ; preds = %[[VAL_41]] // CHECK: unreachable -// CHECK: reduce.switch.nonatomic: ; preds = %[[VAL_42]] +// CHECK: reduce.switch.nonatomic: ; preds = %[[VAL_41]] // CHECK: %[[VAL_49:.*]] = load float, ptr %[[VAL_11]], align 4 // CHECK: %[[VAL_50:.*]] = load float, ptr %[[VAL_20]], align 4 // CHECK: %[[VAL_51:.*]] = fadd contract float %[[VAL_49]], %[[VAL_50]] // CHECK: store float %[[VAL_51]], ptr %[[VAL_11]], align 4 // CHECK: call void @__kmpc_end_reduce(ptr @1, i32 %[[VAL_44]], ptr @.gomp_critical_user_.reduction.var) // CHECK: br label %[[VAL_46]] -// CHECK: reduce.finalize: ; preds = %[[VAL_47]], %[[VAL_42]] +// CHECK: reduce.finalize: ; preds = %[[VAL_47]], %[[VAL_41]] // CHECK: %[[VAL_52:.*]] = call i32 @__kmpc_global_thread_num(ptr @1) // CHECK: call void @__kmpc_barrier(ptr @2, i32 %[[VAL_52]]) // CHECK: br label 
%[[VAL_53:.*]] // CHECK: omp.region.cont: ; preds = %[[VAL_46]] // CHECK: br label %[[VAL_54:.*]] // CHECK: omp.par.pre_finalize: ; preds = %[[VAL_53]] -// CHECK: br label %[[VAL_55:.*]] +// CHECK: br label %[[FINI:.fini.*]] +// CHECK: [[FINI]]: +// CHECK: br label %[[EXIT:.*]] // CHECK: omp_section_loop.body: ; preds = %[[VAL_36]] // CHECK: %[[VAL_56:.*]] = add i32 %[[VAL_34]], %[[VAL_28]] // CHECK: %[[VAL_57:.*]] = mul i32 %[[VAL_56]], 1 @@ -144,8 +144,10 @@ llvm.func @sections_(%arg0: !llvm.ptr {fir.bindc_name = "x"}) attributes {fir.in // CHECK: omp_section_loop.inc: ; preds = %[[VAL_59]] // CHECK: %[[VAL_35]] = add nuw i32 %[[VAL_34]], 1 // CHECK: br label %[[VAL_32]] -// CHECK: omp.par.exit.exitStub: ; preds = %[[VAL_54]] +// CHECK: omp.par.exit.exitStub: ; preds = %[[FINI]] // CHECK: ret void + +// CHECK-LABEL: define internal void @.omp.reduction.func // CHECK: %[[VAL_70:.*]] = getelementptr inbounds [1 x ptr], ptr %[[VAL_71:.*]], i64 0, i64 0 // CHECK: %[[VAL_72:.*]] = load ptr, ptr %[[VAL_70]], align 8 // CHECK: %[[VAL_73:.*]] = load float, ptr %[[VAL_72]], align 4 From 2c9e9ffa77e37fa0ff5d15325dab5471636b8a44 Mon Sep 17 00:00:00 2001 From: Luke Lau Date: Mon, 1 Dec 2025 18:29:21 +0800 Subject: [PATCH 21/39] [SCCP] Handle llvm.experimental.get.vector.length calls (#169527) As noted in the reproducer provided in https://github.com/llvm/llvm-project/issues/164762#issuecomment-3554719231, on RISC-V after LTO we sometimes have trip counts exposed to vectorized loops. The loop vectorizer will have generated calls to @llvm.experimental.get.vector.length, but there are [some properties](https://llvm.org/docs/LangRef.html#id2399) about the intrinsic we can use to simplify it: - The result is always less than both Count and MaxLanes - If Count <= MaxLanes, then the result is Count This teaches SCCP to handle these cases with the intrinsic, which allows some single-iteration-after-LTO loops to be unfolded. 
#169293 is related and also simplifies the intrinsic in InstCombine via computeKnownBits, but it can't fully remove the loop since computeKnownBits only does limited reasoning on recurrences. --- llvm/lib/Transforms/Utils/SCCPSolver.cpp | 32 ++++ .../SCCP/get_vector_length-intrinsic.ll | 147 ++++++++++++++++++ 2 files changed, 179 insertions(+) create mode 100644 llvm/test/Transforms/SCCP/get_vector_length-intrinsic.ll diff --git a/llvm/lib/Transforms/Utils/SCCPSolver.cpp b/llvm/lib/Transforms/Utils/SCCPSolver.cpp index 4947d03a2dc66..951bf1ca62fc2 100644 --- a/llvm/lib/Transforms/Utils/SCCPSolver.cpp +++ b/llvm/lib/Transforms/Utils/SCCPSolver.cpp @@ -2098,6 +2098,38 @@ void SCCPInstVisitor::handleCallResult(CallBase &CB) { return (void)mergeInValue(ValueState[II], II, ValueLatticeElement::getRange(Result)); } + if (II->getIntrinsicID() == Intrinsic::experimental_get_vector_length) { + Value *CountArg = II->getArgOperand(0); + Value *VF = II->getArgOperand(1); + bool Scalable = cast(II->getArgOperand(2))->isOne(); + + // Computation happens in the larger type. + unsigned BitWidth = std::max(CountArg->getType()->getScalarSizeInBits(), + VF->getType()->getScalarSizeInBits()); + + ConstantRange Count = getValueState(CountArg) + .asConstantRange(CountArg->getType(), false) + .zextOrTrunc(BitWidth); + ConstantRange MaxLanes = getValueState(VF) + .asConstantRange(VF->getType(), false) + .zextOrTrunc(BitWidth); + if (Scalable) + MaxLanes = + MaxLanes.multiply(getVScaleRange(II->getFunction(), BitWidth)); + + // The result is always less than both Count and MaxLanes. 
+ ConstantRange Result( + APInt::getZero(BitWidth), + APIntOps::umin(Count.getUpper(), MaxLanes.getUpper())); + + // If Count <= MaxLanes, getvectorlength(Count, MaxLanes) = Count + if (Count.icmp(CmpInst::ICMP_ULE, MaxLanes)) + Result = Count; + + Result = Result.zextOrTrunc(II->getType()->getScalarSizeInBits()); + return (void)mergeInValue(ValueState[II], II, + ValueLatticeElement::getRange(Result)); + } if (ConstantRange::isIntrinsicSupported(II->getIntrinsicID())) { // Compute result range for intrinsics supported by ConstantRange. diff --git a/llvm/test/Transforms/SCCP/get_vector_length-intrinsic.ll b/llvm/test/Transforms/SCCP/get_vector_length-intrinsic.ll new file mode 100644 index 0000000000000..d0741161e729e --- /dev/null +++ b/llvm/test/Transforms/SCCP/get_vector_length-intrinsic.ll @@ -0,0 +1,147 @@ +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 6 +; RUN: opt < %s -p sccp -S | FileCheck %s + +define i1 @result_le_count() { +; CHECK-LABEL: define i1 @result_le_count() { +; CHECK-NEXT: ret i1 true +; + %x = call i32 @llvm.experimental.get.vector.length(i32 3, i32 4, i1 false) + %res = icmp ule i32 %x, 3 + ret i1 %res +} + +define i1 @result_le_max_lanes(i32 %count) { +; CHECK-LABEL: define i1 @result_le_max_lanes( +; CHECK-SAME: i32 [[COUNT:%.*]]) { +; CHECK-NEXT: [[X:%.*]] = call i32 @llvm.experimental.get.vector.length.i32(i32 [[COUNT]], i32 3, i1 false) +; CHECK-NEXT: ret i1 true +; + %x = call i32 @llvm.experimental.get.vector.length(i32 %count, i32 3, i1 false) + %res = icmp ule i32 %x, 3 + ret i1 %res +} + +define i1 @result_le_max_lanes_scalable(i32 %count) vscale_range(2, 4) { +; CHECK-LABEL: define i1 @result_le_max_lanes_scalable( +; CHECK-SAME: i32 [[COUNT:%.*]]) #[[ATTR0:[0-9]+]] { +; CHECK-NEXT: [[X:%.*]] = call i32 @llvm.experimental.get.vector.length.i32(i32 [[COUNT]], i32 4, i1 true) +; CHECK-NEXT: ret i1 true +; + %x = call i32 @llvm.experimental.get.vector.length(i32 %count, i32 4, i1 true) 
+ %res = icmp ule i32 %x, 16 + ret i1 %res +} + +define i32 @count_le_max_lanes() { +; CHECK-LABEL: define i32 @count_le_max_lanes() { +; CHECK-NEXT: [[ENTRY:.*:]] +; CHECK-NEXT: br label %[[LOOP:.*]] +; CHECK: [[LOOP]]: +; CHECK-NEXT: br label %[[EXIT:.*]] +; CHECK: [[EXIT]]: +; CHECK-NEXT: ret i32 4 +; +entry: + br label %loop + +loop: + %iv = phi i32 [4, %entry], [%iv.next, %loop] + %x = call i32 @llvm.experimental.get.vector.length(i32 %iv, i32 4, i1 false) + %iv.next = sub i32 %iv, %x + %ec = icmp eq i32 %iv.next, 0 + br i1 %ec, label %exit, label %loop + +exit: + ret i32 %x +} + +; Can't simplify because %iv isn't <= max lanes. +define i32 @count_not_le_max_lanes() { +; CHECK-LABEL: define range(i32 0, 5) i32 @count_not_le_max_lanes() { +; CHECK-NEXT: [[ENTRY:.*]]: +; CHECK-NEXT: br label %[[LOOP:.*]] +; CHECK: [[LOOP]]: +; CHECK-NEXT: [[IV:%.*]] = phi i32 [ 6, %[[ENTRY]] ], [ [[IV_NEXT:%.*]], %[[LOOP]] ] +; CHECK-NEXT: [[X:%.*]] = call i32 @llvm.experimental.get.vector.length.i32(i32 [[IV]], i32 4, i1 false) +; CHECK-NEXT: [[IV_NEXT]] = sub i32 [[IV]], [[X]] +; CHECK-NEXT: [[EC:%.*]] = icmp eq i32 [[IV_NEXT]], 0 +; CHECK-NEXT: br i1 [[EC]], label %[[EXIT:.*]], label %[[LOOP]] +; CHECK: [[EXIT]]: +; CHECK-NEXT: ret i32 [[X]] +; +entry: + br label %loop + +loop: + %iv = phi i32 [6, %entry], [%iv.next, %loop] + %x = call i32 @llvm.experimental.get.vector.length(i32 %iv, i32 4, i1 false) + %iv.next = sub i32 %iv, %x + %ec = icmp eq i32 %iv.next, 0 + br i1 %ec, label %exit, label %loop + +exit: + ret i32 %x +} + +define i32 @count_le_max_lanes_scalable_known() vscale_range(4, 8) { +; CHECK-LABEL: define i32 @count_le_max_lanes_scalable_known( +; CHECK-SAME: ) #[[ATTR1:[0-9]+]] { +; CHECK-NEXT: [[ENTRY:.*:]] +; CHECK-NEXT: br label %[[LOOP:.*]] +; CHECK: [[LOOP]]: +; CHECK-NEXT: br label %[[EXIT:.*]] +; CHECK: [[EXIT]]: +; CHECK-NEXT: ret i32 16 +; +entry: + br label %loop + +loop: + %iv = phi i32 [16, %entry], [%iv.next, %loop] + %x = call i32 
@llvm.experimental.get.vector.length(i32 %iv, i32 4, i1 true) + %iv.next = sub i32 %iv, %x + %ec = icmp eq i32 %iv.next, 0 + br i1 %ec, label %exit, label %loop + +exit: + ret i32 %x +} + +; Can't simplify because %iv isn't guaranteed <= max lanes. +define i32 @count_le_max_lanes_scalable_unknown() { +; CHECK-LABEL: define range(i32 0, -1) i32 @count_le_max_lanes_scalable_unknown() { +; CHECK-NEXT: [[ENTRY:.*]]: +; CHECK-NEXT: br label %[[LOOP:.*]] +; CHECK: [[LOOP]]: +; CHECK-NEXT: [[IV:%.*]] = phi i32 [ 16, %[[ENTRY]] ], [ [[IV_NEXT:%.*]], %[[LOOP]] ] +; CHECK-NEXT: [[X:%.*]] = call i32 @llvm.experimental.get.vector.length.i32(i32 [[IV]], i32 4, i1 true) +; CHECK-NEXT: [[IV_NEXT]] = sub i32 [[IV]], [[X]] +; CHECK-NEXT: [[EC:%.*]] = icmp eq i32 [[IV_NEXT]], 0 +; CHECK-NEXT: br i1 [[EC]], label %[[EXIT:.*]], label %[[LOOP]] +; CHECK: [[EXIT]]: +; CHECK-NEXT: ret i32 [[X]] +; +entry: + br label %loop + +loop: + %iv = phi i32 [16, %entry], [%iv.next, %loop] + %x = call i32 @llvm.experimental.get.vector.length(i32 %iv, i32 4, i1 true) + %iv.next = sub i32 %iv, %x + %ec = icmp eq i32 %iv.next, 0 + br i1 %ec, label %exit, label %loop + +exit: + ret i32 %x +} + +define i1 @result_le_overflow() { +; CHECK-LABEL: define i1 @result_le_overflow() { +; CHECK-NEXT: [[X:%.*]] = call i32 @llvm.experimental.get.vector.length.i64(i64 4294967296, i32 4, i1 false) +; CHECK-NEXT: [[RES:%.*]] = icmp ule i32 [[X]], 3 +; CHECK-NEXT: ret i1 [[RES]] +; + %x = call i32 @llvm.experimental.get.vector.length(i64 u0x100000000, i32 4, i1 false) + %res = icmp ule i32 %x, 3 + ret i1 %res +} From b1620996f49611767d1950927835fa20284355d5 Mon Sep 17 00:00:00 2001 From: Timm Baeder Date: Mon, 1 Dec 2025 11:33:33 +0100 Subject: [PATCH 22/39] [clang][bytecode] Fix discarding ImplitiValueInitExprs (#170089) They don't have side-effects, so this should be fine. 
Fixes https://github.com/llvm/llvm-project/issues/170064 --- clang/lib/AST/ByteCode/Compiler.cpp | 3 +++ clang/test/AST/ByteCode/c.c | 13 +++++++++++++ 2 files changed, 16 insertions(+) diff --git a/clang/lib/AST/ByteCode/Compiler.cpp b/clang/lib/AST/ByteCode/Compiler.cpp index dd0b8e790d444..58e84ef70abb7 100644 --- a/clang/lib/AST/ByteCode/Compiler.cpp +++ b/clang/lib/AST/ByteCode/Compiler.cpp @@ -1705,6 +1705,9 @@ bool Compiler::VisitFixedPointUnaryOperator(const UnaryOperator *E) { template bool Compiler::VisitImplicitValueInitExpr( const ImplicitValueInitExpr *E) { + if (DiscardResult) + return true; + QualType QT = E->getType(); if (OptPrimType T = classify(QT)) diff --git a/clang/test/AST/ByteCode/c.c b/clang/test/AST/ByteCode/c.c index bffd557ff77a6..0d3d97b5eeab2 100644 --- a/clang/test/AST/ByteCode/c.c +++ b/clang/test/AST/ByteCode/c.c @@ -392,3 +392,16 @@ void plainComplex(void) { _Complex cd; // all-warning {{_Complex double}} cd = *(_Complex *)&(struct { double r, i; }){0.0, 0.0}; // all-warning {{_Complex double}} } + +/// This test results in an ImplicitValueInitExpr with DiscardResult set. +struct M{ + char c; +}; +typedef struct S64 { + struct M m; + char a[64]; +} I64; + +_Static_assert((((I64){}, 1)), ""); // all-warning {{left operand of comma operator has no effect}} \ + // pedantic-warning {{use of an empty initializer is a C23 extension}} \ + // pedantic-warning {{expression is not an integer constant expression; folding it to a constant is a GNU extension}} From d1500d12be60f21f9a80fdbfb3cfa24b8f20a0c9 Mon Sep 17 00:00:00 2001 From: Luke Lau Date: Mon, 1 Dec 2025 18:33:50 +0800 Subject: [PATCH 23/39] [SelectionDAG] Add SelectionDAG::getTypeSize. NFC (#169764) Similar to how getElementCount avoids the need to reason about fixed and scalable ElementCounts separately, this patch adds getTypeSize to do the same for TypeSize. It also goes through and replaces some of the manual uses of getVScale with getTypeSize/getElementCount where possible. 
--- llvm/include/llvm/CodeGen/SelectionDAG.h | 8 +-- .../SelectionDAG/LegalizeVectorTypes.cpp | 6 +- .../lib/CodeGen/SelectionDAG/SelectionDAG.cpp | 59 +++++++++---------- .../SelectionDAG/SelectionDAGBuilder.cpp | 14 +---- .../CodeGen/SelectionDAG/TargetLowering.cpp | 21 ++----- .../Target/AArch64/AArch64ISelLowering.cpp | 33 +++-------- llvm/lib/Target/RISCV/RISCVISelLowering.cpp | 8 +-- 7 files changed, 54 insertions(+), 95 deletions(-) diff --git a/llvm/include/llvm/CodeGen/SelectionDAG.h b/llvm/include/llvm/CodeGen/SelectionDAG.h index 501cbc947132e..21f622ea471e1 100644 --- a/llvm/include/llvm/CodeGen/SelectionDAG.h +++ b/llvm/include/llvm/CodeGen/SelectionDAG.h @@ -1185,11 +1185,11 @@ class SelectionDAG { SDValue getPOISON(EVT VT) { return getNode(ISD::POISON, SDLoc(), VT); } /// Return a node that represents the runtime scaling 'MulImm * RuntimeVL'. - LLVM_ABI SDValue getVScale(const SDLoc &DL, EVT VT, APInt MulImm, - bool ConstantFold = true); + LLVM_ABI SDValue getVScale(const SDLoc &DL, EVT VT, APInt MulImm); - LLVM_ABI SDValue getElementCount(const SDLoc &DL, EVT VT, ElementCount EC, - bool ConstantFold = true); + LLVM_ABI SDValue getElementCount(const SDLoc &DL, EVT VT, ElementCount EC); + + LLVM_ABI SDValue getTypeSize(const SDLoc &DL, EVT VT, TypeSize TS); /// Return a GLOBAL_OFFSET_TABLE node. This does not have a useful SDLoc. 
SDValue getGLOBAL_OFFSET_TABLE(EVT VT) { diff --git a/llvm/lib/CodeGen/SelectionDAG/LegalizeVectorTypes.cpp b/llvm/lib/CodeGen/SelectionDAG/LegalizeVectorTypes.cpp index 4274e951446b8..53b7aede7b4a5 100644 --- a/llvm/lib/CodeGen/SelectionDAG/LegalizeVectorTypes.cpp +++ b/llvm/lib/CodeGen/SelectionDAG/LegalizeVectorTypes.cpp @@ -1702,10 +1702,8 @@ void DAGTypeLegalizer::SplitVecRes_LOOP_DEPENDENCE_MASK(SDNode *N, SDValue &Lo, Lo = DAG.getNode(N->getOpcode(), DL, LoVT, PtrA, PtrB, N->getOperand(2)); unsigned EltSize = N->getConstantOperandVal(2); - unsigned Offset = EltSize * HiVT.getVectorMinNumElements(); - SDValue Addend = HiVT.isScalableVT() - ? DAG.getVScale(DL, MVT::i64, APInt(64, Offset)) - : DAG.getConstant(Offset, DL, MVT::i64); + ElementCount Offset = HiVT.getVectorElementCount() * EltSize; + SDValue Addend = DAG.getElementCount(DL, MVT::i64, Offset); PtrA = DAG.getNode(ISD::ADD, DL, MVT::i64, PtrA, Addend); Hi = DAG.getNode(N->getOpcode(), DL, HiVT, PtrA, PtrB, N->getOperand(2)); diff --git a/llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp b/llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp index 06735708d5369..c9519ce1610b2 100644 --- a/llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp +++ b/llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp @@ -2098,32 +2098,43 @@ SDValue SelectionDAG::getCondCode(ISD::CondCode Cond) { return SDValue(CondCodeNodes[Cond], 0); } -SDValue SelectionDAG::getVScale(const SDLoc &DL, EVT VT, APInt MulImm, - bool ConstantFold) { +SDValue SelectionDAG::getVScale(const SDLoc &DL, EVT VT, APInt MulImm) { assert(MulImm.getBitWidth() == VT.getSizeInBits() && "APInt size does not match type size!"); if (MulImm == 0) return getConstant(0, DL, VT); - if (ConstantFold) { - const MachineFunction &MF = getMachineFunction(); - const Function &F = MF.getFunction(); - ConstantRange CR = getVScaleRange(&F, 64); - if (const APInt *C = CR.getSingleElement()) - return getConstant(MulImm * C->getZExtValue(), DL, VT); - } + const MachineFunction &MF = 
getMachineFunction(); + const Function &F = MF.getFunction(); + ConstantRange CR = getVScaleRange(&F, 64); + if (const APInt *C = CR.getSingleElement()) + return getConstant(MulImm * C->getZExtValue(), DL, VT); return getNode(ISD::VSCALE, DL, VT, getConstant(MulImm, DL, VT)); } -SDValue SelectionDAG::getElementCount(const SDLoc &DL, EVT VT, ElementCount EC, - bool ConstantFold) { - if (EC.isScalable()) - return getVScale(DL, VT, - APInt(VT.getSizeInBits(), EC.getKnownMinValue())); +/// \returns a value of type \p VT that represents the runtime value of \p +/// Quantity, i.e. scaled by vscale if it's scalable, or a fixed constant +/// otherwise. Quantity should be a FixedOrScalableQuantity, i.e. ElementCount +/// or TypeSize. +template +static SDValue getFixedOrScalableQuantity(SelectionDAG &DAG, const SDLoc &DL, + EVT VT, Ty Quantity) { + if (Quantity.isScalable()) + return DAG.getVScale( + DL, VT, APInt(VT.getSizeInBits(), Quantity.getKnownMinValue())); + + return DAG.getConstant(Quantity.getKnownMinValue(), DL, VT); +} + +SDValue SelectionDAG::getElementCount(const SDLoc &DL, EVT VT, + ElementCount EC) { + return getFixedOrScalableQuantity(*this, DL, VT, EC); +} - return getConstant(EC.getKnownMinValue(), DL, VT); +SDValue SelectionDAG::getTypeSize(const SDLoc &DL, EVT VT, TypeSize TS) { + return getFixedOrScalableQuantity(*this, DL, VT, TS); } SDValue SelectionDAG::getStepVector(const SDLoc &DL, EVT ResVT) { @@ -8500,16 +8511,7 @@ static SDValue getMemsetStringVal(EVT VT, const SDLoc &dl, SelectionDAG &DAG, SDValue SelectionDAG::getMemBasePlusOffset(SDValue Base, TypeSize Offset, const SDLoc &DL, const SDNodeFlags Flags) { - EVT VT = Base.getValueType(); - SDValue Index; - - if (Offset.isScalable()) - Index = getVScale(DL, Base.getValueType(), - APInt(Base.getValueSizeInBits().getFixedValue(), - Offset.getKnownMinValue())); - else - Index = getConstant(Offset.getFixedValue(), DL, VT); - + SDValue Index = getTypeSize(DL, Base.getValueType(), Offset); return 
getMemBasePlusOffset(Base, Index, DL, Flags); } @@ -13585,11 +13587,8 @@ std::pair SelectionDAG::SplitEVL(SDValue N, EVT VecVT, EVT VT = N.getValueType(); assert(VecVT.getVectorElementCount().isKnownEven() && "Expecting the mask to be an evenly-sized vector"); - unsigned HalfMinNumElts = VecVT.getVectorMinNumElements() / 2; - SDValue HalfNumElts = - VecVT.isFixedLengthVector() - ? getConstant(HalfMinNumElts, DL, VT) - : getVScale(DL, VT, APInt(VT.getScalarSizeInBits(), HalfMinNumElts)); + SDValue HalfNumElts = getElementCount( + DL, VT, VecVT.getVectorElementCount().divideCoefficientBy(2)); SDValue Lo = getNode(ISD::UMIN, DL, VT, N, HalfNumElts); SDValue Hi = getNode(ISD::USUBSAT, DL, VT, N, HalfNumElts); return std::make_pair(Lo, Hi); diff --git a/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp b/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp index 53d73ad618bd1..2caf847370383 100644 --- a/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp +++ b/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp @@ -4584,17 +4584,9 @@ void SelectionDAGBuilder::visitAlloca(const AllocaInst &I) { if (AllocSize.getValueType() != IntPtr) AllocSize = DAG.getZExtOrTrunc(AllocSize, dl, IntPtr); - if (TySize.isScalable()) - AllocSize = DAG.getNode(ISD::MUL, dl, IntPtr, AllocSize, - DAG.getVScale(dl, IntPtr, - APInt(IntPtr.getScalarSizeInBits(), - TySize.getKnownMinValue()))); - else { - SDValue TySizeValue = - DAG.getConstant(TySize.getFixedValue(), dl, MVT::getIntegerVT(64)); - AllocSize = DAG.getNode(ISD::MUL, dl, IntPtr, AllocSize, - DAG.getZExtOrTrunc(TySizeValue, dl, IntPtr)); - } + AllocSize = DAG.getNode( + ISD::MUL, dl, IntPtr, AllocSize, + DAG.getZExtOrTrunc(DAG.getTypeSize(dl, MVT::i64, TySize), dl, IntPtr)); // Handle alignment. If the requested alignment is less than or equal to // the stack alignment, ignore it. 
If the size is greater than or equal to diff --git a/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp b/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp index 521d8f07434e6..783ec4b0bd211 100644 --- a/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp +++ b/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp @@ -10628,12 +10628,8 @@ TargetLowering::IncrementMemoryAddress(SDValue Addr, SDValue Mask, AddrVT); Increment = DAG.getZExtOrTrunc(Increment, DL, AddrVT); Increment = DAG.getNode(ISD::MUL, DL, AddrVT, Increment, Scale); - } else if (DataVT.isScalableVector()) { - Increment = DAG.getVScale(DL, AddrVT, - APInt(AddrVT.getFixedSizeInBits(), - DataVT.getStoreSize().getKnownMinValue())); } else - Increment = DAG.getConstant(DataVT.getStoreSize(), DL, AddrVT); + Increment = DAG.getTypeSize(DL, AddrVT, DataVT.getStoreSize()); return DAG.getNode(ISD::ADD, DL, AddrVT, Addr, Increment); } @@ -11926,10 +11922,8 @@ SDValue TargetLowering::expandVectorSplice(SDNode *Node, // Store the lo part of CONCAT_VECTORS(V1, V2) SDValue StoreV1 = DAG.getStore(DAG.getEntryNode(), DL, V1, StackPtr, PtrInfo); // Store the hi part of CONCAT_VECTORS(V1, V2) - SDValue OffsetToV2 = DAG.getVScale( - DL, PtrVT, - APInt(PtrVT.getFixedSizeInBits(), VT.getStoreSize().getKnownMinValue())); - SDValue StackPtr2 = DAG.getNode(ISD::ADD, DL, PtrVT, StackPtr, OffsetToV2); + SDValue VTBytes = DAG.getTypeSize(DL, PtrVT, VT.getStoreSize()); + SDValue StackPtr2 = DAG.getNode(ISD::ADD, DL, PtrVT, StackPtr, VTBytes); SDValue StoreV2 = DAG.getStore(StoreV1, DL, V2, StackPtr2, PtrInfo); if (Imm >= 0) { @@ -11948,13 +11942,8 @@ SDValue TargetLowering::expandVectorSplice(SDNode *Node, SDValue TrailingBytes = DAG.getConstant(TrailingElts * EltByteSize, DL, PtrVT); - if (TrailingElts > VT.getVectorMinNumElements()) { - SDValue VLBytes = - DAG.getVScale(DL, PtrVT, - APInt(PtrVT.getFixedSizeInBits(), - VT.getStoreSize().getKnownMinValue())); - TrailingBytes = DAG.getNode(ISD::UMIN, DL, PtrVT, TrailingBytes, VLBytes); - } 
+ if (TrailingElts > VT.getVectorMinNumElements()) + TrailingBytes = DAG.getNode(ISD::UMIN, DL, PtrVT, TrailingBytes, VTBytes); // Calculate the start address of the spliced result. StackPtr2 = DAG.getNode(ISD::SUB, DL, PtrVT, StackPtr2, TrailingBytes); diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp index 5ba8f05b09012..b4f47d249885d 100644 --- a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp +++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp @@ -8647,7 +8647,7 @@ SDValue AArch64TargetLowering::LowerFormalArguments( Subtarget->isWindowsArm64EC()) && "Indirect arguments should be scalable on most subtargets"); - uint64_t PartSize = VA.getValVT().getStoreSize().getKnownMinValue(); + TypeSize PartSize = VA.getValVT().getStoreSize(); unsigned NumParts = 1; if (Ins[i].Flags.isInConsecutiveRegs()) { while (!Ins[i + NumParts - 1].Flags.isInConsecutiveRegsLast()) @@ -8664,16 +8664,8 @@ SDValue AArch64TargetLowering::LowerFormalArguments( InVals.push_back(ArgValue); NumParts--; if (NumParts > 0) { - SDValue BytesIncrement; - if (PartLoad.isScalableVector()) { - BytesIncrement = DAG.getVScale( - DL, Ptr.getValueType(), - APInt(Ptr.getValueSizeInBits().getFixedValue(), PartSize)); - } else { - BytesIncrement = DAG.getConstant( - APInt(Ptr.getValueSizeInBits().getFixedValue(), PartSize), DL, - Ptr.getValueType()); - } + SDValue BytesIncrement = + DAG.getTypeSize(DL, Ptr.getValueType(), PartSize); Ptr = DAG.getNode(ISD::ADD, DL, Ptr.getValueType(), Ptr, BytesIncrement, SDNodeFlags::NoUnsignedWrap); ExtraArgLocs++; @@ -9880,8 +9872,8 @@ AArch64TargetLowering::LowerCall(CallLoweringInfo &CLI, assert((isScalable || Subtarget->isWindowsArm64EC()) && "Indirect arguments should be scalable on most subtargets"); - uint64_t StoreSize = VA.getValVT().getStoreSize().getKnownMinValue(); - uint64_t PartSize = StoreSize; + TypeSize StoreSize = VA.getValVT().getStoreSize(); + TypeSize PartSize = StoreSize; unsigned NumParts 
= 1; if (Outs[i].Flags.isInConsecutiveRegs()) { while (!Outs[i + NumParts - 1].Flags.isInConsecutiveRegsLast()) @@ -9892,7 +9884,8 @@ AArch64TargetLowering::LowerCall(CallLoweringInfo &CLI, Type *Ty = EVT(VA.getValVT()).getTypeForEVT(*DAG.getContext()); Align Alignment = DAG.getDataLayout().getPrefTypeAlign(Ty); MachineFrameInfo &MFI = MF.getFrameInfo(); - int FI = MFI.CreateStackObject(StoreSize, Alignment, false); + int FI = + MFI.CreateStackObject(StoreSize.getKnownMinValue(), Alignment, false); if (isScalable) { bool IsPred = VA.getValVT() == MVT::aarch64svcount || VA.getValVT().getVectorElementType() == MVT::i1; @@ -9913,16 +9906,8 @@ AArch64TargetLowering::LowerCall(CallLoweringInfo &CLI, NumParts--; if (NumParts > 0) { - SDValue BytesIncrement; - if (isScalable) { - BytesIncrement = DAG.getVScale( - DL, Ptr.getValueType(), - APInt(Ptr.getValueSizeInBits().getFixedValue(), PartSize)); - } else { - BytesIncrement = DAG.getConstant( - APInt(Ptr.getValueSizeInBits().getFixedValue(), PartSize), DL, - Ptr.getValueType()); - } + SDValue BytesIncrement = + DAG.getTypeSize(DL, Ptr.getValueType(), PartSize); MPI = MachinePointerInfo(MPI.getAddrSpace()); Ptr = DAG.getNode(ISD::ADD, DL, Ptr.getValueType(), Ptr, BytesIncrement, SDNodeFlags::NoUnsignedWrap); diff --git a/llvm/lib/Target/RISCV/RISCVISelLowering.cpp b/llvm/lib/Target/RISCV/RISCVISelLowering.cpp index a6212f5cc84be..afbbc0dbeb7ad 100644 --- a/llvm/lib/Target/RISCV/RISCVISelLowering.cpp +++ b/llvm/lib/Target/RISCV/RISCVISelLowering.cpp @@ -12783,10 +12783,7 @@ SDValue RISCVTargetLowering::lowerVECTOR_INTERLEAVE(SDValue Op, SmallVector Loads(Factor); - SDValue Increment = - DAG.getVScale(DL, PtrVT, - APInt(PtrVT.getFixedSizeInBits(), - VecVT.getStoreSize().getKnownMinValue())); + SDValue Increment = DAG.getTypeSize(DL, PtrVT, VecVT.getStoreSize()); for (unsigned i = 0; i != Factor; ++i) { if (i != 0) StackPtr = DAG.getNode(ISD::ADD, DL, PtrVT, StackPtr, Increment); @@ -14184,9 +14181,8 @@ 
RISCVTargetLowering::lowerVPReverseExperimental(SDValue Op, // Slide off any elements from past EVL that were reversed into the low // elements. - unsigned MinElts = GatherVT.getVectorMinNumElements(); SDValue VLMax = - DAG.getVScale(DL, XLenVT, APInt(XLenVT.getSizeInBits(), MinElts)); + DAG.getElementCount(DL, XLenVT, GatherVT.getVectorElementCount()); SDValue Diff = DAG.getNode(ISD::SUB, DL, XLenVT, VLMax, EVL); Result = getVSlidedown(DAG, Subtarget, DL, GatherVT, From b7721c55fc09616d186bbe1f9e3e4b9df8fb4009 Mon Sep 17 00:00:00 2001 From: Jim Lin Date: Mon, 1 Dec 2025 13:32:45 +0800 Subject: [PATCH 24/39] [RISCV] Remove the duplicate for RV32/RV64 in zicond-fp-select-zfinx.ll. NFC. --- .../CodeGen/RISCV/zicond-fp-select-zfinx.ll | 289 ++++++------------ 1 file changed, 97 insertions(+), 192 deletions(-) diff --git a/llvm/test/CodeGen/RISCV/zicond-fp-select-zfinx.ll b/llvm/test/CodeGen/RISCV/zicond-fp-select-zfinx.ll index b505c84166eb1..0e8a0c704207d 100644 --- a/llvm/test/CodeGen/RISCV/zicond-fp-select-zfinx.ll +++ b/llvm/test/CodeGen/RISCV/zicond-fp-select-zfinx.ll @@ -1,19 +1,19 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 6 ; Zicond with zfinx(implies by zdinx) -; RUN: llc -mtriple=riscv64 -mattr=+zdinx,+zicond -verify-machineinstrs < %s | FileCheck %s --check-prefix=RV64ZDINX_ZICOND -; RUN: llc -mtriple=riscv64 -mattr=+zdinx -verify-machineinstrs < %s | FileCheck %s --check-prefix=RV64ZDINX_NOZICOND +; RUN: llc -mtriple=riscv64 -mattr=+zdinx,+zicond -verify-machineinstrs < %s | FileCheck %s --check-prefixes=ZDINX_ZICOND,RV64ZDINX_ZICOND +; RUN: llc -mtriple=riscv64 -mattr=+zdinx -verify-machineinstrs < %s | FileCheck %s --check-prefixes=ZDINX_NOZICOND,RV64ZDINX_NOZICOND ; Zicond with zfinx(implies by zhinx) -; RUN: llc -mtriple=riscv64 -mattr=+zhinx,+zicond -verify-machineinstrs < %s | FileCheck %s --check-prefix=RV64ZHINX_ZICOND +; RUN: llc -mtriple=riscv64 -mattr=+zhinx,+zicond -verify-machineinstrs 
< %s | FileCheck %s --check-prefixes=ZHINX_ZICOND,RV64ZHINX_ZICOND ; Baseline with classic FP registers (no *inx); zicond select should NOT trigger ; RUN: llc -mtriple=riscv64 -mattr=+f,+d -verify-machineinstrs < %s | FileCheck %s --check-prefix=RV64FD ; Check same optimize work on 32bit machine -; RUN: llc -mtriple=riscv32 -mattr=+zfinx,+zicond -verify-machineinstrs < %s | FileCheck %s --check-prefix=RV32ZFINX_ZICOND +; RUN: llc -mtriple=riscv32 -mattr=+zfinx,+zicond -verify-machineinstrs < %s | FileCheck %s --check-prefixes=ZHINX_ZICOND,RV32ZFINX_ZICOND ; RUN: llc -mtriple=riscv32 -mattr=+zfinx -verify-machineinstrs < %s | FileCheck %s --check-prefix=RV32ZFINX_NOZICOND -; RUN: llc -mtriple=riscv32 -mattr=+zdinx,+zicond -verify-machineinstrs < %s | FileCheck %s --check-prefix=RV32ZDINX_ZICOND -; RUN: llc -mtriple=riscv32 -mattr=+zdinx -verify-machineinstrs < %s | FileCheck %s --check-prefix=RV32ZDINX_NOZICOND +; RUN: llc -mtriple=riscv32 -mattr=+zdinx,+zicond -verify-machineinstrs < %s | FileCheck %s --check-prefixes=ZDINX_ZICOND,RV32ZDINX_ZICOND +; RUN: llc -mtriple=riscv32 -mattr=+zdinx -verify-machineinstrs < %s | FileCheck %s --check-prefixes=ZDINX_NOZICOND,RV32ZDINX_NOZICOND ; This test checks that floating-point SELECT is lowered through integer ; SELECT (and thus to Zicond czero.* sequence) when FP values live in GPRs @@ -25,37 +25,37 @@ ; ----------------------------------------------------------------------------- define float @select_f32_i1(i1 %cond, float %t, float %f) nounwind { -; RV64ZDINX_ZICOND-LABEL: select_f32_i1: -; RV64ZDINX_ZICOND: # %bb.0: # %entry -; RV64ZDINX_ZICOND-NEXT: # kill: def $x12_w killed $x12_w def $x12 -; RV64ZDINX_ZICOND-NEXT: # kill: def $x11_w killed $x11_w def $x11 -; RV64ZDINX_ZICOND-NEXT: andi a0, a0, 1 -; RV64ZDINX_ZICOND-NEXT: czero.nez a2, a2, a0 -; RV64ZDINX_ZICOND-NEXT: czero.eqz a0, a1, a0 -; RV64ZDINX_ZICOND-NEXT: or a0, a0, a2 -; RV64ZDINX_ZICOND-NEXT: # kill: def $x10_w killed $x10_w killed $x10 -; 
RV64ZDINX_ZICOND-NEXT: ret -; -; RV64ZDINX_NOZICOND-LABEL: select_f32_i1: -; RV64ZDINX_NOZICOND: # %bb.0: # %entry -; RV64ZDINX_NOZICOND-NEXT: andi a3, a0, 1 -; RV64ZDINX_NOZICOND-NEXT: mv a0, a1 -; RV64ZDINX_NOZICOND-NEXT: bnez a3, .LBB0_2 -; RV64ZDINX_NOZICOND-NEXT: # %bb.1: # %entry -; RV64ZDINX_NOZICOND-NEXT: mv a0, a2 -; RV64ZDINX_NOZICOND-NEXT: .LBB0_2: # %entry -; RV64ZDINX_NOZICOND-NEXT: ret -; -; RV64ZHINX_ZICOND-LABEL: select_f32_i1: -; RV64ZHINX_ZICOND: # %bb.0: # %entry -; RV64ZHINX_ZICOND-NEXT: # kill: def $x12_w killed $x12_w def $x12 -; RV64ZHINX_ZICOND-NEXT: # kill: def $x11_w killed $x11_w def $x11 -; RV64ZHINX_ZICOND-NEXT: andi a0, a0, 1 -; RV64ZHINX_ZICOND-NEXT: czero.nez a2, a2, a0 -; RV64ZHINX_ZICOND-NEXT: czero.eqz a0, a1, a0 -; RV64ZHINX_ZICOND-NEXT: or a0, a0, a2 -; RV64ZHINX_ZICOND-NEXT: # kill: def $x10_w killed $x10_w killed $x10 -; RV64ZHINX_ZICOND-NEXT: ret +; ZDINX_ZICOND-LABEL: select_f32_i1: +; ZDINX_ZICOND: # %bb.0: # %entry +; ZDINX_ZICOND-NEXT: # kill: def $x12_w killed $x12_w def $x12 +; ZDINX_ZICOND-NEXT: # kill: def $x11_w killed $x11_w def $x11 +; ZDINX_ZICOND-NEXT: andi a0, a0, 1 +; ZDINX_ZICOND-NEXT: czero.nez a2, a2, a0 +; ZDINX_ZICOND-NEXT: czero.eqz a0, a1, a0 +; ZDINX_ZICOND-NEXT: or a0, a0, a2 +; ZDINX_ZICOND-NEXT: # kill: def $x10_w killed $x10_w killed $x10 +; ZDINX_ZICOND-NEXT: ret +; +; ZDINX_NOZICOND-LABEL: select_f32_i1: +; ZDINX_NOZICOND: # %bb.0: # %entry +; ZDINX_NOZICOND-NEXT: andi a3, a0, 1 +; ZDINX_NOZICOND-NEXT: mv a0, a1 +; ZDINX_NOZICOND-NEXT: bnez a3, .LBB0_2 +; ZDINX_NOZICOND-NEXT: # %bb.1: # %entry +; ZDINX_NOZICOND-NEXT: mv a0, a2 +; ZDINX_NOZICOND-NEXT: .LBB0_2: # %entry +; ZDINX_NOZICOND-NEXT: ret +; +; ZHINX_ZICOND-LABEL: select_f32_i1: +; ZHINX_ZICOND: # %bb.0: # %entry +; ZHINX_ZICOND-NEXT: # kill: def $x12_w killed $x12_w def $x12 +; ZHINX_ZICOND-NEXT: # kill: def $x11_w killed $x11_w def $x11 +; ZHINX_ZICOND-NEXT: andi a0, a0, 1 +; ZHINX_ZICOND-NEXT: czero.nez a2, a2, a0 +; ZHINX_ZICOND-NEXT: 
czero.eqz a0, a1, a0 +; ZHINX_ZICOND-NEXT: or a0, a0, a2 +; ZHINX_ZICOND-NEXT: # kill: def $x10_w killed $x10_w killed $x10 +; ZHINX_ZICOND-NEXT: ret ; ; RV64FD-LABEL: select_f32_i1: ; RV64FD: # %bb.0: # %entry @@ -66,17 +66,6 @@ define float @select_f32_i1(i1 %cond, float %t, float %f) nounwind { ; RV64FD-NEXT: .LBB0_2: # %entry ; RV64FD-NEXT: ret ; -; RV32ZFINX_ZICOND-LABEL: select_f32_i1: -; RV32ZFINX_ZICOND: # %bb.0: # %entry -; RV32ZFINX_ZICOND-NEXT: # kill: def $x12_w killed $x12_w def $x12 -; RV32ZFINX_ZICOND-NEXT: # kill: def $x11_w killed $x11_w def $x11 -; RV32ZFINX_ZICOND-NEXT: andi a0, a0, 1 -; RV32ZFINX_ZICOND-NEXT: czero.nez a2, a2, a0 -; RV32ZFINX_ZICOND-NEXT: czero.eqz a0, a1, a0 -; RV32ZFINX_ZICOND-NEXT: or a0, a0, a2 -; RV32ZFINX_ZICOND-NEXT: # kill: def $x10_w killed $x10_w killed $x10 -; RV32ZFINX_ZICOND-NEXT: ret -; ; RV32ZFINX_NOZICOND-LABEL: select_f32_i1: ; RV32ZFINX_NOZICOND: # %bb.0: # %entry ; RV32ZFINX_NOZICOND-NEXT: andi a3, a0, 1 @@ -86,27 +75,6 @@ define float @select_f32_i1(i1 %cond, float %t, float %f) nounwind { ; RV32ZFINX_NOZICOND-NEXT: mv a0, a2 ; RV32ZFINX_NOZICOND-NEXT: .LBB0_2: # %entry ; RV32ZFINX_NOZICOND-NEXT: ret -; -; RV32ZDINX_ZICOND-LABEL: select_f32_i1: -; RV32ZDINX_ZICOND: # %bb.0: # %entry -; RV32ZDINX_ZICOND-NEXT: # kill: def $x12_w killed $x12_w def $x12 -; RV32ZDINX_ZICOND-NEXT: # kill: def $x11_w killed $x11_w def $x11 -; RV32ZDINX_ZICOND-NEXT: andi a0, a0, 1 -; RV32ZDINX_ZICOND-NEXT: czero.nez a2, a2, a0 -; RV32ZDINX_ZICOND-NEXT: czero.eqz a0, a1, a0 -; RV32ZDINX_ZICOND-NEXT: or a0, a0, a2 -; RV32ZDINX_ZICOND-NEXT: # kill: def $x10_w killed $x10_w killed $x10 -; RV32ZDINX_ZICOND-NEXT: ret -; -; RV32ZDINX_NOZICOND-LABEL: select_f32_i1: -; RV32ZDINX_NOZICOND: # %bb.0: # %entry -; RV32ZDINX_NOZICOND-NEXT: andi a3, a0, 1 -; RV32ZDINX_NOZICOND-NEXT: mv a0, a1 -; RV32ZDINX_NOZICOND-NEXT: bnez a3, .LBB0_2 -; RV32ZDINX_NOZICOND-NEXT: # %bb.1: # %entry -; RV32ZDINX_NOZICOND-NEXT: mv a0, a2 -; RV32ZDINX_NOZICOND-NEXT: 
.LBB0_2: # %entry -; RV32ZDINX_NOZICOND-NEXT: ret entry: %sel = select i1 %cond, float %t, float %f ret float %sel @@ -353,32 +321,32 @@ entry: ; ----------------------------------------------------------------------------- define dso_local noundef half @select_half_i1(i1 %cond, half %a, half %b) nounwind { -; RV64ZDINX_ZICOND-LABEL: select_half_i1: -; RV64ZDINX_ZICOND: # %bb.0: # %entry -; RV64ZDINX_ZICOND-NEXT: # kill: def $x12_w killed $x12_w def $x12 -; RV64ZDINX_ZICOND-NEXT: # kill: def $x11_w killed $x11_w def $x11 -; RV64ZDINX_ZICOND-NEXT: andi a0, a0, 1 -; RV64ZDINX_ZICOND-NEXT: czero.nez a2, a2, a0 -; RV64ZDINX_ZICOND-NEXT: czero.eqz a0, a1, a0 -; RV64ZDINX_ZICOND-NEXT: or a0, a0, a2 -; RV64ZDINX_ZICOND-NEXT: lui a1, 1048560 -; RV64ZDINX_ZICOND-NEXT: or a0, a0, a1 -; RV64ZDINX_ZICOND-NEXT: # kill: def $x10_w killed $x10_w killed $x10 -; RV64ZDINX_ZICOND-NEXT: ret -; -; RV64ZDINX_NOZICOND-LABEL: select_half_i1: -; RV64ZDINX_NOZICOND: # %bb.0: # %entry -; RV64ZDINX_NOZICOND-NEXT: # kill: def $x12_w killed $x12_w def $x12 -; RV64ZDINX_NOZICOND-NEXT: andi a0, a0, 1 -; RV64ZDINX_NOZICOND-NEXT: # kill: def $x11_w killed $x11_w def $x11 -; RV64ZDINX_NOZICOND-NEXT: bnez a0, .LBB3_2 -; RV64ZDINX_NOZICOND-NEXT: # %bb.1: # %entry -; RV64ZDINX_NOZICOND-NEXT: mv a1, a2 -; RV64ZDINX_NOZICOND-NEXT: .LBB3_2: # %entry -; RV64ZDINX_NOZICOND-NEXT: lui a0, 1048560 -; RV64ZDINX_NOZICOND-NEXT: or a0, a1, a0 -; RV64ZDINX_NOZICOND-NEXT: # kill: def $x10_w killed $x10_w killed $x10 -; RV64ZDINX_NOZICOND-NEXT: ret +; ZDINX_ZICOND-LABEL: select_half_i1: +; ZDINX_ZICOND: # %bb.0: # %entry +; ZDINX_ZICOND-NEXT: # kill: def $x12_w killed $x12_w def $x12 +; ZDINX_ZICOND-NEXT: # kill: def $x11_w killed $x11_w def $x11 +; ZDINX_ZICOND-NEXT: andi a0, a0, 1 +; ZDINX_ZICOND-NEXT: czero.nez a2, a2, a0 +; ZDINX_ZICOND-NEXT: czero.eqz a0, a1, a0 +; ZDINX_ZICOND-NEXT: or a0, a0, a2 +; ZDINX_ZICOND-NEXT: lui a1, 1048560 +; ZDINX_ZICOND-NEXT: or a0, a0, a1 +; ZDINX_ZICOND-NEXT: # kill: def $x10_w 
killed $x10_w killed $x10 +; ZDINX_ZICOND-NEXT: ret +; +; ZDINX_NOZICOND-LABEL: select_half_i1: +; ZDINX_NOZICOND: # %bb.0: # %entry +; ZDINX_NOZICOND-NEXT: # kill: def $x12_w killed $x12_w def $x12 +; ZDINX_NOZICOND-NEXT: andi a0, a0, 1 +; ZDINX_NOZICOND-NEXT: # kill: def $x11_w killed $x11_w def $x11 +; ZDINX_NOZICOND-NEXT: bnez a0, .LBB3_2 +; ZDINX_NOZICOND-NEXT: # %bb.1: # %entry +; ZDINX_NOZICOND-NEXT: mv a1, a2 +; ZDINX_NOZICOND-NEXT: .LBB3_2: # %entry +; ZDINX_NOZICOND-NEXT: lui a0, 1048560 +; ZDINX_NOZICOND-NEXT: or a0, a1, a0 +; ZDINX_NOZICOND-NEXT: # kill: def $x10_w killed $x10_w killed $x10 +; ZDINX_NOZICOND-NEXT: ret ; ; RV64ZHINX_ZICOND-LABEL: select_half_i1: ; RV64ZHINX_ZICOND: # %bb.0: # %entry @@ -432,33 +400,6 @@ define dso_local noundef half @select_half_i1(i1 %cond, half %a, half %b) nounwi ; RV32ZFINX_NOZICOND-NEXT: or a0, a1, a0 ; RV32ZFINX_NOZICOND-NEXT: # kill: def $x10_w killed $x10_w killed $x10 ; RV32ZFINX_NOZICOND-NEXT: ret -; -; RV32ZDINX_ZICOND-LABEL: select_half_i1: -; RV32ZDINX_ZICOND: # %bb.0: # %entry -; RV32ZDINX_ZICOND-NEXT: # kill: def $x12_w killed $x12_w def $x12 -; RV32ZDINX_ZICOND-NEXT: # kill: def $x11_w killed $x11_w def $x11 -; RV32ZDINX_ZICOND-NEXT: andi a0, a0, 1 -; RV32ZDINX_ZICOND-NEXT: czero.nez a2, a2, a0 -; RV32ZDINX_ZICOND-NEXT: czero.eqz a0, a1, a0 -; RV32ZDINX_ZICOND-NEXT: or a0, a0, a2 -; RV32ZDINX_ZICOND-NEXT: lui a1, 1048560 -; RV32ZDINX_ZICOND-NEXT: or a0, a0, a1 -; RV32ZDINX_ZICOND-NEXT: # kill: def $x10_w killed $x10_w killed $x10 -; RV32ZDINX_ZICOND-NEXT: ret -; -; RV32ZDINX_NOZICOND-LABEL: select_half_i1: -; RV32ZDINX_NOZICOND: # %bb.0: # %entry -; RV32ZDINX_NOZICOND-NEXT: # kill: def $x12_w killed $x12_w def $x12 -; RV32ZDINX_NOZICOND-NEXT: andi a0, a0, 1 -; RV32ZDINX_NOZICOND-NEXT: # kill: def $x11_w killed $x11_w def $x11 -; RV32ZDINX_NOZICOND-NEXT: bnez a0, .LBB3_2 -; RV32ZDINX_NOZICOND-NEXT: # %bb.1: # %entry -; RV32ZDINX_NOZICOND-NEXT: mv a1, a2 -; RV32ZDINX_NOZICOND-NEXT: .LBB3_2: # %entry -; 
RV32ZDINX_NOZICOND-NEXT: lui a0, 1048560 -; RV32ZDINX_NOZICOND-NEXT: or a0, a1, a0 -; RV32ZDINX_NOZICOND-NEXT: # kill: def $x10_w killed $x10_w killed $x10 -; RV32ZDINX_NOZICOND-NEXT: ret entry: %sel = select i1 %cond, half %a, half %b ret half %sel @@ -468,31 +409,31 @@ entry: ; Test select with i1 condition and zero ret val (cond ? a : 0), Zfinx ; ----------------------------------------------------------------------------- define dso_local noundef float @select_i1_f32_0(i1 %cond, float %t) nounwind { -; RV64ZDINX_ZICOND-LABEL: select_i1_f32_0: -; RV64ZDINX_ZICOND: # %bb.0: # %entry -; RV64ZDINX_ZICOND-NEXT: # kill: def $x11_w killed $x11_w def $x11 -; RV64ZDINX_ZICOND-NEXT: andi a0, a0, 1 -; RV64ZDINX_ZICOND-NEXT: czero.eqz a0, a1, a0 -; RV64ZDINX_ZICOND-NEXT: # kill: def $x10_w killed $x10_w killed $x10 -; RV64ZDINX_ZICOND-NEXT: ret -; -; RV64ZDINX_NOZICOND-LABEL: select_i1_f32_0: -; RV64ZDINX_NOZICOND: # %bb.0: # %entry -; RV64ZDINX_NOZICOND-NEXT: andi a2, a0, 1 -; RV64ZDINX_NOZICOND-NEXT: mv a0, a1 -; RV64ZDINX_NOZICOND-NEXT: bnez a2, .LBB4_2 -; RV64ZDINX_NOZICOND-NEXT: # %bb.1: # %entry -; RV64ZDINX_NOZICOND-NEXT: li a0, 0 -; RV64ZDINX_NOZICOND-NEXT: .LBB4_2: # %entry -; RV64ZDINX_NOZICOND-NEXT: ret -; -; RV64ZHINX_ZICOND-LABEL: select_i1_f32_0: -; RV64ZHINX_ZICOND: # %bb.0: # %entry -; RV64ZHINX_ZICOND-NEXT: # kill: def $x11_w killed $x11_w def $x11 -; RV64ZHINX_ZICOND-NEXT: andi a0, a0, 1 -; RV64ZHINX_ZICOND-NEXT: czero.eqz a0, a1, a0 -; RV64ZHINX_ZICOND-NEXT: # kill: def $x10_w killed $x10_w killed $x10 -; RV64ZHINX_ZICOND-NEXT: ret +; ZDINX_ZICOND-LABEL: select_i1_f32_0: +; ZDINX_ZICOND: # %bb.0: # %entry +; ZDINX_ZICOND-NEXT: # kill: def $x11_w killed $x11_w def $x11 +; ZDINX_ZICOND-NEXT: andi a0, a0, 1 +; ZDINX_ZICOND-NEXT: czero.eqz a0, a1, a0 +; ZDINX_ZICOND-NEXT: # kill: def $x10_w killed $x10_w killed $x10 +; ZDINX_ZICOND-NEXT: ret +; +; ZDINX_NOZICOND-LABEL: select_i1_f32_0: +; ZDINX_NOZICOND: # %bb.0: # %entry +; ZDINX_NOZICOND-NEXT: andi a2, a0, 
1 +; ZDINX_NOZICOND-NEXT: mv a0, a1 +; ZDINX_NOZICOND-NEXT: bnez a2, .LBB4_2 +; ZDINX_NOZICOND-NEXT: # %bb.1: # %entry +; ZDINX_NOZICOND-NEXT: li a0, 0 +; ZDINX_NOZICOND-NEXT: .LBB4_2: # %entry +; ZDINX_NOZICOND-NEXT: ret +; +; ZHINX_ZICOND-LABEL: select_i1_f32_0: +; ZHINX_ZICOND: # %bb.0: # %entry +; ZHINX_ZICOND-NEXT: # kill: def $x11_w killed $x11_w def $x11 +; ZHINX_ZICOND-NEXT: andi a0, a0, 1 +; ZHINX_ZICOND-NEXT: czero.eqz a0, a1, a0 +; ZHINX_ZICOND-NEXT: # kill: def $x10_w killed $x10_w killed $x10 +; ZHINX_ZICOND-NEXT: ret ; ; RV64FD-LABEL: select_i1_f32_0: ; RV64FD: # %bb.0: # %entry @@ -503,14 +444,6 @@ define dso_local noundef float @select_i1_f32_0(i1 %cond, float %t) nounwind { ; RV64FD-NEXT: .LBB4_2: # %entry ; RV64FD-NEXT: ret ; -; RV32ZFINX_ZICOND-LABEL: select_i1_f32_0: -; RV32ZFINX_ZICOND: # %bb.0: # %entry -; RV32ZFINX_ZICOND-NEXT: # kill: def $x11_w killed $x11_w def $x11 -; RV32ZFINX_ZICOND-NEXT: andi a0, a0, 1 -; RV32ZFINX_ZICOND-NEXT: czero.eqz a0, a1, a0 -; RV32ZFINX_ZICOND-NEXT: # kill: def $x10_w killed $x10_w killed $x10 -; RV32ZFINX_ZICOND-NEXT: ret -; ; RV32ZFINX_NOZICOND-LABEL: select_i1_f32_0: ; RV32ZFINX_NOZICOND: # %bb.0: # %entry ; RV32ZFINX_NOZICOND-NEXT: andi a2, a0, 1 @@ -520,24 +453,6 @@ define dso_local noundef float @select_i1_f32_0(i1 %cond, float %t) nounwind { ; RV32ZFINX_NOZICOND-NEXT: li a0, 0 ; RV32ZFINX_NOZICOND-NEXT: .LBB4_2: # %entry ; RV32ZFINX_NOZICOND-NEXT: ret -; -; RV32ZDINX_ZICOND-LABEL: select_i1_f32_0: -; RV32ZDINX_ZICOND: # %bb.0: # %entry -; RV32ZDINX_ZICOND-NEXT: # kill: def $x11_w killed $x11_w def $x11 -; RV32ZDINX_ZICOND-NEXT: andi a0, a0, 1 -; RV32ZDINX_ZICOND-NEXT: czero.eqz a0, a1, a0 -; RV32ZDINX_ZICOND-NEXT: # kill: def $x10_w killed $x10_w killed $x10 -; RV32ZDINX_ZICOND-NEXT: ret -; -; RV32ZDINX_NOZICOND-LABEL: select_i1_f32_0: -; RV32ZDINX_NOZICOND: # %bb.0: # %entry -; RV32ZDINX_NOZICOND-NEXT: andi a2, a0, 1 -; RV32ZDINX_NOZICOND-NEXT: mv a0, a1 -; RV32ZDINX_NOZICOND-NEXT: bnez a2, .LBB4_2 -; 
RV32ZDINX_NOZICOND-NEXT: # %bb.1: # %entry -; RV32ZDINX_NOZICOND-NEXT: li a0, 0 -; RV32ZDINX_NOZICOND-NEXT: .LBB4_2: # %entry -; RV32ZDINX_NOZICOND-NEXT: ret entry: %sel = select i1 %cond, float %t, float 0.000000e+00 ret float %sel @@ -547,15 +462,15 @@ entry: ; Test select with i1 condition and zero ret val for half fp (cond ? a : 0) ; ----------------------------------------------------------------------------- define dso_local noundef half @select_i1_half_0(i1 %cond, half %val) nounwind { -; RV64ZDINX_ZICOND-LABEL: select_i1_half_0: -; RV64ZDINX_ZICOND: # %bb.0: # %entry -; RV64ZDINX_ZICOND-NEXT: # kill: def $x11_w killed $x11_w def $x11 -; RV64ZDINX_ZICOND-NEXT: andi a0, a0, 1 -; RV64ZDINX_ZICOND-NEXT: czero.eqz a0, a1, a0 -; RV64ZDINX_ZICOND-NEXT: lui a1, 1048560 -; RV64ZDINX_ZICOND-NEXT: or a0, a0, a1 -; RV64ZDINX_ZICOND-NEXT: # kill: def $x10_w killed $x10_w killed $x10 -; RV64ZDINX_ZICOND-NEXT: ret +; ZDINX_ZICOND-LABEL: select_i1_half_0: +; ZDINX_ZICOND: # %bb.0: # %entry +; ZDINX_ZICOND-NEXT: # kill: def $x11_w killed $x11_w def $x11 +; ZDINX_ZICOND-NEXT: andi a0, a0, 1 +; ZDINX_ZICOND-NEXT: czero.eqz a0, a1, a0 +; ZDINX_ZICOND-NEXT: lui a1, 1048560 +; ZDINX_ZICOND-NEXT: or a0, a0, a1 +; ZDINX_ZICOND-NEXT: # kill: def $x10_w killed $x10_w killed $x10 +; ZDINX_ZICOND-NEXT: ret ; ; RV64ZDINX_NOZICOND-LABEL: select_i1_half_0: ; RV64ZDINX_NOZICOND: # %bb.0: # %entry @@ -608,16 +523,6 @@ define dso_local noundef half @select_i1_half_0(i1 %cond, half %val) nounwind { ; RV32ZFINX_NOZICOND-NEXT: # kill: def $x10_w killed $x10_w killed $x10 ; RV32ZFINX_NOZICOND-NEXT: ret ; -; RV32ZDINX_ZICOND-LABEL: select_i1_half_0: -; RV32ZDINX_ZICOND: # %bb.0: # %entry -; RV32ZDINX_ZICOND-NEXT: # kill: def $x11_w killed $x11_w def $x11 -; RV32ZDINX_ZICOND-NEXT: andi a0, a0, 1 -; RV32ZDINX_ZICOND-NEXT: czero.eqz a0, a1, a0 -; RV32ZDINX_ZICOND-NEXT: lui a1, 1048560 -; RV32ZDINX_ZICOND-NEXT: or a0, a0, a1 -; RV32ZDINX_ZICOND-NEXT: # kill: def $x10_w killed $x10_w killed $x10 -; 
RV32ZDINX_ZICOND-NEXT: ret -; ; RV32ZDINX_NOZICOND-LABEL: select_i1_half_0: ; RV32ZDINX_NOZICOND: # %bb.0: # %entry ; RV32ZDINX_NOZICOND-NEXT: # kill: def $x11_w killed $x11_w def $x11 From 8ceeba83812d551423a9e50f600cc77ea4718ca2 Mon Sep 17 00:00:00 2001 From: Ming Yan Date: Mon, 1 Dec 2025 18:54:21 +0800 Subject: [PATCH 25/39] [MLIR][SCF] Canonicalize redundant scf.if from scf.while before region into after region (#169892) When a `scf.if` directly precedes a `scf.condition` in the before region of a `scf.while` and both share the same condition, move the if into the after region of the loop. This helps simplify the control flow to enable uplifting `scf.while` to `scf.for`. --- mlir/lib/Dialect/SCF/IR/SCF.cpp | 131 +++++++++++++++++++++++- mlir/test/Dialect/SCF/canonicalize.mlir | 50 +++++++++ 2 files changed, 180 insertions(+), 1 deletion(-) diff --git a/mlir/lib/Dialect/SCF/IR/SCF.cpp b/mlir/lib/Dialect/SCF/IR/SCF.cpp index 881e256a8797b..bb07291036667 100644 --- a/mlir/lib/Dialect/SCF/IR/SCF.cpp +++ b/mlir/lib/Dialect/SCF/IR/SCF.cpp @@ -26,6 +26,7 @@ #include "mlir/Interfaces/ParallelCombiningOpInterface.h" #include "mlir/Interfaces/ValueBoundsOpInterface.h" #include "mlir/Transforms/InliningUtils.h" +#include "mlir/Transforms/RegionUtils.h" #include "llvm/ADT/MapVector.h" #include "llvm/ADT/STLExtras.h" #include "llvm/ADT/SmallPtrSet.h" @@ -3687,6 +3688,133 @@ LogicalResult scf::WhileOp::verify() { } namespace { +/// Move a scf.if op that is directly before the scf.condition op in the while +/// before region, and whose condition matches the condition of the +/// scf.condition op, down into the while after region. +/// +/// scf.while (..) : (...) -> ... { +/// %additional_used_values = ... +/// %cond = ... +/// ... +/// %res = scf.if %cond -> (...) { +/// use(%additional_used_values) +/// ... // then block +/// scf.yield %then_value +/// } else { +/// scf.yield %else_value +/// } +/// scf.condition(%cond) %res, ... 
+/// } do { +/// ^bb0(%res_arg, ...): +/// use(%res_arg) +/// ... +/// +/// becomes +/// scf.while (..) : (...) -> ... { +/// %additional_used_values = ... +/// %cond = ... +/// ... +/// scf.condition(%cond) %else_value, ..., %additional_used_values +/// } do { +/// ^bb0(%res_arg ..., %additional_args): : +/// use(%additional_args) +/// ... // if then block +/// use(%then_value) +/// ... +struct WhileMoveIfDown : public OpRewritePattern { + using OpRewritePattern::OpRewritePattern; + + LogicalResult matchAndRewrite(scf::WhileOp op, + PatternRewriter &rewriter) const override { + auto conditionOp = op.getConditionOp(); + + // Only support ifOp right before the condition at the moment. Relaxing this + // would require to: + // - check that the body does not have side-effects conflicting with + // operations between the if and the condition. + // - check that results of the if operation are only used as arguments to + // the condition. + auto ifOp = dyn_cast_or_null(conditionOp->getPrevNode()); + + // Check that the ifOp is directly before the conditionOp and that it + // matches the condition of the conditionOp. Also ensure that the ifOp has + // no else block with content, as that would complicate the transformation. + // TODO: support else blocks with content. + if (!ifOp || ifOp.getCondition() != conditionOp.getCondition() || + (ifOp.elseBlock() && !ifOp.elseBlock()->without_terminator().empty())) + return failure(); + + assert(ifOp->use_empty() || (llvm::all_equal(ifOp->getUsers()) && + *ifOp->user_begin() == conditionOp) && + "ifOp has unexpected uses"); + + Location loc = op.getLoc(); + + // Replace uses of ifOp results in the conditionOp with the yielded values + // from the ifOp branches. 
+ for (auto [idx, arg] : llvm::enumerate(conditionOp.getArgs())) { + auto it = llvm::find(ifOp->getResults(), arg); + if (it != ifOp->getResults().end()) { + size_t ifOpIdx = it.getIndex(); + Value thenValue = ifOp.thenYield()->getOperand(ifOpIdx); + Value elseValue = ifOp.elseYield()->getOperand(ifOpIdx); + + rewriter.replaceAllUsesWith(ifOp->getResults()[ifOpIdx], elseValue); + rewriter.replaceAllUsesWith(op.getAfterArguments()[idx], thenValue); + } + } + + // Collect additional used values from before region. + SetVector additionalUsedValuesSet; + visitUsedValuesDefinedAbove(ifOp.getThenRegion(), [&](OpOperand *operand) { + if (&op.getBefore() == operand->get().getParentRegion()) + additionalUsedValuesSet.insert(operand->get()); + }); + + // Create new whileOp with additional used values as results. + auto additionalUsedValues = additionalUsedValuesSet.getArrayRef(); + auto additionalValueTypes = llvm::map_to_vector( + additionalUsedValues, [](Value val) { return val.getType(); }); + size_t additionalValueSize = additionalUsedValues.size(); + SmallVector newResultTypes(op.getResultTypes()); + newResultTypes.append(additionalValueTypes); + + auto newWhileOp = + scf::WhileOp::create(rewriter, loc, newResultTypes, op.getInits()); + + rewriter.modifyOpInPlace(newWhileOp, [&] { + newWhileOp.getBefore().takeBody(op.getBefore()); + newWhileOp.getAfter().takeBody(op.getAfter()); + newWhileOp.getAfter().addArguments( + additionalValueTypes, + SmallVector(additionalValueSize, loc)); + }); + + rewriter.modifyOpInPlace(conditionOp, [&] { + conditionOp.getArgsMutable().append(additionalUsedValues); + }); + + // Replace uses of additional used values inside the ifOp then region with + // the whileOp after region arguments. 
+ rewriter.replaceUsesWithIf( + additionalUsedValues, + newWhileOp.getAfterArguments().take_back(additionalValueSize), + [&](OpOperand &use) { + return ifOp.getThenRegion().isAncestor( + use.getOwner()->getParentRegion()); + }); + + // Inline ifOp then region into new whileOp after region. + rewriter.eraseOp(ifOp.thenYield()); + rewriter.inlineBlockBefore(ifOp.thenBlock(), newWhileOp.getAfterBody(), + newWhileOp.getAfterBody()->begin()); + rewriter.eraseOp(ifOp); + rewriter.replaceOp(op, + newWhileOp->getResults().drop_back(additionalValueSize)); + return success(); + } +}; + /// Replace uses of the condition within the do block with true, since otherwise /// the block would not be evaluated. /// @@ -4399,7 +4527,8 @@ void WhileOp::getCanonicalizationPatterns(RewritePatternSet &results, results.add(context); + WhileRemoveUnusedArgs, WhileOpAlignBeforeArgs, WhileMoveIfDown>( + context); } //===----------------------------------------------------------------------===// diff --git a/mlir/test/Dialect/SCF/canonicalize.mlir b/mlir/test/Dialect/SCF/canonicalize.mlir index 084c3fc065de3..ac590fc0c47b9 100644 --- a/mlir/test/Dialect/SCF/canonicalize.mlir +++ b/mlir/test/Dialect/SCF/canonicalize.mlir @@ -974,6 +974,56 @@ func.func @replace_if_with_cond3(%arg0 : i1, %arg2: i64) -> (i32, i64) { // ----- +// CHECK-LABEL: @while_move_if_down +func.func @while_move_if_down() -> i32 { + %defined_outside = "test.get_some_value0" () : () -> (i32) + %0 = scf.while () : () -> (i32) { + %used_value = "test.get_some_value1" () : () -> (i32) + %used_by_subregion = "test.get_some_value2" () : () -> (i32) + %else_value = "test.get_some_value3" () : () -> (i32) + %condition = "test.condition"() : () -> i1 + %res = scf.if %condition -> (i32) { + "test.use0" (%defined_outside) : (i32) -> () + "test.use1" (%used_value) : (i32) -> () + test.alloca_scope_region { + "test.use2" (%used_by_subregion) : (i32) -> () + } + %then_value = "test.get_some_value4" () : () -> (i32) + scf.yield %then_value 
: i32 + } else { + scf.yield %else_value : i32 + } + scf.condition(%condition) %res : i32 + } do { + ^bb0(%res_arg: i32): + "test.use3" (%res_arg) : (i32) -> () + scf.yield + } + return %0 : i32 +} +// CHECK: %[[defined_outside:.*]] = "test.get_some_value0"() : () -> i32 +// CHECK: %[[WHILE_RES:.*]]:3 = scf.while : () -> (i32, i32, i32) { +// CHECK: %[[used_value:.*]] = "test.get_some_value1"() : () -> i32 +// CHECK: %[[used_by_subregion:.*]] = "test.get_some_value2"() : () -> i32 +// CHECK: %[[else_value:.*]] = "test.get_some_value3"() : () -> i32 +// CHECK: %[[condition:.*]] = "test.condition"() : () -> i1 +// CHECK: scf.condition(%[[condition]]) %[[else_value]], %[[used_value]], %[[used_by_subregion]] : i32, i32, i32 +// CHECK: } do { +// CHECK: ^bb0(%[[res_arg:.*]]: i32, %[[used_value_arg:.*]]: i32, %[[used_by_subregion_arg:.*]]: i32): +// CHECK: "test.use0"(%[[defined_outside]]) : (i32) -> () +// CHECK: "test.use1"(%[[used_value_arg]]) : (i32) -> () +// CHECK: test.alloca_scope_region { +// CHECK: "test.use2"(%[[used_by_subregion_arg]]) : (i32) -> () +// CHECK: } +// CHECK: %[[then_value:.*]] = "test.get_some_value4"() : () -> i32 +// CHECK: "test.use3"(%[[then_value]]) : (i32) -> () +// CHECK: scf.yield +// CHECK: } +// CHECK: return %[[WHILE_RES]]#0 : i32 +// CHECK: } + +// ----- + // CHECK-LABEL: @while_cond_true func.func @while_cond_true() -> i1 { %0 = scf.while () : () -> i1 { From 29fef3a51e6dcc5e6b5683c281ce7c19b19f0bbf Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Gergely=20B=C3=A1lint?= Date: Mon, 1 Dec 2025 12:00:31 +0100 Subject: [PATCH 26/39] [BOLT] Improve DWARF CFI generation for pac-ret binaries (#163381) During InsertNegateRAState pass we check the annotations on instructions, to decide where to generate the OpNegateRAState CFIs in the output binary. As only instructions in the input binary were annotated, we have to make a judgement on instructions generated by other BOLT passes. 
Incorrect placement may cause issues when an (async) unwind request is received during the new "unknown" instructions. This patch adds more logic to make a more informed decision on by taking into account: - unknown instructions in a BasicBlock with other instruction have the same RAState. Previously, if the BasicBlock started with an unknown instruction, the RAState was copied from the preceding block. Now, the RAState is copied from the succeeding instructions in the same block. - Some BasicBlocks may only contain instructions with unknown RAState, As explained in issue #160989, these blocks already have incorrect unwind info. Because of this, the last known RAState based on the layout order is copied. Updated bolt/docs/PacRetDesign.md to reflect changes. --- bolt/docs/PacRetDesign.md | 21 +- .../bolt/Passes/InsertNegateRAStatePass.h | 25 +- bolt/lib/Passes/InsertNegateRAStatePass.cpp | 147 ++++++-- bolt/unittests/CMakeLists.txt | 1 + bolt/unittests/Passes/CMakeLists.txt | 30 ++ bolt/unittests/Passes/InsertNegateRAState.cpp | 333 ++++++++++++++++++ 6 files changed, 525 insertions(+), 32 deletions(-) create mode 100644 bolt/unittests/Passes/CMakeLists.txt create mode 100644 bolt/unittests/Passes/InsertNegateRAState.cpp diff --git a/bolt/docs/PacRetDesign.md b/bolt/docs/PacRetDesign.md index f3fe5fbd522cb..2e3cb7b91e0ce 100644 --- a/bolt/docs/PacRetDesign.md +++ b/bolt/docs/PacRetDesign.md @@ -200,15 +200,22 @@ This pass runs after optimizations. It performns the _inverse_ of MarkRAState pa Some BOLT passes can add new Instructions. In InsertNegateRAStatePass, we have to know what RA state these have. -The current solution has the `inferUnknownStates` function to cover these, using -a fairly simple strategy: unknown states inherit the last known state. +> [!important] +> As issue #160989 explains, unwind info is missing from stubs. 
+> For this same reason, we cannot generate correct pac-specific unwind info: the +> signedness of the _incorrect_ return address is meaningless. -This will be updated to a more robust solution. +Assignment of RAStates to newly generated instructions is done in `inferUnknownStates`. +We have two different cases to cover: -> [!important] -> As issue #160989 describes, unwind info is incorrect in stubs with multiple callers. -> For this same reason, we cannot generate correct pac-specific unwind info: the signess -> of the _incorrect_ return address is meaningless. +1. If a BasicBlock has some instructions with known RA state, and some without, we + can copy the RAState of known instructions to the unknown ones. As the control + flow only changes between BasicBlocks, instructions in the same BasicBlock have + the same return address. (The exception is noreturn calls, but these would only + cause problems, if the newly inserted instruction is right after the call.) + +2. If a BasicBlock has no instructions with known RAState, we have to copy the + RAState of the previous BasicBlock in layout order. ### Optimizations requiring special attention diff --git a/bolt/include/bolt/Passes/InsertNegateRAStatePass.h b/bolt/include/bolt/Passes/InsertNegateRAStatePass.h index 836948bf5e9c0..3f003af96162d 100644 --- a/bolt/include/bolt/Passes/InsertNegateRAStatePass.h +++ b/bolt/include/bolt/Passes/InsertNegateRAStatePass.h @@ -1,4 +1,4 @@ -//===- bolt/Passes/InsertNegateRAStatePass.cpp ----------------------------===// +//===- bolt/Passes/InsertNegateRAStatePass.h ------------------------------===// // // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. // See https://llvm.org/LICENSE.txt for license information. @@ -30,9 +30,30 @@ class InsertNegateRAState : public BinaryFunctionPass { private: /// Because states are tracked as MCAnnotations on individual instructions, /// newly inserted instructions do not have a state associated with them. 
- /// New states are "inherited" from the last known state. + /// Uses fillUnknownStateInBB and fillUnknownStubs. void inferUnknownStates(BinaryFunction &BF); + /// Simple case: copy RAStates to unknown insts from previous inst. + /// If the first inst has unknown state, copy set it to the first known state. + /// Accounts for signing and authenticating insts. + void fillUnknownStateInBB(BinaryContext &BC, BinaryBasicBlock &BB); + + /// Fill in RAState in BasicBlocks consisting entirely of new instructions. + /// As of #160989, we have to copy the RAState from the previous BB in the + /// layout, because CFIs are already incorrect here. + void fillUnknownStubs(BinaryFunction &BF); + + /// Returns the first known RAState from \p BB, or std::nullopt if all are + /// unknown. + std::optional getFirstKnownRAState(BinaryContext &BC, + BinaryBasicBlock &BB); + + /// \p Return true if all instructions have unknown RAState. + bool isUnknownBlock(BinaryContext &BC, BinaryBasicBlock &BB); + + /// Set all instructions in \p BB to \p State. 
+ void markUnknownBlock(BinaryContext &BC, BinaryBasicBlock &BB, bool State); + /// Support for function splitting: /// if two consecutive BBs with Signed state are going to end up in different /// functions (so are held by different FunctionFragments), we have to add a diff --git a/bolt/lib/Passes/InsertNegateRAStatePass.cpp b/bolt/lib/Passes/InsertNegateRAStatePass.cpp index 775b7795e77c5..ed4de8a56f89f 100644 --- a/bolt/lib/Passes/InsertNegateRAStatePass.cpp +++ b/bolt/lib/Passes/InsertNegateRAStatePass.cpp @@ -52,8 +52,8 @@ void InsertNegateRAState::runOnFunction(BinaryFunction &BF) { MCInst &Inst = *It; if (BC.MIB->isCFI(Inst)) continue; - auto RAState = BC.MIB->getRAState(Inst); - if (!RAState) { + std::optional RAState = BC.MIB->getRAState(Inst); + if (!RAState.has_value()) { BC.errs() << "BOLT-ERROR: unknown RAState after inferUnknownStates " << " in function " << BF.getPrintName() << "\n"; PassFailed = true; @@ -74,6 +74,20 @@ void InsertNegateRAState::runOnFunction(BinaryFunction &BF) { } } +void InsertNegateRAState::inferUnknownStates(BinaryFunction &BF) { + BinaryContext &BC = BF.getBinaryContext(); + + // Fill in missing RAStates in simple cases (inside BBs). + for (BinaryBasicBlock &BB : BF) { + fillUnknownStateInBB(BC, BB); + } + // BasicBlocks which are made entirely of "new instructions" (instructions + // without RAState annotation) are stubs, and do not have correct unwind info. + // We should iterate in layout order and fill them based on previous known + // RAState. + fillUnknownStubs(BF); +} + void InsertNegateRAState::coverFunctionFragmentStart(BinaryFunction &BF, FunctionFragment &FF) { BinaryContext &BC = BF.getBinaryContext(); @@ -92,8 +106,8 @@ void InsertNegateRAState::coverFunctionFragmentStart(BinaryFunction &BF, // If a function is already split in the input, the first FF can also start // with Signed state. This covers that scenario as well. 
auto II = (*FirstNonEmpty)->getFirstNonPseudo(); - auto RAState = BC.MIB->getRAState(*II); - if (!RAState) { + std::optional RAState = BC.MIB->getRAState(*II); + if (!RAState.has_value()) { BC.errs() << "BOLT-ERROR: unknown RAState after inferUnknownStates " << " in function " << BF.getPrintName() << "\n"; PassFailed = true; @@ -104,32 +118,119 @@ void InsertNegateRAState::coverFunctionFragmentStart(BinaryFunction &BF, MCCFIInstruction::createNegateRAState(nullptr)); } -void InsertNegateRAState::inferUnknownStates(BinaryFunction &BF) { +std::optional +InsertNegateRAState::getFirstKnownRAState(BinaryContext &BC, + BinaryBasicBlock &BB) { + for (const MCInst &Inst : BB) { + if (BC.MIB->isCFI(Inst)) + continue; + std::optional RAState = BC.MIB->getRAState(Inst); + if (RAState.has_value()) + return RAState; + } + return std::nullopt; +} + +bool InsertNegateRAState::isUnknownBlock(BinaryContext &BC, + BinaryBasicBlock &BB) { + std::optional FirstRAState = getFirstKnownRAState(BC, BB); + return !FirstRAState.has_value(); +} + +void InsertNegateRAState::fillUnknownStateInBB(BinaryContext &BC, + BinaryBasicBlock &BB) { + + auto First = BB.getFirstNonPseudo(); + if (First == BB.end()) + return; + // If the first instruction has unknown RAState, we should copy the first + // known RAState. + std::optional RAState = BC.MIB->getRAState(*First); + if (!RAState.has_value()) { + std::optional FirstRAState = getFirstKnownRAState(BC, BB); + if (!FirstRAState.has_value()) + // We fill unknown BBs later. + return; + + BC.MIB->setRAState(*First, *FirstRAState); + } + + // At this point we know the RAState of the first instruction, + // so we can propagate the RAStates to all subsequent unknown instructions. 
+ MCInst Prev = *First; + for (auto It = First + 1; It != BB.end(); ++It) { + MCInst &Inst = *It; + if (BC.MIB->isCFI(Inst)) + continue; + + // No need to check for nullopt: we only entered this loop after the first + // instruction had its RAState set, and RAState is always set for the + // previous instruction in the previous iteration of the loop. + std::optional PrevRAState = BC.MIB->getRAState(Prev); + + std::optional RAState = BC.MIB->getRAState(Inst); + if (!RAState.has_value()) { + if (BC.MIB->isPSignOnLR(Prev)) + PrevRAState = true; + else if (BC.MIB->isPAuthOnLR(Prev)) + PrevRAState = false; + BC.MIB->setRAState(Inst, *PrevRAState); + } + Prev = Inst; + } +} + +void InsertNegateRAState::markUnknownBlock(BinaryContext &BC, + BinaryBasicBlock &BB, bool State) { + // If we call this when an Instruction has either kRASigned or kRAUnsigned + // annotation, setRASigned or setRAUnsigned would fail. + assert(isUnknownBlock(BC, BB) && + "markUnknownBlock should only be called on unknown blocks"); + for (MCInst &Inst : BB) { + if (BC.MIB->isCFI(Inst)) + continue; + BC.MIB->setRAState(Inst, State); + } +} + +void InsertNegateRAState::fillUnknownStubs(BinaryFunction &BF) { BinaryContext &BC = BF.getBinaryContext(); bool FirstIter = true; MCInst PrevInst; - for (BinaryBasicBlock &BB : BF) { - for (MCInst &Inst : BB) { - if (BC.MIB->isCFI(Inst)) - continue; + for (FunctionFragment &FF : BF.getLayout().fragments()) { + for (BinaryBasicBlock *BB : FF) { + if (FirstIter) { + FirstIter = false; + if (isUnknownBlock(BC, *BB)) + // If the first BasicBlock is unknown, the function's entry RAState + // should be used. + markUnknownBlock(BC, *BB, BF.getInitialRAState()); + } else if (isUnknownBlock(BC, *BB)) { + // As explained in issue #160989, the unwind info is incorrect for + // stubs. Indicating the correct RAState without the rest of the unwind + // info being correct is not useful. Instead, we copy the RAState from + // the previous instruction. 
+ std::optional PrevRAState = BC.MIB->getRAState(PrevInst); + if (!PrevRAState.has_value()) { + // No non-cfi instruction encountered in the function yet. + // This means the RAState is the same as at the function entry. + markUnknownBlock(BC, *BB, BF.getInitialRAState()); + continue; + } - auto RAState = BC.MIB->getRAState(Inst); - if (!FirstIter && !RAState) { if (BC.MIB->isPSignOnLR(PrevInst)) - RAState = true; + PrevRAState = true; else if (BC.MIB->isPAuthOnLR(PrevInst)) - RAState = false; - else { - auto PrevRAState = BC.MIB->getRAState(PrevInst); - RAState = PrevRAState ? *PrevRAState : false; - } - BC.MIB->setRAState(Inst, *RAState); - } else { - FirstIter = false; - if (!RAState) - BC.MIB->setRAState(Inst, BF.getInitialRAState()); + PrevRAState = false; + markUnknownBlock(BC, *BB, *PrevRAState); } - PrevInst = Inst; + // This function iterates on BasicBlocks, so the PrevInst has to be + // updated to the last instruction of the current BasicBlock. If the + // BasicBlock is empty, or only has PseudoInstructions, PrevInst will not + // be updated. 
+ auto Last = BB->getLastNonPseudo(); + if (Last != BB->rend()) + PrevInst = *Last; } } } diff --git a/bolt/unittests/CMakeLists.txt b/bolt/unittests/CMakeLists.txt index 64414b83d39fe..d47ddc46b7388 100644 --- a/bolt/unittests/CMakeLists.txt +++ b/bolt/unittests/CMakeLists.txt @@ -7,3 +7,4 @@ endfunction() add_subdirectory(Core) add_subdirectory(Profile) +add_subdirectory(Passes) diff --git a/bolt/unittests/Passes/CMakeLists.txt b/bolt/unittests/Passes/CMakeLists.txt new file mode 100644 index 0000000000000..3dc578adeb357 --- /dev/null +++ b/bolt/unittests/Passes/CMakeLists.txt @@ -0,0 +1,30 @@ +set(LLVM_LINK_COMPONENTS + DebugInfoDWARF + Object + MC + ${BOLT_TARGETS_TO_BUILD} + ) + +add_bolt_unittest(PassTests + InsertNegateRAState.cpp + + DISABLE_LLVM_LINK_LLVM_DYLIB + ) + +target_link_libraries(PassTests + PRIVATE + LLVMBOLTCore + LLVMBOLTRewrite + LLVMBOLTPasses + LLVMBOLTProfile + LLVMBOLTUtils + ) + +foreach (tgt ${BOLT_TARGETS_TO_BUILD}) + include_directories( + ${LLVM_MAIN_SRC_DIR}/lib/Target/${tgt} + ${LLVM_BINARY_DIR}/lib/Target/${tgt} + ) + string(TOUPPER "${tgt}" upper) + target_compile_definitions(PassTests PRIVATE "${upper}_AVAILABLE") +endforeach() diff --git a/bolt/unittests/Passes/InsertNegateRAState.cpp b/bolt/unittests/Passes/InsertNegateRAState.cpp new file mode 100644 index 0000000000000..2ef78d381e570 --- /dev/null +++ b/bolt/unittests/Passes/InsertNegateRAState.cpp @@ -0,0 +1,333 @@ +//===- bolt/unittest/Passes/InsertNegateRAState.cpp -----------------------===// +// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#ifdef AARCH64_AVAILABLE +#include "AArch64Subtarget.h" +#include "MCTargetDesc/AArch64MCTargetDesc.h" +#endif // AARCH64_AVAILABLE + +#include "bolt/Core/BinaryBasicBlock.h" +#include "bolt/Core/BinaryFunction.h" +#include "bolt/Passes/InsertNegateRAStatePass.h" +#include "bolt/Rewrite/BinaryPassManager.h" +#include "bolt/Rewrite/RewriteInstance.h" +#include "llvm/BinaryFormat/ELF.h" +#include "llvm/MC/MCDwarf.h" +#include "llvm/MC/MCInstBuilder.h" +#include "llvm/Support/TargetSelect.h" +#include "gtest/gtest.h" + +using namespace llvm; +using namespace llvm::object; +using namespace llvm::ELF; +using namespace bolt; + +namespace { +struct PassTester : public testing::TestWithParam { + void SetUp() override { + initalizeLLVM(); + prepareElf(); + initializeBolt(); + } + +protected: + void initalizeLLVM() { +#define BOLT_TARGET(target) \ + LLVMInitialize##target##TargetInfo(); \ + LLVMInitialize##target##TargetMC(); \ + LLVMInitialize##target##AsmParser(); \ + LLVMInitialize##target##Disassembler(); \ + LLVMInitialize##target##Target(); \ + LLVMInitialize##target##AsmPrinter(); + +#include "bolt/Core/TargetConfig.def" + } + +#define PREPARE_FUNC(name) \ + constexpr uint64_t FunctionAddress = 0x1000; \ + BinaryFunction *BF = BC->createBinaryFunction( \ + name, *TextSection, FunctionAddress, /*Size=*/0, /*SymbolSize=*/0, \ + /*Alignment=*/16); \ + /* Make sure the pass runs on the BF.*/ \ + BF->updateState(BinaryFunction::State::CFG); \ + BF->setContainedNegateRAState(); \ + /* All tests need at least one BB. 
*/ \ + BinaryBasicBlock *BB = BF->addBasicBlock(); \ + BF->addEntryPoint(*BB); \ + BB->setCFIState(0); + + void prepareElf() { + memcpy(ElfBuf, "\177ELF", 4); + ELF64LE::Ehdr *EHdr = reinterpret_cast(ElfBuf); + EHdr->e_ident[llvm::ELF::EI_CLASS] = llvm::ELF::ELFCLASS64; + EHdr->e_ident[llvm::ELF::EI_DATA] = llvm::ELF::ELFDATA2LSB; + EHdr->e_machine = GetParam() == Triple::aarch64 ? EM_AARCH64 : EM_X86_64; + MemoryBufferRef Source(StringRef(ElfBuf, sizeof(ElfBuf)), "ELF"); + ObjFile = cantFail(ObjectFile::createObjectFile(Source)); + } + void initializeBolt() { + Relocation::Arch = ObjFile->makeTriple().getArch(); + BC = cantFail(BinaryContext::createBinaryContext( + ObjFile->makeTriple(), std::make_shared(), + ObjFile->getFileName(), nullptr, true, DWARFContext::create(*ObjFile), + {llvm::outs(), llvm::errs()})); + ASSERT_FALSE(!BC); + BC->initializeTarget(std::unique_ptr( + createMCPlusBuilder(GetParam(), BC->MIA.get(), BC->MII.get(), + BC->MRI.get(), BC->STI.get()))); + + PassManager = std::make_unique(*BC); + PassManager->registerPass(std::make_unique()); + + TextSection = &BC->registerOrUpdateSection( + ".text", ELF::SHT_PROGBITS, ELF::SHF_ALLOC | ELF::SHF_EXECINSTR, + /*Data=*/nullptr, /*Size=*/0, + /*Alignment=*/16); + } + + std::vector findCFIOffsets(BinaryFunction &BF) { + std::vector Locations; + int Idx = 0; + int InstSize = 4; // AArch64 + for (BinaryBasicBlock &BB : BF) { + for (MCInst &Inst : BB) { + if (BC->MIB->isCFI(Inst)) { + const MCCFIInstruction *CFI = BF.getCFIFor(Inst); + if (CFI->getOperation() == MCCFIInstruction::OpNegateRAState) + Locations.push_back(Idx * InstSize); + } + Idx++; + } + } + return Locations; + } + + char ElfBuf[sizeof(typename ELF64LE::Ehdr)] = {}; + std::unique_ptr ObjFile; + std::unique_ptr BC; + std::unique_ptr PassManager; + BinarySection *TextSection; +}; +} // namespace + +TEST_P(PassTester, ExampleTest) { + if (GetParam() != Triple::aarch64) + GTEST_SKIP(); + + ASSERT_NE(TextSection, nullptr); + + 
 PREPARE_FUNC("ExampleFunction"); + + MCInst UnsignedInst = MCInstBuilder(AArch64::ADDSXri) + .addReg(AArch64::X0) + .addReg(AArch64::X0) + .addImm(0) + .addImm(0); + BC->MIB->setRAState(UnsignedInst, false); + BB->addInstruction(UnsignedInst); + + MCInst SignedInst = MCInstBuilder(AArch64::ADDSXri) + .addReg(AArch64::X0) + .addReg(AArch64::X0) + .addImm(1) + .addImm(0); + BC->MIB->setRAState(SignedInst, true); + BB->addInstruction(SignedInst); + + Error E = PassManager->runPasses(); + EXPECT_FALSE(E); + + /* Expected layout of BF after the pass: + + .LBB0 (3 instructions, align : 1) + Entry Point + CFI State : 0 + 00000000: adds x0, x0, #0x0 + 00000004: !CFI $0 ; OpNegateRAState + 00000004: adds x0, x0, #0x1 + CFI State: 0 + */ + auto CFILoc = findCFIOffsets(*BF); + EXPECT_EQ(CFILoc.size(), 1u); + EXPECT_EQ(CFILoc[0], 4); +} + +TEST_P(PassTester, fillUnknownStateInBBTest) { + /* Check that if a BB starts with unknown RAState, we can fill the unknown + states based on following instructions with known RAStates. 
+ * + * .LBB0 (1 instructions, align : 1) + Entry Point + CFI State : 0 + 00000000: adds x0, x0, #0x0 + CFI State: 0 + + .LBB1 (4 instructions, align : 1) + CFI State : 0 + 00000004: !CFI $0 ; OpNegateRAState + 00000004: adds x0, x0, #0x1 + 00000008: adds x0, x0, #0x2 + 0000000c: adds x0, x0, #0x3 + CFI State: 0 + */ + if (GetParam() != Triple::aarch64) + GTEST_SKIP(); + + ASSERT_NE(TextSection, nullptr); + + PREPARE_FUNC("FuncWithUnknownStateInBB"); + BinaryBasicBlock *BB2 = BF->addBasicBlock(); + BB2->setCFIState(0); + + MCInst Unsigned = MCInstBuilder(AArch64::ADDSXri) + .addReg(AArch64::X0) + .addReg(AArch64::X0) + .addImm(0) + .addImm(0); + BC->MIB->setRAState(Unsigned, false); + BB->addInstruction(Unsigned); + + MCInst Unknown = MCInstBuilder(AArch64::ADDSXri) + .addReg(AArch64::X0) + .addReg(AArch64::X0) + .addImm(1) + .addImm(0); + MCInst Unknown1 = MCInstBuilder(AArch64::ADDSXri) + .addReg(AArch64::X0) + .addReg(AArch64::X0) + .addImm(2) + .addImm(0); + MCInst Signed = MCInstBuilder(AArch64::ADDSXri) + .addReg(AArch64::X0) + .addReg(AArch64::X0) + .addImm(3) + .addImm(0); + BC->MIB->setRAState(Signed, true); + BB2->addInstruction(Unknown); + BB2->addInstruction(Unknown1); + BB2->addInstruction(Signed); + + Error E = PassManager->runPasses(); + EXPECT_FALSE(E); + + auto CFILoc = findCFIOffsets(*BF); + EXPECT_EQ(CFILoc.size(), 1u); + EXPECT_EQ(CFILoc[0], 4); + // Check that the pass set Unknown and Unknown1 to signed. + // begin() is the CFI, begin() + 1 is Unknown, begin() + 2 is Unknown1. + std::optional RAState = BC->MIB->getRAState(*(BB2->begin() + 1)); + EXPECT_TRUE(RAState.has_value()); + EXPECT_TRUE(*RAState); + std::optional RAState1 = BC->MIB->getRAState(*(BB2->begin() + 2)); + EXPECT_TRUE(RAState1.has_value()); + EXPECT_TRUE(*RAState1); +} + +TEST_P(PassTester, fillUnknownStubs) { + /* + * Stubs that are not part of the function's CFG should inherit the RAState of + the BasicBlock before it. 
+ * + * LBB1 is not part of the CFG: LBB0 jumps unconditionally to LBB2. + * LBB1 would be a stub inserted in LongJmp in real code. + * We do not add any NegateRAState CFIs, as other CFIs are not added either. + * See issue #160989 for more details. + * + * .LBB0 (1 instructions, align : 1) + Entry Point + 00000000: b .LBB2 + Successors: .LBB2 + + .LBB1 (1 instructions, align : 1) + 00000004: ret + + .LBB2 (1 instructions, align : 1) + Predecessors: .LBB0 + 00000008: ret + */ + if (GetParam() != Triple::aarch64) + GTEST_SKIP(); + + ASSERT_NE(TextSection, nullptr); + + PREPARE_FUNC("FuncWithStub"); + BinaryBasicBlock *BB2 = BF->addBasicBlock(); + BB2->setCFIState(0); + BinaryBasicBlock *BB3 = BF->addBasicBlock(); + BB3->setCFIState(0); + + BB->addSuccessor(BB3); + + // Jumping over BB2, to BB3. + MCInst Jump; + BC->MIB->createUncondBranch(Jump, BB3->getLabel(), BC->Ctx.get()); + BB->addInstruction(Jump); + BC->MIB->setRAState(Jump, false); + + // BB2, in real code it would be a ShortJmp. + // Unknown RAState. + MCInst StubInst; + BC->MIB->createReturn(StubInst); + BB2->addInstruction(StubInst); + + // Can be any instruction. + MCInst Ret; + BC->MIB->createReturn(Ret); + BB3->addInstruction(Ret); + BC->MIB->setRAState(Ret, false); + + Error E = PassManager->runPasses(); + EXPECT_FALSE(E); + + // Check that we did not generate any NegateRAState CFIs. + auto CFILoc = findCFIOffsets(*BF); + EXPECT_EQ(CFILoc.size(), 0u); +} + +TEST_P(PassTester, fillUnknownStubsEmpty) { + /* + * This test checks that BOLT can set the RAState of unknown BBs, + * even if all previous BBs are empty, hence no PrevInst gets set. + * + * As this means that the current (empty) BB is the first with non-pseudo + * instructions, the function's initialRAState should be used. 
+ */ + if (GetParam() != Triple::aarch64) + GTEST_SKIP(); + + ASSERT_NE(TextSection, nullptr); + + PREPARE_FUNC("FuncWithStub"); + BF->setInitialRAState(false); + BinaryBasicBlock *BB2 = BF->addBasicBlock(); + BB2->setCFIState(0); + + // BB is empty. + BB->addSuccessor(BB2); + + // BB2, in real code it would be a ShortJmp. + // Unknown RAState. + MCInst StubInst; + BC->MIB->createReturn(StubInst); + BB2->addInstruction(StubInst); + + Error E = PassManager->runPasses(); + EXPECT_FALSE(E); + + // Check that BOLT added an RAState to BB2. + std::optional RAState = BC->MIB->getRAState(*(BB2->begin())); + EXPECT_TRUE(RAState.has_value()); + // BB2 should be set to BF.initialRAState (false). + EXPECT_FALSE(*RAState); +} + +#ifdef AARCH64_AVAILABLE +INSTANTIATE_TEST_SUITE_P(AArch64, PassTester, + ::testing::Values(Triple::aarch64)); +#endif From 2c217909839b345760de964cf87bf1045c9ff784 Mon Sep 17 00:00:00 2001 From: Ming Yan Date: Mon, 1 Dec 2025 19:02:02 +0800 Subject: [PATCH 27/39] Revert "[MLIR][SCF] Sink scf.if from scf.while before region into after region in scf-uplift-while-to-for" (#169888) Reverts llvm/llvm-project#165216 It is implemented in #169892 . --- .../SCF/Transforms/UpliftWhileToFor.cpp | 79 +------------------ mlir/test/Dialect/SCF/uplift-while.mlir | 31 -------- 2 files changed, 1 insertion(+), 109 deletions(-) diff --git a/mlir/lib/Dialect/SCF/Transforms/UpliftWhileToFor.cpp b/mlir/lib/Dialect/SCF/Transforms/UpliftWhileToFor.cpp index 9f242f9e62b8e..ec1044aaa42ac 100644 --- a/mlir/lib/Dialect/SCF/Transforms/UpliftWhileToFor.cpp +++ b/mlir/lib/Dialect/SCF/Transforms/UpliftWhileToFor.cpp @@ -19,83 +19,6 @@ using namespace mlir; namespace { -/// Move an scf.if op that is directly before the scf.condition op in the while -/// before region, and whose condition matches the condition of the -/// scf.condition op, down into the while after region. -/// -/// scf.while (%init) : (...) -> ... { -/// %cond = ... -/// %res = scf.if %cond -> (...) 
{ -/// use1(%init) -/// %then_val = ... -/// ... // then block -/// scf.yield %then_val -/// } else { -/// scf.yield %init -/// } -/// scf.condition(%cond) %res -/// } do { -/// ^bb0(%arg): -/// use2(%arg) -/// ... -/// -/// becomes -/// scf.while (%init) : (...) -> ... { -/// %cond = ... -/// scf.condition(%cond) %init -/// } do { -/// ^bb0(%arg): : -/// use1(%arg) -/// ... // if then block -/// %then_val = ... -/// use2(%then_val) -/// ... -struct WhileMoveIfDown : public OpRewritePattern { - using OpRewritePattern::OpRewritePattern; - - LogicalResult matchAndRewrite(scf::WhileOp op, - PatternRewriter &rewriter) const override { - // Check that the first opeation produces one result and that result must - // have exactly two uses (these two uses come from the `scf.if` and - // `scf.condition` operations). - Operation &condOp = op.getBeforeBody()->front(); - if (condOp.getNumResults() != 1 || !condOp.getResult(0).hasNUses(2)) - return failure(); - - Value condVal = condOp.getResult(0); - auto ifOp = dyn_cast(condOp.getNextNode()); - if (!ifOp || ifOp.getCondition() != condVal) - return failure(); - - auto term = dyn_cast(ifOp->getNextNode()); - if (!term || term.getCondition() != condVal) - return failure(); - - // Check that if results and else yield operands match the scf.condition op - // arguments and while before arguments respectively. - if (!llvm::equal(ifOp->getResults(), term.getArgs()) || - !llvm::equal(ifOp.elseYield()->getOperands(), op.getBeforeArguments())) - return failure(); - - // Update uses and move the if op into the after region. 
- rewriter.replaceAllUsesWith(op.getAfterArguments(), - ifOp.thenYield()->getOperands()); - rewriter.replaceUsesWithIf(op.getBeforeArguments(), op.getAfterArguments(), - [&](OpOperand &use) { - return ifOp.getThenRegion().isAncestor( - use.getOwner()->getParentRegion()); - }); - rewriter.modifyOpInPlace( - term, [&]() { term.getArgsMutable().assign(op.getBeforeArguments()); }); - - rewriter.eraseOp(ifOp.thenYield()); - rewriter.inlineBlockBefore(ifOp.thenBlock(), op.getAfterBody(), - op.getAfterBody()->begin()); - rewriter.eraseOp(ifOp); - return success(); - } -}; - struct UpliftWhileOp : public OpRewritePattern { using OpRewritePattern::OpRewritePattern; @@ -344,5 +267,5 @@ FailureOr mlir::scf::upliftWhileToForLoop(RewriterBase &rewriter, } void mlir::scf::populateUpliftWhileToForPatterns(RewritePatternSet &patterns) { - patterns.add(patterns.getContext()); + patterns.add(patterns.getContext()); } diff --git a/mlir/test/Dialect/SCF/uplift-while.mlir b/mlir/test/Dialect/SCF/uplift-while.mlir index 736112824c515..cbe2ce5076ad2 100644 --- a/mlir/test/Dialect/SCF/uplift-while.mlir +++ b/mlir/test/Dialect/SCF/uplift-while.mlir @@ -185,34 +185,3 @@ func.func @uplift_while(%arg0: index, %arg1: index, %arg2: index) -> (i32, f32) // CHECK: %[[T2:.*]] = "test.test2"(%[[ARG2]]) : (f32) -> f32 // CHECK: scf.yield %[[T1]], %[[T2]] : i32, f32 // CHECK: return %[[RES]]#0, %[[RES]]#1 : i32, f32 - -// ----- - -func.func @uplift_while(%low: index, %upper: index, %val : i32) -> i32 { - %c1 = arith.constant 1 : index - %1:2 = scf.while (%iv = %low, %iter = %val) : (index, i32) -> (index, i32) { - %2 = arith.cmpi slt, %iv, %upper : index - %3:2 = scf.if %2 -> (index, i32) { - %4 = "test.test"(%iter) : (i32) -> i32 - %5 = arith.addi %iv, %c1 : index - scf.yield %5, %4 : index, i32 - } else { - scf.yield %iv, %iter : index, i32 - } - scf.condition(%2) %3#0, %3#1 : index, i32 - } do { - ^bb0(%arg0: index, %arg1: i32): - scf.yield %arg0, %arg1 : index, i32 - } - return %1#1 : i32 -} - 
-// CHECK-LABEL: func.func @uplift_while( -// CHECK-SAME: %[[ARG0:.*]]: index, %[[ARG1:.*]]: index, %[[ARG2:.*]]: i32) -> i32 { -// CHECK: %[[CONSTANT_0:.*]] = arith.constant 1 : index -// CHECK: %[[FOR_0:.*]] = scf.for %[[VAL_0:.*]] = %[[ARG0]] to %[[ARG1]] step %[[CONSTANT_0]] iter_args(%[[VAL_1:.*]] = %[[ARG2]]) -> (i32) { -// CHECK: %[[VAL_2:.*]] = "test.test"(%[[VAL_1]]) : (i32) -> i32 -// CHECK: scf.yield %[[VAL_2]] : i32 -// CHECK: } -// CHECK: return %[[FOR_0]] : i32 -// CHECK: } From b60a84a46fa558dd14497f53fc8ad6f7ff505aaa Mon Sep 17 00:00:00 2001 From: Tom Eccles Date: Mon, 1 Dec 2025 11:19:12 +0000 Subject: [PATCH 28/39] Revert "[flang][TBAA] refine TARGET/POINTER encoding" (#170105) Reverts llvm/llvm-project#169544 [Regressed](https://lab.llvm.org/buildbot/#/builders/143/builds/12956) gfortran test suite --- .../flang/Optimizer/Analysis/TBAAForest.h | 24 ++--- flang/lib/Optimizer/Analysis/TBAAForest.cpp | 9 +- .../lib/Optimizer/Transforms/AddAliasTags.cpp | 18 +--- flang/test/Driver/tco-test-gen.fir | 8 +- flang/test/Fir/tbaa-codegen2.fir | 1 + .../test/Transforms/tbaa-for-common-vars.fir | 29 +++--- .../Transforms/tbaa-for-global-equiv-vars.fir | 6 +- flang/test/Transforms/tbaa-for-local-vars.fir | 32 +++---- .../test/Transforms/tbaa-with-dummy-scope.fir | 22 ++--- .../Transforms/tbaa-with-dummy-scope2.fir | 32 +++---- flang/test/Transforms/tbaa2.fir | 20 +++-- flang/test/Transforms/tbaa3.fir | 89 ++++++++++--------- flang/test/Transforms/tbaa4.fir | 32 ++++--- 13 files changed, 160 insertions(+), 162 deletions(-) diff --git a/flang/include/flang/Optimizer/Analysis/TBAAForest.h b/flang/include/flang/Optimizer/Analysis/TBAAForest.h index 0b70778eba3af..b4932594114a1 100644 --- a/flang/include/flang/Optimizer/Analysis/TBAAForest.h +++ b/flang/include/flang/Optimizer/Analysis/TBAAForest.h @@ -99,25 +99,11 @@ struct TBAATree { // |- "any data access" // | // |- "dummy arg data" - // | - // |- - // |- - // |- "target data" <-- Any POINTER variable or 
TARGET dummy arg - // | - // |- <--- any TARGET variable which isn't a dummy arg - // |- - // |- "allocated data" - // | - // |- - // |- - // |- "direct data" - // | - // |- - // |- - // |- "global data" - // | - // |- - // |- + // |- "target data" + // | + // |- "allocated data" + // |- "direct data" + // |- "global data" static TBAATree buildTree(mlir::StringAttr functionName); private: diff --git a/flang/lib/Optimizer/Analysis/TBAAForest.cpp b/flang/lib/Optimizer/Analysis/TBAAForest.cpp index 7154785c62c75..44a0348da3a6f 100644 --- a/flang/lib/Optimizer/Analysis/TBAAForest.cpp +++ b/flang/lib/Optimizer/Analysis/TBAAForest.cpp @@ -66,9 +66,12 @@ fir::TBAATree::TBAATree(mlir::LLVM::TBAATypeDescriptorAttr anyAccess, mlir::LLVM::TBAATypeDescriptorAttr dataRoot, mlir::LLVM::TBAATypeDescriptorAttr boxMemberTypeDesc) : targetDataTree(dataRoot.getContext(), "target data", dataRoot), - globalDataTree(dataRoot.getContext(), "global data", dataRoot), - allocatedDataTree(dataRoot.getContext(), "allocated data", dataRoot), + globalDataTree(dataRoot.getContext(), "global data", + targetDataTree.getRoot()), + allocatedDataTree(dataRoot.getContext(), "allocated data", + targetDataTree.getRoot()), dummyArgDataTree(dataRoot.getContext(), "dummy arg data", dataRoot), - directDataTree(dataRoot.getContext(), "direct data", dataRoot), + directDataTree(dataRoot.getContext(), "direct data", + targetDataTree.getRoot()), anyAccessDesc(anyAccess), boxMemberTypeDesc(boxMemberTypeDesc), anyDataTypeDesc(dataRoot) {} diff --git a/flang/lib/Optimizer/Transforms/AddAliasTags.cpp b/flang/lib/Optimizer/Transforms/AddAliasTags.cpp index b592cee794f33..0221c7a8184d7 100644 --- a/flang/lib/Optimizer/Transforms/AddAliasTags.cpp +++ b/flang/lib/Optimizer/Transforms/AddAliasTags.cpp @@ -692,9 +692,8 @@ void AddAliasTagsPass::runOnAliasInterface(fir::FirAliasTagOpInterface op, LLVM_DEBUG(llvm::dbgs().indent(2) << "Found reference to dummy argument at " << *op << "\n"); std::string name = 
getFuncArgName(llvm::cast(source.origin.u)); - // POINTERS can alias with any POINTER or TARGET. Assume that TARGET dummy - // arguments might alias with each other (because of the "TARGET" hole for - // dummy arguments). See flang/docs/Aliasing.md. + // If it is a TARGET or POINTER, then we do not care about the name, + // because the tag points to the root of the subtree currently. if (source.isTargetOrPointer()) { tag = state.getFuncTreeWithScope(func, scopeOp).targetDataTree.getTag(); } else if (!name.empty()) { @@ -717,12 +716,7 @@ void AddAliasTagsPass::runOnAliasInterface(fir::FirAliasTagOpInterface op, << "Found reference to global " << globalName.str() << " at " << *op << "\n"); if (source.isPointer()) { - // Pointers can alias with any pointer or target. tag = state.getFuncTreeWithScope(func, scopeOp).targetDataTree.getTag(); - } else if (source.isTarget()) { - // Targets could alias with any pointer but not with each other. - tag = state.getFuncTreeWithScope(func, scopeOp) - .targetDataTree.getTag(globalName); } else { // In general, place the tags under the "global data" root. fir::TBAATree::SubtreeState *subTree = @@ -782,17 +776,9 @@ void AddAliasTagsPass::runOnAliasInterface(fir::FirAliasTagOpInterface op, const char *name = glbl.getRootReference().data(); LLVM_DEBUG(llvm::dbgs().indent(2) << "Found reference to direct " << name << " at " << *op << "\n"); - // Pointer can alias with any pointer or target so that gets the root. if (source.isPointer()) tag = state.getFuncTreeWithScope(func, scopeOp).targetDataTree.getTag(); - // Targets could alias with any pointer but not with each other so they - // get their own node inside of the target data tree. - else if (source.isTarget()) - tag = state.getFuncTreeWithScope(func, scopeOp) - .targetDataTree.getTag(name); else - // Boxes that are not pointers or targets cannot alias with those that - // are. Put them under global data. 
tag = state.getFuncTreeWithScope(func, scopeOp) .directDataTree.getTag(name); } else { diff --git a/flang/test/Driver/tco-test-gen.fir b/flang/test/Driver/tco-test-gen.fir index 438804ce42b76..b39295d72918f 100644 --- a/flang/test/Driver/tco-test-gen.fir +++ b/flang/test/Driver/tco-test-gen.fir @@ -77,13 +77,13 @@ func.func @_QPtest(%arg0: !fir.ref {fir.bindc_name = "num"}, %arg1: !fir.re // CHECK: llvm.cond_br %[[VAL_17]], ^bb2, ^bb3 // CHECK: ^bb2: -// AA: llvm.store %[[VAL_15]], %[[VAL_1]] {tbaa = [#llvm.tbaa_tag, 0>}>, 0>}>, 0>}>, 0>}>, access_type = , 0>}>, 0>}>, 0>}>, 0>}>, offset = 0>]} : i32, !llvm.ptr +// AA: llvm.store %[[VAL_15]], %[[VAL_1]] {tbaa = [#llvm.tbaa_tag, 0>}>, 0>}>, 0>}>, 0>}>, 0>}>, access_type = , 0>}>, 0>}>, 0>}>, 0>}>, 0>}>, offset = 0>]} : i32, !llvm.ptr // NOAA: llvm.store %[[VAL_15]], %{{.*}} : i32, !llvm.ptr // AA: %[[VAL_18:.*]] = llvm.load %[[ARG0]] {tbaa = [#llvm.tbaa_tag, 0>}>, 0>}>, 0>}>, 0>}>, access_type = , 0>}>, 0>}>, 0>}>, 0>}>, offset = 0>]} : !llvm.ptr -> i32 // NOAA: %[[VAL_18:.*]] = llvm.load %[[ARG0]] : !llvm.ptr -> i32 -// AA: %[[VAL_19:.*]] = llvm.load %[[VAL_1]] {tbaa = [#llvm.tbaa_tag, 0>}>, 0>}>, 0>}>, 0>}>, access_type = , 0>}>, 0>}>, 0>}>, 0>}>, offset = 0>]} : !llvm.ptr -> i32 +// AA: %[[VAL_19:.*]] = llvm.load %[[VAL_1]] {tbaa = [#llvm.tbaa_tag, 0>}>, 0>}>, 0>}>, 0>}>, 0>}>, access_type = , 0>}>, 0>}>, 0>}>, 0>}>, 0>}>, offset = 0>]} : !llvm.ptr -> i32 // NOAA: %[[VAL_19:.*]] = llvm.load %{{.*}} : !llvm.ptr -> i32 // CHECK: %[[VAL_20:.*]] = llvm.add %[[VAL_18]], %[[VAL_19]] : i32 @@ -92,7 +92,7 @@ func.func @_QPtest(%arg0: !fir.ref {fir.bindc_name = "num"}, %arg1: !fir.re // CHECK: %[[VAL_21:.*]] = llvm.trunc %[[VAL_10]] : i64 to i32 -// AA: %[[VAL_22:.*]] = llvm.load %[[VAL_1]] {tbaa = [#llvm.tbaa_tag, 0>}>, 0>}>, 0>}>, 0>}>, access_type = , 0>}>, 0>}>, 0>}>, 0>}>, offset = 0>]} : !llvm.ptr -> i32 +// AA: %[[VAL_22:.*]] = llvm.load %[[VAL_1]] {tbaa = [#llvm.tbaa_tag, 0>}>, 0>}>, 0>}>, 0>}>, 0>}>, access_type = 
, 0>}>, 0>}>, 0>}>, 0>}>, 0>}>, offset = 0>]} : !llvm.ptr -> i32 // NOAA: %[[VAL_22:.*]] = llvm.load %{{.*}} : !llvm.ptr -> i32 // CHECK: %[[VAL_23:.*]] = llvm.add %[[VAL_22]], %[[VAL_21]] overflow : i32 @@ -100,7 +100,7 @@ func.func @_QPtest(%arg0: !fir.ref {fir.bindc_name = "num"}, %arg1: !fir.re // CHECK: llvm.br ^bb1(%[[VAL_23]], %[[VAL_24]] : i32, i64) // CHECK: ^bb3: -// AA: llvm.store %[[VAL_15]], %[[VAL_1]] {tbaa = [#llvm.tbaa_tag, 0>}>, 0>}>, 0>}>, 0>}>, access_type = , 0>}>, 0>}>, 0>}>, 0>}>, offset = 0>]} : i32, !llvm.ptr +// AA: llvm.store %[[VAL_15]], %[[VAL_1]] {tbaa = [#llvm.tbaa_tag, 0>}>, 0>}>, 0>}>, 0>}>, 0>}>, access_type = , 0>}>, 0>}>, 0>}>, 0>}>, 0>}>, offset = 0>]} : i32, !llvm.ptr // NOAA: llvm.store %[[VAL_15]], %{{.*}} : i32, !llvm.ptr // CHECK: llvm.return diff --git a/flang/test/Fir/tbaa-codegen2.fir b/flang/test/Fir/tbaa-codegen2.fir index 071d3ec89394c..4907aa03ec5a5 100644 --- a/flang/test/Fir/tbaa-codegen2.fir +++ b/flang/test/Fir/tbaa-codegen2.fir @@ -114,3 +114,4 @@ module attributes {fir.defaultkind = "a1c4d8i4l4r4", fir.kindmap = "", llvm.targ // CHECK: ![[TMP_DATA_ACCESS_TAG]] = !{![[TMP_DATA_ACCESS_TYPE:.*]], ![[TMP_DATA_ACCESS_TYPE]], i64 0} // CHECK: ![[TMP_DATA_ACCESS_TYPE]] = !{!"allocated data/", ![[TMP_ACCESS_TYPE:.*]], i64 0} // CHECK: ![[TMP_ACCESS_TYPE]] = !{!"allocated data", ![[TARGET_ACCESS_TAG:.*]], i64 0} +// CHECK: ![[TARGET_ACCESS_TAG]] = !{!"target data", ![[DATA_ACCESS_TYPE]], i64 0} diff --git a/flang/test/Transforms/tbaa-for-common-vars.fir b/flang/test/Transforms/tbaa-for-common-vars.fir index 087e6938f8acb..a8dd86bff72ed 100644 --- a/flang/test/Transforms/tbaa-for-common-vars.fir +++ b/flang/test/Transforms/tbaa-for-common-vars.fir @@ -28,7 +28,8 @@ module attributes {dlti.dl_spec = #dlti.dl_spec : vector<4 // CHECK: #[[$ATTR_0:.+]] = #llvm.tbaa_root // CHECK: #[[$ATTR_1:.+]] = #llvm.tbaa_type_desc}> // CHECK: #[[$ATTR_2:.+]] = #llvm.tbaa_type_desc}> -// CHECK: #[[$ATTR_4:.+]] = #llvm.tbaa_type_desc}> +// 
CHECK: #[[$ATTR_3:.+]] = #llvm.tbaa_type_desc}> +// CHECK: #[[$ATTR_4:.+]] = #llvm.tbaa_type_desc}> // CHECK: #[[$ATTR_5:.+]] = #llvm.tbaa_type_desc}> // CHECK: #[[$ATTR_6:.+]] = #llvm.tbaa_type_desc}> // CHECK: #[[$ATTR_7:.+]] = #llvm.tbaa_type_desc}> @@ -65,7 +66,8 @@ module attributes {dlti.dl_spec = #dlti.dl_spec : vector<4 // CHECK: #[[$ATTR_10:.+]] = #llvm.tbaa_root // CHECK: #[[$ATTR_11:.+]] = #llvm.tbaa_type_desc}> // CHECK: #[[$ATTR_12:.+]] = #llvm.tbaa_type_desc}> -// CHECK: #[[$ATTR_14:.+]] = #llvm.tbaa_type_desc}> +// CHECK: #[[$ATTR_13:.+]] = #llvm.tbaa_type_desc}> +// CHECK: #[[$ATTR_14:.+]] = #llvm.tbaa_type_desc}> // CHECK: #[[$ATTR_15:.+]] = #llvm.tbaa_type_desc}> // CHECK: #[[$ATTR_16:.+]] = #llvm.tbaa_type_desc}> // CHECK: #[[$ATTR_18:.+]] = #llvm.tbaa_tag @@ -116,13 +118,14 @@ module attributes {dlti.dl_spec = #dlti.dl_spec : vector<4 // CHECK: #[[ANYACC3INNER:.+]] = #llvm.tbaa_type_desc}> // CHECK: #[[ANYDATA3:.+]] = #llvm.tbaa_type_desc}> // CHECK: #[[ANYDATA3INNER:.+]] = #llvm.tbaa_type_desc}> -// CHECK: #[[GLOBALDATA3:.+]] = #llvm.tbaa_type_desc}> +// CHECK: #[[TARGETDATA3:.+]] = #llvm.tbaa_type_desc}> // CHECK: #[[DUMMYARG3INNER:.+]] = #llvm.tbaa_type_desc}> -// CHECK: #[[GLOBALDATA3COMMON3:.+]] = #llvm.tbaa_type_desc}> +// CHECK: #[[GLOBALDATA3:.+]] = #llvm.tbaa_type_desc}> // CHECK: #[[DUMMYD:.+]] = #llvm.tbaa_type_desc}> // CHECK: #[[DUMMYC:.+]] = #llvm.tbaa_type_desc}> // CHECK: #[[DUMMYDTAG:.+]] = #llvm.tbaa_tag // CHECK: #[[DUMMYCTAG:.+]] = #llvm.tbaa_tag +// CHECK: #[[GLOBALDATA3COMMON3:.+]] = #llvm.tbaa_type_desc}> // CHECK: #[[GLOBALB:.+]] = #llvm.tbaa_type_desc}> // CHECK: #[[GLOBALA:.+]] = #llvm.tbaa_type_desc}> // CHECK: #[[GLOBALBTAG:.+]] = #llvm.tbaa_tag @@ -177,8 +180,10 @@ module attributes {dlti.dl_spec = #dlti.dl_spec : vector<4 // CHECK: #[[INNER4ANYACC:.+]] = #llvm.tbaa_type_desc}> // CHECK: #[[TEST4ANYDATA:.+]] = #llvm.tbaa_type_desc}> // CHECK: #[[INNER4ANYDATA:.+]] = #llvm.tbaa_type_desc}> -// CHECK: 
#[[TEST4GLOBAL:.+]] = #llvm.tbaa_type_desc}> -// CHECK: #[[INNER4GLOBAL:.+]] = #llvm.tbaa_type_desc}> +// CHECK: #[[TEST4TARGET:.+]] = #llvm.tbaa_type_desc}> +// CHECK: #[[INNER4TARGET:.+]] = #llvm.tbaa_type_desc}> +// CHECK: #[[TEST4GLOBAL:.+]] = #llvm.tbaa_type_desc}> +// CHECK: #[[INNER4GLOBAL:.+]] = #llvm.tbaa_type_desc}> // CHECK: #[[TEST4COMMON:.+]] = #llvm.tbaa_type_desc}> // CHECK: #[[INNER4COMMON:.+]] = #llvm.tbaa_type_desc}> // CHECK: #[[TEST4B:.+]] = #llvm.tbaa_type_desc}> @@ -224,7 +229,8 @@ module attributes {dlti.dl_spec = #dlti.dl_spec : vector<4 // CHECK: #[[TEST5ROOT:.+]] = #llvm.tbaa_root // CHECK: #[[TEST5ANYACC:.+]] = #llvm.tbaa_type_desc}> // CHECK: #[[TEST5ANYDATA:.+]] = #llvm.tbaa_type_desc}> -// CHECK: #[[TEST5GLOBAL:.+]] = #llvm.tbaa_type_desc}> +// CHECK: #[[TEST5TARGET:.+]] = #llvm.tbaa_type_desc}> +// CHECK: #[[TEST5GLOBAL:.+]] = #llvm.tbaa_type_desc}> // CHECK: #[[TEST5COMMON5:.+]] = #llvm.tbaa_type_desc}> // CHECK: #[[TEST5COMMON5TAG:.+]] = #llvm.tbaa_tag // CHECK: #[[TEST5A:.+]] = #llvm.tbaa_type_desc}> @@ -282,7 +288,8 @@ module attributes {dlti.dl_spec = #dlti.dl_spec : vector<4 // CHECK: #[[$ATTR_0:.+]] = #llvm.tbaa_root // CHECK: #[[$ATTR_1:.+]] = #llvm.tbaa_type_desc}> // CHECK: #[[$ATTR_2:.+]] = #llvm.tbaa_type_desc}> -// CHECK: #[[$ATTR_4:.+]] = #llvm.tbaa_type_desc}> +// CHECK: #[[$ATTR_3:.+]] = #llvm.tbaa_type_desc}> +// CHECK: #[[$ATTR_4:.+]] = #llvm.tbaa_type_desc}> // CHECK: #[[$ATTR_5:.+]] = #llvm.tbaa_type_desc}> // CHECK: #[[$ATTR_6:.+]] = #llvm.tbaa_type_desc}> // CHECK: #[[$ATTR_7:.+]] = #llvm.tbaa_tag @@ -347,8 +354,8 @@ module attributes {dlti.dl_spec = #dlti.dl_spec : vector<4 // CHECK: #[[$ATTR_74:.+]] = #llvm.tbaa_type_desc}> // CHECK: #[[$ATTR_75:.+]] = #llvm.tbaa_type_desc}> // CHECK: #[[$ATTR_76:.+]] = #llvm.tbaa_type_desc}> -// CHECK: #[[$ATTR_78:.+]] = #llvm.tbaa_type_desc}> // CHECK: #[[$ATTR_77:.+]] = #llvm.tbaa_tag +// CHECK: #[[$ATTR_78:.+]] = #llvm.tbaa_type_desc}> // CHECK: #[[$ATTR_79:.+]] = 
#llvm.tbaa_type_desc}> // CHECK: #[[$ATTR_80:.+]] = #llvm.tbaa_type_desc}> // CHECK: #[[$ATTR_81:.+]] = #llvm.tbaa_tag @@ -418,10 +425,10 @@ module attributes {dlti.dl_spec = #dlti.dl_spec : vector<4 // CHECK: #[[$ATTR_82:.+]] = #llvm.tbaa_root // CHECK: #[[$ATTR_83:.+]] = #llvm.tbaa_type_desc}> // CHECK: #[[$ATTR_84:.+]] = #llvm.tbaa_type_desc}> -// CHECK: #[[$ATTR_87:.+]] = #llvm.tbaa_type_desc}> // CHECK: #[[$ATTR_85:.+]] = #llvm.tbaa_type_desc}> -// CHECK: #[[$ATTR_88:.+]] = #llvm.tbaa_tag // CHECK: #[[$ATTR_86:.+]] = #llvm.tbaa_tag +// CHECK: #[[$ATTR_87:.+]] = #llvm.tbaa_type_desc}> +// CHECK: #[[$ATTR_88:.+]] = #llvm.tbaa_tag // CHECK-LABEL: func.func @_QPtest8() { // CHECK: fir.load %{{[0-9]+}} : !fir.ref>> // CHECK: fir.load %{{[0-9]+}} {tbaa = [#[[$ATTR_86]]]} : !fir.ptr diff --git a/flang/test/Transforms/tbaa-for-global-equiv-vars.fir b/flang/test/Transforms/tbaa-for-global-equiv-vars.fir index 0d082c7504024..dbefa3f8e3f5f 100644 --- a/flang/test/Transforms/tbaa-for-global-equiv-vars.fir +++ b/flang/test/Transforms/tbaa-for-global-equiv-vars.fir @@ -30,7 +30,8 @@ module attributes {dlti.dl_spec = #dlti.dl_spec : vector<4 // CHECK: #[[ROOT1:.+]] = #llvm.tbaa_root // CHECK: #[[ANYACC1:.+]] = #llvm.tbaa_type_desc}> // CHECK: #[[ANYDATA1:.+]] = #llvm.tbaa_type_desc}> -// CHECK: #[[GLOBALDATA1:.+]] = #llvm.tbaa_type_desc}> +// CHECK: #[[TARGETDATA1:.+]] = #llvm.tbaa_type_desc}> +// CHECK: #[[GLOBALDATA1:.+]] = #llvm.tbaa_type_desc}> // CHECK: #[[GLOB1COMMON:.+]] = #llvm.tbaa_type_desc}> // CHECK: #[[GLOB1:.+]] = #llvm.tbaa_type_desc}> // CHECK: #[[TAG:.+]] = #llvm.tbaa_tag @@ -73,7 +74,8 @@ module attributes {dlti.dl_spec = #dlti.dl_spec : vector<4 // CHECK: #[[ROOT2:.+]] = #llvm.tbaa_root // CHECK: #[[ANYACC2:.+]] = #llvm.tbaa_type_desc}> // CHECK: #[[ANYDATA2:.+]] = #llvm.tbaa_type_desc}> -// CHECK: #[[GLOBALDATA2:.+]] = #llvm.tbaa_type_desc}> +// CHECK: #[[TARGETDATA2:.+]] = #llvm.tbaa_type_desc}> +// CHECK: #[[GLOBALDATA2:.+]] = #llvm.tbaa_type_desc}> // 
CHECK: #[[GLOB1COMMON:.+]] = #llvm.tbaa_type_desc}> // CHECK: #[[GLOB1GLOB2:.+]] = #llvm.tbaa_type_desc}> // CHECK: #[[GLOB3:.+]] = #llvm.tbaa_type_desc}> diff --git a/flang/test/Transforms/tbaa-for-local-vars.fir b/flang/test/Transforms/tbaa-for-local-vars.fir index fde5c400c75ed..4eb6b2ecf31c4 100644 --- a/flang/test/Transforms/tbaa-for-local-vars.fir +++ b/flang/test/Transforms/tbaa-for-local-vars.fir @@ -35,22 +35,18 @@ // scope's TBAA tree. // RUN: fir-opt --fir-add-alias-tags %s | FileCheck %s -// CHECK: #[[$SCOPE_2:.+]] = #llvm.tbaa_root -// CHECK: #[[$SCOPE_1:.+]] = #llvm.tbaa_root -// CHECK: #[[$ANY_ACCESS2:.+]] = #llvm.tbaa_type_desc}> -// CHECK: #[[$ANY_ACCESS1:.+]] = #llvm.tbaa_type_desc}> -// CHECK: #[[$ANY_DATA2:.+]] = #llvm.tbaa_type_desc}> -// CHECK: #[[$ANY_DATA1:.+]] = #llvm.tbaa_type_desc}> -// CHECK: #[[$DUMMY_ARG2:.+]] = #llvm.tbaa_type_desc}> -// CHECK: #[[$ALLOCATED_DATA1:.+]] = #llvm.tbaa_type_desc}> -// CHECK: #[[$DUMMY_ARG1:.+]] = #llvm.tbaa_type_desc}> -// CHECK: #[[$ALLOCATED_DATA1_TAG:.+]] = #llvm.tbaa_tag -// CHECK: #[[$BAR_THIS2:.+]] = #llvm.tbaa_type_desc}> -// CHECK: #[[$TEST_VAR1:.+]] = #llvm.tbaa_type_desc}> -// CHECK: #[[$TEST_ARG1:.+]] = #llvm.tbaa_type_desc}> -// CHECK: #[[$BAR_THIS2_TAG:.+]] = #llvm.tbaa_tag -// CHECK: #[[$TEST_VAR1_TAG:.+]] = #llvm.tbaa_tag -// CHECK: #[[$TEST_ARG2_TAG:.+]] = #llvm.tbaa_tag +// CHECK: #[[$ATTR_0:.+]] = #llvm.tbaa_root +// CHECK: #[[$ATTR_1:.+]] = #llvm.tbaa_root +// CHECK: #[[$ATTR_2:.+]] = #llvm.tbaa_type_desc}> +// CHECK: #[[$ATTR_3:.+]] = #llvm.tbaa_type_desc}> +// CHECK: #[[$ATTR_4:.+]] = #llvm.tbaa_type_desc}> +// CHECK: #[[$ATTR_5:.+]] = #llvm.tbaa_type_desc}> +// CHECK: #[[$ATTR_6:.+]] = #llvm.tbaa_type_desc}> +// CHECK: #[[$ATTR_7:.+]] = #llvm.tbaa_type_desc}> +// CHECK: #[[$ATTR_9:.+]] = #llvm.tbaa_type_desc}> +// CHECK: #[[$ATTR_10:.+]] = #llvm.tbaa_type_desc}> +// CHECK: #[[$ATTR_12:.+]] = #llvm.tbaa_tag +// CHECK: #[[$ATTR_13:.+]] = #llvm.tbaa_tag // CHECK-LABEL: func.func 
@_QMmPtest( // CHECK-SAME: %[[ARG0:.*]]: !fir.ref {fir.bindc_name = "arg"}) { @@ -65,10 +61,10 @@ // CHECK: %[[VAL_10:.*]] = fir.dummy_scope : !fir.dscope // CHECK: %[[VAL_11:.*]] = fir.declare %[[VAL_9]] dummy_scope %[[VAL_10]] {fortran_attrs = #fir.var_attrs, uniq_name = "_QMmFbarEthis"} : (!fir.class>, !fir.dscope) -> !fir.class> // CHECK: %[[VAL_12:.*]] = fir.coordinate_of %[[VAL_11]], x : (!fir.class>) -> !fir.ref -// CHECK: fir.store %[[VAL_0]] to %[[VAL_12]] {tbaa = [#[[$BAR_THIS2_TAG]]]} : !fir.ref +// CHECK: fir.store %[[VAL_0]] to %[[VAL_12]] {tbaa = [#[[$ATTR_12]]]} : !fir.ref // CHECK: %[[VAL_13:.*]] = fir.declare %[[VAL_1]] {uniq_name = ".tmp.func_result"} : (!fir.ref>) -> !fir.ref> // CHECK: %[[VAL_14:.*]] = fir.coordinate_of %[[VAL_13]], x : (!fir.ref>) -> !fir.ref -// CHECK: %[[VAL_16:.*]] = fir.load %[[VAL_14]] {tbaa = [#[[$ALLOCATED_DATA1_TAG]]]} : !fir.ref +// CHECK: %[[VAL_16:.*]] = fir.load %[[VAL_14]] {tbaa = [#[[$ATTR_13]]]} : !fir.ref module attributes {dlti.dl_spec = #dlti.dl_spec : vector<4xi64>, i1 = dense<8> : vector<2xi64>, i8 = dense<8> : vector<2xi64>, i16 = dense<16> : vector<2xi64>, i32 = dense<32> : vector<2xi64>, i64 = dense<[32, 64]> : vector<2xi64>, f16 = dense<16> : vector<2xi64>, f64 = dense<64> : vector<2xi64>, f128 = dense<128> : vector<2xi64>, "dlti.endianness" = "little">, llvm.data_layout = ""} { func.func @_QMmPtest(%arg0: !fir.ref {fir.bindc_name = "arg"}) { %cst = arith.constant 1.000000e+00 : f32 diff --git a/flang/test/Transforms/tbaa-with-dummy-scope.fir b/flang/test/Transforms/tbaa-with-dummy-scope.fir index d7f33776150ae..4ae2b8efe2581 100644 --- a/flang/test/Transforms/tbaa-with-dummy-scope.fir +++ b/flang/test/Transforms/tbaa-with-dummy-scope.fir @@ -24,7 +24,7 @@ // CHECK: #[[TARGETDATA:.+]] = #llvm.tbaa_type_desc}> // CHECK: #[[$ATTR_6:.+]] = #llvm.tbaa_type_desc}> // CHECK: #[[$ATTR_7:.+]] = #llvm.tbaa_type_desc}> -// CHECK: #[[TARGETDATA_TAG:.+]] = #llvm.tbaa_tag +// CHECK: #[[TARGETTAG:.+]] = #llvm.tbaa_tag 
// CHECK: #[[$ATTR_8:.+]] = #llvm.tbaa_type_desc}> // CHECK: #[[$ATTR_9:.+]] = #llvm.tbaa_type_desc}> // CHECK: #[[$ATTR_10:.+]] = #llvm.tbaa_type_desc}> @@ -34,8 +34,8 @@ // CHECK: #[[$ATTR_14:.+]] = #llvm.tbaa_tag // CHECK: #[[$ATTR_15:.+]] = #llvm.tbaa_tag // CHECK: func.func @test1( -// CHECK: %[[VAL_5:.*]] = fir.load %{{.*}} {tbaa = [#[[TARGETDATA_TAG]]]} : !fir.ref -// CHECK: fir.store %{{.*}} {tbaa = [#[[TARGETDATA_TAG]]]} : !fir.ref +// CHECK: %[[VAL_5:.*]] = fir.load %{{.*}} {tbaa = [#[[TARGETTAG]]]} : !fir.ref +// CHECK: fir.store %{{.*}} {tbaa = [#[[TARGETTAG]]]} : !fir.ref // CHECK: %[[VAL_6:.*]] = fir.dummy_scope : !fir.dscope // CHECK: %[[VAL_9:.*]] = fir.load %{{.*}} {tbaa = [#[[$ATTR_12]]]} : !fir.ref // CHECK: fir.store %{{.*}} {tbaa = [#[[$ATTR_13]]]} : !fir.ref @@ -83,21 +83,23 @@ func.func @test1(%arg0: !fir.ref {fir.bindc_name = "x", fir.target}, %arg1: // CHECK: #[[$ATTR_33:.+]] = #llvm.tbaa_root // CHECK: #[[$ATTR_34:.+]] = #llvm.tbaa_type_desc}> // CHECK: #[[$ATTR_35:.+]] = #llvm.tbaa_type_desc}> -// CHECK: #[[$CALLERANYDATA:.+]] = #llvm.tbaa_type_desc}> -// CHECK: #[[$CALLEEANYDATA:.+]] = #llvm.tbaa_type_desc}> -// CHECK: #[[$ATTR_38:.+]] = #llvm.tbaa_type_desc}> -// CHECK: #[[$ATTR_39:.+]] = #llvm.tbaa_type_desc}> -// CHECK: #[[$ATTR_40:.+]] = #llvm.tbaa_type_desc}> +// CHECK: #[[$ATTR_36:.+]] = #llvm.tbaa_type_desc}> +// CHECK: #[[$ATTR_37:.+]] = #llvm.tbaa_type_desc}> +// CHECK: #[[CALLERTARGETDATA:.+]] = #llvm.tbaa_type_desc}> +// CHECK: #[[CALLEETARGETDATA:.+]] = #llvm.tbaa_type_desc}> +// CHECK: #[[$ATTR_40:.+]] = #llvm.tbaa_type_desc}> +// CHECK: #[[$ATTR_38:.+]] = #llvm.tbaa_type_desc}> +// CHECK: #[[$ATTR_39:.+]] = #llvm.tbaa_type_desc}> +// CHECK: #[[$ATTR_45:.+]] = #llvm.tbaa_type_desc}> +// CHECK: #[[$ATTR_50:.+]] = #llvm.tbaa_tag // CHECK: #[[$ATTR_41:.+]] = #llvm.tbaa_type_desc}> // CHECK: #[[$ATTR_42:.+]] = #llvm.tbaa_type_desc}> // CHECK: #[[$ATTR_43:.+]] = #llvm.tbaa_type_desc}> // CHECK: #[[$ATTR_44:.+]] = 
#llvm.tbaa_type_desc}> -// CHECK: #[[$ATTR_45:.+]] = #llvm.tbaa_type_desc}> // CHECK: #[[$ATTR_46:.+]] = #llvm.tbaa_tag // CHECK: #[[$ATTR_47:.+]] = #llvm.tbaa_tag // CHECK: #[[$ATTR_48:.+]] = #llvm.tbaa_tag // CHECK: #[[$ATTR_49:.+]] = #llvm.tbaa_tag -// CHECK: #[[$ATTR_50:.+]] = #llvm.tbaa_tag // CHECK: func.func @_QMtestPcaller( // CHECK-SAME: %[[VAL_0:.*]]: !fir.ref {fir.bindc_name = "z"}) { // CHECK: %[[VAL_1:.*]] = fir.dummy_scope : !fir.dscope diff --git a/flang/test/Transforms/tbaa-with-dummy-scope2.fir b/flang/test/Transforms/tbaa-with-dummy-scope2.fir index 6f5ed69fbc9c6..54902ca7d41e1 100644 --- a/flang/test/Transforms/tbaa-with-dummy-scope2.fir +++ b/flang/test/Transforms/tbaa-with-dummy-scope2.fir @@ -44,15 +44,16 @@ func.func @_QPtest1() attributes {noinline} { } // CHECK: #[[$ATTR_0:.+]] = #llvm.tbaa_root // CHECK: #[[$ATTR_1:.+]] = #llvm.tbaa_type_desc}> -// CHECK: #[[$ANYDATA:.+]] = #llvm.tbaa_type_desc}> -// CHECK: #[[$LOCAL_ATTR_0:.+]] = #llvm.tbaa_type_desc}> -// CHECK: #[[$ATTR_3:.+]] = #llvm.tbaa_type_desc}> -// CHECK: #[[$ATTR_4:.+]] = #llvm.tbaa_type_desc}> -// CHECK: #[[$LOCAL_ATTR_1:.+]] = #llvm.tbaa_type_desc}> +// CHECK: #[[$ATTR_2:.+]] = #llvm.tbaa_type_desc}> +// CHECK: #[[$TARGETDATA:.+]] = #llvm.tbaa_type_desc}> +// CHECK: #[[$ATTR_3:.+]] = #llvm.tbaa_type_desc}> +// CHECK: #[[$LOCAL_ATTR_0:.+]] = #llvm.tbaa_type_desc}> // CHECK: #[[$ATTR_5:.+]] = #llvm.tbaa_type_desc}> +// CHECK: #[[$ATTR_4:.+]] = #llvm.tbaa_type_desc}> +// CHECK: #[[$ATTR_7:.+]] = #llvm.tbaa_tag +// CHECK: #[[$LOCAL_ATTR_1:.+]] = #llvm.tbaa_type_desc}> // CHECK: #[[$ATTR_6:.+]] = #llvm.tbaa_type_desc}> // CHECK: #[[$LOCAL_ATTR_2:.+]] = #llvm.tbaa_tag -// CHECK: #[[$ATTR_7:.+]] = #llvm.tbaa_tag // CHECK: #[[$ATTR_8:.+]] = #llvm.tbaa_tag // CHECK-LABEL: func.func @_QPtest1() attributes {noinline} { // CHECK: %[[VAL_2:.*]] = fir.alloca i32 {bindc_name = "y", uniq_name = "_QFtest1FinnerEy"} @@ -89,18 +90,19 @@ func.func @_QPtest2() attributes {noinline} { } // CHECK: 
#[[$ATTR_0:.+]] = #llvm.tbaa_root // CHECK: #[[$ATTR_1:.+]] = #llvm.tbaa_root -// CHECK: #[[$ANY_ACCESS_0:.+]] = #llvm.tbaa_type_desc}> -// CHECK: #[[$ANY_ACCESS_1:.+]] = #llvm.tbaa_type_desc}> -// CHECK: #[[$ANY_DATA_0:.+]] = #llvm.tbaa_type_desc}> -// CHECK: #[[$ANY_DATA_1:.+]] = #llvm.tbaa_type_desc}> -// CHECK: #[[$LOCAL_ATTR_0:.+]] = #llvm.tbaa_type_desc}> -// CHECK: #[[$ATTR_6:.+]] = #llvm.tbaa_type_desc}> -// CHECK: #[[$ATTR_7:.+]] = #llvm.tbaa_type_desc}> -// CHECK: #[[$LOCAL_ATTR_1:.+]] = #llvm.tbaa_type_desc}> +// CHECK: #[[$ATTR_2:.+]] = #llvm.tbaa_type_desc}> +// CHECK: #[[$ATTR_3:.+]] = #llvm.tbaa_type_desc}> +// CHECK: #[[$ATTR_4:.+]] = #llvm.tbaa_type_desc}> +// CHECK: #[[$ATTR_5:.+]] = #llvm.tbaa_type_desc}> +// CHECK: #[[$TARGETDATA_0:.+]] = #llvm.tbaa_type_desc}> +// CHECK: #[[$ATTR_6:.+]] = #llvm.tbaa_type_desc}> +// CHECK: #[[$LOCAL_ATTR_0:.+]] = #llvm.tbaa_type_desc}> // CHECK: #[[$ATTR_8:.+]] = #llvm.tbaa_type_desc}> +// CHECK: #[[$ATTR_7:.+]] = #llvm.tbaa_type_desc}> +// CHECK: #[[$ATTR_10:.+]] = #llvm.tbaa_tag +// CHECK: #[[$LOCAL_ATTR_1:.+]] = #llvm.tbaa_type_desc}> // CHECK: #[[$ATTR_9:.+]] = #llvm.tbaa_type_desc}> // CHECK: #[[$LOCAL_ATTR_2:.+]] = #llvm.tbaa_tag -// CHECK: #[[$ATTR_10:.+]] = #llvm.tbaa_tag // CHECK: #[[$ATTR_11:.+]] = #llvm.tbaa_tag // CHECK-LABEL: func.func @_QPtest2() attributes {noinline} { // CHECK: %[[VAL_2:.*]] = fir.alloca i32 {bindc_name = "y", uniq_name = "_QFtest2FinnerEy"} diff --git a/flang/test/Transforms/tbaa2.fir b/flang/test/Transforms/tbaa2.fir index 9b5307ba69d17..a594e6b32fdac 100644 --- a/flang/test/Transforms/tbaa2.fir +++ b/flang/test/Transforms/tbaa2.fir @@ -48,10 +48,18 @@ module attributes {dlti.dl_spec = #dlti.dl_spec : vector<4 // CHECK: #[[ROOT:.+]] = #llvm.tbaa_root // CHECK: #[[ANY_ACCESS:.+]] = #llvm.tbaa_type_desc}> // CHECK: #[[ANY_DATA:.+]] = #llvm.tbaa_type_desc}> -// CHECK: #[[ANY_GLBL:.+]] = #llvm.tbaa_type_desc}> -// CHECK: #[[ANY_LOCAL:.+]] = #llvm.tbaa_type_desc}> +// CHECK: 
#[[TARGETDATA:.+]] = #llvm.tbaa_type_desc}> // CHECK: #[[ANY_ARG:.+]] = #llvm.tbaa_type_desc}> -// CHECK: #[[ANY_DIRECT:.+]] = #llvm.tbaa_type_desc}> +// CHECK: #[[ANY_GLBL:.+]] = #llvm.tbaa_type_desc}> +// CHECK: #[[ANY_LOCAL:.+]] = #llvm.tbaa_type_desc}> +// CHECK: #[[ARG_LOW:.+]] = #llvm.tbaa_type_desc}> +// CHECK: #[[ANY_DIRECT:.+]] = #llvm.tbaa_type_desc}> +// CHECK: #[[ARG_Z:.+]] = #llvm.tbaa_type_desc}> +// CHECK: #[[ARG_Y:.+]] = #llvm.tbaa_type_desc}> + +// CHECK: #[[ARG_LOW_TAG:.+]] = #llvm.tbaa_tag +// CHECK: #[[ARG_Z_TAG:.+]] = #llvm.tbaa_tag +// CHECK: #[[ARG_Y_TAG:.+]] = #llvm.tbaa_tag // CHECK: #[[GLBL_ZSTART:.+]] = #llvm.tbaa_type_desc}> // CHECK: #[[GLBL_ZSTOP:.+]] = #llvm.tbaa_type_desc}> @@ -61,13 +69,10 @@ module attributes {dlti.dl_spec = #dlti.dl_spec : vector<4 // CHECK: #[[LOCAL2_ALLOC:.+]] = #llvm.tbaa_type_desc}> // CHECK: #[[GLBL_XSTART:.+]] = #llvm.tbaa_type_desc}> // CHECK: #[[LOCAL3_ALLOC:.+]] = #llvm.tbaa_type_desc}> -// CHECK: #[[ARG_LOW:.+]] = #llvm.tbaa_type_desc}> // CHECK: #[[LOCAL4_ALLOC:.+]] = #llvm.tbaa_type_desc}> // CHECK: #[[DIRECT_A:.+]] = #llvm.tbaa_type_desc}> // CHECK: #[[DIRECT_B:.+]] = #llvm.tbaa_type_desc}> -// CHECK: #[[ARG_Z:.+]] = #llvm.tbaa_type_desc}> // CHECK: #[[GLBL_DYINV:.+]] = #llvm.tbaa_type_desc}> -// CHECK: #[[ARG_Y:.+]] = #llvm.tbaa_type_desc}> // CHECK: #[[LOCAL5_ALLOC:.+]] = #llvm.tbaa_type_desc}> // CHECK: #[[GLBL_ZSTART_TAG:.+]] = #llvm.tbaa_tag @@ -78,13 +83,10 @@ module attributes {dlti.dl_spec = #dlti.dl_spec : vector<4 // CHECK: #[[LOCAL2_ALLOC_TAG:.+]] = #llvm.tbaa_tag // CHECK: #[[GLBL_XSTART_TAG:.+]] = #llvm.tbaa_tag // CHECK: #[[LOCAL3_ALLOC_TAG:.+]] = #llvm.tbaa_tag -// CHECK: #[[ARG_LOW_TAG:.+]] = #llvm.tbaa_tag // CHECK: #[[LOCAL4_ALLOC_TAG:.+]] = #llvm.tbaa_tag // CHECK: #[[DIRECT_A_TAG:.+]] = #llvm.tbaa_tag // CHECK: #[[DIRECT_B_TAG:.+]] = #llvm.tbaa_tag -// CHECK: #[[ARG_Z_TAG:.+]] = #llvm.tbaa_tag // CHECK: #[[GLBL_DYINV_TAG:.+]] = #llvm.tbaa_tag -// CHECK: #[[ARG_Y_TAG:.+]] = 
#llvm.tbaa_tag // CHECK: #[[LOCAL5_ALLOC_TAG:.+]] = #llvm.tbaa_tag func.func @_QMmodPcallee(%arg0: !fir.box> {fir.bindc_name = "z"}, %arg1: !fir.box> {fir.bindc_name = "y"}, %arg2: !fir.ref>>> {fir.bindc_name = "low"}) { diff --git a/flang/test/Transforms/tbaa3.fir b/flang/test/Transforms/tbaa3.fir index 7a9a819ea102a..abcb7e000bac1 100644 --- a/flang/test/Transforms/tbaa3.fir +++ b/flang/test/Transforms/tbaa3.fir @@ -1,4 +1,5 @@ -// RUN: fir-opt --fir-add-alias-tags %s | FileCheck --check-prefixes=ALL %s +// RUN: fir-opt --fir-add-alias-tags %s | FileCheck --check-prefixes=ALL,DEFAULT %s +// RUN: fir-opt --fir-add-alias-tags --local-alloc-tbaa %s | FileCheck --check-prefixes=ALL,LOCAL %s // Test AddAliasTagsPass creating sub-tree for TARGET/POINTER variables. @@ -55,57 +56,56 @@ // | |- "dummy arg data/_QFtest1Edummyas" // | |- "dummy arg data/_QFtest1Edummya" // | -// |- "target data" <--- all pointers and target dummy arguments go here -// | |- "target data/_QMdataEglobt" -// | |- "target data/_QMdataEglobat" -// | |- "target data/_QFtest1Elocalt" -// | |- "target data/_QFtest1Elocalat" -// | -// |- "global data" -// | | -// | |- "global data/_QMdataEglob" -// | -// |- "direct data" -// | | -// | |- "direct data/_QMdataEgloba" -// | -// |- "allocated data" +// |- "target data" <- all pointers and taget dummys // | -// |- "allocated data/_QFtest1Elocal" -// |- "allocated data/_QFtest1Elocala" +// |- "global data" +// | | +// | |- "global data/_QMdataEglob" +// | |- "global data/_QMdataEglobt" +// | +// |- "direct data" +// | | +// | |- "direct data/_QMdataEgloba" +// | |- "direct data/_QMdataEglobat" +// | +// |- "allocated data" +// | +// |- "allocated data/_QFtest1Elocal" +// |- "allocated data/_QFtest1Elocalt" +// |- "allocated data/_QFtest1Elocala" +// |- "allocated data/_QFtest1Elocalat" // ALL: #[[FUNCROOT:.+]] = #llvm.tbaa_root // ALL: #[[ANYACCESS:.+]] = #llvm.tbaa_type_desc}> // ALL: #[[ANYDATA:.+]] = #llvm.tbaa_type_desc}> -// ALL: #[[GLOBALDATA:.+]] = 
#llvm.tbaa_type_desc}> // ALL: #[[TARGETDATA:.+]] = #llvm.tbaa_type_desc}> -// ALL: #[[DIRECTDATA:.+]] = #llvm.tbaa_type_desc}> // ALL: #[[DUMMYDATA:.+]] = #llvm.tbaa_type_desc}> -// ALL: #[[LOCALDATA:.+]] = #llvm.tbaa_type_desc}> // ALL: #[[TARGETTAG:.+]] = #llvm.tbaa_tag -// ALL: #[[GLOBVAR:.+]] = #llvm.tbaa_type_desc}> -// ALL: #[[GLOBTVAR:.+]] = #llvm.tbaa_type_desc}> -// ALL: #[[GLOBAVAR:.+]] = #llvm.tbaa_type_desc}> -// ALL: #[[GLOBATVAR:.+]] = #llvm.tbaa_type_desc}> +// ALL: #[[GLOBALDATA:.+]] = #llvm.tbaa_type_desc}> +// ALL: #[[DIRECTDATA:.+]] = #llvm.tbaa_type_desc}> // ALL: #[[DUMMYFVAR:.+]] = #llvm.tbaa_type_desc}> // ALL: #[[DUMMYASVAR:.+]] = #llvm.tbaa_type_desc}> // ALL: #[[DUMMYAVAR:.+]] = #llvm.tbaa_type_desc}> -// ALL: #[[LOCALVAR:.+]] = #llvm.tbaa_type_desc}> -// ALL: #[[LOCALTVAR:.+]] = #llvm.tbaa_type_desc}> -// ALL: #[[LOCALAVAR:.+]] = #llvm.tbaa_type_desc}> -// ALL: #[[LOCALATVAR:.+]] = #llvm.tbaa_type_desc}> - +// LOCAL: #[[LOCALDATA:.+]] = #llvm.tbaa_type_desc}> +// ALL: #[[DUMMYFTAG:.+]] = #llvm.tbaa_tag +// ALL: #[[DUMMYASTAG:.+]] = #llvm.tbaa_tag +// ALL: #[[DUMMYATAG:.+]] = #llvm.tbaa_tag +// ALL: #[[GLOBVAR:.+]] = #llvm.tbaa_type_desc}> +// ALL: #[[GLOBTVAR:.+]] = #llvm.tbaa_type_desc}> +// ALL: #[[GLOBAVAR:.+]] = #llvm.tbaa_type_desc}> +// ALL: #[[GLOBATVAR:.+]] = #llvm.tbaa_type_desc}> +// LOCAL: #[[LOCALVAR:.+]] = #llvm.tbaa_type_desc}> +// LOCAL: #[[LOCALTVAR:.+]] = #llvm.tbaa_type_desc}> +// LOCAL: #[[LOCALAVAR:.+]] = #llvm.tbaa_type_desc}> +// LOCAL: #[[LOCALATVAR:.+]] = #llvm.tbaa_type_desc}> // ALL: #[[GLOBTAG:.+]] = #llvm.tbaa_tag // ALL: #[[GLOBTTAG:.+]] = #llvm.tbaa_tag // ALL: #[[GLOBATAG:.+]] = #llvm.tbaa_tag // ALL: #[[GLOBATTAG:.+]] = #llvm.tbaa_tag -// ALL: #[[DUMMYFTAG:.+]] = #llvm.tbaa_tag -// ALL: #[[DUMMYASTAG:.+]] = #llvm.tbaa_tag -// ALL: #[[DUMMYATAG:.+]] = #llvm.tbaa_tag -// ALL: #[[LOCALTAG:.+]] = #llvm.tbaa_tag -// ALL: #[[LOCALTTAG:.+]] = #llvm.tbaa_tag -// ALL: #[[LOCALATAG:.+]] = #llvm.tbaa_tag -// ALL: 
#[[LOCALATTAG:.+]] = #llvm.tbaa_tag +// LOCAL: #[[LOCALTAG:.+]] = #llvm.tbaa_tag +// LOCAL: #[[LOCALTTAG:.+]] = #llvm.tbaa_tag +// LOCAL: #[[LOCALATAG:.+]] = #llvm.tbaa_tag +// LOCAL: #[[LOCALATTAG:.+]] = #llvm.tbaa_tag module attributes {dlti.dl_spec = #dlti.dl_spec : vector<4xi64>, i1 = dense<8> : vector<2xi64>, i8 = dense<8> : vector<2xi64>, i16 = dense<16> : vector<2xi64>, i32 = dense<32> : vector<2xi64>, i64 = dense<[32, 64]> : vector<2xi64>, f16 = dense<16> : vector<2xi64>, f64 = dense<64> : vector<2xi64>, f128 = dense<128> : vector<2xi64>, "dlti.endianness" = "little">, llvm.data_layout = ""} { fir.global @_QMdataEglob : !fir.array<10xf32> { @@ -263,11 +263,13 @@ module attributes {dlti.dl_spec = #dlti.dl_spec : vector<4 fir.store %cst to %67 : !fir.ref %68 = fir.array_coor %20(%5) %c1 : (!fir.ref>, !fir.shape<1>, index) -> !fir.ref // real :: local(10) -// ALL: fir.store{{.*}}{tbaa = [#[[LOCALTAG]]]} : !fir.ref +// DEFAULT: fir.store{{.*}}tbaa +// LOCAL: fir.store{{.*}}{tbaa = [#[[LOCALTAG]]]} : !fir.ref fir.store %cst to %68 : !fir.ref %69 = fir.array_coor %33(%5) %c1 : (!fir.ref>, !fir.shape<1>, index) -> !fir.ref // real, target :: localt(10) -// ALL: fir.store{{.*}}{tbaa = [#[[LOCALTTAG]]]} : !fir.ref +// DEFAULT: fir.store{{.*}}tbaa +// LOCAL: fir.store{{.*}}{tbaa = [#[[LOCALTTAG]]]} : !fir.ref fir.store %cst to %69 : !fir.ref // ALL-NOT: fir.load{{.*}}tbaa %70 = fir.load %25 : !fir.ref>>> @@ -276,7 +278,8 @@ module attributes {dlti.dl_spec = #dlti.dl_spec : vector<4 %73 = fir.shape_shift %72#0, %72#1 : (index, index) -> !fir.shapeshift<1> %74 = fir.array_coor %71(%73) %c1 : (!fir.heap>, !fir.shapeshift<1>, index) -> !fir.ref // real, allocatable :: locala(:) -// ALL: fir.store{{.*}}{tbaa = [#[[LOCALATAG]]]} : !fir.ref +// DEFAULT: fir.store{{.*}}tbaa +// LOCAL: fir.store{{.*}}{tbaa = [#[[LOCALATAG]]]} : !fir.ref fir.store %cst to %74 : !fir.ref // ALL-NOT: fir.load{{.*}}tbaa %75 = fir.load %27 : !fir.ref>>> @@ -285,7 +288,8 @@ module attributes 
{dlti.dl_spec = #dlti.dl_spec : vector<4 %78 = fir.shape_shift %77#0, %77#1 : (index, index) -> !fir.shapeshift<1> %79 = fir.array_coor %76(%78) %c1 : (!fir.heap>, !fir.shapeshift<1>, index) -> !fir.ref // real, allocatable, target :: localat(:) -// ALL: fir.store{{.*}}{tbaa = [#[[LOCALATTAG]]]} : !fir.ref +// DEFAULT: fir.store{{.*}}tbaa +// LOCAL: fir.store{{.*}}{tbaa = [#[[LOCALATTAG]]]} : !fir.ref fir.store %cst to %79 : !fir.ref // ALL-NOT: fir.load{{.*}}tbaa %80 = fir.load %31 : !fir.ref>>> @@ -293,7 +297,8 @@ module attributes {dlti.dl_spec = #dlti.dl_spec : vector<4 %82 = fir.shift %81#0 : (index) -> !fir.shift<1> %83 = fir.array_coor %80(%82) %c1 : (!fir.box>>, !fir.shift<1>, index) -> !fir.ref // real, pointer :: localp(:) -// ALL: fir.store{{.*}}{tbaa = [#[[TARGETTAG]]]} : !fir.ref +// DEFAULT: fir.store{{.*}}tbaa +// LOCAL: fir.store{{.*}}{tbaa = [#[[TARGETTAG]]]} : !fir.ref fir.store %cst to %83 : !fir.ref // ALL-NOT: fir.load{{.*}}tbaa %84 = fir.load %27 : !fir.ref>>> diff --git a/flang/test/Transforms/tbaa4.fir b/flang/test/Transforms/tbaa4.fir index 5e29014af8935..c368a3d06c2ba 100644 --- a/flang/test/Transforms/tbaa4.fir +++ b/flang/test/Transforms/tbaa4.fir @@ -1,10 +1,12 @@ // Test TBAA tags for common and equivalence. 
-// RUN: fir-opt --fir-add-alias-tags --split-input-file %s | FileCheck --check-prefixes=ALL %s +// RUN: fir-opt --fir-add-alias-tags --split-input-file %s | FileCheck --check-prefixes=ALL,DEFAULT %s +// RUN: fir-opt --fir-add-alias-tags --local-alloc-tbaa --split-input-file %s | FileCheck --check-prefixes=ALL,LOCAL %s // ALL: #[[ROOT:.+]] = #llvm.tbaa_root // ALL: #[[ANY:.+]] = #llvm.tbaa_type_desc}> // ALL: #[[ANYDATA:.+]] = #llvm.tbaa_type_desc}> -// ALL: #[[GLOBALDATA:.+]] = #llvm.tbaa_type_desc}> +// ALL: #[[TARGETDATA:.+]] = #llvm.tbaa_type_desc}> +// ALL: #[[GLOBALDATA:.+]] = #llvm.tbaa_type_desc}> // ALL: #[[BLK:.+]] = #llvm.tbaa_type_desc}> // ALL: #[[BLK_A:.+]] = #llvm.tbaa_type_desc}> // ALL: #[[BLK_C:.+]] = #llvm.tbaa_type_desc}> @@ -52,17 +54,19 @@ module attributes {dlti.dl_spec = #dlti.dl_spec : vector<4 // ----- -// ALL: #[[ROOT:.+]] = #llvm.tbaa_root -// ALL: #[[ANY:.+]] = #llvm.tbaa_type_desc}> -// ALL: #[[ANYDATA:.+]] = #llvm.tbaa_type_desc}> -// ALL: #[[ALLOCATEDDATA:.+]] = #llvm.tbaa_type_desc}> -// ALL: #[[EQUIV:.+]] = #llvm.tbaa_type_desc}> -// ALL: #[[TAG:.+]] = #llvm.tbaa_tag +// LOCAL: #[[ROOT:.+]] = #llvm.tbaa_root +// LOCAL: #[[ANY:.+]] = #llvm.tbaa_type_desc}> +// LOCAL: #[[ANYDATA:.+]] = #llvm.tbaa_type_desc}> +// LOCAL: #[[TARGETDATA:.+]] = #llvm.tbaa_type_desc}> +// LOCAL: #[[ALLOCATEDDATA:.+]] = #llvm.tbaa_type_desc}> +// LOCAL: #[[EQUIV:.+]] = #llvm.tbaa_type_desc}> +// LOCAL: #[[TAG:.+]] = #llvm.tbaa_tag // ALL-LABEL: func.func @_QPtest_local_equiv() { -// ALL: fir.store{{.*}}{tbaa = [#[[TAG]]]} : !fir.ptr -// ALL: fir.store{{.*}}{tbaa = [#[[TAG]]]} : !fir.ref -// ALL: fir.store{{.*}}{tbaa = [#[[TAG]]]} : !fir.ptr +// LOCAL: fir.store{{.*}}{tbaa = [#[[TAG]]]} : !fir.ptr +// LOCAL: fir.store{{.*}}{tbaa = [#[[TAG]]]} : !fir.ref +// LOCAL: fir.store{{.*}}{tbaa = [#[[TAG]]]} : !fir.ptr +// DEFAULT-NOT: fir.store{{.}}tbaa module attributes {dlti.dl_spec = #dlti.dl_spec : vector<4xi64>, i1 = dense<8> : vector<2xi64>, i8 = dense<8> : 
vector<2xi64>, i16 = dense<16> : vector<2xi64>, i32 = dense<32> : vector<2xi64>, i64 = dense<[32, 64]> : vector<2xi64>, f16 = dense<16> : vector<2xi64>, f64 = dense<64> : vector<2xi64>, f128 = dense<128> : vector<2xi64>, "dlti.endianness" = "little">, llvm.data_layout = ""} { func.func @_QPtest_local_equiv() { %c1 = arith.constant 1 : index @@ -94,7 +98,8 @@ func.func @_QPtest_local_equiv() { // ALL: #[[ROOT:.+]] = #llvm.tbaa_root // ALL: #[[ANY:.+]] = #llvm.tbaa_type_desc}> // ALL: #[[ANYDATA:.+]] = #llvm.tbaa_type_desc}> -// ALL: #[[GLOBALDATA:.+]] = #llvm.tbaa_type_desc}> +// ALL: #[[TARGETDATA:.+]] = #llvm.tbaa_type_desc}> +// ALL: #[[GLOBALDATA:.+]] = #llvm.tbaa_type_desc}> // ALL: #[[EQUIV:.+]] = #llvm.tbaa_type_desc}> // ALL: #[[TAG:.+]] = #llvm.tbaa_tag module attributes {dlti.dl_spec = #dlti.dl_spec : vector<4xi64>, i1 = dense<8> : vector<2xi64>, i8 = dense<8> : vector<2xi64>, i16 = dense<16> : vector<2xi64>, i32 = dense<32> : vector<2xi64>, i64 = dense<[32, 64]> : vector<2xi64>, f16 = dense<16> : vector<2xi64>, f64 = dense<64> : vector<2xi64>, f128 = dense<128> : vector<2xi64>, "dlti.endianness" = "little">, llvm.data_layout = ""} { @@ -138,7 +143,8 @@ module attributes {dlti.dl_spec = #dlti.dl_spec : vector<4 // ALL: #[[ROOT:.+]] = #llvm.tbaa_root // ALL: #[[ANY:.+]] = #llvm.tbaa_type_desc}> // ALL: #[[ANYDATA:.+]] = #llvm.tbaa_type_desc}> -// ALL: #[[GLOBALDATA:.+]] = #llvm.tbaa_type_desc}> +// ALL: #[[TARGETDATA:.+]] = #llvm.tbaa_type_desc}> +// ALL: #[[GLOBALDATA:.+]] = #llvm.tbaa_type_desc}> // ALL: #[[EQUIV:.+]] = #llvm.tbaa_type_desc}> // ALL: #[[TAG:.+]] = #llvm.tbaa_tag From bf22687c4842fe4f78cee34ec4e5e2d3e6e1fb59 Mon Sep 17 00:00:00 2001 From: Tom Eccles Date: Mon, 1 Dec 2025 11:23:14 +0000 Subject: [PATCH 29/39] [OMPIRBuilder] CANCEL IF(FALSE) is still a cancellation point (#170095) From OpenMP 4.0: > When an if clause is present on a cancel construct and the if expression > evaluates to false, the cancel construct does not activate 
cancellation. > The cancellation point associated with the cancel construct is always > encountered regardless of the value of the if expression. This wording is retained unmodified in OpenMP 6.0. This re-opens the already approved PR #164587, which was closed by accident. The only changes are a rebase. --- clang/test/OpenMP/cancel_codegen.cpp | 103 ++++++++++-------- llvm/lib/Frontend/OpenMP/OMPIRBuilder.cpp | 14 ++- .../Frontend/OpenMPIRBuilderTest.cpp | 4 +- mlir/test/Target/LLVMIR/openmp-cancel.mlir | 69 ++++++++---- 4 files changed, 116 insertions(+), 74 deletions(-) diff --git a/clang/test/OpenMP/cancel_codegen.cpp b/clang/test/OpenMP/cancel_codegen.cpp index 6090a91b6a3d9..600aae211087a 100644 --- a/clang/test/OpenMP/cancel_codegen.cpp +++ b/clang/test/OpenMP/cancel_codegen.cpp @@ -774,8 +774,8 @@ for (int i = 0; i < argc; ++i) { // CHECK3-NEXT: call void @__kmpc_barrier(ptr @[[GLOB2:[0-9]+]], i32 [[OMP_GLOBAL_THREAD_NUM12]]) // CHECK3-NEXT: br label [[OMP_SECTION_LOOP_AFTER:%.*]] // CHECK3: omp_section_loop.after: -// CHECK3-NEXT: br label [[OMP_SECTION_LOOP_PREHEADER13:%.*]] -// CHECK3: omp_section_loop.preheader13: +// CHECK3-NEXT: br label [[OMP_SECTION_LOOP_PREHEADER16:%.*]] +// CHECK3: omp_section_loop.preheader16: // CHECK3-NEXT: store i32 0, ptr [[P_LOWERBOUND29]], align 4 // CHECK3-NEXT: store i32 1, ptr [[P_UPPERBOUND30]], align 4 // CHECK3-NEXT: store i32 1, ptr [[P_STRIDE31]], align 4 @@ -785,52 +785,52 @@ for (int i = 0; i < argc; ++i) { // CHECK3-NEXT: [[TMP10:%.*]] = load i32, ptr [[P_UPPERBOUND30]], align 4 // CHECK3-NEXT: [[TMP11:%.*]] = sub i32 [[TMP10]], [[TMP9]] // CHECK3-NEXT: [[TMP12:%.*]] = add i32 [[TMP11]], 1 -// CHECK3-NEXT: br label [[OMP_SECTION_LOOP_HEADER14:%.*]] -// CHECK3: omp_section_loop.header14: -// CHECK3-NEXT: [[OMP_SECTION_LOOP_IV20:%.*]] = phi i32 [ 0, [[OMP_SECTION_LOOP_PREHEADER13]] ], [ [[OMP_SECTION_LOOP_NEXT22:%.*]], [[OMP_SECTION_LOOP_INC17:%.*]] ] -// CHECK3-NEXT: br label [[OMP_SECTION_LOOP_COND15:%.*]] -// 
CHECK3: omp_section_loop.cond15: +// CHECK3-NEXT: br label [[OMP_SECTION_LOOP_HEADER17:%.*]] +// CHECK3: omp_section_loop.header17: +// CHECK3-NEXT: [[OMP_SECTION_LOOP_IV20:%.*]] = phi i32 [ 0, [[OMP_SECTION_LOOP_PREHEADER16]] ], [ [[OMP_SECTION_LOOP_NEXT22:%.*]], [[OMP_SECTION_LOOP_INC17:%.*]] ] +// CHECK3-NEXT: br label [[OMP_SECTION_LOOP_COND18:%.*]] +// CHECK3: omp_section_loop.cond18: // CHECK3-NEXT: [[OMP_SECTION_LOOP_CMP21:%.*]] = icmp ult i32 [[OMP_SECTION_LOOP_IV20]], [[TMP12]] -// CHECK3-NEXT: br i1 [[OMP_SECTION_LOOP_CMP21]], label [[OMP_SECTION_LOOP_BODY16:%.*]], label [[OMP_SECTION_LOOP_EXIT18:%.*]] -// CHECK3: omp_section_loop.body16: +// CHECK3-NEXT: br i1 [[OMP_SECTION_LOOP_CMP21]], label [[OMP_SECTION_LOOP_BODY19:%.*]], label [[OMP_SECTION_LOOP_EXIT21:%.*]] +// CHECK3: omp_section_loop.body19: // CHECK3-NEXT: [[TMP13:%.*]] = add i32 [[OMP_SECTION_LOOP_IV20]], [[TMP9]] // CHECK3-NEXT: [[TMP14:%.*]] = mul i32 [[TMP13]], 1 // CHECK3-NEXT: [[TMP15:%.*]] = add i32 [[TMP14]], 0 // CHECK3-NEXT: switch i32 [[TMP15]], label [[OMP_SECTION_LOOP_BODY16_SECTIONS_AFTER:%.*]] [ -// CHECK3-NEXT: i32 0, label [[OMP_SECTION_LOOP_BODY_CASE23:%.*]] -// CHECK3-NEXT: i32 1, label [[OMP_SECTION_LOOP_BODY_CASE25:%.*]] +// CHECK3-NEXT: i32 0, label [[OMP_SECTION_LOOP_BODY_CASE26:%.*]] +// CHECK3-NEXT: i32 1, label [[OMP_SECTION_LOOP_BODY_CASE29:%.*]] // CHECK3-NEXT: ] -// CHECK3: omp_section_loop.body.case23: +// CHECK3: omp_section_loop.body.case26: // CHECK3-NEXT: [[OMP_GLOBAL_THREAD_NUM24:%.*]] = call i32 @__kmpc_global_thread_num(ptr @[[GLOB1]]) // CHECK3-NEXT: [[TMP16:%.*]] = call i32 @__kmpc_cancel(ptr @[[GLOB1]], i32 [[OMP_GLOBAL_THREAD_NUM24]], i32 3) // CHECK3-NEXT: [[TMP17:%.*]] = icmp eq i32 [[TMP16]], 0 -// CHECK3-NEXT: br i1 [[TMP17]], label [[OMP_SECTION_LOOP_BODY_CASE23_SPLIT:%.*]], label [[OMP_SECTION_LOOP_BODY_CASE23_CNCL:%.*]] -// CHECK3: omp_section_loop.body.case23.split: -// CHECK3-NEXT: br label [[OMP_SECTION_LOOP_BODY_CASE23_SECTION_AFTER:%.*]] -// 
CHECK3: omp_section_loop.body.case23.section.after: +// CHECK3-NEXT: br i1 [[TMP17]], label [[OMP_SECTION_LOOP_BODY_CASE26_SPLIT:%.*]], label [[OMP_SECTION_LOOP_BODY_CASE26_CNCL:%.*]] +// CHECK3: omp_section_loop.body.case26.split: +// CHECK3-NEXT: br label [[OMP_SECTION_LOOP_BODY_CASE26_SECTION_AFTER:%.*]] +// CHECK3: omp_section_loop.body.case26.section.after: // CHECK3-NEXT: br label [[OMP_SECTION_LOOP_BODY16_SECTIONS_AFTER]] -// CHECK3: omp_section_loop.body.case26: +// CHECK3: omp_section_loop.body.case29: // CHECK3-NEXT: [[OMP_GLOBAL_THREAD_NUM27:%.*]] = call i32 @__kmpc_global_thread_num(ptr @[[GLOB1]]) // CHECK3-NEXT: [[TMP18:%.*]] = call i32 @__kmpc_cancel(ptr @[[GLOB1]], i32 [[OMP_GLOBAL_THREAD_NUM27]], i32 3) // CHECK3-NEXT: [[TMP19:%.*]] = icmp eq i32 [[TMP18]], 0 -// CHECK3-NEXT: br i1 [[TMP19]], label [[OMP_SECTION_LOOP_BODY_CASE25_SPLIT:%.*]], label [[OMP_SECTION_LOOP_BODY_CASE25_CNCL:%.*]] -// CHECK3: omp_section_loop.body.case26.split: -// CHECK3-NEXT: br label [[OMP_SECTION_LOOP_BODY_CASE25_SECTION_AFTER26:%.*]] -// CHECK3: omp_section_loop.body.case26.section.after27: -// CHECK3-NEXT: br label [[OMP_SECTION_LOOP_BODY_CASE25_SECTION_AFTER:%.*]] -// CHECK3: omp_section_loop.body.case26.section.after: -// CHECK3-NEXT: br label [[OMP_SECTION_LOOP_BODY16_SECTIONS_AFTER]] -// CHECK3: omp_section_loop.body16.sections.after: -// CHECK3-NEXT: br label [[OMP_SECTION_LOOP_INC17]] -// CHECK3: omp_section_loop.inc17: +// CHECK3-NEXT: br i1 [[TMP19]], label [[OMP_SECTION_LOOP_BODY_CASE29_SPLIT:%.*]], label [[OMP_SECTION_LOOP_BODY_CASE29_CNCL:%.*]] +// CHECK3: omp_section_loop.body.case29.split: +// CHECK3-NEXT: br label [[OMP_SECTION_LOOP_BODY_CASE25_SECTION_AFTER29:%.*]] +// CHECK3: omp_section_loop.body.case29.section.after30: +// CHECK3-NEXT: br label [[OMP_SECTION_LOOP_BODY_CASE29_SECTION_AFTER:%.*]] +// CHECK3: omp_section_loop.body.case29.section.after: +// CHECK3-NEXT: br label [[OMP_SECTION_LOOP_BODY19_SECTIONS_AFTER:.*]] +// CHECK3: 
omp_section_loop.body19.sections.after: +// CHECK3-NEXT: br label [[OMP_SECTION_LOOP_INC20:.*]] +// CHECK3: omp_section_loop.inc20: // CHECK3-NEXT: [[OMP_SECTION_LOOP_NEXT22]] = add nuw i32 [[OMP_SECTION_LOOP_IV20]], 1 -// CHECK3-NEXT: br label [[OMP_SECTION_LOOP_HEADER14]] -// CHECK3: omp_section_loop.exit18: +// CHECK3-NEXT: br label [[OMP_SECTION_LOOP_HEADER17]] +// CHECK3: omp_section_loop.exit21: // CHECK3-NEXT: call void @__kmpc_for_static_fini(ptr @[[GLOB1]], i32 [[OMP_GLOBAL_THREAD_NUM32]]) // CHECK3-NEXT: [[OMP_GLOBAL_THREAD_NUM33:%.*]] = call i32 @__kmpc_global_thread_num(ptr @[[GLOB1]]) // CHECK3-NEXT: call void @__kmpc_barrier(ptr @[[GLOB2]], i32 [[OMP_GLOBAL_THREAD_NUM33]]) -// CHECK3-NEXT: br label [[OMP_SECTION_LOOP_AFTER19:%.*]] -// CHECK3: omp_section_loop.after19: +// CHECK3-NEXT: br label [[OMP_SECTION_LOOP_AFTER22:%.*]] +// CHECK3: omp_section_loop.after22: // CHECK3-NEXT: [[TMP20:%.*]] = load i32, ptr [[ARGC_ADDR]], align 4 // CHECK3-NEXT: store i32 [[TMP20]], ptr [[DOTCAPTURE_EXPR_]], align 4 // CHECK3-NEXT: [[TMP21:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_]], align 4 @@ -887,11 +887,11 @@ for (int i = 0; i < argc; ++i) { // CHECK3: .cancel.exit: // CHECK3-NEXT: br label [[CANCEL_EXIT:%.*]] // CHECK3: omp_section_loop.body.case.cncl: -// CHECK3-NEXT: br label [[OMP_SECTION_LOOP_EXIT]] -// CHECK3: omp_section_loop.body.case23.cncl: -// CHECK3-NEXT: br label [[OMP_SECTION_LOOP_EXIT18]] +// CHECK3-NEXT: br label [[OMP_SECTION_LOOP_EXIT:.*]] // CHECK3: omp_section_loop.body.case26.cncl: -// CHECK3-NEXT: br label [[OMP_REGION_FINALIZE:.*]] +// CHECK3-NEXT: br label [[OMP_SECTION_LOOP_EXIT18:.*]] +// CHECK3: omp_section_loop.body.case29.cncl: +// CHECK3-NEXT: br label [[OMP_SECTION_LOOP_EXIT21:.*]] // CHECK3: .cancel.continue: // CHECK3-NEXT: br label [[OMP_IF_END:%.*]] // CHECK3: omp_if.else: @@ -950,8 +950,17 @@ for (int i = 0; i < argc; ++i) { // CHECK3-NEXT: [[TOBOOL:%.*]] = fcmp une float [[TMP2]], 0.000000e+00 // CHECK3-NEXT: br i1 [[TOBOOL]], 
label [[TMP14:%.*]], label [[TMP3:%.*]] // CHECK3: 3: -// CHECK3-NEXT: br label [[TMP4:%.*]] -// CHECK3: 4: +// CHECK3-NEXT: %[[GTN:.*]] = call i32 @__kmpc_global_thread_num(ptr @1) +// CHECK3-NEXT: %[[CANCEL_POINT:.*]] = call i32 @__kmpc_cancellationpoint(ptr @1, i32 %[[GTN]], i32 1) +// CHECK3-NEXT: %[[COND:.*]] = icmp eq i32 %[[CANCEL_POINT]], 0 +// CHECK3-NEXT: br i1 %[[COND]], label %[[SPLIT:.*]], label %[[CNCL:.*]] +// CHECK3: .cncl: +// CHECK3-NEXT: br label %[[FINI:.*]] +// CHECK3: .fini: +// CHECK3-NEXT: br label %[[EXIT_STUB:omp.par.exit.exitStub]] +// CHECK3: .split: +// CHECK3-NEXT: br label [[TMP6:%.*]] +// CHECK3: 6: // CHECK3-NEXT: [[TMP5:%.*]] = load i32, ptr [[LOADGEP_ARGC_ADDR]], align 4 // CHECK3-NEXT: [[CONV:%.*]] = trunc i32 [[TMP5]] to i8 // CHECK3-NEXT: [[TMP6:%.*]] = load ptr, ptr [[LOADGEP_ARGV_ADDR]], align 8 @@ -963,10 +972,8 @@ for (int i = 0; i < argc; ++i) { // CHECK3-NEXT: [[TMP8:%.*]] = call i32 @__kmpc_cancel_barrier(ptr @[[GLOB3:[0-9]+]], i32 [[OMP_GLOBAL_THREAD_NUM4]]) // CHECK3-NEXT: [[TMP9:%.*]] = icmp eq i32 [[TMP8]], 0 // CHECK3-NEXT: br i1 [[TMP9]], label [[DOTCONT:%.*]], label [[DOTCNCL5:%.*]] -// CHECK3: .cncl4: -// CHECK3-NEXT: br label [[FINI:%.*]] -// CHECK3: .fini -// CHECK3-NEXT: br label %[[EXIT_STUB:omp.par.exit.exitStub]] +// CHECK3: .cncl7: +// CHECK3-NEXT: br label %[[FINI]] // CHECK3: .cont: // CHECK3-NEXT: [[TMP10:%.*]] = load i32, ptr [[LOADGEP_ARGC_ADDR]], align 4 // CHECK3-NEXT: [[TMP11:%.*]] = load ptr, ptr [[LOADGEP_ARGV_ADDR]], align 8 @@ -982,16 +989,16 @@ for (int i = 0; i < argc; ++i) { // CHECK3: omp.par.region.parallel.after: // CHECK3-NEXT: br label [[OMP_PAR_PRE_FINALIZE:%.*]] // CHECK3: omp.par.pre_finalize: -// CHECK3-NEXT: br label [[FINI]] -// CHECK3: 14: +// CHECK3-NEXT: br label %[[FINI]] +// CHECK3: 16: // CHECK3-NEXT: [[OMP_GLOBAL_THREAD_NUM1:%.*]] = call i32 @__kmpc_global_thread_num(ptr @[[GLOB1]]) // CHECK3-NEXT: [[TMP15:%.*]] = call i32 @__kmpc_cancel(ptr @[[GLOB1]], i32 
[[OMP_GLOBAL_THREAD_NUM1]], i32 1) // CHECK3-NEXT: [[TMP16:%.*]] = icmp eq i32 [[TMP15]], 0 // CHECK3-NEXT: br i1 [[TMP16]], label [[DOTSPLIT:%.*]], label [[DOTCNCL:%.*]] -// CHECK3: .cncl: -// CHECK3-NEXT: br label [[FINI]] -// CHECK3: .split: -// CHECK3-NEXT: br label [[TMP4]] +// CHECK3: .cncl4: +// CHECK3-NEXT: br label %[[FINI]] +// CHECK3: .split3: +// CHECK3-NEXT: br label {{.+}} // CHECK3: omp.par.exit.exitStub: // CHECK3-NEXT: ret void // @@ -1160,7 +1167,7 @@ for (int i = 0; i < argc; ++i) { // CHECK3: .omp.sections.case2.split: // CHECK3-NEXT: br label [[DOTOMP_SECTIONS_CASE2_SECTION_AFTER:%.*]] // CHECK3: .omp.sections.case2.section.after: -// CHECK3-NEXT: br label [[OMP_REGION_FINALIZE]] +// CHECK3-NEXT: br label [[OMP_REGION_FINALIZE:.*]] // CHECK3: omp_region.finalize: // CHECK3-NEXT: br label [[OMP_SECTIONS_EXIT:.*]] // CHECK3: .omp.sections.case2.cncl: diff --git a/llvm/lib/Frontend/OpenMP/OMPIRBuilder.cpp b/llvm/lib/Frontend/OpenMP/OMPIRBuilder.cpp index 0d196be2ee696..0c0caf80d2573 100644 --- a/llvm/lib/Frontend/OpenMP/OMPIRBuilder.cpp +++ b/llvm/lib/Frontend/OpenMP/OMPIRBuilder.cpp @@ -1149,8 +1149,20 @@ OpenMPIRBuilder::createCancel(const LocationDescription &Loc, auto *UI = Builder.CreateUnreachable(); Instruction *ThenTI = UI, *ElseTI = nullptr; - if (IfCondition) + if (IfCondition) { SplitBlockAndInsertIfThenElse(IfCondition, UI, &ThenTI, &ElseTI); + + // Even if the if condition evaluates to false, this should count as a + // cancellation point + Builder.SetInsertPoint(ElseTI); + auto ElseIP = Builder.saveIP(); + + InsertPointOrErrorTy IPOrErr = createCancellationPoint( + LocationDescription{ElseIP, Loc.DL}, CanceledDirective); + if (!IPOrErr) + return IPOrErr; + } + Builder.SetInsertPoint(ThenTI); Value *CancelKind = nullptr; diff --git a/llvm/unittests/Frontend/OpenMPIRBuilderTest.cpp b/llvm/unittests/Frontend/OpenMPIRBuilderTest.cpp index dab0a46eeb3bc..4595590a083d3 100644 --- a/llvm/unittests/Frontend/OpenMPIRBuilderTest.cpp +++ 
b/llvm/unittests/Frontend/OpenMPIRBuilderTest.cpp @@ -490,8 +490,8 @@ TEST_F(OpenMPIRBuilderTest, CreateCancelIfCond) { OMPBuilder.createCancel(Loc, Builder.getTrue(), OMPD_parallel)); Builder.restoreIP(NewIP); EXPECT_FALSE(M->global_empty()); - EXPECT_EQ(M->size(), 3U); - EXPECT_EQ(F->size(), 8U); + EXPECT_EQ(M->size(), 4U); + EXPECT_EQ(F->size(), 10U); EXPECT_EQ(BB->size(), 1U); ASSERT_TRUE(isa(BB->getTerminator())); ASSERT_EQ(BB->getTerminator()->getNumSuccessors(), 2U); diff --git a/mlir/test/Target/LLVMIR/openmp-cancel.mlir b/mlir/test/Target/LLVMIR/openmp-cancel.mlir index 5e20b8793f499..a6911f80d43b7 100644 --- a/mlir/test/Target/LLVMIR/openmp-cancel.mlir +++ b/mlir/test/Target/LLVMIR/openmp-cancel.mlir @@ -60,28 +60,35 @@ llvm.func @cancel_parallel_if(%arg0 : i1) { // CHECK: omp.par.region: ; preds = %[[VAL_17]] // CHECK: br label %[[VAL_20:.*]] // CHECK: omp.par.region1: ; preds = %[[VAL_19]] -// CHECK: br i1 %[[VAL_16]], label %[[VAL_21:.*]], label %[[VAL_22:.*]] +// CHECK: br i1 %[[VAL_16]], label %[[SPLIT:.*]], label %[[VAL_22:.*]] // CHECK: 3: ; preds = %[[VAL_20]] -// CHECK: br label %[[VAL_23:.*]] -// CHECK: 4: ; preds = %[[VAL_22]], %[[VAL_24:.*]] -// CHECK: br label %[[VAL_25:.*]] -// CHECK: omp.region.cont: ; preds = %[[VAL_23]] -// CHECK: br label %[[VAL_26:.*]] -// CHECK: omp.par.pre_finalize: ; preds = %[[VAL_25]] -// CHECK: br label %[[VAL_27:.*]] +// CHECK: %[[GTN:.*]] = call i32 @__kmpc_global_thread_num(ptr @1) +// CHECK: %[[NOT_CANCELLED:.*]] = call i32 @__kmpc_cancellationpoint(ptr @1, i32 %[[GTN]], i32 1) +// CHECK: %[[COND:.*]] = icmp eq i32 %[[NOT_CANCELLED]], 0 +// CHECK: br i1 %[[COND]], label %[[VAL_23:.*]], label %[[CNCL:.*]] +// CHECK: .cncl: +// CHECK: br label %[[FINI:.*]] // CHECK: .fini: // CHECK: %[[VAL_32:.*]] = call i32 @__kmpc_global_thread_num(ptr @1) // CHECK: %[[VAL_33:.*]] = call i32 @__kmpc_cancel_barrier(ptr @2, i32 %[[VAL_32]]) // CHECK: br label %[[EXIT_STUB:.*]] -// CHECK: 6: ; preds = %[[VAL_20]] +// CHECK: 
.split: +// CHECK: br label %[[SEVEN:.*]] +// CHECK: 7: +// CHECK: br label %[[VAL_25:.*]] +// CHECK: omp.region.cont: +// CHECK: br label %[[VAL_26:.*]] +// CHECK: omp.par.pre_finalize: ; preds = %[[VAL_25]] +// CHECK: br label %[[VAL_27:.*]] +// CHECK: 8: ; preds = %[[VAL_20]] // CHECK: %[[VAL_28:.*]] = call i32 @__kmpc_global_thread_num(ptr @1) // CHECK: %[[VAL_29:.*]] = call i32 @__kmpc_cancel(ptr @1, i32 %[[VAL_28]], i32 1) // CHECK: %[[VAL_30:.*]] = icmp eq i32 %[[VAL_29]], 0 -// CHECK: br i1 %[[VAL_30]], label %[[VAL_24]], label %[[VAL_31:.*]] -// CHECK: .cncl: ; preds = %[[VAL_21]] -// CHECK: br label %[[VAL_27]] -// CHECK: .split: ; preds = %[[VAL_21]] -// CHECK: br label %[[VAL_23]] +// CHECK: br i1 %[[VAL_30]], label %[[SPLIT5:.*]], label %[[VAL_31:.*]] +// CHECK: .cncl{{.*}}: +// CHECK: br label %[[FINI]] +// CHECK: .split{{.*}}: +// CHECK: br label %[[SEVEN]] // CHECK: omp.par.exit.exitStub: // CHECK: ret void @@ -136,11 +143,16 @@ llvm.func @cancel_sections_if(%cond : i1) { // CHECK: %[[VAL_30:.*]] = call i32 @__kmpc_cancel(ptr @1, i32 %[[VAL_29]], i32 3) // CHECK: %[[VAL_31:.*]] = icmp eq i32 %[[VAL_30]], 0 // CHECK: br i1 %[[VAL_31]], label %[[VAL_32:.*]], label %[[VAL_33:.*]] -// CHECK: .split: ; preds = %[[VAL_27]] +// CHECK: .split{{.*}}: ; preds = %[[VAL_27]] // CHECK: br label %[[VAL_34:.*]] // CHECK: 12: ; preds = %[[VAL_25]] +// CHECK: %[[GTN:.*]] = call i32 @__kmpc_global_thread_num(ptr @1) +// CHECK: %[[CANCEL_POINT:.*]] = call i32 @__kmpc_cancellationpoint(ptr @1, i32 %[[GTN]], i32 3) +// CHECK: %[[COND:.*]] = icmp eq i32 %13, 0 +// CHECK: br i1 %[[COND]], label %[[SPLIT:.*]], label %[[CNCL:.*]] +// CHECK: .split{{.*}}: // CHECK: br label %[[VAL_34]] -// CHECK: 13: ; preds = %[[VAL_28]], %[[VAL_32]] +// CHECK: 15: // CHECK: br label %[[VAL_35:.*]] // CHECK: omp.region.cont: ; preds = %[[VAL_34]] // CHECK: br label %[[VAL_23]] @@ -156,8 +168,10 @@ llvm.func @cancel_sections_if(%cond : i1) { // CHECK: br label %[[VAL_37:.*]] // CHECK: 
omp_section_loop.after: ; preds = %[[VAL_19]] // CHECK: ret void -// CHECK: .cncl: ; preds = %[[VAL_27]] -// CHECK: br label %[[VAL_19]] +// CHECK: .cncl: +// CHECK: br label %[[OMP_SECTION_LOOP_EXIT:.*]] +// CHECK: .cncl{{.*}}: +// CHECK: br label %[[OMP_SECTION_LOOP_EXIT:.*]] llvm.func @cancel_wsloop_if(%lb : i32, %ub : i32, %step : i32, %cond : i1) { omp.wsloop { @@ -223,11 +237,16 @@ llvm.func @cancel_wsloop_if(%lb : i32, %ub : i32, %step : i32, %cond : i1) { // CHECK: %[[VAL_47:.*]] = call i32 @__kmpc_cancel(ptr @1, i32 %[[VAL_46]], i32 2) // CHECK: %[[VAL_48:.*]] = icmp eq i32 %[[VAL_47]], 0 // CHECK: br i1 %[[VAL_48]], label %[[VAL_49:.*]], label %[[VAL_50:.*]] -// CHECK: .split: ; preds = %[[VAL_44]] +// CHECK: .split{{.*}}: // CHECK: br label %[[VAL_51:.*]] -// CHECK: 28: ; preds = %[[VAL_42]] +// CHECK: 28: +// CHECK: %[[GTN:.*]] = call i32 @__kmpc_global_thread_num(ptr @1) +// CHECK: %[[CANCEL_POINT:.*]] = call i32 @__kmpc_cancellationpoint(ptr @1, i32 %[[GTN]], i32 2) +// CHECK: %[[COND:.*]] = icmp eq i32 %[[CANCEL_POINT]], 0 +// CHECK: br i1 %[[COND]], label %[[SPLIT3:.*]], label %[[CNCL4:.*]] +// CHECK: .split{{.*}}: // CHECK: br label %[[VAL_51]] -// CHECK: 29: ; preds = %[[VAL_45]], %[[VAL_49]] +// CHECK: 31: // CHECK: br label %[[VAL_52:.*]] // CHECK: omp.region.cont1: ; preds = %[[VAL_51]] // CHECK: br label %[[VAL_32]] @@ -243,8 +262,12 @@ llvm.func @cancel_wsloop_if(%lb : i32, %ub : i32, %step : i32, %cond : i1) { // CHECK: br label %[[VAL_55:.*]] // CHECK: omp.region.cont: ; preds = %[[VAL_54]] // CHECK: ret void -// CHECK: .cncl: ; preds = %[[VAL_44]] -// CHECK: br label %[[VAL_38]] +// CHECK: .cncl{{.*}}: +// CHECK: br label %[[FINI:.*]] +// CHECK: .fini: +// CHECK: br label %[[OMP_LOOP_EXIT:.*]] +// CHECK: .cncl{{.*}}: +// CHECK: br label %[[FINI:.*]] omp.private {type = firstprivate} @i32_priv : i32 copy { ^bb0(%arg0: !llvm.ptr, %arg1: !llvm.ptr): From 6c0a02f2adb4dd92c965bd5a70f19d59d4c597a5 Mon Sep 17 00:00:00 2001 From: Simon Pilgrim 
Date: Mon, 1 Dec 2025 11:23:43 +0000 Subject: [PATCH 30/39] [X86] Add tests showing failure to concat sqrt intrinsics together. (#170096) Similar to fdiv, we should be trying to concat these high latency instructions together --- llvm/test/CodeGen/X86/combine-fsqrt.ll | 91 ++++++++++++++++++++++++++ 1 file changed, 91 insertions(+) create mode 100644 llvm/test/CodeGen/X86/combine-fsqrt.ll diff --git a/llvm/test/CodeGen/X86/combine-fsqrt.ll b/llvm/test/CodeGen/X86/combine-fsqrt.ll new file mode 100644 index 0000000000000..ddd7d3ac24315 --- /dev/null +++ b/llvm/test/CodeGen/X86/combine-fsqrt.ll @@ -0,0 +1,91 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mcpu=x86-64 | FileCheck %s --check-prefixes=SSE +; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mcpu=x86-64-v2 | FileCheck %s --check-prefixes=SSE +; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mcpu=sandybridge | FileCheck %s --check-prefixes=AVX,AVX1OR2 +; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mcpu=x86-64-v3 | FileCheck %s --check-prefixes=AVX,AVX1OR2 +; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mcpu=x86-64-v4 | FileCheck %s --check-prefixes=AVX,AVX512 + +define <8 x float> @concat_sqrt_v8f32_v4f32(<4 x float> %a0, <4 x float> %a1) { +; SSE-LABEL: concat_sqrt_v8f32_v4f32: +; SSE: # %bb.0: +; SSE-NEXT: sqrtps %xmm0, %xmm0 +; SSE-NEXT: sqrtps %xmm1, %xmm1 +; SSE-NEXT: retq +; +; AVX-LABEL: concat_sqrt_v8f32_v4f32: +; AVX: # %bb.0: +; AVX-NEXT: vsqrtps %xmm0, %xmm0 +; AVX-NEXT: vsqrtps %xmm1, %xmm1 +; AVX-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 +; AVX-NEXT: retq + %v0 = call <4 x float> @llvm.sqrt.v4f32(<4 x float> %a0) + %v1 = call <4 x float> @llvm.sqrt.v4f32(<4 x float> %a1) + %res = shufflevector <4 x float> %v0, <4 x float> %v1, <8 x i32> + ret <8 x float> %res +} + +define <16 x float> @concat_sqrt_v16f32_v4f32(<4 x float> %a0, <4 x float> %a1, <4 x float> %a2, <4 x float> %a3) { +; SSE-LABEL: 
concat_sqrt_v16f32_v4f32: +; SSE: # %bb.0: +; SSE-NEXT: sqrtps %xmm0, %xmm0 +; SSE-NEXT: sqrtps %xmm1, %xmm1 +; SSE-NEXT: sqrtps %xmm2, %xmm2 +; SSE-NEXT: sqrtps %xmm3, %xmm3 +; SSE-NEXT: retq +; +; AVX1OR2-LABEL: concat_sqrt_v16f32_v4f32: +; AVX1OR2: # %bb.0: +; AVX1OR2-NEXT: vsqrtps %xmm0, %xmm0 +; AVX1OR2-NEXT: vsqrtps %xmm1, %xmm1 +; AVX1OR2-NEXT: vsqrtps %xmm2, %xmm2 +; AVX1OR2-NEXT: vsqrtps %xmm3, %xmm3 +; AVX1OR2-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 +; AVX1OR2-NEXT: vinsertf128 $1, %xmm3, %ymm2, %ymm1 +; AVX1OR2-NEXT: retq +; +; AVX512-LABEL: concat_sqrt_v16f32_v4f32: +; AVX512: # %bb.0: +; AVX512-NEXT: vsqrtps %xmm0, %xmm0 +; AVX512-NEXT: vsqrtps %xmm1, %xmm1 +; AVX512-NEXT: vsqrtps %xmm2, %xmm2 +; AVX512-NEXT: vsqrtps %xmm3, %xmm3 +; AVX512-NEXT: vinsertf128 $1, %xmm3, %ymm2, %ymm2 +; AVX512-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 +; AVX512-NEXT: vinsertf64x4 $1, %ymm2, %zmm0, %zmm0 +; AVX512-NEXT: retq + %v0 = call <4 x float> @llvm.sqrt.v4f32(<4 x float> %a0) + %v1 = call <4 x float> @llvm.sqrt.v4f32(<4 x float> %a1) + %v2 = call <4 x float> @llvm.sqrt.v4f32(<4 x float> %a2) + %v3 = call <4 x float> @llvm.sqrt.v4f32(<4 x float> %a3) + %r01 = shufflevector <4 x float> %v0, <4 x float> %v1, <8 x i32> + %r23 = shufflevector <4 x float> %v2, <4 x float> %v3, <8 x i32> + %res = shufflevector <8 x float> %r01, <8 x float> %r23, <16 x i32> + ret <16 x float> %res +} + +define <16 x float> @concat_sqrt_v16f32_v8f32(<8 x float> %a0, <8 x float> %a1) { +; SSE-LABEL: concat_sqrt_v16f32_v8f32: +; SSE: # %bb.0: +; SSE-NEXT: sqrtps %xmm0, %xmm0 +; SSE-NEXT: sqrtps %xmm1, %xmm1 +; SSE-NEXT: sqrtps %xmm2, %xmm2 +; SSE-NEXT: sqrtps %xmm3, %xmm3 +; SSE-NEXT: retq +; +; AVX1OR2-LABEL: concat_sqrt_v16f32_v8f32: +; AVX1OR2: # %bb.0: +; AVX1OR2-NEXT: vsqrtps %ymm0, %ymm0 +; AVX1OR2-NEXT: vsqrtps %ymm1, %ymm1 +; AVX1OR2-NEXT: retq +; +; AVX512-LABEL: concat_sqrt_v16f32_v8f32: +; AVX512: # %bb.0: +; AVX512-NEXT: vsqrtps %ymm0, %ymm0 +; AVX512-NEXT: vsqrtps %ymm1, %ymm1 
+; AVX512-NEXT: vinsertf64x4 $1, %ymm1, %zmm0, %zmm0 +; AVX512-NEXT: retq + %v0 = call <8 x float> @llvm.sqrt.v8f32(<8 x float> %a0) + %v1 = call <8 x float> @llvm.sqrt.v8f32(<8 x float> %a1) + %res = shufflevector <8 x float> %v0, <8 x float> %v1, <16 x i32> + ret <16 x float> %res +} From 0e721b75aaa39181c71e798d5a95102eb349bf1c Mon Sep 17 00:00:00 2001 From: Simon Pilgrim Date: Mon, 1 Dec 2025 11:28:34 +0000 Subject: [PATCH 31/39] [X86] Add tests showing failure to concat RCPPS + RSQRTPS intrinsics together. (#170098) Can only do this for 128->256 cases as we can't safely convert to the RCP14/RSQRT14 variants --- llvm/test/CodeGen/X86/combine-rcp.ll | 65 ++++++++++++++++++++++++++ llvm/test/CodeGen/X86/combine-rsqrt.ll | 65 ++++++++++++++++++++++++++ 2 files changed, 130 insertions(+) create mode 100644 llvm/test/CodeGen/X86/combine-rcp.ll create mode 100644 llvm/test/CodeGen/X86/combine-rsqrt.ll diff --git a/llvm/test/CodeGen/X86/combine-rcp.ll b/llvm/test/CodeGen/X86/combine-rcp.ll new file mode 100644 index 0000000000000..7de3e96d592db --- /dev/null +++ b/llvm/test/CodeGen/X86/combine-rcp.ll @@ -0,0 +1,65 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mcpu=x86-64 | FileCheck %s --check-prefixes=SSE +; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mcpu=x86-64-v2 | FileCheck %s --check-prefixes=SSE +; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mcpu=sandybridge | FileCheck %s --check-prefixes=AVX,AVX1OR2 +; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mcpu=x86-64-v3 | FileCheck %s --check-prefixes=AVX,AVX1OR2 +; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mcpu=x86-64-v4 | FileCheck %s --check-prefixes=AVX,AVX512 + +define <8 x float> @concat_rcp_v8f32_v4f32(<4 x float> %a0, <4 x float> %a1) { +; SSE-LABEL: concat_rcp_v8f32_v4f32: +; SSE: # %bb.0: +; SSE-NEXT: rcpps %xmm0, %xmm0 +; SSE-NEXT: rcpps %xmm1, %xmm1 +; SSE-NEXT: retq +; +; AVX-LABEL: concat_rcp_v8f32_v4f32: 
+; AVX: # %bb.0: +; AVX-NEXT: vrcpps %xmm0, %xmm0 +; AVX-NEXT: vrcpps %xmm1, %xmm1 +; AVX-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 +; AVX-NEXT: retq + %v0 = call <4 x float> @llvm.x86.sse.rcp.ps(<4 x float> %a0) + %v1 = call <4 x float> @llvm.x86.sse.rcp.ps(<4 x float> %a1) + %res = shufflevector <4 x float> %v0, <4 x float> %v1, <8 x i32> + ret <8 x float> %res +} + +; Ensure we don't convert rcpps to rcp14ps +define <16 x float> @concat_rcp_v16f32_v4f32(<4 x float> %a0, <4 x float> %a1, <4 x float> %a2, <4 x float> %a3) { +; SSE-LABEL: concat_rcp_v16f32_v4f32: +; SSE: # %bb.0: +; SSE-NEXT: rcpps %xmm0, %xmm0 +; SSE-NEXT: rcpps %xmm1, %xmm1 +; SSE-NEXT: rcpps %xmm2, %xmm2 +; SSE-NEXT: rcpps %xmm3, %xmm3 +; SSE-NEXT: retq +; +; AVX1OR2-LABEL: concat_rcp_v16f32_v4f32: +; AVX1OR2: # %bb.0: +; AVX1OR2-NEXT: vrcpps %xmm0, %xmm0 +; AVX1OR2-NEXT: vrcpps %xmm1, %xmm1 +; AVX1OR2-NEXT: vrcpps %xmm2, %xmm2 +; AVX1OR2-NEXT: vrcpps %xmm3, %xmm3 +; AVX1OR2-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 +; AVX1OR2-NEXT: vinsertf128 $1, %xmm3, %ymm2, %ymm1 +; AVX1OR2-NEXT: retq +; +; AVX512-LABEL: concat_rcp_v16f32_v4f32: +; AVX512: # %bb.0: +; AVX512-NEXT: vrcpps %xmm0, %xmm0 +; AVX512-NEXT: vrcpps %xmm1, %xmm1 +; AVX512-NEXT: vrcpps %xmm2, %xmm2 +; AVX512-NEXT: vrcpps %xmm3, %xmm3 +; AVX512-NEXT: vinsertf128 $1, %xmm3, %ymm2, %ymm2 +; AVX512-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 +; AVX512-NEXT: vinsertf64x4 $1, %ymm2, %zmm0, %zmm0 +; AVX512-NEXT: retq + %v0 = call <4 x float> @llvm.x86.sse.rcp.ps(<4 x float> %a0) + %v1 = call <4 x float> @llvm.x86.sse.rcp.ps(<4 x float> %a1) + %v2 = call <4 x float> @llvm.x86.sse.rcp.ps(<4 x float> %a2) + %v3 = call <4 x float> @llvm.x86.sse.rcp.ps(<4 x float> %a3) + %r01 = shufflevector <4 x float> %v0, <4 x float> %v1, <8 x i32> + %r23 = shufflevector <4 x float> %v2, <4 x float> %v3, <8 x i32> + %res = shufflevector <8 x float> %r01, <8 x float> %r23, <16 x i32> + ret <16 x float> %res +} diff --git a/llvm/test/CodeGen/X86/combine-rsqrt.ll 
b/llvm/test/CodeGen/X86/combine-rsqrt.ll new file mode 100644 index 0000000000000..78688701f8cd3 --- /dev/null +++ b/llvm/test/CodeGen/X86/combine-rsqrt.ll @@ -0,0 +1,65 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mcpu=x86-64 | FileCheck %s --check-prefixes=SSE +; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mcpu=x86-64-v2 | FileCheck %s --check-prefixes=SSE +; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mcpu=sandybridge | FileCheck %s --check-prefixes=AVX,AVX1OR2 +; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mcpu=x86-64-v3 | FileCheck %s --check-prefixes=AVX,AVX1OR2 +; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mcpu=x86-64-v4 | FileCheck %s --check-prefixes=AVX,AVX512 + +define <8 x float> @concat_rsqrt_v8f32_v4f32(<4 x float> %a0, <4 x float> %a1) { +; SSE-LABEL: concat_rsqrt_v8f32_v4f32: +; SSE: # %bb.0: +; SSE-NEXT: rsqrtps %xmm0, %xmm0 +; SSE-NEXT: rsqrtps %xmm1, %xmm1 +; SSE-NEXT: retq +; +; AVX-LABEL: concat_rsqrt_v8f32_v4f32: +; AVX: # %bb.0: +; AVX-NEXT: vrsqrtps %xmm0, %xmm0 +; AVX-NEXT: vrsqrtps %xmm1, %xmm1 +; AVX-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 +; AVX-NEXT: retq + %v0 = call <4 x float> @llvm.x86.sse.rsqrt.ps(<4 x float> %a0) + %v1 = call <4 x float> @llvm.x86.sse.rsqrt.ps(<4 x float> %a1) + %res = shufflevector <4 x float> %v0, <4 x float> %v1, <8 x i32> + ret <8 x float> %res +} + +; Ensure we don't convert rsqrtps to rsqrt14ps +define <16 x float> @concat_rsqrt_v16f32_v4f32(<4 x float> %a0, <4 x float> %a1, <4 x float> %a2, <4 x float> %a3) { +; SSE-LABEL: concat_rsqrt_v16f32_v4f32: +; SSE: # %bb.0: +; SSE-NEXT: rsqrtps %xmm0, %xmm0 +; SSE-NEXT: rsqrtps %xmm1, %xmm1 +; SSE-NEXT: rsqrtps %xmm2, %xmm2 +; SSE-NEXT: rsqrtps %xmm3, %xmm3 +; SSE-NEXT: retq +; +; AVX1OR2-LABEL: concat_rsqrt_v16f32_v4f32: +; AVX1OR2: # %bb.0: +; AVX1OR2-NEXT: vrsqrtps %xmm0, %xmm0 +; AVX1OR2-NEXT: vrsqrtps %xmm1, %xmm1 +; AVX1OR2-NEXT: vrsqrtps %xmm2, %xmm2 +; 
AVX1OR2-NEXT: vrsqrtps %xmm3, %xmm3 +; AVX1OR2-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 +; AVX1OR2-NEXT: vinsertf128 $1, %xmm3, %ymm2, %ymm1 +; AVX1OR2-NEXT: retq +; +; AVX512-LABEL: concat_rsqrt_v16f32_v4f32: +; AVX512: # %bb.0: +; AVX512-NEXT: vrsqrtps %xmm0, %xmm0 +; AVX512-NEXT: vrsqrtps %xmm1, %xmm1 +; AVX512-NEXT: vrsqrtps %xmm2, %xmm2 +; AVX512-NEXT: vrsqrtps %xmm3, %xmm3 +; AVX512-NEXT: vinsertf128 $1, %xmm3, %ymm2, %ymm2 +; AVX512-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 +; AVX512-NEXT: vinsertf64x4 $1, %ymm2, %zmm0, %zmm0 +; AVX512-NEXT: retq + %v0 = call <4 x float> @llvm.x86.sse.rsqrt.ps(<4 x float> %a0) + %v1 = call <4 x float> @llvm.x86.sse.rsqrt.ps(<4 x float> %a1) + %v2 = call <4 x float> @llvm.x86.sse.rsqrt.ps(<4 x float> %a2) + %v3 = call <4 x float> @llvm.x86.sse.rsqrt.ps(<4 x float> %a3) + %r01 = shufflevector <4 x float> %v0, <4 x float> %v1, <8 x i32> + %r23 = shufflevector <4 x float> %v2, <4 x float> %v3, <8 x i32> + %res = shufflevector <8 x float> %r01, <8 x float> %r23, <16 x i32> + ret <16 x float> %res +} From edd1856686a44db896d64a3083619dfcc473a65f Mon Sep 17 00:00:00 2001 From: Jasmine Tang Date: Mon, 1 Dec 2025 11:32:46 +0000 Subject: [PATCH 32/39] [WebAssembly] Optimize away mask of 63 for shl ( zext (and i32 63))) (#152397) Fixes https://github.com/llvm/llvm-project/issues/71844 --- .../Target/WebAssembly/WebAssemblyInstrInfo.td | 2 +- .../Target/WebAssembly/WebAssemblyInstrInteger.td | 3 +++ llvm/test/CodeGen/WebAssembly/masked-shifts.ll | 15 +++++++++++++++ 3 files changed, 19 insertions(+), 1 deletion(-) diff --git a/llvm/lib/Target/WebAssembly/WebAssemblyInstrInfo.td b/llvm/lib/Target/WebAssembly/WebAssemblyInstrInfo.td index 13d048a98d6ea..ce4db2e112fa0 100644 --- a/llvm/lib/Target/WebAssembly/WebAssemblyInstrInfo.td +++ b/llvm/lib/Target/WebAssembly/WebAssemblyInstrInfo.td @@ -460,8 +460,8 @@ def : Pat<(i64 (WebAssemblyWrapperREL texternalsym:$addr)), include "WebAssemblyInstrMemory.td" include "WebAssemblyInstrCall.td" 
include "WebAssemblyInstrControl.td" -include "WebAssemblyInstrInteger.td" include "WebAssemblyInstrConv.td" +include "WebAssemblyInstrInteger.td" include "WebAssemblyInstrFloat.td" include "WebAssemblyInstrAtomics.td" include "WebAssemblyInstrSIMD.td" diff --git a/llvm/lib/Target/WebAssembly/WebAssemblyInstrInteger.td b/llvm/lib/Target/WebAssembly/WebAssemblyInstrInteger.td index d4c8f92c883e7..eb692679f5971 100644 --- a/llvm/lib/Target/WebAssembly/WebAssemblyInstrInteger.td +++ b/llvm/lib/Target/WebAssembly/WebAssemblyInstrInteger.td @@ -107,6 +107,9 @@ def : Pat<(rotr I32:$lhs, (and I32:$rhs, 31)), (ROTR_I32 I32:$lhs, I32:$rhs)>; def : Pat<(rotl I64:$lhs, (and I64:$rhs, 63)), (ROTL_I64 I64:$lhs, I64:$rhs)>; def : Pat<(rotr I64:$lhs, (and I64:$rhs, 63)), (ROTR_I64 I64:$lhs, I64:$rhs)>; +def : Pat<(shl I64:$lhs, (zext (and I32:$rhs, 63))), + (SHL_I64 I64:$lhs, (I64_EXTEND_U_I32 I32:$rhs))>; + defm SELECT_I32 : I<(outs I32:$dst), (ins I32:$lhs, I32:$rhs, I32:$cond), (outs), (ins), [(set I32:$dst, (select I32:$cond, I32:$lhs, I32:$rhs))], diff --git a/llvm/test/CodeGen/WebAssembly/masked-shifts.ll b/llvm/test/CodeGen/WebAssembly/masked-shifts.ll index 5bcb023e546b5..368f30fd5d7ed 100644 --- a/llvm/test/CodeGen/WebAssembly/masked-shifts.ll +++ b/llvm/test/CodeGen/WebAssembly/masked-shifts.ll @@ -18,6 +18,21 @@ define i32 @shl_i32(i32 %v, i32 %x) { ret i32 %a } +define i64 @shl_i64_zext(i64 %v, i32 %x) { +; CHECK-LABEL: shl_i64_zext: +; CHECK: .functype shl_i64_zext (i64, i32) -> (i64) +; CHECK-NEXT: # %bb.0: +; CHECK-NEXT: local.get 0 +; CHECK-NEXT: local.get 1 +; CHECK-NEXT: i64.extend_i32_u +; CHECK-NEXT: i64.shl +; CHECK-NEXT: # fallthrough-return + %m = and i32 %x, 63 + %z = zext i32 %m to i64 + %a = shl i64 %v, %z + ret i64 %a +} + define i32 @sra_i32(i32 %v, i32 %x) { ; CHECK-LABEL: sra_i32: ; CHECK: .functype sra_i32 (i32, i32) -> (i32) From 130746addfed03e9a53b62dfc0da47e2c18ee959 Mon Sep 17 00:00:00 2001 From: Jan Patrick Lehr Date: Mon, 1 Dec 2025 12:37:09 
+0100 Subject: [PATCH 33/39] [MLIR] Fix build after #169982 (#170107) --- mlir/lib/Dialect/SCF/IR/CMakeLists.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/mlir/lib/Dialect/SCF/IR/CMakeLists.txt b/mlir/lib/Dialect/SCF/IR/CMakeLists.txt index 423e1c3e1e042..b111117410ba3 100644 --- a/mlir/lib/Dialect/SCF/IR/CMakeLists.txt +++ b/mlir/lib/Dialect/SCF/IR/CMakeLists.txt @@ -19,5 +19,5 @@ add_mlir_dialect_library(MLIRSCFDialect MLIRSideEffectInterfaces MLIRTensorDialect MLIRValueBoundsOpInterface + MLIRTransformUtils ) - From 577cd6fb02959270dcdc48864ea0fba1d540cef4 Mon Sep 17 00:00:00 2001 From: Mehdi Amini Date: Mon, 1 Dec 2025 12:39:25 +0100 Subject: [PATCH 34/39] [LIT] Workaround the 60 processed limit on Windows (#157759) Python multiprocessing is limited to 60 workers at most: https://github.com/python/cpython/blob/6bc65c30ff1fd0b581a2c93416496fc720bc442c/Lib/concurrent/futures/process.py#L669-L672 The limit being per thread pool, we can work around it by using multiple pools on windows when we want to actually use more workers. --- llvm/utils/lit/lit/run.py | 68 +++++++++++++++++++++++---- llvm/utils/lit/lit/util.py | 5 -- llvm/utils/lit/tests/windows-pools.py | 27 +++++++++++ 3 files changed, 85 insertions(+), 15 deletions(-) create mode 100644 llvm/utils/lit/tests/windows-pools.py diff --git a/llvm/utils/lit/lit/run.py b/llvm/utils/lit/lit/run.py index 3fc4a1b9b40bd..9c54511bfd625 100644 --- a/llvm/utils/lit/lit/run.py +++ b/llvm/utils/lit/lit/run.py @@ -7,6 +7,14 @@ import lit.util import lit.worker +# Windows has a limit of 60 workers per pool. +# This is defined in the multiprocessing module implementation. 
+# See: https://github.com/python/cpython/blob/6bc65c30ff1fd0b581a2c93416496fc720bc442c/Lib/concurrent/futures/process.py#L669-L672 +WINDOWS_MAX_WORKERS_PER_POOL = 60 + + +def _ceilDiv(a, b): + return (a + b - 1) // b class MaxFailuresError(Exception): pass @@ -72,25 +80,65 @@ def _execute(self, deadline): if v is not None } - pool = multiprocessing.Pool( - self.workers, lit.worker.initialize, (self.lit_config, semaphores) + # Windows has a limit of 60 workers per pool, so we need to use multiple pools + # if we have more workers requested than the limit. + # Also, allow to override the limit with the LIT_WINDOWS_MAX_WORKERS_PER_POOL environment variable. + max_workers_per_pool = ( + WINDOWS_MAX_WORKERS_PER_POOL if os.name == "nt" else self.workers + ) + max_workers_per_pool = int( + os.getenv("LIT_WINDOWS_MAX_WORKERS_PER_POOL", max_workers_per_pool) ) - async_results = [ - pool.apply_async( - lit.worker.execute, args=[test], callback=self.progress_callback + num_pools = max(1, _ceilDiv(self.workers, max_workers_per_pool)) + + # Distribute self.workers across num_pools as evenly as possible + workers_per_pool_list = [self.workers // num_pools] * num_pools + for pool_idx in range(self.workers % num_pools): + workers_per_pool_list[pool_idx] += 1 + + if num_pools > 1: + self.lit_config.note( + "Using %d pools balancing %d workers total distributed as %s (Windows worker limit workaround)" + % (num_pools, self.workers, workers_per_pool_list) ) - for test in self.tests - ] - pool.close() + + # Create multiple pools + pools = [] + for pool_size in workers_per_pool_list: + pool = multiprocessing.Pool( + pool_size, lit.worker.initialize, (self.lit_config, semaphores) + ) + pools.append(pool) + + # Distribute tests across pools + tests_per_pool = _ceilDiv(len(self.tests), num_pools) + async_results = [] + + for pool_idx, pool in enumerate(pools): + start_idx = pool_idx * tests_per_pool + end_idx = min(start_idx + tests_per_pool, len(self.tests)) + for test in 
self.tests[start_idx:end_idx]: + ar = pool.apply_async( + lit.worker.execute, args=[test], callback=self.progress_callback + ) + async_results.append(ar) + + # Close all pools + for pool in pools: + pool.close() try: self._wait_for(async_results, deadline) except: - pool.terminate() + # Terminate all pools on exception + for pool in pools: + pool.terminate() raise finally: - pool.join() + # Join all pools + for pool in pools: + pool.join() def _wait_for(self, async_results, deadline): timeout = deadline - time.time() diff --git a/llvm/utils/lit/lit/util.py b/llvm/utils/lit/lit/util.py index e4e031b3e0898..6f25fbc94b757 100644 --- a/llvm/utils/lit/lit/util.py +++ b/llvm/utils/lit/lit/util.py @@ -114,11 +114,6 @@ def usable_core_count(): except AttributeError: n = os.cpu_count() or 1 - # On Windows with more than 60 processes, multiprocessing's call to - # _winapi.WaitForMultipleObjects() prints an error and lit hangs. - if platform.system() == "Windows": - return min(n, 60) - return n def abs_path_preserve_drive(path): diff --git a/llvm/utils/lit/tests/windows-pools.py b/llvm/utils/lit/tests/windows-pools.py new file mode 100644 index 0000000000000..85110b37c2601 --- /dev/null +++ b/llvm/utils/lit/tests/windows-pools.py @@ -0,0 +1,27 @@ +# Create a directory with 20 files and check the number of pools and workers per pool that lit will use. 
+ +# RUN: rm -Rf %t.dir && mkdir -p %t.dir +# RUN: python -c "for i in range(20): open(rf'%t.dir/file{i}.txt', 'w').write('RUN:')" + +# RUN: echo "import lit.formats" > %t.dir/lit.cfg +# RUN: echo "config.name = \"top-level-suite\"" >> %t.dir/lit.cfg +# RUN: echo "config.suffixes = [\".txt\"]" >> %t.dir/lit.cfg +# RUN: echo "config.test_format = lit.formats.ShTest()" >> %t.dir/lit.cfg + + +# 15 workers per pool max, 100 workers total max: we expect lit to cap the workers to the number of files +# RUN: env "LIT_WINDOWS_MAX_WORKERS_PER_POOL=15" %{lit} -s %t.dir/ -j100 > %t.out 2>&1 +# CHECK: Using 2 pools balancing 20 workers total distributed as [10, 10] +# CHECK: Passed: 20 + +# 5 workers per pool max, 17 workers total max +# RUN: env "LIT_WINDOWS_MAX_WORKERS_PER_POOL=5" %{lit} -s %t.dir/ -j17 >> %t.out 2>&1 +# CHECK: Using 4 pools balancing 17 workers total distributed as [5, 4, 4, 4] +# CHECK: Passed: 20 + +# 19 workers per pool max, 19 workers total max +# RUN: env "LIT_WINDOWS_MAX_WORKERS_PER_POOL=19" %{lit} -s %t.dir/ -j19 >> %t.out 2>&1 +# CHECK-NOT: workers total distributed as +# CHECK: Passed: 20 + +# RUN: cat %t.out | FileCheck %s From 48931e5e5942304afd1c0a493be91b662ffd221b Mon Sep 17 00:00:00 2001 From: Timm Baeder Date: Mon, 1 Dec 2025 12:43:35 +0100 Subject: [PATCH 35/39] [clang][bytecode] Check memcmp builtin for one-past-the-end pointers (#170097) We can't read from those and will run into an assertion sooner or later. 
Fixes https://github.com/llvm/llvm-project/issues/170031 --- clang/lib/AST/ByteCode/InterpBuiltin.cpp | 4 ++++ clang/test/AST/ByteCode/builtin-functions.cpp | 7 +++++++ 2 files changed, 11 insertions(+) diff --git a/clang/lib/AST/ByteCode/InterpBuiltin.cpp b/clang/lib/AST/ByteCode/InterpBuiltin.cpp index 8496b58105c7a..971fce541bb88 100644 --- a/clang/lib/AST/ByteCode/InterpBuiltin.cpp +++ b/clang/lib/AST/ByteCode/InterpBuiltin.cpp @@ -1921,6 +1921,10 @@ static bool interp__builtin_memcmp(InterpState &S, CodePtr OpPC, if (PtrA.isDummy() || PtrB.isDummy()) return false; + if (!CheckRange(S, OpPC, PtrA, AK_Read) || + !CheckRange(S, OpPC, PtrB, AK_Read)) + return false; + // Now, read both pointers to a buffer and compare those. BitcastBuffer BufferA( Bits(ASTCtx.getTypeSize(ElemTypeA) * PtrA.getNumElems())); diff --git a/clang/test/AST/ByteCode/builtin-functions.cpp b/clang/test/AST/ByteCode/builtin-functions.cpp index 4a53cb66b2fdd..3076b5239ebbe 100644 --- a/clang/test/AST/ByteCode/builtin-functions.cpp +++ b/clang/test/AST/ByteCode/builtin-functions.cpp @@ -1545,6 +1545,13 @@ namespace Memcmp { int unknown; void foo(void) { unknown *= __builtin_memcmp(0, 0, 2); } + + constexpr int onepasttheend(char a) { + __builtin_memcmp(&a, &a + 1, 1); // both-note {{read of dereferenced one-past-the-end pointer}} + return 1; + } + static_assert(onepasttheend(10)); // both-error {{not an integral constant expression}} \ + // both-note {{in call to}} } namespace Memchr { From d0df51bc93fb5a254dd8a05752b782a13dc1f64d Mon Sep 17 00:00:00 2001 From: Luke Lau Date: Mon, 1 Dec 2025 19:51:56 +0800 Subject: [PATCH 36/39] [ConstantRange] Allow casting to the same bitwidth. NFC (#170102) From the review in https://github.com/llvm/llvm-project/pull/169527#discussion_r2567122387, there are some users where we want to extend or truncate a ConstantRange only if it's not already the destination bitwidth. 
Previously this asserted, so this PR relaxes it to just be a no-op, similar to IRBuilder::createZExt and friends. --- llvm/lib/IR/ConstantRange.cpp | 6 ++++++ llvm/lib/Transforms/Utils/SCCPSolver.cpp | 6 +++--- llvm/unittests/IR/ConstantRangeTest.cpp | 9 +++++++++ 3 files changed, 18 insertions(+), 3 deletions(-) diff --git a/llvm/lib/IR/ConstantRange.cpp b/llvm/lib/IR/ConstantRange.cpp index b454c9a4cd3ae..9beaee60d0bc1 100644 --- a/llvm/lib/IR/ConstantRange.cpp +++ b/llvm/lib/IR/ConstantRange.cpp @@ -841,6 +841,8 @@ ConstantRange ConstantRange::zeroExtend(uint32_t DstTySize) const { if (isEmptySet()) return getEmpty(DstTySize); unsigned SrcTySize = getBitWidth(); + if (DstTySize == SrcTySize) + return *this; assert(SrcTySize < DstTySize && "Not a value extension"); if (isFullSet() || isUpperWrapped()) { // Change into [0, 1 << src bit width) @@ -858,6 +860,8 @@ ConstantRange ConstantRange::signExtend(uint32_t DstTySize) const { if (isEmptySet()) return getEmpty(DstTySize); unsigned SrcTySize = getBitWidth(); + if (DstTySize == SrcTySize) + return *this; assert(SrcTySize < DstTySize && "Not a value extension"); // special case: [X, INT_MIN) -- not really wrapping around @@ -874,6 +878,8 @@ ConstantRange ConstantRange::signExtend(uint32_t DstTySize) const { ConstantRange ConstantRange::truncate(uint32_t DstTySize, unsigned NoWrapKind) const { + if (DstTySize == getBitWidth()) + return *this; assert(getBitWidth() > DstTySize && "Not a value truncation"); if (isEmptySet()) return getEmpty(DstTySize); diff --git a/llvm/lib/Transforms/Utils/SCCPSolver.cpp b/llvm/lib/Transforms/Utils/SCCPSolver.cpp index 951bf1ca62fc2..021bf0618754a 100644 --- a/llvm/lib/Transforms/Utils/SCCPSolver.cpp +++ b/llvm/lib/Transforms/Utils/SCCPSolver.cpp @@ -2109,10 +2109,10 @@ void SCCPInstVisitor::handleCallResult(CallBase &CB) { ConstantRange Count = getValueState(CountArg) .asConstantRange(CountArg->getType(), false) - .zextOrTrunc(BitWidth); + .zeroExtend(BitWidth); ConstantRange 
MaxLanes = getValueState(VF) .asConstantRange(VF->getType(), false) - .zextOrTrunc(BitWidth); + .zeroExtend(BitWidth); if (Scalable) MaxLanes = MaxLanes.multiply(getVScaleRange(II->getFunction(), BitWidth)); @@ -2126,7 +2126,7 @@ void SCCPInstVisitor::handleCallResult(CallBase &CB) { if (Count.icmp(CmpInst::ICMP_ULE, MaxLanes)) Result = Count; - Result = Result.zextOrTrunc(II->getType()->getScalarSizeInBits()); + Result = Result.truncate(II->getType()->getScalarSizeInBits()); return (void)mergeInValue(ValueState[II], II, ValueLatticeElement::getRange(Result)); } diff --git a/llvm/unittests/IR/ConstantRangeTest.cpp b/llvm/unittests/IR/ConstantRangeTest.cpp index 53d581c8db7c9..13712a76d3edf 100644 --- a/llvm/unittests/IR/ConstantRangeTest.cpp +++ b/llvm/unittests/IR/ConstantRangeTest.cpp @@ -449,6 +449,9 @@ TEST_F(ConstantRangeTest, Trunc) { // trunc([7, 1), 3->2) = [3, 1) ConstantRange SevenOne(APInt(3, 7), APInt(3, 1)); EXPECT_EQ(SevenOne.truncate(2), ConstantRange(APInt(2, 3), APInt(2, 1))); + + ConstantRange Nop = Full.truncate(Full.getBitWidth()); + EXPECT_EQ(Full, Nop); } TEST_F(ConstantRangeTest, TruncNuw) { @@ -527,6 +530,9 @@ TEST_F(ConstantRangeTest, ZExt) { // zext([5, 0), 3->7) = [5, 8) ConstantRange FiveZero(APInt(3, 5), APInt(3, 0)); EXPECT_EQ(FiveZero.zeroExtend(7), ConstantRange(APInt(7, 5), APInt(7, 8))); + + ConstantRange Nop = Full.zeroExtend(Full.getBitWidth()); + EXPECT_EQ(Full, Nop); } TEST_F(ConstantRangeTest, SExt) { @@ -550,6 +556,9 @@ TEST_F(ConstantRangeTest, SExt) { EXPECT_EQ(ConstantRange(APInt(16, 0x0200), APInt(16, 0x8000)).signExtend(19), ConstantRange(APInt(19, 0x0200), APInt(19, 0x8000))); + + ConstantRange Nop = Full.signExtend(Full.getBitWidth()); + EXPECT_EQ(Full, Nop); } TEST_F(ConstantRangeTest, IntersectWith) { From 58770200a7045dd46dfb8c85299eee504d95026c Mon Sep 17 00:00:00 2001 From: Ryotaro Kasuga Date: Mon, 1 Dec 2025 20:57:09 +0900 Subject: [PATCH 37/39] [DA] Clean up unnecessary member function declarations (#170106) 
Follow-up for #169047. The previous PR moved some functions from DA to Delinearization, but the member function declarations were not updated accordingly. This patch removes them. --- llvm/include/llvm/Analysis/DependenceAnalysis.h | 11 ----------- 1 file changed, 11 deletions(-) diff --git a/llvm/include/llvm/Analysis/DependenceAnalysis.h b/llvm/include/llvm/Analysis/DependenceAnalysis.h index 8286d8e8e45cc..ad46d2f1466cf 100644 --- a/llvm/include/llvm/Analysis/DependenceAnalysis.h +++ b/llvm/include/llvm/Analysis/DependenceAnalysis.h @@ -506,17 +506,6 @@ class DependenceInfo { bool isKnownPredicate(ICmpInst::Predicate Pred, const SCEV *X, const SCEV *Y) const; - /// isKnownLessThan - Compare to see if S is less than Size - /// Another wrapper for isKnownNegative(S - max(Size, 1)) with some extra - /// checking if S is an AddRec and we can prove lessthan using the loop - /// bounds. - bool isKnownLessThan(const SCEV *S, const SCEV *Size) const; - - /// isKnownNonNegative - Compare to see if S is known not to be negative - /// Uses the fact that S comes from Ptr, which may be an inbound GEP, - /// Proving there is no wrapping going on. - bool isKnownNonNegative(const SCEV *S, const Value *Ptr) const; - /// collectUpperBound - All subscripts are the same type (on my machine, /// an i64). The loop bound may be a smaller type. collectUpperBound /// find the bound, if available, and zero extends it to the Type T. 
From 6157d4625941870392a0f5377b8ab08c4c204ce4 Mon Sep 17 00:00:00 2001 From: Sohaib Iftikhar Date: Mon, 1 Dec 2025 13:00:58 +0100 Subject: [PATCH 38/39] [MLIR|BUILD]: Fix for 8ceeba838 (#170110) --- utils/bazel/llvm-project-overlay/mlir/BUILD.bazel | 1 + 1 file changed, 1 insertion(+) diff --git a/utils/bazel/llvm-project-overlay/mlir/BUILD.bazel b/utils/bazel/llvm-project-overlay/mlir/BUILD.bazel index 13a7705091b24..c574ba5877b3d 100644 --- a/utils/bazel/llvm-project-overlay/mlir/BUILD.bazel +++ b/utils/bazel/llvm-project-overlay/mlir/BUILD.bazel @@ -4444,6 +4444,7 @@ cc_library( ":SCFIncGen", ":SideEffectInterfaces", ":TensorDialect", + ":TransformUtils", ":ValueBoundsOpInterface", ":ViewLikeInterface", "//llvm:Support", From 989ac4c9db3aaa660dcfd0d1d5683b4c07dffaec Mon Sep 17 00:00:00 2001 From: Simon Pilgrim Date: Mon, 1 Dec 2025 12:07:01 +0000 Subject: [PATCH 39/39] [X86] Add tests showing failure to concat fp rounding intrinsics together. (#170108) --- llvm/test/CodeGen/X86/combine-fceil.ll | 175 ++++++++++++++++++++ llvm/test/CodeGen/X86/combine-fnearbyint.ll | 175 ++++++++++++++++++++ llvm/test/CodeGen/X86/combine-frint.ll | 175 ++++++++++++++++++++ llvm/test/CodeGen/X86/combine-froundeven.ll | 175 ++++++++++++++++++++ llvm/test/CodeGen/X86/combine-ftrunc.ll | 175 ++++++++++++++++++++ llvm/test/CodeGen/X86/combine-rndscale.ll | 144 ++++++++++++++++ 6 files changed, 1019 insertions(+) create mode 100644 llvm/test/CodeGen/X86/combine-fceil.ll create mode 100644 llvm/test/CodeGen/X86/combine-fnearbyint.ll create mode 100644 llvm/test/CodeGen/X86/combine-frint.ll create mode 100644 llvm/test/CodeGen/X86/combine-froundeven.ll create mode 100644 llvm/test/CodeGen/X86/combine-ftrunc.ll create mode 100644 llvm/test/CodeGen/X86/combine-rndscale.ll diff --git a/llvm/test/CodeGen/X86/combine-fceil.ll b/llvm/test/CodeGen/X86/combine-fceil.ll new file mode 100644 index 0000000000000..78f1476a49152 --- /dev/null +++ b/llvm/test/CodeGen/X86/combine-fceil.ll @@ -0,0 
+1,175 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mcpu=x86-64-v2 | FileCheck %s --check-prefixes=SSE +; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mcpu=sandybridge | FileCheck %s --check-prefixes=AVX,AVX1OR2 +; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mcpu=x86-64-v3 | FileCheck %s --check-prefixes=AVX,AVX1OR2 +; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mcpu=x86-64-v4 | FileCheck %s --check-prefixes=AVX,AVX512 + +define <4 x double> @concat_ceil_v4f64_v2f64(<2 x double> %a0, <2 x double> %a1) { +; SSE-LABEL: concat_ceil_v4f64_v2f64: +; SSE: # %bb.0: +; SSE-NEXT: roundpd $10, %xmm0, %xmm0 +; SSE-NEXT: roundpd $10, %xmm1, %xmm1 +; SSE-NEXT: retq +; +; AVX-LABEL: concat_ceil_v4f64_v2f64: +; AVX: # %bb.0: +; AVX-NEXT: vroundpd $10, %xmm0, %xmm0 +; AVX-NEXT: vroundpd $10, %xmm1, %xmm1 +; AVX-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 +; AVX-NEXT: retq + %v0 = call <2 x double> @llvm.ceil.v2f64(<2 x double> %a0) + %v1 = call <2 x double> @llvm.ceil.v2f64(<2 x double> %a1) + %res = shufflevector <2 x double> %v0, <2 x double> %v1, <4 x i32> + ret <4 x double> %res +} + +define <8 x float> @concat_ceil_v8f32_v4f32(<4 x float> %a0, <4 x float> %a1) { +; SSE-LABEL: concat_ceil_v8f32_v4f32: +; SSE: # %bb.0: +; SSE-NEXT: roundps $10, %xmm0, %xmm0 +; SSE-NEXT: roundps $10, %xmm1, %xmm1 +; SSE-NEXT: retq +; +; AVX-LABEL: concat_ceil_v8f32_v4f32: +; AVX: # %bb.0: +; AVX-NEXT: vroundps $10, %xmm0, %xmm0 +; AVX-NEXT: vroundps $10, %xmm1, %xmm1 +; AVX-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 +; AVX-NEXT: retq + %v0 = call <4 x float> @llvm.ceil.v4f32(<4 x float> %a0) + %v1 = call <4 x float> @llvm.ceil.v4f32(<4 x float> %a1) + %res = shufflevector <4 x float> %v0, <4 x float> %v1, <8 x i32> + ret <8 x float> %res +} + +define <8 x double> @concat_ceil_v8f64_v2f64(<2 x double> %a0, <2 x double> %a1, <2 x double> %a2, <2 x double> %a3) { +; SSE-LABEL: concat_ceil_v8f64_v2f64: +; 
SSE: # %bb.0: +; SSE-NEXT: roundpd $10, %xmm0, %xmm0 +; SSE-NEXT: roundpd $10, %xmm1, %xmm1 +; SSE-NEXT: roundpd $10, %xmm2, %xmm2 +; SSE-NEXT: roundpd $10, %xmm3, %xmm3 +; SSE-NEXT: retq +; +; AVX1OR2-LABEL: concat_ceil_v8f64_v2f64: +; AVX1OR2: # %bb.0: +; AVX1OR2-NEXT: vroundpd $10, %xmm0, %xmm0 +; AVX1OR2-NEXT: vroundpd $10, %xmm1, %xmm1 +; AVX1OR2-NEXT: vroundpd $10, %xmm2, %xmm2 +; AVX1OR2-NEXT: vroundpd $10, %xmm3, %xmm3 +; AVX1OR2-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 +; AVX1OR2-NEXT: vinsertf128 $1, %xmm3, %ymm2, %ymm1 +; AVX1OR2-NEXT: retq +; +; AVX512-LABEL: concat_ceil_v8f64_v2f64: +; AVX512: # %bb.0: +; AVX512-NEXT: vroundpd $10, %xmm0, %xmm0 +; AVX512-NEXT: vroundpd $10, %xmm1, %xmm1 +; AVX512-NEXT: vroundpd $10, %xmm2, %xmm2 +; AVX512-NEXT: vroundpd $10, %xmm3, %xmm3 +; AVX512-NEXT: vinsertf128 $1, %xmm3, %ymm2, %ymm2 +; AVX512-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 +; AVX512-NEXT: vinsertf64x4 $1, %ymm2, %zmm0, %zmm0 +; AVX512-NEXT: retq + %v0 = call <2 x double> @llvm.ceil.v2f64(<2 x double> %a0) + %v1 = call <2 x double> @llvm.ceil.v2f64(<2 x double> %a1) + %v2 = call <2 x double> @llvm.ceil.v2f64(<2 x double> %a2) + %v3 = call <2 x double> @llvm.ceil.v2f64(<2 x double> %a3) + %r01 = shufflevector <2 x double> %v0, <2 x double> %v1, <4 x i32> + %r23 = shufflevector <2 x double> %v2, <2 x double> %v3, <4 x i32> + %res = shufflevector <4 x double> %r01, <4 x double> %r23, <8 x i32> + ret <8 x double> %res +} + +define <16 x float> @concat_ceil_v16f32_v4f32(<4 x float> %a0, <4 x float> %a1, <4 x float> %a2, <4 x float> %a3) { +; SSE-LABEL: concat_ceil_v16f32_v4f32: +; SSE: # %bb.0: +; SSE-NEXT: roundps $10, %xmm0, %xmm0 +; SSE-NEXT: roundps $10, %xmm1, %xmm1 +; SSE-NEXT: roundps $10, %xmm2, %xmm2 +; SSE-NEXT: roundps $10, %xmm3, %xmm3 +; SSE-NEXT: retq +; +; AVX1OR2-LABEL: concat_ceil_v16f32_v4f32: +; AVX1OR2: # %bb.0: +; AVX1OR2-NEXT: vroundps $10, %xmm0, %xmm0 +; AVX1OR2-NEXT: vroundps $10, %xmm1, %xmm1 +; AVX1OR2-NEXT: vroundps $10, %xmm2, 
%xmm2 +; AVX1OR2-NEXT: vroundps $10, %xmm3, %xmm3 +; AVX1OR2-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 +; AVX1OR2-NEXT: vinsertf128 $1, %xmm3, %ymm2, %ymm1 +; AVX1OR2-NEXT: retq +; +; AVX512-LABEL: concat_ceil_v16f32_v4f32: +; AVX512: # %bb.0: +; AVX512-NEXT: vroundps $10, %xmm0, %xmm0 +; AVX512-NEXT: vroundps $10, %xmm1, %xmm1 +; AVX512-NEXT: vroundps $10, %xmm2, %xmm2 +; AVX512-NEXT: vroundps $10, %xmm3, %xmm3 +; AVX512-NEXT: vinsertf128 $1, %xmm3, %ymm2, %ymm2 +; AVX512-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 +; AVX512-NEXT: vinsertf64x4 $1, %ymm2, %zmm0, %zmm0 +; AVX512-NEXT: retq + %v0 = call <4 x float> @llvm.ceil.v4f32(<4 x float> %a0) + %v1 = call <4 x float> @llvm.ceil.v4f32(<4 x float> %a1) + %v2 = call <4 x float> @llvm.ceil.v4f32(<4 x float> %a2) + %v3 = call <4 x float> @llvm.ceil.v4f32(<4 x float> %a3) + %r01 = shufflevector <4 x float> %v0, <4 x float> %v1, <8 x i32> + %r23 = shufflevector <4 x float> %v2, <4 x float> %v3, <8 x i32> + %res = shufflevector <8 x float> %r01, <8 x float> %r23, <16 x i32> + ret <16 x float> %res +} + +define <8 x double> @concat_ceil_v8f64_v4f64(<4 x double> %a0, <4 x double> %a1) { +; SSE-LABEL: concat_ceil_v8f64_v4f64: +; SSE: # %bb.0: +; SSE-NEXT: roundpd $10, %xmm0, %xmm0 +; SSE-NEXT: roundpd $10, %xmm1, %xmm1 +; SSE-NEXT: roundpd $10, %xmm2, %xmm2 +; SSE-NEXT: roundpd $10, %xmm3, %xmm3 +; SSE-NEXT: retq +; +; AVX1OR2-LABEL: concat_ceil_v8f64_v4f64: +; AVX1OR2: # %bb.0: +; AVX1OR2-NEXT: vroundpd $10, %ymm0, %ymm0 +; AVX1OR2-NEXT: vroundpd $10, %ymm1, %ymm1 +; AVX1OR2-NEXT: retq +; +; AVX512-LABEL: concat_ceil_v8f64_v4f64: +; AVX512: # %bb.0: +; AVX512-NEXT: vroundpd $10, %ymm0, %ymm0 +; AVX512-NEXT: vroundpd $10, %ymm1, %ymm1 +; AVX512-NEXT: vinsertf64x4 $1, %ymm1, %zmm0, %zmm0 +; AVX512-NEXT: retq + %v0 = call <4 x double> @llvm.ceil.v4f64(<4 x double> %a0) + %v1 = call <4 x double> @llvm.ceil.v4f64(<4 x double> %a1) + %res = shufflevector <4 x double> %v0, <4 x double> %v1, <8 x i32> + ret <8 x double> %res +} + 
+define <16 x float> @concat_ceil_v16f32_v8f32(<8 x float> %a0, <8 x float> %a1) { +; SSE-LABEL: concat_ceil_v16f32_v8f32: +; SSE: # %bb.0: +; SSE-NEXT: roundps $10, %xmm0, %xmm0 +; SSE-NEXT: roundps $10, %xmm1, %xmm1 +; SSE-NEXT: roundps $10, %xmm2, %xmm2 +; SSE-NEXT: roundps $10, %xmm3, %xmm3 +; SSE-NEXT: retq +; +; AVX1OR2-LABEL: concat_ceil_v16f32_v8f32: +; AVX1OR2: # %bb.0: +; AVX1OR2-NEXT: vroundps $10, %ymm0, %ymm0 +; AVX1OR2-NEXT: vroundps $10, %ymm1, %ymm1 +; AVX1OR2-NEXT: retq +; +; AVX512-LABEL: concat_ceil_v16f32_v8f32: +; AVX512: # %bb.0: +; AVX512-NEXT: vroundps $10, %ymm0, %ymm0 +; AVX512-NEXT: vroundps $10, %ymm1, %ymm1 +; AVX512-NEXT: vinsertf64x4 $1, %ymm1, %zmm0, %zmm0 +; AVX512-NEXT: retq + %v0 = call <8 x float> @llvm.ceil.v8f32(<8 x float> %a0) + %v1 = call <8 x float> @llvm.ceil.v8f32(<8 x float> %a1) + %res = shufflevector <8 x float> %v0, <8 x float> %v1, <16 x i32> + ret <16 x float> %res +} diff --git a/llvm/test/CodeGen/X86/combine-fnearbyint.ll b/llvm/test/CodeGen/X86/combine-fnearbyint.ll new file mode 100644 index 0000000000000..14d1017aec630 --- /dev/null +++ b/llvm/test/CodeGen/X86/combine-fnearbyint.ll @@ -0,0 +1,175 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mcpu=x86-64-v2 | FileCheck %s --check-prefixes=SSE +; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mcpu=sandybridge | FileCheck %s --check-prefixes=AVX,AVX1OR2 +; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mcpu=x86-64-v3 | FileCheck %s --check-prefixes=AVX,AVX1OR2 +; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mcpu=x86-64-v4 | FileCheck %s --check-prefixes=AVX,AVX512 + +define <4 x double> @concat_nearbyint_v4f64_v2f64(<2 x double> %a0, <2 x double> %a1) { +; SSE-LABEL: concat_nearbyint_v4f64_v2f64: +; SSE: # %bb.0: +; SSE-NEXT: roundpd $12, %xmm0, %xmm0 +; SSE-NEXT: roundpd $12, %xmm1, %xmm1 +; SSE-NEXT: retq +; +; AVX-LABEL: concat_nearbyint_v4f64_v2f64: +; AVX: # %bb.0: +; 
AVX-NEXT: vroundpd $12, %xmm0, %xmm0 +; AVX-NEXT: vroundpd $12, %xmm1, %xmm1 +; AVX-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 +; AVX-NEXT: retq + %v0 = call <2 x double> @llvm.nearbyint.v2f64(<2 x double> %a0) + %v1 = call <2 x double> @llvm.nearbyint.v2f64(<2 x double> %a1) + %res = shufflevector <2 x double> %v0, <2 x double> %v1, <4 x i32> + ret <4 x double> %res +} + +define <8 x float> @concat_nearbyint_v8f32_v4f32(<4 x float> %a0, <4 x float> %a1) { +; SSE-LABEL: concat_nearbyint_v8f32_v4f32: +; SSE: # %bb.0: +; SSE-NEXT: roundps $12, %xmm0, %xmm0 +; SSE-NEXT: roundps $12, %xmm1, %xmm1 +; SSE-NEXT: retq +; +; AVX-LABEL: concat_nearbyint_v8f32_v4f32: +; AVX: # %bb.0: +; AVX-NEXT: vroundps $12, %xmm0, %xmm0 +; AVX-NEXT: vroundps $12, %xmm1, %xmm1 +; AVX-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 +; AVX-NEXT: retq + %v0 = call <4 x float> @llvm.nearbyint.v4f32(<4 x float> %a0) + %v1 = call <4 x float> @llvm.nearbyint.v4f32(<4 x float> %a1) + %res = shufflevector <4 x float> %v0, <4 x float> %v1, <8 x i32> + ret <8 x float> %res +} + +define <8 x double> @concat_nearbyint_v8f64_v2f64(<2 x double> %a0, <2 x double> %a1, <2 x double> %a2, <2 x double> %a3) { +; SSE-LABEL: concat_nearbyint_v8f64_v2f64: +; SSE: # %bb.0: +; SSE-NEXT: roundpd $12, %xmm0, %xmm0 +; SSE-NEXT: roundpd $12, %xmm1, %xmm1 +; SSE-NEXT: roundpd $12, %xmm2, %xmm2 +; SSE-NEXT: roundpd $12, %xmm3, %xmm3 +; SSE-NEXT: retq +; +; AVX1OR2-LABEL: concat_nearbyint_v8f64_v2f64: +; AVX1OR2: # %bb.0: +; AVX1OR2-NEXT: vroundpd $12, %xmm0, %xmm0 +; AVX1OR2-NEXT: vroundpd $12, %xmm1, %xmm1 +; AVX1OR2-NEXT: vroundpd $12, %xmm2, %xmm2 +; AVX1OR2-NEXT: vroundpd $12, %xmm3, %xmm3 +; AVX1OR2-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 +; AVX1OR2-NEXT: vinsertf128 $1, %xmm3, %ymm2, %ymm1 +; AVX1OR2-NEXT: retq +; +; AVX512-LABEL: concat_nearbyint_v8f64_v2f64: +; AVX512: # %bb.0: +; AVX512-NEXT: vroundpd $12, %xmm0, %xmm0 +; AVX512-NEXT: vroundpd $12, %xmm1, %xmm1 +; AVX512-NEXT: vroundpd $12, %xmm2, %xmm2 +; 
AVX512-NEXT: vroundpd $12, %xmm3, %xmm3 +; AVX512-NEXT: vinsertf128 $1, %xmm3, %ymm2, %ymm2 +; AVX512-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 +; AVX512-NEXT: vinsertf64x4 $1, %ymm2, %zmm0, %zmm0 +; AVX512-NEXT: retq + %v0 = call <2 x double> @llvm.nearbyint.v2f64(<2 x double> %a0) + %v1 = call <2 x double> @llvm.nearbyint.v2f64(<2 x double> %a1) + %v2 = call <2 x double> @llvm.nearbyint.v2f64(<2 x double> %a2) + %v3 = call <2 x double> @llvm.nearbyint.v2f64(<2 x double> %a3) + %r01 = shufflevector <2 x double> %v0, <2 x double> %v1, <4 x i32> + %r23 = shufflevector <2 x double> %v2, <2 x double> %v3, <4 x i32> + %res = shufflevector <4 x double> %r01, <4 x double> %r23, <8 x i32> + ret <8 x double> %res +} + +define <16 x float> @concat_nearbyint_v16f32_v4f32(<4 x float> %a0, <4 x float> %a1, <4 x float> %a2, <4 x float> %a3) { +; SSE-LABEL: concat_nearbyint_v16f32_v4f32: +; SSE: # %bb.0: +; SSE-NEXT: roundps $12, %xmm0, %xmm0 +; SSE-NEXT: roundps $12, %xmm1, %xmm1 +; SSE-NEXT: roundps $12, %xmm2, %xmm2 +; SSE-NEXT: roundps $12, %xmm3, %xmm3 +; SSE-NEXT: retq +; +; AVX1OR2-LABEL: concat_nearbyint_v16f32_v4f32: +; AVX1OR2: # %bb.0: +; AVX1OR2-NEXT: vroundps $12, %xmm0, %xmm0 +; AVX1OR2-NEXT: vroundps $12, %xmm1, %xmm1 +; AVX1OR2-NEXT: vroundps $12, %xmm2, %xmm2 +; AVX1OR2-NEXT: vroundps $12, %xmm3, %xmm3 +; AVX1OR2-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 +; AVX1OR2-NEXT: vinsertf128 $1, %xmm3, %ymm2, %ymm1 +; AVX1OR2-NEXT: retq +; +; AVX512-LABEL: concat_nearbyint_v16f32_v4f32: +; AVX512: # %bb.0: +; AVX512-NEXT: vroundps $12, %xmm0, %xmm0 +; AVX512-NEXT: vroundps $12, %xmm1, %xmm1 +; AVX512-NEXT: vroundps $12, %xmm2, %xmm2 +; AVX512-NEXT: vroundps $12, %xmm3, %xmm3 +; AVX512-NEXT: vinsertf128 $1, %xmm3, %ymm2, %ymm2 +; AVX512-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 +; AVX512-NEXT: vinsertf64x4 $1, %ymm2, %zmm0, %zmm0 +; AVX512-NEXT: retq + %v0 = call <4 x float> @llvm.nearbyint.v4f32(<4 x float> %a0) + %v1 = call <4 x float> @llvm.nearbyint.v4f32(<4 x float> 
%a1) + %v2 = call <4 x float> @llvm.nearbyint.v4f32(<4 x float> %a2) + %v3 = call <4 x float> @llvm.nearbyint.v4f32(<4 x float> %a3) + %r01 = shufflevector <4 x float> %v0, <4 x float> %v1, <8 x i32> + %r23 = shufflevector <4 x float> %v2, <4 x float> %v3, <8 x i32> + %res = shufflevector <8 x float> %r01, <8 x float> %r23, <16 x i32> + ret <16 x float> %res +} + +define <8 x double> @concat_nearbyint_v8f64_v4f64(<4 x double> %a0, <4 x double> %a1) { +; SSE-LABEL: concat_nearbyint_v8f64_v4f64: +; SSE: # %bb.0: +; SSE-NEXT: roundpd $12, %xmm0, %xmm0 +; SSE-NEXT: roundpd $12, %xmm1, %xmm1 +; SSE-NEXT: roundpd $12, %xmm2, %xmm2 +; SSE-NEXT: roundpd $12, %xmm3, %xmm3 +; SSE-NEXT: retq +; +; AVX1OR2-LABEL: concat_nearbyint_v8f64_v4f64: +; AVX1OR2: # %bb.0: +; AVX1OR2-NEXT: vroundpd $12, %ymm0, %ymm0 +; AVX1OR2-NEXT: vroundpd $12, %ymm1, %ymm1 +; AVX1OR2-NEXT: retq +; +; AVX512-LABEL: concat_nearbyint_v8f64_v4f64: +; AVX512: # %bb.0: +; AVX512-NEXT: vroundpd $12, %ymm0, %ymm0 +; AVX512-NEXT: vroundpd $12, %ymm1, %ymm1 +; AVX512-NEXT: vinsertf64x4 $1, %ymm1, %zmm0, %zmm0 +; AVX512-NEXT: retq + %v0 = call <4 x double> @llvm.nearbyint.v4f64(<4 x double> %a0) + %v1 = call <4 x double> @llvm.nearbyint.v4f64(<4 x double> %a1) + %res = shufflevector <4 x double> %v0, <4 x double> %v1, <8 x i32> + ret <8 x double> %res +} + +define <16 x float> @concat_nearbyint_v16f32_v8f32(<8 x float> %a0, <8 x float> %a1) { +; SSE-LABEL: concat_nearbyint_v16f32_v8f32: +; SSE: # %bb.0: +; SSE-NEXT: roundps $12, %xmm0, %xmm0 +; SSE-NEXT: roundps $12, %xmm1, %xmm1 +; SSE-NEXT: roundps $12, %xmm2, %xmm2 +; SSE-NEXT: roundps $12, %xmm3, %xmm3 +; SSE-NEXT: retq +; +; AVX1OR2-LABEL: concat_nearbyint_v16f32_v8f32: +; AVX1OR2: # %bb.0: +; AVX1OR2-NEXT: vroundps $12, %ymm0, %ymm0 +; AVX1OR2-NEXT: vroundps $12, %ymm1, %ymm1 +; AVX1OR2-NEXT: retq +; +; AVX512-LABEL: concat_nearbyint_v16f32_v8f32: +; AVX512: # %bb.0: +; AVX512-NEXT: vroundps $12, %ymm0, %ymm0 +; AVX512-NEXT: vroundps $12, %ymm1, %ymm1 +; 
AVX512-NEXT: vinsertf64x4 $1, %ymm1, %zmm0, %zmm0 +; AVX512-NEXT: retq + %v0 = call <8 x float> @llvm.nearbyint.v8f32(<8 x float> %a0) + %v1 = call <8 x float> @llvm.nearbyint.v8f32(<8 x float> %a1) + %res = shufflevector <8 x float> %v0, <8 x float> %v1, <16 x i32> + ret <16 x float> %res +} diff --git a/llvm/test/CodeGen/X86/combine-frint.ll b/llvm/test/CodeGen/X86/combine-frint.ll new file mode 100644 index 0000000000000..901ce2c1f0d82 --- /dev/null +++ b/llvm/test/CodeGen/X86/combine-frint.ll @@ -0,0 +1,175 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mcpu=x86-64-v2 | FileCheck %s --check-prefixes=SSE +; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mcpu=sandybridge | FileCheck %s --check-prefixes=AVX,AVX1OR2 +; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mcpu=x86-64-v3 | FileCheck %s --check-prefixes=AVX,AVX1OR2 +; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mcpu=x86-64-v4 | FileCheck %s --check-prefixes=AVX,AVX512 + +define <4 x double> @concat_rint_v4f64_v2f64(<2 x double> %a0, <2 x double> %a1) { +; SSE-LABEL: concat_rint_v4f64_v2f64: +; SSE: # %bb.0: +; SSE-NEXT: roundpd $4, %xmm0, %xmm0 +; SSE-NEXT: roundpd $4, %xmm1, %xmm1 +; SSE-NEXT: retq +; +; AVX-LABEL: concat_rint_v4f64_v2f64: +; AVX: # %bb.0: +; AVX-NEXT: vroundpd $4, %xmm0, %xmm0 +; AVX-NEXT: vroundpd $4, %xmm1, %xmm1 +; AVX-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 +; AVX-NEXT: retq + %v0 = call <2 x double> @llvm.rint.v2f64(<2 x double> %a0) + %v1 = call <2 x double> @llvm.rint.v2f64(<2 x double> %a1) + %res = shufflevector <2 x double> %v0, <2 x double> %v1, <4 x i32> + ret <4 x double> %res +} + +define <8 x float> @concat_rint_v8f32_v4f32(<4 x float> %a0, <4 x float> %a1) { +; SSE-LABEL: concat_rint_v8f32_v4f32: +; SSE: # %bb.0: +; SSE-NEXT: roundps $4, %xmm0, %xmm0 +; SSE-NEXT: roundps $4, %xmm1, %xmm1 +; SSE-NEXT: retq +; +; AVX-LABEL: concat_rint_v8f32_v4f32: +; AVX: # %bb.0: +; AVX-NEXT: vroundps 
$4, %xmm0, %xmm0 +; AVX-NEXT: vroundps $4, %xmm1, %xmm1 +; AVX-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 +; AVX-NEXT: retq + %v0 = call <4 x float> @llvm.rint.v4f32(<4 x float> %a0) + %v1 = call <4 x float> @llvm.rint.v4f32(<4 x float> %a1) + %res = shufflevector <4 x float> %v0, <4 x float> %v1, <8 x i32> + ret <8 x float> %res +} + +define <8 x double> @concat_rint_v8f64_v2f64(<2 x double> %a0, <2 x double> %a1, <2 x double> %a2, <2 x double> %a3) { +; SSE-LABEL: concat_rint_v8f64_v2f64: +; SSE: # %bb.0: +; SSE-NEXT: roundpd $4, %xmm0, %xmm0 +; SSE-NEXT: roundpd $4, %xmm1, %xmm1 +; SSE-NEXT: roundpd $4, %xmm2, %xmm2 +; SSE-NEXT: roundpd $4, %xmm3, %xmm3 +; SSE-NEXT: retq +; +; AVX1OR2-LABEL: concat_rint_v8f64_v2f64: +; AVX1OR2: # %bb.0: +; AVX1OR2-NEXT: vroundpd $4, %xmm0, %xmm0 +; AVX1OR2-NEXT: vroundpd $4, %xmm1, %xmm1 +; AVX1OR2-NEXT: vroundpd $4, %xmm2, %xmm2 +; AVX1OR2-NEXT: vroundpd $4, %xmm3, %xmm3 +; AVX1OR2-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 +; AVX1OR2-NEXT: vinsertf128 $1, %xmm3, %ymm2, %ymm1 +; AVX1OR2-NEXT: retq +; +; AVX512-LABEL: concat_rint_v8f64_v2f64: +; AVX512: # %bb.0: +; AVX512-NEXT: vroundpd $4, %xmm0, %xmm0 +; AVX512-NEXT: vroundpd $4, %xmm1, %xmm1 +; AVX512-NEXT: vroundpd $4, %xmm2, %xmm2 +; AVX512-NEXT: vroundpd $4, %xmm3, %xmm3 +; AVX512-NEXT: vinsertf128 $1, %xmm3, %ymm2, %ymm2 +; AVX512-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 +; AVX512-NEXT: vinsertf64x4 $1, %ymm2, %zmm0, %zmm0 +; AVX512-NEXT: retq + %v0 = call <2 x double> @llvm.rint.v2f64(<2 x double> %a0) + %v1 = call <2 x double> @llvm.rint.v2f64(<2 x double> %a1) + %v2 = call <2 x double> @llvm.rint.v2f64(<2 x double> %a2) + %v3 = call <2 x double> @llvm.rint.v2f64(<2 x double> %a3) + %r01 = shufflevector <2 x double> %v0, <2 x double> %v1, <4 x i32> + %r23 = shufflevector <2 x double> %v2, <2 x double> %v3, <4 x i32> + %res = shufflevector <4 x double> %r01, <4 x double> %r23, <8 x i32> + ret <8 x double> %res +} + +define <16 x float> @concat_rint_v16f32_v4f32(<4 x 
float> %a0, <4 x float> %a1, <4 x float> %a2, <4 x float> %a3) { +; SSE-LABEL: concat_rint_v16f32_v4f32: +; SSE: # %bb.0: +; SSE-NEXT: roundps $4, %xmm0, %xmm0 +; SSE-NEXT: roundps $4, %xmm1, %xmm1 +; SSE-NEXT: roundps $4, %xmm2, %xmm2 +; SSE-NEXT: roundps $4, %xmm3, %xmm3 +; SSE-NEXT: retq +; +; AVX1OR2-LABEL: concat_rint_v16f32_v4f32: +; AVX1OR2: # %bb.0: +; AVX1OR2-NEXT: vroundps $4, %xmm0, %xmm0 +; AVX1OR2-NEXT: vroundps $4, %xmm1, %xmm1 +; AVX1OR2-NEXT: vroundps $4, %xmm2, %xmm2 +; AVX1OR2-NEXT: vroundps $4, %xmm3, %xmm3 +; AVX1OR2-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 +; AVX1OR2-NEXT: vinsertf128 $1, %xmm3, %ymm2, %ymm1 +; AVX1OR2-NEXT: retq +; +; AVX512-LABEL: concat_rint_v16f32_v4f32: +; AVX512: # %bb.0: +; AVX512-NEXT: vroundps $4, %xmm0, %xmm0 +; AVX512-NEXT: vroundps $4, %xmm1, %xmm1 +; AVX512-NEXT: vroundps $4, %xmm2, %xmm2 +; AVX512-NEXT: vroundps $4, %xmm3, %xmm3 +; AVX512-NEXT: vinsertf128 $1, %xmm3, %ymm2, %ymm2 +; AVX512-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 +; AVX512-NEXT: vinsertf64x4 $1, %ymm2, %zmm0, %zmm0 +; AVX512-NEXT: retq + %v0 = call <4 x float> @llvm.rint.v4f32(<4 x float> %a0) + %v1 = call <4 x float> @llvm.rint.v4f32(<4 x float> %a1) + %v2 = call <4 x float> @llvm.rint.v4f32(<4 x float> %a2) + %v3 = call <4 x float> @llvm.rint.v4f32(<4 x float> %a3) + %r01 = shufflevector <4 x float> %v0, <4 x float> %v1, <8 x i32> + %r23 = shufflevector <4 x float> %v2, <4 x float> %v3, <8 x i32> + %res = shufflevector <8 x float> %r01, <8 x float> %r23, <16 x i32> + ret <16 x float> %res +} + +define <8 x double> @concat_rint_v8f64_v4f64(<4 x double> %a0, <4 x double> %a1) { +; SSE-LABEL: concat_rint_v8f64_v4f64: +; SSE: # %bb.0: +; SSE-NEXT: roundpd $4, %xmm0, %xmm0 +; SSE-NEXT: roundpd $4, %xmm1, %xmm1 +; SSE-NEXT: roundpd $4, %xmm2, %xmm2 +; SSE-NEXT: roundpd $4, %xmm3, %xmm3 +; SSE-NEXT: retq +; +; AVX1OR2-LABEL: concat_rint_v8f64_v4f64: +; AVX1OR2: # %bb.0: +; AVX1OR2-NEXT: vroundpd $4, %ymm0, %ymm0 +; AVX1OR2-NEXT: vroundpd $4, %ymm1, 
%ymm1 +; AVX1OR2-NEXT: retq +; +; AVX512-LABEL: concat_rint_v8f64_v4f64: +; AVX512: # %bb.0: +; AVX512-NEXT: vroundpd $4, %ymm0, %ymm0 +; AVX512-NEXT: vroundpd $4, %ymm1, %ymm1 +; AVX512-NEXT: vinsertf64x4 $1, %ymm1, %zmm0, %zmm0 +; AVX512-NEXT: retq + %v0 = call <4 x double> @llvm.rint.v4f64(<4 x double> %a0) + %v1 = call <4 x double> @llvm.rint.v4f64(<4 x double> %a1) + %res = shufflevector <4 x double> %v0, <4 x double> %v1, <8 x i32> + ret <8 x double> %res +} + +define <16 x float> @concat_rint_v16f32_v8f32(<8 x float> %a0, <8 x float> %a1) { +; SSE-LABEL: concat_rint_v16f32_v8f32: +; SSE: # %bb.0: +; SSE-NEXT: roundps $4, %xmm0, %xmm0 +; SSE-NEXT: roundps $4, %xmm1, %xmm1 +; SSE-NEXT: roundps $4, %xmm2, %xmm2 +; SSE-NEXT: roundps $4, %xmm3, %xmm3 +; SSE-NEXT: retq +; +; AVX1OR2-LABEL: concat_rint_v16f32_v8f32: +; AVX1OR2: # %bb.0: +; AVX1OR2-NEXT: vroundps $4, %ymm0, %ymm0 +; AVX1OR2-NEXT: vroundps $4, %ymm1, %ymm1 +; AVX1OR2-NEXT: retq +; +; AVX512-LABEL: concat_rint_v16f32_v8f32: +; AVX512: # %bb.0: +; AVX512-NEXT: vroundps $4, %ymm0, %ymm0 +; AVX512-NEXT: vroundps $4, %ymm1, %ymm1 +; AVX512-NEXT: vinsertf64x4 $1, %ymm1, %zmm0, %zmm0 +; AVX512-NEXT: retq + %v0 = call <8 x float> @llvm.rint.v8f32(<8 x float> %a0) + %v1 = call <8 x float> @llvm.rint.v8f32(<8 x float> %a1) + %res = shufflevector <8 x float> %v0, <8 x float> %v1, <16 x i32> + ret <16 x float> %res +} diff --git a/llvm/test/CodeGen/X86/combine-froundeven.ll b/llvm/test/CodeGen/X86/combine-froundeven.ll new file mode 100644 index 0000000000000..484e3a9680450 --- /dev/null +++ b/llvm/test/CodeGen/X86/combine-froundeven.ll @@ -0,0 +1,175 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mcpu=x86-64-v2 | FileCheck %s --check-prefixes=SSE +; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mcpu=sandybridge | FileCheck %s --check-prefixes=AVX,AVX1OR2 +; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mcpu=x86-64-v3 | 
FileCheck %s --check-prefixes=AVX,AVX1OR2 +; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mcpu=x86-64-v4 | FileCheck %s --check-prefixes=AVX,AVX512 + +define <4 x double> @concat_roundeven_v4f64_v2f64(<2 x double> %a0, <2 x double> %a1) { +; SSE-LABEL: concat_roundeven_v4f64_v2f64: +; SSE: # %bb.0: +; SSE-NEXT: roundpd $8, %xmm0, %xmm0 +; SSE-NEXT: roundpd $8, %xmm1, %xmm1 +; SSE-NEXT: retq +; +; AVX-LABEL: concat_roundeven_v4f64_v2f64: +; AVX: # %bb.0: +; AVX-NEXT: vroundpd $8, %xmm0, %xmm0 +; AVX-NEXT: vroundpd $8, %xmm1, %xmm1 +; AVX-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 +; AVX-NEXT: retq + %v0 = call <2 x double> @llvm.roundeven.v2f64(<2 x double> %a0) + %v1 = call <2 x double> @llvm.roundeven.v2f64(<2 x double> %a1) + %res = shufflevector <2 x double> %v0, <2 x double> %v1, <4 x i32> + ret <4 x double> %res +} + +define <8 x float> @concat_roundeven_v8f32_v4f32(<4 x float> %a0, <4 x float> %a1) { +; SSE-LABEL: concat_roundeven_v8f32_v4f32: +; SSE: # %bb.0: +; SSE-NEXT: roundps $8, %xmm0, %xmm0 +; SSE-NEXT: roundps $8, %xmm1, %xmm1 +; SSE-NEXT: retq +; +; AVX-LABEL: concat_roundeven_v8f32_v4f32: +; AVX: # %bb.0: +; AVX-NEXT: vroundps $8, %xmm0, %xmm0 +; AVX-NEXT: vroundps $8, %xmm1, %xmm1 +; AVX-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 +; AVX-NEXT: retq + %v0 = call <4 x float> @llvm.roundeven.v4f32(<4 x float> %a0) + %v1 = call <4 x float> @llvm.roundeven.v4f32(<4 x float> %a1) + %res = shufflevector <4 x float> %v0, <4 x float> %v1, <8 x i32> + ret <8 x float> %res +} + +define <8 x double> @concat_roundeven_v8f64_v2f64(<2 x double> %a0, <2 x double> %a1, <2 x double> %a2, <2 x double> %a3) { +; SSE-LABEL: concat_roundeven_v8f64_v2f64: +; SSE: # %bb.0: +; SSE-NEXT: roundpd $8, %xmm0, %xmm0 +; SSE-NEXT: roundpd $8, %xmm1, %xmm1 +; SSE-NEXT: roundpd $8, %xmm2, %xmm2 +; SSE-NEXT: roundpd $8, %xmm3, %xmm3 +; SSE-NEXT: retq +; +; AVX1OR2-LABEL: concat_roundeven_v8f64_v2f64: +; AVX1OR2: # %bb.0: +; AVX1OR2-NEXT: vroundpd $8, %xmm0, %xmm0 +; AVX1OR2-NEXT: 
vroundpd $8, %xmm1, %xmm1 +; AVX1OR2-NEXT: vroundpd $8, %xmm2, %xmm2 +; AVX1OR2-NEXT: vroundpd $8, %xmm3, %xmm3 +; AVX1OR2-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 +; AVX1OR2-NEXT: vinsertf128 $1, %xmm3, %ymm2, %ymm1 +; AVX1OR2-NEXT: retq +; +; AVX512-LABEL: concat_roundeven_v8f64_v2f64: +; AVX512: # %bb.0: +; AVX512-NEXT: vroundpd $8, %xmm0, %xmm0 +; AVX512-NEXT: vroundpd $8, %xmm1, %xmm1 +; AVX512-NEXT: vroundpd $8, %xmm2, %xmm2 +; AVX512-NEXT: vroundpd $8, %xmm3, %xmm3 +; AVX512-NEXT: vinsertf128 $1, %xmm3, %ymm2, %ymm2 +; AVX512-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 +; AVX512-NEXT: vinsertf64x4 $1, %ymm2, %zmm0, %zmm0 +; AVX512-NEXT: retq + %v0 = call <2 x double> @llvm.roundeven.v2f64(<2 x double> %a0) + %v1 = call <2 x double> @llvm.roundeven.v2f64(<2 x double> %a1) + %v2 = call <2 x double> @llvm.roundeven.v2f64(<2 x double> %a2) + %v3 = call <2 x double> @llvm.roundeven.v2f64(<2 x double> %a3) + %r01 = shufflevector <2 x double> %v0, <2 x double> %v1, <4 x i32> + %r23 = shufflevector <2 x double> %v2, <2 x double> %v3, <4 x i32> + %res = shufflevector <4 x double> %r01, <4 x double> %r23, <8 x i32> + ret <8 x double> %res +} + +define <16 x float> @concat_roundeven_v16f32_v4f32(<4 x float> %a0, <4 x float> %a1, <4 x float> %a2, <4 x float> %a3) { +; SSE-LABEL: concat_roundeven_v16f32_v4f32: +; SSE: # %bb.0: +; SSE-NEXT: roundps $8, %xmm0, %xmm0 +; SSE-NEXT: roundps $8, %xmm1, %xmm1 +; SSE-NEXT: roundps $8, %xmm2, %xmm2 +; SSE-NEXT: roundps $8, %xmm3, %xmm3 +; SSE-NEXT: retq +; +; AVX1OR2-LABEL: concat_roundeven_v16f32_v4f32: +; AVX1OR2: # %bb.0: +; AVX1OR2-NEXT: vroundps $8, %xmm0, %xmm0 +; AVX1OR2-NEXT: vroundps $8, %xmm1, %xmm1 +; AVX1OR2-NEXT: vroundps $8, %xmm2, %xmm2 +; AVX1OR2-NEXT: vroundps $8, %xmm3, %xmm3 +; AVX1OR2-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 +; AVX1OR2-NEXT: vinsertf128 $1, %xmm3, %ymm2, %ymm1 +; AVX1OR2-NEXT: retq +; +; AVX512-LABEL: concat_roundeven_v16f32_v4f32: +; AVX512: # %bb.0: +; AVX512-NEXT: vroundps $8, %xmm0, %xmm0 
+; AVX512-NEXT: vroundps $8, %xmm1, %xmm1 +; AVX512-NEXT: vroundps $8, %xmm2, %xmm2 +; AVX512-NEXT: vroundps $8, %xmm3, %xmm3 +; AVX512-NEXT: vinsertf128 $1, %xmm3, %ymm2, %ymm2 +; AVX512-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 +; AVX512-NEXT: vinsertf64x4 $1, %ymm2, %zmm0, %zmm0 +; AVX512-NEXT: retq + %v0 = call <4 x float> @llvm.roundeven.v4f32(<4 x float> %a0) + %v1 = call <4 x float> @llvm.roundeven.v4f32(<4 x float> %a1) + %v2 = call <4 x float> @llvm.roundeven.v4f32(<4 x float> %a2) + %v3 = call <4 x float> @llvm.roundeven.v4f32(<4 x float> %a3) + %r01 = shufflevector <4 x float> %v0, <4 x float> %v1, <8 x i32> + %r23 = shufflevector <4 x float> %v2, <4 x float> %v3, <8 x i32> + %res = shufflevector <8 x float> %r01, <8 x float> %r23, <16 x i32> + ret <16 x float> %res +} + +define <8 x double> @concat_roundeven_v8f64_v4f64(<4 x double> %a0, <4 x double> %a1) { +; SSE-LABEL: concat_roundeven_v8f64_v4f64: +; SSE: # %bb.0: +; SSE-NEXT: roundpd $8, %xmm0, %xmm0 +; SSE-NEXT: roundpd $8, %xmm1, %xmm1 +; SSE-NEXT: roundpd $8, %xmm2, %xmm2 +; SSE-NEXT: roundpd $8, %xmm3, %xmm3 +; SSE-NEXT: retq +; +; AVX1OR2-LABEL: concat_roundeven_v8f64_v4f64: +; AVX1OR2: # %bb.0: +; AVX1OR2-NEXT: vroundpd $8, %ymm0, %ymm0 +; AVX1OR2-NEXT: vroundpd $8, %ymm1, %ymm1 +; AVX1OR2-NEXT: retq +; +; AVX512-LABEL: concat_roundeven_v8f64_v4f64: +; AVX512: # %bb.0: +; AVX512-NEXT: vroundpd $8, %ymm0, %ymm0 +; AVX512-NEXT: vroundpd $8, %ymm1, %ymm1 +; AVX512-NEXT: vinsertf64x4 $1, %ymm1, %zmm0, %zmm0 +; AVX512-NEXT: retq + %v0 = call <4 x double> @llvm.roundeven.v4f64(<4 x double> %a0) + %v1 = call <4 x double> @llvm.roundeven.v4f64(<4 x double> %a1) + %res = shufflevector <4 x double> %v0, <4 x double> %v1, <8 x i32> + ret <8 x double> %res +} + +define <16 x float> @concat_roundeven_v16f32_v8f32(<8 x float> %a0, <8 x float> %a1) { +; SSE-LABEL: concat_roundeven_v16f32_v8f32: +; SSE: # %bb.0: +; SSE-NEXT: roundps $8, %xmm0, %xmm0 +; SSE-NEXT: roundps $8, %xmm1, %xmm1 +; SSE-NEXT: roundps 
$8, %xmm2, %xmm2 +; SSE-NEXT: roundps $8, %xmm3, %xmm3 +; SSE-NEXT: retq +; +; AVX1OR2-LABEL: concat_roundeven_v16f32_v8f32: +; AVX1OR2: # %bb.0: +; AVX1OR2-NEXT: vroundps $8, %ymm0, %ymm0 +; AVX1OR2-NEXT: vroundps $8, %ymm1, %ymm1 +; AVX1OR2-NEXT: retq +; +; AVX512-LABEL: concat_roundeven_v16f32_v8f32: +; AVX512: # %bb.0: +; AVX512-NEXT: vroundps $8, %ymm0, %ymm0 +; AVX512-NEXT: vroundps $8, %ymm1, %ymm1 +; AVX512-NEXT: vinsertf64x4 $1, %ymm1, %zmm0, %zmm0 +; AVX512-NEXT: retq + %v0 = call <8 x float> @llvm.roundeven.v8f32(<8 x float> %a0) + %v1 = call <8 x float> @llvm.roundeven.v8f32(<8 x float> %a1) + %res = shufflevector <8 x float> %v0, <8 x float> %v1, <16 x i32> + ret <16 x float> %res +} diff --git a/llvm/test/CodeGen/X86/combine-ftrunc.ll b/llvm/test/CodeGen/X86/combine-ftrunc.ll new file mode 100644 index 0000000000000..a6c703a1cbeae --- /dev/null +++ b/llvm/test/CodeGen/X86/combine-ftrunc.ll @@ -0,0 +1,175 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mcpu=x86-64-v2 | FileCheck %s --check-prefixes=SSE +; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mcpu=sandybridge | FileCheck %s --check-prefixes=AVX,AVX1OR2 +; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mcpu=x86-64-v3 | FileCheck %s --check-prefixes=AVX,AVX1OR2 +; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mcpu=x86-64-v4 | FileCheck %s --check-prefixes=AVX,AVX512 + +define <4 x double> @concat_trunc_v4f64_v2f64(<2 x double> %a0, <2 x double> %a1) { +; SSE-LABEL: concat_trunc_v4f64_v2f64: +; SSE: # %bb.0: +; SSE-NEXT: roundpd $11, %xmm0, %xmm0 +; SSE-NEXT: roundpd $11, %xmm1, %xmm1 +; SSE-NEXT: retq +; +; AVX-LABEL: concat_trunc_v4f64_v2f64: +; AVX: # %bb.0: +; AVX-NEXT: vroundpd $11, %xmm0, %xmm0 +; AVX-NEXT: vroundpd $11, %xmm1, %xmm1 +; AVX-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 +; AVX-NEXT: retq + %v0 = call <2 x double> @llvm.trunc.v2f64(<2 x double> %a0) + %v1 = call <2 x double> @llvm.trunc.v2f64(<2 
x double> %a1) + %res = shufflevector <2 x double> %v0, <2 x double> %v1, <4 x i32> + ret <4 x double> %res +} + +define <8 x float> @concat_trunc_v8f32_v4f32(<4 x float> %a0, <4 x float> %a1) { +; SSE-LABEL: concat_trunc_v8f32_v4f32: +; SSE: # %bb.0: +; SSE-NEXT: roundps $11, %xmm0, %xmm0 +; SSE-NEXT: roundps $11, %xmm1, %xmm1 +; SSE-NEXT: retq +; +; AVX-LABEL: concat_trunc_v8f32_v4f32: +; AVX: # %bb.0: +; AVX-NEXT: vroundps $11, %xmm0, %xmm0 +; AVX-NEXT: vroundps $11, %xmm1, %xmm1 +; AVX-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 +; AVX-NEXT: retq + %v0 = call <4 x float> @llvm.trunc.v4f32(<4 x float> %a0) + %v1 = call <4 x float> @llvm.trunc.v4f32(<4 x float> %a1) + %res = shufflevector <4 x float> %v0, <4 x float> %v1, <8 x i32> + ret <8 x float> %res +} + +define <8 x double> @concat_trunc_v8f64_v2f64(<2 x double> %a0, <2 x double> %a1, <2 x double> %a2, <2 x double> %a3) { +; SSE-LABEL: concat_trunc_v8f64_v2f64: +; SSE: # %bb.0: +; SSE-NEXT: roundpd $11, %xmm0, %xmm0 +; SSE-NEXT: roundpd $11, %xmm1, %xmm1 +; SSE-NEXT: roundpd $11, %xmm2, %xmm2 +; SSE-NEXT: roundpd $11, %xmm3, %xmm3 +; SSE-NEXT: retq +; +; AVX1OR2-LABEL: concat_trunc_v8f64_v2f64: +; AVX1OR2: # %bb.0: +; AVX1OR2-NEXT: vroundpd $11, %xmm0, %xmm0 +; AVX1OR2-NEXT: vroundpd $11, %xmm1, %xmm1 +; AVX1OR2-NEXT: vroundpd $11, %xmm2, %xmm2 +; AVX1OR2-NEXT: vroundpd $11, %xmm3, %xmm3 +; AVX1OR2-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 +; AVX1OR2-NEXT: vinsertf128 $1, %xmm3, %ymm2, %ymm1 +; AVX1OR2-NEXT: retq +; +; AVX512-LABEL: concat_trunc_v8f64_v2f64: +; AVX512: # %bb.0: +; AVX512-NEXT: vroundpd $11, %xmm0, %xmm0 +; AVX512-NEXT: vroundpd $11, %xmm1, %xmm1 +; AVX512-NEXT: vroundpd $11, %xmm2, %xmm2 +; AVX512-NEXT: vroundpd $11, %xmm3, %xmm3 +; AVX512-NEXT: vinsertf128 $1, %xmm3, %ymm2, %ymm2 +; AVX512-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 +; AVX512-NEXT: vinsertf64x4 $1, %ymm2, %zmm0, %zmm0 +; AVX512-NEXT: retq + %v0 = call <2 x double> @llvm.trunc.v2f64(<2 x double> %a0) + %v1 = call <2 x double> 
@llvm.trunc.v2f64(<2 x double> %a1) + %v2 = call <2 x double> @llvm.trunc.v2f64(<2 x double> %a2) + %v3 = call <2 x double> @llvm.trunc.v2f64(<2 x double> %a3) + %r01 = shufflevector <2 x double> %v0, <2 x double> %v1, <4 x i32> + %r23 = shufflevector <2 x double> %v2, <2 x double> %v3, <4 x i32> + %res = shufflevector <4 x double> %r01, <4 x double> %r23, <8 x i32> + ret <8 x double> %res +} + +define <16 x float> @concat_trunc_v16f32_v4f32(<4 x float> %a0, <4 x float> %a1, <4 x float> %a2, <4 x float> %a3) { +; SSE-LABEL: concat_trunc_v16f32_v4f32: +; SSE: # %bb.0: +; SSE-NEXT: roundps $11, %xmm0, %xmm0 +; SSE-NEXT: roundps $11, %xmm1, %xmm1 +; SSE-NEXT: roundps $11, %xmm2, %xmm2 +; SSE-NEXT: roundps $11, %xmm3, %xmm3 +; SSE-NEXT: retq +; +; AVX1OR2-LABEL: concat_trunc_v16f32_v4f32: +; AVX1OR2: # %bb.0: +; AVX1OR2-NEXT: vroundps $11, %xmm0, %xmm0 +; AVX1OR2-NEXT: vroundps $11, %xmm1, %xmm1 +; AVX1OR2-NEXT: vroundps $11, %xmm2, %xmm2 +; AVX1OR2-NEXT: vroundps $11, %xmm3, %xmm3 +; AVX1OR2-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 +; AVX1OR2-NEXT: vinsertf128 $1, %xmm3, %ymm2, %ymm1 +; AVX1OR2-NEXT: retq +; +; AVX512-LABEL: concat_trunc_v16f32_v4f32: +; AVX512: # %bb.0: +; AVX512-NEXT: vroundps $11, %xmm0, %xmm0 +; AVX512-NEXT: vroundps $11, %xmm1, %xmm1 +; AVX512-NEXT: vroundps $11, %xmm2, %xmm2 +; AVX512-NEXT: vroundps $11, %xmm3, %xmm3 +; AVX512-NEXT: vinsertf128 $1, %xmm3, %ymm2, %ymm2 +; AVX512-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 +; AVX512-NEXT: vinsertf64x4 $1, %ymm2, %zmm0, %zmm0 +; AVX512-NEXT: retq + %v0 = call <4 x float> @llvm.trunc.v4f32(<4 x float> %a0) + %v1 = call <4 x float> @llvm.trunc.v4f32(<4 x float> %a1) + %v2 = call <4 x float> @llvm.trunc.v4f32(<4 x float> %a2) + %v3 = call <4 x float> @llvm.trunc.v4f32(<4 x float> %a3) + %r01 = shufflevector <4 x float> %v0, <4 x float> %v1, <8 x i32> + %r23 = shufflevector <4 x float> %v2, <4 x float> %v3, <8 x i32> + %res = shufflevector <8 x float> %r01, <8 x float> %r23, <16 x i32> + ret <16 x 
float> %res +} + +define <8 x double> @concat_trunc_v8f64_v4f64(<4 x double> %a0, <4 x double> %a1) { +; SSE-LABEL: concat_trunc_v8f64_v4f64: +; SSE: # %bb.0: +; SSE-NEXT: roundpd $11, %xmm0, %xmm0 +; SSE-NEXT: roundpd $11, %xmm1, %xmm1 +; SSE-NEXT: roundpd $11, %xmm2, %xmm2 +; SSE-NEXT: roundpd $11, %xmm3, %xmm3 +; SSE-NEXT: retq +; +; AVX1OR2-LABEL: concat_trunc_v8f64_v4f64: +; AVX1OR2: # %bb.0: +; AVX1OR2-NEXT: vroundpd $11, %ymm0, %ymm0 +; AVX1OR2-NEXT: vroundpd $11, %ymm1, %ymm1 +; AVX1OR2-NEXT: retq +; +; AVX512-LABEL: concat_trunc_v8f64_v4f64: +; AVX512: # %bb.0: +; AVX512-NEXT: vroundpd $11, %ymm0, %ymm0 +; AVX512-NEXT: vroundpd $11, %ymm1, %ymm1 +; AVX512-NEXT: vinsertf64x4 $1, %ymm1, %zmm0, %zmm0 +; AVX512-NEXT: retq + %v0 = call <4 x double> @llvm.trunc.v4f64(<4 x double> %a0) + %v1 = call <4 x double> @llvm.trunc.v4f64(<4 x double> %a1) + %res = shufflevector <4 x double> %v0, <4 x double> %v1, <8 x i32> + ret <8 x double> %res +} + +define <16 x float> @concat_trunc_v16f32_v8f32(<8 x float> %a0, <8 x float> %a1) { +; SSE-LABEL: concat_trunc_v16f32_v8f32: +; SSE: # %bb.0: +; SSE-NEXT: roundps $11, %xmm0, %xmm0 +; SSE-NEXT: roundps $11, %xmm1, %xmm1 +; SSE-NEXT: roundps $11, %xmm2, %xmm2 +; SSE-NEXT: roundps $11, %xmm3, %xmm3 +; SSE-NEXT: retq +; +; AVX1OR2-LABEL: concat_trunc_v16f32_v8f32: +; AVX1OR2: # %bb.0: +; AVX1OR2-NEXT: vroundps $11, %ymm0, %ymm0 +; AVX1OR2-NEXT: vroundps $11, %ymm1, %ymm1 +; AVX1OR2-NEXT: retq +; +; AVX512-LABEL: concat_trunc_v16f32_v8f32: +; AVX512: # %bb.0: +; AVX512-NEXT: vroundps $11, %ymm0, %ymm0 +; AVX512-NEXT: vroundps $11, %ymm1, %ymm1 +; AVX512-NEXT: vinsertf64x4 $1, %ymm1, %zmm0, %zmm0 +; AVX512-NEXT: retq + %v0 = call <8 x float> @llvm.trunc.v8f32(<8 x float> %a0) + %v1 = call <8 x float> @llvm.trunc.v8f32(<8 x float> %a1) + %res = shufflevector <8 x float> %v0, <8 x float> %v1, <16 x i32> + ret <16 x float> %res +} diff --git a/llvm/test/CodeGen/X86/combine-rndscale.ll b/llvm/test/CodeGen/X86/combine-rndscale.ll new 
file mode 100644 index 0000000000000..25117e864b512 --- /dev/null +++ b/llvm/test/CodeGen/X86/combine-rndscale.ll @@ -0,0 +1,144 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mcpu=sandybridge | FileCheck %s --check-prefixes=AVX,AVX1OR2 +; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mcpu=x86-64-v3 | FileCheck %s --check-prefixes=AVX,AVX1OR2 +; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mcpu=x86-64-v4 | FileCheck %s --check-prefixes=AVX,AVX512 + +define <4 x double> @concat_roundpd_v4f64_v2f64(<2 x double> %a0, <2 x double> %a1) { +; AVX-LABEL: concat_roundpd_v4f64_v2f64: +; AVX: # %bb.0: +; AVX-NEXT: vroundpd $4, %xmm0, %xmm0 +; AVX-NEXT: vroundpd $4, %xmm1, %xmm1 +; AVX-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 +; AVX-NEXT: retq + %v0 = call <2 x double> @llvm.x86.sse41.round.pd(<2 x double> %a0, i32 4) + %v1 = call <2 x double> @llvm.x86.sse41.round.pd(<2 x double> %a1, i32 4) + %res = shufflevector <2 x double> %v0, <2 x double> %v1, <4 x i32> + ret <4 x double> %res +} + +define <8 x float> @concat_roundps_v8f32_v4f32(<4 x float> %a0, <4 x float> %a1) { +; AVX-LABEL: concat_roundps_v8f32_v4f32: +; AVX: # %bb.0: +; AVX-NEXT: vroundps $4, %xmm0, %xmm0 +; AVX-NEXT: vroundps $4, %xmm1, %xmm1 +; AVX-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 +; AVX-NEXT: retq + %v0 = call <4 x float> @llvm.x86.sse41.round.ps(<4 x float> %a0, i32 4) + %v1 = call <4 x float> @llvm.x86.sse41.round.ps(<4 x float> %a1, i32 4) + %res = shufflevector <4 x float> %v0, <4 x float> %v1, <8 x i32> + ret <8 x float> %res +} + +define <8 x double> @concat_roundpd_v8f64_v2f64(<2 x double> %a0, <2 x double> %a1, <2 x double> %a2, <2 x double> %a3) { +; AVX1OR2-LABEL: concat_roundpd_v8f64_v2f64: +; AVX1OR2: # %bb.0: +; AVX1OR2-NEXT: vroundpd $4, %xmm0, %xmm0 +; AVX1OR2-NEXT: vroundpd $4, %xmm1, %xmm1 +; AVX1OR2-NEXT: vroundpd $4, %xmm2, %xmm2 +; AVX1OR2-NEXT: vroundpd $4, %xmm3, %xmm3 +; AVX1OR2-NEXT: 
vinsertf128 $1, %xmm1, %ymm0, %ymm0 +; AVX1OR2-NEXT: vinsertf128 $1, %xmm3, %ymm2, %ymm1 +; AVX1OR2-NEXT: retq +; +; AVX512-LABEL: concat_roundpd_v8f64_v2f64: +; AVX512: # %bb.0: +; AVX512-NEXT: vroundpd $4, %xmm0, %xmm0 +; AVX512-NEXT: vroundpd $4, %xmm1, %xmm1 +; AVX512-NEXT: vroundpd $4, %xmm2, %xmm2 +; AVX512-NEXT: vroundpd $4, %xmm3, %xmm3 +; AVX512-NEXT: vinsertf128 $1, %xmm3, %ymm2, %ymm2 +; AVX512-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 +; AVX512-NEXT: vinsertf64x4 $1, %ymm2, %zmm0, %zmm0 +; AVX512-NEXT: retq + %v0 = call <2 x double> @llvm.x86.sse41.round.pd(<2 x double> %a0, i32 4) + %v1 = call <2 x double> @llvm.x86.sse41.round.pd(<2 x double> %a1, i32 4) + %v2 = call <2 x double> @llvm.x86.sse41.round.pd(<2 x double> %a2, i32 4) + %v3 = call <2 x double> @llvm.x86.sse41.round.pd(<2 x double> %a3, i32 4) + %r01 = shufflevector <2 x double> %v0, <2 x double> %v1, <4 x i32> + %r23 = shufflevector <2 x double> %v2, <2 x double> %v3, <4 x i32> + %res = shufflevector <4 x double> %r01, <4 x double> %r23, <8 x i32> + ret <8 x double> %res +} + +define <16 x float> @concat_roundps_v16f32_v4f32(<4 x float> %a0, <4 x float> %a1, <4 x float> %a2, <4 x float> %a3) { +; AVX1OR2-LABEL: concat_roundps_v16f32_v4f32: +; AVX1OR2: # %bb.0: +; AVX1OR2-NEXT: vroundps $4, %xmm0, %xmm0 +; AVX1OR2-NEXT: vroundps $4, %xmm1, %xmm1 +; AVX1OR2-NEXT: vroundps $4, %xmm2, %xmm2 +; AVX1OR2-NEXT: vroundps $4, %xmm3, %xmm3 +; AVX1OR2-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 +; AVX1OR2-NEXT: vinsertf128 $1, %xmm3, %ymm2, %ymm1 +; AVX1OR2-NEXT: retq +; +; AVX512-LABEL: concat_roundps_v16f32_v4f32: +; AVX512: # %bb.0: +; AVX512-NEXT: vroundps $4, %xmm0, %xmm0 +; AVX512-NEXT: vroundps $4, %xmm1, %xmm1 +; AVX512-NEXT: vroundps $4, %xmm2, %xmm2 +; AVX512-NEXT: vroundps $4, %xmm3, %xmm3 +; AVX512-NEXT: vinsertf128 $1, %xmm3, %ymm2, %ymm2 +; AVX512-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 +; AVX512-NEXT: vinsertf64x4 $1, %ymm2, %zmm0, %zmm0 +; AVX512-NEXT: retq + %v0 = call <4 x float> 
@llvm.x86.sse41.round.ps(<4 x float> %a0, i32 4) + %v1 = call <4 x float> @llvm.x86.sse41.round.ps(<4 x float> %a1, i32 4) + %v2 = call <4 x float> @llvm.x86.sse41.round.ps(<4 x float> %a2, i32 4) + %v3 = call <4 x float> @llvm.x86.sse41.round.ps(<4 x float> %a3, i32 4) + %r01 = shufflevector <4 x float> %v0, <4 x float> %v1, <8 x i32> + %r23 = shufflevector <4 x float> %v2, <4 x float> %v3, <8 x i32> + %res = shufflevector <8 x float> %r01, <8 x float> %r23, <16 x i32> + ret <16 x float> %res +} + +define <8 x double> @concat_roundpd_v8f64_v4f64(<4 x double> %a0, <4 x double> %a1) { +; AVX1OR2-LABEL: concat_roundpd_v8f64_v4f64: +; AVX1OR2: # %bb.0: +; AVX1OR2-NEXT: vroundpd $4, %ymm0, %ymm0 +; AVX1OR2-NEXT: vroundpd $4, %ymm1, %ymm1 +; AVX1OR2-NEXT: retq +; +; AVX512-LABEL: concat_roundpd_v8f64_v4f64: +; AVX512: # %bb.0: +; AVX512-NEXT: vroundpd $4, %ymm0, %ymm0 +; AVX512-NEXT: vroundpd $4, %ymm1, %ymm1 +; AVX512-NEXT: vinsertf64x4 $1, %ymm1, %zmm0, %zmm0 +; AVX512-NEXT: retq + %v0 = call <4 x double> @llvm.x86.avx.round.pd.256(<4 x double> %a0, i32 4) + %v1 = call <4 x double> @llvm.x86.avx.round.pd.256(<4 x double> %a1, i32 4) + %res = shufflevector <4 x double> %v0, <4 x double> %v1, <8 x i32> + ret <8 x double> %res +} + +define <16 x float> @concat_roundps_v16f32_v8f32(<8 x float> %a0, <8 x float> %a1) { +; AVX1OR2-LABEL: concat_roundps_v16f32_v8f32: +; AVX1OR2: # %bb.0: +; AVX1OR2-NEXT: vroundps $4, %ymm0, %ymm0 +; AVX1OR2-NEXT: vroundps $4, %ymm1, %ymm1 +; AVX1OR2-NEXT: retq +; +; AVX512-LABEL: concat_roundps_v16f32_v8f32: +; AVX512: # %bb.0: +; AVX512-NEXT: vroundps $4, %ymm0, %ymm0 +; AVX512-NEXT: vroundps $4, %ymm1, %ymm1 +; AVX512-NEXT: vinsertf64x4 $1, %ymm1, %zmm0, %zmm0 +; AVX512-NEXT: retq + %v0 = call <8 x float> @llvm.x86.avx.round.ps.256(<8 x float> %a0, i32 4) + %v1 = call <8 x float> @llvm.x86.avx.round.ps.256(<8 x float> %a1, i32 4) + %res = shufflevector <8 x float> %v0, <8 x float> %v1, <16 x i32> + ret <16 x float> %res +} + +; negative 
test - rounding mode mismatch +define <8 x float> @concat_roundps_v8f32_v4f32_mismatch(<4 x float> %a0, <4 x float> %a1) { +; AVX-LABEL: concat_roundps_v8f32_v4f32_mismatch: +; AVX: # %bb.0: +; AVX-NEXT: vroundps $0, %xmm0, %xmm0 +; AVX-NEXT: vroundps $4, %xmm1, %xmm1 +; AVX-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 +; AVX-NEXT: retq + %v0 = call <4 x float> @llvm.x86.sse41.round.ps(<4 x float> %a0, i32 0) + %v1 = call <4 x float> @llvm.x86.sse41.round.ps(<4 x float> %a1, i32 4) + %res = shufflevector <4 x float> %v0, <4 x float> %v1, <8 x i32> + ret <8 x float> %res +}