From 59ed6dfe97b35a4dc88f69e3d830edf8caa99d10 Mon Sep 17 00:00:00 2001
From: Paul Walker <paul.walker@arm.com>
Date: Tue, 18 Nov 2025 12:15:38 +0000
Subject: [PATCH 01/33] [LLVM][CodeGen][SVE] Use DUPM for constantfp splats.
 (#168391)

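Select DUPM for floating-point constant splats whose bit pattern is a valid
logical immediate. This helps cases where the immediate range of FDUP is not
sufficient, avoiding the need to materialise the constant in a scalar register
first.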
---
 .../Target/AArch64/AArch64ISelDAGToDAG.cpp    |  71 +++--
 .../lib/Target/AArch64/AArch64SVEInstrInfo.td |   2 +-
 llvm/lib/Target/AArch64/SVEInstrFormats.td    |  25 ++
 .../test/CodeGen/AArch64/sve-bf16-combines.ll |   8 +-
 llvm/test/CodeGen/AArch64/sve-fp-combine.ll   |  15 +-
 .../CodeGen/AArch64/sve-fp-reduce-fadda.ll    |  15 +-
 llvm/test/CodeGen/AArch64/sve-fptosi-sat.ll   | 121 ++++----
 llvm/test/CodeGen/AArch64/sve-llrint.ll       | 202 ++++++------
 llvm/test/CodeGen/AArch64/sve-lrint.ll        | 202 ++++++------
 llvm/test/CodeGen/AArch64/sve-vector-splat.ll | 292 +++++++++++++++++-
 llvm/test/CodeGen/AArch64/sve-vselect-imm.ll  |  18 +-
 11 files changed, 626 insertions(+), 345 deletions(-)
diff --git a/llvm/lib/Target/AArch64/AArch64ISelDAGToDAG.cpp b/llvm/lib/Target/AArch64/AArch64ISelDAGToDAG.cpp
index f1db05dda4e40..08466667c0fa5 100644
--- a/llvm/lib/Target/AArch64/AArch64ISelDAGToDAG.cpp
+++ b/llvm/lib/Target/AArch64/AArch64ISelDAGToDAG.cpp
@@ -4403,43 +4403,46 @@ bool AArch64DAGToDAGISel::SelectSVEArithImm(SDValue N, MVT VT, SDValue &Imm) {
 
 bool AArch64DAGToDAGISel::SelectSVELogicalImm(SDValue N, MVT VT, SDValue &Imm,
                                               bool Invert) {
-  if (auto CNode = dyn_cast<ConstantSDNode>(N)) {
-    uint64_t ImmVal = CNode->getZExtValue();
-    SDLoc DL(N);
-
-    if (Invert)
-      ImmVal = ~ImmVal;
+  uint64_t ImmVal;
+  if (auto CI = dyn_cast<ConstantSDNode>(N))
+    ImmVal = CI->getZExtValue();
+  else if (auto CFP = dyn_cast<ConstantFPSDNode>(N))
+    ImmVal = CFP->getValueAPF().bitcastToAPInt().getZExtValue();
+  else
+    return false;
 
-    // Shift mask depending on type size.
-    switch (VT.SimpleTy) {
-    case MVT::i8:
-      ImmVal &= 0xFF;
-      ImmVal |= ImmVal << 8;
-      ImmVal |= ImmVal << 16;
-      ImmVal |= ImmVal << 32;
-      break;
-    case MVT::i16:
-      ImmVal &= 0xFFFF;
-      ImmVal |= ImmVal << 16;
-      ImmVal |= ImmVal << 32;
-      break;
-    case MVT::i32:
-      ImmVal &= 0xFFFFFFFF;
-      ImmVal |= ImmVal << 32;
-      break;
-    case MVT::i64:
-      break;
-    default:
-      llvm_unreachable("Unexpected type");
-    }
+  if (Invert)
+    ImmVal = ~ImmVal;
 
-    uint64_t encoding;
-    if (AArch64_AM::processLogicalImmediate(ImmVal, 64, encoding)) {
-      Imm = CurDAG->getTargetConstant(encoding, DL, MVT::i64);
-      return true;
-    }
+  // Shift mask depending on type size.
+  switch (VT.SimpleTy) {
+  case MVT::i8:
+    ImmVal &= 0xFF;
+    ImmVal |= ImmVal << 8;
+    ImmVal |= ImmVal << 16;
+    ImmVal |= ImmVal << 32;
+    break;
+  case MVT::i16:
+    ImmVal &= 0xFFFF;
+    ImmVal |= ImmVal << 16;
+    ImmVal |= ImmVal << 32;
+    break;
+  case MVT::i32:
+    ImmVal &= 0xFFFFFFFF;
+    ImmVal |= ImmVal << 32;
+    break;
+  case MVT::i64:
+    break;
+  default:
+    llvm_unreachable("Unexpected type");
   }
-  return false;
+
+  uint64_t encoding;
+  if (!AArch64_AM::processLogicalImmediate(ImmVal, 64, encoding))
+    return false;
+
+  Imm = CurDAG->getTargetConstant(encoding, SDLoc(N), MVT::i64);
+  return true;
 }
 
 // SVE shift intrinsics allow shift amounts larger than the element's bitwidth.
diff --git a/llvm/lib/Target/AArch64/AArch64SVEInstrInfo.td b/llvm/lib/Target/AArch64/AArch64SVEInstrInfo.td
index c8c21c4822ffe..e99b3f8ff07e0 100644
--- a/llvm/lib/Target/AArch64/AArch64SVEInstrInfo.td
+++ b/llvm/lib/Target/AArch64/AArch64SVEInstrInfo.td
@@ -989,7 +989,7 @@ let Predicates = [HasSVE_or_SME] in {
             (DUP_ZR_D (MOVi64imm (bitcast_fpimm_to_i64 f64:$val)))>;
 
   // Duplicate FP immediate into all vector elements
-  let AddedComplexity = 2 in {
+  let AddedComplexity = 3 in {
     def : Pat<(nxv8f16 (splat_vector fpimm16:$imm8)),
               (FDUP_ZI_H fpimm16:$imm8)>;
     def : Pat<(nxv4f16 (splat_vector fpimm16:$imm8)),
diff --git a/llvm/lib/Target/AArch64/SVEInstrFormats.td b/llvm/lib/Target/AArch64/SVEInstrFormats.td
index 1664f4ad0c8fa..1e771e1fb9403 100644
--- a/llvm/lib/Target/AArch64/SVEInstrFormats.td
+++ b/llvm/lib/Target/AArch64/SVEInstrFormats.td
@@ -347,6 +347,11 @@ def SVELogicalImm16Pat : ComplexPattern<i32, 1, "SelectSVELogicalImm<MVT::i16>",
 def SVELogicalImm32Pat : ComplexPattern<i32, 1, "SelectSVELogicalImm<MVT::i32>", []>;
 def SVELogicalImm64Pat : ComplexPattern<i64, 1, "SelectSVELogicalImm<MVT::i64>", []>;
 
+def SVELogicalFPImm16Pat : ComplexPattern<f16, 1, "SelectSVELogicalImm<MVT::i16>", []>;
+def SVELogicalFPImm32Pat : ComplexPattern<f32, 1, "SelectSVELogicalImm<MVT::i32>", []>;
+def SVELogicalFPImm64Pat : ComplexPattern<f64, 1, "SelectSVELogicalImm<MVT::i64>", []>;
+def SVELogicalBFPImmPat : ComplexPattern<bf16, 1, "SelectSVELogicalImm<MVT::i16>", []>;
+
 def SVELogicalImm8NotPat  : ComplexPattern<i32, 1, "SelectSVELogicalImm<MVT::i8, true>", []>;
 def SVELogicalImm16NotPat : ComplexPattern<i32, 1, "SelectSVELogicalImm<MVT::i16, true>", []>;
 def SVELogicalImm32NotPat : ComplexPattern<i32, 1, "SelectSVELogicalImm<MVT::i32, true>", []>;
@@ -2160,6 +2165,26 @@ multiclass sve_int_dup_mask_imm<string asm> {
             (!cast<Instruction>(NAME) i64:$imm)>;
   def : Pat<(nxv2i64 (splat_vector (i64 (SVELogicalImm64Pat i64:$imm)))),
             (!cast<Instruction>(NAME) i64:$imm)>;
+
+  def : Pat<(nxv8f16 (splat_vector (f16 (SVELogicalFPImm16Pat i64:$imm)))),
+            (!cast<Instruction>(NAME) i64:$imm)>;
+  def : Pat<(nxv4f16 (splat_vector (f16 (SVELogicalFPImm16Pat i64:$imm)))),
+            (!cast<Instruction>(NAME) i64:$imm)>;
+  def : Pat<(nxv2f16 (splat_vector (f16 (SVELogicalFPImm16Pat i64:$imm)))),
+            (!cast<Instruction>(NAME) i64:$imm)>;
+  def : Pat<(nxv4f32 (splat_vector (f32 (SVELogicalFPImm32Pat i64:$imm)))),
+            (!cast<Instruction>(NAME) i64:$imm)>;
+  def : Pat<(nxv2f32 (splat_vector (f32 (SVELogicalFPImm32Pat i64:$imm)))),
+            (!cast<Instruction>(NAME) i64:$imm)>;
+  def : Pat<(nxv2f64 (splat_vector (f64 (SVELogicalFPImm64Pat i64:$imm)))),
+            (!cast<Instruction>(NAME) i64:$imm)>;
+
+  def : Pat<(nxv8bf16 (splat_vector (bf16 (SVELogicalBFPImmPat i64:$imm)))),
+            (!cast<Instruction>(NAME) i64:$imm)>;
+  def : Pat<(nxv4bf16 (splat_vector (bf16 (SVELogicalBFPImmPat i64:$imm)))),
+            (!cast<Instruction>(NAME) i64:$imm)>;
+  def : Pat<(nxv2bf16 (splat_vector (bf16 (SVELogicalBFPImmPat i64:$imm)))),
+            (!cast<Instruction>(NAME) i64:$imm)>;
 }
 
 //===----------------------------------------------------------------------===//
diff --git a/llvm/test/CodeGen/AArch64/sve-bf16-combines.ll b/llvm/test/CodeGen/AArch64/sve-bf16-combines.ll
index 16e8feb0dc5bb..fc3e018f2ec7a 100644
--- a/llvm/test/CodeGen/AArch64/sve-bf16-combines.ll
+++ b/llvm/test/CodeGen/AArch64/sve-bf16-combines.ll
@@ -632,7 +632,6 @@ define <vscale x 8 x bfloat> @fsub_sel_fmul_negzero_nxv8bf16(<vscale x 8 x bfloa
 ; SVE:       // %bb.0:
 ; SVE-NEXT:    uunpkhi z3.s, z2.h
 ; SVE-NEXT:    uunpkhi z4.s, z1.h
-; SVE-NEXT:    mov w8, #32768 // =0x8000
 ; SVE-NEXT:    uunpklo z2.s, z2.h
 ; SVE-NEXT:    uunpklo z1.s, z1.h
 ; SVE-NEXT:    ptrue p1.s
@@ -643,9 +642,8 @@ define <vscale x 8 x bfloat> @fsub_sel_fmul_negzero_nxv8bf16(<vscale x 8 x bfloa
 ; SVE-NEXT:    fmul z3.s, z4.s, z3.s
 ; SVE-NEXT:    fmul z1.s, z1.s, z2.s
 ; SVE-NEXT:    bfcvt z2.h, p1/m, z3.s
-; SVE-NEXT:    fmov h3, w8
+; SVE-NEXT:    dupm z3.h, #0x8000
 ; SVE-NEXT:    bfcvt z1.h, p1/m, z1.s
-; SVE-NEXT:    mov z3.h, h3
 ; SVE-NEXT:    uzp1 z1.h, z1.h, z2.h
 ; SVE-NEXT:    sel z1.h, p0, z1.h, z3.h
 ; SVE-NEXT:    uunpkhi z3.s, z0.h
@@ -665,10 +663,8 @@ define <vscale x 8 x bfloat> @fsub_sel_fmul_negzero_nxv8bf16(<vscale x 8 x bfloa
 ;
 ; SVE-B16B16-LABEL: fsub_sel_fmul_negzero_nxv8bf16:
 ; SVE-B16B16:       // %bb.0:
-; SVE-B16B16-NEXT:    mov w8, #32768 // =0x8000
+; SVE-B16B16-NEXT:    dupm z3.h, #0x8000
 ; SVE-B16B16-NEXT:    bfmul z1.h, z1.h, z2.h
-; SVE-B16B16-NEXT:    fmov h3, w8
-; SVE-B16B16-NEXT:    mov z3.h, h3
 ; SVE-B16B16-NEXT:    sel z1.h, p0, z1.h, z3.h
 ; SVE-B16B16-NEXT:    bfsub z0.h, z0.h, z1.h
 ; SVE-B16B16-NEXT:    ret
diff --git a/llvm/test/CodeGen/AArch64/sve-fp-combine.ll b/llvm/test/CodeGen/AArch64/sve-fp-combine.ll
index 53aba04028d62..57389ad2fe9b2 100644
--- a/llvm/test/CodeGen/AArch64/sve-fp-combine.ll
+++ b/llvm/test/CodeGen/AArch64/sve-fp-combine.ll
@@ -1134,10 +1134,9 @@ define <vscale x 2 x double> @fadd_sel_fmul_d_negzero(<vscale x 2 x double> %a,
 define <vscale x 8 x half> @fsub_sel_fmul_h_negzero(<vscale x 8 x half> %a, <vscale x 8 x half> %b, <vscale x 8 x half> %c, <vscale x 8 x i1> %mask) {
 ; CHECK-LABEL: fsub_sel_fmul_h_negzero:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    mov w8, #32768 // =0x8000
+; CHECK-NEXT:    dupm z3.h, #0x8000
 ; CHECK-NEXT:    fmul z1.h, z1.h, z2.h
-; CHECK-NEXT:    mov z2.h, w8
-; CHECK-NEXT:    sel z1.h, p0, z1.h, z2.h
+; CHECK-NEXT:    sel z1.h, p0, z1.h, z3.h
 ; CHECK-NEXT:    fsub z0.h, z0.h, z1.h
 ; CHECK-NEXT:    ret
   %fmul = fmul <vscale x 8 x half> %b, %c
@@ -1150,10 +1149,9 @@ define <vscale x 8 x half> @fsub_sel_fmul_h_negzero(<vscale x 8 x half> %a, <vsc
 define <vscale x 4 x float> @fsub_sel_fmul_s_negzero(<vscale x 4 x float> %a, <vscale x 4 x float> %b, <vscale x 4 x float> %c, <vscale x 4 x i1> %mask) {
 ; CHECK-LABEL: fsub_sel_fmul_s_negzero:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    mov w8, #-2147483648 // =0x80000000
+; CHECK-NEXT:    mov z3.s, #0x80000000
 ; CHECK-NEXT:    fmul z1.s, z1.s, z2.s
-; CHECK-NEXT:    mov z2.s, w8
-; CHECK-NEXT:    sel z1.s, p0, z1.s, z2.s
+; CHECK-NEXT:    sel z1.s, p0, z1.s, z3.s
 ; CHECK-NEXT:    fsub z0.s, z0.s, z1.s
 ; CHECK-NEXT:    ret
   %fmul = fmul <vscale x 4 x float> %b, %c
@@ -1166,10 +1164,9 @@ define <vscale x 4 x float> @fsub_sel_fmul_s_negzero(<vscale x 4 x float> %a, <v
 define <vscale x 2 x double> @fsub_sel_fmul_d_negzero(<vscale x 2 x double> %a, <vscale x 2 x double> %b, <vscale x 2 x double> %c, <vscale x 2 x i1> %mask) {
 ; CHECK-LABEL: fsub_sel_fmul_d_negzero:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    mov x8, #-9223372036854775808 // =0x8000000000000000
+; CHECK-NEXT:    mov z3.d, #0x8000000000000000
 ; CHECK-NEXT:    fmul z1.d, z1.d, z2.d
-; CHECK-NEXT:    mov z2.d, x8
-; CHECK-NEXT:    sel z1.d, p0, z1.d, z2.d
+; CHECK-NEXT:    sel z1.d, p0, z1.d, z3.d
 ; CHECK-NEXT:    fsub z0.d, z0.d, z1.d
 ; CHECK-NEXT:    ret
   %fmul = fmul <vscale x 2 x double> %b, %c
diff --git a/llvm/test/CodeGen/AArch64/sve-fp-reduce-fadda.ll b/llvm/test/CodeGen/AArch64/sve-fp-reduce-fadda.ll
index 8750867c56731..1223ae1c0cbdd 100644
--- a/llvm/test/CodeGen/AArch64/sve-fp-reduce-fadda.ll
+++ b/llvm/test/CodeGen/AArch64/sve-fp-reduce-fadda.ll
@@ -51,10 +51,9 @@ define half @fadda_nxv6f16(<vscale x 6 x half> %v, half %s) {
 ; CHECK-NEXT:    addvl sp, sp, #-1
 ; CHECK-NEXT:    .cfi_escape 0x0f, 0x08, 0x8f, 0x10, 0x92, 0x2e, 0x00, 0x38, 0x1e, 0x22 // sp + 16 + 8 * VG
 ; CHECK-NEXT:    .cfi_offset w29, -16
-; CHECK-NEXT:    mov w8, #32768 // =0x8000
+; CHECK-NEXT:    dupm z2.h, #0x8000
 ; CHECK-NEXT:    ptrue p0.d
 ; CHECK-NEXT:    str z0, [sp]
-; CHECK-NEXT:    mov z2.h, w8
 ; CHECK-NEXT:    fmov s0, s1
 ; CHECK-NEXT:    st1h { z2.d }, p0, [sp, #3, mul vl]
 ; CHECK-NEXT:    ptrue p0.h
@@ -77,12 +76,11 @@ define half @fadda_nxv10f16(<vscale x 10 x half> %v, half %s) {
 ; CHECK-NEXT:    .cfi_offset w29, -16
 ; CHECK-NEXT:    ptrue p0.h
 ; CHECK-NEXT:    // kill: def $h2 killed $h2 def $z2
-; CHECK-NEXT:    mov w8, #32768 // =0x8000
 ; CHECK-NEXT:    str z1, [sp]
+; CHECK-NEXT:    addvl x8, sp, #1
 ; CHECK-NEXT:    ptrue p1.d
 ; CHECK-NEXT:    fadda h2, p0, h2, z0.h
-; CHECK-NEXT:    mov z0.h, w8
-; CHECK-NEXT:    addvl x8, sp, #1
+; CHECK-NEXT:    dupm z0.h, #0x8000
 ; CHECK-NEXT:    st1h { z0.d }, p1, [sp, #1, mul vl]
 ; CHECK-NEXT:    ldr z1, [sp]
 ; CHECK-NEXT:    str z1, [sp, #1, mul vl]
@@ -105,11 +103,10 @@ define half @fadda_nxv12f16(<vscale x 12 x half> %v, half %s) {
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    ptrue p0.h
 ; CHECK-NEXT:    // kill: def $h2 killed $h2 def $z2
-; CHECK-NEXT:    mov w8, #32768 // =0x8000
+; CHECK-NEXT:    uunpklo z1.s, z1.h
 ; CHECK-NEXT:    fadda h2, p0, h2, z0.h
-; CHECK-NEXT:    uunpklo z0.s, z1.h
-; CHECK-NEXT:    mov z1.h, w8
-; CHECK-NEXT:    uzp1 z0.h, z0.h, z1.h
+; CHECK-NEXT:    dupm z0.h, #0x8000
+; CHECK-NEXT:    uzp1 z0.h, z1.h, z0.h
 ; CHECK-NEXT:    fadda h2, p0, h2, z0.h
 ; CHECK-NEXT:    fmov s0, s2
 ; CHECK-NEXT:    ret
diff --git a/llvm/test/CodeGen/AArch64/sve-fptosi-sat.ll b/llvm/test/CodeGen/AArch64/sve-fptosi-sat.ll
index 4ae7ac7b292e9..897ade00320db 100644
--- a/llvm/test/CodeGen/AArch64/sve-fptosi-sat.ll
+++ b/llvm/test/CodeGen/AArch64/sve-fptosi-sat.ll
@@ -454,18 +454,17 @@ declare <vscale x 4 x i64> @llvm.fptosi.sat.nxv4f16.nxv4i64(<vscale x 4 x half>)
 define <vscale x 2 x i32> @test_signed_v2f16_v2i32(<vscale x 2 x half> %f) {
 ; CHECK-LABEL: test_signed_v2f16_v2i32:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    mov w8, #64511 // =0xfbff
+; CHECK-NEXT:    mov z1.h, #-1025 // =0xfffffffffffffbff
 ; CHECK-NEXT:    ptrue p0.d
-; CHECK-NEXT:    mov z1.h, w8
 ; CHECK-NEXT:    mov w8, #31743 // =0x7bff
-; CHECK-NEXT:    mov z2.h, w8
+; CHECK-NEXT:    mov z2.d, #0xffffffff80000000
 ; CHECK-NEXT:    fcmge p1.h, p0/z, z0.h, z1.h
-; CHECK-NEXT:    mov z1.d, #0xffffffff80000000
-; CHECK-NEXT:    fcmgt p2.h, p0/z, z0.h, z2.h
-; CHECK-NEXT:    mov z2.d, #0x7fffffff
+; CHECK-NEXT:    mov z1.h, w8
+; CHECK-NEXT:    fcvtzs z2.d, p1/m, z0.h
+; CHECK-NEXT:    fcmgt p1.h, p0/z, z0.h, z1.h
+; CHECK-NEXT:    mov z1.d, #0x7fffffff
 ; CHECK-NEXT:    fcmuo p0.h, p0/z, z0.h, z0.h
-; CHECK-NEXT:    fcvtzs z1.d, p1/m, z0.h
-; CHECK-NEXT:    sel z0.d, p2, z2.d, z1.d
+; CHECK-NEXT:    sel z0.d, p1, z1.d, z2.d
 ; CHECK-NEXT:    mov z0.d, p0/m, #0 // =0x0
 ; CHECK-NEXT:    ret
     %x = call <vscale x 2 x i32> @llvm.fptosi.sat.nxv2f16.nxv2i32(<vscale x 2 x half> %f)
@@ -475,18 +474,17 @@ define <vscale x 2 x i32> @test_signed_v2f16_v2i32(<vscale x 2 x half> %f) {
 define <vscale x 4 x i32> @test_signed_v4f16_v4i32(<vscale x 4 x half> %f) {
 ; CHECK-LABEL: test_signed_v4f16_v4i32:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    mov w8, #64511 // =0xfbff
+; CHECK-NEXT:    mov z1.h, #-1025 // =0xfffffffffffffbff
 ; CHECK-NEXT:    ptrue p0.s
-; CHECK-NEXT:    mov z1.h, w8
 ; CHECK-NEXT:    mov w8, #31743 // =0x7bff
-; CHECK-NEXT:    mov z2.h, w8
+; CHECK-NEXT:    mov z2.s, #0x80000000
 ; CHECK-NEXT:    fcmge p1.h, p0/z, z0.h, z1.h
-; CHECK-NEXT:    mov z1.s, #0x80000000
-; CHECK-NEXT:    fcmgt p2.h, p0/z, z0.h, z2.h
-; CHECK-NEXT:    mov z2.s, #0x7fffffff
+; CHECK-NEXT:    mov z1.h, w8
+; CHECK-NEXT:    fcvtzs z2.s, p1/m, z0.h
+; CHECK-NEXT:    fcmgt p1.h, p0/z, z0.h, z1.h
+; CHECK-NEXT:    mov z1.s, #0x7fffffff
 ; CHECK-NEXT:    fcmuo p0.h, p0/z, z0.h, z0.h
-; CHECK-NEXT:    fcvtzs z1.s, p1/m, z0.h
-; CHECK-NEXT:    sel z0.s, p2, z2.s, z1.s
+; CHECK-NEXT:    sel z0.s, p1, z1.s, z2.s
 ; CHECK-NEXT:    mov z0.s, p0/m, #0 // =0x0
 ; CHECK-NEXT:    ret
     %x = call <vscale x 4 x i32> @llvm.fptosi.sat.nxv4f16.nxv4i32(<vscale x 4 x half> %f)
@@ -496,26 +494,25 @@ define <vscale x 4 x i32> @test_signed_v4f16_v4i32(<vscale x 4 x half> %f) {
 define <vscale x 8 x i32> @test_signed_v8f16_v8i32(<vscale x 8 x half> %f) {
 ; CHECK-LABEL: test_signed_v8f16_v8i32:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    mov w8, #64511 // =0xfbff
-; CHECK-NEXT:    uunpklo z1.s, z0.h
+; CHECK-NEXT:    mov z1.h, #-1025 // =0xfffffffffffffbff
+; CHECK-NEXT:    uunpklo z2.s, z0.h
+; CHECK-NEXT:    mov w8, #31743 // =0x7bff
 ; CHECK-NEXT:    uunpkhi z0.s, z0.h
-; CHECK-NEXT:    mov z2.h, w8
 ; CHECK-NEXT:    ptrue p0.s
-; CHECK-NEXT:    mov w8, #31743 // =0x7bff
 ; CHECK-NEXT:    mov z3.s, #0x80000000
 ; CHECK-NEXT:    mov z4.s, #0x80000000
 ; CHECK-NEXT:    mov z5.h, w8
-; CHECK-NEXT:    fcmge p1.h, p0/z, z1.h, z2.h
-; CHECK-NEXT:    fcmge p2.h, p0/z, z0.h, z2.h
-; CHECK-NEXT:    mov z2.s, #0x7fffffff
+; CHECK-NEXT:    fcmge p1.h, p0/z, z2.h, z1.h
+; CHECK-NEXT:    fcmge p2.h, p0/z, z0.h, z1.h
+; CHECK-NEXT:    mov z1.s, #0x7fffffff
 ; CHECK-NEXT:    fcmgt p3.h, p0/z, z0.h, z5.h
-; CHECK-NEXT:    fcvtzs z3.s, p1/m, z1.h
-; CHECK-NEXT:    fcmgt p1.h, p0/z, z1.h, z5.h
+; CHECK-NEXT:    fcvtzs z3.s, p1/m, z2.h
+; CHECK-NEXT:    fcmgt p1.h, p0/z, z2.h, z5.h
 ; CHECK-NEXT:    fcvtzs z4.s, p2/m, z0.h
-; CHECK-NEXT:    fcmuo p2.h, p0/z, z1.h, z1.h
+; CHECK-NEXT:    fcmuo p2.h, p0/z, z2.h, z2.h
 ; CHECK-NEXT:    fcmuo p0.h, p0/z, z0.h, z0.h
-; CHECK-NEXT:    sel z0.s, p1, z2.s, z3.s
-; CHECK-NEXT:    sel z1.s, p3, z2.s, z4.s
+; CHECK-NEXT:    sel z0.s, p1, z1.s, z3.s
+; CHECK-NEXT:    sel z1.s, p3, z1.s, z4.s
 ; CHECK-NEXT:    mov z0.s, p2/m, #0 // =0x0
 ; CHECK-NEXT:    mov z1.s, p0/m, #0 // =0x0
 ; CHECK-NEXT:    ret
@@ -526,18 +523,17 @@ define <vscale x 8 x i32> @test_signed_v8f16_v8i32(<vscale x 8 x half> %f) {
 define <vscale x 4 x i16> @test_signed_v4f16_v4i16(<vscale x 4 x half> %f) {
 ; CHECK-LABEL: test_signed_v4f16_v4i16:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    mov w8, #63488 // =0xf800
+; CHECK-NEXT:    dupm z1.h, #0xf800
 ; CHECK-NEXT:    ptrue p0.s
-; CHECK-NEXT:    mov z2.s, #-32768 // =0xffffffffffff8000
-; CHECK-NEXT:    mov z1.h, w8
 ; CHECK-NEXT:    mov w8, #30719 // =0x77ff
+; CHECK-NEXT:    mov z2.h, w8
 ; CHECK-NEXT:    fcmge p1.h, p0/z, z0.h, z1.h
-; CHECK-NEXT:    mov z1.h, w8
-; CHECK-NEXT:    fcmgt p2.h, p0/z, z0.h, z1.h
-; CHECK-NEXT:    mov z1.s, #32767 // =0x7fff
-; CHECK-NEXT:    fcvtzs z2.s, p1/m, z0.h
+; CHECK-NEXT:    mov z1.s, #-32768 // =0xffffffffffff8000
+; CHECK-NEXT:    fcvtzs z1.s, p1/m, z0.h
+; CHECK-NEXT:    fcmgt p1.h, p0/z, z0.h, z2.h
+; CHECK-NEXT:    mov z2.s, #32767 // =0x7fff
 ; CHECK-NEXT:    fcmuo p0.h, p0/z, z0.h, z0.h
-; CHECK-NEXT:    sel z0.s, p2, z1.s, z2.s
+; CHECK-NEXT:    sel z0.s, p1, z2.s, z1.s
 ; CHECK-NEXT:    mov z0.s, p0/m, #0 // =0x0
 ; CHECK-NEXT:    ret
     %x = call <vscale x 4 x i16> @llvm.fptosi.sat.nxv4f16.nxv4i16(<vscale x 4 x half> %f)
@@ -547,18 +543,17 @@ define <vscale x 4 x i16> @test_signed_v4f16_v4i16(<vscale x 4 x half> %f) {
 define <vscale x 8 x i16> @test_signed_v8f16_v8i16(<vscale x 8 x half> %f) {
 ; CHECK-LABEL: test_signed_v8f16_v8i16:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    mov w8, #63488 // =0xf800
+; CHECK-NEXT:    dupm z1.h, #0xf800
 ; CHECK-NEXT:    ptrue p0.h
-; CHECK-NEXT:    mov z2.h, #-32768 // =0xffffffffffff8000
-; CHECK-NEXT:    mov z1.h, w8
 ; CHECK-NEXT:    mov w8, #30719 // =0x77ff
+; CHECK-NEXT:    mov z2.h, w8
 ; CHECK-NEXT:    fcmge p1.h, p0/z, z0.h, z1.h
-; CHECK-NEXT:    mov z1.h, w8
-; CHECK-NEXT:    fcmgt p2.h, p0/z, z0.h, z1.h
-; CHECK-NEXT:    mov z1.h, #32767 // =0x7fff
-; CHECK-NEXT:    fcvtzs z2.h, p1/m, z0.h
+; CHECK-NEXT:    mov z1.h, #-32768 // =0xffffffffffff8000
+; CHECK-NEXT:    fcvtzs z1.h, p1/m, z0.h
+; CHECK-NEXT:    fcmgt p1.h, p0/z, z0.h, z2.h
+; CHECK-NEXT:    mov z2.h, #32767 // =0x7fff
 ; CHECK-NEXT:    fcmuo p0.h, p0/z, z0.h, z0.h
-; CHECK-NEXT:    sel z0.h, p2, z1.h, z2.h
+; CHECK-NEXT:    sel z0.h, p1, z2.h, z1.h
 ; CHECK-NEXT:    mov z0.h, p0/m, #0 // =0x0
 ; CHECK-NEXT:    ret
     %x = call <vscale x 8 x i16> @llvm.fptosi.sat.nxv8f16.nxv8i16(<vscale x 8 x half> %f)
@@ -568,18 +563,17 @@ define <vscale x 8 x i16> @test_signed_v8f16_v8i16(<vscale x 8 x half> %f) {
 define <vscale x 2 x i64> @test_signed_v2f16_v2i64(<vscale x 2 x half> %f) {
 ; CHECK-LABEL: test_signed_v2f16_v2i64:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    mov w8, #64511 // =0xfbff
+; CHECK-NEXT:    mov z1.h, #-1025 // =0xfffffffffffffbff
 ; CHECK-NEXT:    ptrue p0.d
-; CHECK-NEXT:    mov z1.h, w8
 ; CHECK-NEXT:    mov w8, #31743 // =0x7bff
-; CHECK-NEXT:    mov z2.h, w8
+; CHECK-NEXT:    mov z2.d, #0x8000000000000000
 ; CHECK-NEXT:    fcmge p1.h, p0/z, z0.h, z1.h
-; CHECK-NEXT:    mov z1.d, #0x8000000000000000
-; CHECK-NEXT:    fcmgt p2.h, p0/z, z0.h, z2.h
-; CHECK-NEXT:    mov z2.d, #0x7fffffffffffffff
+; CHECK-NEXT:    mov z1.h, w8
+; CHECK-NEXT:    fcvtzs z2.d, p1/m, z0.h
+; CHECK-NEXT:    fcmgt p1.h, p0/z, z0.h, z1.h
+; CHECK-NEXT:    mov z1.d, #0x7fffffffffffffff
 ; CHECK-NEXT:    fcmuo p0.h, p0/z, z0.h, z0.h
-; CHECK-NEXT:    fcvtzs z1.d, p1/m, z0.h
-; CHECK-NEXT:    sel z0.d, p2, z2.d, z1.d
+; CHECK-NEXT:    sel z0.d, p1, z1.d, z2.d
 ; CHECK-NEXT:    mov z0.d, p0/m, #0 // =0x0
 ; CHECK-NEXT:    ret
     %x = call <vscale x 2 x i64> @llvm.fptosi.sat.nxv2f16.nxv2i64(<vscale x 2 x half> %f)
@@ -589,26 +583,25 @@ define <vscale x 2 x i64> @test_signed_v2f16_v2i64(<vscale x 2 x half> %f) {
 define <vscale x 4 x i64> @test_signed_v4f16_v4i64(<vscale x 4 x half> %f) {
 ; CHECK-LABEL: test_signed_v4f16_v4i64:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    mov w8, #64511 // =0xfbff
-; CHECK-NEXT:    uunpklo z1.d, z0.s
+; CHECK-NEXT:    mov z1.h, #-1025 // =0xfffffffffffffbff
+; CHECK-NEXT:    uunpklo z2.d, z0.s
+; CHECK-NEXT:    mov w8, #31743 // =0x7bff
 ; CHECK-NEXT:    uunpkhi z0.d, z0.s
-; CHECK-NEXT:    mov z2.h, w8
 ; CHECK-NEXT:    ptrue p0.d
-; CHECK-NEXT:    mov w8, #31743 // =0x7bff
 ; CHECK-NEXT:    mov z3.d, #0x8000000000000000
 ; CHECK-NEXT:    mov z4.d, #0x8000000000000000
 ; CHECK-NEXT:    mov z5.h, w8
-; CHECK-NEXT:    fcmge p1.h, p0/z, z1.h, z2.h
-; CHECK-NEXT:    fcmge p2.h, p0/z, z0.h, z2.h
-; CHECK-NEXT:    mov z2.d, #0x7fffffffffffffff
+; CHECK-NEXT:    fcmge p1.h, p0/z, z2.h, z1.h
+; CHECK-NEXT:    fcmge p2.h, p0/z, z0.h, z1.h
+; CHECK-NEXT:    mov z1.d, #0x7fffffffffffffff
 ; CHECK-NEXT:    fcmgt p3.h, p0/z, z0.h, z5.h
-; CHECK-NEXT:    fcvtzs z3.d, p1/m, z1.h
-; CHECK-NEXT:    fcmgt p1.h, p0/z, z1.h, z5.h
+; CHECK-NEXT:    fcvtzs z3.d, p1/m, z2.h
+; CHECK-NEXT:    fcmgt p1.h, p0/z, z2.h, z5.h
 ; CHECK-NEXT:    fcvtzs z4.d, p2/m, z0.h
-; CHECK-NEXT:    fcmuo p2.h, p0/z, z1.h, z1.h
+; CHECK-NEXT:    fcmuo p2.h, p0/z, z2.h, z2.h
 ; CHECK-NEXT:    fcmuo p0.h, p0/z, z0.h, z0.h
-; CHECK-NEXT:    sel z0.d, p1, z2.d, z3.d
-; CHECK-NEXT:    sel z1.d, p3, z2.d, z4.d
+; CHECK-NEXT:    sel z0.d, p1, z1.d, z3.d
+; CHECK-NEXT:    sel z1.d, p3, z1.d, z4.d
 ; CHECK-NEXT:    mov z0.d, p2/m, #0 // =0x0
 ; CHECK-NEXT:    mov z1.d, p0/m, #0 // =0x0
 ; CHECK-NEXT:    ret
diff --git a/llvm/test/CodeGen/AArch64/sve-llrint.ll b/llvm/test/CodeGen/AArch64/sve-llrint.ll
index f964d70e0a05c..c2bb0c81ab405 100644
--- a/llvm/test/CodeGen/AArch64/sve-llrint.ll
+++ b/llvm/test/CodeGen/AArch64/sve-llrint.ll
@@ -5,9 +5,8 @@ define <vscale x 1 x i64> @llrint_v1i64_v1f16(<vscale x 1 x half> %x) {
 ; CHECK-LABEL: llrint_v1i64_v1f16:
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    ptrue p0.d
-; CHECK-NEXT:    mov w8, #64511 // =0xfbff
+; CHECK-NEXT:    mov z1.h, #-1025 // =0xfffffffffffffbff
 ; CHECK-NEXT:    mov z2.d, #0x8000000000000000
-; CHECK-NEXT:    mov z1.h, w8
 ; CHECK-NEXT:    mov w8, #31743 // =0x7bff
 ; CHECK-NEXT:    frintx z0.h, p0/m, z0.h
 ; CHECK-NEXT:    fcmge p1.h, p0/z, z0.h, z1.h
@@ -28,9 +27,8 @@ define <vscale x 2 x i64> @llrint_v1i64_v2f16(<vscale x 2 x half> %x) {
 ; CHECK-LABEL: llrint_v1i64_v2f16:
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    ptrue p0.d
-; CHECK-NEXT:    mov w8, #64511 // =0xfbff
+; CHECK-NEXT:    mov z1.h, #-1025 // =0xfffffffffffffbff
 ; CHECK-NEXT:    mov z2.d, #0x8000000000000000
-; CHECK-NEXT:    mov z1.h, w8
 ; CHECK-NEXT:    mov w8, #31743 // =0x7bff
 ; CHECK-NEXT:    frintx z0.h, p0/m, z0.h
 ; CHECK-NEXT:    fcmge p1.h, p0/z, z0.h, z1.h
@@ -52,10 +50,9 @@ define <vscale x 4 x i64> @llrint_v4i64_v4f16(<vscale x 4 x half> %x) {
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    uunpklo z1.d, z0.s
 ; CHECK-NEXT:    uunpkhi z0.d, z0.s
-; CHECK-NEXT:    mov w8, #64511 // =0xfbff
-; CHECK-NEXT:    ptrue p0.d
-; CHECK-NEXT:    mov z2.h, w8
 ; CHECK-NEXT:    mov w8, #31743 // =0x7bff
+; CHECK-NEXT:    ptrue p0.d
+; CHECK-NEXT:    mov z2.h, #-1025 // =0xfffffffffffffbff
 ; CHECK-NEXT:    mov z3.d, #0x8000000000000000
 ; CHECK-NEXT:    mov z4.d, #0x8000000000000000
 ; CHECK-NEXT:    mov z5.d, #0x7fffffffffffffff
@@ -92,10 +89,9 @@ define <vscale x 8 x i64> @llrint_v8i64_v8f16(<vscale x 8 x half> %x) {
 ; CHECK-NEXT:    .cfi_offset w29, -16
 ; CHECK-NEXT:    uunpklo z1.s, z0.h
 ; CHECK-NEXT:    uunpkhi z0.s, z0.h
-; CHECK-NEXT:    mov w8, #64511 // =0xfbff
-; CHECK-NEXT:    ptrue p0.d
-; CHECK-NEXT:    mov z4.h, w8
 ; CHECK-NEXT:    mov w8, #31743 // =0x7bff
+; CHECK-NEXT:    ptrue p0.d
+; CHECK-NEXT:    mov z4.h, #-1025 // =0xfffffffffffffbff
 ; CHECK-NEXT:    mov z5.d, #0x8000000000000000
 ; CHECK-NEXT:    mov z6.d, #0x8000000000000000
 ; CHECK-NEXT:    mov z7.d, #0x8000000000000000
@@ -162,12 +158,13 @@ define <vscale x 16 x i64> @llrint_v16i64_v16f16(<vscale x 16 x half> %x) {
 ; CHECK-NEXT:    .cfi_offset w29, -16
 ; CHECK-NEXT:    uunpklo z2.s, z0.h
 ; CHECK-NEXT:    uunpkhi z3.s, z0.h
-; CHECK-NEXT:    mov w8, #64511 // =0xfbff
+; CHECK-NEXT:    mov w8, #31743 // =0x7bff
 ; CHECK-NEXT:    uunpklo z7.s, z1.h
 ; CHECK-NEXT:    ptrue p0.d
+; CHECK-NEXT:    mov z0.h, #-1025 // =0xfffffffffffffbff
 ; CHECK-NEXT:    uunpkhi z1.s, z1.h
-; CHECK-NEXT:    mov z0.d, #0x8000000000000000
 ; CHECK-NEXT:    mov z5.d, #0x8000000000000000
+; CHECK-NEXT:    mov z29.h, w8
 ; CHECK-NEXT:    mov z31.d, #0x8000000000000000
 ; CHECK-NEXT:    uunpklo z4.d, z2.s
 ; CHECK-NEXT:    uunpklo z24.d, z3.s
@@ -175,10 +172,8 @@ define <vscale x 16 x i64> @llrint_v16i64_v16f16(<vscale x 16 x half> %x) {
 ; CHECK-NEXT:    uunpkhi z6.d, z2.s
 ; CHECK-NEXT:    uunpklo z26.d, z7.s
 ; CHECK-NEXT:    uunpkhi z7.d, z7.s
-; CHECK-NEXT:    mov z2.h, w8
-; CHECK-NEXT:    mov w8, #31743 // =0x7bff
+; CHECK-NEXT:    mov z2.d, #0x8000000000000000
 ; CHECK-NEXT:    uunpklo z30.d, z1.s
-; CHECK-NEXT:    mov z29.h, w8
 ; CHECK-NEXT:    mov z3.d, #0x8000000000000000
 ; CHECK-NEXT:    uunpkhi z1.d, z1.s
 ; CHECK-NEXT:    movprfx z27, z4
@@ -191,17 +186,17 @@ define <vscale x 16 x i64> @llrint_v16i64_v16f16(<vscale x 16 x half> %x) {
 ; CHECK-NEXT:    frintx z26.h, p0/m, z26.h
 ; CHECK-NEXT:    frintx z7.h, p0/m, z7.h
 ; CHECK-NEXT:    mov z6.d, #0x8000000000000000
-; CHECK-NEXT:    fcmge p1.h, p0/z, z27.h, z2.h
-; CHECK-NEXT:    fcmge p3.h, p0/z, z24.h, z2.h
-; CHECK-NEXT:    fcmge p4.h, p0/z, z25.h, z2.h
-; CHECK-NEXT:    fcmge p2.h, p0/z, z28.h, z2.h
-; CHECK-NEXT:    fcmge p5.h, p0/z, z26.h, z2.h
-; CHECK-NEXT:    fcvtzs z0.d, p1/m, z27.h
+; CHECK-NEXT:    fcmge p1.h, p0/z, z27.h, z0.h
+; CHECK-NEXT:    fcmge p3.h, p0/z, z24.h, z0.h
+; CHECK-NEXT:    fcmge p4.h, p0/z, z25.h, z0.h
+; CHECK-NEXT:    fcmge p2.h, p0/z, z28.h, z0.h
+; CHECK-NEXT:    fcmge p5.h, p0/z, z26.h, z0.h
+; CHECK-NEXT:    fcvtzs z2.d, p1/m, z27.h
 ; CHECK-NEXT:    fcvtzs z4.d, p3/m, z24.h
 ; CHECK-NEXT:    fcvtzs z5.d, p4/m, z25.h
 ; CHECK-NEXT:    fcmgt p3.h, p0/z, z27.h, z29.h
 ; CHECK-NEXT:    fcvtzs z3.d, p2/m, z28.h
-; CHECK-NEXT:    fcmge p4.h, p0/z, z7.h, z2.h
+; CHECK-NEXT:    fcmge p4.h, p0/z, z7.h, z0.h
 ; CHECK-NEXT:    fcvtzs z6.d, p5/m, z26.h
 ; CHECK-NEXT:    fcmuo p1.h, p0/z, z27.h, z27.h
 ; CHECK-NEXT:    movprfx z27, z30
@@ -212,7 +207,7 @@ define <vscale x 16 x i64> @llrint_v16i64_v16f16(<vscale x 16 x half> %x) {
 ; CHECK-NEXT:    fcmuo p2.h, p0/z, z28.h, z28.h
 ; CHECK-NEXT:    mov z28.d, #0x8000000000000000
 ; CHECK-NEXT:    fcvtzs z31.d, p4/m, z7.h
-; CHECK-NEXT:    fcmge p4.h, p0/z, z27.h, z2.h
+; CHECK-NEXT:    fcmge p4.h, p0/z, z27.h, z0.h
 ; CHECK-NEXT:    fcmgt p6.h, p0/z, z24.h, z29.h
 ; CHECK-NEXT:    fcmuo p7.h, p0/z, z24.h, z24.h
 ; CHECK-NEXT:    mov z24.d, #0x7fffffffffffffff
@@ -221,31 +216,31 @@ define <vscale x 16 x i64> @llrint_v16i64_v16f16(<vscale x 16 x half> %x) {
 ; CHECK-NEXT:    fcmuo p10.h, p0/z, z25.h, z25.h
 ; CHECK-NEXT:    mov z25.d, #0x8000000000000000
 ; CHECK-NEXT:    sel z1.d, p5, z24.d, z3.d
-; CHECK-NEXT:    mov z0.d, p3/m, z24.d
 ; CHECK-NEXT:    sel z3.d, p8, z24.d, z5.d
-; CHECK-NEXT:    fcmge p4.h, p0/z, z30.h, z2.h
+; CHECK-NEXT:    fcmge p4.h, p0/z, z30.h, z0.h
+; CHECK-NEXT:    sel z0.d, p3, z24.d, z2.d
 ; CHECK-NEXT:    sel z2.d, p6, z24.d, z4.d
-; CHECK-NEXT:    mov z0.d, p1/m, #0 // =0x0
 ; CHECK-NEXT:    mov z1.d, p2/m, #0 // =0x0
 ; CHECK-NEXT:    mov z3.d, p10/m, #0 // =0x0
 ; CHECK-NEXT:    ldr p10, [sp, #1, mul vl] // 2-byte Reload
+; CHECK-NEXT:    fcmgt p9.h, p0/z, z26.h, z29.h
 ; CHECK-NEXT:    mov z2.d, p7/m, #0 // =0x0
 ; CHECK-NEXT:    ldr p7, [sp, #4, mul vl] // 2-byte Reload
-; CHECK-NEXT:    fcmgt p9.h, p0/z, z26.h, z29.h
 ; CHECK-NEXT:    fcvtzs z25.d, p4/m, z30.h
+; CHECK-NEXT:    mov z0.d, p1/m, #0 // =0x0
 ; CHECK-NEXT:    fcmgt p5.h, p0/z, z7.h, z29.h
 ; CHECK-NEXT:    fcmgt p6.h, p0/z, z27.h, z29.h
-; CHECK-NEXT:    fcmgt p4.h, p0/z, z30.h, z29.h
 ; CHECK-NEXT:    sel z4.d, p9, z24.d, z6.d
+; CHECK-NEXT:    fcmgt p4.h, p0/z, z30.h, z29.h
 ; CHECK-NEXT:    fcmuo p8.h, p0/z, z7.h, z7.h
 ; CHECK-NEXT:    sel z5.d, p5, z24.d, z31.d
 ; CHECK-NEXT:    ldr p5, [sp, #6, mul vl] // 2-byte Reload
 ; CHECK-NEXT:    sel z6.d, p6, z24.d, z28.d
 ; CHECK-NEXT:    ldr p6, [sp, #5, mul vl] // 2-byte Reload
 ; CHECK-NEXT:    fcmuo p9.h, p0/z, z27.h, z27.h
+; CHECK-NEXT:    fcmuo p3.h, p0/z, z26.h, z26.h
 ; CHECK-NEXT:    sel z7.d, p4, z24.d, z25.d
 ; CHECK-NEXT:    ldr p4, [sp, #7, mul vl] // 2-byte Reload
-; CHECK-NEXT:    fcmuo p3.h, p0/z, z26.h, z26.h
 ; CHECK-NEXT:    mov z5.d, p8/m, #0 // =0x0
 ; CHECK-NEXT:    ldr p8, [sp, #3, mul vl] // 2-byte Reload
 ; CHECK-NEXT:    fcmuo p0.h, p0/z, z30.h, z30.h
@@ -302,48 +297,47 @@ define <vscale x 32 x i64> @llrint_v32i64_v32f16(<vscale x 32 x half> %x) {
 ; CHECK-NEXT:    .cfi_escape 0x10, 0x4e, 0x09, 0x92, 0x2e, 0x00, 0x11, 0x48, 0x1e, 0x22, 0x40, 0x1c // $d14 @ cfa - 56 * VG - 16
 ; CHECK-NEXT:    .cfi_escape 0x10, 0x4f, 0x09, 0x92, 0x2e, 0x00, 0x11, 0x40, 0x1e, 0x22, 0x40, 0x1c // $d15 @ cfa - 64 * VG - 16
 ; CHECK-NEXT:    uunpklo z4.s, z0.h
-; CHECK-NEXT:    uunpkhi z5.s, z0.h
-; CHECK-NEXT:    mov w9, #64511 // =0xfbff
-; CHECK-NEXT:    uunpklo z6.s, z1.h
-; CHECK-NEXT:    ptrue p0.d
-; CHECK-NEXT:    uunpkhi z28.s, z1.h
-; CHECK-NEXT:    mov z30.h, w9
+; CHECK-NEXT:    uunpkhi z0.s, z0.h
 ; CHECK-NEXT:    mov w9, #31743 // =0x7bff
+; CHECK-NEXT:    uunpklo z5.s, z1.h
+; CHECK-NEXT:    ptrue p0.d
+; CHECK-NEXT:    mov z28.h, #-1025 // =0xfffffffffffffbff
+; CHECK-NEXT:    uunpkhi z29.s, z1.h
+; CHECK-NEXT:    mov z7.d, #0x8000000000000000
 ; CHECK-NEXT:    uunpklo z13.s, z2.h
 ; CHECK-NEXT:    mov z9.d, #0x8000000000000000
 ; CHECK-NEXT:    uunpkhi z14.s, z2.h
 ; CHECK-NEXT:    uunpkhi z17.s, z3.h
-; CHECK-NEXT:    uunpklo z7.d, z4.s
+; CHECK-NEXT:    uunpklo z6.d, z4.s
 ; CHECK-NEXT:    uunpkhi z4.d, z4.s
-; CHECK-NEXT:    uunpklo z27.d, z5.s
-; CHECK-NEXT:    uunpklo z31.d, z6.s
-; CHECK-NEXT:    uunpkhi z8.d, z6.s
-; CHECK-NEXT:    uunpkhi z29.d, z5.s
-; CHECK-NEXT:    uunpkhi z11.d, z28.s
-; CHECK-NEXT:    uunpklo z10.d, z28.s
+; CHECK-NEXT:    uunpklo z27.d, z0.s
+; CHECK-NEXT:    uunpklo z31.d, z5.s
+; CHECK-NEXT:    uunpkhi z8.d, z5.s
+; CHECK-NEXT:    uunpkhi z30.d, z0.s
+; CHECK-NEXT:    uunpkhi z11.d, z29.s
+; CHECK-NEXT:    uunpklo z10.d, z29.s
 ; CHECK-NEXT:    uunpklo z15.s, z3.h
 ; CHECK-NEXT:    uunpklo z16.d, z14.s
 ; CHECK-NEXT:    uunpkhi z14.d, z14.s
 ; CHECK-NEXT:    mov z24.d, #0x8000000000000000
-; CHECK-NEXT:    movprfx z1, z7
-; CHECK-NEXT:    frintx z1.h, p0/m, z7.h
 ; CHECK-NEXT:    movprfx z5, z27
 ; CHECK-NEXT:    frintx z5.h, p0/m, z27.h
+; CHECK-NEXT:    movprfx z1, z6
+; CHECK-NEXT:    frintx z1.h, p0/m, z6.h
 ; CHECK-NEXT:    frintx z4.h, p0/m, z4.h
 ; CHECK-NEXT:    movprfx z12, z31
 ; CHECK-NEXT:    frintx z12.h, p0/m, z31.h
 ; CHECK-NEXT:    movprfx z27, z8
 ; CHECK-NEXT:    frintx z27.h, p0/m, z8.h
-; CHECK-NEXT:    movprfx z6, z29
-; CHECK-NEXT:    frintx z6.h, p0/m, z29.h
+; CHECK-NEXT:    movprfx z6, z30
+; CHECK-NEXT:    frintx z6.h, p0/m, z30.h
 ; CHECK-NEXT:    movprfx z31, z10
 ; CHECK-NEXT:    frintx z31.h, p0/m, z10.h
-; CHECK-NEXT:    mov z7.d, #0x8000000000000000
 ; CHECK-NEXT:    mov z8.d, #0x8000000000000000
+; CHECK-NEXT:    frintx z11.h, p0/m, z11.h
 ; CHECK-NEXT:    movprfx z3, z16
 ; CHECK-NEXT:    frintx z3.h, p0/m, z16.h
-; CHECK-NEXT:    frintx z11.h, p0/m, z11.h
-; CHECK-NEXT:    mov z29.h, w9
+; CHECK-NEXT:    mov z30.h, w9
 ; CHECK-NEXT:    uunpklo z10.d, z13.s
 ; CHECK-NEXT:    uunpkhi z13.d, z13.s
 ; CHECK-NEXT:    uunpkhi z20.d, z15.s
@@ -354,124 +348,124 @@ define <vscale x 32 x i64> @llrint_v32i64_v32f16(<vscale x 32 x half> %x) {
 ; CHECK-NEXT:    uunpklo z15.d, z15.s
 ; CHECK-NEXT:    mov z2.d, #0x8000000000000000
 ; CHECK-NEXT:    mov z21.d, #0x8000000000000000
+; CHECK-NEXT:    frintx z10.h, p0/m, z10.h
 ; CHECK-NEXT:    mov z26.d, #0x8000000000000000
-; CHECK-NEXT:    mov z28.d, #0x7fffffffffffffff
+; CHECK-NEXT:    mov z29.d, #0x7fffffffffffffff
 ; CHECK-NEXT:    movprfx z19, z13
 ; CHECK-NEXT:    frintx z19.h, p0/m, z13.h
 ; CHECK-NEXT:    movprfx z13, z14
 ; CHECK-NEXT:    frintx z13.h, p0/m, z14.h
-; CHECK-NEXT:    frintx z10.h, p0/m, z10.h
 ; CHECK-NEXT:    frintx z16.h, p0/m, z16.h
 ; CHECK-NEXT:    mov z22.d, #0x8000000000000000
 ; CHECK-NEXT:    mov z23.d, #0x8000000000000000
-; CHECK-NEXT:    frintx z15.h, p0/m, z15.h
 ; CHECK-NEXT:    mov z14.d, #0x8000000000000000
-; CHECK-NEXT:    fcmge p4.h, p0/z, z4.h, z30.h
-; CHECK-NEXT:    fcmge p2.h, p0/z, z12.h, z30.h
-; CHECK-NEXT:    fcmgt p9.h, p0/z, z12.h, z29.h
+; CHECK-NEXT:    frintx z15.h, p0/m, z15.h
+; CHECK-NEXT:    fcmge p4.h, p0/z, z4.h, z28.h
+; CHECK-NEXT:    fcmge p2.h, p0/z, z12.h, z28.h
+; CHECK-NEXT:    fcmgt p9.h, p0/z, z12.h, z30.h
 ; CHECK-NEXT:    fcmuo p8.h, p0/z, z12.h, z12.h
 ; CHECK-NEXT:    fcvtzs z7.d, p4/m, z4.h
 ; CHECK-NEXT:    fcvtzs z8.d, p2/m, z12.h
 ; CHECK-NEXT:    mov z12.d, #0x8000000000000000
-; CHECK-NEXT:    fcmge p4.h, p0/z, z27.h, z30.h
+; CHECK-NEXT:    fcmge p4.h, p0/z, z27.h, z28.h
 ; CHECK-NEXT:    fcmuo p10.h, p0/z, z11.h, z11.h
-; CHECK-NEXT:    fcmge p3.h, p0/z, z5.h, z30.h
-; CHECK-NEXT:    mov z8.d, p9/m, z28.d
+; CHECK-NEXT:    fcmge p3.h, p0/z, z5.h, z28.h
+; CHECK-NEXT:    mov z8.d, p9/m, z29.d
 ; CHECK-NEXT:    fcvtzs z9.d, p4/m, z27.h
-; CHECK-NEXT:    fcmge p4.h, p0/z, z11.h, z30.h
+; CHECK-NEXT:    fcmge p4.h, p0/z, z11.h, z28.h
 ; CHECK-NEXT:    fcvtzs z24.d, p3/m, z5.h
 ; CHECK-NEXT:    mov z8.d, p8/m, #0 // =0x0
-; CHECK-NEXT:    fcmge p1.h, p0/z, z6.h, z30.h
-; CHECK-NEXT:    fcmge p5.h, p0/z, z1.h, z30.h
+; CHECK-NEXT:    fcmge p1.h, p0/z, z6.h, z28.h
+; CHECK-NEXT:    fcmge p5.h, p0/z, z1.h, z28.h
 ; CHECK-NEXT:    str z8, [x8, #4, mul vl]
 ; CHECK-NEXT:    fcvtzs z12.d, p4/m, z11.h
-; CHECK-NEXT:    fcmgt p4.h, p0/z, z11.h, z29.h
+; CHECK-NEXT:    fcmgt p4.h, p0/z, z11.h, z30.h
 ; CHECK-NEXT:    uunpkhi z11.d, z17.s
 ; CHECK-NEXT:    movprfx z17, z20
 ; CHECK-NEXT:    frintx z17.h, p0/m, z20.h
 ; CHECK-NEXT:    fcvtzs z25.d, p1/m, z6.h
 ; CHECK-NEXT:    mov z20.d, #0x8000000000000000
 ; CHECK-NEXT:    fcvtzs z0.d, p5/m, z1.h
-; CHECK-NEXT:    fcmge p6.h, p0/z, z10.h, z30.h
+; CHECK-NEXT:    fcmge p6.h, p0/z, z10.h, z28.h
 ; CHECK-NEXT:    frintx z11.h, p0/m, z11.h
-; CHECK-NEXT:    fcmge p3.h, p0/z, z31.h, z30.h
-; CHECK-NEXT:    fcmge p1.h, p0/z, z13.h, z30.h
+; CHECK-NEXT:    fcmge p3.h, p0/z, z31.h, z28.h
+; CHECK-NEXT:    fcmge p1.h, p0/z, z13.h, z28.h
 ; CHECK-NEXT:    fcvtzs z18.d, p6/m, z10.h
-; CHECK-NEXT:    fcmgt p11.h, p0/z, z10.h, z29.h
-; CHECK-NEXT:    fcmge p5.h, p0/z, z11.h, z30.h
+; CHECK-NEXT:    fcmgt p11.h, p0/z, z10.h, z30.h
+; CHECK-NEXT:    fcmge p5.h, p0/z, z11.h, z28.h
 ; CHECK-NEXT:    fcvtzs z2.d, p3/m, z31.h
 ; CHECK-NEXT:    fcvtzs z21.d, p1/m, z13.h
-; CHECK-NEXT:    fcmge p2.h, p0/z, z17.h, z30.h
-; CHECK-NEXT:    fcmge p3.h, p0/z, z16.h, z30.h
+; CHECK-NEXT:    fcmge p2.h, p0/z, z17.h, z28.h
+; CHECK-NEXT:    fcmge p3.h, p0/z, z16.h, z28.h
 ; CHECK-NEXT:    fcmuo p1.h, p0/z, z10.h, z10.h
-; CHECK-NEXT:    sel z10.d, p4, z28.d, z12.d
-; CHECK-NEXT:    sel z12.d, p11, z28.d, z18.d
+; CHECK-NEXT:    sel z10.d, p4, z29.d, z12.d
+; CHECK-NEXT:    sel z12.d, p11, z29.d, z18.d
 ; CHECK-NEXT:    fcvtzs z26.d, p5/m, z11.h
 ; CHECK-NEXT:    fcvtzs z22.d, p2/m, z17.h
-; CHECK-NEXT:    fcmgt p4.h, p0/z, z11.h, z29.h
+; CHECK-NEXT:    fcmgt p4.h, p0/z, z11.h, z30.h
 ; CHECK-NEXT:    fcvtzs z23.d, p3/m, z16.h
 ; CHECK-NEXT:    mov z10.d, p10/m, #0 // =0x0
 ; CHECK-NEXT:    mov z12.d, p1/m, #0 // =0x0
-; CHECK-NEXT:    fcmge p6.h, p0/z, z19.h, z30.h
+; CHECK-NEXT:    fcmge p6.h, p0/z, z19.h, z28.h
 ; CHECK-NEXT:    str z10, [x8, #7, mul vl]
-; CHECK-NEXT:    fcmge p7.h, p0/z, z3.h, z30.h
+; CHECK-NEXT:    fcmge p7.h, p0/z, z3.h, z28.h
 ; CHECK-NEXT:    str z12, [x8, #8, mul vl]
-; CHECK-NEXT:    mov z26.d, p4/m, z28.d
-; CHECK-NEXT:    fcmge p2.h, p0/z, z15.h, z30.h
-; CHECK-NEXT:    mov z30.d, #0x8000000000000000
+; CHECK-NEXT:    mov z26.d, p4/m, z29.d
+; CHECK-NEXT:    fcmge p2.h, p0/z, z15.h, z28.h
+; CHECK-NEXT:    mov z28.d, #0x8000000000000000
 ; CHECK-NEXT:    fcvtzs z14.d, p6/m, z19.h
-; CHECK-NEXT:    fcmgt p5.h, p0/z, z16.h, z29.h
-; CHECK-NEXT:    fcmgt p3.h, p0/z, z17.h, z29.h
+; CHECK-NEXT:    fcmgt p5.h, p0/z, z16.h, z30.h
+; CHECK-NEXT:    fcmgt p3.h, p0/z, z17.h, z30.h
 ; CHECK-NEXT:    fcvtzs z20.d, p7/m, z3.h
-; CHECK-NEXT:    fcvtzs z30.d, p2/m, z15.h
+; CHECK-NEXT:    fcvtzs z28.d, p2/m, z15.h
 ; CHECK-NEXT:    fcmuo p1.h, p0/z, z11.h, z11.h
 ; CHECK-NEXT:    fcmuo p2.h, p0/z, z16.h, z16.h
-; CHECK-NEXT:    sel z11.d, p5, z28.d, z23.d
-; CHECK-NEXT:    sel z16.d, p3, z28.d, z22.d
-; CHECK-NEXT:    fcmgt p4.h, p0/z, z19.h, z29.h
-; CHECK-NEXT:    fcmgt p3.h, p0/z, z15.h, z29.h
+; CHECK-NEXT:    sel z11.d, p5, z29.d, z23.d
+; CHECK-NEXT:    sel z16.d, p3, z29.d, z22.d
+; CHECK-NEXT:    fcmgt p4.h, p0/z, z19.h, z30.h
+; CHECK-NEXT:    fcmgt p3.h, p0/z, z15.h, z30.h
 ; CHECK-NEXT:    mov z26.d, p1/m, #0 // =0x0
 ; CHECK-NEXT:    mov z11.d, p2/m, #0 // =0x0
-; CHECK-NEXT:    fcmgt p1.h, p0/z, z13.h, z29.h
+; CHECK-NEXT:    fcmgt p1.h, p0/z, z13.h, z30.h
 ; CHECK-NEXT:    fcmuo p6.h, p0/z, z17.h, z17.h
 ; CHECK-NEXT:    str z26, [x8, #15, mul vl]
-; CHECK-NEXT:    sel z26.d, p4, z28.d, z14.d
+; CHECK-NEXT:    sel z26.d, p4, z29.d, z14.d
 ; CHECK-NEXT:    str z11, [x8, #14, mul vl]
-; CHECK-NEXT:    mov z30.d, p3/m, z28.d
-; CHECK-NEXT:    fcmgt p2.h, p0/z, z3.h, z29.h
+; CHECK-NEXT:    mov z28.d, p3/m, z29.d
+; CHECK-NEXT:    fcmgt p2.h, p0/z, z3.h, z30.h
 ; CHECK-NEXT:    fcmuo p4.h, p0/z, z13.h, z13.h
 ; CHECK-NEXT:    fcmuo p3.h, p0/z, z3.h, z3.h
-; CHECK-NEXT:    sel z3.d, p1, z28.d, z21.d
+; CHECK-NEXT:    sel z3.d, p1, z29.d, z21.d
 ; CHECK-NEXT:    mov z16.d, p6/m, #0 // =0x0
-; CHECK-NEXT:    fcmgt p12.h, p0/z, z27.h, z29.h
-; CHECK-NEXT:    sel z11.d, p2, z28.d, z20.d
+; CHECK-NEXT:    fcmgt p12.h, p0/z, z27.h, z30.h
+; CHECK-NEXT:    sel z11.d, p2, z29.d, z20.d
 ; CHECK-NEXT:    str z16, [x8, #13, mul vl]
 ; CHECK-NEXT:    mov z3.d, p4/m, #0 // =0x0
 ; CHECK-NEXT:    fcmuo p6.h, p0/z, z15.h, z15.h
-; CHECK-NEXT:    fcmgt p1.h, p0/z, z4.h, z29.h
+; CHECK-NEXT:    fcmgt p1.h, p0/z, z4.h, z30.h
 ; CHECK-NEXT:    mov z11.d, p3/m, #0 // =0x0
-; CHECK-NEXT:    mov z9.d, p12/m, z28.d
+; CHECK-NEXT:    mov z9.d, p12/m, z29.d
 ; CHECK-NEXT:    str z3, [x8, #11, mul vl]
 ; CHECK-NEXT:    fcmuo p5.h, p0/z, z19.h, z19.h
-; CHECK-NEXT:    fcmgt p2.h, p0/z, z5.h, z29.h
+; CHECK-NEXT:    fcmgt p2.h, p0/z, z5.h, z30.h
 ; CHECK-NEXT:    str z11, [x8, #10, mul vl]
-; CHECK-NEXT:    mov z30.d, p6/m, #0 // =0x0
-; CHECK-NEXT:    sel z3.d, p1, z28.d, z7.d
-; CHECK-NEXT:    fcmgt p4.h, p0/z, z6.h, z29.h
+; CHECK-NEXT:    mov z28.d, p6/m, #0 // =0x0
+; CHECK-NEXT:    sel z3.d, p1, z29.d, z7.d
+; CHECK-NEXT:    fcmgt p4.h, p0/z, z6.h, z30.h
 ; CHECK-NEXT:    fcmuo p3.h, p0/z, z27.h, z27.h
-; CHECK-NEXT:    str z30, [x8, #12, mul vl]
+; CHECK-NEXT:    str z28, [x8, #12, mul vl]
 ; CHECK-NEXT:    mov z26.d, p5/m, #0 // =0x0
-; CHECK-NEXT:    sel z7.d, p2, z28.d, z24.d
-; CHECK-NEXT:    fcmgt p6.h, p0/z, z31.h, z29.h
-; CHECK-NEXT:    fcmgt p1.h, p0/z, z1.h, z29.h
+; CHECK-NEXT:    sel z7.d, p2, z29.d, z24.d
+; CHECK-NEXT:    fcmgt p6.h, p0/z, z31.h, z30.h
+; CHECK-NEXT:    fcmgt p1.h, p0/z, z1.h, z30.h
 ; CHECK-NEXT:    str z26, [x8, #9, mul vl]
-; CHECK-NEXT:    sel z24.d, p4, z28.d, z25.d
+; CHECK-NEXT:    sel z24.d, p4, z29.d, z25.d
 ; CHECK-NEXT:    mov z9.d, p3/m, #0 // =0x0
 ; CHECK-NEXT:    fcmuo p5.h, p0/z, z31.h, z31.h
 ; CHECK-NEXT:    fcmuo p2.h, p0/z, z6.h, z6.h
-; CHECK-NEXT:    mov z2.d, p6/m, z28.d
+; CHECK-NEXT:    mov z2.d, p6/m, z29.d
 ; CHECK-NEXT:    str z9, [x8, #5, mul vl]
-; CHECK-NEXT:    mov z0.d, p1/m, z28.d
+; CHECK-NEXT:    mov z0.d, p1/m, z29.d
 ; CHECK-NEXT:    fcmuo p3.h, p0/z, z5.h, z5.h
 ; CHECK-NEXT:    fcmuo p4.h, p0/z, z4.h, z4.h
 ; CHECK-NEXT:    mov z2.d, p5/m, #0 // =0x0
diff --git a/llvm/test/CodeGen/AArch64/sve-lrint.ll b/llvm/test/CodeGen/AArch64/sve-lrint.ll
index f517e7fe8dc16..f1224d30d53cc 100644
--- a/llvm/test/CodeGen/AArch64/sve-lrint.ll
+++ b/llvm/test/CodeGen/AArch64/sve-lrint.ll
@@ -6,9 +6,8 @@ define <vscale x 1 x iXLen> @lrint_v1f16(<vscale x 1 x half> %x) {
 ; CHECK-LABEL: lrint_v1f16:
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    ptrue p0.d
-; CHECK-NEXT:    mov w8, #64511 // =0xfbff
+; CHECK-NEXT:    mov z1.h, #-1025 // =0xfffffffffffffbff
 ; CHECK-NEXT:    mov z2.d, #0x8000000000000000
-; CHECK-NEXT:    mov z1.h, w8
 ; CHECK-NEXT:    mov w8, #31743 // =0x7bff
 ; CHECK-NEXT:    frintx z0.h, p0/m, z0.h
 ; CHECK-NEXT:    fcmge p1.h, p0/z, z0.h, z1.h
@@ -29,9 +28,8 @@ define <vscale x 2 x iXLen> @lrint_v2f16(<vscale x 2 x half> %x) {
 ; CHECK-LABEL: lrint_v2f16:
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    ptrue p0.d
-; CHECK-NEXT:    mov w8, #64511 // =0xfbff
+; CHECK-NEXT:    mov z1.h, #-1025 // =0xfffffffffffffbff
 ; CHECK-NEXT:    mov z2.d, #0x8000000000000000
-; CHECK-NEXT:    mov z1.h, w8
 ; CHECK-NEXT:    mov w8, #31743 // =0x7bff
 ; CHECK-NEXT:    frintx z0.h, p0/m, z0.h
 ; CHECK-NEXT:    fcmge p1.h, p0/z, z0.h, z1.h
@@ -53,10 +51,9 @@ define <vscale x 4 x iXLen> @lrint_v4f16(<vscale x 4 x half> %x) {
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    uunpklo z1.d, z0.s
 ; CHECK-NEXT:    uunpkhi z0.d, z0.s
-; CHECK-NEXT:    mov w8, #64511 // =0xfbff
-; CHECK-NEXT:    ptrue p0.d
-; CHECK-NEXT:    mov z2.h, w8
 ; CHECK-NEXT:    mov w8, #31743 // =0x7bff
+; CHECK-NEXT:    ptrue p0.d
+; CHECK-NEXT:    mov z2.h, #-1025 // =0xfffffffffffffbff
 ; CHECK-NEXT:    mov z3.d, #0x8000000000000000
 ; CHECK-NEXT:    mov z4.d, #0x8000000000000000
 ; CHECK-NEXT:    mov z5.d, #0x7fffffffffffffff
@@ -93,10 +90,9 @@ define <vscale x 8 x iXLen> @lrint_v8f16(<vscale x 8 x half> %x) {
 ; CHECK-NEXT:    .cfi_offset w29, -16
 ; CHECK-NEXT:    uunpklo z1.s, z0.h
 ; CHECK-NEXT:    uunpkhi z0.s, z0.h
-; CHECK-NEXT:    mov w8, #64511 // =0xfbff
-; CHECK-NEXT:    ptrue p0.d
-; CHECK-NEXT:    mov z4.h, w8
 ; CHECK-NEXT:    mov w8, #31743 // =0x7bff
+; CHECK-NEXT:    ptrue p0.d
+; CHECK-NEXT:    mov z4.h, #-1025 // =0xfffffffffffffbff
 ; CHECK-NEXT:    mov z5.d, #0x8000000000000000
 ; CHECK-NEXT:    mov z6.d, #0x8000000000000000
 ; CHECK-NEXT:    mov z7.d, #0x8000000000000000
@@ -163,12 +159,13 @@ define <vscale x 16 x iXLen> @lrint_v16f16(<vscale x 16 x half> %x) {
 ; CHECK-NEXT:    .cfi_offset w29, -16
 ; CHECK-NEXT:    uunpklo z2.s, z0.h
 ; CHECK-NEXT:    uunpkhi z3.s, z0.h
-; CHECK-NEXT:    mov w8, #64511 // =0xfbff
+; CHECK-NEXT:    mov w8, #31743 // =0x7bff
 ; CHECK-NEXT:    uunpklo z7.s, z1.h
 ; CHECK-NEXT:    ptrue p0.d
+; CHECK-NEXT:    mov z0.h, #-1025 // =0xfffffffffffffbff
 ; CHECK-NEXT:    uunpkhi z1.s, z1.h
-; CHECK-NEXT:    mov z0.d, #0x8000000000000000
 ; CHECK-NEXT:    mov z5.d, #0x8000000000000000
+; CHECK-NEXT:    mov z29.h, w8
 ; CHECK-NEXT:    mov z31.d, #0x8000000000000000
 ; CHECK-NEXT:    uunpklo z4.d, z2.s
 ; CHECK-NEXT:    uunpklo z24.d, z3.s
@@ -176,10 +173,8 @@ define <vscale x 16 x iXLen> @lrint_v16f16(<vscale x 16 x half> %x) {
 ; CHECK-NEXT:    uunpkhi z6.d, z2.s
 ; CHECK-NEXT:    uunpklo z26.d, z7.s
 ; CHECK-NEXT:    uunpkhi z7.d, z7.s
-; CHECK-NEXT:    mov z2.h, w8
-; CHECK-NEXT:    mov w8, #31743 // =0x7bff
+; CHECK-NEXT:    mov z2.d, #0x8000000000000000
 ; CHECK-NEXT:    uunpklo z30.d, z1.s
-; CHECK-NEXT:    mov z29.h, w8
 ; CHECK-NEXT:    mov z3.d, #0x8000000000000000
 ; CHECK-NEXT:    uunpkhi z1.d, z1.s
 ; CHECK-NEXT:    movprfx z27, z4
@@ -192,17 +187,17 @@ define <vscale x 16 x iXLen> @lrint_v16f16(<vscale x 16 x half> %x) {
 ; CHECK-NEXT:    frintx z26.h, p0/m, z26.h
 ; CHECK-NEXT:    frintx z7.h, p0/m, z7.h
 ; CHECK-NEXT:    mov z6.d, #0x8000000000000000
-; CHECK-NEXT:    fcmge p1.h, p0/z, z27.h, z2.h
-; CHECK-NEXT:    fcmge p3.h, p0/z, z24.h, z2.h
-; CHECK-NEXT:    fcmge p4.h, p0/z, z25.h, z2.h
-; CHECK-NEXT:    fcmge p2.h, p0/z, z28.h, z2.h
-; CHECK-NEXT:    fcmge p5.h, p0/z, z26.h, z2.h
-; CHECK-NEXT:    fcvtzs z0.d, p1/m, z27.h
+; CHECK-NEXT:    fcmge p1.h, p0/z, z27.h, z0.h
+; CHECK-NEXT:    fcmge p3.h, p0/z, z24.h, z0.h
+; CHECK-NEXT:    fcmge p4.h, p0/z, z25.h, z0.h
+; CHECK-NEXT:    fcmge p2.h, p0/z, z28.h, z0.h
+; CHECK-NEXT:    fcmge p5.h, p0/z, z26.h, z0.h
+; CHECK-NEXT:    fcvtzs z2.d, p1/m, z27.h
 ; CHECK-NEXT:    fcvtzs z4.d, p3/m, z24.h
 ; CHECK-NEXT:    fcvtzs z5.d, p4/m, z25.h
 ; CHECK-NEXT:    fcmgt p3.h, p0/z, z27.h, z29.h
 ; CHECK-NEXT:    fcvtzs z3.d, p2/m, z28.h
-; CHECK-NEXT:    fcmge p4.h, p0/z, z7.h, z2.h
+; CHECK-NEXT:    fcmge p4.h, p0/z, z7.h, z0.h
 ; CHECK-NEXT:    fcvtzs z6.d, p5/m, z26.h
 ; CHECK-NEXT:    fcmuo p1.h, p0/z, z27.h, z27.h
 ; CHECK-NEXT:    movprfx z27, z30
@@ -213,7 +208,7 @@ define <vscale x 16 x iXLen> @lrint_v16f16(<vscale x 16 x half> %x) {
 ; CHECK-NEXT:    fcmuo p2.h, p0/z, z28.h, z28.h
 ; CHECK-NEXT:    mov z28.d, #0x8000000000000000
 ; CHECK-NEXT:    fcvtzs z31.d, p4/m, z7.h
-; CHECK-NEXT:    fcmge p4.h, p0/z, z27.h, z2.h
+; CHECK-NEXT:    fcmge p4.h, p0/z, z27.h, z0.h
 ; CHECK-NEXT:    fcmgt p6.h, p0/z, z24.h, z29.h
 ; CHECK-NEXT:    fcmuo p7.h, p0/z, z24.h, z24.h
 ; CHECK-NEXT:    mov z24.d, #0x7fffffffffffffff
@@ -222,31 +217,31 @@ define <vscale x 16 x iXLen> @lrint_v16f16(<vscale x 16 x half> %x) {
 ; CHECK-NEXT:    fcmuo p10.h, p0/z, z25.h, z25.h
 ; CHECK-NEXT:    mov z25.d, #0x8000000000000000
 ; CHECK-NEXT:    sel z1.d, p5, z24.d, z3.d
-; CHECK-NEXT:    mov z0.d, p3/m, z24.d
 ; CHECK-NEXT:    sel z3.d, p8, z24.d, z5.d
-; CHECK-NEXT:    fcmge p4.h, p0/z, z30.h, z2.h
+; CHECK-NEXT:    fcmge p4.h, p0/z, z30.h, z0.h
+; CHECK-NEXT:    sel z0.d, p3, z24.d, z2.d
 ; CHECK-NEXT:    sel z2.d, p6, z24.d, z4.d
-; CHECK-NEXT:    mov z0.d, p1/m, #0 // =0x0
 ; CHECK-NEXT:    mov z1.d, p2/m, #0 // =0x0
 ; CHECK-NEXT:    mov z3.d, p10/m, #0 // =0x0
 ; CHECK-NEXT:    ldr p10, [sp, #1, mul vl] // 2-byte Reload
+; CHECK-NEXT:    fcmgt p9.h, p0/z, z26.h, z29.h
 ; CHECK-NEXT:    mov z2.d, p7/m, #0 // =0x0
 ; CHECK-NEXT:    ldr p7, [sp, #4, mul vl] // 2-byte Reload
-; CHECK-NEXT:    fcmgt p9.h, p0/z, z26.h, z29.h
 ; CHECK-NEXT:    fcvtzs z25.d, p4/m, z30.h
+; CHECK-NEXT:    mov z0.d, p1/m, #0 // =0x0
 ; CHECK-NEXT:    fcmgt p5.h, p0/z, z7.h, z29.h
 ; CHECK-NEXT:    fcmgt p6.h, p0/z, z27.h, z29.h
-; CHECK-NEXT:    fcmgt p4.h, p0/z, z30.h, z29.h
 ; CHECK-NEXT:    sel z4.d, p9, z24.d, z6.d
+; CHECK-NEXT:    fcmgt p4.h, p0/z, z30.h, z29.h
 ; CHECK-NEXT:    fcmuo p8.h, p0/z, z7.h, z7.h
 ; CHECK-NEXT:    sel z5.d, p5, z24.d, z31.d
 ; CHECK-NEXT:    ldr p5, [sp, #6, mul vl] // 2-byte Reload
 ; CHECK-NEXT:    sel z6.d, p6, z24.d, z28.d
 ; CHECK-NEXT:    ldr p6, [sp, #5, mul vl] // 2-byte Reload
 ; CHECK-NEXT:    fcmuo p9.h, p0/z, z27.h, z27.h
+; CHECK-NEXT:    fcmuo p3.h, p0/z, z26.h, z26.h
 ; CHECK-NEXT:    sel z7.d, p4, z24.d, z25.d
 ; CHECK-NEXT:    ldr p4, [sp, #7, mul vl] // 2-byte Reload
-; CHECK-NEXT:    fcmuo p3.h, p0/z, z26.h, z26.h
 ; CHECK-NEXT:    mov z5.d, p8/m, #0 // =0x0
 ; CHECK-NEXT:    ldr p8, [sp, #3, mul vl] // 2-byte Reload
 ; CHECK-NEXT:    fcmuo p0.h, p0/z, z30.h, z30.h
@@ -303,48 +298,47 @@ define <vscale x 32 x iXLen> @lrint_v32f16(<vscale x 32 x half> %x) {
 ; CHECK-NEXT:    .cfi_escape 0x10, 0x4e, 0x09, 0x92, 0x2e, 0x00, 0x11, 0x48, 0x1e, 0x22, 0x40, 0x1c // $d14 @ cfa - 56 * VG - 16
 ; CHECK-NEXT:    .cfi_escape 0x10, 0x4f, 0x09, 0x92, 0x2e, 0x00, 0x11, 0x40, 0x1e, 0x22, 0x40, 0x1c // $d15 @ cfa - 64 * VG - 16
 ; CHECK-NEXT:    uunpklo z4.s, z0.h
-; CHECK-NEXT:    uunpkhi z5.s, z0.h
-; CHECK-NEXT:    mov w9, #64511 // =0xfbff
-; CHECK-NEXT:    uunpklo z6.s, z1.h
-; CHECK-NEXT:    ptrue p0.d
-; CHECK-NEXT:    uunpkhi z28.s, z1.h
-; CHECK-NEXT:    mov z30.h, w9
+; CHECK-NEXT:    uunpkhi z0.s, z0.h
 ; CHECK-NEXT:    mov w9, #31743 // =0x7bff
+; CHECK-NEXT:    uunpklo z5.s, z1.h
+; CHECK-NEXT:    ptrue p0.d
+; CHECK-NEXT:    mov z28.h, #-1025 // =0xfffffffffffffbff
+; CHECK-NEXT:    uunpkhi z29.s, z1.h
+; CHECK-NEXT:    mov z7.d, #0x8000000000000000
 ; CHECK-NEXT:    uunpklo z13.s, z2.h
 ; CHECK-NEXT:    mov z9.d, #0x8000000000000000
 ; CHECK-NEXT:    uunpkhi z14.s, z2.h
 ; CHECK-NEXT:    uunpkhi z17.s, z3.h
-; CHECK-NEXT:    uunpklo z7.d, z4.s
+; CHECK-NEXT:    uunpklo z6.d, z4.s
 ; CHECK-NEXT:    uunpkhi z4.d, z4.s
-; CHECK-NEXT:    uunpklo z27.d, z5.s
-; CHECK-NEXT:    uunpklo z31.d, z6.s
-; CHECK-NEXT:    uunpkhi z8.d, z6.s
-; CHECK-NEXT:    uunpkhi z29.d, z5.s
-; CHECK-NEXT:    uunpkhi z11.d, z28.s
-; CHECK-NEXT:    uunpklo z10.d, z28.s
+; CHECK-NEXT:    uunpklo z27.d, z0.s
+; CHECK-NEXT:    uunpklo z31.d, z5.s
+; CHECK-NEXT:    uunpkhi z8.d, z5.s
+; CHECK-NEXT:    uunpkhi z30.d, z0.s
+; CHECK-NEXT:    uunpkhi z11.d, z29.s
+; CHECK-NEXT:    uunpklo z10.d, z29.s
 ; CHECK-NEXT:    uunpklo z15.s, z3.h
 ; CHECK-NEXT:    uunpklo z16.d, z14.s
 ; CHECK-NEXT:    uunpkhi z14.d, z14.s
 ; CHECK-NEXT:    mov z24.d, #0x8000000000000000
-; CHECK-NEXT:    movprfx z1, z7
-; CHECK-NEXT:    frintx z1.h, p0/m, z7.h
 ; CHECK-NEXT:    movprfx z5, z27
 ; CHECK-NEXT:    frintx z5.h, p0/m, z27.h
+; CHECK-NEXT:    movprfx z1, z6
+; CHECK-NEXT:    frintx z1.h, p0/m, z6.h
 ; CHECK-NEXT:    frintx z4.h, p0/m, z4.h
 ; CHECK-NEXT:    movprfx z12, z31
 ; CHECK-NEXT:    frintx z12.h, p0/m, z31.h
 ; CHECK-NEXT:    movprfx z27, z8
 ; CHECK-NEXT:    frintx z27.h, p0/m, z8.h
-; CHECK-NEXT:    movprfx z6, z29
-; CHECK-NEXT:    frintx z6.h, p0/m, z29.h
+; CHECK-NEXT:    movprfx z6, z30
+; CHECK-NEXT:    frintx z6.h, p0/m, z30.h
 ; CHECK-NEXT:    movprfx z31, z10
 ; CHECK-NEXT:    frintx z31.h, p0/m, z10.h
-; CHECK-NEXT:    mov z7.d, #0x8000000000000000
 ; CHECK-NEXT:    mov z8.d, #0x8000000000000000
+; CHECK-NEXT:    frintx z11.h, p0/m, z11.h
 ; CHECK-NEXT:    movprfx z3, z16
 ; CHECK-NEXT:    frintx z3.h, p0/m, z16.h
-; CHECK-NEXT:    frintx z11.h, p0/m, z11.h
-; CHECK-NEXT:    mov z29.h, w9
+; CHECK-NEXT:    mov z30.h, w9
 ; CHECK-NEXT:    uunpklo z10.d, z13.s
 ; CHECK-NEXT:    uunpkhi z13.d, z13.s
 ; CHECK-NEXT:    uunpkhi z20.d, z15.s
@@ -355,124 +349,124 @@ define <vscale x 32 x iXLen> @lrint_v32f16(<vscale x 32 x half> %x) {
 ; CHECK-NEXT:    uunpklo z15.d, z15.s
 ; CHECK-NEXT:    mov z2.d, #0x8000000000000000
 ; CHECK-NEXT:    mov z21.d, #0x8000000000000000
+; CHECK-NEXT:    frintx z10.h, p0/m, z10.h
 ; CHECK-NEXT:    mov z26.d, #0x8000000000000000
-; CHECK-NEXT:    mov z28.d, #0x7fffffffffffffff
+; CHECK-NEXT:    mov z29.d, #0x7fffffffffffffff
 ; CHECK-NEXT:    movprfx z19, z13
 ; CHECK-NEXT:    frintx z19.h, p0/m, z13.h
 ; CHECK-NEXT:    movprfx z13, z14
 ; CHECK-NEXT:    frintx z13.h, p0/m, z14.h
-; CHECK-NEXT:    frintx z10.h, p0/m, z10.h
 ; CHECK-NEXT:    frintx z16.h, p0/m, z16.h
 ; CHECK-NEXT:    mov z22.d, #0x8000000000000000
 ; CHECK-NEXT:    mov z23.d, #0x8000000000000000
-; CHECK-NEXT:    frintx z15.h, p0/m, z15.h
 ; CHECK-NEXT:    mov z14.d, #0x8000000000000000
-; CHECK-NEXT:    fcmge p4.h, p0/z, z4.h, z30.h
-; CHECK-NEXT:    fcmge p2.h, p0/z, z12.h, z30.h
-; CHECK-NEXT:    fcmgt p9.h, p0/z, z12.h, z29.h
+; CHECK-NEXT:    frintx z15.h, p0/m, z15.h
+; CHECK-NEXT:    fcmge p4.h, p0/z, z4.h, z28.h
+; CHECK-NEXT:    fcmge p2.h, p0/z, z12.h, z28.h
+; CHECK-NEXT:    fcmgt p9.h, p0/z, z12.h, z30.h
 ; CHECK-NEXT:    fcmuo p8.h, p0/z, z12.h, z12.h
 ; CHECK-NEXT:    fcvtzs z7.d, p4/m, z4.h
 ; CHECK-NEXT:    fcvtzs z8.d, p2/m, z12.h
 ; CHECK-NEXT:    mov z12.d, #0x8000000000000000
-; CHECK-NEXT:    fcmge p4.h, p0/z, z27.h, z30.h
+; CHECK-NEXT:    fcmge p4.h, p0/z, z27.h, z28.h
 ; CHECK-NEXT:    fcmuo p10.h, p0/z, z11.h, z11.h
-; CHECK-NEXT:    fcmge p3.h, p0/z, z5.h, z30.h
-; CHECK-NEXT:    mov z8.d, p9/m, z28.d
+; CHECK-NEXT:    fcmge p3.h, p0/z, z5.h, z28.h
+; CHECK-NEXT:    mov z8.d, p9/m, z29.d
 ; CHECK-NEXT:    fcvtzs z9.d, p4/m, z27.h
-; CHECK-NEXT:    fcmge p4.h, p0/z, z11.h, z30.h
+; CHECK-NEXT:    fcmge p4.h, p0/z, z11.h, z28.h
 ; CHECK-NEXT:    fcvtzs z24.d, p3/m, z5.h
 ; CHECK-NEXT:    mov z8.d, p8/m, #0 // =0x0
-; CHECK-NEXT:    fcmge p1.h, p0/z, z6.h, z30.h
-; CHECK-NEXT:    fcmge p5.h, p0/z, z1.h, z30.h
+; CHECK-NEXT:    fcmge p1.h, p0/z, z6.h, z28.h
+; CHECK-NEXT:    fcmge p5.h, p0/z, z1.h, z28.h
 ; CHECK-NEXT:    str z8, [x8, #4, mul vl]
 ; CHECK-NEXT:    fcvtzs z12.d, p4/m, z11.h
-; CHECK-NEXT:    fcmgt p4.h, p0/z, z11.h, z29.h
+; CHECK-NEXT:    fcmgt p4.h, p0/z, z11.h, z30.h
 ; CHECK-NEXT:    uunpkhi z11.d, z17.s
 ; CHECK-NEXT:    movprfx z17, z20
 ; CHECK-NEXT:    frintx z17.h, p0/m, z20.h
 ; CHECK-NEXT:    fcvtzs z25.d, p1/m, z6.h
 ; CHECK-NEXT:    mov z20.d, #0x8000000000000000
 ; CHECK-NEXT:    fcvtzs z0.d, p5/m, z1.h
-; CHECK-NEXT:    fcmge p6.h, p0/z, z10.h, z30.h
+; CHECK-NEXT:    fcmge p6.h, p0/z, z10.h, z28.h
 ; CHECK-NEXT:    frintx z11.h, p0/m, z11.h
-; CHECK-NEXT:    fcmge p3.h, p0/z, z31.h, z30.h
-; CHECK-NEXT:    fcmge p1.h, p0/z, z13.h, z30.h
+; CHECK-NEXT:    fcmge p3.h, p0/z, z31.h, z28.h
+; CHECK-NEXT:    fcmge p1.h, p0/z, z13.h, z28.h
 ; CHECK-NEXT:    fcvtzs z18.d, p6/m, z10.h
-; CHECK-NEXT:    fcmgt p11.h, p0/z, z10.h, z29.h
-; CHECK-NEXT:    fcmge p5.h, p0/z, z11.h, z30.h
+; CHECK-NEXT:    fcmgt p11.h, p0/z, z10.h, z30.h
+; CHECK-NEXT:    fcmge p5.h, p0/z, z11.h, z28.h
 ; CHECK-NEXT:    fcvtzs z2.d, p3/m, z31.h
 ; CHECK-NEXT:    fcvtzs z21.d, p1/m, z13.h
-; CHECK-NEXT:    fcmge p2.h, p0/z, z17.h, z30.h
-; CHECK-NEXT:    fcmge p3.h, p0/z, z16.h, z30.h
+; CHECK-NEXT:    fcmge p2.h, p0/z, z17.h, z28.h
+; CHECK-NEXT:    fcmge p3.h, p0/z, z16.h, z28.h
 ; CHECK-NEXT:    fcmuo p1.h, p0/z, z10.h, z10.h
-; CHECK-NEXT:    sel z10.d, p4, z28.d, z12.d
-; CHECK-NEXT:    sel z12.d, p11, z28.d, z18.d
+; CHECK-NEXT:    sel z10.d, p4, z29.d, z12.d
+; CHECK-NEXT:    sel z12.d, p11, z29.d, z18.d
 ; CHECK-NEXT:    fcvtzs z26.d, p5/m, z11.h
 ; CHECK-NEXT:    fcvtzs z22.d, p2/m, z17.h
-; CHECK-NEXT:    fcmgt p4.h, p0/z, z11.h, z29.h
+; CHECK-NEXT:    fcmgt p4.h, p0/z, z11.h, z30.h
 ; CHECK-NEXT:    fcvtzs z23.d, p3/m, z16.h
 ; CHECK-NEXT:    mov z10.d, p10/m, #0 // =0x0
 ; CHECK-NEXT:    mov z12.d, p1/m, #0 // =0x0
-; CHECK-NEXT:    fcmge p6.h, p0/z, z19.h, z30.h
+; CHECK-NEXT:    fcmge p6.h, p0/z, z19.h, z28.h
 ; CHECK-NEXT:    str z10, [x8, #7, mul vl]
-; CHECK-NEXT:    fcmge p7.h, p0/z, z3.h, z30.h
+; CHECK-NEXT:    fcmge p7.h, p0/z, z3.h, z28.h
 ; CHECK-NEXT:    str z12, [x8, #8, mul vl]
-; CHECK-NEXT:    mov z26.d, p4/m, z28.d
-; CHECK-NEXT:    fcmge p2.h, p0/z, z15.h, z30.h
-; CHECK-NEXT:    mov z30.d, #0x8000000000000000
+; CHECK-NEXT:    mov z26.d, p4/m, z29.d
+; CHECK-NEXT:    fcmge p2.h, p0/z, z15.h, z28.h
+; CHECK-NEXT:    mov z28.d, #0x8000000000000000
 ; CHECK-NEXT:    fcvtzs z14.d, p6/m, z19.h
-; CHECK-NEXT:    fcmgt p5.h, p0/z, z16.h, z29.h
-; CHECK-NEXT:    fcmgt p3.h, p0/z, z17.h, z29.h
+; CHECK-NEXT:    fcmgt p5.h, p0/z, z16.h, z30.h
+; CHECK-NEXT:    fcmgt p3.h, p0/z, z17.h, z30.h
 ; CHECK-NEXT:    fcvtzs z20.d, p7/m, z3.h
-; CHECK-NEXT:    fcvtzs z30.d, p2/m, z15.h
+; CHECK-NEXT:    fcvtzs z28.d, p2/m, z15.h
 ; CHECK-NEXT:    fcmuo p1.h, p0/z, z11.h, z11.h
 ; CHECK-NEXT:    fcmuo p2.h, p0/z, z16.h, z16.h
-; CHECK-NEXT:    sel z11.d, p5, z28.d, z23.d
-; CHECK-NEXT:    sel z16.d, p3, z28.d, z22.d
-; CHECK-NEXT:    fcmgt p4.h, p0/z, z19.h, z29.h
-; CHECK-NEXT:    fcmgt p3.h, p0/z, z15.h, z29.h
+; CHECK-NEXT:    sel z11.d, p5, z29.d, z23.d
+; CHECK-NEXT:    sel z16.d, p3, z29.d, z22.d
+; CHECK-NEXT:    fcmgt p4.h, p0/z, z19.h, z30.h
+; CHECK-NEXT:    fcmgt p3.h, p0/z, z15.h, z30.h
 ; CHECK-NEXT:    mov z26.d, p1/m, #0 // =0x0
 ; CHECK-NEXT:    mov z11.d, p2/m, #0 // =0x0
-; CHECK-NEXT:    fcmgt p1.h, p0/z, z13.h, z29.h
+; CHECK-NEXT:    fcmgt p1.h, p0/z, z13.h, z30.h
 ; CHECK-NEXT:    fcmuo p6.h, p0/z, z17.h, z17.h
 ; CHECK-NEXT:    str z26, [x8, #15, mul vl]
-; CHECK-NEXT:    sel z26.d, p4, z28.d, z14.d
+; CHECK-NEXT:    sel z26.d, p4, z29.d, z14.d
 ; CHECK-NEXT:    str z11, [x8, #14, mul vl]
-; CHECK-NEXT:    mov z30.d, p3/m, z28.d
-; CHECK-NEXT:    fcmgt p2.h, p0/z, z3.h, z29.h
+; CHECK-NEXT:    mov z28.d, p3/m, z29.d
+; CHECK-NEXT:    fcmgt p2.h, p0/z, z3.h, z30.h
 ; CHECK-NEXT:    fcmuo p4.h, p0/z, z13.h, z13.h
 ; CHECK-NEXT:    fcmuo p3.h, p0/z, z3.h, z3.h
-; CHECK-NEXT:    sel z3.d, p1, z28.d, z21.d
+; CHECK-NEXT:    sel z3.d, p1, z29.d, z21.d
 ; CHECK-NEXT:    mov z16.d, p6/m, #0 // =0x0
-; CHECK-NEXT:    fcmgt p12.h, p0/z, z27.h, z29.h
-; CHECK-NEXT:    sel z11.d, p2, z28.d, z20.d
+; CHECK-NEXT:    fcmgt p12.h, p0/z, z27.h, z30.h
+; CHECK-NEXT:    sel z11.d, p2, z29.d, z20.d
 ; CHECK-NEXT:    str z16, [x8, #13, mul vl]
 ; CHECK-NEXT:    mov z3.d, p4/m, #0 // =0x0
 ; CHECK-NEXT:    fcmuo p6.h, p0/z, z15.h, z15.h
-; CHECK-NEXT:    fcmgt p1.h, p0/z, z4.h, z29.h
+; CHECK-NEXT:    fcmgt p1.h, p0/z, z4.h, z30.h
 ; CHECK-NEXT:    mov z11.d, p3/m, #0 // =0x0
-; CHECK-NEXT:    mov z9.d, p12/m, z28.d
+; CHECK-NEXT:    mov z9.d, p12/m, z29.d
 ; CHECK-NEXT:    str z3, [x8, #11, mul vl]
 ; CHECK-NEXT:    fcmuo p5.h, p0/z, z19.h, z19.h
-; CHECK-NEXT:    fcmgt p2.h, p0/z, z5.h, z29.h
+; CHECK-NEXT:    fcmgt p2.h, p0/z, z5.h, z30.h
 ; CHECK-NEXT:    str z11, [x8, #10, mul vl]
-; CHECK-NEXT:    mov z30.d, p6/m, #0 // =0x0
-; CHECK-NEXT:    sel z3.d, p1, z28.d, z7.d
-; CHECK-NEXT:    fcmgt p4.h, p0/z, z6.h, z29.h
+; CHECK-NEXT:    mov z28.d, p6/m, #0 // =0x0
+; CHECK-NEXT:    sel z3.d, p1, z29.d, z7.d
+; CHECK-NEXT:    fcmgt p4.h, p0/z, z6.h, z30.h
 ; CHECK-NEXT:    fcmuo p3.h, p0/z, z27.h, z27.h
-; CHECK-NEXT:    str z30, [x8, #12, mul vl]
+; CHECK-NEXT:    str z28, [x8, #12, mul vl]
 ; CHECK-NEXT:    mov z26.d, p5/m, #0 // =0x0
-; CHECK-NEXT:    sel z7.d, p2, z28.d, z24.d
-; CHECK-NEXT:    fcmgt p6.h, p0/z, z31.h, z29.h
-; CHECK-NEXT:    fcmgt p1.h, p0/z, z1.h, z29.h
+; CHECK-NEXT:    sel z7.d, p2, z29.d, z24.d
+; CHECK-NEXT:    fcmgt p6.h, p0/z, z31.h, z30.h
+; CHECK-NEXT:    fcmgt p1.h, p0/z, z1.h, z30.h
 ; CHECK-NEXT:    str z26, [x8, #9, mul vl]
-; CHECK-NEXT:    sel z24.d, p4, z28.d, z25.d
+; CHECK-NEXT:    sel z24.d, p4, z29.d, z25.d
 ; CHECK-NEXT:    mov z9.d, p3/m, #0 // =0x0
 ; CHECK-NEXT:    fcmuo p5.h, p0/z, z31.h, z31.h
 ; CHECK-NEXT:    fcmuo p2.h, p0/z, z6.h, z6.h
-; CHECK-NEXT:    mov z2.d, p6/m, z28.d
+; CHECK-NEXT:    mov z2.d, p6/m, z29.d
 ; CHECK-NEXT:    str z9, [x8, #5, mul vl]
-; CHECK-NEXT:    mov z0.d, p1/m, z28.d
+; CHECK-NEXT:    mov z0.d, p1/m, z29.d
 ; CHECK-NEXT:    fcmuo p3.h, p0/z, z5.h, z5.h
 ; CHECK-NEXT:    fcmuo p4.h, p0/z, z4.h, z4.h
 ; CHECK-NEXT:    mov z2.d, p5/m, #0 // =0x0
diff --git a/llvm/test/CodeGen/AArch64/sve-vector-splat.ll b/llvm/test/CodeGen/AArch64/sve-vector-splat.ll
index 5cca5539048b5..1ceaa5ad27734 100644
--- a/llvm/test/CodeGen/AArch64/sve-vector-splat.ll
+++ b/llvm/test/CodeGen/AArch64/sve-vector-splat.ll
@@ -509,6 +509,294 @@ define <vscale x 2 x bfloat> @splat_nxv2bf16_imm() {
   ret <vscale x 2 x bfloat> splat(bfloat 1.0)
 }
 
+define <vscale x 2 x half> @splat_nzero_nxv2f16() {
+; CHECK-LABEL: splat_nzero_nxv2f16:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    dupm z0.h, #0x8000
+; CHECK-NEXT:    ret
+  ret <vscale x 2 x half> splat (half -0.0)
+}
+
+define <vscale x 4 x half> @splat_nzero_nxv4f16() {
+; CHECK-LABEL: splat_nzero_nxv4f16:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    dupm z0.h, #0x8000
+; CHECK-NEXT:    ret
+  ret <vscale x 4 x half> splat (half -0.0)
+}
+
+define <vscale x 8 x half> @splat_nzero_nxv8f16() {
+; CHECK-LABEL: splat_nzero_nxv8f16:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    dupm z0.h, #0x8000
+; CHECK-NEXT:    ret
+  ret <vscale x 8 x half> splat (half -0.0)
+}
+
+define <vscale x 2 x float> @splat_nzero_nxv2f32() {
+; CHECK-LABEL: splat_nzero_nxv2f32:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    mov z0.s, #0x80000000
+; CHECK-NEXT:    ret
+  ret <vscale x 2 x float> splat (float -0.0)
+}
+
+define <vscale x 4 x float> @splat_nzero_nxv4f32() {
+; CHECK-LABEL: splat_nzero_nxv4f32:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    mov z0.s, #0x80000000
+; CHECK-NEXT:    ret
+  ret <vscale x 4 x float> splat (float -0.0)
+}
+
+define <vscale x 2 x double> @splat_nzero_nxv2f64() {
+; CHECK-LABEL: splat_nzero_nxv2f64:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    mov z0.d, #0x8000000000000000
+; CHECK-NEXT:    ret
+  ret <vscale x 2 x double> splat (double -0.0)
+}
+
+define <vscale x 2 x bfloat> @splat_nzero_nxv2bf16() {
+; CHECK-LABEL: splat_nzero_nxv2bf16:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    dupm z0.h, #0x8000
+; CHECK-NEXT:    ret
+  ret <vscale x 2 x bfloat> splat (bfloat -0.0)
+}
+
+define <vscale x 4 x bfloat> @splat_nzero_nxv4bf16() {
+; CHECK-LABEL: splat_nzero_nxv4bf16:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    dupm z0.h, #0x8000
+; CHECK-NEXT:    ret
+  ret <vscale x 4 x bfloat> splat (bfloat -0.0)
+}
+
+define <vscale x 8 x bfloat> @splat_nzero_nxv8bf16() {
+; CHECK-LABEL: splat_nzero_nxv8bf16:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    dupm z0.h, #0x8000
+; CHECK-NEXT:    ret
+  ret <vscale x 8 x bfloat> splat (bfloat -0.0)
+}
+
+define <vscale x 2 x half> @splat_pinf_nxv2f16() {
+; CHECK-LABEL: splat_pinf_nxv2f16:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    dupm z0.h, #0x7c00
+; CHECK-NEXT:    ret
+  ret <vscale x 2 x half> splat (half 0x7FF0000000000000)
+}
+
+define <vscale x 4 x half> @splat_pinf_nxv4f16() {
+; CHECK-LABEL: splat_pinf_nxv4f16:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    dupm z0.h, #0x7c00
+; CHECK-NEXT:    ret
+  ret <vscale x 4 x half> splat (half 0x7FF0000000000000)
+}
+
+define <vscale x 8 x half> @splat_pinf_nxv8f16() {
+; CHECK-LABEL: splat_pinf_nxv8f16:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    dupm z0.h, #0x7c00
+; CHECK-NEXT:    ret
+  ret <vscale x 8 x half> splat (half 0x7FF0000000000000)
+}
+
+define <vscale x 2 x float> @splat_pinf_nxv2f32() {
+; CHECK-LABEL: splat_pinf_nxv2f32:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    mov z0.s, #0x7f800000
+; CHECK-NEXT:    ret
+  ret <vscale x 2 x float> splat (float 0x7FF0000000000000)
+}
+
+define <vscale x 4 x float> @splat_pinf_nxv4f32() {
+; CHECK-LABEL: splat_pinf_nxv4f32:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    mov z0.s, #0x7f800000
+; CHECK-NEXT:    ret
+  ret <vscale x 4 x float> splat (float 0x7FF0000000000000)
+}
+
+define <vscale x 2 x double> @splat_pinf_nxv2f64() {
+; CHECK-LABEL: splat_pinf_nxv2f64:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    mov z0.d, #0x7ff0000000000000
+; CHECK-NEXT:    ret
+  ret <vscale x 2 x double> splat (double 0x7FF0000000000000)
+}
+
+define <vscale x 2 x bfloat> @splat_pinf_nxv2bf16() {
+; CHECK-LABEL: splat_pinf_nxv2bf16:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    mov z0.h, #32640 // =0x7f80
+; CHECK-NEXT:    ret
+  ret <vscale x 2 x bfloat> splat (bfloat 0x7FF0000000000000)
+}
+
+define <vscale x 4 x bfloat> @splat_pinf_nxv4bf16() {
+; CHECK-LABEL: splat_pinf_nxv4bf16:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    mov z0.h, #32640 // =0x7f80
+; CHECK-NEXT:    ret
+  ret <vscale x 4 x bfloat> splat (bfloat 0x7FF0000000000000)
+}
+
+define <vscale x 8 x bfloat> @splat_pinf_nxv8bf16() {
+; CHECK-LABEL: splat_pinf_nxv8bf16:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    mov z0.h, #32640 // =0x7f80
+; CHECK-NEXT:    ret
+  ret <vscale x 8 x bfloat> splat (bfloat 0x7FF0000000000000)
+}
+
+define <vscale x 2 x half> @splat_ninf_nxv2f16() {
+; CHECK-LABEL: splat_ninf_nxv2f16:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    dupm z0.h, #0xfc00
+; CHECK-NEXT:    ret
+  ret <vscale x 2 x half> splat (half 0xFFF0000000000000)
+}
+
+define <vscale x 4 x half> @splat_ninf_nxv4f16() {
+; CHECK-LABEL: splat_ninf_nxv4f16:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    dupm z0.h, #0xfc00
+; CHECK-NEXT:    ret
+  ret <vscale x 4 x half> splat (half 0xFFF0000000000000)
+}
+
+define <vscale x 8 x half> @splat_ninf_nxv8f16() {
+; CHECK-LABEL: splat_ninf_nxv8f16:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    dupm z0.h, #0xfc00
+; CHECK-NEXT:    ret
+  ret <vscale x 8 x half> splat (half 0xFFF0000000000000)
+}
+
+define <vscale x 2 x float> @splat_ninf_nxv2f32() {
+; CHECK-LABEL: splat_ninf_nxv2f32:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    mov z0.s, #0xff800000
+; CHECK-NEXT:    ret
+  ret <vscale x 2 x float> splat (float 0xFFF0000000000000)
+}
+
+define <vscale x 4 x float> @splat_ninf_nxv4f32() {
+; CHECK-LABEL: splat_ninf_nxv4f32:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    mov z0.s, #0xff800000
+; CHECK-NEXT:    ret
+  ret <vscale x 4 x float> splat (float 0xFFF0000000000000)
+}
+
+define <vscale x 2 x double> @splat_ninf_nxv2f64() {
+; CHECK-LABEL: splat_ninf_nxv2f64:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    mov z0.d, #0xfff0000000000000
+; CHECK-NEXT:    ret
+  ret <vscale x 2 x double> splat (double 0xFFF0000000000000)
+}
+
+define <vscale x 2 x bfloat> @splat_ninf_nxv2bf16() {
+; CHECK-LABEL: splat_ninf_nxv2bf16:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    dupm z0.h, #0xff80
+; CHECK-NEXT:    ret
+  ret <vscale x 2 x bfloat> splat (bfloat 0xFFF0000000000000)
+}
+
+define <vscale x 4 x bfloat> @splat_ninf_nxv4bf16() {
+; CHECK-LABEL: splat_ninf_nxv4bf16:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    dupm z0.h, #0xff80
+; CHECK-NEXT:    ret
+  ret <vscale x 4 x bfloat> splat (bfloat 0xFFF0000000000000)
+}
+
+define <vscale x 8 x bfloat> @splat_ninf_nxv8bf16() {
+; CHECK-LABEL: splat_ninf_nxv8bf16:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    dupm z0.h, #0xff80
+; CHECK-NEXT:    ret
+  ret <vscale x 8 x bfloat> splat (bfloat 0xFFF0000000000000)
+}
+
+define <vscale x 2 x half> @splat_nan_nxv2f16() {
+; CHECK-LABEL: splat_nan_nxv2f16:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    dupm z0.h, #0x7e00
+; CHECK-NEXT:    ret
+  ret <vscale x 2 x half> splat (half 0x7FF8000000000000)
+}
+
+define <vscale x 4 x half> @splat_nan_nxv4f16() {
+; CHECK-LABEL: splat_nan_nxv4f16:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    dupm z0.h, #0x7e00
+; CHECK-NEXT:    ret
+  ret <vscale x 4 x half> splat (half 0x7FF8000000000000)
+}
+
+define <vscale x 8 x half> @splat_nan_nxv8f16() {
+; CHECK-LABEL: splat_nan_nxv8f16:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    dupm z0.h, #0x7e00
+; CHECK-NEXT:    ret
+  ret <vscale x 8 x half> splat (half 0x7FF8000000000000)
+}
+
+define <vscale x 2 x float> @splat_nan_nxv2f32() {
+; CHECK-LABEL: splat_nan_nxv2f32:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    mov z0.s, #0x7fc00000
+; CHECK-NEXT:    ret
+  ret <vscale x 2 x float> splat (float 0x7FF8000000000000)
+}
+
+define <vscale x 4 x float> @splat_nan_nxv4f32() {
+; CHECK-LABEL: splat_nan_nxv4f32:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    mov z0.s, #0x7fc00000
+; CHECK-NEXT:    ret
+  ret <vscale x 4 x float> splat (float 0x7FF8000000000000)
+}
+
+define <vscale x 2 x double> @splat_nan_nxv2f64() {
+; CHECK-LABEL: splat_nan_nxv2f64:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    mov z0.d, #0x7ff8000000000000
+; CHECK-NEXT:    ret
+  ret <vscale x 2 x double> splat (double 0x7FF8000000000000)
+}
+
+define <vscale x 2 x bfloat> @splat_nan_nxv2bf16() {
+; CHECK-LABEL: splat_nan_nxv2bf16:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    mov z0.h, #32704 // =0x7fc0
+; CHECK-NEXT:    ret
+  ret <vscale x 2 x bfloat> splat (bfloat 0x7FF8000000000000)
+}
+
+define <vscale x 4 x bfloat> @splat_nan_nxv4bf16() {
+; CHECK-LABEL: splat_nan_nxv4bf16:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    mov z0.h, #32704 // =0x7fc0
+; CHECK-NEXT:    ret
+  ret <vscale x 4 x bfloat> splat (bfloat 0x7FF8000000000000)
+}
+
+define <vscale x 8 x bfloat> @splat_nan_nxv8bf16() {
+; CHECK-LABEL: splat_nan_nxv8bf16:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    mov z0.h, #32704 // =0x7fc0
+; CHECK-NEXT:    ret
+  ret <vscale x 8 x bfloat> splat (bfloat 0x7FF8000000000000)
+}
+
 define <vscale x 4 x i32> @splat_nxv4i32_fold(<vscale x 4 x i32> %x) {
 ; CHECK-LABEL: splat_nxv4i32_fold:
 ; CHECK:       // %bb.0:
@@ -581,8 +869,8 @@ define <vscale x 2 x double> @splat_nxv2f64_imm_out_of_range() {
 ; CHECK-LABEL: splat_nxv2f64_imm_out_of_range:
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    ptrue p0.d
-; CHECK-NEXT:    adrp x8, .LCPI60_0
-; CHECK-NEXT:    add x8, x8, :lo12:.LCPI60_0
+; CHECK-NEXT:    adrp x8, .LCPI96_0
+; CHECK-NEXT:    add x8, x8, :lo12:.LCPI96_0
 ; CHECK-NEXT:    ld1rd { z0.d }, p0/z, [x8]
 ; CHECK-NEXT:    ret
   ret <vscale x 2 x double> splat(double 3.33)
diff --git a/llvm/test/CodeGen/AArch64/sve-vselect-imm.ll b/llvm/test/CodeGen/AArch64/sve-vselect-imm.ll
index 6b5b3d6d436cb..b04029c273ae2 100644
--- a/llvm/test/CodeGen/AArch64/sve-vselect-imm.ll
+++ b/llvm/test/CodeGen/AArch64/sve-vselect-imm.ll
@@ -338,8 +338,7 @@ ret <vscale x 2 x double> %sel
 define <vscale x 8 x half> @sel_merge_nxv8f16_negative_zero(<vscale x 8 x i1> %p, <vscale x 8 x half> %in) {
 ; CHECK-LABEL: sel_merge_nxv8f16_negative_zero:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    mov w8, #32768 // =0x8000
-; CHECK-NEXT:    mov z1.h, w8
+; CHECK-NEXT:    dupm z1.h, #0x8000
 ; CHECK-NEXT:    mov z0.h, p0/m, z1.h
 ; CHECK-NEXT:    ret
 %sel = select <vscale x 8 x i1> %p, <vscale x 8 x half> splat (half -0.0), <vscale x 8 x half> %in
@@ -349,8 +348,7 @@ ret <vscale x 8 x half> %sel
 define <vscale x 4 x half> @sel_merge_nx4f16_negative_zero(<vscale x 4 x i1> %p, <vscale x 4 x half> %in) {
 ; CHECK-LABEL: sel_merge_nx4f16_negative_zero:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    mov w8, #32768 // =0x8000
-; CHECK-NEXT:    mov z1.h, w8
+; CHECK-NEXT:    dupm z1.h, #0x8000
 ; CHECK-NEXT:    mov z0.s, p0/m, z1.s
 ; CHECK-NEXT:    ret
 %sel = select <vscale x 4 x i1> %p, <vscale x 4 x half> splat (half -0.0), <vscale x 4 x half> %in
@@ -360,8 +358,7 @@ ret <vscale x 4 x half> %sel
 define <vscale x 2 x half> @sel_merge_nx2f16_negative_zero(<vscale x 2 x i1> %p, <vscale x 2 x half> %in) {
 ; CHECK-LABEL: sel_merge_nx2f16_negative_zero:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    mov w8, #32768 // =0x8000
-; CHECK-NEXT:    mov z1.h, w8
+; CHECK-NEXT:    dupm z1.h, #0x8000
 ; CHECK-NEXT:    mov z0.d, p0/m, z1.d
 ; CHECK-NEXT:    ret
 %sel = select <vscale x 2 x i1> %p, <vscale x 2 x half> splat (half -0.0), <vscale x 2 x half> %in
@@ -371,8 +368,7 @@ ret <vscale x 2 x half> %sel
 define <vscale x 4 x float> @sel_merge_nx4f32_negative_zero(<vscale x 4 x i1> %p, <vscale x 4 x float> %in) {
 ; CHECK-LABEL: sel_merge_nx4f32_negative_zero:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    mov w8, #-2147483648 // =0x80000000
-; CHECK-NEXT:    mov z1.s, w8
+; CHECK-NEXT:    mov z1.s, #0x80000000
 ; CHECK-NEXT:    mov z0.s, p0/m, z1.s
 ; CHECK-NEXT:    ret
 %sel = select <vscale x 4 x i1> %p, <vscale x 4 x float> splat (float -0.0), <vscale x 4 x float> %in
@@ -382,8 +378,7 @@ ret <vscale x 4 x float> %sel
 define <vscale x 2 x float> @sel_merge_nx2f32_negative_zero(<vscale x 2 x i1> %p, <vscale x 2 x float> %in) {
 ; CHECK-LABEL: sel_merge_nx2f32_negative_zero:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    mov w8, #-2147483648 // =0x80000000
-; CHECK-NEXT:    mov z1.s, w8
+; CHECK-NEXT:    mov z1.s, #0x80000000
 ; CHECK-NEXT:    mov z0.d, p0/m, z1.d
 ; CHECK-NEXT:    ret
 %sel = select <vscale x 2 x i1> %p, <vscale x 2 x float> splat (float -0.0), <vscale x 2 x float> %in
@@ -393,8 +388,7 @@ ret <vscale x 2 x float> %sel
 define <vscale x 2 x double> @sel_merge_nx2f64_negative_zero(<vscale x 2 x i1> %p, <vscale x 2 x double> %in) {
 ; CHECK-LABEL: sel_merge_nx2f64_negative_zero:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    mov x8, #-9223372036854775808 // =0x8000000000000000
-; CHECK-NEXT:    mov z1.d, x8
+; CHECK-NEXT:    mov z1.d, #0x8000000000000000
 ; CHECK-NEXT:    mov z0.d, p0/m, z1.d
 ; CHECK-NEXT:    ret
 %sel = select <vscale x 2 x i1> %p, <vscale x 2 x double> splat (double -0.0), <vscale x 2 x double> %in

From 22a2cae5d6735a510b17859848b14f60d2e5cdfa Mon Sep 17 00:00:00 2001
From: Guillot Tony <tony.guillot@protonmail.com>
Date: Tue, 18 Nov 2025 13:36:51 +0100
Subject: [PATCH 02/33] [Clang] Fix cleanup attribute by delaying type checks
 after the type is deduced (#164440)

Previously, the handling of the `cleanup` attribute had some checks
based on the type, but we were deducing the type after handling the
attribute.
This PR fixes the way we deal with type checks for the `cleanup`
attribute by delaying these checks until after the type has been deduced.

The fix is structured so that the solution can be adapted for other
attributes that do type-based checks.
This is the list of C/C++ attributes that are doing type based checks
and will need to be fixed in additional PRs:
- CUDAShared
- MutualExclusions
- PassObjectSize
- InitPriority
- Sentinel
- AcquireCapability
- RequiresCapability
- LocksExcluded
- AcquireHandle

NB: Some attributes could have been missed in my shallow search.

Fixes #129631
---
 clang/docs/ReleaseNotes.rst                   |  1 +
 clang/include/clang/Basic/Attr.td             | 12 +++++++
 clang/include/clang/Sema/CMakeLists.txt       |  5 +++
 clang/include/clang/Sema/Sema.h               |  8 +++++
 clang/lib/Sema/SemaDecl.cpp                   |  9 +++++
 clang/lib/Sema/SemaDeclAttr.cpp               | 35 +++++++++++++------
 .../lib/Sema/SemaTemplateInstantiateDecl.cpp  |  9 +++++
 clang/test/Sema/type-dependent-attrs.c        | 10 ++++++
 clang/test/SemaCXX/attr-cleanup.cpp           | 25 +++++++++++++
 clang/utils/TableGen/ClangAttrEmitter.cpp     | 20 +++++++++++
 clang/utils/TableGen/TableGen.cpp             |  7 ++++
 clang/utils/TableGen/TableGenBackends.h       |  2 ++
 llvm/docs/TableGen/BackEnds.rst               |  7 ++++
 13 files changed, 140 insertions(+), 10 deletions(-)
 create mode 100644 clang/test/Sema/type-dependent-attrs.c

diff --git a/clang/docs/ReleaseNotes.rst b/clang/docs/ReleaseNotes.rst
index 7459127670cc3..c2da61e4d066a 100644
--- a/clang/docs/ReleaseNotes.rst
+++ b/clang/docs/ReleaseNotes.rst
@@ -500,6 +500,7 @@ Bug Fixes to Attribute Support
 - Fixes crashes or missing diagnostics with the `device_kernel` attribute. (#GH161905)
 - Fix handling of parameter indexes when an attribute is applied to a C++23 explicit object member function.
 - Fixed several false positives and false negatives in function effect (`nonblocking`) analysis. (#GH166078) (#GH166101) (#GH166110)
+- Fix ``cleanup`` attribute by delaying type checks until after the type is deduced. (#GH129631)
 
 Bug Fixes to C++ Support
 ^^^^^^^^^^^^^^^^^^^^^^^^
diff --git a/clang/include/clang/Basic/Attr.td b/clang/include/clang/Basic/Attr.td
index 8dfe4bc08c48e..0097476bc0d8d 100644
--- a/clang/include/clang/Basic/Attr.td
+++ b/clang/include/clang/Basic/Attr.td
@@ -741,6 +741,17 @@ class Attr {
   // our existing general parsing we need to have a separate flag that
   // opts an attribute into strict parsing of attribute parameters
   bit StrictEnumParameters = 0;
+  // Set to true for attributes which have Sema checks that require the type
+  // to be deduced.
+  // When `IsTypeDependent` is set to true, you should add an `ActOn*Attr`
+  // function to `Sema.h`. The signature of the function must be:
+  // `void ActOn*Attr(Decl *, const Attr *);` where the `Decl *` is the
+  // declaration the attribute will be attached to; its type will have already
+  // been deduced, and the `Attr *` is the attribute being applied to that
+  // declaration. This function should handle all type-sensitive semantics for
+  // the attribute. This function will be automatically called by
+  // `Sema::CheckAttributesOnDeducedType()`.
+  bit IsTypeDependent = 0;
   // Lists language options, one of which is required to be true for the
   // attribute to be applicable. If empty, no language options are required.
   list<LangOpt> LangOpts = [];
@@ -1400,6 +1411,7 @@ def Cleanup : InheritableAttr {
   let Args = [DeclArgument<Function, "FunctionDecl">];
   let Subjects = SubjectList<[LocalVar]>;
   let Documentation = [CleanupDocs];
+  let IsTypeDependent = 1;
   // FIXME: DeclArgument should be reworked to also store the
   // Expr instead of adding attr specific hacks like the following.
   // See the discussion in https://github.com/llvm/llvm-project/pull/14023.
diff --git a/clang/include/clang/Sema/CMakeLists.txt b/clang/include/clang/Sema/CMakeLists.txt
index 9077e22c2307c..3f540ea596871 100644
--- a/clang/include/clang/Sema/CMakeLists.txt
+++ b/clang/include/clang/Sema/CMakeLists.txt
@@ -8,6 +8,11 @@ clang_tablegen(AttrParsedAttrKinds.inc -gen-clang-attr-parsed-attr-kinds
   SOURCE ../Basic/Attr.td
   TARGET ClangAttrParsedAttrKinds)
 
+clang_tablegen(AttrIsTypeDependent.inc -gen-clang-attr-is-type-dependent
+  -I ${CMAKE_CURRENT_SOURCE_DIR}/../../
+  SOURCE ../Basic/Attr.td
+  TARGET ClangAttrIsTypeDependent)
+
 clang_tablegen(AttrSpellingListIndex.inc -gen-clang-attr-spelling-index
   -I ${CMAKE_CURRENT_SOURCE_DIR}/../../
   SOURCE ../Basic/Attr.td
diff --git a/clang/include/clang/Sema/Sema.h b/clang/include/clang/Sema/Sema.h
index 6ca182338d6af..fd2a2469142e4 100644
--- a/clang/include/clang/Sema/Sema.h
+++ b/clang/include/clang/Sema/Sema.h
@@ -4456,6 +4456,10 @@ class Sema final : public SemaBase {
       NamedDecl *New, Decl *Old,
       AvailabilityMergeKind AMK = AvailabilityMergeKind::Redeclaration);
 
+  /// CheckAttributesOnDeducedType - Calls Sema functions for attributes that
+  /// require the type to be deduced.
+  void CheckAttributesOnDeducedType(Decl *D);
+
   /// MergeTypedefNameDecl - We just parsed a typedef 'New' which has the
   /// same name and scope as a previous declaration 'Old'.  Figure out
   /// how to resolve this situation, merging decls or emitting
@@ -4760,6 +4764,8 @@ class Sema final : public SemaBase {
   // linkage or not.
   static bool mightHaveNonExternalLinkage(const DeclaratorDecl *FD);
 
+#include "clang/Sema/AttrIsTypeDependent.inc"
+
   ///@}
 
   //
@@ -15469,6 +15475,8 @@ class Sema final : public SemaBase {
   std::optional<FunctionEffectMode>
   ActOnEffectExpression(Expr *CondExpr, StringRef AttributeName);
 
+  void ActOnCleanupAttr(Decl *D, const Attr *A);
+
 private:
   /// The implementation of RequireCompleteType
   bool RequireCompleteTypeImpl(SourceLocation Loc, QualType T,
diff --git a/clang/lib/Sema/SemaDecl.cpp b/clang/lib/Sema/SemaDecl.cpp
index 25b89d65847ad..b7aecadc86871 100644
--- a/clang/lib/Sema/SemaDecl.cpp
+++ b/clang/lib/Sema/SemaDecl.cpp
@@ -3355,6 +3355,11 @@ void Sema::mergeDeclAttributes(NamedDecl *New, Decl *Old,
   if (!foundAny) New->dropAttrs();
 }
 
+void Sema::CheckAttributesOnDeducedType(Decl *D) {
+  for (const Attr *A : llvm::to_vector(D->attrs())) // Handlers may drop attrs.
+    checkAttrIsTypeDependent(D, A);
+}
+
 // Returns the number of added attributes.
 template <class T>
 static unsigned propagateAttribute(ParmVarDecl *To, const ParmVarDecl *From,
@@ -13809,6 +13814,8 @@ void Sema::AddInitializerToDecl(Decl *RealDecl, Expr *Init, bool DirectInit) {
       return;
   }
 
+  this->CheckAttributesOnDeducedType(RealDecl);
+
   // dllimport cannot be used on variable definitions.
   if (VDecl->hasAttr<DLLImportAttr>() && !VDecl->isStaticDataMember()) {
     Diag(VDecl->getLocation(), diag::err_attribute_dllimport_data_definition);
@@ -14300,6 +14307,8 @@ void Sema::ActOnUninitializedDecl(Decl *RealDecl) {
         DeduceVariableDeclarationType(Var, false, nullptr))
       return;
 
+    this->CheckAttributesOnDeducedType(RealDecl);
+
     // C++11 [class.static.data]p3: A static data member can be declared with
     // the constexpr specifier; if so, its declaration shall specify
     // a brace-or-equal-initializer.
diff --git a/clang/lib/Sema/SemaDeclAttr.cpp b/clang/lib/Sema/SemaDeclAttr.cpp
index a9e7b44ac9d73..bda7aa32a9348 100644
--- a/clang/lib/Sema/SemaDeclAttr.cpp
+++ b/clang/lib/Sema/SemaDeclAttr.cpp
@@ -3511,16 +3511,6 @@ static void handleCleanupAttr(Sema &S, Decl *D, const ParsedAttr &AL) {
     return;
   }
 
-  // We're currently more strict than GCC about what function types we accept.
-  // If this ever proves to be a problem it should be easy to fix.
-  QualType Ty = S.Context.getPointerType(cast<VarDecl>(D)->getType());
-  QualType ParamTy = FD->getParamDecl(0)->getType();
-  if (!S.IsAssignConvertCompatible(S.CheckAssignmentConstraints(
-          FD->getParamDecl(0)->getLocation(), ParamTy, Ty))) {
-    S.Diag(Loc, diag::err_attribute_cleanup_func_arg_incompatible_type)
-      << NI.getName() << ParamTy << Ty;
-    return;
-  }
   VarDecl *VD = cast<VarDecl>(D);
   // Create a reference to the variable declaration. This is a fake/dummy
   // reference.
@@ -8311,3 +8301,28 @@ void Sema::redelayDiagnostics(DelayedDiagnosticPool &pool) {
   assert(curPool && "re-emitting in undelayed context not supported");
   curPool->steal(pool);
 }
+
+void Sema::ActOnCleanupAttr(Decl *D, const Attr *A) {
+  // Type-sensitive half of the `cleanup` check; run once D's type is deduced.
+  auto *VD = cast<VarDecl>(D);
+  if (VD->getType()->isDependentType())
+    return; // Nothing to check until the type is no longer dependent.
+
+  // Use the attribute handed to us (rather than re-querying D) to obtain the
+  // FunctionDecl that was found when handling the attribute earlier.
+  const auto *CA = cast<CleanupAttr>(A);
+  FunctionDecl *FD = CA->getFunctionDecl();
+  const ParmVarDecl *Param = FD->getParamDecl(0);
+
+  // We're currently more strict than GCC about what function types we accept.
+  // If this ever proves to be a problem it should be easy to fix.
+  QualType Ty = Context.getPointerType(VD->getType());
+  QualType ParamTy = Param->getType();
+  if (IsAssignConvertCompatible(
+          CheckAssignmentConstraints(Param->getLocation(), ParamTy, Ty)))
+    return;
+
+  Diag(CA->getArgLoc(), diag::err_attribute_cleanup_func_arg_incompatible_type)
+      << FD->getNameInfo().getName() << ParamTy << Ty;
+  D->dropAttr<CleanupAttr>(); // Don't keep an attribute that failed checking.
+}
diff --git a/clang/lib/Sema/SemaTemplateInstantiateDecl.cpp b/clang/lib/Sema/SemaTemplateInstantiateDecl.cpp
index 1b6b559c1227b..3a4b2ccc74350 100644
--- a/clang/lib/Sema/SemaTemplateInstantiateDecl.cpp
+++ b/clang/lib/Sema/SemaTemplateInstantiateDecl.cpp
@@ -1007,6 +1007,15 @@ void Sema::InstantiateAttrs(const MultiLevelTemplateArgumentList &TemplateArgs,
       continue;
     }
 
+    if (auto *A = dyn_cast<CleanupAttr>(TmplAttr)) {
+      if (!New->hasAttr<CleanupAttr>()) {
+        auto *NewAttr = A->clone(Context);
+        NewAttr->setArgLoc(A->getArgLoc());
+        New->addAttr(NewAttr);
+      }
+      continue;
+    }
+
     assert(!TmplAttr->isPackExpansion());
     if (TmplAttr->isLateParsed() && LateAttrs) {
       // Late parsed attributes must be instantiated and attached after the
diff --git a/clang/test/Sema/type-dependent-attrs.c b/clang/test/Sema/type-dependent-attrs.c
new file mode 100644
index 0000000000000..13068b3f94ad4
--- /dev/null
+++ b/clang/test/Sema/type-dependent-attrs.c
@@ -0,0 +1,10 @@
+// RUN: %clang_cc1 -std=c23 -fsyntax-only -verify %s
+
+int open() { return 0; }
+void close(typeof(open()) *) {}
+
+void cleanup_attr() {
+  int fd_int [[gnu::cleanup(close)]] = open();
+  auto fd_auto [[gnu::cleanup(close)]] = open();
+  float fd_invalid [[gnu::cleanup(close)]] = open(); // expected-error {{'cleanup' function 'close' parameter has type 'typeof (open()) *' (aka 'int *') which is incompatible with type 'float *'}}
+}
diff --git a/clang/test/SemaCXX/attr-cleanup.cpp b/clang/test/SemaCXX/attr-cleanup.cpp
index 32d10683edebb..6048b4e92ec3f 100644
--- a/clang/test/SemaCXX/attr-cleanup.cpp
+++ b/clang/test/SemaCXX/attr-cleanup.cpp
@@ -27,3 +27,28 @@ namespace E {
     int v1 __attribute__((cleanup(c3))); // expected-error {{'c3' is not a single function}}
   }
 }
+
+namespace F {
+  int open() { return 0; }
+  void close(decltype(open()) *) {}
+
+  void test1() {
+    auto fd [[gnu::cleanup(close)]] = open();
+  }
+
+  template <typename Ty>
+  void test2() {
+    Ty fd [[gnu::cleanup(close)]] = open();
+  }
+
+  template <typename Ty>
+  void test3() {
+    Ty fd [[gnu::cleanup(close)]] = open(); // #TEST3_CLEANUP
+  }
+
+  int main() {
+    test2<int>();
+    test3<float>(); // expected-error@#TEST3_CLEANUP {{'cleanup' function 'close' parameter has type 'decltype(open()) *' (aka 'int *') which is incompatible with type 'float *'}} \
+                       expected-note {{in instantiation of function template specialization 'F::test3<float>' requested here}}
+  }
+}
diff --git a/clang/utils/TableGen/ClangAttrEmitter.cpp b/clang/utils/TableGen/ClangAttrEmitter.cpp
index e49dcb9b70b0f..bee9a01a3b01a 100644
--- a/clang/utils/TableGen/ClangAttrEmitter.cpp
+++ b/clang/utils/TableGen/ClangAttrEmitter.cpp
@@ -5045,6 +5045,26 @@ void EmitClangAttrParsedAttrKinds(const RecordKeeper &Records,
      << "}\n";
 }
 
+// Emits checkAttrIsTypeDependent(), which dispatches to the ActOn*Attr hooks.
+void EmitClangAttrIsTypeDependent(const RecordKeeper &Records,
+                                  raw_ostream &OS) {
+  emitSourceFileHeader("Attribute is type dependent", OS, Records);
+
+  OS << "void checkAttrIsTypeDependent(Decl *D, const Attr *A) {\n";
+  OS << "  switch (A->getKind()) {\n";
+  OS << "  default:\n"; // Attributes without the flag need no late check.
+  OS << "    break;\n";
+  for (const auto *A : Records.getAllDerivedDefinitions("Attr")) {
+    if (A->getValueAsBit("IsTypeDependent")) { // Opt-in flag from Attr.td.
+      OS << "  case attr::" << A->getName() << ":\n";
+      OS << "    ActOn" << A->getName() << "Attr(D, A);\n";
+      OS << "    break;\n";
+    }
+  }
+  OS << "  }\n";
+  OS << "}\n";
+}
+
 // Emits the code to dump an attribute.
 void EmitClangAttrTextNodeDump(const RecordKeeper &Records, raw_ostream &OS) {
   emitSourceFileHeader("Attribute text node dumper", OS, Records);
diff --git a/clang/utils/TableGen/TableGen.cpp b/clang/utils/TableGen/TableGen.cpp
index 866040d503646..707ce617cb2d0 100644
--- a/clang/utils/TableGen/TableGen.cpp
+++ b/clang/utils/TableGen/TableGen.cpp
@@ -43,6 +43,7 @@ enum ActionType {
   GenClangAttrParsedAttrList,
   GenClangAttrParsedAttrImpl,
   GenClangAttrParsedAttrKinds,
+  GenClangAttrIsTypeDependent,
   GenClangAttrTextNodeDump,
   GenClangAttrNodeTraverse,
   GenClangBasicReader,
@@ -179,6 +180,9 @@ cl::opt<ActionType> Action(
         clEnumValN(GenClangAttrParsedAttrKinds,
                    "gen-clang-attr-parsed-attr-kinds",
                    "Generate a clang parsed attribute kinds"),
+        clEnumValN(GenClangAttrIsTypeDependent,
+                   "gen-clang-attr-is-type-dependent",
+                   "Generate clang is type dependent attribute code"),
         clEnumValN(GenClangAttrTextNodeDump, "gen-clang-attr-text-node-dump",
                    "Generate clang attribute text node dumper"),
         clEnumValN(GenClangAttrNodeTraverse, "gen-clang-attr-node-traverse",
@@ -423,6 +427,9 @@ bool ClangTableGenMain(raw_ostream &OS, const RecordKeeper &Records) {
   case GenClangAttrParsedAttrKinds:
     EmitClangAttrParsedAttrKinds(Records, OS);
     break;
+  case GenClangAttrIsTypeDependent:
+    EmitClangAttrIsTypeDependent(Records, OS);
+    break;
   case GenClangAttrTextNodeDump:
     EmitClangAttrTextNodeDump(Records, OS);
     break;
diff --git a/clang/utils/TableGen/TableGenBackends.h b/clang/utils/TableGen/TableGenBackends.h
index fa49dcd289bc2..058bda3ebd246 100644
--- a/clang/utils/TableGen/TableGenBackends.h
+++ b/clang/utils/TableGen/TableGenBackends.h
@@ -82,6 +82,8 @@ void EmitClangAttrParsedAttrImpl(const llvm::RecordKeeper &Records,
                                  llvm::raw_ostream &OS);
 void EmitClangAttrParsedAttrKinds(const llvm::RecordKeeper &Records,
                                   llvm::raw_ostream &OS);
+void EmitClangAttrIsTypeDependent(const llvm::RecordKeeper &Records,
+                                  llvm::raw_ostream &OS);
 void EmitClangAttrTextNodeDump(const llvm::RecordKeeper &Records,
                                llvm::raw_ostream &OS);
 void EmitClangAttrNodeTraverse(const llvm::RecordKeeper &Records,
diff --git a/llvm/docs/TableGen/BackEnds.rst b/llvm/docs/TableGen/BackEnds.rst
index 7f571378860b2..1e3cb8783df16 100644
--- a/llvm/docs/TableGen/BackEnds.rst
+++ b/llvm/docs/TableGen/BackEnds.rst
@@ -355,6 +355,13 @@ ClangAttrParsedAttrKinds
 ``AttributeList::getKind`` function, mapping a string (and syntax) to a parsed
 attribute ``AttributeList::Kind`` enumeration.
 
+ClangAttrIsTypeDependent
+------------------------
+
+**Purpose**: Creates ``AttrIsTypeDependent.inc``, which is used to implement the
+``Sema::CheckAttributesOnDeducedType`` function, mapping an attribute kind to a
+Sema function if it exists.
+
 ClangAttrDump
 -------------
 

From 0be4218d7b7080fec73fe13bc759439d49159c05 Mon Sep 17 00:00:00 2001
From: Ivan Kosarev <ivan.kosarev@amd.com>
Date: Tue, 18 Nov 2025 12:41:53 +0000
Subject: [PATCH 03/33] [CMake] Declare all parts of *GenRegisterInfo.inc as
 outputs. (#168405)

This tells the build system to check and regenerate the
*GenRegisterInfo*.inc files, should any of them be missing for
whatever reason.

A follow-up from
<https://github.com/llvm/llvm-project/pull/167700>.
---
 llvm/cmake/modules/TableGen.cmake           | 12 +++++++++++-
 llvm/utils/TableGen/RegisterInfoEmitter.cpp |  2 ++
 2 files changed, 13 insertions(+), 1 deletion(-)

diff --git a/llvm/cmake/modules/TableGen.cmake b/llvm/cmake/modules/TableGen.cmake
index 9a2e73a1e3718..84c03cd6432ed 100644
--- a/llvm/cmake/modules/TableGen.cmake
+++ b/llvm/cmake/modules/TableGen.cmake
@@ -66,6 +66,16 @@ function(tablegen project ofn)
     list(APPEND LLVM_TABLEGEN_FLAGS "-omit-comments")
   endif()
 
+  set(EXTRA_OUTPUTS)
+  if("-gen-register-info" IN_LIST ARGN)
+    cmake_path(GET ofn STEM OUTPUT_BASENAME)
+    list(APPEND EXTRA_OUTPUTS
+         ${CMAKE_CURRENT_BINARY_DIR}/${OUTPUT_BASENAME}Enums.inc
+         ${CMAKE_CURRENT_BINARY_DIR}/${OUTPUT_BASENAME}Header.inc
+         ${CMAKE_CURRENT_BINARY_DIR}/${OUTPUT_BASENAME}MCDesc.inc
+         ${CMAKE_CURRENT_BINARY_DIR}/${OUTPUT_BASENAME}TargetDesc.inc)
+  endif()
+
   # MSVC can't support long string literals ("long" > 65534 bytes)[1], so if there's
   # a possibility of generated tables being consumed by MSVC, generate arrays of
   # char literals, instead. If we're cross-compiling, then conservatively assume
@@ -126,7 +136,7 @@ function(tablegen project ofn)
     set(LLVM_TABLEGEN_JOB_POOL "")
   endif()
 
-  add_custom_command(OUTPUT ${CMAKE_CURRENT_BINARY_DIR}/${ofn}
+  add_custom_command(OUTPUT ${CMAKE_CURRENT_BINARY_DIR}/${ofn} ${EXTRA_OUTPUTS}
     COMMAND ${tablegen_exe} ${ARG_UNPARSED_ARGUMENTS}
     ${tblgen_includes}
     ${LLVM_TABLEGEN_FLAGS}
diff --git a/llvm/utils/TableGen/RegisterInfoEmitter.cpp b/llvm/utils/TableGen/RegisterInfoEmitter.cpp
index ef7b13e8940f8..3486a7a7fb08c 100644
--- a/llvm/utils/TableGen/RegisterInfoEmitter.cpp
+++ b/llvm/utils/TableGen/RegisterInfoEmitter.cpp
@@ -1878,6 +1878,8 @@ TableGenOutputFiles RegisterInfoEmitter::run(StringRef FilenamePrefix) {
   if (RegisterInfoDebug)
     debugDump(errs());
 
+  // The suffixes should be in sync with the tablegen function in
+  // llvm/cmake/modules/TableGen.cmake.
   return {Main,
           {{"Enums.inc", Enums},
            {"MCDesc.inc", MCDesc},

From 3c87119a910e95396b26c519fa90d63a59442267 Mon Sep 17 00:00:00 2001
From: Ivan Kosarev <ivan.kosarev@amd.com>
Date: Tue, 18 Nov 2025 12:43:10 +0000
Subject: [PATCH 04/33] [TableGen][NFCI] Change TableGenMain() to take
 function_ref. (#167888)

It was switched from a function pointer to std::function in

TableGen: Make 2nd arg MainFn of TableGenMain(argv0, MainFn) optional.
f675ec6165ab6add5e57cd43a2e9fa1a9bc21d81

but there's no mention of any particular reason for that.
---
 llvm/include/llvm/TableGen/Main.h      | 14 ++++++--------
 llvm/lib/TableGen/Main.cpp             |  6 ++----
 llvm/utils/TableGen/Basic/TableGen.cpp |  2 +-
 3 files changed, 9 insertions(+), 13 deletions(-)

diff --git a/llvm/include/llvm/TableGen/Main.h b/llvm/include/llvm/TableGen/Main.h
index bafce3a463acc..daede9f5a46f0 100644
--- a/llvm/include/llvm/TableGen/Main.h
+++ b/llvm/include/llvm/TableGen/Main.h
@@ -14,7 +14,6 @@
 #define LLVM_TABLEGEN_MAIN_H
 
 #include "llvm/Support/CommandLine.h"
-#include <functional>
 #include <map>
 
 namespace llvm {
@@ -30,18 +29,17 @@ struct TableGenOutputFiles {
 };
 
 /// Returns true on error, false otherwise.
-using TableGenMainFn = bool(raw_ostream &OS, const RecordKeeper &Records);
+using TableGenMainFn =
+    function_ref<bool(raw_ostream &OS, const RecordKeeper &Records)>;
 
 /// Perform the action using Records, and store output in OutFiles.
 /// Returns true on error, false otherwise.
-using MultiFileTableGenMainFn = bool(TableGenOutputFiles &OutFiles,
-                                     const RecordKeeper &Records);
+using MultiFileTableGenMainFn = function_ref<bool(TableGenOutputFiles &OutFiles,
+                                                  const RecordKeeper &Records)>;
 
-int TableGenMain(const char *argv0,
-                 std::function<TableGenMainFn> MainFn = nullptr);
+int TableGenMain(const char *argv0, TableGenMainFn MainFn = nullptr);
 
-int TableGenMain(const char *argv0,
-                 std::function<MultiFileTableGenMainFn> MainFn = nullptr);
+int TableGenMain(const char *argv0, MultiFileTableGenMainFn MainFn = nullptr);
 
 /// Controls emitting large character arrays as strings or character arrays.
 /// Typically set to false when building with MSVC.
diff --git a/llvm/lib/TableGen/Main.cpp b/llvm/lib/TableGen/Main.cpp
index 3330b70cdc2e1..939e9c6bf5d2f 100644
--- a/llvm/lib/TableGen/Main.cpp
+++ b/llvm/lib/TableGen/Main.cpp
@@ -127,8 +127,7 @@ static int WriteOutput(const TGParser &Parser, const char *argv0,
   return 0;
 }
 
-int llvm::TableGenMain(const char *argv0,
-                       std::function<MultiFileTableGenMainFn> MainFn) {
+int llvm::TableGenMain(const char *argv0, MultiFileTableGenMainFn MainFn) {
   RecordKeeper Records;
   TGTimer &Timer = Records.getTimer();
 
@@ -209,8 +208,7 @@ int llvm::TableGenMain(const char *argv0,
   return 0;
 }
 
-int llvm::TableGenMain(const char *argv0,
-                       std::function<TableGenMainFn> MainFn) {
+int llvm::TableGenMain(const char *argv0, TableGenMainFn MainFn) {
   return TableGenMain(argv0, [&MainFn](TableGenOutputFiles &OutFiles,
                                        const RecordKeeper &Records) {
     std::string S;
diff --git a/llvm/utils/TableGen/Basic/TableGen.cpp b/llvm/utils/TableGen/Basic/TableGen.cpp
index b79ae93dab4f7..a655cbbc16096 100644
--- a/llvm/utils/TableGen/Basic/TableGen.cpp
+++ b/llvm/utils/TableGen/Basic/TableGen.cpp
@@ -73,7 +73,7 @@ int tblgen_main(int argc, char **argv) {
   InitLLVM X(argc, argv);
   cl::ParseCommandLineOptions(argc, argv);
 
-  std::function<MultiFileTableGenMainFn> MainFn = nullptr;
+  MultiFileTableGenMainFn MainFn = nullptr;
   return TableGenMain(argv[0], MainFn);
 }
 

From 4c9020ded754707448f2d541c0b5d13a95725384 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Stefan=20Gr=C3=A4nitz?= <stefan.graenitz@gmail.com>
Date: Tue, 18 Nov 2025 13:58:52 +0100
Subject: [PATCH 05/33] [ORC] Fix shlibs build: add Object to
 libLLVMOrcDebugging (#168343)

---
 llvm/lib/ExecutionEngine/Orc/Debugging/CMakeLists.txt           | 1 +
 llvm/lib/ExecutionEngine/Orc/Debugging/ELFDebugObjectPlugin.cpp | 2 +-
 2 files changed, 2 insertions(+), 1 deletion(-)

diff --git a/llvm/lib/ExecutionEngine/Orc/Debugging/CMakeLists.txt b/llvm/lib/ExecutionEngine/Orc/Debugging/CMakeLists.txt
index ab287c7af60be..6be59b0890c44 100644
--- a/llvm/lib/ExecutionEngine/Orc/Debugging/CMakeLists.txt
+++ b/llvm/lib/ExecutionEngine/Orc/Debugging/CMakeLists.txt
@@ -22,6 +22,7 @@ add_llvm_component_library(LLVMOrcDebugging
   BinaryFormat
   DebugInfoDWARF
   JITLink
+  Object
   OrcJIT
   OrcShared
   Support
diff --git a/llvm/lib/ExecutionEngine/Orc/Debugging/ELFDebugObjectPlugin.cpp b/llvm/lib/ExecutionEngine/Orc/Debugging/ELFDebugObjectPlugin.cpp
index 9f556b0d07a8b..653645ff03f15 100644
--- a/llvm/lib/ExecutionEngine/Orc/Debugging/ELFDebugObjectPlugin.cpp
+++ b/llvm/lib/ExecutionEngine/Orc/Debugging/ELFDebugObjectPlugin.cpp
@@ -1,4 +1,4 @@
-//===------- ELFDebugObjectPlugin.cpp - JITLink debug objects ---------===//
+//===--------- ELFDebugObjectPlugin.cpp - JITLink debug objects -----------===//
 //
 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
 // See https://llvm.org/LICENSE.txt for license information.

From 52f4c360e382e6926dccb315d4402af6211e25f0 Mon Sep 17 00:00:00 2001
From: Simon Pilgrim <llvm-dev@redking.me.uk>
Date: Tue, 18 Nov 2025 13:13:35 +0000
Subject: [PATCH 06/33] [X86] combineTruncate - trunc(srl(load(p),amt)) ->
 load(p+amt/8) - ensure amt doesn't depend on original load chain (#168400)

Relax the fix for #165755 / #165850: it doesn't matter if the shift amount depends on the original load's value, only whether it depends on any users of the load's chain.
---
 llvm/lib/Target/X86/X86ISelLowering.cpp      |   7 +-
 llvm/test/CodeGen/X86/bittest-big-integer.ll | 272 +++++++------------
 2 files changed, 103 insertions(+), 176 deletions(-)

diff --git a/llvm/lib/Target/X86/X86ISelLowering.cpp b/llvm/lib/Target/X86/X86ISelLowering.cpp
index 621f1868d3311..864e5dc67682c 100644
--- a/llvm/lib/Target/X86/X86ISelLowering.cpp
+++ b/llvm/lib/Target/X86/X86ISelLowering.cpp
@@ -54688,11 +54688,14 @@ static SDValue combineTruncate(SDNode *N, SelectionDAG &DAG,
       KnownBits KnownAmt = DAG.computeKnownBits(ShAmt);
       // Check the shift amount is byte aligned.
       // Check the truncation doesn't use any shifted in (zero) top bits.
-      // Check the shift amount doesn't depend on the original load.
+      // Check the shift amount doesn't depend on the original load chain.
       if (KnownAmt.countMinTrailingZeros() >= 3 &&
           KnownAmt.getMaxValue().ule(SrcVT.getSizeInBits() -
                                      VT.getSizeInBits()) &&
-          !Ld->isPredecessorOf(ShAmt.getNode())) {
+          none_of(Ld->uses(), [&ShAmt](SDUse &Use) {
+            return Use.getResNo() == 1 &&
+                   Use.getUser()->isPredecessorOf(ShAmt.getNode());
+          })) {
         EVT PtrVT = Ld->getBasePtr().getValueType();
         SDValue PtrBitOfs = DAG.getZExtOrTrunc(ShAmt, DL, PtrVT);
         SDValue PtrByteOfs =
diff --git a/llvm/test/CodeGen/X86/bittest-big-integer.ll b/llvm/test/CodeGen/X86/bittest-big-integer.ll
index b85a20b9d6b6e..023fb5065b892 100644
--- a/llvm/test/CodeGen/X86/bittest-big-integer.ll
+++ b/llvm/test/CodeGen/X86/bittest-big-integer.ll
@@ -1877,85 +1877,56 @@ define i32 @blsr_u512(ptr %word) nounwind {
 ; SSE:       # %bb.0:
 ; SSE-NEXT:    pushq %r15
 ; SSE-NEXT:    pushq %r14
-; SSE-NEXT:    pushq %r12
 ; SSE-NEXT:    pushq %rbx
-; SSE-NEXT:    pushq %rax
-; SSE-NEXT:    movq 56(%rdi), %rcx
-; SSE-NEXT:    movq 48(%rdi), %rdx
-; SSE-NEXT:    movq 40(%rdi), %rsi
-; SSE-NEXT:    movq 32(%rdi), %r11
+; SSE-NEXT:    movq 48(%rdi), %r11
+; SSE-NEXT:    movq 40(%rdi), %r9
 ; SSE-NEXT:    movq 24(%rdi), %r8
-; SSE-NEXT:    movq 16(%rdi), %r9
-; SSE-NEXT:    movq (%rdi), %rax
-; SSE-NEXT:    movq 8(%rdi), %r10
-; SSE-NEXT:    rep bsfq %rax, %rbx
-; SSE-NEXT:    rep bsfq %r10, %r14
-; SSE-NEXT:    addq $64, %r14
-; SSE-NEXT:    testq %rax, %rax
-; SSE-NEXT:    cmovneq %rbx, %r14
-; SSE-NEXT:    rep bsfq %r9, %r15
-; SSE-NEXT:    rep bsfq %r8, %rbx
+; SSE-NEXT:    movq 16(%rdi), %rdx
+; SSE-NEXT:    movq (%rdi), %rcx
+; SSE-NEXT:    movq 8(%rdi), %rsi
+; SSE-NEXT:    rep bsfq %rcx, %rax
+; SSE-NEXT:    rep bsfq %rsi, %rbx
 ; SSE-NEXT:    addq $64, %rbx
-; SSE-NEXT:    testq %r9, %r9
-; SSE-NEXT:    cmovneq %r15, %rbx
-; SSE-NEXT:    subq $-128, %rbx
-; SSE-NEXT:    movq %rax, %r15
-; SSE-NEXT:    movq %rax, %r12
-; SSE-NEXT:    orq %r10, %r12
-; SSE-NEXT:    cmovneq %r14, %rbx
-; SSE-NEXT:    rep bsfq %r11, %r12
-; SSE-NEXT:    rep bsfq %rsi, %r14
-; SSE-NEXT:    addq $64, %r14
-; SSE-NEXT:    testq %r11, %r11
-; SSE-NEXT:    cmovneq %r12, %r14
-; SSE-NEXT:    xorps %xmm0, %xmm0
-; SSE-NEXT:    movaps %xmm0, -{{[0-9]+}}(%rsp)
-; SSE-NEXT:    movaps %xmm0, -{{[0-9]+}}(%rsp)
-; SSE-NEXT:    movaps %xmm0, -{{[0-9]+}}(%rsp)
-; SSE-NEXT:    movq %rax, -{{[0-9]+}}(%rsp)
-; SSE-NEXT:    rep bsfq %rdx, %r12
+; SSE-NEXT:    testq %rcx, %rcx
+; SSE-NEXT:    cmovneq %rax, %rbx
+; SSE-NEXT:    rep bsfq %rdx, %rax
+; SSE-NEXT:    rep bsfq %r8, %r10
+; SSE-NEXT:    addq $64, %r10
+; SSE-NEXT:    testq %rdx, %rdx
+; SSE-NEXT:    cmovneq %rax, %r10
+; SSE-NEXT:    movq 32(%rdi), %r14
+; SSE-NEXT:    subq $-128, %r10
+; SSE-NEXT:    movq %rcx, %rax
+; SSE-NEXT:    orq %rsi, %rax
+; SSE-NEXT:    cmovneq %rbx, %r10
+; SSE-NEXT:    rep bsfq %r14, %rax
+; SSE-NEXT:    rep bsfq %r9, %rbx
+; SSE-NEXT:    addq $64, %rbx
+; SSE-NEXT:    testq %r14, %r14
+; SSE-NEXT:    cmovneq %rax, %rbx
+; SSE-NEXT:    rep bsfq %r11, %r15
 ; SSE-NEXT:    movl $64, %eax
-; SSE-NEXT:    rep bsfq %rcx, %rax
+; SSE-NEXT:    rep bsfq 56(%rdi), %rax
 ; SSE-NEXT:    addq $64, %rax
-; SSE-NEXT:    testq %rdx, %rdx
-; SSE-NEXT:    cmovneq %r12, %rax
+; SSE-NEXT:    testq %r11, %r11
+; SSE-NEXT:    cmovneq %r15, %rax
 ; SSE-NEXT:    subq $-128, %rax
-; SSE-NEXT:    movq %r11, -{{[0-9]+}}(%rsp)
-; SSE-NEXT:    orq %rsi, %r11
-; SSE-NEXT:    cmovneq %r14, %rax
-; SSE-NEXT:    addq $256, %rax # imm = 0x100
-; SSE-NEXT:    movq %r10, -{{[0-9]+}}(%rsp)
-; SSE-NEXT:    orq %r8, %r10
-; SSE-NEXT:    orq %r9, %r15
-; SSE-NEXT:    orq %r10, %r15
+; SSE-NEXT:    orq %r9, %r14
 ; SSE-NEXT:    cmovneq %rbx, %rax
-; SSE-NEXT:    movaps %xmm0, -{{[0-9]+}}(%rsp)
-; SSE-NEXT:    movq %rcx, -{{[0-9]+}}(%rsp)
-; SSE-NEXT:    movq %rdx, -{{[0-9]+}}(%rsp)
-; SSE-NEXT:    movq %rsi, -{{[0-9]+}}(%rsp)
-; SSE-NEXT:    movq %r8, -{{[0-9]+}}(%rsp)
-; SSE-NEXT:    movq %r9, -{{[0-9]+}}(%rsp)
+; SSE-NEXT:    addq $256, %rax # imm = 0x100
+; SSE-NEXT:    orq %r8, %rsi
+; SSE-NEXT:    orq %rdx, %rcx
+; SSE-NEXT:    orq %rsi, %rcx
+; SSE-NEXT:    cmovneq %r10, %rax
+; SSE-NEXT:    movl $-2, %edx
+; SSE-NEXT:    movl %eax, %ecx
+; SSE-NEXT:    roll %cl, %edx
 ; SSE-NEXT:    movl %eax, %ecx
-; SSE-NEXT:    andl $32, %ecx
-; SSE-NEXT:    movl %eax, %edx
-; SSE-NEXT:    andl $480, %edx # imm = 0x1E0
-; SSE-NEXT:    shrl $3, %edx
-; SSE-NEXT:    movl %edx, %esi
-; SSE-NEXT:    andl $-8, %esi
-; SSE-NEXT:    movq -128(%rsp,%rsi), %r8
-; SSE-NEXT:    shrq %cl, %r8
-; SSE-NEXT:    movl -120(%rsp,%rsi), %esi
-; SSE-NEXT:    addl %esi, %esi
-; SSE-NEXT:    notl %ecx
-; SSE-NEXT:    # kill: def $cl killed $cl killed $ecx
-; SSE-NEXT:    shlq %cl, %rsi
-; SSE-NEXT:    orl %r8d, %esi
-; SSE-NEXT:    btrl %eax, %esi
-; SSE-NEXT:    movl %esi, (%rdi,%rdx)
+; SSE-NEXT:    shrl $3, %ecx
+; SSE-NEXT:    andl $60, %ecx
+; SSE-NEXT:    andl %edx, (%rdi,%rcx)
 ; SSE-NEXT:    # kill: def $eax killed $eax killed $rax
-; SSE-NEXT:    addq $8, %rsp
 ; SSE-NEXT:    popq %rbx
-; SSE-NEXT:    popq %r12
 ; SSE-NEXT:    popq %r14
 ; SSE-NEXT:    popq %r15
 ; SSE-NEXT:    retq
@@ -1964,133 +1935,86 @@ define i32 @blsr_u512(ptr %word) nounwind {
 ; AVX2:       # %bb.0:
 ; AVX2-NEXT:    pushq %r15
 ; AVX2-NEXT:    pushq %r14
-; AVX2-NEXT:    pushq %r13
-; AVX2-NEXT:    pushq %r12
 ; AVX2-NEXT:    pushq %rbx
-; AVX2-NEXT:    movq 56(%rdi), %rcx
-; AVX2-NEXT:    movq 40(%rdi), %rdx
-; AVX2-NEXT:    movq 32(%rdi), %r11
-; AVX2-NEXT:    movq 24(%rdi), %rsi
-; AVX2-NEXT:    movq 16(%rdi), %r8
-; AVX2-NEXT:    movq (%rdi), %r9
-; AVX2-NEXT:    movq 8(%rdi), %r10
-; AVX2-NEXT:    xorl %ebx, %ebx
-; AVX2-NEXT:    tzcntq %r9, %rbx
-; AVX2-NEXT:    tzcntq %r10, %rax
-; AVX2-NEXT:    addq $64, %rax
-; AVX2-NEXT:    testq %r9, %r9
-; AVX2-NEXT:    cmovneq %rbx, %rax
-; AVX2-NEXT:    xorl %r14d, %r14d
-; AVX2-NEXT:    tzcntq %r8, %r14
+; AVX2-NEXT:    movq 40(%rdi), %r9
+; AVX2-NEXT:    movq 32(%rdi), %r10
+; AVX2-NEXT:    movq 24(%rdi), %r8
+; AVX2-NEXT:    movq 16(%rdi), %rdx
+; AVX2-NEXT:    movq (%rdi), %rcx
+; AVX2-NEXT:    movq 8(%rdi), %rsi
+; AVX2-NEXT:    tzcntq %rcx, %rax
 ; AVX2-NEXT:    xorl %ebx, %ebx
 ; AVX2-NEXT:    tzcntq %rsi, %rbx
 ; AVX2-NEXT:    addq $64, %rbx
-; AVX2-NEXT:    testq %r8, %r8
-; AVX2-NEXT:    cmovneq %r14, %rbx
-; AVX2-NEXT:    subq $-128, %rbx
-; AVX2-NEXT:    movq %r9, %r14
-; AVX2-NEXT:    movq %r9, %r15
-; AVX2-NEXT:    orq %r10, %r15
+; AVX2-NEXT:    testq %rcx, %rcx
 ; AVX2-NEXT:    cmovneq %rax, %rbx
 ; AVX2-NEXT:    xorl %eax, %eax
-; AVX2-NEXT:    tzcntq %r11, %rax
-; AVX2-NEXT:    xorl %r12d, %r12d
-; AVX2-NEXT:    tzcntq %rdx, %r12
-; AVX2-NEXT:    addq $64, %r12
-; AVX2-NEXT:    testq %r11, %r11
-; AVX2-NEXT:    cmovneq %rax, %r12
-; AVX2-NEXT:    movq 48(%rdi), %r15
-; AVX2-NEXT:    xorl %r13d, %r13d
-; AVX2-NEXT:    tzcntq %r15, %r13
+; AVX2-NEXT:    tzcntq %rdx, %rax
+; AVX2-NEXT:    tzcntq %r8, %r11
+; AVX2-NEXT:    addq $64, %r11
+; AVX2-NEXT:    testq %rdx, %rdx
+; AVX2-NEXT:    cmovneq %rax, %r11
+; AVX2-NEXT:    subq $-128, %r11
+; AVX2-NEXT:    movq %rcx, %rax
+; AVX2-NEXT:    orq %rsi, %rax
+; AVX2-NEXT:    cmovneq %rbx, %r11
 ; AVX2-NEXT:    xorl %eax, %eax
-; AVX2-NEXT:    tzcntq %rcx, %rax
+; AVX2-NEXT:    tzcntq %r10, %rax
+; AVX2-NEXT:    xorl %ebx, %ebx
+; AVX2-NEXT:    tzcntq %r9, %rbx
+; AVX2-NEXT:    addq $64, %rbx
+; AVX2-NEXT:    testq %r10, %r10
+; AVX2-NEXT:    cmovneq %rax, %rbx
+; AVX2-NEXT:    movq 48(%rdi), %r14
+; AVX2-NEXT:    xorl %r15d, %r15d
+; AVX2-NEXT:    tzcntq %r14, %r15
+; AVX2-NEXT:    xorl %eax, %eax
+; AVX2-NEXT:    tzcntq 56(%rdi), %rax
 ; AVX2-NEXT:    addq $64, %rax
-; AVX2-NEXT:    testq %r15, %r15
-; AVX2-NEXT:    cmovneq %r13, %rax
+; AVX2-NEXT:    testq %r14, %r14
+; AVX2-NEXT:    cmovneq %r15, %rax
 ; AVX2-NEXT:    subq $-128, %rax
-; AVX2-NEXT:    vxorps %xmm0, %xmm0, %xmm0
-; AVX2-NEXT:    vmovups %ymm0, -{{[0-9]+}}(%rsp)
-; AVX2-NEXT:    movq %r11, -{{[0-9]+}}(%rsp)
-; AVX2-NEXT:    orq %rdx, %r11
-; AVX2-NEXT:    cmovneq %r12, %rax
-; AVX2-NEXT:    addq $256, %rax # imm = 0x100
-; AVX2-NEXT:    movq %r10, -{{[0-9]+}}(%rsp)
-; AVX2-NEXT:    orq %rsi, %r10
-; AVX2-NEXT:    orq %r8, %r14
-; AVX2-NEXT:    orq %r10, %r14
+; AVX2-NEXT:    orq %r9, %r10
 ; AVX2-NEXT:    cmovneq %rbx, %rax
-; AVX2-NEXT:    vmovups %ymm0, -{{[0-9]+}}(%rsp)
-; AVX2-NEXT:    movq %r9, -{{[0-9]+}}(%rsp)
-; AVX2-NEXT:    movq %rcx, -{{[0-9]+}}(%rsp)
-; AVX2-NEXT:    movq %r15, -{{[0-9]+}}(%rsp)
-; AVX2-NEXT:    movq %rdx, -{{[0-9]+}}(%rsp)
-; AVX2-NEXT:    movq %rsi, -{{[0-9]+}}(%rsp)
-; AVX2-NEXT:    movq %r8, -{{[0-9]+}}(%rsp)
+; AVX2-NEXT:    addq $256, %rax # imm = 0x100
+; AVX2-NEXT:    orq %r8, %rsi
+; AVX2-NEXT:    orq %rdx, %rcx
+; AVX2-NEXT:    orq %rsi, %rcx
+; AVX2-NEXT:    cmovneq %r11, %rax
+; AVX2-NEXT:    movl $-2, %edx
+; AVX2-NEXT:    movl %eax, %ecx
+; AVX2-NEXT:    roll %cl, %edx
 ; AVX2-NEXT:    movl %eax, %ecx
-; AVX2-NEXT:    andl $32, %ecx
-; AVX2-NEXT:    movl %eax, %edx
-; AVX2-NEXT:    andl $480, %edx # imm = 0x1E0
-; AVX2-NEXT:    shrl $3, %edx
-; AVX2-NEXT:    movl %edx, %esi
-; AVX2-NEXT:    andl $-8, %esi
-; AVX2-NEXT:    shrxq %rcx, -128(%rsp,%rsi), %r8
-; AVX2-NEXT:    notl %ecx
-; AVX2-NEXT:    movl -120(%rsp,%rsi), %esi
-; AVX2-NEXT:    addl %esi, %esi
-; AVX2-NEXT:    shlxq %rcx, %rsi, %rcx
-; AVX2-NEXT:    orl %r8d, %ecx
-; AVX2-NEXT:    btrl %eax, %ecx
-; AVX2-NEXT:    movl %ecx, (%rdi,%rdx)
+; AVX2-NEXT:    shrl $3, %ecx
+; AVX2-NEXT:    andl $60, %ecx
+; AVX2-NEXT:    andl %edx, (%rdi,%rcx)
 ; AVX2-NEXT:    # kill: def $eax killed $eax killed $rax
 ; AVX2-NEXT:    popq %rbx
-; AVX2-NEXT:    popq %r12
-; AVX2-NEXT:    popq %r13
 ; AVX2-NEXT:    popq %r14
 ; AVX2-NEXT:    popq %r15
-; AVX2-NEXT:    vzeroupper
 ; AVX2-NEXT:    retq
 ;
 ; AVX512-LABEL: blsr_u512:
 ; AVX512:       # %bb.0:
-; AVX512-NEXT:    pushq %rax
-; AVX512-NEXT:    vmovups (%rdi), %ymm0
-; AVX512-NEXT:    vmovups 32(%rdi), %ymm1
-; AVX512-NEXT:    vmovdqu64 (%rdi), %zmm2
-; AVX512-NEXT:    vpternlogd {{.*#+}} zmm3 = -1
-; AVX512-NEXT:    vpaddq %zmm3, %zmm2, %zmm3
-; AVX512-NEXT:    vpandnq %zmm3, %zmm2, %zmm3
-; AVX512-NEXT:    vplzcntq %zmm3, %zmm3
-; AVX512-NEXT:    vmovdqa64 {{.*#+}} zmm4 = [64,128,192,256,320,384,448,512]
-; AVX512-NEXT:    vpsubq %zmm3, %zmm4, %zmm3
-; AVX512-NEXT:    vptestmq %zmm2, %zmm2, %k1
-; AVX512-NEXT:    vpbroadcastq {{.*#+}} zmm2 = [512,512,512,512,512,512,512,512]
-; AVX512-NEXT:    vpcompressq %zmm3, %zmm2 {%k1}
-; AVX512-NEXT:    vmovq %xmm2, %rax
-; AVX512-NEXT:    vpxor %xmm2, %xmm2, %xmm2
-; AVX512-NEXT:    vmovdqu %ymm2, -{{[0-9]+}}(%rsp)
-; AVX512-NEXT:    vmovdqu %ymm2, -{{[0-9]+}}(%rsp)
-; AVX512-NEXT:    vmovups %ymm1, -{{[0-9]+}}(%rsp)
-; AVX512-NEXT:    vmovups %ymm0, -{{[0-9]+}}(%rsp)
+; AVX512-NEXT:    vmovdqu64 (%rdi), %zmm0
+; AVX512-NEXT:    vpternlogd {{.*#+}} zmm1 = -1
+; AVX512-NEXT:    vpaddq %zmm1, %zmm0, %zmm1
+; AVX512-NEXT:    vpandnq %zmm1, %zmm0, %zmm1
+; AVX512-NEXT:    vplzcntq %zmm1, %zmm1
+; AVX512-NEXT:    vmovdqa64 {{.*#+}} zmm2 = [64,128,192,256,320,384,448,512]
+; AVX512-NEXT:    vpsubq %zmm1, %zmm2, %zmm1
+; AVX512-NEXT:    vptestmq %zmm0, %zmm0, %k1
+; AVX512-NEXT:    vpbroadcastq {{.*#+}} zmm0 = [512,512,512,512,512,512,512,512]
+; AVX512-NEXT:    vpcompressq %zmm1, %zmm0 {%k1}
+; AVX512-NEXT:    vmovq %xmm0, %rax
+; AVX512-NEXT:    movl $-2, %edx
+; AVX512-NEXT:    movl %eax, %ecx
+; AVX512-NEXT:    roll %cl, %edx
 ; AVX512-NEXT:    movl %eax, %ecx
-; AVX512-NEXT:    andl $32, %ecx
-; AVX512-NEXT:    movl %ecx, %edx
-; AVX512-NEXT:    notl %edx
-; AVX512-NEXT:    movl %eax, %esi
-; AVX512-NEXT:    shrl $3, %esi
-; AVX512-NEXT:    movl %esi, %r8d
-; AVX512-NEXT:    andl $56, %r8d
-; AVX512-NEXT:    movl -120(%rsp,%r8), %r9d
-; AVX512-NEXT:    addl %r9d, %r9d
-; AVX512-NEXT:    shlxq %rdx, %r9, %rdx
 ; AVX512-NEXT:    shrl $3, %ecx
-; AVX512-NEXT:    addq %rsp, %r8
-; AVX512-NEXT:    addq $-128, %r8
-; AVX512-NEXT:    orl (%rcx,%r8), %edx
-; AVX512-NEXT:    btrl %eax, %edx
-; AVX512-NEXT:    andl $60, %esi
-; AVX512-NEXT:    movl %edx, (%rdi,%rsi)
+; AVX512-NEXT:    andl $60, %ecx
+; AVX512-NEXT:    andl %edx, (%rdi,%rcx)
 ; AVX512-NEXT:    # kill: def $eax killed $eax killed $rax
-; AVX512-NEXT:    popq %rcx
 ; AVX512-NEXT:    vzeroupper
 ; AVX512-NEXT:    retq
   %ld = load i512, ptr %word

From 3d5d32c6058807008e579dd5ea2faced33a7943b Mon Sep 17 00:00:00 2001
From: Hassnaa Hamdi <hassnaa.hamdi@arm.com>
Date: Tue, 18 Nov 2025 13:15:47 +0000
Subject: [PATCH 07/33] [CGP]: Optimize mul.overflow. (#148343)

- Detect cases where LHS & RHS values will not cause overflow
(when the high halves are zero).
---
 llvm/include/llvm/CodeGen/TargetLowering.h    |   7 +
 llvm/lib/CodeGen/CodeGenPrepare.cpp           | 182 ++++++++++++
 .../Target/AArch64/AArch64ISelLowering.cpp    |   9 +
 llvm/lib/Target/AArch64/AArch64ISelLowering.h |   5 +
 llvm/test/CodeGen/AArch64/i128-math.ll        | 189 ++++++++-----
 .../CodeGen/AArch64/i128_with_overflow.ll     |  93 ++++---
 .../test/CodeGen/AArch64/mul-i128-overflow.ll | 261 ++++++++++++++++++
 .../umulo-128-legalisation-lowering.ll        |  97 ++++---
 8 files changed, 699 insertions(+), 144 deletions(-)
 create mode 100644 llvm/test/CodeGen/AArch64/mul-i128-overflow.ll

diff --git a/llvm/include/llvm/CodeGen/TargetLowering.h b/llvm/include/llvm/CodeGen/TargetLowering.h
index cec7d09f494d6..4c932c523e423 100644
--- a/llvm/include/llvm/CodeGen/TargetLowering.h
+++ b/llvm/include/llvm/CodeGen/TargetLowering.h
@@ -3492,6 +3492,13 @@ class LLVM_ABI TargetLoweringBase {
     return MathUsed && (VT.isSimple() || !isOperationExpand(Opcode, VT));
   }
 
+  // Return true if the target wants to optimize the mul overflow intrinsic
+  // for the given \p VT. The default implementation opts out (returns false).
+  virtual bool shouldOptimizeMulOverflowWithZeroHighBits(LLVMContext &Context,
+                                                         EVT VT) const {
+    return false;
+  }
+
   // Return true if it is profitable to use a scalar input to a BUILD_VECTOR
   // even if the vector itself has multiple uses.
   virtual bool aggressivelyPreferBuildVectorSources(EVT VecVT) const {
diff --git a/llvm/lib/CodeGen/CodeGenPrepare.cpp b/llvm/lib/CodeGen/CodeGenPrepare.cpp
index b6dd174f9be80..587c1372b19cb 100644
--- a/llvm/lib/CodeGen/CodeGenPrepare.cpp
+++ b/llvm/lib/CodeGen/CodeGenPrepare.cpp
@@ -431,6 +431,8 @@ class CodeGenPrepare {
   bool optimizeMemoryInst(Instruction *MemoryInst, Value *Addr, Type *AccessTy,
                           unsigned AddrSpace);
   bool optimizeGatherScatterInst(Instruction *MemoryInst, Value *Ptr);
+  bool optimizeMulWithOverflow(Instruction *I, bool IsSigned,
+                               ModifyDT &ModifiedDT);
   bool optimizeInlineAsmInst(CallInst *CS);
   bool optimizeCallInst(CallInst *CI, ModifyDT &ModifiedDT);
   bool optimizeExt(Instruction *&I);
@@ -2797,6 +2799,10 @@ bool CodeGenPrepare::optimizeCallInst(CallInst *CI, ModifyDT &ModifiedDT) {
         }
       }
       return false;
+    case Intrinsic::umul_with_overflow:
+      return optimizeMulWithOverflow(II, /*IsSigned=*/false, ModifiedDT);
+    case Intrinsic::smul_with_overflow:
+      return optimizeMulWithOverflow(II, /*IsSigned=*/true, ModifiedDT);
     }
 
     SmallVector<Value *, 2> PtrOps;
@@ -6391,6 +6397,182 @@ bool CodeGenPrepare::optimizeGatherScatterInst(Instruction *MemoryInst,
   return true;
 }
 
+// Helper for CodeGenPrepare::optimizeMulWithOverflow: match \p I having at
+// most 2 users, one extractvalue per result index. The out-params must be
+// null on entry; a repeated index is rejected so no extract is left behind.
+static bool matchOverflowPattern(Instruction *&I, ExtractValueInst *&MulExtract,
+                                 ExtractValueInst *&OverflowExtract) {
+  // Bail out if it's more than 2 users:
+  if (I->hasNUsesOrMore(3))
+    return false;
+
+  for (User *U : I->users()) {
+    auto *Extract = dyn_cast<ExtractValueInst>(U);
+    if (!Extract || Extract->getNumIndices() != 1)
+      return false;
+
+    unsigned Index = Extract->getIndices()[0];
+    if (Index == 0 && !MulExtract)
+      MulExtract = Extract;
+    else if (Index == 1 && !OverflowExtract)
+      OverflowExtract = Extract;
+    else
+      return false;
+  }
+  return true;
+}
+
+// Rewrite the mul_with_overflow intrinsic by checking at run time whether both
+// operands fit in the half-width (legal) type. If so, a plain multiply of the
+// extended low halves is used instead. This would ideally be done during type
+// legalization, but the control flow has to be rebuilt in IR, which is not
+// doable there, so we do it here.
+// The IR after the optimization will look like:
+// entry:
+//   if signed:
+//     ( (lhs_lo>>BW-1) ^ lhs_hi) || ( (rhs_lo>>BW-1) ^ rhs_hi) ? overflow,
+//     overflow.no
+//   else:
+//     (lhs_hi != 0) || (rhs_hi != 0) ? overflow, overflow.no
+// overflow.no:  mul of the sign/zero-extended low halves, overflow = false
+// overflow:     the original intrinsic and its two extracts
+// overflow.res: PHIs merging the result and the overflow flag
+// \returns true if optimization was applied.
+// TODO: This optimization can be further improved to optimize branching on
+// overflow where the 'overflow.no' BB can branch directly to the false
+// successor of overflow, but that would add additional complexity so we leave
+// it for future work.
+bool CodeGenPrepare::optimizeMulWithOverflow(Instruction *I, bool IsSigned,
+                                             ModifyDT &ModifiedDT) {
+  // Check if target supports this optimization.
+  if (!TLI->shouldOptimizeMulOverflowWithZeroHighBits(
+          I->getContext(),
+          TLI->getValueType(*DL, I->getType()->getContainedType(0))))
+    return false;
+
+  ExtractValueInst *MulExtract = nullptr, *OverflowExtract = nullptr;
+  if (!matchOverflowPattern(I, MulExtract, OverflowExtract))
+    return false;
+
+  // Record the instruction so that it is not optimized again.
+  InsertedInsts.insert(I);
+
+  Value *LHS = I->getOperand(0);
+  Value *RHS = I->getOperand(1);
+  Type *Ty = LHS->getType();
+  unsigned VTHalfBitWidth = Ty->getScalarSizeInBits() / 2;
+  Type *LegalTy = Ty->getWithNewBitWidth(VTHalfBitWidth);
+
+  // New BBs (the block is split just before the intrinsic):
+  BasicBlock *OverflowEntryBB =
+      I->getParent()->splitBasicBlock(I, "", /*Before*/ true);
+  OverflowEntryBB->takeName(I->getParent());
+  // Keep the 'br' instruction that is generated as a result of the split to be
+  // erased/replaced later.
+  Instruction *OldTerminator = OverflowEntryBB->getTerminator();
+  BasicBlock *NoOverflowBB =
+      BasicBlock::Create(I->getContext(), "overflow.no", I->getFunction());
+  NoOverflowBB->moveAfter(OverflowEntryBB);
+  BasicBlock *OverflowBB =
+      BasicBlock::Create(I->getContext(), "overflow", I->getFunction());
+  OverflowBB->moveAfter(NoOverflowBB);
+
+  // Entry BB (it keeps the original block's name):
+  IRBuilder<> Builder(OverflowEntryBB);
+  // Extract low and high halves of LHS:
+  Value *LoLHS = Builder.CreateTrunc(LHS, LegalTy, "lo.lhs");
+  Value *HiLHS = Builder.CreateLShr(LHS, VTHalfBitWidth, "lhs.lsr");
+  HiLHS = Builder.CreateTrunc(HiLHS, LegalTy, "hi.lhs");
+
+  // Extract low and high halves of RHS:
+  Value *LoRHS = Builder.CreateTrunc(RHS, LegalTy, "lo.rhs");
+  Value *HiRHS = Builder.CreateLShr(RHS, VTHalfBitWidth, "rhs.lsr");
+  HiRHS = Builder.CreateTrunc(HiRHS, LegalTy, "hi.rhs");
+
+  Value *IsAnyBitTrue;
+  if (IsSigned) {
+    Value *SignLoLHS =
+        Builder.CreateAShr(LoLHS, VTHalfBitWidth - 1, "sign.lo.lhs");
+    Value *SignLoRHS =
+        Builder.CreateAShr(LoRHS, VTHalfBitWidth - 1, "sign.lo.rhs");
+    Value *XorLHS = Builder.CreateXor(HiLHS, SignLoLHS);
+    Value *XorRHS = Builder.CreateXor(HiRHS, SignLoRHS);
+    Value *Or = Builder.CreateOr(XorLHS, XorRHS, "or.lhs.rhs");
+    IsAnyBitTrue = Builder.CreateCmp(ICmpInst::ICMP_NE, Or,
+                                     ConstantInt::getNullValue(Or->getType()));
+  } else {
+    Value *CmpLHS = Builder.CreateCmp(ICmpInst::ICMP_NE, HiLHS,
+                                      ConstantInt::getNullValue(LegalTy));
+    Value *CmpRHS = Builder.CreateCmp(ICmpInst::ICMP_NE, HiRHS,
+                                      ConstantInt::getNullValue(LegalTy));
+    IsAnyBitTrue = Builder.CreateOr(CmpLHS, CmpRHS, "or.lhs.rhs");
+  }
+  Builder.CreateCondBr(IsAnyBitTrue, OverflowBB, NoOverflowBB);
+
+  // BB overflow.no:
+  Builder.SetInsertPoint(NoOverflowBB);
+  Value *ExtLoLHS, *ExtLoRHS;
+  if (IsSigned) {
+    ExtLoLHS = Builder.CreateSExt(LoLHS, Ty, "lo.lhs.ext");
+    ExtLoRHS = Builder.CreateSExt(LoRHS, Ty, "lo.rhs.ext");
+  } else {
+    ExtLoLHS = Builder.CreateZExt(LoLHS, Ty, "lo.lhs.ext");
+    ExtLoRHS = Builder.CreateZExt(LoRHS, Ty, "lo.rhs.ext");
+  }
+
+  Value *Mul = Builder.CreateMul(ExtLoLHS, ExtLoRHS, "mul.overflow.no");
+
+  // Create the 'overflow.res' BB to merge the results of
+  // the two paths:
+  BasicBlock *OverflowResBB = I->getParent();
+  OverflowResBB->setName("overflow.res");
+
+  // BB overflow.no: jump to overflow.res BB
+  Builder.CreateBr(OverflowResBB);
+  // Now we don't need the old terminator in the entry BB, erase it:
+  OldTerminator->eraseFromParent();
+
+  // BB overflow.res:
+  Builder.SetInsertPoint(OverflowResBB, OverflowResBB->getFirstInsertionPt());
+  // Create PHI nodes to merge results from overflow.no BB and overflow BB to
+  // replace the extract instructions.
+  PHINode *OverflowResPHI = Builder.CreatePHI(Ty, 2),
+          *OverflowFlagPHI =
+              Builder.CreatePHI(IntegerType::getInt1Ty(I->getContext()), 2);
+
+  // Add the incoming values from overflow.no BB and later from overflow BB.
+  OverflowResPHI->addIncoming(Mul, NoOverflowBB);
+  OverflowFlagPHI->addIncoming(ConstantInt::getFalse(I->getContext()),
+                               NoOverflowBB);
+
+  // Replace all users of MulExtract and OverflowExtract to use the PHI nodes.
+  if (MulExtract) {
+    MulExtract->replaceAllUsesWith(OverflowResPHI);
+    MulExtract->eraseFromParent();
+  }
+  if (OverflowExtract) {
+    OverflowExtract->replaceAllUsesWith(OverflowFlagPHI);
+    OverflowExtract->eraseFromParent();
+  }
+
+  // Remove the intrinsic from parent (overflow.res BB) as it will be part of
+  // overflow BB.
+  I->removeFromParent();
+  // BB overflow:
+  I->insertInto(OverflowBB, OverflowBB->end());
+  Builder.SetInsertPoint(OverflowBB, OverflowBB->end());
+  Value *MulOverflow = Builder.CreateExtractValue(I, {0}, "mul.overflow");
+  Value *OverflowFlag = Builder.CreateExtractValue(I, {1}, "overflow.flag");
+  Builder.CreateBr(OverflowResBB);
+
+  // Add the extracted values to the PHI nodes in the overflow.res BB.
+  OverflowResPHI->addIncoming(MulOverflow, OverflowBB);
+  OverflowFlagPHI->addIncoming(OverflowFlag, OverflowBB);
+
+  ModifiedDT = ModifyDT::ModifyBBDT;
+  return true;
+}
+
 /// If there are any memory operands, use OptimizeMemoryInst to sink their
 /// address computing into the block when possible / profitable.
 bool CodeGenPrepare::optimizeInlineAsmInst(CallInst *CS) {
diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
index 42567883b2594..d21e19b2ecd46 100644
--- a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
+++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
@@ -18851,6 +18851,15 @@ bool AArch64TargetLowering::isExtractSubvectorCheap(EVT ResVT, EVT SrcVT,
   return (Index == 0 || Index == ResVT.getVectorMinNumElements());
 }
 
+bool AArch64TargetLowering::shouldOptimizeMulOverflowWithZeroHighBits(
+    LLVMContext &Context, EVT VT) const {
+  if (getTypeAction(Context, VT) != TypeExpandInteger)
+    return false; // Only types legalized via TypeExpandInteger qualify.
+
+  EVT LegalTy = EVT::getIntegerVT(Context, VT.getSizeInBits() / 2);
+  return getTypeAction(Context, LegalTy) == TargetLowering::TypeLegal;
+}
+
 /// Turn vector tests of the signbit in the form of:
 ///   xor (sra X, elt_size(X)-1), -1
 /// into:
diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.h b/llvm/lib/Target/AArch64/AArch64ISelLowering.h
index 70bfae717fb76..be198e54cbcbf 100644
--- a/llvm/lib/Target/AArch64/AArch64ISelLowering.h
+++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.h
@@ -333,6 +333,11 @@ class AArch64TargetLowering : public TargetLowering {
     return TargetLowering::shouldFormOverflowOp(Opcode, VT, true);
   }
 
+  // Return true if the mul overflow intrinsic for \p VT should be optimized:
+  // \p VT must be expanded by legalization and its half-width type legal.
+  bool shouldOptimizeMulOverflowWithZeroHighBits(LLVMContext &Context,
+                                                 EVT VT) const override;
+
   Value *emitLoadLinked(IRBuilderBase &Builder, Type *ValueTy, Value *Addr,
                         AtomicOrdering Ord) const override;
   Value *emitStoreConditional(IRBuilderBase &Builder, Value *Val, Value *Addr,
diff --git a/llvm/test/CodeGen/AArch64/i128-math.ll b/llvm/test/CodeGen/AArch64/i128-math.ll
index 9e1c0c1b115ab..12ae241dda4bd 100644
--- a/llvm/test/CodeGen/AArch64/i128-math.ll
+++ b/llvm/test/CodeGen/AArch64/i128-math.ll
@@ -262,20 +262,28 @@ define i128 @u128_mul(i128 %x, i128 %y) {
 define { i128, i8 } @u128_checked_mul(i128 %x, i128 %y) {
 ; CHECK-LABEL: u128_checked_mul:
 ; CHECK:       // %bb.0:
+; CHECK-NEXT:    orr x8, x1, x3
+; CHECK-NEXT:    cbz x8, .LBB17_2
+; CHECK-NEXT:  // %bb.1: // %overflow
 ; CHECK-NEXT:    mul x9, x3, x0
 ; CHECK-NEXT:    cmp x1, #0
 ; CHECK-NEXT:    ccmp x3, #0, #4, ne
-; CHECK-NEXT:    umulh x8, x1, x2
-; CHECK-NEXT:    umulh x10, x3, x0
+; CHECK-NEXT:    umulh x10, x1, x2
+; CHECK-NEXT:    umulh x8, x3, x0
 ; CHECK-NEXT:    madd x9, x1, x2, x9
-; CHECK-NEXT:    ccmp xzr, x8, #0, eq
-; CHECK-NEXT:    umulh x11, x0, x2
 ; CHECK-NEXT:    ccmp xzr, x10, #0, eq
+; CHECK-NEXT:    umulh x11, x0, x2
+; CHECK-NEXT:    ccmp xzr, x8, #0, eq
 ; CHECK-NEXT:    mul x0, x0, x2
 ; CHECK-NEXT:    cset w8, ne
 ; CHECK-NEXT:    adds x1, x11, x9
 ; CHECK-NEXT:    csinc w8, w8, wzr, lo
 ; CHECK-NEXT:    eor w2, w8, #0x1
+; CHECK-NEXT:    ret
+; CHECK-NEXT:  .LBB17_2: // %overflow.no
+; CHECK-NEXT:    umulh x1, x0, x2
+; CHECK-NEXT:    mul x0, x0, x2
+; CHECK-NEXT:    eor w2, w8, #0x1
 ; CHECK-NEXT:    ret
   %1 = tail call { i128, i1 } @llvm.umul.with.overflow.i128(i128 %x, i128 %y)
   %2 = extractvalue { i128, i1 } %1, 0
@@ -290,19 +298,27 @@ define { i128, i8 } @u128_checked_mul(i128 %x, i128 %y) {
 define { i128, i8 } @u128_overflowing_mul(i128 %x, i128 %y) {
 ; CHECK-LABEL: u128_overflowing_mul:
 ; CHECK:       // %bb.0:
+; CHECK-NEXT:    orr x8, x1, x3
+; CHECK-NEXT:    cbz x8, .LBB18_2
+; CHECK-NEXT:  // %bb.1: // %overflow
 ; CHECK-NEXT:    mul x9, x3, x0
 ; CHECK-NEXT:    cmp x1, #0
 ; CHECK-NEXT:    ccmp x3, #0, #4, ne
-; CHECK-NEXT:    umulh x8, x1, x2
-; CHECK-NEXT:    umulh x10, x3, x0
+; CHECK-NEXT:    umulh x10, x1, x2
+; CHECK-NEXT:    umulh x8, x3, x0
 ; CHECK-NEXT:    madd x9, x1, x2, x9
-; CHECK-NEXT:    ccmp xzr, x8, #0, eq
-; CHECK-NEXT:    umulh x11, x0, x2
 ; CHECK-NEXT:    ccmp xzr, x10, #0, eq
+; CHECK-NEXT:    umulh x11, x0, x2
+; CHECK-NEXT:    ccmp xzr, x8, #0, eq
 ; CHECK-NEXT:    mul x0, x0, x2
 ; CHECK-NEXT:    cset w8, ne
 ; CHECK-NEXT:    adds x1, x11, x9
 ; CHECK-NEXT:    csinc w2, w8, wzr, lo
+; CHECK-NEXT:    ret
+; CHECK-NEXT:  .LBB18_2: // %overflow.no
+; CHECK-NEXT:    umulh x1, x0, x2
+; CHECK-NEXT:    mul x0, x0, x2
+; CHECK-NEXT:    mov w2, wzr
 ; CHECK-NEXT:    ret
   %1 = tail call { i128, i1 } @llvm.umul.with.overflow.i128(i128 %x, i128 %y)
   %2 = extractvalue { i128, i1 } %1, 0
@@ -316,19 +332,28 @@ define { i128, i8 } @u128_overflowing_mul(i128 %x, i128 %y) {
 define i128 @u128_saturating_mul(i128 %x, i128 %y) {
 ; CHECK-LABEL: u128_saturating_mul:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    mul x9, x3, x0
+; CHECK-NEXT:    orr x8, x1, x3
+; CHECK-NEXT:    cbz x8, .LBB19_2
+; CHECK-NEXT:  // %bb.1: // %overflow
+; CHECK-NEXT:    mul x8, x3, x0
 ; CHECK-NEXT:    cmp x1, #0
 ; CHECK-NEXT:    ccmp x3, #0, #4, ne
-; CHECK-NEXT:    umulh x8, x1, x2
-; CHECK-NEXT:    umulh x10, x3, x0
-; CHECK-NEXT:    madd x9, x1, x2, x9
-; CHECK-NEXT:    ccmp xzr, x8, #0, eq
-; CHECK-NEXT:    umulh x11, x0, x2
+; CHECK-NEXT:    umulh x10, x1, x2
+; CHECK-NEXT:    umulh x9, x3, x0
+; CHECK-NEXT:    madd x11, x1, x2, x8
 ; CHECK-NEXT:    ccmp xzr, x10, #0, eq
+; CHECK-NEXT:    umulh x12, x0, x2
+; CHECK-NEXT:    ccmp xzr, x9, #0, eq
 ; CHECK-NEXT:    mul x8, x0, x2
 ; CHECK-NEXT:    cset w10, ne
-; CHECK-NEXT:    adds x9, x11, x9
+; CHECK-NEXT:    adds x9, x12, x11
 ; CHECK-NEXT:    csinc w10, w10, wzr, lo
+; CHECK-NEXT:    b .LBB19_3
+; CHECK-NEXT:  .LBB19_2: // %overflow.no
+; CHECK-NEXT:    umulh x9, x0, x2
+; CHECK-NEXT:    mov w10, wzr
+; CHECK-NEXT:    mul x8, x0, x2
+; CHECK-NEXT:  .LBB19_3: // %overflow.res
 ; CHECK-NEXT:    cmp w10, #0
 ; CHECK-NEXT:    csinv x0, x8, xzr, eq
 ; CHECK-NEXT:    csinv x1, x9, xzr, eq
@@ -355,6 +380,11 @@ define i128 @i128_mul(i128 %x, i128 %y) {
 define { i128, i8 } @i128_checked_mul(i128 %x, i128 %y) {
 ; CHECK-LABEL: i128_checked_mul:
 ; CHECK:       // %bb.0:
+; CHECK-NEXT:    eor x8, x3, x2, asr #63
+; CHECK-NEXT:    eor x9, x1, x0, asr #63
+; CHECK-NEXT:    orr x8, x9, x8
+; CHECK-NEXT:    cbz x8, .LBB21_2
+; CHECK-NEXT:  // %bb.1: // %overflow
 ; CHECK-NEXT:    asr x9, x1, #63
 ; CHECK-NEXT:    umulh x10, x0, x2
 ; CHECK-NEXT:    asr x13, x3, #63
@@ -364,24 +394,30 @@ define { i128, i8 } @i128_checked_mul(i128 %x, i128 %y) {
 ; CHECK-NEXT:    adds x10, x11, x10
 ; CHECK-NEXT:    mul x14, x0, x3
 ; CHECK-NEXT:    umulh x12, x0, x3
-; CHECK-NEXT:    adc x9, x8, x9
+; CHECK-NEXT:    adc x8, x8, x9
+; CHECK-NEXT:    mov x9, x1
 ; CHECK-NEXT:    mul x13, x0, x13
-; CHECK-NEXT:    adds x8, x14, x10
-; CHECK-NEXT:    mul x15, x1, x3
-; CHECK-NEXT:    smulh x10, x1, x3
-; CHECK-NEXT:    mov x1, x8
-; CHECK-NEXT:    adc x11, x12, x13
-; CHECK-NEXT:    asr x12, x9, #63
-; CHECK-NEXT:    asr x13, x11, #63
-; CHECK-NEXT:    adds x9, x9, x11
 ; CHECK-NEXT:    asr x11, x8, #63
+; CHECK-NEXT:    mul x15, x1, x3
+; CHECK-NEXT:    adds x1, x14, x10
+; CHECK-NEXT:    smulh x9, x9, x3
+; CHECK-NEXT:    adc x10, x12, x13
+; CHECK-NEXT:    asr x12, x10, #63
+; CHECK-NEXT:    adds x8, x8, x10
+; CHECK-NEXT:    asr x10, x1, #63
 ; CHECK-NEXT:    mul x0, x0, x2
-; CHECK-NEXT:    adc x12, x12, x13
-; CHECK-NEXT:    adds x9, x15, x9
-; CHECK-NEXT:    adc x10, x10, x12
-; CHECK-NEXT:    cmp x9, x11
-; CHECK-NEXT:    ccmp x10, x11, #0, eq
-; CHECK-NEXT:    cset w2, eq
+; CHECK-NEXT:    adc x11, x11, x12
+; CHECK-NEXT:    adds x8, x15, x8
+; CHECK-NEXT:    adc x9, x9, x11
+; CHECK-NEXT:    cmp x8, x10
+; CHECK-NEXT:    ccmp x9, x10, #0, eq
+; CHECK-NEXT:    cset w8, ne
+; CHECK-NEXT:    eor w2, w8, #0x1
+; CHECK-NEXT:    ret
+; CHECK-NEXT:  .LBB21_2: // %overflow.no
+; CHECK-NEXT:    smulh x1, x0, x2
+; CHECK-NEXT:    mul x0, x0, x2
+; CHECK-NEXT:    eor w2, w8, #0x1
 ; CHECK-NEXT:    ret
   %1 = tail call { i128, i1 } @llvm.smul.with.overflow.i128(i128 %x, i128 %y)
   %2 = extractvalue { i128, i1 } %1, 0
@@ -396,6 +432,11 @@ define { i128, i8 } @i128_checked_mul(i128 %x, i128 %y) {
 define { i128, i8 } @i128_overflowing_mul(i128 %x, i128 %y) {
 ; CHECK-LABEL: i128_overflowing_mul:
 ; CHECK:       // %bb.0:
+; CHECK-NEXT:    eor x8, x3, x2, asr #63
+; CHECK-NEXT:    eor x9, x1, x0, asr #63
+; CHECK-NEXT:    orr x8, x9, x8
+; CHECK-NEXT:    cbz x8, .LBB22_2
+; CHECK-NEXT:  // %bb.1: // %overflow
 ; CHECK-NEXT:    asr x9, x1, #63
 ; CHECK-NEXT:    umulh x10, x0, x2
 ; CHECK-NEXT:    asr x13, x3, #63
@@ -405,24 +446,29 @@ define { i128, i8 } @i128_overflowing_mul(i128 %x, i128 %y) {
 ; CHECK-NEXT:    adds x10, x11, x10
 ; CHECK-NEXT:    mul x14, x0, x3
 ; CHECK-NEXT:    umulh x12, x0, x3
-; CHECK-NEXT:    adc x9, x8, x9
+; CHECK-NEXT:    adc x8, x8, x9
+; CHECK-NEXT:    mov x9, x1
 ; CHECK-NEXT:    mul x13, x0, x13
-; CHECK-NEXT:    adds x8, x14, x10
-; CHECK-NEXT:    mul x15, x1, x3
-; CHECK-NEXT:    smulh x10, x1, x3
-; CHECK-NEXT:    mov x1, x8
-; CHECK-NEXT:    adc x11, x12, x13
-; CHECK-NEXT:    asr x12, x9, #63
-; CHECK-NEXT:    asr x13, x11, #63
-; CHECK-NEXT:    adds x9, x9, x11
 ; CHECK-NEXT:    asr x11, x8, #63
+; CHECK-NEXT:    mul x15, x1, x3
+; CHECK-NEXT:    adds x1, x14, x10
+; CHECK-NEXT:    smulh x9, x9, x3
+; CHECK-NEXT:    adc x10, x12, x13
+; CHECK-NEXT:    asr x12, x10, #63
+; CHECK-NEXT:    adds x8, x8, x10
+; CHECK-NEXT:    asr x10, x1, #63
 ; CHECK-NEXT:    mul x0, x0, x2
-; CHECK-NEXT:    adc x12, x12, x13
-; CHECK-NEXT:    adds x9, x15, x9
-; CHECK-NEXT:    adc x10, x10, x12
-; CHECK-NEXT:    cmp x9, x11
-; CHECK-NEXT:    ccmp x10, x11, #0, eq
+; CHECK-NEXT:    adc x11, x11, x12
+; CHECK-NEXT:    adds x8, x15, x8
+; CHECK-NEXT:    adc x9, x9, x11
+; CHECK-NEXT:    cmp x8, x10
+; CHECK-NEXT:    ccmp x9, x10, #0, eq
 ; CHECK-NEXT:    cset w2, ne
+; CHECK-NEXT:    ret
+; CHECK-NEXT:  .LBB22_2: // %overflow.no
+; CHECK-NEXT:    smulh x1, x0, x2
+; CHECK-NEXT:    mul x0, x0, x2
+; CHECK-NEXT:    mov w2, wzr
 ; CHECK-NEXT:    ret
   %1 = tail call { i128, i1 } @llvm.smul.with.overflow.i128(i128 %x, i128 %y)
   %2 = extractvalue { i128, i1 } %1, 0
@@ -436,6 +482,11 @@ define { i128, i8 } @i128_overflowing_mul(i128 %x, i128 %y) {
 define i128 @i128_saturating_mul(i128 %x, i128 %y) {
 ; CHECK-LABEL: i128_saturating_mul:
 ; CHECK:       // %bb.0:
+; CHECK-NEXT:    eor x8, x3, x2, asr #63
+; CHECK-NEXT:    eor x9, x1, x0, asr #63
+; CHECK-NEXT:    orr x8, x9, x8
+; CHECK-NEXT:    cbz x8, .LBB23_2
+; CHECK-NEXT:  // %bb.1: // %overflow
 ; CHECK-NEXT:    asr x9, x1, #63
 ; CHECK-NEXT:    umulh x10, x0, x2
 ; CHECK-NEXT:    asr x13, x3, #63
@@ -445,29 +496,35 @@ define i128 @i128_saturating_mul(i128 %x, i128 %y) {
 ; CHECK-NEXT:    adds x10, x11, x10
 ; CHECK-NEXT:    mul x14, x0, x3
 ; CHECK-NEXT:    umulh x12, x0, x3
-; CHECK-NEXT:    adc x8, x8, x9
+; CHECK-NEXT:    adc x9, x8, x9
 ; CHECK-NEXT:    mul x13, x0, x13
-; CHECK-NEXT:    adds x9, x14, x10
-; CHECK-NEXT:    mul x11, x1, x3
-; CHECK-NEXT:    adc x10, x12, x13
-; CHECK-NEXT:    smulh x12, x1, x3
-; CHECK-NEXT:    asr x13, x8, #63
-; CHECK-NEXT:    asr x14, x10, #63
-; CHECK-NEXT:    adds x8, x8, x10
-; CHECK-NEXT:    adc x10, x13, x14
-; CHECK-NEXT:    adds x8, x11, x8
-; CHECK-NEXT:    asr x11, x9, #63
-; CHECK-NEXT:    mul x13, x0, x2
-; CHECK-NEXT:    adc x10, x12, x10
-; CHECK-NEXT:    eor x12, x3, x1
-; CHECK-NEXT:    eor x8, x8, x11
-; CHECK-NEXT:    eor x10, x10, x11
-; CHECK-NEXT:    asr x11, x12, #63
-; CHECK-NEXT:    orr x8, x8, x10
-; CHECK-NEXT:    eor x10, x11, #0x7fffffffffffffff
-; CHECK-NEXT:    cmp x8, #0
-; CHECK-NEXT:    csinv x0, x13, x11, eq
-; CHECK-NEXT:    csel x1, x10, x9, ne
+; CHECK-NEXT:    adds x8, x14, x10
+; CHECK-NEXT:    mul x15, x1, x3
+; CHECK-NEXT:    asr x14, x8, #63
+; CHECK-NEXT:    smulh x10, x1, x3
+; CHECK-NEXT:    adc x11, x12, x13
+; CHECK-NEXT:    asr x12, x9, #63
+; CHECK-NEXT:    asr x13, x11, #63
+; CHECK-NEXT:    adds x11, x9, x11
+; CHECK-NEXT:    mul x9, x0, x2
+; CHECK-NEXT:    adc x12, x12, x13
+; CHECK-NEXT:    adds x11, x15, x11
+; CHECK-NEXT:    adc x10, x10, x12
+; CHECK-NEXT:    cmp x11, x14
+; CHECK-NEXT:    ccmp x10, x14, #0, eq
+; CHECK-NEXT:    cset w10, ne
+; CHECK-NEXT:    b .LBB23_3
+; CHECK-NEXT:  .LBB23_2: // %overflow.no
+; CHECK-NEXT:    smulh x8, x0, x2
+; CHECK-NEXT:    mov w10, wzr
+; CHECK-NEXT:    mul x9, x0, x2
+; CHECK-NEXT:  .LBB23_3: // %overflow.res
+; CHECK-NEXT:    eor x11, x3, x1
+; CHECK-NEXT:    cmp w10, #0
+; CHECK-NEXT:    asr x11, x11, #63
+; CHECK-NEXT:    eor x12, x11, #0x7fffffffffffffff
+; CHECK-NEXT:    csinv x0, x9, x11, eq
+; CHECK-NEXT:    csel x1, x12, x8, ne
 ; CHECK-NEXT:    ret
   %1 = tail call { i128, i1 } @llvm.smul.with.overflow.i128(i128 %x, i128 %y)
   %2 = extractvalue { i128, i1 } %1, 0
diff --git a/llvm/test/CodeGen/AArch64/i128_with_overflow.ll b/llvm/test/CodeGen/AArch64/i128_with_overflow.ll
index 9924b7c63f763..3d90e094a5747 100644
--- a/llvm/test/CodeGen/AArch64/i128_with_overflow.ll
+++ b/llvm/test/CodeGen/AArch64/i128_with_overflow.ll
@@ -224,21 +224,29 @@ cleanup:
 define i128 @test_umul_i128(i128 noundef %x, i128 noundef %y) {
 ; CHECK-LABEL: test_umul_i128:
 ; CHECK:       // %bb.0: // %entry
+; CHECK-NEXT:    orr x8, x1, x3
+; CHECK-NEXT:    cbz x8, .LBB4_2
+; CHECK-NEXT:  // %bb.1: // %overflow
 ; CHECK-NEXT:    mul x9, x3, x0
 ; CHECK-NEXT:    cmp x1, #0
 ; CHECK-NEXT:    ccmp x3, #0, #4, ne
-; CHECK-NEXT:    umulh x8, x1, x2
-; CHECK-NEXT:    umulh x10, x3, x0
+; CHECK-NEXT:    umulh x10, x1, x2
+; CHECK-NEXT:    umulh x8, x3, x0
 ; CHECK-NEXT:    madd x9, x1, x2, x9
-; CHECK-NEXT:    ccmp xzr, x8, #0, eq
-; CHECK-NEXT:    umulh x11, x0, x2
 ; CHECK-NEXT:    ccmp xzr, x10, #0, eq
+; CHECK-NEXT:    umulh x11, x0, x2
+; CHECK-NEXT:    ccmp xzr, x8, #0, eq
+; CHECK-NEXT:    mul x0, x0, x2
 ; CHECK-NEXT:    cset w8, ne
 ; CHECK-NEXT:    adds x1, x11, x9
 ; CHECK-NEXT:    csinc w8, w8, wzr, lo
-; CHECK-NEXT:    cmp w8, #1
-; CHECK-NEXT:    b.ne .LBB4_2
-; CHECK-NEXT:  // %bb.1: // %if.then
+; CHECK-NEXT:    cbnz w8, .LBB4_3
+; CHECK-NEXT:    b .LBB4_4
+; CHECK-NEXT:  .LBB4_2: // %overflow.no
+; CHECK-NEXT:    umulh x1, x0, x2
+; CHECK-NEXT:    mul x0, x0, x2
+; CHECK-NEXT:    cbz w8, .LBB4_4
+; CHECK-NEXT:  .LBB4_3: // %if.then
 ; CHECK-NEXT:    str x30, [sp, #-16]! // 8-byte Folded Spill
 ; CHECK-NEXT:    .cfi_def_cfa_offset 16
 ; CHECK-NEXT:    .cfi_offset w30, -16
@@ -247,9 +255,7 @@ define i128 @test_umul_i128(i128 noundef %x, i128 noundef %y) {
 ; CHECK-NEXT:    sxtw x0, w0
 ; CHECK-NEXT:    asr x1, x0, #63
 ; CHECK-NEXT:    ldr x30, [sp], #16 // 8-byte Folded Reload
-; CHECK-NEXT:    ret
-; CHECK-NEXT:  .LBB4_2: // %if.end
-; CHECK-NEXT:    mul x0, x0, x2
+; CHECK-NEXT:  .LBB4_4: // %cleanup
 ; CHECK-NEXT:    ret
 entry:
   %0 = tail call { i128, i1 } @llvm.umul.with.overflow.i128(i128 %x, i128 %y)
@@ -273,34 +279,40 @@ cleanup:
 define i128 @test_smul_i128(i128 noundef %x, i128 noundef %y) {
 ; CHECK-LABEL: test_smul_i128:
 ; CHECK:       // %bb.0: // %entry
-; CHECK-NEXT:    asr x10, x1, #63
-; CHECK-NEXT:    umulh x11, x0, x2
-; CHECK-NEXT:    asr x14, x3, #63
-; CHECK-NEXT:    mov x8, x1
-; CHECK-NEXT:    mul x12, x1, x2
-; CHECK-NEXT:    umulh x9, x1, x2
-; CHECK-NEXT:    mul x10, x10, x2
-; CHECK-NEXT:    adds x11, x12, x11
-; CHECK-NEXT:    mul x15, x0, x3
-; CHECK-NEXT:    umulh x13, x0, x3
-; CHECK-NEXT:    adc x9, x9, x10
-; CHECK-NEXT:    mul x14, x0, x14
-; CHECK-NEXT:    mul x16, x1, x3
-; CHECK-NEXT:    adds x1, x15, x11
-; CHECK-NEXT:    asr x11, x9, #63
-; CHECK-NEXT:    smulh x8, x8, x3
-; CHECK-NEXT:    adc x10, x13, x14
-; CHECK-NEXT:    asr x12, x10, #63
-; CHECK-NEXT:    adds x9, x9, x10
-; CHECK-NEXT:    adc x10, x11, x12
-; CHECK-NEXT:    adds x9, x16, x9
-; CHECK-NEXT:    asr x11, x1, #63
-; CHECK-NEXT:    adc x8, x8, x10
-; CHECK-NEXT:    eor x8, x8, x11
-; CHECK-NEXT:    eor x9, x9, x11
+; CHECK-NEXT:    eor x8, x3, x2, asr #63
+; CHECK-NEXT:    eor x9, x1, x0, asr #63
 ; CHECK-NEXT:    orr x8, x9, x8
-; CHECK-NEXT:    cbz x8, .LBB5_2
-; CHECK-NEXT:  // %bb.1: // %if.then
+; CHECK-NEXT:    cbz x8, .LBB5_4
+; CHECK-NEXT:  // %bb.1: // %overflow
+; CHECK-NEXT:    asr x9, x1, #63
+; CHECK-NEXT:    umulh x10, x0, x2
+; CHECK-NEXT:    asr x13, x3, #63
+; CHECK-NEXT:    mul x11, x1, x2
+; CHECK-NEXT:    umulh x8, x1, x2
+; CHECK-NEXT:    mul x9, x9, x2
+; CHECK-NEXT:    adds x10, x11, x10
+; CHECK-NEXT:    mul x14, x0, x3
+; CHECK-NEXT:    umulh x12, x0, x3
+; CHECK-NEXT:    adc x8, x8, x9
+; CHECK-NEXT:    mov x9, x1
+; CHECK-NEXT:    mul x13, x0, x13
+; CHECK-NEXT:    asr x11, x8, #63
+; CHECK-NEXT:    mul x15, x1, x3
+; CHECK-NEXT:    adds x1, x14, x10
+; CHECK-NEXT:    smulh x9, x9, x3
+; CHECK-NEXT:    adc x10, x12, x13
+; CHECK-NEXT:    asr x12, x10, #63
+; CHECK-NEXT:    adds x8, x8, x10
+; CHECK-NEXT:    asr x10, x1, #63
+; CHECK-NEXT:    mul x0, x0, x2
+; CHECK-NEXT:    adc x11, x11, x12
+; CHECK-NEXT:    adds x8, x15, x8
+; CHECK-NEXT:    adc x9, x9, x11
+; CHECK-NEXT:    cmp x8, x10
+; CHECK-NEXT:    ccmp x9, x10, #0, eq
+; CHECK-NEXT:    cset w8, ne
+; CHECK-NEXT:    cbz w8, .LBB5_3
+; CHECK-NEXT:  .LBB5_2: // %if.then
 ; CHECK-NEXT:    str x30, [sp, #-16]! // 8-byte Folded Spill
 ; CHECK-NEXT:    .cfi_def_cfa_offset 16
 ; CHECK-NEXT:    .cfi_offset w30, -16
@@ -309,10 +321,13 @@ define i128 @test_smul_i128(i128 noundef %x, i128 noundef %y) {
 ; CHECK-NEXT:    sxtw x0, w0
 ; CHECK-NEXT:    asr x1, x0, #63
 ; CHECK-NEXT:    ldr x30, [sp], #16 // 8-byte Folded Reload
+; CHECK-NEXT:  .LBB5_3: // %cleanup
 ; CHECK-NEXT:    ret
-; CHECK-NEXT:  .LBB5_2: // %if.end
+; CHECK-NEXT:  .LBB5_4: // %overflow.no
+; CHECK-NEXT:    smulh x1, x0, x2
 ; CHECK-NEXT:    mul x0, x0, x2
-; CHECK-NEXT:    ret
+; CHECK-NEXT:    cbnz w8, .LBB5_2
+; CHECK-NEXT:    b .LBB5_3
 entry:
   %0 = tail call { i128, i1 } @llvm.smul.with.overflow.i128(i128 %x, i128 %y)
   %1 = extractvalue { i128, i1 } %0, 1
diff --git a/llvm/test/CodeGen/AArch64/mul-i128-overflow.ll b/llvm/test/CodeGen/AArch64/mul-i128-overflow.ll
new file mode 100644
index 0000000000000..7b60f81539aa8
--- /dev/null
+++ b/llvm/test/CodeGen/AArch64/mul-i128-overflow.ll
@@ -0,0 +1,261 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
+; RUN: llc -mtriple=aarch64 -o - %s | FileCheck %s
+
+
+declare i32 @error()
+
+define i128 @test1(i128 noundef %x, i128 noundef %y) {
+; CHECK-LABEL: test1:
+; CHECK:       // %bb.0: // %entry
+; CHECK-NEXT:    eor x8, x3, x2, asr #63
+; CHECK-NEXT:    eor x9, x1, x0, asr #63
+; CHECK-NEXT:    orr x8, x9, x8
+; CHECK-NEXT:    cbz x8, .LBB0_4
+; CHECK-NEXT:  // %bb.1: // %overflow
+; CHECK-NEXT:    asr x9, x1, #63
+; CHECK-NEXT:    umulh x10, x0, x2
+; CHECK-NEXT:    asr x13, x3, #63
+; CHECK-NEXT:    mul x11, x1, x2
+; CHECK-NEXT:    umulh x8, x1, x2
+; CHECK-NEXT:    mul x9, x9, x2
+; CHECK-NEXT:    adds x10, x11, x10
+; CHECK-NEXT:    mul x14, x0, x3
+; CHECK-NEXT:    umulh x12, x0, x3
+; CHECK-NEXT:    adc x8, x8, x9
+; CHECK-NEXT:    mov x9, x1
+; CHECK-NEXT:    mul x13, x0, x13
+; CHECK-NEXT:    asr x11, x8, #63
+; CHECK-NEXT:    mul x15, x1, x3
+; CHECK-NEXT:    adds x1, x14, x10
+; CHECK-NEXT:    smulh x9, x9, x3
+; CHECK-NEXT:    adc x10, x12, x13
+; CHECK-NEXT:    asr x12, x10, #63
+; CHECK-NEXT:    adds x8, x8, x10
+; CHECK-NEXT:    asr x10, x1, #63
+; CHECK-NEXT:    mul x0, x0, x2
+; CHECK-NEXT:    adc x11, x11, x12
+; CHECK-NEXT:    adds x8, x15, x8
+; CHECK-NEXT:    adc x9, x9, x11
+; CHECK-NEXT:    cmp x8, x10
+; CHECK-NEXT:    ccmp x9, x10, #0, eq
+; CHECK-NEXT:    cset w8, ne
+; CHECK-NEXT:    cbz w8, .LBB0_3
+; CHECK-NEXT:  .LBB0_2: // %if.then
+; CHECK-NEXT:    str x30, [sp, #-16]! // 8-byte Folded Spill
+; CHECK-NEXT:    .cfi_def_cfa_offset 16
+; CHECK-NEXT:    .cfi_offset w30, -16
+; CHECK-NEXT:    bl error
+; CHECK-NEXT:    // kill: def $w0 killed $w0 def $x0
+; CHECK-NEXT:    sxtw x0, w0
+; CHECK-NEXT:    asr x1, x0, #63
+; CHECK-NEXT:    ldr x30, [sp], #16 // 8-byte Folded Reload
+; CHECK-NEXT:  .LBB0_3: // %cleanup
+; CHECK-NEXT:    ret
+; CHECK-NEXT:  .LBB0_4: // %overflow.no
+; CHECK-NEXT:    smulh x1, x0, x2
+; CHECK-NEXT:    mul x0, x0, x2
+; CHECK-NEXT:    cbnz w8, .LBB0_2
+; CHECK-NEXT:    b .LBB0_3
+entry:
+  %0 = tail call { i128, i1 } @llvm.smul.with.overflow.i128(i128 %x, i128 %y)
+  %1 = extractvalue { i128, i1 } %0, 1
+  br i1 %1, label %if.then, label %if.end
+
+if.then:
+  %call = tail call i32 @error()
+  %conv1 = sext i32 %call to i128
+  br label %cleanup
+
+if.end:
+  %2 = extractvalue { i128, i1 } %0, 0
+  br label %cleanup
+
+cleanup:
+  %retval.0 = phi i128 [ %conv1, %if.then ], [ %2, %if.end ]
+  ret i128 %retval.0
+}
+
+define i128 @test2(i128 noundef %x, i128 noundef %y, ptr %out) {
+; CHECK-LABEL: test2:
+; CHECK:       // %bb.0: // %entry
+; CHECK-NEXT:    eor x8, x3, x2, asr #63
+; CHECK-NEXT:    eor x9, x1, x0, asr #63
+; CHECK-NEXT:    orr x8, x9, x8
+; CHECK-NEXT:    cbz x8, .LBB1_4
+; CHECK-NEXT:  // %bb.1: // %overflow
+; CHECK-NEXT:    asr x9, x1, #63
+; CHECK-NEXT:    umulh x10, x0, x2
+; CHECK-NEXT:    asr x13, x3, #63
+; CHECK-NEXT:    mul x11, x1, x2
+; CHECK-NEXT:    umulh x8, x1, x2
+; CHECK-NEXT:    mul x9, x9, x2
+; CHECK-NEXT:    adds x10, x11, x10
+; CHECK-NEXT:    mul x14, x0, x3
+; CHECK-NEXT:    umulh x12, x0, x3
+; CHECK-NEXT:    adc x8, x8, x9
+; CHECK-NEXT:    mov x9, x1
+; CHECK-NEXT:    mul x13, x0, x13
+; CHECK-NEXT:    asr x11, x8, #63
+; CHECK-NEXT:    mul x15, x1, x3
+; CHECK-NEXT:    adds x1, x14, x10
+; CHECK-NEXT:    smulh x9, x9, x3
+; CHECK-NEXT:    adc x10, x12, x13
+; CHECK-NEXT:    asr x12, x10, #63
+; CHECK-NEXT:    adds x8, x8, x10
+; CHECK-NEXT:    asr x10, x1, #63
+; CHECK-NEXT:    mul x0, x0, x2
+; CHECK-NEXT:    adc x11, x11, x12
+; CHECK-NEXT:    adds x8, x15, x8
+; CHECK-NEXT:    adc x9, x9, x11
+; CHECK-NEXT:    cmp x8, x10
+; CHECK-NEXT:    ccmp x9, x10, #0, eq
+; CHECK-NEXT:    cset w8, ne
+; CHECK-NEXT:    stp x0, x1, [x4]
+; CHECK-NEXT:    cbz w8, .LBB1_3
+; CHECK-NEXT:  .LBB1_2: // %if.then
+; CHECK-NEXT:    str x30, [sp, #-16]! // 8-byte Folded Spill
+; CHECK-NEXT:    .cfi_def_cfa_offset 16
+; CHECK-NEXT:    .cfi_offset w30, -16
+; CHECK-NEXT:    bl error
+; CHECK-NEXT:    // kill: def $w0 killed $w0 def $x0
+; CHECK-NEXT:    sxtw x0, w0
+; CHECK-NEXT:    asr x1, x0, #63
+; CHECK-NEXT:    ldr x30, [sp], #16 // 8-byte Folded Reload
+; CHECK-NEXT:  .LBB1_3: // %cleanup
+; CHECK-NEXT:    ret
+; CHECK-NEXT:  .LBB1_4: // %overflow.no
+; CHECK-NEXT:    smulh x1, x0, x2
+; CHECK-NEXT:    mul x0, x0, x2
+; CHECK-NEXT:    stp x0, x1, [x4]
+; CHECK-NEXT:    cbnz w8, .LBB1_2
+; CHECK-NEXT:    b .LBB1_3
+entry:
+  %0 = tail call { i128, i1 } @llvm.smul.with.overflow.i128(i128 %x, i128 %y)
+  %1 = extractvalue { i128, i1 } %0, 0
+  store i128 %1, ptr %out
+  %2 = extractvalue { i128, i1 } %0, 1
+  br i1 %2, label %if.then, label %cleanup
+
+if.then:
+  %call = tail call i32 @error()
+  %conv1 = sext i32 %call to i128
+  br label %cleanup
+
+cleanup:
+  %retval.0 = phi i128 [ %conv1, %if.then ], [ %1, %entry ]
+  ret i128 %retval.0
+}
+
+define i128 @test3(i128 noundef %x, i128 noundef %y, ptr %out) {
+; CHECK-LABEL: test3:
+; CHECK:       // %bb.0: // %entry
+; CHECK-NEXT:    orr x8, x1, x3
+; CHECK-NEXT:    cbz x8, .LBB2_3
+; CHECK-NEXT:  // %bb.1: // %overflow
+; CHECK-NEXT:    mul x8, x3, x0
+; CHECK-NEXT:    cmp x1, #0
+; CHECK-NEXT:    ccmp x3, #0, #4, ne
+; CHECK-NEXT:    umulh x10, x1, x2
+; CHECK-NEXT:    umulh x9, x3, x0
+; CHECK-NEXT:    madd x11, x1, x2, x8
+; CHECK-NEXT:    ccmp xzr, x10, #0, eq
+; CHECK-NEXT:    umulh x12, x0, x2
+; CHECK-NEXT:    ccmp xzr, x9, #0, eq
+; CHECK-NEXT:    mul x8, x0, x2
+; CHECK-NEXT:    cset w10, ne
+; CHECK-NEXT:    adds x9, x12, x11
+; CHECK-NEXT:    csinc w10, w10, wzr, lo
+; CHECK-NEXT:    stp x8, x9, [x4]
+; CHECK-NEXT:    cbnz w10, .LBB2_4
+; CHECK-NEXT:  .LBB2_2:
+; CHECK-NEXT:    mov x1, xzr
+; CHECK-NEXT:    mov w0, #1 // =0x1
+; CHECK-NEXT:    ret
+; CHECK-NEXT:  .LBB2_3: // %overflow.no
+; CHECK-NEXT:    umulh x9, x0, x2
+; CHECK-NEXT:    mov w10, wzr
+; CHECK-NEXT:    mul x8, x0, x2
+; CHECK-NEXT:    stp x8, x9, [x4]
+; CHECK-NEXT:    cbz w10, .LBB2_2
+; CHECK-NEXT:  .LBB2_4: // %if.then
+; CHECK-NEXT:    str x30, [sp, #-16]! // 8-byte Folded Spill
+; CHECK-NEXT:    .cfi_def_cfa_offset 16
+; CHECK-NEXT:    .cfi_offset w30, -16
+; CHECK-NEXT:    bl error
+; CHECK-NEXT:    // kill: def $w0 killed $w0 def $x0
+; CHECK-NEXT:    sxtw x0, w0
+; CHECK-NEXT:    asr x1, x0, #63
+; CHECK-NEXT:    ldr x30, [sp], #16 // 8-byte Folded Reload
+; CHECK-NEXT:    ret
+entry:
+  %0 = tail call { i128, i1 } @llvm.umul.with.overflow.i128(i128 %x, i128 %y)
+  %1 = extractvalue { i128, i1 } %0, 0
+  store i128 %1, ptr %out
+  %2 = extractvalue { i128, i1 } %0, 1
+  br i1 %2, label %if.then, label %cleanup
+
+if.then:
+  %call = tail call i32 @error()
+  %conv1 = sext i32 %call to i128
+  br label %cleanup
+
+cleanup:
+  %retval.0 = phi i128 [ %conv1, %if.then ], [ 1, %entry ]
+  ret i128 %retval.0
+}
+
+define i128 @test4(i128 noundef %x, i128 noundef %y, i128 %out) {
+; CHECK-LABEL: test4:
+; CHECK:       // %bb.0: // %entry
+; CHECK-NEXT:    orr x8, x1, x3
+; CHECK-NEXT:    cbz x8, .LBB3_2
+; CHECK-NEXT:  // %bb.1: // %overflow
+; CHECK-NEXT:    mul x8, x3, x0
+; CHECK-NEXT:    cmp x1, #0
+; CHECK-NEXT:    ccmp x3, #0, #4, ne
+; CHECK-NEXT:    umulh x10, x1, x2
+; CHECK-NEXT:    umulh x9, x3, x0
+; CHECK-NEXT:    madd x11, x1, x2, x8
+; CHECK-NEXT:    ccmp xzr, x10, #0, eq
+; CHECK-NEXT:    umulh x12, x0, x2
+; CHECK-NEXT:    ccmp xzr, x9, #0, eq
+; CHECK-NEXT:    mul x8, x0, x2
+; CHECK-NEXT:    cset w10, ne
+; CHECK-NEXT:    adds x9, x12, x11
+; CHECK-NEXT:    csinc w10, w10, wzr, lo
+; CHECK-NEXT:    b .LBB3_3
+; CHECK-NEXT:  .LBB3_2: // %overflow.no
+; CHECK-NEXT:    umulh x9, x0, x2
+; CHECK-NEXT:    mov w10, wzr
+; CHECK-NEXT:    mul x8, x0, x2
+; CHECK-NEXT:  .LBB3_3: // %overflow.res
+; CHECK-NEXT:    adds x0, x8, x4
+; CHECK-NEXT:    adc x1, x9, x5
+; CHECK-NEXT:    cbz w10, .LBB3_5
+; CHECK-NEXT:  // %bb.4: // %if.then
+; CHECK-NEXT:    str x30, [sp, #-16]! // 8-byte Folded Spill
+; CHECK-NEXT:    .cfi_def_cfa_offset 16
+; CHECK-NEXT:    .cfi_offset w30, -16
+; CHECK-NEXT:    bl error
+; CHECK-NEXT:    // kill: def $w0 killed $w0 def $x0
+; CHECK-NEXT:    sxtw x0, w0
+; CHECK-NEXT:    asr x1, x0, #63
+; CHECK-NEXT:    ldr x30, [sp], #16 // 8-byte Folded Reload
+; CHECK-NEXT:  .LBB3_5: // %cleanup
+; CHECK-NEXT:    ret
+entry:
+  %0 = tail call { i128, i1 } @llvm.umul.with.overflow.i128(i128 %x, i128 %y)
+  %1 = extractvalue { i128, i1 } %0, 0
+  %res = add i128 %1, %out
+  %2 = extractvalue { i128, i1 } %0, 1
+  br i1 %2, label %if.then, label %cleanup
+
+if.then:
+  %call = tail call i32 @error()
+  %conv1 = sext i32 %call to i128
+  br label %cleanup
+
+cleanup:
+  %retval.0 = phi i128 [ %conv1, %if.then ], [ %res, %entry ]
+  ret i128 %retval.0
+}
diff --git a/llvm/test/CodeGen/AArch64/umulo-128-legalisation-lowering.ll b/llvm/test/CodeGen/AArch64/umulo-128-legalisation-lowering.ll
index edfd80b4f2706..ace0c83e63c7c 100644
--- a/llvm/test/CodeGen/AArch64/umulo-128-legalisation-lowering.ll
+++ b/llvm/test/CodeGen/AArch64/umulo-128-legalisation-lowering.ll
@@ -4,20 +4,28 @@
 define { i128, i8 } @muloti_test(i128 %l, i128 %r) unnamed_addr #0 {
 ; AARCH-LABEL: muloti_test:
 ; AARCH:       // %bb.0: // %start
+; AARCH-NEXT:    orr x8, x1, x3
+; AARCH-NEXT:    cbz x8, .LBB0_2
+; AARCH-NEXT:  // %bb.1: // %overflow
 ; AARCH-NEXT:    mul x9, x3, x0
 ; AARCH-NEXT:    cmp x1, #0
 ; AARCH-NEXT:    ccmp x3, #0, #4, ne
-; AARCH-NEXT:    umulh x8, x1, x2
-; AARCH-NEXT:    umulh x10, x3, x0
+; AARCH-NEXT:    umulh x10, x1, x2
+; AARCH-NEXT:    umulh x8, x3, x0
 ; AARCH-NEXT:    madd x9, x1, x2, x9
-; AARCH-NEXT:    ccmp xzr, x8, #0, eq
-; AARCH-NEXT:    umulh x11, x0, x2
 ; AARCH-NEXT:    ccmp xzr, x10, #0, eq
+; AARCH-NEXT:    umulh x11, x0, x2
+; AARCH-NEXT:    ccmp xzr, x8, #0, eq
 ; AARCH-NEXT:    mul x0, x0, x2
 ; AARCH-NEXT:    cset w8, ne
 ; AARCH-NEXT:    adds x1, x11, x9
 ; AARCH-NEXT:    csinc w2, w8, wzr, lo
 ; AARCH-NEXT:    ret
+; AARCH-NEXT:  .LBB0_2: // %overflow.no
+; AARCH-NEXT:    umulh x1, x0, x2
+; AARCH-NEXT:    mul x0, x0, x2
+; AARCH-NEXT:    mov w2, wzr
+; AARCH-NEXT:    ret
 start:
   %0 = tail call { i128, i1 } @llvm.umul.with.overflow.i128(i128 %l, i128 %r) #2
   %1 = extractvalue { i128, i1 } %0, 0
@@ -35,45 +43,56 @@ start:
 define i128 @__muloti4(i128 %0, i128 %1, ptr nocapture nonnull writeonly align 4 %2) #2 {
 ; AARCH-LABEL: __muloti4:
 ; AARCH:       // %bb.0: // %Entry
-; AARCH-NEXT:    asr x11, x1, #63
-; AARCH-NEXT:    asr x9, x3, #63
-; AARCH-NEXT:    umulh x12, x0, x2
-; AARCH-NEXT:    mov x8, x1
+; AARCH-NEXT:    eor x8, x3, x2, asr #63
+; AARCH-NEXT:    eor x9, x1, x0, asr #63
 ; AARCH-NEXT:    str wzr, [x4]
-; AARCH-NEXT:    mul x13, x1, x2
-; AARCH-NEXT:    umulh x10, x1, x2
-; AARCH-NEXT:    mul x11, x11, x2
-; AARCH-NEXT:    adds x12, x13, x12
-; AARCH-NEXT:    mul x15, x0, x3
-; AARCH-NEXT:    umulh x14, x0, x3
-; AARCH-NEXT:    adc x10, x10, x11
-; AARCH-NEXT:    mul x9, x0, x9
-; AARCH-NEXT:    mul x16, x1, x3
-; AARCH-NEXT:    adds x1, x15, x12
-; AARCH-NEXT:    asr x12, x10, #63
-; AARCH-NEXT:    smulh x11, x8, x3
-; AARCH-NEXT:    adc x9, x14, x9
-; AARCH-NEXT:    asr x13, x9, #63
-; AARCH-NEXT:    adds x9, x10, x9
-; AARCH-NEXT:    asr x10, x1, #63
+; AARCH-NEXT:    orr x8, x9, x8
+; AARCH-NEXT:    cbz x8, .LBB1_2
+; AARCH-NEXT:  // %bb.1: // %overflow
+; AARCH-NEXT:    asr x9, x1, #63
+; AARCH-NEXT:    umulh x10, x0, x2
+; AARCH-NEXT:    asr x13, x3, #63
+; AARCH-NEXT:    mul x11, x1, x2
+; AARCH-NEXT:    umulh x8, x1, x2
+; AARCH-NEXT:    mul x9, x9, x2
+; AARCH-NEXT:    adds x10, x11, x10
+; AARCH-NEXT:    mul x14, x0, x3
+; AARCH-NEXT:    umulh x12, x0, x3
+; AARCH-NEXT:    adc x9, x8, x9
+; AARCH-NEXT:    mul x13, x0, x13
+; AARCH-NEXT:    adds x8, x14, x10
+; AARCH-NEXT:    mul x15, x1, x3
+; AARCH-NEXT:    smulh x10, x1, x3
+; AARCH-NEXT:    adc x11, x12, x13
+; AARCH-NEXT:    asr x12, x9, #63
+; AARCH-NEXT:    asr x13, x11, #63
+; AARCH-NEXT:    adds x9, x9, x11
+; AARCH-NEXT:    asr x11, x8, #63
 ; AARCH-NEXT:    mul x0, x0, x2
 ; AARCH-NEXT:    adc x12, x12, x13
-; AARCH-NEXT:    adds x9, x16, x9
-; AARCH-NEXT:    adc x11, x11, x12
-; AARCH-NEXT:    cmp x9, x10
-; AARCH-NEXT:    ccmp x11, x10, #0, eq
+; AARCH-NEXT:    adds x9, x15, x9
+; AARCH-NEXT:    adc x10, x10, x12
+; AARCH-NEXT:    cmp x9, x11
+; AARCH-NEXT:    ccmp x10, x11, #0, eq
 ; AARCH-NEXT:    cset w9, ne
-; AARCH-NEXT:    tbz x8, #63, .LBB1_2
-; AARCH-NEXT:  // %bb.1: // %Entry
-; AARCH-NEXT:    eor x8, x3, #0x8000000000000000
-; AARCH-NEXT:    orr x8, x2, x8
-; AARCH-NEXT:    cbz x8, .LBB1_3
-; AARCH-NEXT:  .LBB1_2: // %Else2
-; AARCH-NEXT:    cbz w9, .LBB1_4
-; AARCH-NEXT:  .LBB1_3: // %Then7
-; AARCH-NEXT:    mov w8, #1 // =0x1
-; AARCH-NEXT:    str w8, [x4]
-; AARCH-NEXT:  .LBB1_4: // %Block9
+; AARCH-NEXT:    tbnz x1, #63, .LBB1_3
+; AARCH-NEXT:    b .LBB1_4
+; AARCH-NEXT:  .LBB1_2: // %overflow.no
+; AARCH-NEXT:    smulh x8, x0, x2
+; AARCH-NEXT:    mov w9, wzr
+; AARCH-NEXT:    mul x0, x0, x2
+; AARCH-NEXT:    tbz x1, #63, .LBB1_4
+; AARCH-NEXT:  .LBB1_3: // %overflow.res
+; AARCH-NEXT:    eor x10, x3, #0x8000000000000000
+; AARCH-NEXT:    orr x10, x2, x10
+; AARCH-NEXT:    cbz x10, .LBB1_5
+; AARCH-NEXT:  .LBB1_4: // %Else2
+; AARCH-NEXT:    cbz w9, .LBB1_6
+; AARCH-NEXT:  .LBB1_5: // %Then7
+; AARCH-NEXT:    mov w9, #1 // =0x1
+; AARCH-NEXT:    str w9, [x4]
+; AARCH-NEXT:  .LBB1_6: // %Block9
+; AARCH-NEXT:    mov x1, x8
 ; AARCH-NEXT:    ret
 Entry:
   store i32 0, ptr %2, align 4

From c61c5d29334c7ff044ba46bff17e1f3d57e230a3 Mon Sep 17 00:00:00 2001
From: Luke Hutton <luke.hutton@arm.com>
Date: Tue, 18 Nov 2025 13:54:21 +0000
Subject: [PATCH 08/33] [mlir][tosa] Add a pass to narrow i64 to i32 (#165581)

This pass aims to narrow i64 types on TOSA operations to i32. It can be
useful for legalizations from various frameworks. It comes with the
following options:
- "aggressive-rewrite" - This option is typically able to narrow more
values, but may impact numerical behaviour if not used carefully.
- "convert-function-boundaries" - If enabled, parameters/results
to/from a function may be narrowed. Otherwise, casts are inserted to
preserve the I/O of the function.

Currently the non aggressive mode is very limited, targeting an argmax
-> cast sequence that has been observed during legalization as well as
some data layout operations that can always narrow. Support for more
operations will be added in the future.

Co-authored-by: Vitalii Shutov <vitalii.shutov@arm.com>
Co-authored-by: Shubham <shubham@arm.com>
Co-authored-by: Declan Flavin <declan.flavin@arm.com>

Signed-off-by: Luke Hutton <luke.hutton@arm.com>
Co-authored-by: Vitalii Shutov <vitalii.shutov@arm.com>
Co-authored-by: Shubham <shubham@arm.com>
Co-authored-by: Declan Flavin <declan.flavin@arm.com>
---
 .../mlir/Dialect/Tosa/Transforms/Passes.td    |  23 ++
 .../Dialect/Tosa/Transforms/CMakeLists.txt    |   1 +
 .../Tosa/Transforms/TosaNarrowI64ToI32.cpp    | 310 ++++++++++++++++++
 .../tosa-narrow-i64-to-i32-aggressive.mlir    |  81 +++++
 .../Dialect/Tosa/tosa-narrow-i64-to-i32.mlir  | 162 +++++++++
 5 files changed, 577 insertions(+)
 create mode 100644 mlir/lib/Dialect/Tosa/Transforms/TosaNarrowI64ToI32.cpp
 create mode 100644 mlir/test/Dialect/Tosa/tosa-narrow-i64-to-i32-aggressive.mlir
 create mode 100644 mlir/test/Dialect/Tosa/tosa-narrow-i64-to-i32.mlir

diff --git a/mlir/include/mlir/Dialect/Tosa/Transforms/Passes.td b/mlir/include/mlir/Dialect/Tosa/Transforms/Passes.td
index 14b00b04ccc18..420e58192b8fd 100644
--- a/mlir/include/mlir/Dialect/Tosa/Transforms/Passes.td
+++ b/mlir/include/mlir/Dialect/Tosa/Transforms/Passes.td
@@ -166,4 +166,27 @@ def TosaAttachTarget : Pass<"tosa-attach-target", "ModuleOp"> {
   ];
 }
 
+def TosaNarrowI64ToI32Pass : Pass<"tosa-narrow-i64-to-i32", "func::FuncOp"> {
+  let summary = "Narrow I64 TOSA operations to I32";
+  let description = [{
+    This pass narrows TOSA operations with 64-bit integer tensor types to
+    32-bit integer tensor types. This can be useful for backends that do not
+    support the EXT-INT64 extension of TOSA.
+  }];
+
+  let options = [
+    Option<"aggressiveRewrite", "aggressive-rewrite", "bool", "false",
+      "If enabled, all TOSA operations are rewritten, regardless of whether the narrowing "
+      "is safe. This option may lead to data loss if not used carefully.">,
+    Option<"convertFunctionBoundaries", "convert-function-boundaries", "bool", "false",
+      "If enabled, the pass will convert function I/O types as well. Otherwise casts will "
+      "be inserted at the I/O boundaries.">
+  ];
+
+  let dependentDialects = [
+    "func::FuncDialect",
+    "tosa::TosaDialect",
+  ];
+}
+
 #endif // MLIR_DIALECT_TOSA_TRANSFORMS_PASSES
diff --git a/mlir/lib/Dialect/Tosa/Transforms/CMakeLists.txt b/mlir/lib/Dialect/Tosa/Transforms/CMakeLists.txt
index 41b338d6e7189..987ce4ed870c9 100644
--- a/mlir/lib/Dialect/Tosa/Transforms/CMakeLists.txt
+++ b/mlir/lib/Dialect/Tosa/Transforms/CMakeLists.txt
@@ -12,6 +12,7 @@ add_mlir_dialect_library(MLIRTosaTransforms
   TosaTypeConverters.cpp
   TosaProfileCompliance.cpp
   TosaValidation.cpp
+  TosaNarrowI64ToI32.cpp
 
   ADDITIONAL_HEADER_DIRS
   ${MLIR_MAIN_INCLUDE_DIR}/mlir/Dialect/Tosa/Transforms
diff --git a/mlir/lib/Dialect/Tosa/Transforms/TosaNarrowI64ToI32.cpp b/mlir/lib/Dialect/Tosa/Transforms/TosaNarrowI64ToI32.cpp
new file mode 100644
index 0000000000000..ddaf7d8a5e033
--- /dev/null
+++ b/mlir/lib/Dialect/Tosa/Transforms/TosaNarrowI64ToI32.cpp
@@ -0,0 +1,310 @@
+//===- TosaNarrowI64ToI32.cpp ---------------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// This pass narrows TOSA operations with 64-bit integer tensor types to
+// 32-bit integer tensor types. This can be useful for backends that do not
+// support the EXT-INT64 extension of TOSA. The pass has two options:
+//
+// - aggressive-rewrite - If enabled, all TOSA operations are rewritten,
+//     regardless of whether the narrowing is safe. This option may lead to
+//     data loss if not used carefully.
+// - convert-function-boundaries - If enabled, the pass will convert function
+//     I/O types as well. Otherwise casts will be inserted at the I/O
+//     boundaries.
+//
+//===----------------------------------------------------------------------===//
+
+#include "mlir/Dialect/Tosa/Transforms/Passes.h"
+
+#include "mlir/Dialect/Func/IR/FuncOps.h"
+#include "mlir/Dialect/Func/Transforms/FuncConversions.h"
+#include "mlir/IR/Verifier.h"
+#include "mlir/Pass/Pass.h"
+
+namespace mlir {
+namespace tosa {
+#define GEN_PASS_DEF_TOSANARROWI64TOI32PASS
+#include "mlir/Dialect/Tosa/Transforms/Passes.h.inc"
+} // namespace tosa
+} // namespace mlir
+
+using namespace mlir;
+using namespace mlir::tosa;
+
+namespace {
+
+/// Rebuilds `op` on top of the already converted `operands`. Result types,
+/// integer, type and dense-elements attributes, and region signatures all go
+/// through `typeConverter`; every other attribute is copied over untouched.
+/// `op` is then replaced by the new operation. Returns failure as soon as one
+/// of those conversions cannot be performed.
+LogicalResult convertGenericOp(Operation *op, ValueRange operands,
+                               ConversionPatternRewriter &rewriter,
+                               const TypeConverter *typeConverter) {
+  // Convert types of results
+  SmallVector<Type, 4> newResults;
+  if (failed(typeConverter->convertTypes(op->getResultTypes(), newResults)))
+    return failure();
+
+  // Create a new operation state
+  OperationState state(op->getLoc(), op->getName().getStringRef(), operands,
+                       newResults, {}, op->getSuccessors());
+
+  for (const NamedAttribute &namedAttribute : op->getAttrs()) {
+    const Attribute attribute = namedAttribute.getValue();
+
+    // Pick the type that keys the attribute conversion, together with the
+    // diagnostic to report should that conversion be unavailable.
+    Type type;
+    const char *failureMessage = nullptr;
+    if (const auto intAttr = dyn_cast<IntegerAttr>(attribute)) {
+      type = intAttr.getType();
+      failureMessage = "Failed to convert integer attribute.";
+    } else if (const auto typeAttr = dyn_cast<TypeAttr>(attribute)) {
+      type = typeAttr.getValue();
+      failureMessage = "Failed to convert type attribute.";
+    } else if (const auto denseAttr = dyn_cast<DenseElementsAttr>(attribute)) {
+      type = denseAttr.getType();
+      failureMessage = "Failed to convert dense elements attribute.";
+    } else {
+      // Not an attribute kind this helper rewrites; keep it as is.
+      state.addAttribute(namedAttribute.getName(), attribute);
+      continue;
+    }
+
+    // A missing conversion is reported as a match failure for every kind,
+    // never dereferenced as an empty optional.
+    const std::optional<Attribute> convertedAttribute =
+        typeConverter->convertTypeAttribute(type, attribute);
+    if (!convertedAttribute)
+      return rewriter.notifyMatchFailure(op, failureMessage);
+    state.addAttribute(namedAttribute.getName(), *convertedAttribute);
+  }
+
+  for (Region &region : op->getRegions()) {
+    Region *newRegion = state.addRegion();
+    rewriter.inlineRegionBefore(region, *newRegion, newRegion->begin());
+    if (failed(rewriter.convertRegionTypes(newRegion, *typeConverter)))
+      return failure();
+  }
+
+  Operation *newOp = rewriter.create(state);
+  rewriter.replaceOp(op, newOp->getResults());
+  return success();
+}
+
+// ===========================
+// Aggressive rewrite patterns
+// ===========================
+
+class ConvertGenericOp : public ConversionPattern {
+public:
+  ConvertGenericOp(TypeConverter &typeConverter, MLIRContext *context)
+      : ConversionPattern(typeConverter, MatchAnyOpTypeTag{}, 0, context) {}
+
+  /// Converts any TOSA op generically; other operations fail to match.
+  LogicalResult
+  matchAndRewrite(Operation *op, ArrayRef<Value> operands,
+                  ConversionPatternRewriter &rewriter) const final {
+    if (isa<tosa::TosaOp>(op))
+      return convertGenericOp(op, operands, rewriter, typeConverter);
+    return rewriter.notifyMatchFailure(
+        op,
+        "Support for operations other than TOSA has not been implemented.");
+  }
+};
+
+// ===============================
+// Bounds checked rewrite patterns
+// ===============================
+
+/// Converts tosa.argmax to the converted result type when the reduced axis has
+/// a static size below INT32_MAX; unranked inputs never reach isStaticDim().
+class ConvertArgMaxOpWithBoundsChecking
+    : public OpConversionPattern<tosa::ArgMaxOp> {
+  using OpConversionPattern::OpConversionPattern;
+
+  LogicalResult
+  matchAndRewrite(tosa::ArgMaxOp op, OpAdaptor adaptor,
+                  ConversionPatternRewriter &rewriter) const final {
+    const int32_t axis = op.getAxis();
+    const auto inputType = dyn_cast<ShapedType>(adaptor.getInput().getType());
+    if (!inputType || !inputType.hasRank() || !inputType.isStaticDim(axis))
+      return rewriter.notifyMatchFailure(
+          op, "Requires a static axis dimension for bounds checking.");
+    const int64_t axisDim = inputType.getDimSize(axis);
+    if (axisDim >= std::numeric_limits<int32_t>::max())
+      return rewriter.notifyMatchFailure(
+          op, "Axis dimension is too large to narrow safely.");
+    const Type resultType = op.getOutput().getType();
+    const Type newResultType = typeConverter->convertType(resultType);
+    rewriter.replaceOpWithNewOp<tosa::ArgMaxOp>(op, newResultType,
+                                                adaptor.getInput(), axis);
+    return success();
+  }
+};
+
+/// Re-creates tosa.cast with a converted result type, refusing casts whose
+/// integer input element type is wider than the integer result element type.
+class ConvertCastOpWithBoundsChecking
+    : public OpConversionPattern<tosa::CastOp> {
+  using OpConversionPattern::OpConversionPattern;
+
+  LogicalResult
+  matchAndRewrite(tosa::CastOp op, OpAdaptor adaptor,
+                  ConversionPatternRewriter &rewriter) const final {
+    const Value input = adaptor.getInput();
+    const auto srcType = dyn_cast<ShapedType>(input.getType());
+    const auto dstType = dyn_cast<ShapedType>(op.getResult().getType());
+    if (!srcType || !dstType)
+      return failure();
+
+    const auto srcInt = dyn_cast<IntegerType>(srcType.getElementType());
+    const auto dstInt = dyn_cast<IntegerType>(dstType.getElementType());
+    if (srcInt && dstInt && srcInt.getWidth() > dstInt.getWidth())
+      return rewriter.notifyMatchFailure(
+          op, "Narrowing cast may lead to data loss.");
+
+    const Type newDstType = typeConverter->convertType(dstType);
+    rewriter.replaceOpWithNewOp<tosa::CastOp>(op, newDstType, input);
+    return success();
+  }
+};
+
+template <typename OpTy>
+class ConvertTypedOp : public OpConversionPattern<OpTy> {
+  using OpConversionPattern<OpTy>::OpConversionPattern;
+  /// Delegates the rewrite of `OpTy` to the shared convertGenericOp routine.
+  LogicalResult
+  matchAndRewrite(OpTy op, typename OpTy::Adaptor adaptor,
+                  ConversionPatternRewriter &rewriter) const final {
+    const auto *converter = this->getTypeConverter();
+    return convertGenericOp(op, adaptor.getOperands(), rewriter, converter);
+  }
+};
+
+struct TosaNarrowI64ToI32
+    : public tosa::impl::TosaNarrowI64ToI32PassBase<TosaNarrowI64ToI32> {
+public:
+  explicit TosaNarrowI64ToI32() = default;
+  explicit TosaNarrowI64ToI32(const TosaNarrowI64ToI32PassOptions &options)
+      : TosaNarrowI64ToI32() {
+    this->aggressiveRewrite = options.aggressiveRewrite;
+    this->convertFunctionBoundaries = options.convertFunctionBoundaries;
+  }
+  /// Rewrites i64 TOSA types to i32 by running a full dialect conversion.
+  void runOnOperation() override {
+    MLIRContext *context = &getContext();
+    // Types: i64 and ranked tensors of i64 become i32; all else is unchanged.
+    TypeConverter typeConverter;
+    typeConverter.addConversion([](Type type) -> Type { return type; });
+    typeConverter.addConversion([](IntegerType type) -> Type {
+      if (!type.isInteger(64))
+        return type;
+      return IntegerType::get(type.getContext(), 32);
+    });
+    typeConverter.addConversion(
+        [&typeConverter](RankedTensorType type) -> Type {
+          const Type elementType = type.getElementType();
+          if (!elementType.isInteger(64))
+            return type;
+          // clone() keeps the shape and the tensor encoding of the original.
+          return type.clone(typeConverter.convertType(elementType));
+        });
+    // Values crossing the i64/i32 boundary are bridged with a tosa.cast.
+    const auto materializeCast = [](OpBuilder &builder, Type resultType,
+                                    ValueRange inputs, Location loc) -> Value {
+      if (inputs.size() != 1)
+        return Value();
+      return tosa::CastOp::create(builder, loc, resultType, inputs.front());
+    };
+    typeConverter.addSourceMaterialization(materializeCast);
+    typeConverter.addTargetMaterialization(materializeCast);
+    // Attributes: integer values are narrowed to 32 bits with saturation.
+    typeConverter.addTypeAttributeConversion(
+        [](IntegerType type, IntegerAttr attribute) -> Attribute {
+          // truncSSat() cannot widen, so narrower values (e.g. i1) are kept.
+          if (attribute.getValue().getBitWidth() < 32)
+            return attribute;
+          const APInt value = attribute.getValue().truncSSat(32);
+          return IntegerAttr::get(IntegerType::get(type.getContext(), 32),
+                                  value);
+        });
+    typeConverter.addTypeAttributeConversion(
+        [&typeConverter](ShapedType type,
+                         DenseIntElementsAttr attr) -> Attribute {
+          const ShapedType newType =
+              cast<ShapedType>(typeConverter.convertType(type));
+          // Nothing to narrow (this also covers index element types).
+          if (newType == type)
+            return attr;
+          const auto newElementType =
+              cast<IntegerType>(newType.getElementType());
+          return attr.mapValues(newElementType, [&](const APInt &v) {
+            return v.truncSSat(newElementType.getWidth());
+          });
+        });
+    // A TOSA op is legal once its operand and result types need no conversion.
+    ConversionTarget target(*context);
+    target.addDynamicallyLegalDialect<tosa::TosaDialect>(
+        [&typeConverter](Operation *op) {
+          return typeConverter.isLegal(op->getResultTypes()) &&
+                 typeConverter.isLegal(op->getOperandTypes());
+        });
+    if (convertFunctionBoundaries) {
+      target.addDynamicallyLegalOp<func::FuncOp>(
+          [&typeConverter](func::FuncOp op) {
+            return typeConverter.isSignatureLegal(op.getFunctionType()) &&
+                   typeConverter.isLegal(&op.getBody());
+          });
+      target.addDynamicallyLegalOp<func::ReturnOp>([](func::ReturnOp op) {
+        const FunctionType funcType =
+            op->getParentOfType<func::FuncOp>().getFunctionType();
+        return llvm::equal(op.getOperandTypes(), funcType.getResults());
+      });
+    } else {
+      target.addDynamicallyLegalOp<func::FuncOp>(
+          [](func::FuncOp op) { return true; });
+      target.addDynamicallyLegalOp<func::ReturnOp>(
+          [](func::ReturnOp op) { return true; });
+    }
+    // Aggressive mode uses the generic pattern; otherwise only the listed ops.
+    RewritePatternSet patterns(context);
+    if (convertFunctionBoundaries) {
+      populateFunctionOpInterfaceTypeConversionPattern<func::FuncOp>(
+          patterns, typeConverter);
+      populateReturnOpTypeConversionPattern(patterns, typeConverter);
+    }
+    if (aggressiveRewrite) {
+      patterns.add<ConvertGenericOp>(typeConverter, context);
+    } else {
+      // Tensor
+      patterns.add<ConvertArgMaxOpWithBoundsChecking>(typeConverter, context);
+      // Data layout
+      patterns.add<ConvertTypedOp<tosa::ConcatOp>>(typeConverter, context);
+      patterns.add<ConvertTypedOp<tosa::PadOp>>(typeConverter, context);
+      patterns.add<ConvertTypedOp<tosa::ReshapeOp>>(typeConverter, context);
+      patterns.add<ConvertTypedOp<tosa::ReverseOp>>(typeConverter, context);
+      patterns.add<ConvertTypedOp<tosa::SliceOp>>(typeConverter, context);
+      patterns.add<ConvertTypedOp<tosa::TileOp>>(typeConverter, context);
+      patterns.add<ConvertTypedOp<tosa::TransposeOp>>(typeConverter, context);
+      patterns.add<ConvertTypedOp<tosa::IdentityOp>>(typeConverter, context);
+      // Type conversion
+      patterns.add<ConvertCastOpWithBoundsChecking>(typeConverter, context);
+      // Controlflow
+      patterns.add<ConvertTypedOp<tosa::IfOp>>(typeConverter, context);
+      patterns.add<ConvertTypedOp<tosa::WhileOp>>(typeConverter, context);
+    }
+    // A full conversion fails the pass if any illegal op remains.
+    if (failed(
+            applyFullConversion(getOperation(), target, std::move(patterns))))
+      signalPassFailure();
+  }
+};
+
+} // namespace
diff --git a/mlir/test/Dialect/Tosa/tosa-narrow-i64-to-i32-aggressive.mlir b/mlir/test/Dialect/Tosa/tosa-narrow-i64-to-i32-aggressive.mlir
new file mode 100644
index 0000000000000..1a36177a37033
--- /dev/null
+++ b/mlir/test/Dialect/Tosa/tosa-narrow-i64-to-i32-aggressive.mlir
@@ -0,0 +1,81 @@
+// RUN: mlir-opt -split-input-file -verify-diagnostics -tosa-narrow-i64-to-i32="aggressive-rewrite=1" %s | FileCheck %s --allow-unused-prefixes --check-prefixes=COMMON,DEFAULT
+// RUN: mlir-opt -split-input-file -verify-diagnostics -tosa-narrow-i64-to-i32="aggressive-rewrite=1 convert-function-boundaries=1" %s | FileCheck %s --allow-unused-prefixes --check-prefixes=COMMON,FUNCBOUND
+
+// CHECK-LABEL: test_i64_argmax_large_axis_dim
+func.func @test_i64_argmax_large_axis_dim(%arg0: tensor<1x513x513x2147483650xi8>) -> tensor<1x513x513xi64> {
+  // DEFAULT: tosa.argmax %arg0 {axis = 3 : i32} : (tensor<1x513x513x2147483650xi8>) -> tensor<1x513x513xi32>
+  %0 = tosa.argmax %arg0 {axis = 3 : i32} : (tensor<1x513x513x2147483650xi8>) -> tensor<1x513x513xi64>
+  return %0 : tensor<1x513x513xi64>
+}
+
+// -----
+
+// CHECK-LABEL: test_convert_input_parameters
+// DEFAULT: %[[IN:.*]]: tensor<1x513x513x3xi64>
+// FUNCBOUND: %[[IN:.*]]: tensor<1x513x513x3xi32>
+func.func @test_convert_input_parameters(%arg0: tensor<1x513x513x3xi64>) -> tensor<1x513x513x3xf32> {
+  // DEFAULT: %[[FUNC_BOUND_CAST:.*]] = tosa.cast %[[IN]] : (tensor<1x513x513x3xi64>) -> tensor<1x513x513x3xi32>
+  // DEFAULT: %[[CAST1:.*]] = tosa.cast %[[FUNC_BOUND_CAST]] : (tensor<1x513x513x3xi32>) -> tensor<1x513x513x3xi32>
+  // FUNCBOUND: %[[CAST1:.*]] = tosa.cast %[[IN]] : (tensor<1x513x513x3xi32>) -> tensor<1x513x513x3xi32>
+  %0 = tosa.cast %arg0 : (tensor<1x513x513x3xi64>) -> tensor<1x513x513x3xi32>
+
+  // COMMON: %[[CAST2:.*]] = tosa.cast %[[CAST1]] : (tensor<1x513x513x3xi32>) -> tensor<1x513x513x3xf32>
+  %1 = tosa.cast %0 : (tensor<1x513x513x3xi32>) -> tensor<1x513x513x3xf32>
+  return %1 : tensor<1x513x513x3xf32>
+}
+
+// -----
+
+// CHECK-LABEL: test_add
+// DEFAULT: %[[IN0:.*]]: tensor<13x21x1xi64>, %[[IN1:.*]]: tensor<13x21x3xi64>
+// FUNCBOUND: %[[IN0:.*]]: tensor<13x21x1xi32>, %[[IN1:.*]]: tensor<13x21x3xi32>
+func.func @test_add(%arg0: tensor<13x21x1xi64>, %arg1: tensor<13x21x3xi64>) -> tensor<13x21x3xi64> {
+  // DEFAULT-DAG: %[[FUNC_BOUND_CAST0:.*]] = tosa.cast %[[IN0]] : (tensor<13x21x1xi64>) -> tensor<13x21x1xi32>
+  // DEFAULT-DAG: %[[FUNC_BOUND_CAST1:.*]] = tosa.cast %[[IN1]] : (tensor<13x21x3xi64>) -> tensor<13x21x3xi32>
+  // DEFAULT: %[[ADD:.*]] = tosa.add %[[FUNC_BOUND_CAST0]], %[[FUNC_BOUND_CAST1]] : (tensor<13x21x1xi32>, tensor<13x21x3xi32>) -> tensor<13x21x3xi32>
+  // DEFAULT: %[[CAST:.*]] = tosa.cast %[[ADD]] : (tensor<13x21x3xi32>) -> tensor<13x21x3xi64>
+  // DEFAULT: return %[[CAST]] : tensor<13x21x3xi64>
+  // FUNCBOUND: %[[ADD:.*]] = tosa.add %[[IN0]], %[[IN1]] : (tensor<13x21x1xi32>, tensor<13x21x3xi32>) -> tensor<13x21x3xi32>
+  // FUNCBOUND: return %[[ADD]] : tensor<13x21x3xi32>
+  %0 = tosa.add %arg0, %arg1 : (tensor<13x21x1xi64>, tensor<13x21x3xi64>) -> tensor<13x21x3xi64>
+  return %0 : tensor<13x21x3xi64>
+}
+
+// -----
+
+// CHECK-LABEL: test_regions
+// DEFAULT: %[[IN0:.*]]: tensor<i64>, %[[IN1:.*]]: tensor<i64>
+func.func @test_regions(%arg0: tensor<i64>, %arg1: tensor<i64>, %arg2: tensor<i1>) -> tensor<i64> {
+  // DEFAULT-DAG: %[[CAST0:.*]] = tosa.cast %[[IN0]] : (tensor<i64>) -> tensor<i32>
+  // DEFAULT-DAG: %[[CAST1:.*]] = tosa.cast %[[IN1]] : (tensor<i64>) -> tensor<i32>
+  // COMMON: %[[IF_RESULT:.*]] = tosa.cond_if
+  %0 = tosa.cond_if %arg2 : tensor<i1> -> (tensor<i64>) {
+    // DEFAULT: %[[ADD:.*]] = tosa.add %[[CAST0]], %[[CAST1]] : (tensor<i32>, tensor<i32>) -> tensor<i32>
+    // FUNCBOUND: %[[ADD:.*]] = tosa.add %[[IN0]], %[[IN1]] : (tensor<i32>, tensor<i32>) -> tensor<i32>
+    %1 = tosa.add %arg0, %arg1 : (tensor<i64>, tensor<i64>) -> tensor<i64>
+    // COMMON: tosa.yield %[[ADD]] : tensor<i32>
+    tosa.yield %1 : tensor<i64>
+  } else {
+    // DEFAULT: %[[SUB:.*]] = tosa.sub %[[CAST0]], %[[CAST1]] : (tensor<i32>, tensor<i32>) -> tensor<i32>
+    // FUNCBOUND: %[[SUB:.*]] = tosa.sub %[[IN0]], %[[IN1]] : (tensor<i32>, tensor<i32>) -> tensor<i32>
+    %1 = tosa.sub %arg0, %arg1 : (tensor<i64>, tensor<i64>) -> tensor<i64>
+    // COMMON: tosa.yield %[[SUB]] : tensor<i32>
+    tosa.yield %1 : tensor<i64>
+  }
+  // DEFAULT: %[[OUT:.*]] = tosa.cast %[[IF_RESULT]] : (tensor<i32>) -> tensor<i64>
+  // DEFAULT: return %[[OUT]] : tensor<i64>
+  // FUNCBOUND: return %[[IF_RESULT]] : tensor<i32>
+  return %0 : tensor<i64>
+}
+
+// -----
+
+// CHECK-LABEL: test_const
+func.func @test_const() -> tensor<2xi64> {
+  // COMMON: %[[CONST:.*]] = "tosa.const"() <{values = dense<[1, 2]> : tensor<2xi32>}> : () -> tensor<2xi32>
+  %0 = "tosa.const"() <{values = dense<[1, 2]> : tensor<2xi64>}> : () -> tensor<2xi64>
+  // DEFAULT: %[[OUT:.*]] = tosa.cast %[[CONST]] : (tensor<2xi32>) -> tensor<2xi64>
+  // DEFAULT: return %[[OUT]] : tensor<2xi64>
+  // FUNCBOUND: return %[[CONST]] : tensor<2xi32>
+  return %0 : tensor<2xi64>
+}
diff --git a/mlir/test/Dialect/Tosa/tosa-narrow-i64-to-i32.mlir b/mlir/test/Dialect/Tosa/tosa-narrow-i64-to-i32.mlir
new file mode 100644
index 0000000000000..a14483fcdd7b0
--- /dev/null
+++ b/mlir/test/Dialect/Tosa/tosa-narrow-i64-to-i32.mlir
@@ -0,0 +1,162 @@
+// RUN: mlir-opt -split-input-file -verify-diagnostics -tosa-narrow-i64-to-i32="convert-function-boundaries=0" %s | FileCheck %s --allow-unused-prefixes --check-prefixes=COMMON,DEFAULT
+// RUN: mlir-opt -split-input-file -verify-diagnostics -tosa-narrow-i64-to-i32="convert-function-boundaries=1" %s | FileCheck %s --allow-unused-prefixes --check-prefixes=COMMON,FUNCBOUND
+
+// -----
+
+// CHECK-LABEL: test_i64_argmax
+func.func @test_i64_argmax(%arg0: tensor<1x513x513x19xi8>) -> tensor<1x513x513xi64> {
+  // COMMON: %[[ARGMAX:.*]] = tosa.argmax %arg0 {axis = 3 : i32} : (tensor<1x513x513x19xi8>) -> tensor<1x513x513xi32>
+  %0 = tosa.argmax %arg0 {axis = 3 : i32} : (tensor<1x513x513x19xi8>) -> tensor<1x513x513xi64>
+
+  // DEFAULT: %[[CAST:.*]] = tosa.cast %[[ARGMAX]] : (tensor<1x513x513xi32>) -> tensor<1x513x513xi64>
+  // FUNCBOUND: return %[[ARGMAX]] : tensor<1x513x513xi32>
+  return %0 : tensor<1x513x513xi64>
+}
+
+// -----
+
+// CHECK-LABEL: test_i64_argmax_cast
+func.func @test_i64_argmax_cast(%arg0: tensor<1x513x513x19xi8>) -> tensor<1x513x513xf32> {
+  // COMMON: %[[ARGMAX:.*]] = tosa.argmax %arg0 {axis = 3 : i32} : (tensor<1x513x513x19xi8>) -> tensor<1x513x513xi32>
+  %0 = tosa.argmax %arg0 {axis = 3 : i32} : (tensor<1x513x513x19xi8>) -> tensor<1x513x513xi64>
+  // COMMON: tosa.cast %[[ARGMAX]] : (tensor<1x513x513xi32>) -> tensor<1x513x513xf32>
+  %1 = tosa.cast %0 : (tensor<1x513x513xi64>) -> tensor<1x513x513xf32>
+  return %1 : tensor<1x513x513xf32>
+}
+
+// -----
+
+// CHECK-LABEL: test_i64_argmax_large_axis_dim
+func.func @test_i64_argmax_large_axis_dim(%arg0: tensor<1x513x513x2147483650xi8>) -> tensor<1x513x513xi64> {
+  // expected-error @+1 {{failed to legalize operation 'tosa.argmax'}}
+  %0 = tosa.argmax %arg0 {axis = 3 : i32} : (tensor<1x513x513x2147483650xi8>) -> tensor<1x513x513xi64>
+  return %0 : tensor<1x513x513xi64>
+}
+
+// -----
+
+// CHECK-LABEL: test_add
+func.func @test_add(%arg0: tensor<13x21x1xi64>, %arg1: tensor<13x21x3xi64>) -> tensor<13x21x3xi64> {
+  // expected-error @+1 {{failed to legalize operation 'tosa.add'}}
+  %0 = tosa.add %arg0, %arg1 : (tensor<13x21x1xi64>, tensor<13x21x3xi64>) -> tensor<13x21x3xi64>
+  return %0 : tensor<13x21x3xi64>
+}
+
+// -----
+
+// CHECK-LABEL: test_regions
+func.func @test_regions(%arg0: tensor<1x2xi32>, %arg1: tensor<1xi32>, %arg2: tensor<i1>) -> tensor<1xi32> {
+  // COMMON: %[[IF_RESULT:.*]] = tosa.cond_if %arg2 : tensor<i1> -> tensor<1xi32>
+  %0 = tosa.cond_if %arg2 : tensor<i1> -> tensor<1xi32> {
+    // COMMON: %[[ARGMAX:.*]] = tosa.argmax %arg0 {axis = 1 : i32} : (tensor<1x2xi32>) -> tensor<1xi32>
+    %1 = tosa.argmax %arg0 {axis = 1 : i32} : (tensor<1x2xi32>) -> tensor<1xi64>
+    // COMMON: %[[CAST:.*]] = tosa.cast %[[ARGMAX]] : (tensor<1xi32>) -> tensor<1xi32>
+    %2 = tosa.cast %1 : (tensor<1xi64>) -> tensor<1xi32>
+    // COMMON: tosa.yield %[[CAST]] : tensor<1xi32>
+    tosa.yield %2 : tensor<1xi32>
+  } else {
+    tosa.yield %arg1 : tensor<1xi32>
+  }
+  // COMMON: return %[[IF_RESULT]] : tensor<1xi32>
+  return %0 : tensor<1xi32>
+}
+
+// -----
+
+// CHECK-LABEL: test_concat
+func.func @test_concat(%arg0: tensor<13x21x3xi64>, %arg1: tensor<13x21x3xi64>) -> tensor<26x21x3xi64> {
+  // COMMON: tosa.concat %{{.*}}, %{{.*}} {axis = 0 : i32} : (tensor<13x21x3xi32>, tensor<13x21x3xi32>) -> tensor<26x21x3xi32>
+  %0 = tosa.concat %arg0, %arg1 {axis = 0 : i32} : (tensor<13x21x3xi64>, tensor<13x21x3xi64>) -> tensor<26x21x3xi64>
+  return %0 : tensor<26x21x3xi64>
+}
+
+// -----
+
+// CHECK-LABEL: test_pad
+func.func @test_pad(%arg0: tensor<13x21x3xi64>, %arg1: tensor<1xi64>) -> tensor<15x23x5xi64> {
+  %padding = tosa.const_shape {values = dense<1> : tensor<6xindex>} : () -> !tosa.shape<6>
+  // COMMON: tosa.pad %{{.*}}, %{{.*}}, %{{.*}} : (tensor<13x21x3xi32>, !tosa.shape<6>, tensor<1xi32>) -> tensor<15x23x5xi32>
+  %1 = tosa.pad %arg0, %padding, %arg1 : (tensor<13x21x3xi64>, !tosa.shape<6>, tensor<1xi64>) -> tensor<15x23x5xi64>
+  return %1 : tensor<15x23x5xi64>
+}
+
+// -----
+
+// CHECK-LABEL: test_reshape
+func.func @test_reshape(%arg0: tensor<13x21x3xi64>) -> tensor<1x819xi64> {
+  %1 = tosa.const_shape {values = dense<[1, 819]> : tensor<2xindex>} : () -> !tosa.shape<2>
+  // COMMON: tosa.reshape %{{.*}}, %{{.*}} : (tensor<13x21x3xi32>, !tosa.shape<2>) -> tensor<1x819xi32>
+  %0 = tosa.reshape %arg0, %1 : (tensor<13x21x3xi64>, !tosa.shape<2>) -> tensor<1x819xi64>
+  return %0 : tensor<1x819xi64>
+}
+
+// -----
+
+// CHECK-LABEL: test_reverse
+func.func @test_reverse(%arg0: tensor<13x21x3xi64>) -> tensor<13x21x3xi64> {
+  // COMMON: tosa.reverse %{{.*}} {axis = 0 : i32} : (tensor<13x21x3xi32>) -> tensor<13x21x3xi32>
+  %0 = tosa.reverse %arg0 {axis = 0 : i32} : (tensor<13x21x3xi64>) -> tensor<13x21x3xi64>
+  return %0 : tensor<13x21x3xi64>
+}
+
+// -----
+
+// CHECK-LABEL: test_slice
+func.func @test_slice(%arg0: tensor<13x21x3xi64>) -> tensor<4x11x1xi64> {
+  %0 = tosa.const_shape {values = dense<[4, 11, 1]> : tensor<3xindex>} : () -> !tosa.shape<3>
+  %1 = tosa.const_shape {values = dense<[6, 8, 0]> : tensor<3xindex>} : () -> !tosa.shape<3>
+  // COMMON: tosa.slice %{{.*}}, %{{.*}}, %{{.*}} : (tensor<13x21x3xi32>, !tosa.shape<3>, !tosa.shape<3>) -> tensor<4x11x1xi32>
+  %2 = tosa.slice %arg0, %0, %1 : (tensor<13x21x3xi64>, !tosa.shape<3>, !tosa.shape<3>) -> tensor<4x11x1xi64>
+  return %2 : tensor<4x11x1xi64>
+}
+
+// -----
+
+// CHECK-LABEL: test_tile
+func.func @test_tile(%arg0: tensor<13x21x3xi64>) -> tensor<39x21x6xi64> {
+  %cst = tosa.const_shape { values = dense<[3, 1, 2]> : tensor<3xindex> } : () -> !tosa.shape<3>
+  // COMMON: tosa.tile %{{.*}}, %{{.*}} : (tensor<13x21x3xi32>, !tosa.shape<3>) -> tensor<39x21x6xi32>
+  %0 = tosa.tile %arg0, %cst: (tensor<13x21x3xi64>, !tosa.shape<3>) -> tensor<39x21x6xi64>
+  return %0 : tensor<39x21x6xi64>
+}
+
+// -----
+
+// CHECK-LABEL: transpose
+func.func @test_transpose(%arg0: tensor<13x21x3xi64>) -> tensor<3x13x21xi64> {
+  // COMMON: tosa.transpose %{{.*}} {perms = array<i32: 2, 0, 1>} : (tensor<13x21x3xi32>) -> tensor<3x13x21xi32>
+  %1 = tosa.transpose %arg0 {perms = array<i32: 2, 0, 1>} : (tensor<13x21x3xi64>) -> tensor<3x13x21xi64>
+  return %1 : tensor<3x13x21xi64>
+}
+
+// -----
+
+// CHECK-LABEL: test_transition_to_i64
+func.func @test_transition_to_i64(%arg0: tensor<1xi32>) -> tensor<1xi64> {
+  // COMMON: %[[CAST:.*]] = tosa.cast %arg0 : (tensor<1xi32>) -> tensor<1xi32>
+  %0 = tosa.cast %arg0 : (tensor<1xi32>) -> tensor<1xi64>
+  // COMMON: %[[IDENTITY1:.*]] = tosa.identity %[[CAST]] : (tensor<1xi32>) -> tensor<1xi32>
+  %1 = tosa.identity %0 : (tensor<1xi64>) -> tensor<1xi64>
+  // COMMON: %[[IDENTITY2:.*]] = tosa.identity %[[IDENTITY1]] : (tensor<1xi32>) -> tensor<1xi32>
+  %2 = tosa.identity %1 : (tensor<1xi64>) -> tensor<1xi64>
+  // DEFAULT: %[[OUT_CAST:.*]] = tosa.cast %[[IDENTITY2]] : (tensor<1xi32>) -> tensor<1xi64>
+  // DEFAULT: return %[[OUT_CAST]] : tensor<1xi64>
+  // FUNCBOUND: return %[[IDENTITY2]] : tensor<1xi32>
+  return %2 : tensor<1xi64>
+}
+
+// -----
+
+// CHECK-LABEL: test_transition_from_i64
+func.func @test_transition_from_i64(%arg0: tensor<1xi64>) -> tensor<1xi32> {
+  // DEFAULT: %[[CAST:.*]] = tosa.cast %arg0 : (tensor<1xi64>) -> tensor<1xi32>
+  // DEFAULT: %[[IDENTITY1:.*]] = tosa.identity %[[CAST]] : (tensor<1xi32>) -> tensor<1xi32>
+  // FUNCBOUND: %[[IDENTITY1:.*]] = tosa.identity %arg0 : (tensor<1xi32>) -> tensor<1xi32>
+  %0 = tosa.identity %arg0 : (tensor<1xi64>) -> tensor<1xi64>
+  // COMMON: %[[IDENTITY2:.*]] = tosa.identity %[[IDENTITY1]] : (tensor<1xi32>) -> tensor<1xi32>
+  %1 = tosa.identity %0 : (tensor<1xi64>) -> tensor<1xi64>
+  // COMMON: %[[OUT_CAST:.*]] = tosa.cast %[[IDENTITY2]] : (tensor<1xi32>) -> tensor<1xi32>
+  %2 = tosa.cast %1 : (tensor<1xi64>) -> tensor<1xi32>
+  // COMMON: return %[[OUT_CAST]] : tensor<1xi32>
+  return %2 : tensor<1xi32>
+}

From c771159ab54ae9185c651216614715c1d28f1a74 Mon Sep 17 00:00:00 2001
From: Aiden Grossman <aidengrossman@google.com>
Date: Tue, 18 Nov 2025 05:58:22 -0800
Subject: [PATCH 09/33] [RTSan] Fix tests under Internal Shell (#168470)

This patch fixes the only RTSan test that was broken by enabling lit's
internal shell on Darwin. This patch rewrites the test to prefix env
variables with `env` and to avoid the use of subshells.
---
 compiler-rt/test/rtsan/Darwin/dlopen.cpp | 9 +++++----
 1 file changed, 5 insertions(+), 4 deletions(-)

diff --git a/compiler-rt/test/rtsan/Darwin/dlopen.cpp b/compiler-rt/test/rtsan/Darwin/dlopen.cpp
index 1aabe5cb6e580..435a4353b7026 100644
--- a/compiler-rt/test/rtsan/Darwin/dlopen.cpp
+++ b/compiler-rt/test/rtsan/Darwin/dlopen.cpp
@@ -8,18 +8,19 @@
 // RUN: %clangxx -fsanitize=realtime %s -o %t.so -shared -DSHARED_LIB
 // RUN: %clangxx %s -o %t
 
-// RUN: RTSAN_DYLIB_PATH=`%clangxx -fsanitize=realtime %s -### 2>&1 \
+// RUN: %clangxx -fsanitize=realtime %s -### 2>&1 \
 // RUN:   | grep "libclang_rt.rtsan_osx_dynamic.dylib" \
-// RUN:   | sed -e 's/.*"\(.*libclang_rt.rtsan_osx_dynamic.dylib\)".*/\1/'`
+// RUN:   | sed -e 's/.*"\(.*libclang_rt.rtsan_osx_dynamic.dylib\)".*/\1/' \
+// RUN:   | tr -d '\n' > %t.rtsan_dylib_path
 
 // Launching a non-instrumented binary that dlopen's an instrumented library should fail.
 // RUN: not %run %t %t.so 2>&1 | FileCheck %s --check-prefix=CHECK-FAIL
 // Launching a non-instrumented binary with an explicit DYLD_INSERT_LIBRARIES should work.
-// RUN: DYLD_INSERT_LIBRARIES=$RTSAN_DYLIB_PATH %run %t %t.so 2>&1 | FileCheck %s
+// RUN: env DYLD_INSERT_LIBRARIES="%{readfile:%t.rtsan_dylib_path}" %run %t %t.so 2>&1 | FileCheck %s
 
 // Launching an instrumented binary with the DYLD_INSERT_LIBRARIES env variable has no error
 // RUN: %clangxx -fsanitize=realtime %s -o %t
-// RUN: DYLD_INSERT_LIBRARIES=$RTSAN_DYLIB_PATH %run %t %t.so 2>&1 | FileCheck %s --check-prefix=CHECK-INSTRUMENTED
+// RUN: env DYLD_INSERT_LIBRARIES="%{readfile:%t.rtsan_dylib_path}" %run %t %t.so 2>&1 | FileCheck %s --check-prefix=CHECK-INSTRUMENTED
 
 #include <dlfcn.h>
 #include <stdio.h>

From e9f74dff138c9d31bb582efe097f326253368834 Mon Sep 17 00:00:00 2001
From: Alan Li <me@alanli.org>
Date: Tue, 18 Nov 2025 09:12:08 -0500
Subject: [PATCH 10/33] [BAZEL] Fix BAZEL build issue (#168539)

---
 utils/bazel/llvm-project-overlay/clang/BUILD.bazel | 1 +
 1 file changed, 1 insertion(+)

diff --git a/utils/bazel/llvm-project-overlay/clang/BUILD.bazel b/utils/bazel/llvm-project-overlay/clang/BUILD.bazel
index deb56dc0957e9..790709bdef05c 100644
--- a/utils/bazel/llvm-project-overlay/clang/BUILD.bazel
+++ b/utils/bazel/llvm-project-overlay/clang/BUILD.bazel
@@ -1025,6 +1025,7 @@ cc_library(
 gentbl_cc_library(
     name = "sema_attr_gen",
     tbl_outs = {
+        "include/clang/Sema/AttrIsTypeDependent.inc": ["-gen-clang-attr-is-type-dependent"],
         "include/clang/Sema/AttrParsedAttrImpl.inc": ["-gen-clang-attr-parsed-attr-impl"],
         "include/clang/Sema/AttrParsedAttrKinds.inc": ["-gen-clang-attr-parsed-attr-kinds"],
         "include/clang/Sema/AttrSpellingListIndex.inc": ["-gen-clang-attr-spelling-index"],

From 38891bacaef474e10b87356545b10d2d1ed8fb2d Mon Sep 17 00:00:00 2001
From: Michael Liao <michael.hliao@gmail.com>
Date: Tue, 18 Nov 2025 09:17:11 -0500
Subject: [PATCH 11/33] [mlir][tosa] Fix shared build

---
 mlir/lib/Dialect/Tosa/Transforms/CMakeLists.txt | 1 +
 1 file changed, 1 insertion(+)

diff --git a/mlir/lib/Dialect/Tosa/Transforms/CMakeLists.txt b/mlir/lib/Dialect/Tosa/Transforms/CMakeLists.txt
index 987ce4ed870c9..76e9ddd5b2304 100644
--- a/mlir/lib/Dialect/Tosa/Transforms/CMakeLists.txt
+++ b/mlir/lib/Dialect/Tosa/Transforms/CMakeLists.txt
@@ -22,6 +22,7 @@ add_mlir_dialect_library(MLIRTosaTransforms
 
   LINK_LIBS PUBLIC
   MLIRFuncDialect
+  MLIRFuncTransformOps
   MLIRPass
   MLIRTosaDialect
   MLIRTransformUtils

From 65c4a534bd55ed56962fb99c36f464b3f1c9732f Mon Sep 17 00:00:00 2001
From: Robert Imschweiler <robert.imschweiler@amd.com>
Date: Tue, 18 Nov 2025 15:22:49 +0100
Subject: [PATCH 12/33] [OpenMP] Implement omp_get_uid_from_device() /
 omp_get_device_from_uid() (#164392)

Use the implementation in libomptarget. If libomptarget is not
available, always return the UID / device number of the host / the
initial device.
---
 offload/include/OpenMP/omp.h               |  7 ++
 offload/include/omptarget.h                |  2 +
 offload/libomptarget/OpenMP/API.cpp        | 58 ++++++++++++++++
 offload/libomptarget/exports               |  2 +
 offload/test/api/omp_device_uid.c          | 76 +++++++++++++++++++++
 openmp/device/include/DeviceTypes.h        |  3 +
 openmp/device/include/Interface.h          |  4 ++
 openmp/device/src/State.cpp                |  6 ++
 openmp/runtime/src/dllexports              |  2 +
 openmp/runtime/src/include/omp.h.var       |  5 ++
 openmp/runtime/src/include/omp_lib.F90.var | 14 ++++
 openmp/runtime/src/include/omp_lib.h.var   | 19 ++++++
 openmp/runtime/src/kmp_ftn_entry.h         | 29 +++++++-
 openmp/runtime/src/kmp_ftn_os.h            |  8 +++
 openmp/runtime/test/api/omp_device_uid.c   | 77 ++++++++++++++++++++++
 15 files changed, 310 insertions(+), 2 deletions(-)
 create mode 100644 offload/test/api/omp_device_uid.c
 create mode 100644 openmp/runtime/test/api/omp_device_uid.c

diff --git a/offload/include/OpenMP/omp.h b/offload/include/OpenMP/omp.h
index 768ca46a9bed0..d92c7e450c677 100644
--- a/offload/include/OpenMP/omp.h
+++ b/offload/include/OpenMP/omp.h
@@ -30,6 +30,13 @@
 
 extern "C" {
 
+/// Definitions
+///{
+
+#define omp_invalid_device -2
+
+///}
+
 /// Type declarations
 ///{
 
diff --git a/offload/include/omptarget.h b/offload/include/omptarget.h
index fbb4a06accf84..00910704a979a 100644
--- a/offload/include/omptarget.h
+++ b/offload/include/omptarget.h
@@ -270,6 +270,8 @@ extern "C" {
 void ompx_dump_mapping_tables(void);
 int omp_get_num_devices(void);
 int omp_get_device_num(void);
+int omp_get_device_from_uid(const char *DeviceUid);
+const char *omp_get_uid_from_device(int DeviceNum);
 int omp_get_initial_device(void);
 void *omp_target_alloc(size_t Size, int DeviceNum);
 void omp_target_free(void *DevicePtr, int DeviceNum);
diff --git a/offload/libomptarget/OpenMP/API.cpp b/offload/libomptarget/OpenMP/API.cpp
index dd83a3ccd08e6..6e85e5764449c 100644
--- a/offload/libomptarget/OpenMP/API.cpp
+++ b/offload/libomptarget/OpenMP/API.cpp
@@ -40,6 +40,8 @@ EXTERN void ompx_dump_mapping_tables() {
 using namespace llvm::omp::target::ompt;
 #endif
 
+using GenericDeviceTy = llvm::omp::target::plugin::GenericDeviceTy;
+
 void *targetAllocExplicit(size_t Size, int DeviceNum, int Kind,
                           const char *Name);
 void targetFreeExplicit(void *DevicePtr, int DeviceNum, int Kind,
@@ -68,6 +70,62 @@ EXTERN int omp_get_device_num(void) {
   return HostDevice;
 }
 
+static inline bool is_initial_device_uid(const char *DeviceUid) {
+  return strcmp(DeviceUid, GenericPluginTy::getHostDeviceUid()) == 0;
+}
+
+EXTERN int omp_get_device_from_uid(const char *DeviceUid) {
+  TIMESCOPE();
+  OMPT_IF_BUILT(ReturnAddressSetterRAII RA(__builtin_return_address(0)));
+  // A null UID names no device; the host UID maps to the initial device.
+  if (!DeviceUid) {
+    DP("Call to omp_get_device_from_uid returning omp_invalid_device\n");
+    return omp_invalid_device;
+  }
+  if (is_initial_device_uid(DeviceUid)) {
+    DP("Call to omp_get_device_from_uid returning initial device number %d\n",
+       omp_get_initial_device());
+    return omp_get_initial_device();
+  }
+  // Result when no device reports a matching UID.
+  int DeviceNum = omp_invalid_device;
+  // Compare against the UID of every device; a device's UID may be null.
+  auto ExclusiveDevicesAccessor = PM->getExclusiveDevicesAccessor();
+  for (const DeviceTy &Device : PM->devices(ExclusiveDevicesAccessor)) {
+    const char *Uid = Device.RTL->getDevice(Device.RTLDeviceID).getDeviceUid();
+    if (Uid && strcmp(DeviceUid, Uid) == 0) {
+      DeviceNum = Device.DeviceID;
+      break;
+    }
+  }
+
+  DP("Call to omp_get_device_from_uid returning %d\n", DeviceNum);
+  return DeviceNum;
+}
+
+EXTERN const char *omp_get_uid_from_device(int DeviceNum) {
+  TIMESCOPE();
+  OMPT_IF_BUILT(ReturnAddressSetterRAII RA(__builtin_return_address(0)));
+  // omp_invalid_device and the initial device are answered up front.
+  if (DeviceNum == omp_invalid_device) {
+    DP("Call to omp_get_uid_from_device returning nullptr\n");
+    return nullptr;
+  }
+  if (DeviceNum == omp_get_initial_device()) {
+    DP("Call to omp_get_uid_from_device returning initial device UID\n");
+    return GenericPluginTy::getHostDeviceUid();
+  }
+  // Any other device number is looked up through PM; failure is fatal.
+  auto DeviceOrErr = PM->getDevice(DeviceNum);
+  if (!DeviceOrErr)
+    FATAL_MESSAGE(DeviceNum, "%s", toString(DeviceOrErr.takeError()).c_str());
+  // A device may report no UID; never hand a null pointer to "%s".
+  const char *Uid =
+      DeviceOrErr->RTL->getDevice(DeviceOrErr->RTLDeviceID).getDeviceUid();
+  DP("Call to omp_get_uid_from_device returning %s\n", Uid ? Uid : "nullptr");
+  return Uid;
+}
+
 EXTERN int omp_get_initial_device(void) {
   TIMESCOPE();
   OMPT_IF_BUILT(ReturnAddressSetterRAII RA(__builtin_return_address(0)));
diff --git a/offload/libomptarget/exports b/offload/libomptarget/exports
index 910a5b6c827a7..2ebc23e3cf60a 100644
--- a/offload/libomptarget/exports
+++ b/offload/libomptarget/exports
@@ -40,6 +40,8 @@ VERS1.0 {
     omp_get_mapped_ptr;
     omp_get_num_devices;
     omp_get_device_num;
+    omp_get_device_from_uid;
+    omp_get_uid_from_device;
     omp_get_initial_device;
     omp_target_alloc;
     omp_target_free;
diff --git a/offload/test/api/omp_device_uid.c b/offload/test/api/omp_device_uid.c
new file mode 100644
index 0000000000000..2a41d8d04ef8a
--- /dev/null
+++ b/offload/test/api/omp_device_uid.c
@@ -0,0 +1,76 @@
+// RUN: %libomptarget-compile-run-and-check-generic
+
+#include <omp.h>
+#include <stdio.h>
+#include <string.h>
+
+int test_omp_device_uid(int device_num) {
+  const char *device_uid = omp_get_uid_from_device(device_num);
+  if (device_uid == NULL) {
+    printf("FAIL for device %d: omp_get_uid_from_device returned NULL\n",
+           device_num);
+    return 0;
+  }
+
+  int device_num_from_uid = omp_get_device_from_uid(device_uid);
+  if (device_num_from_uid != device_num) {
+    printf(
+        "FAIL for device %d: omp_get_device_from_uid returned %d (UID: %s)\n",
+        device_num, device_num_from_uid, device_uid);
+    return 0;
+  }
+
+  if (device_num == omp_get_initial_device())
+    return 1;
+
+  int success = 1;
+
+// Note that the following code may be executed on the host if the host is the
+// device
+#pragma omp target map(tofrom : success) device(device_num)
+  {
+    int device_num = omp_get_device_num();
+
+    // omp_get_uid_from_device() in the device runtime is a dummy function
+    // returning NULL
+    const char *device_uid = omp_get_uid_from_device(device_num);
+
+    // omp_get_device_from_uid() in the device runtime is a dummy function
+    // returning omp_invalid_device.
+    int device_num_from_uid = omp_get_device_from_uid(device_uid);
+
+    // Depending on whether we're executing on the device or the host, we either
+    // got NULL as the device UID or the correct device UID.  Consequently,
+    // omp_get_device_from_uid() either returned omp_invalid_device or the
+    // correct device number (aka omp_get_initial_device()).
+    if (device_uid ? device_num_from_uid != device_num
+                   : device_num_from_uid != omp_invalid_device) {
+      printf("FAIL for device %d (target): omp_get_device_from_uid returned %d "
+             "(UID: %s)\n",
+             device_num, device_num_from_uid, device_uid);
+      success = 0;
+    }
+  }
+
+  return success;
+}
+
+int main() {
+  int num_devices = omp_get_num_devices();
+  int num_failed = 0;
+  // (also test initial device aka num_devices)
+  for (int i = 0; i < num_devices + 1; i++) {
+    if (!test_omp_device_uid(i)) {
+      printf("FAIL for device %d\n", i);
+      num_failed++;
+    }
+  }
+  if (num_failed) {
+    printf("FAIL\n");
+    return 1;
+  }
+  printf("PASS\n");
+  return 0;
+}
+
+// CHECK: PASS
diff --git a/openmp/device/include/DeviceTypes.h b/openmp/device/include/DeviceTypes.h
index 2e5d92380f040..213ccfe58b4fb 100644
--- a/openmp/device/include/DeviceTypes.h
+++ b/openmp/device/include/DeviceTypes.h
@@ -21,6 +21,9 @@ template <typename T> using Constant = __gpu_constant T;
 template <typename T> using Local = __gpu_local T;
 template <typename T> using Global = __gpu_local T;
 
+// See definition in OpenMP (omp.h.var/omp_lib.(F90|h).var)
+#define omp_invalid_device -2
+
 enum omp_proc_bind_t {
   omp_proc_bind_false = 0,
   omp_proc_bind_true = 1,
diff --git a/openmp/device/include/Interface.h b/openmp/device/include/Interface.h
index c4bfaaa2404b4..71c3b1fc06d40 100644
--- a/openmp/device/include/Interface.h
+++ b/openmp/device/include/Interface.h
@@ -130,6 +130,10 @@ int omp_get_num_devices(void);
 
 int omp_get_device_num(void);
 
+int omp_get_device_from_uid(const char *DeviceUid);
+
+const char *omp_get_uid_from_device(int DeviceNum);
+
 int omp_get_num_teams(void);
 
 int omp_get_team_num();
diff --git a/openmp/device/src/State.cpp b/openmp/device/src/State.cpp
index 9f38cf26f8c6f..985e6b169137f 100644
--- a/openmp/device/src/State.cpp
+++ b/openmp/device/src/State.cpp
@@ -403,6 +403,12 @@ int omp_get_num_devices(void) { return config::getNumDevices(); }
 
 int omp_get_device_num(void) { return config::getDeviceNum(); }
 
+int omp_get_device_from_uid(const char *DeviceUid) {
+  return omp_invalid_device;
+}
+
+const char *omp_get_uid_from_device(int DeviceNum) { return nullptr; }
+
 int omp_get_num_teams(void) { return mapping::getNumberOfBlocksInKernel(); }
 
 int omp_get_team_num() { return mapping::getBlockIdInKernel(); }
diff --git a/openmp/runtime/src/dllexports b/openmp/runtime/src/dllexports
index 3983dae80c9f5..00becd1a657fd 100644
--- a/openmp/runtime/src/dllexports
+++ b/openmp/runtime/src/dllexports
@@ -544,6 +544,8 @@ kmp_set_disp_num_buffers                    890
     omp_get_devices_all_allocator           819
     omp_get_memspace_num_resources          820
     omp_get_submemspace                     821
+    omp_get_device_from_uid                 822
+    omp_get_uid_from_device                 823
     %ifndef stub
         __kmpc_set_default_allocator
         __kmpc_get_default_allocator
diff --git a/openmp/runtime/src/include/omp.h.var b/openmp/runtime/src/include/omp.h.var
index 74f385feb3ea5..e98df731ad888 100644
--- a/openmp/runtime/src/include/omp.h.var
+++ b/openmp/runtime/src/include/omp.h.var
@@ -536,6 +536,11 @@
 
     /* OpenMP 5.2 */
     extern int __KAI_KMPC_CONVENTION omp_in_explicit_task(void);
+    #define omp_invalid_device -2
+
+    /* OpenMP 6.0 */
+    extern int   __KAI_KMPC_CONVENTION  omp_get_device_from_uid(const char *DeviceUid);
+    extern const char *   __KAI_KMPC_CONVENTION  omp_get_uid_from_device(int DeviceNum);
 
     /* LLVM Extensions */
     extern void *llvm_omp_target_dynamic_shared_alloc(void);
diff --git a/openmp/runtime/src/include/omp_lib.F90.var b/openmp/runtime/src/include/omp_lib.F90.var
index 90d7e49ebf549..159b42ab5b5cc 100644
--- a/openmp/runtime/src/include/omp_lib.F90.var
+++ b/openmp/runtime/src/include/omp_lib.F90.var
@@ -215,6 +215,8 @@
 
         integer (kind=omp_interop_kind), parameter, public :: omp_interop_none = 0
 
+        integer (kind=omp_integer_kind), parameter, public :: omp_invalid_device = -2
+
         interface
 
 !         ***
@@ -417,6 +419,18 @@
             integer (kind=omp_integer_kind) omp_get_device_num
           end function omp_get_device_num
 
+          function omp_get_uid_from_device(device_num) bind(c)
+            use omp_lib_kinds
+            integer (kind=omp_integer_kind), value :: device_num
+            character (len=*) omp_get_uid_from_device
+          end function omp_get_uid_from_device
+
+          function omp_get_device_from_uid(device_uid) bind(c)
+            use omp_lib_kinds
+            character (len=*), value :: device_uid
+            integer (kind=omp_integer_kind) omp_get_device_from_uid
+          end function omp_get_device_from_uid
+
           function omp_pause_resource(kind, device_num) bind(c)
             use omp_lib_kinds
             integer (kind=omp_pause_resource_kind), value :: kind
diff --git a/openmp/runtime/src/include/omp_lib.h.var b/openmp/runtime/src/include/omp_lib.h.var
index a50bb018c7cc3..468eb03e99ef1 100644
--- a/openmp/runtime/src/include/omp_lib.h.var
+++ b/openmp/runtime/src/include/omp_lib.h.var
@@ -291,6 +291,9 @@
       integer(kind=omp_interop_kind)omp_interop_none
       parameter(omp_interop_none=0)
 
+      integer(kind=omp_integer_kind)omp_invalid_device
+      parameter(omp_invalid_device=-2)
+
       interface
 
 !       ***
@@ -486,6 +489,18 @@
           integer (kind=omp_integer_kind) omp_get_device_num
         end function omp_get_device_num
 
+        function omp_get_uid_from_device(device_num) bind(c)
+          import
+          integer (kind=omp_integer_kind), value :: device_num
+          character (len=*) omp_get_uid_from_device
+        end function omp_get_uid_from_device
+
+        function omp_get_device_from_uid(device_uid) bind(c)
+          import
+          character (len=*), value :: device_uid
+          integer (kind=omp_integer_kind) omp_get_device_from_uid
+        end function omp_get_device_from_uid
+
         function omp_pause_resource(kind, device_num) bind(c)
           import
           integer (kind=omp_pause_resource_kind), value :: kind
@@ -1159,6 +1174,8 @@
 !DIR$ ATTRIBUTES OFFLOAD:MIC :: omp_get_initial_device
 !DIR$ ATTRIBUTES OFFLOAD:MIC :: omp_get_num_devices
 !DIR$ ATTRIBUTES OFFLOAD:MIC :: omp_get_device_num
+!DIR$ ATTRIBUTES OFFLOAD:MIC :: omp_get_uid_from_device
+!DIR$ ATTRIBUTES OFFLOAD:MIC :: omp_get_device_from_uid
 !DIR$ ATTRIBUTES OFFLOAD:MIC :: omp_pause_resource
 !DIR$ ATTRIBUTES OFFLOAD:MIC :: omp_pause_resource_all
 !DIR$ ATTRIBUTES OFFLOAD:MIC :: omp_get_supported_active_levels
@@ -1242,6 +1259,8 @@
 !$omp declare target(omp_get_initial_device )
 !$omp declare target(omp_get_num_devices )
 !$omp declare target(omp_get_device_num )
+!$omp declare target(omp_get_uid_from_device )
+!$omp declare target(omp_get_device_from_uid )
 !$omp declare target(omp_pause_resource )
 !$omp declare target(omp_pause_resource_all )
 !$omp declare target(omp_get_supported_active_levels )
diff --git a/openmp/runtime/src/kmp_ftn_entry.h b/openmp/runtime/src/kmp_ftn_entry.h
index 2b0063eb23a0a..49c56d2b9a769 100644
--- a/openmp/runtime/src/kmp_ftn_entry.h
+++ b/openmp/runtime/src/kmp_ftn_entry.h
@@ -1543,13 +1543,38 @@ int FTN_STDCALL KMP_EXPAND_NAME(FTN_GET_MAX_TASK_PRIORITY)(void) {
 #endif
 }
 
-// This function will be defined in libomptarget. When libomptarget is not
-// loaded, we assume we are on the host and return KMP_HOST_DEVICE.
+// These functions will be defined in libomptarget. When libomptarget is not
+// loaded, we assume we are on the host.
 // Compiler/libomptarget will handle this if called inside target.
 int FTN_STDCALL FTN_GET_DEVICE_NUM(void) KMP_WEAK_ATTRIBUTE_EXTERNAL;
 int FTN_STDCALL FTN_GET_DEVICE_NUM(void) {
   return KMP_EXPAND_NAME(FTN_GET_INITIAL_DEVICE)();
 }
+const char *FTN_STDCALL FTN_GET_UID_FROM_DEVICE(int device_num)
+    KMP_WEAK_ATTRIBUTE_EXTERNAL;
+const char *FTN_STDCALL FTN_GET_UID_FROM_DEVICE(int device_num) {
+#if KMP_OS_DARWIN || KMP_OS_WASI || defined(KMP_STUB)
+  return nullptr;
+#else
+  const char *(*fptr)(int);
+  if ((*(void **)(&fptr) = KMP_DLSYM_NEXT("omp_get_uid_from_device")))
+    return (*fptr)(device_num);
+  // Returns the same string as used by libomptarget
+  return "HOST";
+#endif
+}
+int FTN_STDCALL FTN_GET_DEVICE_FROM_UID(const char *device_uid)
+    KMP_WEAK_ATTRIBUTE_EXTERNAL;
+int FTN_STDCALL FTN_GET_DEVICE_FROM_UID(const char *device_uid) {
+#if KMP_OS_DARWIN || KMP_OS_WASI || defined(KMP_STUB)
+  return omp_invalid_device;
+#else
+  int (*fptr)(const char *);
+  if ((*(void **)(&fptr) = KMP_DLSYM_NEXT("omp_get_device_from_uid")))
+    return (*fptr)(device_uid);
+  return KMP_EXPAND_NAME(FTN_GET_INITIAL_DEVICE)();
+#endif
+}
 
 // Compiler will ensure that this is only called from host in sequential region
 int FTN_STDCALL KMP_EXPAND_NAME(FTN_PAUSE_RESOURCE)(kmp_pause_status_t kind,
diff --git a/openmp/runtime/src/kmp_ftn_os.h b/openmp/runtime/src/kmp_ftn_os.h
index ae0ed067235e5..c439a058f22b4 100644
--- a/openmp/runtime/src/kmp_ftn_os.h
+++ b/openmp/runtime/src/kmp_ftn_os.h
@@ -140,6 +140,8 @@
 #define FTN_GET_MEMSPACE_NUM_RESOURCES omp_get_memspace_num_resources
 #define FTN_GET_SUBMEMSPACE omp_get_submemspace
 #define FTN_GET_DEVICE_NUM omp_get_device_num
+#define FTN_GET_UID_FROM_DEVICE omp_get_uid_from_device
+#define FTN_GET_DEVICE_FROM_UID omp_get_device_from_uid
 #define FTN_SET_AFFINITY_FORMAT omp_set_affinity_format
 #define FTN_GET_AFFINITY_FORMAT omp_get_affinity_format
 #define FTN_DISPLAY_AFFINITY omp_display_affinity
@@ -289,6 +291,8 @@
 #define FTN_ALLOC omp_alloc_
 #define FTN_FREE omp_free_
 #define FTN_GET_DEVICE_NUM omp_get_device_num_
+#define FTN_GET_UID_FROM_DEVICE omp_get_uid_from_device_
+#define FTN_GET_DEVICE_FROM_UID omp_get_device_from_uid_
 #define FTN_SET_AFFINITY_FORMAT omp_set_affinity_format_
 #define FTN_GET_AFFINITY_FORMAT omp_get_affinity_format_
 #define FTN_DISPLAY_AFFINITY omp_display_affinity_
@@ -436,6 +440,8 @@
 #define FTN_GET_MEMSPACE_NUM_RESOURCES OMP_GET_MEMSPACE_NUM_RESOURCES
 #define FTN_GET_SUBMEMSPACE OMP_GET_SUBMEMSPACE
 #define FTN_GET_DEVICE_NUM OMP_GET_DEVICE_NUM
+#define FTN_GET_UID_FROM_DEVICE OMP_GET_UID_FROM_DEVICE
+#define FTN_GET_DEVICE_FROM_UID OMP_GET_DEVICE_FROM_UID
 #define FTN_SET_AFFINITY_FORMAT OMP_SET_AFFINITY_FORMAT
 #define FTN_GET_AFFINITY_FORMAT OMP_GET_AFFINITY_FORMAT
 #define FTN_DISPLAY_AFFINITY OMP_DISPLAY_AFFINITY
@@ -585,6 +591,8 @@
 #define FTN_ALLOC OMP_ALLOC_
 #define FTN_FREE OMP_FREE_
 #define FTN_GET_DEVICE_NUM OMP_GET_DEVICE_NUM_
+#define FTN_GET_UID_FROM_DEVICE OMP_GET_UID_FROM_DEVICE_
+#define FTN_GET_DEVICE_FROM_UID OMP_GET_DEVICE_FROM_UID_
 #define FTN_SET_AFFINITY_FORMAT OMP_SET_AFFINITY_FORMAT_
 #define FTN_GET_AFFINITY_FORMAT OMP_GET_AFFINITY_FORMAT_
 #define FTN_DISPLAY_AFFINITY OMP_DISPLAY_AFFINITY_
diff --git a/openmp/runtime/test/api/omp_device_uid.c b/openmp/runtime/test/api/omp_device_uid.c
new file mode 100644
index 0000000000000..40a1cbb644c7b
--- /dev/null
+++ b/openmp/runtime/test/api/omp_device_uid.c
@@ -0,0 +1,77 @@
+// RUN: %libomp-compile-and-run 2>&1 | FileCheck %s
+// Linking fails for icc 18
+// UNSUPPORTED: icc-18
+
+#include <omp_testsuite.h>
+#include <string.h>
+
+int test_omp_device_uid(int device_num) {
+  const char *device_uid = omp_get_uid_from_device(device_num);
+  if (device_uid == NULL) {
+    printf("FAIL for device %d: omp_get_uid_from_device returned NULL\n",
+           device_num);
+    return 0;
+  }
+
+  int device_num_from_uid = omp_get_device_from_uid(device_uid);
+  if (device_num_from_uid != device_num) {
+    printf(
+        "FAIL for device %d: omp_get_device_from_uid returned %d (UID: %s)\n",
+        device_num, device_num_from_uid, device_uid);
+    return 0;
+  }
+
+  if (device_num == omp_get_initial_device())
+    return 1;
+
+  int success = 1;
+
+// Note that the following code may be executed on the host if the host is the
+// device
+#pragma omp target map(tofrom : success) device(device_num)
+  {
+    int device_num = omp_get_device_num();
+
+    // omp_get_uid_from_device() in the device runtime is a dummy function
+    // returning NULL
+    const char *device_uid = omp_get_uid_from_device(device_num);
+
+    // omp_get_device_from_uid() in the device runtime is a dummy function
+    // returning omp_invalid_device.
+    int device_num_from_uid = omp_get_device_from_uid(device_uid);
+
+    // On the device we got NULL as the UID and expect omp_invalid_device; on
+    // the host we got the real UID and expect the matching device number (aka
+    // omp_get_initial_device()).  device_uid may therefore be NULL below, so
+    // it must not be handed to "%s" unconditionally.
+    if (device_uid ? device_num_from_uid != device_num
+                   : device_num_from_uid != omp_invalid_device) {
+      printf("FAIL for device %d (target): omp_get_device_from_uid returned %d "
+             "(UID: %s)\n",
+             device_num, device_num_from_uid, device_uid ? device_uid : "NULL");
+      success = 0;
+    }
+  }
+
+  return success;
+}
+
+int main() {
+  int num_devices = omp_get_num_devices();
+  int num_failed = 0;
+  // (also test initial device aka num_devices)
+  for (int i = 0; i < num_devices + 1; i++) {
+    if (!test_omp_device_uid(i)) {
+      printf("FAIL for device %d\n", i);
+      num_failed++;
+    }
+  }
+  if (num_failed) {
+    printf("FAIL\n");
+    return 1;
+  }
+  printf("PASS\n");
+  return 0;
+}
+
+// CHECK: PASS

From 6fc2bc1ccc0d8f08ed794c792ec6ef145ee4ea1f Mon Sep 17 00:00:00 2001
From: Alan Li <me@alanli.org>
Date: Tue, 18 Nov 2025 09:25:30 -0500
Subject: [PATCH 13/33] [BAZEL] Fix OrcDebugging dep (#168540)

---
 utils/bazel/llvm-project-overlay/llvm/BUILD.bazel | 1 +
 1 file changed, 1 insertion(+)

diff --git a/utils/bazel/llvm-project-overlay/llvm/BUILD.bazel b/utils/bazel/llvm-project-overlay/llvm/BUILD.bazel
index 635f77215b38f..ddad2f4f7611d 100644
--- a/utils/bazel/llvm-project-overlay/llvm/BUILD.bazel
+++ b/utils/bazel/llvm-project-overlay/llvm/BUILD.bazel
@@ -4100,6 +4100,7 @@ cc_library(
         ":DebugInfo",
         ":DebugInfoDWARF",
         ":JITLink",
+        ":Object",
         ":OrcJIT",
         ":OrcShared",
         ":Support",

From f2b5d04f2968727270a9d9368c6a4222bbebf12c Mon Sep 17 00:00:00 2001
From: Paul Walker <paul.walker@arm.com>
Date: Tue, 18 Nov 2025 14:33:43 +0000
Subject: [PATCH 14/33] [LLVM][InstSimplify] Add folds for SVE integer
 reduction intrinsics. (#167519)

[andv, eorv, orv, s/uaddv, s/umaxv, s/uminv]
sve_reduce_##(none, ?) -> op's neutral value
sve_reduce_##(any, neutral) -> op's neutral value

[andv, orv, s/umaxv, s/uminv]
sve_reduce_##(all, splat(X)) -> X

[eorv]
sve_reduce_##(all, splat(X)) -> 0
---
 llvm/include/llvm/IR/Constant.h               |   3 +
 llvm/lib/Analysis/InstructionSimplify.cpp     |  68 ++
 llvm/lib/IR/Constants.cpp                     |  17 +
 .../AArch64/aarch64-sve-reductions.ll         | 912 ++++++++++++++++++
 .../InstSimplify/AArch64/lit.local.cfg        |   2 +
 5 files changed, 1002 insertions(+)
 create mode 100644 llvm/test/Transforms/InstSimplify/AArch64/aarch64-sve-reductions.ll
 create mode 100644 llvm/test/Transforms/InstSimplify/AArch64/lit.local.cfg

diff --git a/llvm/include/llvm/IR/Constant.h b/llvm/include/llvm/IR/Constant.h
index 0be1fc172ebd4..e8ce453559ed7 100644
--- a/llvm/include/llvm/IR/Constant.h
+++ b/llvm/include/llvm/IR/Constant.h
@@ -79,6 +79,9 @@ class Constant : public User {
   /// Return true if the value is the smallest signed value.
   LLVM_ABI bool isMinSignedValue() const;
 
+  /// Return true if the value is the largest signed value.
+  LLVM_ABI bool isMaxSignedValue() const;
+
   /// Return true if this is a finite and non-zero floating-point scalar
   /// constant or a fixed width vector constant with all finite and non-zero
   /// elements.
diff --git a/llvm/lib/Analysis/InstructionSimplify.cpp b/llvm/lib/Analysis/InstructionSimplify.cpp
index 6f44713bd22cd..8968f6b934d77 100644
--- a/llvm/lib/Analysis/InstructionSimplify.cpp
+++ b/llvm/lib/Analysis/InstructionSimplify.cpp
@@ -41,6 +41,7 @@
 #include "llvm/IR/Dominators.h"
 #include "llvm/IR/InstrTypes.h"
 #include "llvm/IR/Instructions.h"
+#include "llvm/IR/IntrinsicsAArch64.h"
 #include "llvm/IR/Operator.h"
 #include "llvm/IR/PatternMatch.h"
 #include "llvm/IR/Statepoint.h"
@@ -6676,6 +6677,62 @@ static MinMaxOptResult OptimizeConstMinMax(const Constant *RHSConst,
   return MinMaxOptResult::CannotOptimize;
 }
 
+static Value *simplifySVEIntReduction(Intrinsic::ID IID, Type *ReturnType,
+                                      Value *Op0, Value *Op1) {
+  Constant *C0 = dyn_cast<Constant>(Op0);
+  Constant *C1 = dyn_cast<Constant>(Op1);
+  unsigned Width = ReturnType->getPrimitiveSizeInBits();
+
+  // All false predicate or reduction of neutral values ==> neutral result.
+  switch (IID) {
+  case Intrinsic::aarch64_sve_eorv:
+  case Intrinsic::aarch64_sve_orv:
+  case Intrinsic::aarch64_sve_saddv:
+  case Intrinsic::aarch64_sve_uaddv:
+  case Intrinsic::aarch64_sve_umaxv:
+    if ((C0 && C0->isNullValue()) || (C1 && C1->isNullValue()))
+      return ConstantInt::get(ReturnType, 0);
+    break;
+  case Intrinsic::aarch64_sve_andv:
+  case Intrinsic::aarch64_sve_uminv:
+    if ((C0 && C0->isNullValue()) || (C1 && C1->isAllOnesValue()))
+      return ConstantInt::get(ReturnType, APInt::getMaxValue(Width));
+    break;
+  case Intrinsic::aarch64_sve_smaxv:
+    if ((C0 && C0->isNullValue()) || (C1 && C1->isMinSignedValue()))
+      return ConstantInt::get(ReturnType, APInt::getSignedMinValue(Width));
+    break;
+  case Intrinsic::aarch64_sve_sminv:
+    if ((C0 && C0->isNullValue()) || (C1 && C1->isMaxSignedValue()))
+      return ConstantInt::get(ReturnType, APInt::getSignedMaxValue(Width));
+    break;
+  }
+
+  switch (IID) {
+  case Intrinsic::aarch64_sve_andv:
+  case Intrinsic::aarch64_sve_orv:
+  case Intrinsic::aarch64_sve_smaxv:
+  case Intrinsic::aarch64_sve_sminv:
+  case Intrinsic::aarch64_sve_umaxv:
+  case Intrinsic::aarch64_sve_uminv:
+    // sve_reduce_##(all, splat(X)) ==> X
+    if (C0 && C0->isAllOnesValue()) {
+      if (Value *SplatVal = getSplatValue(Op1)) {
+        assert(SplatVal->getType() == ReturnType && "Unexpected result type!");
+        return SplatVal;
+      }
+    }
+    break;
+  case Intrinsic::aarch64_sve_eorv:
+    // sve_reduce_xor(all, splat(X)) ==> 0, relying on an even lane count.
+    if (C0 && C0->isAllOnesValue() && getSplatValue(Op1))
+      return ConstantInt::get(ReturnType, 0);
+    break;
+  }
+
+  return nullptr;
+}
+
 Value *llvm::simplifyBinaryIntrinsic(Intrinsic::ID IID, Type *ReturnType,
                                      Value *Op0, Value *Op1,
                                      const SimplifyQuery &Q,
@@ -7037,6 +7094,17 @@ Value *llvm::simplifyBinaryIntrinsic(Intrinsic::ID IID, Type *ReturnType,
 
     break;
   }
+
+  case Intrinsic::aarch64_sve_andv:
+  case Intrinsic::aarch64_sve_eorv:
+  case Intrinsic::aarch64_sve_orv:
+  case Intrinsic::aarch64_sve_saddv:
+  case Intrinsic::aarch64_sve_smaxv:
+  case Intrinsic::aarch64_sve_sminv:
+  case Intrinsic::aarch64_sve_uaddv:
+  case Intrinsic::aarch64_sve_umaxv:
+  case Intrinsic::aarch64_sve_uminv:
+    return simplifySVEIntReduction(IID, ReturnType, Op0, Op1);
   default:
     break;
   }
diff --git a/llvm/lib/IR/Constants.cpp b/llvm/lib/IR/Constants.cpp
index cbce8bd736102..a3aa5e9571657 100644
--- a/llvm/lib/IR/Constants.cpp
+++ b/llvm/lib/IR/Constants.cpp
@@ -183,6 +183,23 @@ bool Constant::isMinSignedValue() const {
   return false;
 }
 
+bool Constant::isMaxSignedValue() const {
+  // Check for INT_MAX integers
+  if (const ConstantInt *CI = dyn_cast<ConstantInt>(this))
+    return CI->isMaxValue(/*isSigned=*/true);
+
+  // Check for FP which are bitcasted from INT_MAX integers
+  if (const ConstantFP *CFP = dyn_cast<ConstantFP>(this))
+    return CFP->getValueAPF().bitcastToAPInt().isMaxSignedValue();
+
+  // Check for splats of INT_MAX values.
+  if (getType()->isVectorTy())
+    if (const auto *SplatVal = getSplatValue())
+      return SplatVal->isMaxSignedValue();
+
+  return false;
+}
+
 bool Constant::isNotMinSignedValue() const {
   // Check for INT_MIN integers
   if (const ConstantInt *CI = dyn_cast<ConstantInt>(this))
diff --git a/llvm/test/Transforms/InstSimplify/AArch64/aarch64-sve-reductions.ll b/llvm/test/Transforms/InstSimplify/AArch64/aarch64-sve-reductions.ll
new file mode 100644
index 0000000000000..a54d6044d04b1
--- /dev/null
+++ b/llvm/test/Transforms/InstSimplify/AArch64/aarch64-sve-reductions.ll
@@ -0,0 +1,912 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 6
+; RUN: opt -S -passes=instsimplify < %s | FileCheck %s
+
+target triple = "aarch64-unknown-linux-gnu"
+
+;
+; ANDV
+;
+
+define i8 @andv_i8_no_active(<vscale x 16 x i8> %a) #0 {
+; CHECK-LABEL: define i8 @andv_i8_no_active(
+; CHECK-SAME: <vscale x 16 x i8> [[A:%.*]]) #[[ATTR0:[0-9]+]] {
+; CHECK-NEXT:    ret i8 -1
+;
+  %out = call i8 @llvm.aarch64.sve.andv.nxv16i8(<vscale x 16 x i1> zeroinitializer, <vscale x 16 x i8> %a)
+  ret i8 %out
+}
+
+define i8 @andv_i8_splat_neutral_val(<vscale x 16 x i1> %pg) #0 {
+; CHECK-LABEL: define i8 @andv_i8_splat_neutral_val(
+; CHECK-SAME: <vscale x 16 x i1> [[PG:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:    ret i8 -1
+;
+  %out = call i8 @llvm.aarch64.sve.andv.nxv16i8(<vscale x 16 x i1> %pg, <vscale x 16 x i8> splat(i8 -1))
+  ret i8 %out
+}
+
+define i8 @andv_i8_splat_non_neutral_val(<vscale x 16 x i1> %pg) #0 {
+; CHECK-LABEL: define i8 @andv_i8_splat_non_neutral_val(
+; CHECK-SAME: <vscale x 16 x i1> [[PG:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:    [[OUT:%.*]] = call i8 @llvm.aarch64.sve.andv.nxv16i8(<vscale x 16 x i1> [[PG]], <vscale x 16 x i8> zeroinitializer)
+; CHECK-NEXT:    ret i8 [[OUT]]
+;
+  %out = call i8 @llvm.aarch64.sve.andv.nxv16i8(<vscale x 16 x i1> %pg, <vscale x 16 x i8> zeroinitializer)
+  ret i8 %out
+}
+
+define i8 @andv_i8_all_active_splat(i8 %a) #0 {
+; CHECK-LABEL: define i8 @andv_i8_all_active_splat(
+; CHECK-SAME: i8 [[A:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:    ret i8 [[A]]
+;
+  %a.insert = insertelement <vscale x 16 x i8> poison, i8 %a, i8 0
+  %a.splat = shufflevector <vscale x 16 x i8> %a.insert, <vscale x 16 x i8> poison, <vscale x 16 x i32> zeroinitializer
+  %out = call i8 @llvm.aarch64.sve.andv.nxv16i8(<vscale x 16 x i1> splat (i1 true), <vscale x 16 x i8> %a.splat)
+  ret i8 %out
+}
+
+define i16 @andv_i16_splat_neutral_val(<vscale x 8 x i1> %pg) #0 {
+; CHECK-LABEL: define i16 @andv_i16_splat_neutral_val(
+; CHECK-SAME: <vscale x 8 x i1> [[PG:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:    ret i16 -1
+;
+  %out = call i16 @llvm.aarch64.sve.andv.nxv8i16(<vscale x 8 x i1> %pg, <vscale x 8 x i16> splat(i16 -1))
+  ret i16 %out
+}
+
+define i16 @andv_i16_splat_non_neutral_val(<vscale x 8 x i1> %pg) #0 {
+; CHECK-LABEL: define i16 @andv_i16_splat_non_neutral_val(
+; CHECK-SAME: <vscale x 8 x i1> [[PG:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:    [[OUT:%.*]] = call i16 @llvm.aarch64.sve.andv.nxv8i16(<vscale x 8 x i1> [[PG]], <vscale x 8 x i16> zeroinitializer)
+; CHECK-NEXT:    ret i16 [[OUT]]
+;
+  %out = call i16 @llvm.aarch64.sve.andv.nxv8i16(<vscale x 8 x i1> %pg, <vscale x 8 x i16> zeroinitializer)
+  ret i16 %out
+}
+
+define i32 @andv_i32_splat_neutral_val(<vscale x 4 x i1> %pg) #0 {
+; CHECK-LABEL: define i32 @andv_i32_splat_neutral_val(
+; CHECK-SAME: <vscale x 4 x i1> [[PG:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:    ret i32 -1
+;
+  %out = call i32 @llvm.aarch64.sve.andv.nxv4i32(<vscale x 4 x i1> %pg, <vscale x 4 x i32> splat(i32 -1))
+  ret i32 %out
+}
+
+define i32 @andv_i32_splat_non_neutral_val(<vscale x 4 x i1> %pg) #0 {
+; CHECK-LABEL: define i32 @andv_i32_splat_non_neutral_val(
+; CHECK-SAME: <vscale x 4 x i1> [[PG:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:    [[OUT:%.*]] = call i32 @llvm.aarch64.sve.andv.nxv4i32(<vscale x 4 x i1> [[PG]], <vscale x 4 x i32> zeroinitializer)
+; CHECK-NEXT:    ret i32 [[OUT]]
+;
+  %out = call i32 @llvm.aarch64.sve.andv.nxv4i32(<vscale x 4 x i1> %pg, <vscale x 4 x i32> zeroinitializer)
+  ret i32 %out
+}
+
+define i64 @andv_i64_splat_neutral_val(<vscale x 2 x i1> %pg) #0 {
+; CHECK-LABEL: define i64 @andv_i64_splat_neutral_val(
+; CHECK-SAME: <vscale x 2 x i1> [[PG:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:    ret i64 -1
+;
+  %out = call i64 @llvm.aarch64.sve.andv.nxv2i64(<vscale x 2 x i1> %pg, <vscale x 2 x i64> splat(i64 -1))
+  ret i64 %out
+}
+
+define i64 @andv_i64_splat_non_neutral_val(<vscale x 2 x i1> %pg) #0 {
+; CHECK-LABEL: define i64 @andv_i64_splat_non_neutral_val(
+; CHECK-SAME: <vscale x 2 x i1> [[PG:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:    [[OUT:%.*]] = call i64 @llvm.aarch64.sve.andv.nxv2i64(<vscale x 2 x i1> [[PG]], <vscale x 2 x i64> zeroinitializer)
+; CHECK-NEXT:    ret i64 [[OUT]]
+;
+  %out = call i64 @llvm.aarch64.sve.andv.nxv2i64(<vscale x 2 x i1> %pg, <vscale x 2 x i64> zeroinitializer)
+  ret i64 %out
+}
+
+;
+; EORV
+;
+
+define i8 @eorv_i8_no_active(<vscale x 16 x i8> %a) #0 {
+; CHECK-LABEL: define i8 @eorv_i8_no_active(
+; CHECK-SAME: <vscale x 16 x i8> [[A:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:    ret i8 0
+;
+  %out = call i8 @llvm.aarch64.sve.eorv.nxv16i8(<vscale x 16 x i1> zeroinitializer, <vscale x 16 x i8> %a)
+  ret i8 %out
+}
+
+define i8 @eorv_i8_splat_neutral_val(<vscale x 16 x i1> %pg) #0 {
+; CHECK-LABEL: define i8 @eorv_i8_splat_neutral_val(
+; CHECK-SAME: <vscale x 16 x i1> [[PG:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:    ret i8 0
+;
+  %out = call i8 @llvm.aarch64.sve.eorv.nxv16i8(<vscale x 16 x i1> %pg, <vscale x 16 x i8> zeroinitializer)
+  ret i8 %out
+}
+
+define i8 @eorv_i8_splat_non_neutral_val(<vscale x 16 x i1> %pg) #0 {
+; CHECK-LABEL: define i8 @eorv_i8_splat_non_neutral_val(
+; CHECK-SAME: <vscale x 16 x i1> [[PG:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:    [[OUT:%.*]] = call i8 @llvm.aarch64.sve.eorv.nxv16i8(<vscale x 16 x i1> [[PG]], <vscale x 16 x i8> splat (i8 1))
+; CHECK-NEXT:    ret i8 [[OUT]]
+;
+  %out = call i8 @llvm.aarch64.sve.eorv.nxv16i8(<vscale x 16 x i1> %pg, <vscale x 16 x i8> splat(i8 1))
+  ret i8 %out
+}
+
+define i8 @eorv_i8_all_active_splat(i8 %a) #0 {
+; CHECK-LABEL: define i8 @eorv_i8_all_active_splat(
+; CHECK-SAME: i8 [[A:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:    ret i8 0
+;
+  %a.insert = insertelement <vscale x 16 x i8> poison, i8 %a, i8 0
+  %a.splat = shufflevector <vscale x 16 x i8> %a.insert, <vscale x 16 x i8> poison, <vscale x 16 x i32> zeroinitializer
+  %out = call i8 @llvm.aarch64.sve.eorv.nxv16i8(<vscale x 16 x i1> splat (i1 true), <vscale x 16 x i8> %a.splat)
+  ret i8 %out
+}
+
+define i16 @eorv_i16_splat_neutral_val(<vscale x 8 x i1> %pg) #0 {
+; CHECK-LABEL: define i16 @eorv_i16_splat_neutral_val(
+; CHECK-SAME: <vscale x 8 x i1> [[PG:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:    ret i16 0
+;
+  %out = call i16 @llvm.aarch64.sve.eorv.nxv8i16(<vscale x 8 x i1> %pg, <vscale x 8 x i16> zeroinitializer)
+  ret i16 %out
+}
+
+define i16 @eorv_i16_splat_non_neutral_val(<vscale x 8 x i1> %pg) #0 {
+; CHECK-LABEL: define i16 @eorv_i16_splat_non_neutral_val(
+; CHECK-SAME: <vscale x 8 x i1> [[PG:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:    [[OUT:%.*]] = call i16 @llvm.aarch64.sve.eorv.nxv8i16(<vscale x 8 x i1> [[PG]], <vscale x 8 x i16> splat (i16 1))
+; CHECK-NEXT:    ret i16 [[OUT]]
+;
+  %out = call i16 @llvm.aarch64.sve.eorv.nxv8i16(<vscale x 8 x i1> %pg, <vscale x 8 x i16> splat(i16 1))
+  ret i16 %out
+}
+
+define i32 @eorv_i32_splat_neutral_val(<vscale x 4 x i1> %pg) #0 {
+; CHECK-LABEL: define i32 @eorv_i32_splat_neutral_val(
+; CHECK-SAME: <vscale x 4 x i1> [[PG:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:    ret i32 0
+;
+  %out = call i32 @llvm.aarch64.sve.eorv.nxv4i32(<vscale x 4 x i1> %pg, <vscale x 4 x i32> zeroinitializer)
+  ret i32 %out
+}
+
+define i32 @eorv_i32_splat_non_neutral_val(<vscale x 4 x i1> %pg) #0 {
+; CHECK-LABEL: define i32 @eorv_i32_splat_non_neutral_val(
+; CHECK-SAME: <vscale x 4 x i1> [[PG:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:    [[OUT:%.*]] = call i32 @llvm.aarch64.sve.eorv.nxv4i32(<vscale x 4 x i1> [[PG]], <vscale x 4 x i32> splat (i32 1))
+; CHECK-NEXT:    ret i32 [[OUT]]
+;
+  %out = call i32 @llvm.aarch64.sve.eorv.nxv4i32(<vscale x 4 x i1> %pg, <vscale x 4 x i32> splat(i32 1))
+  ret i32 %out
+}
+
+define i64 @eorv_i64_splat_neutral_val(<vscale x 2 x i1> %pg) #0 {
+; CHECK-LABEL: define i64 @eorv_i64_splat_neutral_val(
+; CHECK-SAME: <vscale x 2 x i1> [[PG:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:    ret i64 0
+;
+  %out = call i64 @llvm.aarch64.sve.eorv.nxv2i64(<vscale x 2 x i1> %pg, <vscale x 2 x i64> zeroinitializer)
+  ret i64 %out
+}
+
+define i64 @eorv_i64_splat_non_neutral_val(<vscale x 2 x i1> %pg) #0 {
+; CHECK-LABEL: define i64 @eorv_i64_splat_non_neutral_val(
+; CHECK-SAME: <vscale x 2 x i1> [[PG:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:    [[OUT:%.*]] = call i64 @llvm.aarch64.sve.eorv.nxv2i64(<vscale x 2 x i1> [[PG]], <vscale x 2 x i64> splat (i64 1))
+; CHECK-NEXT:    ret i64 [[OUT]]
+;
+  %out = call i64 @llvm.aarch64.sve.eorv.nxv2i64(<vscale x 2 x i1> %pg, <vscale x 2 x i64> splat(i64 1))
+  ret i64 %out
+}
+
+;
+; ORV
+;
+
+define i8 @orv_i8_no_active(<vscale x 16 x i8> %a) #0 {
+; CHECK-LABEL: define i8 @orv_i8_no_active(
+; CHECK-SAME: <vscale x 16 x i8> [[A:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:    ret i8 0
+;
+  %out = call i8 @llvm.aarch64.sve.orv.nxv16i8(<vscale x 16 x i1> zeroinitializer, <vscale x 16 x i8> %a)
+  ret i8 %out
+}
+
+define i8 @orv_i8_splat_neutral_val(<vscale x 16 x i1> %pg) #0 {
+; CHECK-LABEL: define i8 @orv_i8_splat_neutral_val(
+; CHECK-SAME: <vscale x 16 x i1> [[PG:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:    ret i8 0
+;
+  %out = call i8 @llvm.aarch64.sve.orv.nxv16i8(<vscale x 16 x i1> %pg, <vscale x 16 x i8> zeroinitializer)
+  ret i8 %out
+}
+
+define i8 @orv_i8_splat_non_neutral_val(<vscale x 16 x i1> %pg) #0 {
+; CHECK-LABEL: define i8 @orv_i8_splat_non_neutral_val(
+; CHECK-SAME: <vscale x 16 x i1> [[PG:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:    [[OUT:%.*]] = call i8 @llvm.aarch64.sve.orv.nxv16i8(<vscale x 16 x i1> [[PG]], <vscale x 16 x i8> splat (i8 1))
+; CHECK-NEXT:    ret i8 [[OUT]]
+;
+  %out = call i8 @llvm.aarch64.sve.orv.nxv16i8(<vscale x 16 x i1> %pg, <vscale x 16 x i8> splat(i8 1))
+  ret i8 %out
+}
+
+define i8 @orv_i8_all_active_splat(i8 %a) #0 {
+; CHECK-LABEL: define i8 @orv_i8_all_active_splat(
+; CHECK-SAME: i8 [[A:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:    ret i8 [[A]]
+;
+  %a.insert = insertelement <vscale x 16 x i8> poison, i8 %a, i8 0
+  %a.splat = shufflevector <vscale x 16 x i8> %a.insert, <vscale x 16 x i8> poison, <vscale x 16 x i32> zeroinitializer
+  %out = call i8 @llvm.aarch64.sve.orv.nxv16i8(<vscale x 16 x i1> splat (i1 true), <vscale x 16 x i8> %a.splat)
+  ret i8 %out
+}
+
+define i16 @orv_i16_splat_neutral_val(<vscale x 8 x i1> %pg) #0 {
+; CHECK-LABEL: define i16 @orv_i16_splat_neutral_val(
+; CHECK-SAME: <vscale x 8 x i1> [[PG:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:    ret i16 0
+;
+  %out = call i16 @llvm.aarch64.sve.orv.nxv8i16(<vscale x 8 x i1> %pg, <vscale x 8 x i16> zeroinitializer)
+  ret i16 %out
+}
+
+define i16 @orv_i16_splat_non_neutral_val(<vscale x 8 x i1> %pg) #0 {
+; CHECK-LABEL: define i16 @orv_i16_splat_non_neutral_val(
+; CHECK-SAME: <vscale x 8 x i1> [[PG:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:    [[OUT:%.*]] = call i16 @llvm.aarch64.sve.orv.nxv8i16(<vscale x 8 x i1> [[PG]], <vscale x 8 x i16> splat (i16 1))
+; CHECK-NEXT:    ret i16 [[OUT]]
+;
+  %out = call i16 @llvm.aarch64.sve.orv.nxv8i16(<vscale x 8 x i1> %pg, <vscale x 8 x i16> splat(i16 1))
+  ret i16 %out
+}
+
+define i32 @orv_i32_splat_neutral_val(<vscale x 4 x i1> %pg) #0 {
+; CHECK-LABEL: define i32 @orv_i32_splat_neutral_val(
+; CHECK-SAME: <vscale x 4 x i1> [[PG:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:    ret i32 0
+;
+  %out = call i32 @llvm.aarch64.sve.orv.nxv4i32(<vscale x 4 x i1> %pg, <vscale x 4 x i32> zeroinitializer)
+  ret i32 %out
+}
+
+define i32 @orv_i32_splat_non_neutral_val(<vscale x 4 x i1> %pg) #0 {
+; CHECK-LABEL: define i32 @orv_i32_splat_non_neutral_val(
+; CHECK-SAME: <vscale x 4 x i1> [[PG:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:    [[OUT:%.*]] = call i32 @llvm.aarch64.sve.orv.nxv4i32(<vscale x 4 x i1> [[PG]], <vscale x 4 x i32> splat (i32 1))
+; CHECK-NEXT:    ret i32 [[OUT]]
+;
+  %out = call i32 @llvm.aarch64.sve.orv.nxv4i32(<vscale x 4 x i1> %pg, <vscale x 4 x i32> splat(i32 1))
+  ret i32 %out
+}
+
+define i64 @orv_i64_splat_neutral_val(<vscale x 2 x i1> %pg) #0 {
+; CHECK-LABEL: define i64 @orv_i64_splat_neutral_val(
+; CHECK-SAME: <vscale x 2 x i1> [[PG:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:    ret i64 0
+;
+  %out = call i64 @llvm.aarch64.sve.orv.nxv2i64(<vscale x 2 x i1> %pg, <vscale x 2 x i64> zeroinitializer)
+  ret i64 %out
+}
+
+define i64 @orv_i64_splat_non_neutral_val(<vscale x 2 x i1> %pg) #0 {
+; CHECK-LABEL: define i64 @orv_i64_splat_non_neutral_val(
+; CHECK-SAME: <vscale x 2 x i1> [[PG:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:    [[OUT:%.*]] = call i64 @llvm.aarch64.sve.orv.nxv2i64(<vscale x 2 x i1> [[PG]], <vscale x 2 x i64> splat (i64 1))
+; CHECK-NEXT:    ret i64 [[OUT]]
+;
+  %out = call i64 @llvm.aarch64.sve.orv.nxv2i64(<vscale x 2 x i1> %pg, <vscale x 2 x i64> splat(i64 1))
+  ret i64 %out
+}
+
+;
+; SADDV
+;
+
+define i64 @saddv_i8_no_active(<vscale x 16 x i8> %a) #0 {
+; CHECK-LABEL: define i64 @saddv_i8_no_active(
+; CHECK-SAME: <vscale x 16 x i8> [[A:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:    ret i64 0
+;
+  %out = call i64 @llvm.aarch64.sve.saddv.nxv16i8(<vscale x 16 x i1> zeroinitializer, <vscale x 16 x i8> %a)
+  ret i64 %out
+}
+
+define i64 @saddv_i8_splat_neutral_val(<vscale x 16 x i1> %pg) #0 {
+; CHECK-LABEL: define i64 @saddv_i8_splat_neutral_val(
+; CHECK-SAME: <vscale x 16 x i1> [[PG:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:    ret i64 0
+;
+  %out = call i64 @llvm.aarch64.sve.saddv.nxv16i8(<vscale x 16 x i1> %pg, <vscale x 16 x i8> zeroinitializer)
+  ret i64 %out
+}
+
+define i64 @saddv_i8_splat_non_neutral_val(<vscale x 16 x i1> %pg) #0 {
+; CHECK-LABEL: define i64 @saddv_i8_splat_non_neutral_val(
+; CHECK-SAME: <vscale x 16 x i1> [[PG:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:    [[OUT:%.*]] = call i64 @llvm.aarch64.sve.saddv.nxv16i8(<vscale x 16 x i1> [[PG]], <vscale x 16 x i8> splat (i8 1))
+; CHECK-NEXT:    ret i64 [[OUT]]
+;
+  %out = call i64 @llvm.aarch64.sve.saddv.nxv16i8(<vscale x 16 x i1> %pg, <vscale x 16 x i8> splat(i8 1))
+  ret i64 %out
+}
+
+define i64 @saddv_i8_all_active_splat(i8 %a) #0 {
+; CHECK-LABEL: define i64 @saddv_i8_all_active_splat(
+; CHECK-SAME: i8 [[A:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:    [[A_INSERT:%.*]] = insertelement <vscale x 16 x i8> poison, i8 [[A]], i8 0
+; CHECK-NEXT:    [[A_SPLAT:%.*]] = shufflevector <vscale x 16 x i8> [[A_INSERT]], <vscale x 16 x i8> poison, <vscale x 16 x i32> zeroinitializer
+; CHECK-NEXT:    [[OUT:%.*]] = call i64 @llvm.aarch64.sve.saddv.nxv16i8(<vscale x 16 x i1> splat (i1 true), <vscale x 16 x i8> [[A_SPLAT]])
+; CHECK-NEXT:    ret i64 [[OUT]]
+;
+  %a.insert = insertelement <vscale x 16 x i8> poison, i8 %a, i8 0
+  %a.splat = shufflevector <vscale x 16 x i8> %a.insert, <vscale x 16 x i8> poison, <vscale x 16 x i32> zeroinitializer
+  %out = call i64 @llvm.aarch64.sve.saddv.nxv16i8(<vscale x 16 x i1> splat (i1 true), <vscale x 16 x i8> %a.splat)
+  ret i64 %out
+}
+
+define i64 @saddv_i16_splat_neutral_val(<vscale x 8 x i1> %pg) #0 {
+; CHECK-LABEL: define i64 @saddv_i16_splat_neutral_val(
+; CHECK-SAME: <vscale x 8 x i1> [[PG:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:    ret i64 0
+;
+  %out = call i64 @llvm.aarch64.sve.saddv.nxv8i16(<vscale x 8 x i1> %pg, <vscale x 8 x i16> zeroinitializer)
+  ret i64 %out
+}
+
+define i64 @saddv_i16_splat_non_neutral_val(<vscale x 8 x i1> %pg) #0 {
+; CHECK-LABEL: define i64 @saddv_i16_splat_non_neutral_val(
+; CHECK-SAME: <vscale x 8 x i1> [[PG:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:    [[OUT:%.*]] = call i64 @llvm.aarch64.sve.saddv.nxv8i16(<vscale x 8 x i1> [[PG]], <vscale x 8 x i16> splat (i16 1))
+; CHECK-NEXT:    ret i64 [[OUT]]
+;
+  %out = call i64 @llvm.aarch64.sve.saddv.nxv8i16(<vscale x 8 x i1> %pg, <vscale x 8 x i16> splat(i16 1))
+  ret i64 %out
+}
+
+define i64 @saddv_i32_splat_neutral_val(<vscale x 4 x i1> %pg) #0 {
+; CHECK-LABEL: define i64 @saddv_i32_splat_neutral_val(
+; CHECK-SAME: <vscale x 4 x i1> [[PG:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:    ret i64 0
+;
+  %out = call i64 @llvm.aarch64.sve.saddv.nxv4i32(<vscale x 4 x i1> %pg, <vscale x 4 x i32> zeroinitializer)
+  ret i64 %out
+}
+
+define i64 @saddv_i32_splat_non_neutral_val(<vscale x 4 x i1> %pg) #0 {
+; CHECK-LABEL: define i64 @saddv_i32_splat_non_neutral_val(
+; CHECK-SAME: <vscale x 4 x i1> [[PG:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:    [[OUT:%.*]] = call i64 @llvm.aarch64.sve.saddv.nxv4i32(<vscale x 4 x i1> [[PG]], <vscale x 4 x i32> splat (i32 1))
+; CHECK-NEXT:    ret i64 [[OUT]]
+;
+  %out = call i64 @llvm.aarch64.sve.saddv.nxv4i32(<vscale x 4 x i1> %pg, <vscale x 4 x i32> splat(i32 1))
+  ret i64 %out
+}
+
+define i64 @saddv_i64_splat_neutral_val(<vscale x 2 x i1> %pg) #0 {
+; CHECK-LABEL: define i64 @saddv_i64_splat_neutral_val(
+; CHECK-SAME: <vscale x 2 x i1> [[PG:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:    ret i64 0
+;
+  %out = call i64 @llvm.aarch64.sve.saddv.nxv2i64(<vscale x 2 x i1> %pg, <vscale x 2 x i64> zeroinitializer)
+  ret i64 %out
+}
+
+define i64 @saddv_i64_splat_non_neutral_val(<vscale x 2 x i1> %pg) #0 {
+; CHECK-LABEL: define i64 @saddv_i64_splat_non_neutral_val(
+; CHECK-SAME: <vscale x 2 x i1> [[PG:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:    [[OUT:%.*]] = call i64 @llvm.aarch64.sve.saddv.nxv2i64(<vscale x 2 x i1> [[PG]], <vscale x 2 x i64> splat (i64 1))
+; CHECK-NEXT:    ret i64 [[OUT]]
+;
+  %out = call i64 @llvm.aarch64.sve.saddv.nxv2i64(<vscale x 2 x i1> %pg, <vscale x 2 x i64> splat(i64 1))
+  ret i64 %out
+}
+
+;
+; SMAXV
+;
+
+define i8 @smaxv_i8_no_active(<vscale x 16 x i8> %a) #0 {
+; CHECK-LABEL: define i8 @smaxv_i8_no_active(
+; CHECK-SAME: <vscale x 16 x i8> [[A:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:    ret i8 -128
+;
+  %out = call i8 @llvm.aarch64.sve.smaxv.nxv16i8(<vscale x 16 x i1> zeroinitializer, <vscale x 16 x i8> %a)
+  ret i8 %out
+}
+
+define i8 @smaxv_i8_splat_neutral_val(<vscale x 16 x i1> %pg) #0 {
+; CHECK-LABEL: define i8 @smaxv_i8_splat_neutral_val(
+; CHECK-SAME: <vscale x 16 x i1> [[PG:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:    ret i8 -128
+;
+  %out = call i8 @llvm.aarch64.sve.smaxv.nxv16i8(<vscale x 16 x i1> %pg, <vscale x 16 x i8> splat(i8 -128))
+  ret i8 %out
+}
+
+define i8 @smaxv_i8_splat_non_neutral_val(<vscale x 16 x i1> %pg) #0 {
+; CHECK-LABEL: define i8 @smaxv_i8_splat_non_neutral_val(
+; CHECK-SAME: <vscale x 16 x i1> [[PG:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:    [[OUT:%.*]] = call i8 @llvm.aarch64.sve.smaxv.nxv16i8(<vscale x 16 x i1> [[PG]], <vscale x 16 x i8> zeroinitializer)
+; CHECK-NEXT:    ret i8 [[OUT]]
+;
+  %out = call i8 @llvm.aarch64.sve.smaxv.nxv16i8(<vscale x 16 x i1> %pg, <vscale x 16 x i8> zeroinitializer)
+  ret i8 %out
+}
+
+define i8 @smaxv_i8_all_active_splat(i8 %a) #0 {
+; CHECK-LABEL: define i8 @smaxv_i8_all_active_splat(
+; CHECK-SAME: i8 [[A:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:    ret i8 [[A]]
+;
+  %a.insert = insertelement <vscale x 16 x i8> poison, i8 %a, i8 0
+  %a.splat = shufflevector <vscale x 16 x i8> %a.insert, <vscale x 16 x i8> poison, <vscale x 16 x i32> zeroinitializer
+  %out = call i8 @llvm.aarch64.sve.smaxv.nxv16i8(<vscale x 16 x i1> splat (i1 true), <vscale x 16 x i8> %a.splat)
+  ret i8 %out
+}
+
+define i16 @smaxv_i16_splat_neutral_val(<vscale x 8 x i1> %pg) #0 {
+; CHECK-LABEL: define i16 @smaxv_i16_splat_neutral_val(
+; CHECK-SAME: <vscale x 8 x i1> [[PG:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:    ret i16 -32768
+;
+  %out = call i16 @llvm.aarch64.sve.smaxv.nxv8i16(<vscale x 8 x i1> %pg, <vscale x 8 x i16> splat(i16 -32768))
+  ret i16 %out
+}
+
+define i16 @smaxv_i16_splat_non_neutral_val(<vscale x 8 x i1> %pg) #0 {
+; CHECK-LABEL: define i16 @smaxv_i16_splat_non_neutral_val(
+; CHECK-SAME: <vscale x 8 x i1> [[PG:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:    [[OUT:%.*]] = call i16 @llvm.aarch64.sve.smaxv.nxv8i16(<vscale x 8 x i1> [[PG]], <vscale x 8 x i16> zeroinitializer)
+; CHECK-NEXT:    ret i16 [[OUT]]
+;
+  %out = call i16 @llvm.aarch64.sve.smaxv.nxv8i16(<vscale x 8 x i1> %pg, <vscale x 8 x i16> zeroinitializer)
+  ret i16 %out
+}
+
+define i32 @smaxv_i32_splat_neutral_val(<vscale x 4 x i1> %pg) #0 {
+; CHECK-LABEL: define i32 @smaxv_i32_splat_neutral_val(
+; CHECK-SAME: <vscale x 4 x i1> [[PG:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:    ret i32 -2147483648
+;
+  %out = call i32 @llvm.aarch64.sve.smaxv.nxv4i32(<vscale x 4 x i1> %pg, <vscale x 4 x i32> splat(i32 -2147483648))
+  ret i32 %out
+}
+
+define i32 @smaxv_i32_splat_non_neutral_val(<vscale x 4 x i1> %pg) #0 {
+; CHECK-LABEL: define i32 @smaxv_i32_splat_non_neutral_val(
+; CHECK-SAME: <vscale x 4 x i1> [[PG:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:    [[OUT:%.*]] = call i32 @llvm.aarch64.sve.smaxv.nxv4i32(<vscale x 4 x i1> [[PG]], <vscale x 4 x i32> zeroinitializer)
+; CHECK-NEXT:    ret i32 [[OUT]]
+;
+  %out = call i32 @llvm.aarch64.sve.smaxv.nxv4i32(<vscale x 4 x i1> %pg, <vscale x 4 x i32> zeroinitializer)
+  ret i32 %out
+}
+
+define i64 @smaxv_i64_splat_neutral_val(<vscale x 2 x i1> %pg) #0 {
+; CHECK-LABEL: define i64 @smaxv_i64_splat_neutral_val(
+; CHECK-SAME: <vscale x 2 x i1> [[PG:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:    ret i64 -9223372036854775808
+;
+  %out = call i64 @llvm.aarch64.sve.smaxv.nxv2i64(<vscale x 2 x i1> %pg, <vscale x 2 x i64> splat(i64 -9223372036854775808))
+  ret i64 %out
+}
+
+define i64 @smaxv_i64_splat_non_neutral_val(<vscale x 2 x i1> %pg) #0 {
+; CHECK-LABEL: define i64 @smaxv_i64_splat_non_neutral_val(
+; CHECK-SAME: <vscale x 2 x i1> [[PG:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:    [[OUT:%.*]] = call i64 @llvm.aarch64.sve.smaxv.nxv2i64(<vscale x 2 x i1> [[PG]], <vscale x 2 x i64> zeroinitializer)
+; CHECK-NEXT:    ret i64 [[OUT]]
+;
+  %out = call i64 @llvm.aarch64.sve.smaxv.nxv2i64(<vscale x 2 x i1> %pg, <vscale x 2 x i64> zeroinitializer)
+  ret i64 %out
+}
+
+;
+; SMINV
+;
+
+define i8 @sminv_i8_no_active(<vscale x 16 x i8> %a) #0 {
+; CHECK-LABEL: define i8 @sminv_i8_no_active(
+; CHECK-SAME: <vscale x 16 x i8> [[A:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:    ret i8 127
+;
+  %out = call i8 @llvm.aarch64.sve.sminv.nxv16i8(<vscale x 16 x i1> zeroinitializer, <vscale x 16 x i8> %a)
+  ret i8 %out
+}
+
+define i8 @sminv_i8_splat_neutral_val(<vscale x 16 x i1> %pg) #0 {
+; CHECK-LABEL: define i8 @sminv_i8_splat_neutral_val(
+; CHECK-SAME: <vscale x 16 x i1> [[PG:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:    ret i8 127
+;
+  %out = call i8 @llvm.aarch64.sve.sminv.nxv16i8(<vscale x 16 x i1> %pg, <vscale x 16 x i8> splat(i8 127))
+  ret i8 %out
+}
+
+define i8 @sminv_i8_splat_non_neutral_val(<vscale x 16 x i1> %pg) #0 {
+; CHECK-LABEL: define i8 @sminv_i8_splat_non_neutral_val(
+; CHECK-SAME: <vscale x 16 x i1> [[PG:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:    [[OUT:%.*]] = call i8 @llvm.aarch64.sve.sminv.nxv16i8(<vscale x 16 x i1> [[PG]], <vscale x 16 x i8> zeroinitializer)
+; CHECK-NEXT:    ret i8 [[OUT]]
+;
+  %out = call i8 @llvm.aarch64.sve.sminv.nxv16i8(<vscale x 16 x i1> %pg, <vscale x 16 x i8> zeroinitializer)
+  ret i8 %out
+}
+
+define i8 @sminv_i8_all_active_splat(i8 %a) #0 {
+; CHECK-LABEL: define i8 @sminv_i8_all_active_splat(
+; CHECK-SAME: i8 [[A:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:    ret i8 [[A]]
+;
+  %a.insert = insertelement <vscale x 16 x i8> poison, i8 %a, i8 0
+  %a.splat = shufflevector <vscale x 16 x i8> %a.insert, <vscale x 16 x i8> poison, <vscale x 16 x i32> zeroinitializer
+  %out = call i8 @llvm.aarch64.sve.sminv.nxv16i8(<vscale x 16 x i1> splat (i1 true), <vscale x 16 x i8> %a.splat)
+  ret i8 %out
+}
+
+define i16 @sminv_i16_splat_neutral_val(<vscale x 8 x i1> %pg) #0 {
+; CHECK-LABEL: define i16 @sminv_i16_splat_neutral_val(
+; CHECK-SAME: <vscale x 8 x i1> [[PG:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:    ret i16 32767
+;
+  %out = call i16 @llvm.aarch64.sve.sminv.nxv8i16(<vscale x 8 x i1> %pg, <vscale x 8 x i16> splat(i16 32767))
+  ret i16 %out
+}
+
+define i16 @sminv_i16_splat_non_neutral_val(<vscale x 8 x i1> %pg) #0 {
+; CHECK-LABEL: define i16 @sminv_i16_splat_non_neutral_val(
+; CHECK-SAME: <vscale x 8 x i1> [[PG:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:    [[OUT:%.*]] = call i16 @llvm.aarch64.sve.sminv.nxv8i16(<vscale x 8 x i1> [[PG]], <vscale x 8 x i16> zeroinitializer)
+; CHECK-NEXT:    ret i16 [[OUT]]
+;
+  %out = call i16 @llvm.aarch64.sve.sminv.nxv8i16(<vscale x 8 x i1> %pg, <vscale x 8 x i16> zeroinitializer)
+  ret i16 %out
+}
+
+define i32 @sminv_i32_splat_neutral_val(<vscale x 4 x i1> %pg) #0 {
+; CHECK-LABEL: define i32 @sminv_i32_splat_neutral_val(
+; CHECK-SAME: <vscale x 4 x i1> [[PG:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:    ret i32 2147483647
+;
+  %out = call i32 @llvm.aarch64.sve.sminv.nxv4i32(<vscale x 4 x i1> %pg, <vscale x 4 x i32> splat(i32 2147483647))
+  ret i32 %out
+}
+
+define i32 @sminv_i32_splat_non_neutral_val(<vscale x 4 x i1> %pg) #0 {
+; CHECK-LABEL: define i32 @sminv_i32_splat_non_neutral_val(
+; CHECK-SAME: <vscale x 4 x i1> [[PG:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:    [[OUT:%.*]] = call i32 @llvm.aarch64.sve.sminv.nxv4i32(<vscale x 4 x i1> [[PG]], <vscale x 4 x i32> zeroinitializer)
+; CHECK-NEXT:    ret i32 [[OUT]]
+;
+  %out = call i32 @llvm.aarch64.sve.sminv.nxv4i32(<vscale x 4 x i1> %pg, <vscale x 4 x i32> zeroinitializer)
+  ret i32 %out
+}
+
+define i64 @sminv_i64_splat_neutral_val(<vscale x 2 x i1> %pg) #0 {
+; CHECK-LABEL: define i64 @sminv_i64_splat_neutral_val(
+; CHECK-SAME: <vscale x 2 x i1> [[PG:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:    ret i64 9223372036854775807
+;
+  %out = call i64 @llvm.aarch64.sve.sminv.nxv2i64(<vscale x 2 x i1> %pg, <vscale x 2 x i64> splat(i64 9223372036854775807))
+  ret i64 %out
+}
+
+define i64 @sminv_i64_splat_non_neutral_val(<vscale x 2 x i1> %pg) #0 {
+; CHECK-LABEL: define i64 @sminv_i64_splat_non_neutral_val(
+; CHECK-SAME: <vscale x 2 x i1> [[PG:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:    [[OUT:%.*]] = call i64 @llvm.aarch64.sve.sminv.nxv2i64(<vscale x 2 x i1> [[PG]], <vscale x 2 x i64> zeroinitializer)
+; CHECK-NEXT:    ret i64 [[OUT]]
+;
+  %out = call i64 @llvm.aarch64.sve.sminv.nxv2i64(<vscale x 2 x i1> %pg, <vscale x 2 x i64> zeroinitializer)
+  ret i64 %out
+}
+
+;
+; UADDV
+;
+
+define i64 @uaddv_i8_no_active(<vscale x 16 x i8> %a) #0 {
+; CHECK-LABEL: define i64 @uaddv_i8_no_active(
+; CHECK-SAME: <vscale x 16 x i8> [[A:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:    ret i64 0
+;
+  %out = call i64 @llvm.aarch64.sve.uaddv.nxv16i8(<vscale x 16 x i1> zeroinitializer, <vscale x 16 x i8> %a)
+  ret i64 %out
+}
+
+define i64 @uaddv_i8_splat_neutral_val(<vscale x 16 x i1> %pg) #0 {
+; CHECK-LABEL: define i64 @uaddv_i8_splat_neutral_val(
+; CHECK-SAME: <vscale x 16 x i1> [[PG:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:    ret i64 0
+;
+  %out = call i64 @llvm.aarch64.sve.uaddv.nxv16i8(<vscale x 16 x i1> %pg, <vscale x 16 x i8> zeroinitializer)
+  ret i64 %out
+}
+
+define i64 @uaddv_i8_splat_non_neutral_val(<vscale x 16 x i1> %pg) #0 {
+; CHECK-LABEL: define i64 @uaddv_i8_splat_non_neutral_val(
+; CHECK-SAME: <vscale x 16 x i1> [[PG:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:    [[OUT:%.*]] = call i64 @llvm.aarch64.sve.uaddv.nxv16i8(<vscale x 16 x i1> [[PG]], <vscale x 16 x i8> splat (i8 1))
+; CHECK-NEXT:    ret i64 [[OUT]]
+;
+  %out = call i64 @llvm.aarch64.sve.uaddv.nxv16i8(<vscale x 16 x i1> %pg, <vscale x 16 x i8> splat(i8 1))
+  ret i64 %out
+}
+
+define i64 @uaddv_i8_all_active_splat(i8 %a) #0 {
+; CHECK-LABEL: define i64 @uaddv_i8_all_active_splat(
+; CHECK-SAME: i8 [[A:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:    [[A_INSERT:%.*]] = insertelement <vscale x 16 x i8> poison, i8 [[A]], i8 0
+; CHECK-NEXT:    [[A_SPLAT:%.*]] = shufflevector <vscale x 16 x i8> [[A_INSERT]], <vscale x 16 x i8> poison, <vscale x 16 x i32> zeroinitializer
+; CHECK-NEXT:    [[OUT:%.*]] = call i64 @llvm.aarch64.sve.uaddv.nxv16i8(<vscale x 16 x i1> splat (i1 true), <vscale x 16 x i8> [[A_SPLAT]])
+; CHECK-NEXT:    ret i64 [[OUT]]
+;
+  %a.insert = insertelement <vscale x 16 x i8> poison, i8 %a, i8 0
+  %a.splat = shufflevector <vscale x 16 x i8> %a.insert, <vscale x 16 x i8> poison, <vscale x 16 x i32> zeroinitializer
+  %out = call i64 @llvm.aarch64.sve.uaddv.nxv16i8(<vscale x 16 x i1> splat (i1 true), <vscale x 16 x i8> %a.splat)
+  ret i64 %out
+}
+
+define i64 @uaddv_i16_splat_neutral_val(<vscale x 8 x i1> %pg) #0 {
+; CHECK-LABEL: define i64 @uaddv_i16_splat_neutral_val(
+; CHECK-SAME: <vscale x 8 x i1> [[PG:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:    ret i64 0
+;
+  %out = call i64 @llvm.aarch64.sve.uaddv.nxv8i16(<vscale x 8 x i1> %pg, <vscale x 8 x i16> zeroinitializer)
+  ret i64 %out
+}
+
+define i64 @uaddv_i16_splat_non_neutral_val(<vscale x 8 x i1> %pg) #0 {
+; CHECK-LABEL: define i64 @uaddv_i16_splat_non_neutral_val(
+; CHECK-SAME: <vscale x 8 x i1> [[PG:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:    [[OUT:%.*]] = call i64 @llvm.aarch64.sve.uaddv.nxv8i16(<vscale x 8 x i1> [[PG]], <vscale x 8 x i16> splat (i16 1))
+; CHECK-NEXT:    ret i64 [[OUT]]
+;
+  %out = call i64 @llvm.aarch64.sve.uaddv.nxv8i16(<vscale x 8 x i1> %pg, <vscale x 8 x i16> splat(i16 1))
+  ret i64 %out
+}
+
+define i64 @uaddv_i32_splat_neutral_val(<vscale x 4 x i1> %pg) #0 {
+; CHECK-LABEL: define i64 @uaddv_i32_splat_neutral_val(
+; CHECK-SAME: <vscale x 4 x i1> [[PG:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:    ret i64 0
+;
+  %out = call i64 @llvm.aarch64.sve.uaddv.nxv4i32(<vscale x 4 x i1> %pg, <vscale x 4 x i32> zeroinitializer)
+  ret i64 %out
+}
+
+define i64 @uaddv_i32_splat_non_neutral_val(<vscale x 4 x i1> %pg) #0 {
+; CHECK-LABEL: define i64 @uaddv_i32_splat_non_neutral_val(
+; CHECK-SAME: <vscale x 4 x i1> [[PG:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:    [[OUT:%.*]] = call i64 @llvm.aarch64.sve.uaddv.nxv4i32(<vscale x 4 x i1> [[PG]], <vscale x 4 x i32> splat (i32 1))
+; CHECK-NEXT:    ret i64 [[OUT]]
+;
+  %out = call i64 @llvm.aarch64.sve.uaddv.nxv4i32(<vscale x 4 x i1> %pg, <vscale x 4 x i32> splat(i32 1))
+  ret i64 %out
+}
+
+define i64 @uaddv_i64_splat_neutral_val(<vscale x 2 x i1> %pg) #0 {
+; CHECK-LABEL: define i64 @uaddv_i64_splat_neutral_val(
+; CHECK-SAME: <vscale x 2 x i1> [[PG:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:    ret i64 0
+;
+  %out = call i64 @llvm.aarch64.sve.uaddv.nxv2i64(<vscale x 2 x i1> %pg, <vscale x 2 x i64> zeroinitializer)
+  ret i64 %out
+}
+
+define i64 @uaddv_i64_splat_non_neutral_val(<vscale x 2 x i1> %pg) #0 {
+; CHECK-LABEL: define i64 @uaddv_i64_splat_non_neutral_val(
+; CHECK-SAME: <vscale x 2 x i1> [[PG:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:    [[OUT:%.*]] = call i64 @llvm.aarch64.sve.uaddv.nxv2i64(<vscale x 2 x i1> [[PG]], <vscale x 2 x i64> splat (i64 1))
+; CHECK-NEXT:    ret i64 [[OUT]]
+;
+  %out = call i64 @llvm.aarch64.sve.uaddv.nxv2i64(<vscale x 2 x i1> %pg, <vscale x 2 x i64> splat(i64 1))
+  ret i64 %out
+}
+
+;
+; UMAXV
+;
+
+define i8 @umaxv_i8_no_active(<vscale x 16 x i8> %a) #0 {
+; CHECK-LABEL: define i8 @umaxv_i8_no_active(
+; CHECK-SAME: <vscale x 16 x i8> [[A:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:    ret i8 0
+;
+  %out = call i8 @llvm.aarch64.sve.umaxv.nxv16i8(<vscale x 16 x i1> zeroinitializer, <vscale x 16 x i8> %a)
+  ret i8 %out
+}
+
+define i8 @umaxv_i8_splat_neutral_val(<vscale x 16 x i1> %pg) #0 {
+; CHECK-LABEL: define i8 @umaxv_i8_splat_neutral_val(
+; CHECK-SAME: <vscale x 16 x i1> [[PG:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:    ret i8 0
+;
+  %out = call i8 @llvm.aarch64.sve.umaxv.nxv16i8(<vscale x 16 x i1> %pg, <vscale x 16 x i8> zeroinitializer)
+  ret i8 %out
+}
+
+define i8 @umaxv_i8_splat_non_neutral_val(<vscale x 16 x i1> %pg) #0 {
+; CHECK-LABEL: define i8 @umaxv_i8_splat_non_neutral_val(
+; CHECK-SAME: <vscale x 16 x i1> [[PG:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:    [[OUT:%.*]] = call i8 @llvm.aarch64.sve.umaxv.nxv16i8(<vscale x 16 x i1> [[PG]], <vscale x 16 x i8> splat (i8 1))
+; CHECK-NEXT:    ret i8 [[OUT]]
+;
+  %out = call i8 @llvm.aarch64.sve.umaxv.nxv16i8(<vscale x 16 x i1> %pg, <vscale x 16 x i8> splat(i8 1))
+  ret i8 %out
+}
+
+define i8 @umaxv_i8_all_active_splat(i8 %a) #0 {
+; CHECK-LABEL: define i8 @umaxv_i8_all_active_splat(
+; CHECK-SAME: i8 [[A:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:    ret i8 [[A]]
+;
+  %a.insert = insertelement <vscale x 16 x i8> poison, i8 %a, i8 0
+  %a.splat = shufflevector <vscale x 16 x i8> %a.insert, <vscale x 16 x i8> poison, <vscale x 16 x i32> zeroinitializer
+  %out = call i8 @llvm.aarch64.sve.umaxv.nxv16i8(<vscale x 16 x i1> splat (i1 true), <vscale x 16 x i8> %a.splat)
+  ret i8 %out
+}
+
+define i16 @umaxv_i16_splat_neutral_val(<vscale x 8 x i1> %pg) #0 {
+; CHECK-LABEL: define i16 @umaxv_i16_splat_neutral_val(
+; CHECK-SAME: <vscale x 8 x i1> [[PG:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:    ret i16 0
+;
+  %out = call i16 @llvm.aarch64.sve.umaxv.nxv8i16(<vscale x 8 x i1> %pg, <vscale x 8 x i16> zeroinitializer)
+  ret i16 %out
+}
+
+define i16 @umaxv_i16_splat_non_neutral_val(<vscale x 8 x i1> %pg) #0 {
+; CHECK-LABEL: define i16 @umaxv_i16_splat_non_neutral_val(
+; CHECK-SAME: <vscale x 8 x i1> [[PG:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:    [[OUT:%.*]] = call i16 @llvm.aarch64.sve.umaxv.nxv8i16(<vscale x 8 x i1> [[PG]], <vscale x 8 x i16> splat (i16 1))
+; CHECK-NEXT:    ret i16 [[OUT]]
+;
+  %out = call i16 @llvm.aarch64.sve.umaxv.nxv8i16(<vscale x 8 x i1> %pg, <vscale x 8 x i16> splat(i16 1))
+  ret i16 %out
+}
+
+define i32 @umaxv_i32_splat_neutral_val(<vscale x 4 x i1> %pg) #0 {
+; CHECK-LABEL: define i32 @umaxv_i32_splat_neutral_val(
+; CHECK-SAME: <vscale x 4 x i1> [[PG:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:    ret i32 0
+;
+  %out = call i32 @llvm.aarch64.sve.umaxv.nxv4i32(<vscale x 4 x i1> %pg, <vscale x 4 x i32> zeroinitializer)
+  ret i32 %out
+}
+
+define i32 @umaxv_i32_splat_non_neutral_val(<vscale x 4 x i1> %pg) #0 {
+; CHECK-LABEL: define i32 @umaxv_i32_splat_non_neutral_val(
+; CHECK-SAME: <vscale x 4 x i1> [[PG:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:    [[OUT:%.*]] = call i32 @llvm.aarch64.sve.umaxv.nxv4i32(<vscale x 4 x i1> [[PG]], <vscale x 4 x i32> splat (i32 1))
+; CHECK-NEXT:    ret i32 [[OUT]]
+;
+  %out = call i32 @llvm.aarch64.sve.umaxv.nxv4i32(<vscale x 4 x i1> %pg, <vscale x 4 x i32> splat(i32 1))
+  ret i32 %out
+}
+
+define i64 @umaxv_i64_splat_neutral_val(<vscale x 2 x i1> %pg) #0 {
+; CHECK-LABEL: define i64 @umaxv_i64_splat_neutral_val(
+; CHECK-SAME: <vscale x 2 x i1> [[PG:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:    ret i64 0
+;
+  %out = call i64 @llvm.aarch64.sve.umaxv.nxv2i64(<vscale x 2 x i1> %pg, <vscale x 2 x i64> zeroinitializer)
+  ret i64 %out
+}
+
+define i64 @umaxv_i64_splat_non_neutral_val(<vscale x 2 x i1> %pg) #0 {
+; CHECK-LABEL: define i64 @umaxv_i64_splat_non_neutral_val(
+; CHECK-SAME: <vscale x 2 x i1> [[PG:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:    [[OUT:%.*]] = call i64 @llvm.aarch64.sve.umaxv.nxv2i64(<vscale x 2 x i1> [[PG]], <vscale x 2 x i64> splat (i64 1))
+; CHECK-NEXT:    ret i64 [[OUT]]
+;
+  %out = call i64 @llvm.aarch64.sve.umaxv.nxv2i64(<vscale x 2 x i1> %pg, <vscale x 2 x i64> splat(i64 1))
+  ret i64 %out
+}
+
+;
+; UMINV
+;
+
+define i8 @uminv_i8_no_active(<vscale x 16 x i8> %a) #0 {
+; CHECK-LABEL: define i8 @uminv_i8_no_active(
+; CHECK-SAME: <vscale x 16 x i8> [[A:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:    ret i8 -1
+;
+  %out = call i8 @llvm.aarch64.sve.uminv.nxv16i8(<vscale x 16 x i1> zeroinitializer, <vscale x 16 x i8> %a)
+  ret i8 %out
+}
+
+define i8 @uminv_i8_splat_neutral_val(<vscale x 16 x i1> %pg) #0 {
+; CHECK-LABEL: define i8 @uminv_i8_splat_neutral_val(
+; CHECK-SAME: <vscale x 16 x i1> [[PG:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:    ret i8 -1
+;
+  %out = call i8 @llvm.aarch64.sve.uminv.nxv16i8(<vscale x 16 x i1> %pg, <vscale x 16 x i8> splat(i8 -1))
+  ret i8 %out
+}
+
+define i8 @uminv_i8_splat_non_neutral_val(<vscale x 16 x i1> %pg) #0 {
+; CHECK-LABEL: define i8 @uminv_i8_splat_non_neutral_val(
+; CHECK-SAME: <vscale x 16 x i1> [[PG:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:    [[OUT:%.*]] = call i8 @llvm.aarch64.sve.uminv.nxv16i8(<vscale x 16 x i1> [[PG]], <vscale x 16 x i8> zeroinitializer)
+; CHECK-NEXT:    ret i8 [[OUT]]
+;
+  %out = call i8 @llvm.aarch64.sve.uminv.nxv16i8(<vscale x 16 x i1> %pg, <vscale x 16 x i8> zeroinitializer)
+  ret i8 %out
+}
+
+define i8 @uminv_i8_all_active_splat(i8 %a) #0 {
+; CHECK-LABEL: define i8 @uminv_i8_all_active_splat(
+; CHECK-SAME: i8 [[A:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:    ret i8 [[A]]
+;
+  %a.insert = insertelement <vscale x 16 x i8> poison, i8 %a, i8 0
+  %a.splat = shufflevector <vscale x 16 x i8> %a.insert, <vscale x 16 x i8> poison, <vscale x 16 x i32> zeroinitializer
+  %out = call i8 @llvm.aarch64.sve.uminv.nxv16i8(<vscale x 16 x i1> splat (i1 true), <vscale x 16 x i8> %a.splat)
+  ret i8 %out
+}
+
+define i16 @uminv_i16_splat_neutral_val(<vscale x 8 x i1> %pg) #0 {
+; CHECK-LABEL: define i16 @uminv_i16_splat_neutral_val(
+; CHECK-SAME: <vscale x 8 x i1> [[PG:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:    ret i16 -1
+;
+  %out = call i16 @llvm.aarch64.sve.uminv.nxv8i16(<vscale x 8 x i1> %pg, <vscale x 8 x i16> splat(i16 -1))
+  ret i16 %out
+}
+
+define i16 @uminv_i16_splat_non_neutral_val(<vscale x 8 x i1> %pg) #0 {
+; CHECK-LABEL: define i16 @uminv_i16_splat_non_neutral_val(
+; CHECK-SAME: <vscale x 8 x i1> [[PG:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:    [[OUT:%.*]] = call i16 @llvm.aarch64.sve.uminv.nxv8i16(<vscale x 8 x i1> [[PG]], <vscale x 8 x i16> zeroinitializer)
+; CHECK-NEXT:    ret i16 [[OUT]]
+;
+  %out = call i16 @llvm.aarch64.sve.uminv.nxv8i16(<vscale x 8 x i1> %pg, <vscale x 8 x i16> zeroinitializer)
+  ret i16 %out
+}
+
+define i32 @uminv_i32_splat_neutral_val(<vscale x 4 x i1> %pg) #0 {
+; CHECK-LABEL: define i32 @uminv_i32_splat_neutral_val(
+; CHECK-SAME: <vscale x 4 x i1> [[PG:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:    ret i32 -1
+;
+  %out = call i32 @llvm.aarch64.sve.uminv.nxv4i32(<vscale x 4 x i1> %pg, <vscale x 4 x i32> splat(i32 -1))
+  ret i32 %out
+}
+
+define i32 @uminv_i32_splat_non_neutral_val(<vscale x 4 x i1> %pg) #0 {
+; CHECK-LABEL: define i32 @uminv_i32_splat_non_neutral_val(
+; CHECK-SAME: <vscale x 4 x i1> [[PG:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:    [[OUT:%.*]] = call i32 @llvm.aarch64.sve.uminv.nxv4i32(<vscale x 4 x i1> [[PG]], <vscale x 4 x i32> zeroinitializer)
+; CHECK-NEXT:    ret i32 [[OUT]]
+;
+  %out = call i32 @llvm.aarch64.sve.uminv.nxv4i32(<vscale x 4 x i1> %pg, <vscale x 4 x i32> zeroinitializer)
+  ret i32 %out
+}
+
+define i64 @uminv_i64_splat_neutral_val(<vscale x 2 x i1> %pg) #0 {
+; CHECK-LABEL: define i64 @uminv_i64_splat_neutral_val(
+; CHECK-SAME: <vscale x 2 x i1> [[PG:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:    ret i64 -1
+;
+  %out = call i64 @llvm.aarch64.sve.uminv.nxv2i64(<vscale x 2 x i1> %pg, <vscale x 2 x i64> splat(i64 -1))
+  ret i64 %out
+}
+
+define i64 @uminv_i64_splat_non_neutral_val(<vscale x 2 x i1> %pg) #0 {
+; CHECK-LABEL: define i64 @uminv_i64_splat_non_neutral_val(
+; CHECK-SAME: <vscale x 2 x i1> [[PG:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:    [[OUT:%.*]] = call i64 @llvm.aarch64.sve.uminv.nxv2i64(<vscale x 2 x i1> [[PG]], <vscale x 2 x i64> zeroinitializer)
+; CHECK-NEXT:    ret i64 [[OUT]]
+;
+  %out = call i64 @llvm.aarch64.sve.uminv.nxv2i64(<vscale x 2 x i1> %pg, <vscale x 2 x i64> zeroinitializer)
+  ret i64 %out
+}
+
+attributes #0 = { "target-features"="+sve" }
diff --git a/llvm/test/Transforms/InstSimplify/AArch64/lit.local.cfg b/llvm/test/Transforms/InstSimplify/AArch64/lit.local.cfg
new file mode 100644
index 0000000000000..10d4a0e953ed4
--- /dev/null
+++ b/llvm/test/Transforms/InstSimplify/AArch64/lit.local.cfg
@@ -0,0 +1,2 @@
+if not "AArch64" in config.root.targets:
+    config.unsupported = True

From 75792d60778b34f20cd350d717baeb4ec6fadbcf Mon Sep 17 00:00:00 2001
From: Nikolas Klauser <nikolasklauser@berlin.de>
Date: Tue, 18 Nov 2025 16:05:53 +0100
Subject: [PATCH 15/33] [libc++] Fix header deprecations (#163356)

Currently, no diagnostic is shown when a deprecated header is included,
because the deprecation diagnostic is generated inside a system header and is
therefore suppressed. This patch fixes that by using `#warning` instead, which
also simplifies the implementation of the deprecation warnings.
---
 libcxx/include/__config                            |  6 ++++++
 libcxx/include/ccomplex                            | 14 +++-----------
 libcxx/include/ciso646                             |  9 +++------
 libcxx/include/cstdalign                           | 13 +++----------
 libcxx/include/cstdbool                            | 13 +++----------
 libcxx/include/ctgmath                             | 13 ++-----------
 libcxx/test/libcxx/transitive_includes.gen.py      |  2 +-
 .../std/depr/depr.cpp.headers/ccomplex.verify.cpp  |  8 +-------
 .../std/depr/depr.cpp.headers/ciso646.verify.cpp   |  3 ++-
 .../std/depr/depr.cpp.headers/cstdalign.verify.cpp |  8 +-------
 .../std/depr/depr.cpp.headers/cstdbool.verify.cpp  |  8 +-------
 .../std/depr/depr.cpp.headers/ctgmath.verify.cpp   |  8 +-------
 .../tuple.apply/make_from_tuple.verify.cpp         |  7 +------
 libcxx/utils/libcxx/test/format.py                 |  2 +-
 14 files changed, 29 insertions(+), 85 deletions(-)

diff --git a/libcxx/include/__config b/libcxx/include/__config
index 8f461599ffd5b..d79ace0cbb896 100644
--- a/libcxx/include/__config
+++ b/libcxx/include/__config
@@ -546,6 +546,12 @@ typedef __char32_t char32_t;
 #    define _LIBCPP_DEPRECATED_(m)
 #  endif
 
+#  if defined(__DEPRECATED) && __DEPRECATED && !defined(_LIBCPP_DISABLE_DEPRECATION_WARNINGS)
+#    define _LIBCPP_DIAGNOSE_DEPRECATED_HEADERS 1
+#  else
+#    define _LIBCPP_DIAGNOSE_DEPRECATED_HEADERS 0
+#  endif
+
 #  if !defined(_LIBCPP_CXX03_LANG)
 #    define _LIBCPP_DEPRECATED_IN_CXX11 _LIBCPP_DEPRECATED
 #  else
diff --git a/libcxx/include/ccomplex b/libcxx/include/ccomplex
index ee7e088aac54d..c1cb039f83a5e 100644
--- a/libcxx/include/ccomplex
+++ b/libcxx/include/ccomplex
@@ -26,18 +26,10 @@
 #    pragma GCC system_header
 #  endif
 
-#  if _LIBCPP_STD_VER >= 20
-
-using __standard_header_ccomplex
-    _LIBCPP_DEPRECATED_("removed in C++20. Include <complex> instead.") _LIBCPP_NODEBUG = void;
-using __use_standard_header_ccomplex _LIBCPP_NODEBUG                                    = __standard_header_ccomplex;
-
-#  elif _LIBCPP_STD_VER >= 17
-
-using __standard_header_ccomplex _LIBCPP_DEPRECATED_("Include <complex> instead.") _LIBCPP_NODEBUG = void;
-using __use_standard_header_ccomplex _LIBCPP_NODEBUG = __standard_header_ccomplex;
-
+#  if _LIBCPP_STD_VER >= 17 && !__building_module(std) && _LIBCPP_DIAGNOSE_DEPRECATED_HEADERS
+#    warning <ccomplex> is deprecated in C++17 and removed in C++20. Include <complex> instead.
 #  endif
+
 #endif // __cplusplus < 201103L && defined(_LIBCPP_USE_FROZEN_CXX03_HEADERS)
 
 #endif // _LIBCPP_CCOMPLEX
diff --git a/libcxx/include/ciso646 b/libcxx/include/ciso646
index 34164362dc10d..d9eae41291024 100644
--- a/libcxx/include/ciso646
+++ b/libcxx/include/ciso646
@@ -24,13 +24,10 @@
 #    pragma GCC system_header
 #  endif
 
-#  if _LIBCPP_STD_VER >= 20
-
-using __standard_header_ciso646
-    _LIBCPP_DEPRECATED_("removed in C++20. Include <version> instead.") _LIBCPP_NODEBUG = void;
-using __use_standard_header_ciso646 _LIBCPP_NODEBUG                                     = __standard_header_ciso646;
-
+#  if _LIBCPP_STD_VER >= 20 && !__building_module(std) && _LIBCPP_DIAGNOSE_DEPRECATED_HEADERS
+#    warning <ciso646> is removed in C++20. Include <version> instead.
 #  endif
+
 #endif // __cplusplus < 201103L && defined(_LIBCPP_USE_FROZEN_CXX03_HEADERS)
 
 #endif // _LIBCPP_CISO646
diff --git a/libcxx/include/cstdalign b/libcxx/include/cstdalign
index 7f8dd1e1fbaf8..7aa8cc81ad14c 100644
--- a/libcxx/include/cstdalign
+++ b/libcxx/include/cstdalign
@@ -43,17 +43,10 @@ Macros:
 #  undef __alignof_is_defined
 #  define __alignof_is_defined 1
 
-#  if _LIBCPP_STD_VER >= 20
-
-using __standard_header_cstdalign _LIBCPP_DEPRECATED_("removed in C++20.") _LIBCPP_NODEBUG = void;
-using __use_standard_header_cstdalign _LIBCPP_NODEBUG = __standard_header_cstdalign;
-
-#  elif _LIBCPP_STD_VER >= 17
-
-using __standard_header_cstdalign _LIBCPP_DEPRECATED _LIBCPP_NODEBUG = void;
-using __use_standard_header_cstdalign _LIBCPP_NODEBUG                = __standard_header_cstdalign;
-
+#  if _LIBCPP_STD_VER >= 17 && !__building_module(std) && _LIBCPP_DIAGNOSE_DEPRECATED_HEADERS
+#    warning <cstdalign> is deprecated in C++17 and removed in C++20.
 #  endif
+
 #endif // __cplusplus < 201103L && defined(_LIBCPP_USE_FROZEN_CXX03_HEADERS)
 
 #endif // _LIBCPP_CSTDALIGN
diff --git a/libcxx/include/cstdbool b/libcxx/include/cstdbool
index a432d5f08b9ae..805a287bd7627 100644
--- a/libcxx/include/cstdbool
+++ b/libcxx/include/cstdbool
@@ -31,17 +31,10 @@ Macros:
 #  undef __bool_true_false_are_defined
 #  define __bool_true_false_are_defined 1
 
-#  if _LIBCPP_STD_VER >= 20
-
-using __standard_header_cstdbool _LIBCPP_DEPRECATED_("removed in C++20.") _LIBCPP_NODEBUG = void;
-using __use_standard_header_cstdbool _LIBCPP_NODEBUG                                      = __standard_header_cstdbool;
-
-#  elif _LIBCPP_STD_VER >= 17
-
-using __standard_header_cstdbool _LIBCPP_DEPRECATED _LIBCPP_NODEBUG = void;
-using __use_standard_header_cstdbool _LIBCPP_NODEBUG                = __standard_header_cstdbool;
-
+#  if _LIBCPP_STD_VER >= 17 && !__building_module(std) && _LIBCPP_DIAGNOSE_DEPRECATED_HEADERS
+#    warning <cstdbool> is deprecated in C++17 and removed in C++20.
 #  endif
+
 #endif // __cplusplus < 201103L && defined(_LIBCPP_USE_FROZEN_CXX03_HEADERS)
 
 #endif // _LIBCPP_CSTDBOOL
diff --git a/libcxx/include/ctgmath b/libcxx/include/ctgmath
index db0786f1e2c46..13b7a96e4d8fc 100644
--- a/libcxx/include/ctgmath
+++ b/libcxx/include/ctgmath
@@ -28,17 +28,8 @@
 #    pragma GCC system_header
 #  endif
 
-#  if _LIBCPP_STD_VER >= 20
-
-using __standard_header_ctgmath
-    _LIBCPP_DEPRECATED_("removed in C++20. Include <cmath> and <complex> instead.") _LIBCPP_NODEBUG = void;
-using __use_standard_header_ctgmath _LIBCPP_NODEBUG = __standard_header_ctgmath;
-
-#  elif _LIBCPP_STD_VER >= 17
-
-using __standard_header_ctgmath _LIBCPP_DEPRECATED_("Include <cmath> and <complex> instead.") _LIBCPP_NODEBUG = void;
-using __use_standard_header_ctgmath _LIBCPP_NODEBUG = __standard_header_ctgmath;
-
+#  if _LIBCPP_STD_VER >= 17 && !__building_module(std) && _LIBCPP_DIAGNOSE_DEPRECATED_HEADERS
+#    warning <ctgmath> is deprecated in C++17 and removed in C++20. Include <cmath> and <complex> instead.
 #  endif
 
 #endif // __cplusplus < 201103L && defined(_LIBCPP_USE_FROZEN_CXX03_HEADERS)
diff --git a/libcxx/test/libcxx/transitive_includes.gen.py b/libcxx/test/libcxx/transitive_includes.gen.py
index 6ed35af7e275e..2b643e1f2ad48 100644
--- a/libcxx/test/libcxx/transitive_includes.gen.py
+++ b/libcxx/test/libcxx/transitive_includes.gen.py
@@ -89,7 +89,7 @@
 // UNSUPPORTED: LIBCXX-FREEBSD-FIXME
 
 // RUN: mkdir %t
-// RUN: %{{cxx}} %s %{{flags}} %{{compile_flags}} --trace-includes -fshow-skipped-includes --preprocess > /dev/null 2> %t/trace-includes.txt
+// RUN: %{{cxx}} %s %{{flags}} %{{compile_flags}} -Wno-deprecated --trace-includes -fshow-skipped-includes --preprocess > /dev/null 2> %t/trace-includes.txt
 // RUN: %{{python}} %{{libcxx-dir}}/test/libcxx/transitive_includes/to_csv.py %t/trace-includes.txt > %t/actual_transitive_includes.csv
 // RUN: cat %{{libcxx-dir}}/test/libcxx/transitive_includes/%{{cxx_std}}.csv | awk '/^{escaped_header} / {{ print }}' > %t/expected_transitive_includes.csv
 // RUN: diff -w %t/expected_transitive_includes.csv %t/actual_transitive_includes.csv
diff --git a/libcxx/test/std/depr/depr.cpp.headers/ccomplex.verify.cpp b/libcxx/test/std/depr/depr.cpp.headers/ccomplex.verify.cpp
index 0eaf82ce5cef0..8df89d0ba9206 100644
--- a/libcxx/test/std/depr/depr.cpp.headers/ccomplex.verify.cpp
+++ b/libcxx/test/std/depr/depr.cpp.headers/ccomplex.verify.cpp
@@ -14,12 +14,6 @@
 // UNSUPPORTED: c++03, c++11, c++14
 // UNSUPPORTED: clang-modules-build
 
-#include "test_macros.h"
-
 #include <ccomplex>
 
-#if TEST_STD_VER >= 20
-// expected-warning@ccomplex:* {{'__standard_header_ccomplex' is deprecated: removed in C++20. Include <complex> instead.}}
-#else
-// expected-warning@ccomplex:* {{'__standard_header_ccomplex' is deprecated: Include <complex> instead.}}
-#endif
+// expected-warning@ccomplex:* {{<ccomplex> is deprecated in C++17 and removed in C++20. Include <complex> instead.}}
diff --git a/libcxx/test/std/depr/depr.cpp.headers/ciso646.verify.cpp b/libcxx/test/std/depr/depr.cpp.headers/ciso646.verify.cpp
index 04acd10081548..32b57033331c8 100644
--- a/libcxx/test/std/depr/depr.cpp.headers/ciso646.verify.cpp
+++ b/libcxx/test/std/depr/depr.cpp.headers/ciso646.verify.cpp
@@ -15,4 +15,5 @@
 // UNSUPPORTED: clang-modules-build
 
 #include <ciso646>
-// expected-warning@ciso646:* {{'__standard_header_ciso646' is deprecated: removed in C++20. Include <version> instead.}}
+
+// expected-warning@ciso646:* {{<ciso646> is removed in C++20. Include <version> instead.}}
diff --git a/libcxx/test/std/depr/depr.cpp.headers/cstdalign.verify.cpp b/libcxx/test/std/depr/depr.cpp.headers/cstdalign.verify.cpp
index dc9f1af55b3f1..23a7709a9d658 100644
--- a/libcxx/test/std/depr/depr.cpp.headers/cstdalign.verify.cpp
+++ b/libcxx/test/std/depr/depr.cpp.headers/cstdalign.verify.cpp
@@ -14,12 +14,6 @@
 // UNSUPPORTED: c++03, c++11, c++14
 // UNSUPPORTED: clang-modules-build
 
-#include "test_macros.h"
-
 #include <cstdalign>
 
-#if TEST_STD_VER >= 20
-// expected-warning@cstdalign:* {{'__standard_header_cstdalign' is deprecated: removed in C++20.}}
-#else
-// expected-warning@cstdalign:* {{'__standard_header_cstdalign' is deprecated}}
-#endif
+// expected-warning@cstdalign:* {{<cstdalign> is deprecated in C++17 and removed in C++20.}}
diff --git a/libcxx/test/std/depr/depr.cpp.headers/cstdbool.verify.cpp b/libcxx/test/std/depr/depr.cpp.headers/cstdbool.verify.cpp
index eddefe14d35ea..c2c0f03c52d3c 100644
--- a/libcxx/test/std/depr/depr.cpp.headers/cstdbool.verify.cpp
+++ b/libcxx/test/std/depr/depr.cpp.headers/cstdbool.verify.cpp
@@ -14,12 +14,6 @@
 // UNSUPPORTED: c++03, c++11, c++14
 // UNSUPPORTED: clang-modules-build
 
-#include "test_macros.h"
-
 #include <cstdbool>
 
-#if TEST_STD_VER >= 20
-// expected-warning@cstdbool:* {{'__standard_header_cstdbool' is deprecated: removed in C++20.}}
-#else
-// expected-warning@cstdbool:* {{'__standard_header_cstdbool' is deprecated}}
-#endif
+// expected-warning@cstdbool:* {{<cstdbool> is deprecated in C++17 and removed in C++20.}}
diff --git a/libcxx/test/std/depr/depr.cpp.headers/ctgmath.verify.cpp b/libcxx/test/std/depr/depr.cpp.headers/ctgmath.verify.cpp
index 097ab1643d15a..4f5564915443d 100644
--- a/libcxx/test/std/depr/depr.cpp.headers/ctgmath.verify.cpp
+++ b/libcxx/test/std/depr/depr.cpp.headers/ctgmath.verify.cpp
@@ -14,12 +14,6 @@
 // UNSUPPORTED: c++03, c++11, c++14
 // UNSUPPORTED: clang-modules-build
 
-#include "test_macros.h"
-
 #include <ctgmath>
 
-#if TEST_STD_VER >= 20
-// expected-warning@ctgmath:* {{'__standard_header_ctgmath' is deprecated: removed in C++20. Include <cmath> and <complex> instead.}}
-#else
-// expected-warning@ctgmath:* {{'__standard_header_ctgmath' is deprecated: Include <cmath> and <complex> instead.}}
-#endif
+// expected-warning@ctgmath:* {{<ctgmath> is deprecated in C++17 and removed in C++20. Include <cmath> and <complex> instead.}}
diff --git a/libcxx/test/std/utilities/tuple/tuple.tuple/tuple.apply/make_from_tuple.verify.cpp b/libcxx/test/std/utilities/tuple/tuple.tuple/tuple.apply/make_from_tuple.verify.cpp
index 12d778408d5ec..e58e760a5ce81 100644
--- a/libcxx/test/std/utilities/tuple/tuple.tuple/tuple.apply/make_from_tuple.verify.cpp
+++ b/libcxx/test/std/utilities/tuple/tuple.tuple/tuple.apply/make_from_tuple.verify.cpp
@@ -6,7 +6,7 @@
 //
 //===----------------------------------------------------------------------===//
 
-// REQUIRES: std-at-least-c++23
+// REQUIRES: std-at-least-c++26
 
 // <tuple>
 
@@ -21,11 +21,6 @@
 void test() {
   // expected-error@*:* {{static assertion failed}}
 
-  // Turns to an error since C++26 (Disallow Binding a Returned Glvalue to a Temporary https://wg21.link/P2748R5).
-#if TEST_STD_VER >= 26
   // expected-error@tuple:* {{returning reference to local temporary object}}
-#else
-  // expected-warning@tuple:* {{returning reference to local temporary object}}
-#endif
   std::ignore = std::make_from_tuple<const int&>(std::tuple<char>{});
 }
diff --git a/libcxx/utils/libcxx/test/format.py b/libcxx/utils/libcxx/test/format.py
index 975209c273f8c..76e9115295b99 100644
--- a/libcxx/utils/libcxx/test/format.py
+++ b/libcxx/utils/libcxx/test/format.py
@@ -99,7 +99,7 @@ def parseScript(test, preamble):
     substitutions.append(
         (
             "%{verify}",
-            "%{cxx} %s %{flags} %{compile_flags} -fsyntax-only -Wno-error -Xclang -verify -Xclang -verify-ignore-unexpected=note -ferror-limit=0",
+            "%{cxx} %s %{flags} %{compile_flags} -U_LIBCPP_HAS_NO_PRAGMA_SYSTEM_HEADER -fsyntax-only -Wno-error -Xclang -verify -Xclang -verify-ignore-unexpected=note -ferror-limit=0",
         )
     )
     substitutions.append(("%{run}", "%{exec} %t.exe"))

From 9a0fd22da1013281d6269f19facc5d5c1be58904 Mon Sep 17 00:00:00 2001
From: Robert Imschweiler <robert.imschweiler@amd.com>
Date: Tue, 18 Nov 2025 16:10:42 +0100
Subject: [PATCH 16/33] Revert "[OpenMP] Implement omp_get_uid_from_device() /
 omp_get_device_from_uid()" (#168547)

Reverts llvm/llvm-project#164392 due to Fortran issues
---
 offload/include/OpenMP/omp.h               |  7 --
 offload/include/omptarget.h                |  2 -
 offload/libomptarget/OpenMP/API.cpp        | 58 ----------------
 offload/libomptarget/exports               |  2 -
 offload/test/api/omp_device_uid.c          | 76 ---------------------
 openmp/device/include/DeviceTypes.h        |  3 -
 openmp/device/include/Interface.h          |  4 --
 openmp/device/src/State.cpp                |  6 --
 openmp/runtime/src/dllexports              |  2 -
 openmp/runtime/src/include/omp.h.var       |  5 --
 openmp/runtime/src/include/omp_lib.F90.var | 14 ----
 openmp/runtime/src/include/omp_lib.h.var   | 19 ------
 openmp/runtime/src/kmp_ftn_entry.h         | 29 +-------
 openmp/runtime/src/kmp_ftn_os.h            |  8 ---
 openmp/runtime/test/api/omp_device_uid.c   | 77 ----------------------
 15 files changed, 2 insertions(+), 310 deletions(-)
 delete mode 100644 offload/test/api/omp_device_uid.c
 delete mode 100644 openmp/runtime/test/api/omp_device_uid.c

diff --git a/offload/include/OpenMP/omp.h b/offload/include/OpenMP/omp.h
index d92c7e450c677..768ca46a9bed0 100644
--- a/offload/include/OpenMP/omp.h
+++ b/offload/include/OpenMP/omp.h
@@ -30,13 +30,6 @@
 
 extern "C" {
 
-/// Definitions
-///{
-
-#define omp_invalid_device -2
-
-///}
-
 /// Type declarations
 ///{
 
diff --git a/offload/include/omptarget.h b/offload/include/omptarget.h
index 00910704a979a..fbb4a06accf84 100644
--- a/offload/include/omptarget.h
+++ b/offload/include/omptarget.h
@@ -270,8 +270,6 @@ extern "C" {
 void ompx_dump_mapping_tables(void);
 int omp_get_num_devices(void);
 int omp_get_device_num(void);
-int omp_get_device_from_uid(const char *DeviceUid);
-const char *omp_get_uid_from_device(int DeviceNum);
 int omp_get_initial_device(void);
 void *omp_target_alloc(size_t Size, int DeviceNum);
 void omp_target_free(void *DevicePtr, int DeviceNum);
diff --git a/offload/libomptarget/OpenMP/API.cpp b/offload/libomptarget/OpenMP/API.cpp
index 6e85e5764449c..dd83a3ccd08e6 100644
--- a/offload/libomptarget/OpenMP/API.cpp
+++ b/offload/libomptarget/OpenMP/API.cpp
@@ -40,8 +40,6 @@ EXTERN void ompx_dump_mapping_tables() {
 using namespace llvm::omp::target::ompt;
 #endif
 
-using GenericDeviceTy = llvm::omp::target::plugin::GenericDeviceTy;
-
 void *targetAllocExplicit(size_t Size, int DeviceNum, int Kind,
                           const char *Name);
 void targetFreeExplicit(void *DevicePtr, int DeviceNum, int Kind,
@@ -70,62 +68,6 @@ EXTERN int omp_get_device_num(void) {
   return HostDevice;
 }
 
-static inline bool is_initial_device_uid(const char *DeviceUid) {
-  return strcmp(DeviceUid, GenericPluginTy::getHostDeviceUid()) == 0;
-}
-
-EXTERN int omp_get_device_from_uid(const char *DeviceUid) {
-  TIMESCOPE();
-  OMPT_IF_BUILT(ReturnAddressSetterRAII RA(__builtin_return_address(0)));
-
-  if (!DeviceUid) {
-    DP("Call to omp_get_device_from_uid returning omp_invalid_device\n");
-    return omp_invalid_device;
-  }
-  if (is_initial_device_uid(DeviceUid)) {
-    DP("Call to omp_get_device_from_uid returning initial device number %d\n",
-       omp_get_initial_device());
-    return omp_get_initial_device();
-  }
-
-  int DeviceNum = omp_invalid_device;
-
-  auto ExclusiveDevicesAccessor = PM->getExclusiveDevicesAccessor();
-  for (const DeviceTy &Device : PM->devices(ExclusiveDevicesAccessor)) {
-    const char *Uid = Device.RTL->getDevice(Device.RTLDeviceID).getDeviceUid();
-    if (Uid && strcmp(DeviceUid, Uid) == 0) {
-      DeviceNum = Device.DeviceID;
-      break;
-    }
-  }
-
-  DP("Call to omp_get_device_from_uid returning %d\n", DeviceNum);
-  return DeviceNum;
-}
-
-EXTERN const char *omp_get_uid_from_device(int DeviceNum) {
-  TIMESCOPE();
-  OMPT_IF_BUILT(ReturnAddressSetterRAII RA(__builtin_return_address(0)));
-
-  if (DeviceNum == omp_invalid_device) {
-    DP("Call to omp_get_uid_from_device returning nullptr\n");
-    return nullptr;
-  }
-  if (DeviceNum == omp_get_initial_device()) {
-    DP("Call to omp_get_uid_from_device returning initial device UID\n");
-    return GenericPluginTy::getHostDeviceUid();
-  }
-
-  auto DeviceOrErr = PM->getDevice(DeviceNum);
-  if (!DeviceOrErr)
-    FATAL_MESSAGE(DeviceNum, "%s", toString(DeviceOrErr.takeError()).c_str());
-
-  const char *Uid =
-      DeviceOrErr->RTL->getDevice(DeviceOrErr->RTLDeviceID).getDeviceUid();
-  DP("Call to omp_get_uid_from_device returning %s\n", Uid);
-  return Uid;
-}
-
 EXTERN int omp_get_initial_device(void) {
   TIMESCOPE();
   OMPT_IF_BUILT(ReturnAddressSetterRAII RA(__builtin_return_address(0)));
diff --git a/offload/libomptarget/exports b/offload/libomptarget/exports
index 2ebc23e3cf60a..910a5b6c827a7 100644
--- a/offload/libomptarget/exports
+++ b/offload/libomptarget/exports
@@ -40,8 +40,6 @@ VERS1.0 {
     omp_get_mapped_ptr;
     omp_get_num_devices;
     omp_get_device_num;
-    omp_get_device_from_uid;
-    omp_get_uid_from_device;
     omp_get_initial_device;
     omp_target_alloc;
     omp_target_free;
diff --git a/offload/test/api/omp_device_uid.c b/offload/test/api/omp_device_uid.c
deleted file mode 100644
index 2a41d8d04ef8a..0000000000000
--- a/offload/test/api/omp_device_uid.c
+++ /dev/null
@@ -1,76 +0,0 @@
-// RUN: %libomptarget-compile-run-and-check-generic
-
-#include <omp.h>
-#include <stdio.h>
-#include <string.h>
-
-int test_omp_device_uid(int device_num) {
-  const char *device_uid = omp_get_uid_from_device(device_num);
-  if (device_uid == NULL) {
-    printf("FAIL for device %d: omp_get_uid_from_device returned NULL\n",
-           device_num);
-    return 0;
-  }
-
-  int device_num_from_uid = omp_get_device_from_uid(device_uid);
-  if (device_num_from_uid != device_num) {
-    printf(
-        "FAIL for device %d: omp_get_device_from_uid returned %d (UID: %s)\n",
-        device_num, device_num_from_uid, device_uid);
-    return 0;
-  }
-
-  if (device_num == omp_get_initial_device())
-    return 1;
-
-  int success = 1;
-
-// Note that the following code may be executed on the host if the host is the
-// device
-#pragma omp target map(tofrom : success) device(device_num)
-  {
-    int device_num = omp_get_device_num();
-
-    // omp_get_uid_from_device() in the device runtime is a dummy function
-    // returning NULL
-    const char *device_uid = omp_get_uid_from_device(device_num);
-
-    // omp_get_device_from_uid() in the device runtime is a dummy function
-    // returning omp_invalid_device.
-    int device_num_from_uid = omp_get_device_from_uid(device_uid);
-
-    // Depending on whether we're executing on the device or the host, we either
-    // got NULL as the device UID or the correct device UID.  Consequently,
-    // omp_get_device_from_uid() either returned omp_invalid_device or the
-    // correct device number (aka omp_get_initial_device()).
-    if (device_uid ? device_num_from_uid != device_num
-                   : device_num_from_uid != omp_invalid_device) {
-      printf("FAIL for device %d (target): omp_get_device_from_uid returned %d "
-             "(UID: %s)\n",
-             device_num, device_num_from_uid, device_uid);
-      success = 0;
-    }
-  }
-
-  return success;
-}
-
-int main() {
-  int num_devices = omp_get_num_devices();
-  int num_failed = 0;
-  // (also test initial device aka num_devices)
-  for (int i = 0; i < num_devices + 1; i++) {
-    if (!test_omp_device_uid(i)) {
-      printf("FAIL for device %d\n", i);
-      num_failed++;
-    }
-  }
-  if (num_failed) {
-    printf("FAIL\n");
-    return 1;
-  }
-  printf("PASS\n");
-  return 0;
-}
-
-// CHECK: PASS
diff --git a/openmp/device/include/DeviceTypes.h b/openmp/device/include/DeviceTypes.h
index 213ccfe58b4fb..2e5d92380f040 100644
--- a/openmp/device/include/DeviceTypes.h
+++ b/openmp/device/include/DeviceTypes.h
@@ -21,9 +21,6 @@ template <typename T> using Constant = __gpu_constant T;
 template <typename T> using Local = __gpu_local T;
 template <typename T> using Global = __gpu_local T;
 
-// See definition in OpenMP (omp.h.var/omp_lib.(F90|h).var)
-#define omp_invalid_device -2
-
 enum omp_proc_bind_t {
   omp_proc_bind_false = 0,
   omp_proc_bind_true = 1,
diff --git a/openmp/device/include/Interface.h b/openmp/device/include/Interface.h
index 71c3b1fc06d40..c4bfaaa2404b4 100644
--- a/openmp/device/include/Interface.h
+++ b/openmp/device/include/Interface.h
@@ -130,10 +130,6 @@ int omp_get_num_devices(void);
 
 int omp_get_device_num(void);
 
-int omp_get_device_from_uid(const char *DeviceUid);
-
-const char *omp_get_uid_from_device(int DeviceNum);
-
 int omp_get_num_teams(void);
 
 int omp_get_team_num();
diff --git a/openmp/device/src/State.cpp b/openmp/device/src/State.cpp
index 985e6b169137f..9f38cf26f8c6f 100644
--- a/openmp/device/src/State.cpp
+++ b/openmp/device/src/State.cpp
@@ -403,12 +403,6 @@ int omp_get_num_devices(void) { return config::getNumDevices(); }
 
 int omp_get_device_num(void) { return config::getDeviceNum(); }
 
-int omp_get_device_from_uid(const char *DeviceUid) {
-  return omp_invalid_device;
-}
-
-const char *omp_get_uid_from_device(int DeviceNum) { return nullptr; }
-
 int omp_get_num_teams(void) { return mapping::getNumberOfBlocksInKernel(); }
 
 int omp_get_team_num() { return mapping::getBlockIdInKernel(); }
diff --git a/openmp/runtime/src/dllexports b/openmp/runtime/src/dllexports
index 00becd1a657fd..3983dae80c9f5 100644
--- a/openmp/runtime/src/dllexports
+++ b/openmp/runtime/src/dllexports
@@ -544,8 +544,6 @@ kmp_set_disp_num_buffers                    890
     omp_get_devices_all_allocator           819
     omp_get_memspace_num_resources          820
     omp_get_submemspace                     821
-    omp_get_device_from_uid                 822
-    omp_get_uid_from_device                 823
     %ifndef stub
         __kmpc_set_default_allocator
         __kmpc_get_default_allocator
diff --git a/openmp/runtime/src/include/omp.h.var b/openmp/runtime/src/include/omp.h.var
index e98df731ad888..74f385feb3ea5 100644
--- a/openmp/runtime/src/include/omp.h.var
+++ b/openmp/runtime/src/include/omp.h.var
@@ -536,11 +536,6 @@
 
     /* OpenMP 5.2 */
     extern int __KAI_KMPC_CONVENTION omp_in_explicit_task(void);
-    #define omp_invalid_device -2
-
-    /* OpenMP 6.0 */
-    extern int   __KAI_KMPC_CONVENTION  omp_get_device_from_uid(const char *DeviceUid);
-    extern const char *   __KAI_KMPC_CONVENTION  omp_get_uid_from_device(int DeviceNum);
 
     /* LLVM Extensions */
     extern void *llvm_omp_target_dynamic_shared_alloc(void);
diff --git a/openmp/runtime/src/include/omp_lib.F90.var b/openmp/runtime/src/include/omp_lib.F90.var
index 159b42ab5b5cc..90d7e49ebf549 100644
--- a/openmp/runtime/src/include/omp_lib.F90.var
+++ b/openmp/runtime/src/include/omp_lib.F90.var
@@ -215,8 +215,6 @@
 
         integer (kind=omp_interop_kind), parameter, public :: omp_interop_none = 0
 
-        integer (kind=omp_integer_kind), parameter, public :: omp_invalid_device = -2
-
         interface
 
 !         ***
@@ -419,18 +417,6 @@
             integer (kind=omp_integer_kind) omp_get_device_num
           end function omp_get_device_num
 
-          function omp_get_uid_from_device(device_num) bind(c)
-            use omp_lib_kinds
-            integer (kind=omp_integer_kind), value :: device_num
-            character (len=*) omp_get_uid_from_device
-          end function omp_get_uid_from_device
-
-          function omp_get_device_from_uid(device_uid) bind(c)
-            use omp_lib_kinds
-            character (len=*), value :: device_uid
-            integer (kind=omp_integer_kind) omp_get_device_from_uid
-          end function omp_get_device_from_uid
-
           function omp_pause_resource(kind, device_num) bind(c)
             use omp_lib_kinds
             integer (kind=omp_pause_resource_kind), value :: kind
diff --git a/openmp/runtime/src/include/omp_lib.h.var b/openmp/runtime/src/include/omp_lib.h.var
index 468eb03e99ef1..a50bb018c7cc3 100644
--- a/openmp/runtime/src/include/omp_lib.h.var
+++ b/openmp/runtime/src/include/omp_lib.h.var
@@ -291,9 +291,6 @@
       integer(kind=omp_interop_kind)omp_interop_none
       parameter(omp_interop_none=0)
 
-      integer(kind=omp_integer_kind)omp_invalid_device
-      parameter(omp_invalid_device=-2)
-
       interface
 
 !       ***
@@ -489,18 +486,6 @@
           integer (kind=omp_integer_kind) omp_get_device_num
         end function omp_get_device_num
 
-        function omp_get_uid_from_device(device_num) bind(c)
-          import
-          integer (kind=omp_integer_kind), value :: device_num
-          character (len=*) omp_get_uid_from_device
-        end function omp_get_uid_from_device
-
-        function omp_get_device_from_uid(device_uid) bind(c)
-          import
-          character (len=*), value :: device_uid
-          integer (kind=omp_integer_kind) omp_get_device_from_uid
-        end function omp_get_device_from_uid
-
         function omp_pause_resource(kind, device_num) bind(c)
           import
           integer (kind=omp_pause_resource_kind), value :: kind
@@ -1174,8 +1159,6 @@
 !DIR$ ATTRIBUTES OFFLOAD:MIC :: omp_get_initial_device
 !DIR$ ATTRIBUTES OFFLOAD:MIC :: omp_get_num_devices
 !DIR$ ATTRIBUTES OFFLOAD:MIC :: omp_get_device_num
-!DIR$ ATTRIBUTES OFFLOAD:MIC :: omp_get_uid_from_device
-!DIR$ ATTRIBUTES OFFLOAD:MIC :: omp_get_device_from_uid
 !DIR$ ATTRIBUTES OFFLOAD:MIC :: omp_pause_resource
 !DIR$ ATTRIBUTES OFFLOAD:MIC :: omp_pause_resource_all
 !DIR$ ATTRIBUTES OFFLOAD:MIC :: omp_get_supported_active_levels
@@ -1259,8 +1242,6 @@
 !$omp declare target(omp_get_initial_device )
 !$omp declare target(omp_get_num_devices )
 !$omp declare target(omp_get_device_num )
-!$omp declare target(omp_get_uid_from_device )
-!$omp declare target(omp_get_device_from_uid )
 !$omp declare target(omp_pause_resource )
 !$omp declare target(omp_pause_resource_all )
 !$omp declare target(omp_get_supported_active_levels )
diff --git a/openmp/runtime/src/kmp_ftn_entry.h b/openmp/runtime/src/kmp_ftn_entry.h
index 49c56d2b9a769..2b0063eb23a0a 100644
--- a/openmp/runtime/src/kmp_ftn_entry.h
+++ b/openmp/runtime/src/kmp_ftn_entry.h
@@ -1543,38 +1543,13 @@ int FTN_STDCALL KMP_EXPAND_NAME(FTN_GET_MAX_TASK_PRIORITY)(void) {
 #endif
 }
 
-// These functions will be defined in libomptarget. When libomptarget is not
-// loaded, we assume we are on the host.
+// This function will be defined in libomptarget. When libomptarget is not
+// loaded, we assume we are on the host and return KMP_HOST_DEVICE.
 // Compiler/libomptarget will handle this if called inside target.
 int FTN_STDCALL FTN_GET_DEVICE_NUM(void) KMP_WEAK_ATTRIBUTE_EXTERNAL;
 int FTN_STDCALL FTN_GET_DEVICE_NUM(void) {
   return KMP_EXPAND_NAME(FTN_GET_INITIAL_DEVICE)();
 }
-const char *FTN_STDCALL FTN_GET_UID_FROM_DEVICE(int device_num)
-    KMP_WEAK_ATTRIBUTE_EXTERNAL;
-const char *FTN_STDCALL FTN_GET_UID_FROM_DEVICE(int device_num) {
-#if KMP_OS_DARWIN || KMP_OS_WASI || defined(KMP_STUB)
-  return nullptr;
-#else
-  const char *(*fptr)(int);
-  if ((*(void **)(&fptr) = KMP_DLSYM_NEXT("omp_get_uid_from_device")))
-    return (*fptr)(device_num);
-  // Returns the same string as used by libomptarget
-  return "HOST";
-#endif
-}
-int FTN_STDCALL FTN_GET_DEVICE_FROM_UID(const char *device_uid)
-    KMP_WEAK_ATTRIBUTE_EXTERNAL;
-int FTN_STDCALL FTN_GET_DEVICE_FROM_UID(const char *device_uid) {
-#if KMP_OS_DARWIN || KMP_OS_WASI || defined(KMP_STUB)
-  return omp_invalid_device;
-#else
-  int (*fptr)(const char *);
-  if ((*(void **)(&fptr) = KMP_DLSYM_NEXT("omp_get_device_from_uid")))
-    return (*fptr)(device_uid);
-  return KMP_EXPAND_NAME(FTN_GET_INITIAL_DEVICE)();
-#endif
-}
 
 // Compiler will ensure that this is only called from host in sequential region
 int FTN_STDCALL KMP_EXPAND_NAME(FTN_PAUSE_RESOURCE)(kmp_pause_status_t kind,
diff --git a/openmp/runtime/src/kmp_ftn_os.h b/openmp/runtime/src/kmp_ftn_os.h
index c439a058f22b4..ae0ed067235e5 100644
--- a/openmp/runtime/src/kmp_ftn_os.h
+++ b/openmp/runtime/src/kmp_ftn_os.h
@@ -140,8 +140,6 @@
 #define FTN_GET_MEMSPACE_NUM_RESOURCES omp_get_memspace_num_resources
 #define FTN_GET_SUBMEMSPACE omp_get_submemspace
 #define FTN_GET_DEVICE_NUM omp_get_device_num
-#define FTN_GET_UID_FROM_DEVICE omp_get_uid_from_device
-#define FTN_GET_DEVICE_FROM_UID omp_get_device_from_uid
 #define FTN_SET_AFFINITY_FORMAT omp_set_affinity_format
 #define FTN_GET_AFFINITY_FORMAT omp_get_affinity_format
 #define FTN_DISPLAY_AFFINITY omp_display_affinity
@@ -291,8 +289,6 @@
 #define FTN_ALLOC omp_alloc_
 #define FTN_FREE omp_free_
 #define FTN_GET_DEVICE_NUM omp_get_device_num_
-#define FTN_GET_UID_FROM_DEVICE omp_get_uid_from_device_
-#define FTN_GET_DEVICE_FROM_UID omp_get_device_from_uid_
 #define FTN_SET_AFFINITY_FORMAT omp_set_affinity_format_
 #define FTN_GET_AFFINITY_FORMAT omp_get_affinity_format_
 #define FTN_DISPLAY_AFFINITY omp_display_affinity_
@@ -440,8 +436,6 @@
 #define FTN_GET_MEMSPACE_NUM_RESOURCES OMP_GET_MEMSPACE_NUM_RESOURCES
 #define FTN_GET_SUBMEMSPACE OMP_GET_SUBMEMSPACE
 #define FTN_GET_DEVICE_NUM OMP_GET_DEVICE_NUM
-#define FTN_GET_UID_FROM_DEVICE OMP_GET_UID_FROM_DEVICE
-#define FTN_GET_DEVICE_FROM_UID OMP_GET_DEVICE_FROM_UID
 #define FTN_SET_AFFINITY_FORMAT OMP_SET_AFFINITY_FORMAT
 #define FTN_GET_AFFINITY_FORMAT OMP_GET_AFFINITY_FORMAT
 #define FTN_DISPLAY_AFFINITY OMP_DISPLAY_AFFINITY
@@ -591,8 +585,6 @@
 #define FTN_ALLOC OMP_ALLOC_
 #define FTN_FREE OMP_FREE_
 #define FTN_GET_DEVICE_NUM OMP_GET_DEVICE_NUM_
-#define FTN_GET_UID_FROM_DEVICE OMP_GET_UID_FROM_DEVICE_
-#define FTN_GET_DEVICE_FROM_UID OMP_GET_DEVICE_FROM_UID_
 #define FTN_SET_AFFINITY_FORMAT OMP_SET_AFFINITY_FORMAT_
 #define FTN_GET_AFFINITY_FORMAT OMP_GET_AFFINITY_FORMAT_
 #define FTN_DISPLAY_AFFINITY OMP_DISPLAY_AFFINITY_
diff --git a/openmp/runtime/test/api/omp_device_uid.c b/openmp/runtime/test/api/omp_device_uid.c
deleted file mode 100644
index 40a1cbb644c7b..0000000000000
--- a/openmp/runtime/test/api/omp_device_uid.c
+++ /dev/null
@@ -1,77 +0,0 @@
-// RUN: %libomp-compile-and-run 2>&1 | FileCheck %s
-// Linking fails for icc 18
-// UNSUPPORTED: icc-18
-
-#include <omp_testsuite.h>
-#include <string.h>
-
-int test_omp_device_uid(int device_num) {
-  const char *device_uid = omp_get_uid_from_device(device_num);
-  if (device_uid == NULL) {
-    printf("FAIL for device %d: omp_get_uid_from_device returned NULL\n",
-           device_num);
-    return 0;
-  }
-
-  int device_num_from_uid = omp_get_device_from_uid(device_uid);
-  if (device_num_from_uid != device_num) {
-    printf(
-        "FAIL for device %d: omp_get_device_from_uid returned %d (UID: %s)\n",
-        device_num, device_num_from_uid, device_uid);
-    return 0;
-  }
-
-  if (device_num == omp_get_initial_device())
-    return 1;
-
-  int success = 1;
-
-// Note that the following code may be executed on the host if the host is the
-// device
-#pragma omp target map(tofrom : success) device(device_num)
-  {
-    int device_num = omp_get_device_num();
-
-    // omp_get_uid_from_device() in the device runtime is a dummy function
-    // returning NULL
-    const char *device_uid = omp_get_uid_from_device(device_num);
-
-    // omp_get_device_from_uid() in the device runtime is a dummy function
-    // returning omp_invalid_device.
-    int device_num_from_uid = omp_get_device_from_uid(device_uid);
-
-    // Depending on whether we're executing on the device or the host, we either
-    // got NULL as the device UID or the correct device UID.  Consequently,
-    // omp_get_device_from_uid() either returned omp_invalid_device or the
-    // correct device number (aka omp_get_initial_device()).
-    if (device_uid ? device_num_from_uid != device_num
-                   : device_num_from_uid != omp_invalid_device) {
-      printf("FAIL for device %d (target): omp_get_device_from_uid returned %d "
-             "(UID: %s)\n",
-             device_num, device_num_from_uid, device_uid);
-      success = 0;
-    }
-  }
-
-  return success;
-}
-
-int main() {
-  int num_devices = omp_get_num_devices();
-  int num_failed = 0;
-  // (also test initial device aka num_devices)
-  for (int i = 0; i < num_devices + 1; i++) {
-    if (!test_omp_device_uid(i)) {
-      printf("FAIL for device %d\n", i);
-      num_failed++;
-    }
-  }
-  if (num_failed) {
-    printf("FAIL\n");
-    return 1;
-  }
-  printf("PASS\n");
-  return 0;
-}
-
-// CHECK: PASS

From 2befda2225a6c61d0308e536c19b066ab27bbf2a Mon Sep 17 00:00:00 2001
From: Florian Hahn <flo@fhahn.com>
Date: Tue, 18 Nov 2025 15:15:14 +0000
Subject: [PATCH 17/33] [VPlan] Populate and use VPIRFlags from initial
 VPInstruction. (#168450)

Update VPlan to populate VPIRFlags during VPInstruction construction and
use it when creating widened recipes, instead of constructing VPIRFlags
from the underlying IR instruction each time. The VPRecipeWithIRFlags
constructor taking an underlying instruction and setting the flags based
on it has been removed.

This centralizes initial VPIRFlags creation and ensures flags are
consistently available throughout VPlan transformations and makes sure
we don't accidentally re-add flags from the underlying instruction that
already got dropped during transformations.

Follow-up to https://github.com/llvm/llvm-project/pull/167253, which did
the same for VPIRMetadata.

Should be NFC w.r.t. the generated IR.

PR: https://github.com/llvm/llvm-project/pull/168450
---
 .../Vectorize/LoopVectorizationPlanner.h      |  5 +-
 .../Transforms/Vectorize/LoopVectorize.cpp    | 35 ++++---
 llvm/lib/Transforms/Vectorize/VPlan.h         | 92 +++++++++----------
 .../Vectorize/VPlanConstruction.cpp           | 11 ++-
 .../lib/Transforms/Vectorize/VPlanRecipes.cpp | 16 ++--
 .../Transforms/Vectorize/VPlanTransforms.cpp  | 50 +++++-----
 llvm/lib/Transforms/Vectorize/VPlanUnroll.cpp |  6 +-
 .../vplan-printing-outer-loop.ll              | 14 +--
 .../Transforms/Vectorize/VPlanHCFGTest.cpp    | 10 +-
 .../Transforms/Vectorize/VPlanTest.cpp        | 15 ++-
 10 files changed, 134 insertions(+), 120 deletions(-)

diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorizationPlanner.h b/llvm/lib/Transforms/Vectorize/LoopVectorizationPlanner.h
index f533a47150a7b..741392247c0d6 100644
--- a/llvm/lib/Transforms/Vectorize/LoopVectorizationPlanner.h
+++ b/llvm/lib/Transforms/Vectorize/LoopVectorizationPlanner.h
@@ -152,11 +152,12 @@ class VPBuilder {
   /// its underlying Instruction.
   VPInstruction *createNaryOp(unsigned Opcode, ArrayRef<VPValue *> Operands,
                               Instruction *Inst = nullptr,
+                              const VPIRFlags &Flags = {},
                               const VPIRMetadata &MD = {},
                               DebugLoc DL = DebugLoc::getUnknown(),
                               const Twine &Name = "") {
     VPInstruction *NewVPInst = tryInsertInstruction(
-        new VPInstruction(Opcode, Operands, {}, MD, DL, Name));
+        new VPInstruction(Opcode, Operands, Flags, MD, DL, Name));
     NewVPInst->setUnderlyingValue(Inst);
     return NewVPInst;
   }
@@ -329,7 +330,7 @@ class VPBuilder {
     else if (Opcode == Instruction::ZExt)
       Flags = VPIRFlags::NonNegFlagsTy(false);
     return tryInsertInstruction(
-        new VPWidenCastRecipe(Opcode, Op, ResultTy, Flags));
+        new VPWidenCastRecipe(Opcode, Op, ResultTy, nullptr, Flags));
   }
 
   VPScalarIVStepsRecipe *
diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
index 356d759b94799..c680b6fca84cd 100644
--- a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
+++ b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
@@ -7750,7 +7750,7 @@ VPSingleDefRecipe *VPRecipeBuilder::tryToWidenCall(VPInstruction *VPI,
                 },
                 Range);
   if (ShouldUseVectorIntrinsic)
-    return new VPWidenIntrinsicRecipe(*CI, ID, Ops, CI->getType(), *VPI,
+    return new VPWidenIntrinsicRecipe(*CI, ID, Ops, CI->getType(), *VPI, *VPI,
                                       VPI->getDebugLoc());
 
   Function *Variant = nullptr;
@@ -7804,7 +7804,8 @@ VPSingleDefRecipe *VPRecipeBuilder::tryToWidenCall(VPInstruction *VPI,
     }
 
     Ops.push_back(VPI->getOperand(VPI->getNumOperands() - 1));
-    return new VPWidenCallRecipe(CI, Variant, Ops, VPI->getDebugLoc());
+    return new VPWidenCallRecipe(CI, Variant, Ops, *VPI, *VPI,
+                                 VPI->getDebugLoc());
   }
 
   return nullptr;
@@ -7842,7 +7843,7 @@ VPWidenRecipe *VPRecipeBuilder::tryToWiden(VPInstruction *VPI) {
       auto *SafeRHS =
           Builder.createSelect(Mask, Ops[1], One, VPI->getDebugLoc());
       Ops[1] = SafeRHS;
-      return new VPWidenRecipe(*I, Ops, *VPI, VPI->getDebugLoc());
+      return new VPWidenRecipe(*I, Ops, *VPI, *VPI, VPI->getDebugLoc());
     }
     [[fallthrough]];
   }
@@ -7888,7 +7889,7 @@ VPWidenRecipe *VPRecipeBuilder::tryToWiden(VPInstruction *VPI) {
       // For other binops, the legacy cost model only checks the second operand.
       NewOps[1] = GetConstantViaSCEV(NewOps[1]);
     }
-    return new VPWidenRecipe(*I, NewOps, *VPI, VPI->getDebugLoc());
+    return new VPWidenRecipe(*I, NewOps, *VPI, *VPI, VPI->getDebugLoc());
   }
   case Instruction::ExtractValue: {
     SmallVector<VPValue *> NewOps(VPI->operands());
@@ -7896,7 +7897,7 @@ VPWidenRecipe *VPRecipeBuilder::tryToWiden(VPInstruction *VPI) {
     assert(EVI->getNumIndices() == 1 && "Expected one extractvalue index");
     unsigned Idx = EVI->getIndices()[0];
     NewOps.push_back(Plan.getConstantInt(32, Idx));
-    return new VPWidenRecipe(*I, NewOps, *VPI, VPI->getDebugLoc());
+    return new VPWidenRecipe(*I, NewOps, *VPI, *VPI, VPI->getDebugLoc());
   }
   };
 }
@@ -7981,7 +7982,8 @@ VPReplicateRecipe *VPRecipeBuilder::handleReplication(VPInstruction *VPI,
           (Range.Start.isScalable() && isa<IntrinsicInst>(I))) &&
          "Should not predicate a uniform recipe");
   auto *Recipe =
-      new VPReplicateRecipe(I, VPI->operands(), IsUniform, BlockInMask, *VPI);
+      new VPReplicateRecipe(I, VPI->operands(), IsUniform, BlockInMask, *VPI,
+                            *VPI, VPI->getDebugLoc());
   return Recipe;
 }
 
@@ -8231,17 +8233,19 @@ VPRecipeBase *VPRecipeBuilder::tryToCreateWidenRecipe(VPSingleDefRecipe *R,
     return nullptr;
 
   if (VPI->getOpcode() == Instruction::GetElementPtr)
-    return new VPWidenGEPRecipe(cast<GetElementPtrInst>(Instr), R->operands());
+    return new VPWidenGEPRecipe(cast<GetElementPtrInst>(Instr), R->operands(),
+                                *VPI, VPI->getDebugLoc());
 
   if (VPI->getOpcode() == Instruction::Select)
-    return new VPWidenSelectRecipe(*cast<SelectInst>(Instr), R->operands(),
-                                   *VPI);
+    return new VPWidenSelectRecipe(cast<SelectInst>(Instr), R->operands(), *VPI,
+                                   *VPI, VPI->getDebugLoc());
 
   if (Instruction::isCast(VPI->getOpcode())) {
-    auto *CastR = cast<VPInstructionWithType>(R);
     auto *CI = cast<CastInst>(Instr);
+    auto *CastR = cast<VPInstructionWithType>(VPI);
     return new VPWidenCastRecipe(CI->getOpcode(), VPI->getOperand(0),
-                                 CastR->getResultType(), *CI, *VPI);
+                                 CastR->getResultType(), CI, *VPI, *VPI,
+                                 VPI->getDebugLoc());
   }
 
   return tryToWiden(VPI);
@@ -8269,8 +8273,8 @@ VPRecipeBuilder::tryToCreatePartialReduction(VPInstruction *Reduction,
     SmallVector<VPValue *, 2> Ops;
     Ops.push_back(Plan.getOrAddLiveIn(Zero));
     Ops.push_back(BinOp);
-    BinOp = new VPWidenRecipe(*ReductionI, Ops, VPIRMetadata(),
-                              ReductionI->getDebugLoc());
+    BinOp = new VPWidenRecipe(*ReductionI, Ops, VPIRFlags(*ReductionI),
+                              VPIRMetadata(), ReductionI->getDebugLoc());
     Builder.insert(BinOp->getDefiningRecipe());
     ReductionOpcode = Instruction::Add;
   }
@@ -8454,9 +8458,10 @@ VPlanPtr LoopVectorizationPlanner::tryToBuildVPlanWithVPRecipes(
           Legal->isInvariantAddressOfReduction(SI->getPointerOperand())) {
         // Only create recipe for the final invariant store of the reduction.
         if (Legal->isInvariantStoreOfReduction(SI)) {
+          auto *VPI = cast<VPInstruction>(SingleDef);
           auto *Recipe = new VPReplicateRecipe(
-              SI, R.operands(), true /* IsUniform */, nullptr /*Mask*/,
-              *cast<VPInstruction>(SingleDef));
+              SI, R.operands(), true /* IsUniform */, nullptr /*Mask*/, *VPI,
+              *VPI, VPI->getDebugLoc());
           Recipe->insertBefore(*MiddleVPBB, MBIP);
         }
         R.eraseFromParent();
diff --git a/llvm/lib/Transforms/Vectorize/VPlan.h b/llvm/lib/Transforms/Vectorize/VPlan.h
index fc29ab0c84093..fedbcfb6bd32a 100644
--- a/llvm/lib/Transforms/Vectorize/VPlan.h
+++ b/llvm/lib/Transforms/Vectorize/VPlan.h
@@ -882,14 +882,6 @@ class VPIRFlags {
 /// A pure-virtual common base class for recipes defining a single VPValue and
 /// using IR flags.
 struct VPRecipeWithIRFlags : public VPSingleDefRecipe, public VPIRFlags {
-  VPRecipeWithIRFlags(const unsigned char SC, ArrayRef<VPValue *> Operands,
-                      DebugLoc DL = DebugLoc::getUnknown())
-      : VPSingleDefRecipe(SC, Operands, DL), VPIRFlags() {}
-
-  VPRecipeWithIRFlags(const unsigned char SC, ArrayRef<VPValue *> Operands,
-                      Instruction &I)
-      : VPSingleDefRecipe(SC, Operands, &I, I.getDebugLoc()), VPIRFlags(I) {}
-
   VPRecipeWithIRFlags(const unsigned char SC, ArrayRef<VPValue *> Operands,
                       const VPIRFlags &Flags,
                       DebugLoc DL = DebugLoc::getUnknown())
@@ -1474,9 +1466,12 @@ class LLVM_ABI_FOR_TEST VPWidenRecipe : public VPRecipeWithIRFlags,
         VPIRMetadata(Metadata), Opcode(Opcode) {}
 
   VPWidenRecipe(Instruction &I, ArrayRef<VPValue *> Operands,
-                const VPIRMetadata &Metadata, DebugLoc DL)
-      : VPRecipeWithIRFlags(VPDef::VPWidenSC, Operands, I),
-        VPIRMetadata(Metadata), Opcode(I.getOpcode()) {}
+                const VPIRFlags &Flags = {}, const VPIRMetadata &Metadata = {},
+                DebugLoc DL = {})
+      : VPRecipeWithIRFlags(VPDef::VPWidenSC, Operands, Flags, DL),
+        VPIRMetadata(Metadata), Opcode(I.getOpcode()) {
+    setUnderlyingValue(&I);
+  }
 
   ~VPWidenRecipe() override = default;
 
@@ -1517,30 +1512,22 @@ class VPWidenCastRecipe : public VPRecipeWithIRFlags, public VPIRMetadata {
 
 public:
   VPWidenCastRecipe(Instruction::CastOps Opcode, VPValue *Op, Type *ResultTy,
-                    CastInst &UI, const VPIRMetadata &Metadata)
-      : VPRecipeWithIRFlags(VPDef::VPWidenCastSC, Op, UI),
-        VPIRMetadata(Metadata), Opcode(Opcode), ResultTy(ResultTy) {
-    assert(UI.getOpcode() == Opcode &&
-           "opcode of underlying cast doesn't match");
-  }
-  VPWidenCastRecipe(Instruction::CastOps Opcode, VPValue *Op, Type *ResultTy,
-                    const VPIRFlags &Flags = {},
+                    CastInst *CI = nullptr, const VPIRFlags &Flags = {},
                     const VPIRMetadata &Metadata = {},
                     DebugLoc DL = DebugLoc::getUnknown())
       : VPRecipeWithIRFlags(VPDef::VPWidenCastSC, Op, Flags, DL),
         VPIRMetadata(Metadata), Opcode(Opcode), ResultTy(ResultTy) {
     assert(flagsValidForOpcode(Opcode) &&
            "Set flags not supported for the provided opcode");
+    setUnderlyingValue(CI);
   }
 
   ~VPWidenCastRecipe() override = default;
 
   VPWidenCastRecipe *clone() override {
-    auto *New = new VPWidenCastRecipe(Opcode, getOperand(0), ResultTy, *this,
-                                      *this, getDebugLoc());
-    if (auto *UV = getUnderlyingValue())
-      New->setUnderlyingValue(UV);
-    return New;
+    return new VPWidenCastRecipe(Opcode, getOperand(0), ResultTy,
+                                 cast_or_null<CastInst>(getUnderlyingValue()),
+                                 *this, *this, getDebugLoc());
   }
 
   VP_CLASSOF_IMPL(VPDef::VPWidenCastSC)
@@ -1585,13 +1572,17 @@ class VPWidenIntrinsicRecipe : public VPRecipeWithIRFlags, public VPIRMetadata {
 public:
   VPWidenIntrinsicRecipe(CallInst &CI, Intrinsic::ID VectorIntrinsicID,
                          ArrayRef<VPValue *> CallArguments, Type *Ty,
+                         const VPIRFlags &Flags = {},
                          const VPIRMetadata &MD = {},
                          DebugLoc DL = DebugLoc::getUnknown())
-      : VPRecipeWithIRFlags(VPDef::VPWidenIntrinsicSC, CallArguments, CI),
+      : VPRecipeWithIRFlags(VPDef::VPWidenIntrinsicSC, CallArguments, Flags,
+                            DL),
         VPIRMetadata(MD), VectorIntrinsicID(VectorIntrinsicID), ResultTy(Ty),
         MayReadFromMemory(CI.mayReadFromMemory()),
         MayWriteToMemory(CI.mayWriteToMemory()),
-        MayHaveSideEffects(CI.mayHaveSideEffects()) {}
+        MayHaveSideEffects(CI.mayHaveSideEffects()) {
+    setUnderlyingValue(&CI);
+  }
 
   VPWidenIntrinsicRecipe(Intrinsic::ID VectorIntrinsicID,
                          ArrayRef<VPValue *> CallArguments, Type *Ty,
@@ -1617,7 +1608,7 @@ class VPWidenIntrinsicRecipe : public VPRecipeWithIRFlags, public VPIRMetadata {
   VPWidenIntrinsicRecipe *clone() override {
     if (Value *CI = getUnderlyingValue())
       return new VPWidenIntrinsicRecipe(*cast<CallInst>(CI), VectorIntrinsicID,
-                                        operands(), ResultTy, *this,
+                                        operands(), ResultTy, *this, *this,
                                         getDebugLoc());
     return new VPWidenIntrinsicRecipe(VectorIntrinsicID, operands(), ResultTy,
                                       *this, *this, getDebugLoc());
@@ -1671,10 +1662,11 @@ class LLVM_ABI_FOR_TEST VPWidenCallRecipe : public VPRecipeWithIRFlags,
 public:
   VPWidenCallRecipe(Value *UV, Function *Variant,
                     ArrayRef<VPValue *> CallArguments,
-                    DebugLoc DL = DebugLoc::getUnknown())
-      : VPRecipeWithIRFlags(VPDef::VPWidenCallSC, CallArguments,
-                            *cast<Instruction>(UV)),
-        VPIRMetadata(*cast<Instruction>(UV)), Variant(Variant) {
+                    const VPIRFlags &Flags = {},
+                    const VPIRMetadata &Metadata = {}, DebugLoc DL = {})
+      : VPRecipeWithIRFlags(VPDef::VPWidenCallSC, CallArguments, Flags, DL),
+        VPIRMetadata(Metadata), Variant(Variant) {
+    setUnderlyingValue(UV);
     assert(
         isa<Function>(getOperand(getNumOperands() - 1)->getLiveInIRValue()) &&
         "last operand must be the called function");
@@ -1684,7 +1676,7 @@ class LLVM_ABI_FOR_TEST VPWidenCallRecipe : public VPRecipeWithIRFlags,
 
   VPWidenCallRecipe *clone() override {
     return new VPWidenCallRecipe(getUnderlyingValue(), Variant, operands(),
-                                 getDebugLoc());
+                                 *this, *this, getDebugLoc());
   }
 
   VP_CLASSOF_IMPL(VPDef::VPWidenCallSC)
@@ -1761,16 +1753,19 @@ class VPHistogramRecipe : public VPRecipeBase {
 /// instruction.
 struct LLVM_ABI_FOR_TEST VPWidenSelectRecipe : public VPRecipeWithIRFlags,
                                                public VPIRMetadata {
-  VPWidenSelectRecipe(SelectInst &I, ArrayRef<VPValue *> Operands,
-                      const VPIRMetadata &MD = {})
-      : VPRecipeWithIRFlags(VPDef::VPWidenSelectSC, Operands, I),
-        VPIRMetadata(MD) {}
+  VPWidenSelectRecipe(SelectInst *SI, ArrayRef<VPValue *> Operands,
+                      const VPIRFlags &Flags = {}, const VPIRMetadata &MD = {},
+                      DebugLoc DL = {})
+      : VPRecipeWithIRFlags(VPDef::VPWidenSelectSC, Operands, Flags, DL),
+        VPIRMetadata(MD) {
+    setUnderlyingValue(SI);
+  }
 
   ~VPWidenSelectRecipe() override = default;
 
   VPWidenSelectRecipe *clone() override {
-    return new VPWidenSelectRecipe(*cast<SelectInst>(getUnderlyingInstr()),
-                                   operands(), *this);
+    return new VPWidenSelectRecipe(cast<SelectInst>(getUnderlyingInstr()),
+                                   operands(), *this, *this, getDebugLoc());
   }
 
   VP_CLASSOF_IMPL(VPDef::VPWidenSelectSC)
@@ -1822,9 +1817,12 @@ class LLVM_ABI_FOR_TEST VPWidenGEPRecipe : public VPRecipeWithIRFlags {
   }
 
 public:
-  VPWidenGEPRecipe(GetElementPtrInst *GEP, ArrayRef<VPValue *> Operands)
-      : VPRecipeWithIRFlags(VPDef::VPWidenGEPSC, Operands, *GEP),
+  VPWidenGEPRecipe(GetElementPtrInst *GEP, ArrayRef<VPValue *> Operands,
+                   const VPIRFlags &Flags = {},
+                   DebugLoc DL = DebugLoc::getUnknown())
+      : VPRecipeWithIRFlags(VPDef::VPWidenGEPSC, Operands, Flags, DL),
         SourceElementTy(GEP->getSourceElementType()) {
+    setUnderlyingValue(GEP);
     SmallVector<std::pair<unsigned, MDNode *>> Metadata;
     (void)Metadata;
     getMetadataToPropagate(GEP, Metadata);
@@ -1835,7 +1833,7 @@ class LLVM_ABI_FOR_TEST VPWidenGEPRecipe : public VPRecipeWithIRFlags {
 
   VPWidenGEPRecipe *clone() override {
     return new VPWidenGEPRecipe(cast<GetElementPtrInst>(getUnderlyingInstr()),
-                                operands());
+                                operands(), *this, getDebugLoc());
   }
 
   VP_CLASSOF_IMPL(VPDef::VPWidenGEPSC)
@@ -2929,10 +2927,12 @@ class LLVM_ABI_FOR_TEST VPReplicateRecipe : public VPRecipeWithIRFlags,
 public:
   VPReplicateRecipe(Instruction *I, ArrayRef<VPValue *> Operands,
                     bool IsSingleScalar, VPValue *Mask = nullptr,
-                    VPIRMetadata Metadata = {})
-      : VPRecipeWithIRFlags(VPDef::VPReplicateSC, Operands, *I),
+                    const VPIRFlags &Flags = {}, VPIRMetadata Metadata = {},
+                    DebugLoc DL = DebugLoc::getUnknown())
+      : VPRecipeWithIRFlags(VPDef::VPReplicateSC, Operands, Flags, DL),
         VPIRMetadata(Metadata), IsSingleScalar(IsSingleScalar),
         IsPredicated(Mask) {
+    setUnderlyingValue(I);
     if (Mask)
       addOperand(Mask);
   }
@@ -2940,9 +2940,9 @@ class LLVM_ABI_FOR_TEST VPReplicateRecipe : public VPRecipeWithIRFlags,
   ~VPReplicateRecipe() override = default;
 
   VPReplicateRecipe *clone() override {
-    auto *Copy =
-        new VPReplicateRecipe(getUnderlyingInstr(), operands(), IsSingleScalar,
-                              isPredicated() ? getMask() : nullptr, *this);
+    auto *Copy = new VPReplicateRecipe(
+        getUnderlyingInstr(), operands(), IsSingleScalar,
+        isPredicated() ? getMask() : nullptr, *this, *this, getDebugLoc());
     Copy->transferFlags(*this);
     return Copy;
   }
diff --git a/llvm/lib/Transforms/Vectorize/VPlanConstruction.cpp b/llvm/lib/Transforms/Vectorize/VPlanConstruction.cpp
index 612202d049774..dbbde1cafa9f2 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanConstruction.cpp
+++ b/llvm/lib/Transforms/Vectorize/VPlanConstruction.cpp
@@ -190,7 +190,7 @@ void PlainCFGBuilder::createVPInstructionsForVPBB(VPBasicBlock *VPBB,
       // recipes.
       if (Br->isConditional()) {
         VPValue *Cond = getOrCreateVPOperand(Br->getCondition());
-        VPIRBuilder.createNaryOp(VPInstruction::BranchOnCond, {Cond}, Inst,
+        VPIRBuilder.createNaryOp(VPInstruction::BranchOnCond, {Cond}, Inst, {},
                                  VPIRMetadata(*Inst), Inst->getDebugLoc());
       }
 
@@ -205,7 +205,7 @@ void PlainCFGBuilder::createVPInstructionsForVPBB(VPBasicBlock *VPBB,
       SmallVector<VPValue *> Ops = {getOrCreateVPOperand(SI->getCondition())};
       for (auto Case : SI->cases())
         Ops.push_back(getOrCreateVPOperand(Case.getCaseValue()));
-      VPIRBuilder.createNaryOp(Instruction::Switch, Ops, Inst,
+      VPIRBuilder.createNaryOp(Instruction::Switch, Ops, Inst, {},
                                VPIRMetadata(*Inst), Inst->getDebugLoc());
       continue;
     }
@@ -255,13 +255,14 @@ void PlainCFGBuilder::createVPInstructionsForVPBB(VPBasicBlock *VPBB,
       if (auto *CI = dyn_cast<CastInst>(Inst)) {
         NewR = VPIRBuilder.createScalarCast(CI->getOpcode(), VPOperands[0],
                                             CI->getType(), CI->getDebugLoc(),
-                                            {}, MD);
+                                            VPIRFlags(*CI), MD);
         NewR->setUnderlyingValue(CI);
       } else {
         // Build VPInstruction for any arbitrary Instruction without specific
         // representation in VPlan.
-        NewR = VPIRBuilder.createNaryOp(Inst->getOpcode(), VPOperands, Inst, MD,
-                                        Inst->getDebugLoc());
+        NewR =
+            VPIRBuilder.createNaryOp(Inst->getOpcode(), VPOperands, Inst,
+                                     VPIRFlags(*Inst), MD, Inst->getDebugLoc());
       }
     }
 
diff --git a/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp b/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp
index fca6554ad77c6..ef36e29aaa5c4 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp
+++ b/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp
@@ -2056,24 +2056,26 @@ bool VPIRFlags::flagsValidForOpcode(unsigned Opcode) const {
   switch (OpType) {
   case OperationType::OverflowingBinOp:
     return Opcode == Instruction::Add || Opcode == Instruction::Sub ||
-           Opcode == Instruction::Mul ||
+           Opcode == Instruction::Mul || Opcode == Instruction::Shl ||
            Opcode == VPInstruction::VPInstruction::CanonicalIVIncrementForPart;
   case OperationType::Trunc:
     return Opcode == Instruction::Trunc;
   case OperationType::DisjointOp:
     return Opcode == Instruction::Or;
   case OperationType::PossiblyExactOp:
-    return Opcode == Instruction::AShr;
+    return Opcode == Instruction::AShr || Opcode == Instruction::LShr ||
+           Opcode == Instruction::UDiv || Opcode == Instruction::SDiv;
   case OperationType::GEPOp:
     return Opcode == Instruction::GetElementPtr ||
            Opcode == VPInstruction::PtrAdd ||
            Opcode == VPInstruction::WidePtrAdd;
   case OperationType::FPMathOp:
-    return Opcode == Instruction::FAdd || Opcode == Instruction::FMul ||
-           Opcode == Instruction::FSub || Opcode == Instruction::FNeg ||
-           Opcode == Instruction::FDiv || Opcode == Instruction::FRem ||
-           Opcode == Instruction::FPExt || Opcode == Instruction::FPTrunc ||
-           Opcode == Instruction::FCmp || Opcode == Instruction::Select ||
+    return Opcode == Instruction::Call || Opcode == Instruction::FAdd ||
+           Opcode == Instruction::FMul || Opcode == Instruction::FSub ||
+           Opcode == Instruction::FNeg || Opcode == Instruction::FDiv ||
+           Opcode == Instruction::FRem || Opcode == Instruction::FPExt ||
+           Opcode == Instruction::FPTrunc || Opcode == Instruction::FCmp ||
+           Opcode == Instruction::Select ||
            Opcode == VPInstruction::WideIVStep ||
            Opcode == VPInstruction::ReductionStartVector ||
            Opcode == VPInstruction::ComputeReductionResult;
diff --git a/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp b/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp
index 26563242de283..25557f1d5d651 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp
+++ b/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp
@@ -104,24 +104,26 @@ bool VPlanTransforms::tryToConvertVPInstructionsToVPRecipes(
               nullptr /*Mask*/, false /*Consecutive*/, false /*Reverse*/, *VPI,
               Ingredient.getDebugLoc());
         } else if (GetElementPtrInst *GEP = dyn_cast<GetElementPtrInst>(Inst)) {
-          NewRecipe = new VPWidenGEPRecipe(GEP, Ingredient.operands());
+          NewRecipe = new VPWidenGEPRecipe(GEP, Ingredient.operands(), *VPI,
+                                           Ingredient.getDebugLoc());
         } else if (CallInst *CI = dyn_cast<CallInst>(Inst)) {
           Intrinsic::ID VectorID = getVectorIntrinsicIDForCall(CI, &TLI);
           if (VectorID == Intrinsic::not_intrinsic)
             return false;
           NewRecipe = new VPWidenIntrinsicRecipe(
               *CI, getVectorIntrinsicIDForCall(CI, &TLI),
-              drop_end(Ingredient.operands()), CI->getType(), *VPI,
-              CI->getDebugLoc());
+              drop_end(Ingredient.operands()), CI->getType(), VPIRFlags(*CI),
+              *VPI, CI->getDebugLoc());
         } else if (SelectInst *SI = dyn_cast<SelectInst>(Inst)) {
-          NewRecipe = new VPWidenSelectRecipe(*SI, Ingredient.operands(), *VPI);
+          NewRecipe = new VPWidenSelectRecipe(SI, Ingredient.operands(), *VPI,
+                                              *VPI, Ingredient.getDebugLoc());
         } else if (auto *CI = dyn_cast<CastInst>(Inst)) {
-          NewRecipe =
-              new VPWidenCastRecipe(CI->getOpcode(), Ingredient.getOperand(0),
-                                    CI->getType(), *CI, *VPI);
+          NewRecipe = new VPWidenCastRecipe(
+              CI->getOpcode(), Ingredient.getOperand(0), CI->getType(), CI,
+              VPIRFlags(*CI), VPIRMetadata(*CI));
         } else {
           NewRecipe = new VPWidenRecipe(*Inst, Ingredient.operands(), *VPI,
-                                        Ingredient.getDebugLoc());
+                                        *VPI, Ingredient.getDebugLoc());
         }
       }
 
@@ -226,7 +228,8 @@ static bool sinkScalarOperands(VPlan &Plan) {
         // then cloning should be sufficient here.
         Instruction *I = SinkCandidate->getUnderlyingInstr();
         Clone = new VPReplicateRecipe(I, SinkCandidate->operands(), true,
-                                      nullptr /*Mask*/, *SinkCandidateRepR);
+                                      nullptr /*Mask*/, *SinkCandidateRepR,
+                                      *SinkCandidateRepR);
         // TODO: add ".cloned" suffix to name of Clone's VPValue.
       } else {
         Clone = SinkCandidate->clone();
@@ -385,7 +388,8 @@ static VPRegionBlock *createReplicateRegion(VPReplicateRecipe *PredRecipe,
   // mask but in the replicate region.
   auto *RecipeWithoutMask = new VPReplicateRecipe(
       PredRecipe->getUnderlyingInstr(), drop_end(PredRecipe->operands()),
-      PredRecipe->isSingleScalar(), nullptr /*Mask*/, *PredRecipe);
+      PredRecipe->isSingleScalar(), nullptr /*Mask*/, *PredRecipe, *PredRecipe,
+      PredRecipe->getDebugLoc());
   auto *Pred =
       Plan.createVPBasicBlock(Twine(RegionName) + ".if", RecipeWithoutMask);
 
@@ -691,7 +695,7 @@ static void legalizeAndOptimizeInductions(VPlan &Plan) {
     // analysis.
     auto Users = collectUsersRecursively(PhiR);
     for (VPUser *U : reverse(Users)) {
-      auto *Def = dyn_cast<VPSingleDefRecipe>(U);
+      auto *Def = dyn_cast<VPRecipeWithIRFlags>(U);
       auto *RepR = dyn_cast<VPReplicateRecipe>(U);
       // Skip recipes that shouldn't be narrowed.
       if (!Def || !isa<VPReplicateRecipe, VPWidenRecipe>(Def) ||
@@ -704,7 +708,8 @@ static void legalizeAndOptimizeInductions(VPlan &Plan) {
         continue;
 
       auto *Clone = new VPReplicateRecipe(Def->getUnderlyingInstr(),
-                                          Def->operands(), /*IsUniform*/ true);
+                                          Def->operands(), /*IsUniform*/ true,
+                                          /*Mask*/ nullptr, /*Flags*/ *Def);
       Clone->insertAfter(Def);
       Def->replaceAllUsesWith(Clone);
     }
@@ -1423,12 +1428,13 @@ static void narrowToSingleScalarRecipes(VPlan &Plan) {
       if (RepR && (RepR->isSingleScalar() || RepR->isPredicated()))
         continue;
 
-      auto *RepOrWidenR = cast<VPSingleDefRecipe>(&R);
+      auto *RepOrWidenR = cast<VPRecipeWithIRFlags>(&R);
       if (RepR && isa<StoreInst>(RepR->getUnderlyingInstr()) &&
           vputils::isSingleScalar(RepR->getOperand(1))) {
         auto *Clone = new VPReplicateRecipe(
             RepOrWidenR->getUnderlyingInstr(), RepOrWidenR->operands(),
-            true /*IsSingleScalar*/, nullptr /*Mask*/, *RepR /*Metadata*/);
+            true /*IsSingleScalar*/, nullptr /*Mask*/, *RepR /*Flags*/,
+            *RepR /*Metadata*/, RepR->getDebugLoc());
         Clone->insertBefore(RepOrWidenR);
         unsigned ExtractOpc =
             vputils::isUniformAcrossVFsAndUFs(RepR->getOperand(1))
@@ -1469,9 +1475,9 @@ static void narrowToSingleScalarRecipes(VPlan &Plan) {
           }))
         continue;
 
-      auto *Clone = new VPReplicateRecipe(RepOrWidenR->getUnderlyingInstr(),
-                                          RepOrWidenR->operands(),
-                                          true /*IsSingleScalar*/);
+      auto *Clone = new VPReplicateRecipe(
+          RepOrWidenR->getUnderlyingInstr(), RepOrWidenR->operands(),
+          true /*IsSingleScalar*/, nullptr, *RepOrWidenR);
       Clone->insertBefore(RepOrWidenR);
       RepOrWidenR->replaceAllUsesWith(Clone);
       if (isDeadRecipe(*RepOrWidenR))
@@ -3824,15 +3830,15 @@ tryToMatchAndCreateMulAccumulateReduction(VPReductionRecipe *Red,
         Ext0->getOpcode() == Ext1->getOpcode() &&
         IsMulAccValidAndClampRange(Mul, Ext0, Ext1, Ext) && Mul->hasOneUse()) {
       auto *NewExt0 = new VPWidenCastRecipe(
-          Ext0->getOpcode(), Ext0->getOperand(0), Ext->getResultType(), *Ext0,
-          *Ext0, Ext0->getDebugLoc());
+          Ext0->getOpcode(), Ext0->getOperand(0), Ext->getResultType(), nullptr,
+          *Ext0, *Ext0, Ext0->getDebugLoc());
       NewExt0->insertBefore(Ext0);
 
       VPWidenCastRecipe *NewExt1 = NewExt0;
       if (Ext0 != Ext1) {
         NewExt1 = new VPWidenCastRecipe(Ext1->getOpcode(), Ext1->getOperand(0),
-                                        Ext->getResultType(), *Ext1, *Ext1,
-                                        Ext1->getDebugLoc());
+                                        Ext->getResultType(), nullptr, *Ext1,
+                                        *Ext1, Ext1->getDebugLoc());
         NewExt1->insertBefore(Ext1);
       }
       Mul->setOperand(0, NewExt0);
@@ -4353,7 +4359,7 @@ narrowInterleaveGroupOp(VPValue *V, SmallPtrSetImpl<VPValue *> &NarrowedOps) {
   // process one original iteration.
   auto *N = new VPReplicateRecipe(&WideLoad->getIngredient(), {PtrOp},
                                   /*IsUniform*/ true,
-                                  /*Mask*/ nullptr, *WideLoad);
+                                  /*Mask*/ nullptr, {}, *WideLoad);
   N->insertBefore(WideLoad);
   NarrowedOps.insert(N);
   return N;
diff --git a/llvm/lib/Transforms/Vectorize/VPlanUnroll.cpp b/llvm/lib/Transforms/Vectorize/VPlanUnroll.cpp
index d4b8b72beb942..d76d2ed5f1c76 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanUnroll.cpp
+++ b/llvm/lib/Transforms/Vectorize/VPlanUnroll.cpp
@@ -518,9 +518,9 @@ cloneForLane(VPlan &Plan, VPBuilder &Builder, Type *IdxTy,
     // TODO: have cloning of replicate recipes also provide the desired result
     // coupled with setting its operands to NewOps (deriving IsSingleScalar and
     // Mask from the operands?)
-    New =
-        new VPReplicateRecipe(RepR->getUnderlyingInstr(), NewOps,
-                              /*IsSingleScalar=*/true, /*Mask=*/nullptr, *RepR);
+    New = new VPReplicateRecipe(RepR->getUnderlyingInstr(), NewOps,
+                                /*IsSingleScalar=*/true, /*Mask=*/nullptr,
+                                *RepR, *RepR, RepR->getDebugLoc());
   } else {
     assert(isa<VPInstruction>(DefR) &&
            "DefR must be a VPReplicateRecipe or VPInstruction");
diff --git a/llvm/test/Transforms/LoopVectorize/vplan-printing-outer-loop.ll b/llvm/test/Transforms/LoopVectorize/vplan-printing-outer-loop.ll
index 20676f3702294..10c265519952b 100644
--- a/llvm/test/Transforms/LoopVectorize/vplan-printing-outer-loop.ll
+++ b/llvm/test/Transforms/LoopVectorize/vplan-printing-outer-loop.ll
@@ -14,23 +14,23 @@ define void @foo(i64 %n) {
 ; CHECK-EMPTY:
 ; CHECK-NEXT: outer.header:
 ; CHECK-NEXT:   EMIT-SCALAR ir<%outer.iv> = phi [ ir<%outer.iv.next>, outer.latch ], [ ir<0>, ir-bb<entry> ]
-; CHECK-NEXT:   EMIT ir<%gep.1> = getelementptr ir<@arr2>, ir<0>, ir<%outer.iv>
+; CHECK-NEXT:   EMIT ir<%gep.1> = getelementptr inbounds ir<@arr2>, ir<0>, ir<%outer.iv>
 ; CHECK-NEXT:   EMIT store ir<%outer.iv>, ir<%gep.1>
-; CHECK-NEXT:   EMIT ir<%add> = add ir<%outer.iv>, ir<%n>
+; CHECK-NEXT:   EMIT ir<%add> = add nsw ir<%outer.iv>, ir<%n>
 ; CHECK-NEXT: Successor(s): inner
 ; CHECK-EMPTY:
 ; CHECK-NEXT: inner:
 ; CHECK-NEXT:   EMIT-SCALAR ir<%inner.iv> = phi [ ir<%inner.iv.next>, inner ], [ ir<0>, outer.header ]
-; CHECK-NEXT:   EMIT ir<%gep.2> = getelementptr ir<@arr>, ir<0>, ir<%inner.iv>, ir<%outer.iv>
+; CHECK-NEXT:   EMIT ir<%gep.2> = getelementptr inbounds ir<@arr>, ir<0>, ir<%inner.iv>, ir<%outer.iv>
 ; CHECK-NEXT:   EMIT store ir<%add>, ir<%gep.2>
-; CHECK-NEXT:   EMIT ir<%inner.iv.next> = add ir<%inner.iv>, ir<1>
-; CHECK-NEXT:   EMIT ir<%inner.ec> = icmp ir<%inner.iv.next>, ir<8>
+; CHECK-NEXT:   EMIT ir<%inner.iv.next> = add nuw nsw ir<%inner.iv>, ir<1>
+; CHECK-NEXT:   EMIT ir<%inner.ec> = icmp eq ir<%inner.iv.next>, ir<8>
 ; CHECK-NEXT:   EMIT branch-on-cond ir<%inner.ec>
 ; CHECK-NEXT: Successor(s): outer.latch, inner
 ; CHECK-EMPTY:
 ; CHECK-NEXT: outer.latch:
-; CHECK-NEXT:   EMIT ir<%outer.iv.next> = add ir<%outer.iv>, ir<1>
-; CHECK-NEXT:   EMIT ir<%outer.ec> = icmp ir<%outer.iv.next>, ir<8>
+; CHECK-NEXT:   EMIT ir<%outer.iv.next> = add nuw nsw ir<%outer.iv>, ir<1>
+; CHECK-NEXT:   EMIT ir<%outer.ec> = icmp eq ir<%outer.iv.next>, ir<8>
 ; CHECK-NEXT:   EMIT branch-on-cond ir<%outer.ec>
 ; CHECK-NEXT: Successor(s): ir-bb<exit>, outer.header
 ; CHECK-EMPTY:
diff --git a/llvm/unittests/Transforms/Vectorize/VPlanHCFGTest.cpp b/llvm/unittests/Transforms/Vectorize/VPlanHCFGTest.cpp
index b99d656c5c50f..5742df2aa3c53 100644
--- a/llvm/unittests/Transforms/Vectorize/VPlanHCFGTest.cpp
+++ b/llvm/unittests/Transforms/Vectorize/VPlanHCFGTest.cpp
@@ -139,12 +139,12 @@ compound=true
       "vector.body:\l" +
       "  EMIT vp\<%2\> = CANONICAL-INDUCTION ir\<0\>, vp\<%index.next\>\l" +
       "  EMIT-SCALAR ir\<%indvars.iv\> = phi [ ir\<0\>, vector.ph ], [ ir\<%indvars.iv.next\>, vector.body ]\l" +
-      "  EMIT ir\<%arr.idx\> = getelementptr ir\<%A\>, ir\<%indvars.iv\>\l" +
+      "  EMIT ir\<%arr.idx\> = getelementptr inbounds ir\<%A\>, ir\<%indvars.iv\>\l" +
       "  EMIT ir\<%l1\> = load ir\<%arr.idx\>\l" +
       "  EMIT ir\<%res\> = add ir\<%l1\>, ir\<10\>\l" +
       "  EMIT store ir\<%res\>, ir\<%arr.idx\>\l" +
       "  EMIT ir\<%indvars.iv.next\> = add ir\<%indvars.iv\>, ir\<1\>\l" +
-      "  EMIT ir\<%exitcond\> = icmp ir\<%indvars.iv.next\>, ir\<%N\>\l" +
+      "  EMIT ir\<%exitcond\> = icmp ne ir\<%indvars.iv.next\>, ir\<%N\>\l" +
       "  EMIT vp\<%3\> = not ir\<%exitcond\>\l" +
       "  EMIT vp\<%index.next\> = add nuw vp\<%2\>, vp\<%0\>\l" +
       "  EMIT branch-on-count vp\<%index.next\>, vp\<%1\>\l" +
@@ -305,9 +305,9 @@ compound=true
       "vector.body:\l" +
       "  EMIT vp\<%2\> = CANONICAL-INDUCTION ir\<0\>, vp\<%index.next\>\l" +
       "  EMIT-SCALAR ir\<%iv\> = phi [ ir\<0\>, vector.ph ], [ ir\<%iv.next\>, loop.latch ]\l" +
-      "  EMIT ir\<%arr.idx\> = getelementptr ir\<%A\>, ir\<%iv\>\l" +
+      "  EMIT ir\<%arr.idx\> = getelementptr inbounds ir\<%A\>, ir\<%iv\>\l" +
       "  EMIT ir\<%l1\> = load ir\<%arr.idx\>\l" +
-      "  EMIT ir\<%c\> = icmp ir\<%l1\>, ir\<0\>\l" +
+      "  EMIT ir\<%c\> = icmp eq ir\<%l1\>, ir\<0\>\l" +
       "Successor(s): loop.latch\l"
     ]
     N4 -> N6 [ label=""]
@@ -316,7 +316,7 @@ compound=true
       "  EMIT ir\<%res\> = add ir\<%l1\>, ir\<10\>\l" +
       "  EMIT store ir\<%res\>, ir\<%arr.idx\>\l" +
       "  EMIT ir\<%iv.next\> = add ir\<%iv\>, ir\<1\>\l" +
-      "  EMIT ir\<%exitcond\> = icmp ir\<%iv.next\>, ir\<%N\>\l" +
+      "  EMIT ir\<%exitcond\> = icmp ne ir\<%iv.next\>, ir\<%N\>\l" +
       "  EMIT vp\<%3\> = not ir\<%exitcond\>\l" +
       "  EMIT vp\<%index.next\> = add nuw vp\<%2\>, vp\<%0\>\l" +
       "  EMIT branch-on-count vp\<%index.next\>, vp\<%1\>\l" +
diff --git a/llvm/unittests/Transforms/Vectorize/VPlanTest.cpp b/llvm/unittests/Transforms/Vectorize/VPlanTest.cpp
index 3842ba235ead3..63776b78a2088 100644
--- a/llvm/unittests/Transforms/Vectorize/VPlanTest.cpp
+++ b/llvm/unittests/Transforms/Vectorize/VPlanTest.cpp
@@ -1009,7 +1009,7 @@ TEST_F(VPRecipeTest, CastVPWidenRecipeToVPUser) {
   SmallVector<VPValue *, 2> Args;
   Args.push_back(Op1);
   Args.push_back(Op2);
-  VPWidenRecipe WidenR(*AI, Args, VPIRMetadata(), DebugLoc());
+  VPWidenRecipe WidenR(*AI, Args);
 
   checkVPRecipeCastImpl<VPWidenRecipe, VPUser, VPIRMetadata>(&WidenR);
   delete AI;
@@ -1053,7 +1053,7 @@ TEST_F(VPRecipeTest, CastVPWidenSelectRecipeToVPUserAndVPDef) {
   Args.push_back(Op1);
   Args.push_back(Op2);
   Args.push_back(Op3);
-  VPWidenSelectRecipe WidenSelectR(*SelectI,
+  VPWidenSelectRecipe WidenSelectR(SelectI,
                                    make_range(Args.begin(), Args.end()));
 
   checkVPRecipeCastImpl<VPWidenSelectRecipe, VPUser, VPIRMetadata>(
@@ -1093,7 +1093,7 @@ TEST_F(VPRecipeTest, CastVPWidenCastRecipeToVPUser) {
   IntegerType *Int64 = IntegerType::get(C, 64);
   auto *Cast = CastInst::CreateZExtOrBitCast(PoisonValue::get(Int32), Int64);
   VPValue *Op1 = Plan.getOrAddLiveIn(ConstantInt::get(Int32, 1));
-  VPWidenCastRecipe Recipe(Instruction::ZExt, Op1, Int64, *Cast, {});
+  VPWidenCastRecipe Recipe(Instruction::ZExt, Op1, Int64, Cast);
 
   checkVPRecipeCastImpl<VPWidenCastRecipe, VPUser, VPIRMetadata>(&Recipe);
   delete Cast;
@@ -1264,7 +1264,7 @@ TEST_F(VPRecipeTest, MayHaveSideEffectsAndMayReadWriteMemory) {
     SmallVector<VPValue *, 2> Args;
     Args.push_back(Op1);
     Args.push_back(Op2);
-    VPWidenRecipe Recipe(*AI, Args, VPIRMetadata(), DebugLoc());
+    VPWidenRecipe Recipe(*AI, Args);
     EXPECT_FALSE(Recipe.mayHaveSideEffects());
     EXPECT_FALSE(Recipe.mayReadFromMemory());
     EXPECT_FALSE(Recipe.mayWriteToMemory());
@@ -1283,7 +1283,7 @@ TEST_F(VPRecipeTest, MayHaveSideEffectsAndMayReadWriteMemory) {
     Args.push_back(Op1);
     Args.push_back(Op2);
     Args.push_back(Op3);
-    VPWidenSelectRecipe Recipe(*SelectI, make_range(Args.begin(), Args.end()));
+    VPWidenSelectRecipe Recipe(SelectI, make_range(Args.begin(), Args.end()));
     EXPECT_FALSE(Recipe.mayHaveSideEffects());
     EXPECT_FALSE(Recipe.mayReadFromMemory());
     EXPECT_FALSE(Recipe.mayWriteToMemory());
@@ -1412,7 +1412,7 @@ TEST_F(VPRecipeTest, MayHaveSideEffectsAndMayReadWriteMemory) {
     Args.push_back(Op1);
     Args.push_back(Op2);
     Args.push_back(CalledFn);
-    VPWidenCallRecipe Recipe(Call, TheFn, Args);
+    VPWidenCallRecipe Recipe(Call, TheFn, Args, VPIRFlags(), VPIRMetadata());
     EXPECT_FALSE(Recipe.mayHaveSideEffects());
     EXPECT_FALSE(Recipe.mayReadFromMemory());
     EXPECT_FALSE(Recipe.mayWriteToMemory());
@@ -1468,8 +1468,7 @@ TEST_F(VPRecipeTest, dumpRecipeInPlan) {
   VPValue *ExtVPV2 = Plan.getOrAddLiveIn(ConstantInt::get(Int32, 2));
   Args.push_back(ExtVPV1);
   Args.push_back(ExtVPV2);
-  VPWidenRecipe *WidenR =
-      new VPWidenRecipe(*AI, Args, VPIRMetadata(), DebugLoc());
+  VPWidenRecipe *WidenR = new VPWidenRecipe(*AI, Args);
   VPBB1->appendRecipe(WidenR);
 
   {

From 1e18b4885bb44cfe7b03990274ab9de9d94935e0 Mon Sep 17 00:00:00 2001
From: Kazu Hirata <kazu@google.com>
Date: Tue, 18 Nov 2025 07:16:41 -0800
Subject: [PATCH 18/33] [DWARFCFIChecker] Remove an unused local variable (NFC)
 (#168487)

Note that getCurrentUnwindRow does not change any state.

Identified with unused-local-non-trivial-variable.
---
 llvm/lib/DWARFCFIChecker/DWARFCFIState.cpp | 1 -
 1 file changed, 1 deletion(-)

diff --git a/llvm/lib/DWARFCFIChecker/DWARFCFIState.cpp b/llvm/lib/DWARFCFIChecker/DWARFCFIState.cpp
index bca820fa807c8..4acc064dbc212 100644
--- a/llvm/lib/DWARFCFIChecker/DWARFCFIState.cpp
+++ b/llvm/lib/DWARFCFIChecker/DWARFCFIState.cpp
@@ -64,7 +64,6 @@ dwarf::CFIProgram DWARFCFIState::convert(MCCFIInstruction Directive) {
       /* CodeAlignmentFactor */ 1, /* DataAlignmentFactor */ 1,
       Context->getTargetTriple().getArch());
 
-  auto MaybeCurrentRow = getCurrentUnwindRow();
   switch (Directive.getOperation()) {
   case MCCFIInstruction::OpSameValue:
     CFIP.addInstruction(dwarf::DW_CFA_same_value, Directive.getRegister());

From 4749cc407114f1e2da591491aacd0a8d3afb54e0 Mon Sep 17 00:00:00 2001
From: Kazu Hirata <kazu@google.com>
Date: Tue, 18 Nov 2025 07:16:50 -0800
Subject: [PATCH 19/33] [Bitcode] Use a range-based for loop (NFC) (#168489)

Identified with modernize-loop-convert.
---
 llvm/lib/Bitcode/Writer/ValueEnumerator.cpp | 5 ++---
 1 file changed, 2 insertions(+), 3 deletions(-)

diff --git a/llvm/lib/Bitcode/Writer/ValueEnumerator.cpp b/llvm/lib/Bitcode/Writer/ValueEnumerator.cpp
index f497c574ee75d..36d0d35d024cc 100644
--- a/llvm/lib/Bitcode/Writer/ValueEnumerator.cpp
+++ b/llvm/lib/Bitcode/Writer/ValueEnumerator.cpp
@@ -616,9 +616,8 @@ void ValueEnumerator::OptimizeConstants(unsigned CstStart, unsigned CstEnd) {
 /// EnumerateValueSymbolTable - Insert all of the values in the specified symbol
 /// table into the values table.
 void ValueEnumerator::EnumerateValueSymbolTable(const ValueSymbolTable &VST) {
-  for (ValueSymbolTable::const_iterator VI = VST.begin(), VE = VST.end();
-       VI != VE; ++VI)
-    EnumerateValue(VI->getValue());
+  for (const auto &VI : VST)
+    EnumerateValue(VI.getValue());
 }
 
 /// Insert all of the values referenced by named metadata in the specified

From 00ef94805a8c3ced416f8854b80452eb7d0bac2a Mon Sep 17 00:00:00 2001
From: Kazu Hirata <kazu@google.com>
Date: Tue, 18 Nov 2025 07:16:58 -0800
Subject: [PATCH 20/33] [AMDGPU] Remove const on a return type. (#168490)

While I am at it, this patch switches to the constructor that takes
a container instead of a pair of begin/end.

Identified with readability-const-return-type.
---
 llvm/lib/Target/AMDGPU/AMDGPUSplitModule.cpp | 4 +---
 1 file changed, 1 insertion(+), 3 deletions(-)

diff --git a/llvm/lib/Target/AMDGPU/AMDGPUSplitModule.cpp b/llvm/lib/Target/AMDGPU/AMDGPUSplitModule.cpp
index 9af812960542c..b7078825928be 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUSplitModule.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUSplitModule.cpp
@@ -314,9 +314,7 @@ class SplitGraph {
 #endif
 
   bool empty() const { return Nodes.empty(); }
-  const iterator_range<nodes_iterator> nodes() const {
-    return {Nodes.begin(), Nodes.end()};
-  }
+  iterator_range<nodes_iterator> nodes() const { return Nodes; }
   const Node &getNode(unsigned ID) const { return *Nodes[ID]; }
 
   unsigned getNumNodes() const { return Nodes.size(); }

From cc0c899765db2c9a2ec16ff11824a8c1055174bb Mon Sep 17 00:00:00 2001
From: Michael Liao <michael.hliao@gmail.com>
Date: Tue, 18 Nov 2025 10:16:15 -0500
Subject: [PATCH 21/33] [clang][CIR] Temporarily fix CIR codegen test on call.
 NFC

- MemoryEffectsAttr in MLIR LLVM dialect is out of sync with LLVM
  itself.
---
 clang/test/CIR/CodeGen/call.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/clang/test/CIR/CodeGen/call.c b/clang/test/CIR/CodeGen/call.c
index d780e37f3d153..99ae4506b1f16 100644
--- a/clang/test/CIR/CodeGen/call.c
+++ b/clang/test/CIR/CodeGen/call.c
@@ -130,7 +130,7 @@ int f12(void) {
 // OGCG:         %{{.+}} = call i32 @f10(i32 noundef 1) #[[ATTR0:.+]]
 // OGCG-NEXT:    %{{.+}} = call i32 @f11(i32 noundef 2) #[[ATTR1:.+]]
 
-// LLVM: attributes #[[ATTR0]] = { nounwind willreturn memory(read, errnomem: none) }
+// LLVM: attributes #[[ATTR0]] = { nounwind willreturn memory(read, errnomem: none, target_mem0: none, target_mem1: none) }
 // LLVM: attributes #[[ATTR1]] = { nounwind willreturn memory(none) }
 
 // OGCG: attributes #[[ATTR0]] = { nounwind willreturn memory(read) }

From 906f17566c3ad30696d5b51016acaa52e4c88ecc Mon Sep 17 00:00:00 2001
From: Jessica Clarke <jrtc27@jrtc27.com>
Date: Tue, 18 Nov 2025 15:23:18 +0000
Subject: [PATCH 22/33] [ELF][AArch64] Fix copy/paste error in llvm_unreachable
 message

Fixes: e1979aed0a15 ("Implement gd to ie relaxation for aarch64.")
---
 lld/ELF/Arch/AArch64.cpp | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/lld/ELF/Arch/AArch64.cpp b/lld/ELF/Arch/AArch64.cpp
index 2a97df4785ecb..b0dc797292511 100644
--- a/lld/ELF/Arch/AArch64.cpp
+++ b/lld/ELF/Arch/AArch64.cpp
@@ -762,7 +762,7 @@ void AArch64::relaxTlsGdToIe(uint8_t *loc, const Relocation &rel,
     relocateNoSym(loc, R_AARCH64_TLSIE_LD64_GOTTPREL_LO12_NC, val);
     break;
   default:
-    llvm_unreachable("unsupported relocation for TLS GD to LE relaxation");
+    llvm_unreachable("unsupported relocation for TLS GD to IE relaxation");
   }
 }
 

From 2ede6afff07ad26419f22e00967120dbfc9e5617 Mon Sep 17 00:00:00 2001
From: Aiden Grossman <aidengrossman@google.com>
Date: Tue, 18 Nov 2025 07:25:48 -0800
Subject: [PATCH 23/33] [TSan] Make tests work with internal shell

This patch makes all tsan tests work with the internal shell on Darwin. Tests
were using various features not supported by the internal shell, mainly subshells
and environment variables set without env. This patch also fixes one of the
dynamiclib substitutions to not use a subshell.

Reviewers: ndrewh, DanBlackwell, fmayer, vitalybuka

Reviewed By: DanBlackwell

Pull Request: https://github.com/llvm/llvm-project/pull/168544
---
 compiler-rt/test/lit.common.cfg.py                       | 5 ++++-
 compiler-rt/test/tsan/Darwin/dlopen.cpp                  | 7 ++++---
 .../test/tsan/Darwin/external-ignore-noninstrumented.cpp | 6 ++++--
 compiler-rt/test/tsan/Darwin/external.cpp                | 9 ++++++---
 compiler-rt/test/tsan/Darwin/malloc-stack-logging.cpp    | 2 +-
 5 files changed, 19 insertions(+), 10 deletions(-)

diff --git a/compiler-rt/test/lit.common.cfg.py b/compiler-rt/test/lit.common.cfg.py
index 3f7dd8e402b78..ea22fb0babc46 100644
--- a/compiler-rt/test/lit.common.cfg.py
+++ b/compiler-rt/test/lit.common.cfg.py
@@ -875,7 +875,7 @@ def is_windows_lto_supported():
         config.substitutions.append(
             (
                 "%ld_flags_rpath_so" + postfix,
-                "-install_name @rpath/`basename %dynamiclib{}`".format(postfix),
+                "-install_name @rpath/%base_dynamiclib{}".format(postfix),
             )
         )
     elif config.target_os in ("FreeBSD", "NetBSD", "OpenBSD"):
@@ -908,6 +908,9 @@ def is_windows_lto_supported():
     config.substitutions.append(
         ("%dynamiclib" + postfix, "%t.dir/%xdynamiclib_filename" + postfix)
     )
+    config.substitutions.append(
+        ("%base_dynamiclib" + postfix, "%xdynamiclib_filename" + postfix)
+    )
     config.substitutions.append(
         (
             "%xdynamiclib_filename" + postfix,
diff --git a/compiler-rt/test/tsan/Darwin/dlopen.cpp b/compiler-rt/test/tsan/Darwin/dlopen.cpp
index 3d12b815f9c25..2ab052f1c0c26 100644
--- a/compiler-rt/test/tsan/Darwin/dlopen.cpp
+++ b/compiler-rt/test/tsan/Darwin/dlopen.cpp
@@ -9,14 +9,15 @@
 // RUN: %clangxx_tsan %s -o %t.so -shared -DSHARED_LIB
 // RUN: %clangxx_tsan -fno-sanitize=thread %s -o %t
 
-// RUN: TSAN_DYLIB_PATH=`%clangxx_tsan %s -### 2>&1 \
+// RUN: %clangxx_tsan %s -### 2>&1 \
 // RUN:   | grep "libclang_rt.tsan_osx_dynamic.dylib" \
-// RUN:   | sed -e 's/.*"\(.*libclang_rt.tsan_osx_dynamic.dylib\)".*/\1/'`
+// RUN:   | sed -e 's/.*"\(.*libclang_rt.tsan_osx_dynamic.dylib\)".*/\1/' \
+// RUN:   | tr -d '\n' > %t.tsan_dylib_path
 
 // Launching a non-instrumented binary that dlopen's an instrumented library should fail.
 // RUN: not %run %t %t.so 2>&1 | FileCheck %s --check-prefix=CHECK-FAIL
 // Launching a non-instrumented binary with an explicit DYLD_INSERT_LIBRARIES should work.
-// RUN: DYLD_INSERT_LIBRARIES=$TSAN_DYLIB_PATH %run %t %t.so 2>&1 | FileCheck %s
+// RUN: env DYLD_INSERT_LIBRARIES="%{readfile:%t.tsan_dylib_path}" %run %t %t.so 2>&1 | FileCheck %s
 
 #include <dlfcn.h>
 #include <pthread.h>
diff --git a/compiler-rt/test/tsan/Darwin/external-ignore-noninstrumented.cpp b/compiler-rt/test/tsan/Darwin/external-ignore-noninstrumented.cpp
index 916b0b893fc0d..cfa46e0f0a213 100644
--- a/compiler-rt/test/tsan/Darwin/external-ignore-noninstrumented.cpp
+++ b/compiler-rt/test/tsan/Darwin/external-ignore-noninstrumented.cpp
@@ -1,8 +1,10 @@
+// RUN: basename %t-lib.dylib | tr -d '\n' > %t.basename
 // RUN: %clangxx_tsan -shared %p/external-lib.cpp -fno-sanitize=thread -DUSE_TSAN_CALLBACKS \
-// RUN:   -o %t-lib.dylib -install_name @rpath/`basename %t-lib.dylib`
+// RUN:   -o %t-lib.dylib -install_name @rpath/%{readfile:%t.basename}
 
+// RUN: basename %t-module.dylib | tr -d '\n' > %t.basename
 // RUN: %clangxx_tsan -shared %p/external-noninstrumented-module.cpp %t-lib.dylib -fno-sanitize=thread \
-// RUN:   -o %t-module.dylib -install_name @rpath/`basename %t-module.dylib`
+// RUN:   -o %t-module.dylib -install_name @rpath/%{readfile:%t.basename}
 
 // RUN: %clangxx_tsan %s %t-module.dylib -o %t
 // RUN: %run %t 2>&1 | FileCheck %s
diff --git a/compiler-rt/test/tsan/Darwin/external.cpp b/compiler-rt/test/tsan/Darwin/external.cpp
index bf189eb1d6b5b..52fae36f0e1f4 100644
--- a/compiler-rt/test/tsan/Darwin/external.cpp
+++ b/compiler-rt/test/tsan/Darwin/external.cpp
@@ -1,14 +1,17 @@
+// RUN: basename %t-lib-instrumented.dylib | tr -d '\n' > %t.basename
 // RUN: %clangxx_tsan %p/external-lib.cpp -shared \
 // RUN:                               -o %t-lib-instrumented.dylib \
-// RUN:   -install_name @rpath/`basename %t-lib-instrumented.dylib`
+// RUN:   -install_name @rpath/%{readfile:%t.basename}
 
+// RUN: basename %t-lib-noninstrumented.dylib | tr -d '\n' > %t.basename
 // RUN: %clangxx_tsan %p/external-lib.cpp -shared -fno-sanitize=thread \
 // RUN:                               -o %t-lib-noninstrumented.dylib \
-// RUN:   -install_name @rpath/`basename %t-lib-noninstrumented.dylib`
+// RUN:   -install_name @rpath/%{readfile:%t.basename}
 
+// RUN: basename %t-lib-noninstrumented-callbacks.dylib | tr -d '\n' > %t.basename
 // RUN: %clangxx_tsan %p/external-lib.cpp -shared -fno-sanitize=thread -DUSE_TSAN_CALLBACKS \
 // RUN:                               -o %t-lib-noninstrumented-callbacks.dylib \
-// RUN:   -install_name @rpath/`basename %t-lib-noninstrumented-callbacks.dylib`
+// RUN:   -install_name @rpath/%{readfile:%t.basename}
 
 // RUN: %clangxx_tsan %s %t-lib-instrumented.dylib -o %t-lib-instrumented
 // RUN: %clangxx_tsan %s %t-lib-noninstrumented.dylib -o %t-lib-noninstrumented
diff --git a/compiler-rt/test/tsan/Darwin/malloc-stack-logging.cpp b/compiler-rt/test/tsan/Darwin/malloc-stack-logging.cpp
index 8d9c2122d0e6c..0a96e346f8012 100644
--- a/compiler-rt/test/tsan/Darwin/malloc-stack-logging.cpp
+++ b/compiler-rt/test/tsan/Darwin/malloc-stack-logging.cpp
@@ -4,7 +4,7 @@
 // use syscalls directly) to make sure other interceptors aren't called.
 
 // RUN: %clangxx_tsan -O1 %s -o %t
-// RUN: MallocStackLogging=1 %run %t 2>&1 | FileCheck %s
+// RUN: env MallocStackLogging=1 %run %t 2>&1 | FileCheck %s
 #include <pthread.h>
 #include <stdlib.h>
 #include <stdio.h>

From 40645ed4ed7ce853d9cc76bcc4aeabb6a83a0f2c Mon Sep 17 00:00:00 2001
From: Discookie <viktor.cseh@ericsson.com>
Date: Tue, 18 Nov 2025 15:26:20 +0000
Subject: [PATCH 24/33] [clang-tidy] Add a fully custom message to
 `bugprone-unsafe-functions` (#162443)
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

In some cases, such as when recommending the compiler option
_FORTIFY_SOURCE, the current custom message format is clunky. Now, when
the reason starts with `>`, the replacement string is omitted, so only
the reason is shown.

`^function$,,has a custom message;` - function 'function' has a custom
message; it should not be used
`^function$,,>has a custom message and no replacement suggestion;` -
function 'function' has a custom message and no replacement suggestion

---------

Co-authored-by: Donát Nagy <donat.nagy@ericsson.com>
---
 .../bugprone/UnsafeFunctionsCheck.cpp         | 10 +++-
 clang-tools-extra/docs/ReleaseNotes.rst       | 12 ++++
 .../checks/bugprone/unsafe-functions.rst      | 55 ++++++++++++++-----
 .../bugprone/unsafe-functions-custom.c        |  6 +-
 4 files changed, 63 insertions(+), 20 deletions(-)

diff --git a/clang-tools-extra/clang-tidy/bugprone/UnsafeFunctionsCheck.cpp b/clang-tools-extra/clang-tidy/bugprone/UnsafeFunctionsCheck.cpp
index 5524c4b484be1..67d0931003c54 100644
--- a/clang-tools-extra/clang-tidy/bugprone/UnsafeFunctionsCheck.cpp
+++ b/clang-tools-extra/clang-tidy/bugprone/UnsafeFunctionsCheck.cpp
@@ -301,14 +301,20 @@ void UnsafeFunctionsCheck::check(const MatchFinder::MatchResult &Result) {
   if (Custom) {
     for (const auto &Entry : CustomFunctions) {
       if (Entry.Pattern.match(*FuncDecl)) {
-        const StringRef Reason =
+        StringRef Reason =
             Entry.Reason.empty() ? "is marked as unsafe" : Entry.Reason.c_str();
 
-        if (Entry.Replacement.empty()) {
+        // Omit the replacement, when a fully-custom reason is given.
+        if (Reason.consume_front(">")) {
+          diag(SourceExpr->getExprLoc(), "function %0 %1")
+              << FuncDecl << Reason.trim() << SourceExpr->getSourceRange();
+          // Do not recommend a replacement when it is not present.
+        } else if (Entry.Replacement.empty()) {
           diag(SourceExpr->getExprLoc(),
                "function %0 %1; it should not be used")
               << FuncDecl << Reason << Entry.Replacement
               << SourceExpr->getSourceRange();
+          // Otherwise, emit the replacement.
         } else {
           diag(SourceExpr->getExprLoc(),
                "function %0 %1; '%2' should be used instead")
diff --git a/clang-tools-extra/docs/ReleaseNotes.rst b/clang-tools-extra/docs/ReleaseNotes.rst
index b982216297919..743397e3ec6ce 100644
--- a/clang-tools-extra/docs/ReleaseNotes.rst
+++ b/clang-tools-extra/docs/ReleaseNotes.rst
@@ -69,6 +69,13 @@ Potentially Breaking Changes
   - `CharTypdefsToIgnore` to `CharTypedefsToIgnore` in
     :doc:`bugprone-signed-char-misuse
     <clang-tidy/checks/bugprone/signed-char-misuse>`
+  
+- Modified the custom message format of :doc:`bugprone-unsafe-functions
+  <clang-tidy/checks/bugprone/unsafe-functions>` by assigning a special meaning
+  to the character ``>`` at the start of the reason field of an entry in the
+  option ``CustomFunctions``. If the reason starts with ``>``, then the
+  replacement suggestion part of the message (which would be included by
+  default) is omitted. (This does not change the warning locations.)
 
 - :program:`clang-tidy` now displays warnings from all non-system headers by
   default. Previously, users had to explicitly opt-in to header warnings using
@@ -387,6 +394,11 @@ Changes in existing checks
   <clang-tidy/checks/bugprone/unhandled-self-assignment>` check by adding
   an additional matcher that generalizes the copy-and-swap idiom pattern
   detection.
+  
+- Improved :doc:`bugprone-unsafe-functions
+  <clang-tidy/checks/bugprone/unsafe-functions>` check by hiding the default
+  suffix when the reason starts with the character ``>`` in the
+  ``CustomFunctions`` option.
 
 - Improved :doc:`cppcoreguidelines-avoid-non-const-global-variables
   <clang-tidy/checks/cppcoreguidelines/avoid-non-const-global-variables>` check
diff --git a/clang-tools-extra/docs/clang-tidy/checks/bugprone/unsafe-functions.rst b/clang-tools-extra/docs/clang-tidy/checks/bugprone/unsafe-functions.rst
index f1fec13739271..cb7ea415c54b2 100644
--- a/clang-tools-extra/docs/clang-tidy/checks/bugprone/unsafe-functions.rst
+++ b/clang-tools-extra/docs/clang-tidy/checks/bugprone/unsafe-functions.rst
@@ -96,37 +96,62 @@ to be checked. The format is the following, without newlines:
 The functions are matched using POSIX extended regular expressions.
 *(Note: The regular expressions do not support negative* ``(?!)`` *matches.)*
 
-The `reason` is optional and is used to provide additional information
-about the reasoning behind the replacement. The default reason is
-`is marked as unsafe`.
+The ``reason`` is optional and is used to provide additional information about the
+reasoning behind the replacement. The default reason is ``is marked as unsafe``.
 
-If `replacement` is empty, the text `it should not be used` will be shown
-instead of the suggestion for a replacement.
+If ``replacement`` is empty, the default text ``it should not be used`` will be
+shown instead of the suggestion for a replacement.
 
-As an example, the configuration `^original$, replacement, is deprecated;`
-will produce the following diagnostic message.
+If the ``reason`` starts with the character ``>``, the reason becomes fully custom.
+The default suffix is disabled even if a ``replacement`` is present, and only the
+reason message is shown after the matched function, to allow better control over
+the suggestions. (The starting ``>`` and whitespace directly after it are
+trimmed from the message.)
+
+As an example, the following configuration matches only the function ``original``
+in the default namespace. A similar diagnostic can also be printed using a fully
+custom reason.
 
 .. code:: c
 
+   // bugprone-unsafe-functions.CustomFunctions:
+   //   ^original$, replacement, is deprecated;
+   // Using the fully custom message syntax:
+   //   ^suspicious$,,> should be avoided if possible.
    original(); // warning: function 'original' is deprecated; 'replacement' should be used instead.
+   suspicious(); // warning: function 'suspicious' should be avoided if possible.
    ::std::original(); // no-warning
    original_function(); // no-warning
 
-If the regular expression contains the character `:`, it is matched against the
-qualified name (i.e. ``std::original``), otherwise the regex is matched against the unqualified name (``original``).
-If the regular expression starts with `::` (or `^::`), it is matched against the
-fully qualified name (``::std::original``).
+If the regular expression contains the character ``:``, it is matched against the
+qualified name (i.e. ``std::original``), otherwise the regex is matched against
+the unqualified name (``original``). If the regular expression starts with ``::``
+(or ``^::``), it is matched against the fully qualified name
+(``::std::original``).
+
+One of the use cases for fully custom messages is suggesting compiler options
+and warning flags:
+
+.. code:: c
+
+   // bugprone-unsafe-functions.CustomFunctions:
+   //   ^memcpy$,,>is recommended to have compiler hardening using '_FORTIFY_SOURCE';
+   //   ^printf$,,>is recommended to have the '-Werror=format-security' compiler warning flag;
+
+   memcpy(dest, src, 999'999); // warning: function 'memcpy' is recommended to have compiler hardening using '_FORTIFY_SOURCE'
+   printf(raw_str); // warning: function 'printf' is recommended to have the '-Werror=format-security' compiler warning flag
 
 .. note::
 
-   Fully qualified names can contain template parameters on certain C++ classes, but not on C++ functions.
-   Type aliases are resolved before matching.
+   Fully qualified names can contain template parameters on certain C++ classes,
+   but not on C++ functions. Type aliases are resolved before matching.
 
    As an example, the member function ``open`` in the class ``std::ifstream``
    has a fully qualified name of ``::std::basic_ifstream<char>::open``.
 
-   The example could also be matched with the regex ``::std::basic_ifstream<[^>]*>::open``, which matches all potential
-   template parameters, but does not match nested template classes.
+   The example could also be matched with the regex
+   ``::std::basic_ifstream<[^>]*>::open``, which matches all potential template
+   parameters, but does not match nested template classes.
 
 Options
 -------
diff --git a/clang-tools-extra/test/clang-tidy/checkers/bugprone/unsafe-functions-custom.c b/clang-tools-extra/test/clang-tidy/checkers/bugprone/unsafe-functions-custom.c
index 7fd71ec2f2e7b..7eaf015f06aa2 100644
--- a/clang-tools-extra/test/clang-tidy/checkers/bugprone/unsafe-functions-custom.c
+++ b/clang-tools-extra/test/clang-tidy/checkers/bugprone/unsafe-functions-custom.c
@@ -1,5 +1,5 @@
 // RUN: %check_clang_tidy -check-suffix=NON-STRICT-REGEX %s bugprone-unsafe-functions %t --\
-// RUN:   -config="{CheckOptions: {bugprone-unsafe-functions.CustomFunctions: '::name_match,replacement,is a qualname match;^::prefix_match,,is matched on qualname prefix'}}"
+// RUN:   -config="{CheckOptions: {bugprone-unsafe-functions.CustomFunctions: \"::name_match,,>is a qualname match, but with a fully 'custom' message;^::prefix_match,,is matched on qualname prefix\"}}"
 // RUN: %check_clang_tidy -check-suffix=STRICT-REGEX     %s bugprone-unsafe-functions %t --\
 // RUN:   -config="{CheckOptions: {bugprone-unsafe-functions.CustomFunctions: '^name_match$,replacement,is matched on function name only;^::prefix_match$,,is a full qualname match'}}"
 
@@ -11,14 +11,14 @@ void prefix_match_regex();
 
 void f1() {
   name_match();
-  // CHECK-MESSAGES-NON-STRICT-REGEX: :[[@LINE-1]]:3: warning: function 'name_match' is a qualname match; 'replacement' should be used instead
+  // CHECK-MESSAGES-NON-STRICT-REGEX: :[[@LINE-1]]:3: warning: function 'name_match' is a qualname match, but with a fully 'custom' message
   // CHECK-MESSAGES-STRICT-REGEX: :[[@LINE-2]]:3: warning: function 'name_match' is matched on function name only; 'replacement' should be used instead
   prefix_match();
   // CHECK-MESSAGES-NON-STRICT-REGEX: :[[@LINE-1]]:3: warning: function 'prefix_match' is matched on qualname prefix; it should not be used
   // CHECK-MESSAGES-STRICT-REGEX: :[[@LINE-2]]:3: warning: function 'prefix_match' is a full qualname match; it should not be used
 
   name_match_regex();
-  // CHECK-MESSAGES-NON-STRICT-REGEX: :[[@LINE-1]]:3: warning: function 'name_match_regex' is a qualname match; 'replacement' should be used instead
+  // CHECK-MESSAGES-NON-STRICT-REGEX: :[[@LINE-1]]:3: warning: function 'name_match_regex' is a qualname match, but with a fully 'custom' message
   // no-warning STRICT-REGEX
 
   prefix_match_regex();

From 1fcfd5c67bbabe5f134ef4268c2a890f2b0cfa0f Mon Sep 17 00:00:00 2001
From: Erick Ochoa Lopez <erick.ochoalopez@amd.com>
Date: Tue, 18 Nov 2025 10:35:05 -0500
Subject: [PATCH 25/33] [mlir][amdgpu] Sink op creation in scaled conversion
 intrinsics (NFC) (#168542)

Where possible:

* notifyMatchFailure happens first
* then op.emitOpError
* finally assertions / op creation.

---------

Co-authored-by: Jakub Kuderski <kubakuderski@gmail.com>
---
 .../Conversion/AMDGPUToROCDL/AMDGPUToROCDL.cpp   | 16 ++++++++--------
 1 file changed, 8 insertions(+), 8 deletions(-)

diff --git a/mlir/lib/Conversion/AMDGPUToROCDL/AMDGPUToROCDL.cpp b/mlir/lib/Conversion/AMDGPUToROCDL/AMDGPUToROCDL.cpp
index edc6565f44f00..b9a5e7d7f6eac 100644
--- a/mlir/lib/Conversion/AMDGPUToROCDL/AMDGPUToROCDL.cpp
+++ b/mlir/lib/Conversion/AMDGPUToROCDL/AMDGPUToROCDL.cpp
@@ -1738,15 +1738,11 @@ LogicalResult ScaledExtPacked816OpLowering::matchAndRewrite(
   auto sourceType = cast<VectorType>(op.getSource().getType());
   auto srcElemType = cast<FloatType>(sourceType.getElementType());
   unsigned bitWidth = srcElemType.getWidth();
-  int32_t scaleSel =
-      getScaleSel(blockSize, bitWidth, firstScaleLane, firstScaleByte);
 
   auto targetType = cast<VectorType>(op.getResult().getType());
   auto destElemType = cast<FloatType>(targetType.getElementType());
-  IntegerType i32 = rewriter.getI32Type();
-  Value castedScale =
-      LLVM::BitcastOp::create(rewriter, loc, i32, adaptor.getScale());
 
+  IntegerType i32 = rewriter.getI32Type();
   Value source = adaptor.getSource();
   Type llvmResultType = typeConverter->convertType(op.getResult().getType());
   Type packedType = nullptr;
@@ -1767,15 +1763,19 @@ LogicalResult ScaledExtPacked816OpLowering::matchAndRewrite(
     return rewriter.notifyMatchFailure(op, "type conversion failed");
   }
 
-  Value castedSource =
-      LLVM::BitcastOp::create(rewriter, loc, packedType, source);
-
   std::optional<StringRef> maybeIntrinsic =
       scaledExtPacked816ToIntrinsic(srcElemType, destElemType);
   if (!maybeIntrinsic.has_value())
     return op.emitOpError(
         "no intrinsic matching packed scaled conversion on the given chipset");
 
+  int32_t scaleSel =
+      getScaleSel(blockSize, bitWidth, firstScaleLane, firstScaleByte);
+  Value castedScale =
+      LLVM::BitcastOp::create(rewriter, loc, i32, adaptor.getScale());
+  Value castedSource =
+      LLVM::BitcastOp::create(rewriter, loc, packedType, source);
+
   OperationState loweredOp(loc, *maybeIntrinsic);
   loweredOp.addTypes({llvmResultType});
   loweredOp.addOperands({castedSource, castedScale});

From ed60cd2563ca6ee474f76487857dd5fd56b83925 Mon Sep 17 00:00:00 2001
From: Alexander Johnston <alexander.javen.johnston@gmail.com>
Date: Tue, 18 Nov 2025 15:41:07 +0000
Subject: [PATCH 26/33] [HLSL] Implement ddx/ddy_coarse intrinsics (#164831)

Closes https://github.com/llvm/llvm-project/issues/99097
Closes https://github.com/llvm/llvm-project/issues/99100

As ddx and ddy are near identical implementations I've combined them in
this PR. This aims to unblock
https://github.com/llvm/llvm-project/pull/161378

---------

Co-authored-by: Alexander Johnston <alexander.johnston@amd.com>
---
 clang/include/clang/Basic/Builtins.td         | 12 +++
 clang/lib/CodeGen/CGHLSLBuiltins.cpp          | 18 ++++
 clang/lib/CodeGen/CGHLSLRuntime.h             |  2 +
 .../lib/Headers/hlsl/hlsl_alias_intrinsics.h  | 68 +++++++++++++++
 clang/lib/Sema/SemaHLSL.cpp                   |  4 +-
 .../builtins/ddx-coarse-builtin.hlsl          | 26 ++++++
 .../test/CodeGenHLSL/builtins/ddx-coarse.hlsl | 86 +++++++++++++++++++
 .../builtins/ddy-coarse-builtin.hlsl          | 26 ++++++
 .../test/CodeGenHLSL/builtins/ddy-coarse.hlsl | 86 +++++++++++++++++++
 .../SemaHLSL/BuiltIns/ddx-coarse-errors.hlsl  | 22 +++++
 .../SemaHLSL/BuiltIns/ddy-coarse-errors.hlsl  | 22 +++++
 llvm/include/llvm/IR/IntrinsicsDirectX.td     |  2 +
 llvm/include/llvm/IR/IntrinsicsSPIRV.td       |  2 +
 llvm/lib/Target/DirectX/DXIL.td               | 18 ++++
 .../DirectX/DirectXTargetTransformInfo.cpp    |  2 +
 llvm/lib/Target/SPIRV/SPIRVGlobalRegistry.cpp | 15 ++--
 .../Target/SPIRV/SPIRVInstructionSelector.cpp | 73 +++++++++++++++-
 llvm/lib/Target/SPIRV/SPIRVModuleAnalysis.cpp |  9 +-
 .../test/CodeGen/DirectX/ddx_coarse-errors.ll | 15 ++++
 llvm/test/CodeGen/DirectX/ddx_coarse.ll       | 40 +++++++++
 .../test/CodeGen/DirectX/ddy_coarse-errors.ll | 15 ++++
 llvm/test/CodeGen/DirectX/ddy_coarse.ll       | 40 +++++++++
 .../SPIRV/hlsl-intrinsics/ddx_coarse.ll       | 47 ++++++++++
 .../SPIRV/hlsl-intrinsics/ddy_coarse.ll       | 47 ++++++++++
 .../CodeGen/SPIRV/opencl/ddx_coarse-error.ll  | 12 +++
 .../CodeGen/SPIRV/opencl/ddy_coarse-error.ll  | 12 +++
 26 files changed, 713 insertions(+), 8 deletions(-)
 create mode 100644 clang/test/CodeGenHLSL/builtins/ddx-coarse-builtin.hlsl
 create mode 100644 clang/test/CodeGenHLSL/builtins/ddx-coarse.hlsl
 create mode 100644 clang/test/CodeGenHLSL/builtins/ddy-coarse-builtin.hlsl
 create mode 100644 clang/test/CodeGenHLSL/builtins/ddy-coarse.hlsl
 create mode 100644 clang/test/SemaHLSL/BuiltIns/ddx-coarse-errors.hlsl
 create mode 100644 clang/test/SemaHLSL/BuiltIns/ddy-coarse-errors.hlsl
 create mode 100644 llvm/test/CodeGen/DirectX/ddx_coarse-errors.ll
 create mode 100644 llvm/test/CodeGen/DirectX/ddx_coarse.ll
 create mode 100644 llvm/test/CodeGen/DirectX/ddy_coarse-errors.ll
 create mode 100644 llvm/test/CodeGen/DirectX/ddy_coarse.ll
 create mode 100644 llvm/test/CodeGen/SPIRV/hlsl-intrinsics/ddx_coarse.ll
 create mode 100644 llvm/test/CodeGen/SPIRV/hlsl-intrinsics/ddy_coarse.ll
 create mode 100644 llvm/test/CodeGen/SPIRV/opencl/ddx_coarse-error.ll
 create mode 100644 llvm/test/CodeGen/SPIRV/opencl/ddy_coarse-error.ll

diff --git a/clang/include/clang/Basic/Builtins.td b/clang/include/clang/Basic/Builtins.td
index dbf857afa08c8..47da17e5cfe83 100644
--- a/clang/include/clang/Basic/Builtins.td
+++ b/clang/include/clang/Basic/Builtins.td
@@ -5253,6 +5253,18 @@ def HLSLF16ToF32 : LangBuiltin<"HLSL_LANG"> {
   let Prototype = "void(...)";
 }
 
+def HLSLDdxCoarse : LangBuiltin<"HLSL_LANG"> {
+  let Spellings = ["__builtin_hlsl_elementwise_ddx_coarse"];
+  let Attributes = [NoThrow, Const, CustomTypeChecking];
+  let Prototype = "void(...)";
+}
+
+def HLSLDdyCoarse : LangBuiltin<"HLSL_LANG"> {
+  let Spellings = ["__builtin_hlsl_elementwise_ddy_coarse"];
+  let Attributes = [NoThrow, Const, CustomTypeChecking];
+  let Prototype = "void(...)";
+}
+
 // Builtins for XRay.
 def XRayCustomEvent : Builtin {
   let Spellings = ["__xray_customevent"];
diff --git a/clang/lib/CodeGen/CGHLSLBuiltins.cpp b/clang/lib/CodeGen/CGHLSLBuiltins.cpp
index b6928ce7d9c44..12d9a98915ce3 100644
--- a/clang/lib/CodeGen/CGHLSLBuiltins.cpp
+++ b/clang/lib/CodeGen/CGHLSLBuiltins.cpp
@@ -924,6 +924,24 @@ Value *CodeGenFunction::EmitHLSLBuiltinExpr(unsigned BuiltinID,
     return EmitRuntimeCall(
         Intrinsic::getOrInsertDeclaration(&CGM.getModule(), ID));
   }
+  case Builtin::BI__builtin_hlsl_elementwise_ddx_coarse: {
+    Value *Op0 = EmitScalarExpr(E->getArg(0));
+    if (!E->getArg(0)->getType()->hasFloatingRepresentation())
+      llvm_unreachable("ddx_coarse operand must have a float representation");
+    Intrinsic::ID ID = CGM.getHLSLRuntime().getDdxCoarseIntrinsic();
+    return Builder.CreateIntrinsic(/*ReturnType=*/Op0->getType(), ID,
+                                   ArrayRef<Value *>{Op0}, nullptr,
+                                   "hlsl.ddx.coarse");
+  }
+  case Builtin::BI__builtin_hlsl_elementwise_ddy_coarse: {
+    Value *Op0 = EmitScalarExpr(E->getArg(0));
+    if (!E->getArg(0)->getType()->hasFloatingRepresentation())
+      llvm_unreachable("ddy_coarse operand must have a float representation");
+    Intrinsic::ID ID = CGM.getHLSLRuntime().getDdyCoarseIntrinsic();
+    return Builder.CreateIntrinsic(/*ReturnType=*/Op0->getType(), ID,
+                                   ArrayRef<Value *>{Op0}, nullptr,
+                                   "hlsl.ddy.coarse");
+  }
   case Builtin::BI__builtin_get_spirv_spec_constant_bool:
   case Builtin::BI__builtin_get_spirv_spec_constant_short:
   case Builtin::BI__builtin_get_spirv_spec_constant_ushort:
diff --git a/clang/lib/CodeGen/CGHLSLRuntime.h b/clang/lib/CodeGen/CGHLSLRuntime.h
index 48935584f28a2..e1200c62eccf1 100644
--- a/clang/lib/CodeGen/CGHLSLRuntime.h
+++ b/clang/lib/CodeGen/CGHLSLRuntime.h
@@ -163,6 +163,8 @@ class CGHLSLRuntime {
   GENERATE_HLSL_INTRINSIC_FUNCTION(GroupMemoryBarrierWithGroupSync,
                                    group_memory_barrier_with_group_sync)
   GENERATE_HLSL_INTRINSIC_FUNCTION(GetDimensionsX, resource_getdimensions_x)
+  GENERATE_HLSL_INTRINSIC_FUNCTION(DdxCoarse, ddx_coarse)
+  GENERATE_HLSL_INTRINSIC_FUNCTION(DdyCoarse, ddy_coarse)
 
   //===----------------------------------------------------------------------===//
   // End of reserved area for HLSL intrinsic getters.
diff --git a/clang/lib/Headers/hlsl/hlsl_alias_intrinsics.h b/clang/lib/Headers/hlsl/hlsl_alias_intrinsics.h
index 2e2703de18cb1..38b95ee90736a 100644
--- a/clang/lib/Headers/hlsl/hlsl_alias_intrinsics.h
+++ b/clang/lib/Headers/hlsl/hlsl_alias_intrinsics.h
@@ -2946,5 +2946,73 @@ float4 radians(float4);
 _HLSL_BUILTIN_ALIAS(__builtin_hlsl_group_memory_barrier_with_group_sync)
 __attribute__((convergent)) void GroupMemoryBarrierWithGroupSync(void);
 
+//===----------------------------------------------------------------------===//
+// ddx_coarse builtin
+//===----------------------------------------------------------------------===//
+
+/// \fn T ddx_coarse(T value)
+/// \brief Computes a low precision partial derivative with respect to the
+/// screen-space x-coordinate.
+/// \param value The input value.
+///
+/// The return value is a floating point scalar or vector containing the low
+/// precision partial derivative of the input value.
+
+_HLSL_16BIT_AVAILABILITY(shadermodel, 6.2)
+_HLSL_BUILTIN_ALIAS(__builtin_hlsl_elementwise_ddx_coarse)
+half ddx_coarse(half);
+_HLSL_16BIT_AVAILABILITY(shadermodel, 6.2)
+_HLSL_BUILTIN_ALIAS(__builtin_hlsl_elementwise_ddx_coarse)
+half2 ddx_coarse(half2);
+_HLSL_16BIT_AVAILABILITY(shadermodel, 6.2)
+_HLSL_BUILTIN_ALIAS(__builtin_hlsl_elementwise_ddx_coarse)
+half3 ddx_coarse(half3);
+_HLSL_16BIT_AVAILABILITY(shadermodel, 6.2)
+_HLSL_BUILTIN_ALIAS(__builtin_hlsl_elementwise_ddx_coarse)
+half4 ddx_coarse(half4);
+
+_HLSL_BUILTIN_ALIAS(__builtin_hlsl_elementwise_ddx_coarse)
+float ddx_coarse(float);
+_HLSL_BUILTIN_ALIAS(__builtin_hlsl_elementwise_ddx_coarse)
+float2 ddx_coarse(float2);
+_HLSL_BUILTIN_ALIAS(__builtin_hlsl_elementwise_ddx_coarse)
+float3 ddx_coarse(float3);
+_HLSL_BUILTIN_ALIAS(__builtin_hlsl_elementwise_ddx_coarse)
+float4 ddx_coarse(float4);
+
+//===----------------------------------------------------------------------===//
+// ddy_coarse builtin
+//===----------------------------------------------------------------------===//
+
+/// \fn T ddy_coarse(T value)
+/// \brief Computes a low precision partial derivative with respect to the
+/// screen-space y-coordinate.
+/// \param value The input value.
+///
+/// The return value is a floating point scalar or vector containing the low
+/// precision partial derivative of the input value.
+
+_HLSL_16BIT_AVAILABILITY(shadermodel, 6.2)
+_HLSL_BUILTIN_ALIAS(__builtin_hlsl_elementwise_ddy_coarse)
+half ddy_coarse(half);
+_HLSL_16BIT_AVAILABILITY(shadermodel, 6.2)
+_HLSL_BUILTIN_ALIAS(__builtin_hlsl_elementwise_ddy_coarse)
+half2 ddy_coarse(half2);
+_HLSL_16BIT_AVAILABILITY(shadermodel, 6.2)
+_HLSL_BUILTIN_ALIAS(__builtin_hlsl_elementwise_ddy_coarse)
+half3 ddy_coarse(half3);
+_HLSL_16BIT_AVAILABILITY(shadermodel, 6.2)
+_HLSL_BUILTIN_ALIAS(__builtin_hlsl_elementwise_ddy_coarse)
+half4 ddy_coarse(half4);
+
+_HLSL_BUILTIN_ALIAS(__builtin_hlsl_elementwise_ddy_coarse)
+float ddy_coarse(float);
+_HLSL_BUILTIN_ALIAS(__builtin_hlsl_elementwise_ddy_coarse)
+float2 ddy_coarse(float2);
+_HLSL_BUILTIN_ALIAS(__builtin_hlsl_elementwise_ddy_coarse)
+float3 ddy_coarse(float3);
+_HLSL_BUILTIN_ALIAS(__builtin_hlsl_elementwise_ddy_coarse)
+float4 ddy_coarse(float4);
+
 } // namespace hlsl
 #endif //_HLSL_HLSL_ALIAS_INTRINSICS_H_
diff --git a/clang/lib/Sema/SemaHLSL.cpp b/clang/lib/Sema/SemaHLSL.cpp
index 2b9b3abbd5360..5555916c2536f 100644
--- a/clang/lib/Sema/SemaHLSL.cpp
+++ b/clang/lib/Sema/SemaHLSL.cpp
@@ -3239,7 +3239,9 @@ bool SemaHLSL::CheckBuiltinFunctionCall(unsigned BuiltinID, CallExpr *TheCall) {
   case Builtin::BI__builtin_hlsl_elementwise_degrees:
   case Builtin::BI__builtin_hlsl_elementwise_radians:
   case Builtin::BI__builtin_hlsl_elementwise_rsqrt:
-  case Builtin::BI__builtin_hlsl_elementwise_frac: {
+  case Builtin::BI__builtin_hlsl_elementwise_frac:
+  case Builtin::BI__builtin_hlsl_elementwise_ddx_coarse:
+  case Builtin::BI__builtin_hlsl_elementwise_ddy_coarse: {
     if (SemaRef.checkArgCount(TheCall, 1))
       return true;
     if (CheckAllArgTypesAreCorrect(&SemaRef, TheCall,
diff --git a/clang/test/CodeGenHLSL/builtins/ddx-coarse-builtin.hlsl b/clang/test/CodeGenHLSL/builtins/ddx-coarse-builtin.hlsl
new file mode 100644
index 0000000000000..01216eefadba2
--- /dev/null
+++ b/clang/test/CodeGenHLSL/builtins/ddx-coarse-builtin.hlsl
@@ -0,0 +1,26 @@
+// RUN: %clang_cc1 -finclude-default-header  -x hlsl  -triple dxil-pc-shadermodel6.3-library %s \
+// RUN:  -emit-llvm -disable-llvm-passes -fnative-half-type -o - | \
+// RUN:  FileCheck %s --check-prefixes=CHECK
+// RUN: %clang_cc1 -finclude-default-header  -x hlsl  -triple spirv-pc-vulkan-compute  %s \
+// RUN:  -emit-llvm -disable-llvm-passes -fnative-half-type -o - | \
+// RUN:  FileCheck %s --check-prefixes=CHECK-SPIRV
+
+// CHECK-LABEL: half @_Z19test_f16_ddx_coarseDh
+// CHECK: %hlsl.ddx.coarse = call {{.*}} half @llvm.dx.ddx.coarse.f16(half %{{.*}})
+// CHECK: ret half %hlsl.ddx.coarse
+// CHECK-SPIRV-LABEL: half @_Z19test_f16_ddx_coarseDh
+// CHECK-SPIRV: %hlsl.ddx.coarse = call {{.*}} half @llvm.spv.ddx.coarse.f16(half %{{.*}})
+// CHECK-SPIRV: ret half %hlsl.ddx.coarse
+half test_f16_ddx_coarse(half val) {
+    return __builtin_hlsl_elementwise_ddx_coarse(val);
+}
+
+// CHECK-LABEL: float @_Z19test_f32_ddx_coarsef
+// CHECK: %hlsl.ddx.coarse = call {{.*}} float @llvm.dx.ddx.coarse.f32(float %{{.*}})
+// CHECK: ret float %hlsl.ddx.coarse
+// CHECK-SPIRV-LABEL: float @_Z19test_f32_ddx_coarsef
+// CHECK-SPIRV: %hlsl.ddx.coarse = call {{.*}} float @llvm.spv.ddx.coarse.f32(float %{{.*}})
+// CHECK-SPIRV: ret float %hlsl.ddx.coarse
+float test_f32_ddx_coarse(float val) {
+    return __builtin_hlsl_elementwise_ddx_coarse(val);
+}
diff --git a/clang/test/CodeGenHLSL/builtins/ddx-coarse.hlsl b/clang/test/CodeGenHLSL/builtins/ddx-coarse.hlsl
new file mode 100644
index 0000000000000..c200d4715629e
--- /dev/null
+++ b/clang/test/CodeGenHLSL/builtins/ddx-coarse.hlsl
@@ -0,0 +1,86 @@
+// RUN: %clang_cc1 -finclude-default-header  -x hlsl  -triple dxil-pc-shadermodel6.3-library %s \
+// RUN:  -emit-llvm -disable-llvm-passes -fnative-half-type -o - | \
+// RUN:  FileCheck %s --check-prefixes=CHECK
+// RUN: %clang_cc1 -finclude-default-header  -x hlsl  -triple spirv-pc-vulkan-compute  %s \
+// RUN:  -emit-llvm -disable-llvm-passes -fnative-half-type -o - | \
+// RUN:  FileCheck %s --check-prefixes=CHECK-SPIRV
+
+// CHECK-LABEL: half @_Z19test_f16_ddx_coarseDh
+// CHECK: %hlsl.ddx.coarse = call {{.*}} half @llvm.dx.ddx.coarse.f16(half %{{.*}})
+// CHECK: ret half %hlsl.ddx.coarse
+// CHECK-SPIRV-LABEL: half @_Z19test_f16_ddx_coarseDh
+// CHECK-SPIRV: %hlsl.ddx.coarse = call {{.*}} half @llvm.spv.ddx.coarse.f16(half %{{.*}})
+// CHECK-SPIRV: ret half %hlsl.ddx.coarse
+half test_f16_ddx_coarse(half val) {
+    return ddx_coarse(val);
+}
+
+// CHECK-LABEL: <2 x half> @_Z20test_f16_ddx_coarse2Dv2_Dh
+// CHECK: %hlsl.ddx.coarse = call {{.*}} <2 x half> @llvm.dx.ddx.coarse.v2f16(<2 x half> %{{.*}})
+// CHECK: ret <2 x half> %hlsl.ddx.coarse
+// CHECK-SPIRV-LABEL: <2 x half> @_Z20test_f16_ddx_coarse2Dv2_Dh
+// CHECK-SPIRV: %hlsl.ddx.coarse = call {{.*}} <2 x half> @llvm.spv.ddx.coarse.v2f16(<2 x half> %{{.*}})
+// CHECK-SPIRV: ret <2 x half> %hlsl.ddx.coarse
+half2 test_f16_ddx_coarse2(half2 val) {
+    return ddx_coarse(val);
+}
+
+// CHECK-LABEL: <3 x half> @_Z20test_f16_ddx_coarse3Dv3_Dh
+// CHECK: %hlsl.ddx.coarse = call {{.*}} <3 x half> @llvm.dx.ddx.coarse.v3f16(<3 x half> %{{.*}})
+// CHECK: ret <3 x half> %hlsl.ddx.coarse
+// CHECK-SPIRV-LABEL: <3 x half> @_Z20test_f16_ddx_coarse3Dv3_Dh
+// CHECK-SPIRV: %hlsl.ddx.coarse = call {{.*}} <3 x half> @llvm.spv.ddx.coarse.v3f16(<3 x half> %{{.*}})
+// CHECK-SPIRV: ret <3 x half> %hlsl.ddx.coarse
+half3 test_f16_ddx_coarse3(half3 val) {
+    return ddx_coarse(val);
+}
+
+// CHECK-LABEL: <4 x half> @_Z20test_f16_ddx_coarse4Dv4_Dh
+// CHECK: %hlsl.ddx.coarse = call {{.*}} <4 x half> @llvm.dx.ddx.coarse.v4f16(<4 x half> %{{.*}})
+// CHECK: ret <4 x half> %hlsl.ddx.coarse
+// CHECK-SPIRV-LABEL: <4 x half> @_Z20test_f16_ddx_coarse4Dv4_Dh
+// CHECK-SPIRV: %hlsl.ddx.coarse = call {{.*}} <4 x half> @llvm.spv.ddx.coarse.v4f16(<4 x half> %{{.*}})
+// CHECK-SPIRV: ret <4 x half> %hlsl.ddx.coarse
+half4 test_f16_ddx_coarse4(half4 val) {
+    return ddx_coarse(val);
+}
+
+// CHECK-LABEL: float @_Z19test_f32_ddx_coarsef
+// CHECK: %hlsl.ddx.coarse = call {{.*}} float @llvm.dx.ddx.coarse.f32(float %{{.*}})
+// CHECK: ret float %hlsl.ddx.coarse
+// CHECK-SPIRV-LABEL: float @_Z19test_f32_ddx_coarsef
+// CHECK-SPIRV: %hlsl.ddx.coarse = call {{.*}} float @llvm.spv.ddx.coarse.f32(float %{{.*}})
+// CHECK-SPIRV: ret float %hlsl.ddx.coarse
+float test_f32_ddx_coarse(float val) {
+    return ddx_coarse(val);
+}
+
+// CHECK-LABEL: <2 x float> @_Z20test_f32_ddx_coarse2Dv2_f
+// CHECK: %hlsl.ddx.coarse = call {{.*}} <2 x float> @llvm.dx.ddx.coarse.v2f32(<2 x float> %{{.*}})
+// CHECK: ret <2 x float> %hlsl.ddx.coarse
+// CHECK-SPIRV-LABEL: <2 x float> @_Z20test_f32_ddx_coarse2Dv2_f
+// CHECK-SPIRV: %hlsl.ddx.coarse = call {{.*}} <2 x float> @llvm.spv.ddx.coarse.v2f32(<2 x float> %{{.*}})
+// CHECK-SPIRV: ret <2 x float> %hlsl.ddx.coarse
+float2 test_f32_ddx_coarse2(float2 val) {
+    return ddx_coarse(val);
+}
+
+// CHECK-LABEL: <3 x float> @_Z20test_f32_ddx_coarse3Dv3_f
+// CHECK: %hlsl.ddx.coarse = call {{.*}} <3 x float> @llvm.dx.ddx.coarse.v3f32(<3 x float> %{{.*}})
+// CHECK: ret <3 x float> %hlsl.ddx.coarse
+// CHECK-SPIRV-LABEL: <3 x float> @_Z20test_f32_ddx_coarse3Dv3_f
+// CHECK-SPIRV: %hlsl.ddx.coarse = call {{.*}} <3 x float> @llvm.spv.ddx.coarse.v3f32(<3 x float> %{{.*}})
+// CHECK-SPIRV: ret <3 x float> %hlsl.ddx.coarse
+float3 test_f32_ddx_coarse3(float3 val) {
+    return ddx_coarse(val);
+}
+
+// CHECK-LABEL: <4 x float> @_Z20test_f32_ddx_coarse4Dv4_f
+// CHECK: %hlsl.ddx.coarse = call {{.*}} <4 x float> @llvm.dx.ddx.coarse.v4f32(<4 x float> %{{.*}})
+// CHECK: ret <4 x float> %hlsl.ddx.coarse
+// CHECK-SPIRV-LABEL: <4 x float> @_Z20test_f32_ddx_coarse4Dv4_f
+// CHECK-SPIRV: %hlsl.ddx.coarse = call {{.*}} <4 x float> @llvm.spv.ddx.coarse.v4f32(<4 x float> %{{.*}})
+// CHECK-SPIRV: ret <4 x float> %hlsl.ddx.coarse
+float4 test_f32_ddx_coarse4(float4 val) {
+    return ddx_coarse(val);
+}
diff --git a/clang/test/CodeGenHLSL/builtins/ddy-coarse-builtin.hlsl b/clang/test/CodeGenHLSL/builtins/ddy-coarse-builtin.hlsl
new file mode 100644
index 0000000000000..2967deb75031f
--- /dev/null
+++ b/clang/test/CodeGenHLSL/builtins/ddy-coarse-builtin.hlsl
@@ -0,0 +1,26 @@
+// RUN: %clang_cc1 -finclude-default-header  -x hlsl  -triple dxil-pc-shadermodel6.3-library %s \
+// RUN:  -emit-llvm -disable-llvm-passes -fnative-half-type -o - | \
+// RUN:  FileCheck %s --check-prefixes=CHECK
+// RUN: %clang_cc1 -finclude-default-header  -x hlsl  -triple spirv-pc-vulkan-compute  %s \
+// RUN:  -emit-llvm -disable-llvm-passes -fnative-half-type -o - | \
+// RUN:  FileCheck %s --check-prefixes=CHECK-SPIRV
+
+// CHECK-LABEL: half @_Z19test_f16_ddy_coarseDh
+// CHECK: %hlsl.ddy.coarse = call {{.*}} half @llvm.dx.ddy.coarse.f16(half %{{.*}})
+// CHECK: ret half %hlsl.ddy.coarse
+// CHECK-SPIRV-LABEL: half @_Z19test_f16_ddy_coarseDh
+// CHECK-SPIRV: %hlsl.ddy.coarse = call {{.*}} half @llvm.spv.ddy.coarse.f16(half %{{.*}})
+// CHECK-SPIRV: ret half %hlsl.ddy.coarse
+half test_f16_ddy_coarse(half val) {
+    return __builtin_hlsl_elementwise_ddy_coarse(val);
+}
+
+// CHECK-LABEL: float @_Z19test_f32_ddy_coarsef
+// CHECK: %hlsl.ddy.coarse = call {{.*}} float @llvm.dx.ddy.coarse.f32(float %{{.*}})
+// CHECK: ret float %hlsl.ddy.coarse
+// CHECK-SPIRV-LABEL: float @_Z19test_f32_ddy_coarsef
+// CHECK-SPIRV: %hlsl.ddy.coarse = call {{.*}} float @llvm.spv.ddy.coarse.f32(float %{{.*}})
+// CHECK-SPIRV: ret float %hlsl.ddy.coarse
+float test_f32_ddy_coarse(float val) {
+    return __builtin_hlsl_elementwise_ddy_coarse(val);
+}
diff --git a/clang/test/CodeGenHLSL/builtins/ddy-coarse.hlsl b/clang/test/CodeGenHLSL/builtins/ddy-coarse.hlsl
new file mode 100644
index 0000000000000..faa972a1be326
--- /dev/null
+++ b/clang/test/CodeGenHLSL/builtins/ddy-coarse.hlsl
@@ -0,0 +1,86 @@
+// RUN: %clang_cc1 -finclude-default-header  -x hlsl  -triple dxil-pc-shadermodel6.3-library %s \
+// RUN:  -emit-llvm -disable-llvm-passes -fnative-half-type -o - | \
+// RUN:  FileCheck %s --check-prefixes=CHECK
+// RUN: %clang_cc1 -finclude-default-header  -x hlsl  -triple spirv-pc-vulkan-compute  %s \
+// RUN:  -emit-llvm -disable-llvm-passes -fnative-half-type -o - | \
+// RUN:  FileCheck %s --check-prefixes=CHECK-SPIRV
+
+// CHECK-LABEL: half @_Z19test_f16_ddy_coarseDh
+// CHECK: %hlsl.ddy.coarse = call {{.*}} half @llvm.dx.ddy.coarse.f16(half %{{.*}})
+// CHECK: ret half %hlsl.ddy.coarse
+// CHECK-SPIRV-LABEL: half @_Z19test_f16_ddy_coarseDh
+// CHECK-SPIRV: %hlsl.ddy.coarse = call {{.*}} half @llvm.spv.ddy.coarse.f16(half %{{.*}})
+// CHECK-SPIRV: ret half %hlsl.ddy.coarse
+half test_f16_ddy_coarse(half val) {
+    return ddy_coarse(val);
+}
+
+// CHECK-LABEL: <2 x half> @_Z20test_f16_ddy_coarse2Dv2_Dh
+// CHECK: %hlsl.ddy.coarse = call {{.*}} <2 x half> @llvm.dx.ddy.coarse.v2f16(<2 x half> %{{.*}})
+// CHECK: ret <2 x half> %hlsl.ddy.coarse
+// CHECK-SPIRV-LABEL: <2 x half> @_Z20test_f16_ddy_coarse2Dv2_Dh
+// CHECK-SPIRV: %hlsl.ddy.coarse = call {{.*}} <2 x half> @llvm.spv.ddy.coarse.v2f16(<2 x half> %{{.*}})
+// CHECK-SPIRV: ret <2 x half> %hlsl.ddy.coarse
+half2 test_f16_ddy_coarse2(half2 val) {
+    return ddy_coarse(val);
+}
+
+// CHECK-LABEL: <3 x half> @_Z20test_f16_ddy_coarse3Dv3_Dh
+// CHECK: %hlsl.ddy.coarse = call {{.*}} <3 x half> @llvm.dx.ddy.coarse.v3f16(<3 x half> %{{.*}})
+// CHECK: ret <3 x half> %hlsl.ddy.coarse
+// CHECK-SPIRV-LABEL: <3 x half> @_Z20test_f16_ddy_coarse3Dv3_Dh
+// CHECK-SPIRV: %hlsl.ddy.coarse = call {{.*}} <3 x half> @llvm.spv.ddy.coarse.v3f16(<3 x half> %{{.*}})
+// CHECK-SPIRV: ret <3 x half> %hlsl.ddy.coarse
+half3 test_f16_ddy_coarse3(half3 val) {
+    return ddy_coarse(val);
+}
+
+// CHECK-LABEL: <4 x half> @_Z20test_f16_ddy_coarse4Dv4_Dh
+// CHECK: %hlsl.ddy.coarse = call {{.*}} <4 x half> @llvm.dx.ddy.coarse.v4f16(<4 x half> %{{.*}})
+// CHECK: ret <4 x half> %hlsl.ddy.coarse
+// CHECK-SPIRV-LABEL: <4 x half> @_Z20test_f16_ddy_coarse4Dv4_Dh
+// CHECK-SPIRV: %hlsl.ddy.coarse = call {{.*}} <4 x half> @llvm.spv.ddy.coarse.v4f16(<4 x half> %{{.*}})
+// CHECK-SPIRV: ret <4 x half> %hlsl.ddy.coarse
+half4 test_f16_ddy_coarse4(half4 val) {
+    return ddy_coarse(val);
+}
+
+// CHECK-LABEL: float @_Z19test_f32_ddy_coarsef
+// CHECK: %hlsl.ddy.coarse = call {{.*}} float @llvm.dx.ddy.coarse.f32(float %{{.*}})
+// CHECK: ret float %hlsl.ddy.coarse
+// CHECK-SPIRV-LABEL: float @_Z19test_f32_ddy_coarsef
+// CHECK-SPIRV: %hlsl.ddy.coarse = call {{.*}} float @llvm.spv.ddy.coarse.f32(float %{{.*}})
+// CHECK-SPIRV: ret float %hlsl.ddy.coarse
+float test_f32_ddy_coarse(float val) {
+    return ddy_coarse(val);
+}
+
+// CHECK-LABEL: <2 x float> @_Z20test_f32_ddy_coarse2Dv2_f
+// CHECK: %hlsl.ddy.coarse = call {{.*}} <2 x float> @llvm.dx.ddy.coarse.v2f32(<2 x float> %{{.*}})
+// CHECK: ret <2 x float> %hlsl.ddy.coarse
+// CHECK-SPIRV-LABEL: <2 x float> @_Z20test_f32_ddy_coarse2Dv2_f
+// CHECK-SPIRV: %hlsl.ddy.coarse = call {{.*}} <2 x float> @llvm.spv.ddy.coarse.v2f32(<2 x float> %{{.*}})
+// CHECK-SPIRV: ret <2 x float> %hlsl.ddy.coarse
+float2 test_f32_ddy_coarse2(float2 val) {
+    return ddy_coarse(val);
+}
+
+// CHECK-LABEL: <3 x float> @_Z20test_f32_ddy_coarse3Dv3_f
+// CHECK: %hlsl.ddy.coarse = call {{.*}} <3 x float> @llvm.dx.ddy.coarse.v3f32(<3 x float> %{{.*}})
+// CHECK: ret <3 x float> %hlsl.ddy.coarse
+// CHECK-SPIRV-LABEL: <3 x float> @_Z20test_f32_ddy_coarse3Dv3_f
+// CHECK-SPIRV: %hlsl.ddy.coarse = call {{.*}} <3 x float> @llvm.spv.ddy.coarse.v3f32(<3 x float> %{{.*}})
+// CHECK-SPIRV: ret <3 x float> %hlsl.ddy.coarse
+float3 test_f32_ddy_coarse3(float3 val) {
+    return ddy_coarse(val);
+}
+
+// CHECK-LABEL: <4 x float> @_Z20test_f32_ddy_coarse4Dv4_f
+// CHECK: %hlsl.ddy.coarse = call {{.*}} <4 x float> @llvm.dx.ddy.coarse.v4f32(<4 x float> %{{.*}})
+// CHECK: ret <4 x float> %hlsl.ddy.coarse
+// CHECK-SPIRV-LABEL: <4 x float> @_Z20test_f32_ddy_coarse4Dv4_f
+// CHECK-SPIRV: %hlsl.ddy.coarse = call {{.*}} <4 x float> @llvm.spv.ddy.coarse.v4f32(<4 x float> %{{.*}})
+// CHECK-SPIRV: ret <4 x float> %hlsl.ddy.coarse
+float4 test_f32_ddy_coarse4(float4 val) {
+    return ddy_coarse(val);
+}
diff --git a/clang/test/SemaHLSL/BuiltIns/ddx-coarse-errors.hlsl b/clang/test/SemaHLSL/BuiltIns/ddx-coarse-errors.hlsl
new file mode 100644
index 0000000000000..ebad1cc6826d8
--- /dev/null
+++ b/clang/test/SemaHLSL/BuiltIns/ddx-coarse-errors.hlsl
@@ -0,0 +1,22 @@
+// RUN: %clang_cc1 -triple dxil-pc-shadermodel6.6-library %s -fnative-half-type -verify
+// RUN: %clang_cc1 -triple spirv-unknown-vulkan1.3-library %s -fnative-half-type -verify
+
+float no_arg() {
+  return __builtin_hlsl_elementwise_ddx_coarse();
+  // expected-error@-1 {{too few arguments to function call, expected 1, have 0}}
+}
+
+float too_many_args(float val) {
+  return __builtin_hlsl_elementwise_ddx_coarse(val, val);
+  // expected-error@-1 {{too many arguments to function call, expected 1, have 2}}
+}
+
+float test_integer_scalar_input(int val) {
+  return __builtin_hlsl_elementwise_ddx_coarse(val);
+  // expected-error@-1 {{1st argument must be a scalar or vector of 16 or 32 bit floating-point types (was 'int')}}
+}
+
+double test_double_scalar_input(double val) {
+  return __builtin_hlsl_elementwise_ddx_coarse(val);
+  // expected-error@-1 {{1st argument must be a scalar or vector of 16 or 32 bit floating-point types (was 'double')}}
+}
diff --git a/clang/test/SemaHLSL/BuiltIns/ddy-coarse-errors.hlsl b/clang/test/SemaHLSL/BuiltIns/ddy-coarse-errors.hlsl
new file mode 100644
index 0000000000000..9cc23665882c8
--- /dev/null
+++ b/clang/test/SemaHLSL/BuiltIns/ddy-coarse-errors.hlsl
@@ -0,0 +1,22 @@
+// RUN: %clang_cc1 -triple dxil-pc-shadermodel6.6-library %s -fnative-half-type -verify
+// RUN: %clang_cc1 -triple spirv-unknown-vulkan1.3-library %s -fnative-half-type -verify
+
+float no_arg() {
+  return __builtin_hlsl_elementwise_ddy_coarse();
+  // expected-error@-1 {{too few arguments to function call, expected 1, have 0}}
+}
+
+float too_many_args(float val) {
+  return __builtin_hlsl_elementwise_ddy_coarse(val, val);
+  // expected-error@-1 {{too many arguments to function call, expected 1, have 2}}
+}
+
+float test_integer_scalar_input(int val) {
+  return __builtin_hlsl_elementwise_ddy_coarse(val);
+  // expected-error@-1 {{1st argument must be a scalar or vector of 16 or 32 bit floating-point types (was 'int')}}
+}
+
+double test_double_scalar_input(double val) {
+  return __builtin_hlsl_elementwise_ddy_coarse(val);
+  // expected-error@-1 {{1st argument must be a scalar or vector of 16 or 32 bit floating-point types (was 'double')}}
+}
diff --git a/llvm/include/llvm/IR/IntrinsicsDirectX.td b/llvm/include/llvm/IR/IntrinsicsDirectX.td
index d7db935ee07f1..5a4cc776b26a5 100644
--- a/llvm/include/llvm/IR/IntrinsicsDirectX.td
+++ b/llvm/include/llvm/IR/IntrinsicsDirectX.td
@@ -170,6 +170,8 @@ def int_dx_splitdouble : DefaultAttrsIntrinsic<[llvm_anyint_ty, LLVMMatchType<0>
     [LLVMScalarOrSameVectorWidth<0, llvm_double_ty>], [IntrNoMem]>;
 def int_dx_radians : DefaultAttrsIntrinsic<[llvm_anyfloat_ty], [LLVMMatchType<0>], [IntrNoMem]>;
 def int_dx_discard : DefaultAttrsIntrinsic<[], [llvm_i1_ty], []>;
+def int_dx_ddx_coarse : DefaultAttrsIntrinsic<[llvm_anyfloat_ty], [LLVMMatchType<0>], [IntrNoMem]>;
+def int_dx_ddy_coarse : DefaultAttrsIntrinsic<[llvm_anyfloat_ty], [LLVMMatchType<0>], [IntrNoMem]>;
 def int_dx_firstbituhigh : DefaultAttrsIntrinsic<[LLVMScalarOrSameVectorWidth<0, llvm_i32_ty>], [llvm_anyint_ty], [IntrNoMem]>;
 def int_dx_firstbitshigh : DefaultAttrsIntrinsic<[LLVMScalarOrSameVectorWidth<0, llvm_i32_ty>], [llvm_anyint_ty], [IntrNoMem]>;
 def int_dx_firstbitlow : DefaultAttrsIntrinsic<[LLVMScalarOrSameVectorWidth<0, llvm_i32_ty>], [llvm_anyint_ty], [IntrNoMem]>;
diff --git a/llvm/include/llvm/IR/IntrinsicsSPIRV.td b/llvm/include/llvm/IR/IntrinsicsSPIRV.td
index f39c6cda2c579..2f7c25550a0cc 100644
--- a/llvm/include/llvm/IR/IntrinsicsSPIRV.td
+++ b/llvm/include/llvm/IR/IntrinsicsSPIRV.td
@@ -134,6 +134,8 @@ def int_spv_rsqrt : DefaultAttrsIntrinsic<[LLVMMatchType<0>], [llvm_anyfloat_ty]
   def int_spv_group_memory_barrier_with_group_sync
       : DefaultAttrsIntrinsic<[], [], [IntrConvergent]>;
   def int_spv_discard : DefaultAttrsIntrinsic<[], [], []>;
+  def int_spv_ddx_coarse : DefaultAttrsIntrinsic<[llvm_anyfloat_ty], [LLVMMatchType<0>], [IntrNoMem]>;
+  def int_spv_ddy_coarse : DefaultAttrsIntrinsic<[llvm_anyfloat_ty], [LLVMMatchType<0>], [IntrNoMem]>;
   def int_spv_uclamp : DefaultAttrsIntrinsic<[llvm_anyint_ty], [LLVMMatchType<0>, LLVMMatchType<0>, LLVMMatchType<0>], [IntrNoMem]>;
   def int_spv_sclamp : DefaultAttrsIntrinsic<[llvm_anyint_ty], [LLVMMatchType<0>, LLVMMatchType<0>, LLVMMatchType<0>], [IntrNoMem]>;
   def int_spv_nclamp : DefaultAttrsIntrinsic<[llvm_anyfloat_ty], [LLVMMatchType<0>, LLVMMatchType<0>, LLVMMatchType<0>], [IntrNoMem]>;
diff --git a/llvm/lib/Target/DirectX/DXIL.td b/llvm/lib/Target/DirectX/DXIL.td
index 67437f6969b27..8b2866260e9c9 100644
--- a/llvm/lib/Target/DirectX/DXIL.td
+++ b/llvm/lib/Target/DirectX/DXIL.td
@@ -930,6 +930,24 @@ def Discard : DXILOp<82, discard> {
   let stages = [Stages<DXIL1_0, [pixel]>];
 }
 
+def DerivCoarseX : DXILOp<83, unary> {
+  let Doc = "computes the rate of change per stamp in x direction";
+  let intrinsics = [IntrinSelect<int_dx_ddx_coarse>];
+  let arguments = [OverloadTy];
+  let result = OverloadTy;
+  let overloads = [Overloads<DXIL1_0, [HalfTy, FloatTy]>];
+  let stages = [Stages<DXIL1_0, [library, pixel]>];
+}
+
+def DerivCoarseY : DXILOp<84, unary> {
+  let Doc = "computes the rate of change per stamp in y direction";
+  let intrinsics = [IntrinSelect<int_dx_ddy_coarse>];
+  let arguments = [OverloadTy];
+  let result = OverloadTy;
+  let overloads = [Overloads<DXIL1_0, [HalfTy, FloatTy]>];
+  let stages = [Stages<DXIL1_0, [library, pixel]>];
+}
+
 def ThreadId : DXILOp<93, threadId> {
   let Doc = "Reads the thread ID";
   let intrinsics = [IntrinSelect<int_dx_thread_id>];
diff --git a/llvm/lib/Target/DirectX/DirectXTargetTransformInfo.cpp b/llvm/lib/Target/DirectX/DirectXTargetTransformInfo.cpp
index 6cacbf6564db2..a755dd522969d 100644
--- a/llvm/lib/Target/DirectX/DirectXTargetTransformInfo.cpp
+++ b/llvm/lib/Target/DirectX/DirectXTargetTransformInfo.cpp
@@ -64,6 +64,8 @@ bool DirectXTTIImpl::isTargetIntrinsicTriviallyScalarizable(
   case Intrinsic::dx_wave_reduce_usum:
   case Intrinsic::dx_imad:
   case Intrinsic::dx_umad:
+  case Intrinsic::dx_ddx_coarse:
+  case Intrinsic::dx_ddy_coarse:
     return true;
   default:
     return false;
diff --git a/llvm/lib/Target/SPIRV/SPIRVGlobalRegistry.cpp b/llvm/lib/Target/SPIRV/SPIRVGlobalRegistry.cpp
index 47022b3f89a8b..76fd834fd7219 100644
--- a/llvm/lib/Target/SPIRV/SPIRVGlobalRegistry.cpp
+++ b/llvm/lib/Target/SPIRV/SPIRVGlobalRegistry.cpp
@@ -1697,11 +1697,16 @@ SPIRVType *SPIRVGlobalRegistry::getOrCreateSPIRVType(unsigned BitWidth,
   MachineIRBuilder MIRBuilder(DepMBB, DepMBB.getFirstNonPHI());
   const MachineInstr *NewMI =
       createOpType(MIRBuilder, [&](MachineIRBuilder &MIRBuilder) {
-        return BuildMI(MIRBuilder.getMBB(), *MIRBuilder.getInsertPt(),
-                       MIRBuilder.getDL(), TII.get(SPIRVOPcode))
-            .addDef(createTypeVReg(CurMF->getRegInfo()))
-            .addImm(BitWidth)
-            .addImm(0);
+        auto NewTypeMI = BuildMI(MIRBuilder.getMBB(), *MIRBuilder.getInsertPt(),
+                                 MIRBuilder.getDL(), TII.get(SPIRVOPcode))
+                             .addDef(createTypeVReg(CurMF->getRegInfo()))
+                             .addImm(BitWidth);
+        // Don't add Encoding to FP type
+        if (!Ty->isFloatTy()) {
+          return NewTypeMI.addImm(0);
+        } else {
+          return NewTypeMI;
+        }
       });
   add(Ty, false, NewMI);
   return finishCreatingSPIRVType(Ty, NewMI);
diff --git a/llvm/lib/Target/SPIRV/SPIRVInstructionSelector.cpp b/llvm/lib/Target/SPIRV/SPIRVInstructionSelector.cpp
index fc87288a4a212..0653b4eb9dfe2 100644
--- a/llvm/lib/Target/SPIRV/SPIRVInstructionSelector.cpp
+++ b/llvm/lib/Target/SPIRV/SPIRVInstructionSelector.cpp
@@ -328,6 +328,8 @@ class SPIRVInstructionSelector : public InstructionSelector {
                            MachineInstr &I) const;
   bool selectFrexp(Register ResVReg, const SPIRVType *ResType,
                    MachineInstr &I) const;
+  bool selectDpdCoarse(Register ResVReg, const SPIRVType *ResType,
+                       MachineInstr &I, const unsigned DPdOpCode) const;
   // Utilities
   std::pair<Register, bool>
   buildI32Constant(uint32_t Val, MachineInstr &I,
@@ -371,6 +373,7 @@ class SPIRVInstructionSelector : public InstructionSelector {
   bool loadHandleBeforePosition(Register &HandleReg, const SPIRVType *ResType,
                                 GIntrinsic &HandleDef, MachineInstr &Pos) const;
   void decorateUsesAsNonUniform(Register &NonUniformReg) const;
+  void errorIfInstrOutsideShader(MachineInstr &I) const;
 };
 
 bool sampledTypeIsSignedInteger(const llvm::Type *HandleType) {
@@ -3140,6 +3143,58 @@ bool SPIRVInstructionSelector::wrapIntoSpecConstantOp(
   return Result;
 }
 
+// Emit DPdOpCode for the derivative intrinsic I (source value in operand 2).
+bool SPIRVInstructionSelector::selectDpdCoarse(Register ResVReg,
+                                               const SPIRVType *ResType,
+                                               MachineInstr &I,
+                                               const unsigned DPdOpCode) const {
+  // TODO: This should check specifically for Fragment Execution Model, but STI
+  // doesn't provide that information yet. See #167562
+  errorIfInstrOutsideShader(I);
+
+  // A half arg/result is legal in HLSL but not in SPIR-V, so any non-32-bit
+  // operation is wrapped in conversions to and from float.
+  Register SrcReg = I.getOperand(2).getReg();
+  SPIRVType *SrcType = GR.getSPIRVTypeForVReg(SrcReg);
+  unsigned BitWidth = std::min(GR.getScalarOrVectorBitWidth(SrcType),
+                               GR.getScalarOrVectorBitWidth(ResType));
+  if (BitWidth == 32)
+    return BuildMI(*I.getParent(), I, I.getDebugLoc(), TII.get(DPdOpCode))
+        .addDef(ResVReg)
+        .addUse(GR.getSPIRVTypeID(ResType))
+        .addUse(SrcReg)
+        .constrainAllUses(TII, TRI, RBI);
+
+  MachineIRBuilder MIRBuilder(I);
+  unsigned ComponentCount = GR.getScalarOrVectorComponentCount(SrcType);
+  SPIRVType *F32ConvertTy = GR.getOrCreateSPIRVFloatType(32, I, TII);
+  if (ComponentCount != 1)
+    F32ConvertTy = GR.getOrCreateSPIRVVectorType(F32ConvertTy, ComponentCount,
+                                                 MIRBuilder, false);
+  const TargetRegisterClass *RegClass = GR.getRegClass(SrcType);
+  Register ConvertToVReg = MRI->createVirtualRegister(RegClass);
+  Register DpdOpVReg = MRI->createVirtualRegister(RegClass);
+
+  bool Result =
+      BuildMI(*I.getParent(), I, I.getDebugLoc(), TII.get(SPIRV::OpFConvert))
+          .addDef(ConvertToVReg)
+          .addUse(GR.getSPIRVTypeID(F32ConvertTy))
+          .addUse(SrcReg)
+          .constrainAllUses(TII, TRI, RBI);
+  Result &= BuildMI(*I.getParent(), I, I.getDebugLoc(), TII.get(DPdOpCode))
+                .addDef(DpdOpVReg)
+                .addUse(GR.getSPIRVTypeID(F32ConvertTy))
+                .addUse(ConvertToVReg)
+                .constrainAllUses(TII, TRI, RBI);
+  Result &=
+      BuildMI(*I.getParent(), I, I.getDebugLoc(), TII.get(SPIRV::OpFConvert))
+          .addDef(ResVReg)
+          .addUse(GR.getSPIRVTypeID(ResType))
+          .addUse(DpdOpVReg)
+          .constrainAllUses(TII, TRI, RBI);
+  return Result;
+}
+
 bool SPIRVInstructionSelector::selectIntrinsic(Register ResVReg,
                                                const SPIRVType *ResType,
                                                MachineInstr &I) const {
@@ -3528,7 +3583,12 @@ bool SPIRVInstructionSelector::selectIntrinsic(Register ResVReg,
   case Intrinsic::spv_unpackhalf2x16: {
     return selectExtInst(ResVReg, ResType, I, GL::UnpackHalf2x16);
   }
-
+  case Intrinsic::spv_ddx_coarse: {
+    return selectDpdCoarse(ResVReg, ResType, I, SPIRV::OpDPdxCoarse);
+  }
+  case Intrinsic::spv_ddy_coarse: {
+    return selectDpdCoarse(ResVReg, ResType, I, SPIRV::OpDPdyCoarse);
+  }
   default: {
     std::string DiagMsg;
     raw_string_ostream OS(DiagMsg);
@@ -4694,6 +4754,17 @@ bool SPIRVInstructionSelector::loadHandleBeforePosition(
       .constrainAllUses(TII, TRI, RBI);
 }
 
+void SPIRVInstructionSelector::errorIfInstrOutsideShader(
+    MachineInstr &I) const {
+  if (!STI.isShader()) {
+    std::string DiagMsg;
+    raw_string_ostream OS(DiagMsg);
+    I.print(OS, true, false, false, false);
+    DiagMsg += " is only supported in shaders.\n";
+    report_fatal_error(DiagMsg.c_str(), false);
+  }
+}
+
 namespace llvm {
 InstructionSelector *
 createSPIRVInstructionSelector(const SPIRVTargetMachine &TM,
diff --git a/llvm/lib/Target/SPIRV/SPIRVModuleAnalysis.cpp b/llvm/lib/Target/SPIRV/SPIRVModuleAnalysis.cpp
index b8cd9c1358f00..bd754d17694b8 100644
--- a/llvm/lib/Target/SPIRV/SPIRVModuleAnalysis.cpp
+++ b/llvm/lib/Target/SPIRV/SPIRVModuleAnalysis.cpp
@@ -934,7 +934,8 @@ void RequirementHandler::initAvailableCapabilitiesForVulkan(
                     Capability::UniformBufferArrayDynamicIndexing,
                     Capability::SampledImageArrayDynamicIndexing,
                     Capability::StorageBufferArrayDynamicIndexing,
-                    Capability::StorageImageArrayDynamicIndexing});
+                    Capability::StorageImageArrayDynamicIndexing,
+                    Capability::DerivativeControl});
 
   // Became core in Vulkan 1.2
   if (ST.isAtLeastSPIRVVer(VersionTuple(1, 5))) {
@@ -2148,6 +2149,12 @@ void addInstrRequirements(const MachineInstr &MI,
     }
     break;
   }
+  case SPIRV::OpDPdxCoarse:
+  case SPIRV::OpDPdyCoarse: {
+    Reqs.addCapability(SPIRV::Capability::DerivativeControl);
+    break;
+  }
+
   default:
     break;
   }
diff --git a/llvm/test/CodeGen/DirectX/ddx_coarse-errors.ll b/llvm/test/CodeGen/DirectX/ddx_coarse-errors.ll
new file mode 100644
index 0000000000000..0679eec31cec1
--- /dev/null
+++ b/llvm/test/CodeGen/DirectX/ddx_coarse-errors.ll
@@ -0,0 +1,15 @@
+; RUN: not opt -S -dxil-op-lower -mtriple=dxil-pc-shadermodel6.3-library %s 2>&1 | FileCheck %s
+
+; DXIL operation ddx.coarse does not support double overload type
+; CHECK: in function ddx.coarse
+; CHECK-SAME: Cannot create DerivCoarseX operation: Invalid overload type
+
+; Function Attrs: noinline nounwind optnone
+define noundef double @ddx.coarse_double(double noundef %a) #0 {
+entry:
+  %a.addr = alloca double, align 8
+  store double %a, ptr %a.addr, align 8
+  %0 = load double, ptr %a.addr, align 8
+  %dx.ddx.coarse = call double @llvm.dx.ddx.coarse.f64(double %0)
+  ret double %dx.ddx.coarse
+}
diff --git a/llvm/test/CodeGen/DirectX/ddx_coarse.ll b/llvm/test/CodeGen/DirectX/ddx_coarse.ll
new file mode 100644
index 0000000000000..f6ea031273263
--- /dev/null
+++ b/llvm/test/CodeGen/DirectX/ddx_coarse.ll
@@ -0,0 +1,40 @@
+; RUN: opt -S  -scalarizer -dxil-op-lower -mtriple=dxil-pc-shadermodel6.3-library %s | FileCheck %s
+
+; Make sure dxil operation function calls for ddx_coarse are generated for half/float and matching vectors
+
+define noundef half @deriv_coarse_x_half(half noundef %a) {
+; CHECK: call half @dx.op.unary.f16(i32 83, half %{{.*}})
+entry:
+  %dx.ddx.coarse = call half @llvm.dx.ddx.coarse.f16(half %a)
+  ret half %dx.ddx.coarse
+}
+
+define noundef float @deriv_coarse_x_float(float noundef %a) {
+; CHECK: call float @dx.op.unary.f32(i32 83, float %{{.*}})
+entry:
+  %dx.ddx.coarse = call float @llvm.dx.ddx.coarse.f32(float %a)
+  ret float %dx.ddx.coarse
+}
+
+define noundef <4 x float> @deriv_coarse_x_float4(<4 x float> noundef %a) {
+; CHECK: [[ee0:%.*]] = extractelement <4 x float> %a, i64 0
+; CHECK: [[ie0:%.*]] = call float @dx.op.unary.f32(i32 83, float [[ee0]])
+; CHECK: [[ee1:%.*]] = extractelement <4 x float> %a, i64 1
+; CHECK: [[ie1:%.*]] = call float @dx.op.unary.f32(i32 83, float [[ee1]])
+; CHECK: [[ee2:%.*]] = extractelement <4 x float> %a, i64 2
+; CHECK: [[ie2:%.*]] = call float @dx.op.unary.f32(i32 83, float [[ee2]])
+; CHECK: [[ee3:%.*]] = extractelement <4 x float> %a, i64 3
+; CHECK: [[ie3:%.*]] = call float @dx.op.unary.f32(i32 83, float [[ee3]])
+; CHECK: insertelement <4 x float> poison, float [[ie0]], i64 0
+; CHECK: insertelement <4 x float> %{{.*}}, float [[ie1]], i64 1
+; CHECK: insertelement <4 x float> %{{.*}}, float [[ie2]], i64 2
+; CHECK: insertelement <4 x float> %{{.*}}, float [[ie3]], i64 3
+; CHECK: ret <4 x float> %{{.*}}
+entry:
+  %dx.ddx.coarse = call <4 x float> @llvm.dx.ddx.coarse.v4f32(<4 x float> %a)
+  ret <4 x float> %dx.ddx.coarse
+}
+
+declare half @llvm.dx.ddx.coarse.f16(half)
+declare float @llvm.dx.ddx.coarse.f32(float)
+declare <4 x float> @llvm.dx.ddx.coarse.v4f32(<4 x float>)
diff --git a/llvm/test/CodeGen/DirectX/ddy_coarse-errors.ll b/llvm/test/CodeGen/DirectX/ddy_coarse-errors.ll
new file mode 100644
index 0000000000000..df8e3eb0f7e0b
--- /dev/null
+++ b/llvm/test/CodeGen/DirectX/ddy_coarse-errors.ll
@@ -0,0 +1,15 @@
+; RUN: not opt -S -dxil-op-lower -mtriple=dxil-pc-shadermodel6.3-library %s 2>&1 | FileCheck %s
+
+; DXIL operation ddy.coarse does not support double overload type
+; CHECK: in function ddy.coarse
+; CHECK-SAME: Cannot create DerivCoarseY operation: Invalid overload type
+
+; Function Attrs: noinline nounwind optnone
+define noundef double @ddy.coarse_double(double noundef %a) #0 {
+entry:
+  %a.addr = alloca double, align 8
+  store double %a, ptr %a.addr, align 8
+  %0 = load double, ptr %a.addr, align 8
+  %dx.ddy.coarse = call double @llvm.dx.ddy.coarse.f64(double %0)
+  ret double %dx.ddy.coarse
+}
diff --git a/llvm/test/CodeGen/DirectX/ddy_coarse.ll b/llvm/test/CodeGen/DirectX/ddy_coarse.ll
new file mode 100644
index 0000000000000..e3337022e1b01
--- /dev/null
+++ b/llvm/test/CodeGen/DirectX/ddy_coarse.ll
@@ -0,0 +1,40 @@
+; RUN: opt -S  -scalarizer -dxil-op-lower -mtriple=dxil-pc-shadermodel6.3-library %s | FileCheck %s
+
+; Make sure dxil operation function calls for ddy_coarse are generated for half/float and matching vectors
+
+define noundef half @deriv_coarse_y_half(half noundef %a) {
+; CHECK: call half @dx.op.unary.f16(i32 84, half %{{.*}})
+entry:
+  %dx.ddy.coarse = call half @llvm.dx.ddy.coarse.f16(half %a)
+  ret half %dx.ddy.coarse
+}
+
+define noundef float @deriv_coarse_y_float(float noundef %a) {
+; CHECK: call float @dx.op.unary.f32(i32 84, float %{{.*}})
+entry:
+  %dx.ddy.coarse = call float @llvm.dx.ddy.coarse.f32(float %a)
+  ret float %dx.ddy.coarse
+}
+
+define noundef <4 x float> @deriv_coarse_y_float4(<4 x float> noundef %a) {
+; CHECK: [[ee0:%.*]] = extractelement <4 x float> %a, i64 0
+; CHECK: [[ie0:%.*]] = call float @dx.op.unary.f32(i32 84, float [[ee0]])
+; CHECK: [[ee1:%.*]] = extractelement <4 x float> %a, i64 1
+; CHECK: [[ie1:%.*]] = call float @dx.op.unary.f32(i32 84, float [[ee1]])
+; CHECK: [[ee2:%.*]] = extractelement <4 x float> %a, i64 2
+; CHECK: [[ie2:%.*]] = call float @dx.op.unary.f32(i32 84, float [[ee2]])
+; CHECK: [[ee3:%.*]] = extractelement <4 x float> %a, i64 3
+; CHECK: [[ie3:%.*]] = call float @dx.op.unary.f32(i32 84, float [[ee3]])
+; CHECK: insertelement <4 x float> poison, float [[ie0]], i64 0
+; CHECK: insertelement <4 x float> %{{.*}}, float [[ie1]], i64 1
+; CHECK: insertelement <4 x float> %{{.*}}, float [[ie2]], i64 2
+; CHECK: insertelement <4 x float> %{{.*}}, float [[ie3]], i64 3
+; CHECK: ret <4 x float> %{{.*}}
+entry:
+  %dx.ddy.coarse = call <4 x float> @llvm.dx.ddy.coarse.v4f32(<4 x float> %a)
+  ret <4 x float> %dx.ddy.coarse
+}
+
+declare half @llvm.dx.ddy.coarse.f16(half)
+declare float @llvm.dx.ddy.coarse.f32(float)
+declare <4 x float> @llvm.dx.ddy.coarse.v4f32(<4 x float>)
diff --git a/llvm/test/CodeGen/SPIRV/hlsl-intrinsics/ddx_coarse.ll b/llvm/test/CodeGen/SPIRV/hlsl-intrinsics/ddx_coarse.ll
new file mode 100644
index 0000000000000..478acb53701ea
--- /dev/null
+++ b/llvm/test/CodeGen/SPIRV/hlsl-intrinsics/ddx_coarse.ll
@@ -0,0 +1,47 @@
+; RUN: llc -O0 -verify-machineinstrs -mtriple=spirv-unknown-vulkan %s -o - | FileCheck %s
+; RUN: %if spirv-tools %{ llc -O0 -mtriple=spirv-unknown-vulkan %s -o - -filetype=obj | spirv-val --target-env spv1.4 %}
+
+; CHECK-DAG: %[[#float_32:]] = OpTypeFloat 32
+; CHECK-DAG: %[[#float_16:]] = OpTypeFloat 16
+
+; CHECK-DAG: %[[#vec4_float_32:]] = OpTypeVector %[[#float_32]] 4
+; CHECK-DAG: %[[#vec4_float_16:]] = OpTypeVector %[[#float_16]] 4
+
+define noundef float @ddx_coarse_float(float noundef %a) {
+entry:
+; CHECK: %[[#float_32_arg:]] = OpFunctionParameter %[[#float_32]]
+; CHECK: %[[#]] = OpDPdxCoarse %[[#float_32]] %[[#float_32_arg]]
+  %elt.ddx.coarse = call float @llvm.spv.ddx.coarse.f32(float %a)
+  ret float %elt.ddx.coarse
+}
+
+define noundef half @ddx_coarse_half(half noundef %a) {
+entry:
+; CHECK: %[[#float_16_arg:]] = OpFunctionParameter %[[#float_16]]
+; CHECK: %[[#converted:]] = OpFConvert %[[#float_32]] %[[#float_16_arg]]
+; CHECK: %[[#coarse:]] = OpDPdxCoarse %[[#float_32]] %[[#converted]]
+; CHECK: %[[#]] = OpFConvert %[[#float_16]] %[[#coarse]]
+  %elt.ddx.coarse = call half @llvm.spv.ddx.coarse.f16(half %a)
+  ret half %elt.ddx.coarse
+}
+
+define noundef <4 x float> @ddx_coarse_float_vector(<4 x float> noundef %a) {
+entry:
+; CHECK: %[[#vec4_float_32_arg:]] = OpFunctionParameter %[[#vec4_float_32]]
+; CHECK: %[[#]] = OpDPdxCoarse %[[#vec4_float_32]] %[[#vec4_float_32_arg]]
+  %elt.ddx.coarse = call <4 x float> @llvm.spv.ddx.coarse.v4f32(<4 x float> %a)
+  ret <4 x float> %elt.ddx.coarse
+}
+
+define noundef <4 x half> @ddx_coarse_half_vector(<4 x half> noundef %a) {
+entry:
+; CHECK: %[[#vec4_float_16_arg:]] = OpFunctionParameter %[[#vec4_float_16]]
+; CHECK: %[[#converted:]] = OpFConvert %[[#vec4_float_32]] %[[#vec4_float_16_arg]]
+; CHECK: %[[#coarse:]] = OpDPdxCoarse %[[#vec4_float_32]] %[[#converted]]
+; CHECK: %[[#]] = OpFConvert %[[#vec4_float_16]] %[[#coarse]]
+  %elt.ddx.coarse = call <4 x half> @llvm.spv.ddx.coarse.v4f16(<4 x half> %a)
+  ret <4 x half> %elt.ddx.coarse
+}
+
+declare float @llvm.spv.ddx.coarse.f32(float)
+declare half @llvm.spv.ddx.coarse.f16(half)
diff --git a/llvm/test/CodeGen/SPIRV/hlsl-intrinsics/ddy_coarse.ll b/llvm/test/CodeGen/SPIRV/hlsl-intrinsics/ddy_coarse.ll
new file mode 100644
index 0000000000000..8ad67cb644aa7
--- /dev/null
+++ b/llvm/test/CodeGen/SPIRV/hlsl-intrinsics/ddy_coarse.ll
@@ -0,0 +1,47 @@
+; RUN: llc -O0 -verify-machineinstrs -mtriple=spirv-unknown-vulkan %s -o - | FileCheck %s
+; RUN: %if spirv-tools %{ llc -O0 -mtriple=spirv-unknown-vulkan %s -o - -filetype=obj | spirv-val --target-env spv1.4 %}
+
+; CHECK-DAG: %[[#float_32:]] = OpTypeFloat 32
+; CHECK-DAG: %[[#float_16:]] = OpTypeFloat 16
+
+; CHECK-DAG: %[[#vec4_float_32:]] = OpTypeVector %[[#float_32]] 4
+; CHECK-DAG: %[[#vec4_float_16:]] = OpTypeVector %[[#float_16]] 4
+
+define noundef float @ddy_coarse_float(float noundef %a) {
+entry:
+; CHECK: %[[#float_32_arg:]] = OpFunctionParameter %[[#float_32]]
+; CHECK: %[[#]] = OpDPdyCoarse %[[#float_32]] %[[#float_32_arg]]
+  %elt.ddy.coarse = call float @llvm.spv.ddy.coarse.f32(float %a)
+  ret float %elt.ddy.coarse
+}
+
+define noundef half @ddy_coarse_half(half noundef %a) {
+entry:
+; CHECK: %[[#float_16_arg:]] = OpFunctionParameter %[[#float_16]]
+; CHECK: %[[#converted:]] = OpFConvert %[[#float_32]] %[[#float_16_arg]]
+; CHECK: %[[#coarse:]] = OpDPdyCoarse %[[#float_32]] %[[#converted]]
+; CHECK: %[[#]] = OpFConvert %[[#float_16]] %[[#coarse]]
+  %elt.ddy.coarse = call half @llvm.spv.ddy.coarse.f16(half %a)
+  ret half %elt.ddy.coarse
+}
+
+define noundef <4 x float> @ddy_coarse_float_vector(<4 x float> noundef %a) {
+entry:
+; CHECK: %[[#vec4_float_32_arg:]] = OpFunctionParameter %[[#vec4_float_32]]
+; CHECK: %[[#]] = OpDPdyCoarse %[[#vec4_float_32]] %[[#vec4_float_32_arg]]
+  %elt.ddy.coarse = call <4 x float> @llvm.spv.ddy.coarse.v4f32(<4 x float> %a)
+  ret <4 x float> %elt.ddy.coarse
+}
+
+define noundef <4 x half> @ddy_coarse_half_vector(<4 x half> noundef %a) {
+entry:
+; CHECK: %[[#vec4_float_16_arg:]] = OpFunctionParameter %[[#vec4_float_16]]
+; CHECK: %[[#converted:]] = OpFConvert %[[#vec4_float_32]] %[[#vec4_float_16_arg]]
+; CHECK: %[[#coarse:]] = OpDPdyCoarse %[[#vec4_float_32]] %[[#converted]]
+; CHECK: %[[#]] = OpFConvert %[[#vec4_float_16]] %[[#coarse]]
+  %elt.ddy.coarse = call <4 x half> @llvm.spv.ddy.coarse.v4f16(<4 x half> %a)
+  ret <4 x half> %elt.ddy.coarse
+}
+
+declare float @llvm.spv.ddy.coarse.f32(float)
+declare half @llvm.spv.ddy.coarse.f16(half)
diff --git a/llvm/test/CodeGen/SPIRV/opencl/ddx_coarse-error.ll b/llvm/test/CodeGen/SPIRV/opencl/ddx_coarse-error.ll
new file mode 100644
index 0000000000000..e93c1d1ba4d36
--- /dev/null
+++ b/llvm/test/CodeGen/SPIRV/opencl/ddx_coarse-error.ll
@@ -0,0 +1,12 @@
+; RUN: not llc -verify-machineinstrs -O0 -mtriple=spirv64-unknown-unknown %s -o /dev/null 2>&1 | FileCheck %s
+; RUN: not llc -verify-machineinstrs -O0 -mtriple=spirv32-unknown-unknown %s -o /dev/null 2>&1 | FileCheck %s
+
+; CHECK: LLVM ERROR: %{{.*}} = G_INTRINSIC intrinsic(@llvm.spv.ddx.coarse), %{{.*}} is only supported in shaders.
+
+define noundef float @ddx_coarse(float noundef %a) {
+entry:
+  %spv.ddx.coarse = call float @llvm.spv.ddx.coarse.f32(float %a)
+  ret float %spv.ddx.coarse
+}
+
+declare float @llvm.spv.ddx.coarse.f32(float)
diff --git a/llvm/test/CodeGen/SPIRV/opencl/ddy_coarse-error.ll b/llvm/test/CodeGen/SPIRV/opencl/ddy_coarse-error.ll
new file mode 100644
index 0000000000000..aa71a395d8680
--- /dev/null
+++ b/llvm/test/CodeGen/SPIRV/opencl/ddy_coarse-error.ll
@@ -0,0 +1,12 @@
+; RUN: not llc -verify-machineinstrs -O0 -mtriple=spirv64-unknown-unknown %s -o /dev/null 2>&1 | FileCheck %s
+; RUN: not llc -verify-machineinstrs -O0 -mtriple=spirv32-unknown-unknown %s -o /dev/null 2>&1 | FileCheck %s
+
+; CHECK: LLVM ERROR: %{{.*}} = G_INTRINSIC intrinsic(@llvm.spv.ddy.coarse), %{{.*}} is only supported in shaders.
+
+define noundef float @ddy_coarse(float noundef %a) {
+entry:
+  %spv.ddy.coarse = call float @llvm.spv.ddy.coarse.f32(float %a)
+  ret float %spv.ddy.coarse
+}
+
+declare float @llvm.spv.ddy.coarse.f32(float)

From 61c2cc9462d4cf4a1925975e34eed7122463ef16 Mon Sep 17 00:00:00 2001
From: Manuel Carrasco <Manuel.Carrasco@amd.com>
Date: Tue, 18 Nov 2025 15:48:04 +0000
Subject: [PATCH 27/33] [clang][clang-linker-wrapper] Use the correct triple
 for clang-offload-bundler and AMD SPIR-V. (#168521)

`clang-linker-wrapper` was incorrectly calling `clang-offload-bundler`
for AMD SPIR-V. This resulted in a binary that couldn't be executed if
built using the new driver.

The runtime couldn't recognise the triple, triggering this error at
execution time:

```
No compatible code objects found for: gfx90a:sramecc+:xnack-,
```

With this PR, this is solved:

```
Creating ISA for: gfx90a:sramecc+:xnack- from spirv
```
---
 .../test/Driver/linker-wrapper-hip-amdgcnspirv.c | 16 ++++++++++++++++
 .../clang-linker-wrapper/ClangLinkerWrapper.cpp  |  7 +++++--
 2 files changed, 21 insertions(+), 2 deletions(-)
 create mode 100644 clang/test/Driver/linker-wrapper-hip-amdgcnspirv.c

diff --git a/clang/test/Driver/linker-wrapper-hip-amdgcnspirv.c b/clang/test/Driver/linker-wrapper-hip-amdgcnspirv.c
new file mode 100644
index 0000000000000..429f7d3b9ee13
--- /dev/null
+++ b/clang/test/Driver/linker-wrapper-hip-amdgcnspirv.c
@@ -0,0 +1,16 @@
+// RUN: %clang -cc1 %s -triple "spirv64-amd-amdhsa" -emit-llvm-bc -o %t.bc
+// RUN: llvm-offload-binary -o %t.out "--image=file=%t.bc,triple=spirv64-amd-amdhsa,arch=amdgcnspirv,kind=hip"
+// RUN: clang-linker-wrapper \
+// RUN:     "--should-extract=amdgcnspirv" \
+// RUN:     "--host-triple=spirv64-amd-amdhsa" \
+// RUN:     "--linker-path=clang-offload-bundler" \
+// RUN:     "--emit-fatbin-only" \
+// RUN:     "-o" "%t.hipfb" \
+// RUN:     "%t.out" \
+// RUN:     --dry-run \
+// RUN: 2>&1 | FileCheck %s
+
+// clang-linker-wrapper was previously calling clang-offload-bundler with -targets=...,hip-amdgcn-amd-amdhsa--amdgcnspirv
+// This caused the runtime not to recognise the triple for the AMD SPIR-V code.
+
+// CHECK: {{".*clang-offload-bundler.*"}} {{.*}} -targets={{.*}},hip-spirv64-amd-amdhsa--amdgcnspirv
diff --git a/clang/tools/clang-linker-wrapper/ClangLinkerWrapper.cpp b/clang/tools/clang-linker-wrapper/ClangLinkerWrapper.cpp
index bd4b40192c9f2..4a4a43db6ef25 100644
--- a/clang/tools/clang-linker-wrapper/ClangLinkerWrapper.cpp
+++ b/clang/tools/clang-linker-wrapper/ClangLinkerWrapper.cpp
@@ -439,8 +439,11 @@ fatbinary(ArrayRef<std::pair<StringRef, StringRef>> InputFiles,
         Args.MakeArgString(Twine("-compression-level=") + Arg->getValue()));
 
   SmallVector<StringRef> Targets = {"-targets=host-x86_64-unknown-linux-gnu"};
-  for (const auto &[File, Arch] : InputFiles)
-    Targets.push_back(Saver.save("hip-amdgcn-amd-amdhsa--" + Arch));
+  for (const auto &[File, Arch] : InputFiles) {
+    Targets.push_back(Saver.save(Arch == "amdgcnspirv"
+                                     ? "hip-spirv64-amd-amdhsa--" + Arch
+                                     : "hip-amdgcn-amd-amdhsa--" + Arch));
+  }
   CmdArgs.push_back(Saver.save(llvm::join(Targets, ",")));
 
 #ifdef _WIN32

From 4d093683ceab90a8df17f6887c5b21a27ed95ba6 Mon Sep 17 00:00:00 2001
From: Jordan Rupprecht <rupprecht@google.com>
Date: Tue, 18 Nov 2025 09:48:13 -0600
Subject: [PATCH 28/33] [bazel] Add MODULE.bazel (#164891)

This is a simple translation of the current WORKSPACE file.

* External repos are replaced with `bazel_dep()`. The versions have been
bumped to newer versions.
* `maybe()` doesn't seem to be a thing, so I just removed that.
* Existing repos where we define our own BUILD file in third_party_build
have *not* been replaced due to compatibility issues. For example,
`nanobind_bazel` could replace the `nanobind` config we have, but
switching to that caused some build errors.
* These existing repos have instead been specified as module
extensions.

This should have no effect since `.bazelrc` defines `common
--enable_bzlmod=false --enable_workspace`

Tested locally: `bazel test --enable_bzlmod --noenable_workspace
--config=generic_clang @llvm-project//... //...`
---
 utils/bazel/MODULE.bazel      |  38 +++
 utils/bazel/MODULE.bazel.lock | 490 ++++++++++++++++++++++++++++++++++
 utils/bazel/extensions.bzl    | 127 +++++++++
 3 files changed, 655 insertions(+)
 create mode 100644 utils/bazel/MODULE.bazel
 create mode 100644 utils/bazel/MODULE.bazel.lock
 create mode 100644 utils/bazel/extensions.bzl

diff --git a/utils/bazel/MODULE.bazel b/utils/bazel/MODULE.bazel
new file mode 100644
index 0000000000000..d061487acf4d7
--- /dev/null
+++ b/utils/bazel/MODULE.bazel
@@ -0,0 +1,38 @@
+# This file is licensed under the Apache License v2.0 with LLVM Exceptions.
+# See https://llvm.org/LICENSE.txt for license information.
+# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+
+"""bzlmod configuration for llvm-project"""
+module(name = "llvm-project-overlay")
+
+bazel_dep(name = "apple_support", version = "1.24.1", repo_name = "build_bazel_apple_support")
+bazel_dep(name = "bazel_skylib", version = "1.8.2")
+bazel_dep(name = "platforms", version = "1.0.0")
+bazel_dep(name = "rules_android", version = "0.6.6")
+bazel_dep(name = "rules_cc", version = "0.2.11")
+bazel_dep(name = "rules_foreign_cc", version = "0.15.1")
+bazel_dep(name = "rules_python", version = "1.6.3")
+bazel_dep(name = "rules_shell", version = "0.6.1")
+
+llvm_repos_extension = use_extension(":extensions.bzl", "llvm_repos_extension")
+
+use_repo(
+    llvm_repos_extension,
+    "llvm-raw",
+    "llvm_zlib",
+    "vulkan_headers",
+    "vulkan_sdk_setup",
+    "gmp",
+    "mpfr",
+    "mpc",
+    "pfm",
+    "llvm_zstd",
+    "pybind11",
+    "pyyaml",
+    "robin_map",
+    "nanobind",
+)
+
+llvm_configure = use_repo_rule("@llvm-raw//utils/bazel:configure.bzl", "llvm_configure")
+
+llvm_configure(name = "llvm-project")
diff --git a/utils/bazel/MODULE.bazel.lock b/utils/bazel/MODULE.bazel.lock
new file mode 100644
index 0000000000000..64de258401e91
--- /dev/null
+++ b/utils/bazel/MODULE.bazel.lock
@@ -0,0 +1,490 @@
+{
+  "lockFileVersion": 16,
+  "registryFileHashes": {
+    "https://bcr.bazel.build/bazel_registry.json": "8a28e4aff06ee60aed2a8c281907fb8bcbf3b753c91fb5a5c57da3215d5b3497",
+    "https://bcr.bazel.build/modules/abseil-cpp/20210324.2/MODULE.bazel": "7cd0312e064fde87c8d1cd79ba06c876bd23630c83466e9500321be55c96ace2",
+    "https://bcr.bazel.build/modules/abseil-cpp/20211102.0/MODULE.bazel": "70390338f7a5106231d20620712f7cccb659cd0e9d073d1991c038eb9fc57589",
+    "https://bcr.bazel.build/modules/abseil-cpp/20230125.1/MODULE.bazel": "89047429cb0207707b2dface14ba7f8df85273d484c2572755be4bab7ce9c3a0",
+    "https://bcr.bazel.build/modules/abseil-cpp/20230802.0.bcr.1/MODULE.bazel": "1c8cec495288dccd14fdae6e3f95f772c1c91857047a098fad772034264cc8cb",
+    "https://bcr.bazel.build/modules/abseil-cpp/20230802.0/MODULE.bazel": "d253ae36a8bd9ee3c5955384096ccb6baf16a1b1e93e858370da0a3b94f77c16",
+    "https://bcr.bazel.build/modules/abseil-cpp/20230802.1/MODULE.bazel": "fa92e2eb41a04df73cdabeec37107316f7e5272650f81d6cc096418fe647b915",
+    "https://bcr.bazel.build/modules/abseil-cpp/20240116.1/MODULE.bazel": "37bcdb4440fbb61df6a1c296ae01b327f19e9bb521f9b8e26ec854b6f97309ed",
+    "https://bcr.bazel.build/modules/abseil-cpp/20240116.2/MODULE.bazel": "73939767a4686cd9a520d16af5ab440071ed75cec1a876bf2fcfaf1f71987a16",
+    "https://bcr.bazel.build/modules/abseil-cpp/20250127.0/MODULE.bazel": "d1086e248cda6576862b4b3fe9ad76a214e08c189af5b42557a6e1888812c5d5",
+    "https://bcr.bazel.build/modules/abseil-cpp/20250127.0/source.json": "1b996859f840d8efc7c720efc61dcf2a84b1261cb3974cbbe9b6666ebf567775",
+    "https://bcr.bazel.build/modules/abseil-py/2.1.0/MODULE.bazel": "5ebe5bf853769c65707e5c28f216798f7a4b1042015e6a36e6d03094d94bec8a",
+    "https://bcr.bazel.build/modules/abseil-py/2.1.0/source.json": "0e8fc4f088ce07099c1cd6594c20c7ddbb48b4b3c0849b7d94ba94be88ff042b",
+    "https://bcr.bazel.build/modules/apple_support/1.11.1/MODULE.bazel": "1843d7cd8a58369a444fc6000e7304425fba600ff641592161d9f15b179fb896",
+    "https://bcr.bazel.build/modules/apple_support/1.15.1/MODULE.bazel": "a0556fefca0b1bb2de8567b8827518f94db6a6e7e7d632b4c48dc5f865bc7c85",
+    "https://bcr.bazel.build/modules/apple_support/1.24.1/MODULE.bazel": "f46e8ddad60aef170ee92b2f3d00ef66c147ceafea68b6877cb45bd91737f5f8",
+    "https://bcr.bazel.build/modules/apple_support/1.24.1/source.json": "cf725267cbacc5f028ef13bb77e7f2c2e0066923a4dab1025e4a0511b1ed258a",
+    "https://bcr.bazel.build/modules/bazel_features/1.1.0/MODULE.bazel": "cfd42ff3b815a5f39554d97182657f8c4b9719568eb7fded2b9135f084bf760b",
+    "https://bcr.bazel.build/modules/bazel_features/1.1.1/MODULE.bazel": "27b8c79ef57efe08efccbd9dd6ef70d61b4798320b8d3c134fd571f78963dbcd",
+    "https://bcr.bazel.build/modules/bazel_features/1.11.0/MODULE.bazel": "f9382337dd5a474c3b7d334c2f83e50b6eaedc284253334cf823044a26de03e8",
+    "https://bcr.bazel.build/modules/bazel_features/1.13.0/MODULE.bazel": "c14c33c7c3c730612bdbe14ebbb5e61936b6f11322ea95a6e91cd1ba962f94df",
+    "https://bcr.bazel.build/modules/bazel_features/1.15.0/MODULE.bazel": "d38ff6e517149dc509406aca0db3ad1efdd890a85e049585b7234d04238e2a4d",
+    "https://bcr.bazel.build/modules/bazel_features/1.17.0/MODULE.bazel": "039de32d21b816b47bd42c778e0454217e9c9caac4a3cf8e15c7231ee3ddee4d",
+    "https://bcr.bazel.build/modules/bazel_features/1.18.0/MODULE.bazel": "1be0ae2557ab3a72a57aeb31b29be347bcdc5d2b1eb1e70f39e3851a7e97041a",
+    "https://bcr.bazel.build/modules/bazel_features/1.19.0/MODULE.bazel": "59adcdf28230d220f0067b1f435b8537dd033bfff8db21335ef9217919c7fb58",
+    "https://bcr.bazel.build/modules/bazel_features/1.21.0/MODULE.bazel": "675642261665d8eea09989aa3b8afb5c37627f1be178382c320d1b46afba5e3b",
+    "https://bcr.bazel.build/modules/bazel_features/1.23.0/MODULE.bazel": "fd1ac84bc4e97a5a0816b7fd7d4d4f6d837b0047cf4cbd81652d616af3a6591a",
+    "https://bcr.bazel.build/modules/bazel_features/1.27.0/MODULE.bazel": "621eeee06c4458a9121d1f104efb80f39d34deff4984e778359c60eaf1a8cb65",
+    "https://bcr.bazel.build/modules/bazel_features/1.28.0/MODULE.bazel": "4b4200e6cbf8fa335b2c3f43e1d6ef3e240319c33d43d60cc0fbd4b87ece299d",
+    "https://bcr.bazel.build/modules/bazel_features/1.3.0/MODULE.bazel": "cdcafe83ec318cda34e02948e81d790aab8df7a929cec6f6969f13a489ccecd9",
+    "https://bcr.bazel.build/modules/bazel_features/1.30.0/MODULE.bazel": "a14b62d05969a293b80257e72e597c2da7f717e1e69fa8b339703ed6731bec87",
+    "https://bcr.bazel.build/modules/bazel_features/1.30.0/source.json": "b07e17f067fe4f69f90b03b36ef1e08fe0d1f3cac254c1241a1818773e3423bc",
+    "https://bcr.bazel.build/modules/bazel_features/1.4.1/MODULE.bazel": "e45b6bb2350aff3e442ae1111c555e27eac1d915e77775f6fdc4b351b758b5d7",
+    "https://bcr.bazel.build/modules/bazel_features/1.9.1/MODULE.bazel": "8f679097876a9b609ad1f60249c49d68bfab783dd9be012faf9d82547b14815a",
+    "https://bcr.bazel.build/modules/bazel_skylib/1.0.3/MODULE.bazel": "bcb0fd896384802d1ad283b4e4eb4d718eebd8cb820b0a2c3a347fb971afd9d8",
+    "https://bcr.bazel.build/modules/bazel_skylib/1.1.1/MODULE.bazel": "1add3e7d93ff2e6998f9e118022c84d163917d912f5afafb3058e3d2f1545b5e",
+    "https://bcr.bazel.build/modules/bazel_skylib/1.2.0/MODULE.bazel": "44fe84260e454ed94ad326352a698422dbe372b21a1ac9f3eab76eb531223686",
+    "https://bcr.bazel.build/modules/bazel_skylib/1.2.1/MODULE.bazel": "f35baf9da0efe45fa3da1696ae906eea3d615ad41e2e3def4aeb4e8bc0ef9a7a",
+    "https://bcr.bazel.build/modules/bazel_skylib/1.3.0/MODULE.bazel": "20228b92868bf5cfc41bda7afc8a8ba2a543201851de39d990ec957b513579c5",
+    "https://bcr.bazel.build/modules/bazel_skylib/1.4.1/MODULE.bazel": "a0dcb779424be33100dcae821e9e27e4f2901d9dfd5333efe5ac6a8d7ab75e1d",
+    "https://bcr.bazel.build/modules/bazel_skylib/1.4.2/MODULE.bazel": "3bd40978e7a1fac911d5989e6b09d8f64921865a45822d8b09e815eaa726a651",
+    "https://bcr.bazel.build/modules/bazel_skylib/1.5.0/MODULE.bazel": "32880f5e2945ce6a03d1fbd588e9198c0a959bb42297b2cfaf1685b7bc32e138",
+    "https://bcr.bazel.build/modules/bazel_skylib/1.6.1/MODULE.bazel": "8fdee2dbaace6c252131c00e1de4b165dc65af02ea278476187765e1a617b917",
+    "https://bcr.bazel.build/modules/bazel_skylib/1.7.0/MODULE.bazel": "0db596f4563de7938de764cc8deeabec291f55e8ec15299718b93c4423e9796d",
+    "https://bcr.bazel.build/modules/bazel_skylib/1.7.1/MODULE.bazel": "3120d80c5861aa616222ec015332e5f8d3171e062e3e804a2a0253e1be26e59b",
+    "https://bcr.bazel.build/modules/bazel_skylib/1.8.1/MODULE.bazel": "88ade7293becda963e0e3ea33e7d54d3425127e0a326e0d17da085a5f1f03ff6",
+    "https://bcr.bazel.build/modules/bazel_skylib/1.8.2/MODULE.bazel": "69ad6927098316848b34a9142bcc975e018ba27f08c4ff403f50c1b6e646ca67",
+    "https://bcr.bazel.build/modules/bazel_skylib/1.8.2/source.json": "34a3c8bcf233b835eb74be9d628899bb32999d3e0eadef1947a0a562a2b16ffb",
+    "https://bcr.bazel.build/modules/bazel_worker_api/0.0.1/MODULE.bazel": "02a13b77321773b2042e70ee5e4c5e099c8ddee4cf2da9cd420442c36938d4bd",
+    "https://bcr.bazel.build/modules/bazel_worker_api/0.0.4/MODULE.bazel": "460aa12d01231a80cce03c548287b433b321d205b0028ae596728c35e5ee442e",
+    "https://bcr.bazel.build/modules/bazel_worker_api/0.0.4/source.json": "d353c410d47a8b65d09fa98e83d57ebec257a2c2b9c6e42d6fda1cb25e5464a5",
+    "https://bcr.bazel.build/modules/bazel_worker_java/0.0.4/MODULE.bazel": "82494a01018bb7ef06d4a17ec4cd7a758721f10eb8b6c820a818e70d669500db",
+    "https://bcr.bazel.build/modules/bazel_worker_java/0.0.4/source.json": "a2d30458fd86cf022c2b6331e652526fa08e17573b2f5034a9dbcacdf9c2583c",
+    "https://bcr.bazel.build/modules/buildozer/7.1.2/MODULE.bazel": "2e8dd40ede9c454042645fd8d8d0cd1527966aa5c919de86661e62953cd73d84",
+    "https://bcr.bazel.build/modules/buildozer/7.1.2/source.json": "c9028a501d2db85793a6996205c8de120944f50a0d570438fcae0457a5f9d1f8",
+    "https://bcr.bazel.build/modules/gazelle/0.32.0/MODULE.bazel": "b499f58a5d0d3537f3cf5b76d8ada18242f64ec474d8391247438bf04f58c7b8",
+    "https://bcr.bazel.build/modules/gazelle/0.33.0/MODULE.bazel": "a13a0f279b462b784fb8dd52a4074526c4a2afe70e114c7d09066097a46b3350",
+    "https://bcr.bazel.build/modules/gazelle/0.34.0/MODULE.bazel": "abdd8ce4d70978933209db92e436deb3a8b737859e9354fb5fd11fb5c2004c8a",
+    "https://bcr.bazel.build/modules/gazelle/0.36.0/MODULE.bazel": "e375d5d6e9a6ca59b0cb38b0540bc9a05b6aa926d322f2de268ad267a2ee74c0",
+    "https://bcr.bazel.build/modules/gazelle/0.40.0/MODULE.bazel": "42ba5378ebe845fca43989a53186ab436d956db498acde790685fe0e8f9c6146",
+    "https://bcr.bazel.build/modules/gazelle/0.40.0/source.json": "1e5ef6e4d8b9b6836d93273c781e78ff829ea2e077afef7a57298040fa4f010a",
+    "https://bcr.bazel.build/modules/google_benchmark/1.8.2/MODULE.bazel": "a70cf1bba851000ba93b58ae2f6d76490a9feb74192e57ab8e8ff13c34ec50cb",
+    "https://bcr.bazel.build/modules/googletest/1.11.0/MODULE.bazel": "3a83f095183f66345ca86aa13c58b59f9f94a2f81999c093d4eeaa2d262d12f4",
+    "https://bcr.bazel.build/modules/googletest/1.14.0.bcr.1/MODULE.bazel": "22c31a561553727960057361aa33bf20fb2e98584bc4fec007906e27053f80c6",
+    "https://bcr.bazel.build/modules/googletest/1.14.0/MODULE.bazel": "cfbcbf3e6eac06ef9d85900f64424708cc08687d1b527f0ef65aa7517af8118f",
+    "https://bcr.bazel.build/modules/googletest/1.15.2/MODULE.bazel": "6de1edc1d26cafb0ea1a6ab3f4d4192d91a312fd2d360b63adaa213cd00b2108",
+    "https://bcr.bazel.build/modules/googletest/1.15.2/source.json": "dbdda654dcb3a0d7a8bc5d0ac5fc7e150b58c2a986025ae5bc634bb2cb61f470",
+    "https://bcr.bazel.build/modules/jsoncpp/1.9.5/MODULE.bazel": "31271aedc59e815656f5736f282bb7509a97c7ecb43e927ac1a37966e0578075",
+    "https://bcr.bazel.build/modules/jsoncpp/1.9.6/MODULE.bazel": "2f8d20d3b7d54143213c4dfc3d98225c42de7d666011528dc8fe91591e2e17b0",
+    "https://bcr.bazel.build/modules/jsoncpp/1.9.6/source.json": "a04756d367a2126c3541682864ecec52f92cdee80a35735a3cb249ce015ca000",
+    "https://bcr.bazel.build/modules/libpfm/4.11.0/MODULE.bazel": "45061ff025b301940f1e30d2c16bea596c25b176c8b6b3087e92615adbd52902",
+    "https://bcr.bazel.build/modules/nlohmann_json/3.6.1/MODULE.bazel": "6f7b417dcc794d9add9e556673ad25cb3ba835224290f4f848f8e2db1e1fca74",
+    "https://bcr.bazel.build/modules/nlohmann_json/3.6.1/source.json": "f448c6e8963fdfa7eb831457df83ad63d3d6355018f6574fb017e8169deb43a9",
+    "https://bcr.bazel.build/modules/platforms/0.0.10/MODULE.bazel": "8cb8efaf200bdeb2150d93e162c40f388529a25852b332cec879373771e48ed5",
+    "https://bcr.bazel.build/modules/platforms/0.0.11/MODULE.bazel": "0daefc49732e227caa8bfa834d65dc52e8cc18a2faf80df25e8caea151a9413f",
+    "https://bcr.bazel.build/modules/platforms/0.0.4/MODULE.bazel": "9b328e31ee156f53f3c416a64f8491f7eb731742655a47c9eec4703a71644aee",
+    "https://bcr.bazel.build/modules/platforms/0.0.5/MODULE.bazel": "5733b54ea419d5eaf7997054bb55f6a1d0b5ff8aedf0176fef9eea44f3acda37",
+    "https://bcr.bazel.build/modules/platforms/0.0.6/MODULE.bazel": "ad6eeef431dc52aefd2d77ed20a4b353f8ebf0f4ecdd26a807d2da5aa8cd0615",
+    "https://bcr.bazel.build/modules/platforms/0.0.7/MODULE.bazel": "72fd4a0ede9ee5c021f6a8dd92b503e089f46c227ba2813ff183b71616034814",
+    "https://bcr.bazel.build/modules/platforms/0.0.8/MODULE.bazel": "9f142c03e348f6d263719f5074b21ef3adf0b139ee4c5133e2aa35664da9eb2d",
+    "https://bcr.bazel.build/modules/platforms/0.0.9/MODULE.bazel": "4a87a60c927b56ddd67db50c89acaa62f4ce2a1d2149ccb63ffd871d5ce29ebc",
+    "https://bcr.bazel.build/modules/platforms/1.0.0/MODULE.bazel": "f05feb42b48f1b3c225e4ccf351f367be0371411a803198ec34a389fb22aa580",
+    "https://bcr.bazel.build/modules/platforms/1.0.0/source.json": "f4ff1fd412e0246fd38c82328eb209130ead81d62dcd5a9e40910f867f733d96",
+    "https://bcr.bazel.build/modules/protobuf/21.7/MODULE.bazel": "a5a29bb89544f9b97edce05642fac225a808b5b7be74038ea3640fae2f8e66a7",
+    "https://bcr.bazel.build/modules/protobuf/23.1/MODULE.bazel": "88b393b3eb4101d18129e5db51847cd40a5517a53e81216144a8c32dfeeca52a",
+    "https://bcr.bazel.build/modules/protobuf/24.4/MODULE.bazel": "7bc7ce5f2abf36b3b7b7c8218d3acdebb9426aeb35c2257c96445756f970eb12",
+    "https://bcr.bazel.build/modules/protobuf/27.0/MODULE.bazel": "7873b60be88844a0a1d8f80b9d5d20cfbd8495a689b8763e76c6372998d3f64c",
+    "https://bcr.bazel.build/modules/protobuf/27.1/MODULE.bazel": "703a7b614728bb06647f965264967a8ef1c39e09e8f167b3ca0bb1fd80449c0d",
+    "https://bcr.bazel.build/modules/protobuf/27.2/MODULE.bazel": "32450b50673882e4c8c3d10a83f3bc82161b213ed2f80d17e38bece8f165c295",
+    "https://bcr.bazel.build/modules/protobuf/29.0-rc2/MODULE.bazel": "6241d35983510143049943fc0d57937937122baf1b287862f9dc8590fc4c37df",
+    "https://bcr.bazel.build/modules/protobuf/29.0-rc3/MODULE.bazel": "33c2dfa286578573afc55a7acaea3cada4122b9631007c594bf0729f41c8de92",
+    "https://bcr.bazel.build/modules/protobuf/29.0/MODULE.bazel": "319dc8bf4c679ff87e71b1ccfb5a6e90a6dbc4693501d471f48662ac46d04e4e",
+    "https://bcr.bazel.build/modules/protobuf/3.19.0/MODULE.bazel": "6b5fbb433f760a99a22b18b6850ed5784ef0e9928a72668b66e4d7ccd47db9b0",
+    "https://bcr.bazel.build/modules/protobuf/3.19.2/MODULE.bazel": "532ffe5f2186b69fdde039efe6df13ba726ff338c6bc82275ad433013fa10573",
+    "https://bcr.bazel.build/modules/protobuf/3.19.6/MODULE.bazel": "9233edc5e1f2ee276a60de3eaa47ac4132302ef9643238f23128fea53ea12858",
+    "https://bcr.bazel.build/modules/protobuf/31.1/MODULE.bazel": "379a389bb330b7b8c1cdf331cc90bf3e13de5614799b3b52cdb7c6f389f6b38e",
+    "https://bcr.bazel.build/modules/protobuf/31.1/source.json": "25af5d0219da0c0fc4d1191a24ce438e6ca7f49d2e1a94f354efeba6ef10426f",
+    "https://bcr.bazel.build/modules/pybind11_bazel/2.11.1/MODULE.bazel": "88af1c246226d87e65be78ed49ecd1e6f5e98648558c14ce99176da041dc378e",
+    "https://bcr.bazel.build/modules/pybind11_bazel/2.12.0/MODULE.bazel": "e6f4c20442eaa7c90d7190d8dc539d0ab422f95c65a57cc59562170c58ae3d34",
+    "https://bcr.bazel.build/modules/pybind11_bazel/2.12.0/source.json": "6900fdc8a9e95866b8c0d4ad4aba4d4236317b5c1cd04c502df3f0d33afed680",
+    "https://bcr.bazel.build/modules/re2/2023-09-01/MODULE.bazel": "cb3d511531b16cfc78a225a9e2136007a48cf8a677e4264baeab57fe78a80206",
+    "https://bcr.bazel.build/modules/re2/2024-07-02.bcr.1/MODULE.bazel": "b4963dda9b31080be1905ef085ecd7dd6cd47c05c79b9cdf83ade83ab2ab271a",
+    "https://bcr.bazel.build/modules/re2/2024-07-02.bcr.1/source.json": "2ff292be6ef3340325ce8a045ecc326e92cbfab47c7cbab4bd85d28971b97ac4",
+    "https://bcr.bazel.build/modules/re2/2024-07-02/MODULE.bazel": "0eadc4395959969297cbcf31a249ff457f2f1d456228c67719480205aa306daa",
+    "https://bcr.bazel.build/modules/rules_android/0.1.1/MODULE.bazel": "48809ab0091b07ad0182defb787c4c5328bd3a278938415c00a7b69b50c4d3a8",
+    "https://bcr.bazel.build/modules/rules_android/0.6.6/MODULE.bazel": "b0fb569752aab65ab1a9db0a8f6cfaf5aa1754965e17e95dcf0e4d88e192a68d",
+    "https://bcr.bazel.build/modules/rules_android/0.6.6/source.json": "a9d8dc2d5a102dc03269a94acc886a4cab82cdcb9ccbc77b0f665d6d17a6ae09",
+    "https://bcr.bazel.build/modules/rules_apple/3.16.0/MODULE.bazel": "0d1caf0b8375942ce98ea944be754a18874041e4e0459401d925577624d3a54a",
+    "https://bcr.bazel.build/modules/rules_apple/3.16.0/source.json": "d8b5fe461272018cc07cfafce11fe369c7525330804c37eec5a82f84cd475366",
+    "https://bcr.bazel.build/modules/rules_cc/0.0.1/MODULE.bazel": "cb2aa0747f84c6c3a78dad4e2049c154f08ab9d166b1273835a8174940365647",
+    "https://bcr.bazel.build/modules/rules_cc/0.0.10/MODULE.bazel": "ec1705118f7eaedd6e118508d3d26deba2a4e76476ada7e0e3965211be012002",
+    "https://bcr.bazel.build/modules/rules_cc/0.0.13/MODULE.bazel": "0e8529ed7b323dad0775ff924d2ae5af7640b23553dfcd4d34344c7e7a867191",
+    "https://bcr.bazel.build/modules/rules_cc/0.0.14/MODULE.bazel": "5e343a3aac88b8d7af3b1b6d2093b55c347b8eefc2e7d1442f7a02dc8fea48ac",
+    "https://bcr.bazel.build/modules/rules_cc/0.0.15/MODULE.bazel": "6704c35f7b4a72502ee81f61bf88706b54f06b3cbe5558ac17e2e14666cd5dcc",
+    "https://bcr.bazel.build/modules/rules_cc/0.0.16/MODULE.bazel": "7661303b8fc1b4d7f532e54e9d6565771fea666fbdf839e0a86affcd02defe87",
+    "https://bcr.bazel.build/modules/rules_cc/0.0.17/MODULE.bazel": "2ae1d8f4238ec67d7185d8861cb0a2cdf4bc608697c331b95bf990e69b62e64a",
+    "https://bcr.bazel.build/modules/rules_cc/0.0.2/MODULE.bazel": "6915987c90970493ab97393024c156ea8fb9f3bea953b2f3ec05c34f19b5695c",
+    "https://bcr.bazel.build/modules/rules_cc/0.0.6/MODULE.bazel": "abf360251023dfe3efcef65ab9d56beefa8394d4176dd29529750e1c57eaa33f",
+    "https://bcr.bazel.build/modules/rules_cc/0.0.8/MODULE.bazel": "964c85c82cfeb6f3855e6a07054fdb159aced38e99a5eecf7bce9d53990afa3e",
+    "https://bcr.bazel.build/modules/rules_cc/0.0.9/MODULE.bazel": "836e76439f354b89afe6a911a7adf59a6b2518fafb174483ad78a2a2fde7b1c5",
+    "https://bcr.bazel.build/modules/rules_cc/0.1.1/MODULE.bazel": "2f0222a6f229f0bf44cd711dc13c858dad98c62d52bd51d8fc3a764a83125513",
+    "https://bcr.bazel.build/modules/rules_cc/0.2.11/MODULE.bazel": "e94f24f065bf2191dba2dace951814378b66a94bb3bcc48077492fe0508059b5",
+    "https://bcr.bazel.build/modules/rules_cc/0.2.11/source.json": "4d555dc20c9c135b21b2e403cf0ce8393fb65711b2305979ce053df4ee3e78de",
+    "https://bcr.bazel.build/modules/rules_cc/0.2.8/MODULE.bazel": "f1df20f0bf22c28192a794f29b501ee2018fa37a3862a1a2132ae2940a23a642",
+    "https://bcr.bazel.build/modules/rules_foreign_cc/0.15.1/MODULE.bazel": "c2c60d26c79fda484acb95cdbec46e89d6b28b4845cb277160ce1e0c8622bb88",
+    "https://bcr.bazel.build/modules/rules_foreign_cc/0.15.1/source.json": "a161811a63ba8a859086da3b7ff3ad04f2e9c255d7727b41087103fc0eb22f55",
+    "https://bcr.bazel.build/modules/rules_foreign_cc/0.9.0/MODULE.bazel": "c9e8c682bf75b0e7c704166d79b599f93b72cfca5ad7477df596947891feeef6",
+    "https://bcr.bazel.build/modules/rules_fuzzing/0.5.2/MODULE.bazel": "40c97d1144356f52905566c55811f13b299453a14ac7769dfba2ac38192337a8",
+    "https://bcr.bazel.build/modules/rules_go/0.41.0/MODULE.bazel": "55861d8e8bb0e62cbd2896f60ff303f62ffcb0eddb74ecb0e5c0cbe36fc292c8",
+    "https://bcr.bazel.build/modules/rules_go/0.42.0/MODULE.bazel": "8cfa875b9aa8c6fce2b2e5925e73c1388173ea3c32a0db4d2b4804b453c14270",
+    "https://bcr.bazel.build/modules/rules_go/0.46.0/MODULE.bazel": "3477df8bdcc49e698b9d25f734c4f3a9f5931ff34ee48a2c662be168f5f2d3fd",
+    "https://bcr.bazel.build/modules/rules_go/0.50.1/MODULE.bazel": "b91a308dc5782bb0a8021ad4330c81fea5bda77f96b9e4c117b9b9c8f6665ee0",
+    "https://bcr.bazel.build/modules/rules_go/0.51.0-rc2/MODULE.bazel": "edfc3a9cea7bedb0eaaff37b0d7817c1a4bf72b3c615580b0ffcee6c52690fd4",
+    "https://bcr.bazel.build/modules/rules_go/0.51.0-rc2/source.json": "6b5cd0b3da2bd0e6949580851db990a04af0a285f072b9a0f059424457cd8cc9",
+    "https://bcr.bazel.build/modules/rules_java/4.0.0/MODULE.bazel": "5a78a7ae82cd1a33cef56dc578c7d2a46ed0dca12643ee45edbb8417899e6f74",
+    "https://bcr.bazel.build/modules/rules_java/5.3.5/MODULE.bazel": "a4ec4f2db570171e3e5eb753276ee4b389bae16b96207e9d3230895c99644b86",
+    "https://bcr.bazel.build/modules/rules_java/6.0.0/MODULE.bazel": "8a43b7df601a7ec1af61d79345c17b31ea1fedc6711fd4abfd013ea612978e39",
+    "https://bcr.bazel.build/modules/rules_java/6.3.0/MODULE.bazel": "a97c7678c19f236a956ad260d59c86e10a463badb7eb2eda787490f4c969b963",
+    "https://bcr.bazel.build/modules/rules_java/6.4.0/MODULE.bazel": "e986a9fe25aeaa84ac17ca093ef13a4637f6107375f64667a15999f77db6c8f6",
+    "https://bcr.bazel.build/modules/rules_java/6.5.2/MODULE.bazel": "1d440d262d0e08453fa0c4d8f699ba81609ed0e9a9a0f02cd10b3e7942e61e31",
+    "https://bcr.bazel.build/modules/rules_java/7.1.0/MODULE.bazel": "30d9135a2b6561c761bd67bd4990da591e6bdc128790ce3e7afd6a3558b2fb64",
+    "https://bcr.bazel.build/modules/rules_java/7.10.0/MODULE.bazel": "530c3beb3067e870561739f1144329a21c851ff771cd752a49e06e3dc9c2e71a",
+    "https://bcr.bazel.build/modules/rules_java/7.12.2/MODULE.bazel": "579c505165ee757a4280ef83cda0150eea193eed3bef50b1004ba88b99da6de6",
+    "https://bcr.bazel.build/modules/rules_java/7.2.0/MODULE.bazel": "06c0334c9be61e6cef2c8c84a7800cef502063269a5af25ceb100b192453d4ab",
+    "https://bcr.bazel.build/modules/rules_java/7.3.2/MODULE.bazel": "50dece891cfdf1741ea230d001aa9c14398062f2b7c066470accace78e412bc2",
+    "https://bcr.bazel.build/modules/rules_java/7.4.0/MODULE.bazel": "a592852f8a3dd539e82ee6542013bf2cadfc4c6946be8941e189d224500a8934",
+    "https://bcr.bazel.build/modules/rules_java/7.6.1/MODULE.bazel": "2f14b7e8a1aa2f67ae92bc69d1ec0fa8d9f827c4e17ff5e5f02e91caa3b2d0fe",
+    "https://bcr.bazel.build/modules/rules_java/8.13.0/MODULE.bazel": "0444ebf737d144cf2bb2ccb368e7f1cce735264285f2a3711785827c1686625e",
+    "https://bcr.bazel.build/modules/rules_java/8.13.0/source.json": "4605c0f676b87dd9d1fabd4d743b71f04d97503bd1a79aad53f87399fb5396de",
+    "https://bcr.bazel.build/modules/rules_java/8.3.2/MODULE.bazel": "7336d5511ad5af0b8615fdc7477535a2e4e723a357b6713af439fe8cf0195017",
+    "https://bcr.bazel.build/modules/rules_java/8.5.1/MODULE.bazel": "d8a9e38cc5228881f7055a6079f6f7821a073df3744d441978e7a43e20226939",
+    "https://bcr.bazel.build/modules/rules_java/8.6.0/MODULE.bazel": "9c064c434606d75a086f15ade5edb514308cccd1544c2b2a89bbac4310e41c71",
+    "https://bcr.bazel.build/modules/rules_java/8.6.1/MODULE.bazel": "f4808e2ab5b0197f094cabce9f4b006a27766beb6a9975931da07099560ca9c2",
+    "https://bcr.bazel.build/modules/rules_jvm_external/4.4.2/MODULE.bazel": "a56b85e418c83eb1839819f0b515c431010160383306d13ec21959ac412d2fe7",
+    "https://bcr.bazel.build/modules/rules_jvm_external/5.1/MODULE.bazel": "33f6f999e03183f7d088c9be518a63467dfd0be94a11d0055fe2d210f89aa909",
+    "https://bcr.bazel.build/modules/rules_jvm_external/5.2/MODULE.bazel": "d9351ba35217ad0de03816ef3ed63f89d411349353077348a45348b096615036",
+    "https://bcr.bazel.build/modules/rules_jvm_external/5.3/MODULE.bazel": "bf93870767689637164657731849fb887ad086739bd5d360d90007a581d5527d",
+    "https://bcr.bazel.build/modules/rules_jvm_external/6.1/MODULE.bazel": "75b5fec090dbd46cf9b7d8ea08cf84a0472d92ba3585b476f44c326eda8059c4",
+    "https://bcr.bazel.build/modules/rules_jvm_external/6.2/MODULE.bazel": "36a6e52487a855f33cb960724eb56547fa87e2c98a0474c3acad94339d7f8e99",
+    "https://bcr.bazel.build/modules/rules_jvm_external/6.3/MODULE.bazel": "c998e060b85f71e00de5ec552019347c8bca255062c990ac02d051bb80a38df0",
+    "https://bcr.bazel.build/modules/rules_jvm_external/6.6/MODULE.bazel": "153042249c7060536dc95b6bb9f9bb8063b8a0b0cb7acdb381bddbc2374aed55",
+    "https://bcr.bazel.build/modules/rules_jvm_external/6.7/MODULE.bazel": "e717beabc4d091ecb2c803c2d341b88590e9116b8bf7947915eeb33aab4f96dd",
+    "https://bcr.bazel.build/modules/rules_jvm_external/6.7/source.json": "5426f412d0a7fc6b611643376c7e4a82dec991491b9ce5cb1cfdd25fe2e92be4",
+    "https://bcr.bazel.build/modules/rules_kotlin/1.9.0/MODULE.bazel": "ef85697305025e5a61f395d4eaede272a5393cee479ace6686dba707de804d59",
+    "https://bcr.bazel.build/modules/rules_kotlin/1.9.5/MODULE.bazel": "043a16a572f610558ec2030db3ff0c9938574e7dd9f58bded1bb07c0192ef025",
+    "https://bcr.bazel.build/modules/rules_kotlin/1.9.6/MODULE.bazel": "d269a01a18ee74d0335450b10f62c9ed81f2321d7958a2934e44272fe82dcef3",
+    "https://bcr.bazel.build/modules/rules_kotlin/1.9.6/source.json": "2faa4794364282db7c06600b7e5e34867a564ae91bda7cae7c29c64e9466b7d5",
+    "https://bcr.bazel.build/modules/rules_license/0.0.3/MODULE.bazel": "627e9ab0247f7d1e05736b59dbb1b6871373de5ad31c3011880b4133cafd4bd0",
+    "https://bcr.bazel.build/modules/rules_license/0.0.7/MODULE.bazel": "088fbeb0b6a419005b89cf93fe62d9517c0a2b8bb56af3244af65ecfe37e7d5d",
+    "https://bcr.bazel.build/modules/rules_license/1.0.0/MODULE.bazel": "a7fda60eefdf3d8c827262ba499957e4df06f659330bbe6cdbdb975b768bb65c",
+    "https://bcr.bazel.build/modules/rules_license/1.0.0/source.json": "a52c89e54cc311196e478f8382df91c15f7a2bfdf4c6cd0e2675cc2ff0b56efb",
+    "https://bcr.bazel.build/modules/rules_pkg/0.7.0/MODULE.bazel": "df99f03fc7934a4737122518bb87e667e62d780b610910f0447665a7e2be62dc",
+    "https://bcr.bazel.build/modules/rules_pkg/1.0.1/MODULE.bazel": "5b1df97dbc29623bccdf2b0dcd0f5cb08e2f2c9050aab1092fd39a41e82686ff",
+    "https://bcr.bazel.build/modules/rules_pkg/1.0.1/source.json": "bd82e5d7b9ce2d31e380dd9f50c111d678c3bdaca190cb76b0e1c71b05e1ba8a",
+    "https://bcr.bazel.build/modules/rules_proto/4.0.0/MODULE.bazel": "a7a7b6ce9bee418c1a760b3d84f83a299ad6952f9903c67f19e4edd964894e06",
+    "https://bcr.bazel.build/modules/rules_proto/5.3.0-21.7/MODULE.bazel": "e8dff86b0971688790ae75528fe1813f71809b5afd57facb44dad9e8eca631b7",
+    "https://bcr.bazel.build/modules/rules_proto/6.0.0-rc1/MODULE.bazel": "1e5b502e2e1a9e825eef74476a5a1ee524a92297085015a052510b09a1a09483",
+    "https://bcr.bazel.build/modules/rules_proto/6.0.0/MODULE.bazel": "b531d7f09f58dce456cd61b4579ce8c86b38544da75184eadaf0a7cb7966453f",
+    "https://bcr.bazel.build/modules/rules_proto/6.0.2/MODULE.bazel": "ce916b775a62b90b61888052a416ccdda405212b6aaeb39522f7dc53431a5e73",
+    "https://bcr.bazel.build/modules/rules_proto/7.0.2/MODULE.bazel": "bf81793bd6d2ad89a37a40693e56c61b0ee30f7a7fdbaf3eabbf5f39de47dea2",
+    "https://bcr.bazel.build/modules/rules_proto/7.0.2/source.json": "1e5e7260ae32ef4f2b52fd1d0de8d03b606a44c91b694d2f1afb1d3b28a48ce1",
+    "https://bcr.bazel.build/modules/rules_python/0.10.2/MODULE.bazel": "cc82bc96f2997baa545ab3ce73f196d040ffb8756fd2d66125a530031cd90e5f",
+    "https://bcr.bazel.build/modules/rules_python/0.23.1/MODULE.bazel": "49ffccf0511cb8414de28321f5fcf2a31312b47c40cc21577144b7447f2bf300",
+    "https://bcr.bazel.build/modules/rules_python/0.25.0/MODULE.bazel": "72f1506841c920a1afec76975b35312410eea3aa7b63267436bfb1dd91d2d382",
+    "https://bcr.bazel.build/modules/rules_python/0.28.0/MODULE.bazel": "cba2573d870babc976664a912539b320cbaa7114cd3e8f053c720171cde331ed",
+    "https://bcr.bazel.build/modules/rules_python/0.31.0/MODULE.bazel": "93a43dc47ee570e6ec9f5779b2e64c1476a6ce921c48cc9a1678a91dd5f8fd58",
+    "https://bcr.bazel.build/modules/rules_python/0.33.2/MODULE.bazel": "3e036c4ad8d804a4dad897d333d8dce200d943df4827cb849840055be8d2e937",
+    "https://bcr.bazel.build/modules/rules_python/0.37.1/MODULE.bazel": "3faeb2d9fa0a81f8980643ee33f212308f4d93eea4b9ce6f36d0b742e71e9500",
+    "https://bcr.bazel.build/modules/rules_python/0.37.2/MODULE.bazel": "b5ffde91410745750b6c13be1c5dc4555ef5bc50562af4a89fd77807fdde626a",
+    "https://bcr.bazel.build/modules/rules_python/0.4.0/MODULE.bazel": "9208ee05fd48bf09ac60ed269791cf17fb343db56c8226a720fbb1cdf467166c",
+    "https://bcr.bazel.build/modules/rules_python/0.40.0/MODULE.bazel": "9d1a3cd88ed7d8e39583d9ffe56ae8a244f67783ae89b60caafc9f5cf318ada7",
+    "https://bcr.bazel.build/modules/rules_python/1.0.0/MODULE.bazel": "898a3d999c22caa585eb062b600f88654bf92efb204fa346fb55f6f8edffca43",
+    "https://bcr.bazel.build/modules/rules_python/1.2.0/MODULE.bazel": "5aeeb48b2a6c19d668b48adf2b8a2b209a6310c230db0ce77450f148a89846e4",
+    "https://bcr.bazel.build/modules/rules_python/1.6.3/MODULE.bazel": "a7b80c42cb3de5ee2a5fa1abc119684593704fcd2fec83165ebe615dec76574f",
+    "https://bcr.bazel.build/modules/rules_python/1.6.3/source.json": "f0be74977e5604a6526c8a416cda22985093ff7d5d380d41722d7e44015cc419",
+    "https://bcr.bazel.build/modules/rules_robolectric/4.14.1.2/MODULE.bazel": "d44fec647d0aeb67b9f3b980cf68ba634976f3ae7ccd6c07d790b59b87a4f251",
+    "https://bcr.bazel.build/modules/rules_robolectric/4.14.1.2/source.json": "37c10335f2361c337c5c1f34ed36d2da70534c23088062b33a8bdaab68aa9dea",
+    "https://bcr.bazel.build/modules/rules_shell/0.1.2/MODULE.bazel": "66e4ca3ce084b04af0b9ff05ff14cab4e5df7503973818bb91cbc6cda08d32fc",
+    "https://bcr.bazel.build/modules/rules_shell/0.2.0/MODULE.bazel": "fda8a652ab3c7d8fee214de05e7a9916d8b28082234e8d2c0094505c5268ed3c",
+    "https://bcr.bazel.build/modules/rules_shell/0.3.0/MODULE.bazel": "de4402cd12f4cc8fda2354fce179fdb068c0b9ca1ec2d2b17b3e21b24c1a937b",
+    "https://bcr.bazel.build/modules/rules_shell/0.6.1/MODULE.bazel": "72e76b0eea4e81611ef5452aa82b3da34caca0c8b7b5c0c9584338aa93bae26b",
+    "https://bcr.bazel.build/modules/rules_shell/0.6.1/source.json": "20ec05cd5e592055e214b2da8ccb283c7f2a421ea0dc2acbf1aa792e11c03d0c",
+    "https://bcr.bazel.build/modules/rules_swift/1.16.0/MODULE.bazel": "4a09f199545a60d09895e8281362b1ff3bb08bbde69c6fc87aff5b92fcc916ca",
+    "https://bcr.bazel.build/modules/rules_swift/2.1.1/MODULE.bazel": "494900a80f944fc7aa61500c2073d9729dff0b764f0e89b824eb746959bc1046",
+    "https://bcr.bazel.build/modules/rules_swift/2.1.1/source.json": "40fc69dfaac64deddbb75bd99cdac55f4427d9ca0afbe408576a65428427a186",
+    "https://bcr.bazel.build/modules/stardoc/0.5.1/MODULE.bazel": "1a05d92974d0c122f5ccf09291442580317cdd859f07a8655f1db9a60374f9f8",
+    "https://bcr.bazel.build/modules/stardoc/0.5.3/MODULE.bazel": "c7f6948dae6999bf0db32c1858ae345f112cacf98f174c7a8bb707e41b974f1c",
+    "https://bcr.bazel.build/modules/stardoc/0.5.6/MODULE.bazel": "c43dabc564990eeab55e25ed61c07a1aadafe9ece96a4efabb3f8bf9063b71ef",
+    "https://bcr.bazel.build/modules/stardoc/0.6.2/MODULE.bazel": "7060193196395f5dd668eda046ccbeacebfd98efc77fed418dbe2b82ffaa39fd",
+    "https://bcr.bazel.build/modules/stardoc/0.7.0/MODULE.bazel": "05e3d6d30c099b6770e97da986c53bd31844d7f13d41412480ea265ac9e8079c",
+    "https://bcr.bazel.build/modules/stardoc/0.7.1/MODULE.bazel": "3548faea4ee5dda5580f9af150e79d0f6aea934fc60c1cc50f4efdd9420759e7",
+    "https://bcr.bazel.build/modules/stardoc/0.7.2/MODULE.bazel": "fc152419aa2ea0f51c29583fab1e8c99ddefd5b3778421845606ee628629e0e5",
+    "https://bcr.bazel.build/modules/stardoc/0.7.2/source.json": "58b029e5e901d6802967754adf0a9056747e8176f017cfe3607c0851f4d42216",
+    "https://bcr.bazel.build/modules/swift_argument_parser/1.3.1.1/MODULE.bazel": "5e463fbfba7b1701d957555ed45097d7f984211330106ccd1352c6e0af0dcf91",
+    "https://bcr.bazel.build/modules/swift_argument_parser/1.3.1.1/source.json": "32bd87e5f4d7acc57c5b2ff7c325ae3061d5e242c0c4c214ae87e0f1c13e54cb",
+    "https://bcr.bazel.build/modules/upb/0.0.0-20220923-a547704/MODULE.bazel": "7298990c00040a0e2f121f6c32544bab27d4452f80d9ce51349b1a28f3005c43",
+    "https://bcr.bazel.build/modules/upb/0.0.0-20230516-61a97ef/MODULE.bazel": "c0df5e35ad55e264160417fd0875932ee3c9dda63d9fccace35ac62f45e1b6f9",
+    "https://bcr.bazel.build/modules/zlib/1.2.11/MODULE.bazel": "07b389abc85fdbca459b69e2ec656ae5622873af3f845e1c9d80fe179f3effa0",
+    "https://bcr.bazel.build/modules/zlib/1.2.12/MODULE.bazel": "3b1a8834ada2a883674be8cbd36ede1b6ec481477ada359cd2d3ddc562340b27",
+    "https://bcr.bazel.build/modules/zlib/1.3.1.bcr.3/MODULE.bazel": "af322bc08976524477c79d1e45e241b6efbeb918c497e8840b8ab116802dda79",
+    "https://bcr.bazel.build/modules/zlib/1.3.1.bcr.5/MODULE.bazel": "eec517b5bbe5492629466e11dae908d043364302283de25581e3eb944326c4ca",
+    "https://bcr.bazel.build/modules/zlib/1.3.1.bcr.5/source.json": "22bc55c47af97246cfc093d0acf683a7869377de362b5d1c552c2c2e16b7a806",
+    "https://bcr.bazel.build/modules/zlib/1.3.1/MODULE.bazel": "751c9940dcfe869f5f7274e1295422a34623555916eb98c174c1e945594bf198"
+  },
+  "selectedYankedVersions": {},
+  "moduleExtensions": {
+    "//:extensions.bzl%llvm_deps_extension": {
+      "general": {
+        "bzlTransitiveDigest": "LGeZ4Ibt22AGXloFt/bm3EsBB05m6aTG+WxfH8fJVB4=",
+        "usagesDigest": "dHBLC1g5cqg/flxcuZRJMp2heDoB4+0/NDd6MutLhGE=",
+        "recordedFileInputs": {},
+        "recordedDirentsInputs": {},
+        "envVariables": {},
+        "generatedRepoSpecs": {
+          "llvm-raw": {
+            "repoRuleId": "@@bazel_tools//tools/build_defs/repo:local.bzl%new_local_repository",
+            "attributes": {
+              "build_file_content": "# empty",
+              "path": "../../"
+            }
+          },
+          "llvm_zlib": {
+            "repoRuleId": "@@bazel_tools//tools/build_defs/repo:http.bzl%http_archive",
+            "attributes": {
+              "build_file": "@@+llvm_deps_extension+llvm-raw//utils/bazel/third_party_build:zlib-ng.BUILD",
+              "sha256": "e36bb346c00472a1f9ff2a0a4643e590a254be6379da7cddd9daeb9a7f296731",
+              "strip_prefix": "zlib-ng-2.0.7",
+              "urls": [
+                "https://github.com/zlib-ng/zlib-ng/archive/refs/tags/2.0.7.zip"
+              ]
+            }
+          },
+          "vulkan_headers": {
+            "repoRuleId": "@@bazel_tools//tools/build_defs/repo:http.bzl%http_archive",
+            "attributes": {
+              "build_file": "@@+llvm_deps_extension+llvm-raw//utils/bazel/third_party_build:vulkan_headers.BUILD",
+              "sha256": "19f491784ef0bc73caff877d11c96a48b946b5a1c805079d9006e3fbaa5c1895",
+              "strip_prefix": "Vulkan-Headers-9bd3f561bcee3f01d22912de10bb07ce4e23d378",
+              "urls": [
+                "https://github.com/KhronosGroup/Vulkan-Headers/archive/9bd3f561bcee3f01d22912de10bb07ce4e23d378.tar.gz"
+              ]
+            }
+          },
+          "vulkan_sdk_setup": {
+            "repoRuleId": "@@//:vulkan_sdk.bzl%vulkan_sdk_setup",
+            "attributes": {}
+          },
+          "gmp": {
+            "repoRuleId": "@@bazel_tools//tools/build_defs/repo:http.bzl%http_archive",
+            "attributes": {
+              "urls": [
+                "https://gmplib.org/download/gmp/gmp-6.2.1.tar.xz",
+                "https://ftp.gnu.org/gnu/gmp/gmp-6.2.1.tar.xz"
+              ],
+              "build_file": "@@+llvm_deps_extension+llvm-raw//utils/bazel/third_party_build:gmp.BUILD",
+              "sha256": "fd4829912cddd12f84181c3451cc752be224643e87fac497b69edddadc49b4f2",
+              "strip_prefix": "gmp-6.2.1"
+            }
+          },
+          "mpfr": {
+            "repoRuleId": "@@bazel_tools//tools/build_defs/repo:http.bzl%http_archive",
+            "attributes": {
+              "urls": [
+                "https://www.mpfr.org/mpfr-current/mpfr-4.2.2.tar.gz"
+              ],
+              "sha256": "826cbb24610bd193f36fde172233fb8c009f3f5c2ad99f644d0dea2e16a20e42",
+              "strip_prefix": "mpfr-4.2.2",
+              "build_file": "@@+llvm_deps_extension+llvm-raw//utils/bazel/third_party_build:mpfr.BUILD"
+            }
+          },
+          "mpc": {
+            "repoRuleId": "@@bazel_tools//tools/build_defs/repo:http.bzl%http_archive",
+            "attributes": {
+              "urls": [
+                "https://ftp.gnu.org/gnu/mpc/mpc-1.3.1.tar.gz"
+              ],
+              "sha256": "ab642492f5cf882b74aa0cb730cd410a81edcdbec895183ce930e706c1c759b8",
+              "strip_prefix": "mpc-1.3.1",
+              "build_file": "@@+llvm_deps_extension+llvm-raw//utils/bazel/third_party_build:mpc.BUILD"
+            }
+          },
+          "pfm": {
+            "repoRuleId": "@@bazel_tools//tools/build_defs/repo:http.bzl%http_archive",
+            "attributes": {
+              "urls": [
+                "https://versaweb.dl.sourceforge.net/project/perfmon2/libpfm4/libpfm-4.13.0.tar.gz"
+              ],
+              "sha256": "d18b97764c755528c1051d376e33545d0eb60c6ebf85680436813fa5b04cc3d1",
+              "strip_prefix": "libpfm-4.13.0",
+              "build_file": "@@+llvm_deps_extension+llvm-raw//utils/bazel/third_party_build:pfm.BUILD"
+            }
+          },
+          "llvm_zstd": {
+            "repoRuleId": "@@bazel_tools//tools/build_defs/repo:http.bzl%http_archive",
+            "attributes": {
+              "build_file": "@@+llvm_deps_extension+llvm-raw//utils/bazel/third_party_build:zstd.BUILD",
+              "sha256": "7c42d56fac126929a6a85dbc73ff1db2411d04f104fae9bdea51305663a83fd0",
+              "strip_prefix": "zstd-1.5.2",
+              "urls": [
+                "https://github.com/facebook/zstd/releases/download/v1.5.2/zstd-1.5.2.tar.gz"
+              ]
+            }
+          },
+          "pybind11": {
+            "repoRuleId": "@@bazel_tools//tools/build_defs/repo:http.bzl%http_archive",
+            "attributes": {
+              "url": "https://github.com/pybind/pybind11/archive/v2.10.3.zip",
+              "sha256": "201966a61dc826f1b1879a24a3317a1ec9214a918c8eb035be2f30c3e9cfbdcb",
+              "strip_prefix": "pybind11-2.10.3",
+              "build_file": "@@+llvm_deps_extension+llvm-raw//utils/bazel/third_party_build:pybind.BUILD"
+            }
+          },
+          "pyyaml": {
+            "repoRuleId": "@@bazel_tools//tools/build_defs/repo:http.bzl%http_archive",
+            "attributes": {
+              "url": "https://github.com/yaml/pyyaml/archive/refs/tags/5.1.zip",
+              "sha256": "f0a35d7f282a6d6b1a4f3f3965ef5c124e30ed27a0088efb97c0977268fd671f",
+              "strip_prefix": "pyyaml-5.1/lib3",
+              "build_file": "@@+llvm_deps_extension+llvm-raw//utils/bazel/third_party_build:pyyaml.BUILD"
+            }
+          },
+          "robin_map": {
+            "repoRuleId": "@@bazel_tools//tools/build_defs/repo:http.bzl%http_archive",
+            "attributes": {
+              "build_file": "@@+llvm_deps_extension+llvm-raw//utils/bazel/third_party_build:robin_map.BUILD",
+              "sha256": "a8424ad3b0affd4c57ed26f0f3d8a29604f0e1f2ef2089f497f614b1c94c7236",
+              "strip_prefix": "robin-map-1.3.0",
+              "url": "https://github.com/Tessil/robin-map/archive/refs/tags/v1.3.0.tar.gz"
+            }
+          },
+          "nanobind": {
+            "repoRuleId": "@@bazel_tools//tools/build_defs/repo:http.bzl%http_archive",
+            "attributes": {
+              "build_file": "@@+llvm_deps_extension+llvm-raw//utils/bazel/third_party_build:nanobind.BUILD",
+              "sha256": "8ce3667dce3e64fc06bfb9b778b6f48731482362fb89a43da156632266cd5a90",
+              "strip_prefix": "nanobind-2.9.2",
+              "url": "https://github.com/wjakob/nanobind/archive/refs/tags/v2.9.2.tar.gz"
+            }
+          }
+        },
+        "recordedRepoMappingEntries": [
+          [
+            "",
+            "bazel_tools",
+            "bazel_tools"
+          ]
+        ]
+      }
+    },
+    "@@rules_android+//rules/android_sdk_repository:rule.bzl%android_sdk_repository_extension": {
+      "general": {
+        "bzlTransitiveDigest": "NAy+0M15JNVEBb8Tny6t7j3lKqTnsAMjoBB6LJ+C370=",
+        "usagesDigest": "g9Ur6X6qhf9a8MmY9qXU/jFjkyk/aZVBegI0yVMF0z4=",
+        "recordedFileInputs": {},
+        "recordedDirentsInputs": {},
+        "envVariables": {},
+        "generatedRepoSpecs": {
+          "androidsdk": {
+            "repoRuleId": "@@rules_android+//rules/android_sdk_repository:rule.bzl%_android_sdk_repository",
+            "attributes": {}
+          }
+        },
+        "recordedRepoMappingEntries": []
+      }
+    },
+    "@@rules_kotlin+//src/main/starlark/core/repositories:bzlmod_setup.bzl%rules_kotlin_extensions": {
+      "general": {
+        "bzlTransitiveDigest": "sFhcgPbDQehmbD1EOXzX4H1q/CD5df8zwG4kp4jbvr8=",
+        "usagesDigest": "QI2z8ZUR+mqtbwsf2fLqYdJAkPOHdOV+tF2yVAUgRzw=",
+        "recordedFileInputs": {},
+        "recordedDirentsInputs": {},
+        "envVariables": {},
+        "generatedRepoSpecs": {
+          "com_github_jetbrains_kotlin_git": {
+            "repoRuleId": "@@rules_kotlin+//src/main/starlark/core/repositories:compiler.bzl%kotlin_compiler_git_repository",
+            "attributes": {
+              "urls": [
+                "https://github.com/JetBrains/kotlin/releases/download/v1.9.23/kotlin-compiler-1.9.23.zip"
+              ],
+              "sha256": "93137d3aab9afa9b27cb06a824c2324195c6b6f6179d8a8653f440f5bd58be88"
+            }
+          },
+          "com_github_jetbrains_kotlin": {
+            "repoRuleId": "@@rules_kotlin+//src/main/starlark/core/repositories:compiler.bzl%kotlin_capabilities_repository",
+            "attributes": {
+              "git_repository_name": "com_github_jetbrains_kotlin_git",
+              "compiler_version": "1.9.23"
+            }
+          },
+          "com_github_google_ksp": {
+            "repoRuleId": "@@rules_kotlin+//src/main/starlark/core/repositories:ksp.bzl%ksp_compiler_plugin_repository",
+            "attributes": {
+              "urls": [
+                "https://github.com/google/ksp/releases/download/1.9.23-1.0.20/artifacts.zip"
+              ],
+              "sha256": "ee0618755913ef7fd6511288a232e8fad24838b9af6ea73972a76e81053c8c2d",
+              "strip_version": "1.9.23-1.0.20"
+            }
+          },
+          "com_github_pinterest_ktlint": {
+            "repoRuleId": "@@bazel_tools//tools/build_defs/repo:http.bzl%http_file",
+            "attributes": {
+              "sha256": "01b2e0ef893383a50dbeb13970fe7fa3be36ca3e83259e01649945b09d736985",
+              "urls": [
+                "https://github.com/pinterest/ktlint/releases/download/1.3.0/ktlint"
+              ],
+              "executable": true
+            }
+          },
+          "rules_android": {
+            "repoRuleId": "@@bazel_tools//tools/build_defs/repo:http.bzl%http_archive",
+            "attributes": {
+              "sha256": "cd06d15dd8bb59926e4d65f9003bfc20f9da4b2519985c27e190cddc8b7a7806",
+              "strip_prefix": "rules_android-0.1.1",
+              "urls": [
+                "https://github.com/bazelbuild/rules_android/archive/v0.1.1.zip"
+              ]
+            }
+          }
+        },
+        "recordedRepoMappingEntries": [
+          [
+            "rules_kotlin+",
+            "bazel_tools",
+            "bazel_tools"
+          ]
+        ]
+      }
+    },
+    "@@rules_python+//python/uv:uv.bzl%uv": {
+      "general": {
+        "bzlTransitiveDigest": "477hS4MXeJ7LqPNLTqL+1ltraV5lqwOw3tEXWqnJRt8=",
+        "usagesDigest": "icnInV8HDGrRQf9x8RMfxWfBHgT3OgRlYovS/9POEJw=",
+        "recordedFileInputs": {},
+        "recordedDirentsInputs": {},
+        "envVariables": {},
+        "generatedRepoSpecs": {
+          "uv": {
+            "repoRuleId": "@@rules_python+//python/uv/private:uv_toolchains_repo.bzl%uv_toolchains_repo",
+            "attributes": {
+              "toolchain_type": "'@@rules_python+//python/uv:uv_toolchain_type'",
+              "toolchain_names": [
+                "none"
+              ],
+              "toolchain_implementations": {
+                "none": "'@@rules_python+//python:none'"
+              },
+              "toolchain_compatible_with": {
+                "none": [
+                  "@platforms//:incompatible"
+                ]
+              },
+              "toolchain_target_settings": {}
+            }
+          }
+        },
+        "recordedRepoMappingEntries": [
+          [
+            "rules_python+",
+            "bazel_tools",
+            "bazel_tools"
+          ]
+        ]
+      }
+    }
+  }
+}
diff --git a/utils/bazel/extensions.bzl b/utils/bazel/extensions.bzl
new file mode 100644
index 0000000000000..b0d5871b722a7
--- /dev/null
+++ b/utils/bazel/extensions.bzl
@@ -0,0 +1,127 @@
+# This file is licensed under the Apache License v2.0 with LLVM Exceptions.
+# See https://llvm.org/LICENSE.txt for license information.
+# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+
+"""bzlmod extensions for llvm-project"""
+
+load("@bazel_tools//tools/build_defs/repo:http.bzl", "http_archive")
+load("@bazel_tools//tools/build_defs/repo:local.bzl", "new_local_repository")
+load(":vulkan_sdk.bzl", "vulkan_sdk_setup")
+
+def _llvm_repos_extension_impl(module_ctx):
+    if any([m.is_root and m.name == "llvm-project-overlay" for m in module_ctx.modules]):
+        new_local_repository(
+            name = "llvm-raw",
+            build_file_content = "# empty",
+            path = "../../",
+        )
+
+    http_archive(
+        name = "llvm_zlib",
+        build_file = "@llvm-raw//utils/bazel/third_party_build:zlib-ng.BUILD",
+        sha256 = "e36bb346c00472a1f9ff2a0a4643e590a254be6379da7cddd9daeb9a7f296731",
+        strip_prefix = "zlib-ng-2.0.7",
+        urls = [
+            "https://github.com/zlib-ng/zlib-ng/archive/refs/tags/2.0.7.zip",
+        ],
+    )
+
+    http_archive(
+        name = "vulkan_headers",
+        build_file = "@llvm-raw//utils/bazel/third_party_build:vulkan_headers.BUILD",
+        sha256 = "19f491784ef0bc73caff877d11c96a48b946b5a1c805079d9006e3fbaa5c1895",
+        strip_prefix = "Vulkan-Headers-9bd3f561bcee3f01d22912de10bb07ce4e23d378",
+        urls = [
+            "https://github.com/KhronosGroup/Vulkan-Headers/archive/9bd3f561bcee3f01d22912de10bb07ce4e23d378.tar.gz",
+        ],
+    )
+
+    vulkan_sdk_setup(name = "vulkan_sdk_setup")
+
+    http_archive(
+        name = "gmp",
+        urls = [
+            "https://gmplib.org/download/gmp/gmp-6.2.1.tar.xz",
+            "https://ftp.gnu.org/gnu/gmp/gmp-6.2.1.tar.xz",
+        ],
+        build_file = "@llvm-raw//utils/bazel/third_party_build:gmp.BUILD",
+        sha256 = "fd4829912cddd12f84181c3451cc752be224643e87fac497b69edddadc49b4f2",
+        strip_prefix = "gmp-6.2.1",
+    )
+
+    http_archive(
+        name = "mpfr",
+        urls = [
+            "https://www.mpfr.org/mpfr-current/mpfr-4.2.2.tar.gz",
+        ],
+        sha256 = "826cbb24610bd193f36fde172233fb8c009f3f5c2ad99f644d0dea2e16a20e42",
+        strip_prefix = "mpfr-4.2.2",
+        build_file = "@llvm-raw//utils/bazel/third_party_build:mpfr.BUILD",
+    )
+
+    http_archive(
+        name = "mpc",
+        urls = [
+            "https://ftp.gnu.org/gnu/mpc/mpc-1.3.1.tar.gz",
+        ],
+        sha256 = "ab642492f5cf882b74aa0cb730cd410a81edcdbec895183ce930e706c1c759b8",
+        strip_prefix = "mpc-1.3.1",
+        build_file = "@llvm-raw//utils/bazel/third_party_build:mpc.BUILD",
+    )
+
+    http_archive(
+        name = "pfm",
+        urls = [
+            "https://versaweb.dl.sourceforge.net/project/perfmon2/libpfm4/libpfm-4.13.0.tar.gz",
+        ],
+        sha256 = "d18b97764c755528c1051d376e33545d0eb60c6ebf85680436813fa5b04cc3d1",
+        strip_prefix = "libpfm-4.13.0",
+        build_file = "@llvm-raw//utils/bazel/third_party_build:pfm.BUILD",
+    )
+
+    http_archive(
+        name = "llvm_zstd",
+        build_file = "@llvm-raw//utils/bazel/third_party_build:zstd.BUILD",
+        sha256 = "7c42d56fac126929a6a85dbc73ff1db2411d04f104fae9bdea51305663a83fd0",
+        strip_prefix = "zstd-1.5.2",
+        urls = [
+            "https://github.com/facebook/zstd/releases/download/v1.5.2/zstd-1.5.2.tar.gz",
+        ],
+    )
+
+    http_archive(
+        name = "pybind11",
+        url = "https://github.com/pybind/pybind11/archive/v2.10.3.zip",
+        sha256 = "201966a61dc826f1b1879a24a3317a1ec9214a918c8eb035be2f30c3e9cfbdcb",
+        strip_prefix = "pybind11-2.10.3",
+        build_file = "@llvm-raw//utils/bazel/third_party_build:pybind.BUILD",
+    )
+
+    http_archive(
+        name = "pyyaml",
+        url = "https://github.com/yaml/pyyaml/archive/refs/tags/5.1.zip",
+        sha256 = "f0a35d7f282a6d6b1a4f3f3965ef5c124e30ed27a0088efb97c0977268fd671f",
+        strip_prefix = "pyyaml-5.1/lib3",
+        build_file = "@llvm-raw//utils/bazel/third_party_build:pyyaml.BUILD",
+    )
+
+    # TODO: bump to robin-map-1.4.0
+    http_archive(
+        name = "robin_map",
+        build_file = "@llvm-raw//utils/bazel/third_party_build:robin_map.BUILD",
+        sha256 = "a8424ad3b0affd4c57ed26f0f3d8a29604f0e1f2ef2089f497f614b1c94c7236",
+        strip_prefix = "robin-map-1.3.0",
+        url = "https://github.com/Tessil/robin-map/archive/refs/tags/v1.3.0.tar.gz",
+    )
+
+    http_archive(
+        name = "nanobind",
+        build_file = "@llvm-raw//utils/bazel/third_party_build:nanobind.BUILD",
+        sha256 = "8ce3667dce3e64fc06bfb9b778b6f48731482362fb89a43da156632266cd5a90",
+        strip_prefix = "nanobind-2.9.2",
+        url = "https://github.com/wjakob/nanobind/archive/refs/tags/v2.9.2.tar.gz",
+    )
+
+llvm_repos_extension = module_extension(
+    implementation = _llvm_repos_extension_impl,
+)

From 47d9d735a7aef937256536af490876879c4b4731 Mon Sep 17 00:00:00 2001
From: Asher Mancinelli <ashermancinelli@gmail.com>
Date: Tue, 18 Nov 2025 07:55:11 -0800
Subject: [PATCH 29/33] [MLIR][Python] Add arg_attrs and res_attrs to gpu func
 (#168475)

I missed these attributes when I added the wrapper for GPUFuncOp in
fbdd98f74f0d.
---
 mlir/python/mlir/dialects/gpu/__init__.py |  6 ++++--
 mlir/test/python/dialects/gpu/dialect.py  | 12 +++++++-----
 2 files changed, 11 insertions(+), 7 deletions(-)

diff --git a/mlir/python/mlir/dialects/gpu/__init__.py b/mlir/python/mlir/dialects/gpu/__init__.py
index 2fbcbb059f87a..d15643ca700e4 100644
--- a/mlir/python/mlir/dialects/gpu/__init__.py
+++ b/mlir/python/mlir/dialects/gpu/__init__.py
@@ -49,13 +49,13 @@ class GPUFuncOp(GPUFuncOp):
 
     FUNCTION_TYPE_ATTR_NAME = "function_type"
     SYM_NAME_ATTR_NAME = "sym_name"
-    ARGUMENT_ATTR_NAME = "arg_attrs"
-    RESULT_ATTR_NAME = "res_attrs"
 
     def __init__(
         self,
         function_type: Union[FunctionType, TypeAttr],
         sym_name: Optional[Union[str, StringAttr]] = None,
+        arg_attrs: Optional[Sequence[dict]] = None,
+        res_attrs: Optional[Sequence[dict]] = None,
         kernel: Optional[bool] = None,
         workgroup_attrib_attrs: Optional[Sequence[dict]] = None,
         private_attrib_attrs: Optional[Sequence[dict]] = None,
@@ -88,6 +88,8 @@ def __init__(
         )
         super().__init__(
             function_type,
+            arg_attrs=arg_attrs,
+            res_attrs=res_attrs,
             workgroup_attrib_attrs=workgroup_attrib_attrs,
             private_attrib_attrs=private_attrib_attrs,
             loc=loc,
diff --git a/mlir/test/python/dialects/gpu/dialect.py b/mlir/test/python/dialects/gpu/dialect.py
index 3945c99c41091..1a009b7dfa30d 100644
--- a/mlir/test/python/dialects/gpu/dialect.py
+++ b/mlir/test/python/dialects/gpu/dialect.py
@@ -133,9 +133,10 @@ def builder(func: gpu.GPUFuncOp) -> None:
             ), func.known_grid_size
 
             func = gpu.GPUFuncOp(
-                func_type,
+                ir.FunctionType.get(inputs=[T.index()], results=[]),
                 sym_name="non_kernel_func",
                 body_builder=builder,
+                arg_attrs=[{"gpu.some_attribute": ir.StringAttr.get("foo")}],
             )
             assert not func.is_kernel
             assert func.known_block_size is None
@@ -154,10 +155,11 @@ def builder(func: gpu.GPUFuncOp) -> None:
     # CHECK:   %[[VAL_0:.*]] = gpu.global_id  x
     # CHECK:   gpu.return
     # CHECK: }
-    # CHECK: gpu.func @non_kernel_func() {
-    # CHECK:   %[[VAL_0:.*]] = gpu.global_id  x
-    # CHECK:   gpu.return
-    # CHECK: }
+    # CHECK:   gpu.func @non_kernel_func(
+    # CHECK-SAME:      %[[ARG0:.*]]: index {gpu.some_attribute = "foo"}) {
+    # CHECK:           %[[GLOBAL_ID_0:.*]] = gpu.global_id  x
+    # CHECK:           gpu.return
+    # CHECK:         }
 
 
 # CHECK-LABEL: testGPULaunchFuncOp

From 83d27f6c84d92b4450a62f4b650b9cfadc0dab0f Mon Sep 17 00:00:00 2001
From: Nabeel Omer <nabeel.omer@sony.com>
Date: Tue, 18 Nov 2025 15:55:54 +0000
Subject: [PATCH 30/33] [Clang][Driver] Create crash reproducers for IR inputs
 (#165572)

This patch makes Clang produce the crash reproducer shell script for IR
inputs as well.
---
 clang/lib/Driver/Driver.cpp          | 109 +++++++++++++++++++--------
 clang/test/Driver/crash-ir-repro.cpp |  15 ++++
 2 files changed, 91 insertions(+), 33 deletions(-)
 create mode 100644 clang/test/Driver/crash-ir-repro.cpp

diff --git a/clang/lib/Driver/Driver.cpp b/clang/lib/Driver/Driver.cpp
index 04fd68692d8d8..426fc796ffc20 100644
--- a/clang/lib/Driver/Driver.cpp
+++ b/clang/lib/Driver/Driver.cpp
@@ -70,6 +70,7 @@
 #include "llvm/ADT/ArrayRef.h"
 #include "llvm/ADT/STLExtras.h"
 #include "llvm/ADT/SmallSet.h"
+#include "llvm/ADT/SmallVector.h"
 #include "llvm/ADT/StringExtras.h"
 #include "llvm/ADT/StringRef.h"
 #include "llvm/ADT/StringSet.h"
@@ -103,6 +104,7 @@
 #include <memory>
 #include <optional>
 #include <set>
+#include <string>
 #include <utility>
 #if LLVM_ON_UNIX
 #include <unistd.h> // getpid
@@ -2050,12 +2052,17 @@ void Driver::generateCompilationDiagnostics(
   InputList Inputs;
   BuildInputs(C.getDefaultToolChain(), C.getArgs(), Inputs);
 
+  ArgStringList IRInputs;
   for (InputList::iterator it = Inputs.begin(), ie = Inputs.end(); it != ie;) {
     bool IgnoreInput = false;
 
-    // Ignore input from stdin or any inputs that cannot be preprocessed.
-    // Check type first as not all linker inputs have a value.
-    if (types::getPreprocessedType(it->first) == types::TY_INVALID) {
+    // Save IR inputs separately, ignore input from stdin or any other inputs
+    // that cannot be preprocessed. Check type first as not all linker inputs
+    // have a value.
+    if (types::isLLVMIR(it->first)) {
+      IRInputs.push_back(it->second->getValue());
+      IgnoreInput = true;
+    } else if (types::getPreprocessedType(it->first) == types::TY_INVALID) {
       IgnoreInput = true;
     } else if (!strcmp(it->second->getValue(), "-")) {
       Diag(clang::diag::note_drv_command_failed_diag_msg)
@@ -2072,7 +2079,7 @@ void Driver::generateCompilationDiagnostics(
     }
   }
 
-  if (Inputs.empty()) {
+  if (Inputs.empty() && IRInputs.empty()) {
     Diag(clang::diag::note_drv_command_failed_diag_msg)
         << "Error generating preprocessed source(s) - "
            "no preprocessable inputs.";
@@ -2095,46 +2102,82 @@ void Driver::generateCompilationDiagnostics(
     return;
   }
 
-  // Construct the list of abstract actions to perform for this compilation. On
-  // Darwin OSes this uses the driver-driver and builds universal actions.
-  const ToolChain &TC = C.getDefaultToolChain();
-  if (TC.getTriple().isOSBinFormatMachO())
-    BuildUniversalActions(C, TC, Inputs);
-  else
-    BuildActions(C, C.getArgs(), Inputs, C.getActions());
+  // If we only have IR inputs there's no need for preprocessing.
+  if (!Inputs.empty()) {
+    // Construct the list of abstract actions to perform for this compilation.
+    // On Darwin OSes this uses the driver-driver and builds universal actions.
+    const ToolChain &TC = C.getDefaultToolChain();
+    if (TC.getTriple().isOSBinFormatMachO())
+      BuildUniversalActions(C, TC, Inputs);
+    else
+      BuildActions(C, C.getArgs(), Inputs, C.getActions());
 
-  BuildJobs(C);
+    BuildJobs(C);
 
-  // If there were errors building the compilation, quit now.
-  if (Trap.hasErrorOccurred()) {
-    Diag(clang::diag::note_drv_command_failed_diag_msg)
-        << "Error generating preprocessed source(s).";
-    return;
-  }
+    // If there were errors building the compilation, quit now.
+    if (Trap.hasErrorOccurred()) {
+      Diag(clang::diag::note_drv_command_failed_diag_msg)
+          << "Error generating preprocessed source(s).";
+      return;
+    }
+    // Generate preprocessed output.
+    SmallVector<std::pair<int, const Command *>, 4> FailingCommands;
+    C.ExecuteJobs(C.getJobs(), FailingCommands);
 
-  // Generate preprocessed output.
-  SmallVector<std::pair<int, const Command *>, 4> FailingCommands;
-  C.ExecuteJobs(C.getJobs(), FailingCommands);
+    // If any of the preprocessing commands failed, clean up and exit.
+    if (!FailingCommands.empty()) {
+      Diag(clang::diag::note_drv_command_failed_diag_msg)
+          << "Error generating preprocessed source(s).";
+      return;
+    }
 
-  // If any of the preprocessing commands failed, clean up and exit.
-  if (!FailingCommands.empty()) {
-    Diag(clang::diag::note_drv_command_failed_diag_msg)
-        << "Error generating preprocessed source(s).";
-    return;
+    const ArgStringList &TempFiles = C.getTempFiles();
+    if (TempFiles.empty()) {
+      Diag(clang::diag::note_drv_command_failed_diag_msg)
+          << "Error generating preprocessed source(s).";
+      return;
+    }
   }
 
-  const ArgStringList &TempFiles = C.getTempFiles();
-  if (TempFiles.empty()) {
-    Diag(clang::diag::note_drv_command_failed_diag_msg)
-        << "Error generating preprocessed source(s).";
-    return;
+  // Copying filenames due to ownership.
+  const ArgStringList &Files = C.getTempFiles();
+  SmallVector<std::string> TempFiles(Files.begin(), Files.end());
+
+  // We'd like to copy the IR input file into our own temp file
+  // because the build system might try to clean-up after itself.
+  for (auto const *Input : IRInputs) {
+    int FD;
+    llvm::SmallVector<char, 64> Path;
+
+    StringRef extension = llvm::sys::path::extension(Input);
+    if (!extension.empty())
+      extension = extension.drop_front();
+
+    std::error_code EC = llvm::sys::fs::createTemporaryFile(
+        llvm::sys::path::stem(Input), extension, FD, Path);
+    if (EC) {
+      Diag(clang::diag::note_drv_command_failed_diag_msg)
+          << "Error generating run script: " << "Failed copying IR input files"
+          << " " << EC.message();
+      return;
+    }
+
+    EC = llvm::sys::fs::copy_file(Input, FD);
+    if (EC) {
+      Diag(clang::diag::note_drv_command_failed_diag_msg)
+          << "Error generating run script: " << "Failed copying IR input files"
+          << " " << EC.message();
+      return;
+    }
+
+    TempFiles.push_back(std::string(Path.begin(), Path.end()));
   }
 
   Diag(clang::diag::note_drv_command_failed_diag_msg) << BugReporMsg;
 
   SmallString<128> VFS;
   SmallString<128> ReproCrashFilename;
-  for (const char *TempFile : TempFiles) {
+  for (std::string &TempFile : TempFiles) {
     Diag(clang::diag::note_drv_command_failed_diag_msg) << TempFile;
     if (Report)
       Report->TemporaryFiles.push_back(TempFile);
@@ -2151,7 +2194,7 @@ void Driver::generateCompilationDiagnostics(
   }
 
   for (const char *TempFile : SavedTemps)
-    C.addTempFile(TempFile);
+    TempFiles.push_back(TempFile);
 
   // Assume associated files are based off of the first temporary file.
   CrashReportInfo CrashInfo(TempFiles[0], VFS);
diff --git a/clang/test/Driver/crash-ir-repro.cpp b/clang/test/Driver/crash-ir-repro.cpp
new file mode 100644
index 0000000000000..1f31a5ca1bb34
--- /dev/null
+++ b/clang/test/Driver/crash-ir-repro.cpp
@@ -0,0 +1,15 @@
+// RUN: %clang -S -emit-llvm -o %t.ll %s
+// RUN: not %clang -S -DCRASH %s %t.ll 2>&1 | FileCheck %s
+
+// CHECK: Preprocessed source(s) and associated run script(s) are located at:
+// CHECK-NEXT: clang: note: diagnostic msg: {{.*}}.cpp
+// CHECK-NEXT: clang: note: diagnostic msg: {{.*}}.ll
+// CHECK-NEXT: clang: note: diagnostic msg: {{.*}}.sh
+
+#ifdef CRASH
+#pragma clang __debug parser_crash
+#endif
+
+int main() {
+  return 0;
+}

From a1e47cefa913d53e55d924a6326697f3fe5d1206 Mon Sep 17 00:00:00 2001
From: Emil Tsalapatis <aimilios.tsalapatis@gmail.com>
Date: Tue, 18 Nov 2025 10:58:09 -0500
Subject: [PATCH 31/33] [llvm][AddressSanitizer] option for specifying the
 address space of the shadow map (#167772)

The AddressSanitizer transform currently defaults to placing the shadow
map in address space 0, but it is desirable for some targets (namely
BPF) to select a different address space for the map. Add a hidden
option, -asan-shadow-addr-space (default 0), for specifying the address
space of pointers to the shadow map.
---
 llvm/lib/Transforms/Instrumentation/AddressSanitizer.cpp | 7 ++++++-
 1 file changed, 6 insertions(+), 1 deletion(-)

diff --git a/llvm/lib/Transforms/Instrumentation/AddressSanitizer.cpp b/llvm/lib/Transforms/Instrumentation/AddressSanitizer.cpp
index 7c364f86fb0e8..49f03fa93f0e0 100644
--- a/llvm/lib/Transforms/Instrumentation/AddressSanitizer.cpp
+++ b/llvm/lib/Transforms/Instrumentation/AddressSanitizer.cpp
@@ -248,6 +248,11 @@ static cl::opt<bool>
                          "platforms that support this"),
                 cl::Hidden, cl::init(true));
 
+static cl::opt<int>
+    ClShadowAddrSpace("asan-shadow-addr-space",
+                      cl::desc("Address space for pointers to the shadow map"),
+                      cl::Hidden, cl::init(0));
+
 static cl::opt<bool> ClWithIfuncSuppressRemat(
     "asan-with-ifunc-suppress-remat",
     cl::desc("Suppress rematerialization of dynamic shadow address by passing "
@@ -1942,7 +1947,7 @@ void AddressSanitizer::instrumentAddress(Instruction *OrigIns,
 
   Type *ShadowTy =
       IntegerType::get(*C, std::max(8U, TypeStoreSize >> Mapping.Scale));
-  Type *ShadowPtrTy = PointerType::get(*C, 0);
+  Type *ShadowPtrTy = PointerType::get(*C, ClShadowAddrSpace);
   Value *ShadowPtr = memToShadow(AddrLong, IRB);
   const uint64_t ShadowAlign =
       std::max<uint64_t>(Alignment.valueOrOne().value() >> Mapping.Scale, 1);

From 82a7832de27aad8f681773875b081013c2c0c9dd Mon Sep 17 00:00:00 2001
From: Emil Tsalapatis <aimilios.tsalapatis@gmail.com>
Date: Tue, 18 Nov 2025 10:58:56 -0500
Subject: [PATCH 32/33] [llvm][AddressSanitizer][BPF] add default shadow
 mapping offset for BPF target (#167768)

The AddressSanitizer transform does not have a BPF-specific default
offset registered for the shadow map. Make the default shadow map offset
for BPF the dynamic-shadow sentinel (kDynamicShadowSentinel), so that
the offset is set dynamically by the KASAN implementation.
---
 llvm/lib/Transforms/Instrumentation/AddressSanitizer.cpp | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/llvm/lib/Transforms/Instrumentation/AddressSanitizer.cpp b/llvm/lib/Transforms/Instrumentation/AddressSanitizer.cpp
index 49f03fa93f0e0..3a14ee5addc2f 100644
--- a/llvm/lib/Transforms/Instrumentation/AddressSanitizer.cpp
+++ b/llvm/lib/Transforms/Instrumentation/AddressSanitizer.cpp
@@ -508,6 +508,7 @@ static ShadowMapping getShadowMapping(const Triple &TargetTriple, int LongSize,
   bool IsAMDGPU = TargetTriple.isAMDGPU();
   bool IsHaiku = TargetTriple.isOSHaiku();
   bool IsWasm = TargetTriple.isWasm();
+  bool IsBPF = TargetTriple.isBPF();
 
   ShadowMapping Mapping;
 
@@ -584,6 +585,8 @@ static ShadowMapping getShadowMapping(const Triple &TargetTriple, int LongSize,
     else if (IsHaiku && IsX86_64)
       Mapping.Offset = (kSmallX86_64ShadowOffsetBase &
                         (kSmallX86_64ShadowOffsetAlignMask << Mapping.Scale));
+    else if (IsBPF)
+      Mapping.Offset = kDynamicShadowSentinel;
     else
       Mapping.Offset = kDefaultShadowOffset64;
   }

From 1347b23cd6510a4149665616433e8505bb6fc6bc Mon Sep 17 00:00:00 2001
From: Emil Tsalapatis <aimilios.tsalapatis@gmail.com>
Date: Tue, 18 Nov 2025 10:59:10 -0500
Subject: [PATCH 33/33] [clang][BPF] Turn on AddressSanitizer pass (#167766)

The BPF LLVM target currently doesn't support turning on the
AddressSanitizer pass, either for userspace ASAN or KASAN. Enable the
KASAN option for the BPF target in anticipation of a KASAN
implementation for BPF.
---
 clang/lib/Driver/ToolChain.cpp | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/clang/lib/Driver/ToolChain.cpp b/clang/lib/Driver/ToolChain.cpp
index 5ff7d83946137..77a2c73f0d446 100644
--- a/clang/lib/Driver/ToolChain.cpp
+++ b/clang/lib/Driver/ToolChain.cpp
@@ -1639,6 +1639,8 @@ SanitizerMask ToolChain::getSupportedSanitizers() const {
     Res |= SanitizerKind::ShadowCallStack;
   if (getTriple().isAArch64(64))
     Res |= SanitizerKind::MemTag;
+  if (getTriple().isBPF())
+    Res |= SanitizerKind::KernelAddress;
   return Res;
 }